diff --git a/.gitignore b/.gitignore
index d11a504bdc56ee98b3d5a0c33f9f75d996e45567..be75938ec401b1d72fa54773c85191aaac7d7f35 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,7 +6,7 @@ node_modules
 /bazel-*
 /bazel_pip
 /tools/python_bin_path.sh
-/tools/git/gen
+/tensorflow/tools/git/gen
 /pip_test
 /_python_build
 *.pyc
@@ -26,4 +26,11 @@ Podfile.lock
 /tensorflow/contrib/lite/gen/**
 /tensorflow/contrib/lite/examples/ios/simple/data/*.txt
 /tensorflow/contrib/lite/examples/ios/simple/data/*.tflite
-xcuserdata/**
\ No newline at end of file
+xcuserdata/**
+
+# Android
+.gradle
+.idea
+*.iml
+local.properties
+gradleBuild
diff --git a/AUTHORS b/AUTHORS
index a46ae7e616ab3a420d9fb2691ee8d8650032a39f..aa4be5169dcc68c579863e8ba6307cd00e9f9a68 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -7,4 +7,4 @@
 # The email address is not required for organizations.
 
 Google Inc.
-Yuan Tang terrytangyuan@gmail.com
+Yuan Tang <terrytangyuan@gmail.com>
diff --git a/CODEOWNERS b/CODEOWNERS
index 57a4df40e651f45dc03493af631d73332e46c182..007a304c3e706ce968576ec8979c08f1a3bcc552 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1,53 +1,53 @@
 # NOTE: Disabled temporarily because it's too noisy on pushes.
 # Where component owners are known, add them here.
 
-#tensorflow/core/platform/windows/* @mrry
-#tensorflow/java/* @asimshankar
-#tensorflow/tensorboard/* @jart @dandelionmane
-#tensorflow/tools/docs/* @markdaoust
+# /tensorflow/core/platform/windows/ @mrry
+# /tensorflow/java/ @asimshankar
+# /tensorflow/tensorboard/ @jart @dandelionmane
+# /tensorflow/tools/docs/ @markdaoust
 
 # contrib
 
-# NEED OWNER: tensorflow/contrib/avro/*
-#tensorflow/contrib/batching/* @alextp @chrisolston
-#tensorflow/contrib/bayesflow/* @ebrevdo @rsepassi @jvdillon
-#tensorflow/contrib/boosted_trees/* @sshrdp @yk5 @nataliaponomareva
-#tensorflow/contrib/cmake/* @mrry @benoitsteiner
-#tensorflow/contrib/copy_graph/* @tucker @poxvoculi
-#tensorflow/contrib/crf/* @kentonl
-#tensorflow/contrib/data/* @mrry
-#tensorflow/contrib/distributions/* @jvdillon @langmore @rsepassi
-#tensorflow/contrib/factorization/* @agarwal-ashish @xavigonzalvo
-#tensorflow/contrib/ffmpeg/* @fredbertsch
-# NEED OWNER: tensorflow/contrib/framework/*
-#tensorflow/contrib/graph_editor/* @purpledog
-# NEED OWNER: tensorflow/contrib/grid_rnn/*
-#tensorflow/contrib/hvx/* @satok16
-#tensorflow/contrib/integrate/* @shoyer
-#tensorflow/contrib/kernel_methods/* @petrosmol
-#tensorflow/contrib/ios_examples/* @petewarden
-#tensorflow/contrib/labeled_tensor/* @shoyer
-#tensorflow/contrib/layers/* @fchollet @martinwicke
-#tensorflow/contrib/learn/* @martinwicke @ispirmustafa @alextp
-#tensorflow/contrib/linalg/* @langmore
-#tensorflow/contrib/linear_optimizer/* @petrosmol @andreasst @katsiapis
-#tensorflow/contrib/lookup/* @ysuematsu @andreasst
-#tensorflow/contrib/losses/* @alextp @ispirmustafa
-#tensorflow/contrib/makefile/* @petewarden @satok16 @wolffg
-#tensorflow/contrib/metrics/* @alextp @honkentuber @ispirmustafa
-#tensorflow/contrib/nccl/* @cwhipkey @zheng-xq
-#tensorflow/contrib/opt/* @strategist333
-#tensorflow/contrib/pi_examples/* @maciekcc
-#tensorflow/contrib/quantization/* @petewarden @cwhipkey @keveman
-#tensorflow/contrib/rnn/* @ebrevdo
-#tensorflow/contrib/saved_model/* @nfiedel @sukritiramesh
-#tensorflow/contrib/seq2seq/* @lukaszkaiser
-#tensorflow/contrib/session_bundle/* @nfiedel @sukritiramesh
-#tensorflow/contrib/slim/* @sguada @thenbasilmanran
-#tensorflow/contrib/stateless/* @girving
-#tensorflow/contrib/tensor_forest/* @gilberthendry @thomascolthurst
-#tensorflow/contrib/testing/* @dandelionmane
-#tensorflow/contrib/timeseries/* @allenlavoie
-#tensorflow/contrib/tpu/* @frankchn @saeta @jhseu
-#tensorflow/contrib/training/* @joel-shor @ebrevdo
-#tensorflow/contrib/util/* @sherrym
+# NEED OWNER: /tensorflow/contrib/avro/
+# /tensorflow/contrib/batching/ @alextp @chrisolston
+# /tensorflow/contrib/bayesflow/ @ebrevdo @rsepassi @jvdillon
+# /tensorflow/contrib/boosted_trees/ @sshrdp @yk5 @nataliaponomareva
+# /tensorflow/contrib/cmake/ @mrry @benoitsteiner
+# /tensorflow/contrib/copy_graph/ @tucker @poxvoculi
+# /tensorflow/contrib/crf/ @kentonl
+# /tensorflow/contrib/data/ @mrry
+# /tensorflow/contrib/distributions/ @jvdillon @langmore @rsepassi
+# /tensorflow/contrib/factorization/ @agarwal-ashish @xavigonzalvo
+# /tensorflow/contrib/ffmpeg/ @fredbertsch
+# NEED OWNER: /tensorflow/contrib/framework/
+# /tensorflow/contrib/graph_editor/ @purpledog
+# NEED OWNER: /tensorflow/contrib/grid_rnn/
+# /tensorflow/contrib/hvx/ @satok16
+# /tensorflow/contrib/integrate/ @shoyer
+# /tensorflow/contrib/kernel_methods/ @petrosmol
+# /tensorflow/contrib/ios_examples/ @petewarden
+# /tensorflow/contrib/labeled_tensor/ @shoyer
+# /tensorflow/contrib/layers/ @fchollet @martinwicke
+# /tensorflow/contrib/learn/ @martinwicke @ispirmustafa @alextp
+# /tensorflow/contrib/linalg/ @langmore
+# /tensorflow/contrib/linear_optimizer/ @petrosmol @andreasst @katsiapis
+# /tensorflow/contrib/lookup/ @ysuematsu @andreasst
+# /tensorflow/contrib/losses/ @alextp @ispirmustafa
+# /tensorflow/contrib/makefile/ @petewarden @satok16 @wolffg
+# /tensorflow/contrib/metrics/ @alextp @honkentuber @ispirmustafa
+# /tensorflow/contrib/nccl/ @cwhipkey @zheng-xq
+# /tensorflow/contrib/opt/ @strategist333
+# /tensorflow/contrib/pi_examples/ @maciekcc
+# /tensorflow/contrib/quantization/ @petewarden @cwhipkey @keveman
+# /tensorflow/contrib/rnn/ @ebrevdo
+# /tensorflow/contrib/saved_model/ @nfiedel @sukritiramesh
+# /tensorflow/contrib/seq2seq/ @lukaszkaiser
+# /tensorflow/contrib/session_bundle/ @nfiedel @sukritiramesh
+# /tensorflow/contrib/slim/ @sguada @thenbasilmanran
+# /tensorflow/contrib/stateless/ @girving
+# /tensorflow/contrib/tensor_forest/ @gilberthendry @thomascolthurst
+# /tensorflow/contrib/testing/ @dandelionmane
+# /tensorflow/contrib/timeseries/ @allenlavoie
+# /tensorflow/contrib/tpu/ @frankchn @saeta @jhseu
+# /tensorflow/contrib/training/ @joel-shor @ebrevdo
+# /tensorflow/contrib/util/ @sherrym
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 43abdaafbf45379430920cd027b26299cd62553b..dc96bc2e3d3960827efd109551f8eaa78a6cfb48 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -20,6 +20,9 @@ Follow either of the two links above to access the appropriate CLA and instructi
 If you have improvements to TensorFlow, send us your pull requests! For those
 just getting started, Github has a [howto](https://help.github.com/articles/using-pull-requests/).
 
+TensorFlow team members will be assigned to review your pull requests. Once the pull requests are approved and pass continuous integration checks, we will merge the pull requests.
+For some pull requests, we will apply the patch for each pull request to our internal version control system first, and export the change out as a new commit later, at which point the original pull request will be closed. The commits in the pull request will be squashed into a single commit with the pull request creator as the author. These pull requests will be labeled as pending merge internally.
+
 If you want to contribute but you're not sure where to start, take a look at the
 [issues with the "contributions welcome" label](https://github.com/tensorflow/tensorflow/labels/stat%3Acontributions%20welcome).
 These are issues that we believe are particularly well suited for outside
@@ -114,6 +117,7 @@ pylint --rcfile=/tmp/pylintrc myfile.py
 * [Google Java Style Guide](https://google.github.io/styleguide/javaguide.html)
 * [Google JavaScript Style Guide](https://google.github.io/styleguide/jsguide.html)
 * [Google Shell Style Guide](https://google.github.io/styleguide/shell.xml)
+* [Google Objective-C Style Guide](http://google.github.io/styleguide/objcguide.html)
 
 #### Running sanity check
 
diff --git a/RELEASE.md b/RELEASE.md
index d8db1f72004b5d944e3035a0f33dfc34a674b7ee..e04bd3fc505d51ade9e9fa12c822cb695e90b4f3 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -494,7 +494,7 @@ answered questions, and were part of inspiring discussions.
 This release contains contributions from many people at Google, as well as:
 
 A. Besir Kurtulmus, Adal Chiriliuc, @akash, Alec-Desouza, Alex Rothberg, Alex
-Sergeev, Alexander Heinecke, Allen Guo, Andreas Madsen, Ankesh Anand, Anton 
+Sergeev, Alexander Heinecke, Allen Guo, Andreas Madsen, Ankesh Anand, Anton
 Loss, @Aravind, @Arie, Ashutosh Das, AuréLien Geron, Bairen Yi, @bakunyo, Ben
 Visser, Brady Zhou, Calpa Liu, Changming Sun, Chih Cheng Liang, Christopher
 Berner, Clark Zinzow, @Conchylicultor, Dan Ellis, Dan J, Dan Jarvis, Daniel
diff --git a/configure.py b/configure.py
index 26da09bd947a0aa3887630d8f2205ec058886b1a..fa2ed2450d4801353622b51da1fdb822778c0811 100644
--- a/configure.py
+++ b/configure.py
@@ -34,8 +34,10 @@ except ImportError:
 
 _TF_BAZELRC = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            '.tf_configure.bazelrc')
-_DEFAULT_CUDA_VERSION = '8.0'
-_DEFAULT_CUDNN_VERSION = '6'
+_TF_WORKSPACE = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                             'WORKSPACE')
+_DEFAULT_CUDA_VERSION = '9.0'
+_DEFAULT_CUDNN_VERSION = '7'
 _DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,5.2'
 _DEFAULT_CUDA_PATH = '/usr/local/cuda'
 _DEFAULT_CUDA_PATH_LINUX = '/opt/cuda'
@@ -44,6 +46,13 @@ _DEFAULT_CUDA_PATH_WIN = ('C:/Program Files/NVIDIA GPU Computing '
 _TF_OPENCL_VERSION = '1.2'
 _DEFAULT_COMPUTECPP_TOOLKIT_PATH = '/usr/local/computecpp'
 _DEFAULT_TRISYCL_INCLUDE_DIR = '/usr/local/triSYCL/include'
+_SUPPORTED_ANDROID_NDK_VERSIONS = [10, 11, 12, 13, 14, 15]
+
+_DEFAULT_PROMPT_ASK_ATTEMPTS = 10
+
+
+class UserInputError(Exception):
+  pass
 
 
 def is_windows():
@@ -158,7 +167,7 @@ def get_python_path(environ_cp, python_bin_path):
   try:
     library_paths = run_shell(
         [python_bin_path, '-c',
-         'import site; print("\\n".join(site.getsitepackages()))']).split("\n")
+         'import site; print("\\n".join(site.getsitepackages()))']).split('\n')
   except subprocess.CalledProcessError:
     library_paths = [run_shell(
         [python_bin_path, '-c',
@@ -557,6 +566,218 @@ def set_clang_cuda_compiler_path(environ_cp):
                               clang_cuda_compiler_path)
 
 
+def prompt_loop_or_load_from_env(
+    environ_cp,
+    var_name,
+    var_default,
+    ask_for_var,
+    check_success,
+    error_msg,
+    suppress_default_error=False,
+    n_ask_attempts=_DEFAULT_PROMPT_ASK_ATTEMPTS
+):
+  """Loop over user prompts for an ENV param until receiving a valid response.
+
+  For the env param var_name, read from the environment or verify user input
+  until receiving valid input. When done, set var_name in the environ_cp to its
+  new value.
+
+  Args:
+    environ_cp: (Dict) copy of the os.environ.
+    var_name: (String) string for name of environment variable, e.g. "TF_MYVAR".
+    var_default: (String) default value string.
+    ask_for_var: (String) string for how to ask for user input.
+    check_success: (Function) function that takes one argument and returns a
+      boolean. Should return True if the value provided is considered valid. May
+      contain a complex error message if error_msg does not provide enough
+      information. In that case, set suppress_default_error to True.
+    error_msg: (String) String with one and only one '%s'. Formatted with each
+      invalid response upon check_success(input) failure.
+    suppress_default_error: (Bool) Suppress the above error message in favor of
+      one from the check_success function.
+    n_ask_attempts: (Integer) Number of times to query for valid input before
+      raising an error and quitting.
+
+  Returns:
+    [String] The value of var_name after querying for input.
+
+  Raises:
+    UserInputError: if a query has been attempted n_ask_attempts times without
+    success, assume that the user has made a scripting error, and will continue
+    to provide invalid input. Raise the error to avoid infinitely looping.
+  """
+  default = environ_cp.get(var_name) or var_default
+  full_query = '%s [Default is %s]: ' % (
+      ask_for_var,
+      default,
+  )
+
+  for _ in range(n_ask_attempts):
+    val = get_from_env_or_user_or_default(environ_cp,
+                                          var_name,
+                                          full_query,
+                                          default)
+    if check_success(val):
+      break
+    if not suppress_default_error:
+      print(error_msg % val)
+    environ_cp[var_name] = ''
+  else:
+    raise UserInputError('Invalid %s setting was provided %d times in a row. '
+                         'Assuming to be a scripting mistake.' %
+                         (var_name, n_ask_attempts))
+
+  environ_cp[var_name] = val
+  return val
+
+
+def create_android_ndk_rule(environ_cp):
+  """Set ANDROID_NDK_HOME and write Android NDK WORKSPACE rule."""
+  if is_windows() or is_cygwin():
+    default_ndk_path = cygpath('%s/Android/Sdk/ndk-bundle' %
+                               environ_cp['APPDATA'])
+  elif is_macos():
+    default_ndk_path = '%s/library/Android/Sdk/ndk-bundle' % environ_cp['HOME']
+  else:
+    default_ndk_path = '%s/Android/Sdk/ndk-bundle' % environ_cp['HOME']
+
+  def valid_ndk_path(path):
+    return (os.path.exists(path) and
+            os.path.exists(os.path.join(path, 'source.properties')))
+
+  android_ndk_home_path = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='ANDROID_NDK_HOME',
+      var_default=default_ndk_path,
+      ask_for_var='Please specify the home path of the Android NDK to use.',
+      check_success=valid_ndk_path,
+      error_msg=('The path %s or its child file "source.properties" '
+                 'does not exist.')
+  )
+
+  write_android_ndk_workspace_rule(android_ndk_home_path)
+
+
+def create_android_sdk_rule(environ_cp):
+  """Set Android variables and write Android SDK WORKSPACE rule."""
+  if is_windows() or is_cygwin():
+    default_sdk_path = cygpath('%s/Android/Sdk' % environ_cp['APPDATA'])
+  elif is_macos():
+    default_sdk_path = '%s/library/Android/Sdk/ndk-bundle' % environ_cp['HOME']
+  else:
+    default_sdk_path = '%s/Android/Sdk' % environ_cp['HOME']
+
+  def valid_sdk_path(path):
+    return (os.path.exists(path) and
+            os.path.exists(os.path.join(path, 'platforms')) and
+            os.path.exists(os.path.join(path, 'build-tools')))
+
+  android_sdk_home_path = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='ANDROID_SDK_HOME',
+      var_default=default_sdk_path,
+      ask_for_var='Please specify the home path of the Android SDK to use.',
+      check_success=valid_sdk_path,
+      error_msg=('Either %s does not exist, or it does not contain the '
+                 'subdirectories "platforms" and "build-tools".'))
+
+  platforms = os.path.join(android_sdk_home_path, 'platforms')
+  api_levels = sorted(os.listdir(platforms))
+  api_levels = [x.replace('android-', '') for x in api_levels]
+
+  def valid_api_level(api_level):
+    return os.path.exists(os.path.join(android_sdk_home_path,
+                                       'platforms',
+                                       'android-' + api_level))
+
+  android_api_level = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='ANDROID_API_LEVEL',
+      var_default=api_levels[-1],
+      ask_for_var=('Please specify the Android SDK API level to use. '
+                   '[Available levels: %s]') % api_levels,
+      check_success=valid_api_level,
+      error_msg='Android-%s is not present in the SDK path.')
+
+  build_tools = os.path.join(android_sdk_home_path, 'build-tools')
+  versions = sorted(os.listdir(build_tools))
+
+  def valid_build_tools(version):
+    return os.path.exists(os.path.join(android_sdk_home_path,
+                                       'build-tools',
+                                       version))
+
+  android_build_tools_version = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='ANDROID_BUILD_TOOLS_VERSION',
+      var_default=versions[-1],
+      ask_for_var=('Please specify an Android build tools version to use. '
+                   '[Available versions: %s]') % versions,
+      check_success=valid_build_tools,
+      error_msg=('The selected SDK does not have build-tools version %s '
+                 'available.'))
+
+  write_android_sdk_workspace_rule(android_sdk_home_path,
+                                   android_build_tools_version,
+                                   android_api_level)
+
+
+def write_android_sdk_workspace_rule(android_sdk_home_path,
+                                     android_build_tools_version,
+                                     android_api_level):
+  print('Writing android_sdk_workspace rule.\n')
+  with open(_TF_WORKSPACE, 'a') as f:
+    f.write("""
+android_sdk_repository(
+  name="androidsdk",
+  api_level=%s,
+  path="%s",
+  build_tools_version="%s")\n
+""" % (android_api_level, android_sdk_home_path, android_build_tools_version))
+
+
+def write_android_ndk_workspace_rule(android_ndk_home_path):
+  print('Writing android_ndk_workspace rule.')
+  ndk_api_level = check_ndk_level(android_ndk_home_path)
+  if int(ndk_api_level) not in _SUPPORTED_ANDROID_NDK_VERSIONS:
+    print('WARNING: The API level of the NDK in %s is %s, which is not '
+          'supported by Bazel (officially supported versions: %s). Please use '
+          'another version. Compiling Android targets may result in confusing '
+          'errors.\n' % (android_ndk_home_path, ndk_api_level,
+                         _SUPPORTED_ANDROID_NDK_VERSIONS))
+  with open(_TF_WORKSPACE, 'a') as f:
+    f.write("""
+android_ndk_repository(
+  name="androidndk",
+  path="%s",
+  api_level=%s)\n
+""" % (android_ndk_home_path, ndk_api_level))
+
+
+def check_ndk_level(android_ndk_home_path):
+  """Check the revision number of an Android NDK path."""
+  properties_path = '%s/source.properties' % android_ndk_home_path
+  if is_windows() or is_cygwin():
+    properties_path = cygpath(properties_path)
+  with open(properties_path, 'r') as f:
+    filedata = f.read()
+
+  revision = re.search(r'Pkg.Revision = (\d+)', filedata)
+  if revision:
+    return revision.group(1)
+  return None
+
+
+def workspace_has_any_android_rule():
+  """Check the WORKSPACE for existing android_*_repository rules."""
+  with open(_TF_WORKSPACE, 'r') as f:
+    workspace = f.read()
+  has_any_rule = re.search(r'^android_[ns]dk_repository',
+                           workspace,
+                           re.MULTILINE)
+  return has_any_rule
+
+
 def set_gcc_host_compiler_path(environ_cp):
   """Set GCC_HOST_COMPILER_PATH."""
   default_gcc_host_compiler_path = which('gcc') or ''
@@ -566,23 +787,16 @@ def set_gcc_host_compiler_path(environ_cp):
     # os.readlink is only available in linux
     default_gcc_host_compiler_path = os.path.realpath(cuda_bin_symlink)
 
-  ask_gcc_path = (
-      'Please specify which gcc should be used by nvcc as the '
-      'host compiler. [Default is %s]: ') % default_gcc_host_compiler_path
-  while True:
-    gcc_host_compiler_path = get_from_env_or_user_or_default(
-        environ_cp, 'GCC_HOST_COMPILER_PATH', ask_gcc_path,
-        default_gcc_host_compiler_path)
-
-    if os.path.exists(gcc_host_compiler_path):
-      break
-
-    # Reset and retry
-    print('Invalid gcc path. %s cannot be found' % gcc_host_compiler_path)
-    environ_cp['GCC_HOST_COMPILER_PATH'] = ''
+  gcc_host_compiler_path = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='GCC_HOST_COMPILER_PATH',
+      var_default=default_gcc_host_compiler_path,
+      ask_for_var=
+      'Please specify which gcc should be used by nvcc as the host compiler.',
+      check_success=os.path.exists,
+      error_msg='Invalid gcc path. %s cannot be found.',
+  )
 
-  # Set GCC_HOST_COMPILER_PATH
-  environ_cp['GCC_HOST_COMPILER_PATH'] = gcc_host_compiler_path
   write_action_env_to_bazelrc('GCC_HOST_COMPILER_PATH', gcc_host_compiler_path)
 
 
@@ -592,7 +806,7 @@ def set_tf_cuda_version(environ_cp):
       'Please specify the CUDA SDK version you want to use, '
       'e.g. 7.0. [Leave empty to default to CUDA %s]: ') % _DEFAULT_CUDA_VERSION
 
-  while True:
+  for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS):
     # Configure the Cuda SDK version to use.
     tf_cuda_version = get_from_env_or_user_or_default(
         environ_cp, 'TF_CUDA_VERSION', ask_cuda_version, _DEFAULT_CUDA_VERSION)
@@ -630,6 +844,11 @@ def set_tf_cuda_version(environ_cp):
     environ_cp['TF_CUDA_VERSION'] = ''
     environ_cp['CUDA_TOOLKIT_PATH'] = ''
 
+  else:
+    raise UserInputError('Invalid TF_CUDA_SETTING setting was provided %d '
+                         'times in a row. Assuming to be a scripting mistake.' %
+                         _DEFAULT_PROMPT_ASK_ATTEMPTS)
+
   # Set CUDA_TOOLKIT_PATH and TF_CUDA_VERSION
   environ_cp['CUDA_TOOLKIT_PATH'] = cuda_toolkit_path
   write_action_env_to_bazelrc('CUDA_TOOLKIT_PATH', cuda_toolkit_path)
@@ -643,7 +862,7 @@ def set_tf_cudnn_version(environ_cp):
       'Please specify the cuDNN version you want to use. '
       '[Leave empty to default to cuDNN %s.0]: ') % _DEFAULT_CUDNN_VERSION
 
-  while True:
+  for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS):
     tf_cudnn_version = get_from_env_or_user_or_default(
         environ_cp, 'TF_CUDNN_VERSION', ask_cudnn_version,
         _DEFAULT_CUDNN_VERSION)
@@ -702,6 +921,10 @@ def set_tf_cudnn_version(environ_cp):
       print('%s.%s' % (cudnn_path_from_ldconfig, tf_cudnn_version))
 
     environ_cp['TF_CUDNN_VERSION'] = ''
+  else:
+    raise UserInputError('Invalid TF_CUDNN setting was provided %d '
+                         'times in a row. Assuming to be a scripting mistake.' %
+                         _DEFAULT_PROMPT_ASK_ATTEMPTS)
 
   # Set CUDNN_INSTALL_PATH and TF_CUDNN_VERSION
   environ_cp['CUDNN_INSTALL_PATH'] = cudnn_install_path
@@ -810,76 +1033,66 @@ def set_other_cuda_vars(environ_cp):
 def set_host_cxx_compiler(environ_cp):
   """Set HOST_CXX_COMPILER."""
   default_cxx_host_compiler = which('g++') or ''
-  ask_cxx_host_compiler = (
-      'Please specify which C++ compiler should be used as'
-      ' the host C++ compiler. [Default is %s]: ') % default_cxx_host_compiler
-
-  while True:
-    host_cxx_compiler = get_from_env_or_user_or_default(
-        environ_cp, 'HOST_CXX_COMPILER', ask_cxx_host_compiler,
-        default_cxx_host_compiler)
-    if os.path.exists(host_cxx_compiler):
-      break
 
-    # Reset and retry
-    print('Invalid C++ compiler path. %s cannot be found' % host_cxx_compiler)
-    environ_cp['HOST_CXX_COMPILER'] = ''
+  host_cxx_compiler = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='HOST_CXX_COMPILER',
+      var_default=default_cxx_host_compiler,
+      ask_for_var=('Please specify which C++ compiler should be used as the '
+                   'host C++ compiler.'),
+      check_success=os.path.exists,
+      error_msg='Invalid C++ compiler path. %s cannot be found.',
+  )
 
-  # Set HOST_CXX_COMPILER
-  environ_cp['HOST_CXX_COMPILER'] = host_cxx_compiler
   write_action_env_to_bazelrc('HOST_CXX_COMPILER', host_cxx_compiler)
 
 
 def set_host_c_compiler(environ_cp):
   """Set HOST_C_COMPILER."""
   default_c_host_compiler = which('gcc') or ''
-  ask_c_host_compiler = (
-      'Please specify which C compiler should be used as the'
-      ' host C compiler. [Default is %s]: ') % default_c_host_compiler
-
-  while True:
-    host_c_compiler = get_from_env_or_user_or_default(
-        environ_cp, 'HOST_C_COMPILER', ask_c_host_compiler,
-        default_c_host_compiler)
-    if os.path.exists(host_c_compiler):
-      break
 
-    # Reset and retry
-    print('Invalid C compiler path. %s cannot be found' % host_c_compiler)
-    environ_cp['HOST_C_COMPILER'] = ''
+  host_c_compiler = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='HOST_C_COMPILER',
+      var_default=default_c_host_compiler,
+      ask_for_var=('Please specify which C compiler should be used as the host'
+                   'C compiler.'),
+      check_success=os.path.exists,
+      error_msg='Invalid C compiler path. %s cannot be found.',
+  )
 
-  # Set HOST_C_COMPILER
-  environ_cp['HOST_C_COMPILER'] = host_c_compiler
   write_action_env_to_bazelrc('HOST_C_COMPILER', host_c_compiler)
 
 
 def set_computecpp_toolkit_path(environ_cp):
   """Set COMPUTECPP_TOOLKIT_PATH."""
-  ask_computecpp_toolkit_path = ('Please specify the location where ComputeCpp '
-                                 'for SYCL %s is installed. [Default is %s]: '
-                                ) % (_TF_OPENCL_VERSION,
-                                     _DEFAULT_COMPUTECPP_TOOLKIT_PATH)
 
-  while True:
-    computecpp_toolkit_path = get_from_env_or_user_or_default(
-        environ_cp, 'COMPUTECPP_TOOLKIT_PATH', ask_computecpp_toolkit_path,
-        _DEFAULT_COMPUTECPP_TOOLKIT_PATH)
+  def toolkit_exists(toolkit_path):
+    """Check if a computecpp toolkit path is valid."""
     if is_linux():
       sycl_rt_lib_path = 'lib/libComputeCpp.so'
     else:
       sycl_rt_lib_path = ''
 
-    sycl_rt_lib_path_full = os.path.join(computecpp_toolkit_path,
+    sycl_rt_lib_path_full = os.path.join(toolkit_path,
                                          sycl_rt_lib_path)
-    if os.path.exists(sycl_rt_lib_path_full):
-      break
+    exists = os.path.exists(sycl_rt_lib_path_full)
+    if not exists:
+      print('Invalid SYCL %s library path. %s cannot be found' %
+            (_TF_OPENCL_VERSION, sycl_rt_lib_path_full))
+    return exists
 
-    print('Invalid SYCL %s library path. %s cannot be found' %
-          (_TF_OPENCL_VERSION, sycl_rt_lib_path_full))
-    environ_cp['COMPUTECPP_TOOLKIT_PATH'] = ''
+  computecpp_toolkit_path = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='COMPUTECPP_TOOLKIT_PATH',
+      var_default=_DEFAULT_COMPUTECPP_TOOLKIT_PATH,
+      ask_for_var=(
+          'Please specify the location where ComputeCpp for SYCL %s is '
+          'installed.' % _TF_OPENCL_VERSION),
+      check_success=toolkit_exists,
+      error_msg='Invalid SYCL compiler path. %s cannot be found.',
+      suppress_default_error=True)
 
-  # Set COMPUTECPP_TOOLKIT_PATH
-  environ_cp['COMPUTECPP_TOOLKIT_PATH'] = computecpp_toolkit_path
   write_action_env_to_bazelrc('COMPUTECPP_TOOLKIT_PATH',
                               computecpp_toolkit_path)
 
@@ -905,28 +1118,30 @@ def set_trisycl_include_dir(environ_cp):
   write_action_env_to_bazelrc('TRISYCL_INCLUDE_DIR',
                               trisycl_include_dir)
 
+
 def set_mpi_home(environ_cp):
   """Set MPI_HOME."""
+
   default_mpi_home = which('mpirun') or which('mpiexec') or ''
   default_mpi_home = os.path.dirname(os.path.dirname(default_mpi_home))
 
-  ask_mpi_home = ('Please specify the MPI toolkit folder. [Default is %s]: '
-                 ) % default_mpi_home
-  while True:
-    mpi_home = get_from_env_or_user_or_default(environ_cp, 'MPI_HOME',
-                                               ask_mpi_home, default_mpi_home)
-
-    if os.path.exists(os.path.join(mpi_home, 'include')) and os.path.exists(
-        os.path.join(mpi_home, 'lib')):
-      break
-
-    print('Invalid path to the MPI Toolkit. %s or %s cannot be found' %
-          (os.path.join(mpi_home, 'include'),
-           os.path.exists(os.path.join(mpi_home, 'lib'))))
-    environ_cp['MPI_HOME'] = ''
+  def valid_mpi_path(mpi_home):
+    exists = (os.path.exists(os.path.join(mpi_home, 'include')) and
+              os.path.exists(os.path.join(mpi_home, 'lib')))
+    if not exists:
+      print('Invalid path to the MPI Toolkit. %s or %s cannot be found' %
+            (os.path.join(mpi_home, 'include'),
+             os.path.exists(os.path.join(mpi_home, 'lib'))))
+    return exists
 
-  # Set MPI_HOME
-  environ_cp['MPI_HOME'] = str(mpi_home)
+  _ = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='MPI_HOME',
+      var_default=default_mpi_home,
+      ask_for_var='Please specify the MPI toolkit folder.',
+      check_success=valid_mpi_path,
+      error_msg='',
+      suppress_default_error=True)
 
 
 def set_other_mpi_vars(environ_cp):
@@ -969,7 +1184,7 @@ def set_mkl():
       'support.\nPlease note that MKL on MacOS or windows is still not '
       'supported.\nIf you would like to use a local MKL instead of '
       'downloading, please set the environment variable \"TF_MKL_ROOT\" every '
-      'time before build.')
+      'time before build.\n')
 
 
 def set_monolithic():
@@ -1001,6 +1216,15 @@ def create_android_bazelrc_configs():
 def set_grpc_build_flags():
   write_to_bazelrc('build --define grpc_no_ares=true')
 
+def set_windows_build_flags():
+  if is_windows():
+    # The non-monolithic build is not supported yet
+    write_to_bazelrc('build --config monolithic')
+    # Suppress warning messages
+    write_to_bazelrc('build --copt=-w --host_copt=-w')
+    # Output more verbose information when something goes wrong
+    write_to_bazelrc('build --verbose_failures')
+
 
 def main():
   # Make a copy of os.environ to be clear when functions and getting and setting
@@ -1079,7 +1303,25 @@ def main():
   set_cc_opt_flags(environ_cp)
   set_mkl()
   set_monolithic()
+  set_windows_build_flags()
   create_android_bazelrc_configs()
 
+  if workspace_has_any_android_rule():
+    print('The WORKSPACE file has at least one of ["android_sdk_repository", '
+          '"android_ndk_repository"] already set. Will not ask to help '
+          'configure the WORKSPACE. Please delete the existing rules to '
+          'activate the helper.\n')
+  else:
+    if get_var(
+        environ_cp, 'TF_SET_ANDROID_WORKSPACE', 'android workspace',
+        False,
+        ('Would you like to interactively configure ./WORKSPACE for '
+         'Android builds?'),
+        'Searching for NDK and SDK installations.',
+        'Not configuring the WORKSPACE for Android builds.'):
+      create_android_ndk_rule(environ_cp)
+      create_android_sdk_rule(environ_cp)
+
+
 if __name__ == '__main__':
   main()
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index c0a47cf6b4ae2dcfab15472758023480fb48482d..259dde384c794e980be7e958b2448dc92b9be441 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -364,14 +364,6 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
-# Make a dummy rule that we can change "default" in select statements to.
-# to disable dependencies in copybara.
-config_setting(
-    name = "dummy_disabled_internal",
-    values = {"define": "with_dummy_disabled_internal=true"},
-    visibility = ["//visibility:public"],
-)
-
 package_group(
     name = "internal",
     packages = [
@@ -427,6 +419,7 @@ filegroup(
         "//tensorflow/compiler/xla/client:all_files",
         "//tensorflow/compiler/xla/client/lib:all_files",
         "//tensorflow/compiler/xla/legacy_flags:all_files",
+        "//tensorflow/compiler/xla/python:all_files",
         "//tensorflow/compiler/xla/service:all_files",
         "//tensorflow/compiler/xla/service/cpu:all_files",
         "//tensorflow/compiler/xla/service/gpu:all_files",
@@ -462,6 +455,7 @@ filegroup(
         "//tensorflow/contrib/data/python/ops:all_files",
         "//tensorflow/contrib/decision_trees/proto:all_files",
         "//tensorflow/contrib/distributions:all_files",
+        "//tensorflow/contrib/eager/proto:all_files",
         "//tensorflow/contrib/eager/python:all_files",
         "//tensorflow/contrib/estimator:all_files",
         "//tensorflow/contrib/factorization:all_files",
@@ -562,6 +556,7 @@ filegroup(
         "//tensorflow/contrib/timeseries/python/timeseries/state_space_models:all_files",
         "//tensorflow/contrib/tpu:all_files",
         "//tensorflow/contrib/tpu/profiler:all_files",
+        "//tensorflow/contrib/tpu/proto:all_files",
         "//tensorflow/contrib/training:all_files",
         "//tensorflow/contrib/util:all_files",
         "//tensorflow/contrib/verbs:all_files",
@@ -576,6 +571,8 @@ filegroup(
         "//tensorflow/core/grappler/optimizers:all_files",
         "//tensorflow/core/grappler/utils:all_files",
         "//tensorflow/core/kernels:all_files",
+        "//tensorflow/core/kernels/data:all_files",
+        "//tensorflow/core/kernels/data/sql:all_files",
         "//tensorflow/core/kernels/fuzzing:all_files",
         "//tensorflow/core/kernels/hexagon:all_files",
         "//tensorflow/core/kernels/neon:all_files",
@@ -609,6 +606,7 @@ filegroup(
         "//tensorflow/java/src/main/native:all_files",
         "//tensorflow/python:all_files",
         "//tensorflow/python/data:all_files",
+        "//tensorflow/python/data/kernel_tests:all_files",
         "//tensorflow/python/data/ops:all_files",
         "//tensorflow/python/data/util:all_files",
         "//tensorflow/python/debug:all_files",
@@ -645,6 +643,7 @@ filegroup(
         "//tensorflow/tools/test:all_files",
         "//tensorflow/user_ops:all_files",
         "//third_party/hadoop:all_files",
+        "//third_party/mpi:all_files",
         "//third_party/sycl:all_files",
         "//third_party/sycl/sycl:all_files",
     ],
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index dd638de3c6933fde6214993ae7b15b40b1acf65b..9b5704702841081d7dde78ac019305140066f688 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -383,12 +383,11 @@ void TF_Reset_Helper(const TF_SessionOptions* opt, const char** containers,
 // be less than the total node count.
 Status ValidateNoCycles(const Graph& g) {
   // TODO(nolivia): check this on a subset of the graph instead of all of it.
-  int total_num_nodes = g.num_node_ids();
   // A node is ready when all of its inputs have been visited.
   std::vector<const Node*> ready;
-  std::vector<int> pending_count(total_num_nodes, 0);
+  std::vector<int> pending_count(g.num_node_ids(), 0);
 
-  for (int i = 0; i < total_num_nodes; ++i) {
+  for (int i = 0; i < g.num_node_ids(); ++i) {
     const Node* n = g.FindNodeId(i);
     if (n == nullptr) continue;
     pending_count[i] = n->in_edges().size();
@@ -421,7 +420,7 @@ Status ValidateNoCycles(const Graph& g) {
     }
   }
 
-  if (processed < total_num_nodes) {
+  if (processed < g.num_nodes()) {
     std::vector<string> nodes_in_cycle;
     for (int i = 0; i < pending_count.size() && nodes_in_cycle.size() < 3;
          ++i) {
@@ -430,7 +429,7 @@ Status ValidateNoCycles(const Graph& g) {
       }
     }
     return errors::InvalidArgument(
-        "Graph is invalid, contains a cycle with ", total_num_nodes - processed,
+        "Graph is invalid, contains a cycle with ", g.num_nodes() - processed,
         " nodes, including: ", str_util::Join(nodes_in_cycle, ", "));
   }
   return Status::OK();
@@ -580,6 +579,7 @@ TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src,
       status->status = InvalidArgument(
           "invalid string tensor encoding (string #", i, " of ",
           srcarray.size(), "): ", status->status.error_message());
+      delete[] base;
       return nullptr;
     }
     dst += consumed;
@@ -589,6 +589,7 @@ TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src,
     status->status = InvalidArgument(
         "invalid string tensor encoding (decoded ", (dst - base),
         " bytes, but the tensor is encoded in ", size, " bytes");
+    delete[] base;
     return nullptr;
   }
 
@@ -625,6 +626,23 @@ Status MessageToBuffer(const tensorflow::protobuf::Message& in,
   return Status::OK();
 }
 
+void RecordMutation(TF_Graph* graph, const TF_Operation& op,
+                    const char* mutation_type)
+    EXCLUSIVE_LOCKS_REQUIRED(graph->mu) {
+  // If any session has already run this node_id, mark this session as
+  // unrunnable.
+  for (auto it : graph->sessions) {
+    if (it.first->last_num_graph_nodes > op.node.id()) {
+      it.second = FailedPrecondition(
+          "Operation '", op.node.DebugString(), "' was changed by ",
+          mutation_type,
+          " after it was run by a session. Nodes can be mutated "
+          "only before they are executed by a session. Either don't modify "
+          "nodes after running them or create a new session.");
+    }
+  }
+}
+
 // Helpers for loading a TensorFlow plugin (a .so file).
 Status LoadLibrary(const char* library_filename, void** result,
                    const void** buf, size_t* len);
@@ -939,13 +957,17 @@ void TF_GraphSetTensorShape(TF_Graph* graph, TF_Output output,
     return;
   }
 
-  std::vector<tensorflow::shape_inference::DimensionHandle> dim_vec;
-  dim_vec.reserve(num_dims);
-  for (int i = 0; i < num_dims; ++i) {
-    dim_vec.push_back(ic->MakeDim(dims[i]));
+  tensorflow::shape_inference::ShapeHandle new_shape;
+  if (num_dims != -1) {
+    std::vector<tensorflow::shape_inference::DimensionHandle> dim_vec;
+    dim_vec.reserve(num_dims);
+    for (int i = 0; i < num_dims; ++i) {
+      dim_vec.push_back(ic->MakeDim(dims[i]));
+    }
+    new_shape = ic->MakeShape(dim_vec);
+  } else {
+    new_shape = ic->UnknownShape();
   }
-
-  tensorflow::shape_inference::ShapeHandle new_shape = ic->MakeShape(dim_vec);
   status->status = graph->refiner.SetShape(node, output.index, new_shape);
 }
 
@@ -1741,7 +1763,6 @@ void TF_OperationToNodeDef(TF_Operation* oper, TF_Buffer* output_node_def,
 TF_Graph::TF_Graph()
     : graph(tensorflow::OpRegistry::Global()),
       refiner(graph.versions().producer(), graph.op_registry()),
-      num_sessions(0),
       delete_requested(false),
       parent(nullptr),
       parent_inputs(nullptr) {}
@@ -1751,7 +1772,7 @@ TF_Graph* TF_NewGraph() { return new TF_Graph; }
 void TF_DeleteGraph(TF_Graph* g) {
   g->mu.lock();
   g->delete_requested = true;
-  const bool del = g->num_sessions == 0;
+  const bool del = g->sessions.empty();
   g->mu.unlock();
   if (del) delete g;
 }
@@ -1831,6 +1852,16 @@ void TF_ImportGraphDefOptionsSetPrefix(TF_ImportGraphDefOptions* opts,
   opts->opts.prefix = prefix;
 }
 
+void TF_ImportGraphDefOptionsSetUniquifyNames(TF_ImportGraphDefOptions* opts,
+                                              unsigned char uniquify_names) {
+  opts->opts.uniquify_names = uniquify_names;
+}
+
+void TF_ImportGraphDefOptionsSetUniquifyPrefix(TF_ImportGraphDefOptions* opts,
+                                               unsigned char uniquify_prefix) {
+  opts->opts.uniquify_prefix = uniquify_prefix;
+}
+
 void TF_ImportGraphDefOptionsAddInputMapping(TF_ImportGraphDefOptions* opts,
                                              const char* src_name,
                                              int src_index, TF_Output dst) {
@@ -1888,12 +1919,12 @@ void TF_ImportGraphDefResultsReturnOperations(TF_ImportGraphDefResults* results,
   *opers = results->return_nodes.data();
 }
 
-void TF_ImportGraphDefResultsUnusedInputMappings(
-    TF_ImportGraphDefResults* results, int* num_unused_input_mappings,
+void TF_ImportGraphDefResultsMissingUnusedInputMappings(
+    TF_ImportGraphDefResults* results, int* num_missing_unused_input_mappings,
     const char*** src_names, int** src_indexes) {
-  *num_unused_input_mappings = results->unused_key_names.size();
-  *src_names = results->unused_key_names.data();
-  *src_indexes = results->unused_key_indexes.data();
+  *num_missing_unused_input_mappings = results->missing_unused_key_names.size();
+  *src_names = results->missing_unused_key_names.data();
+  *src_indexes = results->missing_unused_key_indexes.data();
 }
 
 void TF_DeleteImportGraphDefResults(TF_ImportGraphDefResults* results) {
@@ -1933,18 +1964,21 @@ static void GraphImportGraphDefLocked(TF_Graph* graph, const GraphDef& def,
     tf_results->return_nodes[i] = ToOperation(results.return_nodes[i]);
   }
 
-  // Populate unused map keys
-  DCHECK(tf_results->unused_key_names.empty());
-  DCHECK(tf_results->unused_key_indexes.empty());
-  DCHECK(tf_results->unused_key_names_data.empty());
-  tf_results->unused_key_names.resize(results.unused_input_map_keys.size());
-  tf_results->unused_key_indexes.resize(results.unused_input_map_keys.size());
-  for (int i = 0; i < results.unused_input_map_keys.size(); ++i) {
-    TensorId id = results.unused_input_map_keys[i];
-    tf_results->unused_key_names_data.push_back(id.first.ToString());
-    tf_results->unused_key_names[i] =
-        tf_results->unused_key_names_data.back().c_str();
-    tf_results->unused_key_indexes[i] = id.second;
+  // Populate missing unused map keys
+  DCHECK(tf_results->missing_unused_key_names.empty());
+  DCHECK(tf_results->missing_unused_key_indexes.empty());
+  DCHECK(tf_results->missing_unused_key_names_data.empty());
+
+  size_t size = results.missing_unused_input_map_keys.size();
+  tf_results->missing_unused_key_names.resize(size);
+  tf_results->missing_unused_key_indexes.resize(size);
+
+  for (int i = 0; i < size; ++i) {
+    TensorId id = results.missing_unused_input_map_keys[i];
+    tf_results->missing_unused_key_names_data.push_back(id.first.ToString());
+    tf_results->missing_unused_key_names[i] =
+        tf_results->missing_unused_key_names_data.back().c_str();
+    tf_results->missing_unused_key_indexes[i] = id.second;
   }
 }
 
@@ -2321,11 +2355,12 @@ TF_Session* TF_NewSession(TF_Graph* graph, const TF_SessionOptions* opt,
   Session* session;
   status->status = NewSession(opt->options, &session);
   if (status->status.ok()) {
+    TF_Session* new_session = new TF_Session(session, graph);
     if (graph != nullptr) {
       mutex_lock l(graph->mu);
-      graph->num_sessions += 1;
+      graph->sessions[new_session] = Status::OK();
     }
-    return new TF_Session(session, graph);
+    return new_session;
   } else {
     DCHECK_EQ(nullptr, session);
     return nullptr;
@@ -2389,7 +2424,7 @@ TF_Session* TF_LoadSessionFromSavedModel(
 
   TF_Session* session = new TF_Session(bundle.session.release(), graph);
 
-  graph->num_sessions += 1;
+  graph->sessions[session] = Status::OK();
   session->last_num_graph_nodes = graph->graph.num_node_ids();
   return session;
 #endif  // __ANDROID__
@@ -2404,8 +2439,8 @@ void TF_DeleteSession(TF_Session* s, TF_Status* status) {
   TF_Graph* const graph = s->graph;
   if (graph != nullptr) {
     graph->mu.lock();
-    graph->num_sessions -= 1;
-    const bool del = graph->delete_requested && graph->num_sessions == 0;
+    graph->sessions.erase(s);
+    const bool del = graph->delete_requested && graph->sessions.empty();
     graph->mu.unlock();
     if (del) delete graph;
   }
@@ -2421,6 +2456,13 @@ static bool ExtendSessionGraphHelper(TF_Session* session, TF_Status* status) {
     mutex_lock session_lock(session->mu);
     session->graph->mu.lock();
     const Graph& graph = session->graph->graph;
+
+    status->status = session->graph->sessions[session];
+    if (!status->status.ok()) {
+      session->graph->mu.unlock();
+      return false;
+    }
+
     const auto num_nodes = graph.num_node_ids();
     if (session->last_num_graph_nodes < num_nodes) {
       status->status = tensorflow::ValidateNoCycles(session->graph->graph);
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index bb569d67fcbcec29e9494236abd79b3e40db91cd..de9527f86d1f48846b160230c592a398e00e10c5 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -889,6 +889,20 @@ TF_CAPI_EXPORT extern void TF_DeleteImportGraphDefOptions(
 TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsSetPrefix(
     TF_ImportGraphDefOptions* opts, const char* prefix);
 
+// Set whether to uniquify imported operation names. If true, imported operation
+// names will be modified if their name already exists in the graph. If false,
+// conflicting names will be treated as an error. Note that this option has no
+// effect if a prefix is set, since the prefix will guarantee all names are
+// unique. Defaults to false.
+TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsSetUniquifyNames(
+    TF_ImportGraphDefOptions* opts, unsigned char uniquify_names);
+
+// If true, the specified prefix will be modified if it already exists as an
+// operation name or prefix in the graph. If false, a conflicting prefix will be
+// treated as an error. This option has no effect if no prefix is specified.
+TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsSetUniquifyPrefix(
+    TF_ImportGraphDefOptions* opts, unsigned char uniquify_prefix);
+
 // Set any imported nodes with input `src_name:src_index` to have that input
 // replaced with `dst`. `src_name` refers to a node in the graph to be imported,
 // `dst` references a node already existing in the graph being imported into.
@@ -948,16 +962,16 @@ TF_CAPI_EXPORT extern void TF_ImportGraphDefResultsReturnOperations(
     TF_ImportGraphDefResults* results, int* num_opers, TF_Operation*** opers);
 
 // Fetches any input mappings requested via
-// TF_ImportGraphDefOptionsAddInputMapping() that weren't used as input to any
-// node in the imported graph def. The number of fetched mappings is returned in
-// `num_unused_input_mappings`. The array of each mapping's source node name is
-// returned in `src_names`, and the array of each mapping's source index is
-// returned in `src_indexes`.
+// TF_ImportGraphDefOptionsAddInputMapping() that didn't appear in the GraphDef
+// and weren't used as input to any node in the imported graph def. The number
+// of fetched mappings is returned in `num_missing_unused_input_mappings`. The
+// array of each mapping's source node name is returned in `src_names`, and the
+// array of each mapping's source index is returned in `src_indexes`.
 //
 // `*src_names`, `*src_indexes`, and the memory backing each string in
 // `src_names` are owned by and have the lifetime of `results`.
-TF_CAPI_EXPORT extern void TF_ImportGraphDefResultsUnusedInputMappings(
-    TF_ImportGraphDefResults* results, int* num_unused_input_mappings,
+TF_CAPI_EXPORT extern void TF_ImportGraphDefResultsMissingUnusedInputMappings(
+    TF_ImportGraphDefResults* results, int* num_missing_unused_input_mappings,
     const char*** src_names, int** src_indexes);
 
 // Deletes a results object returned by TF_GraphImportGraphDefWithResults().
diff --git a/tensorflow/c/c_api_function.cc b/tensorflow/c/c_api_function.cc
index dcb818b88b6fca460852beb6e948d2eb6964f663..d60d1de315ed37a327bd036ddb914a3c32413f65 100644
--- a/tensorflow/c/c_api_function.cc
+++ b/tensorflow/c/c_api_function.cc
@@ -68,7 +68,7 @@ class NodeNameMapping {
   // This is a superset of values in name_mapping_.
   std::unordered_set<string> used_names_;
   // Mapping from original node name from the graph to the normalized
-  // and uniqified version of it.
+  // and uniquified version of it.
   std::unordered_map<string, string> name_mapping_;
 };
 
@@ -226,12 +226,17 @@ Status FillFunctionBody(
       }
       node_def->add_input(strings::StrCat("^", normalized));
     }
+
+    // A function is stateful if any of its nodes are stateful.
+    if (node->op_def().is_stateful()) {
+      fdef->mutable_signature()->set_is_stateful(true);
+    }
   }
   return Status::OK();
 }
 
 // Graph to FunctionDef conversion. This code is closely modeled on the Python
-// code in third_party/tensorflow/python/framework/function.py.
+// code in tensorflow/python/framework/function.py.
 Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name,
                           bool append_hash_to_fn_name,
                           const std::vector<const Node*>& body_nodes,
diff --git a/tensorflow/c/c_api_function_test.cc b/tensorflow/c/c_api_function_test.cc
index d5580b658992413ae6f9cb79ef88751ee28ce465..2e2293ca85175009d1bcc8db5c830789e4701c1d 100644
--- a/tensorflow/c/c_api_function_test.cc
+++ b/tensorflow/c/c_api_function_test.cc
@@ -1482,6 +1482,51 @@ TEST_F(CApiFunctionTest, GetOpDef) {
   EXPECT_EQ(op_def.name(), func_name_);
   EXPECT_EQ(op_def.input_arg_size(), 1);
   EXPECT_EQ(op_def.output_arg_size(), 1);
+  EXPECT_FALSE(op_def.is_stateful());
+
+  TF_DeleteBuffer(buffer);
+}
+
+void DefineStatefulFunction(const char* name, TF_Function** func) {
+  std::unique_ptr<TF_Graph, decltype(&TF_DeleteGraph)> func_graph(
+      TF_NewGraph(), TF_DeleteGraph);
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> s(TF_NewStatus(),
+                                                           TF_DeleteStatus);
+
+  TF_Tensor* tensor_shape = Int32Tensor({37, 1});
+  TF_Operation* shape = Const(tensor_shape, func_graph.get(), s.get(), "shape");
+  TF_Operation* random =
+      RandomUniform(shape, TF_FLOAT, func_graph.get(), s.get());
+
+  TF_Output inputs[] = {};
+  TF_Output outputs[] = {{random, 0}};
+  *func = TF_GraphToFunction(func_graph.get(), name, /*append_hash=*/false, -1,
+                             /*opers=*/nullptr, 0, inputs, 1, outputs,
+                             /*output_names=*/nullptr,
+                             /*opts=*/nullptr, "", s.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(s.get())) << TF_Message(s.get());
+  ASSERT_NE(*func, nullptr);
+  TF_DeleteTensor(tensor_shape);
+}
+
+TEST_F(CApiFunctionTest, StatefulOpDef) {
+  DefineStatefulFunction(func_name_, &func_);
+  TF_GraphCopyFunction(host_graph_, func_, nullptr, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+  // Test we can retrieve function OpDef from graph
+  TF_Buffer* buffer = TF_NewBuffer();
+  TF_GraphGetOpDef(host_graph_, func_name_, buffer, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+  // Sanity check returned OpDef
+  string data(static_cast<const char*>(buffer->data), buffer->length);
+  OpDef op_def;
+  op_def.ParseFromString(data);
+  EXPECT_EQ(op_def.name(), func_name_);
+  EXPECT_EQ(op_def.input_arg_size(), 0);
+  EXPECT_EQ(op_def.output_arg_size(), 1);
+  EXPECT_TRUE(op_def.is_stateful());
 
   TF_DeleteBuffer(buffer);
 }
diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h
index bb04e01beec931a8ea66d0855eec9625d3a6a5ab..6df77a7f9baed999a2f2cb5e9404cb63451b6212 100644
--- a/tensorflow/c/c_api_internal.h
+++ b/tensorflow/c/c_api_internal.h
@@ -81,12 +81,20 @@ struct TF_Graph {
   std::unordered_map<tensorflow::string, tensorflow::Node*> name_map
       GUARDED_BY(mu);
 
-  // TF_Graph may only / must be deleted when
-  //   num_sessions == 0 && delete_requested == true
-
-  // num_sessions incremented by TF_NewSession, and decremented by
+  // The keys of this map are all the active sessions using this graph.
+  // Each value is the current "runnability" status of the corresponding
+  // session. Under normal conditions all statuses are Status::OK(), but
+  // if some operation is mutated after it was run by a session (this
+  // is detected in RecordMutation function), that session is no longer
+  // safe to run. Its status will contain the error that will be returned
+  // to the user, should she try running this session.
+  //
+  // Sessions are added to this map in TF_NewSession, and removed in
   // TF_DeleteSession.
-  int num_sessions GUARDED_BY(mu);
+  // TF_Graph may only / must be deleted when
+  //   sessions.size() == 0 && delete_requested == true
+  tensorflow::gtl::FlatMap<TF_Session*, tensorflow::Status> sessions
+      GUARDED_BY(mu);
   bool delete_requested GUARDED_BY(mu);  // set true by TF_DeleteGraph
 
   // Used to link graphs contained in TF_WhileParams to the parent graph that
@@ -135,11 +143,11 @@ struct TF_ImportGraphDefOptions {
 struct TF_ImportGraphDefResults {
   std::vector<TF_Output> return_tensors;
   std::vector<TF_Operation*> return_nodes;
-  std::vector<const char*> unused_key_names;
-  std::vector<int> unused_key_indexes;
+  std::vector<const char*> missing_unused_key_names;
+  std::vector<int> missing_unused_key_indexes;
 
-  // Backing memory for unused_key_names values.
-  std::list<tensorflow::string> unused_key_names_data;
+  // Backing memory for missing_unused_key_names values.
+  std::list<tensorflow::string> missing_unused_key_names_data;
 };
 
 struct TF_DeviceList {
@@ -167,6 +175,9 @@ TF_Tensor* TF_TensorFromTensor(const Tensor& src, TF_Status* status);
 
 Status MessageToBuffer(const tensorflow::protobuf::Message& in, TF_Buffer* out);
 
+void RecordMutation(TF_Graph* graph, const TF_Operation& op,
+                    const char* mutation_type);
+
 }  // end namespace tensorflow
 
 #endif  // TENSORFLOW_C_C_API_INTERNAL_H_
diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc
index e0057eb51cd82e8d9ed5fcf56e296f9fb0c2fe40..4e89b4fc43973e4cc9a6c64f50e288d81bf22033 100644
--- a/tensorflow/c/c_api_test.cc
+++ b/tensorflow/c/c_api_test.cc
@@ -287,6 +287,13 @@ TEST(CAPI, SetShape) {
   ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
   EXPECT_EQ(-1, num_dims);
 
+  // Set the shape to be unknown, expect no change.
+  TF_GraphSetTensorShape(graph, feed_out_0, /*dims=*/nullptr, -1, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  num_dims = TF_GraphGetTensorNumDims(graph, feed_out_0, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  EXPECT_EQ(-1, num_dims);
+
   // Set the shape to be 2 x Unknown
   int64_t dims[] = {2, -1};
   TF_GraphSetTensorShape(graph, feed_out_0, dims, 2, s);
@@ -315,7 +322,17 @@ TEST(CAPI, SetShape) {
   EXPECT_EQ(dims[0], returned_dims[0]);
   EXPECT_EQ(dims[1], returned_dims[1]);
 
-  // Try to set 'unknown' on the shape and see that
+  // Try to set 'unknown' with unknown rank on the shape and see that
+  // it doesn't change.
+  TF_GraphSetTensorShape(graph, feed_out_0, /*dims=*/nullptr, -1, s);
+  EXPECT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_GraphGetTensorShape(graph, feed_out_0, returned_dims, num_dims, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  EXPECT_EQ(2, num_dims);
+  EXPECT_EQ(2, returned_dims[0]);
+  EXPECT_EQ(3, returned_dims[1]);
+
+  // Try to set 'unknown' with same rank on the shape and see that
   // it doesn't change.
   dims[0] = -1;
   dims[1] = -1;
@@ -756,7 +773,7 @@ TEST(CAPI, ImportGraphDef_WithReturnOutputs) {
   TF_DeleteStatus(s);
 }
 
-TEST(CAPI, ImportGraphDef_UnusedInputMappings) {
+TEST(CAPI, ImportGraphDef_MissingUnusedInputMappings) {
   TF_Status* s = TF_NewStatus();
   TF_Graph* graph = TF_NewGraph();
 
@@ -799,7 +816,7 @@ TEST(CAPI, ImportGraphDef_UnusedInputMappings) {
   int num_unused_input_mappings;
   const char** src_names;
   int* src_indexes;
-  TF_ImportGraphDefResultsUnusedInputMappings(
+  TF_ImportGraphDefResultsMissingUnusedInputMappings(
       results, &num_unused_input_mappings, &src_names, &src_indexes);
   ASSERT_EQ(1, num_unused_input_mappings);
   EXPECT_EQ(string("fake"), string(src_names[0]));
diff --git a/tensorflow/c/c_test_util.cc b/tensorflow/c/c_test_util.cc
index c291a2e440a8515e968b0ce0395b289080f04e8b..37439ff0beac5a5220460465e954b6c093ee1ba9 100644
--- a/tensorflow/c/c_test_util.cc
+++ b/tensorflow/c/c_test_util.cc
@@ -193,6 +193,15 @@ TF_Operation* LessThan(TF_Output l, TF_Output r, TF_Graph* graph,
   return TF_FinishOperation(desc, s);
 }
 
+TF_Operation* RandomUniform(TF_Operation* shape, TF_DataType dtype,
+                            TF_Graph* graph, TF_Status* s) {
+  TF_OperationDescription* desc =
+      TF_NewOperation(graph, "RandomUniform", "random_uniform");
+  TF_AddInput(desc, {shape, 0});
+  TF_SetAttrType(desc, "dtype", dtype);
+  return TF_FinishOperation(desc, s);
+}
+
 void Split3Helper(TF_Operation* input, TF_Graph* graph, TF_Status* s,
                   const char* name, TF_Operation** op) {
   TF_Operation* zero = ScalarConst(
diff --git a/tensorflow/c/c_test_util.h b/tensorflow/c/c_test_util.h
index d54733749248fa32c39d88bb0281d329dd50c7bd..3429009a71a863ae6b69b5cd29ace3c7fd078f4c 100644
--- a/tensorflow/c/c_test_util.h
+++ b/tensorflow/c/c_test_util.h
@@ -74,7 +74,10 @@ TF_Operation* Neg(TF_Operation* n, TF_Graph* graph, TF_Status* s,
 
 TF_Operation* LessThan(TF_Output l, TF_Output r, TF_Graph* graph, TF_Status* s);
 
-// Split `input` along the first dimention into 3 tensors
+TF_Operation* RandomUniform(TF_Operation* shape, TF_DataType dtype,
+                            TF_Graph* graph, TF_Status* s);
+
+// Split `input` along the first dimension into 3 tensors
 TF_Operation* Split3(TF_Operation* input, TF_Graph* graph, TF_Status* s,
                      const char* name = "split3");
 
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 8359de62b7ff690fec9f6a0e3280f947c62f8b6e..706c89536db019c7f7389af576815746b2425520 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -571,6 +571,12 @@ void TFE_ContextAddFunctionDef(TFE_Context* ctx,
   status->status = ctx->func_lib_def.AddFunctionDef(function_def);
 }
 
+void TFE_ContextAddFunction(TFE_Context* ctx, TF_Function* function,
+                            TF_Status* status) {
+  tensorflow::mutex_lock l(ctx->functions_mu);
+  status->status = ctx->func_lib_def.AddFunctionDef(function->fdef);
+}
+
 }  // extern "C"
 
 TFE_TensorHandle* TFE_NewTensorHandle(const tensorflow::Tensor& t) {
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index 865580c5f3a823d9cf49fe460bd007e3b3b88767..ca105962df0d6655946304159937621022e7fcba 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -200,6 +200,13 @@ TF_CAPI_EXPORT extern void TFE_ContextAddFunctionDef(TFE_Context* ctx,
                                                      const char* serialized_function_def,
                                                      size_t size, TF_Status* status);
 
+// Adds a function (created from TF_GraphToFunction or
+// TF_FunctionImportFunctionDef) to the context, allowing it to be executed with
+// TFE_Execute by creating an op with the same name as the function.
+TF_CAPI_EXPORT extern void TFE_ContextAddFunction(TFE_Context* ctx,
+                                                  TF_Function* function,
+                                                  TF_Status* status);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index 4af91b8853d0e85570bad136752a9d0a04b87da5..3fe0b7efa11bc619ed98bf9a1634ade5b6ed0a7c 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -295,6 +295,67 @@ TEST(CAPI, Execute) {
   TF_DeleteStatus(status);
 }
 
+TEST(CAPI, Function) {
+  // First create a simple identity function.
+  TF_Graph* function_graph = TF_NewGraph();
+  TF_OperationDescription* arg_descr =
+      TF_NewOperation(function_graph, "Placeholder", "arg");
+  TF_SetAttrType(arg_descr, "dtype", TF_INT32);
+  TF_Status* status = TF_NewStatus();
+  TF_Operation* arg = TF_FinishOperation(arg_descr, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TF_OperationDescription* id_descr =
+      TF_NewOperation(function_graph, "Identity", "id");
+  TF_SetAttrType(id_descr, "T", TF_INT32);
+  TF_AddInput(id_descr, {arg, 0});
+  TF_Operation* id = TF_FinishOperation(id_descr, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TF_Output input{arg, 0};
+  TF_Output output{id, 0};
+  TF_Function* fn =
+      TF_GraphToFunction(function_graph, "ident", 0, 1, &id, 1, &input, 1,
+                         &output, nullptr, nullptr, "test", status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TF_DeleteGraph(function_graph);
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+  TFE_ContextAddFunction(ctx, fn, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TF_DeleteFunction(fn);
+
+  TF_Tensor* t =
+      TF_AllocateTensor(TF_INT32, nullptr, 0, 1 * sizeof(tensorflow::int32));
+  *reinterpret_cast<tensorflow::int32*>(TF_TensorData(t)) = 42;
+  TFE_TensorHandle* h = TFE_NewTensorHandle(t, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TF_DeleteTensor(t);
+
+  TFE_Op* op = TFE_NewOp(ctx, "ident", status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TFE_OpAddInput(op, h, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+
+  std::vector<TFE_TensorHandle*> result;
+  result.push_back(nullptr);
+  int num_retvals = 1;
+  TFE_Execute(op, result.data(), &num_retvals, status);
+  TFE_DeleteOp(op);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  ASSERT_EQ(num_retvals, 1);
+
+  TF_Tensor* r = TFE_TensorHandleResolve(result[0], status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  EXPECT_EQ(*reinterpret_cast<tensorflow::int32*>(TF_TensorData(r)), 42);
+  TFE_DeleteTensorHandle(h);
+  TF_DeleteTensor(r);
+  TFE_DeleteTensorHandle(result[0]);
+  TFE_DeleteContext(ctx, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TF_DeleteStatus(status);
+}
+
 string MatMulFunction() {
   tensorflow::FunctionDef def;
   CHECK(tensorflow::protobuf::TextFormat::ParseFromString(
diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h
index 29d73c5ca43a9ad3dbbc5d0f9c08b0b704724b03..2b65e38f54090af6731685f78d5f7f914a875e3c 100644
--- a/tensorflow/c/eager/tape.h
+++ b/tensorflow/c/eager/tape.h
@@ -106,6 +106,12 @@ class VSpace {
 
   // Deletes the input tensor.
   virtual void DeleteGradient(Gradient* gradient) const = 0;
+
+  // Lets this VSpace know that it can release resources held by the
+  // `backward_function`, It will not be called again.
+  // `backward_function` must not be null.
+  virtual void ReleaseBackwardFunction(
+      BackwardFunction* backward_function) const = 0;
 };
 
 // Traces the execution of operations, doing eager garbage collection, and
@@ -113,7 +119,11 @@ class VSpace {
 template <typename Gradient, typename BackwardFunction>
 class GradientTape {
  public:
-  GradientTape() {}
+  // If `persistent` is true, GradientTape will not eagerly delete backward
+  // functions (and hence the tensors they keep alive). Instead, everything
+  // is deleted in ~GradientTape. Persistent GradientTapes are useful when
+  // users want to compute multiple gradients over the same tape.
+  GradientTape(bool persistent) : persistent_(persistent) {}
   ~GradientTape() {
     for (const auto& pair : op_tape_) {
       pair.second.backward_function_deleter();
@@ -150,6 +160,10 @@ class GradientTape {
   // Map from tensor id to number of remaining usages (i.e. how many entries in
   // the tape refer to it); to aid in tape garbage collection.
   std::unordered_map<int64, int64> tensor_usage_;
+
+  // If false, all activations are deleted in the first call to ComputeGradient.
+  // Else, only when this is destructed.
+  bool persistent_;
 };
 
 // Template instantiations here
@@ -279,11 +293,16 @@ struct BackpropInitialState {
   std::unordered_map<int64, int64> op_missing_tensor;
 };
 
+// If `persistent_tape` is true, op_tape is not changed and none of the
+// backwards functions are deleted.
+// If `persistent_tape` is false, op_tape is cleared and backwards functions
+// not needed for gradient computation are deleted. Backwards functions that
+// are needed, are copied and returned in BackpropInitialState.
 template <typename BackwardFunction>
 BackpropInitialState<BackwardFunction> PrepareBackprop(
     gtl::ArraySlice<int64> target, const TensorTape& tensor_tape,
-    OpTape<BackwardFunction> op_tape,
-    const std::unordered_set<int64>& sources_set) {
+    OpTape<BackwardFunction>* op_tape,
+    const std::unordered_set<int64>& sources_set, bool persistent_tape) {
   std::vector<int64> tensor_stack;
   tensor_stack.reserve(target.size());
   for (auto t : target) {
@@ -298,9 +317,9 @@ BackpropInitialState<BackwardFunction> PrepareBackprop(
       continue;
     }
     int64 op_id = op_id_it->second;
-    auto op_it = op_tape.find(op_id);
+    auto op_it = op_tape->find(op_id);
     auto result_op_it = result.op_tape.find(op_id);
-    if (op_id == -1 || op_it == op_tape.end() ||
+    if (op_id == -1 || op_it == op_tape->end() ||
         result_op_it != result.op_tape.end()) {
       continue;
     }
@@ -317,7 +336,9 @@ BackpropInitialState<BackwardFunction> PrepareBackprop(
         }
       }
     }
-    op_tape.erase(op_it);
+    if (!persistent_tape) {
+      op_tape->erase(op_it);
+    }
   }
   for (auto& pair : result.tensor_usage_counts) {
     auto it = tensor_tape.find(pair.first);
@@ -325,9 +346,15 @@ BackpropInitialState<BackwardFunction> PrepareBackprop(
       result.op_missing_tensor[it->second] += 1;
     }
   }
-  // Call destructors for all unneeded gradient functions.
-  for (const auto& op_pair : op_tape) {
-    op_pair.second.backward_function_deleter();
+  if (!persistent_tape) {
+    // Call destructors for all unneeded gradient functions and
+    // clear the op_tape. We can clear the tape because ownership of
+    // backward functions that will be used for gradient computation
+    // has been transferred to `result`.
+    for (const auto& op_pair : *op_tape) {
+      op_pair.second.backward_function_deleter();
+    }
+    op_tape->clear();
   }
   return result;
 }
@@ -369,7 +396,8 @@ Status InitialGradients(
           auto op_it = op_tape.find(tensor_it->second);
           if (op_it == op_tape.end()) {
             return errors::Internal(
-                "Internal state of the gradient tape is invalid.");
+                "Internal state of the gradient tape is invalid: "
+                "failed to find operation producing a tensor");
           }
           bool found = false;
           for (int j = 0; j < op_it->second.output_tensor_info.size(); ++j) {
@@ -383,7 +411,8 @@ Status InitialGradients(
           }
           if (!found) {
             return errors::Internal(
-                "Internal state of the gradient tape is invalid.");
+                "Internal state of the gradient tape is invalid: "
+                "none of operations outputs match expected tensor");
           }
         } else {
           // No record of the target tensor found on the tape, so no gradient
@@ -415,17 +444,19 @@ Status GradientTape<Gradient, BackwardFunction>::ComputeGradient(
   std::unordered_set<int64> sources_set(source_tensor_ids.begin(),
                                         source_tensor_ids.end());
   BackpropInitialState<BackwardFunction> state = PrepareBackprop(
-      target_tensor_ids, tensor_tape_, std::move(op_tape_), sources_set);
+      target_tensor_ids, tensor_tape_, &op_tape_, sources_set, persistent_);
   std::vector<int64> op_stack =
       InitialStack(state.op_tape, state.op_missing_tensor);
   std::unordered_map<int64, std::vector<Gradient*>> gradients;
   Status s = InitialGradients(vspace, target_tensor_ids, output_gradients,
                               tensor_tape_, state.op_tape,
                               state.tensor_usage_counts, &gradients);
-  auto cleanup = [&state]() {
-    // Release all backprop functions
-    for (const auto& pair : state.op_tape) {
-      pair.second.backward_function_deleter();
+  auto cleanup = [this, &state]() {
+    if (!persistent_) {
+      // Release all backprop functions
+      for (const auto& pair : state.op_tape) {
+        pair.second.backward_function_deleter();
+      }
     }
   };
   if (!s.ok()) {
@@ -460,6 +491,7 @@ Status GradientTape<Gradient, BackwardFunction>::ComputeGradient(
     state.op_tape.erase(op_it);
     std::vector<Gradient*> out_gradients;
     out_gradients.reserve(trace.output_tensor_info.size());
+    bool any_gradient_nonzero = false;
     for (int i = 0; i < trace.output_tensor_info.size(); ++i) {
       const int64 id = trace.output_tensor_info[i].id;
       auto grad_it = gradients.find(id);
@@ -475,6 +507,7 @@ Status GradientTape<Gradient, BackwardFunction>::ComputeGradient(
                            trace.output_tensor_info[i].dtype));
         }
       } else {
+        any_gradient_nonzero = true;
         out_gradients.push_back(vspace.AggregateGradients(grad_it->second));
         if (sources_set.find(grad_it->first) == sources_set.end()) {
           gradients.erase(grad_it);
@@ -482,12 +515,26 @@ Status GradientTape<Gradient, BackwardFunction>::ComputeGradient(
       }
     }
     std::vector<Gradient*> in_gradients;
-    Status s = vspace.CallBackwardFunction(trace.backward_function,
-                                           out_gradients, &in_gradients);
-    if (!s.ok()) {
-      VLOG(1) << "Gradient function failed.";
-      cleanup();
-      return s;
+    if (any_gradient_nonzero) {
+      Status s = vspace.CallBackwardFunction(trace.backward_function,
+                                             out_gradients, &in_gradients);
+      if (!persistent_) {
+        vspace.ReleaseBackwardFunction(trace.backward_function);
+      }
+      if (!s.ok()) {
+        cleanup();
+        return s;
+      }
+    } else {
+      in_gradients.resize(trace.input_tensor_id.size());
+      if (!persistent_) {
+        vspace.ReleaseBackwardFunction(trace.backward_function);
+      }
+      for (Gradient* grad : out_gradients) {
+        if (grad != nullptr) {
+          vspace.DeleteGradient(grad);
+        }
+      }
     }
     VLOG(1) << "Got " << in_gradients.size() << " in_gradients for "
             << trace.input_tensor_id.size() << " sources";
diff --git a/tensorflow/c/python_api.cc b/tensorflow/c/python_api.cc
index ba5a9268b4f671499590d66fb41060dd18e1ce47..6e37cdb5f4beea53d4a2ded0705ae482d0bc2d68 100644
--- a/tensorflow/c/python_api.cc
+++ b/tensorflow/c/python_api.cc
@@ -22,6 +22,7 @@ namespace tensorflow {
 void AddControlInput(TF_Graph* graph, TF_Operation* op, TF_Operation* input) {
   mutex_lock l(graph->mu);
   graph->graph.AddControlEdge(&input->node, &op->node);
+  RecordMutation(graph, *op, "adding control input");
 }
 
 void SetAttr(TF_Graph* graph, TF_Operation* op, const char* attr_name,
@@ -36,11 +37,13 @@ void SetAttr(TF_Graph* graph, TF_Operation* op, const char* attr_name,
 
   mutex_lock l(graph->mu);
   op->node.AddAttr(attr_name, attr_val);
+  RecordMutation(graph, *op, "setting attribute");
 }
 
 void SetRequestedDevice(TF_Graph* graph, TF_Operation* op, const char* device) {
   mutex_lock l(graph->mu);
   op->node.set_requested_device(device);
+  RecordMutation(graph, *op, "setting device");
 }
 
 void UpdateEdge(TF_Graph* graph, TF_Output new_src, TF_Input dst,
@@ -75,6 +78,25 @@ void UpdateEdge(TF_Graph* graph, TF_Output new_src, TF_Input dst,
   }
   status->status = graph->graph.UpdateEdge(&new_src.oper->node, new_src.index,
                                            &dst.oper->node, dst.index);
+
+  if (status->status.ok()) {
+    // This modification only updates the destination node for
+    // the purposes of running this graph in a session. Thus, we don't
+    // record the source node as being modified.
+    RecordMutation(graph, *dst.oper, "updating input tensor");
+  }
+}
+
+void RemoveAllControlInputs(TF_Graph* graph, TF_Operation* op) {
+  mutex_lock l(graph->mu);
+  std::vector<const Edge*> control_edges;
+  for (const Edge* edge : op->node.in_edges()) {
+    if (!edge->IsControlEdge()) continue;
+    control_edges.push_back(edge);
+  }
+  for (const Edge* edge : control_edges) {
+    graph->graph.RemoveControlEdge(edge);
+  }
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/c/python_api.h b/tensorflow/c/python_api.h
index f54585b0a1034ff108202272a11416e34985959e..b51ef2b53122802fef598a26bd6f1843976f11b0 100644
--- a/tensorflow/c/python_api.h
+++ b/tensorflow/c/python_api.h
@@ -35,6 +35,8 @@ void SetRequestedDevice(TF_Graph* graph, TF_Operation* op, const char* device);
 void UpdateEdge(TF_Graph* graph, TF_Output new_src, TF_Input dst,
                 TF_Status* status);
 
+void RemoveAllControlInputs(TF_Graph* graph, TF_Operation* op);
+
 }  // namespace tensorflow
 
 #endif  // THIRD_PARTY_TENSORFLOW_C_PYTHON_API_H_
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index 80112f9b44b1d5fd65a7d47788b072dc47a2b29a..e354831d7d25af83c068a68a4f844056263a598c 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -421,6 +421,7 @@ tf_cc_test(
 
 tf_gen_op_wrappers_cc(
     name = "cc_ops",
+    api_def_srcs = ["//tensorflow/core:base_api_def"],
     op_lib_names = [
         "array_ops",
         "audio_ops",
@@ -525,6 +526,30 @@ cc_library_with_android_deps(
         "//tensorflow/core:android_tensorflow_lib",
     ],
     copts = tf_copts(),
+    data = [
+        "//tensorflow/core:base_api_def",
+    ],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:op_gen_lib",
+        "//tensorflow/core:op_gen_overrides_proto_cc",
+        "//tensorflow/core:proto_text",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_cc_test(
+    name = "cc_op_gen_test",
+    srcs = [
+        "framework/cc_op_gen.cc",
+        "framework/cc_op_gen.h",
+        "framework/cc_op_gen_test.cc",
+    ],
+    data = [
+        "//tensorflow/cc:ops/op_gen_overrides.pbtxt",
+    ],
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -533,6 +558,8 @@ cc_library_with_android_deps(
         "//tensorflow/core:op_gen_overrides_proto_cc",
         "//tensorflow/core:proto_text",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
     ],
 )
 
diff --git a/tensorflow/cc/framework/cc_op_gen.cc b/tensorflow/cc/framework/cc_op_gen.cc
index 38a17598b8e4161f96ab8134823de033d3284440..d889c518f9c38a9f070970b37a2ad4b1fc26671b 100644
--- a/tensorflow/cc/framework/cc_op_gen.cc
+++ b/tensorflow/cc/framework/cc_op_gen.cc
@@ -18,10 +18,11 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/cc/framework/cc_op_gen.h"
+#include "tensorflow/core/framework/api_def.pb.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/framework/op_gen_lib.h"
-#include "tensorflow/core/framework/op_gen_overrides.pb.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.pb_text.h"
@@ -35,7 +36,6 @@ limitations under the License.
 #include "tensorflow/core/public/version.h"
 
 namespace tensorflow {
-
 namespace {
 
 const int kRightMargin = 79;
@@ -297,7 +297,7 @@ string ToCamelCase(const string& str) {
 // argument to a function.
 std::pair<const char*, bool> AttrTypeName(StringPiece attr_type) {
   static const std::unordered_map<StringPiece, std::pair<const char*, bool>,
-                                  StringPiece::Hasher>
+                                  StringPieceHasher>
       attr_type_map{
           {"string", {"StringPiece", false}},
           {"list(string)", {"gtl::ArraySlice<string>", true}},
@@ -325,29 +325,112 @@ std::pair<const char*, bool> AttrTypeName(StringPiece attr_type) {
 }
 
 bool IsCPPKeyword(StringPiece name) {
-  static const std::unordered_set<StringPiece, StringPiece::Hasher>
+  static const std::unordered_set<StringPiece, StringPieceHasher>
       // Keywords obtained from http://en.cppreference.com/w/cpp/keyword
       kCPPReserved{
-          "alignas", "alignof", "and", "and_eq", "asm", "atomic_cancel",
-          "atomic_commit", "atomic_noexcept", "auto", "bitand", "bitor", "bool",
-          "break", "case", "catch", "char", "char16_t", "char32_t", "class",
-          "compl", "concept", "const", "const_cast", "constexpr", "continue",
-          "decltype", "default", "delete", "do", "double", "dynamic_cast",
-          "else", "enum", "explicit", "export", "extern", "false", "final",
-          "float", "for", "friend", "goto", "if", "import", "inline", "int",
-          "long", "module", "mutable", "namespace", "new", "noexcept", "not",
-          "not_eq", "nullptr", "operator", "or", "or_eq", "override", "private",
-          "protected", "public", "register", "reinterpret_cast", "requires",
-          "return", "short", "signed", "sizeof", "static", "static_assert",
-          "static_cast", "struct", "switch", "synchronized", "template", "this",
-          "thread_local", "throw", "true", "try", "typedef", "typeid",
-          "typename", "union", "unsigned", "using", "virtual", "void",
-          "volatile", "wchar_t", "while", "xor", "xor_eq",
+          "alignas",
+          "alignof",
+          "and",
+          "and_eq",
+          "asm",
+          "atomic_cancel",
+          "atomic_commit",
+          "atomic_noexcept",
+          "auto",
+          "bitand",
+          "bitor",
+          "bool",
+          "break",
+          "case",
+          "catch",
+          "char",
+          "char16_t",
+          "char32_t",
+          "class",
+          "compl",
+          "concept",
+          "const",
+          "const_cast",
+          "constexpr",
+          "continue",
+          "decltype",
+          "default",
+          "delete",
+          "do",
+          "double",
+          "dynamic_cast",
+          "else",
+          "enum",
+          "explicit",
+          "export",
+          "extern",
+          "false",
+          "final",
+          "float",
+          "for",
+          "friend",
+          "goto",
+          "if",
+          "import",
+          "inline",
+          "int",
+          "long",
+          "module",
+          "mutable",
+          "namespace",
+          "new",
+          "noexcept",
+          "not",
+          "not_eq",
+          "nullptr",
+          "operator",
+          "or",
+          "or_eq",
+          "override",
+          "private",
+          "protected",
+          "public",
+          "register",
+          "reinterpret_cast",
+          "requires",
+          "return",
+          "short",
+          "signed",
+          "sizeof",
+          "static",
+          "static_assert",
+          "static_cast",
+          "struct",
+          "switch",
+          "synchronized",
+          "template",
+          "this",
+          "thread_local",
+          "throw",
+          "true",
+          "try",
+          "typedef",
+          "typeid",
+          "typename",
+          "union",
+          "unsigned",
+          "using",
+          "virtual",
+          "void",
+          "volatile",
+          "wchar_t",
+          "while",
+          "xor",
+          "xor_eq",
 
           // The following are not C++ keywords, but names of local variables
           // and parameters used in the op constructor. Treating them as
           // keywords, so that other parameter names don't conflict with these.
-          "builder", "node", "ret", "scope", "unique_name",
+          "builder",
+          "node",
+          "ret",
+          "scope",
+          "unique_name",
       };
   return kCPPReserved.count(name) > 0;
 }
@@ -385,10 +468,10 @@ bool ArgIsList(const OpDef::ArgDef& arg) {
 }
 
 bool HasOptionalAttrs(
-    const OpDef& op_def,
+    const ApiDef& api_def,
     const std::unordered_map<string, string>& inferred_input_attrs) {
-  for (int i = 0; i < op_def.attr_size(); ++i) {
-    const auto& attr(op_def.attr(i));
+  for (int i = 0; i < api_def.attr_size(); ++i) {
+    const auto& attr(api_def.attr(i));
     if ((inferred_input_attrs.find(attr.name()) ==
          inferred_input_attrs.end()) &&
         attr.has_default_value()) {
@@ -398,12 +481,21 @@ bool HasOptionalAttrs(
   return false;
 }
 
+const ApiDef::Arg* FindInputArg(StringPiece name, const ApiDef& api_def) {
+  for (int i = 0; i < api_def.in_arg_size(); ++i) {
+    if (api_def.in_arg(i).name() == name) {
+      return &api_def.in_arg(i);
+    }
+  }
+  return nullptr;
+}
+
 struct OpInfo {
   // graph_op_def: The OpDef used by the runtime, has the names that
   //   must be used when calling NodeBuilder.
   // interface_op_def: The OpDef used in the interface in the generated
   //   code, with possibly overridden names and defaults.
-  explicit OpInfo(const OpDef& graph_op_def, const OpDef& inteface_op_def,
+  explicit OpInfo(const OpDef& graph_op_def, const ApiDef& api_def,
                   const std::vector<string>& aliases);
   string GetOpAttrStruct() const;
   string GetConstructorDecl(StringPiece op_name_prefix,
@@ -423,74 +515,81 @@ struct OpInfo {
   string comment;
 
   const OpDef& graph_op_def;
-  const OpDef& op_def;
+  const ApiDef& api_def;
   const std::vector<string>& aliases;
+  // Map from type attribute to corresponding original argument name.
   std::unordered_map<string, string> inferred_input_attrs;
 };
 
-OpInfo::OpInfo(const OpDef& g_op_def, const OpDef& i_op_def,
-               const std::vector<string>& a)
-    : graph_op_def(g_op_def), op_def(i_op_def), aliases(a) {
-  op_name = op_def.name();
-  InferOpAttributes(op_def, &inferred_input_attrs);
-  has_optional_attrs = HasOptionalAttrs(op_def, inferred_input_attrs);
+OpInfo::OpInfo(const OpDef& graph_op_def, const ApiDef& api_def,
+               const std::vector<string>& aliases)
+    : graph_op_def(graph_op_def), api_def(api_def), aliases(aliases) {
+  op_name = api_def.endpoint(0).name();
+  InferOpAttributes(graph_op_def, &inferred_input_attrs);
+  has_optional_attrs = HasOptionalAttrs(api_def, inferred_input_attrs);
   arg_types.push_back("const ::tensorflow::Scope&");
   arg_names.push_back("scope");
 
-  if (op_def.has_deprecation()) {
-    if (!op_def.summary().empty()) {
-      comment = strings::StrCat(op_def.summary(), "\n");
+  if (graph_op_def.has_deprecation()) {
+    if (!api_def.summary().empty()) {
+      comment = strings::StrCat(api_def.summary(), "\n");
     }
     strings::StrAppend(&comment, "DEPRECATED at GraphDef version ",
-                       op_def.deprecation().version(), ":\n",
-                       op_def.deprecation().explanation(), ".\n");
-  } else if (op_def.summary().empty()) {
+                       graph_op_def.deprecation().version(), ":\n",
+                       graph_op_def.deprecation().explanation(), ".\n");
+  } else if (api_def.summary().empty()) {
     comment = "TODO: add doc.\n";
   } else {
-    comment = strings::StrCat(op_def.summary(), "\n");
+    comment = strings::StrCat(api_def.summary(), "\n");
   }
-  if (!op_def.description().empty()) {
-    strings::StrAppend(&comment, "\n", op_def.description(), "\n");
+  if (!api_def.description().empty()) {
+    strings::StrAppend(&comment, "\n", api_def.description(), "\n");
   }
   strings::StrAppend(&comment, "\nArguments:\n* scope: A Scope object\n");
 
   // Process inputs
-  for (int i = 0; i < op_def.input_arg_size(); ++i) {
-    const auto& arg(op_def.input_arg(i));
+  for (int i = 0; i < api_def.arg_order_size(); ++i) {
+    const auto& arg = *FindInputArg(api_def.arg_order(i), graph_op_def);
+    const auto& api_def_arg = *FindInputArg(api_def.arg_order(i), api_def);
     arg_types.push_back(strings::StrCat(
         "::tensorflow::", ArgIsList(arg) ? "InputList" : "Input"));
-    arg_names.push_back(AvoidCPPKeywords(arg.name()));
+    arg_names.push_back(AvoidCPPKeywords(api_def_arg.rename_to()));
 
     // TODO(keveman): Include input type information.
-    StringPiece description = arg.description();
+    StringPiece description = api_def_arg.description();
     if (!description.empty()) {
       ConsumeEquals(&description);
-      strings::StrAppend(&comment, "* ", AvoidCPPKeywords(arg.name()), ": ",
-                         arg.description(), "\n");
+      strings::StrAppend(&comment, "* ",
+                         AvoidCPPKeywords(api_def_arg.rename_to()), ": ",
+                         api_def_arg.description(), "\n");
     }
   }
 
   // Process attrs
   string required_attrs_comment;
   string optional_attrs_comment;
-  for (int i = 0; i < op_def.attr_size(); ++i) {
-    const auto& attr(op_def.attr(i));
+  for (int i = 0; i < graph_op_def.attr_size(); ++i) {
+    // ApiDef attributes must be in the same order as in OpDef since
+    // we initialize ApiDef based on OpDef.
+    const auto& attr(graph_op_def.attr(i));
+    const auto& api_def_attr(api_def.attr(i));
+    CHECK_EQ(attr.name(), api_def_attr.name());
     // Skip inferred arguments
     if (inferred_input_attrs.count(attr.name()) > 0) continue;
 
     const auto entry = AttrTypeName(attr.type());
     const auto attr_type_name = entry.first;
     const bool use_const = entry.second;
-    string attr_name = AvoidCPPKeywords(attr.name());
+    string attr_name = AvoidCPPKeywords(api_def_attr.rename_to());
 
     string attr_comment;
-    if (!attr.description().empty()) {
+    if (!api_def_attr.description().empty()) {
       // TODO(keveman): Word wrap and indent this, to handle multi-line
       // descriptions.
       strings::StrAppend(&attr_comment, "* ", attr_name, ": ",
-                         attr.description(), "\n");
+                         api_def_attr.description(), "\n");
     }
-    if (attr.has_default_value()) {
+    if (api_def_attr.has_default_value()) {
       strings::StrAppend(&optional_attrs_comment, attr_comment);
     } else {
       strings::StrAppend(&required_attrs_comment, attr_comment);
@@ -508,44 +607,49 @@ OpInfo::OpInfo(const OpDef& g_op_def, const OpDef& i_op_def,
   }
 
   // Process outputs
-  for (int i = 0; i < op_def.output_arg_size(); ++i) {
-    const auto& arg = op_def.output_arg(i);
+  for (int i = 0; i < graph_op_def.output_arg_size(); ++i) {
+    // ApiDef arguments must be in the same order as in OpDef since
+    // we initialize ApiDef based on OpDef.
+    const auto& arg = graph_op_def.output_arg(i);
+    const auto& api_def_arg(api_def.out_arg(i));
+    CHECK_EQ(arg.name(), api_def_arg.name());
+
     bool is_list = ArgIsList(arg);
     output_types.push_back(
         strings::StrCat("::tensorflow::", is_list ? "OutputList" : "Output"));
-    output_names.push_back(AvoidCPPKeywords(arg.name()));
+    output_names.push_back(AvoidCPPKeywords(api_def_arg.rename_to()));
     is_list_output.push_back(is_list);
   }
 
   strings::StrAppend(&comment, "\nReturns:\n");
-  if (op_def.output_arg_size() == 0) {  // No outputs.
+  if (graph_op_def.output_arg_size() == 0) {  // No outputs.
     strings::StrAppend(&comment, "* the created `Operation`\n");
-  } else if (op_def.output_arg_size() == 1) {  // One output
+  } else if (graph_op_def.output_arg_size() == 1) {  // One output
     if (is_list_output[0]) {
       strings::StrAppend(&comment, "* `OutputList`: ");
     } else {
       strings::StrAppend(&comment, "* `Output`: ");
     }
-    if (op_def.output_arg(0).description().empty()) {
-      strings::StrAppend(&comment, "The ", op_def.output_arg(0).name(),
+    if (api_def.out_arg(0).description().empty()) {
+      strings::StrAppend(&comment, "The ", api_def.out_arg(0).name(),
                          " tensor.\n");
     } else {
       // TODO(josh11b): Word wrap this.
-      strings::StrAppend(&comment, op_def.output_arg(0).description(), "\n");
+      strings::StrAppend(&comment, api_def.out_arg(0).description(), "\n");
     }
   } else {  // Multiple outputs.
-    for (int i = 0; i < op_def.output_arg_size(); ++i) {
+    for (int i = 0; i < graph_op_def.output_arg_size(); ++i) {
       if (is_list_output[i]) {
         strings::StrAppend(&comment, "* `OutputList`");
       } else {
         strings::StrAppend(&comment, "* `Output`");
       }
       strings::StrAppend(&comment, " ", output_names[i]);
-      if (op_def.output_arg(i).description().empty()) {
+      if (api_def.out_arg(i).description().empty()) {
         strings::StrAppend(&comment, "\n");
       } else {
         // TODO(josh11b): Word wrap this.
-        strings::StrAppend(&comment, ": ", op_def.output_arg(i).description(),
+        strings::StrAppend(&comment, ": ", api_def.out_arg(i).description(),
                            "\n");
       }
     }
@@ -564,19 +668,20 @@ string OpInfo::GetOpAttrStruct() const {
   string struct_fields;
   string setters;
 
-  for (int i = 0; i < op_def.attr_size(); ++i) {
-    const auto& attr(op_def.attr(i));
+  for (int i = 0; i < graph_op_def.attr_size(); ++i) {
+    const auto& attr(graph_op_def.attr(i));
+    const auto& api_def_attr(api_def.attr(i));
     // If attr will be inferred or it doesn't have a default value, don't
     // add it to the struct.
     if ((inferred_input_attrs.find(attr.name()) !=
          inferred_input_attrs.end()) ||
-        !attr.has_default_value()) {
+        !api_def_attr.has_default_value()) {
       continue;
     }
     const auto entry = AttrTypeName(attr.type());
     const auto attr_type_name = entry.first;
     const bool use_const = entry.second;
-    const string camel_case_name = ToCamelCase(attr.name());
+    const string camel_case_name = ToCamelCase(api_def_attr.rename_to());
     const string suffix =
         (camel_case_name == op_name || camel_case_name == "Attrs") ? "_" : "";
     const string attr_func_def =
@@ -584,22 +689,25 @@ string OpInfo::GetOpAttrStruct() const {
                         attr_type_name, use_const ? "&" : "");
 
     string attr_comment;
-    if (!attr.description().empty()) {
-      strings::StrAppend(&attr_comment, attr.description(), "\n\n");
+    if (!api_def_attr.description().empty()) {
+      strings::StrAppend(&attr_comment, api_def_attr.description(), "\n\n");
     }
     strings::StrAppend(&attr_comment, "Defaults to ",
-                       SummarizeAttrValue(attr.default_value()), "\n");
+                       SummarizeAttrValue(api_def_attr.default_value()), "\n");
     attr_comment = MakeComment(attr_comment, "    ");
 
     strings::StrAppend(&setters, attr_comment);
     strings::StrAppend(&setters, "    Attrs ", attr_func_def, " x) {\n");
     strings::StrAppend(&setters, "      Attrs ret = *this;\n");
-    strings::StrAppend(&setters, "      ret.", attr.name(), "_ = x;\n");
+    strings::StrAppend(&setters, "      ret.", api_def_attr.rename_to(),
+                       "_ = x;\n");
     strings::StrAppend(&setters, "      return ret;\n    }\n\n");
 
     strings::StrAppend(
-        &struct_fields, "    ", attr_type_name, " ", attr.name(), "_ = ",
-        PrintAttrValue(op_def.name(), attr.default_value()), ";\n");
+        &struct_fields, "    ", attr_type_name, " ", api_def_attr.rename_to(),
+        "_ = ",
+        PrintAttrValue(graph_op_def.name(), api_def_attr.default_value()),
+        ";\n");
   }
 
   if (struct_fields.empty()) {
@@ -676,17 +784,18 @@ void OpInfo::WriteClassDecl(WritableFile* h) const {
   // Add the static functions to set optional attrs
   if (has_optional_attrs) {
     strings::StrAppend(&class_decl, "\n");
-    for (int i = 0; i < op_def.attr_size(); ++i) {
-      const auto& attr(op_def.attr(i));
+    for (int i = 0; i < graph_op_def.attr_size(); ++i) {
+      const auto& attr(graph_op_def.attr(i));
+      const auto& api_def_attr(api_def.attr(i));
       if ((inferred_input_attrs.find(attr.name()) !=
            inferred_input_attrs.end()) ||
-          !attr.has_default_value()) {
+          !api_def_attr.has_default_value()) {
         continue;
       }
       const auto entry = AttrTypeName(attr.type());
       const auto attr_type_name = entry.first;
       const bool use_const = entry.second;
-      const string camel_case_name = ToCamelCase(attr.name());
+      const string camel_case_name = ToCamelCase(api_def_attr.rename_to());
       const string suffix =
           (camel_case_name == op_name || camel_case_name == "Attrs") ? "_" : "";
       const string attr_func_def = strings::StrCat(
@@ -726,11 +835,11 @@ void OpInfo::GetOutput(string* out) const {
       strings::StrCat("if (!", scope_str, ".ok()) return;");
 
   // No outputs.
-  if (op_def.output_arg_size() == 0) {
+  if (graph_op_def.output_arg_size() == 0) {
     strings::StrAppend(out, "  this->operation = Operation(ret);\n  return;\n");
     return;
   }
-  if (op_def.output_arg_size() == 1) {
+  if (graph_op_def.output_arg_size() == 1) {
     // One output, no need for NameRangeMap
     if (is_list_output[0]) {
       strings::StrAppend(out,
@@ -752,7 +861,7 @@ void OpInfo::GetOutput(string* out) const {
                      ".UpdateStatus(_status_);\n", "    return;\n");
   strings::StrAppend(out, "  }\n\n");
 
-  for (int i = 0; i < op_def.output_arg_size(); ++i) {
+  for (int i = 0; i < graph_op_def.output_arg_size(); ++i) {
     const string arg_range = strings::StrCat(
         "_outputs_range[\"", graph_op_def.output_arg(i).name(), "\"]");
     if (is_list_output[i]) {
@@ -776,11 +885,13 @@ string OpInfo::GetConstructorBody() const {
 
   strings::StrAppend(&body, "  ", return_on_error, "\n");
 
-  for (int i = 0; i < op_def.input_arg_size(); ++i) {
-    const auto& arg(op_def.input_arg(i));
-    strings::StrAppend(&body, "  auto _", arg.name(), " = ::tensorflow::ops::",
-                       ArgIsList(arg) ? "AsNodeOutList" : "AsNodeOut", "(",
-                       scope_str, ", ", AvoidCPPKeywords(arg.name()), ");\n");
+  for (int i = 0; i < graph_op_def.input_arg_size(); ++i) {
+    const auto& arg(graph_op_def.input_arg(i));
+    const auto& api_def_arg(api_def.in_arg(i));
+    strings::StrAppend(
+        &body, "  auto _", api_def_arg.rename_to(), " = ::tensorflow::ops::",
+        ArgIsList(arg) ? "AsNodeOutList" : "AsNodeOut", "(", scope_str, ", ",
+        AvoidCPPKeywords(api_def_arg.rename_to()), ");\n");
     strings::StrAppend(&body, "  ", return_on_error, "\n");
   }
 
@@ -791,19 +902,21 @@ string OpInfo::GetConstructorBody() const {
       &body, "  auto builder = ::tensorflow::NodeBuilder(unique_name, \"",
       graph_op_def.name(), "\")\n");
   const string spaces = "                     ";
-  for (int i = 0; i < op_def.input_arg_size(); ++i) {
-    const auto& arg(op_def.input_arg(i));
-    strings::StrAppend(&body, spaces, ".Input(_", arg.name(), ")\n");
+  for (int i = 0; i < api_def.in_arg_size(); ++i) {
+    const auto& arg(api_def.in_arg(i));
+    strings::StrAppend(&body, spaces, ".Input(_", arg.rename_to(), ")\n");
   }
-  for (int i = 0; i < op_def.attr_size(); ++i) {
+  for (int i = 0; i < api_def.attr_size(); ++i) {
     const auto& graph_attr(graph_op_def.attr(i));
-    const auto& attr(op_def.attr(i));
-    if (inferred_input_attrs.find(attr.name()) != inferred_input_attrs.end()) {
+    const auto& api_def_attr(api_def.attr(i));
+    if (inferred_input_attrs.find(api_def_attr.name()) !=
+        inferred_input_attrs.end()) {
       continue;
     }
-    const string attr_name = attr.has_default_value()
-                                 ? strings::StrCat("attrs.", attr.name(), "_")
-                                 : AvoidCPPKeywords(attr.name());
+    const string attr_name =
+        api_def_attr.has_default_value()
+            ? strings::StrCat("attrs.", api_def_attr.rename_to(), "_")
+            : AvoidCPPKeywords(api_def_attr.rename_to());
     strings::StrAppend(&body, spaces, ".Attr(\"", graph_attr.name(), "\", ",
                        attr_name, ")\n");
   }
@@ -845,10 +958,10 @@ void OpInfo::WriteClassDef(WritableFile* cc) const {
   TF_CHECK_OK(cc->Append(class_def));
 }
 
-void WriteCCOp(const OpDef& graph_op_def, const OpDef& interface_op_def,
+void WriteCCOp(const OpDef& graph_op_def, const ApiDef& api_def,
                const std::vector<string>& aliases, WritableFile* h,
                WritableFile* cc) {
-  OpInfo op_info(graph_op_def, interface_op_def, aliases);
+  OpInfo op_info(graph_op_def, api_def, aliases);
 
   op_info.WriteClassDecl(h);
   op_info.WriteClassDef(cc);
@@ -943,8 +1056,9 @@ string MakeInternal(const string& fname) {
 
 }  // namespace
 
-void WriteCCOps(const OpList& ops, const string& dot_h_fname,
-                const string& dot_cc_fname, const string& overrides_fnames) {
+void WriteCCOps(const OpList& ops, const ApiDefMap& api_def_map,
+                const string& dot_h_fname, const string& dot_cc_fname,
+                const string& overrides_fnames) {
   Env* env = Env::Default();
 
   // Load the override map.
@@ -984,24 +1098,23 @@ void WriteCCOps(const OpList& ops, const string& dot_h_fname,
     // code depends on it.
     if (graph_op_def.name() == "Const") continue;
 
-    // Incorporate overrides from override_map.
-    OpDef interface_op_def = graph_op_def;
-    const OpGenOverride* op_override =
-        override_map.ApplyOverride(&interface_op_def);
+    const auto* api_def = api_def_map.GetApiDef(graph_op_def.name());
+
     std::vector<string> aliases;
-    if (op_override) {
-      if (op_override->skip()) continue;
-      aliases.assign(op_override->alias().begin(), op_override->alias().end());
-      if (op_override->hide()) {
-        // Write hidden ops to _internal.h and _internal.cc.
-        WriteCCOp(graph_op_def, interface_op_def, aliases, internal_h.get(),
-                  internal_cc.get());
-        continue;
-      }
+    if (api_def->visibility() == ApiDef::SKIP) continue;
+    // First endpoint is canonical, the rest are aliases.
+    for (int endpoint_i = 1; endpoint_i < api_def->endpoint_size();
+         ++endpoint_i) {
+      aliases.push_back(api_def->endpoint(endpoint_i).name());
+    }
+    if (api_def->visibility() == ApiDef::HIDDEN) {
+      // Write hidden ops to _internal.h and _internal.cc.
+      WriteCCOp(graph_op_def, *api_def, aliases, internal_h.get(),
+                internal_cc.get());
+      continue;
     }
-
     // This isn't a hidden op, write it to the main files.
-    WriteCCOp(graph_op_def, interface_op_def, aliases, h.get(), cc.get());
+    WriteCCOp(graph_op_def, *api_def, aliases, h.get(), cc.get());
   }
 
   FinishFiles(false, h.get(), cc.get(), op_header_guard);
diff --git a/tensorflow/cc/framework/cc_op_gen.h b/tensorflow/cc/framework/cc_op_gen.h
index fa5e004f0317d046d82bee005bdf9f17773a45f3..cea28990144b9371e8009ce13f912b44044f9aac 100644
--- a/tensorflow/cc/framework/cc_op_gen.h
+++ b/tensorflow/cc/framework/cc_op_gen.h
@@ -17,13 +17,15 @@ limitations under the License.
 #define THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_CC_OP_GEN_H_
 
 #include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/op_gen_lib.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 
 /// Result is written to files dot_h and dot_cc.
-void WriteCCOps(const OpList& ops, const string& dot_h_fname,
-                const string& dot_cc_fname, const string& overrides_fnames);
+void WriteCCOps(const OpList& ops, const ApiDefMap& api_def_map,
+                const string& dot_h_fname, const string& dot_cc_fname,
+                const string& overrides_fnames);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/cc/framework/cc_op_gen_main.cc b/tensorflow/cc/framework/cc_op_gen_main.cc
index 3b80cf993eb9a5d5f4c41687577414e7216dd174..326d5668b8803ee39ffe24900c92e1db87b93601 100644
--- a/tensorflow/cc/framework/cc_op_gen_main.cc
+++ b/tensorflow/cc/framework/cc_op_gen_main.cc
@@ -16,7 +16,11 @@ limitations under the License.
 #include "tensorflow/cc/framework/cc_op_gen.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/op_gen_lib.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -24,10 +28,28 @@ namespace tensorflow {
 namespace {
 
 void PrintAllCCOps(const std::string& dot_h, const std::string& dot_cc,
-                   const std::string& overrides_fnames, bool include_internal) {
+                   const std::string& overrides_fnames, bool include_internal,
+                   const std::vector<string>& api_def_dirs) {
   OpList ops;
   OpRegistry::Global()->Export(include_internal, &ops);
-  WriteCCOps(ops, dot_h, dot_cc, overrides_fnames);
+  ApiDefMap api_def_map(ops);
+  if (!api_def_dirs.empty()) {
+    Env* env = Env::Default();
+    // Only load files that correspond to "ops".
+    for (const auto& op : ops.op()) {
+      for (const auto& api_def_dir : api_def_dirs) {
+        const std::string api_def_file_pattern =
+            io::JoinPath(api_def_dir, "api_def_" + op.name() + ".pbtxt");
+        if (env->FileExists(api_def_file_pattern).ok()) {
+          TF_CHECK_OK(api_def_map.LoadFile(env, api_def_file_pattern));
+        }
+      }
+    }
+  }
+
+  api_def_map.UpdateDocs();
+
+  WriteCCOps(ops, api_def_map, dot_h, dot_cc, overrides_fnames);
 }
 
 }  // namespace
@@ -35,18 +57,24 @@ void PrintAllCCOps(const std::string& dot_h, const std::string& dot_cc,
 
 int main(int argc, char* argv[]) {
   tensorflow::port::InitMain(argv[0], &argc, &argv);
-  if (argc != 5) {
+  // TODO(annarev): Update this file to no longer take op_gen_overrides.pbtxt
+  // as an argument.
+  if (argc != 6) {
     for (int i = 1; i < argc; ++i) {
       fprintf(stderr, "Arg %d = %s\n", i, argv[i]);
     }
     fprintf(stderr,
-            "Usage: %s out.h out.cc overrides1.pbtxt,2.pbtxt include_internal\n"
+            "Usage: %s out.h out.cc overrides1.pbtxt,2.pbtxt include_internal "
+            "api_def_dirs1,api_def_dir2 ...\n"
             "  include_internal: 1 means include internal ops\n",
             argv[0]);
     exit(1);
   }
 
   bool include_internal = tensorflow::StringPiece("1") == argv[4];
-  tensorflow::PrintAllCCOps(argv[1], argv[2], argv[3], include_internal);
+  std::vector<tensorflow::string> api_def_dirs = tensorflow::str_util::Split(
+      argv[5], ",", tensorflow::str_util::SkipEmpty());
+  tensorflow::PrintAllCCOps(argv[1], argv[2], argv[3], include_internal,
+                            api_def_dirs);
   return 0;
 }
diff --git a/tensorflow/cc/framework/cc_op_gen_test.cc b/tensorflow/cc/framework/cc_op_gen_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0b7e720a5c7b343415eee1aa157b8de755a1e1a5
--- /dev/null
+++ b/tensorflow/cc/framework/cc_op_gen_test.cc
@@ -0,0 +1,195 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/cc/framework/cc_op_gen.h"
+
+#include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/op_gen_lib.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+// TODO(annarev): Remove this op_gen_overrides.pbtxt reference.
+// It is needed only because WriteCCOps takes it as an argument.
+constexpr char kOverridesFnames[] =
+    "tensorflow/cc/ops/op_gen_overrides.pbtxt";
+constexpr char kBaseOpDef[] = R"(
+op {
+  name: "Foo"
+  input_arg {
+    name: "images"
+    description: "Images to process."
+  }
+  input_arg {
+    name: "dim"
+    description: "Description for dim."
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    description: "Description for output."
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    description: "Type for images"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+      }
+    }
+    default_value {
+      i: 1
+    }
+  }
+  summary: "Summary for op Foo."
+  description: "Description for op Foo."
+}
+)";
+
+void ExpectHasSubstr(StringPiece s, StringPiece expected) {
+  EXPECT_TRUE(s.contains(expected))
+      << "'" << s << "' does not contain '" << expected << "'";
+}
+
+void ExpectDoesNotHaveSubstr(StringPiece s, StringPiece expected) {
+  EXPECT_FALSE(s.contains(expected))
+      << "'" << s << "' contains '" << expected << "'";
+}
+
+void ExpectSubstrOrder(const string& s, const string& before,
+                       const string& after) {
+  int before_pos = s.find(before);
+  int after_pos = s.find(after);
+  ASSERT_NE(std::string::npos, before_pos);
+  ASSERT_NE(std::string::npos, after_pos);
+  EXPECT_LT(before_pos, after_pos)
+      << before << " is not before " << after << " in " << s;
+}
+
+// Runs WriteCCOps and stores output in (internal_)cc_file_path and
+// (internal_)h_file_path.
+void GenerateCcOpFiles(Env* env, const OpList& ops,
+                       const ApiDefMap& api_def_map, string* h_file_text,
+                       string* internal_h_file_text) {
+  const string& tmpdir = testing::TmpDir();
+
+  const auto h_file_path = io::JoinPath(tmpdir, "test.h");
+  const auto cc_file_path = io::JoinPath(tmpdir, "test.cc");
+  const auto internal_h_file_path = io::JoinPath(tmpdir, "test_internal.h");
+  const auto internal_cc_file_path = io::JoinPath(tmpdir, "test_internal.cc");
+
+  WriteCCOps(ops, api_def_map, h_file_path, cc_file_path, kOverridesFnames);
+
+  TF_ASSERT_OK(ReadFileToString(env, h_file_path, h_file_text));
+  TF_ASSERT_OK(
+      ReadFileToString(env, internal_h_file_path, internal_h_file_text));
+}
+
+TEST(CcOpGenTest, TestVisibilityChangedToHidden) {
+  const string api_def = R"(
+op {
+  graph_op_name: "Foo"
+  visibility: HIDDEN
+}
+)";
+  Env* env = Env::Default();
+  OpList op_defs;
+  protobuf::TextFormat::ParseFromString(kBaseOpDef, &op_defs);  // NOLINT
+  ApiDefMap api_def_map(op_defs);
+
+  string h_file_text, internal_h_file_text;
+  // Without ApiDef
+  GenerateCcOpFiles(env, op_defs, api_def_map, &h_file_text,
+                    &internal_h_file_text);
+  ExpectHasSubstr(h_file_text, "class Foo");
+  ExpectDoesNotHaveSubstr(internal_h_file_text, "class Foo");
+
+  // With ApiDef
+  TF_ASSERT_OK(api_def_map.LoadApiDef(api_def));
+  GenerateCcOpFiles(env, op_defs, api_def_map, &h_file_text,
+                    &internal_h_file_text);
+  ExpectHasSubstr(internal_h_file_text, "class Foo");
+  ExpectDoesNotHaveSubstr(h_file_text, "class Foo");
+}
+
+TEST(CcOpGenTest, TestArgNameChanges) {
+  const string api_def = R"(
+op {
+  graph_op_name: "Foo"
+  arg_order: "dim"
+  arg_order: "images"
+}
+)";
+  Env* env = Env::Default();
+  OpList op_defs;
+  protobuf::TextFormat::ParseFromString(kBaseOpDef, &op_defs);  // NOLINT
+
+  ApiDefMap api_def_map(op_defs);
+  string cc_file_text, h_file_text;
+  string internal_cc_file_text, internal_h_file_text;
+  // Without ApiDef
+  GenerateCcOpFiles(env, op_defs, api_def_map, &h_file_text,
+                    &internal_h_file_text);
+  ExpectSubstrOrder(h_file_text, "Input images", "Input dim");
+
+  // With ApiDef
+  TF_ASSERT_OK(api_def_map.LoadApiDef(api_def));
+  GenerateCcOpFiles(env, op_defs, api_def_map, &h_file_text,
+                    &internal_h_file_text);
+  ExpectSubstrOrder(h_file_text, "Input dim", "Input images");
+}
+
+TEST(CcOpGenTest, TestEndpoints) {
+  const string api_def = R"(
+op {
+  graph_op_name: "Foo"
+  endpoint {
+    name: "Foo1"
+  }
+  endpoint {
+    name: "Foo2"
+  }
+}
+)";
+  Env* env = Env::Default();
+  OpList op_defs;
+  protobuf::TextFormat::ParseFromString(kBaseOpDef, &op_defs);  // NOLINT
+
+  ApiDefMap api_def_map(op_defs);
+  string cc_file_text, h_file_text;
+  string internal_cc_file_text, internal_h_file_text;
+  // Without ApiDef
+  GenerateCcOpFiles(env, op_defs, api_def_map, &h_file_text,
+                    &internal_h_file_text);
+  ExpectHasSubstr(h_file_text, "class Foo {");
+  ExpectDoesNotHaveSubstr(h_file_text, "class Foo1");
+  ExpectDoesNotHaveSubstr(h_file_text, "class Foo2");
+
+  // With ApiDef
+  TF_ASSERT_OK(api_def_map.LoadApiDef(api_def));
+  GenerateCcOpFiles(env, op_defs, api_def_map, &h_file_text,
+                    &internal_h_file_text);
+  ExpectHasSubstr(h_file_text, "class Foo1");
+  ExpectHasSubstr(h_file_text, "typedef Foo1 Foo2");
+  ExpectDoesNotHaveSubstr(h_file_text, "class Foo {");
+}
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc
index d7446b9560fd7dc8377ea3710641906b274313a9..ebc0c77828dc32ec170d4ddfbfa150d1f38ab27b 100644
--- a/tensorflow/cc/gradients/math_grad.cc
+++ b/tensorflow/cc/gradients/math_grad.cc
@@ -728,6 +728,24 @@ Status LgammaGrad(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("Lgamma", LgammaGrad);
 
+Status SelectGrad(const Scope& scope, const Operation& op,
+                  const std::vector<Output>& grad_inputs,
+                  std::vector<Output>* grad_outputs) {
+  auto comparator = op.input(0);
+  auto x = op.input(1);
+  auto zeros = ZerosLike(scope, x);
+  auto grad = grad_inputs[0];
+
+  auto gx_1 = Where3(scope, comparator, grad, zeros);
+  auto gx_2 = Where3(scope, comparator, zeros, grad);
+
+  grad_outputs->push_back(NoGradient());
+  grad_outputs->push_back(gx_1);
+  grad_outputs->push_back(gx_2);
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("Select", SelectGrad);
+
 Status MinOrMaxGrad(const Scope& scope, const Operation& op,
                     const std::vector<Output>& grad_inputs,
                     std::vector<Output>* grad_outputs) {
diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc
index 6313f41da5e5f9cf88be4c8a84408a8df77f0e25..29def3c3ea2b0be963cae000db587f94fae5af55 100644
--- a/tensorflow/cc/gradients/math_grad_test.cc
+++ b/tensorflow/cc/gradients/math_grad_test.cc
@@ -865,5 +865,13 @@ TEST_F(NaryGradTest, Minimum) {
   RunTest(x, x_init_value, y, shape);
 }
 
+TEST_F(NaryGradTest, Select) {
+  TensorShape shape({3, 4});
+  auto x1 = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
+  auto x2 = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
+  auto y = Where3(scope_, Greater(scope_, x1, x2), x1, x2);
+  RunTest({x1, x2}, {shape, shape}, {y}, {shape});
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc
index 09fadfcab51575798286876f9a4e0ee9a60940ac..13a3bba5e6d5ca19ff3f0eca76665ba7d3ab628d 100644
--- a/tensorflow/cc/gradients/nn_grad.cc
+++ b/tensorflow/cc/gradients/nn_grad.cc
@@ -196,6 +196,18 @@ Status MaxPoolGradV2Helper(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("MaxPoolV2", MaxPoolGradV2Helper);
 
+Status LRNGradHelper(const Scope& scope, const Operation& op,
+                     const std::vector<Output>& grad_inputs,
+                     std::vector<Output>* grad_outputs){
+  internal::LRNGrad::Attrs grad_attrs;
+
+  auto dx = internal::LRNGrad(scope, grad_inputs[0], op.input(0), op.output(0),
+                              grad_attrs);
+  grad_outputs->push_back(dx);
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("LRN", LRNGradHelper);
+
 }  // anonymous namespace
 }  // namespace ops
 }  // namespace tensorflow
diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc
index ac66f51cf01911957722e94ca28e8e78dc6de2ed..f9063e836509669d81d03b1d2f0d32d1166b6eca 100644
--- a/tensorflow/cc/gradients/nn_grad_test.cc
+++ b/tensorflow/cc/gradients/nn_grad_test.cc
@@ -191,5 +191,12 @@ TEST_F(NNGradTest, MaxPoolGradV2Helper) {
   RunTest(x, x_init_value, y, y_shape);
 }
 
+TEST_F(NNGradTest, LRN){
+  TensorShape x_shape({1, 1, 2, 1});
+  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
+  auto y = LRN(scope_, x);
+  RunTest(x, x_shape, y, x_shape);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc
index f98abc8a817eca7bc129bb03a2ad31b97d957065..acef098c7d07f45d171679bff7c41e13ef0424f1 100644
--- a/tensorflow/cc/saved_model/loader.cc
+++ b/tensorflow/cc/saved_model/loader.cc
@@ -62,6 +62,15 @@ Status ReadSavedModel(const string& export_dir, SavedModel* saved_model_proto) {
                     export_dir);
 }
 
+string GetTagsAsString(const std::unordered_set<string>& tags) {
+  string tags_as_string = "{ ";
+  for (const string& tag : tags) {
+    tags_as_string = strings::StrCat(tags_as_string, tag, " ");
+  }
+  tags_as_string = strings::StrCat(tags_as_string, "}");
+  return tags_as_string;
+}
+
 Status FindMetaGraphDefToLoad(const SavedModel& saved_model_proto,
                               const std::unordered_set<string>& tags,
                               MetaGraphDef* meta_graph_def_to_load) {
@@ -77,14 +86,9 @@ Status FindMetaGraphDefToLoad(const SavedModel& saved_model_proto,
       return Status::OK();
     }
   }
-  string tags_as_string = "{ ";
-  for (const string& tag : tags) {
-    tags_as_string = strings::StrCat(tags_as_string, tag, " ");
-  }
-  tags_as_string = strings::StrCat(tags_as_string, "}");
   return Status(error::Code::NOT_FOUND,
                 "Could not find meta graph def matching supplied tags: " +
-                    tags_as_string +
+                    GetTagsAsString(tags) +
                     ". To inspect available tag-sets in the SavedModel, please "
                     "use the SavedModel CLI: `saved_model_cli`");
 }
@@ -233,7 +237,8 @@ Status LoadSavedModelInternal(const SessionOptions& session_options,
     return Status(error::Code::NOT_FOUND,
                   "SavedModel not found in export directory: " + export_dir);
   }
-  LOG(INFO) << "Loading SavedModel from: " << export_dir;
+  LOG(INFO) << "Loading SavedModel with tags: " << GetTagsAsString(tags)
+            << "; from: " << export_dir;
 
   SavedModel saved_model_proto;
   TF_RETURN_IF_ERROR(ReadSavedModel(export_dir, &saved_model_proto));
@@ -281,7 +286,8 @@ Status LoadSavedModel(const SessionOptions& session_options,
     return end_microseconds - start_microseconds;
   }();
   auto log_and_count = [&](const string& status_str) {
-    LOG(INFO) << "Loading SavedModel: " << status_str << ". Took "
+    LOG(INFO) << "SavedModel load for tags " << GetTagsAsString(tags)
+              << "; Status: " << status_str << ". Took "
               << load_latency_microsecs << " microseconds.";
     load_attempt_count->GetCell(export_dir, status_str)->IncrementBy(1);
   };
diff --git a/tensorflow/cc/saved_model/tag_constants.h b/tensorflow/cc/saved_model/tag_constants.h
index 2b0b2d5c7fb33768494c1781669c1adcb875a579..b71cb263ca42dab7e830c1880ec4b311bc272f82 100644
--- a/tensorflow/cc/saved_model/tag_constants.h
+++ b/tensorflow/cc/saved_model/tag_constants.h
@@ -21,6 +21,9 @@ namespace tensorflow {
 /// Tag for the `gpu` graph.
 constexpr char kSavedModelTagGpu[] = "gpu";
 
+/// Tag for the `tpu` graph.
+constexpr char kSavedModelTagTpu[] = "tpu";
+
 /// Tag for the `serving` graph.
 constexpr char kSavedModelTagServe[] = "serve";
 
diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD
index a9a6ea84319a18a8fbce648391bf5918ff6d9a08..5740c040e309bad8d7e3bdc468c09a3323fb99e0 100644
--- a/tensorflow/compiler/aot/BUILD
+++ b/tensorflow/compiler/aot/BUILD
@@ -24,7 +24,6 @@ tf_cc_test(
     srcs = ["runtime_test.cc"],
     deps = [
         ":runtime",
-        "//tensorflow/compiler/tf2xla:xla_local_runtime_context",
         "//tensorflow/core:framework",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -111,6 +110,7 @@ cc_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
     ],
diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc
index ae22f7edc423247b34895411d19d7a3c21f86d4f..53da2881b60db9ad39565567623eb86f754559af 100644
--- a/tensorflow/compiler/aot/codegen.cc
+++ b/tensorflow/compiler/aot/codegen.cc
@@ -101,21 +101,8 @@ Status ComputeArgSizes(const CompileResult& compile_result,
                        std::vector<int64>* arg_sizes) {
   const xla::ProgramShape& ps = compile_result.program_shape;
   for (int i = 0; i < ps.parameters_size(); ++i) {
-    if (i == ps.parameters_size() - 1 && compile_result.has_context_arg) {
-      // If the compiled function needs a XlaLocalRuntimeContext* arg, it's
-      // always last, and must be represented as an opaque type.
-      const xla::PrimitiveType type = ps.parameters(i).element_type();
-      if (type != xla::OPAQUE) {
-        return errors::InvalidArgument(
-            "expected final context arg to be opaque, but got type: ",
-            xla::PrimitiveType_Name(type), ", from program shape: ",
-            xla::ShapeUtil::HumanString(ps));
-      }
-      arg_sizes->push_back(-1);
-    } else {
-      arg_sizes->push_back(xla::ShapeUtil::ByteSizeOf(
-          ps.parameters(i), compile_result.pointer_size));
-    }
+    arg_sizes->push_back(xla::ShapeUtil::ByteSizeOf(
+        ps.parameters(i), compile_result.pointer_size));
   }
   return Status::OK();
 }
@@ -165,11 +152,6 @@ string RewriteWithName(const string& name, string code,
 Status GenArgMethods(const tf2xla::Config& config, const xla::ProgramShape& ps,
                      const CompileResult& compile_result, string* methods) {
   size_t num_args = ps.parameters_size();
-  if (compile_result.has_context_arg) {
-    // If the compiled function needs a XlaLocalRuntimeContext* arg, it's
-    // always last, and is set in the class constructor.
-    num_args--;
-  }
   if (config.feed_size() != num_args) {
     return errors::InvalidArgument("mismatch between feed_size(",
                                    config.feed_size(), ") and num_args(",
@@ -418,7 +400,7 @@ namespace xla { class ExecutableRunOptions; }
 // (Implementation detail) Entry point to the function in the object file.
 extern "C" void {{ENTRY}}(
     void* result, const xla::ExecutableRunOptions* run_options,
-    const void** args, void** temps);
+    const void** args, void** temps, tensorflow::int64* profile_counters);
 
 {{NS_START}}
 // {{CLASS}} represents a computation previously specified in a
@@ -474,7 +456,6 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
       data->temp_sizes = TempSizes();
       data->num_temps = kNumTemps;
       data->result_index = kResultIndex;
-      data->requires_runtime_context = {{HAS_CONTEXT_ARG}};
       data->arg_names = StaticArgNames();
       data->result_names = StaticResultNames();
       data->program_shape = StaticProgramShape();
@@ -483,7 +464,7 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
     return *kStaticData;
   }
 
-  {{CLASS}}(AllocMode alloc_mode = AllocMode::ARGS_RESULTS_AND_TEMPS)
+  {{CLASS}}(AllocMode alloc_mode = AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS)
       : XlaCompiledCpuFunction(StaticData(), alloc_mode) {}
 
   {{CLASS}}(const {{CLASS}}&) = delete;
@@ -496,8 +477,8 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
   // void set_argN_data(void* data)
   //   Sets the buffer of type T for positional argument N. May be called in
   //   any AllocMode. Must be called before Run to have an affect. Must be
-  //   called in AllocMode::RESULTS_AND_TEMPS_ONLY for each positional argument,
-  //   to set the argument buffers.
+  //   called in AllocMode::RESULTS_PROFILES_AND_TEMPS_ONLY for each positional
+  //   argument, to set the argument buffers.
   //
   // T* argN_data()
   //   Returns the buffer of type T for positional argument N.
@@ -560,8 +541,6 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
       {"{{ARG_SIZES}}", str_util::Join(arg_sizes, ", ")},
       {"{{CLASS}}", opts.class_name},
       {"{{ENTRY}}", compile_result.entry_point},
-      {"{{HAS_CONTEXT_ARG}}",
-       compile_result.has_context_arg ? "true" : "false"},
       {"{{INCLUDE_XLA_DATA_PROTO}}", include_xla_data_proto},
       {"{{METHODS_ARG}}\n", methods_arg},
       {"{{METHODS_RESULT}}\n", methods_result},
diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc
index 0f6114666fcc89c631434527d2ae8c92c039ffea..75026c57c04a64186a1e5be6c41e4dd7de8520b7 100644
--- a/tensorflow/compiler/aot/codegen_test.cc
+++ b/tensorflow/compiler/aot/codegen_test.cc
@@ -145,11 +145,9 @@ TEST(GenerateHeader, Golden) {
       {
           xla::ShapeUtil::MakeShape(xla::F32, {1, 2}),
           xla::ShapeUtil::MakeShape(xla::S64, {3, 4}),
-          xla::ShapeUtil::MakeOpaqueShape(),
       },
       xla::ShapeUtil::MakeTupleShape(
           {xla::ShapeUtil::MakeShape(xla::U32, {5, 6})}));
-  compile_result.has_context_arg = true;
   compile_result.entry_point = "entry_point";
   compile_result.pointer_size = 8;
   string header;
diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden
index 65f342ce27ef09092f252f791973f245a8cdd6f3..95ab3a7332f51070732bf5d62c7926c84e3b738d 100644
--- a/tensorflow/compiler/aot/codegen_test_h.golden
+++ b/tensorflow/compiler/aot/codegen_test_h.golden
@@ -19,7 +19,7 @@ namespace xla { class ExecutableRunOptions; }
 // (Implementation detail) Entry point to the function in the object file.
 extern "C" void entry_point(
     void* result, const xla::ExecutableRunOptions* run_options,
-    const void** args, void** temps);
+    const void** args, void** temps, tensorflow::int64* profile_counters);
 
 namespace foo {
 namespace bar {
@@ -48,7 +48,7 @@ namespace bar {
 //   is guaranteed that no thread may call a non-const method.
 //
 // The logical function signature is:
-//   ((unknown): f32[1,2], (unknown): s64[3,4], (unknown): opaque[]) -> (u32[5,6])
+//   ((unknown): f32[1,2], (unknown): s64[3,4]) -> (u32[5,6])
 //
 // Memory stats:
 //   arg bytes total:    104
@@ -58,11 +58,11 @@ namespace bar {
 class MyClass : public tensorflow::XlaCompiledCpuFunction {
  public:
   // Number of input arguments for the compiled computation.
-  static constexpr size_t kNumArgs = 3;
+  static constexpr size_t kNumArgs = 2;
 
   // Byte size of each argument buffer. There are kNumArgs entries.
   static const intptr_t* ArgSizes() {
-    static constexpr intptr_t kArgSizes[kNumArgs] = {8, 96, -1};
+    static constexpr intptr_t kArgSizes[kNumArgs] = {8, 96};
     return kArgSizes;
   }
 
@@ -77,7 +77,6 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
       data->temp_sizes = TempSizes();
       data->num_temps = kNumTemps;
       data->result_index = kResultIndex;
-      data->requires_runtime_context = true;
       data->arg_names = StaticArgNames();
       data->result_names = StaticResultNames();
       data->program_shape = StaticProgramShape();
@@ -86,7 +85,7 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
     return *kStaticData;
   }
 
-  MyClass(AllocMode alloc_mode = AllocMode::ARGS_RESULTS_AND_TEMPS)
+  MyClass(AllocMode alloc_mode = AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS)
       : XlaCompiledCpuFunction(StaticData(), alloc_mode) {}
 
   MyClass(const MyClass&) = delete;
@@ -99,8 +98,8 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
   // void set_argN_data(void* data)
   //   Sets the buffer of type T for positional argument N. May be called in
   //   any AllocMode. Must be called before Run to have an affect. Must be
-  //   called in AllocMode::RESULTS_AND_TEMPS_ONLY for each positional argument,
-  //   to set the argument buffers.
+  //   called in AllocMode::RESULTS_PROFILES_AND_TEMPS_ONLY for each positional
+  //   argument, to set the argument buffers.
   //
   // T* argN_data()
   //   Returns the buffer of type T for positional argument N.
@@ -236,8 +235,8 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
   // Shape of the args and results.
   static const xla::ProgramShape* StaticProgramShape() {
     static const xla::ProgramShape* kShape = []() {
-      static const char kProto[] = {10,12,16,11,26,2,1,2,42,4,10,2,1,0,10,12,16,5,26,2,3,4,42,4,10,2,1,0,10,2,16,14,18,16,16,13,34,12,16,8,26,2,5,6,42,4,10,2,1,0};
-      static constexpr int kProtoSize = 50;
+      static const char kProto[] = {10,14,16,11,26,2,1,2,42,6,10,2,1,0,32,1,10,14,16,5,26,2,3,4,42,6,10,2,1,0,32,1,18,18,16,13,34,14,16,8,26,2,5,6,42,6,10,2,1,0,32,1};
+      static constexpr int kProtoSize = 52;
       xla::ProgramShape* shape = new xla::ProgramShape;
       shape->ParseFromArray(kProto, kProtoSize);
       return shape;
diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc
index 2b8cc6024cb85e4f6269313927ff66d1d9a1cf79..c87f2b75dfa18ad5c3eda4bd6fcbcb3083ef73fd 100644
--- a/tensorflow/compiler/aot/compile.cc
+++ b/tensorflow/compiler/aot/compile.cc
@@ -94,9 +94,8 @@ Status CompileGraph(const GraphDef& graph_def, const tf2xla::Config& config,
       xla::ClientLibrary::GetOrCreateCompileOnlyClient(cpu_platform)
           .ValueOrDie();
   xla::Computation computation;
-  TF_RETURN_IF_ERROR(ConvertGraphDefToXla(graph_def, config, client,
-                                          &computation,
-                                          &compile_result->has_context_arg));
+  TF_RETURN_IF_ERROR(
+      ConvertGraphDefToXla(graph_def, config, client, &computation));
   if (!flags.out_session_module.empty()) {
     TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::SessionModule> module,
                         computation.Snapshot());
diff --git a/tensorflow/compiler/aot/compile.h b/tensorflow/compiler/aot/compile.h
index 965c2960816b3acc8d2209e6824d88647de0ce14..e03c5b1aa77c1262ed903aae3072ef65f34d80a2 100644
--- a/tensorflow/compiler/aot/compile.h
+++ b/tensorflow/compiler/aot/compile.h
@@ -34,7 +34,6 @@ struct CompileResult {
   // Contains object file and meta-info.
   std::unique_ptr<xla::cpu::CpuAotCompilationResult> aot;
   xla::ProgramShape program_shape;  // Static shape of args and results.
-  bool has_context_arg = false;     // Is last arg XlaLocalRuntimeContext?
   string entry_point;               // Name of generated function.
   int pointer_size = 0;             // Size of a pointer in bytes.
 };
diff --git a/tensorflow/compiler/aot/runtime_test.cc b/tensorflow/compiler/aot/runtime_test.cc
index ac79c278c1fdf8b6aedcb52121c767b8ba0ad358..6d603a02eb4ceade6832ba67b2981814ee25327a 100644
--- a/tensorflow/compiler/aot/runtime_test.cc
+++ b/tensorflow/compiler/aot/runtime_test.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/aot/runtime.h"
 
-#include "tensorflow/compiler/tf2xla/xla_local_runtime_context.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/platform/test.h"
 
diff --git a/tensorflow/compiler/aot/tests/make_test_graphs.py b/tensorflow/compiler/aot/tests/make_test_graphs.py
index a898eab1d1ab0eb5d55983bf366753c968887296..89c7cd4507cbd476104a039d6083d8f89de11278 100644
--- a/tensorflow/compiler/aot/tests/make_test_graphs.py
+++ b/tensorflow/compiler/aot/tests/make_test_graphs.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
+import os
 import sys
 
 from tensorflow.core.protobuf import saver_pb2
@@ -53,7 +54,7 @@ def tfadd_with_ckpt(out_dir):
     sess.run(init_op)
     sess.run(y.assign(y + 42))
     # Without the checkpoint, the variable won't be set to 42.
-    ckpt = '%s/test_graph_tfadd_with_ckpt.ckpt' % out_dir
+    ckpt = os.path.join(out_dir, 'test_graph_tfadd_with_ckpt.ckpt')
     saver.save(sess, ckpt)
 
 
@@ -68,10 +69,10 @@ def tfadd_with_ckpt_saver(out_dir):
     sess.run(init_op)
     sess.run(y.assign(y + 42))
     # Without the checkpoint, the variable won't be set to 42.
-    ckpt_file = '%s/test_graph_tfadd_with_ckpt_saver.ckpt' % out_dir
+    ckpt_file = os.path.join(out_dir, 'test_graph_tfadd_with_ckpt_saver.ckpt')
     saver.save(sess, ckpt_file)
     # Without the SaverDef, the restore op won't be named correctly.
-    saver_file = '%s/test_graph_tfadd_with_ckpt_saver.saver' % out_dir
+    saver_file = os.path.join(out_dir, 'test_graph_tfadd_with_ckpt_saver.saver')
     with open(saver_file, 'wb') as f:
       f.write(saver.as_saver_def().SerializeToString())
 
@@ -129,7 +130,7 @@ def write_graph(build_graph, out_dir):
   g = ops.Graph()
   with g.as_default():
     build_graph(out_dir)
-    filename = '%s/test_graph_%s.pb' % (out_dir, build_graph.__name__)
+    filename = os.path.join(out_dir, 'test_graph_%s.pb' % build_graph.__name__)
     with open(filename, 'wb') as f:
       f.write(g.as_graph_def().SerializeToString())
 
diff --git a/tensorflow/compiler/aot/tests/tfcompile_test.cc b/tensorflow/compiler/aot/tests/tfcompile_test.cc
index 6b037f276ad1d6771b904bb970f45f32ae9531b8..413efd9cea3b6f71574615ad9ca92471ff925781 100644
--- a/tensorflow/compiler/aot/tests/tfcompile_test.cc
+++ b/tensorflow/compiler/aot/tests/tfcompile_test.cc
@@ -70,7 +70,7 @@ TEST(TFCompileTest, Add) {
 // Run tests that use set_argN_data separately, to avoid accidentally re-using
 // non-existent buffers.
 TEST(TFCompileTest, Add_SetArg) {
-  AddComp add(AddComp::AllocMode::RESULTS_AND_TEMPS_ONLY);
+  AddComp add(AddComp::AllocMode::RESULTS_PROFILES_AND_TEMPS_ONLY);
 
   int32 arg_x = 10;
   int32 arg_y = 32;
@@ -258,7 +258,7 @@ TEST(TFCompileTest, MatMul2_SetArg) {
   Eigen::ThreadPoolDevice device(&tp, tp.NumThreads());
 
   foo::bar::MatMulComp matmul(
-      foo::bar::MatMulComp::AllocMode::RESULTS_AND_TEMPS_ONLY);
+      foo::bar::MatMulComp::AllocMode::RESULTS_PROFILES_AND_TEMPS_ONLY);
   matmul.set_thread_pool(&device);
 
   // Test using the set_argN_data() methods.
diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl
index 1e22b760b8a4189165a59ac307374277474bbc31..542451ed2d14fbceca00c6ccb6e28c1c3a0d4321 100644
--- a/tensorflow/compiler/aot/tfcompile.bzl
+++ b/tensorflow/compiler/aot/tfcompile.bzl
@@ -152,7 +152,7 @@ def tf_library(name, graph, config,
            " --target_triple=" + target_llvm_triple() +
            " --out_header=$(@D)/" + header_file +
            " --out_object=$(@D)/" + object_file +
-           flags),
+           " " + flags),
       tools=[tfcompile_tool],
       visibility=visibility,
       testonly=testonly,
@@ -189,7 +189,7 @@ def tf_library(name, graph, config,
            " --cpp_class=" + cpp_class +
            " --target_triple=" + target_llvm_triple() +
            " --out_session_module=$(@D)/" + session_module_pb +
-           flags),
+           " " + flags),
       tools=[tfcompile_tool],
       visibility=visibility,
       testonly=testonly,
@@ -267,7 +267,6 @@ def tf_library(name, graph, config,
         srcs=[test_file],
         deps=[
             ":" + name,
-            "@org_tensorflow//tensorflow/compiler/tf2xla:xla_local_runtime_context",
             "@org_tensorflow//tensorflow/compiler/aot:runtime",
             "@org_tensorflow//tensorflow/compiler/aot:tf_library_test_main",
             "@org_tensorflow//tensorflow/compiler/xla:executable_run_options",
@@ -313,7 +312,6 @@ def tf_library(name, graph, config,
         linkopts = if_android(["-pie", "-s"]),
         deps=[
             ":" + name,
-            "@org_tensorflow//tensorflow/compiler/tf2xla:xla_local_runtime_context",
             "@org_tensorflow//tensorflow/compiler/aot:benchmark",
             "@org_tensorflow//tensorflow/compiler/aot:runtime",
             "@org_tensorflow//tensorflow/compiler/xla:executable_run_options",
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index bf7d9cf14d10f41aa48ea594a8d63db97b9973e1..026a1bf879d373fd0f5f4444b3ce10d01702f82b 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -251,6 +251,7 @@ cc_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index 22899ebeebc929055518893b358f7950d380d6f6..407b7dcbfb4b36674928d68eedaf58fcefc645f2 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -16,7 +16,11 @@ limitations under the License.
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
 
 #include <functional>
+#include <memory>
 #include <numeric>
+#include <string>
+#include <unordered_map>
+#include <vector>
 
 #include "tensorflow/compiler/jit/graph_to_functiondef.h"
 #include "tensorflow/compiler/jit/legacy_flags/encapsulate_subgraphs_pass_flags.h"
@@ -32,6 +36,7 @@ limitations under the License.
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/tensor_id.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -48,6 +53,52 @@ const char* const kXlaNumResourceArgsAttr = "_XlaNumResourceArgs";
 
 namespace {
 
+bool AreAllParentsConst(const Node& n,
+                        const gtl::FlatSet<const Node*>& runtime_const_nodes) {
+  if (n.type_string() == "GuaranteeConst" || n.type_string() == "Const") {
+    // If the current node is itself a cast-to-const, no need
+    // to look at the incoming edges.
+    return true;
+  }
+
+  bool all_parents_const = true;
+  bool atleast_one_non_control_edge = false;
+  for (const Edge* in : n.in_edges()) {
+    atleast_one_non_control_edge =
+        atleast_one_non_control_edge || !in->IsControlEdge();
+    if (!in->IsControlEdge() && runtime_const_nodes.count(in->src()) == 0) {
+      all_parents_const = false;
+      break;
+    }
+  }
+  return all_parents_const && atleast_one_non_control_edge;
+}
+
+void MarkGuaranteedConstants(
+    const Graph& graph,
+    const std::vector<std::pair<const Node*, Node*>>& src_arg_pairs) {
+  gtl::FlatSet<const Node*> guaranteed_const_nodes;
+  std::vector<const Node*> srcs;
+  srcs.reserve(src_arg_pairs.size());
+  for (const auto& src_arg : src_arg_pairs) {
+    srcs.push_back(src_arg.first);
+  }
+  ReverseDFSFrom(graph, srcs, /*enter=*/nullptr,
+                 /*leave=*/[&guaranteed_const_nodes](const Node* n) {
+                   // TODO(vinuraja): Doesn't work in the presence of loops.
+                   if (AreAllParentsConst(*n, guaranteed_const_nodes)) {
+                     guaranteed_const_nodes.insert(n);
+                   }
+                 });
+
+  for (auto& src_arg : src_arg_pairs) {
+    if (guaranteed_const_nodes.count(src_arg.first) != 0) {
+      VLOG(1) << "Guaranteed const found: " << src_arg.first->DebugString();
+      src_arg.second->AddAttr("_is_guaranteed_constant", true);
+    }
+  }
+}
+
 // A node/slot pair.
 // TODO(phawkins): is there a common definition of this?
 struct NodeSlot {
@@ -75,6 +126,11 @@ struct NodeSlot {
   };
 };
 
+// TODO(phawkins) add a canonical copy of these operator names and refactor
+// everything to use it.
+static const char* const kArgOp = "_Arg";
+static const char* const kRetValOp = "_Retval";
+
 class Encapsulator {
  public:
   Encapsulator(string group_attribute, Graph const* graph_in)
@@ -99,54 +155,167 @@ class Encapsulator {
   Status BuildOutputGraph(bool parallel_checking, Graph* graph_out);
 
  private:
-  // Returns the key attribute associated with a node. Returns the empty string
-  // if no key attribute is found.
-  string GetFunctionNameAttr(const Node* node) const;
-
   // A subgraph of the input, all marked with a common 'group_attribute'
   // value.
-  struct Subgraph {
+  class Subgraph {
+   public:
+    // Creates a graph to build the subgraph in, if it doesn't already exist,
+    // using the same op registry and versions as graph_in.
+    Node* MakeNodeImage(const Graph* graph_in, Node* node);
+
+    // Returns the graph the subgraph is being built in.
+    Graph* GetGraph() const;
+
+    // Builds a FunctionDef, and adds it to 'library'. The value of the
+    // 'group_attribute' annotations becomes the function name.  If
+    // 'reuse_existing_functions' is set, use an existing function with the same
+    // name, if any.  If 'rewrite_subgraph_fn' is set, it is applied to the
+    // subgraph before function conversion.
+    Status BuildFunctionDef(const string& name_in,
+                            const RewriteSubgraphFn& rewrite_subgraph_fn,
+                            bool reuse_existing_functions,
+                            FunctionLibraryDefinition* library);
+
+    // Adds the function call node to graph_out.
+    Status AddFunctionCallNode(
+        const std::unordered_map<const Node*, Node*>& node_images,
+        bool parallel_checking, Graph* graph_out);
+
+    // Returns the Node that inputs to the function should be wired up to.
+    Node* GetCallNodeForInputs() const;
+
+    // Returns the Node that outputs to the function should be wired up to.
+    Node* GetCallNodeForOutputs() const;
+
+    // Returns the index of the arg that the dst of edge should connect to.
+    int GetArgIndexForEdge(const Edge* edge) const;
+
+    // Returns the index of the result that the src of edge should connect to.
+    int GetResultIndexForEdge(const Edge* edge) const;
+
+    // Creates an _Arg node for the src node of edge, and add its index to
+    // args_by_src_, if none exists yet. Also adds its index to args_by_dst_,
+    // and adds the edge within the subgraph from the _Arg node to the image of
+    // the dst node.
+    Status RecordArg(const Edge* edge,
+                     const std::unordered_map<const Node*, Node*>& node_images,
+                     std::vector<std::pair<const Node*, Node*>>* src_arg_pairs);
+
+    // Creates a _Retval node for the src node of edge, and add it to results_,
+    // if none exists yet. If a new _Retval node is created, also adds the edge
+    // within the subgraph from the src to the _Retval node.
+    Status RecordResult(
+        const Edge* edge,
+        const std::unordered_map<const Node*, Node*>& node_images);
+
+   private:
+    // Builds a ParallelCheck op that compares the output of the original
+    // subgraph with the encapsulated subgraph.
+    Status BuildParallelCheckOp(
+        const std::unordered_map<const Node*, Node*>& node_images,
+        Graph* graph_out);
+
     // The subgraph extracted from the input graph, suitable for being turned
     // into a FunctionDef. Inputs are fed by _Arg nodes, and outputs are
     // returned by _Retval nodes.
-    std::unique_ptr<Graph> graph;
+    std::unique_ptr<Graph> graph_;
 
     // Which device are these nodes on? Used to assign a device to the call
     // node.
-    string device;
+    string device_;
 
     // NodeDef for the function call node.
-    NodeDef call_node_def;
+    NodeDef call_node_def_;
 
     // Function call node(s) in the output graph. Not owned.
     // If parallel_checking is enabled, 'call_node_inputs' is the function call
     // node to which inputs should be fed, and 'call_node_outputs' is the
     // parallel check op from which outputs should be read. If parallel checking
     // is disabled, both point to the function call node.
-    Node* call_node_inputs;
-    Node* call_node_outputs;
+    Node* call_node_inputs_;
+    Node* call_node_outputs_;
 
     // Maps from source (producer node/slot) and destination
     // (consumer node/slot) tensors in the input graph to _Arg numbers in
     // the subgraph. The source map is one-to-one, whereas the dest map may be
     // many-to-one.
-    std::unordered_map<NodeSlot, int, NodeSlot::Hasher> args_by_src;
-    std::unordered_map<NodeSlot, int, NodeSlot::Hasher> args_by_dst;
+    std::unordered_map<NodeSlot, int, NodeSlot::Hasher> args_by_src_;
+    std::unordered_map<NodeSlot, int, NodeSlot::Hasher> args_by_dst_;
 
     // The _Arg nodes in the subgraph, in order by argument number.
-    std::vector<Node*> args;
+    std::vector<Node*> args_;
 
     // Map from source tensor in the input graph to result #.
-    std::unordered_map<NodeSlot, int, NodeSlot::Hasher> results;
+    std::unordered_map<NodeSlot, int, NodeSlot::Hasher> results_;
   };
 
-  // Builds a ParallelCheck op that compares the output of the original subgraph
-  // with the encapsulated subgraph.
-  Status BuildParallelCheckOp(
+  // Returns the key attribute associated with a node in attr. Sets attr to the
+  // empty string if the attribute is not found.
+  Status GetFunctionNameAttr(const Node* node, string* attr) const;
+
+  // Copies edges local to a subgraph. Adds _Arg and _Retval nodes to subgraphs
+  // for data edges that cross subgraph boundaries.
+  Status CopySubgraphEdges(
+      const std::unordered_map<const Node*, Node*>& node_images,
+      std::vector<std::pair<const Node*, Node*>>* src_arg_pairs);
+
+  // Copies all marked nodes to a subgraph. Does nothing for unmarked nodes.
+  Status CopySubgraphNodes(std::unordered_map<const Node*, Node*>* node_images);
+
+  // Copies all nodes that aren't in a compiled subgraph to the output graph.
+  Status CopyNodesToOutputGraph(
+      bool parallel_checking, Graph* graph_out,
+      std::unordered_map<const Node*, Node*>* node_images);
+
+  // Adds function call nodes for each compiled subgraph.
+  Status AddFunctionCallNodes(
       const std::unordered_map<const Node*, Node*>& node_images,
-      const Subgraph& subgraph, Graph* graph_out, Node** parallel_check_op);
+      bool parallel_checking, Graph* graph_out);
+
+  // Finds the image of an edge source in the output graph. If the edge crosses
+  // a subgraph boundary it is the output of a call node, otherwise it is a node
+  // in the output graph.
+  Status FindOutputImageOfEdgeSrc(
+      const string& src_func_id, const string& dst_func_id,
+      const std::unordered_map<const Node*, Node*>& node_images,
+      const Node* original_src_node, Node** src_image);
+
+  // Finds an edge source slot in the output graph. If the edge crosses a
+  // subgraph boundary it is a slot on the output of a call node, otherwise it
+  // is a slot on a node in the output graph.
+  int FindOutputSlotOfEdgeSrc(const string& src_func_id,
+                              const string& dst_func_id, const Edge* edge);
+
+  // Finds the image of an edge destination in the output graph. If the edge
+  // crosses a subgraph boundary it is the input of a call node, otherwise it is
+  // a node in the output graph.
+  Status FindOutputImageOfEdgeDst(
+      const string& src_func_id, const string& dst_func_id,
+      const std::unordered_map<const Node*, Node*>& node_images,
+      const Node* original_dst_node, Node** dst_image);
+
+  // Finds an edge destination slot in the output graph. If the edge crosses a
+  // subgraph boundary it is a slot on the input of a call node, otherwise it is
+  // a slot on a node in the output graph.
+  int FindOutputSlotOfEdgeDst(const string& src_func_id,
+                              const string& dst_func_id, const Edge* edge);
+
+  // Copies a single edge to the output graph. The edge is either entirely
+  // within the output graph, or crosses into or out of a compiled subgraph.
+  Status CopyEdgeToOutputGraph(
+      const Edge* edge, const string& src_func_id, const string& dst_func_id,
+      const std::unordered_map<const Node*, Node*>& node_images,
+      bool parallel_checking, Graph* graph_out,
+      std::unordered_set<std::pair<NodeSlot, NodeSlot>, NodeSlot::PairHasher>*
+          edges_added);
+
+  // Adds all edges to the output graph.
+  Status AddEdgesToOutputGraph(
+      const std::unordered_map<const Node*, Node*>& node_images,
+      bool parallel_checking, Graph* graph_out);
 
   const string group_attribute_;
+  const string outside_compilation_attribute_;
   const Graph* graph_in_;
 
   std::unordered_map<string, Subgraph> subgraphs_;
@@ -154,224 +323,184 @@ class Encapsulator {
   TF_DISALLOW_COPY_AND_ASSIGN(Encapsulator);
 };
 
-// TODO(phawkins) add a canonical copy of these operator names and refactor
-// everything to use it.
-static const char* const kArgOp = "_Arg";
-static const char* const kRetValOp = "_Retval";
-
-// Returns the function name attached to 'node', or the empty string if there is
-// none.
-string Encapsulator::GetFunctionNameAttr(Node const* node) const {
-  string attr;
-  if (!GetNodeAttr(node->attrs(), group_attribute_, &attr).ok()) {
-    attr.clear();
-  }
-  return attr;
+Node* Encapsulator::Subgraph::GetCallNodeForInputs() const {
+  return call_node_inputs_;
 }
 
-Status Encapsulator::SplitIntoSubgraphs() {
-  Status s;
-
-  // Map from input graph nodes to subgraph nodes.
-  std::unordered_map<Node*, Node*> node_images;
-
-  // Copy all marked nodes to a subgraph. Do nothing for unmarked nodes.
-  for (Node* node : graph_in_->op_nodes()) {
-    string func_id = GetFunctionNameAttr(node);
-    if (func_id.empty()) continue;
+Node* Encapsulator::Subgraph::GetCallNodeForOutputs() const {
+  return call_node_outputs_;
+}
 
-    Subgraph& subgraph = subgraphs_[func_id];
-    if (!subgraph.graph) {
-      subgraph.graph.reset(new Graph(graph_in_->op_registry()));
-      subgraph.graph->set_versions(graph_in_->versions());
-    }
+int Encapsulator::Subgraph::GetArgIndexForEdge(const Edge* edge) const {
+  return args_by_dst_.at(NodeSlot(edge->dst(), edge->dst_input()));
+}
 
-    Node* image = subgraph.graph->CopyNode(node);
-    image->ClearAttr(group_attribute_);
-    node_images[node] = image;
+int Encapsulator::Subgraph::GetResultIndexForEdge(const Edge* edge) const {
+  return results_.at(NodeSlot(edge->src(), edge->src_output()));
+}
 
-    if (subgraph.device.empty()) {
-      subgraph.device = node->assigned_device_name().empty()
-                            ? node->requested_device()
-                            : node->assigned_device_name();
-    }
+Node* Encapsulator::Subgraph::MakeNodeImage(const Graph* graph_in, Node* node) {
+  if (!graph_) {
+    graph_.reset(new Graph(graph_in->op_registry()));
+    graph_->set_versions(graph_in->versions());
   }
 
-  // Copy edges local to a subgraph. Add _Arg and _Retval nodes to subgraphs for
-  // data edges that cross subgraph boundaries.
-  for (const Edge* edge : graph_in_->edges()) {
-    string src_func_id = GetFunctionNameAttr(edge->src());
-    string dst_func_id = GetFunctionNameAttr(edge->dst());
-    Node* src_image = gtl::FindWithDefault(node_images, edge->src(), nullptr);
-    Node* dst_image = gtl::FindWithDefault(node_images, edge->dst(), nullptr);
-
-    // Copy edges that are local to a subgraph.
-    if (!src_func_id.empty() && src_func_id == dst_func_id) {
-      Graph* g = subgraphs_[src_func_id].graph.get();
-      if (edge->IsControlEdge()) {
-        g->AddControlEdge(src_image, dst_image);
-      } else {
-        g->AddEdge(src_image, edge->src_output(), dst_image, edge->dst_input());
-      }
-      continue;
-    }
-
-    // Ignore cross-boundary control edges for right now. We will lift them
-    // onto the enclosing call operators in BuildOutputGraph().
-    if (edge->IsControlEdge()) continue;
+  if (device_.empty()) {
+    device_ = node->assigned_device_name().empty()
+                  ? node->requested_device()
+                  : node->assigned_device_name();
+  }
 
-    // Add 'src' as an output of its subgraph, if applicable.
-    if (!src_func_id.empty()) {
-      Subgraph& src_subgraph = subgraphs_[src_func_id];
-      int ret_index = src_subgraph.results.size();
-      if (src_subgraph.results
-              .emplace(NodeSlot(edge->src(), edge->src_output()), ret_index)
-              .second) {
-        // Create a new _Retval node
-        DataType dtype = edge->src()->output_type(edge->src_output());
-
-        if (IsRefType(dtype)) {
-          return errors::InvalidArgument(
-              "Ref Tensors (e.g., Variables) are not supported: tensor ",
-              edge->src()->name(), ":", edge->src_output());
-        }
+  return graph_->CopyNode(node);
+}
 
-        NodeDef ret_def;
-        ret_def.set_op(kRetValOp);
-        ret_def.set_name(strings::StrCat(edge->src()->name(), "_",
-                                         edge->src_output(), "_retval"));
-        AddNodeAttr("T", dtype, &ret_def);
-        AddNodeAttr("index", ret_index, &ret_def);
-        Node* ret = src_subgraph.graph->AddNode(ret_def, &s);
-        if (!s.ok()) return s;
-
-        // Add an edge from 'src' to _Retval.
-        src_subgraph.graph->AddEdge(src_image, edge->src_output(), ret, 0);
-      }
+Graph* Encapsulator::Subgraph::GetGraph() const { return graph_.get(); }
+
+Status Encapsulator::Subgraph::RecordArg(
+    const Edge* edge, const std::unordered_map<const Node*, Node*>& node_images,
+    std::vector<std::pair<const Node*, Node*>>* src_arg_pairs) {
+  Node* src_node = edge->src();
+  int src_slot = edge->src_output();
+  std::unordered_map<NodeSlot, int, NodeSlot::Hasher>::iterator iter;
+  bool inserted;
+  std::tie(iter, inserted) =
+      args_by_src_.emplace(NodeSlot(src_node, src_slot), args_by_src_.size());
+  int arg_index = iter->second;
+  if (inserted) {
+    // Look at the type of the destination not the source, since Ref output
+    // Tensors can be automatically cast to non-Ref Tensors at the destination.
+    DataType dtype = edge->dst()->input_type(edge->dst_input());
+
+    if (IsRefType(dtype)) {
+      return errors::InvalidArgument(
+          "Ref Tensors (e.g., Variables) are not supported as args: tensor ",
+          src_node->name(), ":", src_slot);
     }
 
-    // Add 'dst' as an input of its subgraph, if applicable.
-    if (!dst_func_id.empty()) {
-      Subgraph& dst_subgraph = subgraphs_[dst_func_id];
-
-      // Create an _Arg node for this tensor, if none exists yet.
-      std::unordered_map<NodeSlot, int, NodeSlot::Hasher>::iterator iter;
-      bool inserted;
-      std::tie(iter, inserted) = dst_subgraph.args_by_src.emplace(
-          NodeSlot(edge->src(), edge->src_output()), dst_subgraph.args.size());
-      int arg_index = iter->second;
-      if (inserted) {
-        // This is the first time we have seen this tensor. Create an _Arg node.
-        DataType dtype = edge->dst()->input_type(edge->dst_input());
-
-        if (IsRefType(dtype)) {
-          return errors::InvalidArgument(
-              "Ref Tensors (e.g., Variables) are not supported: tensor ",
-              edge->src()->name(), ":", edge->src_output());
-        }
+    NodeDef arg_def;
+    NodeDefBuilder builder(
+        strings::StrCat(src_node->name(), "_", src_slot, "_arg"), kArgOp);
+    builder.Attr("T", dtype);
+    builder.Attr("index", arg_index);
+    Status s = builder.Finalize(&arg_def);
+    if (!s.ok()) return s;
 
-        NodeDef arg_def;
-        NodeDefBuilder builder(strings::StrCat(edge->src()->name(), "_",
-                                               edge->src_output(), "_arg"),
-                               kArgOp);
-        builder.Attr("T", dtype);
-        builder.Attr("index", arg_index);
-        s = builder.Finalize(&arg_def);
-        if (!s.ok()) return s;
+    Node* arg = graph_->AddNode(arg_def, &s);
+    if (!s.ok()) return s;
 
-        Node* arg = dst_subgraph.graph->AddNode(arg_def, &s);
-        if (!s.ok()) return s;
+    src_arg_pairs->push_back({src_node, arg});
+    args_.push_back(arg);
+  }
+  Node* dst_node = edge->dst();
+  Node* dst_image = node_images.at(dst_node);
+  int dst_slot = edge->dst_input();
+  args_by_dst_[NodeSlot(dst_node, dst_slot)] = arg_index;
+  graph_->AddEdge(args_[arg_index], 0, dst_image, dst_slot);
+  return Status::OK();
+}
 
-        dst_subgraph.args.push_back(arg);
-      }
-      // Add an edge from the _Arg node to 'dst' in the subgraph.
-      dst_subgraph.args_by_dst[NodeSlot(edge->dst(), edge->dst_input())] =
-          arg_index;
-      dst_subgraph.graph->AddEdge(dst_subgraph.args[arg_index], 0, dst_image,
-                                  edge->dst_input());
+Status Encapsulator::Subgraph::RecordResult(
+    const Edge* edge,
+    const std::unordered_map<const Node*, Node*>& node_images) {
+  Node* src_node = edge->src();
+  Node* src_image = node_images.at(src_node);
+  int src_slot = edge->src_output();
+  std::unordered_map<NodeSlot, int, NodeSlot::Hasher>::iterator iter;
+  bool inserted;
+  std::tie(iter, inserted) =
+      results_.emplace(NodeSlot(src_node, src_slot), results_.size());
+  int ret_index = iter->second;
+  if (inserted) {
+    DataType dtype = src_node->output_type(src_slot);
+
+    if (IsRefType(dtype)) {
+      return errors::InvalidArgument(
+          "Ref Tensors (e.g., Variables) are not supported as results: tensor ",
+          src_node->name(), ":", src_slot);
     }
-  }
 
-  for (auto& entry : subgraphs_) {
-    FixupSourceAndSinkEdges(entry.second.graph.get());
-  }
+    NodeDef ret_def;
+    NodeDefBuilder builder(
+        strings::StrCat(src_node->name(), "_", src_slot, "_retval"), kRetValOp);
+    builder.Attr("T", dtype);
+    builder.Attr("index", ret_index);
+    builder.Input(src_image->name(), src_slot, dtype);
+    Status s = builder.Finalize(&ret_def);
+    if (!s.ok()) return s;
+    Node* ret = graph_->AddNode(ret_def, &s);
+    if (!s.ok()) return s;
 
-  return s;
+    graph_->AddEdge(src_image, src_slot, ret, 0);
+  }
+  return Status::OK();
 }
 
-Status Encapsulator::BuildFunctionDefs(
-    const RewriteSubgraphFn& rewrite_subgraph_fn, bool reuse_existing_functions,
-    FunctionLibraryDefinition* library) {
-  // For each subgraph, build a FunctionDef.
-  for (auto& subgraph_entry : subgraphs_) {
-    string name = subgraph_entry.first;
-    Subgraph& subgraph = subgraph_entry.second;
-
-    subgraph.call_node_def.set_op(name);
-    subgraph.call_node_def.set_name(name);
-    subgraph.call_node_def.set_device(subgraph.device);
-
-    if (rewrite_subgraph_fn) {
-      // Initialize the input and output permutations to the identity.
-      std::vector<int> input_permutation(subgraph.args_by_src.size());
-      std::iota(input_permutation.begin(), input_permutation.end(), 0);
-      std::vector<int> output_permutation(subgraph.results.size());
-      std::iota(output_permutation.begin(), output_permutation.end(), 0);
-
-      TF_RETURN_IF_ERROR(
-          rewrite_subgraph_fn(&subgraph.graph, &input_permutation,
-                              &output_permutation, &subgraph.call_node_def));
-
-      // Apply the input/output permutations to the 'args_by_...' and 'results'
-      // mappings in 'subgraph', so when we build edges in BuildOutputGraph() we
-      // connect them to the right input/output positions.
-      if (input_permutation.size() != subgraph.args_by_src.size()) {
-        return errors::InvalidArgument("Input permutation has incorrect size.");
-      }
-      if (output_permutation.size() != subgraph.results.size()) {
-        return errors::InvalidArgument(
-            "Output permutation has incorrect size.");
-      }
-      for (auto& arg : subgraph.args_by_src) {
-        arg.second = input_permutation[arg.second];
-      }
-      for (auto& arg : subgraph.args_by_dst) {
-        arg.second = input_permutation[arg.second];
-      }
-      for (auto& result : subgraph.results) {
-        result.second = output_permutation[result.second];
-      }
-
-      name = subgraph.call_node_def.op();
+Status Encapsulator::Subgraph::BuildFunctionDef(
+    const string& name_in, const RewriteSubgraphFn& rewrite_subgraph_fn,
+    bool reuse_existing_functions, FunctionLibraryDefinition* library) {
+  // name_in is copied here because name may be modified below if
+  // rewrite_subgraph_fn is true.
+  string name = name_in;
+  call_node_def_.set_op(name);
+  call_node_def_.set_name(name);
+  call_node_def_.set_device(device_);
+
+  if (rewrite_subgraph_fn) {
+    // Initialize the input and output permutations to the identity.
+    std::vector<int> input_permutation(args_by_src_.size());
+    std::iota(input_permutation.begin(), input_permutation.end(), 0);
+    std::vector<int> output_permutation(results_.size());
+    std::iota(output_permutation.begin(), output_permutation.end(), 0);
+
+    TF_RETURN_IF_ERROR(rewrite_subgraph_fn(
+        &graph_, &input_permutation, &output_permutation, &call_node_def_));
+
+    // Apply the input/output permutations to the 'args_by_...' and 'results_'
+    // mappings, so when we build edges in BuildOutputGraph() we
+    // connect them to the right input/output positions.
+    if (input_permutation.size() != args_by_src_.size()) {
+      return errors::InvalidArgument("Input permutation has incorrect size.");
+    }
+    if (output_permutation.size() != results_.size()) {
+      return errors::InvalidArgument("Output permutation has incorrect size.");
+    }
+    for (auto& arg : args_by_src_) {
+      arg.second = input_permutation[arg.second];
+    }
+    for (auto& arg : args_by_dst_) {
+      arg.second = input_permutation[arg.second];
+    }
+    for (auto& result : results_) {
+      result.second = output_permutation[result.second];
     }
 
-    FunctionDef fdef;
-    TF_RETURN_IF_ERROR(GraphToFunctionDef(*subgraph.graph, name, &fdef));
+    name = call_node_def_.op();
+  }
 
-    if (VLOG_IS_ON(1)) {
-      VLOG(2) << "Build function def " << name;
-      dump_graph::DumpGraphToFile(
-          strings::StrCat("encapsulate_fdef_graph_", name), *subgraph.graph,
-          library);
-      dump_graph::DumpFunctionDefToFile(
-          strings::StrCat("encapsulate_fdef_", name), fdef);
-    }
+  FunctionDef fdef;
+  TF_RETURN_IF_ERROR(GraphToFunctionDef(*graph_, name, &fdef));
 
-    if (!reuse_existing_functions || library->Find(name) == nullptr) {
-      TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef));
-    }
+  if (VLOG_IS_ON(1)) {
+    VLOG(2) << "Build function def " << name;
+    dump_graph::DumpGraphToFile(
+        strings::StrCat("encapsulate_fdef_graph_", name), *graph_, library);
+    dump_graph::DumpFunctionDefToFile(
+        strings::StrCat("encapsulate_fdef_", name), fdef);
+  }
+
+  if (!reuse_existing_functions || library->Find(name) == nullptr) {
+    TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef));
   }
   return Status::OK();
 }
 
-Status Encapsulator::BuildParallelCheckOp(
+Status Encapsulator::Subgraph::BuildParallelCheckOp(
     const std::unordered_map<const Node*, Node*>& node_images,
-    const Encapsulator::Subgraph& subgraph, Graph* graph_out,
-    Node** parallel_check_op) {
+    Graph* graph_out) {
   // Build an index mapping output positions to node/slot pairs in the
   // original graph.
-  std::vector<NodeSlot> results_by_num(subgraph.results.size());
-  for (const auto& entry : subgraph.results) {
+  std::vector<NodeSlot> results_by_num(results_.size());
+  for (const auto& entry : results_) {
     results_by_num[entry.second] = entry.first;
   }
 
@@ -386,22 +515,22 @@ Status Encapsulator::BuildParallelCheckOp(
     expected_outputs[i] =
         NodeDefBuilder::NodeOut(node_images.at(node_slot.node)->name(),
                                 node_slot.slot, result_dtypes[i]);
-    actual_outputs[i] = NodeDefBuilder::NodeOut(subgraph.call_node_def.name(),
-                                                i, result_dtypes[i]);
+    actual_outputs[i] =
+        NodeDefBuilder::NodeOut(call_node_def_.name(), i, result_dtypes[i]);
   }
   // Assign the parallel check op to a CPU on the same task as the cluster it is
   // checking.
   string device, dummy;
   if (!DeviceNameUtils::SplitDeviceName(
-          subgraph.call_node_inputs->assigned_device_name(), &device, &dummy)) {
+          call_node_inputs_->assigned_device_name(), &device, &dummy)) {
     return errors::InvalidArgument("Could not parse device name");
   }
   strings::StrAppend(&device, "/cpu:0");
 
   NodeDef check_def;
   TF_RETURN_IF_ERROR(
-      NodeDefBuilder(graph_out->NewName(strings::StrCat(
-                         subgraph.call_node_def.name(), "_parallel_check")),
+      NodeDefBuilder(graph_out->NewName(strings::StrCat(call_node_def_.name(),
+                                                        "_parallel_check")),
                      "ParallelCheck")
           .Device(device)
           .Attr("T", result_dtypes)
@@ -421,65 +550,303 @@ Status Encapsulator::BuildParallelCheckOp(
     const NodeSlot& node_slot = results_by_num[i];
     graph_out->AddEdge(node_images.at(node_slot.node), node_slot.slot, check_op,
                        i);
-    graph_out->AddEdge(subgraph.call_node_inputs, i, check_op, num_results + i);
+    graph_out->AddEdge(call_node_inputs_, i, check_op, num_results + i);
   }
 
-  *parallel_check_op = check_op;
+  call_node_outputs_ = check_op;
   return Status::OK();
 }
 
-Status Encapsulator::BuildOutputGraph(bool parallel_checking,
-                                      Graph* graph_out) {
+Status Encapsulator::Subgraph::AddFunctionCallNode(
+    const std::unordered_map<const Node*, Node*>& node_images,
+    bool parallel_checking, Graph* graph_out) {
   Status s;
+  call_node_inputs_ = graph_out->AddNode(call_node_def_, &s);
+  if (!s.ok()) return s;
 
-  // Map from nodes in the input graph to nodes in the output graph.
+  // Copy the assigned device and the key_annotation over.
+  call_node_inputs_->set_assigned_device_name(device_);
+  call_node_outputs_ = call_node_inputs_;
+
+  if (parallel_checking) {
+    TF_RETURN_IF_ERROR(BuildParallelCheckOp(node_images, graph_out));
+  }
+  return Status::OK();
+}
+
+Status Encapsulator::GetFunctionNameAttr(Node const* node, string* attr) const {
+  Status s = GetNodeAttr(node->attrs(), group_attribute_, attr);
+  if (s.code() == error::Code::NOT_FOUND) {
+    // Return empty attr if there's no group_attribute.
+    attr->clear();
+    return Status::OK();
+  }
+  return s;
+}
+
+bool IsInSubgraph(const string& func_id) { return !func_id.empty(); }
+
+Status Encapsulator::CopySubgraphNodes(
+    std::unordered_map<const Node*, Node*>* node_images) {
+  for (Node* node : graph_in_->op_nodes()) {
+    string func_id;
+    TF_RETURN_IF_ERROR(GetFunctionNameAttr(node, &func_id));
+    if (!IsInSubgraph(func_id)) continue;
+
+    Subgraph& subgraph = subgraphs_[func_id];
+    Node* image = subgraph.MakeNodeImage(graph_in_, node);
+    image->ClearAttr(group_attribute_);
+    (*node_images)[node] = image;
+  }
+  return Status::OK();
+}
+
+Status Encapsulator::CopySubgraphEdges(
+    const std::unordered_map<const Node*, Node*>& node_images,
+    std::vector<std::pair<const Node*, Node*>>* src_arg_pairs) {
+  for (const Edge* edge : graph_in_->edges()) {
+    string src_func_id;
+    TF_RETURN_IF_ERROR(GetFunctionNameAttr(edge->src(), &src_func_id));
+    string dst_func_id;
+    TF_RETURN_IF_ERROR(GetFunctionNameAttr(edge->dst(), &dst_func_id));
+    Node* src_image = gtl::FindWithDefault(node_images, edge->src(), nullptr);
+    Node* dst_image = gtl::FindWithDefault(node_images, edge->dst(), nullptr);
+
+    // Copy edges that are local to a subgraph.
+    if (IsInSubgraph(src_func_id) && IsInSubgraph(dst_func_id) &&
+        src_func_id == dst_func_id) {
+      Graph* g = subgraphs_[src_func_id].GetGraph();
+      if (edge->IsControlEdge()) {
+        g->AddControlEdge(src_image, dst_image);
+      } else {
+        g->AddEdge(src_image, edge->src_output(), dst_image, edge->dst_input());
+      }
+      continue;
+    }
+
+    // Record 'src' as an output of its subgraph, if applicable.
+    if (IsInSubgraph(src_func_id)) {
+      Subgraph& src_subgraph = subgraphs_[src_func_id];
+      // Ignore control edges leaving the subgraph. We will lift them onto the
+      // enclosing call operators in BuildOutputGraph().
+      if (!edge->IsControlEdge()) {
+        TF_RETURN_IF_ERROR(src_subgraph.RecordResult(edge, node_images));
+      }
+    }
+
+    // Record 'dst' as an input of its subgraph, if applicable.
+    if (IsInSubgraph(dst_func_id)) {
+      Subgraph& dst_subgraph = subgraphs_[dst_func_id];
+      // Ignore control edges entering the subgraph. We will lift them onto
+      // the enclosing call operators in BuildOutputGraph().
+      if (!edge->IsControlEdge()) {
+        TF_RETURN_IF_ERROR(
+            dst_subgraph.RecordArg(edge, node_images, src_arg_pairs));
+      }
+    }
+  }
+  return Status::OK();
+}
+
+Status Encapsulator::SplitIntoSubgraphs() {
+  Status s;
+
+  // Map from input graph nodes to subgraph nodes.
   std::unordered_map<const Node*, Node*> node_images;
 
-  // Copy all unmarked nodes to the output graph.
+  // Each entry of src_arg_pairs is a pair whose first element is a node in the
+  // original graph that has an output edge in the subgraph, and whose second
+  // element is the arg node in the subgraph that it sends to. The vector will
+  // be filled in below in AddArgs.
+  std::vector<std::pair<const Node*, Node*>> src_arg_pairs;
+
+  TF_RETURN_IF_ERROR(CopySubgraphNodes(&node_images));
+  TF_RETURN_IF_ERROR(CopySubgraphEdges(node_images, &src_arg_pairs));
+
+  MarkGuaranteedConstants(*graph_in_, src_arg_pairs);
+
+  for (auto& entry : subgraphs_) {
+    Subgraph& subgraph = entry.second;
+    FixupSourceAndSinkEdges(subgraph.GetGraph());
+  }
+
+  return s;
+}
+
+Status Encapsulator::BuildFunctionDefs(
+    const RewriteSubgraphFn& rewrite_subgraph_fn, bool reuse_existing_functions,
+    FunctionLibraryDefinition* library) {
+  for (auto& subgraph_entry : subgraphs_) {
+    string name = subgraph_entry.first;
+    Subgraph& subgraph = subgraph_entry.second;
+    TF_RETURN_IF_ERROR(subgraph.BuildFunctionDef(
+        name, rewrite_subgraph_fn, reuse_existing_functions, library));
+  }
+  return Status::OK();
+}
+
+Status Encapsulator::CopyNodesToOutputGraph(
+    bool parallel_checking, Graph* graph_out,
+    std::unordered_map<const Node*, Node*>* node_images) {
   for (Node* node : graph_in_->op_nodes()) {
-    string func_id = GetFunctionNameAttr(node);
+    string func_id;
+    TF_RETURN_IF_ERROR(GetFunctionNameAttr(node, &func_id));
 
-    // Don't copy nodes that going to be encapsulated, unless parallel checking
-    // is enabled.
-    if (!func_id.empty() && !parallel_checking) continue;
+    // Don't copy nodes that are going to be encapsulated, unless parallel
+    // checking is enabled.
+    if (IsInSubgraph(func_id) && !parallel_checking) continue;
 
     Node* image = graph_out->CopyNode(node);
-    node_images[node] = image;
+    (*node_images)[node] = image;
   }
-  node_images[graph_in_->source_node()] = graph_out->source_node();
-  node_images[graph_in_->sink_node()] = graph_out->sink_node();
+  (*node_images)[graph_in_->source_node()] = graph_out->source_node();
+  (*node_images)[graph_in_->sink_node()] = graph_out->sink_node();
+  return Status::OK();
+}
 
-  // Add function call nodes for each subgraph.
+Status Encapsulator::AddFunctionCallNodes(
+    const std::unordered_map<const Node*, Node*>& node_images,
+    bool parallel_checking, Graph* graph_out) {
   for (auto& subgraph_entry : subgraphs_) {
-    Subgraph& subgraph = subgraph_entry.second;
+    TF_RETURN_IF_ERROR(subgraph_entry.second.AddFunctionCallNode(
+        node_images, parallel_checking, graph_out));
+  }
+  return Status::OK();
+}
 
-    subgraph.call_node_inputs = graph_out->AddNode(subgraph.call_node_def, &s);
-    if (!s.ok()) return s;
+Status Encapsulator::FindOutputImageOfEdgeSrc(
+    const string& src_func_id, const string& dst_func_id,
+    const std::unordered_map<const Node*, Node*>& node_images,
+    const Node* original_src_node, Node** src_image) {
+  if (IsInSubgraph(src_func_id)) {
+    // The edge is from a subgraph to a regular node in the output graph so
+    // use the subgraph's call node output.
+    *src_image = subgraphs_.at(src_func_id).GetCallNodeForOutputs();
+  } else {
+    // The source of the edge is in the output graph so use the node image in
+    // the output graph.
+    *src_image = node_images.at(original_src_node);
+  }
+  return Status::OK();
+}
+
+int Encapsulator::FindOutputSlotOfEdgeSrc(const string& src_func_id,
+                                          const string& dst_func_id,
+                                          const Edge* edge) {
+  if (IsInSubgraph(src_func_id)) {
+    const Subgraph& src_subgraph = subgraphs_.at(src_func_id);
+    // 'src' is in a subgraph and 'dst' is a regular node in the output
+    // graph. Use the corresponding call output instead.
+    return src_subgraph.GetResultIndexForEdge(edge);
+  } else {
+    // The source of the edge is in the output graph so use the regular edge
+    // slot.
+    return edge->src_output();
+  }
+}
 
-    // Copy the assigned device and the key_annotation over.
-    subgraph.call_node_inputs->set_assigned_device_name(subgraph.device);
-    subgraph.call_node_outputs = subgraph.call_node_inputs;
+Status Encapsulator::FindOutputImageOfEdgeDst(
+    const string& src_func_id, const string& dst_func_id,
+    const std::unordered_map<const Node*, Node*>& node_images,
+    const Node* original_dst_node, Node** dst_image) {
+  if (IsInSubgraph(dst_func_id)) {
+    // The edge is to a subgraph from a regular node in the output graph so
+    // use the subgraph's call node input.
+    *dst_image = subgraphs_.at(dst_func_id).GetCallNodeForInputs();
+  } else {
+    // The destination of the edge is in the output graph so use the node image
+    // in the output graph.
+    *dst_image = node_images.at(original_dst_node);
+  }
+  return Status::OK();
+}
 
+int Encapsulator::FindOutputSlotOfEdgeDst(const string& src_func_id,
+                                          const string& dst_func_id,
+                                          const Edge* edge) {
+  if (IsInSubgraph(dst_func_id)) {
+    const Subgraph& dst_subgraph = subgraphs_.at(dst_func_id);
+    // 'dst' is in a subgraph and 'src' is a regular node in the output
+    // graph. Use the corresponding call input instead.
+    return dst_subgraph.GetArgIndexForEdge(edge);
+  } else {
+    // The destination of the edge is in the output graph so use the regular
+    // edge slot.
+    return edge->dst_input();
+  }
+}
+
+Status Encapsulator::CopyEdgeToOutputGraph(
+    const Edge* edge, const string& src_func_id, const string& dst_func_id,
+    const std::unordered_map<const Node*, Node*>& node_images,
+    bool parallel_checking, Graph* graph_out,
+    std::unordered_set<std::pair<NodeSlot, NodeSlot>, NodeSlot::PairHasher>*
+        edges_added) {
+  Node* src_image;
+  TF_RETURN_IF_ERROR(FindOutputImageOfEdgeSrc(
+      src_func_id, dst_func_id, node_images, edge->src(), &src_image));
+  Node* dst_image;
+  TF_RETURN_IF_ERROR(FindOutputImageOfEdgeDst(
+      src_func_id, dst_func_id, node_images, edge->dst(), &dst_image));
+
+  // If this is a control edge then copy it and return. Lift control edges onto
+  // the enclosing call operator.
+  if (edge->IsControlEdge()) {
+    // Add the control edge, if we have not already added it, using the images
+    // determined above (potentially call operators or RecvAtHost/SendFromHost).
+    if (edges_added->emplace(NodeSlot(src_image, -1), NodeSlot(dst_image, -1))
+            .second) {
+      graph_out->AddControlEdge(src_image, dst_image);
+    }
+
+    // If parallel checking is enabled, also add a control edge to the
+    // corresponding parallel check op.
     if (parallel_checking) {
-      TF_RETURN_IF_ERROR(BuildParallelCheckOp(node_images, subgraph, graph_out,
-                                              &subgraph.call_node_outputs));
+      graph_out->AddControlEdge(src_image, node_images.at(edge->dst()));
     }
+    return Status::OK();
+  }
+
+  int src_output = FindOutputSlotOfEdgeSrc(src_func_id, dst_func_id, edge);
+
+  int dst_input = FindOutputSlotOfEdgeDst(src_func_id, dst_func_id, edge);
+
+  if (IsInSubgraph(dst_func_id) && parallel_checking) {
+    // If we are parallel checking, also feed the tensor as an input to the
+    // corresponding parallel check subgraph.
+    graph_out->AddEdge(src_image, src_output, node_images.at(edge->dst()),
+                       edge->dst_input());
   }
 
+  // Add the edge, if we have not already added it.
+  if (edges_added
+          ->emplace(NodeSlot(src_image, src_output),
+                    NodeSlot(dst_image, dst_input))
+          .second) {
+    graph_out->AddEdge(src_image, src_output, dst_image, dst_input);
+  }
+  return Status::OK();
+}
+
+Status Encapsulator::AddEdgesToOutputGraph(
+    const std::unordered_map<const Node*, Node*>& node_images,
+    bool parallel_checking, Graph* graph_out) {
   // Set of edges already added to the output graph, represented as (src, dst)
   // pairs. We use the set to deduplicate edges; multiple edges in the input
   // graph may map to one edge in the output graph.
   std::unordered_set<std::pair<NodeSlot, NodeSlot>, NodeSlot::PairHasher>
       edges_added;
 
-  // Add edges to the graph_out graph.
   for (const Edge* edge : graph_in_->edges()) {
-    string src_func_id = GetFunctionNameAttr(edge->src());
-    string dst_func_id = GetFunctionNameAttr(edge->dst());
+    string src_func_id;
+    TF_RETURN_IF_ERROR(GetFunctionNameAttr(edge->src(), &src_func_id));
+    string dst_func_id;
+    TF_RETURN_IF_ERROR(GetFunctionNameAttr(edge->dst(), &dst_func_id));
 
     // Ignore edges that are strictly contained within one subgraph, unless
     // we are constructing parallel check graphs.
-    if (!src_func_id.empty() && src_func_id == dst_func_id) {
+    if (IsInSubgraph(src_func_id) && IsInSubgraph(dst_func_id) &&
+        src_func_id == dst_func_id) {
       if (parallel_checking) {
         Node* src_image = node_images.at(edge->src());
         Node* dst_image = node_images.at(edge->dst());
@@ -493,63 +860,29 @@ Status Encapsulator::BuildOutputGraph(bool parallel_checking,
       continue;
     }
 
-    // We have an edge that crosses a cluster boundary.
-    Node* src_image = src_func_id.empty()
-                          ? node_images.at(edge->src())
-                          : subgraphs_.at(src_func_id).call_node_outputs;
-    Node* dst_image = dst_func_id.empty()
-                          ? node_images.at(edge->dst())
-                          : subgraphs_.at(dst_func_id).call_node_inputs;
-
-    // Copy control edges. Lift control edges onto the enclosing call operator.
-    if (edge->IsControlEdge()) {
-      // Add the control edge, if we have not already added it.
-      if (edges_added.emplace(NodeSlot(src_image, -1), NodeSlot(dst_image, -1))
-              .second) {
-        graph_out->AddControlEdge(src_image, dst_image);
-      }
-
-      // If parallel checking is enabled, also add a control edge to the
-      // corresponding parallel check op.
-      if (parallel_checking) {
-        graph_out->AddControlEdge(src_image, node_images.at(edge->dst()));
-      }
-      continue;
-    }
-
-    int src_output = edge->src_output();
-    if (!src_func_id.empty()) {
-      // 'src' is in a subgraph. Use the corresponding call output instead.
-      const Subgraph& src_subgraph = subgraphs_.at(src_func_id);
-      src_output =
-          src_subgraph.results.at(NodeSlot(edge->src(), edge->src_output()));
-    }
+    // We have an edge that crosses a cluster boundary or is entirely within the
+    // unclustered graph.
+    TF_RETURN_IF_ERROR(CopyEdgeToOutputGraph(edge, src_func_id, dst_func_id,
+                                             node_images, parallel_checking,
+                                             graph_out, &edges_added));
+  }
 
-    int dst_input = edge->dst_input();
+  return Status::OK();
+}
 
-    if (!dst_func_id.empty()) {
-      // 'dst' is in a subgraph. Use the corresponding call input instead.
-      const Subgraph& dst_subgraph = subgraphs_.at(dst_func_id);
-      dst_input =
-          dst_subgraph.args_by_dst.at(NodeSlot(edge->dst(), edge->dst_input()));
+Status Encapsulator::BuildOutputGraph(bool parallel_checking,
+                                      Graph* graph_out) {
+  // Map from nodes in the input graph to nodes in the output graph.
+  std::unordered_map<const Node*, Node*> node_images;
 
-      // If we are parallel checking, also feed the tensor as an input to the
-      // corresponding parallel check subgraph.
-      if (parallel_checking) {
-        graph_out->AddEdge(src_image, src_output, node_images.at(edge->dst()),
-                           edge->dst_input());
-      }
-    }
-    // Add the edge, if we have not already added it.
-    if (edges_added
-            .emplace(NodeSlot(src_image, src_output),
-                     NodeSlot(dst_image, dst_input))
-            .second) {
-      graph_out->AddEdge(src_image, src_output, dst_image, dst_input);
-    }
-  }
+  TF_RETURN_IF_ERROR(
+      CopyNodesToOutputGraph(parallel_checking, graph_out, &node_images));
+  TF_RETURN_IF_ERROR(
+      AddFunctionCallNodes(node_images, parallel_checking, graph_out));
+  TF_RETURN_IF_ERROR(
+      AddEdgesToOutputGraph(node_images, parallel_checking, graph_out));
 
-  return s;
+  return Status::OK();
 }
 
 }  // anonymous namespace
@@ -562,20 +895,18 @@ Status EncapsulateSubgraphsInFunctions(
   Status s;
 
   Encapsulator encapsulator(std::move(group_attribute), &graph_in);
-  s = encapsulator.SplitIntoSubgraphs();
-  if (!s.ok()) return s;
+  TF_RETURN_IF_ERROR(encapsulator.SplitIntoSubgraphs());
 
-  s = encapsulator.BuildFunctionDefs(rewrite_subgraph_fn,
-                                     reuse_existing_functions, library);
-  if (!s.ok()) return s;
+  TF_RETURN_IF_ERROR(encapsulator.BuildFunctionDefs(
+      rewrite_subgraph_fn, reuse_existing_functions, library));
 
   std::unique_ptr<Graph> out(new Graph(library));
   out->set_versions(graph_in.versions());
-  s = encapsulator.BuildOutputGraph(parallel_checking, out.get());
-  if (!s.ok()) return s;
+  TF_RETURN_IF_ERROR(
+      encapsulator.BuildOutputGraph(parallel_checking, out.get()));
 
   *graph_out = std::move(out);
-  return s;
+  return Status::OK();
 }
 
 // Finds the types of the _Arg nodes, indexed by position.
@@ -691,8 +1022,8 @@ Status EncapsulateSubgraphsPass::Run(
 
   TF_RETURN_IF_ERROR(EncapsulateSubgraphsInFunctions(
       kXlaClusterAttr, **options.graph, rewrite_subgraph,
-      flags->tf_xla_parallel_checking, /*reuse_existing_functions=*/false,
-      &graph_out, library));
+      flags->tf_xla_parallel_checking,
+      /*reuse_existing_functions=*/false, &graph_out, library));
 
   if (VLOG_IS_ON(1)) {
     dump_graph::DumpGraphToFile("after_encapsulate_subgraphs", *graph_out,
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
index 4a1dbaf05dc7824835f3567c6abcf48222720230..717efb360185f1ce26ee1e9adb0ee5bf7f4799f8 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@@ -398,5 +398,109 @@ TEST(EncapsulateSubgraphsTest, ParallelChecking) {
   EXPECT_EQ(expected_edges, GraphEdges(*graph));
 }
 
+const Node* FindNodeByName(const Graph& graph, const string& name) {
+  for (const Node* node : graph.nodes()) {
+    if (node->name() == name) return node;
+  }
+  return nullptr;
+}
+
+bool HasGuaranteeConstAttr(const Node& n) {
+  bool is_guaranteed_constant = false;
+  if (!GetNodeAttr(n.attrs(), "_is_guaranteed_constant",
+                   &is_guaranteed_constant)
+           .ok()) {
+    return false;
+  }
+  return is_guaranteed_constant;
+}
+
+TEST(EncapsulateSubgraphsWithGuaranteeConstOpTest, Simple) {
+  Scope root = Scope::NewRootScope().ExitOnError().WithDevice(
+      "/job:localhost/replica:0/task:0/cpu:0");
+  auto x1 = ops::Placeholder(root.WithOpName("x1"), DT_FLOAT);
+  auto const_x2 = ops::Const(root.WithOpName("const_x2"), 10.0f);
+  auto const_guarantee_x1 =
+      ops::GuaranteeConst(root.WithOpName("const_guarantee_x1"), x1);
+  auto add1 = ops::Add(root.WithOpName("add1"), const_guarantee_x1, const_x2);
+  add1.node()->AddAttr("_encapsulate", "encapsulate1");
+
+  Graph graph_before(OpRegistry::Global());
+  TF_ASSERT_OK(root.ToGraph(&graph_before));
+
+  std::unique_ptr<Graph> graph_after;
+  FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  int guaranteed_consts = 0;
+  TF_ASSERT_OK(EncapsulateSubgraphsInFunctions(
+      "_encapsulate", graph_before,
+      /*rewrite_subgraph_fn=*/
+      [&guaranteed_consts](std::unique_ptr<Graph>* graph_ptr,
+                           std::vector<int>* input_permutation,
+                           std::vector<int>* output_permutation,
+                           NodeDef* call_def) {
+        Graph* graph = graph_ptr->get();
+        for (const Node* n : graph->nodes()) {
+          if (n->type_string() == "_Arg" &&
+              StringPiece(n->name()).starts_with("const")) {
+            ++guaranteed_consts;
+            EXPECT_TRUE(HasGuaranteeConstAttr(*n));
+          } else {
+            EXPECT_FALSE(HasGuaranteeConstAttr(*n));
+          }
+        }
+        return Status::OK();
+      },
+      /*parallel_checking=*/false,
+      /*reuse_existing_functions=*/false, &graph_after, &library));
+  EXPECT_EQ(2, guaranteed_consts);
+}
+
+TEST(EncapsulateSubgraphsWithGuaranteeConstOpTest, Add) {
+  Scope root = Scope::NewRootScope().ExitOnError().WithDevice(
+      "/job:localhost/replica:0/task:0/cpu:0");
+  auto x1 = ops::Placeholder(root.WithOpName("x1"), DT_FLOAT);
+  auto x2 = ops::Placeholder(root.WithOpName("x2"), DT_FLOAT);
+  auto const_guarantee_x1 =
+      ops::GuaranteeConst(root.WithOpName("const_guarantee_x1"), x1);
+  auto const_guarantee_x2 =
+      ops::GuaranteeConst(root.WithOpName("const_guarantee_x2"), x2);
+  auto const_guarantee_add1 = ops::Add(root.WithOpName("const_guarantee_add1"),
+                                       const_guarantee_x1, const_guarantee_x2);
+  auto add2 = ops::Add(root.WithOpName("add2"), const_guarantee_x1, x2);
+  auto mul1 = ops::Mul(root.WithOpName("mul1"), const_guarantee_add1, add2);
+  mul1.node()->AddAttr("_encapsulate", "encapsulate1");
+
+  Graph graph_before(OpRegistry::Global());
+  TF_ASSERT_OK(root.ToGraph(&graph_before));
+
+  std::unique_ptr<Graph> graph_after;
+  FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  int guaranteed_consts = 0;
+  TF_ASSERT_OK(EncapsulateSubgraphsInFunctions(
+      "_encapsulate", graph_before,
+      /*rewrite_subgraph_fn=*/
+      [&guaranteed_consts](std::unique_ptr<Graph>* graph_ptr,
+                           std::vector<int>* input_permutation,
+                           std::vector<int>* output_permutation,
+                           NodeDef* call_def) {
+        Graph* graph = graph_ptr->get();
+        for (const Node* n : graph->nodes()) {
+          if (n->type_string() == "_Arg" &&
+              StringPiece(n->name()).starts_with("const")) {
+            ++guaranteed_consts;
+            EXPECT_TRUE(HasGuaranteeConstAttr(*n));
+          } else {
+            EXPECT_FALSE(HasGuaranteeConstAttr(*n));
+          }
+        }
+        return Status::OK();
+      },
+      /*parallel_checking=*/false,
+      /*reuse_existing_functions=*/false, &graph_after, &library));
+  // Only 1 runtime const, which is const_guarantee_add1. Add2 has one const
+  // and another non-const, so overall non-const.
+  EXPECT_EQ(1, guaranteed_consts);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD
index 459a582e157f5ddc63997ca93e7c0294293517d3..9bea5663319c8a25249fdc265cee0191556a7c04 100644
--- a/tensorflow/compiler/jit/kernels/BUILD
+++ b/tensorflow/compiler/jit/kernels/BUILD
@@ -16,7 +16,6 @@ cc_library(
         "//tensorflow/compiler/jit:xla_device",
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
-        "//tensorflow/compiler/tf2xla:xla_local_runtime_context",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
index e481796d9e626fc8cdf36687ad110b0a8a788be0..4f3f17df9c680c63546d17dcc5a2775a1014f6c3 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/compiler/jit/xla_device.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
-#include "tensorflow/compiler/tf2xla/xla_local_runtime_context.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
@@ -103,7 +102,6 @@ xla::StatusOr<gpu::DeviceMemoryBase> XlaAllocator::Allocate(
   }
   void* data =
       reinterpret_cast<void*>(const_cast<char*>(t.tensor_data().data()));
-  TF_RET_CHECK(data != nullptr);
   tensors_[data] = t;
   return gpu::DeviceMemoryBase(data, size);
 }
@@ -111,7 +109,6 @@ xla::StatusOr<gpu::DeviceMemoryBase> XlaAllocator::Allocate(
 Status XlaAllocator::RegisterArgument(const Tensor* t) {
   void* data =
       reinterpret_cast<void*>(const_cast<char*>(t->tensor_data().data()));
-  TF_RET_CHECK(data != nullptr);
   tensors_[data] = *t;
   return Status::OK();
 }
@@ -267,7 +264,6 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
 
   // Builds an XLA allocator for the device.
   XlaAllocator xla_allocator(client->platform(), ctx);
-  XlaLocalRuntimeContext local_runtime_context;
 
   std::unique_ptr<xla::ShapedBuffer> output;
   // Build xla::ShapedBuffers that point directly to the Tensor buffers.
@@ -291,27 +287,22 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
     gpu::DeviceMemoryBase dmem = gpu::DeviceMemoryBase(
         const_cast<char*>(t->tensor_data().data()), t->tensor_data().size());
 
-    arg_buffers[i] =
-        xla::ShapedBuffer::MakeArrayShapedBuffer(
-            shape, client->platform(), client->default_device_ordinal(), dmem)
-            .ConsumeValueOrDie();
+    const xla::Shape on_device_shape =
+        client->backend().transfer_manager()->HostShapeToDeviceShape(shape);
+    CHECK(xla::ShapeUtil::Equal(shape, on_device_shape))
+        << "On-device shape "
+        << xla::ShapeUtil::HumanStringWithLayout(on_device_shape)
+        << " not the same as on-host shape "
+        << xla::ShapeUtil::HumanStringWithLayout(shape);
+    arg_buffers[i] = xla::MakeUnique<xla::ShapedBuffer>(
+        /*on_host_shape=*/shape, /*on_device_shape=*/shape, client->platform(),
+        client->default_device_ordinal());
+    arg_buffers[i]->set_buffer(dmem, /*index=*/{});
     arg_ptrs[i] = arg_buffers[i].get();
 
     OP_REQUIRES_OK(ctx, xla_allocator.RegisterArgument(t));
   }
 
-  // Make the final parameter point at local_runtime_context.
-  if (kernel->requires_runtime_context) {
-    gpu::DeviceMemoryBase local_runtime_context_dmem(
-        &local_runtime_context, sizeof(local_runtime_context));
-    arg_buffers.push_back(
-        xla::ShapedBuffer::MakeArrayShapedBuffer(
-            xla::ShapeUtil::MakeOpaqueShape(), client->platform(),
-            client->default_device_ordinal(), local_runtime_context_dmem)
-            .ConsumeValueOrDie());
-    arg_ptrs.push_back(arg_buffers.back().get());
-  }
-
   // Execute the computation.
   VLOG(2) << "Executing computation.";
   xla::ExecutableRunOptions run_options;
@@ -323,19 +314,13 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   auto run_result = executable->Run(arg_ptrs, run_options);
   OP_REQUIRES(ctx, run_result.ok(), run_result.status());
 
-  if (local_runtime_context.error) {
-    ctx->CtxFailure(errors::InvalidArgument("Compiled kernel returned error: ",
-                                            local_runtime_context.error_msg));
-    return;
-  }
-
   output = run_result.ConsumeValueOrDie()->release();
   auto elapsed = env->NowMicros() - start_time;
   VLOG(2) << "Elapsed time: " << elapsed << "us";
 
   // Computation output should always be a tuple.
   if (VLOG_IS_ON(2)) {
-    VLOG(2) << "Result tuple shape: " << output->shape().DebugString();
+    VLOG(2) << "Result tuple shape: " << output->on_host_shape().DebugString();
   }
   CHECK_EQ(ctx->num_outputs(), kernel->outputs.size());
 
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 74c9791f5eaf1fbc43b152520df496a3b552af18..1f311a3aedbf7711ce6a081671f5848a81f2bd85 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -172,10 +172,15 @@ bool HasResourceInputOrOutput(const Node& node) {
                    DT_RESOURCE) != node.output_types().end();
 }
 
+struct NodeCompare {
+  bool operator()(const Node* a, const Node* b) { return a->id() < b->id(); }
+};
+using OrderedNodeSet = std::set<Node*, NodeCompare>;
+
 Status FindCompilationCandidates(
     const Graph& graph, FunctionLibraryDefinition* flib_def, Env* env,
     const std::function<bool(const Node*, const DeviceType&)>& is_compilable_fn,
-    std::unordered_set<Node*>* candidates) {
+    OrderedNodeSet* candidates) {
   OptimizerOptions opts;
   std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(
       new ProcessFunctionLibraryRuntime(nullptr, env, TF_GRAPH_DEF_VERSION,
@@ -210,6 +215,13 @@ Status FindCompilationCandidates(
         !IsCompilableWhile(*node, jit_device_type, 0, lib_runtime)) {
       continue;
     }
+    // _Retval nodes in a top-level function represent fetches.
+    // Do not compile them.
+    if (node->type_string() == "_Retval") {
+      VLOG(2) << "Compilation rejected node: return value " << node->name()
+              << ": " << node->type_string();
+      continue;
+    }
     candidates->insert(node);
   }
   return Status::OK();
@@ -347,7 +359,7 @@ Status MarkForCompilationPass::RunImpl(
 
   Graph* graph = options.graph->get();
 
-  std::unordered_set<Node*> compilation_candidates;
+  OrderedNodeSet compilation_candidates;
   TF_RETURN_IF_ERROR(FindCompilationCandidates(
       *graph, options.flib_def,
       (options.session_options != nullptr) ? options.session_options->env
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index b3d258aea177fbefa4bae51d8156da2ff86c9032..454f0aeae98d7afd51f12b2cfb1810de275a57f7 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -525,5 +525,32 @@ TEST(XlaCompilationTest, IllegalCycle_UsefulErrorMessage) {
                             "+-- c\n"));
 }
 
+TEST(XlaCompilationTest, Retval) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  GraphDef graphdef;
+  {
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Node* a = ops::SourceOp("Const", builder.opts()
+                                         .WithName("A")
+                                         .WithAttr("dtype", DT_FLOAT)
+                                         .WithAttr("value", Tensor()));
+    Node* b = ops::UnaryOp("Relu", a, builder.opts().WithName("B"));
+    ops::UnaryOp("_Retval", b,
+                 builder.opts()
+                     .WithName("R")
+                     .WithAttr("T", DT_FLOAT)
+                     .WithAttr("index", 0));
+
+    TF_EXPECT_OK(builder.ToGraph(graph.get()));
+  }
+
+  TF_ASSERT_OK(MarkForCompilation(&graph));
+  auto clusters = GetClusters(*graph);
+
+  EXPECT_EQ(2, clusters.size());
+  EXPECT_TRUE(clusters.find("R") == clusters.cend());
+  EXPECT_EQ(clusters["A"], clusters["B"]);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc
index bc2eccd2779b9ff68ae2121f7bc53d6f74aec3e3..3717c2cc24283e0b218f92ec820d16893cbe0c35 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.cc
+++ b/tensorflow/compiler/jit/xla_compilation_cache.cc
@@ -214,17 +214,12 @@ Status XlaCompilationCache::BuildExecutable(
     const XlaCompiler::CompilationResult& result,
     std::unique_ptr<xla::LocalExecutable>* executable) {
   VLOG(2) << "Compiling to local executable";
-  xla::Shape opaque_shape = xla::ShapeUtil::MakeOpaqueShape();
 
   std::vector<const xla::Shape*> argument_layouts(
       result.xla_input_shapes.size());
   for (int i = 0; i < result.xla_input_shapes.size(); ++i) {
     argument_layouts[i] = &result.xla_input_shapes[i];
   }
-  if (result.requires_runtime_context) {
-    // The final arg is the XlaLocalRuntimeContext*.
-    argument_layouts.push_back(&opaque_shape);
-  }
   xla::ExecutableBuildOptions build_options;
   build_options.set_device_ordinal(client_->default_device_ordinal());
   build_options.set_result_layout(result.xla_output_shape);
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index fed2c92d763c33aad3c5b3f07c1f33364c797793..c936222f32056e92efced82d5adb3a96c8041a17 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -71,12 +71,14 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
     void* dst_ptr = DMAHelper::base(device_tensor);
     se::DeviceMemoryBase dev_dst_ptr(dst_ptr, total_bytes);
 
-    Status status = Status::OK();
+    Status status;
     stream_->ThenMemcpy(&dev_dst_ptr, src_ptr, total_bytes);
     // TODO(hpucha): Make this asynchronous.
-    if (!stream_->BlockHostUntilDone()) {
+    Status block_status = stream_->BlockHostUntilDone();
+    if (!block_status.ok()) {
       status = xla::InternalError(
-          "Failed to complete data transfer on stream %p", stream_);
+          "Failed to complete data transfer on stream %p: %s", stream_,
+          block_status.error_message().c_str());
     }
 
     done(status);
@@ -105,12 +107,14 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
     se::DeviceMemoryBase dev_src_ptr(src_ptr, total_bytes);
     void* dst_ptr = DMAHelper::base(cpu_tensor);
 
-    Status status = Status::OK();
+    Status status;
     stream_->ThenMemcpy(dst_ptr, dev_src_ptr, total_bytes);
     // TODO(hpucha): Make this asynchronous.
-    if (!stream_->BlockHostUntilDone()) {
+    Status block_status = stream_->BlockHostUntilDone();
+    if (!block_status.ok()) {
       status = xla::InternalError(
-          "Failed to complete data transfer on stream %p", stream_);
+          "Failed to complete data transfer on stream %p: %s", stream_,
+          block_status.error_message().c_str());
     }
 
     done(status);
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 79c4befd3671e1da3fd67e644eb733d2503f9a8b..4f458ecff8f6523a23ca59e0cecb485a7988efad 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -279,6 +279,19 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "image_ops_test",
+    size = "small",
+    srcs = ["image_ops_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:image_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "lrn_ops_test",
     size = "medium",
@@ -367,7 +380,15 @@ tf_xla_py_test(
     size = "small",
     srcs = ["random_ops_test.py"],
     # TODO(b/31361304): enable RNG ops on GPU when parallelized.
-    disabled_backends = ["gpu"],
+    disabled_backends = [
+        "gpu",
+        "cpu",
+    ],
+    tags = [
+        "manual",
+        "no_oss",
+        "notap",
+    ],
     deps = [
         ":xla_test",
         "//tensorflow/python:framework_for_generated_wrappers",
@@ -416,6 +437,20 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "scan_ops_test",
+    size = "small",
+    srcs = ["scan_ops_test.py"],
+    tags = ["optonly"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "segment_reduction_ops_test",
     size = "medium",
@@ -457,6 +492,19 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "stateless_random_ops_test",
+    size = "small",
+    srcs = ["stateless_random_ops_test.py"],
+    tags = ["optonly"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/contrib/stateless",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "tensor_array_ops_test",
     size = "small",
diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index 654dc15e86b21c7742d49281d53c1a75e6a45d3b..65706b35d616eb4dce94f0a7056a1604a97ff4c1 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -94,14 +94,12 @@ class BinaryOpsTest(XLATestCase):
           dtype(4),
           expected=np.array([[16], [81]], dtype=dtype))
 
-      atan2_supported = self.device == "XLA_GPU"
-      if atan2_supported:
-        self._testBinary(
-            math_ops.atan2,
-            np.array([0, np.sqrt(2), 1, np.sqrt(2), 0], dtype),
-            np.array([1, np.sqrt(2), 0, -np.sqrt(2), -1], dtype),
-            expected=np.array(
-                [0, np.pi / 4, np.pi / 2, np.pi * 3 / 4, np.pi], dtype=dtype))
+      self._testBinary(
+          math_ops.atan2,
+          np.array([0, np.sqrt(2), 1, np.sqrt(2), 0], dtype),
+          np.array([1, np.sqrt(2), 0, -np.sqrt(2), -1], dtype),
+          expected=np.array(
+              [0, np.pi / 4, np.pi / 2, np.pi * 3 / 4, np.pi], dtype=dtype))
 
       self._testBinary(
           gen_math_ops._reciprocal_grad,
@@ -388,30 +386,28 @@ class BinaryOpsTest(XLATestCase):
               ],
               dtype=dtype))
 
-      atan2_supported = self.device == "XLA_GPU"
-      if atan2_supported:
-        self._testBinary(
-            math_ops.pow,
-            dtype(3 + 2j),
-            dtype(4 - 5j),
-            expected=np.power(dtype(3 + 2j), dtype(4 - 5j)))
-        self._testBinary(  # empty rhs
-            math_ops.pow,
-            np.array([1 + 2j, 2 - 3j], dtype=dtype),
-            np.zeros(shape=[0, 2], dtype=dtype),
-            expected=np.zeros(shape=[0, 2], dtype=dtype))
-        self._testBinary(  # to zero power
-            math_ops.pow,
-            np.array([1 + 2j, 2 - 3j], dtype=dtype),
-            np.zeros(shape=[1, 2], dtype=dtype),
-            expected=np.ones(shape=[1, 2], dtype=dtype))
-        lhs = np.array([1 - 2j, 4 + 3j, 2 - 3j, 3, 2j, 1, 4], dtype=dtype)
-        rhs = np.array([2, 3j, 3 + 4j, 2 + 3j, 3 - 2j, 2, 3 + 3j], dtype=dtype)
-        scalar = dtype(2 + 2j)
-        self._testBinary(math_ops.pow, lhs, rhs, expected=np.power(lhs, rhs))
-        self._testBinary(
-            math_ops.pow, scalar, rhs, expected=np.power(scalar, rhs))
-        self._testBinary(math_ops.pow, lhs, scalar, np.power(lhs, scalar))
+      self._testBinary(
+          math_ops.pow,
+          dtype(3 + 2j),
+          dtype(4 - 5j),
+          expected=np.power(dtype(3 + 2j), dtype(4 - 5j)))
+      self._testBinary(  # empty rhs
+          math_ops.pow,
+          np.array([1 + 2j, 2 - 3j], dtype=dtype),
+          np.zeros(shape=[0, 2], dtype=dtype),
+          expected=np.zeros(shape=[0, 2], dtype=dtype))
+      self._testBinary(  # to zero power
+          math_ops.pow,
+          np.array([1 + 2j, 2 - 3j], dtype=dtype),
+          np.zeros(shape=[1, 2], dtype=dtype),
+          expected=np.ones(shape=[1, 2], dtype=dtype))
+      lhs = np.array([1 - 2j, 4 + 3j, 2 - 3j, 3, 2j, 1, 4], dtype=dtype)
+      rhs = np.array([2, 3j, 3 + 4j, 2 + 3j, 3 - 2j, 2, 3 + 3j], dtype=dtype)
+      scalar = dtype(2 + 2j)
+      self._testBinary(math_ops.pow, lhs, rhs, expected=np.power(lhs, rhs))
+      self._testBinary(
+          math_ops.pow, scalar, rhs, expected=np.power(scalar, rhs))
+      self._testBinary(math_ops.pow, lhs, scalar, np.power(lhs, scalar))
 
       lhs = np.array([4 + 2j, -3 - 1j, 2j, 1], dtype=dtype)
       rhs = np.array([5, -6j, 7 - 3j, -8j], dtype=dtype)
@@ -421,9 +417,8 @@ class BinaryOpsTest(XLATestCase):
       self._testBinary(
           gen_math_ops._sigmoid_grad, lhs, rhs, expected=rhs * lhs * (1 - lhs))
 
-      if atan2_supported:
-        self._testBinary(
-            gen_math_ops._rsqrt_grad, lhs, rhs, expected=lhs**3 * rhs / -2)
+      self._testBinary(
+          gen_math_ops._rsqrt_grad, lhs, rhs, expected=lhs**3 * rhs / -2)
 
       self._testBinary(
           gen_math_ops._sqrt_grad, lhs, rhs, expected=rhs / (2 * lhs))
@@ -547,7 +542,7 @@ class BinaryOpsTest(XLATestCase):
       self._testDivision(dtype)
 
   def testFloatDivision(self):
-    for dtype in self.float_types + self.complex_types:
+    for dtype in self.float_types | self.complex_types:
       self._testDivision(dtype)
 
   def _testRemainder(self, dtype):
diff --git a/tensorflow/compiler/tests/categorical_op_test.py b/tensorflow/compiler/tests/categorical_op_test.py
index 5e06f9a72401935b9681c35a164b51f50a8538ae..035cdea1786d39f3d21bb63be5c8ccffe1608bdf 100644
--- a/tensorflow/compiler/tests/categorical_op_test.py
+++ b/tensorflow/compiler/tests/categorical_op_test.py
@@ -35,6 +35,9 @@ from tensorflow.python.platform import googletest
 class CategoricalTest(XLATestCase):
   """Test cases for random-number generating operators."""
 
+  def output_dtypes(self):
+    return set(self.int_types).intersection([np.int32, np.int64])
+
   def _chi2(self, expected, actual):
     """Returns Chi2 GOF statistic."""
     actual = np.asarray(actual)
@@ -55,7 +58,8 @@ class CategoricalTest(XLATestCase):
     """
     with self.test_session() as sess, self.test_scope():
       random_seed.set_random_seed(1618)
-      op = random_ops.multinomial(logits, num_samples)
+      op = random_ops.multinomial(logits, num_samples,
+                                  output_dtype=dtypes.int32)
       d = sess.run(op)
 
     batch_size, num_classes = logits.shape
@@ -73,11 +77,11 @@ class CategoricalTest(XLATestCase):
 
     return freqs_mat
 
-  def _testRngIsNotConstant(self, rng, dtype):
+  def _testRngIsNotConstant(self, rng, dtype, output_dtype):
     # Tests that 'rng' does not always return the same value.
     with self.test_session() as sess:
       with self.test_scope():
-        x = rng(dtype)
+        x = rng(dtype, output_dtype)
 
       # The random-number generator, if working correctly, should produce the
       # same output multiple times with low probability.
@@ -92,21 +96,25 @@ class CategoricalTest(XLATestCase):
                       (not np.array_equal(y, w)))
 
   def testCategoricalIsNotConstant(self):
-    def rng(unused_dtype):
-      return random_ops.multinomial([[1., 1., 1.]], 10)
+    def rng(dtype, output_dtype):
+      return random_ops.multinomial(np.array([[1., 1., 1.]], dtype=dtype), 10,
+                                    output_dtype=output_dtype)
 
-    dtype = dtypes.float32
-    self._testRngIsNotConstant(rng, dtype)
+    dtype = np.float32
+    for output_dtype in self.output_dtypes():
+      self._testRngIsNotConstant(rng, dtype, output_dtype)
 
   def testCategoricalIsInRange(self):
-    for dtype in [dtypes.float32, dtypes.float64]:
-      with self.test_session() as sess:
-        with self.test_scope():
-          x = random_ops.multinomial(
-              array_ops.ones(shape=[1, 20], dtype=dtype), 1000)
-        y = sess.run(x)
-        self.assertTrue((y >= 0).sum() == 1000)
-        self.assertTrue((y < 20).sum() == 1000)
+    for dtype in self.float_types:
+      for output_dtype in self.output_dtypes():
+        with self.test_session() as sess:
+          with self.test_scope():
+            x = random_ops.multinomial(
+                array_ops.ones(shape=[1, 20], dtype=dtype), 1000,
+                output_dtype=output_dtype)
+          y = sess.run(x)
+          self.assertTrue((y >= 0).sum() == 1000)
+          self.assertTrue((y < 20).sum() == 1000)
 
   def testSamplingCorrectness(self):
     np.random.seed(1618)  # Make it reproducible.
diff --git a/tensorflow/compiler/tests/ftrl_test.py b/tensorflow/compiler/tests/ftrl_test.py
index 7e3871312c86530b6d3cb0bbacc16c25d3469832..f9db4cf2017c0b4b6dc0cfeeda6dca7bb9d14f19 100644
--- a/tensorflow/compiler/tests/ftrl_test.py
+++ b/tensorflow/compiler/tests/ftrl_test.py
@@ -161,9 +161,9 @@ class FtrlOptimizerTest(XLATestCase):
           ftrl_update.run()
 
         # Validate updated params
-        self.assertAllClose(
+        self.assertAllCloseAccordingToType(
             np.array([-2.55607247, -3.98729396]), var0.eval(), 1e-5, 1e-5)
-        self.assertAllClose(
+        self.assertAllCloseAccordingToType(
             np.array([-0.28232238, -0.56096673]), var1.eval(), 1e-5, 1e-5)
 
   def testFtrlWithL1(self):
@@ -189,10 +189,10 @@ class FtrlOptimizerTest(XLATestCase):
           ftrl_update.run()
 
         # Validate updated params
-        self.assertAllClose(np.array([-7.66718769, -10.91273689]), var0.eval(),
-                            rtol=1e-4)
-        self.assertAllClose(np.array([-0.93460727, -1.86147261]), var1.eval(),
-                            rtol=1e-4)
+        self.assertAllCloseAccordingToType(
+            np.array([-7.66718769, -10.91273689]), var0.eval(), rtol=1e-4)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.93460727, -1.86147261]), var1.eval(), rtol=1e-4)
 
   def testFtrlWithL1_L2(self):
     for dtype in self.float_types:
@@ -217,10 +217,10 @@ class FtrlOptimizerTest(XLATestCase):
           ftrl_update.run()
 
         # Validate updated params
-        self.assertAllClose(np.array([-0.24059935, -0.46829352]), var0.eval(),
-                            rtol=1e-5)
-        self.assertAllClose(np.array([-0.02406147, -0.04830509]), var1.eval(),
-                            rtol=1e-5)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.24059935, -0.46829352]), var0.eval(), rtol=1e-5)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.02406147, -0.04830509]), var1.eval(), rtol=1e-5)
 
   def testFtrlWithL1_L2_L2Shrinkage(self):
     """Test the new FTRL op with support for l2 shrinkage.
@@ -244,18 +244,18 @@ class FtrlOptimizerTest(XLATestCase):
         ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([4.0, 3.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
+        self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval())
 
         # Run 10 steps FTRL
         for _ in range(10):
           ftrl_update.run()
 
         # Validate updated params
-        self.assertAllClose(np.array([-0.21931979, -0.40642974]), var0.eval(),
-                            rtol=1e-4)
-        self.assertAllClose(np.array([-0.0282721, -0.07188385]), var1.eval(),
-                            rtol=1e-4)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.21931979, -0.40642974]), var0.eval(), rtol=1e-4)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.0282721, -0.07188385]), var1.eval(), rtol=1e-4)
 
   # When variables are initialized with Zero, FTRL-Proximal has two properties:
   # 1. Without L1&L2 but with fixed learning rate, FTRL-Proximal is identical
@@ -272,8 +272,8 @@ class FtrlOptimizerTest(XLATestCase):
       with self.test_session(), self.test_scope():
         val2, val3 = self.equivAdagradTest_AdagradPart(steps, dtype)
 
-    self.assertAllClose(val0, val2, rtol=1e-4)
-    self.assertAllClose(val1, val3, rtol=1e-4)
+    self.assertAllCloseAccordingToType(val0, val2, rtol=1e-4)
+    self.assertAllCloseAccordingToType(val1, val3, rtol=1e-4)
 
   def testEquivGradientDescentwithoutRegularization(self):
     steps = 5
@@ -284,8 +284,8 @@ class FtrlOptimizerTest(XLATestCase):
         val2, val3 = self.equivGradientDescentTest_GradientDescentPart(
             steps, dtype)
 
-    self.assertAllClose(val0, val2, rtol=1e-5)
-    self.assertAllClose(val1, val3, rtol=1e-5)
+    self.assertAllCloseAccordingToType(val0, val2, rtol=1e-5)
+    self.assertAllCloseAccordingToType(val1, val3, rtol=1e-5)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tests/function_test.py b/tensorflow/compiler/tests/function_test.py
index cbe2888696c87c6c2f50c3de71e8531977ea395a..11d8a99ffe1a136a54b16e20f1792062203f7969 100644
--- a/tensorflow/compiler/tests/function_test.py
+++ b/tensorflow/compiler/tests/function_test.py
@@ -24,10 +24,12 @@ from tensorflow.compiler.tests.xla_test import XLATestCase
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import googletest
 
 
+@test_util.with_c_api
 class FunctionTest(XLATestCase):
 
   def testFunction(self):
diff --git a/tensorflow/compiler/tests/fused_batchnorm_test.py b/tensorflow/compiler/tests/fused_batchnorm_test.py
index a773b5a94742062511bc8bdc6a202b513ce98db3..a80d69fa5f5099b8a8b67df0da9c92b957e9d194 100644
--- a/tensorflow/compiler/tests/fused_batchnorm_test.py
+++ b/tensorflow/compiler/tests/fused_batchnorm_test.py
@@ -76,7 +76,8 @@ class FusedBatchNormTest(XLATestCase):
       # To avoid constant folding
       t_val = array_ops.placeholder(np.float32, shape=x_shape, name="x")
       scale = array_ops.placeholder(np.float32, shape=scale_shape, name="scale")
-      offset = array_ops.placeholder(np.float32, shape=scale_shape, name="offset")
+      offset = array_ops.placeholder(
+          np.float32, shape=scale_shape, name="offset")
       epsilon = 0.001
       y_ref, mean_ref, var_ref = self._reference_training(
           x_val, scale_val, offset_val, epsilon, data_format)
@@ -112,7 +113,8 @@ class FusedBatchNormTest(XLATestCase):
       # To avoid constant folding
       t_val = array_ops.placeholder(np.float32, shape=x_shape, name="x")
       scale = array_ops.placeholder(np.float32, shape=scale_shape, name="scale")
-      offset = array_ops.placeholder(np.float32, shape=scale_shape, name="offset")
+      offset = array_ops.placeholder(
+          np.float32, shape=scale_shape, name="offset")
       epsilon = 0.001
       y, mean, var = nn.fused_batch_norm(
           t_val,
@@ -153,7 +155,7 @@ class FusedBatchNormTest(XLATestCase):
   def testLearningWithGradientChecker(self):
     self._testLearning(True)
 
-  def testGradient(self):
+  def testGradientTraining(self):
     # TODO(b/64270657): Use gradient_checker here in addition to comparing with
     # this reference implementation.
     channel = 3
@@ -173,7 +175,7 @@ class FusedBatchNormTest(XLATestCase):
       var = array_ops.placeholder(np.float32, shape=scale_shape, name="var")
       scale = array_ops.placeholder(np.float32, shape=scale_shape, name="scale")
       grad_x, grad_scale, grad_offset, _, _ = gen_nn_ops.fused_batch_norm_grad(
-          grad, x, scale, mean, var, data_format="NHWC")
+          grad, x, scale, mean, var, data_format="NHWC", is_training=True)
 
       grad_x_val, grad_scale_val, grad_offset_val = sess.run(
           [grad_x, grad_scale, grad_offset], {
@@ -191,6 +193,53 @@ class FusedBatchNormTest(XLATestCase):
       self.assertAllClose(grad_scale_val, grad_scale_ref, atol=1e-2)
       self.assertAllClose(grad_offset_val, grad_offset_ref, atol=1e-3)
 
+  def testGradientInference(self):
+    # TODO(b/64270657): Use gradient_checker here in addition to comparing with
+    # this reference implementation.
+    channel = 3
+    x_shape = [2, 2, 6, channel]
+    scale_shape = [channel]
+    grad_val = np.random.random_sample(x_shape).astype(np.float32)
+    x_val = np.random.random_sample(x_shape).astype(np.float32)
+    scale_val = np.random.random_sample(scale_shape).astype(np.float32)
+    mean_val = np.random.random_sample(scale_shape).astype(np.float32)
+    var_val = np.random.random_sample(scale_shape).astype(np.float32)
+
+    with self.test_session() as sess, self.test_scope():
+      grad = array_ops.placeholder(np.float32, shape=x_shape, name="grad")
+      x = array_ops.placeholder(np.float32, shape=x_shape, name="x")
+      mean = array_ops.placeholder(np.float32, shape=scale_shape, name="mean")
+      var = array_ops.placeholder(np.float32, shape=scale_shape, name="var")
+      scale = array_ops.placeholder(np.float32, shape=scale_shape, name="scale")
+      with self.test_scope():
+        out = gen_nn_ops.fused_batch_norm_grad(
+            grad, x, scale, mean, var, data_format="NHWC", is_training=False)
+        grad_x, grad_scale, grad_offset, _, _ = out
+
+      ref_x, ref_scale, ref_offset, _, _ = gen_nn_ops.fused_batch_norm_grad(
+          grad, x, scale, mean, var, data_format="NHWC", is_training=False)
+
+      grad_x_val, grad_scale_val, grad_offset_val, = sess.run(
+          [grad_x, grad_scale, grad_offset], {
+              grad: grad_val,
+              x: x_val,
+              mean: mean_val,
+              var: var_val,
+              scale: scale_val
+          })
+      grad_x_ref, grad_scale_ref, grad_offset_ref, = sess.run(
+          [ref_x, ref_scale, ref_offset], {
+              grad: grad_val,
+              x: x_val,
+              mean: mean_val,
+              var: var_val,
+              scale: scale_val
+          })
+
+      self.assertAllClose(grad_x_val, grad_x_ref, atol=1e-2)
+      self.assertAllClose(grad_scale_val, grad_scale_ref, atol=1e-2)
+      self.assertAllClose(grad_offset_val, grad_offset_ref, atol=1e-3)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a04f376ebf6092fd9b6e879796454b1a5c648c96
--- /dev/null
+++ b/tensorflow/compiler/tests/image_ops_test.py
@@ -0,0 +1,142 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for image ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_image_ops
+from tensorflow.python.platform import test
+
+
+class ResizeBilinearTest(XLATestCase):
+
+  def _assertForwardOpMatchesExpected(self,
+                                      image_np,
+                                      target_shape,
+                                      expected=None):
+    if expected is None:
+      self.fail("expected must be specified")
+    with self.test_session() as sess, self.test_scope():
+      image = array_ops.placeholder(image_np.dtype)
+      resized = gen_image_ops.resize_bilinear(
+          image, target_shape, align_corners=True)
+      out = sess.run(resized, {image: image_np[np.newaxis, :, :, np.newaxis]})
+      self.assertAllClose(expected[np.newaxis, :, :, np.newaxis], out)
+
+  def _assertBackwardOpMatchesExpected(self,
+                                       grads_np,
+                                       input_shape=None,
+                                       dtype=None,
+                                       expected=None):
+    if input_shape is None:
+      self.fail("input_shape must be specified")
+    if expected is None:
+      self.fail("expected must be specified")
+    with self.test_session() as sess, self.test_scope():
+      dtype = dtype or np.float32
+      grads = array_ops.placeholder(np.float32)
+      resized = gen_image_ops._resize_bilinear_grad(
+          grads,
+          np.zeros([1, input_shape[0], input_shape[1], 1], dtype=dtype),
+          align_corners=True)
+      out = sess.run(resized, {grads: grads_np[np.newaxis, :, :, np.newaxis]})
+      self.assertAllClose(expected[np.newaxis, :, :, np.newaxis], out)
+
+  def testAlignCorners1x2To3x2(self):
+    for dtype in self.float_types:
+      self._assertForwardOpMatchesExpected(
+          np.array([[1, 2]], dtype=dtype), [3, 3],
+          expected=np.array(
+              [[1, 1.5, 2], [1, 1.5, 2], [1, 1.5, 2]], dtype=np.float32))
+
+  def testAlignCorners1x2To3x2Grad(self):
+    for dtype in self.float_types:
+      self._assertBackwardOpMatchesExpected(
+          np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32),
+          input_shape=[1, 2],
+          dtype=dtype,
+          expected=np.array([[9, 12]], dtype=np.float32))
+
+  def testAlignCorners2x2To1x1(self):
+    for dtype in self.float_types:
+      self._assertForwardOpMatchesExpected(
+          np.array([[1, 2], [3, 4]], dtype=dtype), [1, 1],
+          expected=np.array([[1]], dtype=np.float32))
+
+  def testAlignCorners2x2To1x1Grad(self):
+    for dtype in self.float_types:
+      self._assertBackwardOpMatchesExpected(
+          np.array([[7]], dtype=np.float32),
+          input_shape=[2, 2],
+          dtype=dtype,
+          expected=np.array([[7, 0], [0, 0]], dtype=np.float32))
+
+  def testAlignCorners2x2To3x3(self):
+    for dtype in self.float_types:
+      self._assertForwardOpMatchesExpected(
+          np.array([[1, 2], [3, 4]], dtype=dtype), [3, 3],
+          expected=np.array(
+              [[1, 1.5, 2], [2, 2.5, 3], [3, 3.5, 4]], dtype=np.float32))
+
+  def testAlignCorners2x2To3x3Grad(self):
+    self._assertBackwardOpMatchesExpected(
+        np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32),
+        input_shape=[2, 2],
+        expected=np.array([[5.25, 8.25], [14.25, 17.25]], dtype=np.float32))
+
+  def testAlignCorners3x3To2x2(self):
+    for dtype in self.float_types:
+      self._assertForwardOpMatchesExpected(
+          np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=dtype), [2, 2],
+          expected=np.array([[1, 3], [7, 9]], dtype=np.float32))
+
+  def testAlignCorners3x3To2x2Grad(self):
+    for dtype in self.float_types:
+      self._assertBackwardOpMatchesExpected(
+          np.array([[7, 13], [22, 4]], dtype=np.float32),
+          input_shape=[3, 3],
+          dtype=dtype,
+          expected=np.array(
+              [[7, 0, 13], [0, 0, 0], [22, 0, 4]], dtype=np.float32))
+
+  def testAlignCorners4x4To3x3(self):
+    for dtype in self.float_types:
+      self._assertForwardOpMatchesExpected(
+          np.array(
+              [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]],
+              dtype=dtype), [3, 3],
+          expected=np.array(
+              [[1, 2.5, 4], [7, 8.5, 10], [13, 14.5, 16]], dtype=np.float32))
+
+  def testAlignCorners4x4To3x3Grad(self):
+    for dtype in self.float_types:
+      self._assertBackwardOpMatchesExpected(
+          np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32),
+          input_shape=[4, 4],
+          dtype=dtype,
+          expected=np.array(
+              [[1, 1, 1, 3], [2, 1.25, 1.25, 3], [2, 1.25, 1.25, 3],
+               [7, 4, 4, 9]],
+              dtype=np.float32))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/momentum_test.py b/tensorflow/compiler/tests/momentum_test.py
index c00e3035a0982b2b2e59eb6f53499918515ae71d..af9394e7d7dc9cf7dd009420ff9c845aec8785bd 100644
--- a/tensorflow/compiler/tests/momentum_test.py
+++ b/tensorflow/compiler/tests/momentum_test.py
@@ -96,28 +96,27 @@ class MomentumOptimizerTest(XLATestCase):
   def testNesterovMomentum(self):
     for dtype in self.float_types:
       with self.test_session(), self.test_scope():
-        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
-        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
-        var0_np = np.array([1.0, 2.0], dtype=dtype)
-        var1_np = np.array([3.0, 4.0], dtype=dtype)
+        var0 = resource_variable_ops.ResourceVariable([0.1, 0.2], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([0.3, 0.4], dtype=dtype)
+        var0_np = np.array([0.1, 0.2], dtype=dtype)
+        var1_np = np.array([0.3, 0.4], dtype=dtype)
         accum0_np = np.array([0.0, 0.0], dtype=dtype)
         accum1_np = np.array([0.0, 0.0], dtype=dtype)
-        cost = 5 * var0 * var0 + 3 * var1
+        cost = 0.4 * var0 * var0 + 0.9 * var1
         global_step = resource_variable_ops.ResourceVariable(
             array_ops.zeros([], dtypes.int32), name="global_step")
         mom_op = momentum_lib.MomentumOptimizer(
-            learning_rate=2.0, momentum=0.9, use_nesterov=True)
+            learning_rate=0.1, momentum=0.9, use_nesterov=True)
         opt_op = mom_op.minimize(cost, global_step, [var0, var1])
         variables.global_variables_initializer().run()
         for _ in range(1, 5):
           opt_op.run()
           var0_np, accum0_np = self._update_nesterov_momentum_numpy(
-              var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
-          var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
-                                                                    accum1_np,
-                                                                    3, 2.0, 0.9)
-          self.assertAllClose(var0_np, var0.eval())
-          self.assertAllClose(var1_np, var1.eval())
+              var0_np, accum0_np, var0_np * 0.8, 0.1, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(
+              var1_np, accum1_np, 0.9, 0.1, 0.9)
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
 
   def testTensorLearningRateAndMomentum(self):
     for dtype in self.float_types:
diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc
index 6a8c3bcd55a6e454a19b6249cf4eb48739c8657f..798daaadbc5be50ef9cf7e1205f6d5a0bde59640 100644
--- a/tensorflow/compiler/tests/randomized_tests.cc
+++ b/tensorflow/compiler/tests/randomized_tests.cc
@@ -2460,6 +2460,36 @@ TEST_F(OpTest, Reshape) {
   });
 }
 
+TEST_F(OpTest, ResizeBilinear) {
+  Repeatedly([this]() {
+    std::vector<int64> in_dims = RandomDims(4, 4);
+    std::vector<int64> out_dims = RandomDims(2, 2);
+
+    return ExpectTfAndXlaOutputsAreClose(
+        OpTestBuilder("ResizeBilinear")
+            .RandomInput(DT_FLOAT, in_dims)
+            .Input(test::AsTensor<int32>(
+                std::vector<int32>(out_dims.begin(), out_dims.end())))
+            .Attr("T", DT_FLOAT)
+            .Attr("align_corners", true));
+  });
+}
+
+TEST_F(OpTest, ResizeBilinearGrad) {
+  Repeatedly([this]() {
+    std::vector<int64> in_dims = RandomDims(4, 4);
+    std::vector<int64> out_dims = RandomDims(2, 2);
+
+    return ExpectTfAndXlaOutputsAreClose(
+        OpTestBuilder("ResizeBilinearGrad")
+            .RandomInput(DT_FLOAT, in_dims)
+            .RandomInput(DT_FLOAT,
+                         {in_dims[0], out_dims[0], out_dims[1], in_dims[3]})
+            .Attr("T", DT_FLOAT)
+            .Attr("align_corners", true));
+  });
+}
+
 TEST_F(OpTest, Reverse) {
   Repeatedly([this]() {
     std::vector<int64> dims = RandomDims(1);
diff --git a/tensorflow/compiler/tests/scan_ops_test.py b/tensorflow/compiler/tests/scan_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3260e63b23226d736a7ddc0f21a94a8c791e0442
--- /dev/null
+++ b/tensorflow/compiler/tests/scan_ops_test.py
@@ -0,0 +1,229 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for scan ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+def numpy_reverse(x, axis):
+  length = len(x.shape)
+  if axis < 0:
+    axis = length + axis
+
+  ix = [
+      slice(None, None, -1) if i == axis else slice(None) for i in range(length)
+  ]
+  return x[ix]
+
+
+def handle_options(func, x, axis, exclusive, reverse):
+  """Adds tf options to numpy scan ops."""
+  length = len(x.shape)
+  if axis < 0:
+    axis = length + axis
+
+  if reverse:
+    x = numpy_reverse(x, axis)
+
+  if exclusive:
+    ix_head = [slice(0, 1) if i == axis else slice(None) for i in range(length)]
+    ix_init = [
+        slice(0, -1) if i == axis else slice(None) for i in range(length)
+    ]
+    if func == np.cumsum:
+      init = np.zeros_like(x[ix_head])
+    elif func == np.cumprod:
+      init = np.ones_like(x[ix_head])
+    else:
+      raise ValueError("Unknown scan function.")
+    x = np.concatenate([init, func(x[ix_init], axis)], axis=axis)
+  else:
+    x = func(x, axis=axis)
+
+  if reverse:
+    x = numpy_reverse(x, axis)
+  return x
+
+
+class CumsumTest(XLATestCase):
+
+  valid_dtypes = [np.float32]
+
+  def axis_dtypes(self):
+    return set(self.int_types).intersection([np.int32, np.int64])
+
+  def _compare(self, x, axis, exclusive, reverse):
+    np_out = handle_options(np.cumsum, x, axis, exclusive, reverse)
+    with self.test_session(), self.test_scope():
+      p = array_ops.placeholder(x.dtype)
+      tf_out = math_ops.cumsum(p, axis, exclusive, reverse).eval(
+          feed_dict={p: x})
+
+    self.assertAllClose(np_out, tf_out)
+
+  def _compareAll(self, x, axis):
+    for exclusive in [True, False]:
+      for reverse in [True, False]:
+        self._compare(x, axis, exclusive, reverse)
+
+  def testEmpty(self):
+    for dtype in self.valid_dtypes:
+      x = np.zeros([0]).astype(dtype)
+      for axis in (-1, 0):
+        self._compareAll(x, axis)
+
+  def testAxisType(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 6).reshape([5]).astype(dtype)
+      for axis_dtype in self.axis_dtypes():
+        with self.test_session(), self.test_scope():
+          p = array_ops.placeholder(x.dtype)
+          axis = constant_op.constant(0, axis_dtype)
+          math_ops.cumsum(p, axis).eval(feed_dict={p: x})
+
+  def test1D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 6).reshape([5]).astype(dtype)
+      for axis in (-1, 0):
+        self._compareAll(x, axis)
+
+  def test2D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(0, 10).reshape([2, 5]).astype(dtype)
+      for axis in (-2, -1, 0, 1):
+        self._compareAll(x, axis)
+
+  def test3D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(0, 20).reshape([2, 2, 5]).astype(dtype)
+      for axis in (-3, -2, -1, 0, 1, 2):
+        self._compareAll(x, axis)
+
+  def test6D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 145).reshape([2, 2, 3, 3, 2, 2]).astype(dtype)
+      for axis in range(-6, 6, 3):
+        self._compareAll(x, axis)
+
+  def testInvalidAxis(self):
+    x = np.arange(0, 10).reshape([2, 5]).astype(np.float32)
+    with self.test_session(), self.test_scope():
+      input_tensor = ops.convert_to_tensor(x)
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          lambda e: "Expected scan axis in the range [-2, 2)" in str(e)):
+        math_ops.cumsum(input_tensor, -3).eval()
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          lambda e: "Expected scan axis in the range [-2, 2)" in str(e)):
+        math_ops.cumsum(input_tensor, 2).eval()
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          lambda e: "axis must be a scalar" in str(e)):
+        math_ops.cumsum(input_tensor, [0]).eval()
+
+
+class CumprodTest(XLATestCase):
+
+  valid_dtypes = [np.float32]
+
+  def axis_dtypes(self):
+    return set(self.int_types).intersection([np.int32, np.int64])
+
+  def _compare(self, x, axis, exclusive, reverse):
+    np_out = handle_options(np.cumprod, x, axis, exclusive, reverse)
+    with self.test_session(), self.test_scope():
+      p = array_ops.placeholder(x.dtype)
+      prod = math_ops.cumprod(p, axis, exclusive, reverse)
+      tf_out = prod.eval(feed_dict={p: x})
+
+    self.assertAllClose(np_out, tf_out)
+
+  def _compareAll(self, x, axis):
+    for exclusive in [True, False]:
+      for reverse in [True, False]:
+        self._compare(x, axis, exclusive, reverse)
+
+  def testEmpty(self):
+    for dtype in self.valid_dtypes:
+      x = np.zeros([0]).astype(dtype)
+      for axis in (-1, 0):
+        self._compareAll(x, axis)
+
+  def testAxisType(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 6).reshape([5]).astype(dtype)
+      for axis_dtype in self.axis_dtypes():
+        with self.test_session(), self.test_scope():
+          p = array_ops.placeholder(x.dtype)
+          axis = constant_op.constant(0, axis_dtype)
+          math_ops.cumprod(x, axis).eval(feed_dict={p: x})
+
+  def test1D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 6).reshape([5]).astype(dtype)
+      for axis in (-1, 0):
+        self._compareAll(x, axis)
+
+  def test2D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 11).reshape([2, 5]).astype(dtype)
+      for axis in (-2, -1, 0, 1):
+        self._compareAll(x, axis)
+
+  def test3D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 21).reshape([2, 2, 5]).astype(dtype)
+      for axis in (-3, -2, -1, 0, 1, 2):
+        self._compareAll(x, axis)
+
+  def test6D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 145).reshape([2, 2, 3, 3, 2, 2]).astype(dtype)
+      for axis in range(-6, 6, 3):
+        self._compareAll(x, axis)
+
+  def testInvalidAxis(self):
+    x = np.arange(0, 10).reshape([2, 5]).astype(np.float32)
+    with self.test_session(), self.test_scope():
+      input_tensor = ops.convert_to_tensor(x)
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          lambda e: "Expected scan axis in the range [-2, 2)" in str(e)):
+        math_ops.cumprod(input_tensor, -3).eval()
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          lambda e: "Expected scan axis in the range [-2, 2)" in str(e)):
+        math_ops.cumprod(input_tensor, 2).eval()
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          lambda e: "axis must be a scalar" in str(e)):
+        math_ops.cumprod(input_tensor, [0]).eval()
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/stateless_random_ops_test.py b/tensorflow/compiler/tests/stateless_random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4336ebdbd184a081619f0a6951dd4514735c6eb6
--- /dev/null
+++ b/tensorflow/compiler/tests/stateless_random_ops_test.py
@@ -0,0 +1,118 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for stateless random-number generation ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.contrib import stateless
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class StatelessRandomOpsTest(XLATestCase):
+  """Test cases for stateless random-number generator operators."""
+
+  def _random_types(self):
+    return [dtypes.float32]
+
+  def testDeterminism(self):
+    # Stateless values should be equal iff the seeds are equal (roughly)
+    with self.test_session(), self.test_scope():
+      seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
+      seeds = [(x, y) for x in range(5) for y in range(5)] * 3
+      for stateless_op in [
+          stateless.stateless_random_uniform, stateless.stateless_random_normal
+      ]:
+        for shape in (), (3,), (2, 5):
+          for dtype in self._random_types():
+            pure = stateless_op(shape, seed=seed_t, dtype=dtype)
+            values = [(seed, pure.eval(feed_dict={
+                seed_t: seed
+            })) for seed in seeds]
+            for s0, v0 in values:
+              for s1, v1 in values:
+                self.assertEqual(s0 == s1, np.all(v0 == v1))
+
+  def testRandomUniformIsInRange(self):
+    with self.test_session() as sess, self.test_scope():
+      for dtype in self._random_types():
+        seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
+        x = stateless.stateless_random_uniform(
+            shape=[1000], seed=seed_t, dtype=dtype)
+        y = sess.run(x, {seed_t: [0x12345678, 0xabcdef12]})
+        self.assertTrue(np.all(y >= 0))
+        self.assertTrue(np.all(y < 1))
+
+  def _chi_squared(self, x, bins):
+    """Pearson's Chi-squared test."""
+    x = np.ravel(x)
+    n = len(x)
+    histogram, _ = np.histogram(x, bins=bins, range=(0, 1))
+    expected = n / float(bins)
+    return np.sum(np.square(histogram - expected) / expected)
+
+  def testDistributionOfStatelessRandomUniform(self):
+    """Use Pearson's Chi-squared test to test for uniformity."""
+    with self.test_session() as sess, self.test_scope():
+      for dtype in self._random_types():
+        seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
+        n = 1000
+        x = stateless.stateless_random_uniform(
+            shape=[n], seed=seed_t, dtype=dtype)
+        y = sess.run(x, {seed_t: [565656, 121212]})
+        # Tests that the values are distributed amongst 10 bins with equal
+        # probability. 16.92 is the Chi^2 value for 9 degrees of freedom with
+        # p=0.05. This test is probabilistic and would be flaky if the random
+        # seed were not fixed.
+        self.assertTrue(self._chi_squared(y, 10) < 16.92)
+
+  def _normal_cdf(self, x):
+    """Cumulative distribution function for a standard normal distribution."""
+    return 0.5 + 0.5 * np.vectorize(math.erf)(x / math.sqrt(2))
+
+  def _anderson_darling(self, x):
+    """Anderson-Darling test for a standard normal distribution."""
+    x = np.sort(np.ravel(x))
+    n = len(x)
+    i = np.linspace(1, n, n)
+    z = np.sum((2 * i - 1) * np.log(self._normal_cdf(x)) +
+               (2 * (n - i) + 1) * np.log(1 - self._normal_cdf(x)))
+    return -n - z / n
+
+  def testDistributionOfStatelessRandomNormal(self):
+    """Use Anderson-Darling test to test distribution appears normal."""
+    with self.test_session() as sess, self.test_scope():
+      for dtype in self._random_types():
+        seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
+        n = 1000
+        x = stateless.stateless_random_normal(
+            shape=[n], seed=seed_t, dtype=dtype)
+        y = sess.run(x, {seed_t: [25252, 314159]})
+        # The constant 2.492 is the 5% critical value for the Anderson-Darling
+        # test where the mean and variance are known. This test is probabilistic
+        # so to avoid flakiness the seed is fixed.
+        self.assertTrue(self._anderson_darling(y) < 2.492)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/compiler/tests/tensor_array_ops_test.py b/tensorflow/compiler/tests/tensor_array_ops_test.py
index ac039e01623b954e291760fb9b50ef8eae3da7c1..a62925a1818da00cb0a9e82e1281db20fb38b208 100644
--- a/tensorflow/compiler/tests/tensor_array_ops_test.py
+++ b/tensorflow/compiler/tests/tensor_array_ops_test.py
@@ -330,8 +330,7 @@ class TensorArrayTest(xla_test.XLATestCase):
     # Find two different floating point types, create an array of
     # the first type, but try to read the other type.
     if len(self.float_types) > 1:
-      dtype1 = self.float_types[0]
-      dtype2 = self.float_types[1]
+      dtype1, dtype2 = list(self.float_types)[:2]
       with self.test_session(), self.test_scope():
         ta = tensor_array_ops.TensorArray(
             dtype=dtype1, tensor_array_name="foo", size=3)
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index a9a3f4f97f649260e9863fff8ff05d046bd91947..0a6fe04d3cdd29f1d40d33be1f4319090e7ba3d1 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -33,6 +33,17 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import googletest
 
 
+def nhwc_to_format(x, data_format):
+  """Converts a numpy array from NHWC format to `data_format`."""
+  rank = len(x.shape)
+  if data_format == "NCHW":
+    return np.transpose(x, [0, rank - 1] + list(range(1, rank - 1)))
+  elif data_format == "NHWC":
+    return x
+  else:
+    raise ValueError("Unknown format {}".format(data_format))
+
+
 class UnaryOpsTest(XLATestCase):
   """Test cases for unary operators."""
 
@@ -56,7 +67,7 @@ class UnaryOpsTest(XLATestCase):
         output = op(pinp)
       result = session.run(output, {pinp: inp})
       if equality_test is None:
-        equality_test = self.assertAllClose
+        equality_test = self.assertAllCloseAccordingToType
       equality_test(result, expected, rtol=rtol, atol=atol)
 
   def ListsAreClose(self, result, expected, rtol, atol):
@@ -76,6 +87,12 @@ class UnaryOpsTest(XLATestCase):
           array_ops.diag_part,
           np.arange(36).reshape([2, 3, 2, 3]).astype(dtype),
           np.array([[0, 7, 14], [21, 28, 35]], dtype=dtype))
+      self._assertOpOutputMatchesExpected(
+          array_ops.diag, np.array([[1, 2], [3, 4]], dtype=dtype),
+          np.array(
+              [[[[1, 0], [0, 0]], [[0, 2], [0, 0]]], [[[0, 0], [3, 0]],
+                                                      [[0, 0], [0, 4]]]],
+              dtype=dtype))
 
       self._assertOpOutputMatchesExpected(
           array_ops.identity,
@@ -86,6 +103,21 @@ class UnaryOpsTest(XLATestCase):
           array_ops.matrix_diag,
           np.array([[1, 2], [3, 4]], dtype=dtype),
           np.array([[[1, 0], [0, 2]], [[3, 0], [0, 4]]], dtype=dtype))
+      self._assertOpOutputMatchesExpected(
+          array_ops.matrix_diag, np.array([1, 2, 3, 4], dtype=dtype),
+          np.array(
+              [[1, 0, 0, 0], [0, 2, 0, 0], [0, 0, 3, 0], [0, 0, 0, 4]],
+              dtype=dtype))
+      self._assertOpOutputMatchesExpected(
+          array_ops.matrix_diag,
+          np.array(
+              [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=dtype),
+          np.array(
+              [[[[1, 0, 0], [0, 2, 0], [0, 0, 3]],
+                [[4, 0, 0], [0, 5, 0], [0, 0, 6]]],
+               [[[7, 0, 0], [0, 8, 0], [0, 0, 9]],
+                [[10, 0, 0], [0, 11, 0], [0, 0, 12]]]],
+              dtype=dtype))
       self._assertOpOutputMatchesExpected(
           array_ops.matrix_diag_part,
           np.arange(3 * 2 * 4).reshape([3, 2, 4]).astype(dtype),
@@ -331,26 +363,23 @@ class UnaryOpsTest(XLATestCase):
   def testComplexOps(self):
     for dtype in self.complex_types:
 
-      # TODO(b/65408531): Wider support for log (needs atan2).
-      atan2_supported = self.device == "XLA_GPU"
-      if atan2_supported:
-        self._assertOpOutputMatchesExpected(
-            math_ops.acosh,
-            np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype),
-            expected=np.arccosh(
-                np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype)))
+      self._assertOpOutputMatchesExpected(
+          math_ops.acosh,
+          np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype),
+          expected=np.arccosh(
+              np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype)))
 
-        self._assertOpOutputMatchesExpected(
-            math_ops.asinh,
-            np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype),
-            expected=np.arcsinh(
-                np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype)))
+      self._assertOpOutputMatchesExpected(
+          math_ops.asinh,
+          np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype),
+          expected=np.arcsinh(
+              np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype)))
 
-        self._assertOpOutputMatchesExpected(
-            math_ops.atanh,
-            np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype),
-            expected=np.arctanh(
-                np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype)))
+      self._assertOpOutputMatchesExpected(
+          math_ops.atanh,
+          np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype),
+          expected=np.arctanh(
+              np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype)))
 
       self._assertOpOutputMatchesExpected(
           math_ops.cosh,
@@ -377,11 +406,10 @@ class UnaryOpsTest(XLATestCase):
           np.array([[1, 2j, 2 + 3j]], dtype=dtype),
           expected=1.0 / np.array([[1, 2j, 2 + 3j]], dtype=dtype))
 
-      if atan2_supported:
-        self._assertOpOutputMatchesExpected(
-            math_ops.log,
-            np.array([[5j, 3 - 2j]], dtype=dtype),
-            expected=np.log(np.array([[5j, 3 - 2j]], dtype=dtype)))
+      self._assertOpOutputMatchesExpected(
+          math_ops.log,
+          np.array([[5j, 3 - 2j]], dtype=dtype),
+          expected=np.log(np.array([[5j, 3 - 2j]], dtype=dtype)))
 
       self._assertOpOutputMatchesExpected(
           math_ops.sin,
@@ -395,27 +423,26 @@ class UnaryOpsTest(XLATestCase):
 
       # TODO(b/34703906): improve log1p implementation and make tolerance
       # tighter.
-      if atan2_supported:  # TODO(b/34703906): log support
-        self._assertOpOutputMatchesExpected(
-            math_ops.log1p,
-            np.array([[1e-14, 1e-15j, 0.6 - 0.3j]], dtype=dtype),
-            expected=np.log1p(
-                np.array([[1e-14, 1e-15j, 0.6 - 0.3j]], dtype=dtype)))
+      self._assertOpOutputMatchesExpected(
+          math_ops.log1p,
+          np.array([[1e-14, 1e-15j, 0.6 - 0.3j]], dtype=dtype),
+          expected=np.log1p(
+              np.array([[1e-14, 1e-15j, 0.6 - 0.3j]], dtype=dtype)))
 
-        val = np.array([1, 2j, 2 - 3j, 4 + 5j], dtype=dtype)
-        self._assertOpOutputMatchesExpected(
-            math_ops.rsqrt, val, expected=1 / np.sqrt(val))
+      val = np.array([1, 2j, 2 - 3j, 4 + 5j], dtype=dtype)
+      self._assertOpOutputMatchesExpected(
+          math_ops.rsqrt, val, expected=1 / np.sqrt(val))
 
-        self._assertOpOutputMatchesExpected(
-            math_ops.sigmoid, val, expected=1 / (1 + np.exp(-val)))
+      self._assertOpOutputMatchesExpected(
+          math_ops.sigmoid, val, expected=1 / (1 + np.exp(-val)))
 
-        self._assertOpOutputMatchesExpected(
-            math_ops.sqrt, val, expected=np.sqrt(val))
+      self._assertOpOutputMatchesExpected(
+          math_ops.sqrt, val, expected=np.sqrt(val))
 
-        self._assertOpOutputMatchesExpected(
-            math_ops.tanh,
-            np.array([1, 2j, 2 - 3j, 4 + 5j], dtype=dtype),
-            expected=np.tanh(np.array([1, 2j, 2 - 3j, 4 + 5j], dtype=dtype)))
+      self._assertOpOutputMatchesExpected(
+          math_ops.tanh,
+          np.array([1, 2j, 2 - 3j, 4 + 5j], dtype=dtype),
+          expected=np.tanh(np.array([1, 2j, 2 - 3j, 4 + 5j], dtype=dtype)))
 
       self._assertOpOutputMatchesExpected(
           math_ops.tan,
@@ -448,12 +475,10 @@ class UnaryOpsTest(XLATestCase):
           np.array([[-4j, 3 + 2j], [2, -1j]], dtype=dtype),
           expected=np.array([[1, 1], [1, 1]], dtype=dtype))
 
-      if atan2_supported:  # TODO(b/34703906): atan2 support
-        self._assertOpOutputMatchesExpected(
-            math_ops.angle,
-            np.array([1 + 3j, -4 + 7j, 2.7, -3j], dtype=dtype),
-            expected=np.angle(
-                np.array([1 + 3j, -4 + 7j, 2.7, -3j], dtype=dtype)))
+      self._assertOpOutputMatchesExpected(
+          math_ops.angle,
+          np.array([1 + 3j, -4 + 7j, 2.7, -3j], dtype=dtype),
+          expected=np.angle(np.array([1 + 3j, -4 + 7j, 2.7, -3j], dtype=dtype)))
 
       self._assertOpOutputMatchesExpected(
           math_ops.conj,
@@ -541,7 +566,8 @@ class UnaryOpsTest(XLATestCase):
 
   def testCast(self):
     shapes = [[], [4], [2, 3], [2, 0, 4]]
-    types = [dtypes.bool, dtypes.int32, dtypes.float32] + self.complex_tf_types
+    types = (set([dtypes.bool, dtypes.int32, dtypes.float32]) |
+             self.complex_tf_types)
     for shape in shapes:
       for src_type in types:
         for dst_type in types:
@@ -641,55 +667,88 @@ class UnaryOpsTest(XLATestCase):
         equality_test=self.ListsAreClose)
 
   def testDepthToSpace(self):
+    def make_op(data_format):
+      def op(x):
+        return array_ops.depth_to_space(x, block_size=2,
+                                        data_format=data_format)
+      return op
+
     for dtype in self.numeric_types:
-      self._assertOpOutputMatchesExpected(
-          lambda x: array_ops.depth_to_space(x, block_size=2),
-          np.array([[[[1, 2, 3, 4]]]], dtype=dtype),
-          expected=np.array([[[[1], [2]],
-                              [[3], [4]]]], dtype=dtype))
+      for data_format in ["NCHW", "NHWC"]:
+        self._assertOpOutputMatchesExpected(
+            make_op(data_format),
+            nhwc_to_format(np.array([[[[1, 2, 3, 4]]]], dtype=dtype),
+                           data_format),
+            expected=nhwc_to_format(np.array([[[[1], [2]],
+                                               [[3], [4]]]], dtype=dtype),
+                                    data_format))
 
-      self._assertOpOutputMatchesExpected(
-          lambda x: array_ops.depth_to_space(x, block_size=2),
-          np.array([[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]], dtype=dtype),
-          expected=np.array([[[[1, 2, 3], [4, 5, 6]],
-                              [[7, 8, 9], [10, 11, 12]]]], dtype=dtype))
+        self._assertOpOutputMatchesExpected(
+            make_op(data_format),
+            nhwc_to_format(
+                np.array([[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]],
+                         dtype=dtype),
+                data_format),
+            expected=nhwc_to_format(
+                np.array([[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]],
+                         dtype=dtype),
+                data_format))
 
-      self._assertOpOutputMatchesExpected(
-          lambda x: array_ops.depth_to_space(x, block_size=2),
-          np.array([[[[1, 2, 3, 4],
-                      [5, 6, 7, 8]],
-                     [[9, 10, 11, 12],
-                      [13, 14, 15, 16]]]], dtype=dtype),
-          expected=np.array([[[[1], [2], [5], [6]],
-                              [[3], [4], [7], [8]],
-                              [[9], [10], [13], [14]],
-                              [[11], [12], [15], [16]]]], dtype=dtype))
+        self._assertOpOutputMatchesExpected(
+            make_op(data_format),
+            nhwc_to_format(
+                np.array([[[[1, 2, 3, 4],
+                            [5, 6, 7, 8]],
+                           [[9, 10, 11, 12],
+                            [13, 14, 15, 16]]]], dtype=dtype),
+                data_format),
+            expected=nhwc_to_format(
+                np.array([[[[1], [2], [5], [6]],
+                           [[3], [4], [7], [8]],
+                           [[9], [10], [13], [14]],
+                           [[11], [12], [15], [16]]]], dtype=dtype),
+                data_format))
 
   def testSpaceToDepth(self):
+    def make_op(data_format):
+      def op(x):
+        return array_ops.space_to_depth(x, block_size=2,
+                                        data_format=data_format)
+      return op
+
     for dtype in self.numeric_types:
-      self._assertOpOutputMatchesExpected(
-          lambda x: array_ops.space_to_depth(x, block_size=2),
-          np.array([[[[1], [2]],
-                     [[3], [4]]]], dtype=dtype),
-          expected=np.array([[[[1, 2, 3, 4]]]], dtype=dtype))
+      for data_format in ["NCHW", "NHWC"]:
+        self._assertOpOutputMatchesExpected(
+            make_op(data_format),
+            nhwc_to_format(np.array([[[[1], [2]],
+                                      [[3], [4]]]], dtype=dtype),
+                           data_format),
+            expected=nhwc_to_format(np.array([[[[1, 2, 3, 4]]]], dtype=dtype),
+                                    data_format))
 
-      self._assertOpOutputMatchesExpected(
-          lambda x: array_ops.space_to_depth(x, block_size=2),
-          np.array([[[[1, 2, 3], [4, 5, 6]],
-                     [[7, 8, 9], [10, 11, 12]]]], dtype=dtype),
-          expected=np.array([[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]],
-                            dtype=dtype))
+        self._assertOpOutputMatchesExpected(
+            make_op(data_format),
+            nhwc_to_format(np.array([[[[1, 2, 3], [4, 5, 6]],
+                                      [[7, 8, 9], [10, 11, 12]]]], dtype=dtype),
+                           data_format),
+            expected=nhwc_to_format(
+                np.array([[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]],
+                         dtype=dtype),
+                data_format))
 
-      self._assertOpOutputMatchesExpected(
-          lambda x: array_ops.space_to_depth(x, block_size=2),
-          np.array([[[[1], [2], [5], [6]],
-                     [[3], [4], [7], [8]],
-                     [[9], [10], [13], [14]],
-                     [[11], [12], [15], [16]]]], dtype=dtype),
-          expected=np.array([[[[1, 2, 3, 4],
-                               [5, 6, 7, 8]],
-                              [[9, 10, 11, 12],
-                               [13, 14, 15, 16]]]], dtype=dtype))
+        self._assertOpOutputMatchesExpected(
+            make_op(data_format),
+            nhwc_to_format(np.array([[[[1], [2], [5], [6]],
+                                      [[3], [4], [7], [8]],
+                                      [[9], [10], [13], [14]],
+                                      [[11], [12], [15], [16]]]], dtype=dtype),
+                           data_format),
+            expected=nhwc_to_format(
+                np.array([[[[1, 2, 3, 4],
+                            [5, 6, 7, 8]],
+                           [[9, 10, 11, 12],
+                            [13, 14, 15, 16]]]], dtype=dtype),
+                data_format))
 
   def _assertSoftplusMatchesExpected(self, features, dtype):
     features = np.array(features, dtype=dtype)
diff --git a/tensorflow/compiler/tests/variable_ops_test.py b/tensorflow/compiler/tests/variable_ops_test.py
index c50342dee45eba6ae54f01653ecc81ef096b547b..b08d6ab21e0746558cb3d4818d4c822c45d2e9ee 100644
--- a/tensorflow/compiler/tests/variable_ops_test.py
+++ b/tensorflow/compiler/tests/variable_ops_test.py
@@ -107,11 +107,26 @@ class VariableOpsTest(XLATestCase):
                  [[[30, 31, 32], [33, 34, 35]], [[0, 1, 2], [3, 4, 5]]]],
             ).astype(dtype), sess.run(x))
 
+  def testShape(self):
+    for dtype in self.numeric_types:
+      init = np.ones([2, 3]).astype(dtype)
+      with self.test_session() as session, self.test_scope():
+        v = resource_variable_ops.ResourceVariable(init)
+        session.run(variables.variables_initializer([v]))
+        h = v.handle
+        s32, s64 = session.run([
+            resource_variable_ops.variable_shape(h),
+            resource_variable_ops.variable_shape(h, out_type=dtypes.int64)
+        ])
+        self.assertEqual(s32.dtype, np.int32)
+        self.assertEqual(s64.dtype, np.int64)
+        self.assertAllEqual(s32, [2, 3])
+        self.assertAllEqual(s64, [2, 3])
+
   def testReadWrite(self):
     """Tests initialization, reading, and writing a resource variable."""
     for dtype in self.numeric_types:
       with self.test_session() as session:
-        print(ops.get_default_graph())
         with self.test_scope():
           with variable_scope.variable_scope("ascope", use_resource=True):
             x = variable_scope.get_variable(
diff --git a/tensorflow/compiler/tests/xla_test.py b/tensorflow/compiler/tests/xla_test.py
index 0be127997e5211f810ca791187486760881fe172..7e1f5c76ed65946363cc3c113ab1a9862f87b289 100644
--- a/tensorflow/compiler/tests/xla_test.py
+++ b/tensorflow/compiler/tests/xla_test.py
@@ -53,41 +53,100 @@ class XLATestCase(test.TestCase):
     super(XLATestCase, self).__init__(method_name)
     self.device = FLAGS.test_device
     self.has_custom_call = (self.device == 'XLA_CPU')
-    self.all_tf_types = [
+    self._all_tf_types = set([
         dtypes.as_dtype(types_pb2.DataType.Value(name))
         for name in FLAGS.types.split(',')
-    ]
-    self.int_tf_types = [
-        dtype for dtype in self.all_tf_types if dtype.is_integer
-    ]
-    self.float_tf_types = [
-        dtype for dtype in self.all_tf_types if dtype.is_floating
-    ]
-    self.complex_tf_types = [
-        dtype for dtype in self.all_tf_types if dtype.is_complex
-    ]
-    self.numeric_tf_types = (
-        self.int_tf_types + self.float_tf_types + self.complex_tf_types)
-
-    self.all_types = [dtype.as_numpy_dtype for dtype in self.all_tf_types]
-    self.int_types = [dtype.as_numpy_dtype for dtype in self.int_tf_types]
-    self.float_types = [dtype.as_numpy_dtype for dtype in self.float_tf_types]
-    self.complex_types = [
+    ])
+    self.int_tf_types = set([
+        dtype for dtype in self._all_tf_types if dtype.is_integer
+    ])
+    self._float_tf_types = set([
+        dtype for dtype in self._all_tf_types if dtype.is_floating
+    ])
+    self.complex_tf_types = set([
+        dtype for dtype in self._all_tf_types if dtype.is_complex
+    ])
+    self._numeric_tf_types = set(
+        self.int_tf_types | self._float_tf_types | self.complex_tf_types)
+
+    self._all_types = set(
+        [dtype.as_numpy_dtype for dtype in self._all_tf_types])
+    self.int_types = set([dtype.as_numpy_dtype for dtype in self.int_tf_types])
+    self._float_types = set(
+        [dtype.as_numpy_dtype for dtype in self._float_tf_types])
+    self.complex_types = set([
         dtype.as_numpy_dtype for dtype in self.complex_tf_types
-    ]
-    self.numeric_types = self.int_types + self.float_types + self.complex_types
+    ])
+    self._numeric_types = set(
+        self.int_types | self._float_types | self.complex_types)
 
     # Parse the manifest file, if any, into a regex identifying tests to
     # disable
     self.disabled_regex = None
+    self._method_types_filter = dict()
+    # TODO(xpan): Make it text proto if it doesn't scale.
+    # Each line of the manifest file specifies an entry. The entry can be
+    # 1) TestNameRegex  // E.g. CumprodTest.* Or
+    # 2) TestName TypeName  // E.g. AdamOptimizerTest.testSharing DT_BFLOAT16
+    # The 1) disables the entire test. While 2) only filter some numeric types
+    # so that they are not used in those tests.
+
     if FLAGS.disabled_manifest is not None:
       comments_re = re.compile('#.*$')
       manifest_file = open(FLAGS.disabled_manifest, 'r')
-      lines = manifest_file.read().splitlines()
-      lines = [comments_re.sub('', l).strip() for l in lines]
-      self.disabled_regex = re.compile('|'.join(lines))
+      disabled_tests = []
+      disabled_method_types = []
+      for l in manifest_file.read().splitlines():
+        entry = comments_re.sub('', l).strip().split(' ')
+        if len(entry) == 1:
+          disabled_tests.append(entry[0])
+        elif len(entry) == 2:
+          disabled_method_types.append(
+              (entry[0], entry[1].strip().split(',')))
+        else:
+          raise ValueError('Bad entry in manifest file.')
+
+      self.disabled_regex = re.compile('|'.join(disabled_tests))
+      for method, types in disabled_method_types:
+        self._method_types_filter[method] = set([
+            dtypes.as_dtype(types_pb2.DataType.Value(name)).as_numpy_dtype
+            for name in types])
       manifest_file.close()
 
+  @property
+  def all_tf_types(self):
+    name = '{}.{}'.format(type(self).__name__, self._testMethodName)
+    tf_types = set([dtypes.as_dtype(t)
+                    for t in self._method_types_filter.get(name, set())])
+    return self._all_tf_types - tf_types
+
+  @property
+  def float_types(self):
+    name = '{}.{}'.format(type(self).__name__, self._testMethodName)
+    return self._float_types - self._method_types_filter.get(name, set())
+
+  @property
+  def float_tf_types(self):
+    name = '{}.{}'.format(type(self).__name__, self._testMethodName)
+    return self._float_tf_types - self._method_types_filter.get(name, set())
+
+  @property
+  def numeric_tf_types(self):
+    name = '{}.{}'.format(type(self).__name__, self._testMethodName)
+    tf_types = set([dtypes.as_dtype(t)
+                    for t in self._method_types_filter.get(name, set())])
+    return self._numeric_tf_types - tf_types
+
+  @property
+  def numeric_types(self):
+    name = '{}.{}'.format(type(self).__name__, self._testMethodName)
+    return self._numeric_types - self._method_types_filter.get(name, set())
+
+  @property
+  def all_types(self):
+    name = '{}.{}'.format(type(self).__name__, self._testMethodName)
+    return self._all_types - self._method_types_filter.get(name, set())
+
   def setUp(self):
     super(XLATestCase, self).setUp()
     name = '{}.{}'.format(type(self).__name__, self._testMethodName)
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index 5a81438b1c48e7f0ef66dae072092974db24c621..5d1cb6d73570a1a3efbe0d2d37d9746bc0e2528f 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -1,6 +1,6 @@
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
 
 package_group(
     name = "internal",
@@ -25,6 +25,30 @@ package(
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
 load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
 
+cc_library(
+    name = "tf2xla_supported_ops_lib",
+    srcs = ["tf2xla_supported_ops.cc"],
+    hdrs = ["tf2xla_supported_ops.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":xla_compiler",
+        "//tensorflow/compiler/tf2xla/kernels:xla_cpu_only_ops",
+        "//tensorflow/compiler/tf2xla/kernels:xla_ops",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:ops",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_cc_binary(
+    name = "tf2xla_supported_ops",
+    srcs = ["tf2xla_supported_ops_main.cc"],
+    visibility = ["//visibility:public"],
+    deps = [":tf2xla_supported_ops_lib"],
+)
+
 xla_proto_library(
     name = "tf2xla_proto",
     srcs = ["tf2xla.proto"],
@@ -67,7 +91,6 @@ cc_library(
         # Keep dependencies to a minimum here; this library is used in every AOT
         # binary produced by tfcompile.
         "//tensorflow/compiler/aot:runtime",
-        "//tensorflow/compiler/tf2xla:xla_local_runtime_context",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/core:framework_lite",
     ],
@@ -215,6 +238,7 @@ cc_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
     ],
@@ -357,13 +381,6 @@ tf_cc_test(
     ],
 )
 
-cc_library(
-    name = "xla_local_runtime_context",
-    hdrs = ["xla_local_runtime_context.h"],
-    visibility = ["//visibility:public"],
-    deps = ["//tensorflow/core:framework_lite"],
-)
-
 cc_library(
     name = "dump_graph",
     srcs = [
@@ -400,6 +417,7 @@ cc_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
     ],
 )
diff --git a/tensorflow/compiler/tf2xla/const_analysis.cc b/tensorflow/compiler/tf2xla/const_analysis.cc
index d57273d84442c17565a6ace1c29170a0f3ba583b..ab2f1e9a7ab577bbe704e568b21d9912439605ca 100644
--- a/tensorflow/compiler/tf2xla/const_analysis.cc
+++ b/tensorflow/compiler/tf2xla/const_analysis.cc
@@ -52,6 +52,8 @@ Status BackwardsConstAnalysis(const Graph& g,
       {"Conv2DBackpropInput", "input_sizes"},
       {"Conv3DBackpropFilterV2", "filter_sizes"},
       {"Conv3DBackpropInputV2", "input_sizes"},
+      {"Cumprod", "axis"},
+      {"Cumsum", "axis"},
       {"DepthwiseConv2dNativeBackpropFilter", "filter_sizes"},
       {"DepthwiseConv2dNativeBackpropInput", "input_sizes"},
       {"DynamicStitch", "indices"},
@@ -78,6 +80,7 @@ Status BackwardsConstAnalysis(const Graph& g,
       {"Range", "limit"},
       {"Range", "delta"},
       {"Reshape", "shape"},
+      {"ResizeBilinear", "size"},
       {"ResourceStridedSliceAssign", "begin"},
       {"ResourceStridedSliceAssign", "end"},
       {"ResourceStridedSliceAssign", "strides"},
diff --git a/tensorflow/compiler/tf2xla/dump_graph.cc b/tensorflow/compiler/tf2xla/dump_graph.cc
index ddd912b87315f7943915153b5bf73531107af54d..03603ee9baefd1d20d220faf63c9c1c427ebdf31 100644
--- a/tensorflow/compiler/tf2xla/dump_graph.cc
+++ b/tensorflow/compiler/tf2xla/dump_graph.cc
@@ -63,7 +63,12 @@ string MakeUniquePath(string name) {
 
 string DumpGraphDefToFile(const string& name, GraphDef const& graph_def) {
   string path = MakeUniquePath(name);
-  TF_CHECK_OK(WriteTextProto(Env::Default(), path, graph_def));
+  Status status = WriteTextProto(Env::Default(), path, graph_def);
+  if (!status.ok()) {
+    VLOG(1) << "Failed to dump GraphDef to file: " << path << " : " << status;
+    path.clear();
+    path = "(unavailable)";
+  }
   return path;
 }
 
@@ -79,7 +84,13 @@ string DumpGraphToFile(const string& name, Graph const& graph,
 
 string DumpFunctionDefToFile(const string& name, FunctionDef const& fdef) {
   string path = MakeUniquePath(name);
-  TF_CHECK_OK(WriteTextProto(Env::Default(), path, fdef));
+  Status status = WriteTextProto(Env::Default(), path, fdef);
+  if (!status.ok()) {
+    VLOG(1) << "Failed to dump FunctionDef to file: " << path << " : "
+            << status;
+    path.clear();
+    path = "(unavailable)";
+  }
   return path;
 }
 
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 40a484da0980004b43564f1c57be0426d21379fb..dd67a1dea9656bac9cf3eaa09295a0d42e283706 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/control_flow.h"
 #include "tensorflow/core/lib/gtl/optional.h"
 
@@ -528,255 +529,101 @@ Status FunctionalizeLoop(Graph* graph, Frame* frame,
 
 class FunctionalizeCond {
  public:
-  // Identifies the connected parts of the tf.Cond.
-  struct ClusterHandle {
-    explicit ClusterHandle(int representative = -1)
-        : representative(representative) {}
-
-    bool operator==(const ClusterHandle& other) const {
-      return representative == other.representative;
-    }
-
-    bool operator!=(const ClusterHandle& other) const {
-      return !(*this == other);
-    }
+  // All nodes are assumed to be either in no branch, then branch, else branch,
+  // or both branches (such as merge nodes).
+  enum Branch {
+    kElseBranch = 0,
+    kThenBranch = 1,
+    kBoth = 2,
+    kNeither = 3,
+    kNumBranchTypes = 4
+  };
 
-    bool operator<(const ClusterHandle& other) const {
-      return representative < other.representative;
+  // Returns a textual representation of the Branch b.
+  static string Branch_Name(FunctionalizeCond::Branch b);
+
+  // Comparison function used for sorting nodes consistently.
+  struct CondCmp {
+    bool operator()(const Node* lhs, const Node* rhs) const {
+      bool lhs_is_resource =
+          lhs->num_inputs() > 0 ? (lhs->input_type(0) == DT_RESOURCE) : false;
+      bool rhs_is_resource =
+          rhs->num_inputs() > 0 ? (rhs->input_type(0) == DT_RESOURCE) : false;
+      return std::tie(lhs_is_resource, lhs->name()) <
+             std::tie(rhs_is_resource, rhs->name());
     }
+  };
 
-    bool operator>(const ClusterHandle& other) const {
-      return representative > other.representative;
-    }
+  // Functionalize all the switch-merge nodes of a loop-free graph into XlaIf
+  // nodes. That is, attempt to transform every remaining switch and merge nodes
+  // in the graph into XlaIf nodes.
+  // Precondition: All while loops have been removed from graph.
+  static Status Functionalize(Graph* graph, FunctionLibraryDefinition* library);
 
+ private:
+  struct ForwardFlowNode {
+    explicit ForwardFlowNode(Branch branch = Branch::kNeither)
+        : branch(branch), count(0) {}
     string ToString() const {
-      return strings::StrCat("Cluster_", representative);
+      return strings::StrCat("branch=", Branch_Name(branch), " count=", count);
     }
-
-    // Vector of UnionFind<ClusterHandle> indexable by ClusterHandle and Node*.
-    struct Vector {
-      explicit Vector(size_t size) : clusters(size) {}
-
-      UnionFind<ClusterHandle>& at(const ClusterHandle& cluster) {
-        return clusters.at(cluster.representative);
-      }
-
-      UnionFind<ClusterHandle>& at(const Node* node) {
-        return clusters.at(node->id());
-      }
-
-      UnionFind<ClusterHandle>& operator[](const Node* node) {
-        return clusters.at(node->id());
-      }
-
-      size_t size() const { return clusters.size(); }
-
-      void resize(size_t count) { return clusters.resize(count); }
-
-     private:
-      std::vector<UnionFind<ClusterHandle>> clusters;
-    };
-
-   private:
-    int representative;
-  };
-
-  // Represents a node in the clustered graph consisting of switch_nodes,
-  // merge_nodes as well as the edges into and out of this node to other
-  // Clusters. Each Cluster corresponds to a ClusterHandle and has a
-  // corresponding representative.
-  struct Cluster {
-    std::unordered_set<Node*> switch_nodes;
-    std::unordered_set<Node*> merge_nodes;
-    std::unordered_set<Cluster*> in_nodes;
-    std::unordered_set<Cluster*> out_nodes;
-
-    // A member of the ClusterHandle corresponding to this Cluster.
-    ClusterHandle representative;
-    bool visited = false;
-  };
-
-  // Represent the clustered graph as map from cluster representative to
-  // Cluster.
-  using ClusteredGraph = std::map<ClusterHandle, Cluster>;
-
-  // The arguments and condition of a XlaIf. The arguments are ordered by node
-  // id in the original graph.
-  struct CondArgs {
-    struct CondCmp {
-      bool operator()(const Node* lhs, const Node* rhs) const {
-        bool lhs_is_resource =
-            lhs->num_inputs() > 0 ? (lhs->input_type(0) == DT_RESOURCE) : false;
-        bool rhs_is_resource =
-            rhs->num_inputs() > 0 ? (rhs->input_type(0) == DT_RESOURCE) : false;
-        return std::tie(lhs_is_resource, lhs->name()) <
-               std::tie(rhs_is_resource, rhs->name());
-      }
-    };
-    Node* conditional = nullptr;
-    std::set<Node*, CondCmp> args;
+    Branch branch;
+    int count;
   };
 
-  static Status Functionalize(Graph* graph, FunctionLibraryDefinition* library);
-
- private:
   FunctionalizeCond(Graph* graph, FunctionLibraryDefinition* library)
-      : clusters_(graph->num_node_ids()), library_(library), graph_(graph) {}
-
-  // Returns a vector of Merge nodes from the clustered graph where the nodes
-  // are sorted by the number of switch nodes minus number of merge nodes
-  // from a root of the clustered graph to the given Merge node, with ties
-  // broken by the representative of the Cluster.
-  std::vector<std::pair<int, Cluster*>> SortedMergeNodes();
-
-  // Returns whether the graph has no conditionals.
-  bool NoConditionals() const { return merge_nodes_.empty(); }
-
-  // Construct the clustered graph by creating nodes for each cluster and the
-  // connections between the clusters. Switch and Merge nodes partition
-  // clusters, so iterate over those. Note: a Cluster may have neither a
-  // Merge or Switch but will have an in/out edge from a Cluster that has.
-  void CreateClusters();
-
-  // Creates the clustered graph by identifying all the edges between different
-  // clusters and collecting all switch and merge nodes that correspond to a
-  // cluster.
-  void CreateClusteredGraph();
-
-  // If `from` and `to` correspond to different clusters, then merge the nodes
-  // in the clustered graph corresponding to `from` and `to`.
-  //
-  // If `remove_from_graph` is specified then the `from` node is also removed
-  // from the clustered graph post contracting the edge.
-  void ContractEdge(Cluster* from, Cluster* to, bool remove_from_graph = false);
+      : library_(library), graph_(graph) {}
+
+  // Perform the actual cond functionalization. Iterate over groups of switch
+  // nodes (linked by common predicate), from innermost to outermost, and
+  // extract into XlaIf nodes.
+  Status FunctionalizeInternal();
 
   // Converts a Merge node to a XlaIf. This encapsulates the process of
   // extracting the bodies needed for the then and else branch, creates a XlaIf
   // node, removing the nodes of the branches from the graph and replacing the
   // merge node with a XlaIf.
-  Status ConvertMergeToXlaIf(Cluster* merge_cluster);
-
-  // Removes a Switch cluster feeding directly into a Merge cluster by removing
-  // the Switch and Merge nodes and collapsing into a single cluster.
-  Status RemoveTrivialMerge(Cluster* merge_cluster);
-
-  // Returns the switch cluster corresponding to the merge node. This function
-  // only returns the switch cluster in the simple case where we have a switch
-  // node is the entry of a diamond corresponding to a conditional:
-  //
-  //           Switch
-  //          /      \
-  //     Branch      Branch
-  //          \      /
-  //        merge_cluster
-  //
-  // Note: either of the branches may be empty. The case where both branches are
-  // empty is handled by RemoveTrivialMerge.
-  gtl::optional<Cluster*> GetSwitchCluster(const Cluster& merge_cluster);
-
-  // Determines the arguments needed as input to the Merge cluster originating
-  // from the Switch cluster.
-  xla::StatusOr<CondArgs> DetermineCondArgs(const Cluster& merge_cluster,
-                                            const Cluster& switch_cluster);
-
-  // Builds a XlaIfOp to replace the Merge node with.
-  xla::StatusOr<Node*> BuildAndAddXlaIfOp(const CondArgs& cond_args,
-                                          const Cluster& merge_cluster,
-                                          const std::vector<Node*>& outputs);
+  Status ConvertCorrespondingMergeToXlaIf(
+      const std::vector<Node*>& switch_nodes,
+      const std::vector<Node*>& merge_nodes, Node* predicate);
+
+  // Builds a XlaIfOp to replace the Switch-Graph-Merge cluster with.
+  xla::StatusOr<Node*> BuildAndAddXlaIfOp(
+      const std::vector<Node*>& switch_nodes,
+      const std::vector<Node*>& merge_nodes, Node* predicate);
 
   // Extracts a function body corresponding to the given input edge of the merge
   // node.
-  Status ExtractBody(const CondArgs& cond_args, const Cluster& merge_cluster,
-                     const std::vector<Node*>& outputs, int input_edge,
+  Status ExtractBody(const std::vector<Node*>& switch_nodes,
+                     const std::vector<Node*>& merge_nodes, int input_edge,
                      Graph* body);
 
   // Adds all the input edges to `if_node` corresponding to the arguments.
-  Status AddInputEdges(const CondArgs& cond_args, Node* if_node);
+  Status AddInputEdges(const std::vector<Node*>& cond_args, Node* predicate,
+                       Node* if_node);
 
   // Adds all output edges from the `if_node`.
   Status AddOutputEdges(const std::vector<Node*>& outputs, Node* if_node);
 
-  // Removes all nodes from the graph that are part of cluster.
-  void RemoveClusterNodes(Cluster* cluster);
+  // Returns the switches of graph_ in postorder. Dead switch nodes are skipped
+  // and removed from the graph.
+  std::vector<Node*> DetermineSwitchOrder();
 
-  // Removes all argument nodes that are unused.
-  template <class T>
-  void RemoveUnusedArgs(const T& args);
+  // Update the state for destination based on the state of source and the node
+  // being updated.
+  Status Join(const ForwardFlowNode& src_state, const Node* dst,
+              ForwardFlowNode* dst_state);
 
-  // Removes all Merge nodes in merge_cluster.
-  void RemoveMergeNodes(Cluster* merge_cluster);
-
-  // Returns the representative member of the corresponding cluster.
-  ClusterHandle Representative(const Node* node) {
-    return clusters_.at(node).Get();
-  }
+  // Validates that the branch_map and frontier of nodes for the conditional
+  // section are as expected.
+  Status ValidBranchMapAndFrontier(
+      const std::unordered_map<Node*, ForwardFlowNode>& branch_map,
+      const std::unordered_set<Node*>& frontier);
 
-  ClusteredGraph clustered_graph_;
-  ClusterHandle::Vector clusters_;
-  std::unordered_set<Node*> merge_nodes_;
-  std::unordered_set<Node*> switch_nodes_;
   FunctionLibraryDefinition* library_;
   Graph* graph_;
 };
 
-std::ostream& operator<<(std::ostream& os,
-                         const FunctionalizeCond::ClusterHandle& c) {
-  os << c.ToString();
-  return os;
-}
-
-// Returns a dot representation of the clustered graph showing the connections
-// between the nodes and the nodes in each cluster.
-string DebugString(const Graph& graph,
-                   FunctionalizeCond::ClusterHandle::Vector* clusters) {
-  string ret = "digraph {\ncompound=true;labeljust=\"r\";ranksep=0.24\n";
-  std::map<FunctionalizeCond::ClusterHandle, string> subgraphs;
-  auto name = [](const Node* n) {
-    return strings::StrCat(n->type_string(), "_", n->id());
-  };
-  for (Node* n : graph.nodes()) {
-    strings::StrAppend(&subgraphs[clusters->at(n).Get()], n->id(), " [label=\"",
-                       name(n), "\"];\n");
-  }
-  for (auto kv : subgraphs) {
-    strings::StrAppend(&ret, "subgraph cluster_", kv.first.ToString(), " {\n",
-                       "style=filled; color=lightgrey;", "label = \"",
-                       kv.first.ToString(), "\";\n", kv.second, "}\n");
-  }
-  for (Node* n : graph.nodes()) {
-    for (Node* in : n->in_nodes()) {
-      strings::StrAppend(&ret, in->id(), " -> ", n->id(), ";\n");
-    }
-  }
-  return strings::StrCat(ret, "} // end");
-}
-
-string DebugString(const FunctionalizeCond::ClusteredGraph& clustered_graph) {
-  string ret = "digraph {\ncompound=true;labeljust=\"r\";\n";
-  auto name = [](const FunctionalizeCond::Cluster& cluster) {
-    return cluster.representative.ToString();
-  };
-  for (auto kv : clustered_graph) {
-    if (!kv.second.switch_nodes.empty() || !kv.second.merge_nodes.empty()) {
-      strings::StrAppend(
-          &ret, kv.first.ToString(), " [label=\"", name(kv.second),
-          kv.second.switch_nodes.empty()
-              ? ""
-              : strings::StrCat(" switches=", kv.second.switch_nodes.size()),
-          kv.second.merge_nodes.empty()
-              ? ""
-              : strings::StrCat(" merges=", kv.second.merge_nodes.size()),
-          "\"];\n");
-    }
-  }
-  for (auto kv : clustered_graph) {
-    for (auto in : kv.second.in_nodes) {
-      strings::StrAppend(&ret, name(*in), " -> ", name(kv.second), ";\n");
-    }
-  }
-  return strings::StrCat(ret, "} // end");
-}
-
 bool IsDeadSwitch(const Node* node) {
   for (const Edge* e : node->out_edges()) {
     const Node* dst = e->dst();
@@ -792,243 +639,212 @@ bool IsDeadSwitch(const Node* node) {
   return true;
 }
 
-void FunctionalizeCond::CreateClusters() {
-  for (Node* node : graph_->nodes()) {
-    if (IsSwitch(node)) {
-      switch_nodes_.insert(node);
-    } else if (IsMerge(node)) {
-      merge_nodes_.insert(node);
-    }
-    ClusterHandle& cluster = clusters_.at(node).Get();
-    cluster = ClusterHandle(node->id());
-  }
-
-  // If there are no Merge nodes, then terminate.
-  if (merge_nodes_.empty()) {
-    return;
-  }
-
-  // Remove all dead Switch nodes.
-  RemoveUnusedArgs(switch_nodes_);
-
-  // All parent_'s are still nullptr so clusters_ may still be resized. Resize
-  // conservatively assuming all merge nodes become XlaIf nodes.
-  clusters_.resize(clusters_.size() + merge_nodes_.size());
+string FunctionalizeCond::Branch_Name(FunctionalizeCond::Branch b) {
+  const string branch_name[FunctionalizeCond::kNumBranchTypes + 1] = {
+      "else", "then", "both", "neither", "count"};
+  return branch_name[b];
+}
 
-  // Merge a cluster with its input, unless the input is a Switch node or
-  // the node is a Merge node.
-  for (const Node* node : graph_->nodes()) {
-    if (IsMerge(node) || IsSwitch(node) || !node->IsOp()) {
-      continue;
+Status FunctionalizeCond::ValidBranchMapAndFrontier(
+    const std::unordered_map<Node*, FunctionalizeCond::ForwardFlowNode>&
+        branch_map,
+    const std::unordered_set<Node*>& frontier) {
+  std::unordered_set<const Node*> pending[kNumBranchTypes];
+  for (const auto& kv : branch_map) {
+    if (kv.second.count != kv.first->in_edges().size()) {
+      return errors::FailedPrecondition("Value ", kv.first->DebugString(),
+                                        " not dominated by switch nodes.");
     }
-    for (const Node* in : node->in_nodes()) {
-      if (in->IsOp() && !IsSwitch(in) && !IsMerge(in)) {
-        clusters_.at(node).Merge(&clusters_.at(in));
-      }
+    if (VLOG_IS_ON(1)) {
+      // Append attribute to the graph if running with logging to make the
+      // changes clearer in the visualization.
+      kv.first->AddAttr("_XlaFunctionalizeBranch",
+                        Branch_Name(kv.second.branch));
     }
-    // Group all source clusters together.
-    if (node->IsSource() || node->in_edges().empty()) {
-      clusters_.at(node).Merge(&clusters_.at(ClusterHandle(Graph::kSourceId)));
+  }
+  for (Node* n : frontier) {
+    pending[branch_map.at(n).branch].insert(n);
+  }
+  TF_RET_CHECK(pending[kNeither].empty()) << NodesToString(pending[kNeither]);
+  for (const Node* n : pending[kBoth]) {
+    TF_RET_CHECK(IsMerge(n)) << n->DebugString();
+    // Merge nodes may be in then or else branch too
+  }
+  int index = (pending[kThenBranch].size() <= pending[kElseBranch].size())
+                  ? kThenBranch
+                  : kElseBranch;
+  int other = 1 - index;
+  for (const Node* n : pending[index]) {
+    if (pending[other].find(n) != pending[other].end()) {
+      return errors::Internal(
+          "Node (", n->DebugString().c_str(),
+          ") in both Else and Then branch should be in Both.");
     }
   }
+  return Status::OK();
 }
 
-void FunctionalizeCond::ContractEdge(Cluster* from, Cluster* to,
-                                     bool remove_from_graph) {
-  VLOG(3) << "ContractEdge from = " << from->representative
-          << " to = " << to->representative;
-  if (from->representative == to->representative) {
-    return;
-  }
-  to->merge_nodes.insert(from->merge_nodes.begin(), from->merge_nodes.end());
-  from->merge_nodes.clear();
-  to->switch_nodes.insert(from->switch_nodes.begin(), from->switch_nodes.end());
-  from->switch_nodes.clear();
-
-  for (Cluster* from_out : from->out_nodes) {
-    from_out->in_nodes.erase(from);
-    if (from_out->representative != to->representative) {
-      from_out->in_nodes.insert(to);
-      to->out_nodes.insert(from_out);
+Status FunctionalizeCond::Join(const ForwardFlowNode& src_state,
+                               const Node* dst, ForwardFlowNode* dst_state) {
+  TF_RET_CHECK(dst_state->branch != Branch::kBoth &&
+               dst_state->branch != Branch::kNumBranchTypes)
+      << "Unexpected/Invalid branch type: Merging "
+      << Branch_Name(src_state.branch) << " with "
+      << Branch_Name(dst_state->branch);
+  if (dst_state->branch == Branch::kNeither) {
+    dst_state->branch = src_state.branch;
+  } else if (src_state.branch != dst_state->branch &&
+             src_state.branch != Branch::kNeither) {
+    if (IsMerge(dst)) {
+      dst_state->branch = Branch::kBoth;
+    } else {
+      return errors::Internal("Illegal merge: ", src_state.ToString(), " with ",
+                              dst_state->ToString(), " for ",
+                              dst->DebugString());
     }
   }
-  from->out_nodes.clear();
+  ++dst_state->count;
+  return Status::OK();
+}
 
-  for (Cluster* from_in : from->in_nodes) {
-    from_in->out_nodes.erase(from);
-    if (from_in->representative != to->representative) {
-      from_in->out_nodes.insert(to);
-      to->in_nodes.insert(from_in);
+std::vector<Node*> FunctionalizeCond::DetermineSwitchOrder() {
+  std::vector<Node*> dead_switches;
+  std::vector<Node*> switch_order;
+  DFS(*graph_, nullptr, [this, &dead_switches, &switch_order](Node* n) {
+    if (IsSwitch(n)) {
+      if (IsDeadSwitch(n)) {
+        dead_switches.push_back(n);
+      } else {
+        switch_order.push_back(n);
+      }
     }
+  });
+
+  // Remove all dead switch nodes.
+  for (Node* n : dead_switches) {
+    graph_->RemoveNode(n);
   }
-  from->in_nodes.clear();
 
-  to->in_nodes.erase(from);
-  to->out_nodes.erase(from);
-  clusters_.at(to->representative).Merge(&clusters_.at(from->representative));
-  from->visited = true;
+  return switch_order;
+}
 
-  if (remove_from_graph) {
-    clustered_graph_.erase(from->representative);
+Status FunctionalizeCond::FunctionalizeInternal() {
+  std::vector<Node*> switch_order = DetermineSwitchOrder();
+  // If there are no switch nodes, then terminate.
+  if (switch_order.empty()) {
+    return Status::OK();
   }
-}
 
-void FunctionalizeCond::CreateClusteredGraph() {
-  auto update_cluster_for_node = [this](Node* node) -> Cluster& {
-    ClusterHandle repr = Representative(node);
-    Cluster& cluster_node = clustered_graph_[repr];
-    cluster_node.representative = repr;
-    for (const Node* in : node->in_nodes()) {
-      ClusterHandle other_repr = Representative(in);
-      // Skip source, sink and internal edges.
-      if (other_repr == repr) {
-        continue;
-      }
-      Cluster& cluster_node_in = clustered_graph_[other_repr];
-      cluster_node.in_nodes.insert(&cluster_node_in);
-      cluster_node_in.out_nodes.insert(&cluster_node);
-      cluster_node_in.representative = other_repr;
-    }
-    for (const Node* out : node->out_nodes()) {
-      ClusterHandle other_repr = Representative(out);
-      // Skip source, sink and internal edges.
-      if (other_repr == repr) {
-        continue;
-      }
-      Cluster& cluster_node_out = clustered_graph_[other_repr];
-      cluster_node.out_nodes.insert(&cluster_node_out);
-      cluster_node_out.in_nodes.insert(&cluster_node);
-      cluster_node_out.representative = other_repr;
-    }
-    return cluster_node;
+  struct PredicateSwitches {
+    explicit PredicateSwitches(Node* predicate) : predicate(predicate) {}
+
+    Node* predicate;
+    std::vector<Node*> switches;
   };
-  update_cluster_for_node(graph_->source_node());
-  for (Node* node : switch_nodes_) {
-    update_cluster_for_node(node).switch_nodes.insert(node);
-  }
-  for (Node* node : merge_nodes_) {
-    update_cluster_for_node(node).merge_nodes.insert(node);
-  }
 
   // Merge Switch nodes with common predicate.
-  std::unordered_map<Node*, std::vector<Node*>> predicate_to_switch;
-  for (Node* node : switch_nodes_) {
-    Node* tmp;
-    TF_CHECK_OK(node->input_node(1, &tmp));
-    predicate_to_switch[tmp].push_back(node);
-  }
-  for (auto kv : predicate_to_switch) {
-    Cluster& first = clustered_graph_.at(Representative(kv.second.front()));
-    for (Node* switch_node : kv.second) {
-      ClusterHandle handle = Representative(switch_node);
-      Cluster& cluster = clustered_graph_.at(handle);
-      ContractEdge(&cluster, &first, /*remove_from_graph=*/true);
+  std::vector<PredicateSwitches> predicate_switch_order;
+  std::unordered_map<Node*, int> predicate_index;
+  // The nodes in switch_order are in reverse topological order, but the
+  // clustered switches need not be (i.e., when considered as a cluster one
+  // element of a cluster may be later in the topological order than another
+  // node whose cluster is later in the topological order of clustered
+  // switches).
+  for (auto it = switch_order.rbegin(); it != switch_order.rend(); ++it) {
+    Node* pred;
+    TF_CHECK_OK((*it)->input_node(1, &pred));
+    if (predicate_index.find(pred) == predicate_index.end()) {
+      predicate_index[pred] = predicate_switch_order.size();
+      predicate_switch_order.emplace_back(pred);
     }
+    predicate_switch_order[predicate_index[pred]].switches.push_back(*it);
   }
 
-  // Merge Merge nodes with common input together.
-  for (Node* node : merge_nodes_) {
-    Cluster& cluster = clustered_graph_.at(Representative(node));
-    for (const Node* in : node->in_nodes()) {
-      if (!in->IsOp()) {
-        continue;
-      }
-      Cluster& cluster_node_in = clustered_graph_.at(Representative(in));
-      // ContractEdge can modify out_nodes of cluster_node_in, so traverse
-      // over out_nodes assuming it does.
-      for (auto it = cluster_node_in.out_nodes.begin();
-           it != cluster_node_in.out_nodes.end();) {
-        if (!(*it)->merge_nodes.empty()) {
-          ContractEdge(*it++, &cluster, /*remove_from_graph=*/true);
-        } else {
-          ++it;
-        }
-      }
-    }
-  }
+  // Iterate from innermost set of clustered switches to outermost, replacing
+  // matching switch->merge subgraphs with single XlaIf nodes.
+  for (auto it = predicate_switch_order.rbegin();
+       it != predicate_switch_order.rend(); ++it) {
+    auto& ps = *it;
+    VLOG(3) << "Flow down from: " << ps.predicate->name() << " -> "
+            << NodesToString(ps.switches);
 
-  VLOG(3) << "Graph with clusters: " << DebugString(*graph_, &clusters_);
-  VLOG(3) << "ClusteredGraph: " << DebugString(clustered_graph_);
-}
+    std::unordered_map<Node*, ForwardFlowNode> branch_map;
+    std::unordered_set<Node*> frontier;
 
-gtl::optional<FunctionalizeCond::Cluster*> FunctionalizeCond::GetSwitchCluster(
-    const Cluster& merge_cluster) {
-  VLOG(3) << "GetSwitchCluster for " << merge_cluster.representative;
-  gtl::optional<Cluster*> switch_cluster;
-  if (merge_cluster.in_nodes.size() > 2) {
-    return gtl::nullopt;
-  }
-  for (Cluster* in : merge_cluster.in_nodes) {
-    Cluster* cluster = in;
-    if (in->switch_nodes.empty()) {
-      if (in->in_nodes.size() != 1 || in->out_nodes.size() != 1) {
-        return gtl::nullopt;
-      }
-      // There is only a single `in` cluster.
-      cluster = *in->in_nodes.begin();
-    }
-    if (cluster->switch_nodes.empty()) {
-      return gtl::nullopt;
-    }
+    std::vector<Node*> stack = ps.switches;
+    std::vector<bool> visited(graph_->num_node_ids(), false);
+    while (!stack.empty()) {
+      Node* n = stack.back();
+      stack.pop_back();
 
-    if (switch_cluster.has_value() && *switch_cluster != cluster) {
-      return gtl::nullopt;
-    } else {
-      switch_cluster = cluster;
-    }
-  }
-  return switch_cluster;
-}
+      if (visited[n->id()]) {
+        continue;
+      }
+      visited[n->id()] = true;
 
-xla::StatusOr<FunctionalizeCond::CondArgs> FunctionalizeCond::DetermineCondArgs(
-    const Cluster& merge_cluster, const Cluster& switch_cluster) {
-  VLOG(2) << "DetermineCondArgs for " << merge_cluster.representative
-          << " with switch cluster " << switch_cluster.representative;
-  CondArgs ret;
-  auto feeds_into_branch_cluster = [&](Node* switch_cluster) {
-    for (Node* out : switch_cluster->out_nodes()) {
-      ClusterHandle repr = Representative(out);
-      if (repr == merge_cluster.representative) {
-        return true;
+      // Propagate branch state along each edge of a switch node.
+      bool sink_only = true;
+      for (const Edge* e : n->out_edges()) {
+        Node* out = e->dst();
+        if (!out->IsOp()) {
+          continue;
+        }
+        sink_only = false;
+        // Propagate branch information.
+        ForwardFlowNode& ffn = branch_map[out];
+        if (IsSwitch(n)) {
+          int index = e->IsControlEdge() ? Branch::kNeither : e->src_output();
+          TF_RETURN_IF_ERROR(Join(ForwardFlowNode(Branch(index)), out, &ffn));
+        } else {
+          TF_RETURN_IF_ERROR(Join(branch_map[n], out, &ffn));
+        }
+        if (IsMerge(out)) {
+          if (out->in_edges().size() == ffn.count) {
+            frontier.insert(out);
+          }
+        } else if (!visited[out->id()] && ffn.count == out->in_edges().size()) {
+          // If all predecessors are dominated by the switch nodes, then add
+          // the output to the stack.
+          stack.push_back(out);
+        }
       }
-      for (Cluster* in : merge_cluster.in_nodes) {
-        if (repr == in->representative) {
-          return true;
+      if (sink_only) {
+        if (!IsIdentity(n)) {
+          VLOG(1) << "Feeding into sink: " << n->DebugString();
         }
       }
     }
-    return false;
-  };
-  for (Node* switch_cluster_node : switch_cluster.switch_nodes) {
-    if (!feeds_into_branch_cluster(switch_cluster_node)) {
-      continue;
-    }
 
-    Node* tmp;
-    TF_RETURN_IF_ERROR(switch_cluster_node->input_node(1, &tmp));
-    if (ret.conditional == nullptr) {
-      ret.conditional = tmp;
-    } else if (ret.conditional != tmp) {
-      return errors::Unimplemented(
-          "Switch statements with different conditionals cannot be "
-          "converted into functional conditional.");
+    TF_RETURN_IF_ERROR(ValidBranchMapAndFrontier(branch_map, frontier));
+    VLOG(2) << "FunctionalizeControlFlow (before XlaIf conversion): "
+            << dump_graph::DumpGraphToFile("functionalize_bc", *graph_);
+    std::vector<Node*> switch_nodes(ps.switches);
+    std::sort(switch_nodes.begin(), switch_nodes.end(), CondCmp());
+    std::vector<Node*> merge_nodes(frontier.begin(), frontier.end());
+    std::sort(merge_nodes.begin(), merge_nodes.end(), CondCmp());
+    TF_RETURN_IF_ERROR(ConvertCorrespondingMergeToXlaIf(
+        switch_nodes, merge_nodes, ps.predicate));
+    for (auto& del_kv : branch_map) {
+      graph_->RemoveNode(del_kv.first);
+    }
+    for (Node* node : switch_nodes) {
+      graph_->RemoveNode(node);
     }
-    ret.args.insert(switch_cluster_node);
+    VLOG(2) << "FunctionalizeControlFlow (after XlaIf conversion): "
+            << dump_graph::DumpGraphToFile("functionalize_ac", *graph_);
   }
-  return ret;
+  return Status::OK();
 }
 
 xla::StatusOr<Node*> FunctionalizeCond::BuildAndAddXlaIfOp(
-    const CondArgs& cond_args, const Cluster& merge_cluster,
-    const std::vector<Node*>& outputs) {
-  VLOG(2) << "Build if op for " << NodesToString(merge_cluster.merge_nodes)
-          << " with input " << NodesToString(cond_args.args);
+    const std::vector<Node*>& switch_nodes,
+    const std::vector<Node*>& merge_nodes, Node* predicate) {
+  VLOG(2) << "Build if op for " << NodesToString(merge_nodes) << " with input "
+          << NodesToString(switch_nodes);
 
   NodeDef if_def;
   // Create a new If node using the name of the merge node.
-  NodeDefBuilder builder(
-      strings::StrCat((*merge_cluster.merge_nodes.begin())->name(), "_If"),
-      "XlaIf");
+  NodeDefBuilder builder(strings::StrCat(predicate->name(), "_If"), "XlaIf");
   string branch[] = {"else_branch", "then_branch"};
   for (int i = 0; i < 2; ++i) {
     static std::atomic<int64> sequence_num(0LL);
@@ -1038,8 +854,7 @@ xla::StatusOr<Node*> FunctionalizeCond::BuildAndAddXlaIfOp(
     body_name.set_name(
         strings::StrCat("_functionalize_if_", branch[i], "_", id));
     auto body = xla::MakeUnique<Graph>(graph_->op_registry());
-    TF_RETURN_IF_ERROR(
-        ExtractBody(cond_args, merge_cluster, outputs, i, body.get()));
+    TF_RETURN_IF_ERROR(ExtractBody(switch_nodes, merge_nodes, i, body.get()));
     VLOG(3) << "Body " << branch[i] << ": " << DebugString(body.get());
     FunctionDef body_fdef;
     TF_RETURN_IF_ERROR(GraphToFunctionDef(*body, body_name.name(), &body_fdef));
@@ -1050,7 +865,7 @@ xla::StatusOr<Node*> FunctionalizeCond::BuildAndAddXlaIfOp(
   // Build input type.
   std::vector<NodeDefBuilder::NodeOut> inputs;
   DataTypeVector in_arg_types;
-  for (const Node* arg : cond_args.args) {
+  for (const Node* arg : switch_nodes) {
     const Edge* in_edge;
     TF_RETURN_IF_ERROR(arg->input_edge(0, &in_edge));
     if (in_edge->IsControlEdge()) {
@@ -1066,17 +881,17 @@ xla::StatusOr<Node*> FunctionalizeCond::BuildAndAddXlaIfOp(
 
   // Build output type.
   DataTypeVector out_type;
-  for (const Node* merge : merge_cluster.merge_nodes) {
+  for (const Node* merge : merge_nodes) {
     DataType dtype = merge->output_type(0);
     out_type.push_back(dtype);
   }
   builder.Attr("Tout", out_type);
 
   builder.Attr("Tcond", DT_BOOL);
-  builder.Device(cond_args.conditional->assigned_device_name());
+  builder.Device(predicate->assigned_device_name());
   // Conditional should be the first input ...
-  builder.Input(NodeDefBuilder::NodeOut(cond_args.conditional->name(), 0,
-                                        cond_args.conditional->output_type(0)));
+  builder.Input(
+      NodeDefBuilder::NodeOut(predicate->name(), 0, predicate->output_type(0)));
   // ... followed by the other inputs.
   builder.Input(inputs);
 
@@ -1085,53 +900,15 @@ xla::StatusOr<Node*> FunctionalizeCond::BuildAndAddXlaIfOp(
   return if_node;
 }
 
-void FunctionalizeCond::RemoveClusterNodes(Cluster* cluster) {
-  VLOG(3) << "RemoveClusterNodes for " << cluster->representative;
-  ClusterHandle repr = cluster->representative;
-  std::deque<Node*> to_delete;
-  for (Node* node : graph_->nodes()) {
-    if (Representative(node) == repr) {
-      to_delete.push_back(node);
-    }
-  }
-  for (Node* n : to_delete) {
-    graph_->RemoveNode(n);
-  }
-}
-
-template <class T>
-void FunctionalizeCond::RemoveUnusedArgs(const T& args) {
-  VLOG(2) << "RemoveUnusedArgs among: " << NodesToString(args);
-
-  std::deque<Node*> to_delete;
-  for (Node* arg : args) {
-    if (IsDeadSwitch(arg)) {
-      to_delete.push_back(arg);
-      for (Node* n : arg->out_nodes()) {
-        to_delete.push_back(n);
-      }
-    }
-  }
-  for (Node* n : to_delete) {
-    switch_nodes_.erase(n);
-    auto it = clustered_graph_.find(Representative(n));
-    if (it != clustered_graph_.end()) {
-      it->second.switch_nodes.erase(n);
-    }
-    graph_->RemoveNode(n);
-  }
-}
-
-Status FunctionalizeCond::ExtractBody(const CondArgs& cond_args,
-                                      const Cluster& merge_cluster,
-                                      const std::vector<Node*>& outputs,
+Status FunctionalizeCond::ExtractBody(const std::vector<Node*>& switch_nodes,
+                                      const std::vector<Node*>& merge_nodes,
                                       int input_edge, Graph* body) {
-  VLOG(2) << "ExtractBody for " << merge_cluster.representative
-          << " along edge " << input_edge;
+  VLOG(2) << "ExtractBody for " << NodesToString(merge_nodes) << " along edge "
+          << input_edge;
   std::vector<bool> squash_src_outputs(graph_->num_node_ids(), false);
   std::vector<Node*> node_map(graph_->num_node_ids(), nullptr);
   int arg_count = 0;
-  for (const auto* arg : cond_args.args) {
+  for (const auto* arg : switch_nodes) {
     DataType dtype = arg->input_type(0);
     TF_ASSIGN_OR_RETURN(Node * arg_node,
                         BuildArgNode(body, dtype, arg_count++));
@@ -1140,9 +917,9 @@ Status FunctionalizeCond::ExtractBody(const CondArgs& cond_args,
   }
 
   std::vector<Node*> stack;
-  stack.reserve(outputs.size());
-  for (int j = 0; j < outputs.size(); ++j) {
-    Node* node = outputs[j];
+  stack.reserve(switch_nodes.size());
+  for (int j = 0; j < merge_nodes.size(); ++j) {
+    Node* node = merge_nodes[j];
     TF_ASSIGN_OR_RETURN(node_map.at(node->id()),
                         BuildRetvalNode(body, node->output_type(0),
                                         /*index=*/j));
@@ -1153,7 +930,8 @@ Status FunctionalizeCond::ExtractBody(const CondArgs& cond_args,
       node_map.at(in->id()) = body->CopyNode(in);
     }
 
-    if (cond_args.args.find(in) == cond_args.args.end()) {
+    if (std::find(switch_nodes.begin(), switch_nodes.end(), in) ==
+        switch_nodes.end()) {
       body->AddEdge(node_map.at(in->id()), in_edge->src_output(),
                     node_map.at(node->id()), 0);
     } else {
@@ -1168,12 +946,12 @@ Status FunctionalizeCond::ExtractBody(const CondArgs& cond_args,
                       body);
 }
 
-Status FunctionalizeCond::AddInputEdges(const CondArgs& cond_args,
-                                        Node* if_node) {
+Status FunctionalizeCond::AddInputEdges(const std::vector<Node*>& cond_args,
+                                        Node* predicate, Node* if_node) {
   VLOG(3) << "AddInputEdges for " << if_node->name();
   int i = 0;
-  graph_->AddEdge(cond_args.conditional, 0, if_node, i++);
-  for (const Node* arg : cond_args.args) {
+  graph_->AddEdge(predicate, 0, if_node, i++);
+  for (const Node* arg : cond_args) {
     const Edge* in_edge;
     TF_RETURN_IF_ERROR(arg->input_edge(0, &in_edge));
     if (in_edge->IsControlEdge()) {
@@ -1210,173 +988,26 @@ Status FunctionalizeCond::AddOutputEdges(const std::vector<Node*>& outputs,
   return Status::OK();
 }
 
-void FunctionalizeCond::RemoveMergeNodes(Cluster* merge_cluster) {
-  VLOG(3) << "RemoveMergeNodes for " << merge_cluster->representative;
-  // Remove all merge nodes now dead post extraction of If.
-  for (auto it = merge_cluster->merge_nodes.begin();
-       it != merge_cluster->merge_nodes.end();) {
-    Node* node = *it;
-    graph_->RemoveNode(node);
-    merge_cluster->merge_nodes.erase(*it++);
-  }
-}
-
-Status FunctionalizeCond::RemoveTrivialMerge(Cluster* merge_cluster) {
-  Cluster* switch_cluster = *merge_cluster->in_nodes.begin();
-  if (switch_cluster->switch_nodes.empty()) {
-    return errors::FailedPrecondition(
-        "Not a trivial merge: no Switch node feeding into Merge node");
-  }
-
-  for (auto it = merge_cluster->merge_nodes.begin();
-       it != merge_cluster->merge_nodes.end();) {
-    // We have the following structure:
-    //   Op -> Switch -> Merge -> Consumer
-    // and we want to transform it to:
-    //   Op -> Consumer
-    Node* merge_node = *it;
-    Node* switch_node;
-    const Edge* in = nullptr;
-    TF_RETURN_IF_ERROR(merge_node->input_node(0, &switch_node));
-    TF_RETURN_IF_ERROR(switch_node->input_edge(0, &in));
-    for (auto out : merge_node->out_edges()) {
-      int src_output = out->dst_input() == Graph::kControlSlot
-                           ? Graph::kControlSlot
-                           : in->src_output();
-      graph_->AddEdge(in->src(), src_output, out->dst(), out->dst_input());
-    }
-    graph_->RemoveNode(*it++);
-  }
-  RemoveUnusedArgs(switch_cluster->switch_nodes);
-
-  return Status::OK();
-}
-
-Status FunctionalizeCond::ConvertMergeToXlaIf(Cluster* merge_cluster) {
-  VLOG(1) << "ConvertMergeToXlaIf for " << merge_cluster->representative;
-  gtl::optional<Cluster*> switch_cluster = GetSwitchCluster(*merge_cluster);
-  if (!switch_cluster.has_value()) {
-    return errors::FailedPrecondition(
-        "Merge cluster was not part of a simple conditional in the clustered "
-        "graph. Graph nodes in merge cluster ",
-        NodesToString(merge_cluster->merge_nodes));
-  }
-  TF_ASSIGN_OR_RETURN(auto cond_args,
-                      DetermineCondArgs(*merge_cluster, **switch_cluster));
-
-  // Sort the outputs by ID to produce more stable output.
-  std::vector<Node*> outputs(merge_cluster->merge_nodes.begin(),
-                             merge_cluster->merge_nodes.end());
-  std::sort(outputs.begin(), outputs.end(), CondArgs::CondCmp());
+Status FunctionalizeCond::ConvertCorrespondingMergeToXlaIf(
+    const std::vector<Node*>& switch_nodes,
+    const std::vector<Node*>& merge_nodes, Node* predicate) {
+  VLOG(1) << "ConvertMergeToXlaIf for " << NodesToString(switch_nodes) << " -> "
+          << NodesToString(merge_nodes);
 
   // Extract bodies and builds a If operator.
   TF_ASSIGN_OR_RETURN(Node * if_node,
-                      BuildAndAddXlaIfOp(cond_args, *merge_cluster, outputs));
-  TF_RETURN_IF_ERROR(AddInputEdges(cond_args, if_node));
-  TF_RETURN_IF_ERROR(AddOutputEdges(outputs, if_node));
-
-  // Remove the old nodes from the graph_ and contract the edges of the
-  // clustered graph.
-  for (auto in : merge_cluster->in_nodes) {
-    if (in != *switch_cluster) {
-      RemoveClusterNodes(in);
-    }
-  }
-  RemoveMergeNodes(merge_cluster);
-  RemoveUnusedArgs(cond_args.args);
-  auto in_nodes = merge_cluster->in_nodes;
-  for (auto it = in_nodes.begin(); it != in_nodes.end();) {
-    ContractEdge(*it++, merge_cluster);
-  }
-  ContractEdge(*switch_cluster, merge_cluster);
-  clusters_[if_node].Get() = ClusterHandle(merge_cluster->representative);
+                      BuildAndAddXlaIfOp(switch_nodes, merge_nodes, predicate));
+  TF_RETURN_IF_ERROR(AddInputEdges(switch_nodes, predicate, if_node));
+  TF_RETURN_IF_ERROR(AddOutputEdges(merge_nodes, if_node));
 
   return Status::OK();
 }
 
-std::vector<std::pair<int, FunctionalizeCond::Cluster*>>
-FunctionalizeCond::SortedMergeNodes() {
-  VLOG(2) << "ProcessClusteredGraph";
-  std::stack<std::pair<int, Cluster*>> stack;
-  // Initialize with the source node.
-  stack.push({0, &clustered_graph_[ClusterHandle(Graph::kSourceId)]});
-
-  // Perform a depth-first traversal of the clustered graph computing the
-  // switch-merge depth.
-  std::vector<std::pair<int, Cluster*>> queue;
-  std::unordered_set<Cluster*> visited;
-  while (!stack.empty()) {
-    Cluster* n = stack.top().second;
-    size_t depth = stack.top().first;
-    stack.pop();
-
-    auto inserted = visited.insert(n);
-    if (!inserted.second) {
-      continue;
-    }
-
-    size_t new_depth = depth;
-    if (!n->merge_nodes.empty()) {
-      queue.emplace_back(depth, n);
-      --new_depth;
-    }
-    if (!n->switch_nodes.empty()) {
-      ++new_depth;
-    }
-    for (Cluster* e : n->out_nodes) {
-      stack.emplace(new_depth, e);
-    }
-  }
-
-  // Sort in reverse order of switch-merge depth with ties broken by the
-  // ClusterHandle.
-  std::sort(queue.begin(), queue.end(),
-            [](const std::pair<int, Cluster*>& lhs,
-               const std::pair<int, Cluster*>& rhs) {
-              return std::tie(lhs.first, lhs.second->representative) >
-                     std::tie(rhs.first, rhs.second->representative);
-            });
-
-  return queue;
-}
-
 Status FunctionalizeCond::Functionalize(Graph* graph,
                                         FunctionLibraryDefinition* library) {
   VLOG(1) << "FunctionalizeCond::Functionalize";
   FunctionalizeCond fc(graph, library);
-  fc.CreateClusters();
-  if (fc.NoConditionals()) {
-    return Status::OK();
-  }
-  fc.CreateClusteredGraph();
-
-  auto queue = fc.SortedMergeNodes();
-  for (auto it = queue.begin(); it != queue.end();) {
-    Cluster* merge_cluster = (*it).second;
-    ++it;
-    if (merge_cluster->in_nodes.size() == 1) {
-      TF_RETURN_IF_ERROR(fc.RemoveTrivialMerge(merge_cluster));
-    } else {
-      TF_RETURN_IF_ERROR(fc.ConvertMergeToXlaIf(merge_cluster));
-    }
-
-    // Contract newly Merge free merge_cluster with incoming nodes without
-    // Switch or Merge nodes.
-    std::vector<Cluster*> in_nodes(merge_cluster->in_nodes.begin(),
-                                   merge_cluster->in_nodes.end());
-    for (auto in : in_nodes) {
-      if (in->merge_nodes.empty() && in->switch_nodes.empty()) {
-        fc.ContractEdge(in, merge_cluster);
-      }
-    }
-  }
-
-  if (!fc.switch_nodes_.empty()) {
-    return errors::Internal(
-        "Failed to functionalize control flow with Switch nodes remaining: ",
-        NodesToString(fc.switch_nodes_));
-  }
-  return Status::OK();
+  return fc.FunctionalizeInternal();
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
index 01d2b282751f387cfa9c8887cdeb48090c96bff4..71f12a13339b9b5495631b8f9350579f6a0785a3 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
@@ -109,7 +109,7 @@ TEST(FunctionalizeControlFlow, Conditional) {
     auto y = ops::Placeholder(scope.WithOpName("y"), DT_INT32);
     auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32);
     auto less = ops::Less(scope.WithOpName("cond/Less"), y, x);
-    auto if_op = ops::XlaIf(scope.WithOpName("cond/Merge_If"), less,
+    auto if_op = ops::XlaIf(scope.WithOpName("cond/Less_If"), less,
                             std::initializer_list<Input>{less, y, x}, then_fn,
                             else_fn, {DT_INT32});
     GraphDef expected;
diff --git a/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md b/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md
new file mode 100644
index 0000000000000000000000000000000000000000..82b3b46a2f1e97001d1e0c6b993ec243170bc7d8
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md
@@ -0,0 +1,242 @@
+**Supported operators for device: XLA_CPU_JIT**
+
+Operator                              | Type Constraint
+------------------------------------- | ---------------
+`Abs`                                 | `T={double,float,int32,int64}`
+`Acosh`                               | `T={complex64,double,float}`
+`Add`                                 | `T={complex64,double,float,int32,int64}`
+`AddN`                                | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`All`                                 | `Tidx={int32,int64}`
+`Angle`                               | `Tout={double,float}`<br>`T={complex64}`
+`Any`                                 | `Tidx={int32,int64}`
+`ApproximateEqual`                    | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`ArgMax`                              | `Tidx={int32,int64}`<br>`output_type={int32,int64}`<br>`T={float}`
+`ArgMin`                              | `Tidx={int32,int64}`<br>`output_type={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`Asinh`                               | `T={complex64,double,float}`
+`AssignAddVariableOp`                 | `dtype={complex64,double,float,int32,int64,uint32,uint64}`
+`AssignSubVariableOp`                 | `dtype={complex64,double,float,int32,int64,uint32,uint64}`
+`AssignVariableOp`                    | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Atan2`                               | `T={double,float}`
+`Atanh`                               | `T={complex64,double,float}`
+`AvgPool`                             | `T={double,float}`
+`AvgPool3D`                           | `T={double,float}`
+`AvgPool3DGrad`                       | `T={double,float}`
+`AvgPoolGrad`                         | `T={double,float}`
+`BatchMatMul`                         | `T={complex64,double,float,int32}`
+`BatchToSpace`                        | `Tidx={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`BatchToSpaceND`                      | `Tcrops={int32,int64}`<br>`Tblock_shape={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`BiasAdd`                             | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`BiasAddGrad`                         | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`BiasAddV1`                           | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`BitwiseAnd`                          | `T={int32,int64,uint32,uint64}`
+`BitwiseOr`                           | `T={int32,int64,uint32,uint64}`
+`BroadcastArgs`                       | `T={int32,int64}`
+`BroadcastGradientArgs`               | `T={int32,int64}`
+`Cast`                                | `DstT={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`SrcT={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Ceil`                                | `T={double,float}`
+`Cholesky`                            | `T={complex64,double,float}`
+`Complex`                             | `Tout={complex64}`<br>`T={double,float}`
+`ComplexAbs`                          | `Tout={double,float}`<br>`T={complex64}`
+`Concat`                              | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ConcatOffset`                        |
+`ConcatV2`                            | `Tidx={int32}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Conj`                                | `T={complex64}`
+`Const`                               | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ControlTrigger`                      |
+`Conv2D`                              | `T={float}`
+`Conv2DBackpropFilter`                | `T={float}`
+`Conv2DBackpropInput`                 | `T={float}`
+`Conv3D`                              | `T={double,float}`
+`Conv3DBackpropFilterV2`              | `T={double,float}`
+`Conv3DBackpropInputV2`               | `T={double,float}`
+`Cos`                                 | `T={complex64,double,float}`
+`Cosh`                                | `T={complex64,double,float}`
+`Cross`                               | `T={double,float,int32,int64,uint32,uint64}`
+`Cumprod`                             | `Tidx={int32,int64}`<br>`T={float}`
+`Cumsum`                              | `Tidx={int32,int64}`<br>`T={float}`
+`DepthToSpace`                        | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`DepthwiseConv2dNative`               | `T={double,float}`
+`DepthwiseConv2dNativeBackpropFilter` | `T={double,float}`
+`DepthwiseConv2dNativeBackpropInput`  | `T={double,float}`
+`Diag`                                | `T={complex64,double,float,int32,int64}`
+`DiagPart`                            | `T={complex64,double,float,int32,int64}`
+`Div`                                 | `T={complex64,double,float,int32,int64}`
+`DynamicStitch`                       | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Elu`                                 | `T={double,float}`
+`EluGrad`                             | `T={double,float}`
+`Equal`                               | `T={bool,complex64,double,float,int32,int64}`
+`Exp`                                 | `T={complex64,double,float}`
+`ExpandDims`                          | `Tdim={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Expm1`                               | `T={complex64,double,float}`
+`Fill`                                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Floor`                               | `T={double,float}`
+`FloorDiv`                            | `T={complex64,double,float,int32,int64}`
+`FloorMod`                            | `T={double,float,int32,int64}`
+`FusedBatchNorm`                      | `T={float}`
+`FusedBatchNormGrad`                  | `T={float}`
+`FusedBatchNormGradV2`                | `U={float}`<br>`T={float}`
+`FusedBatchNormV2`                    | `U={float}`<br>`T={float}`
+`Gather`                              | `Tindices={int32,int64}`<br>`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`GatherV2`                            | `Taxis={int32,int64}`<br>`Tindices={int32,int64}`<br>`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Greater`                             | `T={double,float,int32,int64,uint32,uint64}`
+`GreaterEqual`                        | `T={double,float,int32,int64,uint32,uint64}`
+`Identity`                            | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`IdentityN`                           | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Imag`                                | `Tout={double,float}`<br>`T={complex64}`
+`Inv`                                 | `T={complex64,double,float,int32,int64}`
+`Invert`                              | `T={int32,int64,uint32,uint64}`
+`InvertPermutation`                   | `T={int32}`
+`IsFinite`                            | `T={double,float}`
+`IsInf`                               | `T={double,float}`
+`IsNan`                               | `T={double,float}`
+`L2Loss`                              | `T={double,float}`
+`LRN`                                 | `T={float}`
+`LRNGrad`                             | `T={float}`
+`LeftShift`                           | `T={int32,int64,uint32,uint64}`
+`Less`                                | `T={double,float,int32,int64,uint32,uint64}`
+`LessEqual`                           | `T={double,float,int32,int64,uint32,uint64}`
+`LinSpace`                            | `Tidx={int32,int64}`<br>`T={double,float}`
+`Log`                                 | `T={complex64,double,float}`
+`Log1p`                               | `T={complex64,double,float}`
+`LogSoftmax`                          | `T={double,float}`
+`LogicalAnd`                          |
+`LogicalNot`                          |
+`LogicalOr`                           |
+`MatMul`                              | `T={complex64,double,float}`
+`MatrixDiag`                          | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`MatrixDiagPart`                      | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Max`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`MaxPool`                             | `T={double,float,int32,int64}`
+`MaxPool3D`                           | `T={float}`
+`MaxPool3DGrad`                       | `TInput={float}`<br>`T={float}`
+`MaxPoolGrad`                         | `T={double,float,int32,int64,uint32,uint64}`
+`Maximum`                             | `T={double,float,int32,int64}`
+`Mean`                                | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`Min`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`Minimum`                             | `T={double,float,int32,int64}`
+`MirrorPad`                           | `Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Mod`                                 | `T={double,float,int32,int64}`
+`Mul`                                 | `T={complex64,double,float,int32,int64}`
+`Multinomial`                         | `output_dtype={int32,int64}`<br>`T={double,float,int32,int64,uint32,uint64}`
+`Neg`                                 | `T={complex64,double,float,int32,int64}`
+`NoOp`                                |
+`NotEqual`                            | `T={bool,complex64,double,float,int32,int64}`
+`OneHot`                              | `TI={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`OnesLike`                            | `T={bool,complex64,double,float,int32,int64}`
+`Pack`                                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Pad`                                 | `Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`PadV2`                               | `Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ParallelDynamicStitch`               | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Pow`                                 | `T={complex64,double,float,int32,int64}`
+`PreventGradient`                     | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Prod`                                | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`QuantizeAndDequantizeV2`             | `T={double,float}`
+`RandomStandardNormal`                | `dtype={float}`
+`RandomUniform`                       | `T={int32,int64}`<br>`dtype={double,float}`
+`RandomUniformInt`                    | `T={int32,int64}`<br>`Tout={int32,int64}`
+`Range`                               | `Tidx={double,float,int32,int64}`
+`Rank`                                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ReadVariableOp`                      | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Real`                                | `Tout={double,float}`<br>`T={complex64}`
+`RealDiv`                             | `T={complex64,double,float,int32,int64}`
+`Reciprocal`                          | `T={complex64,double,float,int32,int64}`
+`ReciprocalGrad`                      | `T={complex64,double,float}`
+`Relu`                                | `T={double,float,int32,int64,uint32,uint64}`
+`Relu6`                               | `T={double,float,int32,int64,uint32,uint64}`
+`Relu6Grad`                           | `T={double,float,int32,int64,uint32,uint64}`
+`ReluGrad`                            | `T={double,float,int32,int64,uint32,uint64}`
+`Reshape`                             | `Tshape={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ResourceApplyAdagrad`                | `T={double,float}`
+`ResourceApplyAdam`                   | `T={double,float}`
+`ResourceApplyFtrl`                   | `T={double,float}`
+`ResourceApplyFtrlV2`                 | `T={double,float}`
+`ResourceApplyGradientDescent`        | `T={double,float}`
+`ResourceApplyMomentum`               | `T={double,float}`
+`ResourceApplyRMSProp`                | `T={double,float}`
+`ResourceGather`                      | `Tindices={int32,int64}`<br>`dtype={complex64,double,float,int32,int64,uint32,uint64}`
+`ResourceStridedSliceAssign`          | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Reverse`                             | `T={bool,complex64,double,float,int32,int64}`
+`ReverseV2`                           | `T={bool,complex64,double,float,int32,int64}`<br>`Tidx={int32,int64}`
+`RightShift`                          | `T={int32,int64,uint32,uint64}`
+`Rint`                                | `T={double,float}`
+`Round`                               | `T={complex64,double,float,int32,int64}`
+`Rsqrt`                               | `T={complex64,double,float}`
+`RsqrtGrad`                           | `T={complex64,double,float}`
+`Select`                              | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Selu`                                | `T={double,float}`
+`SeluGrad`                            | `T={double,float}`
+`Shape`                               | `out_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ShapeN`                              | `out_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Sigmoid`                             | `T={complex64,double,float}`
+`SigmoidGrad`                         | `T={complex64,double,float}`
+`Sign`                                | `T={complex64,double,float,int32,int64}`
+`Sin`                                 | `T={complex64,double,float}`
+`Sinh`                                | `T={complex64,double,float}`
+`Size`                                | `out_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Slice`                               | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Softmax`                             | `T={double,float}`
+`SoftmaxCrossEntropyWithLogits`       | `T={double,float}`
+`Softplus`                            | `T={double,float,int32,int64,uint32,uint64}`
+`SoftplusGrad`                        | `T={double,float,int32,int64,uint32,uint64}`
+`Softsign`                            | `T={double,float,int32,int64,uint32,uint64}`
+`SoftsignGrad`                        | `T={double,float,int32,int64,uint32,uint64}`
+`SpaceToBatch`                        | `Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`SpaceToBatchND`                      | `Tblock_shape={int32,int64}`<br>`Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`SpaceToDepth`                        | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`SparseMatMul`                        | `Tb={float}`<br>`Ta={float}`
+`SparseSoftmaxCrossEntropyWithLogits` | `Tlabels={int32,int64}`<br>`T={double,float}`
+`Split`                               | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`SplitV`                              | `Tlen={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Sqrt`                                | `T={complex64,double,float}`
+`SqrtGrad`                            | `T={complex64,double,float}`
+`Square`                              | `T={complex64,double,float,int32,int64}`
+`SquaredDifference`                   | `T={complex64,double,float,int32,int64}`
+`Squeeze`                             | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StackCloseV2`                        |
+`StackPopV2`                          | `elem_type={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StackPushV2`                         | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StackV2`                             | `elem_type={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StatelessRandomNormal`               | `Tseed={int32}`<br>`T={int32,int64}`<br>`dtype={float}`
+`StatelessRandomUniform`              | `Tseed={int32}`<br>`T={int32,int64}`<br>`dtype={float}`
+`StopGradient`                        | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StridedSlice`                        | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StridedSliceGrad`                    | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Sub`                                 | `T={complex64,double,float,int32,int64}`
+`Sum`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`SymbolicGradient`                    | `Tout={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`Tin={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Tan`                                 | `T={complex64,double,float,int32,int64}`
+`Tanh`                                | `T={complex64,double,float}`
+`TanhGrad`                            | `T={complex64,double,float}`
+`TensorArrayCloseV3`                  |
+`TensorArrayConcatV3`                 | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayGatherV3`                 | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayGradV3`                   |
+`TensorArrayReadV3`                   | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayScatterV3`                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArraySizeV3`                   |
+`TensorArraySplitV3`                  | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayV3`                       | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayWriteV3`                  | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Tile`                                | `Tmultiples={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Transpose`                           | `Tperm={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TruncateDiv`                         | `T={complex64,double,float,int32,int64}`
+`TruncateMod`                         | `T={double,float,int32,int64}`
+`TruncatedNormal`                     | `T={int32,int64}`<br>`dtype={double,float}`
+`Unpack`                              | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`UnsortedSegmentSum`                  | `Tnumsegments={int32,int64}`<br>`Tindices={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`VarIsInitializedOp`                  |
+`VariableShape`                       | `out_type={int32,int64}`
+`XlaWhile`                            | `T={bool,complex64,double,float,int32,int64,resource,uint32,uint64}`
+`ZerosLike`                           | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_Arg`                                | `T={bool,complex64,double,float,int32,int64,resource,uint32,uint64}`
+`_ArrayToList`                        | `out_types={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_ListToArray`                        | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`Tin={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_Retval`                             | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_XLARecv`                            | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_XLASend`                            | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+
+To regenerate this table, run:
+
+```shell
+bazel run -c opt -- tensorflow/compiler/tf2xla:tf2xla_supported_ops --device=XLA_CPU_JIT
+```
diff --git a/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md b/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md
new file mode 100644
index 0000000000000000000000000000000000000000..d4b7621ad2858fe17e93d292dd807e4f7c1c336b
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md
@@ -0,0 +1,238 @@
+**Supported operators for device: XLA_GPU_JIT**
+
+Operator                              | Type Constraint
+------------------------------------- | ---------------
+`Abs`                                 | `T={double,float,int32,int64}`
+`Acosh`                               | `T={complex64,double,float}`
+`Add`                                 | `T={complex64,double,float,int32,int64}`
+`AddN`                                | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`All`                                 | `Tidx={int32,int64}`
+`Angle`                               | `Tout={double,float}`<br>`T={complex64}`
+`Any`                                 | `Tidx={int32,int64}`
+`ApproximateEqual`                    | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`ArgMax`                              | `Tidx={int32,int64}`<br>`output_type={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`ArgMin`                              | `Tidx={int32,int64}`<br>`output_type={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`Asinh`                               | `T={complex64,double,float}`
+`AssignAddVariableOp`                 | `dtype={complex64,double,float,int32,int64,uint32,uint64}`
+`AssignSubVariableOp`                 | `dtype={complex64,double,float,int32,int64,uint32,uint64}`
+`AssignVariableOp`                    | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Atan2`                               | `T={double,float}`
+`Atanh`                               | `T={complex64,double,float}`
+`AvgPool`                             | `T={double,float}`
+`AvgPool3D`                           | `T={double,float}`
+`AvgPool3DGrad`                       | `T={double,float}`
+`AvgPoolGrad`                         | `T={double,float}`
+`BatchMatMul`                         | `T={complex64,double,float,int32}`
+`BatchToSpace`                        | `Tidx={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`BatchToSpaceND`                      | `Tcrops={int32,int64}`<br>`Tblock_shape={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`BiasAdd`                             | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`BiasAddGrad`                         | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`BiasAddV1`                           | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`BitwiseAnd`                          | `T={int32,int64,uint32,uint64}`
+`BitwiseOr`                           | `T={int32,int64,uint32,uint64}`
+`BroadcastArgs`                       | `T={int32,int64}`
+`BroadcastGradientArgs`               | `T={int32,int64}`
+`Cast`                                | `DstT={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`SrcT={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Ceil`                                | `T={double,float}`
+`Cholesky`                            | `T={complex64,double,float}`
+`Complex`                             | `Tout={complex64}`<br>`T={double,float}`
+`ComplexAbs`                          | `Tout={double,float}`<br>`T={complex64}`
+`Concat`                              | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ConcatOffset`                        |
+`ConcatV2`                            | `Tidx={int32}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Conj`                                | `T={complex64}`
+`Const`                               | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ControlTrigger`                      |
+`Conv2D`                              | `T={float}`
+`Conv2DBackpropFilter`                | `T={float}`
+`Conv2DBackpropInput`                 | `T={float}`
+`Conv3D`                              | `T={double,float}`
+`Conv3DBackpropFilterV2`              | `T={double,float}`
+`Conv3DBackpropInputV2`               | `T={double,float}`
+`Cos`                                 | `T={complex64,double,float}`
+`Cosh`                                | `T={complex64,double,float}`
+`Cross`                               | `T={double,float,int32,int64,uint32,uint64}`
+`Cumprod`                             | `Tidx={int32,int64}`<br>`T={float}`
+`Cumsum`                              | `Tidx={int32,int64}`<br>`T={float}`
+`DepthToSpace`                        | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`DepthwiseConv2dNative`               | `T={double,float}`
+`DepthwiseConv2dNativeBackpropFilter` | `T={double,float}`
+`DepthwiseConv2dNativeBackpropInput`  | `T={double,float}`
+`Diag`                                | `T={complex64,double,float,int32,int64}`
+`DiagPart`                            | `T={complex64,double,float,int32,int64}`
+`Div`                                 | `T={complex64,double,float,int32,int64}`
+`DynamicStitch`                       | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Elu`                                 | `T={double,float}`
+`EluGrad`                             | `T={double,float}`
+`Equal`                               | `T={bool,complex64,double,float,int32,int64}`
+`Exp`                                 | `T={complex64,double,float}`
+`ExpandDims`                          | `Tdim={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Expm1`                               | `T={complex64,double,float}`
+`Fill`                                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Floor`                               | `T={double,float}`
+`FloorDiv`                            | `T={complex64,double,float,int32,int64}`
+`FloorMod`                            | `T={double,float,int32,int64}`
+`FusedBatchNorm`                      | `T={float}`
+`FusedBatchNormGrad`                  | `T={float}`
+`FusedBatchNormGradV2`                | `U={float}`<br>`T={float}`
+`FusedBatchNormV2`                    | `U={float}`<br>`T={float}`
+`Gather`                              | `Tindices={int32,int64}`<br>`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`GatherV2`                            | `Taxis={int32,int64}`<br>`Tindices={int32,int64}`<br>`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Greater`                             | `T={double,float,int32,int64,uint32,uint64}`
+`GreaterEqual`                        | `T={double,float,int32,int64,uint32,uint64}`
+`Identity`                            | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`IdentityN`                           | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Imag`                                | `Tout={double,float}`<br>`T={complex64}`
+`Inv`                                 | `T={complex64,double,float,int32,int64}`
+`Invert`                              | `T={int32,int64,uint32,uint64}`
+`InvertPermutation`                   | `T={int32}`
+`IsFinite`                            | `T={double,float}`
+`IsInf`                               | `T={double,float}`
+`IsNan`                               | `T={double,float}`
+`L2Loss`                              | `T={double,float}`
+`LRN`                                 | `T={float}`
+`LRNGrad`                             | `T={float}`
+`LeftShift`                           | `T={int32,int64,uint32,uint64}`
+`Less`                                | `T={double,float,int32,int64,uint32,uint64}`
+`LessEqual`                           | `T={double,float,int32,int64,uint32,uint64}`
+`LinSpace`                            | `Tidx={int32,int64}`<br>`T={double,float}`
+`Log`                                 | `T={complex64,double,float}`
+`Log1p`                               | `T={complex64,double,float}`
+`LogSoftmax`                          | `T={double,float}`
+`LogicalAnd`                          |
+`LogicalNot`                          |
+`LogicalOr`                           |
+`MatMul`                              | `T={complex64,double,float}`
+`MatrixDiag`                          | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`MatrixDiagPart`                      | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Max`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`MaxPool`                             | `T={double,float,int32,int64}`
+`MaxPool3D`                           | `T={float}`
+`MaxPool3DGrad`                       | `TInput={float}`<br>`T={float}`
+`MaxPoolGrad`                         | `T={double,float,int32,int64,uint32,uint64}`
+`Maximum`                             | `T={double,float,int32,int64}`
+`Mean`                                | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`Min`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`Minimum`                             | `T={double,float,int32,int64}`
+`MirrorPad`                           | `Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Mod`                                 | `T={double,float,int32,int64}`
+`Mul`                                 | `T={complex64,double,float,int32,int64}`
+`Multinomial`                         | `output_dtype={int32,int64}`<br>`T={double,float,int32,int64,uint32,uint64}`
+`Neg`                                 | `T={complex64,double,float,int32,int64}`
+`NoOp`                                |
+`NotEqual`                            | `T={bool,complex64,double,float,int32,int64}`
+`OneHot`                              | `TI={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`OnesLike`                            | `T={bool,complex64,double,float,int32,int64}`
+`Pack`                                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Pad`                                 | `Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`PadV2`                               | `Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ParallelDynamicStitch`               | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Pow`                                 | `T={complex64,double,float,int32,int64}`
+`PreventGradient`                     | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Prod`                                | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`QuantizeAndDequantizeV2`             | `T={double,float}`
+`Range`                               | `Tidx={double,float,int32,int64}`
+`Rank`                                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ReadVariableOp`                      | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Real`                                | `Tout={double,float}`<br>`T={complex64}`
+`RealDiv`                             | `T={complex64,double,float,int32,int64}`
+`Reciprocal`                          | `T={complex64,double,float,int32,int64}`
+`ReciprocalGrad`                      | `T={complex64,double,float}`
+`Relu`                                | `T={double,float,int32,int64,uint32,uint64}`
+`Relu6`                               | `T={double,float,int32,int64,uint32,uint64}`
+`Relu6Grad`                           | `T={double,float,int32,int64,uint32,uint64}`
+`ReluGrad`                            | `T={double,float,int32,int64,uint32,uint64}`
+`Reshape`                             | `Tshape={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ResourceApplyAdagrad`                | `T={double,float}`
+`ResourceApplyAdam`                   | `T={double,float}`
+`ResourceApplyFtrl`                   | `T={double,float}`
+`ResourceApplyFtrlV2`                 | `T={double,float}`
+`ResourceApplyGradientDescent`        | `T={double,float}`
+`ResourceApplyMomentum`               | `T={double,float}`
+`ResourceApplyRMSProp`                | `T={double,float}`
+`ResourceGather`                      | `Tindices={int32,int64}`<br>`dtype={complex64,double,float,int32,int64,uint32,uint64}`
+`ResourceStridedSliceAssign`          | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Reverse`                             | `T={bool,complex64,double,float,int32,int64}`
+`ReverseV2`                           | `T={bool,complex64,double,float,int32,int64}`<br>`Tidx={int32,int64}`
+`RightShift`                          | `T={int32,int64,uint32,uint64}`
+`Rint`                                | `T={double,float}`
+`Round`                               | `T={complex64,double,float,int32,int64}`
+`Rsqrt`                               | `T={complex64,double,float}`
+`RsqrtGrad`                           | `T={complex64,double,float}`
+`Select`                              | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Selu`                                | `T={double,float}`
+`SeluGrad`                            | `T={double,float}`
+`Shape`                               | `out_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ShapeN`                              | `out_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Sigmoid`                             | `T={complex64,double,float}`
+`SigmoidGrad`                         | `T={complex64,double,float}`
+`Sign`                                | `T={complex64,double,float,int32,int64}`
+`Sin`                                 | `T={complex64,double,float}`
+`Sinh`                                | `T={complex64,double,float}`
+`Size`                                | `out_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Slice`                               | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Softmax`                             | `T={double,float}`
+`SoftmaxCrossEntropyWithLogits`       | `T={double,float}`
+`Softplus`                            | `T={double,float,int32,int64,uint32,uint64}`
+`SoftplusGrad`                        | `T={double,float,int32,int64,uint32,uint64}`
+`Softsign`                            | `T={double,float,int32,int64,uint32,uint64}`
+`SoftsignGrad`                        | `T={double,float,int32,int64,uint32,uint64}`
+`SpaceToBatch`                        | `Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`SpaceToBatchND`                      | `Tblock_shape={int32,int64}`<br>`Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`SpaceToDepth`                        | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`SparseMatMul`                        | `Tb={float}`<br>`Ta={float}`
+`SparseSoftmaxCrossEntropyWithLogits` | `Tlabels={int32,int64}`<br>`T={double,float}`
+`Split`                               | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`SplitV`                              | `Tlen={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Sqrt`                                | `T={complex64,double,float}`
+`SqrtGrad`                            | `T={complex64,double,float}`
+`Square`                              | `T={complex64,double,float,int32,int64}`
+`SquaredDifference`                   | `T={complex64,double,float,int32,int64}`
+`Squeeze`                             | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StackCloseV2`                        |
+`StackPopV2`                          | `elem_type={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StackPushV2`                         | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StackV2`                             | `elem_type={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StatelessRandomNormal`               | `Tseed={int32}`<br>`T={int32,int64}`<br>`dtype={float}`
+`StatelessRandomUniform`              | `Tseed={int32}`<br>`T={int32,int64}`<br>`dtype={float}`
+`StopGradient`                        | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StridedSlice`                        | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StridedSliceGrad`                    | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Sub`                                 | `T={complex64,double,float,int32,int64}`
+`Sum`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`SymbolicGradient`                    | `Tout={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`Tin={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Tan`                                 | `T={complex64,double,float,int32,int64}`
+`Tanh`                                | `T={complex64,double,float}`
+`TanhGrad`                            | `T={complex64,double,float}`
+`TensorArrayCloseV3`                  |
+`TensorArrayConcatV3`                 | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayGatherV3`                 | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayGradV3`                   |
+`TensorArrayReadV3`                   | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayScatterV3`                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArraySizeV3`                   |
+`TensorArraySplitV3`                  | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayV3`                       | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayWriteV3`                  | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Tile`                                | `Tmultiples={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Transpose`                           | `Tperm={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TruncateDiv`                         | `T={complex64,double,float,int32,int64}`
+`TruncateMod`                         | `T={double,float,int32,int64}`
+`Unpack`                              | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`UnsortedSegmentSum`                  | `Tnumsegments={int32,int64}`<br>`Tindices={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`VarIsInitializedOp`                  |
+`VariableShape`                       | `out_type={int32,int64}`
+`XlaWhile`                            | `T={bool,complex64,double,float,int32,int64,resource,uint32,uint64}`
+`ZerosLike`                           | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_Arg`                                | `T={bool,complex64,double,float,int32,int64,resource,uint32,uint64}`
+`_ArrayToList`                        | `out_types={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_ListToArray`                        | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`Tin={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_Retval`                             | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_XLARecv`                            | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_XLASend`                            | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+
+To regenerate this table, run:
+
+```shell
+bazel run -c opt -- tensorflow/compiler/tf2xla:tf2xla_supported_ops --device=XLA_GPU_JIT
+```
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 948d7f0b407124613dbd58efb2e189b5fca4f6ed..3e24cf042e17ad4e212d82ac4f24fec06a6c780f 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -35,6 +35,7 @@ tf_kernel_library(
         "gather_op.cc",
         "gather_op_helpers.h",
         "identity_op.cc",
+        "image_resize_ops.cc",
         "index_ops.cc",
         "l2loss_op.cc",
         "lrn_ops.cc",
@@ -54,17 +55,20 @@ tf_kernel_library(
         "reshape_op.cc",
         "retval_op.cc",
         "reverse_op.cc",
+        "scan_ops.cc",
         "segment_reduction_ops.cc",
         "select_op.cc",
         "sendrecv_ops.cc",
         "sequence_ops.cc",
         "shape_op.cc",
+        "shape_util.cc",
         "slice_op.cc",
         "softmax_op.cc",
         "spacetobatch_op.cc",
         "spacetodepth_op.cc",
         "split_op.cc",
         "stack_ops.cc",
+        "stateless_random_ops.cc",
         "strided_slice_op.cc",
         "tensor_array_ops.cc",
         "tile_ops.cc",
@@ -77,6 +81,7 @@ tf_kernel_library(
     hdrs = [
         "gather_op.h",
         "index_ops.h",
+        "shape_util.h",
     ],
     deps = [
         ":while_op",
@@ -84,7 +89,9 @@ tf_kernel_library(
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/lib:batch_dot",
         "//tensorflow/compiler/tf2xla/lib:cholesky",
+        "//tensorflow/compiler/tf2xla/lib:util",
         "//tensorflow/compiler/tf2xla/ops:sendrecv_ops",
+        "//tensorflow/compiler/xla:array4d",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:util",
@@ -93,9 +100,11 @@ tf_kernel_library(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
         "//tensorflow/core:framework",
+        "//tensorflow/core:image_ops_op_lib",
         "//tensorflow/core:lib",
         "//tensorflow/core:linalg_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:stateless_random_ops_op_lib",
         "//tensorflow/core/kernels:bounds_check",
         "//tensorflow/core/kernels:concat_lib",
         "//tensorflow/core/kernels:constant_op",
diff --git a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
index 248e9d111e556dcdd75581aa6562a66fc8b57063..a249b1869f547f8e5aa725f9f5cf391b10429928 100644
--- a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 // XLA implementation of BatchNorm operations.
-#include "tensorflow/compiler/tf2xla/literal_util.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
@@ -26,43 +26,63 @@ namespace {
 class FusedBatchNormOp : public XlaOpKernel {
  public:
   explicit FusedBatchNormOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
-    string data_format;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("epsilon", &epsilon_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("is_training", &is_training_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format));
-    TensorFormat tensor_format;
-    if (ctx->GetAttr("data_format", &data_format).ok()) {
-      OP_REQUIRES(ctx, FormatFromString(data_format, &tensor_format),
-                  errors::InvalidArgument("Invalid data format"));
-      OP_REQUIRES(
-          ctx, (tensor_format == FORMAT_NHWC || tensor_format == FORMAT_NCHW),
-          errors::InvalidArgument("Not supported format"));
-      feature_index_ = GetTensorFeatureDimIndex(/*num_dims=*/4, tensor_format);
-    }
+    string data_format_str;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES(
+        ctx, FormatFromString(data_format_str, &data_format_),
+        errors::InvalidArgument("Invalid data format: ", data_format_str));
+    OP_REQUIRES(ctx,
+                (data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW),
+                errors::InvalidArgument(
+                    "Unsupported data format ", ToString(data_format_),
+                    "; supported formats are NHWC and NCHW"));
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
+    xla::PrimitiveType input_type;
+    OP_REQUIRES_OK(ctx,
+                   DataTypeToPrimitiveType(ctx->input_type(0), &input_type));
+    xla::PrimitiveType scale_type;
+    OP_REQUIRES_OK(ctx,
+                   DataTypeToPrimitiveType(ctx->input_type(1), &scale_type));
+
+    xla::ComputationBuilder* builder = ctx->builder();
+
+    xla::ComputationDataHandle input = ctx->Input(0);
+    TensorShape input_shape = ctx->InputShape(0);
+
+    int feature_index =
+        GetTensorFeatureDimIndex(input_shape.dims(), data_format_);
+
+    // TODO(b/69928690): support mixed precision in the XLA batch normalization
+    // operators. As a workaround, cast everything to the statistics type (which
+    // may be more precise than the input type).
+    input = builder->ConvertElementType(input, scale_type);
+
     if (is_training_) {
-      xla::ComputationDataHandle output = ctx->builder()->BatchNormTraining(
-          ctx->Input(0), ctx->Input(1), ctx->Input(2), epsilon_,
-          feature_index_);
+      xla::ComputationDataHandle output = builder->BatchNormTraining(
+          input, ctx->Input(1), ctx->Input(2), epsilon_, feature_index);
 
       // In training mode, outputs the normalized value as well as the
       // calculated mean and variance.
-      for (int i = 0; i < 3; i++) {
-        ctx->SetOutput(i, ctx->builder()->GetTupleElement(output, i));
-      }
+      ctx->SetOutput(0, builder->ConvertElementType(
+                            builder->GetTupleElement(output, 0), input_type));
+      ctx->SetOutput(1, builder->GetTupleElement(output, 1));
+      ctx->SetOutput(2, builder->GetTupleElement(output, 2));
+
       // Output 3 and 4 for "FusedBatchNorm" are currently marked as "reserved
       // space 1 & 2". They are used to pass the per-batch mean and
       // variance to the gradient. Here we maintain the same behavior by setting
       // them to the mean and variance calculated by BatchNormTraining.
-      ctx->SetOutput(3, ctx->builder()->GetTupleElement(output, 1));
-      ctx->SetOutput(4, ctx->builder()->GetTupleElement(output, 2));
+      ctx->SetOutput(3, builder->GetTupleElement(output, 1));
+      ctx->SetOutput(4, builder->GetTupleElement(output, 2));
     } else {
-      xla::ComputationDataHandle output = ctx->builder()->BatchNormInference(
-          ctx->Input(0), ctx->Input(1), ctx->Input(2), ctx->Input(3),
-          ctx->Input(4), epsilon_, feature_index_);
-      ctx->SetOutput(0, output);
+      xla::ComputationDataHandle output = builder->BatchNormInference(
+          input, ctx->Input(1), ctx->Input(2), ctx->Input(3), ctx->Input(4),
+          epsilon_, feature_index);
+      ctx->SetOutput(0, builder->ConvertElementType(output, input_type));
       // Directly send input to output as mean and variance in inference mode.
       ctx->SetOutput(1, ctx->Input(3));
       ctx->SetOutput(2, ctx->Input(4));
@@ -73,55 +93,113 @@ class FusedBatchNormOp : public XlaOpKernel {
 
  private:
   float epsilon_;
-  int64 feature_index_;
+  TensorFormat data_format_;
   bool is_training_;
 };
 
 REGISTER_XLA_OP(Name("FusedBatchNorm"), FusedBatchNormOp);
+REGISTER_XLA_OP(Name("FusedBatchNormV2"), FusedBatchNormOp);
 
 class FusedBatchNormGradOp : public XlaOpKernel {
  public:
   explicit FusedBatchNormGradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
-    string data_format;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("epsilon", &epsilon_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format));
-    bool is_training;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("is_training", &is_training));
-    CHECK(is_training) << "FusedBatchNormGradOp with is_training=False cannot "
-                          "be used with XLA for now!";
-    TensorFormat tensor_format;
-    if (ctx->GetAttr("data_format", &data_format).ok()) {
-      OP_REQUIRES(ctx, FormatFromString(data_format, &tensor_format),
-                  errors::InvalidArgument("Invalid data format"));
-      OP_REQUIRES(
-          ctx, (tensor_format == FORMAT_NHWC || tensor_format == FORMAT_NCHW),
-          errors::InvalidArgument("Not supported format"));
-      feature_index_ = GetTensorFeatureDimIndex(4, tensor_format);
-    }
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("is_training", &is_training_));
+    string data_format_str;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES(
+        ctx, FormatFromString(data_format_str, &data_format_),
+        errors::InvalidArgument("Invalid data format: ", data_format_str));
+    OP_REQUIRES(ctx,
+                (data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW),
+                errors::InvalidArgument(
+                    "Unsupported data format ", ToString(data_format_),
+                    "; supported formats are NHWC and NCHW"));
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    auto grad_output = ctx->Input(0);
-    auto activation = ctx->Input(1);
+    xla::ComputationBuilder* b = ctx->builder();
+
+    auto grad_backprop = ctx->Input(0);
+    auto activations = ctx->Input(1);
     auto scale = ctx->Input(2);
     auto mean = ctx->Input(3);
     auto var = ctx->Input(4);
-    xla::ComputationDataHandle output = ctx->builder()->BatchNormGrad(
-        activation, scale, mean, var, grad_output, epsilon_, feature_index_);
 
-    for (int i = 0; i < 3; i++) {
-      ctx->SetOutput(i, ctx->builder()->GetTupleElement(output, i));
+    TensorShape input_shape = ctx->InputShape(0);
+    int feature_index =
+        GetTensorFeatureDimIndex(input_shape.dims(), data_format_);
+
+    DataType input_dtype = ctx->input_type(0);
+    DataType scale_dtype = ctx->input_type(2);
+    xla::PrimitiveType input_type;
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(input_dtype, &input_type));
+    xla::PrimitiveType scale_type;
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(scale_dtype, &scale_type));
+
+    // TODO(b/69928690): support mixed precision in the XLA batch normalization
+    // operators. For now, cast everything to the statistics type (which
+    // may be more precise than the input type).
+    grad_backprop = b->ConvertElementType(grad_backprop, scale_type);
+    activations = b->ConvertElementType(activations, scale_type);
+
+    xla::ComputationDataHandle x_backprop;
+    xla::ComputationDataHandle scale_backprop;
+    xla::ComputationDataHandle offset_backprop;
+    if (is_training_) {
+      xla::ComputationDataHandle output =
+          b->BatchNormGrad(activations, scale, mean, var, grad_backprop,
+                           epsilon_, feature_index);
+
+      x_backprop = b->GetTupleElement(output, 0);
+      scale_backprop = b->GetTupleElement(output, 1);
+      offset_backprop = b->GetTupleElement(output, 2);
+    } else {
+      // Reduce over all dimensions except the feature dim.
+      std::vector<int64> reduction_dims(input_shape.dims() - 1);
+      std::iota(reduction_dims.begin(), reduction_dims.begin() + feature_index,
+                0);
+      std::iota(reduction_dims.begin() + feature_index, reduction_dims.end(),
+                feature_index + 1);
+      // offset_backprop  = sum(y_backprop)
+      // scale_backprop = y_backprop * ((x - pop_mean) * rsqrt(pop_var +
+      // epsilon))
+      // x_backprop = y_backprop * (scale * rsqrt(pop_var + epsilon))
+      offset_backprop =
+          b->Reduce(grad_backprop, XlaHelpers::Zero(b, scale_dtype),
+                    *ctx->GetOrCreateAdd(scale_dtype), reduction_dims);
+
+      // scratch1 = rsqrt(pop_var + epsilon)
+      auto neg_half = XlaHelpers::FloatLiteral(b, scale_dtype, -0.5);
+      auto scratch1 =
+          b->Pow(b->Add(var, b->ConstantR0<float>(epsilon_)), neg_half);
+
+      // scratch2 = sum(y_backprop * (x - mean))
+      auto scratch2 = b->Reduce(
+          b->Mul(grad_backprop, b->Sub(activations, mean, {feature_index})),
+          XlaHelpers::Zero(b, scale_dtype), *ctx->GetOrCreateAdd(scale_dtype),
+          reduction_dims);
+
+      x_backprop =
+          b->Mul(grad_backprop, b->Mul(scratch1, scale), {feature_index});
+      scale_backprop = b->Mul(scratch1, scratch2);
     }
-    ctx->SetOutput(3, ctx->builder()->GetTupleElement(output, 1));
-    ctx->SetOutput(4, ctx->builder()->GetTupleElement(output, 2));
+
+    ctx->SetOutput(0, b->ConvertElementType(x_backprop, input_type));
+    ctx->SetOutput(1, scale_backprop);
+    ctx->SetOutput(2, offset_backprop);
+    ctx->SetConstantOutput(3, Tensor(scale_dtype, {}));
+    ctx->SetConstantOutput(4, Tensor(scale_dtype, {}));
   }
 
  private:
+  TensorFormat data_format_;
   float epsilon_;
-  int64 feature_index_;
+  bool is_training_;
 };
 
 REGISTER_XLA_OP(Name("FusedBatchNormGrad"), FusedBatchNormGradOp);
+REGISTER_XLA_OP(Name("FusedBatchNormGradV2"), FusedBatchNormGradOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
index 1de91924326464338352b1ac9edf77141f25ad35..2436a6074a11ad66387b232dd1c5aa135875bfc3 100644
--- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
 
 namespace tensorflow {
 namespace {
@@ -75,7 +76,7 @@ static xla::ComputationDataHandle FloorDivImpl(xla::ComputationBuilder* b,
   auto abs_y = b->Abs(y);
   auto t = b->Neg(b->Sub(b->Add(abs_x, abs_y), one));
   auto result = b->Select(different_sign, b->Div(t, abs_y), b->Div(x, y));
-  if (dtype == DT_FLOAT || dtype == DT_DOUBLE) {
+  if (DataTypeIsFloating(dtype)) {
     result = b->Floor(result);
   }
   return result;
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
index 885f716afafca7ba23770e38f6693eed1ba50982..aaddbe811c6fbf6da296640eb5a75e82b2fedcfa 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
@@ -46,72 +46,130 @@ TensorShape ExpandedFilterShapeForDepthwiseConvolution(
   return expanded_shape;
 }
 
+// Broadcast zeros to ExpandedFilterShapeForDepthwiseConvolution.
+xla::ComputationDataHandle CreateExpandedZero(
+    const TensorShape& filter_shape, DataType dtype,
+    xla::ComputationBuilder* builder) {
+  TensorShape expanded_filter_shape =
+      ExpandedFilterShapeForDepthwiseConvolution(filter_shape);
+  return builder->Broadcast(XlaHelpers::Zero(builder, dtype),
+                            expanded_filter_shape.dim_sizes());
+}
+
+// Create a mask for depthwise convolution that will make a normal convolution
+// produce the same results as a depthwise convolution. For a [2, 2, 3, 2]
+// depthwise filter this returns a [2, 2, 3, 6] tesnsor
+//   1 1 0 0 0 0   1 1 0 0 0 0
+//   0 0 1 1 0 0   0 0 1 1 0 0
+//   0 0 0 0 1 1   0 0 0 0 1 1
+//
+//   1 1 0 0 0 0   1 1 0 0 0 0
+//   0 0 1 1 0 0   0 0 1 1 0 0
+//   0 0 0 0 1 1   0 0 0 0 1 1
+//
+// The first step is to create a one tensor, A, that is [3]
+//   0 1 2
+//
+// and another tensor, B,  that is [3 * 2]
+//   0 1 2 3 4 5
+//
+// and divide B it by 2 to get
+//   0 0 1 1 2 2
+//
+// then we broadcast the B to [2, 2, 3, 3 * 2]
+//   0 0 1 1 2 2   0 0 1 1 2 2
+//   0 0 1 1 2 2   0 0 1 1 2 2
+//   0 0 1 1 2 2   0 0 1 1 2 2
+//
+//   0 0 1 1 2 2   0 0 1 1 2 2
+//   0 0 1 1 2 2   0 0 1 1 2 2
+//   0 0 1 1 2 2   0 0 1 1 2 2
+//
+// Finally compare A and broadcasted B in dimension 2 amd return the result at
+// the beginning of the comment.
+xla::ComputationDataHandle CreateExpandedFilterMask(
+    const TensorShape& filter_shape, xla::ComputationBuilder* builder) {
+  TensorShape expanded_filter_shape =
+      ExpandedFilterShapeForDepthwiseConvolution(filter_shape);
+  int64 depthwise_multiplier = filter_shape.dim_size(filter_shape.dims() - 1);
+  int64 input_feature = filter_shape.dim_size(filter_shape.dims() - 2);
+
+  // Create a M sized linspace and an M*N sized linspace that will be
+  // broadcasted into perpendicular dimensions and compared.
+  xla::ComputationDataHandle input_feature_iota;
+  // DT_INT32 Iota will always return status::OK().
+  TF_CHECK_OK(XlaHelpers::Iota(builder, DataType::DT_INT32, input_feature,
+                               &input_feature_iota));
+  xla::ComputationDataHandle expanded_feature_iota;
+  TF_CHECK_OK(XlaHelpers::Iota(builder, DataType::DT_INT32,
+                               input_feature * depthwise_multiplier,
+                               &expanded_feature_iota));
+
+  // Divide the M*N sized linspace by the depthwise_multiplier to create
+  // [0 0 1 1 2 2] in the example in the function comment.
+  expanded_feature_iota =
+      builder->Div(expanded_feature_iota,
+                   XlaHelpers::IntegerLiteral(builder, DataType::DT_INT32,
+                                              depthwise_multiplier));
+
+  // Broadcast the N*M linspace to [H, W, ..., M, M*N].
+  auto expanded_feature_broadcast_dims = expanded_filter_shape.dim_sizes();
+  expanded_feature_broadcast_dims.pop_back();
+  auto broadcasted_expanded_feature_iota = builder->Broadcast(
+      expanded_feature_iota, expanded_feature_broadcast_dims);
+
+  // Compare the broadcasted linspace to the input feature linspace in the
+  // input feature dimension to create a diagonal predicate.
+  return builder->Eq(broadcasted_expanded_feature_iota, input_feature_iota,
+                     {expanded_filter_shape.dims() - 2});
+}
+
 // Expands a filter of shape [H, W, ..., M, N] to [H, W, ..., M, M*N] by adding
 // zeros for the cross-depth filters. Used to build a depthwise convolution.
 xla::ComputationDataHandle ExpandFilterForDepthwiseConvolution(
     const TensorShape& filter_shape, DataType dtype,
     const xla::ComputationDataHandle& filter,
     xla::ComputationBuilder* builder) {
-  // Filter has shape [H, W, ..., M, N]
-  // Dilate to [H, W, ..., M*M, N] using M inter-element padding, and then
-  // reshape to [H, W, ..., M, M*N].
-  int num_spatial_dims = filter_shape.dims() - 2;
-  const int64 in_depth = filter_shape.dim_size(num_spatial_dims);
-  xla::PaddingConfig padding = xla::MakeNoPaddingConfig(filter_shape.dims());
-  padding.mutable_dimensions(num_spatial_dims)->set_interior_padding(in_depth);
-  auto dilated_filter =
-      builder->Pad(filter, XlaHelpers::Zero(builder, dtype), padding);
-
+  int64 depthwise_multiplier = filter_shape.dim_size(filter_shape.dims() - 1);
+  int64 input_feature = filter_shape.dim_size(filter_shape.dims() - 2);
   TensorShape expanded_filter_shape =
       ExpandedFilterShapeForDepthwiseConvolution(filter_shape);
-  return builder->Reshape(dilated_filter, expanded_filter_shape.dim_sizes());
+
+  // Create a [H, W, ..., 1, N*M] reshape of the filter.
+  TensorShape implicit_broadcast_filter_shape = expanded_filter_shape;
+  implicit_broadcast_filter_shape.set_dim(
+      implicit_broadcast_filter_shape.dims() - 2, 1);
+  implicit_broadcast_filter_shape.set_dim(
+      implicit_broadcast_filter_shape.dims() - 1,
+      depthwise_multiplier * input_feature);
+  auto implicit_broadcast_filter =
+      builder->Reshape(filter, implicit_broadcast_filter_shape.dim_sizes());
+
+  // Broadcast the filter to  [H, W, ..., M, M*N].
+  auto expanded_zero = CreateExpandedZero(filter_shape, dtype, builder);
+  auto expanded_filter = builder->Add(implicit_broadcast_filter, expanded_zero);
+
+  // If the filter mask is set, choose the broadcasted filter, othwerwise,
+  // choose zero.
+  return builder->Select(CreateExpandedFilterMask(filter_shape, builder),
+                         expanded_filter, expanded_zero);
 }
 
 // Inverse of ExpandFilterForDepthwiseConvolution.
 xla::ComputationDataHandle ContractFilterForDepthwiseBackprop(
-    const TensorShape& filter_shape, DataType dtype,
+    XlaOpKernelContext* ctx, const TensorShape& filter_shape, DataType dtype,
     const xla::ComputationDataHandle& filter_backprop,
     xla::ComputationBuilder* builder) {
-  int num_spatial_dims = filter_shape.dims() - 2;
-
-  // Reshape to [H, W, ..., M*M, N]
-  TensorShape shape = filter_shape;
-  int64 in_depth = filter_shape.dim_size(num_spatial_dims);
-  shape.set_dim(num_spatial_dims, in_depth * in_depth);
-  auto reshaped = builder->Reshape(filter_backprop, shape.dim_sizes());
-
-  std::vector<int64> zeros(filter_shape.dims());
-  std::vector<int64> strides(filter_shape.dims(), 1LL);
-  strides[num_spatial_dims] = in_depth + 1;
-  return builder->Slice(reshaped, zeros, shape.dim_sizes(), strides);
-
-  // Alternate implementation for backends without strided Slice() support.
-  // TODO(phawkins): Remove when all backends support strided slice.
-  //   // Pad [..., M * (M + 1), N]
-  //   xla::PaddingConfig config =
-  //   xla::MakeNoPaddingConfig(filter_shape.dims());
-  //   config.mutable_dimensions(num_spatial_dims)
-  //     ->set_edge_padding_high(in_depth);
-  //   auto zero = XlaHelpers::Zero(builder, dtype);
-  //   auto padded = builder->Pad(reshaped, zero, config);
-  //
-  //   // Reshape to [..., M, M + 1, N]
-  //   shape = filter_shape;
-  //   shape.set_dim(num_spatial_dims, in_depth);
-  //   shape.set_dim(num_spatial_dims + 1, in_depth + 1);
-  //   int64 out_depth = filter_shape.dim_size(num_spatial_dims + 1);
-  //   shape.AddDim(out_depth);
-  //   reshaped = builder->Reshape(padded, shape.dim_sizes());
-  //
-  //   // Slice to [..., M, 1, N]
-  //   std::vector<int64> zeros(shape.dims());
-  //   std::vector<int64> strides(shape.dims(), 1LL);
-  //   shape.set_dim(num_spatial_dims + 1, 1);
-  //   auto sliced = builder->Slice(reshaped, zeros, shape.dim_sizes(),
-  //   strides);
-  //
-  //   // Reshape to [..., M, N]
-  //   return builder->Reshape(sliced, filter_shape.dim_sizes());
+  TensorShape expanded_filter_shape =
+      ExpandedFilterShapeForDepthwiseConvolution(filter_shape);
+  auto masked_expanded_filter = builder->Select(
+      CreateExpandedFilterMask(filter_shape, builder), filter_backprop,
+      CreateExpandedZero(filter_shape, dtype, builder));
+  return builder->Reshape(
+      builder->Reduce(masked_expanded_filter, XlaHelpers::Zero(builder, dtype),
+                      *ctx->GetOrCreateAdd(dtype),
+                      {expanded_filter_shape.dims() - 2}),
+      filter_shape.dim_sizes());
 }
 
 class ConvOp : public XlaOpKernel {
@@ -121,6 +179,7 @@ class ConvOp : public XlaOpKernel {
       : XlaOpKernel(ctx),
         num_spatial_dims_(num_spatial_dims),
         depthwise_(depthwise) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dilations", &dilations_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &strides_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_));
 
@@ -144,6 +203,23 @@ class ConvOp : public XlaOpKernel {
         errors::Unimplemented("Current implementation does not yet support "
                               "strides in the batch and depth dimensions."));
 
+    OP_REQUIRES(ctx, dilations_.size() == num_dims(),
+                errors::InvalidArgument("Dilations field must "
+                                        "specify ",
+                                        num_dims(), " dimensions"));
+    OP_REQUIRES(
+        ctx, dilations_[batch_dim] == 1 && dilations_[feature_dim] == 1,
+        errors::Unimplemented("Current implementation does not yet support "
+                              "dilations in the batch and depth dimensions."));
+    for (int i = 0; i < num_spatial_dims_; ++i) {
+      int input_dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i);
+      OP_REQUIRES(
+          ctx, dilations_[input_dim] == 1,
+          errors::Unimplemented("Current implementation does not yet support "
+                                "dilations in the ",
+                                i, "th spatial dimension."));
+    }
+
     const TensorShape input_shape = ctx->InputShape(0);
     // Input filter is of the following dimensions:
     // [ filter_rows, filter_cols, ..., in_depth, out_depth]
@@ -184,10 +260,11 @@ class ConvOp : public XlaOpKernel {
     dims.set_input_feature_dimension(feature_dim);
     dims.set_output_feature_dimension(feature_dim);
     for (int i = 0; i < num_spatial_dims_; ++i) {
-      int input_dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i);
-      dims.add_spatial_dimensions(input_dim);
+      const int64 dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i);
+      dims.add_input_spatial_dimensions(dim);
       dims.add_kernel_spatial_dimensions(i);
-      window_strides.push_back(strides_.at(input_dim));
+      dims.add_output_spatial_dimensions(dim);
+      window_strides.push_back(strides_.at(dim));
     }
     dims.set_kernel_input_feature_dimension(num_spatial_dims_);
     dims.set_kernel_output_feature_dimension(num_spatial_dims_ + 1);
@@ -203,6 +280,7 @@ class ConvOp : public XlaOpKernel {
  protected:
   const int num_spatial_dims_;
   const bool depthwise_;
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_ = FORMAT_NHWC;
@@ -240,6 +318,7 @@ class ConvBackpropInputOp : public XlaOpKernel {
       : XlaOpKernel(ctx),
         num_spatial_dims_(num_spatial_dims),
         depthwise_(depthwise) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dilations", &dilations_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &strides_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_));
     string data_format;
@@ -262,6 +341,23 @@ class ConvBackpropInputOp : public XlaOpKernel {
         errors::Unimplemented("Current implementation does not yet support "
                               "strides in the batch and depth dimensions."));
 
+    OP_REQUIRES(ctx, dilations_.size() == num_dims(),
+                errors::InvalidArgument("Dilations field must "
+                                        "specify ",
+                                        num_dims(), " dimensions"));
+    OP_REQUIRES(
+        ctx, dilations_[batch_dim] == 1 && dilations_[feature_dim] == 1,
+        errors::Unimplemented("Current implementation does not yet support "
+                              "dilations in the batch and depth dimensions."));
+    for (int i = 0; i < num_spatial_dims_; ++i) {
+      int input_dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i);
+      OP_REQUIRES(
+          ctx, dilations_[input_dim] == 1,
+          errors::Unimplemented("Current implementation does not yet support "
+                                "dilations in the ",
+                                i, "th spatial dimension."));
+    }
+
     TensorShape input_shape;
     OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &input_shape));
 
@@ -302,9 +398,10 @@ class ConvBackpropInputOp : public XlaOpKernel {
     std::vector<int64> lhs_dilation(num_spatial_dims_);
     std::vector<int64> ones(num_spatial_dims_, 1);
     for (int i = 0; i < num_spatial_dims_; ++i) {
-      dnums.add_spatial_dimensions(
-          GetTensorSpatialDimIndex(num_dims(), data_format_, i));
+      int64 dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i);
+      dnums.add_input_spatial_dimensions(dim);
       dnums.add_kernel_spatial_dimensions(i);
+      dnums.add_output_spatial_dimensions(dim);
 
       kernel_spatial_dims[i] = i;
       padding[i] = {dims.spatial_dims[i].pad_before,
@@ -334,6 +431,7 @@ class ConvBackpropInputOp : public XlaOpKernel {
  protected:
   const int num_spatial_dims_;
   const bool depthwise_;
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_ = FORMAT_NHWC;
@@ -371,6 +469,7 @@ class ConvBackpropFilterOp : public XlaOpKernel {
       : XlaOpKernel(ctx),
         num_spatial_dims_(num_spatial_dims),
         depthwise_(depthwise) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dilations", &dilations_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &strides_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_));
     string data_format;
@@ -390,6 +489,23 @@ class ConvBackpropFilterOp : public XlaOpKernel {
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
 
+    OP_REQUIRES(ctx, dilations_.size() == num_dims(),
+                errors::InvalidArgument("Dilations field must "
+                                        "specify ",
+                                        num_dims(), " dimensions"));
+    OP_REQUIRES(
+        ctx, dilations_[n_dim] == 1 && dilations_[c_dim] == 1,
+        errors::Unimplemented("Current implementation does not yet support "
+                              "dilations in the batch and depth dimensions."));
+    for (int i = 0; i < num_spatial_dims_; ++i) {
+      int input_dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i);
+      OP_REQUIRES(
+          ctx, dilations_[input_dim] == 1,
+          errors::Unimplemented("Current implementation does not yet support "
+                                "dilations in the ",
+                                i, "th spatial dimension."));
+    }
+
     const TensorShape activations_shape = ctx->InputShape(0);
     TensorShape filter_shape;
     OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(1, &filter_shape));
@@ -424,9 +540,7 @@ class ConvBackpropFilterOp : public XlaOpKernel {
 
     // Swap n_dim and c_dim in the activations.
     dnums.set_input_batch_dimension(c_dim);
-    dnums.set_output_batch_dimension(c_dim);
     dnums.set_input_feature_dimension(n_dim);
-    dnums.set_output_feature_dimension(n_dim);
 
     // The gradients become the RHS of the convolution.
     // The gradients have shape [batch, out_rows, out_cols, ..., out_depth]
@@ -438,9 +552,16 @@ class ConvBackpropFilterOp : public XlaOpKernel {
     std::vector<int64> rhs_dilation(num_spatial_dims_);
     std::vector<int64> ones(num_spatial_dims_, 1);
 
+    // Tensorflow filter shape is [ H, W, ..., inC, outC ].
     for (int i = 0; i < num_spatial_dims_; ++i) {
-      int dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i);
-      dnums.add_spatial_dimensions(dim);
+      dnums.add_output_spatial_dimensions(i);
+    }
+    dnums.set_output_batch_dimension(num_spatial_dims_);
+    dnums.set_output_feature_dimension(num_spatial_dims_ + 1);
+
+    for (int i = 0; i < num_spatial_dims_; ++i) {
+      int64 dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i);
+      dnums.add_input_spatial_dimensions(dim);
       dnums.add_kernel_spatial_dimensions(dim);
 
       // We will also need to pad the input with zeros such that after the
@@ -498,31 +619,17 @@ class ConvBackpropFilterOp : public XlaOpKernel {
                               /*window_strides=*/ones, padding,
                               /*lhs_dilation=*/ones, rhs_dilation, dnums);
 
-    // The layout of filter_backprop will match the layout of
-    // padded_activations
-    // and so will have layout: [out_feature, h, w, ..., in_feature]
-    // Tensorflow filter shape is [ H, W, ..., inC, outC ], so we transpose the
-    // output.
-    std::vector<int64> transpose_dims;
-    transpose_dims.reserve(num_dims());
-    for (int i = 0; i < num_spatial_dims_; ++i) {
-      transpose_dims.push_back(dnums.spatial_dimensions(i));
-    }
-    transpose_dims.push_back(c_dim);
-    transpose_dims.push_back(n_dim);
-    xla::ComputationDataHandle filter_backprop_reshaped =
-        b->Transpose(filter_backprop, transpose_dims);
-
     if (depthwise_) {
-      filter_backprop_reshaped = ContractFilterForDepthwiseBackprop(
-          filter_shape, ctx->input_type(0), filter_backprop_reshaped, b);
+      filter_backprop = ContractFilterForDepthwiseBackprop(
+          ctx, filter_shape, ctx->input_type(0), filter_backprop, b);
     }
-    ctx->SetOutput(0, filter_backprop_reshaped);
+    ctx->SetOutput(0, filter_backprop);
   }
 
  protected:
   const int num_spatial_dims_;
   const bool depthwise_;
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_ = FORMAT_NHWC;
diff --git a/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc
index a4ea65ea89e348cb77412efb0c5c0fcb1a9f33f3..96d7809f7995634b6bc31ab801b93526d9da7e6f 100644
--- a/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/util/tensor_format.h"
 
 namespace tensorflow {
 namespace {
@@ -23,6 +24,16 @@ namespace {
 class DepthToSpaceOp : public XlaOpKernel {
  public:
   explicit DepthToSpaceOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    string data_format_str;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+
+    OP_REQUIRES(ctx, data_format_ == FORMAT_NCHW || data_format_ == FORMAT_NHWC,
+                errors::InvalidArgument("Unsupported data format ",
+                                        ToString(data_format_),
+                                        "; expected formats NHWC or NCHW"));
+
     OP_REQUIRES_OK(ctx, ctx->GetAttr("block_size", &block_size_));
     OP_REQUIRES(
         ctx, block_size_ > 1,
@@ -31,18 +42,79 @@ class DepthToSpaceOp : public XlaOpKernel {
 
   void Compile(XlaOpKernelContext* ctx) override {
     const TensorShape input_tensor_shape = ctx->InputShape(0);
-    // The input is presumed to be [batch, height, width, depth]
     int input_rank = input_tensor_shape.dims();
     static const int kRequiredDims = 4;
     OP_REQUIRES(ctx, kRequiredDims == input_rank,
-                errors::InvalidArgument("Input rank should be: ", kRequiredDims,
-                                        " instead of: ", input_rank));
+                errors::InvalidArgument("Input rank should be ", kRequiredDims,
+                                        "; got: ", input_rank));
     const gtl::InlinedVector<int64, 4> input_shape =
         input_tensor_shape.dim_sizes();
 
     xla::ComputationBuilder* b = ctx->builder();
     xla::ComputationDataHandle input = ctx->Input(0);
 
+    int feature_dim = GetTensorFeatureDimIndex(input_rank, data_format_);
+    int num_spatial_dims = GetTensorSpatialDims(input_rank, data_format_);
+
+    std::vector<int64> reshaped_shape;
+    std::vector<int64> transpose_order;
+    std::vector<int64> output_shape;
+    reshaped_shape.reserve(input_rank);
+    transpose_order.reserve(input_rank);
+    output_shape.reserve(input_rank);
+    if (data_format_ == FORMAT_NHWC) {
+      reshaped_shape.push_back(input_shape[0]);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        reshaped_shape.push_back(input_shape[1 + i]);
+      }
+      int64 block_elems = 1;
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        reshaped_shape.push_back(block_size_);
+        block_elems *= block_size_;
+      }
+      reshaped_shape.push_back(input_shape[feature_dim] / block_elems);
+
+      transpose_order.push_back(0);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        transpose_order.push_back(i + 1);
+        transpose_order.push_back(i + 1 + num_spatial_dims);
+      }
+      transpose_order.push_back(feature_dim + num_spatial_dims);
+
+      output_shape.push_back(input_shape[0]);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        output_shape.push_back(input_shape[1 + i] * block_size_);
+      }
+      output_shape.push_back(input_shape[feature_dim] / block_elems);
+    } else {
+      // NCHW format.
+      reshaped_shape.push_back(input_shape[0]);
+      int64 block_elems = 1;
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        reshaped_shape.push_back(block_size_);
+        block_elems *= block_size_;
+      }
+      reshaped_shape.push_back(input_shape[feature_dim] / block_elems);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        reshaped_shape.push_back(input_shape[2 + i]);
+      }
+
+      transpose_order.push_back(0);
+      transpose_order.push_back(1 + num_spatial_dims);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        transpose_order.push_back(2 + num_spatial_dims + i);
+        transpose_order.push_back(1 + i);
+      }
+
+      output_shape.push_back(input_shape[0]);
+      output_shape.push_back(input_shape[feature_dim] / block_elems);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        output_shape.push_back(input_shape[2 + i] * block_size_);
+      }
+    }
+
+    // Note: comments are given in NHWC format; NCHW is similar with a different
+    // dimension order.
     // 1. Reshape `input` to `reshaped` of shape:
     //
     //      [batch,
@@ -51,14 +123,14 @@ class DepthToSpaceOp : public XlaOpKernel {
     //       block_size_,
     //       block_size_,
     //       depth / (block_size_ * block_size_)]
-    OP_REQUIRES(ctx, input_shape[3] % (block_size_ * block_size_) == 0,
+    OP_REQUIRES(ctx,
+                input_shape[feature_dim] % (block_size_ * block_size_) == 0,
                 errors::InvalidArgument(
                     "Input depth dimension (", input_shape[3],
                     ") is not divisible by square of the block size (",
                     block_size_, ")"));
-    xla::ComputationDataHandle reshaped = b->Reshape(
-        input, {input_shape[0], input_shape[1], input_shape[2], block_size_,
-                block_size_, input_shape[3] / (block_size_ * block_size_)});
+
+    xla::ComputationDataHandle reshaped = b->Reshape(input, reshaped_shape);
 
     // 2. Permute dimensions of `reshaped` to produce
     //    `permuted_reshaped` of shape:
@@ -70,7 +142,7 @@ class DepthToSpaceOp : public XlaOpKernel {
     //       block_size_,
     //       depth / (block_size_ * block_size_)]
     xla::ComputationDataHandle permuted_reshaped =
-        b->Transpose(reshaped, {0, 1, 3, 2, 4, 5});
+        b->Transpose(reshaped, transpose_order);
 
     // 3. Reshape `permuted_reshaped` to flatten `block_shape` into the
     //    batch dimension, producing an output tensor of shape:
@@ -80,15 +152,14 @@ class DepthToSpaceOp : public XlaOpKernel {
     //       input_shape[2] * block_size_,
     //       depth / (block_size_ * block_size_)]
     //
-    xla::ComputationDataHandle output = b->Reshape(
-        permuted_reshaped, {input_shape[0], input_shape[1] * block_size_,
-                            input_shape[2] * block_size_,
-                            input_shape[3] / (block_size_ * block_size_)});
+    xla::ComputationDataHandle output =
+        b->Reshape(permuted_reshaped, output_shape);
 
     ctx->SetOutput(0, output);
   }
 
  private:
+  TensorFormat data_format_;
   int block_size_;
 };
 REGISTER_XLA_OP(Name("DepthToSpace"), DepthToSpaceOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/diag_op.cc b/tensorflow/compiler/tf2xla/kernels/diag_op.cc
index ec5017f6ab96bd3fc273a746b77fbb7e74fd9f35..765ea922a532a085a552192348ab360c4c30ff0a 100644
--- a/tensorflow/compiler/tf2xla/kernels/diag_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/diag_op.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/compiler/tf2xla/lib/util.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
@@ -22,6 +24,62 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+// Create a diagonal / batch diagonal matrix with 'input' on the diagonal.
+xla::StatusOr<xla::ComputationDataHandle> CreateDiagonal(
+    const xla::ComputationDataHandle& input, int64 last_dim_size,
+    tensorflow::gtl::ArraySlice<int64> other_dims, XlaOpKernelContext* ctx,
+    xla::ComputationBuilder* builder) {
+  // Create two matrices that have the following forms, and compare them:
+  //
+  // [[0, 0, 0, 0]            [[0, 1, 2, 3]
+  //  [1, 1, 1, 1]             [0, 1, 2, 3]
+  //  [2, 2, 2, 2]             [0, 1, 2, 3]
+  //  [3, 3, 3, 3]]            [0, 1, 2, 3]]
+  //
+  // This produces a predicate matrix of the right size, with "true" on the
+  // diagonal.
+  xla::ComputationDataHandle iota;
+  TF_RETURN_IF_ERROR(
+      XlaHelpers::Iota(builder, DataType::DT_INT32, last_dim_size, &iota));
+  xla::ComputationDataHandle iota_broadcast =
+      builder->Broadcast(iota, {last_dim_size});
+  xla::ComputationDataHandle mask = builder->Eq(iota_broadcast, iota, {0});
+
+  // If this is a batched diagonal, broadcast the mask across the other
+  // dimensions.
+  if (!other_dims.empty()) {
+    mask = builder->Broadcast(mask, other_dims);
+  }
+
+  // Broadcast the input, and then use the mask computed above to select the
+  // diagonal:
+  // e.g, in 2D:
+  //         [[t, f, f]    [[1, 1, 1]    [[0, 0, 0]      [[1, 0, 0]
+  // select(  [f, t, f]  ,  [4, 4, 4]  ,  [0, 0, 0]  ) =  [0, 4, 0]
+  //          [f, f, t]]    [9, 9, 9]]    [0, 0, 0]]      [0, 0, 9]]
+  //
+  // Broadcasting the input is less-than-trivial, since we need to broadcast
+  // into a "middle" dimension. We can do this with a reshape + implicit
+  // broadcast.
+  // TODO(b/30112114): Replace with in-dim broadcast when those are supported.
+  std::vector<int64> broadcast_dims(other_dims.begin(), other_dims.end());
+  broadcast_dims.push_back(1LL);
+  broadcast_dims.push_back(last_dim_size);
+  xla::ComputationDataHandle input_broadcast =
+      builder->Reshape(input, broadcast_dims);
+
+  broadcast_dims[broadcast_dims.size() - 2] = last_dim_size;
+  xla::PrimitiveType element_type;
+  TF_RETURN_IF_ERROR(
+      DataTypeToPrimitiveType(ctx->input_type(0), &element_type));
+  auto broadcast_shape =
+      xla::ShapeUtil::MakeShape(element_type, broadcast_dims);
+  xla::ComputationDataHandle zeros = Zeros(builder, broadcast_shape);
+
+  input_broadcast = builder->Add(input_broadcast, zeros);
+  return builder->Select(mask, input_broadcast, zeros);
+}
+
 class DiagOp : public XlaOpKernel {
  public:
   explicit DiagOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
@@ -29,6 +87,8 @@ class DiagOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     xla::ComputationBuilder* builder = ctx->builder();
 
+    OP_REQUIRES(ctx, ctx->num_inputs() >= 1,
+                errors::InvalidArgument("Diag op must have at an input"));
     const TensorShape input_shape = ctx->InputShape(0);
 
     auto dims = input_shape.dim_sizes();
@@ -36,7 +96,7 @@ class DiagOp : public XlaOpKernel {
                 errors::InvalidArgument("Expected 1 <= dims, got shape ",
                                         input_shape.DebugString()));
 
-    xla::ComputationDataHandle diag = ctx->Input(0);
+    xla::ComputationDataHandle input = ctx->Input(0);
 
     // Picture:
     // tf.diag([1, 2, 3, 4]) ==> [[1, 0, 0, 0]
@@ -46,13 +106,13 @@ class DiagOp : public XlaOpKernel {
 
     // Flattens the input to 1D.
     int64 size = input_shape.num_elements();
-    diag = builder->Reshape(diag, {size});
+    input = builder->Reshape(input, {size});
 
-    // Adds inter-element padding of 'size'.
-    xla::PaddingConfig config;
-    auto* dim = config.add_dimensions();
-    dim->set_interior_padding(size);
-    diag = builder->Pad(diag, XlaHelpers::Zero(builder, input_type(0)), config);
+    // Create an R2 with the R1 diagonal.
+    auto diag_or_status =
+        CreateDiagonal(input, size, /*other_dims=*/{}, ctx, builder);
+    OP_REQUIRES_OK(ctx, diag_or_status.status());
+    xla::ComputationDataHandle diag = diag_or_status.ValueOrDie();
 
     // Reshapes to the final shape.
     std::vector<int64> new_dims(dims.size() * 2);
@@ -141,6 +201,8 @@ class MatrixDiagOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     xla::ComputationBuilder* builder = ctx->builder();
 
+    OP_REQUIRES(ctx, ctx->num_inputs() >= 1,
+                errors::InvalidArgument("MatrixDiag op must have at an input"));
     const TensorShape input_shape = ctx->InputShape(0);
 
     auto dims = input_shape.dim_sizes();
@@ -152,17 +214,13 @@ class MatrixDiagOp : public XlaOpKernel {
 
     int last_dim = dims.size() - 1;
     int64 last_dim_size = input_shape.dim_size(last_dim);
+    tensorflow::gtl::ArraySlice<int64> other_dims(dims);
+    other_dims.pop_back();
 
-    // Adds inter-element padding of 'last_dim_size' to the last dimension.
-    xla::PaddingConfig config = xla::MakeNoPaddingConfig(dims.size());
-    auto* dim = config.mutable_dimensions(last_dim);
-    dim->set_interior_padding(last_dim_size);
-    diag = builder->Pad(diag, XlaHelpers::Zero(builder, input_type(0)), config);
-
-    // Reshapes to the final shape.
-    dims.push_back(last_dim_size);
-    diag = builder->Reshape(diag, dims);
-
+    auto diag_or_status =
+        CreateDiagonal(diag, last_dim_size, other_dims, ctx, builder);
+    OP_REQUIRES_OK(ctx, diag_or_status.status());
+    diag = diag_or_status.ValueOrDie();
     ctx->SetOutput(0, diag);
   }
 };
diff --git a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d91ebb500b4479dbb3c8e2ea7719bc79dc24ba4f
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
@@ -0,0 +1,367 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/array4d.h"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/lib/math/math_util.h"
+
+namespace tensorflow {
+namespace {
+
+// We implement bilinear interpolation by upsampling followed by convolution.
+// The basic idea is as follows. To scale from NxN to RxR:
+//
+//    1. S := (N - 1) /  gcd(N-1, R-1)
+//    2. k := (R - 1) /  gcd(N-1, R-1)
+//    3. Convolution(kxk, stride=S, lhs_dilation=k, padding=k-1)
+//
+// For example, to Scale from 7x7 -> 15x15:
+//
+//    1. S := (7-1) / gcd(7-1, 15-1) = 6 / gcd(6, 14) = 6 / 2 = 3
+//    2. k := (15 - 1) / gcd(7-1, 15-1) = 14 / gcd(6, 14) = 14 / 2 = 7
+//    3. Convolution(7x7, stride=3, lhs_dilation=3, padding=2)
+//
+//
+// The 7x7 -> 15x15 case is much too large to write out in full as an
+// example. The smallest interesting example is 3x3 -> 4x4.
+//
+// S := 2
+// k := 3
+//
+// 00 03 06    00 00 00 00 00 00 00 00 00 00 00      00 02 04 06
+// 09 12 15 -> 00 00 00 00 00 00 00 00 00 00 00   -> 06 08 10 12
+// 18 21 24    00 00 00 00 00 03 00 00 06 00 00      12 14 16 18
+//             00 00 00 00 00 00 00 00 00 00 00      18 20 22 24
+//             00 00 00 00 00 00 00 00 00 00 00
+//             00 00 09 00 00 12 00 00 15 00 00
+//             00 00 00 00 00 00 00 00 00 00 00
+//             00 00 00 00 00 00 00 00 00 00 00
+//             00 00 18 00 00 21 00 00 24 00 00
+//             00 00 00 00 00 00 00 00 00 00 00
+//             00 00 00 00 00 00 00 00 00 00 00
+//
+// with the following convolutional kernel, with stride [2, 2]:
+//       1 2 3 2 1
+//       2 4 6 4 2
+// 1/9 * 3 6 9 6 3
+//       2 4 6 4 2
+//       1 2 3 2 1
+
+// Computes the size of the convolutional kernel and stride to use when resizing
+// from in_size to out_size.
+struct ResizeConvolutionDims {
+  // Size of the kernel to use.
+  std::vector<int64> kernel_size;
+
+  // Stride of the convolution to use.
+  std::vector<int64> stride;
+};
+ResizeConvolutionDims ComputeResizeConvolutionParameters(
+    gtl::ArraySlice<int64> in_size, gtl::ArraySlice<int64> out_size) {
+  CHECK_EQ(in_size.size(), out_size.size());
+  int num_spatial_dims = in_size.size();
+  ResizeConvolutionDims dims;
+  dims.kernel_size.resize(num_spatial_dims);
+  dims.stride.resize(num_spatial_dims);
+  for (int i = 0; i < num_spatial_dims; ++i) {
+    if (in_size[i] == 1) {
+      // We must handle input size 1 specially because XLA convolution does
+      // not allow stride 0.
+      dims.stride[i] = dims.kernel_size[i] = 1;
+    } else if (out_size[i] == 1) {
+      // If in_size[i] > 1 but out_size[i] == 1, then we slice out the first
+      // entry before resizing.
+      dims.stride[i] = dims.kernel_size[i] = 1;
+    } else {
+      int64 gcd = MathUtil::GCD(static_cast<uint64>(in_size[i] - 1),
+                                static_cast<uint64>(out_size[i] - 1));
+      dims.stride[i] = (in_size[i] - 1) / gcd;
+      dims.kernel_size[i] = (out_size[i] - 1) / gcd;
+    }
+  }
+  return dims;
+}
+
+xla::ComputationDataHandle MakeBilinearResizeKernel(
+    xla::ComputationBuilder* builder, gtl::ArraySlice<int64> kernel_size,
+    int64 channels) {
+  // Form a 2D convolution kernel like:
+  //       1 2 3 2 1
+  //       2 4 6 4 2
+  // 1/9 * 3 6 9 6 3
+  //       2 4 6 4 2
+  //       1 2 3 2 1
+  // by multiplying two 1D kernels of the form:
+  // 1/3 * [1 2 3 2 1]
+  auto make_1d_kernel = [](int64 n) {
+    std::vector<float> kernel(n * 2 - 1);
+    for (int64 i = 0; i < n; ++i) {
+      float v = i + 1;
+      kernel[i] = v;
+      kernel[n * 2 - 2 - i] = v;
+    }
+    return kernel;
+  };
+
+  // Form a block diagonal kernel where each channel interacts only with itself.
+  xla::Array4D<float> diag(1, 1, channels, channels, 0.0f);
+  for (int i = 0; i < channels; ++i) {
+    diag(0, 0, i, i) = 1.0f / (kernel_size[0] * kernel_size[1]);
+  }
+  return builder->Mul(
+      builder->ConstantR1<float>(make_1d_kernel(kernel_size[0])),
+      builder->Mul(builder->ConstantR1<float>(make_1d_kernel(kernel_size[1])),
+                   builder->ConstantR4FromArray4D(diag),
+                   /*broadcast_dimensions=*/{1}),
+      /*broadcast_dimensions=*/{0});
+}
+
+class ResizeBilinearOp : public XlaOpKernel {
+ public:
+  explicit ResizeBilinearOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("align_corners", &align_corners_));
+    OP_REQUIRES(
+        ctx, align_corners_ == true,
+        errors::Unimplemented(
+            "ResizeBilinear with align_corners=False is not yet implemented"));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::ComputationBuilder* b = ctx->builder();
+
+    TensorShape input_shape = ctx->InputShape(0);
+    OP_REQUIRES(ctx, input_shape.dims() == 4,
+                errors::InvalidArgument("input must be 4-dimensional",
+                                        input_shape.DebugString()));
+    const int64 batch = input_shape.dim_size(0);
+    const std::vector<int64> in_size = {input_shape.dim_size(1),
+                                        input_shape.dim_size(2)};
+    const int64 channels = input_shape.dim_size(3);
+    OP_REQUIRES(ctx, in_size[0] > 0 && in_size[1] > 0,
+                errors::InvalidArgument("input size must be positive, got [",
+                                        in_size[0], ",", in_size[1], "]"));
+
+    std::vector<int64> out_size;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &out_size));
+    OP_REQUIRES(ctx, out_size.size() == 2,
+                errors::InvalidArgument("output size must be length 2, got ",
+                                        out_size.size()));
+    OP_REQUIRES(ctx, out_size[0] > 0 && out_size[1] > 0,
+                errors::InvalidArgument("output size must be positive, got [",
+                                        out_size[0], ",", out_size[1], "]"));
+
+    const int num_spatial_dims = 2;
+
+    xla::ComputationDataHandle input = ctx->Input(0);
+
+    // If in_size[i] > 1 and out_size[i] == 1, slice out the first input in
+    // dimension i.
+    std::vector<int64> slice_size = in_size;
+    bool slice_input = false;
+    for (int i = 0; i < num_spatial_dims; ++i) {
+      if (in_size[i] > 1 && out_size[i] == 1) {
+        // If in_size[i] > 1 but out_size[i] == 1, then we slice out the first
+        // entry before resizing.
+        slice_input = true;
+        slice_size[i] = 1;
+      }
+    }
+    if (slice_input) {
+      input = b->Slice(input, {0, 0, 0, 0},
+                       {batch, slice_size[0], slice_size[1], channels},
+                       {1, 1, 1, 1});
+    }
+
+    // Output is always type float.
+    input = b->ConvertElementType(input, xla::F32);
+
+    // Picture for a 1x3 to 1x4 resize:
+    // stride = 2, kernel size = 3
+    // Input:
+    // 3 6 9
+    // Input with dilation and padding:
+    // 0 0 3 0 0 6 0 0 9 0 0
+    // Convolution kernel:
+    // 1/3 * [1 2 3 2 1]
+    // Output:
+    // 3 5 7 9
+    xla::ConvolutionDimensionNumbers dnums;
+    dnums.set_input_batch_dimension(0);
+    dnums.set_output_batch_dimension(0);
+    dnums.set_input_feature_dimension(3);
+    dnums.set_output_feature_dimension(3);
+    for (int i = 0; i < num_spatial_dims; ++i) {
+      dnums.add_input_spatial_dimensions(1 + i);
+      dnums.add_output_spatial_dimensions(1 + i);
+      dnums.add_kernel_spatial_dimensions(i);
+    }
+    dnums.set_kernel_input_feature_dimension(num_spatial_dims);
+    dnums.set_kernel_output_feature_dimension(num_spatial_dims + 1);
+
+    ResizeConvolutionDims dims =
+        ComputeResizeConvolutionParameters(in_size, out_size);
+    xla::ComputationDataHandle kernel =
+        MakeBilinearResizeKernel(b, dims.kernel_size, channels);
+    xla::ComputationDataHandle output = b->ConvGeneralDilated(
+        input, kernel, dims.stride,
+        /*padding=*/
+        {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1},
+         {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}},
+        /*lhs_dilation=*/dims.kernel_size,
+        /*rhs_dilation=*/{1, 1}, dnums);
+
+    // Add broadcasts to handle expanding from a size == 1 dimension to a
+    // size > 1 dimension.
+    for (int i = 0; i < num_spatial_dims; ++i) {
+      if (in_size[i] == 1 && out_size[i] > 1) {
+        output = b->Add(output, b->ConstantR1<float>(out_size[i], 0),
+                        /*broadcast_dimensions=*/{1 + i});
+      }
+    }
+
+    ctx->SetOutput(0, output);
+  }
+
+ private:
+  bool align_corners_;
+};
+
+REGISTER_XLA_OP(Name("ResizeBilinear"), ResizeBilinearOp);
+
+class ResizeBilinearGradOp : public XlaOpKernel {
+ public:
+  explicit ResizeBilinearGradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("align_corners", &align_corners_));
+    OP_REQUIRES(
+        ctx, align_corners_ == true,
+        errors::Unimplemented("ResizeBilinearGrad with align_corners=False is "
+                              "not yet implemented"));
+
+    DataType output_dtype;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &output_dtype));
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(output_dtype, &output_type_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::ComputationBuilder* b = ctx->builder();
+
+    TensorShape input_shape = ctx->InputShape(1);
+    OP_REQUIRES(ctx, input_shape.dims() == 4,
+                errors::InvalidArgument("input must be 4-dimensional",
+                                        input_shape.DebugString()));
+    const int64 batch = input_shape.dim_size(0);
+    const std::vector<int64> in_size = {input_shape.dim_size(1),
+                                        input_shape.dim_size(2)};
+    const int64 channels = input_shape.dim_size(3);
+    OP_REQUIRES(ctx, in_size[0] > 0 && in_size[1] > 0,
+                errors::InvalidArgument("input size must be positive, got [",
+                                        in_size[0], ",", in_size[1], "]"));
+
+    TensorShape grad_shape = ctx->InputShape(0);
+    OP_REQUIRES(ctx, grad_shape.dims() == 4,
+                errors::InvalidArgument("gradient must be 4-dimensional",
+                                        grad_shape.DebugString()));
+    const int64 grad_batch = grad_shape.dim_size(0);
+    const std::vector<int64> grad_size = {grad_shape.dim_size(1),
+                                          grad_shape.dim_size(2)};
+    const int64 grad_channels = grad_shape.dim_size(3);
+    OP_REQUIRES(ctx, batch == grad_batch,
+                errors::InvalidArgument(
+                    "activations and gradients must have the same batch size (",
+                    batch, " vs. ", grad_batch, ")"));
+    OP_REQUIRES(ctx, grad_size[0] > 0 && grad_size[1] > 0,
+                errors::InvalidArgument("gradient size must be positive, got [",
+                                        grad_size[0], ",", grad_size[1], "]"));
+    OP_REQUIRES(
+        ctx, channels == grad_channels,
+        errors::InvalidArgument(
+            "activations and gradients must have the same number of channels (",
+            channels, " vs. ", grad_channels, ")"));
+
+    const int num_spatial_dims = 2;
+
+    xla::ComputationDataHandle grad = ctx->Input(0);
+
+    ResizeConvolutionDims dims =
+        ComputeResizeConvolutionParameters(in_size, grad_size);
+
+    // To form the backward convolution, we keep the kernel unchanged (it is
+    // already symmetric) and swap the roles of strides and LHS dilation.
+    xla::ConvolutionDimensionNumbers dnums;
+    dnums.set_input_batch_dimension(0);
+    dnums.set_output_batch_dimension(0);
+    dnums.set_input_feature_dimension(3);
+    dnums.set_output_feature_dimension(3);
+    for (int i = 0; i < num_spatial_dims; ++i) {
+      dnums.add_input_spatial_dimensions(1 + i);
+      dnums.add_output_spatial_dimensions(1 + i);
+      dnums.add_kernel_spatial_dimensions(i);
+    }
+    dnums.set_kernel_input_feature_dimension(num_spatial_dims);
+    dnums.set_kernel_output_feature_dimension(num_spatial_dims + 1);
+    xla::ComputationDataHandle kernel =
+        MakeBilinearResizeKernel(b, dims.kernel_size, channels);
+
+    // Broadcast the input kernel where the forward op expanded from a size == 1
+    // dimension to a size > 1 dimension. This has the effect of summing the
+    // gradient contributions in that dimension.
+    for (int i = 0; i < num_spatial_dims; ++i) {
+      if (in_size[i] == 1 && grad_size[i] > 1) {
+        kernel = b->Add(kernel, b->ConstantR1<float>(grad_size[i], 0),
+                        /*broadcast_dimensions=*/{i});
+      }
+    }
+
+    xla::ComputationDataHandle output = b->ConvGeneralDilated(
+        grad, kernel, /*window_strides=*/dims.kernel_size,
+        /*padding=*/
+        {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1},
+         {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}},
+        /*lhs_dilation=*/dims.stride,
+        /*rhs_dilation=*/{1, 1}, dnums);
+
+    // If in_size[i] > 1 and grad_size[i] == 1, pad the output in dimension i.
+    // Opposite of the slice performed by the forward op.
+    xla::PaddingConfig padding = xla::MakeNoPaddingConfig(4);
+    bool pad_output = false;
+    for (int i = 0; i < num_spatial_dims; ++i) {
+      if (in_size[i] > 1 && grad_size[i] == 1) {
+        pad_output = true;
+        padding.mutable_dimensions(1 + i)->set_edge_padding_high(in_size[i] -
+                                                                 1);
+      }
+    }
+    if (pad_output) {
+      output = b->Pad(output, b->ConstantR0<float>(0.0f), padding);
+    }
+
+    output = b->ConvertElementType(output, output_type_);
+    ctx->SetOutput(0, output);
+  }
+
+ private:
+  bool align_corners_;
+  xla::PrimitiveType output_type_;
+};
+
+REGISTER_XLA_OP(Name("ResizeBilinearGrad"), ResizeBilinearGradOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc
index fcef497e5845d9080bc83b54e92dcf2fdecf5f12..644abd5905c6ce5a8f61792a1986560bab891040 100644
--- a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc
@@ -23,8 +23,8 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-constexpr std::array<DataType, 4> kMatmulTypes = {
-    {DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64}};
+constexpr std::array<DataType, 5> kMatmulTypes = {
+    {DT_HALF, DT_BFLOAT16, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64}};
 
 class MatMulOp : public XlaOpKernel {
  public:
@@ -85,10 +85,7 @@ class SparseMatMulOp : public MatMulOp {
   ~SparseMatMulOp() override = default;
 };
 
-REGISTER_XLA_OP(Name("SparseMatMul")
-                    .TypeConstraint("Ta", kFloatTypes)
-                    .TypeConstraint("Tb", kFloatTypes),
-                SparseMatMulOp);
+REGISTER_XLA_OP(Name("SparseMatMul"), SparseMatMulOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..650f8c7dc8be0cb08997ec641ca3f82352166fdd
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
@@ -0,0 +1,141 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/concat_lib.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace {
+
+// TODO(phawkins): implement double-sized windowed reductions in XLA and remove
+// the type constraint.
+constexpr std::array<DataType, 3> kScanOpTypes = {
+    {DT_HALF, DT_BFLOAT16, DT_FLOAT}};
+
+class ScanOp : public XlaOpKernel {
+ public:
+  ScanOp(OpKernelConstruction* ctx, bool sum) : XlaOpKernel(ctx), sum_(sum) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("reverse", &reverse_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("exclusive", &exclusive_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    const TensorShape input_shape = ctx->InputShape(0);
+    const TensorShape tensor_axis_shape = ctx->InputShape(1);
+
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(tensor_axis_shape),
+                errors::InvalidArgument("ScanOp: axis must be a scalar, not ",
+                                        tensor_axis_shape.DebugString()));
+
+    int64 axis;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &axis));
+    if (axis < 0) {
+      axis += input_shape.dims();
+    }
+    OP_REQUIRES(
+        ctx, FastBoundsCheck(axis, input_shape.dims()),
+        errors::InvalidArgument("ScanOp: Expected scan axis in the range [",
+                                -input_shape.dims(), ", ", input_shape.dims(),
+                                "), but got ", axis));
+
+    DataType dtype = ctx->input_type(0);
+
+    if (input_shape.num_elements() == 0) {
+      // Exit early if there is nothing to compute.
+      ctx->SetOutput(0, ctx->Input(0));
+      return;
+    }
+
+    xla::ComputationBuilder* builder = ctx->builder();
+
+    std::vector<int64> window_strides(input_shape.dims(), 1);
+    std::vector<int64> window_dims(input_shape.dims(), 1);
+    window_dims[axis] = input_shape.dim_size(axis);
+
+    std::vector<std::pair<int64, int64>> padding(input_shape.dims(), {0, 0});
+    padding[axis].first = input_shape.dim_size(axis) - 1;
+    // In exclusive mode, add an extra padding element so there is a complete
+    // window of padding before the data starts.
+    if (exclusive_) {
+      ++padding[axis].first;
+    }
+    if (reverse_) {
+      std::swap(padding[axis].first, padding[axis].second);
+    }
+
+    xla::ComputationDataHandle input = ctx->Input(0);
+    xla::ComputationDataHandle init;
+    const xla::Computation* reducer;
+    if (sum_) {
+      init = XlaHelpers::Zero(builder, dtype);
+      reducer = ctx->GetOrCreateAdd(dtype);
+    } else {
+      init = XlaHelpers::One(builder, dtype);
+      reducer = ctx->GetOrCreateMul(dtype);
+    }
+    auto output = builder->ReduceWindowWithGeneralPadding(
+        ctx->Input(0), init, *reducer, window_dims, window_strides, padding);
+
+    // In exclusive mode, we have computed an extra element containing the sum
+    // of all the input elements. Slice off this extra "last" element.
+    if (exclusive_) {
+      if (reverse_) {
+        output = builder->SliceInDim(output, 1, input_shape.dim_size(axis) + 1,
+                                     1, axis);
+
+      } else {
+        output =
+            builder->SliceInDim(output, 0, input_shape.dim_size(axis), 1, axis);
+      }
+    }
+    ctx->SetOutput(0, output);
+  }
+
+ private:
+  const bool sum_;  // True=cumulative sum. False=cumulative product.
+  bool reverse_;
+  bool exclusive_;
+};
+
+class CumsumOp : public ScanOp {
+ public:
+  explicit CumsumOp(OpKernelConstruction* ctx) : ScanOp(ctx, /*sum=*/true) {}
+};
+REGISTER_XLA_OP(Name("Cumsum").TypeConstraint("T", kScanOpTypes), CumsumOp);
+
+class CumprodOp : public ScanOp {
+ public:
+  explicit CumprodOp(OpKernelConstruction* ctx) : ScanOp(ctx, /*sum=*/false) {}
+};
+REGISTER_XLA_OP(Name("Cumprod").TypeConstraint("T", kScanOpTypes), CumprodOp);
+
+}  // anonymous namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
index 24a99f253d6dc8bb699fff587c363b12c227e821..e205fadd2b1bcae96a7bfa1bc83096d405ce22c4 100644
--- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 // XLA-specific Shape Ops.
 
+#include "tensorflow/compiler/tf2xla/kernels/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
@@ -27,56 +28,42 @@ namespace {
 
 class ShapeOp : public XlaOpKernel {
  public:
-  explicit ShapeOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+  explicit ShapeOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("out_type", &out_dtype_));
+  }
 
   void Compile(XlaOpKernelContext* ctx) override {
     const TensorShape input_shape = ctx->InputShape(0);
-    const int rank = input_shape.dims();
-    Tensor shape_constant(DT_INT32, TensorShape({rank}));
-    auto vec = shape_constant.vec<int32>();
-    // TODO(dga): support int64.  b/28119922.
-    for (int i = 0; i < rank; ++i) {
-      int64 dim_size = input_shape.dim_size(i);
-      OP_REQUIRES(
-          ctx, FastBoundsCheck(dim_size, std::numeric_limits<int32>::max()),
-          errors::InvalidArgument("Shape does not support tensors > int32max",
-                                  " but dim ", i, " is ", dim_size));
-      vec(i) = static_cast<int32>(dim_size);
-    }
-
+    Tensor shape_constant(out_dtype_, TensorShape({input_shape.dims()}));
+    OP_REQUIRES_OK(ctx, TensorShapeToConstant(input_shape, &shape_constant));
     ctx->SetConstantOutput(0, shape_constant);
   }
+
+ private:
+  DataType out_dtype_;
 };
 
 REGISTER_XLA_OP(Name("Shape"), ShapeOp);
 
 class ShapeNOp : public XlaOpKernel {
  public:
-  explicit ShapeNOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+  explicit ShapeNOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("out_type", &out_dtype_));
+  }
 
   void Compile(XlaOpKernelContext* ctx) override {
     for (int i = 0; i < ctx->num_inputs(); ++i) {
-      const TensorShape shape = ctx->InputShape(i);
-      const int dims = shape.dims();
-      Tensor shape_constant(DT_INT32, TensorShape({dims}));
-      auto vec = shape_constant.vec<int32>();
-
-      // TODO(dga): support int64.  b/28119922.
-      for (int j = 0; j < dims; ++j) {
-        int64 dim_size = shape.dim_size(j);
-        OP_REQUIRES(
-            ctx, FastBoundsCheck(dim_size, std::numeric_limits<int32>::max()),
-            errors::InvalidArgument("Shape does not support tensors > int32max",
-                                    " but shape ", i, " dim ", j, " is ",
-                                    dim_size));
-        vec(j) = static_cast<int32>(dim_size);
-      }
-
+      const TensorShape input_shape = ctx->InputShape(i);
+      Tensor shape_constant(out_dtype_, TensorShape({input_shape.dims()}));
+      OP_REQUIRES_OK(ctx, TensorShapeToConstant(input_shape, &shape_constant));
       ctx->SetConstantOutput(i, shape_constant);
     }
   }
 
   bool IsExpensive() override { return false; }
+
+ private:
+  DataType out_dtype_;
 };
 REGISTER_XLA_OP(Name("ShapeN"), ShapeNOp);
 
diff --git a/tensorflow/compiler/tf2xla/kernels/shape_util.cc b/tensorflow/compiler/tf2xla/kernels/shape_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..76ea5f525598f511f295eb5a30f3cf603fbf57aa
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/shape_util.cc
@@ -0,0 +1,48 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/kernels/shape_util.h"
+
+#include <limits>
+
+#include "tensorflow/core/kernels/bounds_check.h"
+
+namespace tensorflow {
+
+Status TensorShapeToConstant(const TensorShape& input_shape,
+                             Tensor* shape_constant) {
+  const int dims = input_shape.dims();
+  if (shape_constant->dtype() == DT_INT32) {
+    auto vec = shape_constant->vec<int32>();
+    for (int i = 0; i < dims; ++i) {
+      int64 dim_size = input_shape.dim_size(i);
+      if (!FastBoundsCheck(dim_size, std::numeric_limits<int32>::max())) {
+        return errors::InvalidArgument(
+            "Shape with out_type=int32 does not support tensors > int32max",
+            " but dim ", i, " is ", dim_size);
+      }
+      vec(i) = static_cast<int32>(dim_size);
+    }
+  } else {
+    auto vec = shape_constant->vec<int64>();
+    for (int i = 0; i < dims; ++i) {
+      int64 dim_size = input_shape.dim_size(i);
+      vec(i) = dim_size;
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/shape_util.h b/tensorflow/compiler/tf2xla/kernels/shape_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..575086e118080f6799a54d3ae6409b2b641c4341
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/shape_util.h
@@ -0,0 +1,34 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_SHAPE_UTIL_H_
+#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_SHAPE_UTIL_H_
+
+#include <limits>
+
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+// Converts a TensorShape to a constant Tensor.
+//
+// The input TensorShape input_shape is used to populate the elements of
+// shape_constant, which is modified in place.
+Status TensorShapeToConstant(const TensorShape& input_shape,
+                             Tensor* shape_constant);
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_TF2XLA_KERNELS_SHAPE_UTIL_H_
diff --git a/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc b/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc
index 89befda346ec06fec23ab1d1c9d910ded8cd806d..806fda632cde64c1b37ae3b9199028d6b6b0a215 100644
--- a/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/util/tensor_format.h"
 
 namespace tensorflow {
 namespace {
@@ -23,6 +24,16 @@ namespace {
 class SpaceToDepthOp : public XlaOpKernel {
  public:
   explicit SpaceToDepthOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    string data_format_str;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+
+    OP_REQUIRES(ctx, data_format_ == FORMAT_NCHW || data_format_ == FORMAT_NHWC,
+                errors::InvalidArgument("Unsupported data format ",
+                                        ToString(data_format_),
+                                        "; expected formats NHWC or NCHW"));
+
     OP_REQUIRES_OK(ctx, ctx->GetAttr("block_size", &block_size_));
     OP_REQUIRES(
         ctx, block_size_ > 1,
@@ -31,34 +42,100 @@ class SpaceToDepthOp : public XlaOpKernel {
 
   void Compile(XlaOpKernelContext* ctx) override {
     const TensorShape input_tensor_shape = ctx->InputShape(0);
-    // The input is presumed to be [batch, height, width, depth]
     int input_rank = input_tensor_shape.dims();
     static const int kRequiredDims = 4;
     OP_REQUIRES(ctx, kRequiredDims == input_rank,
-                errors::InvalidArgument("Input rank should be: ", kRequiredDims,
-                                        " instead of: ", input_rank));
+                errors::InvalidArgument("Input rank should be ", kRequiredDims,
+                                        "; got ", input_rank));
     const gtl::InlinedVector<int64, 4> input_shape =
         input_tensor_shape.dim_sizes();
 
     xla::ComputationBuilder* b = ctx->builder();
     xla::ComputationDataHandle input = ctx->Input(0);
 
+    int feature_dim = GetTensorFeatureDimIndex(input_rank, data_format_);
+    int num_spatial_dims = GetTensorSpatialDims(input_rank, data_format_);
+
+    std::vector<int64> reshaped_shape;
+    std::vector<int64> transpose_order;
+    std::vector<int64> output_shape;
+    reshaped_shape.reserve(input_rank);
+    transpose_order.reserve(input_rank);
+    output_shape.reserve(input_rank);
+    if (data_format_ == FORMAT_NHWC) {
+      int64 block_elems = 1;
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        OP_REQUIRES(ctx, input_shape[1 + i] % block_size_ == 0,
+                    errors::InvalidArgument(
+                        "input shape[", 1 + i, "]=", input_shape[1 + i],
+                        " is not divisible by block_size=", block_size_));
+        block_elems *= block_size_;
+      }
+
+      reshaped_shape.push_back(input_shape[0]);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        reshaped_shape.push_back(input_shape[1 + i] / block_size_);
+        reshaped_shape.push_back(block_size_);
+      }
+      reshaped_shape.push_back(input_shape[feature_dim]);
+
+      transpose_order.push_back(0);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        transpose_order.push_back(i * 2 + 1);
+      }
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        transpose_order.push_back(i * 2 + 2);
+      }
+      transpose_order.push_back(feature_dim + num_spatial_dims);
+
+      output_shape.push_back(input_shape[0]);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        output_shape.push_back(input_shape[1 + i] / block_size_);
+      }
+      output_shape.push_back(input_shape[feature_dim] * block_elems);
+    } else {
+      // FORMAT_NCHW
+      int64 block_elems = 1;
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        OP_REQUIRES(ctx, input_shape[2 + i] % block_size_ == 0,
+                    errors::InvalidArgument(
+                        "input shape[", 2 + i, "]=", input_shape[2 + i],
+                        " is not divisible by block_size=", block_size_));
+        block_elems *= block_size_;
+      }
+
+      reshaped_shape.push_back(input_shape[0]);
+      reshaped_shape.push_back(input_shape[feature_dim]);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        reshaped_shape.push_back(input_shape[2 + i] / block_size_);
+        reshaped_shape.push_back(block_size_);
+      }
+
+      transpose_order.push_back(0);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        transpose_order.push_back(i * 2 + 3);
+      }
+      transpose_order.push_back(feature_dim);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        transpose_order.push_back(i * 2 + 2);
+      }
+
+      output_shape.push_back(input_shape[0]);
+      output_shape.push_back(input_shape[feature_dim] * block_elems);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        output_shape.push_back(input_shape[2 + i] / block_size_);
+      }
+    }
+
+    // Note: comments are given in NHWC format; NCHW is similar with a different
+    // dimension order.
     // 1. Reshape `input` to `reshaped` of shape:
     //
     //      [batch,
     //       input_shape[1] / block_size_, block_size_,
     //       input_shape[2] / block_size_, block_size_,
     //       depth]
-    const int block_rank = 2;
-    for (int i = 0; i < block_rank; ++i) {
-      OP_REQUIRES(ctx, input_shape[1 + i] % block_size_ == 0,
-                  errors::InvalidArgument(
-                      "input shape[", 1 + i, "]=", input_shape[1 + i],
-                      " is not divisible by block_size=", block_size_));
-    }
-    xla::ComputationDataHandle reshaped = b->Reshape(
-        input, {input_shape[0], input_shape[1] / block_size_, block_size_,
-                input_shape[2] / block_size_, block_size_, input_shape[3]});
+    xla::ComputationDataHandle reshaped = b->Reshape(input, reshaped_shape);
 
     // 2. Permute dimensions of `reshaped` to produce
     //    `permuted_reshaped` of shape:
@@ -69,7 +146,7 @@ class SpaceToDepthOp : public XlaOpKernel {
     //       block_size_, block_size_,
     //       depth]
     xla::ComputationDataHandle permuted_reshaped =
-        b->Transpose(reshaped, {0, 1, 3, 2, 4, 5});
+        b->Transpose(reshaped, transpose_order);
 
     // 3. Reshape `permuted_reshaped` to flatten `block_shape` into the
     //    batch dimension, producing an output tensor of shape:
@@ -79,15 +156,14 @@ class SpaceToDepthOp : public XlaOpKernel {
     //       input_shape[2] / block_size_,
     //       block_size_ * block_size_ * depth]
     //
-    xla::ComputationDataHandle output = b->Reshape(
-        permuted_reshaped, {input_shape[0], input_shape[1] / block_size_,
-                            input_shape[2] / block_size_,
-                            block_size_ * block_size_ * input_shape[3]});
+    xla::ComputationDataHandle output =
+        b->Reshape(permuted_reshaped, output_shape);
 
     ctx->SetOutput(0, output);
   }
 
  private:
+  TensorFormat data_format_;
   int block_size_;
 };
 REGISTER_XLA_OP(Name("SpaceToDepth"), SpaceToDepthOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b10880de77e6b9811008076cd4a959c284e558d1
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
@@ -0,0 +1,279 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cmath>
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/casts.h"
+#include "tensorflow/core/lib/math/math_util.h"
+
+namespace tensorflow {
+namespace {
+
+// Rotates a 32-bit integer 'v' left by 'distance' bits.
+xla::ComputationDataHandle RotateLeftS32(xla::ComputationBuilder* builder,
+                                         const xla::ComputationDataHandle& v,
+                                         int distance) {
+  return builder->Or(
+      builder->ShiftLeft(v, builder->ConstantR0<int>(distance)),
+      builder->ShiftRightLogical(v, builder->ConstantR0<int>(32 - distance)));
+}
+
+// TODO(b/65209188): add a primitive XOR to XLA and call it here, rather than
+// building XOR out of other bitwise operators.
+xla::ComputationDataHandle BitwiseXor(xla::ComputationBuilder* builder,
+                                      const xla::ComputationDataHandle& x,
+                                      const xla::ComputationDataHandle& y) {
+  return builder->Or(builder->And(x, builder->Not(y)),
+                     builder->And(builder->Not(x), y));
+}
+
+using ThreeFry2x32State = std::array<xla::ComputationDataHandle, 2>;
+
+// Implements the ThreeFry counter-based PRNG algorithm.
+// Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.
+// http://www.thesalmons.org/john/random123/papers/random123sc11.pdf
+ThreeFry2x32State ThreeFry2x32(xla::ComputationBuilder* builder,
+                               ThreeFry2x32State input, ThreeFry2x32State key) {
+  // Rotation distances specified by the Threefry2x32 algorithm.
+  constexpr std::array<int, 8> rotations = {13, 15, 26, 6, 17, 29, 16, 24};
+  ThreeFry2x32State x;
+
+  std::array<xla::ComputationDataHandle, 3> ks;
+  // 0x1BD11BDA is a parity constant specified by the ThreeFry2x32 algorithm.
+  ks[2] = builder->ConstantR0<int32>(0x1BD11BDA);
+  for (int i = 0; i < 2; ++i) {
+    ks[i] = key[i];
+    x[i] = input[i];
+    ks[2] = BitwiseXor(builder, ks[2], key[i]);
+  }
+
+  x[0] = builder->Add(x[0], ks[0]);
+  x[1] = builder->Add(x[1], ks[1]);
+
+  // Performs a single round of the Threefry2x32 algorithm, with a rotation
+  // amount 'rotation'.
+  auto round = [builder](ThreeFry2x32State v, int rotation) {
+    v[0] = builder->Add(v[0], v[1]);
+    v[1] = RotateLeftS32(builder, v[1], rotation);
+    v[1] = BitwiseXor(builder, v[0], v[1]);
+    return v;
+  };
+
+  // There are no known statistical flaws with 13 rounds of Threefry2x32.
+  // We are conservative and use 20 rounds.
+  x = round(x, rotations[0]);
+  x = round(x, rotations[1]);
+  x = round(x, rotations[2]);
+  x = round(x, rotations[3]);
+  x[0] = builder->Add(x[0], ks[1]);
+  x[1] = builder->Add(builder->Add(x[1], ks[2]), builder->ConstantR0<int32>(1));
+
+  x = round(x, rotations[4]);
+  x = round(x, rotations[5]);
+  x = round(x, rotations[6]);
+  x = round(x, rotations[7]);
+  x[0] = builder->Add(x[0], ks[2]);
+  x[1] = builder->Add(builder->Add(x[1], ks[0]), builder->ConstantR0<int32>(2));
+
+  x = round(x, rotations[0]);
+  x = round(x, rotations[1]);
+  x = round(x, rotations[2]);
+  x = round(x, rotations[3]);
+  x[0] = builder->Add(x[0], ks[0]);
+  x[1] = builder->Add(builder->Add(x[1], ks[1]), builder->ConstantR0<int32>(3));
+
+  x = round(x, rotations[4]);
+  x = round(x, rotations[5]);
+  x = round(x, rotations[6]);
+  x = round(x, rotations[7]);
+  x[0] = builder->Add(x[0], ks[1]);
+  x[1] = builder->Add(builder->Add(x[1], ks[2]), builder->ConstantR0<int32>(4));
+
+  x = round(x, rotations[0]);
+  x = round(x, rotations[1]);
+  x = round(x, rotations[2]);
+  x = round(x, rotations[3]);
+  x[0] = builder->Add(x[0], ks[2]);
+  x[1] = builder->Add(builder->Add(x[1], ks[0]), builder->ConstantR0<int32>(5));
+
+  return x;
+}
+
+// Returns a tensor of 'shape' random values uniformly distributed in the range
+// [minval, maxval)
+xla::ComputationDataHandle RandomUniform(xla::ComputationBuilder* builder,
+                                         const xla::ComputationDataHandle& seed,
+                                         const TensorShape& shape,
+                                         double minval, double maxval) {
+  // Split the seed into two 32-bit scalars to form a key.
+  auto seed0 = builder->Reshape(builder->Slice(seed, {0}, {1}, {1}), {});
+  auto seed1 = builder->Reshape(builder->Slice(seed, {1}, {2}, {1}), {});
+  ThreeFry2x32State key = {seed0, seed1};
+  const int64 size = shape.num_elements();
+
+  const int64 half_size = MathUtil::CeilOfRatio<int64>(size, 2);
+  const bool size_is_odd = (half_size * 2 != size);
+
+  // Fill the generator inputs with unique counter values.
+  ThreeFry2x32State inputs;
+  TF_CHECK_OK(XlaHelpers::Iota(builder, DT_INT32, half_size, &inputs[0]));
+  inputs[1] = builder->Add(inputs[0], builder->ConstantR0<int32>(half_size));
+  ThreeFry2x32State outputs = ThreeFry2x32(builder, inputs, key);
+
+  if (size_is_odd) {
+    outputs[1] = builder->Slice(outputs[1], {0}, {half_size - 1}, {1});
+  }
+
+  auto bits =
+      builder->Reshape(builder->ConcatInDim(outputs, 0), shape.dim_sizes());
+
+  // Form 22 random mantissa bits, with a leading 1 bit. The leading 1 bit
+  // forces the random bits into the mantissa.
+  constexpr int kFloatBits = 32;
+  constexpr int kMantissaBits = 23;
+  bits = builder->Or(
+      builder->ShiftRightLogical(
+          bits, builder->ConstantR0<int32>(kFloatBits - kMantissaBits)),
+      builder->ConstantR0<int32>(bit_cast<int32>(1.0f)));
+  auto floats = builder->BitcastConvertType(bits, xla::F32);
+
+  // We have a floating point number in the range [1.0, 2.0).
+  // Subtract 1.0f to shift to the range [0.0, 1.0)
+  floats = builder->Sub(floats, builder->ConstantR0<float>(1.0f));
+  // Multiply and add to shift to the range [minval, maxval).
+  floats = builder->Mul(floats, builder->ConstantR0<float>(maxval - minval));
+  floats = builder->Add(floats, builder->ConstantR0<float>(minval));
+  return floats;
+}
+
+// Approximation for the inverse error function from
+//   Giles, M., "Approximating the erfinv function".
+// The approximation has the form:
+//   w = -log((1 - x) * (1 + x))
+//   if ( w < 5 ) {
+//     w = w - 2.5
+//     p = sum_{i=1}^n lq[i]*w^i
+//   } else {
+//     w = sqrt(w) - 3
+//     p = sum_{i=1}^n gq[i]*w^i
+//   }
+//   return p*x
+xla::ComputationDataHandle ErfInvF32(xla::ComputationBuilder* b,
+                                     const xla::ComputationDataHandle& x,
+                                     const TensorShape& shape) {
+  constexpr int kDegree = 9;
+  constexpr std::array<float, 9> w_less_than_5_constants = {
+      2.81022636e-08f,  3.43273939e-07f, -3.5233877e-06f,
+      -4.39150654e-06f, 0.00021858087f,  -0.00125372503f,
+      -0.00417768164f,  0.246640727f,    1.50140941f};
+  constexpr std::array<float, 9> w_greater_than_5_constants = {
+      -0.000200214257f, 0.000100950558f, 0.00134934322f,
+      -0.00367342844f,  0.00573950773f,  -0.0076224613f,
+      0.00943887047f,   1.00167406f,     2.83297682f};
+
+  auto one = b->ConstantR0<float>(1.0);
+  auto w = b->Neg(b->Log(b->Mul(b->Sub(one, x), b->Add(one, x))));
+
+  auto lt = b->Lt(w, b->ConstantR0<float>(5.0));
+  auto coefficient = [&](int i) {
+    return b->Select(
+        lt,
+        b->Broadcast(b->ConstantR0<float>(w_less_than_5_constants[i]),
+                     shape.dim_sizes()),
+        b->Broadcast(b->ConstantR0<float>(w_greater_than_5_constants[i]),
+                     shape.dim_sizes()));
+  };
+  w = b->Select(lt, b->Sub(w, b->ConstantR0<float>(2.5f)),
+                b->Sub(b->SqrtF32(w), b->ConstantR0<float>(3.0f)));
+  auto p = coefficient(0);
+  for (int i = 1; i < kDegree; ++i) {
+    p = b->Add(coefficient(i), b->Mul(p, w));
+  }
+  return b->Mul(p, x);
+}
+
+}  // namespace
+
+class StatelessRandomUniformOp : public XlaOpKernel {
+ public:
+  explicit StatelessRandomUniformOp(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::ComputationBuilder* builder = ctx->builder();
+
+    TensorShape shape;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &shape));
+
+    TensorShape seed_shape = ctx->InputShape(1);
+    OP_REQUIRES(ctx, seed_shape.dims() == 1 && seed_shape.dim_size(0) == 2,
+                errors::InvalidArgument("seed must have shape [2], not ",
+                                        seed_shape.DebugString()));
+    xla::ComputationDataHandle seed = ctx->Input(1);
+    ctx->SetOutput(0, RandomUniform(builder, seed, shape, 0.0, 1.0));
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomUniformOp);
+};
+
+// TODO(phawkins): generalize to non-float, non-int32 seed types.
+REGISTER_XLA_OP(Name("StatelessRandomUniform")
+                    .TypeConstraint("dtype", DT_FLOAT)
+                    .TypeConstraint("Tseed", DT_INT32),
+                StatelessRandomUniformOp);
+
+class StatelessRandomNormalOp : public XlaOpKernel {
+ public:
+  explicit StatelessRandomNormalOp(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    TensorShape shape;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &shape));
+
+    TensorShape seed_shape = ctx->InputShape(1);
+    OP_REQUIRES(ctx, seed_shape == TensorShape({2}),
+                errors::InvalidArgument("seed must have shape [2], not ",
+                                        seed_shape.DebugString()));
+    xla::ComputationDataHandle seed = ctx->Input(1);
+    xla::ComputationBuilder* builder = ctx->builder();
+    auto uniform = RandomUniform(builder, seed, shape, -1.0, 1.0);
+    // Convert uniform distribution to normal distribution by computing
+    // sqrt(2) * erfinv(x)
+    auto normal = builder->Mul(builder->ConstantR0<float>(std::sqrt(2.0)),
+                               ErfInvF32(builder, uniform, shape));
+    ctx->SetOutput(0, normal);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomNormalOp);
+};
+
+// TODO(phawkins): generalize to non-float, non-int32 seed types.
+REGISTER_XLA_OP(Name("StatelessRandomNormal")
+                    .TypeConstraint("dtype", DT_FLOAT)
+                    .TypeConstraint("Tseed", DT_INT32),
+                StatelessRandomNormalOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
index 351fda251798e43b607fb445f2c98abd57b3d86b..03c22354a9425189e6cf7ee5a7201c90ecb1908d 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
@@ -311,6 +311,32 @@ class TensorArrayGatherOp : public XlaOpKernel {
 
     xla::ComputationDataHandle ta = resource->value;
 
+    // Look for the case where the gather takes a simple slice from the
+    // tensor array (0, 1, 2, 3, 4, ..., N)
+    std::vector<int64> const_indices;
+    Status status = ctx->ConstantInputAsIntVector(1, &const_indices);
+    if (status.ok()) {
+      bool gather_is_dense_slice = true;
+      for (auto i = 0; i < const_indices.size(); i++) {
+        if (const_indices[i] != i) {
+          gather_is_dense_slice = false;
+          break;
+        }
+      }
+
+      if (gather_is_dense_slice) {
+        std::vector<int64> begin(ta_shape.dims(), 0);
+        std::vector<int64> strides(ta_shape.dims(), 1);
+        std::vector<int64> end(ta_shape.dims(), 1);
+        end[0] = const_indices.size();
+        for (auto i = 1; i < ta_shape.dims(); i++) {
+          end[i] = ta_shape.dim_size(i);
+        }
+        ctx->SetOutput(0, b->Slice(ta, begin, end, strides));
+        return;
+      }
+    }
+
     xla::ComputationDataHandle gather = XlaComputeGatherDynamicSlice(
         ctx, ta, ta_shape, indices, indices_shape, 0, dtype_, index_type, b);
     ctx->SetOutput(0, gather);
@@ -352,28 +378,47 @@ class TensorArrayScatterOp : public XlaOpKernel {
     const xla::ComputationDataHandle value = ctx->Input(2);
     const xla::ComputationDataHandle flow = ctx->Input(3);
 
-    auto slice_dims = value_shape.dim_sizes();
-    slice_dims[0] = 1LL;
-
-    std::vector<int64> value_starts(value_shape.dims(), 0);
-    auto value_ends = value_shape.dim_sizes();
-
-    std::vector<int64> value_strides(value_shape.dims(), 1);
-
-    // For every (index, value) pair, update the corresponding TensorArray
-    // storage.
-    for (int i = 0; i < num_indices; ++i) {
-      // Slice out part of the value.
-      value_starts[0] = i;
-      value_ends[0] = i + 1;
-      auto slice = b->Slice(value, value_starts, value_ends, value_strides);
+    // Look for the case where the scatter is for each sub-tensor in order. The
+    // tensor array implementation allows for this to be a straight addition.
+    bool scatter_all_elements_in_order = false;
+    std::vector<int64> const_indices;
+    Status status = ctx->ConstantInputAsIntVector(1, &const_indices);
+    if (status.ok() && num_indices == value_shape.dim_size(0)) {
+      scatter_all_elements_in_order = true;
+      for (auto i = 0; i < num_indices; i++) {
+        if (const_indices[i] != i) {
+          scatter_all_elements_in_order = false;
+          break;
+        }
+      }
+    }
 
-      // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
-      auto index = b->Slice(indices, {i}, {i + 1}, {1});
-      auto start_indices =
-          b->Pad(b->Reshape(index, {1}), b->ConstantR0<int32>(0),
-                 xla::MakeEdgePaddingConfig({{0, elem_shape.dims()}}));
-      ta = DynamicAddSlice(b, ta, slice, slice_dims, start_indices);
+    if (scatter_all_elements_in_order) {
+      ta = b->Add(ta, value);
+    } else {
+      auto slice_dims = value_shape.dim_sizes();
+      slice_dims[0] = 1LL;
+
+      std::vector<int64> value_starts(value_shape.dims(), 0);
+      auto value_ends = value_shape.dim_sizes();
+
+      std::vector<int64> value_strides(value_shape.dims(), 1);
+
+      // For every (index, value) pair, update the corresponding TensorArray
+      // storage.
+      for (int i = 0; i < num_indices; ++i) {
+        // Slice out part of the value.
+        value_starts[0] = i;
+        value_ends[0] = i + 1;
+        auto slice = b->Slice(value, value_starts, value_ends, value_strides);
+
+        // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
+        auto index = b->Slice(indices, {i}, {i + 1}, {1});
+        auto start_indices =
+                b->Pad(b->Reshape(index, {1}), b->ConstantR0<int32>(0),
+                       xla::MakeEdgePaddingConfig({{0, elem_shape.dims()}}));
+        ta = DynamicAddSlice(b, ta, slice, slice_dims, start_indices);
+      }
     }
 
     resource->value = ta;
diff --git a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
index b19ea22f50d2dd44e8d1d81f5930263f364030e1..68847ae7a2cb926edd9d29007e24b0db7fb5a75f 100644
--- a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h"
 #include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h"
+#include "tensorflow/compiler/tf2xla/kernels/shape_util.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
@@ -22,6 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/no_op.h"
 
 namespace tensorflow {
@@ -121,5 +123,26 @@ class ResourceGatherOp : public XlaOpKernel {
 REGISTER_XLA_OP(Name("ResourceGather").TypeConstraint("dtype", kNumericTypes),
                 ResourceGatherOp);
 
+class VariableShapeOp : public XlaOpKernel {
+ public:
+  explicit VariableShapeOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("out_type", &out_dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    DataType variable_dtype;
+    TensorShape shape;
+    OP_REQUIRES_OK(ctx,
+                   ctx->GetVariableTypeAndShape(0, &variable_dtype, &shape));
+    Tensor shape_constant(out_dtype_, TensorShape({shape.dims()}));
+    OP_REQUIRES_OK(ctx, TensorShapeToConstant(shape, &shape_constant));
+    ctx->SetConstantOutput(0, shape_constant);
+  }
+
+ private:
+  DataType out_dtype_;
+};
+
+REGISTER_XLA_OP(Name("VariableShape"), VariableShapeOp);
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.cc b/tensorflow/compiler/tf2xla/lib/batch_dot.cc
index 28a5e6a58bb312f4c4821bcce484a08160009d56..9b0e6174475c22e325c090bec5f1d56822e106bc 100644
--- a/tensorflow/compiler/tf2xla/lib/batch_dot.cc
+++ b/tensorflow/compiler/tf2xla/lib/batch_dot.cc
@@ -27,7 +27,6 @@ namespace tensorflow {
 
 // The current implementation simply unrolls the computation along the batch
 // dimension.
-// TODO(andydavis): add batching support to XLA's Dot operator.
 xla::StatusOr<xla::ComputationDataHandle> BatchDot(
     xla::ComputationBuilder* builder, xla::ComputationDataHandle x,
     xla::ComputationDataHandle y, bool transpose_x, bool transpose_y) {
@@ -52,26 +51,20 @@ xla::StatusOr<xla::ComputationDataHandle> BatchDot(
 
   // The batch dimensions must be equal and the matrix dimensions must be
   // valid.
-  std::vector<int64> dimensions;
-  int64 batch_count = 1;
+  std::vector<int64> batch_dimension_numbers;
   for (int i = 0; i < ndims - 2; ++i) {
-    int64 x_size = x_shape->dimensions(i);
-    int64 y_size = y_shape->dimensions(i);
-    if (x_size != y_size) {
+    if (x_shape->dimensions(i) != y_shape->dimensions(i)) {
       return errors::InvalidArgument(
           "Dimension ", i, " of inputs to BatchedDot must be equal: ",
           xla::ShapeUtil::HumanString(*x_shape), " vs ",
           xla::ShapeUtil::HumanString(*y_shape));
     }
-    dimensions.push_back(x_size);
-    batch_count *= x_size;
+    batch_dimension_numbers.push_back(i);
   }
 
   int x_inner_dim = transpose_x ? (ndims - 2) : (ndims - 1);
   int y_inner_dim = transpose_y ? (ndims - 1) : (ndims - 2);
-  int64 x_inner_dim_size = x_shape->dimensions(x_inner_dim);
-  int64 y_inner_dim_size = y_shape->dimensions(y_inner_dim);
-  if (x_inner_dim_size != y_inner_dim_size) {
+  if (x_shape->dimensions(x_inner_dim) != y_shape->dimensions(y_inner_dim)) {
     return errors::InvalidArgument(
         "Dimensions ", x_inner_dim, " and ", y_inner_dim,
         " of arguments to BatchedDot must be equal: ",
@@ -80,19 +73,22 @@ xla::StatusOr<xla::ComputationDataHandle> BatchDot(
         " transpose: ", transpose_y);
   }
 
-  // If there are no batch dimensions, use a regular Dot. This case exists
-  // to improve the readability of the emitted graphs.
-  if (dimensions.empty()) {
-    auto lhs = transpose_x ? builder->Transpose(x, {1, 0}) : x;
-    auto rhs = transpose_y ? builder->Transpose(y, {1, 0}) : y;
-    return builder->Dot(lhs, rhs);
+  // Check for zero lhs/rhs dim size.
+  if (xla::ShapeUtil::HasZeroElements(*x_shape) ||
+      xla::ShapeUtil::HasZeroElements(*y_shape)) {
+    std::vector<int64> dimensions(batch_dimension_numbers.size());
+    for (int i = 0; i < batch_dimension_numbers.size(); ++i) {
+      dimensions[i] = x_shape->dimensions(batch_dimension_numbers[i]);
+    }
+    int x_outer_dim = transpose_x ? (ndims - 1) : (ndims - 2);
+    int y_outer_dim = transpose_y ? (ndims - 2) : (ndims - 1);
+    dimensions.push_back(x_shape->dimensions(x_outer_dim));
+    dimensions.push_back(y_shape->dimensions(y_outer_dim));
+    return builder->Broadcast(
+        builder->ConstantLiteral(xla::Literal::Zero(x_shape->element_type())),
+        dimensions);
   }
 
-  int x_outer_dim = transpose_x ? (ndims - 1) : (ndims - 2);
-  int y_outer_dim = transpose_y ? (ndims - 2) : (ndims - 1);
-  dimensions.push_back(x_shape->dimensions(x_outer_dim));
-  dimensions.push_back(y_shape->dimensions(y_outer_dim));
-
   if (x_shape->element_type() == xla::C64 && transpose_x) {
     x = builder->Conj(x);
   }
@@ -100,55 +96,23 @@ xla::StatusOr<xla::ComputationDataHandle> BatchDot(
     y = builder->Conj(y);
   }
 
-  // Reshape input tensors into 3D tensors by flattening the batch
-  // dimensions. This makes it easier to unroll the batch dimension.
-  auto x_flat =
-      builder->Reshape(x, {batch_count, x_shape->dimensions(ndims - 2),
-                           x_shape->dimensions(ndims - 1)});
-  auto y_flat =
-      builder->Reshape(y, {batch_count, y_shape->dimensions(ndims - 2),
-                           y_shape->dimensions(ndims - 1)});
-
-  // Slice batches into individual matrices and multiply them.
-  std::vector<xla::ComputationDataHandle> out_slices;
-  for (int64 i = 0; i < batch_count; ++i) {
-    // Slice off individual matrices and reshape to 2D tensors.
-    auto x_slice = builder->Slice(
-        x_flat, {i, 0, 0},
-        {i + 1, x_shape->dimensions(ndims - 2), x_shape->dimensions(ndims - 1)},
-        {1, 1, 1});
-    x_slice = builder->Reshape(x_slice, {x_shape->dimensions(ndims - 2),
-                                         x_shape->dimensions(ndims - 1)});
-    auto y_slice = builder->Slice(
-        y_flat, {i, 0, 0},
-        {i + 1, y_shape->dimensions(ndims - 2), y_shape->dimensions(ndims - 1)},
-        {1, 1, 1});
-    y_slice = builder->Reshape(y_slice, {y_shape->dimensions(ndims - 2),
-                                         y_shape->dimensions(ndims - 1)});
-
-    // Transpose if needed.
-    auto lhs = transpose_x ? builder->Transpose(x_slice, {1, 0}) : x_slice;
-    auto rhs = transpose_y ? builder->Transpose(y_slice, {1, 0}) : y_slice;
-
-    // Multiply matrices and add an outer singleton dimension to the output
-    // so we can concatenate along the flattened batch dimension later.
-    auto out = builder->Dot(lhs, rhs);
-    out = builder->Reshape(out,
-                           {1, dimensions[ndims - 2], dimensions[ndims - 1]});
-    out_slices.push_back(out);
+  // If there are no batch dimensions, use a regular Dot.
+  // TODO(b/69062148) Remove this code when Dot emitters can be passed
+  // dimensions to transpose directly (i.e. without requiring a Transpose HLO).
+  if (batch_dimension_numbers.empty()) {
+    auto lhs = transpose_x ? builder->Transpose(x, {1, 0}) : x;
+    auto rhs = transpose_y ? builder->Transpose(y, {1, 0}) : y;
+    return builder->Dot(lhs, rhs);
   }
 
-  // Concatenate output slices and reshape to original number of dimensions.
-  xla::ComputationDataHandle data;
-  if (out_slices.empty()) {
-    // It is illegal to pass an empty list to ConcatInDim.
-    // The batch count is empty, so both inputs must have zero elements.
-    // Arbitrarily use the left input as the argument to Reshape().
-    data = x;
-  } else {
-    data = builder->ConcatInDim(out_slices, 0);
+  xla::DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(x_inner_dim);
+  dot_dnums.add_rhs_contracting_dimensions(y_inner_dim);
+  for (auto batch_dimension_number : batch_dimension_numbers) {
+    dot_dnums.add_lhs_batch_dimensions(batch_dimension_number);
+    dot_dnums.add_rhs_batch_dimensions(batch_dimension_number);
   }
-  return builder->Reshape(data, dimensions);
+  return builder->DotGeneral(x, y, dot_dnums);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/util.cc b/tensorflow/compiler/tf2xla/lib/util.cc
index 7ffe0aa6df9b21c4311eb6c8d311fba1e115b3f4..ce24b61b5dc7176f3caa05e3eb9257399fef7926 100644
--- a/tensorflow/compiler/tf2xla/lib/util.cc
+++ b/tensorflow/compiler/tf2xla/lib/util.cc
@@ -28,7 +28,7 @@ limitations under the License.
 namespace tensorflow {
 
 xla::ComputationDataHandle Zeros(xla::ComputationBuilder* builder,
-                                 xla::Shape& shape) {
+                                 const xla::Shape& shape) {
   return builder->Broadcast(
       builder->ConstantLiteral(xla::Literal::Zero(shape.element_type())),
       xla::AsInt64Slice(shape.dimensions()));
@@ -40,6 +40,9 @@ xla::ComputationDataHandle FloatLiteral(xla::ComputationBuilder* builder,
     case xla::F16:
       return builder->ConstantR0<xla::half>(static_cast<xla::half>(value));
       break;
+    case xla::BF16:
+      return builder->ConstantR0<bfloat16>(static_cast<bfloat16>(value));
+      break;
     case xla::F32:
       return builder->ConstantR0<float>(static_cast<float>(value));
       break;
diff --git a/tensorflow/compiler/tf2xla/lib/util.h b/tensorflow/compiler/tf2xla/lib/util.h
index 8fba6b5cf247e9b2c26533c53ece8b0d7d4f4c36..fb138b4f736500aac8184770d97fbf930ced69ea 100644
--- a/tensorflow/compiler/tf2xla/lib/util.h
+++ b/tensorflow/compiler/tf2xla/lib/util.h
@@ -25,7 +25,7 @@ namespace tensorflow {
 
 // Returns a zero-filled tensor with shape `shape`.
 xla::ComputationDataHandle Zeros(xla::ComputationBuilder* builder,
-                                 xla::Shape& shape);
+                                 const xla::Shape& shape);
 
 // Returns a floating point scalar constant of 'type' with 'value'.
 // If 'type' is complex, returns a real value with zero imaginary component.
diff --git a/tensorflow/compiler/tf2xla/sharding_util.cc b/tensorflow/compiler/tf2xla/sharding_util.cc
index d9c839b61019b92b6de3a77a7bec610ae848a9a4..b08a7583cb5ab7efa30a1fa27b973d04992584a7 100644
--- a/tensorflow/compiler/tf2xla/sharding_util.cc
+++ b/tensorflow/compiler/tf2xla/sharding_util.cc
@@ -14,34 +14,59 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/compiler/tf2xla/sharding_util.h"
 
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
 namespace tensorflow {
+namespace {
+const char kDeviceSuffixReplicatedCore[] = "REPLICATED_CORE";
+const char kShardingAttribute[] = "_XlaSharding";
+}  // namespace
 
-static const char DEVICE_SUFFIX_REPLICATED_CORE[] = "REPLICATED_CORE";
+namespace {
+xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
+GetShardingFromNodeDef(const NodeDef& node_def) {
+  if (!HasNodeAttr(node_def, kShardingAttribute)) {
+    return tensorflow::gtl::optional<xla::OpSharding>();
+  }
+  string value;
+  xla::OpSharding sharding;
+  TF_RETURN_IF_ERROR(GetNodeAttr(node_def, kShardingAttribute, &value));
+  if (!sharding.ParseFromString(value)) {
+    return xla::InvalidArgument(
+        "Experimental _XlaSharding attribute was not a valid encoded "
+        "xla::OpSharding proto.");
+  }
+  return tensorflow::gtl::optional<xla::OpSharding>(sharding);
+}
 
-static Status CoreOutOfRangeError(int core, int num_cores_per_replica) {
+Status CoreOutOfRangeError(int core, int num_cores_per_replica) {
   return errors::InvalidArgument(
       "Invalid replicated core id: ", core,
       "; num_cores_per_replica=", num_cores_per_replica);
 }
+}  // namespace
 
 xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
-ParseShardingFromDevice(const string& device_name, int num_cores_per_replica) {
+ParseShardingFromDevice(
+    const string& device_name, int num_cores_per_replica,
+    tensorflow::gtl::optional<xla::OpSharding> explicit_sharding) {
   if (device_name.empty()) {
     return tensorflow::gtl::optional<xla::OpSharding>();
   }
-
   DeviceNameUtils::ParsedName parsed_device;
   if (!DeviceNameUtils::ParseFullName(device_name, &parsed_device)) {
     return errors::InvalidArgument("Malformed assigned device '", device_name,
                                    "'");
   }
-  if (!parsed_device.has_type ||
-      !StringPiece(parsed_device.type)
-           .ends_with(DEVICE_SUFFIX_REPLICATED_CORE)) {
+
+  if (explicit_sharding.has_value()) {
+    return explicit_sharding;
+  } else if (!parsed_device.has_type || !parsed_device.has_id ||
+             !StringPiece(parsed_device.type)
+                  .contains(kDeviceSuffixReplicatedCore)) {
     return tensorflow::gtl::optional<xla::OpSharding>();
   } else {
     const int core = parsed_device.id;
@@ -53,20 +78,34 @@ ParseShardingFromDevice(const string& device_name, int num_cores_per_replica) {
   }
 }
 
+xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
+ParseShardingFromDevice(const NodeDef& node_def, int num_cores_per_replica) {
+  const string& device_name = node_def.device();
+  TF_ASSIGN_OR_RETURN(tensorflow::gtl::optional<xla::OpSharding> sharding,
+                      GetShardingFromNodeDef(node_def));
+  return ParseShardingFromDevice(device_name, num_cores_per_replica, sharding);
+}
+
 xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
 ParseShardingFromDevice(const Node& node, int num_cores_per_replica) {
   string device_name = node.assigned_device_name();
   if (device_name.empty()) {
     device_name = node.requested_device();
   }
-  return ParseShardingFromDevice(device_name, num_cores_per_replica);
+  TF_ASSIGN_OR_RETURN(tensorflow::gtl::optional<xla::OpSharding> sharding,
+                      GetShardingFromNodeDef(node.def()));
+  return ParseShardingFromDevice(device_name, num_cores_per_replica, sharding);
 }
+
 void SetShardingDeviceAssignmentFromNode(const Node& src, Node* dst) {
   string device_name = src.assigned_device_name();
   if (device_name.empty()) {
     device_name = src.requested_device();
   }
   dst->set_assigned_device_name(device_name);
+  if (const AttrValue* attr = src.attrs().Find(kShardingAttribute)) {
+    dst->AddAttr(kShardingAttribute, *attr);
+  }
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/sharding_util.h b/tensorflow/compiler/tf2xla/sharding_util.h
index f6468bba9f950fec88dcc6b3ec760f014d3a0ef3..9e430e30a1247c7d01910b6d57f7c577964e1dd1 100644
--- a/tensorflow/compiler/tf2xla/sharding_util.h
+++ b/tensorflow/compiler/tf2xla/sharding_util.h
@@ -29,14 +29,21 @@ namespace tensorflow {
 // - if the device name is invalid.
 // - the core is parsed and is out of the range [0, num_cores_per_replica).
 //
-// Otherwise, returns either a non-value or a sharding set as per
-// xla:ShardingBuilder::AssignDevice.
+// Otherwise, returns either:
+// - explicit_sharding if explicit_sharding.has_value()
+// - a non-value if there is no assigned core or
+// - a sharding set as per xla::ShardingBuilder::AssignDevice.
 xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
-ParseShardingFromDevice(const string& device_name, int num_cores_per_replica);
+ParseShardingFromDevice(const string& device_name, int num_cores_per_replica,
+                        tensorflow::gtl::optional<xla::OpSharding>
+                            explicit_sharding = tensorflow::gtl::nullopt);
 
 xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
 ParseShardingFromDevice(const Node& node, int num_cores_per_replica);
 
+xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
+ParseShardingFromDevice(const NodeDef& node_def, int num_cores_per_replica);
+
 void SetShardingDeviceAssignmentFromNode(const Node& src, Node* dst);
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/tf2xla.cc b/tensorflow/compiler/tf2xla/tf2xla.cc
index a14c93a2b9494b89f579bc20ee0510c136f8f01b..906f2290433face4cce3296b2f815d50d8c496ce 100644
--- a/tensorflow/compiler/tf2xla/tf2xla.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla.cc
@@ -253,8 +253,7 @@ Status CreateXlaArgs(const Graph& graph,
 // Converts the TensorFlow graph into an XLA computation, by executing the
 // graph symbolically, with each op building up the XLA HLO.
 Status ConvertGraphToXla(std::unique_ptr<Graph> graph, xla::Client* client,
-                         xla::Computation* computation,
-                         bool* requires_runtime_context) {
+                         xla::Computation* computation) {
   XlaOpRegistry::RegisterCompilationKernels();
   for (Node* node : graph->nodes()) {
     node->set_assigned_device_name(
@@ -277,7 +276,6 @@ Status ConvertGraphToXla(std::unique_ptr<Graph> graph, xla::Client* client,
   TF_RETURN_IF_ERROR(compiler.CompileGraph(XlaCompiler::CompileOptions(),
                                            "tfcompile", std::move(graph),
                                            xla_args, &result));
-  *requires_runtime_context = result.requires_runtime_context;
   *computation = std::move(*result.computation);
 
   int num_const_results = 0;
@@ -352,12 +350,10 @@ Status InitGraph(const GraphDef& graph_def, const tf2xla::Config& config,
 
 Status ConvertGraphDefToXla(const GraphDef& graph_def,
                             const tf2xla::Config& config, xla::Client* client,
-                            xla::Computation* computation,
-                            bool* requires_runtime_context) {
+                            xla::Computation* computation) {
   std::unique_ptr<Graph> graph;
   TF_RETURN_IF_ERROR(InitGraph(graph_def, config, &graph));
-  TF_RETURN_IF_ERROR(ConvertGraphToXla(std::move(graph), client, computation,
-                                       requires_runtime_context));
+  TF_RETURN_IF_ERROR(ConvertGraphToXla(std::move(graph), client, computation));
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/tf2xla.h b/tensorflow/compiler/tf2xla/tf2xla.h
index ab99beebf7946237425d4d304a858ac6817177b8..473c431b12d441c652f1d0d6c11c5e87836ab36d 100644
--- a/tensorflow/compiler/tf2xla/tf2xla.h
+++ b/tensorflow/compiler/tf2xla/tf2xla.h
@@ -30,13 +30,9 @@ namespace tensorflow {
 //
 // The computation is built in the context of the given `client`, which may
 // subsequently be used to compile or execute the computation.
-//
-// If `requires_runtime_context` is filled with true, this indicates the last
-// argument of the computation is XlaLocalRuntimeContext*.
 Status ConvertGraphDefToXla(const GraphDef& graph_def,
                             const tf2xla::Config& config, xla::Client* client,
-                            xla::Computation* computation,
-                            bool* requires_runtime_context);
+                            xla::Computation* computation);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/tf2xla/tf2xla_supported_ops.cc b/tensorflow/compiler/tf2xla/tf2xla_supported_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7aca889a266439538c4cd1c153460e6cc871b246
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/tf2xla_supported_ops.cc
@@ -0,0 +1,97 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/tf2xla_supported_ops.h"
+
+#include <algorithm>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/framework/kernel_def.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace tensorflow {
+namespace tf2xla {
+namespace {
+
+void PrintSupportedOps(const string& device, const string& regen_run) {
+  XlaOpRegistry::RegisterCompilationKernels();
+
+  std::vector<const KernelDef*> kdefs =
+      XlaOpRegistry::DeviceKernels(device,
+                                   /*include_compilation_only_kernels=*/true);
+  std::sort(
+      kdefs.begin(), kdefs.end(),
+      [](const KernelDef* a, const KernelDef* b) { return a->op() < b->op(); });
+
+  std::cout << "**Supported operators for device: " << device << "**\n\n"
+            << "Operator | Type Constraint\n"
+            << "-------- | ---------------" << std::endl;
+  for (const KernelDef* kdef : kdefs) {
+    std::vector<string> constraints;
+    for (const KernelDef::AttrConstraint& constraint : kdef->constraint()) {
+      std::vector<string> types;
+      for (int type : constraint.allowed_values().list().type()) {
+        types.push_back(DataTypeString(static_cast<DataType>(type)));
+      }
+      std::sort(types.begin(), types.end());
+      constraints.push_back("`" + constraint.name() + "={" +
+                            str_util::Join(types, ",") + "}`");
+    }
+    std::cout << "`" << kdef->op() << "` | "
+              << str_util::Join(constraints, "<br>") << std::endl;
+  }
+
+  std::cout << "\nTo regenerate this table, run:\n\n```shell\n"
+            << regen_run << " --device=" << device << "\n```" << std::endl;
+}
+
+}  // namespace
+
+void SupportedOpsMain(int argc, char** argv, const char* regen_run) {
+  std::vector<string> device_names = XlaOpRegistry::BackendNames();
+  std::sort(device_names.begin(), device_names.end());
+
+  // Set up and parse flags.
+  string device;
+  std::vector<Flag> flag_list = {
+      {"device", &device,
+       "Name of the compilation device for which to print supported ops, "
+       "one of: " +
+           str_util::Join(device_names, ",")},
+  };
+  string usage = Flags::Usage(argv[0], flag_list);
+  bool parsed_flags_ok = Flags::Parse(&argc, argv, flag_list);
+  QCHECK(parsed_flags_ok) << "\n" << usage;
+  QCHECK(XlaOpRegistry::IsBackendRegistered(device))
+      << "\nUnknown device: " << device << "\n"
+      << usage;
+
+  // Run the program.
+  port::InitMain(usage.c_str(), &argc, &argv);
+  QCHECK(argc == 1) << "\nERROR: This command does not take any arguments "
+                       "other than flags\n\n"
+                    << usage;
+  PrintSupportedOps(device, regen_run);
+}
+
+}  // namespace tf2xla
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/tf2xla_supported_ops.h b/tensorflow/compiler/tf2xla/tf2xla_supported_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..1b45fb4cdd3b0173b04e130b7416874a9a406dc5
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/tf2xla_supported_ops.h
@@ -0,0 +1,33 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_TF2XLA_SUPPORTED_OPS_H_
+#define TENSORFLOW_COMPILER_TF2XLA_TF2XLA_SUPPORTED_OPS_H_
+
+namespace tensorflow {
+namespace tf2xla {
+
+// The implementation of a main function for a binary that prints a table of
+// supported tf2xla operators for a given device, along with their type
+// constraints, to stdout.
+//
+// Pass the argc and argv from main, unmodified.  Use regen_run to specify the
+// command used to regenerate the table.
+void SupportedOpsMain(int argc, char** argv, const char* regen_run);
+
+}  // namespace tf2xla
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_TF2XLA_SUPPORTED_OPS_H_
diff --git a/tensorflow/compiler/tf2xla/tf2xla_supported_ops_main.cc b/tensorflow/compiler/tf2xla/tf2xla_supported_ops_main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..690666c2400d45e33c1a5d1818b68a86a70a5be3
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/tf2xla_supported_ops_main.cc
@@ -0,0 +1,22 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/tf2xla_supported_ops.h"
+
+int main(int argc, char** argv) {
+  const char* regen_run =
+      "bazel run -c opt -- tensorflow/compiler/tf2xla:tf2xla_supported_ops";
+  tensorflow::tf2xla::SupportedOpsMain(argc, argv, regen_run);
+}
diff --git a/tensorflow/compiler/tf2xla/tf2xla_test.cc b/tensorflow/compiler/tf2xla/tf2xla_test.cc
index ecd15652fe84b0c19d2f7fc18f877236547f9be9..a9978e697b091715ce120f0d18fdddd259e08b32 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_test.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_test.cc
@@ -70,10 +70,7 @@ TEST(ConvertGraphDefToXla, Sum) {
 
   xla::LocalClient* client = xla::ClientLibrary::LocalClientOrDie();
   xla::Computation computation;
-  bool requires_runtime_context;
-  TF_EXPECT_OK(ConvertGraphDefToXla(graph_def, config, client, &computation,
-                                    &requires_runtime_context));
-  ASSERT_FALSE(requires_runtime_context);
+  TF_EXPECT_OK(ConvertGraphDefToXla(graph_def, config, client, &computation));
 
   // Set up arguments.
   auto x_literal = xla::Literal::CreateR0<int32>(10);
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc
index 55f2f3149c6ba7bfa18608f961c8a76103a50756..f428a194328935fec1210ea96245344de859e611 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc
@@ -88,8 +88,8 @@ Status ValidateConfig(const tf2xla::Config& config) {
     TF_RETURN_IF_ERROR(CheckNameDuplicates("fetch", fetch.name(), &names));
   }
   TF_RETURN_IF_ERROR(CheckFeedFetchNameConflicts("fetch", names));
-  if (config.feed().empty() || config.fetch().empty()) {
-    return errors::InvalidArgument("feeds and fetches must be specified");
+  if (config.fetch().empty()) {
+    return errors::InvalidArgument("fetches must be specified");
   }
   return Status::OK();
 }
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
index 436039e154842443f779aba276bc571fc2ab7537..ed10d80609641b090cf78bf2e17364fe2fa89c31 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
@@ -58,24 +58,14 @@ TEST(ValidateConfig, Good) {
 
 TEST(ValidateConfig, BadEmpty) {
   tf2xla::Config config;
-  ExpectErrorContains(ValidateConfig(config),
-                      "feeds and fetches must be specified");
-}
-
-TEST(ValidateConfig, BadNoFeed) {
-  tf2xla::Config config;
-  tf2xla::Fetch* fetch = config.add_fetch();
-  fetch->mutable_id()->set_node_name("foo");
-  ExpectErrorContains(ValidateConfig(config),
-                      "feeds and fetches must be specified");
+  ExpectErrorContains(ValidateConfig(config), "fetches must be specified");
 }
 
 TEST(ValidateConfig, BadNoFetch) {
   tf2xla::Config config;
   tf2xla::Feed* feed = config.add_feed();
   feed->mutable_id()->set_node_name("foo");
-  ExpectErrorContains(ValidateConfig(config),
-                      "feeds and fetches must be specified");
+  ExpectErrorContains(ValidateConfig(config), "fetches must be specified");
 }
 
 TEST(ValidateConfig, BadFeedNodeName) {
diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
index 4f32c29954b2d809d31ef8c584b6a6c3dcdf5cef..cc459dc87c00f19230c65341d53da213e07fe364 100644
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
@@ -100,7 +100,7 @@ void XlaCompilationDevice::Compute(OpKernel* op_kernel,
   b->SetOpMetadata(metadata);
 
   auto sharding_parse_result = ParseShardingFromDevice(
-      op_kernel->requested_device(), std::numeric_limits<int>::max());
+      op_kernel->def(), std::numeric_limits<int>::max());
   OP_REQUIRES_OK(context, sharding_parse_result.status());
   tensorflow::gtl::optional<xla::OpSharding> op_sharding =
       sharding_parse_result.ValueOrDie();
diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc
index b5c17c5273bb15e20184b2fefd93880d4828105e..79da701fd244a461a60588153b601d5c1870fa89 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc
@@ -28,9 +28,10 @@ XlaCompiledCpuFunction::XlaCompiledCpuFunction(const StaticData& static_data,
       temps_(new void*[static_data.num_temps]),
       arg_names_(static_data.arg_names),
       result_names_(static_data.result_names),
-      program_shape_(static_data.program_shape) {
+      program_shape_(static_data.program_shape),
+      hlo_profile_printer_(static_data.hlo_profile_printer) {
   // Allocate arg and temp buffers.
-  if (alloc_mode == AllocMode::ARGS_RESULTS_AND_TEMPS) {
+  if (alloc_mode == AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS) {
     alloc_args_ = tensorflow::tfcompile::runtime::MallocContiguousBuffers(
         static_data.arg_sizes, static_data.num_args, args_,
         /*annotate_initialized=*/false);
@@ -39,9 +40,13 @@ XlaCompiledCpuFunction::XlaCompiledCpuFunction(const StaticData& static_data,
       static_data.temp_sizes, static_data.num_temps, temps_,
       /*annotate_initialized=*/true);
 
-  // The runtime context is always the last arg, if it is required.
-  if (static_data.requires_runtime_context) {
-    args_[static_data.num_args - 1] = &context_;
+  // If Hlo profiling is enabled the generated code expects an appropriately
+  // sized buffer to be passed in as the last argument.  If Hlo profiling is
+  // disabled the last function argument is still present in the function
+  // signature, but it is ignored by the generated code and we pass in null for
+  // it.
+  if (hlo_profiling_enabled()) {
+    profile_counters_ = new int64[static_data.profile_counters_size]();
   }
 }
 
@@ -50,6 +55,7 @@ XlaCompiledCpuFunction::~XlaCompiledCpuFunction() {
   tensorflow::tfcompile::runtime::FreeContiguous(alloc_temps_);
   delete[] args_;
   delete[] temps_;
+  delete[] profile_counters_;
 }
 
 namespace {
diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
index f49a7889222ff989144217ab10b27595f89e4311..e0ae3ed9a811bcc49ce8862037a67d293e879e57 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
@@ -16,10 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILED_CPU_FUNCTION_H_
 #define TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILED_CPU_FUNCTION_H_
 
-#include <functional>
+#include <cassert>
 #include <string>
 
-#include "tensorflow/compiler/tf2xla/xla_local_runtime_context.h"
 #include "tensorflow/compiler/xla/executable_run_options.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -27,6 +26,7 @@ limitations under the License.
 // never use this functionality.
 namespace xla {
 class ProgramShape;
+class HloProfilePrinter;
 }
 
 namespace tensorflow {
@@ -48,12 +48,10 @@ namespace tensorflow {
 class XlaCompiledCpuFunction {
  public:
   // Type of the raw function, produced by either JIT or AOT.
-  //
-  // TODO(toddw): Add support for hlo profiling, and replace std::function with
-  // a raw function pointer, for some codesize savings.
-  using RawFunction = std::function<void(
-      void* result, const xla::ExecutableRunOptions* run_options,
-      const void** args, void** temps)>;
+  using RawFunction = void (*)(void* result,
+                               const xla::ExecutableRunOptions* run_options,
+                               const void** args, void** temps,
+                               int64* profile_counters);
 
   // StaticData represents the state necessary to run an XLA-compiled
   // function. For JIT this is backed by data in XlaJitCompiledCpuFunction; for
@@ -71,9 +69,6 @@ class XlaCompiledCpuFunction {
     // The 0-based index of the result tuple, in the temp buffers.
     size_t result_index = 0;
 
-    // Is the final arg XlaLocalRuntimeContext?
-    bool requires_runtime_context = false;
-
     // [Optional] Arrays of arg and result names. These are arrays of C-style
     // strings, where the array is terminated by nullptr.
     const char** arg_names = nullptr;
@@ -81,21 +76,29 @@ class XlaCompiledCpuFunction {
 
     // [Optional] Arg and result shapes.
     const xla::ProgramShape* program_shape = nullptr;
+
+    // [Optional] Profile printer.  Null if profiling is disabled.
+    const xla::HloProfilePrinter* hlo_profile_printer = nullptr;
+
+    // [Optional] The number of profile counters expected in the profile counter
+    // buffer by the generated code and hlo_profile_printer.  0 if profiling is
+    // disabled.
+    int64 profile_counters_size = 0;
   };
 
   // AllocMode controls the buffer allocation mode.
   enum class AllocMode {
-    // Allocate all buffers - args, results and temps.
-    ARGS_RESULTS_AND_TEMPS,
+    // Allocate all buffers - args, results, profile and temps.
+    ARGS_RESULTS_PROFILES_AND_TEMPS,
 
-    // Only allocate result and temp buffers.
+    // Only allocate result, profile and temp buffers.
     // Use set_arg_data to set argument buffers before Run is called.
-    RESULTS_AND_TEMPS_ONLY,
+    RESULTS_PROFILES_AND_TEMPS_ONLY,
   };
 
   XlaCompiledCpuFunction(
       const StaticData& static_data,
-      AllocMode alloc_mode = AllocMode::ARGS_RESULTS_AND_TEMPS);
+      AllocMode alloc_mode = AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS);
   virtual ~XlaCompiledCpuFunction();
 
   XlaCompiledCpuFunction(const XlaCompiledCpuFunction&) = delete;
@@ -104,21 +107,22 @@ class XlaCompiledCpuFunction {
   // Sets the intra-op thread pool used to run individual ops concurrently.
   void set_thread_pool(const Eigen::ThreadPoolDevice* pool) {
     run_options_.set_intra_op_thread_pool(pool);
-    context_.thread_pool = pool;
   }
 
   // Runs the computation, with inputs read from arg buffers, and outputs
   // written to result buffers. Returns true on success and false on failure.
   bool Run() {
-    context_.error = false;
-    context_.error_msg.clear();
     raw_function_(temps_[result_index_], &run_options_,
-                  const_cast<const void**>(args_), temps_);
-    return !context_.error;
+                  const_cast<const void**>(args_), temps_, profile_counters_);
+    return true;
   }
 
   // Returns the error message from the previous failed Run call.
-  const string& error_msg() const { return context_.error_msg; }
+  //
+  // TODO(fschneider): For now this always returns an empty string because there
+  // is no support for error reporting in XLA. Remove this once all callers are
+  // updated.
+  string error_msg() const { return {}; }
 
   // ------------------------------
   // Arg methods for managing input buffers. Buffers are in row-major order.
@@ -141,10 +145,6 @@ class XlaCompiledCpuFunction {
   // tensorflow::tfcompile::runtime::kAlign. If possible, use the functions in
   // tensorflow/compiler/aot/runtime.h to ensure correct alignment.
   //
-  // If StaticData.requires_runtime_context==true, the final argument is an
-  // XlaLocalRuntimeContext, which is managed internally by this class, and
-  // should not be changed.
-  //
   // Aliasing of argument and result buffers is not allowed, and results in
   // undefined behavior.
   void set_arg_data(size_t index, void* data) { args_[index] = data; }
@@ -162,6 +162,16 @@ class XlaCompiledCpuFunction {
     return static_cast<const void* const*>(temps_[result_index_]);
   }
 
+  // Profile counters for this XLA computation.
+  //
+  // When Hlo profiling is enabled (`hlo_profiling_enabled()` return true in
+  // this case) these counters are non-null and are automatically populated by
+  // `Run`.  The counters can then be pretty-printed using
+  // `hlo_profile_printer()`.
+  //
+  // When Hlo profiling is disabled, this accessor returns null.
+  const int64* profile_counters() const { return profile_counters_; }
+
   // Returns the buffer for the positional result at the given `index`.
   void* result_data(size_t index) { return results()[index]; }
   const void* result_data(size_t index) const { return results()[index]; }
@@ -195,6 +205,12 @@ class XlaCompiledCpuFunction {
   // program shape isn't available.
   const xla::ProgramShape* ProgramShape() const { return program_shape_; }
 
+  bool hlo_profiling_enabled() const { return hlo_profile_printer_ != nullptr; }
+  const xla::HloProfilePrinter& hlo_profile_printer() const {
+    assert(hlo_profiling_enabled());
+    return *hlo_profile_printer_;
+  }
+
  private:
   const RawFunction raw_function_;
   const size_t result_index_;
@@ -208,14 +224,17 @@ class XlaCompiledCpuFunction {
   void* alloc_args_ = nullptr;
   void* alloc_temps_ = nullptr;
 
+  // Backing memory for profiling counters.
+  int64* profile_counters_ = nullptr;
+
   // Options and context passed to the compiled function.
   xla::ExecutableRunOptions run_options_;
-  tensorflow::XlaLocalRuntimeContext context_;
 
   // Optional metadata.
   const char** arg_names_ = nullptr;
   const char** result_names_ = nullptr;
   const xla::ProgramShape* program_shape_ = nullptr;
+  const xla::HloProfilePrinter* hlo_profile_printer_ = nullptr;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index 48cebdf74c71f974bf075e0255626ec57eb9a149..50da76e514c83912cbf864bdc3aaa7b8e4f77925 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -502,18 +502,6 @@ Status BuildComputation(
   return Status::OK();
 }
 
-void AssignMajorToMinorLayout(xla::Shape* shape) {
-  if (xla::ShapeUtil::IsTuple(*shape)) {
-    for (xla::Shape& elem_shape : *shape->mutable_tuple_shapes()) {
-      AssignMajorToMinorLayout(&elem_shape);
-    }
-  } else {
-    auto& minor_to_major = *shape->mutable_layout()->mutable_minor_to_major();
-    minor_to_major.Resize(xla::ShapeUtil::Rank(*shape), 0);
-    std::iota(minor_to_major.rbegin(), minor_to_major.rend(), 0);
-  }
-}
-
 }  // namespace
 
 Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
@@ -543,8 +531,6 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
                      options.resolve_compile_time_constants);
   core::ScopedUnref context_unref(context);
 
-  result->tuple_arg = options.use_tuple_arg;
-
   std::vector<XlaExpression> arg_expressions;
   std::vector<int> arg_cores;
   TF_RETURN_IF_ERROR(BuildArguments(
@@ -564,11 +550,6 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
       result->computation.get(), &num_computation_outputs,
       &num_nonconst_outputs, &result->resource_updates));
 
-  result->requires_runtime_context = context->has_context_parameter();
-
-  // Tuple arguments and runtime context parameters are incompatible.
-  TF_RET_CHECK(!(options.use_tuple_arg && result->requires_runtime_context));
-
   VLOG(2) << "Outputs: total: " << context->retvals().size()
           << " nonconstant: " << num_nonconst_outputs;
   result->outputs.resize(context->retvals().size());
@@ -596,7 +577,7 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
           << xla::ShapeUtil::HumanString(result->xla_output_shape);
 
   // Tensorflow expects a major-to-minor order of results.
-  AssignMajorToMinorLayout(&result->xla_output_shape);
+  xla::LayoutUtil::SetToDefaultLayout(&result->xla_output_shape);
 
   // Converts the output shapes to TensorShapes.
   int computation_output = 0;
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index ac7d4cfb127d1de8c92f3a855191c45af77888ad..380e24e96bc713af4453f92a5359995e9ab4734a 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -54,8 +54,6 @@ namespace tensorflow {
 //   +---------------------+-----------------------------------------+
 // Within each block, the arguments are arranged by the _Arg index from which
 // they were derived.
-// If `Options::requires_runtime_context` is true, then an additional runtime
-// context argument is passed as a final argument.
 //
 // The run-time outputs of the XLA computation are arranged in the following
 // order:
@@ -191,16 +189,9 @@ class XlaCompiler {
     // original arguments, and are not necessarily in the same order.)
     std::vector<int> input_mapping;
 
-    // Does the computation require the local runtime context to be passed as
-    // the last argument?
-    bool requires_runtime_context = false;
-
     // Input shapes of the computation.
     std::vector<xla::Shape> xla_input_shapes;
 
-    // Should the arguments be packed into a single tuple?
-    bool tuple_arg;
-
     // Output shape in XLA format. The output shape is always a tuple.
     xla::Shape xla_output_shape;
 
@@ -232,8 +223,7 @@ class XlaCompiler {
     int graph_def_version = TF_GRAPH_DEF_VERSION;
 
     // If 'allow_cpu_custom_calls' is true, kernels may make use of CustomCall()
-    // for CPU; additionally, an optional XlaLocalRuntimeContext* may be passed
-    // to the computation.
+    // for CPU.
     bool allow_cpu_custom_calls = false;
 
     // If not nullptr, populate_resource_manager is called with the
diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc
index 651bafd6c5d946adfedd63ebbe93e4ea016f0b37..5d19dd353fc04744e196bb50c35cb60b35d8b258 100644
--- a/tensorflow/compiler/tf2xla/xla_context.cc
+++ b/tensorflow/compiler/tf2xla/xla_context.cc
@@ -70,24 +70,6 @@ XlaContext::XlaContext(XlaCompiler* compiler, xla::ComputationBuilder* builder,
       allow_cpu_custom_calls_(allow_cpu_custom_calls),
       resolve_compile_time_constants_(resolve_compile_time_constants) {}
 
-const xla::ComputationDataHandle&
-XlaContext::GetOrCreateRuntimeContextParameter() {
-  CHECK(allow_cpu_custom_calls_);
-  if (has_context_parameter_) return context_parameter_;
-  has_context_parameter_ = true;
-
-  // Allocate the next available parameter for the context parameter.
-  int num_parameters = 0;
-  for (const XlaExpression& arg : args_) {
-    if (!arg.has_constant_value()) {
-      ++num_parameters;
-    }
-  }
-  context_parameter_ = builder_->Parameter(
-      num_parameters, xla::ShapeUtil::MakeOpaqueShape(), "tf_context");
-  return context_parameter_;
-}
-
 string XlaContext::DebugString() { return "TLA JIT context"; }
 
 // This is called by the Retval Op to associate a computed value
@@ -178,6 +160,20 @@ const xla::Computation* XlaContext::GetOrCreateAdd(const DataType type) {
   });
 }
 
+const xla::Computation* XlaContext::GetOrCreateMul(const DataType type) {
+  return LookupOrCreate(type, &mul_func_, [this, type] {
+    const string type_string = DataTypeString(type);
+    VLOG(1) << "Building Mul() for " << type_string;
+    xla::ComputationBuilder b(builder()->client(), "mul<" + type_string + ">");
+    xla::PrimitiveType xla_type;
+    TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type));
+    auto x = b.Parameter(0, xla::ShapeUtil::MakeShape(xla_type, {}), "x");
+    auto y = b.Parameter(1, xla::ShapeUtil::MakeShape(xla_type, {}), "y");
+    b.Mul(x, y);
+    return b.Build().ConsumeValueOrDie();
+  });
+}
+
 const xla::Computation* XlaContext::LookupOrCreate(
     DataType type, ComputationMap* out,
     const std::function<xla::Computation()>& create) {
diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h
index de8aafa3628e6eebdabbc508cd95a2ac86e3472f..1a7dafe8cdb56cc9b8fcd3ba6e262c21c2a07d90 100644
--- a/tensorflow/compiler/tf2xla/xla_context.h
+++ b/tensorflow/compiler/tf2xla/xla_context.h
@@ -56,15 +56,10 @@ class XlaContext : public ResourceBase {
   xla::ComputationBuilder* builder();
 
   bool allow_cpu_custom_calls() const { return allow_cpu_custom_calls_; }
-  bool has_context_parameter() const { return has_context_parameter_; }
 
   const std::vector<XlaExpression>& args() const { return args_; }
   void set_args(std::vector<XlaExpression> args);
 
-  // Get the runtime context parameter, adding one if it does not already exist.
-  // Dies if not compiling a local executable.
-  const xla::ComputationDataHandle& GetOrCreateRuntimeContextParameter();
-
   const std::vector<XlaExpression>& retvals() { return retvals_; }
 
   // This is called by the Retval Op to associate a computed value
@@ -102,6 +97,11 @@ class XlaContext : public ResourceBase {
   // separate specialization of the computation for each DataType.
   const xla::Computation* GetOrCreateAdd(const DataType type);
 
+  // Get an XLA lambda to compute Mul. This is cached in the
+  // XlaContext since it may be used by multiple Ops. There is a
+  // separate specialization of the computation for each DataType.
+  const xla::Computation* GetOrCreateMul(const DataType type);
+
   // The name of the XlaContext resource during symbolic graph execution.
   static const char kXlaContextResourceName[];
 
@@ -116,16 +116,9 @@ class XlaContext : public ResourceBase {
   const bool allow_cpu_custom_calls_;
 
   // If true, constant return values are returned as Tensors instead of
-  // run-time computation outptus.
+  // run-time computation outputs.
   const bool resolve_compile_time_constants_;
 
-  // When 'has_context_parameter_' is true, this is the computation handle
-  // for an additional final parameter to the computation, through which will be
-  // passed a XlaLocalRuntimeContext* at runtime. Created on demand by
-  // GetOrCreateRuntimeContextParameter().
-  bool has_context_parameter_ = false;
-  xla::ComputationDataHandle context_parameter_;
-
   // Arguments to the Tensorflow graph, indexed by _Arg index.
   // Includes both compile-time constant arguments and runtime parameters.
   std::vector<XlaExpression> args_;
@@ -155,6 +148,9 @@ class XlaContext : public ResourceBase {
   // Cached computation to compute Sum of two elements, specialized by type.
   ComputationMap add_func_;
 
+  // Cached computation to compute Mul of two elements, specialized by type.
+  ComputationMap mul_func_;
+
   // Cached computation to compute Sigmoid of an element, specialized by type.
   ComputationMap sigmoid_func_;
 
diff --git a/tensorflow/compiler/tf2xla/xla_gpu_backend.cc b/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
index d504613d232c779e47a506657d2825d052e726dc..8ca757e72355d890c13b8b448d35c327d3986696 100644
--- a/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
+++ b/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
@@ -21,8 +21,6 @@ namespace tensorflow {
 bool GpuOpFilter(KernelDef* kdef) {
   // TODO(b/31361304): The GPU backend does not parallelize PRNG ops, leading to
   // slow code.
-  // TODO(b/34969189) The implementation of TruncatedNormal generates illegal
-  // code on GPU.
   if (kdef->op() == "RandomStandardNormal" || kdef->op() == "RandomUniform" ||
       kdef->op() == "RandomUniformInt" || kdef->op() == "TruncatedNormal") {
     return false;
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc
index 9c3e15d2fa4c84af94d137f2e03107bcc980f4cd..ec9e535b707beec6ea26dc81c7ee76b1d4da9225 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.cc
+++ b/tensorflow/compiler/tf2xla/xla_helpers.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// This file defines helper routines for Tla JIT compilation.
+// This file defines helper routines for XLA compilation.
 
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/lib/util.h"
@@ -121,6 +121,8 @@ xla::ComputationDataHandle XlaHelpers::One(xla::ComputationBuilder* b,
 xla::ComputationDataHandle XlaHelpers::Epsilon(xla::ComputationBuilder* b,
                                                DataType data_type) {
   switch (data_type) {
+    case DT_BFLOAT16:
+      return b->ConstantR0<bfloat16>(bfloat16::epsilon());
     case DT_FLOAT:
       return b->ConstantR0<float>(std::numeric_limits<float>::epsilon());
     case DT_DOUBLE:
@@ -169,6 +171,9 @@ xla::ComputationDataHandle XlaHelpers::IntegerLiteral(
     case xla::S16:
     case xla::U16:
       LOG(FATAL) << "u16/s16 literals not yet implemented";
+    case xla::BF16:
+      literal = *xla::Literal::CreateR0<bfloat16>(static_cast<bfloat16>(value));
+      break;
     case xla::F16:
       literal =
           *xla::Literal::CreateR0<xla::half>(static_cast<xla::half>(value));
diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
index 1dd454ea8d57e21526e5bcde0c8efc5514983b93..584417bc72c8f6645c05912e857b031cfb394e54 100644
--- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
+++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
@@ -37,27 +37,14 @@ namespace {
 
 // Returns a vector of positional argument buffer sizes.
 xla::StatusOr<std::vector<intptr_t>> ComputeArgSizes(
-    const xla::ProgramShape& program_shape, bool requires_runtime_context) {
+    const xla::ProgramShape& program_shape) {
   std::vector<intptr_t> arg_sizes;
   const size_t num_args = program_shape.parameters_size();
   arg_sizes.reserve(num_args);
   for (int i = 0; i < num_args; ++i) {
     const xla::Shape& arg_shape = program_shape.parameters(i);
-    if (i == num_args - 1 && requires_runtime_context) {
-      // If the compiled function needs an XlaLocalRuntimeContext* arg, it's
-      // always last, and must be represented as an opaque type.
-      const xla::PrimitiveType type = arg_shape.element_type();
-      if (type != xla::OPAQUE) {
-        return errors::InvalidArgument(
-            "expected final context arg to be opaque, but got type: ",
-            xla::PrimitiveType_Name(type), ", from program shape: ",
-            xla::ShapeUtil::HumanString(program_shape));
-      }
-      arg_sizes.push_back(-1);
-    } else {
-      constexpr size_t kPointerSize = sizeof(void*);
-      arg_sizes.push_back(xla::ShapeUtil::ByteSizeOf(arg_shape, kPointerSize));
-    }
+    constexpr size_t kPointerSize = sizeof(void*);
+    arg_sizes.push_back(xla::ShapeUtil::ByteSizeOf(arg_shape, kPointerSize));
   }
   return std::move(arg_sizes);
 }
@@ -90,21 +77,6 @@ xla::StatusOr<size_t> ComputeResultIndex(
   return result_slice.index();
 }
 
-// Adapt ComputeFunctionType, which includes a final profile_counters arg, to
-// RawFunction, which doesn't include that final arg.
-//
-// TODO(toddw): Change RawFunction and AOT to also pass the final
-// profile_counters arg, and remove this adapter.
-XlaCompiledCpuFunction::RawFunction RawFunctionAdapter(
-    xla::cpu::CpuExecutable::ComputeFunctionType compute_function) {
-  return [compute_function](void* result,
-                            const xla::ExecutableRunOptions* run_options,
-                            const void** args, void** temps) {
-    return compute_function(result, run_options, args, temps,
-                            /*profile_counters=*/nullptr);
-  };
-}
-
 // Collect names from `entries`, where T is one of tf2xla::{Feed,Fetch}. We hold
 // the actual strings in nonempty_names, and hold arrays of pointers in
 // name_ptrs, terminated by a nullptr entry.
@@ -144,9 +116,8 @@ XlaJitCompiledCpuFunction::Compile(
   TF_ASSIGN_OR_RETURN(xla::LocalClient * client,
                       xla::ClientLibrary::GetOrCreateLocalClient());
   xla::Computation computation;
-  bool requires_runtime_context;
-  TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToXla(
-      graph_def, config, client, &computation, &requires_runtime_context));
+  TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToXla(graph_def, config, client,
+                                                      &computation));
 
   // Get and verify the program shape.
   TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::ProgramShape> program_shape,
@@ -177,14 +148,13 @@ XlaJitCompiledCpuFunction::Compile(
   const xla::cpu::CpuExecutable* cpu_executable =
       static_cast<xla::cpu::CpuExecutable*>(executable->executable());
   XlaCompiledCpuFunction::RawFunction raw_function =
-      RawFunctionAdapter(cpu_executable->compute_function());
+      cpu_executable->compute_function();
   const xla::BufferAssignment& buffer_assignment =
       cpu_executable->buffer_assignment();
 
   // Compute buffer sizes and the result index, needed to run the raw function.
-  TF_ASSIGN_OR_RETURN(
-      std::vector<intptr_t> arg_sizes,
-      ComputeArgSizes(*program_shape, requires_runtime_context));
+  TF_ASSIGN_OR_RETURN(std::vector<intptr_t> arg_sizes,
+                      ComputeArgSizes(*program_shape));
   TF_ASSIGN_OR_RETURN(std::vector<intptr_t> temp_sizes,
                       ComputeTempSizes(buffer_assignment));
   TF_ASSIGN_OR_RETURN(size_t result_index,
@@ -203,7 +173,6 @@ XlaJitCompiledCpuFunction::Compile(
   jit->static_data_.temp_sizes = jit->temp_sizes_.data();
   jit->static_data_.num_temps = jit->temp_sizes_.size();
   jit->static_data_.result_index = result_index;
-  jit->static_data_.requires_runtime_context = requires_runtime_context;
   // Optional metadata is collected and set below.
   CollectNames(config.feed(), &jit->nonempty_arg_names_, &jit->arg_names_);
   CollectNames(config.fetch(), &jit->nonempty_result_names_,
@@ -211,6 +180,14 @@ XlaJitCompiledCpuFunction::Compile(
   jit->static_data_.arg_names = jit->arg_names_.data();
   jit->static_data_.result_names = jit->result_names_.data();
   jit->static_data_.program_shape = jit->program_shape_.get();
+
+  if (cpu_executable->hlo_profiling_enabled()) {
+    jit->static_data_.hlo_profile_printer =
+        &cpu_executable->hlo_profile_printer();
+    jit->static_data_.profile_counters_size =
+        cpu_executable->hlo_profile_printer().profile_counters_size();
+  }
+
   return std::move(jit_unique_ptr);
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_local_runtime_context.h b/tensorflow/compiler/tf2xla/xla_local_runtime_context.h
deleted file mode 100644
index dca420d6ee3fec45f88ac3b450ab0cb4fb83d38a..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/tf2xla/xla_local_runtime_context.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_LOCAL_RUNTIME_CONTEXT_H_
-#define TENSORFLOW_COMPILER_TF2XLA_XLA_LOCAL_RUNTIME_CONTEXT_H_
-
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/types.h"
-
-// Forward-declare the ThreadPoolDevice so that it can be ignored unless it's
-// actually used.  E.g. some ahead-of-time compiled computations don't need a
-// thread pool.
-namespace Eigen {
-struct ThreadPoolDevice;
-}
-
-namespace tensorflow {
-
-// An instance of this class is passed to each call from tensorflow into a
-// compiled XLA computation. See xla_launch_ops.cc.
-struct XlaLocalRuntimeContext {
- public:
-  XlaLocalRuntimeContext() {}
-
-  // Kernels implemented using custom call ops set this if they encounter an
-  // error. The error is checked after the entire XLA computation is
-  // complete.
-  //
-  // error+error_msg are used instead of Status to reduce the binary size
-  // overhead for ahead-of-time compiled binaries.
-  bool error = false;
-  string error_msg;
-
-  // Kernels that need a thread pool can get it from here.
-  const Eigen::ThreadPoolDevice* thread_pool = nullptr;
-
- private:
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaLocalRuntimeContext);
-};
-
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_TF2XLA_XLA_LOCAL_RUNTIME_CONTEXT_H_
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
index a052bb105e7d3e47f2427c98ce47e52d95af78d9..79d501b511bf37ba4a79ab9d375d6f789a36889b 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
@@ -346,9 +346,9 @@ void XlaOpKernelContext::SetConstantOutput(int index, const Tensor& constant) {
 }
 
 void XlaOpKernelContext::SetInvalidOutput(int index) {
-  const TensorShape shape;
   Tensor* output = nullptr;
-  OP_REQUIRES_OK(context_, context_->allocate_output(index, shape, &output));
+  OP_REQUIRES_OK(context_,
+                 context_->allocate_output(index, TensorShape({}), &output));
   XlaExpression* expression = CastExpressionFromUninitializedTensor(output);
   xla::ComputationDataHandle handle;
   handle.set_handle(0);
@@ -417,6 +417,11 @@ const xla::Computation* XlaOpKernelContext::GetOrCreateAdd(
   return XlaContext::Get(context_).GetOrCreateAdd(type);
 }
 
+const xla::Computation* XlaOpKernelContext::GetOrCreateMul(
+    const DataType type) {
+  return XlaContext::Get(context_).GetOrCreateMul(type);
+}
+
 XlaOpKernel::XlaOpKernel(OpKernelConstruction* context) : OpKernel(context) {}
 
 void XlaOpKernel::Compute(OpKernelContext* context) {
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h
index 76bcf594e6a0601763844847583c18ee26d8adf3..f1ae81a5aa9d507a3e0dd577568377385b1844e6 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.h
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h
@@ -178,7 +178,7 @@ class XlaOpKernelContext {
 
   // If this kernel invocation is within a function execution,
   // call_frame() returns the call frame for the function call.
-  FunctionCallFrame* call_frame() const { return context_->call_frame(); }
+  CallFrameInterface* call_frame() const { return context_->call_frame(); }
 
   FunctionLibraryRuntime* function_library() const {
     return context_->function_library();
@@ -210,6 +210,11 @@ class XlaOpKernelContext {
   // separate specialization of the computation for each DataType.
   const xla::Computation* GetOrCreateAdd(const DataType type);
 
+  // Gets an XLA lambda to compute Mul. This is cached in the
+  // XlaContext since it may be used by multiple Ops. There is a
+  // separate specialization of the computation for each DataType.
+  const xla::Computation* GetOrCreateMul(const DataType type);
+
  private:
   OpKernelContext* const context_;
 };
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc
index 02318cf7fa1d4edc12507f6b4d66a8e897cbe100..faf47434b5dc6b569ec4f9c91a8667de275a6315 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/kernel_def.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
@@ -187,22 +188,39 @@ void XlaOpRegistry::RegisterCompilationKernels() {
 
       // Constrain each type attribute to the intersection of:
       // a) the types supported by the backend, and
-      // b) the attribute's type constraints.
-      // TODO(phawkins): it may be necessary to also take the intersection with
-      // the set of types supported by the OpDef.
+      // b) the types allowed by the OpDef, and
+      // c) the type constraints.
       for (const string& type_attr : type_attrs) {
         KernelDef::AttrConstraint* attr_constraint = kdef->add_constraint();
         attr_constraint->set_name(type_attr);
         auto* allowed_values =
             attr_constraint->mutable_allowed_values()->mutable_list();
 
-        auto it = op_registration->type_constraints.find(type_attr);
+        const OpDef::AttrDef& op_def_attr = *FindAttr(type_attr, *op_def);
+        const auto* op_def_allowed_types =
+            op_def_attr.has_allowed_values()
+                ? &op_def_attr.allowed_values().list().type()
+                : nullptr;
+        auto constraint_it = op_registration->type_constraints.find(type_attr);
+        const std::set<DataType>* type_constraints =
+            constraint_it != op_registration->type_constraints.end()
+                ? &constraint_it->second
+                : nullptr;
         for (DataType dtype : backend.second.supported_types) {
-          if (it == op_registration->type_constraints.end() ||
-              (it != op_registration->type_constraints.end() &&
-               it->second.find(dtype) != it->second.end())) {
-            allowed_values->add_type(dtype);
+          // Filter out types that aren't allowed by the OpDef.
+          if (op_def_allowed_types != nullptr &&
+              std::find(op_def_allowed_types->begin(),
+                        op_def_allowed_types->end(),
+                        dtype) == op_def_allowed_types->end()) {
+            continue;
           }
+          // Filter out types based on the type constraints.
+          if (type_constraints != nullptr &&
+              type_constraints->find(dtype) == type_constraints->end()) {
+            continue;
+          }
+          // Passed all the filters, this type is allowed.
+          allowed_values->add_type(dtype);
         }
         if (op_registration->allow_resource_types) {
           allowed_values->add_type(DT_RESOURCE);
@@ -245,6 +263,22 @@ std::vector<const KernelDef*> XlaOpRegistry::DeviceKernels(
   return kernels;
 }
 
+std::vector<string> XlaOpRegistry::BackendNames() {
+  std::vector<string> names;
+  XlaOpRegistry& registry = Instance();
+  mutex_lock lock(registry.mutex_);
+  for (const auto& backend_pair : registry.backends_) {
+    names.push_back(backend_pair.first);
+  }
+  return names;
+}
+
+bool XlaOpRegistry::IsBackendRegistered(const string& name) {
+  XlaOpRegistry& registry = Instance();
+  mutex_lock lock(registry.mutex_);
+  return registry.backends_.find(name) != registry.backends_.end();
+}
+
 XlaOpRegistry& XlaOpRegistry::Instance() {
   static XlaOpRegistry* r = new XlaOpRegistry;
   return *r;
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h
index 6aee8c91cc01b4382ef867fa8e438eede008ac73..8bfd9758f7af9c6b7ed20954e72f953b629b28a6 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.h
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.h
@@ -45,11 +45,11 @@ extern const char* const DEVICE_GPU_XLA_JIT;  // "GPU_XLA_JIT"
 extern const char* const DEVICE_XLA_CPU;
 extern const char* const DEVICE_XLA_GPU;
 
-constexpr std::array<DataType, 3> kFloatTypes = {
-    {DT_HALF, DT_FLOAT, DT_DOUBLE}};
-constexpr std::array<DataType, 8> kNumericTypes = {
+constexpr std::array<DataType, 4> kFloatTypes = {
+    {DT_HALF, DT_FLOAT, DT_DOUBLE, DT_BFLOAT16}};
+constexpr std::array<DataType, 9> kNumericTypes = {
     {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE,
-     DT_COMPLEX64}};
+     DT_COMPLEX64, DT_BFLOAT16}};
 
 constexpr std::array<DataType, 8> kCpuAllTypes = {
     {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE,
@@ -97,6 +97,12 @@ class XlaOpRegistry {
                               gtl::ArraySlice<DataType> supported_types,
                               BackendOpFilter op_filter);
 
+  // Returns the names of the registered backends.
+  static std::vector<string> BackendNames();
+
+  // Returns true iff a backend with the given name is registered.
+  static bool IsBackendRegistered(const string& name);
+
   // Registers `device_name` for XLA compilation, using information from
   // `registration`.
   static void RegisterCompilationDevice(const string& device_name,
@@ -116,8 +122,8 @@ class XlaOpRegistry {
   static void RegisterCompilationKernels();
 
   // Returns KernelDefs for compilation ops registered on
-  // 'compilation_device_name'.
-  // Does not include kernels registered as CompilationOnly.
+  // 'compilation_device_name'.  Does not include kernels registered as
+  // CompilationOnly, iff include_compilation_only_kernels=false.
   static std::vector<const KernelDef*> DeviceKernels(
       const string& compilation_device_name,
       bool include_compilation_only_kernels);
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index d3f292207fee396fb4248dede5c0eeb5cd2b87c9..cd69c69889b2487ad12abea275e79fee4f5c51e6 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -20,6 +20,10 @@ package_group(
 load("//tensorflow:tensorflow.bzl", "cc_header_only_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_proto_library_py",
+)
 
 # Filegroup used to collect source files for dependency checking.
 filegroup(
@@ -36,6 +40,12 @@ xla_proto_library(
     visibility = ["//visibility:public"],
 )
 
+tf_proto_library_py(
+    name = "xla_data_proto",  # bzl adds a _py suffix
+    srcs = ["xla_data.proto"],
+    visibility = ["//visibility:public"],
+)
+
 xla_proto_library(
     name = "xla_proto",
     srcs = ["xla.proto"],
diff --git a/tensorflow/compiler/xla/array3d.h b/tensorflow/compiler/xla/array3d.h
index e9449f01ad69a5722f53cce09e2884e20a0def5a..a1c5840a5f3874e27043c821ed4684da2fa6c542 100644
--- a/tensorflow/compiler/xla/array3d.h
+++ b/tensorflow/compiler/xla/array3d.h
@@ -36,6 +36,8 @@ namespace xla {
 template <typename T>
 class Array3D : public Array<T> {
  public:
+  Array3D() : Array<T>(std::vector<int64>{0, 0, 0}) {}
+
   // Creates an array of dimensions n1 x n2 x n3, uninitialized values.
   Array3D(const int64 n1, const int64 n2, const int64 n3)
       : Array<T>(std::vector<int64>{n1, n2, n3}) {}
diff --git a/tensorflow/compiler/xla/client/client.h b/tensorflow/compiler/xla/client/client.h
index a716159f9e74041c4823ad20b46fa94c2d7b9d8c..c28380b689c7a0e16bf0bcbf15003f4aa15e42a7 100644
--- a/tensorflow/compiler/xla/client/client.h
+++ b/tensorflow/compiler/xla/client/client.h
@@ -67,6 +67,15 @@ class Client {
     std::vector<GlobalData*> arguments;
     ExecutionOptions execution_options;
     ExecutionProfile* execution_profile;
+
+    ComputationInstance(const Computation& computation,
+                        std::vector<GlobalData*> arguments,
+                        ExecutionOptions execution_options,
+                        ExecutionProfile* execution_profile)
+        : computation(computation),
+          arguments(std::move(arguments)),
+          execution_options(execution_options),
+          execution_profile(execution_profile) {}
   };
 
   // Executes a list ComputationInstances and returns global data produced from
@@ -133,7 +142,7 @@ class Client {
 
   // Returns a vector of global data handles that point to the tuple elements.
   StatusOr<std::vector<std::unique_ptr<GlobalData>>> DeconstructTuple(
-      const GlobalData& computation);
+      const GlobalData& data);
 
   // Retrieves the statistics of the given computation.
   StatusOr<ComputationStats> GetComputationStats(
diff --git a/tensorflow/compiler/xla/client/computation_builder.cc b/tensorflow/compiler/xla/client/computation_builder.cc
index 763d94e94c2167f47b3f0777a31815f02791aa9e..317dcb4e41723b93e7e50d911f16e48bc3505a09 100644
--- a/tensorflow/compiler/xla/client/computation_builder.cc
+++ b/tensorflow/compiler/xla/client/computation_builder.cc
@@ -153,6 +153,7 @@ bool ComputationBuilder::MakeWindow(
     } else {
       dim->set_window_dilation(1);
     }
+    dim->set_window_reversal(false);
   }
   return true;
 }
@@ -624,7 +625,41 @@ ComputationDataHandle ComputationBuilder::Lt(
 
 ComputationDataHandle ComputationBuilder::Dot(
     const ComputationDataHandle& lhs, const ComputationDataHandle& rhs) {
-  return BinaryOp(BINOP_DOT, lhs, rhs, /*broadcast_dimensions=*/{});
+  StatusOr<std::unique_ptr<Shape>> lhs_shape_or_status = GetShape(lhs);
+  if (!lhs_shape_or_status.ok()) {
+    NoteError(lhs_shape_or_status.status());
+    return ComputationDataHandle();
+  }
+  std::unique_ptr<Shape> lhs_shape = lhs_shape_or_status.ConsumeValueOrDie();
+
+  DotDimensionNumbers dimension_numbers;
+  dimension_numbers.add_lhs_contracting_dimensions(
+      lhs_shape->dimensions_size() == 1 ? 0 : 1);
+  dimension_numbers.add_rhs_contracting_dimensions(0);
+  return DotGeneral(lhs, rhs, dimension_numbers);
+}
+
+ComputationDataHandle ComputationBuilder::DotGeneral(
+    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
+    const DotDimensionNumbers& dimension_numbers) {
+  if (!first_error_.ok() || !PrepareComputation().ok()) {
+    return ComputationDataHandle();
+  }
+
+  DotRequest request;
+  *request.mutable_lhs() = lhs;
+  *request.mutable_rhs() = rhs;
+  *request.mutable_dimension_numbers() = dimension_numbers;
+
+  OpRequest op_request;
+  *op_request.mutable_computation() = computation_.handle();
+  *op_request.mutable_dot_request() = request;
+  AddCommonFieldsToOpRequest(&op_request);
+  OpResponse response;
+
+  VLOG(2) << "making Dot request";
+  Status s = client_->stub()->Op(&op_request, &response);
+  return ParseOpResponse(s, &response);
 }
 
 ComputationDataHandle ComputationBuilder::Conv(
@@ -693,11 +728,15 @@ bool ComputationBuilder::VerifyConvolution(
         }
         return true;
       };
-  return check_spatial_dimensions("spatial_dimensions",
-                                  dimension_numbers.spatial_dimensions()) &&
+  return check_spatial_dimensions(
+             "input_spatial_dimensions",
+             dimension_numbers.input_spatial_dimensions()) &&
          check_spatial_dimensions(
              "kernel_spatial_dimensions",
-             dimension_numbers.kernel_spatial_dimensions());
+             dimension_numbers.kernel_spatial_dimensions()) &&
+         check_spatial_dimensions(
+             "output_spatial_dimensions",
+             dimension_numbers.output_spatial_dimensions());
 }
 
 ComputationDataHandle ComputationBuilder::ConvWithGeneralDimensions(
@@ -729,11 +768,11 @@ ComputationDataHandle ComputationBuilder::ConvWithGeneralDimensions(
   }
 
   std::vector<int64> base_area_dimensions(
-      dimension_numbers.spatial_dimensions_size());
+      dimension_numbers.input_spatial_dimensions_size());
   for (std::vector<int64>::size_type i = 0; i < base_area_dimensions.size();
        ++i) {
     base_area_dimensions[i] =
-        lhs_shape->dimensions(dimension_numbers.spatial_dimensions(i));
+        lhs_shape->dimensions(dimension_numbers.input_spatial_dimensions(i));
   }
 
   std::vector<int64> window_dimensions(
@@ -1163,6 +1202,34 @@ ComputationDataHandle ComputationBuilder::ConvertElementType(
   return ParseOpResponse(s, &response);
 }
 
+ComputationDataHandle ComputationBuilder::BitcastConvertType(
+    const ComputationDataHandle& operand, PrimitiveType new_element_type) {
+  if (!first_error_.ok() || !PrepareComputation().ok()) {
+    return ComputationDataHandle();
+  }
+
+  StatusOr<std::unique_ptr<Shape>> shape_status = GetShape(operand);
+  if (!shape_status.ok()) {
+    first_error_ = shape_status.status();
+    return ComputationDataHandle();
+  }
+  std::unique_ptr<Shape> original = shape_status.ConsumeValueOrDie();
+
+  ConvertRequest request;
+  *request.mutable_operand() = operand;
+  request.set_new_element_type(new_element_type);
+  OpRequest op_request;
+  *op_request.mutable_computation() = computation_.handle();
+  *op_request.mutable_bitcast_convert_request() = request;
+  AddCommonFieldsToOpRequest(&op_request);
+  OpResponse response;
+
+  VLOG(2) << "making bitcast convert request";
+  Status s = client_->stub()->Op(&op_request, &response);
+
+  return ParseOpResponse(s, &response);
+}
+
 ComputationDataHandle ComputationBuilder::SquareF32(
     const ComputationDataHandle& operand) {
   return BinaryOp(BINOP_POW, operand, ConstantR0<float>(2.0),
@@ -1437,6 +1504,34 @@ ComputationDataHandle ComputationBuilder::While(
   return ParseOpResponse(s, &response);
 }
 
+ComputationDataHandle ComputationBuilder::Conditional(
+    const ComputationDataHandle& predicate,
+    const ComputationDataHandle& true_operand,
+    const Computation& true_computation,
+    const ComputationDataHandle& false_operand,
+    const Computation& false_computation) {
+  if (!first_error_.ok() || !PrepareComputation().ok()) {
+    return ComputationDataHandle();
+  }
+
+  ConditionalRequest request;
+  *request.mutable_predicate() = predicate;
+  *request.mutable_true_operand() = true_operand;
+  *request.mutable_true_computation() = true_computation.handle();
+  *request.mutable_false_operand() = false_operand;
+  *request.mutable_false_computation() = false_computation.handle();
+  OpRequest op_request;
+  *op_request.mutable_computation() = computation_.handle();
+  *op_request.mutable_conditional_request() = request;
+  AddCommonFieldsToOpRequest(&op_request);
+  OpResponse response;
+
+  VLOG(2) << "making conditional op request";
+  Status s = client_->stub()->Op(&op_request, &response);
+
+  return ParseOpResponse(s, &response);
+}
+
 ComputationDataHandle ComputationBuilder::Reduce(
     const ComputationDataHandle& operand,
     const ComputationDataHandle& init_value, const Computation& computation,
@@ -1816,25 +1911,27 @@ ComputationBuilder::CreateDefaultConvDimensionNumbers(int num_spatial_dims) {
   dimension_numbers.set_kernel_input_feature_dimension(
       kConvKernelInputDimension);
   for (int i = 0; i < num_spatial_dims; ++i) {
-    dimension_numbers.add_spatial_dimensions(i + 2);
+    dimension_numbers.add_input_spatial_dimensions(i + 2);
     dimension_numbers.add_kernel_spatial_dimensions(i + 2);
+    dimension_numbers.add_output_spatial_dimensions(i + 2);
   }
   return dimension_numbers;
 }
 
 /* static */ StatusOr<ConvolutionDimensionNumbers>
 ComputationBuilder::CreateConvDimensionNumbers(
-    int64 input_batch, int64 input_feature, int64 output_batch,
-    int64 output_feature, int64 first_spatial, int64 second_spatial,
+    int64 input_batch, int64 input_feature, int64 input_first_spatial,
+    int64 input_second_spatial, int64 output_batch, int64 output_feature,
+    int64 output_first_spatial, int64 output_second_spatial,
     int64 kernel_output_feature, int64 kernel_input_feature,
     int64 kernel_first_spatial, int64 kernel_second_spatial) {
-  if (std::set<int64>(
-          {input_batch, input_feature, first_spatial, second_spatial})
+  if (std::set<int64>({input_batch, input_feature, input_first_spatial,
+                       input_second_spatial})
           .size() != 4) {
     return FailedPrecondition(
         "dimension numbers for the input are not unique: (%lld, %lld, %lld, "
         "%lld)",
-        input_batch, input_feature, first_spatial, second_spatial);
+        input_batch, input_feature, input_first_spatial, input_second_spatial);
   }
   if (std::set<int64>({kernel_output_feature, kernel_input_feature,
                        kernel_first_spatial, kernel_second_spatial})
@@ -1845,25 +1942,28 @@ ComputationBuilder::CreateConvDimensionNumbers(
         kernel_output_feature, kernel_input_feature, kernel_first_spatial,
         kernel_second_spatial);
   }
-  if (std::set<int64>(
-          {output_batch, output_feature, first_spatial, second_spatial})
+  if (std::set<int64>({output_batch, output_feature, output_first_spatial,
+                       output_second_spatial})
           .size() != 4) {
     return FailedPrecondition(
         "dimension numbers for the output are not unique: (%lld, %lld, %lld, "
         "%lld)",
-        output_batch, output_feature, first_spatial, second_spatial);
+        output_batch, output_feature, output_first_spatial,
+        output_second_spatial);
   }
   ConvolutionDimensionNumbers dimension_numbers;
   dimension_numbers.set_input_batch_dimension(input_batch);
   dimension_numbers.set_input_feature_dimension(input_feature);
-  dimension_numbers.set_output_batch_dimension(output_batch);
-  dimension_numbers.set_output_feature_dimension(output_feature);
-  dimension_numbers.add_spatial_dimensions(first_spatial);
-  dimension_numbers.add_spatial_dimensions(second_spatial);
+  dimension_numbers.add_input_spatial_dimensions(input_first_spatial);
+  dimension_numbers.add_input_spatial_dimensions(input_second_spatial);
   dimension_numbers.set_kernel_output_feature_dimension(kernel_output_feature);
   dimension_numbers.set_kernel_input_feature_dimension(kernel_input_feature);
   dimension_numbers.add_kernel_spatial_dimensions(kernel_first_spatial);
   dimension_numbers.add_kernel_spatial_dimensions(kernel_second_spatial);
+  dimension_numbers.set_output_batch_dimension(output_batch);
+  dimension_numbers.set_output_feature_dimension(output_feature);
+  dimension_numbers.add_output_spatial_dimensions(output_first_spatial);
+  dimension_numbers.add_output_spatial_dimensions(output_second_spatial);
   return dimension_numbers;
 }
 
diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h
index 4c6e320557f9202b738333fc2066ac4394fcff6b..28889ece73f5da72c3eea681c9e4aea7351d3d54 100644
--- a/tensorflow/compiler/xla/client/computation_builder.h
+++ b/tensorflow/compiler/xla/client/computation_builder.h
@@ -121,14 +121,10 @@ class ComputationBuilder {
   // result, OpMetadata is set on the Computation Builder. All subsequent
   // instructions generated via this Computation Builder will have the same
   // OpMetadata attached until a call to ClearOpMetdata.
-  void SetOpMetadata(const OpMetadata& metadata) {
-    metadata_ = metadata;
-  }
+  void SetOpMetadata(const OpMetadata& metadata) { metadata_ = metadata; }
 
   // Clears the HloMetadata state.
-  void ClearOpMetadata() {
-    metadata_.Clear();
-  }
+  void ClearOpMetadata() { metadata_.Clear(); }
 
   // Sets an OpSharding that will be attached to all instructions until cleared.
   void SetSharding(const OpSharding& sharding) { sharding_ = sharding; }
@@ -397,6 +393,11 @@ class ComputationBuilder {
   ComputationDataHandle Dot(const ComputationDataHandle& lhs,
                             const ComputationDataHandle& rhs);
 
+  // Enqueues a general dot instruction onto the computation.
+  ComputationDataHandle DotGeneral(
+      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
+      const DotDimensionNumbers& dimension_numbers);
+
   // Default dimension numbers used for a 2D convolution.
   static constexpr int64 kConvBatchDimension = 0;
   static constexpr int64 kConvFeatureDimension = 1;
@@ -417,8 +418,9 @@ class ComputationBuilder {
   // Creates a ConvolutionDimensionNumbers with the given arguments. Returns an
   // error if either the input or the weight dimension numbers have conflicts.
   static StatusOr<ConvolutionDimensionNumbers> CreateConvDimensionNumbers(
-      int64 input_batch, int64 input_feature, int64 output_batch,
-      int64 output_feature, int64 first_spatial, int64 second_spatial,
+      int64 input_batch, int64 input_feature, int64 input_first_spatial,
+      int64 input_second_spatial, int64 output_batch, int64 output_feature,
+      int64 output_first_spatial, int64 output_second_spatial,
       int64 kernel_output_feature, int64 kernel_input_feature,
       int64 kernel_first_spatial, int64 kernel_second_spatial);
 
@@ -673,6 +675,13 @@ class ComputationBuilder {
   ComputationDataHandle ConvertElementType(const ComputationDataHandle& operand,
                                            PrimitiveType new_element_type);
 
+  // Enqueues a no-op instruction onto the computation that changes
+  // the element type of the operand array to primitive_type. The
+  // bit-widths of the source and destination element types must be
+  // identical.
+  ComputationDataHandle BitcastConvertType(const ComputationDataHandle& operand,
+                                           PrimitiveType new_element_type);
+
   // Enqueues a float32 reciprocal instruction onto the computation.
   // (float32 is specified as there is an implicit float32 -1.0f constant
   // exponent).
@@ -732,6 +741,13 @@ class ComputationBuilder {
                               const Computation& body,
                               const ComputationDataHandle& init);
 
+  // Enqueues a conditional node onto the computation.
+  ComputationDataHandle Conditional(const ComputationDataHandle& predicate,
+                                    const ComputationDataHandle& true_operand,
+                                    const Computation& true_computation,
+                                    const ComputationDataHandle& false_operand,
+                                    const Computation& false_computation);
+
   // Enqueues a ReducePrecision node onto the computation.
   ComputationDataHandle ReducePrecision(const ComputationDataHandle& operand,
                                         const int exponent_bits,
@@ -807,7 +823,7 @@ class ComputationBuilder {
   // The operand must represent a constant value, which in this case
   // means that it must not statically depend on any parameter of the
   // computation that is being built other then the ones specified on the
-  // paramtere list. The parameters in the list will be indexed by their
+  // parameter list. The parameters in the list will be indexed by their
   // parameter id property so the number of parameters specified should be at
   // least as many as the largest used parameter index.
   //
diff --git a/tensorflow/compiler/xla/client/lib/testing.cc b/tensorflow/compiler/xla/client/lib/testing.cc
index d936bd870b8b4e63e5c9b067478c19dd2e42006a..5f2b55713e342aa3d0251386d57cb52481fe748d 100644
--- a/tensorflow/compiler/xla/client/lib/testing.cc
+++ b/tensorflow/compiler/xla/client/lib/testing.cc
@@ -51,7 +51,7 @@ std::unique_ptr<GlobalData> MakeFakeDataViaDeviceOrDie(const Shape& shape,
 
 std::unique_ptr<GlobalData> MakeFakeDataOrDie(const Shape& shape,
                                               Client* client) {
-  if (ShapeUtil::ByteSizeOf(shape) < (1LL << 30)) {
+  if (ShapeUtil::ByteSizeOf(shape) < (1LL << 20)) {
     StatusOr<std::unique_ptr<Literal>> literal_status = MakeFakeLiteral(shape);
     if (!literal_status.ok()) {
       // If we got an Unimplemented error, fall back to making the fake data via
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index c3c664f76af78507925274455dc35b2902f0ac4a..7900246a4937a15fda0502c44cd9762c789109a0 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -78,14 +78,14 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions(
   }
   for (int i = 0; i < arguments.size(); ++i) {
     if (!computation_layout.parameter_layout(i).MatchesLayoutInShape(
-            arguments[i]->shape())) {
+            arguments[i]->on_host_shape())) {
       return InvalidArgument(
           "argument does not match shape or layout of computation parameter "
           "%d: expected %s, got %s",
           i,
           ShapeUtil::HumanString(computation_layout.parameter_layout(i).shape())
               .c_str(),
-          ShapeUtil::HumanString(arguments[i]->shape()).c_str());
+          ShapeUtil::HumanString(arguments[i]->on_host_shape()).c_str());
     }
   }
 
@@ -275,22 +275,15 @@ StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Compile(
                                         device_ordinal, options));
 }
 
-// Copy the literal data to the device with the given ordinal and return as a
-// ScopedShapedBuffer. The given memory allocator is used for device memory
-// allocation.
 StatusOr<std::unique_ptr<ScopedShapedBuffer>>
 LocalClient::LiteralToShapedBuffer(const Literal& literal, int device_ordinal,
                                    DeviceMemoryAllocator* allocator) {
   if (allocator == nullptr) {
     allocator = backend().memory_allocator();
   }
-  TF_ASSIGN_OR_RETURN(
-      auto scoped_buffer,
-      ScopedShapedBuffer::Allocate(
-          literal.shape(), allocator, device_ordinal,
-          [this](const Shape& shape) {
-            return backend().transfer_manager()->GetByteSizeRequirement(shape);
-          }));
+  TF_ASSIGN_OR_RETURN(auto scoped_buffer,
+                      backend().transfer_manager()->AllocateScopedShapedBuffer(
+                          literal.shape(), allocator, device_ordinal));
   TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
                       backend().stream_executor(device_ordinal));
   TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice(
@@ -298,8 +291,6 @@ LocalClient::LiteralToShapedBuffer(const Literal& literal, int device_ordinal,
   return std::move(scoped_buffer);
 }
 
-// Copy the data from the device contained in the given ShapedBuffer and
-// return as a Literal.
 StatusOr<std::unique_ptr<Literal>> LocalClient::ShapedBufferToLiteral(
     const ShapedBuffer& shaped_buffer) {
   TF_ASSIGN_OR_RETURN(
@@ -309,4 +300,22 @@ StatusOr<std::unique_ptr<Literal>> LocalClient::ShapedBufferToLiteral(
                                                                  shaped_buffer);
 }
 
+Status LocalClient::TransferToInfeedLocal(const Literal& literal,
+                                          int device_ordinal) {
+  TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
+                      backend().stream_executor(device_ordinal));
+  return backend().transfer_manager()->TransferLiteralToInfeed(executor,
+                                                               literal);
+}
+
+StatusOr<std::unique_ptr<Literal>> LocalClient::TransferFromOutfeedLocal(
+    const Shape& shape, int device_ordinal) {
+  TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
+                      backend().stream_executor(device_ordinal));
+  auto literal = MakeUnique<Literal>();
+  TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralFromOutfeed(
+      executor, shape, literal.get()));
+  return std::move(literal);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index 32fe0d9f84e56f44e4098571e558c7e846d003b5..3ca0d2ef5513cfb6b0dbfbc63b311f81a318356e 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -162,6 +162,20 @@ class LocalClient : public Client {
   StatusOr<std::unique_ptr<Literal>> ShapedBufferToLiteral(
       const ShapedBuffer& shaped_buffer);
 
+  // Transfer the given literal to the infeed queue of the given device.
+  // TODO(b/69670845): Remove the 'Local' from the name when LocalClient does
+  // not inherit from Client and there is no possibility of confusion with
+  // Client::TransferToInfeed.
+  Status TransferToInfeedLocal(const Literal& literal, int device_ordinal);
+
+  // Transfer and return a value of the given shape from the outfeed of the
+  // given device.
+  // TODO(b/69670845): Remove the 'Local' from the name when LocalClient does
+  // not inherit from Client and there is no possibility of confusion with
+  // Client::TransferFromOutfeed.
+  StatusOr<std::unique_ptr<Literal>> TransferFromOutfeedLocal(
+      const Shape& shape, int device_ordinal);
+
   // Returns the platform that the underlying service targets.
   perftools::gputools::Platform* platform() const;
 
diff --git a/tensorflow/compiler/xla/index_util.cc b/tensorflow/compiler/xla/index_util.cc
index 76c0168f370ff1f0749759705b7ecff359a80341..2ee23927d86612a59470dd3d3a219d00055ec65b 100644
--- a/tensorflow/compiler/xla/index_util.cc
+++ b/tensorflow/compiler/xla/index_util.cc
@@ -78,7 +78,7 @@ namespace xla {
   int64 scale = 1;
   int64 linear_index = 0;
   bool first = true;
-  for (auto dimension : shape.layout().minor_to_major()) {
+  for (auto dimension : LayoutUtil::MinorToMajor(shape)) {
     if (first) {
       // Avoid two multiplies on the first loop iteration
       linear_index = multi_index[dimension];
@@ -110,7 +110,7 @@ namespace xla {
 
   // Accumulated product D{L(0)} * D{L(1)} * ...
   int64 divisor = 1;
-  for (auto dimension : shape.layout().minor_to_major()) {
+  for (auto dimension : LayoutUtil::MinorToMajor(shape)) {
     multi_index[dimension] =
         (linear_index / divisor) % shape.dimensions(dimension);
     divisor *= shape.dimensions(dimension);
@@ -133,18 +133,17 @@ namespace xla {
 
 /* static */ int64 IndexUtil::GetDimensionStride(const Shape& shape,
                                                  int64 dimension) {
-  const Layout& layout = shape.layout();
-  int64 pdim_size = layout.padded_dimensions_size();
+  int64 pdim_size = LayoutUtil::PaddedDimensions(shape).size();
   int64 stride = 1;
   DCHECK(pdim_size == 0 || pdim_size == shape.dimensions_size());
-  for (auto dim : layout.minor_to_major()) {
+  for (auto dim : LayoutUtil::MinorToMajor(shape)) {
     if (dim == dimension) {
       break;
     }
     if (pdim_size == 0) {
       stride *= shape.dimensions(dim);
     } else {
-      stride *= layout.padded_dimensions(dim);
+      stride *= LayoutUtil::PaddedDimension(shape, dim);
     }
   }
   return stride;
diff --git a/tensorflow/compiler/xla/layout_util.cc b/tensorflow/compiler/xla/layout_util.cc
index 5c2cc2a7a99cc51ded3d98c9dd5903e4b3078548..f9803be32f5fc3c6b2f7e2527eec0a766647abc7 100644
--- a/tensorflow/compiler/xla/layout_util.cc
+++ b/tensorflow/compiler/xla/layout_util.cc
@@ -57,6 +57,7 @@ void SetDefaultLayoutToContainer(
 /* static */ Layout LayoutUtil::MakeLayout(
     tensorflow::gtl::ArraySlice<int64> minor_to_major) {
   Layout layout;
+  layout.set_format(DENSE);
   for (int64 dimension_number : minor_to_major) {
     layout.add_minor_to_major(dimension_number);
   }
@@ -68,6 +69,7 @@ namespace {
 // Internal helper that creates a default layout for an array of the given rank.
 Layout CreateDefaultLayoutForRank(int64 rank) {
   Layout layout;
+  layout.set_format(DENSE);
   tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>*
       minor_to_major = layout.mutable_minor_to_major();
   minor_to_major->Resize(rank, 0);
@@ -105,7 +107,11 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
     for (auto& element_shape : *shape->mutable_tuple_shapes()) {
       SetToDefaultLayout(&element_shape);
     }
+    shape->clear_layout();
+  } else if (ShapeUtil::IsOpaque(*shape)) {
+    shape->clear_layout();
   } else {
+    shape->mutable_layout()->set_format(DENSE);
     tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>*
         minor_to_major = shape->mutable_layout()->mutable_minor_to_major();
     minor_to_major->Resize(shape->dimensions_size(), 0);
@@ -137,8 +143,10 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
       TF_RETURN_IF_ERROR(ValidateLayoutInShape(element_shape));
     }
     return tensorflow::Status::OK();
-  } else if (ShapeUtil::Rank(shape) == 0 && !shape.has_layout()) {
-    // A scalar without a layout is ok.
+  } else if (ShapeUtil::IsOpaque(shape)) {
+    if (shape.has_layout()) {
+      return InvalidArgument("opaque should not have a layout field");
+    }
     return tensorflow::Status::OK();
   } else {
     // Array shape.
@@ -156,46 +164,59 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
     return InvalidArgument("a single Layout is not valid for tuple shapes");
   }
 
-  if (layout.minor_to_major_size() != ShapeUtil::Rank(shape)) {
+  if (ShapeUtil::IsOpaque(shape)) {
+    return tensorflow::Status::OK();
+  }
+
+  if (layout.format() == INVALID_FORMAT) {
     return InvalidArgument(
-        "layout minor_to_major field contains %d elements, "
-        "but shape is rank %lld: {%s}; shape: %s",
-        layout.minor_to_major_size(), ShapeUtil::Rank(shape),
-        tensorflow::str_util::Join(layout.minor_to_major(), ", ").c_str(),
-        shape.ShortDebugString().c_str());
+        "Layout does not have a valid format: layout {%s}, shape {%s}",
+        layout.ShortDebugString().c_str(), shape.ShortDebugString().c_str());
   }
 
-  std::vector<bool> dimensions_in_layout(ShapeUtil::Rank(shape), false);
-  for (int64 i = 0; i < ShapeUtil::Rank(shape); ++i) {
-    int64 dim = layout.minor_to_major(i);
-    if (dim < 0 || dim >= ShapeUtil::Rank(shape)) {
+  if (layout.format() == DENSE) {
+    if (layout.minor_to_major_size() != ShapeUtil::Rank(shape)) {
       return InvalidArgument(
-          "layout minor_to_major field has out-of-bounds value: %s",
-          HumanString(layout).c_str());
+          "layout minor_to_major field contains %d elements, "
+          "but shape is rank %lld: {%s}; shape: %s",
+          layout.minor_to_major_size(), ShapeUtil::Rank(shape),
+          tensorflow::str_util::Join(layout.minor_to_major(), ", ").c_str(),
+          shape.ShortDebugString().c_str());
     }
-    if (dimensions_in_layout[dim]) {
-      return InvalidArgument(
-          "layout minor_to_major field has duplicate values: {%s}",
-          HumanString(layout).c_str());
-    }
-    dimensions_in_layout[dim] = true;
-  }
 
-  if (layout.padded_dimensions_size() > 0) {
-    if (layout.padded_dimensions_size() != ShapeUtil::Rank(shape)) {
-      return InvalidArgument(
-          "layout has %d padded dimensions, but shape is rank %lld",
-          layout.padded_dimensions_size(), ShapeUtil::Rank(shape));
+    std::vector<bool> dimensions_in_layout(ShapeUtil::Rank(shape), false);
+    for (int64 i = 0; i < ShapeUtil::Rank(shape); ++i) {
+      int64 dim = layout.minor_to_major(i);
+      if (dim < 0 || dim >= ShapeUtil::Rank(shape)) {
+        return InvalidArgument(
+            "layout minor_to_major field has out-of-bounds value: %s",
+            HumanString(layout).c_str());
+      }
+      if (dimensions_in_layout[dim]) {
+        return InvalidArgument(
+            "layout minor_to_major field has duplicate values: {%s}",
+            HumanString(layout).c_str());
+      }
+      dimensions_in_layout[dim] = true;
     }
-    for (int i = 0; i < layout.padded_dimensions_size(); ++i) {
-      if (layout.padded_dimensions(i) < shape.dimensions(i)) {
+
+    if (layout.padded_dimensions_size() > 0) {
+      if (layout.padded_dimensions_size() != ShapeUtil::Rank(shape)) {
         return InvalidArgument(
-            "for dimension %d, dimension padding (%lld) is smaller than "
-            "the dimension size (%lld) of the shape",
-            i, layout.padded_dimensions(i), shape.dimensions(i));
+            "layout has %d padded dimensions, but shape is rank %lld",
+            layout.padded_dimensions_size(), ShapeUtil::Rank(shape));
+      }
+      for (int i = 0; i < layout.padded_dimensions_size(); ++i) {
+        if (layout.padded_dimensions(i) < shape.dimensions(i)) {
+          return InvalidArgument(
+              "for dimension %d, dimension padding (%lld) is smaller than "
+              "the dimension size (%lld) of the shape",
+              i, layout.padded_dimensions(i), shape.dimensions(i));
+        }
       }
     }
   }
+
   return tensorflow::Status::OK();
 }
 
@@ -213,12 +234,23 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
   LayoutUtil::ClearLayout(program_shape->mutable_result());
 }
 
+/* static */ bool LayoutUtil::IsDense(const Shape& shape) {
+  return ShapeUtil::IsArray(shape) && shape.has_layout() &&
+         IsDense(shape.layout());
+}
+
+/* static */ bool LayoutUtil::IsDense(const Layout& layout) {
+  return layout.format() == DENSE;
+}
+
 /* static */ bool LayoutUtil::IsMonotonicWithDim0Minor(const Layout& layout) {
+  CHECK(layout.format() == DENSE);
   return std::is_sorted(layout.minor_to_major().begin(),
                         layout.minor_to_major().end());
 }
 
 /* static */ bool LayoutUtil::IsMonotonicWithDim0Major(const Layout& layout) {
+  CHECK(layout.format() == DENSE);
   return std::is_sorted(layout.minor_to_major().begin(),
                         layout.minor_to_major().end(), std::greater<int64>());
 }
@@ -228,6 +260,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
       shape.layout().padded_dimensions_size() == 0) {
     return false;
   }
+  CHECK(IsDense(shape));
   CHECK_EQ(shape.dimensions_size(), shape.layout().padded_dimensions_size());
   for (int64 i = 0; i < shape.dimensions_size(); ++i) {
     if (shape.layout().padded_dimensions(i) > shape.dimensions(i)) {
@@ -237,15 +270,32 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
   return false;
 }
 
+/* static */ tensorflow::gtl::ArraySlice<const int64>
+LayoutUtil::PaddedDimensions(const Shape& shape) {
+  CHECK(IsDense(shape));
+  return AsInt64Slice(shape.layout().padded_dimensions());
+}
+
+/* static */ int64 LayoutUtil::PaddedDimension(const Shape& shape,
+                                               int64 index) {
+  CHECK(IsDense(shape));
+  return shape.layout().padded_dimensions(index);
+}
+
+/* static */ PaddingValue LayoutUtil::GetPaddingValue(const Shape& shape) {
+  CHECK(IsDense(shape));
+  return shape.layout().padding_value();
+}
+
 /* static */ bool LayoutUtil::HasLayout(const Shape& shape) {
   if (ShapeUtil::IsTuple(shape)) {
     // Tuple shape: all subshapes must have a layout.
     return std::all_of(shape.tuple_shapes().begin(), shape.tuple_shapes().end(),
                        [](const Shape& s) { return HasLayout(s); });
+  } else if (ShapeUtil::IsOpaque(shape)) {
+    return true;
   }
-  // A scalar trivially always has a layout.
-  return (ShapeUtil::Rank(shape) == 0 ||
-          (shape.has_layout() && (shape.layout().minor_to_major_size() > 0)));
+  return shape.has_layout() && shape.layout().format() != INVALID_FORMAT;
 }
 
 /* static */ bool LayoutUtil::HasLayout(const ProgramShape& program_shape) {
@@ -261,6 +311,18 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
   return protobuf_util::ProtobufEquals(lhs, rhs);
 }
 
+/* static */ tensorflow::gtl::ArraySlice<int64> LayoutUtil::MinorToMajor(
+    const Shape& shape) {
+  CHECK(IsDense(shape));
+  return AsInt64Slice(shape.layout().minor_to_major());
+}
+
+/* static */ tensorflow::gtl::ArraySlice<int64> LayoutUtil::MinorToMajor(
+    const Layout& layout) {
+  CHECK(layout.format() == DENSE);
+  return AsInt64Slice(layout.minor_to_major());
+}
+
 /* static */ int64 LayoutUtil::Major(const Layout& layout,
                                      int64 physical_dimension_number) {
   CHECK_LE(0, physical_dimension_number);
@@ -271,6 +333,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 
 /* static */ int64 LayoutUtil::Minor(const Layout& layout,
                                      int64 physical_dimension_number) {
+  CHECK_EQ(layout.format(), DENSE);
   CHECK_LE(0, physical_dimension_number);
   CHECK_LT(physical_dimension_number, layout.minor_to_major_size());
   return layout.minor_to_major(physical_dimension_number);
diff --git a/tensorflow/compiler/xla/layout_util.h b/tensorflow/compiler/xla/layout_util.h
index bc42e222292933be35e82d1fe50802e8830d16b3..d00cd03756360a279ad8b803476f72bac0568734 100644
--- a/tensorflow/compiler/xla/layout_util.h
+++ b/tensorflow/compiler/xla/layout_util.h
@@ -71,6 +71,12 @@ class LayoutUtil {
   // Clears the layout on all Shapes within the given ProgramShape.
   static void ClearLayout(ProgramShape* program_shape);
 
+  // Returns whether the given Shape is an array and has a dense format layout.
+  static bool IsDense(const Shape& shape);
+
+  // Returns whether the given Layout has a dense format.
+  static bool IsDense(const Layout& layout);
+
   // Returns whether the layout is monotonic and dim 0 is minor in the layout.
   // * R0 and R1: this is always trivially true.
   // * R2+: equivalent to column-major. Dimension 0 is the minor, dimension 1 is
@@ -88,6 +94,19 @@ class LayoutUtil {
   // dimension size).
   static bool IsPadded(const Shape& shape);
 
+  // Returns the padded_dimensions array for the given Shape.  Requires that the
+  // shape is an array and has a dense layout.
+  static tensorflow::gtl::ArraySlice<const int64> PaddedDimensions(
+      const Shape& shape);
+
+  // Returns the given index of the padded_dimensions array for the given Shape.
+  // Requires that the shape is an array and has a dense layout.
+  static int64 PaddedDimension(const Shape& shape, int64 index);
+
+  // Returns the padding_value for the given Shape.  Requires that the shape is
+  // an array and has a dense layout.
+  static PaddingValue GetPaddingValue(const Shape& shape);
+
   // Returns whether the given shape has a layout. For tuple shapes, true is
   // returned only if all elements have layouts.
   static bool HasLayout(const Shape& shape);
@@ -98,6 +117,11 @@ class LayoutUtil {
   // Returns whether lhs and rhs are identical.
   static bool Equal(const Layout& lhs, const Layout& rhs);
 
+  // Returns the minor_to_major array for the given Shape.  Requires that the
+  // shape is an array and has a dense layout.
+  static tensorflow::gtl::ArraySlice<int64> MinorToMajor(const Shape& shape);
+  static tensorflow::gtl::ArraySlice<int64> MinorToMajor(const Layout& layout);
+
   // Major(0) is the most major logical dimension number, major(1) is the
   // second-most-major logical dimension number and so on.
   //
diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index 93d3cd425f0a868b51677058796e9c40c2d3dff8..f493460e795deaf66fa57b9fa42918fdd05bdac6 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -64,12 +64,12 @@ Literal::StrideConfig::StrideConfig(
   if (!dimensions.empty()) {
     // Selects the shape with the largest minor dimension as the one upon
     // which to run the tight stride loop.
-    if (dimensions[source_shape.layout().minor_to_major()[0]] >=
-        dimensions[dest_shape.layout().minor_to_major()[0]]) {
-      minor_dimension = source_shape.layout().minor_to_major()[0];
+    if (dimensions[LayoutUtil::Minor(source_shape.layout(), 0)] >=
+        dimensions[LayoutUtil::Minor(dest_shape.layout(), 0)]) {
+      minor_dimension = LayoutUtil::Minor(source_shape.layout(), 0);
       dest_stride = IndexUtil::GetDimensionStride(dest_shape, minor_dimension);
     } else {
-      minor_dimension = dest_shape.layout().minor_to_major()[0];
+      minor_dimension = LayoutUtil::Minor(dest_shape.layout(), 0);
       source_stride =
           IndexUtil::GetDimensionStride(source_shape, minor_dimension);
     }
@@ -252,6 +252,10 @@ Status Literal::Copy(const Literal& src_literal,
       return *Literal::CreateR0<int32>(1);
     case S64:
       return *Literal::CreateR0<int64>(1);
+    case F16:
+      return *Literal::CreateR0<half>(static_cast<half>(1.0f));
+    case BF16:
+      return *Literal::CreateR0<bfloat16>(static_cast<bfloat16>(1.0f));
     case F32:
       return *Literal::CreateR0<float>(1);
     case F64:
@@ -263,8 +267,6 @@ Status Literal::Copy(const Literal& src_literal,
     case S16:
     case U16:
       LOG(FATAL) << "u16/s16 literals not yet implemented";
-    case F16:
-      return *Literal::CreateR0<half>(static_cast<half>(1.0f));
     case TUPLE:
       LOG(FATAL) << "tuple element type cannot take on value of 1";
     case OPAQUE:
@@ -402,6 +404,27 @@ std::unique_ptr<Literal> Literal::Relayout(
   return outer_result;
 }
 
+std::unique_ptr<Literal> Literal::Relayout(
+    const Shape& shape_with_layout) const {
+  CHECK(ShapeUtil::Compatible(shape_with_layout, shape()))
+      << "Given shape_with_layout " << ShapeUtil::HumanString(shape_with_layout)
+      << " not compatible with literal shape "
+      << ShapeUtil::HumanString(shape());
+  std::unique_ptr<Literal> result = CreateFromShape(shape_with_layout);
+  ShapeUtil::ForEachSubshape(
+      result->shape(),
+      [this, &result](const Shape& subshape, const ShapeIndex& index) {
+        if (ShapeUtil::IsArray(subshape)) {
+          DimensionVector base(ShapeUtil::Rank(subshape), 0);
+          DimensionVector copy_size(subshape.dimensions().begin(),
+                                    subshape.dimensions().end());
+          TF_CHECK_OK(result->GetSubliteral(index).Copy(GetSubliteral(index),
+                                                        base, base, copy_size));
+        }
+      });
+  return result;
+}
+
 StatusOr<std::unique_ptr<Literal>> Literal::Reshape(
     tensorflow::gtl::ArraySlice<int64> dimensions) const {
   if (ShapeUtil::IsTuple(shape())) {
@@ -409,10 +432,8 @@ StatusOr<std::unique_ptr<Literal>> Literal::Reshape(
   }
   std::unique_ptr<Literal> output;
   if (!LayoutUtil::IsMonotonicWithDim0Major(shape().layout())) {
-    std::vector<int64> minor_to_major(ShapeUtil::Rank(shape()));
-    std::iota(minor_to_major.rbegin(), minor_to_major.rend(),
-              static_cast<int64>(0));
-    output = Relayout(LayoutUtil::MakeLayout(minor_to_major));
+    output =
+        Relayout(LayoutUtil::GetDefaultLayoutForRank(ShapeUtil::Rank(shape())));
   } else {
     output = CloneToUnique();
   }
@@ -458,9 +479,10 @@ std::unique_ptr<Literal> Literal::Transpose(
   // dimension has within the transposed array, a layout is affine if
   // MinMaj(Di) == TMinMaj(T(Di)), with TMinMaj() being the minor to major
   // vector of the affine layout.
+  CHECK(LayoutUtil::IsDense(permuted_shape));
   Layout* layout = permuted_shape.mutable_layout();
   layout->clear_minor_to_major();
-  for (auto index : shape().layout().minor_to_major()) {
+  for (auto index : LayoutUtil::MinorToMajor(shape())) {
     layout->add_minor_to_major(inverse_permutation[index]);
   }
   std::unique_ptr<Literal> new_literal = CreateFromShape(permuted_shape);
@@ -484,9 +506,9 @@ std::unique_ptr<Literal> Literal::Slice(
     CHECK_GT(dimension, 0);
     result_dimensions.push_back(dimension);
   }
-  const auto result_shape = ShapeUtil::MakeShapeWithLayout(
-      shape().element_type(), result_dimensions,
-      AsInt64Slice(shape().layout().minor_to_major()));
+  const auto result_shape =
+      ShapeUtil::MakeShapeWithLayout(shape().element_type(), result_dimensions,
+                                     LayoutUtil::MinorToMajor(shape()));
 
   auto result_literal = MakeUnique<Literal>();
   *result_literal->mutable_shape() = result_shape;
@@ -713,7 +735,13 @@ string Literal::ToString(bool print_layout) const {
     pieces.push_back("}");
   } else {
     pieces.push_back(shape_to_string(shape()));
-    pieces.push_back(" {...}");
+    pieces.push_back(" {");
+    EachCellAsString(
+        [&](tensorflow::gtl::ArraySlice<int64> indices, const string& value) {
+          pieces.push_back(" ");
+          pieces.push_back(value);
+        });
+    pieces.push_back("}");
   }
 
   return tensorflow::str_util::Join(pieces, "");
diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h
index f37e529caf54e3aded1a418d1f01c1440cd0f284..c782e0f19e5e15eb03894b2dda40ba40b3dfaba7 100644
--- a/tensorflow/compiler/xla/literal_util.h
+++ b/tensorflow/compiler/xla/literal_util.h
@@ -99,6 +99,7 @@ class Literal {
     f16s_.clear();
     f32s_.clear();
     f64s_.clear();
+    c64s_.clear();
     tuple_literals_.clear();
   }
 
@@ -285,11 +286,15 @@ class Literal {
   std::unique_ptr<Literal> Relayout(const Layout& new_layout,
                                     const ShapeIndex& shape_index = {}) const;
 
-  // Creates a new literal by reshaping this literal to have 'shape'. Both the
-  // original shape and 'shape' must contain the same number of elements. The
+  // An overload of Relayout which changes the layout of the entire shape rather
+  // than being limited to a single array within the shape.
+  std::unique_ptr<Literal> Relayout(const Shape& shape_with_layout) const;
+
+  // Creates a new literal by reshaping this literal to have the given
+  // dimensions. The total number of elements must not change; The
   // implementation currently only supports monotonic dim0-major layouts.
   StatusOr<std::unique_ptr<Literal>> Reshape(
-      tensorflow::gtl::ArraySlice<int64> shape) const;
+      tensorflow::gtl::ArraySlice<int64> dimensions) const;
 
   // Creates a new literal by reordering the dimensions of this literal.
   // The given `permutation` must be a permutation of the dimension numbers
@@ -1106,7 +1111,7 @@ void Literal::PopulateR2WithLayout(
       primitive_util::NativeToPrimitiveType<NativeT>(),
       {static_cast<int64>(values.size()),
        static_cast<int64>(values.begin()->size())},
-      AsInt64Slice(layout.minor_to_major()));
+      LayoutUtil::MinorToMajor(layout));
 
   const int64 dim0_size = values.size();
   const int64 dim1_size = values.begin()->size();
@@ -1137,9 +1142,10 @@ void Literal::PopulateR2(
 template <typename NativeT>
 void Literal::PopulateFromArrayWithLayout(const Array<NativeT>& values,
                                           const Layout& layout) {
+  CHECK_EQ(layout.format(), DENSE);
   *mutable_shape() = ShapeUtil::MakeShapeWithLayout(
       primitive_util::NativeToPrimitiveType<NativeT>(), values.dimensions(),
-      AsInt64Slice(layout.minor_to_major()));
+      LayoutUtil::MinorToMajor(layout));
   Reserve(values.num_elements());
   values.Each([this](tensorflow::gtl::ArraySlice<int64> indices,
                      NativeT value) { this->Set(indices, value); });
diff --git a/tensorflow/compiler/xla/literal_util_test.cc b/tensorflow/compiler/xla/literal_util_test.cc
index 816bb3c549eaae4e8fc2b7d438627266603272f9..7ff64c4134155e7fe22ab99584970a7d6d6e8803 100644
--- a/tensorflow/compiler/xla/literal_util_test.cc
+++ b/tensorflow/compiler/xla/literal_util_test.cc
@@ -515,7 +515,7 @@ TYPED_TEST(LiteralUtilTestTemplated, Relayout2x2) {
 
 TEST_F(LiteralUtilTest, ReshapeR0) {
   auto original = Literal::CreateR0<float>(1.7f);
-  auto reshape = original->Reshape(/*shape=*/{}).ConsumeValueOrDie();
+  auto reshape = original->Reshape(/*dimensions=*/{}).ConsumeValueOrDie();
   EXPECT_EQ(*original, *reshape);
 }
 
diff --git a/tensorflow/compiler/xla/primitive_util.h b/tensorflow/compiler/xla/primitive_util.h
index 19c6a138885c61f1304bfae3d8bb5d958a1bb5bc..cb4583d198b454be1432134a9f6a77dbbbe5bdd8 100644
--- a/tensorflow/compiler/xla/primitive_util.h
+++ b/tensorflow/compiler/xla/primitive_util.h
@@ -26,6 +26,13 @@ limitations under the License.
 namespace xla {
 namespace primitive_util {
 
+// The number of exponent bits in a BF16 value.
+const int kBFloat16ExponentBits = 8;
+
+// The number of mantissa bits in a BF16 value. There is an implicit leading
+// 1, so there is an implicit additional bit of precision.
+const int kBFloat16MantissaBits = 7;
+
 // Returns the XLA primitive type (eg, F32) corresponding to the given
 // template parameter native type (eg, float).
 template <typename NativeT>
diff --git a/tensorflow/compiler/xla/ptr_util.h b/tensorflow/compiler/xla/ptr_util.h
index 627ddf535fe734ac55d01dabb7f160b46e6e69d8..c58c19db2cacbe9b038160f27b9bd76aa58146eb 100644
--- a/tensorflow/compiler/xla/ptr_util.h
+++ b/tensorflow/compiler/xla/ptr_util.h
@@ -37,7 +37,7 @@ std::unique_ptr<T> WrapUnique(T* ptr) {
 template <typename T, typename... Args>
 typename tensorflow::helper::MakeUniqueResult<T>::scalar MakeUnique(
     Args&&... args) {
-  return tensorflow::MakeUnique<T, Args>(std::forward<Args>(args)...);
+  return tensorflow::MakeUnique<T, Args...>(std::forward<Args>(args)...);
 }
 
 // Overload for array of unknown bound.
diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a6b8158671fd7872fd3492fe647558f7a3c3d1d8
--- /dev/null
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -0,0 +1,82 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow:internal"])
+
+load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
+
+py_library(
+    name = "xla_client",
+    srcs = ["xla_client.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":pywrap_xla",
+        "//tensorflow/compiler/xla:xla_data_proto_py",
+    ],
+)
+
+py_test(
+    name = "xla_client_test",
+    srcs = ["xla_client_test.py"],
+    main = "xla_client_test.py",
+    srcs_version = "PY2AND3",
+    deps = [
+        ":xla_client",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cc_library(
+    name = "numpy_bridge",
+    srcs = ["numpy_bridge.cc"],
+    hdrs = ["numpy_bridge.h"],
+    deps = [
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+        "//tensorflow/python:numpy_lib",
+    ],
+)
+
+cc_library(
+    name = "local_computation_builder",
+    srcs = ["local_computation_builder.cc"],
+    hdrs = ["local_computation_builder.h"],
+    deps = [
+        "//tensorflow/compiler/xla:executable_run_options",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/client:client_library",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/service:cpu_plugin",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_py_wrap_cc(
+    name = "pywrap_xla",
+    srcs = ["xla.i"],
+    swig_includes = [
+        "local_computation_builder.i",
+    ],
+    deps = [
+        ":local_computation_builder",
+        ":numpy_bridge",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/compiler/xla/python/__init__.py b/tensorflow/compiler/xla/python/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0b0a53fac7adf2c088a3ceb9ae58a5ce2c7adf92
--- /dev/null
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -0,0 +1,265 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/python/local_computation_builder.h"
+#include "tensorflow/compiler/xla/executable_run_options.h"
+#include "tensorflow/compiler/xla/util.h"
+
+namespace xla {
+
+namespace swig {
+
+CompiledLocalComputation::CompiledLocalComputation(
+    std::unique_ptr<LocalExecutable> executable)
+    : executable_(std::move(executable)) {}
+
+std::unique_ptr<Literal> CompiledLocalComputation::Execute(
+    const std::vector<Literal>& arguments) {
+  LocalClient* client = ClientLibrary::LocalClientOrDie();
+
+  // Transfer arguments in
+  std::vector<std::unique_ptr<ScopedShapedBuffer>> scoped_buffers;
+  scoped_buffers.reserve(arguments.size());
+  for (const Literal& argument : arguments) {
+    scoped_buffers.push_back(
+        client
+            ->LiteralToShapedBuffer(argument,
+                                    /*device_ordinal=*/0,
+                                    client->backend().memory_allocator())
+            .ConsumeValueOrDie());
+  }
+
+  // Execute
+  std::vector<const ShapedBuffer*> argument_buffers;
+  argument_buffers.reserve(scoped_buffers.size());
+  for (auto& buffer : scoped_buffers) {
+    argument_buffers.push_back(buffer.get());
+  }
+  ExecutableRunOptions options;
+  options.set_allocator(client->backend().memory_allocator());
+  options.set_inter_op_thread_pool(client->backend().inter_op_thread_pool());
+  options.set_intra_op_thread_pool(
+      client->backend().eigen_intra_op_thread_pool_device());
+  std::unique_ptr<ScopedShapedBuffer> result_buffer =
+      executable_->Run(argument_buffers, options).ConsumeValueOrDie();
+
+  // Transfer result out
+  return client->ShapedBufferToLiteral(*result_buffer).ConsumeValueOrDie();
+}
+
+LocalComputation::LocalComputation(std::unique_ptr<Computation> computation)
+    : computation_(std::move(computation)) {}
+
+CompiledLocalComputation* LocalComputation::Compile(
+    const std::vector<Shape>& argument_shapes) {
+  std::vector<const Shape*> argument_shape_pointers;
+  argument_shape_pointers.reserve(argument_shapes.size());
+  for (auto& argument_shape : argument_shapes) {
+    argument_shape_pointers.push_back(&argument_shape);
+  }
+
+  LocalClient* client = ClientLibrary::LocalClientOrDie();
+  ExecutableBuildOptions options;
+  return new CompiledLocalComputation(
+      client->Compile(*computation_, argument_shape_pointers, options)
+          .ValueOrDie());
+}
+
+const Computation& LocalComputation::computation() const {
+  return *computation_;
+}
+
+LocalComputationBuilder::LocalComputationBuilder(const string& computation_name)
+    : builder_(ClientLibrary::LocalClientOrDie(), computation_name) {}
+
+LocalComputation* LocalComputationBuilder::Build() {
+  return new LocalComputation(std::unique_ptr<Computation>(
+      new Computation(builder_.Build().ConsumeValueOrDie())));
+}
+
+ComputationDataHandle LocalComputationBuilder::Parameter(int64 parameter_number,
+                                                         const Shape& shape,
+                                                         const string& name) {
+  return builder_.Parameter(parameter_number, shape, name);
+}
+
+std::unique_ptr<Shape> LocalComputationBuilder::GetShape(
+    const ComputationDataHandle& operand) {
+  return builder_.GetShape(operand).ConsumeValueOrDie();
+}
+
+ComputationDataHandle LocalComputationBuilder::ConstantLiteral(
+    const Literal& literal) {
+  return builder_.ConstantLiteral(literal);
+}
+
+ComputationDataHandle LocalComputationBuilder::Broadcast(
+    const ComputationDataHandle& operand,
+    tensorflow::gtl::ArraySlice<int64> broadcast_sizes) {
+  return builder_.Broadcast(operand, broadcast_sizes);
+}
+
+ComputationDataHandle LocalComputationBuilder::Reshape(
+    const ComputationDataHandle& operand,
+    tensorflow::gtl::ArraySlice<int64> dimensions,
+    tensorflow::gtl::ArraySlice<int64> new_sizes) {
+  return builder_.Reshape(operand, dimensions, new_sizes);
+}
+
+ComputationDataHandle LocalComputationBuilder::Slice(
+    const ComputationDataHandle& operand,
+    tensorflow::gtl::ArraySlice<int64> start_indices,
+    tensorflow::gtl::ArraySlice<int64> limit_indices,
+    tensorflow::gtl::ArraySlice<int64> strides) {
+  return builder_.Slice(operand, start_indices, limit_indices, strides);
+}
+
+ComputationDataHandle LocalComputationBuilder::DynamicSlice(
+    const ComputationDataHandle& operand,
+    const ComputationDataHandle& start_indices,
+    tensorflow::gtl::ArraySlice<int64> slice_sizes) {
+  return builder_.DynamicSlice(operand, start_indices, slice_sizes);
+}
+
+ComputationDataHandle LocalComputationBuilder::DynamicUpdateSlice(
+    const ComputationDataHandle& operand, const ComputationDataHandle& update,
+    const ComputationDataHandle& start_indices) {
+  return builder_.DynamicUpdateSlice(operand, update, start_indices);
+}
+
+ComputationDataHandle LocalComputationBuilder::ConcatInDim(
+    tensorflow::gtl::ArraySlice<ComputationDataHandle> operands,
+    int64 dimension) {
+  return builder_.ConcatInDim(operands, dimension);
+}
+
+ComputationDataHandle LocalComputationBuilder::Select(
+    const ComputationDataHandle& pred, const ComputationDataHandle& on_true,
+    const ComputationDataHandle& on_false) {
+  return builder_.Select(pred, on_true, on_false);
+}
+
+ComputationDataHandle LocalComputationBuilder::Tuple(
+    tensorflow::gtl::ArraySlice<ComputationDataHandle> elements) {
+  return builder_.Tuple(elements);
+}
+
+ComputationDataHandle LocalComputationBuilder::GetTupleElement(
+    const ComputationDataHandle& tuple_data, int64 index) {
+  return builder_.GetTupleElement(tuple_data, index);
+}
+
+ComputationDataHandle LocalComputationBuilder::Dot(
+    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs) {
+  return builder_.Dot(lhs, rhs);
+}
+
+ComputationDataHandle LocalComputationBuilder::ConvertElementType(
+    const ComputationDataHandle& operand, PrimitiveType new_element_type) {
+  return builder_.ConvertElementType(operand, new_element_type);
+}
+
+ComputationDataHandle LocalComputationBuilder::Call(
+    const LocalComputation& local_computation,
+    tensorflow::gtl::ArraySlice<ComputationDataHandle> operands) {
+  return builder_.Call(local_computation.computation(), operands);
+}
+
+ComputationDataHandle LocalComputationBuilder::Transpose(
+    const ComputationDataHandle& operand,
+    tensorflow::gtl::ArraySlice<int64> permutation) {
+  return builder_.Transpose(operand, permutation);
+}
+
+ComputationDataHandle LocalComputationBuilder::Map(
+    tensorflow::gtl::ArraySlice<ComputationDataHandle> operands,
+    const LocalComputation& local_computation,
+    tensorflow::gtl::ArraySlice<int64> dimensions,
+    tensorflow::gtl::ArraySlice<ComputationDataHandle> static_operands) {
+  return builder_.Map(operands, local_computation.computation(), dimensions,
+                      static_operands);
+}
+
+ComputationDataHandle LocalComputationBuilder::Reduce(
+    const ComputationDataHandle& operand,
+    const ComputationDataHandle& init_value,
+    const LocalComputation& local_computation,
+    tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce) {
+  return builder_.Reduce(operand, init_value, local_computation.computation(),
+                         dimensions_to_reduce);
+}
+
+ComputationDataHandle LocalComputationBuilder::While(
+    const LocalComputation& condition, const LocalComputation& body,
+    const ComputationDataHandle& init) {
+  return builder_.While(condition.computation(), body.computation(), init);
+}
+
+#define _FORWARD(method_name, return_sig, args_sig, args)    \
+  return_sig LocalComputationBuilder::method_name args_sig { \
+    return builder_.method_name args;                        \
+  }
+
+#define _FORWARD_UNOP(method_name)             \
+  _FORWARD(method_name, ComputationDataHandle, \
+           (const ComputationDataHandle& operand), (operand))
+
+#define _FORWARD_BINOP(method_name)                                        \
+  _FORWARD(                                                                \
+      method_name, ComputationDataHandle,                                  \
+      (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \
+       tensorflow::gtl::ArraySlice<int64> broadcast_dimensions),           \
+      (lhs, rhs, broadcast_dimensions))
+
+_FORWARD_BINOP(Eq)
+_FORWARD_BINOP(Ne)
+_FORWARD_BINOP(Ge)
+_FORWARD_BINOP(Gt)
+_FORWARD_BINOP(Lt)
+_FORWARD_BINOP(Le)
+_FORWARD_BINOP(Add)
+_FORWARD_BINOP(Sub)
+_FORWARD_BINOP(Mul)
+_FORWARD_BINOP(Div)
+_FORWARD_BINOP(Rem)
+_FORWARD_BINOP(Max)
+_FORWARD_BINOP(Min)
+_FORWARD_BINOP(And)
+_FORWARD_BINOP(Or)
+_FORWARD_UNOP(Not)
+_FORWARD_UNOP(Abs)
+_FORWARD_UNOP(Exp)
+_FORWARD_UNOP(Floor)
+_FORWARD_UNOP(Ceil)
+_FORWARD_UNOP(Log)
+_FORWARD_UNOP(Sign)
+_FORWARD_UNOP(Cos)
+_FORWARD_UNOP(Sin)
+_FORWARD_UNOP(Tanh)
+_FORWARD_UNOP(SqrtF32)
+_FORWARD_UNOP(SquareF32)
+_FORWARD_BINOP(Pow)
+_FORWARD_UNOP(IsFinite)
+_FORWARD_UNOP(ReciprocalF32)
+_FORWARD_UNOP(Neg)
+_FORWARD_UNOP(Sort)
+
+#undef _FORWARD
+#undef _FORWARD_UNOP
+#undef _FORWARD_BINOP
+
+}  // namespace swig
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
new file mode 100644
index 0000000000000000000000000000000000000000..cbab45a5f0132eb08f291f542d40df6d0689e7ae
--- /dev/null
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -0,0 +1,210 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_LOCAL_COMPUTATION_BUILDER_H_
+#define TENSORFLOW_COMPILER_XLA_PYTHON_LOCAL_COMPUTATION_BUILDER_H_
+
+#include "tensorflow/compiler/xla/client/client_library.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+
+namespace xla {
+
+namespace swig {
+
+// Wraps a LocalExecutable produced by compiling a
+// LocalComputation. The Execute method forwards to that of the
+// underlying LocalExecutable, and additionally handles tranferring
+// arguments and return values in and back out of the client library's
+// local client. This class is intended to be made available to Python
+// via SWIG.
+class CompiledLocalComputation {
+ public:
+  CompiledLocalComputation(std::unique_ptr<LocalExecutable> executable);
+  std::unique_ptr<Literal> Execute(const std::vector<Literal>& arguments);
+
+ private:
+  std::unique_ptr<LocalExecutable> executable_;
+};
+
+// Wraps a Computation produced by a LocalComputationBuilder. The
+// Compile method compiles the computation to a (local) executable via
+// the client library's local client. This class is intended to be
+// made available to Python via SWIG.
+class LocalComputation {
+ public:
+  LocalComputation(std::unique_ptr<Computation> computation);
+  CompiledLocalComputation* Compile(const std::vector<Shape>& argument_shapes);
+  const Computation& computation() const;
+
+ private:
+  std::unique_ptr<Computation> computation_;
+};
+
+// Wraps the ComputationBuilder API in order to:
+// - Support consumption by SWIG in order to be made available to
+//   Python.
+// - Set up the underlying builder to use the client library's
+//   LocalClient.
+// - Wrap Computations in LocalComputations for Python access.
+// - Correspondingly unwrap incoming LocalComputations.
+class LocalComputationBuilder {
+ public:
+  LocalComputationBuilder(const string& computation_name);
+
+  LocalComputation* Build();
+
+  ComputationDataHandle Parameter(int64 parameter_number, const Shape& shape,
+                                  const string& name);
+
+  std::unique_ptr<Shape> GetShape(const ComputationDataHandle& operand);
+
+  ComputationDataHandle ConstantLiteral(const Literal& literal);
+
+  ComputationDataHandle Broadcast(
+      const ComputationDataHandle& operand,
+      tensorflow::gtl::ArraySlice<int64> broadcast_sizes);
+
+  ComputationDataHandle Reshape(const ComputationDataHandle& operand,
+                                tensorflow::gtl::ArraySlice<int64> dimensions,
+                                tensorflow::gtl::ArraySlice<int64> new_sizes);
+
+  ComputationDataHandle Slice(const ComputationDataHandle& operand,
+                              tensorflow::gtl::ArraySlice<int64> start_indices,
+                              tensorflow::gtl::ArraySlice<int64> limit_indices,
+                              tensorflow::gtl::ArraySlice<int64> strides);
+
+  ComputationDataHandle DynamicSlice(
+      const ComputationDataHandle& operand,
+      const ComputationDataHandle& start_indices,
+      tensorflow::gtl::ArraySlice<int64> slice_sizes);
+
+  ComputationDataHandle DynamicUpdateSlice(
+      const ComputationDataHandle& operand, const ComputationDataHandle& update,
+      const ComputationDataHandle& start_indices);
+
+  ComputationDataHandle ConcatInDim(
+      tensorflow::gtl::ArraySlice<ComputationDataHandle> operands,
+      int64 dimension);
+
+  ComputationDataHandle Select(const ComputationDataHandle& pred,
+                               const ComputationDataHandle& on_true,
+                               const ComputationDataHandle& on_false);
+
+  ComputationDataHandle Tuple(
+      tensorflow::gtl::ArraySlice<ComputationDataHandle> elements);
+
+  ComputationDataHandle GetTupleElement(const ComputationDataHandle& tuple_data,
+                                        int64 index);
+
+  ComputationDataHandle Dot(const ComputationDataHandle& lhs,
+                            const ComputationDataHandle& rhs);
+
+  ComputationDataHandle ConvertElementType(const ComputationDataHandle& operand,
+                                           PrimitiveType new_element_type);
+
+  ComputationDataHandle Call(
+      const LocalComputation& local_computation,
+      tensorflow::gtl::ArraySlice<ComputationDataHandle> operands);
+
+  ComputationDataHandle Transpose(
+      const ComputationDataHandle& operand,
+      tensorflow::gtl::ArraySlice<int64> permutation);
+
+  ComputationDataHandle Map(
+      tensorflow::gtl::ArraySlice<ComputationDataHandle> operands,
+      const LocalComputation& local_computation,
+      tensorflow::gtl::ArraySlice<int64> dimensions,
+      tensorflow::gtl::ArraySlice<ComputationDataHandle> static_operands);
+
+  ComputationDataHandle Reduce(
+      const ComputationDataHandle& operand,
+      const ComputationDataHandle& init_value,
+      const LocalComputation& local_computation,
+      tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce);
+
+  ComputationDataHandle While(const LocalComputation& condition,
+                              const LocalComputation& body,
+                              const ComputationDataHandle& init);
+
+#define _FORWARD(method_name, return_sig, args_sig) \
+  return_sig method_name args_sig;
+
+#define _FORWARD_UNOP(method_name)             \
+  _FORWARD(method_name, ComputationDataHandle, \
+           (const ComputationDataHandle& operand))
+
+#define _FORWARD_BINOP(method_name)                                        \
+  _FORWARD(                                                                \
+      method_name, ComputationDataHandle,                                  \
+      (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \
+       tensorflow::gtl::ArraySlice<int64> broadcast_dimensions))
+
+  _FORWARD_BINOP(Eq)
+  _FORWARD_BINOP(Ne)
+  _FORWARD_BINOP(Ge)
+  _FORWARD_BINOP(Gt)
+  _FORWARD_BINOP(Lt)
+  _FORWARD_BINOP(Le)
+  _FORWARD_BINOP(Add)
+  _FORWARD_BINOP(Sub)
+  _FORWARD_BINOP(Mul)
+  _FORWARD_BINOP(Div)
+  _FORWARD_BINOP(Rem)
+  _FORWARD_BINOP(Max)
+  _FORWARD_BINOP(Min)
+  _FORWARD_BINOP(And)
+  _FORWARD_BINOP(Or)
+  _FORWARD_UNOP(Not)
+  _FORWARD_UNOP(Abs)
+  _FORWARD_UNOP(Exp)
+  _FORWARD_UNOP(Floor)
+  _FORWARD_UNOP(Ceil)
+  _FORWARD_UNOP(Log)
+  _FORWARD_UNOP(Sign)
+  _FORWARD_UNOP(Cos)
+  _FORWARD_UNOP(Sin)
+  _FORWARD_UNOP(Tanh)
+  _FORWARD_UNOP(SqrtF32)
+  _FORWARD_UNOP(SquareF32)
+  _FORWARD_BINOP(Pow)
+  _FORWARD_UNOP(IsFinite)
+  _FORWARD_UNOP(ReciprocalF32)
+  _FORWARD_UNOP(Neg)
+  _FORWARD_UNOP(Sort)
+
+#undef _FORWARD
+#undef _FORWARD_UNOP
+#undef _FORWARD_BINOP
+
+ private:
+  ComputationBuilder builder_;
+};
+
+static void DeleteLocalComputation(LocalComputation* computation) {
+  delete computation;
+}
+
+static void DeleteCompiledLocalComputation(
+    CompiledLocalComputation* computation) {
+  delete computation;
+}
+
+}  // namespace swig
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_PYTHON_LOCAL_COMPUTATION_BUILDER_H_
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
new file mode 100644
index 0000000000000000000000000000000000000000..ac8f3e4277739cb97c1209a22bb5c6975266e3ee
--- /dev/null
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -0,0 +1,348 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// SWIG typemaps and declarations for building, compiling, and
+// executing XLA computations, wrapping most of what is declared in
+// local_computation_builder.h.
+//
+// The typemaps below implement/assert the following correspondences
+// (with elaborations below):
+//
+//    C++                                  Python
+// -------------------------------------+---------------------------------------
+//  ComputationDataHandle              <-> long
+//  ArraySlice<int64>                  <-  sequence of long
+//  ArraySlice<ComputationDataHandle>  <-  sequence of long
+//  Literal                            <-> (nested tuple of) numpy ndarray
+//  std::vector<Literal>               <-  sequence of (nested tuple of) ndarray
+//  Shape                              <-> pair holding (dtype, dimensions)
+//  std::vector<Shape>                 <-  sequence of shape information pairs
+//  PrimitiveType                      <-  int
+//
+// Arrows indicate whether a conversion only ever occurs in one
+// direction, or whether it is maintained bidirectionally. Also,
+// "long" and "int" denote the Python types so named, not C.
+//
+// The Python objects corresponding to C++ Literals have the type:
+//
+//   T = ndarray | (T, ...)
+//
+// where a terminal numpy ndarray translates to a Literal with a
+// non-tuple Shape, an XLA primitive element type corresponding to the
+// ndarray's dtype. Meanwhile, a non-terminal "tuple of T" translates
+// to a tuple-shaped Literal whose tuple components are translated
+// recursively. For example, if x is a numpy ndarray in Python, with
+// shape (2, 3) and dtype of dtype('float32'), then x translates to a
+// Literal with rank 2, dimension 2 and 3, and XLA primitive type
+// F32. Meanwhile,
+//
+//   (x, (x, x), (x,)),
+//
+// translates to a tuple-shaped XLA Literal, whose component subshapes
+// are a 2x3 F32-shaped literal followed by two tuple-shaped literals.
+//
+// The Python objects corresponding to C++ Shapes have the type:
+//
+//   T            = (dtype, S)
+//   S            = DIMENSIONS | TUPLE_SHAPES
+//   DIMENSIONS   = (int, ...)
+//   TUPLE_SHAPES = (T, ...)
+//
+// In the pair described by the T rule, the terminal dtype determines
+// whether S expands as DIMENSIONS or TUPLE_SHAPES. Namely if it is
+// dtype('O'), numpy's object dtype, the structure represents a tuple
+// shape and the expansion of the non-terminal S is
+// TUPLE_SHAPES. Otherwise, dtype describes a primitive element type
+// and S expands into DIMENSIONS giving dimension sizes. For example:
+//
+//   (dtype('float32'), (3, 5, 7))
+//
+// describes a 3x5x7 array of F32s, and
+//
+//   (dtype('O'), ((dtype('float32'), (2, 3)),
+//                 (dtype('float64'), (4, 5))))
+//
+// describes a tuple shape with two subshapes: the first a 2x3 F32,
+// and the other a 4x5 F64.
+//
+// The Python int corresponding to a PrimitiveType enum must be valid
+// per xla_data.proto (e.g. xla_data.PRED, xla_data.F32).
+//
+// The SWIG object wrappers generated by this file are not intended
+// for end use, but rather for internal use in the Python XLA client,
+// xla_client.py.
+//
+// One central reason for the Python-side indirection is that the
+// Python-side objects produced by the typemaps in this file are
+// further packaged up by xla_client before being passed on. For
+// instance, xla_client wraps the long produced for a C++
+// ComputationDataHandle in a Python ComputationDataHandle proto,
+// rather than exposing a raw long outside of the client. Similarly,
+// the Python pair produced for a C++ Shape is further wrapped in a
+// Python class (xla_client.Shape) so as not to expose the raw pair
+// externally.
+//
+// Other SWIG object wrappers (e.g. of LocalComputation) are further
+// wrapped by xla_client in order to set up a custom destructor that
+// triggers memory deallocation on the C++ side.
+
+%include "tensorflow/python/platform/base.i"
+
+%{
+// Must be included first
+#include "tensorflow/python/lib/core/numpy.h"
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/compiler/xla/python/numpy_bridge.h"
+#include "tensorflow/compiler/xla/python/local_computation_builder.h"
+
+using namespace xla;
+using namespace xla::swig;
+%}
+
+// Required to use PyArray_* functions.
+%init %{
+tensorflow::ImportNumpy();
+%}
+
+// ComputationDataHandle
+
+%typemap(in) const ComputationDataHandle& (ComputationDataHandle temp) {
+  const int64 handle = numpy::PyIntOrPyLongToLong($input);
+  if (handle == -1 && PyErr_Occurred()) {
+    return NULL;
+  }
+  temp.set_handle(handle);
+  $1 = &temp;
+}
+
+%typemap(out) ComputationDataHandle {
+  $result = numpy::LongToPyIntOrPyLong($1.handle());
+}
+
+// ArraySlice<int64>
+
+%typemap(in) tensorflow::gtl::ArraySlice<int64>
+    (std::vector<int64> temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    return NULL;
+  }
+  const int size = PySequence_Size($input);
+  temps.resize(size);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    PyObject* py_int = numpy::PyNumberToPyInt(o);
+    if (!py_int) {
+      PyErr_SetString(
+          PyExc_TypeError,
+          "Argument sequence element cannot be converted to int");
+      Py_DECREF(o);
+      return NULL;
+    }
+    temps[i] = numpy::PyIntOrPyLongToLong(py_int);
+    if (temps[i] == -1 && PyErr_Occurred()) {
+      Py_DECREF(py_int);
+      Py_DECREF(o);
+      return NULL;
+    }
+    Py_DECREF(py_int);
+    Py_DECREF(o);
+  }
+  $1 = temps;
+}
+
+// ComputationDataHandle
+
+%typemap(in) tensorflow::gtl::ArraySlice<ComputationDataHandle>
+    (std::vector<ComputationDataHandle> temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    return NULL;
+  }
+  const int size = PySequence_Size($input);
+  temps.resize(size);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    PyObject* py_int = numpy::PyNumberToPyInt(o);
+    if (!py_int) {
+      PyErr_SetString(
+          PyExc_TypeError,
+          "Argument sequence element cannot be converted to int");
+      return NULL;
+    }
+    const int64 handle = numpy::PyIntOrPyLongToLong(py_int);
+    if (handle == -1 && PyErr_Occurred()) {
+      Py_DECREF(py_int);
+      Py_DECREF(o);
+      return NULL;
+    }
+    temps[i].set_handle(handle);
+    Py_DECREF(py_int);
+    Py_DECREF(o);
+  }
+  $1 = temps;
+}
+
+// Literal
+
+%typemap(in) const Literal& (std::unique_ptr<Literal> temp) {
+  temp = numpy::XlaLiteralFromPyObject($input);
+  $1 = &*temp;
+}
+
+%typemap(out) std::unique_ptr<Literal> {
+  $result = numpy::PyObjectFromXlaLiteral(*$1);
+}
+
+%typemap(in) const std::vector<Literal>& (std::vector<Literal> temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    return NULL;
+  }
+  const int size = PySequence_Size($input);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    temps.push_back(*numpy::XlaLiteralFromPyObject(o));
+    Py_DECREF(o);
+  }
+  $1 = &temps;
+}
+
+// Shape
+
+%typemap(in) const Shape& (Shape temp) {
+  if (!numpy::CheckPyShapeInfo($input)) {
+    return NULL;
+  }
+  temp = numpy::XlaShapeFromPyShapeInfo($input);
+  $1 = &temp;
+}
+
+%typemap(out) std::unique_ptr<Shape> {
+  $result = numpy::PyShapeInfoFromXlaShape(*$1);
+}
+
+%typemap(in) const std::vector<Shape>& (std::vector<Shape> temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    return NULL;
+  }
+  const int size = PySequence_Size($input);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    if (!numpy::CheckPyShapeInfo(o)) {
+      Py_DECREF(o);
+      return NULL;
+    }
+    temps.push_back(numpy::XlaShapeFromPyShapeInfo(o));
+    Py_DECREF(o);
+  }
+  $1 = &temps;
+}
+
+// PrimitiveType
+
+%typemap(in) PrimitiveType {
+  PyObject* py_int = numpy::PyNumberToPyInt($input);
+  if (!py_int) {
+    PyErr_SetString(PyExc_TypeError, "Argument cannot be converted to int");
+    return NULL;
+  }
+  const long value = numpy::PyIntOrPyLongToLong(py_int);
+  if (value == -1 && PyErr_Occurred()) {
+    Py_DECREF(py_int);
+    return NULL;
+  }
+  if (!PrimitiveType_IsValid(value)) {
+    PyErr_SetString(
+        PyExc_TypeError, "Argument not valid for PrimitiveType enum");
+    Py_DECREF(py_int);
+    return NULL;
+  }
+  $1 = static_cast<PrimitiveType>(value);
+}
+
+%ignoreall
+%unignore xla;
+%unignore xla::swig;
+%unignore xla::swig::CompiledLocalComputation;
+%unignore xla::swig::CompiledLocalComputation::Execute;
+%unignore xla::swig::LocalComputation;
+%unignore xla::swig::LocalComputation::Compile;
+%unignore xla::swig::LocalComputationBuilder;
+%unignore xla::swig::LocalComputationBuilder::LocalComputationBuilder;
+%unignore xla::swig::LocalComputationBuilder::Build;
+%unignore xla::swig::LocalComputationBuilder::Parameter;
+%unignore xla::swig::LocalComputationBuilder::GetShape;
+%unignore xla::swig::LocalComputationBuilder::ConstantLiteral;
+%unignore xla::swig::LocalComputationBuilder::ConstantR0;
+%unignore xla::swig::LocalComputationBuilder::Broadcast;
+%unignore xla::swig::LocalComputationBuilder::Reshape;
+%unignore xla::swig::LocalComputationBuilder::Slice;
+%unignore xla::swig::LocalComputationBuilder::DynamicSlice;
+%unignore xla::swig::LocalComputationBuilder::DynamicUpdateSlice;
+%unignore xla::swig::LocalComputationBuilder::ConcatInDim;
+%unignore xla::swig::LocalComputationBuilder::Select;
+%unignore xla::swig::LocalComputationBuilder::Tuple;
+%unignore xla::swig::LocalComputationBuilder::GetTupleElement;
+%unignore xla::swig::LocalComputationBuilder::ConvertElementType;
+%unignore xla::swig::LocalComputationBuilder::Call;
+%unignore xla::swig::LocalComputationBuilder::Transpose;
+%unignore xla::swig::LocalComputationBuilder::Map;
+%unignore xla::swig::LocalComputationBuilder::Reduce;
+%unignore xla::swig::LocalComputationBuilder::While;
+%unignore xla::swig::LocalComputationBuilder::Eq;
+%unignore xla::swig::LocalComputationBuilder::Ne;
+%unignore xla::swig::LocalComputationBuilder::Ge;
+%unignore xla::swig::LocalComputationBuilder::Gt;
+%unignore xla::swig::LocalComputationBuilder::Lt;
+%unignore xla::swig::LocalComputationBuilder::Le;
+%unignore xla::swig::LocalComputationBuilder::Dot;
+%unignore xla::swig::LocalComputationBuilder::Add;
+%unignore xla::swig::LocalComputationBuilder::Sub;
+%unignore xla::swig::LocalComputationBuilder::Mul;
+%unignore xla::swig::LocalComputationBuilder::Div;
+%unignore xla::swig::LocalComputationBuilder::Rem;
+%unignore xla::swig::LocalComputationBuilder::Max;
+%unignore xla::swig::LocalComputationBuilder::Min;
+%unignore xla::swig::LocalComputationBuilder::And;
+%unignore xla::swig::LocalComputationBuilder::Or;
+%unignore xla::swig::LocalComputationBuilder::Not;
+%unignore xla::swig::LocalComputationBuilder::Abs;
+%unignore xla::swig::LocalComputationBuilder::Exp;
+%unignore xla::swig::LocalComputationBuilder::Floor;
+%unignore xla::swig::LocalComputationBuilder::Ceil;
+%unignore xla::swig::LocalComputationBuilder::Log;
+%unignore xla::swig::LocalComputationBuilder::Sign;
+%unignore xla::swig::LocalComputationBuilder::Cos;
+%unignore xla::swig::LocalComputationBuilder::Sin;
+%unignore xla::swig::LocalComputationBuilder::Tanh;
+%unignore xla::swig::LocalComputationBuilder::SqrtF32;
+%unignore xla::swig::LocalComputationBuilder::SquareF32;
+%unignore xla::swig::LocalComputationBuilder::Pow;
+%unignore xla::swig::LocalComputationBuilder::IsFinite;
+%unignore xla::swig::LocalComputationBuilder::ReciprocalF32;
+%unignore xla::swig::LocalComputationBuilder::Neg;
+%unignore xla::swig::LocalComputationBuilder::Sort;
+%unignore xla::swig::DeleteLocalComputation;
+%unignore xla::swig::DeleteCompiledLocalComputation;
+
+%include "tensorflow/compiler/xla/python/local_computation_builder.h"
+
+%unignoreall
diff --git a/tensorflow/compiler/xla/python/numpy_bridge.cc b/tensorflow/compiler/xla/python/numpy_bridge.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b30bdc3669de3992a08ab70ef49b0aa17cc855f3
--- /dev/null
+++ b/tensorflow/compiler/xla/python/numpy_bridge.cc
@@ -0,0 +1,389 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/python/numpy_bridge.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+
+namespace swig {
+
+namespace numpy {
+
+int PrimitiveTypeToNumpyType(PrimitiveType primitive_type) {
+  switch (primitive_type) {
+    case PRED:
+      return NPY_BOOL;
+    case S8:
+      return NPY_INT8;
+    case S16:
+      return NPY_INT16;
+    case S32:
+      return NPY_INT32;
+    case S64:
+      return NPY_INT64;
+    case U8:
+      return NPY_UINT8;
+    case U16:
+      return NPY_UINT16;
+    case U32:
+      return NPY_UINT32;
+    case U64:
+      return NPY_UINT64;
+    case F16:
+      return NPY_FLOAT16;
+    case F32:
+      return NPY_FLOAT32;
+    case F64:
+      return NPY_FLOAT64;
+    case TUPLE:
+      return NPY_OBJECT;
+    default:
+      LOG(FATAL) << "No Numpy type for XLA primitive type " << primitive_type;
+  }
+}
+
+PrimitiveType NumpyTypeToPrimitiveType(int np_type) {
+  switch (np_type) {
+    case NPY_BOOL:
+      return PRED;
+    case NPY_INT8:
+      return S8;
+    case NPY_INT16:
+      return S16;
+    case NPY_INT32:
+      return S32;
+    case NPY_INT64:
+      return S64;
+    case NPY_UINT8:
+      return U8;
+    case NPY_UINT16:
+      return U16;
+    case NPY_UINT32:
+      return U32;
+    case NPY_UINT64:
+      return U64;
+    case NPY_FLOAT16:
+      return F16;
+    case NPY_FLOAT32:
+      return F32;
+    case NPY_FLOAT64:
+      return F64;
+    case NPY_OBJECT:
+      return TUPLE;
+    default:
+      LOG(FATAL) << "No XLA primitive type for Numpy type " << np_type;
+  }
+}
+
+bool NumpyTypeIsValid(int np_type) {
+  switch (np_type) {
+    case NPY_BOOL:
+    case NPY_INT8:
+    case NPY_INT16:
+    case NPY_INT32:
+    case NPY_INT64:
+    case NPY_UINT8:
+    case NPY_UINT16:
+    case NPY_UINT32:
+    case NPY_UINT64:
+    case NPY_FLOAT16:
+    case NPY_FLOAT32:
+    case NPY_FLOAT64:
+    case NPY_OBJECT:
+      return true;
+    default:
+      return false;
+  }
+}
+
+PyObject* PyShapeInfoFromXlaShape(const Shape& shape) {
+  int np_typenum = PrimitiveTypeToNumpyType(shape.element_type());
+  PyArray_Descr* np_dtype = PyArray_DescrFromType(np_typenum);
+
+  PyObject* dimensions;
+  if (ShapeUtil::IsTuple(shape)) {
+    int num_elements = ShapeUtil::TupleElementCount(shape);
+    dimensions = PyTuple_New(ShapeUtil::TupleElementCount(shape));
+    for (int i = 0; i < num_elements; ++i) {
+      PyTuple_SET_ITEM(
+          dimensions, i,
+          PyShapeInfoFromXlaShape(ShapeUtil::GetTupleElementShape(shape, i)));
+    }
+  } else {
+    int rank = ShapeUtil::Rank(shape);
+    dimensions = PyTuple_New(rank);
+    for (int i = 0; i < rank; ++i) {
+      PyTuple_SET_ITEM(dimensions, i,
+                       LongToPyIntOrPyLong(ShapeUtil::GetDimension(shape, i)));
+    }
+  }
+  return PyTuple_Pack(2, np_dtype, dimensions);
+}
+
+// Precondition: o->ob_type == &PyArrayDescr_Type
+static int NumpyTypenum(PyObject* o) {
+  return reinterpret_cast<PyArray_Descr*>(o)->type_num;
+}
+
+bool CheckPyShapeInfo(PyObject* o) {
+  // The object is a tuple (a pair)
+  if (!PyTuple_Check(o)) {
+    PyErr_SetString(PyExc_TypeError, "Shape record must be a tuple");
+    return false;
+  }
+  if (PyTuple_Size(o) != 2) {
+    PyErr_SetString(PyExc_ValueError, "Shape record tuple must be of length 2");
+    return false;
+  }
+
+  // It has a first element, which is a numpy dtype object
+  PyObject* first = PyTuple_GetItem(o, 0);
+  if (!first) {
+    return false;
+  }
+  if (first->ob_type != &PyArrayDescr_Type) {
+    PyErr_SetString(
+        PyExc_TypeError,
+        "Shape record does not have a numpy dtype as its first element");
+    return false;
+  }
+  const int np_type = NumpyTypenum(first);
+  if (!NumpyTypeIsValid(np_type)) {
+    PyErr_SetString(PyExc_ValueError,
+                    "Shape record has an invalid integer dtype");
+    return false;
+  }
+
+  // It has a second element, which is a tuple, either of shape
+  // records or of Python ints
+  PyObject* second = PyTuple_GetItem(o, 1);
+  if (!second) {
+    return false;
+  }
+  if (!PyTuple_Check(second)) {
+    PyErr_SetString(PyExc_TypeError,
+                    "Shape record does not have a tuple as its second element");
+    return false;
+  }
+  const int length = PyTuple_Size(second);
+  const PrimitiveType element_type = NumpyTypeToPrimitiveType(np_type);
+  for (int i = 0; i < length; i++) {
+    PyObject* dimension = PyTuple_GetItem(second, i);
+    if (element_type == TUPLE) {
+      if (!CheckPyShapeInfo(dimension)) {
+        return false;
+      }
+    } else if (!CheckPyIntOrLong(dimension)) {
+      PyErr_SetString(PyExc_TypeError,
+                      "Non-tuple shape record has a non-integer dimension");
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Precondition: CheckPyShapeInfo(o)
+Shape XlaShapeFromPyShapeInfo(PyObject* o) {
+  const int np_type = NumpyTypenum(PyTuple_GetItem(o, 0));
+  const PrimitiveType element_type = NumpyTypeToPrimitiveType(np_type);
+  PyObject* py_dimensions = PyTuple_GetItem(o, 1);
+  const int length = PyTuple_Size(py_dimensions);
+  if (element_type == TUPLE) {
+    std::vector<Shape> subshapes;
+    subshapes.reserve(length);
+    for (int i = 0; i < length; i++) {
+      subshapes.push_back(
+          XlaShapeFromPyShapeInfo(PyTuple_GetItem(py_dimensions, i)));
+    }
+    return ShapeUtil::MakeTupleShape(subshapes);
+  } else {
+    std::vector<int64> dimensions(length);
+    for (int i = 0; i < length; i++) {
+      dimensions[i] = PyIntOrPyLongToLong(PyTuple_GetItem(py_dimensions, i));
+      if (dimensions[i] == -1) {
+        CHECK(!PyErr_Occurred());
+      }
+    }
+    return ShapeUtil::MakeShape(element_type, dimensions);
+  }
+}
+
+PyObject* PyObjectFromXlaLiteral(const Literal& literal) {
+  if (ShapeUtil::IsTuple(literal.shape())) {
+    const std::vector<Literal>& tuple_literals = literal.tuple_literals();
+    int num_elements = ShapeUtil::TupleElementCount(literal.shape());
+    PyObject* tuple = PyTuple_New(num_elements);
+    for (int i = 0; i < num_elements; i++) {
+      PyTuple_SET_ITEM(tuple, i, PyObjectFromXlaLiteral(tuple_literals[i]));
+    }
+    return tuple;
+  } else {
+    int rank = ShapeUtil::Rank(literal.shape());
+    std::vector<long> dimensions(rank);  // NOLINT - PyArray requires a long*
+    for (int i = 0; i < rank; i++) {
+      dimensions[i] = ShapeUtil::GetDimension(literal.shape(), i);
+    }
+    int np_type = PrimitiveTypeToNumpyType(literal.shape().element_type());
+    PyObject* array =
+        PyArray_EMPTY(rank, dimensions.data(), np_type, /*fortran=*/0);
+    CopyLiteralToNumpyArray(np_type, literal,
+                            reinterpret_cast<PyArrayObject*>(array));
+    return array;
+  }
+}
+
+std::unique_ptr<Literal> XlaLiteralFromPyObject(PyObject* o) {
+  if (PyTuple_Check(o)) {
+    int num_elements = PyTuple_Size(o);
+    std::vector<std::unique_ptr<Literal>> elements;
+    elements.reserve(num_elements);
+    for (int i = 0; i < num_elements; i++) {
+      PyObject* element = PyTuple_GetItem(o, i);
+      elements.push_back(XlaLiteralFromPyObject(element));
+    }
+    return Literal::MakeTupleOwned(std::move(elements));
+  } else if (PyArray_Check(o)) {
+    PyArrayObject* py_array = reinterpret_cast<PyArrayObject*>(o);
+    int rank = PyArray_NDIM(py_array);
+    std::vector<int64> dimensions(rank);
+    for (int i = 0; i < rank; i++) {
+      dimensions[i] = PyArray_DIM(py_array, i);
+    }
+    int np_type = PyArray_TYPE(py_array);
+    auto literal = Literal::CreateFromDimensions(
+        NumpyTypeToPrimitiveType(np_type), dimensions);
+    CopyNumpyArrayToLiteral(np_type, py_array, literal.get());
+    return literal;
+  } else {
+    LOG(FATAL)
+        << "Non-tuple or Numpy array encountered in conversion to XLA literal";
+  }
+}
+
+void CopyNumpyArrayToLiteral(int np_type, PyArrayObject* py_array,
+                             Literal* literal) {
+  switch (np_type) {
+    case NPY_BOOL:
+      CopyNumpyArrayToLiteral<bool>(py_array, literal);
+      break;
+    case NPY_INT32:
+      CopyNumpyArrayToLiteral<int32>(py_array, literal);
+      break;
+    case NPY_INT64:
+      CopyNumpyArrayToLiteral<int64>(py_array, literal);
+      break;
+    case NPY_UINT8:
+      CopyNumpyArrayToLiteral<uint8>(py_array, literal);
+      break;
+    case NPY_UINT32:
+      CopyNumpyArrayToLiteral<uint32>(py_array, literal);
+      break;
+    case NPY_UINT64:
+      CopyNumpyArrayToLiteral<uint64>(py_array, literal);
+      break;
+    case NPY_FLOAT16:
+      CopyNumpyArrayToLiteral<half>(py_array, literal);
+      break;
+    case NPY_FLOAT32:
+      CopyNumpyArrayToLiteral<float>(py_array, literal);
+      break;
+    case NPY_FLOAT64:
+      CopyNumpyArrayToLiteral<double>(py_array, literal);
+      break;
+    default:
+      LOG(FATAL) << "No XLA literal container for Numpy type" << np_type;
+  }
+}
+
+void CopyLiteralToNumpyArray(int np_type, const Literal& literal,
+                             PyArrayObject* py_array) {
+  switch (np_type) {
+    case NPY_BOOL:
+      CopyLiteralToNumpyArray<bool>(literal, py_array);
+      break;
+    case NPY_INT32:
+      CopyLiteralToNumpyArray<int32>(literal, py_array);
+      break;
+    case NPY_INT64:
+      CopyLiteralToNumpyArray<int64>(literal, py_array);
+      break;
+    case NPY_UINT8:
+      CopyLiteralToNumpyArray<uint8>(literal, py_array);
+      break;
+    case NPY_UINT32:
+      CopyLiteralToNumpyArray<uint32>(literal, py_array);
+      break;
+    case NPY_UINT64:
+      CopyLiteralToNumpyArray<uint64>(literal, py_array);
+      break;
+    case NPY_FLOAT16:
+      CopyLiteralToNumpyArray<half>(literal, py_array);
+      break;
+    case NPY_FLOAT32:
+      CopyLiteralToNumpyArray<float>(literal, py_array);
+      break;
+    case NPY_FLOAT64:
+      CopyLiteralToNumpyArray<double>(literal, py_array);
+      break;
+    default:
+      LOG(FATAL) << "No XLA literal container for Numpy type" << np_type;
+  }
+}
+
+PyObject* LongToPyIntOrPyLong(long x) {  // NOLINT
+#if PY_MAJOR_VERSION < 3
+  return PyInt_FromLong(x);
+#else
+  return PyLong_FromLong(x);
+#endif
+}
+
+long PyIntOrPyLongToLong(PyObject* o) {  // NOLINT
+#if PY_MAJOR_VERSION < 3
+  return PyInt_AsLong(o);
+#else
+  return PyLong_AsLong(o);
+#endif
+}
+
+bool CheckPyIntOrLong(PyObject* o) {
+#if PY_MAJOR_VERSION < 3
+  return PyInt_Check(o);
+#else
+  if (!PyLong_Check(o)) {
+    return false;
+  }
+  int overflow = 0;
+  PyLong_AsLongAndOverflow(o, &overflow);
+  return (overflow == 0);
+#endif
+}
+
+PyObject* PyNumberToPyInt(PyObject* o) {
+#if PY_MAJOR_VERSION < 3
+  return PyNumber_Int(o);
+#else
+  return PyNumber_Long(o);
+#endif
+}
+
+}  // namespace numpy
+
+}  // namespace swig
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/python/numpy_bridge.h b/tensorflow/compiler/xla/python/numpy_bridge.h
new file mode 100644
index 0000000000000000000000000000000000000000..4e6ecbb0e8b58979ec1f1484e722725c391106fb
--- /dev/null
+++ b/tensorflow/compiler/xla/python/numpy_bridge.h
@@ -0,0 +1,123 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// These functions transform Python/Numpy data structures to XLA data
+// structures and vice versa, performing copies where
+// appropriate. Python tuples and Numpy ndarrays translate to XLA
+// tuples and XLA literals, respectively, and Numpy shape/dtype
+// information is translated to XLA shape information.
+
+#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_NUMPY_BRIDGE_H_
+#define TENSORFLOW_COMPILER_XLA_PYTHON_NUMPY_BRIDGE_H_
+
+#include <algorithm>
+#include <memory>
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/python/lib/core/numpy.h"
+
+namespace xla {
+
+namespace swig {
+
+namespace numpy {
+
+// Maps XLA primitive types (PRED, S8, F32, ..., and TUPLE) to numpy
+// dtypes (NPY_BOOL, NPY_INT8, NPY_FLOAT32, ..., and NPY_OBJECT), and
+// vice versa.
+int PrimitiveTypeToNumpyType(PrimitiveType primitive_type);
+PrimitiveType NumpyTypeToPrimitiveType(int np_type);
+
+// Determines whether an integer-encoded Numpy dtype is valid,
+// i.e. has a supported conversion to an XLA PrimitiveType.
+bool NumpyTypeIsValid(int np_type);
+
+// Converts XLA shape information into a Python pair of the form
+// (numpy dtype, dimensions). If the XLA shape represents a tuple,
+// then the numpy dtype is NPY_OBJECT ('O') and `dimensions` is a
+// Python tuple of shape-description pairs, created
+// recursively. Otherwise, `dimensions` is a Python tuple-of-integers
+// providing the array dimensions.
+//
+// The return value is a new reference.
+PyObject* PyShapeInfoFromXlaShape(const Shape& shape);
+
+// Returns the outcome of a best-effort check that the Python object
+// is a pair of the form (numpy dtype, dimensions), as produced by
+// PyShapeInfoFromXlaShape.
+bool CheckPyShapeInfo(PyObject* o);
+
+// Performs the inverse conversion to that of PyShapeInfoFromXlaShape.
+//
+// The return value is a new reference.
+Shape XlaShapeFromPyShapeInfo(PyObject* o);
+
+// Converts an XLA literal to a Python object, either a Numpy ndarray
+// or a nested Python tuple thereof.
+//
+// To avoid transferring ownership of the data buffers that underlie
+// PyArrays and XLA literals, this function makes deep copies of all
+// array data.
+//
+// The return value is a new reference.
+PyObject* PyObjectFromXlaLiteral(const Literal& literal);
+
+// Converts a Numpy ndarray or a nested Python tuple thereof to a
+// corresponding XLA literal.
+//
+// To avoid transferring ownership of the data buffers that underlie
+// PyArrays and XLA literals, this function makes deep copies of all
+// array data.
+std::unique_ptr<Literal> XlaLiteralFromPyObject(PyObject* o);
+
+// The following functions copy array data from the buffers underlying Numpy
+// ndarrays into those underlying XLA literals, and vice versa.
+
+void CopyNumpyArrayToLiteral(int np_type, PyArrayObject* py_array,
+                             Literal* literal);
+
+void CopyLiteralToNumpyArray(int np_type, const Literal& literal,
+                             PyArrayObject* py_array);
+
+template <typename NativeT>
+void CopyNumpyArrayToLiteral(PyArrayObject* py_array, Literal* literal) {
+  NativeT* source = static_cast<NativeT*>(PyArray_DATA(py_array));
+  auto dest = literal->GetMutableArraySlice<NativeT>();
+  std::copy(source, source + PyArray_SIZE(py_array), dest.data());
+}
+
+template <typename NativeT>
+void CopyLiteralToNumpyArray(const Literal& literal, PyArrayObject* py_array) {
+  NativeT* dest = static_cast<NativeT*>(PyArray_DATA(py_array));
+  auto source = literal.GetArraySlice<NativeT>();
+  std::copy(source.begin(), source.end(), dest);
+}
+
+// Workarounds for Python 2 and 3 interop
+
+PyObject* LongToPyIntOrPyLong(long x);  // NOLINT
+long PyIntOrPyLongToLong(PyObject* o);  // NOLINT
+bool CheckPyIntOrLong(PyObject* o);
+PyObject* PyNumberToPyInt(PyObject* o);
+
+}  // namespace numpy
+
+}  // namespace swig
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_PYTHON_NUMPY_BRIDGE_H_
diff --git a/tensorflow/compiler/xla/python/xla.i b/tensorflow/compiler/xla/python/xla.i
new file mode 100644
index 0000000000000000000000000000000000000000..1c4021a558d3fcff2abfdbdbad7f3928e86ed3b8
--- /dev/null
+++ b/tensorflow/compiler/xla/python/xla.i
@@ -0,0 +1,18 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/* XLA-wide SWIG wrapper */
+
+%include "tensorflow/compiler/xla/python/local_computation_builder.i"
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
new file mode 100644
index 0000000000000000000000000000000000000000..c75d54856dd699ec5cd8a2337007a064ba709de8
--- /dev/null
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -0,0 +1,605 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""An in-process, local XLA client in Python, supporting AOT compilation."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+import numpy as np
+
+from tensorflow.compiler.xla import xla_data_pb2
+from tensorflow.compiler.xla.python import pywrap_xla as c_api
+
+_UNARY_OPS = [
+    'Not',
+    'Abs',
+    'Exp',
+    'Floor',
+    'Ceil',
+    'Log',
+    'Sign',
+    'Cos',
+    'Sin',
+    'Tanh',
+    'SqrtF32',
+    'SquareF32',
+    'IsFinite',
+    'ReciprocalF32',
+    'Neg',
+    'Sort',
+]
+
+_BINARY_OPS = [
+    'Eq',
+    'Ne',
+    'Ge',
+    'Gt',
+    'Lt',
+    'Le',
+    'Add',
+    'Sub',
+    'Mul',
+    'Div',
+    'Rem',
+    'Max',
+    'Min',
+    'And',
+    'Or',
+    'Pow',
+]
+
+# Most functions are snake_case for consistency with other modules,
+# whereas method names of ComputationBuilder and LocalComputation are
+# CamelCase for consistency with XLA.
+# pylint: disable=invalid-name
+
+XLA_ELEMENT_TYPE_TO_DTYPE = {
+    xla_data_pb2.F32: np.dtype(np.float32),
+    xla_data_pb2.F64: np.dtype(np.float64),
+    xla_data_pb2.S32: np.dtype(np.int32),
+    xla_data_pb2.S64: np.dtype(np.int64),
+    xla_data_pb2.PRED: np.dtype(np.bool),
+    xla_data_pb2.TUPLE: np.dtype(np.object),
+}
+
+DTYPE_TO_XLA_ELEMENT_TYPE = {
+    str(v): k
+    for k, v in XLA_ELEMENT_TYPE_TO_DTYPE.items()
+}
+
+
+class Shape(object):
+  """XLA shape.
+
+  Represents an XLA shape by a corresponding Python/Numpy type and a
+  list of dimensions, which are themselves Shapes in case this one
+  represents an XLA tuple.
+  """
+
+  def __init__(self, np_dtype, dimensions):
+    self.np_dtype = np_dtype
+    self._dimensions = dimensions
+
+  def element_type(self):
+    return DTYPE_TO_XLA_ELEMENT_TYPE[str(self.np_dtype)]
+
+  def is_tuple(self):
+    return self.element_type() == xla_data_pb2.TUPLE
+
+  def dimensions(self):
+    if self.is_tuple():
+      raise ValueError('Tuple shape has no dimensions')
+    return self._dimensions
+
+  def tuple_shapes(self):
+    if not self.is_tuple():
+      raise ValueError('Shape is not a tuple shape')
+    return self._dimensions
+
+  @staticmethod
+  def from_numpy(npval):
+
+    def convert(npval):
+      if isinstance(npval, tuple):
+        return Shape(np.dtype('O'), tuple(convert(elt) for elt in npval))
+      else:
+        return Shape(npval.dtype, np.shape(npval))
+
+    return convert(require_numpy_array_layout(npval))
+
+
+def _wrap_shape(shape_info):
+  dtype, dims = shape_info
+  element_type = DTYPE_TO_XLA_ELEMENT_TYPE[str(dtype)]
+  if element_type == xla_data_pb2.TUPLE:
+    dims = [_wrap_shape(subshape_info) for subshape_info in dims]
+  return Shape(dtype, dims)
+
+
+def _unwrap_shape(shape):
+  if shape.is_tuple():
+    components = tuple(
+        _unwrap_shape(subshape) for subshape in shape.tuple_shapes())
+  else:
+    components = shape.dimensions()
+  return (shape.np_dtype, components)
+
+
+def _unwrap_shapes(shapes):
+  return [_unwrap_shape(shape) for shape in shapes]
+
+
+def _wrap_data_handle(handle):
+  cdh = xla_data_pb2.ComputationDataHandle()
+  cdh.handle = handle
+  return cdh
+
+
+def _unwrap_data_handle(handle_proto):
+  return handle_proto.handle
+
+
+def _unwrap_data_handles(handle_protos):
+  return [_unwrap_data_handle(cdh) for cdh in handle_protos]
+
+
+def require_numpy_array_layout(value):
+  if isinstance(value, tuple):
+    return tuple(require_numpy_array_layout(x) for x in value)
+  else:
+    return np.require(value, requirements=['C', 'A'])
+
+
+class LocalComputation(object):
+  """Python wrapper for a local XLA Computation.
+
+  A LocalComputation can be executed if it is compiled. Otherwise, it
+  can still be used as a Computation where required by the
+  ComputationBuilder methods.
+  """
+
+  def __init__(self, c_local_computation, is_compiled):
+    self.c_local_computation = c_local_computation
+    self.is_compiled = is_compiled
+
+    # Ensure a reference to C-based destructor for use in __del__.
+    if is_compiled:
+      self._delete = c_api.DeleteCompiledLocalComputation
+    else:
+      self._delete = c_api.DeleteLocalComputation
+
+  def Compile(self, argument_shapes=()):
+    if self.is_compiled:
+      raise ValueError('Attempt to compile a compiled local XLA computation.')
+    return LocalComputation(
+        self.c_local_computation.Compile(_unwrap_shapes(argument_shapes)),
+        is_compiled=True)
+
+  def CompileWithExampleArguments(self, arguments=()):
+    return self.Compile(
+        argument_shapes=[Shape.from_numpy(arg) for arg in arguments])
+
+  def Execute(self, arguments=()):
+    if not self.is_compiled:
+      raise ValueError('Cannot execute an uncompiled local XLA computation.')
+    arguments = tuple(map(require_numpy_array_layout, arguments))
+    return self.c_local_computation.Execute(arguments)
+
+  def __del__(self):
+    self._delete(self.c_local_computation)
+
+
+class ComputationBuilder(object):
+  """XLA computation builder.
+
+  Enqueues XLA ops in sequence and in order to build a
+  LocalComputation, which in turn can be compiled into a
+  CompiledLocalComputation, which in turn can be locally executed.
+  """
+
+  # The methods of this class map 1-to-1 onto the XLA C++
+  # computation builder API. Therefore, there's no need to laboriously list
+  # arguments and return values for every method, especially where it's obvious.
+  #
+  # pylint: disable=g-doc-return-or-yield
+  # pylint: disable=g-doc-args
+
+  def __init__(self, name):
+    self._client = c_api.LocalComputationBuilder(name.encode('utf8'))
+    self._parameter_numbering = itertools.count()
+
+  def Build(self):
+    return LocalComputation(self._client.Build(), is_compiled=False)
+
+  def Constant(self, value):
+    """Enqueues a constant op onto the computation.
+
+    Args:
+      value: value for the constant, as a np.array with an explicit dtype set
+             to one of the supported types.
+
+    Returns:
+      A ComputationDataHandle message.
+    """
+    value = require_numpy_array_layout(value)
+    return _wrap_data_handle(self._client.ConstantLiteral(value))
+
+  def ConstantF32Scalar(self, value):
+    """Convenience method to enqueue a scalar F32 constant op.
+
+    Args:
+      value: a floating-point number.
+
+    Returns:
+      A ComputationDataHandle message.
+    """
+    return self.Constant(np.array(value, dtype=np.float32))
+
+  def ConstantF64Scalar(self, value):
+    """Convenience method to enqueue a scalar F32 constant op.
+
+    Args:
+      value: a floating-point number.
+
+    Returns:
+      A ComputationDataHandle message.
+    """
+    return self.Constant(np.array(value, dtype=np.float64))
+
+  def ConstantS32Scalar(self, value):
+    """Convenience method to enqueue a scalar S32 constant op.
+
+    Args:
+      value: a floating-point number.
+
+    Returns:
+      A ComputationDataHandle message.
+    """
+    return self.Constant(np.array(value, dtype=np.int32))
+
+  def ConstantS64Scalar(self, value):
+    """Convenience method to enqueue a scalar S64 constant op.
+
+    Args:
+      value: a floating-point number.
+
+    Returns:
+      A ComputationDataHandle message.
+    """
+    return self.Constant(np.array(value, dtype=np.int64))
+
+  def ConstantPredScalar(self, value):
+    """Convenience method to enqueue a scalar PRED constant op.
+
+    Args:
+      value: a boolean value.
+
+    Returns:
+      A ComputationDataHandle message.
+    """
+    return self.Constant(np.array(value, dtype=np.bool))
+
+  def ParameterWithShape(self, shape, name=None, parameter_num=None):
+    """Enqueues a Parameter op onto the computation, given a shape.
+
+    Args:
+      shape: the parameter's shape as a Shape object.
+      name: optional string name for the parameter.
+      parameter_num: parameter number in the computation function. If None,
+        the next linear parameter number is used. The default value capability
+        can be used for auto-numbering. If you're using auto-numbering for some
+        parameters, use it for *all* parameters to avoid clashes.
+
+    Returns:
+      A ComputationDataHandle message.
+    """
+    if name is None:
+      name = ''
+    if parameter_num is None:
+      parameter_num = next(self._parameter_numbering)
+
+    return _wrap_data_handle(
+        self._client.Parameter(
+            parameter_num, _unwrap_shape(shape), name.encode('utf8')))
+
+  def ParameterFromNumpy(self, value, name=None, parameter_num=None):
+    """Enqueues a Parameter op onto the computation.
+
+    Args:
+      value: a Numpy array, or a nested tuple thereof, from which the
+        shape is inferred.
+      name: as in ParameterWithShape.
+      parameter_num: as in ParameterWithShape.
+
+    Returns:
+      A ComputationDataHandle message.
+    """
+    return self.ParameterWithShape(
+        Shape.from_numpy(value), name=name, parameter_num=parameter_num)
+
+  def Broadcast(self, operand, sizes):
+    """Enqueues a broadcast operation onto the computation.
+
+    Args:
+      operand: the operand ComputationDataHandle to broadcast.
+      sizes: an iterable of broadcast sizes.
+
+    Returns:
+      A ComputationDataHandle representing the added broadcast op.
+    """
+    return _wrap_data_handle(
+        self._client.Broadcast(_unwrap_data_handle(operand), sizes))
+
+  def Concatenate(self, operands, dimension):
+    """Enqueues a concatenate operation onto the computation.
+
+    Args:
+      operands: the operands to concatenate.
+      dimension: the dimension in which to perform the concatenation.
+
+    Returns:
+      A ComputationDataHandle representing the added concatenate op.
+    """
+    return _wrap_data_handle(
+        self._client.ConcatInDim(_unwrap_data_handles(operands), dimension))
+
+  def ConvertElementType(self, operand, new_element_type):
+    """Enqueues an element type conversion operation onto the computation.
+
+    Args:
+      operand: the operand to convert.
+      new_element_type: the target primitive type.
+
+    Returns:
+      A ComputationDataHandle representing the added conversion op.
+    """
+    return _wrap_data_handle(
+        self._client.ConvertElementType(
+            _unwrap_data_handle(operand), new_element_type))
+
+  def GetShape(self, operand):
+    return _wrap_shape(self._client.GetShape(_unwrap_data_handle(operand)))
+
+  def GetComputationStats(self):
+    raise NotImplementedError()
+
+  def Reshape(self, operand, dimensions, new_sizes):
+    """Reshape op."""
+    return _wrap_data_handle(
+        self._client.Reshape(
+            _unwrap_data_handle(operand), dimensions, new_sizes))
+
+  def Trans(self, operand):
+    """Specialized matrix transpose op."""
+    return _wrap_data_handle(
+        self._client.Transpose(_unwrap_data_handle(operand), [1, 0]))
+
+  def Transpose(self, operand, permutation):
+    """Transpose op."""
+    return _wrap_data_handle(
+        self._client.Transpose(_unwrap_data_handle(operand), permutation))
+
+  def Select(self, pred, on_true, on_false):
+    """Element-wise selection op.
+
+    Constructs an output array from elements of two input arrays, based on the
+    values of a predicate array.
+    """
+    return _wrap_data_handle(
+        self._client.Select(
+            _unwrap_data_handle(pred),
+            _unwrap_data_handle(on_true),
+            _unwrap_data_handle(on_false)))
+
+  def Slice(self, operand, start_indices, limit_indices, strides=None):
+    """Enqueues a slice operation onto the computation.
+
+    Args:
+      operand: ComputationDataHandle for the N dimensional array to be sliced.
+      start_indices: iterable of N integers containing the starting indices of
+        the slice for each dimension.
+      limit_indices: iterable of N integers containing the ending indices
+        (exclusive) of the slice for each dimension.
+      strides: optional iterable of N integers containing the stride sizes for
+        each dimension.
+
+    Returns:
+      A ComputationDataHandle representing the added Slice op.
+    """
+    if strides is None:
+      start_indices = list(start_indices)
+      strides = [1] * len(start_indices)
+    return _wrap_data_handle(
+        self._client.Slice(
+            _unwrap_data_handle(operand),
+            start_indices,
+            limit_indices,
+            strides))
+
+  def DynamicSlice(self, operand, start_indices, slice_sizes):
+    """Enqueues a slice op with dynamic start indices onto the computation.
+
+    Args:
+      operand: ComputationDataHandle for the N dimensional array to be sliced.
+      start_indices: ComputationDataHandle for the 1D array of N integers
+        containing the starting indices of the slice.
+      slice_sizes: iterable of N integers containing the slice sizes in each
+        dimension.
+
+    Returns:
+      A ComputationDataHandle representing the added DynamicSlice op.
+    """
+    return _wrap_data_handle(
+        self._client.DynamicSlice(
+            _unwrap_data_handle(operand),
+            _unwrap_data_handle(start_indices),
+            slice_sizes))
+
+  def DynamicUpdateSlice(self, operand, update, start_indices):
+    """Enqueues a dynamic update slice operation onto the computation.
+
+    Args:
+      operand: ComputationDataHandle for the N dimensional array to be updated.
+      update: N dimensional array comprising the slice update.
+      start_indices: Rank-1 array of N integers comprising the starting indices
+        of the slice along each dimension.
+    Returns:
+      A ComputationDataHandle representing the added DynamicUpdateSlice op.
+    """
+    return _wrap_data_handle(
+        self._client.DynamicUpdateSlice(
+            _unwrap_data_handle(operand),
+            _unwrap_data_handle(update),
+            _unwrap_data_handle(start_indices)))
+
+  def Tuple(self, *ops):
+    """Enqueues a tuple operation onto the computation.
+
+    Args:
+      ops: a sequence of tuple operands (each a ComputationDataHandle).
+
+    Returns:
+      A ComputationDataHandle representing the added Tuple op.
+    """
+    return _wrap_data_handle(self._client.Tuple(_unwrap_data_handles(ops)))
+
+  def GetTupleElement(self, tup, index):
+    """Enqueues a 'get tuple element' operation onto the computation.
+
+    Args:
+      tup: the tuple operand (a ComputationDataHandle).
+      index: numeric index to select from the tuple.
+
+    Returns:
+      A ComputationDataHandle representing the added GetTupleElement op.
+    """
+    return _wrap_data_handle(
+        self._client.GetTupleElement(_unwrap_data_handle(tup), index))
+
+  def Call(self, computation_to_apply, operands):
+    """Enqueues a call operation onto the computation.
+
+    Args:
+      computation_to_apply: a Computation object.
+      operands: an iterable of ComputationDataHandle. The number and types of
+        operands must match the arity of computation_to_apply.
+
+    Returns:
+      A ComputationDataHandle representing the added call op.
+    """
+    return _wrap_data_handle(
+        self._client.Call(computation_to_apply.c_local_computation,
+                          _unwrap_data_handles(operands)))
+
+  def Map(self, operands, computation_to_apply, dimensions, static_operands=()):
+    """Enqueues a map operation onto the computation.
+
+    Args:
+      operands: an iterable of ComputationDataHandle.
+      computation_to_apply: a Computation object.
+      dimensions: dimensions over which to apply map the function.
+      static_operands: auxiliary arguments passed to the applied computation.
+
+    Returns:
+      A ComputationDataHandle representing the added Map op.
+    """
+    return _wrap_data_handle(
+        self._client.Map(
+            _unwrap_data_handles(operands),
+            computation_to_apply.c_local_computation,
+            dimensions,
+            _unwrap_data_handles(static_operands)))
+
+  def Reduce(self, operand, init_value, computation_to_apply, dimensions):
+    """Enqueues a reduction operation onto the computation.
+
+    Args:
+      operand: reduction operand (ComputationDataHandle).
+      init_value: reduction initial value (ComputationDataHandle).
+      computation_to_apply: a Computation object - binary reduction function.
+      dimensions: sequence of dimensions (integers) to reduce on.
+
+    Returns:
+      A ComputationDataHandle representing the added Reduce op.
+    """
+    return _wrap_data_handle(
+        self._client.Reduce(
+            _unwrap_data_handle(operand),
+            _unwrap_data_handle(init_value),
+            computation_to_apply.c_local_computation,
+            dimensions))
+
+  def While(self, cond, body, init):
+    """Enqueues a While operation onto the computation.
+
+    Args:
+      cond: a Computation for the loop condition, which has type T -> PRED
+      body: a Computation for the loop body, which has type T -> T
+      init: an ComputationDataHandle for the initial parameter, which has type T
+
+    Returns: a ComputationDataHandle representing the While operation.
+    """
+    return _wrap_data_handle(
+        self._client.While(cond.c_local_computation,
+                           body.c_local_computation,
+                           _unwrap_data_handle(init)))
+
+  def Dot(self, lhs, rhs):
+    """Matrix multiplication between lhs and rhs."""
+    return _wrap_data_handle(
+        self._client.Dot(_unwrap_data_handle(lhs), _unwrap_data_handle(rhs)))
+
+
+def _forward_methods_to_local_builder():
+  """Forward remaining ComputationBuilder methods to the C API.
+
+  Set up methods, corresponding to unary and binary XLA operations,
+  whose calls are forwarded in a boilerplate manner to the underlying
+  LocalComputationBuilder C-extension API.
+  """
+
+  def forward_to_local_builder_with_handles(target_method, is_binop=False):
+    """Generate a forwarding method that wraps/unwraps data handles."""
+
+    def forward(self, *args, **kwargs):
+      unwrapped_args = [_unwrap_data_handle(arg) for arg in args]
+
+      if is_binop and len(unwrapped_args) < 3:
+        unwrapped_args.append(kwargs.get('broadcast_dimensions', ()))
+
+      return _wrap_data_handle(
+          target_method(
+              self._client,  # pylint: disable=protected-access
+              *unwrapped_args))
+
+    return forward
+
+  for method_name in _UNARY_OPS:
+    forward = forward_to_local_builder_with_handles(
+        getattr(c_api.LocalComputationBuilder, method_name))
+    forward.__name__ = method_name
+    setattr(ComputationBuilder, method_name, forward)
+
+  for method_name in _BINARY_OPS:
+    forward = forward_to_local_builder_with_handles(
+        getattr(c_api.LocalComputationBuilder, method_name), is_binop=True)
+    forward.__name__ = method_name
+    setattr(ComputationBuilder, method_name, forward)
+
+
+_forward_methods_to_local_builder()
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..878cd83edcc4bffee6bcfe31fe6a4e2705edf401
--- /dev/null
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -0,0 +1,898 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the Python extension-based XLA client."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+import numpy as np
+
+from tensorflow.compiler.xla.python import xla_client
+import unittest
+
+
+class LocalComputationTest(unittest.TestCase):
+  """Base class for running an XLA Computation through the local client."""
+
+  def _NewComputation(self, name=None):
+    if name is None:
+      name = self.id()
+    return xla_client.ComputationBuilder(name)
+
+  def _ExecuteAndAssertWith(self, assert_func, c, arguments, expected):
+    assert expected is not None
+    compiled_c = c.Build().CompileWithExampleArguments(arguments)
+    result = compiled_c.Execute(arguments)
+    # Numpy's comparison methods are a bit too lenient by treating inputs as
+    # "array-like", meaning that scalar 4 will be happily compared equal to
+    # [[4]]. We'd like to be more strict so assert shapes as well.
+    self.assertEqual(np.asanyarray(result).shape, np.asanyarray(expected).shape)
+    assert_func(result, expected)
+
+  def _ExecuteAndCompareExact(self, c, arguments=(), expected=None):
+    self._ExecuteAndAssertWith(np.testing.assert_equal, c, arguments, expected)
+
+  def _ExecuteAndCompareClose(self, c, arguments=(), expected=None):
+    self._ExecuteAndAssertWith(np.testing.assert_allclose, c, arguments,
+                               expected)
+
+
+def NumpyArrayF32(*args, **kwargs):
+  """Convenience wrapper to create Numpy arrays with a np.float32 dtype."""
+  return np.array(*args, dtype=np.float32, **kwargs)
+
+
+def NumpyArrayF64(*args, **kwargs):
+  """Convenience wrapper to create Numpy arrays with a np.float64 dtype."""
+  return np.array(*args, dtype=np.float64, **kwargs)
+
+
+def NumpyArrayS32(*args, **kwargs):
+  """Convenience wrapper to create Numpy arrays with a np.int32 dtype."""
+  return np.array(*args, dtype=np.int32, **kwargs)
+
+
+def NumpyArrayS64(*args, **kwargs):
+  """Convenience wrapper to create Numpy arrays with a np.int64 dtype."""
+  return np.array(*args, dtype=np.int64, **kwargs)
+
+
+def NumpyArrayBool(*args, **kwargs):
+  """Convenience wrapper to create Numpy arrays with a np.bool dtype."""
+  return np.array(*args, dtype=np.bool, **kwargs)
+
+
+class ComputationsWithConstantsTest(LocalComputationTest):
+  """Tests focusing on Constant ops."""
+
+  def testConstantScalarSumF32(self):
+    c = self._NewComputation()
+    c.Add(c.ConstantF32Scalar(1.11), c.ConstantF32Scalar(3.14))
+    self._ExecuteAndCompareClose(c, expected=4.25)
+
+  def testConstantScalarSumF64(self):
+    c = self._NewComputation()
+    c.Add(c.ConstantF64Scalar(1.11), c.ConstantF64Scalar(3.14))
+    self._ExecuteAndCompareClose(c, expected=4.25)
+
+  def testConstantScalarSumS32(self):
+    c = self._NewComputation()
+    c.Add(c.ConstantS32Scalar(1), c.ConstantS32Scalar(2))
+    self._ExecuteAndCompareClose(c, expected=3)
+
+  def testConstantScalarSumS64(self):
+    c = self._NewComputation()
+    c.Add(c.ConstantS64Scalar(1), c.ConstantS64Scalar(2))
+    self._ExecuteAndCompareClose(c, expected=3)
+
+  def testConstantVectorMulF32(self):
+    c = self._NewComputation()
+    c.Mul(
+        c.Constant(NumpyArrayF32([2.5, 3.3, -1.2, 0.7])),
+        c.Constant(NumpyArrayF32([-1.2, 2, -2, -3])))
+    self._ExecuteAndCompareClose(c, expected=[-3, 6.6, 2.4, -2.1])
+
+  def testConstantVectorMulF64(self):
+    c = self._NewComputation()
+    c.Mul(
+        c.Constant(NumpyArrayF64([2.5, 3.3, -1.2, 0.7])),
+        c.Constant(NumpyArrayF64([-1.2, 2, -2, -3])))
+    self._ExecuteAndCompareClose(c, expected=[-3, 6.6, 2.4, -2.1])
+
+  def testConstantVectorScalarDivF32(self):
+    c = self._NewComputation()
+    c.Div(
+        c.Constant(NumpyArrayF32([1.5, 2.5, 3.0, -10.8])),
+        c.ConstantF32Scalar(2.0))
+    self._ExecuteAndCompareClose(c, expected=[0.75, 1.25, 1.5, -5.4])
+
+  def testConstantVectorScalarDivF64(self):
+    c = self._NewComputation()
+    c.Div(
+        c.Constant(NumpyArrayF64([1.5, 2.5, 3.0, -10.8])),
+        c.ConstantF64Scalar(2.0))
+    self._ExecuteAndCompareClose(c, expected=[0.75, 1.25, 1.5, -5.4])
+
+  def testConstantVectorScalarPowF32(self):
+    c = self._NewComputation()
+    c.Pow(c.Constant(NumpyArrayF32([1.5, 2.5, 3.0])), c.ConstantF32Scalar(2.))
+    self._ExecuteAndCompareClose(c, expected=[2.25, 6.25, 9.])
+
+  def testConstantVectorScalarPowF64(self):
+    c = self._NewComputation()
+    c.Pow(c.Constant(NumpyArrayF64([1.5, 2.5, 3.0])), c.ConstantF64Scalar(2.))
+    self._ExecuteAndCompareClose(c, expected=[2.25, 6.25, 9.])
+
+  def testBooleanAnd(self):
+    c = self._NewComputation()
+    c.And(
+        c.Constant(NumpyArrayBool([True, False, True, False])),
+        c.Constant(NumpyArrayBool([True, True, False, False])))
+    self._ExecuteAndCompareExact(c, expected=[True, False, False, False])
+
+  def testBooleanOr(self):
+    c = self._NewComputation()
+    c.Or(
+        c.Constant(NumpyArrayBool([True, False, True, False])),
+        c.Constant(NumpyArrayBool([True, True, False, False])))
+    self._ExecuteAndCompareExact(c, expected=[True, True, True, False])
+
+  def testSum2DF32(self):
+    c = self._NewComputation()
+    c.Add(
+        c.Constant(NumpyArrayF32([[1, 2, 3], [4, 5, 6]])),
+        c.Constant(NumpyArrayF32([[1, -1, 1], [-1, 1, -1]])))
+    self._ExecuteAndCompareClose(c, expected=[[2, 1, 4], [3, 6, 5]])
+
+  def testSum2DF64(self):
+    c = self._NewComputation()
+    c.Add(
+        c.Constant(NumpyArrayF64([[1, 2, 3], [4, 5, 6]])),
+        c.Constant(NumpyArrayF64([[1, -1, 1], [-1, 1, -1]])))
+    self._ExecuteAndCompareClose(c, expected=[[2, 1, 4], [3, 6, 5]])
+
+  def testSum2DWith1DBroadcastDim0F32(self):
+    # sum of a 2D array with a 1D array where the latter is replicated across
+    # dimension 0 to match the former's shape.
+    c = self._NewComputation()
+    c.Add(
+        c.Constant(NumpyArrayF32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])),
+        c.Constant(NumpyArrayF32([10, 20, 30])),
+        broadcast_dimensions=(0,))
+    self._ExecuteAndCompareClose(
+        c, expected=[[11, 12, 13], [24, 25, 26], [37, 38, 39]])
+
+  def testSum2DWith1DBroadcastDim0F64(self):
+    # sum of a 2D array with a 1D array where the latter is replicated across
+    # dimension 0 to match the former's shape.
+    c = self._NewComputation()
+    c.Add(
+        c.Constant(NumpyArrayF64([[1, 2, 3], [4, 5, 6], [7, 8, 9]])),
+        c.Constant(NumpyArrayF64([10, 20, 30])),
+        broadcast_dimensions=(0,))
+    self._ExecuteAndCompareClose(
+        c, expected=[[11, 12, 13], [24, 25, 26], [37, 38, 39]])
+
+  def testSum2DWith1DBroadcastDim1F32(self):
+    # sum of a 2D array with a 1D array where the latter is replicated across
+    # dimension 1 to match the former's shape.
+    c = self._NewComputation()
+    c.Add(
+        c.Constant(NumpyArrayF32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])),
+        c.Constant(NumpyArrayF32([10, 20, 30])),
+        broadcast_dimensions=(1,))
+    self._ExecuteAndCompareClose(
+        c, expected=[[11, 22, 33], [14, 25, 36], [17, 28, 39]])
+
+  def testSum2DWith1DBroadcastDim1F64(self):
+    # sum of a 2D array with a 1D array where the latter is replicated across
+    # dimension 1 to match the former's shape.
+    c = self._NewComputation()
+    c.Add(
+        c.Constant(NumpyArrayF64([[1, 2, 3], [4, 5, 6], [7, 8, 9]])),
+        c.Constant(NumpyArrayF64([10, 20, 30])),
+        broadcast_dimensions=(1,))
+    self._ExecuteAndCompareClose(
+        c, expected=[[11, 22, 33], [14, 25, 36], [17, 28, 39]])
+
+  def testConstantAxpyF32(self):
+    c = self._NewComputation()
+    c.Add(
+        c.Mul(
+            c.ConstantF32Scalar(2),
+            c.Constant(NumpyArrayF32([2.2, 3.3, 4.4, 5.5]))),
+        c.Constant(NumpyArrayF32([100, -100, 200, -200])))
+    self._ExecuteAndCompareClose(c, expected=[104.4, -93.4, 208.8, -189])
+
+  def testConstantAxpyF64(self):
+    c = self._NewComputation()
+    c.Add(
+        c.Mul(
+            c.ConstantF64Scalar(2),
+            c.Constant(NumpyArrayF64([2.2, 3.3, 4.4, 5.5]))),
+        c.Constant(NumpyArrayF64([100, -100, 200, -200])))
+    self._ExecuteAndCompareClose(c, expected=[104.4, -93.4, 208.8, -189])
+
+
+class ParametersTest(LocalComputationTest):
+  """Tests focusing on Parameter ops and argument-passing."""
+
+  def setUp(self):
+    self.f32_scalar_2 = NumpyArrayF32(2.0)
+    self.f32_4vector = NumpyArrayF32([-2.3, 3.3, -4.3, 5.3])
+    self.f64_scalar_2 = NumpyArrayF64(2.0)
+    self.f64_4vector = NumpyArrayF64([-2.3, 3.3, -4.3, 5.3])
+    self.s32_scalar_3 = NumpyArrayS32(3)
+    self.s32_4vector = NumpyArrayS32([10, 15, -2, 7])
+    self.s64_scalar_3 = NumpyArrayS64(3)
+    self.s64_4vector = NumpyArrayS64([10, 15, -2, 7])
+
+  def testScalarTimesVectorAutonumberF32(self):
+    c = self._NewComputation()
+    p0 = c.ParameterFromNumpy(self.f32_scalar_2)
+    p1 = c.ParameterFromNumpy(self.f32_4vector)
+    c.Mul(p0, p1)
+    self._ExecuteAndCompareClose(
+        c,
+        arguments=[self.f32_scalar_2, self.f32_4vector],
+        expected=[-4.6, 6.6, -8.6, 10.6])
+
+  def testScalarTimesVectorAutonumberF64(self):
+    c = self._NewComputation()
+    p0 = c.ParameterFromNumpy(self.f64_scalar_2)
+    p1 = c.ParameterFromNumpy(self.f64_4vector)
+    c.Mul(p0, p1)
+    self._ExecuteAndCompareClose(
+        c,
+        arguments=[self.f64_scalar_2, self.f64_4vector],
+        expected=[-4.6, 6.6, -8.6, 10.6])
+
+  def testScalarTimesVectorS32(self):
+    c = self._NewComputation()
+    p0 = c.ParameterFromNumpy(self.s32_scalar_3)
+    p1 = c.ParameterFromNumpy(self.s32_4vector)
+    c.Mul(p0, p1)
+    self._ExecuteAndCompareExact(
+        c,
+        arguments=[self.s32_scalar_3, self.s32_4vector],
+        expected=[30, 45, -6, 21])
+
+  def testScalarTimesVectorS64(self):
+    c = self._NewComputation()
+    p0 = c.ParameterFromNumpy(self.s64_scalar_3)
+    p1 = c.ParameterFromNumpy(self.s64_4vector)
+    c.Mul(p0, p1)
+    self._ExecuteAndCompareExact(
+        c,
+        arguments=[self.s64_scalar_3, self.s64_4vector],
+        expected=[30, 45, -6, 21])
+
+  def testScalarMinusVectorExplicitNumberingF32(self):
+    # Use explicit numbering and pass parameter_num first. Sub is used since
+    # it's not commutative and can help catch parameter reversal within the
+    # computation.
+    c = self._NewComputation()
+    p1 = c.ParameterFromNumpy(self.f32_4vector, parameter_num=1)
+    p0 = c.ParameterFromNumpy(self.f32_scalar_2, parameter_num=0)
+    c.Sub(p1, p0)
+    self._ExecuteAndCompareClose(
+        c,
+        arguments=[self.f32_scalar_2, self.f32_4vector],
+        expected=[-4.3, 1.3, -6.3, 3.3])
+
+  def testScalarMinusVectorExplicitNumberingF64(self):
+    # Use explicit numbering and pass parameter_num first. Sub is used since
+    # it's not commutative and can help catch parameter reversal within the
+    # computation.
+    c = self._NewComputation()
+    p1 = c.ParameterFromNumpy(self.f64_4vector, parameter_num=1)
+    p0 = c.ParameterFromNumpy(self.f64_scalar_2, parameter_num=0)
+    c.Sub(p1, p0)
+    self._ExecuteAndCompareClose(
+        c,
+        arguments=[self.f64_scalar_2, self.f64_4vector],
+        expected=[-4.3, 1.3, -6.3, 3.3])
+
+
+class SingleOpTest(LocalComputationTest):
+  """Tests for single ops.
+
+  The goal here is smoke testing - to exercise the most basic functionality of
+  single XLA ops. As minimal as possible number of additional ops are added
+  around the op being tested.
+  """
+
+  def testConcatenateF32(self):
+    c = self._NewComputation()
+    c.Concatenate(
+        (c.Constant(NumpyArrayF32([1.0, 2.0, 3.0])),
+         c.Constant(NumpyArrayF32([4.0, 5.0, 6.0]))),
+        dimension=0)
+    self._ExecuteAndCompareClose(c, expected=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
+
+  def testConcatenateF64(self):
+    c = self._NewComputation()
+    c.Concatenate(
+        (c.Constant(NumpyArrayF64([1.0, 2.0, 3.0])),
+         c.Constant(NumpyArrayF64([4.0, 5.0, 6.0]))),
+        dimension=0)
+    self._ExecuteAndCompareClose(c, expected=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
+
+  def testConvertElementType(self):
+    xla_types = {
+        np.bool: xla_client.xla_data_pb2.PRED,
+        np.int32: xla_client.xla_data_pb2.S32,
+        np.int64: xla_client.xla_data_pb2.S64,
+        np.float32: xla_client.xla_data_pb2.F32,
+        np.float64: xla_client.xla_data_pb2.F64,
+    }
+
+    def _ConvertAndTest(template, src_dtype, dst_dtype):
+      c = self._NewComputation()
+      x = c.Constant(np.array(template, dtype=src_dtype))
+      c.ConvertElementType(x, xla_types[dst_dtype])
+
+      result = c.Build().Compile().Execute()
+      expected = np.array(template, dtype=dst_dtype)
+
+      self.assertEqual(result.shape, expected.shape)
+      self.assertEqual(result.dtype, expected.dtype)
+      np.testing.assert_equal(result, expected)
+
+    x = [0, 1, 0, 0, 1]
+    for src_dtype, dst_dtype in itertools.product(xla_types, xla_types):
+      _ConvertAndTest(x, src_dtype, dst_dtype)
+
+  def testDotMatrixVectorF32(self):
+    c = self._NewComputation()
+    lhs = NumpyArrayF32([[2.0, 3.0], [4.0, 5.0]])
+    rhs = NumpyArrayF32([[10.0], [20.0]])
+    c.Dot(c.Constant(lhs), c.Constant(rhs))
+    self._ExecuteAndCompareClose(c, expected=np.dot(lhs, rhs))
+
+  def testDotMatrixVectorF64(self):
+    c = self._NewComputation()
+    lhs = NumpyArrayF64([[2.0, 3.0], [4.0, 5.0]])
+    rhs = NumpyArrayF64([[10.0], [20.0]])
+    c.Dot(c.Constant(lhs), c.Constant(rhs))
+    self._ExecuteAndCompareClose(c, expected=np.dot(lhs, rhs))
+
+  def testDotMatrixMatrixF32(self):
+    c = self._NewComputation()
+    lhs = NumpyArrayF32([[2.0, 3.0], [4.0, 5.0]])
+    rhs = NumpyArrayF32([[10.0, 20.0], [100.0, 200.0]])
+    c.Dot(c.Constant(lhs), c.Constant(rhs))
+    self._ExecuteAndCompareClose(c, expected=np.dot(lhs, rhs))
+
+  def testDotMatrixMatrixF64(self):
+    c = self._NewComputation()
+    lhs = NumpyArrayF64([[2.0, 3.0], [4.0, 5.0]])
+    rhs = NumpyArrayF64([[10.0, 20.0], [100.0, 200.0]])
+    c.Dot(c.Constant(lhs), c.Constant(rhs))
+    self._ExecuteAndCompareClose(c, expected=np.dot(lhs, rhs))
+
+  def testBooleanNot(self):
+    c = self._NewComputation()
+    arr = NumpyArrayBool([True, False, True])
+    c.Not(c.Constant(arr))
+    self._ExecuteAndCompareClose(c, expected=~arr)
+
+  def testExp(self):
+    c = self._NewComputation()
+    arr = NumpyArrayF32([3.3, 12.1])
+    c.Exp(c.Constant(arr))
+    self._ExecuteAndCompareClose(c, expected=np.exp(arr))
+
+  def testLog(self):
+    c = self._NewComputation()
+    arr = NumpyArrayF32([3.3, 12.1])
+    c.Log(c.Constant(arr))
+    self._ExecuteAndCompareClose(c, expected=np.log(arr))
+
+  def testNeg(self):
+    c = self._NewComputation()
+    arr = NumpyArrayF32([3.3, 12.1])
+    c.Neg(c.Constant(arr))
+    self._ExecuteAndCompareClose(c, expected=-arr)
+
+  def testFloor(self):
+    c = self._NewComputation()
+    arr = NumpyArrayF32([3.3, 12.1])
+    c.Floor(c.Constant(arr))
+    self._ExecuteAndCompareClose(c, expected=np.floor(arr))
+
+  def testCeil(self):
+    c = self._NewComputation()
+    arr = NumpyArrayF32([3.3, 12.1])
+    c.Ceil(c.Constant(arr))
+    self._ExecuteAndCompareClose(c, expected=np.ceil(arr))
+
+  def testAbs(self):
+    c = self._NewComputation()
+    arr = NumpyArrayF32([3.3, -12.1, 2.4, -1.])
+    c.Abs(c.Constant(arr))
+    self._ExecuteAndCompareClose(c, expected=np.abs(arr))
+
+  def testTanh(self):
+    c = self._NewComputation()
+    arr = NumpyArrayF32([3.3, 12.1])
+    c.Tanh(c.Constant(arr))
+    self._ExecuteAndCompareClose(c, expected=np.tanh(arr))
+
+  def testTrans(self):
+
+    def _TransposeAndTest(array):
+      c = self._NewComputation()
+      c.Trans(c.Constant(array))
+      self._ExecuteAndCompareClose(c, expected=array.T)
+
+    # Test square and non-square matrices in both default (C) and F orders.
+    for array_fun in [NumpyArrayF32, NumpyArrayF64]:
+      _TransposeAndTest(array_fun([[1, 2, 3], [4, 5, 6]]))
+      _TransposeAndTest(array_fun([[1, 2, 3], [4, 5, 6]], order="F"))
+      _TransposeAndTest(array_fun([[1, 2], [4, 5]]))
+      _TransposeAndTest(array_fun([[1, 2], [4, 5]], order="F"))
+
+  def testTranspose(self):
+
+    def _TransposeAndTest(array, permutation):
+      c = self._NewComputation()
+      c.Transpose(c.Constant(array), permutation)
+      expected = np.transpose(array, permutation)
+      self._ExecuteAndCompareClose(c, expected=expected)
+
+    _TransposeAndTest(NumpyArrayF32([[1, 2, 3], [4, 5, 6]]), [0, 1])
+    _TransposeAndTest(NumpyArrayF32([[1, 2, 3], [4, 5, 6]]), [1, 0])
+    _TransposeAndTest(NumpyArrayF32([[1, 2], [4, 5]]), [0, 1])
+    _TransposeAndTest(NumpyArrayF32([[1, 2], [4, 5]]), [1, 0])
+
+    arr = np.random.RandomState(0).randn(2, 3, 4).astype(np.float32)
+    for permutation in itertools.permutations(range(arr.ndim)):
+      _TransposeAndTest(arr, permutation)
+      _TransposeAndTest(np.asfortranarray(arr), permutation)
+
+  def testEq(self):
+    c = self._NewComputation()
+    c.Eq(
+        c.Constant(NumpyArrayS32([1, 2, 3, 4])),
+        c.Constant(NumpyArrayS32([4, 2, 3, 1])))
+    self._ExecuteAndCompareExact(c, expected=[False, True, True, False])
+
+  def testNe(self):
+    c = self._NewComputation()
+    c.Ne(
+        c.Constant(NumpyArrayS32([1, 2, 3, 4])),
+        c.Constant(NumpyArrayS32([4, 2, 3, 1])))
+    self._ExecuteAndCompareExact(c, expected=[True, False, False, True])
+
+    c.Ne(
+        c.Constant(NumpyArrayF32([-2.0, 0.0,
+                                  float("nan"),
+                                  float("nan")])),
+        c.Constant(NumpyArrayF32([2.0, -0.0, 1.0, float("nan")])))
+    self._ExecuteAndAssertWith(
+        np.testing.assert_allclose, c, (), expected=[True, False, True, True])
+
+  def testGt(self):
+    c = self._NewComputation()
+    c.Gt(
+        c.Constant(NumpyArrayS32([1, 2, 3, 4, 9])),
+        c.Constant(NumpyArrayS32([1, 0, 2, 7, 12])))
+    self._ExecuteAndCompareExact(c, expected=[False, True, True, False, False])
+
+  def testGe(self):
+    c = self._NewComputation()
+    c.Ge(
+        c.Constant(NumpyArrayS32([1, 2, 3, 4, 9])),
+        c.Constant(NumpyArrayS32([1, 0, 2, 7, 12])))
+    self._ExecuteAndCompareExact(c, expected=[True, True, True, False, False])
+
+  def testLt(self):
+    c = self._NewComputation()
+    c.Lt(
+        c.Constant(NumpyArrayS32([1, 2, 3, 4, 9])),
+        c.Constant(NumpyArrayS32([1, 0, 2, 7, 12])))
+    self._ExecuteAndCompareExact(c, expected=[False, False, False, True, True])
+
+  def testLe(self):
+    c = self._NewComputation()
+    c.Le(
+        c.Constant(NumpyArrayS32([1, 2, 3, 4, 9])),
+        c.Constant(NumpyArrayS32([1, 0, 2, 7, 12])))
+    self._ExecuteAndCompareExact(c, expected=[True, False, False, True, True])
+
+  def testMax(self):
+    c = self._NewComputation()
+    c.Max(
+        c.Constant(NumpyArrayF32([1.0, 2.0, 3.0, 4.0, 9.0])),
+        c.Constant(NumpyArrayF32([1.0, 0.0, 2.0, 7.0, 12.0])))
+    self._ExecuteAndCompareExact(c, expected=[1.0, 2.0, 3.0, 7.0, 12.0])
+
+  def testMaxExplicitBroadcastDim0(self):
+    c = self._NewComputation()
+    c.Max(
+        c.Constant(NumpyArrayF32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])),
+        c.Constant(NumpyArrayF32([3, 4, 5])),
+        broadcast_dimensions=(0,))
+    self._ExecuteAndCompareExact(c, expected=[[3, 3, 3], [4, 5, 6], [7, 8, 9]])
+
+  def testMaxExplicitBroadcastDim1(self):
+    c = self._NewComputation()
+    c.Max(
+        c.Constant(NumpyArrayF32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])),
+        c.Constant(NumpyArrayF32([3, 4, 5])),
+        broadcast_dimensions=(1,))
+    self._ExecuteAndCompareExact(c, expected=[[3, 4, 5], [4, 5, 6], [7, 8, 9]])
+
+  def testMin(self):
+    c = self._NewComputation()
+    c.Min(
+        c.Constant(NumpyArrayF32([1.0, 2.0, 3.0, 4.0, 9.0])),
+        c.Constant(NumpyArrayF32([1.0, 0.0, 2.0, 7.0, 12.0])))
+    self._ExecuteAndCompareExact(c, expected=[1.0, 0.0, 2.0, 4.0, 9.0])
+
+  def testReshape(self):
+    c = self._NewComputation()
+    c.Reshape(
+        c.Constant(NumpyArrayS32([[1, 2], [3, 4], [5, 6]])),
+        dimensions=[0, 1],
+        new_sizes=[2, 3])
+    self._ExecuteAndCompareExact(c, expected=[[1, 2, 3], [4, 5, 6]])
+
+  def testSelect(self):
+    c = self._NewComputation()
+    c.Select(
+        c.Constant(NumpyArrayBool([True, False, False, True, False])),
+        c.Constant(NumpyArrayS32([1, 2, 3, 4, 5])),
+        c.Constant(NumpyArrayS32([-1, -2, -3, -4, -5])))
+    self._ExecuteAndCompareExact(c, expected=[1, -2, -3, 4, -5])
+
+  def testSlice(self):
+    c = self._NewComputation()
+    c.Slice(
+        c.Constant(NumpyArrayS32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])), [1, 0],
+        [3, 2])
+    self._ExecuteAndCompareExact(c, expected=[[4, 5], [7, 8]])
+
+  def testDynamicSlice(self):
+    c = self._NewComputation()
+    c.DynamicSlice(
+        c.Constant(NumpyArrayS32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])),
+        c.Constant(NumpyArrayS32([1, 0])), [2, 2])
+    self._ExecuteAndCompareExact(c, expected=[[4, 5], [7, 8]])
+
+  def testDynamicUpdateSlice(self):
+    c = self._NewComputation()
+    c.DynamicUpdateSlice(
+        c.Constant(NumpyArrayS32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])),
+        c.Constant(NumpyArrayS32([[1, 2], [3, 4]])),
+        c.Constant(NumpyArrayS32([1, 1])))
+    self._ExecuteAndCompareExact(c, expected=[[1, 2, 3], [4, 1, 2], [7, 3, 4]])
+
+  def testTuple(self):
+    c = self._NewComputation()
+    c.Tuple(
+        c.ConstantS32Scalar(42), c.Constant(NumpyArrayF32([1.0, 2.0])),
+        c.Constant(NumpyArrayBool([True, False, False, True])))
+    result = c.Build().Compile().Execute()
+    self.assertIsInstance(result, tuple)
+    np.testing.assert_equal(result[0], 42)
+    np.testing.assert_allclose(result[1], [1.0, 2.0])
+    np.testing.assert_equal(result[2], [True, False, False, True])
+
+  def testGetTupleElement(self):
+    c = self._NewComputation()
+    c.GetTupleElement(
+        c.Tuple(
+            c.ConstantS32Scalar(42), c.Constant(NumpyArrayF32([1.0, 2.0])),
+            c.Constant(NumpyArrayBool([True, False, False, True]))), 1)
+    self._ExecuteAndCompareClose(c, expected=[1.0, 2.0])
+
+  def testBroadcast(self):
+    c = self._NewComputation()
+    c.Broadcast(c.Constant(NumpyArrayS32([10, 20, 30, 40])), sizes=(3,))
+    self._ExecuteAndCompareExact(
+        c, expected=[[10, 20, 30, 40], [10, 20, 30, 40], [10, 20, 30, 40]])
+
+
+class EmbeddedComputationsTest(LocalComputationTest):
+  """Tests for XLA graphs with embedded computations (such as maps)."""
+
+  def _CreateConstantS32Computation(self):
+    """Computation (f32) -> s32 that returns a constant 1 for any input."""
+    c = self._NewComputation("constant_s32_one")
+    # TODO(eliben): consider adding a nicer way to create new parameters without
+    # having to create dummy Numpy arrays or populating Shape messages. Perhaps
+    # we need our own (Python-client-own) way to represent Shapes conveniently.
+    c.ParameterFromNumpy(NumpyArrayF32(0))
+    c.ConstantS32Scalar(1)
+    return c.Build()
+
+  def _CreateConstantS64Computation(self):
+    """Computation (f64) -> s64 that returns a constant 1 for any input."""
+    c = self._NewComputation("constant_s64_one")
+    # TODO(eliben): consider adding a nicer way to create new parameters without
+    # having to create dummy Numpy arrays or populating Shape messages. Perhaps
+    # we need our own (Python-client-own) way to represent Shapes conveniently.
+    c.ParameterFromNumpy(NumpyArrayF64(0))
+    c.ConstantS64Scalar(1)
+    return c.Build()
+
+  def _CreateConstantF32Computation(self):
+    """Computation (f32) -> f32 that returns a constant 1.0 for any input."""
+    c = self._NewComputation("constant_f32_one")
+    c.ParameterFromNumpy(NumpyArrayF32(0))
+    c.ConstantF32Scalar(1.0)
+    return c.Build()
+
+  def _CreateConstantF64Computation(self):
+    """Computation (f64) -> f64 that returns a constant 1.0 for any input."""
+    c = self._NewComputation("constant_f64_one")
+    c.ParameterFromNumpy(NumpyArrayF64(0))
+    c.ConstantF64Scalar(1.0)
+    return c.Build()
+
+  def _CreateMulF32By2Computation(self):
+    """Computation (f32) -> f32 that multiplies its parameter by 2."""
+    c = self._NewComputation("mul_f32_by2")
+    c.Mul(c.ParameterFromNumpy(NumpyArrayF32(0)), c.ConstantF32Scalar(2.0))
+    return c.Build()
+
+  def _CreateMulF64By2Computation(self):
+    """Computation (f64) -> f64 that multiplies its parameter by 2."""
+    c = self._NewComputation("mul_f64_by2")
+    c.Mul(c.ParameterFromNumpy(NumpyArrayF64(0)), c.ConstantF64Scalar(2.0))
+    return c.Build()
+
+  def _CreateBinaryAddF32Computation(self):
+    """Computation (f32, f32) -> f32 that adds its two parameters."""
+    c = self._NewComputation("add_param0_by_param1")
+    c.Add(
+        c.ParameterFromNumpy(NumpyArrayF32(0)),
+        c.ParameterFromNumpy(NumpyArrayF32(0)))
+    return c.Build()
+
+  def _CreateBinaryAddF64Computation(self):
+    """Computation (f64, f64) -> f64 that adds its two parameters."""
+    c = self._NewComputation("add_param0_by_param1")
+    c.Add(
+        c.ParameterFromNumpy(NumpyArrayF64(0)),
+        c.ParameterFromNumpy(NumpyArrayF64(0)))
+    return c.Build()
+
+  def _CreateBinaryDivF32Computation(self):
+    """Computation (f32, f32) -> f32 that divides its two parameters."""
+    c = self._NewComputation("div_param0_by_param1")
+    c.Div(
+        c.ParameterFromNumpy(NumpyArrayF32(0)),
+        c.ParameterFromNumpy(NumpyArrayF32(0)))
+    return c.Build()
+
+  def _CreateBinaryDivF64Computation(self):
+    """Computation (f64, f64) -> f64 that divides its two parameters."""
+    c = self._NewComputation("div_param0_by_param1")
+    c.Div(
+        c.ParameterFromNumpy(NumpyArrayF64(0)),
+        c.ParameterFromNumpy(NumpyArrayF64(0)))
+    return c.Build()
+
+  def _CreateTestF32Lt10Computation(self):
+    """Computation (f32) -> bool that tests if its parameter is less than 10."""
+    c = self._NewComputation("test_f32_lt_10")
+    c.Lt(c.ParameterFromNumpy(NumpyArrayF32(0)), c.ConstantF32Scalar(10.))
+    return c.Build()
+
+  def _CreateTestF64Lt10Computation(self):
+    """Computation (f64) -> bool that tests if its parameter is less than 10."""
+    c = self._NewComputation("test_f64_lt_10")
+    c.Lt(c.ParameterFromNumpy(NumpyArrayF64(0)), c.ConstantF64Scalar(10.))
+    return c.Build()
+
+  def _MakeSample3DArrayF32(self):
+    return NumpyArrayF32([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]],
+                          [[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]])
+
+  def _MakeSample3DArrayF64(self):
+    return NumpyArrayF64([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]],
+                          [[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]])
+
+  def testCallF32(self):
+    c = self._NewComputation()
+    c.Call(
+        self._CreateMulF32By2Computation(),
+        operands=(c.ConstantF32Scalar(5.0),))
+    self._ExecuteAndCompareClose(c, expected=10.0)
+
+  def testCallF64(self):
+    c = self._NewComputation()
+    c.Call(
+        self._CreateMulF64By2Computation(),
+        operands=(c.ConstantF64Scalar(5.0),))
+    self._ExecuteAndCompareClose(c, expected=10.0)
+
+  def testMapEachElementToS32Constant(self):
+    c = self._NewComputation()
+    c.Map([c.Constant(NumpyArrayF32([1.0, 2.0, 3.0, 4.0]))],
+          self._CreateConstantS32Computation(), [0])
+    self._ExecuteAndCompareExact(c, expected=[1, 1, 1, 1])
+
+  def testMapEachElementToS64Constant(self):
+    c = self._NewComputation()
+    c.Map([c.Constant(NumpyArrayF64([1.0, 2.0, 3.0, 4.0]))],
+          self._CreateConstantS64Computation(), [0])
+    self._ExecuteAndCompareExact(c, expected=[1, 1, 1, 1])
+
+  def testMapMulBy2F32(self):
+    c = self._NewComputation()
+    c.Map([c.Constant(NumpyArrayF32([1.0, 2.0, 3.0, 4.0]))],
+          self._CreateMulF32By2Computation(), [0])
+    self._ExecuteAndCompareClose(c, expected=[2.0, 4.0, 6.0, 8.0])
+
+  def testMapMulBy2F64(self):
+    c = self._NewComputation()
+    c.Map([c.Constant(NumpyArrayF64([1.0, 2.0, 3.0, 4.0]))],
+          self._CreateMulF64By2Computation(), [0])
+    self._ExecuteAndCompareClose(c, expected=[2.0, 4.0, 6.0, 8.0])
+
+  def testSimpleMapChainF32(self):
+    # Chains a map of constant-f32 with a map of mul-by-2
+    c = self._NewComputation()
+    const_f32 = c.Map([c.Constant(NumpyArrayF32([1.0, 2.0, 3.0, 4.0]))],
+                      self._CreateConstantF32Computation(), [0])
+    c.Map([const_f32], self._CreateMulF32By2Computation(), [0])
+    self._ExecuteAndCompareClose(c, expected=[2.0, 2.0, 2.0, 2.0])
+
+  def testSimpleMapChainF64(self):
+    # Chains a map of constant-f64 with a map of mul-by-2
+    c = self._NewComputation()
+    const_f64 = c.Map([c.Constant(NumpyArrayF64([1.0, 2.0, 3.0, 4.0]))],
+                      self._CreateConstantF64Computation(), [0])
+    c.Map([const_f64], self._CreateMulF64By2Computation(), [0])
+    self._ExecuteAndCompareClose(c, expected=[2.0, 2.0, 2.0, 2.0])
+
+  def testDivVectorsWithMapF32(self):
+    c = self._NewComputation()
+    c.Map((c.Constant(NumpyArrayF32([1.0, 2.0, 3.0, 4.0])),
+           c.Constant(NumpyArrayF32([5.0, 5.0, 4.0, 4.0]))),
+          self._CreateBinaryDivF32Computation(), [0])
+    self._ExecuteAndCompareClose(c, expected=[0.2, 0.4, 0.75, 1.0])
+
+  def testDivVectorsWithMapF64(self):
+    c = self._NewComputation()
+    c.Map((c.Constant(NumpyArrayF64([1.0, 2.0, 3.0, 4.0])),
+           c.Constant(NumpyArrayF64([5.0, 5.0, 4.0, 4.0]))),
+          self._CreateBinaryDivF64Computation(), [0])
+    self._ExecuteAndCompareClose(c, expected=[0.2, 0.4, 0.75, 1.0])
+
+  def testReduce1DtoScalarF32(self):
+    c = self._NewComputation()
+    c.Reduce(
+        operand=c.Constant(NumpyArrayF32([1.0, 2.0, 3.0, 4.0])),
+        init_value=c.ConstantF32Scalar(0),
+        computation_to_apply=self._CreateBinaryAddF32Computation(),
+        dimensions=[0])
+    self._ExecuteAndCompareClose(c, expected=10)
+
+  def testReduce1DtoScalarF64(self):
+    c = self._NewComputation()
+    c.Reduce(
+        operand=c.Constant(NumpyArrayF64([1.0, 2.0, 3.0, 4.0])),
+        init_value=c.ConstantF64Scalar(0),
+        computation_to_apply=self._CreateBinaryAddF64Computation(),
+        dimensions=[0])
+    self._ExecuteAndCompareClose(c, expected=10)
+
+  def testReduce2DTo1DDim0F32(self):
+    input_array = NumpyArrayF32([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
+    c = self._NewComputation()
+    c.Reduce(
+        operand=c.Constant(input_array),
+        init_value=c.ConstantF32Scalar(0),
+        computation_to_apply=self._CreateBinaryAddF32Computation(),
+        dimensions=[0])
+    self._ExecuteAndCompareClose(c, expected=[5, 7, 9])
+
+  def testReduce2DTo1DDim0F64(self):
+    input_array = NumpyArrayF64([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
+    c = self._NewComputation()
+    c.Reduce(
+        operand=c.Constant(input_array),
+        init_value=c.ConstantF64Scalar(0),
+        computation_to_apply=self._CreateBinaryAddF64Computation(),
+        dimensions=[0])
+    self._ExecuteAndCompareClose(c, expected=[5, 7, 9])
+
+  def testReduce2DTo1DDim1F32(self):
+    input_array = NumpyArrayF32([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
+    c = self._NewComputation()
+    c.Reduce(
+        operand=c.Constant(input_array),
+        init_value=c.ConstantF32Scalar(0),
+        computation_to_apply=self._CreateBinaryAddF32Computation(),
+        dimensions=[1])
+    self._ExecuteAndCompareClose(c, expected=[6, 15])
+
+  def testReduce2DTo1DDim1F64(self):
+    input_array = NumpyArrayF64([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
+    c = self._NewComputation()
+    c.Reduce(
+        operand=c.Constant(input_array),
+        init_value=c.ConstantF64Scalar(0),
+        computation_to_apply=self._CreateBinaryAddF64Computation(),
+        dimensions=[1])
+    self._ExecuteAndCompareClose(c, expected=[6, 15])
+
+  def testReduce3DAllPossibleWaysF32(self):
+    input_array = self._MakeSample3DArrayF32()
+
+    def _ReduceAndTest(*dims):
+      c = self._NewComputation()
+      c.Reduce(
+          operand=c.Constant(input_array),
+          init_value=c.ConstantF32Scalar(0),
+          computation_to_apply=self._CreateBinaryAddF32Computation(),
+          dimensions=dims)
+      self._ExecuteAndCompareClose(
+          c, expected=np.sum(input_array, axis=tuple(dims)))
+
+    _ReduceAndTest(0)
+    _ReduceAndTest(0)
+    _ReduceAndTest(0, 1)
+    _ReduceAndTest(0, 2)
+    _ReduceAndTest(1, 2)
+    _ReduceAndTest(0, 1, 2)
+
+  def testReduce3DAllPossibleWaysF64(self):
+    input_array = self._MakeSample3DArrayF64()
+
+    def _ReduceAndTest(*dims):
+      c = self._NewComputation()
+      c.Reduce(
+          operand=c.Constant(input_array),
+          init_value=c.ConstantF64Scalar(0),
+          computation_to_apply=self._CreateBinaryAddF64Computation(),
+          dimensions=dims)
+      self._ExecuteAndCompareClose(
+          c, expected=np.sum(input_array, axis=tuple(dims)))
+
+    _ReduceAndTest(0)
+    _ReduceAndTest(0)
+    _ReduceAndTest(0, 1)
+    _ReduceAndTest(0, 2)
+    _ReduceAndTest(1, 2)
+    _ReduceAndTest(0, 1, 2)
+
+  def testWhileF32(self):
+    cond = self._CreateTestF32Lt10Computation()
+    body = self._CreateMulF32By2Computation()
+    c = self._NewComputation()
+    init = c.ConstantF32Scalar(1.)
+    c.While(cond, body, init)
+    self._ExecuteAndCompareClose(c, expected=16.)
+
+  def testWhileF64(self):
+    cond = self._CreateTestF64Lt10Computation()
+    body = self._CreateMulF64By2Computation()
+    c = self._NewComputation()
+    init = c.ConstantF64Scalar(1.)
+    c.While(cond, body, init)
+    self._ExecuteAndCompareClose(c, expected=16.)
+
+
+if __name__ == "__main__":
+  unittest.main()
diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc
index 90aa9720a1e18bad06842adeead46fc3120d01dd..0a155400159ef178e93c378ea22467c6e257b61d 100644
--- a/tensorflow/compiler/xla/reference_util.cc
+++ b/tensorflow/compiler/xla/reference_util.cc
@@ -102,7 +102,9 @@ ReferenceUtil::ConvArray3DGeneralDimensionsDilated(
     const Array3D<float>& lhs, const Array3D<float>& rhs, int64 kernel_stride,
     Padding padding, int64 lhs_dilation, int64 rhs_dilation,
     const ConvolutionDimensionNumbers& dnums) {
-  CHECK_EQ(dnums.spatial_dimensions_size(), 1);
+  CHECK_EQ(dnums.input_spatial_dimensions_size(), 1);
+  CHECK_EQ(dnums.kernel_spatial_dimensions_size(), 1);
+  CHECK_EQ(dnums.output_spatial_dimensions_size(), 1);
   // Reuse the code for Array4D-convolution by extending the 3D input into a 4D
   // array by adding a fourth dummy dimension of size 1 without stride, padding
   // and dilation.
@@ -120,8 +122,9 @@ ReferenceUtil::ConvArray3DGeneralDimensionsDilated(
       });
   // Add a second dummy spatial dimensions.
   ConvolutionDimensionNumbers dnums2d = dnums;
-  dnums2d.add_spatial_dimensions(3);
+  dnums2d.add_input_spatial_dimensions(3);
   dnums2d.add_kernel_spatial_dimensions(3);
+  dnums2d.add_output_spatial_dimensions(3);
   std::unique_ptr<Array4D<float>> convr4 = ConvArray4DGeneralDimensionsDilated(
       a4dlhs, a4drhs, {kernel_stride, 1}, padding, {lhs_dilation, 1},
       {rhs_dilation, 1}, dnums2d);
@@ -192,14 +195,26 @@ ReferenceUtil::ReduceWindow1DGeneric(
     const tensorflow::gtl::ArraySlice<int64>& window,
     const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding) {
   std::vector<int64> dim_lengths{static_cast<int64>(operand.size())};
-  auto padding_both = xla::MakePadding(dim_lengths, window, stride, padding);
+  return ReduceWindow1DGeneric(
+      operand, init, reduce_func, window, stride,
+      xla::MakePadding(dim_lengths, window, stride, padding));
+}
 
+/* static  */ std::unique_ptr<std::vector<float>>
+ReferenceUtil::ReduceWindow1DGeneric(
+    const tensorflow::gtl::ArraySlice<float>& operand, float init,
+    const std::function<float(float, float)>& reduce_func,
+    const tensorflow::gtl::ArraySlice<int64>& window,
+    const tensorflow::gtl::ArraySlice<int64>& stride,
+    const tensorflow::gtl::ArraySlice<std::pair<int64, int64>>& padding) {
+  std::vector<int64> dim_lengths{static_cast<int64>(operand.size())};
   std::vector<int64> window_counts(window.size(), 0);
   std::vector<int64> pad_low(window.size(), 0);
   for (int64 i = 0; i < window.size(); ++i) {
+    int64 padded_width = padding[i].first + dim_lengths[i] + padding[i].second;
     window_counts[i] =
-        WindowCount(dim_lengths[i], window[i], stride[i], padding);
-    pad_low[i] = padding_both[i].first;
+        window_util::StridedBound(padded_width, window[i], stride[i]);
+    pad_low[i] = padding[i].first;
   }
   auto result = MakeUnique<std::vector<float>>(window_counts[0]);
 
@@ -465,9 +480,9 @@ ReferenceUtil::ConvArray4DGeneralDimensionsDilated(
   }
 
   ordered_input_dimensions[0] =
-      lhs_literal->shape().dimensions(dnums.spatial_dimensions(0));
+      lhs_literal->shape().dimensions(dnums.input_spatial_dimensions(0));
   ordered_input_dimensions[1] =
-      lhs_literal->shape().dimensions(dnums.spatial_dimensions(1));
+      lhs_literal->shape().dimensions(dnums.input_spatial_dimensions(1));
   ordered_kernel_dimensions[0] =
       rhs_literal->shape().dimensions(dnums.kernel_spatial_dimensions(0));
   ordered_kernel_dimensions[1] =
@@ -517,7 +532,7 @@ ReferenceUtil::ConvArray4DGeneralDimensionsDilated(
 
   HloEvaluator evaluator;
   std::unique_ptr<Literal> result_literal =
-      evaluator.Evaluate(*computation, {}).ConsumeValueOrDie();
+      evaluator.Evaluate<const Literal*>(*computation, {}).ConsumeValueOrDie();
 
   CHECK_EQ(ShapeUtil::Rank(result_literal->shape()), 4);
   auto result =
@@ -703,137 +718,4 @@ ReferenceUtil::ReduceToRowArray2D(
   return result;
 }
 
-/* static */ std::unique_ptr<Array2D<float>> ReferenceUtil::PadArray2D(
-    const Array2D<float>& operand, const PaddingConfig& padding,
-    const float pad) {
-  int64 in0 = operand.n1();
-  int64 high_padding0 = padding.dimensions(0).edge_padding_high();
-  int64 low_padding0 = padding.dimensions(0).edge_padding_low();
-  int64 interior_padding0 = padding.dimensions(0).interior_padding();
-  int64 out0 =
-      in0 + low_padding0 + high_padding0 + (in0 - 1) * interior_padding0;
-
-  int64 in1 = operand.n2();
-  int64 high_padding1 = padding.dimensions(1).edge_padding_high();
-  int64 low_padding1 = padding.dimensions(1).edge_padding_low();
-  int64 interior_padding1 = padding.dimensions(1).interior_padding();
-  int64 out1 =
-      in1 + low_padding1 + high_padding1 + (in1 - 1) * interior_padding1;
-
-  auto result = MakeUnique<Array2D<float>>(out0, out1);
-  result->Fill(pad);
-  int64 o0 = low_padding0;
-  for (int64 i0 = 0; i0 < in0; ++i0) {
-    int64 o1 = low_padding1;
-    for (int64 i1 = 0; i1 < in1; ++i1) {
-      if (o0 >= 0 && o1 >= 0 && o0 < out0 && o1 < out1) {
-        (*result)(o0, o1) = operand(i0, i1);
-      }
-      o1 += interior_padding1 + 1;
-    }
-    o0 += interior_padding0 + 1;
-  }
-  return result;
-}
-
-/* static */ Array3D<float> ReferenceUtil::PadArray3D(
-    const Array3D<float>& operand, const PaddingConfig& padding,
-    const float pad) {
-  CHECK_EQ(padding.dimensions_size(), 3);
-
-  const std::vector<int64> input_bounds = {operand.n1(), operand.n2(),
-                                           operand.n3()};
-  std::vector<int64> pad_low(3);
-  std::vector<int64> pad_high(3);
-  std::vector<int64> pad_interior(3);
-  std::vector<int64> output_bounds(3);
-  for (int64 i = 0; i < 3; ++i) {
-    pad_low[i] = padding.dimensions(i).edge_padding_low();
-    pad_high[i] = padding.dimensions(i).edge_padding_high();
-    CHECK_LE(0, pad_low[i]);
-    CHECK_LE(0, pad_high[i]);
-    CHECK_LE(0, padding.dimensions(i).interior_padding()) << "not implemented";
-    pad_interior[i] = padding.dimensions(i).interior_padding();
-
-    output_bounds[i] = pad_low[i] + input_bounds[i] + pad_high[i] +
-                       (input_bounds[i] - 1) * pad_interior[i];
-  }
-
-  Array3D<float> result(output_bounds[0], output_bounds[1], output_bounds[2]);
-  std::vector<int> indices = {0, 0, 0};
-  for (indices[0] = 0; indices[0] < output_bounds[0]; ++indices[0]) {
-    for (indices[1] = 0; indices[1] < output_bounds[1]; ++indices[1]) {
-      for (indices[2] = 0; indices[2] < output_bounds[2]; ++indices[2]) {
-        float* value = &result(indices[0], indices[1], indices[2]);
-        bool value_padded = false;
-        for (int i = 0; i < 3; ++i) {
-          bool in_low_padding = indices[i] < pad_low[i];
-          bool in_high_padding = indices[i] >= output_bounds[i] - pad_high[i];
-          if (in_low_padding || in_high_padding) {
-            *value = pad;
-            value_padded = true;
-          }
-          if (pad_interior[i] &&
-              (indices[i] - pad_low[i]) % (pad_interior[i] + 1)) {
-            *value = pad;
-            value_padded = true;
-          }
-        }
-        if (value_padded) {
-          continue;
-        }
-        *value = operand((indices[0] - pad_low[0]) / (pad_interior[0] + 1),
-                         (indices[1] - pad_low[1]) / (pad_interior[1] + 1),
-                         (indices[2] - pad_low[2]) / (pad_interior[2] + 1));
-      }
-    }
-  }
-  return result;
-}
-
-/* static */ Array4D<float> ReferenceUtil::PadArray4D(
-    const Array4D<float>& operand, const PaddingConfig& padding,
-    const float pad) {
-  CHECK_EQ(padding.dimensions_size(), 4);
-
-  const std::vector<int64> input_bounds = {operand.n1(), operand.n2(),
-                                           operand.n3(), operand.n4()};
-  std::vector<int64> pad_low(4);
-  std::vector<int64> pad_high(4);
-  std::vector<int64> pad_interior(4);
-  std::vector<int64> output_bounds(4);
-  for (int64 i = 0; i < 4; ++i) {
-    pad_low[i] = padding.dimensions(i).edge_padding_low();
-    pad_high[i] = padding.dimensions(i).edge_padding_high();
-    CHECK_LE(0, padding.dimensions(i).interior_padding()) << "not implemented";
-    pad_interior[i] = padding.dimensions(i).interior_padding();
-
-    output_bounds[i] = pad_low[i] + input_bounds[i] + pad_high[i] +
-                       (input_bounds[i] - 1) * pad_interior[i];
-  }
-
-  Array4D<float> result(output_bounds[0], output_bounds[1], output_bounds[2],
-                        output_bounds[3]);
-  result.Each([&](tensorflow::gtl::ArraySlice<int64> indices, float* value) {
-    for (int i = 0; i < 4; ++i) {
-      bool in_low_padding = indices[i] < pad_low[i];
-      bool in_high_padding = indices[i] >= output_bounds[i] - pad_high[i];
-      if (in_low_padding || in_high_padding) {
-        *value = pad;
-        return;
-      }
-      if (pad_interior[i] &&
-          (indices[i] - pad_low[i]) % (pad_interior[i] + 1)) {
-        *value = pad;
-        return;
-      }
-    }
-    *value = operand((indices[0] - pad_low[0]) / (pad_interior[0] + 1),
-                     (indices[1] - pad_low[1]) / (pad_interior[1] + 1),
-                     (indices[2] - pad_low[2]) / (pad_interior[2] + 1),
-                     (indices[3] - pad_low[3]) / (pad_interior[3] + 1));
-  });
-  return result;
-}
-
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/reference_util.h b/tensorflow/compiler/xla/reference_util.h
index 2da17307817858eea60e868f4be1ab8138784385..58e1a844610678f64677838e93f0379b63f65d39 100644
--- a/tensorflow/compiler/xla/reference_util.h
+++ b/tensorflow/compiler/xla/reference_util.h
@@ -70,7 +70,7 @@ class ReferenceUtil {
   // dilation factors.
   static std::unique_ptr<Array4D<float>> ConvArray4DGeneralDimensionsDilated(
       const Array4D<float>& lhs, const Array4D<float>& rhs,
-      std::pair<int64, int64> stride, Padding padding,
+      std::pair<int64, int64> kernel_stride, Padding padding,
       std::pair<int64, int64> lhs_dilation,
       std::pair<int64, int64> rhs_dilation, ConvolutionDimensionNumbers dnums);
 
@@ -184,6 +184,12 @@ class ReferenceUtil {
       const std::function<float(float, float)>& reduce_func,
       const tensorflow::gtl::ArraySlice<int64>& window,
       const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding);
+  static std::unique_ptr<std::vector<float>> ReduceWindow1DGeneric(
+      const tensorflow::gtl::ArraySlice<float>& operand, float init,
+      const std::function<float(float, float)>& reduce_func,
+      const tensorflow::gtl::ArraySlice<int64>& window,
+      const tensorflow::gtl::ArraySlice<int64>& stride,
+      const tensorflow::gtl::ArraySlice<std::pair<int64, int64>>& padding);
   static std::unique_ptr<Array4D<float>> ReduceWindow4DGeneric(
       const Array4D<float>& operand, float init,
       const std::function<float(float, float)>& reduce_func,
@@ -486,19 +492,147 @@ class ReferenceUtil {
   }
 
   // Returns the result of a 2D pad on an input matrix.
-  static std::unique_ptr<Array2D<float>> PadArray2D(
-      const Array2D<float>& operand, const PaddingConfig& padding,
-      const float pad);
+  template <typename NativeT>
+  static std::unique_ptr<Array2D<NativeT>> PadArray2D(
+      const Array2D<NativeT>& operand, const PaddingConfig& padding,
+      const NativeT pad) {
+    int64 in0 = operand.n1();
+    int64 high_padding0 = padding.dimensions(0).edge_padding_high();
+    int64 low_padding0 = padding.dimensions(0).edge_padding_low();
+    int64 interior_padding0 = padding.dimensions(0).interior_padding();
+    int64 out0 =
+        in0 + low_padding0 + high_padding0 + (in0 - 1) * interior_padding0;
+
+    int64 in1 = operand.n2();
+    int64 high_padding1 = padding.dimensions(1).edge_padding_high();
+    int64 low_padding1 = padding.dimensions(1).edge_padding_low();
+    int64 interior_padding1 = padding.dimensions(1).interior_padding();
+    int64 out1 =
+        in1 + low_padding1 + high_padding1 + (in1 - 1) * interior_padding1;
+
+    auto result = MakeUnique<Array2D<NativeT>>(out0, out1);
+    result->Fill(pad);
+    int64 o0 = low_padding0;
+    for (int64 i0 = 0; i0 < in0; ++i0) {
+      int64 o1 = low_padding1;
+      for (int64 i1 = 0; i1 < in1; ++i1) {
+        if (o0 >= 0 && o1 >= 0 && o0 < out0 && o1 < out1) {
+          (*result)(o0, o1) = operand(i0, i1);
+        }
+        o1 += interior_padding1 + 1;
+      }
+      o0 += interior_padding0 + 1;
+    }
+    return result;
+  }
 
   // Returns the result of a 3D pad on an input matrix.
-  static Array3D<float> PadArray3D(const Array3D<float>& operand,
-                                   const PaddingConfig& padding,
-                                   const float pad);
+  template <typename NativeT>
+  static Array3D<NativeT> PadArray3D(const Array3D<NativeT>& operand,
+                                     const PaddingConfig& padding,
+                                     const NativeT pad) {
+    CHECK_EQ(padding.dimensions_size(), 3);
+
+    const std::vector<int64> input_bounds = {operand.n1(), operand.n2(),
+                                             operand.n3()};
+    std::vector<int64> pad_low(3);
+    std::vector<int64> pad_high(3);
+    std::vector<int64> pad_interior(3);
+    std::vector<int64> output_bounds(3);
+    for (int64 i = 0; i < 3; ++i) {
+      pad_low[i] = padding.dimensions(i).edge_padding_low();
+      pad_high[i] = padding.dimensions(i).edge_padding_high();
+      CHECK_LE(0, pad_low[i]);
+      CHECK_LE(0, pad_high[i]);
+      CHECK_LE(0, padding.dimensions(i).interior_padding())
+          << "not implemented";
+      pad_interior[i] = padding.dimensions(i).interior_padding();
+
+      output_bounds[i] = pad_low[i] + input_bounds[i] + pad_high[i] +
+                         (input_bounds[i] - 1) * pad_interior[i];
+    }
+
+    Array3D<NativeT> result(output_bounds[0], output_bounds[1],
+                            output_bounds[2]);
+    std::vector<int> indices = {0, 0, 0};
+    for (indices[0] = 0; indices[0] < output_bounds[0]; ++indices[0]) {
+      for (indices[1] = 0; indices[1] < output_bounds[1]; ++indices[1]) {
+        for (indices[2] = 0; indices[2] < output_bounds[2]; ++indices[2]) {
+          NativeT* value = &result(indices[0], indices[1], indices[2]);
+          bool value_padded = false;
+          for (int i = 0; i < 3; ++i) {
+            bool in_low_padding = indices[i] < pad_low[i];
+            bool in_high_padding = indices[i] >= output_bounds[i] - pad_high[i];
+            if (in_low_padding || in_high_padding) {
+              *value = pad;
+              value_padded = true;
+            }
+            if (pad_interior[i] &&
+                (indices[i] - pad_low[i]) % (pad_interior[i] + 1)) {
+              *value = pad;
+              value_padded = true;
+            }
+          }
+          if (value_padded) {
+            continue;
+          }
+          *value = operand((indices[0] - pad_low[0]) / (pad_interior[0] + 1),
+                           (indices[1] - pad_low[1]) / (pad_interior[1] + 1),
+                           (indices[2] - pad_low[2]) / (pad_interior[2] + 1));
+        }
+      }
+    }
+    return result;
+  }
 
   // Returns the result of a 4D pad on an input array.
-  static Array4D<float> PadArray4D(const Array4D<float>& operand,
-                                   const PaddingConfig& padding,
-                                   const float pad);
+  template <typename NativeT>
+  static Array4D<NativeT> PadArray4D(const Array4D<NativeT>& operand,
+                                     const PaddingConfig& padding,
+                                     const NativeT pad) {
+    CHECK_EQ(padding.dimensions_size(), 4);
+
+    const std::vector<int64> input_bounds = {operand.n1(), operand.n2(),
+                                             operand.n3(), operand.n4()};
+    std::vector<int64> pad_low(4);
+    std::vector<int64> pad_high(4);
+    std::vector<int64> pad_interior(4);
+    std::vector<int64> output_bounds(4);
+    for (int64 i = 0; i < 4; ++i) {
+      pad_low[i] = padding.dimensions(i).edge_padding_low();
+      pad_high[i] = padding.dimensions(i).edge_padding_high();
+      CHECK_LE(0, padding.dimensions(i).interior_padding())
+          << "not implemented";
+      pad_interior[i] = padding.dimensions(i).interior_padding();
+
+      output_bounds[i] = pad_low[i] + input_bounds[i] + pad_high[i] +
+                         (input_bounds[i] - 1) * pad_interior[i];
+    }
+
+    Array4D<NativeT> result(output_bounds[0], output_bounds[1],
+                            output_bounds[2], output_bounds[3]);
+    result.Each(
+        [&](tensorflow::gtl::ArraySlice<int64> indices, NativeT* value) {
+          for (int i = 0; i < 4; ++i) {
+            bool in_low_padding = indices[i] < pad_low[i];
+            bool in_high_padding = indices[i] >= output_bounds[i] - pad_high[i];
+            if (in_low_padding || in_high_padding) {
+              *value = pad;
+              return;
+            }
+            if (pad_interior[i] &&
+                (indices[i] - pad_low[i]) % (pad_interior[i] + 1)) {
+              *value = pad;
+              return;
+            }
+          }
+          *value = operand((indices[0] - pad_low[0]) / (pad_interior[0] + 1),
+                           (indices[1] - pad_low[1]) / (pad_interior[1] + 1),
+                           (indices[2] - pad_low[2]) / (pad_interior[2] + 1),
+                           (indices[3] - pad_low[3]) / (pad_interior[3] + 1));
+        });
+    return result;
+  }
 
   // ApplyElementwise2D(f, x, y, ...) returns the Array2D formed by running
   // f(x[i], y[i], ...) for each array element in the Array2Ds x, y, ....
diff --git a/tensorflow/compiler/xla/reference_util_test.cc b/tensorflow/compiler/xla/reference_util_test.cc
index eb6a71242ffa1499876b90f14f8a60ffdbdd069c..846ccdc83df900e3afedb6ababe07ebb1bd68f41 100644
--- a/tensorflow/compiler/xla/reference_util_test.cc
+++ b/tensorflow/compiler/xla/reference_util_test.cc
@@ -60,7 +60,9 @@ TEST_F(ReferenceUtilTest, TransposeArray2D) {
 
 TEST_F(ReferenceUtilTest, MatmulArray2D) {
   Array2D<float> rhs({
-      {7.f, 8.f}, {9.f, 10.f}, {11.f, 12.f},
+      {7.f, 8.f},
+      {9.f, 10.f},
+      {11.f, 12.f},
   });
   auto result = ReferenceUtil::MatmulArray2D(*matrix_, rhs);
   auto actual_literal = Literal::CreateR2FromArray2D(*result);
@@ -326,8 +328,10 @@ TEST_F(ReferenceUtilTest, ConvGeneralDimensionsWithSamePadding) {
   dimension_numbers.set_input_feature_dimension(0);
   dimension_numbers.set_output_batch_dimension(2);
   dimension_numbers.set_output_feature_dimension(0);
-  dimension_numbers.add_spatial_dimensions(1);
-  dimension_numbers.add_spatial_dimensions(3);
+  dimension_numbers.add_input_spatial_dimensions(1);
+  dimension_numbers.add_output_spatial_dimensions(1);
+  dimension_numbers.add_input_spatial_dimensions(3);
+  dimension_numbers.add_output_spatial_dimensions(3);
   dimension_numbers.set_kernel_output_feature_dimension(0);
   dimension_numbers.set_kernel_input_feature_dimension(2);
   dimension_numbers.add_kernel_spatial_dimensions(1);
@@ -380,8 +384,10 @@ TEST_F(ReferenceUtilTest, ConvGeneralDimensionsWithValidPadding) {
   dimension_numbers.set_input_feature_dimension(0);
   dimension_numbers.set_output_batch_dimension(2);
   dimension_numbers.set_output_feature_dimension(0);
-  dimension_numbers.add_spatial_dimensions(1);
-  dimension_numbers.add_spatial_dimensions(3);
+  dimension_numbers.add_input_spatial_dimensions(1);
+  dimension_numbers.add_output_spatial_dimensions(1);
+  dimension_numbers.add_input_spatial_dimensions(3);
+  dimension_numbers.add_output_spatial_dimensions(3);
 
   dimension_numbers.set_kernel_output_feature_dimension(0);
   dimension_numbers.set_kernel_input_feature_dimension(2);
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index c4e5a7eaf34b4002c072cccf6d8e156f0a311a43..bbf6c128fb3b31657e97b608c56c27bc12045ac1 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -108,6 +108,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test",
@@ -115,6 +116,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/service:hlo_element_type_converter",
         "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
@@ -1009,9 +1011,9 @@ tf_cc_test(
 )
 
 cc_library(
-    name = "batchnorm_rewriter",
-    srcs = ["batchnorm_rewriter.cc"],
-    hdrs = ["batchnorm_rewriter.h"],
+    name = "batchnorm_expander",
+    srcs = ["batchnorm_expander.cc"],
+    hdrs = ["batchnorm_expander.h"],
     deps = [
         ":hlo",
         ":hlo_pass",
@@ -1029,11 +1031,11 @@ cc_library(
 )
 
 tf_cc_test(
-    name = "batchnorm_rewriter_test",
+    name = "batchnorm_expander_test",
     size = "small",
-    srcs = ["batchnorm_rewriter_test.cc"],
+    srcs = ["batchnorm_expander_test.cc"],
     deps = [
-        ":batchnorm_rewriter",
+        ":batchnorm_expander",
         ":hlo",
         ":hlo_matchers",
         ":hlo_pass",
@@ -1143,6 +1145,22 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "dot_decomposer",
+    srcs = ["dot_decomposer.cc"],
+    hdrs = ["dot_decomposer.h"],
+    deps = [
+        ":hlo",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "tuple_simplifier",
     srcs = ["tuple_simplifier.cc"],
@@ -1304,6 +1322,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
     ],
 )
 
@@ -1638,10 +1657,14 @@ cc_library(
     deps = [
         ":buffer_liveness",
         ":hlo",
+        ":hlo_alias_analysis",
+        ":hlo_dce",
+        ":hlo_graph_dumper",
+        ":hlo_ordering",
         ":hlo_pass",
         ":liveness_util",
         ":logical_buffer",
-        ":tuple_points_to_analysis",
+        ":tuple_simplifier",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
@@ -1656,15 +1679,17 @@ tf_cc_test(
     deps = [
         ":copy_insertion",
         ":hlo",
+        ":hlo_graph_dumper",
         ":hlo_matchers",
-        ":tuple_points_to_analysis",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
     ],
 )
 
@@ -1696,6 +1721,22 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "hlo_verifier_test",
+    srcs = ["hlo_verifier_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_verifier",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "hlo_rematerialization",
     srcs = ["hlo_rematerialization.cc"],
@@ -1882,6 +1923,22 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "hlo_element_type_converter",
+    srcs = ["hlo_element_type_converter.cc"],
+    hdrs = ["hlo_element_type_converter.h"],
+    deps = [
+        ":hlo",
+        ":hlo_evaluator",
+        ":hlo_pass",
+        ":hlo_query",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "device_memory_allocator",
     srcs = ["device_memory_allocator.cc"],
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index bc9a3ac43db08d1dcca72d4df8235fbe6d7f19cc..7dc09a8cbd295753570b6e554d4211335617509e 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -180,19 +180,46 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
   static bool Run(
       HloComputation* computation, bool is_layout_sensitive,
       AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback,
-      bool enable_dot_simplification, bool enable_conv_simplification);
+      bool enable_dot_strength_reduction, bool enable_conv_simplification);
 
  private:
   explicit AlgebraicSimplifierVisitor(
       HloComputation* computation, bool is_layout_sensitive,
       AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback,
-      bool enable_dot_simplification, bool enable_conv_simplification)
+      bool enable_dot_strength_reduction, bool enable_conv_simplification)
       : computation_(computation),
         is_layout_sensitive_(is_layout_sensitive),
         valid_bitcast_callback_(std::move(valid_bitcast_callback)),
-        enable_dot_simplification_(enable_dot_simplification),
+        enable_dot_strength_reduction_(enable_dot_strength_reduction),
         enable_conv_simplification_(enable_conv_simplification) {}
 
+  // Transforms Dots where at least one input is a vector or has a degenerate
+  // dimension and converts it into a multiply and reduce. This should enable
+  // more fusion than leaving the nodes as Dot operations.
+  StatusOr<bool> HandleDotStrengthReduction(HloInstruction* dot);
+
+  // Reshapes an instruction to rank 1 if it is not already rank 1.
+  HloInstruction* Flatten(HloInstruction* hlo) {
+    if (ShapeUtil::Rank(hlo->shape()) == 1) {
+      return hlo;
+    }
+    return computation_->AddInstruction(HloInstruction::CreateReshape(
+        ShapeUtil::MakeShape(hlo->shape().element_type(),
+                             {ShapeUtil::ElementsIn(hlo->shape())}),
+        hlo));
+  }
+
+  // Helper method to perform and add reduction in a single dimension.
+  HloInstruction* AddReduce(HloInstruction* hlo, int64 dim) {
+    HloInstruction* zero = computation_->AddInstruction(
+        HloInstruction::CreateConstant(Literal::CreateR0(0.0f)));
+    HloComputation* AddReduce_computation = CreateScalarBinaryComputation(
+        computation_->parent(), F32, HloOpcode::kAdd);
+    Shape shape = ShapeUtil::DeleteDimension(dim, hlo->shape());
+    return computation_->AddInstruction(HloInstruction::CreateReduce(
+        shape, hlo, zero, {dim}, AddReduce_computation));
+  }
+
   // Convenience method for replacing an instruction with a bitcast.
   void ReplaceWithBitcast(HloInstruction* instruction);
 
@@ -252,6 +279,11 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  StatusOr<HloInstruction*> OptimizeDotOfConcat(HloInstruction* dot);
+  StatusOr<HloInstruction*> OptimizeDotOfConcatHelper(
+      const Shape& dot_shape, HloInstruction* lhs, int64 lhs_contracting_dim,
+      HloInstruction* rhs, int64 rhs_contracting_dim, bool swapped);
+
   // Current HloComputation instance the AlgebraicSimplifierVisitor is
   // traversing.
   HloComputation* computation_;
@@ -265,8 +297,8 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
   // Callback used to determine if a bitcast is possible.
   AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback_;
 
-  // Disable dot simplication on platforms where it causes a slowdown.
-  bool enable_dot_simplification_;
+  // Disable dot strength reduction on platforms where it causes a slowdown.
+  bool enable_dot_strength_reduction_;
 
   // Disable convolution simplication on platforms where it causes a slowdown.
   bool enable_conv_simplification_;
@@ -275,10 +307,10 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
 bool AlgebraicSimplifierVisitor::Run(
     HloComputation* computation, bool is_layout_sensitive,
     AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback,
-    bool enable_dot_simplification, bool enable_conv_simplification) {
+    bool enable_dot_strength_reduction, bool enable_conv_simplification) {
   AlgebraicSimplifierVisitor visitor(
       computation, is_layout_sensitive, std::move(valid_bitcast_callback),
-      enable_dot_simplification, enable_conv_simplification);
+      enable_dot_strength_reduction, enable_conv_simplification);
   TF_CHECK_OK(computation->Accept(&visitor));
   return visitor.changed_;
 }
@@ -574,68 +606,72 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
   return Status::OK();
 }
 
-Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
-  auto lhs = dot->mutable_operand(0);
-  auto rhs = dot->mutable_operand(1);
-  if (!enable_dot_simplification_) {
-    return Status::OK();
-  }
-  // Only optimize F32 dot operations where the dot, rhs and lhs are rank 2 or
-  // below.
-  if (dot->shape().element_type() != F32 || ShapeUtil::Rank(lhs->shape()) > 2 ||
-      ShapeUtil::Rank(rhs->shape()) > 2 || ShapeUtil::Rank(dot->shape()) > 2) {
-    return Status::OK();
-  }
-
-  // Replace a zero element dot with a broadcast of the constant 0.
-  if (ShapeUtil::HasZeroElements(dot->shape()) ||
-      ShapeUtil::HasZeroElements(lhs->shape()) ||
-      ShapeUtil::HasZeroElements(rhs->shape())) {
-    auto zero = computation_->AddInstruction(
-        HloInstruction::CreateConstant(Literal::CreateR0(0.0f)));
-    return ReplaceWithNewInstruction(
-        dot, HloInstruction::CreateBroadcast(dot->shape(), zero, {}));
-  }
-
-  // Simplify dot(transpose(a), transpose(b)) to transpose(dot(b,a)).
-  if (lhs->IsRank2Transpose() && rhs->IsRank2Transpose()) {
-    auto new_dot = computation_->AddInstruction(HloInstruction::CreateBinary(
-        ShapeUtil::PermuteDimensions({1, 0}, dot->shape()), HloOpcode::kDot,
-        rhs->mutable_operand(0), lhs->mutable_operand(0)));
-    return ReplaceWithNewInstruction(
-        dot, HloInstruction::CreateTranspose(dot->shape(), new_dot, {1, 0}));
-  }
+StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
+    HloInstruction* dot) {
+  HloInstruction* lhs = dot->mutable_operand(0);
+  HloInstruction* rhs = dot->mutable_operand(1);
+  int64 lhs_collapsing_dim =
+      dot->dot_dimension_numbers().lhs_contracting_dimensions(0);
+  if (lhs->IsRank2Transpose()) {
+    lhs = lhs->mutable_operand(0);
+    lhs_collapsing_dim = 1 - lhs_collapsing_dim;
+  }
+  const int64 lhs_kept_dim = 1 - lhs_collapsing_dim;
+
+  int64 rhs_collapsing_dim =
+      dot->dot_dimension_numbers().rhs_contracting_dimensions(0);
+  if (rhs->IsRank2Transpose()) {
+    rhs = rhs->mutable_operand(0);
+    rhs_collapsing_dim = 1 - rhs_collapsing_dim;
+  }
+  const int64 rhs_kept_dim = 1 - rhs_collapsing_dim;
+
+  auto reshape_if_necessary = [&](HloInstruction* hlo) {
+    if (ShapeUtil::SameDimensions(hlo->shape(), dot->shape())) {
+      return hlo;
+    }
+    return computation_->AddInstruction(
+        HloInstruction::CreateReshape(dot->shape(), hlo));
+  };
 
-  // Simplify outer product into multiply with implicit broadcasting.
-  //
-  // A dot(a[M, 1], b[1, N]) = multiply(a [M,1], b [1, N])
-  if (ShapeUtil::Rank(rhs->shape()) == 2 && rhs->shape().dimensions(0) == 1) {
-    return ReplaceWithNewInstruction(
-        dot, HloInstruction::CreateBinary(dot->shape(), HloOpcode::kMultiply,
-                                          lhs, rhs));
-  }
+  auto broadcast_to_dim = [&](HloInstruction* hlo, const Shape& shape,
+                              int64 dim) {
+    return computation_->AddInstruction(
+        HloInstruction::CreateBroadcast(shape, hlo, {dim}));
+  };
 
-  // The following graph transformations take Dots where at least one input is a
-  // vector or has a degenerate dimension and converts it into a multiply and
-  // reduce. This should enable more fusion than leaving the nodes as Dot
-  // operations.
+  auto multiply = [&](HloInstruction* local_lhs, HloInstruction* local_rhs) {
+    return computation_->AddInstruction(HloInstruction::CreateBinary(
+        local_lhs->shape(), HloOpcode::kMultiply, local_lhs, local_rhs));
+  };
 
   // Strength reduce dot(a[K] , b[K]) =
   //  reshape(result.shape,
   //          reduce_sum(multiply(a, b), {0}))
   if (ShapeUtil::Rank(rhs->shape()) == 1 &&
       ShapeUtil::Rank(lhs->shape()) == 1) {
-    auto multiply = computation_->AddInstruction(HloInstruction::CreateBinary(
-        rhs->shape(), HloOpcode::kMultiply, lhs, rhs));
-    HloComputation* add_reduce_computation = CreateScalarBinaryComputation(
-        computation_->parent(), F32, HloOpcode::kAdd);
-    auto zero = computation_->AddInstruction(
-        HloInstruction::CreateConstant(Literal::CreateR0(0.0f)));
-    auto reduce = computation_->AddInstruction(HloInstruction::CreateReduce(
-        ShapeUtil::MakeShape(dot->shape().element_type(), {}), multiply, zero,
-        {0}, add_reduce_computation));
-    return ReplaceWithNewInstruction(
-        dot, HloInstruction::CreateReshape(dot->shape(), reduce));
+    TF_RETURN_IF_ERROR(
+        ReplaceInstruction(dot, reshape_if_necessary(AddReduce(
+                                    multiply(Flatten(lhs), Flatten(rhs)), 0))));
+    return true;
+  }
+
+  if (ShapeUtil::IsEffectiveScalar(rhs->shape()) &&
+      ShapeUtil::IsEffectiveScalar(lhs->shape())) {
+    TF_RETURN_IF_ERROR(ReplaceInstruction(
+        dot, reshape_if_necessary(multiply(Flatten(lhs), Flatten(rhs)))));
+    return true;
+  }
+
+  // Simplify outer product into multiply with implicit broadcasting.
+  //
+  // A dot(a[M, 1], b[1, N]) = multiply(a [M,1], b [1, N])
+  if (ShapeUtil::Rank(rhs->shape()) == 2 &&
+      rhs->shape().dimensions(rhs_collapsing_dim) == 1) {
+    TF_RETURN_IF_ERROR(ReplaceInstruction(
+        dot, multiply(broadcast_to_dim(Flatten(lhs), dot->shape(), 0),
+                      broadcast_to_dim(Flatten(rhs), dot->shape(), 1))));
+    return true;
   }
 
   // Strength reduce dot(a[1, K], b) =
@@ -646,35 +682,21 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
   //      )
   //    )
   if (ShapeUtil::Rank(lhs->shape()) == 1 ||
-      (ShapeUtil::Rank(lhs->shape()) == 2 && lhs->shape().dimensions(0) == 1)) {
-    auto new_lhs = computation_->AddInstruction(HloInstruction::CreateReshape(
-        ShapeUtil::MakeShape(lhs->shape().element_type(),
-                             {ShapeUtil::ElementsIn(lhs->shape())}),
-        lhs));
-    HloComputation* add_reduce_computation = CreateScalarBinaryComputation(
-        computation_->parent(), F32, HloOpcode::kAdd);
-    auto zero = computation_->AddInstruction(
-        HloInstruction::CreateConstant(Literal::CreateR0(0.0f)));
-    HloInstruction* reduce;
+      (ShapeUtil::Rank(lhs->shape()) == 2 &&
+       lhs->shape().dimensions(lhs_kept_dim) == 1)) {
     if (ShapeUtil::Rank(rhs->shape()) == 1) {
-      auto multiply = computation_->AddInstruction(HloInstruction::CreateBinary(
-          rhs->shape(), HloOpcode::kMultiply, new_lhs, rhs));
-      reduce = computation_->AddInstruction(HloInstruction::CreateReduce(
-          ShapeUtil::MakeShape(dot->shape().element_type(), {}), multiply, zero,
-          {0}, add_reduce_computation));
-    } else {
-      new_lhs = computation_->AddInstruction(
-          HloInstruction::CreateBroadcast(rhs->shape(), new_lhs, {0}));
-      auto multiply = computation_->AddInstruction(HloInstruction::CreateBinary(
-          rhs->shape(), HloOpcode::kMultiply, new_lhs, rhs));
-
-      reduce = computation_->AddInstruction(HloInstruction::CreateReduce(
-          ShapeUtil::MakeShape(dot->shape().element_type(),
-                               {rhs->shape().dimensions(1)}),
-          multiply, zero, {0}, add_reduce_computation));
+      TF_RETURN_IF_ERROR(ReplaceInstruction(
+          dot,
+          reshape_if_necessary(AddReduce(multiply(Flatten(lhs), rhs), 0))));
+      return true;
     }
-    return ReplaceWithNewInstruction(
-        dot, HloInstruction::CreateReshape(dot->shape(), reduce));
+    TF_RETURN_IF_ERROR(ReplaceInstruction(
+        dot, reshape_if_necessary(
+                 AddReduce(multiply(broadcast_to_dim(Flatten(lhs), rhs->shape(),
+                                                     rhs_collapsing_dim),
+                                    rhs),
+                           rhs_collapsing_dim))));
+    return true;
   }
 
   // Strength reduce dot(a, b[K, 1]) =
@@ -682,26 +704,208 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
   //    reduce_sum(multiply(a, broadcast(reshape([K],b), {1})), {0})
   //  )
   if (ShapeUtil::Rank(rhs->shape()) == 1 ||
-      (ShapeUtil::Rank(rhs->shape()) == 2 && rhs->shape().dimensions(1) == 1)) {
-    auto new_rhs = computation_->AddInstruction(HloInstruction::CreateReshape(
-        ShapeUtil::MakeShape(rhs->shape().element_type(),
-                             {ShapeUtil::ElementsIn(rhs->shape())}),
-        rhs));
-    new_rhs = computation_->AddInstruction(
-        HloInstruction::CreateBroadcast(lhs->shape(), new_rhs, {1}));
-    auto multiply = computation_->AddInstruction(HloInstruction::CreateBinary(
-        lhs->shape(), HloOpcode::kMultiply, lhs, new_rhs));
-    HloComputation* add_reduce_computation = CreateScalarBinaryComputation(
-        computation_->parent(), F32, HloOpcode::kAdd);
+      (ShapeUtil::Rank(rhs->shape()) == 2 &&
+       rhs->shape().dimensions(rhs_kept_dim) == 1)) {
+    TF_RETURN_IF_ERROR(ReplaceInstruction(
+        dot, reshape_if_necessary(AddReduce(
+                 multiply(lhs, broadcast_to_dim(Flatten(rhs), lhs->shape(),
+                                                lhs_collapsing_dim)),
+                 lhs_collapsing_dim))));
+    return true;
+  }
+  return false;
+}
+
+StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfConcat(
+    HloInstruction* dot) {
+  const DotDimensionNumbers& dnums = dot->dot_dimension_numbers();
+  if (dnums.lhs_contracting_dimensions_size() != 1 ||
+      dnums.lhs_batch_dimensions_size() != 0) {
+    return nullptr;
+  }
+
+  const int64 lhs_contracting_dim = dnums.lhs_contracting_dimensions(0);
+  const int64 rhs_contracting_dim = dnums.rhs_contracting_dimensions(0);
+  HloInstruction* lhs = dot->mutable_operand(0);
+  HloInstruction* rhs = dot->mutable_operand(1);
+
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * optimized_lhs_concat,
+      OptimizeDotOfConcatHelper(dot->shape(), lhs, lhs_contracting_dim, rhs,
+                                rhs_contracting_dim, /*swapped=*/false));
+  if (optimized_lhs_concat) {
+    return optimized_lhs_concat;
+  }
+
+  return OptimizeDotOfConcatHelper(dot->shape(), rhs, rhs_contracting_dim, lhs,
+                                   lhs_contracting_dim, /*swapped=*/true);
+}
+
+StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfConcatHelper(
+    const Shape& dot_shape, HloInstruction* lhs, int64 lhs_contracting_dim,
+    HloInstruction* rhs, int64 rhs_contracting_dim, bool swapped) {
+  bool can_optimize = lhs->opcode() == HloOpcode::kConcatenate &&
+                      lhs->concatenate_dimension() == lhs_contracting_dim &&
+                      rhs->opcode() == HloOpcode::kConstant;
+  if (!can_optimize) {
+    return nullptr;
+  }
+
+  // We're replacing this:
+  //
+  //   +-----+-----+-----+      +-------------------+
+  //   |     |     |     |      |                   |
+  //   |     |     |     |      |        R_0        |
+  //   |     |     |     |      |                   |
+  //   |     |     |     |      +-------------------+
+  //   |     |     |     |      |                   |
+  //   | L_0 | L_1 | L_2 |   *  |        R_1        |
+  //   |     |     |     |      |                   |
+  //   |     |     |     |      +-------------------+
+  //   |     |     |     |      |                   |
+  //   |     |     |     |      |        R_2        |
+  //   |     |     |     |      |                   |
+  //   +-----+-----+-----+      +-------------------+
+  //
+  // with this:
+  //
+  // [Sum over i]
+  //
+  //   +-----+     +-------------------+
+  //   |     |     |                   |
+  //   |     |  *  |        R_i        |
+  //   |     |     |                   |
+  //   |     |     +-------------------+
+  //   |     |
+  //   | L_i |
+  //   |     |
+  //   |     |
+  //   |     |
+  //   |     |
+  //   |     |
+  //   +-----+
+  //
+  // where the LHS is a concatenate operation (so we can "split" the LHS tensor
+  // for free) and the RHS is a constant tensor (and thus can be split at
+  // compile time).  In the future, we may also want to do this when both the
+  // LHS and the RHS are concatenate operations that line up along the dimension
+  // being contracted over.
+  //
+  // We should be able to generalize this transform to work on a non-constant
+  // RHS when/if we have in-place slices or support input-fusing slices into
+  // Dots.
+
+  // Dimension numbers for the new dot instructions we'll create (L_i * R_i in
+  // the diagram above).
+  DotDimensionNumbers new_dot_dnums;
+  new_dot_dnums.add_lhs_contracting_dimensions(swapped ? rhs_contracting_dim
+                                                       : lhs_contracting_dim);
+  new_dot_dnums.add_rhs_contracting_dimensions(swapped ? lhs_contracting_dim
+                                                       : rhs_contracting_dim);
+
+  // Here we use the MKN notation, where the contracted dimension has K
+  // elements and the two non-contracted dimensions have M and N elements.
+  HloInstruction* add_result = nullptr;
+  int64 rhs_contracting_dim_offset = 0;
+  int64 n = rhs->shape().dimensions(1 - rhs_contracting_dim);
+  for (HloInstruction* concat_op : lhs->operands()) {
+    int64 sub_k = concat_op->shape().dimensions(lhs_contracting_dim);
+    Shape rhs_slice_shape(rhs->shape());
+    rhs_slice_shape.set_dimensions(rhs_contracting_dim, sub_k);
+
+    std::array<int64, 2> start_indices;
+    start_indices[rhs_contracting_dim] = rhs_contracting_dim_offset;
+    start_indices[1 - rhs_contracting_dim] = 0;
+
+    std::array<int64, 2> limit_indices;
+    limit_indices[rhs_contracting_dim] = rhs_contracting_dim_offset + sub_k;
+    limit_indices[1 - rhs_contracting_dim] = n;
+
+    HloInstruction* rhs_slice =
+        computation_->AddInstruction(HloInstruction::CreateSlice(
+            rhs_slice_shape, rhs, /*start_indices=*/start_indices,
+            /*limit_indices=*/limit_indices, /*strides=*/{1, 1}));
+
+    // TODO(b/69062148): We can get rid of `swapped` once all backends support
+    // "non-canonical" contraction dimensions (that contracts dimension 1 of the
+    // LHS with dimension 0 of the RHS).  But for now we keep the same
+    // contraction dimensions as the incoming dot operation to ensure the new
+    // dot operations can be lowered.
+    HloInstruction *new_dot_lhs, *new_dot_rhs;
+    if (swapped) {
+      new_dot_lhs = rhs_slice;
+      new_dot_rhs = concat_op;
+    } else {
+      new_dot_lhs = concat_op;
+      new_dot_rhs = rhs_slice;
+    }
+
+    auto* new_dot = computation_->AddInstruction(HloInstruction::CreateDot(
+        dot_shape, new_dot_lhs, new_dot_rhs, new_dot_dnums));
+
+    if (add_result) {
+      add_result = computation_->AddInstruction(HloInstruction::CreateBinary(
+          dot_shape, HloOpcode::kAdd, add_result, new_dot));
+    } else {
+      add_result = new_dot;
+    }
+
+    rhs_contracting_dim_offset += sub_k;
+  }
+
+  return add_result;
+}
+
+Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
+  auto lhs = dot->mutable_operand(0);
+  auto rhs = dot->mutable_operand(1);
+
+  // Only optimize F32 dot operations where the dot, rhs and lhs are rank 2 or
+  // below.
+  if (dot->shape().element_type() != F32 || ShapeUtil::Rank(lhs->shape()) > 2 ||
+      ShapeUtil::Rank(rhs->shape()) > 2 || ShapeUtil::Rank(dot->shape()) > 2) {
+    return Status::OK();
+  }
+
+  // Replace a zero element dot with a broadcast of the constant 0.
+  if (ShapeUtil::HasZeroElements(dot->shape()) ||
+      ShapeUtil::HasZeroElements(lhs->shape()) ||
+      ShapeUtil::HasZeroElements(rhs->shape())) {
     auto zero = computation_->AddInstruction(
         HloInstruction::CreateConstant(Literal::CreateR0(0.0f)));
-    auto reduce = computation_->AddInstruction(HloInstruction::CreateReduce(
-        ShapeUtil::MakeShape(dot->shape().element_type(),
-                             {lhs->shape().dimensions(0)}),
-        multiply, zero, {1}, add_reduce_computation));
     return ReplaceWithNewInstruction(
-        dot, HloInstruction::CreateReshape(dot->shape(), reduce));
+        dot, HloInstruction::CreateBroadcast(dot->shape(), zero, {}));
   }
+
+  TF_ASSIGN_OR_RETURN(HloInstruction * dot_of_concat_optimized,
+                      OptimizeDotOfConcat(dot));
+  if (dot_of_concat_optimized) {
+    VLOG(10) << "Replaced dot(concat(...), constant) with add(dot(..., "
+                "constant)...)";
+    return ReplaceInstruction(dot, dot_of_concat_optimized);
+  }
+
+  if (enable_dot_strength_reduction_ && !is_layout_sensitive_) {
+    TF_ASSIGN_OR_RETURN(bool did_strength_reduction,
+                        HandleDotStrengthReduction(dot));
+    if (did_strength_reduction) {
+      return Status::OK();
+    }
+  }
+
+  // Simplify dot(transpose(a), transpose(b)) to transpose(dot(b,a)).
+  if (lhs->IsRank2Transpose() && rhs->IsRank2Transpose()) {
+    DotDimensionNumbers dot_dimension_numbers;
+    dot_dimension_numbers.add_lhs_contracting_dimensions(1);
+    dot_dimension_numbers.add_rhs_contracting_dimensions(0);
+    auto new_dot = computation_->AddInstruction(HloInstruction::CreateDot(
+        ShapeUtil::PermuteDimensions({1, 0}, dot->shape()),
+        rhs->mutable_operand(0), lhs->mutable_operand(0),
+        dot_dimension_numbers));
+    return ReplaceWithNewInstruction(
+        dot, HloInstruction::CreateTranspose(dot->shape(), new_dot, {1, 0}));
+  }
+
   return Status::OK();
 }
 
@@ -1108,10 +1312,37 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power) {
   if (IsAll(rhs, -1)) {
     auto* one = computation_->AddInstruction(HloInstruction::CreateConstant(
         Literal::One(rhs->shape().element_type()).CloneToUnique()));
+
+    // Explicitly broadcast scalar 1 to the output shape, to avoid implicit
+    // broadcast in divide HLO as we are trying to eliminate implicit
+    // broadcasting at HLO level.
+    auto* broadcast_one = computation_->AddInstruction(
+        HloInstruction::CreateBroadcast(power->shape(), one, {}));
     return ReplaceWithNewInstruction(
         power, HloInstruction::CreateBinary(power->shape(), HloOpcode::kDivide,
-                                            one, lhs));
+                                            broadcast_one, lhs));
   }
+
+  VLOG(10) << "trying transform [pow(pow(A, X), Y) => pow(A, X*Y)]: "
+           << power->ToString();
+
+  // Don't perform this optimization if either of the exponents is complex; this
+  // identity is true only for real-valued exponents.  In addition, we cowardly
+  // refuse to do this transformation if the two expontents have different
+  // element types.
+  if (lhs->opcode() == HloOpcode::kPower &&
+      !ShapeUtil::ElementIsComplex(lhs->operand(1)->shape()) &&
+      !ShapeUtil::ElementIsComplex(rhs->shape()) &&
+      ShapeUtil::SameElementType(lhs->operand(1)->shape(), rhs->shape())) {
+    auto exponent_product =
+        computation_->AddInstruction(HloInstruction::CreateBinary(
+            rhs->shape(), HloOpcode::kMultiply, lhs->mutable_operand(1), rhs));
+    return ReplaceWithNewInstruction(
+        power, HloInstruction::CreateBinary(power->shape(), HloOpcode::kPower,
+                                            lhs->mutable_operand(0),
+                                            exponent_product));
+  }
+
   return Status::OK();
 }
 
@@ -1165,7 +1396,7 @@ StatusOr<bool> AlgebraicSimplifierVisitor::
         ShapeUtil::MakeShapeWithLayout(
             user->shape().element_type(),
             AsInt64Slice(operand->shape().dimensions()),
-            AsInt64Slice(operand->shape().layout().minor_to_major())),
+            LayoutUtil::MinorToMajor(operand->shape())),
         new_user_operands));
     VLOG(4) << "  new user: " << new_user->ToString();
     HloInstruction* new_reshape_or_broadcast = nullptr;
@@ -1175,8 +1406,7 @@ StatusOr<bool> AlgebraicSimplifierVisitor::
               ShapeUtil::MakeShapeWithLayout(
                   user->shape().element_type(),
                   AsInt64Slice(reshape_or_broadcast->shape().dimensions()),
-                  AsInt64Slice(
-                      reshape_or_broadcast->shape().layout().minor_to_major())),
+                  LayoutUtil::MinorToMajor(reshape_or_broadcast->shape())),
               new_user));
     } else {
       TF_RET_CHECK(reshape_or_broadcast->opcode() == HloOpcode::kBroadcast);
@@ -1185,8 +1415,7 @@ StatusOr<bool> AlgebraicSimplifierVisitor::
               ShapeUtil::MakeShapeWithLayout(
                   user->shape().element_type(),
                   AsInt64Slice(reshape_or_broadcast->shape().dimensions()),
-                  AsInt64Slice(
-                      reshape_or_broadcast->shape().layout().minor_to_major())),
+                  LayoutUtil::MinorToMajor(reshape_or_broadcast->shape())),
               new_user, reshape_or_broadcast->dimensions()));
     }
     VLOG(4) << "  new reshape/broadcast: "
@@ -1398,6 +1627,15 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
   auto operand = reduce_window->mutable_operand(0);
   const Window& window = reduce_window->window();
   auto function = reduce_window->to_apply();
+  if (ShapeUtil::IsScalar(operand->shape())) {
+    TF_RET_CHECK(ShapeUtil::IsScalar(reduce_window->shape()));
+    return ReplaceWithNewInstruction(
+        reduce_window,
+        HloInstruction::CreateMap(reduce_window->shape(),
+                                  {operand, reduce_window->mutable_operand(1)},
+                                  function));
+  }
+
   VLOG(10) << "Considering folding Pad: " << operand->ToString()
            << "\ninto reduce-window: " << reduce_window->ToString();
 
@@ -1539,15 +1777,15 @@ Status AlgebraicSimplifierVisitor::HandleConvolution(
   // still convert Conv into more efficient Matmul with operand transposition
   // (such as the transposition flags in cuBLAS SGEMM).
   if (!LayoutUtil::Equal(input_shape.layout(), convolution_shape.layout()) ||
-      input_shape.layout().minor_to_major(0) !=
+      LayoutUtil::Minor(input_shape.layout(), 0) !=
           dnums.input_feature_dimension() ||
-      convolution_shape.layout().minor_to_major(0) !=
+      LayoutUtil::Minor(convolution_shape.layout(), 0) !=
           dnums.output_feature_dimension() ||
       // The input feature dimension should come later in the minor-to-major
       // order.
-      (PositionInContainer(filter_shape.layout().minor_to_major(),
+      (PositionInContainer(LayoutUtil::MinorToMajor(filter_shape),
                            dnums.kernel_input_feature_dimension()) <
-       PositionInContainer(filter_shape.layout().minor_to_major(),
+       PositionInContainer(LayoutUtil::MinorToMajor(filter_shape),
                            dnums.kernel_output_feature_dimension()))) {
     return Status::OK();
   }
@@ -1599,8 +1837,11 @@ Status AlgebraicSimplifierVisitor::HandleConvolution(
 
   auto new_lhs = add_bitcast(new_input_shape, lhs);
   auto new_rhs = add_bitcast(new_filter_shape, rhs);
-  auto dot = computation_->AddInstruction(HloInstruction::CreateBinary(
-      dot_output_shape, HloOpcode::kDot, new_lhs, new_rhs));
+  DotDimensionNumbers dot_dimension_numbers;
+  dot_dimension_numbers.add_lhs_contracting_dimensions(1);
+  dot_dimension_numbers.add_rhs_contracting_dimensions(0);
+  auto dot = computation_->AddInstruction(HloInstruction::CreateDot(
+      dot_output_shape, new_lhs, new_rhs, dot_dimension_numbers));
   return ReplaceInstruction(convolution, add_bitcast(convolution_shape, dot));
 }
 
@@ -1688,7 +1929,7 @@ StatusOr<bool> AlgebraicSimplifier::Run(HloModule* module) {
   for (auto* comp : module->MakeNonfusionComputations()) {
     if (AlgebraicSimplifierVisitor::Run(
             comp, is_layout_sensitive_, valid_bitcast_callback_,
-            enable_dot_simplification_, enable_conv_simplification_)) {
+            enable_dot_strength_reduction_, enable_conv_simplification_)) {
       changed = true;
     }
   }
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.h b/tensorflow/compiler/xla/service/algebraic_simplifier.h
index a9f476178c7af74c275a10de7727ea64e17d590f..43315f5cdc7afbe79039420320f4a0d0535e11f1 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.h
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.h
@@ -40,11 +40,11 @@ class AlgebraicSimplifier : public HloPassInterface {
   // bitcasts.
   AlgebraicSimplifier(bool is_layout_sensitive,
                       ValidBitcastCallback valid_bitcast_callback,
-                      bool enable_dot_simplification = true,
+                      bool enable_dot_strength_reduction = true,
                       bool enable_conv_simplification = true)
       : is_layout_sensitive_(is_layout_sensitive),
         valid_bitcast_callback_(std::move(valid_bitcast_callback)),
-        enable_dot_simplification_(enable_dot_simplification),
+        enable_dot_strength_reduction_(enable_dot_strength_reduction),
         enable_conv_simplification_(enable_conv_simplification) {}
   ~AlgebraicSimplifier() override = default;
   tensorflow::StringPiece name() const override { return "algsimp"; }
@@ -58,7 +58,7 @@ class AlgebraicSimplifier : public HloPassInterface {
   ValidBitcastCallback valid_bitcast_callback_;
 
   // Enable dot simplication on platforms where it is profitable.
-  bool enable_dot_simplification_;
+  bool enable_dot_strength_reduction_;
 
   // Enable convolution simplication on platforms where it is profitable.
   bool enable_conv_simplification_;
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index 620f0a54fa03e7239809e9f910893d887f9ff149..d4739ca113a4094d6d98a6a9d45fbb14cbd124c5 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -327,6 +327,55 @@ TEST_F(AlgebraicSimplifierTest, DivOfBroadcastingPower) {
   EXPECT_EQ(0, negate_shape.dimensions_size());
 }
 
+// pow(pow(A, X), Y) => pow(A, X*Y)
+TEST_F(AlgebraicSimplifierTest, PowerOfPower) {
+  Shape r0f32 = ShapeUtil::MakeShape(F32, {});
+  Shape r1f32 = ShapeUtil::MakeShape(F32, {7});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* base = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r1f32, "param0"));
+  HloInstruction* exp1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r0f32, "param1"));
+  HloInstruction* exp2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, r0f32, "param2"));
+  HloInstruction* inner_power = builder.AddInstruction(
+      HloInstruction::CreateBinary(r1f32, HloOpcode::kPower, base, exp1));
+  builder.AddInstruction(HloInstruction::CreateBinary(r1f32, HloOpcode::kPower,
+                                                      inner_power, exp2));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_THAT(computation->root_instruction(),
+              op::Power(base, op::Multiply(exp1, exp2)));
+}
+
+// Don't simplify pow(pow(A, X), Y) => pow(A, X*Y) if X and Y are complex
+// numbers.
+TEST_F(AlgebraicSimplifierTest, PowerOfPowerComplex) {
+  Shape r0c64 = ShapeUtil::MakeShape(C64, {});
+  Shape r1f32 = ShapeUtil::MakeShape(F32, {7});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* base = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r1f32, "param0"));
+  HloInstruction* exp1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r0c64, "param1"));
+  HloInstruction* exp2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, r0c64, "param2"));
+  HloInstruction* inner_power = builder.AddInstruction(
+      HloInstruction::CreateBinary(r1f32, HloOpcode::kPower, base, exp1));
+  builder.AddInstruction(HloInstruction::CreateBinary(r1f32, HloOpcode::kPower,
+                                                      inner_power, exp2));
+
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+}
+
 // Test that A/1 is simplified to A for a scalar.
 TEST_F(AlgebraicSimplifierTest, DivOneScalar) {
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
@@ -761,8 +810,10 @@ TEST_F(AlgebraicSimplifierTest, PowNegative1) {
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Divide(op::Constant(), param0));
-  EXPECT_EQ(root->operand(0)->literal().GetFirstElement<float>(), 1);
+  EXPECT_THAT(root, op::Divide(op::Broadcast(), param0));
+  EXPECT_EQ(root->operand(0)->opcode(), HloOpcode::kBroadcast);
+  EXPECT_EQ(root->operand(0)->operand(0)->literal().GetFirstElement<float>(),
+            1);
 }
 
 TEST_F(AlgebraicSimplifierTest, ReshapeBroadcast) {
@@ -1622,8 +1673,11 @@ TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) {
     ConvolutionDimensionNumbers dnums;
     std::vector<int64> in_dims;
     int in_channel_idx = -1;
-    dnums.add_spatial_dimensions(-1);  // filled in later
-    dnums.add_spatial_dimensions(-1);  // filled in later
+    // filled in later
+    dnums.add_input_spatial_dimensions(-1);
+    dnums.add_output_spatial_dimensions(-1);
+    dnums.add_input_spatial_dimensions(-1);
+    dnums.add_output_spatial_dimensions(-1);
     for (int i = 0; i < strlen(options.dim_order); ++i) {
       char ch = options.dim_order[i];
       if (ch == 'N') {
@@ -1631,10 +1685,12 @@ TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) {
         dnums.set_output_batch_dimension(i);
         in_dims.push_back(options.in_batch);
       } else if (ch == 'H') {
-        dnums.set_spatial_dimensions(0, i);
+        dnums.set_input_spatial_dimensions(0, i);
+        dnums.set_output_spatial_dimensions(0, i);
         in_dims.push_back(options.in_height);
       } else if (ch == 'W') {
-        dnums.set_spatial_dimensions(1, i);
+        dnums.set_input_spatial_dimensions(1, i);
+        dnums.set_output_spatial_dimensions(1, i);
         in_dims.push_back(options.in_width);
       } else if (ch == 'C') {
         dnums.set_input_feature_dimension(i);
@@ -2131,8 +2187,10 @@ TEST_F(AlgebraicSimplifierTest, IteratorInvalidation) {
       builder.AddInstruction(HloInstruction::CreateParameter(0, r1f32, "x"));
   HloInstruction* y =
       builder.AddInstruction(HloInstruction::CreateParameter(1, r1f32, "y"));
-  builder.AddInstruction(
-      HloInstruction::CreateBinary(r1f32, HloOpcode::kDot, x, y));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  builder.AddInstruction(HloInstruction::CreateDot(r1f32, x, y, dot_dnums));
   std::unique_ptr<HloComputation> dot_computation(builder.Build());
 
   HloComputation::Builder call_builder(TestName() + ".Call");
@@ -2229,5 +2287,210 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicUpdateSlice) {
               op::DynamicSlice(op::Parameter(), op::Parameter()));
 }
 
+class DotStrengthReductionTest
+    : public AlgebraicSimplifierTest,
+      public ::testing::WithParamInterface<
+          ::testing::tuple<int, int, int, bool, bool>> {};
+TEST_P(DotStrengthReductionTest, DotStrengthReduction) {
+  int m, k, n;
+  bool transpose_lhs, transpose_rhs;
+  std::tie(m, k, n, transpose_lhs, transpose_rhs) = GetParam();
+
+  Shape dot_shape = ShapeUtil::MakeShape(F32, {m, n});
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {m, k});
+  Shape transposed_lhs_shape = ShapeUtil::MakeShape(F32, {k, m});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {k, n});
+  Shape transposed_rhs_shape = ShapeUtil::MakeShape(F32, {n, k});
+  HloComputation::Builder builder(TestName());
+
+  auto lhs = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, transpose_lhs ? transposed_lhs_shape : lhs_shape, "lhs"));
+  if (transpose_lhs) {
+    lhs = builder.AddInstruction(
+        HloInstruction::CreateTranspose(lhs_shape, lhs, {1, 0}));
+  }
+  auto rhs = builder.AddInstruction(HloInstruction::CreateParameter(
+      1, transpose_rhs ? transposed_rhs_shape : rhs_shape, "rhs"));
+  if (transpose_rhs) {
+    rhs = builder.AddInstruction(
+        HloInstruction::CreateTranspose(rhs_shape, rhs, {1, 0}));
+  }
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  builder.AddInstruction(
+      HloInstruction::CreateDot(dot_shape, lhs, rhs, dot_dnums));
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, simplifier.Run(module.get()));
+  const bool dot_should_be_transformed = m == 1 || k == 1 || n == 1;
+  const bool computation_should_be_modified =
+      dot_should_be_transformed || (transpose_lhs && transpose_rhs);
+  EXPECT_EQ(changed, computation_should_be_modified);
+  bool has_no_dot = true;
+  for (const auto& hlo : computation->instructions()) {
+    if (hlo->opcode() == HloOpcode::kDot) {
+      has_no_dot = false;
+      break;
+    }
+  }
+  EXPECT_EQ(has_no_dot, dot_should_be_transformed);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    DotStrengthReductionTestInstantiation, DotStrengthReductionTest,
+    ::testing::Combine(::testing::Values(1, 2), ::testing::Values(1, 2),
+                       ::testing::Values(1, 2), ::testing::Bool(),
+                       ::testing::Bool()));
+
+struct DotOfConcatTestSpec {
+  int64 m;
+  int64 k;
+  int64 n;
+};
+
+class DotOfConcatSimplificationTest
+    : public HloTestBase,
+      public ::testing::WithParamInterface<DotOfConcatTestSpec> {};
+
+// Test that we transform
+//  dot(const, concat(A, B, C))
+// to
+//  add(dot(const_0, A), dot(const_1, B),  dot(const_2, C))
+TEST_P(DotOfConcatSimplificationTest, ConstantLHS) {
+  HloComputation::Builder builder(TestName());
+
+  DotOfConcatTestSpec spec = GetParam();
+
+  ASSERT_GE(spec.k, 3);
+
+  int64 k0 = spec.k / 3;
+  int64 k1 = spec.k / 3;
+  int64 k2 = spec.k - k0 - k1;
+
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {spec.m, spec.k});
+  auto* lhs = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR2F32Linspace(
+          /*from=*/10.0, /*to=*/10000.0, /*rows=*/spec.m, /*cols=*/spec.k)));
+
+  Shape rhs0_shape = ShapeUtil::MakeShape(F32, {k0, spec.n});
+  Shape rhs1_shape = ShapeUtil::MakeShape(F32, {k1, spec.n});
+  Shape rhs2_shape = ShapeUtil::MakeShape(F32, {k2, spec.n});
+
+  HloInstruction* rhs0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, rhs0_shape, "rhs0"));
+  HloInstruction* rhs1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, rhs1_shape, "rhs1"));
+  HloInstruction* rhs2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, rhs2_shape, "rhs2"));
+
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {spec.k, spec.n});
+  HloInstruction* rhs = builder.AddInstruction(
+      HloInstruction::CreateConcatenate(rhs_shape, {rhs0, rhs1, rhs2}, 0));
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+
+  Shape dot_shape = ShapeUtil::MakeShape(F32, {spec.m, spec.n});
+  builder.AddInstruction(
+      HloInstruction::CreateDot(dot_shape, lhs, rhs, dot_dnums));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(module.get()));
+  ASSERT_TRUE(run_successful);
+
+  EXPECT_TRUE(
+      ShapeUtil::Equal(computation->root_instruction()->shape(), dot_shape));
+
+  auto match_dot_0 = op::Dot(op::Slice(op::Constant()), op::Parameter(0));
+  auto match_dot_1 = op::Dot(op::Slice(op::Constant()), op::Parameter(1));
+  auto match_dot_2 = op::Dot(op::Slice(op::Constant()), op::Parameter(2));
+  EXPECT_THAT(computation->root_instruction(),
+              op::Add(op::Add(match_dot_0, match_dot_1), match_dot_2));
+}
+
+// Test that we transform
+//  dot(concat(A, B, C), const)
+// to
+//  add(dot(A, const_0), dot(B, const_1),  dot(C, const_2))
+TEST_P(DotOfConcatSimplificationTest, ConstantRHS) {
+  HloComputation::Builder builder(TestName());
+
+  DotOfConcatTestSpec spec = GetParam();
+
+  ASSERT_GE(spec.k, 4);
+
+  int64 k0 = spec.k / 4;
+  int64 k1 = spec.k / 4;
+  int64 k2 = spec.k / 4;
+  int64 k3 = spec.k - k0 - k1 - k2;
+
+  Shape lhs0_shape = ShapeUtil::MakeShape(F32, {spec.m, k0});
+  Shape lhs1_shape = ShapeUtil::MakeShape(F32, {spec.m, k1});
+  Shape lhs2_shape = ShapeUtil::MakeShape(F32, {spec.m, k2});
+  Shape lhs3_shape = ShapeUtil::MakeShape(F32, {spec.m, k3});
+
+  HloInstruction* lhs0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, lhs0_shape, "lhs0"));
+  HloInstruction* lhs1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, lhs1_shape, "lhs1"));
+  HloInstruction* lhs2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, lhs2_shape, "lhs2"));
+  HloInstruction* lhs3 = builder.AddInstruction(
+      HloInstruction::CreateParameter(3, lhs2_shape, "lhs3"));
+
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {spec.m, spec.k});
+  HloInstruction* lhs =
+      builder.AddInstruction(HloInstruction::CreateConcatenate(
+          lhs_shape, {lhs0, lhs1, lhs2, lhs3}, 1));
+
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {spec.k, spec.m});
+  auto* rhs = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR2F32Linspace(
+          /*from=*/10.0, /*to=*/10000.0, /*rows=*/spec.k, /*cols=*/spec.m)));
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+
+  Shape dot_shape = ShapeUtil::MakeShape(F32, {spec.m, spec.n});
+  builder.AddInstruction(
+      HloInstruction::CreateDot(dot_shape, lhs, rhs, dot_dnums));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(module.get()));
+  ASSERT_TRUE(run_successful);
+  EXPECT_TRUE(
+      ShapeUtil::Equal(computation->root_instruction()->shape(), dot_shape));
+
+  auto match_dot_0 = op::Dot(op::Parameter(0), op::Slice(op::Constant()));
+  auto match_dot_1 = op::Dot(op::Parameter(1), op::Slice(op::Constant()));
+  auto match_dot_2 = op::Dot(op::Parameter(2), op::Slice(op::Constant()));
+  auto match_dot_3 = op::Dot(op::Parameter(3), op::Slice(op::Constant()));
+  EXPECT_THAT(computation->root_instruction(),
+              op::Add(op::Add(op::Add(match_dot_0, match_dot_1), match_dot_2),
+                      match_dot_3));
+}
+
+DotOfConcatTestSpec kDotOfConcatTestSpecs[] = {
+    {/*m=*/3, /*k=*/9, /*n=*/3},    //
+    {/*m=*/3, /*k=*/20, /*n=*/3},   //
+    {/*m=*/1, /*k=*/18, /*n=*/5},   //
+    {/*m=*/20, /*k=*/20, /*n=*/1},  //
+    {/*m=*/1, /*k=*/16, /*n=*/1},   //
+};
+
+INSTANTIATE_TEST_CASE_P(DotOfConcatSimplificationTestInstantiation,
+                        DotOfConcatSimplificationTest,
+                        ::testing::ValuesIn(kDotOfConcatTestSpecs));
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc
index ad2fee2d39a8ca183b87212bdeea22c351aaa88a..b69a6e730fc65b2e590f22115569ce27145bf6ab 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.cc
+++ b/tensorflow/compiler/xla/service/allocation_tracker.cc
@@ -27,191 +27,163 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
 namespace se = ::perftools::gputools;
 
 namespace xla {
 
-AllocationTracker::AllocationTracker() : next_handle_(1) {}
-
-GlobalDataHandle AllocationTracker::Register(Backend* backend,
-                                             int device_ordinal,
-                                             se::DeviceMemoryBase device_memory,
-                                             const Shape& shape,
-                                             const string& tag) {
-  tensorflow::mutex_lock lock(allocation_mutex_);
+StatusOr<GlobalDataHandle> AllocationTracker::Register(
+    std::unique_ptr<ShapedBuffer> shaped_buffer, const string& tag) {
+  tensorflow::mutex_lock lock(mutex_);
   VLOG(2) << "Register";
-  return RegisterInternal(backend, device_ordinal, device_memory, shape, tag,
-                          /*initial_ref_count=*/1);
+  return RegisterInternal(std::move(shaped_buffer), tag);
 }
 
-GlobalDataHandle AllocationTracker::RegisterInternal(
-    Backend* backend, int device_ordinal, se::DeviceMemoryBase device_memory,
-    const Shape& shape, const string& tag, int initial_ref_count) {
+StatusOr<GlobalDataHandle> AllocationTracker::RegisterInternal(
+    std::unique_ptr<ShapedBuffer> shaped_buffer, const string& tag) {
   VLOG(2) << "RegisterInternal("
           << "tag: \"" << tag << "\" "
-          << "device_ordinal: " << device_ordinal << " "
-          << "device_memory: " << device_memory.opaque() << " "
-          << "shape: " << shape.ShortDebugString() << ")";
-  TF_CHECK_OK(ShapeUtil::ValidateShape(shape));
-
-  int64 handle;
-  HandleMap& handle_map = GetOrCreateOpaqueToHandleMap(device_ordinal);
-  auto handle_it = handle_map.find(device_memory.opaque());
-  if (handle_it != handle_map.end()) {
-    handle = handle_it->second;
-    auto& allocation = FindOrDie(handle_to_allocation_, handle);
-    int ref_count = allocation->ref_count();
-    CHECK_GT(ref_count, 0);
-    VLOG(2) << "ref_count: " << ref_count << " -> " <<
-            (ref_count + initial_ref_count);
-    allocation->increment_ref_count(initial_ref_count);
-  } else {
-    handle = next_handle_++;
-    VLOG(2) << "ref_count: " << initial_ref_count;
-    InsertOrDie(&handle_map, device_memory.opaque(), handle);
-    auto inserted = handle_to_allocation_.emplace(
-        handle, MakeUnique<Allocation>(backend, device_ordinal, device_memory,
-                                       shape, tag, initial_ref_count));
-    CHECK(inserted.second);
+          << "shaped_buffer: " << *shaped_buffer;
+  if (shaped_buffer->platform() != backend_->platform()) {
+    return InvalidArgument(
+        "AllocationTracker for platform %s cannot register buffer from "
+        "platform %s",
+        backend_->platform()->Name().c_str(),
+        shaped_buffer->platform()->Name().c_str());
   }
 
+  int64 handle = next_handle_++;
+  std::vector<ShapeIndex> shape_indices;
+  ShapeUtil::ForEachSubshape(shaped_buffer->on_device_shape(),
+                             [this, &shape_indices](const Shape& /*subshape*/,
+                                                    const ShapeIndex& index) {
+                               shape_indices.push_back(index);
+                             });
+  for (const ShapeIndex& index : shape_indices) {
+    AddAllocationOrIncrementRefCount(shaped_buffer->buffer(index),
+                                     shaped_buffer->device_ordinal());
+  }
   GlobalDataHandle result;
   result.set_handle(handle);
+
+  handle_to_shaped_buffer_[handle] = std::move(shaped_buffer);
+
   VLOG(2) << "handle: " << handle;
 
   return result;
 }
 
 tensorflow::Status AllocationTracker::Unregister(const GlobalDataHandle& data) {
-  tensorflow::mutex_lock lock(allocation_mutex_);
-  TF_ASSIGN_OR_RETURN(Allocation * allocation, ResolveInternal(data));
-  std::set<void*> deallocated_buffers;
-  TF_RETURN_IF_ERROR(
-      DeallocateShape(allocation->backend(), allocation->device_ordinal(),
-                      allocation->mutable_device_memory(), allocation->shape(),
-                      &deallocated_buffers));
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Status AllocationTracker::DeallocateShape(
-    Backend* backend, int device_ordinal, se::DeviceMemoryBase* device_memory,
-    const Shape& shape, std::set<void*>* deallocated_buffers) {
-  VLOG(2) << "DeallocateShape("
-          << "shape: \"" << shape.ShortDebugString() << "\" "
-          << "device_memory: " << device_memory->opaque() << ")";
-  if (ContainsKey(*deallocated_buffers, device_memory->opaque())) {
-    // Buffer has already been deallocated. Nothing to do.
-    VLOG(2) << "already deallocated";
-    return tensorflow::Status::OK();
+  tensorflow::mutex_lock lock(mutex_);
+  VLOG(2) << "Unregister("
+          << "handle: " << data.handle() << ")";
+  TF_ASSIGN_OR_RETURN(ShapedBuffer * shaped_buffer, ResolveInternal(data));
+  std::vector<ShapeIndex> shape_indices;
+  ShapeUtil::ForEachSubshape(shaped_buffer->on_device_shape(),
+                             [this, &shape_indices](const Shape& /*subshape*/,
+                                                    const ShapeIndex& index) {
+                               shape_indices.push_back(index);
+                             });
+  for (const ShapeIndex& index : shape_indices) {
+    TF_RETURN_IF_ERROR(DecrementRefCount(shaped_buffer->buffer(index),
+                                         shaped_buffer->device_ordinal()));
   }
 
-  // Add buffer to deallocated set so we do not try to deallocate it again
-  // if it is encountered again while traversing a tuple.
-  deallocated_buffers->insert(device_memory->opaque());
-
-  HandleMap& handle_map = GetOrCreateOpaqueToHandleMap(device_ordinal);
-  auto handle_it = handle_map.find(device_memory->opaque());
-  if (handle_it != handle_map.end()) {
-    int64 handle = handle_it->second;
-    auto& allocation = FindOrDie(handle_to_allocation_, handle);
-    int ref_count = allocation->ref_count();
-    VLOG(2) << "ref_count: " << ref_count << " -> " << ref_count - 1;
-    allocation->decrement_ref_count();
-    if (allocation->ref_count() > 0) {
-      // Buffer is referred to by another allocation. Don't deallocate it.
-      return tensorflow::Status::OK();
-    }
-    handle_map.erase(device_memory->opaque());
-  }
+  // Keep a nullptr as a tombstone for unregistered handles. This enables better
+  // error messages. That is, "handle has been deallocated" versus "handle does
+  // not exist".
+  handle_to_shaped_buffer_.at(data.handle()).reset();
 
-  if (ShapeUtil::IsTuple(shape)) {
-    // Traverse into tuple recursively deallocating buffers.
-    TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
-                        backend->stream_executor(device_ordinal));
-    TF_ASSIGN_OR_RETURN(std::vector<se::DeviceMemoryBase> elements,
-                        backend->transfer_manager()->ShallowCopyTupleFromDevice(
-                            executor, *device_memory, shape));
-
-    TF_RET_CHECK(ShapeUtil::TupleElementCount(shape) == elements.size())
-        << "tuple has unexpected number of elements: " << elements.size()
-        << " != " << ShapeUtil::TupleElementCount(shape);
-    for (size_t i = 0; i < elements.size(); ++i) {
-      VLOG(2) << "recursing onto the tuple elements";
-      TF_RETURN_IF_ERROR(DeallocateShape(backend, device_ordinal, &elements[i],
-                                         shape.tuple_shapes(i),
-                                         deallocated_buffers));
-    }
-  }
-
-  return backend->memory_allocator()->Deallocate(device_ordinal, device_memory);
+  return tensorflow::Status::OK();
 }
 
 StatusOr<std::vector<GlobalDataHandle>> AllocationTracker::DeconstructTuple(
     const GlobalDataHandle& data) {
-  tensorflow::mutex_lock lock(allocation_mutex_);
-  TF_ASSIGN_OR_RETURN(Allocation * allocation, ResolveInternal(data));
+  tensorflow::mutex_lock lock(mutex_);
 
-  if (!ShapeUtil::IsTuple(allocation->shape())) {
+  TF_ASSIGN_OR_RETURN(ShapedBuffer * shaped_buffer, ResolveInternal(data));
+  if (!ShapeUtil::IsTuple(shaped_buffer->on_host_shape())) {
     return InvalidArgument("global data handle %lld is not a tuple",
                            data.handle());
   }
+  // If the on-host representation is a tuple, then the on-device one should be
+  // as well.
+  TF_RET_CHECK(ShapeUtil::IsTuple(shaped_buffer->on_device_shape()));
 
-  if (ShapeUtil::IsNestedTuple(allocation->shape())) {
+  if (ShapeUtil::IsNestedTuple(shaped_buffer->on_device_shape())) {
     return Unimplemented("deconstructing nested tuples not yet supported");
   }
 
-  TF_ASSIGN_OR_RETURN(
-      se::StreamExecutor * executor,
-      allocation->backend()->stream_executor(allocation->device_ordinal()));
-  TF_ASSIGN_OR_RETURN(
-      std::vector<se::DeviceMemoryBase> element_bases,
-      allocation->backend()->transfer_manager()->ShallowCopyTupleFromDevice(
-          executor, allocation->device_memory(), allocation->shape()));
-
   std::vector<GlobalDataHandle> element_handles;
-  element_handles.reserve(element_bases.size());
-  for (int i = 0; i < element_bases.size(); ++i) {
-    element_handles.push_back(RegisterInternal(
-        allocation->backend(), allocation->device_ordinal(), element_bases[i],
-        ShapeUtil::GetSubshape(allocation->shape(), {i}),
-        tensorflow::strings::StrCat(allocation->tag(), ".element_", i),
-        /*initial_ref_count=*/2));
+  for (int i = 0;
+       i < ShapeUtil::TupleElementCount(shaped_buffer->on_device_shape());
+       ++i) {
+    auto element_buffer = MakeUnique<ShapedBuffer>(
+        ShapeUtil::GetTupleElementShape(shaped_buffer->on_host_shape(), i),
+        ShapeUtil::GetTupleElementShape(shaped_buffer->on_device_shape(), i),
+        shaped_buffer->platform(), shaped_buffer->device_ordinal());
+    element_buffer->set_buffer(shaped_buffer->buffer(/*index=*/{i}),
+                               /*index=*/{});
+    TF_ASSIGN_OR_RETURN(
+        GlobalDataHandle element_handle,
+        RegisterInternal(std::move(element_buffer), "deconstructed tuple"));
+
+    element_handles.push_back(element_handle);
   }
   return std::move(element_handles);
 }
 
-StatusOr<const Allocation*> AllocationTracker::Resolve(
+StatusOr<const ShapedBuffer*> AllocationTracker::Resolve(
     const GlobalDataHandle& data) {
-  tensorflow::mutex_lock lock(allocation_mutex_);
+  tensorflow::mutex_lock lock(mutex_);
   return AllocationTracker::ResolveInternal(data);
 }
 
-StatusOr<Allocation*> AllocationTracker::ResolveInternal(
+StatusOr<ShapedBuffer*> AllocationTracker::ResolveInternal(
     const GlobalDataHandle& data) {
   VLOG(2) << "resolve:" << data.handle();
-  auto it = handle_to_allocation_.find(data.handle());
-  if (it == handle_to_allocation_.end()) {
+  auto it = handle_to_shaped_buffer_.find(data.handle());
+  if (it == handle_to_shaped_buffer_.end()) {
     return NotFound("no allocation record for global data handle: %lld",
                     data.handle());
   }
-  Allocation* allocation = it->second.get();
+  ShapedBuffer* shaped_buffer = it->second.get();
 
-  if (allocation->is_deallocated()) {
+  if (shaped_buffer == nullptr) {
     return InvalidArgument("global data handle %lld was previously deallocated",
                            data.handle());
   }
 
-  return allocation;
+  return shaped_buffer;
 }
 
-AllocationTracker::HandleMap& AllocationTracker::GetOrCreateOpaqueToHandleMap(
-    int device_ordinal) {
-  if (opaque_to_handle_.size() <= device_ordinal) {
-    opaque_to_handle_.resize(device_ordinal + 1);
+void AllocationTracker::AddAllocationOrIncrementRefCount(
+    perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal) {
+  AllocationMap& allocation_map = opaque_to_allocation_map_[device_ordinal];
+  auto it = allocation_map.find(device_memory.opaque());
+  if (it == allocation_map.end()) {
+    allocation_map[device_memory.opaque()] = {device_memory, device_ordinal,
+                                              /*ref_count=*/1};
+  } else {
+    it->second.ref_count++;
   }
-  return opaque_to_handle_[device_ordinal];
+}
+
+Status AllocationTracker::DecrementRefCount(
+    perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal) {
+  AllocationMap& allocation_map = opaque_to_allocation_map_[device_ordinal];
+  auto it = allocation_map.find(device_memory.opaque());
+  TF_RET_CHECK(it != allocation_map.end());
+  Allocation& allocation = it->second;
+  TF_RET_CHECK(allocation.ref_count >= 1);
+  if (allocation.ref_count == 1) {
+    TF_RETURN_IF_ERROR(backend_->memory_allocator()->Deallocate(
+        device_ordinal, &device_memory));
+    allocation_map.erase(it);
+  } else {
+    allocation.ref_count--;
+  }
+  return tensorflow::Status::OK();
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.h b/tensorflow/compiler/xla/service/allocation_tracker.h
index ebbf35b6fe87bc7322ccb99cfe8f8eed56de06b3..8b25cbb482720f7debe95bb5ff74afe696bd8b73 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.h
+++ b/tensorflow/compiler/xla/service/allocation_tracker.h
@@ -28,147 +28,92 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 
-// A global allocation in device space, tracked by the XLA service.
-class Allocation {
- public:
-  Allocation(Backend* backend, int device_ordinal,
-             perftools::gputools::DeviceMemoryBase device_memory,
-             const Shape& shape, const string& tag, int initial_ref_count)
-      : backend_(backend),
-        device_ordinal_(device_ordinal),
-        device_memory_(device_memory),
-        shape_(shape),
-        tag_(tag),
-        ref_count_(initial_ref_count) {}
-
-  Backend* backend() const { return backend_; }
-  int device_ordinal() const { return device_ordinal_; }
-  perftools::gputools::DeviceMemoryBase device_memory() const {
-    return device_memory_;
-  }
-  const Shape& shape() const { return shape_; }
-  const string& tag() const { return tag_; }
-
-  bool is_deallocated() const {
-    CHECK_GE(ref_count_, 0);
-    return ref_count_ == 0;
-  }
-  int ref_count() const {
-    CHECK_GE(ref_count_, 0);
-    return ref_count_;
-  }
-  void increment_ref_count(int inc) {
-    CHECK_GT(ref_count_, 0);
-    CHECK_LE(ref_count_, INT_MAX - inc);
-    ref_count_ += inc;
-  }
-  void decrement_ref_count() {
-    CHECK_GT(ref_count_, 0);
-    --ref_count_;
-  }
-  perftools::gputools::DeviceMemoryBase* mutable_device_memory() {
-    return &device_memory_;
-  }
-
- private:
-  // The backend that the memory is allocated on.
-  Backend* backend_;
-
-  // The device that the memory is allocated on.
-  int device_ordinal_;
-
-  // The pointer to this allocation.
-  perftools::gputools::DeviceMemoryBase device_memory_;
-
-  // The shape of this allocation.
-  Shape shape_;
-
-  // An informal description of this allocation shown in tools.
-  string tag_;
-
-  // This is the number of Allocation objects which refer to this memory
-  // allocation.
-  int ref_count_;
-
-  // Return a string representation of this allocation for debugging or logging
-  // purposes.
-  string ToString() const;
-};
-
 // Tracks allocations for the XLA service; allocations can be registered
 // with shape/device/tag and resolved from a handle for later use.
 class AllocationTracker {
  public:
-  AllocationTracker();
+  // The allocator is used for deallocating memory when allocations are
+  // deregistered. All registered allocations must have the same platform as the
+  // allocator.
+  AllocationTracker(Backend* backend) : backend_(backend), next_handle_(1) {}
 
-  // Registers device memory with a given shape, device identifier, and tag, and
-  // returns a corresponding handle that can be used for talking to XLA
-  // clients.
-  GlobalDataHandle Register(Backend* backend, int device_ordinal,
-                            perftools::gputools::DeviceMemoryBase device_memory,
-                            const Shape& shape, const string& tag);
+  // Registers a shaped buffer of device memory, and returns a corresponding
+  // handle that can be used for talking to XLA clients.
+  StatusOr<GlobalDataHandle> Register(
+      std::unique_ptr<ShapedBuffer> shaped_buffer, const string& tag);
 
   // Unregister the allocation for the given data handle.
-  tensorflow::Status Unregister(const GlobalDataHandle& data);
+  Status Unregister(const GlobalDataHandle& data);
 
   // Returns a vector of global data handles that point to the tuple elements.
   StatusOr<std::vector<GlobalDataHandle>> DeconstructTuple(
       const GlobalDataHandle& Data);
 
-  // Resolve a handle from an XLA client to an allocation, or provide an
-  // error status to say whether it was not found (or found, but found
-  // deallocated).
-  StatusOr<const Allocation*> Resolve(const GlobalDataHandle& data);
+  // Resolve a handle from an XLA client to a shaped buffer, or provide an error
+  // status to say whether it was not found (or found, but found deallocated).
+  StatusOr<const ShapedBuffer*> Resolve(const GlobalDataHandle& data);
 
  private:
-  // Internal helper which resolves the given GlobalDataHandle to an Allocation.
-  StatusOr<Allocation*> ResolveInternal(const GlobalDataHandle& data)
-      EXCLUSIVE_LOCKS_REQUIRED(allocation_mutex_);
-
-  GlobalDataHandle RegisterInternal(
-      Backend* backend, int device_ordinal,
-      perftools::gputools::DeviceMemoryBase device_memory, const Shape& shape,
-      const string& tag, int initial_ref_count)
-      EXCLUSIVE_LOCKS_REQUIRED(allocation_mutex_);
-
-  // Helper function which deallocates the memory buffer containing the given
-  // shape referred to by device_memory. Tuples are traversed recursively
-  // deallocating all nested buffers. The parameter deallocated_buffers contains
-  // the set of buffers deallocated so far stored as opaque values (void *) from
-  // DeviceMemoryBase. Keeping track of deallocated buffers prevents
-  // double-freeing of buffers which may be referred to more than once in a
-  // nested tuple.
-  tensorflow::Status DeallocateShape(
-      Backend* backend, int device_ordinal,
-      perftools::gputools::DeviceMemoryBase* device_memory, const Shape& shape,
-      std::set<void*>* deallocated_buffers)
-      EXCLUSIVE_LOCKS_REQUIRED(allocation_mutex_);
-
-  // Returns the opaque_to_handle_ map for the given device_ordinal, creating
-  // a new map if there is not one for the device_ordinal.
-  using HandleMap = std::map<void*, int64>;
-  HandleMap& GetOrCreateOpaqueToHandleMap(int device_ordinal)
-      EXCLUSIVE_LOCKS_REQUIRED(allocation_mutex_);
-
-  tensorflow::mutex allocation_mutex_;  // Guards the allocation mapping.
+  // Data structure encapsulating single memory allocation on the device.
+  struct Allocation {
+    // The pointer to this allocation.
+    perftools::gputools::DeviceMemoryBase device_memory;
+
+    // The device that the memory is allocated on.
+    int device_ordinal;
+
+    // This is the number of times this memory allocation is refered to by
+    // registered data handles.
+    int ref_count;
+  };
+
+  // Internal helper which resolves the given GlobalDataHandle to a
+  // ShapedBuffer.
+  StatusOr<ShapedBuffer*> ResolveInternal(const GlobalDataHandle& data)
+      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // Internal helper which registers a shaped buffer.
+  StatusOr<GlobalDataHandle> RegisterInternal(
+      std::unique_ptr<ShapedBuffer> shaped_buffer, const string& tag)
+      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // Adds the given device address to the allocation tracker, or if it already
+  // exists, then increment it's reference count.
+  void AddAllocationOrIncrementRefCount(
+      perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal)
+      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // Decrements the reference count of the given device memory. Then, if it is
+  // zero, deallocate the memory.
+  Status DecrementRefCount(perftools::gputools::DeviceMemoryBase device_memory,
+                           int device_ordinal) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // A map from device memory opaque value to allocation. One such map is
+  // maintained per device ordinal.
+  using AllocationMap = tensorflow::gtl::FlatMap<const void*, Allocation>;
+
+  tensorflow::mutex mutex_;
+
+  // Backend to use with this tracker. The backend supplies the memory allocator
+  // to use when deallocating memory.
+  Backend* backend_;
 
   // The next handle to assign to an allocation, guarded by the same mutex as
   // the mapping as they'll be mutated at the same time.
-  int64 next_handle_ GUARDED_BY(allocation_mutex_);
+  int64 next_handle_ GUARDED_BY(mutex_);
 
-  // A map from DeviceMemoryBase to handle for each device_ordinal.
-  std::vector<HandleMap> opaque_to_handle_ GUARDED_BY(allocation_mutex_);
+  // A map from device ordinal to AllocationMap.
+  tensorflow::gtl::FlatMap<int, AllocationMap> opaque_to_allocation_map_
+      GUARDED_BY(mutex_);
 
-  // Mapping from GlobalDataHandle handle to the corresponding registered
-  // Allocation object.
-  std::map<int64, std::unique_ptr<Allocation>> handle_to_allocation_
-      GUARDED_BY(allocation_mutex_);
+  // A map from data handle to ShapedBuffer.
+  tensorflow::gtl::FlatMap<int64, std::unique_ptr<ShapedBuffer>>
+      handle_to_shaped_buffer_ GUARDED_BY(mutex_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(AllocationTracker);
 };
diff --git a/tensorflow/compiler/xla/service/batchnorm_rewriter.cc b/tensorflow/compiler/xla/service/batchnorm_expander.cc
similarity index 50%
rename from tensorflow/compiler/xla/service/batchnorm_rewriter.cc
rename to tensorflow/compiler/xla/service/batchnorm_expander.cc
index abe881cd1a58a6173b9b93f10a7308d70106c889..b806d61663e2ca371d90dfe39d7fe66becfe4bc2 100644
--- a/tensorflow/compiler/xla/service/batchnorm_rewriter.cc
+++ b/tensorflow/compiler/xla/service/batchnorm_expander.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/batchnorm_rewriter.h"
+#include "tensorflow/compiler/xla/service/batchnorm_expander.h"
 
 #include <algorithm>
 #include <memory>
@@ -45,9 +45,9 @@ limitations under the License.
 
 namespace xla {
 
-// BatchNormRewriterVisitor traverses the HLO computation and rewrites BatchNorm
+// BatchNormExpanderVisitor traverses the HLO computation and rewrites BatchNorm
 // operations into smaller operations.
-class BatchNormRewriterVisitor : public DfsHloVisitorWithDefault {
+class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
  public:
   // Default visitor action is to do nothing and return OK.
   Status DefaultAction(HloInstruction* /*hlo_instruction*/) override {
@@ -68,10 +68,10 @@ class BatchNormRewriterVisitor : public DfsHloVisitorWithDefault {
   // Returns whether any batch norm ops were rewritten.
   const bool changed() const { return changed_; }
 
-  ~BatchNormRewriterVisitor() override = default;
+  ~BatchNormExpanderVisitor() override = default;
 
  private:
-  explicit BatchNormRewriterVisitor(HloComputation* computation,
+  explicit BatchNormExpanderVisitor(HloComputation* computation,
                                     bool rewrite_training_op,
                                     bool rewrite_inference_op,
                                     bool rewrite_grad_op, bool use_fusion)
@@ -85,16 +85,16 @@ class BatchNormRewriterVisitor : public DfsHloVisitorWithDefault {
                                              HloOpcode opcode) {
     HloComputation::Builder b("scalar_computation");
     auto scalar_lhs = b.AddInstruction(HloInstruction::CreateParameter(
-        0, ShapeUtil::MakeShape(F32, {}), "scalar_lhs"));
+        0, ShapeUtil::MakeShape(primitive_type, {}), "scalar_lhs"));
     auto scalar_rhs = b.AddInstruction(HloInstruction::CreateParameter(
-        1, ShapeUtil::MakeShape(F32, {}), "scalar_rhs"));
+        1, ShapeUtil::MakeShape(primitive_type, {}), "scalar_rhs"));
     auto scalar_op = b.AddInstruction(
         HloInstruction::CreateBinary(ShapeUtil::MakeShape(primitive_type, {}),
                                      opcode, scalar_lhs, scalar_rhs));
     return computation_->parent()->AddEmbeddedComputation(b.Build(scalar_op));
   }
 
-  // Current HloComputation instance the BatchNormRewriter is
+  // Current HloComputation instance the BatchNormExpander is
   // traversing.
   HloComputation* computation_;
 
@@ -130,11 +130,11 @@ class BatchNormRewriterVisitor : public DfsHloVisitorWithDefault {
   }
 };
 
-bool BatchNormRewriterVisitor::Run(HloComputation* computation,
+bool BatchNormExpanderVisitor::Run(HloComputation* computation,
                                    bool rewrite_training_op,
                                    bool rewrite_inference_op,
                                    bool rewrite_grad_op, bool use_fusion) {
-  BatchNormRewriterVisitor visitor(
+  BatchNormExpanderVisitor visitor(
       computation,
       /*rewrite_training_op=*/rewrite_training_op,
       /*rewrite_inference_op=*/rewrite_inference_op,
@@ -144,31 +144,46 @@ bool BatchNormRewriterVisitor::Run(HloComputation* computation,
   return visitor.changed_;
 }
 
-Status BatchNormRewriterVisitor::HandleBatchNormTraining(
+Status BatchNormExpanderVisitor::HandleBatchNormTraining(
     HloInstruction* batch_norm) {
   if (!rewrite_training_op_) {
     return Status::OK();
   }
+
+  std::vector<HloInstruction*> added_instructions;
+  auto add = [&](std::unique_ptr<HloInstruction> inst) {
+    HloInstruction* added_inst = computation_->AddInstruction(std::move(inst));
+    added_instructions.push_back(added_inst);
+    return added_inst;
+  };
+  int64 instruction_count_before = computation_->instruction_count();
+
   // Expand batch norm training into smaller HLO ops.
   HloInstruction* operand = batch_norm->mutable_operand(0);
   const Shape operand_shape = operand->shape();
+  PrimitiveType ptype = operand_shape.element_type();
   int64 feature_index = batch_norm->feature_index();
   const int64 feature_count = operand_shape.dimensions(feature_index);
   const int64 size_in_elements = ShapeUtil::ElementsIn(operand_shape);
-  auto elements_per_feature =
-      computation_->AddInstruction(HloInstruction::CreateConstant(
-          Literal::CreateR0<float>(size_in_elements / feature_count)));
+  auto elements_per_feature_literal =
+      Literal::CreateR0<float>(size_in_elements / feature_count);
+  TF_ASSIGN_OR_RETURN(elements_per_feature_literal,
+                      elements_per_feature_literal->Convert(ptype));
+  auto elements_per_feature = add(
+      HloInstruction::CreateConstant(std::move(elements_per_feature_literal)));
 
   HloInstruction* scale = batch_norm->mutable_operand(1);
   HloInstruction* offset = batch_norm->mutable_operand(2);
   const Shape feature_shape = scale->shape();
 
-  auto zero = computation_->AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0(0.0f)));
-
-  auto epsilon = computation_->AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0(batch_norm->epsilon())));
+  auto zero_literal = Literal::CreateR0(0.0f);
+  TF_ASSIGN_OR_RETURN(zero_literal, zero_literal->Convert(ptype));
+  auto zero = add(HloInstruction::CreateConstant(std::move(zero_literal)));
 
+  auto epsilon_literal = Literal::CreateR0(batch_norm->epsilon());
+  TF_ASSIGN_OR_RETURN(epsilon_literal, epsilon_literal->Convert(ptype));
+  auto epsilon =
+      add(HloInstruction::CreateConstant(std::move(epsilon_literal)));
   std::vector<int64> dimensions_without_feature;
 
   for (int64 i = 0; i < ShapeUtil::Rank(operand_shape); ++i) {
@@ -177,107 +192,114 @@ Status BatchNormRewriterVisitor::HandleBatchNormTraining(
     }
   }
 
-  auto scale_broadcasted = computation_->AddInstruction(
+  auto scale_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, scale, {feature_index}));
 
-  auto offset_broadcasted = computation_->AddInstruction(
+  auto offset_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, offset, {feature_index}));
 
   HloComputation* add_reduce_computation =
-      GetScalarBinaryComputation(F32, HloOpcode::kAdd);
+      GetScalarBinaryComputation(ptype, HloOpcode::kAdd);
 
   // X^2.
-  auto operand_squared =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kMultiply, operand, operand));
+  auto operand_squared = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kMultiply, operand, operand));
   // Sum[X].
-  auto sum = computation_->AddInstruction(HloInstruction::CreateReduce(
-      feature_shape, operand, zero, dimensions_without_feature,
-      add_reduce_computation));
+  auto sum = add(HloInstruction::CreateReduce(feature_shape, operand, zero,
+                                              dimensions_without_feature,
+                                              add_reduce_computation));
 
   // Sum[X^2].
-  auto squared_sum = computation_->AddInstruction(HloInstruction::CreateReduce(
+  auto squared_sum = add(HloInstruction::CreateReduce(
       feature_shape, operand_squared, zero, dimensions_without_feature,
       add_reduce_computation));
 
   // Fuse two parallel reduces together to improve performance.
-  if (use_fusion_) {
-    auto tuple = computation_->AddInstruction(
-        HloInstruction::CreateTuple({sum, squared_sum}));
+  if (use_fusion_ && !batch_norm->has_sharding()) {
+    auto tuple = add(HloInstruction::CreateTuple({sum, squared_sum}));
 
     auto fused = computation_->CreateFusionInstruction(
         {tuple, sum, squared_sum, operand_squared},
         HloInstruction::FusionKind::kInput);
 
-    sum = computation_->AddInstruction(
-        HloInstruction::CreateGetTupleElement(feature_shape, fused, 0));
+    sum = add(HloInstruction::CreateGetTupleElement(feature_shape, fused, 0));
 
-    squared_sum = computation_->AddInstruction(
-        HloInstruction::CreateGetTupleElement(feature_shape, fused, 1));
+    squared_sum =
+        add(HloInstruction::CreateGetTupleElement(feature_shape, fused, 1));
   }
 
   // E[X].
-  auto mean = computation_->AddInstruction(HloInstruction::CreateBinary(
+  auto mean = add(HloInstruction::CreateBinary(
       feature_shape, HloOpcode::kDivide, sum, elements_per_feature));
 
-  auto mean_broadcasted = computation_->AddInstruction(
+  auto mean_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, mean, {feature_index}));
 
   // E[X^2].
-  auto square_mean = computation_->AddInstruction(HloInstruction::CreateBinary(
+  auto square_mean = add(HloInstruction::CreateBinary(
       feature_shape, HloOpcode::kDivide, squared_sum, elements_per_feature));
 
   // E^2[X].
-  auto mean_square = computation_->AddInstruction(HloInstruction::CreateBinary(
+  auto mean_square = add(HloInstruction::CreateBinary(
       feature_shape, HloOpcode::kMultiply, mean, mean));
 
   // Var[X].
-  auto var = computation_->AddInstruction(HloInstruction::CreateBinary(
+  auto var = add(HloInstruction::CreateBinary(
       feature_shape, HloOpcode::kSubtract, square_mean, mean_square));
 
-  auto var_broadcasted = computation_->AddInstruction(
-      HloInstruction::CreateBroadcast(operand_shape, var, {feature_index}));
+  auto var_broadcasted =
+      add(HloInstruction::CreateBroadcast(operand_shape, var, {feature_index}));
 
   // Var[X] + epsilon.
-  auto var_add_epsilon =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon));
+  auto var_add_epsilon = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon));
 
-  auto neg_half = computation_->AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0(-0.5f)));
+  auto neg_half_literal = Literal::CreateR0(-0.5f);
+  TF_ASSIGN_OR_RETURN(neg_half_literal, neg_half_literal->Convert(ptype));
+  auto neg_half =
+      add(HloInstruction::CreateConstant(std::move(neg_half_literal)));
 
   // 1 / Sqrt[Var[X] + epsilon].
-  auto rsqrt_var_add_epsilon =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kPower, var_add_epsilon, neg_half));
+  auto rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kPower, var_add_epsilon, neg_half));
 
   // X - E[X].
-  auto operand_minus_mean =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kSubtract, operand, mean_broadcasted));
+  auto operand_minus_mean = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kSubtract, operand, mean_broadcasted));
 
   // (X - E[X]) / Sqrt[Var[X] + epsilon].
-  auto normalized = computation_->AddInstruction(
+  auto normalized = add(
       HloInstruction::CreateBinary(operand_shape, HloOpcode::kMultiply,
                                    operand_minus_mean, rsqrt_var_add_epsilon));
 
   // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale.
-  auto scaled_normalized =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kMultiply, normalized, scale_broadcasted));
+  auto scaled_normalized = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kMultiply, normalized, scale_broadcasted));
 
   // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale + offset.
-  auto shifted_normalized = computation_->AddInstruction(
-      HloInstruction::CreateBinary(operand_shape, HloOpcode::kAdd,
-                                   scaled_normalized, offset_broadcasted));
-
-  TF_CHECK_OK(ReplaceWithNewInstruction(
-      batch_norm,
-      HloInstruction::CreateTuple({shifted_normalized, mean, var})));
+  auto shifted_normalized = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kAdd, scaled_normalized, offset_broadcasted));
+
+  auto tuple = HloInstruction::CreateTuple({shifted_normalized, mean, var});
+
+  if (batch_norm->has_sharding()) {
+    int64 instruction_count_after = computation_->instruction_count();
+    CHECK_EQ(instruction_count_after,
+             instruction_count_before + added_instructions.size());
+    for (HloInstruction* inst : added_instructions) {
+      if (ShapeUtil::Equal(inst->shape(), operand_shape)) {
+        inst->set_sharding(batch_norm->sharding());
+      } else {
+        inst->set_sharding(HloSharding::Replicate());
+      }
+    }
+    tuple->set_sharding(batch_norm->sharding());
+  }
+  TF_CHECK_OK(ReplaceWithNewInstruction(batch_norm, std::move(tuple)));
   return Status::OK();
 }
 
-Status BatchNormRewriterVisitor::HandleBatchNormInference(
+Status BatchNormExpanderVisitor::HandleBatchNormInference(
     HloInstruction* batch_norm) {
   if (!rewrite_inference_op_) {
     return Status::OK();
@@ -286,6 +308,7 @@ Status BatchNormRewriterVisitor::HandleBatchNormInference(
   HloInstruction* operand = batch_norm->mutable_operand(0);
   const Shape operand_shape = operand->shape();
   int64 feature_index = batch_norm->feature_index();
+  PrimitiveType ptype = operand_shape.element_type();
 
   HloInstruction* scale = batch_norm->mutable_operand(1);
   HloInstruction* offset = batch_norm->mutable_operand(2);
@@ -293,8 +316,10 @@ Status BatchNormRewriterVisitor::HandleBatchNormInference(
   HloInstruction* var = batch_norm->mutable_operand(4);
   const Shape feature_shape = scale->shape();
 
+  auto epsilon_literal = Literal::CreateR0(batch_norm->epsilon());
+  TF_ASSIGN_OR_RETURN(epsilon_literal, epsilon_literal->Convert(ptype));
   auto epsilon = computation_->AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0(batch_norm->epsilon())));
+      HloInstruction::CreateConstant(std::move(epsilon_literal)));
 
   std::vector<int64> dimensions_without_feature;
 
@@ -304,56 +329,75 @@ Status BatchNormRewriterVisitor::HandleBatchNormInference(
     }
   }
 
-  auto scale_broadcasted = computation_->AddInstruction(
+  std::vector<HloInstruction*> added_instructions;
+  auto add = [&](std::unique_ptr<HloInstruction> inst) {
+    HloInstruction* added_inst = computation_->AddInstruction(std::move(inst));
+    added_instructions.push_back(added_inst);
+    return added_inst;
+  };
+  int64 instruction_count_before = computation_->instruction_count();
+
+  auto scale_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, scale, {feature_index}));
 
-  auto offset_broadcasted = computation_->AddInstruction(
+  auto offset_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, offset, {feature_index}));
 
-  auto mean_broadcasted = computation_->AddInstruction(
+  auto mean_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, mean, {feature_index}));
 
-  auto var_broadcasted = computation_->AddInstruction(
-      HloInstruction::CreateBroadcast(operand_shape, var, {feature_index}));
+  auto var_broadcasted =
+      add(HloInstruction::CreateBroadcast(operand_shape, var, {feature_index}));
 
   // Var[X] + epsilon.
-  auto var_add_epsilon =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon));
+  auto var_add_epsilon = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon));
 
-  auto neg_half = computation_->AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0(-0.5f)));
+  auto neg_half_literal = Literal::CreateR0(-0.5f);
+  TF_ASSIGN_OR_RETURN(neg_half_literal, neg_half_literal->Convert(ptype));
+  auto neg_half =
+      add(HloInstruction::CreateConstant(std::move(neg_half_literal)));
 
   // 1 / Sqrt[Var[X] + epsilon].
-  auto rsqrt_var_add_epsilon =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kPower, var_add_epsilon, neg_half));
+  auto rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kPower, var_add_epsilon, neg_half));
 
   // X - E[X].
-  auto operand_minus_mean =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kSubtract, operand, mean_broadcasted));
+  auto operand_minus_mean = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kSubtract, operand, mean_broadcasted));
 
   // (X - E[X]) / Sqrt[Var[X] + epsilon].
-  auto normalized = computation_->AddInstruction(
+  auto normalized = add(
       HloInstruction::CreateBinary(operand_shape, HloOpcode::kMultiply,
                                    operand_minus_mean, rsqrt_var_add_epsilon));
 
   // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale.
-  auto scaled_normalized =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kMultiply, normalized, scale_broadcasted));
+  auto scaled_normalized = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kMultiply, normalized, scale_broadcasted));
 
   // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale + offset.
   auto shifted_normalized = HloInstruction::CreateBinary(
       operand_shape, HloOpcode::kAdd, scaled_normalized, offset_broadcasted);
 
+  int64 instruction_count_after = computation_->instruction_count();
+  CHECK_EQ(instruction_count_after,
+           instruction_count_before + added_instructions.size());
+  if (batch_norm->has_sharding()) {
+    for (HloInstruction* inst : added_instructions) {
+      if (ShapeUtil::Equal(inst->shape(), operand_shape)) {
+        inst->set_sharding(batch_norm->sharding());
+      } else {
+        inst->set_sharding(HloSharding::Replicate());
+      }
+    }
+    shifted_normalized->set_sharding(batch_norm->sharding());
+  }
   TF_CHECK_OK(
       ReplaceWithNewInstruction(batch_norm, std::move(shifted_normalized)));
   return Status::OK();
 }
 
-Status BatchNormRewriterVisitor::HandleBatchNormGrad(
+Status BatchNormExpanderVisitor::HandleBatchNormGrad(
     HloInstruction* batch_norm) {
   // Use the following formulas to calculate gradients:
   // scale_grad =
@@ -370,9 +414,17 @@ Status BatchNormRewriterVisitor::HandleBatchNormGrad(
   if (!rewrite_grad_op_) {
     return Status::OK();
   }
+  std::vector<HloInstruction*> added_instructions;
+  auto add = [&](std::unique_ptr<HloInstruction> inst) {
+    HloInstruction* added_inst = computation_->AddInstruction(std::move(inst));
+    added_instructions.push_back(added_inst);
+    return added_inst;
+  };
+  int64 instruction_count_before = computation_->instruction_count();
 
   HloInstruction* activation = batch_norm->mutable_operand(0);
   const Shape activation_shape = activation->shape();
+  PrimitiveType ptype = activation_shape.element_type();
   HloInstruction* scale = batch_norm->mutable_operand(1);
   const Shape feature_shape = scale->shape();
   HloInstruction* mean = batch_norm->mutable_operand(2);
@@ -383,18 +435,26 @@ Status BatchNormRewriterVisitor::HandleBatchNormGrad(
 
   const int64 size_in_elements = ShapeUtil::ElementsIn(activation_shape);
   const int64 feature_count = activation_shape.dimensions(feature_index);
-  auto elements_per_feature =
-      computation_->AddInstruction(HloInstruction::CreateConstant(
-          Literal::CreateR0<float>(size_in_elements / feature_count)));
-
-  auto zero = computation_->AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0(0.0f)));
-
-  auto neg_half = computation_->AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0(-0.5f)));
-
-  auto epsilon = computation_->AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0(batch_norm->epsilon())));
+  auto elements_per_feature_literal =
+      Literal::CreateR0<float>(size_in_elements / feature_count);
+  TF_ASSIGN_OR_RETURN(elements_per_feature_literal,
+                      elements_per_feature_literal->Convert(ptype));
+  auto elements_per_feature = add(
+      HloInstruction::CreateConstant(std::move(elements_per_feature_literal)));
+
+  auto zero_literal = Literal::CreateR0(0.0f);
+  TF_ASSIGN_OR_RETURN(zero_literal, zero_literal->Convert(ptype));
+  auto zero = add(HloInstruction::CreateConstant(std::move(zero_literal)));
+
+  auto neg_half_literal = Literal::CreateR0(-0.5f);
+  TF_ASSIGN_OR_RETURN(neg_half_literal, neg_half_literal->Convert(ptype));
+  auto neg_half =
+      add(HloInstruction::CreateConstant(std::move(neg_half_literal)));
+
+  auto epsilon_literal = Literal::CreateR0(batch_norm->epsilon());
+  TF_ASSIGN_OR_RETURN(epsilon_literal, epsilon_literal->Convert(ptype));
+  auto epsilon =
+      add(HloInstruction::CreateConstant(std::move(epsilon_literal)));
 
   std::vector<int64> dimensions_without_feature;
 
@@ -404,141 +464,146 @@ Status BatchNormRewriterVisitor::HandleBatchNormGrad(
     }
   }
 
-  auto scale_broadcasted =
-      computation_->AddInstruction(HloInstruction::CreateBroadcast(
-          activation_shape, scale, {feature_index}));
-  auto variance_broadcasted =
-      computation_->AddInstruction(HloInstruction::CreateBroadcast(
-          activation_shape, variance, {feature_index}));
+  auto scale_broadcasted = add(HloInstruction::CreateBroadcast(
+      activation_shape, scale, {feature_index}));
+  auto variance_broadcasted = add(HloInstruction::CreateBroadcast(
+      activation_shape, variance, {feature_index}));
 
   // E[X].
-  auto mean_broadcasted = computation_->AddInstruction(
+  auto mean_broadcasted = add(
       HloInstruction::CreateBroadcast(activation_shape, mean, {feature_index}));
 
   // rsqrt[Var[X] + epsilon].
-  auto rsqrt_var_add_epsilon_broadcasted =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          activation_shape, HloOpcode::kPower,
-          computation_->AddInstruction(
-              HloInstruction::CreateBinary(activation_shape, HloOpcode::kAdd,
-                                           variance_broadcasted, epsilon)),
-          neg_half));
-
-  auto rsqrt_var_add_epsilon =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          feature_shape, HloOpcode::kPower,
-          computation_->AddInstruction(HloInstruction::CreateBinary(
-              feature_shape, HloOpcode::kAdd, variance, epsilon)),
-          neg_half));
+  auto rsqrt_var_add_epsilon_broadcasted = add(HloInstruction::CreateBinary(
+      activation_shape, HloOpcode::kPower,
+      add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kAdd,
+                                       variance_broadcasted, epsilon)),
+      neg_half));
+
+  auto rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary(
+      feature_shape, HloOpcode::kPower,
+      add(HloInstruction::CreateBinary(feature_shape, HloOpcode::kAdd, variance,
+                                       epsilon)),
+      neg_half));
 
   // X - E[X].
-  auto activation_minus_mean = computation_->AddInstruction(
-      HloInstruction::CreateBinary(activation_shape, HloOpcode::kSubtract,
-                                   activation, mean_broadcasted));
+  auto activation_minus_mean = add(HloInstruction::CreateBinary(
+      activation_shape, HloOpcode::kSubtract, activation, mean_broadcasted));
 
   // Grad[Y] * (X - E[X]).
-  auto grad_output_times_activiation_minus_mean = computation_->AddInstruction(
-      HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply,
-                                   grad_output, activation_minus_mean));
+  auto grad_output_times_activiation_minus_mean =
+      add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply,
+                                       grad_output, activation_minus_mean));
 
   HloComputation* add_reduce_computation =
-      GetScalarBinaryComputation(F32, HloOpcode::kAdd);
+      GetScalarBinaryComputation(ptype, HloOpcode::kAdd);
 
   // sum(Grad[Y] * (X - E[X])).
   auto sum_grad_output_times_activiation_minus_mean =
-      computation_->AddInstruction(HloInstruction::CreateReduce(
+      add(HloInstruction::CreateReduce(
           feature_shape, grad_output_times_activiation_minus_mean, zero,
           dimensions_without_feature, add_reduce_computation));
 
   // Grad[beta] = Sum(Grad[Y]).
-  auto grad_beta = computation_->AddInstruction(HloInstruction::CreateReduce(
+  auto grad_beta = add(HloInstruction::CreateReduce(
       feature_shape, grad_output, zero, dimensions_without_feature,
       add_reduce_computation));
 
-  if (use_fusion_) {
-    auto tuple = computation_->AddInstruction(HloInstruction::CreateTuple(
+  if (use_fusion_ && !batch_norm->has_sharding()) {
+    auto tuple = add(HloInstruction::CreateTuple(
         {sum_grad_output_times_activiation_minus_mean, grad_beta}));
 
     auto fused = computation_->CreateFusionInstruction(
         {tuple, sum_grad_output_times_activiation_minus_mean, grad_beta},
         HloInstruction::FusionKind::kInput);
 
-    sum_grad_output_times_activiation_minus_mean = computation_->AddInstruction(
-        HloInstruction::CreateGetTupleElement(feature_shape, fused, 0));
+    sum_grad_output_times_activiation_minus_mean =
+        add(HloInstruction::CreateGetTupleElement(feature_shape, fused, 0));
 
-    grad_beta = computation_->AddInstruction(
-        HloInstruction::CreateGetTupleElement(feature_shape, fused, 1));
+    grad_beta =
+        add(HloInstruction::CreateGetTupleElement(feature_shape, fused, 1));
   }
 
   // Grad[scale] = Sum(Grad[Y] * (X - E[X]) * rsqrt[Var[X] + epsilon]).
-  auto grad_scale = computation_->AddInstruction(HloInstruction::CreateBinary(
+  auto grad_scale = add(HloInstruction::CreateBinary(
       feature_shape, HloOpcode::kMultiply,
       sum_grad_output_times_activiation_minus_mean, rsqrt_var_add_epsilon));
 
   // I2 = Sum(Grad[Y])
-  auto I2 = computation_->AddInstruction(HloInstruction::CreateBroadcast(
-      activation_shape, grad_beta, {feature_index}));
+  auto i2 = add(HloInstruction::CreateBroadcast(activation_shape, grad_beta,
+                                                {feature_index}));
 
   // I3 = Sum(Grad[Y] * (X - E[X]))
-  auto I3 = computation_->AddInstruction(HloInstruction::CreateBroadcast(
+  auto i3 = add(HloInstruction::CreateBroadcast(
       activation_shape, sum_grad_output_times_activiation_minus_mean,
       {feature_index}));
 
   // I4 = (X - E[X]) * I3
-  auto I4 = computation_->AddInstruction(HloInstruction::CreateBinary(
-      activation_shape, HloOpcode::kMultiply, I3, activation_minus_mean));
+  auto i4 = add(HloInstruction::CreateBinary(
+      activation_shape, HloOpcode::kMultiply, i3, activation_minus_mean));
 
   // I5 = I4 / (Var[X] + epsilon)
-  auto I5 = computation_->AddInstruction(HloInstruction::CreateBinary(
-      activation_shape, HloOpcode::kDivide, I4,
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          activation_shape, HloOpcode::kAdd, variance_broadcasted, epsilon))));
+  auto i5 = add(HloInstruction::CreateBinary(
+      activation_shape, HloOpcode::kDivide, i4,
+      add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kAdd,
+                                       variance_broadcasted, epsilon))));
 
   // scale * rsqrt[Var[X] + epsilon] * 1/N
-  auto scale_times_rsqrt_var_add_epsilon =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          activation_shape, HloOpcode::kMultiply, scale_broadcasted,
-          rsqrt_var_add_epsilon_broadcasted));
+  auto scale_times_rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary(
+      activation_shape, HloOpcode::kMultiply, scale_broadcasted,
+      rsqrt_var_add_epsilon_broadcasted));
 
-  scale_times_rsqrt_var_add_epsilon =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          activation_shape, HloOpcode::kDivide,
-          scale_times_rsqrt_var_add_epsilon, elements_per_feature));
+  scale_times_rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary(
+      activation_shape, HloOpcode::kDivide, scale_times_rsqrt_var_add_epsilon,
+      elements_per_feature));
 
-  auto I1 = computation_->AddInstruction(
-      HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply,
-                                   grad_output, elements_per_feature));
+  auto i1 =
+      add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply,
+                                       grad_output, elements_per_feature));
 
   // I6 = I1 - I2 - I5
-  auto I6 = computation_->AddInstruction(HloInstruction::CreateBinary(
+  auto i6 = add(HloInstruction::CreateBinary(
       activation_shape, HloOpcode::kSubtract,
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          activation_shape, HloOpcode::kSubtract, I1, I2)),
-      I5));
+      add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kSubtract,
+                                       i1, i2)),
+      i5));
 
   // Grad[X] = scale * rsqrt[Var[X] + epsilon] * 1/N * I6.
-  auto grad_activation = computation_->AddInstruction(
-      HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply,
-                                   scale_times_rsqrt_var_add_epsilon, I6));
+  auto grad_activation =
+      add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply,
+                                       scale_times_rsqrt_var_add_epsilon, i6));
+  auto tuple =
+      HloInstruction::CreateTuple({grad_activation, grad_scale, grad_beta});
+  if (batch_norm->has_sharding()) {
+    int64 instruction_count_after = computation_->instruction_count();
+    CHECK_EQ(instruction_count_after,
+             instruction_count_before + added_instructions.size());
+    for (HloInstruction* inst : added_instructions) {
+      if (ShapeUtil::Equal(inst->shape(), activation_shape)) {
+        inst->set_sharding(batch_norm->sharding());
+      } else {
+        inst->set_sharding(HloSharding::Replicate());
+      }
+    }
+    tuple->set_sharding(batch_norm->sharding());
+  }
 
-  TF_CHECK_OK(ReplaceWithNewInstruction(
-      batch_norm,
-      HloInstruction::CreateTuple({grad_activation, grad_scale, grad_beta})));
+  TF_CHECK_OK(ReplaceWithNewInstruction(batch_norm, std::move(tuple)));
 
   return Status::OK();
 }
 
-StatusOr<bool> BatchNormRewriter::Run(HloModule* module) {
-  XLA_VLOG_LINES(2, "BatchNormRewriter::Run(), before:\n" + module->ToString());
+StatusOr<bool> BatchNormExpander::Run(HloModule* module) {
+  XLA_VLOG_LINES(2, "BatchNormExpander::Run(), before:\n" + module->ToString());
   bool changed = false;
   for (auto* comp : module->MakeNonfusionComputations()) {
-    if (BatchNormRewriterVisitor::Run(comp, rewrite_training_op_,
+    if (BatchNormExpanderVisitor::Run(comp, rewrite_training_op_,
                                       rewrite_inference_op_, rewrite_grad_op_,
                                       use_fusion_)) {
       changed = true;
     }
   }
-  XLA_VLOG_LINES(2, "BatchNormRewriter::Run(), after:\n" + module->ToString());
+  XLA_VLOG_LINES(2, "BatchNormExpander::Run(), after:\n" + module->ToString());
   return changed;
 }
 
diff --git a/tensorflow/compiler/xla/service/batchnorm_rewriter.h b/tensorflow/compiler/xla/service/batchnorm_expander.h
similarity index 83%
rename from tensorflow/compiler/xla/service/batchnorm_rewriter.h
rename to tensorflow/compiler/xla/service/batchnorm_expander.h
index f601741d964376058a2bafade311ede4c8567fd2..4ad987085da91684bb7891070afeefd19be4138f 100644
--- a/tensorflow/compiler/xla/service/batchnorm_rewriter.h
+++ b/tensorflow/compiler/xla/service/batchnorm_expander.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_BATCHNORM_REWRITER_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_BATCHNORM_REWRITER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_BATCHNORM_EXPANDER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_BATCHNORM_EXPANDER_H_
 
 #include <utility>
 
@@ -26,18 +26,18 @@ namespace xla {
 // A pass which rewrites batch norm operations into more operations. Breaking a
 // big operation into smaller operations helps leverage our generic fusion
 // logic.
-class BatchNormRewriter : public HloPassInterface {
+class BatchNormExpander : public HloPassInterface {
  public:
   // When use_fusion is set, a multi-output fusion node is created.
-  BatchNormRewriter(bool rewrite_training_op = false,
+  BatchNormExpander(bool rewrite_training_op = false,
                     bool rewrite_inference_op = false,
                     bool rewrite_grad_op = false, bool use_fusion = true)
       : rewrite_training_op_(rewrite_training_op),
         rewrite_inference_op_(rewrite_inference_op),
         rewrite_grad_op_(rewrite_grad_op),
         use_fusion_(use_fusion) {}
-  ~BatchNormRewriter() = default;
-  tensorflow::StringPiece name() const override { return "batchnorm_rewriter"; }
+  ~BatchNormExpander() = default;
+  tensorflow::StringPiece name() const override { return "batchnorm_expander"; }
 
   // Run operation expander on the given computation. Returns whether the
   // computation was changed.
@@ -52,4 +52,4 @@ class BatchNormRewriter : public HloPassInterface {
 
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_BATCHNORM_REWRITER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_BATCHNORM_EXPANDER_H_
diff --git a/tensorflow/compiler/xla/service/batchnorm_rewriter_test.cc b/tensorflow/compiler/xla/service/batchnorm_expander_test.cc
similarity index 93%
rename from tensorflow/compiler/xla/service/batchnorm_rewriter_test.cc
rename to tensorflow/compiler/xla/service/batchnorm_expander_test.cc
index 590f79aee51ccf410823b91fd8ad09fc7c429c7d..aa36e64b07099a372dab67babc7a18a2d39596bc 100644
--- a/tensorflow/compiler/xla/service/batchnorm_rewriter_test.cc
+++ b/tensorflow/compiler/xla/service/batchnorm_expander_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/batchnorm_rewriter.h"
+#include "tensorflow/compiler/xla/service/batchnorm_expander.h"
 
 #include <memory>
 #include <utility>
@@ -36,10 +36,10 @@ limitations under the License.
 namespace xla {
 namespace {
 
-using BatchNormRewriterTest = HloTestBase;
+using BatchNormExpanderTest = HloTestBase;
 
 // Test that we expand BatchNormTraining.
-TEST_F(BatchNormRewriterTest, BatchNormTraining) {
+TEST_F(BatchNormExpanderTest, BatchNormTraining) {
   Shape input_shape = ShapeUtil::MakeShape(F32, {2, 2, 2, 2});
   Shape scale_shape = ShapeUtil::MakeShape(F32, {2});
   Shape offset_shape = ShapeUtil::MakeShape(F32, {2});
@@ -63,7 +63,7 @@ TEST_F(BatchNormRewriterTest, BatchNormTraining) {
   auto computation = module->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kBatchNormTraining);
-  BatchNormRewriter rewriter(/*rewrite_training_op=*/true,
+  BatchNormExpander rewriter(/*rewrite_training_op=*/true,
                              /*rewrite_inference_op=*/true,
                              /*rewrite_grad_op=*/true);
   ASSERT_TRUE(rewriter.Run(module.get()).ValueOrDie());
@@ -73,7 +73,7 @@ TEST_F(BatchNormRewriterTest, BatchNormTraining) {
 }
 
 // Test that we expand BatchNormGrad.
-TEST_F(BatchNormRewriterTest, BatchNormGrad) {
+TEST_F(BatchNormExpanderTest, BatchNormGrad) {
   Shape input_shape = ShapeUtil::MakeShape(F32, {2, 2, 2, 2});
   Shape scale_shape = ShapeUtil::MakeShape(F32, {2});
   Shape mean_shape = ShapeUtil::MakeShape(F32, {2});
@@ -105,7 +105,7 @@ TEST_F(BatchNormRewriterTest, BatchNormGrad) {
   auto computation = module->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kBatchNormGrad);
-  BatchNormRewriter rewriter(/*rewrite_training_op=*/true,
+  BatchNormExpander rewriter(/*rewrite_training_op=*/true,
                              /*rewrite_inference_op=*/true,
                              /*rewrite_grad_op=*/true);
   ASSERT_TRUE(rewriter.Run(module.get()).ValueOrDie());
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index 033034b4210fa1bd3ae78f0ef869ec2be879f229..7ece79d781acfaffc21d6a29e8a12e68622a1617 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -581,6 +581,7 @@ Status GatherComputationsByAllocationType(
            instruction->called_computations()) {
         switch (instruction->opcode()) {
           case HloOpcode::kCall:
+          case HloOpcode::kConditional:
           case HloOpcode::kWhile:
             // Call and while must be called from a computation with global
             // allocations as they may return references to buffers inside the
@@ -976,8 +977,8 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
   const HloOrdering& hlo_ordering = assignment->liveness().hlo_ordering();
   if (run_whole_module_heap_simulation) {
     // Run the heap simulation over the whole module. This reduces memory usage,
-    // since buffers for kCall and kWhile sub-computations are only live for the
-    // duration of their calling instructions.
+    // since buffers for kCall, kWhile, and kConditional sub-computations are
+    // only live for the duration of their calling instructions.
     VLOG(1) << "Running whole-module heap simulation";
     SequentialHloOrdering::HloModuleSequence module_sequence;
     FlatSet<const LogicalBuffer*> all_buffers_to_assign;
@@ -1265,7 +1266,6 @@ const LogicalBuffer* AddBufferToColocatedSet(
   // CopyInsertion ensures root points-to set is unambiguous and distinct.
   const auto& points_to = points_to_analysis.GetPointsToSet(instruction);
   DCHECK(!points_to.IsAmbiguous());
-  DCHECK(points_to.IsDistinct());
   colocated_set->push_back(points_to.element(index)[0]);
   return colocated_set->back();
 }
@@ -1273,7 +1273,8 @@ const LogicalBuffer* AddBufferToColocatedSet(
 }  // namespace
 
 // Builds sets of buffers in 'colocated_buffer_sets' which should be colocated
-// in the same allocation (currently just supports kWhile and kCall).
+// in the same allocation (currently just supports kWhile, kCall, and
+// kConditional).
 void BufferAssigner::BuildColocatedBufferSets(
     const HloModule* module, const BufferLiveness& buffer_liveness,
     const LogicalBuffer::SizeFunction& buffer_size,
@@ -1337,6 +1338,26 @@ void BufferAssigner::BuildColocatedBufferSets(
                                       &colocated_set);
               AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets);
             });
+      } else if (opcode == HloOpcode::kConditional) {
+        const HloInstruction* conditional_hlo = instruction;
+        ShapeUtil::ForEachSubshape(
+            conditional_hlo->shape(),
+            [this, conditional_hlo, &points_to_analysis, colocated_buffer_sets](
+                const Shape& /*subshape*/, const ShapeIndex& index) {
+              std::vector<const LogicalBuffer*> colocated_set;
+              // Add conditional.result.
+              AddBufferToColocatedSet(conditional_hlo, index,
+                                      points_to_analysis, &colocated_set);
+              // Add conditional.true_computation.root.
+              AddBufferToColocatedSet(
+                  conditional_hlo->true_computation()->root_instruction(),
+                  index, points_to_analysis, &colocated_set);
+              // Add conditional.false_computation.root.
+              AddBufferToColocatedSet(
+                  conditional_hlo->false_computation()->root_instruction(),
+                  index, points_to_analysis, &colocated_set);
+              AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets);
+            });
       }
     }
   }
diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
index 89410f42bd7b5fa8f9b380c868fcd4fedb54576c..6fc9d783f1b34de8c0f93c6aa342591891d08eaf 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
@@ -85,7 +85,7 @@ class BufferAssignmentTest : public HloTestBase {
   std::unique_ptr<BufferAssignment> RunBufferAssignment(HloModule* module,
                                                         int64 alignment = 1) {
     return BufferAssigner::Run(
-               module, MakeUnique<DependencyHloOrdering>(module),
+               module, xla::MakeUnique<DependencyHloOrdering>(module),
                backend().compiler()->BufferSizeBytesFunction(),
                [alignment](LogicalBuffer::Color) { return alignment; })
         .ConsumeValueOrDie();
@@ -94,7 +94,7 @@ class BufferAssignmentTest : public HloTestBase {
   std::unique_ptr<BufferAssignment> RunColoredBufferAssignment(
       HloModule* module, BufferLiveness::Colorer colorer, int64 alignment = 1) {
     return BufferAssigner::Run(
-               module, MakeUnique<DependencyHloOrdering>(module),
+               module, xla::MakeUnique<DependencyHloOrdering>(module),
                backend().compiler()->BufferSizeBytesFunction(),
                [alignment](LogicalBuffer::Color) { return alignment; }, false,
                std::move(colorer))
@@ -166,6 +166,15 @@ class BufferAssignmentTest : public HloTestBase {
     return builder.Build();
   }
 
+  std::unique_ptr<HloComputation> BuildR0F32UnaryOpComputation(
+      HloOpcode opcode, const string& name) {
+    auto builder = HloComputation::Builder(name);
+    auto param =
+        builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "x"));
+    builder.AddInstruction(HloInstruction::CreateUnary(r0f32_, opcode, param));
+    return builder.Build();
+  }
+
   // Verifies that the given instruction hlo has a valid input buffer assigned,
   // i.e., the parameter number matches the op's.
   const BufferAllocation& GetAssignedInputAllocation(
@@ -740,6 +749,56 @@ TEST_F(BufferAssignmentTest, ExampleWhile) {
             << " instructions; total buffer size " << size0 + sizec + sizeb;
 }
 
+TEST_F(BufferAssignmentTest, ExampleConditional) {
+  auto module = CreateNewModule();
+  auto true_computation = module->AddEmbeddedComputation(
+      BuildR0F32UnaryOpComputation(HloOpcode::kCeil, "Ceil"));
+  auto false_computation = module->AddEmbeddedComputation(
+      BuildR0F32UnaryOpComputation(HloOpcode::kFloor, "Floor"));
+
+  auto builder = HloComputation::Builder(TestName());
+  auto pred = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+  auto const1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(56.4f)));
+  auto const2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(12.4f)));
+  auto conditional = builder.AddInstruction(HloInstruction::CreateConditional(
+      r0f32_, pred, const1, true_computation, const2, false_computation));
+  module->AddEntryComputation(builder.Build());
+
+  const std::vector<const HloInstruction*> conditional_instrs =
+      GetInstructions(conditional);
+  const std::vector<const HloInstruction*> true_instrs =
+      GetInstructions(true_computation->root_instruction());
+  const std::vector<const HloInstruction*> false_instrs =
+      GetInstructions(false_computation->root_instruction());
+  EXPECT_EQ(4, conditional_instrs.size());
+  EXPECT_EQ(2, true_instrs.size());
+  EXPECT_EQ(2, false_instrs.size());
+
+  auto buffers = RunBufferAssignment(module.get());
+  ValidateBuffers(conditional_instrs, *buffers);
+  ValidateBuffers(true_instrs, *buffers);
+  ValidateBuffers(false_instrs, *buffers);
+
+  EXPECT_FALSE(BuffersDistinct(conditional_instrs, true_instrs, *buffers))
+      << "Should be reuse between conditional and true computation.";
+  EXPECT_FALSE(BuffersDistinct(conditional_instrs, false_instrs, *buffers))
+      << "Should be reuse between conditional and false computation.";
+  EXPECT_FALSE(BuffersDistinct(true_instrs, false_instrs, *buffers))
+      << "Should be reuse between true and false computations.";
+
+  const BufferAllocation& conditional_buffer =
+      GetTopLevelAllocation(*buffers, conditional);
+  const BufferAllocation& true_buffer =
+      GetTopLevelAllocation(*buffers, true_computation->root_instruction());
+  const BufferAllocation& false_buffer =
+      GetTopLevelAllocation(*buffers, false_computation->root_instruction());
+  EXPECT_EQ(conditional_buffer.size(), true_buffer.size());
+  EXPECT_EQ(conditional_buffer.size(), false_buffer.size());
+}
+
 TEST_F(BufferAssignmentTest, UnaryOpReuseChain) {
   // param0[100] ---> (exp) ---> (tanh) ---> (exp) ---> (neg)
   auto builder = HloComputation::Builder(TestName());
@@ -1360,10 +1419,13 @@ TEST_F(BufferAssignmentTest, OneTempAllocation) {
       HloInstruction::CreateParameter(1, shape_3x4, "param_b"));
   auto param_c = builder.AddInstruction(
       HloInstruction::CreateParameter(2, shape_4x4, "param_c"));
-  auto dot_ab = builder.AddInstruction(HloInstruction::CreateBinary(
-      shape_2x4, HloOpcode::kDot, param_a, param_b));
-  auto dot_bc = builder.AddInstruction(HloInstruction::CreateBinary(
-      shape_3x4, HloOpcode::kDot, param_b, param_c));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto dot_ab = builder.AddInstruction(
+      HloInstruction::CreateDot(shape_2x4, param_a, param_b, dot_dnums));
+  auto dot_bc = builder.AddInstruction(
+      HloInstruction::CreateDot(shape_3x4, param_b, param_c, dot_dnums));
   builder.AddInstruction(
       HloInstruction::CreateConcatenate(shape_5x4, {dot_ab, dot_bc}, 1));
 
@@ -1448,7 +1510,7 @@ class WhileBufferAssignmentTest : public HloTestBase {
     auto sequence =
         CreateMemoryMinimizingSequence(*module, ByteSizeOf).ConsumeValueOrDie();
     return BufferAssigner::Run(
-               module, MakeUnique<SequentialHloOrdering>(module, sequence),
+               module, xla::MakeUnique<SequentialHloOrdering>(module, sequence),
                ByteSizeOf,
                [alignment](LogicalBuffer::Color) { return alignment; })
         .ConsumeValueOrDie();
@@ -1469,7 +1531,7 @@ static void RunCopyInsertion(HloModule* module) {
 }
 
 TEST_F(WhileBufferAssignmentTest, TwoForwardWhileLoops) {
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = xla::MakeUnique<HloModule>(TestName());
   auto builder = HloComputation::Builder("entry");
 
   auto input0 = builder.AddInstruction(
@@ -1526,7 +1588,7 @@ TEST_F(WhileBufferAssignmentTest, TwoForwardWhileLoops) {
 }
 
 TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) {
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = xla::MakeUnique<HloModule>(TestName());
   auto builder = HloComputation::Builder("entry");
 
   auto input0 = builder.AddInstruction(
@@ -1538,8 +1600,6 @@ TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) {
       HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0)));
   auto output0 = builder.AddInstruction(
       HloInstruction::CreateBroadcast(data_shape_, zero, {1}));
-  auto output1 = builder.AddInstruction(
-      HloInstruction::CreateBroadcast(data_shape_, zero, {1}));
 
   auto cond0 =
       module->AddEmbeddedComputation(BuildWhileConditionComputation("cond"));
@@ -1556,10 +1616,8 @@ TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) {
   auto body1 =
       module->AddEmbeddedComputation(BuildWhileBodyComputation("body"));
 
-  auto tuple1 = builder.AddInstruction(
-      HloInstruction::CreateTuple({input0, weights0, output1}));
   auto while1 = builder.AddInstruction(
-      HloInstruction::CreateWhile(loop_state_shape_, cond1, body1, tuple1));
+      HloInstruction::CreateWhile(loop_state_shape_, cond1, body1, while0));
 
   module->AddEntryComputation(builder.Build());
   RunCopyInsertion(module.get());
@@ -1575,7 +1633,7 @@ TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) {
 }
 
 TEST_F(BufferAssignmentTest, TwoCalls) {
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = xla::MakeUnique<HloModule>(TestName());
   Shape r0f32 = ShapeUtil::MakeShape(xla::F32, {});
   HloComputation* sub_computation;
   {
@@ -1640,7 +1698,7 @@ static bool IsPostOrderTraversal(
 }
 
 TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) {
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = xla::MakeUnique<HloModule>(TestName());
   auto builder = HloComputation::Builder(TestName());
 
   auto zero = builder.AddInstruction(
@@ -1676,11 +1734,14 @@ TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) {
   auto while1 = builder.AddInstruction(
       HloInstruction::CreateWhile(loop_state_shape_, cond, body, tuple1));
 
+  auto gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(data_shape_, while0, 0));
+  auto gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(data_shape_, while1, 1));
   auto root_add = builder.AddInstruction(HloInstruction::CreateBinary(
-      while0->shape(), HloOpcode::kAdd, while0, while1));
-  module->AddEntryComputation(builder.Build());
+      while0->shape(), HloOpcode::kAdd, gte0, gte1));
 
-  RunCopyInsertion(module.get());
+  module->AddEntryComputation(builder.Build());
 
   {
     FlattenCallGraph flatten;
@@ -1688,84 +1749,35 @@ TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) {
     EXPECT_TRUE(result);
   }
 
+  RunCopyInsertion(module.get());
+
   auto sequence =
       CreateMemoryMinimizingSequence(*module, ByteSizeOf).ConsumeValueOrDie();
 
   // To trigger b/38494731, we want a specific Hlo sequence for the
   // root computation, so we overwrite that entry with a manually
   // crafted sequence.
-  std::vector<const HloInstruction*> sequence_for_buffer_assigment = {
-      input1,   weights1, one,     output1, tuple1, while1,  input0,
-      weights0, zero,     output0, tuple0,  while0, root_add};
+  sequence[module->entry_computation()] = {
+      input1, weights1, one,     output1, while1->operand(0), while1,
+      input0, weights0, zero,    output0, while0->operand(0), while0,
+      gte0,   gte1,     root_add};
 
   // If this ASSERT_TRUE fails, we constructed a bogus sequence above
   // and this test itself is buggy.
-  ASSERT_TRUE(IsPostOrderTraversal(sequence_for_buffer_assigment));
-
-  sequence[module->entry_computation()] =
-      std::move(sequence_for_buffer_assigment);
+  ASSERT_TRUE(IsPostOrderTraversal(sequence[module->entry_computation()]));
 
   auto assignment =
       BufferAssigner::Run(
           module.get(),
-          MakeUnique<SequentialHloOrdering>(module.get(), sequence), ByteSizeOf,
-          [](LogicalBuffer::Color) { return 1; })
+          xla::MakeUnique<SequentialHloOrdering>(module.get(), sequence),
+          ByteSizeOf, [](LogicalBuffer::Color) { return 1; })
           .ConsumeValueOrDie();
 
   EXPECT_TRUE(BuffersDistinct({while0}, {while1}, *assignment));
 }
 
-// Test buffer assignment for while nodes with multiple uses.
-// TODO(b/37245345): Fix buffer assignment for this case.
-TEST_F(WhileBufferAssignmentTest, DISABLED_TwoWhiles) {
-  auto module = MakeUnique<HloModule>(TestName());
-  auto builder = HloComputation::Builder(TestName());
-
-  auto input0 = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, data_shape_, "input0"));
-  auto weights0 = builder.AddInstruction(
-      HloInstruction::CreateParameter(1, data_shape_, "weights0"));
-
-  auto zero = builder.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0)));
-  auto output0 = builder.AddInstruction(
-      HloInstruction::CreateBroadcast(data_shape_, zero, {1}));
-
-  auto cond0 =
-      module->AddEmbeddedComputation(BuildWhileConditionComputation("cond"));
-  auto body0 =
-      module->AddEmbeddedComputation(BuildWhileBodyComputation("body"));
-
-  auto tuple0 = builder.AddInstruction(
-      HloInstruction::CreateTuple({input0, weights0, output0}));
-  auto while0 = builder.AddInstruction(
-      HloInstruction::CreateWhile(loop_state_shape_, cond0, body0, tuple0));
-  auto while1 = builder.AddInstruction(
-      HloInstruction::CreateWhile(loop_state_shape_, cond0, body0, while0));
-
-  auto get0 = builder.AddInstruction(
-      HloInstruction::CreateGetTupleElement(data_shape_, while0, 2));
-  auto get1 = builder.AddInstruction(
-      HloInstruction::CreateGetTupleElement(data_shape_, while1, 2));
-  builder.AddInstruction(
-      HloInstruction::CreateBinary(data_shape_, HloOpcode::kAdd, get0, get1));
-  module->AddEntryComputation(builder.Build());
-
-  RunCopyInsertion(module.get());
-
-  {
-    FlattenCallGraph flatten;
-    TF_ASSERT_OK_AND_ASSIGN(bool result, flatten.Run(module.get()));
-    EXPECT_TRUE(result);
-  }
-
-  auto assignment = RunBufferAssignment(module.get());
-
-  EXPECT_TRUE(BuffersDistinct({while0}, {while1}, *assignment));
-}
-
 TEST_F(WhileBufferAssignmentTest, WhilesDontShareEntryParamIfLiveOut) {
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = xla::MakeUnique<HloModule>(TestName());
   auto builder = HloComputation::Builder("entry");
 
   auto input0 = builder.AddInstruction(
diff --git a/tensorflow/compiler/xla/service/buffer_liveness_test.cc b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
index 56600b583803e23324db778959de620440fce5cf..13825fe05bb1b98045f1a3dac3d7272a2d1151fb 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
@@ -120,7 +120,7 @@ TEST_F(BufferLivenessTest, ElementwiseChain) {
 
   auto liveness =
       BufferLiveness::Run(module.get(),
-                          MakeUnique<DependencyHloOrdering>(module.get()))
+                          xla::MakeUnique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
   EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, negate));
@@ -167,10 +167,10 @@ TEST_F(BufferLivenessTest, MultipleEntryParameters_Sequential) {
 
   SequentialHloOrdering::HloModuleSequence sequence;
   sequence.insert({entry, {param0, negate, param1, exp, add}});
-  auto liveness = BufferLiveness::Run(
-                      module.get(),
-                      MakeUnique<SequentialHloOrdering>(module.get(), sequence))
-                      .ConsumeValueOrDie();
+  auto liveness =
+      BufferLiveness::Run(module.get(), xla::MakeUnique<SequentialHloOrdering>(
+                                            module.get(), sequence))
+          .ConsumeValueOrDie();
 
   // Entry parameters interfere as if they are defined simultaneously at
   // the very beginning.
@@ -216,7 +216,7 @@ TEST_F(BufferLivenessTest, NonElementwiseOperand) {
 
   auto liveness =
       BufferLiveness::Run(module.get(),
-                          MakeUnique<DependencyHloOrdering>(module.get()))
+                          xla::MakeUnique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
   EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, exp));
@@ -250,7 +250,7 @@ TEST_F(BufferLivenessTest, OverlappedBuffers) {
 
   auto liveness =
       BufferLiveness::Run(module.get(),
-                          MakeUnique<DependencyHloOrdering>(module.get()))
+                          xla::MakeUnique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
   EXPECT_TRUE(InstructionsMayInterfere(*liveness, param, negate));
@@ -294,7 +294,7 @@ TEST_F(BufferLivenessTest, OverlappedBuffersSequentialOrder) {
   std::vector<const HloInstruction*> order = {param, negate, exp, add};
   module_sequence.emplace(computation, order);
   auto liveness =
-      BufferLiveness::Run(module.get(), MakeUnique<SequentialHloOrdering>(
+      BufferLiveness::Run(module.get(), xla::MakeUnique<SequentialHloOrdering>(
                                             module.get(), module_sequence))
           .ConsumeValueOrDie();
 
@@ -334,7 +334,7 @@ TEST_F(BufferLivenessTest, TupleLiveOut) {
 
   auto liveness =
       BufferLiveness::Run(module.get(),
-                          MakeUnique<DependencyHloOrdering>(module.get()))
+                          xla::MakeUnique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
   // All buffers should be live out except the param
@@ -370,7 +370,7 @@ TEST_F(BufferLivenessTest, EmbeddedComputation) {
 
   auto liveness =
       BufferLiveness::Run(module.get(),
-                          MakeUnique<DependencyHloOrdering>(module.get()))
+                          xla::MakeUnique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
   // Buffers in different computations should always interfere.
@@ -409,7 +409,7 @@ TEST_F(BufferLivenessTest, TupleConstantLiveOut) {
 
   auto liveness =
       BufferLiveness::Run(module.get(),
-                          MakeUnique<DependencyHloOrdering>(module.get()))
+                          xla::MakeUnique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
   // Only the element buffers of the tuple constant which are pointed to by
@@ -474,7 +474,7 @@ TEST_F(BufferLivenessTest, IndependentTupleElements) {
 
   auto liveness =
       BufferLiveness::Run(module.get(),
-                          MakeUnique<DependencyHloOrdering>(module.get()))
+                          xla::MakeUnique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
   // We compare tuple element pairs that are input/output to the computation:
@@ -536,7 +536,7 @@ TEST_F(BufferLivenessTest, DependentTupleElements) {
 
   auto liveness =
       BufferLiveness::Run(module.get(),
-                          MakeUnique<DependencyHloOrdering>(module.get()))
+                          xla::MakeUnique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
   // We compare tuple element pairs that are input/output to the computation:
@@ -624,8 +624,8 @@ class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest {
 
     // Run BufferLiveness on 'module'.
     auto liveness =
-        BufferLiveness::Run(module.get(),
-                            MakeUnique<DependencyHloOrdering>(module.get()))
+        BufferLiveness::Run(
+            module.get(), xla::MakeUnique<DependencyHloOrdering>(module.get()))
             .ConsumeValueOrDie();
     // Return whether or not buffers interference is detected between
     // 'tuple_param0' and 'tuple_root' at shape index '{1}'.
@@ -736,8 +736,8 @@ class DynamicUpdateSliceLivenessTest : public BufferLivenessTest {
     module->AddEmbeddedComputation(builder.Build());
     // Run BufferLiveness on 'module'.
     auto liveness =
-        BufferLiveness::Run(module.get(),
-                            MakeUnique<DependencyHloOrdering>(module.get()))
+        BufferLiveness::Run(
+            module.get(), xla::MakeUnique<DependencyHloOrdering>(module.get()))
             .ConsumeValueOrDie();
     // Return whether or not buffers interference is detected between
     // 'tuple_param0' and 'tuple_root' at shape index '{1}'.
diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc
index 1adecdb939cb2c1259003d3be2c90b5a299b0f30..13eb02ca012f44b2b5ed7c6f5becb7d54b07c33c 100644
--- a/tensorflow/compiler/xla/service/call_graph.cc
+++ b/tensorflow/compiler/xla/service/call_graph.cc
@@ -54,6 +54,7 @@ std::ostream& operator<<(std::ostream& out, const CallContext& context) {
 CallContext GetInstructionCallContext(const HloInstruction* instruction) {
   switch (instruction->opcode()) {
     case HloOpcode::kCall:
+    case HloOpcode::kConditional:
     case HloOpcode::kWhile:
       return CallContext::kSequential;
     case HloOpcode::kMap:
diff --git a/tensorflow/compiler/xla/service/call_graph_test.cc b/tensorflow/compiler/xla/service/call_graph_test.cc
index 0395ea8c8b52315f7ca2221f412750ebadda2dd8..1ea7d538cd515c3098b6a1f03c6146d288330406 100644
--- a/tensorflow/compiler/xla/service/call_graph_test.cc
+++ b/tensorflow/compiler/xla/service/call_graph_test.cc
@@ -34,12 +34,13 @@ using ::testing::UnorderedElementsAre;
 class CallGraphTest : public HloTestBase {
  protected:
   // Build and return a trivial computation taking and returning a scalar.
-  std::unique_ptr<HloComputation> MakeScalarComputation() {
+  std::unique_ptr<HloComputation> MakeScalarComputation(
+      HloOpcode opcode = HloOpcode::kNegate) {
     HloComputation::Builder builder(TestName() + ".ScalarComputation");
     HloInstruction* param0 = builder.AddInstruction(
         HloInstruction::CreateParameter(0, kScalarShape, "param0"));
     builder.AddInstruction(
-        HloInstruction::CreateUnary(kScalarShape, HloOpcode::kNegate, param0));
+        HloInstruction::CreateUnary(kScalarShape, opcode, param0));
     return builder.Build();
   }
 
@@ -236,6 +237,54 @@ TEST_F(CallGraphTest, ContextBothComputations) {
   EXPECT_EQ(CallContext::kBoth, sub_node.context());
 }
 
+TEST_F(CallGraphTest, ComputationWithConditional) {
+  // Test a call graph of a module with a conditional.
+  auto module = CreateNewModule();
+  HloComputation* true_computation =
+      module->AddEmbeddedComputation(MakeScalarComputation(HloOpcode::kCeil));
+  HloComputation* false_computation =
+      module->AddEmbeddedComputation(MakeScalarComputation(HloOpcode::kFloor));
+
+  HloComputation::Builder builder(TestName());
+  HloInstruction* pred = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+  HloInstruction* const1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(56.4f)));
+  HloInstruction* const2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(12.6f)));
+  HloInstruction* conditional =
+      builder.AddInstruction(HloInstruction::CreateConditional(
+          kScalarShape, pred, const1, true_computation, const2,
+          false_computation));
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
+
+  EXPECT_EQ(3, call_graph->nodes().size());
+
+  const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
+  EXPECT_EQ(entry_computation, entry_node.computation());
+  EXPECT_EQ(1, entry_node.callsites().size());
+
+  const CallSite& conditional_callsite = entry_node.callsites()[0];
+  EXPECT_EQ(conditional, conditional_callsite.instruction());
+  EXPECT_THAT(conditional_callsite.called_computations(),
+              UnorderedElementsAre(true_computation, false_computation));
+  EXPECT_EQ(CallContext::kSequential, conditional_callsite.context());
+  EXPECT_EQ(entry_node.GetCallSite(conditional), &conditional_callsite);
+
+  const CallGraphNode& true_node = call_graph->GetNode(true_computation);
+  EXPECT_TRUE(true_node.callees().empty());
+  EXPECT_EQ(1, true_node.callers().size());
+  EXPECT_EQ(entry_computation, true_node.callers()[0]);
+
+  const CallGraphNode& false_node = call_graph->GetNode(false_computation);
+  EXPECT_TRUE(false_node.callees().empty());
+  EXPECT_EQ(1, false_node.callers().size());
+  EXPECT_EQ(entry_computation, false_node.callers()[0]);
+}
+
 TEST_F(CallGraphTest, ComplexGraph) {
   // Test a call graph of a module with several computation called in various
   // contexts. The call graph looks like:
diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h
index 5f021900c8b647077661da1cdec9d462bbb0146e..fc67330f5cbdbcb0d1a259d284599916a908d1fe 100644
--- a/tensorflow/compiler/xla/service/compiler.h
+++ b/tensorflow/compiler/xla/service/compiler.h
@@ -97,21 +97,32 @@ class Compiler {
   // Returns the ID of the platform that this compiler targets.
   virtual perftools::gputools::Platform::Id PlatformId() const = 0;
 
+  // Runs Hlo passes to optimize the given Hlo module, returns the optimized
+  // module.
+  virtual StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
+      std::unique_ptr<HloModule> module,
+      perftools::gputools::StreamExecutor* executor) = 0;
+
   // Compiles the HLO module for execution on a device given by the executor,
-  // and returns an executable object or an error status. Takes ownership of the
-  // HLO module and is free to transform it.
+  // and returns an executable object or an error status. No HLO passes are
+  // applied to module. Generally a module should be passed through RunHloPasses
+  // prior to calling this method because the some HLO passes are required for
+  // correctness. Takes ownership of the HLO module and is free to transform it.
   //
   // The compiler may optionally specialize to the individual device
   // (not just type of device) indicated by the executor.
   //
   // Use the overload below to compile computations that run in parallel.
-  virtual StatusOr<std::unique_ptr<Executable>> Compile(
+  virtual StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> module,
       perftools::gputools::StreamExecutor* executor) = 0;
 
   // Compiles a set of HLO modules that can run in parallel, potentially
   // communicating data between the modules, and returns a corresponding
   // sequence of executable objects.
+  //
+  // TODO(b/68666782): Remove this method after adding support for multiple
+  // modules to RunHloPasses and RunBackends.
   virtual StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::vector<std::unique_ptr<HloModule>> modules,
       std::vector<std::vector<perftools::gputools::StreamExecutor*>>
diff --git a/tensorflow/compiler/xla/service/computation_placer.cc b/tensorflow/compiler/xla/service/computation_placer.cc
index 6b7b0d25e87edf39d9f3c0c19305ebe8f173bafe..657fba6b6231104bf47f9dec80f7cd36a0ba3efd 100644
--- a/tensorflow/compiler/xla/service/computation_placer.cc
+++ b/tensorflow/compiler/xla/service/computation_placer.cc
@@ -52,6 +52,12 @@ Status DeviceAssignment::Serialize(DeviceAssignmentProto* proto) const {
 /* static */ StatusOr<std::unique_ptr<DeviceAssignment>>
 DeviceAssignment::Deserialize(const DeviceAssignmentProto& proto) {
   TF_RET_CHECK(proto.computation_devices_size() == proto.computation_count());
+  if (proto.replica_count() <= 0 || proto.computation_count() <= 0) {
+    return InvalidArgument(
+        "Invalid device assignment topology: replica_count=%d, "
+        "computation_count=%d",
+        proto.replica_count(), proto.computation_count());
+  }
   auto assignment = MakeUnique<DeviceAssignment>(proto.replica_count(),
                                                  proto.computation_count());
   for (int computation = 0; computation < proto.computation_count();
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index 0453a698a09b740d68b35258ede7c537fcf290d4..cd983bc03e993caed883916de01d75dffdbc4bab 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -15,15 +15,17 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/copy_insertion.h"
 
-#include <memory>
-
+#include "tensorflow/compiler/xla/service/hlo_alias_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_dce.h"
+#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_ordering.h"
 #include "tensorflow/compiler/xla/service/liveness_util.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
-#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
+#include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -31,597 +33,1174 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
 
+using ::tensorflow::str_util::Join;
+using ::tensorflow::strings::StrAppend;
+using ::tensorflow::strings::StrCat;
+
 namespace {
 
-using tensorflow::gtl::FlatMap;
-using tensorflow::gtl::FlatSet;
+bool IsEntryParameterValue(const HloValue& value) {
+  const HloComputation* computation = value.defining_instruction()->parent();
+  return value.defining_instruction()->opcode() == HloOpcode::kParameter &&
+         computation == computation->parent()->entry_computation();
+}
+
+bool IsConstantValue(const HloValue& value) {
+  return value.defining_instruction()->opcode() == HloOpcode::kConstant;
+}
+
+bool ValueIsReadOnly(const HloValue& value) {
+  return IsConstantValue(value) || IsEntryParameterValue(value);
+}
 
-// InstructionCopier encapsulates indices at which to copy 'instruction'.
-// All 'instruction' users in 'copy_users' are updated to use the copy.
+// Deep copy the given instructions 'from' and 'to' at the ShapeIndexes given in
+// 'indices_to_copy'. Add control edges from the respective kCopy instructions
+// in deep copy of 'from' to the respective kCopy instruction in the deep copy
+// of 'to'.
 //
-// Instruction copies are generated in two phases:
-// 1) Recording buffer indices at which 'instruction' requires copies (i.e.
-//    setting 'indices_to_copy_[index]'=true).
-// 2) Inserting kCopy instructions based on indices recorded in phase 1).
-//   *) Array instructions are copied by inserting a single kCopy instruction.
-//   *) Tuple-shaped instructions are copied by recursively expanding tuples
-//      (and tuple-shaped elements), and inserting kCopy instructions for any
-//      tuple elements which require a copy. As the recursion unwinds, new tuple
-//      instructions are added to gather the copied (and uncopied) references
-//      into the output tuple (i.e. the copy of the tuple-shaped instruction).
+// Requirements: 'from' and 'to' must have compatible shapes.
 //
-//      Example two-element tuple with one element that needs a copy:
+// For example, suppose 'from' and 'to' are two-element tuples where index 0 is
+// the only index to copy. Prior to deep-copying we have:
 //
-//             original-instruction
-//                   /    \
-//                GTE(0)  GTE(1)
-//                  |       |
-//                 Copy     |
-//                   \     /
-//                    Tuple  // copied-instruction
 //
-//      As an optimization, if the original instruction is itself a Tuple
-//      instruction, we elide the unnecessary extra GTE and Tuple instructions,
-//      and just insert the copy into a new Tuple instruction, with control
-//      dependencies to ensure the copy occurs after any possible interference.
-class InstructionCopier {
- public:
-  InstructionCopier(HloInstruction* instruction,
-                    const std::vector<HloInstruction*>& copy_users)
-      : instruction_(instruction),
-        copy_users_(copy_users),
-        indices_to_copy_(instruction->shape()),
-        control_predecessors_(instruction->shape()) {}
-
-  // Sets indices that are read-only, and thus do not need to be copied.
-  void SetReadOnlyIndices(const ShapeTree<bool>& read_only_indices) {
-    read_only_indices_ = read_only_indices;
-  }
+//      'from'
+//         |
+//        ...
+//         |
+//       'to'
+//
+// DeepCopyAndAddControlEdges produces:
+//
+//       'from'
+//        /   \
+//      GTE   GTE
+//       |     |
+//     Copy    |
+//    /   \   /
+//   |    Tuple
+//   |      |
+//  ctrl   ...
+//  edge    |
+//   |      |
+//   |    'to'
+//   |    /   \
+//   |  GTE   GTE
+//    \  |     |
+//     Copy    |
+//        \   /
+//        Tuple
+//
+StatusOr<std::pair<HloInstruction*, HloInstruction*>>
+DeepCopyAndAddControlEdges(HloInstruction* from, HloInstruction* to,
+                           const ShapeTree<bool>& indices_to_copy) {
+  DCHECK(ShapeUtil::Compatible(from->shape(), to->shape()));
+  // to/from_copy_tree hold the kCopy instruction produces by the deep
+  // copies. Elements which are not copied (indices_to_copy.element(index) ==
+  // false) have nullptr at that index.
+  ShapeTree<HloInstruction*> from_copy_tree(from->shape(),
+                                            /*init_value=*/nullptr);
+  TF_ASSIGN_OR_RETURN(HloInstruction * from_deep_copy,
+                      from->parent()->DeepCopyInstruction(
+                          from, &indices_to_copy, &from_copy_tree));
 
-  // Sets copy overrides, which are copy instructions to use at each index. This
-  // is used to share a single copy of read-only entry parameters and constants
-  // between multiple While loops.
-  void SetCopyOverrides(const ShapeTree<HloInstruction*>& copy_overrides) {
-    copy_overrides_ = copy_overrides;
+  ShapeTree<HloInstruction*> to_copy_tree(to->shape(), /*init_value=*/nullptr);
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * to_deep_copy,
+      to->parent()->DeepCopyInstruction(to, &indices_to_copy, &to_copy_tree));
+
+  // Add control edges between the respective kCopy instructions.
+  for (const auto& pair : from_copy_tree) {
+    const ShapeIndex& index = pair.first;
+    HloInstruction* from_copy = pair.second;
+    HloInstruction* to_copy = to_copy_tree.element(index);
+    if (from_copy == nullptr) {
+      TF_RET_CHECK(to_copy == nullptr);
+      continue;
+    }
+    TF_RET_CHECK(to_copy != nullptr);
+    TF_RETURN_IF_ERROR(from_copy->AddControlDependencyTo(to_copy));
   }
 
-  // Returns true if all recorded indices are false (returns true otherwise).
-  bool HasAllIndicesFalse() const;
+  return std::make_pair(from_deep_copy, to_deep_copy);
+}
 
-  // Records instruction buffer indices which point-to a Parameter or Constant.
-  Status RecordIndicesWhichPointToParamOrConstant(
-      const TuplePointsToAnalysis& points_to_analysis);
+// Compute the indices of the loop state which need copies in order to avoid
+// live range interference. Generally, an element in the loop state does not
+// need to be copied if the element is passed through transparently through the
+// body.
+//
+// Returns whether any indices need to be copied.
+bool IndicesToCopyForWhile(const HloDataflowAnalysis& dataflow,
+                           const HloInstruction* xla_while,
+                           ShapeTree<bool>* indices_to_copy) {
+  DCHECK(ShapeUtil::Compatible(indices_to_copy->shape(), xla_while->shape()));
 
-  // Records instruction buffer indices to copy which are necessary to ensure:
-  // *) PointsToSet of 'instruction_' is unambiguous and distinct.
-  // *) No liveness interference between 'instruction_' and 'other_instruction'.
-  //
-  // If 'read_only_indices_out' is non-null, read-only indices are set to true.
-  Status RecordIndicesToCopyForColocatingBuffers(
-      const BufferLiveness& liveness, const HloInstruction* other_instruction,
-      ShapeTree<bool>* read_only_indices_out);
+  bool any_copies = false;
+  const HloInstruction* init = xla_while->operand(0);
+  for (auto& pair : *indices_to_copy) {
+    const ShapeIndex& index = pair.first;
+    bool& should_copy = pair.second;
+    // If there is any ambiguity, then loop state must be copied.
+    if (dataflow.GetValueSet(init, index).values().size() > 1 ||
+        dataflow.GetValueSet(xla_while, index).values().size() > 1) {
+      should_copy = true;
+    } else {
+      // If the output of the while instruction is not the same as the init
+      // value of the while, then this element is not passed through the body
+      // transparently and must be copied.
+      should_copy = dataflow.GetUniqueValueAt(xla_while, index) !=
+                    dataflow.GetUniqueValueAt(init, index);
+    }
+    any_copies |= should_copy;
+  }
+  return any_copies;
+}
 
-  // Records control predecessors to add for inserted copy instructions.
-  // 'parameter' must have the same shape as the instruction that will be
-  // copied, and must define all buffers in the shape. Control predecessors are
-  // only recorded for indices that have already been marked for copying.
-  Status RecordControlPredecessors(
-      const TuplePointsToAnalysis& points_to_analysis,
-      HloInstruction* parameter);
+// Add kCopy instructions around the given kWhile instruction to eliminate any
+// possible live range interference of HLO values assuming a dependency-based
+// ordering (HloDependencyOrdering). Copies are added conservatively. There
+// likely are copies which are not strictly necessary, but there are removed
+// later in the pass via CopyRemover.
+//
+//
+// Elements (each ShapeIndex) in the loop state are considered independently.  A
+// copy is added to each element of the loop state which is modified in the
+// while body. For each such element, a total of three kCopy instructions are
+// added at following locations:
+//
+//   (1) The init value is copied before the kWhile instruction. Before:
+//
+//           (Init)
+//             |
+//           kWhile
+//             |
+//            ...
+//
+//       After:
+//
+//           (Init)
+//             |
+//           kCopy
+//             |
+//           kWhile
+//             |
+//            ...
+//
+//       This copy is necessary in case the init value is simultaneously live
+//       with the kWhile.
+//
+//   (2) Copies are added to the parameter and root of the while body
+//       computation. Before:
+//
+//           kParameter
+//               |
+//              ...
+//               |
+//           (body root)
+//
+//       After:
+//
+//           kParameter
+//               |
+//             kCopy ----------+
+//               |             |
+//              ...           ctrl
+//               |            edge
+//           (body root)       |
+//               |             |
+//             kCopy <---------+
+//
+//       The root kCopy becomes the new root of the computation. Both copies are
+//       necessary to any potential interference between the parameter value and
+//       the root value. The control edge prevents potential interference
+//       between the copies themselves.
+//
+// If the loop state is a tuple then the above kCopy instructions are a deep
+// copy constructed of kCopy, KGetTupleElement, and kTuple instruction as
+// constructed by HloInstruction::DeepCopyInstruction.
+Status AddCopiesForWhile(const HloAliasAnalysis& alias_analysis,
+                         HloInstruction* xla_while) {
+  VLOG(2) << "Adding copies for kWhile instruction " << xla_while->name();
+  TF_RET_CHECK(xla_while->opcode() == HloOpcode::kWhile);
 
-  // Inserts copies of 'instruction' buffers at indices in 'indices_to_copy',
-  // and replaces all uses for instructions in 'copy_users_' with copy.
-  // Returns the instruction which is a copy 'instruction'.
-  HloInstruction* Copy();
+  ShapeTree<bool> indices_to_copy(xla_while->shape());
+  if (!IndicesToCopyForWhile(alias_analysis.dataflow_analysis(), xla_while,
+                             &indices_to_copy)) {
+    VLOG(2) << "No copies necessary for kWhile instruction "
+            << xla_while->name();
+    return Status::OK();
+  }
 
-  HloInstruction* instruction() { return instruction_; }
+  VLOG(2) << "Adding copies for " << xla_while->name() << " at indices:";
+  for (auto& pair : indices_to_copy) {
+    if (pair.second) {
+      VLOG(2) << "  " << pair.first;
+    }
+  }
 
-  const std::vector<HloInstruction*>& copy_users() const { return copy_users_; }
+  // Deep copy init.
+  HloInstruction* while_init = xla_while->mutable_operand(0);
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * while_init_copy,
+      xla_while->parent()->DeepCopyInstruction(while_init, &indices_to_copy));
+  TF_RETURN_IF_ERROR(while_init->ReplaceUseWith(xla_while, while_init_copy));
 
- private:
-  // Does the given index represent a read-only buffer?
-  bool IsReadOnlyIndex(const ShapeIndex& index) const {
-    return !ShapeUtil::IsNil(read_only_indices_.shape()) &&
-           read_only_indices_.element(index);
-  }
+  // Deep copy the parameter and the root. Extend a control edge from the copy
+  // of the parameter value to the corresponding copy value of the root.
+  HloComputation* body = xla_while->while_body();
+  HloInstruction* param = body->parameter_instruction(0);
+  HloInstruction* root = body->root_instruction();
 
-  // Returns the copy override at the given index, or nullptr.
-  HloInstruction* GetCopyOverride(const ShapeIndex& index) const {
-    return ShapeUtil::IsNil(copy_overrides_.shape())
-               ? nullptr
-               : copy_overrides_.element(index);
-  }
+  // If param is the root then all indices should have been passed through the
+  // while body and we should have returned early above.
+  TF_RET_CHECK(param != root);
 
-  // Records instruction buffer indices which have ambiguous or non-distinct
-  // points-to sets.
-  Status RecordAmbiguousOrNonDistinctIndices(
-      const TuplePointsToAnalysis& points_to_analysis);
+  // Copy users before making a deep copy of the parameter as the deep copy
+  // will create new users of the parameter (eg, the GTE instructions of the
+  // deep copy).
+  std::vector<HloInstruction*> param_users = param->users();
 
-  // Records instruction buffer indices which have interfering live ranges
-  // with 'other_instruction' buffers at same index.
-  Status RecordIndicesWhichInterfereWithOtherInstruction(
-      const BufferLiveness& liveness, const HloInstruction* other_instruction,
-      ShapeTree<bool>* read_only_indices_out);
+  ShapeIndex current_index;
+  TF_ASSIGN_OR_RETURN(auto pair,
+                      DeepCopyAndAddControlEdges(param, root, indices_to_copy));
 
-  // Recursively inserts copies of 'instruction' tuple elements at indices
-  // specified in 'indices_to_copy', and returns the copy of 'instruction'.
-  HloInstruction* CopyTuple(HloInstruction* instruction, ShapeIndex* index);
+  HloInstruction* param_copy = pair.first;
+  HloInstruction* root_copy = pair.second;
 
-  void RecordIndex(const ShapeIndex& index) {
-    *indices_to_copy_.mutable_element(index) = true;
+  for (HloInstruction* user : param_users) {
+    TF_RETURN_IF_ERROR(param->ReplaceUseWith(user, param_copy));
   }
 
-  HloInstruction* instruction_;
-  const std::vector<HloInstruction*> copy_users_;
-  ShapeTree<bool> indices_to_copy_;
-  ShapeTree<std::vector<HloInstruction*>> control_predecessors_;
-  ShapeTree<bool> read_only_indices_;
-  ShapeTree<HloInstruction*> copy_overrides_;
-};
+  body->set_root_instruction(root_copy);
 
-bool InstructionCopier::HasAllIndicesFalse() const {
-  bool all_indices_false = true;
-  indices_to_copy_.ForEachElement(
-      [&all_indices_false](const ShapeIndex& /*index*/, bool data) {
-        if (data) {
-          all_indices_false = false;
-        }
-      });
-  return all_indices_false;
+  return Status::OK();
 }
 
-Status InstructionCopier::RecordIndicesWhichPointToParamOrConstant(
-    const TuplePointsToAnalysis& points_to_analysis) {
-  const PointsToSet& points_to =
-      points_to_analysis.GetPointsToSet(instruction_);
-  // Shallow copy the instruction if the points-to set of the top-level
-  // buffer is ambiguous. This is necessary because the backends must know
-  // statically what the top-level buffer of the result is.
-  if (points_to.element(/*index=*/{}).size() > 1) {
-    RecordIndex({});
+// Removes any control dependencies to or from the given instruction.
+Status StripControlDependenciesFrom(HloInstruction* instruction) {
+  while (!instruction->control_successors().empty()) {
+    TF_RETURN_IF_ERROR(instruction->RemoveControlDependencyTo(
+        instruction->control_successors().front()));
+  }
+
+  while (!instruction->control_predecessors().empty()) {
+    TF_RETURN_IF_ERROR(
+        instruction->control_predecessors().front()->RemoveControlDependencyTo(
+            instruction));
   }
 
-  // Multiple buffers within a parameter/constant may be live out, so collect
-  // a set of indices at which to copy first.
-  points_to.ForEachElement([this](const ShapeIndex& index,
-                                  const PointsToSet::BufferList& buffers) {
-    if (IsReadOnlyIndex(index)) {
-      return;
-    }
-    for (const LogicalBuffer* buffer : buffers) {
-      // pointee is the HloInstruction producing the buffer which may be
-      // liveout.
-      HloInstruction* pointee = buffer->instruction();
-      if (pointee->opcode() == HloOpcode::kParameter ||
-          pointee->opcode() == HloOpcode::kConstant) {
-        VLOG(2) << "Parameter or constant buffer " << buffer->ToString()
-                << " index: " << tensorflow::str_util::Join(index, ",")
-                << " may be live out of computation: " << pointee->ToString();
-        RecordIndex(index);
-        break;
-      }
-    }
-  });
   return Status::OK();
 }
 
-Status InstructionCopier::RecordIndicesToCopyForColocatingBuffers(
-    const BufferLiveness& liveness, const HloInstruction* other_instruction,
-    ShapeTree<bool>* read_only_indices_out) {
-  TF_RETURN_IF_ERROR(
-      RecordAmbiguousOrNonDistinctIndices(liveness.points_to_analysis()));
-  TF_RETURN_IF_ERROR(RecordIndicesWhichInterfereWithOtherInstruction(
-      liveness, other_instruction, read_only_indices_out));
+// Add kCopy instructions to the given module to guarantee there is no
+// live-range interference. Generally interference can only occur around kWhile
+// instructions which have update-in-place semantics.
+Status AddCopiesToResolveInterference(HloModule* module) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> alias_analysis,
+                      HloAliasAnalysis::Run(module));
+
+  for (HloComputation* computation : module->computations()) {
+    for (HloInstruction* instruction : computation->instructions()) {
+      if (instruction->opcode() == HloOpcode::kWhile) {
+        TF_RETURN_IF_ERROR(AddCopiesForWhile(*alias_analysis, instruction));
+      }
+    }
+  }
   return Status::OK();
 }
 
-Status InstructionCopier::RecordAmbiguousOrNonDistinctIndices(
-    const TuplePointsToAnalysis& points_to_analysis) {
-  const PointsToSet& points_to =
-      points_to_analysis.GetPointsToSet(instruction_);
-  // Mapping from LogicalBuffer to index (used to detect non-distinct indices).
-  FlatMap<const LogicalBuffer*, std::vector<ShapeIndex>>
-      buffer_to_source_indices;
-  points_to.ForEachElement(
-      [this, &buffer_to_source_indices](
-          const ShapeIndex& index, const PointsToSet::BufferList& buffers) {
-        if (buffers.size() > 1) {
-          // Record ambiguous points-to set at 'index'.
-          if (!indices_to_copy_.element(index)) {
-            VLOG(2) << "Adding copy of buffer for instruction: "
-                    << instruction_->name()
-                    << " at index: " << tensorflow::str_util::Join(index, ",")
-                    << " with ambiguous points-to set.";
-            RecordIndex(index);
+// Class for removing unnecessary copies from the module.
+//
+// kCopy instructions are added conservatively to guarantee no live range
+// interference between HLO values. This class uses a more fine-grained analysis
+// to remove some of these added copies which are not strictly necessary.
+class CopyRemover {
+ public:
+  CopyRemover(const HloAliasAnalysis& alias_analysis,
+              const HloOrdering& ordering, HloModule* module)
+      : module_(module),
+        alias_analysis_(alias_analysis),
+        ordering_(ordering),
+        buffer_value_tracker_(*module, alias_analysis, ordering) {}
+
+  // Try to elide the given copy. The copy is elided if the instruction is not
+  // necessary to prevent live-range interference of HLO values. Returns true if
+  // copy was elided.
+  //
+  // The copy instruction is not actually removed here. Instead it is left for
+  // dead in the graph. Later calls to DCE will remove the instruction.
+  StatusOr<bool> TryElideCopy(HloInstruction* copy) {
+    if (buffer_value_tracker_.TryElideCopy(copy)) {
+      TF_RETURN_IF_ERROR(StripControlDependenciesFrom(copy));
+      TF_RETURN_IF_ERROR(copy->ReplaceAllUsesWith(copy->mutable_operand(0)));
+      return true;
+    }
+    return false;
+  }
+
+  string ToString() const {
+    string out = StrCat("CopyRemover, module ", module_->name(), "\n");
+    StrAppend(&out, "  Buffer values, in dependency order:\n");
+    for (const HloBuffer& buffer : alias_analysis_.buffers()) {
+      StrAppend(&out, "    HloBuffer ", buffer.id(), ":\n");
+    }
+    return out;
+  }
+
+ private:
+  // Class which tracks the HLO values within each HLO buffer in the module
+  // during copy removal.
+  //
+  // The values are held in a linked list where there is one list for each
+  // buffer. Removing a copy instruction merges together the values in the
+  // source buffer of the copy to the destination buffer of the copy. This class
+  // tracks these value lists as copies are removed from the graph (and value
+  // lists are merged).
+  //
+  // The BufferValueTracker object is initialized to match the state of
+  // HloAliasAnalysis. However, as copies are removed this state diverges. The
+  // values-to-buffer mapping is maintained outside of HloAliasAnalysis because
+  // a fully updatable alias analysis is very slow.
+  class BufferValueTracker {
+   public:
+    // The values held in a single HLO buffer are represented using a linked
+    // list. An element type in this list is ValueNode.
+    //
+    // This linked list is hand-rolled to enable efficient splicing of lists
+    // using only references to list elements without knowing which lists are
+    // being spliced. std::list requires a reference to the list object to
+    // splice.
+    struct ValueNode {
+      explicit ValueNode(const HloValue* v) : value(v) {}
+
+      const HloValue* value;
+
+      // The uses are maintained outside of HloValue::uses() because
+      // HloValue::uses() is not updatable (a fully updatable dataflow analysis
+      // is slow).
+      std::vector<const HloUse*> uses;
+
+      // next/prev elements in the linked list. The list is circularly linked so
+      // these values are never null for elements in the list.
+      ValueNode* prev = nullptr;
+      ValueNode* next = nullptr;
+    };
+
+    BufferValueTracker(const HloModule& module,
+                       const HloAliasAnalysis& alias_analysis,
+                       const HloOrdering& ordering)
+        : dataflow_(alias_analysis.dataflow_analysis()), ordering_(ordering) {
+      // Construct a list for each HLO buffer in the alias analysis. Maintain a
+      // map from HloValue to the respective list element representing that
+      // value. The map is used to construct the copy info map below.
+      tensorflow::gtl::FlatMap<const HloValue*, ValueNode*> value_to_node;
+      for (const HloBuffer& buffer : alias_analysis.buffers()) {
+        // Verify values contained in the buffer are strictly ordered. This
+        // should always be the case after adding copies to eliminate
+        // interference. Specifically, the addition of the control flow edges
+        // between copies added around aliased operations (kWhile) guarantees
+        // this strict order.
+        for (const HloValue* value_a : buffer.values()) {
+          for (const HloValue* value_b : buffer.values()) {
+            if (value_a != value_b) {
+              DCHECK(ordering_.LiveRangeStrictlyBefore(*value_a, *value_b,
+                                                       dataflow_) ||
+                     ordering_.LiveRangeStrictlyBefore(*value_b, *value_a,
+                                                       dataflow_))
+                  << value_a->ToShortString() << " and "
+                  << value_b->ToShortString() << " are not ordered";
+            }
           }
         }
-        // For each 'buffer': record a mapping from 'buffer' to 'index'.
-        for (const LogicalBuffer* buffer : buffers) {
-          buffer_to_source_indices[buffer].push_back(index);
-        }
-      });
 
-  // Record all non-distinct indices detected in 'buffer_to_source_indices'.
-  for (const auto& buff_to_src : buffer_to_source_indices) {
-    if (buff_to_src.second.size() == 1) {
-      continue;
+        std::vector<const HloValue*> values = buffer.values();
+        std::sort(values.begin(), values.end(),
+                  [this](const HloValue* a, const HloValue* b) {
+                    return ordering_.IsDefinedBefore(*a, *b);
+                  });
+
+        // Create a list containing all of the values in the buffer.
+        AddValueList(values, &value_to_node);
+      }
+
+      // Create copy_map_ which contains the source and destination values
+      // of all copies.
+      CreateCopyMap(module, value_to_node);
+
+      XLA_VLOG_LINES(3, ToString());
+      TF_DCHECK_OK(Verify());
     }
-    for (const ShapeIndex& src_index : buff_to_src.second) {
-      // Record non-distinct points-to set at 'src_index'.
-      if (!indices_to_copy_.element(src_index)) {
-        VLOG(2) << "Adding copy of buffer for instruction: "
-                << instruction_->name()
-                << " at index: " << tensorflow::str_util::Join(src_index, ",")
-                << " because of non-distinct points-to set.";
-        RecordIndex(src_index);
+
+    // Add a list containing the given values to BufferValueTracker. This
+    // represents the values contained in a single buffer. For each value in
+    // 'values' an entry is created in value_to_node which indicates the
+    // respective ValueNode representing that value.
+    void AddValueList(
+        tensorflow::gtl::ArraySlice<const HloValue*> values,
+        tensorflow::gtl::FlatMap<const HloValue*, ValueNode*>* value_to_node) {
+      ValueNode* tail = nullptr;
+      ValueNode* head = nullptr;
+      for (const HloValue* value : values) {
+        auto new_node = new ValueNode(value);
+        (*value_to_node)[value] = new_node;
+
+        // Copy the HLO values's uses into the ValueNode for the value. These
+        // uses in ValueNode are updated as copies are removed.
+        new_node->uses.reserve(value->uses().size());
+        for (const HloUse& use : value->uses()) {
+          new_node->uses.push_back(&use);
+        }
+
+        // Connect the new node into the linked list.
+        if (tail == nullptr) {
+          head = new_node;
+        } else {
+          tail->next = new_node;
+          new_node->prev = tail;
+        }
+        tail = new_node;
       }
+
+      // The linked list is circular so connect the head and tail.
+      tail->next = head;
+      head->prev = tail;
+      value_lists_.insert(head);
     }
-  }
-  return Status::OK();
-}
 
-Status InstructionCopier::RecordIndicesWhichInterfereWithOtherInstruction(
-    const BufferLiveness& liveness, const HloInstruction* other_instruction,
-    ShapeTree<bool>* read_only_indices_out) {
-  // Record all buffer indices for 'instruction_', which interfere with
-  // 'other_instruction' at the same index.
-  ShapeUtil::ForEachSubshape(
-      instruction_->shape(),
-      [this, &liveness, other_instruction, read_only_indices_out](
-          const Shape& /*subshape*/, const ShapeIndex& index) {
-        if (IsReadOnlyIndex(index)) {
-          return;
+    // This method also fills in copy_map_ which indicates which nodes
+    // in the value lists corresponding to the source and destination values of
+    // kCopy instructions. value_to_node should map each HloValue to its
+    // respective ValueNode.
+    void CreateCopyMap(
+        const HloModule& module,
+        const tensorflow::gtl::FlatMap<const HloValue*, ValueNode*>&
+            value_to_node) {
+      for (HloComputation* computation : module.computations()) {
+        for (HloInstruction* instruction : computation->instructions()) {
+          // Add copies with unambiguous source values to the map. Copies with
+          // ambiguous sources are not removable.
+          if (instruction->opcode() == HloOpcode::kCopy) {
+            const HloValueSet& src_value_set =
+                dataflow_.GetValueSet(instruction->operand(0));
+            if (src_value_set.values().size() == 1) {
+              CopyNodes& copy_node = copy_map_[instruction];
+              copy_node.dest =
+                  value_to_node.at(&dataflow_.GetUniqueValueAt(instruction));
+              copy_node.src = value_to_node.at(&src_value_set.GetUniqueValue());
+            }
+          }
         }
-        if (indices_to_copy_.element(index)) {
-          // Return if previous pass already set index.
-          return;
+      }
+    }
+
+    ~BufferValueTracker() {
+      for (const ValueNode* head : value_lists_) {
+        const ValueNode* p = head;
+        do {
+          const ValueNode* tmp = p->next;
+          delete p;
+          p = tmp;
+        } while (p != head);
+      }
+    }
+
+    // Verify invariants within the linked lists.
+    Status Verify() const {
+      for (const ValueNode* head : value_lists_) {
+        const ValueNode* p = head;
+        do {
+          // Verify links between elements are consistent.
+          TF_RET_CHECK(p->prev->next == p);
+          TF_RET_CHECK(p->next->prev == p);
+
+          const HloInstruction* def = p->value->defining_instruction();
+          if (def->opcode() == HloOpcode::kCopy &&
+              ContainsKey(copy_map_, def)) {
+            TF_RET_CHECK(copy_map_.at(def).dest == p);
+          }
+          for (const HloUse* use : p->uses) {
+            if (use->instruction->opcode() == HloOpcode::kCopy &&
+                ContainsKey(copy_map_, use->instruction)) {
+              TF_RET_CHECK(copy_map_.at(use->instruction).src == p);
+            }
+          }
+
+          p = p->next;
+        } while (p != head);
+      }
+      return Status::OK();
+    }
+
+    // Try to elide the given copy. Elision of a copy is possible only if no
+    // live range interference is introduced by the copy's elimination. If
+    // elision is possible, then the internal state (value lists) are updated,
+    // and true is returned. Returns false otherwise.
+    bool TryElideCopy(const HloInstruction* copy) {
+      VLOG(2) << "Trying to remove " << copy->name();
+
+      if (!ContainsKey(copy_map_, copy)) {
+        VLOG(2) << copy->name() << " is not removable";
+        return false;
+      }
+
+      const CopyNodes& copy_node = copy_map_.at(copy);
+      ValueNode* src = copy_node.src;
+      ValueNode* dest = copy_node.dest;
+      DCHECK(src != nullptr);
+      DCHECK(dest != nullptr);
+
+      auto is_live_range_before = [this](const ValueNode& a,
+                                         const ValueNode& b) {
+        if (LiveRangeBefore(a, b)) {
+          VLOG(2) << "  Live range of " << a.value->ToShortString()
+                  << " is before " << b.value->ToShortString();
+          return true;
+        } else {
+          VLOG(2) << "  Live range of " << a.value->ToShortString()
+                  << " is not before " << b.value->ToShortString();
+          return false;
         }
-        const auto& points_to_analysis = liveness.points_to_analysis();
-        // Lookup buffers for 'instruction_' and 'other_instruction'.
-        const auto instruction_buffers =
-            points_to_analysis.GetPointsToSet(instruction_).element(index);
-        // If 'instruction_' has ambiguous points-to-set  at 'index', it would
-        // have been recorded in a previous pass (and we would have returned
-        // early at the entry to this function). As a result, here we know that
-        // 'instruction_' has just one buffer in its points-to-set.
-        CHECK_EQ(1, instruction_buffers.size());
-        const LogicalBuffer* instruction_buffer = instruction_buffers[0];
-
-        const auto other_instruction_buffers =
-            points_to_analysis.GetPointsToSet(other_instruction).element(index);
-        // Do not insert a copy if both instructions point at the same buffer.
-        // This eliminates unnecessary copies of read-only tuple elements.
-        // If 'instruction_' and 'other_instruction' point to the same buffer,
-        // then that buffer is not updated on the path between the two
-        // instructions. Therefore, any other (possibly interference-causing)
-        // users of that buffer from 'other_instruction' will see the same data,
-        // irrespective of whether we insert a copy of this buffer at
-        // 'instruction_' or not.
-        if (other_instruction_buffers.size() == 1 &&
-            other_instruction_buffers[0]->id() == instruction_buffer->id()) {
-          if (read_only_indices_out != nullptr) {
-            *read_only_indices_out->mutable_element(index) = true;
+      };
+
+      VLOG(3) << copy->name() << " copies value "
+              << src->value->ToShortString();
+      VLOG(3) << "Source buffer values: " << ValueListToString(src);
+      VLOG(3) << "Dest buffer values: " << ValueListToString(src);
+
+      // A kCopy instruction copies an HLO value from a source buffer and
+      // defines an HLO value in a destination buffer. Most generally, the
+      // source and destination buffers may each hold more than one value at
+      // different points in the computation so we define the following:
+      //
+      //   Values in source buffer:      {s_0, ..., s_n}
+      //   Values in destination buffer: {d_0, ..., d_m}
+      //
+      // A kCopy instruction between these buffers copies a value s_x in the
+      // source buffer and defines a value d_y in the destination buffer. The
+      // elision of a copy merges the source and destination buffers together,
+      // so the list of values for the source and destination buffers are
+      // merged.
+      //
+      // We handle two different cases for copy elision:
+      //
+      //  (1) the kCopy defines the first value in the destination buffer (d_0).
+      //
+      //  (2) the kCopy copies the last value in the source buffer (s_n).
+      //
+      // For the remaining case where the kCopy copies a not-last value from the
+      // source buffer to a not-first value of the destination buffer, the kCopy
+      // instruction cannot be removed. This case is generated, for example, if
+      // the kCopy copies a while body parameter of the loop state at one tuple
+      // index to a different tuple index in the while body root. Removal of the
+      // copy necessarily results in live range interference of values in the
+      // loop state at the two different tuple indices.
+      //
+      //  We can only perform copy elision if the resulting merged values have
+      //  totally ordered live ranges; otherwise the merged buffer would have
+      //  live range interference.
+      if (IsHead(*dest)) {
+        // The copy copies an arbitrary value in the source buffer (call it s_x)
+        // and defines d_0, the first value in the destination buffer. After
+        // merging, the values in the combined buffer must be strictly ordered
+        // as follows** to elide the copy:
+        //
+        // {s_0, ..., s_x, d_1, ..., d_m, s_{x+1}, ..., s_n}
+        //
+        // Removing the copy eliminates d_0, and uses of d_0 become uses of
+        // s_x. In the above ordering, the live range of d_m must be ordered
+        // before the live range of s_{x+1} and the definition and all uses of
+        // s_x must be ordered before the definition of d_1. These conditions
+        // are checked below prior to elision.
+        //
+        // ** Technically it might be possible to have a non-interfering
+        //    non-trivial interleaving of the values of the source and
+        //    destination buffers in the resulting order. However, this case is
+        //    slow and complicated to check and likely not worth it. So instead
+        //    we simply check for the case where *all* values of the destination
+        //    buffer (d_1 through d_m) are spliced into the point where the copy
+        //    used to be.
+        VLOG(2) << copy->name() << " defines the first value in its buffer";
+        ValueNode* next_dest = Next(*dest);
+        if (next_dest != nullptr) {
+          // Live range of 'from' value (s_x) must be before 'next_dest' (d_1);
+          if (!is_live_range_before(*src, *next_dest)) {
+            return false;
           }
-          return;
         }
-        // We can't say anything about the ambiguity of 'other_instruction' at
-        // this point, so we need to check interference between the single
-        // buffer in the points-to set of 'instruction_' and all buffers in
-        // 'other_instruction_buffers'.
-        for (const LogicalBuffer* other_buffer : other_instruction_buffers) {
-          if (liveness.MayInterfere(*instruction_buffer, *other_buffer)) {
-            VLOG(2) << "Adding copy of buffer for instruction: "
-                    << instruction_->name()
-                    << " instruction_buffer: " << instruction_buffer->ToString()
-                    << " at index: " << tensorflow::str_util::Join(index, ",")
-                    << " because of interference with buffer: "
-                    << other_buffer->ToString();
-            RecordIndex(index);
-            break;
+        ValueNode* next_src = Next(*src);
+
+        if (next_src != nullptr) {
+          // Live range of 'last_dest' (d_m) must be before 'next_src' s_{x+1}.
+          ValueNode* last_dest = dest->prev;
+          DCHECK(IsTail(*last_dest));
+          if (!is_live_range_before(*last_dest, *next_src)) {
+            return false;
           }
         }
-      });
-  return Status::OK();
-}
 
-// This is called when 'instruction_' is a while body root, and 'parameter' is
-// the while body parameter. We record all users of all aliases of 'parameter'
-// as control predecessors, so that when we add a copy of 'instruction_', we can
-// mark the control dependencies. This is necessary because points-to and
-// liveness analysis doesn't know about the aliasing between the while body root
-// and param. Without these control dependencies, the copy might get scheduled
-// to run at a point that interferes with users of the buffer.
-Status InstructionCopier::RecordControlPredecessors(
-    const TuplePointsToAnalysis& points_to_analysis,
-    HloInstruction* parameter) {
-  return indices_to_copy_.ForEachElementWithStatus(
-      [this, &points_to_analysis, parameter](const ShapeIndex& index,
-                                             bool will_copy) {
-        if (will_copy) {
-          TF_ASSIGN_OR_RETURN(
-              const LogicalBuffer* buffer,
-              points_to_analysis.GetBufferDefinedAt(parameter, index));
-          for (const BufferAlias& alias :
-               points_to_analysis.GetBufferAliases(*buffer)) {
-            for (HloInstruction* user : alias.instruction()->users()) {
-              if (DoesNotUseOperandBuffer(alias.instruction(), alias.index(),
-                                          user, points_to_analysis)) {
-                continue;
-              }
-
-              if (user != instruction_) {
-                control_predecessors_.mutable_element(index)->push_back(user);
-              }
-            }
+        // Splice in destination buffer values list right after 'src'.
+        SpliceAfter(dest, src);
+      } else if (IsTail(*src)) {
+        // The copy copies the last value in the source buffer, s_n, and defines
+        // an arbitrary value in the destination buffer, d_y.  After
+        // merging, the values in the combined buffer must be strictly ordered
+        // as follows** to elide the copy:
+        //
+        // {d_0, ..., d_{y-1}, s_0, ..., s_n, d_{y+1}, ..., d_m}
+        //
+        // Removing the copy eliminates d_y, and uses of d_y become uses of
+        // s_n. To enforce the above order, the live range of d_{y-1} must be
+        // before the live range of s_0, and the live range of s_n must be
+        // before the live range of d_{y+1}.
+        //
+        // ** See comment above in the code handling Case (1).
+        VLOG(2) << copy->name() << " copies the last value ("
+                << src->value->ToShortString() << ") in its buffer";
+
+        ValueNode* prev_dest = Prev(*dest);
+        // nullptr condition handled above in the first 'if' case.
+        DCHECK(prev_dest != nullptr);
+        ValueNode* first_src = src->next;
+        DCHECK(IsHead(*first_src));
+        if (!is_live_range_before(*prev_dest, *first_src)) {
+          // Live range of value d_{y-1} is not before s_0.
+          return false;
+        }
+        ValueNode* next_dest = Next(*dest);
+        if (next_dest != nullptr) {
+          if (!is_live_range_before(*src, *next_dest)) {
+            // Live range of value s_n is not before d_{y+1}.
+            return false;
           }
         }
-        return Status::OK();
-      });
-}
 
-// Recursively inserts copies of 'instruction' tuple element buffers at
-// indices in 'indices_to_copy_', expanding tuples as needed.
-HloInstruction* InstructionCopier::CopyTuple(HloInstruction* instruction,
-                                             ShapeIndex* index) {
-  const int64 num_tuple_elements =
-      ShapeUtil::TupleElementCount(instruction->shape());
-  std::vector<HloInstruction*> elem_copies(num_tuple_elements);
-  for (int64 i = 0; i < num_tuple_elements; ++i) {
-    HloInstruction* elem;
-    if (instruction->opcode() == HloOpcode::kTuple) {
-      // If the instruction is already a Tuple instruction, we know that the
-      // element buffers are aliased, so we can just grab the operand directly.
-      elem = instruction->mutable_operand(i);
-    } else {
-      // Otherwise we need to add a GTE to unpack the element out of the tuple.
-      elem = instruction->parent()->AddInstruction(
-          HloInstruction::CreateGetTupleElement(
-              ShapeUtil::GetSubshape(instruction->shape(), {i}), instruction,
-              i));
-    }
-    index->push_back(i);
-    if (ShapeUtil::IsTuple(elem->shape())) {
-      elem_copies[i] = CopyTuple(elem, index);
-    } else if (!indices_to_copy_.element(*index)) {
-      elem_copies[i] = elem;
-    } else if (HloInstruction* copy_override = GetCopyOverride(*index)) {
-      elem_copies[i] = copy_override;
-    } else {
-      HloInstruction* elem_copy = elem->parent()->AddInstruction(
-          HloInstruction::CreateUnary(elem->shape(), HloOpcode::kCopy, elem));
-      for (HloInstruction* control_predecessor :
-           control_predecessors_.element(*index)) {
-        VLOG(2) << "Adding control dependency from "
-                << control_predecessor->ToString() << " to "
-                << elem_copy->ToString();
-        TF_CHECK_OK(control_predecessor->AddControlDependencyTo(elem_copy));
+        // Splice source buffer values list right after 'prev_dest'.
+        SpliceAfter(first_src, prev_dest);
+      } else {
+        VLOG(2)
+            << copy->name()
+            << " copies value in middle of source buffer to value in middle "
+               "of destination buffer";
+        return false;
       }
-      elem_copies[i] = elem_copy;
+
+      RemoveCopyValue(dest);
+
+      XLA_VLOG_LINES(4, ToString());
+      TF_DCHECK_OK(Verify());
+
+      return true;
     }
-    index->pop_back();
-  }
-  return instruction->parent()->AddInstruction(
-      HloInstruction::CreateTuple(elem_copies));
-}
 
-// Inserts copies of 'instruction_' buffers at indices in 'indices_to_copy_'.
-HloInstruction* InstructionCopier::Copy() {
-  ShapeIndex index;
-  HloInstruction* copy;
-  if (ShapeUtil::IsTuple(instruction_->shape())) {
-    copy = CopyTuple(instruction_, &index);
-  } else {
-    copy = instruction_->parent()->AddInstruction(HloInstruction::CreateUnary(
-        instruction_->shape(), HloOpcode::kCopy, instruction_));
-  }
-  for (HloInstruction* user : copy_users_) {
-    VLOG(2) << "Adding copy between instruction: " << instruction_->name()
-            << " and user: " << user->name();
-    TF_CHECK_OK(instruction_->ReplaceUseWith(user, copy));
+    // Delete the given ValueNode associated with a elided kCopy
+    // instruction. This should be called after splicing the value lists of the
+    // source and destination buffers together.
+    void RemoveCopyValue(ValueNode* copy_value_node) {
+      CHECK_EQ(copy_value_node->value->defining_instruction()->opcode(),
+               HloOpcode::kCopy);
+      ValueNode* operand_node = copy_value_node->prev;
+      CHECK(operand_node != copy_value_node);
+
+      VLOG(2) << "Removing copy " << operand_node->value->ToShortString()
+              << " => " << copy_value_node->value->ToShortString();
+
+      // Splice out the copy value node.
+      operand_node->next = copy_value_node->next;
+      copy_value_node->next->prev = operand_node;
+
+      // Patch up uses. Remove use of copy from operand_node uses.
+      auto it =
+          std::find_if(operand_node->uses.begin(), operand_node->uses.end(),
+                       [copy_value_node](const HloUse* use) {
+                         return use->instruction ==
+                                copy_value_node->value->defining_instruction();
+                       });
+      CHECK(it != operand_node->uses.end());
+      operand_node->uses.erase(it);
+
+      // If the elided copy has any uses which are themselves kCopy instructions
+      // then patch up the copy info to reflect the that this kCopy instruction
+      // has a different operand (the operand of the elided copy).
+      for (const HloUse* copy_use : copy_value_node->uses) {
+        operand_node->uses.push_back(copy_use);
+        if (copy_use->instruction->opcode() == HloOpcode::kCopy) {
+          copy_map_.at(copy_use->instruction).src = operand_node;
+        }
+      }
+
+      // Delete the copy info and the value node.
+      copy_map_.erase(copy_value_node->value->defining_instruction());
+      delete copy_value_node;
+    }
+
+    // Returns true if the live range of given value 'a' is before the live
+    // range of 'b'.
+    //
+    // We cannot use LiveRangeStrictlyBefore because HloValue::uses() is not
+    // updated as copies are removed.
+    bool LiveRangeBefore(const ValueNode& a, const ValueNode& b) {
+      if (a.uses.empty()) {
+        VLOG(2) << "Empty uses";
+        return ordering_.IsDefinedBefore(*a.value, *b.value);
+      }
+      for (const HloUse* use : a.uses) {
+        VLOG(2) << "use: " << *use;
+        VLOG(2) << "is before:" << *b.value;
+        if (!ordering_.UseIsBeforeValueDefinition(*use, *b.value, dataflow_)) {
+          VLOG(2) << "Not before";
+          return false;
+        }
+      }
+      return true;
+    }
+
+    // Returns whether 'node' is the last node in its list.
+    bool IsTail(const ValueNode& node) const {
+      return ContainsKey(value_lists_, node.next);
+    }
+
+    // Returns whether 'node' is the first node in its list.
+    bool IsHead(const ValueNode& node) const {
+      return ContainsKey(value_lists_, &node);
+    }
+
+    // Returns the next node in the list after 'node'. If 'node' is the
+    // tail, then nullptr is returned.
+    ValueNode* Next(const ValueNode& node) const {
+      if (IsTail(node)) {
+        return nullptr;
+      } else {
+        return node.next;
+      }
+    }
+
+    // Returns the previous node in the list before 'node'. If 'node'
+    // is the head, then nullptr is returned.
+    ValueNode* Prev(const ValueNode& node) const {
+      if (IsHead(node)) {
+        return nullptr;
+      } else {
+        return node.prev;
+      }
+    }
+
+    // Splices the entire linked list with 'head' as its head right after the
+    // node 'insert_after' in another linked list.
+    void SpliceAfter(ValueNode* head, ValueNode* insert_after) {
+      DCHECK(IsHead(*head));
+      value_lists_.erase(head);
+
+      ValueNode* tail = head->prev;
+      tail->next = insert_after->next;
+      insert_after->next->prev = tail;
+
+      insert_after->next = head;
+      head->prev = insert_after;
+    }
+
+    string ValueListToString(const ValueNode* element) {
+      const ValueNode* head = element;
+      while (!IsHead(*head)) {
+        head = Prev(*head);
+      }
+      std::vector<const HloValue*> values;
+      for (const ValueNode* p = head; p != nullptr; p = Next(*p)) {
+        values.push_back(p->value);
+      }
+      return StrCat("{",
+                    Join(values, ", ",
+                         [](string* s, const HloValue* value) {
+                           StrAppend(s, value->ToShortString());
+                         }),
+                    "}");
+    }
+
+    string ToString() const {
+      string out = StrCat("BufferValueTracker:\n");
+      StrAppend(&out, "  Def-use chains in each buffer:\n");
+      for (const ValueNode* head : value_lists_) {
+        StrAppend(&out, "    Buffer defined by ", head->value->ToShortString(),
+                  ":\n");
+        const ValueNode* p = head;
+        do {
+          StrAppend(&out, "      ", p->value->ToShortString(), ", uses: ",
+                    Join(p->uses, "; ",
+                         [](string* s, const HloUse* use) {
+                           StrAppend(s, use->ToString());
+                         }),
+                    "\n");
+
+          p = p->next;
+        } while (p != head);
+      }
+      StrAppend(&out, "  Potentially removable copies:\n");
+      for (const auto& pair : copy_map_) {
+        const HloInstruction* copy = pair.first;
+        const CopyNodes& copy_info = pair.second;
+
+        StrAppend(&out, "    ", copy->name(), " : ",
+                  copy_info.src->value->ToShortString(), " => ",
+                  copy_info.dest->value->ToShortString(), "\n");
+      }
+      return out;
+    }
+
+   private:
+    const HloDataflowAnalysis& dataflow_;
+    const HloOrdering& ordering_;
+
+    // The heads of all the value lists. Each value list represents the HLO
+    // values contained in a particular HLO buffer. The values in the list are
+    // in dependency order.
+    tensorflow::gtl::FlatSet<const ValueNode*> value_lists_;
+
+    // Copy removal requires fast access to the value list elements
+    // corresponding to the source and destination values of the kCopy
+    // instruction. This data structure holds pointers to these elements for
+    // each kCopy instruction in the graph.
+    struct CopyNodes {
+      // The source and destinations values of the kCopy instruction.
+      ValueNode* src = nullptr;
+      ValueNode* dest = nullptr;
+    };
+    tensorflow::gtl::FlatMap<const HloInstruction*, CopyNodes> copy_map_;
+  };
+
+  HloModule* module_;
+  const HloAliasAnalysis& alias_analysis_;
+  const HloOrdering& ordering_;
+
+  // Object tracking the HLO values contained in each HLO buffer.
+  BufferValueTracker buffer_value_tracker_;
+};
+
+// Try to remove as many copies from the module as possible without introducing
+// live range interference. Copy instructions (identified by their unique id) in
+// the set copies_to_exclude are not considered for removal.
+Status RemoveUnnecessaryCopies(
+    const HloOrdering& ordering,
+    const tensorflow::gtl::FlatSet<int>& copies_to_exclude, HloModule* module) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> alias_analysis,
+                      HloAliasAnalysis::Run(module));
+  CopyRemover copy_remover(*alias_analysis, ordering, module);
+  XLA_VLOG_LINES(3, copy_remover.ToString());
+
+  tensorflow::gtl::FlatSet<int> existing_copies;
+  for (HloComputation* computation : module->computations()) {
+    for (HloInstruction* instruction : computation->instructions()) {
+      if (instruction->opcode() == HloOpcode::kCopy &&
+          !ContainsKey(copies_to_exclude, instruction->unique_id())) {
+        TF_RETURN_IF_ERROR(copy_remover.TryElideCopy(instruction).status());
+      }
+    }
   }
-  return copy;
+
+  return Status::OK();
 }
 
-// The 'read_only_indices' are initialized based on points-to analysis on the
-// while body corresponding to 'while_hlo'. If the init buffer corresponding to
-// a read-only index aliases with a constant, it cannot be considered read-only,
-// and must be copied. This is necessary because BufferAssignment does not
-// currently assign an allocation for constants (b/32248867).
-// This function performs this fix-up of 'read_only_indices'.
+// Add copies to address special constraints on the roots of computations not
+// related to live range interference:
 //
-// Returns a ShapeTree of copy_overrides, which implements an optimization to
-// allow multiple while loops that share the same read-only constants to
-// share a single copy.
-StatusOr<ShapeTree<HloInstruction*>> RevertReadOnlyIndicesForConstants(
-    const HloInstruction* while_hlo,
-    const TuplePointsToAnalysis& points_to_analysis,
-    ShapeTree<bool>* read_only_indices,
-    FlatMap<const HloInstruction*, HloInstruction*>* shared_copies) {
-  const HloInstruction* init_hlo = while_hlo->operand(0);
-  const PointsToSet& points_to = points_to_analysis.GetPointsToSet(init_hlo);
-
-  // Mapping from LogicalBuffer to index (used to detect non-distinct indices).
-  FlatSet<const LogicalBuffer*> buffer_set;
-
-  ShapeTree<HloInstruction*> copy_overrides(init_hlo->shape());
-  points_to.ForEachElement([init_hlo, read_only_indices, shared_copies,
-                            &buffer_set, &copy_overrides](
-                               const ShapeIndex& index,
-                               const PointsToSet::BufferList& buffers) {
-    // Look for read-only entry parameters.
-    if (!read_only_indices->element(index)) {
-      return;
-    }
-    for (const LogicalBuffer* buffer : buffers) {
-      HloInstruction* pointee = buffer->instruction();
-      const bool is_constant = pointee->opcode() == HloOpcode::kConstant;
-      if (!is_constant) {
-        continue;
-      }
+//    (1) Entry computation root must be unambiguous and distinct.
+//
+//    (2) Any computation called by a kCall instruction must have an
+//        unambiguous root.
+//
+//    (3) Constants and parameters cannot be live out of the entry computation
+//
+Status AddSpecialCaseCopies(const CallGraph& call_graph, HloModule* module) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> alias_analysis,
+                      HloAliasAnalysis::Run(module));
+
+  // Identify which shape indices of which instructions need to be copied. Store
+  // these results in 'instructions_to_copy'.
+  std::unordered_map<HloInstruction*, ShapeTree<bool>> instructions_to_copy;
+  auto add_index_to_copy = [&instructions_to_copy](HloInstruction* instruction,
+                                                   const ShapeIndex& index) {
+    auto it = instructions_to_copy.find(instruction);
+    if (it == instructions_to_copy.end()) {
+      auto it_added = instructions_to_copy.emplace(
+          std::piecewise_construct, std::forward_as_tuple(instruction),
+          std::forward_as_tuple(instruction->shape(), /*init_value=*/false));
+      it = it_added.first;
+    }
+    *it->second.mutable_element(index) = true;
+  };
 
-      // We have found an constant that is read-only in
-      // the while body. These buffers are managed by the caller, and cannot
-      // be aliased with HLO buffers. Revert this read-only index,
-      // to allow it to be copied.
-      *read_only_indices->mutable_element(index) = false;
-
-      // Optimization to allow multiple while loops that share the same
-      // read-only entry constants to share a single copy.
-      // Only unambiguous and distinct array-shaped buffers are allowed, to
-      // reduce code complexity. The shape of the entry parameter must be
-      // identical to the shape of the init_hlo at this index, to ensure
-      // there were no intervening bitcast or GTE instructions, which are
-      // also hard to handle.
-      const Shape& pointee_shape = pointee->shape();
-      const Shape& init_shape =
-          ShapeUtil::GetSubshape(init_hlo->shape(), index);
-      if (buffers.size() == 1 && ShapeUtil::IsArray(pointee_shape) &&
-          ShapeUtil::Equal(pointee_shape, init_shape) &&
-          buffer_set.count(buffer) < 1) {
-        HloInstruction** copy = &(*shared_copies)[pointee];
-        if (*copy == nullptr) {
-          *copy = pointee->parent()->AddInstruction(HloInstruction::CreateUnary(
-              pointee_shape, HloOpcode::kCopy, pointee));
+  // Iterate through values of all constants and entry parameters. These values
+  // are special because they are held in read-only buffers. If any of these
+  // values share a buffer with other values (for example, the init value of a
+  // while is a constant) then copy the value at its definition and replace all
+  // its uses with the copy.
+  for (const HloValue* value : alias_analysis->dataflow_analysis().values()) {
+    if (ValueIsReadOnly(*value) &&
+        alias_analysis->GetBufferContainingValue(*value).values().size() > 1) {
+      VLOG(2) << "Value " << value->ToShortString()
+              << " is read only, but its buffer contains more than one value. "
+                 "Copying.";
+      add_index_to_copy(value->defining_instruction(), value->defining_index());
+    }
+  }
+
+  // Identify copies which must be added at root instructions
+  for (HloComputation* computation : module->computations()) {
+    const CallGraphNode& node = call_graph.GetNode(computation);
+    if (node.context() == CallContext::kParallel) {
+      continue;
+    }
+    TF_RET_CHECK(node.context() == CallContext::kSequential);
+
+    const bool is_entry = computation == module->entry_computation();
+    HloInstruction* root = computation->root_instruction();
+
+    // Mark nondistinct/ambiguous indices.
+    tensorflow::gtl::FlatSet<const HloBuffer*> seen;
+    ShapeUtil::ForEachSubshape(
+        root->shape(), [&](const Shape& /*subshape*/, const ShapeIndex& index) {
+          std::vector<const HloBuffer*> buffers_at_index =
+              alias_analysis->ComputeBuffersAt(root, index);
+          bool buffer_seen_before = false;
+          for (const HloBuffer* buffer : buffers_at_index) {
+            buffer_seen_before |= !seen.insert(buffer).second;
+          }
+          if (buffers_at_index.size() > 1 || (buffer_seen_before && is_entry)) {
+            VLOG(2) << "Index " << index << " of root of computation "
+                    << computation->name() << " (" << root->name()
+                    << ") has ambiguous or non-distinct buffer. Copying.";
+            add_index_to_copy(root, index);
+          }
+        });
+
+    // For entry instructions, mark any parameter or constant values.
+    if (is_entry) {
+      for (const auto& pair :
+           alias_analysis->dataflow_analysis().GetInstructionValueSet(root)) {
+        const ShapeIndex& index = pair.first;
+        const HloValueSet& value_set = pair.second;
+        for (const HloValue* value : value_set.values()) {
+          if (ValueIsReadOnly(*value)) {
+            VLOG(2) << "Root of entry computation (" << root->name()
+                    << ") has constant or entry parameter value at index "
+                    << index << ". Copying.";
+            add_index_to_copy(root, index);
+          }
         }
-        // Add the copy as an override.
-        *copy_overrides.mutable_element(index) = *copy;
       }
+    }
+  }
 
-      // Tracks whether this current buffer is distinct.
-      buffer_set.insert(buffer);
+  // Add copy instructions indicated in 'instructions_to_copy' to the module.
+  for (const auto& pair : instructions_to_copy) {
+    HloInstruction* instruction = pair.first;
+    const ShapeTree<bool>& indices_to_copy = pair.second;
 
-      // We've already reverted the read-only index and handled the
-      // single-copy optimization above, so there's nothing more to do.
-      break;
+    std::vector<HloInstruction*> users = instruction->users();
+    TF_ASSIGN_OR_RETURN(HloInstruction * deep_copy,
+                        instruction->parent()->DeepCopyInstruction(
+                            instruction, &indices_to_copy));
+    for (HloInstruction* user : users) {
+      TF_RETURN_IF_ERROR(instruction->ReplaceUseWith(user, deep_copy));
+    }
+    if (instruction == instruction->parent()->root_instruction()) {
+      instruction->parent()->set_root_instruction(deep_copy);
     }
-  });
-  return copy_overrides;
+  }
+
+  return Status::OK();
+}
+
+Status VerifyNoLiveRangeInterference(HloModule* module) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> alias_analysis,
+                      HloAliasAnalysis::Run(module));
+  DependencyHloOrdering ordering(module);
+  TF_RET_CHECK(!alias_analysis->HasLiveRangeInterference(ordering));
+  return Status::OK();
 }
 
-}  // anonymous namespace
-
-// NOTE: This is only called by gpu::CopyInsertion. It's not called here in the
-// base class, since the regular CopyInsertion logic above selectively copies
-// tuple elements, while this method assumes all buffers need to be deep copied.
-StatusOr<HloInstruction*> CopyInsertion::FindOrInsertCopy(HloInstruction* hlo) {
-  auto copy_it = inserted_copies_.find(hlo);
-  if (copy_it == inserted_copies_.end()) {
-    HloInstruction* copy = hlo->parent()->DeepCopyInstruction(hlo).ValueOrDie();
-    inserted_copies_.insert({hlo, copy});
-    return copy;
-  } else {
-    return copy_it->second;
+void MaybeDumpModule(const string& message, const HloModule& module) {
+  if (VLOG_IS_ON(3)) {
+    VLOG(3) << message;
+    XLA_VLOG_LINES(3, module.ToString());
+    hlo_graph_dumper::MaybeDumpHloModule(module, message);
   }
 }
 
+}  // namespace
+
 StatusOr<bool> CopyInsertion::Run(HloModule* module) {
-  bool changed = false;
-  VLOG(2) << "CopyInsertion for module " << module->name();
+  // Copy insertion is performed in three steps:
+  //
+  // (1) Add copies conservatively to guarantee that there is no live-range
+  //     interference. This is done simplistically and usually results in more
+  //     copies than is strictly necessary.
+  //
+  // (2) Using a more fine-grained analysis, remove as many copies that were
+  //     added in (1) as possible while ensuring no live-range interference.
+  //
+  // (3) Add copies to resolve issues not related to live range interference
+  //     such as parameters and constants live out of the entry computation.
+  //
+  // We add copies then remove them (step (1) then (2)) rather than simply
+  // adding only the copies that are necessary because, in general, it is
+  // difficult to figure out the minimal set of copies to add once there is
+  // interference. On the other hand, it is easy to determine if removing a copy
+  // will introduce interference.
+  //
+  // The final copy insertion in (3) is done separately to simplify the
+  // implementation of copy removal in (2) which is the most complicated part of
+  // the pass. As is, copy removal only has to reason about live range
+  // interference. If all copies were added in step (1) then copy removal would
+  // also have to reason about things like constants and parameters live out of
+  // the computation.
+  MaybeDumpModule("before copy insertion", *module);
 
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<BufferLiveness> liveness,
-      BufferLiveness::Run(module, MakeUnique<DependencyHloOrdering>(module)));
-  const auto& points_to_analysis = liveness->points_to_analysis();
-  XLA_VLOG_LINES(2, points_to_analysis.ToString());
-  XLA_VLOG_LINES(2, module->ToString());
-
-  // Gather all while body computations and while instructions.
-  FlatSet<const HloComputation*> while_body_computations;
-  std::vector<HloInstruction*> while_instructions;
-  for (auto* computation : module->computations()) {
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  if (!call_graph->IsFlattened()) {
+    return FailedPrecondition(
+        "Call graph must be flattened before copy insertion.");
+  }
+
+  // Gather Ids of existing kCopy instructions in the module. We avoid removing
+  // these copies (except via DCE in TupleSimplifier) because they may have been
+  // added for reasons not considered by copy insertion (eg, layout assignment).
+  // Instruction id is used instead of HloInstruction* because the pointer
+  // values may be recycled.
+  tensorflow::gtl::FlatSet<int> existing_copies;
+  for (HloComputation* computation : module->computations()) {
     for (HloInstruction* instruction : computation->instructions()) {
-      if (instruction->opcode() == HloOpcode::kWhile) {
-        while_body_computations.insert(instruction->while_body());
-        while_instructions.push_back(instruction);
+      if (instruction->opcode() == HloOpcode::kCopy) {
+        existing_copies.insert(instruction->unique_id());
       }
     }
   }
 
-  // Collect instruction buffer indices to copy in 'instructions_to_copy'.
-  std::vector<InstructionCopier> instructions_to_copy;
-
-  // Add copies of computation root instructions, if needed.
-  FlatMap<const HloComputation*, ShapeTree<bool>> while_body_read_only_indices;
-  for (auto* computation : module->MakeNonfusionComputations()) {
-    VLOG(2) << "computation " << computation->name();
-    InstructionCopier root_copier(computation->root_instruction(),
-                                  /*copy_users=*/{});
-    if (while_body_computations.count(computation) > 0) {
-      // Record root indices to copy for while body sub-computations. We do not
-      // need to call RecordIndicesWhichPointToParamOrConstant for the while
-      // body root instruction here, because any necessary copies needed to
-      // avoid constants or parameters in the output are handled by while.init
-      // operand copy insertion below (which will share an allocation).
-      HloInstruction* while_body_param = computation->parameter_instruction(0);
-      ShapeTree<bool> read_only_indices(while_body_param->shape());
-      TF_RETURN_IF_ERROR(root_copier.RecordIndicesToCopyForColocatingBuffers(
-          *liveness, while_body_param, &read_only_indices));
-      while_body_read_only_indices[computation] = read_only_indices;
-
-      // Mark control predecessors, based on the body param, for any copies
-      // we'll be inserting. This ensures the copy doesn't run too early.
-      TF_RETURN_IF_ERROR(root_copier.RecordControlPredecessors(
-          points_to_analysis, while_body_param));
-    } else {
-      // Record root indices to copy for general computations.
-      TF_RETURN_IF_ERROR(root_copier.RecordIndicesWhichPointToParamOrConstant(
-          points_to_analysis));
+  TF_RETURN_IF_ERROR(AddCopiesToResolveInterference(module));
+
+  // Simplify the tuple structures introduced by the deep copies. This should be
+  // done before removing copies (RemoveUnnecessaryCopies) because tuple
+  // simplification changes dependencies in the graph which changes live range
+  // interference in the graph. Also run DCE to remove the dead Tuple/GTE
+  // instructions introduced by tuple simplification.
+  TupleSimplifier tuple_simplifier;
+  HloDCE dce;
+  TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status());
+  TF_RETURN_IF_ERROR(dce.Run(module).status());
+
+  TF_DCHECK_OK(VerifyNoLiveRangeInterference(module));
+
+  MaybeDumpModule("after adding copies to resolve interference", *module);
+
+  DependencyHloOrdering ordering(module);
+  TF_RETURN_IF_ERROR(
+      RemoveUnnecessaryCopies(ordering, existing_copies, module));
+
+  MaybeDumpModule("after removing unnecessary copies", *module);
+
+  TF_RETURN_IF_ERROR(AddSpecialCaseCopies(*call_graph, module));
+
+  MaybeDumpModule("after adding special-case copies", *module);
+
+  TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status());
+  TF_RETURN_IF_ERROR(dce.Run(module).status());
+  TF_DCHECK_OK(VerifyNoLiveRangeInterference(module));
+
+  MaybeDumpModule("after copy insertion", *module);
+
+  if (VLOG_IS_ON(1)) {
+    int64 num_total_copies = 0;
+    for (HloComputation* computation : module->computations()) {
+      for (HloInstruction* instruction : computation->instructions()) {
+        if (instruction->opcode() == HloOpcode::kCopy) {
+          num_total_copies++;
+        }
+      }
     }
-    instructions_to_copy.push_back(root_copier);
+    VLOG(1) << "Num copies before copy-insertion: " << existing_copies.size();
+    VLOG(1) << "Num copies after copy-insertion: " << num_total_copies;
   }
 
-  // Add copies of while 'init' operand instructions, if needed. 'shared_copies'
-  // is used to ensure that multiple while loops can share a single copy of the
-  // same entry parameter or constant, if all loops use it read-only.
-  //
-  // TODO(b/33301720) Remove redundant while instruction copies.
-  FlatMap<const HloInstruction*, HloInstruction*> shared_copies;
-  for (HloInstruction* while_hlo : while_instructions) {
-    // Fix read_only_indices to account for entry constants. Also
-    // initialize copy_overrides, which ensures a single copy for each read-only
-    // constant that is used in multiple while loops.
-    ShapeTree<bool>* read_only_indices =
-        &while_body_read_only_indices[while_hlo->while_body()];
-    TF_ASSIGN_OR_RETURN(
-        const ShapeTree<HloInstruction*> copy_overrides,
-        RevertReadOnlyIndicesForConstants(while_hlo, points_to_analysis,
-                                          read_only_indices, &shared_copies));
-    // Create InstructionCopier for init operand of while instruction.
-    HloInstruction* init_hlo = while_hlo->mutable_operand(0);
-    InstructionCopier init_copier(init_hlo, {while_hlo});
-    init_copier.SetReadOnlyIndices(*read_only_indices);
-    init_copier.SetCopyOverrides(copy_overrides);
-    // Record 'init' buffer indices which point-to a Constant or Parameter.
-    TF_RETURN_IF_ERROR(init_copier.RecordIndicesWhichPointToParamOrConstant(
-        points_to_analysis));
-    // Record indices necessary to colocate while and init operand buffers.
-    TF_RETURN_IF_ERROR(init_copier.RecordIndicesToCopyForColocatingBuffers(
-        *liveness, while_hlo, /*read_only_indices_out=*/nullptr));
-    instructions_to_copy.push_back(init_copier);
+  return true;
+}
+
+namespace {
+
+bool IsWhileBody(const HloComputation* computation,
+                 const CallGraph& call_graph) {
+  const CallGraphNode& node = call_graph.GetNode(computation);
+
+  if (node.context() == CallContext::kSequential &&
+      !node.caller_callsites().empty()) {
+    // Callgraph should be flattened so sequential context computations can
+    // have at most one caller.
+    CHECK_EQ(node.caller_callsites().size(), 1);
+    const HloInstruction* calling_instruction =
+        node.caller_callsites()[0].instruction();
+    if (calling_instruction->opcode() == HloOpcode::kWhile &&
+        calling_instruction->while_body() == node.computation()) {
+      return true;
+    }
   }
+  return false;
+}
 
-  for (InstructionCopier& to_copy : instructions_to_copy) {
-    if (to_copy.HasAllIndicesFalse()) {
+}  // namespace
+
+/* static */ StatusOr<bool> CopyInsertion::AddCopiesForBufferAssignment(
+    HloModule* module) {
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloDataflowAnalysis> dataflow,
+                      HloDataflowAnalysis::Run(module));
+
+  bool changed = false;
+
+  // If a buffer live out of a computation is a constant, a parameter, or not
+  // defined in the computation, then copy it to account for the limited
+  // computation-scoped analysis in buffer assignment. An exception to this rule
+  // is the while body which is handled properly without copies.
+  for (HloComputation* computation : module->computations()) {
+    if (computation == module->entry_computation() ||
+        IsWhileBody(computation, *call_graph)) {
       continue;
     }
-    changed = true;
 
-    // Copy instruction at recorded buffer indices.
-    HloComputation* computation = to_copy.instruction()->parent();
-    HloInstruction* copy = to_copy.Copy();
-    if (to_copy.instruction() == computation->root_instruction()) {
-      computation->set_root_instruction(copy);
+    HloInstruction* root = computation->root_instruction();
+    ShapeTree<bool> indices_to_copy(root->shape(), /*init_value=*/false);
+    bool copy_root = false;
+    for (const auto& pair : dataflow->GetInstructionValueSet(root)) {
+      const ShapeIndex& index = pair.first;
+      const HloValueSet& value_set = pair.second;
+      for (const HloValue* value : value_set.values()) {
+        HloInstruction* def = value->defining_instruction();
+        if (def->parent() != computation ||
+            def->opcode() == HloOpcode::kConstant ||
+            def->opcode() == HloOpcode::kParameter) {
+          *indices_to_copy.mutable_element(index) = true;
+          copy_root = true;
+        }
+      }
+    }
+    if (copy_root) {
+      TF_ASSIGN_OR_RETURN(
+          HloInstruction * root_copy,
+          computation->DeepCopyInstruction(root, &indices_to_copy));
+      computation->set_root_instruction(root_copy);
+      changed = true;
     }
   }
 
-  VLOG(3) << "After copy insertion for module " << module->name();
-  XLA_VLOG_LINES(3, module->ToString());
+  TupleSimplifier tuple_simplifier;
+  HloDCE dce;
+  TF_ASSIGN_OR_RETURN(bool tuple_simplifier_changed,
+                      tuple_simplifier.Run(module));
+  TF_ASSIGN_OR_RETURN(bool dce_changed, dce.Run(module));
 
-  return changed;
+  return changed || tuple_simplifier_changed || dce_changed;
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/copy_insertion.h b/tensorflow/compiler/xla/service/copy_insertion.h
index 28bb62e40c7674960dbb1bb63dc8967b06956028..65e3d31e347e2cb249a072e7d06ca10c55401748 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.h
+++ b/tensorflow/compiler/xla/service/copy_insertion.h
@@ -25,12 +25,25 @@ limitations under the License.
 
 namespace xla {
 
-// HLO pass which inserts a copy of the root instruction (creating a new root)
-// if the root is or points-to any constant or parameter instruction.
-// If the root instruction is a Tuple, only tuple elements which point to
-// constant or parameter instructions will be copied.
-// Copy insertion is necessary because constant and parameter arrays have
-// different lifetimes than computation results.
+// Copy insertion is a legalization HLO pass which inserts copies (kCopy
+// instructions) to eliminate several kinds of problems in the HLO module.
+//
+//   (1) Entry parameter or a constant live out of the entry computation.  Entry
+//       computation arguments and constants have different lifetimes than the
+//       computation result and cannot share the same allocation. Parameters and
+//       constants live out of non-entry computations do not need copies.
+//
+//   (2) Different values which are simultaneously live and which must be held
+//       in the same buffer. This can occur in while bodies. Specifically, the
+//       while loop state (the arguments to the while instruction) is updated
+//       in-place and the update may clobber the value from the previous
+//       iteration before the previous value is dead. Computations called from
+//       kCall instructions do not need such copies because kCall has no update
+//       in-place semantics.
+//
+//   (3) The buffer set of the root instruction of the entry computation must be
+//       unambiguous and distinct. That is, InstructionAliasSet::IsAmbiguous and
+//       InstructionAliasSet::IsDistinct return true.
 class CopyInsertion : public HloPassInterface {
  public:
   tensorflow::StringPiece name() const override { return "copy-insertion"; }
@@ -39,14 +52,16 @@ class CopyInsertion : public HloPassInterface {
   // (copies were inserted).
   StatusOr<bool> Run(HloModule* module) override;
 
- protected:
-  // Returns a copy of `hlo`. Looks in inserted_copies_ first to avoid making
-  // duplicate copies.
-  StatusOr<HloInstruction*> FindOrInsertCopy(HloInstruction* hlo);
-
-  // A map containing all copies inserted during the copy insertion pass. The
-  // key is the copied instruction and the value is the copy.
-  tensorflow::gtl::FlatMap<HloInstruction*, HloInstruction*> inserted_copies_;
+  // The CPU and GPU backend need additional copies added due to deficiencies in
+  // buffer assignment. Specifically, copies are needed for constants live-out
+  // of computations, and for values which are live-in and live-out of the same
+  // computation. These copies are needed because buffer-assignment uses a
+  // computation-scoped analyis (TuplePointsToAnalysis) and has limited
+  // visibility across computation boundaries. This method adds these necessary
+  // copies. Returns whether the module was modified.
+  //
+  // TODO(b/62548313): Remove this when buffer assignment is module-scoped.
+  static StatusOr<bool> AddCopiesForBufferAssignment(HloModule* module);
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc
index a2eacc5c7dae2424e01fdd49d82546b5488d4312..8388574716ad1b78eb8868a8cd732005050b3310 100644
--- a/tensorflow/compiler/xla/service/copy_insertion_test.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc
@@ -17,18 +17,19 @@ limitations under the License.
 
 #include <set>
 
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 
 namespace op = xla::testing::opcode_matchers;
 
@@ -37,35 +38,53 @@ namespace {
 
 using ::testing::UnorderedElementsAre;
 
+int64 CountCopies(const HloComputation& computation) {
+  int64 count = 0;
+  for (const auto& instruction : computation.instructions()) {
+    if (instruction->opcode() == HloOpcode::kCopy) {
+      count++;
+    }
+  }
+  return count;
+}
+
+int64 CountCopies(const HloModule& module) {
+  int64 count = 0;
+  for (const auto& computation : module.computations()) {
+    count += CountCopies(*computation);
+  }
+  return count;
+}
+
+int64 CountControlEdges(const HloComputation& computation) {
+  int64 count = 0;
+  for (const auto& instruction : computation.instructions()) {
+    count += instruction->control_successors().size();
+  }
+  return count;
+}
+
+int64 CountControlEdges(const HloModule& module) {
+  int64 count = 0;
+  for (const auto& computation : module.computations()) {
+    count += CountControlEdges(*computation);
+  }
+  return count;
+}
+
 class CopyInsertionTest : public HloTestBase {
  protected:
   void InsertCopies(HloModule* module) {
     CopyInsertion copy_insertion;
-    EXPECT_IS_OK(copy_insertion.Run(module).status());
-
-    // Verify the points to set of the root of the computation after copy
-    // insertion contains no constants or parameters, and is distinct and
-    // non-ambiguous.
-    auto points_to_analysis =
-        TuplePointsToAnalysis::Run(module).ConsumeValueOrDie();
-    const auto& points_to = points_to_analysis->GetPointsToSet(
-        module->entry_computation()->root_instruction());
-    EXPECT_TRUE(points_to.IsDistinct());
-    EXPECT_TRUE(!points_to.IsAmbiguous());
-
-    auto maybe_live_out_buffers =
-        points_to_analysis
-            ->GetPointsToSet(module->entry_computation()->root_instruction())
-            .CreateFlattenedSet();
-
-    for (const LogicalBuffer* buffer : maybe_live_out_buffers) {
-      EXPECT_NE(buffer->instruction()->opcode(), HloOpcode::kConstant);
-      EXPECT_NE(buffer->instruction()->opcode(), HloOpcode::kParameter);
-    }
+    ASSERT_IS_OK(copy_insertion.Run(module).status());
   }
+
+  const Shape scalar_shape_ = ShapeUtil::MakeShape(F32, {});
 };
 
 TEST_F(CopyInsertionTest, SingleParameter) {
+  // Computation is a single parameter passed into a tuple. The parameter should
+  // be copied before entering the tuple.
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* x = builder.AddInstruction(
       HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "x"));
@@ -77,14 +96,15 @@ TEST_F(CopyInsertionTest, SingleParameter) {
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
 
-  HloInstruction* old_root = module->entry_computation()->root_instruction();
   InsertCopies(module.get());
 
   EXPECT_THAT(module->entry_computation()->root_instruction(),
-              op::Tuple(op::Copy(old_root->operand(0))));
+              op::Tuple(op::Copy(x)));
 }
 
 TEST_F(CopyInsertionTest, SingleConstant) {
+  // Computation is a single constant passed into a tuple. The parameter should
+  // be copied before entering the tuple.
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* constant = builder.AddInstruction(
       HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
@@ -96,11 +116,42 @@ TEST_F(CopyInsertionTest, SingleConstant) {
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
 
-  HloInstruction* old_root = module->entry_computation()->root_instruction();
   InsertCopies(module.get());
+  EXPECT_EQ(CountCopies(*module), 1);
 
   EXPECT_THAT(module->entry_computation()->root_instruction(),
-              op::Tuple(op::Copy(old_root->operand(0))));
+              op::Tuple(op::Copy(constant)));
+}
+
+TEST_F(CopyInsertionTest, ExistingCopiesNotRemoved) {
+  // Verify that an kCopy instructions which exist in the pass before
+  // copy-insertion remain in the graph after copy-insertion.
+  auto module = CreateNewModule();
+
+  auto builder = HloComputation::Builder(TestName());
+  HloInstruction* constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
+  HloInstruction* copy_1 = builder.AddInstruction(HloInstruction::CreateUnary(
+      constant->shape(), HloOpcode::kCopy, constant));
+  HloInstruction* copy_2 = builder.AddInstruction(HloInstruction::CreateUnary(
+      constant->shape(), HloOpcode::kCopy, constant));
+  HloInstruction* add = builder.AddInstruction(HloInstruction::CreateBinary(
+      constant->shape(), HloOpcode::kAdd, copy_1, copy_2));
+  HloInstruction* add_copy = builder.AddInstruction(
+      HloInstruction::CreateUnary(constant->shape(), HloOpcode::kCopy, add));
+
+  module->AddEntryComputation(builder.Build());
+
+  EXPECT_EQ(CountCopies(*module), 3);
+
+  InsertCopies(module.get());
+
+  EXPECT_EQ(CountCopies(*module), 3);
+
+  EXPECT_EQ(module->entry_computation()->root_instruction(), add_copy);
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      op::Copy(op::Add(op::Copy(op::Constant()), op::Copy(op::Constant()))));
 }
 
 TEST_F(CopyInsertionTest, MultipleConstantsAndParameters) {
@@ -127,12 +178,12 @@ TEST_F(CopyInsertionTest, MultipleConstantsAndParameters) {
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
 
-  HloInstruction* old_root = module->entry_computation()->root_instruction();
   InsertCopies(module.get());
+  EXPECT_EQ(CountCopies(*module), 2);
 
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
-              op::Tuple(op::Copy(old_root->operand(0)),
-                        op::Copy(old_root->operand(1)), old_root->operand(2)));
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      op::Tuple(op::Copy(constant2), op::Copy(x), op::Add(constant1, y)));
 }
 
 TEST_F(CopyInsertionTest, AmbiguousPointsToSet) {
@@ -165,6 +216,7 @@ TEST_F(CopyInsertionTest, AmbiguousPointsToSet) {
 
   HloInstruction* old_root = module->entry_computation()->root_instruction();
   InsertCopies(module.get());
+  EXPECT_EQ(CountCopies(*module), 2);
 
   EXPECT_THAT(module->entry_computation()->root_instruction(),
               op::Tuple(op::Copy(op::GetTupleElement(old_root)),
@@ -187,6 +239,7 @@ TEST_F(CopyInsertionTest, BitcastParameter) {
 
   HloInstruction* old_root = module->entry_computation()->root_instruction();
   InsertCopies(module.get());
+  EXPECT_EQ(CountCopies(*module), 1);
 
   EXPECT_THAT(module->entry_computation()->root_instruction(),
               op::Copy(old_root));
@@ -208,6 +261,7 @@ TEST_F(CopyInsertionTest, BitcastConstant) {
 
   HloInstruction* old_root = module->entry_computation()->root_instruction();
   InsertCopies(module.get());
+  EXPECT_EQ(CountCopies(*module), 1);
 
   EXPECT_THAT(module->entry_computation()->root_instruction(),
               op::Copy(old_root));
@@ -227,11 +281,11 @@ TEST_F(CopyInsertionTest, BitcastTupleElementParameter) {
 
   EXPECT_THAT(x->users(), UnorderedElementsAre(bitcast));
 
-  HloInstruction* old_root = module->entry_computation()->root_instruction();
   InsertCopies(module.get());
+  EXPECT_EQ(CountCopies(*module), 1);
 
   EXPECT_THAT(module->entry_computation()->root_instruction(),
-              op::Tuple(op::Copy(old_root->operand(0))));
+              op::Tuple(op::Copy(bitcast)));
 }
 
 TEST_F(CopyInsertionTest, NestedTupleParameter) {
@@ -257,6 +311,8 @@ TEST_F(CopyInsertionTest, NestedTupleParameter) {
 
   HloInstruction* old_root = module->entry_computation()->root_instruction();
   InsertCopies(module.get());
+  EXPECT_EQ(CountCopies(*module), 3);
+
   HloInstruction* new_root = module->entry_computation()->root_instruction();
   EXPECT_NE(old_root, new_root);
 
@@ -283,7 +339,7 @@ TEST_F(CopyInsertionTest, ElementOfNestedTupleParameter) {
            ShapeUtil::MakeShape(F32, {42})}),
       "param0"));
 
-  // The return value of the computation is the zero-th elemnt of the nested
+  // The return value of the computation is the zero-th element of the nested
   // tuple. This element is itself a tuple.
   auto gte = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
       ShapeUtil::GetSubshape(param->shape(), {0}), param, 0));
@@ -293,12 +349,13 @@ TEST_F(CopyInsertionTest, ElementOfNestedTupleParameter) {
 
   EXPECT_EQ(gte, module->entry_computation()->root_instruction());
 
-  HloInstruction* old_root = module->entry_computation()->root_instruction();
   InsertCopies(module.get());
+  EXPECT_EQ(CountCopies(*module), 2);
 
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
-              op::Tuple(op::Copy(op::GetTupleElement(old_root)),
-                        op::Copy(op::GetTupleElement(old_root))));
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      op::Tuple(op::Copy(op::GetTupleElement(op::GetTupleElement(param))),
+                op::Copy(op::GetTupleElement(op::GetTupleElement(param)))));
 }
 
 TEST_F(CopyInsertionTest, AmbiguousTopLevelRoot) {
@@ -331,6 +388,7 @@ TEST_F(CopyInsertionTest, AmbiguousTopLevelRoot) {
 
   HloInstruction* old_root = module->entry_computation()->root_instruction();
   InsertCopies(module.get());
+  EXPECT_EQ(CountCopies(*module), 1);
 
   EXPECT_THAT(module->entry_computation()->root_instruction(),
               op::Copy(old_root));
@@ -346,12 +404,10 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
   // The parameter 'nested' specifies the loop state shape from which to
   // read the induction variable.
   std::unique_ptr<HloComputation> BuildConditionComputation(
-      bool nested = false) {
+      const Shape& loop_state_shape) {
     auto builder = HloComputation::Builder(TestName() + ".Condition");
     auto limit_const = builder.AddInstruction(
         HloInstruction::CreateConstant(Literal::CreateR0<int32>(10)));
-    const Shape& loop_state_shape =
-        nested ? nested_loop_state_shape_ : loop_state_shape_;
     auto loop_state = builder.AddInstruction(
         HloInstruction::CreateParameter(0, loop_state_shape, "loop_state"));
     auto induction_variable =
@@ -582,7 +638,7 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
       auto loop_state_init = builder.AddInstruction(
           HloInstruction::CreateTuple({induction_var_init, inner_init}));
       auto while_hlo = builder.AddInstruction(HloInstruction::CreateWhile(
-          loop_state_shape_, condition, body, loop_state_init));
+          loop_state_init->shape(), condition, body, loop_state_init));
       module_->AddEntryComputation(builder.Build());
       return while_hlo;
     }
@@ -658,11 +714,28 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
     auto one_vec = builder.AddInstruction(HloInstruction::CreateConstant(
         Literal::CreateR1<float>({1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f})));
     // Take a reference to 'data_init' to make it interfere with while result.
-    builder.AddInstruction(HloInstruction::CreateBinary(
+    auto add = builder.AddInstruction(HloInstruction::CreateBinary(
         data_shape_, HloOpcode::kAdd, data_init, one_vec));
 
-    return BuildWhileInstructionWithCustomInit(loop_state_shape_, data_init,
-                                               &builder);
+    auto xla_while = BuildWhileInstructionWithCustomInit(loop_state_shape_,
+                                                         data_init, &builder);
+
+    // Add an additional binary operation operating on the while and the
+    // interfering add so that neither operation is dead.
+    auto gte = xla_while->parent()->AddInstruction(
+        HloInstruction::CreateGetTupleElement(
+            ShapeUtil::GetSubshape(xla_while->shape(), {1}), xla_while, 1));
+    auto sub = xla_while->parent()->AddInstruction(HloInstruction::CreateBinary(
+        data_shape_, HloOpcode::kSubtract, add, gte));
+    auto gte0 = xla_while->parent()->AddInstruction(
+        HloInstruction::CreateGetTupleElement(
+            ShapeUtil::GetSubshape(xla_while->shape(), {0}), xla_while, 0));
+    auto tuple = xla_while->parent()->AddInstruction(
+        HloInstruction::CreateTuple({gte0, sub}));
+
+    xla_while->parent()->set_root_instruction(tuple);
+
+    return xla_while;
   }
 
   HloInstruction* BuildWhileInstructionWithCustomInit(
@@ -672,8 +745,8 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
         ShapeUtil::Equal(loop_state_shape, nested_loop_state_shape_);
     auto induction_var_init = builder->AddInstruction(
         HloInstruction::CreateConstant(Literal::CreateR0<int32>(0)));
-    auto condition =
-        module_->AddEmbeddedComputation(BuildConditionComputation(nested));
+    auto condition = module_->AddEmbeddedComputation(
+        BuildConditionComputation(loop_state_shape));
     auto body = module_->AddEmbeddedComputation(
         BuildIndependentBodyComputation(nested));
     auto loop_state_init = builder->AddInstruction(
@@ -706,23 +779,21 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
 // CopyInsertion pass should not generate any copies.
 //
 TEST_F(WhileCopyInsertionTest, IndependentTupleElements) {
-  auto condition = module_->AddEmbeddedComputation(BuildConditionComputation());
+  auto condition = module_->AddEmbeddedComputation(
+      BuildConditionComputation(loop_state_shape_));
   auto body =
       module_->AddEmbeddedComputation(BuildIndependentBodyComputation());
   auto while_hlo = BuildWhileInstruction(condition, body);
 
-  const HloInstruction* old_init = while_hlo->operand(0);
-  HloInstruction* old_root = body->root_instruction();
   InsertCopies(module_.get());
-  HloInstruction* new_root = body->root_instruction();
-  const HloInstruction* new_init = while_hlo->operand(0);
 
-  // No copies should be inserted so root should not be updated.
-  EXPECT_EQ(old_root, new_root);
+  // Body should have no copies as the adds can be done inplace.
+  EXPECT_EQ(CountCopies(*body), 0);
+  EXPECT_EQ(CountControlEdges(*module_), 0);
 
-  // Both init indices need copies.
-  EXPECT_THAT(new_init, op::Tuple(op::Copy(old_init->operand(0)),
-                                  op::Copy(old_init->operand(1))));
+  // Both init indices need copies as they are constants.
+  EXPECT_THAT(while_hlo->operand(0),
+              op::Tuple(op::Copy(op::Constant()), op::Copy(op::Constant())));
 }
 
 // Tests while body computation with dependent tuple elements:
@@ -737,20 +808,33 @@ TEST_F(WhileCopyInsertionTest, IndependentTupleElements) {
 //     Tuple(Copy(out0), out1)
 //
 TEST_F(WhileCopyInsertionTest, DependentTupleElements) {
-  auto condition = module_->AddEmbeddedComputation(BuildConditionComputation());
+  auto condition = module_->AddEmbeddedComputation(
+      BuildConditionComputation(loop_state_shape_));
   auto body = module_->AddEmbeddedComputation(BuildDependentBodyComputation());
   auto while_hlo = BuildWhileInstruction(condition, body);
 
-  const HloInstruction* old_init = while_hlo->operand(0);
-  HloInstruction* old_root = body->root_instruction();
   InsertCopies(module_.get());
-  HloInstruction* new_root = body->root_instruction();
-  const HloInstruction* new_init = while_hlo->operand(0);
 
-  EXPECT_THAT(new_root,
-              op::Tuple(op::Copy(old_root->operand(0)), old_root->operand(1)));
-  EXPECT_THAT(new_init, op::Tuple(op::Copy(old_init->operand(0)),
-                                  op::Copy(old_init->operand(1))));
+  EXPECT_EQ(CountCopies(*body), 1);
+  EXPECT_EQ(CountControlEdges(*body), 0);
+
+  EXPECT_THAT(
+      body->root_instruction(),
+      op::Tuple(op::Add(), op::Add(op::GetTupleElement(), op::Broadcast())));
+
+  auto add = body->root_instruction()->operand(0);
+  auto bcast = body->root_instruction()->operand(1)->operand(1);
+  ASSERT_EQ(add->opcode(), HloOpcode::kAdd);
+  ASSERT_EQ(bcast->opcode(), HloOpcode::kBroadcast);
+
+  EXPECT_THAT(
+      while_hlo->while_body()->root_instruction(),
+      op::Tuple(op::Add(op::Copy(), op::Constant()),
+                op::Add(op::GetTupleElement(), op::Broadcast(op::Copy()))));
+
+  // Both init indices need copies as they are constants.
+  EXPECT_THAT(while_hlo->operand(0),
+              op::Tuple(op::Copy(op::Constant()), op::Copy(op::Constant())));
 }
 
 // Tests while body computation with read-only tuple element 0:
@@ -768,33 +852,26 @@ TEST_F(WhileCopyInsertionTest, DependentTupleElements) {
 //
 // CopyInsertion pass should not generate any copies for the while body.
 TEST_F(WhileCopyInsertionTest, DependentTupleElements_OneReadOnly) {
-  auto condition = module_->AddEmbeddedComputation(BuildConditionComputation());
+  auto condition = module_->AddEmbeddedComputation(
+      BuildConditionComputation(loop_state_shape_));
   auto body = module_->AddEmbeddedComputation(
       BuildDependentBodyOneReadOnlyComputation());
-  auto while_hlo = BuildWhileInstruction(condition, body);
+  BuildWhileInstruction(condition, body);
 
-  const HloInstruction* old_init = while_hlo->operand(0);
-  HloInstruction* old_root = body->root_instruction();
   InsertCopies(module_.get());
-  HloInstruction* new_root = body->root_instruction();
-  const HloInstruction* new_init = while_hlo->operand(0);
-
-  // No copies should be inserted in the body, so root should not be updated.
-  EXPECT_EQ(old_root, new_root);
 
-  // Both indices need copies, even though Index 0 is read-only, since both are
-  // constants, which must be copied.
-  EXPECT_THAT(new_init, op::Tuple(op::Copy(old_init->operand(0)),
-                                  op::Copy(old_init->operand(1))));
+  // No copies or control edges should be inserted. The body is legal as is.
+  EXPECT_EQ(CountCopies(*body), 0);
+  EXPECT_EQ(CountControlEdges(*body), 0);
 }
 
 // Same as above, but with two while loops, sharing entry parameters.
 TEST_F(WhileCopyInsertionTest,
        DependentTupleElements_OneReadOnly_TwoLoops_EntryParams) {
-  auto condition1 =
-      module_->AddEmbeddedComputation(BuildConditionComputation());
-  auto condition2 =
-      module_->AddEmbeddedComputation(BuildConditionComputation());
+  auto condition1 = module_->AddEmbeddedComputation(
+      BuildConditionComputation(loop_state_shape_));
+  auto condition2 = module_->AddEmbeddedComputation(
+      BuildConditionComputation(loop_state_shape_));
   auto body1 = module_->AddEmbeddedComputation(
       BuildDependentBodyOneReadOnlyComputation());
   auto body2 = module_->AddEmbeddedComputation(
@@ -812,30 +889,46 @@ TEST_F(WhileCopyInsertionTest,
       loop_state_shape_, condition1, body1, loop_init));
   auto while_hlo2 = builder.AddInstruction(HloInstruction::CreateWhile(
       loop_state_shape_, condition2, body2, loop_init));
-  module_->AddEntryComputation(builder.Build());
+
+  // Add a couple elements from each of the while so both whiles are live.
+  auto gte1 = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
+      ShapeUtil::GetSubshape(while_hlo1->shape(), {0}), while_hlo1, 0));
+  auto gte2 = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
+      ShapeUtil::GetSubshape(while_hlo2->shape(), {0}), while_hlo2, 0));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(gte1->shape(), HloOpcode::kAdd, gte1, gte2));
+
+  auto entry = module_->AddEntryComputation(builder.Build());
 
   InsertCopies(module_.get());
 
-  // Both while loops alias iter_param, since index 0 is read-only in the body.
-  EXPECT_EQ(while_hlo1->operand(0)->operand(0),
-            while_hlo2->operand(0)->operand(0));
-  EXPECT_EQ(while_hlo1->operand(0)->operand(0), iter_param);
+  // Neither body should have any copies or control edges in them.
+  EXPECT_EQ(CountCopies(*body1), 0);
+  EXPECT_EQ(CountCopies(*body2), 0);
+  EXPECT_EQ(CountControlEdges(*body1), 0);
+  EXPECT_EQ(CountControlEdges(*body2), 0);
 
-  // Each while loop gets its own copy of data_param, since index 1 is not
-  // read-only in the body.
+  // Only two copies should be necessary. Each of the whiles should have
+  // a copy of tuple element 1 (init value is a parameter, and the element is
+  // not non-read-only) so each of the while bodies gets its own buffer to write
+  // element 1 into.
+  EXPECT_EQ(CountCopies(*entry), 2);
+
+  EXPECT_EQ(while_hlo1->operand(0)->operand(1)->opcode(), HloOpcode::kCopy);
+  EXPECT_EQ(while_hlo2->operand(0)->operand(1)->opcode(), HloOpcode::kCopy);
+
+  // The two copies of element 1 should be different.
   EXPECT_NE(while_hlo1->operand(0)->operand(1),
             while_hlo2->operand(0)->operand(1));
-  EXPECT_THAT(while_hlo1->operand(0)->operand(1), op::Copy(data_param));
-  EXPECT_THAT(while_hlo2->operand(0)->operand(1), op::Copy(data_param));
 }
 
 // Same as above, but with two while loops, sharing non-parameters.
 TEST_F(WhileCopyInsertionTest,
        DependentTupleElements_OneReadOnly_TwoLoops_NonParams) {
-  auto condition1 =
-      module_->AddEmbeddedComputation(BuildConditionComputation());
-  auto condition2 =
-      module_->AddEmbeddedComputation(BuildConditionComputation());
+  auto condition1 = module_->AddEmbeddedComputation(
+      BuildConditionComputation(loop_state_shape_));
+  auto condition2 = module_->AddEmbeddedComputation(
+      BuildConditionComputation(loop_state_shape_));
   auto body1 = module_->AddEmbeddedComputation(
       BuildDependentBodyOneReadOnlyComputation());
   auto body2 = module_->AddEmbeddedComputation(
@@ -858,21 +951,28 @@ TEST_F(WhileCopyInsertionTest,
       loop_state_shape_, condition1, body1, loop_init));
   auto while_hlo2 = builder.AddInstruction(HloInstruction::CreateWhile(
       loop_state_shape_, condition2, body2, loop_init));
-  module_->AddEntryComputation(builder.Build());
+
+  // Add a couple elements from each of the while so both whiles are not dead.
+  auto gte1 = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
+      ShapeUtil::GetSubshape(while_hlo1->shape(), {0}), while_hlo1, 0));
+  auto gte2 = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
+      ShapeUtil::GetSubshape(while_hlo2->shape(), {0}), while_hlo2, 0));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(gte1->shape(), HloOpcode::kAdd, gte1, gte2));
+  auto entry = module_->AddEntryComputation(builder.Build());
 
   InsertCopies(module_.get());
 
-  // No copies of iter_value are necessary, since index 0 is read-only in both
-  // while bodies.
-  EXPECT_EQ(while_hlo1->operand(0)->operand(0), iter_value);
-  EXPECT_EQ(while_hlo2->operand(0)->operand(0), iter_value);
+  // Ideally only one copy should be necessary. One of the whiles should
+  // have a copy of tuple element 1 (the non-read-only element) so each of the
+  // while bodies gets its own buffer to write element 1 into. However, the
+  // analysis isn't perfect and adds an additional copy of element 0.
+  EXPECT_EQ(CountCopies(*entry), 2);
 
-  // Each while loop gets its own copy of data_value, since index 1 is not
-  // read-only in the body.
-  EXPECT_NE(while_hlo1->operand(0)->operand(1),
-            while_hlo2->operand(0)->operand(1));
-  EXPECT_THAT(while_hlo1->operand(0)->operand(1), op::Copy(data_value));
-  EXPECT_THAT(while_hlo2->operand(0)->operand(1), op::Copy(data_value));
+  EXPECT_THAT(while_hlo1->operand(0),
+              op::Tuple(op::Exp(), op::Copy(op::Exp())));
+  EXPECT_THAT(while_hlo2->operand(0),
+              op::Tuple(op::Exp(), op::Copy(op::Exp())));
 }
 
 // Tests while body computation with nested tuple elements:
@@ -905,18 +1005,34 @@ TEST_F(WhileCopyInsertionTest,
 //                     Tuple  // new root
 //
 TEST_F(WhileCopyInsertionTest, NestedTupleElements) {
-  auto condition =
-      module_->AddEmbeddedComputation(BuildConditionComputation(true));
+  auto condition = module_->AddEmbeddedComputation(
+      BuildConditionComputation(nested_loop_state_shape_));
   auto body = module_->AddEmbeddedComputation(BuildNestedBodyComputation());
   BuildWhileInstruction(condition, body, true);
 
-  HloInstruction* old_root = body->root_instruction();
+  //  HloInstruction* old_root = body->root_instruction();
   InsertCopies(module_.get());
 
-  EXPECT_THAT(body->root_instruction(),
-              op::Tuple(old_root->operand(0),
-                        op::Tuple(old_root->operand(1)->operand(0),
-                                  op::Copy(old_root->operand(1)->operand(1)))));
+  // The only copy necessary is for the kReverse as it cannot be done
+  // in-place (instruction can share buffer with operand). The other elements of
+  // the loop state are kAdd instructions which can be done in-place.
+  EXPECT_EQ(CountCopies(*body), 1);
+
+  // Each element of the init needs a copy as all are constants.
+  EXPECT_EQ(CountCopies(*module_), 4);
+
+  // Either the kReverse itself must be copied or the operand of the kReverse
+  // must be copied.
+  if (body->root_instruction()->operand(1)->operand(1)->opcode() ==
+      HloOpcode::kCopy) {
+    EXPECT_THAT(
+        body->root_instruction(),
+        op::Tuple(op::Add(), op::Tuple(op::Add(), op::Copy(op::Reverse()))));
+  } else {
+    EXPECT_THAT(
+        body->root_instruction(),
+        op::Tuple(op::Add(), op::Tuple(op::Add(), op::Reverse(op::Copy()))));
+  }
 }
 
 // Tests while init instruction which points-to a constant.
@@ -927,11 +1043,13 @@ TEST_F(WhileCopyInsertionTest, NestedTupleElements) {
 //
 TEST_F(WhileCopyInsertionTest, InitPointsToConstant) {
   auto while_hlo = BuildWhileInstruction_InitPointsToConstant();
-  auto old_init = while_hlo->operand(0);
+
   InsertCopies(module_.get());
+  EXPECT_EQ(CountCopies(*while_hlo->while_body()), 0);
+  EXPECT_EQ(CountCopies(*module_), 2);
 
-  EXPECT_THAT(while_hlo->operand(0), op::Tuple(op::Copy(old_init->operand(0)),
-                                               op::Copy(old_init->operand(1))));
+  EXPECT_THAT(while_hlo->operand(0),
+              op::Tuple(op::Copy(op::Constant()), op::Copy(op::Constant())));
 }
 
 // Tests while init instruction which points-to a parameter.
@@ -942,11 +1060,13 @@ TEST_F(WhileCopyInsertionTest, InitPointsToConstant) {
 //
 TEST_F(WhileCopyInsertionTest, InitPointsToParameter) {
   auto while_hlo = BuildWhileInstruction_InitPointsToParameter();
-  auto old_init = while_hlo->operand(0);
+
   InsertCopies(module_.get());
+  EXPECT_EQ(CountCopies(*while_hlo->while_body()), 0);
+  EXPECT_EQ(CountCopies(*module_), 2);
 
-  EXPECT_THAT(while_hlo->operand(0), op::Tuple(op::Copy(old_init->operand(0)),
-                                               op::Copy(old_init->operand(1))));
+  EXPECT_THAT(while_hlo->operand(0),
+              op::Tuple(op::Copy(op::Constant()), op::Copy(op::Parameter())));
 }
 
 // Tests while init instruction which has an ambiguous points-to set.
@@ -975,15 +1095,34 @@ TEST_F(WhileCopyInsertionTest, InitPointsToParameter) {
 //
 TEST_F(WhileCopyInsertionTest, InitPointsToAmbiguous) {
   auto while_hlo = BuildWhileInstruction_InitPointsToAmbiguous();
-  auto old_init = while_hlo->operand(0);
-  InsertCopies(module_.get());
 
-  EXPECT_THAT(
-      while_hlo->operand(0),
-      op::Tuple(
-          op::Copy(old_init->operand(0)),
-          op::Tuple(op::Copy(op::GetTupleElement(old_init->operand(1))),
-                    op::Copy(op::GetTupleElement(old_init->operand(1))))));
+  InsertCopies(module_.get());
+  EXPECT_EQ(CountCopies(*module_), 4);
+  // The entry computation requires three copies to resolve the ambiguity of two
+  // init elements and the constant passed in as one of the init elements.
+  EXPECT_EQ(CountCopies(*module_->entry_computation()), 3);
+  EXPECT_THAT(while_hlo->operand(0),
+              op::Tuple(op::Copy(op::Constant()),
+                        op::Tuple(op::Copy(op::GetTupleElement()),
+                                  op::Copy(op::GetTupleElement()))));
+
+  // The body requires one copy because the buffer set is not distinct: the
+  // result of one of the adds is written into two elements of the output of the
+  // loop body. Either element might be copied.
+  EXPECT_EQ(CountCopies(*while_hlo->while_body()), 1);
+  if (while_hlo->while_body()
+          ->root_instruction()
+          ->operand(1)
+          ->operand(0)
+          ->opcode() == HloOpcode::kCopy) {
+    EXPECT_THAT(
+        while_hlo->while_body()->root_instruction(),
+        op::Tuple(op::Add(), op::Tuple(op::Copy(op::Add()), op::Add())));
+  } else {
+    EXPECT_THAT(
+        while_hlo->while_body()->root_instruction(),
+        op::Tuple(op::Add(), op::Tuple(op::Add(), op::Copy(op::Add()))));
+  }
 }
 
 // Tests while init instruction which has a non-distinct points-to set.
@@ -1011,13 +1150,43 @@ TEST_F(WhileCopyInsertionTest, InitPointsToAmbiguous) {
 //
 TEST_F(WhileCopyInsertionTest, InitPointsToNonDistinct) {
   auto while_hlo = BuildWhileInstruction_InitPointsToNonDistinct();
-  auto old_init = while_hlo->operand(0);
+
   InsertCopies(module_.get());
 
-  EXPECT_THAT(while_hlo->operand(0),
-              op::Tuple(op::Copy(old_init->operand(0)),
-                        op::Tuple(op::Copy(old_init->operand(1)->operand(0)),
-                                  op::Copy(old_init->operand(1)->operand(0)))));
+  // The entry computation requires two copies to resolve the non-disinctness of
+  // two init elements and the constant passed in as one of the init
+  // elements. Either element can be copied for the distinctness issue.
+  EXPECT_EQ(CountCopies(*module_->entry_computation()), 2);
+  if (while_hlo->operand(0)->operand(1)->operand(0)->opcode() ==
+      HloOpcode::kCopy) {
+    EXPECT_THAT(
+        while_hlo->operand(0),
+        op::Tuple(op::Copy(op::Constant()),
+                  op::Tuple(op::Copy(op::Broadcast()), op::Broadcast())));
+  } else {
+    EXPECT_THAT(
+        while_hlo->operand(0),
+        op::Tuple(op::Copy(op::Constant()),
+                  op::Tuple(op::Broadcast(), op::Copy(op::Broadcast()))));
+  }
+
+  // The body requires one copy because the buffer set is not distinct: the
+  // result of one of the adds is written into two elements of the output of the
+  // loop body. Either element might be copied.
+  EXPECT_EQ(CountCopies(*while_hlo->while_body()), 1);
+  if (while_hlo->while_body()
+          ->root_instruction()
+          ->operand(1)
+          ->operand(0)
+          ->opcode() == HloOpcode::kCopy) {
+    EXPECT_THAT(
+        while_hlo->while_body()->root_instruction(),
+        op::Tuple(op::Add(), op::Tuple(op::Copy(op::Add()), op::Add())));
+  } else {
+    EXPECT_THAT(
+        while_hlo->while_body()->root_instruction(),
+        op::Tuple(op::Add(), op::Tuple(op::Add(), op::Copy(op::Add()))));
+  }
 }
 
 // Tests while init instruction buffer which interferes with while result
@@ -1031,11 +1200,13 @@ TEST_F(WhileCopyInsertionTest, InitPointsToNonDistinct) {
 //
 TEST_F(WhileCopyInsertionTest, InitPointsToInterfering) {
   auto while_hlo = BuildWhileInstruction_InitPointsToInterfering();
-  auto old_init = while_hlo->operand(0);
+
   InsertCopies(module_.get());
+  EXPECT_EQ(CountCopies(*module_), 2);
+  EXPECT_EQ(CountCopies(*while_hlo->while_body()), 0);
 
-  EXPECT_THAT(while_hlo->operand(0), op::Tuple(op::Copy(old_init->operand(0)),
-                                               op::Copy(old_init->operand(1))));
+  EXPECT_THAT(while_hlo->operand(0),
+              op::Tuple(op::Copy(op::Constant()), op::Copy(op::Broadcast())));
 }
 
 // Tests while init instruction buffer which has a non-distinct points-to set:
@@ -1044,18 +1215,21 @@ TEST_F(WhileCopyInsertionTest, InitPointsToInterfering) {
 //                  Parameter(F32, {8})))
 //
 // where the second and third parameters are identical *and* the tuple shared
-// by another while instruction..
+// by another while instruction.
 //
 // Verifies that the resulting point-to set is distinct in the resulting Tuple
 // (non-identical Copys). In other words, verifies that copy sharing does not
 // insert identical copies to the resulting tuple.
 TEST_F(WhileCopyInsertionTest, InitPointsToNonDistinctUsedByTwoWhileLoops) {
-  auto condition1 =
-      module_->AddEmbeddedComputation(BuildConditionComputation());
-  auto condition2 =
-      module_->AddEmbeddedComputation(BuildConditionComputation());
   // Loop body that outputs tuple comprises two elements dependent on the init
   // tuple.
+  const Shape& loop_state_shape = ShapeUtil::MakeTupleShape(
+      {induction_variable_shape_, data_shape_, data_shape_});
+
+  auto condition1 = module_->AddEmbeddedComputation(
+      BuildConditionComputation(loop_state_shape));
+  auto condition2 = module_->AddEmbeddedComputation(
+      BuildConditionComputation(loop_state_shape));
   auto body1 =
       module_->AddEmbeddedComputation(BuildDependentBodyComputation2());
   auto body2 =
@@ -1072,8 +1246,6 @@ TEST_F(WhileCopyInsertionTest, InitPointsToNonDistinctUsedByTwoWhileLoops) {
   auto loop_init = builder.AddInstruction(
       HloInstruction::CreateTuple({iter_param, data_param, data_param}));
 
-  const Shape& loop_state_shape = ShapeUtil::MakeTupleShape(
-      {induction_variable_shape_, data_shape_, data_shape_});
 
   // Two while loops shares the same loop init tuple.
   auto while_hlo1 = builder.AddInstruction(HloInstruction::CreateWhile(
@@ -1081,43 +1253,478 @@ TEST_F(WhileCopyInsertionTest, InitPointsToNonDistinctUsedByTwoWhileLoops) {
   auto while_hlo2 = builder.AddInstruction(HloInstruction::CreateWhile(
       loop_state_shape, condition2, body2, loop_init));
 
-  module_->AddEntryComputation(builder.Build());
+  // Add add instruction so neither while is dead.
+  auto gte1 = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
+      ShapeUtil::GetSubshape(while_hlo1->shape(), {0}), while_hlo1, 0));
+  auto gte2 = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
+      ShapeUtil::GetSubshape(while_hlo1->shape(), {0}), while_hlo2, 0));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(gte1->shape(), HloOpcode::kAdd, gte1, gte2));
 
-  auto points_to_analysis =
-      TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie();
+  module_->AddEntryComputation(builder.Build());
 
-  // Asserts that the init tuples before copy insertion is non-distinct.
-  ASSERT_FALSE(
-      points_to_analysis->GetPointsToSet(while_hlo1->operand(0)).IsDistinct());
-  ASSERT_FALSE(
-      points_to_analysis->GetPointsToSet(while_hlo2->operand(0)).IsDistinct());
+  InsertCopies(module_.get());
 
-  auto old_init1 = while_hlo1->operand(0);
-  auto old_init2 = while_hlo2->operand(0);
+  // None of the bodies should have copies or control flow edges.
+  EXPECT_EQ(CountCopies(*body1), 0);
+  EXPECT_EQ(CountCopies(*body2), 0);
 
-  InsertCopies(module_.get());
+  // The loop bodies pass through elements 1 and 2 in the init tuple, so ideally
+  // these should not need to be copied before either while. However, copy
+  // insertion is not able to reason about the transparency of elements through
+  // while bodies in all circumstances so extra copies are added (b/xxx).
+  EXPECT_EQ(CountCopies(*module_->entry_computation()), 2);
 
   EXPECT_THAT(while_hlo1->operand(0),
-              op::Tuple(op::Copy(old_init1->operand(0)),
-                        op::Copy(old_init1->operand(1)),
-                        op::Copy(old_init1->operand(2))));
-
+              op::Tuple(op::Copy(), op::Parameter(), op::Parameter()));
   EXPECT_THAT(while_hlo2->operand(0),
-              op::Tuple(op::Copy(old_init2->operand(0)),
-                        op::Copy(old_init2->operand(1)),
-                        op::Copy(old_init2->operand(2))));
-
-  // Verifies the init tuples after copy insertion is distinct.
-  points_to_analysis =
-      TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie();
-  const auto& points_to1 =
-      points_to_analysis->GetPointsToSet(while_hlo1->operand(0));
-  EXPECT_TRUE(points_to1.IsDistinct());
-
-  const auto& points_to2 =
-      points_to_analysis->GetPointsToSet(while_hlo2->operand(0));
-  EXPECT_TRUE(points_to2.IsDistinct());
+              op::Tuple(op::Copy(), op::Parameter(), op::Parameter()));
 }
 
+TEST_F(CopyInsertionTest, SwizzlingWhile) {
+  // Test a while instruction with a body which permutes its tuple parameter
+  // elements.
+  auto module = CreateNewModule();
+  const Shape loop_state_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  // Body simply interchanges the two tuple elements in the loop state.
+  auto body_builder = HloComputation::Builder("body");
+  auto body_param = body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, loop_state_shape, "param"));
+  auto body_element_0 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 0));
+  auto body_element_1 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 1));
+  body_builder.AddInstruction(
+      HloInstruction::CreateTuple({body_element_1, body_element_0}));
+  HloComputation* body = module->AddEmbeddedComputation(body_builder.Build());
+
+  auto cond_builder = HloComputation::Builder("condition");
+  cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, loop_state_shape, "param"));
+  auto cond_constant = cond_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+  cond_builder.AddInstruction(HloInstruction::CreateUnary(
+      cond_constant->shape(), HloOpcode::kNot, cond_constant));
+  HloComputation* condition =
+      module->AddEmbeddedComputation(cond_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
+  auto tuple = builder.AddInstruction(
+      HloInstruction::CreateTuple({constant1, constant2}));
+  auto xla_while = builder.AddInstruction(
+      HloInstruction::CreateWhile(loop_state_shape, condition, body, tuple));
+  module->AddEntryComputation(builder.Build());
+
+  InsertCopies(module.get());
+
+  EXPECT_EQ(CountCopies(*module), 6);
+
+  // The loop state elements should be copied at the parameter and at the root
+  // with a control edge in between (see DeepCopyAndAddControlEdges). This is
+  // technically one more copy than is strictly necessary, but in order to have
+  // only three copies the copies of different loop state elements must be
+  // ordered with a control edge.
+  EXPECT_EQ(CountCopies(*body), 4);
+  EXPECT_EQ(CountControlEdges(*body), 2);
+
+  EXPECT_THAT(body->root_instruction(),
+              op::Tuple(op::Copy(op::Copy()), op::Copy(op::Copy())));
+
+  EXPECT_EQ(CountCopies(*module->entry_computation()), 2);
+  EXPECT_THAT(xla_while->operand(0), op::Tuple(op::Copy(), op::Copy()));
+}
+
+TEST_F(CopyInsertionTest, SwizzlingWhileWithOneOp) {
+  // Test a while instruction with a body which permutes its tuple parameter
+  // elements and applies one operation to one of the elements. The addition of
+  // the operation (instruction) on the element makes the live range of the
+  // respective input and output elements different than if the instruction were
+  // not there (as in the SwizzlingWhile test above).
+  auto module = CreateNewModule();
+  const Shape loop_state_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  // Body interchanges the two tuple elements in the loop state and negates one
+  // of them.
+  auto body_builder = HloComputation::Builder("body");
+  auto body_param = body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, loop_state_shape, "param"));
+  auto body_element_0 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 0));
+  auto body_element_1 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 1));
+  auto negate = body_builder.AddInstruction(HloInstruction::CreateUnary(
+      scalar_shape_, HloOpcode::kNegate, body_element_1));
+  body_builder.AddInstruction(
+      HloInstruction::CreateTuple({negate, body_element_0}));
+  HloComputation* body = module->AddEmbeddedComputation(body_builder.Build());
+
+  auto cond_builder = HloComputation::Builder("condition");
+  cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, loop_state_shape, "param"));
+  auto cond_constant = cond_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+  cond_builder.AddInstruction(HloInstruction::CreateUnary(
+      cond_constant->shape(), HloOpcode::kNot, cond_constant));
+  HloComputation* condition =
+      module->AddEmbeddedComputation(cond_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
+  auto tuple = builder.AddInstruction(
+      HloInstruction::CreateTuple({constant1, constant2}));
+  auto xla_while = builder.AddInstruction(
+      HloInstruction::CreateWhile(loop_state_shape, condition, body, tuple));
+  module->AddEntryComputation(builder.Build());
+
+  InsertCopies(module.get());
+
+  EXPECT_EQ(CountCopies(*module), 6);
+
+  // The loop state elements should be copied at the parameter and at the root
+  // with a control edge in between (see DeepCopyAndAddControlEdges).
+  EXPECT_EQ(CountCopies(*body), 4);
+  EXPECT_EQ(CountControlEdges(*body), 2);
+
+  EXPECT_THAT(
+      body->root_instruction(),
+      op::Tuple(op::Copy(op::Negate(op::Copy())), op::Copy(op::Copy())));
+
+  EXPECT_EQ(CountCopies(*module->entry_computation()), 2);
+  EXPECT_THAT(xla_while->operand(0), op::Tuple(op::Copy(), op::Copy()));
+}
+
+TEST_F(CopyInsertionTest, SwizzlingWhileSharedInput) {
+  // Test a while instruction with a body which permutes it's tuple parameter
+  // elements similar to SwizzlinWhile above. However, in this test the input to
+  // the while body is a single constant (both loop state elements are the same
+  // constant). This means no copies are necessary because both loop state
+  // elements are the same so interchanging them is a no-op.
+  auto module = CreateNewModule();
+  const Shape loop_state_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  // Body simply interchanges the two tuple elements in the loop state.
+  auto body_builder = HloComputation::Builder("body");
+  auto body_param = body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, loop_state_shape, "param"));
+  auto body_element_0 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 0));
+  auto body_element_1 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 1));
+  body_builder.AddInstruction(
+      HloInstruction::CreateTuple({body_element_1, body_element_0}));
+  HloComputation* body = module->AddEmbeddedComputation(body_builder.Build());
+
+  auto cond_builder = HloComputation::Builder("condition");
+  cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, loop_state_shape, "param"));
+  auto cond_constant = cond_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+  cond_builder.AddInstruction(HloInstruction::CreateUnary(
+      cond_constant->shape(), HloOpcode::kNot, cond_constant));
+  HloComputation* condition =
+      module->AddEmbeddedComputation(cond_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
+  auto tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({constant, constant}));
+  builder.AddInstruction(
+      HloInstruction::CreateWhile(loop_state_shape, condition, body, tuple));
+  module->AddEntryComputation(builder.Build());
+
+  InsertCopies(module.get());
+
+  EXPECT_EQ(CountCopies(*module), 2);
+  EXPECT_EQ(CountCopies(*body), 0);
+
+  EXPECT_EQ(CountCopies(*module->entry_computation()), 2);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Tuple(op::Copy(), op::Copy()));
+}
+
+TEST_F(CopyInsertionTest, SequentialWhiles) {
+  // Construct a computation with a series of sequential while instructions
+  // containing four loop state elements:
+  //
+  //   element 0 is passed to each while directly from an entry parameter.
+  //
+  //   element 1 is passed transparently in series through all the while bodies.
+  //
+  //   element 2 is negated in each while body. (in-place possible)
+  //
+  //   element 3 is reversed in each while body. (in-place not possible)
+  //
+  const Shape element_shape = ShapeUtil::MakeShape(F32, {42});
+  const Shape loop_state_shape = ShapeUtil::MakeTupleShape(
+      {element_shape, element_shape, element_shape, element_shape});
+
+  auto module = CreateNewModule();
+  auto builder = HloComputation::Builder(TestName());
+  auto param_0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, element_shape, "param_0"));
+  auto param_1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, element_shape, "param_1"));
+  auto param_2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, element_shape, "param_2"));
+  auto param_3 = builder.AddInstruction(
+      HloInstruction::CreateParameter(3, element_shape, "param_3"));
+
+  // The number of sequential kWhile instructions.
+  const int kNumWhiles = 3;
+
+  HloInstruction* prev_element_1 = param_1;
+  HloInstruction* prev_element_2 = param_2;
+  HloInstruction* prev_element_3 = param_3;
+
+  // Vector containing all of the while instructions.
+  std::vector<const HloInstruction*> whiles;
+  for (int i = 0; i < kNumWhiles; ++i) {
+    auto body_builder = HloComputation::Builder("body");
+    auto body_param = body_builder.AddInstruction(
+        HloInstruction::CreateParameter(0, loop_state_shape, "param"));
+    auto body_element_0 = body_builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(element_shape, body_param, 0));
+    auto body_element_1 = body_builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(element_shape, body_param, 1));
+    auto body_element_2 = body_builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(element_shape, body_param, 2));
+    auto body_element_3 = body_builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(element_shape, body_param, 3));
+    auto negate = body_builder.AddInstruction(HloInstruction::CreateUnary(
+        element_shape, HloOpcode::kNegate, body_element_2));
+    auto reverse = body_builder.AddInstruction(
+        HloInstruction::CreateReverse(element_shape, body_element_3, {0}));
+    body_builder.AddInstruction(HloInstruction::CreateTuple(
+        {body_element_0, body_element_1, negate, reverse}));
+    HloComputation* body = module->AddEmbeddedComputation(body_builder.Build());
+
+    auto cond_builder = HloComputation::Builder("condition");
+    cond_builder.AddInstruction(
+        HloInstruction::CreateParameter(0, loop_state_shape, "param"));
+    auto cond_constant = cond_builder.AddInstruction(
+        HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+    cond_builder.AddInstruction(HloInstruction::CreateUnary(
+        cond_constant->shape(), HloOpcode::kNot, cond_constant));
+    HloComputation* condition =
+        module->AddEmbeddedComputation(cond_builder.Build());
+
+    auto while_init = builder.AddInstruction(HloInstruction::CreateTuple(
+        {param_0, prev_element_1, prev_element_2, prev_element_3}));
+
+    auto xla_while = builder.AddInstruction(HloInstruction::CreateWhile(
+        loop_state_shape, condition, body, while_init));
+    whiles.push_back(xla_while);
+    if (i != kNumWhiles - 1) {
+      prev_element_1 = builder.AddInstruction(
+          HloInstruction::CreateGetTupleElement(element_shape, xla_while, 1));
+      prev_element_2 = builder.AddInstruction(
+          HloInstruction::CreateGetTupleElement(element_shape, xla_while, 2));
+      prev_element_3 = builder.AddInstruction(
+          HloInstruction::CreateGetTupleElement(element_shape, xla_while, 3));
+    }
+  }
+
+  module->AddEntryComputation(builder.Build());
+
+  InsertCopies(module.get());
+
+  // Each while body has one copy. And each loop state element is copied once in
+  // the entry computation.
+  EXPECT_EQ(CountCopies(*module), 4 + kNumWhiles);
+
+  // Each while body should have exactly one copy for element three which is an
+  // op (kReverse) which cannot be done in place.
+  for (const HloInstruction* xla_while : whiles) {
+    EXPECT_EQ(CountCopies(*xla_while->while_body()), 1);
+  }
+
+  EXPECT_THAT(whiles[0]->operand(0), op::Tuple(op::Parameter(), op::Parameter(),
+                                               op::Copy(), op::Copy()));
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Tuple(op::Copy(), op::Copy(), op::GetTupleElement(),
+                        op::GetTupleElement()));
+}
+
+TEST_F(CopyInsertionTest, WhileBodyWithConstantRoot) {
+  // Test a while body and condition which are each simply a constant (root of
+  // computation is a constant). The body constant should be copied.
+  auto module = CreateNewModule();
+  auto builder = HloComputation::Builder(TestName());
+  auto param_0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param_0"));
+
+  auto body_builder = HloComputation::Builder("body");
+  body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param"));
+  body_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(123.0)));
+  HloComputation* body = module->AddEmbeddedComputation(body_builder.Build());
+
+  auto cond_builder = HloComputation::Builder("condition");
+  cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param"));
+  cond_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+  HloComputation* condition =
+      module->AddEmbeddedComputation(cond_builder.Build());
+
+  auto xla_while = builder.AddInstruction(
+      HloInstruction::CreateWhile(scalar_shape_, condition, body, param_0));
+
+  module->AddEntryComputation(builder.Build());
+
+  InsertCopies(module.get());
+
+  EXPECT_EQ(CountCopies(*module), 2);
+
+  EXPECT_THAT(xla_while->operand(0), op::Copy(op::Parameter()));
+  EXPECT_THAT(body->root_instruction(), op::Copy(op::Constant()));
+  EXPECT_THAT(condition->root_instruction(), op::Constant());
+}
+
+std::unique_ptr<HloComputation> MakeTrivialCondition(const Shape& shape) {
+  auto builder = HloComputation::Builder("trivial_condition");
+  builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "loop_state"));
+  auto constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+  builder.AddInstruction(HloInstruction::CreateUnary(
+      constant->shape(), HloOpcode::kNot, constant));
+  return builder.Build();
+}
+
+std::unique_ptr<HloComputation> MakeBenchmarkWhileBody() {
+  auto builder = HloComputation::Builder("benchmark_loop_body");
+  const Shape element_shape = ShapeUtil::MakeShape(F32, {42});
+  const Shape loop_state_shape =
+      ShapeUtil::MakeTupleShape({element_shape, element_shape, element_shape});
+  HloInstruction* param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, loop_state_shape, "loop_state"));
+  HloInstruction* element_0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(element_shape, param, 0));
+  HloInstruction* element_1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(element_shape, param, 1));
+  HloInstruction* element_2 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(element_shape, param, 2));
+
+  HloInstruction* rev_1 = builder.AddInstruction(
+      HloInstruction::CreateReverse(element_shape, element_1, {0}));
+  HloInstruction* add_1_2 = builder.AddInstruction(HloInstruction::CreateBinary(
+      element_shape, HloOpcode::kAdd, element_1, element_2));
+
+  builder.AddInstruction(
+      HloInstruction::CreateTuple({element_0, rev_1, add_1_2}));
+  return builder.Build();
+}
+
+void BM_SequentialWhiles(int num_iters, int num_whiles) {
+  // This benchmark constructs a chain of sequential while instructions.
+  tensorflow::testing::StopTiming();
+  for (int i = 0; i < num_iters; ++i) {
+    HloModuleConfig config;
+    config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+    HloModule module("BM_SequentialWhiles", VersionedComputationHandle(),
+                     config);
+
+    auto builder = HloComputation::Builder("BM_SequentialWhiles");
+    HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
+        0, ShapeUtil::MakeShape(F32, {42}), "x"));
+    HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(
+        1, ShapeUtil::MakeShape(F32, {42}), "y"));
+    HloInstruction* z = builder.AddInstruction(HloInstruction::CreateParameter(
+        2, ShapeUtil::MakeShape(F32, {42}), "z"));
+    HloInstruction* init =
+        builder.AddInstruction(HloInstruction::CreateTuple({x, y, z}));
+
+    HloInstruction* prev_loop_state = init;
+    for (int w = 0; w < num_whiles; ++w) {
+      HloComputation* condition =
+          module.AddEmbeddedComputation(MakeTrivialCondition(init->shape()));
+      HloComputation* body =
+          module.AddEmbeddedComputation(MakeBenchmarkWhileBody());
+      prev_loop_state = builder.AddInstruction(HloInstruction::CreateWhile(
+          init->shape(), condition, body, prev_loop_state));
+    }
+    module.AddEntryComputation(builder.Build());
+
+    CopyInsertion copy_insertion;
+
+    tensorflow::testing::StartTiming();
+    ASSERT_IS_OK(copy_insertion.Run(&module).status());
+    tensorflow::testing::StopTiming();
+
+    // The entry computation should have three copies, and each body has one.
+    ASSERT_EQ(CountCopies(module), 3 + num_whiles);
+  }
+}
+
+void BM_ParallelWhiles(int num_iters, int num_whiles) {
+  // This benchmark constructs a fan-out of parallel while instructions.
+  tensorflow::testing::StopTiming();
+  for (int i = 0; i < num_iters; ++i) {
+    HloModuleConfig config;
+    config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+    HloModule module("BM_SequentialWhiles", VersionedComputationHandle(),
+                     config);
+
+    auto builder = HloComputation::Builder("BM_ParallelWhiles");
+    HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
+        0, ShapeUtil::MakeShape(F32, {42}), "x"));
+    HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(
+        1, ShapeUtil::MakeShape(F32, {42}), "y"));
+    HloInstruction* z = builder.AddInstruction(HloInstruction::CreateParameter(
+        2, ShapeUtil::MakeShape(F32, {42}), "z"));
+    HloInstruction* init =
+        builder.AddInstruction(HloInstruction::CreateTuple({x, y, z}));
+
+    HloInstruction* sum = nullptr;
+    for (int w = 0; w < num_whiles; ++w) {
+      HloComputation* condition =
+          module.AddEmbeddedComputation(MakeTrivialCondition(init->shape()));
+      HloComputation* body =
+          module.AddEmbeddedComputation(MakeBenchmarkWhileBody());
+
+      HloInstruction* xla_while = builder.AddInstruction(
+          HloInstruction::CreateWhile(init->shape(), condition, body, init));
+
+      if (sum == nullptr) {
+        sum = builder.AddInstruction(
+            HloInstruction::CreateGetTupleElement(x->shape(), xla_while, 0));
+      } else {
+        HloInstruction* element_0 = builder.AddInstruction(
+            HloInstruction::CreateGetTupleElement(x->shape(), xla_while, 0));
+        sum = builder.AddInstruction(HloInstruction::CreateBinary(
+            x->shape(), HloOpcode::kAdd, sum, element_0));
+      }
+    }
+    module.AddEntryComputation(builder.Build());
+
+    CopyInsertion copy_insertion;
+
+    tensorflow::testing::StartTiming();
+    ASSERT_IS_OK(copy_insertion.Run(&module).status());
+    tensorflow::testing::StopTiming();
+
+    // Each body receives of copy of two of the parameters (the corresponding
+    // elements in the body are modifed), and there is one copy in each body.
+    ASSERT_EQ(CountCopies(module), 3 * num_whiles);
+  }
+}
+
+BENCHMARK(BM_SequentialWhiles)->Arg(512)->Arg(1024)->Arg(2048)->Arg(4096);
+BENCHMARK(BM_ParallelWhiles)->Arg(512)->Arg(1024)->Arg(2048)->Arg(4096);
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 78216f2ffb9c58d7f4b7ca31cb740d547ea1d470..ed142bd077fc20f5e2e563132df95bff10a37f0f 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -79,15 +79,16 @@ cc_library(
     deps = [
         ":compiler_functor",
         ":conv_canonicalization",
+        ":cpu_copy_insertion",
         ":cpu_executable",
         ":cpu_instruction_fusion",
+        ":cpu_layout_assignment",
         ":cpu_options",
         ":cpu_parallelization_preparation",
         ":disassembler",
         ":dot_op_emitter",
         ":ir_emission_utils",
         ":ir_emitter",
-        ":layout_assignment",
         ":parallel_cpu_executable",
         ":parallel_task_assignment",
         ":simple_orc_jit",
@@ -99,17 +100,18 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:algebraic_simplifier",
-        "//tensorflow/compiler/xla/service:batchnorm_rewriter",
+        "//tensorflow/compiler/xla/service:batchnorm_expander",
         "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:buffer_liveness",
         "//tensorflow/compiler/xla/service:call_inliner",
-        "//tensorflow/compiler/xla/service:copy_insertion",
+        "//tensorflow/compiler/xla/service:dot_decomposer",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:flatten_call_graph",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_constant_folding",
         "//tensorflow/compiler/xla/service:hlo_cse",
         "//tensorflow/compiler/xla/service:hlo_dce",
+        "//tensorflow/compiler/xla/service:hlo_element_type_converter",
         "//tensorflow/compiler/xla/service:hlo_ordering",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/compiler/xla/service:hlo_pass_pipeline",
@@ -250,6 +252,8 @@ cc_library(
         ":dot_op_emitter",
         ":external_constant_pool",
         ":ir_emission_utils",
+        ":ir_function",
+        ":parallel_loop_emitter",
         ":shape_partition",
         ":simple_orc_jit",
         "//tensorflow/compiler/xla:shape_util",
@@ -273,12 +277,48 @@ cc_library(
         "//tensorflow/compiler/xla/service/llvm_ir:ops",
         "//tensorflow/compiler/xla/service/llvm_ir:tuple_ops",
         "//tensorflow/core:lib",
+        "@llvm//:analysis",
+        "@llvm//:code_gen",
         "@llvm//:core",
         "@llvm//:support",
         "@llvm//:target",
     ],
 )
 
+cc_library(
+    name = "ir_function",
+    srcs = ["ir_function.cc"],
+    hdrs = ["ir_function.h"],
+    deps = [
+        ":ir_emission_utils",
+        ":shape_partition",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla/service/cpu:cpu_runtime",
+        "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
+        "//tensorflow/core:lib",
+        "@llvm//:core",
+    ],
+)
+
+cc_library(
+    name = "parallel_loop_emitter",
+    srcs = ["parallel_loop_emitter.cc"],
+    hdrs = ["parallel_loop_emitter.h"],
+    deps = [
+        ":ir_emission_utils",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service/llvm_ir:ir_array",
+        "//tensorflow/compiler/xla/service/llvm_ir:llvm_loop",
+        "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
+        "//tensorflow/compiler/xla/service/llvm_ir:loop_emitter",
+        "//tensorflow/core:lib",
+        "@llvm//:core",
+    ],
+)
+
 cc_library(
     name = "dot_op_emitter",
     srcs = ["dot_op_emitter.cc"],
@@ -614,13 +654,14 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla/service:hlo",
+        "@llvm//:core",
     ],
 )
 
 cc_library(
-    name = "layout_assignment",
-    srcs = ["layout_assignment.cc"],
-    hdrs = ["layout_assignment.h"],
+    name = "cpu_layout_assignment",
+    srcs = ["cpu_layout_assignment.cc"],
+    hdrs = ["cpu_layout_assignment.h"],
     deps = [
         ":dot_op_emitter",
         ":ir_emission_utils",
@@ -632,11 +673,11 @@ cc_library(
 )
 
 tf_cc_test(
-    name = "layout_assignment_test",
+    name = "cpu_layout_assignment_test",
     size = "small",
-    srcs = ["layout_assignment_test.cc"],
+    srcs = ["cpu_layout_assignment_test.cc"],
     deps = [
-        ":layout_assignment",
+        ":cpu_layout_assignment",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_layout",
         "//tensorflow/compiler/xla:shape_util",
@@ -750,6 +791,38 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "cpu_copy_insertion",
+    srcs = ["cpu_copy_insertion.cc"],
+    hdrs = ["cpu_copy_insertion.h"],
+    deps = [
+        "//tensorflow/compiler/xla/service:copy_insertion",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_pass",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "cpu_copy_insertion_test",
+    srcs = ["cpu_copy_insertion_test.cc"],
+    deps = [
+        ":cpu_copy_insertion",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_graph_dumper",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 
 filegroup(
diff --git a/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc b/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc
index 44cd2171afdc6eecc22f3f920276a4d95f930573..2136aeb3877685373efaf5bf702a42b39a63f082 100644
--- a/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc
+++ b/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc
@@ -41,19 +41,17 @@ StatusOr<bool> ConvCanonicalization::Run(HloModule* module) {
       auto kernel_input_feature_dim = dnums.kernel_input_feature_dimension();
       auto kernel_output_feature_dim = dnums.kernel_output_feature_dimension();
 
-      int num_spatial_dims = dnums.spatial_dimensions_size();
-      int num_dims = num_spatial_dims + 2;
+      const int64 num_spatial_dims = dnums.output_spatial_dimensions_size();
+      const int64 num_dims = num_spatial_dims + 2;
 
       // A canonical convolution's dimension numbers need to satisfy the
       // following conditions (see cs/PotentiallyImplementedAsEigenConvolution).
       //
-      // - the input is in NHWC or NWHC order.
-      // - the kernel is in HWIO or WHIO order.
-      // - the spatial dimensions are in the same relative order in the input,
-      //   kernel and output.
+      // - the input is in NHWC order.
+      // - the kernel is in HWIO order.
       //
       // For simplicity, as a first step, we reshape the input and filter to
-      // NHWC and HWIO order, respectively. This may lose precision but not
+      // NHWC and HWIO order, respectively. This may lose precision but won't
       // break the soundness.
       HloInstruction* input = hlo->mutable_operand(0);
 
@@ -61,10 +59,10 @@ StatusOr<bool> ConvCanonicalization::Run(HloModule* module) {
       std::vector<int64> new_input_dims(num_dims);
       new_input_dim_order[0] = input_batch_dim;
       new_input_dims[0] = input->shape().dimensions(input_batch_dim);
-      for (int i = 0; i < num_spatial_dims; ++i) {
-        new_input_dim_order[i + 1] = dnums.spatial_dimensions(i);
+      for (int64 i = 0; i < num_spatial_dims; ++i) {
+        new_input_dim_order[i + 1] = dnums.input_spatial_dimensions(i);
         new_input_dims[i + 1] =
-            input->shape().dimensions(dnums.spatial_dimensions(i));
+            input->shape().dimensions(dnums.input_spatial_dimensions(i));
       }
       new_input_dim_order[num_dims - 1] = input_feature_dim;
       new_input_dims[num_dims - 1] =
@@ -80,7 +78,7 @@ StatusOr<bool> ConvCanonicalization::Run(HloModule* module) {
 
       std::vector<int64> new_kernel_dim_order(num_dims);
       std::vector<int64> new_kernel_dims(num_dims);
-      for (int i = 0; i < num_spatial_dims; ++i) {
+      for (int64 i = 0; i < num_spatial_dims; ++i) {
         new_kernel_dim_order[i] = dnums.kernel_spatial_dimensions(i);
         new_kernel_dims[i] =
             kernel->shape().dimensions(dnums.kernel_spatial_dimensions(i));
@@ -98,14 +96,18 @@ StatusOr<bool> ConvCanonicalization::Run(HloModule* module) {
           HloInstruction::CreateTranspose(new_kernel_shape, kernel,
                                           new_kernel_dim_order));
 
+      std::vector<int64> new_output_dim_order(num_dims);
       std::vector<int64> new_conv_dims(num_dims);
       auto output_batch_dim = dnums.output_batch_dimension();
       auto output_feature_dim = dnums.output_feature_dimension();
+      new_output_dim_order[0] = output_batch_dim;
       new_conv_dims[0] = hlo->shape().dimensions(output_batch_dim);
-      for (int i = 0; i < num_spatial_dims; ++i) {
+      for (int64 i = 0; i < num_spatial_dims; ++i) {
+        new_output_dim_order[i + 1] = dnums.output_spatial_dimensions(i);
         new_conv_dims[i + 1] =
-            hlo->shape().dimensions(dnums.spatial_dimensions(i));
+            hlo->shape().dimensions(dnums.output_spatial_dimensions(i));
       }
+      new_output_dim_order[num_dims - 1] = output_feature_dim;
       new_conv_dims[num_dims - 1] = hlo->shape().dimensions(output_feature_dim);
       Shape new_conv_shape =
           ShapeUtil::MakeShape(hlo->shape().element_type(), new_conv_dims);
@@ -113,9 +115,10 @@ StatusOr<bool> ConvCanonicalization::Run(HloModule* module) {
       ConvolutionDimensionNumbers new_dnums;
       new_dnums.set_input_batch_dimension(0);
       new_dnums.set_output_batch_dimension(0);
-      for (int i = 0; i < num_spatial_dims; ++i) {
-        new_dnums.add_spatial_dimensions(i + 1);
+      for (int64 i = 0; i < num_spatial_dims; ++i) {
+        new_dnums.add_input_spatial_dimensions(i + 1);
         new_dnums.add_kernel_spatial_dimensions(i);
+        new_dnums.add_output_spatial_dimensions(i + 1);
       }
       new_dnums.set_input_feature_dimension(num_dims - 1);
       new_dnums.set_output_feature_dimension(num_dims - 1);
@@ -129,14 +132,11 @@ StatusOr<bool> ConvCanonicalization::Run(HloModule* module) {
           HloInstruction::CreateConvolve(new_conv_shape, new_input, new_kernel,
                                          hlo->window(), new_dnums));
 
-      // kConvolution inherits the dimension mapping of its input, so we need to
-      // reshape the output back to the shape of the original convolution. This
-      // is done by apply the inverse permutation of the collapsing order of the
-      // input reshape.
+      // Reshape the output back to the shape of the original convolution.
       TF_RETURN_IF_ERROR(module->entry_computation()->ReplaceWithNewInstruction(
           hlo, HloInstruction::CreateTranspose(
                    hlo->shape(), new_conv,
-                   InversePermutation(new_input_dim_order))));
+                   InversePermutation(new_output_dim_order))));
       changed = true;
     }
   }
diff --git a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
index d593ba26b655d00a0f0f0b9a94c9e62fa1835080..968f53d5c706651d2a470a853e0e9b601c0ed2df 100644
--- a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
@@ -69,8 +69,10 @@ TEST_F(ConvCanonicalizationTest, NonCanonicalToCanonical) {
   ConvolutionDimensionNumbers dnums;
   dnums.set_input_batch_dimension(1);
   dnums.set_output_batch_dimension(1);
-  dnums.add_spatial_dimensions(2);
-  dnums.add_spatial_dimensions(3);
+  dnums.add_input_spatial_dimensions(2);
+  dnums.add_output_spatial_dimensions(2);
+  dnums.add_input_spatial_dimensions(3);
+  dnums.add_output_spatial_dimensions(3);
   dnums.set_input_feature_dimension(0);
   dnums.set_output_feature_dimension(0);
   dnums.add_kernel_spatial_dimensions(2);
@@ -125,8 +127,10 @@ TEST_F(ConvCanonicalizationTest, CanonicalStaysTheSame) {
   ConvolutionDimensionNumbers dnums;
   dnums.set_input_batch_dimension(0);
   dnums.set_output_batch_dimension(0);
-  dnums.add_spatial_dimensions(1);
-  dnums.add_spatial_dimensions(2);
+  dnums.add_input_spatial_dimensions(1);
+  dnums.add_output_spatial_dimensions(1);
+  dnums.add_input_spatial_dimensions(2);
+  dnums.add_output_spatial_dimensions(2);
   dnums.set_input_feature_dimension(3);
   dnums.set_output_feature_dimension(3);
   dnums.add_kernel_spatial_dimensions(0);
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index f5b95d3657cb91623aa043f7544760c11fc87408..6dc30bfe2cd036f7e83b054d25361072ac5077e9 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -42,32 +42,34 @@ limitations under the License.
 #include "tensorflow/compiler/xla/protobuf_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
-#include "tensorflow/compiler/xla/service/batchnorm_rewriter.h"
+#include "tensorflow/compiler/xla/service/batchnorm_expander.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
-#include "tensorflow/compiler/xla/service/copy_insertion.h"
 #include "tensorflow/compiler/xla/service/cpu/compiler_functor.h"
 #include "tensorflow/compiler/xla/service/cpu/conv_canonicalization.h"
+#include "tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_executable.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h"
+#include "tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_options.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h"
 #include "tensorflow/compiler/xla/service/cpu/disassembler.h"
 #include "tensorflow/compiler/xla/service/cpu/dot_op_emitter.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emitter.h"
-#include "tensorflow/compiler/xla/service/cpu/layout_assignment.h"
 #include "tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h"
 #include "tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h"
 #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/dot_decomposer.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_constant_folding.h"
 #include "tensorflow/compiler/xla/service/hlo_cse.h"
 #include "tensorflow/compiler/xla/service/hlo_dce.h"
+#include "tensorflow/compiler/xla/service/hlo_element_type_converter.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
@@ -197,28 +199,35 @@ void InitializeLLVMCommandLineOptions(const HloModuleConfig& config) {
 class CollectProfileCandidates : public DfsHloVisitorWithDefault {
  public:
   static StatusOr<std::unordered_map<const HloInstruction*, size_t>>
-  GetCandidatesForComputation(HloComputation* computation) {
+  GetCandidatesForComputation(
+      HloComputation* computation,
+      const std::unordered_map<const HloInstruction*, int64>&
+          assigned_indices) {
     std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx;
     CollectProfileCandidates profile_candidates_for_computation(
-        &hlo_to_profile_idx);
+        &hlo_to_profile_idx, assigned_indices);
     TF_RETURN_IF_ERROR(
         computation->Accept(&profile_candidates_for_computation));
     return hlo_to_profile_idx;
   }
 
  private:
-  explicit CollectProfileCandidates(
-      std::unordered_map<const HloInstruction*, size_t>* hlo_to_profile_idx)
-      : hlo_to_profile_idx_(hlo_to_profile_idx) {}
+  CollectProfileCandidates(
+      std::unordered_map<const HloInstruction*, size_t>* hlo_to_profile_idx,
+      const std::unordered_map<const HloInstruction*, int64>& assigned_indices)
+      : hlo_to_profile_idx_(hlo_to_profile_idx),
+        assigned_indices_(assigned_indices) {}
 
   Status DefaultAction(HloInstruction* hlo_instruction) override {
-    hlo_to_profile_idx_->insert({hlo_instruction, hlo_to_profile_idx_->size()});
+    hlo_to_profile_idx_->insert(
+        {hlo_instruction, FindOrDie(assigned_indices_, hlo_instruction)});
     return Status::OK();
   }
 
   Status HandleCall(HloInstruction* call) override {
     TF_RETURN_IF_ERROR(DefaultAction(call));
-    CollectProfileCandidates candidates_for_call(hlo_to_profile_idx_);
+    CollectProfileCandidates candidates_for_call(hlo_to_profile_idx_,
+                                                 assigned_indices_);
     TF_RETURN_IF_ERROR(call->to_apply()->Accept(&candidates_for_call));
     return Status::OK();
   }
@@ -232,17 +241,20 @@ class CollectProfileCandidates : public DfsHloVisitorWithDefault {
   Status HandleWhile(HloInstruction* xla_while) override {
     TF_RETURN_IF_ERROR(DefaultAction(xla_while));
 
-    CollectProfileCandidates candidates_for_condition(hlo_to_profile_idx_);
+    CollectProfileCandidates candidates_for_condition(hlo_to_profile_idx_,
+                                                      assigned_indices_);
     TF_RETURN_IF_ERROR(
         xla_while->while_condition()->Accept(&candidates_for_condition));
 
-    CollectProfileCandidates candidates_for_body(hlo_to_profile_idx_);
+    CollectProfileCandidates candidates_for_body(hlo_to_profile_idx_,
+                                                 assigned_indices_);
     TF_RETURN_IF_ERROR(xla_while->while_body()->Accept(&candidates_for_body));
 
     return Status::OK();
   }
 
   std::unordered_map<const HloInstruction*, size_t>* hlo_to_profile_idx_;
+  const std::unordered_map<const HloInstruction*, int64>& assigned_indices_;
 };
 }  // namespace
 
@@ -262,14 +274,14 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) {
   // TODO(b/65775800): Fix wrong output bug in Call and remove the CallInliner
   // pass.
   pipeline.AddPass<CallInliner>();
-
+  pipeline.AddPass<DotDecomposer>();
   pipeline.AddPass<ConvCanonicalization>();
   {
     auto& pass =
         pipeline.AddPass<HloPassFix<HloPassPipeline>>("simplification");
     pass.AddInvariantChecker<HloVerifier>(ShapeSizeBytesFunction());
 
-    pass.AddPass<BatchNormRewriter>(
+    pass.AddPass<BatchNormExpander>(
         /*rewrite_training_op=*/true,
         /*rewrite_inference_op=*/true,
         /*rewrite_grad_op=*/true,
@@ -277,7 +289,7 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) {
     pass.AddPass<AlgebraicSimplifier>(
         /*is_layout_sensitive=*/false,
         [](const Shape&, const Shape&) { return false; },
-        /*enable_dot_simplification=*/false);
+        /*enable_dot_strength_reduction=*/false);
     pass.AddPass<TupleSimplifier>();
     pass.AddPass<WhileLoopSimplifier>();
     pass.AddPass<HloDCE>();
@@ -306,8 +318,9 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) {
   pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(
       /*is_layout_sensitive=*/true,
       [](const Shape&, const Shape&) { return true; },
-      /*enable_dot_simplification=*/false);
+      /*enable_dot_strength_reduction=*/false);
   pipeline.AddPass<HloCSE>(/*is_layout_sensitive=*/true);
+  pipeline.AddPass<HloElementTypeConverter>(BF16, F32);
   // Outline ops in the entry computation into calls to subcomputations.
   const int max_parallelism =
       module->config().intra_op_parallelism_threads() > 0
@@ -332,15 +345,16 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) {
   // (and sometime after) copy insertion, to avoid dead code from interfering
   // with the rewrites.
   pipeline.AddPass<HloDCE>();
-  pipeline.AddPass<CopyInsertion>();
+  pipeline.AddPass<FlattenCallGraph>();
+  pipeline.AddPass<CpuCopyInsertion>();
   if (options::CpuParallelBackendRequested(module->config())) {
     // Re-run the outlining, in case any copies were inserted into the entry
     // computation.
     pipeline.AddPass<ParallelizationPreparation>(max_parallelism,
                                                  ShapeSizeBytesFunction());
+    pipeline.AddPass<CpuCopyInsertion>();
   }
   pipeline.AddPass<HloDCE>();
-  pipeline.AddPass<FlattenCallGraph>();
   return pipeline.Run(module).status();
 }
 
@@ -426,11 +440,25 @@ Status InitializeModuleHooks(
 
 }  // namespace
 
-StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
-    std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec) {
+StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
+    std::unique_ptr<HloModule> module,
+    perftools::gputools::StreamExecutor* /*stream_exec*/) {
+  VLOG(2) << "Before optimization:";
+  XLA_VLOG_LINES(2, module->ToString());
+
+  TF_RETURN_IF_ERROR(RunHloPasses(module.get(), /*is_aot_compile=*/false));
+
+  VLOG(2) << "After optimization:";
+  XLA_VLOG_LINES(2, module->ToString());
+  return std::move(module);
+}
+
+StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
+    std::unique_ptr<HloModule> module,
+    perftools::gputools::StreamExecutor* stream_exec) {
   const string timer_message =
       "Compiling [" + module->name() + "] for CPU using JIT";
-  ScopedLoggingTimer compiling_timer(timer_message, 1);
+  XLA_SCOPED_LOGGING_TIMER(timer_message);
 
   VLOG(1) << "Compiling: " << module->name();
   TF_RET_CHECK(stream_exec != nullptr);
@@ -444,11 +472,11 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
       &pre_optimization_ir_hook, &post_optimization_ir_hook));
 
   // Compile must be thread-safe so create a new LLVM context for the module.
-  auto llvm_context = MakeUnique<llvm::LLVMContext>();
+  auto llvm_context = xla::MakeUnique<llvm::LLVMContext>();
   auto llvm_module =
-      MakeUnique<llvm::Module>("__compute_module", *llvm_context);
+      xla::MakeUnique<llvm::Module>("__compute_module", *llvm_context);
 
-  auto jit = MakeUnique<SimpleOrcJIT>(
+  auto jit = xla::MakeUnique<SimpleOrcJIT>(
       CompilerTargetOptions(module->config()),
       CodeGenOptLevel(module->config()),
       options::OptimizeForSizeRequested(module->config()),
@@ -458,20 +486,29 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
   llvm_module->setDataLayout(jit->data_layout());
   llvm_module->setTargetTriple(jit->target_triple().getTriple());
 
-  VLOG(2) << "Before optimization:";
-  XLA_VLOG_LINES(2, module->ToString());
-
-  TF_RETURN_IF_ERROR(RunHloPasses(module.get(), /*is_aot_compile=*/false));
-
-  VLOG(2) << "After optimization:";
-  XLA_VLOG_LINES(2, module->ToString());
-
   HloComputation* computation = module->entry_computation();
   std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx;
+  std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map;
+  std::unique_ptr<HloProfilePrinter> hlo_profile_printer;
   if (module->config().hlo_profiling_enabled()) {
+    hlo_profile_index_map = MakeUnique<HloProfileIndexMap>(*module);
+
     TF_ASSIGN_OR_RETURN(
         hlo_to_profile_idx,
-        CollectProfileCandidates::GetCandidatesForComputation(computation));
+        CollectProfileCandidates::GetCandidatesForComputation(
+            computation, hlo_profile_index_map->instruction_to_profile_idx()));
+
+    auto shape_size_bytes = [](const Shape& shape) {
+      // On the cpu, opaques are pointers.
+      if (ShapeUtil::IsOpaque(shape)) {
+        return static_cast<int64>(sizeof(void*));
+      }
+      return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
+    };
+
+    HloCostAnalysis cost_analysis(shape_size_bytes);
+    hlo_profile_printer =
+        CreateHloProfilePrinter(*hlo_profile_index_map, cost_analysis);
   }
 
   std::unique_ptr<Executable> cpu_executable;
@@ -494,9 +531,9 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
     // uses data dependencies for determining order.
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<BufferAssignment> assignment,
-        BufferAssigner::Run(module.get(),
-                            MakeUnique<DependencyHloOrdering>(module.get()),
-                            BufferSizeBytesFunction(), memory_alignment));
+        BufferAssigner::Run(
+            module.get(), xla::MakeUnique<DependencyHloOrdering>(module.get()),
+            BufferSizeBytesFunction(), memory_alignment));
     // BufferAssignment::ToString() includes a header, so no need for us to
     // print one ourselves.
     XLA_VLOG_LINES(2, assignment->ToString());
@@ -523,7 +560,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
         const void* data = instruction->literal().InternalData();
         int64 size = CpuExecutable::ShapeSizeBytes(instruction->shape());
         auto iter = aligned_constants.emplace(
-            instruction, MakeUnique<unsigned char[]>(size));
+            instruction, xla::MakeUnique<unsigned char[]>(size));
         CHECK_EQ(iter.second, true);
         unsigned char* aligned_data = iter.first->second.get();
         memcpy(aligned_data, data, size);
@@ -537,11 +574,17 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
       parallel_computations.emplace(to_apply, instruction);
     }
 
-    size_t entry_computation_profile_idx = hlo_to_profile_idx.size();
-    IrEmitter ir_emitter(
-        *module, *assignment, llvm_module.get(), std::move(hlo_to_profile_idx),
-        /*entry_computation_profile_idx=*/entry_computation_profile_idx,
-        jit->target_machine(), jit->external_constant_pool());
+    // We always profile the entire computation as a whole, even if hlo
+    // profiling is disabled.  When hlo profiling is diabled, we pass in a
+    // profile counter array of just one element, which corresponds to the whole
+    // computation.
+    size_t entry_computation_profile_idx =
+        hlo_profile_index_map ? hlo_profile_index_map->GetProfileIndexFor(
+                                    *module->entry_computation())
+                              : 0;
+    IrEmitter ir_emitter(*module, *assignment, llvm_module.get(),
+                         hlo_to_profile_idx, entry_computation_profile_idx,
+                         jit->target_machine(), jit->external_constant_pool());
 
     std::unique_ptr<HloInstructionMap<string>> function_names(
         new HloInstructionMap<string>());
@@ -560,7 +603,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
           llvm::Function * ir_function,
           ir_emitter.EmitComputation(
               embedded_computation, embedded_computation->name(),
-              /*is_entry_computation=*/computation_is_parallel,
+              /*is_top_level_computation=*/computation_is_parallel,
               /*instruction_order=*/nullptr));
       // If this computation is parallel, remember it in the function name map.
       // This way we know what function to execute when we try to run code for
@@ -581,8 +624,8 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
     jit->AddModule(std::move(llvm_module));
     cpu_executable.reset(new ParallelCpuExecutable(
         std::move(jit), std::move(assignment), std::move(module),
-        std::move(function_names), std::move(hlo_to_profile_idx),
-        std::move(aligned_constants)));
+        std::move(function_names), std::move(aligned_constants),
+        std::move(hlo_profile_printer), std::move(hlo_profile_index_map)));
 
     if (embed_ir_in_executable) {
       static_cast<CpuExecutable&>(*cpu_executable)
@@ -602,10 +645,10 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
     // temporary buffers are required to run the computation.
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<BufferAssignment> assignment,
-        BufferAssigner::Run(
-            module.get(),
-            MakeUnique<SequentialHloOrdering>(module.get(), module_sequence),
-            BufferSizeBytesFunction(), memory_alignment));
+        BufferAssigner::Run(module.get(),
+                            xla::MakeUnique<SequentialHloOrdering>(
+                                module.get(), module_sequence),
+                            BufferSizeBytesFunction(), memory_alignment));
     // BufferAssignment::ToString() includes a header, so no need for us to
     // print one ourselves.
     XLA_VLOG_LINES(2, assignment->ToString());
@@ -615,15 +658,23 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
       TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
           proto, xla_dump_hlo_proto_to, module->name()));
     }
+    // We always profile the entire computation as a whole, even if hlo
+    // profiling is disabled.  When hlo profiling is diabled, we pass in a
+    // profile counter array of just one element, which corresponds to the whole
+    // computation.
+    size_t entry_computation_profile_idx =
+        hlo_profile_index_map ? hlo_profile_index_map->GetProfileIndexFor(
+                                    *module->entry_computation())
+                              : 0;
+
     // Each computation is a single function.  Emit all embedded computations
     // before the entry computation. The order of computations returned from
     // GetEmbeddedComputations guarantees that a called computation occurs
     // before a caller computation.
-    size_t entry_computation_profile_idx = hlo_to_profile_idx.size();
-    IrEmitter ir_emitter(
-        *module, *assignment, llvm_module.get(), std::move(hlo_to_profile_idx),
-        /*entry_computation_profile_idx=*/entry_computation_profile_idx,
-        jit->target_machine(), jit->external_constant_pool());
+
+    IrEmitter ir_emitter(*module, *assignment, llvm_module.get(),
+                         hlo_to_profile_idx, entry_computation_profile_idx,
+                         jit->target_machine(), jit->external_constant_pool());
 
     for (auto embedded_computation :
          computation->MakeEmbeddedComputationsList()) {
@@ -634,7 +685,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
           ir_emitter
               .EmitComputation(embedded_computation,
                                embedded_computation->name(),
-                               /*is_entry_computation=*/false,
+                               /*is_top_level_computation=*/false,
                                &module_sequence.at(embedded_computation))
               .status());
     }
@@ -643,7 +694,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
     TF_ASSIGN_OR_RETURN(
         llvm::Function * entry_function,
         ir_emitter.EmitComputation(computation, function_name_prefix,
-                                   /*is_entry_computation=*/true,
+                                   /*is_top_level_computation=*/true,
                                    &module_sequence.at(computation)));
 
     string function_name = llvm_ir::AsString(entry_function->getName());
@@ -656,7 +707,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
     jit->AddModule(std::move(llvm_module));
     cpu_executable.reset(new CpuExecutable(
         std::move(jit), std::move(assignment), std::move(module), function_name,
-        std::move(hlo_to_profile_idx)));
+        std::move(hlo_profile_printer), std::move(hlo_profile_index_map)));
 
     if (embed_ir_in_executable) {
       static_cast<CpuExecutable&>(*cpu_executable)
@@ -776,7 +827,8 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<BufferAssignment> assignment,
         BufferAssigner::Run(
-            module, MakeUnique<SequentialHloOrdering>(module, module_sequence),
+            module,
+            xla::MakeUnique<SequentialHloOrdering>(module, module_sequence),
             BufferSizeBytesFunction(), memory_alignment));
     // BufferAssignment::ToString() includes a header, so no need for us to
     // print one ourselves.
@@ -807,7 +859,7 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
           ir_emitter
               .EmitComputation(embedded_computation,
                                embedded_computation->name(),
-                               /*is_entry_computation=*/false,
+                               /*is_top_level_computation=*/false,
                                &module_sequence.at(embedded_computation))
               .status());
     }
@@ -815,7 +867,7 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
     TF_ASSIGN_OR_RETURN(
         llvm::Function * entry_function,
         ir_emitter.EmitComputation(computation, entry_point_name,
-                                   /*is_entry_computation=*/true,
+                                   /*is_top_level_computation=*/true,
                                    &module_sequence.at(computation)));
 
     CHECK(entry_function->getName() == llvm_ir::AsStringRef(entry_point_name));
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
index 963aced208813e58b3d069a80bd88fcb05d8253f..ebed7058d8f7968c6e03ef90d0da6b2325037eb0 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
@@ -116,7 +116,11 @@ class CpuCompiler : public LLVMCompiler {
   //        stream_execs)
   using LLVMCompiler::Compile;
 
-  StatusOr<std::unique_ptr<Executable>> Compile(
+  StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
+      std::unique_ptr<HloModule> module,
+      perftools::gputools::StreamExecutor* stream_exec) override;
+
+  StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> module,
       perftools::gputools::StreamExecutor* stream_exec) override;
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.cc b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.cc
new file mode 100644
index 0000000000000000000000000000000000000000..baaacd2ecc9611946678f71ac36ef787ecb57b4e
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.cc
@@ -0,0 +1,43 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h"
+
+#include <memory>
+#include <set>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/copy_insertion.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+
+StatusOr<bool> CpuCopyInsertion::Run(HloModule* module) {
+  CopyInsertion generic_copy_insertion;
+
+  TF_ASSIGN_OR_RETURN(bool generic_changed, generic_copy_insertion.Run(module));
+
+  // The CPU backend needs additional copies added due to deficiencies in
+  // buffer assignment.
+  TF_ASSIGN_OR_RETURN(bool buffer_assignment_changed,
+                      CopyInsertion::AddCopiesForBufferAssignment(module));
+
+  return generic_changed || buffer_assignment_changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h
new file mode 100644
index 0000000000000000000000000000000000000000..3313d1e6eb71bff39f509c3d24858568df786422
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h
@@ -0,0 +1,42 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_COPY_INSERTION_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_COPY_INSERTION_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// Besides the modifications made by the generic xla::CopyInsertion, this
+// CPU-specific copy insertion pass also adds copies to values live out of
+// computations satisfying certain conditions (defined by constant or parameter,
+// etc). This is necessary because of deficiencies of buffer
+// assignment. Specifically, buffer assignment is computation-scoped and does
+// not recognized aliasing between arguments and outputs of computations.
+//
+// TODO(b/62548313): Remove this when buffer assignment is smarter
+// (module-scoped).
+class CpuCopyInsertion : public HloPassInterface {
+ public:
+  tensorflow::StringPiece name() const override { return "copy-insertion"; }
+
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_COPY_INSERTION_H_
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a05a26941786cbf404c4685abb098c9ac8caaa09
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion_test.cc
@@ -0,0 +1,139 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h"
+
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace xla {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+
+int64 CountCopies(const HloComputation& computation) {
+  int64 count = 0;
+  for (const auto& instruction : computation.instructions()) {
+    if (instruction->opcode() == HloOpcode::kCopy) {
+      count++;
+    }
+  }
+  return count;
+}
+
+int64 CountCopies(const HloModule& module) {
+  int64 count = 0;
+  for (const auto& computation : module.computations()) {
+    count += CountCopies(*computation);
+  }
+  return count;
+}
+
+class CpuCopyInsertionTest : public HloTestBase {
+ protected:
+  void InsertCopies(HloModule* module) {
+    CpuCopyInsertion copy_insertion;
+    ASSERT_IS_OK(copy_insertion.Run(module).status());
+  }
+
+  const Shape scalar_shape_ = ShapeUtil::MakeShape(F32, {});
+};
+
+TEST_F(CpuCopyInsertionTest, WhileBodyWithConstantRoot) {
+  // Test a while body and condition which are each simply a constant (root of
+  // computation is a constant). Each constant should be copied.
+  auto module = CreateNewModule();
+  auto builder = HloComputation::Builder(TestName());
+  auto param_0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param_0"));
+
+  auto body_builder = HloComputation::Builder("body");
+  body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param"));
+  body_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(123.0)));
+  HloComputation* body = module->AddEmbeddedComputation(body_builder.Build());
+
+  auto cond_builder = HloComputation::Builder("condition");
+  cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param"));
+  cond_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+  HloComputation* condition =
+      module->AddEmbeddedComputation(cond_builder.Build());
+
+  auto xla_while = builder.AddInstruction(
+      HloInstruction::CreateWhile(scalar_shape_, condition, body, param_0));
+
+  module->AddEntryComputation(builder.Build());
+
+  InsertCopies(module.get());
+
+  EXPECT_EQ(CountCopies(*module), 3);
+
+  EXPECT_THAT(xla_while->operand(0), op::Copy(op::Parameter()));
+  EXPECT_THAT(body->root_instruction(), op::Copy(op::Constant()));
+  EXPECT_THAT(condition->root_instruction(), op::Copy(op::Constant()));
+}
+
+TEST_F(CpuCopyInsertionTest, TupleCall) {
+  // Test a kCall instruction which calls a computation which produces a three
+  // element tuple: one is a constant, one is a parameter, and one is produced
+  // in the computation. The constant and parameter should be copied.
+  auto module = CreateNewModule();
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param_0"));
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_, scalar_shape_});
+
+  auto sub_builder = HloComputation::Builder("subcomputation");
+  auto sub_param = sub_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param"));
+  auto constant = sub_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(123.0)));
+  auto add = sub_builder.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape_, HloOpcode::kAdd, sub_param, constant));
+  sub_builder.AddInstruction(
+      HloInstruction::CreateTuple({sub_param, constant, add}));
+  HloComputation* subcomputation =
+      module->AddEmbeddedComputation(sub_builder.Build());
+
+  builder.AddInstruction(
+      HloInstruction::CreateCall(tuple_shape, {param}, subcomputation));
+
+  module->AddEntryComputation(builder.Build());
+
+  InsertCopies(module.get());
+
+  EXPECT_EQ(CountCopies(*subcomputation), 2);
+  EXPECT_THAT(subcomputation->root_instruction(),
+              op::Tuple(op::Copy(op::Parameter()), op::Copy(op::Constant()),
+                        op::Add()));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index f62353bee7b1058dc237169b70341c33ab19fc52..028f827337979de14ec557a8f0d7a47f095bf55e 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -43,6 +43,7 @@ limitations under the License.
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/stream_executor/host/host_stream.h"
 
 namespace se = ::perftools::gputools;
 
@@ -54,11 +55,12 @@ CpuExecutable::CpuExecutable(
     std::unique_ptr<const BufferAssignment> assignment,
     std::unique_ptr<const HloModule> hlo_module,
     const string& entry_function_name,
-    std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx)
-    : Executable(std::move(hlo_module)),
+    std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
+    : Executable(std::move(hlo_module), std::move(hlo_profile_printer),
+                 std::move(hlo_profile_index_map)),
       jit_(std::move(jit)),
-      assignment_(std::move(assignment)),
-      hlo_to_profile_idx_(std::move(hlo_to_profile_idx)) {
+      assignment_(std::move(assignment)) {
   // Resolve symbols in the constructor rather than at execution time to avoid
   // races because FindSymbol is not thread safe.
   llvm::JITSymbol sym = jit_->FindSymbol(entry_function_name);
@@ -71,28 +73,6 @@ CpuExecutable::CpuExecutable(
       reinterpret_cast<ComputeFunctionType>(cantFail(sym.getAddress()));
 }
 
-// Given a pointer to an output buffer (following the CPU JIT calling
-// conventions), mark addresses that are "live". The initial pointer itself is
-// trivially live. If the shape of the buffer is a tuple, this analysis looks
-// into the tuple's elements and marks them live as well (since tuples keep
-// pointers to buffers) and also works recursively.  address is an in-memory
-// buffer address that contains some runtime XLA object.  shape is its
-// shape. marked_addresses is the set of live addresses to populate.
-static void MarkLiveAddressesInOutput(
-    const void* address, const Shape& shape,
-    std::unordered_set<const void*>* marked_addresses) {
-  marked_addresses->insert(address);
-  const uintptr_t* address_buffer = static_cast<const uintptr_t*>(address);
-  if (ShapeUtil::IsTuple(shape)) {
-    for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
-      const uintptr_t* element_address = address_buffer + i;
-      const void* element = reinterpret_cast<const void*>(*element_address);
-      MarkLiveAddressesInOutput(
-          element, ShapeUtil::GetTupleElementShape(shape, i), marked_addresses);
-    }
-  }
-}
-
 Status CpuExecutable::AllocateBuffers(
     DeviceMemoryAllocator* memory_allocator, int device_ordinal,
     std::vector<perftools::gputools::DeviceMemoryBase>* buffers) {
@@ -146,19 +126,6 @@ Status CpuExecutable::ExecuteComputeFunction(
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
     HloExecutionProfile* hlo_execution_profile) {
-  std::vector<se::DeviceMemoryBase> argument_buffers;
-  for (int i = 0; i < arguments.size(); ++i) {
-    argument_buffers.push_back(arguments[i]->buffer(/*index=*/{}));
-  }
-  return ExecuteComputeFunction(run_options, argument_buffers, buffers,
-                                hlo_execution_profile);
-}
-
-Status CpuExecutable::ExecuteComputeFunction(
-    const ExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
-    HloExecutionProfile* hlo_execution_profile) {
   // The calling convention for JITed functions is:
   //
   //  void function(void* result, const void* run_options, void** args_array,
@@ -174,16 +141,23 @@ Status CpuExecutable::ExecuteComputeFunction(
   //               determined by buffer analysis.
   //
   std::vector<const void*> args_array;
-  for (se::DeviceMemoryBase arg_mem : arguments) {
-    args_array.push_back(arg_mem.opaque());
+  for (const ShapedBuffer* argument : arguments) {
+    args_array.push_back(argument->root_buffer().opaque());
   }
 
   uint64 start_micros = tensorflow::Env::Default()->NowMicros();
 
   // Allocate profiling counters for each hlo instruction that we would like to
-  // profile.  Allocate an additional profile counter for the entire
-  // computation.
-  std::vector<uint64> profile_counters(hlo_to_profile_idx_.size() + 1);
+  // profile.  Even when not Hlo profiling, we allocate a counter for the entire
+  // computation, which we use to update ExecutionProfile below.
+  std::vector<int64>* profile_counters = nullptr;
+  std::vector<int64> profile_counter_for_entry_computation;
+  if (hlo_execution_profile) {
+    profile_counters = hlo_execution_profile->mutable_profile_counters();
+  } else {
+    profile_counters = &profile_counter_for_entry_computation;
+    profile_counter_for_entry_computation.push_back(0);
+  }
 
   // Call the computation function following the calling convention.
   std::vector<void*> buffer_pointers;
@@ -198,7 +172,7 @@ Status CpuExecutable::ExecuteComputeFunction(
     VLOG(3) << tensorflow::strings::Printf(
         "  func(void* result, void* params[%zu], void* temps[%zu], "
         "uint64 profile_counters[%zu])",
-        args_array.size(), buffer_pointers.size(), profile_counters.size());
+        args_array.size(), buffer_pointers.size(), profile_counters->size());
     VLOG(3) << tensorflow::strings::Printf("    result = %p", result_buffer);
     auto ptr_printer = [](string* out, const void* p) {
       tensorflow::strings::StrAppend(out, tensorflow::strings::Printf("%p", p));
@@ -210,11 +184,11 @@ Status CpuExecutable::ExecuteComputeFunction(
         "    temps = [%s]",
         tensorflow::str_util::Join(buffer_pointers, ", ", ptr_printer).c_str());
     VLOG(3) << tensorflow::strings::Printf("    profile_counters = %p",
-                                           profile_counters.data());
+                                           profile_counters->data());
   }
 
   compute_function_(result_buffer, run_options, args_array.data(),
-                    buffer_pointers.data(), profile_counters.data());
+                    buffer_pointers.data(), profile_counters->data());
 
   uint64 end_micros = tensorflow::Env::Default()->NowMicros();
 
@@ -223,67 +197,97 @@ Status CpuExecutable::ExecuteComputeFunction(
     const double nanoseconds = (end_micros - start_micros) * 1000.0;
     execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
 
-    // The last profile counter is used for the computation as a whole.
-    execution_profile_.set_compute_cycle_count(profile_counters.back());
-  }
-
-  if (hlo_execution_profile != nullptr) {
-    hlo_execution_profile->set_total_cycles_executed(
-        *module().entry_computation(), profile_counters.back());
-
-    for (auto hlo_prof_idx : hlo_to_profile_idx_) {
-      const HloInstruction* hlo = hlo_prof_idx.first;
-      uint64 cycles_taken = profile_counters[hlo_prof_idx.second];
-      hlo_execution_profile->SetCyclesTakenBy(hlo, cycles_taken);
+    if (hlo_execution_profile) {
+      execution_profile_.set_compute_cycle_count(
+          hlo_execution_profile->total_cycles_executed(
+              *module().entry_computation()));
+    } else {
+      execution_profile_.set_compute_cycle_count(profile_counters->back());
     }
   }
+
   return Status::OK();
 }
 
-StatusOr<perftools::gputools::DeviceMemoryBase> CpuExecutable::ExecuteOnStream(
-    const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
-    HloExecutionProfile* hlo_execution_profile) {
-  se::Stream* stream = run_options->stream();
-  DeviceMemoryAllocator* memory_allocator = run_options->allocator();
-  std::vector<se::DeviceMemoryBase> buffers(assignment_->Allocations().size());
-
-  TF_RETURN_IF_ERROR(AllocateBuffers(
-      memory_allocator, stream->parent()->device_ordinal(), &buffers));
-  TF_RETURN_IF_ERROR(ExecuteComputeFunction(
-      &run_options->run_options(), arguments, buffers, hlo_execution_profile));
-
-  // Mark the buffers that are actually live (used in the output) when the
-  // computation finishes executing.
-  std::unordered_set<const void*> marked_addresses;
-  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
-                      assignment_->GetUniqueTopLevelOutputSlice());
-  se::DeviceMemoryBase top_level_output = buffers[result_slice.index()];
-  MarkLiveAddressesInOutput(top_level_output.opaque(), result_shape(),
-                            &marked_addresses);
+static void LogLiveAddresses(
+    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
+    const std::vector<bool>& buffers_in_result) {
+  if (!VLOG_IS_ON(3)) {
+    return;
+  }
 
+  CHECK_EQ(buffers.size(), buffers_in_result.size());
+  std::vector<const void*> live_out_buffers;
+  for (int i = 0; i < buffers.size(); ++i) {
+    if (buffers_in_result[i]) {
+      live_out_buffers.push_back(buffers[i].opaque());
+    }
+  }
   VLOG(3) << "Live addresses in output marking found "
-          << marked_addresses.size() << " addresses:\n"
+          << live_out_buffers.size() << " addresses:\n"
           << tensorflow::str_util::Join(
-                 marked_addresses, ", ", [](string* out, const void* address) {
+                 live_out_buffers, ", ", [](string* out, const void* address) {
                    tensorflow::strings::StrAppend(
                        out, tensorflow::strings::Printf("%p", address));
                  });
+}
 
-  // Computation is done - deallocate temp buffers. Keep those marked live
-  // because they are referenced by the output of the computation and are needed
+static Status DeallocateTempBuffers(
+    DeviceMemoryAllocator* allocator, se::Stream* stream,
+    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
+    const std::vector<bool>& buffers_in_result) {
+  // Keep those buffers in the output of the marked live because they are needed
   // by the service. They will be deallocated by the service.
   for (size_t i = 0; i < buffers.size(); ++i) {
     se::DeviceMemoryBase alloc = buffers[i];
-    if (marked_addresses.count(alloc.opaque()) == 0 && !alloc.is_null()) {
+    if (!buffers_in_result[i] && !alloc.is_null()) {
       VLOG(3) << "CpuExecutable deallocating buffer #" << i << " ["
               << alloc.opaque() << "]";
-      TF_RETURN_IF_ERROR(memory_allocator->Deallocate(
-          stream->parent()->device_ordinal(), &alloc));
+      TF_RETURN_IF_ERROR(
+          allocator->Deallocate(stream->parent()->device_ordinal(), &alloc));
     }
   }
 
-  return top_level_output;
+  return Status::OK();
+}
+
+StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::CreateResultShapedBuffer(
+    const ServiceExecutableRunOptions* run_options,
+    tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
+        allocated_buffers,
+    std::vector<bool>* buffers_in_result) {
+  se::Stream* stream = run_options->stream();
+  auto result_buffer = MakeUnique<ShapedBuffer>(
+      /*on_host_shape=*/result_shape(), /*on_device_shape=*/result_shape(),
+      stream->parent()->platform(), stream->parent()->device_ordinal());
+
+  // Copy DeviceMemoryBase values which contain the array(s) of the result into
+  // the respective location in ShapedBuffer which is returned to the caller.
+  TF_RETURN_IF_ERROR(result_buffer->buffers().ForEachMutableElementWithStatus(
+      [&](const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
+        const auto& sources = this->GetRootPointsToSet().element(index);
+        // The points to set is unambiguous so the set should be a
+        // singleton.
+        CHECK_EQ(1, sources.size());
+        const LogicalBuffer* buffer_source = sources[0];
+        HloInstruction* src = buffer_source->instruction();
+
+        // The source for this result buffer can be a nested buffer such as
+        // a tuple element. The source instruction should have a
+        // non-parameter buffer assigned.
+        TF_ASSIGN_OR_RETURN(
+            const BufferAllocation::Slice slice,
+            this->assignment_->GetUniqueSlice(src, buffer_source->index()));
+        CHECK(!slice.allocation()->is_entry_computation_parameter());
+
+        const BufferAllocation::Index buffer_index = slice.index();
+        const se::DeviceMemoryBase& buffer = allocated_buffers[buffer_index];
+        CHECK(!buffer.is_null() || buffer.size() == 0);
+        *device_memory = buffer;
+        (*buffers_in_result)[buffer_index] = true;
+        return Status::OK();
+      }));
+  return std::move(result_buffer);
 }
 
 StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteOnStream(
@@ -298,70 +302,60 @@ StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteOnStream(
   DeviceMemoryAllocator* memory_allocator = run_options->allocator();
   std::vector<se::DeviceMemoryBase> buffers(assignment_->Allocations().size());
 
-  auto result_buffer =
-      MakeUnique<ShapedBuffer>(result_shape(), stream->parent()->platform(),
-                               stream->parent()->device_ordinal());
-
   TF_RETURN_IF_ERROR(AllocateBuffers(
       memory_allocator, stream->parent()->device_ordinal(), &buffers));
   TF_RETURN_IF_ERROR(ExecuteComputeFunction(
       &run_options->run_options(), arguments, buffers, hlo_execution_profile));
 
-  // Copy DeviceMemoryBase values which contain the array(s) of the result into
-  // the respective location in ShapedBuffer which is returned to the caller.
   std::vector<bool> buffers_in_result(assignment_->Allocations().size(), false);
-  TF_RETURN_IF_ERROR(
-      result_buffer->mutable_shape_index_to_buffer_entry()
-          ->ForEachMutableElementWithStatus(
-              [&buffers, &buffers_in_result, &result_buffer, this](
-                  const ShapeIndex& index, size_t* buffer_entry) {
-                const auto& sources = this->GetRootPointsToSet().element(index);
-                // The points to set is unambiguous so the set should be a
-                // singleton.
-                CHECK_EQ(1, sources.size());
-                const LogicalBuffer* buffer_source = sources[0];
-                HloInstruction* src = buffer_source->instruction();
-
-                // The source for this result buffer can be a nested buffer
-                // such as a tuple element.
-
-                // The source instruction should have a non-parameter buffer
-                // assigned.
-                TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice slice,
-                                    this->assignment_->GetUniqueSlice(
-                                        src, buffer_source->index()));
-                CHECK(!slice.allocation()->is_entry_computation_parameter());
-
-                const BufferAllocation::Index buffer_index = slice.index();
-                const se::DeviceMemoryBase& buffer = buffers[buffer_index];
-                CHECK(!buffer.is_null() || buffer.size() == 0);
-                *buffer_entry = result_buffer->mutable_buffers()->size();
-                result_buffer->mutable_buffers()->push_back(buffer);
-                buffers_in_result[buffer_index] = true;
-                return Status::OK();
-              }));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<ShapedBuffer> result_buffer,
+      CreateResultShapedBuffer(run_options, buffers, &buffers_in_result));
 
   // Free all buffers not in the result.
-  for (size_t i = 0; i < buffers.size(); ++i) {
-    se::DeviceMemoryBase alloc = buffers[i];
-    if (!buffers_in_result[i] && !alloc.is_null()) {
-      VLOG(3) << "CpuExecutable deallocating buffer #" << i << " ["
-              << alloc.opaque() << "]";
-      TF_RETURN_IF_ERROR(memory_allocator->Deallocate(
-          stream->parent()->device_ordinal(), &alloc));
-    }
-  }
+  TF_RETURN_IF_ERROR(DeallocateTempBuffers(memory_allocator, stream, buffers,
+                                           buffers_in_result));
 
   return std::move(result_buffer);
 }
 
-StatusOr<perftools::gputools::DeviceMemoryBase>
-CpuExecutable::ExecuteAsyncOnStream(
+StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments) {
-  // TODO(b/30671675): Implement asynchronous execution mode.
-  return Unimplemented(
-      "Asynchronous execution on stream is not yet supported on CPU.");
+    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
+  if (hlo_profiling_enabled()) {
+    return Unimplemented(
+        "Asynchronous execution on stream with hlo profiling is not yet "
+        "supported on CPU.");
+  }
+
+  auto* host_stream = dynamic_cast<perftools::gputools::host::HostStream*>(
+      run_options->stream()->implementation());
+  se::Stream* stream = run_options->stream();
+  DeviceMemoryAllocator* memory_allocator = run_options->allocator();
+  std::vector<se::DeviceMemoryBase> buffers(assignment_->Allocations().size());
+
+  TF_RETURN_IF_ERROR(AllocateBuffers(
+      memory_allocator, stream->parent()->device_ordinal(), &buffers));
+
+  std::vector<bool> buffers_in_result(assignment_->Allocations().size(), false);
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<ShapedBuffer> result_buffer,
+      CreateResultShapedBuffer(run_options, buffers, &buffers_in_result));
+
+  LogLiveAddresses(buffers, buffers_in_result);
+
+  host_stream->EnqueueTask([this, run_options, arguments, buffers,
+                            buffers_in_result, memory_allocator, stream]() {
+    // Failing a CHECK here is not great, but I don't see an obvious way to
+    // return a failed Status asynchronously.
+    TF_CHECK_OK(ExecuteComputeFunction(&run_options->run_options(), arguments,
+                                       buffers,
+                                       /*hlo_execution_profile=*/nullptr));
+    TF_CHECK_OK(DeallocateTempBuffers(memory_allocator, stream, buffers,
+                                      buffers_in_result));
+  });
+
+  return std::move(result_buffer);
 }
 
 /*static*/ int64 CpuExecutable::ShapeSizeBytes(const Shape& shape) {
@@ -377,9 +371,5 @@ const PointsToSet& CpuExecutable::GetRootPointsToSet() const {
       module().entry_computation()->root_instruction());
 }
 
-std::unique_ptr<HloCostAnalysis> CpuExecutable::CreateCostAnalysis() const {
-  return MakeUnique<HloCostAnalysis>(ShapeSizeBytes);
-}
-
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index 238bc9b46ae2bf1b519eaf137d9ae063e769bd2e..50443a59954e222f65fc935e83effdaf6d6c8bf0 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -47,29 +47,22 @@ namespace cpu {
 // architecture, so JIT-ed code and host code share the same ABI.
 class CpuExecutable : public Executable {
  public:
-  CpuExecutable(
-      std::unique_ptr<SimpleOrcJIT> jit,
-      std::unique_ptr<const BufferAssignment> assignment,
-      std::unique_ptr<const HloModule> hlo_module,
-      const string& entry_function_name,
-      std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx);
+  CpuExecutable(std::unique_ptr<SimpleOrcJIT> jit,
+                std::unique_ptr<const BufferAssignment> assignment,
+                std::unique_ptr<const HloModule> hlo_module,
+                const string& entry_function_name,
+                std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+                std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
   ~CpuExecutable() override {}
 
-  StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
-      const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
-      HloExecutionProfile* hlo_execution_profile) override;
-
   StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteAsyncOnStream(
+  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments) override;
+      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
 
   // This should be called after set_ir_module_string.
   const string& ir_module_string() const { return ir_module_string_; }
@@ -85,12 +78,10 @@ class CpuExecutable : public Executable {
 
   static int64 ShapeSizeBytes(const Shape& shape);
 
-  std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
-
   // Type of the computation function we expect in the JIT.
   using ComputeFunctionType = void (*)(
       void* /*result*/, const ExecutableRunOptions* /*run_options*/,
-      const void** /*args*/, void** /*temps*/, uint64* /*profile_counters*/);
+      const void** /*args*/, void** /*temps*/, int64* /*profile_counters*/);
 
   const ComputeFunctionType& compute_function() const {
     return compute_function_;
@@ -110,13 +101,6 @@ class CpuExecutable : public Executable {
 
   // Calls the generated function performing the computation with the given
   // arguments using the supplied buffers.
-  Status ExecuteComputeFunction(
-      const ExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          buffers,
-      HloExecutionProfile* hlo_execution_profile);
   Status ExecuteComputeFunction(
       const ExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
@@ -124,6 +108,18 @@ class CpuExecutable : public Executable {
           buffers,
       HloExecutionProfile* hlo_execution_profile);
 
+  // Create a ShapedBuffer for holding the result of the computation. The
+  // addresses (DeviceMemoryBases) are set according to buffer assignment.
+  // 'buffers_in_result' should point to a vector of the same size as
+  // 'allocated_buffers'. An element in buffers_in_result is set to true if the
+  // corresponding buffer is live out of the computation (and thus contained in
+  // the returned ShapedBuffer).
+  StatusOr<std::unique_ptr<ShapedBuffer>> CreateResultShapedBuffer(
+      const ServiceExecutableRunOptions* run_options,
+      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
+          allocated_buffers,
+      std::vector<bool>* buffers_in_result);
+
   // Returns the points-to set of the root instruction of the entry
   // computation. Uses points-to analysis from buffer assignment.
   const PointsToSet& GetRootPointsToSet() const;
@@ -145,9 +141,6 @@ class CpuExecutable : public Executable {
   // Entry function name for the computation.
   const string entry_function_name_;
 
-  // Maps HLOs to their index into the profile counter array.
-  const std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx_;
-
   TF_DISALLOW_COPY_AND_ASSIGN(CpuExecutable);
 };
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
index f87ee3cecd932faac140636a3db7cd4aa0371b85..482e04052d5a914eab0e5bff2c7a83f3b698052f 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
@@ -26,7 +26,7 @@ int64 BytesInDimension(const Shape& shape, int64 dimension) {
          shape.dimensions(dimension);
 }
 
-bool IsFusile(const HloInstruction& hlo) {
+bool CanBeLoopFused(const HloInstruction& hlo) {
   // These are the only ones we fuse since we rely on effective elemental IR
   // generation.
   return hlo.IsElementwise() ||  //
@@ -42,6 +42,23 @@ bool IsFusile(const HloInstruction& hlo) {
          hlo.opcode() == HloOpcode::kTranspose;
 }
 
+bool IsMatrixVectorDot(const HloInstruction* hlo) {
+  const Shape& hlo_shape = hlo->shape();
+  return hlo->opcode() == HloOpcode::kDot && hlo_shape.dimensions_size() == 2 &&
+         (hlo_shape.dimensions(0) == 1 || hlo_shape.dimensions(1) == 1);
+}
+
+bool CanBeOutputFused(const HloInstruction* producer,
+                      const HloInstruction* consumer) {
+  return consumer->opcode() == HloOpcode::kAdd && IsMatrixVectorDot(producer) &&
+         producer->user_count() == 1;
+}
+
+bool CanBeOutputFusedIntoSomeOperand(const HloInstruction* consumer) {
+  return consumer->opcode() == HloOpcode::kAdd &&
+         (CanBeOutputFused(consumer->operand(0), consumer) ||
+          CanBeOutputFused(consumer->operand(1), consumer));
+}
 }  // namespace
 
 bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
@@ -52,7 +69,15 @@ bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
 
   constexpr int kFusionThresholdBytes = 16 * 1024;
 
-  if (!IsFusile(*producer)) {
+  if (CanBeOutputFused(producer, consumer)) {
+    return true;
+  }
+
+  if (CanBeOutputFusedIntoSomeOperand(producer)) {
+    return false;
+  }
+
+  if (!CanBeLoopFused(*producer)) {
     VLOG(2) << "Producer is not fusile.";
     return false;
   }
@@ -108,16 +133,13 @@ bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
     }
   }
 
-  if (consumer->opcode() == HloOpcode::kFusion) {
-    // InstructionFusion::ShouldFuse above only allows kLoop and kInput fusions.
-    // The CPU backend does not create kInput fusions, so we only expect to see
-    // kLoop here.
-    CHECK(consumer->fusion_kind() == HloInstruction::FusionKind::kLoop);
+  if (consumer->opcode() == HloOpcode::kFusion &&
+      consumer->fusion_kind() == HloInstruction::FusionKind::kLoop) {
     VLOG(2) << "Fusing: consumer is a fusion node.";
     return true;
   }
 
-  if (IsFusile(*consumer)) {
+  if (CanBeLoopFused(*consumer)) {
     VLOG(2) << "Fusing: consumer is elementwise or fusile.";
     return true;
   }
@@ -126,5 +148,11 @@ bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
   return false;
 }
 
+HloInstruction::FusionKind CpuInstructionFusion::ChooseKind(
+    const HloInstruction* producer, const HloInstruction* consumer) {
+  return CanBeOutputFused(producer, consumer)
+             ? HloInstruction::FusionKind::kOutput
+             : HloInstruction::FusionKind::kLoop;
+}
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h
index 0eca4c3473e1454fe5dbd8bf855b4418cf553a94..07aff34974e0cfa6c7a129f82017b280fb1ccd59 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h
@@ -30,6 +30,8 @@ class CpuInstructionFusion : public InstructionFusion {
 
  protected:
   bool ShouldFuse(HloInstruction* consumer, int64 operand_index) override;
+  HloInstruction::FusionKind ChooseKind(
+      const HloInstruction* producer, const HloInstruction* consumer) override;
 };
 
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
index b9e4d006d77ae76e33ac51440349400ea4eff118..595c3f55b321f47e2312b93e0c238c7637495d77 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
@@ -31,6 +31,14 @@ namespace {
 
 using InstructionFusionTest = HloTestBase;
 
+std::unique_ptr<HloInstruction> MakeDot(const Shape& shape, HloInstruction* lhs,
+                                        HloInstruction* rhs) {
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  return HloInstruction::CreateDot(shape, lhs, rhs, dot_dnums);
+}
+
 TEST_F(InstructionFusionTest, DotOperationFusion_Basic_0) {
   HloComputation::Builder builder(TestName());
   HloInstruction* arg0 = builder.AddInstruction(HloInstruction::CreateParameter(
@@ -40,8 +48,8 @@ TEST_F(InstructionFusionTest, DotOperationFusion_Basic_0) {
 
   HloInstruction* exp0 = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(S32, {1024, 256}), HloOpcode::kExp, arg0));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {1024, 1}), HloOpcode::kDot, exp0, arg1));
+  HloInstruction* dot = builder.AddInstruction(
+      MakeDot(ShapeUtil::MakeShape(F32, {1024, 1}), exp0, arg1));
 
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
@@ -59,8 +67,8 @@ TEST_F(InstructionFusionTest, DotOperationFusion_Basic_1) {
 
   HloInstruction* exp1 = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(S32, {256, 1024}), HloOpcode::kExp, arg1));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {1, 1024}), HloOpcode::kDot, arg0, exp1));
+  HloInstruction* dot = builder.AddInstruction(
+      MakeDot(ShapeUtil::MakeShape(F32, {1, 1024}), arg0, exp1));
 
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
@@ -80,8 +88,8 @@ TEST_F(InstructionFusionTest, DotOperationFusion_Bitcast) {
       ShapeUtil::MakeShape(S32, {2, 512, 2, 128}), HloOpcode::kExp, arg0));
   HloInstruction* bitcast0 = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(S32, {1024, 256}), HloOpcode::kBitcast, exp0));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {1024, 1}), HloOpcode::kDot, bitcast0, arg1));
+  HloInstruction* dot = builder.AddInstruction(
+      MakeDot(ShapeUtil::MakeShape(F32, {1024, 1}), bitcast0, arg1));
 
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
@@ -102,8 +110,8 @@ TEST_F(InstructionFusionTest, DotOperationFusion_Reshape) {
   HloInstruction* reshape0 =
       builder.AddInstruction(HloInstruction::CreateReshape(
           ShapeUtil::MakeShape(S32, {1024, 256}), exp0));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {1024, 1}), HloOpcode::kDot, reshape0, arg1));
+  HloInstruction* dot = builder.AddInstruction(
+      MakeDot(ShapeUtil::MakeShape(F32, {1024, 1}), reshape0, arg1));
 
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
@@ -121,8 +129,8 @@ TEST_F(InstructionFusionTest, DotOperationFusion_TooLarge) {
 
   HloInstruction* exp1 = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(S32, {256, 32 * 1024}), HloOpcode::kExp, arg1));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {1, 32 * 1024}), HloOpcode::kDot, arg0, exp1));
+  HloInstruction* dot = builder.AddInstruction(
+      MakeDot(ShapeUtil::MakeShape(F32, {1, 32 * 1024}), arg0, exp1));
 
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
@@ -140,8 +148,8 @@ TEST_F(InstructionFusionTest, DotOperationFusion_ElementReuse) {
 
   HloInstruction* exp1 = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(S32, {256, 1024}), HloOpcode::kExp, arg1));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {2, 1024}), HloOpcode::kDot, arg0, exp1));
+  HloInstruction* dot = builder.AddInstruction(
+      MakeDot(ShapeUtil::MakeShape(F32, {2, 1024}), arg0, exp1));
 
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
@@ -162,8 +170,8 @@ TEST_F(InstructionFusionTest, DotOperationFusion_TransposeFusion) {
   HloInstruction* transpose1 =
       builder.AddInstruction(HloInstruction::CreateTranspose(
           ShapeUtil::MakeShape(S32, {256, 1024}), exp1, {1, 0}));
-  builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {1, 1024}), HloOpcode::kDot, arg0, transpose1));
+  builder.AddInstruction(
+      MakeDot(ShapeUtil::MakeShape(F32, {1, 1024}), arg0, transpose1));
 
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
@@ -188,7 +196,9 @@ class OpcodeFusionTest : public InstructionFusionTest {
   // Runs CPU instruction fusion on the given module, and tests that the result
   // contains a fused op at the root with exactly the given multiset of opcodes.
   void RunFusionAndCheckOpcodesWereFused(
-      HloModule* module, const std::multiset<HloOpcode>& expected_opcodes) {
+      HloModule* module, const std::multiset<HloOpcode>& expected_opcodes,
+      HloInstruction::FusionKind fusion_kind =
+          HloInstruction::FusionKind::kLoop) {
     auto computation = module->entry_computation();
     auto did_fusion = CpuInstructionFusion().Run(module);
     ASSERT_TRUE(did_fusion.ok());
@@ -196,7 +206,7 @@ class OpcodeFusionTest : public InstructionFusionTest {
 
     HloInstruction* root = computation->root_instruction();
     ASSERT_THAT(root, op::Fusion());
-    EXPECT_EQ(root->fusion_kind(), HloInstruction::FusionKind::kLoop);
+    EXPECT_EQ(root->fusion_kind(), fusion_kind);
 
     std::vector<HloOpcode> fused_opcodes(root->fused_instruction_count());
     std::transform(root->fused_instructions().begin(),
@@ -608,6 +618,88 @@ TEST_F(OpcodeFusionTest, ReuseViaImplicitBroadcastBinary) {
               Not(op::Fusion()));
 }
 
+void CreateComputationForDotAddOutputFusionTest(const string& test_name,
+                                                HloModule* module, int m, int k,
+                                                int n,
+                                                bool add_extra_use_for_dot) {
+  HloComputation::Builder builder(test_name);
+
+  Shape dot_lhs_shape = ShapeUtil::MakeShape(F32, {m, k});
+  Shape dot_rhs_shape = ShapeUtil::MakeShape(F32, {k, n});
+  Shape dot_shape = ShapeUtil::MakeShape(F32, {m, n});
+
+  auto* dot_lhs = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, dot_lhs_shape, "param0"));
+  auto* dot_rhs = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, dot_rhs_shape, "param1"));
+  auto* addend = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, dot_shape, "param2"));
+
+  auto* dot = builder.AddInstruction(
+      HloInstruction::CreateCanonicalDot(dot_shape, dot_lhs, dot_rhs));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(dot_shape, HloOpcode::kAdd, dot, addend));
+
+  if (add_extra_use_for_dot) {
+    builder.AddInstruction(
+        HloInstruction::CreateOutfeed(dot_shape, dot, "no_config"));
+  }
+
+  module->AddEntryComputation(builder.Build());
+}
+
+TEST_F(OpcodeFusionTest, DotAddOutputFusion_1x50x19) {
+  auto module = CreateNewModule();
+  CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/1,
+                                             /*k=*/50, /*n=*/19,
+                                             /*add_extra_use_for_dot=*/false);
+
+  RunFusionAndCheckOpcodesWereFused(
+      module.get(),
+      {HloOpcode::kDot, HloOpcode::kAdd, HloOpcode::kParameter,
+       HloOpcode::kParameter, HloOpcode::kParameter},
+      HloInstruction::FusionKind::kOutput);
+}
+
+TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x1) {
+  auto module = CreateNewModule();
+  CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/19,
+                                             /*k=*/50, /*n=*/1,
+                                             /*add_extra_use_for_dot=*/false);
+
+  RunFusionAndCheckOpcodesWereFused(
+      module.get(),
+      {HloOpcode::kDot, HloOpcode::kAdd, HloOpcode::kParameter,
+       HloOpcode::kParameter, HloOpcode::kParameter},
+      HloInstruction::FusionKind::kOutput);
+}
+
+TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x19) {
+  auto module = CreateNewModule();
+  CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/19,
+                                             /*k=*/50, /*n=*/19,
+                                             /*add_extra_use_for_dot=*/false);
+
+  TF_ASSERT_OK_AND_ASSIGN(bool fused_something,
+                          CpuInstructionFusion().Run(module.get()));
+  EXPECT_FALSE(fused_something);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              Not(op::Fusion()));
+}
+
+TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x1_multi_use) {
+  auto module = CreateNewModule();
+  CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/19,
+                                             /*k=*/50, /*n=*/1,
+                                             /*add_extra_use_for_dot=*/true);
+
+  TF_ASSERT_OK_AND_ASSIGN(bool fused_something,
+                          CpuInstructionFusion().Run(module.get()));
+  EXPECT_FALSE(fused_something);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              Not(op::Fusion()));
+}
+
 }  // namespace
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/layout_assignment.cc b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
similarity index 55%
rename from tensorflow/compiler/xla/service/cpu/layout_assignment.cc
rename to tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
index 3f2d101959db50d9f775097f01d5a2ba25a0da8c..e8117377e61a4e21b8c45b929c518a18878fcb60 100644
--- a/tensorflow/compiler/xla/service/cpu/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/cpu/layout_assignment.h"
+#include "tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h"
 
 #include <numeric>
 
@@ -25,58 +25,77 @@ limitations under the License.
 namespace xla {
 namespace cpu {
 
-Status CpuLayoutAssignment::AddBackendConstraints(
-    LayoutConstraints* constraints) {
-  auto row_major_shape = [](const Shape& old_shape) {
-    Shape new_shape(old_shape);
-    std::vector<int64> dimension_order(new_shape.dimensions_size());
-    std::iota(dimension_order.rbegin(), dimension_order.rend(), 0);
-    *new_shape.mutable_layout() = LayoutUtil::MakeLayout(dimension_order);
-    return new_shape;
-  };
-  auto col_major_shape = [](const Shape& old_shape) {
-    Shape new_shape(old_shape);
-    std::vector<int64> dimension_order(new_shape.dimensions_size());
-    std::iota(dimension_order.begin(), dimension_order.end(), 0);
-    *new_shape.mutable_layout() = LayoutUtil::MakeLayout(dimension_order);
-    return new_shape;
-  };
-
-  // We want to change the layout of constant arrays to be column major when all
-  // of their users are dot operations that can be made faster with the flipped
-  // layout.  To avoid going quadriatic over the # of instructions, we cache
-  // this property in should_make_rhs_col_major -- it maps a constant to true if
-  // all of the users of said constant are dot operations that can be sped up.
-  // This cache is populated lazily as we encounter dot operations traversing
-  // the instruction stream.
-  tensorflow::gtl::FlatMap<const HloInstruction*, bool>
-      should_make_rhs_col_major_cache;
-  auto should_make_rhs_col_major = [&](const HloInstruction& instruction) {
-    if (ProfitableToImplementDotInUntiledLlvmIr(instruction) !=
-        DotInLlvmIrProfitable::kWithColumnMajorRhs) {
+// We want to change the layout of constant arrays to be column major when all
+// of their users are dot operations that can be made faster with the flipped
+// layout.  To avoid going quadriatic over the # of instructions, we cache this
+// property in should_make_rhs_col_major -- it maps a constant to true if all of
+// the users of said constant are dot operations that can be sped up.  This
+// cache is populated lazily as we encounter dot operations traversing the
+// instruction stream.
+
+namespace {
+using ::tensorflow::gtl::nullopt;
+using ::tensorflow::gtl::optional;
+
+using ShouldMakeOperandColMajorCache =
+    tensorflow::gtl::FlatMap<const HloInstruction*, bool>;
+}  // namespace
+
+static bool ShouldMakeAllUsersColMajor(const HloInstruction* instruction) {
+  for (auto* user : instruction->users()) {
+    optional<int64> operand_idx = ProfitableToMakeDotOperandColumnMajor(*user);
+    if (!operand_idx || user->operand(*operand_idx) != instruction ||
+        std::count(user->operands().begin(), user->operands().end(),
+                   instruction) != 1) {
       return false;
     }
+  }
+  return true;
+}
 
-    const auto* rhs = instruction.operand(1);
-    if (rhs->opcode() != HloOpcode::kConstant) {
-      return false;
-    }
+static optional<int64> ShouldMakeOperandColumnMajor(
+    ShouldMakeOperandColMajorCache* cache, const HloInstruction& instruction) {
+  optional<int64> operand_idx =
+      ProfitableToMakeDotOperandColumnMajor(instruction);
+  if (!operand_idx) {
+    return nullopt;
+  }
 
-    auto it = should_make_rhs_col_major_cache.find(rhs);
-    if (it != should_make_rhs_col_major_cache.end()) {
-      return it->second;
-    }
+  const HloInstruction* operand = instruction.operand(*operand_idx);
+  if (operand->opcode() != HloOpcode::kConstant) {
+    return nullopt;
+  }
 
-    bool result = std::all_of(
-        rhs->users().begin(), rhs->users().end(), [&](HloInstruction* user) {
-          return ProfitableToImplementDotInUntiledLlvmIr(*user) ==
-                     DotInLlvmIrProfitable::kWithColumnMajorRhs &&
-                 user->operand(0) != rhs;
-        });
+  auto it = cache->find(operand);
+  if (it == cache->end()) {
+    auto insert_result =
+        cache->insert({operand, ShouldMakeAllUsersColMajor(operand)});
+    CHECK(insert_result.second);
+    it = insert_result.first;
+  }
 
-    InsertOrDie(&should_make_rhs_col_major_cache, rhs, result);
-    return result;
-  };
+  return it->second ? operand_idx : nullopt;
+}
+
+static Shape RowMajorShape(const Shape& old_shape) {
+  Shape new_shape(old_shape);
+  std::vector<int64> dimension_order(new_shape.dimensions_size());
+  std::iota(dimension_order.rbegin(), dimension_order.rend(), 0);
+  *new_shape.mutable_layout() = LayoutUtil::MakeLayout(dimension_order);
+  return new_shape;
+}
+
+static Shape ColMajorShape(const Shape& old_shape) {
+  Shape new_shape(old_shape);
+  std::vector<int64> dimension_order(new_shape.dimensions_size());
+  std::iota(dimension_order.begin(), dimension_order.end(), 0);
+  *new_shape.mutable_layout() = LayoutUtil::MakeLayout(dimension_order);
+  return new_shape;
+}
+
+Status CpuLayoutAssignment::AddBackendConstraints(
+    LayoutConstraints* constraints) {
+  ShouldMakeOperandColMajorCache cache;
 
   const HloComputation* computation = constraints->computation();
   for (auto* instruction : computation->instructions()) {
@@ -91,9 +110,9 @@ Status CpuLayoutAssignment::AddBackendConstraints(
       //
       // These constraints are not hard constraints. Ideally, we should decide
       // which layouts to choose according to some cost model.
-      Shape output_shape(row_major_shape(convolution->shape()));
-      Shape input_shape(row_major_shape(lhs_instruction->shape()));
-      Shape filter_shape(row_major_shape(rhs_instruction->shape()));
+      Shape output_shape(RowMajorShape(convolution->shape()));
+      Shape input_shape(RowMajorShape(lhs_instruction->shape()));
+      Shape filter_shape(RowMajorShape(rhs_instruction->shape()));
 
       // Set layouts of the instructions' shapes.
       TF_RETURN_IF_ERROR(
@@ -102,11 +121,11 @@ Status CpuLayoutAssignment::AddBackendConstraints(
           constraints->SetOperandLayout(filter_shape, convolution, 1));
       TF_RETURN_IF_ERROR(
           constraints->SetInstructionLayout(output_shape, convolution));
-    } else if (should_make_rhs_col_major(*instruction)) {
-      auto* dot = instruction;
-      const auto& rhs_shape = dot->operand(1)->shape();
-      TF_RETURN_IF_ERROR(
-          constraints->SetOperandLayout(col_major_shape(rhs_shape), dot, 1));
+    } else if (optional<int64> op_idx =
+                   ShouldMakeOperandColumnMajor(&cache, *instruction)) {
+      const HloInstruction* op = instruction->operand(*op_idx);
+      TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
+          ColMajorShape(op->shape()), instruction, *op_idx));
     } else if (PotentiallyImplementedAsEigenDot(*instruction)) {
       const HloInstruction* dot = instruction;
       // In order to implement `dot` with Eigen dot, the layouts of the lhs,
@@ -114,17 +133,17 @@ Status CpuLayoutAssignment::AddBackendConstraints(
       //
       // These constraints are not hard constraints. Ideally, we should decide
       // which layouts to choose according to some cost model.
-      Shape output_shape(row_major_shape(dot->shape()));
+      Shape output_shape(RowMajorShape(dot->shape()));
 
       const HloInstruction* lhs_instruction = dot->operand(0);
-      Shape lhs_shape(row_major_shape(lhs_instruction->shape()));
+      Shape lhs_shape(RowMajorShape(lhs_instruction->shape()));
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(lhs_shape, dot, 0));
 
       // dot is a kDot or a kTransposeDot fusion node.  In the latter case, if
       // it represents X @ X, it may have just one operand.
       if (dot->operand_count() > 1) {
         const HloInstruction* rhs_instruction = dot->operand(1);
-        Shape rhs_shape(row_major_shape(rhs_instruction->shape()));
+        Shape rhs_shape(RowMajorShape(rhs_instruction->shape()));
         TF_RETURN_IF_ERROR(constraints->SetOperandLayout(rhs_shape, dot, 1));
       }
 
@@ -141,8 +160,12 @@ Status CpuLayoutAssignment::AddBackendConstraints(
         if (constraints->OperandBufferForwarded(instruction, operand_no)) {
           continue;
         }
+        // Skip operands with non-array shapes.
+        if (!ShapeUtil::IsArray(instruction->operand(operand_no)->shape())) {
+          continue;
+        }
         Shape operand_shape(
-            row_major_shape(instruction->operand(operand_no)->shape()));
+            RowMajorShape(instruction->operand(operand_no)->shape()));
         TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
             operand_shape, instruction, operand_no));
       }
diff --git a/tensorflow/compiler/xla/service/cpu/layout_assignment.h b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h
similarity index 86%
rename from tensorflow/compiler/xla/service/cpu/layout_assignment.h
rename to tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h
index 4fd8d68dd6b4f2a8b16f6c048743a996ea76a560..c8edbb9e15a5b6f9c574f5fe9d130d149499ebd2 100644
--- a/tensorflow/compiler/xla/service/cpu/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_LAYOUT_ASSIGNMENT_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_LAYOUT_ASSIGNMENT_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_LAYOUT_ASSIGNMENT_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_LAYOUT_ASSIGNMENT_H_
 
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/layout_assignment.h"
@@ -38,4 +38,4 @@ class CpuLayoutAssignment : public LayoutAssignment {
 }  // namespace cpu
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_LAYOUT_ASSIGNMENT_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_LAYOUT_ASSIGNMENT_H_
diff --git a/tensorflow/compiler/xla/service/cpu/layout_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc
similarity index 54%
rename from tensorflow/compiler/xla/service/cpu/layout_assignment_test.cc
rename to tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc
index 1ea5e8c7fc4896512e62396d0a756cda44785f11..6ba030fff3bbc5f413bfb133114ceb5309b77672 100644
--- a/tensorflow/compiler/xla/service/cpu/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/cpu/layout_assignment.h"
+#include "tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h"
 
 #include <initializer_list>
 #include <memory>
@@ -40,6 +40,8 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 
+namespace op = xla::testing::opcode_matchers;
+
 namespace xla {
 namespace {
 
@@ -61,8 +63,8 @@ TEST_F(CpuLayoutAssignmentTest, DotWithConstantRhsTensor) {
       HloInstruction::CreateParameter(0, lhs_shape, "param0"));
   auto dot_rhs = builder.AddInstruction(
       HloInstruction::CreateConstant(Literal::CreateFromShape(rhs_shape)));
-  auto result = builder.AddInstruction(HloInstruction::CreateBinary(
-      result_shape, HloOpcode::kDot, dot_lhs, dot_rhs));
+  auto result = builder.AddInstruction(
+      HloInstruction::CreateCanonicalDot(result_shape, dot_lhs, dot_rhs));
 
   auto module = CreateNewModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
@@ -98,10 +100,10 @@ TEST_F(CpuLayoutAssignmentTest, MultipleDotsWithSameConstantRhsTensor0) {
       HloInstruction::CreateParameter(1, lhs_shape, "param1"));
   auto dot_rhs = builder.AddInstruction(
       HloInstruction::CreateConstant(Literal::CreateFromShape(rhs_shape)));
-  auto dot_a_result = builder.AddInstruction(HloInstruction::CreateBinary(
-      result_shape, HloOpcode::kDot, dot_a_lhs, dot_rhs));
-  auto dot_b_result = builder.AddInstruction(HloInstruction::CreateBinary(
-      result_shape, HloOpcode::kDot, dot_b_lhs, dot_rhs));
+  auto dot_a_result = builder.AddInstruction(
+      HloInstruction::CreateCanonicalDot(result_shape, dot_a_lhs, dot_rhs));
+  auto dot_b_result = builder.AddInstruction(
+      HloInstruction::CreateCanonicalDot(result_shape, dot_b_lhs, dot_rhs));
   builder.AddInstruction(HloInstruction::CreateBinary(
       result_shape, HloOpcode::kAdd, dot_a_result, dot_b_result));
 
@@ -142,10 +144,10 @@ TEST_F(CpuLayoutAssignmentTest, MultipleDotsWithSameConstantRhsTensor1) {
       HloInstruction::CreateParameter(1, lhs_b_shape, "param1"));
   auto dot_rhs = builder.AddInstruction(
       HloInstruction::CreateConstant(Literal::CreateFromShape(rhs_shape)));
-  auto dot_a_result = builder.AddInstruction(HloInstruction::CreateBinary(
-      result_a_shape, HloOpcode::kDot, dot_a_lhs, dot_rhs));
-  auto dot_b_result = builder.AddInstruction(HloInstruction::CreateBinary(
-      result_b_shape, HloOpcode::kDot, dot_b_lhs, dot_rhs));
+  auto dot_a_result = builder.AddInstruction(
+      HloInstruction::CreateCanonicalDot(result_a_shape, dot_a_lhs, dot_rhs));
+  auto dot_b_result = builder.AddInstruction(
+      HloInstruction::CreateCanonicalDot(result_b_shape, dot_b_lhs, dot_rhs));
   auto tuple_result = builder.AddInstruction(
       HloInstruction::CreateTuple({dot_a_result, dot_b_result}));
 
@@ -180,8 +182,8 @@ TEST_F(CpuLayoutAssignmentTest, DotWithConstantLhsTensor) {
       HloInstruction::CreateConstant(Literal::CreateFromShape(lhs_shape)));
   auto dot_rhs = builder.AddInstruction(
       HloInstruction::CreateParameter(0, rhs_shape, "param0"));
-  auto dot_result = builder.AddInstruction(HloInstruction::CreateBinary(
-      result_shape, HloOpcode::kDot, dot_lhs, dot_rhs));
+  auto dot_result = builder.AddInstruction(
+      HloInstruction::CreateCanonicalDot(result_shape, dot_lhs, dot_rhs));
 
   auto module = CreateNewModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
@@ -220,8 +222,8 @@ TEST_F(CpuLayoutAssignmentTest, DotWithConstantRhsTensorThroughGTE) {
       HloInstruction::CreateParameter(0, lhs_shape, "param0"));
   auto dot_rhs = builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(rhs_shape, constant, 1));
-  auto dot_result = builder.AddInstruction(HloInstruction::CreateBinary(
-      result_shape, HloOpcode::kDot, dot_lhs, dot_rhs));
+  auto dot_result = builder.AddInstruction(
+      HloInstruction::CreateCanonicalDot(result_shape, dot_lhs, dot_rhs));
 
   auto module = CreateNewModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
@@ -241,5 +243,172 @@ TEST_F(CpuLayoutAssignmentTest, DotWithConstantRhsTensorThroughGTE) {
     EXPECT_NE(instruction->opcode(), HloOpcode::kCopy);
   }
 }
+
+struct DotOutputFusionLayoutAssignmentResult {
+  bool layout_assignment_changed_something;
+  const HloInstruction* dot_lhs_fusion_param;
+  const HloInstruction* dot_rhs_fusion_param;
+  const HloInstruction* addend_fusion_param;
+};
+
+static StatusOr<DotOutputFusionLayoutAssignmentResult> RunDotOutputFusion(
+    HloModule* module, const string& test_name, int m, int k, int n,
+    const int64 dot_operand_idx_in_add) {
+  DotOutputFusionLayoutAssignmentResult result;
+
+  CHECK(dot_operand_idx_in_add == 0 || dot_operand_idx_in_add == 1);
+
+  auto builder = HloComputation::Builder(test_name);
+
+  Shape dot_lhs_shape = ShapeUtil::MakeShape(F32, {m, k});
+  Shape dot_rhs_shape = ShapeUtil::MakeShape(F32, {k, n});
+  Shape dot_shape = ShapeUtil::MakeShape(F32, {m, n});
+
+  HloInstruction* dot_lhs = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, dot_lhs_shape, "param0"));
+  HloInstruction* addend = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, dot_shape, "param1"));
+  HloInstruction* dot_rhs = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateFromShape(dot_rhs_shape)));
+  HloInstruction* dot_result = builder.AddInstruction(
+      HloInstruction::CreateCanonicalDot(dot_shape, dot_lhs, dot_rhs));
+  HloInstruction* add_result;
+  if (dot_operand_idx_in_add == 0) {
+    add_result = builder.AddInstruction(HloInstruction::CreateBinary(
+        dot_shape, HloOpcode::kAdd, dot_result, addend));
+  } else {
+    add_result = builder.AddInstruction(HloInstruction::CreateBinary(
+        dot_shape, HloOpcode::kAdd, addend, dot_result));
+  }
+
+  HloComputation* computation = module->AddEntryComputation(builder.Build());
+
+  HloInstruction* fusion_instruction =
+      module->entry_computation()->AddInstruction(HloInstruction::CreateFusion(
+          dot_shape, HloInstruction::FusionKind::kOutput, add_result));
+  TF_RETURN_IF_ERROR(
+      computation->ReplaceInstruction(add_result, fusion_instruction));
+
+  HloInstruction* fused_add =
+      fusion_instruction->fused_instructions_computation()->root_instruction();
+  HloInstruction* fused_dot = fusion_instruction->FuseInstruction(dot_result);
+
+  TF_RETURN_IF_ERROR(
+      computation->RemoveInstructionAndUnusedOperands(dot_result));
+
+  ComputationLayout computation_layout(computation->ComputeProgramShape());
+  *computation_layout.mutable_parameter_layout(0) =
+      ShapeLayout(LayoutUtil::GetWithDefaultLayout(dot_lhs_shape));
+  *computation_layout.mutable_parameter_layout(1) =
+      ShapeLayout(LayoutUtil::GetWithDefaultLayout(dot_shape));
+  *computation_layout.mutable_result_layout() =
+      ShapeLayout(LayoutUtil::GetWithDefaultLayout(dot_shape));
+
+  result.dot_lhs_fusion_param =
+      fusion_instruction->operand(fused_dot->operand(0)->parameter_number());
+  result.dot_rhs_fusion_param =
+      fusion_instruction->operand(fused_dot->operand(1)->parameter_number());
+  result.addend_fusion_param = fusion_instruction->operand(
+      fused_add->operand(1 - dot_operand_idx_in_add)->parameter_number());
+
+  cpu::CpuLayoutAssignment layout_assignment(&computation_layout);
+  TF_ASSIGN_OR_RETURN(result.layout_assignment_changed_something,
+                      layout_assignment.Run(module));
+
+  return result;
+}
+
+static void AssertCorrectLayoutForDotOutputFusion(
+    const HloComputation* computation,
+    const DotOutputFusionLayoutAssignmentResult& layout_assignment_result,
+    bool expect_col_major_dot_rhs) {
+  Layout expected_dot_rhs_layout = expect_col_major_dot_rhs
+                                       ? LayoutUtil::MakeLayout({0, 1})
+                                       : LayoutUtil::MakeLayout({1, 0});
+  EXPECT_TRUE(LayoutUtil::Equal(
+      expected_dot_rhs_layout,
+      layout_assignment_result.dot_rhs_fusion_param->shape().layout()));
+
+  EXPECT_TRUE(LayoutUtil::Equal(
+      LayoutUtil::MakeLayout({1, 0}),
+      layout_assignment_result.dot_lhs_fusion_param->shape().layout()));
+
+  EXPECT_TRUE(LayoutUtil::Equal(
+      LayoutUtil::MakeLayout({1, 0}),
+      layout_assignment_result.addend_fusion_param->shape().layout()));
+  EXPECT_THAT(computation->instructions(), Each(Not(op::Copy())));
+}
+
+TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_1x50x19_dot_idx_0) {
+  std::unique_ptr<HloModule> module = CreateNewModule();
+  TF_ASSERT_OK_AND_ASSIGN(
+      DotOutputFusionLayoutAssignmentResult layout_assignment_result,
+      RunDotOutputFusion(module.get(), TestName(), /*m=*/1, /*k=*/50, /*n=*/19,
+                         /*dot_operand_idx_in_add=*/0));
+  ASSERT_TRUE(layout_assignment_result.layout_assignment_changed_something);
+  AssertCorrectLayoutForDotOutputFusion(module->entry_computation(),
+                                        layout_assignment_result,
+                                        /*expect_col_major_dot_rhs=*/true);
+}
+
+TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_1x50x19_dot_idx_1) {
+  std::unique_ptr<HloModule> module = CreateNewModule();
+  TF_ASSERT_OK_AND_ASSIGN(
+      DotOutputFusionLayoutAssignmentResult layout_assignment_result,
+      RunDotOutputFusion(module.get(), TestName(), /*m=*/1, /*k=*/50, /*n=*/19,
+                         /*dot_operand_idx_in_add=*/1));
+  ASSERT_TRUE(layout_assignment_result.layout_assignment_changed_something);
+  AssertCorrectLayoutForDotOutputFusion(module->entry_computation(),
+                                        layout_assignment_result,
+                                        /*expect_col_major_dot_rhs=*/true);
+}
+
+TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x1_dot_idx_0) {
+  std::unique_ptr<HloModule> module = CreateNewModule();
+  TF_ASSERT_OK_AND_ASSIGN(
+      DotOutputFusionLayoutAssignmentResult layout_assignment_result,
+      RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/1,
+                         /*dot_operand_idx_in_add=*/0));
+  ASSERT_TRUE(layout_assignment_result.layout_assignment_changed_something);
+  AssertCorrectLayoutForDotOutputFusion(module->entry_computation(),
+                                        layout_assignment_result,
+                                        /*expect_col_major_dot_rhs=*/false);
+}
+
+TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x1_dot_idx_1) {
+  std::unique_ptr<HloModule> module = CreateNewModule();
+  TF_ASSERT_OK_AND_ASSIGN(
+      DotOutputFusionLayoutAssignmentResult layout_assignment_result,
+      RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/1,
+                         /*dot_operand_idx_in_add=*/1));
+  ASSERT_TRUE(layout_assignment_result.layout_assignment_changed_something);
+  AssertCorrectLayoutForDotOutputFusion(module->entry_computation(),
+                                        layout_assignment_result,
+                                        /*expect_col_major_dot_rhs=*/false);
+}
+
+TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x19_dot_idx_0) {
+  std::unique_ptr<HloModule> module = CreateNewModule();
+  TF_ASSERT_OK_AND_ASSIGN(
+      DotOutputFusionLayoutAssignmentResult layout_assignment_result,
+      RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/19,
+                         /*dot_operand_idx_in_add=*/0));
+  ASSERT_TRUE(layout_assignment_result.layout_assignment_changed_something);
+  AssertCorrectLayoutForDotOutputFusion(module->entry_computation(),
+                                        layout_assignment_result,
+                                        /*expect_col_major_dot_rhs=*/false);
+}
+
+TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x19_dot_idx_1) {
+  std::unique_ptr<HloModule> module = CreateNewModule();
+  TF_ASSERT_OK_AND_ASSIGN(
+      DotOutputFusionLayoutAssignmentResult layout_assignment_result,
+      RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/19,
+                         /*dot_operand_idx_in_add=*/1));
+  ASSERT_TRUE(layout_assignment_result.layout_assignment_changed_something);
+  AssertCorrectLayoutForDotOutputFusion(module->entry_computation(),
+                                        layout_assignment_result,
+                                        /*expect_col_major_dot_rhs=*/false);
+}
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h
index acfada8540d89bb098bb0b04e109441e2123e678..74ae6d00c91be07c0d181ea324e570c73c6b2e77 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h
@@ -38,14 +38,16 @@ typedef float V8F32AVX __attribute__((__vector_size__(32)));
 
 extern "C" {
 
+#ifdef __AVX__
 // The following functions are vectorized versions of a selection of libm
 // library functions.
 // References to these functions are created by the LLVM vectorizer.
 xla::cpu::runtime::V8F32AVX __xla_cpu_runtime_ExpV8F32AVX(
-    xla::cpu::runtime::V8F32AVX x) TF_ATTRIBUTE_WEAK;
+    xla::cpu::runtime::V8F32AVX x);
 
 xla::cpu::runtime::V8F32AVX __xla_cpu_runtime_LogV8F32AVX(
-    xla::cpu::runtime::V8F32AVX x) TF_ATTRIBUTE_WEAK;
+    xla::cpu::runtime::V8F32AVX x);
+#endif
 }
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_AVX_H_
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.h
index 75cb16b273973d2bf665d378084343fd612a2941..645a43858fb8c3d8e7e94709333c88503b6cc52d 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.h
@@ -49,14 +49,16 @@ struct V4F32NEON;
 
 extern "C" {
 
+#ifdef __ARM_NEON__
 // The following functions are vectorized versions of a selection of libm
 // library functions.
 // References to these functions are created by the LLVM vectorizer.
 xla::cpu::runtime::V4F32NEON __xla_cpu_runtime_ExpV4F32NEON(
-    xla::cpu::runtime::V4F32NEON x) TF_ATTRIBUTE_WEAK;
+    xla::cpu::runtime::V4F32NEON x);
 
 xla::cpu::runtime::V4F32NEON __xla_cpu_runtime_LogV4F32NEON(
-    xla::cpu::runtime::V4F32NEON x) TF_ATTRIBUTE_WEAK;
+    xla::cpu::runtime::V4F32NEON x);
+#endif  // __ARM_NEON__
 }
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_NEON_H_
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h
index 96587d10d2b86e14ff6a7400fdf14ca0d994ddc5..1bd8494bf8494d2100e68841f974c86e2beb3859 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h
@@ -39,14 +39,17 @@ typedef float V4F32SSE __attribute__((__vector_size__(16)));
 
 extern "C" {
 
+#ifdef __SSE4_1__
 // The following functions are vectorized versions of a selection of libm
 // library functions.
 // References to these functions are created by the LLVM vectorizer.
 xla::cpu::runtime::V4F32SSE __xla_cpu_runtime_ExpV4F32SSE(
-    xla::cpu::runtime::V4F32SSE x) TF_ATTRIBUTE_WEAK;
+    xla::cpu::runtime::V4F32SSE x);
 
 xla::cpu::runtime::V4F32SSE __xla_cpu_runtime_LogV4F32SSE(
-    xla::cpu::runtime::V4F32SSE x) TF_ATTRIBUTE_WEAK;
+    xla::cpu::runtime::V4F32SSE x);
+#endif
+
 }
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_SSE4_1_H_
diff --git a/tensorflow/compiler/xla/service/cpu/disassembler.h b/tensorflow/compiler/xla/service/cpu/disassembler.h
index b6feaa7e45cee26eb7f850081bd1fad2cb63b15c..5e302f88990ee4a3c37758881ecec4d6f71dd8e6 100644
--- a/tensorflow/compiler/xla/service/cpu/disassembler.h
+++ b/tensorflow/compiler/xla/service/cpu/disassembler.h
@@ -37,7 +37,7 @@ struct DisassemblerResult {
   DisassemblerResult(const string& text, size_t code_size_bytes)
       : text(text), code_size_bytes(code_size_bytes) {}
 
-  // The dissassembled text sections of the object file.
+  // The disassembled text sections of the object file.
   string text;
   // The total number of bytes of executable code in the object file.
   uint64_t code_size_bytes;
@@ -53,7 +53,7 @@ class Disassembler {
   // Returns a DisassemblerResult for the given object file, containing the
   // disassembled code.
   //
-  // If we couldnt' retrieve a disassembler for this platform, an error status
+  // If we couldn't retrieve a disassembler for this platform, an error status
   // is returned.
   StatusOr<DisassemblerResult> DisassembleObjectFile(
       const llvm::object::ObjectFile& object_file) const;
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index 4c40dae5122b0853a72d6428fc120220e3a69237..74f71e5ad575134d78f834a9e63723c22ae49111 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -143,7 +143,8 @@ class ColumnMajorMatrixVectorProductEmitter {
   ColumnMajorMatrixVectorProductEmitter(PrimitiveType scalar_type,
                                         int64 tile_rows, int64 tile_cols,
                                         int64 m, int64 k, llvm::Value* lhs,
-                                        llvm::Value* rhs, llvm::Value* result,
+                                        llvm::Value* rhs, llvm::Value* addend,
+                                        llvm::Value* result,
                                         llvm::IRBuilder<>* ir_builder)
       : scalar_type_(scalar_type),
         tile_rows_(tile_rows),
@@ -152,6 +153,7 @@ class ColumnMajorMatrixVectorProductEmitter {
         k_(k),
         lhs_(lhs),
         rhs_(rhs),
+        addend_(addend),
         result_(result),
         ir_builder_(ir_builder),
         ksl_(ir_builder_),
@@ -198,6 +200,7 @@ class ColumnMajorMatrixVectorProductEmitter {
   int64 k_;
   llvm::Value* lhs_;
   llvm::Value* rhs_;
+  llvm::Value* addend_;
   llvm::Value* result_;
   llvm::IRBuilder<>* ir_builder_;
   KernelSupportLibrary ksl_;
@@ -242,9 +245,10 @@ void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopTiled(
            /*step=*/tile_rows_, [&](llvm::Value* row) {
              std::vector<llvm::Value*> lhs_tile =
                  lhs_tile_loader->LoadTile(/*minor_dim_offset=*/row);
-             llvm::Value* accumulator = is_first_column
-                                            ? vsl_.GetZeroVector()
-                                            : vsl_.LoadVector(result_, row);
+             llvm::Value* accumulator =
+                 is_first_column ? (addend_ ? vsl_.LoadVector(addend_, row)
+                                            : vsl_.GetZeroVector())
+                                 : vsl_.LoadVector(result_, row);
              for (int i = 0; i < columns; i++) {
                accumulator = vsl_.MulAdd(lhs_tile[i], rhs_tile[i], accumulator);
              }
@@ -288,7 +292,18 @@ void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
                   ir_builder_->getInt1(is_first_tiled_column));
               ksl_.If(
                   setting_result_first_time,
-                  [&]() { vsl_.StoreScalar(product, result_, scalar_row); },
+                  /*true_block_generator=*/
+                  [&]() {
+                    if (addend_) {
+                      vsl_.StoreScalar(
+                          vsl_.Add(vsl_.LoadScalar(addend_, scalar_row),
+                                   product),
+                          result_, scalar_row);
+                    } else {
+                      vsl_.StoreScalar(product, result_, scalar_row);
+                    }
+                  },
+                  /*false_block_generator=*/
                   [&]() {
                     vsl_.StoreScalar(
                         vsl_.Add(vsl_.LoadScalar(result_, scalar_row), product),
@@ -353,7 +368,7 @@ class RowMajorMatrixVectorProductEmitter {
   RowMajorMatrixVectorProductEmitter(PrimitiveType scalar_type, int64 tile_rows,
                                      int64 tile_cols, int64 m, int64 k,
                                      llvm::Value* lhs, llvm::Value* rhs,
-                                     llvm::Value* result,
+                                     llvm::Value* addend, llvm::Value* result,
                                      llvm::IRBuilder<>* ir_builder)
       : scalar_type_(scalar_type),
         tile_rows_(tile_rows),
@@ -362,6 +377,7 @@ class RowMajorMatrixVectorProductEmitter {
         k_(k),
         lhs_(lhs),
         rhs_(rhs),
+        addend_(addend),
         result_(result),
         ir_builder_(ir_builder),
         ksl_(ir_builder_),
@@ -394,6 +410,7 @@ class RowMajorMatrixVectorProductEmitter {
   int64 k_;
   llvm::Value* lhs_;
   llvm::Value* rhs_;
+  llvm::Value* addend_;
   llvm::Value* result_;
   llvm::IRBuilder<>* ir_builder_;
   KernelSupportLibrary ksl_;
@@ -415,11 +432,32 @@ void RowMajorMatrixVectorProductEmitter::EmitOuterLoopBody(llvm::Value* row,
   EmitInnerLoopEpilogue(/*current_tile_row=*/row, /*rows=*/row_count,
                         &scalar_accumulators);
 
+  std::vector<llvm::Value*> accumulator_values;
+  std::transform(
+      vector_accumulators.begin(), vector_accumulators.end(),
+      std::back_inserter(accumulator_values),
+      [](const VectorVariable& vector_var) { return vector_var.Get(); });
+
+  std::vector<llvm::Value*> horizontal_sums;
+  if (row_count == vsl_.vector_size()) {
+    if (addend_) {
+      horizontal_sums = vsl_.ComputeHorizontalSums(
+          std::move(accumulator_values), vsl_.LoadVector(addend_, row));
+    } else {
+      horizontal_sums =
+          vsl_.ComputeHorizontalSums(std::move(accumulator_values));
+    }
+  } else {
+    horizontal_sums = vsl_.ComputeHorizontalSums(std::move(accumulator_values));
+  }
+
   for (int i = 0; i < row_count; i++) {
     llvm::Value* result_value =
-        vsl_.Add(vsl_.AddReduce(vector_accumulators[i].Get()),
-                 scalar_accumulators[i].Get());
+        vsl_.Add(horizontal_sums[i], scalar_accumulators[i].Get());
     llvm::Value* offset = ir_builder_->CreateAdd(ir_builder_->getInt64(i), row);
+    if (addend_ && row_count != vsl_.vector_size()) {
+      result_value = vsl_.Add(vsl_.LoadScalar(addend_, offset), result_value);
+    }
     vsl_.StoreScalar(result_value, result_, offset);
   }
 }
@@ -483,20 +521,19 @@ void RowMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
 
 }  // namespace
 
-DotOpEmitter::DotOpEmitter(const HloInstruction& dot, bool transpose_lhs,
-                           bool transpose_rhs,
-                           const llvm_ir::IrArray& target_array,
-                           const llvm_ir::IrArray& lhs_array,
-                           const llvm_ir::IrArray& rhs_array,
-                           llvm::Value* executable_run_options_value,
-                           llvm::IRBuilder<>* ir_builder,
-                           const HloModuleConfig& hlo_module_config)
+DotOpEmitter::DotOpEmitter(
+    const HloInstruction& dot, bool transpose_lhs, bool transpose_rhs,
+    const llvm_ir::IrArray& target_array, const llvm_ir::IrArray& lhs_array,
+    const llvm_ir::IrArray& rhs_array, const llvm_ir::IrArray* addend_array,
+    llvm::Value* executable_run_options_value, llvm::IRBuilder<>* ir_builder,
+    const HloModuleConfig& hlo_module_config)
     : dot_(dot),
       transpose_lhs_(transpose_lhs),
       transpose_rhs_(transpose_rhs),
       target_array_(target_array),
       lhs_array_(lhs_array),
       rhs_array_(rhs_array),
+      addend_array_(addend_array),
       executable_run_options_value_(executable_run_options_value),
       ir_builder_(ir_builder),
       hlo_module_config_(hlo_module_config) {}
@@ -504,28 +541,29 @@ DotOpEmitter::DotOpEmitter(const HloInstruction& dot, bool transpose_lhs,
 /* static */ tensorflow::Status DotOpEmitter::EmitDotOperation(
     const HloInstruction& dot, bool transpose_lhs, bool transpose_rhs,
     const llvm_ir::IrArray& target_array, const llvm_ir::IrArray& lhs_array,
-    const llvm_ir::IrArray& rhs_array,
+    const llvm_ir::IrArray& rhs_array, const llvm_ir::IrArray* addend_array,
     llvm::Value* executable_run_options_value, llvm::IRBuilder<>* ir_builder,
     const HloModuleConfig& hlo_module_config) {
   PrimitiveType type = target_array.GetShape().element_type();
   TF_RET_CHECK(F32 == type || F64 == type || C64 == type);
   DotOpEmitter dot_emitter(dot, transpose_lhs, transpose_rhs, target_array,
-                           lhs_array, rhs_array, executable_run_options_value,
-                           ir_builder, hlo_module_config);
+                           lhs_array, rhs_array, addend_array,
+                           executable_run_options_value, ir_builder,
+                           hlo_module_config);
   return dot_emitter.Emit();
 }
 
 bool DotOpEmitter::ShapesAreLegalForRuntimeDot() const { return true; }
 
 bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
-  if (dot_.shape().dimensions_size() != 2 ||
-      ProfitableToImplementDotInUntiledLlvmIr(dot_) ==
-          DotInLlvmIrProfitable::kYes) {
+  if (dot_.shape().dimensions_size() != 2) {
     return false;
   }
 
-  if (!primitive_util::IsFloatingPointType(dot_.shape().element_type()) &&
-      !primitive_util::IsIntegralType(dot_.shape().element_type())) {
+  PrimitiveType primitive_type = dot_.shape().element_type();
+
+  if (!primitive_util::IsFloatingPointType(primitive_type) &&
+      !primitive_util::IsIntegralType(primitive_type)) {
     return false;
   }
 
@@ -575,30 +613,63 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
   int64 tiling_factor = GetGemvTilingFactor();
   CHECK_GT(tiling_factor, 0);
 
+  llvm::Value* result_op = target_array_.GetBasePointer();
+  llvm::Value* lhs_op =
+      swap_operands ? rhs_array_.GetBasePointer() : lhs_array_.GetBasePointer();
+  llvm::Value* rhs_op =
+      swap_operands ? lhs_array_.GetBasePointer() : rhs_array_.GetBasePointer();
+
+  const bool enable_fast_math =
+      hlo_module_config_.debug_options().xla_enable_fast_math();
+  const bool optimize_for_size =
+      options::OptimizeForSizeRequested(hlo_module_config_);
+
   if (is_column_major_matrix_vector) {
     VLOG(2) << "Emitting column major matrix-vector multiply with m = " << m
             << " and k = " << k;
-    ColumnMajorMatrixVectorProductEmitter emitter(
-        dot_.shape().element_type(), /*tile_rows=*/8,
-        /*tile_cols=*/tiling_factor, m, k,
-        swap_operands ? rhs_array_.GetBasePointer()
-                      : lhs_array_.GetBasePointer(),
-        swap_operands ? lhs_array_.GetBasePointer()
-                      : rhs_array_.GetBasePointer(),
-        target_array_.GetBasePointer(), ir_builder_);
-    emitter.Emit();
+    int64 tile_rows = 8;
+    int64 tile_cols = tiling_factor;
+
+    string kernel_name = tensorflow::strings::StrCat(
+        "col_major_gemv_", PrimitiveType_Name(primitive_type), "_", tile_rows,
+        "_", tile_cols, "_", m, "_", k, addend_array_ ? "_with_addend" : "");
+
+    KernelSupportLibrary::EmitAndCallOutlinedKernel(
+        /*enable_fast_math=*/enable_fast_math,
+        /*optimize_for_size=*/optimize_for_size, ir_builder_, kernel_name,
+        lhs_op, rhs_op,
+        addend_array_ ? addend_array_->GetBasePointer() : nullptr, result_op,
+        [this, tile_rows, tile_cols, m, k, primitive_type](
+            llvm::Value* lhs_op, llvm::Value* rhs_op, llvm::Value* addend_op,
+            llvm::Value* result_op) {
+          ColumnMajorMatrixVectorProductEmitter emitter(
+              primitive_type, tile_rows, tile_cols, m, k, lhs_op, rhs_op,
+              addend_op, result_op, ir_builder_);
+          emitter.Emit();
+        });
   } else {
     VLOG(2) << "Emitting row major matrix-vector multiply with m = " << m
             << " and k = " << k;
-    RowMajorMatrixVectorProductEmitter emitter(
-        dot_.shape().element_type(), /*tile_rows=*/tiling_factor,
-        /*tile_cols=*/8, m, k,
-        swap_operands ? rhs_array_.GetBasePointer()
-                      : lhs_array_.GetBasePointer(),
-        swap_operands ? lhs_array_.GetBasePointer()
-                      : rhs_array_.GetBasePointer(),
-        target_array_.GetBasePointer(), ir_builder_);
-    emitter.Emit();
+    int64 tile_rows = tiling_factor;
+    int64 tile_cols = 8;
+
+    string kernel_name = tensorflow::strings::StrCat(
+        "row_major_gemv_", PrimitiveType_Name(primitive_type), "_", tile_rows,
+        "_", tile_cols, "_", m, "_", k, addend_array_ ? "_with_addend" : "");
+
+    KernelSupportLibrary::EmitAndCallOutlinedKernel(
+        /*enable_fast_math=*/enable_fast_math,
+        /*optimize_for_size=*/optimize_for_size, ir_builder_, kernel_name,
+        lhs_op, rhs_op,
+        addend_array_ ? addend_array_->GetBasePointer() : nullptr, result_op,
+        [this, tile_rows, tile_cols, m, k, primitive_type](
+            llvm::Value* lhs_op, llvm::Value* rhs_op, llvm::Value* addend_op,
+            llvm::Value* result_op) {
+          RowMajorMatrixVectorProductEmitter emitter(
+              primitive_type, tile_rows, tile_cols, m, k, lhs_op, rhs_op,
+              addend_op, result_op, ir_builder_);
+          emitter.Emit();
+        });
   }
 
   return true;
@@ -641,6 +712,8 @@ tensorflow::Status DotOpEmitter::Emit() {
     return Status::OK();
   }
 
+  CHECK_EQ(addend_array_, nullptr);
+
   if (PotentiallyImplementedAsEigenDot(dot_)) {
     return EmitCallToRuntime();
   }
@@ -915,8 +988,8 @@ DotOpEmitter::MatMultDims DotOpEmitter::GetMatMultDims() const {
   return {lhs_shape.dimensions(transpose_lhs_ ? 1 : 0),
           lhs_shape.dimensions(transpose_lhs_ ? 0 : 1),
           rhs_shape.dimensions(transpose_rhs_ ? 0 : 1),
-          lhs_shape.layout().minor_to_major(0) == 0,
-          rhs_shape.layout().minor_to_major(0) == 0};
+          LayoutUtil::Minor(lhs_shape.layout(), 0) == 0,
+          LayoutUtil::Minor(rhs_shape.layout(), 0) == 0};
 }
 
 llvm_ir::IrArray::Index DotOpEmitter::EmitOperandArrayLoopNest(
@@ -927,8 +1000,8 @@ llvm_ir::IrArray::Index DotOpEmitter::EmitOperandArrayLoopNest(
   // reduction dimension.
   std::vector<int64> dimensions;
   const Shape& shape = operand_array.GetShape();
-  for (int i = shape.layout().minor_to_major_size() - 1; i >= 0; --i) {
-    int64 dimension = shape.layout().minor_to_major(i);
+  for (int i = LayoutUtil::MinorToMajor(shape).size() - 1; i >= 0; --i) {
+    int64 dimension = LayoutUtil::Minor(shape.layout(), i);
     if (dimension != reduction_dimension) {
       dimensions.push_back(dimension);
     }
@@ -977,9 +1050,7 @@ bool PotentiallyImplementedAsEigenDot(const HloInstruction& hlo) {
       return false;
     }
 
-    if (ProfitableToImplementDotInUntiledLlvmIr(hlo) ==
-            DotInLlvmIrProfitable::kYes ||
-        ProfitableToImplementDotInTiledLlvmIr(hlo)) {
+    if (ProfitableToImplementDotInTiledLlvmIr(hlo)) {
       return false;
     }
 
@@ -1010,46 +1081,42 @@ bool PotentiallyImplementedAsEigenDot(const HloInstruction& hlo) {
   return false;
 }
 
-DotInLlvmIrProfitable ProfitableToImplementDotInUntiledLlvmIr(
-    const HloInstruction& dot) {
-  if (dot.opcode() == HloOpcode::kDot && dot.shape().dimensions_size() == 2) {
-    const Shape& result_shape = dot.shape();
-    // kReductionDimensionThresholdBytes was chosen to be 1/4 of a typical L1
-    // cache line size, so that we can have the reduction dimension of both the
-    // LHS and RHS matrices and still have some space "left over".  This needs
-    // to be tuned further.
-    const int64 kReductionDimensionThresholdBytes = 8 * 1024;
-    const bool single_threaded_eigen =
-        !dot.GetModule()->config().debug_options().xla_cpu_multi_thread_eigen();
-
-    // This is the point at which it is better to call into Eigen and shard the
-    // dot across multiple worker threads.  This is a rough estimate by running
-    // a matmult benchmark on my local machine, and it can be tuned further.
-    const int64 kMaxSingleThreadedFlops = 16 * 1024;
-
-    const int64 M = result_shape.dimensions(0);
-    const int64 N = result_shape.dimensions(1);
-    const int64 K = dot.operand(1)->shape().dimensions(0);
-    const int64 primitive_type_size =
-        ShapeUtil::ByteSizeOfPrimitiveType(result_shape.element_type());
-    if (M == 1 &&
-        K * primitive_type_size <= kReductionDimensionThresholdBytes &&
-        (single_threaded_eigen || M * K * N <= kMaxSingleThreadedFlops)) {
-      // Heuristics:
-      //
-      //  - Look for a configuration where we will likely be able to keep LHS in
-      //    L1 and do a cache-optimal traversal of RHS.
-      //
-      //  - Bail out on matrices that are large enough that Eigen can profitably
-      //    shard the computation across multiple cores.  This only applies when
-      //    multi-threading is enabled.
-      return LayoutUtil::IsMonotonicWithDim0Major(
-                 dot.operand(1)->shape().layout())
-                 ? DotInLlvmIrProfitable::kWithColumnMajorRhs
-                 : DotInLlvmIrProfitable::kYes;
+// For vector-matrix dot products, it is always profitable to make the Rhs
+// column major.
+tensorflow::gtl::optional<int64> ProfitableToMakeDotOperandColumnMajor(
+    const HloInstruction& hlo) {
+  if (hlo.opcode() == HloOpcode::kDot && hlo.shape().dimensions_size() == 2 &&
+      hlo.shape().dimensions(0) == 1) {
+    if (hlo.dot_dimension_numbers().rhs_contracting_dimensions(0) == 0) {
+      return 1;
+    }
+    return {};
+  }
+
+  if (hlo.opcode() == HloOpcode::kFusion &&
+      hlo.fusion_kind() == HloInstruction::FusionKind::kOutput) {
+    auto* fusion_root =
+        hlo.fused_instructions_computation()->root_instruction();
+    if (fusion_root->opcode() != HloOpcode::kAdd) {
+      return {};
+    }
+
+    for (auto* fusion_root_op : fusion_root->operands()) {
+      if (fusion_root_op->opcode() != HloOpcode::kDot) {
+        continue;
+      }
+      if (auto operand_num =
+              ProfitableToMakeDotOperandColumnMajor(*fusion_root_op)) {
+        auto* operand = fusion_root_op->operand(*operand_num);
+        if (operand->opcode() == HloOpcode::kParameter &&
+            operand->user_count() == 1) {
+          return operand->parameter_number();
+        }
+      }
     }
   }
-  return DotInLlvmIrProfitable::kNo;
+
+  return {};
 }
 
 bool ProfitableToImplementDotInTiledLlvmIr(const HloInstruction& dot) {
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
index c9168ccc0f6629c2a2bfbc7d4dc9c7ebab0a5708..2118965a70872846204974e25555340baca718cf 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
@@ -32,19 +32,11 @@ namespace cpu {
 
 bool PotentiallyImplementedAsEigenDot(const HloInstruction& hlo);
 
-enum class DotInLlvmIrProfitable { kYes, kNo, kWithColumnMajorRhs };
-
-// Returns a value to indicate if (and under what conditions) will lowering
-// |dot| as a untiled LLVM IR dot operation be profitable over calling into
-// Eigen or emitting a tiled LLVM IR implementation.  Possible return values
-// are:
-//
-//  * DotInLlvmIrProfitable::kYes - always profitable.
-//  * DotInLlvmIrProfitable::kNo - never profitable.
-//  * DotInLlvmIrProfitable::kWithColumnMajorRhs - only if we can manage to make
-//    the Rhs layout column major.
-DotInLlvmIrProfitable ProfitableToImplementDotInUntiledLlvmIr(
-    const HloInstruction& dot);
+// Returns the index for an operand to `hlo` that should ideally be column
+// major.  Returns nullopt if there is no such operand or if `hlo` is not a dot
+// or a fusion containing a dot.
+tensorflow::gtl::optional<int64> ProfitableToMakeDotOperandColumnMajor(
+    const HloInstruction& hlo);
 
 // Returns true to indicate that we can generate a tiled LLVM IR implementation
 // for |dot|.
@@ -57,10 +49,15 @@ class DotOpEmitter {
   // place the result in target_array. IR is emitted at current insert point of
   // the builder. Upon completion of the method, the insert point is set to the
   // end of all instructions emitted for this operation.
+  //
+  // If `addend_array` is not nullptr then it must be an array of the same
+  // dimensions as the result, and the result is computed as `addend_array` +
+  // dot(`lhs_array`, `rhs_array`).  A non-null `addend_array` is only supported
+  // for Matrix-vector products.
   static tensorflow::Status EmitDotOperation(
       const HloInstruction& dot, bool transpose_lhs, bool transpose_rhs,
       const llvm_ir::IrArray& target_array, const llvm_ir::IrArray& lhs_array,
-      const llvm_ir::IrArray& rhs_array,
+      const llvm_ir::IrArray& rhs_array, const llvm_ir::IrArray* addend_array,
       llvm::Value* executable_run_options_value, llvm::IRBuilder<>* ir_builder,
       const HloModuleConfig& hlo_module_config);
 
@@ -69,6 +66,7 @@ class DotOpEmitter {
                bool transpose_rhs, const llvm_ir::IrArray& target_array,
                const llvm_ir::IrArray& lhs_array,
                const llvm_ir::IrArray& rhs_array,
+               const llvm_ir::IrArray* addend_array,
                llvm::Value* executable_run_options_value,
                llvm::IRBuilder<>* ir_builder,
                const HloModuleConfig& hlo_module_config);
@@ -140,6 +138,7 @@ class DotOpEmitter {
   const llvm_ir::IrArray& target_array_;
   const llvm_ir::IrArray& lhs_array_;
   const llvm_ir::IrArray& rhs_array_;
+  const llvm_ir::IrArray* addend_array_;
   llvm::Value* executable_run_options_value_;
   llvm::IRBuilder<>* ir_builder_;
   const HloModuleConfig& hlo_module_config_;
diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
index ba693ec89ab7c4090f8c9d1e4d65f17a80d0ac55..ebd96c4c42759b71b79408c73814605301af03c1 100644
--- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
@@ -44,15 +44,11 @@ StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitFloatUnaryOp(
         default:
           return Unimplemented("tanh");
       }
-      // Create function type for the function.
-      llvm::FunctionType* function_type = llvm::FunctionType::get(
-          llvm_ir::PrimitiveTypeToIrType(element_type, module_),
-          llvm_ir::PrimitiveTypeToIrType(element_type, module_),
-          /*isVarArg=*/false);
       // Create function declaration for 'tanhf'.
       llvm::Function* function =
           llvm::cast<llvm::Function>(module_->getOrInsertFunction(
-              llvm_ir::AsStringRef(function_name), function_type));
+              llvm_ir::AsStringRef(function_name), operand_value->getType(),
+              operand_value->getType()));
       function->setCallingConv(llvm::CallingConv::C);
       function->setDoesNotThrow();
       function->setDoesNotAccessMemory();
@@ -64,6 +60,31 @@ StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitFloatUnaryOp(
   }
 }
 
+StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitAtan2(
+    PrimitiveType prim_type, llvm::Value* lhs, llvm::Value* rhs) const {
+  string function_name;
+  switch (prim_type) {
+    case F32:
+      function_name = "atan2f";
+      break;
+    case F64:
+      function_name = "atan2";
+      break;
+    default:
+      return Unimplemented("atan2");
+  }
+  // Create function declaration for 'atan2'.
+  llvm::Function* function =
+      llvm::cast<llvm::Function>(module_->getOrInsertFunction(
+          llvm_ir::AsStringRef(function_name), lhs->getType(), lhs->getType(),
+          rhs->getType()));
+  function->setCallingConv(llvm::CallingConv::C);
+  function->setDoesNotThrow();
+  function->setDoesNotAccessMemory();
+  // Create instruction to call 'atan2'.
+  return ir_builder_->CreateCall(function, {lhs, rhs});
+}
+
 llvm_ir::ElementGenerator CpuElementalIrEmitter::MakeElementGenerator(
     const HloInstruction* hlo,
     const HloToElementGeneratorMap& operand_to_generator) const {
diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h
index 7e9f27befb456c17581f556868712f92fd8fd083..4446dfd2821fb4b6e75f33694367392ecbcdd8bf 100644
--- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h
@@ -41,6 +41,8 @@ class CpuElementalIrEmitter : public ElementalIrEmitter {
  protected:
   StatusOr<llvm::Value*> EmitFloatUnaryOp(
       const HloInstruction* op, llvm::Value* operand_value) const override;
+  StatusOr<llvm::Value*> EmitAtan2(PrimitiveType prim_type, llvm::Value* lhs,
+                                   llvm::Value* rhs) const override;
 
   IrEmitter* ir_emitter_;
 };
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
index cb5cb8a6dd6d01febde46ac7dc0950f947fd3265..788217aab6172b4e548452b3f6ffd4197c163ce4 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
@@ -29,10 +29,8 @@ bool PotentiallyImplementedAsEigenConvolution(
   // The following conditions are necessary (but not sufficient) for
   // implementing `convolution` with Eigen convolution:
   // - the input and kernel have a non-zero number of elements.
-  // - the input is in NHWC or NWHC order.
-  // - the kernel is in HWIO or WHIO order.
-  // - the spatial dimensions are in the same relative order in the input,
-  //   kernel and output.
+  // - the input is in NHWC order.
+  // - the kernel is in HWIO order.
   //
   // To be sufficient, certain layout constraints need to be satisfied as well.
   const Shape& input_shape = convolution.operand(0)->shape();
@@ -46,20 +44,30 @@ bool PotentiallyImplementedAsEigenConvolution(
       ShapeUtil::ElementIsComplex(kernel_shape)) {
     return false;
   }
+  if (window_util::HasWindowReversal(convolution.window())) {
+    return false;
+  }
 
   const ConvolutionDimensionNumbers& dnums =
       convolution.convolution_dimension_numbers();
   // Only 1D and 2D convolutions are supported at the moment.
   // TODO(b/32897908): add an optimized implementation for 3D convolution.
-  if (dnums.spatial_dimensions_size() > 2) {
+  const int64 num_spatial_dims = dnums.output_spatial_dimensions_size();
+  if (num_spatial_dims > 2) {
     return false;
   }
 
-  bool input_spatial_dims_ascending = std::is_sorted(
-      dnums.spatial_dimensions().begin(), dnums.spatial_dimensions().end());
-  bool kernel_spatial_dims_ascending =
-      std::is_sorted(dnums.kernel_spatial_dimensions().begin(),
-                     dnums.kernel_spatial_dimensions().end());
+  for (int64 i = 0; i < num_spatial_dims; ++i) {
+    if (dnums.input_spatial_dimensions(i) != i + 1) {
+      return false;
+    }
+    if (dnums.kernel_spatial_dimensions(i) != i) {
+      return false;
+    }
+    if (dnums.output_spatial_dimensions(i) != i + 1) {
+      return false;
+    }
+  }
 
   const Shape& output_shape = convolution.shape();
   return dnums.input_batch_dimension() == 0 &&
@@ -67,7 +75,6 @@ bool PotentiallyImplementedAsEigenConvolution(
          dnums.output_batch_dimension() == 0 &&
          dnums.output_feature_dimension() ==
              output_shape.dimensions_size() - 1 &&
-         input_spatial_dims_ascending == kernel_spatial_dims_ascending &&
          dnums.kernel_input_feature_dimension() ==
              kernel_shape.dimensions_size() - 2 &&
          dnums.kernel_output_feature_dimension() ==
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.h b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.h
index ac361ddfb4c8d253ffb1c99200939f6324cad2bb..34b2003916933f5ec0a15d9e219063c0a912fa40 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_EMISSION_UTILS_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_EMISSION_UTILS_H_
 
+#include "llvm/IR/Value.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 
 namespace xla {
@@ -23,6 +24,19 @@ namespace cpu {
 
 bool PotentiallyImplementedAsEigenConvolution(
     const HloInstruction& convolution);
+
+// Dynamic loop bounds are specified as an array of dimension index
+// [start, limit) pairs of ir values (one for each partitioned outer dimension).
+//
+// EX: Let 'shape' = [8, 16, 32], with the loop bounds of the two-most major
+//     dimensions dynamic. Then 'dynamic_loop_bounds' will contain the
+//     following ir values for the two most-major dimensions:
+//       [dim0_index_start_ir_value, dim0_index_limit_ir_value]
+//       [dim1_index_start_ir_value, dim1_index_limit_ir_value]
+//
+// See IrFunction and ParallelLoopEmitter for details.
+using DynamicLoopBounds = std::vector<std::pair<llvm::Value*, llvm::Value*>>;
+
 }  // namespace cpu
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index c00f1d5c1dbe8a7dcb92e98df6604081d5e496ae..ef33260c17168b1516264a2f69cb80afb04ddeef 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -24,16 +24,17 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/platform/logging.h"
 // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/LLVMContext.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
@@ -42,6 +43,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/dot_op_emitter.h"
 #include "tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/cpu/ir_function.h"
+#include "tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h"
 #include "tensorflow/compiler/xla/service/cpu/shape_partition.h"
 #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h"
 #include "tensorflow/compiler/xla/service/elemental_ir_emitter.h"
@@ -124,131 +127,27 @@ StatusOr<llvm::Function*> IrEmitter::EmitComputation(
   } else {
     TF_RETURN_IF_ERROR(computation->AcceptOrdered(this, *instruction_order));
   }
-  InsertOrDie(&emitted_functions_, computation, compute_function_);
-
-  return compute_function_;
-}
-
-static llvm::Argument* GetArg(llvm::Function* f, int idx) {
-  llvm::Function::arg_iterator arg_iter = f->arg_begin();
-  std::advance(arg_iter, idx);
-  return &*arg_iter;
+  llvm::Function* ir_function = compute_function_->function();
+  InsertOrDie(&emitted_functions_, computation, ir_function);
+  // Delete 'compute_function', finalizing 'ir_function' and restoring caller
+  // IR insert point.
+  compute_function_.reset();
+  return ir_function;
 }
 
 void IrEmitter::InitializeIrFunction(const string& function_name) {
-  // The function signature is:
-  //   void function(i8* retval, i8* run_options, i8** params, i8** temps,
-  //                 i64* dynamic_loop_bounds, i64* prof_counters)
-  //
-  // retval: points to the returned value.
-  // params: address of an array with pointers to parameters.
-  // temps: address of an array with pointers to temporary buffers.
-  //
-  // Therefore, the generated function's signature (FunctionType) is statically
-  // determined - parameter unpacking is done in code generated into the
-  // function, rather than by a prologue dictated by the platform ABI.
-  //
-  //                      /--------------\
-  //   retval ----------> | return value |
-  //                      \--------------/
-  //
-  //                      /-------------------------------\
-  //   run_options -----> | xla::ExecutableRunOptions |
-  //                      \-------------------------------/
-  //
-  //                     /---------------------------------------------\
-  //   params -------->  |  param 0  |  param 1  | ..... |  param N-1  |
-  //                     |   addr    |   addr    |       |   addr      |
-  //                     \---------------------------------------------/
-  //                          |           |                   |
-  //                          |           |                   |
-  //                          V           V                   V
-  //                     /---------\  /---------\         /-----------\
-  //                     | param 0 |  | param 1 |         | param N-1 |
-  //                     \---------/  \---------/         \-----------/
-  //
-  //                     /---------------------------------------------\
-  //   temps --------->  |  temp  0  |  temp  1  | ..... |  temp  N-1  |
-  //                     |   addr    |   addr    |       |   addr      |
-  //                     \---------------------------------------------/
-  //                          |           |                   |
-  //                          |           |                   |
-  //                          V           V                   V
-  //                     /---------\  /---------\         /-----------\
-  //                     | temp  0 |  | temp  1 |         | temp  N-1 |
-  //                     \---------/  \---------/         \-----------/
-  //
-  //                        /--------------------------------------------\
-  // dynamic loop bounds -> | outer_dim0_start | outer_dim0_limit | .....|
-  //  (elided for aot)      \--------------------------------------------/
-  //
-  //                     /---------------------------------------------\
-  //   prof counters ->  | counter 0 | counter 1 | ..... | counter N-1 |
-  //  (elided for aot)   \---------------------------------------------/
-
-  // Even though the type of params and temps is void** in the host's view, in
-  // LLVM IR this is represented by i8*, similarly to void*. It's up to the code
-  // to use GEPs to unravel the indirection layers.
-  llvm::FunctionType* compute_function_type = llvm::FunctionType::get(
-      /*Result=*/llvm::Type::getVoidTy(module_->getContext()),
-      /*Params=*/GetComputeFunctionParams(),
-      /*isVarArg=*/false);
-
   // Functions with local linkage get an inlining bonus.  Because we know
   // a-priori that embedded functions (non-entry functions) will not have its
   // name resolved, give it local linkage.
   llvm::Function::LinkageTypes linkage =
       is_top_level_computation_ ? llvm::GlobalValue::ExternalLinkage
                                 : llvm::GlobalValue::InternalLinkage;
-  compute_function_ =
-      llvm::Function::Create(/*Ty=*/compute_function_type,
-                             /*Linkage=*/linkage,
-                             /*Name=*/AsStringRef(function_name),
-                             /*Module=*/module_);
-  compute_function_->setCallingConv(llvm::CallingConv::C);
-
-  // Set meaningful names for the function's arguments: useful for debugging.
-  llvm::Function::arg_iterator arg_iter = compute_function_->arg_begin();
-  arg_iter->setName("retval");
-  (++arg_iter)->setName("run_options");
-  (++arg_iter)->setName("params");
-  (++arg_iter)->setName("temps");
-  if (num_dynamic_loop_bounds_ > 0) {
-    (++arg_iter)->setName("dynamic_loop_bounds");
-  }
-  (++arg_iter)->setName("prof_counters");
-
-  // We know a-priori that the function arguments are guaranteed to point to
-  // disjoint objects.
-  llvm::Argument* retval = GetResultArgument();
-  for (llvm::Argument& argument : compute_function_->args()) {
-    // However, the return buffer aliases the temporaries and thus cannot be
-    // marked noalias.
-    if (&argument == retval) {
-      continue;
-    }
-    compute_function_->addAttribute(argument.getArgNo() + 1,
-                                    llvm::Attribute::NoAlias);
-  }
-
-  // Add the optize attribute to the function if optimizing for size. This
-  // controls internal behavior of some optimization passes (e.g. loop
-  // unrolling).
-  if (options::OptimizeForSizeRequested(hlo_module_config_)) {
-    compute_function_->addFnAttr(llvm::Attribute::OptimizeForSize);
-  }
-
-  if (hlo_module_config_.debug_options().xla_enable_fast_math()) {
-    compute_function_->addFnAttr("unsafe-fp-math", "true");
-    compute_function_->addFnAttr("no-infs-fp-math", "true");
-    compute_function_->addFnAttr("no-nans-fp-math", "true");
-    compute_function_->addFnAttr("no-signed-zeros-fp-math", "true");
-  }
-
-  ir_builder_.SetInsertPoint(llvm::BasicBlock::Create(
-      /*Context=*/module_->getContext(),
-      /*Name=*/"entry",
-      /*Parent=*/compute_function_));
+  // Create and initialize new IrFunction.
+  compute_function_.reset(
+      new IrFunction(function_name, linkage,
+                     options::OptimizeForSizeRequested(hlo_module_config_),
+                     hlo_module_config_.debug_options().xla_enable_fast_math(),
+                     module_, &ir_builder_, num_dynamic_loop_bounds_));
 }
 
 IrEmitter::~IrEmitter() {}
@@ -344,11 +243,12 @@ int IrEmitter::MinimumAlignmentForBufferSize(int64 buffer_size) {
 
 // Calculate the alignment of a buffer allocated for a given primitive type.
 int IrEmitter::MinimumAlignmentForPrimitiveType(PrimitiveType primitive_type) {
-  int64 buffer_size = ShapeUtil::ByteSizeOfPrimitiveType(primitive_type);
-  DCHECK_GE(buffer_size, 0);
-  DCHECK_LE(buffer_size, SIZE_MAX);
-
-  return MinimumAlignmentForBufferSize(buffer_size);
+  int64 byte_size = ShapeUtil::ByteSizeOfPrimitiveType(primitive_type);
+  DCHECK_GE(byte_size, 0);
+  // Largest scalar is a complex64 so we don't need to worry about the
+  // int64->int truncation here.
+  DCHECK_LE(byte_size, 8);
+  return byte_size;
 }
 
 int64 IrEmitter::ByteSizeOf(const Shape& shape) const {
@@ -357,6 +257,10 @@ int64 IrEmitter::ByteSizeOf(const Shape& shape) const {
 
 // Calculate the alignment of a buffer allocated for a given shape.
 int IrEmitter::MinimumAlignmentForShape(const Shape& shape) {
+  if (ShapeUtil::IsScalar(shape)) {
+    return MinimumAlignmentForPrimitiveType(shape.element_type());
+  }
+
   int64 buffer_size = ByteSizeOf(shape);
   DCHECK_GE(buffer_size, 0);
   DCHECK_LE(buffer_size, SIZE_MAX);
@@ -612,7 +516,7 @@ Status IrEmitter::HandleReduceWindow(HloInstruction* reduce_window) {
   HloComputation* function = reduce_window->to_apply();
   TF_RETURN_IF_ERROR(ElementTypesSameAndSupported(
       /*instruction=*/*reduce_window, /*operands=*/{operand},
-      /*supported_types=*/{F32}));
+      /*supported_types=*/{F32, BF16}));
 
   // TODO(b/31410564): Implement dilation for reduce-window.
   if (window_util::HasDilation(window)) {
@@ -795,7 +699,7 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) {
   // operand index is within the bounds. The unsigned comparison includes
   // checking whether the operand index >= 0.
   llvm_ir::IrArray::Index operand_index(source_index.size());
-  llvm::Value* in_bounds_condition = ir_builder_.getInt1(true);
+  llvm::Value* in_bounds_condition = ir_builder_.getTrue();
   for (int64 i = 0; i < rank; ++i) {
     llvm::Value* strided_index = ir_builder_.CreateNSWMul(
         source_index[i], ir_builder_.getInt64(window.dimensions(i).stride()));
@@ -822,14 +726,16 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) {
   // If the initialized_flag is false, initialize the selected value and index
   // with the currently visiting operand.
   SetToFirstInsertPoint(if_initialized.false_block, &ir_builder_);
-  const auto save_operand_index = [&](
-      const llvm_ir::IrArray::Index& operand_index) {
-    for (int64 i = 0; i < rank; ++i) {
-      llvm::Value* selected_index_address_slot = ir_builder_.CreateInBoundsGEP(
-          selected_index_address, {ir_builder_.getInt32(i)});
-      ir_builder_.CreateStore(operand_index[i], selected_index_address_slot);
-    }
-  };
+  const auto save_operand_index =
+      [&](const llvm_ir::IrArray::Index& operand_index) {
+        for (int64 i = 0; i < rank; ++i) {
+          llvm::Value* selected_index_address_slot =
+              ir_builder_.CreateInBoundsGEP(selected_index_address,
+                                            {ir_builder_.getInt32(i)});
+          ir_builder_.CreateStore(operand_index[i],
+                                  selected_index_address_slot);
+        }
+      };
   llvm_ir::IrArray operand_array(GetIrArrayFor(operand));
   llvm::Value* operand_data =
       operand_array.EmitReadArrayElement(operand_index, &ir_builder_);
@@ -896,6 +802,24 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
   TF_RETURN_IF_ERROR(ElementTypesSameAndSupported(
       /*instruction=*/*dot, /*operands=*/{lhs, rhs},
       /*supported_types=*/{F32, F64, C64}));
+  const DotDimensionNumbers& dnums = dot->dot_dimension_numbers();
+  if (dnums.lhs_batch_dimensions_size() > 0 ||
+      dnums.rhs_batch_dimensions_size() > 0) {
+    return Unimplemented("Dot with batch dimensions not implemented.");
+  }
+
+  if (dnums.lhs_contracting_dimensions_size() != 1) {
+    // This is disallowed by ShapeInference today.
+    return Unimplemented(
+        "Dot with multiple contracting dimensions not implemented.");
+  }
+
+  if (dnums.lhs_contracting_dimensions(0) !=
+          std::min(lhs->shape().dimensions_size() - 1, 1) ||
+      dnums.rhs_contracting_dimensions(0) != 0) {
+    return Unimplemented(
+        "Dot with non-standard contracting dimensions not implemented.");
+  }
 
   llvm_ir::IrArray lhs_array(GetIrArrayFor(lhs));
   llvm_ir::IrArray rhs_array(GetIrArrayFor(rhs));
@@ -914,8 +838,8 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
   // Dot operation is complicated so we delegate to a helper class.
   return DotOpEmitter::EmitDotOperation(
       *dot, /*transpose_lhs=*/false, /*transpose_rhs=*/false, target_array,
-      lhs_array, rhs_array, GetExecutableRunOptionsArgument(), &ir_builder_,
-      hlo_module_config_);
+      lhs_array, rhs_array, /*addend_array=*/nullptr,
+      GetExecutableRunOptionsArgument(), &ir_builder_, hlo_module_config_);
 }
 
 Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
@@ -952,11 +876,12 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
       // Input tensor.
       const Shape& input_shape = convolution->operand(0)->shape();
       int64 input_batch = input_shape.dimensions(dnums.input_batch_dimension());
-      int64 input_rows = input_shape.dimensions(dnums.spatial_dimensions(0));
+      int64 input_rows =
+          input_shape.dimensions(dnums.input_spatial_dimensions(0));
       int64 input_cols =
           one_dim_convolution
               ? 1
-              : input_shape.dimensions(dnums.spatial_dimensions(1));
+              : input_shape.dimensions(dnums.input_spatial_dimensions(1));
       int64 input_channels =
           input_shape.dimensions(dnums.input_feature_dimension());
 
@@ -976,11 +901,11 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
       // Output tensor.
       const Shape& convolution_shape = convolution->shape();
       int64 output_rows =
-          convolution_shape.dimensions(dnums.spatial_dimensions(0));
-      int64 output_cols =
-          one_dim_convolution
-              ? 1
-              : convolution_shape.dimensions(dnums.spatial_dimensions(1));
+          convolution_shape.dimensions(dnums.output_spatial_dimensions(0));
+      int64 output_cols = one_dim_convolution
+                              ? 1
+                              : convolution_shape.dimensions(
+                                    dnums.output_spatial_dimensions(1));
 
       // Extract the window stride for the convolution.
       const Window& window = convolution->window();
@@ -1068,10 +993,10 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
   return EmitTargetElementLoop(
       convolution, [this, convolution, lhs, rhs, window,
                     dnums](const llvm_ir::IrArray::Index& index) {
-        int num_spatial_dims = dnums.spatial_dimensions_size();
+        int num_spatial_dims = dnums.output_spatial_dimensions_size();
         std::vector<llvm::Value*> output_spatial(num_spatial_dims);
         for (int i = 0; i < num_spatial_dims; ++i) {
-          output_spatial[i] = index[dnums.spatial_dimensions(i)];
+          output_spatial[i] = index[dnums.output_spatial_dimensions(i)];
         }
         llvm::Value* output_feature = index[dnums.output_feature_dimension()];
         llvm::Value* batch = index[dnums.output_batch_dimension()];
@@ -1091,8 +1016,9 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
         for (int i = 0; i < num_spatial_dims; ++i) {
           kernel_spatial[i] =
               loops
-                  .AddLoop(0, rhs->shape().dimensions(
-                                  dnums.kernel_spatial_dimensions(i)),
+                  .AddLoop(0,
+                           rhs->shape().dimensions(
+                               dnums.kernel_spatial_dimensions(i)),
                            tensorflow::strings::StrCat("k", i))
                   ->GetIndVarValue();
         }
@@ -1108,17 +1034,18 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
         // Calculate the spatial index in the input array, taking striding,
         // dilation and padding into account. An index in the padding will be
         // out of the bounds of the array.
-        const auto calculate_input_index = [this](
-            llvm::Value* output_index, llvm::Value* kernel_index,
-            const WindowDimension& window_dim) {
-          llvm::Value* strided_index = ir_builder_.CreateNSWMul(
-              output_index, ir_builder_.getInt64(window_dim.stride()));
-          llvm::Value* dilated_kernel_index = ir_builder_.CreateNSWMul(
-              kernel_index, ir_builder_.getInt64(window_dim.window_dilation()));
-          return ir_builder_.CreateNSWSub(
-              ir_builder_.CreateNSWAdd(strided_index, dilated_kernel_index),
-              ir_builder_.getInt64(window_dim.padding_low()));
-        };
+        const auto calculate_input_index =
+            [this](llvm::Value* output_index, llvm::Value* kernel_index,
+                   const WindowDimension& window_dim) {
+              llvm::Value* strided_index = ir_builder_.CreateNSWMul(
+                  output_index, ir_builder_.getInt64(window_dim.stride()));
+              llvm::Value* dilated_kernel_index = ir_builder_.CreateNSWMul(
+                  kernel_index,
+                  ir_builder_.getInt64(window_dim.window_dilation()));
+              return ir_builder_.CreateNSWSub(
+                  ir_builder_.CreateNSWAdd(strided_index, dilated_kernel_index),
+                  ir_builder_.getInt64(window_dim.padding_low()));
+            };
         std::vector<llvm::Value*> input_spatial(num_spatial_dims);
         for (int i = 0; i < num_spatial_dims; ++i) {
           input_spatial[i] = calculate_input_index(
@@ -1140,11 +1067,11 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
           return ir_builder_.CreateICmpEQ(remainder, ir_builder_.getInt64(0));
         };
 
-        llvm::Value* in_bounds_condition = nullptr;
+        llvm::Value* in_bounds_condition = ir_builder_.getInt1(true);
         for (int i = 0; i < num_spatial_dims; ++i) {
           llvm::ConstantInt* input_bound =
               ir_builder_.getInt64(window_util::DilatedBound(
-                  lhs->shape().dimensions(dnums.spatial_dimensions(i)),
+                  lhs->shape().dimensions(dnums.input_spatial_dimensions(i)),
                   window.dimensions(i).base_dilation()));
           llvm::Value* dim_in_bound =
               ir_builder_.CreateICmpULT(input_spatial[i], input_bound);
@@ -1153,9 +1080,7 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
           llvm::Value* dim_ok =
               ir_builder_.CreateAnd(dim_in_bound, dim_not_in_hole);
           in_bounds_condition =
-              in_bounds_condition
-                  ? ir_builder_.CreateAnd(in_bounds_condition, dim_ok)
-                  : dim_ok;
+              ir_builder_.CreateAnd(in_bounds_condition, dim_ok);
         }
 
         // Now we need to map the dilated base coordinates back to the actual
@@ -1178,7 +1103,7 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
         int num_dims = num_spatial_dims + 2;
         llvm_ir::IrArray::Index input_index(num_dims);
         for (int i = 0; i < num_spatial_dims; ++i) {
-          input_index[dnums.spatial_dimensions(i)] = input_spatial[i];
+          input_index[dnums.input_spatial_dimensions(i)] = input_spatial[i];
         }
         input_index[dnums.input_feature_dimension()] = input_feature;
         input_index[dnums.input_batch_dimension()] = batch;
@@ -1186,8 +1111,14 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
         llvm_ir::IrArray kernel_array(GetIrArrayFor(rhs));
         llvm_ir::IrArray::Index kernel_index(num_dims);
         for (int i = 0; i < num_spatial_dims; ++i) {
-          kernel_index[dnums.kernel_spatial_dimensions(i)] = kernel_spatial[i];
+          kernel_index[dnums.kernel_spatial_dimensions(i)] =
+              window.dimensions(i).window_reversal()
+                  ? ir_builder_.CreateNSWSub(
+                        ir_builder_.getInt64(window.dimensions(i).size() - 1),
+                        kernel_spatial[i])
+                  : kernel_spatial[i];
         }
+
         kernel_index[dnums.kernel_input_feature_dimension()] = input_feature;
         kernel_index[dnums.kernel_output_feature_dimension()] = output_feature;
 
@@ -1449,15 +1380,20 @@ Status IrEmitter::HandleParameter(HloInstruction* parameter) {
   //
   // Where Param is the actual element type of the underlying buffer (for
   // example, float for an XLA F32 element type).
-  llvm::Argument* params = GetArg(compute_function_, 2);
+  llvm::Value* params = compute_function_->parameters_arg();
   llvm::Value* param_address_offset =
       llvm_ir::EmitBufferIndexingGEP(params, param_number, &ir_builder_);
   llvm::LoadInst* param_address_untyped =
       ir_builder_.CreateLoad(param_address_offset);
   param_address_untyped->setName(AsStringRef(IrName(parameter, "untyped")));
-  if (hlo_module_config_.debug_options()
+  if (is_top_level_computation_ &&
+      hlo_module_config_.debug_options()
           .xla_llvm_enable_invariant_load_metadata()) {
-    // We never reassign parameters, so this load is invariant.
+    // In the entry computation the parameter slots in the %params argument are
+    // invariant through program execution.  In computations that are called
+    // from the entry computation (via kWhile, kCall and kConditional) the
+    // parameter slots are *not* invariant since they're written to by their
+    // callers.
     param_address_untyped->setMetadata(
         llvm::LLVMContext::MD_invariant_load,
         llvm::MDNode::get(param_address_untyped->getContext(), /*MDs=*/{}));
@@ -1584,13 +1520,9 @@ IrEmitter::ReductionGenerator IrEmitter::MatchReductionGenerator(
 
 IrEmitter::ShardedVectorType IrEmitter::CreateShardedVectorType(
     PrimitiveType element_type, unsigned element_count) {
-  // Here we assume that the largest register is a vector register.
-  int max_vector_register_size_in_bytes =
-      target_machine_features_.largest_register_size_in_bytes(
-          compute_function_);
-
   int vector_register_size_in_elements =
-      max_vector_register_size_in_bytes /
+      target_machine_features_.vector_register_byte_size(
+          *compute_function_->function()) /
       ShapeUtil::ByteSizeOfPrimitiveType(element_type);
 
   ShardedVectorType sharded_vector_type;
@@ -1745,19 +1677,6 @@ void IrEmitter::EmitShardedVectorStore(
   }
 }
 
-namespace {
-// TODO(sanjoy): This is duplicated in tensorflow/core/lib/core/arena.cc.
-// Extract out a common implementation to tensorflow/core/lib/math/math_util.h
-uint32 GCD(uint32 x, uint32 y) {
-  while (y != 0) {
-    uint32 r = x % y;
-    x = y;
-    y = r;
-  }
-  return x;
-}
-}  // namespace
-
 StatusOr<bool> IrEmitter::EmitVectorizedReduce(
     HloInstruction* reduce, HloInstruction* arg, HloInstruction* init_value,
     tensorflow::gtl::ArraySlice<int64> dimensions, HloComputation* function,
@@ -1778,11 +1697,12 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
 
   bool is_reduction_over_minor_dimension =
       std::find(dimensions.begin(), dimensions.end(),
-                arg->shape().layout().minor_to_major(0)) != dimensions.end();
+                LayoutUtil::Minor(arg->shape().layout(), 0)) !=
+      dimensions.end();
 
-  unsigned element_alignment =
-      GCD(ShapeUtil::ByteSizeOfPrimitiveType(reduce->shape().element_type()),
-          MinimumAlignmentForPrimitiveType(reduce->shape().element_type()));
+  unsigned element_alignment = tensorflow::MathUtil::GCD<unsigned>(
+      ShapeUtil::ByteSizeOfPrimitiveType(reduce->shape().element_type()),
+      MinimumAlignmentForPrimitiveType(reduce->shape().element_type()));
 
   if (is_reduction_over_minor_dimension) {
     // TODO(sanjoy): Implement vectorized reduction over the minor dimension.
@@ -1815,8 +1735,9 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
 
   llvm_ir::ForLoopNest loop_nest(IrName(reduce), &ir_builder_);
   llvm_ir::IrArray::Index array_index(reduce->shape().dimensions_size());
-  for (int i = reduce->shape().layout().minor_to_major_size() - 1; i > 0; --i) {
-    int64 dimension = reduce->shape().layout().minor_to_major(i);
+  for (int i = LayoutUtil::MinorToMajor(reduce->shape()).size() - 1; i > 0;
+       --i) {
+    int64 dimension = LayoutUtil::Minor(reduce->shape().layout(), i);
     int64 start_index = 0;
     int64 end_index = reduce->shape().dimensions(dimension);
     std::unique_ptr<llvm_ir::ForLoop> loop =
@@ -1825,7 +1746,7 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
     array_index[dimension] = loop->GetIndVarValue();
   }
 
-  int64 innermost_dimension = reduce->shape().layout().minor_to_major(0);
+  int64 innermost_dimension = LayoutUtil::Minor(reduce->shape().layout(), 0);
   int64 innermost_dimension_size =
       reduce->shape().dimensions(innermost_dimension);
 
@@ -1861,10 +1782,10 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
                            target_array);
 
     if (auto exit_terminator = loop->GetExitBasicBlock()->getTerminator()) {
-      CHECK_GT(reduce->shape().layout().minor_to_major_size(), 1);
+      CHECK_GT(LayoutUtil::MinorToMajor(reduce->shape()).size(), 1);
       ir_builder_.SetInsertPoint(exit_terminator);
     } else {
-      CHECK_EQ(reduce->shape().layout().minor_to_major_size(), 1);
+      CHECK_EQ(LayoutUtil::MinorToMajor(reduce->shape()).size(), 1);
       ir_builder_.SetInsertPoint(loop->GetExitBasicBlock());
     }
   }
@@ -1992,7 +1913,7 @@ Status IrEmitter::HandleSlice(HloInstruction* slice) {
   VLOG(2) << "HandleSlice: " << slice->ToString();
   auto operand = slice->operand(0);
   // The code below emits a sequential loop nest. For the parallel backend, use
-  // EmitParallelTargetElementLoop() which respects dynamic loop bounds.
+  // ParallelLoopEmitter which respects dynamic loop bounds.
   if (ShouldEmitParallelLoopFor(*slice)) {
     return DefaultAction(slice);
   }
@@ -2024,7 +1945,7 @@ Status IrEmitter::HandleSlice(HloInstruction* slice) {
   // * Implement the memcpy within the innermost loop.
 
   tensorflow::gtl::FlatSet<int64> inner_dims;
-  for (int64 dim : layout.minor_to_major()) {
+  for (int64 dim : LayoutUtil::MinorToMajor(layout)) {
     if (operand->shape().dimensions(dim) != slice->shape().dimensions(dim)) {
       break;
     }
@@ -2051,7 +1972,7 @@ Status IrEmitter::HandleSlice(HloInstruction* slice) {
 
   // memcpy_dim is the innermost (in terms of layout) dimension for which the
   // slice does *not* just copy all the elements along the dimension.
-  const int64 memcpy_dim = layout.minor_to_major(inner_dims.size());
+  const int64 memcpy_dim = LayoutUtil::Minor(layout, inner_dims.size());
 
   const bool memcpy_is_contiguous = slice->slice_strides(memcpy_dim) == 1;
   // The number of logical elements that can be copied in a single call
@@ -2260,8 +2181,8 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) {
     TF_RETURN_IF_ERROR(DotOpEmitter::EmitDotOperation(
         *root, root->operand(0)->IsRank2Transpose(),
         root->operand(1)->IsRank2Transpose(), target_array, lhs_array,
-        rhs_array, GetExecutableRunOptionsArgument(), &ir_builder_,
-        hlo_module_config_));
+        rhs_array, /*addend_array=*/nullptr, GetExecutableRunOptionsArgument(),
+        &ir_builder_, hlo_module_config_));
     return Status::OK();
   } else if (llvm_ir::CanEmitFusedDynamicUpdateSliceInPlace(fusion,
                                                             assignment_)) {
@@ -2282,6 +2203,35 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) {
     TF_RETURN_IF_ERROR(fusion->fused_expression_root()->Accept(&fused_emitter));
 
     return EmitTargetElementLoop(fusion, fused_emitter.GetRootGenerator());
+  } else if (fusion->fusion_kind() == HloInstruction::FusionKind::kOutput) {
+    VLOG(3) << "HandleFusion kOutput";
+    int64 dot_op_index = root->operand(0)->opcode() == HloOpcode::kDot ? 0 : 1;
+    const HloInstruction* dot = root->operand(dot_op_index);
+    CHECK_EQ(dot->opcode(), HloOpcode::kDot)
+        << dot->ToString() << "  "
+        << fusion->fused_instructions_computation()->ToString();
+
+    int64 dot_lhs_param_number = dot->operand(0)->parameter_number();
+    int64 dot_rhs_param_number = dot->operand(1)->parameter_number();
+    int64 addend_param_number =
+        root->operand(1 - dot_op_index)->parameter_number();
+
+    Shape target_shape = fusion->shape();
+    TF_RETURN_IF_ERROR(EmitTargetAddressForOp(fusion));
+    llvm_ir::IrArray target_array = GetIrArrayFor(fusion);
+
+    llvm_ir::IrArray lhs_array(
+        GetIrArrayFor(fusion->operand(dot_lhs_param_number)));
+    llvm_ir::IrArray rhs_array(
+        GetIrArrayFor(fusion->operand(dot_rhs_param_number)));
+    llvm_ir::IrArray addend_array(
+        GetIrArrayFor(fusion->operand(addend_param_number)));
+
+    TF_RETURN_IF_ERROR(DotOpEmitter::EmitDotOperation(
+        *dot, /*transpose_lhs=*/false, /*transpose_rhs=*/false, target_array,
+        lhs_array, rhs_array, &addend_array, GetExecutableRunOptionsArgument(),
+        &ir_builder_, hlo_module_config_));
+    return Status::OK();
   } else {
     return Unimplemented("Fusion kind not implemented on CPU");
   }
@@ -2302,9 +2252,17 @@ Status IrEmitter::HandleCall(HloInstruction* call) {
       !parallel_cpu_backend_) {
     // ParallelTaskAssignment assigned partitions, emit call to
     // ParallelForkJoin.
-    TF_RETURN_IF_ERROR(EmitParallelForkJoin(parameter_addresses,
-                                            emitted_value_[call], computation,
-                                            call_ir_function));
+    std::vector<llvm::Value*> call_args = GetArrayFunctionCallArguments(
+        parameter_addresses, &ir_builder_, computation->name(),
+        /*return_value_buffer=*/emitted_value_[call],
+        /*exec_run_options_arg=*/GetExecutableRunOptionsArgument(),
+        /*temp_buffers_arg=*/GetTempBuffersArgument(),
+        /*profile_counters_arg=*/GetProfileCountersArgument());
+
+    HloInstruction* root = computation->root_instruction();
+    TF_RETURN_IF_ERROR(EmitCallToParallelForkJoin(
+        call_args, root->shape(), root->outer_dimension_partitions(),
+        &ir_builder_, call_ir_function, computation->name()));
   } else {
     EmitArrayFunctionCallInto(call_ir_function, parameter_addresses,
                               emitted_value_[call], computation->name());
@@ -2407,7 +2365,7 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) {
   // Terminates the current block with a branch to a while header.
   llvm::BasicBlock* header_bb = llvm::BasicBlock::Create(
       module_->getContext(), AsStringRef(IrName(xla_while, "header")),
-      compute_function_);
+      compute_function_->function());
   ir_builder_.CreateBr(header_bb);
   ir_builder_.SetInsertPoint(header_bb);
 
@@ -2424,7 +2382,7 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) {
   // Branches to the body or to the while exit depending on the condition.
   llvm::BasicBlock* body_bb = llvm::BasicBlock::Create(
       module_->getContext(), AsStringRef(IrName(xla_while, "body")),
-      compute_function_);
+      compute_function_->function());
   llvm::BasicBlock* exit_bb = llvm::BasicBlock::Create(
       module_->getContext(), AsStringRef(IrName(xla_while, "exit")));
   ir_builder_.CreateCondBr(while_predicate, body_bb, exit_bb);
@@ -2439,7 +2397,7 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) {
   ir_builder_.CreateBr(header_bb);
 
   // Adds the exit block to the function and sets the insert point there.
-  compute_function_->getBasicBlockList().push_back(exit_bb);
+  compute_function_->function()->getBasicBlockList().push_back(exit_bb);
   ir_builder_.SetInsertPoint(exit_bb);
 
   return Status::OK();
@@ -2475,14 +2433,13 @@ StatusOr<bool> IrEmitter::EmitFastConcatenate(
 
   int64 concat_dim = concatenate->dimensions(0);
   const Layout& output_layout = output_shape.layout();
+  auto output_min2maj = LayoutUtil::MinorToMajor(output_layout);
   auto concat_dim_layout_itr =
-      std::find(output_layout.minor_to_major().begin(),
-                output_layout.minor_to_major().end(), concat_dim);
+      std::find(output_min2maj.begin(), output_min2maj.end(), concat_dim);
 
-  std::vector<int64> inner_dims(output_layout.minor_to_major().begin(),
-                                concat_dim_layout_itr);
+  std::vector<int64> inner_dims(output_min2maj.begin(), concat_dim_layout_itr);
   std::vector<int64> outer_dims(std::next(concat_dim_layout_itr),
-                                output_layout.minor_to_major().end());
+                                output_min2maj.end());
 
   llvm::Type* i8_ptr_type = ir_builder_.getInt8PtrTy();
   llvm::Type* i8_type = ir_builder_.getInt8Ty();
@@ -2557,7 +2514,7 @@ void IrEmitter::EmitTransferElements(llvm::Value* target, llvm::Value* source,
                                      const llvm_ir::IrArray& source_array) {
   unsigned primitive_type_size =
       ShapeUtil::ByteSizeOfPrimitiveType(primitive_type);
-  unsigned element_alignment = GCD(
+  unsigned element_alignment = tensorflow::MathUtil::GCD<unsigned>(
       primitive_type_size, MinimumAlignmentForPrimitiveType(primitive_type));
   llvm::Type* primitive_ptr_type = llvm::PointerType::getUnqual(
       llvm_ir::PrimitiveTypeToIrType(primitive_type, module_));
@@ -2604,6 +2561,65 @@ Status IrEmitter::HandleConcatenate(HloInstruction* concatenate) {
   return DefaultAction(concatenate);
 }
 
+Status IrEmitter::HandleConditional(HloInstruction* conditional) {
+  auto pred = conditional->operand(0);
+  auto true_arg = conditional->operand(1);
+  auto false_arg = conditional->operand(2);
+  TF_RET_CHECK(ShapeUtil::IsScalar(pred->shape()) &&
+               pred->shape().element_type() == PRED)
+      << "Predicate on a Conditional must be bool; got: "
+      << ShapeUtil::HumanString(pred->shape());
+
+  HloComputation* true_computation = conditional->true_computation();
+  HloComputation* false_computation = conditional->false_computation();
+  TF_RET_CHECK(ShapeUtil::Equal(conditional->shape(),
+                                true_computation->root_instruction()->shape()))
+      << "Shape of conditional should be same as the shape of the true "
+      << "computation; got: " << ShapeUtil::HumanString(conditional->shape())
+      << " and "
+      << ShapeUtil::HumanString(true_computation->root_instruction()->shape());
+
+  TF_RET_CHECK(ShapeUtil::Equal(conditional->shape(),
+                                false_computation->root_instruction()->shape()))
+      << "Shape of conditional should be same as the shape of the false "
+      << "computation; got: " << ShapeUtil::HumanString(conditional->shape())
+      << " and "
+      << ShapeUtil::HumanString(false_computation->root_instruction()->shape());
+
+  llvm::Function* true_function =
+      FindOrDie(emitted_functions_, true_computation);
+  llvm::Function* false_function =
+      FindOrDie(emitted_functions_, false_computation);
+
+  TF_RETURN_IF_ERROR(EmitTargetAddressForOp(conditional));
+  llvm::Value* conditional_result = GetEmittedValueFor(conditional);
+
+  // Generating:
+  //   if (pred)
+  //     cond_result = true_computation(true_operand)
+  //   else
+  //     cond_result = false_computation(false_operand)
+  llvm::LoadInst* pred_value = ir_builder_.CreateLoad(
+      GetIrArrayFor(pred).GetBasePointer(), "load_predicate_value");
+  llvm::Value* pred_cond = ir_builder_.CreateICmpNE(
+      pred_value,
+      llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType(PRED, module_), 0),
+      "boolean_predicate");
+  llvm_ir::LlvmIfData if_data =
+      llvm_ir::EmitIfThenElse(pred_cond, "conditional", &ir_builder_);
+
+  SetToFirstInsertPoint(if_data.true_block, &ir_builder_);
+  EmitArrayFunctionCallInto(true_function, {GetEmittedValueFor(true_arg)},
+                            conditional_result, IrName(conditional, "_true"));
+
+  SetToFirstInsertPoint(if_data.false_block, &ir_builder_);
+  EmitArrayFunctionCallInto(false_function, {GetEmittedValueFor(false_arg)},
+                            conditional_result, IrName(conditional, "_false"));
+
+  SetToFirstInsertPoint(if_data.after_block, &ir_builder_);
+  return Status::OK();
+}
+
 Status IrEmitter::FinishVisit(HloInstruction* root) {
   // When this method is called, we should have already emitted an IR value for
   // the root (return) op. The IR value holds the address of the buffer holding
@@ -2639,7 +2655,6 @@ Status IrEmitter::FinishVisit(HloInstruction* root) {
   if (prof_counter) {
     profiling_state_.RecordCompleteComputation(&ir_builder_, prof_counter);
   }
-  ir_builder_.CreateRetVoid();
   return Status::OK();
 }
 
@@ -2780,43 +2795,16 @@ llvm::Type* IrEmitter::IrShapeType(const Shape& shape) {
   return llvm_ir::ShapeToIrType(shape, module_);
 }
 
-std::vector<llvm::Type*> IrEmitter::GetComputeFunctionParams() {
-  llvm::Type* i8_ptr_type = llvm::Type::getInt8PtrTy(module_->getContext());
-  llvm::Type* i8_ptr_ptr_type = i8_ptr_type->getPointerTo();
-  llvm::Type* i64_ptr_type = llvm::Type::getInt64PtrTy(module_->getContext());
-  std::vector<llvm::Type*> compute_function_params(
-      {i8_ptr_type, i8_ptr_type, i8_ptr_ptr_type, i8_ptr_ptr_type});
-  if (num_dynamic_loop_bounds_ > 0) {
-    compute_function_params.push_back(i64_ptr_type);
-  }
-  compute_function_params.push_back(i64_ptr_type);
-  return compute_function_params;
-}
-
-llvm::Argument* IrEmitter::GetResultArgument() {
-  return GetArg(compute_function_, 0);
-}
-
-llvm::Argument* IrEmitter::GetProfileCountersArgument() {
-  const int64 arg_index = num_dynamic_loop_bounds_ > 0 ? 5 : 4;
-  return GetArg(compute_function_, arg_index);
+llvm::Value* IrEmitter::GetProfileCountersArgument() {
+  return compute_function_->profile_counters_arg();
 }
 
 llvm::Value* IrEmitter::GetTempBuffersArgument() {
-  return GetArg(compute_function_, 3);
-}
-
-llvm::Value* IrEmitter::GetDynamicLoopBound(const int64 offset) {
-  CHECK_GT(num_dynamic_loop_bounds_, 0);
-  CHECK_LT(offset, num_dynamic_loop_bounds_ * 2);
-  llvm::Argument* loop_bounds_arg = GetArg(compute_function_, 4);
-  string name = tensorflow::strings::StrCat("dynamic_loop_bound_", offset);
-  return ir_builder_.CreateLoad(ir_builder_.CreateGEP(
-      loop_bounds_arg, ir_builder_.getInt64(offset), AsStringRef(name)));
+  return compute_function_->temp_buffers_arg();
 }
 
 llvm::Value* IrEmitter::GetExecutableRunOptionsArgument() {
-  return GetArg(compute_function_, 1);
+  return compute_function_->exec_run_options_arg();
 }
 
 llvm::Value* IrEmitter::EmitTempBufferPointer(
@@ -2847,10 +2835,14 @@ llvm::Value* IrEmitter::EmitTempBufferPointer(
       GetTempBuffersArgument(), slice.index(), &ir_builder_);
   llvm::LoadInst* tempbuf_address_base =
       ir_builder_.CreateLoad(tempbuf_address_ptr);
-  if (hlo_module_config_.debug_options()
+  if (is_top_level_computation_ &&
+      hlo_module_config_.debug_options()
           .xla_llvm_enable_invariant_load_metadata()) {
-    // Loading the address of a buffer is invariant of the point at which the
-    // load is executed in the program because we never reassign buffers.
+    // In the entry computation the parameter slots in the %params argument are
+    // invariant through program execution.  In computations that are called
+    // from the entry computation (via kWhile, kCall and kConditional) the
+    // parameter slots are *not* invariant since they're written to by their
+    // callers.
     tempbuf_address_base->setMetadata(
         llvm::LLVMContext::MD_invariant_load,
         llvm::MDNode::get(tempbuf_address_base->getContext(), /*MDs=*/{}));
@@ -2881,42 +2873,6 @@ llvm::Value* IrEmitter::EmitElementFunctionCall(
       AsStringRef(tensorflow::strings::StrCat(name, "_return_value")));
 }
 
-// Emits code to allocate an array of parameter address pointers, and store
-// each address from 'parameter_addresses'.
-// Returns an array of compute function call arguments (including parameter
-// address buffer).
-std::vector<llvm::Value*> IrEmitter::GetArrayFunctionCallArguments(
-    tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
-    llvm::Value* return_value_buffer, tensorflow::StringPiece name) {
-  llvm::Value* parameter_addresses_buffer =
-      llvm_ir::EmitAllocaAtFunctionEntryWithCount(
-          ir_builder_.getInt8PtrTy(),
-          ir_builder_.getInt32(parameter_addresses.size()),
-          tensorflow::strings::StrCat(name, "_parameter_addresses"),
-          &ir_builder_);
-  for (size_t i = 0; i < parameter_addresses.size(); ++i) {
-    llvm::Value* parameter_as_i8ptr = ir_builder_.CreateBitCast(
-        parameter_addresses[i], ir_builder_.getInt8PtrTy(),
-        AsStringRef(tensorflow::strings::StrCat(name, "_parameter_", i,
-                                                "_address_as_i8ptr")));
-    llvm::Value* slot_in_param_adresses = ir_builder_.CreateInBoundsGEP(
-        parameter_addresses_buffer, {ir_builder_.getInt64(i)});
-    ir_builder_.CreateStore(parameter_as_i8ptr, slot_in_param_adresses);
-  }
-
-  const auto to_int8_ptr = [this](llvm::Value* ptr) {
-    return ir_builder_.CreatePointerCast(ptr, ir_builder_.getInt8PtrTy());
-  };
-  std::vector<llvm::Value*> arguments{
-      to_int8_ptr(return_value_buffer),
-      to_int8_ptr(GetExecutableRunOptionsArgument()),
-      parameter_addresses_buffer, GetTempBuffersArgument()};
-  if (auto* profile_counters = GetProfileCountersArgument()) {
-    arguments.push_back(profile_counters);
-  }
-  return arguments;
-}
-
 // Emits a core function call based on the following pseudo-code.
 //
 //   char** parameter_addresses_buffer =
@@ -2932,8 +2888,12 @@ void IrEmitter::EmitArrayFunctionCallInto(
     tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
     llvm::Value* return_value_buffer, tensorflow::StringPiece name) {
   ir_builder_.CreateCall(
-      function, GetArrayFunctionCallArguments(parameter_addresses,
-                                              return_value_buffer, name));
+      function, GetArrayFunctionCallArguments(
+                    parameter_addresses, &ir_builder_, name,
+                    /*return_value_buffer=*/return_value_buffer,
+                    /*exec_run_options_arg=*/GetExecutableRunOptionsArgument(),
+                    /*temp_buffers_arg=*/GetTempBuffersArgument(),
+                    /*profile_counters_arg=*/GetProfileCountersArgument()));
 }
 
 llvm::Value* IrEmitter::EmitArrayFunctionCall(
@@ -2953,117 +2913,13 @@ llvm::Value* IrEmitter::EmitArrayFunctionCall(
   return return_value_buffer;
 }
 
-// Emits a call to a runtime fork/join function which dispatches parallel
-// calls to 'parallel_function' (and joins threads before returning).
-Status IrEmitter::EmitParallelForkJoin(
-    tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
-    llvm::Value* output_address, HloComputation* computation,
-    llvm::Function* parallel_function) {
-  HloInstruction* root = computation->root_instruction();
-
-  // Build ParallelForkJoin function type.
-  std::vector<llvm::Type*> compute_function_params = GetComputeFunctionParams();
-  // Number of parallel compute functions.
-  compute_function_params.push_back(ir_builder_.getInt32Ty());
-  // Array of partitions. There is an array element for each
-  // partition x partition_dim x 2 (for dimension start and limit).
-  compute_function_params.push_back(
-      llvm::Type::getInt64PtrTy(module_->getContext()));
-  // Number of partitioned most-major dimensions in 'root.shape'.
-  compute_function_params.push_back(ir_builder_.getInt32Ty());
-  // Function pointer for compute function to be dispatched in parallel.
-  compute_function_params.push_back(
-      llvm::Type::getInt8PtrTy(module_->getContext()));
-
-  llvm::FunctionType* fork_join_type = llvm::FunctionType::get(
-      /*Result=*/llvm::Type::getVoidTy(module_->getContext()),
-      /*Params=*/compute_function_params,
-      /*isVarArg=*/false);
-
-  llvm::Function* fork_join_func =
-      llvm::cast<llvm::Function>(module_->getOrInsertFunction(
-          runtime::kParallelForkJoinSymbolName, fork_join_type));
-  fork_join_func->setCallingConv(llvm::CallingConv::C);
-  fork_join_func->setDoesNotThrow();
-
-  // Add common compute function arguments.
-  const string name = computation->name();
-  std::vector<llvm::Value*> arguments =
-      GetArrayFunctionCallArguments(parameter_addresses, output_address, name);
-
-  // Create ShapePartitionIterator to generate all partitions of 'root.shape'.
-  ShapePartitionIterator partition_iterator(root->shape(),
-                                            root->outer_dimension_partitions());
-  const int64 num_partitions = partition_iterator.GetTotalPartitionCount();
-  // Add argument specifying the number of parallel partitions.
-  arguments.push_back(ir_builder_.getInt32(num_partitions));
-
-  // The number of partitioned most-major dimensions in 'root.shape'.
-  const int32 num_partitioned_dims = root->outer_dimension_partitions().size();
-  // A dimension partition consists of two elements: [start_index, limit_index).
-  const int32 dim_partition_size = 2;
-  // Calculate array partition stride.
-  const int32 array_partition_stride =
-      num_partitioned_dims * dim_partition_size;
-  // Calculate the total number of elements in the partition array.
-  const int32 partition_array_size =
-      dim_partition_size * num_partitioned_dims * num_partitions;
-
-  // Store dimension partition values as llvm constants in 'partitions'.
-  // See comments in runtime_fork_join.cc for array layout description.
-  std::vector<llvm::Constant*> partitions(partition_array_size);
-  for (int32 i = 0; i < num_partitions; ++i) {
-    std::vector<std::pair<int64, int64>> dim_partitions =
-        partition_iterator.GetPartition(i);
-    CHECK_EQ(num_partitioned_dims, dim_partitions.size());
-    const int32 partition_index = i * array_partition_stride;
-    for (int32 j = 0; j < num_partitioned_dims; ++j) {
-      const std::pair<int64, int64>& dim_partition = dim_partitions[j];
-      const int32 index = partition_index + j * dim_partition_size;
-      // Store partition [dim_start, dim_limit) intervals for each dimension.
-      partitions[index] = ir_builder_.getInt64(dim_partition.first);
-      partitions[index + 1] =
-          ir_builder_.getInt64(dim_partition.first + dim_partition.second);
-    }
-  }
-
-  // Create global variable out of dimension partitions in 'partitions'.
-  llvm::ArrayType* partitions_array_type =
-      llvm::ArrayType::get(ir_builder_.getInt64Ty(), partition_array_size);
-  llvm::Constant* partitions_array =
-      llvm::ConstantArray::get(partitions_array_type, partitions);
-  llvm::GlobalVariable* global_partitions_array = new llvm::GlobalVariable(
-      /*Module=*/*module_,
-      /*Type=*/partitions_array_type,
-      /*isConstant=*/true,
-      /*Linkage=*/llvm::GlobalValue::PrivateLinkage,
-      /*Initializer=*/partitions_array,
-      /*Name=*/
-      AsStringRef(
-          tensorflow::strings::StrCat(name, "_parallel_dimension_partitions")));
-
-  // Add argument specifying parallel dimension partitions.
-  arguments.push_back(ir_builder_.CreateBitCast(
-      global_partitions_array,
-      llvm::Type::getInt64PtrTy(module_->getContext())));
-  // Add argument specifying the number of partitioned most-major dimensions.
-  arguments.push_back(ir_builder_.getInt32(num_partitioned_dims));
-  // Add argument for parallel compute function pointer.
-  arguments.push_back(
-      ir_builder_.CreateBitCast(parallel_function, ir_builder_.getInt8PtrTy()));
-  // Emit call to parallel fork/join.
-  ir_builder_.CreateCall(fork_join_func, arguments);
-
-  return Status::OK();
-}
-
 Status IrEmitter::EmitTargetAddressForOp(const HloInstruction* op) {
   llvm::Value* addr;
   const Shape& target_shape = op->shape();
   if (op == op->parent()->root_instruction()) {
     // For the root node, we write directly to the output buffer of the
     // function.
-    llvm::Argument* retval = GetResultArgument();
+    llvm::Argument* retval = compute_function_->result_arg();
     if (!ShapeUtil::IsNil(target_shape)) {
       llvm::AttrBuilder attr_builder;
       attr_builder.addAlignmentAttr(MinimumAlignmentForShape(target_shape));
@@ -3124,8 +2980,13 @@ Status IrEmitter::EmitTargetElementLoop(
 
   } else {
     if (ShouldEmitParallelLoopFor(*target_op)) {
-      TF_RETURN_IF_ERROR(EmitParallelTargetElementLoop(
-          target_shape, element_generator, IrName(target_op), &target_array));
+      // Emit code to read dynamic loop bounds from compute function argument.
+      std::vector<std::pair<llvm::Value*, llvm::Value*>> dynamic_loop_bounds =
+          compute_function_->GetDynamicLoopBounds();
+      // Emit parallel loop with dynamic loop bounds for most-major dimensions.
+      TF_RETURN_IF_ERROR(ParallelLoopEmitter(element_generator, target_array,
+                                             &dynamic_loop_bounds, &ir_builder_)
+                             .EmitLoop(IrName(target_op)));
     } else {
       TF_RETURN_IF_ERROR(
           llvm_ir::LoopEmitter(element_generator, target_array, &ir_builder_)
@@ -3135,60 +2996,6 @@ Status IrEmitter::EmitTargetElementLoop(
   return Status::OK();
 }
 
-Status IrEmitter::EmitParallelTargetElementLoop(
-    const Shape& target_shape,
-    const llvm_ir::ElementGenerator& element_generator,
-    tensorflow::StringPiece loop_name, llvm_ir::IrArray* target_array) {
-  CHECK(!ShapeUtil::IsTuple(target_shape));
-  CHECK(!ShapeUtil::IsScalar(target_shape));
-
-  // Emit code to read dynamic loop bounds from function argument 4.
-  std::vector<llvm::Value*> dynamic_loop_bounds(2 * num_dynamic_loop_bounds_);
-  for (int i = 0; i < 2 * num_dynamic_loop_bounds_; ++i) {
-    dynamic_loop_bounds[i] = GetDynamicLoopBound(i);
-  }
-
-  llvm_ir::ForLoopNest loop_nest(loop_name, &ir_builder_);
-  const int64 num_dims = target_shape.dimensions_size();
-  llvm_ir::IrArray::Index array_index(num_dims);
-
-  // Add loops from outer-most to inner-most dimensions.
-  for (int i = target_shape.layout().minor_to_major_size() - 1; i >= 0; --i) {
-    const int64 dimension = target_shape.layout().minor_to_major(i);
-    const int bounds_index = num_dims - 1 - i;
-    if (bounds_index < num_dynamic_loop_bounds_) {
-      // Emit dynamic loop bounds for this dimension. Dynamic loop bounds
-      // are read from ir function dynamic loop bounds argument.
-      llvm::Value* start_index = dynamic_loop_bounds[bounds_index * 2 + 0];
-      llvm::Value* end_index = dynamic_loop_bounds[bounds_index * 2 + 1];
-
-      std::unique_ptr<llvm_ir::ForLoop> loop = loop_nest.AddLoop(
-          /*suffix=*/tensorflow::strings::Printf("dim.%lld", dimension),
-          start_index, end_index);
-      array_index[dimension] = loop->GetIndVarValue();
-    } else {
-      // Emit static loop bounds for this dimension.
-      std::unique_ptr<llvm_ir::ForLoop> loop = loop_nest.AddLoop(
-          /*start_index=*/0,
-          /*end_index=*/target_shape.dimensions(dimension),
-          /*suffix=*/tensorflow::strings::Printf("dim.%lld", dimension));
-      array_index[dimension] = loop->GetIndVarValue();
-    }
-  }
-  // Point IR builder at inner loop BB.
-  SetToFirstInsertPoint(loop_nest.GetInnerLoopBodyBasicBlock(), &ir_builder_);
-
-  // Emit loop body.
-  TF_ASSIGN_OR_RETURN(llvm::Value * target_element,
-                      element_generator(array_index));
-  target_array->EmitWriteArrayElement(array_index, target_element,
-                                      &ir_builder_);
-  // Point IR builder at outer loop exit BB.
-  SetToFirstInsertPoint(loop_nest.GetOuterLoopExitBasicBlock(), &ir_builder_);
-
-  return Status::OK();
-}
-
 Status IrEmitter::EmitMemcpy(const HloInstruction& source,
                              const HloInstruction& destination) {
   llvm::Value* source_value = GetEmittedValueFor(&source);
@@ -3247,36 +3054,26 @@ StatusOr<llvm::Value*> IrEmitter::EmitScalarCall(
                                  argument_addrs, name);
 }
 
-unsigned TargetMachineFeatures::largest_register_size_in_bytes(
-    llvm::Function* function) {
-  auto itr = largest_register_size_in_bytes_.find(function);
-  if (itr != largest_register_size_in_bytes_.end()) {
-    return itr->second;
+llvm::TargetTransformInfo* TargetMachineFeatures::GetTargetTransformInfoFor(
+    const llvm::Function& function) {
+  auto it = target_transform_infos_.find(&function);
+  if (it == target_transform_infos_.end()) {
+    // Using a dummy function analysis manager is kind of hacky, but LLVM's
+    // TargetTransformInfoWrapperPass::getTTI does the same thing.
+    //
+    // TODO(sanjoy): Fix this within LLVM by directly exposing
+    // TargetTransformInfo factories from TargetMachine.
+    llvm::FunctionAnalysisManager DummyFAM;
+    llvm::TargetTransformInfo target_transform_info =
+        target_machine_->getTargetIRAnalysis().run(function, DummyFAM);
+    auto emplace_result = target_transform_infos_.emplace(
+        &function, std::move(target_transform_info));
+    CHECK(emplace_result.second);
+    it = emplace_result.first;
   }
 
-  int result = largest_register_size_in_bytes_impl(function);
-
-  InsertOrDie(&largest_register_size_in_bytes_, function, result);
-  DCHECK_EQ(result, largest_register_size_in_bytes_.begin()->second);
-  return result;
+  return &it->second;
 }
 
-unsigned TargetMachineFeatures::largest_register_size_in_bytes_impl(
-    llvm::Function* function) const {
-  auto register_info =
-      target_machine_->getSubtargetImpl(*function)->getRegisterInfo();
-
-  unsigned largest_register_size = 0;
-  for (const llvm::TargetRegisterClass* register_class :
-       register_info->regclasses()) {
-    if (register_class->isAllocatable()) {
-      largest_register_size =
-          std::max(largest_register_size,
-                   register_info->getRegSizeInBits(*register_class));
-    }
-  }
-
-  return largest_register_size / 8;
-}
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index 351c95278c17f536e56d9f085b938a9baea9cde1..2341e3ea72ff312f2ca54b9495aff4065b34cd81 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -18,11 +18,13 @@ limitations under the License.
 
 #include <stddef.h>
 #include <map>
+#include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
 
 #include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Module.h"
@@ -30,6 +32,7 @@ limitations under the License.
 #include "llvm/Target/TargetMachine.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/cpu/external_constant_pool.h"
+#include "tensorflow/compiler/xla/service/cpu/ir_function.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -52,15 +55,6 @@ namespace cpu {
 
 // Wraps an llvm::TargetMachine and parses out some information that feeds into
 // code LLVM IR generation decisions.
-//
-// Ideally we'd be able to use llvm::TargetTransformInfo here (since its
-// interface is pretty much a perfect fit for our use case), but obtaining an
-// instance of llvm::TargetTransformInfo outside an LLVM pass pipeline without
-// super-ugly hacks is difficult.
-//
-// TODO(b/66049221): See if the LLVM community will be receptive to exposing an
-// API that lets us directly create and use llvm::TargetTransformInfo instances
-// outside of a pass manager.
 class TargetMachineFeatures {
  public:
   TargetMachineFeatures(llvm::TargetMachine* target_machine)
@@ -75,20 +69,21 @@ class TargetMachineFeatures {
     return 128;
   }
 
-  // Return the size of the largest register size in bytes.  We need to pass in
+  // Return the size of the largest vector size in bytes.  We need to pass in
   // "function" since llvm functions can contain annotations for specializing
   // them to specific micro-architectures (though currently XLA does not use
   // this functionality).
-  //
-  // Ideally we should have been able to use
-  // llvm::TargetTransformInfo::getRegisterBitWidth(true) here.
-  unsigned largest_register_size_in_bytes(llvm::Function* function);
+  int vector_register_byte_size(const llvm::Function& function) {
+    llvm::TargetTransformInfo* tti = GetTargetTransformInfoFor(function);
+    return tti->getRegisterBitWidth(/*Vector=*/true) / 8;
+  }
 
  private:
-  unsigned largest_register_size_in_bytes_impl(llvm::Function* function) const;
+  llvm::TargetTransformInfo* GetTargetTransformInfoFor(
+      const llvm::Function& function);
 
-  tensorflow::gtl::FlatMap<llvm::Function*, int>
-      largest_register_size_in_bytes_;
+  tensorflow::gtl::FlatMap<const llvm::Function*, llvm::TargetTransformInfo>
+      target_transform_infos_;
   llvm::TargetMachine* target_machine_;
 };
 
@@ -189,6 +184,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   Status HandleCustomCall(HloInstruction* custom_call) override;
   Status HandleWhile(HloInstruction* xla_while) override;
   Status HandleConcatenate(HloInstruction* concatenate) override;
+  Status HandleConditional(HloInstruction* conditional) override;
   Status FinishVisit(HloInstruction* root) override;
 
   Status Preprocess(HloInstruction* hlo) override;
@@ -233,16 +229,9 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   // Convenience function to get the IR type matching the given shape.
   llvm::Type* IrShapeType(const Shape& shape);
 
-  // Returns an array of compute function parameter types.
-  std::vector<llvm::Type*> GetComputeFunctionParams();
-
-  // Get the llvm::Value* that represents the "retval" argument of the
-  // computation function being emitted by this emitter.
-  llvm::Argument* GetResultArgument();
-
   // Get the llvm::Value* that represents the "prof_counters" argument of the
   // computation function being emitted by this emitter.
-  llvm::Argument* GetProfileCountersArgument();
+  llvm::Value* GetProfileCountersArgument();
 
   // Get the xla::ExecutableRunOptions that represents the "run_options"
   // argument of the computation function being emitted by this emitter.
@@ -252,11 +241,6 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   // computation function being emitted by this emitter.
   llvm::Value* GetTempBuffersArgument();
 
-  // Emit ir to read and return the ir value for the dynamic loop bound at
-  // 'offset' from the "dynamic_loop_bounds" argument of the computation
-  // function being emitted by this emitter.
-  llvm::Value* GetDynamicLoopBound(const int64 offset);
-
   // Emits code that computes the address of the given temporary buffer to the
   // function. target_shape is the shape of this temporary buffer.
   // The returned Value's type is a pointer to element_type.
@@ -310,18 +294,6 @@ class IrEmitter : public DfsHloVisitorWithDefault {
       tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
       tensorflow::StringPiece name);
 
-  // Returns an array of compute function call arguments.
-  std::vector<llvm::Value*> GetArrayFunctionCallArguments(
-      tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
-      llvm::Value* return_value_buffer, tensorflow::StringPiece name);
-
-  // Emits a call to a runtime fork/join function which dispatches parallel
-  // calls to 'parallel_function' (and joins threads before returning).
-  Status EmitParallelForkJoin(
-      tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
-      llvm::Value* output_address, HloComputation* computation,
-      llvm::Function* parallel_function);
-
   // Verifies that the element types of all of the given operand instructions
   // match and are of one of the given supported types.
   Status ElementTypesSameAndSupported(
@@ -346,15 +318,6 @@ class IrEmitter : public DfsHloVisitorWithDefault {
       HloInstruction* target_op, tensorflow::StringPiece desc,
       const llvm_ir::ElementGenerator& element_generator);
 
-  // Emit IR to perform a computation for every element in a partition/slice of
-  // 'target_shape'. The loop bounds for the outer-dimension partitions are
-  // passed into the compute function as a runtime argument (accessible from
-  // GetDynamicLoopBound).
-  Status EmitParallelTargetElementLoop(
-      const Shape& target_shape,
-      const llvm_ir::ElementGenerator& element_generator,
-      tensorflow::StringPiece loop_name, llvm_ir::IrArray* target_array);
-
   // Emits a memcpy from the source instruction's result value to the
   // destination's.  Both source and destination must have an entry in the
   // emitted_value_ table.
@@ -476,8 +439,10 @@ class IrEmitter : public DfsHloVisitorWithDefault {
       thread_local_buffers_;
 
   // The following fields track the IR emission state. According to LLVM memory
-  // management rules, their memory is owned by the module.
-  llvm::Function* compute_function_;
+  // management rules, their memory is owned by the module (Note that IrFunction
+  // creates the encapsulated llvm::Function s.t. it is added to the llvm
+  // module's function list).
+  std::unique_ptr<IrFunction> compute_function_;
   llvm::IRBuilder<> ir_builder_;
 
   // Maps HLOs to their index into the profile counter array.
@@ -490,7 +455,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   llvm_ir::AliasAnalysis alias_analysis_;
 
   // The number of root instruction outer dimensions used in parallel loop
-  // emission (EmitParallelTargetElementLoop).
+  // emission (ParallelLoopEmitter).
   int64 num_dynamic_loop_bounds_ = 0;
 
   // Returns whether the given instruction should be emitted as a parallel loop.
@@ -510,7 +475,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
           use_rdtscp_(false),
           prof_counters_(nullptr) {}
     ProfilingState(bool is_top_level_computation, bool use_rdtscp,
-                   llvm::Argument* prof_counters)
+                   llvm::Value* prof_counters)
         : is_top_level_computation_(is_top_level_computation),
           use_rdtscp_(use_rdtscp),
           prof_counters_(prof_counters) {}
@@ -543,7 +508,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
     bool use_rdtscp_;
 
     // The argument which corresponds to the profile counter buffer.
-    llvm::Argument* prof_counters_;
+    llvm::Value* prof_counters_;
 
     // The first read cycle counter in the program.
     llvm::Value* first_read_cycle_start_ = nullptr;
diff --git a/tensorflow/compiler/xla/service/cpu/ir_function.cc b/tensorflow/compiler/xla/service/cpu/ir_function.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ca8c290dd1c4959e42026c3917d37f8fc95a1011
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/ir_function.cc
@@ -0,0 +1,333 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <iterator>
+
+#include "tensorflow/compiler/xla/service/cpu/ir_function.h"
+
+#include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
+#include "tensorflow/compiler/xla/service/cpu/shape_partition.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+
+namespace xla {
+
+namespace {
+using llvm_ir::AsStringRef;
+}  // namespace
+
+namespace cpu {
+
+static std::vector<llvm::Type*> GetComputeFunctionParams(
+    llvm::Module* llvm_module, const int64 num_dynamic_loop_bounds) {
+  llvm::Type* i8_ptr_type = llvm::Type::getInt8PtrTy(llvm_module->getContext());
+  llvm::Type* i8_ptr_ptr_type = i8_ptr_type->getPointerTo();
+  llvm::Type* i64_ptr_type =
+      llvm::Type::getInt64PtrTy(llvm_module->getContext());
+  std::vector<llvm::Type*> compute_function_params(
+      {i8_ptr_type, i8_ptr_type, i8_ptr_ptr_type, i8_ptr_ptr_type});
+  if (num_dynamic_loop_bounds > 0) {
+    compute_function_params.push_back(i64_ptr_type);
+  }
+  compute_function_params.push_back(i64_ptr_type);
+  return compute_function_params;
+}
+
+IrFunction::IrFunction(const string& function_name,
+                       llvm::Function::LinkageTypes linkage,
+                       const bool optimize_for_size_requested,
+                       const bool enable_fast_math, llvm::Module* llvm_module,
+                       llvm::IRBuilder<>* ir_builder,
+                       int64 num_dynamic_loop_bounds)
+    : ir_builder_(ir_builder),
+      llvm_module_(llvm_module),
+      caller_insert_point_guard_(*ir_builder),
+      num_dynamic_loop_bounds_(num_dynamic_loop_bounds) {
+  Initialize(function_name, linkage, optimize_for_size_requested,
+             enable_fast_math);
+}
+
+IrFunction::~IrFunction() {
+  // Emit function return value.
+  ir_builder_->CreateRetVoid();
+}
+
+DynamicLoopBounds IrFunction::GetDynamicLoopBounds() {
+  DynamicLoopBounds dynamic_loop_bounds(num_dynamic_loop_bounds_);
+  for (int i = 0; i < num_dynamic_loop_bounds_; ++i) {
+    dynamic_loop_bounds[i].first = GetDynamicLoopBound(i * 2 + 0);
+    dynamic_loop_bounds[i].second = GetDynamicLoopBound(i * 2 + 1);
+  }
+  return dynamic_loop_bounds;
+}
+
+void IrFunction::Initialize(const string& function_name,
+                            llvm::Function::LinkageTypes linkage,
+                            const bool optimize_for_size_requested,
+                            const bool enable_fast_math) {
+  // The function signature is:
+  //   void function(i8* retval, i8* run_options, i8** params, i8** temps,
+  //                 i64* dynamic_loop_bounds, i64* prof_counters)
+  //
+  // retval: points to the returned value.
+  // params: address of an array with pointers to parameters.
+  // temps: address of an array with pointers to temporary buffers.
+  //
+  // Therefore, the generated function's signature (FunctionType) is statically
+  // determined - parameter unpacking is done in code generated into the
+  // function, rather than by a prologue dictated by the platform ABI.
+  //
+  //                      /--------------\
+  //   retval ----------> | return value |
+  //                      \--------------/
+  //
+  //                      /-------------------------------\
+  //   run_options -----> | xla::ExecutableRunOptions |
+  //                      \-------------------------------/
+  //
+  //                     /---------------------------------------------\
+  //   params -------->  |  param 0  |  param 1  | ..... |  param N-1  |
+  //                     |   addr    |   addr    |       |   addr      |
+  //                     \---------------------------------------------/
+  //                          |           |                   |
+  //                          |           |                   |
+  //                          V           V                   V
+  //                     /---------\  /---------\         /-----------\
+  //                     | param 0 |  | param 1 |         | param N-1 |
+  //                     \---------/  \---------/         \-----------/
+  //
+  //                     /---------------------------------------------\
+  //   temps --------->  |  temp  0  |  temp  1  | ..... |  temp  N-1  |
+  //                     |   addr    |   addr    |       |   addr      |
+  //                     \---------------------------------------------/
+  //                          |           |                   |
+  //                          |           |                   |
+  //                          V           V                   V
+  //                     /---------\  /---------\         /-----------\
+  //                     | temp  0 |  | temp  1 |         | temp  N-1 |
+  //                     \---------/  \---------/         \-----------/
+  //
+  //                        /--------------------------------------------\
+  // dynamic loop bounds -> | outer_dim0_start | outer_dim0_limit | .....|
+  //  (elided for aot)      \--------------------------------------------/
+  //
+  //                     /---------------------------------------------\
+  //   prof counters ->  | counter 0 | counter 1 | ..... | counter N-1 |
+  //                     \---------------------------------------------/
+
+  // Even though the type of params and temps is void** in the host's view, in
+  // LLVM IR this is represented by i8*, similarly to void*. It's up to the code
+  // to use GEPs to unravel the indirection layers.
+  llvm::FunctionType* function_type = llvm::FunctionType::get(
+      /*Result=*/llvm::Type::getVoidTy(llvm_module_->getContext()),
+      /*Params=*/
+      GetComputeFunctionParams(llvm_module_, num_dynamic_loop_bounds_),
+      /*isVarArg=*/false);
+
+  // Functions with local linkage get an inlining bonus.  Because we know
+  // a-priori that embedded functions (non-entry functions) will not have its
+  // name resolved, give it local linkage.
+  function_ =
+      llvm_ir::CreateFunction(function_type, linkage,
+                              /*enable_fast_math=*/enable_fast_math,
+                              /*optimize_for_size=*/optimize_for_size_requested,
+                              function_name, llvm_module_);
+
+  // Set meaningful names for the function's arguments: useful for debugging.
+  llvm::Function::arg_iterator arg_iter = function_->arg_begin();
+  arg_iter->setName("retval");
+  result_arg_ = &*arg_iter;
+  (++arg_iter)->setName("run_options");
+  exec_run_options_arg_ = &*arg_iter;
+  (++arg_iter)->setName("params");
+  parameters_arg_ = &*arg_iter;
+  (++arg_iter)->setName("temps");
+  temp_buffers_arg_ = &*arg_iter;
+  if (num_dynamic_loop_bounds_ > 0) {
+    (++arg_iter)->setName("dynamic_loop_bounds");
+    dynamic_loop_bounds_arg_ = &*arg_iter;
+  }
+  (++arg_iter)->setName("prof_counters");
+  profile_counters_arg_ = &*arg_iter;
+
+  // We know a-priori that the function arguments are guaranteed to point to
+  // disjoint objects.
+  llvm::Argument* retval = result_arg();
+  for (llvm::Argument& argument : function_->args()) {
+    // However, the return buffer aliases the temporaries and thus cannot be
+    // marked noalias.
+    if (&argument == retval) {
+      continue;
+    }
+    function_->addAttribute(argument.getArgNo() + 1, llvm::Attribute::NoAlias);
+  }
+
+  ir_builder_->SetInsertPoint(llvm::BasicBlock::Create(
+      /*Context=*/llvm_module_->getContext(),
+      /*Name=*/"entry",
+      /*Parent=*/function_));
+}
+
+llvm::Value* IrFunction::GetDynamicLoopBound(const int64 offset) {
+  CHECK_GT(num_dynamic_loop_bounds_, 0);
+  CHECK_LT(offset, num_dynamic_loop_bounds_ * 2);
+  string name = tensorflow::strings::StrCat("dynamic_loop_bound_", offset);
+  return ir_builder_->CreateLoad(
+      ir_builder_->CreateGEP(CHECK_NOTNULL(dynamic_loop_bounds_arg_),
+                             ir_builder_->getInt64(offset), AsStringRef(name)));
+}
+
+// Emits code to allocate an array of parameter address pointers, and store
+// each address from 'parameter_addresses'.
+// Returns an array of compute function call arguments (including parameter
+// address buffer).
+std::vector<llvm::Value*> GetArrayFunctionCallArguments(
+    tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
+    llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece name,
+    llvm::Value* return_value_buffer, llvm::Value* exec_run_options_arg,
+    llvm::Value* temp_buffers_arg, llvm::Value* profile_counters_arg) {
+  llvm::Value* parameter_addresses_buffer =
+      llvm_ir::EmitAllocaAtFunctionEntryWithCount(
+          ir_builder->getInt8PtrTy(),
+          ir_builder->getInt32(parameter_addresses.size()),
+          tensorflow::strings::StrCat(name, "_parameter_addresses"),
+          ir_builder);
+  for (size_t i = 0; i < parameter_addresses.size(); ++i) {
+    llvm::Value* parameter_as_i8ptr = ir_builder->CreateBitCast(
+        parameter_addresses[i], ir_builder->getInt8PtrTy(),
+        AsStringRef(tensorflow::strings::StrCat(name, "_parameter_", i,
+                                                "_address_as_i8ptr")));
+    llvm::Value* slot_in_param_adresses = ir_builder->CreateInBoundsGEP(
+        parameter_addresses_buffer, {ir_builder->getInt64(i)});
+    ir_builder->CreateStore(parameter_as_i8ptr, slot_in_param_adresses);
+  }
+
+  const auto to_int8_ptr = [=](llvm::Value* ptr) {
+    return ir_builder->CreatePointerCast(ptr, ir_builder->getInt8PtrTy());
+  };
+  std::vector<llvm::Value*> arguments{
+      to_int8_ptr(return_value_buffer), to_int8_ptr(exec_run_options_arg),
+      parameter_addresses_buffer, temp_buffers_arg};
+  if (profile_counters_arg != nullptr) {
+    arguments.push_back(profile_counters_arg);
+  }
+  return arguments;
+}
+
+// Emits a call to a runtime fork/join function which dispatches parallel
+// calls to 'parallel_function' (and joins threads before returning).
+Status EmitCallToParallelForkJoin(
+    const std::vector<llvm::Value*>& arguments, const Shape& shape,
+    const std::vector<int64>& dimension_partition_counts,
+    llvm::IRBuilder<>* ir_builder, llvm::Function* parallel_function,
+    const string& name) {
+  llvm::Module* module = ir_builder->GetInsertBlock()->getModule();
+
+  // Build ParallelForkJoin function type.
+  std::vector<llvm::Type*> compute_function_params =
+      GetComputeFunctionParams(module, /*num_dynamic_loop_bounds=*/0);
+  // Number of parallel compute functions.
+  compute_function_params.push_back(ir_builder->getInt32Ty());
+  // Array of partitions. There is an array element for each
+  // partition x partition_dim x 2 (for dimension start and limit).
+  compute_function_params.push_back(
+      llvm::Type::getInt64PtrTy(module->getContext()));
+  // Number of partitioned most-major dimensions in 'shape'.
+  compute_function_params.push_back(ir_builder->getInt32Ty());
+  // Function pointer for compute function to be dispatched in parallel.
+  compute_function_params.push_back(
+      llvm::Type::getInt8PtrTy(module->getContext()));
+
+  llvm::FunctionType* fork_join_type = llvm::FunctionType::get(
+      /*Result=*/llvm::Type::getVoidTy(module->getContext()),
+      /*Params=*/compute_function_params,
+      /*isVarArg=*/false);
+
+  llvm::Function* fork_join_func =
+      llvm::cast<llvm::Function>(module->getOrInsertFunction(
+          runtime::kParallelForkJoinSymbolName, fork_join_type));
+  fork_join_func->setCallingConv(llvm::CallingConv::C);
+  fork_join_func->setDoesNotThrow();
+
+  // Add common compute function arguments.
+  std::vector<llvm::Value*> fork_join_arguments(arguments);
+
+  // Create ShapePartitionIterator to generate all partitions of 'shape'.
+  ShapePartitionIterator partition_iterator(shape, dimension_partition_counts);
+  const int64 num_partitions = partition_iterator.GetTotalPartitionCount();
+  // Add argument specifying the number of parallel partitions.
+  fork_join_arguments.push_back(ir_builder->getInt32(num_partitions));
+
+  // The number of partitioned most-major dimensions in 'shape'.
+  const int32 num_partitioned_dims = dimension_partition_counts.size();
+  // A dimension partition consists of two elements: [start_index, limit_index).
+  const int32 dim_partition_size = 2;
+  // Calculate array partition stride.
+  const int32 array_partition_stride =
+      num_partitioned_dims * dim_partition_size;
+  // Calculate the total number of elements in the partition array.
+  const int32 partition_array_size =
+      dim_partition_size * num_partitioned_dims * num_partitions;
+
+  // Store dimension partition values as llvm constants in 'partitions'.
+  // See comments in runtime_fork_join.cc for array layout description.
+  std::vector<llvm::Constant*> partitions(partition_array_size);
+  for (int32 i = 0; i < num_partitions; ++i) {
+    std::vector<std::pair<int64, int64>> dim_partitions =
+        partition_iterator.GetPartition(i);
+    CHECK_EQ(num_partitioned_dims, dim_partitions.size());
+    const int32 partition_index = i * array_partition_stride;
+    for (int32 j = 0; j < num_partitioned_dims; ++j) {
+      const std::pair<int64, int64>& dim_partition = dim_partitions[j];
+      const int32 index = partition_index + j * dim_partition_size;
+      // Store partition [dim_start, dim_limit) intervals for each dimension.
+      partitions[index] = ir_builder->getInt64(dim_partition.first);
+      partitions[index + 1] =
+          ir_builder->getInt64(dim_partition.first + dim_partition.second);
+    }
+  }
+
+  // Create global variable out of dimension partitions in 'partitions'.
+  llvm::ArrayType* partitions_array_type =
+      llvm::ArrayType::get(ir_builder->getInt64Ty(), partition_array_size);
+  llvm::Constant* partitions_array =
+      llvm::ConstantArray::get(partitions_array_type, partitions);
+  llvm::GlobalVariable* global_partitions_array = new llvm::GlobalVariable(
+      /*M=*/*module,
+      /*Ty=*/partitions_array_type,
+      /*isConstant=*/true,
+      /*Linkage=*/llvm::GlobalValue::PrivateLinkage,
+      /*Initializer=*/partitions_array,
+      /*Name=*/
+      AsStringRef(
+          tensorflow::strings::StrCat(name, "_parallel_dimension_partitions")));
+
+  // Add argument specifying parallel dimension partitions.
+  fork_join_arguments.push_back(ir_builder->CreateBitCast(
+      global_partitions_array,
+      llvm::Type::getInt64PtrTy(module->getContext())));
+  // Add argument specifying the number of partitioned most-major dimensions.
+  fork_join_arguments.push_back(ir_builder->getInt32(num_partitioned_dims));
+  // Add argument for parallel compute function pointer.
+  fork_join_arguments.push_back(
+      ir_builder->CreateBitCast(parallel_function, ir_builder->getInt8PtrTy()));
+  // Emit call to parallel fork/join.
+  ir_builder->CreateCall(fork_join_func, fork_join_arguments);
+
+  return Status::OK();
+}
+
+}  // namespace cpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/ir_function.h b/tensorflow/compiler/xla/service/cpu/ir_function.h
new file mode 100644
index 0000000000000000000000000000000000000000..1fd2da4dce23982ed030f3aa8ec604182d0ebab8
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/ir_function.h
@@ -0,0 +1,134 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_FUNCTION_H_
+#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_FUNCTION_H_
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+
+namespace xla {
+namespace cpu {
+
+// IrFunction creates and encapsulates an llvm::Function, exposing methods to
+// emitters for function and function argument access.
+// The llvm::Function is created with the standard function signature
+// used in the XLA CPU backend (see ir_function.cc for argument details).
+// In addtion IrFunction saves the callers IR insert point during contruction,
+// and restores it after desctruction.
+//
+// Example usage:
+//
+//    // Create and initialize new IrFunction.
+//    std::unique_ptr<IrFunction> compute_function(new IrFunction(...));
+//    // Emit IR for function body using IrFunction helper methods.
+//    ...
+//    // Store reference to llvm::Function for future invocation.
+//    ir_functions.push_back(compute_function.function());
+//    // Delete IrFunction (finalizes IR function and restores caller insertion
+//    // point).
+//    compute_function.reset();
+//
+
+class IrFunction {
+ public:
+  IrFunction(const string& function_name, llvm::Function::LinkageTypes linkage,
+             const bool optimize_for_size_requested,
+             const bool enable_fast_math, llvm::Module* llvm_module,
+             llvm::IRBuilder<>* ir_builder, int64 num_dynamic_loop_bounds);
+  ~IrFunction();
+
+  // Emit ir to read and return the set of ir values representing the dynamic
+  // loop bounds argument of this function.
+  // Each element in returned vector is a pair of ir values representing
+  // the loop bounds for a specific dimension, where the first element of the
+  // pair is the dimension start index, and the second element of the pair
+  // is the dimension limit.
+  // EX: [dimension_i_index_start_ir_value, dimension_i_index_limit_ir_value]
+  //
+  DynamicLoopBounds GetDynamicLoopBounds();
+
+  // Returns the encapculated llvm::Function.
+  llvm::Function* function() { return function_; }
+
+  // Get the llvm::Value* that represents this functions "retval" argument.
+  llvm::Argument* result_arg() { return result_arg_; }
+
+  // Get the xla::ExecutableRunOptions that represents this functions
+  // "run_options" argument.
+  llvm::Value* exec_run_options_arg() { return exec_run_options_arg_; }
+
+  // Get the llvm::Value* that represents this functions parameters argument.
+  llvm::Value* parameters_arg() { return parameters_arg_; }
+
+  // Get the llvm::Value* that represents this functions "temps" argument.
+  llvm::Value* temp_buffers_arg() { return temp_buffers_arg_; }
+
+  // Get the llvm::Value* that represents this functions "prof_counters"
+  // argument.
+  llvm::Value* profile_counters_arg() { return profile_counters_arg_; }
+
+ private:
+  // Initialize an llvm::Function with standard signature based on arguments.
+  void Initialize(const string& function_name,
+                  llvm::Function::LinkageTypes linkage,
+                  bool optimize_for_size_requested, bool enable_fast_math);
+
+  // Emit ir to read and return the ir value for the dynamic loop bound at
+  // 'offset' from the "dynamic_loop_bounds" argument of this function.
+  llvm::Value* GetDynamicLoopBound(int64 offset);
+
+  llvm::IRBuilder<>* ir_builder_;
+  llvm::Module* llvm_module_;
+  llvm::IRBuilder<>::InsertPointGuard caller_insert_point_guard_;
+
+  int64 num_dynamic_loop_bounds_ = 0;
+  // Encapsulated llvm::Function.
+  llvm::Function* function_;
+  // Function argument IR values.
+  llvm::Argument* result_arg_;
+  llvm::Value* exec_run_options_arg_;
+  llvm::Value* parameters_arg_;
+  llvm::Value* temp_buffers_arg_;
+  llvm::Value* dynamic_loop_bounds_arg_ = nullptr;
+  llvm::Value* profile_counters_arg_;
+};
+
+// Returns an array of compute function call argument ir values.
+std::vector<llvm::Value*> GetArrayFunctionCallArguments(
+    tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
+    llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece name,
+    llvm::Value* return_value_buffer, llvm::Value* exec_run_options_arg,
+    llvm::Value* temp_buffers_arg, llvm::Value* profile_counters_arg);
+
+// Emits a call to a runtime fork/join function which dispatches parallel
+// calls to 'parallel_function' (and joins threads before returning).
+Status EmitCallToParallelForkJoin(
+    const std::vector<llvm::Value*>& arguments, const Shape& shape,
+    const std::vector<int64>& dimension_partition_counts,
+    llvm::IRBuilder<>* ir_builder, llvm::Function* parallel_function,
+    const string& name);
+
+}  // namespace cpu
+}  // namespace xla
+
+#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_FUNCTION_H_
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
index aff61296ced47a911ded207f611747564b5ac7eb..d1b88b27f068962fb86477fcad3e4390b1636c2b 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
@@ -59,19 +59,20 @@ ParallelCpuExecutable::ParallelCpuExecutable(
     std::unique_ptr<const BufferAssignment> assignment,
     std::unique_ptr<const HloModule> hlo_module,
     std::unique_ptr<const HloInstructionMap<string>> function_names,
-    std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx,
     std::unordered_map<const HloInstruction*, std::unique_ptr<unsigned char[]>>
-        aligned_constants)
-    : Executable(std::move(hlo_module)),
+        aligned_constants,
+    std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
+    : Executable(std::move(hlo_module), std::move(hlo_profile_printer),
+                 std::move(hlo_profile_index_map)),
       jit_(std::move(jit)),
       assignment_(std::move(assignment)),
       function_names_(std::move(function_names)),
-      hlo_to_profile_idx_(std::move(hlo_to_profile_idx)),
       aligned_constants_(std::move(aligned_constants)) {}
 
 // Type of the computation function we expect in the JIT.
 using ComputeFunctionType = void (*)(void*, const void*, const void**, void**,
-                                     int64*, uint64*);
+                                     int64*, int64*);
 
 // Given a pointer to an output buffer (following the CPU JIT calling
 // conventions), mark addresses that are "live". The initial pointer itself is
@@ -106,7 +107,7 @@ class Executor {
            const ServiceExecutableRunOptions* run_options,
            std::list<HloInstruction*>* pending,
            HloInstructionMap<const void*>* results, void** temps_array,
-           uint64* profile_counters_array, const BufferAssignment* assignment)
+           int64* profile_counters_array, const BufferAssignment* assignment)
       : functions_(functions),
         run_options_(run_options),
         pending_(pending),
@@ -147,7 +148,7 @@ class Executor {
   std::list<HloInstruction*>* pending_;
   HloInstructionMap<const void*>* results_;
   void** temps_array_;
-  uint64* profile_counters_array_;
+  int64* profile_counters_array_;
   tensorflow::thread::ThreadPool* thread_pool_;
   const BufferAssignment* assignment_;
 
@@ -375,23 +376,12 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
     HloExecutionProfile* hlo_execution_profile) {
-  std::vector<se::DeviceMemoryBase> argument_buffers(arguments.size());
-  for (int i = 0; i < arguments.size(); ++i) {
-    argument_buffers[i] = arguments[i]->buffer(/*index=*/{});
-  }
-  return ExecuteComputeFunctions(run_options, argument_buffers, buffers,
-                                 hlo_execution_profile);
-}
-
-Status ParallelCpuExecutable::ExecuteComputeFunctions(
-    const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
-    HloExecutionProfile* hlo_execution_profile) {
   // Allocate profiling counters for each hlo instruction that we would like to
-  // profile.  Allocate an additional profile counter for the entire
-  // computation.
-  std::vector<uint64> profile_counters(hlo_to_profile_idx_.size() + 1);
+  // profile.
+  std::vector<int64>* profile_counters = nullptr;
+  if (hlo_execution_profile) {
+    profile_counters = hlo_execution_profile->mutable_profile_counters();
+  }
 
   std::vector<void*> buffer_pointers;
   buffer_pointers.reserve(buffers.size());
@@ -425,8 +415,9 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
     // just copy the existing buffer into the map containing instruction
     // results..
     if (instruction->opcode() == HloOpcode::kParameter) {
-      InsertOrDie(&results, instruction,
-                  arguments[instruction->parameter_number()].opaque());
+      InsertOrDie(
+          &results, instruction,
+          arguments[instruction->parameter_number()]->root_buffer().opaque());
     } else if (instruction->opcode() == HloOpcode::kConstant) {
       unsigned char* aligned_data =
           FindOrDie(aligned_constants_, instruction).get();
@@ -441,9 +432,9 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
   // For example, if we expect a library conv/matmul call to run at max
   // concurrency, we should not dispatch runnable instructions until the
   // library call is finished (to avoid expensive cache invalidation).
-  Executor executor(functions, run_options, &pending, &results,
-                    buffer_pointers.data(), profile_counters.data(),
-                    assignment_.get());
+  Executor executor(
+      functions, run_options, &pending, &results, buffer_pointers.data(),
+      profile_counters ? profile_counters->data() : nullptr, assignment_.get());
 
   TF_RETURN_IF_ERROR(executor.Run());
 
@@ -453,86 +444,11 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
     tensorflow::mutex_lock lock(mutex_);
     double nanoseconds = (end_micros - start_micros) * 1000.0;
     execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
-    // The last profile counter is used for the computation as a whole.
-    execution_profile_.set_compute_cycle_count(profile_counters.back());
-  }
-  if (hlo_execution_profile != nullptr) {
-    hlo_execution_profile->set_total_cycles_executed(entry_computation,
-                                                     profile_counters.back());
-
-    for (auto hlo_prof_idx : hlo_to_profile_idx_) {
-      const HloInstruction* hlo = hlo_prof_idx.first;
-      uint64 cycles_taken = profile_counters[hlo_prof_idx.second];
-      hlo_execution_profile->SetCyclesTakenBy(hlo, cycles_taken);
-    }
   }
 
   return Status::OK();
 }
 
-StatusOr<perftools::gputools::DeviceMemoryBase>
-ParallelCpuExecutable::ExecuteOnStream(
-    const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
-    HloExecutionProfile* hlo_execution_profile) {
-  se::Stream* stream = run_options->stream();
-  DeviceMemoryAllocator* memory_allocator = run_options->allocator();
-  VLOG(3) << "ExecuteOnStream arg size: " << arguments.size();
-  if (!arguments.empty()) {
-    VLOG(3) << "ExecuteOnStream arg[0]: " << arguments.at(0).opaque();
-  }
-
-  // Allocate the temporary buffers required for the computation.
-  se::StreamExecutor* stream_executor = stream->parent();
-  int device_ordinal = stream_executor->device_ordinal();
-  int64 buffer_count = assignment_->Allocations().size();
-  VLOG(3) << "temp buffer count: " << buffer_count;
-
-  std::vector<se::DeviceMemoryBase> device_allocations(
-      assignment_->Allocations().size());
-  TF_RETURN_IF_ERROR(AllocateBuffers(memory_allocator,
-                                     stream->parent()->device_ordinal(),
-                                     &device_allocations));
-
-  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
-                      assignment_->GetUniqueTopLevelOutputSlice());
-  const BufferAllocation::Index result_index = result_slice.index();
-  VLOG(3) << "result index: " << result_index;
-
-  TF_RETURN_IF_ERROR(ExecuteComputeFunctions(
-      run_options, arguments, device_allocations, hlo_execution_profile));
-
-  // Mark the buffers that are actually live (used in the output) when the
-  // computation finishes executing.
-  std::unordered_set<const void*> marked_addresses;
-  MarkLiveAddressesInOutput(device_allocations[result_index].opaque(),
-                            result_shape(), &marked_addresses);
-
-  VLOG(3) << "Live addresses in output marking found "
-          << marked_addresses.size() << " addresses:\n"
-          << tensorflow::str_util::Join(
-                 marked_addresses, ", ", [](string* out, const void* address) {
-                   tensorflow::strings::StrAppend(
-                       out, tensorflow::strings::Printf("%p", address));
-                 });
-
-  // Computation is done - deallocate temp buffers. Keep those marked
-  // live because they are referenced by the output of the computation
-  // and are needed by the service. They will be deallocated by the
-  // service.
-  for (size_t i = 0; i < device_allocations.size(); ++i) {
-    auto alloc = device_allocations[i];
-    if (marked_addresses.count(alloc.opaque()) == 0 &&
-        alloc.opaque() != nullptr) {
-      VLOG(3) << "ParallelCpuExecutable deallocating buffer #" << i << " ["
-              << alloc.opaque() << "]";
-      TF_RETURN_IF_ERROR(memory_allocator->Deallocate(device_ordinal, &alloc));
-    }
-  }
-
-  return device_allocations[result_index];
-}
-
 StatusOr<std::unique_ptr<ShapedBuffer>> ParallelCpuExecutable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
@@ -545,9 +461,9 @@ StatusOr<std::unique_ptr<ShapedBuffer>> ParallelCpuExecutable::ExecuteOnStream(
   DeviceMemoryAllocator* memory_allocator = run_options->allocator();
   std::vector<se::DeviceMemoryBase> buffers(assignment_->Allocations().size());
 
-  auto result_buffer =
-      MakeUnique<ShapedBuffer>(result_shape(), stream->parent()->platform(),
-                               stream->parent()->device_ordinal());
+  auto result_buffer = MakeUnique<ShapedBuffer>(
+      /*on_host_shape=*/result_shape(), /*on_device_shape=*/result_shape(),
+      stream->parent()->platform(), stream->parent()->device_ordinal());
 
   TF_RETURN_IF_ERROR(AllocateBuffers(
       memory_allocator, stream->parent()->device_ordinal(), &buffers));
@@ -558,37 +474,30 @@ StatusOr<std::unique_ptr<ShapedBuffer>> ParallelCpuExecutable::ExecuteOnStream(
   // Copy DeviceMemoryBase values which into the respective location in
   // ShapedBuffer which is returned to the caller.
   std::vector<bool> buffers_in_result(assignment_->Allocations().size(), false);
-  TF_RETURN_IF_ERROR(
-      result_buffer->mutable_shape_index_to_buffer_entry()
-          ->ForEachMutableElementWithStatus(
-              [&buffers, &buffers_in_result, &result_buffer, this](
-                  const ShapeIndex& index, size_t* buffer_entry) {
-                  const auto& sources =
-                      this->GetRootPointsToSet().element(index);
-                  // The points to set is unambiguous so the set should be a
-                  // singleton.
-                  CHECK_EQ(1, sources.size());
-                  const LogicalBuffer* buffer_source = sources[0];
-                  HloInstruction* src = buffer_source->instruction();
-
-                  // The source for this result buffer can be a nested buffer
-                  // such as a tuple element.
-
-                  // The source instruction should have a non-parameter buffer
-                  // assigned.
-                  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice slice,
-                                      this->assignment_->GetUniqueSlice(
-                                          src, buffer_source->index()));
-                  CHECK(!slice.allocation()->is_entry_computation_parameter());
-
-                  const BufferAllocation::Index buffer_index = slice.index();
-                  const se::DeviceMemoryBase& buffer = buffers[buffer_index];
-                  CHECK(!buffer.is_null() || buffer.size() == 0);
-                  *buffer_entry = result_buffer->mutable_buffers()->size();
-                  result_buffer->mutable_buffers()->push_back(buffer);
-                  buffers_in_result[buffer_index] = true;
-                return Status::OK();
-              }));
+  TF_RETURN_IF_ERROR(result_buffer->buffers().ForEachMutableElementWithStatus(
+      [&](const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
+        const auto& sources = this->GetRootPointsToSet().element(index);
+
+        // The points to set is unambiguous so the set should be a singleton.
+        CHECK_EQ(1, sources.size());
+        const LogicalBuffer* buffer_source = sources[0];
+        HloInstruction* src = buffer_source->instruction();
+
+        // The source for this result buffer can be a nested buffer such as a
+        // tuple element. The source instruction should have a non-parameter
+        // buffer assigned.
+        TF_ASSIGN_OR_RETURN(
+            const BufferAllocation::Slice slice,
+            this->assignment_->GetUniqueSlice(src, buffer_source->index()));
+        CHECK(!slice.allocation()->is_entry_computation_parameter());
+
+        const BufferAllocation::Index buffer_index = slice.index();
+        const se::DeviceMemoryBase& buffer = buffers[buffer_index];
+        CHECK(!buffer.is_null() || buffer.size() == 0);
+        *device_memory = buffer;
+        buffers_in_result[buffer_index] = true;
+        return Status::OK();
+      }));
 
   // Free all buffers not in the result.
   for (size_t i = 0; i < buffers.size(); ++i) {
@@ -604,10 +513,10 @@ StatusOr<std::unique_ptr<ShapedBuffer>> ParallelCpuExecutable::ExecuteOnStream(
   return std::move(result_buffer);
 }
 
-StatusOr<perftools::gputools::DeviceMemoryBase>
+StatusOr<std::unique_ptr<ShapedBuffer>>
 ParallelCpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments) {
+    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   // TODO(b/30671675): Implement asynchronous execution mode.
   return Unimplemented(
       "Asynchronous execution on stream is not yet supported on CPU.");
@@ -618,10 +527,5 @@ const PointsToSet& ParallelCpuExecutable::GetRootPointsToSet() const {
       module().entry_computation()->root_instruction());
 }
 
-std::unique_ptr<HloCostAnalysis> ParallelCpuExecutable::CreateCostAnalysis()
-    const {
-  return MakeUnique<HloCostAnalysis>(ShapeSizeBytes);
-}
-
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
index db16aaf48b0ef2aaa727c1bd0106bc51d1a65095..90ac94ef9288b2e860cb30c47ed44a7b96e4825d 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
@@ -52,27 +52,21 @@ class ParallelCpuExecutable : public Executable {
       std::unique_ptr<const BufferAssignment> assignment,
       std::unique_ptr<const HloModule> hlo_module,
       std::unique_ptr<const HloInstructionMap<string>> function_names,
-      std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx,
       std::unordered_map<const HloInstruction*,
                          std::unique_ptr<unsigned char[]>>
-          aligned_constants);
+          aligned_constants,
+      std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+      std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
   ~ParallelCpuExecutable() override {}
 
-  StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
-      const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
-      HloExecutionProfile* hlo_execution_profile) override;
-
   StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteAsyncOnStream(
+  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments) override;
+      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
 
   // This should be called after set_ir_module_string.
   const string& ir_module_string() const { return ir_module_string_; }
@@ -95,8 +89,6 @@ class ParallelCpuExecutable : public Executable {
         "Equality test on CPU parallel executable is not implemented.");
   }
 
-  std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
-
  private:
   // Allocate buffers required for execution and assign them to the elements of
   // "buffers". "buffers" should be sized to the number of buffers in buffer
@@ -109,13 +101,6 @@ class ParallelCpuExecutable : public Executable {
 
   // Calls the generated functions in 'function_names_', performing the
   // computation with the given arguments using the supplied buffers.
-  Status ExecuteComputeFunctions(
-      const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          buffers,
-      HloExecutionProfile* hlo_execution_profile);
   Status ExecuteComputeFunctions(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
@@ -143,9 +128,6 @@ class ParallelCpuExecutable : public Executable {
   // Map containing the JITted function names for each HLO instruction.
   const std::unique_ptr<const HloInstructionMap<string>> function_names_;
 
-  // Maps HLOs to their index into the profile counter array.
-  const std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx_;
-
   // Map from HLO Constant instructions to a pointer to their literal data.
   // The data stored in the protocol buffer might be insufficiently aligned,
   // we create a sufficiently aligned copy and store it in this map.
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1e439cde11cf74272101b80c867a308e51ab26a6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
@@ -0,0 +1,76 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h"
+
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+
+namespace xla {
+namespace cpu {
+
+ParallelLoopEmitter::ParallelLoopEmitter(
+    const llvm_ir::ElementGenerator& target_element_generator,
+    const llvm_ir::IrArray& target_array,
+    const DynamicLoopBounds* dynamic_loop_bounds, llvm::IRBuilder<>* ir_builder)
+    : LoopEmitter(target_element_generator, target_array, ir_builder),
+      dynamic_loop_bounds_(dynamic_loop_bounds) {}
+
+llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
+    tensorflow::StringPiece loop_name) {
+  CHECK(!ShapeUtil::IsTuple(shape_));
+  CHECK(!ShapeUtil::IsScalar(shape_));
+
+  llvm_ir::ForLoopNest loop_nest(loop_name, ir_builder_);
+  const int64 num_dims = shape_.dimensions_size();
+  llvm_ir::IrArray::Index array_index(num_dims);
+
+  // Add loops from outer-most to inner-most dimensions.
+  for (int i = LayoutUtil::MinorToMajor(shape_).size() - 1; i >= 0; --i) {
+    const int64 dimension = LayoutUtil::Minor(shape_.layout(), i);
+    const int bounds_index = num_dims - 1 - i;
+    if (bounds_index < dynamic_loop_bounds_->size()) {
+      // Emit dynamic loop bounds for this dimension. Dynamic loop bounds
+      // are read from ir function dynamic loop bounds argument.
+      llvm::Value* start_index = (*dynamic_loop_bounds_)[bounds_index].first;
+      llvm::Value* end_index = (*dynamic_loop_bounds_)[bounds_index].second;
+
+      std::unique_ptr<llvm_ir::ForLoop> loop = loop_nest.AddLoop(
+          /*suffix=*/tensorflow::strings::Printf("dim.%lld", dimension),
+          start_index, end_index);
+      array_index[dimension] = loop->GetIndVarValue();
+    } else {
+      // Emit static loop bounds for this dimension.
+      std::unique_ptr<llvm_ir::ForLoop> loop = loop_nest.AddLoop(
+          /*start_index=*/0,
+          /*end_index=*/shape_.dimensions(dimension),
+          /*suffix=*/tensorflow::strings::Printf("dim.%lld", dimension));
+      array_index[dimension] = loop->GetIndVarValue();
+    }
+  }
+  // Point IR builder at inner loop BB.
+  llvm_ir::SetToFirstInsertPoint(loop_nest.GetInnerLoopBodyBasicBlock(),
+                                 ir_builder_);
+
+  // Set exit_bb_ to the exit block of the loop nest.
+  exit_bb_ = loop_nest.GetOuterLoopExitBasicBlock();
+  CHECK(exit_bb_ != nullptr);
+
+  return array_index;
+}
+
+}  // namespace cpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
new file mode 100644
index 0000000000000000000000000000000000000000..9335d2818e99eb3588537d80dabddda08c1c020e
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
@@ -0,0 +1,73 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_LOOP_EMITTER_H_
+#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_LOOP_EMITTER_H_
+
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Value.h"
+#include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h"
+
+namespace xla {
+namespace cpu {
+
+// ParallelLoopEmitter emits a loop nest for the target array shape.
+// The outer loop bounds of the loop nest are passed as ir values at runtime
+// (specified in 'dynamic_loop_bounds'), and the inner loop bounds are static.
+// Dynamic loop bounds are specified as an array of dimension index
+// [start, limit) pairs of ir values (one for each partitioned outer dimension).
+//
+// EX: Let 'shape' = [8, 16, 32], with the loop bounds of the two-most major
+//     dimensions dynamic. Then 'dynamic_loop_bounds' will contain the
+//     following ir values for the two most-major dimensions:
+//       [dim0_index_start_ir_value, dim0_index_limit_ir_value]
+//       [dim1_index_start_ir_value, dim1_index_limit_ir_value]
+//
+// Code emitted by ParallelLoopEmitter will be called in a multi-threaded
+// context where each thread will be assigned a different set of outer dimension
+// partitions, and where all threads will collectively iterate over the
+// entire target array shape.
+//
+// Outer dimension partitions can be generated using the ShapePartitionAssigner
+// and ShapePartitionIterator utility classes from shape_partition.cc.
+//
+class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
+ public:
+  // Constructs a ParallelLoopEmitter which uses 'target_element_generator' to
+  // generate elements, 'dynamic_loop_bounds' to set the loop bounds of the
+  // most-major dimensions, and 'target_array.' shape to set the static loop
+  // bounds for the most-minor dimensions.
+  ParallelLoopEmitter(const llvm_ir::ElementGenerator& target_element_generator,
+                      const llvm_ir::IrArray& target_array,
+                      const DynamicLoopBounds* dynamic_loop_bounds,
+                      llvm::IRBuilder<>* ir_builder);
+
+  ParallelLoopEmitter(const ParallelLoopEmitter&) = delete;
+  ParallelLoopEmitter& operator=(const ParallelLoopEmitter&) = delete;
+  ~ParallelLoopEmitter() override = default;
+
+  llvm_ir::IrArray::Index EmitIndexAndSetExitBasicBlock(
+      tensorflow::StringPiece loop_name) override;
+
+ private:
+  const DynamicLoopBounds* dynamic_loop_bounds_;
+};
+
+}  // namespace cpu
+}  // namespace xla
+
+#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_LOOP_EMITTER_H_
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index cda2783307925b77ac6d8cfe679c5b325db2befc..c942cd6bf12c58873d5195f7454249763e639f91 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -102,9 +102,21 @@ llvm::StringRef GetHostCpuName() {
 
 CompilerFunctor::VectorIntrinsics GetAvailableIntrinsics() {
   CompilerFunctor::VectorIntrinsics intrinsics;
-  intrinsics.sse_intrinsics = (&__xla_cpu_runtime_ExpV4F32SSE != nullptr);
-  intrinsics.avx_intrinsics = (&__xla_cpu_runtime_ExpV8F32AVX != nullptr);
-  intrinsics.neon_intrinsics = (&__xla_cpu_runtime_ExpV4F32NEON != nullptr);
+#ifdef __SSE4_1__
+  intrinsics.sse_intrinsics = true;
+#else
+  intrinsics.sse_intrinsics = false;
+#endif
+#ifdef __AVX__
+  intrinsics.avx_intrinsics = true;
+#else
+  intrinsics.avx_intrinsics = false;
+#endif
+#ifdef __ARM_NEON__
+  intrinsics.neon_intrinsics = true;
+#else
+  intrinsics.neon_intrinsics = false;
+#endif
   return intrinsics;
 }
 
@@ -201,12 +213,18 @@ bool RegisterKnownJITSymbols() {
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF64);
+#ifdef __ARM_NEON__
   REGISTER_CPU_RUNTIME_SYMBOL(ExpV4F32NEON);
-  REGISTER_CPU_RUNTIME_SYMBOL(ExpV4F32SSE);
-  REGISTER_CPU_RUNTIME_SYMBOL(ExpV8F32AVX);
   REGISTER_CPU_RUNTIME_SYMBOL(LogV4F32NEON);
+#endif
+#ifdef __SSE4_1__
+  REGISTER_CPU_RUNTIME_SYMBOL(ExpV4F32SSE);
   REGISTER_CPU_RUNTIME_SYMBOL(LogV4F32SSE);
+#endif
+#ifdef __AVX__
+  REGISTER_CPU_RUNTIME_SYMBOL(ExpV8F32AVX);
   REGISTER_CPU_RUNTIME_SYMBOL(LogV8F32AVX);
+#endif
   REGISTER_CPU_RUNTIME_SYMBOL(ParallelForkJoin);
   REGISTER_CPU_RUNTIME_SYMBOL(ReleaseInfeedBufferAfterDequeue);
   REGISTER_CPU_RUNTIME_SYMBOL(ReleaseOutfeedBufferAfterPopulation);
@@ -275,7 +293,11 @@ bool RegisterKnownJITSymbols() {
   REGISTER_LIBM_SYMBOL(scalbln, double (*)(double, long));
   REGISTER_LIBM_SYMBOL(scalbn, double (*)(double, int));
   REGISTER_LIBM_SYMBOL(sin, double (*)(double));
+#ifdef __APPLE__
+  REGISTER_LIBM_SYMBOL(__sincos, void (*)(double, double*, double*));
+#else
   REGISTER_LIBM_SYMBOL(sincos, void (*)(double, double*, double*));
+#endif
   REGISTER_LIBM_SYMBOL(sinh, double (*)(double));
   REGISTER_LIBM_SYMBOL(sqrt, double (*)(double));
   REGISTER_LIBM_SYMBOL(tan, double (*)(double));
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index bc73839a88d8d3f231b4f3e924706b1a207562c6..0d54e325e618a7b1aae38407958ddf7b41ef1cda 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -86,6 +86,9 @@ class DfsHloVisitorBase {
   virtual Status HandleConvert(HloInstructionPtr hlo) {
     return HandleElementwiseUnary(hlo);
   }
+  virtual Status HandleBitcastConvert(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
+  }
   virtual Status HandleCopy(HloInstructionPtr hlo) {
     return HandleElementwiseUnary(hlo);
   }
@@ -208,6 +211,7 @@ class DfsHloVisitorBase {
   virtual Status HandleReduceWindow(HloInstructionPtr hlo) = 0;
   virtual Status HandleSelectAndScatter(HloInstructionPtr hlo) = 0;
   virtual Status HandleWhile(HloInstructionPtr hlo) = 0;
+  virtual Status HandleConditional(HloInstructionPtr hlo) = 0;
 
   virtual Status HandlePad(HloInstructionPtr hlo) = 0;
 
@@ -243,6 +247,10 @@ class DfsHloVisitorBase {
   // affecting correctness.
   void ReserveVisitStates(int num) { visit_state_.Reserve(num); }
 
+  // Useful when we want to visit the same computation more than once with the
+  // same visitor.
+  void ResetVisitStates() { visit_state_.Reset(); }
+
   void SetVisitState(int id, VisitState state) {
     visit_state_.SetState(id, state);
   }
@@ -322,6 +330,7 @@ class DfsHloVisitorBase {
       *w = (*w & ~mask) | (static_cast<uint64>(state) << shift);
       DCHECK_EQ(GetState(id), state);
     }
+    void Reset() { states_.clear(); }
 
    private:
     static const uint32 kStatesPerWord = sizeof(uint64) / 2 /*bits per entry*/;
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
index 5415bab5b358edb3f64467f457e5273d117429b8..133aa2509405738de8388708b0c61a82023e2738 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@@ -167,6 +167,9 @@ class DfsHloVisitorWithDefaultBase
   Status HandleWhile(HloInstructionPtr xla_while) override {
     return DefaultAction(xla_while);
   }
+  Status HandleConditional(HloInstructionPtr conditional) override {
+    return DefaultAction(conditional);
+  }
   Status HandleRecv(HloInstructionPtr recv) override {
     return DefaultAction(recv);
   }
diff --git a/tensorflow/compiler/xla/service/dot_decomposer.cc b/tensorflow/compiler/xla/service/dot_decomposer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..12faed69677cd99c6ed82c8d13dad3138d9461b7
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dot_decomposer.cc
@@ -0,0 +1,185 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dot_decomposer.h"
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+
+namespace {
+
+// TODO(b/69062148) Remove this code when all backends support BatchDot
+// natively.
+Status DecomposeBatchDot(HloInstruction* dot) {
+  auto computation = dot->parent();
+  const DotDimensionNumbers& dnums = dot->dot_dimension_numbers();
+  HloInstruction* lhs = dot->mutable_operand(0);
+  HloInstruction* rhs = dot->mutable_operand(1);
+  const Shape& lhs_shape = lhs->shape();
+  const Shape& rhs_shape = rhs->shape();
+  const Shape& dot_shape = dot->shape();
+
+  // ShapeInference should guarantee that lhs/rhs batch dimensions match.
+  CHECK_EQ(dnums.lhs_batch_dimensions_size(),
+           dnums.rhs_batch_dimensions_size());
+  const int64 num_batch_dims = dnums.lhs_batch_dimensions_size();
+  // Calculate total batch size (note that ShapeInference requires that
+  // the batch dimensions are most-major).
+  int64 batch_size = 1;
+  for (int i = 0; i < num_batch_dims; ++i) {
+    CHECK_EQ(lhs_shape.dimensions(dnums.lhs_batch_dimensions(i)),
+             rhs_shape.dimensions(dnums.rhs_batch_dimensions(i)));
+    batch_size *= lhs_shape.dimensions(dnums.lhs_batch_dimensions(i));
+  }
+
+  // Set lhs/rhs_transpose.
+  CHECK_EQ(1, dnums.lhs_contracting_dimensions_size());
+  const int64 lhs_contracting_dim_number = dnums.lhs_contracting_dimensions(0);
+  const bool lhs_transpose = (lhs_contracting_dim_number - num_batch_dims) == 0;
+
+  CHECK_EQ(1, dnums.rhs_contracting_dimensions_size());
+  const int64 rhs_contracting_dim_number = dnums.rhs_contracting_dimensions(0);
+  const bool rhs_transpose = (rhs_contracting_dim_number - num_batch_dims) == 1;
+
+  // Compute R3 and R3 shapes for lhs.
+  PrimitiveType lhs_type = lhs_shape.element_type();
+  const int64 lhs_rows = lhs_shape.dimensions(num_batch_dims + 0);
+  const int64 lhs_cols = lhs_shape.dimensions(num_batch_dims + 1);
+  Shape lhs_shape_r3 =
+      ShapeUtil::MakeShape(lhs_type, {batch_size, lhs_rows, lhs_cols});
+  Shape lhs_slice_shape_r3 =
+      ShapeUtil::MakeShape(lhs_type, {1, lhs_rows, lhs_cols});
+  Shape lhs_slice_shape_r2 =
+      ShapeUtil::MakeShape(lhs_type, {lhs_rows, lhs_cols});
+
+  // Compute R3 and R3 shapes for rhs.
+  PrimitiveType rhs_type = rhs_shape.element_type();
+  const int64 rhs_rows = rhs_shape.dimensions(num_batch_dims + 0);
+  const int64 rhs_cols = rhs_shape.dimensions(num_batch_dims + 1);
+  Shape rhs_shape_r3 =
+      ShapeUtil::MakeShape(rhs_type, {batch_size, rhs_rows, rhs_cols});
+  Shape rhs_slice_shape_r3 =
+      ShapeUtil::MakeShape(rhs_type, {1, rhs_rows, rhs_cols});
+  Shape rhs_slice_shape_r2 =
+      ShapeUtil::MakeShape(rhs_type, {rhs_rows, rhs_cols});
+
+  // Compute R3 and R3 shapes for dot output.
+  PrimitiveType dot_type = dot_shape.element_type();
+  const int64 dot_rows = dot_shape.dimensions(num_batch_dims + 0);
+  const int64 dot_cols = dot_shape.dimensions(num_batch_dims + 1);
+  Shape dot_shape_r2 = ShapeUtil::MakeShape(dot_type, {dot_rows, dot_cols});
+  Shape dot_shape_r3 = ShapeUtil::MakeShape(dot_type, {1, dot_rows, dot_cols});
+  Shape concat_shape_r3 =
+      ShapeUtil::MakeShape(dot_type, {batch_size, dot_rows, dot_cols});
+
+  // Reshape lhs/rhs into R3.
+  auto lhs_r3 = computation->AddInstruction(
+      HloInstruction::CreateReshape(lhs_shape_r3, lhs));
+  auto rhs_r3 = computation->AddInstruction(
+      HloInstruction::CreateReshape(rhs_shape_r3, rhs));
+
+  // Loop through batch size, slicing out required lhs/rhs to compute each Dot.
+  std::vector<HloInstruction*> output_slices(batch_size);
+  for (int64 i = 0; i < batch_size; ++i) {
+    // Slice R3 shape from 'lhs' and reshape to R2.
+    auto lhs_slice_r3 = computation->AddInstruction(
+        HloInstruction::CreateSlice(lhs_slice_shape_r3, lhs_r3, {i, 0, 0},
+                                    {i + 1, lhs_rows, lhs_cols}, {1, 1, 1}));
+    auto lhs_slice_r2 = computation->AddInstruction(
+        HloInstruction::CreateReshape(lhs_slice_shape_r2, lhs_slice_r3));
+
+    // Slice R3 shape from 'rhs' and reshape to R2.
+    auto rhs_slice_r3 = computation->AddInstruction(
+        HloInstruction::CreateSlice(rhs_slice_shape_r3, rhs_r3, {i, 0, 0},
+                                    {i + 1, rhs_rows, rhs_cols}, {1, 1, 1}));
+    auto rhs_slice_r2 = computation->AddInstruction(
+        HloInstruction::CreateReshape(rhs_slice_shape_r2, rhs_slice_r3));
+
+    // Transpose lhs/rhs (if needed).
+    if (lhs_transpose) {
+      Shape lhs_slice_shape_r2_transpose =
+          ShapeUtil::MakeShape(lhs_type, {lhs_cols, lhs_rows});
+      lhs_slice_r2 =
+          computation->AddInstruction(HloInstruction::CreateTranspose(
+              lhs_slice_shape_r2_transpose, lhs_slice_r2, {1, 0}));
+    }
+    if (rhs_transpose) {
+      Shape rhs_slice_shape_r2_transpose =
+          ShapeUtil::MakeShape(rhs_type, {rhs_cols, rhs_rows});
+      rhs_slice_r2 =
+          computation->AddInstruction(HloInstruction::CreateTranspose(
+              rhs_slice_shape_r2_transpose, rhs_slice_r2, {1, 0}));
+    }
+
+    // Compute Dot of lhs/rhs R2 slices.
+    DotDimensionNumbers dot_dnums;
+    dot_dnums.add_lhs_contracting_dimensions(1);
+    dot_dnums.add_rhs_contracting_dimensions(0);
+    auto dot_r2 = computation->AddInstruction(HloInstruction::CreateDot(
+        dot_shape_r2, lhs_slice_r2, rhs_slice_r2, dot_dnums));
+
+    // Reshape Dot to R3 so we can concat along batch dimension.
+    auto dot_r3 = computation->AddInstruction(
+        HloInstruction::CreateReshape(dot_shape_r3, dot_r2));
+
+    output_slices[i] = dot_r3;
+  }
+
+  // Concatenate slices from 'output_slices' along batch dimension.
+  auto concat = computation->AddInstruction(
+      HloInstruction::CreateConcatenate(concat_shape_r3, output_slices, 0));
+  // Reshape output 'new_dot' to original dimensions.
+  auto new_dot = computation->AddInstruction(
+      HloInstruction::CreateReshape(dot_shape, concat));
+
+  // Replace all uses of 'dot' in 'computation' with 'new_dot'.
+  return computation->ReplaceInstruction(dot, new_dot);
+}
+
+}  // namespace
+
+StatusOr<bool> DotDecomposer::Run(HloModule* module) {
+  XLA_VLOG_LINES(2, "DotDecomposer ENTRY\n" + module->ToString());
+  // Gather all batch Dot operations.
+  std::vector<HloInstruction*> batch_dots;
+  for (auto* computation : module->MakeNonfusionComputations()) {
+    for (auto* instruction : computation->instructions()) {
+      if (instruction->opcode() != HloOpcode::kDot) {
+        continue;
+      }
+      const DotDimensionNumbers& dnums = instruction->dot_dimension_numbers();
+      if (dnums.lhs_batch_dimensions_size() > 0 && decompose_batch_dot_) {
+        batch_dots.push_back(instruction);
+      }
+    }
+  }
+  // Decompose each batch Dot in 'batch_dots'.
+  bool changed = false;
+  for (auto* dot : batch_dots) {
+    TF_RETURN_IF_ERROR(DecomposeBatchDot(dot));
+    changed = true;
+  }
+  XLA_VLOG_LINES(2, "DotDecompose EXIT\n" + module->ToString());
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dot_decomposer.h b/tensorflow/compiler/xla/service/dot_decomposer.h
new file mode 100644
index 0000000000000000000000000000000000000000..5ff0ab34eac0cd0fbc264b408c57653c944402a6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dot_decomposer.h
@@ -0,0 +1,44 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_DOT_DECOMPOSER_H_
+#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_DOT_DECOMPOSER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// DotDecomposer is a pass which decomposes batch Dot operations into a
+// sequence of smaller (R2) Dot operations.
+class DotDecomposer : public HloPassInterface {
+ public:
+  // Decomposes batch Dot operations when 'decompose_batch_dot' is true.
+  DotDecomposer(bool decompose_batch_dot = true)
+      : decompose_batch_dot_(decompose_batch_dot) {}
+  ~DotDecomposer() = default;
+  tensorflow::StringPiece name() const override { return "dot_decomposer"; }
+
+  // Run DotDecomposer pass on computations in 'module'.
+  // Returns whether the 'module' was changed.
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  bool decompose_batch_dot_;
+};
+
+}  // namespace xla
+
+#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_DOT_DECOMPOSER_H_
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index 606868034ac54c6fe0062d20e7a185c0a9ccd841..37929294327d2a57bb0ab1c48e90b6843cba6ae4 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -50,11 +50,161 @@ using llvm_ir::IrName;
 using llvm_ir::SetToFirstInsertPoint;
 using tensorflow::strings::StrCat;
 
+namespace {
+
+llvm::Value* EmitReducePrecisionFloat(llvm::Value* x, int64 exponent_bits,
+                                      int64 mantissa_bits,
+                                      llvm::IRBuilder<>* ir_builder) {
+  // Integer and float types for casting and constant generation.
+  llvm::Type* float_type = x->getType();
+  llvm::IntegerType* int_type = ir_builder->getInt32Ty();
+
+  // Cast the input value to an integer for bitwise manipulation.
+  llvm::Value* x_as_int = ir_builder->CreateBitCast(x, int_type);
+
+  if (mantissa_bits < 23) {
+    // Last remaining mantissa bit.
+    const uint32_t last_mantissa_bit_mask = 1u << (23 - mantissa_bits);
+
+    // Compute rounding bias for round-to-nearest with ties to even.  This is
+    // equal to a base value of 0111... plus one bit if the last remaining
+    // mantissa bit is 1.
+    const uint32_t base_rounding_bias = (last_mantissa_bit_mask >> 1) - 1;
+    llvm::Value* x_last_mantissa_bit = ir_builder->CreateLShr(
+        ir_builder->CreateAnd(
+            x_as_int, llvm::ConstantInt::get(int_type, last_mantissa_bit_mask)),
+        (23 - mantissa_bits));
+    llvm::Value* x_rounding_bias = ir_builder->CreateAdd(
+        x_last_mantissa_bit,
+        llvm::ConstantInt::get(int_type, base_rounding_bias));
+
+    // Add rounding bias, and mask out truncated bits.  Note that the case
+    // where adding the rounding bias overflows into the exponent bits is
+    // correct; the non-masked mantissa bits will all be zero, and the
+    // exponent will be incremented by one.
+    const uint32_t truncation_mask = ~(last_mantissa_bit_mask - 1);
+    x_as_int = ir_builder->CreateAdd(x_as_int, x_rounding_bias);
+    x_as_int = ir_builder->CreateAnd(
+        x_as_int, llvm::ConstantInt::get(int_type, truncation_mask));
+  }
+
+  if (exponent_bits < 8) {
+    // Masks for f32 values.
+    const uint32_t f32_sign_bit_mask = 1u << 31;
+    const uint32_t f32_exp_bits_mask = 0xffu << 23;
+
+    // An exponent of 2^(n-1)-1 -- that is, 0111... with the zero in the most-
+    // significant bit -- is equal to 1.0f for all exponent sizes.  Adding
+    // 2^(n-1)-1 to this gives us the highest non-infinite exponent for a bit-
+    // size of n, and subtracting 2^(n-1)-1 from this gives us the lowest'
+    // exponent (corresponding to 0.0f).
+    //
+    // Thus, the f32 exponent corresponding to the highest non-infinite
+    // exponent for a bit size of n is (2^7-1) + 2^(n-1)-1, and the f32
+    // exponent corresponding to the lowest exponent for a bit size of n is
+    // (2^7-1) - 2^(n-1)-1.
+    //
+    // Note that we have already checked that exponents_bits >= 1.
+    const uint32_t f32_exponent_bias = (1 << 7) - 1;
+    const uint32_t reduced_exponent_bias = (1 << (exponent_bits - 1)) - 1;
+    const uint32_t reduced_max_exponent =
+        f32_exponent_bias + reduced_exponent_bias;
+    const uint32_t reduced_min_exponent =
+        f32_exponent_bias - reduced_exponent_bias;
+
+    // Do we overflow or underflow?
+    llvm::Value* x_exponent = ir_builder->CreateAnd(
+        x_as_int, llvm::ConstantInt::get(int_type, f32_exp_bits_mask));
+    llvm::Value* x_overflows = ir_builder->CreateICmpUGT(
+        x_exponent,
+        llvm::ConstantInt::get(int_type, reduced_max_exponent << 23));
+    llvm::Value* x_underflows = ir_builder->CreateICmpULE(
+        x_exponent,
+        llvm::ConstantInt::get(int_type, reduced_min_exponent << 23));
+
+    // Compute appropriately-signed values of zero and infinity.
+    llvm::Value* x_signed_zero = ir_builder->CreateAnd(
+        x_as_int, llvm::ConstantInt::get(int_type, f32_sign_bit_mask));
+    llvm::Value* x_signed_inf = ir_builder->CreateOr(
+        x_signed_zero, llvm::ConstantInt::get(int_type, f32_exp_bits_mask));
+
+    // Force to zero or infinity if overflow or underflow.  (Note that this
+    // truncates all denormal values to zero, rather than rounding them.)
+    x_as_int = ir_builder->CreateSelect(x_overflows, x_signed_inf, x_as_int);
+    x_as_int = ir_builder->CreateSelect(x_underflows, x_signed_zero, x_as_int);
+  }
+
+  // Cast the result back to a floating-point type.
+  llvm::Value* result = ir_builder->CreateBitCast(x_as_int, float_type);
+
+  // Correct result for NaN inputs.
+  //
+  // The exponent handling will "normalize" NaN values to infinities, which is
+  // undesirable (except in the case with no mantissa bits, in which case it
+  // is mandatory).  This logic also handles cases where mantissa-rounding
+  // causes a NaN's mantissa to overflow into the exponent bits, which would
+  // otherwise create an erroneous zero value.
+  //
+  // If the fast-math flags are set to assume no NaNs, the comparison is likely
+  // to be optimized away, so there's no point in even emitting it.
+  if (!ir_builder->getFastMathFlags().noNaNs()) {
+    llvm::Value* x_is_nan = ir_builder->CreateFCmpUNO(x, x);
+
+    if (mantissa_bits > 0) {
+      result = ir_builder->CreateSelect(x_is_nan, x, result);
+    } else {
+      result = ir_builder->CreateSelect(
+          x_is_nan, llvm::ConstantFP::getInfinity(float_type), result);
+    }
+  }
+  return result;
+}
+
+llvm::Value* EmitF32ToBF16(llvm::Value* f32_value,
+                           llvm::IRBuilder<>* ir_builder) {
+  auto reduced_precision = EmitReducePrecisionFloat(
+      f32_value,
+      /*exponent_bits=*/primitive_util::kBFloat16ExponentBits,
+      /*mantissa_bits=*/primitive_util::kBFloat16MantissaBits, ir_builder);
+  auto as_int32 =
+      ir_builder->CreateBitCast(reduced_precision, ir_builder->getInt32Ty());
+  auto shifted = ir_builder->CreateLShr(as_int32, 16);
+  auto truncated = ir_builder->CreateTrunc(shifted, ir_builder->getInt16Ty());
+  return ir_builder->CreateBitCast(truncated, ir_builder->getInt16Ty());
+}
+
+llvm::Value* EmitBF16ToF32(llvm::Value* bf16_value,
+                           llvm::IRBuilder<>* ir_builder) {
+  auto as_int16 =
+      ir_builder->CreateBitCast(bf16_value, ir_builder->getInt16Ty());
+  auto as_int32 = ir_builder->CreateZExt(as_int16, ir_builder->getInt32Ty());
+  auto shifted = ir_builder->CreateShl(as_int32, 16);
+  return ir_builder->CreateBitCast(shifted, ir_builder->getFloatTy());
+}
+
+llvm::Value* EmitIntegralToFloating(llvm::Value* integer_value,
+                                    PrimitiveType from_type,
+                                    PrimitiveType to_type, llvm::Module* module,
+                                    llvm::IRBuilder<>* ir_builder) {
+  if (primitive_util::IsSignedIntegralType(from_type)) {
+    return ir_builder->CreateSIToFP(
+        integer_value, llvm_ir::PrimitiveTypeToIrType(to_type, module));
+  } else {
+    CHECK(primitive_util::IsUnsignedIntegralType(from_type) ||
+          from_type == PRED);
+    return ir_builder->CreateUIToFP(
+        integer_value, llvm_ir::PrimitiveTypeToIrType(to_type, module));
+  }
+}
+
+}  // namespace
+
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitUnaryOp(
     const HloInstruction* op, llvm::Value* operand_value) const {
   if (op->opcode() == HloOpcode::kCopy) {
     return operand_value;
-  } else if (operand_value->getType()->isIntegerTy()) {
+  } else if (ShapeUtil::ElementIsIntegral(op->operand(0)->shape()) ||
+             op->operand(0)->shape().element_type() == PRED) {
     return EmitIntegerUnaryOp(op, operand_value);
   } else if (ShapeUtil::ElementIsComplex(op->operand(0)->shape())) {
     return EmitComplexUnaryOp(op, operand_value);
@@ -79,15 +229,14 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerUnaryOp(
             primitive_util::IsSignedIntegralType(to_type));
       }
       if (primitive_util::IsFloatingPointType(to_type)) {
-        if (primitive_util::IsSignedIntegralType(from_type)) {
-          return ir_builder_->CreateSIToFP(
-              operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_));
-        }
-        if (primitive_util::IsUnsignedIntegralType(from_type) ||
-            from_type == PRED) {
-          return ir_builder_->CreateUIToFP(
-              operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_));
+        if (to_type == BF16) {
+          return EmitF32ToBF16(
+              EmitIntegralToFloating(operand_value, from_type, F32, module_,
+                                     ir_builder_),
+              ir_builder_);
         }
+        return EmitIntegralToFloating(operand_value, from_type, to_type,
+                                      module_, ir_builder_);
       }
       if (primitive_util::IsComplexType(to_type)) {
         auto to_ir_component_type = llvm_ir::PrimitiveTypeToIrType(
@@ -110,6 +259,26 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerUnaryOp(
                            PrimitiveType_Name(from_type).c_str(),
                            PrimitiveType_Name(to_type).c_str());
     }
+    case HloOpcode::kBitcastConvert: {
+      PrimitiveType from_type = op->operand(0)->shape().element_type();
+      PrimitiveType to_type = op->shape().element_type();
+      CHECK(primitive_util::IsIntegralType(from_type));
+      if (from_type == to_type) {
+        return operand_value;
+      }
+      if (primitive_util::BitWidth(from_type) ==
+          primitive_util::BitWidth(to_type)) {
+        return ir_builder_->CreateBitCast(
+            operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_));
+      }
+      return InvalidArgument(
+          "bitcast conversion from primitive type %s to %s with unequal "
+          "bit-widths (%u versus %u) ",
+          PrimitiveType_Name(from_type).c_str(),
+          PrimitiveType_Name(to_type).c_str(),
+          primitive_util::BitWidth(from_type),
+          primitive_util::BitWidth(to_type));
+    }
     case HloOpcode::kAbs: {
       bool is_signed =
           primitive_util::IsSignedIntegralType(op->shape().element_type());
@@ -187,6 +356,17 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
                 llvm_ir::PrimitiveTypeToIrType(to_component_type, module_)),
             nullptr);
       }
+      if (from_type == BF16) {
+        TF_RET_CHECK(to_type != BF16);
+        operand_value = EmitBF16ToF32(operand_value, ir_builder_);
+        from_type = F32;
+        if (from_type == to_type) {
+          return operand_value;
+        }
+      }
+      if (from_type == F32 && to_type == BF16) {
+        return EmitF32ToBF16(operand_value, ir_builder_);
+      }
       if (primitive_util::IsFloatingPointType(to_type)) {
         return ir_builder_->CreateFPCast(
             operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_));
@@ -203,22 +383,34 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
                            PrimitiveType_Name(from_type).c_str(),
                            PrimitiveType_Name(to_type).c_str());
     }
+    case HloOpcode::kBitcastConvert: {
+      PrimitiveType from_type = op->operand(0)->shape().element_type();
+      PrimitiveType to_type = op->shape().element_type();
+      CHECK(primitive_util::IsFloatingPointType(from_type));
+      if (from_type == to_type) {
+        return operand_value;
+      }
+      if (primitive_util::BitWidth(from_type) ==
+          primitive_util::BitWidth(to_type)) {
+        return ir_builder_->CreateBitCast(
+            operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_));
+      }
+      return InvalidArgument(
+          "bitcast conversion from primitive type %s to %s with unequal "
+          "bit-widths (%u versus %u) ",
+          PrimitiveType_Name(from_type).c_str(),
+          PrimitiveType_Name(to_type).c_str(),
+          primitive_util::BitWidth(from_type),
+          primitive_util::BitWidth(to_type));
+    }
     case HloOpcode::kExp:
-      return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::exp, {operand_value},
-                                          {operand_value->getType()},
-                                          ir_builder_);
+      return EmitExp(op->shape().element_type(), operand_value);
     case HloOpcode::kLog:
-      return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::log, {operand_value},
-                                          {operand_value->getType()},
-                                          ir_builder_);
+      return EmitLog(op->shape().element_type(), operand_value);
     case HloOpcode::kCos:
-      return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::cos, {operand_value},
-                                          {operand_value->getType()},
-                                          ir_builder_);
+      return EmitCos(op->shape().element_type(), operand_value);
     case HloOpcode::kSin:
-      return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sin, {operand_value},
-                                          {operand_value->getType()},
-                                          ir_builder_);
+      return EmitSin(op->shape().element_type(), operand_value);
     case HloOpcode::kFloor:
       return llvm_ir::EmitCallToIntrinsic(
           llvm::Intrinsic::floor, {operand_value}, {operand_value->getType()},
@@ -269,9 +461,25 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
 
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
     const HloInstruction* op, llvm::Value* operand_value) const {
+  PrimitiveType input_type = op->operand(0)->shape().element_type();
+  PrimitiveType component_type =
+      primitive_util::IsComplexType(input_type)
+          ? primitive_util::ComplexComponentType(input_type)
+          : input_type;
   switch (op->opcode()) {
-    // TODO(b/65209142): Angle/Log require atan2.
-    // case HloOpcode::kLog:  // log(a+bi) = .5*log(a^2+b^2) + i*atan2(b, a)
+    case HloOpcode::kLog: {
+      // log(a+bi) = .5*log(a^2+b^2) + i*atan2(b, a)
+      auto a = EmitExtractReal(operand_value);
+      auto b = EmitExtractImag(operand_value);
+      llvm::Type* llvm_ty = a->getType();
+      auto sum_sq = ir_builder_->CreateFAdd(ir_builder_->CreateFMul(a, a),
+                                            ir_builder_->CreateFMul(b, b));
+      TF_ASSIGN_OR_RETURN(auto log_sum_sq, EmitLog(component_type, sum_sq));
+      TF_ASSIGN_OR_RETURN(auto angle, EmitAtan2(component_type, b, a));
+      auto one_half = llvm::ConstantFP::get(llvm_ty, 0.5);
+      return EmitComposeComplex(
+          op, ir_builder_->CreateFMul(one_half, log_sum_sq), angle);
+    }
     case HloOpcode::kConvert: {
       PrimitiveType from_type = op->operand(0)->shape().element_type();
       TF_RET_CHECK(primitive_util::IsComplexType(from_type));
@@ -293,15 +501,12 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
     }
     case HloOpcode::kExp: {
       // e^(a+bi) = e^a*(cos(b)+sin(b)i)
-      auto exp_a = llvm_ir::EmitCallToIntrinsic(
-          llvm::Intrinsic::exp, {EmitExtractReal(operand_value)},
-          {EmitExtractReal(operand_value)->getType()}, ir_builder_);
-      auto cos_b = llvm_ir::EmitCallToIntrinsic(
-          llvm::Intrinsic::cos, {EmitExtractImag(operand_value)},
-          {EmitExtractImag(operand_value)->getType()}, ir_builder_);
-      auto sin_b = llvm_ir::EmitCallToIntrinsic(
-          llvm::Intrinsic::sin, {EmitExtractImag(operand_value)},
-          {EmitExtractImag(operand_value)->getType()}, ir_builder_);
+      TF_ASSIGN_OR_RETURN(
+          auto exp_a, EmitExp(component_type, EmitExtractReal(operand_value)));
+      TF_ASSIGN_OR_RETURN(
+          auto cos_b, EmitCos(component_type, EmitExtractImag(operand_value)));
+      TF_ASSIGN_OR_RETURN(
+          auto sin_b, EmitSin(component_type, EmitExtractImag(operand_value)));
       return EmitComposeComplex(op, ir_builder_->CreateFMul(exp_a, cos_b),
                                 ir_builder_->CreateFMul(exp_a, sin_b));
     }
@@ -316,16 +521,13 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
       auto a = EmitExtractReal(operand_value);
       auto b = EmitExtractImag(operand_value);
       auto type = a->getType();
-      auto exp_b = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::exp, {b},
-                                                {type}, ir_builder_);
+      TF_ASSIGN_OR_RETURN(auto exp_b, EmitExp(component_type, b));
       auto half_exp_b =
           ir_builder_->CreateFMul(llvm::ConstantFP::get(type, 0.5), exp_b);
       auto half_exp_neg_b =
           ir_builder_->CreateFDiv(llvm::ConstantFP::get(type, 0.5), exp_b);
-      auto cos_a = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::cos, {a},
-                                                {type}, ir_builder_);
-      auto sin_a = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sin, {a},
-                                                {type}, ir_builder_);
+      TF_ASSIGN_OR_RETURN(auto cos_a, EmitCos(component_type, a));
+      TF_ASSIGN_OR_RETURN(auto sin_a, EmitSin(component_type, a));
       return EmitComposeComplex(
           op,
           ir_builder_->CreateFMul(
@@ -346,16 +548,13 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
       auto a = EmitExtractReal(operand_value);
       auto b = EmitExtractImag(operand_value);
       auto type = a->getType();
-      auto exp_b = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::exp, {b},
-                                                {type}, ir_builder_);
+      TF_ASSIGN_OR_RETURN(auto exp_b, EmitExp(component_type, b));
       auto half_exp_b =
           ir_builder_->CreateFMul(llvm::ConstantFP::get(type, 0.5), exp_b);
       auto half_exp_neg_b =
           ir_builder_->CreateFDiv(llvm::ConstantFP::get(type, 0.5), exp_b);
-      auto cos_a = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::cos, {a},
-                                                {type}, ir_builder_);
-      auto sin_a = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sin, {a},
-                                                {type}, ir_builder_);
+      TF_ASSIGN_OR_RETURN(auto cos_a, EmitCos(component_type, a));
+      TF_ASSIGN_OR_RETURN(auto sin_a, EmitSin(component_type, a));
       return EmitComposeComplex(
           op,
           ir_builder_->CreateFMul(
@@ -363,6 +562,58 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
           ir_builder_->CreateFMul(
               cos_a, ir_builder_->CreateFSub(half_exp_b, half_exp_neg_b)));
     }
+    case HloOpcode::kTanh: {
+      /*
+      tanh=(exp(x)-exp(-x)) / (exp(x)+exp(-x))
+      e^(a+bi) = e^a*(cos(b)+sin(b)i)
+      so tanh=(((cos(b)+sin(b)i)e^a - (cos(-b)+sin(-b)i)e^-a)) /
+              (((cos(b)+sin(b)i)e^a + (cos(-b)+sin(-b)i)e^-a))
+      cos(b)=cos(-b), sin(-b)=-sin(b)
+      so tanh=(((cos(b)+sin(b)i)e^a - (cos(b)-sin(b)i)e^-a)) /
+              (((cos(b)+sin(b)i)e^a + (cos(b)-sin(b)i)e^-a))
+             =(cos(b)e^a+i*sin(b)e^a + cos(b)(-e^-a)+i*sin(b)e^-a) /
+              (cos(b)e^a+i*sin(b)e^a + cos(b)e^-a+i*sin(b)(-e^-a))
+             =(cos(b)(e^a-e^-a) + i*sin(b)(e^a+e^-a)) /
+              (cos(b)(e^a+e^-a) + i*sin(b)(e^a-e^-a))
+      This is a complex division, so we can multiply by denom_conj/denom_conj
+             =(cos(b)(e^a-e^-a) + i*sin(b)(e^a+e^-a)) *
+              (cos(b)(e^a+e^-a) - i*sin(b)(e^a-e^-a)) /
+              ((cos(b)(e^a+e^-a))^2 + (sin(b)(e^a-e^-a))^2)
+             =(cos(b)^2(e^(2a)-e^(-2a)) + sin(b)^2(e^(2a)-e^(-2a)) +
+               i*(cos(b)sin(b)(e^a+e^-a)^2 - cos(b)sin(b)(e^a-e^-a)^2)) /
+              ((cos(b)(e^a+e^-a))^2 + (sin(b)(e^a-e^-a))^2)
+      */
+      auto a = EmitExtractReal(operand_value);
+      auto b = EmitExtractImag(operand_value);
+      TF_ASSIGN_OR_RETURN(auto exp_a, EmitExp(component_type, a));
+      TF_ASSIGN_OR_RETURN(auto cos_b, EmitCos(component_type, b));
+      TF_ASSIGN_OR_RETURN(auto sin_b, EmitSin(component_type, b));
+      auto exp_neg_a = ir_builder_->CreateFDiv(
+          llvm::ConstantFP::get(exp_a->getType(), 1), exp_a);
+      auto exp_2a_minus_exp_neg_2a = ir_builder_->CreateFSub(
+          ir_builder_->CreateFMul(exp_a, exp_a),
+          ir_builder_->CreateFMul(exp_neg_a, exp_neg_a));
+      auto cos_b_sq = ir_builder_->CreateFMul(cos_b, cos_b);
+      auto sin_b_sq = ir_builder_->CreateFMul(sin_b, sin_b);
+      auto real_num = ir_builder_->CreateFAdd(
+          ir_builder_->CreateFMul(cos_b_sq, exp_2a_minus_exp_neg_2a),
+          ir_builder_->CreateFMul(sin_b_sq, exp_2a_minus_exp_neg_2a));
+      auto cos_b_sin_b = ir_builder_->CreateFMul(cos_b, sin_b);
+      auto exp_a_plus_exp_neg_a = ir_builder_->CreateFAdd(exp_a, exp_neg_a);
+      auto exp_a_plus_exp_neg_a_sq =
+          ir_builder_->CreateFMul(exp_a_plus_exp_neg_a, exp_a_plus_exp_neg_a);
+      auto exp_a_minus_exp_neg_a = ir_builder_->CreateFSub(exp_a, exp_neg_a);
+      auto exp_a_minus_exp_neg_a_sq =
+          ir_builder_->CreateFMul(exp_a_minus_exp_neg_a, exp_a_minus_exp_neg_a);
+      auto imag_num = ir_builder_->CreateFMul(
+          cos_b_sin_b, ir_builder_->CreateFSub(exp_a_plus_exp_neg_a_sq,
+                                               exp_a_minus_exp_neg_a_sq));
+      auto denom = ir_builder_->CreateFAdd(
+          ir_builder_->CreateFMul(cos_b_sq, exp_a_plus_exp_neg_a_sq),
+          ir_builder_->CreateFMul(sin_b_sq, exp_a_minus_exp_neg_a_sq));
+      return EmitComposeComplex(op, ir_builder_->CreateFDiv(real_num, denom),
+                                ir_builder_->CreateFDiv(imag_num, denom));
+    }
     case HloOpcode::kAbs: {
       auto sum_sq = ir_builder_->CreateFAdd(
           ir_builder_->CreateFMul(EmitExtractReal(operand_value),
@@ -409,7 +660,8 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitBinaryOp(
     const HloInstruction* op, llvm::Value* lhs_value,
     llvm::Value* rhs_value) const {
   PrimitiveType operand_type = op->operand(0)->shape().element_type();
-  if (lhs_value->getType()->isIntegerTy()) {
+  if (ShapeUtil::ElementIsIntegral(op->operand(0)->shape()) ||
+      operand_type == PRED) {
     return EmitIntegerBinaryOp(
         op, lhs_value, rhs_value,
         primitive_util::IsSignedIntegralType(operand_type));
@@ -424,7 +676,6 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatBinaryOp(
     const HloInstruction* op, llvm::Value* lhs_value,
     llvm::Value* rhs_value) const {
   switch (op->opcode()) {
-    // case HloOpcode::kAtan2:  // TODO(b/65209142): CPU atan2 support
     case HloOpcode::kComplex:
       return EmitComposeComplex(op, lhs_value, rhs_value);
     case HloOpcode::kAdd:
@@ -468,10 +719,9 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatBinaryOp(
     case HloOpcode::kMinimum:
       return EmitFloatMin(lhs_value, rhs_value);
     case HloOpcode::kPower:
-      return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::pow,
-                                          {lhs_value, rhs_value},
-                                          {lhs_value->getType()}, ir_builder_);
-
+      return EmitPow(op->shape().element_type(), lhs_value, rhs_value);
+    case HloOpcode::kAtan2:
+      return EmitAtan2(op->shape().element_type(), lhs_value, rhs_value);
     default:
       return Unimplemented("binary floating point op '%s'",
                            HloOpcodeString(op->opcode()).c_str());
@@ -567,9 +817,40 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexBinaryOp(
                                   EmitExtractImag(lhs_value),
                                   EmitExtractImag(rhs_value), ir_builder_));
 
-    // TODO(b/65209142): requires arg(z) -> requires atan|atan2 intrinsic
-    // case HloOpcode::kPower:
-    // // (a+bi)^(c+di) = exp(i(c+di)*arg(a+bi)) * (a*a+b*b)^(c/2+di/2)
+    case HloOpcode::kPower: {
+      // (a+bi)^(c+di) =
+      //    (a*a+b*b)^(0.5c) * exp(-d*atan2(b,a)) * (cos(q) + i*sin(q)),
+      //    where q = c*atan2(b,a)+0.5d*ln(a*a+b*b)
+      PrimitiveType component_type =
+          primitive_util::ComplexComponentType(op->shape().element_type());
+      auto a = EmitExtractReal(lhs_value);
+      auto b = EmitExtractImag(lhs_value);
+      auto c = EmitExtractReal(rhs_value);
+      auto d = EmitExtractImag(rhs_value);
+      auto aa_p_bb = ir_builder_->CreateFAdd(ir_builder_->CreateFMul(a, a),
+                                             ir_builder_->CreateFMul(b, b));
+      auto one_half = llvm::ConstantFP::get(a->getType(), 0.5);
+      auto half_c = ir_builder_->CreateFMul(one_half, c);
+
+      TF_ASSIGN_OR_RETURN(auto aa_p_bb_to_half_c,
+                          EmitPow(component_type, aa_p_bb, half_c));
+      auto neg_d = ir_builder_->CreateFNeg(d);
+      TF_ASSIGN_OR_RETURN(auto arg_lhs, EmitAtan2(component_type, b, a));
+      auto neg_d_arg_lhs = ir_builder_->CreateFMul(neg_d, arg_lhs);
+      TF_ASSIGN_OR_RETURN(auto e_to_neg_d_arg_lhs,
+                          EmitExp(component_type, neg_d_arg_lhs));
+      auto coeff =
+          ir_builder_->CreateFMul(aa_p_bb_to_half_c, e_to_neg_d_arg_lhs);
+      TF_ASSIGN_OR_RETURN(auto ln_aa_p_bb, EmitLog(component_type, aa_p_bb));
+      auto half_d = ir_builder_->CreateFMul(one_half, d);
+      auto q =
+          ir_builder_->CreateFAdd(ir_builder_->CreateFMul(c, arg_lhs),
+                                  ir_builder_->CreateFMul(half_d, ln_aa_p_bb));
+      TF_ASSIGN_OR_RETURN(auto cos_q, EmitCos(component_type, q));
+      TF_ASSIGN_OR_RETURN(auto sin_q, EmitSin(component_type, q));
+      return EmitComposeComplex(op, ir_builder_->CreateFMul(coeff, cos_q),
+                                ir_builder_->CreateFMul(coeff, sin_q));
+    }
     default:
       return Unimplemented("binary complex op '%s'",
                            HloOpcodeString(op->opcode()).c_str());
@@ -672,116 +953,51 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitErfcInv(
   return EmitErfInv(prim_type, ir_builder_->CreateFSub(one, value));
 }
 
-StatusOr<llvm::Value*> ElementalIrEmitter::EmitReducePrecision(
-    const HloInstruction* hlo, llvm::Value* x) const {
-  if (hlo->operand(0)->shape().element_type() != F32) {
-    return Unimplemented("reduce-precision only implemented for F32");
-  }
-
-  // Integer and float types for casting and constant generation.
-  llvm::Type* float_type = x->getType();
-  llvm::IntegerType* int_type = ir_builder_->getInt32Ty();
-
-  // Cast the input value to an integer for bitwise manipulation.
-  llvm::Value* x_as_int = ir_builder_->CreateBitCast(x, int_type);
-
-  if (hlo->mantissa_bits() < 23) {
-    // Last remaining mantissa bit.
-    const uint32_t last_mantissa_bit_mask = 1u << (23 - hlo->mantissa_bits());
-
-    // Compute rounding bias for round-to-nearest with ties to even.  This is
-    // equal to a base value of 0111... plus one bit if the last remaining
-    // mantissa bit is 1.
-    const uint32_t base_rounding_bias = (last_mantissa_bit_mask >> 1) - 1;
-    llvm::Value* x_last_mantissa_bit = ir_builder_->CreateLShr(
-        ir_builder_->CreateAnd(
-            x_as_int, llvm::ConstantInt::get(int_type, last_mantissa_bit_mask)),
-        (23 - hlo->mantissa_bits()));
-    llvm::Value* x_rounding_bias = ir_builder_->CreateAdd(
-        x_last_mantissa_bit,
-        llvm::ConstantInt::get(int_type, base_rounding_bias));
-
-    // Add rounding bias, and mask out truncated bits.  Note that the case
-    // where adding the rounding bias overflows into the exponent bits is
-    // correct; the non-masked mantissa bits will all be zero, and the
-    // exponent will be incremented by one.
-    const uint32_t truncation_mask = ~(last_mantissa_bit_mask - 1);
-    x_as_int = ir_builder_->CreateAdd(x_as_int, x_rounding_bias);
-    x_as_int = ir_builder_->CreateAnd(
-        x_as_int, llvm::ConstantInt::get(int_type, truncation_mask));
-  }
-
-  if (hlo->exponent_bits() < 8) {
-    // Masks for f32 values.
-    const uint32_t f32_sign_bit_mask = 1u << 31;
-    const uint32_t f32_exp_bits_mask = 0xffu << 23;
-
-    // An exponent of 2^(n-1)-1 -- that is, 0111... with the zero in the most-
-    // significant bit -- is equal to 1.0f for all exponent sizes.  Adding
-    // 2^(n-1)-1 to this gives us the highest non-infinite exponent for a bit-
-    // size of n, and subtracting 2^(n-1)-1 from this gives us the lowest'
-    // exponent (corresponding to 0.0f).
-    //
-    // Thus, the f32 exponent corresponding to the highest non-infinite
-    // exponent for a bit size of n is (2^7-1) + 2^(n-1)-1, and the f32
-    // exponent corresponding to the lowest exponent for a bit size of n is
-    // (2^7-1) - 2^(n-1)-1.
-    //
-    // Note that we have already checked that exponents_bits >= 1.
-    const uint32_t f32_exponent_bias = (1 << 7) - 1;
-    const uint32_t reduced_exponent_bias =
-        (1 << (hlo->exponent_bits() - 1)) - 1;
-    const uint32_t reduced_max_exponent =
-        f32_exponent_bias + reduced_exponent_bias;
-    const uint32_t reduced_min_exponent =
-        f32_exponent_bias - reduced_exponent_bias;
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitLog(PrimitiveType prim_type,
+                                                   llvm::Value* value) const {
+  return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::log, {value},
+                                      {value->getType()}, ir_builder_);
+}
 
-    // Do we overflow or underflow?
-    llvm::Value* x_exponent = ir_builder_->CreateAnd(
-        x_as_int, llvm::ConstantInt::get(int_type, f32_exp_bits_mask));
-    llvm::Value* x_overflows = ir_builder_->CreateICmpUGT(
-        x_exponent,
-        llvm::ConstantInt::get(int_type, reduced_max_exponent << 23));
-    llvm::Value* x_underflows = ir_builder_->CreateICmpULE(
-        x_exponent,
-        llvm::ConstantInt::get(int_type, reduced_min_exponent << 23));
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitSin(PrimitiveType prim_type,
+                                                   llvm::Value* value) const {
+  return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sin, {value},
+                                      {value->getType()}, ir_builder_);
+}
 
-    // Compute appropriately-signed values of zero and infinity.
-    llvm::Value* x_signed_zero = ir_builder_->CreateAnd(
-        x_as_int, llvm::ConstantInt::get(int_type, f32_sign_bit_mask));
-    llvm::Value* x_signed_inf = ir_builder_->CreateOr(
-        x_signed_zero, llvm::ConstantInt::get(int_type, f32_exp_bits_mask));
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitCos(PrimitiveType prim_type,
+                                                   llvm::Value* value) const {
+  return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::cos, {value},
+                                      {value->getType()}, ir_builder_);
+}
 
-    // Force to zero or infinity if overflow or underflow.  (Note that this
-    // truncates all denormal values to zero, rather than rounding them.)
-    x_as_int = ir_builder_->CreateSelect(x_overflows, x_signed_inf, x_as_int);
-    x_as_int = ir_builder_->CreateSelect(x_underflows, x_signed_zero, x_as_int);
-  }
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitExp(PrimitiveType prim_type,
+                                                   llvm::Value* value) const {
+  return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::exp, {value},
+                                      {value->getType()}, ir_builder_);
+}
 
-  // Cast the result back to a floating-point type.
-  llvm::Value* result = ir_builder_->CreateBitCast(x_as_int, float_type);
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitPow(PrimitiveType prim_type,
+                                                   llvm::Value* lhs,
+                                                   llvm::Value* rhs) const {
+  return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::pow, {lhs, rhs},
+                                      {lhs->getType()}, ir_builder_);
+}
 
-  // Correct result for NaN inputs.
-  //
-  // The exponent handling will "normalize" NaN values to infinities, which is
-  // undesirable (except in the case with no mantissa bits, in which case it
-  // is mandatory).  This logic also handles cases where mantissa-rounding
-  // causes a NaN's mantissa to overflow into the exponent bits, which would
-  // otherwise create an erroneous zero value.
-  //
-  // If the fast-math flags are set to assume no NaNs, the comparison is likely
-  // to be optimized away, so there's no point in even emitting it.
-  if (!ir_builder_->getFastMathFlags().noNaNs()) {
-    llvm::Value* x_is_nan = ir_builder_->CreateFCmpUNO(x, x);
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitAtan2(PrimitiveType prim_type,
+                                                     llvm::Value* lhs,
+                                                     llvm::Value* rhs) const {
+  return Unimplemented("atan2");
+}
 
-    if (hlo->mantissa_bits() > 0) {
-      result = ir_builder_->CreateSelect(x_is_nan, x, result);
-    } else {
-      result = ir_builder_->CreateSelect(
-          x_is_nan, llvm::ConstantFP::getInfinity(float_type), result);
-    }
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitReducePrecision(
+    const HloInstruction* hlo, llvm::Value* x) const {
+  if (hlo->operand(0)->shape().element_type() != F32) {
+    return Unimplemented("reduce-precision only implemented for F32");
   }
-  return result;
+  return EmitReducePrecisionFloat(x, /*exponent_bits=*/hlo->exponent_bits(),
+                                  /*mantissa_bits=*/hlo->mantissa_bits(),
+                                  ir_builder_);
 }
 
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerBinaryOp(
@@ -865,7 +1081,7 @@ llvm_ir::IrArray::Index ElementalIrEmitter::ElementwiseSourceIndex(
 
   // If no implicit broadcast is needed for this operand, returns the target
   // index as the source index.
-  if (ShapeUtil::Compatible(operand_shape, hlo.shape())) {
+  if (ShapeUtil::CompatibleIgnoringElementType(operand_shape, hlo.shape())) {
     return target_index;
   }
 
@@ -1073,6 +1289,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
     case HloOpcode::kRoundNearestAfz:
     case HloOpcode::kCeil:
     case HloOpcode::kConvert:
+    case HloOpcode::kBitcastConvert:
     case HloOpcode::kCopy:
     case HloOpcode::kCos:
     case HloOpcode::kExp:
@@ -1081,11 +1298,11 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
     case HloOpcode::kIsFinite:
     case HloOpcode::kLog:
     case HloOpcode::kNegate:
+    case HloOpcode::kNot:
     case HloOpcode::kReal:
     case HloOpcode::kSign:
     case HloOpcode::kSin:
     case HloOpcode::kTanh:
-    case HloOpcode::kNot:
       return [this, hlo, &operand_to_generator](
                  const IrArray::Index& index) -> StatusOr<llvm::Value*> {
         TF_ASSIGN_OR_RETURN(llvm::Value * operand_value,
@@ -1094,6 +1311,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
         return EmitUnaryOp(hlo, operand_value);
       };
     case HloOpcode::kAdd:
+    case HloOpcode::kAnd:
     case HloOpcode::kAtan2:
     case HloOpcode::kComplex:
     case HloOpcode::kDivide:
@@ -1106,14 +1324,13 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
     case HloOpcode::kMinimum:
     case HloOpcode::kMultiply:
     case HloOpcode::kNe:
+    case HloOpcode::kOr:
     case HloOpcode::kPower:
     case HloOpcode::kRemainder:
-    case HloOpcode::kSubtract:
-    case HloOpcode::kAnd:
-    case HloOpcode::kOr:
     case HloOpcode::kShiftLeft:
     case HloOpcode::kShiftRightArithmetic:
     case HloOpcode::kShiftRightLogical:
+    case HloOpcode::kSubtract:
       return [this, hlo, &operand_to_generator](
                  const IrArray::Index& index) -> StatusOr<llvm::Value*> {
         const HloInstruction* lhs = hlo->operand(0);
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
index cccb498f82936283a215370787907b293827ff2d..1a48eb5fcb960b60d524ea56a43e15269576db76 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
@@ -39,7 +39,7 @@ class ElementalIrEmitter {
         module_(module),
         hlo_module_config_(hlo_module_config) {}
 
-  virtual ~ElementalIrEmitter() {}
+  virtual ~ElementalIrEmitter() = default;
 
   virtual StatusOr<llvm::Value*> EmitUnaryOp(const HloInstruction* op,
                                              llvm::Value* operand_value) const;
@@ -92,6 +92,26 @@ class ElementalIrEmitter {
   virtual StatusOr<llvm::Value*> EmitErfcInv(PrimitiveType prim_type,
                                              llvm::Value* value) const;
 
+  virtual StatusOr<llvm::Value*> EmitAtan2(PrimitiveType prim_type,
+                                           llvm::Value* lhs,
+                                           llvm::Value* rhs) const;
+
+  virtual StatusOr<llvm::Value*> EmitLog(PrimitiveType prim_type,
+                                         llvm::Value* value) const;
+
+  virtual StatusOr<llvm::Value*> EmitSin(PrimitiveType prim_type,
+                                         llvm::Value* value) const;
+
+  virtual StatusOr<llvm::Value*> EmitCos(PrimitiveType prim_type,
+                                         llvm::Value* value) const;
+
+  virtual StatusOr<llvm::Value*> EmitExp(PrimitiveType prim_type,
+                                         llvm::Value* value) const;
+
+  virtual StatusOr<llvm::Value*> EmitPow(PrimitiveType prim_type,
+                                         llvm::Value* lhs,
+                                         llvm::Value* rhs) const;
+
   virtual StatusOr<llvm::Value*> EmitReducePrecision(const HloInstruction* hlo,
                                                      llvm::Value* x) const;
 
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index 9c96d9eb30b5f9e51b7f5d82391c6b9f366898d6..c50aaec5725021eeaa2fe0c3247f7539327268ae 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -26,23 +26,23 @@ limitations under the License.
 
 namespace xla {
 
-StatusOr<std::vector<perftools::gputools::DeviceMemoryBase>>
+StatusOr<std::vector<std::unique_ptr<ShapedBuffer>>>
 Executable::ExecuteOnStreams(
     tensorflow::gtl::ArraySlice<const ServiceExecutableRunOptions> run_options,
     tensorflow::gtl::ArraySlice<
-        tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>>
+        tensorflow::gtl::ArraySlice<const ShapedBuffer*>>
         arguments) {
   TF_RET_CHECK(run_options.size() == arguments.size());
 
+  std::vector<std::unique_ptr<ShapedBuffer>> return_values(run_options.size());
+
   if (run_options.size() == 1) {
-    TF_ASSIGN_OR_RETURN(auto result,
+    TF_ASSIGN_OR_RETURN(return_values[0],
                         ExecuteOnStream(&run_options[0], arguments[0],
                                         /*hlo_execution_profile=*/nullptr));
-    return std::vector<perftools::gputools::DeviceMemoryBase>({result});
+    return std::move(return_values);
   }
 
-  std::vector<perftools::gputools::DeviceMemoryBase> return_values(
-      run_options.size());
   for (size_t i = 0; i < run_options.size(); ++i) {
     // We cannot BlockHostUntilDone() on the already-launched executions in case
     // of error, since if the executions communicate, the initially launched
@@ -52,9 +52,9 @@ Executable::ExecuteOnStreams(
   }
   for (const auto& options : run_options) {
     TF_RET_CHECK(options.stream() != nullptr);
-    options.stream()->BlockHostUntilDone();
+    TF_RETURN_IF_ERROR(options.stream()->BlockHostUntilDone());
   }
-  return return_values;
+  return std::move(return_values);
 }
 
 Status Executable::DumpSessionModule() {
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 2135707371809f119f0ed427f250ea500f786d3c..23864dda78fa9e9aeefc44c5aa018686e998a558 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -44,8 +44,15 @@ namespace xla {
 // interface that is used for launching compiled programs across platforms.
 class Executable {
  public:
-  explicit Executable(std::unique_ptr<const HloModule> hlo_module)
-      : hlo_module_(std::move(hlo_module)) {}
+  explicit Executable(std::unique_ptr<const HloModule> hlo_module,
+                      std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+                      std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
+      : hlo_module_(std::move(hlo_module)),
+        hlo_profile_printer_(std::move(hlo_profile_printer)),
+        hlo_profile_index_map_(std::move(hlo_profile_index_map)) {
+    CHECK_EQ(hlo_profile_printer_.get() == nullptr,
+             hlo_profile_index_map_.get() == nullptr);
+  }
   virtual ~Executable() {}
 
   // Enqueues the compilation result on the provided stream, passing the given
@@ -54,16 +61,7 @@ class Executable {
   // If the hlo_execution_profile is provided as non-nullptr, profiling will be
   // enabled.
   //
-  // Returns the device memory region that a successful execution would
-  // populate.
-  virtual StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
-      const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
-      HloExecutionProfile* hlo_execution_profile) = 0;
-
-  // Overload of ExecuteOnStream which returns and takes arguments as
-  // ShapedBuffers. Used for LocalService execution.
+  // Returns a shaped buffer containing the result of the computation.
   virtual StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
@@ -71,21 +69,19 @@ class Executable {
 
   // Same as ExecuteOnStream(), but this call is non-blocking and returns as
   // soon as all of the operations are enqueued for launch on the stream.
-  virtual StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteAsyncOnStream(
+  virtual StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments) = 0;
+      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) = 0;
 
   // Same as ExecuteOnStream(), but runs this executable on multiple
   // streams. arguments[i] contains the arguments to the execution on
   // run_options[i]->stream() and the returned value is at index i of the
   // returned vector.
-  virtual StatusOr<std::vector<perftools::gputools::DeviceMemoryBase>>
-  ExecuteOnStreams(
+  virtual StatusOr<std::vector<std::unique_ptr<ShapedBuffer>>> ExecuteOnStreams(
       tensorflow::gtl::ArraySlice<const ServiceExecutableRunOptions>
           run_options,
       tensorflow::gtl::ArraySlice<
-          tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>>
+          tensorflow::gtl::ArraySlice<const ShapedBuffer*>>
           arguments);
 
   // Populates `hlo_execution_profile` from `executor`. This is implicit in any
@@ -123,12 +119,20 @@ class Executable {
         "Equality test on this executable is not implemented.");
   }
 
+  const HloProfilePrinter& hlo_profile_printer() const {
+    CHECK(hlo_profiling_enabled());
+    return *hlo_profile_printer_;
+  }
+
+  const HloProfileIndexMap& hlo_profile_index_map() const {
+    CHECK(hlo_profiling_enabled());
+    return *hlo_profile_index_map_;
+  }
+
   // Returns whether this executable was compiled with HLO profilings support
   // enabled. If not, the caller should not expect an hlo_execution_profile
   // passed to ExecuteOnStream above to be populated during execution.
-  bool hlo_profiling_enabled() const {
-    return hlo_module_->config().hlo_profiling_enabled();
-  }
+  bool hlo_profiling_enabled() const { return hlo_profile_printer_ != nullptr; }
 
   const HloModule& module() const { return *hlo_module_; }
 
@@ -160,10 +164,6 @@ class Executable {
   static Status DumpToDirectory(const string& directory_path, string filename,
                                 const SessionModule& session_module);
 
-  // Returns a cost analysis object appropriate for the platform on which this
-  // executable can run.
-  virtual std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const = 0;
-
  protected:
   mutable tensorflow::mutex mutex_;
 
@@ -181,6 +181,9 @@ class Executable {
   // Execution count, used to generate a unique filename for each dumped
   // execution.
   int64 execution_count_ = 0;
+
+  std::unique_ptr<HloProfilePrinter> hlo_profile_printer_;
+  std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map_;
 };
 
 template <typename ReturnT, typename ArgT>
@@ -200,7 +203,8 @@ StatusOr<ReturnT> Executable::ExecuteOnStreamWrapper(
   std::unique_ptr<HloExecutionProfile> profile_ptr =
       module_config().debug_options().xla_hlo_profile() &&
               hlo_profiling_enabled()
-          ? MakeUnique<HloExecutionProfile>(module(), *CreateCostAnalysis())
+          ? MakeUnique<HloExecutionProfile>(&hlo_profile_printer(),
+                                            &hlo_profile_index_map())
           : nullptr;
 
   auto return_value =
@@ -208,14 +212,19 @@ StatusOr<ReturnT> Executable::ExecuteOnStreamWrapper(
 
   if (profile != nullptr) {
     VLOG(1) << "enqueueing 'stop timer' and blocking host until done...";
-    stream->ThenStopTimer(timer.get()).BlockHostUntilDone();
+    stream->ThenStopTimer(timer.get());
+    TF_RETURN_IF_ERROR(stream->BlockHostUntilDone());
     VLOG(1) << "done with block-host-until-done";
 
     // Merge in run-time profile information from execution_profile.
     profile->MergeFrom(execution_profile());
 
     // Overall execution time (in nanoseconds) from the executor timer.
-    profile->set_compute_and_transfer_time_ns(timer->Nanoseconds());
+    if (stream->ok()) {
+      // Don't read timer->Nanoseconds() if the stream isn't OK -- that's
+      // illegal.
+      profile->set_compute_and_transfer_time_ns(timer->Nanoseconds());
+    }
 
     // TODO(b/28123297): On GPU we end up including transfer time in
     // the compute time this way. Instead, we should get the correct
diff --git a/tensorflow/compiler/xla/service/execution_tracker.cc b/tensorflow/compiler/xla/service/execution_tracker.cc
index c225e62e3e11d2d01251b0f92272b0949eff8af1..2f0b9ed2bd98fbea4e67c0a30d5aa41ff6a06979 100644
--- a/tensorflow/compiler/xla/service/execution_tracker.cc
+++ b/tensorflow/compiler/xla/service/execution_tracker.cc
@@ -39,9 +39,7 @@ AsyncExecution::AsyncExecution(Backend* backend,
 
 tensorflow::Status AsyncExecution::BlockUntilDone() const {
   for (auto& stream : streams_) {
-    if (!stream->BlockHostUntilDone()) {
-      return InternalError("failed to block until done");
-    }
+    TF_RETURN_IF_ERROR(stream->BlockHostUntilDone());
   }
   return tensorflow::Status::OK();
 }
diff --git a/tensorflow/compiler/xla/service/flatten_call_graph.cc b/tensorflow/compiler/xla/service/flatten_call_graph.cc
index dfba22a6c4c5cf071c2cd8621643b8da6587ee3b..2b6caa149439a86d6d047605099bc3ff7b295a8e 100644
--- a/tensorflow/compiler/xla/service/flatten_call_graph.cc
+++ b/tensorflow/compiler/xla/service/flatten_call_graph.cc
@@ -26,7 +26,10 @@ namespace xla {
 
 namespace {
 
-// Helper to replace the called computation at a while- or call-instruction.
+// Helper to replace the called computation at a while-, call-, or
+// conditional-instruction. This function replaces exactly one instance of
+// 'computation' with 'new_computation' even if 'instruction' calls
+// 'computation' more than once.
 void ReplaceCalledComputation(HloInstruction* instruction,
                               HloComputation* computation,
                               HloComputation* new_computation) {
@@ -45,6 +48,15 @@ void ReplaceCalledComputation(HloInstruction* instruction,
       instruction->set_to_apply(new_computation);
       break;
     }
+    case HloOpcode::kConditional: {
+      if (computation == instruction->true_computation()) {
+        instruction->set_true_computation(new_computation);
+      } else {
+        CHECK_EQ(computation, instruction->false_computation());
+        instruction->set_false_computation(new_computation);
+      }
+      break;
+    }
     default:
       LOG(FATAL) << "unexpected opcode: "
                  << HloOpcodeString(instruction->opcode());
diff --git a/tensorflow/compiler/xla/service/flatten_call_graph_test.cc b/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
index a68e90b7d009890012f94baa790d911871c9c960..d3854b40de3572a60df1ad99d8a4589f59ad7194 100644
--- a/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
+++ b/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
@@ -223,5 +223,35 @@ TEST_F(FlattenCallGraphTest, FlattenCalls) {
   EXPECT_EQ(1, b_node.caller_callsites().size());
 }
 
+TEST_F(FlattenCallGraphTest, FlattenCallsInConditional) {
+  auto module = CreateNewModule();
+  HloComputation* sub_computation =
+      module->AddEmbeddedComputation(MakeScalarComputation());
+
+  // Create entry computation, which is a conditional that has the same
+  // computation in the true and false branch.
+  HloComputation::Builder builder(TestName());
+  auto pred = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(56.0f)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(12.0f)));
+  builder.AddInstruction(HloInstruction::CreateConditional(
+      kScalarShape, pred, constant1, sub_computation, constant2,
+      sub_computation));
+  module->AddEntryComputation(builder.Build());
+  EXPECT_EQ(2, module->computation_count());
+
+  TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module.get()));
+  EXPECT_TRUE(result);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
+  // The true and false computations must now be different.
+  EXPECT_EQ(3, module->computation_count());
+
+  const CallGraphNode& sub_node = call_graph->GetNode(sub_computation);
+  EXPECT_EQ(1, sub_node.caller_callsites().size());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
index 74aa77b4f165be76fbc0a8aa1a4a7e90a8e9acec..271a856efd66f9f977ac4e201161ba4b505f31e1 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
@@ -51,83 +51,7 @@ se::Platform::Id GenericTransferManager::PlatformId() const {
   return platform_id_;
 }
 
-Status GenericTransferManager::TransferLiteralFromDevice(
-    se::StreamExecutor* executor, const se::DeviceMemoryBase& source,
-    const Shape& device_shape, const Shape& literal_shape, Literal* literal) {
-  VLOG(2) << "transferring literal shape from device: "
-          << ShapeUtil::HumanString(literal_shape)
-          << "; device location: " << source.opaque();
-  TF_RET_CHECK(ShapeUtil::Compatible(device_shape, literal_shape));
-
-  // Tuples are a special case and contain one or more shapes inside of them to
-  // an arbitrary nesting depth.
-  if (device_shape.element_type() == TUPLE) {
-    *literal->mutable_shape() = literal_shape;
-    TF_ASSIGN_OR_RETURN(
-        std::vector<se::DeviceMemoryBase> element_buffers,
-        ShallowCopyTupleFromDevice(executor, source, device_shape));
-    TF_RET_CHECK(element_buffers.size() ==
-                 ShapeUtil::TupleElementCount(device_shape));
-    for (int64 i = 0; i < element_buffers.size(); ++i) {
-      const Shape& element_device_shape = device_shape.tuple_shapes(i);
-      const Shape& element_literal_shape = literal_shape.tuple_shapes(i);
-      Literal* element_literal = literal->add_tuple_literals();
-      // Recursively call TransferFromDevice to copy over the data in the
-      // element array.
-      TF_RETURN_IF_ERROR(TransferLiteralFromDevice(
-          executor, element_buffers[i], /*device_shape=*/element_device_shape,
-          /*literal_shape=*/element_literal_shape, element_literal));
-    }
-    return Status::OK();
-  }
-
-  *literal->mutable_shape() = device_shape;
-  literal->Reserve(ShapeUtil::ElementsIn(device_shape));
-  TF_RETURN_IF_ERROR(TransferBufferFromDevice(
-      executor, source, /*size=*/ShapeUtil::ByteSizeOf(device_shape),
-      /*destination=*/literal->MutableInternalData()));
-  if (!ShapeUtil::Equal(literal_shape, device_shape)) {
-    *literal = std::move(*literal->Relayout(literal_shape.layout()));
-  }
-  TF_RET_CHECK(ShapeUtil::Equal(literal_shape, literal->shape()));
-  return Status::OK();
-}
-
-StatusOr<std::vector<se::DeviceMemoryBase>>
-GenericTransferManager::ShallowCopyTupleFromDevice(
-    se::StreamExecutor* executor, const se::DeviceMemoryBase& source,
-    const Shape& shape) {
-  TF_RET_CHECK(ShapeUtil::IsTuple(shape));
-
-  // For devices which use the GenericTransferManager, a tuple is stored as an
-  // array of pointers to buffers. Copy the contents of the tuple buffer into
-  // a vector of void* pointers.
-  std::vector<void*> element_pointers(ShapeUtil::TupleElementCount(shape),
-                                      nullptr);
-  int64 tuple_size = ShapeUtil::ByteSizeOf(shape, pointer_size_);
-  auto copy_status = executor->SynchronousMemcpyD2H(source, tuple_size,
-                                                    element_pointers.data());
-  if (!copy_status.ok()) {
-    return AddStatus(
-        Status(static_cast<tensorflow::error::Code>(copy_status.code()),
-               copy_status.error_message()),
-        "failed transfer of tuple buffer " + ShapeUtil::HumanString(shape));
-  }
-
-  // Create a DeviceMemoryBase from each void* pointer.
-  std::vector<se::DeviceMemoryBase> destination;
-  for (size_t i = 0; i < element_pointers.size(); ++i) {
-    if (element_pointers[i] == nullptr &&
-        !ShapeUtil::HasZeroElements(shape.tuple_shapes(i))) {
-      return FailedPrecondition("tuple contains nullptr at element %lu", i);
-    }
-    destination.emplace_back(element_pointers[i],
-                             GetByteSizeRequirement(shape.tuple_shapes(i)));
-  }
-  return std::move(destination);
-}
-
-Status GenericTransferManager::WriteTuplePointersToDevice(
+Status GenericTransferManager::WriteSingleTupleIndexTable(
     perftools::gputools::StreamExecutor* executor,
     tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> elements,
     const Shape& shape, perftools::gputools::DeviceMemoryBase* region) {
@@ -145,16 +69,19 @@ StatusOr<std::unique_ptr<Literal>>
 GenericTransferManager::TransferLiteralFromDevice(
     se::StreamExecutor* executor, const ShapedBuffer& device_buffer) {
   VLOG(2) << "transferring literal from device ordinal "
-          << executor->device_ordinal() << "; device shape: "
-          << ShapeUtil::HumanStringWithLayout(device_buffer.shape())
-          << "; opaque: " << device_buffer.buffer(/*index=*/{}).opaque();
+          << executor->device_ordinal() << "; device buffer: " << device_buffer;
   TF_RET_CHECK(executor->device_ordinal() == device_buffer.device_ordinal());
 
+  // The on-host and on-device shape should always be the same for the generic
+  // transfer manager.
+  TF_RET_CHECK(ShapeUtil::Equal(device_buffer.on_device_shape(),
+                                device_buffer.on_host_shape()));
+
   std::unique_ptr<Literal> literal =
-      Literal::CreateFromShape(device_buffer.shape());
+      Literal::CreateFromShape(device_buffer.on_host_shape());
 
   TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
-      device_buffer.shape(),
+      device_buffer.on_host_shape(),
       [&](const Shape& subshape, const ShapeIndex& index) -> Status {
         if (!ShapeUtil::IsTuple(subshape)) {
           TF_RETURN_IF_ERROR(TransferBufferFromDevice(
@@ -175,16 +102,22 @@ Status GenericTransferManager::TransferLiteralToDevice(
     const ShapedBuffer& device_buffer) {
   const Shape& shape = literal.shape();
   VLOG(2) << "transferring literal shape to device: "
-          << ShapeUtil::HumanString(shape) << "; device location: "
-          << device_buffer.buffer(/*index=*/{}).opaque();
+          << ShapeUtil::HumanString(shape)
+          << "; device buffer: " << device_buffer;
+
+  // The on-host and on-device shape should always be the same for the generic
+  // transfer manager.
+  TF_RET_CHECK(ShapeUtil::Equal(device_buffer.on_device_shape(),
+                                device_buffer.on_host_shape()));
 
-  TF_RET_CHECK(ShapeUtil::Compatible(literal.shape(), device_buffer.shape()));
+  TF_RET_CHECK(
+      ShapeUtil::Compatible(literal.shape(), device_buffer.on_host_shape()));
   TF_RET_CHECK(executor->device_ordinal() == device_buffer.device_ordinal());
 
   TF_RETURN_IF_ERROR(WriteTupleIndexTables(executor, device_buffer));
 
   return ShapeUtil::ForEachSubshapeWithStatus(
-      device_buffer.shape(),
+      device_buffer.on_host_shape(),
       [&](const Shape& device_subshape, const ShapeIndex& index) -> Status {
         se::DeviceMemoryBase device_memory = device_buffer.buffer(index);
         if (ShapeUtil::IsArray(device_subshape)) {
@@ -212,33 +145,6 @@ Status GenericTransferManager::TransferLiteralToDevice(
       });
 }
 
-Status GenericTransferManager::TransferLiteralToDevice(
-    se::StreamExecutor* executor, const Literal& literal,
-    se::DeviceMemoryBase* destination) {
-  const Shape& shape = literal.shape();
-  VLOG(2) << "transferring literal shape to device: "
-          << ShapeUtil::HumanString(shape)
-          << "; device location: " << destination->opaque();
-
-  if (ShapeUtil::IsTuple(literal.shape())) {
-    std::vector<void*> tuple_elements_on_device;
-    for (const Literal& tuple_element : literal.tuple_literals()) {
-      se::DeviceMemoryBase allocation = executor->AllocateArray<uint8>(
-          GetByteSizeRequirement(tuple_element.shape()));
-      TF_RETURN_IF_ERROR(
-          TransferLiteralToDevice(executor, tuple_element, &allocation));
-      tuple_elements_on_device.push_back(allocation.opaque());
-    }
-    return TransferBufferToDevice(
-        executor, tuple_elements_on_device.size() * sizeof(void*),
-        tuple_elements_on_device.data(), destination);
-  }
-
-  return TransferBufferToDevice(executor,
-                                /*size=*/GetByteSizeRequirement(shape),
-                                /*source=*/literal.InternalData(), destination);
-}
-
 Status GenericTransferManager::TransferLiteralToInfeed(
     se::StreamExecutor* executor, const Literal& literal) {
   return Unimplemented("Generic transfer to Infeed");
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.h b/tensorflow/compiler/xla/service/generic_transfer_manager.h
index 50dca6aec5012f0b02cb54846b622f008600e48e..63a7c820cf4e5fbbdf870086a4fb5316ac50d10b 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.h
@@ -42,16 +42,6 @@ class GenericTransferManager : public TransferManager {
 
   perftools::gputools::Platform::Id PlatformId() const override;
 
-  Status TransferLiteralFromDevice(
-      perftools::gputools::StreamExecutor* executor,
-      const perftools::gputools::DeviceMemoryBase& source,
-      const Shape& device_shape, const Shape& literal_shape,
-      Literal* literal) override;
-
-  Status TransferLiteralToDevice(
-      perftools::gputools::StreamExecutor* executor, const Literal& literal,
-      perftools::gputools::DeviceMemoryBase* destination) override;
-
   StatusOr<std::unique_ptr<Literal>> TransferLiteralFromDevice(
       perftools::gputools::StreamExecutor* executor,
       const ShapedBuffer& device_buffer) override;
@@ -62,9 +52,6 @@ class GenericTransferManager : public TransferManager {
 
   Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor,
                                  const Literal& literal) override;
-  Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
-                                int64 size, const void* source) override;
-
   Status TransferLiteralFromOutfeed(
       perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
       Literal* literal) override;
@@ -73,16 +60,13 @@ class GenericTransferManager : public TransferManager {
       tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
           executors) override;
 
-  StatusOr<std::vector<perftools::gputools::DeviceMemoryBase>>
-  ShallowCopyTupleFromDevice(
-      perftools::gputools::StreamExecutor* executor,
-      const perftools::gputools::DeviceMemoryBase& source,
-      const Shape& shape) override;
-
   int64 GetByteSizeRequirement(const Shape& shape) const override;
 
  protected:
-  Status WriteTuplePointersToDevice(
+  Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
+                                int64 size, const void* source) override;
+
+  Status WriteSingleTupleIndexTable(
       perftools::gputools::StreamExecutor* executor,
       tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
           elements,
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 364b76b93c288f13f2bf447cebfc25f705d77826..f673f0cbd079b2e3a7e783c02ab9d9af2f466b63 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -343,15 +343,16 @@ tf_cc_test(
 )
 
 cc_library(
-    name = "copy_insertion",
-    srcs = ["copy_insertion.cc"],
-    hdrs = ["copy_insertion.h"],
+    name = "gpu_copy_insertion",
+    srcs = ["gpu_copy_insertion.cc"],
+    hdrs = ["gpu_copy_insertion.h"],
     deps = [
         ":ir_emission_utils",
+        "//tensorflow/compiler/xla/service:call_graph",
         "//tensorflow/compiler/xla/service:copy_insertion",
         "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:logical_buffer",
-        "//tensorflow/compiler/xla/service:tuple_points_to_analysis",
+        "//tensorflow/compiler/xla/service:hlo_dataflow_analysis",
+        "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/core:lib",
     ],
 )
@@ -427,14 +428,14 @@ cc_library(
     hdrs = ["gpu_compiler.h"],
     deps = [
         ":convolution_folding",
-        ":copy_insertion",
         ":fusion_merger",
+        ":gpu_copy_insertion",
         ":gpu_executable",
+        ":gpu_layout_assignment",
         ":hlo_schedule",
         ":instruction_fusion",
         ":ir_emission_utils",
         ":ir_emitter",
-        ":layout_assignment",
         ":pad_insertion",
         ":partition_assignment",
         ":stream_assignment",
@@ -444,10 +445,11 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:algebraic_simplifier",
-        "//tensorflow/compiler/xla/service:batchnorm_rewriter",
+        "//tensorflow/compiler/xla/service:batchnorm_expander",
         "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:buffer_liveness",
         "//tensorflow/compiler/xla/service:call_inliner",
+        "//tensorflow/compiler/xla/service:dot_decomposer",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:flatten_call_graph",
         "//tensorflow/compiler/xla/service:hlo",
@@ -491,9 +493,9 @@ cc_library(
 )
 
 cc_library(
-    name = "layout_assignment",
-    srcs = ["layout_assignment.cc"],
-    hdrs = ["layout_assignment.h"],
+    name = "gpu_layout_assignment",
+    srcs = ["gpu_layout_assignment.cc"],
+    hdrs = ["gpu_layout_assignment.h"],
     deps = [
         ":ir_emission_utils",
         "//tensorflow/compiler/xla:shape_util",
@@ -507,10 +509,10 @@ cc_library(
 )
 
 tf_cc_test(
-    name = "layout_assignment_test",
-    srcs = ["layout_assignment_test.cc"],
+    name = "gpu_layout_assignment_test",
+    srcs = ["gpu_layout_assignment_test.cc"],
     deps = [
-        ":layout_assignment",
+        ":gpu_layout_assignment",
         "//tensorflow/compiler/xla:shape_layout",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -574,11 +576,14 @@ tf_cc_test(
     deps = [
         ":instruction_fusion",
         ":while_transformer",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/service:copy_insertion",
+        "//tensorflow/compiler/xla/service:hlo_verifier",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_folding.cc b/tensorflow/compiler/xla/service/gpu/convolution_folding.cc
index 5aaf072f9d2c95e2fff70a1c5337432a12a1aa48..b0626ca3bc9f843e513d4727932f0e2d5fa37748 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_folding.cc
+++ b/tensorflow/compiler/xla/service/gpu/convolution_folding.cc
@@ -55,28 +55,20 @@ MatchBackwardFilter(HloInstruction* conv) {
   //               v       v
   //              Convolution
   //                 conv
-  //                   |
-  //                   v
-  //               Transpose (optional if identity transposition)
   CHECK_EQ(HloOpcode::kConvolution, conv->opcode());
-  // If the forward convolution is followed by a transpose, we can fuse the
-  // transpose into the backward convolution as well.
-  HloInstruction* transpose = nullptr;
-  if (conv->user_count() == 1) {
-    HloInstruction* single_user = *conv->users().begin();
-    if (single_user->opcode() == HloOpcode::kTranspose) {
-      transpose = single_user;
-    }
-  }
 
   // Step 2: match paddings and dimension numbers of the forward convolution.
   const ConvolutionDimensionNumbers& conv_dnums =
       conv->convolution_dimension_numbers();
   auto input_batch_dim = conv_dnums.input_batch_dimension();
   auto input_feature_dim = conv_dnums.input_feature_dimension();
+  auto input_spatial_dims = conv_dnums.input_spatial_dimensions();
+  auto kernel_input_feature_dim = conv_dnums.kernel_input_feature_dimension();
+  auto kernel_output_feature_dim = conv_dnums.kernel_output_feature_dimension();
+  auto kernel_spatial_dims = conv_dnums.kernel_spatial_dimensions();
   auto output_batch_dim = conv_dnums.output_batch_dimension();
   auto output_feature_dim = conv_dnums.output_feature_dimension();
-  auto spatial_dims = conv_dnums.spatial_dimensions();
+  auto output_spatial_dims = conv_dnums.output_spatial_dimensions();
 
   for (const WindowDimension& window_dim : conv->window().dimensions()) {
     if (window_dim.stride() != 1) {
@@ -95,9 +87,14 @@ MatchBackwardFilter(HloInstruction* conv) {
       VLOG(1) << "Padding low should be non-negative.";
       return no_match_result;
     }
+    if (window_dim.window_reversal()) {
+      VLOG(1) << "Window reversal field not supported";
+      return no_match_result;
+    }
     // Padding high will be checked in Step 3.
   }
-  if (transpose == nullptr && !window_util::HasWindowDilation(conv->window())) {
+  if (input_batch_dim == output_batch_dim &&
+      !window_util::HasWindowDilation(conv->window())) {
     VLOG(1) << conv->ToString()
             << " is a regular forward convolution. No need "
                "to fold it to a backward filter convolution.";
@@ -108,11 +105,11 @@ MatchBackwardFilter(HloInstruction* conv) {
   //
   // Compute the window of the backward convolution.
   Window backward_conv_window;
-  for (int i = 0; i < spatial_dims.size(); ++i) {
+  for (int i = 0; i < input_spatial_dims.size(); ++i) {
     WindowDimension* dim = backward_conv_window.add_dimensions();
     // The window size of the backward convolution equals the output size of the
     // forward convolution.
-    int64 filter_size = conv->shape().dimensions(spatial_dims[i]);
+    int64 filter_size = conv->shape().dimensions(output_spatial_dims[i]);
     dim->set_size(filter_size);
     // The window stride equals the window dilation of the forward convolution.
     dim->set_stride(conv->window().dimensions(i).window_dilation());
@@ -120,7 +117,8 @@ MatchBackwardFilter(HloInstruction* conv) {
     // activations.
     dim->set_padding_low(conv->window().dimensions(i).padding_low());
 
-    int64 input_size = conv->operand(0)->shape().dimensions(spatial_dims[i]);
+    int64 input_size =
+        conv->operand(0)->shape().dimensions(input_spatial_dims[i]);
     int64 output_size = conv->window().dimensions(i).size();
     // Compute the range of the amount of valid high padding. We first compute
     // min_padding_high, the amount of padding on the right/bottom to ensure the
@@ -167,50 +165,32 @@ MatchBackwardFilter(HloInstruction* conv) {
     }
   }
 
-  // To make future HLO passes easier, we canonicalize the fused expression by
-  // adding an identity transposition if it's omitted in the pattern.
-  if (transpose == nullptr) {
-    // Create an identity transposition with the same rank as the forward
-    // convolution.
-    HloComputation* parent_computation = conv->parent();
-    std::vector<int64> transpose_dimensions(ShapeUtil::Rank(conv->shape()));
-    std::iota(transpose_dimensions.begin(), transpose_dimensions.end(), 0);
-    transpose =
-        parent_computation->AddInstruction(HloInstruction::CreateTranspose(
-            conv->shape(), conv, transpose_dimensions));
-    TF_CHECK_OK(conv->ReplaceAllUsesWith(transpose));
-  }
-
   // Restore the dimension numbers of the backward convolution from the forward
   // convolution. The two activation dimensions are reversed (batch and
   // feature).
   ConvolutionDimensionNumbers backward_conv_dnums;
   backward_conv_dnums.set_input_batch_dimension(input_feature_dim);
   backward_conv_dnums.set_input_feature_dimension(input_batch_dim);
-  backward_conv_dnums.set_output_batch_dimension(output_feature_dim);
-  backward_conv_dnums.set_output_feature_dimension(output_batch_dim);
-  for (int i = 0; i < spatial_dims.size(); ++i) {
-    backward_conv_dnums.add_spatial_dimensions(spatial_dims[i]);
+  for (int i = 0; i < input_spatial_dims.size(); ++i) {
+    backward_conv_dnums.add_input_spatial_dimensions(input_spatial_dims[i]);
+  }
+  backward_conv_dnums.set_output_batch_dimension(kernel_input_feature_dim);
+  backward_conv_dnums.set_output_feature_dimension(kernel_output_feature_dim);
+  for (int i = 0; i < kernel_spatial_dims.size(); ++i) {
+    backward_conv_dnums.add_output_spatial_dimensions(kernel_spatial_dims[i]);
   }
   // The dimension numbering of the output of the forward convolution (before
   // transposition) is the same as that of the activations (according to the
   // semantics of kConvolution). The batch dimension of the activations should
   // be treated as the input feature dimension, and the feature dimension should
   // be treated as the output feature.
-  //
-  // The output of the forward convolution needs to be transposed to fit into
-  // the dimension numbering of the weight gradients. This transposition maps
-  // dimension i to PositionInContainer(transpose->dimensions(), i).
-  backward_conv_dnums.set_kernel_input_feature_dimension(
-      PositionInContainer(transpose->dimensions(), output_batch_dim));
-  backward_conv_dnums.set_kernel_output_feature_dimension(
-      PositionInContainer(transpose->dimensions(), output_feature_dim));
-  for (int i = 0; i < spatial_dims.size(); ++i) {
-    backward_conv_dnums.add_kernel_spatial_dimensions(
-        PositionInContainer(transpose->dimensions(), spatial_dims[i]));
+  backward_conv_dnums.set_kernel_input_feature_dimension(output_batch_dim);
+  backward_conv_dnums.set_kernel_output_feature_dimension(output_feature_dim);
+  for (int i = 0; i < output_spatial_dims.size(); ++i) {
+    backward_conv_dnums.add_kernel_spatial_dimensions(output_spatial_dims[i]);
   }
 
-  return std::make_tuple(true, std::vector<HloInstruction*>({transpose, conv}),
+  return std::make_tuple(true, std::vector<HloInstruction*>({conv}),
                          backward_conv_window, backward_conv_dnums);
 }
 
@@ -270,14 +250,20 @@ MatchBackwardInput(HloInstruction* conv) {
               << " should have no window dilation.";
       return no_match_result;
     }
+    if (window_dim.window_reversal()) {
+      VLOG(1) << "Window reversal field not supported";
+      return no_match_result;
+    }
   }
 
-  const auto& spatial_dims = dnums.spatial_dimensions();
-  CHECK_EQ(conv->window().dimensions().size(), spatial_dims.size());
+  const auto& input_spatial_dims = dnums.input_spatial_dimensions();
+  const auto& output_spatial_dims = dnums.output_spatial_dimensions();
+  CHECK_EQ(conv->window().dimensions().size(), input_spatial_dims.size());
+  CHECK_EQ(output_spatial_dims.size(), input_spatial_dims.size());
 
   const Window& old_window = conv->window();
   Window new_window = old_window;
-  for (size_t i = 0; i < spatial_dims.size(); ++i) {
+  for (size_t i = 0; i < input_spatial_dims.size(); ++i) {
     // Restore backward convolution's padding config from the matched pattern.
     // See the comment in tensorflow/core/kernels/conv_grad_tuple_ops.cc
     // for how we convert backward input convolution to a variant of forward
@@ -310,8 +296,9 @@ MatchBackwardInput(HloInstruction* conv) {
     // end at the border. The maximum amount (max_padding_high) equals
     // min_padding_high+stride-1 -- max_padding_high+1 would cause the output
     // size to change.
-    auto unpadded_input_size = conv->shape().dimensions(spatial_dims[i]);
-    auto output_size = conv->operand(0)->shape().dimensions(spatial_dims[i]);
+    auto unpadded_input_size = conv->shape().dimensions(output_spatial_dims[i]);
+    auto output_size =
+        conv->operand(0)->shape().dimensions(input_spatial_dims[i]);
     auto padded_input_size = kernel_size + dim->stride() * (output_size - 1);
     auto total_pad_size = padded_input_size - unpadded_input_size;
     auto min_padding_high = total_pad_size - backward_padding_low;
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_folding_test.cc b/tensorflow/compiler/xla/service/gpu/convolution_folding_test.cc
index 19b122ba0603b4ec08d73e05da4c2ae11a760553..34e6bdb117d47a3d7e1eb3bae5806e130e94ea79 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_folding_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/convolution_folding_test.cc
@@ -46,23 +46,27 @@ class ConvolutionFoldingTest : public HloTestBase {
     //
     // TODO(jingyue): Add more tests on NCHW input order which TF also supports.
     tf_default_dnums_for_backward_filter_.set_input_batch_dimension(3);
-    tf_default_dnums_for_backward_filter_.set_output_batch_dimension(3);
     tf_default_dnums_for_backward_filter_.set_input_feature_dimension(0);
-    tf_default_dnums_for_backward_filter_.set_output_feature_dimension(0);
-    tf_default_dnums_for_backward_filter_.add_spatial_dimensions(1);
-    tf_default_dnums_for_backward_filter_.add_spatial_dimensions(2);
+    tf_default_dnums_for_backward_filter_.add_input_spatial_dimensions(1);
+    tf_default_dnums_for_backward_filter_.add_input_spatial_dimensions(2);
     tf_default_dnums_for_backward_filter_.set_kernel_input_feature_dimension(0);
     tf_default_dnums_for_backward_filter_.set_kernel_output_feature_dimension(
         3);
     tf_default_dnums_for_backward_filter_.add_kernel_spatial_dimensions(1);
     tf_default_dnums_for_backward_filter_.add_kernel_spatial_dimensions(2);
+    tf_default_dnums_for_backward_filter_.add_output_spatial_dimensions(0);
+    tf_default_dnums_for_backward_filter_.add_output_spatial_dimensions(1);
+    tf_default_dnums_for_backward_filter_.set_output_batch_dimension(2);
+    tf_default_dnums_for_backward_filter_.set_output_feature_dimension(3);
 
     tf_default_dnums_for_backward_input_.set_input_batch_dimension(0);
     tf_default_dnums_for_backward_input_.set_output_batch_dimension(0);
     tf_default_dnums_for_backward_input_.set_input_feature_dimension(3);
     tf_default_dnums_for_backward_input_.set_output_feature_dimension(3);
-    tf_default_dnums_for_backward_input_.add_spatial_dimensions(1);
-    tf_default_dnums_for_backward_input_.add_spatial_dimensions(2);
+    tf_default_dnums_for_backward_input_.add_input_spatial_dimensions(1);
+    tf_default_dnums_for_backward_input_.add_output_spatial_dimensions(1);
+    tf_default_dnums_for_backward_input_.add_input_spatial_dimensions(2);
+    tf_default_dnums_for_backward_input_.add_output_spatial_dimensions(2);
     tf_default_dnums_for_backward_input_.set_kernel_input_feature_dimension(3);
     tf_default_dnums_for_backward_input_.set_kernel_output_feature_dimension(2);
     tf_default_dnums_for_backward_input_.add_kernel_spatial_dimensions(0);
@@ -82,7 +86,7 @@ class ConvolutionFoldingTest : public HloTestBase {
   ConvolutionDimensionNumbers tf_default_dnums_for_backward_input_;
 };
 
-TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithoutTranspose) {
+TEST_F(ConvolutionFoldingTest, BackwardFilterConvolve) {
   HloComputation::Builder builder(TestName());
   HloInstruction* activations =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -132,7 +136,7 @@ TEST_F(ConvolutionFoldingTest,
 
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
-  EXPECT_FALSE(FoldConvolution(module.get()));
+  EXPECT_TRUE(FoldConvolution(module.get()));
 }
 
 // Extracted from block35 training.
@@ -151,13 +155,9 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithPaddedActivations) {
     conv_window.mutable_dimensions(i)->set_padding_low(1);
     conv_window.mutable_dimensions(i)->set_padding_high(1);
   }
-  HloInstruction* convolution =
-      builder.AddInstruction(HloInstruction::CreateConvolve(
-          ShapeUtil::MakeShape(F32, {32, 3, 3, 32}), activations, gradients,
-          conv_window, tf_default_dnums_for_backward_filter_));
-
-  builder.AddInstruction(HloInstruction::CreateTranspose(
-      ShapeUtil::MakeShape(F32, {3, 3, 32, 32}), convolution, {1, 2, 3, 0}));
+  builder.AddInstruction(HloInstruction::CreateConvolve(
+      ShapeUtil::MakeShape(F32, {32, 3, 3, 32}), activations, gradients,
+      conv_window, tf_default_dnums_for_backward_filter_));
 
   auto module = CreateNewModule();
   HloComputation* entry_computation =
@@ -185,13 +185,9 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithPaddedGradients) {
     conv_window.mutable_dimensions(i)->set_padding_high(-1);
     conv_window.mutable_dimensions(i)->set_window_dilation(2);
   }
-  HloInstruction* convolution =
-      builder.AddInstruction(HloInstruction::CreateConvolve(
-          ShapeUtil::MakeShape(F32, {320, 3, 3, 192}), activations, gradients,
-          conv_window, tf_default_dnums_for_backward_filter_));
-
-  builder.AddInstruction(HloInstruction::CreateTranspose(
-      ShapeUtil::MakeShape(F32, {3, 3, 192, 320}), convolution, {1, 2, 3, 0}));
+  builder.AddInstruction(HloInstruction::CreateConvolve(
+      ShapeUtil::MakeShape(F32, {320, 3, 3, 192}), activations, gradients,
+      conv_window, tf_default_dnums_for_backward_filter_));
 
   auto module = CreateNewModule();
   HloComputation* entry_computation =
@@ -218,13 +214,9 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithUnevenPadding) {
     // Uneven padding: padding_low=0, padding_high=1
     conv_window.mutable_dimensions(i)->set_padding_high(1);
   }
-  HloInstruction* convolution =
-      builder.AddInstruction(HloInstruction::CreateConvolve(
-          ShapeUtil::MakeShape(F32, {32, 2, 2, 32}), activations, gradients,
-          conv_window, tf_default_dnums_for_backward_filter_));
-
-  builder.AddInstruction(HloInstruction::CreateTranspose(
-      ShapeUtil::MakeShape(F32, {2, 2, 32, 32}), convolution, {1, 2, 3, 0}));
+  builder.AddInstruction(HloInstruction::CreateConvolve(
+      ShapeUtil::MakeShape(F32, {32, 2, 2, 32}), activations, gradients,
+      conv_window, tf_default_dnums_for_backward_filter_));
 
   auto module = CreateNewModule();
   HloComputation* entry_computation =
@@ -258,8 +250,10 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveEvenPadding) {
   conv_dnums.set_output_batch_dimension(0);
   conv_dnums.set_input_feature_dimension(1);
   conv_dnums.set_output_feature_dimension(1);
-  conv_dnums.add_spatial_dimensions(2);
-  conv_dnums.add_spatial_dimensions(3);
+  conv_dnums.add_input_spatial_dimensions(2);
+  conv_dnums.add_output_spatial_dimensions(2);
+  conv_dnums.add_input_spatial_dimensions(3);
+  conv_dnums.add_output_spatial_dimensions(3);
   conv_dnums.set_kernel_input_feature_dimension(0);
   conv_dnums.set_kernel_output_feature_dimension(1);
   conv_dnums.add_kernel_spatial_dimensions(2);
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
index e79d0a4c795c16a5c3298f69b3e3dcea55a97b9c..899cc5c83b99f1bb6154f883ca17871863e1f457 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
@@ -29,12 +29,12 @@ namespace se = ::perftools::gputools;
 namespace xla {
 namespace gpu {
 
+using se::dnn::AlgorithmDesc;
 using se::dnn::BatchDescriptor;
 using se::dnn::ConvolutionDescriptor;
 using se::dnn::DataLayout;
 using se::dnn::FilterDescriptor;
 using se::dnn::FilterLayout;
-using se::dnn::AlgorithmDesc;
 
 ConvolveScratchAllocator::ConvolveScratchAllocator(
     int device_ordinal, DeviceMemoryAllocator* memory_allocator)
@@ -131,8 +131,9 @@ tensorflow::Status ConvolutionThunk::ExecuteOnStream(
   const int effective_num_dimensions = std::max(2, num_dimensions);
 
   CHECK_EQ(F32, output_shape_.element_type());
-  CHECK_EQ(num_dimensions, dim_nums_.spatial_dimensions_size());
+  CHECK_EQ(num_dimensions, dim_nums_.input_spatial_dimensions_size());
   CHECK_EQ(num_dimensions, dim_nums_.kernel_spatial_dimensions_size());
+  CHECK_EQ(num_dimensions, dim_nums_.output_spatial_dimensions_size());
   for (const WindowDimension& dim : window_.dimensions()) {
     CHECK_EQ(dim.padding_low(), dim.padding_high());
   }
@@ -148,7 +149,7 @@ tensorflow::Status ConvolutionThunk::ExecuteOnStream(
     // Note that the dimensions are reversed. The same holds below.
     input_descriptor.set_spatial_dim(
         static_cast<se::dnn::DimIndex>(effective_num_dimensions - dim - 1),
-        input_shape_.dimensions(dim_nums_.spatial_dimensions(dim)));
+        input_shape_.dimensions(dim_nums_.input_spatial_dimensions(dim)));
   }
 
   FilterDescriptor filter_descriptor(effective_num_dimensions);
@@ -182,7 +183,7 @@ tensorflow::Status ConvolutionThunk::ExecuteOnStream(
   for (int dim = 0; dim < num_dimensions; ++dim) {
     output_descriptor.set_spatial_dim(
         static_cast<se::dnn::DimIndex>(effective_num_dimensions - dim - 1),
-        output_shape_.dimensions(dim_nums_.spatial_dimensions(dim)));
+        output_shape_.dimensions(dim_nums_.output_spatial_dimensions(dim)));
   }
 
   // Add a singleton dimension in the 1D convolution case.
@@ -258,22 +259,19 @@ tensorflow::Status ConvolutionThunk::Convolve(
 }
 
 std::vector<AlgorithmDesc> ConvolutionThunk::GetAlgorithms(
-    se::StreamExecutor* stream_exec) const {
+    bool with_winograd_nonfused, se::StreamExecutor* stream_exec) const {
   std::vector<AlgorithmDesc> algorithms;
-  // TODO(yangzihao): Currently disable the use of winograd nonfused in XLA
-  // by default. Should send in conv parameters and enable it when
-  // ShouldIncludeWinogradNonfusedAlgo() returns true.
   switch (convolution_kind_) {
     case ConvolutionKind::kBackwardFilter:
       CHECK(stream_exec->GetConvolveBackwardFilterAlgorithms(
-          /*with_winograd_nonfused=*/false, &algorithms));
+          with_winograd_nonfused, &algorithms));
       break;
     case ConvolutionKind::kBackwardInput:
       CHECK(stream_exec->GetConvolveBackwardDataAlgorithms(
-          /*with_winograd_nonfused=*/false, &algorithms));
+          with_winograd_nonfused, &algorithms));
       break;
     case ConvolutionKind::kForward:
-      CHECK(stream_exec->GetConvolveAlgorithms(/*with_winograd_nonfused=*/false,
+      CHECK(stream_exec->GetConvolveAlgorithms(with_winograd_nonfused,
                                                &algorithms));
       break;
   }
@@ -287,6 +285,26 @@ static string AlgorithmToString(const se::dnn::AlgorithmDesc& algo) {
   return tensorflow::strings::StrCat(algo.algo_id());
 }
 
+// Determines whether we can safely perform a winograd non-fused convolution for
+// the given input and output descriptors.  This works around b/68264959, an
+// integer overflow in cuDNNv5 and cuDNNv6.
+static bool ShouldIncludeWinogradNonfusedAlgo(
+    const BatchDescriptor& input_descriptor,
+    const BatchDescriptor& output_descriptor) {
+  int64 batch = input_descriptor.count();
+  int64 in_depths = input_descriptor.feature_map_count();
+  int64 in_rows = input_descriptor.height();
+  int64 in_cols = input_descriptor.width();
+  int64 out_depths = output_descriptor.feature_map_count();
+
+  int64 total_size = 16 * std::ceil(batch / 16.0) *
+                     std::max(in_depths, out_depths) * in_cols * in_rows *
+                     sizeof(float);
+  int64 threshold = 1L << 31;
+
+  return total_size < threshold;
+}
+
 tensorflow::Status ConvolutionThunk::ConvolveWithTune(
     const BatchDescriptor& input_descriptor, se::DeviceMemory<float> input_data,
     const FilterDescriptor& filter_descriptor,
@@ -296,16 +314,22 @@ tensorflow::Status ConvolutionThunk::ConvolveWithTune(
     const ConvolutionDescriptor& convolution_descriptor,
     const BufferAllocations& buffer_allocations, se::Stream* stream) {
   // TODO(b/29126320): Try cudnn v5's new auto-tuner when it's rolled out.
-  if (best_algorithm_.algorithm().is_default()) {
+  if (!best_algorithm_.has_value()) {
+    best_algorithm_.emplace();
+
     // Auto-tuning either is disabled or only happens in the first run of this
     // function.
     VLOG(2) << "Profiling for best convolution algorithm used for "
                "ConvolutionThunk: "
             << this;
 
+    bool with_winograd_nonfused =
+        ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor);
+
     se::dnn::ProfileResult best_result;
     se::dnn::ProfileResult best_result_without_scratch;
-    std::vector<AlgorithmDesc> algorithms = GetAlgorithms(stream->parent());
+    std::vector<AlgorithmDesc> algorithms =
+        GetAlgorithms(with_winograd_nonfused, stream->parent());
     for (auto algorithm : algorithms) {
       ConvolveScratchAllocator scratch_allocator(
           buffer_allocations.device_ordinal(),
@@ -341,35 +365,35 @@ tensorflow::Status ConvolutionThunk::ConvolveWithTune(
     }
 
     if (best_result.is_valid()) {
-      best_algorithm_.set_algorithm(best_result.algorithm());
+      best_algorithm_->set_algorithm(best_result.algorithm());
     } else {
       LOG(ERROR) << "No convolution algorithm works with profiling. Fall back "
                     "to the default algorithm.";
-      best_algorithm_.set_algorithm(AlgorithmDesc());
+      best_algorithm_->set_algorithm(AlgorithmDesc());
     }
 
     if (best_result_without_scratch.is_valid()) {
-      best_algorithm_.set_algorithm_no_scratch(
+      best_algorithm_->set_algorithm_no_scratch(
           best_result_without_scratch.algorithm());
     } else {
       LOG(ERROR) << "No convolution algorithm without scratch works with "
                     "profiling. Fall back "
                     "to the default algorithm.";
-      best_algorithm_.set_algorithm_no_scratch(AlgorithmDesc());
+      best_algorithm_->set_algorithm_no_scratch(AlgorithmDesc());
     }
   }
 
   {
     VLOG(2) << "Using convolution algorithm ("
-            << AlgorithmToString(best_algorithm_.algorithm()) << ", "
-            << AlgorithmToString(best_algorithm_.algorithm_no_scratch())
+            << AlgorithmToString(best_algorithm_->algorithm()) << ", "
+            << AlgorithmToString(best_algorithm_->algorithm_no_scratch())
             << ") for ConvolutionThunk: " << this;
     ConvolveScratchAllocator scratch_allocator(
         buffer_allocations.device_ordinal(),
         buffer_allocations.memory_allocator());
     return Convolve(input_descriptor, input_data, filter_descriptor,
                     filter_data, output_descriptor, output_data,
-                    convolution_descriptor, best_algorithm_, stream,
+                    convolution_descriptor, *best_algorithm_, stream,
                     &scratch_allocator, nullptr);
   }
 }
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
index 13432301b2af34ab4bd0864e39ce22366cc1d11d..7c25a2e6450e30292667ecd7de54b50ac2450767 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
 namespace xla {
@@ -87,6 +88,14 @@ class ConvolutionThunk : public Thunk {
       const BufferAllocations& buffer_allocations,
       perftools::gputools::Stream* stream) override;
 
+  // Returns true if the next run of ExecuteOnStream will do autotuning.  If so,
+  // we want the GPU to be quiescent during autotuning, so as not to introduce
+  // noise in our results.
+  bool ShouldHaltAllActivityBeforeRunning(
+      perftools::gputools::Stream*) override {
+    return !best_algorithm_.has_value();
+  }
+
  private:
   tensorflow::Status ConvolveWithTune(
       const perftools::gputools::dnn::BatchDescriptor& input_descriptor,
@@ -116,13 +125,15 @@ class ConvolutionThunk : public Thunk {
 
   // Returns the convolve algorithms that can be used for this ConvolutionThunk.
   std::vector<perftools::gputools::dnn::AlgorithmDesc> GetAlgorithms(
+      bool with_winograd_nonfused,
       perftools::gputools::StreamExecutor* stream_exec) const;
 
   // Fastest cuDNN convolution algorithm for this thunk learned from
   // auto-tuning. If auto-tuning is disabled or failed, best_algorithm_ is set
-  // to the default value indicating cuDNN's convolution will choose
-  // the best algorithm from some heuristics based on its parameters.
-  perftools::gputools::dnn::AlgorithmConfig best_algorithm_;
+  // to the default value, indicating cuDNN's convolution will choose the best
+  // algorithm from some heuristics based on its parameters.
+  tensorflow::gtl::optional<perftools::gputools::dnn::AlgorithmConfig>
+      best_algorithm_;
 
   const ConvolutionKind convolution_kind_;
 
diff --git a/tensorflow/compiler/xla/service/gpu/copy_insertion.cc b/tensorflow/compiler/xla/service/gpu/copy_insertion.cc
deleted file mode 100644
index 3dc85552015be67c20db9099704334c864b44b51..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/gpu/copy_insertion.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/gpu/copy_insertion.h"
-
-#include <memory>
-#include <set>
-#include <vector>
-
-#include "tensorflow/compiler/xla/service/copy_insertion.h"
-#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/service/logical_buffer.h"
-#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace xla {
-namespace gpu {
-
-StatusOr<bool> GpuCopyInsertion::Run(HloModule* module) {
-  TF_ASSIGN_OR_RETURN(bool changed, CopyInsertion::Run(module));
-
-  TF_ASSIGN_OR_RETURN(auto points_to_analysis,
-                      TuplePointsToAnalysis::Run(module));
-
-  // Make sure all operands of a library call are in memory instead of constants
-  // in IR. The top-level (index {}) of the points-to set of each operand
-  // indicates the source(s) of the array buffer. If any of these are constant,
-  // then add a copy to materialize the array.
-  HloComputation* computation = module->entry_computation();
-  for (HloInstruction* hlo : computation->MakeInstructionPostOrder()) {
-    if (ImplementedAsLibraryCall(*hlo)) {
-      for (int64 i = 0; i < hlo->operand_count(); ++i) {
-        HloInstruction* operand = hlo->mutable_operand(i);
-        const PointsToSet& points_to =
-            points_to_analysis->GetPointsToSet(operand);
-        const auto& element = points_to.element(/*index=*/{});
-        if (std::any_of(element.begin(), element.end(),
-                        [](const LogicalBuffer* buffer_source) {
-                          return buffer_source->instruction()->opcode() ==
-                                 HloOpcode::kConstant;
-                        })) {
-          TF_ASSIGN_OR_RETURN(HloInstruction * copy,
-                              CopyInsertion::FindOrInsertCopy(operand));
-          TF_RETURN_IF_ERROR(hlo->ReplaceOperandWith(i, copy));
-          changed = true;
-        }
-      }
-    }
-  }
-
-  return changed;
-}
-
-}  // namespace gpu
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
index 6bf00cfb8a53723ae9608093480bf2eed10144dd..4b511cb4bb94addfae53d6b2e6d6f86d5b9afd84 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
@@ -135,10 +135,6 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitFloatBinaryOp(
   PrimitiveType rhs_input_type = op->operand(1)->shape().element_type();
   PrimitiveType output_type = op->shape().element_type();
   switch (op->opcode()) {
-    case HloOpcode::kAtan2:
-      return EmitLibdeviceMathCall("__nv_atan2", {lhs_value, rhs_value},
-                                   {lhs_input_type, rhs_input_type},
-                                   output_type);
     case HloOpcode::kRemainder: {
       return EmitLibdeviceMathCall("__nv_fmod", {lhs_value, rhs_value},
                                    {lhs_input_type, rhs_input_type},
@@ -199,29 +195,50 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitErfcInv(
   return EmitLibdeviceMathCall("__nv_erfcinv", {value}, {prim_type}, prim_type);
 }
 
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitLog(
+    PrimitiveType prim_type, llvm::Value* value) const {
+  return EmitLibdeviceMathCall("__nv_log", {value}, {prim_type}, prim_type);
+}
+
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitSin(
+    PrimitiveType prim_type, llvm::Value* value) const {
+  return EmitLibdeviceMathCall("__nv_sin", {value}, {prim_type}, prim_type);
+}
+
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitCos(
+    PrimitiveType prim_type, llvm::Value* value) const {
+  return EmitLibdeviceMathCall("__nv_cos", {value}, {prim_type}, prim_type);
+}
+
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitExp(
+    PrimitiveType prim_type, llvm::Value* value) const {
+  return EmitLibdeviceMathCall("__nv_exp", {value}, {prim_type}, prim_type);
+}
+
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitPow(PrimitiveType prim_type,
+                                                      llvm::Value* lhs,
+                                                      llvm::Value* rhs) const {
+  return EmitLibdeviceMathCall("__nv_pow", {lhs, rhs}, {prim_type, prim_type},
+                               prim_type);
+}
+
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitAtan2(
+    PrimitiveType prim_type, llvm::Value* lhs, llvm::Value* rhs) const {
+  return EmitLibdeviceMathCall("__nv_atan2", {lhs, rhs}, {prim_type, prim_type},
+                               prim_type);
+}
+
 StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitFloatUnaryOp(
     const HloInstruction* op, llvm::Value* operand_value) const {
   PrimitiveType input_type = op->operand(0)->shape().element_type();
   PrimitiveType output_type = op->shape().element_type();
   switch (op->opcode()) {
-    case HloOpcode::kExp:
-      return EmitLibdeviceMathCall("__nv_exp", {operand_value}, {input_type},
-                                   output_type);
     case HloOpcode::kFloor:
       return EmitLibdeviceMathCall("__nv_floor", {operand_value}, {input_type},
                                    output_type);
     case HloOpcode::kCeil:
       return EmitLibdeviceMathCall("__nv_ceil", {operand_value}, {input_type},
                                    output_type);
-    case HloOpcode::kLog:
-      return EmitLibdeviceMathCall("__nv_log", {operand_value}, {input_type},
-                                   output_type);
-    case HloOpcode::kCos:
-      return EmitLibdeviceMathCall("__nv_cos", {operand_value}, {input_type},
-                                   output_type);
-    case HloOpcode::kSin:
-      return EmitLibdeviceMathCall("__nv_sin", {operand_value}, {input_type},
-                                   output_type);
     case HloOpcode::kTanh:
       return EmitLibdeviceMathCall("__nv_tanh", {operand_value}, {input_type},
                                    output_type);
@@ -230,224 +247,6 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitFloatUnaryOp(
   }
 }
 
-StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitComplexBinaryOp(
-    const HloInstruction* op, llvm::Value* lhs_value,
-    llvm::Value* rhs_value) const {
-  PrimitiveType input_type = op->operand(0)->shape().element_type();
-  TF_RET_CHECK(primitive_util::IsComplexType(input_type));
-  PrimitiveType component_type =
-      primitive_util::ComplexComponentType(input_type);
-  switch (op->opcode()) {
-    case HloOpcode::kPower: {
-      // (a+bi)^(c+di) =
-      //    (a*a+b*b)^(0.5c) * exp(-d*atan2(b,a)) * (cos(q) + i*sin(q)),
-      //    where q = c*atan2(b,a)+0.5d*ln(a*a+b*b)
-      auto a = EmitExtractReal(lhs_value);
-      auto b = EmitExtractImag(lhs_value);
-      auto c = EmitExtractReal(rhs_value);
-      auto d = EmitExtractImag(rhs_value);
-      auto aa_p_bb = ir_builder_->CreateFAdd(ir_builder_->CreateFMul(a, a),
-                                             ir_builder_->CreateFMul(b, b));
-      auto one_half = llvm::ConstantFP::get(a->getType(), 0.5);
-      auto half_c = ir_builder_->CreateFMul(one_half, c);
-
-      TF_ASSIGN_OR_RETURN(
-          auto aa_p_bb_to_half_c,
-          EmitLibdeviceMathCall("__nv_pow", {aa_p_bb, half_c},
-                                {component_type, component_type},
-                                component_type));
-      auto neg_d = ir_builder_->CreateFNeg(d);
-      TF_ASSIGN_OR_RETURN(
-          auto arg_lhs, EmitLibdeviceMathCall("__nv_atan2", {b, a},
-                                              {component_type, component_type},
-                                              component_type));
-      auto neg_d_arg_lhs = ir_builder_->CreateFMul(neg_d, arg_lhs);
-      TF_ASSIGN_OR_RETURN(
-          auto e_to_neg_d_arg_lhs,
-          EmitLibdeviceMathCall("__nv_exp", {neg_d_arg_lhs}, {component_type},
-                                component_type));
-      auto coeff =
-          ir_builder_->CreateFMul(aa_p_bb_to_half_c, e_to_neg_d_arg_lhs);
-      TF_ASSIGN_OR_RETURN(
-          auto ln_aa_p_bb,
-          EmitLibdeviceMathCall("__nv_log", {aa_p_bb}, {component_type},
-                                component_type));
-      auto half_d = ir_builder_->CreateFMul(one_half, d);
-      auto q =
-          ir_builder_->CreateFAdd(ir_builder_->CreateFMul(c, arg_lhs),
-                                  ir_builder_->CreateFMul(half_d, ln_aa_p_bb));
-      TF_ASSIGN_OR_RETURN(
-          auto cos_q, EmitLibdeviceMathCall("__nv_cos", {q}, {component_type},
-                                            component_type));
-      TF_ASSIGN_OR_RETURN(
-          auto sin_q, EmitLibdeviceMathCall("__nv_sin", {q}, {component_type},
-                                            component_type));
-      return EmitComposeComplex(op, ir_builder_->CreateFMul(coeff, cos_q),
-                                ir_builder_->CreateFMul(coeff, sin_q));
-    }
-    default:
-      return ElementalIrEmitter::EmitComplexBinaryOp(op, lhs_value, rhs_value);
-  }
-}
-
-StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitComplexUnaryOp(
-    const HloInstruction* op, llvm::Value* operand_value) const {
-  PrimitiveType input_type = op->operand(0)->shape().element_type();
-  PrimitiveType component_type =
-      primitive_util::IsComplexType(input_type)
-          ? primitive_util::ComplexComponentType(input_type)
-          : input_type;
-
-  switch (op->opcode()) {
-    case HloOpcode::kLog: {
-      // log(a+bi) = .5*log(a^2+b^2) + i*atan2(b, a)
-      auto a = EmitExtractReal(operand_value);
-      auto b = EmitExtractImag(operand_value);
-      llvm::Type* llvm_ty = a->getType();
-      auto sum_sq = ir_builder_->CreateFAdd(ir_builder_->CreateFMul(a, a),
-                                            ir_builder_->CreateFMul(b, b));
-      TF_ASSIGN_OR_RETURN(
-          auto log_sum_sq,
-          EmitLibdeviceMathCall("__nv_log", {sum_sq}, {component_type},
-                                component_type));
-      TF_ASSIGN_OR_RETURN(
-          auto angle, EmitLibdeviceMathCall("__nv_atan2", {b, a},
-                                            {component_type, component_type},
-                                            component_type));
-      auto one_half = llvm::ConstantFP::get(llvm_ty, 0.5);
-      return EmitComposeComplex(
-          op, ir_builder_->CreateFMul(one_half, log_sum_sq), angle);
-    }
-    case HloOpcode::kExp: {
-      // e^(a+bi) = e^a*(cos(b)+sin(b)i)
-      auto b = EmitExtractImag(operand_value);
-      TF_ASSIGN_OR_RETURN(
-          auto exp_a,
-          EmitLibdeviceMathCall("__nv_exp", {EmitExtractReal(operand_value)},
-                                {component_type}, component_type));
-      TF_ASSIGN_OR_RETURN(
-          auto cos_b, EmitLibdeviceMathCall("__nv_cos", {b}, {component_type},
-                                            component_type));
-      TF_ASSIGN_OR_RETURN(
-          auto sin_b, EmitLibdeviceMathCall("__nv_sin", {b}, {component_type},
-                                            component_type));
-      return EmitComposeComplex(op, ir_builder_->CreateFMul(exp_a, cos_b),
-                                ir_builder_->CreateFMul(exp_a, sin_b));
-    }
-    case HloOpcode::kCos: {
-      // cos(a+bi) = .5(cos(a)*(e^-b+e^b) + i*sin(a)*(e^-b-e^b))
-      auto a = EmitExtractReal(operand_value);
-      auto llvm_ty = a->getType();
-      TF_ASSIGN_OR_RETURN(
-          auto exp_b,
-          EmitLibdeviceMathCall("__nv_exp", {EmitExtractImag(operand_value)},
-                                {component_type}, component_type));
-      TF_ASSIGN_OR_RETURN(
-          auto cos_a, EmitLibdeviceMathCall("__nv_cos", {a}, {component_type},
-                                            component_type));
-      TF_ASSIGN_OR_RETURN(
-          auto sin_a, EmitLibdeviceMathCall("__nv_sin", {a}, {component_type},
-                                            component_type));
-      auto half_exp_b =
-          ir_builder_->CreateFMul(llvm::ConstantFP::get(llvm_ty, 0.5), exp_b);
-      auto half_exp_neg_b =
-          ir_builder_->CreateFDiv(llvm::ConstantFP::get(llvm_ty, 0.5), exp_b);
-      return EmitComposeComplex(
-          op,
-          ir_builder_->CreateFMul(
-              cos_a, ir_builder_->CreateFAdd(half_exp_neg_b, half_exp_b)),
-          ir_builder_->CreateFMul(
-              sin_a, ir_builder_->CreateFSub(half_exp_neg_b, half_exp_b)));
-    }
-
-    case HloOpcode::kSin: {
-      // sin(a+bi) = 0.5(sin(a)*(e^b+e^-b) + i*cos(a)*(e^b-e^-b)
-      auto a = EmitExtractReal(operand_value);
-      auto llvm_ty = a->getType();
-      TF_ASSIGN_OR_RETURN(
-          auto exp_b,
-          EmitLibdeviceMathCall("__nv_exp", {EmitExtractImag(operand_value)},
-                                {component_type}, component_type));
-      TF_ASSIGN_OR_RETURN(
-          auto cos_a, EmitLibdeviceMathCall("__nv_cos", {a}, {component_type},
-                                            component_type));
-      TF_ASSIGN_OR_RETURN(
-          auto sin_a, EmitLibdeviceMathCall("__nv_sin", {a}, {component_type},
-                                            component_type));
-      auto half_exp_b =
-          ir_builder_->CreateFMul(llvm::ConstantFP::get(llvm_ty, 0.5), exp_b);
-      auto half_exp_neg_b =
-          ir_builder_->CreateFDiv(llvm::ConstantFP::get(llvm_ty, 0.5), exp_b);
-      return EmitComposeComplex(
-          op,
-          ir_builder_->CreateFMul(
-              sin_a, ir_builder_->CreateFAdd(half_exp_b, half_exp_neg_b)),
-          ir_builder_->CreateFMul(
-              cos_a, ir_builder_->CreateFSub(half_exp_b, half_exp_neg_b)));
-    }
-    case HloOpcode::kTanh: {
-      /*
-      tanh=(exp(x)-exp(-x)) / (exp(x)+exp(-x))
-      e^(a+bi) = e^a*(cos(b)+sin(b)i)
-      so tanh=(((cos(b)+sin(b)i)e^a - (cos(-b)+sin(-b)i)e^-a)) /
-              (((cos(b)+sin(b)i)e^a + (cos(-b)+sin(-b)i)e^-a))
-      cos(b)=cos(-b), sin(-b)=-sin(b)
-      so tanh=(((cos(b)+sin(b)i)e^a - (cos(b)-sin(b)i)e^-a)) /
-              (((cos(b)+sin(b)i)e^a + (cos(b)-sin(b)i)e^-a))
-             =(cos(b)e^a+i*sin(b)e^a + cos(b)(-e^-a)+i*sin(b)e^-a) /
-              (cos(b)e^a+i*sin(b)e^a + cos(b)e^-a+i*sin(b)(-e^-a))
-             =(cos(b)(e^a-e^-a) + i*sin(b)(e^a+e^-a)) /
-              (cos(b)(e^a+e^-a) + i*sin(b)(e^a-e^-a))
-      This is a complex division, so we can multiply by denom_conj/denom_conj
-             =(cos(b)(e^a-e^-a) + i*sin(b)(e^a+e^-a)) *
-              (cos(b)(e^a+e^-a) - i*sin(b)(e^a-e^-a)) /
-              ((cos(b)(e^a+e^-a))^2 + (sin(b)(e^a-e^-a))^2)
-             =(cos(b)^2(e^(2a)-e^(-2a)) + sin(b)^2(e^(2a)-e^(-2a)) +
-               i*(cos(b)sin(b)(e^a+e^-a)^2 - cos(b)sin(b)(e^a-e^-a)^2)) /
-              ((cos(b)(e^a+e^-a))^2 + (sin(b)(e^a-e^-a))^2)
-      */
-      auto a = EmitExtractReal(operand_value);
-      auto b = EmitExtractImag(operand_value);
-      TF_ASSIGN_OR_RETURN(
-          auto exp_a, EmitLibdeviceMathCall("__nv_exp", {a}, {component_type},
-                                            component_type));
-      TF_ASSIGN_OR_RETURN(
-          auto cos_b, EmitLibdeviceMathCall("__nv_cos", {b}, {component_type},
-                                            component_type));
-      TF_ASSIGN_OR_RETURN(
-          auto sin_b, EmitLibdeviceMathCall("__nv_sin", {b}, {component_type},
-                                            component_type));
-      auto exp_neg_a = ir_builder_->CreateFDiv(
-          llvm::ConstantFP::get(exp_a->getType(), 1), exp_a);
-      auto exp_2a_minus_exp_neg_2a = ir_builder_->CreateFSub(
-          ir_builder_->CreateFMul(exp_a, exp_a),
-          ir_builder_->CreateFMul(exp_neg_a, exp_neg_a));
-      auto cos_b_sq = ir_builder_->CreateFMul(cos_b, cos_b);
-      auto sin_b_sq = ir_builder_->CreateFMul(sin_b, sin_b);
-      auto real_num = ir_builder_->CreateFAdd(
-          ir_builder_->CreateFMul(cos_b_sq, exp_2a_minus_exp_neg_2a),
-          ir_builder_->CreateFMul(sin_b_sq, exp_2a_minus_exp_neg_2a));
-      auto cos_b_sin_b = ir_builder_->CreateFMul(cos_b, sin_b);
-      auto exp_a_plus_exp_neg_a = ir_builder_->CreateFAdd(exp_a, exp_neg_a);
-      auto exp_a_plus_exp_neg_a_sq =
-          ir_builder_->CreateFMul(exp_a_plus_exp_neg_a, exp_a_plus_exp_neg_a);
-      auto exp_a_minus_exp_neg_a = ir_builder_->CreateFSub(exp_a, exp_neg_a);
-      auto exp_a_minus_exp_neg_a_sq =
-          ir_builder_->CreateFMul(exp_a_minus_exp_neg_a, exp_a_minus_exp_neg_a);
-      auto imag_num = ir_builder_->CreateFMul(
-          cos_b_sin_b, ir_builder_->CreateFSub(exp_a_plus_exp_neg_a_sq,
-                                               exp_a_minus_exp_neg_a_sq));
-      auto denom = ir_builder_->CreateFAdd(
-          ir_builder_->CreateFMul(cos_b_sq, exp_a_plus_exp_neg_a_sq),
-          ir_builder_->CreateFMul(sin_b_sq, exp_a_minus_exp_neg_a_sq));
-      return EmitComposeComplex(op, ir_builder_->CreateFDiv(real_num, denom),
-                                ir_builder_->CreateFDiv(imag_num, denom));
-    }
-    default:
-      return ElementalIrEmitter::EmitComplexUnaryOp(op, operand_value);
-  }
-}
-
 llvm::Value* GpuElementalIrEmitter::EmitDeviceFunctionCall(
     const string& callee_name,
     tensorflow::gtl::ArraySlice<llvm::Value*> operands,
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
index 6a537d015209bc507af36b13eeb5d69ce58d8fea..77d4569b1e8e398005e8f517ff086a77aedd382d 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
@@ -54,20 +54,31 @@ class GpuElementalIrEmitter : public ElementalIrEmitter {
   StatusOr<llvm::Value*> EmitFloatUnaryOp(
       const HloInstruction* op, llvm::Value* operand_value) const override;
 
-  StatusOr<llvm::Value*> EmitComplexUnaryOp(
-      const HloInstruction* op, llvm::Value* operand_value) const override;
-
   StatusOr<llvm::Value*> EmitFloatBinaryOp(
       const HloInstruction* op, llvm::Value* lhs_value,
       llvm::Value* rhs_value) const override;
 
-  StatusOr<llvm::Value*> EmitComplexBinaryOp(
-      const HloInstruction* op, llvm::Value* lhs_value,
-      llvm::Value* rhs_value) const override;
-
   StatusOr<llvm::Value*> EmitErfcInv(PrimitiveType prim_type,
                                      llvm::Value* value) const override;
 
+  StatusOr<llvm::Value*> EmitLog(PrimitiveType prim_type,
+                                 llvm::Value* value) const override;
+
+  StatusOr<llvm::Value*> EmitSin(PrimitiveType prim_type,
+                                 llvm::Value* value) const override;
+
+  StatusOr<llvm::Value*> EmitCos(PrimitiveType prim_type,
+                                 llvm::Value* value) const override;
+
+  StatusOr<llvm::Value*> EmitExp(PrimitiveType prim_type,
+                                 llvm::Value* value) const override;
+
+  StatusOr<llvm::Value*> EmitPow(PrimitiveType prim_type, llvm::Value* lhs,
+                                 llvm::Value* rhs) const override;
+
+  StatusOr<llvm::Value*> EmitAtan2(PrimitiveType prim_type, llvm::Value* lhs,
+                                   llvm::Value* rhs) const override;
+
   llvm::Value* EmitThreadId() const override;
 
  private:
diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
index e784046450ed1cca088770c65c786e80adda869f..8e3aebbc12b5e6d746700956b9743bc94db50167 100644
--- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
@@ -264,9 +264,9 @@ tensorflow::Status GemmThunk::ExecuteOnStream(
 
   auto make_descriptor = [this](se::DeviceMemoryBase data, const Shape& shape,
                                 bool transpose) -> MatrixDescriptor {
-    bool is_row_major = shape.layout().minor_to_major(0) != 0;
-    bool layout_mismatch = shape.layout().minor_to_major(0) !=
-                           output_shape_.layout().minor_to_major(0);
+    bool is_row_major = LayoutUtil::Minor(shape.layout(), 0) != 0;
+    bool layout_mismatch = LayoutUtil::Minor(shape.layout(), 0) !=
+                           LayoutUtil::Minor(output_shape_.layout(), 0);
     return MatrixDescriptor(data, transpose ^ layout_mismatch,
                             shape.dimensions(is_row_major),
                             shape.dimensions(!is_row_major));
@@ -320,7 +320,7 @@ tensorflow::Status GemmThunk::ExecuteOnStream(
   };
 
   bool launch_ok;
-  if (output_shape_.layout().minor_to_major(0) == 0) {
+  if (LayoutUtil::Minor(output_shape_.layout(), 0) == 0) {
     launch_ok = launch(
         lhs_descriptor, rhs_descriptor,
         MatrixDescriptor(output_data, false, output_num_rows, output_num_cols),
diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
index 983cb872924f22be0dfad8aa9ad86f233b909c46..8c6a1f51a8a09ef78950dfe7e89994a3fe247f49 100644
--- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
@@ -52,6 +52,15 @@ class GemmThunk : public Thunk {
       const BufferAllocations& buffer_allocations,
       perftools::gputools::Stream* stream) override;
 
+  // Returns true if we'll perform autotuning if run on the given stream.  If
+  // so, we want the GPU to be quiescent during autotuning, so as not to
+  // introduce noise in our results.
+  bool ShouldHaltAllActivityBeforeRunning(
+      perftools::gputools::Stream* stream) override {
+    return autotune_results_.count(
+               stream->parent()->GetDeviceDescription().name()) != 0;
+  }
+
  private:
   const BufferAllocation::Slice lhs_buffer_;
   const BufferAllocation::Slice rhs_buffer_;
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index 23fb308ec6b4ec363cfba318fa4e1236766069ae..fc3b299936779dc938a6777e7da7907a3b43a3be 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -27,21 +27,22 @@ limitations under the License.
 #include "tensorflow/compiler/xla/protobuf_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
-#include "tensorflow/compiler/xla/service/batchnorm_rewriter.h"
+#include "tensorflow/compiler/xla/service/batchnorm_expander.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
+#include "tensorflow/compiler/xla/service/dot_decomposer.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/gpu/convolution_folding.h"
-#include "tensorflow/compiler/xla/service/gpu/copy_insertion.h"
 #include "tensorflow/compiler/xla/service/gpu/fusion_merger.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h"
 #include "tensorflow/compiler/xla/service/gpu/hlo_schedule.h"
 #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emitter.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h"
-#include "tensorflow/compiler/xla/service/gpu/layout_assignment.h"
 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
 #include "tensorflow/compiler/xla/service/gpu/pad_insertion.h"
 #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
@@ -126,7 +127,7 @@ string GetLibdeviceDir(const string& config_cuda_data_dir) {
 
 // Runs optimization passes on the given HLO module.
 tensorflow::Status OptimizeHloModule(
-    HloModule* hlo_module, const se::DeviceDescription& device_desc,
+    HloModule* hlo_module,
     const HloCostAnalysis::ShapeSizeFunction& shape_size_function) {
   {
     HloPassPipeline pipeline("optimization");
@@ -137,15 +138,15 @@ tensorflow::Status OptimizeHloModule(
 
     // TODO(b/64094172): make Call work on GPU instead of inlining.
     pipeline.AddPass<CallInliner>();
-
+    pipeline.AddPass<DotDecomposer>();
     {
       auto& pass =
           pipeline.AddPass<HloPassFix<HloPassPipeline>>("simplification");
       pass.AddInvariantChecker<HloVerifier>(shape_size_function);
 
-      // TODO(b/62764704): Do not rewrite on GPU, use cuDNN's BatchNorm APIs
+      // TODO(b/62764704): Do not expand on GPU, use cuDNN's BatchNorm APIs
       // instead.
-      pass.AddPass<BatchNormRewriter>(
+      pass.AddPass<BatchNormExpander>(
           /*rewrite_training_op=*/true,
           /*rewrite_inference_op=*/true,
           /*rewrite_grad_op=*/true,
@@ -224,9 +225,8 @@ tensorflow::Status PrepareHloModuleForIrEmitting(
   // (and sometime after) copy insertion, to avoid dead code from interfering
   // with the rewrites.
   pipeline.AddPass<HloDCE>();
-  pipeline.AddPass<GpuCopyInsertion>();
-  pipeline.AddPass<HloDCE>();
   pipeline.AddPass<FlattenCallGraph>();
+  pipeline.AddPass<GpuCopyInsertion>();
   return pipeline.Run(hlo_module).status();
 }
 
@@ -295,21 +295,26 @@ StatusOr<std::vector<uint8>> CompilePtx(const string& ptx, int cc_major,
 }  // namespace
 
 GpuCompiler::GpuCompiler()
-    : pointer_size_(llvm::DataLayout(kDataLayout).getPointerSize()) {}
+    : pointer_size_(llvm::DataLayout(kDataLayout)
+                        .getPointerSize(0 /* default address space */)) {}
+
+StatusOr<std::unique_ptr<HloModule>> GpuCompiler::RunHloPasses(
+    std::unique_ptr<HloModule> module, se::StreamExecutor* /*stream_exec*/) {
+  XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunHloPasses");
+  Tracing::TraceMe annotation("HLO Transforms", module->name(),
+                              /*is_expensive=*/true);
+  TF_RETURN_IF_ERROR(OptimizeHloModule(module.get(), ShapeSizeBytesFunction()));
+  return std::move(module);
+}
 
-StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
+StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
     std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec) {
+  XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend");
+
   TF_RET_CHECK(stream_exec != nullptr);
 
-  {
-    Tracing::TraceMe annotation("HLO Transforms", module->name(),
-                                /*is_expensive=*/true);
-    TF_RETURN_IF_ERROR(OptimizeHloModule(module.get(),
-                                         stream_exec->GetDeviceDescription(),
-                                         ShapeSizeBytesFunction()));
-    TF_RETURN_IF_ERROR(
-        PrepareHloModuleForIrEmitting(module.get(), ShapeSizeBytesFunction()));
-  }
+  TF_RETURN_IF_ERROR(
+      PrepareHloModuleForIrEmitting(module.get(), ShapeSizeBytesFunction()));
 
   llvm::LLVMContext llvm_context;
   std::string buffer;
@@ -362,8 +367,11 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
   HloComputation* entry_computation = module->entry_computation();
   IrEmitterUnnested ir_emitter(module->config(), entry_computation,
                                &ir_emitter_context);
-  TF_RETURN_IF_ERROR(
-      entry_computation->root_instruction()->Accept(&ir_emitter));
+  {
+    XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend - IR emission");
+    TF_RETURN_IF_ERROR(
+        entry_computation->root_instruction()->Accept(&ir_emitter));
+  }
 
   if (user_pre_optimization_hook_) {
     TF_CHECK_OK(user_pre_optimization_hook_(llvm_module));
@@ -412,9 +420,12 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
     cc_minor = 0;
   }
 
-  TF_ASSIGN_OR_RETURN(string ptx,
-                      CompileToPtx(&llvm_module, {cc_major, cc_minor},
-                                   module->config(), libdevice_dir));
+  string ptx;
+  {
+    XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend - CompileToPtx");
+    TF_ASSIGN_OR_RETURN(ptx, CompileToPtx(&llvm_module, {cc_major, cc_minor},
+                                          module->config(), libdevice_dir));
+  }
 
   if (!ir_dump_directory.empty()) {
     TF_RETURN_IF_ERROR(llvm_ir::DumpIRToDirectory(
@@ -456,10 +467,20 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
   VLOG(2) << "Printing the thunk schedule...";
   XLA_VLOG_LINES(2, thunk_schedule->ToString());
 
-  auto* gpu_executable =
-      new GpuExecutable(ptx, cubin, {cc_major, cc_minor},
-                        std::move(thunk_schedule), std::move(module),
-                        std::move(buffer_assignment), ShapeSizeBytesFunction());
+  std::unique_ptr<HloProfileIndexMap> profile_index_map;
+  std::unique_ptr<HloProfilePrinter> profile_printer;
+
+  if (module->config().hlo_profiling_enabled()) {
+    HloCostAnalysis cost_analysis(ShapeSizeBytesFunction());
+    profile_index_map = MakeUnique<HloProfileIndexMap>(*module);
+    profile_printer =
+        CreateHloProfilePrinter(*profile_index_map, cost_analysis);
+  }
+
+  auto* gpu_executable = new GpuExecutable(
+      ptx, cubin, {cc_major, cc_minor}, std::move(thunk_schedule),
+      std::move(module), std::move(buffer_assignment),
+      std::move(profile_printer), std::move(profile_index_map));
   if (embed_ir_in_executable) {
     DCHECK_NE("", ir_module_string_before_opt);
     gpu_executable->set_ir_module_string(ir_module_string_before_opt);
@@ -470,6 +491,7 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
 std::vector<uint8> GpuCompiler::CompilePtxOrGetCachedResult(const string& ptx,
                                                             int cc_major,
                                                             int cc_minor) {
+  XLA_SCOPED_LOGGING_TIMER("GpuCompiler::CompilePtxOrGetCachedResult");
   Tracing::TraceMe annotation("PTX->CUBIN", /*is_expensive=*/true);
   bool inserted;
   decltype(compilation_cache_.begin()) iter;
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
index fe5fce615fc1fbf12b14d626398b56dc7ece81e8..18e34340205b6f51497e26c45520799d21c55a46 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
@@ -49,7 +49,11 @@ class GpuCompiler : public LLVMCompiler {
   //        stream_execs)
   using LLVMCompiler::Compile;
 
-  StatusOr<std::unique_ptr<Executable>> Compile(
+  StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
+      std::unique_ptr<HloModule> module,
+      perftools::gputools::StreamExecutor* stream_exec) override;
+
+  StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> module,
       perftools::gputools::StreamExecutor* stream_exec) override;
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
new file mode 100644
index 0000000000000000000000000000000000000000..33d739b79d3664fec3586bbc924b7fa2e10d3256
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
@@ -0,0 +1,112 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h"
+
+#include <memory>
+#include <set>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/call_graph.h"
+#include "tensorflow/compiler/xla/service/copy_insertion.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+
+namespace gpu {
+
+StatusOr<HloInstruction*> GpuCopyInsertion::FindOrInsertCopy(
+    HloInstruction* hlo) {
+  HloInstruction*& copy = inserted_copies_[hlo];
+  if (copy == nullptr) {
+    TF_ASSIGN_OR_RETURN(copy, hlo->parent()->DeepCopyInstruction(hlo));
+  }
+  return copy;
+}
+
+StatusOr<bool> GpuCopyInsertion::Run(HloModule* module) {
+  CopyInsertion generic_copy_insertion;
+
+  TF_ASSIGN_OR_RETURN(bool changed, generic_copy_insertion.Run(module));
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloDataflowAnalysis> dataflow,
+                      HloDataflowAnalysis::Run(module));
+
+  // Make sure all operands of a library call are in memory instead of constants
+  // in IR.
+  for (HloInstruction* hlo :
+       module->entry_computation()->MakeInstructionPostOrder()) {
+    if (ImplementedAsLibraryCall(*hlo)) {
+      for (int64 i = 0; i < hlo->operand_count(); ++i) {
+        HloInstruction* operand = hlo->mutable_operand(i);
+        TF_RET_CHECK(ShapeUtil::IsArray(operand->shape()));
+        const auto& values = dataflow->GetValueSet(operand).values();
+        if (std::any_of(values.begin(), values.end(),
+                        [](const HloValue* value) {
+                          return value->defining_instruction()->opcode() ==
+                                 HloOpcode::kConstant;
+                        })) {
+          TF_ASSIGN_OR_RETURN(HloInstruction * copy, FindOrInsertCopy(operand));
+          TF_RETURN_IF_ERROR(hlo->ReplaceOperandWith(i, copy));
+          changed = true;
+        }
+      }
+    }
+  }
+
+  // Init values of a while node cannot be constants. Insert copies for any
+  // constants found at the operand of a while.
+  tensorflow::gtl::FlatSet<HloInstruction*> copied_constants;
+  for (HloComputation* computation : module->computations()) {
+    for (HloInstruction* instruction : computation->instructions()) {
+      if (instruction->opcode() != HloOpcode::kWhile) {
+        continue;
+      }
+      for (auto& pair :
+               dataflow->GetInstructionValueSet(instruction->operand(0))) {
+        const HloValueSet& value_set = pair.second;
+        for (const HloValue* value : value_set.values()) {
+          if (value->defining_instruction()->opcode() ==
+              HloOpcode::kConstant &&
+              !ContainsKey(copied_constants, value->defining_instruction())) {
+            HloInstruction* constant = value->defining_instruction();
+            TF_ASSIGN_OR_RETURN(HloInstruction * copy,
+                                FindOrInsertCopy(constant));
+            TF_RETURN_IF_ERROR(constant->ReplaceAllUsesWith(copy));
+            copied_constants.insert(constant);
+            changed = true;
+          }
+        }
+      }
+    }
+  }
+
+  // The GPU backend needs additional copies added due to deficiencies in
+  // buffer assignment.
+  TF_ASSIGN_OR_RETURN(bool buffer_assignment_changed,
+                      CopyInsertion::AddCopiesForBufferAssignment(module));
+
+  return changed || buffer_assignment_changed;
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/copy_insertion.h b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h
similarity index 56%
rename from tensorflow/compiler/xla/service/gpu/copy_insertion.h
rename to tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h
index 11077dad2e5506eab4fa84d47ad13a26ed1c035a..4d77f337e6eb20f7d79acc0829fde26bbe443f25 100644
--- a/tensorflow/compiler/xla/service/gpu/copy_insertion.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_COPY_INSERTION_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_COPY_INSERTION_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_COPY_INSERTION_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_COPY_INSERTION_H_
 
-#include "tensorflow/compiler/xla/service/copy_insertion.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 
 namespace xla {
 namespace gpu {
@@ -25,12 +25,23 @@ namespace gpu {
 // Besides the modifications made by the generic xla::CopyInsertion, this
 // GPU-specific copy insertion also materializes operands of library calls by
 // inserting kCopy instructions.
-class GpuCopyInsertion : public CopyInsertion {
+class GpuCopyInsertion : public HloPassInterface {
  public:
+  tensorflow::StringPiece name() const override { return "copy-insertion"; }
+
   StatusOr<bool> Run(HloModule* module) override;
+
+ protected:
+  // Returns a copy of `hlo`. Looks in inserted_copies_ first to avoid making
+  // duplicate copies.
+  StatusOr<HloInstruction*> FindOrInsertCopy(HloInstruction* hlo);
+
+  // A map containing all copies inserted to materialize operands of library
+  // calls. The key is the copied instruction and the value is the copy.
+  tensorflow::gtl::FlatMap<HloInstruction*, HloInstruction*> inserted_copies_;
 };
 
 }  // namespace gpu
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_COPY_INSERTION_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_COPY_INSERTION_H_
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index c6f23f9b0506186c4f76a887e6a540dafdd79962..366d87e9c30ed043b38c8e0cea889d5d90e7c8d9 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -69,7 +69,7 @@ class HloExecutionProfiler {
   ~HloExecutionProfiler() {
     if (do_profile_) {
       stream_->ThenStopTimer(execution_timer_.get());
-      stream_->BlockHostUntilDone();
+      stream_->BlockHostUntilDone().IgnoreError();
       profile_->set_total_cycles_executed(
           *computation_, execution_timer_->Nanoseconds() * clock_rate_ghz_);
     }
@@ -87,7 +87,7 @@ class HloExecutionProfiler {
   void FinishOperation(const HloInstruction* hlo_instruction) {
     if (do_profile_) {
       stream_->ThenStopTimer(per_op_timer_.get());
-      stream_->BlockHostUntilDone();
+      stream_->BlockHostUntilDone().IgnoreError();
       profile_->SetCyclesTakenBy(
           hlo_instruction, per_op_timer_->Nanoseconds() * clock_rate_ghz_);
     }
@@ -113,14 +113,15 @@ GpuExecutable::GpuExecutable(
     std::unique_ptr<const ThunkSchedule> thunk_schedule,
     std::unique_ptr<const HloModule> hlo_module,
     std::unique_ptr<const BufferAssignment> assignment,
-    HloCostAnalysis::ShapeSizeFunction shape_size_function)
-    : Executable(std::move(hlo_module)),
+    std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
+    : Executable(std::move(hlo_module), std::move(hlo_profile_printer),
+                 std::move(hlo_profile_index_map)),
       ptx_(ptx),
       cubin_(cubin),
       compute_capability_(compute_capability),
       thunk_schedule_(std::move(thunk_schedule)),
-      assignment_(std::move(assignment)),
-      shape_size_function_(std::move(shape_size_function)) {}
+      assignment_(std::move(assignment)) {}
 
 Status GpuExecutable::ExecuteThunks(
     const ServiceExecutableRunOptions* run_options,
@@ -166,9 +167,16 @@ Status GpuExecutable::ExecuteThunks(
       stream->ThenWaitFor(FindOrDie(thunk_to_finish_event, dependency).get());
     }
 
+    // If this thunk requests it, wait for all currently-executing thunks to
+    // finish.  This is useful e.g. if the thunk is about to perform autotuning.
+    if (thunk->ShouldHaltAllActivityBeforeRunning(stream)) {
+      TF_RETURN_IF_ERROR(main_stream->BlockHostUntilDone());
+    }
+
     profiler.StartOperation();
     VLOG(2) << "Executing the thunk for "
-            << thunk->hlo_instruction()->ToString();
+            << thunk->hlo_instruction()->ToString() << " on stream "
+            << stream_no;
     TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(buffer_allocations, stream));
     if (thunk_schedule_->Depended(thunk)) {
       auto finish_event = MakeUnique<se::Event>(main_stream->parent());
@@ -183,90 +191,16 @@ Status GpuExecutable::ExecuteThunks(
   // Make sure kernels are completed before deallocating temporary buffers.
   // TODO(b/30100571): we could potentially postpone deallocating the temp
   // buffers until a different computation is executed.
-  if (block_host_until_done && !main_stream->BlockHostUntilDone()) {
-    return InternalError("Failed to complete all kernels launched on stream %p",
-                         main_stream);
-  }
-
-  return Status::OK();
-}
-
-StatusOr<se::DeviceMemoryBase> GpuExecutable::ExecuteOnStream(
-    const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
-    HloExecutionProfile* hlo_execution_profile) {
-  se::Stream* stream = run_options->stream();
-  DeviceMemoryAllocator* memory_allocator = run_options->allocator();
-
-  BufferAllocations::Builder buffer_allocations_builder;
-  for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
-       ++i) {
-    const BufferAllocation& allocation = assignment_->GetAllocation(i);
-    if (allocation.is_entry_computation_parameter()) {
-      buffer_allocations_builder.RegisterBuffer(
-          i, arguments[allocation.parameter_number()]);
+  if (block_host_until_done) {
+    Status block_status = main_stream->BlockHostUntilDone();
+    if (!block_status.ok()) {
+      return InternalError(
+          "Failed to complete all kernels launched on stream %p: %s",
+          main_stream, block_status.error_message().c_str());
     }
   }
-  se::StreamExecutor* executor = stream->parent();
-  TF_ASSIGN_OR_RETURN(
-      auto buffer_allocations,
-      buffer_allocations_builder.Build(*assignment_, executor->device_ordinal(),
-                                       memory_allocator));
 
-  bool block_host_until_done =
-      !memory_allocator->AllowsAsynchronousDeallocation();
-  TF_RETURN_IF_ERROR(ExecuteThunks(run_options, *buffer_allocations,
-                                   block_host_until_done,
-                                   hlo_execution_profile));
-
-  HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
-  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice output_slice,
-                      assignment_->GetUniqueTopLevelOutputSlice());
-  se::DeviceMemoryBase output_buffer_address =
-      buffer_allocations->GetDeviceAddress(output_slice.index());
-
-  if (ShapeUtil::IsTuple(root->shape())) {
-    std::set<se::DeviceMemoryBase> referred_by_output;
-    if (GetRootPointsToSet().IsAmbiguous()) {
-      // The points-to set of the root is ambiguous so we need to examine the
-      // result data to determine which buffers are contained in the result.
-      TF_ASSIGN_OR_RETURN(
-          TransferManager * transfer_manager,
-          TransferManager::GetForPlatform(executor->platform()));
-      TF_ASSIGN_OR_RETURN(referred_by_output,
-                          transfer_manager->GatherBufferPointersFromTuple(
-                              executor, output_buffer_address, root->shape()));
-    } else {
-      // The points-to set of the root is unambiguous so it's known statically
-      // which buffers are in the result. Gather these buffers using the root's
-      // points-to set.
-      TF_RETURN_IF_ERROR(GetRootPointsToSet().ForEachElementWithStatus(
-          [&referred_by_output, &buffer_allocations, this](
-              const ShapeIndex& /*index*/,
-              const PointsToSet::BufferList& buffers) {
-            // The points to set is unambiguous so the set should be a
-            // singleton. That is, we know exactly which instruction produced
-            // the array at this element.
-            CHECK_EQ(1, buffers.size());
-            HloInstruction* hlo = buffers[0]->instruction();
-            TF_ASSIGN_OR_RETURN(
-                const BufferAllocation::Slice slice,
-                this->assignment_->GetUniqueSlice(hlo, buffers[0]->index()));
-            CHECK(!slice.allocation()->is_entry_computation_parameter());
-            referred_by_output.insert(
-                buffer_allocations->GetDeviceAddress(slice.index()));
-            return Status::OK();
-          }));
-    }
-    TF_RETURN_IF_ERROR(
-        buffer_allocations->TearDown(referred_by_output, *assignment_));
-  } else {
-    // If the computation result is not a tuple, we can delete all temporary
-    // buffers that are not the output.
-    TF_RETURN_IF_ERROR(
-        buffer_allocations->TearDown({output_buffer_address}, *assignment_));
-  }
-  return output_buffer_address;
+  return Status::OK();
 }
 
 StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
@@ -286,7 +220,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
     if (allocation.is_entry_computation_parameter()) {
       auto param_no = allocation.parameter_number();
       buffer_allocations_builder.RegisterBuffer(
-          i, arguments[param_no]->buffer(/*index=*/{}));
+          i, arguments[param_no]->root_buffer());
     }
   }
   se::StreamExecutor* executor = run_options->stream()->parent();
@@ -304,50 +238,46 @@ StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
   HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
   auto device_ordinal = executor->device_ordinal();
   auto shaped_buffer = MakeUnique<ShapedBuffer>(
-      root->shape(), executor->platform(), device_ordinal);
+      root->shape(), root->shape(), executor->platform(), device_ordinal);
 
   // Copy DeviceMemoryBase values which contain the array(s) of the result into
   // the respective location in ShapedBuffer.
   std::set<se::DeviceMemoryBase> buffers_in_result;
-  TF_RETURN_IF_ERROR(
-      shaped_buffer->mutable_shape_index_to_buffer_entry()
-          ->ForEachMutableElementWithStatus(
-              [&buffer_allocations, &buffers_in_result, &shaped_buffer, this](
-                  const ShapeIndex& index, size_t* buffer_entry) {
-                const auto& sources = this->GetRootPointsToSet().element(index);
-                // The points-to set is unambiguous so the set should be a
-                // singleton. That is, we know exactly which instruction
-                // produced the array at this element.
-                CHECK_EQ(1, sources.size());
-                auto src_hlo = sources[0]->instruction();
-
-                VLOG(4) << "Looking at: " << sources[0];
-
-                // The source instruction should have a non-parameter buffer
-                // assigned.
-                TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice slice,
-                                    this->assignment_->GetUniqueSlice(
-                                        src_hlo, sources[0]->index()));
-                CHECK(!slice.allocation()->is_entry_computation_parameter());
-
-                perftools::gputools::DeviceMemoryBase src_base =
-                    buffer_allocations->GetDeviceAddress(slice.index());
-                CHECK(!src_base.is_null() || src_base.size() == 0);
-                shaped_buffer->mutable_buffers()->push_back(src_base);
-                *buffer_entry = shaped_buffer->mutable_buffers()->size() - 1;
-
-                buffers_in_result.insert(src_base);
-                return Status::OK();
-              }));
+  TF_RETURN_IF_ERROR(shaped_buffer->buffers().ForEachMutableElementWithStatus(
+      [&buffer_allocations, &buffers_in_result, &shaped_buffer, this](
+          const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
+        const auto& sources = this->GetRootPointsToSet().element(index);
+        // The points-to set is unambiguous so the set should be a
+        // singleton. That is, we know exactly which instruction
+        // produced the array at this element.
+        CHECK_EQ(1, sources.size());
+        auto src_hlo = sources[0]->instruction();
+
+        VLOG(4) << "Looking at: " << sources[0];
+
+        // The source instruction should have a non-parameter buffer
+        // assigned.
+        TF_ASSIGN_OR_RETURN(
+            const BufferAllocation::Slice slice,
+            this->assignment_->GetUniqueSlice(src_hlo, sources[0]->index()));
+        CHECK(!slice.allocation()->is_entry_computation_parameter());
+
+        perftools::gputools::DeviceMemoryBase src_base =
+            buffer_allocations->GetDeviceAddress(slice.index());
+        CHECK(!src_base.is_null() || src_base.size() == 0);
+        *device_memory = src_base;
+        buffers_in_result.insert(src_base);
+        return Status::OK();
+      }));
   TF_RETURN_IF_ERROR(
       buffer_allocations->TearDown(buffers_in_result, *assignment_));
 
   return std::move(shaped_buffer);
 }
 
-StatusOr<se::DeviceMemoryBase> GpuExecutable::ExecuteAsyncOnStream(
+StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments) {
+    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   // TODO(b/30671675): Implement asynchronous execution mode.
   return Unimplemented(
       "Asynchronous execution on stream is not yet supported on GPU.");
@@ -358,9 +288,5 @@ const PointsToSet& GpuExecutable::GetRootPointsToSet() const {
       module().entry_computation()->root_instruction());
 }
 
-std::unique_ptr<HloCostAnalysis> GpuExecutable::CreateCostAnalysis() const {
-  return MakeUnique<HloCostAnalysis>(shape_size_function_);
-}
-
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index a3815370c19af1da612bc6d9663cc0f8896062f7..00da64dfade8ddb0694c0ee7ac158c9f2e15a508 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -54,7 +54,8 @@ class GpuExecutable : public Executable {
                 std::unique_ptr<const ThunkSchedule> thunk_schedule,
                 std::unique_ptr<const HloModule> hlo_module,
                 std::unique_ptr<const BufferAssignment> assignment,
-                HloCostAnalysis::ShapeSizeFunction shape_size_function);
+                std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+                std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
 
   // This should be called after set_ir_module_string.
   const string& ir_module_string() const { return ir_module_string_; }
@@ -71,32 +72,22 @@ class GpuExecutable : public Executable {
   // empty, in which case compilation is left up to the GPU driver.
   const std::vector<uint8>& cubin() const { return cubin_; }
 
-  // Both overloads of ExecuteOnStream will fail if the compute capability of
-  // the stream doesn't match the compute capability passed to this object's
-  // constructor.
-  StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
-      const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
-      HloExecutionProfile* hlo_execution_profile) override;
-
+  // ExecuteOnStream will fail if the compute capability of the stream doesn't
+  // match the compute capability passed to this object's constructor.
   StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteAsyncOnStream(
+  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments) override;
+      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
 
   const Status EqualOrFail(const Executable& executable) {
     // TODO(b/62952745) Implement equality test on GPU executable.
     return Unimplemented("Equality test on GPU executable is not implemented.");
   }
 
-  std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
-
  private:
   // If `block_host_until_done` is false, execution will not block the host
   // until the kernels have completed. This is used as an optimization for
@@ -140,9 +131,6 @@ class GpuExecutable : public Executable {
   // memory for every output/temp buffers.
   const std::unique_ptr<const BufferAssignment> assignment_;
 
-  // Function to compute the size of a given Shape, in bytes.
-  const HloCostAnalysis::ShapeSizeFunction shape_size_function_;
-
   TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable);
 };
 
diff --git a/tensorflow/compiler/xla/service/gpu/layout_assignment.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
similarity index 93%
rename from tensorflow/compiler/xla/service/gpu/layout_assignment.cc
rename to tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
index 0bbd63fb7bfc657cb7bb1de673253c198f5bd25f..50a249f448e7b4956e7bf6bd603d256eca88f71d 100644
--- a/tensorflow/compiler/xla/service/gpu/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/gpu/layout_assignment.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h"
 
 #include <memory>
 
@@ -80,9 +80,9 @@ Status GpuLayoutAssignment::AddBackendConstraints(
       const ConvolutionDimensionNumbers& dimension_numbers =
           instruction->convolution_dimension_numbers();
       std::vector<int64> input_layout;
-      for (int i = dimension_numbers.spatial_dimensions_size() - 1; i >= 0;
-           --i) {
-        input_layout.push_back(dimension_numbers.spatial_dimensions(i));
+      for (int i = dimension_numbers.input_spatial_dimensions_size() - 1;
+           i >= 0; --i) {
+        input_layout.push_back(dimension_numbers.input_spatial_dimensions(i));
       }
       input_layout.push_back(dimension_numbers.input_feature_dimension());
       input_layout.push_back(dimension_numbers.input_batch_dimension());
@@ -102,9 +102,9 @@ Status GpuLayoutAssignment::AddBackendConstraints(
       *filter_shape.mutable_layout() = LayoutUtil::MakeLayout(filter_layout);
 
       std::vector<int64> output_layout;
-      for (int i = dimension_numbers.spatial_dimensions_size() - 1; i >= 0;
-           --i) {
-        output_layout.push_back(dimension_numbers.spatial_dimensions(i));
+      for (int i = dimension_numbers.output_spatial_dimensions_size() - 1;
+           i >= 0; --i) {
+        output_layout.push_back(dimension_numbers.output_spatial_dimensions(i));
       }
       output_layout.push_back(dimension_numbers.output_feature_dimension());
       output_layout.push_back(dimension_numbers.output_batch_dimension());
diff --git a/tensorflow/compiler/xla/service/gpu/layout_assignment.h b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h
similarity index 86%
rename from tensorflow/compiler/xla/service/gpu/layout_assignment.h
rename to tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h
index 169041eb85c633cb4f1f679bcea127714828308f..7655a3ebf45f83c0125a4257baae7a7229ebdc6d 100644
--- a/tensorflow/compiler/xla/service/gpu/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LAYOUT_ASSIGNMENT_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LAYOUT_ASSIGNMENT_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_LAYOUT_ASSIGNMENT_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_LAYOUT_ASSIGNMENT_H_
 
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/layout_assignment.h"
@@ -38,4 +38,4 @@ class GpuLayoutAssignment : public LayoutAssignment {
 }  // namespace gpu
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LAYOUT_ASSIGNMENT_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_LAYOUT_ASSIGNMENT_H_
diff --git a/tensorflow/compiler/xla/service/gpu/layout_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
similarity index 97%
rename from tensorflow/compiler/xla/service/gpu/layout_assignment_test.cc
rename to tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
index ac206b89d329d7e4ac91ee51162c9694f6899d78..f68b23c8ce969372a01ce77840e016d82ca5d2ed 100644
--- a/tensorflow/compiler/xla/service/gpu/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/gpu/layout_assignment.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h"
 
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
index f0f036f7f381db15b84db85d3efeec5d8141884e..ae92daef8882de2e7d64b69f68452061cb5507f2 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
@@ -44,7 +44,7 @@ GpuTransferManager::GpuTransferManager()
     : GenericTransferManager(
           se::cuda::kCudaPlatformId,
           /*pointer_size=*/llvm::DataLayout(gpu::GpuCompiler::kDataLayout)
-              .getPointerSize()) {}
+              .getPointerSize(0 /* default address space */)) {}
 
 Status GpuTransferManager::TransferLiteralToInfeed(se::StreamExecutor* executor,
                                                    const Literal& literal) {
@@ -105,12 +105,13 @@ Status GpuTransferManager::EnqueueBuffersToInfeed(
   // infeed requests, blocking on the stream might be
   // heavy-handed. Figure out if finer-grained acknowledgement is
   // possible.
-  if (!stream->BlockHostUntilDone()) {
+  Status block_status = stream->BlockHostUntilDone();
+  if (!block_status.ok()) {
     for (gpu::InfeedBuffer* b : buffers) {
       b->Done();
     }
-    return InternalError("Failed to complete data transfer on stream %p",
-                         stream);
+    return InternalError("Failed to complete data transfer on stream %p: %s",
+                         stream, block_status.error_message().c_str());
   }
 
   infeed_manager->EnqueueBuffers(buffers);
diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc
index e33e904692ca5ad41e17d2e165dbb40b6bd4aa33..2ac95ceb692447c7ac6dbbcd8b9a38876f7a77b6 100644
--- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc
@@ -30,9 +30,8 @@ InfeedThunk::InfeedThunk(
                              tuple_element_buffers.end()),
       destination_buffer_(destination_buffer) {}
 
-tensorflow::Status InfeedThunk::ExecuteOnStream(
-    const BufferAllocations& buffer_allocations,
-    perftools::gputools::Stream* stream) {
+Status InfeedThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                                    perftools::gputools::Stream* stream) {
   VLOG(2) << "Infeeding to GPU ";
 
   perftools::gputools::DeviceMemoryBase destination_address =
@@ -66,15 +65,16 @@ tensorflow::Status InfeedThunk::ExecuteOnStream(
                        buffer->length());
   }
 
-  if (!stream->BlockHostUntilDone()) {
-    return InternalError("Failed to complete data transfer on stream %p",
-                         stream);
+  Status block_status = stream->BlockHostUntilDone();
+  if (!block_status.ok()) {
+    return InternalError("Failed to complete data transfer on stream %p: %s",
+                         stream, block_status.error_message().c_str());
   }
 
   infeed_manager->ReleaseBuffers(infeed_buffers);
 
   VLOG(2) << "Infeeding to GPU complete";
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.h b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h
index 371d71f9dbdd21cb5f36cc3108c8f398a4a91c29..86918705fa0305217f11753e383200c7bd71474b 100644
--- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h
@@ -43,9 +43,8 @@ class InfeedThunk : public Thunk {
   InfeedThunk(const InfeedThunk&) = delete;
   InfeedThunk& operator=(const InfeedThunk&) = delete;
 
-  tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         perftools::gputools::Stream* stream) override;
 
  private:
   const std::vector<BufferAllocation::Slice> tuple_element_buffers_;
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
index 9a4bfd0905bb62c02c70e7f2eea46872c07bca89..1d47ffde4331868cbc8a8afb2d01b11e77a7fab0 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
@@ -156,8 +156,10 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfConvolutionUnfused) {
   conv_dnums.set_output_batch_dimension(0);
   conv_dnums.set_input_feature_dimension(1);
   conv_dnums.set_output_feature_dimension(1);
-  conv_dnums.add_spatial_dimensions(2);
-  conv_dnums.add_spatial_dimensions(3);
+  conv_dnums.add_input_spatial_dimensions(2);
+  conv_dnums.add_output_spatial_dimensions(2);
+  conv_dnums.add_input_spatial_dimensions(3);
+  conv_dnums.add_output_spatial_dimensions(3);
   conv_dnums.set_kernel_output_feature_dimension(0);
   conv_dnums.set_kernel_input_feature_dimension(1);
   conv_dnums.add_kernel_spatial_dimensions(2);
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index 8fb7a6adda9dc7c36eb9aabcbcdc9d77e6c22c4a..c04a7e0bf8fb5a4f4f73892bdef1b0b3e9879778 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -100,7 +100,7 @@ bool ImplementedAsDnnConvolution(const HloInstruction& hlo) {
   if (hlo.opcode() == HloOpcode::kConvolution) {
     const ConvolutionDimensionNumbers& dnums =
         hlo.convolution_dimension_numbers();
-    if (dnums.spatial_dimensions_size() > 3) {
+    if (dnums.input_spatial_dimensions_size() > 3) {
       return false;
     }
 
@@ -110,6 +110,10 @@ bool ImplementedAsDnnConvolution(const HloInstruction& hlo) {
       return false;
     }
 
+    if (window_util::HasWindowReversal(hlo.window())) {
+      return false;
+    }
+
     return true;
   }
 
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index 6e2bd4e11d3c4ff576edb0df3b724abebfc0e424..e71aa0d13306c9d6571c5c26b0b6f430655df09f 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -173,7 +173,7 @@ Status IrEmitter::EmitCallToNestedComputation(
   return Status::OK();
 }
 
-bool IrEmitter::MaybeEmitSpecialAtomicOperation(
+bool IrEmitter::MaybeEmitDirectAtomicOperation(
     const HloComputation& computation, llvm::Value* output_address,
     llvm::Value* source_address) {
   CHECK_EQ(2, computation.num_parameters());
@@ -233,102 +233,189 @@ bool IrEmitter::MaybeEmitSpecialAtomicOperation(
   return false;
 }
 
-Status IrEmitter::EmitAtomicOperationForNestedComputation(
-    const HloComputation& computation, llvm::Value* output_address,
-    llvm::Value* source_address) {
-  if (computation.num_parameters() != 2) {
-    // TODO(b/30258929): We only accept binary computations so far.
-    return Unimplemented(
-        "We only support atomic functions with exactly two parameters, but "
-        "computation %s has %lld.",
-        computation.name().c_str(), computation.num_parameters());
-  }
+// Implements atomic binary operations using atomic compare-and-swap
+// (atomicCAS) as follows:
+//   1. Reads the value from the memory pointed to by output_address and
+//     records it as old_output.
+//   2. Uses old_output as one of the source operand to perform the binary
+//     operation and stores the result in new_output.
+//   3. Calls atomicCAS which implements compare-and-swap as an atomic
+//     operation. In particular, atomicCAS reads the value from the memory
+//     pointed to by output_address, and compares the value with old_output. If
+//     the two values equal, new_output is written to the same memory location
+//     and true is returned to indicate that the atomic operation succeeds.
+//     Otherwise, the new value read from the memory is returned. In this case,
+//     the new value is copied to old_output, and steps 2. and 3. are repeated
+//     until atomicCAS succeeds.
+//
+// On Nvidia GPUs, atomicCAS can only operate on 32 bit and 64 bit integers. If
+// the element type of the binary operation is 32 bits or 64 bits, the integer
+// type of the same size is used for the atomicCAS operation. On the other hand,
+// if the element type is smaller than 32 bits, int32 is used for the atomicCAS
+// operation. In this case, atomicCAS reads and writes 32 bit values from
+// the memory, which is larger than the memory size required by the original
+// atomic binary operation. We mask off the last two bits of the output_address
+// and use the result as an address to read the 32 bit values from the memory.
+// This can avoid out of bound memory accesses if tensor buffers are 4 byte
+// aligned and have a size of 4N, an assumption that the runtime can guarantee.
+//
+// The pseudo code is shown below. Variables *_address are pointers to a memory
+// region with a size equal to the size of the atomicCAS operation, with the
+// exception that new_output_address is a pointer to a memory region with a size
+// equal to the element size of the binary operation.
+//
+//   element_size = sizeof(element_type);
+//   atomic_size = max(32, element_size);
+//   cas_new_output_address = alloca(atomic_size);
+//   cas_old_output_address = alloca(atomic_size);
+//   if (atomic_size != element_size) {
+//     atomic_address = output_address & ((int64)(-2));
+//     new_output_address = cas_new_output_address + (output_address & 3);
+//   } else {
+//     atomic_address = output_address;
+//     new_output_address = cas_new_output_address;
+//   }
+//
+//   *cas_old_output_address = *atomic_address;
+//   do {
+//     *cas_new_output_address = *cas_old_output_address;
+//     *new_output_address = operation(*new_output_address, *source_address);
+//     (*cas_old_output_address, success) =
+//       atomicCAS(atomic_address, *cas_old_output_address,
+//       *cas_new_output_address);
+//   } while (!success);
+//
+Status IrEmitter::EmitAtomicOperationUsingCAS(const HloComputation& computation,
+                                              llvm::Value* output_address,
+                                              llvm::Value* source_address) {
+  llvm::PointerType* output_address_type =
+      llvm::dyn_cast<llvm::PointerType>(output_address->getType());
+  CHECK_NE(output_address_type, nullptr);
+
+  // element_type is the data type for the binary operation.
+  llvm::Type* element_type = output_address_type->getPointerElementType();
+  int element_size = llvm_ir::GetSizeInBits(element_type);
+  llvm::Type* element_address_type = element_type->getPointerTo();
+
+  int atomic_size = (element_size < 32) ? 32 : element_size;
+  llvm::Type* atomic_type = ir_builder_.getIntNTy(atomic_size);
+  llvm::Type* atomic_address_type =
+      atomic_type->getPointerTo(output_address_type->getPointerAddressSpace());
+
+  // cas_old_output_address and cas_new_output_address point to the scratch
+  // memory where we store the old and new values for the repeated atomicCAS
+  // operations.
+  llvm::Value* cas_old_output_address = ir_builder_.CreateAlloca(
+      atomic_type, /*ArraySize=*/nullptr, "cas_old_output_address");
+  llvm::Value* cas_new_output_address = ir_builder_.CreateAlloca(
+      atomic_type, /*ArraySize=*/nullptr, "cas_new_output_address");
 
-  if (MaybeEmitSpecialAtomicOperation(computation, output_address,
-                                      source_address)) {
-    return Status::OK();
-  }
-
-  // Other binary computations can be made atomic as following (labels are basic
-  // block names used in the IR emitting code later).
-  //
-  // atomic_op_loop_preheader:
-  //   ...
-  //   source = *source_address;
-  //   old_output = *output_address;
-  //   do {
-  // atomic_op_loop_body_entry:
-  //     new_output = computation(old_output, source);
-  //     (old_output, success) =
-  //         atomicCAS(output_address, old_output, new_output);
-  //   } while (!success);
-  //
-  // atomic_op_loop_exit:
-  //   ...
-  //
-  // TODO(jingyue): Consider encapsulate the logic of emitting control flow to
-  // something similar to llvm_ir::ForLoop.
-  //
   // Emit preparation code to the preheader.
   llvm::BasicBlock* loop_preheader_bb = ir_builder_.GetInsertBlock();
-  llvm::Type* element_ir_type =
-      output_address->getType()->getPointerElementType();
-  // old_output = *output_address;
-  llvm::Value* old_output_location = ir_builder_.CreateAlloca(
-      element_ir_type, /*ArraySize=*/nullptr, "old_output_location");
-  ir_builder_.CreateStore(ir_builder_.CreateLoad(output_address, "old_output"),
-                          old_output_location);
+
+  llvm::Value* atomic_memory_address;
+  // binop_output_address points to the scratch memory that stores the
+  // result of the binary operation.
+  llvm::Value* binop_output_address;
+  if (element_size < 32) {
+    // Assume the element size is an integer number of bytes.
+    CHECK_EQ((element_size % sizeof(char)), 0);
+    llvm::Type* address_int_type =
+        module_->getDataLayout().getIntPtrType(output_address_type);
+    atomic_memory_address =
+        ir_builder_.CreatePtrToInt(output_address, address_int_type);
+    llvm::Value* mask = llvm::ConstantInt::get(address_int_type, 3);
+    llvm::Value* offset = ir_builder_.CreateAnd(atomic_memory_address, mask);
+    mask = llvm::ConstantInt::get(address_int_type, -2);
+    atomic_memory_address = ir_builder_.CreateAnd(atomic_memory_address, mask);
+    atomic_memory_address =
+        ir_builder_.CreateIntToPtr(atomic_memory_address, atomic_address_type);
+    binop_output_address = ir_builder_.CreateAdd(
+        ir_builder_.CreatePtrToInt(cas_new_output_address, address_int_type),
+        offset);
+    binop_output_address =
+        ir_builder_.CreateIntToPtr(binop_output_address, element_address_type);
+  } else {
+    atomic_memory_address =
+        ir_builder_.CreateBitCast(output_address, atomic_address_type);
+    binop_output_address =
+        ir_builder_.CreateBitCast(cas_new_output_address, element_address_type);
+  }
+
+  // Use the value from the memory that atomicCAS operates on to initialize
+  // cas_old_output.
+  llvm::Value* cas_old_output =
+      ir_builder_.CreateLoad(atomic_memory_address, "cas_old_output");
+  ir_builder_.CreateStore(cas_old_output, cas_old_output_address);
+
   llvm::BasicBlock* loop_exit_bb = loop_preheader_bb->splitBasicBlock(
       ir_builder_.GetInsertPoint(), "atomic_op_loop_exit");
-
-  // Emit the body of the loop that repeatedly invokes atomicCAS.
   llvm::BasicBlock* loop_body_bb =
       llvm::BasicBlock::Create(ir_builder_.getContext(), "atomic_op_loop_body",
                                ir_builder_.GetInsertBlock()->getParent());
   ir_builder_.SetInsertPoint(loop_body_bb);
   // Change preheader's successor from loop_exit_bb to loop_body_bb.
   loop_preheader_bb->getTerminator()->setSuccessor(0, loop_body_bb);
-  // new_output = computation(old_output, source);
-  llvm::Value* new_output_location = ir_builder_.CreateAlloca(
-      element_ir_type, /*ArraySize=*/nullptr, "new_output_location");
+
+  // Emit the body of the loop that repeatedly invokes atomicCAS.
+  //
+  // Use cas_old_output to initialize cas_new_output.
+  cas_old_output =
+      ir_builder_.CreateLoad(cas_old_output_address, "cas_old_output");
+  ir_builder_.CreateStore(cas_old_output, cas_new_output_address);
+  // Emits code to calculate new_output = operation(old_output, source);
   TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
-      computation, {old_output_location, source_address}, new_output_location));
-
-  // (old_output, success) = atomicCAS(output_address, old_output, new_output);
-  int num_bits = llvm_ir::GetSizeInBits(element_ir_type);
-  llvm::Type* element_int_ir_type = ir_builder_.getIntNTy(num_bits);
-  // cmpxchg accepts integer only, and bitcast refuses to operate on aggregate
-  // types, so we bitcast load and store addresses to intN* of the same bit
-  // width.
-  llvm::Value* old_output = ir_builder_.CreateLoad(
-      ir_builder_.CreateBitCast(old_output_location,
-                                element_int_ir_type->getPointerTo()),
-      "old_output");
-  llvm::Value* new_output = ir_builder_.CreateLoad(
-      ir_builder_.CreateBitCast(new_output_location,
-                                element_int_ir_type->getPointerTo()),
-      "new_output");
+      computation, {binop_output_address, source_address},
+      binop_output_address));
+
+  llvm::Value* cas_new_output =
+      ir_builder_.CreateLoad(cas_new_output_address, "cas_new_output");
+
+  // Emit code to perform the atomicCAS operation
+  // (cas_old_output, success) = atomicCAS(memory_address, cas_old_output,
+  //                                       cas_new_output);
   llvm::Value* ret_value = ir_builder_.CreateAtomicCmpXchg(
-      ir_builder_.CreateBitCast(output_address,
-                                element_int_ir_type->getPointerTo()),
-      old_output, new_output, llvm::AtomicOrdering::SequentiallyConsistent,
+      atomic_memory_address, cas_old_output, cas_new_output,
+      llvm::AtomicOrdering::SequentiallyConsistent,
       llvm::AtomicOrdering::SequentiallyConsistent);
-  // cmpxchg returns a pair. The first element is the original value at
-  // output_address and the second element is whether the swap is successful.
+
+  // Extract the memory value returned from atomicCAS and store it as
+  // cas_old_output.
   ir_builder_.CreateStore(
-      ir_builder_.CreateExtractValue(ret_value, 0, "old_output"),
-      ir_builder_.CreateBitCast(old_output_location,
-                                element_int_ir_type->getPointerTo()));
+      ir_builder_.CreateExtractValue(ret_value, 0, "cas_old_output"),
+      cas_old_output_address);
+  // Extract the success bit returned from atomicCAS and generate a
+  // conditional branch on the success bit.
   ir_builder_.CreateCondBr(
       ir_builder_.CreateExtractValue(ret_value, 1, "success"), loop_exit_bb,
       loop_body_bb);
 
-  // Restore the insertion point to the exit basic block so that the caller of
+  // Set the insertion point to the exit basic block so that the caller of
   // this method can continue emitting code to the right place.
   SetToFirstInsertPoint(loop_exit_bb, &ir_builder_);
   return Status::OK();
 }
 
+Status IrEmitter::EmitAtomicOperationForNestedComputation(
+    const HloComputation& computation, llvm::Value* output_address,
+    llvm::Value* source_address) {
+  if (computation.num_parameters() != 2) {
+    // TODO(b/30258929): We only accept binary computations so far.
+    return Unimplemented(
+        "We only support atomic functions with exactly two parameters, but "
+        "computation %s has %lld.",
+        computation.name().c_str(), computation.num_parameters());
+  }
+
+  if (MaybeEmitDirectAtomicOperation(computation, output_address,
+                                     source_address)) {
+    return Status::OK();
+  }
+
+  return EmitAtomicOperationUsingCAS(computation, output_address,
+                                     source_address);
+}
+
 Status IrEmitter::HandleSelect(HloInstruction* select) {
   auto pred = select->operand(0);
   auto on_true = select->operand(1);
@@ -640,6 +727,37 @@ Status IrEmitter::HandleRng(HloInstruction* random) {
       .EmitLoop(IrName(random));
 }
 
+Status IrEmitter::HandleConditional(HloInstruction* conditional) {
+  auto pred = conditional->operand(0);
+  auto true_arg = conditional->operand(1);
+  auto false_arg = conditional->operand(2);
+
+  llvm::Value* conditional_result = GetBasePointer(*conditional);
+
+  llvm::LoadInst* pred_value = ir_builder_.CreateLoad(
+      GetBasePointer(*pred),
+      llvm_ir::AsStringRef(IrName(conditional, "load_predicate_value")));
+  llvm::Value* pred_cond = ir_builder_.CreateICmpNE(
+      pred_value,
+      llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType(PRED, module_), 0),
+      llvm_ir::AsStringRef(IrName(conditional, "boolean_predicate")));
+  llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse(
+      pred_cond, IrName(conditional, "if_then_else"), &ir_builder_);
+
+  SetToFirstInsertPoint(if_data.true_block, &ir_builder_);
+  TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
+      *conditional->true_computation(), {GetBasePointer(*true_arg)},
+      conditional_result));
+
+  SetToFirstInsertPoint(if_data.false_block, &ir_builder_);
+  TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
+      *conditional->false_computation(), {GetBasePointer(*false_arg)},
+      conditional_result));
+
+  SetToFirstInsertPoint(if_data.after_block, &ir_builder_);
+  return Status::OK();
+}
+
 llvm_ir::IrArray::Index IrEmitter::EmitOperandArrayLoopNest(
     const llvm_ir::IrArray& operand_array, int64 reduction_dimension,
     tensorflow::StringPiece name_suffix, llvm_ir::ForLoopNest* loop_nest) {
@@ -648,8 +766,8 @@ llvm_ir::IrArray::Index IrEmitter::EmitOperandArrayLoopNest(
   // reduction dimension.
   std::vector<int64> dimensions;
   const Shape& shape = operand_array.GetShape();
-  for (int i = shape.layout().minor_to_major_size() - 1; i >= 0; --i) {
-    int64 dimension = shape.layout().minor_to_major(i);
+  for (int i = 0; i < LayoutUtil::MinorToMajor(shape).size(); ++i) {
+    int64 dimension = LayoutUtil::Major(shape.layout(), i);
     if (dimension != reduction_dimension) {
       dimensions.push_back(dimension);
     }
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
index 9c01f5b7c72f429822300af28bfd5261150d33d1..08bbbe36c72872ba68104c8f328c2f602eb30fa8 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
@@ -95,6 +95,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   Status HandleCall(HloInstruction* call) override;
   Status HandleCustomCall(HloInstruction* custom_call) override;
   Status HandleRng(HloInstruction* random) override;
+  Status HandleConditional(HloInstruction* conditional) override;
 
   Status FinishVisit(HloInstruction* root) override { return Status::OK(); }
 
@@ -185,9 +186,16 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   // be simply implemented using an LLVM atomic instruction. If "computation" is
   // one of this kind, emits code to do that and returns true; otherwise,
   // returns false.
-  bool MaybeEmitSpecialAtomicOperation(const HloComputation& computation,
-                                       llvm::Value* output_address,
-                                       llvm::Value* source_address);
+  bool MaybeEmitDirectAtomicOperation(const HloComputation& computation,
+                                      llvm::Value* output_address,
+                                      llvm::Value* source_address);
+
+  // A helper method for EmitAtomicOperationForNestedComputation. It implements
+  // binary atomic operations using atomicCAS with special handling to support
+  // small data types.
+  Status EmitAtomicOperationUsingCAS(const HloComputation& computation,
+                                     llvm::Value* output_address,
+                                     llvm::Value* source_address);
 
   StatusOr<llvm::Value*> ComputeNestedElement(
       const HloComputation& computation,
@@ -227,6 +235,7 @@ class IrEmitterUnnested : public IrEmitter {
   // IrEmitterUnnested handles the following instructions differently from
   // IrEmitter.
   Status HandleCopy(HloInstruction* copy) override;
+  Status HandleConditional(HloInstruction* conditional) override;
   Status HandleConvolution(HloInstruction* convolution) override;
   Status HandleDot(HloInstruction* dot) override;
   Status HandleFusion(HloInstruction* fusion) override;
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 1b863c9e3c51d6e757751154abd653cd1fdcb8a7..022c63de8db00dba8a626e76751113a3f9356537 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -123,10 +123,12 @@ void UpdateLaunchDimensions(const LaunchDimensions& launch_dims, Thunk* thunk,
   llvm::ConstantInt* threads_per_block_ir_value = llvm::ConstantInt::get(
       llvm::IntegerType::get(llvm_context, /*NumBits=*/32),
       launch_dims.threads_per_block());
+  // Our launch bounds are exact, so we can specify them as reqntidx rather than
+  // maxntidx.
   nvvm_annotations_node->addOperand(llvm::MDNode::get(
       llvm_context,
       {llvm::ConstantAsMetadata::get(ir_kernel),
-       llvm::MDString::get(llvm_context, "maxntidx"),
+       llvm::MDString::get(llvm_context, "reqntidx"),
        llvm::ConstantAsMetadata::get(threads_per_block_ir_value)}));
 }
 }  // namespace
@@ -246,6 +248,11 @@ Status IrEmitterUnnested::DefaultAction(HloInstruction* hlo) {
 }
 
 Status IrEmitterUnnested::HandleDot(HloInstruction* dot) {
+  const DotDimensionNumbers& dnums = dot->dot_dimension_numbers();
+  if (dnums.lhs_batch_dimensions_size() > 0 ||
+      dnums.rhs_batch_dimensions_size() > 0) {
+    return Unimplemented("Dot with batch dimensions not implemented.");
+  }
   if (ImplementedAsGemm(*dot)) {
     thunk_sequence_->emplace_back(BuildGemmThunk(dot));
     return Status::OK();
@@ -254,6 +261,11 @@ Status IrEmitterUnnested::HandleDot(HloInstruction* dot) {
   return IrEmitter::HandleDot(dot);
 }
 
+Status IrEmitterUnnested::HandleConditional(HloInstruction* conditional) {
+  thunk_sequence_->push_back(BuildKernelThunk(conditional));
+  return IrEmitter::HandleConditional(conditional);
+}
+
 Status IrEmitterUnnested::HandleConvolution(HloInstruction* convolution) {
   if (ImplementedAsDnnConvolution(*convolution)) {
     thunk_sequence_->emplace_back(BuildConvolutionThunk(convolution));
@@ -421,10 +433,10 @@ std::tuple<bool, Shape, Shape> IsTranspose021(const Shape& a, const Shape& b) {
   CHECK(ShapeUtil::Compatible(a, b));
   std::vector<int64> perm(a.dimensions().size());
   {
-    std::vector<int64> layout_a(a.layout().minor_to_major().rbegin(),
-                                a.layout().minor_to_major().rend());
-    std::vector<int64> layout_b(b.layout().minor_to_major().rbegin(),
-                                b.layout().minor_to_major().rend());
+    auto layout_a_orig = LayoutUtil::MinorToMajor(a);
+    std::vector<int64> layout_a(layout_a_orig.rbegin(), layout_a_orig.rend());
+    auto layout_b_orig = LayoutUtil::MinorToMajor(b);
+    std::vector<int64> layout_b(layout_b_orig.rbegin(), layout_b_orig.rend());
     for (size_t i = 0; i < perm.size(); ++i) {
       perm[i] = PositionInContainer(layout_b, layout_a[i]);
     }
@@ -800,9 +812,9 @@ Status IrEmitterUnnested::EmitColumnReduction(
         // normalized_input_shape to input_matrix_shape.
         const Shape normalized_input_shape =
             ShapeUtil::NormalizeShapeToMonotonicDim0MajorLayout(input_shape);
+        auto input_shape_min2maj = LayoutUtil::MinorToMajor(input_shape);
         const std::vector<int64> transpose_dimension_mapping(
-            input_shape.layout().minor_to_major().rbegin(),
-            input_shape.layout().minor_to_major().rend());
+            input_shape_min2maj.rbegin(), input_shape_min2maj.rend());
 
         const Shape input_matrix_shape =
             ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(
@@ -1043,9 +1055,9 @@ Status IrEmitterUnnested::EmitRowReduction(
         // normalized_input_shape to input_3d_tensor_shape.
         const Shape normalized_input_shape =
             ShapeUtil::NormalizeShapeToMonotonicDim0MajorLayout(input_shape);
+        auto input_shape_min2maj = LayoutUtil::MinorToMajor(input_shape);
         const std::vector<int64> transpose_dimension_mapping(
-            input_shape.layout().minor_to_major().rbegin(),
-            input_shape.layout().minor_to_major().rend());
+            input_shape_min2maj.rbegin(), input_shape_min2maj.rend());
         const Shape input_3d_tensor_shape =
             ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(
                 input_shape.element_type(), {depth, height, width});
@@ -1177,9 +1189,9 @@ Status IrEmitterUnnested::EmitReductionToVector(
   // whether another dimension is major or minor of them.
   std::sort(input_dims_to_keep.begin(), input_dims_to_keep.end(),
             [&input_shape](int64 dim_a, int64 dim_b) {
-              return PositionInContainer(input_shape.layout().minor_to_major(),
+              return PositionInContainer(LayoutUtil::MinorToMajor(input_shape),
                                          dim_a) <
-                     PositionInContainer(input_shape.layout().minor_to_major(),
+                     PositionInContainer(LayoutUtil::MinorToMajor(input_shape),
                                          dim_b);
             });
   // Now, if output rank is at least 1, `input_dims_to_keep.front()` is
@@ -1224,14 +1236,14 @@ Status IrEmitterUnnested::EmitReductionToVector(
     int64 width = 1;
     for (int64 input_dim = 0; input_dim < ShapeUtil::Rank(input_shape);
          ++input_dim) {
-      if (PositionInContainer(input_shape.layout().minor_to_major(),
+      if (PositionInContainer(LayoutUtil::MinorToMajor(input_shape),
                               input_dim) >
-          PositionInContainer(input_shape.layout().minor_to_major(),
+          PositionInContainer(LayoutUtil::MinorToMajor(input_shape),
                               input_dims_to_keep.back())) {
         depth *= input_shape.dimensions(input_dim);
-      } else if (PositionInContainer(input_shape.layout().minor_to_major(),
+      } else if (PositionInContainer(LayoutUtil::MinorToMajor(input_shape),
                                      input_dim) <
-                 PositionInContainer(input_shape.layout().minor_to_major(),
+                 PositionInContainer(LayoutUtil::MinorToMajor(input_shape),
                                      input_dims_to_keep.front())) {
         width *= input_shape.dimensions(input_dim);
       }
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
index 1cb963be611de23cfb9fbb6eca639019208b3d7a..059943d48cd34b0ac487b91c3f3079ee3f761229 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
@@ -34,7 +34,7 @@ limitations under the License.
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
-#include "llvm/CodeGen/CommandFlags.h"
+#include "llvm/CodeGen/CommandFlags.def"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
@@ -77,7 +77,7 @@ static string GetLibdeviceFilename(const string& libdevice_dir_path,
   // Since CUDA 9.0, all GPU versions are included in a single file
   const char* unified_libdevice_filename = "libdevice.10.bc";
   std::vector<string> unified_libdevice_files;
-  const tensorflow::Status status = 
+  const tensorflow::Status status =
     tensorflow::Env::Default()->GetMatchingPaths(
       tensorflow::io::JoinPath(libdevice_dir_path, unified_libdevice_filename),
       &unified_libdevice_files);
@@ -492,9 +492,8 @@ StatusOr<string> CompileToPtx(llvm::Module* module,
     tensorflow::port::Tracing::TraceMe annotation(
         "Compiling IR", llvm_ir::AsString(module->getName()),
         /*is_expensive=*/true);
-    ScopedLoggingTimer compilation_timer(
-        "Compile module " + llvm_ir::AsString(module->getName()),
-        /*vlog_level=*/2);
+    XLA_SCOPED_LOGGING_TIMER("Compile module " +
+                             llvm_ir::AsString(module->getName()));
     TF_ASSIGN_OR_RETURN(
         ptx, CompileModuleToPtx(module, compute_capability, hlo_module_config,
                                 libdevice_dir_path));
diff --git a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
index 9274e16a455fc1a958cee5101b6a9ef7ce619347..c29fee0879c02021fdc23ac0e02ab398cf40f99e 100644
--- a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
+++ b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
@@ -49,8 +49,8 @@ HloInstruction* MaybePaddedAndSlicedInput(
     // applies positive padding and dilation.
     PaddingConfig padding_config =
         MakeNoPaddingConfig(input->shape().dimensions_size());
-    for (size_t i = 0; i < conv_dnums.spatial_dimensions().size(); ++i) {
-      int64 dim = conv_dnums.spatial_dimensions(i);
+    for (size_t i = 0; i < conv_dnums.input_spatial_dimensions().size(); ++i) {
+      int64 dim = conv_dnums.input_spatial_dimensions(i);
       padding_config.mutable_dimensions(dim)->set_edge_padding_low(
           std::max<int64>(0LL, conv_window.dimensions(i).padding_low()));
       padding_config.mutable_dimensions(dim)->set_edge_padding_high(
@@ -81,8 +81,8 @@ HloInstruction* MaybePaddedAndSlicedInput(
     std::vector<int64> limit_indices(input->shape().dimensions().begin(),
                                      input->shape().dimensions().end());
     std::vector<int64> strides(input->shape().dimensions_size(), 1);
-    for (size_t i = 0; i < conv_dnums.spatial_dimensions().size(); ++i) {
-      int64 dim = conv_dnums.spatial_dimensions(i);
+    for (size_t i = 0; i < conv_dnums.input_spatial_dimensions().size(); ++i) {
+      int64 dim = conv_dnums.input_spatial_dimensions(i);
       // If dimension "dim" has negative padding, increase the start index or
       // decrement the limit index by the amount of negative padding.
       start_indices[dim] +=
@@ -117,8 +117,8 @@ HloInstruction* MaybePaddedKernel(const Window& conv_window,
   for (size_t i = 0; i < kernel->shape().dimensions_size(); ++i) {
     padding_config.add_dimensions();
   }
-  for (size_t i = 0; i < conv_dnums.spatial_dimensions().size(); ++i) {
-    int64 dim = conv_dnums.spatial_dimensions(i);
+  for (size_t i = 0; i < conv_dnums.kernel_spatial_dimensions().size(); ++i) {
+    int64 dim = conv_dnums.kernel_spatial_dimensions(i);
     padding_config.mutable_dimensions(dim)->set_interior_padding(
         conv_window.dimensions(i).window_dilation() - 1);
   }
@@ -202,8 +202,7 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution(
   //   ABCD0 = Pad(ABCD, padding_high=1)
   //   BackwardFilterConv(ABCD0, xyz, padding_low=pading_high=1)
   // We choose the lesser of padding_low and padding_high as the new padding.
-  HloInstruction* transpose = backward_conv->fused_expression_root();
-  HloInstruction* forward_conv = transpose->mutable_operand(0);
+  HloInstruction* forward_conv = backward_conv->fused_expression_root();
   HloInstruction* input = backward_conv->mutable_operand(0);
   Window new_forward_conv_window = forward_conv->window();
   Window new_backward_conv_window = backward_conv->window();
@@ -229,7 +228,7 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution(
     // later. Therefore, the amount of new padding (low or high) is the minimum
     // of the amount of old padding low and old padding high.
     int64 new_conv_padding = std::min(padding_low, padding_high);
-    int64 dim = backward_conv_dnums.spatial_dimensions(i);
+    int64 dim = backward_conv_dnums.input_spatial_dimensions(i);
     input_padding_config.mutable_dimensions(dim)->set_edge_padding_low(
         padding_low - new_conv_padding);
     input_padding_config.mutable_dimensions(dim)->set_edge_padding_high(
@@ -269,19 +268,10 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution(
               .ConsumeValueOrDie(),
           padded_input, output, new_forward_conv_window, forward_conv_dnums));
 
-  HloInstruction* new_transpose =
-      computation->AddInstruction(HloInstruction::CreateTranspose(
-          ShapeInference::InferTransposeShape(new_forward_conv->shape(),
-                                              transpose->dimensions())
-              .ConsumeValueOrDie(),
-          new_forward_conv, transpose->dimensions()));
-
-  // Fuse the new forward convolution and the new transpose to the new backward
-  // convolution.
+  // Fuse the new forward convolution to the new backward convolution.
   HloInstruction* new_backward_conv =
       computation->CreateFusionInstructionForBackwardConvolution(
-          {new_transpose, new_forward_conv},
-          HloInstruction::FusionKind::kConvBackwardFilter,
+          {new_forward_conv}, HloInstruction::FusionKind::kConvBackwardFilter,
           new_backward_conv_window, backward_conv_dnums);
 
   VLOG(1) << "Canonicalizing backward filter conv";
@@ -369,12 +359,11 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution(
   std::vector<int64> limit_indices(
       new_backward_conv->shape().dimensions().begin(),
       new_backward_conv->shape().dimensions().end());
-  std::vector<int64> strides(new_backward_conv->shape().dimensions_size(),
-                             1LL);
+  std::vector<int64> strides(new_backward_conv->shape().dimensions_size(), 1LL);
   for (size_t i = 0; i < backward_conv->window().dimensions_size(); ++i) {
     int64 padding_low = backward_conv->window().dimensions(i).padding_low();
     int64 padding_high = backward_conv->window().dimensions(i).padding_high();
-    int64 dim = backward_conv_dnums.spatial_dimensions(i);
+    int64 dim = backward_conv_dnums.output_spatial_dimensions(i);
     if (padding_low > padding_high) {
       // If the amount of low padding (of the old backward convolution) is
       // larger, we internally pad the low end of the activations and slice
diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
index d0d2deee24848184278e3e51dcaa3bb673b5fadc..6cf280df05496716a0780d61ded92efd9982734c 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
@@ -44,37 +44,41 @@ std::ostream& operator<<(std::ostream& out,
 
 // Calculates the launch dimensions used to invoke `hlo`.
 LaunchDimensions CalculateLaunchDimensions(
-    const Shape& shape, const se::DeviceDescription& device_desc,
-    PartitionStrategy partition_strategy) {
-  int64 warp_size = device_desc.threads_per_warp();
-
+    const Shape& shape, const se::DeviceDescription& device_desc) {
   int64 num_elements = ShapeUtil::ElementsIn(shape);
   if (num_elements <= 1) {
     return LaunchDimensions();
   }
 
-  // Calculate the number of threads per block.
-  // Initialize threads_per_block as the threads-per-block limit.
-  int64 threads_per_block = device_desc.threads_per_block_limit();
-  VLOG(2) << "Initial # of threads per block = " << threads_per_block;
-
-  if (partition_strategy == PartitionStrategy::kLatency) {
-    // Limit the thread count to allow maximum number of registers per thread.
-    // TODO(b/28560520): We don't have to assume the emitted kernel will use up
-    // all the registers. We could use ptxas to examine the actual number of
-    // register used, and set the thread count accordingly.
-    int64 threads_per_block_limit_due_to_registers =
-        device_desc.registers_per_core_limit() /
-        device_desc.registers_per_thread_limit();
-    CHECK_NE(0, threads_per_block_limit_due_to_registers);
-    if (threads_per_block_limit_due_to_registers < threads_per_block) {
-      threads_per_block =
-          // Make `threads_per_block` a multiple of warp size to use GPU
-          // efficiently.
-          warp_size *
-          std::max(1LL, threads_per_block_limit_due_to_registers / warp_size);
-      VLOG(2) << "Update # of threads per block due to register pressure = "
-              << threads_per_block;
+  // Since we don't do any inter-warp communication, we're free to choose any
+  // block size we want, subject to hardware constraints.  We choose the
+  // smallest block size that allows the GPU to reach full occupancy (assuming
+  // the kernel uses sufficiently few registers).  This gives us max performance
+  // when the kernel uses few registers, and lets us scale down gracefully as
+  // the kernel uses more registers.
+  //
+  // Specifically, we choose the number of threads per block such that
+  //
+  //   <num threads per block> * <max blocks per core> = <max threads per core>
+
+  auto threads_per_core = device_desc.threads_per_core_limit();
+  auto blocks_per_core = device_desc.blocks_per_core_limit();
+  int64 threads_per_block;
+  if (threads_per_core != 0 && blocks_per_core != 0) {
+    threads_per_block = device_desc.threads_per_core_limit() /
+                        device_desc.blocks_per_core_limit();
+  } else {
+    static std::atomic<int64> log_count{0};
+    if (log_count.fetch_add(1) < 8) {
+      LOG(WARNING) << "Attempting to calculate launch dimensions for GPU "
+                      "without full information about its capabilities.  "
+                      "StreamExecutor's PopulateDeviceDescription should be "
+                      "updated for this device.";
+    }
+    threads_per_block = device_desc.threads_per_warp();
+    if (threads_per_block == 0) {
+      // Fall back to *something* if we can't even get num threads per warp.
+      threads_per_block = 32;
     }
   }
 
@@ -84,8 +88,6 @@ LaunchDimensions CalculateLaunchDimensions(
             << threads_per_block << ") because the latter is smaller.";
   }
 
-  // Calculate the block count. We copy the strategy used by Eigen:
-  // eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
   int64 block_count = CeilOfRatio(num_elements, threads_per_block);
   VLOG(2) << tensorflow::strings::Printf(
       "Initialized the block count to ceil(# of elements / threads per "
diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.h b/tensorflow/compiler/xla/service/gpu/partition_assignment.h
index 8f7fce884acc93fd39510ad0826b819a6d9731a7..0bf463a6ef95d5a32784838c08ad239752fd1acf 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.h
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.h
@@ -30,14 +30,6 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-enum class PartitionStrategy {
-  // Optimized for latency by allowing maximum number of registers per thread.
-  kLatency,
-  // Optimized for throughput. This may limit registers per thread and cause
-  // longer latency.
-  kThroughput
-};
-
 // Encapsulates the launch dimensions of a kernel, e.g., the block count and the
 // number of threads per block.
 class LaunchDimensions {
@@ -66,8 +58,7 @@ std::ostream& operator<<(std::ostream& out,
 
 LaunchDimensions CalculateLaunchDimensions(
     const Shape& shape,
-    const perftools::gputools::DeviceDescription& device_desc,
-    PartitionStrategy partition_strategy = PartitionStrategy::kLatency);
+    const perftools::gputools::DeviceDescription& device_desc);
 
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h
index 0ff27888ad72f8190400c22a9086d1965448662c..486ea7d7e1dad3f7f37d50565e176fbf567f5cc4 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk.h
@@ -70,6 +70,19 @@ class Thunk {
     return tensorflow::Status::OK();
   }
 
+  // Users of Thunk should call ShouldHaltAllActivityBeforeRunning(stream)
+  // before calling ExecuteOnStream(stream).  If it returns true, it's the
+  // user's responsibility to wait for all activity on the GPU to finish before
+  // calling ExecuteOnStream.
+  //
+  // This value is not required to be constant for a given Thunk.  For example,
+  // a Thunk that performs autotuning may return true for its first run and
+  // false thereafter.
+  virtual bool ShouldHaltAllActivityBeforeRunning(
+      perftools::gputools::Stream* /*stream*/) {
+    return false;
+  }
+
   // Execute the kernel for the thunk on the given stream. This method must be
   // called after Initialize and can be called multiple times over Thunk's
   // lifetime. Stream argument must be non-null.
diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.cc b/tensorflow/compiler/xla/service/gpu/while_thunk.cc
index 0d2412096abf7838b7b0e7617811c789f507a4a1..c21559af6d2e5dfb5aaf62afcdcaed514e0914c9 100644
--- a/tensorflow/compiler/xla/service/gpu/while_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_thunk.cc
@@ -34,16 +34,14 @@ WhileThunk::WhileThunk(
       body_thunk_sequence_(
           MakeUnique<SequentialThunk>(std::move(*body_thunk_sequence), hlo)) {}
 
-tensorflow::Status WhileThunk::Initialize(const GpuExecutable& executable) {
+Status WhileThunk::Initialize(const GpuExecutable& executable) {
   TF_RETURN_IF_ERROR(condition_thunk_sequence_->Initialize(executable));
   TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(executable));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status WhileThunk::ExecuteOnStream(
-    const BufferAllocations& buffer_allocations,
-    perftools::gputools::Stream* stream) {
-
+Status WhileThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                                   perftools::gputools::Stream* stream) {
   perftools::gputools::DeviceMemoryBase condition_result_data =
       buffer_allocations.GetDeviceAddress(condition_result_buffer_index_);
 
@@ -55,9 +53,11 @@ tensorflow::Status WhileThunk::ExecuteOnStream(
     // Copy the result of condition computation and break the loop if 'false'.
     bool condition_result;
     stream->ThenMemcpy(&condition_result, condition_result_data, sizeof(bool));
-    if (!stream->BlockHostUntilDone()) {
+    Status block_status = stream->BlockHostUntilDone();
+    if (!block_status.ok()) {
       return InternalError(
-          "Failed to complete all kernels launched on stream %p", stream);
+          "Failed to complete all kernels launched on stream %p: %s", stream,
+          block_status.error_message().c_str());
     }
 
     if (!condition_result) {
@@ -68,7 +68,7 @@ tensorflow::Status WhileThunk::ExecuteOnStream(
     TF_RETURN_IF_ERROR(
         body_thunk_sequence_->ExecuteOnStream(buffer_allocations, stream));
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.h b/tensorflow/compiler/xla/service/gpu/while_thunk.h
index 95ed5497cea4fa3ba5dcdc6762cbd53cec88339a..4c9f45de9e42494df58706d0a4a3eb0c4220b8b8 100644
--- a/tensorflow/compiler/xla/service/gpu/while_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/while_thunk.h
@@ -45,10 +45,9 @@ class WhileThunk : public Thunk {
   WhileThunk(const WhileThunk&) = delete;
   WhileThunk& operator=(const WhileThunk&) = delete;
 
-  tensorflow::Status Initialize(const GpuExecutable& executable) override;
-  tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+  Status Initialize(const GpuExecutable& executable) override;
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         perftools::gputools::Stream* stream) override;
 
  private:
   const BufferAllocation::Slice condition_result_buffer_index_;
diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer.cc b/tensorflow/compiler/xla/service/gpu/while_transformer.cc
index ccdd1717593e4fa7c1d1deb3f0f9ebfab1bf7209..ab94d7d5436e8edd12f68f7e0c395c53f303e6eb 100644
--- a/tensorflow/compiler/xla/service/gpu/while_transformer.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_transformer.cc
@@ -44,7 +44,7 @@ namespace {
 //
 //            Parameter
 //               |
-//   Const  GetTupleElemet
+//   Const  GetTupleElement
 //      \   /
 //       Add (root)
 //
diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
index 44188473d39088923c67216facab472a4e4ee09f..f16daa0b5481474e754c880ead1945297ca50168 100644
--- a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
@@ -17,9 +17,12 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/copy_insertion.h"
 #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
+#include "tensorflow/compiler/xla/service/hlo_verifier.h"
+#include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace xla {
 namespace {
@@ -33,8 +36,6 @@ class WhileTransformerTest : public HloTestBase {
       : module_(CreateNewModule()),
         induction_variable_shape_(ShapeUtil::MakeShape(S32, {})),
         data_shape_(ShapeUtil::MakeShape(F32, {8})),
-        loop_state_shape_(ShapeUtil::MakeTupleShape(
-            {induction_variable_shape_, data_shape_})),
         condition_result_shape_(ShapeUtil::MakeShape(PRED, {})) {}
 
   std::unique_ptr<HloComputation> BuildConditionComputation(
@@ -42,8 +43,8 @@ class WhileTransformerTest : public HloTestBase {
     auto builder = HloComputation::Builder(TestName() + ".Condition");
     auto limit_const = builder.AddInstruction(
         HloInstruction::CreateConstant(Literal::CreateR0<int32>(limit)));
-    auto loop_state = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, loop_state_shape_, "loop_state"));
+    auto loop_state = builder.AddInstruction(HloInstruction::CreateParameter(
+        0, GetLoopStateShape(tuple_index), "loop_state"));
     auto induction_variable =
         builder.AddInstruction(HloInstruction::CreateGetTupleElement(
             limit_const->shape(), loop_state, tuple_index));
@@ -58,8 +59,8 @@ class WhileTransformerTest : public HloTestBase {
       const int64 increment) {
     auto builder = HloComputation::Builder(TestName() + ".Body");
     // Create param instruction to access loop state.
-    auto loop_state = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, loop_state_shape_, "loop_state"));
+    auto loop_state = builder.AddInstruction(HloInstruction::CreateParameter(
+        0, GetLoopStateShape(ind_var_tuple_index), "loop_state"));
     // Update the induction variable GTE(ind_var_tuple_index).
     auto induction_variable =
         builder.AddInstruction(HloInstruction::CreateGetTupleElement(
@@ -73,7 +74,7 @@ class WhileTransformerTest : public HloTestBase {
         data_shape_, loop_state, data_tuple_index));
     // Use 'induction_variable' in computation with no path to output tuple.
     auto update = builder.AddInstruction(
-        HloInstruction::CreateBroadcast(data_shape_, induction_variable, {8}));
+        HloInstruction::CreateBroadcast(data_shape_, induction_variable, {}));
     auto add1 = builder.AddInstruction(HloInstruction::CreateBinary(
         data_shape_, HloOpcode::kAdd, data, update));
     // Create output Tuple.
@@ -98,8 +99,9 @@ class WhileTransformerTest : public HloTestBase {
                   HloInstruction::CreateTuple({induction_var_init, data_init}))
             : builder.AddInstruction(
                   HloInstruction::CreateTuple({data_init, induction_var_init}));
-    auto while_hlo = builder.AddInstruction(HloInstruction::CreateWhile(
-        loop_state_shape_, condition, body, loop_state_init));
+    auto while_hlo = builder.AddInstruction(
+        HloInstruction::CreateWhile(GetLoopStateShape(ind_var_tuple_index),
+                                    condition, body, loop_state_init));
     module_->AddEntryComputation(builder.Build());
     return while_hlo;
   }
@@ -115,18 +117,34 @@ class WhileTransformerTest : public HloTestBase {
   }
 
   void RunCopyInsertionPass() {
+    HloVerifier verifier([](const Shape& shape) {
+      return ShapeUtil::ByteSizeOf(shape, /*pointer_size=*/sizeof(void*));
+    });
+    TF_ASSERT_OK(verifier.Run(module_.get()).status());
     CopyInsertion copy_insertion;
-    EXPECT_IS_OK(copy_insertion.Run(module_.get()).status());
+    TF_ASSERT_OK(copy_insertion.Run(module_.get()).status());
+  }
+
+  Shape GetLoopStateShape(const int64 ind_var_tuple_index) {
+    if (ind_var_tuple_index == 0) {
+      return ShapeUtil::MakeTupleShape(
+          {induction_variable_shape_, data_shape_});
+    } else {
+      return ShapeUtil::MakeTupleShape(
+          {data_shape_, induction_variable_shape_});
+    }
   }
 
   std::unique_ptr<HloModule> module_;
   Shape induction_variable_shape_;
   Shape data_shape_;
-  Shape loop_state_shape_;
   Shape condition_result_shape_;
 };
 
-TEST_F(WhileTransformerTest, InductionVariableAtTupleElement0) {
+// TODO(b/68830972): The while transformer is far too fragile. It patterns
+// matches the exact expressions of opcodes. Re-enable when transformation is
+// more general
+TEST_F(WhileTransformerTest, DISABLED_InductionVariableAtTupleElement0) {
   // Build computation with induction variable at tuple element 0.
   auto condition =
       module_->AddEmbeddedComputation(BuildConditionComputation(0, 10));
@@ -137,13 +155,16 @@ TEST_F(WhileTransformerTest, InductionVariableAtTupleElement0) {
   RunCopyInsertionPass();
   // Run WhileTransformer.
   auto result = gpu::CanTransformWhileToFor(while_hlo);
-  ASSERT_TRUE(result.ok());
+  TF_ASSERT_OK(result.status());
   // Check results.
   EXPECT_THAT(result.ConsumeValueOrDie(),
               Eq(std::tuple<int64, int64, int64>(0, 10, 1)));
 }
 
-TEST_F(WhileTransformerTest, InductionVariableAtTupleElement1) {
+// TODO(b/68830972): The while transformer is far too fragile. It patterns
+// matches the exact expressions of opcodes. Re-enable when transformation is
+// more general
+TEST_F(WhileTransformerTest, DISABLED_InductionVariableAtTupleElement1) {
   // Build computation with induction variable at tuple element 1.
   auto condition =
       module_->AddEmbeddedComputation(BuildConditionComputation(1, 10));
@@ -154,13 +175,16 @@ TEST_F(WhileTransformerTest, InductionVariableAtTupleElement1) {
   RunCopyInsertionPass();
   // Run WhileTransformer.
   auto result = gpu::CanTransformWhileToFor(while_hlo);
-  ASSERT_TRUE(result.ok());
+  TF_ASSERT_OK(result.status());
   // Check results.
   EXPECT_THAT(result.ConsumeValueOrDie(),
               Eq(std::tuple<int64, int64, int64>(0, 10, 1)));
 }
 
-TEST_F(WhileTransformerTest, InvalidLoopLimit) {
+// TODO(b/68830972): The while transformer is far too fragile. It patterns
+// matches the exact expressions of opcodes. Re-enable when transformation is
+// more general
+TEST_F(WhileTransformerTest, DISABLED_InvalidLoopLimit) {
   // Build computation with invalid loop limit.
   auto condition =
       module_->AddEmbeddedComputation(BuildConditionComputation(0, 5));
@@ -176,7 +200,10 @@ TEST_F(WhileTransformerTest, InvalidLoopLimit) {
               HasSubstr("Loop start must be less than loop limit."));
 }
 
-TEST_F(WhileTransformerTest, InvalidLoopIncrement) {
+// TODO(b/68830972): The while transformer is far too fragile. It patterns
+// matches the exact expressions of opcodes. Re-enable when transformation is
+// more general
+TEST_F(WhileTransformerTest, DISABLED_InvalidLoopIncrement) {
   // Build computation with invalid loop increment.
   auto condition =
       module_->AddEmbeddedComputation(BuildConditionComputation(0, 10));
diff --git a/tensorflow/compiler/xla/service/graphviz_example.cc b/tensorflow/compiler/xla/service/graphviz_example.cc
index 049e8d80d80c835bca4a4d38592564ba82a3ecf9..05017008e2ddbe0b9e78d06275fdec5d08d94bfa 100644
--- a/tensorflow/compiler/xla/service/graphviz_example.cc
+++ b/tensorflow/compiler/xla/service/graphviz_example.cc
@@ -108,8 +108,11 @@ std::unique_ptr<HloModule> MakeBigGraph() {
       HloInstruction::CreateUnary(vshape, HloOpcode::kCopy, param_v0));
   auto clamp = builder.AddInstruction(HloInstruction::CreateTernary(
       vshape, HloOpcode::kClamp, copy, param_v1, param_v2));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto dot = builder.AddInstruction(
-      HloInstruction::CreateBinary(vshape, HloOpcode::kDot, clamp, param_v0));
+      HloInstruction::CreateDot(vshape, clamp, param_v0, dot_dnums));
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({dot, param_s, clamp}));
   auto scalar = builder.AddInstruction(
diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc
index 17b926c8748e45b55f380e7595711b9e7a748f64..387b649a731ebcbfd8307807469f39f22d192b06 100644
--- a/tensorflow/compiler/xla/service/heap_simulator_test.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc
@@ -259,8 +259,11 @@ TEST_F(HeapSimulatorTest, MultiplyDot) {
       HloInstruction::CreateParameter(2, f32scalar_, "paramY"));
   auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
       f32vec4_, HloOpcode::kMultiply, paramA, paramX));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto dot = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kDot, mul, paramY));
+      HloInstruction::CreateDot(f32vec4_, mul, paramY, dot_dnums));
 
   // The buffer for dot is the output, and it cannot be shared with the buffer
   // for mul, since dot isn't elementwise.
@@ -292,8 +295,11 @@ TEST_F(HeapSimulatorTest, MultiplyDotAdd) {
       HloInstruction::CreateParameter(2, f32scalar_, "paramY"));
   auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
       f32vec4_, HloOpcode::kMultiply, paramA, paramX));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto dot = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kDot, mul, paramY));
+      HloInstruction::CreateDot(f32vec4_, mul, paramY, dot_dnums));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(f32vec4_, HloOpcode::kAdd, dot, paramA));
 
@@ -327,10 +333,13 @@ TEST_F(HeapSimulatorTest, MultiplyDotDot) {
       HloInstruction::CreateParameter(2, f32scalar_, "paramY"));
   auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
       f32vec4_, HloOpcode::kMultiply, paramA, paramX));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto dot0 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kDot, mul, paramY));
+      HloInstruction::CreateDot(f32vec4_, mul, paramY, dot_dnums));
   auto dot1 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kDot, dot0, paramY));
+      HloInstruction::CreateDot(f32vec4_, dot0, paramY, dot_dnums));
 
   // The buffer for dot1 is the output.  No buffers can be shared.  The buffer
   // for mul is freed before the end, since it's no longer used after dot0
@@ -365,10 +374,13 @@ TEST_F(HeapSimulatorTest, MultiplyDotDotTuple) {
       HloInstruction::CreateParameter(2, f32scalar_, "paramY"));
   auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
       f32vec4_, HloOpcode::kMultiply, paramA, paramX));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto dot0 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kDot, mul, paramY));
+      HloInstruction::CreateDot(f32vec4_, mul, paramY, dot_dnums));
   auto dot1 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kDot, dot0, paramY));
+      HloInstruction::CreateDot(f32vec4_, dot0, paramY, dot_dnums));
   auto tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({dot0, dot1}));
 
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index 79493c4112804f8454d200f3f83aa85d718f0d0a..e4aed7593c51a2d1bb156493666b3c1b03dcc626 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -36,6 +36,9 @@ option cc_enable_arenas = true;
 
 // Serialization of HloInstruction.
 message HloInstructionProto {
+  reserved 10;
+  reserved "parameter_name";
+
   string name = 1;
   string opcode = 2;
   xla.Shape shape = 3;
@@ -50,9 +53,8 @@ message HloInstructionProto {
   // Literal, only present for kConstant.
   xla.LiteralProto literal = 8;
 
-  // Parameter info, only present for kParameter.
+  // Parameter number is only present for kParameter.
   int64 parameter_number = 9;
-  string parameter_name = 10;
 
   // Fusion state, only present for kFusion.
   string fusion_kind = 11;
@@ -118,6 +120,9 @@ message HloInstructionProto {
 
   // Shape of outfeed request.
   xla.Shape outfeed_shape = 29;
+
+  // Describes the dimension numbers used for a dot operation
+  xla.DotDimensionNumbers dot_dimension_numbers = 30;
 }
 
 // Serialization of HloComputation.
@@ -250,7 +255,3 @@ message HloProto {
   HloOrderingProto hlo_ordering = 2;
   BufferAssignmentProto buffer_assignment = 3;
 }
-
-message HloProtos {
-  repeated HloProto hlo_protos = 1;
-}
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
index 6f8099475146e6bbcfb61d2e5a91a7a6f9e63e58..6d2a3aa5b531650a658502531e050702ffbd3760 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
@@ -144,8 +144,10 @@ class BufferValueMap {
   // Move the given value into the given buffer.
   void MoveValueToBuffer(const HloValue& value, BufferNumber buffer_number) {
     BufferNumber old_buffer_number = value_to_buffer_number_.at(&value);
-    buffers_.at(old_buffer_number).erase(&value);
-    if (buffers_.at(old_buffer_number).empty()) {
+    tensorflow::gtl::FlatSet<const HloValue*>& old_value_set =
+        buffers_.at(old_buffer_number);
+    old_value_set.erase(&value);
+    if (old_value_set.empty()) {
       buffers_.erase(old_buffer_number);
     }
 
@@ -175,7 +177,7 @@ class BufferValueMap {
     // Value is init of a while (use is while).
     std::vector<BufferNumber> aliased_buffers;
     for (const HloUse& use : value.uses()) {
-      VLOG(1) << "use of value " << value.ToShortString() << ": " << use;
+      VLOG(2) << "use of value " << value.ToShortString() << ": " << use;
       if (use.instruction->opcode() == HloOpcode::kWhile) {
         // Determine the while value that this shares a buffer with.
         const HloValue& while_value =
@@ -411,7 +413,7 @@ string HloAliasAnalysis::ToString() const {
 /* static */
 StatusOr<std::unique_ptr<HloAliasAnalysis>> HloAliasAnalysis::Run(
     HloModule* module) {
-  VLOG(1) << "HloAliasAnalysis::Run on module " << module->name();
+  VLOG(2) << "HloAliasAnalysis::Run on module " << module->name();
   XLA_VLOG_LINES(2, module->ToString());
 
   auto alias_analysis = WrapUnique(new HloAliasAnalysis(module));
@@ -444,7 +446,7 @@ StatusOr<std::unique_ptr<HloAliasAnalysis>> HloAliasAnalysis::Run(
 
   TF_DCHECK_OK(alias_analysis->Verify());
 
-  XLA_VLOG_LINES(1, alias_analysis->ToString());
+  XLA_VLOG_LINES(2, alias_analysis->ToString());
   return std::move(alias_analysis);
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index 8056bcf0f791bee949c02d6ecae4af633da84179..a63affa06caf75f1ccab084bd114e39ba7c91a38 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -131,9 +131,9 @@ Status HloComputation::RemoveParameter(int64 param_no) {
 
   while (param_no < param_instructions_.size()) {
     param_instruction = param_instructions_[param_no];
-    string param_name = param_instruction->parameter_name();
+    string param_name = param_instruction->name();
     // Fusion parameters are named foo.param_1, bar.param_2, etc. We are
-    // renumbering the parameters so replace the final number in the name with
+    // renumbering the parameters, so replace the final number in the name with
     // the updated value.
     const string param_underscore = ".param_";
     size_t index = param_name.rfind(param_underscore);
@@ -176,10 +176,6 @@ bool HloComputation::IsRemovable(const HloInstruction* instruction) {
     return false;
   }
 
-  if (instruction->HasSideEffect()) {
-    return false;
-  }
-
   return true;
 }
 
@@ -207,7 +203,8 @@ Status HloComputation::RemoveInstructionAndUnusedOperands(
     worklist.pop();
 
     if (removed.count(item) != 0 || item->user_count() != 0 ||
-        item == root_instruction() || !IsRemovable(item)) {
+        item == root_instruction() || !IsRemovable(item) ||
+        item->HasSideEffect()) {
       continue;
     }
     for (int i = 0; i < item->operand_count(); ++i) {
@@ -367,26 +364,27 @@ std::list<HloComputation*> HloComputation::MakeEmbeddedComputationsList()
   return post_order;
 }
 
-string HloComputation::ToString(int nested_level,
-                                bool include_large_constants) const {
+string HloComputation::ToString(const HloPrintOptions& options) const {
   std::ostringstream s;
-  for (int i = 0; i < nested_level; i++) {
+  for (int i = 0; i < options.indent_amount(); i++) {
     s << "    ";
   }
-  s << "%" << name() << " " << ShapeUtil::HumanString(ComputeProgramShape())
-    << " {\n";
+  if (options.print_percent()) {
+    s << "%";
+  }
+  s << name();
+  if (options.print_program_shape()) {
+    s << " " << ShapeUtil::HumanString(ComputeProgramShape());
+  }
+  s << " {\n";
   for (const HloInstruction* instruction : MakeInstructionPostOrder()) {
-    for (int i = 0; i < nested_level; i++) {
+    for (int i = 0; i < options.indent_amount(); i++) {
       s << "    ";
     }
     s << "  " << (instruction == root_instruction_ ? "ROOT " : "")
-      << instruction->ToString(
-             /*compact_operands=*/false,
-             /*include_metadata=*/true,
-             /*include_large_constants=*/include_large_constants)
-      << "\n";
+      << instruction->ToString(options) << "\n";
   }
-  for (int i = 0; i < nested_level; i++) {
+  for (int i = 0; i < options.indent_amount(); i++) {
     s << "    ";
   }
   s << "}";
@@ -407,16 +405,18 @@ HloComputationProto HloComputation::ToProto() const {
 /* static */ StatusOr<std::unique_ptr<HloComputation>>
 HloComputation::CreateFromProto(
     HloModule* module, const HloComputationProto& proto,
-    tensorflow::gtl::FlatMap<string, HloComputation*>* computation_map,
+    const tensorflow::gtl::FlatMap<string, HloComputation*>& computation_map,
+    const std::function<void(std::unique_ptr<HloComputation>)>&
+        add_fused_computation,
     HloInstruction* fusion_instruction) {
   std::vector<std::unique_ptr<HloInstruction>> instructions;
   tensorflow::gtl::FlatMap<string, HloInstruction*> instruction_map;
   int64 parameter_count = 0;
   for (const HloInstructionProto& instruction_proto : proto.instructions()) {
-    TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<HloInstruction> instruction,
-        HloInstruction::CreateFromProto(module, instruction_proto,
-                                        instruction_map, computation_map));
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloInstruction> instruction,
+                        HloInstruction::CreateFromProto(
+                            module, instruction_proto, instruction_map,
+                            computation_map, add_fused_computation));
     if (instruction->opcode() == HloOpcode::kParameter) {
       parameter_count++;
     }
@@ -541,7 +541,7 @@ ProgramShape HloComputation::ComputeProgramShape() const {
 
   for (auto* param_instruction : param_instructions_) {
     *program_shape.add_parameters() = param_instruction->shape();
-    *program_shape.add_parameter_names() = param_instruction->parameter_name();
+    *program_shape.add_parameter_names() = param_instruction->name();
   }
   *program_shape.mutable_result() = root_instruction_->shape();
 
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index 2835dbbb846b24599840a9ee3ea72809d3f97dd2..6436815f910405477ec21a33dec75ef71df08602 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -138,8 +138,11 @@ class HloComputation {
   void UniquifyName(NameUniquer* name_uniquer);
 
   // Return a string representation of the computation.
-  string ToString(int nested_level = 0,
-                  bool include_large_constants = false) const;
+  //
+  // (We express the default options using an overload rather than a default
+  // param because gdb ignores default params, but does resolve overloads.)
+  string ToString() const { return ToString(HloPrintOptions()); }
+  string ToString(const HloPrintOptions& options) const;
 
   // Returns a serialized representation of this computation.
   HloComputationProto ToProto() const;
@@ -152,12 +155,16 @@ class HloComputation {
   //   computation_map: a map from computation name to HloComputation*. This map
   //     must contain all computations which the newly constructed computation
   //     calls.
-  //  fusion_instruction: if non-null then the newly created computation will be
-  //     constructed as a fused computation with this instruction as its fusion
-  //     parent.
+  //   add_fused_computation: A function to call to add a fused
+  //     computation. Used only when the instruction is a fusion instruction.
+  //   fusion_instruction: if non-null then the newly created computation will
+  //     be constructed as a fused computation with this instruction as its
+  //     fusion parent.
   static StatusOr<std::unique_ptr<HloComputation>> CreateFromProto(
       HloModule* module, const HloComputationProto& proto,
-      tensorflow::gtl::FlatMap<string, HloComputation*>* computation_map,
+      const tensorflow::gtl::FlatMap<string, HloComputation*>& computation_map,
+      const std::function<void(std::unique_ptr<HloComputation>)>&
+          add_fused_computation,
       HloInstruction* fusion_instruction = nullptr);
 
   // Gets the instructions in this computation.
@@ -309,11 +316,17 @@ class HloComputation {
           replacements,
       HloModule* module = nullptr, const string& suffix = "clone");
 
-  // Returns true if the given instruction can be removed from the
-  // computation. Instructions such as parameters and send/receive instructions
-  // cannot be removed without violating invariants of the HLO computation or
-  // module with the exception of fusion computation.  A parameter instruction
-  // is removable for a fusion computation.
+  // Returns true if the given instruction can be removed from the computation.
+  // Parameter instructions cannot be removed without violating invariants of
+  // the HLO computation with the exception of fusion computation. A parameter
+  // instruction is removable for a fusion computation.
+  //
+  // Note that IsRemovable() is a necessariy condition to remove an instruction
+  // rather than a sufficient condition. For example, instructions with
+  // side-effect (e.g., Send, Infeed) may be removed from a computation, but the
+  // transformation must guarantee the invariants relevant to the instructions
+  // still hold (e.g., Send and Recv must be removed together to make each
+  // channel complete).
   bool IsRemovable(const HloInstruction* instruction);
 
   // Returns true if this computation has a side effect. A computation has a
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index 1877065f672bdf705f044568e2d77ac342a808cc..b933695b823871c6c0174da6d6f99e618219442a 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -22,13 +22,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 
 namespace xla {
 
 constexpr char HloCostAnalysis::kFlopsKey[];
 constexpr char HloCostAnalysis::kTranscendentalsKey[];
 constexpr char HloCostAnalysis::kBytesAccessedKey[];
-constexpr char HloCostAnalysis::kSecondsKey[];
+constexpr char HloCostAnalysis::kOptimalSecondsKey[];
 
 HloCostAnalysis::HloCostAnalysis(const ShapeSizeFunction& shape_size)
     : HloCostAnalysis(shape_size, {}) {}
@@ -60,16 +61,16 @@ Status HloCostAnalysis::Postprocess(const HloInstruction* hlo) {
   if (current_should_compute_bottleneck_time_) {
     // Compute the time as the time of the bottleneck, i.e. the slowest property
     // given the per-second rate of each property.
-    float max_seconds = 0.0f;
+    float optimal_seconds = 0.0f;
     for (const auto& property : current_properties_) {
-      if (property.first != kSecondsKey) {
-        max_seconds = std::max(
-            max_seconds,
+      if (property.first != kOptimalSecondsKey) {
+        optimal_seconds = std::max(
+            optimal_seconds,
             property.second /
                 GetProperty(property.first, per_second_rates_, INFINITY));
       }
     }
-    current_properties_[kSecondsKey] = max_seconds;
+    current_properties_[kOptimalSecondsKey] = optimal_seconds;
   }
 
   TF_RET_CHECK(hlo_properties_.emplace(hlo, current_properties_).second);
@@ -200,10 +201,11 @@ Status HloCostAnalysis::HandleCopy(const HloInstruction*) {
 Status HloCostAnalysis::HandleDot(const HloInstruction* dot) {
   const Shape& lhs_shape = dot->operand(0)->shape();
   const Shape& rhs_shape = dot->operand(1)->shape();
+  const DotDimensionNumbers& dnums = dot->dot_dimension_numbers();
   // Count of elements along the reduction dimension (last dimension for the
   // rhs).
-  int64 reduction_width = lhs_shape.dimensions(ShapeUtil::Rank(lhs_shape) - 1);
-
+  int64 reduction_width =
+      lhs_shape.dimensions(dnums.lhs_contracting_dimensions(0));
   // First divide by reduction width before multiplying by rhs elements to avoid
   // overflow.
   int64 fma_count;
@@ -396,7 +398,14 @@ Status HloCostAnalysis::HandleCrossReplicaSum(const HloInstruction* crs) {
   //
   // TODO(b/33004697): Compute correct cost here, taking the actual number of
   // replicas into account.
-  current_properties_[kFlopsKey] = ShapeUtil::ElementsIn(crs->shape());
+  double flops = 0.0;
+  ShapeUtil::ForEachSubshape(
+      crs->shape(), [&, this](const Shape& subshape, const ShapeIndex&) {
+        if (ShapeUtil::IsArray(subshape)) {
+          flops += ShapeUtil::ElementsIn(subshape);
+        }
+      });
+  current_properties_[kFlopsKey] = flops;
   return Status::OK();
 }
 
@@ -480,6 +489,25 @@ Status HloCostAnalysis::HandleWhile(const HloInstruction* xla_while) {
   return Status::OK();
 }
 
+Status HloCostAnalysis::HandleConditional(const HloInstruction* conditional) {
+  // Compute the cost of the true and false computations and take the maximum
+  // from those for each property.
+  TF_ASSIGN_OR_RETURN(const Properties true_computation_properties,
+                      ProcessSubcomputation(conditional->true_computation()));
+  TF_ASSIGN_OR_RETURN(const Properties false_computation_properties,
+                      ProcessSubcomputation(conditional->false_computation()));
+  current_properties_ = true_computation_properties;
+  for (const auto& property : false_computation_properties) {
+    if (!tensorflow::gtl::InsertIfNotPresent(&current_properties_, property)) {
+      current_properties_[property.first] =
+          std::max(current_properties_[property.first], property.second);
+    }
+  }
+  current_should_compute_bottleneck_time_ = false;
+
+  return Status::OK();
+}
+
 Status HloCostAnalysis::FinishVisit(const HloInstruction*) {
   return Status::OK();
 }
@@ -496,8 +524,8 @@ float HloCostAnalysis::bytes_accessed() const {
   return GetProperty(kBytesAccessedKey, properties_sum_);
 }
 
-float HloCostAnalysis::seconds() const {
-  return GetProperty(kSecondsKey, properties_sum_);
+float HloCostAnalysis::optimal_seconds() const {
+  return GetProperty(kOptimalSecondsKey, properties_sum_);
 }
 
 int64 HloCostAnalysis::flop_count(const HloInstruction& hlo) const {
@@ -512,8 +540,8 @@ int64 HloCostAnalysis::bytes_accessed(const HloInstruction& hlo) const {
   return GetPropertyForHlo(hlo, kBytesAccessedKey, hlo_properties_);
 }
 
-float HloCostAnalysis::seconds(const HloInstruction& hlo) const {
-  return GetPropertyForHlo(hlo, kSecondsKey, hlo_properties_);
+float HloCostAnalysis::optimal_seconds(const HloInstruction& hlo) const {
+  return GetPropertyForHlo(hlo, kOptimalSecondsKey, hlo_properties_);
 }
 
 StatusOr<HloCostAnalysis::Properties> HloCostAnalysis::ProcessSubcomputation(
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
index 0f447753788d870e91204fcb03eb2de204c958bf..fade19522cf0c30eab037aa355de1f9203f80014 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
@@ -42,7 +42,7 @@ class HloCostAnalysis : public ConstDfsHloVisitor {
   static constexpr char kFlopsKey[] = "flops";
   static constexpr char kTranscendentalsKey[] = "transcendentals";
   static constexpr char kBytesAccessedKey[] = "bytes accessed";
-  static constexpr char kSecondsKey[] = "seconds";
+  static constexpr char kOptimalSecondsKey[] = "optimal_seconds";
 
   // shape_size is a function which returns the size in bytes of the top-level
   // buffer of a shape.
@@ -97,6 +97,7 @@ class HloCostAnalysis : public ConstDfsHloVisitor {
   Status HandleReshape(const HloInstruction* reshape) override;
   Status HandleTranspose(const HloInstruction* transpose) override;
   Status HandleWhile(const HloInstruction* xla_while) override;
+  Status HandleConditional(const HloInstruction* conditional) override;
   Status FinishVisit(const HloInstruction* root) override;
 
   Status Preprocess(const HloInstruction* hlo) override;
@@ -118,14 +119,14 @@ class HloCostAnalysis : public ConstDfsHloVisitor {
   float flop_count() const;
   float transcendental_count() const;
   float bytes_accessed() const;
-  float seconds() const;
+  float optimal_seconds() const;
 
   // Returns the respective cost computed for a particular HLO instruction, or 0
   // if the HLO was not found to have a cost in the analysis.
   int64 flop_count(const HloInstruction& hlo) const;
   int64 transcendental_count(const HloInstruction& hlo) const;
   int64 bytes_accessed(const HloInstruction& hlo) const;
-  float seconds(const HloInstruction& hlo) const;
+  float optimal_seconds(const HloInstruction& hlo) const;
 
   const Properties& properties() const { return properties_sum_; }
   const float property(const string& key) const {
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
index 0eaa21ef254e3461baaaca57503ab24ce35ac929..3b289c240a45e8f3df8156ed89e879da2132d01a 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
@@ -389,7 +389,7 @@ TEST_F(FusionCostAnalysis, LoopFusion) {
     static_assert(bytes_accessed == 64, "");
     EXPECT_EQ(fusion_analysis.bytes_accessed(), bytes_accessed);
 
-    EXPECT_EQ(fusion_analysis.seconds(), 1 << i);
+    EXPECT_EQ(fusion_analysis.optimal_seconds(), 1 << i);
   }
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
index 3f34b9ceb34abc89fca5b896bb8fbe3a06cd6ed4..2a335843f507e2071807245d4dd256e1ec6f08c8 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
@@ -333,6 +333,21 @@ bool HloDataflowAnalysis::UpdateCallValueSet(HloInstruction* call) {
   return false;
 }
 
+bool HloDataflowAnalysis::UpdateConditionalValueSet(
+    HloInstruction* conditional) {
+  CHECK_EQ(conditional->opcode(), HloOpcode::kConditional);
+  std::vector<const InstructionValueSet*> inputs = {
+      &GetInstructionValueSet(
+          conditional->true_computation()->root_instruction()),
+      &GetInstructionValueSet(
+          conditional->false_computation()->root_instruction())};
+  // A phi-node is not defined for a kConditional instruction even though it
+  // represents a join point. This is because the current approach is to define
+  // a phi-node only for kWhile to account for the dataflow through back-edges
+  // and deal with the ambiguity in other cases.
+  return GetInstructionValueSet(conditional).AssignUnionOf(inputs);
+}
+
 bool HloDataflowAnalysis::UpdateCopyValueSet(HloInstruction* copy) {
   CHECK_EQ(copy->opcode(), HloOpcode::kCopy);
   bool changed = false;
@@ -394,7 +409,7 @@ bool HloDataflowAnalysis::UpdateParameterValueSet(HloInstruction* parameter) {
   CHECK_EQ(call_graph_node.context(), CallContext::kSequential);
 
   std::vector<const InstructionValueSet*> inputs;
-  bool called_from_while = false;
+  bool need_phi = false;
   for (const CallSite& callsite : call_graph_node.caller_callsites()) {
     if (callsite.instruction()->opcode() == HloOpcode::kCall) {
       // The operand values of a call instruction are forwarded to the
@@ -416,14 +431,32 @@ bool HloDataflowAnalysis::UpdateParameterValueSet(HloInstruction* parameter) {
         inputs.push_back(&GetInstructionValueSet(
             callsite.instruction()->while_body()->root_instruction()));
       }
-      called_from_while = true;
+      need_phi = true;
+    } else if (callsite.instruction()->opcode() == HloOpcode::kConditional) {
+      CHECK_EQ(parameter->parameter_number(), 0);
+      auto conditional = callsite.instruction();
+      // Conditional has 3 operands. Operand 0 is the predicate, operand 1 is
+      // the argument to the true computation and operand 2 is the argument to
+      // the false computation.
+      //
+      // If the parameter belongs to conditional's true computation, then
+      // operand 1 is forwarded to this parameter instruction. If the parameter
+      // belongs to conditional's false computation, then operand 2 is forwarded
+      // to this parameter instruction.
+      if (parameter->parent() == conditional->true_computation()) {
+        inputs.push_back(&GetInstructionValueSet(conditional->operand(1)));
+      } else {
+        CHECK_EQ(parameter->parent(), conditional->false_computation());
+        inputs.push_back(&GetInstructionValueSet(conditional->operand(2)));
+      }
+      need_phi = true;
     } else {
       LOG(FATAL) << "CallContext::kSequential computations should only be "
-                    "called from call or while instructions";
+                    "called from call, while, or conditional instructions";
     }
   }
 
-  if (ssa_form_ && called_from_while) {
+  if (ssa_form_ && need_phi) {
     return Phi(parameter, inputs);
   } else {
     return GetInstructionValueSet(parameter).AssignUnionOf(inputs);
@@ -512,6 +545,8 @@ bool HloDataflowAnalysis::UpdateInstructionValueSet(
       return UpdateSendValueSet(instruction);
     case HloOpcode::kRecvDone:
       return UpdateRecvDoneValueSet(instruction);
+    case HloOpcode::kConditional:
+      return UpdateConditionalValueSet(instruction);
     default:
       // Instruction does not forward HloValues (it defines all values in its
       // output). No update is necessary.
@@ -550,13 +585,31 @@ void HloDataflowAnalysis::Propagate() {
 
       // If user sequentially calls a computation, then the respective
       // parameter(s) of the computation need to be updated.
-      for (HloComputation* called_computation : user->called_computations()) {
-        const CallGraphNode& call_graph_node =
-            call_graph_->GetNode(called_computation);
-        if (call_graph_node.context() == CallContext::kSequential) {
-          for (int64 operand_number : user->OperandIndices(instruction)) {
-            worklist.push(
-                called_computation->parameter_instruction(operand_number));
+      if (user->opcode() == HloOpcode::kConditional) {
+        // If operand 0 is the use of instruction, then no parameters need to be
+        // updated, since that is the predicate of the conditional.
+        // If operand 1 is the use of instruction, then the true_computation's
+        // parameter need to be updated.
+        // If operand 2 is the use of instruction, then the false_computation's
+        // parameter need to be updated.
+        //
+        // Note that the same instruction can be used in both operand 1 and
+        // operand 2.
+        if (user->operand(1) == instruction) {
+          worklist.push(user->true_computation()->parameter_instruction(0));
+        }
+        if (user->operand(2) == instruction) {
+          worklist.push(user->false_computation()->parameter_instruction(0));
+        }
+      } else {
+        for (HloComputation* called_computation : user->called_computations()) {
+          const CallGraphNode& call_graph_node =
+              call_graph_->GetNode(called_computation);
+          if (call_graph_node.context() == CallContext::kSequential) {
+            for (int64 operand_number : user->OperandIndices(instruction)) {
+              worklist.push(
+                  called_computation->parameter_instruction(operand_number));
+            }
           }
         }
       }
@@ -568,7 +621,8 @@ void HloDataflowAnalysis::Propagate() {
       const CallGraphNode& call_graph_node =
           call_graph_->GetNode(instruction->parent());
       for (const CallSite& callsite : call_graph_node.caller_callsites()) {
-        if (callsite.instruction()->opcode() == HloOpcode::kCall) {
+        if ((callsite.instruction()->opcode() == HloOpcode::kCall) ||
+            (callsite.instruction()->opcode() == HloOpcode::kConditional)) {
           worklist.push(callsite.instruction());
         } else if (callsite.instruction()->opcode() == HloOpcode::kWhile) {
           // Add the while itself, and the body and condition parameters.
@@ -636,6 +690,7 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() {
           break;
         case HloOpcode::kWhile:
         case HloOpcode::kCall:
+        case HloOpcode::kConditional:
         case HloOpcode::kGetTupleElement:
           // These instructions define no values. The values in their output
           // flow from their operands or from cross computation dataflow.
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
index dfd81ae951042f7a4d6d3c24af4d5b7e046c272d..469620d01295f90e0c36a48cac9be47c12473a68 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
@@ -146,6 +146,7 @@ class HloDataflowAnalysis {
   // the instruction value set changed.
   bool UpdateBitcastValueSet(HloInstruction* bitcast);
   bool UpdateCallValueSet(HloInstruction* call);
+  bool UpdateConditionalValueSet(HloInstruction* conditional);
   bool UpdateCopyValueSet(HloInstruction* copy);
   bool UpdateGetTupleElementValueSet(HloInstruction* gte);
   bool UpdateParameterValueSet(HloInstruction* parameter);
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
index f08f0b1d6833b028baa5f997929a17eb5abae205..e714b2567fd1b3eab607a19f0bb7e3288150dc64 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
@@ -34,6 +34,7 @@ limitations under the License.
 namespace xla {
 namespace {
 
+using ::testing::ElementsAre;
 using ::testing::UnorderedElementsAre;
 
 // Test is parameterized on a bool which is whether the dataflow analysis is
@@ -77,11 +78,23 @@ class HloDataflowAnalysisTest : public HloTestBase,
                                  analysis_->GetValueDefinedAt(b), *analysis_);
   }
 
+  std::unique_ptr<HloComputation> CreateR0F32UnaryOpComputation(
+      HloOpcode opcode) {
+    HloComputation::Builder builder(TestName() + "." + HloOpcodeString(opcode));
+    HloInstruction* param0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, scalar_shape_, "param0"));
+    builder.AddInstruction(
+        HloInstruction::CreateUnary(scalar_shape_, opcode, param0));
+    return builder.Build();
+  }
+
   std::unique_ptr<HloModule> module_;
   std::unique_ptr<HloDataflowAnalysis> analysis_;
 
   const Shape scalar_shape_ = ShapeUtil::MakeShape(F32, {});
   const Shape vector_shape_ = ShapeUtil::MakeShape(F32, {42});
+  const Shape tuple_shape_ = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {}), ShapeUtil::MakeShape(F32, {})});
 };
 
 TEST_P(HloDataflowAnalysisTest, BinaryOperation) {
@@ -1528,6 +1541,315 @@ TEST_P(HloDataflowAnalysisTest, EmbeddedComputationInterference) {
   EXPECT_TRUE(InstructionsMayInterfere(ordering, negate, embedded_log));
 }
 
+TEST_P(HloDataflowAnalysisTest, ConditionalWithIdentity) {
+  // Test conditional with identity computations in both true and false cases.
+  //
+  // true_computation(F32[] %true_param):
+  //   return %true_param
+  //
+  // false_computation(F32[] %false_param):
+  //   return %false_param
+  //
+  // entry:
+  //   %pred = Constant(true)
+  //   %constant1 = Constant(56.0)
+  //   %constant2 = Constant(12.0)
+  //   return Conditional(%pred, %constant1, true_computation,
+  //                      %constant2, false_computation)
+
+  auto true_builder = HloComputation::Builder(TestName() + "_true");
+  auto true_param = true_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "true_param"));
+  HloComputation* true_computation =
+      module_->AddEmbeddedComputation(true_builder.Build());
+
+  auto false_builder = HloComputation::Builder(TestName() + "_false");
+  auto false_param = false_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "false_param"));
+  HloComputation* false_computation =
+      module_->AddEmbeddedComputation(false_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto pred = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(56.0f)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(12.0f)));
+  auto conditional = builder.AddInstruction(HloInstruction::CreateConditional(
+      scalar_shape_, pred, constant1, true_computation, constant2,
+      false_computation));
+  module_->AddEntryComputation(builder.Build());
+
+  const HloDataflowAnalysis& analysis = RunAnalysis(GetParam());
+
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(pred));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(constant1));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(constant2));
+
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(true_param));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(false_param));
+
+  EXPECT_EQ(analysis.GetUniqueValueAt(true_param),
+            analysis.GetValueDefinedAt(constant1));
+  EXPECT_EQ(analysis.GetUniqueValueAt(false_param),
+            analysis.GetValueDefinedAt(constant2));
+
+  EXPECT_THAT(analysis.GetValueDefinedAt(pred).uses(),
+              ElementsAre(HloUse{conditional, 0, {}}));
+  EXPECT_THAT(analysis.GetValueDefinedAt(constant1).uses(),
+              ElementsAre(HloUse{conditional, 1, {}}));
+  EXPECT_THAT(analysis.GetValueDefinedAt(constant2).uses(),
+              ElementsAre(HloUse{conditional, 2, {}}));
+
+  EXPECT_EQ(analysis.values().size(), 3);
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(conditional));
+  EXPECT_THAT(HloValuesAt(conditional),
+              UnorderedElementsAre(analysis.GetValueDefinedAt(constant1),
+                                   analysis.GetValueDefinedAt(constant2)));
+}
+
+TEST_P(HloDataflowAnalysisTest, ConditionalTakingTupleOperand) {
+  // Test conditional with true and false computations taking a tuple operand.
+  //
+  // true_computation((F32[], F32[]) %true_param):
+  //   %true_x = GetTupleElement(%true_param, 0)
+  //   %true_y = GetTupleElement(%true_param, 1)
+  //   return Add(%true_x, %true_y)
+  //
+  // false_computation((F32[], F32[]) %false_param):
+  //   %false_x = GetTupleElement(%false_param, 0)
+  //   %false_y = GetTupleElement(%false_param, 1)
+  //   return Subtract(%false_x, %false_y)
+  //
+  // entry:
+  //   %pred = Constant(true)
+  //   %constant1 = Constant(56.0)
+  //   %constant2 = Constant(12.0)
+  //   %tuple_operand = Tuple(%constant1, %constant2)
+  //   return Conditional(%pred, %tuple_operand, true_computation,
+  //                      %tuple_operand, false_computation)
+
+  auto true_builder = HloComputation::Builder(TestName() + "_true");
+  auto true_param = true_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape_, "true_param"));
+  auto true_x = true_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, true_param, 0));
+  auto true_y = true_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, true_param, 1));
+  auto add = true_builder.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape_, HloOpcode::kAdd, true_x, true_y));
+  HloComputation* true_computation =
+      module_->AddEmbeddedComputation(true_builder.Build());
+
+  auto false_builder = HloComputation::Builder(TestName() + "_false");
+  auto false_param = false_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape_, "false_param"));
+  auto false_x = false_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, false_param, 0));
+  auto false_y = false_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, false_param, 1));
+  auto sub = false_builder.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape_, HloOpcode::kSubtract, false_x, false_y));
+  HloComputation* false_computation =
+      module_->AddEmbeddedComputation(false_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto pred = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(56.0f)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(12.0f)));
+  auto tuple_operand = builder.AddInstruction(
+      HloInstruction::CreateTuple({constant1, constant2}));
+  auto conditional = builder.AddInstruction(HloInstruction::CreateConditional(
+      scalar_shape_, pred, tuple_operand, true_computation, tuple_operand,
+      false_computation));
+  module_->AddEntryComputation(builder.Build());
+
+  const HloDataflowAnalysis& analysis = RunAnalysis(GetParam());
+
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(pred));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(constant1));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(constant2));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(tuple_operand));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(add));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(sub));
+
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(true_param));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(false_param));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(true_x));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(true_y));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(false_x));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(false_y));
+
+  EXPECT_EQ(analysis.GetUniqueValueAt(true_param),
+            analysis.GetValueDefinedAt(tuple_operand));
+  EXPECT_EQ(analysis.GetUniqueValueAt(false_param),
+            analysis.GetValueDefinedAt(tuple_operand));
+  EXPECT_EQ(analysis.GetUniqueValueAt(true_x),
+            analysis.GetValueDefinedAt(constant1));
+  EXPECT_EQ(analysis.GetUniqueValueAt(true_y),
+            analysis.GetValueDefinedAt(constant2));
+  EXPECT_EQ(analysis.GetUniqueValueAt(false_x),
+            analysis.GetValueDefinedAt(constant1));
+  EXPECT_EQ(analysis.GetUniqueValueAt(false_y),
+            analysis.GetValueDefinedAt(constant2));
+
+  EXPECT_THAT(analysis.GetValueDefinedAt(pred).uses(),
+              ElementsAre(HloUse{conditional, 0, {}}));
+  EXPECT_THAT(analysis.GetValueDefinedAt(constant1).uses(),
+              UnorderedElementsAre(HloUse{conditional, 1, {0}},
+                                   HloUse{conditional, 2, {0}},
+                                   HloUse{add, 0, {}}, HloUse{sub, 0, {}}));
+  EXPECT_THAT(analysis.GetValueDefinedAt(constant2).uses(),
+              UnorderedElementsAre(HloUse{conditional, 1, {1}},
+                                   HloUse{conditional, 2, {1}},
+                                   HloUse{add, 1, {}}, HloUse{sub, 1, {}}));
+  EXPECT_THAT(analysis.GetValueDefinedAt(tuple_operand).uses(),
+              UnorderedElementsAre(
+                  HloUse{conditional, 1, {}}, HloUse{conditional, 2, {}},
+                  HloUse{true_x, 0, {}}, HloUse{true_y, 0, {}},
+                  HloUse{false_x, 0, {}}, HloUse{false_y, 0, {}}));
+
+  EXPECT_EQ(analysis.values().size(), 6);
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(conditional));
+  EXPECT_THAT(HloValuesAt(conditional),
+              UnorderedElementsAre(analysis.GetValueDefinedAt(add),
+                                   analysis.GetValueDefinedAt(sub)));
+}
+
+TEST_P(HloDataflowAnalysisTest, NestedConditionals) {
+  // computation1(F32[] %param1):
+  //   %ceil = Ceil(%param1)
+  //   return %ceil
+  //
+  // computation2(F32[] %param2):
+  //   %floor = Floor(%param2)
+  //   return %floor
+  //
+  // computation3(F32[] %param3):
+  //   %negate = Negate(%param3)
+  //   return %negate
+  //
+  // inner_conditional((PRED, F32[], F32[]) %param_cond):
+  //   %pred_cond = GetTupleElement(%param_cond, 0)
+  //   %true_operand_cond = GetTupleElement(%param_cond, 1)
+  //   %false_opearnd_cond = GetTupleElement(%param_cond, 2)
+  //   return Conditional(%pred_cond, %true_operand_cond, computation1,
+  //                      %false_operand_cond, computation2)
+  //
+  // entry:
+  //   %pred1 = Constant(true)
+  //   %pred2 = Constant(false)
+  //   %constant1 = Constant(1.1);
+  //   %constant2 = Constant(2.2);
+  //   %constant3 = Constant(3.3);
+  //   return Conditional(%pred1, (%pred2, %constant1, %constant2),
+  //                      inner_conditional, %constant3, computation3)
+
+  auto computation1 = module_->AddEmbeddedComputation(
+      CreateR0F32UnaryOpComputation(HloOpcode::kCeil));
+  auto computation2 = module_->AddEmbeddedComputation(
+      CreateR0F32UnaryOpComputation(HloOpcode::kFloor));
+  auto computation3 = module_->AddEmbeddedComputation(
+      CreateR0F32UnaryOpComputation(HloOpcode::kNegate));
+
+  // Build inner_conditional computation.
+  const Shape scalar_bool_shape = ShapeUtil::MakeShape(PRED, {});
+  const Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {scalar_bool_shape, scalar_shape_, scalar_shape_});
+  auto inner_builder =
+      HloComputation::Builder(TestName() + "_inner_conditional");
+  auto param_cond = inner_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_param_shape, "param_cond"));
+  auto pred_cond = inner_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_bool_shape, param_cond, 0));
+  auto true_operand_cond = inner_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param_cond, 1));
+  auto false_operand_cond = inner_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param_cond, 2));
+  auto inner_conditional =
+      inner_builder.AddInstruction(HloInstruction::CreateConditional(
+          scalar_shape_, pred_cond, true_operand_cond, computation1,
+          false_operand_cond, computation2));
+  auto inner_conditional_computation =
+      module_->AddEmbeddedComputation(inner_builder.Build());
+
+  // Build entry computation.
+  auto builder = HloComputation::Builder(TestName());
+  auto pred1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
+  auto pred2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.1f)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.2f)));
+  auto constant3 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(3.3f)));
+  auto tuple_operand = builder.AddInstruction(
+      HloInstruction::CreateTuple({pred2, constant1, constant2}));
+  auto conditional = builder.AddInstruction(HloInstruction::CreateConditional(
+      scalar_shape_, pred1, tuple_operand, inner_conditional_computation,
+      constant3, computation3));
+  module_->AddEntryComputation(builder.Build());
+
+  const HloDataflowAnalysis& analysis = RunAnalysis(GetParam());
+
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(pred1));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(pred2));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(constant1));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(constant2));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(constant3));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(tuple_operand));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(computation1->root_instruction()));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(computation2->root_instruction()));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(computation3->root_instruction()));
+
+  auto computation1_param = computation1->parameter_instruction(0);
+  auto computation2_param = computation2->parameter_instruction(0);
+  auto computation3_param = computation3->parameter_instruction(0);
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(computation1_param));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(computation2_param));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(computation3_param));
+  EXPECT_EQ(analysis.GetUniqueValueAt(computation1_param),
+            analysis.GetValueDefinedAt(constant1));
+  EXPECT_EQ(analysis.GetUniqueValueAt(computation2_param),
+            analysis.GetValueDefinedAt(constant2));
+  EXPECT_EQ(analysis.GetUniqueValueAt(computation3_param),
+            analysis.GetValueDefinedAt(constant3));
+
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(param_cond));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(pred_cond));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(true_operand_cond));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(false_operand_cond));
+  EXPECT_EQ(analysis.GetUniqueValueAt(param_cond),
+            analysis.GetValueDefinedAt(tuple_operand));
+  EXPECT_EQ(analysis.GetUniqueValueAt(pred_cond),
+            analysis.GetValueDefinedAt(pred2));
+  EXPECT_EQ(analysis.GetUniqueValueAt(true_operand_cond),
+            analysis.GetValueDefinedAt(constant1));
+  EXPECT_EQ(analysis.GetUniqueValueAt(false_operand_cond),
+            analysis.GetValueDefinedAt(constant2));
+
+  EXPECT_EQ(analysis.values().size(), 9);
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(inner_conditional));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(conditional));
+  EXPECT_THAT(
+      HloValuesAt(inner_conditional),
+      UnorderedElementsAre(
+          analysis.GetValueDefinedAt(computation1->root_instruction()),
+          analysis.GetValueDefinedAt(computation2->root_instruction())));
+  EXPECT_THAT(
+      HloValuesAt(conditional),
+      UnorderedElementsAre(
+          analysis.GetValueDefinedAt(computation1->root_instruction()),
+          analysis.GetValueDefinedAt(computation2->root_instruction()),
+          analysis.GetValueDefinedAt(computation3->root_instruction())));
+}
+
 INSTANTIATE_TEST_CASE_P(HloDataflowAnalysisInstantiation,
                         HloDataflowAnalysisTest,
                         ::testing::Values(false, true));
diff --git a/tensorflow/compiler/xla/service/hlo_dce.cc b/tensorflow/compiler/xla/service/hlo_dce.cc
index a4921232f5848dbe1789c4c641e2b0ba3c1848bb..1e5f0f797a13fd7e7ce1cc934387a274a74153bc 100644
--- a/tensorflow/compiler/xla/service/hlo_dce.cc
+++ b/tensorflow/compiler/xla/service/hlo_dce.cc
@@ -37,6 +37,9 @@ namespace xla {
 StatusOr<bool> HloDCE::Run(HloModule* module) {
   bool changed = false;
 
+  VLOG(2) << "Before dce:";
+  XLA_VLOG_LINES(2, module->ToString());
+
   for (auto* computation : module->MakeNonfusionComputations()) {
     std::unordered_set<HloInstruction*> live_instructions;
     TF_RETURN_IF_ERROR(computation->root_instruction()->Accept(
@@ -52,12 +55,15 @@ StatusOr<bool> HloDCE::Run(HloModule* module) {
     for (auto* instruction : computation->instructions()) {
       if (instruction->user_count() == 0 &&
           live_instructions.count(instruction) == 0 &&
-          computation->IsRemovable(instruction)) {
+          computation->IsRemovable(instruction) &&
+          !instruction->HasSideEffect()) {
         dead_roots.push_back(instruction);
       }
     }
 
     for (HloInstruction* dead_root : dead_roots) {
+      VLOG(1) << "Removing dead root " << dead_root->ToString()
+              << " and it's unused operands";
       TF_RETURN_IF_ERROR(
           computation->RemoveInstructionAndUnusedOperands(dead_root));
       changed = true;
@@ -87,6 +93,9 @@ StatusOr<bool> HloDCE::Run(HloModule* module) {
     }
   }
 
+  VLOG(2) << "After dce:";
+  XLA_VLOG_LINES(2, module->ToString());
+
   return changed;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_dce_test.cc b/tensorflow/compiler/xla/service/hlo_dce_test.cc
index d54b9a27087a42fd23eab0bd06e8deaca567312b..5a56607a665c4cbeb7b2572f182b88e890602968 100644
--- a/tensorflow/compiler/xla/service/hlo_dce_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dce_test.cc
@@ -70,6 +70,26 @@ TEST_F(HloDceTest, NoDeadCode) {
   EXPECT_EQ(3, computation->instruction_count());
 }
 
+TEST_F(HloDceTest, InstructionsWithSideEffect) {
+  // Verify that side-effect instructions (Send in this test) are not removed.
+  auto builder = HloComputation::Builder(TestName());
+  auto constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
+  builder.AddInstruction(
+      HloInstruction::CreateSend(constant, /*channel_id=*/0));
+  builder.AddInstruction(HloInstruction::CreateTuple({}));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_EQ(3, computation->instruction_count());
+
+  HloDCE dce;
+  EXPECT_FALSE(dce.Run(module.get()).ValueOrDie());
+
+  EXPECT_EQ(3, computation->instruction_count());
+}
+
 TEST_F(HloDceTest, DeadParameters) {
   // Verify that dead parameters are not removed, but use of the dead parameters
   // are.
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1773bb401d380031f6c860d295e76d2f62c9e5ff
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
@@ -0,0 +1,137 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_element_type_converter.h"
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_query.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace xla {
+namespace {
+
+HloInstruction* ToElementType(HloInstruction* hlo, PrimitiveType type) {
+  if (hlo->shape().element_type() != type) {
+    Shape shape = ShapeUtil::ChangeElementType(hlo->shape(), type);
+    hlo = hlo->parent()->AddInstruction(
+        HloInstruction::CreateConvert(shape, hlo));
+  }
+  CHECK_EQ(hlo->shape().element_type(), type);
+  return hlo;
+}
+
+bool HasOperandType(HloInstruction* hlo, PrimitiveType type) {
+  for (HloInstruction* operand : hlo->operands()) {
+    if (operand->shape().element_type() == type) {
+      return true;
+    }
+  }
+  return false;
+}
+
+}  // namespace
+
+HloElementTypeConverter::HloElementTypeConverter(
+    PrimitiveType eliminate_type, PrimitiveType replace_with_type)
+    : eliminate_type_(eliminate_type), replace_with_type_(replace_with_type) {}
+
+StatusOr<bool> HloElementTypeConverter::Run(HloModule* module) {
+  XLA_VLOG_LINES(
+      3, "HloElementTypeConverter::Run(), before:\n" + module->ToString());
+  bool changed = false;
+  for (auto* computation : module->computations()) {
+    for (auto* hlo : computation->MakeInstructionPostOrder()) {
+      // These are ops where it does not make sense to convert them.
+      if (hlo->opcode() == HloOpcode::kParameter ||
+          hlo->opcode() == HloOpcode::kConstant ||
+          hlo->opcode() == HloOpcode::kTuple ||
+          hlo->opcode() == HloOpcode::kConvert ||
+          hlo->opcode() == HloOpcode::kGetTupleElement ||
+          hlo->opcode() == HloOpcode::kInfeed ||
+          hlo->opcode() == HloOpcode::kOutfeed) {
+        continue;
+      }
+
+      // We cannot change a CustomCall since we have no way of adjusting the
+      // called binary to expect the updated type.
+      if (hlo->opcode() == HloOpcode::kCustomCall) {
+        continue;
+      }
+
+      // These are ops with embedded computations where it suffices to convert
+      // the embedded computations instead of converting the ops themselves.
+      if (hlo->opcode() == HloOpcode::kWhile ||
+          hlo->opcode() == HloOpcode::kCall ||
+          hlo->opcode() == HloOpcode::kFusion ||
+          hlo->opcode() == HloOpcode::kMap ||
+          hlo->opcode() == HloOpcode::kReduce ||
+          hlo->opcode() == HloOpcode::kReduceWindow ||
+          hlo->opcode() == HloOpcode::kSelectAndScatter ||
+          hlo->opcode() == HloOpcode::kConditional) {
+        continue;
+      }
+      TF_RET_CHECK(hlo->called_computations().empty()) << hlo->ToString();
+
+      if (!HasOperandType(hlo, eliminate_type_)) {
+        // If this CHECK fires, then this was an instruction that does not take
+        // the elimination type as an operand but it does return it. This pass
+        // does not have a feature to change the output type in that case, so
+        // instead of silently failing to eliminate the type, it fails loudly.
+        TF_RET_CHECK(hlo->shape().element_type() != eliminate_type_);
+        continue;
+      }
+
+      std::vector<HloInstruction*> new_operands;
+      for (HloInstruction* operand : hlo->operands()) {
+        if (operand->shape().element_type() == eliminate_type_) {
+          operand = ToElementType(operand, replace_with_type_);
+        }
+        new_operands.push_back(operand);
+      }
+
+      HloInstruction* new_hlo;
+      if (hlo->shape().element_type() == eliminate_type_) {
+        Shape shape =
+            ShapeUtil::ChangeElementType(hlo->shape(), replace_with_type_);
+        new_hlo = computation->AddInstruction(
+            hlo->CloneWithNewOperands(shape, new_operands, hlo->GetModule()));
+        new_hlo = ToElementType(new_hlo, eliminate_type_);
+      } else {
+        new_hlo = computation->AddInstruction(hlo->CloneWithNewOperands(
+            hlo->shape(), new_operands, hlo->GetModule()));
+      }
+      TF_RETURN_IF_ERROR(computation->ReplaceInstruction(hlo, new_hlo));
+      changed = true;
+    }
+  }
+  XLA_VLOG_LINES(
+      2, "HloElementTypeConverter::Run(), after:\n" + module->ToString());
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter.h b/tensorflow/compiler/xla/service/hlo_element_type_converter.h
new file mode 100644
index 0000000000000000000000000000000000000000..2b109225d0b192e5c9e4f6d841377ffad8078dc2
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter.h
@@ -0,0 +1,49 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_ELEMENT_TYPE_CONVERTER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_ELEMENT_TYPE_CONVERTER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// A pass that eliminates certain element types as the input or output of ops by
+// inserting Convert ops. This allows a backend to support an element type while
+// only actually implementing the Convert op for that element type. This is
+// generally not the fastest approach, but it works.
+class HloElementTypeConverter : public HloPassInterface {
+ public:
+  // eliminate_type is the type to eliminate as the input or output of ops,
+  // using Convert ops to replace it with replace_with_type.
+  HloElementTypeConverter(PrimitiveType eliminate_type,
+                          PrimitiveType replace_with_type);
+
+  tensorflow::StringPiece name() const override {
+    return "element_type_converter";
+  }
+
+  // Returns the pass on the module and returns whether the module was modified.
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  PrimitiveType eliminate_type_;
+  PrimitiveType replace_with_type_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_ELEMENT_TYPE_CONVERTER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index a722d1b3d99462f7252c259f74dcef1dfa4967b7..173f0e2c42bed2ea461eef27d811e0a626c4fee3 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_query.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
@@ -167,11 +168,37 @@ StatusOr<std::unique_ptr<Literal>> ElementWiseUnaryOpImpl(
 
 }  // namespace
 
-template <typename ReturnT>
+template <typename ReturnT, typename ElementwiseT>
 class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
  public:
   explicit TypedVisitor(HloEvaluator* p) : parent_(p) {}
 
+  // The following higher-order functions convert a function with ElementwiseT
+  // to a function with ReturnT.
+  std::function<ReturnT(ReturnT)> ConvertUnaryFunction(
+      const std::function<ElementwiseT(ElementwiseT)>& unary_op) {
+    return [&unary_op](ReturnT arg) {
+      return static_cast<ReturnT>(unary_op(static_cast<ElementwiseT>(arg)));
+    };
+  }
+  std::function<ReturnT(ReturnT, ReturnT)> ConvertBinaryFunction(
+      const std::function<ElementwiseT(ElementwiseT, ElementwiseT)>&
+          binary_op) {
+    return [&binary_op](ReturnT arg1, ReturnT arg2) {
+      return static_cast<ReturnT>(binary_op(static_cast<ElementwiseT>(arg1),
+                                            static_cast<ElementwiseT>(arg2)));
+    };
+  }
+  std::function<ReturnT(ReturnT, ReturnT, ReturnT)> ConvertTernaryFunction(
+      const std::function<ElementwiseT(ElementwiseT, ElementwiseT,
+                                       ElementwiseT)>& ternary_op) {
+    return [&ternary_op](ReturnT arg1, ReturnT arg2, ReturnT arg3) {
+      return static_cast<ReturnT>(ternary_op(static_cast<ElementwiseT>(arg1),
+                                             static_cast<ElementwiseT>(arg2),
+                                             static_cast<ElementwiseT>(arg3)));
+    };
+  }
+
   Status DefaultAction(HloInstruction* hlo_instruction) override {
     return Unimplemented("unhandled HLO ops for HloEvaluator: %s.",
                          HloOpcodeString(hlo_instruction->opcode()).c_str());
@@ -197,24 +224,25 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
                               is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleAbs(HloInstruction* abs) {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[abs],
-                        ElementWiseUnaryOp(abs, [](NativeT elem_operand) {
+                        ElementWiseUnaryOp(abs, [](ElementwiseT elem_operand) {
                           return std::abs(elem_operand);
                         }));
     return Status::OK();
   }
 
   Status HandleAbs(HloInstruction* abs) override {
-    return HandleAbs<ReturnT>(abs);
+    return HandleAbs<ElementwiseT>(abs);
   }
 
   template <
       typename NativeT,
       typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleRound(HloInstruction* round) {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[round],
-                        ElementWiseUnaryOp(round, [](ReturnT elem_operand) {
-                          return std::round(elem_operand);
-                        }));
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[round],
+        ElementWiseUnaryOp(round, [](ElementwiseT elem_operand) {
+          return std::round(elem_operand);
+        }));
     return Status::OK();
   }
 
@@ -264,7 +292,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
       typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleCeil(HloInstruction* ceil) {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[ceil],
-                        ElementWiseUnaryOp(ceil, [](ReturnT elem_operand) {
+                        ElementWiseUnaryOp(ceil, [](ElementwiseT elem_operand) {
                           return std::ceil(elem_operand);
                         }));
     return Status::OK();
@@ -299,7 +327,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
 
   Status HandleExp(HloInstruction* exp) override {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[exp],
-                        ElementWiseUnaryOp(exp, [](ReturnT elem_operand) {
+                        ElementWiseUnaryOp(exp, [](ElementwiseT elem_operand) {
                           return std::exp(elem_operand);
                         }));
     return Status::OK();
@@ -309,10 +337,11 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
       typename NativeT,
       typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleFloor(HloInstruction* floor) {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[floor],
-                        ElementWiseUnaryOp(floor, [](ReturnT elem_operand) {
-                          return std::floor(elem_operand);
-                        }));
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[floor],
+        ElementWiseUnaryOp(floor, [](ElementwiseT elem_operand) {
+          return std::floor(elem_operand);
+        }));
     return Status::OK();
   }
 
@@ -329,18 +358,40 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
 
   Status HandleLog(HloInstruction* log) override {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[log],
-                        ElementWiseUnaryOp(log, [](ReturnT elem_operand) {
+                        ElementWiseUnaryOp(log, [](ElementwiseT elem_operand) {
                           return std::log(elem_operand);
                         }));
     return Status::OK();
   }
 
-  template <
-      typename NativeT,
-      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
+  template <typename NativeT,
+            typename std::enable_if<
+                std::is_integral<NativeT>::value &&
+                !std::is_same<NativeT, bool>::value>::type* = nullptr>
+  Status HandleNot(HloInstruction* not_) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[not_],
+                        ElementWiseUnaryOp(not_, [](ElementwiseT elem_operand) {
+                          return ~elem_operand;
+                        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT, typename std::enable_if<std::is_floating_point<
+                                  NativeT>::value>::type* = nullptr>
   Status HandleNot(HloInstruction* not_) {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[not_],
-                        ElementWiseUnaryOp(not_, [](ReturnT elem_operand) {
+                        ElementWiseUnaryOp(not_, [](ElementwiseT elem_operand) {
+                          return !elem_operand;
+                        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<std::is_same<NativeT, bool>::value>::type* =
+                nullptr>
+  Status HandleNot(HloInstruction* not_) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[not_],
+                        ElementWiseUnaryOp(not_, [](ElementwiseT elem_operand) {
                           return !elem_operand;
                         }));
     return Status::OK();
@@ -354,25 +405,47 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleNot(HloInstruction* not_) override {
-    return HandleNot<ReturnT>(not_);
+    return HandleNot<ElementwiseT>(not_);
   }
 
-  Status HandleNegate(HloInstruction* negate) override {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[negate],
-                        ElementWiseUnaryOp(negate, [](ReturnT elem_operand) {
-                          return -elem_operand;
-                        }));
+  template <typename NativeT,
+            typename std::enable_if<
+                std::is_signed<NativeT>::value &&
+                !std::is_floating_point<NativeT>::value>::type* = nullptr>
+  Status HandleNegate(HloInstruction* negate) {
+    using type = typename std::make_unsigned<NativeT>::type;
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[negate],
+        ElementWiseUnaryOp(negate, [](ElementwiseT elem_operand) {
+          return NativeT(-type(elem_operand));
+        }));
     return Status::OK();
   }
 
+  template <typename NativeT,
+            typename std::enable_if<
+                !std::is_signed<NativeT>::value ||
+                std::is_floating_point<NativeT>::value>::type* = nullptr>
+  Status HandleNegate(HloInstruction* negate) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[negate],
+        ElementWiseUnaryOp(
+            negate, [](ElementwiseT elem_operand) { return -elem_operand; }));
+    return Status::OK();
+  }
+
+  Status HandleNegate(HloInstruction* negate) override {
+    return HandleNegate<ReturnT>(negate);
+  }
+
   template <
       typename NativeT,
       typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleSign(HloInstruction* sign) {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[sign],
-                        ElementWiseUnaryOp(sign, [](ReturnT elem_operand) {
-                          return (ReturnT(0) < elem_operand) -
-                                 (elem_operand < ReturnT(0));
+                        ElementWiseUnaryOp(sign, [](ElementwiseT elem_operand) {
+                          return (ElementwiseT(0) < elem_operand) -
+                                 (elem_operand < ElementwiseT(0));
                         }));
     return Status::OK();
   }
@@ -382,9 +455,9 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleSign(HloInstruction* sign) {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[sign],
-                        ElementWiseUnaryOp(sign, [](ReturnT elem_operand) {
+                        ElementWiseUnaryOp(sign, [](ElementwiseT elem_operand) {
                           auto abs_val = std::abs(elem_operand);
-                          return 0 == abs_val ? ReturnT(0)
+                          return 0 == abs_val ? ElementwiseT(0)
                                               : elem_operand / abs_val;
                         }));
     return Status::OK();
@@ -396,45 +469,71 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
 
   Status HandleTanh(HloInstruction* tanh) override {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[tanh],
-                        ElementWiseUnaryOp(tanh, [](ReturnT elem_operand) {
+                        ElementWiseUnaryOp(tanh, [](ElementwiseT elem_operand) {
                           return std::tanh(elem_operand);
                         }));
     return Status::OK();
   }
 
-  Status HandleMultiply(HloInstruction* multiply) override {
+  template <typename NativeT,
+            typename std::enable_if<
+                std::is_signed<NativeT>::value &&
+                !std::is_floating_point<NativeT>::value>::type* = nullptr>
+  Status HandleMultiply(HloInstruction* multiply) {
+    using type = typename std::make_unsigned<NativeT>::type;
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[multiply],
-        ElementWiseBinaryOp(multiply, [](ReturnT lhs_elem, ReturnT rhs_elem) {
-          return lhs_elem * rhs_elem;
-        }));
+        ElementWiseBinaryOp(multiply,
+                            [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) {
+                              return NativeT(type(lhs_elem) * type(rhs_elem));
+                            }));
     return Status::OK();
   }
 
+  template <
+      typename NativeT,
+      typename std::enable_if<std::is_unsigned<NativeT>::value ||
+                              std::is_floating_point<NativeT>::value ||
+                              is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleMultiply(HloInstruction* multiply) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[multiply],
+        ElementWiseBinaryOp(multiply,
+                            [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) {
+                              return lhs_elem * rhs_elem;
+                            }));
+    return Status::OK();
+  }
+
+  Status HandleMultiply(HloInstruction* multiply) override {
+    return HandleMultiply<ElementwiseT>(multiply);
+  }
+
   Status HandleSubtract(HloInstruction* subtract) override {
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[subtract],
-        ElementWiseBinaryOp(subtract, [](ReturnT lhs_elem, ReturnT rhs_elem) {
-          return lhs_elem - rhs_elem;
-        }));
+        ElementWiseBinaryOp(subtract,
+                            [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) {
+                              return lhs_elem - rhs_elem;
+                            }));
     return Status::OK();
   }
 
   Status HandleAdd(HloInstruction* add) override {
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[add],
-        ElementWiseBinaryOp(add, [](ReturnT lhs_elem, ReturnT rhs_elem) {
-          return lhs_elem + rhs_elem;
-        }));
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[add],
+                        ElementWiseBinaryOp(add, [](ElementwiseT lhs_elem,
+                                                    ElementwiseT rhs_elem) {
+                          return lhs_elem + rhs_elem;
+                        }));
     return Status::OK();
   }
 
   Status HandleDivide(HloInstruction* divide) override {
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[divide],
-        ElementWiseBinaryOp(divide, [](ReturnT lhs_elem, ReturnT rhs_elem) {
-          return lhs_elem / rhs_elem;
-        }));
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[divide],
+                        ElementWiseBinaryOp(divide, [](ElementwiseT lhs_elem,
+                                                       ElementwiseT rhs_elem) {
+                          return lhs_elem / rhs_elem;
+                        }));
     return Status::OK();
   }
 
@@ -444,7 +543,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   Status HandleMaximum(HloInstruction* maximum) {
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[maximum],
-        ElementWiseBinaryOp(maximum, [](ReturnT lhs, ReturnT rhs) {
+        ElementWiseBinaryOp(maximum, [](ElementwiseT lhs, ElementwiseT rhs) {
           return std::fmax(lhs, rhs);
         }));
     return Status::OK();
@@ -458,18 +557,18 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleMaximum(HloInstruction* maximum) override {
-    return HandleMaximum<ReturnT>(maximum);
+    return HandleMaximum<ElementwiseT>(maximum);
   }
 
   template <
       typename NativeT,
       typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleMinimum(HloInstruction* minimum) {
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[minimum],
-        ElementWiseBinaryOp(minimum, [](ReturnT lhs_el, ReturnT rhs_el) {
-          return std::fmin(lhs_el, rhs_el);
-        }));
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[minimum],
+                        ElementWiseBinaryOp(minimum, [](ElementwiseT lhs_el,
+                                                        ElementwiseT rhs_el) {
+                          return std::fmin(lhs_el, rhs_el);
+                        }));
     return Status::OK();
   }
 
@@ -481,15 +580,15 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleMinimum(HloInstruction* minimum) override {
-    return HandleMinimum<ReturnT>(minimum);
+    return HandleMinimum<ElementwiseT>(minimum);
   }
 
   Status HandlePower(HloInstruction* power) override {
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[power],
-        ElementWiseBinaryOp(power, [](ReturnT lhs_el, ReturnT rhs_el) {
-          return std::pow(lhs_el, rhs_el);
-        }));
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[power],
+                        ElementWiseBinaryOp(power, [](ElementwiseT lhs_el,
+                                                      ElementwiseT rhs_el) {
+                          return std::pow(lhs_el, rhs_el);
+                        }));
     return Status::OK();
   }
 
@@ -497,11 +596,11 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
       typename NativeT,
       typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleRemainder(HloInstruction* remainder) {
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[remainder],
-        ElementWiseBinaryOp(remainder, [](ReturnT lhs_el, ReturnT rhs_el) {
-          return std::fmod(lhs_el, rhs_el);
-        }));
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[remainder],
+                        ElementWiseBinaryOp(remainder, [](ElementwiseT lhs_el,
+                                                          ElementwiseT rhs_el) {
+                          return std::fmod(lhs_el, rhs_el);
+                        }));
     return Status::OK();
   }
 
@@ -513,16 +612,27 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleRemainder(HloInstruction* remainder) override {
-    return HandleRemainder<ReturnT>(remainder);
+    return HandleRemainder<ElementwiseT>(remainder);
   }
 
-  template <
-      typename NativeT,
-      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
+  template <typename NativeT,
+            typename std::enable_if<std::is_integral<NativeT>::value>::type* =
+                nullptr>
   Status HandleAnd(HloInstruction* and_) {
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[and_],
-        ElementWiseBinaryOp(and_, [](ReturnT lhs_el, ReturnT rhs_el) {
+        ElementWiseBinaryOp(and_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) {
+          return lhs_el & rhs_el;
+        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT, typename std::enable_if<std::is_floating_point<
+                                  NativeT>::value>::type* = nullptr>
+  Status HandleAnd(HloInstruction* and_) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[and_],
+        ElementWiseBinaryOp(and_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) {
           return lhs_el && rhs_el;
         }));
     return Status::OK();
@@ -536,16 +646,27 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleAnd(HloInstruction* and_) override {
-    return HandleAnd<ReturnT>(and_);
+    return HandleAnd<ElementwiseT>(and_);
   }
 
-  template <
-      typename NativeT,
-      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
+  template <typename NativeT,
+            typename std::enable_if<std::is_integral<NativeT>::value>::type* =
+                nullptr>
   Status HandleOr(HloInstruction* or_) {
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[or_],
-        ElementWiseBinaryOp(or_, [](ReturnT lhs_el, ReturnT rhs_el) {
+        ElementWiseBinaryOp(or_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) {
+          return lhs_el | rhs_el;
+        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT, typename std::enable_if<std::is_floating_point<
+                                  NativeT>::value>::type* = nullptr>
+  Status HandleOr(HloInstruction* or_) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[or_],
+        ElementWiseBinaryOp(or_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) {
           return lhs_el || rhs_el;
         }));
     return Status::OK();
@@ -559,7 +680,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleOr(HloInstruction* or_) override {
-    return HandleOr<ReturnT>(or_);
+    return HandleOr<ElementwiseT>(or_);
   }
 
   template <typename NativeT,
@@ -584,7 +705,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleShiftLeft(HloInstruction* shl) override {
-    return HandleShiftLeft<ReturnT>(shl);
+    return HandleShiftLeft<ElementwiseT>(shl);
   }
   template <typename NativeT,
             typename std::enable_if<
@@ -610,7 +731,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleShiftRightArithmetic(HloInstruction* shra) override {
-    return HandleShiftRightArithmetic<ReturnT>(shra);
+    return HandleShiftRightArithmetic<ElementwiseT>(shra);
   }
 
   template <typename NativeT,
@@ -637,19 +758,21 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleShiftRightLogical(HloInstruction* shrl) override {
-    return HandleShiftRightLogical<ReturnT>(shrl);
+    return HandleShiftRightLogical<ElementwiseT>(shrl);
   }
 
   template <
       typename NativeT,
       typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleClamp(HloInstruction* clamp) {
-    std::function<ReturnT(ReturnT, ReturnT, ReturnT)> clamp_op =
-        [](ReturnT low, ReturnT high, ReturnT value) {
+    std::function<ElementwiseT(ElementwiseT, ElementwiseT, ElementwiseT)>
+        clamp_op = [](ElementwiseT low, ElementwiseT value, ElementwiseT high) {
           return std::fmax(low, std::fmin(value, high));
         };
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[clamp],
-                        ElementWiseTernaryOp(clamp, std::move(clamp_op)));
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[clamp],
+        ElementwiseTernaryOp(clamp,
+                             std::move(ConvertTernaryFunction(clamp_op))));
     return Status::OK();
   }
 
@@ -661,7 +784,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleClamp(HloInstruction* clamp) override {
-    return HandleClamp<ReturnT>(clamp);
+    return HandleClamp<ElementwiseT>(clamp);
   }
 
   Status HandleSelect(HloInstruction* select) override {
@@ -674,7 +797,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
           return on_false;
         };
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[select],
-                        ElementWiseTernaryOp(select, std::move(select_op)));
+                        ElementwiseTernaryOp(select, std::move(select_op)));
     return Status::OK();
   }
 
@@ -724,7 +847,8 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     CHECK(ShapeUtil::SameElementType(lhs_shape, result_shape));
 
     const auto& dnums = conv->convolution_dimension_numbers();
-    const int64 num_spatial_dims = dnums.spatial_dimensions_size();
+    const int64 num_spatial_dims = dnums.output_spatial_dimensions_size();
+    CHECK_EQ(num_spatial_dims, dnums.input_spatial_dimensions_size());
     CHECK_EQ(num_spatial_dims, dnums.kernel_spatial_dimensions_size());
     CHECK_GE(num_spatial_dims, 0);
     CHECK_EQ(window.dimensions_size(), num_spatial_dims);
@@ -771,7 +895,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     DimensionVector rhs_spatial_index(dnums.kernel_spatial_dimensions_size());
 
     auto func = [&](tensorflow::gtl::ArraySlice<int64> out_index) {
-      ReturnT result_val = static_cast<ReturnT>(0);
+      ElementwiseT result_val = static_cast<ElementwiseT>(0);
 
       std::fill(lhs_index.begin(), lhs_index.end(), 0);
       std::fill(rhs_index.begin(), rhs_index.end(), 0);
@@ -789,13 +913,15 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
           // Find corresponding spatial dimension index for input (lhs).
           for (int64 ki = 0; ki < rhs_spatial_index.size(); ++ki) {
             // Spatial dimension number for input (lhs) and output.
-            const int64 spatial_dim = dnums.spatial_dimensions(ki);
+            const int64 input_spatial_dim = dnums.input_spatial_dimensions(ki);
+            const int64 output_spatial_dim =
+                dnums.output_spatial_dimensions(ki);
 
             // Calculate lhs (input) index without taking base dilation into
             // account.
             const auto& window_dim = window.dimensions(ki);
             const int64 undilated_index =
-                out_index[spatial_dim] * window_dim.stride() -
+                out_index[output_spatial_dim] * window_dim.stride() -
                 window_dim.padding_low() +
                 rhs_spatial_index[ki] * window_dim.window_dilation();
             // Skip if the lhs (input) index is to be dilated.
@@ -804,26 +930,30 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
             }
 
             // Calculate the actual lhs (input) index after dilation.
-            lhs_index[spatial_dim] =
+            lhs_index[input_spatial_dim] =
                 undilated_index / window_dim.base_dilation();
 
             // Skip if input index is not in bound.
-            if (!(lhs_index[spatial_dim] >= 0 &&
-                  lhs_index[spatial_dim] < lhs_shape.dimensions(spatial_dim))) {
+            if (!(lhs_index[input_spatial_dim] >= 0 &&
+                  lhs_index[input_spatial_dim] <
+                      lhs_shape.dimensions(input_spatial_dim))) {
               goto cnt;
             }
 
             rhs_index[dnums.kernel_spatial_dimensions(ki)] =
-                rhs_spatial_index[ki];
+                window_dim.window_reversal()
+                    ? ((window_dim.size() - 1) - rhs_spatial_index[ki])
+                    : rhs_spatial_index[ki];
           }
 
-          result_val += lhs_literal.Get<ReturnT>(lhs_index) *
-                        rhs_literal.Get<ReturnT>(rhs_index);
+          result_val +=
+              static_cast<ElementwiseT>(lhs_literal.Get<ReturnT>(lhs_index)) *
+              static_cast<ElementwiseT>(rhs_literal.Get<ReturnT>(rhs_index));
         }
-      cnt:;
+      cnt : {}
       } while (IndexUtil::BumpIndices(window_shape, &rhs_spatial_index));
 
-      return result_val;
+      return static_cast<ReturnT>(result_val);
     };
 
     auto result = Literal::CreateFromShape(result_shape);
@@ -873,7 +1003,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     auto result = Literal::CreateFromShape(dot->shape());
     TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
         [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
-          ReturnT result_val = static_cast<ReturnT>(0);
+          ElementwiseT result_val = static_cast<ElementwiseT>(0);
 
           std::vector<int64> lhs_index(lhs_rank, 0);
           std::vector<int64> rhs_index(rhs_rank, 0);
@@ -890,11 +1020,12 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
             lhs_index[lhs_contracted_dimension] = i;
             rhs_index[rhs_contracted_dimension] = i;
 
-            result_val += lhs_literal.Get<ReturnT>(lhs_index) *
-                          rhs_literal.Get<ReturnT>(rhs_index);
+            result_val +=
+                static_cast<ElementwiseT>(lhs_literal.Get<ReturnT>(lhs_index)) *
+                static_cast<ElementwiseT>(rhs_literal.Get<ReturnT>(rhs_index));
           }
 
-          return result_val;
+          return static_cast<ReturnT>(result_val);
         }));
 
     parent_->evaluated_[dot] = std::move(result);
@@ -1080,6 +1211,97 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  template <typename NativeT>
+  StatusOr<std::unique_ptr<Literal>> MapImpl(HloInstruction* map) {
+    auto operands = map->operands();
+    HloComputation* computation = map->to_apply();
+
+    auto result = Literal::CreateFromShape(map->shape());
+
+    HloEvaluator embedded_evaluator;
+    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
+        [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+          std::vector<std::unique_ptr<Literal>> arg_literals;
+          arg_literals.reserve(operands.size());
+
+          // Construct scalar literal parameters to be passed to the map
+          // computation.
+          for (auto operand : operands) {
+            const Literal& arg_literal =
+                parent_->GetEvaluatedLiteralFor(operand);
+
+            auto curr_val = arg_literal.Get<NativeT>(multi_index);
+            auto curr_val_literal = Literal::CreateR0<NativeT>(curr_val);
+
+            arg_literals.push_back(std::move(curr_val_literal));
+          }
+
+          std::unique_ptr<Literal> computed_result =
+              embedded_evaluator
+                  .Evaluate<std::unique_ptr<Literal>>(*computation,
+                                                      arg_literals)
+                  .ConsumeValueOrDie();
+          // Clear visit states so that the we can use the evaluate again on
+          // the same computation.
+          embedded_evaluator.ResetVisitStates();
+
+          return computed_result->Get<ReturnT>({});
+        }));
+    return std::move(result);
+  }
+
+  Status HandleMap(HloInstruction* map) override {
+    switch (map->operand(0)->shape().element_type()) {
+      case PRED: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<bool>(map));
+        break;
+      }
+      case U8: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<uint8>(map));
+        break;
+      }
+      case U32: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<uint32>(map));
+        break;
+      }
+      case U64: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<uint64>(map));
+        break;
+      }
+      case S8: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<int8>(map));
+        break;
+      }
+      case S32: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<int32>(map));
+        break;
+      }
+      case S64: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<int64>(map));
+        break;
+      }
+      case F32: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<float>(map));
+        break;
+      }
+      case F64: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<double>(map));
+        break;
+      }
+      case C64: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<complex64>(map));
+        break;
+      }
+      default:
+        LOG(FATAL) << "HandleMap: unhandled primitive type for "
+                      "input operand: "
+                   << PrimitiveType_Name(
+                          map->operand(0)->shape().element_type());
+    }
+
+    return Status::OK();
+  }
+
   Status HandleReduce(HloInstruction* reduce) override {
     auto arg = reduce->operand(0);
     auto init_value = reduce->operand(1);
@@ -1126,6 +1348,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
       }
     }
 
+    HloEvaluator embedded_evaluator;
     // For each resulting dimension, calculate and assign computed value.
     TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
         [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
@@ -1145,13 +1368,12 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
             std::vector<const Literal*> args = {curr_val_literal.get(),
                                                 result_val_literal.get()};
 
-            // We need a new visitor for each evaluation, so that the same
-            // computation can be visited more than once (with different
-            // inputs).
-            HloEvaluator embedded_evaluator;
             std::unique_ptr<Literal> computed_result =
-                embedded_evaluator.Evaluate(*function, args)
+                embedded_evaluator.Evaluate<const Literal*>(*function, args)
                     .ConsumeValueOrDie();
+            // Clear visit states so that the we can use the evaluate again on
+            // the same computation.
+            embedded_evaluator.ResetVisitStates();
 
             // Assign computed result to result_val.
             result_val = computed_result->Get<ReturnT>({});
@@ -1208,6 +1430,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     DimensionVector window_index(window.dimensions_size());
     DimensionVector operand_index(ShapeUtil::Rank(operand_literal.shape()));
 
+    HloEvaluator embedded_evaluator;
     // For each resulting dimension, calculate and assign computed value.
     TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
         [&](tensorflow::gtl::ArraySlice<int64> output_index) {
@@ -1239,14 +1462,14 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
                 Literal::CreateR0<ReturnT>(result_val);
             const std::vector<const Literal*> args = {curr_val_literal.get(),
                                                       result_val_literal.get()};
-            // We need a new visitor for each evaluation, so that the same
-            // computation can be visited more than once (with different
-            // inputs).
-            HloEvaluator embedded_evaluator;
             std::unique_ptr<Literal> computed_result =
-                embedded_evaluator.Evaluate(*function, args)
+                embedded_evaluator.Evaluate<const Literal*>(*function, args)
                     .ConsumeValueOrDie();
 
+            // Clear visit states so that the we can use the evaluate again on
+            // the same computation.
+            embedded_evaluator.ResetVisitStates();
+
             result_val = computed_result->Get<ReturnT>({});
           } while (IndexUtil::BumpIndices(window_shape, &window_index));
 
@@ -1287,6 +1510,50 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  template <typename NativeT, typename std::enable_if<std::is_floating_point<
+                                  NativeT>::value>::type* = nullptr>
+  Status HandleSin(HloInstruction* sin) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[sin],
+                        ElementWiseUnaryOp(sin, [](ElementwiseT elem_operand) {
+                          return std::sin(elem_operand);
+                        }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<std::is_integral<NativeT>::value ||
+                              is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleSin(HloInstruction* sin) {
+    return InvalidArgument("Unsupported type for Sin");
+  }
+
+  Status HandleSin(HloInstruction* sin) override {
+    return HandleSin<ElementwiseT>(sin);
+  }
+
+  template <typename NativeT, typename std::enable_if<std::is_floating_point<
+                                  NativeT>::value>::type* = nullptr>
+  Status HandleCos(HloInstruction* cos) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[cos],
+                        ElementWiseUnaryOp(cos, [](ElementwiseT elem_operand) {
+                          return std::cos(elem_operand);
+                        }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<std::is_integral<NativeT>::value ||
+                              is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleCos(HloInstruction* cos) {
+    return InvalidArgument("Unsupported type for Cos");
+  }
+
+  Status HandleCos(HloInstruction* cos) override {
+    return HandleCos<ElementwiseT>(cos);
+  }
+
  private:
   template <typename IndexT>
   StatusOr<std::unique_ptr<Literal>> DynamicSlice(
@@ -1349,22 +1616,27 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
 
   StatusOr<std::unique_ptr<Literal>> ElementWiseUnaryOp(
       HloInstruction* instruction,
-      const std::function<ReturnT(ReturnT)>& unary_op) {
+      const std::function<ElementwiseT(ElementwiseT)>& unary_op) {
     const Literal& operand_literal =
         parent_->GetEvaluatedLiteralFor(instruction->operand(0));
-    return ElementWiseUnaryOpImpl<ReturnT, ReturnT>(instruction, unary_op,
-                                                    operand_literal);
+    TF_ASSIGN_OR_RETURN(
+        auto result_literal,
+        (ElementWiseUnaryOpImpl<ReturnT, ReturnT>(
+            instruction, ConvertUnaryFunction(unary_op), operand_literal)));
+
+    return std::move(result_literal);
   }
 
   StatusOr<std::unique_ptr<Literal>> ElementWiseBinaryOp(
       HloInstruction* instruction,
-      const std::function<ReturnT(ReturnT, ReturnT)>& binary_op) {
+      const std::function<ElementwiseT(ElementwiseT, ElementwiseT)>&
+          binary_op) {
     const auto shape = instruction->shape();
     const auto* lhs = instruction->operand(0);
     const auto* rhs = instruction->operand(1);
 
-    // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast is
-    // removed.
+    // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast
+    // is removed.
     if (!(ShapeUtil::SameDimensions(shape, rhs->shape()) &&
           ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()))) {
       return Unimplemented(
@@ -1382,14 +1654,15 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
 
     TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
         [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
-          return binary_op(lhs_literal.Get<ReturnT>(multi_index),
-                           rhs_literal.Get<ReturnT>(multi_index));
+          return ConvertBinaryFunction(binary_op)(
+              lhs_literal.Get<ReturnT>(multi_index),
+              rhs_literal.Get<ReturnT>(multi_index));
         }));
     return std::move(result);
   }
 
   template <typename LhsType, typename RhsType, typename EhsType>
-  StatusOr<std::unique_ptr<Literal>> ElementWiseTernaryOp(
+  StatusOr<std::unique_ptr<Literal>> ElementwiseTernaryOp(
       HloInstruction* instruction,
       const std::function<ReturnT(LhsType, RhsType, EhsType)>& ternary_op) {
     const auto shape = instruction->shape();
@@ -1397,8 +1670,8 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     const auto* rhs = instruction->operand(1);
     const auto* ehs = instruction->operand(2);
 
-    // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast is
-    // removed.
+    // TODO(b/35950897, b/27796129): add DCHECK back once implicit
+    // broadcast is removed.
     if (!(ShapeUtil::SameDimensions(shape, lhs->shape()) &&
           ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()) &&
           ShapeUtil::SameDimensions(rhs->shape(), ehs->shape()))) {
@@ -1451,9 +1724,11 @@ HloEvaluator::HloEvaluator() {
   typed_visitors_[F64] = MakeUnique<TypedVisitor<double>>(this);
   typed_visitors_[C64] = MakeUnique<TypedVisitor<complex64>>(this);
 
-  typed_visitors_[BF16] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
-    return Unimplemented("HloEvaluator: unhandled primitive type: BF16.");
-  });
+  // Most of the evaluator computations we use don't support BF16 (e.g.,
+  // std::ceil, std::tanh). To make evaluator work with BF16, we set all
+  // elementwise computations to be done in F32 and do BF16<->F32 conversion
+  // around the input and the output of the computations.
+  typed_visitors_[BF16] = MakeUnique<TypedVisitor<bfloat16, float>>(this);
   typed_visitors_[TUPLE] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
     return Unimplemented("HloEvaluator: unhandled primitive type: TUPLE.");
   });
@@ -1462,13 +1737,17 @@ HloEvaluator::HloEvaluator() {
   });
 }
 
+template <typename LiteralPtr>
 StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate(
     const HloModule& module,
-    tensorflow::gtl::ArraySlice<const Literal*> arg_literals) {
+    tensorflow::gtl::ArraySlice<LiteralPtr> arg_literals) {
   XLA_VLOG_LINES(2, "HloEvaluator::Evaluate module:\n" + module.ToString());
 
-  arg_literals_ = arg_literals;
   evaluated_.clear();
+  arg_literals_.clear();
+  for (const auto& literal_ptr : arg_literals) {
+    arg_literals_.push_back(&*literal_ptr);
+  }
 
   TF_RETURN_IF_ERROR(module.entry_computation()->Accept(this));
 
@@ -1476,27 +1755,36 @@ StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate(
       GetEvaluatedLiteralFor(module.entry_computation()->root_instruction()));
 }
 
+template <typename LiteralPtr>
 StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate(
     const HloComputation& computation,
-    tensorflow::gtl::ArraySlice<const Literal*> arg_literals) {
+    tensorflow::gtl::ArraySlice<LiteralPtr> arg_literals) {
   XLA_VLOG_LINES(
       2, "HloEvaluator::Evaluate computation:\n" + computation.ToString());
-  arg_literals_ = arg_literals;
+
   evaluated_.clear();
+  arg_literals_.clear();
+  for (const auto& literal_ptr : arg_literals) {
+    arg_literals_.push_back(&*literal_ptr);
+  }
 
   TF_RETURN_IF_ERROR(computation.Accept(this));
   return MakeUnique<Literal>(
       GetEvaluatedLiteralFor(computation.root_instruction()));
 }
 
+template <typename LiteralPtr>
 StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate(
     HloInstruction* instruction,
-    tensorflow::gtl::ArraySlice<const Literal*> operands) {
+    tensorflow::gtl::ArraySlice<LiteralPtr> arg_literals) {
   TF_RET_CHECK(hlo_query::AllOperandsAreParametersOrConstants(*instruction));
   TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(instruction->shape()));
 
-  arg_literals_ = operands;
   evaluated_.clear();
+  arg_literals_.clear();
+  for (const auto& literal_ptr : arg_literals) {
+    arg_literals_.push_back(&*literal_ptr);
+  }
 
   // Evaluate operands of Parameter type against the input literals which
   // caches the evaluated literal results.
@@ -1565,6 +1853,7 @@ StatusOr<std::unique_ptr<Literal>> HloEvaluator::EvaluateWithSubstitutions(
   }
 
   std::vector<HloInstruction*> operands;
+  operands.reserve(owned_operands.size());
   for (auto& operand : owned_operands) {
     operands.push_back(operand.get());
   }
@@ -1583,9 +1872,13 @@ StatusOr<std::unique_ptr<Literal>> HloEvaluator::EvaluateWithSubstitutions(
 }
 
 Status HloEvaluator::HandleParameter(HloInstruction* parameter) {
+  CHECK_LT(parameter->parameter_number(), arg_literals_.size());
   const Literal* input_literal = arg_literals_[parameter->parameter_number()];
   VLOG(2) << "Parameter evaluated to: " << input_literal->ToString();
-  DCHECK(ShapeUtil::Equal(parameter->shape(), input_literal->shape()));
+  DCHECK(ShapeUtil::Equal(parameter->shape(), input_literal->shape()))
+      << "parameter shape is: " << ShapeUtil::HumanString(parameter->shape())
+      << ", but input literal shape is: "
+      << ShapeUtil::HumanString(input_literal->shape());
 
   evaluated_[parameter] = MakeUnique<Literal>(*input_literal);
   return Status::OK();
@@ -1610,8 +1903,8 @@ Status HloEvaluator::HandleTranspose(HloInstruction* transpose) {
 Status HloEvaluator::HandleConcatenate(HloInstruction* concatenate) {
   tensorflow::gtl::ArraySlice<HloInstruction*> operands(
       concatenate->operands());
-  // The result concatenate dimension is going to be the sum of all concatenate
-  // dimensions of the operands taking part of the operation.
+  // The result concatenate dimension is going to be the sum of all
+  // concatenate dimensions of the operands taking part of the operation.
   const Shape& reference_shape = operands[0]->shape();
   CHECK(!ShapeUtil::IsTuple(reference_shape));
   const int64 rank = ShapeUtil::Rank(reference_shape);
@@ -1821,4 +2114,30 @@ Status HloEvaluator::Postprocess(HloInstruction* hlo) {
   return Status::OK();
 }
 
+// Explicit instantiation of templatized Evaluate* methods.
+//
+template StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate<
+    const Literal*>(const HloModule& module,
+                    tensorflow::gtl::ArraySlice<const Literal*> arg_literals);
+template StatusOr<std::unique_ptr<Literal>>
+HloEvaluator::Evaluate<std::unique_ptr<Literal>>(
+    const HloModule& module,
+    tensorflow::gtl::ArraySlice<std::unique_ptr<Literal>> arg_literals);
+
+template StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate<
+    const Literal*>(const HloComputation& computation,
+                    tensorflow::gtl::ArraySlice<const Literal*> arg_literals);
+template StatusOr<std::unique_ptr<Literal>>
+HloEvaluator::Evaluate<std::unique_ptr<Literal>>(
+    const HloComputation& computation,
+    tensorflow::gtl::ArraySlice<std::unique_ptr<Literal>> arg_literals);
+
+template StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate<
+    const Literal*>(HloInstruction* instruction,
+                    tensorflow::gtl::ArraySlice<const Literal*> arg_literals);
+template StatusOr<std::unique_ptr<Literal>>
+HloEvaluator::Evaluate<std::unique_ptr<Literal>>(
+    HloInstruction* instruction,
+    tensorflow::gtl::ArraySlice<std::unique_ptr<Literal>> arg_literals);
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h
index 7557aaa2484d184555411a79d8dce2c9241427b0..02bb8b0a47065c359603a113f49626bf3ad344d8 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.h
@@ -42,9 +42,12 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   // Precondition: The indices of arg_literals correspond to the parameter
   // numbers of the HLO parameters in the computation. See comment below for an
   // example.
+  // `LiteralPtr` accepts either std::unique_ptr<Literal> or const Literal*
+  // type.
+  template <typename LiteralPtr>
   StatusOr<std::unique_ptr<Literal>> Evaluate(
       const HloModule& module,
-      tensorflow::gtl::ArraySlice<const Literal*> arg_literals);
+      tensorflow::gtl::ArraySlice<LiteralPtr> arg_literals);
 
   // Evaluates an HLO computation and an array of pointers to literals.
   // Returns the evaluated result as a literal if successful.
@@ -62,9 +65,12 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   // where Parameter0 has parameter_number 0 and Parameter1 has parameter_number
   // 1 in this computation. The input literals array will then have its first
   // literal map to Parameter0 and the second map to Parameter1.
+  // `LiteralPtr` accepts either std::unique_ptr<Literal> or const Literal*
+  // type.
+  template <typename LiteralPtr>
   StatusOr<std::unique_ptr<Literal>> Evaluate(
       const HloComputation& computation,
-      tensorflow::gtl::ArraySlice<const Literal*> arg_literals);
+      tensorflow::gtl::ArraySlice<LiteralPtr> arg_literals);
 
   // Evaluates a single HLO instruction and an array of pointers to literals.
   // Return the evaluated result as literal if successful.
@@ -72,10 +78,12 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   // 1. argument literals correspond to the input instruction's parameters in
   // their post-ordering.
   // 2. the instruction's operands must be of either Parameter or Constant type.
-  // TODO(b/35950897): implement more ops other than element-wise ops.
+  // `LiteralPtr` accepts either std::unique_ptr<Literal> or const Literal*
+  // type.
+  template <typename LiteralPtr>
   StatusOr<std::unique_ptr<Literal>> Evaluate(
       HloInstruction* instruction,
-      tensorflow::gtl::ArraySlice<const Literal*> arg_literals);
+      tensorflow::gtl::ArraySlice<LiteralPtr> arg_literals);
 
   // Evaluates a single HLO instruction with constant operands.
   // Returns the evaluated result as literal if successful.
@@ -100,12 +108,16 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
  protected:
   // Templated DfsHloVisitor. Typically ReturnT here indicates the resulting
   // literal type of each evaluated Handle* method of a TypedVisitor.
-  // There are however a few notable exceptions to this is rule, notably:
+  // There are however a few notable exceptions to this rule, notably:
   // - HandleCompare and HandleIsFinite: where the resulting literal type is
   // always boolean.
   // These operations are handled outside of the parent HloEvaluator handlers
   // instead of from within TypedVisitor.
-  template <typename ReturnT>
+  //
+  // Type params:
+  //   - ReturnT: The type of input and output of each operation.
+  //   - ElementwiseT: The type in which internal computation are done.
+  template <typename ReturnT, typename ElementwiseT = ReturnT>
   class TypedVisitor;
 
   // Wraps around instruction handling to infer types before dispatching to
@@ -134,6 +146,7 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   Status HandleIsFinite(HloInstruction* is_finite) override;
 
   Status HandleCompare(HloInstruction* compare) override;
+
   Status HandleTuple(HloInstruction* tuple) override;
 
   Status HandleGetTupleElement(HloInstruction* get_tuple_element) override;
@@ -167,13 +180,15 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   // TODO(b/35950897): have better memory management here to free instructions
   // that are no longer a parent for any other subsequent instruction in
   // post-orderring.
+  // Must be cleared for each evaluation.
   tensorflow::gtl::FlatMap<const HloInstruction*, std::unique_ptr<Literal>>
       evaluated_;
 
-  // Stores input literals, assuming they are in post-order. Literals are not
-  // owned by this class, and they must outlive the lifetime of the instance of
-  // this class.
-  tensorflow::gtl::ArraySlice<const Literal*> arg_literals_;
+  // Caches pointers to input literals, assuming they are in post-order.
+  // Literals are not owned by this class, and they must outlive the lifetime of
+  // each invocation to the Evaluate* method.
+  // Must be cleared for each evaluation.
+  std::vector<const Literal*> arg_literals_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(HloEvaluator);
 };
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index 85477af6fe26f53504c07204348566c16a24392c..97697d06b73e606351ab8dff638483aa7d844bfc 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -25,8 +25,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_element_type_converter.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
@@ -35,46 +37,124 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 namespace {
 
-class HloEvaluatorTest : public HloVerifiedTestBase {
+static std::array<bool, 2> use_bf16_params{true, false};
+
+class HloEvaluatorTest : public ::testing::WithParamInterface<bool>,
+                         public HloVerifiedTestBase {
  protected:
-  HloEvaluatorTest() { evaluator_ = MakeUnique<HloEvaluator>(); }
+  HloEvaluatorTest() : use_bfloat16_(GetParam()) {
+    evaluator_ = MakeUnique<HloEvaluator>();
+  }
+
+  std::unique_ptr<Literal> Evaluate(
+      tensorflow::gtl::ArraySlice<const Literal*> arg_literals = {}) {
+    if (use_bfloat16_) {
+      // In BF16 mode, we convert all F32 type to BF16 and evaluate the module.
+      auto type_converter = HloElementTypeConverter(F32, BF16);
+      type_converter.Run(&module()).ValueOrDie();
+    }
+    return evaluator_->Evaluate(*module().entry_computation(), arg_literals)
+        .ConsumeValueOrDie();
+  }
 
   std::unique_ptr<HloEvaluator> evaluator_;
+
+  void TestUnaryOp(HloOpcode opcode, std::unique_ptr<Literal> expected,
+                   std::unique_ptr<Literal> input, float aabs = 0) {
+    HloComputation::Builder b(TestName());
+    auto c1 =
+        b.AddInstruction(HloInstruction::CreateConstant(std::move(input)));
+    b.AddInstruction(
+        HloInstruction::CreateUnary(expected->shape(), opcode, c1));
+    module().AddEntryComputation(b.Build());
+
+    std::unique_ptr<Literal> result = Evaluate();
+
+    auto element_type = expected->shape().element_type();
+    if (element_type == F32 || element_type == F64) {
+      ErrorSpec error(aabs);
+      LiteralTestUtil::ExpectNear(*expected, *result, error);
+    } else {
+      LiteralTestUtil::ExpectEqual(*expected, *result);
+    }
+  }
+
+  void TestBinaryOp(HloOpcode opcode, std::unique_ptr<Literal> expected,
+                    std::unique_ptr<Literal> lhs,
+                    std::unique_ptr<Literal> rhs) {
+    HloComputation::Builder b(TestName());
+    auto c1 = b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs)));
+    auto c2 = b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs)));
+    b.AddInstruction(
+        HloInstruction::CreateBinary(expected->shape(), opcode, c1, c2));
+    module().AddEntryComputation(b.Build());
+
+    std::unique_ptr<Literal> result = Evaluate();
+
+    LiteralTestUtil::ExpectEqual(*expected, *result);
+  }
+
+  bool use_bfloat16_;
 };
 
+#define XLA_TYPED_TEST_P(test_case_name, test_name, test_type1) \
+  TEST_P(test_case_name, test_name)
+
 // Verifies that HloEvaluator evaluates a HLO instruction that performs clamp
 // with 3 operands.
-TEST_F(HloEvaluatorTest, DoesClamp) {
+TEST_P(HloEvaluatorTest, DoesClamp) {
   auto low = Literal::CreateR2<float>({{0.f, 2.f}, {2.f, 4.f}});
-  auto high = Literal::CreateR2<float>({{2.f, 4.f}, {4.f, 4.f}});
   auto value = Literal::CreateR2<float>({{0.f, 5.f}, {0.f, 4.f}});
+  auto high = Literal::CreateR2<float>({{2.f, 4.f}, {4.f, 4.f}});
 
   Shape shape = low->shape();
   HloComputation::Builder b(TestName());
   auto c1 = b.AddInstruction(HloInstruction::CreateConstant(std::move(low)));
-  auto c2 = b.AddInstruction(HloInstruction::CreateConstant(std::move(high)));
-  auto c3 = b.AddInstruction(HloInstruction::CreateConstant(std::move(value)));
-  auto instruction = b.AddInstruction(
+  auto c2 = b.AddInstruction(HloInstruction::CreateConstant(std::move(value)));
+  auto c3 = b.AddInstruction(HloInstruction::CreateConstant(std::move(high)));
+  b.AddInstruction(
       HloInstruction::CreateTernary(shape, HloOpcode::kClamp, c1, c2, c3));
   module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(instruction, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR2<float>({{0, 4}, {2, 4}});
 
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
+TEST_P(HloEvaluatorTest, DISABLED_DoesClampSpecialBroadcast) {
+  auto low = Literal::CreateR0<float>(0.f);
+  auto value = Literal::CreateR2<float>({{-1.f, 0.f}, {1.f, 2.f}});
+  auto high = Literal::CreateR0<float>(1.f);
+
+  Shape shape = value->shape();
+  HloComputation::Builder b(TestName());
+  auto c1 = b.AddInstruction(HloInstruction::CreateConstant(std::move(low)));
+  auto c2 = b.AddInstruction(HloInstruction::CreateConstant(std::move(value)));
+  auto c3 = b.AddInstruction(HloInstruction::CreateConstant(std::move(high)));
+  b.AddInstruction(
+      HloInstruction::CreateTernary(shape, HloOpcode::kClamp, c1, c2, c3));
+  module().AddEntryComputation(b.Build());
+
+  std::unique_ptr<Literal> result = Evaluate();
+
+  auto expected = Literal::CreateR2<float>({{0, 0}, {1, 1}});
+
+  LiteralTestUtil::ExpectEqual(*expected, *result);
+}
+
 // Verifies that HloEvaluator evaluates a HLO instruction that performs select
 // with 3 operands.
-TEST_F(HloEvaluatorTest, DoesSelect) {
+TEST_P(HloEvaluatorTest, DoesSelect) {
   auto pred = Literal::CreateR2<bool>({{true, false}, {false, true}});
   auto on_true = Literal::CreateR2<float>({{2.f, 4.f}, {4.f, 4.f}});
   auto on_false = Literal::CreateR2<float>({{0.f, 5.f}, {0.f, 4.f}});
@@ -86,12 +166,11 @@ TEST_F(HloEvaluatorTest, DoesSelect) {
       b.AddInstruction(HloInstruction::CreateConstant(std::move(on_true)));
   auto c3 =
       b.AddInstruction(HloInstruction::CreateConstant(std::move(on_false)));
-  auto instruction = b.AddInstruction(
+  b.AddInstruction(
       HloInstruction::CreateTernary(shape, HloOpcode::kSelect, c1, c2, c3));
   module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(instruction, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate({});
 
   auto expected = Literal::CreateR2<float>({{2, 5}, {0, 4}});
 
@@ -100,126 +179,108 @@ TEST_F(HloEvaluatorTest, DoesSelect) {
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise addition with 2 operands.
-TEST_F(HloEvaluatorTest, DoesAdd) {
+TEST_P(HloEvaluatorTest, DoesAdd) {
   auto lhs = Literal::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = Literal::CreateR2<int64>({{2, 4}, {4, 4}});
-
-  Shape shape = ShapeUtil::MakeShape(S64, {2, 2});
-  HloComputation::Builder b(TestName());
-  auto c1 = b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs)));
-  auto c2 = b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs)));
-  auto instruction = b.AddInstruction(
-      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, c1, c2));
-  module().AddEntryComputation(b.Build());
-
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(instruction, {}).ConsumeValueOrDie();
-
   auto expected = Literal::CreateR2<int64>({{3, 4}, {-96, 8}});
-
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  TestBinaryOp(HloOpcode::kAdd, std::move(expected), std::move(lhs),
+               std::move(rhs));
+}
+// Verifies that HloEvaluator evaluates a HLO instruction that performs
+// element-wise and with 2 operands.
+TEST_P(HloEvaluatorTest, DoesAnd) {
+  auto lhs = Literal::CreateR2<int64>({{1, 0}, {-100, 4}});
+  auto rhs = Literal::CreateR2<int64>({{2, 4}, {4, 4}});
+  auto expected = Literal::CreateR2<int64>({{0, 0}, {4, 4}});
+  TestBinaryOp(HloOpcode::kAnd, std::move(expected), std::move(lhs),
+               std::move(rhs));
+}
+// Verifies that HloEvaluator evaluates a HLO instruction that performs
+// element-wise or with 2 operands.
+TEST_P(HloEvaluatorTest, DoesOr) {
+  auto lhs = Literal::CreateR2<int64>({{1, 0}, {-100, 4}});
+  auto rhs = Literal::CreateR2<int64>({{2, 4}, {4, 4}});
+  auto expected = Literal::CreateR2<int64>({{3, 4}, {-100, 4}});
+  TestBinaryOp(HloOpcode::kOr, std::move(expected), std::move(lhs),
+               std::move(rhs));
+}
+// Verifies that HloEvaluator evaluates a HLO instruction that performs
+// element-wise multiply with 2 operands.
+TEST_P(HloEvaluatorTest, DoesMultiply) {
+  auto lhs = Literal::CreateR2<int32>({{-1, 0}, {-100, 4}});
+  auto rhs = Literal::CreateR2<int32>(
+      {{std::numeric_limits<int32>::min(), 4}, {4, 4}});
+  auto expected = Literal::CreateR2<int32>(
+      {{std::numeric_limits<int32>::min(), 0}, {-400, 16}});
+  TestBinaryOp(HloOpcode::kMultiply, std::move(expected), std::move(lhs),
+               std::move(rhs));
 }
-
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise divide with 2 operands.
-TEST_F(HloEvaluatorTest, DoesDivideInt64) {
-  auto lhs_s64 = Literal::CreateR2<int64>({{1, 0}, {-100, 4}});
-  auto rhs_s64 = Literal::CreateR2<int64>({{2, 4}, {4, 4}});
-
-  Shape shape_s64 = ShapeUtil::MakeShape(S64, {2, 2});
-  HloComputation::Builder b(TestName());
-  auto c1_s64 =
-      b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_s64)));
-  auto c2_s64 =
-      b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_s64)));
-  auto instruction = b.AddInstruction(HloInstruction::CreateBinary(
-      shape_s64, HloOpcode::kDivide, c1_s64, c2_s64));
-  module().AddEntryComputation(b.Build());
-
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(instruction, {}).ConsumeValueOrDie();
-
+TEST_P(HloEvaluatorTest, DoesDivideInt64) {
+  auto lhs = Literal::CreateR2<int64>({{1, 0}, {-100, 4}});
+  auto rhs = Literal::CreateR2<int64>({{2, 4}, {4, 4}});
   auto expected = Literal::CreateR2<int64>({{0, 0}, {-25, 1}});
-
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  TestBinaryOp(HloOpcode::kDivide, std::move(expected), std::move(lhs),
+               std::move(rhs));
 }
-TEST_F(HloEvaluatorTest, DoesDivideDouble) {
-  auto lhs_f64 = Literal::CreateR2<double>({{1.0, 0.0}, {-100.0, 4.0}});
-  auto rhs_f64 = Literal::CreateR2<double>({{2.2, 4.0}, {4.0, 4.0}});
-
-  Shape shape_f64 = ShapeUtil::MakeShape(F64, {2, 2});
-  HloComputation::Builder b(TestName());
-  auto c1_f64 =
-      b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_f64)));
-  auto c2_f64 =
-      b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_f64)));
-  auto instruction = b.AddInstruction(HloInstruction::CreateBinary(
-      shape_f64, HloOpcode::kDivide, c1_f64, c2_f64));
-  module().AddEntryComputation(b.Build());
-
-  auto result = evaluator_->Evaluate(instruction, {}).ConsumeValueOrDie();
-
+TEST_P(HloEvaluatorTest, DoesDivideDouble) {
+  auto lhs = Literal::CreateR2<double>({{1.0, 0.0}, {-100.0, 4.0}});
+  auto rhs = Literal::CreateR2<double>({{2.2, 4.0}, {4.0, 4.0}});
   auto expected =
       Literal::CreateR2<double>({{0.45454545454545453, 0}, {-25, 1}});
-
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  TestBinaryOp(HloOpcode::kDivide, std::move(expected), std::move(lhs),
+               std::move(rhs));
 }
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise abs op with 1 operand.
-TEST_F(HloEvaluatorTest, DoesAbsR2) {
+TEST_P(HloEvaluatorTest, DoesAbsR2) {
   auto operand = Literal::CreateR2<int64>({{1, -20}, {-100, 4}});
-  const Shape& shape = ShapeUtil::MakeShape(S64, {2, 2});
-  HloComputation::Builder b(TestName());
-  auto c1 =
-      b.AddInstruction(HloInstruction::CreateConstant(std::move(operand)));
-  auto instruction =
-      b.AddInstruction(HloInstruction::CreateUnary(shape, HloOpcode::kAbs, c1));
-  module().AddEntryComputation(b.Build());
-
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(instruction, {}).ConsumeValueOrDie();
-
   auto expected = Literal::CreateR2<int64>({{1, 20}, {100, 4}});
-
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  TestUnaryOp(HloOpcode::kAbs, std::move(expected), std::move(operand));
 }
-TEST_F(HloEvaluatorTest, DoesAbsR0) {
-  // For R0 literal.
-  const Shape& r0 = ShapeUtil::MakeShape(F32, {});
+TEST_P(HloEvaluatorTest, DoesAbsR0) {
   auto operand = Literal::CreateR0<float>(-1.0f);
-  HloComputation::Builder b(TestName());
-  auto c1 =
-      b.AddInstruction(HloInstruction::CreateConstant(std::move(operand)));
-  auto instruction =
-      b.AddInstruction(HloInstruction::CreateUnary(r0, HloOpcode::kAbs, c1));
-  module().AddEntryComputation(b.Build());
-
-  auto result = evaluator_->Evaluate(instruction).ConsumeValueOrDie();
   auto expected = Literal::CreateR0<float>(1.0f);
-
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  TestUnaryOp(HloOpcode::kAbs, std::move(expected), std::move(operand));
 }
-TEST_F(HloEvaluatorTest, DoesAbsR1WithZeroSize) {
-  // For R1 literal with dimension of size 0.
-  Shape empty_r1 = ShapeUtil::MakeShape(F32, {0});
+TEST_P(HloEvaluatorTest, DoesAbsR1WithZeroSize) {
   auto operand = Literal::CreateR1<float>({});
-  HloComputation::Builder b(TestName());
-  auto c1 =
-      b.AddInstruction(HloInstruction::CreateConstant(std::move(operand)));
-  auto instruction = b.AddInstruction(
-      HloInstruction::CreateUnary(empty_r1, HloOpcode::kAbs, c1));
-  module().AddEntryComputation(b.Build());
-
-  auto result = evaluator_->Evaluate(instruction).ConsumeValueOrDie();
   auto expected = Literal::CreateR1<float>({});
-
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  TestUnaryOp(HloOpcode::kAbs, std::move(expected), std::move(operand));
+}
+TEST_P(HloEvaluatorTest, DoesNegateR2) {
+  auto operand = Literal::CreateR2<int32>(
+      {{0, std::numeric_limits<int32>::min()}, {-1, 4}});
+  auto expected =
+      Literal::CreateR2<int32>({{0, std::numeric_limits<int>::min()}, {1, -4}});
+  TestUnaryOp(HloOpcode::kNegate, std::move(expected), std::move(operand));
+}
+TEST_P(HloEvaluatorTest, DoesCosR2) {
+  auto operand = Literal::CreateR2<float>({{0, M_PI}, {-M_PI, 2 * M_PI}});
+  auto expected = Literal::CreateR2<float>({{1, -1}, {-1, 1}});
+  TestUnaryOp(HloOpcode::kCos, std::move(expected), std::move(operand),
+              use_bfloat16_ ? 0x1.0P-5 : 0x1.0P-20);
+}
+TEST_P(HloEvaluatorTest, DoesSinR2) {
+  auto operand = Literal::CreateR2<float>({{0, M_PI}, {-M_PI, 2 * M_PI}});
+  auto expected = Literal::CreateR2<float>({{0, 0}, {0, 0}});
+  TestUnaryOp(HloOpcode::kSin, std::move(expected), std::move(operand),
+              use_bfloat16_ ? 0x1.0P-5 : 0x1.0P-20);
+}
+TEST_P(HloEvaluatorTest, DoesNotR2) {
+  auto operand =
+      Literal::CreateR2<int32>({{0, std::numeric_limits<int>::min()},
+                                {-1, std::numeric_limits<int>::max()}});
+  auto expected =
+      Literal::CreateR2<int32>({{-1, std::numeric_limits<int>::max()},
+                                {0, std::numeric_limits<int>::min()}});
+  TestUnaryOp(HloOpcode::kNot, std::move(expected), std::move(operand));
 }
-
 // Verifies that HloEvaluator evaluates a HLO Computation with non-parameter nor
 // constant operands.
-TEST_F(HloEvaluatorTest, DoesTraverseInstructions) {
+TEST_P(HloEvaluatorTest, DoesTraverseInstructions) {
   auto lhs = Literal::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = Literal::CreateR2<int64>({{2, 4}, {4, 4}});
   auto rhs2 = Literal::CreateR2<int64>({{1, -20}, {-100, 4}});
@@ -239,10 +300,9 @@ TEST_F(HloEvaluatorTest, DoesTraverseInstructions) {
       b.AddInstruction(HloInstruction::CreateParameter(2, shape, "rhs2"));
   b.AddInstruction(HloInstruction::CreateBinary(shape, HloOpcode::kAdd,
                                                 lhs_instruction, param_rhs2));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, args).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate(args);
 
   auto expected = Literal::CreateR2<int64>({{4, -16}, {-196, 12}});
 
@@ -250,7 +310,7 @@ TEST_F(HloEvaluatorTest, DoesTraverseInstructions) {
 }
 
 // Verifies Reshape operation is correctly evaluated.
-TEST_F(HloEvaluatorTest, DoesReshape) {
+TEST_P(HloEvaluatorTest, DoesReshape) {
   HloComputation::Builder b(TestName());
   const int64 dimensions[] = {11, 8, 7, 5, 9};
   TF_ASSERT_OK_AND_ASSIGN(auto literal,
@@ -264,21 +324,20 @@ TEST_F(HloEvaluatorTest, DoesReshape) {
   const int64 permutation[] = {1, 2, 0, 4, 3};
   b.AddInstruction(
       HloInstruction::CreateTranspose(shape, literal_instruction, permutation));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate({});
 
   using NativeT = typename primitive_util::PrimitiveTypeToNative<F32>::type;
   result->EachCell<NativeT>(
       [&](tensorflow::gtl::ArraySlice<int64> indices, NativeT value) {
         std::vector<int64> rindexes = Permute(permutation, indices);
-        EXPECT_TRUE(value == literal_clone->Get<NativeT>(rindexes));
+        EXPECT_NEAR(value, literal_clone->Get<NativeT>(rindexes), 0x1.0P-5);
       });
 }
 
 // Verifies Broadcast operation is correctly evaluated.
-TEST_F(HloEvaluatorTest, DoesBroadcast) {
+TEST_P(HloEvaluatorTest, DoesBroadcast) {
   HloComputation::Builder b(TestName());
   auto input_literal = Literal::CreateR2<int32>({{1, 2}, {3, 4}, {5, 6}});
   auto output_literal = Literal::CreateR3<int32>(
@@ -287,15 +346,14 @@ TEST_F(HloEvaluatorTest, DoesBroadcast) {
       HloInstruction::CreateConstant(std::move(input_literal)));
   b.AddInstruction(HloInstruction::CreateBroadcast(
       output_literal->shape(), literal_instruction, {1, 2}));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate({});
 
   LiteralTestUtil::ExpectEqual(*result, *output_literal);
 }
 
-TEST_F(HloEvaluatorTest, DoesBroadcastScalar) {
+TEST_P(HloEvaluatorTest, DoesBroadcastScalar) {
   HloComputation::Builder b(TestName());
   auto input_literal = Literal::CreateR0<int32>(111);
   auto output_literal = Literal::CreateR2<int32>(
@@ -307,15 +365,14 @@ TEST_F(HloEvaluatorTest, DoesBroadcastScalar) {
   b.AddInstruction(HloInstruction::CreateBroadcast(
       output_literal->shape(), literal_instruction,
       /*broadcast_dimensions=*/{}));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate({});
 
   LiteralTestUtil::ExpectEqual(*result, *output_literal);
 }
 
-TEST_F(HloEvaluatorTest, DoesConcatenateSimple) {
+TEST_P(HloEvaluatorTest, DoesConcatenateSimple) {
   HloComputation::Builder b(TestName());
 
   HloInstruction* operand1 = b.AddInstruction(HloInstruction::CreateConstant(
@@ -328,17 +385,16 @@ TEST_F(HloEvaluatorTest, DoesConcatenateSimple) {
   Shape shape = ShapeUtil::MakeShape(S64, {4, 2});
   b.AddInstruction(HloInstruction::CreateConcatenate(shape, operands, 0));
 
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected =
       Literal::CreateR2<int64>({{-1, -2}, {100, 200}, {-2, -3}, {-100, -200}});
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, ConcatenateHandlesShapeWithZeroElement) {
+TEST_P(HloEvaluatorTest, ConcatenateHandlesShapeWithZeroElement) {
   HloComputation::Builder b(TestName());
 
   HloInstruction* operand1 = b.AddInstruction(
@@ -351,16 +407,15 @@ TEST_F(HloEvaluatorTest, ConcatenateHandlesShapeWithZeroElement) {
   Shape shape = ShapeUtil::MakeShape(S64, {2});
   b.AddInstruction(HloInstruction::CreateConcatenate(shape, operands, 0));
 
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR1<int64>({100, 200});
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, ConvertWithSameLayout) {
+TEST_P(HloEvaluatorTest, ConvertWithSameLayout) {
   HloComputation::Builder b(TestName());
 
   auto input_literal = Literal::CreateR2<int32>({{1, 2}, {3, 4}, {5, 6}});
@@ -372,15 +427,14 @@ TEST_F(HloEvaluatorTest, ConvertWithSameLayout) {
   HloInstruction* constant = b.AddInstruction(
       HloInstruction::CreateConstant(std::move(input_literal)));
   b.AddInstruction(HloInstruction::CreateConvert(expected->shape(), constant));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   LiteralTestUtil::ExpectEqual(*result, *expected);
 }
 
-TEST_F(HloEvaluatorTest, ConvertWithDifferentLayout) {
+TEST_P(HloEvaluatorTest, ConvertWithDifferentLayout) {
   HloComputation::Builder b(TestName());
 
   auto input_literal = Literal::CreateR2WithLayout<int32>(
@@ -393,10 +447,9 @@ TEST_F(HloEvaluatorTest, ConvertWithDifferentLayout) {
   HloInstruction* constant = b.AddInstruction(
       HloInstruction::CreateConstant(std::move(input_literal)));
   b.AddInstruction(HloInstruction::CreateConvert(expected->shape(), constant));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   LiteralTestUtil::ExpectEqual(*result, *expected);
 }
@@ -414,7 +467,7 @@ PaddingConfig CreatePaddingConfig(
   return padding_config;
 }
 
-TEST_F(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) {
+TEST_P(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) {
   auto operand = Literal::CreateR2<int32>({{}, {}});
   HloComputation::Builder b(TestName());
   auto operand_instruction =
@@ -427,11 +480,11 @@ TEST_F(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) {
 
   auto padding_config = CreatePaddingConfig({{{1, 0, 2}}, {{0, 2, 1}}});
   Shape shape = ShapeUtil::MakeShape(S32, {5, 2});
-  auto pad_instruction = b.AddInstruction(HloInstruction::CreatePad(
+  b.AddInstruction(HloInstruction::CreatePad(
       shape, operand_instruction, padding_value_instruction, padding_config));
   module().AddEntryComputation(b.Build());
 
-  auto result = evaluator_->Evaluate(pad_instruction).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR2<int32>(
       {{10, 10}, {10, 10}, {10, 10}, {10, 10}, {10, 10}});
@@ -439,7 +492,7 @@ TEST_F(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) {
+TEST_P(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) {
   HloComputation::Builder b(TestName());
 
   Array4D<float> input_array(3, 2, 1, 1, {1, 2, 3, 4, 5, 6});
@@ -456,10 +509,9 @@ TEST_F(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) {
       CreatePaddingConfig({{{1, 0, 2}}, {{0, 2, 1}}, {{0, 0, 0}}, {{0, 0, 0}}});
   b.AddInstruction(HloInstruction::CreatePad(
       shape, input_instruction, pad_instruction, r4_padding_on_dim0_dim1));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected_array = MakeUnique<Array4D<float>>(8, 5, 1, 1);
   expected_array->Fill(kPadValue);
@@ -475,7 +527,7 @@ TEST_F(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, NegativePadding2D) {
+TEST_P(HloEvaluatorTest, NegativePadding2D) {
   HloComputation::Builder b(TestName());
 
   // input_array:
@@ -501,10 +553,9 @@ TEST_F(HloEvaluatorTest, NegativePadding2D) {
                                              pad_value_instruction,
                                              r2_padding_on_dim0_dim1));
 
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   // f32[1,5] { 7.0, 2.718, 2.718, 2.718, 2.718 }
   auto expected_array = MakeUnique<Array2D<float>>(1, 5);
@@ -515,10 +566,10 @@ TEST_F(HloEvaluatorTest, NegativePadding2D) {
   (*expected_array)(0, 4) = 2.718f;
   auto expected = Literal::CreateR2FromArray2D<float>(*expected_array);
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  LiteralTestUtil::ExpectNear(*expected, *result, ErrorSpec(0x1.0P-5));
 }
 
-TEST_F(HloEvaluatorTest, NegativeAndInteriorPadding2D) {
+TEST_P(HloEvaluatorTest, NegativeAndInteriorPadding2D) {
   HloComputation::Builder b(TestName());
 
   // f32[4,3] {
@@ -547,10 +598,9 @@ TEST_F(HloEvaluatorTest, NegativeAndInteriorPadding2D) {
                                              pad_value_instruction,
                                              r2_padding_on_dim0_dim1));
 
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected_array = MakeUnique<Array2D<float>>(0, 9);
   auto expected = Literal::CreateR2FromArray2D<float>(*expected_array);
@@ -558,7 +608,7 @@ TEST_F(HloEvaluatorTest, NegativeAndInteriorPadding2D) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, DotRank2AndRank1) {
+TEST_P(HloEvaluatorTest, DotRank2AndRank1) {
   HloComputation::Builder b(TestName());
 
   // lhs:
@@ -581,12 +631,14 @@ TEST_F(HloEvaluatorTest, DotRank2AndRank1) {
       b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal)));
 
   Shape shape = ShapeUtil::MakeShape(F32, {4, 2});
-  b.AddInstruction(HloInstruction::CreateBinary(
-      shape, HloOpcode::kDot, lhs_instruction, rhs_instruction));
-  auto computation = module().AddEntryComputation(b.Build());
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  b.AddInstruction(HloInstruction::CreateDot(shape, lhs_instruction,
+                                             rhs_instruction, dot_dnums));
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   // clang-format off
   auto expected_array = Array2D<float>({
@@ -601,7 +653,7 @@ TEST_F(HloEvaluatorTest, DotRank2AndRank1) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, DotRank1AndRank2) {
+TEST_P(HloEvaluatorTest, DotRank1AndRank2) {
   HloComputation::Builder b(TestName());
 
   // lhs:
@@ -624,19 +676,21 @@ TEST_F(HloEvaluatorTest, DotRank1AndRank2) {
       b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal)));
 
   Shape shape = ShapeUtil::MakeShape(F32, {2});
-  b.AddInstruction(HloInstruction::CreateBinary(
-      shape, HloOpcode::kDot, lhs_instruction, rhs_instruction));
-  auto computation = module().AddEntryComputation(b.Build());
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(0);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  b.AddInstruction(HloInstruction::CreateDot(shape, lhs_instruction,
+                                             rhs_instruction, dot_dnums));
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR1<float>({22.f, 28.f});
 
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, DotRank2AndRank2) {
+TEST_P(HloEvaluatorTest, DotRank2AndRank2) {
   HloComputation::Builder b(TestName());
 
   // lhs:
@@ -665,12 +719,14 @@ TEST_F(HloEvaluatorTest, DotRank2AndRank2) {
       b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal)));
 
   Shape shape = ShapeUtil::MakeShape(F32, {4, 2});
-  b.AddInstruction(HloInstruction::CreateBinary(
-      shape, HloOpcode::kDot, lhs_instruction, rhs_instruction));
-  auto computation = module().AddEntryComputation(b.Build());
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  b.AddInstruction(HloInstruction::CreateDot(shape, lhs_instruction,
+                                             rhs_instruction, dot_dnums));
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected_array = Array2D<float>({
       {22.f, 28.f},
@@ -683,7 +739,7 @@ TEST_F(HloEvaluatorTest, DotRank2AndRank2) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, SimpleConv1D) {
+TEST_P(HloEvaluatorTest, SimpleConv1D) {
   HloComputation::Builder b(TestName());
 
   Array3D<float> lhs_array = {{{1, 2, 3}}};
@@ -711,7 +767,8 @@ TEST_F(HloEvaluatorTest, SimpleConv1D) {
   dnums.set_output_batch_dimension(0);
   dnums.set_input_feature_dimension(1);
   dnums.set_output_feature_dimension(1);
-  dnums.add_spatial_dimensions(2);
+  dnums.add_input_spatial_dimensions(2);
+  dnums.add_output_spatial_dimensions(2);
 
   dnums.set_kernel_output_feature_dimension(0);
   dnums.set_kernel_input_feature_dimension(1);
@@ -720,10 +777,9 @@ TEST_F(HloEvaluatorTest, SimpleConv1D) {
   const Shape& shape = ShapeUtil::MakeShape(F32, {1, 1, 3});
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, window, dnums));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   Array3D<float> expected_array = {{{11.f, 18.f, 9.f}}};
   auto expected = Literal::CreateR3FromArray3D<float>(expected_array);
@@ -731,7 +787,7 @@ TEST_F(HloEvaluatorTest, SimpleConv1D) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) {
+TEST_P(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) {
   HloComputation::Builder b(TestName());
 
   Array4D<float> lhs_array(1, 1, 4, 4);
@@ -775,10 +831,9 @@ TEST_F(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) {
   const Shape& shape = ShapeUtil::MakeShape(F32, {1, 1, 4, 4});
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, window, dnums));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   Array4D<float> expected_array(1, 1, 4, 4);
   // clang-format off
@@ -794,7 +849,7 @@ TEST_F(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, Conv2DGeneralDimensions) {
+TEST_P(HloEvaluatorTest, Conv2DGeneralDimensionsReversed) {
   HloComputation::Builder b(TestName());
 
   // clang-format off
@@ -826,6 +881,8 @@ TEST_F(HloEvaluatorTest, Conv2DGeneralDimensions) {
   auto rhs_literal = Literal::CreateR4FromArray4D<float>(weight);
   HloInstruction* rhs_instruction =
       b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal)));
+  rhs_instruction = b.AddInstruction(HloInstruction::CreateReverse(
+      rhs_instruction->shape(), rhs_instruction, {3, 1}));
 
   Window window;
   WindowDimension dim;
@@ -835,6 +892,7 @@ TEST_F(HloEvaluatorTest, Conv2DGeneralDimensions) {
   dim.set_padding_high(0);
   dim.set_window_dilation(1);
   dim.set_base_dilation(1);
+  dim.set_window_reversal(true);
   *window.add_dimensions() = dim;
   *window.add_dimensions() = dim;
 
@@ -843,8 +901,10 @@ TEST_F(HloEvaluatorTest, Conv2DGeneralDimensions) {
   dnums.set_output_batch_dimension(2);
   dnums.set_input_feature_dimension(0);
   dnums.set_output_feature_dimension(0);
-  dnums.add_spatial_dimensions(1);
-  dnums.add_spatial_dimensions(3);
+  dnums.add_input_spatial_dimensions(1);
+  dnums.add_output_spatial_dimensions(1);
+  dnums.add_input_spatial_dimensions(3);
+  dnums.add_output_spatial_dimensions(3);
 
   dnums.set_kernel_output_feature_dimension(0);
   dnums.set_kernel_input_feature_dimension(2);
@@ -854,21 +914,99 @@ TEST_F(HloEvaluatorTest, Conv2DGeneralDimensions) {
   const Shape& shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 2});
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, window, dnums));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   // clang-format off
   // Result dimensions: [feature=1, height=1, batch=1, width=2]
   Array4D<float> expected_array({{{{2514, 2685}}}});
+  Array4D<float> expected_array_bf16({{{{2512, 2672}}}});
   // clang-format on
-  auto expected = Literal::CreateR4FromArray4D<float>(expected_array);
+  auto expected = Literal::CreateR4FromArray4D<float>(
+      use_bfloat16_ ? expected_array_bf16 : expected_array);
 
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) {
+TEST_P(HloEvaluatorTest, Conv2DGeneralDimensions) {
+  HloComputation::Builder b(TestName());
+
+  // clang-format off
+  // Input dimensions: [feature=2, height=3, batch=1, width=4]
+  Array4D<float> input({
+    {{{1, 2, 3, 4}},
+     {{5, 6, 7, 8}},
+     {{9, 10, 11, 12}}},
+    {{{13, 14, 15, 16}},
+     {{17, 18, 19, 20}},
+     {{21, 22, 23, 24}}}
+  });
+  // Weight dimensions:
+  // [kernel_output_feature=1, width=3, kernel_input_feature=2, height=3]
+  Array4D<float> weight({{
+    {{1, 7, 13},
+     {4, 10, 16}},
+    {{2, 8, 14},
+     {5, 11, 17}},
+    {{3, 9, 15},
+     {6, 12, 18}}
+  }});
+  // clang-format on
+
+  auto lhs_literal = Literal::CreateR4FromArray4D<float>(input);
+  HloInstruction* lhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal)));
+
+  auto rhs_literal = Literal::CreateR4FromArray4D<float>(weight);
+  HloInstruction* rhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal)));
+
+  Window window;
+  WindowDimension dim;
+  dim.set_size(3);
+  dim.set_stride(1);
+  dim.set_padding_low(0);
+  dim.set_padding_high(0);
+  dim.set_window_dilation(1);
+  dim.set_base_dilation(1);
+  *window.add_dimensions() = dim;
+  *window.add_dimensions() = dim;
+
+  ConvolutionDimensionNumbers dnums;
+  dnums.set_input_batch_dimension(2);
+  dnums.set_output_batch_dimension(2);
+  dnums.set_input_feature_dimension(0);
+  dnums.set_output_feature_dimension(0);
+  dnums.add_input_spatial_dimensions(1);
+  dnums.add_output_spatial_dimensions(1);
+  dnums.add_input_spatial_dimensions(3);
+  dnums.add_output_spatial_dimensions(3);
+
+  dnums.set_kernel_output_feature_dimension(0);
+  dnums.set_kernel_input_feature_dimension(2);
+  dnums.add_kernel_spatial_dimensions(3);
+  dnums.add_kernel_spatial_dimensions(1);
+
+  const Shape& shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 2});
+  b.AddInstruction(HloInstruction::CreateConvolve(
+      shape, lhs_instruction, rhs_instruction, window, dnums));
+  module().AddEntryComputation(b.Build());
+
+  std::unique_ptr<Literal> result = Evaluate();
+
+  // clang-format off
+  // Result dimensions: [feature=1, height=1, batch=1, width=2]
+  Array4D<float> expected_array({{{{2514, 2685}}}});
+  Array4D<float> expected_array_bf16({{{{2512, 2672}}}});
+  // clang-format on
+  auto expected = Literal::CreateR4FromArray4D<float>(
+      use_bfloat16_ ? expected_array_bf16 : expected_array);
+
+  LiteralTestUtil::ExpectEqual(*expected, *result);
+}
+
+TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) {
   HloComputation::Builder b(TestName());
 
   Array4D<float> lhs_array(1, 1, 4, 4);
@@ -912,10 +1050,9 @@ TEST_F(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) {
   const Shape& shape = ShapeUtil::MakeShape(F32, {1, 1, 7, 7});
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, window, dnums));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   Array4D<float> expected_array(1, 1, 7, 7);
   expected_array.FillWithYX(Array2D<float>({
@@ -932,7 +1069,7 @@ TEST_F(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) {
+TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) {
   HloComputation::Builder b(TestName());
 
   Array4D<float> lhs_array(1, 1, 4, 4);
@@ -976,10 +1113,9 @@ TEST_F(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) {
   const Shape& shape = ShapeUtil::MakeShape(F32, {1, 1, 8, 8});
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, window, dnums));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   Array4D<float> expected_array(1, 1, 8, 8);
   expected_array.FillWithYX(Array2D<float>({
@@ -997,7 +1133,7 @@ TEST_F(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest,
+TEST_P(HloEvaluatorTest,
        DilatedWindowAndBaseConv2DWithDifferentLowAndHighPaddingAndStrides) {
   HloComputation::Builder b(TestName());
 
@@ -1048,10 +1184,9 @@ TEST_F(HloEvaluatorTest,
   const Shape& shape = ShapeUtil::MakeShape(F32, {1, 1, 9, 3});
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, window, dnums));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   Array4D<float> expected_array(1, 1, 9, 3);
   expected_array.FillWithYX(Array2D<float>({
@@ -1070,7 +1205,7 @@ TEST_F(HloEvaluatorTest,
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, ReduceAdd) {
+TEST_P(HloEvaluatorTest, ReduceAdd) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1103,17 +1238,16 @@ TEST_F(HloEvaluatorTest, ReduceAdd) {
       HloInstruction::CreateReduce(shape, arg_instruction, init_value,
                                    /*dimensions_to_reduce=*/{1}, add_func));
 
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR1<float>({6, 18});
 
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, ReduceWindowMax) {
+TEST_P(HloEvaluatorTest, ReduceWindowMax) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1156,15 +1290,15 @@ TEST_F(HloEvaluatorTest, ReduceWindowMax) {
   b.AddInstruction(HloInstruction::CreateReduceWindow(
       shape, arg_instruction, init_value, window, max_func));
 
-  auto computation = module().AddEntryComputation(b.Build());
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  module().AddEntryComputation(b.Build());
+
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR2<float>({{6, 7}});
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, ReduceWindowAdd) {
+TEST_P(HloEvaluatorTest, ReduceWindowAdd) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1213,15 +1347,15 @@ TEST_F(HloEvaluatorTest, ReduceWindowAdd) {
   b.AddInstruction(HloInstruction::CreateReduceWindow(
       shape, arg_instruction, init_value, window, add_func));
 
-  auto computation = module().AddEntryComputation(b.Build());
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  module().AddEntryComputation(b.Build());
+
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR2<float>({{1, 3, 5}, {5, 11, 13}});
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, ReduceWindowAdd6D) {
+TEST_P(HloEvaluatorTest, ReduceWindowAdd6D) {
   HloComputation::Builder b(TestName());
 
   // arg: f32[4,4,4,4,4,4] full of ones. Using small dims to limit run-time.
@@ -1274,9 +1408,9 @@ TEST_F(HloEvaluatorTest, ReduceWindowAdd6D) {
   b.AddInstruction(HloInstruction::CreateReduceWindow(
       shape, arg_instruction, init_value, window, add_func));
 
-  auto computation = module().AddEntryComputation(b.Build());
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  module().AddEntryComputation(b.Build());
+
+  std::unique_ptr<Literal> result = Evaluate();
 
   std::vector<int64> output_dims = {4, 3, 3, 3, 4, 4};
   std::unique_ptr<Literal> result_literal =
@@ -1284,7 +1418,7 @@ TEST_F(HloEvaluatorTest, ReduceWindowAdd6D) {
   LiteralTestUtil::ExpectEqual(*result_literal, *result);
 }
 
-TEST_F(HloEvaluatorTest, StridedSlice) {
+TEST_P(HloEvaluatorTest, StridedSlice) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1305,10 +1439,9 @@ TEST_F(HloEvaluatorTest, StridedSlice) {
                                                /*start_indices=*/{0, 2},
                                                /*limit_indices=*/{3, 5},
                                                /*strides=*/{2, 3}));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR2<float>({
       {3},
@@ -1318,7 +1451,7 @@ TEST_F(HloEvaluatorTest, StridedSlice) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, DynamicSlice) {
+TEST_P(HloEvaluatorTest, DynamicSlice) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1339,10 +1472,9 @@ TEST_F(HloEvaluatorTest, DynamicSlice) {
   Shape shape = ShapeUtil::MakeShape(F32, {2, 3});
   b.AddInstruction(HloInstruction::CreateDynamicSlice(shape, operand,
                                                       start_indices, {2, 3}));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR2<float>({
       {2, 3, 4},
@@ -1354,7 +1486,7 @@ TEST_F(HloEvaluatorTest, DynamicSlice) {
 
 // Verifies that the HloEvaluator's implementation goes along with existing
 // backends' behavior, although this is not required by the spec.
-TEST_F(HloEvaluatorTest, DynamicSliceModSlice) {
+TEST_P(HloEvaluatorTest, DynamicSliceModSlice) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1375,10 +1507,9 @@ TEST_F(HloEvaluatorTest, DynamicSliceModSlice) {
   Shape shape = ShapeUtil::MakeShape(F32, {2, 3});
   b.AddInstruction(HloInstruction::CreateDynamicSlice(shape, operand,
                                                       start_indices, {2, 3}));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR2<float>({
       {2, 3, 4},
@@ -1388,7 +1519,7 @@ TEST_F(HloEvaluatorTest, DynamicSliceModSlice) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, DynamicSliceUpdate) {
+TEST_P(HloEvaluatorTest, DynamicSliceUpdate) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1412,10 +1543,9 @@ TEST_F(HloEvaluatorTest, DynamicSliceUpdate) {
   Shape shape = ShapeUtil::MakeShape(F64, {2, 3});
   b.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
       shape, operand, update, start_indices));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR2<double>({
       {1, -2, -3},
@@ -1425,7 +1555,7 @@ TEST_F(HloEvaluatorTest, DynamicSliceUpdate) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, SetAndGetTuples) {
+TEST_P(HloEvaluatorTest, SetAndGetTuples) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1448,9 +1578,9 @@ TEST_F(HloEvaluatorTest, SetAndGetTuples) {
   Shape shape = ShapeUtil::MakeShape(F64, {2, 3});
   b.AddInstruction(HloInstruction::CreateGetTupleElement(shape, tuple, 1));
 
-  auto computation = module().AddEntryComputation(b.Build());
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  module().AddEntryComputation(b.Build());
+
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR2<double>({
       {1, 2, 3},
@@ -1460,7 +1590,7 @@ TEST_F(HloEvaluatorTest, SetAndGetTuples) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, SetAndGetNestedTuples) {
+TEST_P(HloEvaluatorTest, SetAndGetNestedTuples) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1487,9 +1617,9 @@ TEST_F(HloEvaluatorTest, SetAndGetNestedTuples) {
   b.AddInstruction(
       HloInstruction::CreateGetTupleElement(tuple2->shape(), outer_tuple, 1));
 
-  auto computation = module().AddEntryComputation(b.Build());
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  module().AddEntryComputation(b.Build());
+
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto result_inner_literal =
       Literal::CreateR2FromArray2D<double>(*operand_array);
@@ -1501,7 +1631,7 @@ TEST_F(HloEvaluatorTest, SetAndGetNestedTuples) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, Reverse) {
+TEST_P(HloEvaluatorTest, Reverse) {
   HloComputation::Builder b(TestName());
 
   // Input shape is float[4x3x2x1].
@@ -1527,10 +1657,9 @@ TEST_F(HloEvaluatorTest, Reverse) {
 
   const Shape shape = ShapeUtil::MakeShape(F32, {4, 3, 2, 1});
   b.AddInstruction(HloInstruction::CreateReverse(shape, operand, {0, 1}));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   // clang-format off
   auto expected = Literal::CreateR4FromArray4D<float>({
@@ -1555,7 +1684,7 @@ TEST_F(HloEvaluatorTest, Reverse) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, EvaluateWithSubstitutions) {
+TEST_P(HloEvaluatorTest, EvaluateWithSubstitutions) {
   HloComputation::Builder b(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4});
 
@@ -1578,7 +1707,7 @@ TEST_F(HloEvaluatorTest, EvaluateWithSubstitutions) {
 
 // Check that EvaluateWithSubstitutions works if one of the operands to the op
 // we're evaluating is a constant.
-TEST_F(HloEvaluatorTest, EvaluateWithSubstitutionsWithConstantOperand) {
+TEST_P(HloEvaluatorTest, EvaluateWithSubstitutionsWithConstantOperand) {
   HloComputation::Builder b(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4});
 
@@ -1600,5 +1729,8 @@ TEST_F(HloEvaluatorTest, EvaluateWithSubstitutionsWithConstantOperand) {
                                *result.ValueOrDie());
 }
 
+INSTANTIATE_TEST_CASE_P(HloEvaluatorTest_Instantiation, HloEvaluatorTest,
+                        ::testing::ValuesIn(use_bf16_params));
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
index 755374b91d05f4b6186e75e98847cbd3ffed0e93..0111cfd5a3d7889f80370f9e3e744457bc4091e4 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
@@ -40,7 +40,7 @@ HloProfileIndexMap::HloProfileIndexMap(const HloModule& module) {
   }
 }
 
-static HloProfilePrinter CreateOwnedHloProfilePrinter(
+std::unique_ptr<HloProfilePrinter> CreateHloProfilePrinter(
     const HloProfileIndexMap& hlo_profile_index_map,
     const HloCostAnalysis& cost_analysis) {
   using HloComputationInfo = HloProfilePrinter::HloComputationInfo;
@@ -76,14 +76,14 @@ static HloProfilePrinter CreateOwnedHloProfilePrinter(
       HloProfilePrinter::HloInstructionInfo* instruction_info =
           &computation_info->instructions[instruction_index_in_static_data++];
       instruction_info->long_name = strdup(hlo->ToString().c_str());
-      instruction_info->short_name =
-          strdup(hlo->ToString(/*compact_operands=*/true).c_str());
+      instruction_info->short_name = strdup(
+          hlo->ToString(HloPrintOptions().set_compact_operands(true)).c_str());
       instruction_info->category = strdup(hlo->ToCategory().c_str());
       instruction_info->flop_count = cost_analysis.flop_count(*hlo);
       instruction_info->transcendental_count =
           cost_analysis.transcendental_count(*hlo);
       instruction_info->bytes_accessed = cost_analysis.bytes_accessed(*hlo);
-      instruction_info->seconds = cost_analysis.seconds(*hlo);
+      instruction_info->optimal_seconds = cost_analysis.optimal_seconds(*hlo);
       instruction_info->profile_index =
           hlo_profile_index_map.GetProfileIndexFor(*hlo);
       CHECK_LT(instruction_info->profile_index, max_profile_index);
@@ -108,15 +108,16 @@ static HloProfilePrinter CreateOwnedHloProfilePrinter(
     delete[] computation_infos;
   };
 
-  return HloProfilePrinter(computation_infos,
-                           hlo_profile_index_map.computation_count(), deleter);
+  return MakeUnique<HloProfilePrinter>(
+      computation_infos, hlo_profile_index_map.computation_count(),
+      /*profile_counters_size=*/max_profile_index, deleter);
 }
 
-HloExecutionProfile::HloExecutionProfile(const HloModule& module,
-                                         const HloCostAnalysis& cost_analysis)
-    : hlo_profile_index_map_(module),
-      hlo_profile_printer_(
-          CreateOwnedHloProfilePrinter(hlo_profile_index_map_, cost_analysis)),
+HloExecutionProfile::HloExecutionProfile(
+    const HloProfilePrinter* hlo_profile_printer,
+    const HloProfileIndexMap* hlo_profile_index_map)
+    : hlo_profile_printer_(*hlo_profile_printer),
+      hlo_profile_index_map_(*hlo_profile_index_map),
       profile_counters_(
           /*count*/ hlo_profile_index_map_.total_count(),
           /*value*/ 0) {}
@@ -131,10 +132,4 @@ uint64 HloExecutionProfile::GetCyclesTakenBy(const HloInstruction& hlo) const {
   return profile_counters_[hlo_profile_index_map_.GetProfileIndexFor(hlo)];
 }
 
-string HloExecutionProfile::ToString(
-    const DeviceDescription& device_description) const {
-  return hlo_profile_printer_.ToString(profile_counters_.data(),
-                                       device_description.clock_rate_ghz());
-}
-
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h b/tensorflow/compiler/xla/service/hlo_execution_profile.h
index 84702680c0c40335098530c4b1fdb164bb7f9374..470fd4ce3c205d84152238f4b18daad77e403f68 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.h
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h
@@ -77,6 +77,11 @@ class HloProfileIndexMap {
   std::unordered_map<const HloComputation*, int64> computation_to_profile_idx_;
 };
 
+// Create an instance of `HloProfilePrinter` that owns its memory.
+std::unique_ptr<HloProfilePrinter> CreateHloProfilePrinter(
+    const HloProfileIndexMap& hlo_profile_index_map,
+    const HloCostAnalysis& cost_analysis);
+
 // Describes how much time each HLO operation took.
 //
 // Each HloComputation takes a certain number of cycles.  This class helps break
@@ -85,8 +90,8 @@ class HloExecutionProfile {
  public:
   using DeviceDescription = perftools::gputools::DeviceDescription;
 
-  HloExecutionProfile(const HloModule& module,
-                      const HloCostAnalysis& cost_analysis);
+  HloExecutionProfile(const HloProfilePrinter* hlo_profile_printer,
+                      const HloProfileIndexMap* hlo_profile_index_map);
 
   // Record how many cycles this HLO took to execute.
   void SetCyclesTakenBy(const HloInstruction* hlo, uint64 cycles_taken);
@@ -114,15 +119,16 @@ class HloExecutionProfile {
   // for the operations in a given computation. Returns an empty string if it
   // wasn't possible to generate a printable version. cost_analysis should be a
   // clean analysis that can be used to visit the computation.
-  string ToString(const DeviceDescription& device_description) const;
+  string ToString(const DeviceDescription& device_description) const {
+    return hlo_profile_printer_.ToString(profile_counters_.data(),
+                                         device_description.clock_rate_ghz());
+  }
 
- private:
-  // hlo_profile_index_map_ maps an Hlo entity (computation or instruction) to
-  // an index in profile_counters_.
-  HloProfileIndexMap hlo_profile_index_map_;
+  std::vector<int64>* mutable_profile_counters() { return &profile_counters_; }
 
-  // Used to print profile_counters_ in a human readable form.
-  HloProfilePrinter hlo_profile_printer_;
+ private:
+  const HloProfilePrinter& hlo_profile_printer_;
+  const HloProfileIndexMap& hlo_profile_index_map_;
 
   // Stores per-Hlo profile counters.  This is the only thing that changes when
   // we execute an XLA computation.
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
index 0628444b34b017297d5da7980202e4c5586879ab..b1e6729e2bccad4bdbe075a635d8a9b1ede6fecb 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
@@ -72,7 +72,11 @@ TEST_F(HloExecutionProfileTest, Basic) {
   };
 
   HloCostAnalysis cost_analysis(shape_size_function);
-  HloExecutionProfile execution_profile(*hlo_module, cost_analysis);
+  HloProfileIndexMap profile_index_map(*hlo_module);
+  std::unique_ptr<HloProfilePrinter> profile_printer =
+      CreateHloProfilePrinter(profile_index_map, cost_analysis);
+  HloExecutionProfile execution_profile(profile_printer.get(),
+                                        &profile_index_map);
 
   const int64 add_cycles = 1000;
   const int64 dot_cycles = 4000;
@@ -90,10 +94,10 @@ TEST_F(HloExecutionProfileTest, Basic) {
   const std::vector<string>& line_3 = lines_and_words[3];
 
   EXPECT_EQ(line_2[kInstructionCyclesIndex], std::to_string(dot_cycles));
-  EXPECT_EQ(line_2[kInstructionNameIndex], dot_instruction->name());
+  EXPECT_EQ(line_2[kInstructionNameIndex], '%' + dot_instruction->name());
 
   EXPECT_EQ(line_3[kInstructionCyclesIndex], std::to_string(add_cycles));
-  EXPECT_EQ(line_3[kInstructionNameIndex], add_instruction->name());
+  EXPECT_EQ(line_3[kInstructionNameIndex], '%' + add_instruction->name());
 }
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index d71a4b42c71154a25d1e6ec029ba3922361fd0b9..44db09208544a4372f37861b0a2a824faa593d60 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -864,9 +864,10 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
   // (eg, parameter).
   switch (instr->opcode()) {
     case HloOpcode::kAbs:
-    case HloOpcode::kRoundNearestAfz:
     case HloOpcode::kAdd:
+    case HloOpcode::kAnd:
     case HloOpcode::kAtan2:
+    case HloOpcode::kBitcastConvert:
     case HloOpcode::kCeil:
     case HloOpcode::kClamp:
     case HloOpcode::kComplex:
@@ -882,18 +883,19 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kIsFinite:
     case HloOpcode::kLe:
     case HloOpcode::kLog:
-    case HloOpcode::kAnd:
-    case HloOpcode::kNot:
-    case HloOpcode::kOr:
     case HloOpcode::kLt:
     case HloOpcode::kMaximum:
     case HloOpcode::kMinimum:
     case HloOpcode::kMultiply:
     case HloOpcode::kNe:
     case HloOpcode::kNegate:
+    case HloOpcode::kNot:
+    case HloOpcode::kOr:
     case HloOpcode::kPower:
     case HloOpcode::kReal:
     case HloOpcode::kRemainder:
+    case HloOpcode::kRng:
+    case HloOpcode::kRoundNearestAfz:
     case HloOpcode::kShiftLeft:
     case HloOpcode::kShiftRightArithmetic:
     case HloOpcode::kShiftRightLogical:
@@ -903,7 +905,6 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kSort:
     case HloOpcode::kSubtract:
     case HloOpcode::kTanh:
-    case HloOpcode::kRng:
       // De-emphasize scalar-shaped elementwise ops -- they're generally
       // uninteresting.
       if (ShapeUtil::IsEffectiveScalar(instr->shape())) {
@@ -911,9 +912,9 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
       }
       return kYellow;
     case HloOpcode::kBitcast:
-    case HloOpcode::kTuple:
-    case HloOpcode::kTrace:
     case HloOpcode::kGetTupleElement:
+    case HloOpcode::kTrace:
+    case HloOpcode::kTuple:
       return kWhite;
     case HloOpcode::kBroadcast:
       // De-emphasize nodes which broadcast a scalar within a fusion node --
@@ -952,28 +953,28 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
       return kRed;
     case HloOpcode::kParameter:
       return kParameterColor;
-    case HloOpcode::kBatchNormTraining:
-    case HloOpcode::kBatchNormInference:
     case HloOpcode::kBatchNormGrad:
+    case HloOpcode::kBatchNormInference:
+    case HloOpcode::kBatchNormTraining:
     case HloOpcode::kReduce:
-    case HloOpcode::kSelectAndScatter:
     case HloOpcode::kReduceWindow:
+    case HloOpcode::kSelectAndScatter:
       return kPurple;
-    case HloOpcode::kMap:
     case HloOpcode::kFusion:
+    case HloOpcode::kMap:
       return kGray;
-    case HloOpcode::kSend:
-    case HloOpcode::kSendDone:
-    case HloOpcode::kRecv:
-    case HloOpcode::kRecvDone:
+    case HloOpcode::kCrossReplicaSum:
     case HloOpcode::kInfeed:
     case HloOpcode::kOutfeed:
-    case HloOpcode::kCrossReplicaSum:
+    case HloOpcode::kRecv:
+    case HloOpcode::kRecvDone:
+    case HloOpcode::kSend:
+    case HloOpcode::kSendDone:
       return kBrown;
+    case HloOpcode::kCall:
     case HloOpcode::kConditional:
     case HloOpcode::kCustomCall:
     case HloOpcode::kWhile:
-    case HloOpcode::kCall:
       return kDarkGreen;
     case HloOpcode::kConstant:
       LOG(FATAL) << "Constants don't get their own nodes in the graph.";
@@ -1055,7 +1056,7 @@ string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) {
       case HloOpcode::kBatchNormGrad:
         return Printf("feature_index=%lld", instr->feature_index());
       case HloOpcode::kCustomCall:
-        return Printf("custom_call_target=%s", instr->custom_call_target());
+        return Printf("target=%s", instr->custom_call_target());
       case HloOpcode::kSlice:
         return std::all_of(instr->slice_strides().begin(),
                            instr->slice_strides().end(),
@@ -1090,7 +1091,7 @@ string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) {
         instr->shape().dimensions_size() > 1 &&
         !ShapeUtil::IsTuple(instr->shape())) {
       StrAppend(&instr_shape, "{",
-                Join(instr->shape().layout().minor_to_major(), ","), "}");
+                Join(LayoutUtil::MinorToMajor(instr->shape()), ","), "}");
     }
 
     // Some instructions have giant tuples as their shapes, so truncate the
@@ -1353,18 +1354,17 @@ string SaveGraph(const string& graph,
       break;
   }
   string path = JoinPath(
-      dest_path, StrCat("hlo_graph_", output_num++, ".XXXXXX", file_extension));
+      dest_path, StrCat("hlo_graph_", output_num++, "."));
   auto status = Status::OK();
-  int fd = mkstemps(&path[0], file_extension.length());
-  if (fd < 0) {
+  auto env = tensorflow::Env::Default();
+  if (!env->CreateUniqueFileName(&path, file_extension)) {
     status =
         Status(tensorflow::error::Code::UNKNOWN,
                StrCat("Failed to create temporary file to dump HLO graph: ",
                       strerror(errno)));
   } else {
     status =
-        tensorflow::WriteStringToFile(tensorflow::Env::Default(), path, graph);
-    close(fd);
+        tensorflow::WriteStringToFile(env, path, graph);
   }
   if (!status.ok()) {
     LOG(WARNING) << "Saving HLO graph failed: " << status;
@@ -1437,7 +1437,8 @@ void DumpText(const HloModule& module, const string& label,
       do_prefix ? StrCat(prefix, "-", label, ".txt") : StrCat(label, ".txt");
   string path = JoinPath(directory_path, filename);
   TF_CHECK_OK(WriteStringToFile(
-      env, path, module.ToString(/*include_large_constants=*/true)));
+      env, path,
+      module.ToString(HloPrintOptions().set_print_large_constants(true))));
   LOG(INFO) << "dumping module '" << module.name() << "' to " << path;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index c35ca1eb992d98d10a0af1ca2327bcb93c2b4972..89a95b2b991b061acdb5701dc7507b6b0a33fe73 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -52,7 +52,9 @@ using ::tensorflow::strings::StrCat;
 StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     HloModule* module, const HloInstructionProto& proto,
     const tensorflow::gtl::FlatMap<string, HloInstruction*>& instruction_map,
-    tensorflow::gtl::FlatMap<string, HloComputation*>* computation_map) {
+    const tensorflow::gtl::FlatMap<string, HloComputation*>& computation_map,
+    const std::function<void(std::unique_ptr<HloComputation>)>&
+        add_fused_computation) {
   TF_RET_CHECK(!proto.opcode().empty());
   TF_ASSIGN_OR_RETURN(HloOpcode opcode, StringToHloOpcode(proto.opcode()));
   TF_RET_CHECK(proto.has_shape());
@@ -78,19 +80,19 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     TF_RET_CHECK(!proto.fusion_kind().empty());
     TF_ASSIGN_OR_RETURN(instruction->fusion_kind_,
                         StringToFusionKind(proto.fusion_kind()));
-    TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<HloComputation> fused_computation,
-        HloComputation::CreateFromProto(
-            module, proto.fused_instructions_computation(), computation_map,
-            /*fusion_instruction=*/instruction.get()));
-    instruction->called_computations_.push_back(
-        module->AddEmbeddedComputation(std::move(fused_computation)));
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloComputation> fused_computation,
+                        HloComputation::CreateFromProto(
+                            module, proto.fused_instructions_computation(),
+                            computation_map, add_fused_computation,
+                            /*fusion_instruction=*/instruction.get()));
+    instruction->called_computations_.push_back(fused_computation.get());
+    add_fused_computation(std::move(fused_computation));
   } else {
     for (const string& computation_name : proto.called_computation_names()) {
-      TF_RET_CHECK(ContainsKey(*computation_map, computation_name))
+      TF_RET_CHECK(ContainsKey(computation_map, computation_name))
           << "No computation named " << computation_name;
       instruction->called_computations_.push_back(
-          computation_map->at(computation_name));
+          computation_map.at(computation_name));
     }
   }
 
@@ -102,7 +104,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     instruction->literal_ = MakeUnique<Literal>(proto.literal());
   }
   instruction->parameter_number_ = proto.parameter_number();
-  instruction->parameter_name_ = proto.parameter_name();
 
   instruction->tuple_index_ = proto.tuple_index();
   for (int64 dimension : proto.dimensions()) {
@@ -116,6 +117,10 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
         MakeUnique<ConvolutionDimensionNumbers>(
             proto.convolution_dimension_numbers());
   }
+  if (proto.has_dot_dimension_numbers()) {
+    instruction->dot_dimension_numbers_ =
+        MakeUnique<DotDimensionNumbers>(proto.dot_dimension_numbers());
+  }
   for (const HloInstructionProto::SliceDimensions& slice_dimensions :
        proto.slice_dimensions()) {
     instruction->slice_starts_.push_back(slice_dimensions.start());
@@ -148,8 +153,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
   auto instruction =
       WrapUnique(new HloInstruction(HloOpcode::kParameter, shape));
   instruction->parameter_number_ = parameter_number;
-  instruction->parameter_name_ = name;
-  instruction->name_ = "%" + name;
+  instruction->name_ = name;
   return instruction;
 }
 
@@ -330,6 +334,31 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
   return instruction;
 }
 
+/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateDot(
+    const Shape& shape, HloInstruction* lhs, HloInstruction* rhs,
+    const DotDimensionNumbers& dimension_numbers) {
+  auto instruction = WrapUnique(new HloInstruction(HloOpcode::kDot, shape));
+  instruction->AppendOperand(lhs);
+  instruction->AppendOperand(rhs);
+  instruction->dot_dimension_numbers_ =
+      MakeUnique<DotDimensionNumbers>(dimension_numbers);
+  return instruction;
+}
+
+/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateCanonicalDot(
+    const Shape& shape, HloInstruction* lhs, HloInstruction* rhs) {
+  CHECK_EQ(ShapeUtil::Rank(lhs->shape()), 2);
+  CHECK_EQ(ShapeUtil::Rank(rhs->shape()), 2);
+
+  auto instruction = WrapUnique(new HloInstruction(HloOpcode::kDot, shape));
+  instruction->AppendOperand(lhs);
+  instruction->AppendOperand(rhs);
+  instruction->dot_dimension_numbers_ = MakeUnique<DotDimensionNumbers>();
+  instruction->dot_dimension_numbers_->add_lhs_contracting_dimensions(1);
+  instruction->dot_dimension_numbers_->add_rhs_contracting_dimensions(0);
+  return instruction;
+}
+
 /* static */ std::unique_ptr<HloInstruction>
 HloInstruction::CreateReducePrecision(const Shape& shape,
                                       HloInstruction* operand,
@@ -344,12 +373,9 @@ HloInstruction::CreateReducePrecision(const Shape& shape,
 }
 
 /* static */ std::unique_ptr<HloInstruction>
-HloInstruction::CreateCrossReplicaSum(const Shape& shape,
-                                      HloInstruction* operand) {
-  auto instruction =
-      WrapUnique(new HloInstruction(HloOpcode::kCrossReplicaSum, shape));
-  instruction->AppendOperand(operand);
-  return instruction;
+HloInstruction::CreateCrossReplicaSum(
+    const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands) {
+  return CreateNary(shape, HloOpcode::kCrossReplicaSum, operands);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateInfeed(
@@ -436,6 +462,23 @@ HloInstruction::CreateCrossReplicaSum(const Shape& shape,
   return instruction;
 }
 
+/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateConditional(
+    const Shape& shape, HloInstruction* pred,
+    HloInstruction* true_computation_arg, HloComputation* true_computation,
+    HloInstruction* false_computation_arg, HloComputation* false_computation) {
+  auto instruction =
+      WrapUnique(new HloInstruction(HloOpcode::kConditional, shape));
+  instruction->AppendOperand(pred);
+  instruction->AppendOperand(true_computation_arg);
+  instruction->AppendOperand(false_computation_arg);
+  // In called_computations_, the index of true_computation must be 0 and that
+  // of false computation must be 1, as defined by kTrueComputationIndex and
+  // kFalseComputationIndex.
+  instruction->called_computations_.push_back(true_computation);
+  instruction->called_computations_.push_back(false_computation);
+  return instruction;
+}
+
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateSlice(
     const Shape& shape, HloInstruction* operand,
     tensorflow::gtl::ArraySlice<int64> start_indices,
@@ -499,6 +542,15 @@ HloInstruction::CreateDynamicUpdateSlice(const Shape& shape,
   return instruction;
 }
 
+/* static */ std::unique_ptr<HloInstruction>
+HloInstruction::CreateBitcastConvert(const Shape& shape,
+                                     HloInstruction* operand) {
+  auto instruction =
+      WrapUnique(new HloInstruction(HloOpcode::kBitcastConvert, shape));
+  instruction->AppendOperand(operand);
+  return instruction;
+}
+
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateReduce(
     const Shape& shape, HloInstruction* arg, HloInstruction* init_value,
     tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce,
@@ -631,7 +683,10 @@ HloInstruction::CreateSelectAndScatter(
   CHECK_EQ(shape.dimensions().size(), operand->shape().dimensions().size());
   CHECK(std::equal(operand->shape().dimensions().begin(),
                    operand->shape().dimensions().end(),
-                   Permute(dimensions, shape.dimensions()).begin()));
+                   Permute(dimensions, shape.dimensions()).begin()))
+      << "shape: " << ShapeUtil::HumanString(shape)
+      << ", operand->shape(): " << ShapeUtil::HumanString(shape)
+      << ", dimensions: {" << Join(dimensions, ", ") << "}";
   auto instruction =
       WrapUnique(new HloInstruction(HloOpcode::kTranspose, shape));
   instruction->AppendOperand(operand);
@@ -791,7 +846,7 @@ HloInstruction* HloInstruction::FuseInstructionInternal(
 HloInstruction* HloInstruction::CloneAndFuseInternal(
     HloInstruction* instruction_to_fuse, bool add_output) {
   CHECK_EQ(opcode_, HloOpcode::kFusion);
-  CHECK(instruction_to_fuse->IsFusable());
+  CHECK(instruction_to_fuse->IsFusable()) << instruction_to_fuse->ToString();
   VLOG(3) << "CloneAndFuseInternal:\n" << instruction_to_fuse->ToString();
   HloInstruction* clone = nullptr;
   if (called_computations_.empty()) {
@@ -869,10 +924,8 @@ HloInstruction* HloInstruction::CloneAndFuseInternal(
       // parameter instruction.
       int64 param_no = fused_parameters.size();
       // Name the parameter after the instruction it represents in the outer
-      // (non-fusion) computation. Strip the leading "%" from the operand name
-      // to avoid a double %%.
-      string param_name =
-          StrCat(operand->name().substr(1), ".param_", param_no);
+      // (non-fusion) computation.
+      string param_name = StrCat(operand->name(), ".param_", param_no);
       fused_param = fused_instructions_computation()->AddParameter(
           CreateParameter(param_no, operand->shape(), param_name));
       AppendOperand(operand);
@@ -956,6 +1009,7 @@ bool HloInstruction::HasSideEffect() const {
     case HloOpcode::kSendDone:
     case HloOpcode::kRecv:
     case HloOpcode::kRecvDone:
+    case HloOpcode::kRng:
     case HloOpcode::kInfeed:
     case HloOpcode::kOutfeed:
     case HloOpcode::kTrace:
@@ -1013,7 +1067,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
   VLOG(3) << "CloneWithNewOperands:\n  " << ToString();
   VLOG(3) << "  new operands:";
   for (const HloInstruction* new_operand : new_operands) {
-    VLOG(3) << "    " << new_operand->name();
+    VLOG(3) << "    %" << new_operand->name();
   }
 
   std::unique_ptr<HloInstruction> clone;
@@ -1057,7 +1111,6 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kLe:
     case HloOpcode::kLt:
     case HloOpcode::kNe:
-    case HloOpcode::kDot:
     case HloOpcode::kMaximum:
     case HloOpcode::kMinimum:
     case HloOpcode::kPower:
@@ -1095,6 +1148,10 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       CHECK_EQ(new_operands.size(), 1);
       clone = CreateConvert(shape, new_operands[0]);
       break;
+    case HloOpcode::kBitcastConvert:
+      CHECK_EQ(new_operands.size(), 1);
+      clone = CreateBitcastConvert(shape, new_operands[0]);
+      break;
     case HloOpcode::kReducePrecision:
       CHECK_EQ(new_operands.size(), 1);
       clone = CreateReducePrecision(shape, new_operands[0], exponent_bits_,
@@ -1105,9 +1162,13 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       clone = CreateConvolve(shape, new_operands[0], new_operands[1], *window_,
                              *convolution_dimension_numbers_);
       break;
+    case HloOpcode::kDot:
+      CHECK_EQ(new_operands.size(), 2);
+      clone = CreateDot(shape, new_operands[0], new_operands[1],
+                        *dot_dimension_numbers_);
+      break;
     case HloOpcode::kCrossReplicaSum:
-      CHECK_EQ(new_operands.size(), 1);
-      clone = CreateCrossReplicaSum(shape, new_operands[0]);
+      clone = CreateCrossReplicaSum(shape, new_operands);
       break;
     case HloOpcode::kGetTupleElement:
       CHECK_EQ(new_operands.size(), 1);
@@ -1182,7 +1243,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       clone = CloneFusionWithNewOperands(shape, new_operands, module);
       break;
     case HloOpcode::kParameter:
-      clone = CreateParameter(parameter_number_, shape, parameter_name_);
+      clone = CreateParameter(parameter_number_, shape, name_);
       break;
     case HloOpcode::kBatchNormTraining:
       CHECK_EQ(new_operands.size(), 3);
@@ -1211,6 +1272,11 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
                                   new_operands[4], epsilon(), feature_index());
       break;
     case HloOpcode::kConditional:
+      CHECK_EQ(new_operands.size(), 3);
+      clone = CreateConditional(shape, new_operands[0], new_operands[1],
+                                true_computation(), new_operands[2],
+                                false_computation());
+      break;
     case HloOpcode::kRecv:
     case HloOpcode::kRecvDone:
     case HloOpcode::kSend:
@@ -1476,7 +1542,6 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kCos:
     case HloOpcode::kCrossReplicaSum:
     case HloOpcode::kDivide:
-    case HloOpcode::kDot:
     case HloOpcode::kEq:
     case HloOpcode::kExp:
     case HloOpcode::kFloor:
@@ -1535,6 +1600,7 @@ bool HloInstruction::IdenticalSlowPath(
     // A convert result is determined by the primitive type that the operand is
     // converted into.
     case HloOpcode::kConvert:
+    case HloOpcode::kBitcastConvert:
       return shape().element_type() == other.shape().element_type();
 
     // A reduce-precision operation is determined by the bit sizes.
@@ -1548,6 +1614,10 @@ bool HloInstruction::IdenticalSlowPath(
              protobuf_util::ProtobufEquals(
                  convolution_dimension_numbers(),
                  other.convolution_dimension_numbers());
+    // Check dot dimension numbers.
+    case HloOpcode::kDot:
+      return protobuf_util::ProtobufEquals(dot_dimension_numbers(),
+                                           other.dot_dimension_numbers());
 
     // Reduction results are determined by the reduction dimension and the
     // reduction computation.
@@ -1814,6 +1884,32 @@ void HloInstruction::set_scatter(HloComputation* computation) {
   called_computations_[kScatterComputationIndex] = computation;
 }
 
+HloComputation* HloInstruction::true_computation() const {
+  CHECK_EQ(HloOpcode::kConditional, opcode_);
+  return called_computations_[kTrueComputationIndex];
+}
+
+HloComputation* HloInstruction::false_computation() const {
+  CHECK_EQ(HloOpcode::kConditional, opcode_);
+  return called_computations_[kFalseComputationIndex];
+}
+
+void HloInstruction::set_true_computation(HloComputation* true_computation) {
+  // Don't allow changing the computation for fused instructions so we don't
+  // have to recompute called_instructions for the entire fusion instruction.
+  CHECK(!IsFused());
+  CHECK_EQ(HloOpcode::kConditional, opcode_);
+  called_computations_[kTrueComputationIndex] = true_computation;
+}
+
+void HloInstruction::set_false_computation(HloComputation* false_computation) {
+  // Don't allow changing the computation for fused instructions so we don't
+  // have to recompute called_instructions for the entire fusion instruction.
+  CHECK(!IsFused());
+  CHECK_EQ(HloOpcode::kConditional, opcode_);
+  called_computations_[kFalseComputationIndex] = false_computation;
+}
+
 string HloInstruction::SignatureString() const {
   string operands =
       Join(operands_, ", ", [](string* out, HloInstruction* operand) {
@@ -1822,16 +1918,23 @@ string HloInstruction::SignatureString() const {
   return StrCat("(", operands, ") -> ", ShapeUtil::HumanString(shape()));
 }
 
-string HloInstruction::ToString(bool compact_operands, bool include_metadata,
-                                bool include_large_constants) const {
+namespace {
+
+string PrintName(const string& name, const HloPrintOptions& options) {
+  return StrCat(options.print_percent() ? "%" : "", name);
+}
+
+}  // namespace
+
+string HloInstruction::ToString(const HloPrintOptions& options) const {
   string result =
-      StrCat(name(), " = ", ShapeUtil::HumanStringWithLayout(shape()), " ",
-             HloOpcodeString(opcode()), "(",
-             OperandsToString(compact_operands, include_large_constants), ")");
-  for (const string& extra : ExtraAttributesToString()) {
+      StrCat(PrintName(name(), options), " = ",
+             ShapeUtil::HumanStringWithLayout(shape()), " ",
+             HloOpcodeString(opcode()), "(", OperandsToString(options), ")");
+  for (const string& extra : ExtraAttributesToString(options)) {
     StrAppend(&result, ", ", extra);
   }
-  if (include_metadata &&
+  if (options.print_metadata() &&
       (!metadata_.op_type().empty() || !metadata_.op_name().empty() ||
        !metadata_.source_file().empty())) {
     StrAppend(&result, ", metadata={", xla::OpMetadataToString(metadata_), "}");
@@ -1839,14 +1942,13 @@ string HloInstruction::ToString(bool compact_operands, bool include_metadata,
   return result;
 }
 
-string HloInstruction::OperandsToString(bool compact,
-                                        bool include_large_constants) const {
+string HloInstruction::OperandsToString(const HloPrintOptions& options) const {
   string operands;
   if (opcode() == HloOpcode::kConstant) {
     // For constants, show the actual value in place of an empty operand list.
     if ((!ShapeUtil::IsTuple(shape()) &&
          ShapeUtil::ElementsIn(shape()) <= 10) ||
-        include_large_constants) {
+        options.print_large_constants()) {
       // Literal::ToString emits multidimensional arrays over multiple
       // lines. Compact this into one line by stripping out white space.
       string tmp = literal().ToString();
@@ -1871,14 +1973,19 @@ string HloInstruction::OperandsToString(bool compact,
   } else {
     tensorflow::gtl::ArraySlice<HloInstruction*> slice(operands_);
     const int64 kMaxOperandsToShowIfCompact = 4;
-    if (compact && slice.size() > kMaxOperandsToShowIfCompact) {
+    if (options.compact_operands() &&
+        slice.size() > kMaxOperandsToShowIfCompact) {
       slice.remove_suffix(slice.size() - kMaxOperandsToShowIfCompact);
     }
     operands = Join(slice, ", ", [&](string* out, HloInstruction* operand) {
-      *out += ShapeUtil::HumanStringWithLayout(operand->shape());
-      if (!compact) {
-        StrAppend(out, " ", operand->name());
+      std::vector<string> str;
+      if (options.print_operand_shape()) {
+        str.push_back(ShapeUtil::HumanStringWithLayout(operand->shape()));
+      }
+      if (!options.compact_operands()) {
+        str.push_back(PrintName(operand->name(), options));
       }
+      StrAppend(out, Join(str, " "));
     });
     const int64 remaining = operands_.size() - slice.size();
     if (slice.size() != operands_.size()) {
@@ -1888,7 +1995,8 @@ string HloInstruction::OperandsToString(bool compact,
   return operands;
 }
 
-std::vector<string> HloInstruction::ExtraAttributesToString() const {
+std::vector<string> HloInstruction::ExtraAttributesToString(
+    const HloPrintOptions& options) const {
   std::vector<string> extra;
   if (opcode() == HloOpcode::kFusion) {
     extra.push_back(StrCat("kind=", xla::ToString(fusion_kind())));
@@ -1896,7 +2004,7 @@ std::vector<string> HloInstruction::ExtraAttributesToString() const {
   if (CanHaveDimensionsField()) {
     extra.push_back(StrCat("dimensions={", Join(dimensions(), ","), "}"));
   }
-  if (window_ != nullptr) {
+  if (window_ != nullptr && window_->dimensions_size() != 0) {
     extra.push_back(StrCat("window={", window_util::ToString(*window_), "}"));
   }
   if (padding_config_ != nullptr) {
@@ -1930,22 +2038,33 @@ std::vector<string> HloInstruction::ExtraAttributesToString() const {
   if (convolution_dimension_numbers_ != nullptr) {
     extra.push_back(ConvolutionDimensionNumbersToString());
   }
+  if (dot_dimension_numbers_ != nullptr) {
+    extra.push_back(DotDimensionNumbersToString());
+  }
 
   if (opcode() == HloOpcode::kWhile) {
-    extra.push_back(StrCat("condition=%", while_condition()->name()));
-    extra.push_back(StrCat("body=%", while_body()->name()));
+    extra.push_back(
+        StrCat("condition=", PrintName(while_condition()->name(), options)));
+    extra.push_back(StrCat("body=", PrintName(while_body()->name(), options)));
   } else if (opcode() == HloOpcode::kSelectAndScatter) {
-    extra.push_back(StrCat("select=%", select()->name()));
-    extra.push_back(StrCat("scatter=%", scatter()->name()));
+    extra.push_back(StrCat("select=", PrintName(select()->name(), options)));
+    extra.push_back(StrCat("scatter=", PrintName(scatter()->name(), options)));
+  } else if (opcode() == HloOpcode::kConditional) {
+    extra.push_back(StrCat("true_computation=",
+                           PrintName(true_computation()->name(), options)));
+    extra.push_back(StrCat("false_computation=",
+                           PrintName(false_computation()->name(), options)));
   } else if (opcode() == HloOpcode::kCall || opcode() == HloOpcode::kMap ||
              opcode() == HloOpcode::kReduceWindow ||
              opcode() == HloOpcode::kReduce) {
-    extra.push_back(StrCat("to_apply=%", to_apply()->name()));
+    extra.push_back(
+        StrCat("to_apply=", PrintName(to_apply()->name(), options)));
   } else if (!called_computations().empty()) {
     extra.push_back(StrCat(
         "calls=", Join(called_computations(), ", ",
-                       [](string* out, const HloComputation* computation) {
-                         StrAppend(out, "%", computation->name());
+                       [&](string* out, const HloComputation* computation) {
+                         StrAppend(out,
+                                   PrintName(computation->name(), options));
                        })));
   }
 
@@ -1963,8 +2082,9 @@ std::vector<string> HloInstruction::ExtraAttributesToString() const {
   if (!control_predecessors_.empty()) {
     extra.push_back(StrCat("control-predecessors={",
                            Join(control_predecessors_, ", ",
-                                [](string* out, HloInstruction* pre) {
-                                  StrAppend(out, pre->name());
+                                [&](string* out, HloInstruction* pre) {
+                                  StrAppend(out,
+                                            PrintName(pre->name(), options));
                                 }),
                            "}"));
   }
@@ -1975,14 +2095,26 @@ std::vector<string> HloInstruction::ExtraAttributesToString() const {
     extra.push_back(
         StrCat("outfeed_config=\"", CEscape(outfeed_config_), "\""));
   }
+  if (opcode() == HloOpcode::kRng) {
+    extra.push_back(
+        StrCat("distribution=", RandomDistributionToString(distribution_)));
+  }
+  if (opcode() == HloOpcode::kReducePrecision) {
+    extra.push_back(StrCat("exponent_bits=", exponent_bits_));
+    extra.push_back(StrCat("mantissa_bits=", mantissa_bits_));
+  }
+  if (opcode() == HloOpcode::kCustomCall) {
+    extra.push_back(
+        StrCat("custom_call_target=\"", CEscape(custom_call_target_), "\""));
+  }
   return extra;
 }
 
 string HloInstruction::ToShortString() const {
-  return StrCat(name(), " = ", HloOpcodeString(opcode()), "(",
+  return StrCat("%", name(), " = ", HloOpcodeString(opcode()), "(",
                 Join(operands_, ", ",
                      [](string* out, HloInstruction* operand) {
-                       StrAppend(out, operand->name());
+                       StrAppend(out, "%", operand->name());
                      }),
                 ")");
 }
@@ -2004,7 +2136,6 @@ HloInstructionProto HloInstruction::ToProto() const {
     *proto.mutable_literal() = literal_->ToProto();
   }
   proto.set_parameter_number(parameter_number_);
-  proto.set_parameter_name(parameter_name_);
   if (opcode() == HloOpcode::kFusion) {
     proto.set_fusion_kind(xla::ToString(fusion_kind()));
     *proto.mutable_fused_instructions_computation() =
@@ -2026,6 +2157,9 @@ HloInstructionProto HloInstruction::ToProto() const {
     *proto.mutable_convolution_dimension_numbers() =
         *convolution_dimension_numbers_;
   }
+  if (dot_dimension_numbers_ != nullptr) {
+    *proto.mutable_dot_dimension_numbers() = *dot_dimension_numbers_;
+  }
   for (int i = 0; i < slice_starts_.size(); ++i) {
     auto* slice_dimension = proto.add_slice_dimensions();
     slice_dimension->set_start(slice_starts_[i]);
@@ -2076,8 +2210,10 @@ string HloInstruction::ToCategory() const {
       bool saw_rank_1 = false;
       bool saw_higher_rank = false;
       for (const auto* operand : operands()) {
-        saw_rank_1 |= ShapeUtil::Rank(operand->shape()) == 1;
-        saw_higher_rank |= ShapeUtil::Rank(operand->shape()) > 1;
+        if (!ShapeUtil::IsTuple(operand->shape())) {
+          saw_rank_1 |= ShapeUtil::Rank(operand->shape()) == 1;
+          saw_higher_rank |= ShapeUtil::Rank(operand->shape()) > 1;
+        }
       }
       if (saw_rank_1 && saw_higher_rank) {
         return "rank-1-broadcast binary fusion";
@@ -2130,25 +2266,13 @@ bool HloInstruction::IsFusable() const {
   if (tracing()) {
     return false;
   }
-
   // Some kinds of instructions don't make sense to fuse.
   switch (opcode_) {
-    case HloOpcode::kInfeed:
-    case HloOpcode::kOutfeed:
     case HloOpcode::kParameter:
-    case HloOpcode::kTrace:
-    case HloOpcode::kRecv:
-    case HloOpcode::kRecvDone:
-    case HloOpcode::kSend:
-    case HloOpcode::kSendDone:
       return false;
-    // Only fuse Rng if it is used once, otherwise the random numbers generated
-    // will be different in each fusion. If it is the root (user count = 0)
-    // then it is the equivalent of having one user.
-    case HloOpcode::kRng:
-      return users_.size() <= 1;
+    // Side effecting instrutions cannot be fused.
     default:
-      return true;
+      return !HasSideEffect();
   }
 }
 
@@ -2199,7 +2323,7 @@ HloInstruction::HloInstruction(HloOpcode opcode, const Shape& shape)
     : unique_id_(-1),
       opcode_(opcode),
       shape_(shape),
-      name_("%" + HloOpcodeString(opcode)) {
+      name_(HloOpcodeString(opcode)) {
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(shape_));
 }
 
@@ -2259,6 +2383,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleConcatenate(this);
     case HloOpcode::kConvert:
       return visitor->HandleConvert(this);
+    case HloOpcode::kBitcastConvert:
+      return visitor->HandleBitcastConvert(this);
     case HloOpcode::kCopy:
       return visitor->HandleCopy(this);
     case HloOpcode::kMultiply:
@@ -2345,6 +2471,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleFusion(this);
     case HloOpcode::kCall:
       return visitor->HandleCall(this);
+    case HloOpcode::kConditional:
+      return visitor->HandleConditional(this);
     case HloOpcode::kCustomCall:
       return visitor->HandleCustomCall(this);
     case HloOpcode::kRecv:
@@ -2357,7 +2485,6 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleSendDone(this);
 
     // These opcodes are not handled here.
-    case HloOpcode::kConditional:
     case HloOpcode::kTrace:
       break;
   }
@@ -2423,7 +2550,7 @@ static Status PostOrderDFS(HloInstruction* root, Visitor* visitor,
         visitor->GetVisitState(current_id);
     if (visit_state == Visitor::kVisited) {
       dfs_stack.pop_back();
-      VLOG(3) << "Not visiting HLO " << current_node->name()
+      VLOG(3) << "Not visiting HLO %" << current_node->name()
               << " as it was already visited.";
       continue;
     }
@@ -2432,7 +2559,7 @@ static Status PostOrderDFS(HloInstruction* root, Visitor* visitor,
       dfs_stack.pop_back();
 
       TF_RETURN_IF_ERROR(visitor->Preprocess(current_node));
-      VLOG(2) << "Visiting HLO " << current_node->name();
+      VLOG(2) << "Visiting HLO %" << current_node->name();
       TF_RETURN_IF_ERROR(current_node->Visit(visitor));
       visitor->SetVisitState(current_id, Visitor::kVisited);
       TF_RETURN_IF_ERROR(visitor->Postprocess(current_node));
@@ -2477,7 +2604,7 @@ template <typename HloInstructionPtr>
 Status HloInstruction::Accept(DfsHloVisitorBase<HloInstructionPtr>* visitor,
                               bool call_finish_visit,
                               bool ignore_control_predecessors) {
-  VLOG(3) << "HloInstruction::Accept(" << name() << ")";
+  VLOG(3) << "HloInstruction::Accept(%" << name() << ")";
   TF_RETURN_IF_ERROR(
       PostOrderDFS(this, visitor, nullptr, ignore_control_predecessors));
   if (call_finish_visit) {
@@ -2493,7 +2620,7 @@ template Status HloInstruction::Accept(ConstDfsHloVisitor*, bool, bool);
 Status HloInstruction::AcceptWithOperandOrder(
     DfsHloVisitor* visitor, const CompareFunction& operand_order,
     bool call_finish_visit) {
-  VLOG(2) << "HloInstruction::AcceptWithOperandOrder(" << name() << ")";
+  VLOG(2) << "HloInstruction::AcceptWithOperandOrder(%" << name() << ")";
   InternalCompareFunction func = [&operand_order](
                                      std::pair<int, const HloInstruction*> a,
                                      std::pair<int, const HloInstruction*> b) {
@@ -2556,7 +2683,7 @@ Status HloInstruction::Accept(
 
 Status HloInstruction::AcceptOrdered(
     DfsHloVisitor* visitor, const std::vector<const HloInstruction*>& order) {
-  VLOG(2) << "HloInstruction::AcceptOrdered(" << name() << ")";
+  VLOG(2) << "HloInstruction::AcceptOrdered(%" << name() << ")";
   TF_RET_CHECK(OrderIsTopologicalSort(order));
 
   // Compute the predecessors of this instruction.
@@ -2575,7 +2702,7 @@ Status HloInstruction::AcceptOrdered(
     // The visitor can mark instructions as visited to skip particular
     // instructions.
     if (visitor->DidVisit(*const_instruction)) {
-      VLOG(3) << "Not visiting HLO " << const_instruction->name()
+      VLOG(3) << "Not visiting HLO %" << const_instruction->name()
               << " as it was already visited.";
       continue;
     }
@@ -2584,7 +2711,7 @@ Status HloInstruction::AcceptOrdered(
         const_cast<HloInstruction*>(const_instruction);
 
     TF_RETURN_IF_ERROR(visitor->Preprocess(instruction));
-    VLOG(2) << "Visiting HLO " << instruction->name();
+    VLOG(2) << "Visiting HLO %" << instruction->name();
     TF_RETURN_IF_ERROR(instruction->Visit(visitor));
     visitor->SetVisited(*instruction);
     TF_RETURN_IF_ERROR(visitor->Postprocess(instruction));
@@ -2630,6 +2757,7 @@ bool HloInstruction::IsElementwise() const {
     case HloOpcode::kRoundNearestAfz:
     case HloOpcode::kCeil:
     case HloOpcode::kConvert:
+    case HloOpcode::kBitcastConvert:
     case HloOpcode::kCopy:
     case HloOpcode::kCos:
     case HloOpcode::kExp:
@@ -2947,6 +3075,28 @@ string OpMetadataToString(const OpMetadata& metadata) {
   return Join(result, " ");
 }
 
+string RandomDistributionToString(const RandomDistribution& distribution) {
+  return tensorflow::str_util::Lowercase(RandomDistribution_Name(distribution));
+}
+
+StatusOr<RandomDistribution> StringToRandomDistribution(const string& name) {
+  static std::unordered_map<string, RandomDistribution>* map = [] {
+    static auto* map = new std::unordered_map<string, RandomDistribution>;
+    for (int i = 0; i < RandomDistribution_ARRAYSIZE; i++) {
+      if (RandomDistribution_IsValid(i)) {
+        auto value = static_cast<RandomDistribution>(i);
+        (*map)[RandomDistributionToString(value)] = value;
+      }
+    }
+    return map;
+  }();
+  auto found = map->find(tensorflow::str_util::Lowercase(name));
+  if (found == map->end()) {
+    return InvalidArgument("Unknown distribution");
+  }
+  return found->second;
+}
+
 std::ostream& operator<<(std::ostream& os, HloInstruction::FusionKind kind) {
   return os << ToString(kind);
 }
@@ -2967,25 +3117,25 @@ string HloInstruction::ConvolutionDimensionNumbersToString() const {
 
   // lhs_dims[i] is the symbol of the logical dimension i for the lhs
   // operand. E.g. if batch has dimension number 2, then lhs_dims[2] == "b".
-  std::vector<string> lhs_dims(2 + dnums.spatial_dimensions().size());
+  std::vector<string> lhs_dims(2 + dnums.input_spatial_dimensions().size());
   lhs_dims[dnums.input_batch_dimension()] = 'b';
   lhs_dims[dnums.input_feature_dimension()] = 'f';
-  for (int64 i = 0; i < dnums.spatial_dimensions().size(); ++i) {
-    lhs_dims[dnums.spatial_dimensions(i)] = StrCat(i);
+  for (int64 i = 0; i < dnums.input_spatial_dimensions().size(); ++i) {
+    lhs_dims[dnums.input_spatial_dimensions(i)] = StrCat(i);
   }
 
   std::vector<string> rhs_dims(2 + dnums.kernel_spatial_dimensions().size());
   rhs_dims[dnums.kernel_input_feature_dimension()] = "i";
   rhs_dims[dnums.kernel_output_feature_dimension()] = "o";
-  for (int64 i = 0; i < dnums.spatial_dimensions().size(); ++i) {
+  for (int64 i = 0; i < dnums.kernel_spatial_dimensions().size(); ++i) {
     rhs_dims[dnums.kernel_spatial_dimensions(i)] = StrCat(i);
   }
 
-  std::vector<string> output_dims(2 + dnums.spatial_dimensions().size());
+  std::vector<string> output_dims(2 + dnums.output_spatial_dimensions().size());
   output_dims[dnums.output_batch_dimension()] = 'b';
   output_dims[dnums.output_feature_dimension()] = 'f';
-  for (int64 i = 0; i < dnums.spatial_dimensions().size(); ++i) {
-    output_dims[dnums.spatial_dimensions(i)] = StrCat(i);
+  for (int64 i = 0; i < dnums.output_spatial_dimensions().size(); ++i) {
+    output_dims[dnums.output_spatial_dimensions(i)] = StrCat(i);
   }
 
   result += "dim_labels=";
@@ -2997,6 +3147,30 @@ string HloInstruction::ConvolutionDimensionNumbersToString() const {
   return result;
 }
 
+string HloInstruction::DotDimensionNumbersToString() const {
+  string result;
+  if (dot_dimension_numbers_ == nullptr) {
+    return result;
+  }
+  const DotDimensionNumbers& dnums = *dot_dimension_numbers_;
+  if (!dnums.lhs_batch_dimensions().empty()) {
+    result += "lhs_batch_dims=";
+    StrAppend(&result, Join(dnums.lhs_batch_dimensions(), ","));
+  }
+  result += "lhs_contracting_dims=";
+  StrAppend(&result, Join(dnums.lhs_contracting_dimensions(), ","));
+
+  result += ",";
+  if (!dnums.rhs_batch_dimensions().empty()) {
+    result += "rhs_batch_dims=";
+    StrAppend(&result, Join(dnums.rhs_batch_dimensions(), ","));
+  }
+  result += "rhs_contracting_dims=";
+  StrAppend(&result, Join(dnums.rhs_contracting_dimensions(), ","));
+
+  return result;
+}
+
 bool HloInstruction::CouldBeBitcast() const {
   switch (opcode_) {
     case HloOpcode::kTranspose:
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index f5f40ad9475568496ad8da5ad528289f9867c29f..2083c1b81d4a69ea9cdb3c15a8f78d1d3b404309 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -44,6 +44,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/gtl/iterator_range.h"
 #include "tensorflow/core/platform/logging.h"
@@ -55,6 +56,90 @@ namespace xla {
 class HloComputation;
 class HloModule;
 
+// A bunch of switches that control how the hlo text should be printed.
+class HloPrintOptions {
+ public:
+  // Constructs the default print options: don't print large constants, don't
+  // compact operands, no indentation.
+  HloPrintOptions()
+      : print_large_constants_(false),
+        print_metadata_(true),
+        compact_operands_(false),
+        print_operand_shape_(true),
+        print_program_shape_(true),
+        print_percent_(true),
+        indent_amount_(0) {}
+
+  static HloPrintOptions ShortParsable() {
+    return HloPrintOptions()
+        .set_print_large_constants(true)
+        .set_print_metadata(false)
+        .set_print_operand_shape(false)
+        .set_print_program_shape(false)
+        .set_print_percent(false);
+  }
+
+  // If true, large constants will be printed out.
+  HloPrintOptions& set_print_large_constants(bool value) {
+    print_large_constants_ = value;
+    return *this;
+  }
+
+  // If true, metatdata will be printed.
+  HloPrintOptions& set_print_metadata(bool value) {
+    print_metadata_ = value;
+    return *this;
+  }
+
+  // If true, operands' shapes will be printed.
+  HloPrintOptions& set_print_operand_shape(bool value) {
+    print_operand_shape_ = value;
+    return *this;
+  }
+
+  // If true, program shape of hlo computations will be printed.
+  HloPrintOptions& set_print_program_shape(bool value) {
+    print_program_shape_ = value;
+    return *this;
+  }
+
+  // If true, names will be printed with prefix '%'.
+  HloPrintOptions& set_print_percent(bool value) {
+    print_percent_ = value;
+    return *this;
+  }
+
+  // If true, only a part of operands will be printed out, and their names will
+  // be omitted (note that in this case the text will not be parsable).
+  HloPrintOptions& set_compact_operands(bool value) {
+    compact_operands_ = value;
+    return *this;
+  }
+
+  // The indent of the hlo text block.
+  HloPrintOptions& set_indent_amount(int value) {
+    indent_amount_ = value;
+    return *this;
+  }
+
+  bool print_large_constants() const { return print_large_constants_; }
+  bool print_metadata() const { return print_metadata_; }
+  bool compact_operands() const { return compact_operands_; }
+  bool print_operand_shape() const { return print_operand_shape_; }
+  bool print_program_shape() const { return print_program_shape_; }
+  bool print_percent() const { return print_percent_; }
+  int indent_amount() const { return indent_amount_; }
+
+ private:
+  bool print_large_constants_;
+  bool print_metadata_;
+  bool compact_operands_;
+  bool print_operand_shape_;
+  bool print_program_shape_;
+  bool print_percent_;
+  int indent_amount_;
+};
+
 // HLO instructions are the IR used by the high-level compiler.
 class HloInstruction {
  public:
@@ -83,12 +168,16 @@ class HloInstruction {
   //     must contain all operands of the newly constructed instruction.
   //   computation_map: a map from computation name to HloComputation*. This map
   //     must contain all computations which the newly constructed instruction
-  //     calls. If the instruction is a fusion instruction, then the fusion
-  //     computation is added to this map and the module.
+  //     calls.
+  //   add_fused_computation: A function to call to add a fused
+  //     computation. Used (clearly) when the instruction is a fusion
+  //     instruction.
   static StatusOr<std::unique_ptr<HloInstruction>> CreateFromProto(
       HloModule* module, const HloInstructionProto& proto,
       const tensorflow::gtl::FlatMap<string, HloInstruction*>& instruction_map,
-      tensorflow::gtl::FlatMap<string, HloComputation*>* computation_map);
+      const tensorflow::gtl::FlatMap<string, HloComputation*>& computation_map,
+      const std::function<void(std::unique_ptr<HloComputation>)>&
+          add_fused_computation);
 
   // Creates a parameter-retrieving instruction.
   static std::unique_ptr<HloInstruction> CreateParameter(int64 parameter_number,
@@ -155,6 +244,18 @@ class HloInstruction {
       const Window& window,
       const ConvolutionDimensionNumbers& dimension_numbers);
 
+  // Creates a dot op with operands 'lhs' and 'rhs' with contracting and batch
+  // dimensions specified in 'dimension_numbers'.
+  static std::unique_ptr<HloInstruction> CreateDot(
+      const Shape& shape, HloInstruction* lhs, HloInstruction* rhs,
+      const DotDimensionNumbers& dimension_numbers);
+
+  // Creates a dot op with operands 'lhs' and 'rhs' that contracts dimension 1
+  // of the LHS with dimension 0 of the RHS with no batch dimensions.  Both LHS
+  // and the RHS must be of rank 2.
+  static std::unique_ptr<HloInstruction> CreateCanonicalDot(
+      const Shape& shape, HloInstruction* lhs, HloInstruction* rhs);
+
   // Creates a reduce-precision op, where operand is the data to reduce in
   // precision, and exponent_bits and mantissa_bits describe the precision to
   // reduce it to.
@@ -164,13 +265,19 @@ class HloInstruction {
 
   // Creates a cross replica sum op.
   static std::unique_ptr<HloInstruction> CreateCrossReplicaSum(
-      const Shape& shape, HloInstruction* operand);
+      const Shape& shape,
+      tensorflow::gtl::ArraySlice<HloInstruction*> operands);
 
   // Creates a conversion instruction, where operand is the data to convert and
   // shape is the target shape for the conversion.
   static std::unique_ptr<HloInstruction> CreateConvert(const Shape& shape,
                                                        HloInstruction* operand);
 
+  // Creates a bitcast conversion instruction, where operand is the data to
+  // convert and shape is the target shape for the conversion.
+  static std::unique_ptr<HloInstruction> CreateBitcastConvert(
+      const Shape& shape, HloInstruction* operand);
+
   // Creates an infeed instruction, which reads data of the given shape from the
   // Infeed interface of the device.
   static std::unique_ptr<HloInstruction> CreateInfeed(const Shape& shape,
@@ -305,6 +412,11 @@ class HloInstruction {
                                                      HloComputation* body,
                                                      HloInstruction* init);
 
+  static std::unique_ptr<HloInstruction> CreateConditional(
+      const Shape& shape, HloInstruction* pred,
+      HloInstruction* true_computation_arg, HloComputation* true_computation,
+      HloInstruction* false_computation_arg, HloComputation* false_computation);
+
   // Creates a fusion instruction. A fusion instruction contains one or more
   // fused instructions forming an expression with a single root
   // "fused_root". Additional instructions can be added to the fusion
@@ -406,7 +518,7 @@ class HloInstruction {
   Status RemoveControlDependencyTo(HloInstruction* instruction);
 
   // Returns the set of control predecessors (successors) of this
-  // instruction. Control predecessors (sucessors) must execute before (after)
+  // instruction. Control predecessors (successors) must execute before (after)
   // the current instruction.
   const std::vector<HloInstruction*>& control_predecessors() const {
     return control_predecessors_;
@@ -525,16 +637,6 @@ class HloInstruction {
     return parameter_number_;
   }
 
-  const string& parameter_name() const {
-    CHECK_EQ(HloOpcode::kParameter, opcode_);
-    return parameter_name_;
-  }
-
-  void set_parameter_name(const string& str) {
-    CHECK_EQ(HloOpcode::kParameter, opcode_);
-    parameter_name_ = str;
-  }
-
   // Returns the dimension sizes or numbers associated with this instruction.
   //
   // Precondition: opcode() is one of: concatenate, reduce, broadcast, reshape,
@@ -608,23 +710,34 @@ class HloInstruction {
   void set_select(HloComputation* select);
   void set_scatter(HloComputation* scatter);
 
+  // Gets/sets the true and false HloComputation for Conditional. The setters
+  // should only be called by HloModule or HloComputation methods.
+  //
+  // Precondition: The instruction is a Conditional instruction.
+  HloComputation* true_computation() const;
+  HloComputation* false_computation() const;
+  void set_true_computation(HloComputation* true_computation);
+  void set_false_computation(HloComputation* false_computation);
+
   // Returns a string for the signature of this instruction if considered as a
   // function, e.g. the signature of an F32 add is (F32, F32) -> F32.
   string SignatureString() const;
 
   // Returns a debugging string that represents this instruction.
-  string ToString(bool compact_operands = false, bool include_metadata = true,
-                  bool include_large_constants = false) const;
+  //
+  // (We express the default options using an overload rather than a default
+  // param because gdb ignores default params, but does resolve overloads.)
+  string ToString() const { return ToString(HloPrintOptions()); }
+  string ToString(const HloPrintOptions& options) const;
 
   // Components of the ToString() representation:
 
   // Returns a string representation of the operand list.
-  string OperandsToString(bool compact, bool include_large_constants) const;
+  string OperandsToString(const HloPrintOptions& options) const;
 
   // Returns string representation of op-specific attributes.
-  std::vector<string> ExtraAttributesToString() const;
-
-  string ToStringNoMetadata() const { return ToString(false, false); }
+  std::vector<string> ExtraAttributesToString(
+      const HloPrintOptions& options) const;
 
   // As ToString, but returns a shorter string.
   string ToShortString() const;
@@ -652,13 +765,15 @@ class HloInstruction {
   // Returns feature_index field associated with the instruction. The index
   // represents the index of the feature dimension.
   //
-  // Precondition: opcode() == HloOpcode::kBatchNormTraining
+  // Precondition: opcode() is one of kBatchNormTraining, kBatchNormInference,
+  // or kBatchNormGrad.
   int64 feature_index() const { return feature_index_; }
 
   // Returns a epsilon value associated with the instruction. The is a small
   // number added to the variance to avoid divide-by-zero error.
   //
-  // Precondition: opcode() == HloOpcode::kBatchNormTraining
+  // Precondition: opcode() is one of kBatchNormTraining, kBatchNormInference,
+  // or kBatchNormGrad.
   float epsilon() const { return epsilon_; }
 
   // Returns the infeed configuration string. The infeed configuration includes
@@ -891,6 +1006,15 @@ class HloInstruction {
   // Returns the dump string of the convolution dimension numbers.
   string ConvolutionDimensionNumbersToString() const;
 
+  // Returns data on the dimension numbers used for a dot operation.
+  const DotDimensionNumbers& dot_dimension_numbers() const {
+    CHECK(dot_dimension_numbers_ != nullptr);
+    return *dot_dimension_numbers_;
+  }
+
+  // Returns the dump string of the dot dimension numbers.
+  string DotDimensionNumbersToString() const;
+
   // Returns the random distribution for this rng node.
   //
   // Precondition: opcode() == HloOpcode::kRng
@@ -982,10 +1106,9 @@ class HloInstruction {
   std::tuple<bool, std::vector<int64>, std::vector<int64>>
   ReshapeMerelyInsertsOrDeletes1SizedDimensions() const;
 
-  // Returns a string identifier for this instruction. If no string identifier
-  // has been explicitly set, then the identifier is the serialized pointer to
-  // this instruction.
+  // Gets/sets the string identifier for this instruction.
   const string& name() const { return name_; }
+  void set_name(tensorflow::StringPiece name) { name_ = name.ToString(); }
 
   // Use the given NameUniquer to select a unique name for the instruction based
   // on the instruction's existing name.
@@ -1149,6 +1272,9 @@ class HloInstruction {
   // Describes the dimension numbers used for a convolution.
   std::unique_ptr<ConvolutionDimensionNumbers> convolution_dimension_numbers_;
 
+  // Describes the dimension numbers used for a dot.
+  std::unique_ptr<DotDimensionNumbers> dot_dimension_numbers_;
+
   // Describes the [begin, end) index range for a slice.
   std::vector<int64> slice_starts_;
   std::vector<int64> slice_limits_;
@@ -1174,7 +1300,6 @@ class HloInstruction {
 
   // For parameter instructions this field holds the parameter number.
   int64 parameter_number_ = 0;
-  string parameter_name_;
 
   // Name of a global symbol to call, only present for kCustomCall.
   string custom_call_target_;
@@ -1192,6 +1317,10 @@ class HloInstruction {
     // kSelectAndScatter computations.
     kSelectComputationIndex = 0,
     kScatterComputationIndex = 1,
+
+    // kConditional computations.
+    kTrueComputationIndex = 0,
+    kFalseComputationIndex = 1,
   };
 
   // Outfeed configuration information, only present for kOutfeed.
@@ -1239,9 +1368,12 @@ string ToString(HloInstruction::FusionKind kind);
 StatusOr<HloInstruction::FusionKind> StringToFusionKind(
     const string& kind_name);
 
-// Custom stringification functions for protos that live inside HloInstruction.
+// Custom (de)stringification functions for protos that live inside
+// HloInstruction.
 string PaddingConfigToString(const PaddingConfig& padding);
 string OpMetadataToString(const OpMetadata& metadata);
+string RandomDistributionToString(const RandomDistribution& distribution);
+StatusOr<RandomDistribution> StringToRandomDistribution(const string& name);
 
 std::ostream& operator<<(std::ostream& os, HloInstruction::FusionKind kind);
 
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index c383dea40555f4768eba6e59c98ac0c932284847..043c751a5e7193d80c3afd6fe2ccdb3434149feb 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -1068,8 +1068,11 @@ TEST_F(HloInstructionTest, CloneOfFusionPreservesShape) {
       builder.AddInstruction(HloInstruction::CreateParameter(1, s2, "y"));
   HloInstruction* reshape =
       builder.AddInstruction(HloInstruction::CreateTranspose(s2t, y, {1, 0}));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   HloInstruction* dot = builder.AddInstruction(
-      HloInstruction::CreateBinary(sout, HloOpcode::kDot, x, reshape));
+      HloInstruction::CreateDot(sout, x, reshape, dot_dnums));
 
   HloModule module(TestName());
   auto* computation = module.AddEntryComputation(builder.Build());
@@ -1088,48 +1091,6 @@ TEST_F(HloInstructionTest, CloneOfFusionPreservesShape) {
                                root2->operand(1)->operand(0)->shape()));
 }
 
-TEST_F(HloInstructionTest, IsRandomFusable) {
-  auto shape = ShapeUtil::MakeShape(F32, {2, 2});
-  {
-    auto builder = HloComputation::Builder(TestName());
-    auto hlo_module = CreateNewModule();
-    auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
-        Literal::CreateR0<float>(0.0)));
-    auto const1 = builder.AddInstruction(HloInstruction::CreateConstant(
-        Literal::CreateR0<float>(1.0)));
-    auto rng = builder.AddInstruction(HloInstruction::CreateRng(
-        shape, RandomDistribution::RNG_NORMAL, {const0, const1}));
-
-    auto* computation = hlo_module->AddEntryComputation(builder.Build());
-    computation->CreateFusionInstruction({rng, const0, const1},
-      HloInstruction::FusionKind::kLoop);
-
-    auto* root = computation->root_instruction();
-
-    EXPECT_EQ(HloOpcode::kFusion, root->opcode());
-  }
-  {
-    auto builder = HloComputation::Builder(TestName());
-    auto hlo_module = CreateNewModule();
-    auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
-        Literal::CreateR0<float>(0.0)));
-    auto const1 = builder.AddInstruction(HloInstruction::CreateConstant(
-        Literal::CreateR0<float>(1.0)));
-    auto rng = builder.AddInstruction(HloInstruction::CreateRng(
-        shape, RandomDistribution::RNG_NORMAL, {const0, const1}));
-    builder.AddInstruction(HloInstruction::CreateUnary(
-        shape, HloOpcode::kNegate, rng));
-    auto* computation = hlo_module->AddEntryComputation(builder.Build());
-    computation->CreateFusionInstruction({rng, const0, const1},
-      HloInstruction::FusionKind::kLoop);
-
-    auto* root = computation->root_instruction();
-
-    EXPECT_EQ(HloOpcode::kFusion, root->operand(0)->opcode());
-  }
-}
-
-
 TEST_F(HloInstructionTest, CloneSuffixNames) {
   // Test that the suffix string added to cloned instructions is not
   // duplicated. Rather a numeric incrementing value should be appended. That
@@ -1138,39 +1099,38 @@ TEST_F(HloInstructionTest, CloneSuffixNames) {
   // Test cloning the same instruction multiple times.
   auto foo =
       HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "foo");
-  EXPECT_EQ(foo->Clone()->name(), "%foo.clone");
-  EXPECT_EQ(foo->Clone()->Clone()->name(), "%foo.clone2");
-  EXPECT_EQ(foo->Clone()->Clone()->Clone()->name(), "%foo.clone3");
+  EXPECT_EQ(foo->Clone()->name(), "foo.clone");
+  EXPECT_EQ(foo->Clone()->Clone()->name(), "foo.clone2");
+  EXPECT_EQ(foo->Clone()->Clone()->Clone()->name(), "foo.clone3");
 
   // Test custom suffixes.
-  EXPECT_EQ(foo->Clone("bar")->name(), "%foo.bar");
-  EXPECT_EQ(foo->Clone("bar")->Clone("bar")->name(), "%foo.bar2");
-  EXPECT_EQ(foo->Clone("bar")->Clone("bar")->Clone()->name(),
-            "%foo.bar2.clone");
+  EXPECT_EQ(foo->Clone("bar")->name(), "foo.bar");
+  EXPECT_EQ(foo->Clone("bar")->Clone("bar")->name(), "foo.bar2");
+  EXPECT_EQ(foo->Clone("bar")->Clone("bar")->Clone()->name(), "foo.bar2.clone");
 
   // Test instruction name with a dot.
   auto foo_baz = HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {}), "foo.baz");
-  EXPECT_EQ(foo_baz->Clone()->name(), "%foo.baz.clone");
+  EXPECT_EQ(foo_baz->Clone()->name(), "foo.baz.clone");
 
   // Test incrementing a large number after the suffix.
   auto foo_clone234 = HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {}), "foo.clone234");
-  EXPECT_EQ(foo_clone234->Clone()->name(), "%foo.clone235");
+  EXPECT_EQ(foo_clone234->Clone()->name(), "foo.clone235");
 
   // Test a non-numeric string after the cloning suffix.
   auto foo_clonexyz = HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {}), "foo.clonexyz");
-  EXPECT_EQ(foo_clonexyz->Clone()->name(), "%foo.clonexyz.clone");
+  EXPECT_EQ(foo_clonexyz->Clone()->name(), "foo.clonexyz.clone");
 
   // Test a name with multiple appearances of the suffix.
   auto foo_clone_clone3 = HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {}), "foo.clone.clone3");
-  EXPECT_EQ(foo_clone_clone3->Clone()->name(), "%foo.clone.clone4");
+  EXPECT_EQ(foo_clone_clone3->Clone()->name(), "foo.clone.clone4");
 }
 
 TEST_F(HloInstructionTest, Stringification) {
-  // Tests stringification of a simple op, fusion, and while.
+  // Tests stringification of a simple op, fusion, while, and conditional.
   const Shape s1 = ShapeUtil::MakeShape(F32, {5, 10});
   const Shape s2 = ShapeUtil::MakeShape(F32, {20, 10});
   const Shape s2t = ShapeUtil::MakeShape(F32, {10, 20});
@@ -1183,12 +1143,17 @@ TEST_F(HloInstructionTest, Stringification) {
       builder.AddInstruction(HloInstruction::CreateParameter(1, s2, "y"));
   HloInstruction* reshape =
       builder.AddInstruction(HloInstruction::CreateTranspose(s2t, y, {1, 0}));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   HloInstruction* dot = builder.AddInstruction(
-      HloInstruction::CreateBinary(sout, HloOpcode::kDot, x, reshape));
+      HloInstruction::CreateDot(sout, x, reshape, dot_dnums));
+
+  auto options = HloPrintOptions().set_print_metadata(false);
 
-  EXPECT_EQ(dot->ToString(false, false),
+  EXPECT_EQ(dot->ToString(options),
             "%dot = f32[5,20]{1,0} dot(f32[5,10]{1,0} %x, f32[10,20]{1,0} "
-            "%transpose)");
+            "%transpose), lhs_contracting_dims=1,rhs_contracting_dims=0");
 
   HloModule module(TestName());
   auto* computation = module.AddEntryComputation(builder.Build());
@@ -1196,15 +1161,25 @@ TEST_F(HloInstructionTest, Stringification) {
       {dot, reshape}, HloInstruction::FusionKind::kTransposeDot);
 
   EXPECT_EQ(
-      fusion->ToString(false, false),
+      fusion->ToString(options),
       "%fusion = f32[5,20]{1,0} fusion(f32[5,10]{1,0} %x, "
       "f32[20,10]{1,0} %y), kind=kTransposeDot, calls=%fused_computation");
 
   HloInstruction* loop = builder.AddInstruction(
       HloInstruction::CreateWhile(sout, computation, computation, x));
-  EXPECT_EQ(loop->ToString(false, false),
+  EXPECT_EQ(loop->ToString(options),
             "%while = f32[5,20]{1,0} while(f32[5,10]{1,0} %x), "
             "condition=%TransposeDot, body=%TransposeDot");
+
+  auto pred = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
+  HloInstruction* conditional =
+      builder.AddInstruction(HloInstruction::CreateConditional(
+          sout, pred, x, computation, x, computation));
+  EXPECT_EQ(conditional->ToString(options),
+            "%conditional = f32[5,20]{1,0} conditional(pred[] %constant, "
+            "f32[5,10]{1,0} %x, f32[5,10]{1,0} %x), "
+            "true_computation=%TransposeDot, false_computation=%TransposeDot");
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h
index 268fa0f632d838c1122f655ea6a548335727390a..992f55788b4900949f4994ba5b7be015bcd0d3de 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.h
+++ b/tensorflow/compiler/xla/service/hlo_matchers.h
@@ -87,6 +87,7 @@ HLO_MATCHER(Call);
 HLO_MATCHER(Ceil);
 HLO_MATCHER(Clamp);
 HLO_MATCHER(Concatenate);
+HLO_MATCHER(Conditional);
 HLO_MATCHER(Constant);
 HLO_MATCHER(Convert);
 HLO_MATCHER(Convolution);
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index d9c223fbbad5a3c20cba6d902ef5bc79e35304d1..6103cab3e7e73079ef9e65b4ada181aa088c4541 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -35,14 +35,15 @@ namespace xla {
 HloModule::HloModule(const string& name,
                      const VersionedComputationHandle& entry_computation_handle,
                      const HloModuleConfig& config)
-    : name_(name),
+    : name_(NameUniquer::GetSanitizedName(name)),
       config_(config),
       has_entry_computation_handle_(true),
       entry_computation_handle_(entry_computation_handle) {}
 
-HloModule::HloModule(const string& name) : name_(name) {}
+HloModule::HloModule(const string& name)
+    : name_(NameUniquer::GetSanitizedName(name)) {}
 HloModule::HloModule(const string& name, const HloModuleConfig& config)
-    : name_(name), config_(config) {}
+    : name_(NameUniquer::GetSanitizedName(name)), config_(config) {}
 
 HloComputation* HloModule::AddComputationInternal(
     std::unique_ptr<HloComputation> computation, bool is_entry,
@@ -170,17 +171,14 @@ void HloModule::ReplaceComputations(
   computations_ = std::move(new_computations);
 }
 
-string HloModule::ToString(bool include_large_constants) const {
+string HloModule::ToString(const HloPrintOptions& options) const {
   std::ostringstream s;
-  s << "HloModule " << name() << ":\n\n";
+  s << "HloModule " << name() << "\n\n";
   for (const HloComputation* computation : MakeComputationPostOrder()) {
     if (computation == entry_computation()) {
       s << "ENTRY ";
     }
-    s << computation->ToString(
-             /*nested_level=*/0,
-             /*include_large_constants=*/include_large_constants)
-      << "\n\n";
+    s << computation->ToString(options) << "\n\n";
   }
   return s.str();
 }
@@ -232,8 +230,8 @@ StatusOr<ProgramShape> ProgramShapeFromProto(const HloModuleProto& module) {
           << "Entry computation has more than one parameter instruction "
              "with parameter number "
           << instruction.parameter_number();
-      parameters[instruction.parameter_number()] = {
-          instruction.parameter_name(), &instruction.shape()};
+      parameters[instruction.parameter_number()] = {instruction.name(),
+                                                    &instruction.shape()};
     }
   }
   TF_RET_CHECK(root != nullptr)
@@ -290,9 +288,16 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
 
   tensorflow::gtl::FlatMap<string, HloComputation*> computation_map;
   for (const HloComputationProto& computation_proto : proto.computations()) {
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloComputation> computation,
-                        HloComputation::CreateFromProto(
-                            module.get(), computation_proto, &computation_map));
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<HloComputation> computation,
+        HloComputation::CreateFromProto(
+            module.get(), computation_proto, computation_map,
+            /*add_fused_computation=*/
+            [&module](std::unique_ptr<HloComputation> fused_computation) {
+              module->AddComputationInternal(std::move(fused_computation),
+                                             /*is_entry=*/false,
+                                             /*uniquify_names=*/false);
+            }));
     CHECK_NE(computation.get(), nullptr);
     TF_RET_CHECK(!ContainsKey(computation_map, computation->name()));
     string computation_name = computation->name();
diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h
index 5141e7bc8d4cf0ef4cd83310772e0c5d66b5da12..d3bb46bffca15549ef22e2908f129efd8586fa67 100644
--- a/tensorflow/compiler/xla/service/hlo_module.h
+++ b/tensorflow/compiler/xla/service/hlo_module.h
@@ -98,6 +98,10 @@ class HloModule {
     return config_.mutable_entry_computation_layout();
   }
 
+  ComputationLayout entry_computation_layout() const {
+    return config_.entry_computation_layout();
+  }
+
   const VersionedComputationHandle& entry_computation_handle() const {
     return entry_computation_handle_;
   }
@@ -143,7 +147,12 @@ class HloModule {
 
   const HloModuleConfig& config() const { return config_; }
 
-  string ToString(bool include_large_constants = false) const;
+  // Return a string representation of the module.
+  //
+  // (We express the default options using an overload rather than a default
+  // param because gdb ignores default params, but does resolve overloads.)
+  string ToString() const { return ToString(HloPrintOptions()); }
+  string ToString(const HloPrintOptions& options) const;
 
   // Convert an HloModule to or from a proto.
   HloModuleProto ToProto() const;
diff --git a/tensorflow/compiler/xla/service/hlo_module_test.cc b/tensorflow/compiler/xla/service/hlo_module_test.cc
index bf6440d66cac0d3a929c377202b212aba262f887..0f5d3dccb74e6e3c88e51685392171f940c03596 100644
--- a/tensorflow/compiler/xla/service/hlo_module_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_test.cc
@@ -135,14 +135,15 @@ TEST_F(HloModuleTest, LargeConstantToString) {
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(
-      "HloModule LargeConstantToString:\n\nENTRY %Constant () -> f32[16] {\n  "
+      "HloModule LargeConstantToString\n\nENTRY %Constant () -> f32[16] {\n  "
       "ROOT %constant = f32[16]{0} constant({...})\n}\n\n",
-      module->ToString(/*include_large_constants=*/false));
+      module->ToString(HloPrintOptions().set_print_large_constants(false)));
+
   EXPECT_EQ(
-      "HloModule LargeConstantToString:\n\nENTRY %Constant () -> f32[16] {\n  "
+      "HloModule LargeConstantToString\n\nENTRY %Constant () -> f32[16] {\n  "
       "ROOT %constant = f32[16]{0} constant({42, 42, 42, 42, 42, 42, 42, 42, "
       "42, 42, 42, 42, 42, 42, 42, 42})\n}\n\n",
-      module->ToString(/*include_large_constants=*/true));
+      module->ToString(HloPrintOptions().set_print_large_constants(true)));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index 7b07027441670ed3f72ef802770858fb8a7476fe..f3f79357582ac7661a532e94031acdbca0b86784 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -52,6 +52,7 @@ namespace xla {
   V(kBatchNormInference, "batch-norm-inference")             \
   V(kBatchNormTraining, "batch-norm-training")               \
   V(kBitcast, "bitcast")                                     \
+  V(kBitcastConvert, "bitcast-convert")                      \
   V(kBroadcast, "broadcast")                                 \
   V(kCall, "call", kHloOpcodeIsVariadic)                     \
   V(kCeil, "ceil")                                           \
diff --git a/tensorflow/compiler/xla/service/hlo_profile_printer.cc b/tensorflow/compiler/xla/service/hlo_profile_printer.cc
index 071c5a6629addad1a25116739a4d34e7ce55070a..e944ad15139af0d2f98e8e68d3d48303f47ecf1c 100644
--- a/tensorflow/compiler/xla/service/hlo_profile_printer.cc
+++ b/tensorflow/compiler/xla/service/hlo_profile_printer.cc
@@ -50,7 +50,7 @@ string HloProfilePrinter::ToString(const int64* counters,
           /*short_name=*/instruction->short_name, instruction->category,
           counters[instruction->profile_index], instruction->flop_count,
           instruction->transcendental_count, instruction->bytes_accessed,
-          instruction->seconds);
+          instruction->optimal_seconds);
     }
 
     result += builder.ToString();
diff --git a/tensorflow/compiler/xla/service/hlo_profile_printer.h b/tensorflow/compiler/xla/service/hlo_profile_printer.h
index 45921c66f68e811ef9d0ca3acc37465f5a160c94..2f056490ae027872570f7a0821ee63114f49fab8 100644
--- a/tensorflow/compiler/xla/service/hlo_profile_printer.h
+++ b/tensorflow/compiler/xla/service/hlo_profile_printer.h
@@ -41,7 +41,7 @@ class HloProfilePrinter {
     float flop_count;
     float transcendental_count;
     float bytes_accessed;
-    float seconds;
+    float optimal_seconds;
 
     // The index into the profile counters array for the HloInstruction
     // corresponding to this HloInstructionInfo.
@@ -65,9 +65,11 @@ class HloProfilePrinter {
 
   HloProfilePrinter(
       HloComputationInfo* computation_infos, int64 computation_infos_size,
+      int64 profile_counters_size,
       std::function<void(HloComputationInfo*, int64)> deleter = nullptr)
       : computation_infos_(computation_infos),
         computation_infos_size_(computation_infos_size),
+        profile_counters_size_(profile_counters_size),
         deleter_(std::move(deleter)) {}
 
   HloProfilePrinter(HloProfilePrinter&& other) {
@@ -79,10 +81,13 @@ class HloProfilePrinter {
   HloProfilePrinter(const HloProfilePrinter&) = delete;
   HloProfilePrinter& operator=(const HloProfilePrinter&) = delete;
 
-  // Convert the profile counter sequence `counters` to a human readable string
+  // Converts the profile counter sequence `counters` to a human readable string
   // representation.
   string ToString(const int64* counters, double clock_rate_ghz) const;
 
+  // Returns the size of the profile buffer expected by this printer.
+  int64 profile_counters_size() const { return profile_counters_size_; }
+
   ~HloProfilePrinter();
 
  private:
@@ -90,6 +95,7 @@ class HloProfilePrinter {
   // is manifested as the deleter_ function.
   HloComputationInfo* computation_infos_ = nullptr;
   int64 computation_infos_size_ = 0;
+  int64 profile_counters_size_ = 0;
   std::function<void(HloComputationInfo*, int64)> deleter_;
 };
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_reachability.h b/tensorflow/compiler/xla/service/hlo_reachability.h
index d7bdac9c86579f19afbba133772c2c50894853d1..553ec11f6f9a2997ab7113f9b8241e04c7fe20d5 100644
--- a/tensorflow/compiler/xla/service/hlo_reachability.h
+++ b/tensorflow/compiler/xla/service/hlo_reachability.h
@@ -30,11 +30,17 @@ namespace xla {
 
 class HloInstruction;
 
-// A class for computing and representing reachability between HloInstructions.
+// A class for representing reachability between HloInstructions.
+//
+// !!! THIS CLASS DOES NOT COMPUTE REACHABILITY !!! It has an adjacency matrix
+// and it is up to the user of the class to set the adjacency matrix such that
+// it represents reachability, i.e. such that it is transitive. That the graph
+// be transitive is thus not an invariant of this class, but it is required for
+// the name of the class and its methods to make sense.
 class HloReachabilityMap {
  public:
-  // Sets up an empty reachable matrix for the full set of instructions
-  // specified in 'instructions'.
+  // Sets up a graph with no edges and where the nodes correspond to the given
+  // instructions.
   explicit HloReachabilityMap(const std::list<HloInstruction*>& instructions);
 
   // Set the reachability set of 'instruction' to the union of the reachability
@@ -42,17 +48,33 @@ class HloReachabilityMap {
   // 'x' is not 'instruction' will return true iff IsReachable(x, input) is true
   // for some 'input' in 'inputs'. Also sets 'instruction' to be reachable from
   // itself. Returns whether the reachability set of 'instruction' changed.
+  //
+  // !!! THIS FUNCTION DOES NOT COMPUTE REACHABILITY !!! It sets the adjacency
+  // vector in the internal graph of this HloReachabilityMap for the given
+  // instruction and does not transitively update any other part of the
+  // adjacency matrix.
   bool SetReachabilityToUnion(
       tensorflow::gtl::ArraySlice<const HloInstruction*> inputs,
       const HloInstruction* instruction);
 
   // Sets entry so that IsReachable(a, b) will return true
+  //
+  // !!! THIS FUNCTION DOES NOT COMPUTE REACHABILITY !!! It sets the adjacency
+  // matrix in the internal graph of this HloReachabilityMap to have an edge
+  // from a to b and does not transitively update any other part of the
+  // adjacency matrix.
   void SetReachable(const HloInstruction* a, const HloInstruction* b);
 
   // Returns true if "b" is reachable from "a"
+  //
+  // Note that this function only correctly answers queries about reachability
+  // if the set of edges that have been provided to this class are transitive.
   bool IsReachable(const HloInstruction* a, const HloInstruction* b) const;
 
   // Returns true if "b" is reachable from "a" or "a" is reachable from "b"
+  //
+  // Note that this function only correctly answers queries about reachability
+  // if the set of edges that have been provided to this class are transitive.
   bool IsConnected(const HloInstruction* a, const HloInstruction* b) const;
 
  private:
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
index 828be8490c994e1992a99e8a9aa960a279486666..c6b4dc0368d92fd477decdfb38045f74f8696803 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
@@ -62,18 +62,11 @@ bool IsRematerializable(const HloInstruction* instruction) {
     case HloOpcode::kConstant:
     case HloOpcode::kCrossReplicaSum:
     case HloOpcode::kCustomCall:
-    case HloOpcode::kOutfeed:
-    case HloOpcode::kInfeed:
     case HloOpcode::kParameter:
-    case HloOpcode::kRecv:
-    case HloOpcode::kRecvDone:
-    case HloOpcode::kSend:
-    case HloOpcode::kSendDone:
-    case HloOpcode::kTrace:
     case HloOpcode::kWhile:
       return false;
     default:
-      return true;
+      return !instruction->HasSideEffect();
   }
 }
 
@@ -573,7 +566,9 @@ Status MemoryUsageTracker::BeginInstruction(Item* item) {
   VLOG(3) << "  memory usage = " << memory_usage_;
   VLOG(10) << ToString();
 
-  DCHECK(Check());
+  if (VLOG_IS_ON(1)) {
+    DCHECK(Check());
+  }
   return Status::OK();
 }
 
@@ -610,8 +605,9 @@ Status MemoryUsageTracker::EndInstruction() {
   VLOG(3) << "  memory usage = " << memory_usage_;
   VLOG(10) << ToString();
 
-  DCHECK(Check());
-
+  if (VLOG_IS_ON(1)) {
+    DCHECK(Check());
+  }
   return Status::OK();
 }
 
@@ -1028,7 +1024,9 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
 
       HloInstruction* best = best_item->instruction;
       VLOG(1) << "Rematerializing instruction " << best->name() << " (saving "
-              << memory_tracker.MemoryReducedIfRematerialized(best_item) << ")";
+              << HumanReadableNumBytes(
+                     memory_tracker.MemoryReducedIfRematerialized(best_item))
+              << ")";
       changed = true;
       remat_count++;
 
@@ -1108,8 +1106,8 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
         net_instructions_added++;
       }
 
-      VLOG(3) << "memory_usage after rematerialization = "
-              << memory_tracker.memory_usage();
+      VLOG(1) << "memory_usage after rematerialization = "
+              << HumanReadableNumBytes(memory_tracker.memory_usage());
     }
 
     const CallSite* callsite = call_graph_node.GetCallSite(instruction);
@@ -1215,11 +1213,12 @@ StatusOr<bool> HloRematerialization::Run(
 
   XLA_VLOG_LINES(3, "Before HloRematerialization:\n" + module->ToString());
   // Create initial sequence of HLO instructions.
-  TF_ASSIGN_OR_RETURN(*sequence,
-                      CreateMemoryMinimizingSequence(
-                          *module, [this](const LogicalBuffer& buffer) {
-                            return size_function_(buffer.shape());
-                          }));
+  TF_ASSIGN_OR_RETURN(*sequence, CreateMemoryMinimizingSequence(
+                                     *module,
+                                     [this](const LogicalBuffer& buffer) {
+                                       return size_function_(buffer.shape());
+                                     },
+                                     scheduler_algorithm_));
   // Compute peak memory usage of all computations in the module called in a
   // sequential context.
   call_graph_ = CallGraph::Build(module);
@@ -1320,9 +1319,10 @@ StatusOr<bool> HloRematerialization::Run(
 /* static */ StatusOr<bool> HloRematerialization::RematerializeAndSchedule(
     const HloRematerialization::ShapeSizeFunction& size_function,
     int64 memory_limit_bytes, HloModule* hlo_module,
+    SchedulerAlgorithm scheduler_algorithm,
     SequentialHloOrdering::HloModuleSequence* sequence,
     RematerializationSizes* sizes) {
-  HloRematerialization remat(size_function);
+  HloRematerialization remat(scheduler_algorithm, size_function);
   return remat.Run(hlo_module, sequence, memory_limit_bytes, sizes);
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.h b/tensorflow/compiler/xla/service/hlo_rematerialization.h
index 11f79a6d4158c6251c2faf63e9cac4e742440863..52553439033a3bcfa4b472f13f9cd4b1ecf5ed96 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.h
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.h
@@ -20,6 +20,7 @@
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_scheduling.h"
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 
 namespace xla {
@@ -65,12 +66,15 @@ class HloRematerialization {
   // code generation.
   static StatusOr<bool> RematerializeAndSchedule(
       const ShapeSizeFunction& size_function, int64 memory_limit_bytes,
-      HloModule* hlo_module, SequentialHloOrdering::HloModuleSequence* sequence,
+      HloModule* hlo_module, SchedulerAlgorithm scheduler_algorithm,
+      SequentialHloOrdering::HloModuleSequence* sequence,
       RematerializationSizes* sizes = nullptr);
 
  protected:
-  HloRematerialization(const ShapeSizeFunction& size_function)
-      : size_function_(size_function) {}
+  HloRematerialization(SchedulerAlgorithm scheduler_algorithm,
+                       const ShapeSizeFunction& size_function)
+      : scheduler_algorithm_(scheduler_algorithm),
+        size_function_(size_function) {}
   ~HloRematerialization() {}
 
   // Runs rematerialization on the given module. Returns whether the module was
@@ -103,6 +107,9 @@ class HloRematerialization {
   StatusOr<int64> CalledComputationsMemoryUsage(
       const HloInstruction* instruction) const;
 
+  // Selects an algorithm to use for HLO scheduling.
+  SchedulerAlgorithm scheduler_algorithm_;
+
   // Function which computes the size of the top-level buffer of a shape.
   const ShapeSizeFunction size_function_;
 
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
index d88aa4bb567c6c5f6eab54f12239bf7040339c39..216825959a560bd5baa4b49d1a3cace277e16098 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
@@ -158,11 +158,11 @@ TEST_F(HloRematerializationTest, SingleComputation) {
   SequentialHloOrdering::HloModuleSequence sequence;
   // Computation requires 16KB without rematerialization, but uses only 12KB
   // with rematerialization so pick a memory limit between these values (14KB).
-  TF_ASSERT_OK_AND_ASSIGN(
-      bool changed,
-      HloRematerialization::RematerializeAndSchedule(
-          ByteSizeOf,
-          /*memory_limit_bytes=*/14 * 1024, module.get(), &sequence));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          HloRematerialization::RematerializeAndSchedule(
+                              ByteSizeOf,
+                              /*memory_limit_bytes=*/14 * 1024, module.get(),
+                              SchedulerAlgorithm::kAuto, &sequence));
   EXPECT_TRUE(changed);
 
   // Root should not have changed.
@@ -191,11 +191,11 @@ TEST_F(HloRematerializationTest, SingleComputationNoRematerialization) {
   EXPECT_EQ(computation->instruction_count(), 7);
 
   SequentialHloOrdering::HloModuleSequence sequence;
-  TF_ASSERT_OK_AND_ASSIGN(
-      bool changed,
-      HloRematerialization::RematerializeAndSchedule(
-          ByteSizeOf,
-          /*memory_limit_bytes=*/20 * 1024, module.get(), &sequence));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          HloRematerialization::RematerializeAndSchedule(
+                              ByteSizeOf,
+                              /*memory_limit_bytes=*/20 * 1024, module.get(),
+                              SchedulerAlgorithm::kAuto, &sequence));
 
   // No instructions should have been materialized.
   EXPECT_FALSE(changed);
@@ -232,11 +232,11 @@ TEST_F(HloRematerializationTest, RematerializeAroundWhile) {
   // while so the peak memory use of the module is 18KB. Set the memory limit a
   // bit lower (17KB) to force rematerialization of the entry computation.
   SequentialHloOrdering::HloModuleSequence sequence;
-  TF_ASSERT_OK_AND_ASSIGN(
-      bool changed,
-      HloRematerialization::RematerializeAndSchedule(
-          ByteSizeOf,
-          /*memory_limit_bytes=*/17 * 1024, module.get(), &sequence));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          HloRematerialization::RematerializeAndSchedule(
+                              ByteSizeOf,
+                              /*memory_limit_bytes=*/17 * 1024, module.get(),
+                              SchedulerAlgorithm::kAuto, &sequence));
   EXPECT_TRUE(changed);
 
   // Only the entry computation should have a rematerialized instruction added.
@@ -268,11 +268,11 @@ TEST_F(HloRematerializationTest, RematerializeEntryAndWhileBody) {
   EXPECT_EQ(body_computation->instruction_count(), 7);
 
   SequentialHloOrdering::HloModuleSequence sequence;
-  TF_ASSERT_OK_AND_ASSIGN(
-      bool changed,
-      HloRematerialization::RematerializeAndSchedule(
-          ByteSizeOf,
-          /*memory_limit_bytes=*/15 * 1024, module.get(), &sequence));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          HloRematerialization::RematerializeAndSchedule(
+                              ByteSizeOf,
+                              /*memory_limit_bytes=*/15 * 1024, module.get(),
+                              SchedulerAlgorithm::kAuto, &sequence));
   EXPECT_TRUE(changed);
 
   // Both computations should have a rematerialized instruction added.
@@ -310,11 +310,11 @@ TEST_F(HloRematerializationTest, RematerializeNestedComputations) {
   // If all computations are maximally rematerialized then peak memory usage is
   // ~12K so pick something slightly larger.
   SequentialHloOrdering::HloModuleSequence sequence;
-  TF_ASSERT_OK_AND_ASSIGN(
-      bool changed,
-      HloRematerialization::RematerializeAndSchedule(
-          ByteSizeOf,
-          /*memory_limit_bytes=*/13 * 1024, module.get(), &sequence));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          HloRematerialization::RematerializeAndSchedule(
+                              ByteSizeOf,
+                              /*memory_limit_bytes=*/13 * 1024, module.get(),
+                              SchedulerAlgorithm::kAuto, &sequence));
   EXPECT_TRUE(changed);
 
   // All computations should have a rematerialized instruction added.
@@ -323,6 +323,76 @@ TEST_F(HloRematerializationTest, RematerializeNestedComputations) {
   EXPECT_EQ(inner_computation->instruction_count(), 8);
 }
 
+TEST_F(HloRematerializationTest, RngNotRematerialized) {
+  // Test that a single rng is not rematerialized:
+  //
+  // Entry computation:
+  //   F32[] %param = {...}
+  //   F32[1024] rng = rng(param)
+  //   F32[1024] tanh = tanh(rng)
+  //   F32[1024] exp = exp(rng)
+  //   F32[1024] add_0 = add(rng, tanh)              // LIVE: add_0 + rng +
+  //                                                 //       tanh + exp
+  //
+  //   F32[1024] add_1 = add(rng, add(exp, add_0))   // LIVE: add_1 + add_0 +
+  //                                                 //       rng + tanh + exp
+  //
+  //   F32[1024] add_2 = add(rng, add(tanh, add_1))  // LIVE: add_2 + add_1 +
+  //                                                 //       rng + tanh + exp
+  auto module = CreateNewModule();
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param"));
+  auto rng = builder.AddInstruction(HloInstruction::CreateRng(
+      vec1024_shape_, RandomDistribution::RNG_BERNOULLI, {param}));
+  auto tanh = builder.AddInstruction(
+      HloInstruction::CreateUnary(vec1024_shape_, HloOpcode::kTanh, rng));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(vec1024_shape_, HloOpcode::kExp, rng));
+  auto add_0 = builder.AddInstruction(
+      HloInstruction::CreateBinary(vec1024_shape_, HloOpcode::kAdd, rng, tanh));
+  auto add_1 = builder.AddInstruction(HloInstruction::CreateBinary(
+      vec1024_shape_, HloOpcode::kAdd, rng,
+      builder.AddInstruction(HloInstruction::CreateBinary(
+          vec1024_shape_, HloOpcode::kAdd, exp, add_0))));
+  builder.AddInstruction(HloInstruction::CreateBinary(
+      vec1024_shape_, HloOpcode::kAdd, rng,
+      builder.AddInstruction(HloInstruction::CreateBinary(
+          vec1024_shape_, HloOpcode::kAdd, tanh, add_1))));
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+
+  auto count_rngs = [](const HloComputation* computation) {
+    int64 rng_count = 0;
+    for (auto* instruction : computation->instructions()) {
+      if (instruction->opcode() == HloOpcode::kRng) {
+        ++rng_count;
+      }
+    }
+    return rng_count;
+  };
+  // Before rematerialization there should be a single broadcast rng in
+  // the graph.
+  ASSERT_EQ(count_rngs(entry_computation), 1);
+  const int64 original_instruction_count =
+      entry_computation->instruction_count();
+  SequentialHloOrdering::HloModuleSequence sequence;
+  // Pick a memory limit some where between 24KB (initial peak memory including
+  // parameter and output) and 20KB (peak memory possible with
+  // rematerialization).
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool changed, HloRematerialization::RematerializeAndSchedule(
+                        ByteSizeOf,
+                        /*memory_limit_bytes=*/4 * ByteSizeOf(vec1024_shape_),
+                        module.get(), SchedulerAlgorithm::kAuto, &sequence));
+  EXPECT_TRUE(changed);
+  // The rng should not have been rematerialized.
+  EXPECT_EQ(count_rngs(entry_computation), 1);
+  // There should have been rematerialization.
+  EXPECT_GT(entry_computation->instruction_count(), original_instruction_count);
+}
+
 TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) {
   // Test that a single instruction is rematerialized several times. Module:
   //
@@ -406,11 +476,11 @@ TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) {
   // Pick a memory limit some where between 24KB (initial peak memory including
   // parameter and output) and 20KB (peak memory possible with
   // rematerialization).
-  TF_ASSERT_OK_AND_ASSIGN(
-      bool changed,
-      HloRematerialization::RematerializeAndSchedule(
-          ByteSizeOf,
-          /*memory_limit_bytes=*/22 * 1024, module.get(), &sequence));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          HloRematerialization::RematerializeAndSchedule(
+                              ByteSizeOf,
+                              /*memory_limit_bytes=*/22 * 1024, module.get(),
+                              SchedulerAlgorithm::kAuto, &sequence));
   EXPECT_TRUE(changed);
 
   // The broadcast should have been rematerialized 3 times.
@@ -503,11 +573,11 @@ TEST_P(IndirectUseTest, IndirectUseNotRematerialized) {
   // Pick a memory limit some where between 24KB (initial peak memory including
   // parameter and output) and 20KB (peak memory possible with
   // rematerialization).
-  TF_ASSERT_OK_AND_ASSIGN(
-      bool changed,
-      HloRematerialization::RematerializeAndSchedule(
-          ByteSizeOf,
-          /*memory_limit_bytes=*/22 * 1024, module.get(), &sequence));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          HloRematerialization::RematerializeAndSchedule(
+                              ByteSizeOf,
+                              /*memory_limit_bytes=*/22 * 1024, module.get(),
+                              SchedulerAlgorithm::kAuto, &sequence));
   // Rematerialization should only occur if the rematerializable instruction has
   // no indirect uses.
   if (indirectly_used) {
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index 63f2b1296ed06d6477e9a24f8034bb57ceabd5cc..7b3a8cef97b5670b1ab753cee14203a58c1e5c27 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -39,6 +39,14 @@ namespace se = ::perftools::gputools;
 
 namespace xla {
 
+/*static*/ StatusOr<std::unique_ptr<HloModule>>
+HloRunner::CreateModuleFromString(const tensorflow::StringPiece hlo_string,
+                                  const DebugOptions& debug_options) {
+  HloModuleConfig config;
+  config.set_debug_options(debug_options);
+  return tools::Parse(hlo_string, config);
+}
+
 /*static*/ StatusOr<std::unique_ptr<HloModule>>
 HloRunner::ReadModuleFromHloProtoFile(const std::string& filename,
                                       const DebugOptions& debug_options) {
@@ -104,26 +112,27 @@ HloRunner::HloRunner(se::Platform* platform) {
   VLOG(1) << "Created HloRunner for platform: " << platform->Name();
 }
 
-HloRunner::~HloRunner() {
-  // Deallocate all the memory allocated during the tests.
-  for (auto& allocation : allocations_) {
-    backend().default_stream_executor()->Deallocate(&allocation);
-  }
-}
+HloRunner::~HloRunner() {}
 
-StatusOr<se::DeviceMemoryBase> HloRunner::Execute(
+StatusOr<std::unique_ptr<Literal>> HloRunner::ExecuteInternal(
     std::unique_ptr<HloModule> module,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
-    Shape* result_shape) {
+    const tensorflow::gtl::ArraySlice<Literal*> arguments,
+    bool run_hlo_passes) {
+  if (run_hlo_passes) {
+    TF_ASSIGN_OR_RETURN(
+        module, backend().compiler()->RunHloPasses(
+                    std::move(module), backend().default_stream_executor()));
+  }
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<Executable> executable,
-      backend().compiler()->Compile(std::move(module),
-                                    backend().default_stream_executor()));
+      backend().compiler()->RunBackend(std::move(module),
+                                       backend().default_stream_executor()));
 
   se::Stream stream(backend().default_stream_executor());
   stream.Init();
 
   ExecutableRunOptions run_options;
+  run_options.set_device_ordinal(backend().default_device_ordinal());
   run_options.set_stream(&stream);
   run_options.set_allocator(backend().memory_allocator());
   run_options.set_inter_op_thread_pool(backend().inter_op_thread_pool());
@@ -133,71 +142,35 @@ StatusOr<se::DeviceMemoryBase> HloRunner::Execute(
   ServiceExecutableRunOptions service_run_options(
       run_options, backend().StreamBorrower(),
       backend().inter_op_thread_pool());
-  TF_ASSIGN_OR_RETURN(
-      se::DeviceMemoryBase result,
-      executable->ExecuteOnStream(&service_run_options, arguments,
-                                  /*hlo_execution_profile=*/nullptr));
-  TF_RET_CHECK(stream.BlockHostUntilDone());
-
-  allocations_.push_back(result);
 
-  *result_shape = executable->result_shape();
-
-  if (ShapeUtil::IsTuple(*result_shape)) {
-    // We must record element buffers of tuples as well to avoid leaks.
-    DCHECK(!ShapeUtil::IsNestedTuple(*result_shape));
+  // Copy arguments to device.
+  std::vector<std::unique_ptr<ScopedShapedBuffer>> argument_buffers;
+  std::vector<ShapedBuffer*> argument_buffer_ptrs;
+  for (Literal* argument : arguments) {
     TF_ASSIGN_OR_RETURN(
-        std::vector<se::DeviceMemoryBase> element_buffers,
-        backend().transfer_manager()->ShallowCopyTupleFromDevice(
-            backend().default_stream_executor(), result, *result_shape));
-
-    // A tuple may contain the same buffer in more than one element. Keep track
-    // of the buffers already added to avoid duplicates in allocations_.
-    std::set<void*> added_opaques;
-    for (auto element_buffer : element_buffers) {
-      if (added_opaques.count(element_buffer.opaque()) == 0) {
-        CHECK(element_buffer.opaque() != nullptr);
-        added_opaques.insert(element_buffer.opaque());
-        allocations_.push_back(element_buffer);
-      }
-    }
+        std::unique_ptr<ScopedShapedBuffer> argument_buffer,
+        backend().transfer_manager()->AllocateScopedShapedBuffer(
+            argument->shape(), run_options.allocator(),
+            run_options.device_ordinal()));
+    TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice(
+        stream.parent(), *argument, *argument_buffer));
+    argument_buffers.push_back(std::move(argument_buffer));
+    argument_buffer_ptrs.push_back(argument_buffers.back().get());
   }
 
-  return result;
-}
-
-StatusOr<se::DeviceMemoryBase> HloRunner::TransferToDevice(
-    const Literal& literal) {
-  // Allocate memory on the device using the stream executor.
-  int64 allocation_size =
-      backend().transfer_manager()->GetByteSizeRequirement(literal.shape());
-  se::DeviceMemoryBase allocation =
-      backend().default_stream_executor()->AllocateArray<uint8>(
-          allocation_size);
-  allocations_.push_back(allocation);
-
-  TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice(
-      backend().default_stream_executor(), literal, &allocation));
-
-  return allocation;
-}
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<ShapedBuffer> result,
+      executable->ExecuteOnStream(&service_run_options, argument_buffer_ptrs,
+                                  /*hlo_execution_profile=*/nullptr));
 
-StatusOr<std::unique_ptr<Literal>> HloRunner::TransferFromDevice(
-    const Shape& shape, se::DeviceMemoryBase device_base) {
-  auto literal = MakeUnique<Literal>();
-  TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralFromDevice(
-      backend().default_stream_executor(), device_base, shape, shape,
-      literal.get()));
-  return std::move(literal);
-}
+  // Create a ScopedShapedBuffer of the result to manage deallocation. This will
+  // deallocate all the device memory when it goes out of scope.
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<ScopedShapedBuffer> scoped_result,
+      ScopedShapedBuffer::MakeScoped(result.get(), run_options.allocator()));
 
-StatusOr<std::unique_ptr<Literal>> HloRunner::ExecuteAndTransfer(
-    std::unique_ptr<HloModule> module,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments) {
-  Shape result_shape;
-  TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase device_base,
-                      Execute(std::move(module), arguments, &result_shape));
-  return TransferFromDevice(result_shape, device_base);
+  return backend().transfer_manager()->TransferLiteralFromDevice(
+      stream.parent(), *scoped_result);
 }
 
 Backend& HloRunner::backend() {
diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h
index a5732848c6b4191faf8d7b07c749132ca8b14413..d4b221fb52dff64dda264a931df6fd19b86e5260 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.h
+++ b/tensorflow/compiler/xla/service/hlo_runner.h
@@ -35,7 +35,8 @@ namespace xla {
 
 // A base class for running an HloModule. This executes the given HloModule on a
 // certain backend directly without using the client interface. HloModule can be
-// explicitly built, or loaded from a serialization file (e.g., hlo proto file).
+// explicitly built, or loaded from a serialization file (e.g., hlo proto
+// file), or parsed from a hlo textual IR string.
 class HloRunner {
  public:
   HloRunner();
@@ -44,6 +45,12 @@ class HloRunner {
 
   ~HloRunner();
 
+  // Converts an HloModule from the given hlo textual IR string (in
+  // HloModule::ToString format).
+  static StatusOr<std::unique_ptr<HloModule>> CreateModuleFromString(
+      const tensorflow::StringPiece hlo_string,
+      const DebugOptions& debug_options);
+
   // Reads the proto file in xla.HloProto format, creates and returns the
   // HloModule. Will try to parse the filename as binary proto, then try as
   // text proto if that fails.
@@ -65,32 +72,14 @@ class HloRunner {
   // Executes the given module with given literals as input and returns the
   // result as a Literal. The LiteralPtr type accepts Literal* or
   // std::unique_ptr<Literal>.
+  //
+  // If run_hlo_passes is false, the module will be executed without Hlo
+  // optimization.
   template <typename LiteralPtr>
   StatusOr<std::unique_ptr<Literal>> Execute(
       std::unique_ptr<HloModule> module,
-      const tensorflow::gtl::ArraySlice<LiteralPtr> literals);
-
-  // Executes the given module and returns a global data handle.
-  StatusOr<perftools::gputools::DeviceMemoryBase> Execute(
-      std::unique_ptr<HloModule> module,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
-      Shape* result_shape);
-
-  // Transfers the given literal to the device and returns the data handle.
-  StatusOr<perftools::gputools::DeviceMemoryBase> TransferToDevice(
-      const Literal& literal);
-
-  // Transfers the array referred to by the given handle from the device and
-  // returns as a Literal.
-  StatusOr<std::unique_ptr<Literal>> TransferFromDevice(
-      const Shape& shape, perftools::gputools::DeviceMemoryBase device_base);
-
-  // Executes the given module and return the result as a Literal.
-  StatusOr<std::unique_ptr<Literal>> ExecuteAndTransfer(
-      std::unique_ptr<HloModule> module,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments);
+      const tensorflow::gtl::ArraySlice<LiteralPtr> arguments,
+      bool run_hlo_passes = true);
 
   // If backend is not created in the constructor, creates and returns the
   // default backend. If creation fails, crashes the program.
@@ -100,9 +89,12 @@ class HloRunner {
   Backend& backend();
 
  private:
-  struct EigenThreadPoolWrapper;
+  StatusOr<std::unique_ptr<Literal>> ExecuteInternal(
+      std::unique_ptr<HloModule> module,
+      const tensorflow::gtl::ArraySlice<Literal*> arguments,
+      bool run_hlo_passes = true);
 
-  std::vector<perftools::gputools::DeviceMemoryBase> allocations_;
+  struct EigenThreadPoolWrapper;
 
   std::unique_ptr<EigenThreadPoolWrapper> thread_pool_wrapper_;
 
@@ -112,14 +104,14 @@ class HloRunner {
 template <typename LiteralPtr>
 StatusOr<std::unique_ptr<Literal>> HloRunner::Execute(
     std::unique_ptr<HloModule> module,
-    const tensorflow::gtl::ArraySlice<LiteralPtr> literals) {
-  std::vector<perftools::gputools::DeviceMemoryBase> arguments;
-  for (const auto& literal : literals) {
-    TF_ASSIGN_OR_RETURN(perftools::gputools::DeviceMemoryBase argument,
-                        TransferToDevice(*literal));
-    arguments.push_back(argument);
+    const tensorflow::gtl::ArraySlice<LiteralPtr> arguments,
+    bool run_hlo_passes) {
+  // Construct a vector of plain pointers for the arguments.
+  std::vector<Literal*> argument_pointers;
+  for (const auto& argument : arguments) {
+    argument_pointers.push_back(&*argument);
   }
-  return ExecuteAndTransfer(std::move(module), arguments);
+  return ExecuteInternal(std::move(module), argument_pointers, run_hlo_passes);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_scheduling.cc b/tensorflow/compiler/xla/service/hlo_scheduling.cc
index 8ccbcaeee4a9c9e94b344231953e20ac8f4b2053..2594c29efd717b3bead34d326c28c7efdf093c50 100644
--- a/tensorflow/compiler/xla/service/hlo_scheduling.cc
+++ b/tensorflow/compiler/xla/service/hlo_scheduling.cc
@@ -31,6 +31,8 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 
+using ::tensorflow::strings::HumanReadableNumBytes;
+
 namespace xla {
 
 StatusOr<int64> MinimumMemoryForSequence(
@@ -367,7 +369,17 @@ StatusOr<int64> MinimumMemoryForComputation(
 StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
     const HloComputation& computation,
     const TuplePointsToAnalysis& points_to_analysis,
-    const LogicalBuffer::SizeFunction& size_function) {
+    const LogicalBuffer::SizeFunction& size_function,
+    SchedulerAlgorithm algorithm) {
+  VLOG(2) << "Computation: " << computation.name();
+  if (algorithm == SchedulerAlgorithm::kListSchedule) {
+    return ListScheduler::Run(computation, points_to_analysis, size_function);
+  }
+  if (algorithm == SchedulerAlgorithm::kDfsSchedule) {
+    return RunDFSMemoryScheduler(computation, points_to_analysis,
+                                 size_function);
+  }
+
   // We try both a list-scheduler based ordering and a DFS based ordering, and
   // choose whichever returns a lower min-memory, not accounting for
   // fragmentation.
@@ -382,7 +394,7 @@ StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
       const int64 list_memory,
       MinimumMemoryForComputation(computation, list_sequence,
                                   points_to_analysis, size_function));
-  VLOG(2) << "Min-memory list sequence: " << list_memory << " bytes";
+  VLOG(2) << "Min-memory list sequence: " << HumanReadableNumBytes(list_memory);
 
   TF_ASSIGN_OR_RETURN(
       std::vector<const HloInstruction*> dfs_sequence,
@@ -391,13 +403,15 @@ StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
       const int64 dfs_memory,
       MinimumMemoryForComputation(computation, dfs_sequence, points_to_analysis,
                                   size_function));
-  VLOG(2) << "Min-memory dfs sequence: " << dfs_memory << " bytes";
+  VLOG(2) << "Min-memory dfs sequence: " << HumanReadableNumBytes(dfs_memory);
 
   if (list_memory <= dfs_memory) {
-    VLOG(2) << "Chose min-memory list sequence: " << list_memory << " bytes";
+    VLOG(2) << "Chose min-memory list sequence: "
+            << HumanReadableNumBytes(list_memory);
     return list_sequence;
   } else {
-    VLOG(2) << "Chose min-memory dfs sequence: " << dfs_memory << " bytes";
+    VLOG(2) << "Chose min-memory dfs sequence: "
+            << HumanReadableNumBytes(dfs_memory);
     return dfs_sequence;
   }
 }
@@ -405,27 +419,30 @@ StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
 }  // namespace
 
 StatusOr<SequentialHloOrdering::HloModuleSequence>
-CreateMemoryMinimizingSequence(
-    const HloModule& module, const LogicalBuffer::SizeFunction& size_function) {
+CreateMemoryMinimizingSequence(const HloModule& module,
+                               const LogicalBuffer::SizeFunction& size_function,
+                               SchedulerAlgorithm algorithm) {
   SequentialHloOrdering::HloModuleSequence sequence;
   TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
                       TuplePointsToAnalysis::Run(&module));
   for (const auto* computation : module.MakeNonfusionComputations()) {
-    TF_ASSIGN_OR_RETURN(sequence[computation],
-                        CreateMemoryMinimizingSequence(
-                            *computation, *points_to_analysis, size_function));
+    TF_ASSIGN_OR_RETURN(
+        sequence[computation],
+        CreateMemoryMinimizingSequence(*computation, *points_to_analysis,
+                                       size_function, algorithm));
   }
   return sequence;
 }
 
 StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
     const HloComputation& computation,
-    const LogicalBuffer::SizeFunction& size_function) {
+    const LogicalBuffer::SizeFunction& size_function,
+    SchedulerAlgorithm algorithm) {
   CHECK(!computation.IsFusionComputation());
   TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
                       TuplePointsToAnalysis::Run(computation.parent()));
   return CreateMemoryMinimizingSequence(computation, *points_to_analysis,
-                                        size_function);
+                                        size_function, algorithm);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_scheduling.h b/tensorflow/compiler/xla/service/hlo_scheduling.h
index ec92a56b962152b15981f868369683144aa7c76a..1d1eb1e064f75c2220b39e84b010e720a0c37880 100644
--- a/tensorflow/compiler/xla/service/hlo_scheduling.h
+++ b/tensorflow/compiler/xla/service/hlo_scheduling.h
@@ -33,17 +33,28 @@ StatusOr<int64> MinimumMemoryForSequence(
     const SequentialHloOrdering::HloModuleSequence& module_sequence,
     const LogicalBuffer::SizeFunction& size_function);
 
+enum class SchedulerAlgorithm {
+  kListSchedule,
+  kDfsSchedule,
+
+  // Selects the available scheduler algorithm that had the minimum memory in
+  // the resulting sequence (a la MinimumMemoryForSequence).
+  kAuto,
+};
+
 // Returns an HloModuleSequence which seeks to minimize the memory required for
 // the computation. size_function is the function returning the number of bytes
 // required for a LogicalBuffer.
 StatusOr<SequentialHloOrdering::HloModuleSequence>
 CreateMemoryMinimizingSequence(
-    const HloModule& module, const LogicalBuffer::SizeFunction& size_function);
+    const HloModule& module, const LogicalBuffer::SizeFunction& size_function,
+    SchedulerAlgorithm algorithm = SchedulerAlgorithm::kAuto);
 
 // Overload of above that computes the sequence for a single computation.
 StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
     const HloComputation& computation,
-    const LogicalBuffer::SizeFunction& size_function);
+    const LogicalBuffer::SizeFunction& size_function,
+    SchedulerAlgorithm algorithm = SchedulerAlgorithm::kAuto);
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc
index 735666345421657f7f3d714826a428784e6072e7..447c2446668253c932b44b51b2db22bfd47f9957 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding.cc
@@ -160,7 +160,59 @@ bool HloSharding::HasUniqueDevice() const {
   }
 }
 
+Status HloSharding::ValidateTuple(const Shape& shape, int64 num_devices) const {
+  if (!ShapeUtil::IsTuple(shape)) {
+    return tensorflow::errors::InvalidArgument(
+        StrCat("Sharding is tuple-shaped but validation shape is not."));
+  }
+  // The easiest way to get the number of elements in a nested tuple is just to
+  // create a shape tree. We could call GetAsShapeTree, but that will try and
+  // apply our tuple_shardings_ to the shape tree, and that might cause a crash
+  // at this point as we haven't validated them.
+  ShapeTree<bool> bool_shape_tree(shape, false);
+  int64 num_leaves =
+      std::distance(bool_shape_tree.leaf_begin(), bool_shape_tree.leaf_end());
+  if (num_leaves != tuple_elements_.size()) {
+    return tensorflow::errors::InvalidArgument(
+        StrCat("Validation tuple shape has ", num_leaves,
+               " leaf elements, but this sharding contains ",
+               tuple_elements_.size(), " elements."));
+  }
+
+  // Now we've validated the number of tuple elements, it's safe to request a
+  // shape tree.
+  ShapeTree<HloSharding> shape_tree = GetAsShapeTree(shape);
+  for (const auto& index_to_sharding : shape_tree.leaves()) {
+    Status status = index_to_sharding.second.ValidateNonTuple(
+        ShapeUtil::GetSubshape(shape, index_to_sharding.first), num_devices);
+    if (!status.ok()) {
+      tensorflow::errors::AppendToMessage(
+          &status, StrCat("Note: While validating sharding tuple element ",
+                          index_to_sharding.first.ToString(), " which is ",
+                          index_to_sharding.second.ToString()));
+      return status;
+    }
+  }
+  return Status::OK();
+}
+
 Status HloSharding::Validate(const Shape& shape, int64 num_devices) const {
+  Status status = IsTuple() ? ValidateTuple(shape, num_devices)
+                            : ValidateNonTuple(shape, num_devices);
+  if (!status.ok()) {
+    tensorflow::errors::AppendToMessage(
+        &status, StrCat("Note: While validating sharding ", ToString(),
+                        " against shape ", ShapeUtil::HumanString(shape)));
+  }
+  return status;
+}
+
+Status HloSharding::ValidateNonTuple(const Shape& shape,
+                                     int64 num_devices) const {
+  if (ShapeUtil::IsTuple(shape)) {
+    return tensorflow::errors::InvalidArgument(
+        StrCat("Validation shape is a tuple but sharding is not."));
+  }
   if (replicated_) {
     return Status::OK();
   }
@@ -174,13 +226,11 @@ Status HloSharding::Validate(const Shape& shape, int64 num_devices) const {
         // Don't overwrite a bad status, so we report the first error.
         if (status.ok()) {
           if (core >= num_devices) {
-            status =
-                tensorflow::errors::InvalidArgument(tensorflow::strings::StrCat(
-                    "core ", core, " > ", num_devices, " in tile assignment"));
+            status = tensorflow::errors::InvalidArgument(StrCat(
+                "core ", core, " > ", num_devices, " in tile assignment"));
           } else if (seen_cores.count(core) != 0) {
-            status =
-                tensorflow::errors::InvalidArgument(tensorflow::strings::StrCat(
-                    "core ", core, " is not unique in tile assignment"));
+            status = tensorflow::errors::InvalidArgument(
+                StrCat("core ", core, " is not unique in tile assignment"));
           }
         }
         seen_cores.insert(core);
@@ -196,7 +246,8 @@ Status HloSharding::Validate(const Shape& shape, int64 num_devices) const {
   // The tile rank must be the same as the input rank.
   if (ShapeUtil::Rank(shape) != ShapeUtil::Rank(tile_shape_)) {
     return tensorflow::errors::InvalidArgument(
-        "Tile rank is different to the input rank");
+        "Tile rank is different to the input rank. sharding=", ToString(),
+        ", input_shape=", ShapeUtil::HumanString(shape));
   }
 
   // The tile shape must not be the same as the input shape without maximal_
@@ -214,9 +265,9 @@ Status HloSharding::Validate(const Shape& shape, int64 num_devices) const {
     auto tile_dim = tile_shape_.dimensions(i);
     auto shape_dim = shape.dimensions(i);
     if (tile_dim > shape_dim) {
-      return tensorflow::errors::InvalidArgument(tensorflow::strings::StrCat(
-          "Tile is larger than input shape (dimension ", i, ", ", tile_dim,
-          " > ", shape_dim));
+      return tensorflow::errors::InvalidArgument(
+          StrCat("Tile is larger than input shape (dimension ", i, ", ",
+                 tile_dim, " > ", shape_dim));
     }
   }
 
@@ -226,10 +277,10 @@ Status HloSharding::Validate(const Shape& shape, int64 num_devices) const {
     int64 expected_dim =
         CeilOfRatio(shape.dimensions(i), tile_shape_.dimensions(i));
     if (tile_assignment_.dimensions()[i] != expected_dim) {
-      return tensorflow::errors::InvalidArgument(tensorflow::strings::StrCat(
-          "Tile assignment tensor has incorrect shape. Dimension ", i,
-          " expected ", expected_dim, " but got ",
-          tile_assignment_.dimensions()[i]));
+      return tensorflow::errors::InvalidArgument(
+          StrCat("Tile assignment tensor has incorrect shape. Dimension ", i,
+                 " expected ", expected_dim, " but got ",
+                 tile_assignment_.dimensions()[i]));
     }
   }
 
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.h b/tensorflow/compiler/xla/service/hlo_sharding.h
index dbd16b7c9d4c942a62b4c7ca73b488f10cb83f73..7263198385cf0c84b1dac1e15177dcac99adaafb 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.h
+++ b/tensorflow/compiler/xla/service/hlo_sharding.h
@@ -80,6 +80,17 @@ class HloSharding {
     return HloSharding(flattened_list);
   }
 
+  // Creates a new sharding for a tuple type. The requested tuple shape must not
+  // be nested. For nested tuples, use the ShapeTree overload.
+  static HloSharding Tuple(const Shape& tuple_shape,
+                           tensorflow::gtl::ArraySlice<HloSharding> shardings) {
+    CHECK(ShapeUtil::IsTuple(tuple_shape));
+    CHECK(!ShapeUtil::IsNestedTuple(tuple_shape));
+    std::vector<HloSharding> flattened_list(shardings.begin(), shardings.end());
+    CHECK_EQ(flattened_list.size(), ShapeUtil::TupleElementCount(tuple_shape));
+    return HloSharding(flattened_list);
+  }
+
   // Create a new sharding from a protobuf OpSharding.
   static StatusOr<HloSharding> FromProto(const OpSharding& proto);
 
@@ -222,6 +233,11 @@ class HloSharding {
         tile_assignment_({0}),
         tuple_elements_(tuple_shardings) {}
 
+  // Internal helper to validate a tuple sharding.
+  Status ValidateTuple(const Shape& shape, int64 num_devices) const;
+  // Internal helper to validate a non-tuple (leaf) sharding.
+  Status ValidateNonTuple(const Shape& shape, int64 num_devices) const;
+
   bool replicated_;
   bool maximal_;
   bool tuple_;
diff --git a/tensorflow/compiler/xla/service/hlo_sharding_test.cc b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
index 3161dda271d86cc3eaa24e94d30be28887a604bd..0c7487b3ac77ff181d44dd55ebcf2608feaf02ea 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
@@ -145,11 +145,13 @@ TEST_F(HloShardingTest, NestedTuple) {
       ShapeUtil::MakeShape(F32, {4, 6}),
   });
 
+  HloSharding tiled_sharding = HloSharding::Tile(
+      ShapeUtil::MakeShape(F32, {4, 3}), Array<int64>({{0, 1}}));
   OpSharding proto;
   proto.set_type(OpSharding::Type::OpSharding_Type_TUPLE);
   *proto.add_tuple_shardings() = HloSharding::Replicate().ToProto();
   *proto.add_tuple_shardings() = HloSharding::AssignDevice(0).ToProto();
-  *proto.add_tuple_shardings() = HloSharding::AssignDevice(1).ToProto();
+  *proto.add_tuple_shardings() = tiled_sharding.ToProto();
   HloSharding tuple_sharding =
       HloSharding::FromProto(proto).ConsumeValueOrDie();
 
@@ -157,7 +159,15 @@ TEST_F(HloShardingTest, NestedTuple) {
       tuple_sharding.GetAsShapeTree(nested_tuple_shape);
   EXPECT_EQ(shape_tree.element({0}), HloSharding::Replicate());
   EXPECT_EQ(shape_tree.element({1, 0}), HloSharding::AssignDevice(0));
-  EXPECT_EQ(shape_tree.element({2}), HloSharding::AssignDevice(1));
+  EXPECT_EQ(shape_tree.element({2}), tiled_sharding);
+
+  EXPECT_IS_OK(tuple_sharding.Validate(nested_tuple_shape, /*num_devices=*/5));
+  // Test should fail because tuple element count does not match.
+  EXPECT_IS_NOT_OK(tuple_sharding.Validate(ShapeUtil::MakeTupleShape({}),
+                                           /*num_devices=*/5));
+  // Test should fail because the input type is not a tuple.
+  EXPECT_IS_NOT_OK(tuple_sharding.Validate(ShapeUtil::MakeShape(F32, {}),
+                                           /*num_devices=*/5));
 }
 
 TEST_F(HloShardingTest, Hash) {
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
index 101a710d1cad9401134fdfe1d0ec9df241bc01e1..3dc733940fc89952bd5e75a9b28d9cbf356f8000 100644
--- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
+++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
@@ -166,7 +166,7 @@ void HloTfGraphBuilder::SetNodeAttrs(const HloInstruction* instruction,
       layout_string = ShapeUtil::HumanStringWithLayout(instruction->shape());
     } else {
       layout_string = StrCat(
-          "{", Join(instruction->shape().layout().minor_to_major(), ","), "}");
+          "{", Join(LayoutUtil::MinorToMajor(instruction->shape()), ","), "}");
     }
     attrs["layout"].set_s(layout_string);
   }
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index c938450891ac170b1a9bea5eea0c7af19f8a180d..d963a8a2f4fac563f7e8d4e9d4dc3d6e761d40de 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -59,21 +59,27 @@ class ShapeVerifier : public DfsHloVisitor {
   }
 
   Status HandleConvert(HloInstruction* convert) override {
-    if (ShapeUtil::ElementIsComplex(convert->operand(0)->shape())) {
-      TF_RET_CHECK(ShapeUtil::ElementIsComplex(convert->shape()))
-          << "Unsupported complex->real kConvert";
-    }
     return CheckShape(convert, ShapeInference::InferConvertShape(
                                    convert->operand(0)->shape(),
                                    convert->shape().element_type()));
   }
 
+  Status HandleBitcastConvert(HloInstruction* convert) override {
+    return CheckShape(convert, ShapeInference::InferBitcastConvertShape(
+                                   convert->operand(0)->shape(),
+                                   convert->shape().element_type()));
+  }
+
   Status HandleCopy(HloInstruction* copy) override {
     return CheckUnaryShape(copy);
   }
 
   Status HandleDot(HloInstruction* dot) override {
-    return CheckBinaryShape(dot);
+    TF_ASSIGN_OR_RETURN(const Shape expected,
+                        ShapeInference::InferDotOpShape(
+                            dot->operand(0)->shape(), dot->operand(1)->shape(),
+                            dot->dot_dimension_numbers()));
+    return CheckShape(dot, expected);
   }
 
   Status HandleConvolution(HloInstruction* convolution) override {
@@ -87,8 +93,12 @@ class ShapeVerifier : public DfsHloVisitor {
   }
 
   Status HandleCrossReplicaSum(HloInstruction* crs) override {
-    return CheckShape(crs, ShapeInference::InferCrossReplicaSumShape(
-                               crs->operand(0)->shape()));
+    std::vector<const Shape*> operand_shapes;
+    for (const HloInstruction* operand : crs->operands()) {
+      operand_shapes.push_back(&operand->shape());
+    }
+    return CheckShape(
+        crs, ShapeInference::InferCrossReplicaSumShape(operand_shapes));
   }
 
   Status HandleReducePrecision(HloInstruction* reduce_precision) override {
@@ -141,9 +151,6 @@ class ShapeVerifier : public DfsHloVisitor {
   }
 
   Status HandleBitcast(HloInstruction* bitcast) override {
-    // Bitcasts can be any shape, as long as the size matches the operand size.
-    TF_RET_CHECK(shape_size_fn_(bitcast->shape()) ==
-                 shape_size_fn_(bitcast->operand(0)->shape()));
     return tensorflow::Status::OK();
   }
 
@@ -263,6 +270,15 @@ class ShapeVerifier : public DfsHloVisitor {
                       xla_while->while_body()->ComputeProgramShape().result());
   }
 
+  Status HandleConditional(HloInstruction* conditional) override {
+    TF_RETURN_IF_ERROR(CheckShape(
+        conditional,
+        conditional->true_computation()->ComputeProgramShape().result()));
+    return CheckShape(
+        conditional,
+        conditional->false_computation()->ComputeProgramShape().result());
+  }
+
   Status HandlePad(HloInstruction* pad) override {
     return CheckShape(pad,
                       ShapeInference::InferPadShape(pad->operand(0)->shape(),
@@ -272,7 +288,7 @@ class ShapeVerifier : public DfsHloVisitor {
 
   Status HandleSend(HloInstruction* send) override {
     TF_RET_CHECK(send->users().size() == 1);
-    const HloInstruction* send_done = send->users()[0];
+    const HloInstruction* send_done = send->users().front();
     TF_RET_CHECK(send_done->opcode() == HloOpcode::kSendDone);
     TF_RETURN_IF_ERROR(CheckSameChannel(send, send_done));
     return CheckShape(
@@ -290,7 +306,7 @@ class ShapeVerifier : public DfsHloVisitor {
 
   Status HandleRecv(HloInstruction* recv) override {
     TF_RET_CHECK(recv->users().size() == 1);
-    const HloInstruction* recv_done = recv->users()[0];
+    const HloInstruction* recv_done = recv->users().front();
     TF_RET_CHECK(recv_done->opcode() == HloOpcode::kRecvDone);
     TF_RETURN_IF_ERROR(CheckSameChannel(recv, recv_done));
     return CheckShape(recv,
@@ -418,6 +434,63 @@ string ComputationsToString(
       });
 }
 
+// Verifies various invariants about the structure of the HLO:
+//
+// (1) each instruction has a non-null parent() set to the HloComputation which
+//     contains it.
+//
+// (2) each computation has a non-null parent() set to the HloModule which
+//     contains it.
+//
+// (3) the operands of each instruction are in the same computation as the
+//     instruction.
+Status VerifyHloStructure(HloModule* module) {
+  for (const HloComputation* computation : module->computations()) {
+    if (computation->parent() == nullptr) {
+      return FailedPrecondition("Computation %s has a null parent pointer",
+                                computation->name().c_str());
+    }
+    if (computation->parent() != module) {
+      return FailedPrecondition(
+          "Computation %s parent() does not point to parent module",
+          computation->name().c_str());
+    }
+
+    for (const HloInstruction* instruction : computation->instructions()) {
+      if (instruction->parent() == nullptr) {
+        return FailedPrecondition("Instruction %s has a null parent pointer",
+                                  instruction->name().c_str());
+      }
+      if (instruction->parent() != computation) {
+        return FailedPrecondition(
+            "Instruction %s parent() does not point to parent computation",
+            instruction->name().c_str());
+      }
+    }
+  }
+
+  // Check that operands are in the same computation separately from verifying
+  // parent() correctness so conditions like a null HloInstruction::parent() are
+  // identified and reported explicitly above rather than reporting a mismatched
+  // operand.
+  for (const HloComputation* computation : module->computations()) {
+    for (const HloInstruction* instruction : computation->instructions()) {
+      for (int i = 0; i < instruction->operand_count(); ++i) {
+        const HloInstruction* operand = instruction->operand(i);
+        if (operand->parent() != instruction->parent()) {
+          return FailedPrecondition(
+              "Operand %d (%s) of instruction %s is in a different "
+              "computation: %s vs %s",
+              i, operand->name().c_str(), instruction->name().c_str(),
+              operand->parent()->name().c_str(),
+              instruction->parent()->name().c_str());
+        }
+      }
+    }
+  }
+  return tensorflow::Status::OK();
+}
+
 }  // namespace
 
 Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const {
@@ -538,6 +611,8 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const {
 }
 
 StatusOr<bool> HloVerifier::Run(HloModule* module) {
+  TF_RETURN_IF_ERROR(VerifyHloStructure(module));
+
   tensorflow::gtl::FlatMap<string, const HloInstruction*> instructions;
   ShapeVerifier shape_verifier(shape_size_fn_);
 
@@ -571,7 +646,7 @@ StatusOr<bool> HloVerifier::Run(HloModule* module) {
         // or ComputationLowerer::Visit()
         TF_RET_CHECK(instruction->dimensions().size() ==
                      ShapeUtil::Rank(instruction->operand(0)->shape()))
-                << "Broadcast HLO has invalid number of dimensions.";
+            << "Broadcast HLO has invalid number of dimensions.";
       } else if (instruction->opcode() == HloOpcode::kWhile) {
         auto* while_cond = instruction->while_condition();
         auto* while_body = instruction->while_body();
diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2a3b55decc5289e7e576d3c5897b333c0b1bc922
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
@@ -0,0 +1,101 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_verifier.h"
+
+#include <memory>
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+using ::testing::HasSubstr;
+
+using HloVerifierTest = HloTestBase;
+
+TEST_F(HloVerifierTest, NullInstructionParent) {
+  HloComputation::Builder builder(TestName());
+  const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+  HloInstruction* param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "param"));
+  HloInstruction* negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(scalar_shape, HloOpcode::kNegate, param));
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
+
+  TF_ASSERT_OK(verifier().Run(module.get()).status());
+
+  negate->set_parent(nullptr);
+
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(), HasSubstr("has a null parent pointer"));
+}
+
+TEST_F(HloVerifierTest, NullComputationParent) {
+  HloComputation::Builder builder(TestName());
+  const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+  HloInstruction* param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "param"));
+  builder.AddInstruction(
+      HloInstruction::CreateUnary(scalar_shape, HloOpcode::kNegate, param));
+  auto module = CreateNewModule();
+  HloComputation* computation = module->AddEntryComputation(builder.Build());
+
+  TF_ASSERT_OK(verifier().Run(module.get()).status());
+
+  computation->set_parent(nullptr);
+
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(), HasSubstr("has a null parent pointer"));
+}
+
+TEST_F(HloVerifierTest, DifferentOperandParents) {
+  HloComputation::Builder builder(TestName());
+  const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+  HloInstruction* param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "param"));
+  HloInstruction* negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(scalar_shape, HloOpcode::kNegate, param));
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
+
+  HloComputation::Builder emb_builder(TestName());
+  HloInstruction* emb_param = emb_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "param"));
+  module->AddEmbeddedComputation(emb_builder.Build());
+
+  TF_ASSERT_OK(verifier().Run(module.get()).status());
+  TF_ASSERT_OK(negate->ReplaceOperandWith(0, emb_param));
+
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              HasSubstr("is in a different computation"));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index de4804996f84ef68ca80cef0178ad786ddaa3a39..ba901b99e4f3c72c84c1ecdf4e19e58ad9ab6506 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -33,7 +33,9 @@ namespace xla {
   switch (instruction.opcode()) {
     // Cheap instructions.
     case HloOpcode::kAdd:
+    case HloOpcode::kAnd:
     case HloOpcode::kBitcast:
+    case HloOpcode::kBitcastConvert:
     case HloOpcode::kBroadcast:
     case HloOpcode::kCeil:
     case HloOpcode::kClamp:
@@ -53,15 +55,14 @@ namespace xla {
     case HloOpcode::kInfeed:
     case HloOpcode::kIsFinite:
     case HloOpcode::kLe:
-    case HloOpcode::kAnd:
-    case HloOpcode::kNot:
-    case HloOpcode::kOr:
     case HloOpcode::kLt:
     case HloOpcode::kMaximum:
     case HloOpcode::kMinimum:
     case HloOpcode::kMultiply:
     case HloOpcode::kNe:
     case HloOpcode::kNegate:
+    case HloOpcode::kNot:
+    case HloOpcode::kOr:
     case HloOpcode::kOutfeed:
     case HloOpcode::kPad:
     case HloOpcode::kReal:
@@ -88,9 +89,9 @@ namespace xla {
 
     // Expensive instructions.
     case HloOpcode::kAtan2:
-    case HloOpcode::kBatchNormTraining:
-    case HloOpcode::kBatchNormInference:
     case HloOpcode::kBatchNormGrad:
+    case HloOpcode::kBatchNormInference:
+    case HloOpcode::kBatchNormTraining:
     case HloOpcode::kCall:
     case HloOpcode::kConditional:
     case HloOpcode::kConvolution:
@@ -104,19 +105,19 @@ namespace xla {
     case HloOpcode::kMap:
     case HloOpcode::kParameter:
     case HloOpcode::kPower:
+    case HloOpcode::kRecv:
+    case HloOpcode::kRecvDone:
     case HloOpcode::kReduce:
     case HloOpcode::kReduceWindow:
     case HloOpcode::kRemainder:
     case HloOpcode::kRng:
     case HloOpcode::kSelectAndScatter:
+    case HloOpcode::kSend:
+    case HloOpcode::kSendDone:
     case HloOpcode::kSort:
     case HloOpcode::kTanh:
     case HloOpcode::kTrace:
     case HloOpcode::kWhile:
-    case HloOpcode::kSend:
-    case HloOpcode::kSendDone:
-    case HloOpcode::kRecv:
-    case HloOpcode::kRecvDone:
       return true;
   }
 
diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD
index 2704a805a91b93c69b751cdb61305ea7780f0ef2..0819ab3b90b2360c6b0b2afaa89f322afe566eb3 100644
--- a/tensorflow/compiler/xla/service/interpreter/BUILD
+++ b/tensorflow/compiler/xla/service/interpreter/BUILD
@@ -92,6 +92,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_execution_profile",
         "//tensorflow/compiler/xla/service:hlo_module_config",
         "//tensorflow/compiler/xla/service:shaped_buffer",
+        "//tensorflow/compiler/xla/service:transfer_manager",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
     ],
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc
index 6d5796a24b5209355debd80b912b7fa62d40837c..dc63a2224d659fa427d4d1a30c5dc0f94d643b36 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.cc
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc
@@ -69,13 +69,19 @@ Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) {
   return pipeline.Run(hlo_module).status();
 }
 
-StatusOr<std::unique_ptr<Executable>> InterpreterCompiler::Compile(
+StatusOr<std::unique_ptr<HloModule>> InterpreterCompiler::RunHloPasses(
+    std::unique_ptr<HloModule> hlo_module,
+    se::StreamExecutor* /*stream_exec*/) {
+  VLOG(1) << "Run hlo passes on graph " << hlo_module->name();
+  TF_RETURN_IF_ERROR(RunHloOptimization(hlo_module.get()));
+  return std::move(hlo_module);
+}
+
+StatusOr<std::unique_ptr<Executable>> InterpreterCompiler::RunBackend(
     std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec) {
   TF_RET_CHECK(stream_exec != nullptr);
 
-  VLOG(1) << "Generate graph " << hlo_module->name();
-
-  TF_RETURN_IF_ERROR(RunHloOptimization(hlo_module.get()));
+  VLOG(1) << "Run backend " << hlo_module->name();
 
   // Typically you would visit the HLO graph, building up a compiled equivalent
   // In this case we are using an HloEvaluator at execution time, so we don't
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.h b/tensorflow/compiler/xla/service/interpreter/compiler.h
index cfdc9b6256569b0137784b0d1db846a5f2339a5d..278cf5184227ae25518b1d46c0e16e4cce7bd1a8 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.h
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.h
@@ -43,8 +43,12 @@ class InterpreterCompiler : public Compiler {
   InterpreterCompiler() {}
   ~InterpreterCompiler() override {}
 
-  StatusOr<std::unique_ptr<Executable>> Compile(
-      std::unique_ptr<HloModule> hlo_modules,
+  StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
+      std::unique_ptr<HloModule> hlo_module,
+      perftools::gputools::StreamExecutor* stream_exec) override;
+
+  StatusOr<std::unique_ptr<Executable>> RunBackend(
+      std::unique_ptr<HloModule> hlo_module,
       perftools::gputools::StreamExecutor* stream_exec) override;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc
index 96f937caf96232a72b2f3d80d2269d6ade2327dc..b01fcccdb4b338ed2575d1f2c48401adc648a09a 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executable.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_evaluator.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/interpreter/executor.h"
+#include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -42,48 +43,23 @@ namespace sep = ::perftools::gputools::interpreter;
 
 InterpreterExecutable::InterpreterExecutable(
     std::unique_ptr<const HloModule> hlo_module)
-    : Executable(std::move(hlo_module)) {}
+    : Executable(std::move(hlo_module), /*hlo_profile_printer=*/nullptr,
+                 /*hlo_profile_index_map=*/nullptr) {}
 
 InterpreterExecutable::~InterpreterExecutable() {}
 
-static se::DeviceMemoryBase AllocateSingleOutput(
-    sep::InterpreterExecutor* executor, const Literal& literal) {
-  int64 size(xla::ShapeUtil::ByteSizeOf(literal.shape()));
-  void* buf = executor->Allocate(size);
-  const void* src = literal.InternalData();
-  memcpy(buf, src, size);
-  return se::DeviceMemoryBase(buf, size);
-}
-
-static se::DeviceMemoryBase AllocateOutputBuffer(
-    sep::InterpreterExecutor* executor, const Literal& literal) {
-  const Shape& shape = literal.shape();
-  if (shape.element_type() != xla::TUPLE) {
-    return AllocateSingleOutput(executor, literal);
-  } else {
-    int64 size(xla::ShapeUtil::ByteSizeOf(shape, sizeof(void*)));
-    void** buf = reinterpret_cast<void**>(executor->Allocate(size));
-    void** buf_rc = buf;
-    for (int64 n = 0; n < xla::ShapeUtil::TupleElementCount(shape); n++) {
-      se::DeviceMemoryBase out =
-          AllocateSingleOutput(executor, literal.tuple_literals(n));
-      *buf++ = out.opaque();
-    }
-
-    return se::DeviceMemoryBase(buf_rc, size);
-  }
-}
-
-StatusOr<se::DeviceMemoryBase> InterpreterExecutable::ExecuteOnStream(
+StatusOr<std::unique_ptr<ShapedBuffer>> InterpreterExecutable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
+    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     HloExecutionProfile* hlo_execution_profile) {
   se::Stream* stream = run_options->stream();
+  se::StreamExecutor* executor = stream->parent();
+  const se::Platform* platform = executor->platform();
 
   VLOG(1) << "Execute " << module().name();
   if (VLOG_IS_ON(2)) {
     for (const auto& a : arguments) {
-      VLOG(2) << "-- argument " << a.opaque();
+      VLOG(2) << "-- argument " << *a;
     }
   }
 
@@ -95,33 +71,32 @@ StatusOr<se::DeviceMemoryBase> InterpreterExecutable::ExecuteOnStream(
         "Mismatch between argument count and graph parameter count.");
   }
 
-  // Create the arguments as an vector of XLA literals
+  TF_ASSIGN_OR_RETURN(TransferManager * transfer_manager,
+                      TransferManager::GetForPlatform(platform));
+
+  // Transform the ShapedBuffer arguments into literals which the evaluator
+  // consumes.
   std::vector<std::unique_ptr<Literal>> arg_literals;
-  std::vector<Literal*> arg_literals_ptrs;
   for (int64 p = 0; p < computation->num_parameters(); ++p) {
-    // Create the input literal for the parameter
-    HloInstruction* param = computation->parameter_instruction(p);
-    arg_literals.emplace_back(Literal::CreateFromShape(param->shape()));
-    arg_literals_ptrs.push_back(arg_literals.back().get());
-
-    // Copy in the data from the stream_executor buffers
-    void* buffer = arg_literals.back()->MutableInternalData();
-    memcpy(buffer, arguments[p].opaque(),
-           ShapeUtil::ByteSizeOf(param->shape()));
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<Literal> arg_literal,
+        transfer_manager->TransferLiteralFromDevice(executor, *arguments[p]));
+    arg_literals.push_back(std::move(arg_literal));
   }
 
   // Execute the graph using the HloEvaluator.
   HloEvaluator evaluator;
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> output,
-                      evaluator.Evaluate(*computation, arg_literals_ptrs));
-
-  // Copy the result into the return buffer
-  perftools::gputools::StreamExecutor* executor(stream->parent());
-  sep::InterpreterExecutor* interpreter_executor(
-      static_cast<sep::InterpreterExecutor*>(executor->implementation()));
-
-  se::DeviceMemoryBase ret =
-      AllocateOutputBuffer(interpreter_executor, *(output.get()));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<Literal> result_literal,
+      evaluator.Evaluate<std::unique_ptr<Literal>>(*computation, arg_literals));
+
+  // Transform the result literal back into a ShapedBuffer.
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<ShapedBuffer> result,
+                      transfer_manager->AllocateShapedBuffer(
+                          result_literal->shape(), run_options->allocator(),
+                          run_options->device_ordinal()));
+  TF_RETURN_IF_ERROR(transfer_manager->TransferLiteralToDevice(
+      executor, *result_literal, *result));
 
   uint64 end_micros = tensorflow::Env::Default()->NowMicros();
 
@@ -131,20 +106,13 @@ StatusOr<se::DeviceMemoryBase> InterpreterExecutable::ExecuteOnStream(
     execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
   }
 
-  return ret;
-}
-
-StatusOr<std::unique_ptr<ShapedBuffer>> InterpreterExecutable::ExecuteOnStream(
-    const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-    HloExecutionProfile* hlo_execution_profile) {
-  return tensorflow::errors::Unimplemented(
-      "ExecuteOnStream is not yet supported on Interpreter.");
+  return std::move(result);
 }
 
-StatusOr<se::DeviceMemoryBase> InterpreterExecutable::ExecuteAsyncOnStream(
+StatusOr<std::unique_ptr<ShapedBuffer>>
+InterpreterExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments) {
+    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   return tensorflow::errors::Unimplemented(
       "ExecuteAsyncOnStream is not yet supported on Interpreter.");
 }
@@ -156,10 +124,5 @@ StatusOr<se::DeviceMemoryBase> InterpreterExecutable::ExecuteAsyncOnStream(
   return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
 }
 
-std::unique_ptr<HloCostAnalysis> InterpreterExecutable::CreateCostAnalysis()
-    const {
-  return MakeUnique<HloCostAnalysis>(ShapeSizeBytes);
-}
-
 }  // namespace interpreter
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h
index c69b0d036d1058a6b24ee609a9923895d3246eec..410110a1adf04c83001c38ed03f5d60dd203dc7e 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.h
+++ b/tensorflow/compiler/xla/service/interpreter/executable.h
@@ -43,26 +43,17 @@ class InterpreterExecutable : public Executable {
   InterpreterExecutable(std::unique_ptr<const HloModule> hlo_module);
   ~InterpreterExecutable() override;
 
-  StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
-      const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
-      HloExecutionProfile* hlo_execution_profile) override;
-
   StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteAsyncOnStream(
+  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments) override;
+      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
 
   static int64 ShapeSizeBytes(const Shape& shape);
 
-  std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
-
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(InterpreterExecutable);
 };
diff --git a/tensorflow/compiler/xla/service/interpreter/executor.cc b/tensorflow/compiler/xla/service/interpreter/executor.cc
index 0bb3259ef43915067e614e72038387e8300ecc41..68371910d76f42c0b6d4b1adad9d6a83bdb858e6 100644
--- a/tensorflow/compiler/xla/service/interpreter/executor.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executor.cc
@@ -85,7 +85,7 @@ bool InterpreterExecutor::HostCallback(Stream *stream,
 bool InterpreterExecutor::CreateStreamDependency(Stream *dependent,
                                                  Stream *other) {
   AsExecutorStream(dependent)->EnqueueTask(
-      [other]() { other->BlockHostUntilDone(); });
+      [other]() { SE_CHECK_OK(other->BlockHostUntilDone()); });
   AsExecutorStream(dependent)->BlockUntilDone();
   return true;
 }
@@ -100,9 +100,9 @@ bool InterpreterExecutor::StopTimer(Stream *stream, Timer *timer) {
   return true;
 }
 
-bool InterpreterExecutor::BlockHostUntilDone(Stream *stream) {
+port::Status InterpreterExecutor::BlockHostUntilDone(Stream *stream) {
   AsExecutorStream(stream)->BlockUntilDone();
-  return true;
+  return port::Status::OK();
 }
 
 DeviceDescription *InterpreterExecutor::PopulateDeviceDescription() const {
diff --git a/tensorflow/compiler/xla/service/interpreter/executor.h b/tensorflow/compiler/xla/service/interpreter/executor.h
index c59b2ccb1505b78be0c459ac9311428d65cc7e44..c5d07e906dafb033905c50c604069e80e1ce80cd 100644
--- a/tensorflow/compiler/xla/service/interpreter/executor.h
+++ b/tensorflow/compiler/xla/service/interpreter/executor.h
@@ -157,7 +157,7 @@ class InterpreterExecutor : public internal::StreamExecutorInterface {
   bool StartTimer(Stream *stream, Timer *timer) override;
   bool StopTimer(Stream *stream, Timer *timer) override;
 
-  bool BlockHostUntilDone(Stream *stream) override;
+  port::Status BlockHostUntilDone(Stream *stream) override;
 
   int PlatformDeviceCount() override { return 1; }
 
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index 7eda7c2284c2457703fcfcd4226172e41dd4ae01..42bca3b783c5f3390e9507d54fb07660d9f98e35 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -477,16 +477,10 @@ Status LayoutAssignment::AddMandatoryConstraints(
           /*mandatory=*/true));
     } else if (instruction->opcode() == HloOpcode::kCustomCall) {
       // Add constraints for kCustomCall instruction operands and instructions.
-      // For now we only support row major layouts for all inputs and outputs.
-      auto row_major_shape = [](const Shape& old_shape) {
-        Shape new_shape(old_shape);
-        std::vector<int64> dimension_order(new_shape.dimensions_size());
-        std::iota(dimension_order.rbegin(), dimension_order.rend(), 0);
-        *new_shape.mutable_layout() = LayoutUtil::MakeLayout(dimension_order);
-        return new_shape;
-      };
-
-      Shape result_shape(row_major_shape(instruction->shape()));
+      // For now we only support major-first layouts for all inputs and outputs.
+      Shape result_shape = ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(
+          instruction->shape().element_type(),
+          AsInt64Slice(instruction->shape().dimensions()));
       TF_RETURN_IF_ERROR(
           constraints->SetInstructionLayout(result_shape, instruction));
       for (int64 i = 0; i < instruction->operand_count(); ++i) {
@@ -496,7 +490,10 @@ Status LayoutAssignment::AddMandatoryConstraints(
           continue;
         }
 
-        Shape row_major_operand_shape(row_major_shape(operand_shape));
+        Shape row_major_operand_shape =
+            ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(
+                operand_shape.element_type(),
+                AsInt64Slice(operand_shape.dimensions()));
         TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
             row_major_operand_shape, instruction, i, /*mandatory=*/true));
       }
@@ -530,9 +527,11 @@ Status CheckCallLayout(HloInstruction* call,
 Status CheckCustomCallLayout(HloInstruction* custom_call) {
   for (const HloInstruction* operand : custom_call->operands()) {
     TF_RET_CHECK(
+        ShapeUtil::IsOpaque(operand->shape()) ||
         LayoutUtil::IsMonotonicWithDim0Major(operand->shape().layout()));
   }
   TF_RET_CHECK(
+      ShapeUtil::IsOpaque(custom_call->shape()) ||
       LayoutUtil::IsMonotonicWithDim0Major(custom_call->shape().layout()));
   return Status::OK();
 }
@@ -711,8 +710,8 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
     int64 operand_no) {
   const HloInstruction* operand = instruction->operand(operand_no);
 
-  CHECK(ShapeUtil::IsArray(instruction->shape()) &&
-        ShapeUtil::IsArray(operand->shape()));
+  CHECK(ShapeUtil::IsArray(instruction->shape()));
+  CHECK(ShapeUtil::IsArray(operand->shape()));
 
   if (instruction->IsElementwiseOnOperand(operand_no) &&
       !ShapeUtil::IsScalar(operand->shape()) &&
@@ -742,7 +741,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
     const Shape& output_shape = instruction->shape();
     Shape output_shape_with_layout = ShapeUtil::MakeShapeWithLayout(
         output_shape.element_type(), AsInt64Slice(output_shape.dimensions()),
-        AsInt64Slice(output_layout.minor_to_major()));
+        LayoutUtil::MinorToMajor(output_layout));
     Shape operand_shape = operand->shape();
     *operand_shape.mutable_layout() =
         LayoutUtil::GetDefaultLayoutForShape(operand_shape);
@@ -771,7 +770,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
     int64 rank = ShapeUtil::Rank(instruction->shape());
     std::vector<int64> new_minor_to_major(rank);
     for (int64 i = 0; i < rank; ++i) {
-      int64 output_dim = output_layout.minor_to_major(i);
+      int64 output_dim = LayoutUtil::Minor(output_layout, i);
       int64 operand_dim = instruction->dimensions(output_dim);
       new_minor_to_major[i] = operand_dim;
     }
@@ -814,7 +813,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOutputLayoutFromOperandLayout(
     Shape operand_shape_with_layout = ShapeUtil::MakeShapeWithLayout(
         operand->shape().element_type(),
         AsInt64Slice(operand->shape().dimensions()),
-        AsInt64Slice(operand_layout.minor_to_major()));
+        LayoutUtil::MinorToMajor(operand_layout));
     Shape output_shape = user->shape();
     *output_shape.mutable_layout() =
         LayoutUtil::GetDefaultLayoutForShape(output_shape);
@@ -844,7 +843,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOutputLayoutFromOperandLayout(
     std::vector<int64> new_minor_to_major(rank);
     auto inverse_dimensions = InversePermutation(user->dimensions());
     for (int64 i = 0; i < rank; ++i) {
-      int64 operand_dim = operand_layout.minor_to_major(i);
+      int64 operand_dim = LayoutUtil::Minor(operand_layout, i);
       int64 user_dim = inverse_dimensions[operand_dim];
       new_minor_to_major[i] = user_dim;
     }
@@ -1303,7 +1302,7 @@ Status LayoutAssignment::AssignLayouts(const LayoutConstraints& constraints,
     TF_RET_CHECK(LayoutUtil::HasLayout(instruction->shape()));
   }
 
-  // Copy the root instrucion's result if the it does not match the result
+  // Copy the root instruction's result if the it does not match the result
   // layout constraint
   if (constraints.ResultLayout() != nullptr &&
       !constraints.ResultLayout()->MatchesLayoutInShape(
@@ -1328,6 +1327,20 @@ Status LayoutAssignment::RunOnComputation(
           << ")";
   VLOG(2) << "  ComputationLayout = " << computation_layout.ToString();
 
+  // Clear existing layouts of the instructions. All layouts must be assigned by
+  // the LayoutAssignment pass, except for Infeed, Outfeed, Parameters and the
+  // computation result. The latter two are specified in computation_layout, so
+  // we only need to keep the existing layouts for Infeed and Outfeed. Clearing
+  // the layouts here avoids hiding potential bugs in the layout assignment pass
+  // that may accidently use the existing layout.
+  for (HloInstruction* instruction : computation->instructions()) {
+    if (instruction->opcode() == HloOpcode::kInfeed ||
+        instruction->opcode() == HloOpcode::kOutfeed) {
+      continue;
+    }
+    LayoutUtil::ClearLayout(instruction->mutable_shape());
+  }
+
   // Construct LayoutConstraints with all layout constraints of the computation.
   LayoutConstraints constraints(points_to_analysis, computation);
 
diff --git a/tensorflow/compiler/xla/service/liveness_util.cc b/tensorflow/compiler/xla/service/liveness_util.cc
index 53d88eda7a81a8cd0ea245de84011cce0ab3eafe..68c99256a246edcf43a8358f667fc4458b9b4fea 100644
--- a/tensorflow/compiler/xla/service/liveness_util.cc
+++ b/tensorflow/compiler/xla/service/liveness_util.cc
@@ -103,7 +103,7 @@ namespace {
 
 // Returns all uses of all aliases of 'instruction' at 'index' in 'uses'.
 // Each use in 'uses' is a pair (HloInstruction* user, int64 operand_index)
-// where 'user' is a user of an alias of 'intruction' at 'index', and
+// where 'user' is a user of an alias of 'instruction' at 'index', and
 // 'operand_index' is the operand index at which the alias appears in the
 // operand list of 'user'.
 std::vector<std::pair<HloInstruction*, int64>> GetAllUsesOfInstructionAtIndex(
@@ -243,6 +243,31 @@ bool CanShareOperandBufferWithUser(
     std::vector<int64> operand_indices = user->OperandIndices(operand);
     return operand_indices.size() == 1 && operand_indices[0] == 0;
   }
+  if (user->opcode() == HloOpcode::kCall) {
+    // TODO(b/62548313): Remove when buffer assignment is module scoped and
+    // does not assign buffers to calls.
+    // Find called computation parameter associated with 'operand'.
+    const std::vector<int64> operand_indices = user->OperandIndices(operand);
+    if (operand_indices.size() > 1) {
+      return false;
+    }
+    CHECK_EQ(1, operand_indices.size());
+    auto* param = user->to_apply()->parameter_instruction(operand_indices[0]);
+    // Get all uses of 'operand' at 'index' in called computation.
+    auto param_uses = GetAllUsesOfInstructionAtIndex(param, operand_index,
+                                                     points_to_analysis);
+
+    // Return true iff:
+    // *) There exists exactly one use of 'operand' in called computation.
+    // *) The unique use is by the root instruction of called computation.
+    //    (Note: we check the root of the called computation, because the
+    //     root result buffer is required to alias with the Call result buffer).
+    // *) The root instruction of the called computation is element-wise on
+    //    'operand'.
+    auto* callee_root = user->to_apply()->root_instruction();
+    return param_uses.size() == 1 && param_uses[0].first == callee_root &&
+           callee_root->IsElementwiseOnOperand(param_uses[0].second);
+  }
   // Check if 'user' is element-wise.
   return user->IsElementwise();
 }
@@ -322,6 +347,31 @@ bool CanShareOperandBufferWithUser(HloInstruction* operand,
     std::vector<int64> operand_indices = user->OperandIndices(operand);
     return operand_indices.size() == 1 && operand_indices[0] == 0;
   }
+  if (user->opcode() == HloOpcode::kCall) {
+    // Get all uses of value defined by 'operand' at 'operand_index'.
+    const auto& uses =
+        dataflow.GetValueDefinedAt(operand, operand_index).uses();
+    // Return true iff:
+    // *) There exists two uses of 'operand'.
+    // *) One use is by 'user' (caller).
+    // *) One use is by root instruction of called computation (callee root).
+    //    (Note: we check the root of the called computation, because the
+    //     root result buffer is required to alias with the Call result buffer).
+    // *) The root instruction of the called computation is element-wise on
+    //    'operand'.
+    const bool found_caller_use =
+        std::find_if(uses.begin(), uses.end(), [user](const HloUse& use) {
+          return use.instruction == user;
+        }) != uses.end();
+    auto* callee_root = user->to_apply()->root_instruction();
+    const bool found_elementwise_callee_use =
+        std::find_if(
+            uses.begin(), uses.end(), [callee_root](const HloUse& use) {
+              return use.instruction == callee_root &&
+                     callee_root->IsElementwiseOnOperand(use.operand_number);
+            }) != uses.end();
+    return uses.size() == 2 && found_caller_use && found_elementwise_callee_use;
+  }
   // Check if 'user' is element-wise.
   return user->IsElementwise();
 }
diff --git a/tensorflow/compiler/xla/service/liveness_util_test.cc b/tensorflow/compiler/xla/service/liveness_util_test.cc
index b5e15906d3c085f773eb46b543515a614e63c59a..2c2a02f6375343d67dfb155bbb03729ff6e490d2 100644
--- a/tensorflow/compiler/xla/service/liveness_util_test.cc
+++ b/tensorflow/compiler/xla/service/liveness_util_test.cc
@@ -277,8 +277,11 @@ TEST_F(CanShareOperandBufferWithUserTest, FusedDotAdd) {
   auto b = builder.AddInstruction(HloInstruction::CreateConstant(
       Literal::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}})));
 
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto dot = builder.AddInstruction(
-      HloInstruction::CreateBinary(data_shape, HloOpcode::kDot, a, b));
+      HloInstruction::CreateDot(data_shape, a, b, dot_dnums));
 
   auto one = builder.AddInstruction(
       HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
@@ -312,8 +315,11 @@ TEST_F(CanShareOperandBufferWithUserTest, FusedTransposeDotAdd) {
   auto b_t = builder.AddInstruction(
       HloInstruction::CreateTranspose(data_shape, b, {1, 0}));
 
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto dot = builder.AddInstruction(
-      HloInstruction::CreateBinary(data_shape, HloOpcode::kDot, a, b_t));
+      HloInstruction::CreateDot(data_shape, a, b_t, dot_dnums));
 
   auto one = builder.AddInstruction(
       HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
@@ -415,5 +421,44 @@ TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) {
       CanShareOperandBufferWithUser(data, {}, whil, {}, *dataflow_analysis_));
 }
 
+// Tests that Call can alias operand buffer if the only use of the operand
+// in the called computation is an elementwise instruction.
+TEST_F(CanShareOperandBufferWithUserTest, CallToComputationWithFusionRoot) {
+  Shape shape = ShapeUtil::MakeShape(F32, {8});
+  // Build sub-computation with fusion root.
+  auto sub_builder = HloComputation::Builder(TestName() + "_sub");
+  auto sub_param = sub_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "sub_param"));
+  auto one = sub_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
+  auto ones = sub_builder.AddInstruction(
+      HloInstruction::CreateBroadcast(shape, one, {1}));
+  auto add = sub_builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, sub_param, ones));
+
+  module_ = CreateNewModule();
+  auto sub_computation = module_->AddEmbeddedComputation(sub_builder.Build());
+  sub_computation->CreateFusionInstruction({add, ones},
+                                           HloInstruction::FusionKind::kLoop);
+
+  // Build entry-computation with kCall which calls 'sub_computation'.
+  auto builder = HloComputation::Builder(TestName());
+
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param"));
+  auto reverse =
+      builder.AddInstruction(HloInstruction::CreateReverse(shape, param, {0}));
+  auto call = builder.AddInstruction(
+      HloInstruction::CreateCall(shape, {reverse}, sub_computation));
+  computation_ = module_->AddEntryComputation(builder.Build());
+
+  RunAnalysis();
+
+  EXPECT_TRUE(CanShareOperandBufferWithUser(reverse, {}, call, {},
+                                            *points_to_analysis_));
+  EXPECT_TRUE(CanShareOperandBufferWithUser(reverse, {}, call, {},
+                                            *dataflow_analysis_));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_compiler.cc b/tensorflow/compiler/xla/service/llvm_compiler.cc
index ba0304fb8ca0de9cffc705f471eb0b740747ec92..34f3419269abbc73cd0ddb13c723a8da38ab19ff 100644
--- a/tensorflow/compiler/xla/service/llvm_compiler.cc
+++ b/tensorflow/compiler/xla/service/llvm_compiler.cc
@@ -27,8 +27,10 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> LLVMCompiler::Compile(
           "Model partitioning not implemented for the CPU/GPU compilers!");
     }
 
+    TF_ASSIGN_OR_RETURN(
+        modules[i], RunHloPasses(std::move(modules[i]), stream_execs[i][0]));
     TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
-                        Compile(std::move(modules[i]), stream_execs[i][0]));
+                        RunBackend(std::move(modules[i]), stream_execs[i][0]));
     result.push_back(std::move(executable));
   }
 
diff --git a/tensorflow/compiler/xla/service/llvm_compiler.h b/tensorflow/compiler/xla/service/llvm_compiler.h
index c4f689eabedd4eabe98d907bd3d6b185dfa4bd10..c5393cef4f961c5d04c32d0d4291732b8ec702f1 100644
--- a/tensorflow/compiler/xla/service/llvm_compiler.h
+++ b/tensorflow/compiler/xla/service/llvm_compiler.h
@@ -58,10 +58,14 @@ class LLVMCompiler : public Compiler {
   void RemovePostOptimizationHook() { user_post_optimization_hook_ = nullptr; }
 
   // Bring in
-  // StatusOr<std::unique_ptr<Executable>> Compile(
-  //    std::unique_ptr<HloModule> module,
-  //    perftools::gputools::StreamExecutor* executor)
-  using Compiler::Compile;
+  //   StatusOr<std::unique_ptr<Executable>> RunBackend(
+  //       std::unique_ptr<HloModule> module,
+  //       perftools::gputools::StreamExecutor* stream_exec)
+  //   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
+  //       std::unique_ptr<HloModule> module,
+  //       perftools::gputools::StreamExecutor* stream_exec)
+  using Compiler::RunBackend;
+  using Compiler::RunHloPasses;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::vector<std::unique_ptr<HloModule>> modules,
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
index 7224bd689842d89563b374f3db3d4e314be18764..c558f7388cab587b5858d0594cdb2f3c41d75562 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
@@ -39,7 +39,7 @@ IrArray::Index::Index(llvm::Value* linear, const Shape& shape,
       << "Shape " << ShapeUtil::HumanStringWithLayout(shape)
       << " should have a layout.";
   int64 divisor = 1;
-  for (int64 dimension : layout_.minor_to_major()) {
+  for (int64 dimension : LayoutUtil::MinorToMajor(layout_)) {
     int64 size_of_current_dimension = shape.dimensions(dimension);
     // Emit IR instructions that compute
     //   (linear_index / divisor) % current_dimension
@@ -244,8 +244,8 @@ llvm::Value* IrArray::EmitArrayElementAddress(
   //
   //   getelementptr base_ptr_, 0, most major index, ..., most minor index
   std::vector<llvm::Value*> gep_indices(1, ir_builder->getInt64(0));
-  for (int64 i = shape_->layout().minor_to_major_size() - 1; i >= 0; --i) {
-    int64 dimension = shape_->layout().minor_to_major(i);
+  for (int64 i = 0; i < LayoutUtil::MinorToMajor(*shape_).size(); ++i) {
+    int64 dimension = LayoutUtil::Major(shape_->layout(), i);
     gep_indices.push_back(actual_index[dimension]);
   }
   return ir_builder->CreateInBoundsGEP(base_ptr_, gep_indices,
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
index 29cc0f81bd2c06538e28d1b593ee6a897fea0f27..23d2d4e87d26f4988ebddcf20f5a27af6a7fe0d6 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
 
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 
 namespace xla {
 void KernelSupportLibrary::For(
@@ -62,4 +63,72 @@ void KernelSupportLibrary::If(
   false_block_generator();
   llvm_ir::SetToLastInsertPoint(if_data.after_block, ir_builder_);
 }
+
+void KernelSupportLibrary::EmitAndCallOutlinedKernel(
+    bool enable_fast_math, bool optimize_for_size,
+    llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece kernel_name,
+    KernelSupportLibrary::ArgumentVector arguments,
+    const std::function<void(KernelSupportLibrary::ArgumentVector)>&
+        kernel_body_generator) {
+  llvm::Module* module = ir_builder->GetInsertBlock()->getModule();
+  llvm::Function* function =
+      module->getFunction(llvm_ir::AsStringRef(kernel_name));
+
+  int64 null_arg_idx = -1;
+  std::vector<llvm::Value*> sanitized_args;
+  sanitized_args.reserve(arguments.size());
+  for (int64 i = 0, e = arguments.size(); i < e; i++) {
+    if (arguments[i]) {
+      sanitized_args.push_back(arguments[i]);
+    } else {
+      CHECK_EQ(null_arg_idx, -1);
+      null_arg_idx = i;
+    }
+  }
+
+  if (!function) {
+    VLOG(2) << "Generating kernel for " << kernel_name;
+    std::vector<llvm::Type*> arg_types;
+    std::transform(sanitized_args.begin(), sanitized_args.end(),
+                   std::back_inserter(arg_types),
+                   [](llvm::Value* arg) { return arg->getType(); });
+
+    auto* function_type = llvm::FunctionType::get(
+        ir_builder->getVoidTy(), arg_types, /*isVarArg=*/false);
+
+    function = llvm_ir::CreateFunction(
+        function_type, llvm::GlobalValue::InternalLinkage,
+        /*enable_fast_math=*/enable_fast_math,
+        /*optimize_for_size=*/optimize_for_size, kernel_name, module);
+
+    llvm::IRBuilder<>::InsertPointGuard guard(*ir_builder);
+
+    auto* entry_bb =
+        llvm::BasicBlock::Create(ir_builder->getContext(), "entry", function);
+    auto* return_inst = llvm::ReturnInst::Create(ir_builder->getContext(),
+                                                 /*retVal=*/nullptr, entry_bb);
+    // Set the insert point to before return_inst.
+    ir_builder->SetInsertPoint(return_inst);
+
+    std::vector<llvm::Value*> arg_values;
+    /*
+     * clang on OSX doesn't like std::transform or range for loop here.
+     * See https://github.com/tensorflow/tensorflow/issues/15196
+     */
+    for (llvm::Function::arg_iterator arg = function->arg_begin(),
+                                      arg_e = function->arg_end();
+         arg != arg_e; ++arg) {
+      arg_values.push_back(arg);
+    }
+    if (null_arg_idx != -1) {
+      arg_values.insert(arg_values.begin() + null_arg_idx, nullptr);
+    }
+    kernel_body_generator(arg_values);
+  } else {
+    VLOG(3) << "Re-using kernel for " << kernel_name;
+  }
+
+  ir_builder->CreateCall(function, llvm_ir::AsArrayRef(sanitized_args));
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
index 9bafb7b57740b7acd0286c113c8a0585c0f93689..827e092a3fa9116c461716b27c309033f7988745 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
@@ -118,6 +118,60 @@ class KernelSupportLibrary {
           const std::function<void()>& true_block_generator,
           const std::function<void()>& false_block_generator = []() {});
 
+  using ArgumentVector = tensorflow::gtl::ArraySlice<llvm::Value*>;
+
+  // Generates the following control flow structure:
+  //
+  //  define @`kernel_name`(arg0, arg1, ... arg`arguments.size()`) {
+  //    kernel_body_generator({arg0, arg1, ... arg`arguments.size()`});
+  //  }
+  //
+  //  ...
+  //  call @`kernel_name`(arguments[0], arguments[1] ...)
+  //  ...
+  //
+  // If a function called `kernel_name` is already present in the module then
+  // that function is re-used.  In that sense we're using the llvm::Module as a
+  // cache of outlined kernels, keyed by function name.
+  //
+  // If any of the values in `arguments` is nullptr (i.e. a nullptr
+  // llvm::Value*) then we ignore it when generating LLVM IR, and instead pass
+  // in a nullptr llvm::Value* in its position to `kernel_body_generator`.
+  // Currently we only support at most one nullptr value in `arguments`.
+  static void EmitAndCallOutlinedKernel(
+      bool enable_fast_math, bool optimize_for_size,
+      llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece kernel_name,
+      ArgumentVector arguments,
+      const std::function<void(ArgumentVector)>& kernel_body_generator);
+
+  // Thin wrappers around the more general EmitAndCallOutlinedKernel above.
+  static void EmitAndCallOutlinedKernel(
+      bool enable_fast_math, bool optimize_for_size,
+      llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece kernel_name,
+      llvm::Value* arg0, llvm::Value* arg1, llvm::Value* arg2,
+      const std::function<void(llvm::Value*, llvm::Value*, llvm::Value*)>&
+          kernel_body_generator) {
+    EmitAndCallOutlinedKernel(
+        enable_fast_math, optimize_for_size, ir_builder, kernel_name,
+        {arg0, arg1, arg2}, [&](ArgumentVector args) {
+          kernel_body_generator(args[0], args[1], args[2]);
+        });
+  }
+
+  static void EmitAndCallOutlinedKernel(
+      bool enable_fast_math, bool optimize_for_size,
+      llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece kernel_name,
+      llvm::Value* arg0, llvm::Value* arg1, llvm::Value* arg2,
+      llvm::Value* arg3,
+      const std::function<void(llvm::Value*, llvm::Value*, llvm::Value*,
+                               llvm::Value*)>& kernel_body_generator) {
+    EmitAndCallOutlinedKernel(
+        enable_fast_math, optimize_for_size, ir_builder, kernel_name,
+        {arg0, arg1, arg2, arg3}, [&](ArgumentVector args) {
+          kernel_body_generator(args[0], args[1], args[2], args[3]);
+        });
+  }
+
  private:
   llvm::IRBuilder<>* ir_builder_;
   bool prevent_unrolling_;
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
index cd0c4a371e2b1cd0e1c52b77e47e8b081ab8e836..61c47a0b6eca38db5d78dc622a8cf909f6cf14ee 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
@@ -142,6 +142,13 @@ llvm::Type* PrimitiveTypeToIrType(PrimitiveType element_type,
       return llvm::Type::getInt8Ty(module->getContext());
     case S16:
     case U16:
+    case BF16:
+      // For BF16 we just need some type that is 16 bits wide so that it will
+      // take up the right amount of space in memory. LLVM does not have a BF16
+      // type (the LLVM half type is IEEE 16 bit floating point, not bfloat), so
+      // we can't map it directly to an LLVM type. We will not map a BF16
+      // addition to an addition on this type (int16) - this is just the type
+      // used for storage.
       return llvm::Type::getInt16Ty(module->getContext());
     case S32:
     case U32:
@@ -200,8 +207,8 @@ llvm::Type* ShapeToIrType(const Shape& shape, llvm::Module* module) {
   if (ShapeUtil::IsTuple(shape)) {
     // A tuple buffer is an array of pointers.
     result_type = llvm::ArrayType::get(result_type, shape.tuple_shapes_size());
-  } else {
-    for (int64 dimension : shape.layout().minor_to_major()) {
+  } else if (ShapeUtil::IsArray(shape)) {
+    for (int64 dimension : LayoutUtil::MinorToMajor(shape)) {
       result_type =
           llvm::ArrayType::get(result_type, shape.dimensions(dimension));
     }
@@ -280,6 +287,11 @@ llvm::Constant* LiteralToConstant(const Literal& literal, int64 dimension_index,
         value = llvm::ConstantFP::get(ir_element_type,
                                       literal.Get<float>(*multi_index));
         break;
+      case BF16:
+        value = llvm::ConstantInt::get(
+            ir_element_type,
+            tensorflow::bit_cast<uint16>(literal.Get<bfloat16>(*multi_index)));
+        break;
       case F64:
         value = llvm::ConstantFP::get(ir_element_type,
                                       literal.Get<double>(*multi_index));
@@ -304,7 +316,7 @@ llvm::Constant* LiteralToConstant(const Literal& literal, int64 dimension_index,
   // decrements with each recursive call. We want to iterate through the
   // dimensions in major-to-minor order as we recurse so just index into
   // minor_to_major to get the dimension number for this level of the recursion.
-  int64 dimension = shape.layout().minor_to_major(dimension_index);
+  int64 dimension = LayoutUtil::Minor(shape.layout(), dimension_index);
 
   // Recursively call LiteralToConstant to construct subarrays for the
   // more-minor dimensions. Gather the subarrays into a vector for bundling into
@@ -320,7 +332,7 @@ llvm::Constant* LiteralToConstant(const Literal& literal, int64 dimension_index,
   if (elements.empty()) {
     element_type = ir_element_type;
     for (int i = 0; i < dimension_index; ++i) {
-      int64 index = shape.layout().minor_to_major(i);
+      int64 index = LayoutUtil::Minor(shape.layout(), i);
       element_type =
           llvm::ArrayType::get(element_type, shape.dimensions(index));
     }
@@ -676,5 +688,32 @@ Status DumpIRToDirectory(const string& directory_name,
   return f->Close();
 }
 
+llvm::Function* CreateFunction(llvm::FunctionType* function_type,
+                               llvm::GlobalValue::LinkageTypes linkage,
+                               bool enable_fast_math, bool optimize_for_size,
+                               tensorflow::StringPiece name,
+                               llvm::Module* module) {
+  llvm::Function* function =
+      llvm::Function::Create(function_type, linkage, AsStringRef(name), module);
+  function->setCallingConv(llvm::CallingConv::C);
+  function->addFnAttr("no-frame-pointer-elim", "false");
+
+  if (enable_fast_math) {
+    function->addFnAttr("unsafe-fp-math", "true");
+    function->addFnAttr("no-infs-fp-math", "true");
+    function->addFnAttr("no-nans-fp-math", "true");
+    function->addFnAttr("no-signed-zeros-fp-math", "true");
+  }
+
+  // Add the optize attribute to the function if optimizing for size. This
+  // controls internal behavior of some optimization passes (e.g. loop
+  // unrolling).
+  if (optimize_for_size) {
+    function->addFnAttr(llvm::Attribute::OptimizeForSize);
+  }
+
+  return function;
+}
+
 }  // namespace llvm_ir
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
index 063ead2b647d8fc5cc4f67004aaded80a2191fe9..6bdc6a01a2b487df3dd80a02e67f5bcf62dead31 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
@@ -281,6 +281,12 @@ Status DumpIRToDirectory(const string& directory_name,
                          const string& hlo_module_name,
                          const llvm::Module& llvm_module, bool optimized);
 
+llvm::Function* CreateFunction(llvm::FunctionType* function_type,
+                               llvm::GlobalValue::LinkageTypes linkage,
+                               bool enable_fast_math, bool optimize_for_size,
+                               tensorflow::StringPiece name,
+                               llvm::Module* module);
+
 }  // namespace llvm_ir
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
index 6fa4cd08c9e0ac30b83c0e2b49d98d930c2e15df..a5f7c850c33757fe8d48567ade35544d81224e46 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
@@ -99,8 +99,8 @@ IrArray::Index LoopEmitter::EmitIndexAndSetExitBasicBlock(
   // dimension (of the target shape).
   ForLoopNest loop_nest(loop_name, ir_builder_);
   IrArray::Index array_index(shape_.dimensions_size());
-  for (int i = shape_.layout().minor_to_major_size() - 1; i >= 0; --i) {
-    int64 dimension = shape_.layout().minor_to_major(i);
+  for (int i = 0; i < LayoutUtil::MinorToMajor(shape_).size(); ++i) {
+    int64 dimension = LayoutUtil::Major(shape_.layout(), i);
     std::unique_ptr<ForLoop> loop = loop_nest.AddLoop(
         /*start_index=*/0,
         /*end_index=*/shape_.dimensions(dimension),
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ops.h b/tensorflow/compiler/xla/service/llvm_ir/ops.h
index 11e84d9cb5defbcb87a8f696d56c139686c960d8..f72f482e3128c61e53cc454e7da8b5795ba6f695 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ops.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/ops.h
@@ -40,11 +40,24 @@ bool CanUpdateDynamicSliceInPlace(HloInstruction* dynamic_update_slice,
 inline bool CanEmitFusedDynamicUpdateSliceInPlace(
     HloInstruction* fusion, const BufferAssignment& assignment) {
   CHECK_EQ(fusion->opcode(), HloOpcode::kFusion);
-  return fusion->fusion_kind() == HloInstruction::FusionKind::kLoop &&
-         fusion->fused_expression_root()->opcode() ==
-             HloOpcode::kDynamicUpdateSlice &&
-         CanUpdateDynamicSliceInPlace(fusion->fused_expression_root(),
-                                      assignment);
+  HloInstruction* fused_root = fusion->fused_expression_root();
+  if (fused_root->opcode() != HloOpcode::kDynamicUpdateSlice ||
+      fusion->fusion_kind() != HloInstruction::FusionKind::kLoop) {
+    return false;
+  }
+  // Walk DynamicUpdateSlice operand(0) to fused parameter and get its
+  // associated operand. See if it shares an allocation with this operand.
+  HloInstruction* fusion_operand;
+  ShapeIndex index;
+  std::tie(fusion_operand, index) =
+      fused_root->mutable_operand(0)->LatestNonGteAncestorAndIndex();
+  if (fusion_operand->opcode() != HloOpcode::kParameter) {
+    return false;
+  }
+  auto* operand = fusion->operand(fusion_operand->parameter_number());
+  return assignment.HasAllocationAt(operand, index) &&
+         assignment.HasAllocationAt(fusion, {}) &&
+         assignment.SharesSliceAtIndex(fusion, {}, operand, index);
 }
 
 // Emits IR for running the given dynamic-update-slice op in-place -- that is,
diff --git a/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.cc b/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.cc
index e8c6a83618eaa8430521197f1c166cb7eb11a28e..0f6d8483da88ba4bf3f26961c0cbc8d855faa82c 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.cc
@@ -34,6 +34,12 @@ VectorSupportLibrary::VectorSupportLibrary(PrimitiveType primitive_type,
 }
 
 llvm::Value* VectorSupportLibrary::Mul(llvm::Value* lhs, llvm::Value* rhs) {
+  CHECK(lhs->getType() == scalar_type() || lhs->getType() == vector_type());
+  return MulInternal(lhs, rhs);
+}
+
+llvm::Value* VectorSupportLibrary::MulInternal(llvm::Value* lhs,
+                                               llvm::Value* rhs) {
   if (scalar_type_->isFloatingPointTy()) {
     return ir_builder()->CreateFMul(lhs, rhs, name());
   } else {
@@ -42,6 +48,12 @@ llvm::Value* VectorSupportLibrary::Mul(llvm::Value* lhs, llvm::Value* rhs) {
 }
 
 llvm::Value* VectorSupportLibrary::Add(llvm::Value* lhs, llvm::Value* rhs) {
+  CHECK(lhs->getType() == scalar_type() || lhs->getType() == vector_type());
+  return AddInternal(lhs, rhs);
+}
+
+llvm::Value* VectorSupportLibrary::AddInternal(llvm::Value* lhs,
+                                               llvm::Value* rhs) {
   if (scalar_type_->isFloatingPointTy()) {
     return ir_builder()->CreateFAdd(lhs, rhs, name());
   } else {
@@ -129,6 +141,122 @@ llvm::Value* VectorSupportLibrary::AddReduce(llvm::Value* vector) {
                                             name());
 }
 
+llvm::Value* VectorSupportLibrary::AvxStyleHorizontalAdd(llvm::Value* lhs,
+                                                         llvm::Value* rhs) {
+  CHECK_EQ(lhs->getType(), vector_type());
+  CHECK_EQ(rhs->getType(), vector_type());
+  CHECK_EQ(vector_size() % 2, 0);
+
+  llvm::SmallVector<llvm::Constant*, 32> mask_a, mask_b;
+
+  // Adding the values shuffled using mask_a and mask_b gives us the
+  // AVX-style horizontal add we want.  The masks work as documented
+  // in https://llvm.org/docs/LangRef.html#shufflevector-instruction
+  //
+  // Here are the masks for vector_width() == 8:
+  //
+  //    index: |0 |1 |2 | 3 |4 |5 | 6 | 7
+  //   --------+--+--+--+---+--+--+---+---
+  //   mask_a: |0 |2 |8 |10 |4 |6 |12 |14
+  //   mask_b: |1 |3 |9 |11 |5 |7 |13 |16
+  //
+  // So, as an example, the value at lane 3 of the result vector is
+  // the result of adding lane 10 and lane 11 in the combined lhs++rhs
+  // vector, which are the lanes 2 and 3 in the rhs vector.
+  for (int i = 0; i < vector_size(); i += 2) {
+    int increment = i < vector_size() / 2 ? 0 : (vector_size() / 2);
+    mask_a.push_back(ir_builder()->getInt32(increment + i));
+    mask_b.push_back(ir_builder()->getInt32(increment + i + 1));
+  }
+  for (int i = 0; i < vector_size(); i += 2) {
+    int increment = i < vector_size() / 2 ? (vector_size() / 2) : vector_size();
+    mask_a.push_back(ir_builder()->getInt32(increment + i));
+    mask_b.push_back(ir_builder()->getInt32(increment + i + 1));
+  }
+
+  llvm::Value* shuffle_0 = ir_builder()->CreateShuffleVector(
+      lhs, rhs, llvm::ConstantVector::get(mask_a));
+  llvm::Value* shuffle_1 = ir_builder()->CreateShuffleVector(
+      lhs, rhs, llvm::ConstantVector::get(mask_b));
+
+  return Add(shuffle_0, shuffle_1);
+}
+
+llvm::Value* VectorSupportLibrary::ExtractLowHalf(llvm::Value* vector) {
+  llvm::SmallVector<llvm::Constant*, 32> mask;
+  for (int i = 0; i < vector_size() / 2; i++) {
+    mask.push_back(ir_builder()->getInt32(i));
+  }
+
+  return ir_builder()->CreateShuffleVector(vector,
+                                           llvm::UndefValue::get(vector_type()),
+                                           llvm::ConstantVector::get(mask));
+}
+
+llvm::Value* VectorSupportLibrary::ExtractHighHalf(llvm::Value* vector) {
+  llvm::SmallVector<llvm::Constant*, 32> mask;
+  for (int i = 0; i < vector_size() / 2; i++) {
+    mask.push_back(ir_builder()->getInt32(i + vector_size() / 2));
+  }
+
+  return ir_builder()->CreateShuffleVector(vector,
+                                           llvm::UndefValue::get(vector_type()),
+                                           llvm::ConstantVector::get(mask));
+}
+
+std::vector<llvm::Value*> VectorSupportLibrary::ComputeHorizontalSums(
+    std::vector<llvm::Value*> vectors, llvm::Value* init_values) {
+  // TODO(sanjoy): Move this magic constant to TargetMachineFeatures.
+  const int kAvxVectorWidth = 8;
+  if (vector_size() == kAvxVectorWidth && vectors.size() == kAvxVectorWidth) {
+    return ComputeAvxOptimizedHorizontalSums(std::move(vectors), init_values);
+  }
+
+  std::vector<llvm::Value*> result;
+  std::transform(vectors.begin(), vectors.end(), std::back_inserter(result),
+                 [this](llvm::Value* vector) { return AddReduce(vector); });
+  if (init_values) {
+    for (int64 i = 0, e = result.size(); i < e; i++) {
+      result[i] = Add(result[i], ir_builder()->CreateExtractElement(
+                                     init_values, ir_builder()->getInt32(i)));
+    }
+  }
+  return result;
+}
+
+std::vector<llvm::Value*>
+VectorSupportLibrary::ComputeAvxOptimizedHorizontalSums(
+    std::vector<llvm::Value*> vectors, llvm::Value* init_values) {
+  while (vectors.size() != 2) {
+    std::vector<llvm::Value*> new_vectors;
+    for (int i = 0; i < vectors.size(); i += 2) {
+      new_vectors.push_back(AvxStyleHorizontalAdd(vectors[i], vectors[i + 1]));
+    }
+
+    vectors = std::move(new_vectors);
+  }
+
+  llvm::Value* low =
+      AddInternal(ExtractLowHalf(vectors[0]), ExtractHighHalf(vectors[0]));
+  if (init_values) {
+    low = AddInternal(ExtractLowHalf(init_values), low);
+  }
+  llvm::Value* high =
+      AddInternal(ExtractLowHalf(vectors[1]), ExtractHighHalf(vectors[1]));
+  if (init_values) {
+    high = AddInternal(ExtractHighHalf(init_values), high);
+  }
+
+  std::vector<llvm::Value*> results;
+  for (int i = 0; i < 8; i++) {
+    llvm::Value* scalar_result = ir_builder()->CreateExtractElement(
+        i < 4 ? low : high, ir_builder()->getInt32(i % 4), name());
+    results.push_back(scalar_result);
+  }
+
+  return results;
+}
+
 llvm::Value* VectorSupportLibrary::GetZeroVector() {
   return llvm::Constant::getNullValue(vector_type());
 }
@@ -142,7 +270,9 @@ LlvmVariable::LlvmVariable(llvm::Type* type, llvm::IRBuilder<>* ir_builder)
   alloca_ = llvm_ir::EmitAllocaAtFunctionEntry(type, "", ir_builder_);
 }
 
-llvm::Value* LlvmVariable::Get() { return ir_builder_->CreateLoad(alloca_); }
+llvm::Value* LlvmVariable::Get() const {
+  return ir_builder_->CreateLoad(alloca_);
+}
 
 void LlvmVariable::Set(llvm::Value* new_value) {
   ir_builder_->CreateStore(new_value, alloca_);
diff --git a/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.h b/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.h
index 3072677ab05aa91c736baaa0dc3023329d810a52..f404687ab6864bd0702d142ff691a394b78278a5 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.h
@@ -111,7 +111,12 @@ class VectorSupportLibrary {
     return LoadBroadcast(base_pointer, ir_builder()->getInt64(offset_elements));
   }
 
-  llvm::Value* AddReduce(llvm::Value* vector);
+  // Compute the horizontal sum of each vector in `vectors`.  The i'th element
+  // in the result vector is the (scalar) horizontal sum of the i'th vector in
+  // `vectors`.  If `init_values` is not nullptr then the value in the i'th lane
+  // in `init_values` is added to the i'th horizontal sum.
+  std::vector<llvm::Value*> ComputeHorizontalSums(
+      std::vector<llvm::Value*> vectors, llvm::Value* init_values = nullptr);
 
   llvm::Value* GetZeroVector();
   llvm::Value* GetZeroScalar();
@@ -126,6 +131,33 @@ class VectorSupportLibrary {
   const std::string& name() const { return name_; }
 
  private:
+  llvm::Value* ExtractLowHalf(llvm::Value*);
+  llvm::Value* ExtractHighHalf(llvm::Value*);
+
+  llvm::Value* MulInternal(llvm::Value* lhs, llvm::Value* rhs);
+  llvm::Value* AddInternal(llvm::Value* lhs, llvm::Value* rhs);
+
+  llvm::Value* AddReduce(llvm::Value* vector);
+
+  // Perform an X86 AVX style horizontal add between `lhs` and `rhs`.  The
+  // resulting IR for an 8-float wide vector is expected to lower to a single
+  // vhaddps instruction on a CPU that supports vhaddps, and not be too bad in
+  // other cases.
+  //
+  // For a vector width of 8, the result vector is computed as:
+  //   Result[0] = Lhs[0] + Lhs[1]
+  //   Result[1] = Lhs[2] + Lhs[3]
+  //   Result[2] = Rhs[0] + Rhs[1]
+  //   Result[3] = Rhs[2] + Rhs[3]
+  //   Result[4] = Lhs[4] + Lhs[5]
+  //   Result[5] = Lhs[6] + Lhs[7]
+  //   Result[6] = Rhs[4] + Rhs[5]
+  //   Result[7] = Rhs[6] + Rhs[7]
+  llvm::Value* AvxStyleHorizontalAdd(llvm::Value* lhs, llvm::Value* rhs);
+
+  std::vector<llvm::Value*> ComputeAvxOptimizedHorizontalSums(
+      std::vector<llvm::Value*> vectors, llvm::Value* init_values);
+
   int64 vector_size_;
   PrimitiveType primitive_type_;
   llvm::IRBuilder<>* ir_builder_;
@@ -142,7 +174,7 @@ class LlvmVariable {
  public:
   LlvmVariable(llvm::Type*, llvm::IRBuilder<>* ir_builder);
 
-  llvm::Value* Get();
+  llvm::Value* Get() const;
   void Set(llvm::Value* new_value);
 
  private:
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index 06f43bd3cb2376d34a3104133c868c4f4e5cc730..4071b948a5f94bcc2e87d8bb3b9533fb3b1d2cb1 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -118,10 +118,8 @@ StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
   TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
                       execute_backend_->stream_executor(device_ordinal));
 
-  std::vector<perftools::gputools::DeviceMemoryBase> argument_buffers(
-      argument_layouts.size());
   return BuildExecutable(versioned_handle, std::move(module_config),
-                         argument_buffers, execute_backend_.get(), executor);
+                         execute_backend_.get(), executor);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/name_uniquer.cc b/tensorflow/compiler/xla/service/name_uniquer.cc
index a0d08c288dbcc45e83a36ce7b094b04a9dbae532..7d8c05fffa4ab11d7dbf9956d2cb7ebd5bcdd3c4 100644
--- a/tensorflow/compiler/xla/service/name_uniquer.cc
+++ b/tensorflow/compiler/xla/service/name_uniquer.cc
@@ -17,12 +17,44 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 
+namespace {
+
+bool IsAllowed(char character) {
+  auto c = static_cast<unsigned char>(character);
+  return (isalnum(c) != 0) || c == '_' || c == '.' || c == '-';
+}
+
+}  // namespace
+
+NameUniquer::NameUniquer(const string& separator) {
+  CHECK(std::all_of(separator.begin(), separator.end(), IsAllowed))
+      << "separator should comprises allowed characters only";
+  separator_ = separator;
+}
+
+/*static*/ string NameUniquer::GetSanitizedName(const string& name) {
+  string result = name;
+  CHECK(!result.empty()) << "name should not be empty";
+  char c = static_cast<unsigned char>(result[0]);
+  if (!isalpha(c) && c != '_') {
+    result[0] = '_';
+  }
+  for (int i = 1; i < result.length(); i++) {
+    if (!IsAllowed(result[i])) {
+      result[i] = '_';
+    }
+  }
+  return result;
+}
+
 string NameUniquer::GetUniqueName(tensorflow::StringPiece prefix) {
   string root = prefix.empty() ? "name" : prefix.ToString();
+  root = GetSanitizedName(root);
 
   // Strip away numeric suffix (if any). Only recognize separator if it is in
   // the middle of the name.
diff --git a/tensorflow/compiler/xla/service/name_uniquer.h b/tensorflow/compiler/xla/service/name_uniquer.h
index ed379b52258463b960dea788721c2c4325ef0260..4139c2700b25e8600182a034a8ac6f4f041c12e6 100644
--- a/tensorflow/compiler/xla/service/name_uniquer.h
+++ b/tensorflow/compiler/xla/service/name_uniquer.h
@@ -28,14 +28,21 @@ namespace xla {
 // Simple stateful class that helps generate "unique" names. To use it, simply
 // call GetUniqueName as many times as needed. The names returned by
 // GetUniqueName are guaranteed to be distinct for this instance of the class.
+// Note that the names will be sanitized to match regexp
+// "[a-zA-Z_][a-zA-Z0-9_.-]*".
 class NameUniquer {
  public:
-  explicit NameUniquer(const string& separator = "__")
-      : separator_(separator) {}
+  // The separator must contain allowed characters only: "[a-zA-Z0-9_.-]".
+  explicit NameUniquer(const string& separator = "__");
 
-  // Get a unique name in a string, with an optional prefix for convenience.
+  // Get a sanitized unique name in a string, with an optional prefix for
+  // convenience.
   string GetUniqueName(tensorflow::StringPiece prefix = "");
 
+  // Sanitizes and returns the name. Unallowed characters will be replaced with
+  // '_'. The result will match the regexp "[a-zA-Z_][a-zA-Z0-9_.-]*".
+  static string GetSanitizedName(const string& name);
+
  private:
   // The string to use to separate the prefix of the name from the uniquing
   // integer value.
diff --git a/tensorflow/compiler/xla/service/name_uniquer_test.cc b/tensorflow/compiler/xla/service/name_uniquer_test.cc
index 9f0747a6e2175a968d8f3661ac51512009e86f29..4258cf16876ab46dce6df062ab701b1b1a4a7580 100644
--- a/tensorflow/compiler/xla/service/name_uniquer_test.cc
+++ b/tensorflow/compiler/xla/service/name_uniquer_test.cc
@@ -60,12 +60,30 @@ TEST_F(NameUniquerTest, NumericSuffixes) {
   EXPECT_EQ("bar", uniquer.GetUniqueName("bar.-1000"));
   EXPECT_EQ("bar.1", uniquer.GetUniqueName("bar.-2000"));
   EXPECT_EQ("bar.2", uniquer.GetUniqueName("bar.1"));
+}
+
+TEST_F(NameUniquerTest, Sanitize) {
+  NameUniquer uniquer("_");
+
+  EXPECT_EQ("foo", uniquer.GetUniqueName("foo"));
+  EXPECT_EQ("foo_1", uniquer.GetUniqueName("foo"));
+  EXPECT_EQ("foo.54", uniquer.GetUniqueName("foo.54"));
+  EXPECT_EQ("foo_54", uniquer.GetUniqueName("foo_54"));
+  EXPECT_EQ("foo_54.1", uniquer.GetUniqueName("foo_54.1"));
+  EXPECT_EQ("foo_55", uniquer.GetUniqueName("foo"));
+
+  // Invalid characters will be replaced with '_'.
+  EXPECT_EQ("bar", uniquer.GetUniqueName("bar<-1000"));
+  EXPECT_EQ("bar_1", uniquer.GetUniqueName("bar<-2000"));
+  EXPECT_EQ("bar_2", uniquer.GetUniqueName("bar_1"));
 
   // Separator is only recognized in the middle of the prefix.
-  EXPECT_EQ(".10", uniquer.GetUniqueName(".10"));
-  EXPECT_EQ(".10.1", uniquer.GetUniqueName(".10"));
-  EXPECT_EQ("foobar.", uniquer.GetUniqueName("foobar."));
-  EXPECT_EQ("foobar..1", uniquer.GetUniqueName("foobar."));
+  EXPECT_EQ("_10", uniquer.GetUniqueName(
+                       ".10"));  // the leading '.' is replaced with '_'.
+  EXPECT_EQ("_10_1", uniquer.GetUniqueName(".10"));
+  EXPECT_EQ("_10_2", uniquer.GetUniqueName("_10"));
+  EXPECT_EQ("foobar_", uniquer.GetUniqueName("foobar_"));
+  EXPECT_EQ("foobar__1", uniquer.GetUniqueName("foobar_"));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/platform_util.cc b/tensorflow/compiler/xla/service/platform_util.cc
index 3a1818de82d3fd305e2c6b3bd1f2cf8125806a75..aa974ee61a27de9c19e97d8a6eb48f9261ce4bd9 100644
--- a/tensorflow/compiler/xla/service/platform_util.cc
+++ b/tensorflow/compiler/xla/service/platform_util.cc
@@ -33,10 +33,32 @@ namespace se = ::perftools::gputools;
 
 namespace xla {
 
+using tensorflow::str_util::Lowercase;
+
 // Minimum supported CUDA compute capability is 3.5.
 constexpr int kMinCudaComputeCapabilityMajor = 3;
 constexpr int kMinCudaComputeCapabilityMinor = 5;
 
+// The name of the interpreter platform.
+constexpr char kInterpreter[] = "interpreter";
+
+namespace {
+
+string CanonicalPlatformName(const string& name) {
+  string platform_str = Lowercase(name);
+  // "cpu" and "host" mean the same thing.
+  if (platform_str == "cpu") {
+    platform_str = "host";
+  }
+  // "gpu" and "cuda" mean the same thing.
+  if (platform_str == "gpu") {
+    platform_str = "cuda";
+  }
+  return platform_str;
+}
+
+}  // namespace
+
 /* static */ StatusOr<std::vector<se::Platform*>>
 PlatformUtil::GetSupportedPlatforms() {
   se::MultiPlatformManager::PlatformMap platform_map;
@@ -78,7 +100,7 @@ PlatformUtil::GetSupportedPlatforms() {
   return platforms;
 }
 
-/* static */ StatusOr<se::Platform*> PlatformUtil::GetDefaultPlatform() {
+/* static */ StatusOr<se::Platform*> PlatformUtil::GetSolePlatform() {
   TF_ASSIGN_OR_RETURN(auto platforms, GetSupportedPlatforms());
   if (platforms.empty()) {
     return NotFound("no platforms found");
@@ -87,13 +109,77 @@ PlatformUtil::GetSupportedPlatforms() {
   }
 
   // Multiple platforms present and we can't pick a reasonable default.
-  auto l = [](string* out, const se::Platform* p) { out->append(p->Name()); };
-  string platforms_string = tensorflow::str_util::Join(platforms, ", ", l);
+  string platforms_string = tensorflow::str_util::Join(
+      platforms, ", ",
+      [](string* out, const se::Platform* p) { out->append(p->Name()); });
   return InvalidArgument(
       "must specify platform because more than one platform found: %s",
       platforms_string.c_str());
 }
 
+/* static */ StatusOr<se::Platform*> PlatformUtil::GetDefaultPlatform() {
+  TF_ASSIGN_OR_RETURN(auto platforms, GetSupportedPlatforms());
+  if (platforms.empty()) {
+    return NotFound("no platforms found");
+  } else if (platforms.size() == 1) {
+    return platforms[0];
+  } else if (platforms.size() == 2) {
+    for (int i = 0; i < 2; i++) {
+      if (Lowercase(platforms[i]->Name()) == kInterpreter &&
+          Lowercase(platforms[1 - i]->Name()) != kInterpreter) {
+        return platforms[1 - i];
+      }
+    }
+  }
+
+  // Multiple platforms present and we can't pick a reasonable default.
+  string platforms_string = tensorflow::str_util::Join(
+      platforms, ", ",
+      [](string* out, const se::Platform* p) { out->append(p->Name()); });
+  return InvalidArgument(
+      "must specify platform because more than one platform (except for the "
+      "interpreter platform) found: %s",
+      platforms_string.c_str());
+}
+
+/*static*/ StatusOr<se::Platform*> PlatformUtil::GetPlatform(
+    const string& platform_name) {
+  string platform_str = CanonicalPlatformName(platform_name);
+  TF_ASSIGN_OR_RETURN(auto platforms, PlatformUtil::GetSupportedPlatforms());
+  for (se::Platform* platform : platforms) {
+    if (Lowercase(platform->Name()) == platform_str) {
+      return platform;
+    }
+  }
+  return InvalidArgument("platform %s not found", platform_name.c_str());
+}
+
+/*static*/ StatusOr<se::Platform*> PlatformUtil::GetPlatformExceptFor(
+    const string& platform_name) {
+  string platform_str = CanonicalPlatformName(platform_name);
+
+  TF_ASSIGN_OR_RETURN(auto platforms, PlatformUtil::GetSupportedPlatforms());
+  std::vector<se::Platform*> matched;
+  for (se::Platform* platform : platforms) {
+    if (Lowercase(platform->Name()) != platform_name) {
+      matched.push_back(platform);
+    }
+  }
+  if (matched.empty()) {
+    return InvalidArgument("unable to find platform that is not %s",
+                           platform_name.c_str());
+  }
+  if (matched.size() == 1) {
+    return matched[0];
+  }
+  string matched_string = tensorflow::str_util::Join(
+      matched, ", ",
+      [](string* out, const se::Platform* p) { out->append(p->Name()); });
+  return InvalidArgument(
+      "found multiple platforms %s, but expected one platform except for %s",
+      matched_string.c_str(), platform_name.c_str());
+}
+
 // Returns whether the device underlying the given StreamExecutor is supported
 // by XLA.
 static bool IsDeviceSupported(se::StreamExecutor* executor) {
diff --git a/tensorflow/compiler/xla/service/platform_util.h b/tensorflow/compiler/xla/service/platform_util.h
index eac573703085aca2801885cd9abbe0022f1c029e..69188820a70707d9c9be10b20fb7de92ad4d9873 100644
--- a/tensorflow/compiler/xla/service/platform_util.h
+++ b/tensorflow/compiler/xla/service/platform_util.h
@@ -16,11 +16,14 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_PLATFORM_UTIL_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_PLATFORM_UTIL_H_
 
+#include <string>
 #include <vector>
 
 #include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace xla {
 
@@ -34,10 +37,27 @@ class PlatformUtil {
   static StatusOr<std::vector<perftools::gputools::Platform*>>
   GetSupportedPlatforms();
 
-  // Convenience function which returns the default supported platform. If
+  // Convenience function which returns the default supported platform for
+  // tests. If exactly one supported platform is present, then this platform is
+  // the default platform. If exactly two platforms are present and one of them
+  // is the interpreter platform, then the other platform is the default
+  // platform. Otherwise returns an error.
+  static StatusOr<perftools::gputools::Platform*> GetDefaultPlatform();
+
+  // Convenience function which returns the sole supported platform. If
   // exactly one supported platform is present, then this platform is the
   // default platform. Otherwise returns an error.
-  static StatusOr<perftools::gputools::Platform*> GetDefaultPlatform();
+  static StatusOr<perftools::gputools::Platform*> GetSolePlatform();
+
+  // Returns the platform according to the given name. Returns error if there is
+  // no such platform.
+  static StatusOr<perftools::gputools::Platform*> GetPlatform(
+      const string& platform_name);
+
+  // Returns exactly one platform that does not have given name. Returns error
+  // if there is no such platform, or there are multiple such platforms.
+  static StatusOr<perftools::gputools::Platform*> GetPlatformExceptFor(
+      const string& platform_name);
 
   // Returns a vector of StreamExecutors for the given platform. The vector is
   // indexed by device ordinal (device numbering used by StreamExecutor). If an
diff --git a/tensorflow/compiler/xla/service/reshape_mover.cc b/tensorflow/compiler/xla/service/reshape_mover.cc
index 0fb90230f2f39a841973361f63d17af579a1342b..e62bafc50b0e1270702621c9ea7b2ee43e001fe0 100644
--- a/tensorflow/compiler/xla/service/reshape_mover.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover.cc
@@ -101,8 +101,9 @@ HloInstruction* FirstNonScalarAndNonTrivialReshapeOperand(
         IsReshapeOrTranspose(operand) &&
         !CanTriviallyChangeShape(operand->operand(0))) {
       VLOG(5) << "Found first non-scalar and non-trivial reshape operand of "
-              << hlo->ToStringNoMetadata() << ":\n\t"
-              << operand->ToStringNoMetadata();
+              << hlo->ToString(HloPrintOptions().set_print_metadata(false))
+              << ":\n\t"
+              << operand->ToString(HloPrintOptions().set_print_metadata(false));
       return operand;
     }
   }
@@ -133,8 +134,9 @@ bool AreEquivalentReshapes(const HloInstruction* a, const HloInstruction* b) {
 bool AllOperandsHaveEasyShapeChanges(
     const HloInstruction* instruction,
     const HloInstruction* first_reshape_operand) {
+  auto print_no_metadata = HloPrintOptions().set_print_metadata(false);
   VLOG(3) << "** Checking whether all operands have easy shape changes: "
-          << instruction->ToStringNoMetadata();
+          << instruction->ToString(print_no_metadata);
   // Check whether all operands:
   //    0. Have the same dimensions as the output -- if not, it may be
   //       implicitly broadcast, which can confound the movement's
@@ -151,21 +153,21 @@ bool AllOperandsHaveEasyShapeChanges(
       VLOG(5) << "Operand shape differs from output shape; may be "
                  "implicitly broadcast, so preventing "
                  "movement\n\toperand: "
-              << operand->ToStringNoMetadata()
-              << "\n\tinstruction: " << instruction->ToStringNoMetadata();
+              << operand->ToString(print_no_metadata) << "\n\tinstruction: "
+              << instruction->ToString(print_no_metadata);
       return false;
     }
 
     if (AreEquivalentReshapes(first_reshape_operand, operand)) {
       VLOG(5) << "Are equivalent reshapes:\n\tfirst_reshape_operand: "
-              << first_reshape_operand->ToStringNoMetadata()
-              << "\n\toperand: " << operand->ToStringNoMetadata();
+              << first_reshape_operand->ToString(print_no_metadata)
+              << "\n\toperand: " << operand->ToString(print_no_metadata);
       continue;
     }
 
     if (CanTriviallyChangeShape(operand)) {
       VLOG(5) << "Operand can trivially change shape: "
-              << operand->ToStringNoMetadata();
+              << operand->ToString(print_no_metadata);
       continue;
     }
 
@@ -173,12 +175,12 @@ bool AllOperandsHaveEasyShapeChanges(
     // well.
     VLOG(5) << "Operand is neither equalivant to the first Reshape operand"
                "nor can trivially change shape: "
-            << operand->ToStringNoMetadata();
+            << operand->ToString(print_no_metadata);
     return false;
   }
 
   VLOG(3) << "All operands have easy shape changes: "
-          << instruction->ToStringNoMetadata();
+          << instruction->ToString(print_no_metadata);
   return true;
 }
 
@@ -250,11 +252,13 @@ StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
     return false;
   }
 
+  auto print_no_metadata = HloPrintOptions().set_print_metadata(false);
   // At this point we've decided to sink reshape/transpose operands.
   const Shape& new_operand_shape = first_reshape_operand->operand(0)->shape();
   VLOG(3) << "** Sinking reshape or transpose: "
-          << instruction->ToStringNoMetadata() << "\n\tfirst reshape operand: "
-          << first_reshape_operand->ToStringNoMetadata()
+          << instruction->ToString(print_no_metadata)
+          << "\n\tfirst reshape operand: "
+          << first_reshape_operand->ToString(print_no_metadata)
           << "\n\tnew operand shape: "
           << ShapeUtil::HumanString(new_operand_shape);
 
@@ -267,7 +271,7 @@ StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
       continue;
     }
     VLOG(3) << "Updating operand #" << i << ": "
-            << operands[i]->ToStringNoMetadata();
+            << operands[i]->ToString(print_no_metadata);
     operands[i] = UpdateOperand(computation, first_reshape_operand,
                                 new_operand_shape, operands[i]);
   }
@@ -298,7 +302,7 @@ StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
   switch (first_reshape_operand->opcode()) {
     case HloOpcode::kReshape:
       VLOG(3) << "Creating new reshape for new elementwise op: "
-              << new_elementwise->ToStringNoMetadata();
+              << new_elementwise->ToString(print_no_metadata);
       new_reshape =
           HloInstruction::CreateReshape(instruction->shape(), new_elementwise);
       break;
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index ee9501dd4839ffcb6052df14699aad90565ae0e2..e77a46128b1dadbeea0df64a19f5ba980257cf8c 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -60,41 +60,32 @@ namespace xla {
 
 namespace {
 
-// Copies the contents of an Allocation into a Literal proto.
-tensorflow::Status LiteralFromAllocation(const Allocation* allocation,
-                                         const Shape& literal_shape,
-                                         Literal* literal) {
-  TF_ASSIGN_OR_RETURN(
-      se::StreamExecutor * executor,
-      allocation->backend()->stream_executor(allocation->device_ordinal()));
-  return allocation->backend()->transfer_manager()->TransferLiteralFromDevice(
-      executor, allocation->device_memory(), allocation->shape(), literal_shape,
-      literal);
-}
-
 // Records the arguments used to invoke a computation in a SessionModule
 // proto.
 tensorflow::Status RecordArguments(
-    const tensorflow::gtl::ArraySlice<const Allocation*> arg_allocations,
+    const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
+    se::StreamExecutor* executor, TransferManager* transfer_manager,
     SessionModule* module) {
   module->clear_arguments();
-  for (const Allocation* allocation : arg_allocations) {
-    Literal argument;
-    TF_RETURN_IF_ERROR(
-        LiteralFromAllocation(allocation, allocation->shape(), &argument));
-    *module->add_arguments() = argument.ToProto();
+  for (const ShapedBuffer* argument : arguments) {
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<Literal> literal,
+        transfer_manager->TransferLiteralFromDevice(executor, *argument));
+    *module->add_arguments() = literal->ToProto();
   }
   return tensorflow::Status::OK();
 }
 
 // Records the result of a computation in a SessionModule proto.
-tensorflow::Status RecordResult(const Allocation* result_allocation,
+tensorflow::Status RecordResult(const ShapedBuffer& result,
+                                se::StreamExecutor* executor,
+                                TransferManager* transfer_manager,
                                 SessionModule* module) {
   module->clear_result();
-  Literal result;
-  TF_RETURN_IF_ERROR(LiteralFromAllocation(
-      result_allocation, result_allocation->shape(), &result));
-  *module->mutable_result() = result.ToProto();
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<Literal> literal,
+      transfer_manager->TransferLiteralFromDevice(executor, result));
+  *module->mutable_result() = literal->ToProto();
   return tensorflow::Status::OK();
 }
 
@@ -152,7 +143,9 @@ int ServiceOptions::intra_op_parallelism_threads() const {
 
 Service::Service(const ServiceOptions& options,
                  std::unique_ptr<Backend> execute_backend)
-    : options_(options), execute_backend_(std::move(execute_backend)) {
+    : options_(options),
+      allocation_tracker_(execute_backend.get()),
+      execute_backend_(std::move(execute_backend)) {
   CHECK_GT(options_.number_of_replicas(), 0);
   if (execute_backend_) {
     if (execute_backend_->device_count() > 0) {
@@ -235,35 +228,33 @@ tensorflow::Status Service::ValidateResultShapeWithLayout(
   return ShapeUtil::ValidateShape(shape_with_layout);
 }
 
-StatusOr<std::vector<const Allocation*>> Service::ResolveAndValidateArguments(
+StatusOr<std::vector<const ShapedBuffer*>> Service::ResolveAndValidateArguments(
     tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
-    const Backend* backend, int device_ordinal) {
-  std::vector<const Allocation*> allocations;
+    int device_ordinal) {
+  std::vector<const ShapedBuffer*> shaped_buffers;
   for (size_t i = 0; i < arguments.size(); ++i) {
-    auto allocation_status = allocation_tracker_.Resolve(*arguments[i]);
-    if (!allocation_status.ok()) {
-      return Status(allocation_status.status().code(),
-                    StrCat(allocation_status.status().error_message(), ", ",
+    auto buffer_status = allocation_tracker_.Resolve(*arguments[i]);
+    if (!buffer_status.ok()) {
+      return Status(buffer_status.status().code(),
+                    StrCat(buffer_status.status().error_message(), ", ",
                            "failed to resolve allocation for parameter ", i));
     }
-    const Allocation* allocation = allocation_status.ValueOrDie();
+    const ShapedBuffer* shaped_buffer = buffer_status.ValueOrDie();
 
     // Verify allocation is same platform and device as the execution.
-    if (allocation->backend() != backend ||
-        allocation->device_ordinal() != device_ordinal) {
+    if (shaped_buffer->platform() != execute_backend_->platform() ||
+        shaped_buffer->device_ordinal() != device_ordinal) {
       return InvalidArgument(
-          "argument %lu is on device %s but computation will be executed "
+          "argument %lu is on device %s:%d but computation will be executed "
           "on device %s",
-          i,
-          allocation->backend()
-              ->device_name(allocation->device_ordinal())
-              .c_str(),
-          backend->device_name(device_ordinal).c_str());
+          i, shaped_buffer->platform()->Name().c_str(),
+          shaped_buffer->device_ordinal(),
+          execute_backend_->device_name(device_ordinal).c_str());
     }
 
-    allocations.push_back(allocation);
+    shaped_buffers.push_back(shaped_buffer);
   }
-  return allocations;
+  return shaped_buffers;
 }
 
 StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
@@ -325,11 +316,11 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
 
 StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     const ProgramShape& program_shape,
-    tensorflow::gtl::ArraySlice<const Allocation*> arguments,
+    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     const ExecutionOptions& execution_options) {
   std::vector<const Shape*> argument_shapes;
   for (const auto* arg : arguments) {
-    argument_shapes.push_back(&arg->shape());
+    argument_shapes.push_back(&arg->on_host_shape());
   }
   return CreateModuleConfig(program_shape, argument_shapes, &execution_options);
 }
@@ -398,8 +389,6 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
 StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
     const VersionedComputationHandle& versioned_handle,
     std::unique_ptr<HloModuleConfig> module_config,
-    const tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-        arguments,
     Backend* backend, se::StreamExecutor* executor) {
   VLOG(1) << Printf("BuildExecutable on service %p with handle %s", this,
                     versioned_handle.ToString().c_str());
@@ -430,9 +419,12 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
                                           /*include_unreachable_instructions=*/
                                           true));
 
+  TF_ASSIGN_OR_RETURN(
+      module, backend->compiler()->RunHloPasses(std::move(module), executor));
+
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<Executable> executable,
-      backend->compiler()->Compile(std::move(module), executor));
+      backend->compiler()->RunBackend(std::move(module), executor));
 
   if (!other_directory_path.empty()) {
     executable->set_session_module(std::move(session_module));
@@ -444,8 +436,6 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
 StatusOr<std::shared_ptr<Executable>> Service::BuildAndCacheExecutable(
     const VersionedComputationHandle& versioned_handle,
     std::unique_ptr<HloModuleConfig> module_config,
-    const tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-        arguments,
     Backend* backend, perftools::gputools::StreamExecutor* executor,
     ExecutionProfile* profile) {
   std::shared_ptr<Executable> executable =
@@ -468,8 +458,8 @@ StatusOr<std::shared_ptr<Executable>> Service::BuildAndCacheExecutable(
   HloModuleConfig original_module_config = *module_config;
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<Executable> executable_unique_ptr,
-      BuildExecutable(versioned_handle, std::move(module_config), arguments,
-                      backend, executor));
+      BuildExecutable(versioned_handle, std::move(module_config), backend,
+                      executor));
 
   if (profile != nullptr) {
     uint64 end_micros = tensorflow::Env::Default()->NowMicros();
@@ -486,9 +476,7 @@ StatusOr<std::shared_ptr<Executable>> Service::BuildAndCacheExecutable(
 StatusOr<std::vector<GlobalDataHandle>>
 Service::ExecuteParallelAndRegisterResult(
     tensorflow::gtl::ArraySlice<Executable*> executables,
-    tensorflow::gtl::ArraySlice<
-        std::vector<perftools::gputools::DeviceMemoryBase>>
-        arguments,
+    tensorflow::gtl::ArraySlice<std::vector<const ShapedBuffer*>> arguments,
     Backend* backend, tensorflow::gtl::ArraySlice<DeviceHandle> device_handles,
     tensorflow::gtl::ArraySlice<string> result_tags,
     ExecutionProfile* profile) {
@@ -544,7 +532,7 @@ Service::ExecuteParallelAndRegisterResult(
 
       // Asynchronously launch the computation.
       TF_ASSIGN_OR_RETURN(
-          perftools::gputools::DeviceMemoryBase result,
+          std::unique_ptr<ShapedBuffer> result,
           executables[i]->ExecuteAsyncOnStream(&run_options, arguments[i]));
 
       if (replica == 0 && profile != nullptr) {
@@ -554,17 +542,20 @@ Service::ExecuteParallelAndRegisterResult(
       // All replicas share the same device address for the result allocation,
       // so only one of the replicas need to register the result handle.
       if (replica == 0) {
-        result_handles.push_back(allocation_tracker_.Register(
-            backend, replicas[0]->device_ordinal(), result,
-            executables[i]->result_shape(), result_tags[i]));
+        TF_ASSIGN_OR_RETURN(
+            GlobalDataHandle handle,
+            allocation_tracker_.Register(std::move(result), result_tags[i]));
+        result_handles.push_back(handle);
       }
     }
   }
 
   // Wait for all executions to complete.
   for (int64 i = 0; i < streams.size(); ++i) {
-    if (!streams[i]->BlockHostUntilDone()) {
-      return InternalError("failed to complete execution for stream %lld", i);
+    Status block_status = streams[i]->BlockHostUntilDone();
+    if (!block_status.ok()) {
+      return InternalError("failed to complete execution for stream %lld: %s",
+                           i, block_status.error_message().c_str());
     }
   }
 
@@ -572,12 +563,13 @@ Service::ExecuteParallelAndRegisterResult(
   // profile.
   for (auto& index_to_profiled_stream : index_to_profiled_streams) {
     int64 device = index_to_profiled_stream.first;
-    auto& module = executables[device]->module();
     se::Stream* stream = index_to_profiled_stream.second;
-    HloExecutionProfile hlo_profile(module,
-                                    *executables[device]->CreateCostAnalysis());
-    TF_RETURN_IF_ERROR(executables[device]->PopulateExecutionProfile(
-        &hlo_profile, stream->parent()));
+    Executable* executable = executables[device];
+    const HloModule& module = executable->module();
+    HloExecutionProfile hlo_profile(&executable->hlo_profile_printer(),
+                                    &executable->hlo_profile_index_map());
+    TF_RETURN_IF_ERROR(
+        executable->PopulateExecutionProfile(&hlo_profile, stream->parent()));
     XLA_LOG_LINES(
         tensorflow::INFO,
         hlo_profile.ToString(streams[0]->parent()->GetDeviceDescription()));
@@ -621,8 +613,7 @@ Service::ExecuteParallelAndRegisterResult(
 
 StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
     Executable* executable,
-    const tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-        arguments,
+    const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     Backend* backend, perftools::gputools::StreamExecutor* executor,
     const string& result_tag, ExecutionProfile* profile) {
   // Set up streams.
@@ -647,6 +638,7 @@ StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
   for (const Pool<se::Stream>::SmartPtr& stream : streams) {
     ExecutableRunOptions options;
     options.set_stream(stream.get());
+    options.set_device_ordinal(stream->parent()->device_ordinal());
     options.set_allocator(backend->memory_allocator());
     options.set_inter_op_thread_pool(backend->inter_op_thread_pool());
     options.set_intra_op_thread_pool(
@@ -656,24 +648,23 @@ StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
                              backend->inter_op_thread_pool());
   }
 
-  perftools::gputools::DeviceMemoryBase result;
+  std::unique_ptr<ShapedBuffer> result;
   if (options_.number_of_replicas() == 1) {
     TF_ASSIGN_OR_RETURN(
-        result, executable->ExecuteOnStreamWrapper<se::DeviceMemoryBase>(
-                    &run_options[0], profile, arguments));
+        result,
+        executable->ExecuteOnStreamWrapper<std::unique_ptr<ShapedBuffer>>(
+            &run_options[0], profile, arguments));
   } else {
-    std::vector<
-        tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>>
+    // TODO(b/69985541): Support profiling also on this path.
+    std::vector<tensorflow::gtl::ArraySlice<const ShapedBuffer*>>
         repeated_arguments(options_.number_of_replicas(), arguments);
 
     TF_ASSIGN_OR_RETURN(auto results, executable->ExecuteOnStreams(
                                           run_options, repeated_arguments));
     TF_RET_CHECK(!results.empty());
-    result = results[0];
+    result = std::move(results[0]);
   }
-  return allocation_tracker_.Register(backend, executor->device_ordinal(),
-                                      result, executable->result_shape(),
-                                      result_tag);
+  return allocation_tracker_.Register(std::move(result), result_tag);
 }
 
 tensorflow::Status Service::SetReturnValue(const SetReturnValueRequest* arg,
@@ -687,7 +678,7 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
                                             ExecuteParallelResponse* result) {
   VLOG(1) << "running execute-parallel request: " << arg->ShortDebugString();
 
-  std::vector<std::vector<se::DeviceMemoryBase>> all_arguments;
+  std::vector<std::vector<const ShapedBuffer*>> all_arguments;
   std::vector<std::vector<perftools::gputools::StreamExecutor*>> all_executors;
   std::vector<VersionedComputationHandle> versioned_handles;
   std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
@@ -744,19 +735,14 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
     // In the case of partitioned computations, assume all arguments go on the
     // zeroth core.
     TF_ASSIGN_OR_RETURN(
-        std::vector<const Allocation*> arg_allocations,
-        ResolveAndValidateArguments(request.arguments(), execute_backend_.get(),
+        std::vector<const ShapedBuffer*> arguments,
+        ResolveAndValidateArguments(request.arguments(),
                                     executors[0]->device_ordinal()));
-    std::vector<se::DeviceMemoryBase> arguments;
-    arguments.reserve(arg_allocations.size());
-    for (const Allocation* allocation : arg_allocations) {
-      arguments.push_back(allocation->device_memory());
-    }
 
     // Create an HloModuleConfig object for the computation, given the shape of
     // the program and the argument allocations.
     TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                        CreateModuleConfig(*program_shape, arg_allocations,
+                        CreateModuleConfig(*program_shape, arguments,
                                            request.execution_options()));
     VLOG(3) << "ExecuteParallel created HloModuleConfig computation layout: "
             << module_config->entry_computation_layout().ToString();
@@ -859,35 +845,30 @@ tensorflow::Status Service::Execute(const ExecuteRequest* arg,
       user_computation->ComputeProgramShape(versioned_handle.version));
 
   TF_ASSIGN_OR_RETURN(
-      std::vector<const Allocation*> arg_allocations,
-      ResolveAndValidateArguments(arg->arguments(), execute_backend_.get(),
+      std::vector<const ShapedBuffer*> arguments,
+      ResolveAndValidateArguments(arg->arguments(),
                                   execute_backend_->default_device_ordinal()));
 
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                      CreateModuleConfig(*program_shape, arg_allocations,
-                                         arg->execution_options()));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<HloModuleConfig> module_config,
+      CreateModuleConfig(*program_shape, arguments, arg->execution_options()));
 
   VLOG(3) << "Execute created HloModuleConfig computation layout: "
           << module_config->entry_computation_layout().ToString();
 
-  std::vector<se::DeviceMemoryBase> arguments;
-  arguments.reserve(arg_allocations.size());
-  for (const Allocation* allocation : arg_allocations) {
-    arguments.push_back(allocation->device_memory());
-  }
-
   TF_ASSIGN_OR_RETURN(
       std::shared_ptr<Executable> executable,
       BuildAndCacheExecutable(versioned_handle, std::move(module_config),
-                              arguments, execute_backend_.get(),
+                              execute_backend_.get(),
                               execute_backend_->default_stream_executor(),
                               result->mutable_profile()));
 
   if (executable->dumping()) {
     executable->session_module()->set_execution_platform(
         execute_backend_->platform()->Name());
-    TF_RETURN_IF_ERROR(
-        RecordArguments(arg_allocations, executable->session_module()));
+    TF_RETURN_IF_ERROR(RecordArguments(
+        arguments, execute_backend_->default_stream_executor(),
+        execute_backend_->transfer_manager(), executable->session_module()));
   }
 
   TF_ASSIGN_OR_RETURN(
@@ -898,10 +879,11 @@ tensorflow::Status Service::Execute(const ExecuteRequest* arg,
           "result of " + user_computation->name(), result->mutable_profile()));
 
   if (executable->dumping()) {
-    TF_ASSIGN_OR_RETURN(const Allocation* result_allocation,
+    TF_ASSIGN_OR_RETURN(const ShapedBuffer* result_buffer,
                         allocation_tracker_.Resolve(result->output()));
-    TF_RETURN_IF_ERROR(
-        RecordResult(result_allocation, executable->session_module()));
+    TF_RETURN_IF_ERROR(RecordResult(
+        *result_buffer, execute_backend_->default_stream_executor(),
+        execute_backend_->transfer_manager(), executable->session_module()));
     TF_RETURN_IF_ERROR(executable->DumpSessionModule());
   }
 
@@ -927,31 +909,24 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
       user_computation->ComputeProgramShape(versioned_handle.version));
 
   TF_ASSIGN_OR_RETURN(
-      std::vector<const Allocation*> arg_allocations,
-      ResolveAndValidateArguments(arg->arguments(), execute_backend_.get(),
+      std::vector<const ShapedBuffer*> arguments,
+      ResolveAndValidateArguments(arg->arguments(),
                                   execute_backend_->default_device_ordinal()));
 
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                      CreateModuleConfig(*program_shape, arg_allocations,
-                                         arg->execution_options()));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<HloModuleConfig> module_config,
+      CreateModuleConfig(*program_shape, arguments, arg->execution_options()));
 
   VLOG(3) << "ExecuteAsync created HloModuleConfig computation layout: "
           << module_config->entry_computation_layout().ToString();
 
-  std::vector<se::DeviceMemoryBase> arguments;
-  arguments.reserve(arg_allocations.size());
-  for (const Allocation* allocation : arg_allocations) {
-    arguments.push_back(allocation->device_memory());
-  }
-
   ExecutionProfile profile;
 
   TF_ASSIGN_OR_RETURN(
       std::shared_ptr<Executable> executable,
-      BuildAndCacheExecutable(versioned_handle, std::move(module_config),
-                              arguments, execute_backend_.get(),
-                              execute_backend_->default_stream_executor(),
-                              &profile));
+      BuildAndCacheExecutable(
+          versioned_handle, std::move(module_config), execute_backend_.get(),
+          execute_backend_->default_stream_executor(), &profile));
 
   TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*execute_backend_,
                                               SingleComputationDeviceHandle()));
@@ -966,7 +941,7 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
     streams.push_back(std::move(stream));
   }
 
-  perftools::gputools::DeviceMemoryBase result_data;
+  std::unique_ptr<ShapedBuffer> result_buffer;
   for (const Pool<se::Stream>::SmartPtr& stream : streams) {
     ExecutableRunOptions options;
     options.set_stream(stream.get());
@@ -979,19 +954,19 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
         options, execute_backend_->StreamBorrower());
 
     TF_ASSIGN_OR_RETURN(
-        perftools::gputools::DeviceMemoryBase this_result_data,
+        std::unique_ptr<ShapedBuffer> this_result_buffer,
         executable->ExecuteAsyncOnStream(&service_options, arguments));
 
     // Take the first result.
-    if (result_data == nullptr) {
-      result_data = this_result_data;
+    if (result_buffer == nullptr) {
+      result_buffer = std::move(this_result_buffer);
     }
   }
 
-  auto output = allocation_tracker_.Register(
-      execute_backend_.get(), execute_backend_->default_device_ordinal(),
-      result_data, executable->result_shape(),
-      "result of " + user_computation->name());
+  TF_ASSIGN_OR_RETURN(
+      GlobalDataHandle output,
+      allocation_tracker_.Register(std::move(result_buffer),
+                                   "result of " + user_computation->name()));
 
   *result->mutable_execution() = execution_tracker_.Register(
       execute_backend_.get(), std::move(streams), profile, output);
@@ -1018,38 +993,58 @@ tensorflow::Status Service::WaitForExecution(const WaitForExecutionRequest* arg,
 
 tensorflow::Status Service::TransferToClient(const TransferToClientRequest* arg,
                                              TransferToClientResponse* result) {
-  TF_ASSIGN_OR_RETURN(const Allocation* allocation,
+  TF_ASSIGN_OR_RETURN(const ShapedBuffer* shaped_buffer,
                       allocation_tracker_.Resolve(arg->data()));
 
-  const Shape* literal_shape;
+  const Shape* return_shape;
   if (arg->has_shape_with_layout()) {
     if (!LayoutUtil::HasLayout(arg->shape_with_layout())) {
       return InvalidArgument("shape_with_layout must have layout if present.");
     }
-    literal_shape = &arg->shape_with_layout();
+    return_shape = &arg->shape_with_layout();
   } else {
-    literal_shape = &allocation->shape();
+    return_shape = &shaped_buffer->on_host_shape();
   }
 
-  Literal literal;
-  TF_RETURN_IF_ERROR(
-      LiteralFromAllocation(allocation, *literal_shape, &literal));
-  *result->mutable_literal() = literal.ToProto();
+  TF_ASSIGN_OR_RETURN(
+      se::StreamExecutor * executor,
+      execute_backend_->stream_executor(shaped_buffer->device_ordinal()));
+
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<Literal> result_literal,
+      execute_backend_->transfer_manager()->TransferLiteralFromDevice(
+          executor, *shaped_buffer));
+
+  if (LayoutUtil::LayoutsInShapesEqual(*return_shape,
+                                       result_literal->shape())) {
+    *result->mutable_literal() = result_literal->ToProto();
+  } else {
+    *result->mutable_literal() =
+        result_literal->Relayout(*return_shape)->ToProto();
+  }
   return tensorflow::Status::OK();
 }
 
+namespace {
+
+// Creates a clone of the given shaped buffer with the given device ordinal. The
+// shape and DeviceMemoryBase values of the clone are identical to the original.
+std::unique_ptr<ShapedBuffer> CloneShapedBufferOnDevice(
+    const ShapedBuffer& shaped_buffer, int device_ordinal) {
+  auto clone = MakeUnique<ShapedBuffer>(
+      shaped_buffer.on_host_shape(), shaped_buffer.on_device_shape(),
+      shaped_buffer.platform(), device_ordinal);
+  clone->buffers() = shaped_buffer.buffers();
+  return clone;
+}
+
+}  // namespace
+
 tensorflow::Status Service::TransferToServer(const TransferToServerRequest* arg,
                                              TransferToServerResponse* result) {
   Literal literal = Literal(arg->literal());
   const Shape& shape = literal.shape();
 
-  if (ShapeUtil::IsTuple(shape) && options_.number_of_replicas() > 1) {
-    // TODO(b/32990684): Tuple transfers to host end up allocating further
-    // buffers - implement that correctly.
-    return Unimplemented(
-        "Tuple transfers to the device not supported with replication.");
-  }
-
   std::vector<se::StreamExecutor*> replicas;
   if (arg->has_device_handle()) {
     TF_ASSIGN_OR_RETURN(replicas,
@@ -1059,25 +1054,38 @@ tensorflow::Status Service::TransferToServer(const TransferToServerRequest* arg,
         replicas, Replicas(*execute_backend_, SingleComputationDeviceHandle()));
   }
 
-  // Allocate memory on the device, using the stream executor. The size of the
-  // allocation is obtained by examining the shape of the literal passed from
-  // the client. An allocation handle is returned in the response.
-  int64 allocation_size =
-      execute_backend_->transfer_manager()->GetByteSizeRequirement(shape);
-
-  TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase allocation,
-                      execute_backend_->memory_allocator()->Allocate(
-                          replicas[0]->device_ordinal(), allocation_size));
-
-  *result->mutable_data() = allocation_tracker_.Register(
-      execute_backend_.get(), replicas[0]->device_ordinal(), allocation, shape,
-      StrCat("TransferToServer literal of size ", allocation_size));
+  // All memory allocation is done on the first replica. The allocations in all
+  // other replicas mirror the firsts'.
+  int master_device_ordinal = replicas[0]->device_ordinal();
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<ShapedBuffer> shaped_buffer,
+      execute_backend_->transfer_manager()->AllocateShapedBuffer(
+          shape, execute_backend_->memory_allocator(), master_device_ordinal));
 
+  // Transfer the data to the replicas.
   for (se::StreamExecutor* executor : replicas) {
-    TF_RETURN_IF_ERROR(
-        execute_backend_->transfer_manager()->TransferLiteralToDevice(
-            executor, literal, &allocation));
+    if (executor->device_ordinal() == master_device_ordinal) {
+      TF_RETURN_IF_ERROR(
+          execute_backend_->transfer_manager()->TransferLiteralToDevice(
+              executor, literal, *shaped_buffer));
+    } else {
+      // The replica is not the master. Create an cloned shaped buffer with
+      // the replica's device ordinal. This is required because
+      // TransferLiteralToDevice verifies that the device ordinal of the shaped
+      // buffer matches that of the executor.
+      std::unique_ptr<ShapedBuffer> clone =
+          CloneShapedBufferOnDevice(*shaped_buffer, executor->device_ordinal());
+      TF_RETURN_IF_ERROR(
+          execute_backend_->transfer_manager()->TransferLiteralToDevice(
+              executor, literal, *clone));
+    }
   }
+  TF_ASSIGN_OR_RETURN(
+      *result->mutable_data(),
+      allocation_tracker_.Register(std::move(shaped_buffer),
+                                   StrCat("TransferToServer literal of shape ",
+                                          ShapeUtil::HumanString(shape))));
+
   return tensorflow::Status::OK();
 }
 
@@ -1228,8 +1236,9 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
                  [](const Literal& literal) { return &literal; });
 
   HloEvaluator evaluator;
-  TF_ASSIGN_OR_RETURN(auto result_literal,
-                      evaluator.Evaluate(*module, parameter_ptrs));
+  TF_ASSIGN_OR_RETURN(auto result_literal, evaluator.Evaluate<const Literal*>(
+                                               *module, parameter_ptrs));
+
   // Since the shape_with_output_layout option in ExecutionOption is
   // non-effective to the Evaluator results, explicit relayout here.
   if (arg->has_output_layout()) {
@@ -1242,9 +1251,9 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
 
 tensorflow::Status Service::GetShape(const GetShapeRequest* arg,
                                      GetShapeResponse* result) {
-  TF_ASSIGN_OR_RETURN(const Allocation* allocation,
+  TF_ASSIGN_OR_RETURN(const ShapedBuffer* buffer,
                       allocation_tracker_.Resolve(arg->data()));
-  *result->mutable_shape() = allocation->shape();
+  *result->mutable_shape() = buffer->on_host_shape();
   return tensorflow::Status::OK();
 }
 
@@ -1353,6 +1362,17 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
       handle_status =
           computation->AddConcatenateInstruction(arg->concatenate_request());
       break;
+    case OpRequest::kConditionalRequest: {
+      TF_ASSIGN_OR_RETURN(UserComputation * true_computation,
+                          computation_tracker_.Resolve(
+                              arg->conditional_request().true_computation()));
+      TF_ASSIGN_OR_RETURN(UserComputation * false_computation,
+                          computation_tracker_.Resolve(
+                              arg->conditional_request().false_computation()));
+      handle_status = computation->AddConditionalInstruction(
+          arg->conditional_request(), *true_computation, *false_computation);
+      break;
+    }
     case OpRequest::kConstantRequest:
       handle_status =
           computation->AddConstantInstruction(arg->constant_request());
@@ -1361,6 +1381,10 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
       handle_status =
           computation->AddConvertInstruction(arg->convert_request());
       break;
+    case OpRequest::kBitcastConvertRequest:
+      handle_status = computation->AddBitcastConvertInstruction(
+          arg->bitcast_convert_request());
+      break;
     case OpRequest::kConvolveRequest:
       handle_status =
           computation->AddConvolveInstruction(arg->convolve_request());
@@ -1373,6 +1397,9 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
       handle_status =
           computation->AddCustomCallInstruction(arg->custom_call_request());
       break;
+    case OpRequest::kDotRequest:
+      handle_status = computation->AddDotInstruction(arg->dot_request());
+      break;
     case OpRequest::kDynamicSliceRequest:
       handle_status =
           computation->AddDynamicSliceInstruction(arg->dynamic_slice_request());
@@ -1493,8 +1520,12 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
       handle_status = computation->AddRecvInstruction(arg->recv_request());
       break;
     }
+    case OpRequest::kFftRequest:
+      return Unimplemented("FftRequest not implemented in XLA service.");
+    case OpRequest::OP_NOT_SET:
+      return InvalidArgument("XLA service received OpRequest with OP_NOT_SET");
     default:
-      return InvalidArgument("Unsupported operation");
+      return InvalidArgument("Unsupported operation in XLA service");
   }
   TF_ASSIGN_OR_RETURN(*result->mutable_output(), handle_status);
 
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index 47f4f0ade594089aa71717ef1e122886b0a6c7ac..f962d0cdc7d41e1aeab55da5abcb1b40215b4144 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -250,7 +250,7 @@ class Service : public ServiceInterface {
   // class.
   StatusOr<std::unique_ptr<HloModuleConfig>> CreateModuleConfig(
       const ProgramShape& program_shape,
-      tensorflow::gtl::ArraySlice<const Allocation*> arguments,
+      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       const ExecutionOptions& execution_options);
 
  protected:
@@ -265,10 +265,10 @@ class Service : public ServiceInterface {
 
   // Resolves the given argument handles in the allocation tracker and returns
   // the corresponding allocations. The function also verifies that each
-  // allocation matches the given backend and device ordinal.
-  StatusOr<std::vector<const Allocation*>> ResolveAndValidateArguments(
+  // allocation matches the execution platform and device ordinal.
+  StatusOr<std::vector<const ShapedBuffer*>> ResolveAndValidateArguments(
       tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
-      const Backend* backend, int device_ordinal);
+      int device_ordinal);
 
   // Create a Hlo module config for the given program shape and arguments.
   // execution_options is optional; if not given a default is used.
@@ -281,8 +281,6 @@ class Service : public ServiceInterface {
   StatusOr<std::unique_ptr<Executable>> BuildExecutable(
       const VersionedComputationHandle& versioned_handle,
       std::unique_ptr<HloModuleConfig> module_config,
-      const tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
       Backend* backend, perftools::gputools::StreamExecutor* executor);
 
   // Same as BuildExecutable() above, but builds a list of Executables for the
@@ -299,8 +297,6 @@ class Service : public ServiceInterface {
   StatusOr<std::shared_ptr<Executable>> BuildAndCacheExecutable(
       const VersionedComputationHandle& versioned_handle,
       std::unique_ptr<HloModuleConfig> module_config,
-      const tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
       Backend* backend, perftools::gputools::StreamExecutor* executor,
       ExecutionProfile* profile);
 
@@ -310,8 +306,7 @@ class Service : public ServiceInterface {
   // ExecutionProfile object which will be filled in with profile data.
   StatusOr<GlobalDataHandle> ExecuteAndRegisterResult(
       Executable* executable,
-      const tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
+      const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       Backend* backend, perftools::gputools::StreamExecutor* executor,
       const string& result_tag, ExecutionProfile* profile);
 
@@ -320,9 +315,7 @@ class Service : public ServiceInterface {
   // from the tracker are returned.
   StatusOr<std::vector<GlobalDataHandle>> ExecuteParallelAndRegisterResult(
       tensorflow::gtl::ArraySlice<Executable*> executables,
-      tensorflow::gtl::ArraySlice<
-          std::vector<perftools::gputools::DeviceMemoryBase>>
-          arguments,
+      tensorflow::gtl::ArraySlice<std::vector<const ShapedBuffer*>> arguments,
       Backend* backend,
       tensorflow::gtl::ArraySlice<DeviceHandle> device_handles,
       tensorflow::gtl::ArraySlice<string> result_tags,
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index dcd726f22c71b4bd709dc63b25d6fdea477c83c7..9c1b951d017569a6dc89bc6583c72b5e42f0c07c 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -90,8 +91,6 @@ BinaryOperation OpcodeToBinaryOperation(HloOpcode opcode) {
       return BINOP_ATAN2;
     case HloOpcode::kComplex:
       return BINOP_COMPLEX;
-    case HloOpcode::kDot:
-      return BINOP_DOT;
     case HloOpcode::kMultiply:
       return BINOP_MUL;
     case HloOpcode::kAdd:
@@ -441,6 +440,14 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
 
 /* static */ StatusOr<Shape> ShapeInference::InferConvertShape(
     const Shape& operand_shape, PrimitiveType new_element_type) {
+  auto old_element_type = operand_shape.element_type();
+  if (primitive_util::IsComplexType(old_element_type) &&
+      !primitive_util::IsComplexType(new_element_type)) {
+    return Unimplemented(
+        "Unsupported conversion from complex to real type: %s => %s",
+        ShapeUtil::HumanString(operand_shape).c_str(),
+        PrimitiveType_Name(new_element_type).c_str());
+  }
   if (ShapeUtil::IsTuple(operand_shape) || new_element_type == TUPLE) {
     // Note: we may want to support tuple conversions via this operation in the
     // future, by recursing into the tuple elements to check all sub-conversions
@@ -454,6 +461,36 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
   return ShapeUtil::ChangeElementType(operand_shape, new_element_type);
 }
 
+/* static */ StatusOr<Shape> ShapeInference::InferBitcastConvertShape(
+    const Shape& operand_shape, PrimitiveType new_element_type) {
+  auto old_element_type = operand_shape.element_type();
+  if (primitive_util::IsComplexType(old_element_type) !=
+      primitive_util::IsComplexType(new_element_type)) {
+    return Unimplemented(
+        "Unsupported conversion between real and complex types: %s => %s",
+        ShapeUtil::HumanString(operand_shape).c_str(),
+        PrimitiveType_Name(new_element_type).c_str());
+  }
+  if (ShapeUtil::IsTuple(operand_shape) || new_element_type == TUPLE) {
+    // Note: we may want to support tuple conversions via this operation in the
+    // future, by recursing into the tuple elements to check all sub-conversions
+    // are valid. For now we just reject them, though.
+    return InvalidArgument(
+        "cannot convert from or to tuple type; requested conversion: %s => %s",
+        ShapeUtil::HumanString(operand_shape).c_str(),
+        PrimitiveType_Name(new_element_type).c_str());
+  }
+  if (primitive_util::BitWidth(old_element_type) !=
+      primitive_util::BitWidth(new_element_type)) {
+    return InvalidArgument(
+        "cannot bitcast types with different bit-widths: %s => %s",
+        PrimitiveType_Name(old_element_type).c_str(),
+        PrimitiveType_Name(new_element_type).c_str());
+  }
+
+  return ShapeUtil::ChangeElementType(operand_shape, new_element_type);
+}
+
 /* static */ StatusOr<Shape> ShapeInference::InferReducePrecisionShape(
     const Shape& operand_shape, const int exponent_bits,
     const int mantissa_bits) {
@@ -511,8 +548,113 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
   return ShapeUtil::MakeShape(operand_shape.element_type(), dimensions);
 }
 
-/* static */ StatusOr<Shape> ShapeInference::InferDotOpShape(const Shape& lhs,
-                                                             const Shape& rhs) {
+// Current DotDimensionNumbers Requirements:
+//
+// Contracting Dimensions:
+// *) Exactly one contracting dimension on both lhs and rhs.
+// *) Contracting dimension size must be the same on both lhs and rhs.
+// *) Contracting dimension numbers do not need to be the same (i.e. transposes
+//    are passed on to emitter implementations).
+//
+// Batch Dimensions:
+// *) Same number of batch dimensions on both lhs and rhs.
+// *) Same batch dimension numbers (and sizes) on both lhs and rhs.
+// *) Batch dimension numbers must be ordered before contracting and
+//    non-contracting/non-batch dimension numbers.
+//
+// Non-Contracting-Non-Batch Dimensions:
+// *) Can be 0 (matrix-vector) or 1 (matrix-matrix).
+//
+
+namespace {
+
+Status ValidateDotDimensionNumbers(
+    const Shape& lhs, const Shape& rhs,
+    const DotDimensionNumbers& dimension_numbers) {
+  // Check that dimension numbers are in range.
+  auto dims_in_range =
+      [](const int64 rank, tensorflow::gtl::ArraySlice<int64> contracting_dims,
+         tensorflow::gtl::ArraySlice<int64> batch_dims) -> bool {
+    auto in_range = [&rank](int64 i) -> bool { return 0 <= i && i < rank; };
+    return std::all_of(contracting_dims.begin(), contracting_dims.end(),
+                       in_range) &&
+           std::all_of(batch_dims.begin(), batch_dims.end(), in_range);
+  };
+
+  tensorflow::gtl::ArraySlice<int64> lhs_contracting_dimensions =
+      AsInt64Slice(dimension_numbers.lhs_contracting_dimensions());
+  tensorflow::gtl::ArraySlice<int64> rhs_contracting_dimensions =
+      AsInt64Slice(dimension_numbers.rhs_contracting_dimensions());
+  tensorflow::gtl::ArraySlice<int64> lhs_batch_dimensions =
+      AsInt64Slice(dimension_numbers.lhs_batch_dimensions());
+  tensorflow::gtl::ArraySlice<int64> rhs_batch_dimensions =
+      AsInt64Slice(dimension_numbers.rhs_batch_dimensions());
+
+  if (!dims_in_range(ShapeUtil::Rank(lhs), lhs_contracting_dimensions,
+                     lhs_batch_dimensions) ||
+      !dims_in_range(ShapeUtil::Rank(rhs), rhs_contracting_dimensions,
+                     rhs_batch_dimensions)) {
+    return InvalidArgument("A dimension number is out of range in dot: %s",
+                           dimension_numbers.DebugString().c_str());
+  }
+
+  // Check that dimension numbers are unique.
+  auto dims_unique = [](tensorflow::gtl::ArraySlice<int64> contracting_dims,
+                        tensorflow::gtl::ArraySlice<int64> batch_dims) -> bool {
+    tensorflow::gtl::FlatSet<int64> dim_set;
+    auto is_unique = [&dim_set](int64 i) -> bool {
+      return dim_set.insert(i).second;
+    };
+    return std::all_of(contracting_dims.begin(), contracting_dims.end(),
+                       is_unique) &&
+           std::all_of(batch_dims.begin(), batch_dims.end(), is_unique);
+  };
+
+  if (!dims_unique(lhs_contracting_dimensions, lhs_batch_dimensions) ||
+      !dims_unique(rhs_contracting_dimensions, rhs_batch_dimensions)) {
+    return InvalidArgument("A dimension number is not unique in dot: %s",
+                           dimension_numbers.DebugString().c_str());
+  }
+
+  // Check that the count of non-contracting-non-batch dimensions is in {0, 1}.
+  const int64 lhs_non_contracting_non_batch_dims =
+      ShapeUtil::Rank(lhs) -
+      dimension_numbers.lhs_contracting_dimensions_size() -
+      dimension_numbers.lhs_batch_dimensions_size();
+  const int64 rhs_non_contracting_non_batch_dims =
+      ShapeUtil::Rank(rhs) -
+      dimension_numbers.rhs_contracting_dimensions_size() -
+      dimension_numbers.rhs_batch_dimensions_size();
+  if (lhs_non_contracting_non_batch_dims < 0 ||
+      lhs_non_contracting_non_batch_dims > 1 ||
+      rhs_non_contracting_non_batch_dims < 0 ||
+      rhs_non_contracting_non_batch_dims > 1) {
+    return InvalidArgument(
+        "batch and contracting dimension number mismatch "
+        "with rank ");
+  }
+
+  // Check that batch dimension numbers are ordered before all others, and
+  // that they are monotonically increasing.
+  std::vector<int64> batch_dim_numbers(lhs_batch_dimensions.size());
+  std::iota(batch_dim_numbers.begin(), batch_dim_numbers.end(), 0);
+  if (!std::equal(batch_dim_numbers.begin(), batch_dim_numbers.end(),
+                  lhs_batch_dimensions.begin()) ||
+      !std::equal(batch_dim_numbers.begin(), batch_dim_numbers.end(),
+                  rhs_batch_dimensions.begin())) {
+    return InvalidArgument(
+        "batch dimension numbers must precede non-batch dimensions and be"
+        "monotonically increasing.");
+  }
+
+  return Status::OK();
+}
+
+}  // namespace
+
+/* static */ StatusOr<Shape> ShapeInference::InferDotOpShape(
+    const Shape& lhs, const Shape& rhs,
+    const DotDimensionNumbers& dimension_numbers) {
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(lhs, "lhs of dot"));
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(rhs, "rhs of dot"));
 
@@ -532,37 +674,62 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
     return fail("element types do not match");
   }
 
-  if (ShapeUtil::Rank(lhs) < 1 || ShapeUtil::Rank(lhs) > 2 ||
-      ShapeUtil::Rank(rhs) < 1 || ShapeUtil::Rank(rhs) > 2) {
-    return fail("dot only supports rank 1 or 2");
+  if ((ShapeUtil::Rank(lhs) < 1) || (ShapeUtil::Rank(rhs) < 1)) {
+    return fail("dot only supports rank 1 or above.");
+  }
+
+  // Validate basic properties of dot dimension numbers.
+  TF_RETURN_IF_ERROR(ValidateDotDimensionNumbers(lhs, rhs, dimension_numbers));
+
+  // Check that there is only one contracting dimension for both lhs and rhs.
+  if (dimension_numbers.lhs_contracting_dimensions_size() !=
+          dimension_numbers.rhs_contracting_dimensions_size() ||
+      dimension_numbers.lhs_contracting_dimensions_size() != 1) {
+    return fail("must specify one contracting dimension for both lhs and rhs.");
   }
 
-  // Determine the index of the contracted dimensions for input tensors.
-  // dimensions -1 of lhs and dimension 0 of rhs are contracted.
-  int64 lhs_contracted_dimension = ShapeUtil::GetDimensionNumber(lhs, -1);
-  int64 rhs_contracted_dimension = 0;
+  // Check that contracting dimension sizes match.
+  const int64 lhs_contracting_dimension =
+      dimension_numbers.lhs_contracting_dimensions(0);
+  const int64 rhs_contracting_dimension =
+      dimension_numbers.rhs_contracting_dimensions(0);
+  if (lhs.dimensions(lhs_contracting_dimension) !=
+      rhs.dimensions(rhs_contracting_dimension)) {
+    return fail("contracting dimension sizes do not match.");
+  }
 
-  // Check if the contracted dimension sizes are the same.
-  if ((lhs_contracted_dimension < ShapeUtil::Rank(lhs) &&
-       rhs_contracted_dimension < ShapeUtil::Rank(rhs)) &&
-      lhs.dimensions(lhs_contracted_dimension) !=
-          rhs.dimensions(rhs_contracted_dimension)) {
-    return fail("contracted dimensions mismatch");
+  // Check that number of batch dimensions match.
+  if (dimension_numbers.lhs_batch_dimensions_size() !=
+      dimension_numbers.rhs_batch_dimensions_size()) {
+    return fail("must the same number of batch dimensions for lhs and rhs.");
+  }
+
+  // Check that batch dimension numbers and sizes match.
+  for (int64 i = 0; i < dimension_numbers.lhs_batch_dimensions_size(); ++i) {
+    if (dimension_numbers.lhs_batch_dimensions(i) !=
+            dimension_numbers.rhs_batch_dimensions(i) ||
+        lhs.dimensions(dimension_numbers.lhs_batch_dimensions(i)) !=
+            rhs.dimensions(dimension_numbers.rhs_batch_dimensions(i))) {
+      return fail("batch dimension numbers and sizes must match for lhs/rhs.");
+    }
   }
 
   // The ranks of lhs and rhs are decremented by 1 respectively due to the
   // contraction, and added for the rank of the result. When an input tensor is
   // a scalar, its contribution to the rank of the result is 0.
   // Generate the result dimensions in order, rhs dimensions followed by lhs
-  // dimensions except the contracted dimensions.
+  // dimensions except the contracted and batch dimensions.
   std::vector<int64> dimensions;
+  std::unordered_set<int64> rhs_batch_dims(
+      dimension_numbers.rhs_batch_dimensions().begin(),
+      dimension_numbers.rhs_batch_dimensions().end());
   for (int64 i = 0; i < ShapeUtil::Rank(lhs); i++) {
-    if (i != lhs_contracted_dimension) {
+    if (i != lhs_contracting_dimension) {
       dimensions.push_back(lhs.dimensions(i));
     }
   }
   for (int64 i = 0; i < ShapeUtil::Rank(rhs); i++) {
-    if (i != rhs_contracted_dimension) {
+    if (i != rhs_contracting_dimension && rhs_batch_dims.count(i) == 0) {
       dimensions.push_back(rhs.dimensions(i));
     }
   }
@@ -778,8 +945,6 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       rhs, tensorflow::strings::StrCat("rhs of binary operation ",
                                        BinaryOperation_Name(operation))));
   switch (operation) {
-    case BINOP_DOT:
-      return InferDotOpShape(lhs, rhs);
     case BINOP_MAX:
     case BINOP_MIN:
     case BINOP_SUB:
@@ -1407,7 +1572,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
         ShapeUtil::HumanString(lhs).c_str(),
         ShapeUtil::HumanString(rhs).c_str());
   }
-  if (dnums.spatial_dimensions_size() !=
+  if (dnums.input_spatial_dimensions_size() !=
       dnums.kernel_spatial_dimensions_size()) {
     return InvalidArgument(
         "Both arguments to convolution must have same number of dimensions.\n"
@@ -1415,7 +1580,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
         window.DebugString().c_str());
   }
 
-  const int num_spatial_dims = dnums.spatial_dimensions_size();
+  const int num_spatial_dims = dnums.input_spatial_dimensions_size();
   if (window.dimensions_size() != num_spatial_dims) {
     return InvalidArgument(
         "Window must have same number of dimensions as dimension numbers.\n"
@@ -1444,8 +1609,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   std::vector<int64> input_dnums(num_dims);
   input_dnums[0] = dnums.input_batch_dimension();
   input_dnums[1] = dnums.input_feature_dimension();
-  std::copy(dnums.spatial_dimensions().begin(),
-            dnums.spatial_dimensions().end(), input_dnums.begin() + 2);
+  std::copy(dnums.input_spatial_dimensions().begin(),
+            dnums.input_spatial_dimensions().end(), input_dnums.begin() + 2);
   std::sort(input_dnums.begin(), input_dnums.end());
 
   std::vector<int64> window_dnums(num_dims);
@@ -1455,12 +1620,20 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
             dnums.kernel_spatial_dimensions().end(), window_dnums.begin() + 2);
   std::sort(window_dnums.begin(), window_dnums.end());
 
+  std::vector<int64> output_dnums(num_dims);
+  output_dnums[0] = dnums.output_batch_dimension();
+  output_dnums[1] = dnums.output_feature_dimension();
+  std::copy(dnums.output_spatial_dimensions().begin(),
+            dnums.output_spatial_dimensions().end(), output_dnums.begin() + 2);
+  std::sort(output_dnums.begin(), output_dnums.end());
+
   std::vector<int64> expected_dnums(num_dims);
   std::iota(expected_dnums.begin(), expected_dnums.end(), 0);
 
   const auto in_range = [num_dims](int64 i) { return 0 <= i && i < num_dims; };
   if (!std::all_of(input_dnums.begin(), input_dnums.end(), in_range) ||
-      !std::all_of(window_dnums.begin(), window_dnums.end(), in_range)) {
+      !std::all_of(window_dnums.begin(), window_dnums.end(), in_range) ||
+      !std::all_of(output_dnums.begin(), output_dnums.end(), in_range)) {
     return InvalidArgument(
         "A dimension number is out of range in convolution: %s",
         dnums.DebugString().c_str());
@@ -1478,10 +1651,16 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
         "once: %s",
         dnums.DebugString().c_str());
   }
+  if (output_dnums != expected_dnums) {
+    return InvalidArgument(
+        "Output dimensions of convolution must contain each dimension exactly "
+        "once: %s",
+        dnums.DebugString().c_str());
+  }
 
   std::vector<int64> input_spatial_dims(num_spatial_dims);
   for (int i = 0; i < num_spatial_dims; ++i) {
-    input_spatial_dims[i] = lhs.dimensions(dnums.spatial_dimensions(i));
+    input_spatial_dims[i] = lhs.dimensions(dnums.input_spatial_dimensions(i));
   }
   const int64 input_features = lhs.dimensions(dnums.input_feature_dimension());
   const int64 input_batch = lhs.dimensions(dnums.input_batch_dimension());
@@ -1529,17 +1708,27 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   dimensions[dnums.output_batch_dimension()] = input_batch;
   dimensions[dnums.output_feature_dimension()] = kernel_output_features;
   for (int i = 0; i < num_spatial_dims; ++i) {
-    dimensions[dnums.spatial_dimensions(i)] = window_output_shape.dimensions(i);
+    dimensions[dnums.output_spatial_dimensions(i)] =
+        window_output_shape.dimensions(i);
   }
 
   return ShapeUtil::MakeShape(lhs.element_type(), dimensions);
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferCrossReplicaSumShape(
-    const Shape& operand) {
-  TF_RETURN_IF_ERROR(
-      ExpectNotTupleOrOpaque(operand, "operand of cross replica sum"));
-  return operand;
+    tensorflow::gtl::ArraySlice<const Shape*> operand_shapes) {
+  for (const Shape* operand_shape : operand_shapes) {
+    TF_RETURN_IF_ERROR(
+        ExpectNotTupleOrOpaque(*operand_shape, "operand of cross replica sum"));
+  }
+  if (operand_shapes.size() == 1) {
+    return *operand_shapes[0];
+  }
+  std::vector<Shape> operand_shape_values;
+  for (const Shape* operand_shape : operand_shapes) {
+    operand_shape_values.push_back(*operand_shape);
+  }
+  return ShapeUtil::MakeTupleShape(operand_shape_values);
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferReduceShape(
@@ -1905,6 +2094,64 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   return init;
 }
 
+/* static */ StatusOr<Shape> ShapeInference::InferConditionalShape(
+    const Shape& predicate, const Shape& true_operand,
+    const Shape& false_operand, const ProgramShape& true_computation,
+    const ProgramShape& false_computation) {
+  if (!ShapeUtil::ShapeIs(predicate, PRED, {})) {
+    return InvalidArgument("predicate must be a boolean; got %s.",
+                           ShapeUtil::HumanString(predicate).c_str());
+  }
+
+  if (true_computation.parameters_size() != 1) {
+    return InvalidArgument("true_computation must take 1 argument; got %d.",
+                           true_computation.parameters_size());
+  }
+  if (!ShapeUtil::Compatible(true_computation.parameters(0), true_operand)) {
+    auto true_shape_string = [&]() {
+      return tensorflow::strings::Printf(
+          "true_operand: %s; true_computation: %s",
+          ShapeUtil::HumanString(true_operand).c_str(),
+          ShapeUtil::HumanString(true_computation).c_str());
+    };
+    return InvalidArgument(
+        "true_operand must match the shape of the only parameter of "
+        "true_computation: got %s.",
+        true_shape_string().c_str());
+  }
+
+  if (false_computation.parameters_size() != 1) {
+    return InvalidArgument("false_computation must take 1 argument; got %d.",
+                           false_computation.parameters_size());
+  }
+  if (!ShapeUtil::Compatible(false_computation.parameters(0), false_operand)) {
+    auto false_shape_string = [&]() {
+      return tensorflow::strings::Printf(
+          "false_operand: %s; false_computation: %s",
+          ShapeUtil::HumanString(false_operand).c_str(),
+          ShapeUtil::HumanString(false_computation).c_str());
+    };
+    return InvalidArgument(
+        "false_operand must match the shape of the only parameter of "
+        "false_computation: got %s.",
+        false_shape_string().c_str());
+  }
+  if (!ShapeUtil::Compatible(true_computation.result(),
+                             false_computation.result())) {
+    auto shape_string = [&]() {
+      return tensorflow::strings::Printf(
+          "true_computation result: %s; false_computation result: %s.",
+          ShapeUtil::HumanString(true_computation.result()).c_str(),
+          ShapeUtil::HumanString(false_computation.result()).c_str());
+    };
+    return InvalidArgument(
+        "the result of true_computation and false_computation must have the "
+        "same shape: got %s.",
+        shape_string().c_str());
+  }
+  return true_computation.result();
+}
+
 /* static */ StatusOr<Shape> ShapeInference::InferBroadcastShape(
     const Shape& operand, tensorflow::gtl::ArraySlice<int64> broadcast_sizes) {
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(operand, "operand of broadcast"));
diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h
index d5d497176d6c340d8c8f34cdacf6a9e32040c387..c06340d2d5df239642eb0af4836df64a898a1eaf 100644
--- a/tensorflow/compiler/xla/service/shape_inference.h
+++ b/tensorflow/compiler/xla/service/shape_inference.h
@@ -109,8 +109,10 @@ class ShapeInference {
       const Shape& lhs, const Shape& rhs, const Window& window,
       const ConvolutionDimensionNumbers& dimension_numbers);
 
-  // Infers the shape produced a cross replica sum with the given operand shape.
-  static StatusOr<Shape> InferCrossReplicaSumShape(const Shape& operand);
+  // Infers the shape produced a cross replica sum with the given operand
+  // shapes.
+  static StatusOr<Shape> InferCrossReplicaSumShape(
+      tensorflow::gtl::ArraySlice<const Shape*> operand_shapes);
 
   // Infers the shape produced by applying the given reduction computation
   // shape to the given input operand shape.
@@ -178,6 +180,12 @@ class ShapeInference {
                                          const ProgramShape& body,
                                          const Shape& init);
 
+  // Infers the shape produced by a conditional operation.
+  static StatusOr<Shape> InferConditionalShape(
+      const Shape& predicate, const Shape& true_operand,
+      const Shape& false_operand, const ProgramShape& true_computation,
+      const ProgramShape& false_computation);
+
   // Infers the shape produced by a broadcast operation.
   static StatusOr<Shape> InferBroadcastShape(
       const Shape& operand, tensorflow::gtl::ArraySlice<int64> broadcast_sizes);
@@ -204,6 +212,13 @@ class ShapeInference {
   static StatusOr<Shape> InferConvertShape(const Shape& operand_shape,
                                            PrimitiveType new_element_type);
 
+  // Helper that validates the given operand shape can be bitcast converted to
+  // the target output_shape via a bitcast convert instruction -- the
+  // requirement is that the shape is identical except for the element type and
+  // the element types have identical bit-widths.
+  static StatusOr<Shape> InferBitcastConvertShape(
+      const Shape& operand_shape, PrimitiveType new_element_type);
+
   // Helper that validates the input data type for a reduce-precision operation,
   // and returns the result shape.
   static StatusOr<Shape> InferReducePrecisionShape(const Shape& operand_shape,
@@ -222,11 +237,13 @@ class ShapeInference {
       tensorflow::gtl::ArraySlice<const Shape*> arg_shapes,
       const ProgramShape& to_apply);
 
- private:
   // Helper that infers the shape produced by performing a dot operation with
   // the given LHS and RHS shapes.
-  static StatusOr<Shape> InferDotOpShape(const Shape& lhs, const Shape& rhs);
+  static StatusOr<Shape> InferDotOpShape(
+      const Shape& lhs, const Shape& rhs,
+      const DotDimensionNumbers& dimension_numbers);
 
+ private:
   // Helper that infers the shape produced by performing an element-wise binary
   // operation with the given LHS and RHS shapes.
   // Note: By "element-wise" we mean operations that look at a single element in
diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc
index d12f7bd1453890db3280e54719a6ce811006336d..99d87f3b550ae72befe254f23fad080dd210aaf4 100644
--- a/tensorflow/compiler/xla/service/shape_inference_test.cc
+++ b/tensorflow/compiler/xla/service/shape_inference_test.cc
@@ -395,8 +395,10 @@ TEST_F(ShapeInferenceTest, Convolve) {
   dnums.set_output_batch_dimension(0);
   dnums.set_input_feature_dimension(1);
   dnums.set_output_feature_dimension(1);
-  dnums.add_spatial_dimensions(2);
-  dnums.add_spatial_dimensions(3);
+  dnums.add_input_spatial_dimensions(2);
+  dnums.add_output_spatial_dimensions(2);
+  dnums.add_input_spatial_dimensions(3);
+  dnums.add_output_spatial_dimensions(3);
 
   // Dimension order: x1, batch, feature, x0
   Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 12, 11, 3});
@@ -437,8 +439,10 @@ TEST_F(ShapeInferenceTest, ConvolveWithWindowDilation) {
   dnums.set_output_batch_dimension(0);
   dnums.set_input_feature_dimension(1);
   dnums.set_output_feature_dimension(1);
-  dnums.add_spatial_dimensions(2);
-  dnums.add_spatial_dimensions(3);
+  dnums.add_input_spatial_dimensions(2);
+  dnums.add_output_spatial_dimensions(2);
+  dnums.add_input_spatial_dimensions(3);
+  dnums.add_output_spatial_dimensions(3);
 
   // Dimension order: x1, batch, feature, x0
   Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 12, 11, 3});
@@ -480,8 +484,10 @@ TEST_F(ShapeInferenceTest, ConvolveWithBaseDilation) {
   dnums.set_output_batch_dimension(0);
   dnums.set_input_feature_dimension(1);
   dnums.set_output_feature_dimension(1);
-  dnums.add_spatial_dimensions(2);
-  dnums.add_spatial_dimensions(3);
+  dnums.add_input_spatial_dimensions(2);
+  dnums.add_output_spatial_dimensions(2);
+  dnums.add_input_spatial_dimensions(3);
+  dnums.add_output_spatial_dimensions(3);
 
   // Dimension order: x1, batch, feature, x0
   Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 12, 11, 4});
@@ -524,8 +530,10 @@ TEST_F(ShapeInferenceTest, ConvolveDimensionNumbersOverlapError) {
   dnums.set_output_batch_dimension(3);
   dnums.set_input_feature_dimension(2);
   dnums.set_output_feature_dimension(2);
-  dnums.add_spatial_dimensions(0);
-  dnums.add_spatial_dimensions(1);
+  dnums.add_input_spatial_dimensions(0);
+  dnums.add_output_spatial_dimensions(0);
+  dnums.add_input_spatial_dimensions(1);
+  dnums.add_output_spatial_dimensions(1);
   dnums.set_kernel_input_feature_dimension(0);  // duplicated with kernel_x0
   dnums.set_kernel_output_feature_dimension(3);
   dnums.add_kernel_spatial_dimensions(0);
@@ -890,8 +898,11 @@ TEST_F(ShapeInferenceTest, BroadcastScalar) {
 
 // scalar <dot> vector: error
 TEST_F(ShapeInferenceTest, ScalarDotVector) {
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto inferred_status =
-      ShapeInference::InferBinaryOpShape(BINOP_DOT, f32_, vector_32_, {});
+      ShapeInference::InferDotOpShape(f32_, vector_32_, dot_dnums);
   ASSERT_FALSE(inferred_status.ok());
   ASSERT_THAT(inferred_status.status().error_message(),
               HasSubstr("dot only supports rank"));
@@ -899,61 +910,199 @@ TEST_F(ShapeInferenceTest, ScalarDotVector) {
 
 // 3D <dot> 2D: error
 TEST_F(ShapeInferenceTest, DotWithRankHigherThanTwo) {
-  auto inferred_status = ShapeInference::InferBinaryOpShape(
-      BINOP_DOT, ShapeUtil::MakeShape(F32, {32, 32, 32}), matrix_32_64_, {});
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto inferred_status = ShapeInference::InferDotOpShape(
+      ShapeUtil::MakeShape(F32, {32, 32, 32}), matrix_32_64_, dot_dnums);
   ASSERT_FALSE(inferred_status.ok());
   ASSERT_THAT(inferred_status.status().error_message(),
-              HasSubstr("dot only supports rank"));
+              HasSubstr("batch and contracting dimension number mismatch"));
 }
 
 // vector <dot> vector -> scalar
 TEST_F(ShapeInferenceTest, VectorDotVector) {
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(0);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto inferred_status =
-      ShapeInference::InferBinaryOpShape(BINOP_DOT, vector_64_, vector_64_, {});
+      ShapeInference::InferDotOpShape(vector_64_, vector_64_, dot_dnums);
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(f32_, inferred_status.ValueOrDie()));
   auto inferred_status_mismatch =
-      ShapeInference::InferBinaryOpShape(BINOP_DOT, vector_64_, vector_32_, {});
+      ShapeInference::InferDotOpShape(vector_64_, vector_32_, dot_dnums);
   ASSERT_FALSE(inferred_status_mismatch.ok());
 }
 
 // matrix <dot> vector -> vector
 TEST_F(ShapeInferenceTest, MatrixDotVector) {
-  auto inferred_status = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_DOT, matrix_32_64_, vector_64_, {});
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto inferred_status =
+      ShapeInference::InferDotOpShape(matrix_32_64_, vector_64_, dot_dnums);
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(inferred_status.ValueOrDie(), vector_32_));
-  auto inferred_status_mismatch = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_DOT, matrix_32_64_, vector_32_, {});
+  auto inferred_status_mismatch =
+      ShapeInference::InferDotOpShape(matrix_32_64_, vector_32_, dot_dnums);
   ASSERT_FALSE(inferred_status_mismatch.ok());
 }
 
 // vector <dot> matrix -> vector
 TEST_F(ShapeInferenceTest, VectorDotMatrix) {
-  auto inferred_status = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_DOT, vector_32_, matrix_32_64_, {});
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(0);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto inferred_status =
+      ShapeInference::InferDotOpShape(vector_32_, matrix_32_64_, dot_dnums);
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(inferred_status.ValueOrDie(), vector_64_));
-  auto inferred_status_mismatch = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_DOT, vector_64_, matrix_32_64_, {});
+  auto inferred_status_mismatch =
+      ShapeInference::InferDotOpShape(vector_64_, matrix_32_64_, dot_dnums);
   ASSERT_FALSE(inferred_status_mismatch.ok());
 }
 
 // matrix <dot> matrix -> matrix
 TEST_F(ShapeInferenceTest, MatrixDotMatrix) {
-  auto inferred_status_match = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_DOT, matrix_32_64_, matrix_64_48_, {});
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto inferred_status_match =
+      ShapeInference::InferDotOpShape(matrix_32_64_, matrix_64_48_, dot_dnums);
   ASSERT_IS_OK(inferred_status_match.status());
   ASSERT_TRUE(
       ShapeUtil::Equal(inferred_status_match.ValueOrDie(), matrix_32_48_))
       << "inferred: "
       << ShapeUtil::HumanString(inferred_status_match.ValueOrDie())
       << " expected: " << ShapeUtil::HumanString(matrix_64_48_);
-  auto inferred_status_mismatch = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_DOT, matrix_32_64_, matrix_32_64_, {});
+  auto inferred_status_mismatch =
+      ShapeInference::InferDotOpShape(matrix_32_64_, matrix_32_64_, dot_dnums);
   ASSERT_FALSE(inferred_status_mismatch.ok());
 }
 
+// BatchMatMul with two batch dimensions and one contracting dimension.
+TEST_F(ShapeInferenceTest, DotGeneral) {
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {5, 2, 11, 3});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {5, 2, 3, 14});
+  Shape output_shape = ShapeUtil::MakeShape(F32, {5, 2, 11, 14});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(3);
+  dot_dnums.add_lhs_batch_dimensions(0);
+  dot_dnums.add_lhs_batch_dimensions(1);
+
+  dot_dnums.add_rhs_contracting_dimensions(2);
+  dot_dnums.add_rhs_batch_dimensions(0);
+  dot_dnums.add_rhs_batch_dimensions(1);
+
+  auto inferred_status_match =
+      ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
+  ASSERT_IS_OK(inferred_status_match.status());
+  ASSERT_TRUE(
+      ShapeUtil::Equal(inferred_status_match.ValueOrDie(), output_shape))
+      << "inferred: "
+      << ShapeUtil::HumanString(inferred_status_match.ValueOrDie())
+      << " expected: " << ShapeUtil::HumanString(output_shape);
+}
+
+// BatchMatMul with two contracting dimensions fails.
+TEST_F(ShapeInferenceTest, DotWithTwoContractingDimsFails) {
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3, 2});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 3, 14});
+  Shape output_shape = ShapeUtil::MakeShape(F32, {2, 11, 14});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(2);
+  dot_dnums.add_lhs_contracting_dimensions(3);
+  dot_dnums.add_lhs_batch_dimensions(0);
+
+  dot_dnums.add_rhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_batch_dimensions(0);
+
+  auto inferred_status =
+      ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
+  ASSERT_FALSE(inferred_status.ok());
+  ASSERT_THAT(inferred_status.status().error_message(),
+              HasSubstr("must specify one contracting dimension for both "
+                        "lhs and rhs"));
+}
+
+// BatchMatMul with different batch dimension sizes fails.
+TEST_F(ShapeInferenceTest, DotWithMisatchedBatchDimSizesFails) {
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {3, 3, 14});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(2);
+  dot_dnums.add_lhs_batch_dimensions(0);
+
+  dot_dnums.add_rhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_batch_dimensions(0);
+
+  auto inferred_status =
+      ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
+  ASSERT_FALSE(inferred_status.ok());
+  ASSERT_THAT(inferred_status.status().error_message(),
+              HasSubstr("batch dimension numbers and sizes must match"));
+}
+
+// BatchMatMul with different batch dimension numbers fails.
+TEST_F(ShapeInferenceTest, DotWithMisatchedBatchDimNumbersFails) {
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {3, 2, 14});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(2);
+  dot_dnums.add_lhs_batch_dimensions(0);
+
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  dot_dnums.add_rhs_batch_dimensions(1);
+
+  auto inferred_status =
+      ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
+  ASSERT_FALSE(inferred_status.ok());
+  ASSERT_THAT(inferred_status.status().error_message(),
+              HasSubstr("batch dimension numbers must precede non-batch"));
+}
+
+// BatchMatMul with out-of-range dimension numbers fails.
+TEST_F(ShapeInferenceTest, DotWithContractingDimNumberOutOfRange) {
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 3, 14});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(3);
+  dot_dnums.add_lhs_batch_dimensions(0);
+
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  dot_dnums.add_rhs_batch_dimensions(1);
+
+  auto inferred_status =
+      ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
+  ASSERT_FALSE(inferred_status.ok());
+  ASSERT_THAT(inferred_status.status().error_message(),
+              HasSubstr("A dimension number is out of range"));
+}
+
+// BatchMatMul with non-unique dimension numbers fails.
+TEST_F(ShapeInferenceTest, DotWithContractingNonUniqueDimNumber) {
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 3, 14});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(0);
+  dot_dnums.add_lhs_batch_dimensions(0);
+
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  dot_dnums.add_rhs_batch_dimensions(1);
+
+  auto inferred_status =
+      ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
+  ASSERT_FALSE(inferred_status.ok());
+  ASSERT_THAT(inferred_status.status().error_message(),
+              HasSubstr("A dimension number is not unique"));
+}
+
 TEST_F(ShapeInferenceTest, BinOpBroadcastMatrixVector) {
   // Test variations of broadcasting a vector for a binary add with a
   // matrix.
@@ -1288,5 +1437,80 @@ TEST_F(ShapeInferenceTest, Transpose) {
                                     ShapeUtil::MakeShape(F32, {3, 4, 5, 2})));
 }
 
+TEST_F(ShapeInferenceTest, Conditional) {
+  auto inferred_status0 = ShapeInference::InferConditionalShape(
+      pred_, vector_32_, vector_64_,
+      ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+      ShapeUtil::MakeProgramShape({vector_64_}, f32_));
+  EXPECT_IS_OK(inferred_status0.status());
+  EXPECT_TRUE(ShapeUtil::Equal(f32_, inferred_status0.ValueOrDie()));
+
+  auto inferred_status1 = ShapeInference::InferConditionalShape(
+      pred_, matrix_32_48_, vector_32_,
+      ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_64_),
+      ShapeUtil::MakeProgramShape({vector_32_}, vector_64_));
+  EXPECT_IS_OK(inferred_status1.status());
+  EXPECT_TRUE(ShapeUtil::Equal(vector_64_, inferred_status1.ValueOrDie()));
+
+  auto tuple_f32_v32 = ShapeUtil::MakeTupleShape({f32_, vector_32_});
+  auto inferred_status2 = ShapeInference::InferConditionalShape(
+      pred_, matrix_32_48_, tuple_f32_v32,
+      ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_),
+      ShapeUtil::MakeProgramShape({tuple_f32_v32}, vector_32_));
+  EXPECT_IS_OK(inferred_status2.status());
+  EXPECT_TRUE(ShapeUtil::Equal(vector_32_, inferred_status2.ValueOrDie()));
+
+  auto inferred_status_error0 = ShapeInference::InferConditionalShape(
+      s32_, vector_32_, vector_64_,
+      ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+      ShapeUtil::MakeProgramShape({vector_64_}, f32_));
+  EXPECT_FALSE(inferred_status_error0.ok());
+  EXPECT_THAT(inferred_status_error0.status().error_message(),
+              HasSubstr("predicate must be a boolean"));
+
+  auto inferred_status_error1 = ShapeInference::InferConditionalShape(
+      pred_, ShapeUtil::MakeTupleShape({f32_, vector_32_}), matrix_32_48_,
+      ShapeUtil::MakeProgramShape({f32_, vector_32_}, vector_32_),
+      ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_));
+  EXPECT_FALSE(inferred_status_error1.ok());
+  EXPECT_THAT(inferred_status_error1.status().error_message(),
+              HasSubstr("true_computation must take 1 argument"));
+
+  auto inferred_status_error2 = ShapeInference::InferConditionalShape(
+      pred_, vector_32_, vector_64_,
+      ShapeUtil::MakeProgramShape({vector_64_}, f32_),
+      ShapeUtil::MakeProgramShape({vector_64_}, f32_));
+  EXPECT_FALSE(inferred_status_error2.ok());
+  EXPECT_THAT(inferred_status_error2.status().error_message(),
+              HasSubstr("true_operand must match the shape of the only "
+                        "parameter of true_computation"));
+
+  auto inferred_status_error3 = ShapeInference::InferConditionalShape(
+      pred_, matrix_32_48_, ShapeUtil::MakeTupleShape({f32_, vector_32_}),
+      ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_),
+      ShapeUtil::MakeProgramShape({f32_, vector_32_}, vector_32_));
+  EXPECT_FALSE(inferred_status_error3.ok());
+  EXPECT_THAT(inferred_status_error3.status().error_message(),
+              HasSubstr("false_computation must take 1 argument"));
+
+  auto inferred_status_error4 = ShapeInference::InferConditionalShape(
+      pred_, vector_32_, vector_64_,
+      ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+      ShapeUtil::MakeProgramShape({vector_32_}, f32_));
+  EXPECT_FALSE(inferred_status_error4.ok());
+  EXPECT_THAT(inferred_status_error4.status().error_message(),
+              HasSubstr("false_operand must match the shape of the only "
+                        "parameter of false_computation"));
+
+  auto inferred_status_error5 = ShapeInference::InferConditionalShape(
+      pred_, vector_32_, vector_64_,
+      ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+      ShapeUtil::MakeProgramShape({vector_64_}, vector_32_));
+  EXPECT_FALSE(inferred_status_error5.ok());
+  EXPECT_THAT(inferred_status_error5.status().error_message(),
+              HasSubstr("the result of true_computation and false_computation "
+                        "must have the same shape"));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc
index a7539a1a11d2bbd62c780890c6730dbb212307c4..c679d401c3691b14a43ce77cbe953cd4c64a9e92 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.cc
+++ b/tensorflow/compiler/xla/service/shaped_buffer.cc
@@ -34,58 +34,32 @@ namespace xla {
 
 using ::tensorflow::strings::Appendf;
 
-/* static */ StatusOr<std::unique_ptr<ShapedBuffer>>
-ShapedBuffer::MakeArrayShapedBuffer(const Shape& shape,
-                                    const se::Platform* platform,
-                                    int device_ordinal,
-                                    const se::DeviceMemoryBase& buffer) {
-  if (ShapeUtil::IsTuple(shape)) {
-    return InvalidArgument("Shape must be an array: %s",
-                           ShapeUtil::HumanStringWithLayout(shape).c_str());
-  }
-  auto shaped_buffer =
-      MakeUnique<ShapedBuffer>(shape, platform, device_ordinal);
-  *shaped_buffer->mutable_shape_index_to_buffer_entry()->mutable_element({}) =
-      0;
-  *shaped_buffer->mutable_buffers() = {buffer};
-  return std::move(shaped_buffer);
-}
-
-ShapedBuffer::ShapedBuffer(const Shape& shape, const se::Platform* platform,
-                           int device_ordinal)
-    : shape_(shape),
+ShapedBuffer::ShapedBuffer(const Shape& on_host_shape,
+                           const Shape& on_device_shape,
+                           const se::Platform* platform, int device_ordinal)
+    : on_host_shape_(on_host_shape),
+      on_device_shape_(on_device_shape),
       platform_(platform),
       device_ordinal_(device_ordinal),
-      shape_index_to_buffer_entry_(shape) {}
+      buffers_(on_device_shape) {}
 
 void ShapedBuffer::clear() {
-  for (se::DeviceMemoryBase& memory_base : buffers_) {
+  for (auto& pair : buffers_) {
     // A default constructed DeviceMemoryBase is a null pointer.
-    memory_base = se::DeviceMemoryBase();
+    pair.second = se::DeviceMemoryBase();
   }
 }
 
-void ShapedBuffer::AddBufferAtIndex(
-    const perftools::gputools::DeviceMemoryBase& buffer,
-    const ShapeIndex& shape_index) {
-  *mutable_shape_index_to_buffer_entry()->mutable_element(shape_index) =
-      buffers().size();
-  mutable_buffers()->push_back(buffer);
-}
-
-const se::DeviceMemoryBase& ShapedBuffer::buffer(
-    const ShapeIndex& index) const {
-  return buffers_[shape_index_to_buffer_entry_.element(index)];
-}
-
-se::DeviceMemoryBase* ShapedBuffer::mutable_buffer(const ShapeIndex& index) {
-  return &buffers_[shape_index_to_buffer_entry_.element(index)];
-}
-
 string ShapedBuffer::ToString() const {
-  string s = "ShapedBuffer(" + platform_->Name() + "):\n";
+  string s = tensorflow::strings::StrCat(
+      "ShapedBuffer(", platform_->Name(), ":", device_ordinal(),
+      "), on-host shape=" + ShapeUtil::HumanStringWithLayout(on_host_shape()),
+      ", on-device shape=" +
+          ShapeUtil::HumanStringWithLayout(on_device_shape()),
+      ":\n");
   ShapeUtil::ForEachSubshape(
-      shape(), [this, &s](const Shape& subshape, const ShapeIndex& index) {
+      on_device_shape(),
+      [this, &s](const Shape& subshape, const ShapeIndex& index) {
         string shape_str;
         if (ShapeUtil::IsTuple(subshape)) {
           shape_str = "tuple";
@@ -105,53 +79,24 @@ std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer) {
   return out;
 }
 
-/* static */ StatusOr<std::unique_ptr<ScopedShapedBuffer>>
-ScopedShapedBuffer::Allocate(
-    const Shape& shape, DeviceMemoryAllocator* allocator, int device_ordinal,
-    const std::function<int64(const Shape&)>& shape_size_fn) {
-  if (!LayoutUtil::HasLayout(shape)) {
-    return InvalidArgument("Shape must have a layout: %s",
-                           ShapeUtil::HumanStringWithLayout(shape).c_str());
-  }
-  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(shape));
-  auto shaped_buffer =
-      WrapUnique(new ScopedShapedBuffer(shape, allocator, device_ordinal));
-
-  // Allocate an appropriate sized buffer for each element in the shape
-  // including the tuple pointer arrays.
-  for (auto& pair : shaped_buffer->shape_index_to_buffer_entry_) {
-    const ShapeIndex& index = pair.first;
-    size_t& buffer_entry = pair.second;
-    TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase memory_base,
-                        shaped_buffer->allocator_->Allocate(
-                            shaped_buffer->device_ordinal(),
-                            shape_size_fn(ShapeUtil::GetSubshape(
-                                shaped_buffer->shape(), index))));
-    shaped_buffer->buffers_.push_back(memory_base);
-    buffer_entry = shaped_buffer->buffers_.size() - 1;
-  }
-
-  return std::move(shaped_buffer);
-}
-
 /* static */
 StatusOr<std::unique_ptr<ScopedShapedBuffer>> ScopedShapedBuffer::MakeScoped(
     ShapedBuffer* shaped_buffer, DeviceMemoryAllocator* allocator) {
   auto scoped_buffer = WrapUnique(new ScopedShapedBuffer(
-      shaped_buffer->shape(), allocator, shaped_buffer->device_ordinal()));
+      shaped_buffer->on_host_shape(), shaped_buffer->on_device_shape(),
+      allocator, shaped_buffer->device_ordinal()));
   scoped_buffer->buffers_ = shaped_buffer->buffers();
-  scoped_buffer->shape_index_to_buffer_entry_ =
-      shaped_buffer->shape_index_to_buffer_entry();
-
   shaped_buffer->clear();
 
   return std::move(scoped_buffer);
 }
 
-ScopedShapedBuffer::ScopedShapedBuffer(const Shape& shape,
+ScopedShapedBuffer::ScopedShapedBuffer(const Shape& on_host_shape,
+                                       const Shape& on_device_shape,
                                        DeviceMemoryAllocator* allocator,
                                        int device_ordinal)
-    : ShapedBuffer(shape, allocator->platform(), device_ordinal),
+    : ShapedBuffer(on_host_shape, on_device_shape, allocator->platform(),
+                   device_ordinal),
       allocator_(allocator) {}
 
 ScopedShapedBuffer::~ScopedShapedBuffer() {
@@ -159,7 +104,8 @@ ScopedShapedBuffer::~ScopedShapedBuffer() {
   // in the shape (eg, a tuple with a repeated element) so keep track of what
   // has been deallocated.
   std::set<void*> deallocated_opaques;
-  for (se::DeviceMemoryBase& memory_base : buffers_) {
+  for (auto& pair : buffers_) {
+    se::DeviceMemoryBase& memory_base = pair.second;
     if (!memory_base.is_null() &&
         deallocated_opaques.count(memory_base.opaque()) == 0) {
       deallocated_opaques.insert(memory_base.opaque());
@@ -170,13 +116,10 @@ ScopedShapedBuffer::~ScopedShapedBuffer() {
 }
 
 std::unique_ptr<ShapedBuffer> ScopedShapedBuffer::release() {
-  auto shaped_buffer =
-      MakeUnique<ShapedBuffer>(shape(), platform(), device_ordinal());
-
-  *shaped_buffer->mutable_buffers() = buffers();
-  *shaped_buffer->mutable_shape_index_to_buffer_entry() =
-      shape_index_to_buffer_entry();
+  auto shaped_buffer = MakeUnique<ShapedBuffer>(
+      on_host_shape(), on_device_shape(), platform(), device_ordinal());
 
+  shaped_buffer->buffers() = buffers();
   clear();
 
   return shaped_buffer;
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h
index fa88caa13ff734995e8ab0925f17d0d3c26b8fda..f570ebb9cbb2837d3eadc32fe269845c995f7f89 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.h
+++ b/tensorflow/compiler/xla/service/shaped_buffer.h
@@ -31,61 +31,68 @@ limitations under the License.
 namespace xla {
 
 // Class which encapsulates a buffer or set of buffers containing data of a
-// particular XLA shape. Used for zero-copy execution interface for a
-// XLA client running in the same process as the service (LocalClient),
+// particular XLA shape.
 class ShapedBuffer {
  public:
-  // Convenience method which creates a ShapedBuffer of array shape (not a
-  // tuple). Its single buffer pointer is set to the given value "buffer". The
-  // given buffer must be large enough to store the given shape as given by
-  // ShapeUtil::ByteSizeOf.
-  static StatusOr<std::unique_ptr<ShapedBuffer>> MakeArrayShapedBuffer(
-      const Shape& shape, const perftools::gputools::Platform* platform,
-      int device_ordinal, const perftools::gputools::DeviceMemoryBase& buffer);
-
-  ShapedBuffer(const Shape& shape,
+  // Construct a ScopedShapedBuffer with null DeviceMemoryBases at each
+  // index. The shape of the data on the host and the device may differ because
+  // the device may have a different representation for different data
+  // types. Therefore, both the on-host and on-device shape are required. The
+  // on-device shape determines the number of device allocations
+  // (DeviceMemoryBase) held by the ShapedBuffer.
+  ShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape,
                const perftools::gputools::Platform* platform,
                int device_ordinal);
 
-  const Shape& shape() const { return shape_; }
+  // Returns the shape of the on-host representation of the data held by this
+  // ShapedBuffer.
+  const Shape& on_host_shape() const { return on_host_shape_; }
+
+  // Returns the shape of the on-device representation of the data held by this
+  // ShapedBuffer.
+  const Shape& on_device_shape() const { return on_device_shape_; }
+
   const perftools::gputools::Platform* platform() const { return platform_; }
   int device_ordinal() const { return device_ordinal_; }
 
+  // Return the root buffer of the shape (shape index {}).
+  const perftools::gputools::DeviceMemoryBase& root_buffer() const {
+    return buffer(/*index=*/{});
+  }
+
   // Returns the buffer at the given shape index where index is defined as in
   // ShapeUtil::GetSubshape.
   const perftools::gputools::DeviceMemoryBase& buffer(
-      const ShapeIndex& index) const;
-  perftools::gputools::DeviceMemoryBase* mutable_buffer(
-      const ShapeIndex& index);
-
-  // Returns the underlying structure which stores the buffer pointers.
-  const std::vector<perftools::gputools::DeviceMemoryBase>& buffers() const {
-    return buffers_;
+      const ShapeIndex& index) const {
+    return buffers_.element(index);
   }
-  std::vector<perftools::gputools::DeviceMemoryBase>* mutable_buffers() {
-    return &buffers_;
+
+  // Sets the device memory buffer at the given index.
+  void set_buffer(const perftools::gputools::DeviceMemoryBase& buffer,
+                  const ShapeIndex& index) {
+    *buffers_.mutable_element(index) = buffer;
   }
 
-  // Returns the tree of indices which map to buffer pointers.
-  const ShapeTree<size_t>& shape_index_to_buffer_entry() const {
-    return shape_index_to_buffer_entry_;
+  // Returns the underlying ShapeTree containing all the device addresses in the
+  // ShapedBuffer.
+  const ShapeTree<perftools::gputools::DeviceMemoryBase>& buffers() const {
+    return buffers_;
   }
-  ShapeTree<size_t>* mutable_shape_index_to_buffer_entry() {
-    return &shape_index_to_buffer_entry_;
+  ShapeTree<perftools::gputools::DeviceMemoryBase>& buffers() {
+    return buffers_;
   }
 
   // Set all device memory pointers in the object to null.
   void clear();
 
-  // Adds a new buffer at the given shape index.
-  void AddBufferAtIndex(const perftools::gputools::DeviceMemoryBase& buffer,
-                        const ShapeIndex& shape_index);
-
   string ToString() const;
 
  protected:
-  // The shape of the device buffer with layout.
-  const Shape shape_;
+  // The shape of the data when represented on the host.
+  const Shape on_host_shape_;
+
+  // The shape of the data on the device.
+  const Shape on_device_shape_;
 
   // The platform the memory is allocated on.
   const perftools::gputools::Platform* platform_;
@@ -93,14 +100,8 @@ class ShapedBuffer {
   // The device the memory is allocated on.
   const int device_ordinal_;
 
-  // The list of DeviceMemoryBase pointers representing this shape.
-  // Note that there can be a many to one relationship between tuple elements
-  // and buffers.  To account for this, shape_index_to_buffer_entry_ allows us
-  // to make from a position in a shape to an index into this list.
-  std::vector<perftools::gputools::DeviceMemoryBase> buffers_;
-
-  // The tree of indices into buffers_.
-  ShapeTree<size_t> shape_index_to_buffer_entry_;
+  // The tree of device buffers. Its shape is on_device_shape().
+  ShapeTree<perftools::gputools::DeviceMemoryBase> buffers_;
 };
 
 std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer);
@@ -110,20 +111,16 @@ std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer);
 // destructed.
 class ScopedShapedBuffer : public ShapedBuffer {
  public:
-  // Return a newly allocated ScopedShapedBuffer of an arbitrary shape. Array
-  // buffers (leaves in the shape) are allocated and uninitialized. Tuple
-  // buffers (if any) are allocated and initialized to the backend-specific
-  // representation of an array of pointers to the tuple elements.
-  static StatusOr<std::unique_ptr<ScopedShapedBuffer>> Allocate(
-      const Shape& shape, DeviceMemoryAllocator* allocator, int device_ordinal,
-      const std::function<int64(const Shape&)>& shape_size_fn);
-
   // Takes a ShapedBuffer and returns a ScopedShapedBuffer which manages the
   // deallocation of the device memory held in the shaped buffer. All device
   // memory pointers in the given ShapedBuffer are set to null.
   static StatusOr<std::unique_ptr<ScopedShapedBuffer>> MakeScoped(
       ShapedBuffer* shaped_buffer, DeviceMemoryAllocator* allocator);
 
+  // Create a ScopedShapedBuffer with null DeviceMemoryBases at each index.
+  ScopedShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape,
+                     DeviceMemoryAllocator* allocator, int device_ordinal);
+
   // Return the allocator used to allocate the device memory held in this
   // ScopedShapedBuffer.
   DeviceMemoryAllocator* memory_allocator() const { return allocator_; }
@@ -138,8 +135,6 @@ class ScopedShapedBuffer : public ShapedBuffer {
   virtual ~ScopedShapedBuffer();
 
  protected:
-  ScopedShapedBuffer(const Shape& shape, DeviceMemoryAllocator* allocator,
-                     int device_ordinal);
   ScopedShapedBuffer(const ScopedShapedBuffer&) = delete;
   void operator=(const ScopedShapedBuffer&) = delete;
 
diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc
index d5f53ad56fb019d0ae7c27fc28706f05614ece68..2f36e2b16e0f2eed10aef811dd3cceeba6a5b8a9 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/transfer_manager.cc
@@ -40,6 +40,45 @@ TransferManager::GetPlatformTransferManagers() {
   return r;
 }
 
+Status TransferManager::TransferArrayToDevice(
+    perftools::gputools::StreamExecutor* executor, const Literal& literal,
+    const perftools::gputools::DeviceMemoryBase& dest) {
+  const Shape on_device_shape = HostShapeToDeviceShape(literal.shape());
+  TF_RET_CHECK(ShapeUtil::IsArray(on_device_shape))
+      << "On-device representation of "
+      << ShapeUtil::HumanString(literal.shape())
+      << " is not an array: " << ShapeUtil::HumanString(on_device_shape);
+  if (dest.size() < GetByteSizeRequirement(on_device_shape)) {
+    return FailedPrecondition(
+        "Allocation on device not large enough for array: "
+        "%lld < %lld",
+        dest.size(), GetByteSizeRequirement(on_device_shape));
+  }
+  ShapedBuffer shaped_buffer(/*on_host_shape=*/literal.shape(), on_device_shape,
+                             executor->platform(), executor->device_ordinal());
+  shaped_buffer.set_buffer(dest, /*index=*/{});
+  return TransferLiteralToDevice(executor, literal, shaped_buffer);
+}
+
+StatusOr<std::unique_ptr<Literal>> TransferManager::TransferArrayFromDevice(
+    perftools::gputools::StreamExecutor* executor, const Shape& shape,
+    const perftools::gputools::DeviceMemoryBase& source) {
+  TF_RET_CHECK(ShapeUtil::Equal(HostShapeToDeviceShape(shape), shape))
+      << "Shape " << ShapeUtil::HumanString(shape)
+      << " has a differently shaped representation on-device: "
+      << ShapeUtil::HumanString(HostShapeToDeviceShape(shape));
+  if (source.size() < GetByteSizeRequirement(shape)) {
+    return FailedPrecondition(
+        "Allocation on device not large enough for array: "
+        "%lld < %lld",
+        source.size(), GetByteSizeRequirement(shape));
+  }
+  ShapedBuffer shaped_buffer(/*on_host_shape=*/shape, shape,
+                             executor->platform(), executor->device_ordinal());
+  shaped_buffer.set_buffer(source, /*index=*/{});
+  return TransferLiteralFromDevice(executor, shaped_buffer);
+}
+
 /* static */ void TransferManager::RegisterTransferManager(
     se::Platform::Id platform_id,
     TransferManagerCreationFunction creation_function) {
@@ -75,14 +114,12 @@ TransferManager::GetPlatformTransferManagers() {
 Status TransferManager::WriteTupleIndexTables(
     perftools::gputools::StreamExecutor* executor,
     const ShapedBuffer& device_buffer) {
-  VLOG(2) << "Writing tuple index tables to ShapedBuffer rooted at "
-          << device_buffer.buffer(/*index=*/{}).opaque()
-          << "; shape: " << ShapeUtil::HumanString(device_buffer.shape());
+  VLOG(2) << "Writing tuple index tables for " << device_buffer;
 
   TF_RET_CHECK(executor->device_ordinal() == device_buffer.device_ordinal());
 
   return ShapeUtil::ForEachSubshapeWithStatus(
-      device_buffer.shape(),
+      device_buffer.on_device_shape(),
       [&](const Shape& device_subshape, const ShapeIndex& index) -> Status {
         if (ShapeUtil::IsTuple(device_subshape)) {
           se::DeviceMemoryBase device_memory = device_buffer.buffer(index);
@@ -97,7 +134,7 @@ Status TransferManager::WriteTupleIndexTables(
             elements.push_back(device_buffer.buffer(element_index));
             element_index.pop_back();
           }
-          return WriteTuplePointersToDevice(executor, elements, device_subshape,
+          return WriteSingleTupleIndexTable(executor, elements, device_subshape,
                                             &device_memory);
         }
 
@@ -143,31 +180,43 @@ Status TransferManager::TransferBufferToDevice(
   return Status::OK();
 }
 
-StatusOr<std::set<se::DeviceMemoryBase>>
-TransferManager::GatherBufferPointersFromTuple(
-    se::StreamExecutor* executor, const se::DeviceMemoryBase& source,
-    const Shape& shape) {
-  TF_RET_CHECK(ShapeUtil::IsTuple(shape));
-
-  std::set<se::DeviceMemoryBase> buffer_pointers;
-  buffer_pointers.insert(source);
-
-  TF_ASSIGN_OR_RETURN(std::vector<se::DeviceMemoryBase> tuple_elements,
-                      ShallowCopyTupleFromDevice(executor, source, shape));
-  for (auto i = 0; i < tuple_elements.size(); ++i) {
-    const Shape& element_shape = shape.tuple_shapes(i);
-    if (ShapeUtil::IsTuple(element_shape)) {
-      TF_ASSIGN_OR_RETURN(
-          std::set<se::DeviceMemoryBase> buffer_pointers_in_element,
-          GatherBufferPointersFromTuple(executor, tuple_elements[i],
-                                        element_shape));
-      buffer_pointers.insert(buffer_pointers_in_element.begin(),
-                             buffer_pointers_in_element.end());
-    } else {
-      buffer_pointers.insert(tuple_elements[i]);
-    }
+StatusOr<std::unique_ptr<ShapedBuffer>> TransferManager::AllocateShapedBuffer(
+    const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
+    int device_ordinal) {
+  if (!LayoutUtil::HasLayout(on_host_shape)) {
+    return InvalidArgument(
+        "Shape must have a layout: %s",
+        ShapeUtil::HumanStringWithLayout(on_host_shape).c_str());
+  }
+  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(on_host_shape));
+  const Shape on_device_shape = HostShapeToDeviceShape(on_host_shape);
+  TF_RET_CHECK(LayoutUtil::HasLayout(on_device_shape));
+
+  auto shaped_buffer = WrapUnique(new ShapedBuffer(
+      on_host_shape, on_device_shape, allocator->platform(), device_ordinal));
+
+  // Allocate an appropriate sized buffer for each element in the shape
+  // including the tuple pointer arrays.
+  for (auto& pair : shaped_buffer->buffers()) {
+    const ShapeIndex& index = pair.first;
+    se::DeviceMemoryBase& memory_base = pair.second;
+    const Shape& subshape = ShapeUtil::GetSubshape(on_device_shape, index);
+    TF_ASSIGN_OR_RETURN(memory_base,
+                        allocator->Allocate(shaped_buffer->device_ordinal(),
+                                            GetByteSizeRequirement(subshape)));
   }
-  return std::move(buffer_pointers);
+
+  return std::move(shaped_buffer);
+}
+
+StatusOr<std::unique_ptr<ScopedShapedBuffer>>
+TransferManager::AllocateScopedShapedBuffer(const Shape& on_host_shape,
+                                            DeviceMemoryAllocator* allocator,
+                                            int device_ordinal) {
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<ShapedBuffer> unscoped_buffer,
+      AllocateShapedBuffer(on_host_shape, allocator, device_ordinal));
+  return ScopedShapedBuffer::MakeScoped(unscoped_buffer.get(), allocator);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h
index fdc123e54eb7f754c12510bef551b98da01b585d..9f2b5c4aecf0b52f610171e0c2755de577b2bd9e 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.h
+++ b/tensorflow/compiler/xla/service/transfer_manager.h
@@ -44,55 +44,47 @@ class TransferManager {
   // Returns the ID of the platform that this transfer manager acts on.
   virtual perftools::gputools::Platform::Id PlatformId() const = 0;
 
-  // Transfers the region into the provided literal using the provided
-  // executor. device_shape is the shape, including layout, of the data on the
-  // device, while literal_shape will be the shape for the literal. device_shape
-  // and literal_shape must be compatible, but need not have the same layout.
-  // TODO(b/66694934): Remove TransferLiteral* methods which accept bare
-  // DeviceMemoryBase.
-  virtual Status TransferLiteralFromDevice(
-      perftools::gputools::StreamExecutor* executor,
-      const perftools::gputools::DeviceMemoryBase& region,
-      const Shape& device_shape, const Shape& literal_shape,
-      Literal* literal) = 0;
-
-  // Transfers the given literal into the provided region output parameter,
-  // using the given executor.
-  virtual Status TransferLiteralToDevice(
-      perftools::gputools::StreamExecutor* executor, const Literal& literal,
-      perftools::gputools::DeviceMemoryBase* region) = 0;
-
-  // Transfers the data held in the given ShapedBuffer into the provided literal
-  // using the provided executor. literal_shape will be the shape for the
-  // literal. The shape of the ShapedBuffer and literal_shape must be
-  // compatible, but need not have the same layout.
+  // Returns the shape of the on-device representation for the given shape on
+  // the host. This is intended for use with ShapedBuffer where buffers are
+  // pre-allocated by the host, e.g. TransferLiteralToDevice, without the user
+  // needing to consider device-specific behaviors.
+  virtual Shape HostShapeToDeviceShape(const Shape& host_shape) const {
+    return host_shape;
+  }
+
+  // Returns a literal containing the data held in the given ShapedBuffer.
+  // using the provided executor. The optional literal_shape will be the shape
+  // for the literal. The shape of the ShapedBuffer and
+  // DeviceShape(literal_shape) must be compatible, but need not have the same
+  // layout.
   virtual StatusOr<std::unique_ptr<Literal>> TransferLiteralFromDevice(
       perftools::gputools::StreamExecutor* executor,
       const ShapedBuffer& device_buffer) = 0;
 
   // Transfers the given literal into the previously allocated device memory
-  // represented by the given ShapedBuffer using the given executor.
+  // represented by the given ShapedBuffer using the given executor. The shape
+  // of the ShapedBuffer and DeviceShape(literal.shape()) must be compatible,
+  // but need not have the same layout
   virtual Status TransferLiteralToDevice(
       perftools::gputools::StreamExecutor* executor, const Literal& literal,
       const ShapedBuffer& device_buffer) = 0;
 
+  // Convenience methods for transferring an array to or from the device at a
+  // known address. This avoids having to construct a ShapedBuffer just to
+  // transfer an array at a known address.
+  Status TransferArrayToDevice(
+      perftools::gputools::StreamExecutor* executor, const Literal& literal,
+      const perftools::gputools::DeviceMemoryBase& dest);
+  StatusOr<std::unique_ptr<Literal>> TransferArrayFromDevice(
+      perftools::gputools::StreamExecutor* executor, const Shape& shape,
+      const perftools::gputools::DeviceMemoryBase& source);
+
   // Transfers the given literal into the Infeed interface of the device,
   // using the given executor.
   virtual Status TransferLiteralToInfeed(
       perftools::gputools::StreamExecutor* executor,
       const Literal& literal) = 0;
 
-  // Transfer a memory block of the given size from 'source' buffer to the
-  // Infeed interface of the device using the given executor.
-  //
-  // size is the size to transfer from source in bytes.
-  //
-  // source is the source data that must be in the target-dependent layout that
-  // the Infeed HLO used in the computation expects.
-  virtual Status TransferBufferToInfeed(
-      perftools::gputools::StreamExecutor* executor, int64 size,
-      const void* source) = 0;
-
   // Transfers the given literal from the Outfeed interface of the device,
   // using the given executor.
   virtual Status TransferLiteralFromOutfeed(
@@ -104,37 +96,26 @@ class TransferManager {
       tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
           executor) = 0;
 
-  // Shallow copy a tuple from the device and create a DeviceMemoryBase object
-  // for each element in the tuple. A DeviceMemoryBase object refers to the
-  // buffer containing the data of that element. The DeviceMemoryBase objects
-  // are returned as a vector.
-  virtual StatusOr<std::vector<perftools::gputools::DeviceMemoryBase>>
-  ShallowCopyTupleFromDevice(
-      perftools::gputools::StreamExecutor* executor,
-      const perftools::gputools::DeviceMemoryBase& source,
-      const Shape& shape) = 0;
-
   // Given an allocated ShapedBuffer, constructs the tuple index table(s) in
   // each buffer of the given ShapedBuffer corresponding to tuple shapes. If the
   // ShapedBuffer is array-shaped this method does nothing.
   Status WriteTupleIndexTables(perftools::gputools::StreamExecutor* executor,
                                const ShapedBuffer& device_buffer);
 
-  // Returns all buffer pointers that the tuple `source` refers to. Unlike
-  // ShallowCopyTupleFromDevice, this function gather buffer pointers in nested
-  // tuples as well. Also, the returned DeviceMemoryBase objects are
-  // deduplicated.
-  StatusOr<std::set<perftools::gputools::DeviceMemoryBase>>
-  GatherBufferPointersFromTuple(
-      perftools::gputools::StreamExecutor* executor,
-      const perftools::gputools::DeviceMemoryBase& source, const Shape& shape);
-
   // Determines the byte size requirement for the given shape on the underlying
   // architecture. This will be used to allocate an appropriately sized memory
   // region for a host-to-device transfer.
   virtual int64 GetByteSizeRequirement(const Shape& shape) const = 0;
 
-  typedef std::unique_ptr<TransferManager> (*TransferManagerCreationFunction)();
+  // Allocate a ShapedBuffer which can hold data with the given on-host
+  // shape. The on-device shape may be different as indicated by
+  // HostShapeToDeviceShape.
+  StatusOr<std::unique_ptr<ShapedBuffer>> AllocateShapedBuffer(
+      const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
+      int device_ordinal);
+  StatusOr<std::unique_ptr<ScopedShapedBuffer>> AllocateScopedShapedBuffer(
+      const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
+      int device_ordinal);
 
   /////
   // The TransferManager class also serves as a point to register objects for
@@ -144,6 +125,7 @@ class TransferManager {
   // assumed to be a singleton, so no ownership is transferred.
   //
   // Precondition: a platform kind must not be registered more than once.
+  typedef std::unique_ptr<TransferManager> (*TransferManagerCreationFunction)();
   static void RegisterTransferManager(
       perftools::gputools::Platform::Id platform_id,
       TransferManagerCreationFunction transfer_manager);
@@ -154,6 +136,17 @@ class TransferManager {
       const perftools::gputools::Platform* platform);
 
  protected:
+  // Transfer a memory block of the given size from 'source' buffer to the
+  // Infeed interface of the device using the given executor.
+  //
+  // size is the size to transfer from source in bytes.
+  //
+  // source is the source data that must be in the target-dependent layout that
+  // the Infeed HLO used in the computation expects.
+  virtual Status TransferBufferToInfeed(
+      perftools::gputools::StreamExecutor* executor, int64 size,
+      const void* source) = 0;
+
   // Transfer a memory block of the given size from the device source into the
   // 'destination' buffer.
   //
@@ -172,10 +165,9 @@ class TransferManager {
       const void* source, perftools::gputools::DeviceMemoryBase* destination);
 
   // Writes the given device-memory pointers in 'elements' to the given region
-  // to construct a tuple in the platform-specific tuple representation. This
-  // can handle nested tuples as well. In the nested case, the element
-  // DeviceMemoryBase points to another array of pointers on the device.
-  virtual Status WriteTuplePointersToDevice(
+  // to construct a tuple index table in the platform-specific tuple
+  // representation.
+  virtual Status WriteSingleTupleIndexTable(
       perftools::gputools::StreamExecutor* executor,
       tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
           elements,
diff --git a/tensorflow/compiler/xla/service/transpose_folding.cc b/tensorflow/compiler/xla/service/transpose_folding.cc
index 8c2640adf52f10c387e7a9c09c0d73a09c054919..83185ac49e9b7c386d10d1cbc4e20dcdfdfd6cae 100644
--- a/tensorflow/compiler/xla/service/transpose_folding.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding.cc
@@ -42,7 +42,7 @@ TransposeFolding::OperandIndices CanFoldOperandsIntoDot(
   TransposeFolding::OperandIndices operand_set;
   for (int64 i = 0; i < dot.operand_count(); ++i) {
     auto& operand = *dot.operand(i);
-    if (operand.IsRank2Transpose() && operand.user_count() == 1) {
+    if (operand.IsRank2Transpose()) {
       operand_set.push_back(i);
     }
   }
@@ -58,27 +58,10 @@ TransposeFolding::OperandIndices CanFoldOperandsIntoConvolution(
     return {};
   }
 
-  const ConvolutionDimensionNumbers& dnums =
-      convolution.convolution_dimension_numbers();
-
   TransposeFolding::OperandIndices operand_set;
   for (int64 i = 0; i < convolution.operand_count(); ++i) {
     auto& operand = *convolution.operand(i);
-    if (operand.opcode() == HloOpcode::kTranspose &&
-        operand.user_count() == 1) {
-      const auto& transpose_dimensions = operand.dimensions();
-      // We can transpose the LHS so long as it doesn't move around spatial
-      // dimensions because ConvolutionDimensionNumbers doesn't have different
-      // fields for input and output spatial dimensions.
-      if (i == 0 &&
-          std::any_of(dnums.spatial_dimensions().begin(),
-                      dnums.spatial_dimensions().end(),
-                      [&](const int64 spatial_dimension) {
-                        return transpose_dimensions[spatial_dimension] !=
-                               spatial_dimension;
-                      })) {
-        continue;
-      }
+    if (operand.opcode() == HloOpcode::kTranspose) {
       operand_set.push_back(i);
     }
   }
@@ -118,6 +101,10 @@ bool FoldTransposeIntoConvolution(InstructionOperandsPair pair) {
   auto& convolution = *pair.first;
   auto& operand_indices = pair.second;
 
+  if (operand_indices.empty()) {
+    return false;
+  }
+
   const ConvolutionDimensionNumbers& dnums =
       convolution.convolution_dimension_numbers();
   ConvolutionDimensionNumbers new_dnums = dnums;
@@ -137,8 +124,9 @@ bool FoldTransposeIntoConvolution(InstructionOperandsPair pair) {
         transpose_dimensions[dnums.input_batch_dimension()]);
     new_dnums.set_input_feature_dimension(
         transpose_dimensions[dnums.input_feature_dimension()]);
-    for (const auto& spatial_dimension : dnums.spatial_dimensions()) {
-      CHECK_EQ(spatial_dimension, transpose_dimensions[spatial_dimension]);
+    for (auto& input_spatial_dimension :
+         *new_dnums.mutable_input_spatial_dimensions()) {
+      input_spatial_dimension = transpose_dimensions[input_spatial_dimension];
     }
     new_lhs = &transpose_operand;
   } else {
diff --git a/tensorflow/compiler/xla/service/transpose_folding_test.cc b/tensorflow/compiler/xla/service/transpose_folding_test.cc
index 00462f9be1e9beb2f2694060ebfaa70b0b9dd4a0..caa1a111ad880b9dee62c1c94e32e8275c196fbf 100644
--- a/tensorflow/compiler/xla/service/transpose_folding_test.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding_test.cc
@@ -64,9 +64,12 @@ TEST_F(TransposeFoldingTest, FoldDotTranspose) {
   HloInstruction* transpose_y =
       builder.AddInstruction(HloInstruction::CreateTranspose(
           ShapeUtil::MakeShape(F32, {3, 2}), y, {1, 0}));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {2, 2}), /*opcode=*/HloOpcode::kDot,
-      /*lhs=*/x, /*rhs=*/transpose_y));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  HloInstruction* dot = builder.AddInstruction(
+      HloInstruction::CreateDot(ShapeUtil::MakeShape(F32, {2, 2}), /*lhs=*/x,
+                                /*rhs=*/transpose_y, dot_dnums));
 
   HloModule module("test_module");
   HloComputation* entry_computation =
@@ -104,9 +107,12 @@ TEST_F(TransposeFoldingTest, FoldDotTransposeConstant) {
   HloInstruction* transpose1 =
       builder.AddInstruction(HloInstruction::CreateTranspose(
           ShapeUtil::MakeShape(F32, {2, 3}), const1, {1, 0}));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {1, 3}), /*opcode=*/HloOpcode::kDot,
-      /*lhs=*/transpose0, /*rhs=*/transpose1));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateDot(
+      ShapeUtil::MakeShape(F32, {1, 3}),
+      /*lhs=*/transpose0, /*rhs=*/transpose1, dot_dnums));
 
   HloModule module("test_module");
   HloComputation* entry_computation =
@@ -169,9 +175,12 @@ TEST_F(TransposeFoldingTest, FoldDotTransposeInWhile) {
   HloInstruction* transpose_y =
       builder.AddInstruction(HloInstruction::CreateTranspose(
           ShapeUtil::MakeShape(F32, {3, 2}), y, {1, 0}));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {2, 2}), /*opcode=*/HloOpcode::kDot,
-      /*lhs=*/x, /*rhs=*/transpose_y));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  HloInstruction* dot = builder.AddInstruction(
+      HloInstruction::CreateDot(ShapeUtil::MakeShape(F32, {2, 2}), /*lhs=*/x,
+                                /*rhs=*/transpose_y, dot_dnums));
 
   HloModule module("test_module");
   HloComputation* entry_computation =
@@ -362,10 +371,82 @@ TEST_F(TransposeFoldingTest, FoldConvTransposeLhs) {
   EXPECT_EQ(
       dnums.input_batch_dimension(),
       new_conv->convolution_dimension_numbers().input_feature_dimension());
-  EXPECT_EQ(dnums.spatial_dimensions(0),
-            new_conv->convolution_dimension_numbers().spatial_dimensions(0));
-  EXPECT_EQ(dnums.spatial_dimensions(1),
-            new_conv->convolution_dimension_numbers().spatial_dimensions(1));
+  EXPECT_EQ(
+      dnums.input_spatial_dimensions(0),
+      new_conv->convolution_dimension_numbers().input_spatial_dimensions(0));
+  EXPECT_EQ(
+      dnums.input_spatial_dimensions(1),
+      new_conv->convolution_dimension_numbers().input_spatial_dimensions(1));
+  EXPECT_EQ(
+      dnums.output_spatial_dimensions(0),
+      new_conv->convolution_dimension_numbers().output_spatial_dimensions(0));
+  EXPECT_EQ(
+      dnums.output_spatial_dimensions(1),
+      new_conv->convolution_dimension_numbers().output_spatial_dimensions(1));
+}
+
+// Test that a transpose of every dimension in the activations gets folded into
+// convolution.
+TEST_F(TransposeFoldingTest, FoldConvComplexTransposeLhs) {
+  auto builder = HloComputation::Builder("entry_computation");
+  HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {3, 2, 1, 1}),
+      /*name=*/"x"));
+  HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, ShapeUtil::MakeShape(F32, {2, 3, 1, 1}),
+      /*name=*/"y"));
+  HloInstruction* transpose_x =
+      builder.AddInstruction(HloInstruction::CreateTranspose(
+          ShapeUtil::MakeShape(F32, {2, 3, 1, 1}), x, {1, 0, 3, 2}));
+  auto dnums = ComputationBuilder::CreateDefaultConvDimensionNumbers();
+  Window window;
+  for (int i = 0; i < 2; ++i) {
+    WindowDimension* dim = window.add_dimensions();
+    dim->set_padding_low(0);
+    dim->set_padding_high(0);
+    dim->set_base_dilation(1);
+    dim->set_window_dilation(1);
+    dim->set_stride(1);
+    dim->set_size(y->shape().dimensions(dnums.kernel_spatial_dimensions(i)));
+  }
+  StatusOr<Shape> conv_shape = ShapeInference::InferConvolveShape(
+      transpose_x->shape(), y->shape(), window, dnums);
+  EXPECT_IS_OK(conv_shape);
+  HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
+      conv_shape.ValueOrDie(), transpose_x, y, window, dnums));
+
+  HloModule module("test_module");
+  HloComputation* entry_computation =
+      module.AddEntryComputation(builder.Build(conv));
+  FoldTranspose(&module);
+
+  // Instructions after folding: x, y, and the convolution.
+  std::unordered_set<HloInstruction*> instruction_set(
+      entry_computation->instructions().begin(),
+      entry_computation->instructions().end());
+  EXPECT_EQ(1, instruction_set.erase(x)) << "x is not in entry_computation.";
+  EXPECT_EQ(1, instruction_set.erase(y)) << "y is not in entry_computation.";
+  EXPECT_EQ(1, instruction_set.size())
+      << "entry_computation should contain exactly 3 instructions.";
+  HloInstruction* new_conv = *instruction_set.begin();
+  EXPECT_EQ(HloOpcode::kConvolution, new_conv->opcode());
+  EXPECT_EQ(dnums.input_feature_dimension(),
+            new_conv->convolution_dimension_numbers().input_batch_dimension());
+  EXPECT_EQ(
+      dnums.input_batch_dimension(),
+      new_conv->convolution_dimension_numbers().input_feature_dimension());
+  EXPECT_EQ(
+      dnums.input_spatial_dimensions(0),
+      new_conv->convolution_dimension_numbers().input_spatial_dimensions(1));
+  EXPECT_EQ(
+      dnums.input_spatial_dimensions(1),
+      new_conv->convolution_dimension_numbers().input_spatial_dimensions(0));
+  EXPECT_EQ(
+      dnums.output_spatial_dimensions(0),
+      new_conv->convolution_dimension_numbers().output_spatial_dimensions(0));
+  EXPECT_EQ(
+      dnums.output_spatial_dimensions(1),
+      new_conv->convolution_dimension_numbers().output_spatial_dimensions(1));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc
index 8f63c92e5b957189ad474459d4eed53986cecaae..066ffcd7e958ed40b324dc65da209b33bc0f98f9 100644
--- a/tensorflow/compiler/xla/service/user_computation.cc
+++ b/tensorflow/compiler/xla/service/user_computation.cc
@@ -88,8 +88,6 @@ HloOpcode BinaryOperationToHloOpcode(BinaryOperation binop) {
       return HloOpcode::kAtan2;
     case BINOP_COMPLEX:
       return HloOpcode::kComplex;
-    case BINOP_DOT:
-      return HloOpcode::kDot;
     case BINOP_MUL:
       return HloOpcode::kMultiply;
     case BINOP_ADD:
@@ -765,6 +763,54 @@ StatusOr<ComputationDataHandle> UserComputation::AddWhileInstruction(
   return handle;
 }
 
+StatusOr<ComputationDataHandle> UserComputation::AddConditionalInstruction(
+    const ConditionalRequest& conditional_request,
+    const UserComputation& true_computation,
+    const UserComputation& false_computation) {
+  tensorflow::mutex_lock lock(mutex_);
+
+  TF_ASSIGN_OR_RETURN(const OperationRequest* pred,
+                      LookUpRequest(conditional_request.predicate()));
+  TF_ASSIGN_OR_RETURN(const OperationRequest* true_operand,
+                      LookUpRequest(conditional_request.true_operand()));
+  TF_ASSIGN_OR_RETURN(const OperationRequest* false_operand,
+                      LookUpRequest(conditional_request.false_operand()));
+
+  VersionedComputationHandle::Version true_computation_version =
+      true_computation.version();
+  TF_ASSIGN_OR_RETURN(
+      std::shared_ptr<const ProgramShape> true_computation_shape,
+      true_computation.ComputeProgramShape(true_computation_version));
+
+  VersionedComputationHandle::Version false_computation_version =
+      false_computation.version();
+  TF_ASSIGN_OR_RETURN(
+      std::shared_ptr<const ProgramShape> false_computation_shape,
+      false_computation.ComputeProgramShape(false_computation_version));
+
+  TF_ASSIGN_OR_RETURN(Shape inferred_shape,
+                      ShapeInference::InferConditionalShape(
+                          pred->output_shape(), true_operand->output_shape(),
+                          false_operand->output_shape(),
+                          *true_computation_shape, *false_computation_shape));
+
+  ComputationDataHandle handle = CreateComputationDataHandle();
+
+  OperationRequest& request =
+      (*session_computation_.mutable_requests())[handle.handle()];
+  *request.mutable_output_handle() = handle;
+  *request.mutable_output_shape() = inferred_shape;
+  request.add_embedded_computation_versions(true_computation_version);
+  request.add_embedded_computation_versions(false_computation_version);
+  *request.mutable_request()->mutable_conditional_request() =
+      conditional_request;
+
+  VLOG(1) << "AddConditionalInstruction (" << GetVersionedHandleInternal()
+          << "), data handle " << handle.handle() << ": "
+          << conditional_request.ShortDebugString();
+  return handle;
+}
+
 StatusOr<ComputationDataHandle> UserComputation::AddBroadcastInstruction(
     const BroadcastRequest& broadcast_request) {
   tensorflow::mutex_lock lock(mutex_);
@@ -994,6 +1040,32 @@ StatusOr<ComputationDataHandle> UserComputation::AddConvertInstruction(
   return handle;
 }
 
+StatusOr<ComputationDataHandle> UserComputation::AddBitcastConvertInstruction(
+    const ConvertRequest& convert_request) {
+  tensorflow::mutex_lock lock(mutex_);
+
+  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
+                      LookUpRequest(convert_request.operand()));
+
+  TF_ASSIGN_OR_RETURN(Shape new_shape, ShapeInference::InferConvertShape(
+                                           operand->output_shape(),
+                                           convert_request.new_element_type()));
+
+  ComputationDataHandle handle = CreateComputationDataHandle();
+
+  OperationRequest& request =
+      (*session_computation_.mutable_requests())[handle.handle()];
+  *request.mutable_output_handle() = handle;
+  *request.mutable_output_shape() = new_shape;
+  *request.mutable_request()->mutable_bitcast_convert_request() =
+      convert_request;
+
+  VLOG(1) << "AddBitcastConvertInstruction (" << GetVersionedHandleInternal()
+          << "), data handle " << handle.handle() << ": "
+          << convert_request.ShortDebugString();
+  return handle;
+}
+
 StatusOr<ComputationDataHandle> UserComputation::AddReducePrecisionInstruction(
     const ReducePrecisionRequest& reduce_precision_request) {
   tensorflow::mutex_lock lock(mutex_);
@@ -1056,7 +1128,7 @@ StatusOr<ComputationDataHandle> UserComputation::AddCrossReplicaSumInstruction(
   TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
                       LookUpRequest(cross_replica_sum_request.operand()));
   TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferCrossReplicaSumShape(
-                                       operand->output_shape()));
+                                       {&operand->output_shape()}));
 
   ComputationDataHandle handle = CreateComputationDataHandle();
 
@@ -1181,6 +1253,33 @@ StatusOr<ComputationDataHandle> UserComputation::AddCustomCallInstruction(
   return handle;
 }
 
+StatusOr<ComputationDataHandle> UserComputation::AddDotInstruction(
+    const DotRequest& dot_request) {
+  tensorflow::mutex_lock lock(mutex_);
+
+  TF_ASSIGN_OR_RETURN(const OperationRequest* lhs,
+                      LookUpRequest(dot_request.lhs()));
+  TF_ASSIGN_OR_RETURN(const OperationRequest* rhs,
+                      LookUpRequest(dot_request.rhs()));
+
+  TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferDotOpShape(
+                                       lhs->output_shape(), rhs->output_shape(),
+                                       dot_request.dimension_numbers()));
+
+  const ComputationDataHandle handle = CreateComputationDataHandle();
+
+  OperationRequest& request =
+      (*session_computation_.mutable_requests())[handle.handle()];
+  *request.mutable_output_handle() = handle;
+  *request.mutable_output_shape() = shape;
+  *request.mutable_request()->mutable_dot_request() = dot_request;
+
+  VLOG(1) << "AddDotInstruction (" << GetVersionedHandleInternal()
+          << "), data handle " << handle.handle() << ": "
+          << dot_request.ShortDebugString();
+  return handle;
+}
+
 StatusOr<ComputationDataHandle> UserComputation::AddUnaryInstruction(
     const UnaryOpRequest& unary_request) {
   tensorflow::mutex_lock lock(mutex_);
@@ -1407,7 +1506,7 @@ StatusOr<const OperationRequest*> LookUpRequest(
   return &session_computation.requests().at(handle_value);
 }
 
-// Returns the OperationRequestion corresponding to the root (result) of the
+// Returns the OperationRequest corresponding to the root (result) of the
 // session computation.
 StatusOr<const OperationRequest*> GetRoot(
     VersionedComputationHandle::Version version,
@@ -1453,8 +1552,8 @@ UserComputation::ComputeProgramShape(
             request.request().parameter_request();
         int64 param_no = parameter_request.parameter();
         // Parameters may be out of order so expand ProgramShape parameters
-        // until
-        // it is at least large enough to hold the current parameter number.
+        // until it is at least large enough to hold the current parameter
+        // number.
         while (program_shape->parameters_size() <= param_no) {
           program_shape->add_parameters();
           program_shape->add_parameter_names();
@@ -1603,6 +1702,15 @@ void PureFunctionalVisitor(const SessionComputation& session_computation,
       break;
     }
 
+    case OpRequest::kDotRequest: {
+      const DotRequest& dot_request = request.request().dot_request();
+      PureFunctionalVisitor(session_computation, dot_request.lhs(),
+                            num_parameters, visited, is_functional);
+      PureFunctionalVisitor(session_computation, dot_request.rhs(),
+                            num_parameters, visited, is_functional);
+      break;
+    }
+
     case OpRequest::kSendRequest: {
       *is_functional = false;
       break;
@@ -1713,6 +1821,14 @@ void PureFunctionalVisitor(const SessionComputation& session_computation,
       break;
     }
 
+    case OpRequest::kBitcastConvertRequest: {
+      const ConvertRequest& convert_request =
+          request.request().bitcast_convert_request();
+      PureFunctionalVisitor(session_computation, convert_request.operand(),
+                            num_parameters, visited, is_functional);
+      break;
+    }
+
     case OpRequest::kWhileRequest: {
       const WhileRequest& while_request = request.request().while_request();
       PureFunctionalVisitor(session_computation, while_request.init(),
@@ -1723,6 +1839,23 @@ void PureFunctionalVisitor(const SessionComputation& session_computation,
       break;
     }
 
+    case OpRequest::kConditionalRequest: {
+      const ConditionalRequest& conditional_request =
+          request.request().conditional_request();
+      PureFunctionalVisitor(session_computation,
+                            conditional_request.predicate(), num_parameters,
+                            visited, is_functional);
+      PureFunctionalVisitor(session_computation,
+                            conditional_request.true_operand(), num_parameters,
+                            visited, is_functional);
+      PureFunctionalVisitor(session_computation,
+                            conditional_request.false_operand(), num_parameters,
+                            visited, is_functional);
+      // TODO(b/32495713): We aren't checking the true and false computations
+      // themselves.
+      break;
+    }
+
     case OpRequest::kTernaryOpRequest: {
       const TernaryOpRequest& ternary_op_request =
           request.request().ternary_op_request();
@@ -1951,6 +2084,21 @@ UserComputation::GetEmbeddedComputations(
           break;
         }
 
+        case OpRequest::kConditionalRequest: {
+          CHECK_EQ(2, request.embedded_computation_versions_size());
+          const ConditionalRequest& conditional_request =
+              request.request().conditional_request();
+          const VersionedComputationHandle true_computation_versioned_handle = {
+              conditional_request.true_computation(),
+              request.embedded_computation_versions(0)};
+          computations.push_back(true_computation_versioned_handle);
+          const VersionedComputationHandle false_computation_versioned_handle =
+              {conditional_request.false_computation(),
+               request.embedded_computation_versions(1)};
+          computations.push_back(false_computation_versioned_handle);
+          break;
+        }
+
         default:
           // No embedded computation.
           break;
@@ -2037,6 +2185,16 @@ Status UserComputation::RemapEmbeddedComputations(
         TF_RETURN_IF_ERROR(update(while_request->mutable_body()));
         break;
       }
+      case OpRequest::kConditionalRequest: {
+        TF_RET_CHECK(2 == request.embedded_computation_versions_size());
+        ConditionalRequest* conditional_request =
+            request.mutable_request()->mutable_conditional_request();
+        TF_RETURN_IF_ERROR(
+            update(conditional_request->mutable_true_computation()));
+        TF_RETURN_IF_ERROR(
+            update(conditional_request->mutable_false_computation()));
+        break;
+      }
       default:
         // No embedded computation.
         TF_RET_CHECK(0 == request.embedded_computation_versions_size());
@@ -2370,12 +2528,28 @@ static void ForEachOperand(
       break;
     }
 
+    case OpRequest::kBitcastConvertRequest: {
+      const ConvertRequest& convert_request =
+          request.request().bitcast_convert_request();
+      apply(convert_request.operand());
+      break;
+    }
+
     case OpRequest::kWhileRequest: {
       const WhileRequest& while_request = request.request().while_request();
       apply(while_request.init());
       break;
     }
 
+    case OpRequest::kConditionalRequest: {
+      const ConditionalRequest& conditional_request =
+          request.request().conditional_request();
+      apply(conditional_request.predicate());
+      apply(conditional_request.true_operand());
+      apply(conditional_request.false_operand());
+      break;
+    }
+
     case OpRequest::kTernaryOpRequest: {
       const TernaryOpRequest& ternary_op_request =
           request.request().ternary_op_request();
@@ -2412,6 +2586,13 @@ static void ForEachOperand(
       break;
     }
 
+    case OpRequest::kDotRequest: {
+      const DotRequest& dot_request = request.request().dot_request();
+      apply(dot_request.rhs());
+      apply(dot_request.lhs());
+      break;
+    }
+
     case OpRequest::kUnaryOpRequest: {
       const UnaryOpRequest& unary_op_request =
           request.request().unary_op_request();
@@ -2691,13 +2872,22 @@ void ComputationLowerer::Visit(
       break;
     }
 
+    case OpRequest::kDotRequest: {
+      const DotRequest& dot_request = request.request().dot_request();
+      HloInstruction* lhs = lookup_instruction(dot_request.lhs());
+      HloInstruction* rhs = lookup_instruction(dot_request.rhs());
+      hlo_instruction = add_instruction(HloInstruction::CreateDot(
+          request.output_shape(), lhs, rhs, dot_request.dimension_numbers()));
+      break;
+    }
+
     case OpRequest::kCrossReplicaSumRequest: {
       const CrossReplicaSumRequest& cross_replica_sum_request =
           request.request().cross_replica_sum_request();
       HloInstruction* operand =
           lookup_instruction(cross_replica_sum_request.operand());
       hlo_instruction = add_instruction(HloInstruction::CreateCrossReplicaSum(
-          request.output_shape(), operand));
+          request.output_shape(), {operand}));
       break;
     }
 
@@ -2954,6 +3144,15 @@ void ComputationLowerer::Visit(
       break;
     }
 
+    case OpRequest::kBitcastConvertRequest: {
+      const ConvertRequest& convert_request =
+          request.request().bitcast_convert_request();
+      HloInstruction* operand = lookup_instruction(convert_request.operand());
+      hlo_instruction = add_instruction(HloInstruction::CreateBitcastConvert(
+          request.output_shape(), operand));
+      break;
+    }
+
     case OpRequest::kWhileRequest: {
       const WhileRequest& while_request = request.request().while_request();
       CHECK_EQ(2, request.embedded_computation_versions_size());
@@ -2971,6 +3170,30 @@ void ComputationLowerer::Visit(
       break;
     }
 
+    case OpRequest::kConditionalRequest: {
+      const ConditionalRequest& conditional_request =
+          request.request().conditional_request();
+      CHECK_EQ(2, request.embedded_computation_versions_size());
+      VersionedComputationHandle::Version true_computation_version =
+          request.embedded_computation_versions(0);
+      HloComputation* true_computation = ResolveComputation(
+          conditional_request.true_computation(), true_computation_version);
+      VersionedComputationHandle::Version false_computation_version =
+          request.embedded_computation_versions(1);
+      HloComputation* false_computation = ResolveComputation(
+          conditional_request.false_computation(), false_computation_version);
+      HloInstruction* predicate =
+          lookup_instruction(conditional_request.predicate());
+      HloInstruction* true_operand =
+          lookup_instruction(conditional_request.true_operand());
+      HloInstruction* false_operand =
+          lookup_instruction(conditional_request.false_operand());
+      hlo_instruction = add_instruction(HloInstruction::CreateConditional(
+          request.output_shape(), predicate, true_operand, true_computation,
+          false_operand, false_computation));
+      break;
+    }
+
     case OpRequest::kTernaryOpRequest: {
       const TernaryOpRequest& ternary_op_request =
           request.request().ternary_op_request();
@@ -2978,6 +3201,25 @@ void ComputationLowerer::Visit(
       HloInstruction* rhs = lookup_instruction(ternary_op_request.rhs());
       HloInstruction* ehs = lookup_instruction(ternary_op_request.ehs());
       auto hlo_opcode = TernaryOperationToHloOpcode(ternary_op_request.triop());
+
+      if (debug_options_.xla_eliminate_hlo_implicit_broadcast()) {
+        if (!ShapeUtil::SameDimensions(request.output_shape(), lhs->shape())) {
+          // lhs side is being implicitly broadcast. Change to explicit.
+          lhs =
+              ImplicitBroadcastToExplicitBroadcast(lhs, request.output_shape());
+        }
+
+        if (!ShapeUtil::SameDimensions(request.output_shape(), rhs->shape())) {
+          rhs =
+              ImplicitBroadcastToExplicitBroadcast(rhs, request.output_shape());
+        }
+
+        if (!ShapeUtil::SameDimensions(request.output_shape(), ehs->shape())) {
+          ehs =
+              ImplicitBroadcastToExplicitBroadcast(ehs, request.output_shape());
+        }
+      }
+
       hlo_instruction = add_instruction(HloInstruction::CreateTernary(
           request.output_shape(), hlo_opcode, lhs, rhs, ehs));
       break;
@@ -3082,8 +3324,7 @@ void ComputationLowerer::Visit(
         lhs = (lhs == operand_to_broadcast) ? broadcasted_operand : lhs;
         rhs = (rhs == operand_to_broadcast) ? broadcasted_operand : rhs;
       }
-      if (debug_options_.xla_eliminate_hlo_implicit_broadcast() &&
-          binary_op_request.binop() != BINOP_DOT) {
+      if (debug_options_.xla_eliminate_hlo_implicit_broadcast()) {
         if (!ShapeUtil::SameDimensions(request.output_shape(), lhs->shape())) {
           // lhs side is being implicitly broadcast. Change to explicit.
           lhs =
@@ -3137,7 +3378,7 @@ void ComputationLowerer::Visit(
       LOG(FATAL) << "Unexpected request type: " << request.request().op_case();
   }
   (*instructions)[handle.handle()] = hlo_instruction;
-}
+}  // NOLINT(readability/fn_size)
 
 }  // namespace
 
diff --git a/tensorflow/compiler/xla/service/user_computation.h b/tensorflow/compiler/xla/service/user_computation.h
index ac879ce55a75f6241a39f935b79017be46c1816b..8a78d520e19024f5e397d6e0c2f4e0523264e176 100644
--- a/tensorflow/compiler/xla/service/user_computation.h
+++ b/tensorflow/compiler/xla/service/user_computation.h
@@ -70,7 +70,7 @@ class UserComputation {
 
   // Enqueues a pad instruction onto this user computation.
   StatusOr<ComputationDataHandle> AddPadInstruction(
-      const PadRequest& parameter_request);
+      const PadRequest& pad_request);
 
   // Enqueues a tracing instruction onto this user computation.
   // Returns an error status if the operand cannot be resolved.
@@ -105,7 +105,7 @@ class UserComputation {
   // Enqueues a ternary instruction onto this user computation.
   // Returns an error status if the operand indices are out of bounds.
   StatusOr<ComputationDataHandle> AddTernaryInstruction(
-      const TernaryOpRequest& request);
+      const TernaryOpRequest& ternary_request);
 
   // Enqueues a variadic instruction onto this user computation.
   // Returns an error status if the operand indices are out of bounds.
@@ -153,6 +153,10 @@ class UserComputation {
   StatusOr<ComputationDataHandle> AddCustomCallInstruction(
       const CustomCallRequest& custom_call_request);
 
+  // Enqueues a dot instruction onto this user computation.
+  StatusOr<ComputationDataHandle> AddDotInstruction(
+      const DotRequest& dot_request);
+
   // Enqueues a broadcast instruction onto this user computation.
   StatusOr<ComputationDataHandle> AddBroadcastInstruction(
       const BroadcastRequest& broadcast_request);
@@ -179,26 +183,30 @@ class UserComputation {
 
   // Enqueues a concatenate instruction onto this user computation.
   StatusOr<ComputationDataHandle> AddConcatenateInstruction(
-      const ConcatenateRequest& slice_request);
+      const ConcatenateRequest& concatenate_request);
 
   // Enqueues a convert instruction onto this user computation.
   StatusOr<ComputationDataHandle> AddConvertInstruction(
       const ConvertRequest& convert_request);
 
+  // Enqueues a bitcast element instruction onto this user computation.
+  StatusOr<ComputationDataHandle> AddBitcastConvertInstruction(
+      const ConvertRequest& convert_request);
+
   // Enqueues a reduce instruction onto this user computation.
   StatusOr<ComputationDataHandle> AddReduceInstruction(
       const ReduceRequest& reduce_request,
-      const UserComputation& reduction_computation);
+      const UserComputation& to_apply_computation);
 
   // Enqueues a windowed reduce instruction onto this user computation.
   StatusOr<ComputationDataHandle> AddReduceWindowInstruction(
       const ReduceWindowRequest& reduce_window_request,
-      const UserComputation& reduction_computation);
+      const UserComputation& to_apply_computation);
 
   // Enqueues a select-and-scatter instruction onto this user
   // computation.
   StatusOr<ComputationDataHandle> AddSelectAndScatterInstruction(
-      const SelectAndScatterRequest& scatter_to_selected_window_element_request,
+      const SelectAndScatterRequest& select_and_scatter_request,
       const UserComputation& select_computation,
       const UserComputation& scatter_computation);
 
@@ -212,6 +220,12 @@ class UserComputation {
       const UserComputation& condition_computation,
       const UserComputation& body_computation);
 
+  // Enqueues a conditional instruction on this user computation.
+  StatusOr<ComputationDataHandle> AddConditionalInstruction(
+      const ConditionalRequest& conditional_request,
+      const UserComputation& true_computation,
+      const UserComputation& false_computation);
+
   // Enqueues a Send instruction onto this user computation.
   Status AddSendInstruction(const SendRequest& send_request);
 
diff --git a/tensorflow/compiler/xla/service/user_computation_test.cc b/tensorflow/compiler/xla/service/user_computation_test.cc
index 5afaf226ae0cce7e9afc966c6b4adf838aeebc91..ca02115863e6906ef709ba63259024877e0dcef4 100644
--- a/tensorflow/compiler/xla/service/user_computation_test.cc
+++ b/tensorflow/compiler/xla/service/user_computation_test.cc
@@ -65,6 +65,7 @@ TEST_F(UserComputationTest, SimpleComputation) {
 
   OutfeedRequest outfeed_request;
   *outfeed_request.mutable_operand() = constant_handle;
+  *outfeed_request.mutable_shape() = kVectorShape;
   outfeed_request.set_outfeed_config("abc");
   TF_ASSERT_OK(computation.AddOutfeedInstruction(outfeed_request));
 
@@ -334,50 +335,5 @@ TEST_F(UserComputationTest, EliminateDegenerateBroadcastAfterIndimBroadcast) {
               operands[1]->opcode() == HloOpcode::kBroadcast);
 }
 
-TEST_F(UserComputationTest, SkipDotInEliminatingImplicitBroadcast) {
-  auto debug_options = DebugOptions();
-  debug_options.set_xla_eliminate_hlo_implicit_broadcast(true);
-
-  //  %a = Param({1, 3});
-  //  %b = Param({3, 1});
-  //  %dot = Dot(%a, %b);
-  ComputationHandle handle;
-  handle.set_handle(123);
-  UserComputation computation("TheComputation", handle);
-
-  ParameterRequest a_request;
-  *a_request.mutable_shape() = ShapeUtil::MakeShape(F32, {1, 3});
-  a_request.set_name("a");
-  a_request.set_parameter(0);
-  TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle a_handle,
-                          computation.AddParameterInstruction(a_request));
-
-  ParameterRequest b_request;
-  *b_request.mutable_shape() = ShapeUtil::MakeShape(F32, {3, 1});
-  b_request.set_name("b");
-  b_request.set_parameter(1);
-  TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle b_handle,
-                          computation.AddParameterInstruction(b_request));
-
-  BinaryOpRequest dot;
-  dot.set_binop(BINOP_DOT);
-  *dot.mutable_lhs() = a_handle;
-  *dot.mutable_rhs() = b_handle;
-  TF_ASSERT_OK(computation.AddBinaryInstruction(dot).status());
-
-  auto hlo_resolver = [](const VersionedComputationHandle& handle) {
-    return nullptr;
-  };
-  VersionedComputationHandle latest_version = computation.GetVersionedHandle();
-
-  // Build the HLO computation.
-  TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<HloComputation> hlo_computation,
-      computation.BuildHloComputation(latest_version.version, hlo_resolver,
-                                      debug_options));
-
-  EXPECT_EQ(3, hlo_computation->instruction_count());
-}
-
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
index 2fac914892e07b1935581e770293ddf00af7bc41..fb0e6f7ce00cff48727dc55bf45c07994643331d 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
@@ -289,7 +289,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
   // Don't try this transformation if the while loop isn't removable, since if
   // it succeeds ultimately we're going to have to replace the old while loop
   // with a new one.
-  if (!while_op->parent()->IsRemovable(while_op)) {
+  if (!while_op->parent()->IsRemovable(while_op) || while_op->HasSideEffect()) {
     VLOG(2) << "Can't remove dead parameters from non-removable while op.";
     return false;
   }
@@ -306,6 +306,8 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
     return false;
   }
 
+  auto print_no_metadata = HloPrintOptions().set_print_metadata(false);
+
   // Bail if param0 of while_cond or while_body has users which aren't of type
   // get-tuple-element.
   for (const HloInstruction* instr : {while_body->parameter_instruction(0),
@@ -313,9 +315,10 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
     for (const HloInstruction* user : instr->users()) {
       if (user->opcode() != HloOpcode::kGetTupleElement) {
         VLOG(2) << "Cowardly refusing to analyze while loop with "
-                << instr->ToStringNoMetadata()
-                << " used by non-GTE instruction " << user->ToStringNoMetadata()
-                << " in computation " << instr->parent()->name();
+                << instr->ToString(print_no_metadata)
+                << " used by non-GTE instruction "
+                << user->ToString(print_no_metadata) << " in computation "
+                << instr->parent()->name();
         return false;
       }
     }
@@ -342,7 +345,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
       //
       // Careful: HloInstruction::operand_index returns the first index the
       // operand appears in, but it may appear more than once!
-      if (user->user_count() == 1 && user->users()[0] == while_body_root &&
+      if (user->user_count() == 1 && user->users().front() == while_body_root &&
           while_body_root->operand_index(user) == user->tuple_index() &&
           std::count(while_body_root->operands().begin(),
                      while_body_root->operands().end(), user) == 1) {
@@ -351,7 +354,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
 
       used_tuple_indices.insert(user->tuple_index());
       if (used_tuple_indices.size() == tuple_size) {
-        VLOG(2) << "Loop " << while_op->ToStringNoMetadata()
+        VLOG(2) << "Loop " << while_op->ToString(print_no_metadata)
                 << " uses all of its inputs; no simplification possible.";
         return false;
       }
@@ -375,7 +378,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
       used_tuple_indices.insert(i);
 
       if (used_tuple_indices.size() == tuple_size) {
-        VLOG(2) << "Loop " << while_op->ToStringNoMetadata()
+        VLOG(2) << "Loop " << while_op->ToString(print_no_metadata)
                 << " uses all of its inputs; no simplification possible.";
         return false;
       }
@@ -387,7 +390,8 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
   CHECK_LT(used_tuple_indices.size(), tuple_size);
 
   VLOG(1) << "Eliminating " << tuple_size - used_tuple_indices.size()
-          << " elements from tuple of " << while_op->ToStringNoMetadata();
+          << " elements from tuple of "
+          << while_op->ToString(print_no_metadata);
 
   // Build up maps from the old/new to the new/old tuple indices.
   std::vector<int64> new_to_old_tuple_idx(used_tuple_indices.begin(),
@@ -403,6 +407,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
 
   // Compute the shape of the while op after we remove the dead indices.
   std::vector<Shape> new_while_tuple_elem_shapes;
+  new_while_tuple_elem_shapes.reserve(new_to_old_tuple_idx.size());
   for (int64 old_idx : new_to_old_tuple_idx) {
     new_while_tuple_elem_shapes.push_back(
         while_init->shape().tuple_shapes(old_idx));
@@ -430,7 +435,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
         continue;
       }
       CHECK_EQ(user->opcode(), HloOpcode::kGetTupleElement)
-          << user->ToStringNoMetadata();
+          << user->ToString(print_no_metadata);
 
       int64 old_idx = user->tuple_index();
       auto new_idx_iter = old_to_new_tuple_idx.find(old_idx);
@@ -443,15 +448,16 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
         // This is a GTE of an index that we've removed.  Remove it from the
         // cloned computation.
         CHECK(user->user_count() == 0 ||
-              user->user_count() == 1 && user->users()[0] == while_body_root)
-            << "Instruction " << user->ToStringNoMetadata()
+              user->user_count() == 1 &&
+                  user->users().front() == while_body_root)
+            << "Instruction " << user->ToString(print_no_metadata)
             << " should be unused (except by root of while body), but has "
                "users: {"
             << tensorflow::str_util::Join(
                    user->users(), ", ",
-                   [](string* out, const HloInstruction* instr) {
+                   [&](string* out, const HloInstruction* instr) {
                      tensorflow::strings::StrAppend(
-                         out, instr->ToStringNoMetadata());
+                         out, instr->ToString(print_no_metadata));
                    })
             << "}";
 
@@ -469,6 +475,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
   std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
       while_body_replacements = make_while_computation_replacements(while_body);
   std::vector<HloInstruction*> new_while_body_root_elems;
+  new_while_body_root_elems.reserve(new_to_old_tuple_idx.size());
   for (int64 old_idx : new_to_old_tuple_idx) {
     new_while_body_root_elems.push_back(
         while_body_root->mutable_operand(old_idx));
@@ -483,6 +490,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
   // clean this up in the common case where while_init is a tuple op.  (It's
   // definitely tuple-shaped, but it's not necessarily a tuple op.)
   std::vector<HloInstruction*> new_while_init_elems;
+  new_while_init_elems.reserve(new_to_old_tuple_idx.size());
   for (int64 old_idx : new_to_old_tuple_idx) {
     new_while_init_elems.push_back(
         computation->AddInstruction(HloInstruction::CreateGetTupleElement(
@@ -554,7 +562,7 @@ static StatusOr<bool> TryRemoveWhileLoop(HloInstruction* while_op) {
   // the loop aren't removed, just cloned and added back to the loop.
   // Nevertheless our infrastructure sees loop simplification as removal of
   // these nodes and currently doesn't allow it.
-  if (!while_op->parent()->IsRemovable(while_op)) {
+  if (!while_op->parent()->IsRemovable(while_op) || while_op->HasSideEffect()) {
     VLOG(2) << "Not attempting to remove while loop it is not removable: "
             << while_op->ToShortString();
     return false;
diff --git a/tensorflow/compiler/xla/shape_layout.cc b/tensorflow/compiler/xla/shape_layout.cc
index 5bf9842a6ce7be747f58c10f302f85c6f82ac6f9..789eba5780d37e1fd4d80ec881855951c8bba0eb 100644
--- a/tensorflow/compiler/xla/shape_layout.cc
+++ b/tensorflow/compiler/xla/shape_layout.cc
@@ -32,13 +32,13 @@ tensorflow::Status ShapeLayout::CopyLayoutFromShape(const Shape& other_shape) {
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status ShapeLayout::AssignLayoutToShape(Shape* other_shape) const {
-  if (!ShapeUtil::Compatible(*other_shape, shape_)) {
+tensorflow::Status ShapeLayout::AssignLayoutToShape(Shape* to_shape) const {
+  if (!ShapeUtil::Compatible(*to_shape, shape_)) {
     return InvalidArgument("Shape %s is not compatible with shape %s",
-                           ShapeUtil::HumanString(*other_shape).c_str(),
+                           ShapeUtil::HumanString(*to_shape).c_str(),
                            ShapeUtil::HumanString(shape()).c_str());
   }
-  *other_shape = shape_;
+  *to_shape = shape_;
   return tensorflow::Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/shape_layout.h b/tensorflow/compiler/xla/shape_layout.h
index 92564660f21bf1b596c4b9ca04c07eaca27ed192..4c83750f3e6f3c735db66d8e0b86ae3f43e5ca11 100644
--- a/tensorflow/compiler/xla/shape_layout.h
+++ b/tensorflow/compiler/xla/shape_layout.h
@@ -38,18 +38,19 @@ class ShapeLayout {
   explicit ShapeLayout(const Shape& shape) : shape_(shape) {}
 
   // Assigns the layouts in this ShapeLayout to the Layout fields of the given
-  // shape. 'shape' and the shape of the ShapeLayout object must be compatible.
-  tensorflow::Status AssignLayoutToShape(Shape* shape) const;
+  // shape. 'to_shape' and the shape of the ShapeLayout object must be
+  // compatible.
+  tensorflow::Status AssignLayoutToShape(Shape* to_shape) const;
 
   // Returns true if the Layouts in this ShapeLayout match the layouts in the
   // given shape. Returns false otherwise. If the given shape is not compatible
   // with the ShapeLayout's shape, then false is returned.
   bool MatchesLayoutInShape(const Shape& shape) const;
 
-  // Copies the layout from the given shape into this ShapeLayout. 'shape' must
-  // be compatible with the ShapeLayout's shape, and 'shape' must have a layout
-  // (LayoutUtil::HasLayout).
-  tensorflow::Status CopyLayoutFromShape(const Shape& shape);
+  // Copies the layout from the given shape into this ShapeLayout. 'other_shape'
+  // must be compatible with the ShapeLayout's shape, and 'other_shape' must
+  // have a layout (LayoutUtil::HasLayout).
+  tensorflow::Status CopyLayoutFromShape(const Shape& other_shape);
 
   // Clears (Layout::Clear) all the Layouts stored in this object.
   void Clear();
diff --git a/tensorflow/compiler/xla/shape_tree.h b/tensorflow/compiler/xla/shape_tree.h
index bf8d19015079f2ce0bd450594040ed818f94b66b..d752619bd65751779c24f061e44e206d66b01465 100644
--- a/tensorflow/compiler/xla/shape_tree.h
+++ b/tensorflow/compiler/xla/shape_tree.h
@@ -238,7 +238,7 @@ class ShapeTree {
   //           (or compatible).
   //   index : the index of the element in the shape. See ShapeUtil::GetSubshape
   //           for definition of index.
-  //   data : The data value at this elemnt.
+  //   data : The data value at this element.
   template <typename Fn>
   void ForEachElement(const Fn& func) const;
 
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index c0a0e13f073a639baa46151a68b83cfe92215c23..ead9f5c4ce76a8d452dd18f5cd1803a027556637 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <algorithm>
 #include <functional>
 #include <numeric>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 
@@ -64,30 +65,36 @@ namespace {
 // the shapes are the same. If compare_layouts is true, then layouts must also
 // match.
 bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) {
-  if (ShapeUtil::IsTuple(lhs)) {
-    return ShapeUtil::IsTuple(rhs) &&
+  if (ShapeUtil::IsTuple(lhs) || ShapeUtil::IsTuple(rhs)) {
+    return ShapeUtil::IsTuple(lhs) && ShapeUtil::IsTuple(rhs) &&
            ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(),
                            [=](const Shape& l, const Shape& r) {
                              return CompareShapes(l, r, compare_layouts);
                            });
+  } else if (ShapeUtil::IsOpaque(lhs) || ShapeUtil::IsOpaque(rhs)) {
+    return ShapeUtil::IsOpaque(lhs) && ShapeUtil::IsOpaque(rhs);
   }
-  // Explicitly compare the fields rather than using MessageDifferencer because
-  // we want empty layouts to be treated identically to missing layouts.
+
   if (compare_layouts) {
-    if (!ContainersEqual(lhs.layout().minor_to_major(),
-                         rhs.layout().minor_to_major())) {
-      VLOG(3) << "CompareShapes: lhs layout != rhs layout";
-      return false;
-    }
-    if (!ContainersEqual(lhs.layout().padded_dimensions(),
-                         rhs.layout().padded_dimensions())) {
-      VLOG(3)
-          << "CompareShapes: lhs padded_dimensions != rhs padded_dimensions";
+    if (lhs.layout().format() != rhs.layout().format()) {
       return false;
     }
-    if (lhs.layout().padding_value() != rhs.layout().padding_value()) {
-      VLOG(3) << "CompareShapes: lhs padding value != rhs padding_value";
-      return false;
+    if (LayoutUtil::IsDense(lhs)) {
+      if (!ContainersEqual(LayoutUtil::MinorToMajor(lhs),
+                           LayoutUtil::MinorToMajor(rhs))) {
+        VLOG(3) << "CompareShapes: lhs layout != rhs layout";
+        return false;
+      }
+      if (!ContainersEqual(lhs.layout().padded_dimensions(),
+                           rhs.layout().padded_dimensions())) {
+        VLOG(3)
+            << "CompareShapes: lhs padded_dimensions != rhs padded_dimensions";
+        return false;
+      }
+      if (lhs.layout().padding_value() != rhs.layout().padding_value()) {
+        VLOG(3) << "CompareShapes: lhs padding value != rhs padding_value";
+        return false;
+      }
     }
   }
 
@@ -235,6 +242,7 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
 }
 
 /* static */ void ShapeUtil::AppendMajorDimension(int bound, Shape* shape) {
+  CHECK(LayoutUtil::IsDense(*shape));
   shape->mutable_layout()->add_minor_to_major(Rank(*shape));
   shape->add_dimensions(bound);
   TF_DCHECK_OK(ValidateShape(*shape));
@@ -329,6 +337,14 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
   return MakeTupleShape(new_elements);
 }
 
+// Returns the shape of a real or imaginary component.
+/* static */ Shape ShapeUtil::ComplexComponentShape(
+    const Shape& complex_shape) {
+  CHECK(ElementIsComplex(complex_shape)) << HumanString(complex_shape);
+  return ChangeElementType(complex_shape, primitive_util::ComplexComponentType(
+                                              complex_shape.element_type()));
+}
+
 /* static */ bool ShapeUtil::ShapeIs(const Shape& shape,
                                      PrimitiveType element_type,
                                      std::initializer_list<int64> dimensions) {
@@ -396,6 +412,26 @@ const string& LowercasePrimitiveTypeName(PrimitiveType s) {
   static PrimitiveTypeNameGenerator* gen = new PrimitiveTypeNameGenerator();
   return gen->LowercaseName(s);
 }
+
+StatusOr<PrimitiveType> StringToPrimitiveType(const string& name) {
+  static std::unordered_map<string, PrimitiveType>* name_to_type = [] {
+    static auto* map = new std::unordered_map<string, PrimitiveType>;
+    for (int i = 0; i < PrimitiveType_ARRAYSIZE; i++) {
+      if (PrimitiveType_IsValid(i)) {
+        auto value = static_cast<PrimitiveType>(i);
+        (*map)[LowercasePrimitiveTypeName(value)] = value;
+      }
+    }
+    return map;
+  }();
+  auto found = name_to_type->find(name);
+  if (found == name_to_type->end()) {
+    return InvalidArgument("Invalid element type string: \"%s\".",
+                           name.c_str());
+  }
+  return found->second;
+}
+
 }  // namespace
 
 /* static */ string ShapeUtil::HumanStringWithLayout(const Shape& shape) {
@@ -500,17 +536,10 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
                         comma_list_to_int64s(dimensions_string));
 
     // Extract the primitive element type.
-    PrimitiveType primitive_type = PRIMITIVE_TYPE_INVALID;
-    for (PrimitiveType i =
-             static_cast<PrimitiveType>(PRIMITIVE_TYPE_INVALID + 1);
-         i < TUPLE; i = static_cast<PrimitiveType>(i + 1)) {
-      if (tensorflow::str_util::Lowercase(PrimitiveType_Name(i)) ==
-          element_type_string) {
-        primitive_type = i;
-        break;
-      }
-    }
-    if (primitive_type == PRIMITIVE_TYPE_INVALID) {
+    TF_ASSIGN_OR_RETURN(const PrimitiveType primitive_type,
+                        StringToPrimitiveType(element_type_string));
+    if (primitive_type == PRIMITIVE_TYPE_INVALID || primitive_type == TUPLE ||
+        primitive_type == OPAQUE) {
       return InvalidArgument("Invalid element type string: \"%s\".",
                              element_type_string.c_str());
     }
@@ -553,6 +582,16 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
   return SameDimensions(lhs, rhs) && SameElementType(lhs, rhs);
 }
 
+/* static */ bool ShapeUtil::CompatibleIgnoringElementType(const Shape& lhs,
+                                                           const Shape& rhs) {
+  if (lhs.element_type() == TUPLE) {
+    return rhs.element_type() == TUPLE &&
+           ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(),
+                           CompatibleIgnoringElementType);
+  }
+  return SameDimensions(lhs, rhs);
+}
+
 /* static */ int64 ShapeUtil::GetDimension(const Shape& shape,
                                            int64 dimension_number) {
   return shape.dimensions(GetDimensionNumber(shape, dimension_number));
@@ -684,9 +723,9 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
   return LayoutUtil::ValidateLayoutInShape(shape);
 }
 
-/* static */ Shape ShapeUtil::ChangeElementType(const Shape& shape,
+/* static */ Shape ShapeUtil::ChangeElementType(const Shape& original,
                                                 PrimitiveType type) {
-  Shape new_shape = shape;
+  Shape new_shape = original;
   new_shape.set_element_type(type);
   return new_shape;
 }
@@ -853,7 +892,9 @@ Status ForEachMutableSubshapeHelper(
     new_shape.add_dimensions(dim);
   }
   if (shape.has_layout()) {
+    CHECK(LayoutUtil::IsDense(shape));
     Layout* new_layout = new_shape.mutable_layout();
+    new_layout->set_format(DENSE);
     new_layout->clear_minor_to_major();
     for (auto index : Permute(permutation, shape.layout().minor_to_major())) {
       new_layout->add_minor_to_major(index);
@@ -1280,6 +1321,7 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
   shape.mutable_dimensions()->erase(shape.dimensions().begin() + dim_to_delete);
   if (LayoutUtil::HasLayout(shape)) {
     Layout* layout = shape.mutable_layout();
+    layout->set_format(DENSE);
     for (size_t i = 0; i < layout->minor_to_major().size();) {
       if (layout->minor_to_major(i) == dim_to_delete) {
         layout->mutable_minor_to_major()->erase(
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index 82a513a65ad62904e595b650cc02dcf3e8451958..301247d61c5e1ecd428b061594c042ab35a3364e 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <initializer_list>
 #include <string>
 
+#include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -170,7 +171,7 @@ class ShapeUtil {
   // As above, but for program shapes, returns a string for the form:
   //
   // (param_name: f32[42x12], ...) -> f32[24x42]
-  static string HumanString(const ProgramShape& shape);
+  static string HumanString(const ProgramShape& program_shape);
 
   // Parses a ShapeUtil::HumanString-format shape string back into a shape
   // object.
@@ -190,6 +191,11 @@ class ShapeUtil {
   // compatibility.
   static bool Compatible(const Shape& lhs, const Shape& rhs);
 
+  // Returns true if the rank and dimension sizes are identical. Element type
+  // and layout are ignored. Tuple elements are compared recursively for
+  // compatibility.
+  static bool CompatibleIgnoringElementType(const Shape& lhs, const Shape& rhs);
+
   // Returns whether the lhs and rhs shapes are identical protobufs.
   static bool Equal(const Shape& lhs, const Shape& rhs);
 
@@ -319,7 +325,8 @@ class ShapeUtil {
     return shape.element_type() == OPAQUE;
   }
 
-  // Returns whether the shape is an array.
+  // Returns whether the shape is an array.  Note that scalars are considered
+  // arrays.
   static bool IsArray(const Shape& shape) {
     return !IsTuple(shape) && !IsOpaque(shape);
   }
@@ -346,6 +353,10 @@ class ShapeUtil {
   // shape. E.g. a tuple like (f32, s32, u32) would slice via 1,3 to (s32, u32).
   static Shape SliceTuple(const Shape& tuple, int64 start, int64 limit);
 
+  // Returns the shape of the real/imaginary components of the given complex
+  // shape.
+  static Shape ComplexComponentShape(const Shape& complex_shape);
+
   // Shorthand for testing whether a shape is of a given element type and
   // sequence of dimensions.
   //
@@ -497,8 +508,7 @@ class ShapeUtil {
     CHECK_EQ(Rank(shape), base.size());
     CHECK_EQ(incr.size(), base.size());
     CHECK_EQ(count.size(), base.size());
-    const Layout& layout = shape.layout();
-    const int64 rank = layout.minor_to_major_size();
+    const int64 rank = LayoutUtil::MinorToMajor(shape).size();
     // Allows handling R0 arrays, such that the visitor function will be called
     // once with the proper empty indexes.
     int64 n = -1;
@@ -506,7 +516,7 @@ class ShapeUtil {
     while (n < rank && visitor_function(indexes)) {
       // Increments dimensions in minor to major order.
       for (n = 0; n < rank; ++n) {
-        int64 dim = layout.minor_to_major(n);
+        int64 dim = LayoutUtil::Minor(shape.layout(), n);
         indexes[dim] += incr[dim];
         if (indexes[dim] < base[dim] + count[dim]) {
           break;
diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc
index 0ba542ad1bec290c35c52a8dd5177893770310fd..3be6d6c4299aff62582c1b9fdc46fb78712f95c8 100644
--- a/tensorflow/compiler/xla/shape_util_test.cc
+++ b/tensorflow/compiler/xla/shape_util_test.cc
@@ -145,6 +145,7 @@ TEST(ShapeUtilTest, IncompatibleTuplesWithSwappedElements) {
   Shape tuple2 = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F32, {3, 2}), ShapeUtil::MakeShape(PRED, {4, 5})});
   EXPECT_FALSE(ShapeUtil::Compatible(tuple1, tuple2));
+  EXPECT_FALSE(ShapeUtil::CompatibleIgnoringElementType(tuple1, tuple2));
 }
 
 TEST(ShapeUtilTest, IncompatibleTuplesWithDifferentPrimitiveType) {
@@ -153,6 +154,7 @@ TEST(ShapeUtilTest, IncompatibleTuplesWithDifferentPrimitiveType) {
   Shape tuple2 = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(PRED, {4, 5}), ShapeUtil::MakeShape(S32, {3, 2})});
   EXPECT_FALSE(ShapeUtil::Compatible(tuple1, tuple2));
+  EXPECT_TRUE(ShapeUtil::CompatibleIgnoringElementType(tuple1, tuple2));
 }
 
 TEST(ShapeUtilTest, IncompatibleTuplesWithDifferentDimensions) {
@@ -163,20 +165,6 @@ TEST(ShapeUtilTest, IncompatibleTuplesWithDifferentDimensions) {
   EXPECT_FALSE(ShapeUtil::Compatible(tuple1, tuple2));
 }
 
-TEST(ShapeUtilTest, EmptyLayoutEqualsMissingLayout) {
-  // A shape with a missing layout should be equal to a shape with an empty
-  // layout.
-  Shape scalar1 = ShapeUtil::MakeShape(F32, {});
-  Shape scalar2 = ShapeUtil::MakeShape(F32, {});
-
-  EXPECT_TRUE(ShapeUtil::Equal(scalar1, scalar2));
-
-  scalar1.clear_layout();    // Remove layout field.
-  scalar2.mutable_layout();  // Create empty layout field.
-
-  EXPECT_TRUE(ShapeUtil::Equal(scalar1, scalar2));
-}
-
 TEST(ShapeUtilTest, CompareShapesWithPaddedDimensionsMismatch) {
   Shape shape1 = ShapeUtil::MakeShape(F32, {20, 30});
   shape1.mutable_layout()->add_padded_dimensions(10);
@@ -197,17 +185,17 @@ TEST(ShapeUtilTest, CompareShapesWithPaddingValueMismatch) {
   EXPECT_FALSE(ShapeUtil::Equal(shape1, shape2));
 }
 
-TEST(ShapeUtilTest, ScalarUnpopulatedLayoutEqualsScalarLayout) {
-  Shape scalar_unpopulated = ShapeUtil::MakeShape(F32, {});
-  scalar_unpopulated.clear_layout();
-  ASSERT_FALSE(scalar_unpopulated.has_layout())
-      << ShapeUtil::HumanStringWithLayout(scalar_unpopulated);
+TEST(ShapeUtilTest, ScalarDefaultLayoutEqualsScalarEmptyMin2Maj) {
+  Shape scalar_default_layout = ShapeUtil::MakeShape(F32, {});
+  ASSERT_TRUE(scalar_default_layout.has_layout())
+      << ShapeUtil::HumanStringWithLayout(scalar_default_layout);
 
-  const Shape scalar_populated = ShapeUtil::MakeShapeWithLayout(F32, {}, {});
-  ASSERT_TRUE(scalar_populated.has_layout())
-      << ShapeUtil::HumanStringWithLayout(scalar_populated);
+  const Shape scalar_empty_min2maj =
+      ShapeUtil::MakeShapeWithLayout(F32, {}, {});
+  ASSERT_TRUE(scalar_empty_min2maj.has_layout())
+      << ShapeUtil::HumanStringWithLayout(scalar_empty_min2maj);
 
-  EXPECT_TRUE(ShapeUtil::Equal(scalar_unpopulated, scalar_populated));
+  EXPECT_TRUE(ShapeUtil::Equal(scalar_default_layout, scalar_empty_min2maj));
 }
 
 TEST(ShapeUtilTest, ByteSizeOfWithoutPadding) {
diff --git a/tensorflow/compiler/xla/statusor_test.cc b/tensorflow/compiler/xla/statusor_test.cc
index 5fa2211ac66177514ac8ecabfa8791e7c8c014a2..f9d25945bc617507735fb6c4d011c39723497f69 100644
--- a/tensorflow/compiler/xla/statusor_test.cc
+++ b/tensorflow/compiler/xla/statusor_test.cc
@@ -32,26 +32,26 @@ namespace {
 class Base1 {
  public:
   virtual ~Base1() {}
-  int pad;
+  int pad_;
 };
 
 class Base2 {
  public:
   virtual ~Base2() {}
-  int yetotherpad;
+  int yetotherpad_;
 };
 
 class Derived : public Base1, public Base2 {
  public:
   ~Derived() override {}
-  int evenmorepad;
+  int evenmorepad_;
 };
 
 class CopyNoAssign {
  public:
-  explicit CopyNoAssign(int value) : foo(value) {}
-  CopyNoAssign(const CopyNoAssign& other) : foo(other.foo) {}
-  int foo;
+  explicit CopyNoAssign(int value) : foo_(value) {}
+  CopyNoAssign(const CopyNoAssign& other) : foo_(other.foo_) {}
+  int foo_;
 
  private:
   const CopyNoAssign& operator=(const CopyNoAssign&);
@@ -253,7 +253,7 @@ TEST(StatusOr, TestCopyCtorNonAssignable) {
   StatusOr<CopyNoAssign> original(value);
   StatusOr<CopyNoAssign> copy(original);
   EXPECT_EQ(copy.status(), original.status());
-  EXPECT_EQ(original.ValueOrDie().foo, copy.ValueOrDie().foo);
+  EXPECT_EQ(original.ValueOrDie().foo_, copy.ValueOrDie().foo_);
 }
 
 TEST(StatusOr, TestCopyCtorStatusOKConverting) {
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index f3885e90214e8ea77d26e5ae250fc5821267826b..d8c0584d10c854ff46c6ce65c37a8ec92e02d6cf 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -69,6 +69,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_dataflow_analysis",
         "//tensorflow/compiler/xla/service:hlo_verifier",
         "//tensorflow/compiler/xla/service:transfer_manager",
         "//tensorflow/core:lib",
@@ -104,7 +105,9 @@ cc_library(
     hdrs = ["hlo_test_base.h"],
     deps = [
         ":literal_test_util",
+        ":test_utils",
         "//tensorflow/compiler/xla:shape_layout",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
@@ -114,6 +117,10 @@ cc_library(
         "//tensorflow/compiler/xla/service:computation_layout",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_runner",
+        "//tensorflow/compiler/xla/service:hlo_verifier",
+        "//tensorflow/compiler/xla/service:interpreter_plugin",  # reference backend
+        "//tensorflow/compiler/xla/service:platform_util",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
@@ -354,6 +361,7 @@ xla_test(
 xla_test(
     name = "map_test",
     srcs = ["map_test.cc"],
+    tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal_util",
@@ -431,6 +439,28 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "conditional_test",
+    srcs = ["conditional_test.cc"],
+    # Currently, Conditional is supported only in CPU and GPU backends.
+    backends = [
+        "cpu",
+        "gpu",
+        "cpu_parallel",
+    ],
+    deps = [
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 xla_test(
     name = "unary_op_test",
     srcs = ["unary_op_test.cc"],
@@ -512,6 +542,7 @@ xla_test(
     name = "array_elementwise_ops_test",
     srcs = ["array_elementwise_ops_test.cc"],
     shard_count = 25,
+    tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -770,6 +801,41 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "bfloat16_test",
+    srcs = ["bfloat16_test.cc"],
+    blacklisted_backends = [
+        "gpu",
+    ],
+    shard_count = 40,
+    deps = [
+        ":test_utils",
+        "//tensorflow/compiler/xla:array2d",
+        "//tensorflow/compiler/xla:array4d",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:reference_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:computation",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 xla_test(
     name = "slice_test",
     srcs = ["slice_test.cc"],
@@ -1230,6 +1296,23 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "bitcast_convert_test",
+    srcs = ["bitcast_convert_test.cc"],
+    deps = [
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/core:test",
+    ],
+)
+
 xla_test(
     name = "compilation_cache_test",
     srcs = ["compilation_cache_test.cc"],
@@ -1294,6 +1377,7 @@ xla_test(
     srcs = ["client_test.cc"],
     deps = [
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -1621,6 +1705,45 @@ xla_test(
     ],
 )
 
+# A demo of textual IR based test.
+xla_test(
+    name = "sample_text_test",
+    srcs = ["sample_text_test.cc"],
+    # You can leave this empty if you want to test all supported backends.
+    backends = [
+        "cpu",
+        "gpu",
+    ],
+    deps = [
+        ":hlo_test_base",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+    ],
+)
+
+# A demo of test that loads an hlo module from a file and compares results on gpu and cpu.
+tf_cc_test(
+    name = "sample_file_test",
+    srcs = ["sample_file_test.cc"],
+    data = ["isolated_convolution.hlo"],
+    tags = ["requires-gpu-sm35"],
+    deps = [
+        ":hlo_test_base",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla/service:cpu_plugin",  # reference backend
+        "//tensorflow/compiler/xla/service:gpu_plugin",  # test backend
+        "//tensorflow/compiler/xla/service:platform_util",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 
 filegroup(
diff --git a/tensorflow/compiler/xla/tests/batch_normalization_test.cc b/tensorflow/compiler/xla/tests/batch_normalization_test.cc
index 028d1251b455b82a291c236f7866e52e27d3590e..7525bc4bdfbaa942ea8af29af31829ae8742e833 100644
--- a/tensorflow/compiler/xla/tests/batch_normalization_test.cc
+++ b/tensorflow/compiler/xla/tests/batch_normalization_test.cc
@@ -39,6 +39,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -203,6 +204,15 @@ struct BatchNormTestParam {
   int64 feature_index;
   float random_value_mean;
   float random_value_var;
+
+  friend ::std::ostream& operator<<(::std::ostream& os,
+                                    const BatchNormTestParam& p) {
+    os << "bounds={" << tensorflow::str_util::Join(p.bounds, ", ") << "}, ";
+    os << "feature_index=" << p.feature_index << ", ";
+    os << "random_value_mean=" << p.random_value_mean << ", ";
+    os << "random_value_var=" << p.random_value_var;
+    return os;
+  }
 };
 
 // Tests to test the fused operation of BatchNorm.
diff --git a/tensorflow/compiler/xla/tests/bfloat16_test.cc b/tensorflow/compiler/xla/tests/bfloat16_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ac3f3f4c9ddb03d003a44f5abd7a2e26c42f490d
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/bfloat16_test.cc
@@ -0,0 +1,160 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cmath>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/array4d.h"
+#include "tensorflow/compiler/xla/client/computation.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/reference_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/tests/test_utils.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+
+class Bfloat16Test : public ClientLibraryTestBase {
+ protected:
+  const ErrorSpec error_spec_{0.001, 0.001};
+};
+
+XLA_TEST_F(Bfloat16Test, ScalarOperation) {
+  ComputationBuilder builder(client_, TestName());
+  auto x = builder.ConstantR0<bfloat16>(static_cast<bfloat16>(2.0f));
+  auto y = builder.ConstantR0<bfloat16>(static_cast<bfloat16>(1.0f));
+  builder.Add(x, y);
+
+  ComputeAndCompareR0<bfloat16>(&builder, static_cast<bfloat16>(3.0f), {},
+                                error_spec_);
+}
+
+XLA_TEST_F(Bfloat16Test, LogOperation) {
+  ComputationBuilder builder(client_, TestName());
+  auto x = builder.ConstantR0<bfloat16>(static_cast<bfloat16>(4.0f));
+  builder.Log(x);
+
+  ComputeAndCompareR0<bfloat16>(&builder, static_cast<bfloat16>(1.387f), {},
+                                error_spec_);
+}
+
+XLA_TEST_F(Bfloat16Test, NegateScalarF16) {
+  ComputationBuilder builder(client_, TestName());
+  builder.Neg(builder.ConstantR0<bfloat16>(static_cast<bfloat16>(2.1f)));
+
+  ComputeAndCompareR0<bfloat16>(&builder, static_cast<bfloat16>(-2.1f), {},
+                                error_spec_);
+}
+
+XLA_TEST_F(Bfloat16Test, BatchNormTraining) {
+  const int kFeatureIndex = 2;
+  ComputationBuilder builder(client_, TestName());
+
+  auto operand = builder.ConstantR4FromArray4D<bfloat16>(
+      {{{{static_cast<bfloat16>(1.f)}, {static_cast<bfloat16>(2.f)}},
+        {{static_cast<bfloat16>(3.f)}, {static_cast<bfloat16>(4.f)}}},
+       {{{static_cast<bfloat16>(5.f)}, {static_cast<bfloat16>(6.f)}},
+        {{static_cast<bfloat16>(7.f)}, {static_cast<bfloat16>(8.f)}}}});
+
+  auto scale = builder.ConstantR1<bfloat16>(
+      {static_cast<bfloat16>(2.0f), static_cast<bfloat16>(3.0f)});
+
+  auto offset = builder.ConstantR1<bfloat16>(
+      {static_cast<bfloat16>(1.0f), static_cast<bfloat16>(2.0f)});
+
+  auto tuple = builder.BatchNormTraining(operand, scale, offset,
+                                         /*epsilon=*/0.001, kFeatureIndex);
+
+  auto expected = *Literal::MakeTuple(
+      {Literal::CreateR4<bfloat16>(
+           {{{{static_cast<bfloat16>(-1.7f)}, {static_cast<bfloat16>(-2.04f)}},
+             {{static_cast<bfloat16>(0.105f)}, {static_cast<bfloat16>(0.65f)}}},
+            {{{static_cast<bfloat16>(1.89f)}, {static_cast<bfloat16>(3.35f)}},
+             {{static_cast<bfloat16>(3.7f)}, {static_cast<bfloat16>(6.04f)}}}})
+           .get(),
+       Literal::CreateR1<bfloat16>(
+           {static_cast<bfloat16>(4), static_cast<bfloat16>(5)})
+           .get(),
+       Literal::CreateR1<bfloat16>(
+           {static_cast<bfloat16>(5), static_cast<bfloat16>(5)})
+           .get()});
+
+  ComputeAndCompareTuple(&builder, expected, {}, ErrorSpec(0.01));
+}
+
+XLA_TEST_F(Bfloat16Test, BatchNormGrad) {
+  const int kFeatureIndex = 2;
+  ComputationBuilder builder(client_, TestName());
+
+  auto operand = builder.ConstantR4FromArray4D<bfloat16>(
+      Array4D<bfloat16>(2, 2, 2, 1, static_cast<bfloat16>(0.0f)));
+
+  auto scale = builder.ConstantR1<bfloat16>(
+      {static_cast<bfloat16>(1.0f), static_cast<bfloat16>(1.0f)});
+
+  auto mean = builder.ConstantR1<bfloat16>(
+      {static_cast<bfloat16>(0.0f), static_cast<bfloat16>(0.0f)});
+
+  auto var = builder.ConstantR1<bfloat16>(
+      {static_cast<bfloat16>(1.0f), static_cast<bfloat16>(1.0f)});
+
+  auto grad_output = builder.ConstantR4FromArray4D<bfloat16>(
+      {{{{static_cast<bfloat16>(1.f)}, {static_cast<bfloat16>(2.f)}},
+        {{static_cast<bfloat16>(3.f)}, {static_cast<bfloat16>(4.f)}}},
+       {{{static_cast<bfloat16>(5.f)}, {static_cast<bfloat16>(6.f)}},
+        {{static_cast<bfloat16>(7.f)}, {static_cast<bfloat16>(8.f)}}}});
+
+  builder.BatchNormGrad(operand, scale, mean, var, grad_output,
+                        /*epsilon=*/0.0, kFeatureIndex);
+
+  auto expected = *Literal::MakeTuple(
+      {Literal::CreateR4<bfloat16>(
+           {{{{static_cast<bfloat16>(-3.f)}, {static_cast<bfloat16>(-3.f)}},
+             {{static_cast<bfloat16>(-1.f)}, {static_cast<bfloat16>(-1.f)}}},
+            {{{static_cast<bfloat16>(1.f)}, {static_cast<bfloat16>(1.f)}},
+             {{static_cast<bfloat16>(3.f)}, {static_cast<bfloat16>(3.f)}}}})
+           .get(),
+       Literal::CreateR1<bfloat16>(
+           {static_cast<bfloat16>(0), static_cast<bfloat16>(0)})
+           .get(),
+       Literal::CreateR1<bfloat16>(
+           {static_cast<bfloat16>(16), static_cast<bfloat16>(20)})
+           .get()});
+
+  ComputeAndCompareTuple(&builder, expected, {}, ErrorSpec(0.01));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/bitcast_convert_test.cc b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0d94d65c1015fb54ada3fdfc95d0c31d0a0f158b
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
@@ -0,0 +1,141 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+
+class BitcastConvertTest : public ClientLibraryTestBase {
+ public:
+  explicit BitcastConvertTest(perftools::gputools::Platform* platform = nullptr)
+      : ClientLibraryTestBase(platform) {
+    mutable_debug_options()->add_xla_disable_hlo_passes("algsimp");
+    mutable_debug_options()->add_xla_disable_hlo_passes("inline");
+  }
+};
+
+TEST_F(BitcastConvertTest, ConvertR1S32ToR1S32) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<int32>({42, 64});
+  builder.BitcastConvertType(a, S32);
+
+  std::vector<int32> expected = {42, 64};
+  ComputeAndCompareR1<int32>(&builder, expected, {});
+}
+
+TEST_F(BitcastConvertTest, ConvertR1F32ToR1F32) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<float>({42.0f, 64.0f});
+  builder.BitcastConvertType(a, F32);
+
+  std::vector<float> expected = {42.0f, 64.0f};
+  ComputeAndCompareR1<float>(&builder, expected, {});
+}
+
+TEST_F(BitcastConvertTest, BitcastR1S32ToR1F32) {
+  ComputationBuilder builder(client_, TestName());
+  auto a =
+      builder.ConstantR1<int32>({0, static_cast<int32>(0x80000000), 0x3F800000,
+                                 static_cast<int32>(0xBF800000), 0x3F000000,
+                                 static_cast<int32>(0xBF000000)});
+  builder.BitcastConvertType(a, F32);
+
+  std::vector<float> expected = {0.0f, -0.0f, 1.0f, -1.0f, 0.5f, -0.5f};
+  ComputeAndCompareR1<float>(&builder, expected, {});
+}
+
+XLA_TEST_F(BitcastConvertTest, ConvertR1S0S32ToR1S0F32) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<int32>({});
+  builder.BitcastConvertType(a, F32);
+
+  std::vector<float> expected = {};
+  ComputeAndCompareR1<float>(&builder, expected, {});
+}
+
+TEST_F(BitcastConvertTest, ConvertR1F32ToR1S32) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<float>({42.6, 64.4});
+  builder.BitcastConvertType(a, S32);
+
+  std::vector<int32> expected = {0x422a6666, 0x4280cccd};
+  ComputeAndCompareR1<int32>(&builder, expected, {});
+}
+
+TEST_F(BitcastConvertTest, ConvertS32Extremes) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<int32>(
+      {std::numeric_limits<int32>::min(), std::numeric_limits<int32>::max()});
+  builder.BitcastConvertType(a, F32);
+
+  std::vector<float> expected = {-0.0f, NAN};
+  ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0, 0));
+}
+
+TEST_F(BitcastConvertTest, ConvertMapToS32) {
+  ComputationBuilder builder(client_, TestName());
+  auto b = builder.CreateSubBuilder("convert");
+  auto param = b->Parameter(0, ShapeUtil::MakeShape(F32, {}), "in");
+  b->BitcastConvertType(param, S32);
+  auto a = builder.ConstantR1<float>({42.0f, 64.0f});
+  builder.Map({a}, b->BuildAndNoteError(), {0});
+
+  std::vector<int32> expected = {0x42280000, 0x42800000};
+  ComputeAndCompareR1<int32>(&builder, expected, {});
+}
+
+TEST_F(BitcastConvertTest, ConvertMapToF32) {
+  ComputationBuilder builder(client_, TestName());
+  auto b = builder.CreateSubBuilder("convert");
+  auto param = b->Parameter(0, ShapeUtil::MakeShape(S32, {}), "in");
+  b->BitcastConvertType(param, F32);
+  auto a = builder.ConstantR1<int32>({0x42280000, 0x42800000});
+  builder.Map({a}, b->BuildAndNoteError(), {0});
+
+  std::vector<float> expected = {42.0f, 64.0f};
+  ComputeAndCompareR1<float>(&builder, expected, {});
+}
+
+// Regression test for b/31758660. When ReshapeMover transforms
+//   input -> reshape -> convert
+// to
+//   input -> convert -> reshape
+// the new convert should have the same element type as the old convert.
+TEST_F(BitcastConvertTest, ConvertReshape) {
+  ComputationBuilder builder(client_, TestName());
+  auto input = builder.ConstantR1<int32>({0x42280000});
+  auto reshape = builder.Reshape(input, /*dimensions=*/{0}, /*new_sizes=*/{});
+  builder.BitcastConvertType(reshape, F32);
+
+  ComputeAndCompareR0<float>(&builder, 42.0f, {});
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/build_defs.bzl b/tensorflow/compiler/xla/tests/build_defs.bzl
index f594c609db6282513a27a479a85e6a3dd1a7a3cd..610302ac1256a57db6ed6e18016a4136973e3891 100644
--- a/tensorflow/compiler/xla/tests/build_defs.bzl
+++ b/tensorflow/compiler/xla/tests/build_defs.bzl
@@ -29,6 +29,7 @@ def xla_test(name,
              deps,
              xla_test_library_deps=[],
              backends=[],
+             blacklisted_backends=[],
              args=[],
              tags=[],
              copts=[],
@@ -92,17 +93,24 @@ def xla_test(name,
     backends: A list of backends to generate tests for. Supported
       values: "cpu", "cpu_parallel", "gpu". If this list is empty, the test will
       be generated for all supported backends.
+    blacklisted_backends: A list of backends to NOT generate tests for.
     args: Test arguments for the target.
     tags: Tags for the target.
-    backend_args: A dict mapping backend name to list of additional args to
-      use for that target.
+    copts: Additional copts to pass to the build.
+    data: Additional data to pass to the build.
     backend_tags: A dict mapping backend name to list of additional tags to
       use for that target.
+    backend_args: A dict mapping backend name to list of additional args to
+      use for that target.
+    **kwargs: Additional keyword arguments to pass to native.cc_test.
   """
   test_names = []
   if not backends:
     backends = all_backends
 
+  backends = [backend for backend in backends
+              if backend not in blacklisted_backends]
+
   native.cc_library(
       name="%s_lib" % name,
       srcs=srcs,
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index ef54714e46ffe6f22f26410c33fa62c2d528f280..50bf185936808fbd9c49f7fbd5ab0c0b4a76504b 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -262,20 +262,39 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
                  expected.shape().element_type() == PRED)
         << ShapeUtil::HumanString(expected.shape());
   }
+  // We allow using a float expected literal for a bfloat16 output. In this
+  // case, we need to convert the expected literal to bfloat16.
+  const Literal* expected_ptr = &expected;
+  std::unique_ptr<Literal> converted_expected;
+  Shape layout_shape;
+  if (use_bfloat16_) {
+    converted_expected = LiteralTestUtil::ConvertF32ToBF16(expected);
+    expected_ptr = converted_expected.get();
+    if (shape_with_layout != nullptr) {
+      layout_shape = *shape_with_layout;
+      ShapeUtil::ForEachMutableSubshape(
+          &layout_shape, [&](Shape* subshape, const ShapeIndex& /*index*/) {
+            if (subshape->element_type() == F32) {
+              subshape->set_element_type(BF16);
+            }
+          });
+      shape_with_layout = &layout_shape;
+    }
+  }
   auto expect_equal = [&](const Literal& actual, const string& error_message) {
-    LiteralTestUtil::ExpectEqual(expected, actual, error_message);
+    LiteralTestUtil::ExpectEqual(*expected_ptr, actual, error_message);
   };
   if (execution_options_.debug_options().xla_test_all_output_layouts()) {
     return ComputeAndCompareLiteralWithAllOutputLayouts(
-        computation, expected, arguments, expect_equal);
+        computation, *expected_ptr, arguments, expect_equal);
   }
   if (execution_options_.debug_options().xla_test_all_input_layouts()) {
     return ComputeAndCompareLiteralWithAllInputLayouts(
-        computation, expected, arguments, expect_equal, shape_with_layout);
+        computation, *expected_ptr, arguments, expect_equal, shape_with_layout);
   }
   TF_ASSIGN_OR_RETURN(auto actual, ExecuteAndTransfer(computation, arguments,
                                                       shape_with_layout));
-  LiteralTestUtil::ExpectEqual(expected, *actual);
+  LiteralTestUtil::ExpectEqual(*expected_ptr, *actual);
   return tensorflow::Status::OK();
 }
 
@@ -286,20 +305,39 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
   TF_RET_CHECK(ShapeUtil::ElementIsFloating(expected.shape()) ||
                ShapeUtil::ElementIsComplex(expected.shape()));
   TF_ASSIGN_OR_RETURN(auto computation, builder->Build());
+  // We allow using a float expected literal for a bfloat16 output. In this
+  // case, we need to convert the expected literal to bfloat16.
+  const Literal* expected_ptr = &expected;
+  std::unique_ptr<Literal> converted_expected;
+  Shape layout_shape;
+  if (use_bfloat16_) {
+    converted_expected = LiteralTestUtil::ConvertF32ToBF16(expected);
+    expected_ptr = converted_expected.get();
+    if (shape_with_layout != nullptr) {
+      layout_shape = *shape_with_layout;
+      ShapeUtil::ForEachMutableSubshape(
+          &layout_shape, [&](Shape* subshape, const ShapeIndex& /*index*/) {
+            if (subshape->element_type() == F32) {
+              subshape->set_element_type(BF16);
+            }
+          });
+      shape_with_layout = &layout_shape;
+    }
+  }
   auto expect_near = [&](const Literal& actual, const string& error_message) {
-    LiteralTestUtil::ExpectNear(expected, actual, error, error_message);
+    LiteralTestUtil::ExpectNear(*expected_ptr, actual, error, error_message);
   };
   if (execution_options_.debug_options().xla_test_all_output_layouts()) {
-    return ComputeAndCompareLiteralWithAllOutputLayouts(computation, expected,
-                                                        arguments, expect_near);
+    return ComputeAndCompareLiteralWithAllOutputLayouts(
+        computation, *expected_ptr, arguments, expect_near);
   }
   if (execution_options_.debug_options().xla_test_all_input_layouts()) {
     return ComputeAndCompareLiteralWithAllInputLayouts(
-        computation, expected, arguments, expect_near, shape_with_layout);
+        computation, *expected_ptr, arguments, expect_near, shape_with_layout);
   }
   TF_ASSIGN_OR_RETURN(auto actual, ExecuteAndTransfer(computation, arguments,
                                                       shape_with_layout));
-  LiteralTestUtil::ExpectNear(expected, *actual, error);
+  LiteralTestUtil::ExpectNear(*expected_ptr, *actual, error);
   return tensorflow::Status::OK();
 }
 
@@ -402,8 +440,11 @@ ClientLibraryTestBase::ComputeValueAndReference(
 
 Computation ClientLibraryTestBase::CreateScalarRelu() {
   ComputationBuilder builder(client_, "relu");
-  auto z_value = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "z_value");
-  auto zero = builder.ConstantR0<float>(0.0);
+  auto shape = ShapeUtil::MakeShape(use_bfloat16_ ? BF16 : F32, {});
+  auto z_value = builder.Parameter(0, shape, "z_value");
+  auto zero = use_bfloat16_
+                  ? builder.ConstantR0<bfloat16>(static_cast<bfloat16>(0.0f))
+                  : builder.ConstantR0<float>(0.0f);
   builder.Max(z_value, zero);
   auto computation_status = builder.Build();
   TF_CHECK_OK(computation_status.status());
@@ -412,8 +453,9 @@ Computation ClientLibraryTestBase::CreateScalarRelu() {
 
 Computation ClientLibraryTestBase::CreateScalarMax() {
   ComputationBuilder builder(client_, "max");
-  auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
-  auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
+  auto shape = ShapeUtil::MakeShape(use_bfloat16_ ? BF16 : F32, {});
+  auto x = builder.Parameter(0, shape, "x");
+  auto y = builder.Parameter(1, shape, "y");
   builder.Max(x, y);
   auto computation_status = builder.Build();
   TF_CHECK_OK(computation_status.status());
@@ -422,11 +464,12 @@ Computation ClientLibraryTestBase::CreateScalarMax() {
 
 Computation ClientLibraryTestBase::CreateScalarReluSensitivity() {
   ComputationBuilder builder(client_, "relu_sensitivity");
-  auto activation =
-      builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "activation");
-  auto backprop =
-      builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "backprop");
-  auto zero = builder.ConstantR0<float>(0.0);
+  auto shape = ShapeUtil::MakeShape(use_bfloat16_ ? BF16 : F32, {});
+  auto activation = builder.Parameter(0, shape, "activation");
+  auto backprop = builder.Parameter(1, shape, "backprop");
+  auto zero = use_bfloat16_
+                  ? builder.ConstantR0<bfloat16>(static_cast<bfloat16>(0.0f))
+                  : builder.ConstantR0<float>(0.0f);
   auto activation_gtz = builder.Gt(activation, zero);
   builder.Select(activation_gtz, /*on_true=*/backprop, /*on_false=*/zero);
 
@@ -461,4 +504,27 @@ ClientLibraryTestBase::CreatePatternedMatrixWithZeroPadding(int rows, int cols,
   return array;
 }
 
+std::unique_ptr<GlobalData>
+ClientLibraryTestBase::CreateParameterAndTransferLiteral(
+    int64 parameter_number, const Literal& literal, const string& name,
+    ComputationBuilder* builder, ComputationDataHandle* data_handle) {
+  const Literal* param_literal = &literal;
+  std::unique_ptr<Literal> converted_literal;
+  if (use_bfloat16_) {
+    converted_literal = LiteralTestUtil::ConvertF32ToBF16(literal);
+    param_literal = converted_literal.get();
+  }
+  std::unique_ptr<GlobalData> data =
+      client_->TransferToServer(*param_literal).ConsumeValueOrDie();
+  *data_handle =
+      builder->Parameter(parameter_number, param_literal->shape(), name);
+  return data;
+}
+
+ComputationDataHandle ClientLibraryTestBase::CreateConstantFromLiteral(
+    const Literal& literal, ComputationBuilder* builder) {
+  return builder->ConstantLiteral(
+      use_bfloat16_ ? *LiteralTestUtil::ConvertF32ToBF16(literal) : literal);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index 1dc274c59172313bcc1b6e5e7029657c3fea937f..4d0cf8bf71cf22d7c046bb22754a8d4e299ed9db 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -194,7 +194,7 @@ class ClientLibraryTestBase : public ::testing::Test {
       tensorflow::gtl::ArraySlice<GlobalData*> arguments);
   void ComputeAndCompareTuple(
       ComputationBuilder* builder, const Literal& expected,
-      tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec abs_error);
+      tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error);
 
   // Convenience method for running a built computation and comparing the result
   // with the HloEvaluator.
@@ -245,51 +245,102 @@ class ClientLibraryTestBase : public ::testing::Test {
       const int rows, const int cols, const int rows_padded,
       const int cols_padded);
 
-  // Create a parameter instruction that wraps a given value and then stores
+  // Creates a parameter instruction, transfers the literal for the parameter to
+  // server, then stores into "data_handle" the global handle for that
+  // parameter. When the use_bfloat16 flag is set but the literal has F32
+  // elements, the literal will be converted to BF16 before being transferred.
+  std::unique_ptr<GlobalData> CreateParameterAndTransferLiteral(
+      int64 parameter_number, const Literal& literal, const string& name,
+      ComputationBuilder* builder, ComputationDataHandle* data_handle);
+
+  // Creates a constant instruction with the given literal. When the
+  // use_bfloat16 flag is set but the literal has F32 elements, the elements
+  // will be converted to BF16s.
+  ComputationDataHandle CreateConstantFromLiteral(const Literal& literal,
+                                                  ComputationBuilder* builder);
+
+  // Creates a constant instruction with the given array. When the use_bfloat16
+  // flag is set but the array has float elements, the elements will be
+  // converted to bfloat16s.
+  template <typename NativeT>
+  ComputationDataHandle CreateConstantFromArray(const Array<NativeT>& array,
+                                                ComputationBuilder* builder) {
+    return CreateConstantFromLiteral(*Literal::CreateFromArray(array), builder);
+  }
+
+  // Same as CreateConstantFromArray, but for scalars.
+  template <typename NativeT>
+  ComputationDataHandle CreateConstantFromScalar(NativeT value,
+                                                 ComputationBuilder* builder) {
+    return CreateConstantFromLiteral(*Literal::CreateR0<NativeT>(value),
+                                     builder);
+  }
+
+  // Creates a parameter instruction that wraps a given value and then stores
   // into "data_handle" the global handle for that parameter.
   //
   // "parameter_number" is the parameter number.
   // "name" is the name of the parameter instruction.
+  //
+  // When the use_bfloat16 flag is set but NativeT is float, the data will be
+  // converted to bfloat16.
   template <typename NativeT>
   std::unique_ptr<GlobalData> CreateR0Parameter(
       NativeT value, int64 parameter_number, const string& name,
       ComputationBuilder* builder, ComputationDataHandle* data_handle);
 
-  // Create a parameter instruction that wraps the given values and then stores
+  // Creates a parameter instruction that wraps the given values and then stores
   // into "data_handle" the global handle for that parameter.
   //
   // "parameter_number" is the parameter number.
   // "name" is the name of the parameter instruction.
+  //
+  // When the use_bfloat16 flag is set but NativeT is float, the data will be
+  // converted to bfloat16.
   template <typename NativeT>
   std::unique_ptr<GlobalData> CreateR1Parameter(
       tensorflow::gtl::ArraySlice<NativeT> values, int64 parameter_number,
       const string& name, ComputationBuilder* builder,
       ComputationDataHandle* data_handle);
 
-  // Create a parameter instruction that wraps the given constant array
+  // Creates a parameter instruction that wraps the given constant array
   // "array_2d" and then stores to "data_handle" the global handle for that
   // parameter.
   //
   // "parameter_number" is the parameter number.
   // "name" is the name of the parameter instruction.
+  //
+  // When the use_bfloat16 flag is set but NativeT is float, the data will be
+  // converted to bfloat16.
   template <typename NativeT>
   std::unique_ptr<GlobalData> CreateR2Parameter(
       const Array2D<NativeT>& array_2d, int64 parameter_number,
       const string& name, ComputationBuilder* builder,
       ComputationDataHandle* data_handle);
 
-  // Create a parameter instruction that wraps the given constant array
+  // Creates a parameter instruction that wraps the given constant array
   // "array_3d" and then stores to "data_handle" the global handle for that
   // parameter.
   //
   // "parameter_number" is the parameter number.
   // "name" is the name of the parameter instruction.
+  //
+  // When the use_bfloat16 flag is set but NativeT is float, the data will be
+  // converted to bfloat16.
   template <typename NativeT>
   std::unique_ptr<GlobalData> CreateR3Parameter(
       const Array3D<NativeT>& array_3d, int64 parameter_number,
       const string& name, ComputationBuilder* builder,
       ComputationDataHandle* data_handle);
 
+  // Getter and setter for the use_bfloat16 flag, which indicates whether to run
+  // tests with all float-type input/output converted to bfloat16.
+  bool use_bfloat16() const { return use_bfloat16_; }
+  void set_use_bfloat16(bool value) { use_bfloat16_ = value; }
+
+  // The float type used in this test, BF16 or F32 according to use_bfloat16.
+  PrimitiveType FloatType() const { return use_bfloat16_ ? BF16 : F32; }
+
   Client* client_;
   ExecutionOptions execution_options_;
 
@@ -315,6 +366,10 @@ class ClientLibraryTestBase : public ::testing::Test {
   ComputeValueAndReference(ComputationBuilder* builder,
                            const ComputationDataHandle& operand,
                            tensorflow::gtl::ArraySlice<Literal> arguments);
+
+  // Whether to run tests with all float-type input/output converted to
+  // bfloat16.
+  bool use_bfloat16_ = false;
 };
 
 template <typename NativeT>
@@ -333,6 +388,7 @@ void ClientLibraryTestBase::ComputeAndCompareR0(
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error) {
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
+                    std::is_same<NativeT, bfloat16>::value ||
                     std::is_same<NativeT, complex64>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   std::unique_ptr<Literal> expected_literal =
@@ -357,6 +413,7 @@ void ClientLibraryTestBase::ComputeAndCompareR1(
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error) {
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
+                    std::is_same<NativeT, bfloat16>::value ||
                     std::is_same<NativeT, complex64>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   std::unique_ptr<Literal> expected_literal =
@@ -381,6 +438,7 @@ void ClientLibraryTestBase::ComputeAndCompareR2(
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error) {
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
+                    std::is_same<NativeT, bfloat16>::value ||
                     std::is_same<NativeT, complex64>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   std::unique_ptr<Literal> expected_literal =
@@ -405,6 +463,7 @@ void ClientLibraryTestBase::ComputeAndCompareR3(
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error) {
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
+                    std::is_same<NativeT, bfloat16>::value ||
                     std::is_same<NativeT, complex64>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   std::unique_ptr<Literal> expected_literal =
@@ -429,6 +488,7 @@ void ClientLibraryTestBase::ComputeAndCompareR4(
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error) {
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
+                    std::is_same<NativeT, bfloat16>::value ||
                     std::is_same<NativeT, complex64>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   std::unique_ptr<Literal> expected_literal =
@@ -442,6 +502,9 @@ std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR0Parameter(
     NativeT value, int64 parameter_number, const string& name,
     ComputationBuilder* builder, ComputationDataHandle* data_handle) {
   std::unique_ptr<Literal> literal = Literal::CreateR0(value);
+  if (use_bfloat16_ && literal->shape().element_type() == F32) {
+    literal = LiteralTestUtil::ConvertF32ToBF16(*literal);
+  }
   std::unique_ptr<GlobalData> data =
       client_->TransferToServer(*literal).ConsumeValueOrDie();
   *data_handle = builder->Parameter(parameter_number, literal->shape(), name);
@@ -454,6 +517,9 @@ std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR1Parameter(
     const string& name, ComputationBuilder* builder,
     ComputationDataHandle* data_handle) {
   std::unique_ptr<Literal> literal = Literal::CreateR1(values);
+  if (use_bfloat16_ && literal->shape().element_type() == F32) {
+    literal = LiteralTestUtil::ConvertF32ToBF16(*literal);
+  }
   std::unique_ptr<GlobalData> data =
       client_->TransferToServer(*literal).ConsumeValueOrDie();
   *data_handle = builder->Parameter(parameter_number, literal->shape(), name);
@@ -466,6 +532,9 @@ std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR2Parameter(
     const string& name, ComputationBuilder* builder,
     ComputationDataHandle* data_handle) {
   std::unique_ptr<Literal> literal = Literal::CreateR2FromArray2D(array_2d);
+  if (use_bfloat16_ && literal->shape().element_type() == F32) {
+    literal = LiteralTestUtil::ConvertF32ToBF16(*literal);
+  }
   std::unique_ptr<GlobalData> data =
       client_->TransferToServer(*literal).ConsumeValueOrDie();
   *data_handle = builder->Parameter(parameter_number, literal->shape(), name);
@@ -478,6 +547,9 @@ std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR3Parameter(
     const string& name, ComputationBuilder* builder,
     ComputationDataHandle* data_handle) {
   std::unique_ptr<Literal> literal = Literal::CreateR3FromArray3D(array_3d);
+  if (use_bfloat16_ && literal->shape().element_type() == F32) {
+    literal = LiteralTestUtil::ConvertF32ToBF16(*literal);
+  }
   std::unique_ptr<GlobalData> data =
       client_->TransferToServer(*literal).ConsumeValueOrDie();
   *data_handle = builder->Parameter(parameter_number, literal->shape(), name);
diff --git a/tensorflow/compiler/xla/tests/client_test.cc b/tensorflow/compiler/xla/tests/client_test.cc
index 183bcf1dd333a6955bcae6dd07d2ef31fe817434..8853ed9e5780672d4006c326291767b8b5253f56 100644
--- a/tensorflow/compiler/xla/tests/client_test.cc
+++ b/tensorflow/compiler/xla/tests/client_test.cc
@@ -20,10 +20,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/test.h"
@@ -42,26 +44,26 @@ TEST_F(ClientTest, ExecuteWithLayout) {
     for (const std::vector<int64>& transfer_layout : layouts) {
       b.Add(b.ConstantR2<int32>({{1, 2}, {3, 4}}),
             b.ConstantR2<int32>({{10, 20}, {30, 40}}));
-      auto computation = b.Build();
-      ASSERT_TRUE(computation.ok()) << computation.status();
+      TF_ASSERT_OK_AND_ASSIGN(auto computation, b.Build());
 
       ExecutionOptions execution_options = execution_options_;
       *execution_options.mutable_shape_with_output_layout() =
           ShapeUtil::MakeShapeWithLayout(S32, /*dimensions=*/{2, 2},
                                          execute_layout);
-      std::unique_ptr<GlobalData> data =
-          client_->Execute(computation.ValueOrDie(), {}, &execution_options)
-              .ConsumeValueOrDie();
+      TF_ASSERT_OK_AND_ASSIGN(
+          std::unique_ptr<GlobalData> data,
+          client_->Execute(computation, {}, &execution_options));
 
       std::unique_ptr<Literal> expected_literal =
           Literal::CreateR2WithLayout<int32>(
               {{11, 22}, {33, 44}}, LayoutUtil::MakeLayout(transfer_layout));
 
-      auto computed = client_->Transfer(*data, &expected_literal->shape());
+      TF_ASSERT_OK_AND_ASSIGN(
+          auto computed, client_->Transfer(*data, &expected_literal->shape()));
 
-      LiteralTestUtil::AssertEqualShapesAndLayouts(
-          expected_literal->shape(), computed.ValueOrDie()->shape());
-      LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
+      LiteralTestUtil::AssertEqualShapesAndLayouts(expected_literal->shape(),
+                                                   computed->shape());
+      LiteralTestUtil::ExpectEqual(*expected_literal, *computed);
     }
   }
 }
@@ -72,8 +74,7 @@ TEST_F(ClientTest, ExecuteWithTupleLayout) {
   b.Tuple({b.ConstantR2<int32>({{1, 2}, {3, 4}}),
            b.ConstantR2<int32>({{10, 20}, {30, 40}})});
 
-  auto computation = b.Build();
-  ASSERT_TRUE(computation.ok()) << computation.status();
+  TF_ASSERT_OK_AND_ASSIGN(auto computation, b.Build());
 
   ExecutionOptions execution_options = execution_options_;
   // Create a result shape with one element column major and the other row
@@ -85,10 +86,9 @@ TEST_F(ClientTest, ExecuteWithTupleLayout) {
            ShapeUtil::MakeShapeWithLayout(S32, /*dimensions=*/{2, 2},
                                           /*minor_to_major=*/{1, 0})});
 
-  auto result =
-      client_
-          ->ExecuteAndTransfer(computation.ValueOrDie(), {}, &execution_options)
-          .ConsumeValueOrDie();
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto result,
+      client_->ExecuteAndTransfer(computation, {}, &execution_options));
   LiteralTestUtil::ExpectR2Equal<int32>({{1, 2}, {3, 4}},
                                         result->tuple_literals(0));
   LiteralTestUtil::ExpectR2Equal<int32>({{10, 20}, {30, 40}},
@@ -107,5 +107,42 @@ TEST_F(ClientTest, ExecuteWithTupleLayout) {
                                      /*minor_to_major=*/{1, 0})));
 }
 
+TEST_F(ClientTest, DISABLED_ON_CPU_PARALLEL(DISABLED_ON_GPU(ExecuteParallel))) {
+  Computation add_with_one_arg, mul_with_two_args, dot_with_one_arg;
+  Shape shape = ShapeUtil::MakeShape(S32, {2, 2});
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<GlobalData> const_arg,
+      client_->TransferToServer(*Literal::CreateR2<int32>({{5, 6}, {7, 8}})));
+
+  ComputationBuilder b(client_, TestName() + ".add");
+  b.Add(b.Parameter(0, shape, "param_0"),
+        b.ConstantR2<int32>({{1, 2}, {3, 4}}));
+  TF_ASSERT_OK_AND_ASSIGN(add_with_one_arg, b.Build());
+
+  // We can't really test parallel execution on CPU since all of the cores in a
+  // CPU are presented as a single device.  So for now we test "parallel"
+  // execution on a single device.
+  std::vector<Client::ComputationInstance> computation_instances;
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<xla::DeviceHandle> devices,
+                          client_->GetDeviceHandles(1));
+  ASSERT_EQ(devices.size(), 1);
+
+  ExecutionOptions options = execution_options_;
+  *options.add_device_handles() = devices[0];
+  computation_instances.push_back(Client::ComputationInstance(
+      add_with_one_arg, {const_arg.get()}, options, nullptr));
+
+  TF_ASSERT_OK_AND_ASSIGN(auto results,
+                          client_->ExecuteParallel(computation_instances));
+  auto expected_result = Literal::CreateR2<int32>({{6, 8}, {10, 12}});
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto result_literal,
+      client_->Transfer(*results[0], &expected_result->shape()));
+
+  LiteralTestUtil::ExpectEqual(*expected_result, *result_literal);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/codegen_test_base.cc b/tensorflow/compiler/xla/tests/codegen_test_base.cc
index 43ea7f6019415a171123ee0315533b8a3b1ff984..e472408dcf7ed5fec74e886fd0092ce47ee2e7eb 100644
--- a/tensorflow/compiler/xla/tests/codegen_test_base.cc
+++ b/tensorflow/compiler/xla/tests/codegen_test_base.cc
@@ -19,8 +19,11 @@ namespace xla {
 
 StatusOr<std::unique_ptr<Executable>> CodegenTestBase::CompileToExecutable(
     std::unique_ptr<HloModule> hlo_module) {
-  return backend().compiler()->Compile(std::move(hlo_module),
-                                       backend().default_stream_executor());
+  TF_ASSIGN_OR_RETURN(hlo_module, backend().compiler()->RunHloPasses(
+                                      std::move(hlo_module),
+                                      backend().default_stream_executor()));
+  return backend().compiler()->RunBackend(std::move(hlo_module),
+                                          backend().default_stream_executor());
 }
 
 StatusOr<std::unique_ptr<AotCompilationResult>>
diff --git a/tensorflow/compiler/xla/tests/conditional_test.cc b/tensorflow/compiler/xla/tests/conditional_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c8c4932be821e410e25c41741df436544ab876f0
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/conditional_test.cc
@@ -0,0 +1,325 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+
+namespace xla {
+namespace {
+
+class ConditionalOpTest : public ClientLibraryTestBase {
+ protected:
+  Computation CreateR0F32ConstantComputation(float value) {
+    ComputationBuilder builder(client_, "Constant");
+    builder.Parameter(0, empty_tuple_, "tuple");
+    builder.ConstantR0<float>(value);
+    auto build_status = builder.Build();
+    EXPECT_IS_OK(build_status.status());
+    return build_status.ConsumeValueOrDie();
+  }
+
+  Computation CreateR0F32IdentityComputation() {
+    ComputationBuilder builder(client_, "Identity");
+    builder.Parameter(0, r0f32_, "x");
+    auto build_status = builder.Build();
+    EXPECT_IS_OK(build_status.status());
+    return build_status.ConsumeValueOrDie();
+  }
+
+  Computation CreateR0F32CeilComputation() {
+    ComputationBuilder builder(client_, "Ceil");
+    auto param = builder.Parameter(0, r0f32_, "param");
+    builder.Ceil(param);
+    auto build_status = builder.Build();
+    EXPECT_IS_OK(build_status.status());
+    return build_status.ConsumeValueOrDie();
+  }
+
+  Computation CreateR0F32FloorComputation() {
+    ComputationBuilder builder(client_, "Ceil");
+    auto param = builder.Parameter(0, r0f32_, "param");
+    builder.Floor(param);
+    auto build_status = builder.Build();
+    EXPECT_IS_OK(build_status.status());
+    return build_status.ConsumeValueOrDie();
+  }
+
+  Computation CreateAddTupleComputation(const string& computation_name,
+                                        const Shape& tuple_shape) {
+    ComputationBuilder builder(client_, computation_name);
+    auto tuple = builder.Parameter(0, tuple_shape, "tuple");
+    auto x = builder.GetTupleElement(tuple, 0);
+    auto y = builder.GetTupleElement(tuple, 1);
+    builder.Add(x, y);
+    auto build_status = builder.Build();
+    EXPECT_IS_OK(build_status.status());
+    return build_status.ConsumeValueOrDie();
+  }
+
+  Computation CreateAddR0Computation() {
+    return CreateAddTupleComputation("AddR0", tuple_2_r0f32_);
+  }
+
+  Computation CreateAddR1Computation() {
+    return CreateAddTupleComputation("AddR1", tuple_2_r1s2f32_);
+  }
+
+  Computation CreateSubTupleComputation(const string& computation_name,
+                                        const Shape& tuple_shape) {
+    ComputationBuilder builder(client_, computation_name);
+    auto tuple = builder.Parameter(0, tuple_shape, "tuple");
+    auto x = builder.GetTupleElement(tuple, 0);
+    auto y = builder.GetTupleElement(tuple, 1);
+    builder.Sub(x, y);
+    auto build_status = builder.Build();
+    EXPECT_IS_OK(build_status.status());
+    return build_status.ConsumeValueOrDie();
+  }
+
+  Computation CreateSubR0Computation() {
+    return CreateSubTupleComputation("SubR0", tuple_2_r0f32_);
+  }
+
+  Computation CreateSubR1Computation() {
+    return CreateSubTupleComputation("SubR1", tuple_2_r1s2f32_);
+  }
+
+  Shape r0f32_ = ShapeUtil::MakeShape(F32, {});
+  Shape tuple_2_r0f32_ = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {}), ShapeUtil::MakeShape(F32, {})});
+  Shape tuple_2_r1s2f32_ = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {2}), ShapeUtil::MakeShape(F32, {2})});
+  Shape empty_tuple_ = ShapeUtil::MakeTupleShape({});
+  ErrorSpec error_spec_{0.001};
+};
+
+// Test true and false computations that do not take any parameters.
+XLA_TEST_F(ConditionalOpTest, Parameters0) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(true);
+  auto operands = builder.Tuple({});
+  auto true_computation = CreateR0F32ConstantComputation(56.0f);
+  auto false_computation = CreateR0F32ConstantComputation(12.0f);
+  auto result = builder.Conditional(pred, operands, true_computation, operands,
+                                    false_computation);
+
+  ComputeAndCompareR0<float>(&builder, 56.0f, {}, error_spec_);
+}
+
+// Test true and false computations that take in 1 parameter.
+XLA_TEST_F(ConditionalOpTest, Parameters1) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(false);
+  auto operand1 = builder.ConstantR0<float>(56.0f);
+  auto operand2 = builder.ConstantR0<float>(12.0f);
+  auto identity = CreateR0F32IdentityComputation();
+  auto result =
+      builder.Conditional(pred, operand1, identity, operand2, identity);
+
+  ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
+}
+
+// Test conditional with two different computations in the true and false cases
+// that take in different arguments.
+XLA_TEST_F(ConditionalOpTest, DiffComputationsDiffArgs) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(false);
+  auto operand1 = builder.ConstantR0<float>(56.4f);
+  auto operand2 = builder.ConstantR0<float>(12.6f);
+  auto result =
+      builder.Conditional(pred, operand1, CreateR0F32CeilComputation(),
+                          operand2, CreateR0F32FloorComputation());
+
+  ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
+}
+
+// Test conditional with two different computations in the true and false cases
+// that take in the same arguments.
+XLA_TEST_F(ConditionalOpTest, DiffComputationsSameArg) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(false);
+  auto operand = builder.ConstantR0<float>(12.6f);
+  auto result = builder.Conditional(pred, operand, CreateR0F32CeilComputation(),
+                                    operand, CreateR0F32FloorComputation());
+
+  ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
+}
+
+// Test conditional with the same computation in the true and false cases but
+// take in different arguments.
+XLA_TEST_F(ConditionalOpTest, SameComputationDiffArgs) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(false);
+  auto operand1 = builder.ConstantR0<float>(56.4f);
+  auto operand2 = builder.ConstantR0<float>(12.6f);
+  auto floor = CreateR0F32FloorComputation();
+  auto result = builder.Conditional(pred, operand1, floor, operand2, floor);
+
+  ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
+}
+
+// Test conditional with the same computation in the true and false cases that
+// take in the same arguments.
+XLA_TEST_F(ConditionalOpTest, SameComputationSameArg) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(false);
+  auto operand = builder.ConstantR0<float>(12.6f);
+  auto floor = CreateR0F32FloorComputation();
+  auto result = builder.Conditional(pred, operand, floor, operand, floor);
+
+  ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
+}
+
+// Test conditional with different instances of the same computation in the true
+// and false cases.
+XLA_TEST_F(ConditionalOpTest, SameComputationDiffInstances) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(false);
+  auto operand1 = builder.ConstantR0<float>(56.4f);
+  auto operand2 = builder.ConstantR0<float>(12.6f);
+  auto result =
+      builder.Conditional(pred, operand1, CreateR0F32FloorComputation(),
+                          operand2, CreateR0F32FloorComputation());
+
+  ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
+}
+
+// Test the case when a call invokes a computation that contains a conditional.
+XLA_TEST_F(ConditionalOpTest, ConditionalWithCall) {
+  Shape r0bool = ShapeUtil::MakeShape(PRED, {});
+  ComputationBuilder inner_builder(client_, TestName() + ".inner_conditional");
+  auto pred_cond = inner_builder.Parameter(0, r0bool, "param0");
+  auto true_operand = inner_builder.Parameter(1, r0f32_, "param1");
+  auto false_operand = inner_builder.Parameter(2, r0f32_, "param2");
+  inner_builder.Conditional(pred_cond, true_operand,
+                            CreateR0F32CeilComputation(), false_operand,
+                            CreateR0F32FloorComputation());
+  auto inner_builder_result = inner_builder.Build();
+
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(false);
+  auto operand1 = builder.ConstantR0<float>(56.4f);
+  auto operand2 = builder.ConstantR0<float>(12.6f);
+  builder.Call(inner_builder_result.ConsumeValueOrDie(),
+               {pred, operand1, operand2});
+
+  ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
+}
+
+// Test true and false computations that take in 2 parameters and predicate is
+// true.
+XLA_TEST_F(ConditionalOpTest, Parameters2TrueBranch) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(true);
+  auto operand1 = builder.ConstantR0<float>(56.0f);
+  auto operand2 = builder.ConstantR0<float>(12.0f);
+  auto operands = builder.Tuple({operand1, operand2});
+  auto result = builder.Conditional(pred, operands, CreateAddR0Computation(),
+                                    operands, CreateSubR0Computation());
+
+  ComputeAndCompareR0<float>(&builder, 68.0f, {}, error_spec_);
+}
+
+// Test true and false computations that take in 2 parameters and predicate is
+// false.
+XLA_TEST_F(ConditionalOpTest, Parameters2FalseBranch) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(false);
+  auto operand1 = builder.ConstantR0<float>(56.0f);
+  auto operand2 = builder.ConstantR0<float>(12.0f);
+  auto operands = builder.Tuple({operand1, operand2});
+  auto result = builder.Conditional(pred, operands, CreateAddR0Computation(),
+                                    operands, CreateSubR0Computation());
+
+  ComputeAndCompareR0<float>(&builder, 44.0f, {}, error_spec_);
+}
+
+// Test true and false computations that take in 2 array parameters and
+// predicate is true.
+XLA_TEST_F(ConditionalOpTest, Parameters2ArrayTrueBranch) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(true);
+  auto operand1 = builder.ConstantR1<float>({24.0f, 56.0f});
+  auto operand2 = builder.ConstantR1<float>({10.0f, 11.0f});
+  auto operands = builder.Tuple({operand1, operand2});
+  auto result = builder.Conditional(pred, operands, CreateAddR1Computation(),
+                                    operands, CreateSubR1Computation());
+
+  ComputeAndCompareR1<float>(&builder, {34.0f, 67.0f}, {}, error_spec_);
+}
+
+// Test true and false computations that take in 2 array parameters and
+// predicate is false.
+XLA_TEST_F(ConditionalOpTest, Parameters2ArrayFalseBranch) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(false);
+  auto operand1 = builder.ConstantR1<float>({24.0f, 56.0f});
+  auto operand2 = builder.ConstantR1<float>({10.0f, 11.0f});
+  auto operands = builder.Tuple({operand1, operand2});
+  auto result = builder.Conditional(pred, operands, CreateAddR1Computation(),
+                                    operands, CreateSubR1Computation());
+
+  ComputeAndCompareR1<float>(&builder, {14.0f, 45.0f}, {}, error_spec_);
+}
+
+// Test the case where one conditional is nested within another.
+XLA_TEST_F(ConditionalOpTest, NestedConditionals) {
+  Shape r0bool = ShapeUtil::MakeShape(PRED, {});
+  Shape tuple_shape = ShapeUtil::MakeTupleShape({r0bool, r0f32_, r0f32_});
+  ComputationBuilder inner_builder(client_, TestName() + ".inner_conditional");
+  auto param0 = inner_builder.Parameter(0, tuple_shape, "param0");
+  auto pred_cond = inner_builder.GetTupleElement(param0, 0);
+  auto true_operand = inner_builder.GetTupleElement(param0, 1);
+  auto false_operand = inner_builder.GetTupleElement(param0, 2);
+  inner_builder.Conditional(pred_cond, true_operand,
+                            CreateR0F32CeilComputation(), false_operand,
+                            CreateR0F32FloorComputation());
+  auto inner_builder_result = inner_builder.Build();
+
+  ComputationBuilder builder(client_, TestName());
+  auto pred1 = builder.ConstantR0<bool>(true);
+  auto pred2 = builder.ConstantR0<bool>(false);
+  auto operand1 = builder.ConstantR0<float>(1.1f);
+  auto operand2 = builder.ConstantR0<float>(12.2f);
+  auto operand3 = builder.ConstantR0<float>(43.3f);
+  auto tuple_operand = builder.Tuple({pred2, operand1, operand2});
+  builder.Conditional(pred1, tuple_operand,
+                      inner_builder_result.ConsumeValueOrDie(), operand3,
+                      CreateR0F32IdentityComputation());
+
+  ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
+}
+
+// Test a mismatch in the shape of the true operand and true computation.
+XLA_TEST_F(ConditionalOpTest, ShapeMismatch) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(true);
+  auto operand1 = builder.ConstantR0<float>(56.0f);
+  auto operand2 = builder.ConstantR0<float>(12.0f);
+  auto operands = builder.Tuple({operand1, operand2});
+  builder.Conditional(pred, operands, CreateAddR1Computation(), operands,
+                      CreateSubR0Computation());
+
+  auto result = builder.Build();
+  EXPECT_FALSE(result.ok());
+  EXPECT_THAT(result.status().error_message(),
+              ::testing::HasSubstr("true_operand must match the shape of the "
+                                   "only parameter of true_computation"));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
index b0a63bccbb93f226175beff2e30e2a243fdca1d3..896b34fb6e2762c14bd9ec2bf1ba13c548d4cf60 100644
--- a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
@@ -39,8 +39,8 @@ class ConvolutionDimensionNumbersTest : public ClientLibraryTestBase {};
 // Tests the convolution operation with invalid input dimension numbers.
 TEST_F(ConvolutionDimensionNumbersTest, InvalidInputDimensionNumbers) {
   auto dimension_numbers_status =
-      ComputationBuilder::CreateConvDimensionNumbers(0, 2, 0, 2, 2, 3, 0, 1, 2,
-                                                     3);
+      ComputationBuilder::CreateConvDimensionNumbers(0, 2, 2, 3, 0, 1, 2, 3, 0,
+                                                     1, 2, 3);
   ASSERT_FALSE(dimension_numbers_status.ok());
   ASSERT_THAT(dimension_numbers_status.status().error_message(),
               ::testing::HasSubstr("input are not unique"));
@@ -49,13 +49,23 @@ TEST_F(ConvolutionDimensionNumbersTest, InvalidInputDimensionNumbers) {
 // Tests the convolution operation with invalid weight dimension numbers.
 TEST_F(ConvolutionDimensionNumbersTest, InvalidWeightDimensionNumbers) {
   auto dimension_numbers_status =
-      ComputationBuilder::CreateConvDimensionNumbers(0, 1, 0, 1, 2, 3, 2, 3, 2,
-                                                     3);
+      ComputationBuilder::CreateConvDimensionNumbers(0, 1, 2, 3, 0, 1, 2, 3, 0,
+                                                     2, 2, 3);
   ASSERT_FALSE(dimension_numbers_status.ok());
   ASSERT_THAT(dimension_numbers_status.status().error_message(),
               ::testing::HasSubstr("weight are not unique"));
 }
 
+// Tests the convolution operation with invalid output dimension numbers.
+TEST_F(ConvolutionDimensionNumbersTest, InvalidOutputDimensionNumbers) {
+  auto dimension_numbers_status =
+      ComputationBuilder::CreateConvDimensionNumbers(0, 1, 2, 3, 0, 2, 2, 3, 0,
+                                                     1, 2, 3);
+  ASSERT_FALSE(dimension_numbers_status.ok());
+  ASSERT_THAT(dimension_numbers_status.status().error_message(),
+              ::testing::HasSubstr("output are not unique"));
+}
+
 XLA_TEST_F(ConvolutionDimensionNumbersTest,
            TwoConvsWithDifferentDimensionNumbers) {
   auto input_array = MakeUnique<Array4D<float>>(2, 3, 5, 5);
diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc
index 7425f778a635c3b52b046d18ff79176a9c26c577..2924c08615fa706bb19addf04bf58e1d5dd5a659 100644
--- a/tensorflow/compiler/xla/tests/convolution_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_test.cc
@@ -370,9 +370,12 @@ XLA_TEST_F(ConvolutionTest, Convolve3D_1x4x2x3x3_2x2x2x3x3_Valid) {
     ConvolutionDimensionNumbers dnums;
     dnums.set_input_batch_dimension(0);
     dnums.set_output_batch_dimension(0);
-    dnums.add_spatial_dimensions(1);
-    dnums.add_spatial_dimensions(2);
-    dnums.add_spatial_dimensions(3);
+    dnums.add_input_spatial_dimensions(1);
+    dnums.add_output_spatial_dimensions(1);
+    dnums.add_input_spatial_dimensions(2);
+    dnums.add_output_spatial_dimensions(2);
+    dnums.add_input_spatial_dimensions(3);
+    dnums.add_output_spatial_dimensions(3);
     dnums.set_input_feature_dimension(4);
     dnums.set_output_feature_dimension(4);
     dnums.add_kernel_spatial_dimensions(0);
@@ -423,8 +426,10 @@ XLA_TEST_F(ConvolutionTest, Convolve2D_1x3x3x5_3x3x5x5_Valid) {
     ConvolutionDimensionNumbers dnums;
     dnums.set_input_batch_dimension(0);
     dnums.set_output_batch_dimension(0);
-    dnums.add_spatial_dimensions(1);
-    dnums.add_spatial_dimensions(2);
+    dnums.add_input_spatial_dimensions(1);
+    dnums.add_output_spatial_dimensions(1);
+    dnums.add_input_spatial_dimensions(2);
+    dnums.add_output_spatial_dimensions(2);
     dnums.set_input_feature_dimension(3);
     dnums.set_output_feature_dimension(3);
     dnums.add_kernel_spatial_dimensions(0);
@@ -458,6 +463,54 @@ XLA_TEST_F(ConvolutionTest, Convolve2D_1x3x3x5_3x3x5x5_Valid) {
                            error_spec_);
 }
 
+// Test fixture to run convolution tests with and without convolution
+// canonicalization enabled.
+class ConvolveWithAndWithoutCanonicalization
+    : public ConvolutionTest,
+      public ::testing::WithParamInterface<bool> {};
+
+XLA_TEST_P(ConvolveWithAndWithoutCanonicalization,
+           DISABLED_ON_GPU(Convolve2D_NoSpatialDims)) {
+  if (GetParam()) {
+    execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
+        "convolution-canonicalization");
+  }
+  ComputationBuilder builder(client_, TestName());
+  Shape input_shape = ShapeUtil::MakeShape(F32, {4, 29});
+  Shape filter_shape = ShapeUtil::MakeShape(F32, {4, 10});
+
+  auto input = builder.Parameter(0, input_shape, "input");
+  auto filter = builder.Parameter(1, filter_shape, "filter");
+
+  ConvolutionDimensionNumbers dnums;
+  dnums.set_input_feature_dimension(0);
+  dnums.set_input_batch_dimension(1);
+  dnums.set_kernel_input_feature_dimension(0);
+  dnums.set_kernel_output_feature_dimension(1);
+  dnums.set_output_batch_dimension(0);
+  dnums.set_output_feature_dimension(1);
+  auto conv = builder.ConvWithGeneralDimensions(input, filter, {},
+                                                Padding::kValid, dnums);
+
+  Array2D<float> param0(4, 29);
+  param0.FillUnique();
+
+  Array2D<float> param1(4, 10);
+  param1.FillUnique();
+
+  Array2D<float> expected_result(29, 10);
+  expected_result.Fill(0);
+
+  ComputeAndCompare(
+      &builder, conv,
+      {*Literal::CreateFromArray(param0), *Literal::CreateFromArray(param1)},
+      error_spec_);
+}
+
+INSTANTIATE_TEST_CASE_P(ConvolveWithAndWithoutCanonicalization_Instantiation,
+                        ConvolveWithAndWithoutCanonicalization,
+                        ::testing::Values(true, false));
+
 struct Convolve1DTestParam {
   int64 input_feature;
   int64 output_feature;
@@ -490,7 +543,8 @@ XLA_TEST_P(Convolve1D1WindowTest, Convolve1D1Window) {
     ConvolutionDimensionNumbers dnums;
     dnums.set_input_batch_dimension(0);
     dnums.set_output_batch_dimension(0);
-    dnums.add_spatial_dimensions(1);
+    dnums.add_input_spatial_dimensions(1);
+    dnums.add_output_spatial_dimensions(1);
     dnums.set_input_feature_dimension(2);
     dnums.set_output_feature_dimension(2);
     dnums.add_kernel_spatial_dimensions(0);
diff --git a/tensorflow/compiler/xla/tests/convolution_variants_test.cc b/tensorflow/compiler/xla/tests/convolution_variants_test.cc
index 9b36e3722b8f8a5d01c426425fdfb0c4b9ae3a16..9c1145def8c11f1222c63adf006102887d49f00d 100644
--- a/tensorflow/compiler/xla/tests/convolution_variants_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_variants_test.cc
@@ -320,9 +320,10 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter3x3in2x2Padded) {
   Array4D<float> input_array(1, 1, 2, 2, {1, 2, 3, 4});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
 
-  const Array4D<float> filter_array(1, 1, 3, 3, {10000, 0, 1000,  // row 0
-                                                 0, 100, 0,       // row 1
-                                                 10, 0, 1});      // row 2
+  const Array4D<float> filter_array(1, 1, 3, 3,
+                                    {10000, 0, 1000,  // row 0
+                                     0, 100, 0,       // row 1
+                                     10, 0, 1});      // row 2
   auto filter = builder.ConstantR4FromArray4D<float>(filter_array);
 
   builder.Conv(input, filter, {1, 1}, Padding::kSame);
@@ -472,7 +473,9 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x2Input3x1x2x2) {
   builder.Conv(input, filter, {1, 1}, Padding::kValid);
 
   std::vector<float> expected_data = {
-      23, 33, 43,
+      23,
+      33,
+      43,
   };
   Array4D<float> expected(bs, 1, 1, 1, expected_data);
   ComputeAndCompareR4<float>(&builder, expected, {}, error_spec_);
@@ -669,10 +672,11 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation) {
   std::iota(input_data.begin(), input_data.end(), 1.0);
   Array4D<float> input_array(1, 1, 3, 4, input_data);
 
-  Array4D<float> filter_array(1, 1, 4, 3, {100, 10, 1,  //
-                                           200, 20, 2,  //
-                                           300, 30, 3,  //
-                                           400, 40, 4});
+  Array4D<float> filter_array(1, 1, 4, 3,
+                              {100, 10, 1,  //
+                               200, 20, 2,  //
+                               300, 30, 3,  //
+                               400, 40, 4});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
   auto filter = builder.ConstantR4FromArray4D<float>(filter_array);
   builder.ConvGeneralDilated(
@@ -681,9 +685,10 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation) {
       /*rhs_dilation=*/{},
       ComputationBuilder::CreateDefaultConvDimensionNumbers());
 
-  Array4D<float> expected(1, 1, 3, 5, {204, 40, 406, 60, 608,       //
-                                       1518, 180, 1821, 210, 2124,  //
-                                       4146, 460, 4651, 510, 5156});
+  Array4D<float> expected(1, 1, 3, 5,
+                          {204, 40, 406, 60, 608,       //
+                           1518, 180, 1821, 210, 2124,  //
+                           4146, 460, 4651, 510, 5156});
   ComputeAndCompareR4<float>(&builder, expected, {}, error_spec_);
 }
 
@@ -926,7 +931,8 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input16x16x1x1_Filter16x16x1x1) {
   ComputeAndCompareR4<float>(&builder, *expected, {}, error_spec_);
 }
 
-XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input16x16x16x16_Filter16x16x16x16) {
+XLA_TEST_F(ConvolutionVariantsTest,
+           RandomData_Input16x16x16x16_Filter16x16x16x16) {
   constexpr int bs = 16;
   constexpr int iz = 16;
   constexpr int oz = 16;
@@ -976,8 +982,10 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2x1x1Input1x2x3x1GeneralPadding) {
   // NHWC input format.
   dnums.set_input_batch_dimension(0);
   dnums.set_output_batch_dimension(0);
-  dnums.add_spatial_dimensions(1);
-  dnums.add_spatial_dimensions(2);
+  dnums.add_input_spatial_dimensions(1);
+  dnums.add_output_spatial_dimensions(1);
+  dnums.add_input_spatial_dimensions(2);
+  dnums.add_output_spatial_dimensions(2);
   dnums.set_input_feature_dimension(3);
   dnums.set_output_feature_dimension(3);
 
@@ -1018,8 +1026,10 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1GeneralPadding) {
   // NHWC input format.
   dnums.set_input_batch_dimension(0);
   dnums.set_output_batch_dimension(0);
-  dnums.add_spatial_dimensions(1);
-  dnums.add_spatial_dimensions(2);
+  dnums.add_input_spatial_dimensions(1);
+  dnums.add_output_spatial_dimensions(1);
+  dnums.add_input_spatial_dimensions(2);
+  dnums.add_output_spatial_dimensions(2);
   dnums.set_input_feature_dimension(3);
   dnums.set_output_feature_dimension(3);
 
@@ -1060,8 +1070,10 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1NoPadding) {
   // NHWC input format.
   dnums.set_input_batch_dimension(0);
   dnums.set_output_batch_dimension(0);
-  dnums.add_spatial_dimensions(1);
-  dnums.add_spatial_dimensions(2);
+  dnums.add_input_spatial_dimensions(1);
+  dnums.add_output_spatial_dimensions(1);
+  dnums.add_input_spatial_dimensions(2);
+  dnums.add_output_spatial_dimensions(2);
   dnums.set_input_feature_dimension(3);
   dnums.set_output_feature_dimension(3);
 
@@ -1099,8 +1111,10 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x3Input1x2x3x2NoPadding) {
   // NHWC input format.
   dnums.set_input_batch_dimension(0);
   dnums.set_output_batch_dimension(0);
-  dnums.add_spatial_dimensions(1);
-  dnums.add_spatial_dimensions(2);
+  dnums.add_input_spatial_dimensions(1);
+  dnums.add_output_spatial_dimensions(1);
+  dnums.add_input_spatial_dimensions(2);
+  dnums.add_output_spatial_dimensions(2);
   dnums.set_input_feature_dimension(3);
   dnums.set_output_feature_dimension(3);
 
@@ -1131,7 +1145,8 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x3Input1x2x3x2NoPadding) {
 //   Conv([1,2,3], Reverse([5,6]), padding_low=1)
 // into
 //   BackwardInputConv([1,2,3], [5,6], padding_low=0, padding_high=1)
-XLA_TEST_F(ConvolutionVariantsTest, BackwardInputLowPaddingLessThanHighPadding) {
+XLA_TEST_F(ConvolutionVariantsTest,
+           BackwardInputLowPaddingLessThanHighPadding) {
   ComputationBuilder builder(client_, TestName());
 
   auto gradients = builder.ConstantR4FromArray4D<float>(
@@ -1149,7 +1164,8 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputLowPaddingLessThanHighPadding)
 //   Conv([1], Reverse([1,10,100]), padding_high=3, base_dilation=3)
 // into
 //   BackwardInputConv([1], [1,10,100], stride=3, padding=(2,1))
-XLA_TEST_F(ConvolutionVariantsTest, BackwardInputLowPaddingGreaterThanHighPadding) {
+XLA_TEST_F(ConvolutionVariantsTest,
+           BackwardInputLowPaddingGreaterThanHighPadding) {
   ComputationBuilder builder(client_, TestName());
 
   auto gradients = builder.ConstantR4FromArray4D<float>(
@@ -1206,7 +1222,8 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputWithNegativePaddingHigh) {
   ComputeAndCompareR4<float>(&builder, {{{{12, 23, 30, 0}}}}, {}, error_spec_);
 }
 
-XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterLowPaddingLessThanHighPadding) {
+XLA_TEST_F(ConvolutionVariantsTest,
+           BackwardFilterLowPaddingLessThanHighPadding) {
   ComputationBuilder builder(client_, TestName());
 
   // activations:      1,2,3,4  ---pad--> 0,1,2,3,4,0,0
@@ -1230,7 +1247,7 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterLowPaddingLessThanHighPadding)
 }
 
 XLA_TEST_F(ConvolutionVariantsTest,
-       BackwardFilterLowPaddingGreaterThanHighPadding) {
+           BackwardFilterLowPaddingGreaterThanHighPadding) {
   ComputationBuilder builder(client_, TestName());
 
   // activations:      1,2,3,4  ---pad--> 0,0,1,2,3,4
diff --git a/tensorflow/compiler/xla/tests/copy_test.cc b/tensorflow/compiler/xla/tests/copy_test.cc
index bcb85b04eefa349df1c055e010d584b85b55a4a8..d64bf0aa5bd5e9d6213ea07b3da3305a9c621c65 100644
--- a/tensorflow/compiler/xla/tests/copy_test.cc
+++ b/tensorflow/compiler/xla/tests/copy_test.cc
@@ -56,9 +56,13 @@ class CopyOpTest : public HloTestBase {
                                 tensorflow::gtl::ArraySlice<int64> permutation);
 };
 
-XLA_TEST_F(CopyOpTest, CopyR0Bool) { TestCopyOp(*Literal::CreateR0<bool>(true)); }
+XLA_TEST_F(CopyOpTest, CopyR0Bool) {
+  TestCopyOp(*Literal::CreateR0<bool>(true));
+}
 
-XLA_TEST_F(CopyOpTest, CopyR1S0U32) { TestCopyOp(*Literal::CreateR1<uint32>({})); }
+XLA_TEST_F(CopyOpTest, CopyR1S0U32) {
+  TestCopyOp(*Literal::CreateR1<uint32>({}));
+}
 
 XLA_TEST_F(CopyOpTest, CopyR1S3U32) {
   TestCopyOp(*Literal::CreateR1<uint32>({1, 2, 3}));
@@ -85,7 +89,6 @@ XLA_TEST_F(CopyOpTest, CopyParameterScalar) {
   // Copy literal to device to use as parameter.
   auto literal = Literal::CreateR0<float>(42.0);
   Shape shape = literal->shape();
-  auto constant_device_base = TransferToDevice(*literal);
 
   auto param0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, shape, "param0"));
@@ -98,7 +101,7 @@ XLA_TEST_F(CopyOpTest, CopyParameterScalar) {
   module->AddEntryComputation(std::move(computation));
 
   std::unique_ptr<Literal> result =
-      ExecuteAndTransfer(std::move(module), {constant_device_base});
+      ExecuteAndTransfer(std::move(module), {literal.get()});
   LiteralTestUtil::ExpectR0Near<float>(42.0f, *result, error_spec_);
 }
 
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index bfb04fd9f9bf6887c4462cb00fee00250517f5c4..cc683701e6305510d202721fe645310f1009081c 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -51,8 +51,6 @@ class DotOperationTest : public ClientLibraryTestBase {
   template <typename Element>
   void TestNonsquareMatrixDot(bool lhs_row_major = false,
                               bool rhs_row_major = false);
-  void TestMatrixDot(int M, int K, int N, bool lhs_row_major = false,
-                     bool rhs_row_major = false);
 };
 
 XLA_TEST_F(DotOperationTest, ZeroElementVectorDotF32) {
@@ -199,158 +197,182 @@ void DotOperationTest::TestSquareMatrixDot(bool lhs_row_major,
       &builder, expected, {lhs_handle.get(), rhs_handle.get()}, error_spec_);
 }
 
-void DotOperationTest::TestMatrixDot(int M, int K, int N, bool lhs_row_major,
-                                     bool rhs_row_major) {
-  std::unique_ptr<Array2D<float>> lhs_data =
-      MakeLinspaceArray2D(0.0, 1.0, M, K);
-  std::unique_ptr<Literal> lhs_lit = Literal::CreateR2FromArray2DWithLayout(
-      *lhs_data,
-      LayoutUtil::MakeLayout(MinorToMajorForIsRowMajor(lhs_row_major)));
-  auto lhs_handle = client_->TransferToServer(*lhs_lit).ConsumeValueOrDie();
+struct DotTestParam {
+  int m;
+  int k;
+  int n;
+  bool dot_lhs_row_major;
+  bool dot_rhs_row_major;
+  bool has_addend;
+  bool addend_row_major;
+};
 
-  std::unique_ptr<Array2D<float>> rhs_data =
-      MakeLinspaceArray2D(0.0, 1.0, K, N);
-  std::unique_ptr<Literal> rhs_lit = Literal::CreateR2FromArray2DWithLayout(
-      *rhs_data,
-      LayoutUtil::MakeLayout(MinorToMajorForIsRowMajor(rhs_row_major)));
-  auto rhs_handle = client_->TransferToServer(*rhs_lit).ConsumeValueOrDie();
+string PrintDotTestParam(
+    const ::testing::TestParamInfo<DotTestParam>& test_param) {
+  const DotTestParam& param = test_param.param;
+  if (param.has_addend) {
+    return tensorflow::strings::StrCat(param.m, "x", param.k, "x", param.n,
+                                       "_MajorToMinor",
+                                       param.dot_lhs_row_major ? "T" : "F",
+                                       param.dot_rhs_row_major ? "T" : "F",
+                                       param.addend_row_major ? "T" : "F");
+  } else {
+    return tensorflow::strings::StrCat(param.m, "x", param.k, "x", param.n,
+                                       "_MajorToMinor",
+                                       param.dot_lhs_row_major ? "T" : "F",
+                                       param.dot_rhs_row_major ? "T" : "F");
+  }
+}
+
+class ParametricDotTest : public DotOperationTest,
+                          public ::testing::WithParamInterface<DotTestParam> {};
+
+XLA_TEST_P(ParametricDotTest, TestF32) {
+  DotTestParam param = GetParam();
+
+  std::unique_ptr<Array2D<float>> dot_lhs_data =
+      MakeLinspaceArray2D(0.0, 1.0, param.m, param.k);
+  std::unique_ptr<Literal> dot_lhs_lit = Literal::CreateR2FromArray2DWithLayout(
+      *dot_lhs_data, LayoutUtil::MakeLayout(
+                         MinorToMajorForIsRowMajor(param.dot_lhs_row_major)));
+  std::unique_ptr<GlobalData> dot_lhs_handle =
+      client_->TransferToServer(*dot_lhs_lit).ConsumeValueOrDie();
+
+  std::unique_ptr<Array2D<float>> dot_rhs_data =
+      MakeLinspaceArray2D(0.0, 1.0, param.k, param.n);
+  std::unique_ptr<Literal> dot_rhs_lit = Literal::CreateR2FromArray2DWithLayout(
+      *dot_rhs_data, LayoutUtil::MakeLayout(
+                         MinorToMajorForIsRowMajor(param.dot_rhs_row_major)));
+  std::unique_ptr<GlobalData> dot_rhs_handle =
+      client_->TransferToServer(*dot_rhs_lit).ConsumeValueOrDie();
+
+  std::unique_ptr<Array2D<float>> addend_data;
+  std::unique_ptr<Literal> addend_lit;
+  std::unique_ptr<GlobalData> addend_handle;
+
+  if (param.has_addend) {
+    addend_data = MakeLinspaceArray2D(0.0, 1.0, param.m, param.n);
+    addend_lit = Literal::CreateR2FromArray2DWithLayout(
+        *addend_data, LayoutUtil::MakeLayout(
+                          MinorToMajorForIsRowMajor(param.addend_row_major)));
+    addend_handle = client_->TransferToServer(*addend_lit).ConsumeValueOrDie();
+  }
 
   ComputationBuilder builder(client_, TestName());
   auto prim_type = primitive_util::NativeToPrimitiveType<float>();
   auto result = builder.Dot(
-      builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {M, K}), "lhs"),
-      builder.Parameter(1, ShapeUtil::MakeShape(prim_type, {K, N}), "rhs"));
-
-  std::unique_ptr<Array2D<float>> expected =
-      ReferenceUtil::MatmulArray2D(*lhs_data, *rhs_data);
-
-  ComputeAndCompareR2<float>(&builder, *expected,
-                             {lhs_handle.get(), rhs_handle.get()},
-                             ErrorSpec(0.3, 3e-3));
-}
-
-XLA_TEST_F(DotOperationTest, MatrixDotF32_12_117_7_MinorToMajorTF) {
-  TestMatrixDot(12, 117, 7, true, false);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixDotF32_12_117_7_MinorToMajorFT) {
-  TestMatrixDot(12, 117, 7, false, true);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixDotF32_12_117_7_MinorToMajorTT) {
-  TestMatrixDot(12, 117, 7, true, true);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixDotF32_12_117_7_MinorToMajorFF) {
-  TestMatrixDot(12, 117, 7, false, false);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixDotF32_270_270_520_MinorToMajorTT) {
-  TestMatrixDot(270, 270, 520, true, true);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixDotF32_270_270_520_MinorToMajorTF) {
-  TestMatrixDot(270, 270, 520, true, false);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixDotF32_270_270_520_MinorToMajorFT) {
-  TestMatrixDot(270, 270, 520, false, true);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixDotF32_270_270_520_MinorToMajorFF) {
-  TestMatrixDot(270, 270, 520, false, false);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixDotF32_260_3_520_MinorToMajorTT) {
-  TestMatrixDot(269, 3, 520, true, true);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixDotF32_260_3_520_MinorToMajorTF) {
-  TestMatrixDot(260, 3, 520, true, false);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixDotF32_260_3_520_MinorToMajorFT) {
-  TestMatrixDot(260, 3, 520, false, true);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixDotF32_260_3_520_MinorToMajorFF) {
-  TestMatrixDot(260, 3, 520, false, false);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_1x8x8) {
-  TestMatrixDot(1, 8, 8, true, true);
-}
+      builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {param.m, param.k}),
+                        "dot_lhs"),
+      builder.Parameter(1, ShapeUtil::MakeShape(prim_type, {param.k, param.n}),
+                        "dot_rhs"));
+
+  if (param.has_addend) {
+    result = builder.Add(
+        result,
+        builder.Parameter(
+            2, ShapeUtil::MakeShape(prim_type, {param.m, param.n}), "addend"));
+  }
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_1x130x8) {
-  TestMatrixDot(1, 130, 8, true, true);
-}
+  std::unique_ptr<Array2D<float>> expected;
+  if (param.has_addend) {
+    expected = ReferenceUtil::ApplyElementwise2D(
+        std::plus<float>(),
+        *ReferenceUtil::MatmulArray2D(*dot_lhs_data, *dot_rhs_data),
+        *addend_data);
+  } else {
+    expected = ReferenceUtil::MatmulArray2D(*dot_lhs_data, *dot_rhs_data);
+  }
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_1x8x130) {
-  TestMatrixDot(1, 8, 130, true, true);
-}
+  std::vector<GlobalData*> args = {dot_lhs_handle.get(), dot_rhs_handle.get()};
+  if (param.has_addend) {
+    args.push_back(addend_handle.get());
+  }
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_1x290x130) {
-  TestMatrixDot(1, 290, 130, true, true);
+  ComputeAndCompareR2<float>(&builder, *expected, args, ErrorSpec(0.3, 3e-3));
 }
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_2x1x1) {
-  TestMatrixDot(2, 1, 1, true, true);
-}
+std::vector<DotTestParam> CreateDotTestParameters() {
+  std::vector<DotTestParam> params;
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_8x8x1) {
-  TestMatrixDot(8, 8, 1, true, true);
-}
+  auto add_matrix_matrix_dot_test = [&](int m, int k, int n) {
+    for (bool lhs_row_major : {true, false}) {
+      for (bool rhs_row_major : {true, false}) {
+        params.push_back({/*m=*/m, /*k=*/k, /*n=*/n,
+                          /*dot_lhs_row_major=*/lhs_row_major,
+                          /*dot_rhs_row_major=*/rhs_row_major,
+                          /*has_addend=*/false, /*addend_row_major=*/true});
+      }
+    }
+  };
+
+  auto add_matrix_vector_dot_test = [&](int k, int n) {
+    for (bool has_addend : {false, true}) {
+      params.push_back({/*m=*/1, /*k=*/k, /*n=*/n,
+                        /*dot_lhs_row_major=*/true, /*dot_rhs_row_major=*/true,
+                        /*has_addend=*/has_addend, /*addend_row_major=*/true});
+      if (n != 1) {
+        params.push_back(
+            {/*m=*/n, /*k=*/k, /*n=*/1,
+             /*dot_lhs_row_major=*/true, /*dot_rhs_row_major=*/true,
+             /*has_addend=*/has_addend, /*addend_row_major=*/true});
+      }
+    }
+  };
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_16x1x1) {
-  TestMatrixDot(16, 1, 1, true, true);
-}
+  add_matrix_matrix_dot_test(/*m=*/12, /*k=*/117, /*n=*/7);
+  add_matrix_matrix_dot_test(/*m=*/270, /*k=*/270, /*n=*/520);
+  add_matrix_matrix_dot_test(/*m=*/260, /*k=*/3, /*n=*/520);
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_16x3x1) {
-  TestMatrixDot(16, 3, 1, true, true);
-}
+  add_matrix_vector_dot_test(/*k=*/8, /*n=*/8);
+  add_matrix_vector_dot_test(/*k=*/130, /*n=*/8);
+  add_matrix_vector_dot_test(/*k=*/8, /*n=*/130);
+  add_matrix_vector_dot_test(/*k=*/290, /*n=*/130);
+  add_matrix_vector_dot_test(/*k=*/1, /*n=*/1);
+  add_matrix_vector_dot_test(/*k=*/1, /*n=*/16);
+  add_matrix_vector_dot_test(/*k=*/3, /*n=*/16);
+  add_matrix_vector_dot_test(/*k=*/3, /*n=*/3);
+  add_matrix_vector_dot_test(/*k=*/29, /*n=*/29);
+  add_matrix_vector_dot_test(/*k=*/8, /*n=*/2);
+  add_matrix_vector_dot_test(/*k=*/2, /*n=*/8);
+  add_matrix_vector_dot_test(/*k=*/259, /*n=*/258);
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_3x3x1) {
-  TestMatrixDot(3, 3, 1, true, true);
+  return params;
 }
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_29x29x1) {
-  TestMatrixDot(29, 29, 1, true, true);
-}
+INSTANTIATE_TEST_CASE_P(DotTests, ParametricDotTest,
+                        ::testing::ValuesIn(CreateDotTestParameters()),
+                        PrintDotTestParam);
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_1x8x2) {
-  TestMatrixDot(1, 8, 2, true, true);
+XLA_TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorFF) {
+  TestSquareMatrixDot<float>(false, false);
 }
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_1x2x8) {
-  TestMatrixDot(1, 2, 8, true, true);
+XLA_TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorFT) {
+  TestSquareMatrixDot<float>(false, true);
 }
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_259x258x1) {
-  TestMatrixDot(259, 258, 1, true, true);
+XLA_TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorTF) {
+  TestSquareMatrixDot<float>(true, false);
 }
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_259x258x1_FT) {
-  TestMatrixDot(259, 258, 1, false, true);
+XLA_TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorTT) {
+  TestSquareMatrixDot<float>(true, true);
 }
 
-XLA_TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorFF) {
-  constexpr bool kLhsRowMajor = false;
-  constexpr bool kRhsRowMajor = false;
-  TestSquareMatrixDot<float>(kLhsRowMajor, kRhsRowMajor);
+XLA_TEST_F(DotOperationTest, SquareMatrixDotC64MinorToMajorFF) {
+  TestSquareMatrixDot<complex64>(false, false);
 }
 
-XLA_TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorFT) {
-  TestSquareMatrixDot<float>(false, true);
+XLA_TEST_F(DotOperationTest, SquareMatrixDotC64MinorToMajorFT) {
+  TestSquareMatrixDot<complex64>(false, true);
 }
 
-XLA_TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorTF) {
-  TestSquareMatrixDot<float>(true, false);
+XLA_TEST_F(DotOperationTest, SquareMatrixDotC64MinorToMajorTF) {
+  TestSquareMatrixDot<complex64>(true, false);
 }
 
-TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorTT) {
-  constexpr bool kLhsRowMajor = true;
-  constexpr bool kRhsRowMajor = true;
-  TestSquareMatrixDot<float>(kLhsRowMajor, kRhsRowMajor);
+XLA_TEST_F(DotOperationTest, SquareMatrixDotC64MinorToMajorTT) {
+  TestSquareMatrixDot<complex64>(true, true);
 }
 
 XLA_TEST_F(DotOperationTest, SquareMatrixDotF64) {
@@ -561,5 +583,95 @@ TEST_F(DotOperationTest, TransposeFolding) {
   }
 }
 
+TEST_F(DotOperationTest, DotOfConcatOptimizationWithConstLHS) {
+  auto prim_type = primitive_util::NativeToPrimitiveType<float>();
+
+  std::unique_ptr<Array2D<float>> constant_lhs_array(new Array2D<float>(
+      {{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, {6.0, 5.0, 4.0, 3.0, 2.0, 1.0}}));
+
+  ComputationBuilder builder(client_, TestName());
+  auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array);
+  auto rhs_arg_0 = builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {2, 2}),
+                                     "rhs_arg_0");
+  auto rhs_arg_1 = builder.Parameter(1, ShapeUtil::MakeShape(prim_type, {3, 2}),
+                                     "rhs_arg_1");
+  auto rhs_arg_2 = builder.Parameter(2, ShapeUtil::MakeShape(prim_type, {1, 2}),
+                                     "rhs_arg_2");
+  auto result = builder.Dot(
+      lhs_constant, builder.ConcatInDim({rhs_arg_0, rhs_arg_1, rhs_arg_2}, 0));
+
+  std::unique_ptr<Array2D<float>> arg_0_value_array(
+      new Array2D<float>({{1.0, 2.0}, {3.0, 4.0}}));
+  std::unique_ptr<Array2D<float>> arg_1_value_array(
+      new Array2D<float>({{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}));
+  std::unique_ptr<Array2D<float>> arg_2_value_array(
+      new Array2D<float>({{1.0, 2.0}}));
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto arg_0_value,
+      client_->TransferToServer(
+          *Literal::CreateR2FromArray2D<float>(*arg_0_value_array)));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto arg_1_value,
+      client_->TransferToServer(
+          *Literal::CreateR2FromArray2D<float>(*arg_1_value_array)));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto arg_2_value,
+      client_->TransferToServer(
+          *Literal::CreateR2FromArray2D<float>(*arg_2_value_array)));
+
+  Array2D<float> expected({{53.0, 74.0}, {45.0, 66.0}});
+  ComputeAndCompareR2<float>(
+      &builder, expected,
+      {arg_0_value.get(), arg_1_value.get(), arg_2_value.get()}, error_spec_);
+}
+
+TEST_F(DotOperationTest, DotOfConcatOptimizationWithConstRHS) {
+  auto prim_type = primitive_util::NativeToPrimitiveType<float>();
+
+  std::unique_ptr<Array2D<float>> constant_rhs_array(
+      new Array2D<float>({{1.0, 2.0},
+                          {3.0, 4.0},
+                          {5.0, 6.0},
+                          {6.0, 5.0},
+                          {4.0, 3.0},
+                          {2.0, 1.0}}));
+
+  ComputationBuilder builder(client_, TestName());
+  auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array);
+  auto lhs_arg_0 = builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {2, 2}),
+                                     "lhs_arg_0");
+  auto lhs_arg_1 = builder.Parameter(1, ShapeUtil::MakeShape(prim_type, {2, 3}),
+                                     "lhs_arg_1");
+  auto lhs_arg_2 = builder.Parameter(2, ShapeUtil::MakeShape(prim_type, {2, 1}),
+                                     "lhs_arg_2");
+  auto result = builder.Dot(
+      builder.ConcatInDim({lhs_arg_0, lhs_arg_1, lhs_arg_2}, 1), rhs_constant);
+
+  std::unique_ptr<Array2D<float>> arg_0_value_array(
+      new Array2D<float>({{1.0, 2.0}, {3.0, 4.0}}));
+  std::unique_ptr<Array2D<float>> arg_1_value_array(
+      new Array2D<float>({{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}));
+  std::unique_ptr<Array2D<float>> arg_2_value_array(
+      new Array2D<float>({{1.0}, {2.0}}));
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto arg_0_value,
+      client_->TransferToServer(
+          *Literal::CreateR2FromArray2D<float>(*arg_0_value_array)));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto arg_1_value,
+      client_->TransferToServer(
+          *Literal::CreateR2FromArray2D<float>(*arg_1_value_array)));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto arg_2_value,
+      client_->TransferToServer(
+          *Literal::CreateR2FromArray2D<float>(*arg_2_value_array)));
+
+  Array2D<float> expected({{38.0, 36.0}, {93.0, 91.0}});
+  ComputeAndCompareR2<float>(
+      &builder, expected,
+      {arg_0_value.get(), arg_1_value.get(), arg_2_value.get()}, error_spec_);
+}
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index 8baaf39e3cf8fa7f6fa4a0224c1297f82e0d92aa..59be32a8ff584a6189302a0835ba74b2e08956b1 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -559,20 +559,20 @@ void BM_DynamicSlice(int num_iters) {
   auto computation = builder.Build().ConsumeValueOrDie();
 
   // Initialize and transfer parameter buffer.
-  auto shape_size_fn = [client](const Shape& shape) {
-    return client->backend().transfer_manager()->GetByteSizeRequirement(shape);
-  };
-  auto buffer = ScopedShapedBuffer::Allocate(start_indices_shape, &allocator, 0,
-                                             shape_size_fn)
+  auto buffer = client->backend()
+                    .transfer_manager()
+                    ->AllocateScopedShapedBuffer(
+                        start_indices_shape, &allocator, /*device_ordinal=*/0)
                     .ConsumeValueOrDie();
 
   auto start_indices_literal = Literal::CreateR1<int32>({0, 1, 2, 3});
   ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice(
-      executors[device_ordinal], *start_indices_literal,
-      buffer->mutable_buffer({})));
+      executors[device_ordinal], *start_indices_literal, *buffer));
 
   std::unique_ptr<LocalExecutable> executable =
-      client->Compile(computation, {&buffer->shape()}, ExecutableBuildOptions())
+      client
+          ->Compile(computation, {&buffer->on_host_shape()},
+                    ExecutableBuildOptions())
           .ConsumeValueOrDie();
 
   // Run some warm-up executions.
diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc
index 2686afccc216095345dbb7b43e916fbbe7c8ea39..a292eab1d198fbf69c6dc81c780487ea46756f72 100644
--- a/tensorflow/compiler/xla/tests/fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/fusion_test.cc
@@ -816,7 +816,8 @@ void BM_ParallelFusion(int num_iters) {
   std::unique_ptr<LocalExecutable> executable =
       client
           ->Compile(computation,
-                    {&buffer0->shape(), &buffer1->shape(), &buffer2->shape()},
+                    {&buffer0->on_host_shape(), &buffer1->on_host_shape(),
+                     &buffer2->on_host_shape()},
                     ExecutableBuildOptions())
           .ConsumeValueOrDie();
 
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index d73c05ff92578209143e0679558848160cae99bd..a27e0f2c106c2ffa2ba108e1963e7111fd347482 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -15,13 +15,22 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 
+#include <memory>
 #include <set>
 #include <string>
 #include <utility>
 
+#include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_utils.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -30,44 +39,237 @@ namespace se = ::perftools::gputools;
 
 namespace xla {
 
+namespace {
+
+using tensorflow::StringPiece;
+using tensorflow::gtl::ArraySlice;
+using tensorflow::gtl::optional;
+
+constexpr char kInterpreter[] = "interpreter";
+
+// Helper functions to get test and reference platforms.
+se::Platform* GetReferencePlatform() {
+  auto result = PlatformUtil::GetPlatform(kInterpreter);
+  TF_CHECK_OK(result.status()) << "could not get interpreter platform";
+  return result.ValueOrDie();
+}
+
+se::Platform* GetTestPlatform() {
+  auto result = PlatformUtil::GetDefaultPlatform();
+  TF_CHECK_OK(result.status()) << "could not get test platform";
+  return result.ValueOrDie();
+}
+
+bool ProgramShapesEqual(const ProgramShape& lhs, const ProgramShape& rhs) {
+  if (lhs.parameters_size() != rhs.parameters_size()) {
+    return false;
+  }
+  for (int i = 0; i < lhs.parameters_size(); i++) {
+    if (!ShapeUtil::Equal(lhs.parameters(i), rhs.parameters(i))) {
+      return false;
+    }
+  }
+  return ShapeUtil::Equal(lhs.result(), rhs.result());
+}
+
+ProgramShape GetProgramShapeWithLayout(const HloModule& module) {
+  ProgramShape program_shape;
+  const auto* entry = module.entry_computation();
+  for (const auto* param : entry->parameter_instructions()) {
+    *program_shape.add_parameters() = param->shape();
+    *program_shape.add_parameter_names() = param->name();
+  }
+  *program_shape.mutable_result() = entry->root_instruction()->shape();
+  return program_shape;
+}
+
+}  // namespace
+
+HloTestBase::HloTestBase()
+    : HloTestBase(GetTestPlatform(), GetReferencePlatform()) {}
+
+HloTestBase::HloTestBase(se::Platform* test_platform,
+                         se::Platform* reference_platform)
+    : test_runner_(test_platform), reference_runner_(reference_platform) {
+  hlo_verifier_ = MakeUnique<HloVerifier>([this](const Shape& shape) {
+    return backend().transfer_manager()->GetByteSizeRequirement(shape);
+  });
+}
+
 /* static */
 std::unique_ptr<HloModule> HloTestBase::CreateNewModule() {
   HloModuleConfig config;
+  config.set_debug_options(GetDebugOptionsForTest());
+  return MakeUnique<HloModule>(TestName(), VersionedComputationHandle(),
+                               config);
+}
 
+/*static*/ DebugOptions HloTestBase::GetDebugOptionsForTest() {
   auto debug_options = legacy_flags::GetDebugOptionsFromFlags();
   // TODO(b/38354253): Change tests to use Parameters instead of Constants.
   debug_options.add_xla_disable_hlo_passes("constant_folding");
+  return debug_options;
+}
 
-  config.set_debug_options(debug_options);
-
-  return MakeUnique<HloModule>(TestName(), VersionedComputationHandle(),
-                               config);
+StatusOr<std::unique_ptr<Literal>> HloTestBase::Execute(
+    std::unique_ptr<HloModule> module,
+    tensorflow::gtl::ArraySlice<Literal*> arguments) {
+  return test_runner_.Execute(std::move(module), arguments);
 }
 
-StatusOr<perftools::gputools::DeviceMemoryBase> HloTestBase::Execute(
+std::unique_ptr<Literal> HloTestBase::ExecuteAndTransfer(
     std::unique_ptr<HloModule> module,
-    tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-        arguments,
-    Shape* result_shape) {
-  return runner_.Execute(std::move(module), arguments, result_shape);
+    tensorflow::gtl::ArraySlice<Literal*> arguments) {
+  return test_runner_.Execute(std::move(module), arguments).ValueOrDie();
 }
 
-se::DeviceMemoryBase HloTestBase::TransferToDevice(const Literal& literal) {
-  return runner_.TransferToDevice(literal).ValueOrDie();
+StatusOr<std::unique_ptr<HloModule>> HloTestBase::MakeReferenceModule(
+    const HloModule& test_module,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  std::unique_ptr<HloModule> reference_module = test_module.Clone();
+  const auto& program_shape = GetProgramShapeWithLayout(test_module);
+
+  if (reference_preprocessor != nullptr) {
+    reference_preprocessor(reference_module.get());
+    if (!ProgramShapesEqual(program_shape,
+                            GetProgramShapeWithLayout(*reference_module))) {
+      return InvalidArgument(
+          "reference preprocessor must not modify the program shape");
+    }
+  }
+  TF_RETURN_IF_ERROR(VerifyHloModule(*reference_runner_.backend().platform(),
+                                     reference_module.get()));
+  return std::move(reference_module);
 }
 
-std::unique_ptr<Literal> HloTestBase::TransferFromDevice(
-    const Shape& shape, se::DeviceMemoryBase device_base) {
-  return runner_.TransferFromDevice(shape, device_base).ValueOrDie();
+template <typename LiteralPtr>
+StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal(
+    std::unique_ptr<HloModule> module, const ArraySlice<LiteralPtr> arguments,
+    const optional<ErrorSpec>& error, bool run_hlo_passes,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  static_assert(
+      std::is_same<Literal*, LiteralPtr>::value ||
+          std::is_same<std::unique_ptr<Literal>, LiteralPtr>::value,
+      "The LiteralPtr type only accepts Literal* or std::unique_ptr<Literal>.");
+  TF_RETURN_IF_ERROR(
+      VerifyHloModule(*test_runner_.backend().platform(), module.get()));
+  TF_ASSIGN_OR_RETURN(auto reference_module,
+                      MakeReferenceModule(*module, reference_preprocessor));
+
+  // Execute on two backends.
+  TF_ASSIGN_OR_RETURN(
+      auto test,
+      test_runner_.Execute(std::move(module), arguments, run_hlo_passes));
+  TF_ASSIGN_OR_RETURN(auto reference,
+                      reference_runner_.Execute(std::move(reference_module),
+                                                arguments, run_hlo_passes));
+  return LiteralTestUtil::NearOrEqual(/*expected=*/*reference, /*actual=*/*test,
+                                      error);
 }
 
-std::unique_ptr<Literal> HloTestBase::ExecuteAndTransfer(
-    std::unique_ptr<HloModule> module,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments) {
-  return runner_.ExecuteAndTransfer(std::move(module), arguments).ValueOrDie();
+template <typename LiteralPtr>
+::testing::AssertionResult HloTestBase::RunAndCompare(
+    std::unique_ptr<HloModule> module, const ArraySlice<LiteralPtr> arguments,
+    const optional<ErrorSpec>& error,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  auto result =
+      RunAndCompareInternal(std::move(module), arguments, error,
+                            /*run_hlo_passes=*/true, reference_preprocessor);
+  if (!result.ok()) {
+    return ::testing::AssertionFailure() << result.status();
+  }
+  return result.ValueOrDie();
+}
+
+template <typename LiteralPtr>
+::testing::AssertionResult HloTestBase::RunAndCompareNoHloPasses(
+    std::unique_ptr<HloModule> module, const ArraySlice<LiteralPtr> arguments,
+    const optional<ErrorSpec>& error,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  auto result =
+      RunAndCompareInternal(std::move(module), arguments, error,
+                            /*run_hlo_passes=*/false, reference_preprocessor);
+  if (!result.ok()) {
+    return ::testing::AssertionFailure() << result.status();
+  }
+  return result.ValueOrDie();
+}
+
+::testing::AssertionResult HloTestBase::RunAndCompare(
+    std::unique_ptr<HloModule> module, const optional<ErrorSpec>& error,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  const auto& fake_arguments =
+      MakeFakeArguments(module.get()).ConsumeValueOrDie();
+  return RunAndCompare<std::unique_ptr<Literal>>(
+      std::move(module), fake_arguments, error, reference_preprocessor);
+}
+
+::testing::AssertionResult HloTestBase::RunAndCompareNoHloPasses(
+    std::unique_ptr<HloModule> module, const optional<ErrorSpec>& error,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  const auto& fake_arguments =
+      MakeFakeArguments(module.get()).ConsumeValueOrDie();
+  return RunAndCompareNoHloPasses<std::unique_ptr<Literal>>(
+      std::move(module), fake_arguments, error, reference_preprocessor);
+}
+
+::testing::AssertionResult HloTestBase::RunAndCompare(
+    const StringPiece hlo_string,
+    const tensorflow::gtl::optional<ErrorSpec>& error,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  auto module_or_status =
+      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest());
+  if (!module_or_status.ok()) {
+    return ::testing::AssertionFailure()
+           << "Error while parsing HLO text format: "
+           << module_or_status.status().ToString();
+  }
+  return RunAndCompare(module_or_status.ConsumeValueOrDie(), error,
+                       reference_preprocessor);
+}
+
+::testing::AssertionResult HloTestBase::RunAndCompareFromFile(
+    const string& filename, const tensorflow::gtl::optional<ErrorSpec>& error,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  auto module_or_status =
+      HloRunner::ReadModule(filename, GetDebugOptionsForTest());
+  if (!module_or_status.ok()) {
+    return ::testing::AssertionFailure()
+           << "failed reading hlo module from file";
+  }
+  return RunAndCompare(module_or_status.ConsumeValueOrDie(), error,
+                       reference_preprocessor);
+}
+
+::testing::AssertionResult HloTestBase::RunAndCompareNoHloPasses(
+    const StringPiece hlo_string,
+    const tensorflow::gtl::optional<ErrorSpec>& error,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  auto module_or_status =
+      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest());
+  if (!module_or_status.ok()) {
+    return ::testing::AssertionFailure()
+           << "Error while parsing HLO text format: "
+           << module_or_status.status().ToString();
+  }
+  return RunAndCompareNoHloPasses(module_or_status.ConsumeValueOrDie(), error,
+                                  reference_preprocessor);
+}
+
+::testing::AssertionResult HloTestBase::RunAndCompareNoHloPassesFromFile(
+    const string& filename, const tensorflow::gtl::optional<ErrorSpec>& error,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  auto module_or_status =
+      HloRunner::ReadModule(filename, GetDebugOptionsForTest());
+  if (!module_or_status.ok()) {
+    return ::testing::AssertionFailure()
+           << "failed reading hlo module from file";
+  }
+  return RunAndCompareNoHloPasses(module_or_status.ConsumeValueOrDie(), error,
+                                  reference_preprocessor);
 }
 
-Backend& HloTestBase::backend() { return runner_.backend(); }
+Backend& HloTestBase::backend() { return test_runner_.backend(); }
 
 /* static */
 string HloTestBase::TestName() {
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index 7f068dce36be3546298de2f06bf6d33446d07ca2..4aea9fc9fd027231106e529eb16bcd43f23fbe1c 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -24,52 +24,150 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_runner.h"
+#include "tensorflow/compiler/xla/service/hlo_verifier.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_layout.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
 
-// A base class for tests which build and run HLO code. This is a lower level of
-// abstraction than using the client interface and enables, for one, explicitly
-// building a graph of HLO instructions to run.
+// A base class for tests which build and/or run HLO code. The class includes
+// support for running an HLO module on two platforms and compare the results.
+// This is a lower level of abstraction than using the client interface and
+// enables, for one, explicitly building a graph of HLO instructions to run.
+//
+// This can also be used to write text/file-based test cases. Note that the test
+// target is responsible for linking the needed backends. A covenient way to do
+// this is to make it an xla_test: it will generate test targets linking with
+// the respective backends, which will be used as the test backend; the
+// interpreter backend is already linked with hlo_test_base so it will be the
+// default reference backend. For example, if you want to compare both cpu vs.
+// interpreter, and gpu vs. interpreter, you can:
+//
+//  xla_test (
+//    name = "sample_text_test",
+//    srcs = ["sample_text_test.cc"],
+//    backends = [
+//      "cpu",
+//      "gpu",
+//    ],
+//    deps = [
+//      "//third_party/tensorflow/compiler/xla/tests:hlo_test_base",
+//      ...
+//    ],
+//  )
+//
+// For a more detailed example, see "../tests/sample_text_test.cc".
 class HloTestBase : public ::testing::Test {
  protected:
-  HloTestBase() {}
+  // This uses the interpreter backend as the reference backend and
+  // automatically finds another supported backend as the test backend. If the
+  // interpreter is the only supported backend, it will be both the test backend
+  // and the reference backend.
+  HloTestBase();
+
+  // If your test doesn't use interpreter as the reference backend, you can use
+  // this constructor. Note that your test target is responsible for linking in
+  // both needed backends.
+  HloTestBase(::perftools::gputools::Platform* test_platform,
+              ::perftools::gputools::Platform* reference_platform);
 
   ~HloTestBase() override {}
 
   // Creates a new HLO module for a test. The module created will have
   // TestName() for its name; it will also automatically populate its debug
-  // options from command-line flags. It's recommended to use this method to
-  // create all HloModules for tests.
+  // options from command-line flags. If you want a fresh HloModule object and
+  // then add HloComputations to it, it's recommended to use this method in your
+  // tests.
   static std::unique_ptr<HloModule> CreateNewModule();
 
-  // Executes the given module and returns a global data handle.
-  StatusOr<perftools::gputools::DeviceMemoryBase> Execute(
+  // Populates debug options from command-line flags and adjusts the options for
+  // testing. It is recommended to use this when you need to pass in
+  // DebugOptions, e.g. when creating a module from a string or a file.
+  static DebugOptions GetDebugOptionsForTest();
+
+  // Executes the given module and return the result as a Literal.
+  StatusOr<std::unique_ptr<Literal>> Execute(
       std::unique_ptr<HloModule> module,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
-      Shape* result_shape);
+      tensorflow::gtl::ArraySlice<Literal*> arguments);
 
-  // Transfers the given literal to the device and returns the data handle.
-  perftools::gputools::DeviceMemoryBase TransferToDevice(
-      const Literal& literal);
+  std::unique_ptr<Literal> ExecuteAndTransfer(
+      std::unique_ptr<HloModule> module,
+      tensorflow::gtl::ArraySlice<Literal*> arguments);
+
+  // Executes the given hlo module on two backends and compares results.
+  //
+  // 'arguments': the input of the hlo module. The LiteralPtr type accepts
+  // Literal* or std::unique_ptr<Literal>.
+  //
+  // 'error': if has value, expects the results to be near (within the error
+  // bound). Otherwise, expects the results to be equal.
+  //
+  // 'reference_preprocessor': the module should be ready to run on the test
+  // backend, but it might need to be tailored so that it is able to run on the
+  // reference backend. Note that the program shape of the module must not be
+  // modified.
+  template <typename LiteralPtr>
+  ::testing::AssertionResult RunAndCompare(
+      std::unique_ptr<HloModule> module,
+      const tensorflow::gtl::ArraySlice<LiteralPtr> arguments,
+      const tensorflow::gtl::optional<ErrorSpec>& error,
+      const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
+      TF_MUST_USE_RESULT;
+
+  // Same as above, except that the module will be executed without Hlo
+  // optimization.
+  template <typename LiteralPtr>
+  ::testing::AssertionResult RunAndCompareNoHloPasses(
+      std::unique_ptr<HloModule> module,
+      const tensorflow::gtl::ArraySlice<LiteralPtr> arguments,
+      const tensorflow::gtl::optional<ErrorSpec>& error,
+      const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
+      TF_MUST_USE_RESULT;
 
-  // Transfers the array referred to by the given handle from the device and
-  // returns as a Literal.
-  std::unique_ptr<Literal> TransferFromDevice(
-      const Shape& shape, perftools::gputools::DeviceMemoryBase device_base);
+  // Executes an hlo module with fake inputs and compares the results.
+  ::testing::AssertionResult RunAndCompare(
+      std::unique_ptr<HloModule> module,
+      const tensorflow::gtl::optional<ErrorSpec>& error,
+      const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
+      TF_MUST_USE_RESULT;
 
-  // Executes the given module and return the result as a Literal.
-  std::unique_ptr<Literal> ExecuteAndTransfer(
+  // Same as above, except that the module will be executed without Hlo
+  // optimization.
+  ::testing::AssertionResult RunAndCompareNoHloPasses(
       std::unique_ptr<HloModule> module,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments);
+      const tensorflow::gtl::optional<ErrorSpec>& error,
+      const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
+      TF_MUST_USE_RESULT;
+
+  // Convenient wrappers for executing and comparing an hlo module with fake
+  // input. Module can be passed in directly, or parsed from an hlo_string,
+  // or loaded from a file.
+  ::testing::AssertionResult RunAndCompare(
+      const tensorflow::StringPiece hlo_string,
+      const tensorflow::gtl::optional<ErrorSpec>& error,
+      const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
+      TF_MUST_USE_RESULT;
+  ::testing::AssertionResult RunAndCompareFromFile(
+      const string& filename, const tensorflow::gtl::optional<ErrorSpec>& error,
+      const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
+      TF_MUST_USE_RESULT;
+  ::testing::AssertionResult RunAndCompareNoHloPasses(
+      const tensorflow::StringPiece hlo_string,
+      const tensorflow::gtl::optional<ErrorSpec>& error,
+      const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
+      TF_MUST_USE_RESULT;
+  ::testing::AssertionResult RunAndCompareNoHloPassesFromFile(
+      const string& filename, const tensorflow::gtl::optional<ErrorSpec>& error,
+      const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
+      TF_MUST_USE_RESULT;
 
   // Convenience method to force the layout of a given parameter in a module.
   // The layout of parameter number 'param_no' in the 'module' is set to
@@ -99,14 +197,38 @@ class HloTestBase : public ::testing::Test {
         ->Clear();
   }
 
+  // Return an HLO verifier constructed for the test backend.
+  HloVerifier& verifier() const { return *hlo_verifier_; }
+
   static string TestName();
 
-  // Returns the backend owned by the HloRunner.
+  // Returns the backend owned by the test runner.
   Backend& backend();
 
-  HloRunner runner_;
+  HloRunner test_runner_;
+  HloRunner reference_runner_;
+
+  std::unique_ptr<HloVerifier> hlo_verifier_;
 
   ErrorSpec error_spec_{0.0001};
+
+ private:
+  // Given the test module, makes a reference module that is ready to run on the
+  // reference platform. This assumes that the given module is ready to run on
+  // the test platform.
+  StatusOr<std::unique_ptr<HloModule>> MakeReferenceModule(
+      const HloModule& test_module,
+      const std::function<void(HloModule*)>& reference_preprocessor);
+
+  // Runs the module on two platforms with or without running hlo passes and
+  // compares the results. Returns whether the results are near or equal. If any
+  // error happens before the results are computed, returns the error status.
+  template <typename LiteralPtr>
+  StatusOr<::testing::AssertionResult> RunAndCompareInternal(
+      std::unique_ptr<HloModule> module,
+      const tensorflow::gtl::ArraySlice<LiteralPtr> arguments,
+      const tensorflow::gtl::optional<ErrorSpec>& error, bool run_hlo_passes,
+      const std::function<void(HloModule*)>& reference_preprocessor);
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/isolated_convolution.hlo b/tensorflow/compiler/xla/tests/isolated_convolution.hlo
new file mode 100644
index 0000000000000000000000000000000000000000..9452780930efbb1ecc13b35cd4ab53678d36c37f
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/isolated_convolution.hlo
@@ -0,0 +1,8 @@
+HloModule convolution.167:
+
+ENTRY %convolution.167 (parameter.0: f32[16,28,28,128], parameter.1: f32[3,3,128,128]) -> f32[16,28,28,128] {
+  %parameter.0 = f32[16,28,28,128]{3,0,2,1} parameter(0)
+  %parameter.1 = f32[3,3,128,128]{3,2,1,0} parameter(1)
+  ROOT %convolution.167 = f32[16,28,28,128]{3,0,2,1} convolution(f32[16,28,28,128]{3,0,2,1} %parameter.0, f32[3,3,128,128]{3,2,1,0} %parameter.1), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01oi->b01f
+}
+
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc
index 75c9a0d3fb5f11bbf051cd94250212faa30d3688..fb425fe6f3cfbb35d7824f3dd1b7d3a2f869313f 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util.cc
@@ -57,7 +57,8 @@ namespace xla {
     }
     for (int i = 0; i < expected.tuple_shapes_size(); ++i) {
       ::testing::AssertionResult result =
-          EqualShapes(expected.tuple_shapes(i), actual.tuple_shapes(i));
+          EqualShapes(expected.tuple_shapes(i), actual.tuple_shapes(i))
+          << "mismatch in tuple index " << i;
       if (!result) {
         return result;
       }
@@ -100,6 +101,58 @@ namespace xla {
   ASSERT_EQ(expected.ShortDebugString(), actual.ShortDebugString());
 }
 
+/* static */ std::unique_ptr<Literal> LiteralTestUtil::ConvertBF16ToF32(
+    const Literal& literal) {
+  if (ShapeUtil::IsTuple(literal.shape())) {
+    std::vector<std::unique_ptr<Literal>> converted_elements;
+    for (const auto& element : literal.tuple_literals()) {
+      converted_elements.push_back(ConvertBF16ToF32(element));
+    }
+    return Literal::MakeTupleOwned(std::move(converted_elements));
+  }
+
+  if (literal.shape().element_type() != BF16) {
+    return MakeUnique<Literal>(literal);
+  }
+  Shape converted_shape = literal.shape();
+  converted_shape.set_element_type(F32);
+  auto converted = Literal::CreateFromShape(converted_shape);
+  if (!ShapeUtil::HasZeroElements(converted_shape)) {
+    std::vector<int64> index(converted_shape.dimensions_size(), 0);
+    do {
+      converted->Set<float>(index,
+                            static_cast<float>(literal.Get<bfloat16>(index)));
+    } while (IndexUtil::BumpIndices(converted_shape, &index));
+  }
+  return converted;
+}
+
+/* static */ std::unique_ptr<Literal> LiteralTestUtil::ConvertF32ToBF16(
+    const Literal& literal) {
+  if (ShapeUtil::IsTuple(literal.shape())) {
+    std::vector<std::unique_ptr<Literal>> converted_elements;
+    for (const auto& element : literal.tuple_literals()) {
+      converted_elements.push_back(ConvertF32ToBF16(element));
+    }
+    return Literal::MakeTupleOwned(std::move(converted_elements));
+  }
+
+  if (literal.shape().element_type() != F32) {
+    return MakeUnique<Literal>(literal);
+  }
+  Shape converted_shape = literal.shape();
+  converted_shape.set_element_type(BF16);
+  auto converted = Literal::CreateFromShape(converted_shape);
+  if (!ShapeUtil::HasZeroElements(converted_shape)) {
+    std::vector<int64> index(converted_shape.dimensions_size(), 0);
+    do {
+      converted->Set<bfloat16>(
+          index, static_cast<bfloat16>(literal.Get<float>(index)));
+    } while (IndexUtil::BumpIndices(converted_shape, &index));
+  }
+  return converted;
+}
+
 namespace {
 
 string Hostname() {
@@ -281,23 +334,45 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
   return result;
 }
 
-/* static */ void LiteralTestUtil::ExpectEqualTuple(const Literal& expected,
-                                                    const Literal& actual) {
+/* static */ ::testing::AssertionResult LiteralTestUtil::EqualTuple(
+    const Literal& expected, const Literal& actual) {
   VLOG(1) << "expected: " << expected.ToString();
   VLOG(1) << "actual:   " << actual.ToString();
 
-  ASSERT_TRUE(ShapeUtil::IsTuple(expected.shape()));
-  ASSERT_TRUE(ShapeUtil::IsTuple(actual.shape()));
+  if (!ShapeUtil::IsTuple(expected.shape()) ||
+      !ShapeUtil::IsTuple(actual.shape())) {
+    return ::testing::AssertionFailure()
+           << "tuples expected shape = " << expected.shape().ShortDebugString()
+           << " actual shape = " << actual.shape().ShortDebugString();
+  }
   AssertEqualShapes(expected.shape(), actual.shape());
+
+  ::testing::AssertionResult err = ::testing::AssertionSuccess();
   for (uint64 i = 0; i < expected.tuple_literals_size(); ++i) {
+    SCOPED_TRACE(tensorflow::strings::StrCat(
+        "Tuple index ", i, " in ", ShapeUtil::HumanString(expected.shape())));
     const auto& expected_element = expected.tuple_literals(i);
     const auto& actual_element = actual.tuple_literals(i);
-    if (ShapeUtil::IsTuple(expected_element.shape())) {
-      ExpectEqualTuple(expected_element, actual_element);
-    } else {
-      ExpectEqual(expected_element, actual_element);
+
+    ::testing::AssertionResult res = [&] {
+      if (ShapeUtil::IsTuple(expected_element.shape())) {
+        return EqualTuple(expected_element, actual_element);
+      } else {
+        return Equal(expected_element, actual_element);
+      }
+    }();
+
+    if (!res && err) {
+      err = res;
     }
   }
+
+  return err;
+}
+
+/* static */ void LiteralTestUtil::ExpectEqualTuple(const Literal& expected,
+                                                    const Literal& actual) {
+  EXPECT_TRUE(EqualTuple(expected, actual));
 }
 
 namespace {
@@ -340,6 +415,9 @@ class NearComparator {
     multi_index_.resize(expected.shape().dimensions_size(), 0);
 
     switch (expected.shape().element_type()) {
+      case BF16:
+        ExpectLiteralsNear<bfloat16>(expected, actual, 0);
+        break;
       case F32:
         ExpectLiteralsNear<float>(expected, actual, 0);
         break;
@@ -525,6 +603,13 @@ void NearComparator::ExpectNear<complex64>(complex64 expected, complex64 actual,
       << message;
 }
 
+template <>
+bool NearComparator::ExpectValuesNear<bfloat16>(bfloat16 expected,
+                                                bfloat16 actual) {
+  return ExpectValuesNear(static_cast<float>(expected),
+                          static_cast<float>(actual));
+}
+
 }  // namespace
 
 /* static */ ::testing::AssertionResult LiteralTestUtil::Near(
@@ -553,33 +638,33 @@ void NearComparator::ExpectNear<complex64>(complex64 expected, complex64 actual,
   if (!ShapeUtil::IsTuple(expected.shape()) ||
       !ShapeUtil::IsTuple(actual.shape())) {
     return ::testing::AssertionFailure()
-           << "tuples expected expected shape = "
-           << expected.shape().ShortDebugString()
+           << "tuples expected shape = " << expected.shape().ShortDebugString()
            << " actual shape = " << actual.shape().ShortDebugString();
   }
   AssertEqualShapes(expected.shape(), actual.shape());
+
+  ::testing::AssertionResult err = ::testing::AssertionSuccess();
   for (uint64 i = 0; i < expected.tuple_literals_size(); ++i) {
+    SCOPED_TRACE(tensorflow::strings::StrCat(
+        "Tuple index ", i, " in ", ShapeUtil::HumanString(expected.shape())));
     const auto& expected_element = expected.tuple_literals(i);
     const auto& actual_element = actual.tuple_literals(i);
-    if (ShapeUtil::IsTuple(expected_element.shape())) {
-      auto ret = NearTuple(expected_element, actual_element, error);
-      if (!ret) {
-        return ret;
-      }
-    } else if (ShapeUtil::ElementIsFloating(expected_element.shape())) {
-      auto ret = Near(expected_element, actual_element, error);
-      if (!ret) {
-        return ret;
-      }
-    } else {
-      auto ret = Equal(expected_element, actual_element);
-      if (!ret) {
-        return ret;
+
+    ::testing::AssertionResult res = [&] {
+      if (ShapeUtil::IsTuple(expected_element.shape())) {
+        return NearTuple(expected_element, actual_element, error);
+      } else if (ShapeUtil::ElementIsFloating(expected_element.shape())) {
+        return Near(expected_element, actual_element, error);
+      } else {
+        return Equal(expected_element, actual_element);
       }
+    }();
+
+    if (err && !res) {
+      err = res;
     }
   }
-
-  return ::testing::AssertionSuccess();
+  return err;
 }
 
 /* static */ void LiteralTestUtil::ExpectNearTuple(const Literal& expected,
@@ -588,6 +673,32 @@ void NearComparator::ExpectNear<complex64>(complex64 expected, complex64 actual,
   EXPECT_TRUE(NearTuple(expected, actual, error));
 }
 
+/*static*/ ::testing::AssertionResult LiteralTestUtil::NearOrEqual(
+    const Literal& expected, const Literal& actual,
+    const tensorflow::gtl::optional<ErrorSpec>& error) {
+  bool is_tuple = ShapeUtil::IsTuple(expected.shape());
+  if (error.has_value()) {
+    if (is_tuple) {
+      VLOG(1) << "Expects near tuple";
+      return NearTuple(expected, actual, *error);
+    }
+    VLOG(1) << "Expects near";
+    return Near(expected, actual, *error);
+  }
+  if (is_tuple) {
+    VLOG(1) << "Expects equal tuple";
+    return EqualTuple(expected, actual);
+  }
+  VLOG(1) << "Expects equal";
+  return Equal(expected, actual);
+}
+
+/*static*/ void LiteralTestUtil::ExpectNearOrEqual(
+    const Literal& expected, const Literal& actual,
+    const tensorflow::gtl::optional<ErrorSpec>& error) {
+  EXPECT_TRUE(NearOrEqual(expected, actual, error));
+}
+
 /* static */ string LiteralTestUtil::MultiIndexAsString(
     tensorflow::gtl::ArraySlice<int64> multi_index) {
   return tensorflow::strings::StrCat(
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.h b/tensorflow/compiler/xla/tests/literal_test_util.h
index 467d44b857b74d2a38e9b3f8a32a9b1d39a4a10d..f53553c70170bdcda717e72ffd791016effd0774 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.h
+++ b/tensorflow/compiler/xla/tests/literal_test_util.h
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -59,6 +60,16 @@ class LiteralTestUtil {
   static void AssertEqualShapesAndLayouts(const Shape& expected,
                                           const Shape& actual);
 
+  // If the given literal's data type is bfloat16, converts it to a float
+  // literal; otherwise, returns a copy of it. If the literal is a tuple,
+  // recursively converts its elements.
+  static std::unique_ptr<Literal> ConvertBF16ToF32(const Literal& bf16_literal);
+
+  // If the given literal's data type is float, converts it to a bfloat16
+  // literal; otherwise, returns a copy of it. If the literal is a tuple,
+  // recursively converts its elements.
+  static std::unique_ptr<Literal> ConvertF32ToBF16(const Literal& f32_literal);
+
   // Asserts that the expected and actual literals are (bitwise) equal for all
   // elements in the literal. Also, asserts that the rank, dimensions sizes, and
   // primitive type are equal.
@@ -100,6 +111,10 @@ class LiteralTestUtil {
   static void ExpectR4EqualArray4D(const Array4D<NativeT>& expected,
                                    const Literal& actual);
 
+  // Returns whether the two tuples are equal.
+  static ::testing::AssertionResult EqualTuple(
+      const Literal& expected, const Literal& actual) TF_MUST_USE_RESULT;
+
   // Expects that the values of the elements in the expected and actual tuples
   // are equal. Tuples are matched recursively.
   static void ExpectEqualTuple(const Literal& expected, const Literal& actual);
@@ -167,6 +182,19 @@ class LiteralTestUtil {
   static void ExpectNearTuple(const Literal& expected, const Literal& actual,
                               const ErrorSpec& error);
 
+  // If the error spec is given, returns whether the expected and the actual are
+  // within the error bound; otherwise, returns whether they are equal. Tuples
+  // will be compared recursively.
+  static ::testing::AssertionResult NearOrEqual(
+      const Literal& expected, const Literal& actual,
+      const tensorflow::gtl::optional<ErrorSpec>& error) TF_MUST_USE_RESULT;
+
+  // If the error spec is given, expects the expected and the actual to be near;
+  // otherwise, expects them to be equal. Tuples will be compared recursively.
+  static void ExpectNearOrEqual(
+      const Literal& expected, const Literal& actual,
+      const tensorflow::gtl::optional<ErrorSpec>& error);
+
   // Returns a multi-dimensional index as a string. For example: '{7, 8}' will
   // be returned for a 2-dimensional index with dimension 0 index equal to 7,
   // dimension 1 equal to 8.
diff --git a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
index 62fab6a22434ba20f5d7c068d876188e0661e02e..b5b95967ff9162301a092f3a57996e0f3f78658f 100644
--- a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
+++ b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
@@ -73,8 +73,8 @@ class LLVMCompilerTest : public ::testing::Test {
     compiler->SetPostOptimizationHook(post_opt_hook);
 
     ASSERT_TRUE(compiler
-                    ->Compile(std::move(hlo_module),
-                              backend_->default_stream_executor())
+                    ->RunBackend(std::move(hlo_module),
+                                 backend_->default_stream_executor())
                     .ok());
 
     // Test that hooks were called.
diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
index fbf9739dbceec2b941101881fe28acb38a2003be..e3298e98c67969f97adfdf15d22dfe72592b56aa 100644
--- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
@@ -138,13 +138,13 @@ XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentInputLayouts) {
   // Create x as a col-major array.
   auto x_array = LiteralToShapedBuffer(*Literal::CreateR2WithLayout(
       {{1.0f, 2.0f}, {3.0f, 4.0f}}, LayoutUtil::MakeLayout({0, 1})));
-  EXPECT_TRUE(LayoutUtil::Equal(x_array->shape().layout(),
+  EXPECT_TRUE(LayoutUtil::Equal(x_array->on_device_shape().layout(),
                                 LayoutUtil::MakeLayout({0, 1})));
 
   // Create y as a row-major array.
   auto y_array = LiteralToShapedBuffer(*Literal::CreateR2WithLayout(
       {{10.0f, 20.0f}, {30.0f, 40.0f}}, LayoutUtil::MakeLayout({1, 0})));
-  EXPECT_TRUE(LayoutUtil::Equal(y_array->shape().layout(),
+  EXPECT_TRUE(LayoutUtil::Equal(y_array->on_device_shape().layout(),
                                 LayoutUtil::MakeLayout({1, 0})));
 
   std::unique_ptr<ScopedShapedBuffer> result_colmaj =
@@ -179,7 +179,7 @@ XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentOutputLayouts) {
       DefaultExecutableBuildOptions().set_result_layout(
           ShapeUtil::MakeShapeWithLayout(F32, /*dimensions=*/{2, 2}, {0, 1})),
       DefaultExecutableRunOptions());
-  EXPECT_TRUE(LayoutUtil::Equal(result_colmaj->shape().layout(),
+  EXPECT_TRUE(LayoutUtil::Equal(result_colmaj->on_device_shape().layout(),
                                 LayoutUtil::MakeLayout({0, 1})));
   LiteralTestUtil::ExpectR2Near<float>({{11.0f, 22.0f}, {33.0f, 44.0f}},
                                        *ShapedBufferToLiteral(*result_colmaj),
@@ -191,7 +191,7 @@ XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentOutputLayouts) {
       DefaultExecutableBuildOptions().set_result_layout(
           ShapeUtil::MakeShapeWithLayout(F32, /*dimensions=*/{2, 2}, {1, 0})),
       DefaultExecutableRunOptions());
-  EXPECT_TRUE(LayoutUtil::Equal(result_rowmaj->shape().layout(),
+  EXPECT_TRUE(LayoutUtil::Equal(result_rowmaj->on_device_shape().layout(),
                                 LayoutUtil::MakeLayout({1, 0})));
   LiteralTestUtil::ExpectR2Near<float>({{11.0f, 22.0f}, {33.0f, 44.0f}},
                                        *ShapedBufferToLiteral(*result_rowmaj),
@@ -213,8 +213,8 @@ XLA_TEST_F(LocalClientExecuteTest, TupleResult) {
   std::unique_ptr<ScopedShapedBuffer> result =
       ExecuteLocallyOrDie(computation, {x_array.get(), y_array.get()});
 
-  EXPECT_TRUE(ShapeUtil::IsTuple(result->shape()));
-  EXPECT_EQ(3, ShapeUtil::TupleElementCount(result->shape()));
+  EXPECT_TRUE(ShapeUtil::IsTuple(result->on_host_shape()));
+  EXPECT_EQ(3, ShapeUtil::TupleElementCount(result->on_host_shape()));
 
   std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
   LiteralTestUtil::ExpectR2Equal<float>({{1.0f, 2.0f}, {3.0f, 4.0f}},
@@ -241,8 +241,8 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleResult) {
   std::unique_ptr<ScopedShapedBuffer> result =
       ExecuteLocallyOrDie(computation, {x_array.get(), y_array.get()});
 
-  EXPECT_TRUE(ShapeUtil::IsTuple(result->shape()));
-  EXPECT_EQ(2, ShapeUtil::TupleElementCount(result->shape()));
+  EXPECT_TRUE(ShapeUtil::IsTuple(result->on_host_shape()));
+  EXPECT_EQ(2, ShapeUtil::TupleElementCount(result->on_host_shape()));
 
   std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
   LiteralTestUtil::ExpectR2Equal<float>({{1.0f, 2.0f}, {3.0f, 4.0f}},
@@ -320,8 +320,8 @@ XLA_TEST_F(LocalClientExecuteTest, TupleArguments) {
   std::unique_ptr<ScopedShapedBuffer> result =
       ExecuteLocallyOrDie(computation, {x_buffer.get(), y_buffer.get()});
 
-  EXPECT_TRUE(ShapeUtil::IsTuple(result->shape()));
-  EXPECT_EQ(2, ShapeUtil::TupleElementCount(result->shape()));
+  EXPECT_TRUE(ShapeUtil::IsTuple(result->on_host_shape()));
+  EXPECT_EQ(2, ShapeUtil::TupleElementCount(result->on_host_shape()));
 
   std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
   LiteralTestUtil::ExpectR2Equal<float>({{56.0f, 46.0f}, {36.0f, 26.0f}},
@@ -874,11 +874,13 @@ XLA_TEST_F(LocalClientExecuteTest,
           tensorflow::ThreadOptions(), "execute_thread",
           [&] { ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {}); }));
 
-  ASSERT_IS_OK(local_client_->TransferToInfeed(
-      *Literal::CreateR1<float>({-5.0, 123.0, 42.0})));
+  ASSERT_IS_OK(local_client_->TransferToInfeedLocal(
+      *Literal::CreateR1<float>({-5.0, 123.0, 42.0}),
+      local_client_->default_device_ordinal()));
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
-                          local_client_->TransferFromOutfeed(&shape));
+                          local_client_->TransferFromOutfeedLocal(
+                              shape, local_client_->default_device_ordinal()));
 
   LiteralTestUtil::ExpectR1Equal<float>({-4.0, 125.0, 45.0}, *result);
 }
@@ -904,20 +906,18 @@ void BM_LocalClientOverhead(int num_iters) {
   builder.Add(x, x);
   auto computation = builder.Build().ConsumeValueOrDie();
 
-  auto shape_size_fn = [client](const Shape& shape) {
-    return client->backend().transfer_manager()->GetByteSizeRequirement(shape);
-  };
-  auto buffer = ScopedShapedBuffer::Allocate(
-                    shape, &allocator, /*device_ordinal=*/0, shape_size_fn)
-                    .ConsumeValueOrDie();
+  auto buffer =
+      transfer_manager
+          ->AllocateScopedShapedBuffer(shape, &allocator, /*device_ordinal=*/0)
+          .ConsumeValueOrDie();
   auto literal = Literal::CreateR2<float>({{0, 0, 0}, {0, 0, 0}});
   ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice(
-      executors[device_ordinal], *literal, buffer->mutable_buffer({})));
+      executors[device_ordinal], *literal, *buffer));
 
   const int kWarmups = 2;
 
-  auto executable_status = client->Compile(computation, {&buffer->shape()},
-                                           ExecutableBuildOptions());
+  auto executable_status = client->Compile(
+      computation, {&buffer->on_host_shape()}, ExecutableBuildOptions());
   ASSERT_IS_OK(executable_status);
   std::unique_ptr<LocalExecutable> executable =
       executable_status.ConsumeValueOrDie();
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index 062a9246e49598d5d03dce8c1f437138923449bf..96b976d25d75d35f46adfd104a03aceb363661eb 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -188,7 +188,7 @@ LocalClientTestBase::ExecuteLocally(
     const ExecutableRunOptions& run_options) {
   std::vector<const Shape*> argument_layouts(arguments.size());
   for (int i = 0; i < arguments.size(); ++i) {
-    argument_layouts[i] = &arguments[i]->shape();
+    argument_layouts[i] = &arguments[i]->on_host_shape();
   }
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<LocalExecutable> executable,
diff --git a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
index 22d2b917a1d55f4f453e21c2d8fea38e32ff796b..62d24a11fdb164ed6776d1e83877cf3acd319cc6 100644
--- a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
@@ -76,8 +76,11 @@ class MultiOutputFusionTest : public HloTestBase {
         elem_shape2, HloOpcode::kAdd, broadcast, param1));
     HloInstruction* sub = builder.AddInstruction(HloInstruction::CreateBinary(
         elem_shape2, HloOpcode::kSubtract, param1, broadcast));
+    DotDimensionNumbers dot_dnums;
+    dot_dnums.add_lhs_contracting_dimensions(1);
+    dot_dnums.add_rhs_contracting_dimensions(0);
     HloInstruction* dot = builder.AddInstruction(
-        HloInstruction::CreateBinary(elem_shape2, HloOpcode::kDot, sub, add2));
+        HloInstruction::CreateDot(elem_shape2, sub, add2, dot_dnums));
     auto computation = hlo_module->AddEntryComputation(builder.Build(dot));
 
     if (manual_fusion) {
@@ -96,14 +99,13 @@ class MultiOutputFusionTest : public HloTestBase {
           nullptr);
     }
 
-    Literal input;
-    input.PopulateWithValue<float>(2.5f, {size, size});
-    auto p1 = TransferToDevice(input);
-    auto p0 = TransferToDevice(*Literal::CreateR0<float>(-9.0f));
+    Literal arg1;
+    arg1.PopulateWithValue<float>(2.5f, {size, size});
 
     Literal expect;
     expect.PopulateWithValue<float>(size * 1.5f * 3.5f, {size, size});
-    auto actual = ExecuteAndTransfer(std::move(hlo_module), {p0, p1});
+    auto actual = ExecuteAndTransfer(
+        std::move(hlo_module), {Literal::CreateR0<float>(-9.0f).get(), &arg1});
     LiteralTestUtil::ExpectNear(expect, *actual, error_spec_);
   }
 
@@ -133,8 +135,11 @@ class MultiOutputFusionTest : public HloTestBase {
     HloInstruction* reshape =
         builder.AddInstruction(HloInstruction::CreateReshape(
             ShapeUtil::MakeShape(F32, {size, 1}), add));
-    HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-        ShapeUtil::MakeShape(F32, {1}), HloOpcode::kDot, sub, reshape));
+    DotDimensionNumbers dot_dnums;
+    dot_dnums.add_lhs_contracting_dimensions(0);
+    dot_dnums.add_rhs_contracting_dimensions(0);
+    HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateDot(
+        ShapeUtil::MakeShape(F32, {1}), sub, reshape, dot_dnums));
     auto computation = hlo_module->AddEntryComputation(builder.Build(dot));
 
     if (manual_fusion) {
@@ -157,11 +162,9 @@ class MultiOutputFusionTest : public HloTestBase {
     Literal input0, input1;
     input0.PopulateWithValue<float>(2.5f, {size});
     input1.PopulateWithValue<double>(1, {size});
-    auto p0 = TransferToDevice(input0);
-    auto p1 = TransferToDevice(input1);
 
     Literal expect = *Literal::CreateR1<float>({size * 1.5f * 3.5f});
-    auto actual = ExecuteAndTransfer(std::move(hlo_module), {p0, p1});
+    auto actual = ExecuteAndTransfer(std::move(hlo_module), {&input0, &input1});
     LiteralTestUtil::ExpectNear(expect, *actual, error_spec_);
   }
 };
diff --git a/tensorflow/compiler/xla/tests/params_test.cc b/tensorflow/compiler/xla/tests/params_test.cc
index fda4389f479cdc7a659e4d7c8a2facba55e17e83..c260258d6e9af6dee6075c92cf35dac4ed46abed 100644
--- a/tensorflow/compiler/xla/tests/params_test.cc
+++ b/tensorflow/compiler/xla/tests/params_test.cc
@@ -252,8 +252,8 @@ XLA_TEST_F(ParamsTest, HundredLargeR1Parameters) {
 }
 
 // Only run the 3,000-parameter tests in opt mode to avoid test timeouts.
-// Timeout last observed on 2017-09-12.
-#ifndef NDEBUG
+// Timeout last observed on 2017-11-20.
+#ifdef NDEBUG
 
 // TODO(b/65525254) Fails on GPU on 2017-09-10 because we try to reserve too
 // much space in parameter memory for the kernel.
@@ -334,6 +334,106 @@ XLA_TEST_F(ParamsTest, DISABLED_ON_CPU(DISABLED_ON_GPU(
   ComputeAndCompareTuple(&builder, *Literal::MakeTuple(ptrs), param_data);
 }
 
+// Test large number of parameters flowing into a while-loop.
+// Construct conceptually the following HLO graph:
+//
+// p0 = parameter(0)
+// p1 = parameter(1)
+// ...
+// pN = parameter(N)
+// result = while (false) {
+//   p0 += (1, 1);
+//   p1 += (1, 1);
+//   ...
+//   pN += (1, 1)
+// }
+// result = {p0, p1, ..., pN}
+//
+// TODO(b/70173746): Times out during compilation on GPU and CPU backends as of
+// 2017-12-12.
+XLA_TEST_F(ParamsTest,
+           DISABLED_ON_CPU(DISABLED_ON_GPU(ManyParametersIntoWhileLoop))) {
+  ComputationBuilder builder(client_, TestName());
+
+  std::vector<std::unique_ptr<GlobalData>> param_data_owner;
+  constexpr int kParamCount = 1900;
+  std::vector<ComputationDataHandle> params;
+  std::vector<Shape> parameter_shapes;
+  for (int i = 0; i < kParamCount; ++i) {
+    std::unique_ptr<Literal> literal = Literal::CreateR1<int32>({i, i});
+    param_data_owner.push_back(
+        std::move(client_->TransferToServer(*literal)).ValueOrDie());
+    ComputationDataHandle param =
+        builder.Parameter(i, literal->shape(), "param");
+    params.push_back(param);
+    parameter_shapes.push_back(literal->shape());
+  }
+
+  // Add bool parameter for the loop condition. Use a parameter HLO instead of a
+  // constant because DCE may eliminate the while-body otherwise.
+  std::unique_ptr<Literal> bool_literal = Literal::CreateR0<bool>(false);
+  param_data_owner.push_back(
+      std::move(client_->TransferToServer(*bool_literal)).ValueOrDie());
+  ComputationDataHandle bool_param =
+      builder.Parameter(kParamCount, bool_literal->shape(), "bool_param");
+  params.push_back(bool_param);
+  parameter_shapes.push_back(bool_literal->shape());
+
+  auto init = builder.Tuple(params);
+
+  // Create a computation for the condition: while(bool_param).
+  Shape while_shape = ShapeUtil::MakeTupleShape(parameter_shapes);
+  Computation condition;
+  {
+    ComputationBuilder builder(client_, "condition");
+    auto condition_parameter =
+        builder.Parameter(0, while_shape, "condition_parameter");
+    builder.GetTupleElement(condition_parameter, kParamCount);
+    condition = builder.Build().ConsumeValueOrDie();
+  }
+
+  // Create a computation for the body.
+  // Add {1, 1} to the each tuple element.
+  Computation body;
+  {
+    ComputationBuilder builder(client_, "body");
+    auto body_parameter = builder.Parameter(0, while_shape, "body_parameter");
+    std::vector<ComputationDataHandle> updates;
+    for (int i = 0; i < kParamCount; ++i) {
+      auto add = builder.Add(builder.GetTupleElement(body_parameter, i),
+                             builder.ConstantR1<int32>({1, 1}));
+      updates.push_back(add);
+    }
+    // Add bool parameter.
+    updates.push_back(builder.GetTupleElement(body_parameter, kParamCount));
+
+    builder.Tuple(updates);
+    body = builder.Build().ConsumeValueOrDie();
+  }
+
+  auto loop = builder.While(condition, body, init);
+
+  std::vector<ComputationDataHandle> outputs;
+  for (int i = 0; i < kParamCount; ++i) {
+    outputs.push_back(builder.GetTupleElement(loop, i));
+  }
+  builder.Tuple(outputs);
+
+  std::vector<GlobalData*> param_data;
+  param_data.reserve(param_data_owner.size());
+  for (const std::unique_ptr<GlobalData>& data : param_data_owner) {
+    param_data.push_back(data.get());
+  }
+
+  std::vector<std::unique_ptr<Literal>> elements;
+  std::vector<const Literal*> ptrs;
+  for (int i = 0; i < kParamCount; ++i) {
+    elements.push_back(Literal::CreateR1<int32>({i, i}));
+    ptrs.push_back(elements.back().get());
+  }
+  ComputeAndCompareTuple(&builder, *Literal::MakeTuple(ptrs), param_data);
+}
+
 #endif
 
 XLA_TEST_F(ParamsTest,
diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc
index 7bc3185c367f076c9a7d211c9799557e1a91d92f..b09ccdd679b6c8f628e40f78f58dbd1734926af6 100644
--- a/tensorflow/compiler/xla/tests/reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_test.cc
@@ -352,15 +352,13 @@ XLA_TEST_F(ReduceTest, ReduceR2_111x50_01_To_R1) {
 XLA_TEST_F(ReduceTest, ReduceR2_1024x1024_To_R1) { RunR2ToR1Test(1024, 1024); }
 XLA_TEST_F(ReduceTest, ReduceR2_1000x1500_To_R1) { RunR2ToR1Test(1000, 1500); }
 
-// TODO(b/34969189): Invalid CAS generated on GPU.
-XLA_TEST_F(ReduceTest, DISABLED_ON_GPU(AndReduceAllOnesR1_10_Pred)) {
+XLA_TEST_F(ReduceTest, AndReduceAllOnesR1_10_Pred) {
   constexpr int element_count = 10;
   std::vector<int> input(element_count, 1);
   RunR1ToR0PredTest(/*and_reduce=*/true, input);
 }
 
-// TODO(b/34969189): Invalid CAS generated on GPU.
-XLA_TEST_F(ReduceTest, DISABLED_ON_GPU(AndReduceOnesAndZerosR1_10_Pred)) {
+XLA_TEST_F(ReduceTest, AndReduceOnesAndZerosR1_10_Pred) {
   constexpr int element_count = 10;
   std::vector<int> input(element_count);
   for (int i = 0; i < element_count; ++i) {
@@ -369,15 +367,13 @@ XLA_TEST_F(ReduceTest, DISABLED_ON_GPU(AndReduceOnesAndZerosR1_10_Pred)) {
   RunR1ToR0PredTest(/*and_reduce=*/true, input);
 }
 
-// TODO(b/34969189): Invalid CAS generated on GPU.
-XLA_TEST_F(ReduceTest, DISABLED_ON_GPU(OrReduceAllOnesR1_10_Pred)) {
+XLA_TEST_F(ReduceTest, OrReduceAllOnesR1_10_Pred) {
   constexpr int element_count = 10;
   std::vector<int> input(element_count, 1);
   RunR1ToR0PredTest(/*and_reduce=*/false, input);
 }
 
-// TODO(b/34969189): Invalid CAS generated on GPU.
-XLA_TEST_F(ReduceTest, DISABLED_ON_GPU(OrReduceOnesAndZerosR1_10_Pred)) {
+XLA_TEST_F(ReduceTest, OrReduceOnesAndZerosR1_10_Pred) {
   constexpr int element_count = 10;
   std::vector<int> input(element_count);
   for (int i = 0; i < element_count; ++i) {
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index 6c9b62b48d8bb2ad93b2ce98839e5e52d8eaa8cc..bf81514bc900792d6c687a6044b83e91920ed8bb 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -41,16 +41,40 @@ limitations under the License.
 namespace xla {
 namespace {
 
-class ReduceWindowTest : public ClientLibraryTestBase {
+#ifdef XLA_BACKEND_SUPPORTS_BFLOAT16
+// Tests both F32 and BF16.
+static std::array<bool, 2> use_bfloat16_params{false, true};
+#else
+// Only tests F32.
+static std::array<bool, 1> use_bfloat16_params{false};
+#endif
+
+class ReduceWindowTestBase : public ClientLibraryTestBase {
  public:
-  ReduceWindowTest() : builder_(client_, TestName()) {}
+  ErrorSpec DefaultErrorSpec() const {
+    if (use_bfloat16()) {
+      return ErrorSpec(1e-1, 5e-2);
+    } else {
+      return ErrorSpec(1e-3, 1e-3);
+    }
+  }
+};
+
+class ReduceWindowTest : public ::testing::WithParamInterface<bool>,
+                         public ReduceWindowTestBase {
+ public:
+  ReduceWindowTest() : builder_(client_, TestName()) {
+    set_use_bfloat16(GetParam());
+  }
 
   void ReduceWindowAdd(const ComputationDataHandle& input,
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
-    builder_.ReduceWindow(input, builder_.ConstantR0<float>(0.0f),
-                          CreateScalarAddComputation(F32, &builder_),
+    auto init =
+        CreateConstantFromLiteral(*Literal::CreateR0<float>(0.0f), &builder_);
+    builder_.ReduceWindow(input, init,
+                          CreateScalarAddComputation(FloatType(), &builder_),
                           window_dimensions, window_strides, padding);
   }
 
@@ -58,30 +82,32 @@ class ReduceWindowTest : public ClientLibraryTestBase {
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
-    builder_.ReduceWindow(
-        input, builder_.ConstantLiteral(Literal::MinValue(F32)),
-        CreateScalarMax(), window_dimensions, window_strides, padding);
+    auto init = CreateConstantFromLiteral(Literal::MinValue(F32), &builder_);
+    builder_.ReduceWindow(input, init, CreateScalarMax(), window_dimensions,
+                          window_strides, padding);
   }
 
   void ReduceWindowMin(const ComputationDataHandle& input,
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
-    builder_.ReduceWindow(input,
-                          builder_.ConstantLiteral(Literal::MaxValue(F32)),
-                          CreateScalarMinComputation(F32, &builder_),
+    auto init = CreateConstantFromLiteral(Literal::MaxValue(F32), &builder_);
+    builder_.ReduceWindow(input, init,
+                          CreateScalarMinComputation(FloatType(), &builder_),
                           window_dimensions, window_strides, padding);
   }
 
   ComputationBuilder builder_;
 };
 
-TEST_F(ReduceWindowTest, MismatchedRanksGivesErrorStatus) {
-  const auto input = builder_.ConstantR1<float>({1, 1, 1, 1});
-  const auto init_value = builder_.ConstantR0<float>(0);
+TEST_P(ReduceWindowTest, MismatchedRanksGivesErrorStatus) {
+  const auto input = CreateConstantFromLiteral(
+      *Literal::CreateR1<float>({1, 1, 1, 1}), &builder_);
+  const auto init_value =
+      CreateConstantFromLiteral(*Literal::CreateR0<float>(0), &builder_);
   TF_ASSERT_OK(builder_.first_error());
   builder_.ReduceWindow(input, init_value,
-                        CreateScalarAddComputation(F32, &builder_),
+                        CreateScalarAddComputation(FloatType(), &builder_),
                         /*window_dimensions=*/{1, 2},
                         /*window_strides=*/{1}, Padding::kValid);
   ASSERT_EQ(builder_.first_error().code(), tensorflow::error::INVALID_ARGUMENT)
@@ -90,79 +116,97 @@ TEST_F(ReduceWindowTest, MismatchedRanksGivesErrorStatus) {
               ::testing::HasSubstr("Want input dimensions size"));
 }
 
-TEST_F(ReduceWindowTest, Min3In5Stride2) {
-  const auto input = builder_.ConstantR1<float>({10000, 1000, 100, 10, 1});
+// Regression test for b/68964348.
+TEST_P(ReduceWindowTest, R0ReduceWindow) {
+  const auto input =
+      CreateConstantFromLiteral(*Literal::CreateR0<float>(42.0), &builder_);
+  const auto init =
+      CreateConstantFromLiteral(*Literal::CreateR0<float>(1.0), &builder_);
+  builder_.ReduceWindow(input, init,
+                        CreateScalarAddComputation(FloatType(), &builder_),
+                        /*window_dimensions=*/{},
+                        /*window_strides=*/{}, Padding::kSame);
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateR0<float>(43.0), {},
+                           ErrorSpec(0.00001));
+}
+
+TEST_P(ReduceWindowTest, Min3In5Stride2) {
+  const auto input = CreateConstantFromLiteral(
+      *Literal::CreateR1<float>({10000, 1000, 100, 10, 1}), &builder_);
   ReduceWindowMin(input, {3}, {2}, Padding::kValid);
-  ComputeAndCompareR1<float>(&builder_, {100, 1}, {}, ErrorSpec(0.0001));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateR1<float>({100, 1}), {},
+                           ErrorSpec(0.00001));
 }
 
-XLA_TEST_F(ReduceWindowTest, ZeroElementSmall) {
+XLA_TEST_P(ReduceWindowTest, ZeroElementSmall) {
   Array4D<float> input_array(1, 0, 2, 1);
-
-  const auto input = builder_.ConstantR4FromArray4D<float>(input_array);
+  const auto input = CreateConstantFromArray(input_array, &builder_);
   Padding padding = Padding::kSame;
   ReduceWindowAdd(input, {1, 1, 2, 1}, {1, 1, 1, 1}, padding);
 
   auto res = ReferenceUtil::ReduceWindow4DAdd(input_array, 0.0f, {1, 1, 2, 1},
                                               {1, 1, 1, 1}, padding);
 
-  ComputeAndCompareR4<float>(&builder_, *res, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res), {},
+                           DefaultErrorSpec());
 }
 
-TEST_F(ReduceWindowTest, NonSquareSmall) {
+TEST_P(ReduceWindowTest, NonSquareSmall) {
   Array4D<float> input_array(1, 2, 2, 1);
-  input_array.FillRandom(2.f);
+  input_array.FillRandom(2.f, 2.f);
+  const auto input = CreateConstantFromArray(input_array, &builder_);
 
-  const auto input = builder_.ConstantR4FromArray4D<float>(input_array);
   Padding padding = Padding::kSame;
   ReduceWindowAdd(input, {1, 1, 2, 1}, {1, 1, 1, 1}, padding);
 
   auto res = ReferenceUtil::ReduceWindow4DAdd(input_array, 0.0f, {1, 1, 2, 1},
                                               {1, 1, 1, 1}, padding);
 
-  ComputeAndCompareR4<float>(&builder_, *res, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res), {},
+                           DefaultErrorSpec());
 }
 
-TEST_F(ReduceWindowTest, MiddleDimsSmall) {
+TEST_P(ReduceWindowTest, MiddleDimsSmall) {
   Array4D<float> input_array(1, 3, 3, 1);
-  input_array.FillRandom(2.f);
-
-  const auto input = builder_.ConstantR4FromArray4D<float>(input_array);
+  input_array.FillRandom(2.f, 2.f);
+  const auto input = CreateConstantFromArray(input_array, &builder_);
   Padding padding = Padding::kSame;
   ReduceWindowAdd(input, {1, 1, 1, 1}, {1, 2, 2, 1}, padding);
 
   auto res = ReferenceUtil::ReduceWindow4DAdd(input_array, 0.0f, {1, 1, 1, 1},
                                               {1, 2, 2, 1}, padding);
 
-  ComputeAndCompareR4<float>(&builder_, *res, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res), {},
+                           DefaultErrorSpec());
 }
 
-TEST_F(ReduceWindowTest, Along2ndMinorDim) {
+TEST_P(ReduceWindowTest, Along2ndMinorDim) {
   Array4D<float> input_array(3, 6, 7, 32);
-  input_array.FillRandom(2.f);
+  input_array.FillRandom(2.f, 2.f);
+  const auto input = CreateConstantFromArray(input_array, &builder_);
 
   // The parameters of this reduction mimic feature norm (e.g. LRN).
   int lrn_diameter = 7;  // diameter = 2*radius + 1 --> must be odd
-  const auto input = builder_.ConstantR4FromArray4D<float>(input_array);
   Padding padding = Padding::kSame;
   ReduceWindowAdd(input, {1, 1, lrn_diameter, 1}, {1, 1, 1, 1}, padding);
 
   auto res = ReferenceUtil::ReduceWindow4DAdd(
       input_array, 0.0f, {1, 1, lrn_diameter, 1}, {1, 1, 1, 1}, padding);
 
-  ComputeAndCompareR4<float>(&builder_, *res, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res), {},
+                           DefaultErrorSpec());
 }
 
-TEST_F(ReduceWindowTest, AmongMajor2Dims) {
+TEST_P(ReduceWindowTest, AmongMajor2Dims) {
   Array4D<float> input_array(4, 4, 6, 8);
   input_array.FillWithMinorDimNum();
+  const auto input_data_handle =
+      CreateConstantFromArray(input_array, &builder_);
 
   int win_len = 3;
   int win_stride = 1;
 
   Padding padding = Padding::kSame;
-  const auto input_data_handle =
-      builder_.ConstantR4FromArray4D<float>(input_array);
   // Reduce only along the x and y dimensions, according to the win_len.
   ReduceWindowAdd(input_data_handle, {win_len, win_len, 1, 1},
                   {win_stride, win_stride, 1, 1}, padding);
@@ -170,18 +214,20 @@ TEST_F(ReduceWindowTest, AmongMajor2Dims) {
   auto result = ReferenceUtil::ReduceWindow4DAdd(
       input_array, 0.0f, {win_len, win_len, 1, 1},
       {win_stride, win_stride, 1, 1}, padding);
-  ComputeAndCompareR4<float>(&builder_, *result, {}, ErrorSpec(1e-3, 1e-3));
+
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*result), {},
+                           DefaultErrorSpec());
 }
 
-TEST_F(ReduceWindowTest, AmongMajor2DimsMediumSize) {
+TEST_P(ReduceWindowTest, AmongMajor2DimsMediumSize) {
   Array4D<float> input_array(9, 12, 4, 89);
-  input_array.FillRandom(2.0f);
+  input_array.FillRandom(2.f, 2.f);
 
   int win_len = 3;
   int win_stride = 2;
 
   const auto input_data_handle =
-      builder_.ConstantR4FromArray4D<float>(input_array);
+      CreateConstantFromArray(input_array, &builder_);
 
   Padding padding = Padding::kSame;
   // Reduce only along the x and y dimensions, according to the win_len.
@@ -192,56 +238,28 @@ TEST_F(ReduceWindowTest, AmongMajor2DimsMediumSize) {
       input_array, 0.0f, {win_len, win_len, 1, 1},
       {win_stride, win_stride, 1, 1}, padding);
 
-  ComputeAndCompareR4<float>(&builder_, *result, {}, ErrorSpec(1e-3, 1e-3));
-}
-
-// TODO(b/32173947): Test support for arbitrary-sized padding.
-TEST_F(ReduceWindowTest, DISABLED_AmongMajor2DimsMediumSizeLargePadding) {
-  Array4D<float> input_array(9, 12, 4, 89);  // simulate Dim0IsMinor layout
-  input_array.FillRandom(2.0f);
-
-  int64 rank = 4;
-  int win_len = 3;
-  int win_stride = 2;
-
-  const auto input_data_handle =
-      builder_.ConstantR4FromArray4D<float>(input_array);
-
-  Padding padding = Padding::kSame;
-  // Reduce only along the x and y dimensions, according to the win_len.
-  // Create padding vector with large padding values in the reduction dims.
-  std::vector<std::pair<int64, int64>> low_high_padding;
-  low_high_padding.resize(rank, {4, 4});
-
-  builder_.ReduceWindowWithGeneralPadding(
-      input_data_handle, builder_.ConstantR0<float>(0.0f),
-      CreateScalarAddComputation(F32, &builder_), {win_len, win_len, 1, 1},
-      {win_stride, win_stride, 1, 1}, low_high_padding);
-
-  auto result = ReferenceUtil::ReduceWindow4DAdd(
-      input_array, 0.0f, {win_len, win_len, 1, 1},
-      {win_stride, win_stride, 1, 1}, padding);
-
-  ComputeAndCompareR4<float>(&builder_, *result, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*result), {},
+                           DefaultErrorSpec());
 }
 
-XLA_TEST_F(ReduceWindowTest, Add1x1x2In2x1x2) {
+XLA_TEST_P(ReduceWindowTest, Add1x1x2In2x1x2) {
   Array3D<float> input_array(2, 1, 2);
   input_array(0, 0, 0) = 1000;
   input_array(0, 0, 1) = 100;
   input_array(1, 0, 0) = 10;
   input_array(1, 0, 1) = 1;
-  auto input = builder_.ConstantR3FromArray3D<float>(input_array);
+  const auto input = CreateConstantFromArray(input_array, &builder_);
 
   ReduceWindowAdd(input, {1, 1, 2}, {1, 1, 1}, Padding::kValid);
 
   Array3D<float> expected(2, 1, 1);
   expected(0, 0, 0) = 1100;
   expected(1, 0, 0) = 11;
-  ComputeAndCompareR3<float>(&builder_, expected, {}, ErrorSpec(0.0001));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(expected), {},
+                           DefaultErrorSpec());
 }
 
-XLA_TEST_F(ReduceWindowTest, Add1x1x2In2x1x3Stride1x1x2) {
+XLA_TEST_P(ReduceWindowTest, Add1x1x2In2x1x3Stride1x1x2) {
   Array3D<float> input_array(2, 1, 3);
   input_array(0, 0, 0) = 100;
   input_array(0, 0, 1) = 10;
@@ -249,17 +267,18 @@ XLA_TEST_F(ReduceWindowTest, Add1x1x2In2x1x3Stride1x1x2) {
   input_array(1, 0, 0) = 500;
   input_array(1, 0, 1) = 50;
   input_array(1, 0, 2) = 5;
-  auto input = builder_.ConstantR3FromArray3D<float>(input_array);
+  const auto input = CreateConstantFromArray(input_array, &builder_);
 
   ReduceWindowAdd(input, {1, 1, 2}, {1, 1, 2}, Padding::kValid);
 
   Array3D<float> expected(2, 1, 1);
   expected(0, 0, 0) = 110;
   expected(1, 0, 0) = 550;
-  ComputeAndCompareR3<float>(&builder_, expected, {}, ErrorSpec(0.0001));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(expected), {},
+                           DefaultErrorSpec());
 }
 
-XLA_TEST_F(ReduceWindowTest, Add1x1x2In2x1x3SamePad) {
+XLA_TEST_P(ReduceWindowTest, Add1x1x2In2x1x3SamePad) {
   Array3D<float> input_array(2, 1, 3);
   input_array(0, 0, 0) = 100;
   input_array(0, 0, 1) = 10;
@@ -267,7 +286,7 @@ XLA_TEST_F(ReduceWindowTest, Add1x1x2In2x1x3SamePad) {
   input_array(1, 0, 0) = 500;
   input_array(1, 0, 1) = 50;
   input_array(1, 0, 2) = 5;
-  auto input = builder_.ConstantR3FromArray3D<float>(input_array);
+  const auto input = CreateConstantFromArray(input_array, &builder_);
 
   ReduceWindowAdd(input, {1, 1, 2}, {1, 1, 1}, Padding::kSame);
 
@@ -278,30 +297,34 @@ XLA_TEST_F(ReduceWindowTest, Add1x1x2In2x1x3SamePad) {
   expected(1, 0, 0) = 550;
   expected(1, 0, 1) = 55;
   expected(1, 0, 2) = 5;
-  ComputeAndCompareR3<float>(&builder_, expected, {}, ErrorSpec(0.0001));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(expected), {},
+                           DefaultErrorSpec());
 }
 
 // Tests a reduction function that is not a simple add/min/max/etc.
-XLA_TEST_F(ReduceWindowTest, NonstandardReduceFunction) {
+XLA_TEST_P(ReduceWindowTest, NonstandardReduceFunction) {
   Array4D<float> input_array(1, 2, 2, 1);
   input_array(0, 0, 0, 0) = 1;
   input_array(0, 0, 1, 0) = 2;
   input_array(0, 1, 0, 0) = 3;
   input_array(0, 1, 1, 0) = 4;
+  const auto input = CreateConstantFromArray(input_array, &builder_);
 
-  const auto input = builder_.ConstantR4FromArray4D<float>(input_array);
   Padding padding = Padding::kValid;
-
-  const Shape scalar = ShapeUtil::MakeShape(F32, {});
+  const Shape scalar = ShapeUtil::MakeShape(FloatType(), {});
   auto b = builder_.CreateSubBuilder("unusual");
   auto lhs = b->Parameter(0, scalar, "lhs");
   auto rhs = b->Parameter(1, scalar, "rhs");
-  b->Min(b->Add(lhs, rhs), b->ConstantR0<float>(8.0f));
+  b->Min(b->Add(lhs, rhs),
+         CreateConstantFromLiteral(*Literal::CreateR0<float>(8.0f), b.get()));
   Computation reduce_fn = b->BuildAndNoteError();
 
-  builder_.ReduceWindow(input, builder_.ConstantR0<float>(3.0f), reduce_fn,
-                        /*window_dimensions=*/{1, 1, 2, 1},
-                        /*window_strides=*/{1, 1, 1, 1}, padding);
+  builder_.ReduceWindow(
+      input,
+      CreateConstantFromLiteral(*Literal::CreateR0<float>(3.0f), &builder_),
+      reduce_fn,
+      /*window_dimensions=*/{1, 1, 2, 1},
+      /*window_strides=*/{1, 1, 1, 1}, padding);
 
   const auto reduce_func = [](float arg1, float arg2) {
     return std::min<float>(arg1 + arg2, 8.0f);
@@ -312,17 +335,19 @@ XLA_TEST_F(ReduceWindowTest, NonstandardReduceFunction) {
                                            /*window=*/{1, 1, 2, 1},
                                            /*stride=*/{1, 1, 1, 1}, padding);
 
-  ComputeAndCompareR4<float>(&builder_, *expected, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*expected), {},
+                           DefaultErrorSpec());
 }
 
-TEST_F(ReduceWindowTest, R4UnitWindow) {
+TEST_P(ReduceWindowTest, R4UnitWindow) {
   Array4D<float> input_array(13, 12, 8, 15);
-  input_array.Fill(1.0f);
+  input_array.FillRandom(2.f, 2.f);
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({0, 3, 2, 1}));
-  ComputationDataHandle input =
-      builder_.Parameter(0, input_literal->shape(), "operand");
+  ComputationDataHandle input;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "parameter", &builder_, &input);
 
   Padding padding = Padding::kSame;
   ReduceWindowAdd(input, {1, 1, 7, 1}, {1, 4, 1, 1}, padding);
@@ -330,15 +355,11 @@ TEST_F(ReduceWindowTest, R4UnitWindow) {
   auto res = ReferenceUtil::ReduceWindow4DAdd(input_array, 0.0f, {1, 1, 7, 1},
                                               {1, 4, 1, 1}, padding);
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_data,
-                          client_->TransferToServer(*input_literal));
-  ComputeAndCompareR4<float>(&builder_, *res, {input_data.get()},
-                             ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res),
+                           {input_data.get()}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(HloTestBase, R6AddMultipleStrides) {
-  auto b = HloComputation::Builder(TestName());
-
+XLA_TEST_P(ReduceWindowTest, R6AddMultipleStrides) {
   std::vector<int64> input_dims(6, 8);
   auto shape = ShapeUtil::MakeShape(F32, input_dims);
 
@@ -348,56 +369,15 @@ XLA_TEST_F(HloTestBase, R6AddMultipleStrides) {
   };
   TF_EXPECT_OK(arg_literal->Populate<float>(generator));
 
-  auto input =
-      b.AddInstruction(HloInstruction::CreateConstant(std::move(arg_literal)));
-
-  auto init_value = b.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.f)));
-
-  HloComputation::Builder add_computation("add");
-  Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
-  auto param_lhs = add_computation.AddInstruction(
-      HloInstruction::CreateParameter(0, scalar_shape, "lhs"));
-  auto param_rhs = add_computation.AddInstruction(
-      HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
-  add_computation.AddInstruction(HloInstruction::CreateBinary(
-      scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs));
-
-  auto module = CreateNewModule();
-  auto add_func = module->AddEmbeddedComputation(add_computation.Build());
-
-  WindowDimension trivial_dim;
-  trivial_dim.set_size(1);
-  trivial_dim.set_stride(1);
-  trivial_dim.set_padding_low(0);
-  trivial_dim.set_padding_high(0);
-  trivial_dim.set_window_dilation(1);
-  trivial_dim.set_base_dilation(1);
-
-  WindowDimension active_dim;
-  active_dim.set_size(3);
-  active_dim.set_stride(1);
-  active_dim.set_padding_low(0);
-  active_dim.set_padding_high(0);
-  active_dim.set_window_dilation(1);
-  active_dim.set_base_dilation(1);
-
-  Window window;
-  *window.add_dimensions() = active_dim;
-  *window.add_dimensions() = trivial_dim;
-  *window.add_dimensions() = active_dim;
-  *window.add_dimensions() = active_dim;
-  *window.add_dimensions() = trivial_dim;
-  *window.add_dimensions() = trivial_dim;
-
-  // Non-monotonic output layout with minor dims trivial.
+  const auto input = CreateConstantFromLiteral(*arg_literal, &builder_);
+
+  Padding padding = Padding::kValid;
+  ReduceWindowAdd(input, {3, 1, 3, 3, 1, 1}, {1, 1, 1, 1, 1, 1}, padding);
+
   std::vector<int64> output_layout = {1, 5, 3, 2, 0, 4};
   std::vector<int64> output_dims = {6, 8, 6, 6, 8, 8};
   Shape result_shape =
       ShapeUtil::MakeShapeWithLayout(F32, output_dims, output_layout);
-  b.AddInstruction(HloInstruction::CreateReduceWindow(
-      result_shape, input, init_value, window, add_func));
-
   std::unique_ptr<Literal> expected = Literal::CreateFromShape(result_shape);
   auto out_generator =
       [&](tensorflow::gtl::ArraySlice<int64> indexes) -> float {
@@ -405,82 +385,37 @@ XLA_TEST_F(HloTestBase, R6AddMultipleStrides) {
   };
   TF_EXPECT_OK(expected->Populate<float>(out_generator));
 
-  module->AddEntryComputation(b.Build());
-  auto actual = ExecuteAndTransfer(std::move(module), {});
-
-  LiteralTestUtil::ExpectNear(*actual, *expected, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *expected, {}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(HloTestBase, R6Add) {
-  auto b = HloComputation::Builder(TestName());
-
+XLA_TEST_P(ReduceWindowTest, R6Add) {
   std::vector<int64> input_dims(6, 8);
+  auto shape = ShapeUtil::MakeShape(F32, input_dims);
+
   std::unique_ptr<Literal> arg_literal =
       Literal::CreateFullWithMonotonicDim0MajorLayout<float>(input_dims, 1.0f);
-  auto input =
-      b.AddInstruction(HloInstruction::CreateConstant(std::move(arg_literal)));
-
-  auto init_value = b.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.f)));
-
-  HloComputation::Builder add_computation("add");
-  Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
-  auto param_lhs = add_computation.AddInstruction(
-      HloInstruction::CreateParameter(0, scalar_shape, "lhs"));
-  auto param_rhs = add_computation.AddInstruction(
-      HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
-  add_computation.AddInstruction(HloInstruction::CreateBinary(
-      scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs));
-
-  auto module = CreateNewModule();
-  auto add_func = module->AddEmbeddedComputation(add_computation.Build());
-
-  WindowDimension trivial_dim;
-  trivial_dim.set_size(1);
-  trivial_dim.set_stride(1);
-  trivial_dim.set_padding_low(0);
-  trivial_dim.set_padding_high(0);
-  trivial_dim.set_window_dilation(1);
-  trivial_dim.set_base_dilation(1);
-
-  WindowDimension active_dim;
-  active_dim.set_size(3);
-  active_dim.set_stride(1);
-  active_dim.set_padding_low(0);
-  active_dim.set_padding_high(0);
-  active_dim.set_window_dilation(1);
-  active_dim.set_base_dilation(1);
-
-  Window window;
-  *window.add_dimensions() = trivial_dim;
-  *window.add_dimensions() = trivial_dim;
-  *window.add_dimensions() = active_dim;
-  *window.add_dimensions() = active_dim;
-  *window.add_dimensions() = trivial_dim;
-  *window.add_dimensions() = trivial_dim;
-
-  Shape shape = ShapeUtil::MakeShape(F32, {8, 8, 6, 6, 8, 8});
-  b.AddInstruction(HloInstruction::CreateReduceWindow(shape, input, init_value,
-                                                      window, add_func));
+
+  const auto input = CreateConstantFromLiteral(*arg_literal, &builder_);
+
+  Padding padding = Padding::kValid;
+  ReduceWindowAdd(input, {1, 1, 3, 3, 1, 1}, {1, 1, 1, 1, 1, 1}, padding);
 
   std::vector<int64> output_dims = {8, 8, 6, 6, 8, 8};
   std::unique_ptr<Literal> expected =
       Literal::CreateFullWithMonotonicDim0MajorLayout<float>(output_dims, 9.0f);
 
-  module->AddEntryComputation(b.Build());
-  auto actual = ExecuteAndTransfer(std::move(module), {});
-
-  LiteralTestUtil::ExpectNear(*actual, *expected, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *expected, {}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(ReduceWindowTest, R4SecondMinorStride) {
+XLA_TEST_P(ReduceWindowTest, R4SecondMinorStride) {
   Array4D<float> input_array(2, 1, 27, 119);
   input_array.FillRandom(2.0f);
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle input =
-      builder_.Parameter(0, input_literal->shape(), "operand");
+  ComputationDataHandle input;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "parameter", &builder_, &input);
 
   int win_len = 1;
   int stride = 8;
@@ -490,20 +425,19 @@ XLA_TEST_F(ReduceWindowTest, R4SecondMinorStride) {
   auto res = ReferenceUtil::ReduceWindow4DAdd(
       input_array, 0.0f, {1, 1, win_len, 1}, {1, 1, stride, 1}, padding);
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_data,
-                          client_->TransferToServer(*input_literal));
-  ComputeAndCompareR4<float>(&builder_, *res, {input_data.get()},
-                             ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res),
+                           {input_data.get()}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(ReduceWindowTest, R4SecondMinorUnitStride) {
+XLA_TEST_P(ReduceWindowTest, R4SecondMinorUnitStride) {
   Array4D<float> input_array(3, 2, 4, 64);
   input_array.FillRandom(2.0f);
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle input =
-      builder_.Parameter(0, input_literal->shape(), "operand");
+  ComputationDataHandle input;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "parameter", &builder_, &input);
 
   int win_len = 3;
   int stride = 1;
@@ -513,20 +447,19 @@ XLA_TEST_F(ReduceWindowTest, R4SecondMinorUnitStride) {
   auto res = ReferenceUtil::ReduceWindow4DAdd(
       input_array, 0.0f, {1, 1, win_len, 1}, {1, 1, stride, 1}, padding);
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_data,
-                          client_->TransferToServer(*input_literal));
-  ComputeAndCompareR4<float>(&builder_, *res, {input_data.get()},
-                             ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res),
+                           {input_data.get()}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(ReduceWindowTest, R4SecondMinorWin) {
+XLA_TEST_P(ReduceWindowTest, R4SecondMinorWin) {
   Array4D<float> input_array(1, 3, 12, 200);
   input_array.FillRandom(2.0f);
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle input =
-      builder_.Parameter(0, input_literal->shape(), "operand");
+  ComputationDataHandle input;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "parameter", &builder_, &input);
 
   int win_len = 8;
   int stride = 5;
@@ -536,13 +469,11 @@ XLA_TEST_F(ReduceWindowTest, R4SecondMinorWin) {
   auto res = ReferenceUtil::ReduceWindow4DAdd(
       input_array, 0.0f, {1, 1, win_len, 1}, {1, 1, stride, 1}, padding);
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_data,
-                          client_->TransferToServer(*input_literal));
-  ComputeAndCompareR4<float>(&builder_, *res, {input_data.get()},
-                             ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res),
+                           {input_data.get()}, DefaultErrorSpec());
 }
 
-TEST_F(ReduceWindowTest, AmongMajor2DimsMultipleMinor) {
+TEST_P(ReduceWindowTest, AmongMajor2DimsMultipleMinor) {
   Array4D<float> input_array(6, 4, 10, 130);
   input_array.FillRandom(2.0f);
 
@@ -551,7 +482,7 @@ TEST_F(ReduceWindowTest, AmongMajor2DimsMultipleMinor) {
 
   Padding padding = Padding::kSame;
   const auto input_data_handle =
-      builder_.ConstantR4FromArray4D<float>(input_array);
+      CreateConstantFromArray(input_array, &builder_);
   // Reduce only along the x and y dimensions, according to the win_len.
   ReduceWindowAdd(input_data_handle, {win_len, win_len, 1, 1},
                   {win_stride, win_stride, 1, 1}, padding);
@@ -559,36 +490,42 @@ TEST_F(ReduceWindowTest, AmongMajor2DimsMultipleMinor) {
   auto result = ReferenceUtil::ReduceWindow4DAdd(
       input_array, 0.0f, {win_len, win_len, 1, 1},
       {win_stride, win_stride, 1, 1}, padding);
-  ComputeAndCompareR4<float>(&builder_, *result, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*result), {},
+                           DefaultErrorSpec());
 }
 
-XLA_TEST_F(ReduceWindowTest, Add24In1152_NoOverlap) {
+XLA_TEST_P(ReduceWindowTest, Add24In1152_NoOverlap) {
   std::vector<float> input_vector(128 * 9, 1);
-  const auto input = builder_.ConstantR1<float>(input_vector);
+  const auto input = CreateConstantFromLiteral(
+      *Literal::CreateR1<float>(input_vector), &builder_);
   ReduceWindowAdd(input, {32}, {128}, Padding::kValid);
-  ComputeAndCompareR1<float>(&builder_, {32, 32, 32, 32, 32, 32, 32, 32, 32},
-                             {}, ErrorSpec(0.0001));
+  ComputeAndCompareLiteral(
+      &builder_,
+      *Literal::CreateR1<float>({32, 32, 32, 32, 32, 32, 32, 32, 32}), {},
+      DefaultErrorSpec());
 }
 
-XLA_TEST_F(ReduceWindowTest, Add128In128Stride128) {
-  const auto input = builder_.ConstantR1<float>(
-      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+XLA_TEST_P(ReduceWindowTest, Add128In128Stride128) {
+  std::vector<float> input_vector{
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+  const auto input = CreateConstantFromLiteral(
+      *Literal::CreateR1<float>(input_vector), &builder_);
   ReduceWindowAdd(input, {128}, {128}, Padding::kValid);
-  ComputeAndCompareR1<float>(&builder_, {1088}, {}, ErrorSpec(0.0001));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateR1<float>({1088}), {},
+                           DefaultErrorSpec());
 }
 
 // Regression test for a bug that appeared in Inception (b/34784899).
-TEST_F(ReduceWindowTest, R2ReduceWindowInceptionFromBroadcast) {
+TEST_P(ReduceWindowTest, R2ReduceWindowInceptionFromBroadcast) {
   Array2D<float> input_array(14, 14, 1.0f);
-  ComputationDataHandle input =
-      builder_.Broadcast(builder_.ConstantLiteral(Literal::One(F32)), {14, 14});
+  const auto input = CreateConstantFromArray(input_array, &builder_);
 
   int win_len = 3;
   int stride = 1;
@@ -598,13 +535,14 @@ TEST_F(ReduceWindowTest, R2ReduceWindowInceptionFromBroadcast) {
   auto res = ReferenceUtil::ReduceWindow2DAdd(
       input_array, 0.0f, {win_len, win_len}, {stride, stride}, padding);
 
-  ComputeAndCompareR2<float>(&builder_, *res, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray<float>(*res),
+                           {}, DefaultErrorSpec());
 }
 
-TEST_F(ReduceWindowTest, R2ReduceWindowNonOverlappingFromBroadcast) {
+TEST_P(ReduceWindowTest, R2ReduceWindowNonOverlappingFromBroadcast) {
   Array2D<float> input_array(6, 4, 1.0f);
-  ComputationDataHandle input =
-      builder_.Broadcast(builder_.ConstantLiteral(Literal::One(F32)), {6, 4});
+  ComputationDataHandle input = builder_.Broadcast(
+      CreateConstantFromLiteral(Literal::One(F32), &builder_), {6, 4});
 
   Padding padding = Padding::kSame;
   ReduceWindowAdd(input, {4, 2}, {3, 3}, padding);
@@ -612,9 +550,13 @@ TEST_F(ReduceWindowTest, R2ReduceWindowNonOverlappingFromBroadcast) {
   auto res = ReferenceUtil::ReduceWindow2DAdd(input_array, 0.0f, {4, 2}, {3, 3},
                                               padding);
 
-  ComputeAndCompareR2<float>(&builder_, *res, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray<float>(*res),
+                           {}, DefaultErrorSpec());
 }
 
+INSTANTIATE_TEST_CASE_P(ReduceWindowTestInstance, ReduceWindowTest,
+                        ::testing::ValuesIn(use_bfloat16_params));
+
 enum Reducer { kAdd, kMax };
 
 struct R4ReduceWindowTestData {
@@ -628,30 +570,36 @@ struct R4ReduceWindowTestData {
 };
 
 string R4ReduceWindowTestDataToString(
-    const ::testing::TestParamInfo<R4ReduceWindowTestData>& data) {
+    const ::testing::TestParamInfo<
+        ::testing::tuple<R4ReduceWindowTestData, bool>>& data) {
+  const auto& param = ::testing::get<0>(data.param);
   string str = tensorflow::strings::StrCat(
-      "base_bounds_",
-      tensorflow::str_util::Join(data.param.base_bounds, "x"),  //
+      "base_bounds_", tensorflow::str_util::Join(param.base_bounds, "x"),  //
       "__window_bounds_",
-      tensorflow::str_util::Join(data.param.window_bounds, "x"),            //
-      "__strides_", tensorflow::str_util::Join(data.param.strides, "x"),    //
-      "__pad_low_", tensorflow::str_util::Join(data.param.pad_low, "x"),    //
-      "__pad_high_", tensorflow::str_util::Join(data.param.pad_high, "x"),  //
-      (data.param.reducer == kAdd) ? "add" : "max");
-  CHECK(data.param.reducer == kAdd || data.param.reducer == kMax);
+      tensorflow::str_util::Join(param.window_bounds, "x"),            //
+      "__strides_", tensorflow::str_util::Join(param.strides, "x"),    //
+      "__pad_low_", tensorflow::str_util::Join(param.pad_low, "x"),    //
+      "__pad_high_", tensorflow::str_util::Join(param.pad_high, "x"),  //
+      (param.reducer == kAdd) ? "add" : "max");
+  CHECK(param.reducer == kAdd || param.reducer == kMax);
 
   // Test names are not allowed to contain the '-' character.
   std::replace(str.begin(), str.end(), '-', 'n');
+  if (::testing::get<1>(data.param)) {
+    str = tensorflow::strings::StrCat(str, "_bfloat16");
+  }
   return str;
 }
 
-class R4ReduceWindowTest
-    : public ClientLibraryTestBase,
-      public ::testing::WithParamInterface<R4ReduceWindowTestData> {
+class R4ReduceWindowTest : public ReduceWindowTestBase,
+                           public ::testing::WithParamInterface<
+                               ::testing::tuple<R4ReduceWindowTestData, bool>> {
  protected:
+  R4ReduceWindowTest() { set_use_bfloat16(::testing::get<1>(GetParam())); }
+
   void DoIt() {
     ComputationBuilder b(client_, TestName());
-    const auto& param = GetParam();
+    const auto& param = ::testing::get<0>(GetParam());
 
     const float kInitValue = 0.0f;
 
@@ -660,23 +608,24 @@ class R4ReduceWindowTest
     input.FillIota(1);
     std::unique_ptr<Literal> input_literal =
         Literal::CreateR4FromArray4D(input);
-    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_arg,
-                            client_->TransferToServer(*input_literal));
+    ComputationDataHandle parameter;
+    auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
+                                                       &b, &parameter);
 
     std::vector<std::pair<int64, int64>> padding(4);
     for (int i = 0; i < 4; ++i) {
       padding[i] = {param.pad_low[i], param.pad_high[i]};
     }
 
-    auto parameter = b.Parameter(0, input_literal->shape(), "p0");
-    auto pad_value = b.ConstantR0<float>(kInitValue);
+    auto init_value =
+        CreateConstantFromLiteral(*Literal::CreateR0(kInitValue), &b);
     CHECK(param.reducer == kAdd || param.reducer == kMax);
     auto computation = param.reducer == kAdd
-                           ? CreateScalarAddComputation(F32, &b)
-                           : CreateScalarMaxComputation(F32, &b);
+                           ? CreateScalarAddComputation(FloatType(), &b)
+                           : CreateScalarMaxComputation(FloatType(), &b);
     b.ReduceWindowWithGeneralPadding(
         /*operand=*/parameter,
-        /*init_value=*/pad_value,
+        /*init_value=*/init_value,
         /*computation=*/computation,
         /*window_dimensions=*/param.window_bounds,
         /*window_strides=*/param.strides,
@@ -694,8 +643,8 @@ class R4ReduceWindowTest
             /*window=*/param.window_bounds,
             /*stride=*/param.strides,
             /*padding=*/padding);
-    ComputeAndCompareR4<float>(&b, *expected, {input_arg.get()},
-                               ErrorSpec(1e-3, 1e-3));
+    ComputeAndCompareLiteral(&b, *Literal::CreateFromArray(*expected),
+                             {input_arg.get()}, DefaultErrorSpec());
   }
 };
 
@@ -711,6 +660,14 @@ const R4ReduceWindowTestData kR4ReduceWindowTestValues[] = {
                            /*pad_high=*/{0, 0, 0, 0},
                            /*reducer=*/kAdd},
 
+    // Arbitrary padding (not kSame or kValid).
+    R4ReduceWindowTestData{/*base_bounds=*/{9, 12, 4, 89},
+                           /*window_bounds=*/{3, 3, 1, 1},
+                           /*strides=*/{2, 2, 1, 1},
+                           /*pad_low=*/{4, 4, 0, 0},
+                           /*pad_high=*/{4, 4, 0, 0},
+                           /*reducer=*/kAdd},
+
     // Zero base bound edge case.
     R4ReduceWindowTestData{/*base_bounds=*/{1, 0, 1, 1},
                            /*window_bounds=*/{1, 1, 1, 1},
@@ -824,9 +781,11 @@ const R4ReduceWindowTestData kR4ReduceWindowTestValues[] = {
                            /*reducer=*/kAdd},
 };
 
-INSTANTIATE_TEST_CASE_P(R4ReduceWindowTestInstantiation, R4ReduceWindowTest,
-                        ::testing::ValuesIn(kR4ReduceWindowTestValues),
-                        R4ReduceWindowTestDataToString);
+INSTANTIATE_TEST_CASE_P(
+    R4ReduceWindowTestInstantiation, R4ReduceWindowTest,
+    ::testing::Combine(::testing::ValuesIn(kR4ReduceWindowTestValues),
+                       ::testing::ValuesIn(use_bfloat16_params)),
+    R4ReduceWindowTestDataToString);
 
 class R4ReduceWindowLargeTest : public R4ReduceWindowTest {};
 
@@ -849,10 +808,11 @@ const R4ReduceWindowTestData kR4ReduceWindowLargeTestValues[] = {
                            /*reducer=*/kAdd},
 };
 
-INSTANTIATE_TEST_CASE_P(R4ReduceWindowLargeTestInstantiation,
-                        R4ReduceWindowLargeTest,
-                        ::testing::ValuesIn(kR4ReduceWindowLargeTestValues),
-                        R4ReduceWindowTestDataToString);
+INSTANTIATE_TEST_CASE_P(
+    R4ReduceWindowLargeTestInstantiation, R4ReduceWindowLargeTest,
+    ::testing::Combine(::testing::ValuesIn(kR4ReduceWindowLargeTestValues),
+                       ::testing::ValuesIn(use_bfloat16_params)),
+    R4ReduceWindowTestDataToString);
 
 struct R2ReduceWindowTestData {
   int64 base_bounds[2];
@@ -900,26 +860,33 @@ struct R2ReduceWindowTestData {
 };
 
 string R2ReduceWindowTestDataToString(
-    const ::testing::TestParamInfo<R2ReduceWindowTestData>& data) {
+    const ::testing::TestParamInfo<
+        ::testing::tuple<R2ReduceWindowTestData, bool>>& data) {
+  const auto& param = ::testing::get<0>(data.param);
   string str = tensorflow::strings::StrCat(
-      "base_bounds_",
-      tensorflow::str_util::Join(data.param.base_bounds, "x"),  //
+      "base_bounds_", tensorflow::str_util::Join(param.base_bounds, "x"),  //
       "__window_bounds_",
-      tensorflow::str_util::Join(data.param.window_bounds, "x"),              //
-      "__strides_", tensorflow::str_util::Join(data.param.strides, "x"),      //
-      "__padding_", data.param.padding == Padding::kSame ? "same" : "valid",  //
-      "__layout_", data.param.layout[0], "_", data.param.layout[1],           //
-      "__reducer_", data.param.reducer == kAdd ? "add" : "max");
+      tensorflow::str_util::Join(param.window_bounds, "x"),              //
+      "__strides_", tensorflow::str_util::Join(param.strides, "x"),      //
+      "__padding_", param.padding == Padding::kSame ? "same" : "valid",  //
+      "__layout_", param.layout[0], "_", param.layout[1],                //
+      "__reducer_", param.reducer == kAdd ? "add" : "max");
+  if (::testing::get<1>(data.param)) {
+    str = tensorflow::strings::StrCat(str, "_bfloat16");
+  }
   return str;
 }
 
-class R2ReduceWindowTest
-    : public ClientLibraryTestBase,
-      public ::testing::WithParamInterface<R2ReduceWindowTestData> {};
+class R2ReduceWindowTest : public ReduceWindowTestBase,
+                           public ::testing::WithParamInterface<
+                               ::testing::tuple<R2ReduceWindowTestData, bool>> {
+ protected:
+  R2ReduceWindowTest() { set_use_bfloat16(::testing::get<1>(GetParam())); }
+};
 
 TEST_P(R2ReduceWindowTest, Add) {
   ComputationBuilder b(client_, TestName());
-  const auto& param = GetParam();
+  const auto& param = ::testing::get<0>(GetParam());
   CHECK(param.reducer == kAdd);
 
   const float kInitValue = 0.0f;
@@ -927,12 +894,15 @@ TEST_P(R2ReduceWindowTest, Add) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR2FromArray2DWithLayout(
           input, LayoutUtil::MakeLayout(param.layout));
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_arg,
-                          client_->TransferToServer(*input_literal));
-  b.ReduceWindow(/*operand=*/
-                 b.Parameter(0, input_literal->shape(), "p0"),
-                 /*init_value=*/b.ConstantR0<float>(kInitValue),
-                 /*computation=*/CreateScalarAddComputation(F32, &b),
+
+  ComputationDataHandle parameter;
+  auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
+                                                     &b, &parameter);
+  auto init_value =
+      CreateConstantFromLiteral(*Literal::CreateR0(kInitValue), &b);
+  b.ReduceWindow(/*operand=*/parameter,
+                 /*init_value=*/init_value,
+                 /*computation=*/CreateScalarAddComputation(FloatType(), &b),
                  /*window_dimensions=*/param.window_bounds,
                  /*window_strides=*/param.strides, /*padding=*/param.padding);
 
@@ -940,90 +910,145 @@ TEST_P(R2ReduceWindowTest, Add) {
       /*operand=*/input, /*init=*/kInitValue, /*window=*/param.window_bounds,
       /*stride=*/param.strides, /*padding=*/param.padding);
 
-  ComputeAndCompareR2<float>(&b, *expected, {input_arg.get()},
-                             ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&b, *Literal::CreateFromArray(*expected),
+                           {input_arg.get()}, DefaultErrorSpec());
 }
 
-INSTANTIATE_TEST_CASE_P(R2ReduceWindowTestInstantiation, R2ReduceWindowTest,
-                        ::testing::ValuesIn(kR2TestCases),
-                        R2ReduceWindowTestDataToString);
+INSTANTIATE_TEST_CASE_P(
+    R2ReduceWindowTestInstantiation, R2ReduceWindowTest,
+    ::testing::Combine(::testing::ValuesIn(kR2TestCases),
+                       ::testing::ValuesIn(use_bfloat16_params)),
+    R2ReduceWindowTestDataToString);
 
 struct R1ReduceWindowTestData {
   int64 base_bounds[1];
   int64 window_bounds[1];
   int64 strides[1];
-  Padding padding;
+  int64 pad_low[1];
+  int64 pad_high[1];
   Reducer reducer;
 } kR1TestCases[] = {
     {/*base_bounds=*/{1}, /*window_bounds=*/{1},
      /*strides=*/{1},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+     /*pad_low=*/{xla::MakePadding({1}, {1}, {1}, Padding::kValid)[0].first},
+     /*pad_high=*/{xla::MakePadding({1}, {1}, {1}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kAdd},
 
     {/*base_bounds=*/{3}, /*window_bounds=*/{3},
      /*strides=*/{1},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+     /*pad_low=*/{xla::MakePadding({3}, {3}, {1}, Padding::kValid)[0].first},
+     /*pad_high=*/{xla::MakePadding({3}, {3}, {1}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kAdd},
 
     {/*base_bounds=*/{3}, /*window_bounds=*/{2},
      /*strides=*/{1},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+     /*pad_low=*/{xla::MakePadding({3}, {2}, {1}, Padding::kValid)[0].first},
+     /*pad_high=*/{xla::MakePadding({3}, {2}, {1}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kAdd},
 
     {/*base_bounds=*/{5}, /*window_bounds=*/{1},
      /*strides=*/{1},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kMax},
+     /*pad_low=*/{xla::MakePadding({5}, {1}, {1}, Padding::kValid)[0].first},
+     /*pad_high=*/{xla::MakePadding({5}, {1}, {1}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kMax},
 
     {/*base_bounds=*/{16}, /*window_bounds=*/{4},
      /*strides=*/{4},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kMax},
+     /*pad_low=*/{xla::MakePadding({16}, {4}, {4}, Padding::kValid)[0].first},
+     /*pad_high=*/{xla::MakePadding({16}, {4}, {4}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kMax},
 
     {/*base_bounds=*/{16}, /*window_bounds=*/{4},
      /*strides=*/{3},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+     /*pad_low=*/{xla::MakePadding({16}, {4}, {3}, Padding::kValid)[0].first},
+     /*pad_high=*/{xla::MakePadding({16}, {4}, {3}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kAdd},
 
-    {/*base_bounds=*/{128 * 2}, /*window_bounds=*/{30},
+    {/*base_bounds=*/{128 * 2},
+     /*window_bounds=*/{30},
      /*strides=*/{27},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
-
-    {/*base_bounds=*/{128 * 17}, /*window_bounds=*/{7},
+     /*pad_low=*/
+     {xla::MakePadding({128 * 2}, {30}, {27}, Padding::kValid)[0].first},
+     /*pad_high=*/
+     {xla::MakePadding({128 * 2}, {30}, {27}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kAdd},
+
+    {/*base_bounds=*/{128 * 17},
+     /*window_bounds=*/{7},
      /*strides=*/{64},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
-
-    {/*base_bounds=*/{128 * 2}, /*window_bounds=*/{32},
+     /*pad_low=*/
+     {xla::MakePadding({128 * 17}, {7}, {64}, Padding::kValid)[0].first},
+     /*pad_high=*/
+     {xla::MakePadding({128 * 17}, {7}, {64}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kAdd},
+
+    {/*base_bounds=*/{128 * 2},
+     /*window_bounds=*/{32},
      /*strides=*/{56},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+     /*pad_low=*/
+     {xla::MakePadding({128 * 2}, {32}, {56}, Padding::kValid)[0].first},
+     /*pad_high=*/
+     {xla::MakePadding({128 * 2}, {32}, {56}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kAdd},
 
     {/*base_bounds=*/{3}, /*window_bounds=*/{2},
      /*strides=*/{1},
-     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+     /*pad_low=*/{xla::MakePadding({3}, {2}, {1}, Padding::kSame)[0].first},
+     /*pad_high=*/{xla::MakePadding({3}, {2}, {1}, Padding::kSame)[0].second},
+     /*reducer=*/Reducer::kAdd},
 
     {/*base_bounds=*/{5}, /*window_bounds=*/{3},
      /*strides=*/{2},
-     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+     /*pad_low=*/{xla::MakePadding({5}, {3}, {2}, Padding::kSame)[0].first},
+     /*pad_high=*/{xla::MakePadding({5}, {3}, {2}, Padding::kSame)[0].second},
+     /*reducer=*/Reducer::kAdd},
 
     {/*base_bounds=*/{16}, /*window_bounds=*/{4},
      /*strides=*/{3},
-     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+     /*pad_low=*/{xla::MakePadding({16}, {4}, {3}, Padding::kSame)[0].first},
+     /*pad_high=*/{xla::MakePadding({16}, {4}, {3}, Padding::kSame)[0].second},
+     /*reducer=*/Reducer::kAdd},
+
+    {/*base_bounds=*/{5}, /*window_bounds=*/{5},
+     /*strides=*/{1},
+     /*pad_low=*/{0},
+     /*pad_high=*/{5},
+     /*reducer=*/Reducer::kAdd},
+
+    {/*base_bounds=*/{5}, /*window_bounds=*/{5},
+     /*strides=*/{1},
+     /*pad_low=*/{5},
+     /*pad_high=*/{0},
+     /*reducer=*/Reducer::kAdd},
 };
 
 string R1ReduceWindowTestDataToString(
-    const ::testing::TestParamInfo<R1ReduceWindowTestData>& data) {
+    const ::testing::TestParamInfo<
+        ::testing::tuple<R1ReduceWindowTestData, bool>>& data) {
+  const auto& param = ::testing::get<0>(data.param);
   string str = tensorflow::strings::StrCat(
-      "base_bounds_",
-      tensorflow::str_util::Join(data.param.base_bounds, "x"),  //
-      "__window_bounds_",
-      tensorflow::str_util::Join(data.param.window_bounds, "x"),              //
-      "__strides_", tensorflow::str_util::Join(data.param.strides, "x"),      //
-      "__padding_", data.param.padding == Padding::kSame ? "same" : "valid",  //
-      "__reducer_", data.param.reducer == kAdd ? "add" : "max");
+      "base_bounds_", tensorflow::str_util::Join(param.base_bounds, "x"),
+      "__window_bounds_", tensorflow::str_util::Join(param.window_bounds, "x"),
+      "__strides_", tensorflow::str_util::Join(param.strides, "x"),
+      "__pad_low_", tensorflow::str_util::Join(param.pad_low, "x"),
+      "__pad_high_", tensorflow::str_util::Join(param.pad_high, "x"),
+      "__reducer_", param.reducer == kAdd ? "add" : "max");
+  if (::testing::get<1>(data.param)) {
+    str = tensorflow::strings::StrCat(str, "_bfloat16");
+  }
   return str;
 }
 
-class R1ReduceWindowTest
-    : public ClientLibraryTestBase,
-      public ::testing::WithParamInterface<R1ReduceWindowTestData> {};
+class R1ReduceWindowTest : public ReduceWindowTestBase,
+                           public ::testing::WithParamInterface<
+                               ::testing::tuple<R1ReduceWindowTestData, bool>> {
+ protected:
+  R1ReduceWindowTest() { set_use_bfloat16(::testing::get<1>(GetParam())); }
+};
 
 TEST_P(R1ReduceWindowTest, DoIt) {
   ComputationBuilder b(client_, TestName());
-  const auto& param = GetParam();
+  const auto& param = ::testing::get<0>(GetParam());
   CHECK(param.reducer == kAdd || param.reducer == kMax);
 
   const float kInitValue = 0.0f;
@@ -1031,18 +1056,24 @@ TEST_P(R1ReduceWindowTest, DoIt) {
   std::iota(std::begin(input_vector), std::end(input_vector), 0);
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR1(tensorflow::gtl::ArraySlice<float>(input_vector));
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_arg,
-                          client_->TransferToServer(*input_literal));
+  ComputationDataHandle parameter;
+  auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
+                                                     &b, &parameter);
+
+  std::vector<std::pair<int64, int64>> padding(1);
+  padding[0] = {param.pad_low[0], param.pad_high[0]};
 
   auto computation = param.reducer == kAdd
-                         ? CreateScalarAddComputation(F32, &b)
-                         : CreateScalarMaxComputation(F32, &b);
-  b.ReduceWindow(/*operand=*/
-                 b.Parameter(0, input_literal->shape(), "p0"),
-                 /*init_value=*/b.ConstantR0<float>(kInitValue),
-                 /*computation=*/computation,
-                 /*window_dimensions=*/param.window_bounds,
-                 /*window_strides=*/param.strides, /*padding=*/param.padding);
+                         ? CreateScalarAddComputation(FloatType(), &b)
+                         : CreateScalarMaxComputation(FloatType(), &b);
+  auto init_value =
+      CreateConstantFromLiteral(*Literal::CreateR0(kInitValue), &b);
+  b.ReduceWindowWithGeneralPadding(
+      /*operand=*/parameter,
+      /*init_value=*/init_value,
+      /*computation=*/computation,
+      /*window_dimensions=*/param.window_bounds,
+      /*window_strides=*/param.strides, /*padding=*/padding);
 
   auto reduce_func = param.reducer == kAdd
                          ? +[](float a, float b) { return a + b; }
@@ -1052,14 +1083,17 @@ TEST_P(R1ReduceWindowTest, DoIt) {
       /*init=*/kInitValue,
       /*reduce_func=*/reduce_func,
       /*window=*/param.window_bounds,
-      /*stride=*/param.strides, /*padding=*/param.padding);
+      /*stride=*/param.strides,
+      /*padding=*/padding);
 
-  ComputeAndCompareR1<float>(&b, tensorflow::gtl::ArraySlice<float>(*expected),
-                             {input_arg.get()}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&b, *Literal::CreateR1<float>(*expected),
+                           {input_arg.get()}, DefaultErrorSpec());
 }
 
-INSTANTIATE_TEST_CASE_P(R1ReduceWindowTestInstantiation, R1ReduceWindowTest,
-                        ::testing::ValuesIn(kR1TestCases),
-                        R1ReduceWindowTestDataToString);
+INSTANTIATE_TEST_CASE_P(
+    R1ReduceWindowTestInstantiation, R1ReduceWindowTest,
+    ::testing::Combine(::testing::ValuesIn(kR1TestCases),
+                       ::testing::ValuesIn(use_bfloat16_params)),
+    R1ReduceWindowTestDataToString);
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/reshape_test.cc b/tensorflow/compiler/xla/tests/reshape_test.cc
index d235b9a1580ecbd6b82a69fca53d259912ff375e..ddd50d7a5864d73de7916ce736bb7cd40c1c4dc9 100644
--- a/tensorflow/compiler/xla/tests/reshape_test.cc
+++ b/tensorflow/compiler/xla/tests/reshape_test.cc
@@ -41,326 +41,467 @@ limitations under the License.
 namespace xla {
 namespace {
 
-class ReshapeTest : public ClientLibraryTestBase {
+// Use a bool parameter to indicate whether to use bfloat16.
+class ReshapeTest : public ::testing::WithParamInterface<bool>,
+                    public ClientLibraryTestBase {
  public:
+  ReshapeTest() { set_use_bfloat16(GetParam()); }
+
   ErrorSpec zero_error_spec_{0.0};
 };
 
 // Collapses 2-dimensional pseudo-scalar (single-element array) to 1 dimension.
-XLA_TEST_F(ReshapeTest, CollapseTrivial1x1) {
+XLA_TEST_P(ReshapeTest, CollapseTrivial1x1) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2<float>({{1.0}});
-  builder.Collapse(/*operand=*/a, /*dimensions=*/{0, 1});
-
-  ComputeAndCompareR1<float>(&builder, {1.0f}, {}, zero_error_spec_);
+  Array2D<float> input_array(1, 1);
+  input_array.Fill(1.0f);
+  auto input_literal = Literal::CreateR2FromArray2D(input_array);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
+
+  auto expected_literal = Literal::CreateR1<float>({1.0f});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, CollapseTrivialR1EmptyDims) {
+XLA_TEST_P(ReshapeTest, CollapseTrivialR1EmptyDims) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR1<float>({1.0});
-  builder.Collapse(/*operand=*/a, /*dimensions=*/{});
-
-  ComputeAndCompareR1<float>(&builder, {1.0f}, {}, zero_error_spec_);
+  auto input_literal = Literal::CreateR1<float>({1.0f});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{});
+
+  auto expected_literal = Literal::CreateR1<float>({1.0f});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, CollapseTrivialR1OnlyDim) {
+XLA_TEST_P(ReshapeTest, CollapseTrivialR1OnlyDim) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR1<float>({1.0});
-  builder.Collapse(/*operand=*/a, /*dimensions=*/{0});
-
-  ComputeAndCompareR1<float>(&builder, {1.0f}, {}, zero_error_spec_);
+  auto input_literal = Literal::CreateR1<float>({1.0f});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0});
+
+  auto expected_literal = Literal::CreateR1<float>({1.0f});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Collapses 2-dimensional pseudo-scalar (single-element array) to scalar.
-XLA_TEST_F(ReshapeTest, SingleElementArrayToScalar) {
+XLA_TEST_P(ReshapeTest, SingleElementArrayToScalar) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2<float>({{1.0}});
-  auto reshape =
-      builder.Reshape(/*operand=*/a, /*dimensions=*/{0, 1}, /*new_sizes=*/{});
+  Array2D<float> input_array(1, 1);
+  input_array.Fill(1.0f);
+  auto input_literal = Literal::CreateR2FromArray2D(input_array);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
+                                                 &builder, &parameter);
+  auto reshape = builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
+                                 /*new_sizes=*/{});
   auto new_shape = builder.GetShape(reshape).ConsumeValueOrDie();
 
-  ComputeAndCompareR0<float>(&builder, 1.0f, {}, zero_error_spec_);
+  auto expected_literal = Literal::CreateR0<float>(1.0f);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, ScalarToSingleElementArray) {
+XLA_TEST_P(ReshapeTest, ScalarToSingleElementArray) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> param0_literal = Literal::CreateR0<float>(1.0f);
-  std::unique_ptr<GlobalData> param0_data =
-      client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
-
-  auto a = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param0");
-  a = builder.Neg(a);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *param0_literal, "param0",
+                                                 &builder, &parameter);
+  auto a = builder.Neg(parameter);
   auto reshape =
       builder.Reshape(/*operand=*/a, /*dimensions=*/{}, /*new_sizes=*/{1});
 
-  ComputeAndCompareR1<float>(&builder, {-1.0f}, {param0_data.get()},
-                             zero_error_spec_);
+  auto expected_literal = Literal::CreateR1<float>({-1.0f});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, Trivial0x3) {
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-11-30
+// with an incorrect result rank.
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial0x3)) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 3));
-  auto result = builder.Collapse(/*operand=*/a, /*dimensions=*/{0, 1});
-
-  ComputeAndCompareR1<float>(&builder, {}, {}, zero_error_spec_);
+  Array2D<float> input_array(0, 3);
+  auto input_literal = Literal::CreateR2FromArray2D(input_array);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
+  auto expected_literal = Literal::CreateR1<float>({});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // TODO(b/29185393): Make this work with the GPU backend. The GPU backend
 // does not handle zero-sized shapes correctly. Failed last on 2017-05-15
 // with an incorrect result rank.
-XLA_TEST_F(ReshapeTest, DISABLED_ON_GPU(Trivial0x3WithParameter)) {
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial0x3WithParameter)) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR2FromArray2D<float>(Array2D<float>(0, 3));
-  std::unique_ptr<GlobalData> param0_data =
-      client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
-
-  auto a = builder.Parameter(0, ShapeUtil::MakeShape(F32, {0, 3}), "param0");
-  auto result = builder.Collapse(/*operand=*/a, /*dimensions=*/{0, 1});
-
-  ComputeAndCompareR1<float>(&builder, {}, {param0_data.get()},
-                             zero_error_spec_);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *param0_literal, "param0",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
+  auto expected_literal = Literal::CreateR1<float>({});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, Trivial3x0) {
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-11-30
+// with an incorrect result rank.
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial3x0)) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2FromArray2D<float>(Array2D<float>(3, 0));
-  auto result = builder.Collapse(/*operand=*/a, /*dimensions=*/{0, 1});
-
-  ComputeAndCompareR1<float>(&builder, {}, {}, zero_error_spec_);
+  Array2D<float> input_array(3, 0);
+  auto input_literal = Literal::CreateR2FromArray2D(input_array);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
+  auto expected_literal = Literal::CreateR1<float>({});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Collapses a 2-dimensional row vector to 1 dimension.
-XLA_TEST_F(ReshapeTest, Trivial1x3) {
+XLA_TEST_P(ReshapeTest, Trivial1x3) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2<float>({{1.0f, 2.0f, 3.0f}});
-  auto result = builder.Collapse(/*operand=*/a, /*dimensions=*/{0, 1});
-
-  ComputeAndCompareR1<float>(&builder, {1.0f, 2.0f, 3.0f}, {},
-                             zero_error_spec_);
+  auto input_literal = Literal::CreateR2<float>({{1.0f, 2.0f, 3.0f}});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
+  auto expected_literal = Literal::CreateR1<float>({1.0f, 2.0f, 3.0f});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Collapses a 2-dimensional column vector to 1 dimension.
-XLA_TEST_F(ReshapeTest, Trivial3x1) {
+XLA_TEST_P(ReshapeTest, Trivial3x1) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2<float>({{1.0f}, {2.0f}, {3.0f}});
-  auto result = builder.Collapse(/*operand=*/a, /*dimensions=*/{0, 1});
-
-  ComputeAndCompareR1<float>(&builder, {1.0f, 2.0f, 3.0f}, {},
-                             zero_error_spec_);
+  auto input_literal = Literal::CreateR2<float>({{1.0f}, {2.0f}, {3.0f}});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
+  auto expected_literal = Literal::CreateR1<float>({1.0f, 2.0f, 3.0f});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-11-30
+// with an incorrect result rank.
+//
 // Splits an empty vector into an empty matrix.
-XLA_TEST_F(ReshapeTest, R1ToR2_0_To_2x0) {
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(R1ToR2_0_To_2x0)) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR1<float>({});
-  auto result =
-      builder.Reshape(/*operand=*/a, /*dimensions=*/{0}, /*new_sizes=*/{2, 0});
-  ComputeAndCompareR2<float>(&builder, Array2D<float>(2, 0), {},
-                             zero_error_spec_);
+  auto input_literal = Literal::CreateR1<float>({});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0},
+                  /*new_sizes=*/{2, 0});
+  auto expected_literal = Literal::CreateR2<float>({{}, {}});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Splits a vector into a matrix.
-XLA_TEST_F(ReshapeTest, R1ToR2_6_To_2x3) {
+XLA_TEST_P(ReshapeTest, R1ToR2_6_To_2x3) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR1<float>({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
-  auto result =
-      builder.Reshape(/*operand=*/a, /*dimensions=*/{0}, /*new_sizes=*/{2, 3});
-  Array2D<float> expected_2x3({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}});
-  ComputeAndCompareR2<float>(&builder, expected_2x3, {}, zero_error_spec_);
+  auto input_literal =
+      Literal::CreateR1<float>({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0},
+                  /*new_sizes=*/{2, 3});
+  auto expected_literal =
+      Literal::CreateR2<float>({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-11-30
+// with an incorrect result rank.
+//
 // Transposes a 2x0 array to a 0x2 array.
-XLA_TEST_F(ReshapeTest, Reshape0x2To2x0) {
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Reshape0x2To2x0)) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 2));
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{0, 1},
-                                /*new_sizes=*/{2, 0});
-
-  ComputeAndCompareR2<float>(&builder, Array2D<float>(2, 0), {},
-                             zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(Array2D<float>(0, 2));
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
+                  /*new_sizes=*/{2, 0});
+  auto expected_literal = Literal::CreateR2<float>({{}, {}});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Transposes a 2-dimensional row vector to a column vector.
-XLA_TEST_F(ReshapeTest, ReshapeRowToCol) {
+XLA_TEST_P(ReshapeTest, ReshapeRowToCol) {
   ComputationBuilder builder(client_, TestName());
   auto simple = MakeLinspaceArray2D(1.0f, 3.0f, 1, 3);
-  auto a = builder.ConstantR2FromArray2D<float>(*simple);
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{0, 1},
-                                /*new_sizes=*/{3, 1});
+  auto input_literal = Literal::CreateFromArray(*simple);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
+                  /*new_sizes=*/{3, 1});
 
   auto expected = ReferenceUtil::TransposeArray2D(*simple);
-  ComputeAndCompareR2<float>(&builder, *expected, {}, zero_error_spec_);
+  auto expected_literal = Literal::CreateFromArray(*expected);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Transposes a 2-dimensional array.
-XLA_TEST_F(ReshapeTest, TransposeAsReshape) {
+XLA_TEST_P(ReshapeTest, TransposeAsReshape) {
   ComputationBuilder builder(client_, TestName());
   auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3);
-  auto a = builder.ConstantR2FromArray2D<float>(*a4x3);
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{1, 0},
-                                /*new_sizes=*/{3, 4});
-
-  auto expected3x4 = ReferenceUtil::TransposeArray2D(*a4x3);
-  ComputeAndCompareR2<float>(&builder, *expected3x4, {}, zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(*a4x3);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0},
+                  /*new_sizes=*/{3, 4});
+
+  auto expected = ReferenceUtil::TransposeArray2D(*a4x3);
+  auto expected_literal = Literal::CreateFromArray(*expected);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-11-30
+// with an incorrect result rank.
+//
 // Transposes a 0x4 array with ComputationBuilder::Trans.
-XLA_TEST_F(ReshapeTest, Transpose0x4) {
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Transpose0x4)) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 4));
-  auto result = builder.Transpose(a, {1, 0});
-
-  ComputeAndCompareR2<float>(&builder, Array2D<float>(4, 0), {},
-                             zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(Array2D<float>(0, 4));
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Transpose(parameter, {1, 0});
+  auto expected_literal = Literal::CreateR2<float>({{}, {}, {}, {}});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Transposes a 2-dimensional array with ComputationBuilder::Trans.
-XLA_TEST_F(ReshapeTest, Transpose4x3) {
+XLA_TEST_P(ReshapeTest, Transpose4x3) {
   ComputationBuilder builder(client_, TestName());
   auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3);
-  auto a = builder.ConstantR2FromArray2D<float>(*a4x3);
-  auto result = builder.Transpose(a, {1, 0});
-
-  auto expected3x4 = ReferenceUtil::TransposeArray2D(*a4x3);
-  ComputeAndCompareR2<float>(&builder, *expected3x4, {}, zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(*a4x3);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Transpose(parameter, {1, 0});
+
+  auto expected = ReferenceUtil::TransposeArray2D(*a4x3);
+  auto expected_literal = Literal::CreateFromArray(*expected);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-11-30
+// with an incorrect result rank.
+//
 // Reshapes an empty 2-dimensional array with dimensions that are not just a
 // rearrangement of the originals (split), but no reordering (no shuffle).
-XLA_TEST_F(ReshapeTest, ReshapeSplitNoShuffleZeroElements) {
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeSplitNoShuffleZeroElements)) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2FromArray2D<float>(Array2D<float>(6, 0));
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{0, 1},
-                                /*new_sizes=*/{2, 3, 0, 0});
-
-  ComputeAndCompareR4<float>(&builder, Array4D<float>(2, 3, 0, 0), {},
-                             zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(Array2D<float>(6, 0));
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
+                  /*new_sizes=*/{2, 3, 0, 0});
+  auto expected_literal = Literal::CreateFromArray(Array4D<float>(2, 3, 0, 0));
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, ReshapeR4ToR2ZeroElements) {
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-11-30
+// with an incorrect result rank.
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeR4ToR2ZeroElements)) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR4FromArray4D<float>(Array4D<float>(2, 3, 4, 0));
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{0, 1, 2, 3},
-                                /*new_sizes=*/{24, 0});
-
-  ComputeAndCompareR2<float>(&builder, Array2D<float>(24, 0), {},
-                             zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(Array4D<float>(2, 3, 4, 0));
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2, 3},
+                  /*new_sizes=*/{24, 0});
+  auto expected_literal = Literal::CreateFromArray(Array2D<float>(24, 0));
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Reshapes a 2-dimensional array with dimensions that are not just a
 // rearrangement of the originals (split), but no reordering (no shuffle).
-XLA_TEST_F(ReshapeTest, ReshapeSplitNoShuffle) {
+XLA_TEST_P(ReshapeTest, ReshapeSplitNoShuffle) {
   ComputationBuilder builder(client_, TestName());
   auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3);
-  auto a = builder.ConstantR2FromArray2D<float>(*a4x3);
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{0, 1},
-                                /*new_sizes=*/{2, 6});
-
-  auto expected2x6 = MakeLinspaceArray2D(1.0f, 12.0f, 2, 6);
-  ComputeAndCompareR2<float>(&builder, *expected2x6, {}, zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(*a4x3);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
+                  /*new_sizes=*/{2, 6});
+
+  auto expected = MakeLinspaceArray2D(1.0f, 12.0f, 2, 6);
+  auto expected_literal = Literal::CreateFromArray(*expected);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
-// Reshapes a 2-dimensional array with dimensions that are not just a
-// rearrangement of the originals (split), and reorder the input (shuffle).
-XLA_TEST_F(ReshapeTest, ReshapeSplitAndShuffleZeroElements) {
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-11-30
+// with an incorrect result rank.
+//
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeSplitAndShuffleZeroElements)) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 6));
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{1, 0},
-                                /*new_sizes=*/{3, 0});
-
-  ComputeAndCompareR2<float>(&builder, Array2D<float>(3, 0), {},
-                             zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(Array2D<float>(0, 6));
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0},
+                  /*new_sizes=*/{3, 0});
+  auto expected_literal = Literal::CreateFromArray(Array2D<float>(3, 0));
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Reshapes a 2-dimensional array with dimensions that are not just a
 // rearrangement of the originals (split), and reorder the input (shuffle).
-XLA_TEST_F(ReshapeTest, ReshapeSplitAndShuffle) {
+XLA_TEST_P(ReshapeTest, ReshapeSplitAndShuffle) {
   ComputationBuilder builder(client_, TestName());
   auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3);
-  auto a = builder.ConstantR2FromArray2D<float>(*a4x3);
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{1, 0},
-                                /*new_sizes=*/{2, 6});
-
-  Array2D<float> expected2x6({{1.0f, 4.0f, 7.0f, 10.0f, 2.0f, 5.0f},
-                              {8.0f, 11.0f, 3.0f, 6.0f, 9.0f, 12.0f}});
-  ComputeAndCompareR2<float>(&builder, expected2x6, {}, zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(*a4x3);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0},
+                  /*new_sizes=*/{2, 6});
+  Array2D<float> expected({{1.0f, 4.0f, 7.0f, 10.0f, 2.0f, 5.0f},
+                           {8.0f, 11.0f, 3.0f, 6.0f, 9.0f, 12.0f}});
+  auto expected_literal = Literal::CreateFromArray(expected);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // The following tests use the same input 3D array; they test the examples we
 // show for the Reshape operation in the operation_semantics document.
 // TODO(b/34503277): find a way to show this code in the documentation without
 // duplication on the TF documentation server.
-Array3D<int> v_array_for_doc_R3_tests({{{10, 11, 12}, {15, 16, 17}},
-                                       {{20, 21, 22}, {25, 26, 27}},
-                                       {{30, 31, 32}, {35, 36, 37}},
-                                       {{40, 41, 42}, {45, 46, 47}}});
-
-XLA_TEST_F(ReshapeTest, DocR3_R1_Collapse_012) {
-  ComputationBuilder builder(client_, TestName());
-  auto v = builder.ConstantR3FromArray3D<int>(v_array_for_doc_R3_tests);
-  auto result = builder.Reshape(/*operand=*/v, /*dimensions=*/{0, 1, 2},
-                                /*new_sizes=*/{24});
-  ComputeAndCompareR1<int>(&builder,
-                           {10, 11, 12, 15, 16, 17, 20, 21, 22, 25, 26, 27,
-                            30, 31, 32, 35, 36, 37, 40, 41, 42, 45, 46, 47},
-                           {});
-}
-
-XLA_TEST_F(ReshapeTest, DocR3_R2_Collapse_012_Refine_83) {
-  ComputationBuilder builder(client_, TestName());
-  auto v = builder.ConstantR3FromArray3D<int>(v_array_for_doc_R3_tests);
-  auto result = builder.Reshape(/*operand=*/v, /*dimensions=*/{0, 1, 2},
-                                /*new_sizes=*/{8, 3});
-  Array2D<int> expected({{10, 11, 12},
-                         {15, 16, 17},
-                         {20, 21, 22},
-                         {25, 26, 27},
-                         {30, 31, 32},
-                         {35, 36, 37},
-                         {40, 41, 42},
-                         {45, 46, 47}});
-  ComputeAndCompareR2<int>(&builder, expected, {});
-}
-
-XLA_TEST_F(ReshapeTest, DocR3_R1_Collapse_120) {
-  ComputationBuilder builder(client_, TestName());
-  auto v = builder.ConstantR3FromArray3D<int>(v_array_for_doc_R3_tests);
-  auto result = builder.Reshape(/*operand=*/v, /*dimensions=*/{1, 2, 0},
-                                /*new_sizes=*/{24});
-  ComputeAndCompareR1<int>(&builder,
-                           {10, 20, 30, 40, 11, 21, 31, 41, 12, 22, 32, 42,
-                            15, 25, 35, 45, 16, 26, 36, 46, 17, 27, 37, 47},
-                           {});
-}
-
-XLA_TEST_F(ReshapeTest, DocR3_R2_Collapse_120_Refine_83) {
-  ComputationBuilder builder(client_, TestName());
-  auto v = builder.ConstantR3FromArray3D<int>(v_array_for_doc_R3_tests);
-  auto result = builder.Reshape(/*operand=*/v, /*dimensions=*/{1, 2, 0},
-                                /*new_sizes=*/{8, 3});
-  Array2D<int> expected({{10, 20, 30},
-                         {40, 11, 21},
-                         {31, 41, 12},
-                         {22, 32, 42},
-                         {15, 25, 35},
-                         {45, 16, 26},
-                         {36, 46, 17},
-                         {27, 37, 47}});
-  ComputeAndCompareR2<int>(&builder, expected, {});
-}
-
-XLA_TEST_F(ReshapeTest, DocR3_R3_Collapse_120_Refine_262) {
-  ComputationBuilder builder(client_, TestName());
-  auto v = builder.ConstantR3FromArray3D<int>(v_array_for_doc_R3_tests);
-  auto result = builder.Reshape(/*operand=*/v, /*dimensions=*/{1, 2, 0},
-                                /*new_sizes=*/{2, 6, 2});
-  Array3D<int> expected(
+static Array3D<float> ArrayForDocR3Tests() {
+  return Array3D<float>({{{10, 11, 12}, {15, 16, 17}},
+                         {{20, 21, 22}, {25, 26, 27}},
+                         {{30, 31, 32}, {35, 36, 37}},
+                         {{40, 41, 42}, {45, 46, 47}}});
+}
+
+XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_012) {
+  ComputationBuilder builder(client_, TestName());
+  auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2},
+                  /*new_sizes=*/{24});
+  auto expected_literal = Literal::CreateR1<float>(
+      {10, 11, 12, 15, 16, 17, 20, 21, 22, 25, 26, 27,
+       30, 31, 32, 35, 36, 37, 40, 41, 42, 45, 46, 47});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
+}
+
+XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_012_Refine_83) {
+  ComputationBuilder builder(client_, TestName());
+  auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2},
+                  /*new_sizes=*/{8, 3});
+  auto expected_literal = Literal::CreateR2<float>({{10, 11, 12},
+                                                    {15, 16, 17},
+                                                    {20, 21, 22},
+                                                    {25, 26, 27},
+                                                    {30, 31, 32},
+                                                    {35, 36, 37},
+                                                    {40, 41, 42},
+                                                    {45, 46, 47}});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
+}
+
+XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_120) {
+  ComputationBuilder builder(client_, TestName());
+  auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0},
+                  /*new_sizes=*/{24});
+  auto expected_literal = Literal::CreateR1<float>(
+      {10, 20, 30, 40, 11, 21, 31, 41, 12, 22, 32, 42,
+       15, 25, 35, 45, 16, 26, 36, 46, 17, 27, 37, 47});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
+}
+
+XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_120_Refine_83) {
+  ComputationBuilder builder(client_, TestName());
+  auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0},
+                  /*new_sizes=*/{8, 3});
+  auto expected_literal = Literal::CreateR2<float>({{10, 20, 30},
+                                                    {40, 11, 21},
+                                                    {31, 41, 12},
+                                                    {22, 32, 42},
+                                                    {15, 25, 35},
+                                                    {45, 16, 26},
+                                                    {36, 46, 17},
+                                                    {27, 37, 47}});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
+}
+
+XLA_TEST_P(ReshapeTest, DocR3_R3_Collapse_120_Refine_262) {
+  ComputationBuilder builder(client_, TestName());
+  auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0},
+                  /*new_sizes=*/{2, 6, 2});
+  auto expected_literal = Literal::CreateR3<float>(
       {{{10, 20}, {30, 40}, {11, 21}, {31, 41}, {12, 22}, {32, 42}},
        {{15, 25}, {35, 45}, {16, 26}, {36, 46}, {17, 27}, {37, 47}}});
-  ComputeAndCompareR3<int>(&builder, expected, {});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Collapses the low dimensions of a 4D tensor to get a 2D matrix, without
@@ -378,23 +519,26 @@ XLA_TEST_F(ReshapeTest, DocR3_R3_Collapse_120_Refine_262) {
 // Then we collapse Z be collapsed so we just end up with planes:
 //
 // 1 2 3 4 5 6 1 2 3 4 5 6
-XLA_TEST_F(ReshapeTest, FullyConnectedCollapse) {
+XLA_TEST_P(ReshapeTest, FullyConnectedCollapse) {
   ComputationBuilder builder(client_, TestName());
   Array4D<float> t2x2x2x3(2, 2, 2, 3);
   auto filler2x3 = MakeLinspaceArray2D(1.0f, 6.0f, 2, 3);
   t2x2x2x3.FillWithYX(*filler2x3);
-  auto a = builder.ConstantR4FromArray4D<float>(t2x2x2x3);
-  auto result = builder.Collapse(/*operand=*/a, /*dimensions=*/{1, 2, 3});
-
-  Array2D<float> expected2x12(
+  auto input_literal = Literal::CreateFromArray(t2x2x2x3);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{1, 2, 3});
+  auto expected_literal = Literal::CreateR2<float>(
       {{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
        {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
         6.0f}});
-  ComputeAndCompareR2<float>(&builder, expected2x12, {}, zero_error_spec_);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // As above, but uses reshape directly.
-XLA_TEST_F(ReshapeTest, FullyConnectedCollapseDesugared) {
+XLA_TEST_P(ReshapeTest, FullyConnectedCollapseDesugared) {
   ComputationBuilder builder(client_, TestName());
   Array4D<float> t(2, 1, 2, 2);
   t(0, 0, 0, 0) = 0;
@@ -405,52 +549,67 @@ XLA_TEST_F(ReshapeTest, FullyConnectedCollapseDesugared) {
   t(1, 0, 0, 1) = 5;
   t(1, 0, 1, 0) = 6;
   t(1, 0, 1, 1) = 7;
-  auto a = builder.ConstantR4FromArray4D<float>(t);
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{0, 1, 2, 3},
-                                /*new_sizes=*/{2, 4});
-
-  Array2D<float> expected({{0, 1, 2, 3}, {4, 5, 6, 7}});
-  ComputeAndCompareR2<float>(&builder, expected, {}, zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(t);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2, 3},
+                  /*new_sizes=*/{2, 4});
+
+  auto expected_literal =
+      Literal::CreateR2<float>({{0, 1, 2, 3}, {4, 5, 6, 7}});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Reshape various ranks to a scalar.
-XLA_TEST_F(ReshapeTest, ToScalar) {
+XLA_TEST_P(ReshapeTest, ToScalar) {
   for (int rank = 0; rank < 8; ++rank) {
     ComputationBuilder b(client_, TestName());
-    auto input = Literal::CreateR1<float>({83.0f});
+    auto input_literal = Literal::CreateR1<float>({83.0f});
     std::vector<int64> ones(rank, 1);  // this is {1, ..., 1}.
     std::vector<int64> dimensions(rank);
     std::iota(dimensions.begin(), dimensions.end(), 0);
-    *input->mutable_shape() = ShapeUtil::MakeShape(F32, ones);
-    b.Reshape(b.ConstantLiteral(*input), dimensions, {});
+    *input_literal->mutable_shape() = ShapeUtil::MakeShape(F32, ones);
 
-    ComputeAndCompareR0<float>(&b, 83.0f, {}, zero_error_spec_);
+    ComputationDataHandle parameter;
+    auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                   &b, &parameter);
+    b.Reshape(parameter, dimensions, {});
+
+    auto expected_literal = Literal::CreateR0<float>(83.0f);
+    ComputeAndCompareLiteral(&b, *expected_literal, {input.get()},
+                             zero_error_spec_);
   }
 }
 
-XLA_TEST_F(ReshapeTest, BadDimensions) {
+XLA_TEST_P(ReshapeTest, BadDimensions) {
   ComputationBuilder b(client_, TestName());
-  b.Reshape(b.ConstantR1<int32>({1}), {}, {});
+  auto input_literal = Literal::CreateR1<float>({1.0f});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &b,
+                                                 &parameter);
+  b.Reshape(parameter, {}, {});
   EXPECT_THAT(
       ExecuteToString(&b, {}),
       ::testing::HasSubstr("not a permutation of the operand dimensions"));
 }
 
-XLA_TEST_F(ReshapeTest, BadNewSizes) {
+XLA_TEST_P(ReshapeTest, BadNewSizes) {
   ComputationBuilder b(client_, TestName());
-  b.Reshape(b.ConstantR1<int32>({1, 2}), {1}, {});
+  auto input_literal = Literal::CreateR1<float>({1.0f, 2.0f});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &b,
+                                                 &parameter);
+  b.Reshape(parameter, {1}, {});
   EXPECT_THAT(ExecuteToString(&b, {}),
               ::testing::HasSubstr("mismatched element counts"));
 }
 
-XLA_TEST_F(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
-  const Shape parameter_shape = ShapeUtil::MakeShape(F32, {2, 2, 2, 2});
+XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, parameter_shape, "a");
-  builder.Reshape(a, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 8});
-
   // clang-format off
-  auto literal = Literal::CreateR4FromArray4DWithLayout(Array4D<float>{
+  auto input_literal = Literal::CreateR4FromArray4DWithLayout(Array4D<float>{
     {
       {
         {0, 1},
@@ -474,8 +633,12 @@ XLA_TEST_F(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
   },
        LayoutUtil::MakeLayout({0, 1, 2, 3}));
   // clang-format on
-  std::unique_ptr<GlobalData> input =
-      client_->TransferToServer(*literal).ConsumeValueOrDie();
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+
+  builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 8});
+
   Array2D<float> expected_array({
       {0, 1, 2, 3, 100, 101, 102, 103},
       {222, 333, 444, 555, 666, 777, 888, 999},
@@ -484,72 +647,75 @@ XLA_TEST_F(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
   Computation computation = builder.Build().ConsumeValueOrDie();
   ExecutionOptions execution_options = execution_options_;
   *execution_options.mutable_shape_with_output_layout() =
-      ShapeUtil::MakeShapeWithLayout(F32, {2, 8}, {1, 0});
+      ShapeUtil::MakeShapeWithLayout(use_bfloat16() ? BF16 : F32, {2, 8},
+                                     {1, 0});
   std::unique_ptr<Literal> actual =
       client_
           ->ExecuteAndTransfer(computation, {input.get()}, &execution_options)
           .ConsumeValueOrDie();
   std::unique_ptr<Literal> expected =
       Literal::CreateR2FromArray2D<float>(expected_array);
+  if (use_bfloat16()) {
+    expected = LiteralTestUtil::ConvertF32ToBF16(*expected);
+  }
   LiteralTestUtil::ExpectEqual(*expected, *actual);
 }
 
-XLA_TEST_F(ReshapeTest, R2ToR4_3x8_To_3x2x1x4) {
-  std::unique_ptr<Literal> input = Literal::CreateR2<float>({
+XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4) {
+  ComputationBuilder builder(client_, TestName());
+  std::unique_ptr<Literal> input_literal = Literal::CreateR2<float>({
       {0, 1, 2, 3, 4, 5, 6, 7},
       {100, 101, 102, 103, 104, 105, 106, 107},
       {200, 201, 202, 203, 204, 205, 206, 207},
   });
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input).ConsumeValueOrDie();
-
-  ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{0, 1}, /*new_sizes=*/{3, 2, 1, 4});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 1}, /*new_sizes=*/{3, 2, 1, 4});
 
   // clang-format off
-  Array4D<float> expected = {
+  auto expected_literal = Literal::CreateR4<float>({
     {{{0, 1, 2, 3}},
      {{4, 5, 6, 7}}},
     {{{100, 101, 102, 103}},
      {{104, 105, 106, 107}}},
     {{{200, 201, 202, 203}},
      {{204, 205, 206, 207}}}
-  };
+  });
   // clang-format on
-  ComputeAndCompareR4<float>(&builder, expected, {input_data.get()},
-                             zero_error_spec_);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Tests R2->R4 reshape with the reshape dimensions {1, 0}.
-XLA_TEST_F(ReshapeTest, R2ToR4_3x8_To_3x2x1x4_Dimensions_10) {
-  std::unique_ptr<Literal> input = Literal::CreateR2<float>({
+XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4_Dimensions_10) {
+  ComputationBuilder builder(client_, TestName());
+  std::unique_ptr<Literal> input_literal = Literal::CreateR2<float>({
       {0, 1, 2, 3, 4, 5, 6, 7},
       {100, 101, 102, 103, 104, 105, 106, 107},
       {200, 201, 202, 203, 204, 205, 206, 207},
   });
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input).ConsumeValueOrDie();
-
-  ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{1, 0}, /*new_sizes=*/{3, 2, 1, 4});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{1, 0}, /*new_sizes=*/{3, 2, 1, 4});
 
   // clang-format off
-  Array4D<float> expected = {
+  auto expected_literal = Literal::CreateR4<float>({
     {{{0, 100, 200, 1}},
      {{101, 201, 2, 102}}},
     {{{202, 3, 103, 203}},
      {{4, 104, 204, 5}}},
     {{{105, 205, 6, 106}},
      {{206, 7, 107, 207}}}
-  };
+  });
   // clang-format on
-  ComputeAndCompareR4<float>(&builder, expected, {input_data.get()},
-                             zero_error_spec_);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
+XLA_TEST_P(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
+  ComputationBuilder builder(client_, TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input(2, 1, 1, 1);
@@ -559,12 +725,10 @@ XLA_TEST_F(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
-  ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input_literal->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 1});
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 1});
 
   std::unique_ptr<Literal> expected =
       LiteralTestUtil::Reshape({2, 1}, {1, 0}, *input_literal);
@@ -572,7 +736,8 @@ XLA_TEST_F(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
                            zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
+XLA_TEST_P(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
+  ComputationBuilder builder(client_, TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input(2, 1, 4, 1);
@@ -582,12 +747,10 @@ XLA_TEST_F(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
-  ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input_literal->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{4, 2});
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{4, 2});
 
   std::unique_ptr<Literal> expected =
       LiteralTestUtil::Reshape({4, 2}, {1, 0}, *input_literal);
@@ -596,7 +759,8 @@ XLA_TEST_F(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
 }
 
 // Tests R4->R2 reshape with the reshape dimensions {0, 2, 1, 3}.
-XLA_TEST_F(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
+XLA_TEST_P(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
+  ComputationBuilder builder(client_, TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input(5, 10, 2, 3);
@@ -606,12 +770,11 @@ XLA_TEST_F(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
-  ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input_literal->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{0, 2, 1, 3}, /*new_sizes=*/{5, 60});
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 2, 1, 3},
+                  /*new_sizes=*/{5, 60});
 
   Array2D<float> expected_array(5, 60);
   input.Each([&](tensorflow::gtl::ArraySlice<int64> indices, float* cell) {
@@ -619,10 +782,12 @@ XLA_TEST_F(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
         *cell;
   });
   auto expected = Literal::CreateR2FromArray2D(expected_array);
-  ComputeAndCompareLiteral(&builder, *expected, {input_data.get()});
+  ComputeAndCompareLiteral(&builder, *expected, {input_data.get()},
+                           zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, NoopReshape) {
+XLA_TEST_P(ReshapeTest, NoopReshape) {
+  ComputationBuilder builder(client_, TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input_array(2, 3, 5, 7);
@@ -632,18 +797,17 @@ XLA_TEST_F(ReshapeTest, NoopReshape) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({1, 2, 3, 0}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
-  ComputationBuilder builder(client_, TestName());
-  auto input = builder.Parameter(0, input_literal->shape(), "input");
-  builder.Reshape(input, /*dimensions=*/{3, 0, 1, 2},
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{3, 0, 1, 2},
                   /*new_sizes=*/{7, 2, 3, 5});
   Computation computation = builder.Build().ConsumeValueOrDie();
 
   ExecutionOptions execution_options = execution_options_;
   *execution_options.mutable_shape_with_output_layout() =
-      ShapeUtil::MakeShapeWithLayout(F32, {7, 2, 3, 5}, {2, 3, 0, 1});
+      ShapeUtil::MakeShapeWithLayout(use_bfloat16() ? BF16 : F32, {7, 2, 3, 5},
+                                     {2, 3, 0, 1});
   std::unique_ptr<Literal> output_literal =
       client_
           ->ExecuteAndTransfer(computation, {input_data.get()},
@@ -652,35 +816,45 @@ XLA_TEST_F(ReshapeTest, NoopReshape) {
 
   // Since the reshape is a no-op, verify that it does not change the underlying
   // data.
-  EXPECT_EQ(tensorflow::gtl::ArraySlice<float>(input_literal->f32s()),
-            tensorflow::gtl::ArraySlice<float>(output_literal->f32s()));
+  if (use_bfloat16()) {
+    auto expected = LiteralTestUtil::ConvertF32ToBF16(*input_literal);
+    EXPECT_EQ(tensorflow::gtl::ArraySlice<bfloat16>(expected->bf16s()),
+              tensorflow::gtl::ArraySlice<bfloat16>(output_literal->bf16s()));
+  } else {
+    EXPECT_EQ(tensorflow::gtl::ArraySlice<float>(input_literal->f32s()),
+              tensorflow::gtl::ArraySlice<float>(output_literal->f32s()));
+  }
 }
 
-XLA_TEST_F(ReshapeTest, R4ToR4Reshape_Trivial) {
-  auto literal_1x2x3x4 = Literal::CreateR4(
+XLA_TEST_P(ReshapeTest, R4ToR4Reshape_Trivial) {
+  ComputationBuilder builder(client_, TestName());
+  auto literal_1x2x3x4 = Literal::CreateR4<float>(
       {{{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}},
         {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}}});
 
-  ComputationBuilder builder(client_, TestName());
-  auto input = builder.ConstantLiteral(*literal_1x2x3x4);
-  builder.Reshape(input, /*dimensions=*/{0, 1, 2, 3},
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *literal_1x2x3x4, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3},
                   /*new_sizes=*/{1, 2, 3, 4});
 
-  ComputeAndCompareLiteral(&builder, *literal_1x2x3x4, {});
+  ComputeAndCompareLiteral(&builder, *literal_1x2x3x4, {input.get()});
 }
 
-XLA_TEST_F(ReshapeTest, R4ToR4Reshape) {
-  auto literal_1x2x3x4 = Literal::CreateR4(
+XLA_TEST_P(ReshapeTest, R4ToR4Reshape) {
+  auto literal_1x2x3x4 = Literal::CreateR4<float>(
       {{{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}},
         {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}}});
 
   ComputationBuilder builder(client_, TestName());
-  auto input = builder.ConstantLiteral(*literal_1x2x3x4);
-  builder.Reshape(input, /*dimensions=*/{1, 3, 2, 0},
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *literal_1x2x3x4, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{1, 3, 2, 0},
                   /*new_sizes=*/{2, 4, 3, 1});
 
   // clang-format off
-  auto expected_2x4x3x1 = Literal::CreateR4(
+  auto expected_2x4x3x1 = Literal::CreateR4<float>(
       {{{{1}, {5}, {9}},
         {{2}, {6}, {10}},
         {{3}, {7}, {11}},
@@ -691,10 +865,10 @@ XLA_TEST_F(ReshapeTest, R4ToR4Reshape) {
         {{16}, {20}, {24}}}});
   // clang-format on
 
-  ComputeAndCompareLiteral(&builder, *expected_2x4x3x1, {});
+  ComputeAndCompareLiteral(&builder, *expected_2x4x3x1, {input.get()});
 }
 
-XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeSimple) {
+XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeSimple) {
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   std::vector<int64> bounds = {2, 2, 2, 2};
@@ -706,12 +880,12 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeSimple) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input_literal->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{0, 1, 3, 2}, /*new_sizes=*/new_bounds);
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
+                  /*new_sizes=*/new_bounds);
 
   std::unique_ptr<Literal> expected =
       LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal)
@@ -723,7 +897,7 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeSimple) {
                            zero_error_spec_, &expected->shape());
 }
 
-XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstEffectiveR2) {
+XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstEffectiveR2) {
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   std::vector<int64> bounds = {1, 1, 250, 300};
@@ -735,12 +909,12 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstEffectiveR2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input_literal->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{0, 1, 3, 2}, /*new_sizes=*/new_bounds);
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
+                  /*new_sizes=*/new_bounds);
 
   std::unique_ptr<Literal> expected =
       LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal)
@@ -752,7 +926,7 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstEffectiveR2) {
                            zero_error_spec_, &expected->shape());
 }
 
-XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1) {
+XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1) {
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   std::vector<int64> bounds = {5, 5, 1, 10};
@@ -764,12 +938,12 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input_literal->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{0, 1, 3, 2}, /*new_sizes=*/new_bounds);
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
+                  /*new_sizes=*/new_bounds);
 
   std::unique_ptr<Literal> expected =
       LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal)
@@ -781,7 +955,7 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1) {
                            zero_error_spec_, &expected->shape());
 }
 
-XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1InR2) {
+XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1InR2) {
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   // This happens in NN-Builder MNIST.
@@ -794,12 +968,12 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1InR2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input_literal->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{0, 1, 3, 2}, /*new_sizes=*/new_bounds);
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
+                  /*new_sizes=*/new_bounds);
 
   std::unique_ptr<Literal> expected =
       LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal)
@@ -811,7 +985,7 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1InR2) {
                            zero_error_spec_, &expected->shape());
 }
 
-XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeTrivialR2) {
+XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeTrivialR2) {
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   std::vector<int64> bounds = {3, 3, 1, 3};
@@ -823,12 +997,12 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeTrivialR2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({0, 1, 2, 3}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input_literal->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{1, 0, 2, 3}, /*new_sizes=*/new_bounds);
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{1, 0, 2, 3},
+                  /*new_sizes=*/new_bounds);
 
   std::unique_ptr<Literal> expected =
       LiteralTestUtil::Reshape(new_bounds, {1, 0, 2, 3}, *input_literal)
@@ -840,5 +1014,12 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeTrivialR2) {
                            zero_error_spec_, &expected->shape());
 }
 
+#ifdef XLA_BACKEND_SUPPORTS_BFLOAT16
+INSTANTIATE_TEST_CASE_P(ReshapeTestInstance, ReshapeTest, ::testing::Bool());
+#else
+INSTANTIATE_TEST_CASE_P(ReshapeTestInstance, ReshapeTest,
+                        ::testing::ValuesIn(std::vector<bool>{false}));
+#endif
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/sample_file_test.cc b/tensorflow/compiler/xla/tests/sample_file_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..31b104f4e37f77d47f56ff8183ee1de1cc22e44d
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/sample_file_test.cc
@@ -0,0 +1,51 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This demonstrates how to use hlo_test_base to create a file based testcase
+// and compare results on gpu and cpu.
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/platform_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+
+class SampleFileTest : public HloTestBase {
+ protected:
+  SampleFileTest()
+      : HloTestBase(
+            /*test_platform=*/PlatformUtil::GetPlatform("gpu").ValueOrDie(),
+            /*reference_platform=*/PlatformUtil::GetPlatform("cpu")
+                .ValueOrDie()) {}
+};
+
+TEST_F(SampleFileTest, Convolution) {
+  const string& filename = "compiler/xla/tests/isolated_convolution.hlo";
+  string test_srcdir = tensorflow::testing::TensorFlowSrcRoot();
+  EXPECT_TRUE(RunAndCompareFromFile(
+      tensorflow::io::JoinPath(test_srcdir, filename), ErrorSpec{0.01}));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/sample_text_test.cc b/tensorflow/compiler/xla/tests/sample_text_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b4f2b74e3dc9e80f50454b28eb6f2502cef3e681
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/sample_text_test.cc
@@ -0,0 +1,66 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This demonstrates how to use hlo_test_base to create textual IR based
+// testcases.
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/gtl/optional.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+
+using tensorflow::gtl::nullopt;
+
+class SampleTextTest : public HloTestBase {};
+
+TEST_F(SampleTextTest, Axpy) {
+  const string& hlo_string = R"(
+HloModule axpy_module:
+ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
+  %alpha = f32[] parameter(0)
+  %broadcast = f32[2,4]{1,0} broadcast(f32[] %alpha), dimensions={}
+  %x = f32[2,4]{1,0} parameter(1)
+  %multiply = f32[2,4]{1,0} multiply(f32[2,4]{1,0} %broadcast, f32[2,4]{1,0} %x)
+  %y = f32[2,4]{1,0} parameter(2)
+  ROOT %add = f32[2,4]{1,0} add(f32[2,4]{1,0} %multiply, f32[2,4]{1,0} %y)
+}
+)";
+  EXPECT_TRUE(RunAndCompareNoHloPasses(hlo_string, ErrorSpec{0.0001}));
+}
+
+TEST_F(SampleTextTest, Tuple) {
+  const string& hlo_string = R"(
+HloModule TupleCreate_module:
+ENTRY %TupleCreate.v4 (v1: f32[], v2: f32[3], v3: f32[2,3]) -> (f32[], f32[3], f32[2,3]) {
+  %v1 = f32[] parameter(0)
+  %v2 = f32[3]{0} parameter(1)
+  %v3 = f32[2,3]{1,0} parameter(2)
+  ROOT %tuple = (f32[], f32[3]{0}, f32[2,3]{1,0}) tuple(f32[] %v1, f32[3]{0} %v2, f32[2,3]{1,0} %v3)
+}
+)";
+  EXPECT_TRUE(RunAndCompare(hlo_string, nullopt));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/slice_test.cc b/tensorflow/compiler/xla/tests/slice_test.cc
index c21124750ad512cad69b1483e708613ee2857ac0..4db566f7841829359ea06fe25408048418c547ad 100644
--- a/tensorflow/compiler/xla/tests/slice_test.cc
+++ b/tensorflow/compiler/xla/tests/slice_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -211,6 +212,13 @@ class SliceR1Test : public ClientLibraryTestBase,
   }
 };
 
+string SliceR1TestDataToString(const ::testing::TestParamInfo<R1Spec>& data) {
+  const R1Spec& spec = data.param;
+  return ::tensorflow::strings::Printf("%lld_%lld_%lld_%lld", spec.input_dim0,
+                                       spec.slice_start, spec.slice_limit,
+                                       spec.slice_stride);
+}
+
 XLA_TEST_P(SliceR1Test, DoIt_F32) { Run<float>(GetParam()); }
 
 XLA_TEST_P(SliceR1Test, DoIt_F64) { Run<double>(GetParam()); }
@@ -223,30 +231,66 @@ XLA_TEST_P(SliceR1Test, DoIt_U64) { Run<uint64>(GetParam()); }
 
 XLA_TEST_P(SliceR1Test, DoIt_S64) { Run<int64>(GetParam()); }
 
-INSTANTIATE_TEST_CASE_P(                          //
-    SliceR1TestInstantiation,                     //
-    SliceR1Test,                                  //
-    ::testing::Values(                            //
-        R1Spec{10, 0, 0, 1},                      //
-        R1Spec{10, 7, 7, 1},                      //
-        R1Spec{10, 2, 4, 1},                      //
-        R1Spec{10, 2, 4, 2},                      //
-        R1Spec{10, 0, 10, 1},                     //
-        R1Spec{1024, 1024 - 4, 1024, 1},          //
-        R1Spec{4096, 7, 7 + 1024, 1},             //
-        R1Spec{10, 0, 10, 2},                     //
-        R1Spec{10, 0, 10, 3},                     //
-        R1Spec{10, 0, 10, 4},                     //
-        R1Spec{10, 0, 10, 5},                     //
-        R1Spec{10, 0, 10, 10},                    //
-        R1Spec{500, 200, 400, 7},                 //
-        R1Spec{4096, 1, 4095, 3},                 //
-        R1Spec{2047, 1024 - 24, 1024 + 160, 31},  //
-        R1Spec{2047, 1, 2046, 3 * 128},           //
-        R1Spec{4096, 1024 + 3, 4095, 500},        //
-        R1Spec{8192, 0, 8192, 1024 * 3 + 400}     //
-        )                                         //
+// Tests for R1 slice ops.
+// The format for each testcase is {input size, start, limit, stride}.
+// clang-format off
+INSTANTIATE_TEST_CASE_P(
+    SliceR1TestInstantiation,
+    SliceR1Test,
+    ::testing::Values(
+        R1Spec{10, 0, 0, 1},
+        R1Spec{10, 7, 7, 1},
+        R1Spec{10, 0, 5, 1},
+        R1Spec{10, 3, 5, 1},
+        R1Spec{10, 0, 10, 1},
+        R1Spec{1024, 0, 5, 1},
+        R1Spec{1024, 3, 5, 1},
+        R1Spec{1024 + 17, 0, 5, 1},
+        R1Spec{1024 + 17, 3, 5, 1},
+        R1Spec{1024 + 17, 1024, 1024 + 6, 1},
+        R1Spec{1024 + 17, 1024 + 1, 1024 + 6, 1},
+        R1Spec{1024, 1024 - 4, 1024, 1},
+        R1Spec{4 * 1024, 7, 7 + 1024, 1},
+        R1Spec{4 * 1024, 0, 4 * 1024, 1},
+        R1Spec{4 * 1024, 1, 4 * 1024 - 1, 1},
+        R1Spec{4 * 1024, 1024, 3 * 1024, 1},
+        R1Spec{4 * 1024, 1024 + 1, 3 * 1024 - 1, 1},
+        R1Spec{16 * 1024, 0, 5, 1},
+        R1Spec{16 * 1024, 3, 5, 1},
+        R1Spec{16 * 1024 + 17, 0, 5, 1},
+        R1Spec{16 * 1024 + 17, 3, 5, 1},
+        R1Spec{16 * 1024 + 17, 16 * 1024, 16 * 1024 + 6, 1},
+        R1Spec{16 * 1024 + 17, 16 * 1024 + 1, 16 * 1024 + 6, 1},
+        R1Spec{16 * 1024, 4 * 1024 - 17, 8 * 1024 - 18, 1},
+        R1Spec{64 * 1024, 0, 64 * 1024, 1},
+        R1Spec{64 * 1024, 1, 64 * 1024 - 1, 1},
+        R1Spec{64 * 1024, 1024, 63 * 1024, 1},
+        R1Spec{64 * 1024, 1024 + 1, 63 * 1024 - 1, 1},
+        R1Spec{64 * 1024, 32 * 1024, 33 * 1024, 1},
+        R1Spec{64 * 1024, 32 * 1024 + 1, 33 * 1024 - 1, 1},
+        R1Spec{64 * 1024, 32 * 1024 - 17, 36 * 1024 - 18, 1},
+// TODO(b/69425338): This uses too much memory on GPU.
+#ifndef XLA_TEST_BACKEND_GPU
+        R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024, 12 * 1024 * 1024, 1},
+        R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024 + 1, 12 * 1024 * 1024 - 1, 1},
+        R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024 - 1, 12 * 1024 * 1024 + 1, 1},
+#endif
+        R1Spec{10, 2, 4, 2},
+        R1Spec{10, 0, 10, 2},
+        R1Spec{10, 0, 10, 3},
+        R1Spec{10, 0, 10, 4},
+        R1Spec{10, 0, 10, 5},
+        R1Spec{10, 0, 10, 10},
+        R1Spec{500, 200, 400, 7},
+        R1Spec{4096, 1, 4095, 3},
+        R1Spec{2047, 1024 - 24, 1024 + 160, 31},
+        R1Spec{2047, 1, 2046, 3 * 128},
+        R1Spec{4096, 1024 + 3, 4095, 500},
+        R1Spec{8192, 0, 8192, 1024 * 3 + 400}
+        ),
+    SliceR1TestDataToString
 );
+// clang-format on
 
 struct R2Spec {
   int64 input_dim0;
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index 0d56c9f48363d0569921d7c76050dcc66208931b..f9c62ec217d085e5c5a55f484c4bd712c6ccf05a 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_verifier.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 
@@ -24,6 +25,7 @@ namespace {
 
 template <typename FloatT>
 void PopulateWithRandomFloatingPointData(Literal* literal) {
+  // TODO(b/69179121): Generate data that is less self-similar.
   CHECK_EQ(literal->shape().element_type(),
            primitive_util::NativeToPrimitiveType<FloatT>());
   std::minstd_rand0 engine;
@@ -34,6 +36,19 @@ void PopulateWithRandomFloatingPointData(Literal* literal) {
       }));
 }
 
+// The standard library does not have a case for bfloat16, unsurprisingly, so we
+// handle that one specially.
+template <>
+void PopulateWithRandomFloatingPointData<bfloat16>(Literal* literal) {
+  CHECK_EQ(literal->shape().element_type(), BF16);
+  std::minstd_rand0 engine;
+  std::uniform_real_distribution<float> generator(0.0f, 1.0f);
+  TF_CHECK_OK(literal->Populate<bfloat16>(
+      [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
+        return static_cast<bfloat16>(generator(engine));
+      }));
+}
+
 template <typename IntT>
 void PopulateWithRandomIntegralData(Literal* literal) {
   CHECK_EQ(literal->shape().element_type(),
@@ -47,42 +62,131 @@ void PopulateWithRandomIntegralData(Literal* literal) {
       }));
 }
 
-bool LooksLikeSum(const HloInstruction& instruction) {
-  return instruction.opcode() == HloOpcode::kAdd &&
-         instruction.operand(0)->opcode() == HloOpcode::kParameter &&
-         instruction.operand(1)->opcode() == HloOpcode::kParameter &&
-         instruction.operand(0) != instruction.operand(1);
+// Matches binary addition computations.
+bool LooksLikeSum(const HloComputation& computation) {
+  const HloInstruction* const root = computation.root_instruction();
+  return root->opcode() == HloOpcode::kAdd &&
+         computation.num_parameters() == 2 &&
+         root->operand(0)->opcode() == HloOpcode::kParameter &&
+         root->operand(1)->opcode() == HloOpcode::kParameter &&
+         root->operand(0) != root->operand(1);
+}
+
+// Reduce, ReduceWindow, and SelectAndScatter ops may use binary addition,
+// which requires an init_value of 0 rather than a random value.
+bool NeedsZeroInitValue(const HloUse& use) {
+  const HloInstruction* const instruction = use.instruction;
+  const HloOpcode opcode = instruction->opcode();
+  const int64 op_num = use.operand_number;
+  return (
+      ((opcode == HloOpcode::kReduce || opcode == HloOpcode::kReduceWindow) &&
+       op_num == 1 && LooksLikeSum(*instruction->to_apply())) ||
+      (opcode == HloOpcode::kSelectAndScatter && op_num == 2 &&
+       LooksLikeSum(*instruction->scatter())));
 }
 
-// Given an instruction and operand number, replace the given operand with
-// a Literal Constant Zero. Handle the case of a fusion instruction by
-// replacing the fusion's parent's parameter with a Literal Constant Zero,
-// unless the fusion's parent is itself a fusion.
-Status MaybeReplaceParameterInputWithZero(HloInstruction* const instruction,
-                                          const int64 operand_number) {
-  CHECK_LT(operand_number, instruction->operand_count());
-  if (instruction->operand(operand_number)->opcode() != HloOpcode::kParameter) {
-    return Status::OK();
+// Generate random values that are constrained to the input_shape minus the
+// output_shape so as not to produce wrapping slices, for instance.
+std::unique_ptr<Literal> MakeRandomNonwrappingSliceIndex(
+    const Shape& input_shape, const Shape& slice_shape) {
+  const int64 rank = ShapeUtil::Rank(input_shape);
+  std::vector<int32> start_indices(rank);
+  std::minstd_rand0 engine;
+  for (int i = 0; i < rank; ++i) {
+    const int32 upper_bound = ShapeUtil::GetDimension(input_shape, i) -
+                              ShapeUtil::GetDimension(slice_shape, i);
+    std::uniform_int_distribution<int32> generator(0, upper_bound);
+    start_indices[i] = generator(engine);
   }
+  return Literal::CreateR1<int32>(start_indices);
+}
 
-  HloComputation* const computation = instruction->parent();
-  std::unique_ptr<HloInstruction> zero = HloInstruction::CreateConstant(
-      MakeUnique<Literal>(Literal::Zero(instruction->shape().element_type())));
+// Use dataflow analysis on each parameter to see if there are uses that would
+// be problematic when generating input data.  Returns the list of instructions
+// that correspond to their uses.
+//
+// Should be paired with the CreateLiteralForConstrainedUses() function below.
+std::vector<HloInstruction*> FindConstrainedUses(
+    const HloDataflowAnalysis& dataflow, const HloInstruction& param) {
+  std::vector<HloInstruction*> constrained_uses;
+  for (const auto& pair : dataflow.GetInstructionValueSet(&param)) {
+    const HloValue& value = dataflow.GetUniqueValueAt(&param, pair.first);
+    for (const HloUse& use : value.uses()) {
+      HloInstruction* instruction = use.instruction;
+      const HloOpcode opcode = instruction->opcode();
+      const int64 op_num = use.operand_number;
+      if ((opcode == HloOpcode::kDynamicSlice && op_num == 1) ||
+          (opcode == HloOpcode::kDynamicUpdateSlice && op_num == 2)) {
+        constrained_uses.push_back(instruction);
+      } else if (opcode == HloOpcode::kFusion) {
+        const HloInstruction* const to_analyze =
+            instruction->fused_parameter(op_num);
+        auto fused_uses = FindConstrainedUses(dataflow, *to_analyze);
+        constrained_uses.insert(constrained_uses.end(), fused_uses.begin(),
+                                fused_uses.end());
+      } else if (NeedsZeroInitValue(use)) {
+        constrained_uses.push_back(instruction);
+      }
+    }
+  }
+  return constrained_uses;
+}
 
-  if (computation->IsFusionComputation()) {
-    HloInstruction* const fusion_instruction = computation->FusionInstruction();
-    if (fusion_instruction->IsFused()) {
-      return Unimplemented(
-          "Unable to replace fused parameter of fusion instruction");
+// Given a parameter, generate a random Literal to use as input if there exist
+// no constrained uses in the dataflow graph.  If such constraints exist,
+// generate a constrained literal (either bounded in the case of indices, or
+// zero in the case of init_values for reductions).
+StatusOr<std::unique_ptr<Literal>> CreateLiteralForConstrainedUses(
+    const tensorflow::gtl::ArraySlice<HloInstruction*> constrained_uses,
+    const HloInstruction& param) {
+  HloInstruction* needs_index = nullptr;
+  HloInstruction* needs_zero = nullptr;
+  for (HloInstruction* use : constrained_uses) {
+    switch (use->opcode()) {
+      case HloOpcode::kDynamicSlice:
+      case HloOpcode::kDynamicUpdateSlice:
+        TF_RET_CHECK(ShapeUtil::Equal(param.shape(), use->operand(0)->shape()));
+        if (needs_index != nullptr &&
+            !ShapeUtil::Equal(needs_index->shape(), use->shape())) {
+          return Unimplemented(
+              "Conflicting operand generation slice index constraints\n");
+        }
+        needs_index = use;
+        break;
+
+      case HloOpcode::kReduce:
+      case HloOpcode::kReduceWindow:
+      case HloOpcode::kSelectAndScatter:
+        needs_zero = use;
+        break;
+
+      default:
+        return Unimplemented(
+            "Constrained operand generation not implemented for %s.",
+            use->ToString().c_str());
     }
-    TF_RETURN_IF_ERROR(fusion_instruction->ReplaceOperandWith(
-        instruction->operand(operand_number)->parameter_number(),
-        fusion_instruction->parent()->AddInstruction(std::move(zero))));
+  }
+  if (needs_index != nullptr && needs_zero != nullptr) {
+    return Unimplemented(
+        "Conflicting operand generation constraints.\nNeeds index: %s\nNeeds "
+        "zero: %s\n",
+        needs_index->ToString().c_str(), needs_zero->ToString().c_str());
+  }
+  if (needs_index != nullptr) {
+    return MakeRandomNonwrappingSliceIndex(param.shape(), needs_index->shape());
+  } else if (needs_zero != nullptr) {
+    return Literal::CreateFromShape(param.shape());
   } else {
-    TF_RETURN_IF_ERROR(instruction->ReplaceOperandWith(
-        operand_number, computation->AddInstruction(std::move(zero))));
+    return MakeFakeLiteral(param.shape());
   }
-  return Status::OK();
+}
+
+// Given a module entry parameter, use the dataflow analysis to see if a
+// special case literal must be created, or if we can generate fake data.
+StatusOr<std::unique_ptr<Literal>> MakeConstrainedArgument(
+    const HloDataflowAnalysis& dataflow, const HloInstruction& param) {
+  const auto constrained_uses = FindConstrainedUses(dataflow, param);
+  return CreateLiteralForConstrainedUses(constrained_uses, param);
 }
 
 }  // namespace
@@ -99,6 +203,9 @@ StatusOr<std::unique_ptr<Literal>> MakeFakeLiteral(const Shape& shape) {
   }
   std::unique_ptr<Literal> literal = Literal::CreateFromShape(shape);
   switch (shape.element_type()) {
+    case BF16:
+      PopulateWithRandomFloatingPointData<bfloat16>(literal.get());
+      break;
     case F32:
       PopulateWithRandomFloatingPointData<float>(literal.get());
       break;
@@ -146,33 +253,17 @@ StatusOr<std::unique_ptr<Literal>> MakeFakeLiteral(const Shape& shape) {
 }
 
 StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
-    const HloModule& module) {
-  std::vector<std::unique_ptr<Literal>> arguments;
-  for (const ShapeLayout& shape_layout :
-       module.config().entry_computation_layout().parameter_layouts()) {
-    TF_ASSIGN_OR_RETURN(auto literal, MakeFakeLiteral(shape_layout.shape()));
-    arguments.push_back(std::move(literal));
+    HloModule* const module) {
+  TF_ASSIGN_OR_RETURN(auto dataflow, HloDataflowAnalysis::Run(module));
+  const auto params = module->entry_computation()->parameter_instructions();
+  std::vector<std::unique_ptr<Literal>> arguments(params.size());
+  for (int i = 0; i < params.size(); ++i) {
+    TF_ASSIGN_OR_RETURN(arguments[i],
+                        MakeConstrainedArgument(*dataflow, *params[i]));
   }
   return std::move(arguments);
 }
 
-Status ReplaceInitsWithConstants(HloModule* const module) {
-  for (HloComputation* const computation : module->computations()) {
-    for (HloInstruction* const instruction : computation->instructions()) {
-      const HloOpcode opcode = instruction->opcode();
-      if ((opcode == HloOpcode::kReduce ||
-           opcode == HloOpcode::kReduceWindow) &&
-          LooksLikeSum(*instruction->to_apply()->root_instruction())) {
-        TF_RETURN_IF_ERROR(MaybeReplaceParameterInputWithZero(instruction, 1));
-      } else if (opcode == HloOpcode::kSelectAndScatter &&
-                 LooksLikeSum(*instruction->scatter()->root_instruction())) {
-        TF_RETURN_IF_ERROR(MaybeReplaceParameterInputWithZero(instruction, 2));
-      }
-    }
-  }
-  return Status::OK();
-}
-
 Status VerifyHloModule(const perftools::gputools::Platform& platform,
                        HloModule* const module) {
   return HloVerifier(
diff --git a/tensorflow/compiler/xla/tests/test_utils.h b/tensorflow/compiler/xla/tests/test_utils.h
index 9aca162a185e5b22888229555b7bce88769c79a6..0fb024ffb074f1c90b75022bc7f5a8b58b03c0c2 100644
--- a/tensorflow/compiler/xla/tests/test_utils.h
+++ b/tensorflow/compiler/xla/tests/test_utils.h
@@ -60,13 +60,11 @@ StatusOr<std::unique_ptr<Literal>> MakeFakeLiteral(const Shape& shape);
 
 // Generates a vector of arguments containing fake data. The number, shape and
 // layout of the arguments is appropriate for given HLO module.
+//
+// Will handle special cases such as making sure that indices used for dynamic
+// slices are bounded, reduces that call adds use 0 as an init value, etc.
 StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
-    const HloModule& module);
-
-// Reductions using Adds, ReduceWindow, and SelectAndScatter, require their
-// init_value to be replaced with the constant 0.0f when testing, otherwise we
-// may generate a bad init_value when looking at the op in isolation.
-Status ReplaceInitsWithConstants(HloModule* const module);
+    HloModule* const module);
 
 // Check that a given module satisfies various constraints before trying to
 // execute it.
diff --git a/tensorflow/compiler/xla/tests/transfer_manager_test.cc b/tensorflow/compiler/xla/tests/transfer_manager_test.cc
index c30cd1b7b8e9be50d33fafb12d70e204e7321864..ed556fafb17cb2d243141783f822400d3c54b459 100644
--- a/tensorflow/compiler/xla/tests/transfer_manager_test.cc
+++ b/tensorflow/compiler/xla/tests/transfer_manager_test.cc
@@ -33,29 +33,27 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace se = ::perftools::gputools;
-
 namespace xla {
-
 namespace {
 
 class TransferManagerTest : public LocalClientTestBase {
  protected:
-  TransferManagerTest() {
-    shape_size_fn_ = [this](const Shape& shape) {
-      return transfer_manager_->GetByteSizeRequirement(shape);
-    };
-  }
+  TransferManagerTest()
+      : shape_size_fn_([this](const Shape& shape) {
+          return transfer_manager_->GetByteSizeRequirement(shape);
+        }) {}
 
-  ~TransferManagerTest() override {}
+  ~TransferManagerTest() override = default;
 
   std::unique_ptr<ScopedShapedBuffer> AllocateDeviceBuffer(const Shape& shape) {
-    return ScopedShapedBuffer::Allocate(
-               shape, GetOrCreateAllocator(local_client_->platform()),
-               /*device_ordinal=*/0, shape_size_fn_)
-        .ConsumeValueOrDie();
+    return transfer_manager_
+        ->AllocateScopedShapedBuffer(
+            shape, GetOrCreateAllocator(local_client_->platform()),
+            /*device_ordinal=*/0)
+        .ValueOrDie();
   }
 
+ private:
   std::function<int64(const Shape&)> shape_size_fn_;
 };
 
@@ -214,6 +212,39 @@ XLA_TEST_F(TransferManagerTest, TransferNestedTuple) {
   LiteralTestUtil::ExpectEqual(*literal, *result);
 }
 
-}  // namespace
+XLA_TEST_F(TransferManagerTest, TransferComplexValue) {
+  std::unique_ptr<Literal> literal = Literal::CreateR1<complex64>(
+      {complex64(1.0f, 2.0f), complex64(42.0f, -123.4f)});
+  auto device_buffer = AllocateDeviceBuffer(literal->shape());
+
+  // Round trip literal through device.
+  ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
+      stream_executor_, *literal, *device_buffer));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
+                          transfer_manager_->TransferLiteralFromDevice(
+                              stream_executor_, *device_buffer));
+
+  LiteralTestUtil::ExpectEqual(*literal, *result);
+}
 
+XLA_TEST_F(TransferManagerTest, TransferComplexValueInTuple) {
+  std::unique_ptr<Literal> literal = Literal::MakeTuple(
+      {Literal::CreateR1<complex64>(
+           {complex64(1.0f, 2.0f), complex64(42.0f, -123.4f)})
+           .get(),
+       Literal::CreateR1<int32>({1, 2, 3, 4, 5, 6}).get(),
+       Literal::CreateR0<complex64>(complex64(0.3f, -0.4f)).get()});
+  auto device_buffer = AllocateDeviceBuffer(literal->shape());
+
+  // Round trip literal through device.
+  ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
+      stream_executor_, *literal, *device_buffer));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
+                          transfer_manager_->TransferLiteralFromDevice(
+                              stream_executor_, *device_buffer));
+
+  LiteralTestUtil::ExpectEqual(*literal, *result);
+}
+
+}  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc
index 4920f17a7ed21d587c15b8deac550d5e5bb566c9..65489cfff19c8fecbdead8a7e295bf9cca56038f 100644
--- a/tensorflow/compiler/xla/tests/tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/tuple_test.cc
@@ -180,7 +180,8 @@ XLA_TEST_F(TupleTest, TupleGTEToTuple) {
   ComputeAndCompareTuple(&builder, *expected, {}, error_spec_);
 }
 
-XLA_TEST_F(TupleTest, SelectBetweenPredTuples) {
+// TODO(b/68395210): GPU does not tolerate ambiguous top-level buffers.
+XLA_TEST_F(TupleTest, DISABLED_ON_GPU(SelectBetweenPredTuples)) {
   ComputationBuilder b(client_, TestName());
   ComputationDataHandle v1, v2;
 
@@ -444,5 +445,61 @@ XLA_TEST_F(TupleTest, GetTupleElementOfNestedTuple) {
   ComputeAndCompareR1<float>(&builder, expected, arguments, ErrorSpec(1e-5));
 }
 
+XLA_TEST_F(TupleTest, ComplexTuples) {
+  ComputationBuilder builder(client_, TestName());
+  {
+    Shape c64r0 = ShapeUtil::MakeShape(C64, {});
+    Shape c64r1 = ShapeUtil::MakeShape(C64, {2});
+    Shape c64r2 = ShapeUtil::MakeShape(C64, {3, 2});
+    Shape arg0_shape = ShapeUtil::MakeTupleShape(
+        {c64r0, ShapeUtil::MakeTupleShape({c64r1, c64r2})});
+    auto input0 = builder.Parameter(0, arg0_shape, "input0");
+    auto t0 = builder.GetTupleElement(input0, 0);
+    auto t1 = builder.GetTupleElement(input0, 1);
+    auto t10 = builder.GetTupleElement(t1, 0);
+    auto t11 = builder.GetTupleElement(t1, 1);
+    auto sum = builder.Add(builder.Add(t10, t11, {1}), t0);
+    auto input1 = builder.Parameter(1, c64r1, "input1");
+    auto prod = builder.Mul(input1, sum, {1});
+    builder.Tuple({builder.Tuple({prod, sum}),
+                   builder.ConstantR0<complex64>({123, 456})});
+  }
+
+  std::unique_ptr<GlobalData> arg0 =
+      client_
+          ->TransferToServer(*Literal::MakeTuple(
+              {Literal::CreateR0<complex64>({1, 2}).get(),
+               Literal::MakeTuple(
+                   {Literal::CreateR1<complex64>({{10, 20}, {30, 40}}).get(),
+                    Literal::CreateR2<complex64>(
+                        {{{100, 200}, {300, 400}},
+                         {{1000, 2000}, {3000, 4000}},
+                         {{10000, 20000}, {30000, 40000}}})
+                        .get()})
+                   .get()}))
+          .ConsumeValueOrDie();
+  std::unique_ptr<GlobalData> arg1 =
+      client_
+          ->TransferToServer(*Literal::CreateR1<complex64>({{1, 2}, {1, -2}}))
+          .ConsumeValueOrDie();
+  auto sum = Literal::CreateR2<complex64>({{{111, 222}, {331, 442}},
+                                           {{1011, 2022}, {3031, 4042}},
+                                           {{10011, 20022}, {30031, 40042}}});
+  auto prod = Literal::CreateFromShape(sum->shape());
+  ASSERT_TRUE(prod->Populate<complex64>(
+                      [&sum](tensorflow::gtl::ArraySlice<int64> indexes) {
+                        return sum->Get<complex64>(indexes) *
+                               (indexes[indexes.size() - 1] == 0
+                                    ? complex64(1, 2)
+                                    : complex64(1, -2));
+                      })
+                  .ok());
+  auto expected =
+      Literal::MakeTuple({Literal::MakeTuple({prod.get(), sum.get()}).get(),
+                          Literal::CreateR0<complex64>({123, 456}).get()});
+  ComputeAndCompareTuple(&builder, *expected, {arg0.get(), arg1.get()},
+                         error_spec_);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc
index 49f673f5f0bf9b844ab4030383784208b4e2c58a..0b3430ee1ee515c2c98c64a947b7a7021c04f22b 100644
--- a/tensorflow/compiler/xla/tests/while_test.cc
+++ b/tensorflow/compiler/xla/tests/while_test.cc
@@ -357,8 +357,7 @@ TEST_F(WhileTest, WhileWithVectorResultIntoTuple) {
   ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.0001));
 }
 
-// TODO(b/63003356): 11-06-2017: fails on all back-ends with incorrect result.
-TEST_F(WhileTest, DISABLED_WhileWithPermutationAndTupleResult) {
+TEST_F(WhileTest, WhileWithPermutationAndTupleResult) {
   std::vector<Shape> shape_elements = {
       ShapeUtil::MakeShape(S32, {}), ShapeUtil::MakeShape(F32, {3}),
       ShapeUtil::MakeShape(F32, {3}), ShapeUtil::MakeShape(F32, {3})};
@@ -411,8 +410,7 @@ TEST_F(WhileTest, DISABLED_WhileWithPermutationAndTupleResult) {
   ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.0001));
 }
 
-// TODO(b/63003356): 11-06-2017: fails on all back-ends with incorrect result.
-TEST_F(WhileTest, DISABLED_WhileWithPermutationAndVectorResult) {
+TEST_F(WhileTest, WhileWithPermutationAndVectorResult) {
   std::vector<Shape> shape_elements = {
       ShapeUtil::MakeShape(S32, {}), ShapeUtil::MakeShape(F32, {3}),
       ShapeUtil::MakeShape(F32, {3}), ShapeUtil::MakeShape(F32, {3})};
@@ -913,8 +911,7 @@ TEST_F(WhileTest, WhileWithPrngScalarResult) {
   }
 }
 
-// TODO(b/34969189) Fails with bad AtomicCmpSwap on GPU on 2017-09-11.
-TEST_F(WhileTest, DISABLED_ON_GPU(WhileThatSwapsParameterWithTupleElement)) {
+TEST_F(WhileTest, WhileThatSwapsParameterWithTupleElement) {
   auto element_shape = ShapeUtil::MakeShape(F32, {2});
 
   ComputationBuilder outer(client_, "outer");
@@ -950,8 +947,7 @@ TEST_F(WhileTest, DISABLED_ON_GPU(WhileThatSwapsParameterWithTupleElement)) {
                          ErrorSpec(1e-6));
 }
 
-// TODO(b/34969189) Fails with bad AtomicCmpSwap on GPU on 2017-09-11.
-TEST_F(WhileTest, DISABLED_ON_GPU(WhileThatSwapsParameterWithBroadcast)) {
+TEST_F(WhileTest, WhileThatSwapsParameterWithBroadcast) {
   auto element_shape = ShapeUtil::MakeShape(F32, {2});
 
   ComputationBuilder outer(client_, "outer");
diff --git a/tensorflow/compiler/xla/tools/parser/BUILD b/tensorflow/compiler/xla/tools/parser/BUILD
index ce936af6c3376387c1ed9fa48da23b8af537f6e5..97aacf6b39f83978e732060817cd93ede81ca782 100644
--- a/tensorflow/compiler/xla/tools/parser/BUILD
+++ b/tensorflow/compiler/xla/tools/parser/BUILD
@@ -34,9 +34,9 @@ cc_library(
     deps = [
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/core:lib",
         "//tensorflow/core:regexp_internal",
     ],
diff --git a/tensorflow/compiler/xla/tools/parser/README.md b/tensorflow/compiler/xla/tools/parser/README.md
index 6232967f5f04cbf316d985357ae84c28335531e2..2e329cc513dfa83070065a34b67b70ec2ca4b2e9 100644
--- a/tensorflow/compiler/xla/tools/parser/README.md
+++ b/tensorflow/compiler/xla/tools/parser/README.md
@@ -1,24 +1,26 @@
 # HloModule string syntax
 
-TODO: Support all subcomputations (for fusion, reduce, ...).
-
-TODO: Support all extra attributes, e.g. dimensions, strides.
-
 ```yacc
 hlo_module
   : 'HloModule' name computations
   ;
 
+/* If no computation is marked as ENTRY, the last computation will be the entry
+computation of the module.*/
 computations
   : computation
   | computation computations
   ;
 
 computation
-  : 'ENTRY' name param_list '->' shape instruction_list
-  | name param_list '->' shape instruction_list
+  : 'ENTRY' name param_list_to_shape instruction_list
+  | name param_list_to_shape instruction_list
+  | 'ENTRY' name instruction_list
+  | name instruction_list
   ;
 
+/* If no instruction is marked as ROOT, the last instruction will be the root of
+its computation. */
 instruction_list
   : '{' instruction_list1 '}'
   ;
@@ -41,6 +43,7 @@ operands1
   ;
 operand
   : shape name
+  | name
   ;
 
 attributes
@@ -60,6 +63,10 @@ attribute_value
   | '{' sub_attributes '}'
   ;
 
+param_list_to_shape
+  : param_list '->' shape
+  ;
+
 param_list
   : '(' param_list1 ')'
   ;
@@ -84,6 +91,7 @@ tuple_elements
 name
   : identifier ':'
   | '%' identifier
+  | identifier
   ;
 
 identifier
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_lexer.cc b/tensorflow/compiler/xla/tools/parser/hlo_lexer.cc
index 56744440db1b17aa1cc8823feb1bad279f8f4f75..6d1e4173d25a032970284fc7abbc3d2ec30b27cd 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_lexer.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_lexer.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include <unordered_map>
 
-#include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -153,15 +152,15 @@ TokKind HloLexer::LexToken() {
   }
 }
 
-// Lex a shape, name, keyword, opcode, attribute name, or the dim labels
-// pattern.
+// Lex a shape, name, keyword, attribute name, the dim labels pattern, and
+// other identifiers.
 //
 // shape    ::= ([a-zA-Z0-9_]*[0-9]*)\[([0-9,]*)\](?:\s*{([0-9,]*)})?
 // name     ::= [a-zA-Z_][a-zA-Z0-9_.-]*:
 // keyword  ::= HloModule, ENTRY, ...
-// opcode   ::= add, greater-than, ...
 // attribute_name ::= condition, body, dimensions, ...
 // dim_labels_pattern ::= [0-9bf]{2,}_[0-9io]{2,}->[0-9bf]{2,}
+// identifiers ::= other cases that match [a-zA-Z_][a-zA-Z0-9_.-]*
 TokKind HloLexer::LexIdentifier() {
   {
     auto consumable = RegexpStringPieceFromPointers(token_start_, buf_.end());
@@ -220,20 +219,6 @@ TokKind HloLexer::LexIdentifier() {
 
 #undef KEYWORD
 
-  // See if this is an opcode.
-  auto opcode = StringToHloOpcode(identifier.ToString());
-  if (opcode.ok()) {
-    opcode_val_ = opcode.ValueOrDie();
-    return TokKind::kOpcode;
-  }
-
-  // See if this is an fusion kind.
-  auto kind = xla::StringToFusionKind(identifier.ToString());
-  if (kind.ok()) {
-    fusion_kind_val_ = kind.ValueOrDie();
-    return TokKind::kFusionKind;
-  }
-
   {
     auto consumable = RegexpStringPieceFromPointers(token_start_, buf_.end());
     static LazyRE2 dim_labels_pattern = {
@@ -244,8 +229,9 @@ TokKind HloLexer::LexIdentifier() {
       return TokKind::kDimLabels;
     }
   }
-  current_ptr_ = token_start_ + 1;
-  return TokKind::kError;
+
+  str_val_ = identifier.ToString();
+  return TokKind::kIdent;
 }
 
 // Lex names after a % character.
@@ -271,7 +257,8 @@ TokKind HloLexer::LexPercent() {
 // fp without exp ::= [-]?([0-9]+[.][0-9]*|[0-9]*[.][0-9]+)
 // dim_labels_pattern ::= [0-9bf]{2,}_[0-9io]{2,}->[0-9bf]{2,}
 // dxd_pattern ::= [0-9]+(x[0-9]+)+
-// pad_pattern ::= [0-9]+_[0-9]+(_[0-9]+)?(x[0-9]+_[0-9]+(_[0-9]+)?)*
+// pad_pattern ::=
+//   [-]?[0-9]+_[-]?[0-9]+(_[0-9]+)?(x[-]?[0-9]+_[-]?[0-9]+(_[0-9]+)?)*
 // int ::=  [-]?[0-9]+
 // negative inf ::= '-inf'
 TokKind HloLexer::LexNumberOrPattern() {
@@ -289,7 +276,7 @@ TokKind HloLexer::LexNumberOrPattern() {
       R"([0-9bf]{2,}_[0-9io]{2,}->[0-9bf]{2,})"};
   static LazyRE2 dxd_pattern = {R"([0-9]+(x[0-9]+)+)"};
   static LazyRE2 pad_pattern = {
-      R"([0-9]+_[0-9]+(_[0-9]+)?(x[0-9]+_[0-9]+(_[0-9]+)?)*)"};
+      R"([-]?[0-9]+_[-]?[0-9]+(_[0-9]+)?(x[-]?[0-9]+_[-]?[0-9]+(_[0-9]+)?)*)"};
 
   if (RE2::Consume(&consumable, *dim_labels_pattern)) {
     current_ptr_ = consumable.begin();
@@ -326,18 +313,43 @@ TokKind HloLexer::LexNumberOrPattern() {
   return TokKind::kError;
 }
 
-StringPiece HloLexer::GetCurrentLine() const {
-  const char* start = token_start_;
-  const char* end = current_ptr_;
-  if (!CanDereference(start) || !CanDereference(end)) {
-    return "LINE OUT OF RANGE";
+std::pair<unsigned, unsigned> HloLexer::GetLineAndColumn(LocTy location) const {
+  unsigned line_no = 1;
+  const char* start = buf_.begin();
+  const char* ptr = start;
+  if (line_no_cache_.last_query && CanDereference(line_no_cache_.last_query) &&
+      line_no_cache_.last_query <= location) {
+    ptr = line_no_cache_.last_query;
+    line_no = line_no_cache_.line_no_of_query;
   }
-  while (start > buf_.begin() && *start != '\n') {
-    start--;
+  for (; ptr != location; ptr++) {
+    if (*ptr == '\n') {
+      line_no++;
+    }
   }
-  while (end < buf_.end() && *end != '\n') {
-    end++;
+
+  // Update the line number cache.
+  line_no_cache_.last_query = ptr;
+  line_no_cache_.line_no_of_query = line_no;
+  size_t line_offset = StringPieceFromPointers(start, ptr).rfind('\n');
+  if (line_offset == StringPiece::npos) {
+    line_offset = 0;
   }
+  return {line_no, ptr - start - line_offset};
+}
+
+StringPiece HloLexer::GetLine(LocTy loc) const {
+  if (!CanDereference(loc)) {
+    return "LINE OUT OF RANGE";
+  }
+  size_t line_start =
+      StringPieceFromPointers(buf_.begin(), loc + 1).rfind('\n');
+  const char* start = line_start == StringPiece::npos
+                          ? buf_.begin()
+                          : buf_.begin() + line_start + 1;
+  size_t line_end = StringPieceFromPointers(loc, buf_.end()).find('\n');
+  const char* end = line_end == StringPiece::npos ? buf_.end() : loc + line_end;
+
   return StringPieceFromPointers(start, end);
 }
 
@@ -428,14 +440,12 @@ string TokKindToString(TokKind kind) {
       return "kDxD";
     case TokKind::kPad:
       return "kPad";
+    case TokKind::kIdent:
+      return "kIdent";
     case TokKind::kString:
       return "kString";
     case TokKind::kShape:
       return "kShape";
-    case TokKind::kOpcode:
-      return "kOpcode";
-    case TokKind::kFusionKind:
-      return "kFusionKind";
     case TokKind::kInt:
       return "kInt";
     case TokKind::kDecimal:
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_lexer.h b/tensorflow/compiler/xla/tools/parser/hlo_lexer.h
index 5c9d1bf3912584040dc5260cc6730247d439fd60..27880b9b8afbfa58abfedc3b2cecd5236b78a6d6 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_lexer.h
+++ b/tensorflow/compiler/xla/tools/parser/hlo_lexer.h
@@ -18,9 +18,8 @@ limitations under the License.
 
 #include <string>
 
-#include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/tools/parser/hlo_token.h"
+#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/logging.h"
@@ -48,6 +47,7 @@ class HloLexer {
       case TokKind::kDxD:
       case TokKind::kPad:
       case TokKind::kString:
+      case TokKind::kIdent:
         return str_val_;
       default:
         LOG(FATAL) << "This token does not have string value";
@@ -57,14 +57,6 @@ class HloLexer {
     CHECK(GetKind() == TokKind::kShape);
     return shape_val_;
   }
-  HloOpcode GetOpcodeVal() const {
-    CHECK(GetKind() == TokKind::kOpcode);
-    return opcode_val_;
-  }
-  HloInstruction::FusionKind GetFusionKindVal() const {
-    CHECK(GetKind() == TokKind::kFusionKind);
-    return fusion_kind_val_;
-  }
   int64 GetInt64Val() const {
     CHECK(GetKind() == TokKind::kInt);
     return int64_val_;
@@ -74,8 +66,16 @@ class HloLexer {
     return decimal_val_;
   }
 
-  // Returns the line of text that is currently being lexed.
-  tensorflow::StringPiece GetCurrentLine() const;
+  typedef const char* LocTy;
+
+  // Returns the location of the current token.
+  LocTy GetLoc() const { return token_start_; }
+
+  // Returns the line and column of a location in the buffer.
+  std::pair<unsigned, unsigned> GetLineAndColumn(LocTy location) const;
+
+  // Returns the whole line given the location.
+  tensorflow::StringPiece GetLine(LocTy loc) const;
 
  private:
   // Returns the current character. If it's neither the end of input buffer nor
@@ -114,10 +114,15 @@ class HloLexer {
   TokKind current_kind_;
   string str_val_;
   Shape shape_val_;
-  HloOpcode opcode_val_;
-  HloInstruction::FusionKind fusion_kind_val_;
   int64 int64_val_;
   double decimal_val_;
+
+  struct LineNoCacheTy {
+    const char* last_query;
+    unsigned line_no_of_query;
+  };
+  // This caches the line number of the previous query.
+  mutable LineNoCacheTy line_no_cache_{nullptr, 0};
 };
 
 }  // namespace tools
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
index 2112b3e710a4543d14f0e31243aef74dc6943b54..68fb9dd9ec8fa60b68906448ef55aa669c2506cb 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
@@ -40,6 +41,8 @@ const double kF16max = 65504;
 // Parser for the HloModule::ToString() format text.
 class HloParser {
  public:
+  using LocTy = HloLexer::LocTy;
+
   explicit HloParser(StringPiece str, const HloModuleConfig& config)
       : lexer_(str), config_(config) {}
 
@@ -56,7 +59,7 @@ class HloParser {
   // ParseXXX returns false if an error occurred.
   bool ParseHloModule();
   bool ParseComputations();
-  bool ParseComputation();
+  bool ParseComputation(HloComputation** entry_computation);
   bool ParseInstructionList(HloComputation::Builder* builder,
                             string* root_name);
   bool ParseInstruction(HloComputation::Builder* builder, string* root_name);
@@ -104,6 +107,7 @@ class HloParser {
     kPaddingConfig,
     kMetadata,
     kFusionKind,
+    kDistribution,
   };
 
   struct AttrConfig {
@@ -167,6 +171,7 @@ class HloParser {
   bool ParseInt64List(const TokKind start, const TokKind end,
                       const TokKind delim, std::vector<int64>* result);
 
+  bool ParseParamListToShape(Shape* shape, LocTy* shape_loc);
   bool ParseParamList();
   bool ParseName(string* result);
   bool ParseAttributeName(string* result);
@@ -174,13 +179,21 @@ class HloParser {
   bool ParseShape(Shape* result);
   bool ParseOpcode(HloOpcode* result);
   bool ParseFusionKind(HloInstruction::FusionKind* result);
+  bool ParseRandomDistribution(RandomDistribution* result);
   bool ParseInt64(int64* result);
   bool ParseDouble(double* result);
   bool ParseBool(bool* result);
   bool ParseToken(TokKind kind, const string& msg);
 
+  // Returns true if the current token is the beginning of a shape.
+  bool CanBeShape();
+  // Returns true if the current token is the beginning of a
+  // param_list_to_shape.
+  bool CanBeParamListToShape();
+
   // Logs the current parsing line and the given message. Always returns false.
   bool TokenError(StringPiece msg);
+  bool Error(LocTy loc, StringPiece msg);
 
   // If the current token is 'kind', eats it (i.e. lexes the next token) and
   // returns true.
@@ -191,10 +204,12 @@ class HloParser {
 
   // Adds the instruction to the pool. Returns false and emits an error if the
   // instruction already exists.
-  bool AddInstruction(const string& name, HloInstruction* instruction);
+  bool AddInstruction(const string& name, HloInstruction* instruction,
+                      LocTy name_loc);
   // Adds the computation to the pool. Returns false and emits an error if the
   // computation already exists.
-  bool AddComputation(const string& name, HloComputation* computation);
+  bool AddComputation(const string& name, HloComputation* computation,
+                      LocTy name_loc);
 
   // The map from the instruction name to the instruction. This does not own the
   // instructions.
@@ -203,19 +218,30 @@ class HloParser {
 
   HloLexer lexer_;
   std::unique_ptr<HloModule> module_;
+  std::vector<std::unique_ptr<HloComputation>> computations_;
   const HloModuleConfig config_;
   std::vector<string> error_;
 };
 
-bool HloParser::TokenError(StringPiece msg) {
-  const string error =
-      StrCat("was parsing \"", lexer_.GetCurrentLine(), "\"; token ",
-             TokKindToString(lexer_.GetKind()), "; ", msg);
-  VLOG(1) << "TokenError: " << error;
-  error_.push_back(error);
+bool HloParser::Error(LocTy loc, StringPiece msg) {
+  auto line_col = lexer_.GetLineAndColumn(loc);
+  const unsigned line = line_col.first;
+  const unsigned col = line_col.second;
+  std::vector<string> error_lines;
+  error_lines.push_back(
+      StrCat("was parsing ", line, ":", col, ": error: ", msg));
+  error_lines.push_back(lexer_.GetLine(loc).ToString());
+  error_lines.push_back(col == 0 ? "" : StrCat(string(col - 1, ' '), "^"));
+
+  error_.push_back(tensorflow::str_util::Join(error_lines, "\n"));
+  VLOG(1) << "Error: " << error_.back();
   return false;
 }
 
+bool HloParser::TokenError(StringPiece msg) {
+  return Error(lexer_.GetLoc(), msg);
+}
+
 bool HloParser::Run() {
   lexer_.Lex();
   return ParseHloModule();
@@ -241,27 +267,67 @@ bool HloParser::ParseHloModule() {
 
 // computations ::= (computation)+
 bool HloParser::ParseComputations() {
+  HloComputation* entry_computation = nullptr;
   do {
-    if (!ParseComputation()) {
+    if (!ParseComputation(&entry_computation)) {
       return false;
     }
   } while (lexer_.GetKind() != TokKind::kEof);
+
+  for (int i = 0; i < computations_.size(); i++) {
+    // If entry_computation is not nullptr, it means the computation it pointed
+    // to is marked with "ENTRY"; otherwise, no computation is marked with
+    // "ENTRY", and we use the last computation as the entry computation. We
+    // add the non-entry computations as embedded computations to the module.
+    if ((entry_computation != nullptr &&
+         computations_[i].get() != entry_computation) ||
+        (entry_computation == nullptr && i != computations_.size() - 1)) {
+      module_->AddEmbeddedComputation(std::move(computations_[i]));
+      continue;
+    }
+    auto computation =
+        module_->AddEntryComputation(std::move(computations_[i]));
+    // The parameters and result layouts were set to default layout. Here we
+    // set the layouts to what the hlo text says.
+    for (int p = 0; p < computation->num_parameters(); p++) {
+      const Shape& param_shape = computation->parameter_instruction(p)->shape();
+      if (param_shape.has_layout()) {
+        module_->mutable_entry_computation_layout()
+            ->mutable_parameter_layout(p)
+            ->ResetLayout(param_shape.layout());
+      }
+    }
+    const Shape& result_shape = computation->root_instruction()->shape();
+    if (result_shape.has_layout()) {
+      module_->mutable_entry_computation_layout()
+          ->mutable_result_layout()
+          ->ResetLayout(result_shape.layout());
+    }
+  }
+
   return true;
 }
 
-// computation ::= ('ENTRY')? name param_list '->' shape instruction_list
-bool HloParser::ParseComputation() {
+// computation ::= ('ENTRY')? name (param_list_to_shape)? instruction_list
+bool HloParser::ParseComputation(HloComputation** entry_computation) {
+  LocTy maybe_entry_loc = lexer_.GetLoc();
   const bool is_entry_computation = EatIfPresent(TokKind::kw_ENTRY);
+
   string name;
+  LocTy name_loc = lexer_.GetLoc();
   if (!ParseName(&name)) {
     return false;
   }
   auto builder = MakeUnique<HloComputation::Builder>(name);
 
+  LocTy shape_loc = nullptr;
   Shape shape;
+  if (CanBeParamListToShape() && !ParseParamListToShape(&shape, &shape_loc)) {
+    return false;
+  }
+
   string root_name;
-  if (!ParseParamList() || !ParseToken(TokKind::kArrow, "expects '->'") ||
-      !ParseShape(&shape) || !ParseInstructionList(builder.get(), &root_name)) {
+  if (!ParseInstructionList(builder.get(), &root_name)) {
     return false;
   }
 
@@ -273,14 +339,37 @@ bool HloParser::ParseComputation() {
     LOG(FATAL) << "instruction " << root_name
                << " was marked as ROOT but the parser has not seen it before";
   }
+
   // Now root can be either an existing instruction or a nullptr. If it's a
   // nullptr, the implementation of Builder will set the last instruction as
   // root instruction.
-  HloComputation* computation =
-      is_entry_computation
-          ? module_->AddEntryComputation(builder->Build(root))
-          : module_->AddEmbeddedComputation(builder->Build(root));
-  return AddComputation(name, computation);
+  computations_.emplace_back(builder->Build(root));
+  HloComputation* computation = computations_.back().get();
+
+  if (!root) {
+    root = computation->root_instruction();
+  } else {
+    CHECK_EQ(root, computation->root_instruction());
+  }
+
+  // If param_list_to_shape was present, check compatibility.
+  if (shape_loc != nullptr && !ShapeUtil::Compatible(root->shape(), shape)) {
+    return Error(
+        shape_loc,
+        StrCat("Shape of computation ", name, ", ",
+               ShapeUtil::HumanString(shape),
+               ", is not compatible with that of its root instruction ",
+               root_name, ", ", ShapeUtil::HumanString(root->shape())));
+  }
+
+  if (is_entry_computation) {
+    if (*entry_computation != nullptr) {
+      return Error(maybe_entry_loc, "expects only one ENTRY");
+    }
+    *entry_computation = computation;
+  }
+
+  return AddComputation(name, computation, name_loc);
 }
 
 // instruction_list ::= '{' instruction_list1 '}'
@@ -307,13 +396,21 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
   Shape shape;
   HloOpcode opcode;
   std::vector<HloInstruction*> operands;
+
+  LocTy maybe_root_loc = lexer_.GetLoc();
   bool is_root = EatIfPresent(TokKind::kw_ROOT);
+
+  const LocTy name_loc = lexer_.GetLoc();
   if (!ParseName(&name) ||
       !ParseToken(TokKind::kEqual, "expects '=' in instruction") ||
       !ParseShape(&shape) || !ParseOpcode(&opcode)) {
     return false;
   }
+
   if (is_root) {
+    if (!root_name->empty()) {
+      return Error(maybe_root_loc, "one computation should have only one ROOT");
+    }
     *root_name = name;
   }
 
@@ -434,13 +531,21 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
           HloInstruction::CreateConvert(shape, operands[0]));
       break;
     }
-    case HloOpcode::kCrossReplicaSum: {
+    case HloOpcode::kBitcastConvert: {
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
           !ParseAttributes(attrs)) {
         return false;
       }
       instruction = builder->AddInstruction(
-          HloInstruction::CreateCrossReplicaSum(shape, operands[0]));
+          HloInstruction::CreateBitcastConvert(shape, operands[0]));
+      break;
+    }
+    case HloOpcode::kCrossReplicaSum: {
+      if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(
+          HloInstruction::CreateCrossReplicaSum(shape, operands));
       break;
     }
     case HloOpcode::kReshape: {
@@ -549,13 +654,16 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     case HloOpcode::kReduceWindow: {
       optional<HloComputation*> reduce_computation;
       optional<Window> window;
-      attrs["window"] = {/*required=*/true, AttrTy::kWindow, &window};
+      attrs["window"] = {/*required=*/false, AttrTy::kWindow, &window};
       attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation,
                            &reduce_computation};
       if (!ParseOperands(&operands, /*expected_size=*/2) ||
           !ParseAttributes(attrs)) {
         return false;
       }
+      if (!window) {
+        window.emplace();
+      }
       instruction = builder->AddInstruction(HloInstruction::CreateReduceWindow(
           shape, /*operand=*/operands[0], /*init_value=*/operands[1], *window,
           *reduce_computation));
@@ -564,13 +672,16 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     case HloOpcode::kConvolution: {
       optional<Window> window;
       optional<ConvolutionDimensionNumbers> dnums;
-      attrs["window"] = {/*required=*/true, AttrTy::kWindow, &window};
+      attrs["window"] = {/*required=*/false, AttrTy::kWindow, &window};
       attrs["dim_labels"] = {/*required=*/true,
                              AttrTy::kConvolutionDimensionNumbers, &dnums};
       if (!ParseOperands(&operands, /*expected_size=*/2) ||
           !ParseAttributes(attrs)) {
         return false;
       }
+      if (!window) {
+        window.emplace();
+      }
       instruction = builder->AddInstruction(HloInstruction::CreateConvolve(
           shape, /*lhs=*/operands[0], /*rhs=*/operands[1], *window, *dnums));
       break;
@@ -644,11 +755,14 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       optional<HloComputation*> scatter;
       attrs["scatter"] = {/*required=*/true, AttrTy::kHloComputation, &scatter};
       optional<Window> window;
-      attrs["window"] = {/*required=*/true, AttrTy::kWindow, &window};
+      attrs["window"] = {/*required=*/false, AttrTy::kWindow, &window};
       if (!ParseOperands(&operands, /*expected_size=*/3) ||
           !ParseAttributes(attrs)) {
         return false;
       }
+      if (!window) {
+        window.emplace();
+      }
       instruction =
           builder->AddInstruction(HloInstruction::CreateSelectAndScatter(
               shape, /*operand=*/operands[0], *select, *window,
@@ -798,15 +912,69 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
           shape, operands[0], config ? *config : ""));
       break;
     }
-    case HloOpcode::kConditional:
-    case HloOpcode::kCustomCall:
-    case HloOpcode::kReducePrecision:
-    case HloOpcode::kRng:
+    case HloOpcode::kRng: {
+      optional<RandomDistribution> distribution;
+      attrs["distribution"] = {/*required=*/true, AttrTy::kDistribution,
+                               &distribution};
+      if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(
+          HloInstruction::CreateRng(shape, *distribution, operands));
+      break;
+    }
+    case HloOpcode::kReducePrecision: {
+      optional<int64> exponent_bits;
+      optional<int64> mantissa_bits;
+      attrs["exponent_bits"] = {/*required=*/true, AttrTy::kInt64,
+                                &exponent_bits};
+      attrs["mantissa_bits"] = {/*required=*/true, AttrTy::kInt64,
+                                &mantissa_bits};
+      if (!ParseOperands(&operands, /*expected_size=*/1) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction =
+          builder->AddInstruction(HloInstruction::CreateReducePrecision(
+              shape, operands[0], static_cast<int>(*exponent_bits),
+              static_cast<int>(*mantissa_bits)));
+      break;
+    }
+    case HloOpcode::kConditional: {
+      optional<HloComputation*> true_computation;
+      optional<HloComputation*> false_computation;
+      attrs["true_computation"] = {/*required=*/true, AttrTy::kHloComputation,
+                                   &true_computation};
+      attrs["false_computation"] = {/*required=*/true, AttrTy::kHloComputation,
+                                    &false_computation};
+      if (!ParseOperands(&operands, /*expected_size=*/3) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(HloInstruction::CreateConditional(
+          shape, /*pred=*/operands[0],
+          /*true_computation_arg=*/operands[1], *true_computation,
+          /*false_computation_arg=*/operands[2], *false_computation));
+      break;
+    }
+    case HloOpcode::kCustomCall: {
+      optional<string> custom_call_target;
+      attrs["custom_call_target"] = {/*required=*/true, AttrTy::kString,
+                                     &custom_call_target};
+      if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(HloInstruction::CreateCustomCall(
+          shape, operands, *custom_call_target));
+      break;
+    }
     case HloOpcode::kTrace:
       return TokenError(StrCat("parsing not yet implemented for op: ",
                                HloOpcodeString(opcode)));
   }
 
+  instruction->set_name(name);
+
   // Add common attrs (sharding, control predecessors) to the instruction, if
   // they were seen.
   if (sharding) {
@@ -817,15 +985,15 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     for (auto* pre : *predecessors) {
       Status status = pre->AddControlDependencyTo(instruction);
       if (!status.ok()) {
-        return TokenError(StrCat("error adding control dependency for: ", name,
-                                 " status: ", status.ToString()));
+        return Error(name_loc, StrCat("error adding control dependency for: ",
+                                      name, " status: ", status.ToString()));
       }
     }
   }
   if (metadata) {
     instruction->set_metadata(*metadata);
   }
-  return AddInstruction(name, instruction);
+  return AddInstruction(name, instruction, name_loc);
 }  // NOLINT(readability/fn_size)
 
 // ::= '{' (single_sharding | tuple_sharding) '}'
@@ -871,6 +1039,7 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
     return false;
   }
 
+  LocTy loc = lexer_.GetLoc();
   bool maximal = false;
   bool replicated = false;
   std::vector<int64> devices;
@@ -938,34 +1107,35 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
 
   if (replicated) {
     if (!devices.empty()) {
-      return TokenError(
-          "replicated shardings should not have any devices assigned");
+      return Error(loc,
+                   "replicated shardings should not have any devices assigned");
     }
     if (!ShapeUtil::Equal(tile_shape, Shape())) {
-      return TokenError(
-          "replicated shardings should not have any tile shape set");
+      return Error(loc,
+                   "replicated shardings should not have any tile shape set");
     }
     sharding->set_type(OpSharding::Type::OpSharding_Type_REPLICATED);
   } else if (maximal) {
     if (devices.size() != 1) {
-      return TokenError(
-          "maximal shardings should have exactly one device assigned");
+      return Error(loc,
+                   "maximal shardings should have exactly one device assigned");
     }
     if (!ShapeUtil::Equal(tile_shape, Shape())) {
-      return TokenError("maximal shardings should not have any tile shape set");
+      return Error(loc, "maximal shardings should not have any tile shape set");
     }
     sharding->set_type(OpSharding::Type::OpSharding_Type_MAXIMAL);
     sharding->add_tile_assignment_devices(devices[0]);
   } else {
     if (devices.size() <= 1) {
-      return TokenError(
-          "non-maximal shardings must have more than one device assigned");
+      return Error(
+          loc, "non-maximal shardings must have more than one device assigned");
     }
     if (ShapeUtil::Equal(tile_shape, Shape())) {
-      return TokenError("non-maximal shardings should have a tile shape set");
+      return Error(loc, "non-maximal shardings should have a tile shape set");
     }
     if (tile_assignment_dimensions.empty()) {
-      return TokenError(
+      return Error(
+          loc,
           "non-maximal shardings must have a tile assignment list including "
           "dimensions");
     }
@@ -990,10 +1160,11 @@ bool HloParser::ParseInstructionNames(
                   "expects '{' at the beginning of instruction name list")) {
     return false;
   }
+  LocTy loc = lexer_.GetLoc();
   do {
     string name;
     if (!ParseName(&name)) {
-      return TokenError("expects a instruction name");
+      return Error(loc, "expects a instruction name");
     }
     HloInstruction* instr =
         tensorflow::gtl::FindPtrOrNull(instruction_pool_, name);
@@ -1005,7 +1176,7 @@ bool HloParser::ParseInstructionNames(
   } while (EatIfPresent(TokKind::kComma));
 
   return ParseToken(TokKind::kRbrace,
-                    "expects '}' at the end of control instructions");
+                    "expects '}' at the end of instruction name list");
 }
 
 bool HloParser::SetValueInLiteral(int64 value, int64 linear_index,
@@ -1040,6 +1211,8 @@ bool HloParser::SetValueInLiteral(double value, int64 linear_index,
   switch (shape.element_type()) {
     case F16:
       return SetValueInLiteralHelper<half>(value, linear_index, literal);
+    case BF16:
+      return SetValueInLiteralHelper<bfloat16>(value, linear_index, literal);
     case F32:
       return SetValueInLiteralHelper<float>(value, linear_index, literal);
     case F64:
@@ -1078,7 +1251,8 @@ bool HloParser::SetValueInLiteralHelper(ParsedElemT value, int64 linear_index,
        (std::numeric_limits<ParsedElemT>::infinity() == value ||
         -std::numeric_limits<ParsedElemT>::infinity() == value))) {
     // Skip range checking for non-finite value.
-  } else if (literal->shape().element_type() == F16) {
+  } else if (literal->shape().element_type() == F16 ||
+             literal->shape().element_type() == BF16) {
     if (value > kF16max || value < -kF16max) {
       return TokenError(StrCat(
           "value ", value, " is out of range for literal's primitive type ",
@@ -1164,12 +1338,6 @@ bool HloParser::ParseTupleLiteral(std::unique_ptr<Literal>* literal,
 // rank2345 ::= shape nested_array
 bool HloParser::ParseNonTupleLiteral(std::unique_ptr<Literal>* literal,
                                      const Shape& shape) {
-  const int64 size = ShapeUtil::ElementsIn(shape);
-  if (size == 0) {
-    *literal = Literal::CreateFromShape(shape);
-    return true;
-  }
-
   const int64 rank = ShapeUtil::Rank(shape);
   if (rank > 1 && !EatShapeAndCheckCompatible(shape)) {
     return false;
@@ -1270,20 +1438,22 @@ bool HloParser::ParseNonTupleLiteral(std::unique_ptr<Literal>* literal,
           }
           lexer_.Lex();
         } else if (primitive_util::IsIntegralType(shape.element_type())) {
+          LocTy loc = lexer_.GetLoc();
           int64 value;
           if (!ParseInt64(&value)) {
-            return TokenError(StrCat("expects integer for primitive type: ",
+            return Error(loc, StrCat("expects integer for primitive type: ",
                                      PrimitiveType_Name(shape.element_type())));
           }
           if (!SetValueInLiteral(value, linear_index++, literal->get())) {
             return false;
           }
         } else if (primitive_util::IsFloatingPointType(shape.element_type())) {
+          LocTy loc = lexer_.GetLoc();
           double value;
           if (!ParseDouble(&value)) {
-            return TokenError(
-                StrCat("expect floating point value for primitive type: ",
-                       PrimitiveType_Name(shape.element_type())));
+            return Error(
+                loc, StrCat("expect floating point value for primitive type: ",
+                            PrimitiveType_Name(shape.element_type())));
           }
           if (!SetValueInLiteral(value, linear_index++, literal->get())) {
             return false;
@@ -1305,7 +1475,7 @@ bool HloParser::ParseNonTupleLiteral(std::unique_ptr<Literal>* literal,
 // operands1
 //   ::= /*empty*/
 //   ::= operand (, operand)*
-// operand ::= shape name
+// operand ::= (shape)? name
 bool HloParser::ParseOperands(std::vector<HloInstruction*>* operands) {
   if (!ParseToken(TokKind::kLparen,
                   "expects '(' at the beginning of operands")) {
@@ -1315,15 +1485,21 @@ bool HloParser::ParseOperands(std::vector<HloInstruction*>* operands) {
     // empty
   } else {
     do {
-      Shape shape;
+      LocTy loc = lexer_.GetLoc();
       string name;
-      if (!ParseShape(&shape) || !ParseName(&name)) {
+      if (CanBeShape()) {
+        Shape shape;
+        if (!ParseShape(&shape)) {
+          return false;
+        }
+      }
+      if (!ParseName(&name)) {
         return false;
       }
       HloInstruction* instruction =
           tensorflow::gtl::FindPtrOrNull(instruction_pool_, name);
       if (!instruction) {
-        return TokenError(StrCat("instruction does not exist: ", name));
+        return Error(loc, StrCat("instruction does not exist: ", name));
       }
       operands->push_back(instruction);
     } while (EatIfPresent(TokKind::kComma));
@@ -1333,11 +1509,12 @@ bool HloParser::ParseOperands(std::vector<HloInstruction*>* operands) {
 
 bool HloParser::ParseOperands(std::vector<HloInstruction*>* operands,
                               const int expected_size) {
+  LocTy loc = lexer_.GetLoc();
   if (!ParseOperands(operands)) {
     return false;
   }
   if (expected_size != operands->size()) {
-    return TokenError(StrCat("expects ", expected_size, " operands, but has ",
+    return Error(loc, StrCat("expects ", expected_size, " operands, but has ",
                              operands->size(), " operands"));
   }
   return true;
@@ -1346,6 +1523,7 @@ bool HloParser::ParseOperands(std::vector<HloInstruction*>* operands,
 // sub_attributes ::= '{' (','? attribute)* '}'
 bool HloParser::ParseSubAttributes(
     const std::unordered_map<string, AttrConfig>& attrs) {
+  LocTy loc = lexer_.GetLoc();
   if (!ParseToken(TokKind::kLbrace, "expects '{' to start sub attributes")) {
     return false;
   }
@@ -1364,7 +1542,7 @@ bool HloParser::ParseSubAttributes(
   for (const auto& attr_it : attrs) {
     if (attr_it.second.required &&
         seen_attrs.find(attr_it.first) == seen_attrs.end()) {
-      return TokenError(Printf("sub-attribute %s is expected but not seen",
+      return Error(loc, Printf("sub-attribute %s is expected but not seen",
                                attr_it.first.c_str()));
     }
   }
@@ -1374,6 +1552,7 @@ bool HloParser::ParseSubAttributes(
 // attributes ::= (',' attribute)*
 bool HloParser::ParseAttributes(
     const std::unordered_map<string, AttrConfig>& attrs) {
+  LocTy loc = lexer_.GetLoc();
   std::unordered_set<string> seen_attrs;
   while (EatIfPresent(TokKind::kComma)) {
     if (!ParseAttributeHelper(attrs, &seen_attrs)) {
@@ -1384,7 +1563,7 @@ bool HloParser::ParseAttributes(
   for (const auto& attr_it : attrs) {
     if (attr_it.second.required &&
         seen_attrs.find(attr_it.first) == seen_attrs.end()) {
-      return TokenError(Printf("attribute %s is expected but not seen",
+      return Error(loc, Printf("attribute %s is expected but not seen",
                                attr_it.first.c_str()));
     }
   }
@@ -1394,21 +1573,23 @@ bool HloParser::ParseAttributes(
 bool HloParser::ParseAttributeHelper(
     const std::unordered_map<string, AttrConfig>& attrs,
     std::unordered_set<string>* seen_attrs) {
+  LocTy loc = lexer_.GetLoc();
   string name;
   if (!ParseAttributeName(&name)) {
-    return TokenError("error parsing attributes");
+    return Error(loc, "error parsing attributes");
   }
   VLOG(1) << "Parsing attribute " << name;
   if (!seen_attrs->insert(name).second) {
-    return TokenError(Printf("attribute %s already exists", name.c_str()));
+    return Error(loc, Printf("attribute %s already exists", name.c_str()));
   }
   auto attr_it = attrs.find(name);
   if (attr_it == attrs.end()) {
-    return TokenError(Printf("unexpected attribute %s", name.c_str()));
+    return Error(loc, Printf("unexpected attribute %s", name.c_str()));
   }
   AttrTy attr_type = attr_it->second.attr_type;
   void* attr_out_ptr = attr_it->second.result;
   bool success = [&] {
+    LocTy attr_loc = lexer_.GetLoc();
     switch (attr_type) {
       case AttrTy::kInt64: {
         int64 result;
@@ -1424,7 +1605,7 @@ bool HloParser::ParseAttributeHelper(
           return false;
         }
         if (result != static_cast<int32>(result)) {
-          return TokenError("value out of range for int32");
+          return Error(attr_loc, "value out of range for int32");
         }
         static_cast<optional<int32>*>(attr_out_ptr)
             ->emplace(static_cast<int32>(result));
@@ -1437,7 +1618,7 @@ bool HloParser::ParseAttributeHelper(
         }
         if (result > std::numeric_limits<float>::max() ||
             result < std::numeric_limits<float>::lowest()) {
-          return TokenError("value out of range for float");
+          return Error(attr_loc, "value out of range for float");
         }
         static_cast<optional<float>*>(attr_out_ptr)
             ->emplace(static_cast<float>(result));
@@ -1536,22 +1717,32 @@ bool HloParser::ParseAttributeHelper(
         static_cast<optional<OpMetadata>*>(attr_out_ptr)->emplace(result);
         return true;
       }
+      case AttrTy::kDistribution: {
+        RandomDistribution result;
+        if (!ParseRandomDistribution(&result)) {
+          return false;
+        }
+        static_cast<optional<RandomDistribution>*>(attr_out_ptr)
+            ->emplace(result);
+        return true;
+      }
     }
   }();
   if (!success) {
-    return TokenError(Printf("error parsing attribute %s", name.c_str()));
+    return Error(loc, Printf("error parsing attribute %s", name.c_str()));
   }
   return true;
 }
 
 bool HloParser::ParseComputationName(HloComputation** value) {
   string name;
+  LocTy loc = lexer_.GetLoc();
   if (!ParseName(&name)) {
-    return TokenError("expects computation name");
+    return Error(loc, "expects computation name");
   }
   *value = tensorflow::gtl::FindPtrOrNull(computation_pool_, name);
   if (*value == nullptr) {
-    return TokenError(StrCat("computation does not exist: ", name));
+    return Error(loc, StrCat("computation does not exist: ", name));
   }
   return true;
 }
@@ -1560,6 +1751,7 @@ bool HloParser::ParseComputationName(HloComputation** value) {
 // The subattributes can appear in any order. 'size=' is required, others are
 // optional.
 bool HloParser::ParseWindow(Window* window) {
+  LocTy loc = lexer_.GetLoc();
   if (!ParseToken(TokKind::kLbrace, "expected '{' to start window attribute")) {
     return false;
   }
@@ -1569,10 +1761,12 @@ bool HloParser::ParseWindow(Window* window) {
   std::vector<std::vector<int64>> pad;
   std::vector<int64> lhs_dilate;
   std::vector<int64> rhs_dilate;
+  std::vector<int64> rhs_reversal;
   while (lexer_.GetKind() != TokKind::kRbrace) {
+    LocTy attr_loc = lexer_.GetLoc();
     string field_name;
     if (!ParseAttributeName(&field_name)) {
-      return TokenError("expects sub-attributes in window");
+      return Error(attr_loc, "expects sub-attributes in window");
     }
     bool ok = [&] {
       if (field_name == "size") {
@@ -1590,7 +1784,10 @@ bool HloParser::ParseWindow(Window* window) {
       if (field_name == "pad") {
         return ParseWindowPad(&pad);
       }
-      return TokenError(StrCat("unexpected attribute name: ", field_name));
+      if (field_name == "rhs_reversal") {
+        return ParseDxD("rhs_reversal", &rhs_reversal);
+      }
+      return Error(loc, StrCat("unexpected attribute name: ", field_name));
     }();
     if (!ok) {
       return false;
@@ -1598,20 +1795,20 @@ bool HloParser::ParseWindow(Window* window) {
   }
 
   if (size.empty()) {
-    return TokenError(
-        "sub-attribute 'size=' is required in the window attribute");
+    return Error(loc,
+                 "sub-attribute 'size=' is required in the window attribute");
   }
   if (!stride.empty() && stride.size() != size.size()) {
-    return TokenError("expects 'stride=' has the same size as 'size='");
+    return Error(loc, "expects 'stride=' has the same size as 'size='");
   }
   if (!lhs_dilate.empty() && lhs_dilate.size() != size.size()) {
-    return TokenError("expects 'lhs_dilate=' has the same size as 'size='");
+    return Error(loc, "expects 'lhs_dilate=' has the same size as 'size='");
   }
   if (!rhs_dilate.empty() && rhs_dilate.size() != size.size()) {
-    return TokenError("expects 'rhs_dilate=' has the same size as 'size='");
+    return Error(loc, "expects 'rhs_dilate=' has the same size as 'size='");
   }
   if (!pad.empty() && pad.size() != size.size()) {
-    return TokenError("expects 'pad=' has the same size as 'size='");
+    return Error(loc, "expects 'pad=' has the same size as 'size='");
   }
 
   for (int i = 0; i < size.size(); i++) {
@@ -1626,6 +1823,8 @@ bool HloParser::ParseWindow(Window* window) {
         lhs_dilate.empty() ? 1 : lhs_dilate[i]);
     window->mutable_dimensions(i)->set_window_dilation(
         rhs_dilate.empty() ? 1 : rhs_dilate[i]);
+    window->mutable_dimensions(i)->set_window_reversal(
+        rhs_reversal.empty() ? false : (rhs_reversal[i] == 1));
   }
   return ParseToken(TokKind::kRbrace, "expected '}' to end window attribute");
 }
@@ -1673,7 +1872,7 @@ bool HloParser::ParseConvolutionDimensionNumbers(
           StrCat("expects unique lhs dimension numbers, but sees ", lhs));
     }
     for (int i = 0; i < rank - 2; i++) {
-      dnums->add_spatial_dimensions(-1);
+      dnums->add_input_spatial_dimensions(-1);
     }
     for (int i = 0; i < rank; i++) {
       char c = lhs[i];
@@ -1682,7 +1881,7 @@ bool HloParser::ParseConvolutionDimensionNumbers(
       } else if (c == 'f') {
         dnums->set_input_feature_dimension(i);
       } else if (c < '0' + rank && c >= '0') {
-        dnums->set_spatial_dimensions(c - '0', i);
+        dnums->set_input_spatial_dimensions(c - '0', i);
       } else {
         return TokenError(
             Printf("expects [0-%lldbf] in lhs dimension numbers", rank - 1));
@@ -1720,6 +1919,9 @@ bool HloParser::ParseConvolutionDimensionNumbers(
       return TokenError(
           StrCat("expects unique output dimension numbers, but sees ", out));
     }
+    for (int i = 0; i < rank - 2; i++) {
+      dnums->add_output_spatial_dimensions(-1);
+    }
     for (int i = 0; i < rank; i++) {
       char c = out[i];
       if (c == 'b') {
@@ -1727,11 +1929,7 @@ bool HloParser::ParseConvolutionDimensionNumbers(
       } else if (c == 'f') {
         dnums->set_output_feature_dimension(i);
       } else if (c < '0' + rank && c >= '0') {
-        if (dnums->spatial_dimensions(c - '0') != i) {
-          return TokenError(
-              "output spatial dimensions should be the same as input spatial "
-              "dimensions");
-        }
+        dnums->set_output_spatial_dimensions(c - '0', i);
       } else {
         return TokenError(
             Printf("expects [0-%lldbf] in output dimension numbers", rank - 1));
@@ -1772,20 +1970,19 @@ bool HloParser::ParseSliceRanges(SliceRanges* result) {
     return ParseToken(TokKind::kRbrace, "expects '}' to end ranges");
   }
   do {
+    LocTy loc = lexer_.GetLoc();
     ranges.emplace_back();
     if (!ParseInt64List(TokKind::kLsquare, TokKind::kRsquare, TokKind::kColon,
                         &ranges.back())) {
       return false;
     }
-  } while (EatIfPresent(TokKind::kComma));
-
-  for (const auto& range : ranges) {
+    const auto& range = ranges.back();
     if (range.size() != 2 && range.size() != 3) {
-      return TokenError(Printf(
-          "expects [start:limit:step] or [start:limit], but sees %ld elements.",
-          range.size()));
+      return Error(loc, Printf("expects [start:limit:step] or [start:limit], "
+                               "but sees %ld elements.",
+                               range.size()));
     }
-  }
+  } while (EatIfPresent(TokKind::kComma));
 
   for (const auto& range : ranges) {
     result->starts.push_back(range[0]);
@@ -1821,6 +2018,19 @@ bool HloParser::ParseInt64List(const TokKind start, const TokKind end,
       end, StrCat("expects an int64 list to end with ", TokKindToString(end)));
 }
 
+// param_list_to_shape ::= param_list '->' shape
+bool HloParser::ParseParamListToShape(Shape* shape, LocTy* shape_loc) {
+  if (!ParseParamList() || !ParseToken(TokKind::kArrow, "expects '->'")) {
+    return false;
+  }
+  *shape_loc = lexer_.GetLoc();
+  return ParseShape(shape);
+}
+
+bool HloParser::CanBeParamListToShape() {
+  return lexer_.GetKind() == TokKind::kLparen;
+}
+
 // param_list ::= '(' param_list1 ')'
 // param_list1
 //   ::= /*empty*/
@@ -1837,8 +2047,8 @@ bool HloParser::ParseParamList() {
   } else {
     do {
       Shape shape;
-      if (!ParseToken(TokKind::kName, "expects name in parameter") ||
-          !ParseShape(&shape)) {
+      string name;
+      if (!ParseName(&name) || !ParseShape(&shape)) {
         return false;
       }
     } while (EatIfPresent(TokKind::kComma));
@@ -1877,9 +2087,17 @@ bool HloParser::ParseShape(Shape* result) {
   return true;
 }
 
+bool HloParser::CanBeShape() {
+  // A non-tuple shape starts with a kShape token; a tuple shape starts with
+  // '('.
+  return lexer_.GetKind() == TokKind::kShape ||
+         lexer_.GetKind() == TokKind::kLparen;
+}
+
 bool HloParser::ParseName(string* result) {
   VLOG(1) << "ParseName";
-  if (lexer_.GetKind() != TokKind::kName) {
+  if (lexer_.GetKind() != TokKind::kIdent &&
+      lexer_.GetKind() != TokKind::kName) {
     return TokenError("expects name");
   }
   *result = lexer_.GetStrVal();
@@ -1907,15 +2125,16 @@ bool HloParser::ParseString(string* result) {
 }
 
 bool HloParser::ParseDxD(const string& name, std::vector<int64>* result) {
+  LocTy loc = lexer_.GetLoc();
   if (!result->empty()) {
-    return TokenError(
-        Printf("sub-attribute '%s=' already exists", name.c_str()));
+    return Error(loc,
+                 Printf("sub-attribute '%s=' already exists", name.c_str()));
   }
   // 1D
   if (lexer_.GetKind() == TokKind::kInt) {
     int64 number;
     if (!ParseInt64(&number)) {
-      return TokenError(Printf("expects sub-attribute '%s=i'", name.c_str()));
+      return Error(loc, Printf("expects sub-attribute '%s=i'", name.c_str()));
     }
     result->push_back(number);
     return true;
@@ -1924,8 +2143,8 @@ bool HloParser::ParseDxD(const string& name, std::vector<int64>* result) {
   if (lexer_.GetKind() == TokKind::kDxD) {
     string str = lexer_.GetStrVal();
     if (!SplitAndParseAsInts(str, 'x', result)) {
-      return TokenError(
-          Printf("expects sub-attribute '%s=ixj...'", name.c_str()));
+      return Error(loc,
+                   Printf("expects sub-attribute '%s=ixj...'", name.c_str()));
     }
     lexer_.Lex();
     return true;
@@ -1934,8 +2153,9 @@ bool HloParser::ParseDxD(const string& name, std::vector<int64>* result) {
 }
 
 bool HloParser::ParseWindowPad(std::vector<std::vector<int64>>* pad) {
+  LocTy loc = lexer_.GetLoc();
   if (!pad->empty()) {
-    return TokenError("sub-attribute 'pad=' already exists");
+    return Error(loc, "sub-attribute 'pad=' already exists");
   }
   if (lexer_.GetKind() != TokKind::kPad) {
     return TokenError("expects window pad pattern, e.g., '0_0x3_3'");
@@ -1946,8 +2166,8 @@ bool HloParser::ParseWindowPad(std::vector<std::vector<int64>>* pad) {
     std::vector<int64> low_high;
     if (!SplitAndParseAsInts(padding_str[i], '_', &low_high) ||
         low_high.size() != 2) {
-      return TokenError(
-          "expects padding_low and padding_high separated by '_'");
+      return Error(loc,
+                   "expects padding_low and padding_high separated by '_'");
     }
     pad->push_back(low_high);
   }
@@ -1963,15 +2183,16 @@ bool HloParser::ParsePaddingConfig(PaddingConfig* padding) {
   if (lexer_.GetKind() != TokKind::kPad) {
     return TokenError("expects padding config, e.g., '0_0_0x3_3_1'");
   }
+  LocTy loc = lexer_.GetLoc();
   string str = lexer_.GetStrVal();
   std::vector<string> padding_str = Split(str, 'x');
   for (const auto& padding_dim_str : padding_str) {
     std::vector<int64> padding_dim;
     if (!SplitAndParseAsInts(padding_dim_str, '_', &padding_dim) ||
         (padding_dim.size() != 2 && padding_dim.size() != 3)) {
-      return TokenError(
-          "expects padding config pattern like 'low_high_interior' or "
-          "'low_high'");
+      return Error(loc,
+                   "expects padding config pattern like 'low_high_interior' or "
+                   "'low_high'");
     }
     auto* dim = padding->add_dimensions();
     dim->set_edge_padding_low(padding_dim[0]);
@@ -2013,20 +2234,51 @@ bool HloParser::ParseMetadata(OpMetadata* metadata) {
 
 bool HloParser::ParseOpcode(HloOpcode* result) {
   VLOG(1) << "ParseOpcode";
-  if (lexer_.GetKind() != TokKind::kOpcode) {
+  if (lexer_.GetKind() != TokKind::kIdent) {
     return TokenError("expects opcode");
   }
-  *result = lexer_.GetOpcodeVal();
+  string val = lexer_.GetStrVal();
+  auto status_or_result = StringToHloOpcode(val);
+  if (!status_or_result.ok()) {
+    return TokenError(
+        Printf("expects opcode but sees: %s, error: %s", val.c_str(),
+               status_or_result.status().error_message().c_str()));
+  }
+  *result = status_or_result.ValueOrDie();
   lexer_.Lex();
   return true;
 }
 
 bool HloParser::ParseFusionKind(HloInstruction::FusionKind* result) {
   VLOG(1) << "ParseFusionKind";
-  if (lexer_.GetKind() != TokKind::kFusionKind) {
+  if (lexer_.GetKind() != TokKind::kIdent) {
     return TokenError("expects fusion kind");
   }
-  *result = lexer_.GetFusionKindVal();
+  string val = lexer_.GetStrVal();
+  auto status_or_result = StringToFusionKind(val);
+  if (!status_or_result.ok()) {
+    return TokenError(
+        Printf("expects fusion kind but sees: %s, error: %s", val.c_str(),
+               status_or_result.status().error_message().c_str()));
+  }
+  *result = status_or_result.ValueOrDie();
+  lexer_.Lex();
+  return true;
+}
+
+bool HloParser::ParseRandomDistribution(RandomDistribution* result) {
+  VLOG(1) << "ParseRandomDistribution";
+  if (lexer_.GetKind() != TokKind::kIdent) {
+    return TokenError("expects random distribution");
+  }
+  string val = lexer_.GetStrVal();
+  auto status_or_result = StringToRandomDistribution(val);
+  if (!status_or_result.ok()) {
+    return TokenError(
+        Printf("expects random distribution but sees: %s, error: %s",
+               val.c_str(), status_or_result.status().error_message().c_str()));
+  }
+  *result = status_or_result.ValueOrDie();
   lexer_.Lex();
   return true;
 }
@@ -2092,20 +2344,20 @@ bool HloParser::EatIfPresent(TokKind kind) {
   return true;
 }
 
-bool HloParser::AddInstruction(const string& name,
-                               HloInstruction* instruction) {
+bool HloParser::AddInstruction(const string& name, HloInstruction* instruction,
+                               LocTy name_loc) {
   auto result = instruction_pool_.insert({name, instruction});
   if (!result.second) {
-    return TokenError(StrCat("instruction already exists: ", name));
+    return Error(name_loc, StrCat("instruction already exists: ", name));
   }
   return true;
 }
 
-bool HloParser::AddComputation(const string& name,
-                               HloComputation* computation) {
+bool HloParser::AddComputation(const string& name, HloComputation* computation,
+                               LocTy name_loc) {
   auto result = computation_pool_.insert({name, computation});
   if (!result.second) {
-    return TokenError(StrCat("computation already exists: ", name));
+    return Error(name_loc, StrCat("computation already exists: ", name));
   }
   return true;
 }
@@ -2116,7 +2368,7 @@ StatusOr<std::unique_ptr<HloModule>> Parse(StringPiece str,
                                            const HloModuleConfig& config) {
   HloParser parser(str, config);
   if (!parser.Run()) {
-    return InvalidArgument("Syntax error: %s", parser.GetError().c_str());
+    return InvalidArgument("Syntax error:\n%s", parser.GetError().c_str());
   }
   return parser.ConsumeHloModule();
 }
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
index cb02ef84a9295fb100c77f2951e6acf3cce896f1..e6f7ee7c08f4d17a8d8ac58ec4662756b7c7159f 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
@@ -46,7 +46,7 @@ std::vector<TestData> CreateTestCases() {
 // ax + y
 {
 "AxpyParam",
-R"(HloModule axpy_module:
+R"(HloModule axpy_module
 
 ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
   %alpha = f32[] parameter(0)
@@ -62,7 +62,7 @@ ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
 // pred constant
 {
 "ConstantPred",
-R"(HloModule constant_pred_module:
+R"(HloModule constant_pred_module
 
 ENTRY %constant_pred () -> pred[] {
   ROOT %constant = pred[] constant(true), metadata={op_type="const" op_name="\"it\'s not a problem\n" source_file="path/to/test.cc" source_line=68}
@@ -73,7 +73,7 @@ ENTRY %constant_pred () -> pred[] {
 // s32 constant
 {
 "ConstantS32",
-R"(HloModule constant_s32_module:
+R"(HloModule constant_s32_module
 
 ENTRY %constant_s32 () -> s32[] {
   ROOT %constant = s32[] constant(-42)
@@ -84,18 +84,40 @@ ENTRY %constant_s32 () -> s32[] {
 // f32 constant, but the value is not a decimal
 {
 "ConstantF32",
-R"(HloModule ConstantF32_module:
+R"(HloModule ConstantF32_module
 
 ENTRY %ConstantF32.v4 () -> f32[] {
   ROOT %constant = f32[] constant(42)
 }
 
+)"
+},
+// f32 constant, rank 1 empty array.
+{
+"ConstantF32R1Empty",
+R"(HloModule ConstantF32Empty_module
+
+ENTRY %ConstantF32Empty.v4 () -> f32[0] {
+  ROOT %constant = f32[0]{0} constant({})
+}
+
+)"
+},
+// f32 constant, rank 4 empty array.
+{
+"ConstantF32R4Empty",
+R"(HloModule ConstantF32R4Empty_module
+
+ENTRY %ConstantF32R4Empty.v4 () -> f32[2,0,4,3] {
+  ROOT %constant = f32[2,0,4,3]{3,2,1,0} constant(f32[2,0,4,3] { { /*i0=0*/ }, { /*i0=1*/ } })
+}
+
 )"
 },
 // constant 4D
 {
 "Constant4D",
-R"(HloModule Small_3x2x1x1_module:
+R"(HloModule Small_3x2x1x1_module
 
 ENTRY %Small_3x2x1x1.v1 () -> f32[3,2,1,1] {
   ROOT %constant = f32[3,2,1,1]{3,2,1,0} constant(f32[3,2,1,1] { { /*i0=0*/ { /*i1=0*/ {-1} }, { /*i1=1*/ {4.1} } }, { /*i0=1*/ { /*i1=0*/ {2} }, { /*i1=1*/ {4.1} } }, { /*i0=2*/ { /*i1=0*/ {5} }, { /*i1=1*/ {4.4} } } })
@@ -106,7 +128,7 @@ ENTRY %Small_3x2x1x1.v1 () -> f32[3,2,1,1] {
 // non-finite constants: nan, inf, -inf
 {
 "ConstantNonFinite",
-R"(HloModule IsFiniteR1F32s_module:
+R"(HloModule IsFiniteR1F32s_module
 
 ENTRY %IsFiniteR1F32s.v2 () -> pred[6] {
   %constant = f32[6]{0} constant({nan, 7, nan, -1, inf, -inf})
@@ -118,18 +140,29 @@ ENTRY %IsFiniteR1F32s.v2 () -> pred[6] {
 // constant f16
 {
 "ConstantF16",
-R"(HloModule ConstantF16_module:
+R"(HloModule ConstantF16_module
 
 ENTRY %ConstantF16.v4 () -> f16[] {
   ROOT %constant = f16[] constant(500)
 }
 
+)"
+},
+// bf16
+{
+"BF16",
+R"(HloModule BF16
+
+ENTRY %BF16.v4 () -> bf16[] {
+  ROOT %constant = bf16[] constant(500)
+}
+
 )"
 },
 // constant + constant
 {
 "AddConstants",
-R"(HloModule add_constants_module:
+R"(HloModule add_constants_module
 
 ENTRY %add_constants () -> f32[] {
   %constant = f32[] constant(3.14)
@@ -141,7 +174,7 @@ ENTRY %add_constants () -> f32[] {
 // tuple constant
 {
 "TupleConstant",
-R"(HloModule TupleConstant_module:
+R"(HloModule TupleConstant_module
 
 ENTRY %TupleConstant.v1 () -> (f32[2,1], f32[2]) {
   ROOT %constant = (f32[2,1]{1,0}, f32[2]{0}) constant((f32[2,1], f32[2]) ( f32[2,1] { { 1 }, { 2 } }, {2, 42} ))
@@ -152,7 +185,7 @@ ENTRY %TupleConstant.v1 () -> (f32[2,1], f32[2]) {
 // v1 > v2 ? v1 : v2
 {
 "SelectR1F32",
-R"(HloModule SelectR1F32WithCmpR1F32sFromParamsSmall_module:
+R"(HloModule SelectR1F32WithCmpR1F32sFromParamsSmall_module
 
 ENTRY %SelectR1F32WithCmpR1F32sFromParamsSmall.v4 (v1: f32[4], v2: f32[4]) -> f32[4] {
   %v1 = f32[4]{0} parameter(0), sharding={maximal device=1}
@@ -166,7 +199,7 @@ ENTRY %SelectR1F32WithCmpR1F32sFromParamsSmall.v4 (v1: f32[4], v2: f32[4]) -> f3
 // empty tuple
 {
 "EmptyTupleCreate",
-R"(HloModule EmptyTupleCreate_module:
+R"(HloModule EmptyTupleCreate_module
 
 ENTRY %EmptyTupleCreate.v1 () -> () {
   ROOT %tuple = () tuple()
@@ -177,7 +210,7 @@ ENTRY %EmptyTupleCreate.v1 () -> () {
 // tuple
 {
 "TupleCreate",
-R"(HloModule TupleCreate_module:
+R"(HloModule TupleCreate_module
 
 ENTRY %TupleCreate.v4 (v1: f32[], v2: f32[3], v3: f32[2,3]) -> (f32[], f32[3], f32[2,3]) {
   %v1 = f32[] parameter(0)
@@ -190,7 +223,7 @@ ENTRY %TupleCreate.v4 (v1: f32[], v2: f32[3], v3: f32[2,3]) -> (f32[], f32[3], f
 },
 {
 "ShardedTupleCreate",
-R"(HloModule ShardedTupleCreate_module:
+R"(HloModule ShardedTupleCreate_module
 
 ENTRY %ShardedTupleCreate.v4 (v1: f32[], v2: f32[3], v3: f32[2,3]) -> (f32[], f32[3], f32[2,3]) {
   %v1 = f32[] parameter(0)
@@ -205,7 +238,7 @@ ENTRY %ShardedTupleCreate.v4 (v1: f32[], v2: f32[3], v3: f32[2,3]) -> (f32[], f3
 // while (result < 5) { result = result + 1; }
 {
 "WhileWithScalarS32Result",
-R"(HloModule WhileWithScalarS32Result_module:
+R"(HloModule WhileWithScalarS32Result_module
 
 %body.v3 (prev.1: s32[]) -> s32[] {
   %constant = s32[] constant(1)
@@ -229,7 +262,7 @@ ENTRY %WhileWithScalarS32Result.v2 () -> s32[] {
 // send and recv
 {
 "SendRecv",
-R"(HloModule TwoSendRecvBothWayRecvFist_module:
+R"(HloModule TwoSendRecvBothWayRecvFist_module
 
 ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
   %recv = (f32[], u32[]) recv(), channel_id=15, sharding={maximal device=1}
@@ -244,7 +277,7 @@ ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
 // get-tuple-element
 {
 "GetTupleElement",
-R"(HloModule GetTupleElement_module:
+R"(HloModule GetTupleElement_module
 
 ENTRY %GetTupleElement.v4 () -> s32[2,3] {
   %constant = f32[3]{0} constant({1, 2, 3})
@@ -258,7 +291,7 @@ ENTRY %GetTupleElement.v4 () -> s32[2,3] {
 // call
 {
 "Call",
-R"(HloModule CallR0F32IdentityScalar_module:
+R"(HloModule CallR0F32IdentityScalar_module
 
 %Identity.v1 (x: f32[]) -> f32[] {
   ROOT %x = f32[] parameter(0)
@@ -274,7 +307,7 @@ ENTRY %CallR0F32IdentityScalar.v2 () -> f32[] {
 // reduce window
 {
 "ReduceWindow",
-R"(HloModule R4UnitWindow_module:
+R"(HloModule R4UnitWindow_module
 
 %add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] {
   %lhs = f32[] parameter(0)
@@ -288,12 +321,31 @@ ENTRY %R4UnitWindow.v3 (operand: f32[13,12,8,15]) -> f32[13,3,8,15] {
   ROOT %reduce-window = f32[13,3,8,15]{0,3,2,1} reduce-window(f32[13,12,8,15]{0,3,2,1} %operand, f32[] %constant), window={size=1x1x7x1 stride=1x4x1x1 pad=0_0x0_0x3_3x0_0}, to_apply=%add_F32.v3
 }
 
+)"
+},
+// reduce window on scalar
+{
+"ReduceWindowScalar",
+R"(HloModule reduce_window_scalar
+
+%add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] {
+  %lhs = f32[] parameter(0)
+  %rhs = f32[] parameter(1)
+  ROOT %add = f32[] add(f32[] %lhs, f32[] %rhs)
+}
+
+ENTRY %R4UnitWindowScalar () -> f32[] {
+  %constant = f32[] constant(42)
+  %constant.1 = f32[] constant(1)
+  ROOT %reduce-window = f32[] reduce-window(f32[] %constant, f32[] %constant.1), to_apply=%add_F32.v3
+}
+
 )"
 },
 // convolution
 {
 "Convolution",
-R"(HloModule Convolve1D1Window_0_module:
+R"(HloModule Convolve1D1Window_0_module
 
 ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2,1] {
   %input = f32[1,2,1]{2,1,0} parameter(0)
@@ -307,12 +359,25 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2
 // convolution rank 2
 {
 "ConvolutionR2",
-R"(HloModule ConvolveR2_module:
+R"(HloModule ConvolveR2_module
 
 ENTRY %ConvolveR2.v3 (input: f32[1,2], filter: f32[1,1]) -> f32[1,2] {
   %input = f32[1,2]{1,0} parameter(0)
   %filter = f32[1,1]{1,0} parameter(1)
-  ROOT %convolution = f32[1,2]{0,1} convolution(f32[1,2]{1,0} %input, f32[1,1]{1,0} %filter), window={size=1}, dim_labels=bf_io->bf
+  ROOT %convolution = f32[1,2]{0,1} convolution(f32[1,2]{1,0} %input, f32[1,1]{1,0} %filter), dim_labels=bf_io->bf
+}
+
+)"
+},
+// convolution backward
+{
+"ConvolutionBackward",
+R"(HloModule ConvolveBackward_module
+
+ENTRY %ConvolveBackward (input: f32[128,7,7,512], filter: f32[3,3,512,512]) -> f32[128,14,14,512] {
+  %input = f32[128,7,7,512]{0,3,2,1} parameter(0)
+  %filter = f32[3,3,512,512]{3,2,1,0} parameter(1)
+  ROOT %convolution-base-dilated = f32[128,14,14,512]{0,3,2,1} convolution(f32[128,7,7,512]{0,3,2,1} %input, f32[3,3,512,512]{3,2,1,0} %filter), window={size=3x3 pad=1_2x1_2 lhs_dilate=2x2 rhs_reversal=1x1}, dim_labels=b01f_01oi->b01f
 }
 
 )"
@@ -320,7 +385,7 @@ ENTRY %ConvolveR2.v3 (input: f32[1,2], filter: f32[1,1]) -> f32[1,2] {
 // reverse(constant)
 {
 "Reverse4D",
-R"(HloModule Reverse4DFloatArrayOnDim01_module:
+R"(HloModule Reverse4DFloatArrayOnDim01_module
 
 ENTRY %Reverse4DFloatArrayOnDim01.v2 () -> f32[4,3,2,1] {
   %constant = f32[4,3,2,1]{0,1,2,3} constant(f32[4,3,2,1] { { /*i0=0*/ { /*i1=0*/ {1}, {2} }, { /*i1=1*/ {3}, {4} }, { /*i1=2*/ {5}, {6} } }, { /*i0=1*/ { /*i1=0*/ {7}, {8} }, { /*i1=1*/ {9}, {10} }, { /*i1=2*/ {11}, {12} } }, { /*i0=2*/ { /*i1=0*/ {13}, {14} }, { /*i1=1*/ {15}, {16} }, { /*i1=2*/ {17}, {18} } }, { /*i0=3*/ { /*i1=0*/ {19}, {20} }, { /*i1=1*/ {21}, {22} }, { /*i1=2*/ {23}, {24} } } })
@@ -332,7 +397,7 @@ ENTRY %Reverse4DFloatArrayOnDim01.v2 () -> f32[4,3,2,1] {
 // concat
 {
 "Concat",
-R"(HloModule Concat2x3With2x5_module:
+R"(HloModule Concat2x3With2x5_module
 
 ENTRY %Concat2x3With2x5.v3 () -> f32[2,8] {
   %constant = f32[2,3]{1,0} constant(f32[2,3] { { 0, 1, 2 }, { 1000, 1001, 1002 } })
@@ -342,48 +407,36 @@ ENTRY %Concat2x3With2x5.v3 () -> f32[2,8] {
 
 )"
 },
-// map
+// select and scatter
 {
-"Map",
-R"(HloModule MapBinaryAdder_module:
+"SelectAndScatter",
+R"(HloModule R4F32OverlapSmall_module
 
-%add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] {
+%ge_F32.v3 (lhs: f32[], rhs: f32[]) -> pred[] {
   %lhs = f32[] parameter(0)
   %rhs = f32[] parameter(1)
-  ROOT %add = f32[] add(f32[] %lhs, f32[] %rhs)
-}
-
-ENTRY %MapBinaryAdder.v3 (param0: f32[4], param1: f32[4]) -> f32[4] {
-  %param0 = f32[4]{0} parameter(0)
-  %param1 = f32[4]{0} parameter(1)
-  ROOT %map = f32[4]{0} map(f32[4]{0} %param0, f32[4]{0} %param1), to_apply=%add_F32.v3
+  ROOT %greater-than-or-equal-to = pred[] greater-than-or-equal-to(f32[] %lhs, f32[] %rhs)
 }
 
-)"
-},
-// reduce
-{
-"Reduce",
-R"(HloModule ReduceR3ToR2_module:
-
-%add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] {
-  %lhs = f32[] parameter(0)
-  %rhs = f32[] parameter(1)
-  ROOT %add = f32[] add(f32[] %lhs, f32[] %rhs)
+%add_F32.v3 (lhs.1: f32[], rhs.1: f32[]) -> f32[] {
+  %lhs.1 = f32[] parameter(0)
+  %rhs.1 = f32[] parameter(1)
+  ROOT %add = f32[] add(f32[] %lhs.1, f32[] %rhs.1)
 }
 
-ENTRY %ReduceR3ToR2.v3 (input: f32[8,16,256]) -> f32[8,16] {
-  %input = f32[8,16,256]{2,1,0} parameter(0)
-  %constant = f32[] constant(0)
-  ROOT %reduce = f32[8,16]{1,0} reduce(f32[8,16,256]{2,1,0} %input, f32[] %constant), dimensions={2}, to_apply=%add_F32.v3
+ENTRY %R4F32OverlapSmall.v4 () -> f32[4,5,1,1] {
+  %constant = f32[4,5,1,1]{3,2,1,0} constant(f32[4,5,1,1] { { /*i0=0*/ { /*i1=0*/ {7} }, { /*i1=1*/ {2} }, { /*i1=2*/ {5} }, { /*i1=3*/ {3} }, { /*i1=4*/ {8} } }, { /*i0=1*/ { /*i1=0*/ {3} }, { /*i1=1*/ {8} }, { /*i1=2*/ {9} }, { /*i1=3*/ {3} }, { /*i1=4*/ {4} } }, { /*i0=2*/ { /*i1=0*/ {1} }, { /*i1=1*/ {5} }, { /*i1=2*/ {7} }, { /*i1=3*/ {5} }, { /*i1=4*/ {6} } }, { /*i0=3*/ { /*i1=0*/ {0} }, { /*i1=1*/ {6} }, { /*i1=2*/ {2} }, { /*i1=3*/ {10} }, { /*i1=4*/ {2} } } })
+  %constant.1 = f32[2,2,1,1]{3,2,1,0} constant(f32[2,2,1,1] { { /*i0=0*/ { /*i1=0*/ {2} }, { /*i1=1*/ {6} } }, { /*i0=1*/ { /*i1=0*/ {3} }, { /*i1=1*/ {1} } } })
+  %constant.2 = f32[] constant(0)
+  ROOT %select-and-scatter = f32[4,5,1,1]{3,2,1,0} select-and-scatter(f32[4,5,1,1]{3,2,1,0} %constant, f32[2,2,1,1]{3,2,1,0} %constant.1, f32[] %constant.2), window={size=2x3x1x1 stride=2x2x1x1}, select=%ge_F32.v3, scatter=%add_F32.v3
 }
 
 )"
 },
-// select and scatter
+// select and scatter on scalar
 {
-"SelectAndScatter",
-R"(HloModule R4F32OverlapSmall_module:
+"SelectAndScatterScalar",
+R"(HloModule select_and_scatter_scalar
 
 %ge_F32.v3 (lhs: f32[], rhs: f32[]) -> pred[] {
   %lhs = f32[] parameter(0)
@@ -397,11 +450,11 @@ R"(HloModule R4F32OverlapSmall_module:
   ROOT %add = f32[] add(f32[] %lhs.1, f32[] %rhs.1)
 }
 
-ENTRY %R4F32OverlapSmall.v4 () -> f32[4,5,1,1] {
-  %constant = f32[4,5,1,1]{3,2,1,0} constant(f32[4,5,1,1] { { /*i0=0*/ { /*i1=0*/ {7} }, { /*i1=1*/ {2} }, { /*i1=2*/ {5} }, { /*i1=3*/ {3} }, { /*i1=4*/ {8} } }, { /*i0=1*/ { /*i1=0*/ {3} }, { /*i1=1*/ {8} }, { /*i1=2*/ {9} }, { /*i1=3*/ {3} }, { /*i1=4*/ {4} } }, { /*i0=2*/ { /*i1=0*/ {1} }, { /*i1=1*/ {5} }, { /*i1=2*/ {7} }, { /*i1=3*/ {5} }, { /*i1=4*/ {6} } }, { /*i0=3*/ { /*i1=0*/ {0} }, { /*i1=1*/ {6} }, { /*i1=2*/ {2} }, { /*i1=3*/ {10} }, { /*i1=4*/ {2} } } })
-  %constant.1 = f32[2,2,1,1]{3,2,1,0} constant(f32[2,2,1,1] { { /*i0=0*/ { /*i1=0*/ {2} }, { /*i1=1*/ {6} } }, { /*i0=1*/ { /*i1=0*/ {3} }, { /*i1=1*/ {1} } } })
-  %constant.2 = f32[] constant(0)
-  ROOT %select-and-scatter = f32[4,5,1,1]{3,2,1,0} select-and-scatter(f32[4,5,1,1]{3,2,1,0} %constant, f32[2,2,1,1]{3,2,1,0} %constant.1, f32[] %constant.2), window={size=2x3x1x1 stride=2x2x1x1}, select=%ge_F32.v3, scatter=%add_F32.v3
+ENTRY %SelectAndScatterScalar () -> f32[] {
+  %constant = f32[] constant(42)
+  %constant.1 = f32[] constant(1)
+  %constant.2 = f32[] constant(2)
+  ROOT %select-and-scatter = f32[] select-and-scatter(f32[] %constant, f32[] %constant.1, f32[] %constant.2), select=%ge_F32.v3, scatter=%add_F32.v3
 }
 
 )"
@@ -409,7 +462,7 @@ ENTRY %R4F32OverlapSmall.v4 () -> f32[4,5,1,1] {
 // slice
 {
 "Slice",
-R"(HloModule slice_module:
+R"(HloModule slice_module
 
 ENTRY %slice.v2 (p0: f32[3,3,4,4]) -> f32[3,3,2,4] {
   %p0 = f32[3,3,4,4]{3,2,1,0} parameter(0)
@@ -421,7 +474,7 @@ ENTRY %slice.v2 (p0: f32[3,3,4,4]) -> f32[3,3,2,4] {
 // slice, no stride
 {
 "SliceNoStride",
-R"(HloModule Slice3x3x3_To_1x3x3_F32_module:
+R"(HloModule Slice3x3x3_To_1x3x3_F32_module
 
 ENTRY %Slice3x3x3_To_1x3x3_F32.v2 () -> f32[1,3,3] {
   %constant = f32[3,3,3]{2,1,0} constant(f32[3,3,3] { { { 0, 1, 2 }, { 3, 4, 5 }, { 6, 7, 8 } }, { { 9, 10, 11 }, { 12, 13, 14 }, { 15, 16, 17 } }, { { 18, 19, 20 }, { 21, 22, 23 }, { 24, 25, 26 } } })
@@ -433,7 +486,7 @@ ENTRY %Slice3x3x3_To_1x3x3_F32.v2 () -> f32[1,3,3] {
 // slice R0
 {
 "SliceR0",
-R"(HloModule SliceR0_module:
+R"(HloModule SliceR0_module
 
 ENTRY %SliceR0.v2 () -> s32[] {
   %constant = s32[] constant(1)
@@ -445,7 +498,7 @@ ENTRY %SliceR0.v2 () -> s32[] {
 // transpose
 {
 "Transpose",
-R"(HloModule Transpose_module:
+R"(HloModule Transpose_module
 
 ENTRY %Transpose.v2 () -> s32[1,2,3] {
   %constant = s32[1,2,3]{2,1,0} constant(s32[1,2,3] { { { 1, 2, 3 }, { 4, 5, 6 } } })
@@ -457,7 +510,7 @@ ENTRY %Transpose.v2 () -> s32[1,2,3] {
 // Dynamic slice
 {
 "DynamicSlice",
-R"(HloModule DynamicSlice_module:
+R"(HloModule DynamicSlice_module
 
 ENTRY %DynamicSlice.v5 (original_parameter: s32[2,2,258], start_index: s32[1]) -> s32[2,2,258] {
   %original_parameter = s32[2,2,258]{2,1,0} parameter(0)
@@ -472,7 +525,7 @@ ENTRY %DynamicSlice.v5 (original_parameter: s32[2,2,258], start_index: s32[1]) -
 // Dynamic update slice
 {
 "DynamicUpdateSlice",
-R"(HloModule DynamicUpdateSlice_module:
+R"(HloModule DynamicUpdateSlice_module
 
 ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_indices: s32[4]) -> s32[1,1,25,1] {
   %input = s32[1,1,25,1]{3,2,1,0} parameter(0)
@@ -486,7 +539,7 @@ ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_
 // batch norm training
 {
 "BatchNormTraining",
-R"(HloModule BasicTraining_module:
+R"(HloModule BasicTraining_module
 
 ENTRY %BasicTraining.v4 () -> (f32[2,2,1,2], f32[2], f32[2]) {
   %constant = f32[2,2,1,2]{3,2,1,0} constant(f32[2,2,1,2] { { /*i0=0*/ { /*i1=0*/ {1, 2} }, { /*i1=1*/ {3, 4} } }, { /*i0=1*/ { /*i1=0*/ {5, 6} }, { /*i1=1*/ {7, 8} } } })
@@ -500,7 +553,7 @@ ENTRY %BasicTraining.v4 () -> (f32[2,2,1,2], f32[2], f32[2]) {
 // batch norm inference
 {
 "BatchNormInference",
-R"(HloModule BatchNormInference_module:
+R"(HloModule BatchNormInference_module
 
 ENTRY %BatchNormInference.v6 (input: f32[2,2,2,2], offset: f32[2], scale: f32[2], mean: f32[2], variance: f32[2]) -> f32[2,2,2,2] {
   %input = f32[2,2,2,2]{3,2,1,0} parameter(0)
@@ -516,7 +569,7 @@ ENTRY %BatchNormInference.v6 (input: f32[2,2,2,2], offset: f32[2], scale: f32[2]
 // batch norm grad
 {
 "BatchNormGrad",
-R"(HloModule BatchNormGrad_module:
+R"(HloModule BatchNormGrad_module
 
 ENTRY %BatchNormGrad.v4 (input: f32[2,2,2,2], scale: f32[2], mean: f32[2], variance: f32[2], grad_output: f32[2,2,2,2]) -> (f32[2,2,2,2], f32[2], f32[2]) {
   %input = f32[2,2,2,2]{3,2,1,0} parameter(0)
@@ -532,7 +585,7 @@ ENTRY %BatchNormGrad.v4 (input: f32[2,2,2,2], scale: f32[2], mean: f32[2], varia
 // pad
 {
 "Pad",
-R"(HloModule Pad1DS3Array_module:
+R"(HloModule Pad1DS3Array_module
 
 ENTRY %Pad1DS3Array.v3 () -> f32[8] {
   %constant = f32[3]{0} constant({1, 2, 3})
@@ -545,7 +598,7 @@ ENTRY %Pad1DS3Array.v3 () -> f32[8] {
 // pad has interior
 {
 "PadHasInterior",
-R"(HloModule PadHasInterior_module:
+R"(HloModule PadHasInterior_module
 
 ENTRY %PadHasInterior.v3 (input: f32[1,25,7,7]) -> f32[1,25,17,11] {
   %input = f32[1,25,7,7]{3,2,1,0} parameter(0)
@@ -553,12 +606,25 @@ ENTRY %PadHasInterior.v3 (input: f32[1,25,7,7]) -> f32[1,25,17,11] {
   ROOT %pad = f32[1,25,17,11]{3,2,1,0} pad(f32[1,25,7,7]{3,2,1,0} %input, f32[] %constant), padding=0_0_0x0_0_0x2_2_1x2_2_0
 }
 
+)"
+},
+// Negative padding
+{
+"PadHasNegativePadding",
+R"(HloModule PadHasNegativePadding_module
+
+ENTRY %PadHasNegativePadding (input: f32[1,25,7,7,10]) -> f32[1,15,6,3,29] {
+  %input = f32[1,25,7,7,10]{4,3,2,1,0} parameter(0)
+  %constant = f32[] constant(-5.123)
+  ROOT %pad = f32[1,15,6,3,29]{4,3,2,1,0} pad(f32[1,25,7,7,10]{4,3,2,1,0} %input, f32[] %constant), padding=0_0_0x0_-10_0x0_-1_0x-2_-2_0x-1_-1_3
+}
+
 )"
 },
 // fusion
 {
 "Fusion",
-R"(HloModule fusion_module:
+R"(HloModule fusion_module
 
 %fused_computation (constant.param_0: f32[3,2,1,1], constant.1.param_1: f32[2]) -> f32[3,2,1,1] {
   %constant.param_0 = f32[3,2,1,1]{3,2,1,0} parameter(0)
@@ -573,22 +639,140 @@ ENTRY %fusion.v3 () -> f32[3,2,1,1] {
   ROOT %fusion = f32[3,2,1,1]{3,2,1,0} fusion(f32[3,2,1,1]{3,2,1,0} %constant, f32[2]{0} %constant.1), kind=kLoop, calls=%fused_computation
 }
 
+)"
+}
+  });
+  // clang-format on
+}
+
+std::vector<TestData> CreateShortTestCases() {
+  // clang-format off
+  return std::vector<TestData>({
+// map
+{
+"Map",
+R"(HloModule MapBinaryAdder_module
+
+add_F32.v3 {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT add = f32[] add(lhs, rhs)
+}
+
+ENTRY MapBinaryAdder.v3 {
+  param0 = f32[4]{0} parameter(0)
+  param1 = f32[4]{0} parameter(1)
+  ROOT map = f32[4]{0} map(param0, param1), to_apply=add_F32.v3
+}
+
+)"
+},
+// reduce
+{
+"Reduce",
+R"(HloModule ReduceR3ToR2_module
+
+add_F32.v3 {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT add = f32[] add(lhs, rhs)
+}
+
+ENTRY ReduceR3ToR2.v3 {
+  input = f32[8,16,256]{2,1,0} parameter(0)
+  constant = f32[] constant(0)
+  ROOT reduce = f32[8,16]{1,0} reduce(input, constant), dimensions={2}, to_apply=add_F32.v3
+}
+
 )"
 },
 // infeed/outfeed
 {
 "InfeedOutfeed",
-R"(HloModule outfeed_module:
+R"(HloModule outfeed_module
+
+ENTRY InfeedToOutfeed {
+  infeed = (u32[3]{0}, pred[]) infeed()
+  outfeed = () outfeed(infeed)
+  ROOT infeed.1 = (u32[3]{0}, pred[]) infeed()
+  outfeed.1 = () outfeed(infeed.1)
+}
+
+)"
+},
+// Rng
+{
+"Rng",
+R"(HloModule rng_module
 
-ENTRY %InfeedToOutfeed () -> (u32[3], pred[]) {
-  %infeed = (u32[3]{0}, pred[]) infeed()
-  %outfeed = () outfeed((u32[3]{0}, pred[]) %infeed)
-  ROOT %infeed.1 = (u32[3]{0}, pred[]) infeed()
-  %outfeed.1 = () outfeed((u32[3]{0}, pred[]) %infeed.1)
+ENTRY Rng {
+  constant = f32[] constant(0)
+  constant.1 = f32[] constant(1)
+  ROOT rng = f32[8]{0} rng(constant, constant.1), distribution=rng_uniform
 }
 
 )"
+},
+// Reduce precision
+{
+"ReducePrevison",
+R"(HloModule reduce_precision
+
+ENTRY ReducePrecision {
+  constant = f32[1]{0} constant({3.14159})
+  ROOT reduce-precision = f32[1]{0} reduce-precision(constant), exponent_bits=8, mantissa_bits=10
 }
+
+)"
+},
+// Conditional
+{
+"Conditional",
+R"(HloModule conditional
+
+Negate {
+  x = f32[] parameter(0)
+  ROOT negate = f32[] negate(x)
+}
+
+Identity {
+  y = f32[] parameter(0)
+  ROOT copy = f32[] copy(y)
+}
+
+ENTRY Parameters1.v4 {
+  constant = pred[] constant(true)
+  constant.1 = f32[] constant(56)
+  constant.2 = f32[] constant(12)
+  ROOT conditional = f32[] conditional(constant, constant.1, constant.2), true_computation=Negate, false_computation=Identity
+}
+
+)"
+},
+// CustomCall
+{
+"CustomCall",
+R"(HloModule custom_call
+
+ENTRY CustomCall {
+  constant = f32[1]{0} constant({12345})
+  ROOT custom-call = f32[1,2,3]{0,2,1} custom-call(constant), custom_call_target="foo\"bar"
+}
+
+)"
+},
+// Variables with non-default names
+{
+"NonDefaultNames",
+R"(HloModule add_constants_module
+
+ENTRY add_constants {
+  foo = f32[] constant(3.14)
+  ROOT bar = f32[] add(foo, foo)
+}
+
+)"
+},
   });
   // clang-format on
 }
@@ -607,18 +791,35 @@ class HloParserTest : public ::testing::Test,
   void ExpectEqual() {
     const string& original = GetParam().module_string;
     auto result = Parse(original);
-    TF_EXPECT_OK(result.status());
+    TF_ASSERT_OK(result.status());
+    EXPECT_EQ(original, result.ValueOrDie()->ToString(
+                            HloPrintOptions().set_print_large_constants(true)));
+  }
+};
+
+class HloParserShortTest : public HloParserTest {
+ protected:
+  void ExpectEqualShort() {
+    const string& original = GetParam().module_string;
+    auto result = Parse(original);
+    TF_ASSERT_OK(result.status());
     EXPECT_EQ(original,
-              result.ValueOrDie()->ToString(/*include_large_constants=*/true));
+              result.ValueOrDie()->ToString(HloPrintOptions::ShortParsable()));
   }
 };
 
 TEST_P(HloParserTest, Run) { ExpectEqual(); }
 
+TEST_P(HloParserShortTest, Run) { ExpectEqualShort(); }
+
 INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation, HloParserTest,
                         ::testing::ValuesIn(CreateTestCases()),
                         TestDataToString);
 
+INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation, HloParserShortTest,
+                        ::testing::ValuesIn(CreateShortTestCases()),
+                        TestDataToString);
+
 TEST_F(HloParserTest, Empty) {
   const string original = "";
   auto result = Parse(original);
@@ -682,7 +883,7 @@ ENTRY %blabla (x: f32[]) -> pred[] {
 }
 
 TEST_F(HloParserTest, MoreConstants) {
-  const string original = R"(HloModule SelectScalarS32True_module:
+  const string original = R"(HloModule SelectScalarS32True_module
 
 ENTRY %SelectScalarS32True.v4 () -> s32[] {
   %constant.2 = pred[] constant(true)
@@ -699,7 +900,7 @@ ENTRY %SelectScalarS32True.v4 () -> s32[] {
 }
 
 TEST_F(HloParserTest, LiteralDimensionsMismatch_1) {
-  const string original = R"(HloModule some_2_module:
+  const string original = R"(HloModule some_2_module
 
 ENTRY %some_2 () -> f32[2] {
   ROOT %constant = f32[2]{0} constant({1,{2}})
@@ -713,7 +914,7 @@ ENTRY %some_2 () -> f32[2] {
 }
 
 TEST_F(HloParserTest, LiteralDimensionsMismatch_2) {
-  const string original = R"(HloModule some_2x3_module:
+  const string original = R"(HloModule some_2x3_module
 
 ENTRY %some_2x3 () -> f32[2,3] {
   ROOT %constant = f32[2,3]{1,0} constant(f32[2,3] {1, 2, 3, 4, 5, 6})
@@ -727,7 +928,7 @@ ENTRY %some_2x3 () -> f32[2,3] {
 }
 
 TEST_F(HloParserTest, LiteralDimensionsMismatch_3) {
-  const string original = R"(HloModule some_2x3x2_module:
+  const string original = R"(HloModule some_2x3x2_module
 
 ENTRY %some_2x3x2 () -> f32[2,3,2] {
   ROOT %constant = f32[2,3,2]{2,1,0} constant(f32[2,3,2] {{{1, 2}, {3, 4}, {5, 6}, {7, 8}, {9, 10}, {11, 12}}})
@@ -742,7 +943,7 @@ ENTRY %some_2x3x2 () -> f32[2,3,2] {
 
 TEST_F(HloParserTest, ConstantF16Overflow) {
   const string original =
-      R"(HloModule ConstantF16Overflow_module:
+      R"(HloModule ConstantF16Overflow_module
 
 ENTRY %ConstantF16Overflow.v4 () -> f16[] {
   ROOT %constant = f16[] constant(-65505)
@@ -756,7 +957,7 @@ ENTRY %ConstantF16Overflow.v4 () -> f16[] {
 }
 
 TEST_F(HloParserTest, ConstantWithExp) {
-  const string original = R"(HloModule ConstantWithExp_module:
+  const string original = R"(HloModule ConstantWithExp_module
 
 ENTRY %ConstantWithExp.v4 () -> f32[] {
   %constant.1 = f32[] constant(3e+2)
@@ -771,7 +972,7 @@ ENTRY %ConstantWithExp.v4 () -> f32[] {
 }
 
 TEST_F(HloParserTest, AttibutesAnyOrder) {
-  const string original = R"(HloModule any_order_module:
+  const string original = R"(HloModule any_order_module
 
 ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2,1] {
   %input = f32[1,2,1]{2,1,0} parameter(0)
@@ -785,7 +986,7 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2
 }
 
 TEST_F(HloParserTest, InvalidDimLabels) {
-  string prefix = R"(HloModule invalid_dim_labels_module:
+  string prefix = R"(HloModule invalid_dim_labels_module
 
 ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2,1] {
   %input = f32[1,2,1]{2,1,0} parameter(0)
@@ -806,16 +1007,10 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2
                       .status()
                       .error_message(),
                   "must have the same rank");
-
-  ExpectHasSubstr(Parse(StrCat(prefix, ",dim_labels=0bf_io0->b0f", suffix))
-                      .status()
-                      .error_message(),
-                  "output spatial dimensions should be the same as input "
-                  "spatial dimensions");
 }
 
 TEST_F(HloParserTest, UnexpectedAttribute) {
-  const string original = R"(HloModule unexpected_attr_module:
+  const string original = R"(HloModule unexpected_attr_module
 
 ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
   %recv = (f32[], u32[]) recv(), channel_id=15
@@ -831,7 +1026,7 @@ ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
 }
 
 TEST_F(HloParserTest, MissingAttribute) {
-  const string original = R"(HloModule missing_attr_module:
+  const string original = R"(HloModule missing_attr_module
 
 ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
   %recv = (f32[], u32[]) recv(), channel_id=15
@@ -847,7 +1042,7 @@ ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
 }
 
 TEST_F(HloParserTest, PredecessorUndefined) {
-  const string original = R"(HloModule pre_not_found_module:
+  const string original = R"(HloModule pre_not_found_module
 
 ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
   %recv = (f32[], u32[]) recv(), channel_id=15
@@ -863,7 +1058,7 @@ ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
 }
 
 TEST_F(HloParserTest, SliceAllowOmitStride1) {
-  const string original = R"(HloModule slice_module:
+  const string original = R"(HloModule slice_module
 
 ENTRY %slice.v2 (p0: f32[3,3,4,4]) -> f32[3,3,2,4] {
   %p0 = f32[3,3,4,4]{3,2,1,0} parameter(0)
@@ -875,7 +1070,7 @@ ENTRY %slice.v2 (p0: f32[3,3,4,4]) -> f32[3,3,2,4] {
 }
 
 TEST_F(HloParserTest, PaddingConfigIsNotWindowPad) {
-  const string original = R"(HloModule window_pad_module:
+  const string original = R"(HloModule window_pad_module
 
 ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2,1] {
   %input = f32[1,2,1]{2,1,0} parameter(0)
@@ -890,7 +1085,7 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2
 }
 
 TEST_F(HloParserTest, CommaBetweenSubAttributes) {
-  const string original = R"(HloModule test_comma_module:
+  const string original = R"(HloModule test_comma_module
 
 ENTRY %test_comma.v4 () -> f32[] {
   ROOT %constant = f32[] constant(-4.2), metadata={source_line=5, op_type="::const"}
@@ -900,6 +1095,95 @@ ENTRY %test_comma.v4 () -> f32[] {
   TF_EXPECT_OK(Parse(original).status());
 }
 
+TEST_F(HloParserTest, ComputationShapeDoesNotMatchRootShape) {
+  const string original = R"(HloModule custom_call:
+
+ENTRY %CustomCall () -> f32[1] {
+  %constant = f32[1]{0} constant({12345})
+  ROOT %foo = f32[1,2,3]{0,2,1} custom-call(f32[1]{0} %constant), custom_call_target="foo\"bar"
+})";
+  ExpectHasSubstr(Parse(original).status().error_message(),
+                  "Shape of computation CustomCall, f32[1], is not compatible "
+                  "with that of its root instruction foo, f32[1,2,3]");
+}
+
+TEST_F(HloParserTest, EntryComputationWithLayout) {
+  const string original = R"(HloModule layout:
+add_F32.v3 {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT add = f32[] add(lhs, rhs)
+}
+
+ENTRY %Reduce (input: f32[8,16,256]) -> f32[8,16] {
+  input = f32[8,16,256]{0,1,2} parameter(0)
+  constant = f32[] constant(0)
+  ROOT reduce = f32[8,16]{0,1} reduce(input, constant), dimensions={2}, to_apply=add_F32.v3
+})";
+
+  auto module = Parse(original);
+  TF_ASSERT_OK(module.status());
+  auto program_layout = module.ValueOrDie()->entry_computation_layout();
+  ASSERT_EQ(program_layout.parameter_count(), 1);
+  auto param_layout = program_layout.parameter_layout(0).layout();
+  auto result_layout = program_layout.result_layout().layout();
+  EXPECT_TRUE(
+      LayoutUtil::Equal(LayoutUtil::MakeLayout({0, 1, 2}), param_layout))
+      << "actual layout of parameter(0) is "
+      << LayoutUtil::HumanString(param_layout);
+  EXPECT_TRUE(LayoutUtil::Equal(LayoutUtil::MakeLayout({0, 1}), result_layout))
+      << "actual layout of result is "
+      << LayoutUtil::HumanString(result_layout);
+}
+
+TEST_F(HloParserTest, NoEntry) {
+  const string original = R"(HloModule no_entry:
+c1 {
+  const1 = f32[1]{0} constant({12345})
+}
+c2 {
+  const2 = f32[1]{0} constant({67890})
+})";
+  auto module = Parse(original);
+  TF_ASSERT_OK(module.status());
+  EXPECT_EQ(module.ValueOrDie()->entry_computation()->name(), "c2");
+}
+
+TEST_F(HloParserTest, NoRoot) {
+  const string original = R"(HloModule no_root:
+ENTRY consts {
+  first = f32[1]{0} constant({12345})
+  last = f32[1]{0} constant({67890})
+})";
+  auto module = Parse(original);
+  TF_ASSERT_OK(module.status());
+  EXPECT_EQ(
+      module.ValueOrDie()->entry_computation()->root_instruction()->name(),
+      "last");
+}
+
+TEST_F(HloParserTest, MultipleEntries) {
+  const string original = R"(HloModule multiple_entries:
+ENTRY c1 {
+  const1 = f32[1]{0} constant({12345})
+}
+ENTRY c2 {
+  const2 = f32[1]{0} constant({67890})
+})";
+  ExpectHasSubstr(Parse(original).status().error_message(),
+                  "expects only one ENTRY");
+}
+
+TEST_F(HloParserTest, MultipleRoots) {
+  const string original = R"(HloModule multiple_roots:
+ENTRY consts {
+  ROOT const1 = f32[1]{0} constant({12345})
+  ROOT const2 = f32[1]{0} constant({12345})
+})";
+  ExpectHasSubstr(Parse(original).status().error_message(),
+                  "one computation should have only one ROOT");
+}
+
 }  // namespace
 }  // namespace tools
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_token.h b/tensorflow/compiler/xla/tools/parser/hlo_token.h
index 07e48804d053f31bdff6678f09ee2c1e3b731e0f..7928bee5c2097f353b182095a555c334d7b69c95 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_token.h
+++ b/tensorflow/compiler/xla/tools/parser/hlo_token.h
@@ -18,6 +18,9 @@ limitations under the License.
 
 #include <string>
 
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/platform/types.h"
+
 namespace xla {
 namespace tools {
 
@@ -60,10 +63,9 @@ enum class TokKind {
   kDimLabels,      // [0-9bf]{2,}_[0-9io]{2,}->[0-9bf]{2,}
   kDxD,            // [0-9]+(x[0-9]+)+
   kPad,            // [0-9]+_[0-9]+(_[0-9]+)?(x[0-9]+_[0-9]+(_[0-9]+)?)*
+  kIdent,          // other identifiers
   kString,         // "abcd\"\n"
   kShape,          // f32[2,3]{1,0}
-  kOpcode,         // add
-  kFusionKind,     // kLoop, kOutput, ...
   kInt,            // 42
   kDecimal,        // 4.2
 };
diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index 503e7d456e1f462b753610e8a08a47db7a714ed6..a7dc5862057047f7c56faeb211cc0b13992caec7 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -59,18 +59,26 @@ namespace xla {
 namespace tools {
 namespace {
 
+// Command-line opts to this tool.  See main() for descriptions of these
+// fields.
+struct Options {
+  string fake_infeed_shape;
+  bool use_fake_data = false;
+  bool print_result = true;
+  int num_runs = 1;
+};
+
 // Invokes the given computation passing arbitrary data for every (unbound)
 // parameter if use_fake_data, Otherwise use recorded data if available.
 //
 // Similarly, infeeds fake data of shape fake_infeed_shape if it is provided;
 // otherwise, no infeed is performed.
 StatusOr<std::unique_ptr<Literal>> ReplayComputation(
-    const SessionModule& module, tensorflow::StringPiece fake_infeed_shape,
-    bool use_fake_data, Client* client) {
+    const SessionModule& module, Client* client, const Options& opts) {
   TF_ASSIGN_OR_RETURN(Computation computation, client->LoadSnapshot(module));
 
   std::vector<std::unique_ptr<GlobalData>> arguments;
-  if (use_fake_data) {
+  if (opts.use_fake_data) {
     arguments = MakeFakeArgumentsOrDie(computation, client);
   } else {  // use recorded data if available
     for (const auto& proto : module.arguments()) {
@@ -85,12 +93,12 @@ StatusOr<std::unique_ptr<Literal>> ReplayComputation(
   // concurrent infeed occur via the fake_infeed_shape.
   tensorflow::gtl::optional<tensorflow::thread::ThreadPool> pool;
 
-  if (!fake_infeed_shape.empty()) {
+  if (!opts.fake_infeed_shape.empty()) {
     pool.emplace(tensorflow::Env::Default(), "infeed",
                  /*num_threads=*/1);
-    pool->Schedule([fake_infeed_shape, client]() {
+    pool->Schedule([opts, client]() {
       StatusOr<Shape> shape_status =
-          ShapeUtil::ParseShapeString(fake_infeed_shape);
+          ShapeUtil::ParseShapeString(opts.fake_infeed_shape);
       TF_CHECK_OK(shape_status.status());
       Shape shape = std::move(shape_status).ValueOrDie();
       StatusOr<std::unique_ptr<Literal>> data_status = MakeFakeLiteral(shape);
@@ -107,11 +115,32 @@ StatusOr<std::unique_ptr<Literal>> ReplayComputation(
   for (auto& argument : arguments) {
     execute_arguments.push_back(argument.get());
   }
-  return client->ExecuteAndTransfer(computation, execute_arguments);
+
+  // Run the computation num_runs times, and return the result from the last
+  // execution.
+  std::unique_ptr<Literal> result;
+  for (int i = 0; i < opts.num_runs; ++i) {
+    ExecutionProfile profile;
+    if (opts.print_result) {
+      TF_ASSIGN_OR_RETURN(result, client->ExecuteAndTransfer(
+                                      computation, execute_arguments,
+                                      /*execution_options=*/nullptr, &profile));
+    } else {
+      // If we're not printing the result, execute the computation but don't
+      // bother retrieving the result.  This can be a significant speedup.
+      TF_RETURN_IF_ERROR(client
+                             ->Execute(computation, execute_arguments,
+                                       /*execution_options=*/nullptr, &profile)
+                             .status());
+    }
+    LOG(INFO) << "Execution took "
+              << static_cast<double>(profile.compute_time_ns()) / 1e9 << "s";
+  }
+
+  return std::move(result);
 }
 
-int RealMain(tensorflow::gtl::ArraySlice<char*> args,
-             tensorflow::StringPiece fake_infeed_shape, bool use_fake_data) {
+int RealMain(tensorflow::gtl::ArraySlice<char*> args, const Options& opts) {
   Client* client = ClientLibrary::LocalClientOrDie();
   tensorflow::Env* env = tensorflow::Env::Default();
   int exit_status = EXIT_SUCCESS;
@@ -119,21 +148,24 @@ int RealMain(tensorflow::gtl::ArraySlice<char*> args,
     SessionModule module;
     TF_CHECK_OK(tensorflow::ReadBinaryProto(env, arg, &module));
     StatusOr<std::unique_ptr<Literal>> result_status =
-        ReplayComputation(module, fake_infeed_shape, use_fake_data, client);
+        ReplayComputation(module, client, opts);
     if (!result_status.ok()) {
       fprintf(stderr, "%s: error: %s\n", arg,
               result_status.status().ToString().c_str());
       exit_status = EXIT_FAILURE;
       continue;
     }
+
     std::unique_ptr<Literal> result = result_status.ConsumeValueOrDie();
-    fprintf(stdout, "%s: %s :: %s:%s\n", arg, module.entry().name().c_str(),
-            ShapeUtil::HumanString(result->shape()).c_str(),
-            result->ToString().c_str());
-    if (module.has_result()) {
-      fprintf(stdout, "was %s:%s\n",
-              ShapeUtil::HumanString(module.result().shape()).c_str(),
-              Literal(module.result()).ToString().c_str());
+    if (result != nullptr) {
+      fprintf(stdout, "%s: %s :: %s:%s\n", arg, module.entry().name().c_str(),
+              ShapeUtil::HumanString(result->shape()).c_str(),
+              result->ToString().c_str());
+      if (module.has_result()) {
+        fprintf(stdout, "was %s:%s\n",
+                ShapeUtil::HumanString(module.result().shape()).c_str(),
+                Literal(module.result()).ToString().c_str());
+      }
     }
   }
   return exit_status;
@@ -144,13 +176,15 @@ int RealMain(tensorflow::gtl::ArraySlice<char*> args,
 }  // namespace xla
 
 int main(int argc, char** argv) {
-  // Flags
-  xla::string fake_infeed_shape;
-  bool use_fake_data = false;
+  xla::tools::Options opts;
   const std::vector<tensorflow::Flag> flag_list = {
-      tensorflow::Flag("use_fake_data", &use_fake_data,
+      tensorflow::Flag("use_fake_data", &opts.use_fake_data,
                        "Replay computation using fake data"),
-      tensorflow::Flag("fake_infeed_shape", &fake_infeed_shape,
+      tensorflow::Flag("print_result", &opts.print_result,
+                       "Print the result of the computation to stdout"),
+      tensorflow::Flag("num_runs", &opts.num_runs,
+                       "Number of times to run each computation"),
+      tensorflow::Flag("fake_infeed_shape", &opts.fake_infeed_shape,
                        "Shape of fake data to construct for (infinite) infeed"),
   };
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
@@ -162,5 +196,5 @@ int main(int argc, char** argv) {
 
   tensorflow::gtl::ArraySlice<char*> args(argv, argc);
   args.pop_front();  // Pop off the binary name, argv[0]
-  return xla::tools::RealMain(args, fake_infeed_shape, use_fake_data);
+  return xla::tools::RealMain(args, opts);
 }
diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc
index 2624ef0252fd9482a600fe3aec07f7f328a86d69..fe5d29a6b655a89d559eb1214c2b8dd54d34094c 100644
--- a/tensorflow/compiler/xla/util.cc
+++ b/tensorflow/compiler/xla/util.cc
@@ -42,15 +42,15 @@ Status WithLogBacktrace(const Status& status) {
 
 }  // namespace
 
-ScopedLoggingTimer::ScopedLoggingTimer(const string& label, int32 vlog_level)
-    : label(label), vlog_level(vlog_level) {
-  if (VLOG_IS_ON(vlog_level)) {
+ScopedLoggingTimer::ScopedLoggingTimer(const string& label, bool enabled)
+    : enabled(enabled), label(label) {
+  if (enabled) {
     start_micros = tensorflow::Env::Default()->NowMicros();
   }
 }
 
 ScopedLoggingTimer::~ScopedLoggingTimer() {
-  if (VLOG_IS_ON(vlog_level)) {
+  if (enabled) {
     uint64 end_micros = tensorflow::Env::Default()->NowMicros();
     double secs = (end_micros - start_micros) / 1000000.0;
 
@@ -191,9 +191,9 @@ std::vector<int64> ComposePermutations(tensorflow::gtl::ArraySlice<int64> p1,
   return output;
 }
 
-bool IsIdentityPermutation(tensorflow::gtl::ArraySlice<int64> p) {
-  for (int64 i = 0; i < p.size(); ++i) {
-    if (p[i] != i) {
+bool IsIdentityPermutation(tensorflow::gtl::ArraySlice<int64> permutation) {
+  for (int64 i = 0; i < permutation.size(); ++i) {
+    if (permutation[i] != i) {
       return false;
     }
   }
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index f58f57b44396c90a3820835a3d0ecc792aaa7cd0..b722095d1f38bf8a984c3ce9092a65f8e0baa911 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -50,13 +50,43 @@ using DimensionVector = tensorflow::gtl::InlinedVector<int64, kInlineRank>;
 // RAII timer that logs with a given label the wall clock time duration in human
 // readable form. This differs from base's ElapsedTimer primarily in that it
 // spits out the human-readable duration form.
+//
+// By default, the timing traces are only printed at VLOG(1) and above:
+//
+//   XLA_SCOPED_LOGGING_TIMER("fooing bar");  // nop if !VLOG_IS_ON(1).
+//
+// but you can control this via:
+//
+//   XLA_SCOPED_LOGGING_TIMER_LEVEL("fooing bar", 2);  // nop if !VLOG_IS_ON(2)
+//
+#define XLA_SCOPED_LOGGING_TIMER(label) \
+  XLA_SCOPED_LOGGING_TIMER_HELPER(label, 1, __COUNTER__)
+#define XLA_SCOPED_LOGGING_TIMER_LEVEL(label, level) \
+  XLA_SCOPED_LOGGING_TIMER_HELPER(label, level, __COUNTER__)
+
+// Helper for implementing macros above.  Do not use directly.
+//
+// Forces the evaluation of "counter", which we expect is equal to __COUNTER__.
+#define XLA_SCOPED_LOGGING_TIMER_HELPER(label, level, counter) \
+  XLA_SCOPED_LOGGING_TIMER_HELPER2(label, level, counter)
+
+// Helper for macros above.  Don't use directly.
+#define XLA_SCOPED_LOGGING_TIMER_HELPER2(label, level, counter)      \
+  ::xla::ScopedLoggingTimer XLA_ScopedLoggingTimerInstance##counter( \
+      label, VLOG_IS_ON(level))
+
+// RAII timer for XLA_SCOPED_LOGGING_TIMER and XLA_SCOPED_LOGGING_TIMER_LEVEL
+// macros above.  Recommended usage is via the macros so you don't have to give
+// the timer a name or worry about calling VLOG_IS_ON yourself.
 struct ScopedLoggingTimer {
-  explicit ScopedLoggingTimer(const string& label, int32 vlog_level = 1);
+  // The timer does nothing if enabled is false.  This lets you pass in your
+  // file's VLOG_IS_ON value.
+  ScopedLoggingTimer(const string& label, bool enabled);
   ~ScopedLoggingTimer();
 
-  uint64 start_micros;
+  bool enabled;
   string label;
-  int32 vlog_level;
+  uint64 start_micros;
 };
 
 // Given a vector<T>, returns a MutableArraySlice<char> that points at its
diff --git a/tensorflow/compiler/xla/window_util.cc b/tensorflow/compiler/xla/window_util.cc
index 6f7f1479b90377ea3c2019508acb6db311c5a1ba..293f0781a203d092a7996d5548de1dbf5bf32e4c 100644
--- a/tensorflow/compiler/xla/window_util.cc
+++ b/tensorflow/compiler/xla/window_util.cc
@@ -44,6 +44,9 @@ namespace window_util {
   if (dim.window_dilation() != 1) {
     StrAppend(&str, ",window_dilation=", dim.window_dilation());
   }
+  if (dim.window_reversal()) {
+    StrAppend(&str, ",window_reversal");
+  }
   StrAppend(&str, ")");
   return str;
 }
@@ -85,6 +88,11 @@ string ToString(const Window& window) {
       return StrCat(dim.window_dilation());
     });
   }
+  if (HasWindowReversal(window)) {
+    add_field(" rhs_reversal", [](const WindowDimension& dim) {
+      return StrCat(dim.window_reversal() ? 1 : 0);
+    });
+  }
   return str;
 }
 
@@ -138,6 +146,15 @@ bool HasWindowDilation(const Window& window) {
   return false;
 }
 
+bool HasWindowReversal(const Window& window) {
+  for (const auto& dim : window.dimensions()) {
+    if (dim.window_reversal()) {
+      return true;
+    }
+  }
+  return false;
+}
+
 bool HasDilation(const Window& window) {
   return HasBaseDilation(window) || HasWindowDilation(window);
 }
diff --git a/tensorflow/compiler/xla/window_util.h b/tensorflow/compiler/xla/window_util.h
index 235cb2d59d451a25dc4f824ab488f8cef6b03bfb..125900dac0c5ab478b834c315b4a438c9238ef6d 100644
--- a/tensorflow/compiler/xla/window_util.h
+++ b/tensorflow/compiler/xla/window_util.h
@@ -39,6 +39,8 @@ bool HasBaseDilation(const Window& window);
 bool HasWindowDilation(const Window& window);
 bool HasDilation(const Window& window);
 
+bool HasWindowReversal(const Window& window);
+
 // Returns the new bound after dilation.
 //
 // If a window with the given bound in some dimension is dilated with the given
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index eac8f2ff07e4a885affdc0f7b1563d3a2cb606d7..95045d5e28b96c8e9b31fccd62a24d5c83523092 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -114,6 +114,14 @@ message PaddingConfig {
   repeated PaddingConfigDimension dimensions = 1;
 }
 
+// A format specifies the method used by a layout to store an array in memory.
+enum Format {
+  INVALID_FORMAT = 0;
+  // The default layout, with exactly one storage location per element (ignoring
+  // padding).
+  DENSE = 1;
+}
+
 // A layout describes how the array is placed in (1D) memory space.  This
 // includes the minor-to-major ordering of dimensions within a shape, as well as
 // any padding present in those dimensions.
@@ -124,19 +132,23 @@ message PaddingConfig {
 //
 // See the XLA documentation for more information on shapes and layouts.
 message Layout {
+  // The method used to store the data in memory. The format determines which of
+  // the other fields are used by the layout.
+  Format format = 4;
+
   // Sequence of dimension numbers, from minor (fastest varying index) to major
   // (slowest varying index). This field is required.
   repeated int64 minor_to_major = 1;
 
-  // The width to which the layout of each dimension is padded up
-  // to. If present, the size of the padded_dimensions must equal the
-  // rank of the shape. The padding appears at the end of a dimension,
-  // not at the beginning. This kind of padding, unlike padding in
-  // e.g. convolution, is not part of the shape.
+  // The width to which the layout of each dimension is padded up to. If
+  // present, the size of the padded_dimensions must equal the rank of the
+  // shape. The padding appears at the end of a dimension, not at the
+  // beginning. This kind of padding, unlike padding in e.g. convolution, is not
+  // part of the shape. This field must be unset unless the format is DENSE.
   repeated int64 padded_dimensions = 2;
 
-  // Describes the values in the padding specified by
-  // padded_dimensions.
+  // Describes the values in the padding specified by padded_dimensions. This
+  // field must be unset unless the format is DENSE.
   PaddingValue padding_value = 3;
 
   // Important: if any field is added, be sure to modify ShapeUtil::Equal()
@@ -357,6 +369,10 @@ message WindowDimension {
   // means no dilation. base_dilation - 1 no-op entries ("holes") are implicitly
   // placed between each base area element. See documentation for convolution.
   int64 base_dilation = 6;
+
+  // Window reversal means that this dimension was logically reversed before the
+  // operation.
+  bool window_reversal = 7;
 }
 
 // Describes the windowing in an operation such as convolution.
@@ -413,15 +429,9 @@ message ConvolutionDimensionNumbers {
   // The number of the dimension that represents features in the input.
   int64 input_feature_dimension = 8;
 
-  // The number of the dimension that represents batch in the output.
-  int64 output_batch_dimension = 9;
-
-  // The number of the dimension that represents features in the output.
-  int64 output_feature_dimension = 10;
-
   // The dimension numbers for the spatial dimensions that the window
-  // moves through in the input (lhs) and output.
-  repeated int64 spatial_dimensions = 5;
+  // moves through in the input.
+  repeated int64 input_spatial_dimensions = 11;
 
   // The number of the dimension that represents input features in the
   // convolutional kernel (rhs).
@@ -435,12 +445,24 @@ message ConvolutionDimensionNumbers {
   // moves through in the kernel (rhs). window.strides(0) is the
   // stride in the kernel_spatial_dimensions(0) dimension.
   repeated int64 kernel_spatial_dimensions = 6;
+
+  // The number of the dimension that represents batch in the output.
+  int64 output_batch_dimension = 9;
+
+  // The number of the dimension that represents features in the output.
+  int64 output_feature_dimension = 10;
+
+  // The dimension numbers for the spatial dimensions that the window
+  // moves through in the output.
+  repeated int64 output_spatial_dimensions = 12;
+
+  // Next = 13
 };
 
 message ConvolveRequest {
   ComputationDataHandle lhs = 2;
   ComputationDataHandle rhs = 3;  // This is the filter/kernel.
-  Window window = 4;              // Describes the filter/kenel.
+  Window window = 4;              // Describes the filter/kernel.
   ConvolutionDimensionNumbers dimension_numbers = 5;
 }
 
@@ -488,6 +510,23 @@ message CustomCallRequest {
   Shape shape = 4;
 }
 
+message DotDimensionNumbers {
+  // The dimension numbers that represent the 'lhs' contracting dimensions.
+  repeated int64 lhs_contracting_dimensions = 1;
+  // The dimension numbers that represent the 'rhs' contracting dimensions.
+  repeated int64 rhs_contracting_dimensions = 2;
+  // The dimension numbers that represent the 'lhs' batch dimensions.
+  repeated int64 lhs_batch_dimensions = 3;
+  // The dimension numbers that represent the 'rhs' batch dimensions.
+  repeated int64 rhs_batch_dimensions = 4;
+};
+
+message DotRequest {
+  ComputationDataHandle lhs = 2;
+  ComputationDataHandle rhs = 3;
+  DotDimensionNumbers dimension_numbers = 4;
+}
+
 message MapRequest {
   repeated ComputationDataHandle operands = 2;
   ComputationHandle to_apply = 3;
@@ -641,6 +680,14 @@ message ConcatenateRequest {
   int64 dimension = 3;
 }
 
+message ConditionalRequest {
+  ComputationDataHandle predicate = 2;
+  ComputationDataHandle true_operand = 3;
+  ComputationHandle true_computation = 4;
+  ComputationDataHandle false_operand = 5;
+  ComputationHandle false_computation = 6;
+}
+
 message WhileRequest {
   ComputationHandle condition = 2;
   ComputationHandle body = 3;
@@ -722,9 +769,6 @@ enum BinaryOperation {
   BINOP_LT = 9;
   BINOP_NE = 10;
 
-  // Dot product, matrix multiply.
-  BINOP_DOT = 12;
-
   // Element-wise maximum.
   BINOP_MAX = 14;
 
@@ -875,6 +919,7 @@ message OpRequest {
     ConvolveRequest convolve_request = 8;
     CrossReplicaSumRequest cross_replica_sum_request = 9;
     CustomCallRequest custom_call_request = 10;
+    DotRequest dot_request = 43;
     DynamicSliceRequest dynamic_slice_request = 11;
     DynamicUpdateSliceRequest dynamic_update_slice_request = 12;
     GetTupleElementRequest get_tuple_element_request = 13;
@@ -903,7 +948,9 @@ message OpRequest {
     BatchNormGradRequest batch_norm_grad_request = 37;
     BatchNormInferenceRequest batch_norm_inference_request = 38;
     FftRequest fft_request = 41;
-    // Next: 42
+    ConvertRequest bitcast_convert_request = 42;
+    ConditionalRequest conditional_request = 44;
+    // Next: 45
   }
 }
 
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index b7ade951150412e0ad3f72c235f0677e68fce66e..6e2320bd0d6376cfddb60f8069e141a88bc93563 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -9,7 +9,12 @@ load("//third_party/mpi:mpi.bzl", "if_mpi")
 
 py_library(
     name = "contrib_py",
-    srcs = glob(["**/*.py"]),
+    srcs = glob(
+        ["**/*.py"],
+        exclude = [
+            "**/*_test.py",
+        ],
+    ),
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
@@ -48,6 +53,7 @@ py_library(
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/learn",
         "//tensorflow/contrib/legacy_seq2seq:seq2seq_py",
+        "//tensorflow/contrib/libsvm",
         "//tensorflow/contrib/linalg:linalg_py",
         "//tensorflow/contrib/linear_optimizer:sdca_estimator_py",
         "//tensorflow/contrib/linear_optimizer:sdca_ops_py",
@@ -64,6 +70,7 @@ py_library(
         "//tensorflow/contrib/nearest_neighbor:nearest_neighbor_py",
         "//tensorflow/contrib/nn:nn_py",
         "//tensorflow/contrib/opt:opt_py",
+        "//tensorflow/contrib/periodic_resample:init_py",
         "//tensorflow/contrib/predictor",
         "//tensorflow/contrib/quantization:quantization_py",
         "//tensorflow/contrib/quantize:quantize_graph",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index 1eda1abfcf779ece7af3dbf2554c2a0a8c2611e9..08247c6b38a4df663ad28a6b4d3c41a1da41a020 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -55,6 +55,7 @@ from tensorflow.contrib import model_pruning
 from tensorflow.contrib import nccl
 from tensorflow.contrib import nn
 from tensorflow.contrib import opt
+from tensorflow.contrib import periodic_resample
 from tensorflow.contrib import predictor
 from tensorflow.contrib import quantization
 from tensorflow.contrib import quantize
diff --git a/tensorflow/contrib/android/README.md b/tensorflow/contrib/android/README.md
index f49e5857fe5255c2459793cb1389052a2ff5f88f..c7c128bf14f03d3769ef08e83da61f6d2f91fbd2 100644
--- a/tensorflow/contrib/android/README.md
+++ b/tensorflow/contrib/android/README.md
@@ -15,9 +15,9 @@ For prebuilt libraries, see the
 page for a recent build.
 
 The TensorFlow Inference Interface is also available as a
-[JCenter package](https://bintray.com/google/tensorflow/tensorflow-android) and
-can be included quite simply in your android project with a couple of lines in
-the project's `build.gradle` file:
+[JCenter package](https://bintray.com/google/tensorflow/tensorflow)
+(see the tensorflow-android directory) and can be included quite simply in your
+android project with a couple of lines in the project's `build.gradle` file:
 
 ```
 allprojects {
diff --git a/tensorflow/contrib/android/cmake/CMakeLists.txt b/tensorflow/contrib/android/cmake/CMakeLists.txt
index 25ada5ba27aa167e4aaf4cebd6517e3b80aa1058..a115d1610e2334a6626f29674f3dd195e3a3c648 100644
--- a/tensorflow/contrib/android/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/android/cmake/CMakeLists.txt
@@ -34,10 +34,12 @@ add_library(lib_tf STATIC IMPORTED )
 set_target_properties(lib_tf PROPERTIES IMPORTED_LOCATION
         ${PREBUILT_DIR}/lib/libtensorflow-core.a)
 # Change to compile flags should be replicated into bazel build file
+# TODO: Consider options other than -O2 for binary size.
+#       e.g. -Os for gcc, and -Oz for clang.
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIS_SLIM_BUILD \
                      -std=c++11 -fno-rtti -fno-exceptions \
                      -O2 -Wno-narrowing -fomit-frame-pointer \
-                     -mfpu=neon -mfloat-abi=softfp -fPIE \
+                     -mfpu=neon -mfloat-abi=softfp -fPIE -fPIC \
                      -ftemplate-depth=900 \
                      -DGOOGLE_PROTOBUF_NO_RTTI \
                      -DGOOGLE_PROTOBUF_NO_STATIC_INITIALIZER")
diff --git a/tensorflow/contrib/android/cmake/README.md b/tensorflow/contrib/android/cmake/README.md
index 6f19b657fe72064bd7b005b568540cd52a5e19e8..934b58c7242fc06064ee3c06bc8f4c2740bd24ef 100644
--- a/tensorflow/contrib/android/cmake/README.md
+++ b/tensorflow/contrib/android/cmake/README.md
@@ -14,7 +14,7 @@ Add TensorFlow-Android-Inference as a dependency of your Android application
 
 ```
 include ':TensorFlow-Android-Inference'
-findProject(":TensorFlow-Android-Inference").projectDir = 
+findProject(":TensorFlow-Android-Inference").projectDir =
             new File("${/path/to/tensorflow_repo}/contrib/android/cmake")
 ```
 
diff --git a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
index 1f423a7a5bf6a115dc627ddd6f5e98c074282585..dc5b9fb88742d78d0f40207b589e29451a6358dd 100644
--- a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
+++ b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
@@ -160,7 +160,7 @@ public class TensorFlowInferenceInterface {
       throw new RuntimeException("Failed to load model from the input stream", e);
     }
   }
-  
+
   /*
    * Construct a TensorFlowInferenceInterface with provided Graph
    *
@@ -168,7 +168,7 @@ public class TensorFlowInferenceInterface {
    */
   public TensorFlowInferenceInterface(Graph g) {
     prepareNativeRuntime();
-      
+
     // modelName is redundant here, here is for
     // avoiding error in initialization as modelName is marked final.
     this.modelName = "";
@@ -290,7 +290,7 @@ public class TensorFlowInferenceInterface {
    */
   public void feed(String inputName, boolean[] src, long... dims) {
     byte[] b = new byte[src.length];
-    
+
     for (int i = 0; i < src.length; i++) {
       b[i] = src[i] ? (byte) 1 : (byte) 0;
     }
diff --git a/tensorflow/contrib/batching/BUILD b/tensorflow/contrib/batching/BUILD
index a111cfecb366fe245150cc71d2c43662d0d69090..ea8ac2c680e62ee03a45716aa1e0870d44495f1e 100644
--- a/tensorflow/contrib/batching/BUILD
+++ b/tensorflow/contrib/batching/BUILD
@@ -82,7 +82,10 @@ cc_library(
 tf_cc_test(
     name = "adaptive_shared_batch_scheduler_test",
     srcs = ["adaptive_shared_batch_scheduler_test.cc"],
-    tags = ["manual"],  # b/69013768
+    tags = [
+        "local",
+        "manual",
+    ],
     deps = [
         ":adaptive_shared_batch_scheduler",
         "//tensorflow/contrib/batching/test_util:fake_clock_env",
diff --git a/tensorflow/contrib/batching/adaptive_shared_batch_scheduler.h b/tensorflow/contrib/batching/adaptive_shared_batch_scheduler.h
index 6ed177e001758ad8c566c7965e1ec10ae5235fc8..a2cb146b8d69b6cc0eda8912a9c840ac4e0c7030 100644
--- a/tensorflow/contrib/batching/adaptive_shared_batch_scheduler.h
+++ b/tensorflow/contrib/batching/adaptive_shared_batch_scheduler.h
@@ -16,9 +16,11 @@ limitations under the License.
 #ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
 #define THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
 
+#include <algorithm>
 #include <functional>
 #include <memory>
 #include <queue>
+#include <random>
 #include <unordered_map>
 #include <vector>
 
@@ -42,19 +44,36 @@ template <typename TaskType>
 class ASBSQueue;
 }  // namespace internal
 
+// EXPERIMENTAL: API MAY BE SUBJECTED TO SUDDEN CHANGES.
+//
 // Shared batch scheduler designed to minimize latency. The scheduler keeps
 // track of a number of queues (one per model or model version) which are
 // continuously enqueuing requests. The scheduler groups the requests into
 // batches which it periodically sends off for processing (see
 // shared_batch_scheduler.h for more details). The AdaptiveSharedBatchScheduler
 // prioritizes batches by age (i.e. the batch's oldest request) irrespective of
-// queue. The scheduler will process the oldest batch at an adjustable rate,
-// regardless of batch size. The user can provide feedback to help set this rate
-// to achieve some goal (i.e. minimize overall latency, limit cpu usage, etc).
+// queue or batch size.
+//
+// The scheduling decision currently exists in two flavors, controlled by the
+// option use_in_flight_batches_implementation. It is expected that setting this
+// option to true will give universally better results; after a period of
+// testing to confirm, the old implementation will be removed.
 //
-// The rate (or rather, the corresponding period) is adjusted each time a batch
-// is processed, using an exponentially weighted moving average to smooth
-// potentially noisy feedback:
+// If use_in_flight_batches_implementation is set to true, the scheduler
+// limits the number of batches which can be processed concurrently.  If a new
+// batch is created, and the number of in flight batches is below the limit,
+// the next (i.e. oldest) batch is immediately scheduled.  Similarly, when a
+// batch finishes processing, the limit is rechecked, and another batch may be
+// scheduled.  To avoid the need to carefully tune the limit for workload,
+// model type, platform, etc, it is dynamically adjusted in order to provide the
+// lowest latency.
+//
+// If use_in_flight_batches_implementation is set to false, the scheduler will
+// process the oldest batch at an adjustable rate, regardless of batch size.
+// The user can provide feedback to help set this rate to achieve some goal
+// (i.e. minimize overall latency, limit cpu usage, etc). The rate (or rather,
+// the corresponding period) is adjusted each time a batch is processed, using
+// an exponentially weighted moving average to smooth noisy feedback:
 // ewma_feedback = ((N - 1) * ewma_feedback + feedback()) / N
 // period *= (1 + K * emwa_feedback)
 //
@@ -82,6 +101,20 @@ class AdaptiveSharedBatchScheduler
     int64 num_batch_threads = port::NumSchedulableCPUs();
     // The environment to use (typically only overridden by test code).
     Env* env = Env::Default();
+    // Which implementation to use (described in class comments above).
+    bool use_in_flight_batches_implementation = false;
+    // Initial limit for number of batches being concurrently processed.
+    // Non-integer values correspond to probabilistic limits - i.e. a value of
+    // 3.2 results in an actual cap of 3 80% of the time, and 4 20% of the time.
+    double initial_in_flight_batches_limit = 3;
+    // Number of batches between adjustments of in_flight_batches_limit.  Larger
+    // numbers will give less noisy latency measurements, but will be less
+    // responsive to changes in workload.
+    int64 batches_to_average_over = 1000;
+
+    // TODO(kte): remove the rate based implementation and corresponding options
+    // below once testing confirms the superiority of the in flight batches
+    // implementation.
     // Initial batch scheduling period in microseconds. Will be altered for
     // non-zero rate_feedback.
     double initial_scheduling_period_micros = 500;
@@ -122,6 +155,11 @@ class AdaptiveSharedBatchScheduler
                   BatchProcessor process_batch_callback,
                   std::unique_ptr<BatchScheduler<TaskType>>* queue);
 
+  double in_flight_batches_limit() {
+    mutex_lock l(mu_);
+    return in_flight_batches_limit_;
+  }
+
  private:
   // access to AddBatch, RemoveQueue, GetEnv.
   friend class internal::ASBSQueue<TaskType>;
@@ -129,10 +167,20 @@ class AdaptiveSharedBatchScheduler
   explicit AdaptiveSharedBatchScheduler(const Options& options);
 
   // Batch scheduling function which runs every scheduling_period_ microseconds.
+  // Only used when options_.use_in_flight_batches_implementation == false.
   void ProcessOneBatch();
 
+  // Tracks processing latency and adjusts in_flight_batches_limit to minimize.
+  // Only used when options_.use_in_flight_batches_implementation == true.
+  void CallbackWrapper(const internal::ASBSBatch<TaskType>* batch,
+                       BatchProcessor callback);
+
+  // Schedules batch if in_flight_batches_limit_ is not met.
+  // Only used when options_.use_in_flight_batches_implementation == true.
+  void MaybeScheduleNextBatch() EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
   // Notifies scheduler of non-empty batch which is eligible for processing.
-  void AddBatch(internal::ASBSBatch<TaskType>*);
+  void AddBatch(const internal::ASBSBatch<TaskType>* batch);
 
   // Removes queue from scheduler.
   void RemoveQueue(const internal::ASBSQueue<TaskType>* queue);
@@ -149,7 +197,8 @@ class AdaptiveSharedBatchScheduler
   // Collection of batches added by AddBatch, ordered by age. Owned by scheduler
   // until they are released for processing.
   std::priority_queue<const internal::ASBSBatch<TaskType>*,
-                      std::vector<internal::ASBSBatch<TaskType>*>, BatchCompare>
+                      std::vector<const internal::ASBSBatch<TaskType>*>,
+                      BatchCompare>
       batches_ GUARDED_BY(mu_);
 
   // Unowned queues and callbacks added by AddQueue.
@@ -160,19 +209,56 @@ class AdaptiveSharedBatchScheduler
 
   // Responsible for running ProcessOneBatch. PeriodicFunction was used in order
   // to check for deletion so that the thread can be shut down.
+  // Only used when options_.use_in_flight_batches_implementation == false.
   std::unique_ptr<PeriodicFunction> scheduling_thread_;
 
   // Responsible for running the batch processing callbacks.
   std::unique_ptr<thread::ThreadPool> batch_thread_pool_;
 
   // Time interval in microseconds between successive ProcessOneBatch calls.
+  // Only used when options_.use_in_flight_batches_implementation == false.
   double scheduling_period_;
 
   // Exponentially weighted moving average of
   // options_.scheduling_period_feedback() evaluated in each ProcessOneBatch
   // call.
+  // Only used when options_.use_in_flight_batches_implementation == false.
   double ewma_feedback_ = 0;
 
+  // Limit on number of batches which can be concurrently processed.
+  // Non-integer values correspond to probabilistic limits - i.e. a value of 3.2
+  // results in an actual cap of 3 80% of the time, and 4 20% of the time.
+  // Only used when options_.use_in_flight_batches_implementation == true.
+  double in_flight_batches_limit_ GUARDED_BY(mu_);
+
+  // Number of batches currently being processed.
+  // Only used when options_.use_in_flight_batches_implementation == true.
+  int64 in_flight_batches_ GUARDED_BY(mu_) = 0;
+
+  // RNG engine and distribution.
+  // Only used when options_.use_in_flight_batches_implementation == true.
+  std::default_random_engine rand_engine_;
+  std::uniform_real_distribution<double> rand_double_;
+
+  // Fields controlling the dynamic adjustment of in_flight_batches_limit_.
+  // Only used when options_.use_in_flight_batches_implementation == true.
+  // Number of batches since the last in_flight_batches_limit_ adjustment.
+  int64 batch_count_ GUARDED_BY(mu_) = 0;
+  // Sum of processing latency for batches counted by batch_count_.
+  int64 batch_latency_sum_ GUARDED_BY(mu_) = 0;
+  // Average batch latency for previous value of in_flight_batches_limit_.
+  double last_avg_latency_ms_ GUARDED_BY(mu_) = 0;
+  // Did last_avg_latency_ms_ decrease from the previous last_avg_latency_ms_?
+  bool last_latency_decreased_ GUARDED_BY(mu_) = false;
+  // Current direction (+-) to adjust in_flight_batches_limit_
+  int step_direction_ GUARDED_BY(mu_) = 1;
+  // Max adjustment size (as a fraction of in_flight_batches_limit_).
+  constexpr static double kMaxStepSizeMultiplier = 0.125;  // 1/8;
+  // Min adjustment size (as a fraction of in_flight_batches_limit_).
+  constexpr static double kMinStepSizeMultiplier = 0.0078125;  // 1/128
+  // Current adjustment size (as a fraction of in_flight_batches_limit_).
+  double step_size_multiplier_ GUARDED_BY(mu_) = kMaxStepSizeMultiplier;
+
   TF_DISALLOW_COPY_AND_ASSIGN(AdaptiveSharedBatchScheduler);
 };
 
@@ -208,6 +294,8 @@ class ASBSQueue : public BatchScheduler<TaskType> {
   // place any more tasks in this batch.
   void ReleaseBatch(const ASBSBatch<TaskType>* batch);
 
+  size_t max_task_size() const override { return options_.max_batch_size; }
+
  private:
   std::shared_ptr<AdaptiveSharedBatchScheduler<TaskType>> scheduler_;
   const QueueOptions options_;
@@ -241,6 +329,12 @@ class ASBSBatch : public Batch<TaskType> {
 
 // ---------------- AdaptiveSharedBatchScheduler ----------------
 
+template <typename TaskType>
+constexpr double AdaptiveSharedBatchScheduler<TaskType>::kMaxStepSizeMultiplier;
+
+template <typename TaskType>
+constexpr double AdaptiveSharedBatchScheduler<TaskType>::kMinStepSizeMultiplier;
+
 template <typename TaskType>
 Status AdaptiveSharedBatchScheduler<TaskType>::Create(
     const Options& options,
@@ -275,6 +369,25 @@ Status AdaptiveSharedBatchScheduler<TaskType>::Create(
         "feedback_smoothing_batches must be positive; was ",
         options.feedback_smoothing_batches);
   }
+  if (options.initial_in_flight_batches_limit > options.num_batch_threads) {
+    return errors::InvalidArgument(
+        "initial_in_flight_batches_limit (",
+        options.initial_in_flight_batches_limit,
+        ") should not be larger than num_batch_threads (",
+        options.num_batch_threads, ")");
+  }
+  if (options.initial_in_flight_batches_limit < 1) {
+    return errors::InvalidArgument(
+        "initial_in_flight_batches_limit should be "
+        "greater than or equal to 1; was ",
+        options.initial_in_flight_batches_limit);
+  }
+  if (options.batches_to_average_over < 1) {
+    return errors::InvalidArgument(
+        "batches_to_average_over should be "
+        "greater than or equal to 1; was ",
+        options.batches_to_average_over);
+  }
   scheduler->reset(new AdaptiveSharedBatchScheduler<TaskType>(options));
   return Status::OK();
 }
@@ -283,14 +396,20 @@ template <typename TaskType>
 AdaptiveSharedBatchScheduler<TaskType>::AdaptiveSharedBatchScheduler(
     const Options& options)
     : options_(options),
-      scheduling_period_(options.initial_scheduling_period_micros) {
+      scheduling_period_(options.initial_scheduling_period_micros),
+      in_flight_batches_limit_(options.initial_in_flight_batches_limit),
+      rand_double_(0.0, 1.0) {
+  std::random_device device;
+  rand_engine_.seed(device());
   PeriodicFunction::Options opts;
   opts.thread_name_prefix = "scheduling_thread";
   opts.env = GetEnv();
-  scheduling_thread_.reset(
-      new PeriodicFunction([this] { ProcessOneBatch(); }, 0, opts));
   batch_thread_pool_.reset(new thread::ThreadPool(
       GetEnv(), options.thread_pool_name, options.num_batch_threads));
+  if (!options.use_in_flight_batches_implementation) {
+    scheduling_thread_.reset(
+        new PeriodicFunction([this] { ProcessOneBatch(); }, 0, opts));
+  }
 }
 
 template <typename TaskType>
@@ -316,9 +435,12 @@ Status AdaptiveSharedBatchScheduler<TaskType>::AddQueue(
 
 template <typename TaskType>
 void AdaptiveSharedBatchScheduler<TaskType>::AddBatch(
-    internal::ASBSBatch<TaskType>* batch) {
+    const internal::ASBSBatch<TaskType>* batch) {
   mutex_lock l(mu_);
   batches_.push(batch);
+  if (options_.use_in_flight_batches_implementation) {
+    MaybeScheduleNextBatch();
+  }
 }
 
 template <typename TaskType>
@@ -328,10 +450,78 @@ void AdaptiveSharedBatchScheduler<TaskType>::RemoveQueue(
   queues_and_callbacks_.erase(queue);
 }
 
+template <typename TaskType>
+void AdaptiveSharedBatchScheduler<TaskType>::MaybeScheduleNextBatch() {
+  if (batches_.empty() || in_flight_batches_ >= in_flight_batches_limit_)
+    return;
+  // Non-integer limit handled probabilistially.
+  if (in_flight_batches_limit_ - in_flight_batches_ < 1 &&
+      rand_double_(rand_engine_) >
+          (in_flight_batches_limit_ - in_flight_batches_))
+    return;
+  const internal::ASBSBatch<TaskType>* batch = batches_.top();
+  batches_.pop();
+  // Queue may destroy itself after ReleaseBatch is called.
+  batch->queue()->ReleaseBatch(batch);
+  batch_thread_pool_->Schedule(
+      std::bind(&AdaptiveSharedBatchScheduler<TaskType>::CallbackWrapper, this,
+                batch, queues_and_callbacks_[batch->queue()]));
+  in_flight_batches_++;
+}
+
+template <typename TaskType>
+void AdaptiveSharedBatchScheduler<TaskType>::CallbackWrapper(
+    const internal::ASBSBatch<TaskType>* batch,
+    AdaptiveSharedBatchScheduler<TaskType>::BatchProcessor callback) {
+  int64 start_time = batch->creation_time_micros();
+  callback(std::unique_ptr<Batch<TaskType>>(
+      const_cast<internal::ASBSBatch<TaskType>*>(batch)));
+  int64 end_time = GetEnv()->NowMicros();
+  mutex_lock l(mu_);
+  in_flight_batches_--;
+  batch_count_++;
+  batch_latency_sum_ += end_time - start_time;
+  // Occasionally adjust in_flight_batches_limit_ to minimize average latency.
+  // Although the optimal value may depend on the workload, the latency should
+  // be a simple convex function of in_flight_batches_limit_, allowing us to
+  // locate the global minimum relatively quickly.
+  if (batch_count_ == options_.batches_to_average_over) {
+    double current_avg_latency_ms = (batch_latency_sum_ / 1000.) / batch_count_;
+    bool current_latency_decreased =
+        current_avg_latency_ms < last_avg_latency_ms_;
+    if (current_latency_decreased) {
+      // If latency improvement was because we're moving in the correct
+      // direction, increase step_size so that we can get to the minimum faster.
+      // If latency improvement was due to backtracking from a previous failure,
+      // decrease step_size in order to refine our location.
+      step_size_multiplier_ *= (last_latency_decreased_ ? 2 : 0.5);
+      step_size_multiplier_ =
+          std::min(step_size_multiplier_, kMaxStepSizeMultiplier);
+      step_size_multiplier_ =
+          std::max(step_size_multiplier_, kMinStepSizeMultiplier);
+    } else {
+      // Return (nearly) to previous position and confirm that latency is better
+      // there before decreasing step size.
+      step_direction_ = -step_direction_;
+    }
+    in_flight_batches_limit_ +=
+        step_direction_ * in_flight_batches_limit_ * step_size_multiplier_;
+    in_flight_batches_limit_ =
+        std::min(in_flight_batches_limit_,
+                 static_cast<double>(options_.num_batch_threads));
+    in_flight_batches_limit_ = std::max(in_flight_batches_limit_, 1.0);
+    last_avg_latency_ms_ = current_avg_latency_ms;
+    last_latency_decreased_ = current_latency_decreased;
+    batch_count_ = 0;
+    batch_latency_sum_ = 0;
+  }
+  MaybeScheduleNextBatch();
+}
+
 template <typename TaskType>
 void AdaptiveSharedBatchScheduler<TaskType>::ProcessOneBatch() {
   static const double kFeedbackMultiplier = .001;
-  internal::ASBSBatch<TaskType>* batch = nullptr;
+  const internal::ASBSBatch<TaskType>* batch = nullptr;
   BatchProcessor callback;
   const int64 start_time_micros = GetEnv()->NowMicros();
   {
@@ -355,7 +545,8 @@ void AdaptiveSharedBatchScheduler<TaskType>::ProcessOneBatch() {
     // Queue may destroy itself after ReleaseBatch is called.
     batch->queue()->ReleaseBatch(batch);
     batch_thread_pool_->Schedule([callback, batch] {
-      callback(std::unique_ptr<Batch<TaskType>>(batch));
+      callback(std::unique_ptr<Batch<TaskType>>(
+          const_cast<internal::ASBSBatch<TaskType>*>(batch)));
     });
   }
   const int64 sleep_time =
@@ -425,6 +616,7 @@ Status ASBSQueue<TaskType>::Schedule(std::unique_ptr<TaskType>* task) {
     current_batch_->AddTask(std::move(*task));
     num_enqueued_tasks_++;
   }
+  // AddBatch must be called outside of lock, since it may call ReleaseBatch.
   if (new_batch != nullptr) scheduler_->AddBatch(new_batch);
   return Status::OK();
 }
diff --git a/tensorflow/contrib/batching/adaptive_shared_batch_scheduler_test.cc b/tensorflow/contrib/batching/adaptive_shared_batch_scheduler_test.cc
index a07cd6d834fa28904bf7748b16972cca217503c1..18f1e554525a306ffe07460a889411ed4755b89f 100644
--- a/tensorflow/contrib/batching/adaptive_shared_batch_scheduler_test.cc
+++ b/tensorflow/contrib/batching/adaptive_shared_batch_scheduler_test.cc
@@ -141,6 +141,16 @@ TEST(AdaptiveSharedBatchSchedulerTest, BadOptions) {
   options = Scheduler::Options();
   options.feedback_smoothing_batches = 0;
   EXPECT_FALSE(Scheduler::Create(options, &scheduler).ok());
+  options = Scheduler::Options();
+  options.initial_in_flight_batches_limit = 0.5;
+  EXPECT_FALSE(Scheduler::Create(options, &scheduler).ok());
+  options = Scheduler::Options();
+  options.num_batch_threads = 5;
+  options.initial_in_flight_batches_limit = 8;
+  EXPECT_FALSE(Scheduler::Create(options, &scheduler).ok());
+  options = Scheduler::Options();
+  options.batches_to_average_over = -5;
+  EXPECT_FALSE(Scheduler::Create(options, &scheduler).ok());
 }
 
 TEST(AdaptiveSharedBatchSchedulerTest, ObeysQueueOptions) {
@@ -186,6 +196,7 @@ TEST(AdaptiveSharedBatchSchedulerTest, ObeysQueueOptions) {
     queue_options.max_enqueued_batches = 2;
     TF_ASSERT_OK(
         scheduler->AddQueue(queue_options, queue_0_callback, &queue_0));
+    EXPECT_EQ(10, queue_0->max_task_size());
     queue_options.max_batch_size = 0;
     // Queue must have max_batch_size > 0.
     EXPECT_FALSE(
@@ -433,6 +444,107 @@ TEST(AdaptiveSharedBatchSchedulerTest, QueueCapacityInfo) {
   }
   stop_teardown.Notify();
 }
+
+TEST(AdaptiveSharedBatchSchedulerTest, InFlightBatchesImplementation) {
+  AdaptiveSharedBatchScheduler<FakeTask>::Options options;
+  options.use_in_flight_batches_implementation = true;
+  options.initial_in_flight_batches_limit = 2;
+  options.batches_to_average_over = 1000;
+  std::shared_ptr<AdaptiveSharedBatchScheduler<FakeTask>> scheduler;
+  TF_ASSERT_OK(
+      AdaptiveSharedBatchScheduler<FakeTask>::Create(options, &scheduler));
+  std::unique_ptr<BatchScheduler<FakeTask>> queue;
+  mutex mu;
+  int processed_batches = 0;
+  Notification finish_processing;
+  auto queue_callback = [&mu, &processed_batches, &finish_processing](
+                            std::unique_ptr<Batch<FakeTask>> batch) {
+    ASSERT_TRUE(batch->IsClosed());
+    EXPECT_GT(batch->num_tasks(), 0);
+    mu.lock();
+    int batch_num = ++processed_batches;
+    mu.unlock();
+    if (batch_num == 2) {
+      // Give third batch a chance to process if it's going to.
+      Env::Default()->SleepForMicroseconds(1000);
+      finish_processing.Notify();
+    }
+    if (batch_num == 3) {
+      ASSERT_TRUE(finish_processing.HasBeenNotified());
+    }
+    finish_processing.WaitForNotification();
+  };
+
+  TF_ASSERT_OK(scheduler->AddQueue({}, queue_callback, &queue));
+
+  // Enqueue 3 batches.
+  for (int i = 0; i < 3; i++) {
+    TF_ASSERT_OK(ScheduleTask(100, queue.get()));
+  }
+}
+
+TEST(AdaptiveSharedBatchSchedulerTest, InFlightBatchesLimitTuning) {
+  test_util::FakeClockEnv env(Env::Default());
+  Notification start_teardown, stop_teardown;
+  std::unique_ptr<Thread> teardown_thread =
+      CreateFakeClockAdvancerThread(&env, &start_teardown, &stop_teardown);
+  {
+    AdaptiveSharedBatchScheduler<FakeTask>::Options options;
+    options.env = &env;
+    options.use_in_flight_batches_implementation = true;
+    options.initial_in_flight_batches_limit = 2;
+    options.batches_to_average_over = 1;
+    std::shared_ptr<AdaptiveSharedBatchScheduler<FakeTask>> scheduler;
+    TF_ASSERT_OK(
+        AdaptiveSharedBatchScheduler<FakeTask>::Create(options, &scheduler));
+    std::unique_ptr<BatchScheduler<FakeTask>> queue;
+    auto queue_callback = [&env](std::unique_ptr<Batch<FakeTask>> batch) {
+      ASSERT_TRUE(batch->IsClosed());
+      switch (batch->size()) {
+        case 0:
+          env.AdvanceByMicroseconds(10);
+          break;
+        case 1:
+          env.AdvanceByMicroseconds(15);
+          break;
+        case 2:
+          env.AdvanceByMicroseconds(10);
+          break;
+        case 3:
+          env.AdvanceByMicroseconds(11);
+          break;
+      }
+    };
+
+    TF_ASSERT_OK(scheduler->AddQueue({}, queue_callback, &queue));
+    TF_ASSERT_OK(ScheduleTask(0, queue.get()));
+    double in_flight_batches_limit = 2;
+    while (scheduler->in_flight_batches_limit() == in_flight_batches_limit) {
+    }
+    // Initial direction will be negative.
+    EXPECT_LT(scheduler->in_flight_batches_limit(), in_flight_batches_limit);
+    in_flight_batches_limit = scheduler->in_flight_batches_limit();
+    TF_ASSERT_OK(ScheduleTask(1, queue.get()));
+    while (scheduler->in_flight_batches_limit() == in_flight_batches_limit) {
+    }
+    // Latency increased -> change direction.
+    EXPECT_GT(scheduler->in_flight_batches_limit(), in_flight_batches_limit);
+    in_flight_batches_limit = scheduler->in_flight_batches_limit();
+    TF_ASSERT_OK(ScheduleTask(2, queue.get()));
+    while (scheduler->in_flight_batches_limit() == in_flight_batches_limit) {
+    }
+    // Latency decreased -> keep going in same direction.
+    EXPECT_GT(scheduler->in_flight_batches_limit(), in_flight_batches_limit);
+    in_flight_batches_limit = scheduler->in_flight_batches_limit();
+    TF_ASSERT_OK(ScheduleTask(3, queue.get()));
+    while (scheduler->in_flight_batches_limit() == in_flight_batches_limit) {
+    }
+    // Latency increased -> change direction.
+    EXPECT_LT(scheduler->in_flight_batches_limit(), in_flight_batches_limit);
+    start_teardown.Notify();
+  }
+  stop_teardown.Notify();
+}
 }  // namespace anonymous
 }  // namespace serving
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/batching/basic_batch_scheduler.h b/tensorflow/contrib/batching/basic_batch_scheduler.h
index 9d3805fbaf39978159dd2f4a754e6d41a07acf6a..91065db2499dffd2687a53bd6304d9b7593f7b3a 100644
--- a/tensorflow/contrib/batching/basic_batch_scheduler.h
+++ b/tensorflow/contrib/batching/basic_batch_scheduler.h
@@ -192,6 +192,10 @@ class BasicBatchScheduler : public BatchScheduler<TaskType> {
   size_t NumEnqueuedTasks() const override;
   size_t SchedulingCapacity() const override;
 
+  size_t max_task_size() const override {
+    return shared_scheduler_queue_->max_task_size();
+  }
+
  private:
   explicit BasicBatchScheduler(
       std::unique_ptr<BatchScheduler<TaskType>> shared_scheduler_queue);
diff --git a/tensorflow/contrib/batching/basic_batch_scheduler_test.cc b/tensorflow/contrib/batching/basic_batch_scheduler_test.cc
index e020301795c7dadee2815c0e0d727e53e5fb9e6e..187823151cf840dcf8058677fcf74d1beffc3bc2 100644
--- a/tensorflow/contrib/batching/basic_batch_scheduler_test.cc
+++ b/tensorflow/contrib/batching/basic_batch_scheduler_test.cc
@@ -73,6 +73,7 @@ TEST(BasicBatchSchedulerTest, Basic) {
     std::unique_ptr<BasicBatchScheduler<FakeTask>> scheduler;
     TF_ASSERT_OK(
         BasicBatchScheduler<FakeTask>::Create(options, callback, &scheduler));
+    EXPECT_EQ(10, scheduler->max_task_size());
     EXPECT_EQ(0, scheduler->NumEnqueuedTasks());
     EXPECT_EQ(3 * 10, scheduler->SchedulingCapacity());
     TF_ASSERT_OK(ScheduleTask(3, scheduler.get()));
diff --git a/tensorflow/contrib/batching/batch_scheduler.h b/tensorflow/contrib/batching/batch_scheduler.h
index a5072f439abad3c5db79a514a7f2baff0b021b39..e18cf6c35059e4d720768e3b2c02b03727a6bac4 100644
--- a/tensorflow/contrib/batching/batch_scheduler.h
+++ b/tensorflow/contrib/batching/batch_scheduler.h
@@ -178,6 +178,10 @@ class BatchScheduler {
   // This method is useful for monitoring, or for guaranteeing a future slot in
   // the schedule (but being mindful about the caveats listed above).
   virtual size_t SchedulingCapacity() const = 0;
+
+  // Returns the maximum allowed size of tasks submitted to the scheduler. (This
+  // is typically equal to a configured maximum batch size.)
+  virtual size_t max_task_size() const = 0;
 };
 
 //////////
diff --git a/tensorflow/contrib/batching/shared_batch_scheduler.h b/tensorflow/contrib/batching/shared_batch_scheduler.h
index 41a3f99137ade2552432fee62ddce17d064148a4..86c45bdc2e66e30fbde15f6cafe481cf969c14d0 100644
--- a/tensorflow/contrib/batching/shared_batch_scheduler.h
+++ b/tensorflow/contrib/batching/shared_batch_scheduler.h
@@ -63,7 +63,7 @@ namespace serving {
 // instead of N independent ones, with their sharing deliberately coordinated.
 //
 // SharedBatchScheduler does not implement the BatchScheduler API; rather, it
-// presents an abstraction of "queues", where each queue coresponds to one type
+// presents an abstraction of "queues", where each queue corresponds to one type
 // of task. Tasks submitted to a given queue are placed in their own batches,
 // and cannot be mixed with other tasks. Queues can be added and deleted
 // dynamically, to accommodate e.g. versions of a model being brought up and
@@ -248,6 +248,9 @@ class Queue {
   // BatchScheduler::SchedulingCapacity().
   size_t SchedulingCapacity() const;
 
+  // Returns the maximum allowed size of tasks submitted to the queue.
+  size_t max_task_size() const { return options_.max_batch_size; }
+
   // Called by a thread that is ready to process a batch, to request one from
   // this queue. Either returns a batch that is ready to be processed, or
   // nullptr if the queue declines to schedule a batch at this time. If it
@@ -338,6 +341,8 @@ class QueueHandle : public BatchScheduler<TaskType> {
   size_t NumEnqueuedTasks() const override;
   size_t SchedulingCapacity() const override;
 
+  size_t max_task_size() const override { return queue_->max_task_size(); }
+
  private:
   // The scheduler that owns 'queue_'.
   std::shared_ptr<SharedBatchScheduler<TaskType>> scheduler_;
diff --git a/tensorflow/contrib/batching/shared_batch_scheduler_test.cc b/tensorflow/contrib/batching/shared_batch_scheduler_test.cc
index 3e924ae5f13519b4fe9a3f4b510773ca2bddaf23..3ac79a8fdc47389816db8ca09f27846d1c4623c2 100644
--- a/tensorflow/contrib/batching/shared_batch_scheduler_test.cc
+++ b/tensorflow/contrib/batching/shared_batch_scheduler_test.cc
@@ -429,6 +429,7 @@ TEST(SharedBatchSchedulerTest, ConstMethods) {
     queue_options.max_enqueued_batches = max_enqueued_batches;
     std::unique_ptr<BatchScheduler<FakeTask>> queue;
     TF_ASSERT_OK(scheduler->AddQueue(queue_options, callback, &queue));
+    EXPECT_EQ(2, queue->max_task_size());
     EXPECT_EQ(0, queue->NumEnqueuedTasks());
     EXPECT_EQ(max_enqueued_batches * 2, queue->SchedulingCapacity());
 
diff --git a/tensorflow/contrib/bayesflow/BUILD b/tensorflow/contrib/bayesflow/BUILD
index a262d4aecdbb69dfcd8b88bc0a09060500d6b1c9..4e0520fa33a57e2f15c39d362ec3a39945202d46 100644
--- a/tensorflow/contrib/bayesflow/BUILD
+++ b/tensorflow/contrib/bayesflow/BUILD
@@ -99,6 +99,25 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "layers_conv_variational_test",
+    size = "small",
+    srcs = ["python/kernel_tests/layers_conv_variational_test.py"],
+    additional_deps = [
+        ":bayesflow_py",
+        "//third_party/py/numpy",
+        "//tensorflow/contrib/distributions:distributions_py",
+        "//tensorflow/python/ops/distributions",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:linalg_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn_ops",
+    ],
+)
+
 cuda_py_test(
     name = "layers_dense_variational_test",
     size = "small",
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
index b1f108e5f01e4945ee83d8262f1d99877f0fe9f0..cbc66b6dc13db62c25952de6b6c13b2fdfe27f12 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
@@ -12,8 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for Hamiltonian Monte Carlo.
-"""
+"""Tests for Hamiltonian Monte Carlo."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -27,6 +26,7 @@ from tensorflow.contrib.bayesflow.python.ops import hmc
 
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
@@ -46,6 +46,9 @@ class HMCTest(test.TestCase):
     random_seed.set_random_seed(10003)
     np.random.seed(10003)
 
+  def assertAllFinite(self, x):
+    self.assertAllEqual(np.ones_like(x).astype(bool), np.isfinite(x))
+
   def _log_gamma_log_prob(self, x, event_dims=()):
     """Computes log-pdf of a log-gamma random variable.
 
@@ -345,5 +348,97 @@ class HMCTest(test.TestCase):
   def testAIS12(self):
     self._ais_gets_correct_log_normalizer_wrapper([1, 2])
 
+  def testNanRejection(self):
+    """Tests that an update that yields NaN potentials gets rejected.
+
+    We run HMC with a target distribution that returns NaN
+    log-likelihoods if any element of x < 0, and unit-scale
+    exponential log-likelihoods otherwise. The exponential potential
+    pushes x towards 0, ensuring that any reasonably large update will
+    push us over the edge into NaN territory.
+    """
+    def _unbounded_exponential_log_prob(x):
+      """An exponential distribution with log-likelihood NaN for x < 0."""
+      per_element_potentials = array_ops.where(x < 0,
+                                               np.nan * array_ops.ones_like(x),
+                                               -x)
+      return math_ops.reduce_sum(per_element_potentials)
+
+    with self.test_session() as sess:
+      initial_x = math_ops.linspace(0.01, 5, 10)
+      updated_x, acceptance_probs, _, _ = hmc.kernel(
+          2., 5, initial_x, _unbounded_exponential_log_prob, [0])
+      initial_x_val, updated_x_val, acceptance_probs_val = sess.run(
+          [initial_x, updated_x, acceptance_probs])
+
+      logging.vlog(1, 'initial_x = {}'.format(initial_x_val))
+      logging.vlog(1, 'updated_x = {}'.format(updated_x_val))
+      logging.vlog(1, 'acceptance_probs = {}'.format(acceptance_probs_val))
+
+      self.assertAllEqual(initial_x_val, updated_x_val)
+      self.assertEqual(acceptance_probs_val, 0.)
+
+  def testNanFromGradsDontPropagate(self):
+    """Test that update with NaN gradients does not cause NaN in results."""
+    def _nan_log_prob_with_nan_gradient(x):
+      return np.nan * math_ops.reduce_sum(x)
+
+    with self.test_session() as sess:
+      initial_x = math_ops.linspace(0.01, 5, 10)
+      updated_x, acceptance_probs, new_log_prob, new_grad = hmc.kernel(
+          2., 5, initial_x, _nan_log_prob_with_nan_gradient, [0])
+      initial_x_val, updated_x_val, acceptance_probs_val = sess.run(
+          [initial_x, updated_x, acceptance_probs])
+
+      logging.vlog(1, 'initial_x = {}'.format(initial_x_val))
+      logging.vlog(1, 'updated_x = {}'.format(updated_x_val))
+      logging.vlog(1, 'acceptance_probs = {}'.format(acceptance_probs_val))
+
+      self.assertAllEqual(initial_x_val, updated_x_val)
+      self.assertEqual(acceptance_probs_val, 0.)
+
+      self.assertAllFinite(
+          gradients_impl.gradients(updated_x, initial_x)[0].eval())
+      self.assertTrue(
+          gradients_impl.gradients(new_grad, initial_x)[0] is None)
+
+      # Gradients of the acceptance probs and new log prob are not finite.
+      _ = new_log_prob  # Prevent unused arg error.
+      # self.assertAllFinite(
+      #     gradients_impl.gradients(acceptance_probs, initial_x)[0].eval())
+      # self.assertAllFinite(
+      #     gradients_impl.gradients(new_log_prob, initial_x)[0].eval())
+
+  def testChainWorksIn64Bit(self):
+    def log_prob(x):
+      return - math_ops.reduce_sum(x * x, axis=-1)
+    states, acceptance_probs = hmc.chain(
+        n_iterations=10,
+        step_size=np.float64(0.01),
+        n_leapfrog_steps=10,
+        initial_x=np.zeros(5).astype(np.float64),
+        target_log_prob_fn=log_prob,
+        event_dims=[-1])
+    with self.test_session() as sess:
+      states_, acceptance_probs_ = sess.run([states, acceptance_probs])
+    self.assertEqual(np.float64, states_.dtype)
+    self.assertEqual(np.float64, acceptance_probs_.dtype)
+
+  def testChainWorksIn16Bit(self):
+    def log_prob(x):
+      return - math_ops.reduce_sum(x * x, axis=-1)
+    states, acceptance_probs = hmc.chain(
+        n_iterations=10,
+        step_size=np.float16(0.01),
+        n_leapfrog_steps=10,
+        initial_x=np.zeros(5).astype(np.float16),
+        target_log_prob_fn=log_prob,
+        event_dims=[-1])
+    with self.test_session() as sess:
+      states_, acceptance_probs_ = sess.run([states, acceptance_probs])
+    self.assertEqual(np.float16, states_.dtype)
+    self.assertEqual(np.float16, acceptance_probs_.dtype)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/layers_conv_variational_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/layers_conv_variational_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..57f44aef1a198f62cd8a715472a68a3d889ec3ac
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/layers_conv_variational_test.py
@@ -0,0 +1,289 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for convolutional Bayesian layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.bayesflow.python.ops import layers_conv_variational as prob_layers_lib
+from tensorflow.contrib.bayesflow.python.ops import layers_util as prob_layers_util
+from tensorflow.contrib.distributions.python.ops import independent as independent_lib
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import normal as normal_lib
+from tensorflow.python.platform import test
+
+
+class Counter(object):
+  """Helper class to manage incrementing a counting `int`."""
+
+  def __init__(self):
+    self._value = -1
+
+  @property
+  def value(self):
+    return self._value
+
+  def __call__(self):
+    self._value += 1
+    return self._value
+
+
+class MockDistribution(independent_lib.Independent):
+  """Monitors DenseVariational calls to the underlying distribution."""
+
+  def __init__(self, result_sample, result_log_prob, loc=None, scale=None):
+    self.result_sample = result_sample
+    self.result_log_prob = result_log_prob
+    self.result_loc = loc
+    self.result_scale = scale
+    self.result_distribution = normal_lib.Normal(loc=0.0, scale=1.0)
+    if loc is not None and scale is not None:
+      self.result_distribution = normal_lib.Normal(loc=self.result_loc,
+                                                   scale=self.result_scale)
+    self.called_log_prob = Counter()
+    self.called_sample = Counter()
+    self.called_loc = Counter()
+    self.called_scale = Counter()
+
+  def log_prob(self, *args, **kwargs):
+    self.called_log_prob()
+    return self.result_log_prob
+
+  def sample(self, *args, **kwargs):
+    self.called_sample()
+    return self.result_sample
+
+  @property
+  def distribution(self):  # for dummy check on Independent(Normal)
+    return self.result_distribution
+
+  @property
+  def loc(self):
+    self.called_loc()
+    return self.result_loc
+
+  @property
+  def scale(self):
+    self.called_scale()
+    return self.result_scale
+
+
+class MockKLDivergence(object):
+  """Monitors layer calls to the divergence implementation."""
+
+  def __init__(self, result):
+    self.result = result
+    self.args = []
+    self.called = Counter()
+
+  def __call__(self, *args, **kwargs):
+    self.called()
+    self.args.append(args)
+    return self.result
+
+
+class ConvVariational(test.TestCase):
+
+  def _testKLPenaltyKernel(self, layer_class):
+    with self.test_session():
+      layer = layer_class(filters=2, kernel_size=3)
+      if layer_class == prob_layers_lib.Conv1DVariational:
+        inputs = random_ops.random_uniform([2, 3, 1], seed=1)
+      elif layer_class == prob_layers_lib.Conv2DVariational:
+        inputs = random_ops.random_uniform([2, 3, 3, 1], seed=1)
+      elif layer_class == prob_layers_lib.Conv3DVariational:
+        inputs = random_ops.random_uniform([2, 3, 3, 3, 1], seed=1)
+
+      # No keys.
+      losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+      self.assertEqual(len(losses), 0)
+      self.assertListEqual(layer.losses, losses)
+
+      _ = layer(inputs)
+
+      # Yes keys.
+      losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+      self.assertEqual(len(losses), 1)
+      self.assertListEqual(layer.losses, losses)
+
+  def _testKLPenaltyBoth(self, layer_class):
+    def _make_normal(dtype, *args):  # pylint: disable=unused-argument
+      return normal_lib.Normal(
+          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.))
+    with self.test_session():
+      layer = layer_class(
+          filters=2,
+          kernel_size=3,
+          bias_posterior_fn=prob_layers_util.default_mean_field_normal_fn(),
+          bias_prior_fn=_make_normal)
+      if layer_class == prob_layers_lib.Conv1DVariational:
+        inputs = random_ops.random_uniform([2, 3, 1], seed=1)
+      elif layer_class == prob_layers_lib.Conv2DVariational:
+        inputs = random_ops.random_uniform([2, 3, 3, 1], seed=1)
+      elif layer_class == prob_layers_lib.Conv3DVariational:
+        inputs = random_ops.random_uniform([2, 3, 3, 3, 1], seed=1)
+
+      # No keys.
+      losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+      self.assertEqual(len(losses), 0)
+      self.assertListEqual(layer.losses, losses)
+
+      _ = layer(inputs)
+
+      # Yes keys.
+      losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+      self.assertEqual(len(losses), 2)
+      self.assertListEqual(layer.losses, losses)
+
+  def _testConvVariational(self, layer_class):
+    batch_size, depth, height, width, channels, filters = 2, 4, 4, 4, 3, 5
+    with self.test_session() as sess:
+      seed = Counter()
+      if layer_class == prob_layers_lib.Conv1DVariational:
+        inputs = random_ops.random_uniform(
+            [batch_size, width, channels], seed=seed())
+        kernel_size = (2,)
+      elif layer_class == prob_layers_lib.Conv2DVariational:
+        inputs = random_ops.random_uniform(
+            [batch_size, height, width, channels], seed=seed())
+        kernel_size = (2, 2)
+      elif layer_class == prob_layers_lib.Conv3DVariational:
+        inputs = random_ops.random_uniform(
+            [batch_size, depth, height, width, channels], seed=seed())
+        kernel_size = (2, 2, 2)
+
+      kernel_shape = kernel_size + (channels, filters)
+      kernel_posterior = MockDistribution(
+          result_log_prob=random_ops.random_uniform(kernel_shape, seed=seed()),
+          result_sample=random_ops.random_uniform(kernel_shape, seed=seed()))
+      kernel_prior = MockDistribution(
+          result_log_prob=random_ops.random_uniform(kernel_shape, seed=seed()),
+          result_sample=random_ops.random_uniform(kernel_shape, seed=seed()))
+      kernel_divergence = MockKLDivergence(
+          result=random_ops.random_uniform(kernel_shape, seed=seed()))
+
+      bias_size = (filters,)
+      bias_posterior = MockDistribution(
+          result_log_prob=random_ops.random_uniform(bias_size, seed=seed()),
+          result_sample=random_ops.random_uniform(bias_size, seed=seed()))
+      bias_prior = MockDistribution(
+          result_log_prob=random_ops.random_uniform(bias_size, seed=seed()),
+          result_sample=random_ops.random_uniform(bias_size, seed=seed()))
+      bias_divergence = MockKLDivergence(
+          result=random_ops.random_uniform(bias_size, seed=seed()))
+
+      convolution_op = nn_ops.Convolution(
+          tensor_shape.TensorShape(inputs.shape),
+          filter_shape=tensor_shape.TensorShape(kernel_shape),
+          padding="SAME")
+      expected_outputs = convolution_op(inputs, kernel_posterior.result_sample)
+      expected_outputs = nn.bias_add(expected_outputs,
+                                     bias_posterior.result_sample,
+                                     data_format="NHWC")
+
+      layer = layer_class(
+          filters=filters,
+          kernel_size=kernel_size,
+          padding="SAME",
+          kernel_posterior_fn=lambda *args: kernel_posterior,
+          kernel_posterior_tensor_fn=lambda d: d.sample(seed=42),
+          kernel_prior_fn=lambda *args: kernel_prior,
+          kernel_divergence_fn=kernel_divergence,
+          bias_posterior_fn=lambda *args: bias_posterior,
+          bias_posterior_tensor_fn=lambda d: d.sample(seed=43),
+          bias_prior_fn=lambda *args: bias_prior,
+          bias_divergence_fn=bias_divergence)
+
+      outputs = layer(inputs)
+
+      kl_penalty = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+
+      [
+          expected_outputs_, actual_outputs_,
+          expected_kernel_, actual_kernel_,
+          expected_kernel_divergence_, actual_kernel_divergence_,
+          expected_bias_, actual_bias_,
+          expected_bias_divergence_, actual_bias_divergence_,
+      ] = sess.run([
+          expected_outputs, outputs,
+          kernel_posterior.result_sample, layer.kernel_posterior_tensor,
+          kernel_divergence.result, kl_penalty[0],
+          bias_posterior.result_sample, layer.bias_posterior_tensor,
+          bias_divergence.result, kl_penalty[1],
+      ])
+
+      self.assertAllClose(
+          expected_kernel_, actual_kernel_,
+          rtol=1e-6, atol=0.)
+      self.assertAllClose(
+          expected_bias_, actual_bias_,
+          rtol=1e-6, atol=0.)
+      self.assertAllClose(
+          expected_outputs_, actual_outputs_,
+          rtol=1e-6, atol=0.)
+      self.assertAllClose(
+          expected_kernel_divergence_, actual_kernel_divergence_,
+          rtol=1e-6, atol=0.)
+      self.assertAllClose(
+          expected_bias_divergence_, actual_bias_divergence_,
+          rtol=1e-6, atol=0.)
+
+      self.assertAllEqual(
+          [[kernel_posterior.distribution,
+            kernel_prior.distribution,
+            kernel_posterior.result_sample]],
+          kernel_divergence.args)
+
+      self.assertAllEqual(
+          [[bias_posterior.distribution,
+            bias_prior.distribution,
+            bias_posterior.result_sample]],
+          bias_divergence.args)
+
+  def testKLPenaltyKernelConv1DVariational(self):
+    self._testKLPenaltyKernel(prob_layers_lib.Conv1DVariational)
+
+  def testKLPenaltyKernelConv2DVariational(self):
+    self._testKLPenaltyKernel(prob_layers_lib.Conv2DVariational)
+
+  def testKLPenaltyKernelConv3DVariational(self):
+    self._testKLPenaltyKernel(prob_layers_lib.Conv3DVariational)
+
+  def testKLPenaltyBothConv1DVariational(self):
+    self._testKLPenaltyBoth(prob_layers_lib.Conv1DVariational)
+
+  def testKLPenaltyBothConv2DVariational(self):
+    self._testKLPenaltyBoth(prob_layers_lib.Conv2DVariational)
+
+  def testKLPenaltyBothConv3DVariational(self):
+    self._testKLPenaltyBoth(prob_layers_lib.Conv3DVariational)
+
+  def testConv1DVariational(self):
+    self._testConvVariational(prob_layers_lib.Conv1DVariational)
+
+  def testConv2DVariational(self):
+    self._testConvVariational(prob_layers_lib.Conv2DVariational)
+
+  def testConv3DVariational(self):
+    self._testConvVariational(prob_layers_lib.Conv3DVariational)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/layers_dense_variational_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/layers_dense_variational_test.py
index 50358fd1c2b7635ffe2d08c5af3219bb0a11498b..4e9f1193511c35beead85914ca988fde69b3afde 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/layers_dense_variational_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/layers_dense_variational_test.py
@@ -18,11 +18,18 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.contrib.bayesflow.python.ops import layers_dense_variational_impl as prob_layers_lib
+from tensorflow.contrib.bayesflow.python.ops import layers_util as prob_layers_util
+from tensorflow.contrib.distributions.python.ops import independent as independent_lib
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import normal as normal_lib
+from tensorflow.python.ops.distributions import util as distribution_util
 from tensorflow.python.platform import test
 
 
@@ -41,14 +48,18 @@ class Counter(object):
     return self._value
 
 
-class MockDistribution(normal_lib.Normal):
-  """Monitors DenseVariational calls to the underlying distribution."""
+class MockDistribution(independent_lib.Independent):
+  """Monitors layer calls to the underlying distribution."""
 
   def __init__(self, result_sample, result_log_prob, loc=None, scale=None):
     self.result_sample = result_sample
     self.result_log_prob = result_log_prob
     self.result_loc = loc
     self.result_scale = scale
+    self.result_distribution = normal_lib.Normal(loc=0.0, scale=1.0)
+    if loc is not None and scale is not None:
+      self.result_distribution = normal_lib.Normal(loc=self.result_loc,
+                                                   scale=self.result_scale)
     self.called_log_prob = Counter()
     self.called_sample = Counter()
     self.called_loc = Counter()
@@ -62,6 +73,10 @@ class MockDistribution(normal_lib.Normal):
     self.called_sample()
     return self.result_sample
 
+  @property
+  def distribution(self):  # for dummy check on Independent(Normal)
+    return self.result_distribution
+
   @property
   def loc(self):
     self.called_loc()
@@ -74,7 +89,7 @@ class MockDistribution(normal_lib.Normal):
 
 
 class MockKLDivergence(object):
-  """Monitors DenseVariational calls to the divergence implementation."""
+  """Monitors layer calls to the divergence implementation."""
 
   def __init__(self, result):
     self.result = result
@@ -87,94 +102,125 @@ class MockKLDivergence(object):
     return self.result
 
 
-class DenseVariationalLocalReparametrization(test.TestCase):
+class DenseVariational(test.TestCase):
 
-  def testKLPenaltyKernel(self):
+  def _testKLPenaltyKernel(self, layer_class):
     with self.test_session():
-      dense_vi = prob_layers_lib.DenseVariational(units=2)
+      layer = layer_class(units=2)
       inputs = random_ops.random_uniform([2, 3], seed=1)
 
       # No keys.
-      loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
-      self.assertEqual(len(loss_keys), 0)
-      self.assertListEqual(dense_vi.losses, loss_keys)
+      losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+      self.assertEqual(len(losses), 0)
+      self.assertListEqual(layer.losses, losses)
 
-      _ = dense_vi(inputs)
+      _ = layer(inputs)
 
       # Yes keys.
-      loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
-      self.assertEqual(len(loss_keys), 1)
-      self.assertListEqual(dense_vi.losses, loss_keys)
+      losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+      self.assertEqual(len(losses), 1)
+      self.assertListEqual(layer.losses, losses)
 
-  def testKLPenaltyBoth(self):
+  def _testKLPenaltyBoth(self, layer_class):
     def _make_normal(dtype, *args):  # pylint: disable=unused-argument
       return normal_lib.Normal(
           loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.))
     with self.test_session():
-      dense_vi = prob_layers_lib.DenseVariational(
+      layer = layer_class(
           units=2,
-          bias_posterior_fn=prob_layers_lib.default_mean_field_normal_fn(),
+          bias_posterior_fn=prob_layers_util.default_mean_field_normal_fn(),
           bias_prior_fn=_make_normal)
       inputs = random_ops.random_uniform([2, 3], seed=1)
 
       # No keys.
-      loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
-      self.assertEqual(len(loss_keys), 0)
-      self.assertListEqual(dense_vi.losses, loss_keys)
+      losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+      self.assertEqual(len(losses), 0)
+      self.assertListEqual(layer.losses, losses)
 
-      _ = dense_vi(inputs)
+      _ = layer(inputs)
 
       # Yes keys.
-      loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
-      self.assertEqual(len(loss_keys), 2)
-      self.assertListEqual(dense_vi.losses, loss_keys)
-
-  def testVariationalNonLocal(self):
+      losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+      self.assertEqual(len(losses), 2)
+      self.assertListEqual(layer.losses, losses)
+
+  def _testDenseSetUp(self, layer_class, batch_size, in_size, out_size,
+                      **kwargs):
+    seed = Counter()
+    inputs = random_ops.random_uniform([batch_size, in_size], seed=seed())
+
+    kernel_size = [in_size, out_size]
+    kernel_posterior = MockDistribution(
+        loc=random_ops.random_uniform(kernel_size, seed=seed()),
+        scale=random_ops.random_uniform(kernel_size, seed=seed()),
+        result_log_prob=random_ops.random_uniform(kernel_size, seed=seed()),
+        result_sample=random_ops.random_uniform(kernel_size, seed=seed()))
+    kernel_prior = MockDistribution(
+        result_log_prob=random_ops.random_uniform(kernel_size, seed=seed()),
+        result_sample=random_ops.random_uniform(kernel_size, seed=seed()))
+    kernel_divergence = MockKLDivergence(
+        result=random_ops.random_uniform(kernel_size, seed=seed()))
+
+    bias_size = [out_size]
+    bias_posterior = MockDistribution(
+        result_log_prob=random_ops.random_uniform(bias_size, seed=seed()),
+        result_sample=random_ops.random_uniform(bias_size, seed=seed()))
+    bias_prior = MockDistribution(
+        result_log_prob=random_ops.random_uniform(bias_size, seed=seed()),
+        result_sample=random_ops.random_uniform(bias_size, seed=seed()))
+    bias_divergence = MockKLDivergence(
+        result=random_ops.random_uniform(bias_size, seed=seed()))
+
+    layer = layer_class(
+        units=out_size,
+        kernel_posterior_fn=lambda *args: kernel_posterior,
+        kernel_posterior_tensor_fn=lambda d: d.sample(seed=42),
+        kernel_prior_fn=lambda *args: kernel_prior,
+        kernel_divergence_fn=kernel_divergence,
+        bias_posterior_fn=lambda *args: bias_posterior,
+        bias_posterior_tensor_fn=lambda d: d.sample(seed=43),
+        bias_prior_fn=lambda *args: bias_prior,
+        bias_divergence_fn=bias_divergence,
+        **kwargs)
+
+    outputs = layer(inputs)
+
+    kl_penalty = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+    return (kernel_posterior, kernel_prior, kernel_divergence,
+            bias_posterior, bias_prior, bias_divergence,
+            layer, inputs, outputs, kl_penalty)
+
+  def testKLPenaltyKernelReparameterization(self):
+    self._testKLPenaltyKernel(prob_layers_lib.DenseReparameterization)
+
+  def testKLPenaltyKernelLocalReparameterization(self):
+    self._testKLPenaltyKernel(prob_layers_lib.DenseLocalReparameterization)
+
+  def testKLPenaltyKernelFlipout(self):
+    self._testKLPenaltyKernel(prob_layers_lib.DenseFlipout)
+
+  def testKLPenaltyBothReparameterization(self):
+    self._testKLPenaltyBoth(prob_layers_lib.DenseReparameterization)
+
+  def testKLPenaltyBothLocalReparameterization(self):
+    self._testKLPenaltyBoth(prob_layers_lib.DenseLocalReparameterization)
+
+  def testKLPenaltyBothFlipout(self):
+    self._testKLPenaltyBoth(prob_layers_lib.DenseFlipout)
+
+  def testDenseReparameterization(self):
     batch_size, in_size, out_size = 2, 3, 4
     with self.test_session() as sess:
-      seed = Counter()
-      inputs = random_ops.random_uniform([batch_size, in_size], seed=seed())
-
-      kernel_size = [in_size, out_size]
-      kernel_posterior = MockDistribution(
-          result_log_prob=random_ops.random_uniform(kernel_size, seed=seed()),
-          result_sample=random_ops.random_uniform(kernel_size, seed=seed()))
-      kernel_prior = MockDistribution(
-          result_log_prob=random_ops.random_uniform(kernel_size, seed=seed()),
-          result_sample=random_ops.random_uniform(kernel_size, seed=seed()))
-      kernel_divergence = MockKLDivergence(
-          result=random_ops.random_uniform(kernel_size, seed=seed()))
-
-      bias_size = [out_size]
-      bias_posterior = MockDistribution(
-          result_log_prob=random_ops.random_uniform(bias_size, seed=seed()),
-          result_sample=random_ops.random_uniform(bias_size, seed=seed()))
-      bias_prior = MockDistribution(
-          result_log_prob=random_ops.random_uniform(bias_size, seed=seed()),
-          result_sample=random_ops.random_uniform(bias_size, seed=seed()))
-      bias_divergence = MockKLDivergence(
-          result=random_ops.random_uniform(bias_size, seed=seed()))
+      (kernel_posterior, kernel_prior, kernel_divergence,
+       bias_posterior, bias_prior, bias_divergence, layer, inputs,
+       outputs, kl_penalty) = self._testDenseSetUp(
+           prob_layers_lib.DenseReparameterization,
+           batch_size, in_size, out_size)
 
       expected_outputs = (
           math_ops.matmul(inputs, kernel_posterior.result_sample) +
           bias_posterior.result_sample)
 
-      dense_vi = prob_layers_lib.DenseVariational(
-          units=2,
-          kernel_use_local_reparameterization=False,
-          kernel_posterior_fn=lambda *args: kernel_posterior,
-          kernel_posterior_tensor_fn=lambda d: d.sample(seed=42),
-          kernel_prior_fn=lambda *args: kernel_prior,
-          kernel_divergence_fn=kernel_divergence,
-          bias_posterior_fn=lambda *args: bias_posterior,
-          bias_posterior_tensor_fn=lambda d: d.sample(seed=43),
-          bias_prior_fn=lambda *args: bias_prior,
-          bias_divergence_fn=bias_divergence)
-
-      outputs = dense_vi(inputs)
-
-      kl_penalty = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
-
       [
           expected_outputs_, actual_outputs_,
           expected_kernel_, actual_kernel_,
@@ -183,9 +229,9 @@ class DenseVariationalLocalReparametrization(test.TestCase):
           expected_bias_divergence_, actual_bias_divergence_,
       ] = sess.run([
           expected_outputs, outputs,
-          kernel_posterior.result_sample, dense_vi.kernel.posterior_tensor,
+          kernel_posterior.result_sample, layer.kernel_posterior_tensor,
           kernel_divergence.result, kl_penalty[0],
-          bias_posterior.result_sample, dense_vi.bias.posterior_tensor,
+          bias_posterior.result_sample, layer.bias_posterior_tensor,
           bias_divergence.result, kl_penalty[1],
       ])
 
@@ -206,40 +252,25 @@ class DenseVariationalLocalReparametrization(test.TestCase):
           rtol=1e-6, atol=0.)
 
       self.assertAllEqual(
-          [[kernel_posterior, kernel_prior, kernel_posterior.result_sample]],
+          [[kernel_posterior.distribution,
+            kernel_prior.distribution,
+            kernel_posterior.result_sample]],
           kernel_divergence.args)
 
       self.assertAllEqual(
-          [[bias_posterior, bias_prior, bias_posterior.result_sample]],
+          [[bias_posterior.distribution,
+            bias_prior.distribution,
+            bias_posterior.result_sample]],
           bias_divergence.args)
 
-  def testVariationalLocal(self):
+  def testDenseLocalReparameterization(self):
     batch_size, in_size, out_size = 2, 3, 4
     with self.test_session() as sess:
-      seed = Counter()
-      inputs = random_ops.random_uniform([batch_size, in_size], seed=seed())
-
-      kernel_size = [in_size, out_size]
-      kernel_posterior = MockDistribution(
-          loc=random_ops.random_uniform(kernel_size, seed=seed()),
-          scale=random_ops.random_uniform(kernel_size, seed=seed()),
-          result_log_prob=random_ops.random_uniform(kernel_size, seed=seed()),
-          result_sample=random_ops.random_uniform(kernel_size, seed=seed()))
-      kernel_prior = MockDistribution(
-          result_log_prob=random_ops.random_uniform(kernel_size, seed=seed()),
-          result_sample=random_ops.random_uniform(kernel_size, seed=seed()))
-      kernel_divergence = MockKLDivergence(
-          result=random_ops.random_uniform(kernel_size, seed=seed()))
-
-      bias_size = [out_size]
-      bias_posterior = MockDistribution(
-          result_log_prob=random_ops.random_uniform(bias_size, seed=seed()),
-          result_sample=random_ops.random_uniform(bias_size, seed=seed()))
-      bias_prior = MockDistribution(
-          result_log_prob=random_ops.random_uniform(bias_size, seed=seed()),
-          result_sample=random_ops.random_uniform(bias_size, seed=seed()))
-      bias_divergence = MockKLDivergence(
-          result=random_ops.random_uniform(bias_size, seed=seed()))
+      (kernel_posterior, kernel_prior, kernel_divergence,
+       bias_posterior, bias_prior, bias_divergence, layer, inputs,
+       outputs, kl_penalty) = self._testDenseSetUp(
+           prob_layers_lib.DenseLocalReparameterization,
+           batch_size, in_size, out_size)
 
       expected_kernel_posterior_affine = normal_lib.Normal(
           loc=math_ops.matmul(inputs, kernel_posterior.result_loc),
@@ -250,21 +281,80 @@ class DenseVariationalLocalReparametrization(test.TestCase):
       expected_outputs = (expected_kernel_posterior_affine_tensor +
                           bias_posterior.result_sample)
 
-      dense_vi = prob_layers_lib.DenseVariational(
-          units=2,
-          kernel_use_local_reparameterization=True,
-          kernel_posterior_fn=lambda *args: kernel_posterior,
-          kernel_posterior_tensor_fn=lambda d: d.sample(seed=42),
-          kernel_prior_fn=lambda *args: kernel_prior,
-          kernel_divergence_fn=kernel_divergence,
-          bias_posterior_fn=lambda *args: bias_posterior,
-          bias_posterior_tensor_fn=lambda d: d.sample(seed=43),
-          bias_prior_fn=lambda *args: bias_prior,
-          bias_divergence_fn=bias_divergence)
+      [
+          expected_outputs_, actual_outputs_,
+          expected_kernel_divergence_, actual_kernel_divergence_,
+          expected_bias_, actual_bias_,
+          expected_bias_divergence_, actual_bias_divergence_,
+      ] = sess.run([
+          expected_outputs, outputs,
+          kernel_divergence.result, kl_penalty[0],
+          bias_posterior.result_sample, layer.bias_posterior_tensor,
+          bias_divergence.result, kl_penalty[1],
+      ])
 
-      outputs = dense_vi(inputs)
+      self.assertAllClose(
+          expected_bias_, actual_bias_,
+          rtol=1e-6, atol=0.)
+      self.assertAllClose(
+          expected_outputs_, actual_outputs_,
+          rtol=1e-6, atol=0.)
+      self.assertAllClose(
+          expected_kernel_divergence_, actual_kernel_divergence_,
+          rtol=1e-6, atol=0.)
+      self.assertAllClose(
+          expected_bias_divergence_, actual_bias_divergence_,
+          rtol=1e-6, atol=0.)
+
+      self.assertAllEqual(
+          [[kernel_posterior.distribution,
+            kernel_prior.distribution,
+            None]],
+          kernel_divergence.args)
+
+      self.assertAllEqual(
+          [[bias_posterior.distribution,
+            bias_prior.distribution,
+            bias_posterior.result_sample]],
+          bias_divergence.args)
+
+  def testDenseFlipout(self):
+    batch_size, in_size, out_size = 2, 3, 4
+    with self.test_session() as sess:
+      (kernel_posterior, kernel_prior, kernel_divergence,
+       bias_posterior, bias_prior, bias_divergence, layer, inputs,
+       outputs, kl_penalty) = self._testDenseSetUp(
+           prob_layers_lib.DenseFlipout,
+           batch_size, in_size, out_size, seed=44)
+
+      expected_kernel_posterior_affine = normal_lib.Normal(
+          loc=array_ops.zeros_like(kernel_posterior.result_loc),
+          scale=kernel_posterior.result_scale)
+      expected_kernel_posterior_affine_tensor = (
+          expected_kernel_posterior_affine.sample(seed=42))
 
-      kl_penalty = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+      sign_input = random_ops.random_uniform(
+          [batch_size, in_size],
+          minval=0,
+          maxval=2,
+          dtype=dtypes.int32,
+          seed=layer.seed)
+      sign_input = math_ops.cast(2 * sign_input - 1, inputs.dtype)
+      sign_output = random_ops.random_uniform(
+          [batch_size, out_size],
+          minval=0,
+          maxval=2,
+          dtype=dtypes.int32,
+          seed=distribution_util.gen_new_seed(
+              layer.seed, salt="dense_flipout"))
+      sign_output = math_ops.cast(2 * sign_output - 1, inputs.dtype)
+      perturbed_inputs = math_ops.matmul(
+          inputs * sign_input, expected_kernel_posterior_affine_tensor)
+      perturbed_inputs *= sign_output
+
+      expected_outputs = math_ops.matmul(inputs, kernel_posterior.result_loc)
+      expected_outputs += perturbed_inputs
+      expected_outputs += bias_posterior.result_sample
 
       [
           expected_outputs_, actual_outputs_,
@@ -274,7 +364,7 @@ class DenseVariationalLocalReparametrization(test.TestCase):
       ] = sess.run([
           expected_outputs, outputs,
           kernel_divergence.result, kl_penalty[0],
-          bias_posterior.result_sample, dense_vi.bias.posterior_tensor,
+          bias_posterior.result_sample, layer.bias_posterior_tensor,
           bias_divergence.result, kl_penalty[1],
       ])
 
@@ -292,13 +382,62 @@ class DenseVariationalLocalReparametrization(test.TestCase):
           rtol=1e-6, atol=0.)
 
       self.assertAllEqual(
-          [[kernel_posterior, kernel_prior, None]],
+          [[kernel_posterior.distribution, kernel_prior.distribution, None]],
           kernel_divergence.args)
 
       self.assertAllEqual(
-          [[bias_posterior, bias_prior, bias_posterior.result_sample]],
+          [[bias_posterior.distribution,
+            bias_prior.distribution,
+            bias_posterior.result_sample]],
           bias_divergence.args)
 
+  def testRandomDenseFlipout(self):
+    batch_size, in_size, out_size = 2, 3, 4
+    with self.test_session() as sess:
+      seed = Counter()
+      inputs = random_ops.random_uniform([batch_size, in_size], seed=seed())
+
+      kernel_posterior = MockDistribution(
+          loc=random_ops.random_uniform(
+              [in_size, out_size], seed=seed()),
+          scale=random_ops.random_uniform(
+              [in_size, out_size], seed=seed()),
+          result_log_prob=random_ops.random_uniform(
+              [in_size, out_size], seed=seed()),
+          result_sample=random_ops.random_uniform(
+              [in_size, out_size], seed=seed()))
+      bias_posterior = MockDistribution(
+          loc=random_ops.random_uniform(
+              [out_size], seed=seed()),
+          scale=random_ops.random_uniform(
+              [out_size], seed=seed()),
+          result_log_prob=random_ops.random_uniform(
+              [out_size], seed=seed()),
+          result_sample=random_ops.random_uniform(
+              [out_size], seed=seed()))
+      layer_one = prob_layers_lib.DenseFlipout(
+          units=out_size,
+          kernel_posterior_fn=lambda *args: kernel_posterior,
+          kernel_posterior_tensor_fn=lambda d: d.sample(seed=42),
+          bias_posterior_fn=lambda *args: bias_posterior,
+          bias_posterior_tensor_fn=lambda d: d.sample(seed=43),
+          seed=44)
+      layer_two = prob_layers_lib.DenseFlipout(
+          units=out_size,
+          kernel_posterior_fn=lambda *args: kernel_posterior,
+          kernel_posterior_tensor_fn=lambda d: d.sample(seed=42),
+          bias_posterior_fn=lambda *args: bias_posterior,
+          bias_posterior_tensor_fn=lambda d: d.sample(seed=43),
+          seed=45)
+
+      outputs_one = layer_one(inputs)
+      outputs_two = layer_two(inputs)
+
+      outputs_one_, outputs_two_ = sess.run([
+          outputs_one, outputs_two])
+
+      self.assertLess(np.sum(np.isclose(outputs_one_, outputs_two_)), out_size)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py b/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py
index 333dce929530adceb30dcb63653a5bd009c059e0..5685a942e98800a39ec718adc67bcfd43aeafd52 100644
--- a/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py
@@ -27,6 +27,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -174,9 +175,11 @@ def chain(n_iterations, step_size, n_leapfrog_steps, initial_x,
 
     potential_and_grad = _make_potential_and_grad(target_log_prob_fn)
     potential, grad = potential_and_grad(initial_x)
-    return functional_ops.scan(body, array_ops.zeros(n_iterations),
-                               (initial_x, array_ops.zeros(non_event_shape),
-                                -potential, -grad))[:2]
+    return functional_ops.scan(
+        body, array_ops.zeros(n_iterations, dtype=initial_x.dtype),
+        (initial_x,
+         array_ops.zeros(non_event_shape, dtype=initial_x.dtype),
+         -potential, -grad))[:2]
 
 
 def ais_chain(n_iterations, step_size, n_leapfrog_steps, initial_x,
@@ -298,8 +301,9 @@ def ais_chain(n_iterations, step_size, n_leapfrog_steps, initial_x,
       return updated_x, acceptance_probs, w
 
     x, acceptance_probs, w = functional_ops.scan(
-        _body, beta_series, (initial_x, array_ops.zeros(non_event_shape),
-                             array_ops.zeros(non_event_shape)))
+        _body, beta_series,
+        (initial_x, array_ops.zeros(non_event_shape, dtype=initial_x.dtype),
+         array_ops.zeros(non_event_shape, dtype=initial_x.dtype)))
   return w[-1], x[-1], acceptance_probs[-1]
 
 
@@ -446,9 +450,10 @@ def kernel(step_size, n_leapfrog_steps, x, target_log_prob_fn, event_dims=(),
   """
   with ops.name_scope(name, 'hmc_kernel', [step_size, n_leapfrog_steps, x]):
     potential_and_grad = _make_potential_and_grad(target_log_prob_fn)
+    x = ops.convert_to_tensor(x, name='x')
 
     x_shape = array_ops.shape(x)
-    m = random_ops.random_normal(x_shape)
+    m = random_ops.random_normal(x_shape, dtype=x.dtype)
 
     kinetic_0 = 0.5 * math_ops.reduce_sum(math_ops.square(m), event_dims)
 
@@ -468,26 +473,33 @@ def kernel(step_size, n_leapfrog_steps, x, target_log_prob_fn, event_dims=(),
 
     kinetic_1 = 0.5 * math_ops.reduce_sum(math_ops.square(new_m), event_dims)
 
-    # TODO(mhoffman): It seems like there may be an opportunity for nans here.
-    # I'm delaying addressing this because we're going to refactor this part
-    # to use the more general Metropolis abstraction anyway.
-    acceptance_probs = math_ops.exp(math_ops.minimum(0., log_potential_0 -
-                                                     log_potential_1 +
-                                                     kinetic_0 - kinetic_1))
-    accepted = math_ops.cast(
-        random_ops.random_uniform(array_ops.shape(acceptance_probs)) <
-        acceptance_probs, np.float32)
-    new_log_prob = (-log_potential_0 * (1. - accepted) -
-                    log_potential_1 * accepted)
+    energy_change = log_potential_1 - log_potential_0 + kinetic_1 - kinetic_0
+    # Treat NaN as infinite energy (and therefore guaranteed rejection).
+    energy_change = array_ops.where(
+        math_ops.is_nan(energy_change),
+        array_ops.fill(array_ops.shape(energy_change),
+                       energy_change.dtype.as_numpy_dtype(np.inf)),
+        energy_change)
+    acceptance_probs = math_ops.exp(math_ops.minimum(-energy_change, 0.))
+    accepted = (
+        random_ops.random_uniform(
+            array_ops.shape(acceptance_probs), dtype=x.dtype)
+        < acceptance_probs)
+    new_log_prob = -array_ops.where(accepted, log_potential_1, log_potential_0)
 
     # TODO(b/65738010): This should work, but it doesn't for now.
     # reduced_shape = math_ops.reduced_shape(x_shape, event_dims)
     reduced_shape = array_ops.shape(math_ops.reduce_sum(x, event_dims,
                                                         keep_dims=True))
     accepted = array_ops.reshape(accepted, reduced_shape)
-    new_x = x * (1. - accepted) + new_x * accepted
-    new_grad = -grad_0 * (1. - accepted) - grad_1 * accepted
-
+    accepted = math_ops.logical_or(
+        accepted, math_ops.cast(array_ops.zeros_like(x), dtypes.bool))
+    new_x = array_ops.where(accepted, new_x, x)
+    new_grad = -array_ops.where(accepted, grad_1, grad_0)
+
+  # TODO(langmore) Gradients of acceptance_probs and new_log_prob with respect
+  # to initial_x will propagate NaNs (see testNanFromGradsDontPropagate).  This
+  # should be fixed.
   return new_x, acceptance_probs, new_log_prob, new_grad
 
 
@@ -525,6 +537,7 @@ def leapfrog_integrator(step_size, n_steps, initial_position, initial_momentum,
       Has shape matching `initial_position`.
 
   Example: Simple quadratic potential.
+
   ```python
   def potential_and_grad(position):
     return tf.reduce_sum(0.5 * tf.square(position)), position
@@ -600,6 +613,7 @@ def leapfrog_step(step_size, position, momentum, potential_and_grad, grad,
       Has shape matching `position`.
 
   Example: Simple quadratic potential.
+
   ```python
   def potential_and_grad(position):
     # Simple quadratic potential
diff --git a/tensorflow/contrib/bayesflow/python/ops/layers.py b/tensorflow/contrib/bayesflow/python/ops/layers.py
index dcead38af826a12e776160bdb251ba021e6b953c..93412afae738564d440065f230c9df0036591467 100644
--- a/tensorflow/contrib/bayesflow/python/ops/layers.py
+++ b/tensorflow/contrib/bayesflow/python/ops/layers.py
@@ -23,13 +23,31 @@ from __future__ import print_function
 
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
+from tensorflow.contrib.bayesflow.python.ops.layers_conv_variational import *
 from tensorflow.contrib.bayesflow.python.ops.layers_dense_variational_impl import *
+from tensorflow.contrib.bayesflow.python.ops.layers_util import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
-    'DenseVariational',
-    'dense_variational',
+    'Convolution1DVariational',
+    'Convolution2DVariational',
+    'Convolution3DVariational',
+    'Conv1DVariational',
+    'Conv2DVariational',
+    'Conv3DVariational',
+    'convolution1d_variational',
+    'convolution2d_variational',
+    'convolution3d_variational',
+    'conv1d_variational',
+    'conv2d_variational',
+    'conv3d_variational',
+    'DenseReparameterization',
+    'DenseLocalReparameterization',
+    'DenseFlipout',
+    'dense_reparameterization',
+    'dense_local_reparameterization',
+    'dense_flipout',
     'default_loc_scale_fn',
     'default_mean_field_normal_fn',
 ]
diff --git a/tensorflow/contrib/bayesflow/python/ops/layers_conv_variational.py b/tensorflow/contrib/bayesflow/python/ops/layers_conv_variational.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ffb55feb1ad754bf96473c075ad6fd38d4e8be9
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/ops/layers_conv_variational.py
@@ -0,0 +1,1415 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Convolutional variational layer classes and their functional aliases.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.bayesflow.python.ops import layers_util
+from tensorflow.contrib.distributions.python.ops import independent as independent_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.layers import base as layers_lib
+from tensorflow.python.layers import utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import standard_ops
+from tensorflow.python.ops.distributions import kullback_leibler as kl_lib
+from tensorflow.python.ops.distributions import normal as normal_lib
+
+
+class _ConvVariational(layers_lib.Layer):
+  """Abstract nD convolution layer (private, used as implementation base).
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. It may also include a bias addition and activation function
+  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
+  distributions.
+
+  By default, the layer implements a stochastic forward pass via
+  sampling from the kernel and bias posteriors,
+  ```none
+  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
+  ```
+  where f denotes the layer's calculation.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Arguments:
+    rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of n integers, specifying the
+      length of the convolution window.
+    strides: An integer or tuple/list of n integers,
+      specifying the stride length of the convolution.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, ..., channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, ...)`.
+    dilation_rate: An integer or tuple/list of n integers, specifying
+      the dilation rate to use for dilated convolution.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any `strides` value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Optional regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    name: A string, the name of the layer.
+
+  Properties:
+    rank: Python integer, dimensionality of convolution.
+    filters: Python integer, dimensionality of the output space.
+    kernel_size: Size of the convolution window.
+    strides: Stride length of convolution.
+    padding: Python string describing padding approach.
+    data_format: Python string describing input data's dimensions.
+    dilation_rate: Dilation rate for an atrous convolution.
+    activation: Activation function (`callable`).
+    activity_regularizer: Regularizer function for the output.
+    kernel_use_local_reparameterization: Python `bool` indicating whether
+      `kernel` calculation should employ the Local Reparameterization Trick.
+    kernel_posterior_fn: `callable` returning posterior.
+    kernel_posterior_tensor_fn: `callable` operating on posterior.
+    kernel_prior_fn: `callable` returning prior.
+    kernel_divergence_fn: `callable` returning divergence.
+    bias_posterior_fn: `callable` returning posterior.
+    bias_posterior_tensor_fn: `callable` operating on posterior.
+    bias_prior_fn: `callable` returning prior.
+    bias_divergence_fn: `callable` returning divergence.
+  """
+
+  def __init__(
+      self,
+      rank,
+      filters,
+      kernel_size,
+      strides=1,
+      padding="valid",
+      data_format="channels_last",
+      dilation_rate=1,
+      activation=None,
+      activity_regularizer=None,
+      trainable=True,
+      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+      kernel_posterior_tensor_fn=lambda d: d.sample(),
+      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
+      bias_posterior_tensor_fn=lambda d: d.sample(),
+      bias_prior_fn=None,
+      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      name=None,
+      **kwargs):
+    super(_ConvVariational, self).__init__(
+        trainable=trainable,
+        name=name,
+        activity_regularizer=activity_regularizer,
+        **kwargs)
+    self.rank = rank
+    self.filters = filters
+    self.kernel_size = utils.normalize_tuple(kernel_size, rank, "kernel_size")
+    self.strides = utils.normalize_tuple(strides, rank, "strides")
+    self.padding = utils.normalize_padding(padding)
+    self.data_format = utils.normalize_data_format(data_format)
+    self.dilation_rate = utils.normalize_tuple(
+        dilation_rate, rank, "dilation_rate")
+    self.activation = activation
+    self.input_spec = layers_lib.InputSpec(ndim=self.rank + 2)
+    self.kernel_posterior_fn = kernel_posterior_fn
+    self.kernel_posterior_tensor_fn = kernel_posterior_tensor_fn
+    self.kernel_prior_fn = kernel_prior_fn
+    self.kernel_divergence_fn = kernel_divergence_fn
+    self.bias_posterior_fn = bias_posterior_fn
+    self.bias_posterior_tensor_fn = bias_posterior_tensor_fn
+    self.bias_prior_fn = bias_prior_fn
+    self.bias_divergence_fn = bias_divergence_fn
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if self.data_format == "channels_first":
+      channel_axis = 1
+    else:
+      channel_axis = -1
+    if input_shape[channel_axis].value is None:
+      raise ValueError("The channel dimension of the inputs "
+                       "should be defined. Found `None`.")
+    input_dim = input_shape[channel_axis].value
+    kernel_shape = self.kernel_size + (input_dim, self.filters)
+    dtype = dtypes.as_dtype(self.dtype)
+
+    # Must have a posterior kernel.
+    self.kernel_posterior = self.kernel_posterior_fn(
+        dtype, kernel_shape, "kernel_posterior",
+        self.trainable, self.add_variable)
+
+    if self.kernel_prior_fn is None:
+      self.kernel_prior = None
+    else:
+      self.kernel_prior = self.kernel_prior_fn(
+          dtype, kernel_shape, "kernel_prior",
+          self.trainable, self.add_variable)
+    self._built_kernel_divergence = False
+
+    if self.bias_posterior_fn is None:
+      self.bias_posterior = None
+    else:
+      self.bias_posterior = self.bias_posterior_fn(
+          dtype, (self.filters,), "bias_posterior",
+          self.trainable, self.add_variable)
+
+    if self.bias_prior_fn is None:
+      self.bias_prior = None
+    else:
+      self.bias_prior = self.bias_prior_fn(
+          dtype, (self.filters,), "bias_prior",
+          self.trainable, self.add_variable)
+    self._built_bias_divergence = False
+
+    self.input_spec = layers_lib.InputSpec(ndim=self.rank + 2,
+                                           axes={channel_axis: input_dim})
+    self._convolution_op = nn_ops.Convolution(
+        input_shape,
+        filter_shape=tensor_shape.TensorShape(kernel_shape),
+        dilation_rate=self.dilation_rate,
+        strides=self.strides,
+        padding=self.padding.upper(),
+        data_format=utils.convert_data_format(self.data_format,
+                                              self.rank + 2))
+
+    self.built = True
+
+  def call(self, inputs):
+    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
+
+    outputs = self._apply_variational_kernel(inputs)
+    outputs = self._apply_variational_bias(outputs)
+    if self.activation is not None:
+      outputs = self.activation(outputs)
+    if not self._built_kernel_divergence:
+      kernel_posterior = self.kernel_posterior
+      kernel_prior = self.kernel_prior
+      if isinstance(self.kernel_posterior, independent_lib.Independent):
+        kernel_posterior = kernel_posterior.distribution
+      if isinstance(self.kernel_prior, independent_lib.Independent):
+        kernel_prior = kernel_prior.distribution
+      self._apply_divergence(self.kernel_divergence_fn,
+                             kernel_posterior,
+                             kernel_prior,
+                             self.kernel_posterior_tensor,
+                             name="divergence_kernel")
+      self._built_kernel_divergence = True
+    if not self._built_bias_divergence:
+      bias_posterior = self.bias_posterior
+      bias_prior = self.bias_prior
+      if isinstance(self.bias_posterior, independent_lib.Independent):
+        bias_posterior = bias_posterior.distribution
+      if isinstance(self.bias_prior, independent_lib.Independent):
+        bias_prior = bias_prior.distribution
+      self._apply_divergence(self.bias_divergence_fn,
+                             bias_posterior,
+                             bias_prior,
+                             self.bias_posterior_tensor,
+                             name="divergence_bias")
+      self._built_bias_divergence = True
+    return outputs
+
+  def _apply_variational_kernel(self, inputs):
+    self.kernel_posterior_tensor = self.kernel_posterior_tensor_fn(
+        self.kernel_posterior)
+    outputs = self._convolution_op(inputs, self.kernel_posterior_tensor)
+    return outputs
+
+  def _apply_variational_bias(self, inputs):
+    if self.bias_posterior is None:
+      self.bias_posterior_tensor = None
+      return inputs
+    self.bias_posterior_tensor = self.bias_posterior_tensor_fn(
+        self.bias_posterior)
+    outputs = inputs
+    if self.data_format == "channels_first":
+      if self.rank == 1:
+        # nn.bias_add does not accept a 1D input tensor.
+        bias = array_ops.reshape(self.bias_posterior_tensor,
+                                 (1, self.filters, 1))
+        outputs += bias
+      if self.rank == 2:
+        outputs = nn.bias_add(outputs,
+                              self.bias_posterior_tensor,
+                              data_format="NCHW")
+      if self.rank == 3:
+        # As of Mar 2017, direct addition is significantly slower than
+        # bias_add when computing gradients. To use bias_add, we collapse Z
+        # and Y into a single dimension to obtain a 4D input tensor.
+        outputs_shape = outputs.shape.as_list()
+        outputs_4d = array_ops.reshape(outputs,
+                                       [outputs_shape[0], outputs_shape[1],
+                                        outputs_shape[2] * outputs_shape[3],
+                                        outputs_shape[4]])
+        outputs_4d = nn.bias_add(outputs_4d,
+                                 self.bias_posterior_tensor,
+                                 data_format="NCHW")
+        outputs = array_ops.reshape(outputs_4d, outputs_shape)
+    else:
+      outputs = nn.bias_add(outputs,
+                            self.bias_posterior_tensor,
+                            data_format="NHWC")
+    return outputs
+
+  def _apply_divergence(self, divergence_fn, posterior, prior,
+                        posterior_tensor, name):
+    if (divergence_fn is None or
+        posterior is None or
+        prior is None):
+      divergence = None
+      return
+    divergence = standard_ops.identity(
+        divergence_fn(
+            posterior, prior, posterior_tensor),
+        name=name)
+    self.add_loss(divergence)
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == "channels_last":
+      space = input_shape[1:-1]
+      new_space = []
+      for i in range(len(space)):
+        new_dim = utils.conv_output_length(
+            space[i],
+            self.kernel_size[i],
+            padding=self.padding,
+            stride=self.strides[i],
+            dilation=self.dilation_rate[i])
+        new_space.append(new_dim)
+      return tensor_shape.TensorShape([input_shape[0]] + new_space +
+                                      [self.filters])
+    else:
+      space = input_shape[2:]
+      new_space = []
+      for i in range(len(space)):
+        new_dim = utils.conv_output_length(
+            space[i],
+            self.kernel_size[i],
+            padding=self.padding,
+            stride=self.strides[i],
+            dilation=self.dilation_rate[i])
+        new_space.append(new_dim)
+      return tensor_shape.TensorShape([input_shape[0], self.filters] +
+                                      new_space)
+
+
+class Conv1DVariational(_ConvVariational):
+  """1D convolution layer (e.g. temporal convolution).
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. It may also include a bias addition and activation function
+  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
+  distributions.
+
+  By default, the layer implements a stochastic forward pass via
+  sampling from the kernel and bias posteriors,
+  ```none
+  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
+  ```
+  where f denotes the layer's calculation.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Arguments:
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of a single integer, specifying the
+      length of the 1D convolution window.
+    strides: An integer or tuple/list of a single integer,
+      specifying the stride length of the convolution.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, length, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, length)`.
+    dilation_rate: An integer or tuple/list of a single integer, specifying
+      the dilation rate to use for dilated convolution.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any `strides` value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Optional regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+    name: A string, the name of the layer.
+
+  Properties:
+    filters: Python integer, dimensionality of the output space.
+    kernel_size: Size of the convolution window.
+    strides: Stride length of convolution.
+    padding: Python string describing padding approach.
+    data_format: Python string describing input data's dimensions.
+    dilation_rate: Dilation rate for an atrous convolution.
+    activation: Activation function (`callable`).
+    activity_regularizer: Regularizer function for the output.
+    kernel_use_local_reparameterization: Python `bool` indicating whether
+      `kernel` calculation should employ the Local Reparameterization Trick.
+    kernel_posterior_fn: `callable` returning posterior.
+    kernel_posterior_tensor_fn: `callable` operating on posterior.
+    kernel_prior_fn: `callable` returning prior.
+    kernel_divergence_fn: `callable` returning divergence.
+    bias_posterior_fn: `callable` returning posterior.
+    bias_posterior_tensor_fn: `callable` operating on posterior.
+    bias_prior_fn: `callable` returning prior.
+    bias_divergence_fn: `callable` returning divergence.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tf.reshape(features, [-1, 128, 1])
+  net = tfp.layers.Conv1DVariational(64,
+                                     kernel_size=5,
+                                     padding="SAME",
+                                     activation=tf.nn.relu)(net)
+  net = tf.reshape(net, [-1, 128 * 64])
+  logits = tfp.layers.DenseVariational(10)(net)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses reparameterization gradients to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+  """
+
+  def __init__(
+      self,
+      filters,
+      kernel_size,
+      strides=1,
+      padding="valid",
+      data_format="channels_last",
+      dilation_rate=1,
+      activation=None,
+      activity_regularizer=None,
+      trainable=True,
+      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+      kernel_posterior_tensor_fn=lambda d: d.sample(),
+      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
+      bias_posterior_tensor_fn=lambda d: d.sample(),
+      bias_prior_fn=None,
+      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      name=None,
+      **kwargs):
+    super(Conv1DVariational, self).__init__(
+        rank=1,
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilation_rate=dilation_rate,
+        activation=activation,
+        activity_regularizer=activity_regularizer,
+        trainable=trainable,
+        kernel_posterior_fn=kernel_posterior_fn,
+        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+        kernel_prior_fn=kernel_prior_fn,
+        kernel_divergence_fn=kernel_divergence_fn,
+        bias_posterior_fn=bias_posterior_fn,
+        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+        bias_prior_fn=bias_prior_fn,
+        bias_divergence_fn=bias_divergence_fn,
+        name=name, **kwargs)
+
+
+def conv1d_variational(
+    inputs,
+    filters,
+    kernel_size,
+    strides=1,
+    padding="valid",
+    data_format="channels_last",
+    dilation_rate=1,
+    activation=None,
+    activity_regularizer=None,
+    trainable=True,
+    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+    kernel_posterior_tensor_fn=lambda d: d.sample(),
+    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
+    bias_posterior_tensor_fn=lambda d: d.sample(),
+    bias_prior_fn=None,
+    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    name=None,
+    reuse=None):
+  """Functional interface for 1D convolution layer (e.g. temporal convolution).
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. It may also include a bias addition and activation function
+  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
+  distributions.
+
+  By default, the layer implements a stochastic forward pass via
+  sampling from the kernel and bias posteriors,
+  ```none
+  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
+  ```
+  where f denotes the layer's calculation.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Arguments:
+    inputs: Tensor input.
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of a single integer, specifying the
+      length of the 1D convolution window.
+    strides: An integer or tuple/list of a single integer,
+      specifying the stride length of the convolution.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, length, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, length)`.
+    dilation_rate: An integer or tuple/list of a single integer, specifying
+      the dilation rate to use for dilated convolution.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any `strides` value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Optional regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+    name: A string, the name of the layer.
+    reuse: Boolean, whether to reuse the weights of a previous layer
+      by the same name.
+
+  Returns:
+    Output tensor.
+
+  Raises:
+    ValueError: if eager execution is enabled.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tf.reshape(features, [-1, 128, 1])
+  net = tfp.layers.conv1d_variational(net,
+                                      64,
+                                      kernel_size=5,
+                                      padding="SAME",
+                                      activation=tf.nn.relu)
+  net = tf.reshape(net, [-1, 128 * 64])
+  logits = tfp.layers.dense_variational(net, 10)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses reparameterization gradients to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+  """
+  layer = Conv1DVariational(
+      filters=filters,
+      kernel_size=kernel_size,
+      strides=strides,
+      padding=padding,
+      data_format=data_format,
+      dilation_rate=dilation_rate,
+      activation=activation,
+      activity_regularizer=activity_regularizer,
+      trainable=trainable,
+      kernel_posterior_fn=kernel_posterior_fn,
+      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+      kernel_prior_fn=kernel_prior_fn,
+      kernel_divergence_fn=kernel_divergence_fn,
+      bias_posterior_fn=bias_posterior_fn,
+      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+      bias_prior_fn=bias_prior_fn,
+      bias_divergence_fn=bias_divergence_fn,
+      name=name,
+      dtype=inputs.dtype.base_dtype,
+      _scope=name,
+      _reuse=reuse)
+  return layer.apply(inputs)
+
+
+class Conv2DVariational(_ConvVariational):
+  """2D convolution layer (e.g. spatial convolution over images).
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. It may also include a bias addition and activation function
+  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
+  distributions.
+
+  By default, the layer implements a stochastic forward pass via
+  sampling from the kernel and bias posteriors,
+  ```none
+  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
+  ```
+  where f denotes the layer's calculation.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Arguments:
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of 2 integers, specifying the
+      height and width of the 2D convolution window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 2 integers,
+      specifying the strides of the convolution along the height and width.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, height, width)`.
+
+    dilation_rate: An integer or tuple/list of 2 integers, specifying
+      the dilation rate to use for dilated convolution.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Optional regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+    name: A string, the name of the layer.
+
+  Properties:
+    filters: Python integer, dimensionality of the output space.
+    kernel_size: Size of the convolution window.
+    strides: Stride length of convolution.
+    padding: Python string describing padding approach.
+    data_format: Python string describing input data's dimensions.
+    dilation_rate: Dilation rate for an atrous convolution.
+    activation: Activation function (`callable`).
+    activity_regularizer: Regularizer function for the output.
+    kernel_use_local_reparameterization: Python `bool` indicating whether
+      `kernel` calculation should employ the Local Reparameterization Trick.
+    kernel_posterior_fn: `callable` returning posterior.
+    kernel_posterior_tensor_fn: `callable` operating on posterior.
+    kernel_prior_fn: `callable` returning prior.
+    kernel_divergence_fn: `callable` returning divergence.
+    bias_posterior_fn: `callable` returning posterior.
+    bias_posterior_tensor_fn: `callable` operating on posterior.
+    bias_prior_fn: `callable` returning prior.
+    bias_divergence_fn: `callable` returning divergence.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tf.reshape(features, [-1, 32, 32, 3])
+  net = tfp.layers.Conv2DVariational(64,
+                                     kernel_size=5,
+                                     padding="SAME",
+                                     activation=tf.nn.relu)(net)
+  net = tf.layers.MaxPooling2D(pool_size=2,
+                               strides=2,
+                               padding="SAME")(net)
+  net = tf.reshape(net, [-1, 8 * 8 * 64])
+  logits = tfp.layers.DenseVariational(10)(net)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses reparameterization gradients to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+  """
+
+  def __init__(
+      self,
+      filters,
+      kernel_size,
+      strides=(1, 1),
+      padding="valid",
+      data_format="channels_last",
+      dilation_rate=(1, 1),
+      activation=None,
+      activity_regularizer=None,
+      trainable=True,
+      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+      kernel_posterior_tensor_fn=lambda d: d.sample(),
+      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
+      bias_posterior_tensor_fn=lambda d: d.sample(),
+      bias_prior_fn=None,
+      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      name=None,
+      **kwargs):
+    super(Conv2DVariational, self).__init__(
+        rank=2,
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilation_rate=dilation_rate,
+        activation=activation,
+        activity_regularizer=activity_regularizer,
+        trainable=trainable,
+        kernel_posterior_fn=kernel_posterior_fn,
+        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+        kernel_prior_fn=kernel_prior_fn,
+        kernel_divergence_fn=kernel_divergence_fn,
+        bias_posterior_fn=bias_posterior_fn,
+        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+        bias_prior_fn=bias_prior_fn,
+        bias_divergence_fn=bias_divergence_fn,
+        name=name, **kwargs)
+
+
+def conv2d_variational(
+    inputs,
+    filters,
+    kernel_size,
+    strides=(1, 1),
+    padding="valid",
+    data_format="channels_last",
+    dilation_rate=(1, 1),
+    activation=None,
+    activity_regularizer=None,
+    trainable=True,
+    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+    kernel_posterior_tensor_fn=lambda d: d.sample(),
+    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
+    bias_posterior_tensor_fn=lambda d: d.sample(),
+    bias_prior_fn=None,
+    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    name=None,
+    reuse=None):
+  """Functional interface for the 2D convolution layer.
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. It may also include a bias addition and activation function
+  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
+  distributions.
+
+  By default, the layer implements a stochastic forward pass via
+  sampling from the kernel and bias posteriors,
+  ```none
+  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
+  ```
+  where f denotes the layer's calculation.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Arguments:
+    inputs: Tensor input.
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of 2 integers, specifying the
+      height and width of the 2D convolution window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 2 integers,
+      specifying the strides of the convolution along the height and width.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, height, width)`.
+
+    dilation_rate: An integer or tuple/list of 2 integers, specifying
+      the dilation rate to use for dilated convolution.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Optional regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+    name: A string, the name of the layer.
+    reuse: Boolean, whether to reuse the weights of a previous layer
+      by the same name.
+
+  Returns:
+    Output tensor.
+
+  Raises:
+    ValueError: if eager execution is enabled.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tf.reshape(features, [-1, 32, 32, 3])
+  net = tfp.layers.conv2d_variational(net,
+                                      64,
+                                      kernel_size=5,
+                                      padding="SAME",
+                                      activation=tf.nn.relu)
+  net = tf.layers.max_pooling2d(net,
+                                pool_size=2,
+                                strides=2,
+                                padding="SAME")
+  net = tf.reshape(net, [-1, 8 * 8 * 64])
+  logits = tfp.layers.dense_variational(net, 10)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses reparameterization gradients to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+  """
+  layer = Conv2DVariational(
+      filters=filters,
+      kernel_size=kernel_size,
+      strides=strides,
+      padding=padding,
+      data_format=data_format,
+      dilation_rate=dilation_rate,
+      activation=activation,
+      activity_regularizer=activity_regularizer,
+      trainable=trainable,
+      kernel_posterior_fn=kernel_posterior_fn,
+      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+      kernel_prior_fn=kernel_prior_fn,
+      kernel_divergence_fn=kernel_divergence_fn,
+      bias_posterior_fn=bias_posterior_fn,
+      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+      bias_prior_fn=bias_prior_fn,
+      bias_divergence_fn=bias_divergence_fn,
+      name=name,
+      dtype=inputs.dtype.base_dtype,
+      _scope=name,
+      _reuse=reuse)
+  return layer.apply(inputs)
+
+
+class Conv3DVariational(_ConvVariational):
+  """3D convolution layer (e.g. spatial convolution over volumes).
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. It may also include a bias addition and activation function
+  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
+  distributions.
+
+  By default, the layer implements a stochastic forward pass via
+  sampling from the kernel and bias posteriors,
+  ```none
+  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
+  ```
+  where f denotes the layer's calculation.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Arguments:
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of 3 integers, specifying the
+      depth, height and width of the 3D convolution window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 3 integers,
+      specifying the strides of the convolution along the depth,
+      height and width.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, depth, height, width, channels)` while `channels_first`
+      corresponds to inputs with shape
+      `(batch, channels, depth, height, width)`.
+    dilation_rate: An integer or tuple/list of 3 integers, specifying
+      the dilation rate to use for dilated convolution.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Optional regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+    name: A string, the name of the layer.
+
+  Properties:
+    filters: Python integer, dimensionality of the output space.
+    kernel_size: Size of the convolution window.
+    strides: Stride length of convolution.
+    padding: Python string describing padding approach.
+    data_format: Python string describing input data's dimensions.
+    dilation_rate: Dilation rate for an atrous convolution.
+    activation: Activation function (`callable`).
+    activity_regularizer: Regularizer function for the output.
+    kernel_use_local_reparameterization: Python `bool` indicating whether
+      `kernel` calculation should employ the Local Reparameterization Trick.
+    kernel_posterior_fn: `callable` returning posterior.
+    kernel_posterior_tensor_fn: `callable` operating on posterior.
+    kernel_prior_fn: `callable` returning prior.
+    kernel_divergence_fn: `callable` returning divergence.
+    bias_posterior_fn: `callable` returning posterior.
+    bias_posterior_tensor_fn: `callable` operating on posterior.
+    bias_prior_fn: `callable` returning prior.
+    bias_divergence_fn: `callable` returning divergence.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tf.reshape(features, [-1, 256, 32, 32, 3])
+  net = tfp.layers.Conv3DVariational(64,
+                                     kernel_size=5,
+                                     padding="SAME",
+                                     activation=tf.nn.relu)(net)
+  net = tf.layers.MaxPooling2D(pool_size=2,
+                               strides=2,
+                               padding="SAME")(net)
+  net = tf.reshape(net, [-1, 256 * 8 * 8 * 64])
+  logits = tfp.layers.DenseVariational(10)(net)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses reparameterization gradients to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+  """
+
+  def __init__(
+      self,
+      filters,
+      kernel_size,
+      strides=(1, 1, 1),
+      padding="valid",
+      data_format="channels_last",
+      dilation_rate=(1, 1, 1),
+      activation=None,
+      activity_regularizer=None,
+      trainable=True,
+      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+      kernel_posterior_tensor_fn=lambda d: d.sample(),
+      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
+      bias_posterior_tensor_fn=lambda d: d.sample(),
+      bias_prior_fn=None,
+      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      name=None,
+      **kwargs):
+    super(Conv3DVariational, self).__init__(
+        rank=3,
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilation_rate=dilation_rate,
+        activation=activation,
+        activity_regularizer=activity_regularizer,
+        trainable=trainable,
+        kernel_posterior_fn=kernel_posterior_fn,
+        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+        kernel_prior_fn=kernel_prior_fn,
+        kernel_divergence_fn=kernel_divergence_fn,
+        bias_posterior_fn=bias_posterior_fn,
+        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+        bias_prior_fn=bias_prior_fn,
+        bias_divergence_fn=bias_divergence_fn,
+        name=name, **kwargs)
+
+
+def conv3d_variational(
+    inputs,
+    filters,
+    kernel_size,
+    strides=(1, 1, 1),
+    padding="valid",
+    data_format="channels_last",
+    dilation_rate=(1, 1, 1),
+    activation=None,
+    activity_regularizer=None,
+    trainable=True,
+    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+    kernel_posterior_tensor_fn=lambda d: d.sample(),
+    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
+    bias_posterior_tensor_fn=lambda d: d.sample(),
+    bias_prior_fn=None,
+    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    name=None,
+    reuse=None):
+  """Functional interface for the 3D convolution layer.
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. It may also include a bias addition and activation function
+  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
+  distributions.
+
+  By default, the layer implements a stochastic forward pass via
+  sampling from the kernel and bias posteriors,
+  ```none
+  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
+  ```
+  where f denotes the layer's calculation.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Arguments:
+    inputs: Tensor input.
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of 3 integers, specifying the
+      depth, height and width of the 3D convolution window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 3 integers,
+      specifying the strides of the convolution along the depth,
+      height and width.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, depth, height, width, channels)` while `channels_first`
+      corresponds to inputs with shape
+      `(batch, channels, depth, height, width)`.
+    dilation_rate: An integer or tuple/list of 3 integers, specifying
+      the dilation rate to use for dilated convolution.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Optional regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+    name: A string, the name of the layer.
+    reuse: Boolean, whether to reuse the weights of a previous layer
+      by the same name.
+
+  Returns:
+    Output tensor.
+
+  Raises:
+    ValueError: if eager execution is enabled.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tf.reshape(features, [-1, 256, 32, 32, 3])
+  net = tfp.layers.conv3d_variational(net,
+                                      64,
+                                      kernel_size=5,
+                                      padding="SAME",
+                                      activation=tf.nn.relu)
+  net = tf.layers.max_pooling2d(net,
+                                pool_size=2,
+                                strides=2,
+                                padding="SAME")
+  net = tf.reshape(net, [-1, 256 * 8 * 8 * 64])
+  logits = tfp.layers.dense_variational(net, 10)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses reparameterization gradients to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+  """
+  layer = Conv3DVariational(
+      filters=filters,
+      kernel_size=kernel_size,
+      strides=strides,
+      padding=padding,
+      data_format=data_format,
+      dilation_rate=dilation_rate,
+      activation=activation,
+      activity_regularizer=activity_regularizer,
+      trainable=trainable,
+      kernel_posterior_fn=kernel_posterior_fn,
+      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+      kernel_prior_fn=kernel_prior_fn,
+      kernel_divergence_fn=kernel_divergence_fn,
+      bias_posterior_fn=bias_posterior_fn,
+      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+      bias_prior_fn=bias_prior_fn,
+      bias_divergence_fn=bias_divergence_fn,
+      name=name,
+      dtype=inputs.dtype.base_dtype,
+      _scope=name,
+      _reuse=reuse)
+  return layer.apply(inputs)
+
+
+# Aliases
+
+Convolution1DVariational = Conv1DVariational
+Convolution2DVariational = Conv2DVariational
+Convolution3DVariational = Conv3DVariational
+convolution1d_variational = conv1d_variational
+convolution2d_variational = conv2d_variational
+convolution3d_variational = conv3d_variational
diff --git a/tensorflow/contrib/bayesflow/python/ops/layers_dense_variational_impl.py b/tensorflow/contrib/bayesflow/python/ops/layers_dense_variational_impl.py
index b05ce0ffc1dd55ffb029b339a846a9aa5c877620..a749a396f15188ef345b4ae7c53017b6004c5e71 100644
--- a/tensorflow/contrib/bayesflow/python/ops/layers_dense_variational_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/layers_dense_variational_impl.py
@@ -14,221 +14,60 @@
 # ==============================================================================
 """Dense Bayesian layer using KL-divergence based variational inference.
 
-@@DenseVariational
-@@dense_variational
-
-@@default_loc_scale_fn
-@@default_mean_field_normal_fn
+@@DenseReparameterization
+@@DenseLocalReparameterization
+@@DenseFlipout
+@@dense_reparameterization
+@@dense_local_reparameterization
+@@dense_flipout
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
-from tensorflow.contrib.distributions.python.ops import deterministic as deterministic_lib
+from tensorflow.contrib.bayesflow.python.ops import layers_util
+from tensorflow.contrib.distributions.python.ops import independent as independent_lib
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.layers import base as layers_lib
-from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
-from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import standard_ops
 from tensorflow.python.ops.distributions import kullback_leibler as kl_lib
 from tensorflow.python.ops.distributions import normal as normal_lib
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
-    "DenseVariational",
-    "dense_variational",
-    "default_loc_scale_fn",
-    "default_mean_field_normal_fn",
+    "DenseReparameterization",
+    "DenseLocalReparameterization",
+    "DenseFlipout",
+    "dense_reparameterization",
+    "dense_local_reparameterization",
+    "dense_flipout",
 ]
 
 
-def default_loc_scale_fn(
-    is_singular=False,
-    loc_initializer=init_ops.random_normal_initializer(stddev=0.1),
-    untransformed_scale_initializer=init_ops.random_normal_initializer(
-        mean=-3., stddev=0.1),
-    loc_regularizer=None,
-    untransformed_scale_regularizer=None,
-    loc_constraint=None,
-    untransformed_scale_constraint=None):
-  """Makes closure which creates `loc`, `scale` params from `tf.get_variable`.
-
-  This function produces a closure which produces `loc`, `scale` using
-  `tf.get_variable`. The closure accepts the following arguments:
-
-    dtype: Type of parameter's event.
-    shape: Python `list`-like representing the parameter's event shape.
-    name: Python `str` name prepended to any created (or existing)
-      `tf.Variable`s.
-    trainable: Python `bool` indicating all created `tf.Variable`s should be
-      added to the graph collection `GraphKeys.TRAINABLE_VARIABLES`.
-    add_variable_fn: `tf.get_variable`-like `callable` used to create (or
-      access existing) `tf.Variable`s.
-
-  Args:
-    is_singular: Python `bool` indicating if `scale is None`. Default: `False`.
-    loc_initializer: Initializer function for the `loc` parameters.
-      The default is `tf.random_normal_initializer(mean=0., stddev=0.1)`.
-    untransformed_scale_initializer: Initializer function for the `scale`
-      parameters. Default value: `tf.random_normal_initializer(mean=-3.,
-      stddev=0.1)`. This implies the softplus transformed result has mean
-      approximately `0.05` and std. deviation approximately `0.005`.
-    loc_regularizer: Regularizer function for the `loc` parameters.
-      The default (`None`) is to use the `tf.get_variable` default.
-    untransformed_scale_regularizer: Regularizer function for the `scale`
-      parameters. The default (`None`) is to use the `tf.get_variable` default.
-    loc_constraint: An optional projection function to be applied to the
-      loc after being updated by an `Optimizer`. The function must take as input
-      the unprojected variable and must return the projected variable (which
-      must have the same shape). Constraints are not safe to use when doing
-      asynchronous distributed training.
-      The default (`None`) is to use the `tf.get_variable` default.
-    untransformed_scale_constraint: An optional projection function to be
-      applied to the `scale` parameters after being updated by an `Optimizer`
-      (e.g. used to implement norm constraints or value constraints). The
-      function must take as input the unprojected variable and must return the
-      projected variable (which must have the same shape). Constraints are not
-      safe to use when doing asynchronous distributed training. The default
-      (`None`) is to use the `tf.get_variable` default.
-
-  Returns:
-    default_loc_scale_fn: Python `callable` which instantiates `loc`, `scale`
-    parameters from args: `dtype, shape, name, trainable, add_variable_fn`.
-  """
-  def _fn(dtype, shape, name, trainable, add_variable_fn):
-    """Creates `loc`, `scale` parameters."""
-    loc = add_variable_fn(
-        name=name + "_loc",
-        shape=shape,
-        initializer=loc_initializer,
-        regularizer=loc_regularizer,
-        constraint=loc_constraint,
-        dtype=dtype,
-        trainable=trainable)
-    if is_singular:
-      return loc, None
-    untransformed_scale = add_variable_fn(
-        name=name + "_untransformed_scale",
-        shape=shape,
-        initializer=untransformed_scale_initializer,
-        regularizer=untransformed_scale_regularizer,
-        constraint=untransformed_scale_constraint,
-        dtype=dtype,
-        trainable=trainable)
-    scale = (np.finfo(dtype.as_numpy_dtype).eps +
-             nn_ops.softplus(untransformed_scale))
-    return loc, scale
-  return _fn
-
-
-def default_mean_field_normal_fn(
-    is_singular=False,
-    loc_initializer=None,
-    untransformed_scale_initializer=None,
-    loc_regularizer=None,
-    untransformed_scale_regularizer=None,
-    loc_constraint=None,
-    untransformed_scale_constraint=None):
-  """Creates a function to build Normal distributions with trainable params.
-
-  This function produces a closure which produces `tf.distributions.Normal`
-  parameterized by a loc` and `scale` each created using `tf.get_variable`. The
-  produced closure accepts the following arguments:
-
-    name: Python `str` name prepended to any created (or existing)
-      `tf.Variable`s.
-    shape: Python `list`-like representing the parameter's event shape.
-    dtype: Type of parameter's event.
-    trainable: Python `bool` indicating all created `tf.Variable`s should be
-      added to the graph collection `GraphKeys.TRAINABLE_VARIABLES`.
-    add_variable_fn: `tf.get_variable`-like `callable` used to create (or
-      access existing) `tf.Variable`s.
+class _DenseVariational(layers_lib.Layer):
+  """Abstract densely-connected class (private, used as implementation base).
 
-  Args:
-    is_singular: Python `bool` if `True`, forces the special case limit of
-      `scale->0`, i.e., a `Deterministic` distribution.
-    loc_initializer: Initializer function for the `loc` parameters.
-      If `None` (default), values are initialized using the default
-      initializer used by `tf.get_variable`.
-    untransformed_scale_initializer: Initializer function for the `scale`
-      parameters. If `None` (default), values are initialized using the default
-      initializer used by `tf.get_variable`.
-    loc_regularizer: Regularizer function for the `loc` parameters.
-    untransformed_scale_regularizer: Regularizer function for the `scale`
-      parameters.
-    loc_constraint: An optional projection function to be applied to the
-      loc after being updated by an `Optimizer`. The function must take as input
-      the unprojected variable and must return the projected variable (which
-      must have the same shape). Constraints are not safe to use when doing
-      asynchronous distributed training.
-    untransformed_scale_constraint: An optional projection function to be
-      applied to the `scale` parameters after being updated by an `Optimizer`
-      (e.g. used to implement norm constraints or value constraints). The
-      function must take as input the unprojected variable and must return the
-      projected variable (which must have the same shape). Constraints are not
-      safe to use when doing asynchronous distributed training.
-
-  Returns:
-    make_normal_fn: Python `callable` which creates a `tf.distributions.Normal`
-      using from args: `dtype, shape, name, trainable, add_variable_fn`.
-  """
-  loc_scale_fn_ = default_loc_scale_fn(
-      is_singular,
-      loc_initializer,
-      untransformed_scale_initializer,
-      loc_regularizer,
-      untransformed_scale_regularizer,
-      loc_constraint,
-      untransformed_scale_constraint)
-  def _fn(dtype, shape, name, trainable, add_variable_fn):
-    """Creates a batch of `Deterministic` or `Normal` distributions."""
-    loc, scale = loc_scale_fn_(dtype, shape, name, trainable, add_variable_fn)
-    if scale is None:
-      return deterministic_lib.Deterministic(loc=loc)
-    return normal_lib.Normal(loc=loc, scale=scale)
-  return _fn
-
-
-class DenseVariational(layers_lib.Layer):
-  """Densely-connected variational class.
-
-  This layer implements the Bayesian variational inference analogue to:
-  `outputs = activation(matmul(inputs, kernel) + bias)`
-  by assuming the `kernel` and/or the `bias` are random variables.
-
-  The layer implements a stochastic dense calculation by making a Monte Carlo
-  approximation of a [variational Bayesian method based on KL divergence](
-  https://en.wikipedia.org/wiki/Variational_Bayesian_methods), i.e.,
+  This layer implements the Bayesian variational inference analogue to
+  a dense layer by assuming the `kernel` and/or the `bias` are drawn
+  from distributions. By default, the layer implements a stochastic
+  forward pass via sampling from the kernel and bias posteriors,
 
   ```none
-  -log p(y|x) = -log int_{R**d} p(y|x,w) p(w) dw
-              = -log int_{R**d} p(y,w|x) q(w|x) / q(w|x) dw
-             <= E_q(W|x)[-log p(y,W|x) + log q(W|x)]       # Jensen's
-              = E_q(W|x)[-log p(y|x,W)] + KL[q(W|x), p(W)]
-             ~= m**-1 sum{ -log(y|x,w[j]) : w[j] ~ q(W|x), j=1..m }
-                 + KL[q(W|x), p(W)]
+  kernel, bias ~ posterior
+  outputs = activation(matmul(inputs, kernel) + bias)
   ```
 
-  where `W` denotes the (independent) `kernel` and `bias` random variables, `w`
-  is a random variate or outcome of `W`, `y` is the label, `x` is the evidence`,
-  and `~=` denotes an approximation which becomes exact as `m->inf`. The above
-  bound is sometimes referred to as the negative Evidence Lower BOund or
-  negative [ELBO](https://arxiv.org/abs/1601.00670). In context of a DNN, this
-  layer is appropriate to use when the final loss is a negative log-likelihood.
-
-  The Monte-Carlo sum portion is used for the feed-forward calculation of the
-  DNN. The KL divergence portion can be added to the final loss via:
-  `loss += sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))`.
-
   The arguments permit separate specification of the surrogate posterior
   (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  random variables (which together comprise `W`).
+  distributions.
 
   Args:
     units: Integer or Long, dimensionality of the output space.
@@ -237,10 +76,6 @@ class DenseVariational(layers_lib.Layer):
     activity_regularizer: Regularizer function for the output.
     trainable: Boolean, if `True` also add variables to the graph collection
       `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_use_local_reparameterization: Python `bool` indicating whether
-      `kernel` calculation should employ the Local Reparameterization Trick.
-      When `True`, `kernel_posterior_fn` must create an instance of
-      `tf.distributions.Normal`.
     kernel_posterior_fn: Python `callable` which creates
       `tf.distributions.Distribution` instance representing the surrogate
       posterior of the `kernel` parameter. Default value:
@@ -283,12 +118,14 @@ class DenseVariational(layers_lib.Layer):
     units: Python integer, dimensionality of the output space.
     activation: Activation function (`callable`).
     activity_regularizer: Regularizer function for the output.
-    kernel_use_local_reparameterization: Python `bool` indicating whether
-      `kernel` calculation should employ the Local Reparameterization Trick.
-    kernel: `VariationalKernelParamater` instance containing all `kernel`
-      related properties and `callable`s.
-    bias: `VariationalParameter` instance containing all `kernel`
-      related properties and `callable`s.
+    kernel_posterior_fn: `callable` returning posterior.
+    kernel_posterior_tensor_fn: `callable` operating on posterior.
+    kernel_prior_fn: `callable` returning prior.
+    kernel_divergence_fn: `callable` returning divergence.
+    bias_posterior_fn: `callable` returning posterior.
+    bias_posterior_tensor_fn: `callable` operating on posterior.
+    bias_prior_fn: `callable` returning prior.
+    bias_divergence_fn: `callable` returning divergence.
   """
 
   def __init__(
@@ -297,66 +134,33 @@ class DenseVariational(layers_lib.Layer):
       activation=None,
       activity_regularizer=None,
       trainable=True,
-      kernel_use_local_reparameterization=True,
-      kernel_posterior_fn=default_mean_field_normal_fn(),
+      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
       kernel_posterior_tensor_fn=lambda d: d.sample(),
       kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
           loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
       kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      bias_posterior_fn=default_mean_field_normal_fn(is_singular=True),
+      bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
       bias_posterior_tensor_fn=lambda d: d.sample(),
       bias_prior_fn=None,
       bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
       name=None,
       **kwargs):
-    super(DenseVariational, self).__init__(
+    super(_DenseVariational, self).__init__(
         trainable=trainable,
         name=name,
         activity_regularizer=activity_regularizer,
         **kwargs)
-    self._units = units
-    self._activation = activation
-    self._input_spec = layers_lib.InputSpec(min_ndim=2)
-    self._kernel_use_local_reparameterization = (
-        kernel_use_local_reparameterization)
-    self._kernel = VariationalKernelParameter(
-        kernel_posterior_fn,
-        kernel_posterior_tensor_fn,
-        kernel_prior_fn,
-        kernel_divergence_fn)
-    self._bias = VariationalParameter(
-        bias_posterior_fn,
-        bias_posterior_tensor_fn,
-        bias_prior_fn,
-        bias_divergence_fn)
-
-  @property
-  def units(self):
-    return self._units
-
-  @property
-  def activation(self):
-    return self._activation
-
-  @property
-  def input_spec(self):
-    return self._input_spec
-
-  @input_spec.setter
-  def input_spec(self, value):
-    self._input_spec = value
-
-  @property
-  def kernel_use_local_reparameterization(self):
-    return self._kernel_use_local_reparameterization
-
-  @property
-  def kernel(self):
-    return self._kernel
-
-  @property
-  def bias(self):
-    return self._bias
+    self.units = units
+    self.activation = activation
+    self.input_spec = layers_lib.InputSpec(min_ndim=2)
+    self.kernel_posterior_fn = kernel_posterior_fn
+    self.kernel_posterior_tensor_fn = kernel_posterior_tensor_fn
+    self.kernel_prior_fn = kernel_prior_fn
+    self.kernel_divergence_fn = kernel_divergence_fn
+    self.bias_posterior_fn = bias_posterior_fn
+    self.bias_posterior_tensor_fn = bias_posterior_tensor_fn
+    self.bias_prior_fn = bias_prior_fn
+    self.bias_divergence_fn = bias_divergence_fn
 
   def build(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape)
@@ -368,29 +172,29 @@ class DenseVariational(layers_lib.Layer):
     dtype = dtypes.as_dtype(self.dtype)
 
     # Must have a posterior kernel.
-    self.kernel.posterior = self.kernel.posterior_fn(
+    self.kernel_posterior = self.kernel_posterior_fn(
         dtype, [in_size, self.units], "kernel_posterior",
         self.trainable, self.add_variable)
 
-    if self.kernel.prior_fn is None:
+    if self.kernel_prior_fn is None:
       self.kernel_prior = None
     else:
-      self.kernel.prior = self.kernel.prior_fn(
+      self.kernel_prior = self.kernel_prior_fn(
           dtype, [in_size, self.units], "kernel_prior",
           self.trainable, self.add_variable)
     self._built_kernel_divergence = False
 
-    if self.bias.posterior_fn is None:
-      self.bias.posterior = None
+    if self.bias_posterior_fn is None:
+      self.bias_posterior = None
     else:
-      self.bias.posterior = self.bias.posterior_fn(
+      self.bias_posterior = self.bias_posterior_fn(
           dtype, [self.units], "bias_posterior",
           self.trainable, self.add_variable)
 
-    if self.bias.prior_fn is None:
-      self.bias.prior = None
+    if self.bias_prior_fn is None:
+      self.bias_prior = None
     else:
-      self.bias.prior = self.bias.prior_fn(
+      self.bias_prior = self.bias_prior_fn(
           dtype, [self.units], "bias_prior",
           self.trainable, self.add_variable)
     self._built_bias_divergence = False
@@ -405,54 +209,53 @@ class DenseVariational(layers_lib.Layer):
     if self.activation is not None:
       outputs = self.activation(outputs)  # pylint: disable=not-callable
     if not self._built_kernel_divergence:
-      self._apply_divergence(self.kernel, name="divergence_kernel")
+      kernel_posterior = self.kernel_posterior
+      kernel_prior = self.kernel_prior
+      if isinstance(self.kernel_posterior, independent_lib.Independent):
+        kernel_posterior = kernel_posterior.distribution
+      if isinstance(self.kernel_prior, independent_lib.Independent):
+        kernel_prior = kernel_prior.distribution
+      self._apply_divergence(self.kernel_divergence_fn,
+                             kernel_posterior,
+                             kernel_prior,
+                             self.kernel_posterior_tensor,
+                             name="divergence_kernel")
       self._built_kernel_divergence = True
     if not self._built_bias_divergence:
-      self._apply_divergence(self.bias, name="divergence_bias")
+      bias_posterior = self.bias_posterior
+      bias_prior = self.bias_prior
+      if isinstance(self.bias_posterior, independent_lib.Independent):
+        bias_posterior = bias_posterior.distribution
+      if isinstance(self.bias_prior, independent_lib.Independent):
+        bias_prior = bias_prior.distribution
+      self._apply_divergence(self.bias_divergence_fn,
+                             bias_posterior,
+                             bias_prior,
+                             self.bias_posterior_tensor,
+                             name="divergence_bias")
       self._built_bias_divergence = True
     return outputs
 
-  def _apply_variational_kernel(self, inputs):
-    if not self.kernel_use_local_reparameterization:
-      self.kernel.posterior_tensor = self.kernel.posterior_tensor_fn(
-          self.kernel.posterior)
-      self.kernel.posterior_affine = None
-      self.kernel.posterior_affine_tensor = None
-      return self._matmul(inputs, self.kernel.posterior_tensor)
-    if not isinstance(self.kernel.posterior, normal_lib.Normal):
-      raise TypeError("`kernel_use_local_reparameterization=True` requires "
-                      "`kernel_posterior_fn` produce an instance of "
-                      "`tf.distributions.Normal` (saw: \"{}\").".format(
-                          type(self.kernel.posterior).__name__))
-    self.kernel.posterior_affine = normal_lib.Normal(
-        loc=self._matmul(inputs, self.kernel.posterior.loc),
-        scale=standard_ops.sqrt(self._matmul(
-            standard_ops.square(inputs),
-            standard_ops.square(self.kernel.posterior.scale))))
-    self.kernel.posterior_affine_tensor = (
-        self.kernel.posterior_tensor_fn(self.kernel.posterior_affine))
-    self.kernel.posterior_tensor = None
-    return self.kernel.posterior_affine_tensor
-
   def _apply_variational_bias(self, inputs):
-    if self.bias.posterior is None:
-      self.bias.posterior_tensor = None
+    if self.bias_posterior is None:
+      self.bias_posterior_tensor = None
       return inputs
-    self.bias.posterior_tensor = self.bias.posterior_tensor_fn(
-        self.bias.posterior)
-    return nn.bias_add(inputs, self.bias.posterior_tensor)
-
-  def _apply_divergence(self, param, name):
-    if (param.divergence_fn is None or
-        param.posterior is None or
-        param.prior is None):
-      param.divergence = None
+    self.bias_posterior_tensor = self.bias_posterior_tensor_fn(
+        self.bias_posterior)
+    return nn.bias_add(inputs, self.bias_posterior_tensor)
+
+  def _apply_divergence(self, divergence_fn, posterior, prior,
+                        posterior_tensor, name):
+    if (divergence_fn is None or
+        posterior is None or
+        prior is None):
+      divergence = None
       return
-    param.divergence = standard_ops.identity(
-        param.divergence_fn(
-            param.posterior, param.prior, param.posterior_tensor),
+    divergence = standard_ops.identity(
+        divergence_fn(
+            posterior, prior, posterior_tensor),
         name=name)
-    self.add_loss(param.divergence)
+    self.add_loss(divergence)
 
   def _matmul(self, inputs, kernel):
     if inputs.shape.ndims <= 2:
@@ -469,57 +272,467 @@ class DenseVariational(layers_lib.Layer):
     return input_shape[:-1].concatenate(self.units)
 
 
-def dense_variational(
+class DenseReparameterization(_DenseVariational):
+  """Densely-connected layer class with reparameterization estimator.
+
+  This layer implements the Bayesian variational inference analogue to
+  a dense layer by assuming the `kernel` and/or the `bias` are drawn
+  from distributions. By default, the layer implements a stochastic
+  forward pass via sampling from the kernel and bias posteriors,
+
+  ```none
+  kernel, bias ~ posterior
+  outputs = activation(matmul(inputs, kernel) + bias)
+  ```
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Args:
+    units: Integer or Long, dimensionality of the output space.
+    activation: Activation function (`callable`). Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    name: Python `str`, the name of the layer. Layers with the same name will
+      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
+      such cases.
+    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
+      layer by the same name.
+
+  Properties:
+    units: Python integer, dimensionality of the output space.
+    activation: Activation function (`callable`).
+    activity_regularizer: Regularizer function for the output.
+    kernel_posterior_fn: `callable` returning posterior.
+    kernel_posterior_tensor_fn: `callable` operating on posterior.
+    kernel_prior_fn: `callable` returning prior.
+    kernel_divergence_fn: `callable` returning divergence.
+    bias_posterior_fn: `callable` returning posterior.
+    bias_posterior_tensor_fn: `callable` operating on posterior.
+    bias_prior_fn: `callable` returning prior.
+    bias_divergence_fn: `callable` returning divergence.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tfp.layers.DenseReparameterization(
+      512, activation=tf.nn.relu)(features)
+  logits = tfp.layers.DenseReparameterization(10)(net)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses reparameterization gradients to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+  """
+
+  def __init__(
+      self,
+      units,
+      activation=None,
+      activity_regularizer=None,
+      trainable=True,
+      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+      kernel_posterior_tensor_fn=lambda d: d.sample(),
+      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      bias_posterior_fn=layers_util.default_mean_field_normal_fn(
+          is_singular=True),
+      bias_posterior_tensor_fn=lambda d: d.sample(),
+      bias_prior_fn=None,
+      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      name=None,
+      **kwargs):
+    super(DenseReparameterization, self).__init__(
+        units=units,
+        activation=activation,
+        activity_regularizer=activity_regularizer,
+        trainable=trainable,
+        kernel_posterior_fn=kernel_posterior_fn,
+        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+        kernel_prior_fn=kernel_prior_fn,
+        kernel_divergence_fn=kernel_divergence_fn,
+        bias_posterior_fn=bias_posterior_fn,
+        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+        bias_prior_fn=bias_prior_fn,
+        bias_divergence_fn=bias_divergence_fn,
+        name=name,
+        **kwargs)
+
+  def _apply_variational_kernel(self, inputs):
+    self.kernel_posterior_tensor = self.kernel_posterior_tensor_fn(
+        self.kernel_posterior)
+    self.kernel_posterior_affine = None
+    self.kernel_posterior_affine_tensor = None
+    return self._matmul(inputs, self.kernel_posterior_tensor)
+
+
+def dense_reparameterization(
     inputs,
     units,
     activation=None,
     activity_regularizer=None,
     trainable=True,
-    kernel_use_local_reparameterization=True,
-    kernel_posterior_fn=default_mean_field_normal_fn(),
+    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
     kernel_posterior_tensor_fn=lambda d: d.sample(),
     kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
         loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
     kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-    bias_posterior_fn=default_mean_field_normal_fn(is_singular=True),
+    bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
     bias_posterior_tensor_fn=lambda d: d.sample(),
     bias_prior_fn=None,
     bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
     name=None,
     reuse=None):
-  """Densely-connected variational layer.
+  """Densely-connected layer with reparameterization estimator.
+
+  This layer implements the Bayesian variational inference analogue to
+  a dense layer by assuming the `kernel` and/or the `bias` are drawn
+  from distributions. By default, the layer implements a stochastic
+  forward pass via sampling from the kernel and bias posteriors,
+
+  ```none
+  kernel, bias ~ posterior
+  outputs = activation(matmul(inputs, kernel) + bias)
+  ```
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Args:
+    inputs: Tensor input.
+    units: Integer or Long, dimensionality of the output space.
+    activation: Activation function (`callable`). Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    name: Python `str`, the name of the layer. Layers with the same name will
+      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
+      such cases.
+    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
+      layer by the same name.
+
+  Returns:
+    output: `Tensor` representing a the affine transformed input under a random
+      draw from the surrogate posterior distribution.
+
+  #### Examples
 
-  This layer implements the Bayesian variational inference analogue to:
-  `outputs = activation(matmul(inputs, kernel) + bias)`
-  by assuming the `kernel` and/or the `bias` are random variables.
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
 
-  The layer implements a stochastic dense calculation by making a Monte Carlo
-  approximation of a [variational Bayesian method based on KL divergence](
-  https://en.wikipedia.org/wiki/Variational_Bayesian_methods), i.e.,
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tfp.layers.dense_reparameterization(
+      features, 512, activation=tf.nn.relu)
+  logits = tfp.layers.dense_reparameterization(net, 10)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses reparameterization gradients to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+  """
+  layer = DenseReparameterization(
+      units,
+      activation=activation,
+      activity_regularizer=activity_regularizer,
+      trainable=trainable,
+      kernel_posterior_fn=kernel_posterior_fn,
+      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+      kernel_prior_fn=kernel_prior_fn,
+      kernel_divergence_fn=kernel_divergence_fn,
+      bias_posterior_fn=bias_posterior_fn,
+      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+      bias_prior_fn=bias_prior_fn,
+      bias_divergence_fn=bias_divergence_fn,
+      name=name,
+      dtype=inputs.dtype.base_dtype,
+      _scope=name,
+      _reuse=reuse)
+  return layer.apply(inputs)
+
+
+class DenseLocalReparameterization(_DenseVariational):
+  """Densely-connected layer class with local reparameterization estimator.
+
+  This layer implements the Bayesian variational inference analogue to
+  a dense layer by assuming the `kernel` and/or the `bias` are drawn
+  from distributions. By default, the layer implements a stochastic
+  forward pass via sampling from the kernel and bias posteriors,
 
   ```none
-  -log p(y|x) = -log int_{R**d} p(y|x,w) p(w) dw
-              = -log int_{R**d} p(y,w|x) q(w|x) / q(w|x) dw
-             <= E_q(W|x)[-log p(y,W|x) + log q(W|x)]       # Jensen's
-              = E_q(W|x)[-log p(y|x,W)] + KL[q(W|x), p(W)]
-             ~= m**-1 sum{ -log(y|x,w[j]) : w[j] ~ q(W|x), j=1..m }
-                 + KL[q(W|x), p(W)]
+  kernel, bias ~ posterior
+  outputs = activation(matmul(inputs, kernel) + bias)
+  ```
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Args:
+    units: Integer or Long, dimensionality of the output space.
+    activation: Activation function (`callable`). Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    name: Python `str`, the name of the layer. Layers with the same name will
+      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
+      such cases.
+    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
+      layer by the same name.
+
+  Properties:
+    units: Python integer, dimensionality of the output space.
+    activation: Activation function (`callable`).
+    activity_regularizer: Regularizer function for the output.
+    kernel_posterior_fn: `callable` returning posterior.
+    kernel_posterior_tensor_fn: `callable` operating on posterior.
+    kernel_prior_fn: `callable` returning prior.
+    kernel_divergence_fn: `callable` returning divergence.
+    bias_posterior_fn: `callable` returning posterior.
+    bias_posterior_tensor_fn: `callable` operating on posterior.
+    bias_prior_fn: `callable` returning prior.
+    bias_divergence_fn: `callable` returning divergence.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tfp.layers.DenseLocalReparameterization(
+      512, activation=tf.nn.relu)(features)
+  logits = tfp.layers.DenseLocalReparameterization(10)(net)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
   ```
 
-  where `W` denotes the (independent) `kernel` and `bias` random variables, `w`
-  is a random variate or outcome of `W`, `y` is the label, `x` is the evidence`,
-  and `~=` denotes an approximation which becomes exact as `m->inf`. The above
-  bound is sometimes referred to as the negative Evidence Lower BOund or
-  negative [ELBO](https://arxiv.org/abs/1601.00670). In context of a DNN, this
-  layer is appropriate to use when the final loss is a negative log-likelihood.
+  It uses local reparameterization gradients to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+  """
+
+  def __init__(
+      self,
+      units,
+      activation=None,
+      activity_regularizer=None,
+      trainable=True,
+      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+      kernel_posterior_tensor_fn=lambda d: d.sample(),
+      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      bias_posterior_fn=layers_util.default_mean_field_normal_fn(
+          is_singular=True),
+      bias_posterior_tensor_fn=lambda d: d.sample(),
+      bias_prior_fn=None,
+      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      name=None,
+      **kwargs):
+    super(DenseLocalReparameterization, self).__init__(
+        units=units,
+        activation=activation,
+        activity_regularizer=activity_regularizer,
+        trainable=trainable,
+        kernel_posterior_fn=kernel_posterior_fn,
+        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+        kernel_prior_fn=kernel_prior_fn,
+        kernel_divergence_fn=kernel_divergence_fn,
+        bias_posterior_fn=bias_posterior_fn,
+        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+        bias_prior_fn=bias_prior_fn,
+        bias_divergence_fn=bias_divergence_fn,
+        name=name,
+        **kwargs)
+
+  def _apply_variational_kernel(self, inputs):
+    if (not isinstance(self.kernel_posterior, independent_lib.Independent) or
+        not isinstance(self.kernel_posterior.distribution, normal_lib.Normal)):
+      raise TypeError(
+          "`DenseLocalReparameterization` requires "
+          "`kernel_posterior_fn` produce an instance of "
+          "`tf.distributions.Independent(tf.distributions.Normal)` "
+          "(saw: \"{}\").".format(type(self.kernel_posterior).__name__))
+    self.kernel_posterior_affine = normal_lib.Normal(
+        loc=self._matmul(inputs, self.kernel_posterior.distribution.loc),
+        scale=standard_ops.sqrt(self._matmul(
+            standard_ops.square(inputs),
+            standard_ops.square(self.kernel_posterior.distribution.scale))))
+    self.kernel_posterior_affine_tensor = (
+        self.kernel_posterior_tensor_fn(self.kernel_posterior_affine))
+    self.kernel_posterior_tensor = None
+    return self.kernel_posterior_affine_tensor
 
-  The Monte-Carlo sum portion is used for the feed-forward calculation of the
-  DNN. The KL divergence portion can be added to the final loss via:
-  `loss += sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))`.
+
+def dense_local_reparameterization(
+    inputs,
+    units,
+    activation=None,
+    activity_regularizer=None,
+    trainable=True,
+    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+    kernel_posterior_tensor_fn=lambda d: d.sample(),
+    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    bias_posterior_fn=layers_util.default_mean_field_normal_fn(
+        is_singular=True),
+    bias_posterior_tensor_fn=lambda d: d.sample(),
+    bias_prior_fn=None,
+    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    name=None,
+    reuse=None):
+  """Densely-connected layer with local reparameterization estimator.
+
+  This layer implements the Bayesian variational inference analogue to
+  a dense layer by assuming the `kernel` and/or the `bias` are drawn
+  from distributions. By default, the layer implements a stochastic
+  forward pass via sampling from the kernel and bias posteriors,
+
+  ```none
+  kernel, bias ~ posterior
+  outputs = activation(matmul(inputs, kernel) + bias)
+  ```
 
   The arguments permit separate specification of the surrogate posterior
   (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  random variables (which together comprise `W`).
+  distributions.
 
   Args:
     inputs: Tensor input.
@@ -529,10 +742,6 @@ def dense_variational(
     activity_regularizer: Regularizer function for the output.
     trainable: Boolean, if `True` also add variables to the graph collection
       `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_use_local_reparameterization: Python `bool` indicating whether
-      `kernel` calculation should employ the Local Reparameterization Trick.
-      When `True`, `kernel_posterior_fn` must create an instance of
-      `tf.distributions.Normal`.
     kernel_posterior_fn: Python `callable` which creates
       `tf.distributions.Distribution` instance representing the surrogate
       posterior of the `kernel` parameter. Default value:
@@ -574,14 +783,38 @@ def dense_variational(
   Returns:
     output: `Tensor` representing a the affine transformed input under a random
       draw from the surrogate posterior distribution.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tfp.layers.dense_local_reparameterization(
+      features, 512, activation=tf.nn.relu)
+  logits = tfp.layers.dense_local_reparameterization(net, 10)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses local reparameterization gradients to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
   """
-  layer = DenseVariational(
+  layer = DenseLocalReparameterization(
       units,
       activation=activation,
       activity_regularizer=activity_regularizer,
       trainable=trainable,
-      kernel_use_local_reparameterization=(
-          kernel_use_local_reparameterization),
       kernel_posterior_fn=kernel_posterior_fn,
       kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
       kernel_prior_fn=kernel_prior_fn,
@@ -597,201 +830,317 @@ def dense_variational(
   return layer.apply(inputs)
 
 
-class NotSet(object):
-  """Helper to track whether a `VariationalParameter` value has been set."""
-  pass
+class DenseFlipout(_DenseVariational):
+  """Densely-connected layer class with Flipout estimator.
 
+  This layer implements the Bayesian variational inference analogue to
+  a dense layer by assuming the `kernel` and/or the `bias` are drawn
+  from distributions. By default, the layer implements a stochastic
+  forward pass via sampling from the kernel and bias posteriors,
 
-class VariationalParameter(object):
-  """Struct-like container of variational parameter properties.
+  ```none
+  kernel, bias ~ posterior
+  outputs = activation(matmul(inputs, kernel) + bias)
+  ```
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
 
-  A `VariationalParameter` is intitialized with Python `callable`s which set the
-  value of correspondingly named members. Corresponding values have "set once"
-  semantics, i.e., once set to any value they are immutable.
+  Args:
+    units: Integer or Long, dimensionality of the output space.
+    activation: Activation function (`callable`). Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    seed: Python scalar `int` which initializes the random number
+      generator. Default value: `None` (i.e., use global seed).
+    name: Python `str`, the name of the layer. Layers with the same name will
+      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
+      such cases.
+    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
+      layer by the same name.
+
+  Properties:
+    units: Python integer, dimensionality of the output space.
+    activation: Activation function (`callable`).
+    activity_regularizer: Regularizer function for the output.
+    kernel_posterior_fn: `callable` returning posterior.
+    kernel_posterior_tensor_fn: `callable` operating on posterior.
+    kernel_prior_fn: `callable` returning prior.
+    kernel_divergence_fn: `callable` returning divergence.
+    bias_posterior_fn: `callable` returning posterior.
+    bias_posterior_tensor_fn: `callable` operating on posterior.
+    bias_prior_fn: `callable` returning prior.
+    bias_divergence_fn: `callable` returning divergence.
+    seed: Python integer, used to create random seeds.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tfp.layers.DenseFlipout(
+      512, activation=tf.nn.relu)(features)
+  logits = tfp.layers.DenseFlipout(10)(net)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses the Flipout gradient estimator to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
   """
 
   def __init__(
       self,
-      posterior_fn,
-      posterior_tensor_fn,
-      prior_fn,
-      divergence_fn):
-    """Creates the `VariationalParameter` struct-like object.
-
-    Args:
-      posterior_fn: Python `callable` which creates a
-        `tf.distribution.Distribution` like object representing the posterior
-        distribution. See `VariationalParameter.posterior_fn` for `callable`'s
-        required parameters.
-      posterior_tensor_fn: Python `callable` which computes a `Tensor`
-        which represents the `posterior`.
-      prior_fn: Python `callable` which creates a
-        `tf.distribution.Distribution` like object representing the prior
-        distribution. See `VariationalParameter.prior_fn` for `callable`'s
-        required parameters.
-      divergence_fn: Python `callable` which computes the KL divergence from
-        `posterior` to `prior`. See `VariationalParameter.divergence_fn` for
-        required `callable`'s parameters.
-    """
-    self._posterior_fn = posterior_fn
-    self._posterior = NotSet()
-    self._posterior_tensor_fn = posterior_tensor_fn
-    self._posterior_tensor = NotSet()
-    self._prior_fn = prior_fn
-    self._prior = NotSet()
-    self._divergence_fn = divergence_fn
-    self._divergence = NotSet()
-    self._init_helper()
-
-  @property
-  def posterior_fn(self):
-    """`callable` which creates `tf.distributions.Distribution`-like posterior.
-
-    The `callable` must accept the following parameters:
-      name: Python `str` name prepended to any created (or existing)
-        `tf.Variable`s.
-      shape: Python `list`-like representing the parameter's event shape.
-      dtype: Type of parameter's event.
-      trainable: Python `bool` indicating all created `tf.Variable`s should be
-        added to the graph collection `GraphKeys.TRAINABLE_VARIABLES`.
-      add_variable_fn: `tf.get_variable`-like `callable` used to create (or
-        access existing) `tf.Variable`s.
-
-    Returns:
-      posterior_fn: The Python `callable` specified in `__init__`.
-    """
-    return self._posterior_fn
-
-  @property
-  def posterior(self):
-    """`tf.distributions.Distribution`-like instance representing posterior."""
-    return self._posterior
-
-  @posterior.setter
-  def posterior(self, value):
-    """One-time setter of the `posterior` distribution."""
-    if not isinstance(self._posterior, NotSet):
-      raise ValueError("Cannot override already set attribute.")
-    self._posterior = value
-
-  @property
-  def posterior_tensor_fn(self):
-    """Creates `Tensor` representing the `posterior` distribution.
-
-    The `callable` must accept the following parameters:
-      posterior: `tf.distributions.Distribution`-like instance.
-
-    Returns:
-      posterior_tensor_fn: The Python `callable` specified in
-        `__init__`.
-    """
-    return self._posterior_tensor_fn
-
-  @property
-  def posterior_tensor(self):
-    """`Tensor` representing the `posterior` distribution."""
-    return self._posterior_tensor
-
-  @posterior_tensor.setter
-  def posterior_tensor(self, value):
-    """One-time setter of the `posterior_tensor`."""
-    if not isinstance(self._posterior_tensor, NotSet):
-      raise ValueError("Cannot override already set attribute.")
-    self._posterior_tensor = value
-
-  @property
-  def prior_fn(self):
-    """`callable` which creates `tf.distributions.Distribution`-like prior.
-
-    The `callable` must accept the following parameters:
-      name: Python `str` name prepended to any created (or existing)
-        `tf.Variable`s.
-      shape: Python `list`-like representing the parameter's event shape.
-      dtype: Type of parameter's event.
-      trainable: Python `bool` indicating all created `tf.Variable`s should be
-        added to the graph collection `GraphKeys.TRAINABLE_VARIABLES`.
-      add_variable_fn: `tf.get_variable`-like `callable` used to create (or
-        access existing) `tf.Variable`s.
-
-    Returns:
-      prior_fn: The Python `callable` specified in `__init__`.
-    """
-    return self._prior_fn
-
-  @property
-  def prior(self):
-    """`tf.distributions.Distribution`-like instance representing posterior."""
-    return self._prior
-
-  @prior.setter
-  def prior(self, value):
-    """One-time setter of the `prior` distribution."""
-    if not isinstance(self._prior, NotSet):
-      raise ValueError("Cannot override already set attribute.")
-    self._prior = value
-
-  @property
-  def divergence_fn(self):
-    """`callable` which computes KL-divergence `Tensor` from posterior to prior.
-
-    The `callable` must accept the following parameters:
-      posterior: `tf.distributions.Distribution`-like instance.
-      prior: `tf.distributions.Distribution`-like instance.
-      posterior_tensor: `Tensor` representing value of posterior.
-
-    Returns:
-      divergence_fn: The Python `callable` specified in `__init__`.
-    """
-    return self._divergence_fn
-
-  @property
-  def divergence(self):
-    """`Tensor` representing KL-divergence from posterior to prior."""
-    return self._divergence
-
-  @divergence.setter
-  def divergence(self, value):
-    """One-time setter of the `divergence`."""
-    if not isinstance(self._divergence, NotSet):
-      raise ValueError("Cannot override already set attribute.")
-    self._divergence = value
-
-  def _init_helper(self):
-    pass
-
-
-class VariationalKernelParameter(VariationalParameter):
-  """Struct-like container of variational kernel properties.
-
-  A `VariationalKernelParameter` is intitialized with Python `callable`s which
-  set the value of correspondingly named members. Corresponding values have "set
-  once" semantics, i.e., once set to any value they are immutable.
+      units,
+      activation=None,
+      activity_regularizer=None,
+      trainable=True,
+      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+      kernel_posterior_tensor_fn=lambda d: d.sample(),
+      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      bias_posterior_fn=layers_util.default_mean_field_normal_fn(
+          is_singular=True),
+      bias_posterior_tensor_fn=lambda d: d.sample(),
+      bias_prior_fn=None,
+      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      seed=None,
+      name=None,
+      **kwargs):
+    super(DenseFlipout, self).__init__(
+        units=units,
+        activation=activation,
+        activity_regularizer=activity_regularizer,
+        trainable=trainable,
+        kernel_posterior_fn=kernel_posterior_fn,
+        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+        kernel_prior_fn=kernel_prior_fn,
+        kernel_divergence_fn=kernel_divergence_fn,
+        bias_posterior_fn=bias_posterior_fn,
+        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+        bias_prior_fn=bias_prior_fn,
+        bias_divergence_fn=bias_divergence_fn,
+        name=name,
+        **kwargs)
+    self.seed = seed
+
+  def _apply_variational_kernel(self, inputs):
+    if (not isinstance(self.kernel_posterior, independent_lib.Independent) or
+        not isinstance(self.kernel_posterior.distribution, normal_lib.Normal)):
+      raise TypeError(
+          "`DenseFlipout` requires "
+          "`kernel_posterior_fn` produce an instance of "
+          "`tf.distributions.Independent(tf.distributions.Normal)` "
+          "(saw: \"{}\").".format(type(self.kernel_posterior).__name__))
+    self.kernel_posterior_affine = normal_lib.Normal(
+        loc=array_ops.zeros_like(self.kernel_posterior.distribution.loc),
+        scale=self.kernel_posterior.distribution.scale)
+    self.kernel_posterior_affine_tensor = (
+        self.kernel_posterior_tensor_fn(self.kernel_posterior_affine))
+    self.kernel_posterior_tensor = None
+
+    input_shape = array_ops.shape(inputs)
+    batch_shape = input_shape[:-1]
+
+    sign_input = random_sign(input_shape, dtype=inputs.dtype, seed=self.seed)
+    sign_output = random_sign(
+        array_ops.concat([batch_shape,
+                          array_ops.expand_dims(self.units, 0)], 0),
+        dtype=inputs.dtype,
+        seed=distribution_util.gen_new_seed(
+            self.seed, salt="dense_flipout"))
+    perturbed_inputs = self._matmul(
+        inputs * sign_input, self.kernel_posterior_affine_tensor) * sign_output
+
+    outputs = self._matmul(inputs, self.kernel_posterior.distribution.loc)
+    outputs += perturbed_inputs
+    return outputs
+
+
+def dense_flipout(
+    inputs,
+    units,
+    activation=None,
+    activity_regularizer=None,
+    trainable=True,
+    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+    kernel_posterior_tensor_fn=lambda d: d.sample(),
+    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    bias_posterior_fn=layers_util.default_mean_field_normal_fn(
+        is_singular=True),
+    bias_posterior_tensor_fn=lambda d: d.sample(),
+    bias_prior_fn=None,
+    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    seed=None,
+    name=None,
+    reuse=None):
+  """Densely-connected layer with Flipout estimator.
+
+  This layer implements the Bayesian variational inference analogue to
+  a dense layer by assuming the `kernel` and/or the `bias` are drawn
+  from distributions. By default, the layer implements a stochastic
+  forward pass via sampling from the kernel and bias posteriors,
+
+  ```none
+  kernel, bias ~ posterior
+  outputs = activation(matmul(inputs, kernel) + bias)
+  ```
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Args:
+    inputs: Tensor input.
+    units: Integer or Long, dimensionality of the output space.
+    activation: Activation function (`callable`). Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    seed: Python scalar `int` which initializes the random number
+      generator. Default value: `None` (i.e., use global seed).
+    name: Python `str`, the name of the layer. Layers with the same name will
+      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
+      such cases.
+    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
+      layer by the same name.
+
+  Returns:
+    output: `Tensor` representing a the affine transformed input under a random
+      draw from the surrogate posterior distribution.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tfp.layers.dense_flipout(
+      features, 512, activation=tf.nn.relu)
+  logits = tfp.layers.dense_flipout(net, 10)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses the Flipout gradient estimator to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
   """
+  layer = DenseFlipout(
+      units,
+      activation=activation,
+      activity_regularizer=activity_regularizer,
+      trainable=trainable,
+      kernel_posterior_fn=kernel_posterior_fn,
+      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+      kernel_prior_fn=kernel_prior_fn,
+      kernel_divergence_fn=kernel_divergence_fn,
+      bias_posterior_fn=bias_posterior_fn,
+      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+      bias_prior_fn=bias_prior_fn,
+      bias_divergence_fn=bias_divergence_fn,
+      seed=seed,
+      name=name,
+      dtype=inputs.dtype.base_dtype,
+      _scope=name,
+      _reuse=reuse)
+  return layer.apply(inputs)
+
 
-  @property
-  def posterior_affine(self):
-    """`tf.distributions.Distribution` affine transformed posterior."""
-    return self._posterior_affine
-
-  @posterior_affine.setter
-  def posterior_affine(self, value):
-    """One-time setter of `posterior_affine`."""
-    if not isinstance(self._posterior_affine, NotSet):
-      raise ValueError("Cannot override already set attribute.")
-    self._posterior_affine = value
-
-  @property
-  def posterior_affine_tensor(self):
-    """`Tensor` representing the `posterior_affine` distribution."""
-    return self._posterior_affine_tensor
-
-  @posterior_affine_tensor.setter
-  def posterior_affine_tensor(self, value):
-    """One-time setter of the `posterior_affine_tensor`."""
-    if not isinstance(self._posterior_affine_tensor, NotSet):
-      raise ValueError("Cannot override already set attribute.")
-    self._posterior_affine_tensor = value
-
-  def _init_helper(self):
-    self._posterior_affine = NotSet()
-    self._posterior_affine_tensor = NotSet()
+def random_sign(shape, dtype=dtypes.float32, seed=None):
+  """Draw values from {-1, 1} uniformly, i.e., Rademacher distribution."""
+  random_bernoulli = random_ops.random_uniform(shape, minval=0, maxval=2,
+                                               dtype=dtypes.int32,
+                                               seed=seed)
+  return math_ops.cast(2 * random_bernoulli - 1, dtype)
diff --git a/tensorflow/contrib/bayesflow/python/ops/layers_util.py b/tensorflow/contrib/bayesflow/python/ops/layers_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a4fecf4e5dcb1e1008303b07b4f76d5e5ce557f
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/ops/layers_util.py
@@ -0,0 +1,180 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for probabilistic layers.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops import deterministic as deterministic_lib
+from tensorflow.contrib.distributions.python.ops import independent as independent_lib
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import normal as normal_lib
+
+
+def default_loc_scale_fn(
+    is_singular=False,
+    loc_initializer=init_ops.random_normal_initializer(stddev=0.1),
+    untransformed_scale_initializer=init_ops.random_normal_initializer(
+        mean=-3., stddev=0.1),
+    loc_regularizer=None,
+    untransformed_scale_regularizer=None,
+    loc_constraint=None,
+    untransformed_scale_constraint=None):
+  """Makes closure which creates `loc`, `scale` params from `tf.get_variable`.
+
+  This function produces a closure which produces `loc`, `scale` using
+  `tf.get_variable`. The closure accepts the following arguments:
+
+    dtype: Type of parameter's event.
+    shape: Python `list`-like representing the parameter's event shape.
+    name: Python `str` name prepended to any created (or existing)
+      `tf.Variable`s.
+    trainable: Python `bool` indicating all created `tf.Variable`s should be
+      added to the graph collection `GraphKeys.TRAINABLE_VARIABLES`.
+    add_variable_fn: `tf.get_variable`-like `callable` used to create (or
+      access existing) `tf.Variable`s.
+
+  Args:
+    is_singular: Python `bool` indicating if `scale is None`. Default: `False`.
+    loc_initializer: Initializer function for the `loc` parameters.
+      The default is `tf.random_normal_initializer(mean=0., stddev=0.1)`.
+    untransformed_scale_initializer: Initializer function for the `scale`
+      parameters. Default value: `tf.random_normal_initializer(mean=-3.,
+      stddev=0.1)`. This implies the softplus transformed result has mean
+      approximately `0.05` and std. deviation approximately `0.005`.
+    loc_regularizer: Regularizer function for the `loc` parameters.
+      The default (`None`) is to use the `tf.get_variable` default.
+    untransformed_scale_regularizer: Regularizer function for the `scale`
+      parameters. The default (`None`) is to use the `tf.get_variable` default.
+    loc_constraint: An optional projection function to be applied to the
+      loc after being updated by an `Optimizer`. The function must take as input
+      the unprojected variable and must return the projected variable (which
+      must have the same shape). Constraints are not safe to use when doing
+      asynchronous distributed training.
+      The default (`None`) is to use the `tf.get_variable` default.
+    untransformed_scale_constraint: An optional projection function to be
+      applied to the `scale` parameters after being updated by an `Optimizer`
+      (e.g. used to implement norm constraints or value constraints). The
+      function must take as input the unprojected variable and must return the
+      projected variable (which must have the same shape). Constraints are not
+      safe to use when doing asynchronous distributed training. The default
+      (`None`) is to use the `tf.get_variable` default.
+
+  Returns:
+    default_loc_scale_fn: Python `callable` which instantiates `loc`, `scale`
+    parameters from args: `dtype, shape, name, trainable, add_variable_fn`.
+  """
+  def _fn(dtype, shape, name, trainable, add_variable_fn):
+    """Creates `loc`, `scale` parameters."""
+    loc = add_variable_fn(
+        name=name + "_loc",
+        shape=shape,
+        initializer=loc_initializer,
+        regularizer=loc_regularizer,
+        constraint=loc_constraint,
+        dtype=dtype,
+        trainable=trainable)
+    if is_singular:
+      return loc, None
+    untransformed_scale = add_variable_fn(
+        name=name + "_untransformed_scale",
+        shape=shape,
+        initializer=untransformed_scale_initializer,
+        regularizer=untransformed_scale_regularizer,
+        constraint=untransformed_scale_constraint,
+        dtype=dtype,
+        trainable=trainable)
+    scale = (np.finfo(dtype.as_numpy_dtype).eps +
+             nn_ops.softplus(untransformed_scale))
+    return loc, scale
+  return _fn
+
+
+def default_mean_field_normal_fn(
+    is_singular=False,
+    loc_initializer=None,
+    untransformed_scale_initializer=None,
+    loc_regularizer=None,
+    untransformed_scale_regularizer=None,
+    loc_constraint=None,
+    untransformed_scale_constraint=None):
+  """Creates a function to build Normal distributions with trainable params.
+
+  This function produces a closure which produces `tf.distributions.Normal`
+  parameterized by a loc` and `scale` each created using `tf.get_variable`. The
+  produced closure accepts the following arguments:
+
+    name: Python `str` name prepended to any created (or existing)
+      `tf.Variable`s.
+    shape: Python `list`-like representing the parameter's event shape.
+    dtype: Type of parameter's event.
+    trainable: Python `bool` indicating all created `tf.Variable`s should be
+      added to the graph collection `GraphKeys.TRAINABLE_VARIABLES`.
+    add_variable_fn: `tf.get_variable`-like `callable` used to create (or
+      access existing) `tf.Variable`s.
+
+  Args:
+    is_singular: Python `bool` if `True`, forces the special case limit of
+      `scale->0`, i.e., a `Deterministic` distribution.
+    loc_initializer: Initializer function for the `loc` parameters.
+      If `None` (default), values are initialized using the default
+      initializer used by `tf.get_variable`.
+    untransformed_scale_initializer: Initializer function for the `scale`
+      parameters. If `None` (default), values are initialized using the default
+      initializer used by `tf.get_variable`.
+    loc_regularizer: Regularizer function for the `loc` parameters.
+    untransformed_scale_regularizer: Regularizer function for the `scale`
+      parameters.
+    loc_constraint: An optional projection function to be applied to the
+      loc after being updated by an `Optimizer`. The function must take as input
+      the unprojected variable and must return the projected variable (which
+      must have the same shape). Constraints are not safe to use when doing
+      asynchronous distributed training.
+    untransformed_scale_constraint: An optional projection function to be
+      applied to the `scale` parameters after being updated by an `Optimizer`
+      (e.g. used to implement norm constraints or value constraints). The
+      function must take as input the unprojected variable and must return the
+      projected variable (which must have the same shape). Constraints are not
+      safe to use when doing asynchronous distributed training.
+
+  Returns:
+    make_normal_fn: Python `callable` which creates a `tf.distributions.Normal`
+      using from args: `dtype, shape, name, trainable, add_variable_fn`.
+  """
+  loc_scale_fn_ = default_loc_scale_fn(
+      is_singular,
+      loc_initializer,
+      untransformed_scale_initializer,
+      loc_regularizer,
+      untransformed_scale_regularizer,
+      loc_constraint,
+      untransformed_scale_constraint)
+  def _fn(dtype, shape, name, trainable, add_variable_fn):
+    """Creates multivariate `Deterministic` or `Normal` distribution."""
+    loc, scale = loc_scale_fn_(dtype, shape, name, trainable, add_variable_fn)
+    if scale is None:
+      dist = deterministic_lib.Deterministic(loc=loc)
+    else:
+      dist = normal_lib.Normal(loc=loc, scale=scale)
+    reinterpreted_batch_ndims = array_ops.shape(dist.batch_shape_tensor())[0]
+    return independent_lib.Independent(
+        dist, reinterpreted_batch_ndims=reinterpreted_batch_ndims)
+  return _fn
diff --git a/tensorflow/contrib/boosted_trees/BUILD b/tensorflow/contrib/boosted_trees/BUILD
index 66a04d42e93331de74b6f3d41f83f071115c1097..392ac7fa1ce600a64ee3b941b70b01447645e4aa 100644
--- a/tensorflow/contrib/boosted_trees/BUILD
+++ b/tensorflow/contrib/boosted_trees/BUILD
@@ -359,8 +359,8 @@ tf_custom_op_library(
     ],
     deps = [
         "//tensorflow/contrib/boosted_trees/lib:example_partitioner",
-        "//tensorflow/contrib/boosted_trees/lib:feature-column-handlers",
         "//tensorflow/contrib/boosted_trees/lib:models",
+        "//tensorflow/contrib/boosted_trees/lib:node-stats",
         "//tensorflow/contrib/boosted_trees/lib:utils",
         "//tensorflow/contrib/boosted_trees/lib:weighted_quantiles",
         "//tensorflow/contrib/boosted_trees/proto:learner_proto_cc",
@@ -404,10 +404,12 @@ tf_kernel_library(
     name = "split_handler_ops_kernels",
     srcs = ["kernels/split_handler_ops.cc"],
     deps = [
-        "//tensorflow/contrib/boosted_trees/lib:feature-column-handlers",
+        "//tensorflow/contrib/boosted_trees/lib:node-stats",
         "//tensorflow/contrib/boosted_trees/proto:split_info_proto_cc",
         "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_cc",
         "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:protos_all_cc",
+        "//third_party/eigen3",
     ],
     alwayslink = 1,
 )
@@ -599,6 +601,7 @@ py_library(
         ":init_py",
         "//tensorflow/contrib/boosted_trees:gbdt_batch",
         "//tensorflow/contrib/boosted_trees/estimator_batch:custom_export_strategy",
+        "//tensorflow/contrib/boosted_trees/estimator_batch:dnn_tree_combined_estimator",
         "//tensorflow/contrib/boosted_trees/estimator_batch:init_py",
         "//tensorflow/contrib/boosted_trees/estimator_batch:trainer_hooks",
         "//tensorflow/contrib/boosted_trees/lib:categorical_split_handler",
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
index 7792c7127c0285dc2eb5b213da054674f6a81d64..48084d80167cc5c300ae62eaeac53c622dfce2a3 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
@@ -50,6 +50,7 @@ py_library(
     deps = [
         "//tensorflow/contrib/learn",
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:training",
@@ -129,3 +130,33 @@ py_library(
         "//tensorflow/python:math_ops",
     ],
 )
+
+py_library(
+    name = "dnn_tree_combined_estimator",
+    srcs = ["dnn_tree_combined_estimator.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":trainer_hooks",
+        "//tensorflow/contrib/boosted_trees:gbdt_batch",
+        "//tensorflow/contrib/boosted_trees:model_ops_py",
+        "//tensorflow/contrib/learn",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+    ],
+)
+
+py_test(
+    name = "dnn_tree_combined_estimator_test",
+    size = "small",
+    srcs = ["dnn_tree_combined_estimator_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":dnn_tree_combined_estimator",
+        "//tensorflow/contrib/boosted_trees:gbdt_batch",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+    ],
+)
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
index ef8dee91b6cc05c4c3dd5eb3c81de4fb65b473e3..6ebc7d7911df878ec91701db8b75feb9a27d18a2 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
@@ -33,6 +33,8 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.saved_model import loader as saved_model_loader
 from tensorflow.python.saved_model import tag_constants
 
+_SPARSE_FLOAT_FEATURE_NAME_TEMPLATE = "%s_%d"
+
 
 def make_custom_export_strategy(name,
                                 convert_fn,
@@ -147,13 +149,12 @@ def convert_to_universal_format(dtec, sorted_feature_names,
           inequality_test.threshold.float_value = split.threshold
         elif node_type == "sparse_float_binary_split_default_left":
           split = gtflow_node.sparse_float_binary_split_default_left.split
-          node.default_direction = (
-              generic_tree_model_pb2.BinaryNode.LEFT)
-          # TODO(nponomareva): adjust this id assignement when we allow multi-
-          # column sparse tensors.
+          node.default_direction = (generic_tree_model_pb2.BinaryNode.LEFT)
           feature_id = split.feature_column + num_dense
           inequality_test = node.inequality_left_child_test
-          inequality_test.feature_id.id.value = sorted_feature_names[feature_id]
+          inequality_test.feature_id.id.value = (
+              _SPARSE_FLOAT_FEATURE_NAME_TEMPLATE %
+              (sorted_feature_names[feature_id], split.dimension_id))
           inequality_test.type = (
               generic_tree_model_pb2.InequalityTest.LESS_OR_EQUAL)
           inequality_test.threshold.float_value = split.threshold
@@ -165,7 +166,9 @@ def convert_to_universal_format(dtec, sorted_feature_names,
           # column sparse tensors.
           feature_id = split.feature_column + num_dense
           inequality_test = node.inequality_left_child_test
-          inequality_test.feature_id.id.value = sorted_feature_names[feature_id]
+          inequality_test.feature_id.id.value = (
+              _SPARSE_FLOAT_FEATURE_NAME_TEMPLATE %
+              (sorted_feature_names[feature_id], split.dimension_id))
           inequality_test.type = (
               generic_tree_model_pb2.InequalityTest.LESS_OR_EQUAL)
           inequality_test.threshold.float_value = split.threshold
@@ -201,10 +204,14 @@ def _get_feature_importances(dtec, feature_names, num_dense_floats,
         split_column = feature_names[split.feature_column]
       elif node_type == "sparse_float_binary_split_default_left":
         split = tree_node.sparse_float_binary_split_default_left.split
-        split_column = feature_names[split.feature_column + num_dense_floats]
+        split_column = _SPARSE_FLOAT_FEATURE_NAME_TEMPLATE % (
+            feature_names[split.feature_column + num_dense_floats],
+            split.dimension_id)
       elif node_type == "sparse_float_binary_split_default_right":
         split = tree_node.sparse_float_binary_split_default_right.split
-        split_column = feature_names[split.feature_column + num_dense_floats]
+        split_column = _SPARSE_FLOAT_FEATURE_NAME_TEMPLATE % (
+            feature_names[split.feature_column + num_dense_floats],
+            split.dimension_id)
       elif node_type == "categorical_id_binary_split":
         split = tree_node.categorical_id_binary_split
         split_column = feature_names[split.feature_column + num_dense_floats +
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy_test.py
index 4ed18b2d34c5af47826ab1c058f5d13797593bd4..492d9ca40c5cfa84e186020605429aacc02af6a6 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy_test.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the conversion code from GTFlow format to Chauffeur."""
+"""Tests for the conversion code and for feature importances export.
+
+Tests that cover conversion from TFBT format to a tensorflow.contrib.
+decision_tree generic_tree_model format and feature importances export.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -95,10 +99,31 @@ class ConvertModelTest(test_util.TensorFlowTestCase):
           }
         }
       }
+      nodes {
+        sparse_float_binary_split_default_right {
+          split {
+            feature_column: 1
+            dimension_id:3
+            threshold: -0.4
+            left_id: 7
+            right_id: 8
+          }
+        }
+        node_metadata {
+            gain: 3600
+        }
+      }
+      nodes {
+        leaf {
+          vector {
+            value: 0.36
+          }
+        }
+      }
       nodes {
         leaf {
           vector {
-            value: 0.3
+            value: 18
           }
         }
       }
@@ -108,17 +133,25 @@ class ConvertModelTest(test_util.TensorFlowTestCase):
     """
     dtec = tree_config_pb2.DecisionTreeEnsembleConfig()
     text_format.Merge(dtec_str, dtec)
-    feature_columns = ["feature_b", "feature_a", "feature_d"]
+    feature_columns = [
+        "feature_b",
+        "feature_a",
+        "feature_a_m",
+        "feature_d",
+    ]
     return dtec, feature_columns
 
   def testConvertModel(self):
     dtec, feature_columns = self._make_trees()
+    # Assume 2 sparse float columns, one with 1 dimension, the second one with
+    # 5 dimensions.
     # The feature columns in the order they were added.
     out = custom_export_strategy.convert_to_universal_format(
-        dtec, feature_columns, 1, 1,
-        1)
+        dtec, feature_columns, 1, 2, 1)
+    # Features a and a_m are sparse float features, a_m is multidimensional.
     expected_tree = """
     features { key: "feature_a" }
+    features { key: "feature_a_m" }
     features { key: "feature_b" }
     features { key: "feature_d" }
     model {
@@ -169,7 +202,6 @@ class ConvertModelTest(test_util.TensorFlowTestCase):
                   }
                 }
               }
-
               nodes {
                 node_id {
                   value: 1
@@ -196,7 +228,7 @@ class ConvertModelTest(test_util.TensorFlowTestCase):
                   inequality_left_child_test {
                     feature_id {
                       id {
-                        value: "feature_a"
+                        value: "feature_a_0"
                       }
                     }
                     threshold {
@@ -259,14 +291,51 @@ class ConvertModelTest(test_util.TensorFlowTestCase):
                 node_id {
                   value: 6
                 }
+                binary_node {
+                  left_child_id {
+                    value: 7
+                  }
+                  right_child_id {
+                    value: 8
+                  }
+                  default_direction: RIGHT
+                  inequality_left_child_test {
+                      feature_id {
+                        id {
+                          value: "feature_a_m_3"
+                        }
+                      }
+                      threshold {
+                        float_value: -0.4
+                      }
+                  }
+                }
+              }
+              nodes {
+                node_id {
+                  value: 7
+                }
                 leaf {
                   vector {
                     value {
-                      float_value: 0.03
+                      float_value: 0.036
                     }
                   }
                 }
               }
+              nodes {
+                node_id {
+                  value: 8
+                }
+                leaf {
+                  vector {
+                    value {
+                      float_value: 1.8
+                    }
+                  }
+                }
+              }
+
             }
           }
           submodel_id {
@@ -280,12 +349,15 @@ class ConvertModelTest(test_util.TensorFlowTestCase):
   def testFeatureImportance(self):
     dtec, feature_columns = self._make_trees()
     feature_importances = custom_export_strategy._get_feature_importances(
-        dtec, feature_columns, 1, 1, 1)
-    self.assertItemsEqual(["feature_b", "feature_a", "feature_d"],
-                          feature_importances.keys())
+        dtec, feature_columns, 1, 2, 1)
+    self.assertItemsEqual(
+        ["feature_b", "feature_a_0", "feature_a_m_3", "feature_d"],
+        feature_importances.keys())
     self.assertAlmostEqual(50.0, feature_importances["feature_b"], places=4)
-    self.assertAlmostEqual(50.0, feature_importances["feature_a"], places=4)
+    self.assertAlmostEqual(50.0, feature_importances["feature_a_0"], places=4)
     self.assertAlmostEqual(50.0, feature_importances["feature_d"], places=4)
+    self.assertAlmostEqual(
+        360.0, feature_importances["feature_a_m_3"], places=4)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
new file mode 100644
index 0000000000000000000000000000000000000000..cec3892b57655dc967b4e7926f7f5a6a30084487
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
@@ -0,0 +1,515 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TensorFlow estimators for combined DNN + GBDT training model.
+
+The combined model trains a DNN first, then trains boosted trees to boost the
+logits of the DNN. The input layer of the DNN (including the embeddings learned
+over sparse features) can optionally be provided to the boosted trees as
+an additional input feature.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.contrib import layers
+from tensorflow.contrib.boosted_trees.estimator_batch import trainer_hooks
+from tensorflow.contrib.boosted_trees.python.ops import model_ops
+from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batch
+from tensorflow.contrib.layers.python.layers import optimizers
+from tensorflow.contrib.learn.python.learn.estimators import estimator
+from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
+from tensorflow.contrib.learn.python.learn.estimators import model_fn
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.summary import summary
+from tensorflow.python.training import training_util
+
+
+_DNN_LEARNING_RATE = 0.001
+
+
+def _get_optimizer(optimizer):
+  if callable(optimizer):
+    return optimizer()
+  else:
+    return optimizer
+
+
+def _add_hidden_layer_summary(value, tag):
+  summary.scalar("%s_fraction_of_zero_values" % tag, nn.zero_fraction(value))
+  summary.histogram("%s_activation" % tag, value)
+
+
+def _dnn_tree_combined_model_fn(
+    features, labels, mode, head, dnn_hidden_units,
+    dnn_feature_columns, tree_learner_config, num_trees,
+    tree_examples_per_layer,
+    config=None, dnn_optimizer="Adagrad",
+    dnn_activation_fn=nn.relu, dnn_dropout=None,
+    dnn_input_layer_partitioner=None,
+    dnn_input_layer_to_tree=True, dnn_steps_to_train=10000,
+    tree_feature_columns=None,
+    tree_center_bias=True):
+  """DNN and GBDT combined model_fn.
+
+  Args:
+    features: `dict` of `Tensor` objects.
+    labels: Labels used to train on.
+    mode: Mode we are in. (TRAIN/EVAL/INFER)
+    head: A `Head` instance.
+    dnn_hidden_units: List of hidden units per layer.
+    dnn_feature_columns: An iterable containing all the feature columns
+      used by the model's DNN.
+    tree_learner_config: A config for the tree learner.
+    num_trees: Number of trees to grow model to after training DNN.
+    tree_examples_per_layer: Number of examples to accumulate before
+      growing the tree a layer. This value has a big impact on model
+      quality and should be set equal to the number of examples in
+      training dataset if possible. It can also be a function that computes
+      the number of examples based on the depth of the layer that's
+      being built.
+    config: `RunConfig` of the estimator.
+    dnn_optimizer: string, `Optimizer` object, or callable that defines the
+      optimizer to use for training the DNN. If `None`, will use the Adagrad
+      optimizer with default learning rate of 0.001.
+    dnn_activation_fn: Activation function applied to each layer of the DNN.
+      If `None`, will use `tf.nn.relu`.
+    dnn_dropout: When not `None`, the probability to drop out a given
+      unit in the DNN.
+    dnn_input_layer_partitioner: Partitioner for input layer of the DNN.
+      Defaults to `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
+    dnn_input_layer_to_tree: Whether to provide the DNN's input layer
+    as a feature to the tree.
+    dnn_steps_to_train: Number of steps to train dnn for before switching
+      to gbdt.
+    tree_feature_columns: An iterable containing all the feature columns
+      used by the model's boosted trees. If dnn_input_layer_to_tree is
+      set to True, these features are in addition to dnn_feature_columns.
+    tree_center_bias: Whether a separate tree should be created for
+      first fitting the bias.
+
+  Returns:
+    A `ModelFnOps` object.
+  Raises:
+    ValueError: if inputs are not valid.
+  """
+  if not isinstance(features, dict):
+    raise ValueError("features should be a dictionary of `Tensor`s. "
+                     "Given type: {}".format(type(features)))
+
+  if not dnn_feature_columns:
+    raise ValueError("dnn_feature_columns must be specified")
+
+  # Build DNN Logits.
+  dnn_parent_scope = "dnn"
+  dnn_partitioner = dnn_input_layer_partitioner or (
+      partitioned_variables.min_max_variable_partitioner(
+          max_partitions=config.num_ps_replicas,
+          min_slice_size=64 << 20))
+
+  with variable_scope.variable_scope(
+      dnn_parent_scope,
+      values=tuple(six.itervalues(features)),
+      partitioner=dnn_partitioner):
+
+    with variable_scope.variable_scope(
+        "input_from_feature_columns",
+        values=tuple(six.itervalues(features)),
+        partitioner=dnn_partitioner) as input_layer_scope:
+      input_layer = layers.input_from_feature_columns(
+          columns_to_tensors=features,
+          feature_columns=dnn_feature_columns,
+          weight_collections=[dnn_parent_scope],
+          scope=input_layer_scope)
+    previous_layer = input_layer
+    for layer_id, num_hidden_units in enumerate(dnn_hidden_units):
+      with variable_scope.variable_scope(
+          "hiddenlayer_%d" % layer_id,
+          values=(previous_layer,)) as hidden_layer_scope:
+        net = layers.fully_connected(
+            previous_layer,
+            num_hidden_units,
+            activation_fn=dnn_activation_fn,
+            variables_collections=[dnn_parent_scope],
+            scope=hidden_layer_scope)
+        if dnn_dropout is not None and mode == model_fn.ModeKeys.TRAIN:
+          net = layers.dropout(net, keep_prob=(1.0 - dnn_dropout))
+      _add_hidden_layer_summary(net, hidden_layer_scope.name)
+      previous_layer = net
+    with variable_scope.variable_scope(
+        "logits",
+        values=(previous_layer,)) as logits_scope:
+      dnn_logits = layers.fully_connected(
+          previous_layer,
+          head.logits_dimension,
+          activation_fn=None,
+          variables_collections=[dnn_parent_scope],
+          scope=logits_scope)
+    _add_hidden_layer_summary(dnn_logits, logits_scope.name)
+
+    def _dnn_train_op_fn(loss):
+      """Returns the op to optimize the loss."""
+      return optimizers.optimize_loss(
+          loss=loss,
+          global_step=training_util.get_global_step(),
+          learning_rate=_DNN_LEARNING_RATE,
+          optimizer=_get_optimizer(dnn_optimizer),
+          name=dnn_parent_scope,
+          variables=ops.get_collection(
+              ops.GraphKeys.TRAINABLE_VARIABLES,
+              scope=dnn_parent_scope),
+          # Empty summaries to prevent optimizers from logging training_loss.
+          summaries=[])
+
+  # Build Tree Logits.
+  global_step = training_util.get_global_step()
+  with ops.device(global_step.device):
+    ensemble_handle = model_ops.tree_ensemble_variable(
+        stamp_token=0,
+        tree_ensemble_config="",  # Initialize an empty ensemble.
+        name="ensemble_model")
+
+  tree_features = features.copy()
+  if dnn_input_layer_to_tree:
+    tree_features["dnn_input_layer"] = input_layer
+    tree_feature_columns.append(layers.real_valued_column("dnn_input_layer"))
+  gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel(
+      is_chief=config.is_chief,
+      num_ps_replicas=config.num_ps_replicas,
+      ensemble_handle=ensemble_handle,
+      center_bias=tree_center_bias,
+      examples_per_layer=tree_examples_per_layer,
+      learner_config=tree_learner_config,
+      feature_columns=tree_feature_columns,
+      logits_dimension=head.logits_dimension,
+      features=tree_features)
+
+  with ops.name_scope("gbdt"):
+    predictions_dict = gbdt_model.predict(mode)
+    tree_logits = predictions_dict["predictions"]
+
+    def _tree_train_op_fn(loss):
+      """Returns the op to optimize the loss."""
+      update_op = gbdt_model.train(loss, predictions_dict, labels)
+      with ops.control_dependencies(
+          [update_op]), (ops.colocate_with(global_step)):
+        update_op = state_ops.assign_add(global_step, 1).op
+        return update_op
+
+  tree_train_logits = dnn_logits + tree_logits
+
+  def _no_train_op_fn(loss):
+    """Returns a no-op."""
+    del loss
+    return control_flow_ops.no_op()
+
+  model_fn_ops = head.create_model_fn_ops(
+      features=features,
+      mode=mode,
+      labels=labels,
+      train_op_fn=_no_train_op_fn,
+      logits=tree_train_logits)
+  dnn_train_op = head.create_model_fn_ops(
+      features=features,
+      mode=mode,
+      labels=labels,
+      train_op_fn=_dnn_train_op_fn,
+      logits=dnn_logits).train_op
+  tree_train_op = head.create_model_fn_ops(
+      features=tree_features,
+      mode=mode,
+      labels=labels,
+      train_op_fn=_tree_train_op_fn,
+      logits=tree_train_logits).train_op
+
+  if tree_center_bias:
+    num_trees += 1
+  finalized_trees, attempted_trees = gbdt_model.get_number_of_trees_tensor()
+
+  model_fn_ops.training_hooks.extend([
+      trainer_hooks.SwitchTrainOp(
+          dnn_train_op, dnn_steps_to_train, tree_train_op),
+      trainer_hooks.StopAfterNTrees(
+          num_trees, attempted_trees, finalized_trees)])
+
+  return model_fn_ops
+
+
+class DNNBoostedTreeCombinedClassifier(estimator.Estimator):
+  """A classifier that uses a combined DNN/GBDT model."""
+
+  def __init__(self,
+               dnn_hidden_units,
+               dnn_feature_columns,
+               tree_learner_config,
+               num_trees,
+               tree_examples_per_layer,
+               n_classes=2,
+               weight_column_name=None,
+               model_dir=None,
+               config=None,
+               label_name=None,
+               label_keys=None,
+               feature_engineering_fn=None,
+               dnn_optimizer="Adagrad",
+               dnn_activation_fn=nn.relu,
+               dnn_dropout=None,
+               dnn_input_layer_partitioner=None,
+               dnn_input_layer_to_tree=True,
+               dnn_steps_to_train=10000,
+               tree_feature_columns=None,
+               tree_center_bias=True):
+    """Initializes a DNNBoostedTreeCombinedClassifier instance.
+
+    Args:
+      dnn_hidden_units: List of hidden units per layer for DNN.
+      dnn_feature_columns: An iterable containing all the feature columns
+        used by the model's DNN.
+      tree_learner_config: A config for the tree learner.
+      num_trees: Number of trees to grow model to after training DNN.
+      tree_examples_per_layer: Number of examples to accumulate before
+        growing the tree a layer. This value has a big impact on model
+        quality and should be set equal to the number of examples in
+        training dataset if possible. It can also be a function that computes
+        the number of examples based on the depth of the layer that's
+        being built.
+      n_classes: The number of label classes.
+      weight_column_name: The name of weight column.
+      model_dir: Directory for model exports.
+      config: `RunConfig` of the estimator.
+      label_name: String, name of the key in label dict. Can be null if label
+        is a tensor (single headed models).
+      label_keys: Optional list of strings with size `[n_classes]` defining the
+        label vocabulary. Only supported for `n_classes` > 2.
+      feature_engineering_fn: Feature engineering function. Takes features and
+        labels which are the output of `input_fn` and returns features and
+        labels which will be fed into the model.
+      dnn_optimizer: string, `Optimizer` object, or callable that defines the
+        optimizer to use for training the DNN. If `None`, will use the Adagrad
+        optimizer with default learning rate.
+      dnn_activation_fn: Activation function applied to each layer of the DNN.
+        If `None`, will use `tf.nn.relu`.
+      dnn_dropout: When not `None`, the probability to drop out a given
+        unit in the DNN.
+      dnn_input_layer_partitioner: Partitioner for input layer of the DNN.
+        Defaults to `min_max_variable_partitioner` with `min_slice_size`
+        64 << 20.
+      dnn_input_layer_to_tree: Whether to provide the DNN's input layer
+      as a feature to the tree.
+      dnn_steps_to_train: Number of steps to train dnn for before switching
+        to gbdt.
+      tree_feature_columns: An iterable containing all the feature columns
+        used by the model's boosted trees. If dnn_input_layer_to_tree is
+        set to True, these features are in addition to dnn_feature_columns.
+      tree_center_bias: Whether a separate tree should be created for
+        first fitting the bias.
+    """
+    head = head_lib.multi_class_head(
+        n_classes=n_classes,
+        label_name=label_name,
+        label_keys=label_keys,
+        weight_column_name=weight_column_name,
+        enable_centered_bias=False)
+
+    def _model_fn(features, labels, mode, config):
+      return _dnn_tree_combined_model_fn(
+          features, labels, mode, head, dnn_hidden_units, dnn_feature_columns,
+          tree_learner_config, num_trees, tree_examples_per_layer, config,
+          dnn_optimizer, dnn_activation_fn, dnn_dropout,
+          dnn_input_layer_partitioner, dnn_input_layer_to_tree,
+          dnn_steps_to_train,
+          tree_feature_columns, tree_center_bias)
+
+    super(DNNBoostedTreeCombinedClassifier, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir,
+        config=config, feature_engineering_fn=feature_engineering_fn)
+
+
+class DNNBoostedTreeCombinedRegressor(estimator.Estimator):
+  """A regressor that uses a combined DNN/GBDT model."""
+
+  def __init__(self,
+               dnn_hidden_units,
+               dnn_feature_columns,
+               tree_learner_config,
+               num_trees,
+               tree_examples_per_layer,
+               weight_column_name=None,
+               model_dir=None,
+               config=None,
+               label_name=None,
+               label_dimension=1,
+               feature_engineering_fn=None,
+               dnn_optimizer="Adagrad",
+               dnn_activation_fn=nn.relu,
+               dnn_dropout=None,
+               dnn_input_layer_partitioner=None,
+               dnn_input_layer_to_tree=True,
+               dnn_steps_to_train=10000,
+               tree_feature_columns=None,
+               tree_center_bias=True):
+    """Initializes a DNNBoostedTreeCombinedRegressor instance.
+
+    Args:
+      dnn_hidden_units: List of hidden units per layer for DNN.
+      dnn_feature_columns: An iterable containing all the feature columns
+        used by the model's DNN.
+      tree_learner_config: A config for the tree learner.
+      num_trees: Number of trees to grow model to after training DNN.
+      tree_examples_per_layer: Number of examples to accumulate before
+        growing the tree a layer. This value has a big impact on model
+        quality and should be set equal to the number of examples in
+        training dataset if possible. It can also be a function that computes
+        the number of examples based on the depth of the layer that's
+        being built.
+      weight_column_name: The name of weight column.
+      model_dir: Directory for model exports.
+      config: `RunConfig` of the estimator.
+      label_name: String, name of the key in label dict. Can be null if label
+        is a tensor (single headed models).
+      label_dimension: Number of regression labels per example. This is the size
+        of the last dimension of the labels `Tensor` (typically, this has shape
+        `[batch_size, label_dimension]`).
+      feature_engineering_fn: Feature engineering function. Takes features and
+        labels which are the output of `input_fn` and returns features and
+        labels which will be fed into the model.
+      dnn_optimizer: string, `Optimizer` object, or callable that defines the
+        optimizer to use for training the DNN. If `None`, will use the Adagrad
+        optimizer with default learning rate.
+      dnn_activation_fn: Activation function applied to each layer of the DNN.
+        If `None`, will use `tf.nn.relu`.
+      dnn_dropout: When not `None`, the probability to drop out a given
+        unit in the DNN.
+      dnn_input_layer_partitioner: Partitioner for input layer of the DNN.
+        Defaults to `min_max_variable_partitioner` with `min_slice_size`
+        64 << 20.
+      dnn_input_layer_to_tree: Whether to provide the DNN's input layer
+      as a feature to the tree.
+      dnn_steps_to_train: Number of steps to train dnn for before switching
+        to gbdt.
+      tree_feature_columns: An iterable containing all the feature columns
+        used by the model's boosted trees. If dnn_input_layer_to_tree is
+        set to True, these features are in addition to dnn_feature_columns.
+      tree_center_bias: Whether a separate tree should be created for
+        first fitting the bias.
+    """
+    head = head_lib.regression_head(
+        label_name=label_name,
+        label_dimension=label_dimension,
+        weight_column_name=weight_column_name,
+        enable_centered_bias=False)
+
+    # num_classes needed for GradientBoostedDecisionTreeModel
+    if label_dimension == 1:
+      tree_learner_config.num_classes = 2
+    else:
+      tree_learner_config.num_classes = label_dimension
+
+    def _model_fn(features, labels, mode, config):
+      return _dnn_tree_combined_model_fn(
+          features, labels, mode, head, dnn_hidden_units, dnn_feature_columns,
+          tree_learner_config, num_trees, tree_examples_per_layer, config,
+          dnn_optimizer, dnn_activation_fn, dnn_dropout,
+          dnn_input_layer_partitioner, dnn_input_layer_to_tree,
+          dnn_steps_to_train, tree_feature_columns, tree_center_bias)
+
+    super(DNNBoostedTreeCombinedRegressor, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir,
+        config=config, feature_engineering_fn=feature_engineering_fn)
+
+
+class DNNBoostedTreeCombinedEstimator(estimator.Estimator):
+  """An estimator that uses a combined DNN/GBDT model.
+
+  Useful for training with user specified `Head`.
+  """
+
+  def __init__(self,
+               dnn_hidden_units,
+               dnn_feature_columns,
+               tree_learner_config,
+               num_trees,
+               tree_examples_per_layer,
+               head,
+               model_dir=None,
+               config=None,
+               feature_engineering_fn=None,
+               dnn_optimizer="Adagrad",
+               dnn_activation_fn=nn.relu,
+               dnn_dropout=None,
+               dnn_input_layer_partitioner=None,
+               dnn_input_layer_to_tree=True,
+               dnn_steps_to_train=10000,
+               tree_feature_columns=None,
+               tree_center_bias=True):
+    """Initializes a DNNBoostedTreeCombinedEstimator instance.
+
+    Args:
+      dnn_hidden_units: List of hidden units per layer for DNN.
+      dnn_feature_columns: An iterable containing all the feature columns
+        used by the model's DNN.
+      tree_learner_config: A config for the tree learner.
+      num_trees: Number of trees to grow model to after training DNN.
+      tree_examples_per_layer: Number of examples to accumulate before
+        growing the tree a layer. This value has a big impact on model
+        quality and should be set equal to the number of examples in
+        training dataset if possible. It can also be a function that computes
+        the number of examples based on the depth of the layer that's
+        being built.
+      head: `Head` instance.
+      model_dir: Directory for model exports.
+      config: `RunConfig` of the estimator.
+      feature_engineering_fn: Feature engineering function. Takes features and
+        labels which are the output of `input_fn` and returns features and
+        labels which will be fed into the model.
+      dnn_optimizer: string, `Optimizer` object, or callable that defines the
+        optimizer to use for training the DNN. If `None`, will use the Adagrad
+        optimizer with default learning rate.
+      dnn_activation_fn: Activation function applied to each layer of the DNN.
+        If `None`, will use `tf.nn.relu`.
+      dnn_dropout: When not `None`, the probability to drop out a given
+        unit in the DNN.
+      dnn_input_layer_partitioner: Partitioner for input layer of the DNN.
+        Defaults to `min_max_variable_partitioner` with `min_slice_size`
+        64 << 20.
+      dnn_input_layer_to_tree: Whether to provide the DNN's input layer
+      as a feature to the tree.
+      dnn_steps_to_train: Number of steps to train dnn for before switching
+        to gbdt.
+      tree_feature_columns: An iterable containing all the feature columns
+        used by the model's boosted trees. If dnn_input_layer_to_tree is
+        set to True, these features are in addition to dnn_feature_columns.
+      tree_center_bias: Whether a separate tree should be created for
+        first fitting the bias.
+    """
+    def _model_fn(features, labels, mode, config):
+      return _dnn_tree_combined_model_fn(
+          features, labels, mode, head, dnn_hidden_units, dnn_feature_columns,
+          tree_learner_config, num_trees, tree_examples_per_layer, config,
+          dnn_optimizer, dnn_activation_fn, dnn_dropout,
+          dnn_input_layer_partitioner, dnn_input_layer_to_tree,
+          dnn_steps_to_train,
+          tree_feature_columns, tree_center_bias)
+
+    super(DNNBoostedTreeCombinedEstimator, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir,
+        config=config, feature_engineering_fn=feature_engineering_fn)
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..83d58c561008e8a5a69eb503d1605bb9e940f281
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
@@ -0,0 +1,105 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for combined DNN + GBDT estimators."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tempfile
+
+from tensorflow.contrib.boosted_trees.estimator_batch import dnn_tree_combined_estimator as estimator
+from tensorflow.contrib.boosted_trees.proto import learner_pb2
+from tensorflow.contrib.layers.python.layers import feature_column
+from tensorflow.contrib.learn.python.learn.estimators import estimator_test_utils
+from tensorflow.contrib.learn.python.learn.estimators import run_config
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import googletest
+
+
+def _train_input_fn():
+  features = {
+      "x": constant_op.constant([[2.], [1.], [1.]])
+  }
+  label = constant_op.constant([[1], [0], [0]], dtype=dtypes.int32)
+  return features, label
+
+
+def _eval_input_fn():
+  features = {
+      "x": constant_op.constant([[1.], [2.], [2.]])
+  }
+  label = constant_op.constant([[0], [1], [1]], dtype=dtypes.int32)
+  return features, label
+
+
+class DNNBoostedTreeCombinedTest(test_util.TensorFlowTestCase):
+
+  def testClassifierContract(self):
+    estimator_test_utils.assert_estimator_contract(
+        self, estimator.DNNBoostedTreeCombinedClassifier)
+
+  def testRegressorContract(self):
+    estimator_test_utils.assert_estimator_contract(
+        self, estimator.DNNBoostedTreeCombinedRegressor)
+
+  def testEstimatorContract(self):
+    estimator_test_utils.assert_estimator_contract(
+        self, estimator.DNNBoostedTreeCombinedEstimator)
+
+  def testNoDNNFeatureColumns(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        "dnn_feature_columns must be specified"):
+      classifier = estimator.DNNBoostedTreeCombinedClassifier(
+          dnn_hidden_units=[1],
+          dnn_feature_columns=[],
+          tree_learner_config=learner_config,
+          num_trees=1,
+          tree_examples_per_layer=3,
+          n_classes=2)
+      classifier.fit(input_fn=_train_input_fn, steps=5)
+
+  def testFitAndEvaluateDontThrowException(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 1
+    model_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig()
+
+    classifier = estimator.DNNBoostedTreeCombinedClassifier(
+        dnn_hidden_units=[1],
+        dnn_feature_columns=[feature_column.real_valued_column("x")],
+        tree_learner_config=learner_config,
+        num_trees=1,
+        tree_examples_per_layer=3,
+        n_classes=2,
+        model_dir=model_dir,
+        config=config,
+        dnn_steps_to_train=10,
+        dnn_input_layer_to_tree=False,
+        tree_feature_columns=[feature_column.real_valued_column("x")])
+
+    classifier.fit(input_fn=_train_input_fn, steps=15)
+    classifier.evaluate(input_fn=_eval_input_fn, steps=1)
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/trainer_hooks.py b/tensorflow/contrib/boosted_trees/estimator_batch/trainer_hooks.py
index 79193fffc3d3fa97e20a12181bf20e6ad86dcb58..2e4151cac40f770e2bece70d752122eb7f34dd40 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/trainer_hooks.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/trainer_hooks.py
@@ -24,6 +24,7 @@ from tensorflow.contrib.learn.python.learn import session_run_hook
 from tensorflow.contrib.learn.python.learn.session_run_hook import SessionRunArgs
 from tensorflow.core.framework.summary_pb2 import Summary
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import training_util
 from tensorflow.python.training.summary_io import SummaryWriterCache
@@ -175,3 +176,40 @@ class StopAfterNTrees(session_run_hook.SessionRunHook):
       logging.info("Requesting stop since we have reached %d trees.",
                    num_finalized_trees)
       run_context.request_stop()
+
+
+class SwitchTrainOp(session_run_hook.SessionRunHook):
+  """Hook that switches the train op after specified number of steps.
+
+  Hook that replaces the train op depending on the number of steps of training
+  that have taken place. The first_train_op is used till train_steps steps
+  are reached. Thereafter the second_train_op is used.
+  """
+
+  def __init__(self, first_train_op, train_steps, second_train_op):
+    """Initializes a `SwitchTrainOp`."""
+    self._first_train_op = first_train_op
+    self._second_train_op = second_train_op
+    self._train_steps = train_steps
+
+  def _get_train_op_for_global_step(self, current_step):
+    """Gets train_op for current global step."""
+    if current_step < self._train_steps:
+      return self._first_train_op
+    return self._second_train_op
+
+  def begin(self):
+    self._global_step_tensor = training_util.get_global_step()
+    self._current_train_op = control_flow_ops.no_op()
+    if self._global_step_tensor is None:
+      raise RuntimeError(
+          "Global step should be created to use SwitchTrainOp.")
+
+  def before_run(self, run_context):  # pylint: disable=unused-argument
+    return session_run_hook.SessionRunArgs(
+        {"global_step": self._global_step_tensor,
+         "train_op": self._current_train_op})
+
+  def after_run(self, run_context, run_values):
+    self._current_train_op = self._get_train_op_for_global_step(
+        run_values.results["global_step"])
diff --git a/tensorflow/contrib/boosted_trees/examples/boston.py b/tensorflow/contrib/boosted_trees/examples/boston.py
index 2c0a3c4912b82aba88e2f8f1b97a227c894ee2ae..e9dbdb0fd784052eeb36ac1aa9342165ef2ac0a7 100644
--- a/tensorflow/contrib/boosted_trees/examples/boston.py
+++ b/tensorflow/contrib/boosted_trees/examples/boston.py
@@ -22,7 +22,7 @@ r"""Demonstrates a regression on Boston housing data.
 
   python tensorflow/contrib/boosted_trees/examples/boston.py \
   --batch_size=404 --output_dir="/tmp/boston" --depth=4 --learning_rate=0.1 \
-  --num_eval_steps=1 --num_trees=500 --l2=4 \
+  --num_eval_steps=1 --num_trees=500 --l2=0.001 \
   --vmodule=training_ops=1
 
   When training is done, mean squared error on eval data is reported.
@@ -37,8 +37,10 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
+import os
 import sys
 import tensorflow as tf
+from tensorflow.contrib.boosted_trees.estimator_batch import custom_export_strategy
 from tensorflow.contrib.boosted_trees.estimator_batch.estimator import GradientBoostedDecisionTreeRegressor
 from tensorflow.contrib.boosted_trees.proto import learner_pb2
 from tensorflow.contrib.layers.python.layers import feature_column
@@ -51,22 +53,18 @@ _BOSTON_NUM_FEATURES = 13
 def _get_tfbt(output_dir, feature_cols):
   """Configures TF Boosted Trees estimator based on flags."""
   learner_config = learner_pb2.LearnerConfig()
-
   learner_config.learning_rate_tuner.fixed.learning_rate = FLAGS.learning_rate
   learner_config.regularization.l1 = 0.0
-  # Set the regularization per instance in such a way that
-  # regularization for the full training data is equal to l2 flag.
-  learner_config.regularization.l2 = FLAGS.l2 / FLAGS.batch_size
+  learner_config.regularization.l2 = FLAGS.l2
   learner_config.constraints.max_tree_depth = FLAGS.depth
-  learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
 
   run_config = tf.contrib.learn.RunConfig(save_checkpoints_secs=300)
 
   # Create a TF Boosted trees regression estimator.
   estimator = GradientBoostedDecisionTreeRegressor(
       learner_config=learner_config,
-      # For the WHOLE_TREE strategy, set the examples_per_layer to be equal to
-      # batch size.
+      # This should be the number of examples. For large datasets it can be
+      # larger than the batch_size.
       examples_per_layer=FLAGS.batch_size,
       feature_columns=feature_cols,
       label_dimension=1,
@@ -77,6 +75,14 @@ def _get_tfbt(output_dir, feature_cols):
   return estimator
 
 
+def _convert_fn(dtec, sorted_feature_names, num_dense, num_sparse_float,
+                num_sparse_int, export_dir, unused_eval_result):
+  universal_format = custom_export_strategy.convert_to_universal_format(
+      dtec, sorted_feature_names, num_dense, num_sparse_float, num_sparse_int)
+  with tf.gfile.GFile(os.path.join(export_dir, "tree_proto"), "w") as f:
+    f.write(str(universal_format))
+
+
 def _make_experiment_fn(output_dir):
   """Creates experiment for gradient boosted decision trees."""
   (x_train, y_train), (x_test,
@@ -88,21 +94,31 @@ def _make_experiment_fn(output_dir):
       batch_size=FLAGS.batch_size,
       num_epochs=None,
       shuffle=True)
-
   eval_input_fn = tf.estimator.inputs.numpy_input_fn(
       x={"x": x_test}, y=y_test, num_epochs=1, shuffle=False)
 
   feature_columns = [
       feature_column.real_valued_column("x", dimension=_BOSTON_NUM_FEATURES)
   ]
-
+  feature_spec = tf.contrib.layers.create_feature_spec_for_parsing(
+      feature_columns)
+  serving_input_fn = tf.contrib.learn.utils.build_parsing_serving_input_fn(
+      feature_spec)
+  # An export strategy that outputs the feature importance and also exports
+  # the internal tree representation in another format.
+  export_strategy = custom_export_strategy.make_custom_export_strategy(
+      "exports",
+      convert_fn=_convert_fn,
+      feature_columns=feature_columns,
+      export_input_fn=serving_input_fn)
   return tf.contrib.learn.Experiment(
       estimator=_get_tfbt(output_dir, feature_columns),
       train_input_fn=train_input_fn,
       eval_input_fn=eval_input_fn,
       train_steps=None,
       eval_steps=FLAGS.num_eval_steps,
-      eval_metrics=None)
+      eval_metrics=None,
+      export_strategies=[export_strategy])
 
 
 def main(unused_argv):
diff --git a/tensorflow/contrib/boosted_trees/examples/boston_combined.py b/tensorflow/contrib/boosted_trees/examples/boston_combined.py
new file mode 100644
index 0000000000000000000000000000000000000000..e04b56afbfd266dc13a5b0d78d171ea273415ee3
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/examples/boston_combined.py
@@ -0,0 +1,165 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Regression on Boston housing data using DNNBoostedTreeCombinedRegressor.
+
+  Example Usage:
+
+  python tensorflow/contrib/boosted_trees/examples/boston_combined.py \
+  --batch_size=404 --output_dir="/tmp/boston" \
+  --dnn_hidden_units="8,4" --dnn_steps_to_train=1000 \
+  --tree_depth=4 --tree_learning_rate=0.1 \
+  --num_trees=100 --tree_l2=0.001 --num_eval_steps=1 \
+  --vmodule=training_ops=1
+
+  When training is done, mean squared error on eval data is reported.
+  Point tensorboard to the directory for the run to see how the training
+  progresses:
+
+  tensorboard --logdir=/tmp/boston
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import sys
+import tensorflow as tf
+
+from tensorflow.contrib.boosted_trees.estimator_batch.dnn_tree_combined_estimator import DNNBoostedTreeCombinedRegressor
+from tensorflow.contrib.boosted_trees.proto import learner_pb2
+from tensorflow.contrib.layers.python.layers import feature_column
+from tensorflow.contrib.learn.python.learn import learn_runner
+from tensorflow.contrib.learn.python.learn.utils import input_fn_utils
+from tensorflow.contrib.learn.python.learn.utils import saved_model_export_utils
+
+_BOSTON_NUM_FEATURES = 13
+
+
+def _get_estimator(output_dir, feature_cols):
+  """Configures DNNBoostedTreeCombinedRegressor based on flags."""
+  learner_config = learner_pb2.LearnerConfig()
+  learner_config.learning_rate_tuner.fixed.learning_rate = (
+      FLAGS.tree_learning_rate)
+  learner_config.regularization.l1 = 0.0
+  learner_config.regularization.l2 = FLAGS.tree_l2
+  learner_config.constraints.max_tree_depth = FLAGS.tree_depth
+
+  run_config = tf.contrib.learn.RunConfig(save_summary_steps=1)
+
+  # Create a DNNBoostedTreeCombinedRegressor estimator.
+  estimator = DNNBoostedTreeCombinedRegressor(
+      dnn_hidden_units=[int(x) for x in FLAGS.dnn_hidden_units.split(",")],
+      dnn_feature_columns=feature_cols,
+      tree_learner_config=learner_config,
+      num_trees=FLAGS.num_trees,
+      # This should be the number of examples. For large datasets it can be
+      # larger than the batch_size.
+      tree_examples_per_layer=FLAGS.batch_size,
+      model_dir=output_dir,
+      config=run_config,
+      dnn_input_layer_to_tree=True,
+      dnn_steps_to_train=FLAGS.dnn_steps_to_train)
+  return estimator
+
+
+def _make_experiment_fn(output_dir):
+  """Creates experiment for DNNBoostedTreeCombinedRegressor."""
+  (x_train, y_train), (x_test,
+                       y_test) = tf.keras.datasets.boston_housing.load_data()
+
+  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={"x": x_train},
+      y=y_train,
+      batch_size=FLAGS.batch_size,
+      num_epochs=None,
+      shuffle=True)
+  eval_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={"x": x_test}, y=y_test, num_epochs=1, shuffle=False)
+
+  feature_columns = [
+      feature_column.real_valued_column("x", dimension=_BOSTON_NUM_FEATURES)
+  ]
+  feature_spec = tf.contrib.layers.create_feature_spec_for_parsing(
+      feature_columns)
+  serving_input_fn = input_fn_utils.build_parsing_serving_input_fn(feature_spec)
+  export_strategies = [
+      saved_model_export_utils.make_export_strategy(serving_input_fn)]
+  return tf.contrib.learn.Experiment(
+      estimator=_get_estimator(output_dir, feature_columns),
+      train_input_fn=train_input_fn,
+      eval_input_fn=eval_input_fn,
+      train_steps=None,
+      eval_steps=FLAGS.num_eval_steps,
+      eval_metrics=None,
+      export_strategies=export_strategies)
+
+
+def main(unused_argv):
+  learn_runner.run(
+      experiment_fn=_make_experiment_fn,
+      output_dir=FLAGS.output_dir,
+      schedule="train_and_evaluate")
+
+
+if __name__ == "__main__":
+  tf.logging.set_verbosity(tf.logging.INFO)
+  parser = argparse.ArgumentParser()
+  # Define the list of flags that users can change.
+  parser.add_argument(
+      "--batch_size",
+      type=int,
+      default=1000,
+      help="The batch size for reading data.")
+  parser.add_argument(
+      "--output_dir",
+      type=str,
+      required=True,
+      help="Choose the dir for the output.")
+  parser.add_argument(
+      "--num_eval_steps",
+      type=int,
+      default=1,
+      help="The number of steps to run evaluation for.")
+  # Flags for configuring DNNBoostedTreeCombinedRegressor.
+  parser.add_argument(
+      "--dnn_hidden_units",
+      type=str,
+      default="8,4",
+      help="Hidden layers for DNN.")
+  parser.add_argument(
+      "--dnn_steps_to_train",
+      type=int,
+      default=1000,
+      help="Number of steps to train DNN.")
+  parser.add_argument(
+      "--tree_depth", type=int, default=4, help="Maximum depth of trees.")
+  parser.add_argument(
+      "--tree_l2", type=float, default=1.0, help="l2 regularization per batch.")
+  parser.add_argument(
+      "--tree_learning_rate",
+      type=float,
+      default=0.1,
+      help=("Learning rate (shrinkage weight) with which each "
+            "new tree is added."))
+  parser.add_argument(
+      "--num_trees",
+      type=int,
+      default=None,
+      required=True,
+      help="Number of trees to grow before stopping.")
+
+  FLAGS, unparsed = parser.parse_known_args()
+  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
index 3bd30d8678920c1320bf6fedc2f40f5922237a92..18b4abd654ea3541d646a43ac901aca1a678446f 100644
--- a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
@@ -16,7 +16,7 @@
 #include <string>
 #include <vector>
 
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/feature-column-handler.h"
+#include "tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h"
 #include "tensorflow/contrib/boosted_trees/proto/split_info.pb.h"
 #include "tensorflow/contrib/boosted_trees/proto/tree_config.pb.h"
 #include "tensorflow/core/framework/device_base.h"
@@ -490,11 +490,11 @@ class BuildSparseInequalitySplitsOp : public BaseBuildSplitOp {
       }
       dense_split->set_feature_column(feature_column_group_id_);
       // Set the feature index for the best feature column.
-      const int64 best_feature_id =
+      const int64 best_dimension_id =
           bucket_ids_and_dimensions(best_element_idx, 1);
       const int32 best_bucket_id =
           bucket_ids_and_dimensions(best_element_idx, 0);
-      dense_split->set_feature_id(best_feature_id);
+      dense_split->set_dimension_id(best_dimension_id);
       dense_split->set_threshold(bucket_boundaries(best_bucket_id));
 
       auto* left_child = split_info.mutable_left_child();
diff --git a/tensorflow/contrib/boosted_trees/lib/BUILD b/tensorflow/contrib/boosted_trees/lib/BUILD
index 107ff0d295bee530c1711a97849fbd3c6cdb2f00..131bd48562a55a08981ac73277e93024db0d85d3 100644
--- a/tensorflow/contrib/boosted_trees/lib/BUILD
+++ b/tensorflow/contrib/boosted_trees/lib/BUILD
@@ -406,51 +406,9 @@ tf_cc_test(
 )
 
 # Learner/stochastic
-
-cc_library(
-    name = "feature-column-handlers",
-    srcs = [
-        "learner/stochastic/handlers/bias-feature-column-handler.cc",
-        "learner/stochastic/handlers/categorical-feature-column-handler.cc",
-        "learner/stochastic/handlers/dense-quantized-feature-column-handler.cc",
-        "learner/stochastic/handlers/sparse-quantized-feature-column-handler.cc",
-    ],
-    hdrs = [
-        "learner/stochastic/handlers/bias-feature-column-handler.h",
-        "learner/stochastic/handlers/categorical-feature-column-handler.h",
-        "learner/stochastic/handlers/dense-quantized-feature-column-handler.h",
-        "learner/stochastic/handlers/feature-column-handler.h",
-        "learner/stochastic/handlers/sparse-quantized-feature-column-handler.h",
-    ],
-    deps = [
-        ":feature-split-candidate",
-        ":feature-stats-accumulator",
-        "//tensorflow/contrib/boosted_trees/proto:learner_proto_cc",
-        "//tensorflow/core:framework_headers_lib",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
-tf_cc_test(
-    name = "feature-column-handlers_test",
-    size = "small",
-    srcs = [
-        "learner/stochastic/handlers/bias-feature-column-handler_test.cc",
-        "learner/stochastic/handlers/categorical-feature-column-handler_test.cc",
-        "learner/stochastic/handlers/dense-quantized-feature-column-handler_test.cc",
-        "learner/stochastic/handlers/sparse-quantized-feature-column-handler_test.cc",
-    ],
-    deps = [
-        ":feature-column-handlers",
-        "//tensorflow/core:tensor_testutil",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-    ],
-)
-
 cc_library(
     name = "gradient-stats",
-    hdrs = ["learner/stochastic/stats/gradient-stats.h"],
+    hdrs = ["learner/common/stats/gradient-stats.h"],
     deps = [
         "//tensorflow/core:framework_headers_lib",
         "//third_party/eigen3",
@@ -459,7 +417,7 @@ cc_library(
 
 cc_library(
     name = "node-stats",
-    hdrs = ["learner/stochastic/stats/node-stats.h"],
+    hdrs = ["learner/common/stats/node-stats.h"],
     deps = [
         ":gradient-stats",
         "//tensorflow/contrib/boosted_trees/proto:learner_proto_cc",
@@ -471,7 +429,7 @@ cc_library(
 
 cc_library(
     name = "split-stats",
-    hdrs = ["learner/stochastic/stats/split-stats.h"],
+    hdrs = ["learner/common/stats/split-stats.h"],
     deps = [
         ":node-stats",
     ],
@@ -479,7 +437,7 @@ cc_library(
 
 cc_library(
     name = "feature-split-candidate",
-    hdrs = ["learner/stochastic/stats/feature-split-candidate.h"],
+    hdrs = ["learner/common/stats/feature-split-candidate.h"],
     deps = [
         ":split-stats",
         "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_cc",
@@ -489,7 +447,7 @@ cc_library(
 tf_cc_test(
     name = "node-stats_test",
     size = "small",
-    srcs = ["learner/stochastic/stats/node-stats_test.cc"],
+    srcs = ["learner/common/stats/node-stats_test.cc"],
     deps = [
         ":node-stats",
         "//tensorflow/core:tensor_testutil",
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
index 72e20aaa127cda592bd314786cddb925cc87a075..7df514cd207c5e781f3b4abaa2020016b197669d 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
@@ -436,7 +436,7 @@ def dense_make_stats_update(is_active, are_buckets_ready, float_column,
     quantized_feature = quantile_ops.quantiles([float_column], [],
                                                [quantile_buckets], [], [])
     quantized_feature = math_ops.cast(quantized_feature[0], dtypes.int64)
-    quantized_feature = array_ops.squeeze(quantized_feature)
+    quantized_feature = array_ops.squeeze(quantized_feature, axis=0)
     return (example_partition_ids, quantized_feature, gradients, hessians)
 
   def not_ready_inputs_fn():
@@ -468,7 +468,7 @@ def sparse_make_stats_update(
                                                [sparse_column_indices])
 
     quantized_feature = math_ops.cast(quantized_feature[1], dtypes.int64)
-    quantized_feature = array_ops.squeeze(quantized_feature)
+    quantized_feature = array_ops.squeeze(quantized_feature, axis=0)
 
     example_indices, _ = array_ops.split(
         sparse_column_indices, num_or_size_splits=2, axis=1)
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py
index ee16a5f838a65f20db4436eb86527518621b6d8d..54d03018d9e266beabbbabd78ebbb80cfe689c04 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py
@@ -1121,6 +1121,87 @@ class SparseSplitHandlerTest(test_util.TensorFlowTestCase):
     self.assertEqual(len(gains), 0)
     self.assertEqual(len(splits), 0)
 
+  def testDegenerativeCase(self):
+    with self.test_session() as sess:
+      # One data example only, one leaf and thus one quantile bucket.The same
+      # situation is when all examples have the same values. This case was
+      # causing before a failure.
+      gradients = array_ops.constant([0.2])
+      hessians = array_ops.constant([0.12])
+      example_partitions = array_ops.constant([1], dtype=dtypes.int32)
+      indices = array_ops.constant([[0, 0]], dtype=dtypes.int64)
+      values = array_ops.constant([0.58])
+      sparse_column = sparse_tensor.SparseTensor(indices, values, [1, 1])
+
+      gradient_shape = tensor_shape.scalar()
+      hessian_shape = tensor_shape.scalar()
+      class_id = -1
+
+      split_handler = ordinal_split_handler.SparseSplitHandler(
+          l1_regularization=0,
+          l2_regularization=2,
+          tree_complexity_regularization=0,
+          min_node_weight=0,
+          epsilon=0.01,
+          num_quantiles=2,
+          feature_column_group_id=0,
+          sparse_float_column=sparse_column,
+          init_stamp_token=0,
+          gradient_shape=gradient_shape,
+          hessian_shape=hessian_shape,
+          multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS)
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      empty_gradients, empty_hessians = get_empty_tensors(
+          gradient_shape, hessian_shape)
+      example_weights = array_ops.ones([1, 1], dtypes.float32)
+
+      update_1 = split_handler.update_stats_sync(
+          0,
+          example_partitions,
+          gradients,
+          hessians,
+          empty_gradients,
+          empty_hessians,
+          example_weights,
+          is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_1]):
+        are_splits_ready = split_handler.make_splits(0, 1, class_id)[0]
+
+      with ops.control_dependencies([are_splits_ready]):
+        update_2 = split_handler.update_stats_sync(
+            1,
+            example_partitions,
+            gradients,
+            hessians,
+            empty_gradients,
+            empty_hessians,
+            example_weights,
+            is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_2]):
+        are_splits_ready2, partitions, gains, splits = (
+            split_handler.make_splits(1, 2, class_id))
+        are_splits_ready, are_splits_ready2, partitions, gains, splits = (
+            sess.run([
+                are_splits_ready, are_splits_ready2, partitions, gains, splits
+            ]))
+
+    # During the first iteration, inequality split handlers are not going to
+    # have any splits. Make sure that we return not_ready in that case.
+    self.assertFalse(are_splits_ready)
+    self.assertTrue(are_splits_ready2)
+
+    self.assertAllEqual([1], partitions)
+    self.assertAllEqual([0.0], gains)
+
+    split_info = split_info_pb2.SplitInfo()
+    split_info.ParseFromString(splits[0])
+    split_node = split_info.split_node.sparse_float_binary_split_default_left
+
+    self.assertEqual(0, split_node.split.feature_column)
+
+    self.assertAllClose(0.58, split_node.split.threshold)
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/feature-split-candidate.h b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/feature-split-candidate.h
similarity index 90%
rename from tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/feature-split-candidate.h
rename to tensorflow/contrib/boosted_trees/lib/learner/common/stats/feature-split-candidate.h
index fe22691178213094b9affcdee06af98011f85bd2..339c2e0fded10e6a7b140da62e152e2868ffd164 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/feature-split-candidate.h
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/feature-split-candidate.h
@@ -13,10 +13,10 @@
 // limitations under the License.
 //
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_FEATURE_SPLIT_CANDIDATE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_FEATURE_SPLIT_CANDIDATE_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_FEATURE_SPLIT_CANDIDATE_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_FEATURE_SPLIT_CANDIDATE_H_
 
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/split-stats.h"
+#include "tensorflow/contrib/boosted_trees/lib/learner/common/stats/split-stats.h"
 #include "tensorflow/contrib/boosted_trees/proto/tree_config.pb.h"
 
 namespace tensorflow {
@@ -58,4 +58,4 @@ struct FeatureSplitCandidate {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_FEATURE_SPLIT_CANDIDATE_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_FEATURE_SPLIT_CANDIDATE_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/gradient-stats.h b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/gradient-stats.h
similarity index 98%
rename from tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/gradient-stats.h
rename to tensorflow/contrib/boosted_trees/lib/learner/common/stats/gradient-stats.h
index dad64bf165a41bc4f32eea6b37e7afb569887a06..34e3ddb777242553d62035a51f1aec33d0f9ba54 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/gradient-stats.h
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/gradient-stats.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_GRADIENT_STATS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_GRADIENT_STATS_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_GRADIENT_STATS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_GRADIENT_STATS_H_
 
 #include <math.h>
 
@@ -190,4 +190,4 @@ inline GradientStats operator-(const GradientStats& a, const GradientStats& b) {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_GRADIENT_STATS_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_GRADIENT_STATS_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats.h b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h
similarity index 98%
rename from tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats.h
rename to tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h
index 4e5f53874df2207ffa6664a33675f84ef055394b..642a183aec5c7e591579fa5ee91d45729bfb624d 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats.h
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_NODE_STATS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_NODE_STATS_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_NODE_STATS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_NODE_STATS_H_
 
 #include "third_party/eigen3/Eigen/Core"
 #include "third_party/eigen3/Eigen/Eigenvalues"
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/gradient-stats.h"
+#include "tensorflow/contrib/boosted_trees/lib/learner/common/stats/gradient-stats.h"
 #include "tensorflow/contrib/boosted_trees/proto/learner.pb.h"
 #include "tensorflow/contrib/boosted_trees/proto/tree_config.pb.h"
 #include "tensorflow/core/framework/shape_inference.h"
@@ -298,4 +298,4 @@ struct NodeStats {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_NODE_STATS_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_NODE_STATS_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc
similarity index 99%
rename from tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats_test.cc
rename to tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc
index ecb7a04efb96248210d9af770c8377b7f6906598..f867e77d3ef0609774628b2a9c36ca52bcf2a957 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats.h"
+#include "tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h"
 
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/platform/test.h"
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/split-stats.h b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/split-stats.h
similarity index 94%
rename from tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/split-stats.h
rename to tensorflow/contrib/boosted_trees/lib/learner/common/stats/split-stats.h
index f700cbced833543227de39f54c9ecbb03a7ce7c9..054ccd9a8cd0be0c48b14cca013f15677deba900 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/split-stats.h
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/split-stats.h
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_SPLIT_STATS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_SPLIT_STATS_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_SPLIT_STATS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_SPLIT_STATS_H_
 
 #include <string>
 
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats.h"
+#include "tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h"
 
 namespace tensorflow {
 namespace boosted_trees {
@@ -81,4 +81,4 @@ struct SplitStats {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_SPLIT_STATS_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_SPLIT_STATS_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.cc
deleted file mode 100644
index b880cf2c47989b1434f17802befb7dd7c248b36f..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.cc
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-
-void BiasFeatureColumnHandler::AggregateGradientStats(
-    const std::vector<int32>& example_partition_ids,
-    const Tensor& example_first_order_gradients,
-    const Tensor& example_second_order_gradients,
-    FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
-        gradient_stats_accumulator) const {
-  // Pass over all examples and aggregate gradient stats for each sub-root.
-  for (int64 example_idx = 0; example_idx < batch_size_; ++example_idx) {
-    auto partition_id = example_partition_ids[example_idx];
-    gradient_stats_accumulator->AddStats(
-        slot_id_, class_id_, partition_id, kBiasFeatureId,
-        GradientStats(example_first_order_gradients,
-                      example_second_order_gradients, example_idx));
-  }
-}
-
-void BiasFeatureColumnHandler::GenerateFeatureSplitCandidates(
-    const LearnerConfig& learner_config, const std::vector<int32>& roots,
-    const std::vector<NodeStats>& root_stats,
-    const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
-        gradient_stats_accumulator,
-    std::vector<FeatureSplitCandidate>* split_candidates) const {
-  split_candidates->clear();
-  split_candidates->reserve(roots.size());
-  boosted_trees::trees::TreeNode tree_node;
-  for (size_t root_idx = 0; root_idx < roots.size(); ++root_idx) {
-    const NodeStats& root_node_stats = root_stats[root_idx];
-    tree_node.Clear();
-    root_node_stats.FillLeaf(class_id_, tree_node.mutable_leaf());
-    split_candidates->emplace_back(slot_id_, tree_node,
-                                   SplitStats(learner_config, root_node_stats));
-  }
-}
-
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.h b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.h
deleted file mode 100644
index 5c0f99185a63db33a391a98fa16f37bef99507c9..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.h
+++ /dev/null
@@ -1,57 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_H_  // NOLINT
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_H_  // NOLINT
-
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/feature-column-handler.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-
-// Handler for a bias feature column in the single class case.
-// This handler is useful even if we don't introduce a bias feature because
-// it allows us to aggregate stats per partition which in turn allows us
-// to compute node stats for each root to split.
-class BiasFeatureColumnHandler : public FeatureColumnHandler {
- public:
-  BiasFeatureColumnHandler(const uint32 class_id, const uint32 slot_id,
-                           const int64 batch_size)
-      : FeatureColumnHandler(class_id, slot_id, batch_size) {}
-
-  void AggregateGradientStats(
-      const std::vector<int32>& example_partition_ids,
-      const Tensor& example_first_order_gradients,
-      const Tensor& example_second_order_gradients,
-      FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
-          gradient_stats_accumulator) const override;
-
-  void GenerateFeatureSplitCandidates(
-      const LearnerConfig& learner_config, const std::vector<int32>& roots,
-      const std::vector<NodeStats>& root_stats,
-      const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
-          gradient_stats_accumulator,
-      std::vector<FeatureSplitCandidate>* split_candidates) const override;
-
-  static constexpr auto kBiasFeatureId = 0;
-};
-
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
-
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_H_  // NOLINT
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler_test.cc
deleted file mode 100644
index f4c7df7fabda1a38d7e6cca4c5c8bc81cb7551b1..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler_test.cc
+++ /dev/null
@@ -1,135 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.h"
-
-#include "tensorflow/core/framework/tensor_testutil.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-namespace {
-
-using boosted_trees::learner::LearnerConfig;
-
-const auto kClassId = 7;
-const auto kSlotId = 0;
-const auto kBatchSize = 4;
-
-using FeatureStatsAccumulator =
-    FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>;
-
-class BiasFeatureColumnHandlerTest : public ::testing::Test {
- protected:
-  BiasFeatureColumnHandlerTest()
-      : example_first_order_gradients_(
-            test::AsTensor<float>({0.2f, -0.5f, 1.2f, 4.0f}, {4})),
-        example_second_order_gradients_(
-            test::AsTensor<float>({0.12f, 0.07f, 0.2f, 0.13f}, {4})),
-        example_partitions_({0, 0, 1, 3}) {
-    // Set L2 regularization.
-    learner_config_.mutable_regularization()->set_l2(2.0f);
-    learner_config_.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS);
-
-    // Create handler.
-    handler_.reset(new BiasFeatureColumnHandler(kClassId, kSlotId, kBatchSize));
-  }
-
-  LearnerConfig learner_config_;
-  const Tensor example_first_order_gradients_;
-  const Tensor example_second_order_gradients_;
-  const std::vector<int32> example_partitions_;
-  std::unique_ptr<BiasFeatureColumnHandler> handler_;
-};
-
-TEST_F(BiasFeatureColumnHandlerTest, AggregateGradientStats) {
-  // Create handler.
-  FeatureStatsAccumulator accumulator(1);
-  handler_->AggregateGradientStats(
-      example_partitions_, example_first_order_gradients_,
-      example_second_order_gradients_, &accumulator);
-
-  // Check stats for each partition.
-  // Partition 0.
-  EXPECT_GRADIENT_STATS_EQ(
-      GradientStats(-0.3f, 0.19f),
-      accumulator.GetStats(kSlotId, kClassId, 0,
-                           BiasFeatureColumnHandler::kBiasFeatureId));
-  // Partition 1.
-  EXPECT_GRADIENT_STATS_EQ(
-      GradientStats(1.2f, 0.2f),
-      accumulator.GetStats(kSlotId, kClassId, 1,
-                           BiasFeatureColumnHandler::kBiasFeatureId));
-  // Partition 2.
-  EXPECT_GRADIENT_STATS_EQ(
-      GradientStats(0.0f, 0.0f),
-      accumulator.GetStats(kSlotId, kClassId, 2,
-                           BiasFeatureColumnHandler::kBiasFeatureId));
-  // Partition 3.
-  EXPECT_GRADIENT_STATS_EQ(
-      GradientStats(4.0f, 0.13f),
-      accumulator.GetStats(kSlotId, kClassId, 3,
-                           BiasFeatureColumnHandler::kBiasFeatureId));
-}
-
-TEST_F(BiasFeatureColumnHandlerTest, GenerateFeatureSplitCandidates) {
-  // Create handler.
-  FeatureStatsAccumulator accumulator(1);
-  handler_->AggregateGradientStats(
-      example_partitions_, example_first_order_gradients_,
-      example_second_order_gradients_, &accumulator);
-
-  // Get feature split candidates for two roots 0 and 3.
-  // Root 0 has zero gain and root 3 has the same gain as the leaf.
-  const std::vector<int32> roots = {0, 3};
-  const std::vector<NodeStats>& root_stats = {
-      NodeStats(1), NodeStats(learner_config_, GradientStats(4.0f, 0.13f))};
-  std::vector<FeatureSplitCandidate> split_candidates;
-  handler_->GenerateFeatureSplitCandidates(learner_config_, roots, root_stats,
-                                           accumulator, &split_candidates);
-  // Expect two candidate splits (one per root).
-  EXPECT_EQ(2, split_candidates.size());
-
-  // Verify first candidate for root 0, gain is expected to be the same as
-  // the left child since the root node gain is zero.
-  const SplitStats expected_split_stats0(learner_config_, root_stats[0]);
-  EXPECT_SPLIT_STATS_EQ(expected_split_stats0, split_candidates[0].split_stats);
-  const auto& tree_node0 = split_candidates[0].tree_node;
-  EXPECT_EQ(boosted_trees::trees::TreeNode::kLeaf, tree_node0.node_case());
-  EXPECT_EQ(1, tree_node0.leaf().sparse_vector().index_size());
-  EXPECT_EQ(kClassId, tree_node0.leaf().sparse_vector().index(0));
-  EXPECT_EQ(1, tree_node0.leaf().sparse_vector().value_size());
-  EXPECT_EQ(root_stats[0].weight_contribution[0],
-            tree_node0.leaf().sparse_vector().value(0));
-
-  // Verify second candidate for root 3, gain is expected to be zero as
-  // the left child gain is equal to the parent gain.
-  const SplitStats expected_split_stats1(learner_config_, root_stats[1]);
-  EXPECT_SPLIT_STATS_EQ(expected_split_stats1, split_candidates[1].split_stats);
-  const auto& tree_node1 = split_candidates[1].tree_node;
-  EXPECT_EQ(boosted_trees::trees::TreeNode::kLeaf, tree_node1.node_case());
-  EXPECT_EQ(1, tree_node1.leaf().sparse_vector().index_size());
-  EXPECT_EQ(kClassId, tree_node1.leaf().sparse_vector().index(0));
-  EXPECT_EQ(1, tree_node1.leaf().sparse_vector().value_size());
-  EXPECT_EQ(root_stats[1].weight_contribution[0],
-            tree_node1.leaf().sparse_vector().value(0));
-}
-
-}  // namespace
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.cc
deleted file mode 100644
index 3a6c409f846c9ca0bd6b5101e96447642b949978..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.cc
+++ /dev/null
@@ -1,140 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.h"
-
-#include "tensorflow/core/platform/macros.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-
-namespace {
-
-// Creates a categorical Id split node without assigning children.
-boosted_trees::trees::TreeNode CreateCategoricalIdNode(
-    const int32 feature_column, const int32 id) {
-  boosted_trees::trees::TreeNode split_node;
-  auto* split = split_node.mutable_categorical_id_binary_split();
-  split->set_feature_column(feature_column);
-  split->set_feature_id(id);
-  return split_node;
-}
-
-}  // namespace
-
-void CategoricalFeatureColumnHandler::AggregateGradientStats(
-    const std::vector<int32>& example_partition_ids,
-    const Tensor& example_first_order_gradients,
-    const Tensor& example_second_order_gradients,
-    FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
-        gradient_stats_accumulator) const {
-  // Pass over all rows and aggregate gradient stats for each feature id.
-  const int64 num_rows = indices_.dimension(0);
-  for (int64 row_idx = 0; row_idx < num_rows; ++row_idx) {
-    auto example_idx = indices_(row_idx, 0);
-    auto feature_id = values_(row_idx);
-    const GradientStats norm_gradient_stats(example_first_order_gradients,
-                                            example_second_order_gradients,
-                                            example_idx);
-    auto partition_id = example_partition_ids[example_idx];
-    gradient_stats_accumulator->AddStats(slot_id_, class_id_, partition_id,
-                                         feature_id, norm_gradient_stats);
-  }
-}
-
-void CategoricalFeatureColumnHandler::GenerateFeatureSplitCandidates(
-    const LearnerConfig& learner_config, const std::vector<int32>& roots,
-    const std::vector<NodeStats>& root_stats,
-    const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
-        gradient_stats_accumulator,
-    std::vector<FeatureSplitCandidate>* split_candidates) const {
-  // Build a reverse lookup of partition id to root idx.
-  std::unordered_map<int32, size_t> partition_id_to_root_idx;
-  partition_id_to_root_idx.reserve(roots.size());
-  for (size_t root_idx = 0; root_idx < roots.size(); ++root_idx) {
-    partition_id_to_root_idx[roots[root_idx]] = root_idx;
-  }
-
-  // Initialize split candidates.
-  split_candidates->clear();
-  if (!roots.empty()) {
-    FeatureSplitCandidate empty_candidate(
-        root_stats[0].weight_contribution.size());
-    split_candidates->resize(roots.size(), empty_candidate);
-  }
-  for (auto& split_candidate : *split_candidates) {
-    split_candidate.split_stats.gain = std::numeric_limits<float>::lowest();
-  }
-
-  // Evaluate split candidates for every root as each is a separate
-  // logical partition over the examples.
-  // Then for each root, we evaluate every feature id as an equality split
-  // and pick the highest split gain.
-  for (const auto& entry :
-       gradient_stats_accumulator.GetFeatureStats(slot_id_)) {
-    DCHECK_EQ(entry.first.class_id, class_id_);
-
-    // Get partition id and root node stats.
-    const int32 partition_id = entry.first.partition_id;
-    auto root_idx_it = partition_id_to_root_idx.find(partition_id);
-    if (root_idx_it == partition_id_to_root_idx.end()) {
-      // Inactive partition.
-      continue;
-    }
-    size_t root_idx = root_idx_it->second;
-    const NodeStats& root_node_stats = root_stats[root_idx];
-
-    // Get gradient stats.
-    const auto& left_gradient_stats = entry.second;
-    auto right_gradient_stats =
-        root_node_stats.gradient_stats - left_gradient_stats;
-
-    // Get node stats.
-    NodeStats left_node_stats(learner_config, left_gradient_stats);
-    NodeStats right_node_stats(learner_config, right_gradient_stats);
-
-    // Generate split candidate and update best split candidate for the
-    // current root if needed.
-    FeatureSplitCandidate split_candidate(
-        slot_id_,
-        CreateCategoricalIdNode(feature_column_, entry.first.feature_id),
-        SplitStats(learner_config, root_node_stats, left_node_stats,
-                   right_node_stats));
-    FeatureSplitCandidate& best_split_candidate = (*split_candidates)[root_idx];
-    if (TF_PREDICT_FALSE(best_split_candidate.tree_node.node_case() ==
-                         boosted_trees::trees::TreeNode::NODE_NOT_SET)) {
-      // Always replace candidates with no node set.
-      best_split_candidate = std::move(split_candidate);
-    } else if (TF_PREDICT_FALSE(split_candidate.split_stats.gain ==
-                                best_split_candidate.split_stats.gain)) {
-      // Tie break on feature id.
-      auto best_split_feature_id =
-          best_split_candidate.tree_node.categorical_id_binary_split()
-              .feature_id();
-      if (entry.first.feature_id < best_split_feature_id) {
-        best_split_candidate = std::move(split_candidate);
-      }
-    } else if (split_candidate.split_stats.gain >
-               best_split_candidate.split_stats.gain) {
-      best_split_candidate = std::move(split_candidate);
-    }
-  }
-}
-
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.h b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.h
deleted file mode 100644
index ef964ba716c6adf9cf9c291cca5f52f7a6efe26f..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.h
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_CATEGORICAL_FEATURE_COLUMN_HANDLER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_CATEGORICAL_FEATURE_COLUMN_HANDLER_H_
-
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/feature-column-handler.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-
-// Handler for a categorical feature column in the single class case.
-class CategoricalFeatureColumnHandler : public FeatureColumnHandler {
- public:
-  CategoricalFeatureColumnHandler(const int32 class_id, const int32 slot_id,
-                                  const int64 batch_size,
-                                  const int32 feature_column,
-                                  TTypes<int64>::ConstMatrix indices,
-                                  TTypes<int64>::ConstVec values)
-      : FeatureColumnHandler(class_id, slot_id, batch_size),
-        feature_column_(feature_column),
-        indices_(indices),
-        values_(values) {}
-
-  void AggregateGradientStats(
-      const std::vector<int32>& example_partition_ids,
-      const Tensor& example_first_order_gradients,
-      const Tensor& example_second_order_gradients,
-      FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
-          gradient_stats_accumulator) const override;
-
-  void GenerateFeatureSplitCandidates(
-      const LearnerConfig& learner_config, const std::vector<int32>& roots,
-      const std::vector<NodeStats>& root_stats,
-      const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
-          gradient_stats_accumulator,
-      std::vector<FeatureSplitCandidate>* split_candidates) const override;
-
- protected:
-  const int32 feature_column_;
-  TTypes<int64>::ConstMatrix indices_;
-  TTypes<int64>::ConstVec values_;
-};
-
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
-
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_CATEGORICAL_FEATURE_COLUMN_HANDLER_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler_test.cc
deleted file mode 100644
index ea82b3f086d24dc1f9ceb4783abd68be35b34b00..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler_test.cc
+++ /dev/null
@@ -1,165 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.h"
-
-#include "tensorflow/core/framework/tensor_testutil.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-namespace {
-
-using boosted_trees::learner::LearnerConfig;
-
-const auto kClassId = 7;
-const auto kSlotId = 0;
-const auto kBatchSize = 4;
-const auto kFeatureColumn = 3;
-
-using FeatureStatsAccumulator =
-    FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>;
-
-class CategoricalFeatureColumnHandlerTest : public ::testing::Test {
- protected:
-  // The data looks like the following:
-  // Example |  Gradients    | Partition | Feature Id |
-  // i0      |  (0.2, 0.12)  |     0     |    1,2     |
-  // i1      |  (-0.5, 0.07) |     0     |            |
-  // i2      |  (1.2, 0.2)   |     0     |     2      |
-  // i3      |  (4.0, 0.13)  |     1     |     0      |
-  CategoricalFeatureColumnHandlerTest()
-      : example_first_order_gradients_(
-            test::AsTensor<float>({0.2f, -0.5f, 1.2f, 4.0f}, {4})),
-        example_second_order_gradients_(
-            test::AsTensor<float>({0.12f, 0.07f, 0.2f, 0.13f}, {4})),
-        example_partitions_({0, 0, 0, 1}),
-        indices_(test::AsTensor<int64>({0, 0, 0, 1, 2, 0, 3, 0}, {4, 2})),
-        values_(test::AsTensor<int64>({1, 2, 2, 0}, {4})) {
-    // Set L2 regularization.
-    learner_config_.mutable_regularization()->set_l2(2.0f);
-    learner_config_.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS);
-    // Create handler.
-    handler_.reset(new CategoricalFeatureColumnHandler(
-        kClassId, kSlotId, kBatchSize, kFeatureColumn, indices_.matrix<int64>(),
-        values_.vec<int64>()));
-  }
-
-  LearnerConfig learner_config_;
-  const Tensor example_first_order_gradients_;
-  const Tensor example_second_order_gradients_;
-  const std::vector<int32> example_partitions_;
-  const Tensor indices_;
-  const Tensor values_;
-  std::unique_ptr<FeatureColumnHandler> handler_;
-};
-
-TEST_F(CategoricalFeatureColumnHandlerTest, AggregateGradientStats) {
-  // Create handler.
-  FeatureStatsAccumulator accumulator(1);
-  handler_->AggregateGradientStats(
-      example_partitions_, example_first_order_gradients_,
-      example_second_order_gradients_, &accumulator);
-
-  // Check stats for each partition and feature.
-  // Partition 0, Feature 0.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(0.0f, 0.0f),
-                           accumulator.GetStats(kSlotId, kClassId, 0, 0));
-  // Partition 0, Feature 1.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(0.2f, 0.12f),
-                           accumulator.GetStats(kSlotId, kClassId, 0, 1));
-  // Partition 0, Feature 2.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(0.2f + 1.2f, 0.12f + 0.2f),
-                           accumulator.GetStats(kSlotId, kClassId, 0, 2));
-
-  // Partition 1, Feature 0.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(4.0f, 0.13f),
-                           accumulator.GetStats(kSlotId, kClassId, 1, 0));
-  // Partition 1, Feature 1.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(0.0f, 0.0f),
-                           accumulator.GetStats(kSlotId, kClassId, 1, 1));
-  // Partition 1, Feature 2.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(0.0f, 0.0f),
-                           accumulator.GetStats(kSlotId, kClassId, 1, 2));
-}
-
-TEST_F(CategoricalFeatureColumnHandlerTest, GenerateFeatureSplitCandidates) {
-  // Create handler.
-  FeatureStatsAccumulator accumulator(1);
-  handler_->AggregateGradientStats(
-      example_partitions_, example_first_order_gradients_,
-      example_second_order_gradients_, &accumulator);
-
-  // Get feature split candidates for two roots 0 and 1.
-  // The root stats are derived from the per-partition total gradient stats.
-  const std::vector<int32> roots = {0, 1, 5};
-  const std::vector<NodeStats>& root_stats = {
-      NodeStats(learner_config_, GradientStats(0.9f, 0.39f)),
-      NodeStats(learner_config_, GradientStats(4.0f, 0.13f)), NodeStats(1)};
-  std::vector<FeatureSplitCandidate> split_candidates;
-  handler_->GenerateFeatureSplitCandidates(learner_config_, roots, root_stats,
-                                           accumulator, &split_candidates);
-  // Expect three candidate splits (one per root).
-  EXPECT_EQ(3, split_candidates.size());
-
-  // Verify candidate for root 0, the best split occurs when we route
-  // example i0, i2 left and i1 right.
-  const NodeStats expected_left_node0(learner_config_,
-                                      GradientStats(0.2f + 1.2f, 0.12f + 0.2f));
-  const NodeStats expected_right_node0(
-      learner_config_,
-      root_stats[0].gradient_stats - expected_left_node0.gradient_stats);
-  const SplitStats expected_split_stats0(learner_config_, root_stats[0],
-                                         expected_left_node0,
-                                         expected_right_node0);
-  EXPECT_SPLIT_STATS_EQ(expected_split_stats0, split_candidates[0].split_stats);
-
-  const auto& tree_node0 = split_candidates[0].tree_node;
-  EXPECT_EQ(
-      boosted_trees::trees::TreeNode::kCategoricalIdBinarySplitFieldNumber,
-      tree_node0.node_case());
-  const auto& split0 = tree_node0.categorical_id_binary_split();
-  EXPECT_EQ(2, split0.feature_id());
-  EXPECT_EQ(kFeatureColumn, split0.feature_column());
-
-  // Verify candidate for root 1, there's only one active feature here
-  // so zero gain is expected.
-  const NodeStats expected_left_node1(learner_config_,
-                                      root_stats[1].gradient_stats);
-  const NodeStats expected_right_node1(learner_config_, GradientStats(0, 0));
-  const SplitStats expected_split_stats1(learner_config_, root_stats[1],
-                                         expected_left_node1,
-                                         expected_right_node1);
-  EXPECT_SPLIT_STATS_EQ(expected_split_stats1, split_candidates[1].split_stats);
-  const auto& tree_node1 = split_candidates[1].tree_node;
-  EXPECT_EQ(
-      boosted_trees::trees::TreeNode::kCategoricalIdBinarySplitFieldNumber,
-      tree_node1.node_case());
-  const auto& split1 = tree_node1.categorical_id_binary_split();
-  EXPECT_EQ(0, split1.feature_id());
-  EXPECT_EQ(kFeatureColumn, split1.feature_column());
-
-  // Verify there are no candidate splits for root 5.
-  const auto& tree_node2 = split_candidates[2].tree_node;
-  EXPECT_EQ(boosted_trees::trees::TreeNode::NODE_NOT_SET,
-            tree_node2.node_case());
-}
-
-}  // namespace
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.cc
deleted file mode 100644
index ca7bb71e7d0b0fc945ee29092b1e36022d4c0943..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.cc
+++ /dev/null
@@ -1,116 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-
-namespace {
-
-// Creates a dense split node without assigning children.
-boosted_trees::trees::TreeNode CreateDenseSplitNode(const int32 feature_column,
-                                                    const float threshold) {
-  boosted_trees::trees::TreeNode split_node;
-  auto* split = split_node.mutable_dense_float_binary_split();
-  split->set_feature_column(feature_column);
-  split->set_threshold(threshold);
-  return split_node;
-}
-
-}  // namespace
-
-void DenseQuantizedFeatureColumnHandler::AggregateGradientStats(
-    const std::vector<int32>& example_partition_ids,
-    const Tensor& example_first_order_gradients,
-    const Tensor& example_second_order_gradients,
-    FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
-        gradient_stats_accumulator) const {
-  // Pass over all examples and aggregate gradient stats for each partition
-  // and quantized feature bucket.
-  for (int64 example_idx = 0; example_idx < batch_size_; ++example_idx) {
-    auto partition_id = example_partition_ids[example_idx];
-    auto feature_id = dense_quantized_values_(example_idx);
-    gradient_stats_accumulator->AddStats(
-        slot_id_, class_id_, partition_id, feature_id,
-        GradientStats(example_first_order_gradients,
-                      example_second_order_gradients, example_idx));
-  }
-}
-
-void DenseQuantizedFeatureColumnHandler::GenerateFeatureSplitCandidates(
-    const LearnerConfig& learner_config, const std::vector<int32>& roots,
-    const std::vector<NodeStats>& root_stats,
-    const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
-        gradient_stats_accumulator,
-    std::vector<FeatureSplitCandidate>* split_candidates) const {
-  // Evaluate split candidates for every root as each is a separate
-  // logical partition over the examples.
-  // Then for each root, we do a forward-only pass over the quantized
-  // feature buckets accumulating gradients from left to right.
-  // Split gains are evaluated at every threshold and the best split is picked.
-  split_candidates->clear();
-  split_candidates->reserve(roots.size());
-  for (size_t root_idx = 0; root_idx < roots.size(); ++root_idx) {
-    // Get partition Id and root node stats.
-    const int32 partition_id = roots[root_idx];
-    const NodeStats& root_node_stats = root_stats[root_idx];
-
-    // Forward left to right pass over quantiles.
-    GradientStats left_gradient_stats;
-    GradientStats right_gradient_stats(root_node_stats.gradient_stats);
-    FeatureSplitCandidate best_split_candidate(
-        root_node_stats.weight_contribution.size());
-    best_split_candidate.split_stats.gain =
-        std::numeric_limits<float>::lowest();
-    for (int bucket_id = 0; bucket_id < dense_quantiles_.size(); ++bucket_id) {
-      // Get gradient stats.
-      auto gradient_stats = gradient_stats_accumulator.GetStats(
-          slot_id_, class_id_, partition_id, bucket_id);
-      if (gradient_stats.IsZero()) {
-        continue;
-      }
-
-      // Update gradient stats.
-      left_gradient_stats += gradient_stats;
-      right_gradient_stats =
-          root_node_stats.gradient_stats - left_gradient_stats;
-
-      // Get node stats
-      NodeStats left_node_stats(learner_config, left_gradient_stats);
-      NodeStats right_node_stats(learner_config, right_gradient_stats);
-
-      // Generate split candidate.
-      const float threshold = dense_quantiles_(bucket_id);
-      FeatureSplitCandidate split_candidate(
-          slot_id_, CreateDenseSplitNode(dense_feature_column_, threshold),
-          SplitStats(learner_config, root_node_stats, left_node_stats,
-                     right_node_stats));
-      if (split_candidate.split_stats.gain >
-          best_split_candidate.split_stats.gain) {
-        best_split_candidate = std::move(split_candidate);
-      }
-    }
-
-    // Add best candidate for partition.
-    split_candidates->push_back(std::move(best_split_candidate));
-  }
-}
-
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.h b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.h
deleted file mode 100644
index 0f3858e4d8c406e9ec3ae7079b241e94ef4aa35c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.h
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_DENSE_QUANTIZED_FEATURE_COLUMN_HANDLER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_DENSE_QUANTIZED_FEATURE_COLUMN_HANDLER_H_
-
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/feature-column-handler.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-
-// Handler for a dense quantized feature column in the single class case.
-class DenseQuantizedFeatureColumnHandler : public FeatureColumnHandler {
- public:
-  DenseQuantizedFeatureColumnHandler(
-      const int32 class_id, const int32 slot_id, const int64 batch_size,
-      const int32 dense_feature_column, TTypes<float>::ConstVec dense_quantiles,
-      TTypes<int32>::ConstVec dense_quantized_values)
-      : FeatureColumnHandler(class_id, slot_id, batch_size),
-        dense_feature_column_(dense_feature_column),
-        dense_quantiles_(dense_quantiles),
-        dense_quantized_values_(dense_quantized_values) {}
-
-  void AggregateGradientStats(
-      const std::vector<int32>& example_partition_ids,
-      const Tensor& example_first_order_gradients,
-      const Tensor& example_second_order_gradients,
-      FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
-          gradient_stats_accumulator) const override;
-
-  void GenerateFeatureSplitCandidates(
-      const LearnerConfig& learner_config, const std::vector<int32>& roots,
-      const std::vector<NodeStats>& root_stats,
-      const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
-          gradient_stats_accumulator,
-      std::vector<FeatureSplitCandidate>* split_candidates) const override;
-
- protected:
-  const int32 dense_feature_column_;
-  TTypes<float>::ConstVec dense_quantiles_;
-  TTypes<int32>::ConstVec dense_quantized_values_;
-};
-
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
-
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_DENSE_QUANTIZED_FEATURE_COLUMN_HANDLER_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler_test.cc
deleted file mode 100644
index 1bc9d733ad3090f1cfc9547644061f54d7d2c8c6..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler_test.cc
+++ /dev/null
@@ -1,155 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.h"
-
-#include "tensorflow/core/framework/tensor_testutil.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-namespace {
-
-using boosted_trees::learner::LearnerConfig;
-
-const auto kClassId = 1;
-const auto kSlotId = 0;
-const auto kBatchSize = 4;
-const auto kFeatureColumn = 2;
-
-using FeatureStatsAccumulator =
-    FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>;
-
-class DenseQuantizedFeatureColumnHandlerTest : public ::testing::Test {
- protected:
-  // The data looks like the following:
-  // Example |  Gradients    | Partition | Dense Quantile |
-  // i0      |  (0.2, 0.12)  | 0         | 1              |
-  // i1      |  (-0.5, 0.07) | 0         | 1              |
-  // i2      |  (1.2, 0.2)   | 0         | 0              |
-  // i3      |  (4.0, 0.13)  | 1         | 1              |
-  DenseQuantizedFeatureColumnHandlerTest()
-      : example_first_order_gradients_(
-            test::AsTensor<float>({0.2f, -0.5f, 1.2f, 4.0f}, {4})),
-        example_second_order_gradients_(
-            test::AsTensor<float>({0.12f, 0.07f, 0.2f, 0.13f}, {4})),
-        example_partitions_({0, 0, 0, 1}),
-        dense_quantiles_(test::AsTensor<float>({0.3f, 0.52f}, {2})),
-        dense_quantized_values_(test::AsTensor<int32>({1, 1, 0, 1}, {4})) {
-    // Set L2 regularization.
-    learner_config_.mutable_regularization()->set_l2(2.0f);
-    learner_config_.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS);
-    // Create handler.
-    handler_.reset(new DenseQuantizedFeatureColumnHandler(
-        kClassId, kSlotId, kBatchSize, kFeatureColumn,
-        dense_quantiles_.vec<float>(), dense_quantized_values_.vec<int32>()));
-  }
-
-  LearnerConfig learner_config_;
-  const Tensor example_first_order_gradients_;
-  const Tensor example_second_order_gradients_;
-  const std::vector<int32> example_partitions_;
-  const Tensor dense_quantiles_;
-  const Tensor dense_quantized_values_;
-  std::unique_ptr<FeatureColumnHandler> handler_;
-};
-
-TEST_F(DenseQuantizedFeatureColumnHandlerTest, AggregateGradientStats) {
-  // Create handler.
-  FeatureStatsAccumulator accumulator(1);
-  handler_->AggregateGradientStats(
-      example_partitions_, example_first_order_gradients_,
-      example_second_order_gradients_, &accumulator);
-
-  // Check stats for each partition and feature.
-  // Partition 0, Feature 0.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(1.2f, 0.2f),
-                           accumulator.GetStats(kSlotId, kClassId, 0, 0));
-  // Partition 0, Feature 1.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(-0.3f, 0.19f),
-                           accumulator.GetStats(kSlotId, kClassId, 0, 1));
-  // Partition 1, Feature 0.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(0.0f, 0.0f),
-                           accumulator.GetStats(kSlotId, kClassId, 1, 0));
-  // Partition 1, Feature 1.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(4.0f, 0.13f),
-                           accumulator.GetStats(kSlotId, kClassId, 1, 1));
-}
-
-TEST_F(DenseQuantizedFeatureColumnHandlerTest, GenerateFeatureSplitCandidates) {
-  // Create handler.
-  FeatureStatsAccumulator accumulator(1);
-  handler_->AggregateGradientStats(
-      example_partitions_, example_first_order_gradients_,
-      example_second_order_gradients_, &accumulator);
-
-  // Get feature split candidates for two roots 0 and 1.
-  // The root stats are derived from the per-partition total gradient stats.
-  const std::vector<int32> roots = {0, 1, 5};
-  const std::vector<NodeStats>& root_stats = {
-      NodeStats(learner_config_, GradientStats(0.9f, 0.39f)),
-      NodeStats(learner_config_, GradientStats(4.0f, 0.13f)), NodeStats(1)};
-  std::vector<FeatureSplitCandidate> split_candidates;
-  handler_->GenerateFeatureSplitCandidates(learner_config_, roots, root_stats,
-                                           accumulator, &split_candidates);
-  // Expect three candidate splits (one per root).
-  EXPECT_EQ(3, split_candidates.size());
-
-  // Verify candidate for root 0, the best split occurs when we route
-  // example i2 left and i0, i1 right.
-  const NodeStats expected_left_node0(learner_config_,
-                                      GradientStats(1.2f, 0.2f));
-  const NodeStats expected_right_node0(
-      learner_config_,
-      root_stats[0].gradient_stats - expected_left_node0.gradient_stats);
-  const SplitStats expected_split_stats0(learner_config_, root_stats[0],
-                                         expected_left_node0,
-                                         expected_right_node0);
-  EXPECT_SPLIT_STATS_EQ(expected_split_stats0, split_candidates[0].split_stats);
-  const auto& tree_node0 = split_candidates[0].tree_node;
-  EXPECT_EQ(boosted_trees::trees::TreeNode::kDenseFloatBinarySplit,
-            tree_node0.node_case());
-  const auto& split0 = tree_node0.dense_float_binary_split();
-  EXPECT_FLOAT_EQ(dense_quantiles_.vec<float>()(0), split0.threshold());
-  EXPECT_EQ(kFeatureColumn, split0.feature_column());
-
-  // Verify candidate for root 1, there's only one active bucket here
-  // so zero gain is expected.
-  const NodeStats expected_left_node1(learner_config_,
-                                      root_stats[1].gradient_stats);
-  const NodeStats expected_right_node1(learner_config_, GradientStats(0, 0));
-  const SplitStats expected_split_stats1(learner_config_, root_stats[1],
-                                         expected_left_node1,
-                                         expected_right_node1);
-  EXPECT_SPLIT_STATS_EQ(expected_split_stats1, split_candidates[1].split_stats);
-  const auto& tree_node1 = split_candidates[1].tree_node;
-  EXPECT_EQ(boosted_trees::trees::TreeNode::kDenseFloatBinarySplit,
-            tree_node1.node_case());
-  const auto& split1 = tree_node1.dense_float_binary_split();
-  EXPECT_FLOAT_EQ(dense_quantiles_.vec<float>()(1), split1.threshold());
-  EXPECT_EQ(kFeatureColumn, split1.feature_column());
-
-  // Verify there are no candidate splits for root 5.
-  const auto& tree_node2 = split_candidates[2].tree_node;
-  EXPECT_EQ(boosted_trees::trees::TreeNode::NODE_NOT_SET,
-            tree_node2.node_case());
-}
-
-}  // namespace
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/feature-column-handler.h b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/feature-column-handler.h
deleted file mode 100644
index 8bd2092f9609cb684b89f70cab35a92789fb39a4..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/feature-column-handler.h
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_FEATURE_COLUMN_HANDLER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_FEATURE_COLUMN_HANDLER_H_
-
-#include <vector>
-#include "tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/feature-stats-accumulator.h"
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/feature-split-candidate.h"
-#include "tensorflow/contrib/boosted_trees/proto/learner.pb.h"
-#include "tensorflow/core/framework/attr_value.pb.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor_types.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-
-// Handler interface for feature columns. Each feature column type may
-// have its own handler which encapsulates the logic of aggregating gradient
-// stats as well as generating split candidates for each partition.
-// Handlers can be stateful and must be thread compatible.
-class FeatureColumnHandler {
- public:
-  FeatureColumnHandler(const int32 class_id, const int32 slot_id,
-                       const int64 batch_size)
-      : class_id_(class_id), slot_id_(slot_id), batch_size_(batch_size) {}
-
-  virtual ~FeatureColumnHandler() {}
-  FeatureColumnHandler(const FeatureColumnHandler& other) = delete;
-  FeatureColumnHandler& operator=(const FeatureColumnHandler& other) = delete;
-
-  // Aggregates example gradient stats for the feature column.
-  virtual void AggregateGradientStats(
-      const std::vector<int32>& example_partition_ids,
-      const Tensor& example_first_order_gradients,
-      const Tensor& example_second_order_gradients,
-      FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
-          gradient_stats_accumulator) const = 0;
-
-  // Generates feature column split candidates for the specified roots.
-  virtual void GenerateFeatureSplitCandidates(
-      const LearnerConfig& learner_config, const std::vector<int32>& roots,
-      const std::vector<NodeStats>& root_stats,
-      const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
-          gradient_stats_accumulator,
-      std::vector<FeatureSplitCandidate>* split_candidates) const = 0;
-
-  // Accessors.
-  int32 class_id() const { return class_id_; }
-  int32 slot_id() const { return slot_id_; }
-  int64 batch_size() const { return batch_size_; }
-
- protected:
-  // The class Id.
-  const int32 class_id_;
-
-  // The slod Id for use as a unique Id across all feature columns.
-  const int32 slot_id_;
-
-  // Size of the batch of examples.
-  const int64 batch_size_;
-};
-
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
-
-#endif  //  THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_FEATURE_COLUMN_HANDLER_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.cc
deleted file mode 100644
index a0e9efbbc5030e8c2e25fafab98271337a2e582a..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.cc
+++ /dev/null
@@ -1,172 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-
-namespace {
-
-// Creates a sparse default right split node without assigning children.
-boosted_trees::trees::TreeNode CreateSparseSplitNodeDefaultRight(
-    int32 feature_column, float threshold) {
-  boosted_trees::trees::TreeNode split_node;
-  auto* split = split_node.mutable_sparse_float_binary_split_default_right()
-                    ->mutable_split();
-  split->set_feature_column(feature_column);
-  split->set_threshold(threshold);
-  return split_node;
-}
-
-// Creates a sparse default left split node without assigning children.
-boosted_trees::trees::TreeNode CreateSparseSplitNodeDefaultLeft(
-    int32 feature_column, float threshold) {
-  boosted_trees::trees::TreeNode split_node;
-  auto* split = split_node.mutable_sparse_float_binary_split_default_left()
-                    ->mutable_split();
-  split->set_feature_column(feature_column);
-  split->set_threshold(threshold);
-  return split_node;
-}
-
-}  // namespace
-
-void SparseQuantizedFeatureColumnHandler::AggregateGradientStats(
-    const std::vector<int32>& example_partition_ids,
-    const Tensor& example_first_order_gradients,
-    const Tensor& example_second_order_gradients,
-    FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
-        gradient_stats_accumulator) const {
-  // Pass over all rows and aggregate gradient stats for each partition
-  // and quantized feature bucket.
-  const int64 num_rows = sparse_indices_.dimension(0);
-  for (int64 row_idx = 0; row_idx < num_rows; ++row_idx) {
-    auto example_idx = sparse_indices_(row_idx, 0);
-    auto partition_id = example_partition_ids[example_idx];
-    auto feature_id = sparse_quantized_values_(row_idx);
-    gradient_stats_accumulator->AddStats(
-        slot_id_, class_id_, partition_id, feature_id,
-        GradientStats(example_first_order_gradients,
-                      example_second_order_gradients, example_idx));
-  }
-}
-
-void SparseQuantizedFeatureColumnHandler::GenerateFeatureSplitCandidates(
-    const LearnerConfig& learner_config, const std::vector<int32>& roots,
-    const std::vector<NodeStats>& root_stats,
-    const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
-        gradient_stats_accumulator,
-    std::vector<FeatureSplitCandidate>* split_candidates) const {
-  // Evaluate split candidates for every root as each is a separate
-  // logical partition over the examples.
-  // Then for each root, we do both a forward left to right pass and a backward
-  // right to left pass over the quantized feature buckets accumulating
-  // gradients on one side and using the root aggregate gradients to get the
-  // gradients for the other side. Split gains are evaluated for each pass at
-  // every threshold and the best split is picked.
-  split_candidates->clear();
-  split_candidates->reserve(roots.size());
-  for (size_t root_idx = 0; root_idx < roots.size(); ++root_idx) {
-    // Get partition Id and root node stats.
-    const int32 partition_id = roots[root_idx];
-    const NodeStats& root_node_stats = root_stats[root_idx];
-
-    // Forward pass with right default direction.
-    GradientStats left_gradient_stats;
-    GradientStats right_gradient_stats(root_node_stats.gradient_stats);
-    FeatureSplitCandidate best_split_candidate(
-        root_node_stats.weight_contribution.size());
-    best_split_candidate.split_stats.gain =
-        std::numeric_limits<float>::lowest();
-    for (int bucket_id = 0; bucket_id < sparse_quantiles_.size(); ++bucket_id) {
-      // Get gradient stats.
-      auto gradient_stats = gradient_stats_accumulator.GetStats(
-          slot_id_, class_id_, partition_id, bucket_id);
-      if (gradient_stats.IsZero()) {
-        continue;
-      }
-
-      // Update gradient stats.
-      left_gradient_stats += gradient_stats;
-      right_gradient_stats =
-          root_node_stats.gradient_stats - left_gradient_stats;
-
-      // Get node stats
-      NodeStats left_node_stats(learner_config, left_gradient_stats);
-      NodeStats right_node_stats(learner_config, right_gradient_stats);
-
-      // Generate split candidate.
-      const float threshold = sparse_quantiles_(bucket_id);
-      FeatureSplitCandidate split_candidate(
-          slot_id_,
-          CreateSparseSplitNodeDefaultRight(sparse_feature_column_, threshold),
-          SplitStats(learner_config, root_node_stats, left_node_stats,
-                     right_node_stats));
-      if (split_candidate.split_stats.gain >
-          best_split_candidate.split_stats.gain) {
-        best_split_candidate = std::move(split_candidate);
-      }
-    }
-
-    // Determine if we need a backward pass by checking if the residual gradient
-    // after forward aggregation is almost the same as the aggregated gradient.
-    // for the current root. This helps avoid unnecessary computation as well
-    // as consistency due to floating point precision.
-    if (!right_gradient_stats.IsAlmostZero()) {
-      // Backward pass with left default direction.
-      right_gradient_stats = GradientStats();
-      left_gradient_stats = root_node_stats.gradient_stats;
-      for (int bucket_id = sparse_quantiles_.size() - 1; bucket_id > 0;
-           --bucket_id) {
-        // Get gradient stats.
-        auto gradient_stats = gradient_stats_accumulator.GetStats(
-            slot_id_, class_id_, partition_id, bucket_id);
-        if (gradient_stats.IsZero()) {
-          continue;
-        }
-
-        // Update gradient stats.
-        right_gradient_stats += gradient_stats;
-        left_gradient_stats = root_node_stats.gradient_stats - gradient_stats;
-
-        // Get node stats
-        NodeStats left_node_stats(learner_config, left_gradient_stats);
-        NodeStats right_node_stats(learner_config, right_gradient_stats);
-
-        // Generate split candidate.
-        const float threshold = sparse_quantiles_(bucket_id - 1);
-        FeatureSplitCandidate split_candidate(
-            slot_id_,
-            CreateSparseSplitNodeDefaultLeft(sparse_feature_column_, threshold),
-            SplitStats(learner_config, root_node_stats, left_node_stats,
-                       right_node_stats));
-        if (split_candidate.split_stats.gain >
-            best_split_candidate.split_stats.gain) {
-          best_split_candidate = std::move(split_candidate);
-        }
-      }
-    }
-
-    // Add best candidate for partition.
-    split_candidates->push_back(std::move(best_split_candidate));
-  }
-}
-
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.h b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.h
deleted file mode 100644
index eb63e705471a65e8448bda38b2e31eb971d5c1bb..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.h
+++ /dev/null
@@ -1,67 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_SPARSE_QUANTIZED_FEATURE_COLUMN_HANDLER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_SPARSE_QUANTIZED_FEATURE_COLUMN_HANDLER_H_
-
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/feature-column-handler.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-
-// Handler for a sparse quantized feature column in the single class case.
-class SparseQuantizedFeatureColumnHandler : public FeatureColumnHandler {
- public:
-  SparseQuantizedFeatureColumnHandler(
-      const int32 class_id, const int32 slot_id, const int64 batch_size,
-      const int32 sparse_feature_column,
-      TTypes<float>::ConstVec sparse_quantiles,
-      TTypes<int64>::ConstMatrix sparse_indices,
-      TTypes<int32>::ConstVec sparse_quantized_values)
-      : FeatureColumnHandler(class_id, slot_id, batch_size),
-        sparse_feature_column_(sparse_feature_column),
-        sparse_quantiles_(sparse_quantiles),
-        sparse_indices_(sparse_indices),
-        sparse_quantized_values_(sparse_quantized_values) {}
-
-  void AggregateGradientStats(
-      const std::vector<int32>& example_partition_ids,
-      const Tensor& example_first_order_gradients,
-      const Tensor& example_second_order_gradients,
-      FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
-          gradient_stats_accumulator) const override;
-
-  void GenerateFeatureSplitCandidates(
-      const LearnerConfig& learner_config, const std::vector<int32>& roots,
-      const std::vector<NodeStats>& root_stats,
-      const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
-          gradient_stats_accumulator,
-      std::vector<FeatureSplitCandidate>* split_candidates) const override;
-
- protected:
-  const int32 sparse_feature_column_;
-  TTypes<float>::ConstVec sparse_quantiles_;
-  TTypes<int64>::ConstMatrix sparse_indices_;
-  TTypes<int32>::ConstVec sparse_quantized_values_;
-};
-
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
-
-#endif  //  THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_SPARSE_QUANTIZED_FEATURE_COLUMN_HANDLER_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler_test.cc
deleted file mode 100644
index 643d936ad23850e601bc5518d69c8637011f53c0..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler_test.cc
+++ /dev/null
@@ -1,162 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.h"
-
-#include "tensorflow/core/framework/tensor_testutil.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-namespace {
-
-using boosted_trees::learner::LearnerConfig;
-
-const auto kClassId = 3;
-const auto kSlotId = 0;
-const auto kBatchSize = 4;
-const auto kFeatureColumn = 4;
-
-using FeatureStatsAccumulator =
-    FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>;
-
-class SparseQuantizedFeatureColumnHandlerTest : public ::testing::Test {
- protected:
-  // The data looks like the following:
-  // Example |  Gradients    | Partition | Sparse Quantile |
-  // i0      |  (0.2, 0.12)  | 0         | 1               |
-  // i1      |  (-0.5, 0.07) | 0         | N/A             |
-  // i2      |  (1.2, 0.2)   | 0         | 0               |
-  // i3      |  (4.0, 0.13)  | 1         | 1               |
-  SparseQuantizedFeatureColumnHandlerTest()
-      : example_first_order_gradients_(
-            test::AsTensor<float>({0.2f, -0.5f, 1.2f, 4.0f}, {4})),
-        example_second_order_gradients_(
-            test::AsTensor<float>({0.12f, 0.07f, 0.2f, 0.13f}, {4})),
-        example_partitions_({0, 0, 0, 1}),
-        sparse_quantiles_(test::AsTensor<float>({0.3f, 0.52f}, {2})),
-        sparse_indices_(test::AsTensor<int64>({0, 0, 2, 0, 3, 0}, {3, 2})),
-        sparse_quantized_values_(test::AsTensor<int32>({1, 0, 1}, {3})) {
-    // Set L2 regularization.
-    learner_config_.mutable_regularization()->set_l2(2.0f);
-    learner_config_.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS);
-    // Create handler.
-    handler_.reset(new SparseQuantizedFeatureColumnHandler(
-        kClassId, kSlotId, kBatchSize, kFeatureColumn,
-        sparse_quantiles_.vec<float>(), sparse_indices_.matrix<int64>(),
-        sparse_quantized_values_.vec<int32>()));
-  }
-
-  LearnerConfig learner_config_;
-  const Tensor example_first_order_gradients_;
-  const Tensor example_second_order_gradients_;
-  const std::vector<int32> example_partitions_;
-  const Tensor sparse_quantiles_;
-  const Tensor sparse_indices_;
-  const Tensor sparse_quantized_values_;
-  std::unique_ptr<FeatureColumnHandler> handler_;
-};
-
-TEST_F(SparseQuantizedFeatureColumnHandlerTest, AggregateGradientStats) {
-  // Create handler.
-  FeatureStatsAccumulator accumulator(1);
-  handler_->AggregateGradientStats(
-      example_partitions_, example_first_order_gradients_,
-      example_second_order_gradients_, &accumulator);
-
-  // Check stats for each partition and feature.
-  // Partition 0, Feature 0.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(1.2f, 0.2f),
-                           accumulator.GetStats(kSlotId, kClassId, 0, 0));
-  // Partition 0, Feature 1.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(0.2f, 0.12f),
-                           accumulator.GetStats(kSlotId, kClassId, 0, 1));
-  // Partition 1, Feature 0.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(0.0f, 0.0f),
-                           accumulator.GetStats(kSlotId, kClassId, 1, 0));
-  // Partition 1, Feature 1.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(4.0f, 0.13f),
-                           accumulator.GetStats(kSlotId, kClassId, 1, 1));
-}
-
-TEST_F(SparseQuantizedFeatureColumnHandlerTest,
-       GenerateFeatureSplitCandidates) {
-  // Create handler.
-  FeatureStatsAccumulator accumulator(1);
-  handler_->AggregateGradientStats(
-      example_partitions_, example_first_order_gradients_,
-      example_second_order_gradients_, &accumulator);
-
-  // Get feature split candidates for two roots 0 and 1.
-  // The root stats are derived from the per-partition total gradient stats.
-  const std::vector<int32> roots = {0, 1, 9};
-  const std::vector<NodeStats>& root_stats = {
-      NodeStats(learner_config_, GradientStats(0.9f, 0.39f)),
-      NodeStats(learner_config_, GradientStats(4.0f, 0.13f)), NodeStats(1)};
-  std::vector<FeatureSplitCandidate> split_candidates;
-  handler_->GenerateFeatureSplitCandidates(learner_config_, roots, root_stats,
-                                           accumulator, &split_candidates);
-  // Expect three candidate splits (one per root).
-  EXPECT_EQ(3, split_candidates.size());
-
-  // Verify candidate for root 0, the best split occurs when we route
-  // example i0 and i2 to the left and i1 to the right (by default direction).
-  const NodeStats expected_left_node0(learner_config_,
-                                      GradientStats(0.2f + 1.2f, 0.12f + 0.2f));
-  const NodeStats expected_right_node0(
-      learner_config_,
-      root_stats[0].gradient_stats - expected_left_node0.gradient_stats);
-  const SplitStats expected_split_stats0(learner_config_, root_stats[0],
-                                         expected_left_node0,
-                                         expected_right_node0);
-  EXPECT_SPLIT_STATS_EQ(expected_split_stats0, split_candidates[0].split_stats);
-  const auto& tree_node0 = split_candidates[0].tree_node;
-  EXPECT_EQ(boosted_trees::trees::TreeNode::kSparseFloatBinarySplitDefaultRight,
-            tree_node0.node_case());
-  const auto& split0 =
-      tree_node0.sparse_float_binary_split_default_right().split();
-  EXPECT_FLOAT_EQ(sparse_quantiles_.vec<float>()(1), split0.threshold());
-  EXPECT_EQ(kFeatureColumn, split0.feature_column());
-
-  // Verify candidate for root 1, there's only one active bucket here
-  // so zero gain is expected.
-  const NodeStats expected_left_node1(learner_config_,
-                                      root_stats[1].gradient_stats);
-  const NodeStats expected_right_node1(learner_config_, GradientStats(0, 0));
-  const SplitStats expected_split_stats1(learner_config_, root_stats[1],
-                                         expected_left_node1,
-                                         expected_right_node1);
-  EXPECT_SPLIT_STATS_EQ(expected_split_stats1, split_candidates[1].split_stats);
-  const auto& tree_node1 = split_candidates[1].tree_node;
-  EXPECT_EQ(boosted_trees::trees::TreeNode::kSparseFloatBinarySplitDefaultRight,
-            tree_node1.node_case());
-  const auto& split1 =
-      tree_node1.sparse_float_binary_split_default_right().split();
-  EXPECT_FLOAT_EQ(sparse_quantiles_.vec<float>()(1), split1.threshold());
-  EXPECT_EQ(kFeatureColumn, split1.feature_column());
-
-  // Verify there are no candidate splits for root 9.
-  const auto& tree_node2 = split_candidates[2].tree_node;
-  EXPECT_EQ(boosted_trees::trees::TreeNode::NODE_NOT_SET,
-            tree_node2.node_case());
-}
-
-}  // namespace
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc b/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc
index f8750e7191673274772fc869c198dd5fbbefbc49..0e5578693a7b90b16eada1127cad992612fb6dad 100644
--- a/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc
+++ b/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc
@@ -52,13 +52,13 @@ int DecisionTree::Traverse(const DecisionTreeConfig& config,
             example.sparse_float_features[split.feature_column()];
         // Feature id for the split when multivalent sparse float column, or 0
         // by default.
-        const int32 feature_id = split.feature_id();
+        const int32 dimension_id = split.dimension_id();
 
-        node_id =
-            !sparse_feature[feature_id].has_value() ||
-                    sparse_feature[feature_id].get_value() <= split.threshold()
-                ? split.left_id()
-                : split.right_id();
+        node_id = !sparse_feature[dimension_id].has_value() ||
+                          sparse_feature[dimension_id].get_value() <=
+                              split.threshold()
+                      ? split.left_id()
+                      : split.right_id();
         break;
       }
       case TreeNode::kSparseFloatBinarySplitDefaultRight: {
@@ -68,12 +68,12 @@ int DecisionTree::Traverse(const DecisionTreeConfig& config,
             example.sparse_float_features[split.feature_column()];
         // Feature id for the split when multivalent sparse float column, or 0
         // by default.
-        const int32 feature_id = split.feature_id();
-        node_id =
-            sparse_feature[feature_id].has_value() &&
-                    sparse_feature[feature_id].get_value() <= split.threshold()
-                ? split.left_id()
-                : split.right_id();
+        const int32 dimension_id = split.dimension_id();
+        node_id = sparse_feature[dimension_id].has_value() &&
+                          sparse_feature[dimension_id].get_value() <=
+                              split.threshold()
+                      ? split.left_id()
+                      : split.right_id();
         break;
       }
       case TreeNode::kCategoricalIdBinarySplit: {
diff --git a/tensorflow/contrib/boosted_trees/lib/trees/decision_tree_test.cc b/tensorflow/contrib/boosted_trees/lib/trees/decision_tree_test.cc
index 93924d429c19aef51b6f1d85655de3798a76e3e0..58fe8e335af28fe811c1ee785578aa58d898335b 100644
--- a/tensorflow/contrib/boosted_trees/lib/trees/decision_tree_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/trees/decision_tree_test.cc
@@ -190,7 +190,7 @@ TEST_F(DecisionTreeTest, TraverseSparseBinarySplit) {
     tree_config.add_nodes()->mutable_leaf();
 
     // Split on first column
-    split_node->set_feature_id(0);
+    split_node->set_dimension_id(0);
     split_node->set_threshold(2.0f);
 
     // Both instances have this feature value.
@@ -199,7 +199,7 @@ TEST_F(DecisionTreeTest, TraverseSparseBinarySplit) {
     EXPECT_EQ(1, DecisionTree::Traverse(tree_config, 0, *++example_it));
 
     // Split on second column
-    split_node->set_feature_id(1);
+    split_node->set_dimension_id(1);
     split_node->set_threshold(5.0f);
 
     // First instance does not have it (default right), second does have it.
@@ -208,7 +208,7 @@ TEST_F(DecisionTreeTest, TraverseSparseBinarySplit) {
     EXPECT_EQ(1, DecisionTree::Traverse(tree_config, 0, *++example_it));
 
     // Split on third column
-    split_node->set_feature_id(2);
+    split_node->set_dimension_id(2);
     split_node->set_threshold(3.0f);
     example_it = example_iterable.begin();
 
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h b/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
index 7a550d6f7328765d8815a947885e47fa0b0a8f8b..badc629a118f768d5aa25ef1b94b8190e6910c7f 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
@@ -56,7 +56,7 @@ class BatchFeatures {
     *num_sparse_int_features = sparse_int_feature_columns_.size();
     if (*num_dense_float_features == 0 && *num_sparse_float_features == 0 &&
         *num_sparse_int_features == 0) {
-      return errors::FailedPrecondition("Not intialized yet.");
+      return errors::FailedPrecondition("Not initialized yet.");
     }
     return Status::OK();
   }
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.cc b/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.cc
index 0d46565a1962b88cbb267f3d6043610758790578..ccee9530b6897924453461c13b1238402c0f6cfa 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.cc
@@ -97,7 +97,7 @@ class IndicesRowIterator
   }
 
   bool operator<(const IndicesRowIterator& other) const {
-	return (row_idx_ < other.row_idx_);
+    return (row_idx_ < other.row_idx_);
   }
 
   bool operator==(const IndicesRowIterator& other) const {
diff --git a/tensorflow/contrib/boosted_trees/proto/tree_config.proto b/tensorflow/contrib/boosted_trees/proto/tree_config.proto
index f14abf45a517ad7c4c6d7bb1ab88b7a1d47d6fb6..fc570c1083d01a65760a456c109dad93afd9f62a 100644
--- a/tensorflow/contrib/boosted_trees/proto/tree_config.proto
+++ b/tensorflow/contrib/boosted_trees/proto/tree_config.proto
@@ -53,9 +53,9 @@ message DenseFloatBinarySplit {
   // Float feature column and split threshold describing
   // the rule feature <= threshold.
   int32 feature_column = 1;
-  // If feature column is multivalent, this holds the index of the feature for
-  // the split. Defaults to 0.
-  int32 feature_id = 5;
+  // If feature column is multivalent, this holds the index of the dimensiong
+  // for the split. Defaults to 0.
+  int32 dimension_id = 5;
   float threshold = 2;
 
   // Node children indexing into a contiguous
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py
index 9ada844601afbe7f0a6993444c7c4ed0e16a01ca..c1acf351603dd80c2d14c7ee0a5b4c89706bc1bf 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py
@@ -93,7 +93,7 @@ def _set_float_split(split, feat_col, thresh, l_id, r_id, feature_dim_id=None):
   split.left_id = l_id
   split.right_id = r_id
   if feature_dim_id is not None:
-    split.feature_id = feature_dim_id
+    split.dimension_id = feature_dim_id
 
 
 def _set_categorical_id_split(split, feat_col, feat_id, l_id, r_id):
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
index 2a72961504b7e8a256afd8f77dce79ba756230f0..888d5c57ed33446c8b6f18d2d1e393647613d132 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
@@ -48,15 +48,16 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
   def testBasicQuantileBuckets(self):
     """Sets up the quantile summary op test as follows.
 
-    Create a batch of 6 examples having a dense and sparse features.
+    Create a batch of 6 examples having a dense and sparse features. SparseM is
+    a sparse multi-dimensional (multivalent) feature.
     The data looks like this
-    | Instance | instance weights | Dense 0  | Sparse 0
-    | 0        |     10           |   1      |
-    | 1        |     1            |   2      |    2
-    | 2        |     1            |   3      |    3
-    | 3        |     1            |   4      |    4
-    | 4        |     1            |   4      |    5
-    | 5        |     1            |   5      |    6
+    | Instance | instance weights | Dense 0  | Sparse 0 | SparseM
+    | 0        |     10           |   1      |          |   |   |
+    | 1        |     1            |   2      |    2     | 2 |   |
+    | 2        |     1            |   3      |    3     | 3 |   |
+    | 3        |     1            |   4      |    4     |   | 4 |
+    | 4        |     1            |   4      |    5     |   | 5 |
+    | 5        |     1            |   5      |    6     |   | 6 |
     """
 
     dense_float_tensor_0 = constant_op.constant(
@@ -66,20 +67,29 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
     sparse_values_0 = constant_op.constant(
         [2, 3, 4, 5, 6], dtype=dtypes.float32)
     sparse_shape_0 = constant_op.constant([6, 1], dtype=dtypes.int64)
+    # Multi-dimensional feature that should have the same quantiles as Sparse 0.
+    sparse_indices_m = constant_op.constant(
+        [[1, 1], [2, 0], [3, 1], [4, 1], [5, 1]], dtype=dtypes.int64)
+    sparse_values_m = constant_op.constant(
+        [2, 3, 4, 5, 6], dtype=dtypes.float32)
+    sparse_shape_m = constant_op.constant([6, 2], dtype=dtypes.int64)
+
     example_weights = constant_op.constant(
         [10, 1, 1, 1, 1, 1], dtype=dtypes.float32)
 
     with self.test_session():
       config = self._gen_config(0.33, 3)
       dense_buckets, sparse_buckets = quantile_ops.quantile_buckets(
-          [dense_float_tensor_0], [sparse_indices_0], [sparse_values_0],
-          [sparse_shape_0],
+          [dense_float_tensor_0], [sparse_indices_0, sparse_indices_m],
+          [sparse_values_0, sparse_values_m], [sparse_shape_0, sparse_shape_m],
           example_weights=example_weights,
           dense_config=[config],
-          sparse_config=[config])
+          sparse_config=[config, config])
 
       self.assertAllEqual([1, 3, 5], dense_buckets[0].eval())
       self.assertAllEqual([2, 4, 6.], sparse_buckets[0].eval())
+      # Multidimensional sparse.
+      self.assertAllEqual([2, 4, 6.], sparse_buckets[1].eval())
 
   def testStreamingQuantileBucketsWithVaryingBatch(self):
     """Sets up the quantile summary op test as follows.
@@ -214,10 +224,10 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
       resources.initialize_resources(resources.shared_resources()).run()
 
       sparse_indices_0 = constant_op.constant(
-          [[1, 0], [2, 0], [3, 0], [4, 0], [5, 0]], dtype=dtypes.int64)
+          [[1, 0], [2, 1], [3, 0], [4, 2], [5, 0]], dtype=dtypes.int64)
       sparse_values_0 = constant_op.constant(
           [2.0, 3.0, 4.0, 5.0, 6.0], dtype=dtypes.float32)
-      sparse_shape_0 = constant_op.constant([6, 1], dtype=dtypes.int64)
+      sparse_shape_0 = constant_op.constant([6, 3], dtype=dtypes.int64)
       example_weights = constant_op.constant(
           [10, 1, 1, 1, 1, 1], dtype=dtypes.float32, shape=[6, 1])
       update = accumulator.add_summary(
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py
index 7c2e3a3b208c696731ef12be5e9cbab66dc99355..28834ef55bf8e1f32cc8f2380a4be3bf3824d8e1 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py
@@ -240,7 +240,7 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
 
     self.assertEqual(0, split_node.split.feature_column)
     # Sparse is one dimensional.
-    self.assertEqual(0, split_node.split.feature_id)
+    self.assertEqual(0, split_node.split.dimension_id)
 
     self.assertAllClose(0.52, split_node.split.threshold)
 
@@ -263,7 +263,7 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
 
     self.assertEqual(0, split_node.split.feature_column)
     # Sparse is one dimensional.
-    self.assertEqual(0, split_node.split.feature_id)
+    self.assertEqual(0, split_node.split.dimension_id)
 
     self.assertAllClose(0.52, split_node.split.threshold)
 
@@ -373,7 +373,7 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
 
     self.assertEqual(0, split_node.split.feature_column)
     # Split happened on second dimension.
-    self.assertEqual(1, split_node.split.feature_id)
+    self.assertEqual(1, split_node.split.dimension_id)
 
     self.assertAllClose(0.58, split_node.split.threshold)
 
@@ -395,7 +395,7 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
     self.assertAllClose([expected_right_weight], right_child.value)
 
     self.assertEqual(0, split_node.split.feature_column)
-    self.assertEqual(2, split_node.split.feature_id)
+    self.assertEqual(2, split_node.split.dimension_id)
 
     self.assertAllClose(0.6, split_node.split.threshold)
 
diff --git a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
index 7e8e15e7d8c89d1adaa472b1da7e8bb3c73ca17e..294e04002adac62fc123a3242a05a1b36f422433 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
@@ -45,6 +45,7 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
                init_stamp_token,
                epsilon,
                num_quantiles,
+               max_elements=None,
                name=None,
                container=None):
     """Creates a QuantileAccumulator object.
@@ -53,6 +54,7 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
       init_stamp_token: The initial value for the stamp token.
       epsilon: Error bound on the quantile computation.
       num_quantiles: Number of quantiles to produce from the final summary.
+      max_elements: Maximum number of elements added to the accumulator.
       name: the name to save the accumulator under.
       container: An optional `string`. Defaults to `""`
     """
@@ -67,6 +69,7 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
           self._quantile_accumulator_handle,
           init_stamp_token,
           epsilon=epsilon,
+          max_elements=max_elements,
           num_quantiles=num_quantiles)
       is_initialized_op = gen_quantile_ops.quantile_accumulator_is_initialized(
           self._quantile_accumulator_handle)
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
index 6094dae6b59d8b05bb12a28cf167a536e6825287..b95956dae2a62b28643cd31815c5f5650eca337b 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
@@ -322,9 +322,11 @@ class GradientBoostedDecisionTreeModel(object):
     self._feature_columns = feature_columns
     self._learner_config_serialized = learner_config.SerializeToString()
     self._attempted_trees = variables.Variable(
-        initial_value=array_ops.zeros([], dtypes.int64), trainable=False)
+        initial_value=array_ops.zeros([], dtypes.int64), trainable=False,
+        name="attempted_trees")
     self._finalized_trees = variables.Variable(
-        initial_value=array_ops.zeros([], dtypes.int64), trainable=False)
+        initial_value=array_ops.zeros([], dtypes.int64), trainable=False,
+        name="finalized_trees")
     if not features:
       raise ValueError("Features dictionary must be specified.")
     (fc_names, dense_floats, sparse_float_indices, sparse_float_values,
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
index 16e24d97ddee0751e0b808b89080074c1b4baba7..dba51d4f527792d2a8dedc693f74c07119fd231d 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
@@ -912,8 +912,10 @@ class GbdtTest(test_util.TensorFlowTestCase):
       self.assertEqual(1,
                        len(output.trees[0].nodes[2].leaf.sparse_vector.index))
       self.assertEqual(3, output.trees[0].nodes[2].leaf.sparse_vector.index[0])
-      self.assertAlmostEqual(
-          0.893284678459, output.trees[0].nodes[2].leaf.sparse_vector.value[0])
+      self.assertAllClose(
+          0.893284678459,
+          output.trees[0].nodes[2].leaf.sparse_vector.value[0],
+          atol=1e-4, rtol=1e-4)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/boosted_trees/python/utils/losses_test.py b/tensorflow/contrib/boosted_trees/python/utils/losses_test.py
index dde16426863b60e9df64da1ee6b36caec273bfd6..ccb8509c0347f9c9b6f1e8f4f620230aac9a6c2d 100644
--- a/tensorflow/contrib/boosted_trees/python/utils/losses_test.py
+++ b/tensorflow/contrib/boosted_trees/python/utils/losses_test.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math
-
 import numpy as np
 
 from tensorflow.contrib.boosted_trees.python.utils import losses
@@ -60,35 +58,27 @@ class LossesTest(test_util.TensorFlowTestCase):
       neg_loss = loss_for_negatives.eval()
       # For positive labels, points <= 0.3 get max loss of e.
       # For negative labels, these points have minimum loss of 1/e.
-      for i in range(2):
-        self.assertAlmostEqual(math.exp(1), pos_loss[i], places=4)
-        self.assertAlmostEqual(math.exp(-1), neg_loss[i], places=4)
+      self.assertAllClose(np.exp(np.ones([2, 1])), pos_loss[:2], atol=1e-4)
+      self.assertAllClose(np.exp(-np.ones([2, 1])), neg_loss[:2], atol=1e-4)
 
       # For positive lables, p oints with predictions 0.7 and larger get minimum
       # loss value of 1/e. For negative labels, these points are wrongly
       # classified and get loss e.
-      for i in range(6, 10):
-        self.assertAlmostEqual(math.exp(-1), pos_loss[i], places=4)
-        self.assertAlmostEqual(math.exp(1), neg_loss[i], places=4)
+      self.assertAllClose(np.exp(-np.ones([4, 1])), pos_loss[6:10], atol=1e-4)
+      self.assertAllClose(np.exp(np.ones([4, 1])), neg_loss[6:10], atol=1e-4)
 
       # Points in between 0.5-eps, 0..5+eps get loss exp(-label_m*y), where
       # y = 1/eps *x -1/(2eps), where x is the probability and label_m is either
       # 1 or -1 (for label of 0).
-      for i in range(2, 6):
-        self.assertAlmostEqual(
-            math.exp(-1.0 * (predictions_probs[i] * 1.0 / eps - 0.5 / eps)),
-            pos_loss[i],
-            places=4)
-        self.assertAlmostEqual(
-            math.exp(1.0 * (predictions_probs[i] * 1.0 / eps - 0.5 / eps)),
-            neg_loss[i],
-            places=4)
+      self.assertAllClose(
+          np.exp(-(predictions_probs[2:6] * 1.0 / eps - 0.5 / eps)),
+          pos_loss[2:6], atol=1e-4)
+      self.assertAllClose(
+          np.exp(predictions_probs[2:6] * 1.0 / eps - 0.5 / eps),
+          neg_loss[2:6], atol=1e-4)
 
   def test_per_example_squared_loss(self):
 
-    def _squared_loss(p, y):
-      return np.mean(1.0 * (p - y) * (p - y))
-
     labels = np.array([[0.123], [224.2], [-3], [2], [.3]], dtype=np.float32)
     weights = array_ops.ones([5, 1], dtypes.float32)
     predictions = np.array(
@@ -99,9 +89,8 @@ class LossesTest(test_util.TensorFlowTestCase):
                                                        predictions)
 
       loss = loss_tensor.eval()
-      for i in range(5):
-        self.assertAlmostEqual(
-            _squared_loss(labels[i], predictions[i]), loss[i], places=4)
+      self.assertAllClose(
+          np.square(labels[:5] - predictions[:5]), loss[:5], atol=1e-4)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/cloud/BUILD b/tensorflow/contrib/cloud/BUILD
index aa8f5ed12bc6f779e3c1a923b9225ec283189747..fe8bd072afd43a64fa62a65bd8900b5a98dbe761 100644
--- a/tensorflow/contrib/cloud/BUILD
+++ b/tensorflow/contrib/cloud/BUILD
@@ -60,9 +60,7 @@ tf_py_test(
     size = "small",
     srcs = ["python/ops/bigquery_reader_ops_test.py"],
     additional_deps = [
-        ":bigquery_reader_ops_op_lib",
         ":cloud_py",
-        "//tensorflow/contrib/cloud/kernels:bigquery_reader_ops",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc
index b31b882fa19a7eaad304d6d423961234f9affef4..e9b79a066def566096d6c3f3745974423e3371d1 100644
--- a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc
+++ b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc
@@ -421,7 +421,7 @@ TEST_F(BigQueryTableAccessorTest, MultiplePagesTest) {
   TF_EXPECT_OK(accessor_->ReadRow(&row_id, &example));
   EXPECT_EQ(3, row_id);
   EXPECT_TRUE(accessor_->Done());
-  
+
   Example expected_example;
   ASSERT_TRUE(protobuf::TextFormat::ParseFromString(kTestExampleProtoWithNulls,
                                                     &expected_example));
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
index f0144e9faa26801b6491b242b04fda8905f15306..c74da9cabd6816bc9c7891e32937534cff2d677d 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
@@ -80,13 +80,9 @@ class TPUClusterResolver(ClusterResolver):
         raise ImportError('googleapiclient must be installed before using the '
                           'TPU cluster resolver')
 
-      # TODO(b/67375680): Remove custom URL once TPU APIs are finalized
       self._service = discovery.build(
-          'tpu',
-          'v1',
-          credentials=self._credentials,
-          discoveryServiceUrl='https://storage.googleapis.com'
-                              '/tpu-api-definition/v1alpha1.json')
+          'tpu', 'v1alpha1',
+          credentials=self._credentials)
     else:
       self._service = service
 
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 77a3fc0c8322117f50265e56952b68480583de02..8d023cc81dd73751f0b5690f3649ded3fc038155 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -18,7 +18,6 @@ cmake_policy(SET CMP0022 NEW)
 
 # Options
 option(tensorflow_VERBOSE "Enable for verbose output" OFF)
-option(tensorflow_ENABLE_GPU "Enable GPU support" OFF)
 option(tensorflow_ENABLE_SSL_SUPPORT "Enable boringssl support" OFF)
 option(tensorflow_ENABLE_GRPC_SUPPORT "Enable gRPC support" ON)
 option(tensorflow_ENABLE_HDFS_SUPPORT "Enable HDFS support" OFF)
@@ -34,6 +33,13 @@ option(tensorflow_BUILD_SHARED_LIB "Build TensorFlow as a shared library" OFF)
 option(tensorflow_OPTIMIZE_FOR_NATIVE_ARCH "Enable compiler optimizations for the native processor architecture (if available)" ON)
 option(tensorflow_WIN_CPU_SIMD_OPTIONS "Enables CPU SIMD instructions")
 option(tensorflow_ENABLE_SNAPPY_SUPPORT "Enable SNAPPY compression support" ON)
+option(tensorflow_DISABLE_EIGEN_FORCEINLINE "Disable forceinline, to speed up build on windows." OFF)
+
+# GPU, CUDA and cuDNN options
+option(tensorflow_ENABLE_GPU "Enable GPU support" OFF)
+set(tensorflow_CUDA_VERSION "9.0" CACHE STRING "CUDA version to build against")
+set(tensorflow_CUDNN_VERSION "7" CACHE STRING "cuDNN version to build against")
+
 if(HAIKU)
 	option(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE "Enable PIE support" OFF)
 else()
@@ -53,7 +59,15 @@ if (NOT WIN32)
     set(tensorflow_CUDNN_INCLUDE /usr/include)
   endif (NOT tensorflow_CUDNN_INCLUDE)
   option(tensorflow_PATH_CUDNN_STATIC_LIB "Override PATH_STATIC_LIB for libcudnn_static.a" ${tensorflow_PATH_STATIC_LIB})
+  if (NOT tensorflow_PATH_CUDNN_STATIC_LIB)
+    # option's default value is OFF. Fill it with real default values
+    set (tensorflow_PATH_CUDNN_STATIC_LIB ${tensorflow_PATH_STATIC_LIB})
+  endif (NOT tensorflow_PATH_CUDNN_STATIC_LIB)
   option(tensorflow_PATH_NCCL_STATIC_LIB "Override PATH_STATIC_LIB for libnccl_static.a" ${tensorflow_PATH_STATIC_LIB})
+  if (NOT tensorflow_PATH_NCCL_STATIC_LIB)
+    # option's default value is OFF. Fill it with real default values
+    set (tensorflow_PATH_NCCL_STATIC_LIB ${tensorflow_PATH_STATIC_LIB})
+  endif (NOT tensorflow_PATH_NCCL_STATIC_LIB)
   option(tensorflow_CUDA_LIBRARY_PATH "Designate the default CUDA library paths" /usr/local/cuda/lib64)
   if (NOT tensorflow_CUDA_LIBRARY_PATH)
     # option's default value is OFF. Fill it with real default values
@@ -92,6 +106,10 @@ else()
 	set(CMAKE_POSITION_INDEPENDENT_CODE OFF)
 endif()
 
+if (tensorflow_DISABLE_EIGEN_FORCEINLINE)
+  add_definitions(-DEIGEN_STRONG_INLINE=inline)
+endif()
+
 add_definitions(-DEIGEN_AVOID_STL_ARRAY)
 if(WIN32)
   add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC)
@@ -262,7 +280,7 @@ if (tensorflow_ENABLE_GPU)
     list(APPEND CMAKE_LIBRARY_PATH "${tensorflow_CUDA_LIBRARY_PATH}/stubs")
   endif (NOT WIN32)
 
-  find_package(CUDA 8.0 REQUIRED)
+  find_package(CUDA ${tensorflow_CUDA_VERSION} REQUIRED)
 
   # by default we assume compute cabability 3.5 and 5.2. If you change this change it in
   # CUDA_NVCC_FLAGS and cuda_config.h below
@@ -316,13 +334,16 @@ if (tensorflow_ENABLE_GPU)
       ${CUDA_curand_LIBRARY} ${CUDA_cupti_LIBRARY} ${CUDA_cusolver_LIBRARY} ${cudnn_STATIC_LIBRARY} ${culibos_STATIC_LIBRARY} ${nccl_STATIC_LIBRARY})
   endif (WIN32)
 
+  # Remove "." from CUDA version variable.
+  string(REPLACE "." "" short_CUDA_VER ${tensorflow_CUDA_VERSION})
+
   # create cuda_config.h
   FILE(WRITE ${tensorflow_source_dir}/third_party/gpus/cuda/cuda_config.h
     "#ifndef CUDA_CUDA_CONFIG_H_\n"
     "#define CUDA_CUDA_CONFIG_H_\n"
     "#define TF_CUDA_CAPABILITIES CudaVersion(\"3.0\"),CudaVersion(\"3.5\"),CudaVersion(\"5.2\")\n"
-    "#define TF_CUDA_VERSION \"64_80\"\n"
-    "#define TF_CUDNN_VERSION \"64_6\"\n"
+    "#define TF_CUDA_VERSION \"64_${short_CUDA_VER}\"\n"
+    "#define TF_CUDNN_VERSION \"64_${tensorflow_CUDNN_VERSION}\"\n"
     "#define TF_CUDA_TOOLKIT_PATH \"${CUDA_TOOLKIT_ROOT_DIR}\"\n"
     "#endif  // CUDA_CUDA_CONFIG_H_\n"
   )
@@ -360,15 +381,15 @@ if (tensorflow_ENABLE_GPU)
   if(WIN32)
     set(tensorflow_BUILD_INFO_FLAGS --build_config cuda --key_value
       msvcp_dll_name=msvcp140.dll
-      cudart_dll_name=cudart64_80.dll
-      cuda_version_number=8.0
+      cudart_dll_name=cudart64_${short_CUDA_VER}.dll
+      cuda_version_number=${tensorflow_CUDA_VERSION}
       nvcuda_dll_name=nvcuda.dll
-      cudnn_dll_name=cudnn64_6.dll
-      cudnn_version_number=6)
+      cudnn_dll_name=cudnn64_${tensorflow_CUDNN_VERSION}.dll
+      cudnn_version_number=${tensorflow_CUDNN_VERSION})
   else(WIN32)
     set(tensorflow_BUILD_INFO_FLAGS --build_config cuda --key_value
-      cuda_version_number=8.0
-      cudnn_version_number=6)
+	    cuda_version_number=${tensorflow_CUDA_VERSION}
+	    cudnn_version_number=${tensorflow_CUDNN_VERSION})
   endif(WIN32)
 else(tensorflow_ENABLE_GPU)
   set(tensorflow_BUILD_INFO_FLAGS --build_config cpu --key_value
@@ -383,11 +404,7 @@ endif()
 
 # Let's get to work!
 include(tf_core_framework.cmake)
-# NOTE: Disabled until issue #3996 is fixed.
-# include(tf_stream_executor.cmake)
-if (tensorflow_ENABLE_GPU)
-    include(tf_stream_executor.cmake)
-endif()
+include(tf_stream_executor.cmake)
 
 include(tf_core_cpu.cmake)
 include(tf_core_ops.cmake)
diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index 4ddfec5960d2b759bacb376202cd8dab6ef2b024..4be733a2809f366a214fa2bb853bccffb10ecaba 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -19,23 +19,6 @@ for instructions on how to install a pre-built TensorFlow package on Windows.
 ### Current known limitations
 * It is not possible to load a custom Op library.
 * GCS file system is not supported.
-* The following Ops are not currently implemented:
- - Dequantize
- - QuantizeAndDequantize
- - QuantizedAvgPool
- - QuantizedBatchNomWithGlobalNormalization
- - QuantizedBiasAdd
- - QuantizedConcat
- - QuantizedConv2D
- - QuantizedMatmul
- - QuantizedMaxPoo
- - QuantizeDownAndShrinkRange
- - QuantizedRelu
- - QuantizedRelu6
- - QuantizedReshape
- - QuantizeV2
- - RequantizationRange
- - Requantize
 
 ## Building with CMake
 
diff --git a/tensorflow/contrib/cmake/external/gemmlowp.cmake b/tensorflow/contrib/cmake/external/gemmlowp.cmake
index 3b146657bfc9bdd54db14839195af45972e67aff..a235442dc5c0a07e249653381436eeae81575883 100644
--- a/tensorflow/contrib/cmake/external/gemmlowp.cmake
+++ b/tensorflow/contrib/cmake/external/gemmlowp.cmake
@@ -14,8 +14,8 @@
 # ==============================================================================
 include (ExternalProject)
 
-set(gemmlowp_URL https://mirror.bazel.build/github.com/google/gemmlowp/archive/010bb3e71a26ca1d0884a167081d092b43563996.zip)
-set(gemmlowp_HASH SHA256=dd2557072bde12141419cb8320a9c25e6ec41a8ae53c2ac78c076a347bb46d9d)
+set(gemmlowp_URL https://github.com/google/gemmlowp/archive/6a2a90822e8546fc2bfa7044de0faf1c1cb4862f.zip)
+set(gemmlowp_HASH SHA256=3447948d219f3270383766bbe08942888c0eb4e0ca6663c0e0548502ec5bb77d)
 set(gemmlowp_BUILD ${CMAKE_CURRENT_BINARY_DIR}/gemmlowp/src/gemmlowp)
 set(gemmlowp_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/gemmlowp/src/gemmlowp)
 
diff --git a/tensorflow/contrib/cmake/external/nsync.cmake b/tensorflow/contrib/cmake/external/nsync.cmake
index 155c91cb97dbe5ef33c318efb5544a9fa22166c7..05080060479b6240edb8ab9f65160b3dd182feb9 100644
--- a/tensorflow/contrib/cmake/external/nsync.cmake
+++ b/tensorflow/contrib/cmake/external/nsync.cmake
@@ -16,7 +16,7 @@ include (ExternalProject)
 
 set(nsync_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/nsync/public)
 set(nsync_URL https://github.com/google/nsync)
-set(nsync_TAG 93815892dddafe9146a5f7e7042281d59d0f4323)
+set(nsync_TAG 8502189abfa44c249c01c2cad64e6ed660a9a668)
 set(nsync_BUILD ${CMAKE_CURRENT_BINARY_DIR}/nsync/src/nsync)
 set(nsync_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/nsync/install)
 
diff --git a/tensorflow/contrib/cmake/external/re2.cmake b/tensorflow/contrib/cmake/external/re2.cmake
index b56f4b089813247f3ab1c751538ba4b05cacb5b6..d10f5959f71dd350e6e2bcb81be8882b203fb231 100644
--- a/tensorflow/contrib/cmake/external/re2.cmake
+++ b/tensorflow/contrib/cmake/external/re2.cmake
@@ -45,4 +45,5 @@ ExternalProject_Add(re2
 		endif()
         -DCMAKE_BUILD_TYPE:STRING=Release
         -DCMAKE_INSTALL_PREFIX:STRING=${re2_INSTALL}
+        -DRE2_BUILD_TESTING:BOOL=OFF
 )
diff --git a/tensorflow/contrib/cmake/patches/nsync/CMakeLists.txt b/tensorflow/contrib/cmake/patches/nsync/CMakeLists.txt
index 594c2492d4fd68b50c8493321a2c4dcc2d41917e..aaae18a313dd082b428654091c9411600c981ec9 100644
--- a/tensorflow/contrib/cmake/patches/nsync/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/patches/nsync/CMakeLists.txt
@@ -158,12 +158,21 @@ if (NOT "${NSYNC_LANGUAGE}X" STREQUAL "c++11X")
   elseif ("${CMAKE_SYSTEM_NAME}X" STREQUAL "NetBSDX")
     include_directories ("${PROJECT_SOURCE_DIR}/platform/netbsd")
     set (NSYNC_POSIX ON)
+    set (NSYNC_OS_EXTRA_SRC
+      "platform/posix/src/nsync_semaphore_mutex.c"
+    )
   elseif ("${CMAKE_SYSTEM_NAME}X" STREQUAL "FreeBSDX")
     include_directories ("${PROJECT_SOURCE_DIR}/platform/freebsd")
     set (NSYNC_POSIX ON)
+    set (NSYNC_OS_EXTRA_SRC
+      "platform/posix/src/nsync_semaphore_mutex.c"
+    )
   elseif ("${CMAKE_SYSTEM_NAME}X" STREQUAL "OpenBSDX")
     include_directories ("${PROJECT_SOURCE_DIR}/platform/openbsd")
     set (NSYNC_POSIX ON)
+    set (NSYNC_OS_EXTRA_SRC
+      "platform/posix/src/nsync_semaphore_mutex.c"
+    )
   endif ()
 endif ()
 
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
new file mode 100644
index 0000000000000000000000000000000000000000..92edce77dff699e75d1873ad0f56c6c489fbc571
--- /dev/null
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -0,0 +1,453 @@
+tensorflow
+tensorflow/core
+tensorflow/core/example
+tensorflow/core/framework
+tensorflow/core/lib
+tensorflow/core/lib/core
+tensorflow/core/protobuf
+tensorflow/core/util
+tensorflow/examples
+tensorflow/examples/tutorials
+tensorflow/examples/tutorials/mnist
+tensorflow/python
+tensorflow/python/client
+tensorflow/python/data
+tensorflow/python/data/ops
+tensorflow/python/data/util
+tensorflow/python/debug
+tensorflow/python/debug/cli
+tensorflow/python/debug/examples
+tensorflow/python/debug/lib
+tensorflow/python/debug/wrappers
+tensorflow/python/eager
+tensorflow/python/estimator
+tensorflow/python/estimator/canned
+tensorflow/python/estimator/export
+tensorflow/python/estimator/inputs
+tensorflow/python/estimator/inputs/queues
+tensorflow/python/feature_column
+tensorflow/python/framework
+tensorflow/python/grappler
+tensorflow/python/keras
+tensorflow/python/keras/activations
+tensorflow/python/keras/applications
+tensorflow/python/keras/applications/inception_resnet_v2
+tensorflow/python/keras/applications/inception_v3
+tensorflow/python/keras/applications/mobilenet
+tensorflow/python/keras/applications/resnet50
+tensorflow/python/keras/applications/vgg16
+tensorflow/python/keras/applications/vgg19
+tensorflow/python/keras/applications/xception
+tensorflow/python/keras/backend
+tensorflow/python/keras/callbacks
+tensorflow/python/keras/constraints
+tensorflow/python/keras/datasets
+tensorflow/python/keras/datasets/boston_housing
+tensorflow/python/keras/datasets/cifar10
+tensorflow/python/keras/datasets/cifar100
+tensorflow/python/keras/datasets/fashion_mnist
+tensorflow/python/keras/datasets/imdb
+tensorflow/python/keras/datasets/mnist
+tensorflow/python/keras/datasets/reuters
+tensorflow/python/keras/estimator
+tensorflow/python/keras/initializers
+tensorflow/python/keras/layers
+tensorflow/python/keras/losses
+tensorflow/python/keras/metrics
+tensorflow/python/keras/models
+tensorflow/python/keras/optimizers
+tensorflow/python/keras/preprocessing
+tensorflow/python/keras/preprocessing/image
+tensorflow/python/keras/preprocessing/sequence
+tensorflow/python/keras/preprocessing/text
+tensorflow/python/keras/regularizers
+tensorflow/python/keras/utils
+tensorflow/python/keras/wrappers
+tensorflow/python/keras/wrappers/scikit_learn
+tensorflow/python/keras/_impl
+tensorflow/python/keras/_impl/keras
+tensorflow/python/keras/_impl/keras/applications
+tensorflow/python/keras/_impl/keras/datasets
+tensorflow/python/keras/_impl/keras/engine
+tensorflow/python/keras/_impl/keras/layers
+tensorflow/python/keras/_impl/keras/preprocessing
+tensorflow/python/keras/_impl/keras/utils
+tensorflow/python/keras/_impl/keras/wrappers
+tensorflow/python/kernel_tests
+tensorflow/python/kernel_tests/distributions
+tensorflow/python/kernel_tests/linalg
+tensorflow/python/kernel_tests/random
+tensorflow/python/layers
+tensorflow/python/lib
+tensorflow/python/lib/core
+tensorflow/python/lib/io
+tensorflow/python/ops
+tensorflow/python/ops/distributions
+tensorflow/python/ops/linalg
+tensorflow/python/ops/losses
+tensorflow/python/platform
+tensorflow/python/platform/default
+tensorflow/python/platform/summary
+tensorflow/python/profiler/
+tensorflow/python/profiler/internal
+tensorflow/python/saved_model
+tensorflow/python/summary
+tensorflow/python/summary/writer
+tensorflow/python/tools
+tensorflow/python/training
+tensorflow/python/user_ops
+tensorflow/python/util
+tensorflow/python/util/protobuf
+tensorflow/tools
+tensorflow/tools/graph_transforms
+tensorflow/contrib
+tensorflow/contrib/all_reduce
+tensorflow/contrib/all_reduce/python
+tensorflow/contrib/android
+tensorflow/contrib/android/java
+tensorflow/contrib/android/java/org
+tensorflow/contrib/android/java/org/tensorflow
+tensorflow/contrib/android/java/org/tensorflow/contrib
+tensorflow/contrib/android/java/org/tensorflow/contrib/android
+tensorflow/contrib/android/jni
+tensorflow/contrib/batching
+tensorflow/contrib/batching/kernels
+tensorflow/contrib/batching/python
+tensorflow/contrib/batching/python/ops
+tensorflow/contrib/bayesflow
+tensorflow/contrib/bayesflow/examples
+tensorflow/contrib/bayesflow/examples/reinforce_simple
+tensorflow/contrib/bayesflow/python
+tensorflow/contrib/bayesflow/python/ops
+tensorflow/contrib/boosted_trees
+tensorflow/contrib/boosted_trees/estimator_batch
+tensorflow/contrib/boosted_trees/kernels
+tensorflow/contrib/boosted_trees/ops
+tensorflow/contrib/boosted_trees/proto
+tensorflow/contrib/boosted_trees/python
+tensorflow/contrib/boosted_trees/python/ops
+tensorflow/contrib/cloud
+tensorflow/contrib/cloud/kernels
+tensorflow/contrib/cloud/ops
+tensorflow/contrib/cloud/python
+tensorflow/contrib/cloud/python/ops
+tensorflow/contrib/cluster_resolver
+tensorflow/contrib/cluster_resolver/python
+tensorflow/contrib/cluster_resolver/python/training
+tensorflow/contrib/compiler
+tensorflow/contrib/copy_graph
+tensorflow/contrib/copy_graph/python
+tensorflow/contrib/copy_graph/python/util
+tensorflow/contrib/crf
+tensorflow/contrib/crf/python
+tensorflow/contrib/crf/python/ops
+tensorflow/contrib/cudnn_rnn
+tensorflow/contrib/cudnn_rnn/kernels
+tensorflow/contrib/cudnn_rnn/ops
+tensorflow/contrib/cudnn_rnn/python
+tensorflow/contrib/cudnn_rnn/python/layers
+tensorflow/contrib/cudnn_rnn/python/ops
+tensorflow/contrib/data
+tensorflow/contrib/data/kernels
+tensorflow/contrib/data/python
+tensorflow/contrib/data/python/kernel_tests
+tensorflow/contrib/data/python/ops
+tensorflow/contrib/decision_trees
+tensorflow/contrib/decision_trees/proto
+tensorflow/contrib/deprecated
+tensorflow/contrib/distributions
+tensorflow/contrib/distributions/python
+tensorflow/contrib/distributions/python/ops
+tensorflow/contrib/distributions/python/ops/bijectors
+tensorflow/contrib/eager
+tensorflow/contrib/eager/python
+tensorflow/contrib/estimator
+tensorflow/contrib/estimator/python
+tensorflow/contrib/estimator/python/estimator
+tensorflow/contrib/factorization
+tensorflow/contrib/factorization/examples
+tensorflow/contrib/factorization/kernels
+tensorflow/contrib/factorization/ops
+tensorflow/contrib/factorization/python
+tensorflow/contrib/factorization/python/ops
+tensorflow/contrib/ffmpeg
+tensorflow/contrib/ffmpeg/default
+tensorflow/contrib/framework
+tensorflow/contrib/framework/kernels
+tensorflow/contrib/framework/ops
+tensorflow/contrib/framework/python
+tensorflow/contrib/framework/python/framework
+tensorflow/contrib/framework/python/ops
+tensorflow/contrib/fused_conv
+tensorflow/contrib/fused_conv/kernels
+tensorflow/contrib/fused_conv/python
+tensorflow/contrib/fused_conv/python/ops
+tensorflow/contrib/gan
+tensorflow/contrib/gan/python
+tensorflow/contrib/gan/python/estimator
+tensorflow/contrib/gan/python/estimator/python
+tensorflow/contrib/gan/python/eval
+tensorflow/contrib/gan/python/eval/python
+tensorflow/contrib/gan/python/features
+tensorflow/contrib/gan/python/features/python
+tensorflow/contrib/gan/python/losses
+tensorflow/contrib/gan/python/losses/python
+tensorflow/contrib/graph_editor
+tensorflow/contrib/graph_editor/examples
+tensorflow/contrib/grid_rnn
+tensorflow/contrib/grid_rnn/python
+tensorflow/contrib/grid_rnn/python/ops
+tensorflow/contrib/hooks
+tensorflow/contrib/hooks/python
+tensorflow/contrib/image
+tensorflow/contrib/image/kernels
+tensorflow/contrib/image/ops
+tensorflow/contrib/image/python
+tensorflow/contrib/image/python/ops
+tensorflow/contrib/input_pipeline
+tensorflow/contrib/input_pipeline/kernels
+tensorflow/contrib/input_pipeline/ops
+tensorflow/contrib/input_pipeline/python
+tensorflow/contrib/input_pipeline/python/ops
+tensorflow/contrib/integrate
+tensorflow/contrib/integrate/python
+tensorflow/contrib/integrate/python/ops
+tensorflow/contrib/ios_examples
+tensorflow/contrib/ios_examples/benchmark
+tensorflow/contrib/ios_examples/benchmark/benchmark.xcodeproj
+tensorflow/contrib/ios_examples/benchmark/data
+tensorflow/contrib/ios_examples/camera
+tensorflow/contrib/ios_examples/camera/camera_example.xcodeproj
+tensorflow/contrib/ios_examples/camera/en.lproj
+tensorflow/contrib/ios_examples/simple
+tensorflow/contrib/ios_examples/simple/data
+tensorflow/contrib/ios_examples/simple/tf_ios_makefile_example.xcodeproj
+tensorflow/contrib/keras
+tensorflow/contrib/keras/api
+tensorflow/contrib/keras/api/keras
+tensorflow/contrib/keras/api/keras/activations
+tensorflow/contrib/keras/api/keras/applications
+tensorflow/contrib/keras/api/keras/applications/inception_v3
+tensorflow/contrib/keras/api/keras/applications/mobilenet
+tensorflow/contrib/keras/api/keras/applications/resnet50
+tensorflow/contrib/keras/api/keras/applications/vgg16
+tensorflow/contrib/keras/api/keras/applications/vgg19
+tensorflow/contrib/keras/api/keras/applications/xception
+tensorflow/contrib/keras/api/keras/backend
+tensorflow/contrib/keras/api/keras/callbacks
+tensorflow/contrib/keras/api/keras/constraints
+tensorflow/contrib/keras/api/keras/datasets
+tensorflow/contrib/keras/api/keras/datasets/boston_housing
+tensorflow/contrib/keras/api/keras/datasets/cifar10
+tensorflow/contrib/keras/api/keras/datasets/cifar100
+tensorflow/contrib/keras/api/keras/datasets/imdb
+tensorflow/contrib/keras/api/keras/datasets/mnist
+tensorflow/contrib/keras/api/keras/datasets/reuters
+tensorflow/contrib/keras/api/keras/initializers
+tensorflow/contrib/keras/api/keras/layers
+tensorflow/contrib/keras/api/keras/losses
+tensorflow/contrib/keras/api/keras/metrics
+tensorflow/contrib/keras/api/keras/models
+tensorflow/contrib/keras/api/keras/optimizers
+tensorflow/contrib/keras/api/keras/preprocessing
+tensorflow/contrib/keras/api/keras/preprocessing/image
+tensorflow/contrib/keras/api/keras/preprocessing/sequence
+tensorflow/contrib/keras/api/keras/preprocessing/text
+tensorflow/contrib/keras/api/keras/regularizers
+tensorflow/contrib/keras/api/keras/utils
+tensorflow/contrib/keras/api/keras/wrappers
+tensorflow/contrib/keras/api/keras/wrappers/scikit_learn
+tensorflow/contrib/kernel_methods
+tensorflow/contrib/kernel_methods/python
+tensorflow/contrib/kernel_methods/python/mappers
+tensorflow/contrib/kfac
+tensorflow/contrib/kfac/examples
+tensorflow/contrib/kfac/python
+tensorflow/contrib/kfac/python/ops
+tensorflow/contrib/labeled_tensor
+tensorflow/contrib/labeled_tensor/python
+tensorflow/contrib/labeled_tensor/python/ops
+tensorflow/contrib/layers
+tensorflow/contrib/layers/kernels
+tensorflow/contrib/layers/ops
+tensorflow/contrib/layers/python
+tensorflow/contrib/layers/python/layers
+tensorflow/contrib/layers/python/ops
+tensorflow/contrib/learn
+tensorflow/contrib/learn/python
+tensorflow/contrib/learn/python/learn
+tensorflow/contrib/learn/python/learn/dataframe
+tensorflow/contrib/learn/python/learn/dataframe/queues
+tensorflow/contrib/learn/python/learn/dataframe/transforms
+tensorflow/contrib/learn/python/learn/datasets
+tensorflow/contrib/learn/python/learn/datasets/data
+tensorflow/contrib/learn/python/learn/estimators
+tensorflow/contrib/learn/python/learn/learn_io
+tensorflow/contrib/learn/python/learn/ops
+tensorflow/contrib/learn/python/learn/preprocessing
+tensorflow/contrib/learn/python/learn/utils
+tensorflow/contrib/legacy_seq2seq
+tensorflow/contrib/legacy_seq2seq/python
+tensorflow/contrib/legacy_seq2seq/python/ops
+tensorflow/contrib/libsvm
+tensorflow/contrib/libsvm/python
+tensorflow/contrib/libsvm/python/kernel_tests
+tensorflow/contrib/libsvm/python/ops
+tensorflow/contrib/linalg
+tensorflow/contrib/linalg/python
+tensorflow/contrib/linalg/python/ops
+tensorflow/contrib/linear_optimizer
+tensorflow/contrib/linear_optimizer/kernels
+tensorflow/contrib/linear_optimizer/kernels/g3doc
+tensorflow/contrib/linear_optimizer/python
+tensorflow/contrib/linear_optimizer/python/ops
+tensorflow/contrib/lookup
+tensorflow/contrib/losses
+tensorflow/contrib/losses/python
+tensorflow/contrib/losses/python/losses
+tensorflow/contrib/losses/python/metric_learning
+tensorflow/contrib/makefile
+tensorflow/contrib/memory_stats
+tensorflow/contrib/memory_stats/kernels
+tensorflow/contrib/memory_stats/ops
+tensorflow/contrib/memory_stats/python
+tensorflow/contrib/memory_stats/python/ops
+tensorflow/contrib/meta_graph_transform
+tensorflow/contrib/metrics
+tensorflow/contrib/metrics/ops
+tensorflow/contrib/metrics/python
+tensorflow/contrib/metrics/python/metrics
+tensorflow/contrib/metrics/python/ops
+tensorflow/contrib/model_pruning
+tensorflow/contrib/model_pruning/examples
+tensorflow/contrib/model_pruning/examples/cifar10
+tensorflow/contrib/model_pruning/python
+tensorflow/contrib/model_pruning/python/layers
+tensorflow/contrib/nccl
+tensorflow/contrib/nccl/kernels
+tensorflow/contrib/nccl/ops
+tensorflow/contrib/nccl/python
+tensorflow/contrib/nccl/python/ops
+tensorflow/contrib/ndlstm
+tensorflow/contrib/ndlstm/python
+tensorflow/contrib/nearest_neighbor/kernels
+tensorflow/contrib/nearest_neighbor/ops
+tensorflow/contrib/nearest_neighbor/python
+tensorflow/contrib/nearest_neighbor/python/ops
+tensorflow/contrib/nn
+tensorflow/contrib/nn/python
+tensorflow/contrib/nn/python/ops
+tensorflow/contrib/opt
+tensorflow/contrib/opt/python
+tensorflow/contrib/opt/python/training
+tensorflow/contrib/pi_examples
+tensorflow/contrib/pi_examples/camera
+tensorflow/contrib/pi_examples/label_image
+tensorflow/contrib/pi_examples/label_image/data
+tensorflow/contrib/periodic_resample
+tensorflow/contrib/periodic_resample/python
+tensorflow/contrib/periodic_resample/python/kernels
+tensorflow/contrib/periodic_resample/python/ops
+tensorflow/contrib/predictor
+tensorflow/contrib/quantization
+tensorflow/contrib/quantization/python
+tensorflow/contrib/quantize
+tensorflow/contrib/quantize/python
+tensorflow/contrib/receptive_field
+tensorflow/contrib/receptive_field/python
+tensorflow/contrib/reduce_slice_ops
+tensorflow/contrib/reduce_slice_ops/kernels
+tensorflow/contrib/reduce_slice_ops/ops
+tensorflow/contrib/reduce_slice_ops/python
+tensorflow/contrib/reduce_slice_ops/python/ops
+tensorflow/contrib/remote_fused_graph/pylib
+tensorflow/contrib/remote_fused_graph/pylib/python
+tensorflow/contrib/remote_fused_graph/pylib/python/ops
+tensorflow/contrib/resampler
+tensorflow/contrib/resampler/kernels
+tensorflow/contrib/resampler/ops
+tensorflow/contrib/resampler/python
+tensorflow/contrib/resampler/python/ops
+tensorflow/contrib/rnn
+tensorflow/contrib/rnn/kernels
+tensorflow/contrib/rnn/ops
+tensorflow/contrib/rnn/python
+tensorflow/contrib/rnn/python/kernel_tests
+tensorflow/contrib/rnn/python/ops
+tensorflow/contrib/saved_model
+tensorflow/contrib/saved_model/python
+tensorflow/contrib/saved_model/python/saved_model
+tensorflow/contrib/seq2seq
+tensorflow/contrib/seq2seq/kernels
+tensorflow/contrib/seq2seq/ops
+tensorflow/contrib/seq2seq/python
+tensorflow/contrib/seq2seq/python/ops
+tensorflow/contrib/session_bundle
+tensorflow/contrib/session_bundle/example
+tensorflow/contrib/signal
+tensorflow/contrib/signal/python
+tensorflow/contrib/signal/python/ops
+tensorflow/contrib/slim
+tensorflow/contrib/slim/python
+tensorflow/contrib/slim/python/slim
+tensorflow/contrib/slim/python/slim/data
+tensorflow/contrib/slim/python/slim/nets
+tensorflow/contrib/solvers
+tensorflow/contrib/solvers/python
+tensorflow/contrib/solvers/python/ops
+tensorflow/contrib/sparsemax
+tensorflow/contrib/sparsemax/python
+tensorflow/contrib/sparsemax/python/ops
+tensorflow/contrib/specs
+tensorflow/contrib/specs/python
+tensorflow/contrib/staging
+tensorflow/contrib/stat_summarizer
+tensorflow/contrib/stat_summarizer/python
+tensorflow/contrib/stateless
+tensorflow/contrib/stateless/python
+tensorflow/contrib/summary
+tensorflow/contrib/tensorboard
+tensorflow/contrib/tensorboard/plugins
+tensorflow/contrib/tensorboard/plugins/projector
+tensorflow/contrib/tensor_forest
+tensorflow/contrib/tensor_forest/client
+tensorflow/contrib/tensor_forest/core
+tensorflow/contrib/tensor_forest/core/ops
+tensorflow/contrib/tensor_forest/data
+tensorflow/contrib/tensor_forest/hybrid
+tensorflow/contrib/tensor_forest/hybrid/core
+tensorflow/contrib/tensor_forest/hybrid/core/ops
+tensorflow/contrib/tensor_forest/hybrid/ops
+tensorflow/contrib/tensor_forest/hybrid/python
+tensorflow/contrib/tensor_forest/hybrid/python/layers
+tensorflow/contrib/tensor_forest/hybrid/python/models
+tensorflow/contrib/tensor_forest/hybrid/python/ops
+tensorflow/contrib/tensor_forest/kernels
+tensorflow/contrib/tensor_forest/python
+tensorflow/contrib/tensor_forest/python/ops
+tensorflow/contrib/testing
+tensorflow/contrib/testing/python
+tensorflow/contrib/testing/python/framework
+tensorflow/contrib/text
+tensorflow/contrib/text/kernels
+tensorflow/contrib/text/ops
+tensorflow/contrib/text/python
+tensorflow/contrib/text/python/ops
+tensorflow/contrib/tfprof
+tensorflow/contrib/timeseries
+tensorflow/contrib/timeseries/examples
+tensorflow/contrib/timeseries/examples/data
+tensorflow/contrib/timeseries/python
+tensorflow/contrib/timeseries/python/timeseries
+tensorflow/contrib/timeseries/python/timeseries/state_space_models
+tensorflow/contrib/tpu
+tensorflow/contrib/tpu/ops
+tensorflow/contrib/tpu/profiler
+tensorflow/contrib/tpu/python
+tensorflow/contrib/tpu/python/ops
+tensorflow/contrib/tpu/python/profiler
+tensorflow/contrib/tpu/python/tpu
+tensorflow/contrib/training
+tensorflow/contrib/training/python
+tensorflow/contrib/training/python/training
+tensorflow/contrib/util
diff --git a/tensorflow/contrib/cmake/python_protos.txt b/tensorflow/contrib/cmake/python_protos.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8a9c406d8b118c10ddcaafb0e4fc242aa79cdb57
--- /dev/null
+++ b/tensorflow/contrib/cmake/python_protos.txt
@@ -0,0 +1,19 @@
+tensorflow/core
+tensorflow/core/profiler
+tensorflow/python
+tensorflow/contrib/boosted_trees/proto
+tensorflow/contrib/cloud/kernels
+tensorflow/contrib/decision_trees/proto
+tensorflow/contrib/gdr
+tensorflow/contrib/lite/toco
+tensorflow/contrib/mpi
+tensorflow/contrib/mpi_collectives
+tensorflow/contrib/session_bundle
+tensorflow/contrib/tensor_forest/proto
+tensorflow/contrib/tensorboard/graph_explorer/proto
+tensorflow/contrib/tensorboard/plugins/projector
+tensorflow/contrib/tensorboard/plugins/trace
+tensorflow/contrib/tpu/proto
+tensorflow/contrib/tpu/profiler
+tensorflow/contrib/training/python/training
+tensorflow/contrib/verbs
diff --git a/tensorflow/contrib/cmake/python_protos_cc.txt b/tensorflow/contrib/cmake/python_protos_cc.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d4a257b25c814a1464308d0e6ce3ce65d21f6a36
--- /dev/null
+++ b/tensorflow/contrib/cmake/python_protos_cc.txt
@@ -0,0 +1,5 @@
+tensorflow/core/profiler
+tensorflow/python
+tensorflow/contrib/session_bundle
+tensorflow/contrib/tensorboard
+tensorflow/contrib/training
diff --git a/tensorflow/contrib/cmake/tf_cc_ops.cmake b/tensorflow/contrib/cmake/tf_cc_ops.cmake
index f63aca4a835e213ef6d420845df9bb537514e142..6e2ac203f9a7f96cb14752a91483840a9eb6b451 100644
--- a/tensorflow/contrib/cmake/tf_cc_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_cc_ops.cmake
@@ -83,7 +83,7 @@ foreach(tf_cc_op_lib_name ${tf_cc_op_lib_names})
                ${cc_ops_target_dir}/${tf_cc_op_lib_name}.cc
                ${cc_ops_target_dir}/${tf_cc_op_lib_name}_internal.h
                ${cc_ops_target_dir}/${tf_cc_op_lib_name}_internal.cc
-        COMMAND ${tf_cc_op_lib_name}_gen_cc ${cc_ops_target_dir}/${tf_cc_op_lib_name}.h ${cc_ops_target_dir}/${tf_cc_op_lib_name}.cc ${tensorflow_source_dir}/tensorflow/cc/ops/op_gen_overrides.pbtxt ${cc_ops_include_internal}
+        COMMAND ${tf_cc_op_lib_name}_gen_cc ${cc_ops_target_dir}/${tf_cc_op_lib_name}.h ${cc_ops_target_dir}/${tf_cc_op_lib_name}.cc ${tensorflow_source_dir}/tensorflow/cc/ops/op_gen_overrides.pbtxt ${cc_ops_include_internal} ${tensorflow_source_dir}/tensorflow/core/api_def/base_api
         DEPENDS ${tf_cc_op_lib_name}_gen_cc create_cc_ops_header_dir
     )
 
diff --git a/tensorflow/contrib/cmake/tf_core_cpu.cmake b/tensorflow/contrib/cmake/tf_core_cpu.cmake
index 5c01ca382fb9cc7a01a6f2b60a510c59f0aa7119..e4213ea2a47da2a7381cccd0504235ad62018d4e 100644
--- a/tensorflow/contrib/cmake/tf_core_cpu.cmake
+++ b/tensorflow/contrib/cmake/tf_core_cpu.cmake
@@ -63,7 +63,7 @@ if (tensorflow_ENABLE_GPU)
   file(GLOB_RECURSE tf_core_gpu_srcs
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/*.cc"
     "${tensorflow_source_dir}/tensorflow/core/platform/default/gpu/cupti_wrapper.cc"
-    "${tensorflow_source_dir}/tensorflow/core/platform/default/gpu_tracer.cc"
+    "${tensorflow_source_dir}/tensorflow/core/platform/default/device_tracer.cc"
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu_device_factory.cc"
     "${tensorflow_source_dir}/tensorflow/core/grappler/devices.h"
     "${tensorflow_source_dir}/tensorflow/core/grappler/devices.cc"
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index c607546f4a5244fb6e7cd12db874f07a962f6f4d..5ec1a8d04fa41c6b36400fc0998af77592866150 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -211,7 +211,7 @@ if (NOT tensorflow_ENABLE_GPU)
   list(REMOVE_ITEM tf_core_platform_srcs ${tf_core_platform_gpu_srcs})
 else()
   file(GLOB tf_core_platform_srcs_exclude
-      "${tensorflow_source_dir}/tensorflow/core/platform/default/gpu_tracer.cc")
+      "${tensorflow_source_dir}/tensorflow/core/platform/default/device_tracer.cc")
   list(REMOVE_ITEM tf_core_platform_srcs ${tf_core_platform_srcs_exclude})
 endif()
 
diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake
index b1102cecbe2d64b5bfb8e5ed95ca1478a74c7fa4..d3b6c0bdd385432dc469133c00960ebba0dbeec5 100644
--- a/tensorflow/contrib/cmake/tf_core_kernels.cmake
+++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake
@@ -55,10 +55,6 @@ if(tensorflow_BUILD_CONTRIB_KERNELS)
       "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/utils/tensor_utils.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/learner/common/partitioners/example_partitioner.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/model_ops.cc"
@@ -89,6 +85,8 @@ if(tensorflow_BUILD_CONTRIB_KERNELS)
       "${tensorflow_source_dir}/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/libsvm/kernels/decode_libsvm_op.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/libsvm/ops/libsvm_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_manager.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/nccl/ops/nccl_ops.cc"
@@ -154,9 +152,6 @@ list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_exclude_srcs})
 if(WIN32)
   file(GLOB_RECURSE tf_core_kernels_windows_exclude_srcs
       # not working on windows yet
-      "${tensorflow_source_dir}/tensorflow/core/kernels/meta_support.*"
-      "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.h"
-      "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.cc"
       "${tensorflow_source_dir}/tensorflow/core/kernels/neon/*"
       # not in core - those are loaded dynamically as dll
       "${tensorflow_source_dir}/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc"
diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake
index 4a61ed7a3548b1992ddc71acb8a7761e252296ea..e8c2cd347327843d10d13c1d24a800ff776aa8c1 100644
--- a/tensorflow/contrib/cmake/tf_core_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_core_ops.cmake
@@ -92,6 +92,7 @@ GENERATE_CONTRIB_OP_LIBRARY(image_sirds "${tensorflow_source_dir}/tensorflow/con
 GENERATE_CONTRIB_OP_LIBRARY(layers_sparse_feature_cross "${tensorflow_source_dir}/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc")
 GENERATE_CONTRIB_OP_LIBRARY(memory_stats "${tensorflow_source_dir}/tensorflow/contrib/memory_stats/ops/memory_stats_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(nccl "${tensorflow_source_dir}/tensorflow/contrib/nccl/ops/nccl_ops.cc")
+GENERATE_CONTRIB_OP_LIBRARY(periodic_resample "${tensorflow_source_dir}/tensorflow/contrib/periodic_resample/ops/array_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(nearest_neighbor "${tensorflow_source_dir}/tensorflow/contrib/nearest_neighbor/ops/nearest_neighbor_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(resampler "${tensorflow_source_dir}/tensorflow/contrib/resampler/ops/resampler_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(rnn_gru "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/gru_ops.cc")
diff --git a/tensorflow/contrib/cmake/tf_grappler.cmake b/tensorflow/contrib/cmake/tf_grappler.cmake
index a7841c98e83ec8c3eb91edfd9d639e169cb5f440..410490531a300c091afdd857d7f2d4e789a4c80e 100644
--- a/tensorflow/contrib/cmake/tf_grappler.cmake
+++ b/tensorflow/contrib/cmake/tf_grappler.cmake
@@ -23,7 +23,7 @@ file(GLOB tf_grappler_srcs
    "${tensorflow_source_dir}/tensorflow/python/grappler/model_analyzer.cc"
    "${tensorflow_source_dir}/tensorflow/python/grappler/model_analyzer.h"
  )
- 
+
 add_library(tf_grappler OBJECT ${tf_grappler_srcs})
 
 add_dependencies(tf_grappler tf_core_cpu)
\ No newline at end of file
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 61b3fd715ddc8f47e1f2724cb805dc5065448619..8db6929e31a1a5f5c793721f455a664bd6741b06 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -120,32 +120,34 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS HDRS ROOT_DIR)
   set(${HDRS} ${${HDRS}} PARENT_SCOPE)
 endfunction()
 
-file(GLOB_RECURSE tf_protos_python_srcs RELATIVE ${tensorflow_source_dir}
-    "${tensorflow_source_dir}/tensorflow/core/*.proto"
-    "${tensorflow_source_dir}/tensorflow/core/profiler/*.proto"
-    "${tensorflow_source_dir}/tensorflow/python/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/proto/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/decision_trees/proto/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/session_bundle/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/proto/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/tpu/profiler/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/training/*.proto"
-)
+FILE(READ python_protos.txt python_protos)
+# Convert file contents into a CMake list (where each element in the list is one line of the file)
+STRING(REGEX REPLACE ";" "\\\\;" python_protos "${python_protos}")
+STRING(REGEX REPLACE "\n" ";" python_protos "${python_protos}")
+
+foreach(python_proto ${python_protos})
+  file(GLOB_RECURSE tf_python_protos_src RELATIVE ${tensorflow_source_dir}
+      "${tensorflow_source_dir}/${python_proto}/*.proto"
+  )
+  list(APPEND tf_python_protos_srcs ${tf_python_protos_src})
+endforeach(python_proto)
+
 RELATIVE_PROTOBUF_GENERATE_PYTHON(
-    ${tensorflow_source_dir} PYTHON_PROTO_GENFILES ${tf_protos_python_srcs}
+    ${tensorflow_source_dir} PYTHON_PROTO_GENFILES ${tf_python_protos_srcs}
 )
 
-# NOTE(mrry): Avoid regenerating the tensorflow/core protos because this
-# can cause benign-but-failing-on-Windows-due-to-file-locking conflicts
-# when two rules attempt to generate the same file.
-file(GLOB_RECURSE tf_python_protos_cc_srcs RELATIVE ${tensorflow_source_dir}
-    "${tensorflow_source_dir}/tensorflow/core/profiler/*.proto"
-    "${tensorflow_source_dir}/tensorflow/python/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/session_bundle/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/training/*.proto"
-)
+FILE(READ python_protos_cc.txt python_protos_cc)
+# Convert file contents into a CMake list (where each element in the list is one line of the file)
+STRING(REGEX REPLACE ";" "\\\\;" python_protos_cc "${python_protos_cc}")
+STRING(REGEX REPLACE "\n" ";" python_protos_cc "${python_protos_cc}")
+
+foreach(python_proto_cc ${python_protos_cc})
+  file(GLOB_RECURSE tf_python_protos_cc_src RELATIVE ${tensorflow_source_dir}
+      "${tensorflow_source_dir}/${python_proto_cc}/*.proto"
+  )
+  list(APPEND tf_python_protos_cc_srcs ${tf_python_protos_cc_src})
+endforeach(python_proto_cc)
+
 RELATIVE_PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS
     ${tensorflow_source_dir} ${tf_python_protos_cc_srcs}
 )
@@ -191,315 +193,15 @@ function(add_python_module MODULE_NAME)
     endif()
 endfunction()
 
-add_python_module("tensorflow")
-add_python_module("tensorflow/core")
-add_python_module("tensorflow/core/example")
-add_python_module("tensorflow/core/framework")
-add_python_module("tensorflow/core/lib")
-add_python_module("tensorflow/core/lib/core")
-add_python_module("tensorflow/core/protobuf")
-add_python_module("tensorflow/core/util")
-add_python_module("tensorflow/examples")
-add_python_module("tensorflow/examples/tutorials")
-add_python_module("tensorflow/examples/tutorials/mnist")
-add_python_module("tensorflow/python")
-add_python_module("tensorflow/python/client")
-add_python_module("tensorflow/python/data")
-add_python_module("tensorflow/python/data/ops")
-add_python_module("tensorflow/python/data/util")
-add_python_module("tensorflow/python/debug")
-add_python_module("tensorflow/python/debug/cli")
-add_python_module("tensorflow/python/debug/examples")
-add_python_module("tensorflow/python/debug/lib")
-add_python_module("tensorflow/python/debug/wrappers")
-add_python_module("tensorflow/python/eager")
-add_python_module("tensorflow/python/estimator")
-add_python_module("tensorflow/python/estimator/canned")
-add_python_module("tensorflow/python/estimator/export")
-add_python_module("tensorflow/python/estimator/inputs")
-add_python_module("tensorflow/python/estimator/inputs/queues")
-add_python_module("tensorflow/python/feature_column")
-add_python_module("tensorflow/python/framework")
-add_python_module("tensorflow/python/grappler")
-add_python_module("tensorflow/python/keras")
-add_python_module("tensorflow/python/keras/activations")
-add_python_module("tensorflow/python/keras/applications")
-add_python_module("tensorflow/python/keras/applications/inception_resnet_v2")
-add_python_module("tensorflow/python/keras/applications/inception_v3")
-add_python_module("tensorflow/python/keras/applications/mobilenet")
-add_python_module("tensorflow/python/keras/applications/resnet50")
-add_python_module("tensorflow/python/keras/applications/vgg16")
-add_python_module("tensorflow/python/keras/applications/vgg19")
-add_python_module("tensorflow/python/keras/applications/xception")
-add_python_module("tensorflow/python/keras/backend")
-add_python_module("tensorflow/python/keras/callbacks")
-add_python_module("tensorflow/python/keras/constraints")
-add_python_module("tensorflow/python/keras/datasets")
-add_python_module("tensorflow/python/keras/datasets/boston_housing")
-add_python_module("tensorflow/python/keras/datasets/cifar10")
-add_python_module("tensorflow/python/keras/datasets/cifar100")
-add_python_module("tensorflow/python/keras/datasets/fashion_mnist")
-add_python_module("tensorflow/python/keras/datasets/imdb")
-add_python_module("tensorflow/python/keras/datasets/mnist")
-add_python_module("tensorflow/python/keras/datasets/reuters")
-add_python_module("tensorflow/python/keras/estimator")
-add_python_module("tensorflow/python/keras/initializers")
-add_python_module("tensorflow/python/keras/layers")
-add_python_module("tensorflow/python/keras/losses")
-add_python_module("tensorflow/python/keras/metrics")
-add_python_module("tensorflow/python/keras/models")
-add_python_module("tensorflow/python/keras/optimizers")
-add_python_module("tensorflow/python/keras/preprocessing")
-add_python_module("tensorflow/python/keras/preprocessing/image")
-add_python_module("tensorflow/python/keras/preprocessing/sequence")
-add_python_module("tensorflow/python/keras/preprocessing/text")
-add_python_module("tensorflow/python/keras/regularizers")
-add_python_module("tensorflow/python/keras/utils")
-add_python_module("tensorflow/python/keras/wrappers")
-add_python_module("tensorflow/python/keras/wrappers/scikit_learn")
-add_python_module("tensorflow/python/keras/_impl")
-add_python_module("tensorflow/python/keras/_impl/keras")
-add_python_module("tensorflow/python/keras/_impl/keras/applications")
-add_python_module("tensorflow/python/keras/_impl/keras/datasets")
-add_python_module("tensorflow/python/keras/_impl/keras/engine")
-add_python_module("tensorflow/python/keras/_impl/keras/layers")
-add_python_module("tensorflow/python/keras/_impl/keras/preprocessing")
-add_python_module("tensorflow/python/keras/_impl/keras/utils")
-add_python_module("tensorflow/python/keras/_impl/keras/wrappers")
-add_python_module("tensorflow/python/kernel_tests")
-add_python_module("tensorflow/python/kernel_tests/distributions")
-add_python_module("tensorflow/python/kernel_tests/linalg")
-add_python_module("tensorflow/python/layers")
-add_python_module("tensorflow/python/lib")
-add_python_module("tensorflow/python/lib/core")
-add_python_module("tensorflow/python/lib/io")
-add_python_module("tensorflow/python/ops")
-add_python_module("tensorflow/python/ops/distributions")
-add_python_module("tensorflow/python/ops/linalg")
-add_python_module("tensorflow/python/ops/losses")
-add_python_module("tensorflow/python/platform")
-add_python_module("tensorflow/python/platform/default")
-add_python_module("tensorflow/python/platform/summary")
-add_python_module("tensorflow/python/profiler/")
-add_python_module("tensorflow/python/profiler/internal")
-add_python_module("tensorflow/python/saved_model")
-add_python_module("tensorflow/python/summary")
-add_python_module("tensorflow/python/summary/writer")
-add_python_module("tensorflow/python/tools")
-add_python_module("tensorflow/python/training")
-add_python_module("tensorflow/python/user_ops")
-add_python_module("tensorflow/python/util")
-add_python_module("tensorflow/python/util/protobuf")
-add_python_module("tensorflow/tools")
-add_python_module("tensorflow/tools/graph_transforms")
-add_python_module("tensorflow/contrib")
-add_python_module("tensorflow/contrib/all_reduce")
-add_python_module("tensorflow/contrib/all_reduce/python")
-add_python_module("tensorflow/contrib/android")
-add_python_module("tensorflow/contrib/android/java")
-add_python_module("tensorflow/contrib/android/java/org")
-add_python_module("tensorflow/contrib/android/java/org/tensorflow")
-add_python_module("tensorflow/contrib/android/java/org/tensorflow/contrib")
-add_python_module("tensorflow/contrib/android/java/org/tensorflow/contrib/android")
-add_python_module("tensorflow/contrib/android/jni")
-add_python_module("tensorflow/contrib/bayesflow")
-add_python_module("tensorflow/contrib/bayesflow/examples")
-add_python_module("tensorflow/contrib/bayesflow/examples/reinforce_simple")
-add_python_module("tensorflow/contrib/bayesflow/python")
-add_python_module("tensorflow/contrib/bayesflow/python/kernel_tests")
-add_python_module("tensorflow/contrib/bayesflow/python/ops")
-add_python_module("tensorflow/contrib/boosted_trees")
-add_python_module("tensorflow/contrib/boosted_trees/estimator_batch")
-add_python_module("tensorflow/contrib/boosted_trees/ops")
-add_python_module("tensorflow/contrib/boosted_trees/proto")
-add_python_module("tensorflow/contrib/boosted_trees/python")
-add_python_module("tensorflow/contrib/boosted_trees/python/kernel_tests")
-add_python_module("tensorflow/contrib/boosted_trees/python/ops")
-add_python_module("tensorflow/contrib/cloud")
-add_python_module("tensorflow/contrib/cloud/kernels")
-add_python_module("tensorflow/contrib/cloud/ops")
-add_python_module("tensorflow/contrib/cloud/python")
-add_python_module("tensorflow/contrib/cloud/python/ops")
-add_python_module("tensorflow/contrib/cluster_resolver")
-add_python_module("tensorflow/contrib/cluster_resolver/python")
-add_python_module("tensorflow/contrib/cluster_resolver/python/training")
-add_python_module("tensorflow/contrib/compiler")
-add_python_module("tensorflow/contrib/copy_graph")
-add_python_module("tensorflow/contrib/copy_graph/python")
-add_python_module("tensorflow/contrib/copy_graph/python/util")
-add_python_module("tensorflow/contrib/crf")
-add_python_module("tensorflow/contrib/crf/python")
-add_python_module("tensorflow/contrib/crf/python/kernel_tests")
-add_python_module("tensorflow/contrib/crf/python/ops")
-add_python_module("tensorflow/contrib/cudnn_rnn")
-add_python_module("tensorflow/contrib/cudnn_rnn/kernels")
-add_python_module("tensorflow/contrib/cudnn_rnn/ops")
-add_python_module("tensorflow/contrib/cudnn_rnn/python")
-add_python_module("tensorflow/contrib/cudnn_rnn/python/kernel_tests")
-add_python_module("tensorflow/contrib/cudnn_rnn/python/layers")
-add_python_module("tensorflow/contrib/cudnn_rnn/python/ops")
-add_python_module("tensorflow/contrib/data")
-add_python_module("tensorflow/contrib/data/python")
-add_python_module("tensorflow/contrib/data/python/kernel_tests")
-add_python_module("tensorflow/contrib/data/python/ops")
-add_python_module("tensorflow/contrib/decision_trees")
-add_python_module("tensorflow/contrib/decision_trees/proto")
-add_python_module("tensorflow/contrib/deprecated")
-add_python_module("tensorflow/contrib/distributions")
-add_python_module("tensorflow/contrib/distributions/python")
-add_python_module("tensorflow/contrib/distributions/python/kernel_tests")
-add_python_module("tensorflow/contrib/distributions/python/ops")
-add_python_module("tensorflow/contrib/distributions/python/ops/bijectors")
-add_python_module("tensorflow/contrib/eager")
-add_python_module("tensorflow/contrib/eager/python")
-add_python_module("tensorflow/contrib/estimator")
-add_python_module("tensorflow/contrib/estimator/python")
-add_python_module("tensorflow/contrib/estimator/python/estimator")
-add_python_module("tensorflow/contrib/factorization")
-add_python_module("tensorflow/contrib/factorization/examples")
-add_python_module("tensorflow/contrib/factorization/kernels")
-add_python_module("tensorflow/contrib/factorization/ops")
-add_python_module("tensorflow/contrib/factorization/python")
-add_python_module("tensorflow/contrib/factorization/python/kernel_tests")
-add_python_module("tensorflow/contrib/factorization/python/ops")
-add_python_module("tensorflow/contrib/ffmpeg")
-add_python_module("tensorflow/contrib/ffmpeg/default")
-add_python_module("tensorflow/contrib/ffmpeg/testdata")
-add_python_module("tensorflow/contrib/framework")
-add_python_module("tensorflow/contrib/framework/kernels")
-add_python_module("tensorflow/contrib/framework/ops")
-add_python_module("tensorflow/contrib/framework/python")
-add_python_module("tensorflow/contrib/framework/python/framework")
-add_python_module("tensorflow/contrib/framework/python/ops")
-add_python_module("tensorflow/contrib/gan")
-add_python_module("tensorflow/contrib/gan/python")
-add_python_module("tensorflow/contrib/gan/python/eval")
-add_python_module("tensorflow/contrib/gan/python/eval/python")
-add_python_module("tensorflow/contrib/gan/python/features")
-add_python_module("tensorflow/contrib/gan/python/features/python")
-add_python_module("tensorflow/contrib/gan/python/estimator")
-add_python_module("tensorflow/contrib/gan/python/estimator/python")
-add_python_module("tensorflow/contrib/gan/python/losses")
-add_python_module("tensorflow/contrib/gan/python/losses/python")
-add_python_module("tensorflow/contrib/graph_editor")
-add_python_module("tensorflow/contrib/graph_editor/examples")
-add_python_module("tensorflow/contrib/graph_editor/tests")
-add_python_module("tensorflow/contrib/grid_rnn")
-add_python_module("tensorflow/contrib/grid_rnn/python")
-add_python_module("tensorflow/contrib/grid_rnn/python/kernel_tests")
-add_python_module("tensorflow/contrib/grid_rnn/python/ops")
-add_python_module("tensorflow/contrib/hooks")
-add_python_module("tensorflow/contrib/image")
-add_python_module("tensorflow/contrib/image/ops")
-add_python_module("tensorflow/contrib/image/python")
-add_python_module("tensorflow/contrib/image/python/ops")
-add_python_module("tensorflow/contrib/input_pipeline")
-add_python_module("tensorflow/contrib/input_pipeline/ops")
-add_python_module("tensorflow/contrib/input_pipeline/python")
-add_python_module("tensorflow/contrib/input_pipeline/python/ops")
-add_python_module("tensorflow/contrib/integrate")
-add_python_module("tensorflow/contrib/integrate/python")
-add_python_module("tensorflow/contrib/integrate/python/ops")
-add_python_module("tensorflow/contrib/ios_examples")
-add_python_module("tensorflow/contrib/ios_examples/benchmark")
-add_python_module("tensorflow/contrib/ios_examples/benchmark/benchmark.xcodeproj")
-add_python_module("tensorflow/contrib/ios_examples/benchmark/data")
-add_python_module("tensorflow/contrib/ios_examples/camera")
-add_python_module("tensorflow/contrib/ios_examples/camera/camera_example.xcodeproj")
-add_python_module("tensorflow/contrib/ios_examples/camera/en.lproj")
-add_python_module("tensorflow/contrib/ios_examples/simple")
-add_python_module("tensorflow/contrib/ios_examples/simple/data")
-add_python_module("tensorflow/contrib/ios_examples/simple/tf_ios_makefile_example.xcodeproj")
-add_python_module("tensorflow/contrib/keras")
-add_python_module("tensorflow/contrib/keras/api")
-add_python_module("tensorflow/contrib/keras/api/keras")
-add_python_module("tensorflow/contrib/keras/api/keras/activations")
-add_python_module("tensorflow/contrib/keras/api/keras/applications")
-add_python_module("tensorflow/contrib/keras/api/keras/applications/inception_v3")
-add_python_module("tensorflow/contrib/keras/api/keras/applications/mobilenet")
-add_python_module("tensorflow/contrib/keras/api/keras/applications/resnet50")
-add_python_module("tensorflow/contrib/keras/api/keras/applications/vgg16")
-add_python_module("tensorflow/contrib/keras/api/keras/applications/vgg19")
-add_python_module("tensorflow/contrib/keras/api/keras/applications/xception")
-add_python_module("tensorflow/contrib/keras/api/keras/backend")
-add_python_module("tensorflow/contrib/keras/api/keras/callbacks")
-add_python_module("tensorflow/contrib/keras/api/keras/constraints")
-add_python_module("tensorflow/contrib/keras/api/keras/datasets")
-add_python_module("tensorflow/contrib/keras/api/keras/datasets/boston_housing")
-add_python_module("tensorflow/contrib/keras/api/keras/datasets/cifar10")
-add_python_module("tensorflow/contrib/keras/api/keras/datasets/cifar100")
-add_python_module("tensorflow/contrib/keras/api/keras/datasets/imdb")
-add_python_module("tensorflow/contrib/keras/api/keras/datasets/mnist")
-add_python_module("tensorflow/contrib/keras/api/keras/datasets/reuters")
-add_python_module("tensorflow/contrib/keras/api/keras/initializers")
-add_python_module("tensorflow/contrib/keras/api/keras/layers")
-add_python_module("tensorflow/contrib/keras/api/keras/losses")
-add_python_module("tensorflow/contrib/keras/api/keras/metrics")
-add_python_module("tensorflow/contrib/keras/api/keras/models")
-add_python_module("tensorflow/contrib/keras/api/keras/optimizers")
-add_python_module("tensorflow/contrib/keras/api/keras/preprocessing")
-add_python_module("tensorflow/contrib/keras/api/keras/preprocessing/image")
-add_python_module("tensorflow/contrib/keras/api/keras/preprocessing/sequence")
-add_python_module("tensorflow/contrib/keras/api/keras/preprocessing/text")
-add_python_module("tensorflow/contrib/keras/api/keras/regularizers")
-add_python_module("tensorflow/contrib/keras/api/keras/utils")
-add_python_module("tensorflow/contrib/keras/api/keras/wrappers")
-add_python_module("tensorflow/contrib/keras/api/keras/wrappers/scikit_learn")
-add_python_module("tensorflow/contrib/keras/python")
-add_python_module("tensorflow/contrib/keras/python/keras")
-add_python_module("tensorflow/contrib/keras/python/keras/applications")
-add_python_module("tensorflow/contrib/keras/python/keras/datasets")
-add_python_module("tensorflow/contrib/keras/python/keras/engine")
-add_python_module("tensorflow/contrib/keras/python/keras/layers")
-add_python_module("tensorflow/contrib/keras/python/keras/preprocessing")
-add_python_module("tensorflow/contrib/keras/python/keras/utils")
-add_python_module("tensorflow/contrib/keras/python/keras/wrappers")
-add_python_module("tensorflow/contrib/kernel_methods")
-add_python_module("tensorflow/contrib/kernel_methods/python")
-add_python_module("tensorflow/contrib/kernel_methods/python/mappers")
-add_python_module("tensorflow/contrib/kfac")
-add_python_module("tensorflow/contrib/kfac/examples")
-add_python_module("tensorflow/contrib/kfac/python")
-add_python_module("tensorflow/contrib/kfac/python/ops")
-add_python_module("tensorflow/contrib/labeled_tensor")
-add_python_module("tensorflow/contrib/labeled_tensor/python")
-add_python_module("tensorflow/contrib/labeled_tensor/python/ops")
-add_python_module("tensorflow/contrib/layers")
-add_python_module("tensorflow/contrib/layers/kernels")
-add_python_module("tensorflow/contrib/layers/ops")
-add_python_module("tensorflow/contrib/layers/python")
-add_python_module("tensorflow/contrib/layers/python/kernel_tests")
-add_python_module("tensorflow/contrib/layers/python/layers")
-add_python_module("tensorflow/contrib/layers/python/ops")
-add_python_module("tensorflow/contrib/learn")
-add_python_module("tensorflow/contrib/learn/python")
-add_python_module("tensorflow/contrib/learn/python/learn")
-add_python_module("tensorflow/contrib/learn/python/learn/dataframe")
-add_python_module("tensorflow/contrib/learn/python/learn/dataframe/queues")
-add_python_module("tensorflow/contrib/learn/python/learn/dataframe/transforms")
-add_python_module("tensorflow/contrib/learn/python/learn/datasets")
-add_python_module("tensorflow/contrib/learn/python/learn/datasets/data")
-add_python_module("tensorflow/contrib/learn/python/learn/estimators")
-add_python_module("tensorflow/contrib/learn/python/learn/learn_io")
-add_python_module("tensorflow/contrib/learn/python/learn/ops")
-add_python_module("tensorflow/contrib/learn/python/learn/preprocessing")
-add_python_module("tensorflow/contrib/learn/python/learn/preprocessing/tests")
-add_python_module("tensorflow/contrib/learn/python/learn/tests")
-add_python_module("tensorflow/contrib/learn/python/learn/tests/dataframe")
-add_python_module("tensorflow/contrib/learn/python/learn/utils")
-add_python_module("tensorflow/contrib/legacy_seq2seq")
-add_python_module("tensorflow/contrib/legacy_seq2seq/python")
-add_python_module("tensorflow/contrib/legacy_seq2seq/python/ops")
-add_python_module("tensorflow/contrib/linalg")
-add_python_module("tensorflow/contrib/linalg/python")
-add_python_module("tensorflow/contrib/linalg/python/ops")
-add_python_module("tensorflow/contrib/linalg/python/kernel_tests")
-add_python_module("tensorflow/contrib/linear_optimizer")
-add_python_module("tensorflow/contrib/linear_optimizer/kernels")
-add_python_module("tensorflow/contrib/linear_optimizer/kernels/g3doc")
-add_python_module("tensorflow/contrib/linear_optimizer/python")
-add_python_module("tensorflow/contrib/linear_optimizer/python/kernel_tests")
-add_python_module("tensorflow/contrib/linear_optimizer/python/ops")
+FILE(READ python_modules.txt python_modules)
+# Convert file contents into a CMake list (where each element in the list is one line of the file)
+STRING(REGEX REPLACE ";" "\\\\;" python_modules "${python_modules}")
+STRING(REGEX REPLACE "\n" ";" python_modules "${python_modules}")
+
+foreach(python_module ${python_modules})
+  add_python_module(${python_module})
+endforeach(python_module)
+
 add_custom_command(TARGET tf_python_touchup_modules PRE_BUILD
     COMMAND ${CMAKE_COMMAND} -E make_directory
     "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/lite")
@@ -513,157 +215,6 @@ add_custom_command(
     TARGET tf_python_copy_scripts_to_destination PRE_BUILD
     COMMAND ${CMAKE_COMMAND} -E touch
     ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/lite/python/lite.py)
-add_python_module("tensorflow/contrib/lookup")
-add_python_module("tensorflow/contrib/losses")
-add_python_module("tensorflow/contrib/losses/python")
-add_python_module("tensorflow/contrib/losses/python/losses")
-add_python_module("tensorflow/contrib/losses/python/metric_learning")
-add_python_module("tensorflow/contrib/makefile")
-add_python_module("tensorflow/contrib/makefile/test")
-add_python_module("tensorflow/contrib/memory_stats")
-add_python_module("tensorflow/contrib/memory_stats/kernels")
-add_python_module("tensorflow/contrib/memory_stats/ops")
-add_python_module("tensorflow/contrib/memory_stats/python")
-add_python_module("tensorflow/contrib/memory_stats/python/kernel_tests")
-add_python_module("tensorflow/contrib/memory_stats/python/ops")
-add_python_module("tensorflow/contrib/meta_graph_transform")
-add_python_module("tensorflow/contrib/metrics")
-add_python_module("tensorflow/contrib/metrics/kernels")
-add_python_module("tensorflow/contrib/metrics/ops")
-add_python_module("tensorflow/contrib/metrics/python")
-add_python_module("tensorflow/contrib/metrics/python/kernel_tests")
-add_python_module("tensorflow/contrib/metrics/python/metrics")
-add_python_module("tensorflow/contrib/metrics/python/ops")
-add_python_module("tensorflow/contrib/model_pruning")
-add_python_module("tensorflow/contrib/model_pruning/examples")
-add_python_module("tensorflow/contrib/model_pruning/examples/cifar10")
-add_python_module("tensorflow/contrib/model_pruning/python")
-add_python_module("tensorflow/contrib/model_pruning/python/layers")
-add_python_module("tensorflow/contrib/ndlstm")
-add_python_module("tensorflow/contrib/ndlstm/python")
-add_python_module("tensorflow/contrib/nn")
-add_python_module("tensorflow/contrib/nn/python")
-add_python_module("tensorflow/contrib/nn/python/ops")
-add_python_module("tensorflow/contrib/nccl")
-add_python_module("tensorflow/contrib/nccl/kernels")
-add_python_module("tensorflow/contrib/nccl/ops")
-add_python_module("tensorflow/contrib/nccl/python")
-add_python_module("tensorflow/contrib/nccl/python/ops")
-add_python_module("tensorflow/contrib/nearest_neighbor/kernels")
-add_python_module("tensorflow/contrib/nearest_neighbor/ops")
-add_python_module("tensorflow/contrib/nearest_neighbor/python")
-add_python_module("tensorflow/contrib/nearest_neighbor/python/kernel_tests")
-add_python_module("tensorflow/contrib/nearest_neighbor/python/ops")
-add_python_module("tensorflow/contrib/opt")
-add_python_module("tensorflow/contrib/opt/python")
-add_python_module("tensorflow/contrib/opt/python/training")
-add_python_module("tensorflow/contrib/pi_examples")
-add_python_module("tensorflow/contrib/pi_examples/camera")
-add_python_module("tensorflow/contrib/pi_examples/label_image")
-add_python_module("tensorflow/contrib/pi_examples/label_image/data")
-add_python_module("tensorflow/contrib/predictor")
-add_python_module("tensorflow/contrib/quantization")
-add_python_module("tensorflow/contrib/quantization/python")
-add_python_module("tensorflow/contrib/quantize")
-add_python_module("tensorflow/contrib/quantize/python")
-add_python_module("tensorflow/contrib/remote_fused_graph/pylib")
-add_python_module("tensorflow/contrib/remote_fused_graph/pylib/python")
-add_python_module("tensorflow/contrib/remote_fused_graph/pylib/python/ops")
-add_python_module("tensorflow/contrib/resampler")
-add_python_module("tensorflow/contrib/resampler/kernels")
-add_python_module("tensorflow/contrib/resampler/ops")
-add_python_module("tensorflow/contrib/resampler/python")
-add_python_module("tensorflow/contrib/resampler/python/ops")
-add_python_module("tensorflow/contrib/rnn")
-add_python_module("tensorflow/contrib/rnn/kernels")
-add_python_module("tensorflow/contrib/rnn/ops")
-add_python_module("tensorflow/contrib/rnn/python")
-add_python_module("tensorflow/contrib/rnn/python/kernel_tests")
-add_python_module("tensorflow/contrib/rnn/python/ops")
-add_python_module("tensorflow/contrib/saved_model")
-add_python_module("tensorflow/contrib/saved_model/python")
-add_python_module("tensorflow/contrib/saved_model/python/saved_model")
-add_python_module("tensorflow/contrib/seq2seq")
-add_python_module("tensorflow/contrib/seq2seq/kernels")
-add_python_module("tensorflow/contrib/seq2seq/ops")
-add_python_module("tensorflow/contrib/seq2seq/python")
-add_python_module("tensorflow/contrib/seq2seq/python/kernel_tests")
-add_python_module("tensorflow/contrib/seq2seq/python/ops")
-add_python_module("tensorflow/contrib/session_bundle")
-add_python_module("tensorflow/contrib/session_bundle/example")
-add_python_module("tensorflow/contrib/session_bundle/testdata")
-add_python_module("tensorflow/contrib/signal")
-add_python_module("tensorflow/contrib/signal/python")
-add_python_module("tensorflow/contrib/signal/python/ops")
-add_python_module("tensorflow/contrib/slim")
-add_python_module("tensorflow/contrib/slim/python")
-add_python_module("tensorflow/contrib/slim/python/slim")
-add_python_module("tensorflow/contrib/slim/python/slim/data")
-add_python_module("tensorflow/contrib/slim/python/slim/nets")
-add_python_module("tensorflow/contrib/solvers")
-add_python_module("tensorflow/contrib/solvers/python")
-add_python_module("tensorflow/contrib/solvers/python/ops")
-add_python_module("tensorflow/contrib/sparsemax")
-add_python_module("tensorflow/contrib/sparsemax/python")
-add_python_module("tensorflow/contrib/sparsemax/python/ops")
-add_python_module("tensorflow/contrib/specs")
-add_python_module("tensorflow/contrib/specs/python")
-add_python_module("tensorflow/contrib/staging")
-add_python_module("tensorflow/contrib/stat_summarizer")
-add_python_module("tensorflow/contrib/stateless")
-add_python_module("tensorflow/contrib/tensorboard")
-add_python_module("tensorflow/contrib/tensorboard/plugins")
-add_python_module("tensorflow/contrib/tensorboard/plugins/projector")
-add_python_module("tensorflow/contrib/tensor_forest")
-add_python_module("tensorflow/contrib/tensor_forest/client")
-add_python_module("tensorflow/contrib/tensor_forest/core")
-add_python_module("tensorflow/contrib/tensor_forest/core/ops")
-add_python_module("tensorflow/contrib/tensor_forest/data")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid/core")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid/core/ops")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid/ops")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid/python")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid/python/kernel_tests")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid/python/layers")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid/python/models")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid/python/ops")
-add_python_module("tensorflow/contrib/tensor_forest/python")
-add_python_module("tensorflow/contrib/tensor_forest/python/kernel_tests")
-add_python_module("tensorflow/contrib/tensor_forest/python/ops")
-add_python_module("tensorflow/contrib/testing")
-add_python_module("tensorflow/contrib/testing/python")
-add_python_module("tensorflow/contrib/testing/python/framework")
-add_python_module("tensorflow/contrib/text")
-add_python_module("tensorflow/contrib/text/kernels")
-add_python_module("tensorflow/contrib/text/ops")
-add_python_module("tensorflow/contrib/text/python")
-add_python_module("tensorflow/contrib/text/python/ops")
-add_python_module("tensorflow/contrib/tfprof")
-add_python_module("tensorflow/contrib/timeseries")
-add_python_module("tensorflow/contrib/timeseries/examples")
-add_python_module("tensorflow/contrib/timeseries/examples/data")
-add_python_module("tensorflow/contrib/timeseries/python")
-add_python_module("tensorflow/contrib/timeseries/python/timeseries")
-add_python_module("tensorflow/contrib/timeseries/python/timeseries/state_space_models")
-add_python_module("tensorflow/contrib/tpu")
-add_python_module("tensorflow/contrib/tpu/ops")
-add_python_module("tensorflow/contrib/tpu/profiler")
-add_python_module("tensorflow/contrib/tpu/python")
-add_python_module("tensorflow/contrib/tpu/python/ops")
-add_python_module("tensorflow/contrib/tpu/python/profiler")
-add_python_module("tensorflow/contrib/tpu/python/tpu")
-add_python_module("tensorflow/contrib/training")
-add_python_module("tensorflow/contrib/training/python")
-add_python_module("tensorflow/contrib/training/python/training")
-add_python_module("tensorflow/contrib/util")
-add_python_module("tensorflow/contrib/reduce_slice_ops")
-add_python_module("tensorflow/contrib/reduce_slice_ops/kernels")
-add_python_module("tensorflow/contrib/reduce_slice_ops/ops")
-add_python_module("tensorflow/contrib/reduce_slice_ops/python")
-add_python_module("tensorflow/contrib/reduce_slice_ops/python/kernel_tests")
-add_python_module("tensorflow/contrib/reduce_slice_ops/python/ops")
-add_python_module("tensorflow/contrib/summary")
 
 # Generate the tensorflow.python.platform.build_info module.
 set(BUILD_INFO_PY "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/platform/build_info.py")
@@ -738,7 +289,7 @@ function(GENERATE_PYTHON_OP_LIB tf_python_op_lib_name)
     # containing the wrappers.
     add_custom_command(
       OUTPUT ${GENERATE_PYTHON_OP_LIB_DESTINATION}
-      COMMAND ${tf_python_op_lib_name}_gen_python @${tensorflow_source_dir}/tensorflow/python/ops/hidden_ops.txt ${require_shape_fn} > ${GENERATE_PYTHON_OP_LIB_DESTINATION}
+      COMMAND ${tf_python_op_lib_name}_gen_python ${tensorflow_source_dir}/tensorflow/core/api_def/base_api,${tensorflow_source_dir}/tensorflow/core/api_def/python_api @${tensorflow_source_dir}/tensorflow/python/ops/hidden_ops.txt ${require_shape_fn} > ${GENERATE_PYTHON_OP_LIB_DESTINATION}
       DEPENDS ${tf_python_op_lib_name}_gen_python
     )
 
@@ -816,6 +367,9 @@ GENERATE_PYTHON_OP_LIB("contrib_memory_stats_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/memory_stats/ops/gen_memory_stats_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_nccl_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/nccl/ops/gen_nccl_ops.py)
+GENERATE_PYTHON_OP_LIB("contrib_periodic_resample_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/periodic_resample/python/ops/gen_periodic_resample_op.py)
+
 GENERATE_PYTHON_OP_LIB("contrib_nearest_neighbor_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/nearest_neighbor/ops/gen_nearest_neighbor_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_resampler_ops"
@@ -888,6 +442,8 @@ set (pywrap_tensorflow_internal_src
     "${tensorflow_source_dir}/tensorflow/python/framework/cpp_shape_inference.cc"
     "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen.h"
     "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen.cc"
+    "${tensorflow_source_dir}/tensorflow/python/lib/core/bfloat16.h"
+    "${tensorflow_source_dir}/tensorflow/python/lib/core/bfloat16.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/numpy.h"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/numpy.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/ndarray_tensor.h"
@@ -898,6 +454,8 @@ set (pywrap_tensorflow_internal_src
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_func.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_seq_tensor.h"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_seq_tensor.cc"
+    "${tensorflow_source_dir}/tensorflow/python/lib/core/py_util.h"
+    "${tensorflow_source_dir}/tensorflow/python/lib/core/py_util.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/safe_ptr.h"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/safe_ptr.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/io/py_record_reader.h"
@@ -1014,6 +572,20 @@ target_link_libraries(pywrap_tensorflow_internal PRIVATE
 )
 
 if(WIN32)
+
+    # include contrib/periodic_resample as .so
+    #
+    set(tf_periodic_resample_srcs
+       "${tensorflow_source_dir}/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc"
+       "${tensorflow_source_dir}/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h"
+       "${tensorflow_source_dir}/tensorflow/contrib/periodic_resample/ops/array_ops.cc"
+    )
+
+    AddUserOps(TARGET _periodic_resample_op
+        SOURCES "${tf_periodic_resample_srcs}"
+        DEPENDS pywrap_tensorflow_internal tf_python_ops
+        DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/periodic_resample/python/ops/)
+
     # include contrib/nearest_neighbor as .so
     #
     set(tf_nearest_neighbor_srcs
diff --git a/tensorflow/contrib/cmake/tf_shared_lib.cmake b/tensorflow/contrib/cmake/tf_shared_lib.cmake
index d4099f32797e404cc2f3c16b95e18d6b91d13981..571d2b0decb5e9afcec2314f9837546f0974e90d 100644
--- a/tensorflow/contrib/cmake/tf_shared_lib.cmake
+++ b/tensorflow/contrib/cmake/tf_shared_lib.cmake
@@ -45,7 +45,7 @@ if(WIN32)
       $<TARGET_FILE:tensorflow_static>
       $<TARGET_FILE:tf_protos_cc>
   )
-    
+
   set(tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/tensorflow.def")
   set_source_files_properties(${tensorflow_deffile} PROPERTIES GENERATED TRUE)
 
diff --git a/tensorflow/contrib/cmake/tf_stream_executor.cmake b/tensorflow/contrib/cmake/tf_stream_executor.cmake
index 8d95f0d3e813885c581b37cfc0b89e24d04ae6b1..91ca33f4c4d5f6c822f45b0676e6e46d2e4c2860 100644
--- a/tensorflow/contrib/cmake/tf_stream_executor.cmake
+++ b/tensorflow/contrib/cmake/tf_stream_executor.cmake
@@ -61,18 +61,18 @@ file(GLOB tf_stream_executor_srcs
     "${tensorflow_source_dir}/tensorflow/stream_executor/platform/default/*.h"
 )
 
-if (tensorflow_ENABLE_GPU)    
+if (tensorflow_ENABLE_GPU)
     file(GLOB tf_stream_executor_gpu_srcs
         "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*.cc"
     )
     list(APPEND tf_stream_executor_srcs ${tf_stream_executor_gpu_srcs})
-endif()    
+endif()
 
 #file(GLOB_RECURSE tf_stream_executor_test_srcs
 #    "${tensorflow_source_dir}/tensorflow/stream_executor/*_test.cc"
 #    "${tensorflow_source_dir}/tensorflow/stream_executor/*_test.h"
 #)
-#list(REMOVE_ITEM tf_stream_executor_srcs ${tf_stream_executor_test_srcs}) 
+#list(REMOVE_ITEM tf_stream_executor_srcs ${tf_stream_executor_test_srcs})
 
 if (NOT WIN32)
   set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lgomp")
diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake
index 5d6ba9ca8d85e9a2d19b7f3e488822a8f21c6821..94ca4b00175dffb4461fca34c5ecd79ba79be778 100644
--- a/tensorflow/contrib/cmake/tf_tests.cmake
+++ b/tensorflow/contrib/cmake/tf_tests.cmake
@@ -139,12 +139,15 @@ if (tensorflow_BUILD_PYTHON_TESTS)
 
   file(GLOB_RECURSE tf_test_src_py
     ${tf_test_rnn_src_py}
+    "${tensorflow_source_dir}/tensorflow/python/data/kernel_tests/*.py"
     "${tensorflow_source_dir}/tensorflow/python/debug/cli/*_test.py"
     "${tensorflow_source_dir}/tensorflow/python/debug/lib/*_test.py"
     "${tensorflow_source_dir}/tensorflow/python/debug/wrappers/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/estimator/python/estimator/*_test.py"
     "${tensorflow_source_dir}/tensorflow/python/kernel_tests/*.py"
     "${tensorflow_source_dir}/tensorflow/python/meta_graph_transform/*_test.py"
+    "${tensorflow_source_dir}/tensorflow/python/ops/quantized_conv_ops_test.py"
+    "${tensorflow_source_dir}/tensorflow/python/ops/quantized_ops_test.py"
     "${tensorflow_source_dir}/tensorflow/python/platform/build_info_test.py"
     "${tensorflow_source_dir}/tensorflow/python/profiler/*_test.py"
     "${tensorflow_source_dir}/tensorflow/python/profiler/internal/*_test.py"
@@ -153,7 +156,8 @@ if (tensorflow_BUILD_PYTHON_TESTS)
     "${tensorflow_source_dir}/tensorflow/contrib/data/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/factorization/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/image/*_test.py"
-    "${tensorflow_source_dir}/tensorflow/contrib/keras/python/keras/integration_test.py"
+    "${tensorflow_source_dir}/tensorflow/python/keras/_impl/keras/*_test.py"
+    "${tensorflow_source_dir}/tensorflow/contrib/periodic_resample/python/kernel_tests/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/nearest_neighbor/python/kernel_tests/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/python/kernel_tests/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/stateless/python/kernel_tests/*_test.py"
@@ -171,7 +175,6 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       "${tensorflow_source_dir}/tensorflow/contrib/graph_editor/*_test.py"
       "${tensorflow_source_dir}/tensorflow/contrib/bayesflow/*_test.py"
       "${tensorflow_source_dir}/tensorflow/contrib/framework/*_test.py"
-      "${tensorflow_source_dir}/tensorflow/contrib/keras/*_test.py"
       "${tensorflow_source_dir}/tensorflow/contrib/distributions/*_test.py"
       "${tensorflow_source_dir}/tensorflow/contrib/learn/*_test.py"
     )
@@ -217,6 +220,7 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       # TFDBG grpc:// mode is not yet available on Windows.
       "${tensorflow_source_dir}/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py"
       "${tensorflow_source_dir}/tensorflow/python/debug/lib/session_debug_grpc_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/debug/lib/source_remote_test.py"
       # stl on windows handles overflows different
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/as_string_op_test.py"
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/string_to_number_op_test.py"
@@ -225,6 +229,10 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       # Numerical issues, calculations off.
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/concat_op_test.py"
       "${tensorflow_source_dir}/tensorflow/contrib/factorization/python/ops/wals_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/keras/_impl/keras/utils/data_utils_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/keras/_impl/keras/backend_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/keras/_impl/keras/preprocessing/image_test.py"
       # Float division by zero
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/benchmark_test.py"
       # Flaky, for unknown reasons. Cannot reproduce in terminal. Revisit once we can get stack traces.
@@ -233,11 +241,11 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       "${tensorflow_source_dir}/tensorflow/python/training/sync_replicas_optimizer_test.py"
       "${tensorflow_source_dir}/tensorflow/python/debug/lib/session_debug_grpc_test.py"
       "${tensorflow_source_dir}tensorflow/python/training/localhost_cluster_performance_test.py"
-      "${tensorflow_source_dir}/tensorflow/python/kernel_tests/iterator_ops_cluster_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py"
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/functional_ops_test.py"
       "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/iterator_ops_cluster_test.py"
       # Type error in testRemoteIteratorUsingRemoteCallOpDirectSessionGPUCPU.
-      "${tensorflow_source_dir}/tensorflow/python/kernel_tests/iterator_ops_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/data/kernel_tests/iterator_ops_test.py"
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py"
       "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py"
       # IteratorGetMax OutOfRangeError
@@ -261,9 +269,9 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/linalg_grad_test.py"  # cudaSolver handle creation fails.
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/array_ops_test.py"  # depends on python/framework/test_ops
       # Dataset tests
-      "${tensorflow_source_dir}/tensorflow/python/kernel_tests/dataset_constructor_op_test.py"  # Segfaults on windows
+      "${tensorflow_source_dir}/tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py"  # Segfaults on windows
       "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py"  # Segfaults on Windows.
-      "${tensorflow_source_dir}/tensorflow/python/kernel_tests/iterator_ops_cluster_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py"
       # Broken tensorboard test due to cmake issues.
       "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/iterator_ops_cluster_test.py"  # Needs portpicker
       "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/sloppy_transformation_dataset_op_test.py"  # b/65430561
@@ -294,6 +302,9 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       # Test should only be run manually
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/reduction_ops_test_big.py"
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/svd_op_test.py"
+      # Depends on python/framework/test_ops
+      "${tensorflow_source_dir}/tensorflow/python/kernel_tests/array_ops_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/kernel_tests/control_flow_util_test.py"
   )
   endif()
   list(REMOVE_ITEM tf_test_src_py ${tf_test_src_py_exclude})
diff --git a/tensorflow/contrib/copy_graph/__init__.py b/tensorflow/contrib/copy_graph/__init__.py
index 30a0aac140b576c501595fd6c8767b7dddde8e58..61ee39e4be1f0471309bb2672476dd9100cbfd49 100644
--- a/tensorflow/contrib/copy_graph/__init__.py
+++ b/tensorflow/contrib/copy_graph/__init__.py
@@ -13,8 +13,6 @@
 # limitations under the License.
 # ==============================================================================
 """Functions to copy elements between graphs.
-
-See the @{$python/contrib.copy_graph} guide.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/contrib/copy_graph/python/util/copy_elements.py b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
index 8c2528f548799f9facef740b0134ac56966b2b04..bae66ffd4289308f2cbfc730ec50d057b13923fb 100644
--- a/tensorflow/contrib/copy_graph/python/util/copy_elements.py
+++ b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
@@ -19,7 +19,7 @@ from one graph to another. The copied elements are initialized inside a
 user-specified scope in the other graph. There are separate functions to
 copy ops and variables.
 There is also a function to retrive the copied version of an op from the
-first graph inside a scope in the second graph. 
+first graph inside a scope in the second graph.
 
 @@copy_op_to_graph
 @@copy_variable_to_graph
@@ -225,7 +225,7 @@ def copy_op_to_graph(org_instance, to_graph, variables,
                            new_original_op,
                            op_def)
     #Use Graph's hidden methods to add the op
-    to_graph._add_op(new_op)
+    to_graph._add_op(new_op)  # pylint: disable=protected-access
     to_graph._record_op_seen_by_control_dependencies(new_op)
     for device_function in reversed(to_graph._device_function_stack):
       new_op._set_device(device_function(new_op))
diff --git a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
index 964ec754413f44d90c8e7e5e9358f82102f2cbcc..b47fb426a193e0fcc075deafae3eaab698f18ec9 100644
--- a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
+++ b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
@@ -32,27 +32,41 @@ from tensorflow.python.platform import test
 class CrfTest(test.TestCase):
 
   def testCrfSequenceScore(self):
-    inputs = np.array(
-        [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32)
-    tag_indices = np.array([1, 2, 1, 0], dtype=np.int32)
     transition_params = np.array(
         [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32)
-    sequence_lengths = np.array(3, dtype=np.int32)
-    with self.test_session() as sess:
-      sequence_score = crf.crf_sequence_score(
-          inputs=array_ops.expand_dims(inputs, 0),
-          tag_indices=array_ops.expand_dims(tag_indices, 0),
-          sequence_lengths=array_ops.expand_dims(sequence_lengths, 0),
-          transition_params=constant_op.constant(transition_params))
-      sequence_score = array_ops.squeeze(sequence_score, [0])
-      tf_sequence_score = sess.run(sequence_score)
-      expected_unary_score = sum(inputs[i][tag_indices[i]]
-                                 for i in range(sequence_lengths))
-      expected_binary_score = sum(
-          transition_params[tag_indices[i], tag_indices[i + 1]]
-          for i in range(sequence_lengths - 1))
-      expected_sequence_score = expected_unary_score + expected_binary_score
-      self.assertAllClose(tf_sequence_score, expected_sequence_score)
+    # Test both the length-1 and regular cases.
+    sequence_lengths_list = [
+        np.array(3, dtype=np.int32),
+        np.array(1, dtype=np.int32)
+    ]
+    inputs_list = [
+        np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]],
+                 dtype=np.float32),
+        np.array([[4, 5, -3]],
+                 dtype=np.float32),
+    ]
+    tag_indices_list = [
+        np.array([1, 2, 1, 0], dtype=np.int32),
+        np.array([1], dtype=np.int32)
+    ]
+    for sequence_lengths, inputs, tag_indices in zip(sequence_lengths_list,
+                                                     inputs_list,
+                                                     tag_indices_list):
+      with self.test_session() as sess:
+        sequence_score = crf.crf_sequence_score(
+            inputs=array_ops.expand_dims(inputs, 0),
+            tag_indices=array_ops.expand_dims(tag_indices, 0),
+            sequence_lengths=array_ops.expand_dims(sequence_lengths, 0),
+            transition_params=constant_op.constant(transition_params))
+        sequence_score = array_ops.squeeze(sequence_score, [0])
+        tf_sequence_score = sess.run(sequence_score)
+        expected_unary_score = sum(inputs[i][tag_indices[i]]
+                                   for i in range(sequence_lengths))
+        expected_binary_score = sum(
+            transition_params[tag_indices[i], tag_indices[i + 1]]
+            for i in range(sequence_lengths - 1))
+        expected_sequence_score = expected_unary_score + expected_binary_score
+        self.assertAllClose(tf_sequence_score, expected_sequence_score)
 
   def testCrfUnaryScore(self):
     inputs = np.array(
@@ -89,38 +103,54 @@ class CrfTest(test.TestCase):
       self.assertAllClose(tf_binary_score, expected_binary_score)
 
   def testCrfLogNorm(self):
-    inputs = np.array(
-        [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32)
     transition_params = np.array(
         [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32)
-    num_words = inputs.shape[0]
-    num_tags = inputs.shape[1]
-    sequence_lengths = np.array(3, dtype=np.int32)
-    with self.test_session() as sess:
-      all_sequence_scores = []
-
-      # Compare the dynamic program with brute force computation.
-      for tag_indices in itertools.product(
-          range(num_tags), repeat=sequence_lengths):
-        tag_indices = list(tag_indices)
-        tag_indices.extend([0] * (num_words - sequence_lengths))
-        all_sequence_scores.append(
-            crf.crf_sequence_score(
-                inputs=array_ops.expand_dims(inputs, 0),
-                tag_indices=array_ops.expand_dims(tag_indices, 0),
-                sequence_lengths=array_ops.expand_dims(sequence_lengths, 0),
-                transition_params=constant_op.constant(transition_params)))
-
-      brute_force_log_norm = math_ops.reduce_logsumexp(all_sequence_scores)
-      log_norm = crf.crf_log_norm(
-          inputs=array_ops.expand_dims(inputs, 0),
-          sequence_lengths=array_ops.expand_dims(sequence_lengths, 0),
-          transition_params=constant_op.constant(transition_params))
-      log_norm = array_ops.squeeze(log_norm, [0])
-      tf_brute_force_log_norm, tf_log_norm = sess.run(
-          [brute_force_log_norm, log_norm])
+    # Test both the length-1 and regular cases.
+    sequence_lengths_list = [
+        np.array(3, dtype=np.int32),
+        np.array(1, dtype=np.int32)
+    ]
+    inputs_list = [
+        np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]],
+                 dtype=np.float32),
+        np.array([[3, -1, 3]],
+                 dtype=np.float32),
+    ]
+    tag_indices_list = [
+        np.array([1, 2, 1, 0], dtype=np.int32),
+        np.array([2], dtype=np.int32)
+    ]
+
+    for sequence_lengths, inputs, tag_indices in zip(sequence_lengths_list,
+                                                     inputs_list,
+                                                     tag_indices_list):
+      num_words = inputs.shape[0]
+      num_tags = inputs.shape[1]
+      with self.test_session() as sess:
+        all_sequence_scores = []
+
+        # Compare the dynamic program with brute force computation.
+        for tag_indices in itertools.product(
+            range(num_tags), repeat=sequence_lengths):
+          tag_indices = list(tag_indices)
+          tag_indices.extend([0] * (num_words - sequence_lengths))
+          all_sequence_scores.append(
+              crf.crf_sequence_score(
+                  inputs=array_ops.expand_dims(inputs, 0),
+                  tag_indices=array_ops.expand_dims(tag_indices, 0),
+                  sequence_lengths=array_ops.expand_dims(sequence_lengths, 0),
+                  transition_params=constant_op.constant(transition_params)))
+
+        brute_force_log_norm = math_ops.reduce_logsumexp(all_sequence_scores)
+        log_norm = crf.crf_log_norm(
+            inputs=array_ops.expand_dims(inputs, 0),
+            sequence_lengths=array_ops.expand_dims(sequence_lengths, 0),
+            transition_params=constant_op.constant(transition_params))
+        log_norm = array_ops.squeeze(log_norm, [0])
+        tf_brute_force_log_norm, tf_log_norm = sess.run(
+            [brute_force_log_norm, log_norm])
 
-      self.assertAllClose(tf_log_norm, tf_brute_force_log_norm)
+        self.assertAllClose(tf_log_norm, tf_brute_force_log_norm)
 
   def testCrfLogLikelihood(self):
     inputs = np.array(
@@ -201,50 +231,66 @@ class CrfTest(test.TestCase):
                        expected_max_sequence[:sequence_lengths])
 
   def testCrfDecode(self):
-    inputs = np.array(
-        [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32)
     transition_params = np.array(
         [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32)
-    sequence_lengths = np.array(3, dtype=np.int32)
-    num_words = inputs.shape[0]
-    num_tags = inputs.shape[1]
+    # Test both the length-1 and regular cases.
+    sequence_lengths_list = [
+        np.array(3, dtype=np.int32),
+        np.array(1, dtype=np.int32)
+    ]
+    inputs_list = [
+        np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]],
+                 dtype=np.float32),
+        np.array([[-1, 2, 1]],
+                 dtype=np.float32),
+    ]
+    tag_indices_list = [
+        np.array([1, 2, 1, 0], dtype=np.int32),
+        np.array([2], dtype=np.int32)
+    ]
+
+    for sequence_lengths, inputs, tag_indices in zip(sequence_lengths_list,
+                                                     inputs_list,
+                                                     tag_indices_list):
+      num_words = inputs.shape[0]
+      num_tags = inputs.shape[1]
 
-    with self.test_session() as sess:
-      all_sequence_scores = []
-      all_sequences = []
-
-      # Compare the dynamic program with brute force computation.
-      for tag_indices in itertools.product(
-          range(num_tags), repeat=sequence_lengths):
-        tag_indices = list(tag_indices)
-        tag_indices.extend([0] * (num_words - sequence_lengths))
-        all_sequences.append(tag_indices)
-        sequence_score = crf.crf_sequence_score(
-            inputs=array_ops.expand_dims(inputs, 0),
-            tag_indices=array_ops.expand_dims(tag_indices, 0),
-            sequence_lengths=array_ops.expand_dims(sequence_lengths, 0),
-            transition_params=constant_op.constant(transition_params))
-        sequence_score = array_ops.squeeze(sequence_score, [0])
-        all_sequence_scores.append(sequence_score)
-
-      tf_all_sequence_scores = sess.run(all_sequence_scores)
-
-      expected_max_sequence_index = np.argmax(tf_all_sequence_scores)
-      expected_max_sequence = all_sequences[expected_max_sequence_index]
-      expected_max_score = tf_all_sequence_scores[expected_max_sequence_index]
-
-      actual_max_sequence, actual_max_score = crf.crf_decode(
-          array_ops.expand_dims(inputs, 0),
-          constant_op.constant(transition_params),
-          array_ops.expand_dims(sequence_lengths, 0))
-      actual_max_sequence = array_ops.squeeze(actual_max_sequence, [0])
-      actual_max_score = array_ops.squeeze(actual_max_score, [0])
-      tf_actual_max_sequence, tf_actual_max_score = sess.run(
-          [actual_max_sequence, actual_max_score])
-
-      self.assertAllClose(tf_actual_max_score, expected_max_score)
-      self.assertEqual(list(tf_actual_max_sequence[:sequence_lengths]),
-                       expected_max_sequence[:sequence_lengths])
+      with self.test_session() as sess:
+        all_sequence_scores = []
+        all_sequences = []
+
+        # Compare the dynamic program with brute force computation.
+        for tag_indices in itertools.product(
+            range(num_tags), repeat=sequence_lengths):
+          tag_indices = list(tag_indices)
+          tag_indices.extend([0] * (num_words - sequence_lengths))
+          all_sequences.append(tag_indices)
+          sequence_score = crf.crf_sequence_score(
+              inputs=array_ops.expand_dims(inputs, 0),
+              tag_indices=array_ops.expand_dims(tag_indices, 0),
+              sequence_lengths=array_ops.expand_dims(sequence_lengths, 0),
+              transition_params=constant_op.constant(transition_params))
+          sequence_score = array_ops.squeeze(sequence_score, [0])
+          all_sequence_scores.append(sequence_score)
+
+        tf_all_sequence_scores = sess.run(all_sequence_scores)
+
+        expected_max_sequence_index = np.argmax(tf_all_sequence_scores)
+        expected_max_sequence = all_sequences[expected_max_sequence_index]
+        expected_max_score = tf_all_sequence_scores[expected_max_sequence_index]
+
+        actual_max_sequence, actual_max_score = crf.crf_decode(
+            array_ops.expand_dims(inputs, 0),
+            constant_op.constant(transition_params),
+            array_ops.expand_dims(sequence_lengths, 0))
+        actual_max_sequence = array_ops.squeeze(actual_max_sequence, [0])
+        actual_max_score = array_ops.squeeze(actual_max_score, [0])
+        tf_actual_max_sequence, tf_actual_max_score = sess.run(
+            [actual_max_sequence, actual_max_score])
+
+        self.assertAllClose(tf_actual_max_score, expected_max_score)
+        self.assertEqual(list(tf_actual_max_sequence[:sequence_lengths]),
+                         expected_max_sequence[:sequence_lengths])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/crf/python/ops/crf.py b/tensorflow/contrib/crf/python/ops/crf.py
index 8b621732c1391feda011d21b175bc0b042b9eec7..7f5ae937b26f465076c6976429697c35924432e5 100644
--- a/tensorflow/contrib/crf/python/ops/crf.py
+++ b/tensorflow/contrib/crf/python/ops/crf.py
@@ -53,7 +53,9 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.layers import utils
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import rnn
@@ -101,12 +103,29 @@ def crf_sequence_score(inputs, tag_indices, sequence_lengths,
   Returns:
     sequence_scores: A [batch_size] vector of unnormalized sequence scores.
   """
-  # Compute the scores of the given tag sequence.
-  unary_scores = crf_unary_score(tag_indices, sequence_lengths, inputs)
-  binary_scores = crf_binary_score(tag_indices, sequence_lengths,
-                                   transition_params)
-  sequence_scores = unary_scores + binary_scores
-  return sequence_scores
+  # If max_seq_len is 1, we skip the score calculation and simply gather the
+  # unary potentials of the single tag.
+  def _single_seq_fn():
+    batch_size = array_ops.shape(inputs, out_type=tag_indices.dtype)[0]
+    example_inds = array_ops.reshape(
+        math_ops.range(batch_size, dtype=tag_indices.dtype), [-1, 1])
+    return array_ops.gather_nd(
+        array_ops.squeeze(inputs, [1]),
+        array_ops.concat([example_inds, tag_indices], axis=1))
+
+  def _multi_seq_fn():
+    # Compute the scores of the given tag sequence.
+    unary_scores = crf_unary_score(tag_indices, sequence_lengths, inputs)
+    binary_scores = crf_binary_score(tag_indices, sequence_lengths,
+                                     transition_params)
+    sequence_scores = unary_scores + binary_scores
+    return sequence_scores
+
+  return utils.smart_cond(
+      pred=math_ops.equal(inputs.shape[1].value or array_ops.shape(inputs)[1],
+                          1),
+      fn1=_single_seq_fn,
+      fn2=_multi_seq_fn)
 
 
 def crf_log_norm(inputs, sequence_lengths, transition_params):
@@ -124,19 +143,32 @@ def crf_log_norm(inputs, sequence_lengths, transition_params):
   # algorithm.
   first_input = array_ops.slice(inputs, [0, 0, 0], [-1, 1, -1])
   first_input = array_ops.squeeze(first_input, [1])
-  rest_of_input = array_ops.slice(inputs, [0, 1, 0], [-1, -1, -1])
 
-  # Compute the alpha values in the forward algorithm in order to get the
-  # partition function.
-  forward_cell = CrfForwardRnnCell(transition_params)
-  _, alphas = rnn.dynamic_rnn(
-      cell=forward_cell,
-      inputs=rest_of_input,
-      sequence_length=sequence_lengths - 1,
-      initial_state=first_input,
-      dtype=dtypes.float32)
-  log_norm = math_ops.reduce_logsumexp(alphas, [1])
-  return log_norm
+  # If max_seq_len is 1, we skip the algorithm and simply reduce_logsumexp over
+  # the "initial state" (the unary potentials).
+  def _single_seq_fn():
+    return math_ops.reduce_logsumexp(first_input, [1])
+
+  def _multi_seq_fn():
+    """Forward computation of alpha values."""
+    rest_of_input = array_ops.slice(inputs, [0, 1, 0], [-1, -1, -1])
+
+    # Compute the alpha values in the forward algorithm in order to get the
+    # partition function.
+    forward_cell = CrfForwardRnnCell(transition_params)
+    _, alphas = rnn.dynamic_rnn(
+        cell=forward_cell,
+        inputs=rest_of_input,
+        sequence_length=sequence_lengths - 1,
+        initial_state=first_input,
+        dtype=dtypes.float32)
+    log_norm = math_ops.reduce_logsumexp(alphas, [1])
+    return log_norm
+
+  max_seq_len = array_ops.shape(inputs)[1]
+  return control_flow_ops.cond(pred=math_ops.equal(max_seq_len, 1),
+                               true_fn=_single_seq_fn,
+                               false_fn=_multi_seq_fn)
 
 
 def crf_log_likelihood(inputs,
@@ -437,45 +469,64 @@ def crf_decode(potentials, transition_params, sequence_length):
     sequence_length: A [batch_size] vector of true sequence lengths.
 
   Returns:
-    decode_tags: A [batch_size, max_seq_len] tensor, with dtype tf.int32.
+    decode_tags: A [batch_size, max_seq_len] matrix, with dtype `tf.int32`.
                 Contains the highest scoring tag indices.
-    best_score: A [batch_size] tensor, containing the score of decode_tags.
+    best_score: A [batch_size] vector, containing the score of `decode_tags`.
   """
-  # For simplicity, in shape comments, denote:
-  # 'batch_size' by 'B', 'max_seq_len' by 'T' , 'num_tags' by 'O' (output).
-  num_tags = potentials.get_shape()[2].value
-
-  # Computes forward decoding. Get last score and backpointers.
-  crf_fwd_cell = CrfDecodeForwardRnnCell(transition_params)
-  initial_state = array_ops.slice(potentials, [0, 0, 0], [-1, 1, -1])
-  initial_state = array_ops.squeeze(initial_state, axis=[1])      # [B, O]
-  inputs = array_ops.slice(potentials, [0, 1, 0], [-1, -1, -1])   # [B, T-1, O]
-  backpointers, last_score = rnn.dynamic_rnn(
-      crf_fwd_cell,
-      inputs=inputs,
-      sequence_length=sequence_length - 1,
-      initial_state=initial_state,
-      time_major=False,
-      dtype=dtypes.int32)             # [B, T - 1, O], [B, O]
-  backpointers = gen_array_ops.reverse_sequence(
-      backpointers, sequence_length - 1, seq_dim=1)               # [B, T-1, O]
-
-  # Computes backward decoding. Extract tag indices from backpointers.
-  crf_bwd_cell = CrfDecodeBackwardRnnCell(num_tags)
-  initial_state = math_ops.cast(math_ops.argmax(last_score, axis=1),
-                                dtype=dtypes.int32)               # [B]
-  initial_state = array_ops.expand_dims(initial_state, axis=-1)   # [B, 1]
-  decode_tags, _ = rnn.dynamic_rnn(
-      crf_bwd_cell,
-      inputs=backpointers,
-      sequence_length=sequence_length - 1,
-      initial_state=initial_state,
-      time_major=False,
-      dtype=dtypes.int32)           # [B, T - 1, 1]
-  decode_tags = array_ops.squeeze(decode_tags, axis=[2])           # [B, T - 1]
-  decode_tags = array_ops.concat([initial_state, decode_tags], axis=1)  # [B, T]
-  decode_tags = gen_array_ops.reverse_sequence(
-      decode_tags, sequence_length, seq_dim=1)                     # [B, T]
-
-  best_score = math_ops.reduce_max(last_score, axis=1)             # [B]
-  return decode_tags, best_score
+  # If max_seq_len is 1, we skip the algorithm and simply return the argmax tag
+  # and the max activation.
+  def _single_seq_fn():
+    squeezed_potentials = array_ops.squeeze(potentials, [1])
+    decode_tags = array_ops.expand_dims(
+        math_ops.argmax(squeezed_potentials, axis=1), 1)
+    best_score = math_ops.reduce_max(squeezed_potentials, axis=1)
+    return math_ops.cast(decode_tags, dtype=dtypes.int32), best_score
+
+  def _multi_seq_fn():
+    """Decoding of highest scoring sequence."""
+
+    # For simplicity, in shape comments, denote:
+    # 'batch_size' by 'B', 'max_seq_len' by 'T' , 'num_tags' by 'O' (output).
+    num_tags = potentials.get_shape()[2].value
+
+    # Computes forward decoding. Get last score and backpointers.
+    crf_fwd_cell = CrfDecodeForwardRnnCell(transition_params)
+    initial_state = array_ops.slice(potentials, [0, 0, 0], [-1, 1, -1])
+    initial_state = array_ops.squeeze(initial_state, axis=[1])  # [B, O]
+    inputs = array_ops.slice(potentials, [0, 1, 0], [-1, -1, -1])  # [B, T-1, O]
+    backpointers, last_score = rnn.dynamic_rnn(  # [B, T - 1, O], [B, O]
+        crf_fwd_cell,
+        inputs=inputs,
+        sequence_length=sequence_length - 1,
+        initial_state=initial_state,
+        time_major=False,
+        dtype=dtypes.int32)
+    backpointers = gen_array_ops.reverse_sequence(  # [B, T - 1, O]
+        backpointers, sequence_length - 1, seq_dim=1)
+
+    # Computes backward decoding. Extract tag indices from backpointers.
+    crf_bwd_cell = CrfDecodeBackwardRnnCell(num_tags)
+    initial_state = math_ops.cast(math_ops.argmax(last_score, axis=1),  # [B]
+                                  dtype=dtypes.int32)
+    initial_state = array_ops.expand_dims(initial_state, axis=-1)  # [B, 1]
+    decode_tags, _ = rnn.dynamic_rnn(  # [B, T - 1, 1]
+        crf_bwd_cell,
+        inputs=backpointers,
+        sequence_length=sequence_length - 1,
+        initial_state=initial_state,
+        time_major=False,
+        dtype=dtypes.int32)
+    decode_tags = array_ops.squeeze(decode_tags, axis=[2])  # [B, T - 1]
+    decode_tags = array_ops.concat([initial_state, decode_tags],   # [B, T]
+                                   axis=1)
+    decode_tags = gen_array_ops.reverse_sequence(  # [B, T]
+        decode_tags, sequence_length, seq_dim=1)
+
+    best_score = math_ops.reduce_max(last_score, axis=1)  # [B]
+    return decode_tags, best_score
+
+  return utils.smart_cond(
+      pred=math_ops.equal(
+          potentials.shape[1].value or array_ops.shape(potentials)[1], 1),
+      fn1=_single_seq_fn,
+      fn2=_multi_seq_fn)
diff --git a/tensorflow/contrib/cudnn_rnn/BUILD b/tensorflow/contrib/cudnn_rnn/BUILD
index fce2c03e69bc4b8b0ac46b8e081a33c43c9d41ab..0751624bc4b7fbf413c342db3e5c440c9d572cd4 100644
--- a/tensorflow/contrib/cudnn_rnn/BUILD
+++ b/tensorflow/contrib/cudnn_rnn/BUILD
@@ -146,10 +146,10 @@ cuda_py_test(
 
 cuda_py_test(
     name = "cudnn_rnn_ops_benchmark",
-    size = "large",
+    size = "small",
     srcs = ["python/kernel_tests/cudnn_rnn_ops_benchmark.py"],
     additional_deps = [
-        ":cudnn_rnn_ops_py",
+        ":cudnn_rnn_py",
         "//tensorflow/contrib/rnn:rnn_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
@@ -164,7 +164,6 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     tags = [
-        "manual",
         "noasan",  # http://b/62067814
         "nomsan",
         "notsan",
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py
index ff409ac71826f1f0f57e9133d768003f849abc09..4fc5ff1bd1887c4532e95fcf0e791d72b20471b0 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import time
 
+from tensorflow.contrib import rnn as contrib_rnn
 from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
-from tensorflow.contrib.rnn.python.ops import core_rnn
 from tensorflow.contrib.rnn.python.ops import lstm_ops
 from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
@@ -29,8 +29,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import rnn
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
@@ -44,19 +43,19 @@ class CudnnRNNBenchmark(test.Benchmark):
         "large": {
             "num_layers": 4,
             "num_units": 1024,
-            "seq_length": 40,
+            "seq_length": 50,
             "batch_size": 64,
         },
         "medium": {
             "num_layers": 4,
             "num_units": 512,
-            "seq_length": 30,
+            "seq_length": 50,
             "batch_size": 64,
         },
         "small": {
             "num_layers": 4,
             "num_units": 128,
-            "seq_length": 20,
+            "seq_length": 50,
             "batch_size": 64,
         },
     }
@@ -71,7 +70,7 @@ class CudnnRNNBenchmark(test.Benchmark):
 
   def _BenchmarkOp(self, op, desc):
     burn_in_steps = 10
-    benchmark_steps = 40
+    benchmark_steps = 20
     with session.Session() as sess:
       sess.run(variables.global_variables_initializer())
       for i in xrange(burn_in_steps + benchmark_steps):
@@ -126,16 +125,12 @@ class CudnnRNNBenchmark(test.Benchmark):
       seq_length = config["seq_length"]
 
       with ops.Graph().as_default(), ops.device("/device:GPU:0"):
-        inputs = seq_length * [
-            array_ops.zeros([batch_size, num_units], dtypes.float32)
-        ]
-        initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=127)
-
-        cell = rnn_cell.LSTMCell(
-            num_units=num_units, initializer=initializer, state_is_tuple=True)
-        multi_cell = rnn_cell.MultiRNNCell(
-            [cell() for _ in range(num_layers)])
-        outputs, final_state = core_rnn.static_rnn(
+        inputs = array_ops.zeros([batch_size, seq_length, num_units],
+                                 dtypes.float32)
+
+        multi_cell = contrib_rnn.MultiRNNCell(
+            [contrib_rnn.BasicLSTMCell(num_units) for _ in range(num_layers)])
+        outputs, final_state = rnn.dynamic_rnn(
             multi_cell, inputs, dtype=dtypes.float32)
         trainable_variables = ops.get_collection(
             ops.GraphKeys.TRAINABLE_VARIABLES)
@@ -154,14 +149,12 @@ class CudnnRNNBenchmark(test.Benchmark):
       seq_length = config["seq_length"]
 
       with ops.Graph().as_default(), ops.device("/device:GPU:0"):
-        inputs = seq_length * [
-            array_ops.zeros([batch_size, num_units], dtypes.float32)
-        ]
-        cell = lambda: lstm_ops.LSTMBlockCell(num_units=num_units)  # pylint: disable=cell-var-from-loop
-
-        multi_cell = rnn_cell.MultiRNNCell(
-            [cell() for _ in range(num_layers)])
-        outputs, final_state = core_rnn.static_rnn(
+        inputs = array_ops.zeros([batch_size, seq_length, num_units],
+                                 dtypes.float32)
+
+        multi_cell = contrib_rnn.MultiRNNCell(
+            [lstm_ops.LSTMBlockCell(num_units) for _ in range(num_layers)])
+        outputs, final_state = rnn.dynamic_rnn(
             multi_cell, inputs, dtype=dtypes.float32)
         trainable_variables = ops.get_collection(
             ops.GraphKeys.TRAINABLE_VARIABLES)
diff --git a/tensorflow/contrib/data/BUILD b/tensorflow/contrib/data/BUILD
index f7d8a084d9c12c05c411ae0751854d1823a818ec..3b1c33063f1214b68f79560f50d56bf5d31c9560 100644
--- a/tensorflow/contrib/data/BUILD
+++ b/tensorflow/contrib/data/BUILD
@@ -18,6 +18,7 @@ py_library(
         "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/contrib/data/python/ops:iterator_ops",
         "//tensorflow/contrib/data/python/ops:readers",
+        "//tensorflow/contrib/data/python/ops:shuffle_ops",
         "//tensorflow/contrib/data/python/ops:transformation_ops",
         "//tensorflow/python:util",
         "//tensorflow/python/data/ops:iterator_ops",
diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py
index 6e43ae0e6320fa237435b837780ec8aea941872b..c9ad091bd44d6e3a9368e182c3df9fc1c6e48071 100644
--- a/tensorflow/contrib/data/__init__.py
+++ b/tensorflow/contrib/data/__init__.py
@@ -17,6 +17,7 @@
 See the @{$datasets$Importing Data} Programmer's Guide for an overview.
 
 @@Dataset
+@@Counter
 @@Iterator
 @@TFRecordDataset
 @@FixedLengthRecordDataset
@@ -33,6 +34,7 @@ See the @{$datasets$Importing Data} Programmer's Guide for an overview.
 @@unbatch
 @@parallel_interleave
 @@rejection_resample
+@@scan
 @@sloppy_interleave
 
 @@get_single_element
@@ -48,6 +50,7 @@ from tensorflow.contrib.data.python.ops.batching import batch_and_drop_remainder
 from tensorflow.contrib.data.python.ops.batching import dense_to_sparse_batch
 from tensorflow.contrib.data.python.ops.batching import padded_batch_and_drop_remainder
 from tensorflow.contrib.data.python.ops.batching import unbatch
+from tensorflow.contrib.data.python.ops.counter import Counter
 from tensorflow.contrib.data.python.ops.dataset_ops import Dataset
 from tensorflow.contrib.data.python.ops.dataset_ops import get_single_element
 from tensorflow.contrib.data.python.ops.enumerate_ops import enumerate_dataset
@@ -62,6 +65,8 @@ from tensorflow.contrib.data.python.ops.readers import SqlDataset
 from tensorflow.contrib.data.python.ops.readers import TextLineDataset
 from tensorflow.contrib.data.python.ops.readers import TFRecordDataset
 from tensorflow.contrib.data.python.ops.resampling import rejection_resample
+from tensorflow.contrib.data.python.ops.scan_ops import scan
+from tensorflow.contrib.data.python.ops.shuffle_ops import shuffle_and_repeat
 from tensorflow.python.data.ops.iterator_ops import Iterator
 # pylint: enable=unused-import
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 3d4e46408e20a7c7c39c2601458b237b18676b72..d5ad14532780ff6b0cc40ae5a206c50ca70750ba 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -4,7 +4,7 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "py_test", "tf_py_test")
 
 py_test(
     name = "batch_dataset_op_test",
@@ -110,6 +110,7 @@ py_test(
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:session",
         "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_shape",
         "//tensorflow/python/data/util:nest",
         "//third_party/py/numpy",
     ],
@@ -131,6 +132,8 @@ py_library(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/data/ops:iterator_ops",
         "//third_party/py/numpy",
     ],
 )
@@ -140,7 +143,9 @@ py_test(
     size = "small",
     srcs = ["filter_dataset_op_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_pip"],
     deps = [
+        ":dataset_serialization_test",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -152,21 +157,28 @@ py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "flat_map_dataset_op_test",
     size = "small",
     srcs = ["flat_map_dataset_op_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
+        ":dataset_serialization_test",
+        "//third_party/py/numpy",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+        "//tensorflow/python:function",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
         "//tensorflow/python:session",
         "//tensorflow/python:training",
-        "//third_party/py/numpy",
+        "//tensorflow/python:variable_scope",
     ],
+    grpc_enabled = True,
+    tags = ["no_pip"],
 )
 
 py_test(
@@ -178,6 +190,7 @@ py_test(
         "manual",  # b/67958761
     ],
     deps = [
+        ":dataset_serialization_test",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/contrib/data/python/ops:transformation_ops",
         "//tensorflow/python:array_ops",
@@ -194,13 +207,11 @@ py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "iterator_ops_cluster_test",
     size = "small",
     srcs = ["iterator_ops_cluster_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
+    additional_deps = [
         "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -214,14 +225,19 @@ py_test(
         "//tensorflow/python:session",
         "//tensorflow/python/data/ops:iterator_ops",
     ],
+    grpc_enabled = True,
+    tags = [
+        "no_windows",
+        "oss_serial",
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "iterator_ops_test",
     size = "small",
     srcs = ["iterator_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
+        "//third_party/py/numpy",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/contrib/data/python/ops:readers",
         "//tensorflow/core:protos_all_py",
@@ -243,8 +259,8 @@ py_test(
         "//tensorflow/python:session",
         "//tensorflow/python:training",
         "//tensorflow/python/data/ops:iterator_ops",
-        "//third_party/py/numpy",
     ],
+    grpc_enabled = True,
 )
 
 py_test(
@@ -264,12 +280,13 @@ py_test(
 
 py_test(
     name = "map_dataset_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["map_dataset_op_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_pip"],
     deps = [
+        ":dataset_serialization_test",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:iterator_ops",
         "//tensorflow/contrib/data/python/ops:transformation_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -278,23 +295,35 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:function",
         "//tensorflow/python:functional_ops",
         "//tensorflow/python:io_ops",
         "//tensorflow/python:lookup_ops",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:script_ops",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
-        "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/data/ops:iterator_ops",
         "//third_party/py/numpy",
     ],
 )
 
+py_test(
+    name = "prefetch_dataset_op_test",
+    size = "small",
+    srcs = ["prefetch_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":dataset_serialization_test",
+        "//tensorflow/python:platform",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
 py_test(
     name = "range_dataset_op_test",
     size = "small",
@@ -323,9 +352,10 @@ py_test(
 
 py_test(
     name = "reader_dataset_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["reader_dataset_ops_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
         "//tensorflow/contrib/data/python/ops:readers",
@@ -362,8 +392,25 @@ py_test(
 )
 
 py_test(
-    name = "sequence_dataset_op_test",
+    name = "scan_dataset_op_test",
     size = "small",
+    srcs = ["scan_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "sequence_dataset_op_test",
+    size = "medium",
     srcs = ["sequence_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
@@ -392,12 +439,15 @@ py_test(
 
 py_test(
     name = "shuffle_dataset_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["shuffle_dataset_op_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_pip"],
     deps = [
+        ":dataset_serialization_test",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/contrib/data/python/ops:iterator_ops",
+        "//tensorflow/contrib/data/python/ops:shuffle_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -426,6 +476,21 @@ py_test(
     ],
 )
 
+py_test(
+    name = "stats_dataset_ops_test",
+    size = "small",
+    srcs = ["stats_dataset_ops_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":dataset_serialization_test",
+        "//tensorflow/contrib/data/python/ops:dataset_ops",
+        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+    ],
+)
+
 py_test(
     name = "zip_dataset_op_test",
     size = "small",
diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
index 09416f8302842355da438aa35747bdc178ed5f4f..506eefbef0204284a103827180c13b13200a3f93 100644
--- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
@@ -104,14 +104,58 @@ class BatchDatasetTest(test.TestCase):
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(init_op, feed_dict={count: 14, batch_size: 0})
 
-  def testBatchSparseError(self):
+  def assertSparseValuesEqual(self, a, b):
+    self.assertAllEqual(a.indices, b.indices)
+    self.assertAllEqual(a.values, b.values)
+    self.assertAllEqual(a.dense_shape, b.dense_shape)
 
-    def _map_fn(i):
-      return sparse_tensor.SparseTensor(
-          indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1]), i
+  def testBatchSparse(self):
 
-    with self.assertRaises(TypeError):
-      _ = dataset_ops.Dataset.range(10).map(_map_fn).batch(10)
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+    iterator = dataset_ops.Dataset.range(10).map(_sparse).batch(
+        5).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(2):
+        actual = sess.run(get_next)
+        expected = sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
+            values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
+            dense_shape=[5, 1])
+        self.assertTrue(sparse_tensor.is_sparse(actual))
+        self.assertSparseValuesEqual(actual, expected)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testNestedBatchSparse(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+    iterator = dataset_ops.Dataset.range(10).map(_sparse).batch(5).batch(
+        2).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      actual = sess.run(get_next)
+      expected = sparse_tensor.SparseTensorValue(
+          indices=[[0, 0, 0], [0, 1, 0], [0, 2, 0], [0, 3, 0], [0, 4, 0],
+                   [1, 0, 0], [1, 1, 0], [1, 2, 0], [1, 3, 0], [1, 4, 0]],
+          values=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+          dense_shape=[2, 5, 1])
+      self.assertTrue(sparse_tensor.is_sparse(actual))
+      self.assertSparseValuesEqual(actual, expected)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
 
   def testPaddedBatchDataset(self):
     seq_lens = array_ops.placeholder(dtypes.int32, shape=[None])
@@ -250,7 +294,7 @@ class BatchDatasetTest(test.TestCase):
   def testPaddedBatchSparseError(self):
 
     def _map_fn(i):
-      return sparse_tensor.SparseTensor(
+      return sparse_tensor.SparseTensorValue(
           indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1]), i
 
     with self.assertRaises(TypeError):
@@ -438,6 +482,30 @@ class BatchDatasetTest(test.TestCase):
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(next_element)
 
+  def testBatchAndDropRemainderSparse(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+    iterator = dataset_ops.Dataset.range(12).map(_sparse).apply(
+        batching.batch_and_drop_remainder(5)).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(2):
+        actual = sess.run(get_next)
+        expected = sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
+            values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
+            dense_shape=[5, 1])
+        self.assertTrue(sparse_tensor.is_sparse(actual))
+        self.assertSparseValuesEqual(actual, expected)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
   def testPaddedBatchAndDropRemainder(self):
     els = []
     for length in [3, 6, 9, 4, 12, 10, 2]:
@@ -474,6 +542,16 @@ class BatchDatasetTest(test.TestCase):
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(next_element)
 
+  def testPaddedBatchAndDropRemainderSparseError(self):
+
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1]), i
+
+    with self.assertRaises(TypeError):
+      _ = dataset_ops.Dataset.range(10).map(_map_fn).apply(
+          batching.padded_batch_and_drop_remainder(5))
+
   def testBatchAndDropRemainderShapeInference(self):
     components = (array_ops.placeholder(dtypes.int32),
                   (array_ops.placeholder(dtypes.int32, shape=[None]),
@@ -499,17 +577,7 @@ class BatchDatasetTest(test.TestCase):
     self.assertEqual([None], dataset.output_shapes[1][0].as_list())
     self.assertEqual([None, 30], dataset.output_shapes[1][1].as_list())
 
-  def testBatchAndDropRemainderSparseError(self):
-
-    def _map_fn(i):
-      return sparse_tensor.SparseTensor(
-          indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1]), i
-
-    with self.assertRaises(TypeError):
-      _ = dataset_ops.Dataset.range(10).map(_map_fn).apply(
-          batching.batch_and_drop_remainder(10))
-
-  def testBatchAndMapDataset(self):
+  def _testBatchAndMapDatasetHelper(self, num_parallel_batches=1):
     """Test a dataset that maps a TF function across its input elements."""
     # The pipeline is TensorSliceDataset ->
     # RepeatDataset(count) -> BatchAndMapDataset(square_3, batch_size).
@@ -525,7 +593,10 @@ class BatchDatasetTest(test.TestCase):
 
     iterator = (
         dataset_ops.Dataset.from_tensor_slices(components).repeat(count).apply(
-            batching.map_and_batch(_map_fn, batch_size))
+            batching.map_and_batch(
+                map_func=_map_fn,
+                batch_size=batch_size,
+                num_parallel_batches=num_parallel_batches))
         .make_initializable_iterator())
     init_op = iterator.initializer
     get_next = iterator.get_next()
@@ -559,7 +630,11 @@ class BatchDatasetTest(test.TestCase):
           for j in range(8):
             self.assertAllEqual(component[(i * 8 + j) % 7]**2,
                                 result_component[j])
-      # The last batch should fail with `OutOfRange`.
+      result = sess.run(get_next)
+      for component, result_component in zip(components, result):
+        for j in range((14 * 7) % 8):
+          self.assertAllEqual(component[((num_batches - 1) * 8 + j) % 7]**2,
+                              result_component[j])
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -572,6 +647,36 @@ class BatchDatasetTest(test.TestCase):
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(init_op, feed_dict={count: 14, batch_size: 0})
 
+  def testBatchAndMapDataset(self):
+    return self._testBatchAndMapDatasetHelper()
+
+  def testBatchAndMapDatasetWithParallelBatching(self):
+    return self._testBatchAndMapDatasetHelper(num_parallel_batches=10)
+
+  def testMapAndBatchSparse(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+    iterator = dataset_ops.Dataset.range(10).apply(
+        batching.map_and_batch(_sparse, 5)).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(2):
+        actual = sess.run(get_next)
+        expected = sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
+            values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
+            dense_shape=[5, 1])
+        self.assertTrue(sparse_tensor.is_sparse(actual))
+        self.assertSparseValuesEqual(actual, expected)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
   def testBatchAndMapDatasetFails(self):
     """Test a dataset that maps a TF function across its input elements."""
     dataset = dataset_ops.Dataset.from_tensors(
@@ -631,5 +736,41 @@ class BatchDatasetSerializationTest(
         num_outputs)
 
 
+class PaddedBatchDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def testPaddedBatch(self):
+
+    def build_dataset(seq_lens):
+      return dataset_ops.Dataset.from_tensor_slices(seq_lens).map(
+          lambda x: array_ops.fill([x], x)).padded_batch(
+              4, padded_shapes=[-1])
+
+    seq_lens1 = np.random.randint(1, 20, size=(32,)).astype(np.int32)
+    seq_lens2 = np.random.randint(21, 40, size=(32,)).astype(np.int32)
+    self.run_core_tests(lambda: build_dataset(seq_lens1),
+                        lambda: build_dataset(seq_lens2), 8)
+
+  def testPaddedBatchNonDefaultPadding(self):
+
+    def build_dataset(seq_lens):
+
+      def fill_tuple(x):
+        filled = array_ops.fill([x], x)
+        return (filled, string_ops.as_string(filled))
+
+      padded_shape = [-1]
+      return dataset_ops.Dataset.from_tensor_slices(seq_lens).map(
+          fill_tuple).padded_batch(
+              4,
+              padded_shapes=(padded_shape, padded_shape),
+              padding_values=(-1, "<end>"))
+
+    seq_lens1 = np.random.randint(1, 20, size=(32,)).astype(np.int32)
+    seq_lens2 = np.random.randint(21, 40, size=(32,)).astype(np.int32)
+    self.run_core_tests(lambda: build_dataset(seq_lens1),
+                        lambda: build_dataset(seq_lens2), 8)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py b/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py
index 0f1c8838ca111c7674fa4f7b16a8a5f6590281f4..55a1d3b95b212466b262ad3c26f1efd7ed0e067e 100644
--- a/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -39,7 +40,7 @@ from tensorflow.python.platform import test
 
 class DatasetConstructorTest(test.TestCase):
 
-  def testTensorDataset(self):
+  def testFromTensors(self):
     """Test an dataset that represents a single tuple of tensors."""
     components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
 
@@ -59,7 +60,75 @@ class DatasetConstructorTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-  def testTensorSliceDataset(self):
+  def assertSparseValuesEqual(self, a, b):
+    self.assertAllEqual(a.indices, b.indices)
+    self.assertAllEqual(a.values, b.values)
+    self.assertAllEqual(a.dense_shape, b.dense_shape)
+
+  def testFromTensorsSparse(self):
+    """Test an dataset that represents a single tuple of tensors."""
+    components = (sparse_tensor.SparseTensorValue(
+        indices=np.array([[0]]),
+        values=np.array([0]),
+        dense_shape=np.array([1])),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 1]]),
+                      values=np.array([-1, 1]),
+                      dense_shape=np.array([2, 2])))
+
+    iterator = (
+        dataset_ops.Dataset.from_tensors(components)
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual(
+        [tensor_shape.TensorShape(c.dense_shape) for c in components],
+        [shape for shape in iterator.output_shapes])
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      results = sess.run(get_next)
+      for component, result_component in zip(components, results):
+        self.assertSparseValuesEqual(component, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testFromTensorsMixed(self):
+    """Test an dataset that represents a single tuple of tensors."""
+    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0]]),
+                      values=np.array([0]),
+                      dense_shape=np.array([1])),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 1]]),
+                      values=np.array([-1, 1]),
+                      dense_shape=np.array([2, 2])))
+
+    iterator = (
+        dataset_ops.Dataset.from_tensors(components)
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual([
+        tensor_shape.TensorShape(c.dense_shape)
+        if sparse_tensor.is_sparse(c) else c.shape for c in components
+    ], [shape for shape in iterator.output_shapes])
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      results = sess.run(get_next)
+      for component, result_component in zip(components, results):
+        if sparse_tensor.is_sparse(component):
+          self.assertSparseValuesEqual(component, result_component)
+        else:
+          self.assertAllEqual(component, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testFromTensorSlices(self):
     """Test an dataset that represents the slices from a tuple of tensors."""
     components = (
         np.tile(np.array([[1], [2], [3], [4]]), 20), np.tile(
@@ -84,7 +153,127 @@ class DatasetConstructorTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-  def testTensorSliceDatasetWithDict(self):
+  def testFromTensorSlicesSparse(self):
+    """Test an dataset that represents the slices from a tuple of tensors."""
+    components = (sparse_tensor.SparseTensorValue(
+        indices=np.array([[0, 0], [1, 0], [2, 0]]),
+        values=np.array([0, 0, 0]),
+        dense_shape=np.array([3, 1])),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 1], [2, 2]]),
+                      values=np.array([1, 2, 3]),
+                      dense_shape=np.array([3, 3])))
+
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual(
+        [tensor_shape.TensorShape(c.dense_shape[1:]) for c in components],
+        [shape for shape in iterator.output_shapes])
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      expected = [
+          (sparse_tensor.SparseTensorValue(
+              indices=np.array([[0]]),
+              values=np.array([0]),
+              dense_shape=np.array([1])),
+           sparse_tensor.SparseTensorValue(
+               indices=np.array([[0]]),
+               values=np.array([1]),
+               dense_shape=np.array([3]))),
+          (sparse_tensor.SparseTensorValue(
+              indices=np.array([[0]]),
+              values=np.array([0]),
+              dense_shape=np.array([1])),
+           sparse_tensor.SparseTensorValue(
+               indices=np.array([[1]]),
+               values=np.array([2]),
+               dense_shape=np.array([3]))),
+          (sparse_tensor.SparseTensorValue(
+              indices=np.array([[0]]),
+              values=np.array([0]),
+              dense_shape=np.array([1])),
+           sparse_tensor.SparseTensorValue(
+               indices=np.array([[2]]),
+               values=np.array([3]),
+               dense_shape=np.array([3]))),
+      ]
+      for i in range(3):
+        results = sess.run(get_next)
+        for component, result_component in zip(expected[i], results):
+          self.assertSparseValuesEqual(component, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testFromTensorSlicesMixed(self):
+    """Test an dataset that represents the slices from a tuple of tensors."""
+    components = (np.tile(np.array([[1], [2], [3]]), 20),
+                  np.tile(np.array([[12], [13], [14]]), 22),
+                  np.array([37.0, 38.0, 39.0]),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 0], [2, 0]]),
+                      values=np.array([0, 0, 0]),
+                      dense_shape=np.array([3, 1])),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 1], [2, 2]]),
+                      values=np.array([1, 2, 3]),
+                      dense_shape=np.array([3, 3])))
+
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual([
+        tensor_shape.TensorShape(c.dense_shape[1:])
+        if sparse_tensor.is_sparse(c) else c.shape[1:] for c in components
+    ], [shape for shape in iterator.output_shapes])
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      expected = [
+          (sparse_tensor.SparseTensorValue(
+              indices=np.array([[0]]),
+              values=np.array([0]),
+              dense_shape=np.array([1])),
+           sparse_tensor.SparseTensorValue(
+               indices=np.array([[0]]),
+               values=np.array([1]),
+               dense_shape=np.array([3]))),
+          (sparse_tensor.SparseTensorValue(
+              indices=np.array([[0]]),
+              values=np.array([0]),
+              dense_shape=np.array([1])),
+           sparse_tensor.SparseTensorValue(
+               indices=np.array([[1]]),
+               values=np.array([2]),
+               dense_shape=np.array([3]))),
+          (sparse_tensor.SparseTensorValue(
+              indices=np.array([[0]]),
+              values=np.array([0]),
+              dense_shape=np.array([1])),
+           sparse_tensor.SparseTensorValue(
+               indices=np.array([[2]]),
+               values=np.array([3]),
+               dense_shape=np.array([3]))),
+      ]
+      for i in range(3):
+        results = sess.run(get_next)
+        for component, result_component in zip(
+            (zip(*components[:3])[i] + expected[i]), results):
+          if sparse_tensor.is_sparse(component):
+            self.assertSparseValuesEqual(component, result_component)
+          else:
+            self.assertAllEqual(component, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testFromTensorSlicesWithDict(self):
     components = {"foo": [1, 2, 3], "bar": [[4.0], [5.0], [6.0]]}
     iterator = (dataset_ops.Dataset.from_tensor_slices(components)
                 .make_initializable_iterator())
@@ -105,7 +294,7 @@ class DatasetConstructorTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-  def testSparseTensorSliceDataset(self):
+  def testFromSparseTensorSlices(self):
     """Test a dataset based on slices of a `tf.SparseTensor`."""
     st = array_ops.sparse_placeholder(dtypes.float64)
     iterator = (dataset_ops.Dataset.from_sparse_tensor_slices(st)
diff --git a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
index 0a9e99fd99eaff03ae242ca6cf9cc5e231da3038..bf25cc60a1c0efc09bed6501fd2d6f4ccb07764b 100644
--- a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
+++ b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
@@ -23,9 +23,11 @@ import os
 import numpy as np
 
 from tensorflow.contrib.data.python.ops import iterator_ops as contrib_iterator_ops
+from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.training import saver as saver_lib
@@ -63,6 +65,8 @@ class DatasetSerializationTestBase(test.TestCase):
         ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
     self.verify_reset_restored_iterator(
         ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+    self.verify_restore_in_empty_graph(
+        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
     if ds_fn2:
       self.verify_restore_in_modified_graph(
           ds_fn1, ds_fn2, num_outputs, sparse_tensors=sparse_tensors)
@@ -229,6 +233,7 @@ class DatasetSerializationTestBase(test.TestCase):
           ds_fn, sparse_tensors=sparse_tensors)
       with self.test_session(graph=g) as sess:
         self._restore(saver, sess)
+        sess.run(variables.global_variables_initializer())
         sess.run(init_op)
         for _ in range(num_outputs):
           actual.append(sess.run(get_next_op))
@@ -299,6 +304,97 @@ class DatasetSerializationTestBase(test.TestCase):
 
     self.match(expected, actual)
 
+  def verify_restore_in_empty_graph(self,
+                                    ds_fn,
+                                    num_outputs,
+                                    break_point=None,
+                                    sparse_tensors=False,
+                                    verify_exhausted=True):
+    """Attempts to restore an iterator in an empty graph.
+
+    Builds an input pipeline using ds_fn, runs it for `break_point` steps
+    and saves a checkpoint. Then builds a new empty graph, restores
+    the checkpoint from ds_fn and verifies that the restore is successful.
+
+    Args:
+      ds_fn: See `run_core_tests`.
+      num_outputs: See `run_core_tests`.
+      break_point: Break point. Optional. Defaults to num_outputs/2.
+      sparse_tensors: See `run_core_tests`.
+      verify_exhausted: See `gen_outputs`.
+
+    Raises:
+      AssertionError if any test fails.
+    """
+    break_point = num_outputs // 2 if not break_point else break_point
+
+    # Skip `break_point` items and store the remaining produced from ds_fn
+    # in `expected`.
+    self.gen_outputs(
+        ds_fn, [],
+        break_point,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=False)
+    expected = self.gen_outputs(
+        ds_fn, [],
+        num_outputs - break_point,
+        ckpt_saved=True,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=verify_exhausted)
+
+    # Generate `break_point` items from ds_fn and save checkpoint.
+    self.gen_outputs(
+        ds_fn, [],
+        break_point,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=False)
+
+    actual = []
+    # Build an empty graph but load checkpoint for ds_fn.
+    with ops.Graph().as_default() as g:
+      get_next_op, saver = self._build_empty_graph(
+          ds_fn, sparse_tensors=sparse_tensors)
+      with self.test_session(graph=g) as sess:
+        self._restore(saver, sess)
+        for _ in range(num_outputs - break_point):
+          actual.append(sess.run(get_next_op))
+        if verify_exhausted:
+          with self.assertRaises(errors.OutOfRangeError):
+            sess.run(get_next_op)
+
+    self.match(expected, actual)
+
+  def verify_error_on_save(self,
+                           ds_fn,
+                           num_outputs,
+                           error,
+                           break_point=None,
+                           sparse_tensors=False):
+    """Attempts to save a non-saveable iterator.
+
+    Args:
+      ds_fn: See `run_core_tests`.
+      num_outputs: See `run_core_tests`.
+      error: Declared error when trying to save iterator.
+      break_point: Break point. Optional. Defaults to num_outputs/2.
+      sparse_tensors: See `run_core_tests`.
+
+    Raises:
+      AssertionError if any test fails.
+    """
+
+    break_point = num_outputs // 2 if not break_point else break_point
+    with ops.Graph().as_default() as g:
+      init_op, get_next_op, saver = self._build_graph(
+          ds_fn, sparse_tensors=sparse_tensors)
+      with self.test_session(graph=g) as sess:
+        sess.run(variables.global_variables_initializer())
+        sess.run(init_op)
+        for _ in range(break_point):
+          sess.run(get_next_op)
+        with self.assertRaises(error):
+          self._save(sess, saver)
+
   def verify_run_with_breaks(self,
                              ds_fn,
                              break_points,
@@ -395,9 +491,11 @@ class DatasetSerializationTestBase(test.TestCase):
         with self.test_session(graph=g) as sess:
           if ckpt_saved:
             if init_before_restore:
+              sess.run(variables.global_variables_initializer())
               sess.run(init_op)
             self._restore(saver, sess)
           else:
+            sess.run(variables.global_variables_initializer())
             sess.run(init_op)
           start = break_points[i - 1] if i > 0 else 0
           end = break_points[i] if i < len(break_points) else num_outputs
@@ -466,6 +564,18 @@ class DatasetSerializationTestBase(test.TestCase):
     saver = saver_lib.Saver(allow_empty=True)
     return init_op, get_next, saver
 
+  def _build_empty_graph(self, ds_fn, sparse_tensors=False):
+    iterator = iterator_ops.Iterator.from_structure(
+        self._get_output_types(ds_fn), self._get_output_shapes(ds_fn))
+    saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
+    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+    if sparse_tensors:
+      get_next = sparse_tensor.SparseTensor(*iterator.get_next())
+    else:
+      get_next = iterator.get_next()
+    saver = saver_lib.Saver(allow_empty=True)
+    return get_next, saver
+
   def _add_iterator_ops_to_collection(self,
                                       init_op,
                                       get_next,
@@ -495,6 +605,10 @@ class DatasetSerializationTestBase(test.TestCase):
     with ops.Graph().as_default():
       return ds_fn().output_types
 
+  def _get_output_shapes(self, ds_fn):
+    with ops.Graph().as_default():
+      return ds_fn().output_shapes
+
   def _ckpt_path(self):
     return os.path.join(self.get_temp_dir(), "iterator")
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
index 67c49d77e2489a942fbf79286ec6ebc0af29a45e..5921be2ae89ba1bbbb8d6e3a509cf49c65949544 100644
--- a/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -131,9 +132,12 @@ class FilterDatasetTest(test.TestCase):
     self.assertAllEqual(a.dense_shape, b.dense_shape)
 
   def testSparse(self):
+
     def _map_fn(i):
-      return sparse_tensor.SparseTensor(
-          indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1]), i
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1])),
+          dense_shape=np.array([1, 1])), i
 
     def _filter_fn(_, i):
       return math_ops.equal(i % 2, 0)
@@ -148,13 +152,48 @@ class FilterDatasetTest(test.TestCase):
       sess.run(init_op)
       for i in range(5):
         actual = sess.run(get_next)
-        expected = sparse_tensor.SparseTensor(
-            indices=[[0, 0]], values=[i*2], dense_shape=[1, 1])
         self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
-        self.assertSparseValuesEqual(actual, expected.eval())
+        self.assertSparseValuesEqual(actual, _map_fn(i * 2)[0])
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
 
+class FilterDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def _build_filter_range_graph(self, div):
+    return dataset_ops.Dataset.range(100).filter(
+        lambda x: math_ops.not_equal(math_ops.mod(x, div), 2))
+
+  def testFilterCore(self):
+    div = 3
+    num_outputs = np.sum([x % 3 is not 2 for x in range(100)])
+    self.run_core_tests(lambda: self._build_filter_range_graph(div),
+                        lambda: self._build_filter_range_graph(div * 2),
+                        num_outputs)
+
+  def _build_filter_dict_graph(self):
+    return dataset_ops.Dataset.range(10).map(
+        lambda x: {"foo": x * 2, "bar": x ** 2}).filter(
+            lambda d: math_ops.equal(d["bar"] % 2, 0)).map(
+                lambda d: d["foo"] + d["bar"])
+
+  def testFilterDictCore(self):
+    num_outputs = np.sum([(x**2) % 2 == 0 for x in range(10)])
+    self.run_core_tests(self._build_filter_dict_graph, None, num_outputs)
+
+  def _build_sparse_filter(self):
+
+    def _map_fn(i):
+      return sparse_tensor.SparseTensor(
+          indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1]), i
+
+    def _filter_fn(_, i):
+      return math_ops.equal(i % 2, 0)
+
+    return dataset_ops.Dataset.range(10).map(_map_fn).filter(_filter_fn).map(
+        lambda x, i: x)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py
index c950e4857ef0d4d1340fdded1010800e6771939e..d4fbaa5cdcdd315aa0524134b48eb0515169722c 100644
--- a/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py
@@ -21,11 +21,18 @@ import random
 
 import numpy as np
 
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import dataset_ops
 from tensorflow.python.client import session
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import function
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import sparse_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
@@ -124,7 +131,7 @@ class FlatMapDatasetTest(test.TestCase):
 
   def testSparse(self):
     def _map_fn(i):
-      return sparse_tensor.SparseTensor(
+      return sparse_tensor.SparseTensorValue(
           indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])
 
     def _flat_map_fn(x):
@@ -147,5 +154,77 @@ class FlatMapDatasetTest(test.TestCase):
         sess.run(get_next)
 
 
+class FlatMapDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def testCore(self):
+    # Complicated way of saying range(start, start+25).
+    def build_ds(start):
+
+      def map_fn(x):
+        return dataset_ops.Dataset.range(x, x + 5)
+
+      return dataset_ops.Dataset.range(start, start + 5 * 5, 5).flat_map(map_fn)
+
+    self.run_core_tests(lambda: build_ds(0), lambda: build_ds(10), 25)
+
+  def testMapThenFlatMap(self):
+
+    def build_ds():
+
+      def flat_map_fn(_):
+
+        def map_fn(y):
+          return 10 * math_ops.to_int32(y)
+
+        return dataset_ops.Dataset.range(100).map(map_fn)
+
+      return dataset_ops.Dataset.range(5).flat_map(flat_map_fn)
+
+    self.run_core_tests(build_ds, None, 500)
+
+  def testCaptureDefunInMapFn(self):
+
+    def build_ds():
+
+      def map_fn(x):
+
+        @function.Defun(dtypes.int64)
+        def defun_fn(x):
+          return constant_op.constant(1000) + math_ops.to_int32(x)
+
+        return dataset_ops.Dataset.from_tensor_slices([defun_fn(x)])
+
+      return dataset_ops.Dataset.range(100).flat_map(map_fn)
+
+    self.run_core_tests(build_ds, None, 100)
+
+  def testDisallowVariableCapture(self):
+
+    def build_ds():
+      test_var = variable_scope.get_variable(
+          name="test_var", shape=(), use_resource=True)
+      return dataset_ops.Dataset.range(5).flat_map(
+          lambda _: dataset_ops.Dataset.from_tensor_slices([test_var]))
+
+    self.verify_error_on_save(build_ds, 5, errors.InvalidArgumentError)
+
+  def testDisallowCapturingStatefulOps(self):
+
+    def build_ds():
+
+      def flat_map_fn(_):
+
+        def map_fn(x):
+          return random_ops.random_uniform(
+              (), 0, 10, dtype=dtypes.int32) * math_ops.to_int32(x)
+
+        return dataset_ops.Dataset.range(100).map(map_fn)
+
+      return dataset_ops.Dataset.range(5).flat_map(flat_map_fn)
+
+    self.verify_error_on_save(build_ds, 500, errors.InvalidArgumentError)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
index 0299e3a1b7d240e75b869ef4595293f691958623..e66ed3f7aa2a512813ef353d2d0744ae67005884 100644
--- a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
@@ -22,8 +22,10 @@ import math
 import threading
 import time
 
+import numpy as np
 from six.moves import zip_longest
 
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import dataset_ops
 from tensorflow.contrib.data.python.ops import interleave_ops
 from tensorflow.python.framework import dtypes
@@ -185,8 +187,9 @@ class InterleaveDatasetTest(test.TestCase):
         sess.run(next_element)
 
   def testSparse(self):
+
     def _map_fn(i):
-      return sparse_tensor.SparseTensor(
+      return sparse_tensor.SparseTensorValue(
           indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])
 
     def _interleave_fn(x):
@@ -209,6 +212,46 @@ class InterleaveDatasetTest(test.TestCase):
         sess.run(get_next)
 
 
+class InterleaveDatasetSeriazationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def _build_iterator_graph(self, input_values, cycle_length, block_length):
+    repeat_count = 2
+    return dataset_ops.Dataset.from_tensor_slices(input_values).repeat(
+        repeat_count).interleave(
+            lambda x: dataset_ops.Dataset.from_tensors(x).repeat(x),
+            cycle_length, block_length)
+
+  def testSerializationCore(self):
+    input_values = np.array([4, 5, 6], dtype=np.int64)
+    num_outputs = np.sum(input_values) * 2
+    # cycle_length > 1, block_length > 1
+    cycle_length = 2
+    block_length = 3
+    # pylint: disable=g-long-lambda
+    self.run_core_tests(
+        lambda: self._build_iterator_graph(
+            input_values, cycle_length, block_length),
+        lambda: self._build_iterator_graph(
+            input_values, cycle_length * 2, block_length * 1),
+        num_outputs)
+    # cycle_length = 1
+    cycle_length = 1
+    block_length = 3
+    self.run_core_tests(
+        lambda: self._build_iterator_graph(
+            input_values, cycle_length, block_length),
+        None, num_outputs)
+    # block_length = 1
+    cycle_length = 2
+    block_length = 1
+    self.run_core_tests(
+        lambda: self._build_iterator_graph(
+            input_values, cycle_length, block_length),
+        None, num_outputs)
+    # pylint: enable=g-long-lambda
+
+
 class ParallelInterleaveDatasetTest(test.TestCase):
 
   def setUp(self):
diff --git a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
index d8e7f9d5933b4291b2d905aeb3c54439e0958a4c..e9a07da84a8c80c09ebd4dab0b1d69febe1c9790 100644
--- a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
@@ -23,10 +23,9 @@ import threading
 
 import numpy as np
 
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import dataset_ops
 from tensorflow.contrib.data.python.ops import error_ops
-from tensorflow.contrib.data.python.ops import iterator_ops as contrib_iterator_ops
-from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -44,10 +43,7 @@ from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
-from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.util import compat
 
 
@@ -630,9 +626,13 @@ class MapDatasetTest(test.TestCase):
     self.assertAllEqual(a.dense_shape, b.dense_shape)
 
   def testSparse(self):
+
     def _sparse(i):
-      return sparse_tensor.SparseTensor(
-          indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1])
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1])),
+          dense_shape=np.array([1, 1]))
+
     iterator = (dataset_ops.Dataset.range(10)
                 .map(_sparse)
                 .make_initializable_iterator())
@@ -643,24 +643,26 @@ class MapDatasetTest(test.TestCase):
       sess.run(init_op)
       for i in range(10):
         actual = sess.run(get_next)
-        expected = sparse_tensor.SparseTensor(
-            indices=[[0, 0]], values=[i], dense_shape=[1, 1])
         self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
-        self.assertSparseValuesEqual(actual, expected.eval())
+        self.assertSparseValuesEqual(actual, _sparse(i))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
   def testSparseChain(self):
+
     def _sparse(i):
-      return sparse_tensor.SparseTensor(
-          indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1])
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1])),
+          dense_shape=np.array([1, 1]))
+
     def _check(i):
-      self.assertTrue(isinstance(i, sparse_tensor.SparseTensor))
+      self.assertTrue(sparse_tensor.is_sparse(i))
       return sparse_ops.sparse_concat(0, [i, i])
 
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(_sparse).map(_check)
-                .make_initializable_iterator())
+    iterator = (
+        dataset_ops.Dataset.range(10).map(_sparse).map(_check)
+        .make_initializable_iterator())
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -668,10 +670,8 @@ class MapDatasetTest(test.TestCase):
       sess.run(init_op)
       for i in range(10):
         actual = sess.run(get_next)
-        expected = sparse_tensor.SparseTensor(
-            indices=[[0, 0], [1, 0]], values=[i, i], dense_shape=[2, 1])
         self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
-        self.assertSparseValuesEqual(actual, expected.eval())
+        self.assertSparseValuesEqual(actual, _check(_sparse(i)).eval())
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -702,20 +702,14 @@ class MapDatasetTest(test.TestCase):
           sess.run(init_op)
 
 
-class MapDatasetSerializationTest(test.TestCase):
+class MapDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
 
   def setUp(self):
     self._tensor_slice_len = 7
     self._num_epochs = 14
     self._num_outputs = self._tensor_slice_len * self._num_epochs
 
-  def tearDown(self):
-    # Remove all checkpoint files.
-    prefix = self._ckpt_path()
-    pattern = prefix + "*"
-    files = gfile.Glob(pattern)
-    map(gfile.Remove, files)
-
   def _build_ds(self, multiplier=37.0):
     components = (np.arange(self._tensor_slice_len), np.array([[1, 2, 3]]) *
                   np.arange(self._tensor_slice_len)[:, np.newaxis],
@@ -727,292 +721,11 @@ class MapDatasetSerializationTest(test.TestCase):
     return (dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
             .repeat(self._num_epochs))
 
-  def _build_graph(self, multiplier=37.0, build_saveable=True):
-    ds = self._build_ds(multiplier)
-    iterator = ds.make_initializable_iterator()
-
-    if build_saveable:
-      saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
-      ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-    self._add_iterator_ops_to_collection(init_op, get_next)
-    saver = saver_lib.Saver(allow_empty=True)
-    return init_op, get_next, saver
-
-  def _build_empty_graph(self, output_types, output_shapes):
-    iterator = iterator_ops.Iterator.from_structure(output_types, output_shapes)
-    saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
-    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
-    saver = saver_lib.Saver()
-    get_next = iterator.get_next()
-    return get_next, saver
-
-  def _add_iterator_ops_to_collection(self, init_op, get_next):
-    ops.add_to_collection("iterator_ops", init_op)
-    ops.add_to_collection("iterator_ops", get_next[0])
-    ops.add_to_collection("iterator_ops", get_next[1])
-    ops.add_to_collection("iterator_ops", get_next[2])
-
-  def _get_iterator_ops_from_collection(self):
-    init_op, get_next_1, get_next_2, get_next_3 = ops.get_collection(
-        "iterator_ops")
-    return init_op, (get_next_1, get_next_2, get_next_3)
-
-  def _ckpt_path(self):
-    return os.path.join(self.get_temp_dir(), "iterator")
-
-  def _latest_ckpt(self):
-    return saver_lib.latest_checkpoint(self.get_temp_dir())
-
-  def _save(self, sess, saver):
-    saver.save(sess, self._ckpt_path())
-
-  def _restore(self, saver, sess):
-    saver.restore(sess, self._latest_ckpt())
-
-  def _import_meta_graph(self):
-    meta_file_path = self._ckpt_path() + ".meta"
-    return saver_lib.import_meta_graph(meta_file_path)
-
-  def _testReadWithBreaks(self, break_points, init_before_restore=False):
-    expected = []
-    actual = []
-    # Generate the ground truth.
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, _ = self._build_graph()
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        for _ in range(self._num_outputs):
-          expected.append(sess.run(get_next_op))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-    # Run and checkpoint after first break_point.
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, saver = self._build_graph()
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        for _ in range(break_points[0]):
-          actual.append(sess.run(get_next_op))
-        self._save(sess, saver)
-
-    # Load from checkpoint and continue running while stopping at each
-    # subsequent checkpoint.
-    for i in range(len(break_points)):
-      with ops.Graph().as_default() as g:
-        saver = self._import_meta_graph()
-        init_op, get_next_op = self._get_iterator_ops_from_collection()
-        with self.test_session(graph=g) as sess:
-          if init_before_restore:
-            sess.run(init_op)
-          self._restore(saver, sess)
-          start = break_points[i]
-          end = break_points[
-              i + 1] if i < len(break_points) - 1 else self._num_outputs
-          for _ in range(end - start):
-            actual.append(sess.run(get_next_op))
-          self._save(sess, saver)
-          if end == self._num_outputs:
-            with self.assertRaises(errors.OutOfRangeError):
-              sess.run(get_next_op)
-    self._match(expected, actual)
-
-  def _match(self, expected, actual):
-    self.assertEqual(len(expected), len(actual))
-    for expected_tuple, actual_tuple in zip(expected, actual):
-      self.assertEqual(expected_tuple[0], actual_tuple[0])
-      self.assertSequenceEqual(expected_tuple[1].tolist(),
-                               actual_tuple[1].tolist())
-      self.assertEqual(expected_tuple[2], actual_tuple[2])
-
-  def _does_not_match(self, expected, actual):
-    with self.assertRaises(AssertionError):
-      self._match(expected, actual)
-
-  def testSaveRestore(self):
-    self._testReadWithBreaks([4])
-    self._testReadWithBreaks([13])
-    self._testReadWithBreaks([18])
-    self._testReadWithBreaks([23])
-
-  def testSaveUnusedIterator(self):
-    self._testReadWithBreaks([0])
-
-  def testSaveFullyUsedIterator(self):
-    self._testReadWithBreaks([self._num_outputs])
-
-  def testMultipleBreaks(self):
-    self._testReadWithBreaks([0, 5, 9, 15, 25, 32])
-
-  def testIdempotence(self):
-    # Attempt to save iterator immediately after restoring.
-    self._testReadWithBreaks([1, 1, 5, 5, 5, 25, 32])
-
-  def testInitThenRestore(self):
-    self._testReadWithBreaks([0, 5, 9, 15, 25, 32], init_before_restore=True)
-
-  def testRestoreExhaustedIterator(self):
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, saver = self._build_graph()
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        for _ in range(self._num_outputs):
-          sess.run(get_next_op)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-        self._save(sess, saver)
-
-      with ops.Graph().as_default() as g:
-        saver = self._import_meta_graph()
-        init_op, get_next_op = self._get_iterator_ops_from_collection()
-        with self.test_session(graph=g) as sess:
-          self._restore(saver, sess)
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
-
-  def testResetRestoredIterator(self):
-    expected = []
-    # Collect ground truth containing all outputs.
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, saver = self._build_graph()
-      break_point = self._num_outputs // 2
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        for _ in range(break_point):
-          expected.append(sess.run(get_next_op))
-        self._save(sess, saver)
-        for _ in range(self._num_outputs - break_point):
-          expected.append(sess.run(get_next_op))
-
-    actual = []
-    # Restore from checkpoint and then run init_op.
-    with ops.Graph().as_default() as g:
-      saver = self._import_meta_graph()
-      init_op, get_next_op = self._get_iterator_ops_from_collection()
-      with self.test_session(graph=g) as sess:
-        self._restore(saver, sess)
-        sess.run(init_op)
-        for _ in range(self._num_outputs):
-          actual.append(sess.run(get_next_op))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-    self._match(expected, actual)
-
-  def testRestoreInModifiedGraph(self):
-    expected = []
-    actual_without_restore = []
-    actual = []
-    break_point = 10
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, saver = self._build_graph(multiplier=15.0)
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        for _ in range(break_point):
-          expected.append(sess.run(get_next_op))
-        actual.extend(expected)
-        self._save(sess, saver)
-        for _ in range(self._num_outputs - break_point):
-          expected.append(sess.run(get_next_op))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-    # Collect outputs by running modified graph.
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, saver = self._build_graph(multiplier=30.0)
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        for _ in range(self._num_outputs):
-          actual_without_restore.append(sess.run(get_next_op))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-    # Restore the checkpoint in the modified graph.
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, saver = self._build_graph(multiplier=30.0)
-      with self.test_session(graph=g) as sess:
-        self._restore(saver, sess)
-        for _ in range(self._num_outputs - break_point):
-          actual.append(sess.run(get_next_op))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-    # Ensure the modified graph gets overridden when restoring checkpoint.
-    self._does_not_match(expected, actual_without_restore)
-    # Expect that the outputs are what we would expect if we ran the old
-    # graph.
-    self._match(expected, actual)
-
-  # TODO(srbs): Add this test to dataset_serialization_test_base.py.
-  def testRestoreInEmptyGraph(self):
-    expected = []
-    actual = []
-    break_point = 10
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, saver = self._build_graph(multiplier=15.0)
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        for _ in range(break_point):
-          sess.run(get_next_op)
-        self._save(sess, saver)
-        for _ in range(self._num_outputs - break_point):
-          expected.append(sess.run(get_next_op))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-    with ops.Graph().as_default() as g:
-      ds = self._build_ds()
-      output_types = ds.output_types
-      output_shapes = ds.output_shapes
-
-    with ops.Graph().as_default() as g:
-      get_next_op, saver = self._build_empty_graph(output_types, output_shapes)
-      with self.test_session(graph=g) as sess:
-        self._restore(saver, sess)
-        for _ in range(self._num_outputs - break_point):
-          actual.append(sess.run(get_next_op))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-    # Expect that the outputs are what we would expect if we ran the old
-    # graph.
-    self._match(expected, actual)
-
-  def testDoNotBuildSaveable(self):
-    break_point = 10
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, saver = self._build_graph(multiplier=15.0)
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        for _ in range(break_point):
-          sess.run(get_next_op)
-        self._save(sess, saver)
-
-    expected = []
-    # Collect ground truth by running modified graph.
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, saver = self._build_graph(multiplier=30.0)
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        for _ in range(self._num_outputs):
-          expected.append(sess.run(get_next_op))
-
-    actual = []
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, saver = self._build_graph(
-          multiplier=30.0, build_saveable=False)
-      with self.test_session(graph=g) as sess:
-        # Since the SaveableObject was not added to Saver's list
-        # of saveables, iterator state is not restored by saver.restore().
-        self._restore(saver, sess)
-        with self.assertRaises(errors.FailedPreconditionError):
-          sess.run(get_next_op)
-        sess.run(init_op)
-        for _ in range(self._num_outputs):
-          actual.append(sess.run(get_next_op))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-    self._match(expected, actual)
+  def testSaveRestoreCore(self):
+    self.run_core_tests(
+        self._build_ds,
+        lambda: self._build_ds(multiplier=15.0),
+        self._num_outputs)
 
   def testSaveStatefulFunction(self):
 
@@ -1024,26 +737,7 @@ class MapDatasetSerializationTest(test.TestCase):
 
       return dataset_ops.Dataset.range(100).map(_map_fn)
 
-    def _build_graph():
-      ds = _build_ds()
-      iterator = ds.make_initializable_iterator()
-
-      saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
-      ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      saver = saver_lib.Saver(allow_empty=True)
-      return init_op, get_next, saver
-
-    break_point = 10
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, saver = _build_graph()
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        for _ in range(break_point):
-          sess.run(get_next_op)
-        with self.assertRaises(errors.InvalidArgumentError):
-          self._save(sess, saver)
+    self.verify_error_on_save(_build_ds, 15, errors.InvalidArgumentError)
 
   def testCaptureVariableInMapFn(self):
 
@@ -1053,27 +747,7 @@ class MapDatasetSerializationTest(test.TestCase):
       return (dataset_ops.Dataset.from_tensors(0).repeat(10).map(
           lambda _: counter_var.assign_add(1)))
 
-    def _build_graph():
-      ds = _build_ds()
-      iterator = ds.make_initializable_iterator()
-
-      saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
-      ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      saver = saver_lib.Saver(allow_empty=True)
-      return init_op, get_next, saver
-
-    break_point = 10
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, saver = _build_graph()
-      with self.test_session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
-        for _ in range(break_point):
-          sess.run(get_next_op)
-        with self.assertRaises(errors.InvalidArgumentError):
-          self._save(sess, saver)
+    self.verify_error_on_save(_build_ds, 15, errors.InvalidArgumentError)
 
   def testCaptureDefunInMapFn(self):
     num_outputs = 100
@@ -1086,46 +760,7 @@ class MapDatasetSerializationTest(test.TestCase):
 
       return dataset_ops.Dataset.range(num_outputs).map(defun_fn)
 
-    def _build_graph():
-      ds = _build_ds()
-      iterator = ds.make_initializable_iterator()
-
-      saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
-      ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      saver = saver_lib.Saver(allow_empty=True)
-      return init_op, get_next, saver
-
-    break_point = 10
-    expected = []
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, saver = _build_graph()
-      with self.test_session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
-        for _ in range(break_point):
-          sess.run(get_next_op)
-        self._save(sess, saver)
-        for _ in range(num_outputs - break_point):
-          expected.append(sess.run(get_next_op))
-
-    with ops.Graph().as_default() as g:
-      ds = _build_ds()
-      output_types = ds.output_types
-      output_shapes = ds.output_shapes
-
-    actual = []
-    with ops.Graph().as_default() as g:
-      get_next_op, saver = self._build_empty_graph(output_types, output_shapes)
-      with self.test_session(graph=g) as sess:
-        self._restore(saver, sess)
-        for _ in range(num_outputs - break_point):
-          actual.append(sess.run(get_next_op))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-    self.assertSequenceEqual(expected, actual)
+    self.run_core_tests(_build_ds, None, num_outputs)
 
   def testBuildDefunInMapFn(self):
     num_outputs = 100
@@ -1143,46 +778,23 @@ class MapDatasetSerializationTest(test.TestCase):
 
       return dataset_ops.Dataset.range(num_outputs).map(defun_fn)
 
-    def _build_graph():
-      ds = _build_ds()
-      iterator = ds.make_initializable_iterator()
+    self.run_core_tests(_build_ds, None, num_outputs)
 
-      saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
-      ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      saver = saver_lib.Saver(allow_empty=True)
-      return init_op, get_next, saver
 
-    break_point = 10
-    expected = []
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, saver = _build_graph()
-      with self.test_session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
-        for _ in range(break_point):
-          sess.run(get_next_op)
-        self._save(sess, saver)
-        for _ in range(num_outputs - break_point):
-          expected.append(sess.run(get_next_op))
+class IgnoreErrorsSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
 
-    with ops.Graph().as_default() as g:
-      ds = _build_ds()
-      output_types = ds.output_types
-      output_shapes = ds.output_shapes
+  def _build_ds(self, components):
+    return dataset_ops.Dataset.from_tensor_slices(components).map(
+        lambda x: array_ops.check_numerics(x, "message")).apply(
+            error_ops.ignore_errors())
 
-    actual = []
-    with ops.Graph().as_default() as g:
-      get_next_op, saver = self._build_empty_graph(output_types, output_shapes)
-      with self.test_session(graph=g) as sess:
-        self._restore(saver, sess)
-        for _ in range(num_outputs - break_point):
-          actual.append(sess.run(get_next_op))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-    self.assertSequenceEqual(expected, actual)
+  def testIgnoreErrorsCore(self):
+    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
+    diff_components = np.array([1., 2., 3., np.nan]).astype(np.float32)
+    num_outputs = 4
+    self.run_core_tests(lambda: self._build_ds(components),
+                        lambda: self._build_ds(diff_components), num_outputs)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_impl.py b/tensorflow/contrib/data/python/kernel_tests/prefetch_dataset_op_test.py
similarity index 51%
rename from tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_impl.py
rename to tensorflow/contrib/data/python/kernel_tests/prefetch_dataset_op_test.py
index a640dfe7dfbcce96261589c7fc49107deaefdd54..3d120a3071ef730f21221e3291d8c84385b51aa3 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_impl.py
+++ b/tensorflow/contrib/data/python/kernel_tests/prefetch_dataset_op_test.py
@@ -12,37 +12,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Sigmoid bijector."""
-
+"""Tests for the experimental input pipeline ops."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops.distributions import bijector
-
-
-__all__ = [
-    "Sigmoid",
-]
-
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.platform import test
 
-class Sigmoid(bijector.Bijector):
-  """Bijector which computes `Y = g(X) = 1 / (1 + exp(-X))`."""
 
-  def __init__(self, validate_args=False, name="sigmoid"):
-    super(Sigmoid, self).__init__(
-        event_ndims=0, validate_args=validate_args, name=name)
+class PrefetchDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
 
-  def _forward(self, x):
-    return math_ops.sigmoid(x)
+  def build_dataset(self, seed):
+    return dataset_ops.Dataset.range(100).prefetch(10).shuffle(
+        buffer_size=10, seed=seed, reshuffle_each_iteration=False)
 
-  def _inverse(self, y):
-    return math_ops.log(y) - math_ops.log1p(-y)
+  def testCore(self):
+    num_outputs = 100
+    self.run_core_tests(lambda: self.build_dataset(10),
+                        lambda: self.build_dataset(20), num_outputs)
 
-  def _inverse_log_det_jacobian(self, y):
-    return -math_ops.log(y) - math_ops.log1p(-y)
 
-  def _forward_log_det_jacobian(self, x):
-    return -nn_ops.softplus(-x) - nn_ops.softplus(x)
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py
index f59ac760dc83a504e563f055b91f1002cb0c80fc..8e6ad061a11752ab7b1ffc13c90b4fa52f67d6aa 100644
--- a/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import os
 
+from tensorflow.contrib.data.python.ops import counter
 from tensorflow.contrib.data.python.ops import dataset_ops
 from tensorflow.contrib.data.python.ops import enumerate_ops
 from tensorflow.contrib.data.python.ops import iterator_ops as contrib_iterator_ops
@@ -194,6 +195,27 @@ class RangeDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  def testCounter(self):
+    """Test dataset construction using `count`."""
+    iterator = (counter.Counter(start=3, step=4)
+                .make_one_shot_iterator())
+    get_next = iterator.get_next()
+    self.assertEqual([], get_next.shape.as_list())
+    self.assertEqual(dtypes.int64, get_next.dtype)
+
+    negative_iterator = (counter.Counter(start=0, step=-1)
+                         .make_one_shot_iterator())
+    negative_get_next = negative_iterator.get_next()
+
+    with self.test_session() as sess:
+      self.assertEqual(3, sess.run(get_next))
+      self.assertEqual(3 + 4, sess.run(get_next))
+      self.assertEqual(3 + 2 * 4, sess.run(get_next))
+
+      self.assertEqual(0, sess.run(negative_get_next))
+      self.assertEqual(-1, sess.run(negative_get_next))
+      self.assertEqual(-2, sess.run(negative_get_next))
+
   def _iterator_checkpoint_prefix(self):
     return os.path.join(self.get_temp_dir(), "iterator")
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py
index 6b5b53cc0f8f2d1df5622a5bc5e2f8ef04c6342a..72745ec7525ad0578934fb2051018f6531938088 100644
--- a/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py
@@ -22,8 +22,10 @@ import os
 
 import numpy as np
 
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import dataset_ops as contrib_dataset_ops
 from tensorflow.contrib.data.python.ops import iterator_ops as contrib_iterator_ops
+from tensorflow.contrib.data.python.ops import shuffle_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.framework import constant_op
@@ -474,5 +476,83 @@ class ShuffleDatasetSerializationTest(test.TestCase):
       self.assertEqual(expected_outputs_sorted, sorted(actual))
 
 
+class ShuffleAndRepeatTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def _build_ds(self, seed, count=5, num_elements=20):
+    return dataset_ops.Dataset.range(num_elements).apply(
+        shuffle_ops.shuffle_and_repeat(buffer_size=5, count=count, seed=seed))
+
+  def testCorrectOutput(self):
+    output = self.gen_outputs(lambda: self._build_ds(10), [], 100)
+    self.assertSequenceEqual(
+        sorted(output), sorted(
+            np.array([range(20) for _ in range(5)]).flatten()))
+    for i in range(5):
+      self.assertSequenceEqual(sorted(output[i * 20:(i + 1) * 20]), range(20))
+
+  def testReshuffling(self):
+    # Check that the output orders of different epochs are indeed different.
+    output = self.gen_outputs(lambda: self._build_ds(10), [], 100)
+    for i in range(4):
+      epoch1 = output[i * 20:(i + 1) * 20]
+      epoch2 = output[(i + 1) * 20:(i + 2) * 20]
+      self.assertNotEqual(epoch1, epoch2)
+
+  def testSameOrderForSameSeeds(self):
+    output1 = self.gen_outputs(lambda: self._build_ds(10), [], 100)
+    output2 = self.gen_outputs(lambda: self._build_ds(10), [], 100)
+    self.assertEqual(output1, output2)
+
+  def testDifferentOrderForDifferentSeeds(self):
+    output1 = self.gen_outputs(lambda: self._build_ds(10), [], 100)
+    output2 = self.gen_outputs(lambda: self._build_ds(20), [], 100)
+    self.assertNotEqual(output1, output2)
+    self.assertEqual(sorted(output1), sorted(output2))
+
+  def testCountNone(self):
+    output1 = self.gen_outputs(
+        lambda: self._build_ds(10, count=None), [], 100, verify_exhausted=False)
+    output2 = self.gen_outputs(
+        lambda: self._build_ds(20, count=None), [], 100, verify_exhausted=False)
+    self.assertNotEqual(output1, output2)
+    self.assertEqual(sorted(output1), sorted(output2))
+
+  def testCountMinusOne(self):
+    output1 = self.gen_outputs(
+        lambda: self._build_ds(10, count=-1), [], 100, verify_exhausted=False)
+    output2 = self.gen_outputs(
+        lambda: self._build_ds(20, count=-1), [], 100, verify_exhausted=False)
+    self.assertNotEqual(output1, output2)
+    self.assertEqual(sorted(output1), sorted(output2))
+
+  def testInfiniteOutputs(self):
+    # Asserting the iterator is exhausted after producing 100 items should fail.
+    with self.assertRaises(AssertionError):
+      self.gen_outputs(lambda: self._build_ds(10, count=None), [], 100)
+    with self.assertRaises(AssertionError):
+      self.gen_outputs(lambda: self._build_ds(10, count=-1), [], 100)
+
+  def testInfiniteEmpty(self):
+    with self.assertRaises(errors.OutOfRangeError):
+      self.gen_outputs(lambda: self._build_ds(10, count=None, num_elements=0),
+                       [], 100)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.gen_outputs(lambda: self._build_ds(10, count=-1, num_elements=0), [],
+                       100)
+
+
+class ShuffleAndRepeatSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def _build_ds(self, seed):
+    return dataset_ops.Dataset.range(20).apply(
+        shuffle_ops.shuffle_and_repeat(buffer_size=5, count=5, seed=seed))
+
+  def testCore(self):
+    self.run_core_tests(lambda: self._build_ds(10), lambda: self._build_ds(20),
+                        100)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..07bdf920446e953c2a1abaf495d2e9e1256106fd
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
@@ -0,0 +1,257 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline statistics gathering ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
+from tensorflow.contrib.data.python.ops import stats_ops
+from tensorflow.core.framework import summary_pb2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class StatsDatasetTest(test.TestCase):
+
+  def _assertSummaryHasCount(self, summary_str, tag, expected_value):
+    summary_proto = summary_pb2.Summary()
+    summary_proto.ParseFromString(summary_str)
+    for value in summary_proto.value:
+      if tag == value.tag:
+        self.assertEqual(expected_value, value.histo.num)
+        return
+    self.fail("Expected tag %r not found in summary %r" % (tag, summary_proto))
+
+  def _assertSummaryHasSum(self, summary_str, tag, expected_value):
+    summary_proto = summary_pb2.Summary()
+    summary_proto.ParseFromString(summary_str)
+    for value in summary_proto.value:
+      if tag == value.tag:
+        self.assertEqual(expected_value, value.histo.sum)
+        return
+    self.fail("Expected tag %r not found in summary %r" % (tag, summary_proto))
+
+  def testBytesProduced(self):
+    dataset = dataset_ops.Dataset.range(100).map(
+        lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).apply(
+            stats_ops.bytes_produced_stats("bytes_produced"))
+    iterator = dataset.make_initializable_iterator()
+    stats_aggregator = stats_ops.StatsAggregator()
+    stats_aggregator_subscriber = stats_aggregator.subscribe(iterator)
+    next_element = iterator.get_next()
+    summary_t = stats_aggregator.get_summary()
+
+    with self.test_session() as sess:
+      sess.run([iterator.initializer, stats_aggregator_subscriber])
+      expected_sum = 0.0
+      for i in range(100):
+        self.assertAllEqual(
+            np.array([i] * i, dtype=np.int64), sess.run(next_element))
+        summary_str = sess.run(summary_t)
+        self._assertSummaryHasCount(summary_str, "bytes_produced", float(i + 1))
+        expected_sum += i * 8.0
+        self._assertSummaryHasSum(summary_str, "bytes_produced", expected_sum)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+      summary_str = sess.run(summary_t)
+      self._assertSummaryHasCount(summary_str, "bytes_produced", 100.0)
+      self._assertSummaryHasSum(summary_str, "bytes_produced", expected_sum)
+
+  def testLatencyStats(self):
+    dataset = dataset_ops.Dataset.range(100).apply(
+        stats_ops.latency_stats("record_latency"))
+    iterator = dataset.make_initializable_iterator()
+    stats_aggregator = stats_ops.StatsAggregator()
+    stats_aggregator_subscriber = stats_aggregator.subscribe(iterator)
+    next_element = iterator.get_next()
+    summary_t = stats_aggregator.get_summary()
+
+    with self.test_session() as sess:
+      sess.run([iterator.initializer, stats_aggregator_subscriber])
+      for i in range(100):
+        self.assertEqual(i, sess.run(next_element))
+        self._assertSummaryHasCount(
+            sess.run(summary_t), "record_latency", float(i + 1))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+      self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 100.0)
+
+  def testReinitialize(self):
+    dataset = dataset_ops.Dataset.range(100).apply(
+        stats_ops.latency_stats("record_latency"))
+    iterator = dataset.make_initializable_iterator()
+    stats_aggregator = stats_ops.StatsAggregator()
+    stats_aggregator_subscriber = stats_aggregator.subscribe(iterator)
+    next_element = iterator.get_next()
+    summary_t = stats_aggregator.get_summary()
+
+    with self.test_session() as sess:
+      sess.run(stats_aggregator_subscriber)
+      for j in range(5):
+        sess.run(iterator.initializer)
+        for i in range(100):
+          self.assertEqual(i, sess.run(next_element))
+          self._assertSummaryHasCount(
+              sess.run(summary_t), "record_latency", float((j * 100) + i + 1))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(next_element)
+        self._assertSummaryHasCount(
+            sess.run(summary_t), "record_latency", (j + 1) * 100.0)
+
+  def testNoAggregatorRegistered(self):
+    dataset = dataset_ops.Dataset.range(100).apply(
+        stats_ops.latency_stats("record_latency"))
+    iterator = dataset.make_initializable_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(iterator.initializer)
+      for i in range(100):
+        self.assertEqual(i, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testMultipleTags(self):
+    dataset = dataset_ops.Dataset.range(100).apply(
+        stats_ops.latency_stats("record_latency")).apply(
+            stats_ops.latency_stats("record_latency_2"))
+    iterator = dataset.make_initializable_iterator()
+    stats_aggregator = stats_ops.StatsAggregator()
+    stats_aggregator_subscriber = stats_aggregator.subscribe(iterator)
+    next_element = iterator.get_next()
+    summary_t = stats_aggregator.get_summary()
+
+    with self.test_session() as sess:
+      sess.run([iterator.initializer, stats_aggregator_subscriber])
+      for i in range(100):
+        self.assertEqual(i, sess.run(next_element))
+        self._assertSummaryHasCount(
+            sess.run(summary_t), "record_latency", float(i + 1))
+        self._assertSummaryHasCount(
+            sess.run(summary_t), "record_latency_2", float(i + 1))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+      self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 100.0)
+      self._assertSummaryHasCount(
+          sess.run(summary_t), "record_latency_2", 100.0)
+
+  def testRepeatedTags(self):
+    dataset = dataset_ops.Dataset.range(100).apply(
+        stats_ops.latency_stats("record_latency")).apply(
+            stats_ops.latency_stats("record_latency"))
+    iterator = dataset.make_initializable_iterator()
+    stats_aggregator = stats_ops.StatsAggregator()
+    stats_aggregator_subscriber = stats_aggregator.subscribe(iterator)
+    next_element = iterator.get_next()
+    summary_t = stats_aggregator.get_summary()
+
+    with self.test_session() as sess:
+      sess.run([iterator.initializer, stats_aggregator_subscriber])
+      for i in range(100):
+        self.assertEqual(i, sess.run(next_element))
+        self._assertSummaryHasCount(
+            sess.run(summary_t), "record_latency", float(2 * (i + 1)))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+      self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 200.0)
+
+  def testMultipleIteratorsSameAggregator(self):
+    dataset = dataset_ops.Dataset.range(100).apply(
+        stats_ops.latency_stats("record_latency"))
+    iterator_0 = dataset.make_initializable_iterator()
+    iterator_1 = dataset.make_initializable_iterator()
+    stats_aggregator = stats_ops.StatsAggregator()
+    stats_aggregator_subscribers = [stats_aggregator.subscribe(iterator_0),
+                                    stats_aggregator.subscribe(iterator_1)]
+    next_element = iterator_0.get_next() + iterator_1.get_next()
+    summary_t = stats_aggregator.get_summary()
+
+    with self.test_session() as sess:
+      sess.run([iterator_0.initializer, iterator_1.initializer,
+                stats_aggregator_subscribers])
+      for i in range(100):
+        self.assertEqual(i * 2, sess.run(next_element))
+        self._assertSummaryHasCount(
+            sess.run(summary_t), "record_latency", float(2 * (i + 1)))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+      self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 200.0)
+
+  def testMultipleStatsAggregatorsSameIteratorFail(self):
+    dataset = dataset_ops.Dataset.range(100).apply(
+        stats_ops.latency_stats("record_latency"))
+    iterator = dataset.make_initializable_iterator()
+    stats_aggregator_0 = stats_ops.StatsAggregator()
+    stats_aggregator_1 = stats_ops.StatsAggregator()
+
+    with self.test_session() as sess:
+      sess.run(stats_aggregator_0.subscribe(iterator))
+      # TODO(mrry): Consider making this allowable (and also allowing
+      # aggregators to unsubscribe).
+      with self.assertRaises(errors.FailedPreconditionError):
+        sess.run(stats_aggregator_1.subscribe(iterator))
+
+
+class StatsDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def _build_dataset_bytes_stats(self, num_elements):
+    return dataset_ops.Dataset.range(num_elements).map(
+        lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).apply(
+            stats_ops.bytes_produced_stats("bytes_produced"))
+
+  def testBytesStatsDatasetSaveableCore(self):
+    num_outputs = 100
+    self.run_core_tests(
+        lambda: self._build_dataset_bytes_stats(num_outputs),
+        lambda: self._build_dataset_bytes_stats(num_outputs // 10), num_outputs)
+
+  def _build_dataset_latency_stats(self, num_elements, tag="record_latency"):
+    return dataset_ops.Dataset.range(num_elements).apply(
+        stats_ops.latency_stats(tag))
+
+  def _build_dataset_multiple_tags(self,
+                                   num_elements,
+                                   tag1="record_latency",
+                                   tag2="record_latency_2"):
+    return dataset_ops.Dataset.range(num_elements).apply(
+        stats_ops.latency_stats(tag1)).apply(stats_ops.latency_stats(tag2))
+
+  def testLatencyStatsDatasetSaveableCore(self):
+    num_outputs = 100
+
+    self.run_core_tests(
+        lambda: self._build_dataset_latency_stats(num_outputs),
+        lambda: self._build_dataset_latency_stats(num_outputs // 10),
+        num_outputs)
+
+    self.run_core_tests(lambda: self._build_dataset_multiple_tags(num_outputs),
+                        None, num_outputs)
+
+    tag1 = "record_latency"
+    tag2 = "record_latency"
+    self.run_core_tests(
+        lambda: self._build_dataset_multiple_tags(num_outputs, tag1, tag2),
+        None, num_outputs)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index d6aaa12f5b87ea1781346aea0010f23656ffc7d0..1f35ee056b7f897ce5e7488b205ecf5a05ef0268 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -14,6 +14,7 @@ load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 py_library(
     name = "dataset_ops",
     srcs = [
+        "counter.py",
         "dataset_ops.py",
     ],
     srcs_version = "PY2AND3",
@@ -39,6 +40,25 @@ py_library(
     ],
 )
 
+py_library(
+    name = "random_ops",
+    srcs = [
+        "random_ops.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:random_seed",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
 py_library(
     name = "readers",
     srcs = [
@@ -61,6 +81,19 @@ py_library(
     ],
 )
 
+py_library(
+    name = "shuffle_ops",
+    srcs = [
+        "shuffle_ops.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":random_ops",
+        ":transformation_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
 py_library(
     name = "transformation_ops",
     srcs = [
@@ -71,6 +104,7 @@ py_library(
         "interleave_ops.py",
         "resampling.py",
         "scan_ops.py",
+        "stats_ops.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py
index cc63baed81334521746fea1161003615535c371f..e8b2d44a8b57d471f11b128622b6121f699fbf85 100644
--- a/tensorflow/contrib/data/python/ops/batching.py
+++ b/tensorflow/contrib/data/python/ops/batching.py
@@ -112,8 +112,10 @@ def filter_irregular_batches(batch_size):
     tensor_batch_size = ops.convert_to_tensor(
         batch_size, dtype=dtypes.int64, name="batch_size")
 
-    flattened = _RestructuredDataset(dataset,
-                                     tuple(nest.flatten(dataset.output_types)))
+    flattened = _RestructuredDataset(
+        dataset,
+        tuple(nest.flatten(dataset.output_types)),
+        output_classes=tuple(nest.flatten(dataset.output_classes)))
 
     def _predicate(*xs):
       """Return `True` if this element is a full batch."""
@@ -135,7 +137,11 @@ def filter_irregular_batches(batch_size):
 
     known_shapes = nest.map_structure(_set_first_dimension,
                                       dataset.output_shapes)
-    return _RestructuredDataset(filtered, dataset.output_types, known_shapes)
+    return _RestructuredDataset(
+        filtered,
+        dataset.output_types,
+        known_shapes,
+        output_classes=dataset.output_classes)
 
   return _apply_fn
 
@@ -237,6 +243,10 @@ class DenseToSparseBatchDataset(dataset_ops.Dataset):
         output_shapes=self.output_shapes,
         output_types=self.output_types)
 
+  @property
+  def output_classes(self):
+    return (ops.Tensor, ops.Tensor, ops.Tensor)
+
   @property
   def output_shapes(self):
     num_elements = tensor_shape.Dimension(None)
@@ -252,7 +262,11 @@ class DenseToSparseBatchDataset(dataset_ops.Dataset):
 class _RestructuredDataset(dataset_ops.Dataset):
   """An internal helper for changing the structure and shape of a dataset."""
 
-  def __init__(self, dataset, output_types, output_shapes=None):
+  def __init__(self,
+               dataset,
+               output_types,
+               output_shapes=None,
+               output_classes=None):
     """Creates a new dataset with the given output types and shapes.
 
     The given `dataset` must have a structure that is convertible:
@@ -268,6 +282,8 @@ class _RestructuredDataset(dataset_ops.Dataset):
       output_types: A nested structure of `tf.DType` objects.
       output_shapes: (Optional.) A nested structure of `tf.TensorShape` objects.
         If omitted, the shapes will be inherited from `dataset`.
+      output_classes: (Optional.) A nested structure of class types.
+        If omitted, the class types will be inherited from `dataset`.
 
     Raises:
       ValueError: If either `output_types` or `output_shapes` is not compatible
@@ -307,10 +323,21 @@ class _RestructuredDataset(dataset_ops.Dataset):
                                                  output_shapes))
       self._output_shapes = nest.map_structure_up_to(
           output_types, tensor_shape.as_shape, output_shapes)
+    if output_classes is None:
+      # Inherit class types from the original `dataset`.
+      self._output_classes = nest.pack_sequence_as(output_types,
+                                                   nest.flatten(
+                                                       dataset.output_classes))
+    else:
+      self._output_classes = output_classes
 
   def _as_variant_tensor(self):
     return self._dataset._as_variant_tensor()  # pylint: disable=protected-access
 
+  @property
+  def output_classes(self):
+    return self._output_classes
+
   @property
   def output_types(self):
     return self._output_types
@@ -326,10 +353,6 @@ class _MapAndBatchDataset(dataset_ops.MapDataset):
   def __init__(self, input_dataset, map_func, batch_size, num_parallel_batches):
     """See `Dataset.map()` for details."""
     super(_MapAndBatchDataset, self).__init__(input_dataset, map_func)
-    if sparse.any_sparse(self._output_types):
-      # TODO(b/63669786): support batching of sparse tensors
-      raise TypeError("Batching of sparse tensors is not currently supported")
-
     self._batch_size = ops.convert_to_tensor(
         batch_size, dtype=dtypes.int64, name="batch_size")
     self._num_parallel_batches = ops.convert_to_tensor(
@@ -345,8 +368,9 @@ class _MapAndBatchDataset(dataset_ops.MapDataset):
         batch_size=self._batch_size,
         num_parallel_batches=self._num_parallel_batches,
         output_types=nest.flatten(
-            sparse.unwrap_sparse_types(self.output_types)),
-        output_shapes=nest.flatten(self.output_shapes))
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
     # pylint: enable=protected-access
 
   @property
@@ -366,17 +390,12 @@ def map_and_batch(map_func, batch_size, num_parallel_batches=1):
   """Fused implementation of `map` and `batch`.
 
   Maps `map_func` across `batch_size` consecutive elements of this dataset
-  and then combines them into a batch. Similarly to `batch_and_drop_remainder`,
-  if the batch size does not evenly divide the input dataset size, this
-  transformation will drop the final smaller element.
-
-
-  Functionally, it is equivalent to `map` followed by
-  `batch_and_drop_remainder`. However, by fusing the two transformations
-  together, the implementation can be more efficient. This transformation is a
-  stop gap solution for performance critical workloads. Once automatic input
-  pipeline optimization are implemented, the fusing of map and batch will not
-  need to be exposed at the API level and this method will be removed.
+  and then combines them into a batch. Functionally, it is equivalent to `map`
+  followed by `batch`. However, by fusing the two transformations together, the
+  implementation can be more efficient. Surfacing this transformation in the API
+  is temporary. Once automatic input pipeline optimization is implemented,
+  the fusing of `map` and `batch` will happen automatically and this API will be
+  deprecated.
 
   Args:
     map_func: A function mapping a nested structure of tensors to another
@@ -394,9 +413,6 @@ def map_and_batch(map_func, batch_size, num_parallel_batches=1):
   """
 
   def _apply_fn(dataset):
-    if sparse.any_sparse(dataset.output_types):
-      # TODO(b/63669786): support batching of sparse tensors
-      raise TypeError("Batching of sparse tensors is not currently supported")
     return _MapAndBatchDataset(dataset, map_func, batch_size,
                                num_parallel_batches)
 
diff --git a/tensorflow/contrib/data/python/ops/counter.py b/tensorflow/contrib/data/python/ops/counter.py
new file mode 100644
index 0000000000000000000000000000000000000000..63226fe78163c59025623a362d17c400fbe57c67
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/counter.py
@@ -0,0 +1,52 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The Counter Dataset."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.data.python.ops import scan_ops
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+
+
+def Counter(start=0, step=1, dtype=dtypes.int64):
+  """Creates a `Dataset` of a `step`-separated count startin from `start`.
+
+  For example:
+
+  ```python
+  Dataset.count() == [0, 1, 2, ...)
+  Dataset.count(2) == [2, 3, ...)
+  Dataset.count(2, 5) == [2, 7, 12, ...)
+  Dataset.count(0, -1) == [0, -1, -2, ...)
+  Dataset.count(10, -1) == [10, 9, ...)
+  ```
+
+  Args:
+    start: starting value for count.
+    step: step size.
+    dtype: counter data type.
+
+  Returns:
+    A `Dataset` of scalar elements.
+  """
+  with ops.name_scope("counter"):
+    start = ops.convert_to_tensor(start, dtype=dtype, name="start")
+    step = ops.convert_to_tensor(step, dtype=dtype, name="step")
+    return dataset_ops.Dataset.from_tensors(0).repeat(None).apply(
+        scan_ops.scan(start, lambda state, _: (state + step, state)))
diff --git a/tensorflow/contrib/data/python/ops/dataset_ops.py b/tensorflow/contrib/data/python/ops/dataset_ops.py
index 45d6dbe7438957029b4d6b71e181cb1fc3596ecb..626a9e0edcea5928b1636c1a2a86e83657c966a5 100644
--- a/tensorflow/contrib/data/python/ops/dataset_ops.py
+++ b/tensorflow/contrib/data/python/ops/dataset_ops.py
@@ -21,7 +21,6 @@ from tensorflow.contrib.data.python.ops import batching
 from tensorflow.contrib.data.python.ops import enumerate_ops
 from tensorflow.contrib.data.python.ops import error_ops
 from tensorflow.contrib.data.python.ops import grouping
-
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.ops import gen_dataset_ops
@@ -48,6 +47,10 @@ class Dataset(dataset_ops.Dataset):
   def _as_variant_tensor(self):
     return self._dataset._as_variant_tensor()  # pylint: disable=protected-access
 
+  @property
+  def output_classes(self):
+    return self._dataset.output_classes
+
   @property
   def output_shapes(self):
     return self._dataset.output_shapes
diff --git a/tensorflow/contrib/data/python/ops/error_ops.py b/tensorflow/contrib/data/python/ops/error_ops.py
index 194b61151390e2dcc3fa13b618003cbe5697806f..aa629cba479102ee4244884e7c546615b28cf4e5 100644
--- a/tensorflow/contrib/data/python/ops/error_ops.py
+++ b/tensorflow/contrib/data/python/ops/error_ops.py
@@ -63,9 +63,14 @@ class IgnoreErrorsDataset(dataset_ops.Dataset):
   def _as_variant_tensor(self):
     return gen_dataset_ops.ignore_errors_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        output_shapes=nest.flatten(self.output_shapes),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
         output_types=nest.flatten(
-            sparse.unwrap_sparse_types(self.output_types)))
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
 
   @property
   def output_shapes(self):
diff --git a/tensorflow/contrib/data/python/ops/grouping.py b/tensorflow/contrib/data/python/ops/grouping.py
index 86337271bca79ea8bffda28fac79e41dc39f3fd3..ef91c56726e969053fdad667dda3e89430045652 100644
--- a/tensorflow/contrib/data/python/ops/grouping.py
+++ b/tensorflow/contrib/data/python/ops/grouping.py
@@ -88,15 +88,21 @@ def group_by_window(key_func,
 class _VariantDataset(dataset_ops.Dataset):
   """A Dataset wrapper for a tf.variant-typed function argument."""
 
-  def __init__(self, dataset_variant, output_types, output_shapes):
+  def __init__(self, dataset_variant, output_types, output_shapes,
+               output_classes):
     super(_VariantDataset, self).__init__()
     self._dataset_variant = dataset_variant
     self._output_types = output_types
     self._output_shapes = output_shapes
+    self._output_classes = output_classes
 
   def _as_variant_tensor(self):
     return self._dataset_variant
 
+  @property
+  def output_classes(self):
+    return self._output_classes
+
   @property
   def output_shapes(self):
     return self._output_shapes
@@ -138,17 +144,21 @@ class GroupByWindowDataset(dataset_ops.Dataset):
   def _make_key_func(self, key_func, input_dataset):
     """Make wrapping Defun for key_func."""
 
-    @function.Defun(
-        *nest.flatten(sparse.unwrap_sparse_types(input_dataset.output_types)))
+    @function.Defun(*nest.flatten(
+        sparse.as_dense_types(input_dataset.output_types,
+                              input_dataset.output_classes)))
     def tf_key_func(*args):
       """A wrapper for Defun that facilitates shape inference."""
       # Pass in shape information from the input_dataset.
-      for arg, shape in zip(args, nest.flatten(input_dataset.output_shapes)):
+      dense_shapes = sparse.as_dense_shapes(input_dataset.output_shapes,
+                                            input_dataset.output_classes)
+      for arg, shape in zip(args, nest.flatten(dense_shapes)):
         arg.set_shape(shape)
 
       nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
       nested_args = sparse.deserialize_sparse_tensors(
-          nested_args, input_dataset.output_types)
+          nested_args, input_dataset.output_types, input_dataset.output_shapes,
+          input_dataset.output_classes)
       # pylint: disable=protected-access
       if dataset_ops._should_unpack_args(nested_args):
         ret = key_func(*nested_args)
@@ -170,14 +180,15 @@ class GroupByWindowDataset(dataset_ops.Dataset):
     def tf_reduce_func(key, window_dataset_variant):
       """A wrapper for Defun that facilitates shape inference."""
       key.set_shape([])
-      window_dataset = _VariantDataset(window_dataset_variant,
-                                       input_dataset.output_types,
-                                       input_dataset.output_shapes)
+      window_dataset = _VariantDataset(
+          window_dataset_variant, input_dataset.output_types,
+          input_dataset.output_shapes, input_dataset.output_classes)
       if not isinstance(window_dataset, dataset_ops.Dataset):
         raise TypeError("`window_dataset` must return a `Dataset` object.")
       output_dataset = reduce_func(key, window_dataset)
       if not isinstance(output_dataset, dataset_ops.Dataset):
         raise TypeError("`reduce_func` must return a `Dataset` object.")
+      self._output_classes = output_dataset.output_classes
       self._output_types = output_dataset.output_types
       self._output_shapes = output_dataset.output_shapes
       return output_dataset._as_variant_tensor()  # pylint: disable=protected-access
@@ -185,6 +196,10 @@ class GroupByWindowDataset(dataset_ops.Dataset):
     self._reduce_func = tf_reduce_func
     self._reduce_func.add_to_graph(ops.get_default_graph())
 
+  @property
+  def output_classes(self):
+    return self._output_classes
+
   @property
   def output_shapes(self):
     return self._output_shapes
@@ -203,5 +218,6 @@ class GroupByWindowDataset(dataset_ops.Dataset):
         reduce_func=self._reduce_func,
         window_size_func=self._window_size_func,
         output_types=nest.flatten(
-            sparse.unwrap_sparse_types(self.output_types)),
-        output_shapes=nest.flatten(self.output_shapes))
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
diff --git a/tensorflow/contrib/data/python/ops/interleave_ops.py b/tensorflow/contrib/data/python/ops/interleave_ops.py
index 830642c0401b281e14e4dc7f7265ab6c77bbe513..53324e06e7f1dc249388410f0e14e42336630cd1 100644
--- a/tensorflow/contrib/data/python/ops/interleave_ops.py
+++ b/tensorflow/contrib/data/python/ops/interleave_ops.py
@@ -36,17 +36,21 @@ class ParallelInterleaveDataset(dataset_ops.Dataset):
     super(ParallelInterleaveDataset, self).__init__()
     self._input_dataset = input_dataset
 
-    @function.Defun(
-        *nest.flatten(sparse.unwrap_sparse_types(input_dataset.output_types)))
+    @function.Defun(*nest.flatten(
+        sparse.as_dense_types(input_dataset.output_types,
+                              input_dataset.output_classes)))
     def tf_map_func(*args):
       """A wrapper for Defun that facilitates shape inference."""
       # Pass in shape information from the input_dataset.
-      for arg, shape in zip(args, nest.flatten(input_dataset.output_shapes)):
+      dense_shapes = sparse.as_dense_shapes(input_dataset.output_shapes,
+                                            input_dataset.output_classes)
+      for arg, shape in zip(args, nest.flatten(dense_shapes)):
         arg.set_shape(shape)
 
       nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
       nested_args = sparse.deserialize_sparse_tensors(
-          nested_args, input_dataset.output_types)
+          nested_args, input_dataset.output_types, input_dataset.output_shapes,
+          input_dataset.output_classes)
       if dataset_ops._should_unpack_args(nested_args):  # pylint: disable=protected-access
         dataset = map_func(*nested_args)
       else:
@@ -55,6 +59,7 @@ class ParallelInterleaveDataset(dataset_ops.Dataset):
       if not isinstance(dataset, dataset_ops.Dataset):
         raise TypeError("`map_func` must return a `Dataset` object.")
 
+      self._output_classes = dataset.output_classes
       self._output_types = dataset.output_types
       self._output_shapes = dataset.output_shapes
 
@@ -79,8 +84,13 @@ class ParallelInterleaveDataset(dataset_ops.Dataset):
         self._sloppy,
         f=self._map_func,
         output_types=nest.flatten(
-            sparse.unwrap_sparse_types(self.output_types)),
-        output_shapes=nest.flatten(self.output_shapes))
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._output_classes
 
   @property
   def output_shapes(self):
diff --git a/tensorflow/contrib/data/python/ops/random_ops.py b/tensorflow/contrib/data/python/ops/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d727165feabb101549567f28a2dfa07083de244
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/random_ops.py
@@ -0,0 +1,67 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Datasets for random number generators."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import gen_dataset_ops
+
+
+class RandomDataset(dataset_ops.Dataset):
+  """A `Dataset` of pseudorandom values."""
+
+  def __init__(self, seed=None):
+    """A `Dataset` of pseudorandom values."""
+    super(RandomDataset, self).__init__()
+    seed, seed2 = random_seed.get_seed(seed)
+    if seed is None:
+      self._seed = constant_op.constant(0, dtype=dtypes.int64, name="seed")
+    else:
+      self._seed = ops.convert_to_tensor(seed, dtype=dtypes.int64, name="seed")
+    if seed2 is None:
+      self._seed2 = constant_op.constant(0, dtype=dtypes.int64, name="seed2")
+    else:
+      self._seed2 = ops.convert_to_tensor(
+          seed2, dtype=dtypes.int64, name="seed2")
+
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.random_dataset(
+        seed=self._seed,
+        seed2=self._seed2,
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return ops.Tensor
+
+  @property
+  def output_shapes(self):
+    return tensor_shape.scalar()
+
+  @property
+  def output_types(self):
+    return dtypes.int64
diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py
index 632082b5f1edb6c3aa25cacb0d4831f9e9e7488c..347e5edc7b0d479dfa260e8cec500ffaaba375be 100644
--- a/tensorflow/contrib/data/python/ops/readers.py
+++ b/tensorflow/contrib/data/python/ops/readers.py
@@ -164,7 +164,7 @@ def read_batch_features(file_pattern,
       shuffling but would increase memory usage and startup time.
 
   Returns:
-    A dict from keys in features to Tensor or SparseTensor objects.
+    A dict from keys in features to `Tensor` or `SparseTensor` objects.
   """
   filenames = _get_file_names(file_pattern, randomize_input)
   if reader_args:
@@ -179,6 +179,7 @@ def read_batch_features(file_pattern,
     dataset = dataset.shuffle(capacity)
   dataset = dataset.batch(batch_size)
   dataset = dataset.map(lambda x: parsing_ops.parse_example(x, features))
+  dataset = dataset.prefetch(1)
   iterator = dataset.make_one_shot_iterator()
   outputs = iterator.get_next()
   return outputs
@@ -269,6 +270,10 @@ class _SqlDataset(dataset_ops.Dataset):
                                        nest.flatten(self.output_types),
                                        nest.flatten(self.output_shapes))
 
+  @property
+  def output_classes(self):
+    return nest.map_structure(lambda _: ops.Tensor, self._output_types)
+
   @property
   def output_shapes(self):
     return nest.map_structure(lambda _: tensor_shape.TensorShape([]),
diff --git a/tensorflow/contrib/data/python/ops/scan_ops.py b/tensorflow/contrib/data/python/ops/scan_ops.py
index 2cfc0709cda37491f8cfa61c4f05b380931ab603..2744786e9eec4c9268ba854df6ea761339bb0b4e 100644
--- a/tensorflow/contrib/data/python/ops/scan_ops.py
+++ b/tensorflow/contrib/data/python/ops/scan_ops.py
@@ -53,6 +53,7 @@ class _ScanDataset(dataset_ops.Dataset):
         [t.dtype for t in nest.flatten(self._initial_state)])
 
     # Will be populated by calling `tf_scan_func`.
+    self._output_classes = None
     self._output_shapes = None
     self._output_types = None
 
@@ -68,13 +69,16 @@ class _ScanDataset(dataset_ops.Dataset):
       flat_new_state_shapes = []
 
       @function.Defun(*(flat_state_types + nest.flatten(
-          sparse.unwrap_sparse_types(input_dataset.output_types))))
+          sparse.as_dense_types(input_dataset.output_types,
+                                input_dataset.output_classes))))
       def tf_scan_func(*args):
         """A wrapper for Defun that facilitates shape inference."""
         # Pass in shape information from the state and input_dataset.
-        for arg, shape in zip(
-            args,
-            flat_state_shapes + nest.flatten(input_dataset.output_shapes)):
+        # TODO(b/69424092): Check that neither inputs nor outputs are sparse.
+        dense_shapes = sparse.as_dense_shapes(input_dataset.output_shapes,
+                                              input_dataset.output_classes)
+        for arg, shape in zip(args,
+                              flat_state_shapes + nest.flatten(dense_shapes)):
           arg.set_shape(shape)
 
         pivot = len(flat_state_shapes)
@@ -108,6 +112,8 @@ class _ScanDataset(dataset_ops.Dataset):
                 "state. Expected %s; got %s." %
                 (self._state_types, nest.pack_sequence_as(
                     self._state_types, [t.dtype for t in flat_new_state])))
+        self._output_classes = nest.pack_sequence_as(
+            output_value, [ops.Tensor for _ in flat_output_value])
         self._output_types = nest.pack_sequence_as(
             output_value, [t.dtype for t in flat_output_value])
 
@@ -147,8 +153,13 @@ class _ScanDataset(dataset_ops.Dataset):
         self._scan_func.captured_inputs,
         f=self._scan_func,
         output_types=nest.flatten(
-            sparse.unwrap_sparse_types(self.output_types)),
-        output_shapes=nest.flatten(self.output_shapes))
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._output_classes
 
   @property
   def output_shapes(self):
diff --git a/tensorflow/contrib/data/python/ops/shuffle_ops.py b/tensorflow/contrib/data/python/ops/shuffle_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..410989fad4f2a3bb8c9051c094ce8ab7b2eee96c
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/shuffle_ops.py
@@ -0,0 +1,120 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental shuffle ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
+from tensorflow.python.ops import gen_dataset_ops
+
+
+class _ShuffleAndRepeatDataset(dataset_ops.Dataset):
+  """A `Dataset` that fuses `shuffle` and `repeat`."""
+
+  def __init__(self,
+               input_dataset,
+               buffer_size,
+               count=None,
+               seed=None):
+    """See `Dataset.map()` for details."""
+    super(_ShuffleAndRepeatDataset, self).__init__()
+    self._input_dataset = input_dataset
+    self._buffer_size = ops.convert_to_tensor(
+        buffer_size, dtype=dtypes.int64, name="buffer_size")
+    if count is None:
+      self._count = constant_op.constant(-1, dtype=dtypes.int64, name="count")
+    else:
+      self._count = ops.convert_to_tensor(
+          count, dtype=dtypes.int64, name="count")
+
+    seed, seed2 = random_seed.get_seed(seed)
+    if seed is None:
+      self._seed = constant_op.constant(0, dtype=dtypes.int64, name="seed")
+    else:
+      self._seed = ops.convert_to_tensor(seed, dtype=dtypes.int64, name="seed")
+    if seed2 is None:
+      self._seed2 = constant_op.constant(0, dtype=dtypes.int64, name="seed2")
+    else:
+      self._seed2 = ops.convert_to_tensor(
+          seed2, dtype=dtypes.int64, name="seed2")
+
+  def _as_variant_tensor(self):
+    # pylint: disable=protected-access
+    input_resource = self._input_dataset._as_variant_tensor()
+    return gen_dataset_ops.shuffle_and_repeat_dataset(
+        input_resource,
+        buffer_size=self._buffer_size,
+        count=self._count,
+        seed=self._seed,
+        seed2=self._seed2,
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+    # pylint: enable=protected-access
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+
+def shuffle_and_repeat(buffer_size, count=None, seed=None):
+  """Shuffles and repeats a Dataset returning a new permutation for each epoch.
+
+  `dataset.apply(tf.contrib.data.shuffle_and_repeat(buffer_size, count))`
+
+  is equivalent to
+
+  `dataset.shuffle(buffer_size, reshuffle_each_iteration=True).repeat(count)`
+
+  The difference is that the latter dataset is not serializable. So,
+  if you need to checkpoint an input pipeline with reshuffling you must use
+  this implementation.
+
+  Args:
+    buffer_size: A `tf.int64` scalar `tf.Tensor`, representing the
+      maximum number elements that will be buffered when prefetching.
+    count: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
+      number of times the dataset should be repeated. The default behavior
+      (if `count` is `None` or `-1`) is for the dataset be repeated
+      indefinitely.
+    seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
+      random seed that will be used to create the distribution. See
+      @{tf.set_random_seed} for behavior.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.contrib.data.Dataset.apply}.
+  """
+
+  def _apply_fn(dataset):  # pylint: disable=missing-docstring
+    return _ShuffleAndRepeatDataset(dataset, buffer_size, count, seed)
+
+  return _apply_fn
diff --git a/tensorflow/contrib/data/python/ops/stats_ops.py b/tensorflow/contrib/data/python/ops/stats_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8875bd533ddc9e2c195646619dccf3aab5225e4
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/stats_ops.py
@@ -0,0 +1,177 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental API for gathering statistics from `tf.data` pipelines."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import gen_dataset_ops
+
+
+class StatsAggregator(object):
+  """A stateful resource that aggregates statistics from one or more iterators.
+
+  To record statistics, use one of the custom transformation functions defined
+  in this module when defining your @{tf.data.Dataset}. All statistics will be
+  aggregated by the `StatsAggregator` that is associated with a particular
+  iterator (see below). For example, to record the total number of bytes
+  produced by iterating over a dataset:
+
+  ```python
+  dataset = ...
+  dataset = dataset.apply(stats_ops.bytes_produced_stats("total_bytes"))
+  ```
+
+  To associate a `StatsAggregator` with a @{tf.data.Iterator} object, use
+  the following pattern:
+
+  ```python
+  dataset = ...
+  iterator = dataset.make_one_shot_iterator()
+  stats_aggregator = stats_ops.StatsAggregator()
+  set_op = stats_op.set_stats_aggregator_op(iterator, stats_aggregator)
+
+  with tf.Session() as sess:
+    # Running `set_op` will associate `iterator` with `stats_aggregator`.
+    sess.run(set_op)
+  ```
+
+  To get a protocol buffer summary of the currently aggregated statistics,
+  use the `StatsAggregator.get_summary()` tensor. The easiest way to do this
+  is to add the returned tensor to the @{tf.GraphKeys.SUMMARIES} collection,
+  so that the summaries will be included with any existing summaries.
+
+  ```python
+  stats_aggregator = stats_ops.StatsAggregator()
+  stats_summary = stats_aggregator.get_summary()
+  tf.add_to_collection(tf.GraphKeys.SUMMARIES, stats_summary)
+  ```
+
+  Note: This interface is experimental and expected to change. In particular,
+  we expect to add other implementations of `StatsAggregator` that provide
+  different ways of exporting statistics, and add more types of statistics.
+  """
+
+  def __init__(self):
+    """Creates a `StatsAggregator`."""
+    self._resource = gen_dataset_ops.stats_aggregator_handle()
+
+  def get_summary(self):
+    """Returns a string @{tf.Tensor} that summarizes the aggregated statistics.
+
+    The returned tensor will contain a serialized @{tf.summary.Summary} protocol
+    buffer, which can be used with the standard TensorBoard logging facilities.
+
+    Returns:
+      A scalar string @{tf.Tensor} that summarizes the aggregated statistics.
+    """
+    return gen_dataset_ops.stats_aggregator_summary(self._resource)
+
+  def subscribe(self, iterator):
+    """Returns a @{tf.Operation} to associate this aggregator with `iterator`.
+
+    Note: Each @{tf.data.Iterator} can be associated with at most one
+    `StatsAggregator`. After running the operation that this function
+    returns, all statistics recorded in the iteration of `iterator`
+    will be stored in `stats_aggregator`.
+
+    Args:
+      iterator: A @{tf.data.Iterator} object.
+
+    Returns:
+      A @{tf.Operation} that, when run, associates this aggregator with
+      `iterator`.
+    """
+    if not isinstance(iterator, iterator_ops.Iterator):
+      raise TypeError("`iterator` must be a `tf.data.Iterator` object.")
+    return gen_dataset_ops.iterator_set_stats_aggregator(
+        iterator._iterator_resource, self._resource)  # pylint: disable=protected-access
+
+
+def bytes_produced_stats(tag):
+  """Records the number of bytes produced by each element of the input dataset.
+
+  To consume the statistics, associate a `StatsAggregator` with an iterator
+  over the output dataset.
+
+  Args:
+    tag: String. All statistics recorded by the returned transformation will
+      be associated with the given `tag`.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.contrib.data.Dataset.apply}.
+  """
+
+  def _apply_fn(dataset):
+    return _StatsDataset(dataset, gen_dataset_ops.bytes_produced_stats_dataset,
+                         tag)
+
+  return _apply_fn
+
+
+def latency_stats(tag):
+  """Records the latency of producing each element of the input dataset.
+
+  To consume the statistics, associate a `StatsAggregator` with an iterator
+  over the output dataset.
+
+  Args:
+    tag: String. All statistics recorded by the returned transformation will
+      be associated with the given `tag`.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.contrib.data.Dataset.apply}.
+  """
+
+  def _apply_fn(dataset):
+    return _StatsDataset(dataset, gen_dataset_ops.latency_stats_dataset, tag)
+
+  return _apply_fn
+
+
+class _StatsDataset(dataset_ops.Dataset):
+  """A `Dataset` that acts as an identity, and also records statistics."""
+
+  def __init__(self, input_dataset, op_function, tag):
+    super(_StatsDataset, self).__init__()
+    self._input_dataset = input_dataset
+    self._op_function = op_function
+    self._tag = ops.convert_to_tensor(tag, dtype=dtypes.string)
+
+  def _as_variant_tensor(self):
+    return self._op_function(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        self._tag,
+        output_shapes=nest.flatten(self.output_shapes),
+        output_types=nest.flatten(self.output_types))
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
diff --git a/tensorflow/contrib/decision_trees/proto/generic_tree_model_proto.swig b/tensorflow/contrib/decision_trees/proto/generic_tree_model_proto.swig
index d3d201afd5761e7c5c136301c779222bedc68492..cafb9314caee1c4907786b8101e7c71bd7095306 100644
--- a/tensorflow/contrib/decision_trees/proto/generic_tree_model_proto.swig
+++ b/tensorflow/contrib/decision_trees/proto/generic_tree_model_proto.swig
@@ -2,7 +2,7 @@
 
 %include "net/proto/swig/protofunc.swig"
 
-#ifndef MUST_USE_RESULT
+#ifndef ABSL_MUST_USE_RESULT
 #error Use this file only as a %include or %import after google.swig.
 #endif
 
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 145b9495ff40f8095b50d00e576333fdf5d7acdf..95848af69950bdaa680c41daecd8cbd8f3174f8e 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -60,6 +60,7 @@ py_library(
         "//tensorflow/python:nn",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python:spectral_ops",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
@@ -204,6 +205,24 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "half_normal_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/half_normal_test.py"],
+    additional_deps = [
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:variables",
+    ],
+)
+
 cuda_py_test(
     name = "inverse_gamma_test",
     srcs = ["python/kernel_tests/inverse_gamma_test.py"],
@@ -419,6 +438,7 @@ cuda_py_test(
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:spectral_ops_test_util",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py
index 0d12d838932e3a46e07f4a4242b889296c6e13c4..7b401e178f35fe56e4eb461936565f5c630ec4cf 100644
--- a/tensorflow/contrib/distributions/__init__.py
+++ b/tensorflow/contrib/distributions/__init__.py
@@ -36,6 +36,7 @@ from tensorflow.contrib.distributions.python.ops.distribution_util import softpl
 from tensorflow.contrib.distributions.python.ops.distribution_util import tridiag
 from tensorflow.contrib.distributions.python.ops.estimator import *
 from tensorflow.contrib.distributions.python.ops.geometric import *
+from tensorflow.contrib.distributions.python.ops.half_normal import *
 from tensorflow.contrib.distributions.python.ops.independent import *
 from tensorflow.contrib.distributions.python.ops.inverse_gamma import *
 from tensorflow.contrib.distributions.python.ops.logistic import *
@@ -107,6 +108,7 @@ _allowed_symbols = [
     'Gamma',
     'GammaWithSoftplusConcentrationRate',
     'Geometric',
+    'HalfNormal',
     'Independent',
     'InverseGamma',
     'InverseGammaWithSoftplusConcentrationRate',
@@ -157,6 +159,10 @@ _allowed_symbols = [
     'assign_log_moving_mean_exp',
     'moving_mean_variance',
     'estimator_head_distribution_regression',
+    'quadrature_scheme_softmaxnormal_gauss_hermite',
+    'quadrature_scheme_softmaxnormal_quantiles',
+    'quadrature_scheme_lognormal_gauss_hermite',
+    'quadrature_scheme_lognormal_quantiles',
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py
index 25a9b6f5fe2ed6d218d6b44650fce17fa89c0664..288d9d8dd6f17cd6348d3d72aea4408e26913ebd 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py
@@ -22,9 +22,9 @@ import numpy as np
 
 from tensorflow.contrib.distributions.python.ops import test_util
 from tensorflow.contrib.distributions.python.ops.bijectors.invert import Invert
+from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive import _gen_mask
 from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive import masked_autoregressive_default_template
 from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive import MaskedAutoregressiveFlow
-from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive_impl import _gen_mask
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variables
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
index 38b3a23c2d684a6f89b7c4be4a763c649bf4de15..49451446b56d290f130c5db90c13b94974d92dc9 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
@@ -28,8 +28,19 @@ from tensorflow.python.ops.distributions.bijector_test_util import assert_biject
 from tensorflow.python.platform import test
 
 
-class ReshapeBijectorTest(test.TestCase):
-  """Tests correctness of the reshape transformation."""
+class _ReshapeBijectorTest(object):
+  """Base class for testing the reshape transformation.
+
+  Methods defined in this class call a method self.build_shapes() that
+  is implemented by subclasses defined below, returning respectively
+   ReshapeBijectorTestStatic: static shapes,
+   ReshapeBijectorTestDynamic: shape placeholders of known ndims, and
+   ReshapeBijectorTestDynamicNdims: shape placeholders of unspecified ndims,
+  so that each test in this base class is automatically run over all
+  three cases. The subclasses also implement assertRaisesError to test
+  for either Python exceptions (in the case of static shapes) or
+  TensorFlow op errors (dynamic shapes).
+  """
 
   def setUp(self):
     self._rng = np.random.RandomState(42)
@@ -40,9 +51,10 @@ class ReshapeBijectorTest(test.TestCase):
     expected_y = np.reshape(expected_x, [4, 6])
 
     with self.test_session() as sess:
+      shape_in, shape_out, feed_dict = self.build_shapes([3, 2], [6,])
       bijector = Reshape(
-          event_shape_out=[6,],
-          event_shape_in=[3, 2],
+          event_shape_out=shape_out,
+          event_shape_in=shape_in,
           validate_args=True)
       (x_,
        y_,
@@ -52,66 +64,23 @@ class ReshapeBijectorTest(test.TestCase):
            bijector.forward(expected_x),
            bijector.forward_log_det_jacobian(expected_x),
            bijector.inverse_log_det_jacobian(expected_y),
-       ))
+       ), feed_dict=feed_dict)
       self.assertEqual("reshape", bijector.name)
       self.assertAllClose(expected_y, y_, rtol=1e-6, atol=0)
       self.assertAllClose(expected_x, x_, rtol=1e-6, atol=0)
       self.assertAllClose(0., fldj_, rtol=1e-6, atol=0)
       self.assertAllClose(0., ildj_, rtol=1e-6, atol=0)
 
-  def testEventShapeDynamicNdims(self):
-    """Check forward/inverse shape methods with dynamic ndims."""
-
-    shape_in = tensor_shape.TensorShape([6,])
-    shape_in_ph = array_ops.placeholder(dtype=dtypes.int32)
-
-    shape_out = tensor_shape.TensorShape([2, 3])
-    shape_out_ph = array_ops.placeholder(dtype=dtypes.int32)
-
-    bijector = Reshape(
-        event_shape_out=shape_out_ph,
-        event_shape_in=shape_in_ph, validate_args=True)
-
-    # using the _tensor methods, we should always get a fully-specified
-    # result since these are evaluated at graph runtime.
-    with self.test_session() as sess:
-      (shape_out_,
-       shape_in_) = sess.run((
-           bijector.forward_event_shape_tensor(shape_in),
-           bijector.inverse_event_shape_tensor(shape_out),
-       ), feed_dict={
-           shape_in_ph: shape_in,
-           shape_out_ph: shape_out,
-       })
-      self.assertAllEqual(shape_out, shape_out_)
-      self.assertAllEqual(shape_in, shape_in_)
-
-  def testEventShapeDynamic(self):
-    """Check shape methods with static ndims but dynamic shape."""
-
-    shape_in = tensor_shape.TensorShape([6,])
-    shape_in_partial = tensor_shape.TensorShape([None,])
-    shape_in_ph = array_ops.placeholder(
-        shape=[1,], dtype=dtypes.int32)
-
-    shape_out = tensor_shape.TensorShape([2, 3])
-    shape_out_partial = tensor_shape.TensorShape([None, None])
-    shape_out_ph = array_ops.placeholder(
-        shape=[2,], dtype=dtypes.int32)
+  def testEventShapeTensor(self):
+    """Test event_shape_tensor methods when even ndims may be dynamic."""
 
+    shape_in_static = [2, 3]
+    shape_out_static = [6,]
+    shape_in, shape_out, feed_dict = self.build_shapes(shape_in_static,
+                                                       shape_out_static)
     bijector = Reshape(
-        event_shape_out=shape_out_ph,
-        event_shape_in=shape_in_ph,
-        validate_args=True)
-
-    # if event shapes are not statically available, should
-    # return partially-specified TensorShapes.
-    self.assertAllEqual(
-        bijector.forward_event_shape(shape_in).as_list(),
-        shape_out_partial.as_list())
-    self.assertAllEqual(
-        bijector.inverse_event_shape(shape_out).as_list(),
-        shape_in_partial.as_list())
+        event_shape_out=shape_out,
+        event_shape_in=shape_in, validate_args=True)
 
     # using the _tensor methods, we should always get a fully-specified
     # result since these are evaluated at graph runtime.
@@ -120,42 +89,9 @@ class ReshapeBijectorTest(test.TestCase):
        shape_in_) = sess.run((
            bijector.forward_event_shape_tensor(shape_in),
            bijector.inverse_event_shape_tensor(shape_out),
-       ), feed_dict={
-           shape_in_ph: shape_in,
-           shape_out_ph: shape_out,
-       })
-      self.assertAllEqual(shape_out, shape_out_)
-      self.assertAllEqual(shape_in, shape_in_)
-
-  def testEventShapeStatic(self):
-    """Check shape methods when shape is statically known."""
-
-    shape_in = tensor_shape.TensorShape([6,])
-    shape_out = tensor_shape.TensorShape([2, 3])
-
-    bijector_static = Reshape(
-        event_shape_out=shape_out,
-        event_shape_in=shape_in,
-        validate_args=True)
-
-    # test that forward_ and inverse_event_shape do sensible things
-    # when shapes are statically known.
-    self.assertEqual(
-        bijector_static.forward_event_shape(shape_in),
-        shape_out)
-    self.assertEqual(
-        bijector_static.inverse_event_shape(shape_out),
-        shape_in)
-
-    with self.test_session() as sess:
-      (shape_out_static_,
-       shape_in_static_,
-      ) = sess.run((
-          bijector_static.forward_event_shape_tensor(shape_in),
-          bijector_static.inverse_event_shape_tensor(shape_out),
-      ))
-      self.assertAllEqual(shape_out, shape_out_static_)
-      self.assertAllEqual(shape_in, shape_in_static_)
+       ), feed_dict=feed_dict)
+      self.assertAllEqual(shape_out_static, shape_out_)
+      self.assertAllEqual(shape_in_static, shape_in_)
 
   def testScalarReshape(self):
     """Test reshaping to and from a scalar shape ()."""
@@ -166,11 +102,11 @@ class ReshapeBijectorTest(test.TestCase):
     expected_x_scalar = np.random.randn(1,)
     expected_y_scalar = expected_x_scalar[0]
 
+    shape_in, shape_out, feed_dict = self.build_shapes([], [1,])
     with self.test_session() as sess:
       bijector = Reshape(
-          event_shape_out=[],
-          event_shape_in=[1,], validate_args=True)
-
+          event_shape_out=shape_in,
+          event_shape_in=shape_out, validate_args=True)
       (x_,
        y_,
        x_scalar_,
@@ -180,53 +116,178 @@ class ReshapeBijectorTest(test.TestCase):
           bijector.forward(expected_x),
           bijector.inverse(expected_y_scalar),
           bijector.forward(expected_x_scalar),
-      ))
+      ), feed_dict=feed_dict)
       self.assertAllClose(expected_y, y_, rtol=1e-6, atol=0)
       self.assertAllClose(expected_x, x_, rtol=1e-6, atol=0)
       self.assertAllClose(expected_y_scalar, y_scalar_, rtol=1e-6, atol=0)
       self.assertAllClose(expected_x_scalar, x_scalar_, rtol=1e-6, atol=0)
 
-  def testRaisesOpError(self):
-    x1 = np.random.randn(4, 2, 3)
-    x2 = np.random.randn(4, 3, 2)
-    x3 = np.random.randn(4, 5, 1, 1)
+  def testMultipleUnspecifiedDimensionsOpError(self):
 
     with self.test_session() as sess:
-      shape_in_ph = array_ops.placeholder(shape=[2,], dtype=dtypes.int32)
-      shape_out_ph = array_ops.placeholder(shape=[3,], dtype=dtypes.int32)
+      shape_in, shape_out, feed_dict = self.build_shapes([2, 3], [4, -1, -1,])
       bijector = Reshape(
-          event_shape_out=shape_out_ph,
-          event_shape_in=shape_in_ph,
+          event_shape_out=shape_out,
+          event_shape_in=shape_in,
           validate_args=True)
 
-      with self.assertRaisesOpError(
+      with self.assertRaisesError(
+          "elements must have at most one `-1`."):
+        sess.run(bijector.forward_event_shape_tensor(shape_in),
+                 feed_dict=feed_dict)
+
+  def testInvalidDimensionsOpError(self):
+
+    with self.test_session() as sess:
+
+      shape_in, shape_out, feed_dict = self.build_shapes([2, 3], [1, 2, -2,])
+      bijector = Reshape(
+          event_shape_out=shape_out,
+          event_shape_in=shape_in,
+          validate_args=True)
+
+      with self.assertRaisesError(
+          "elements must be either positive integers or `-1`."):
+        sess.run(bijector.forward_event_shape_tensor(shape_in),
+                 feed_dict=feed_dict)
+
+  def testValidButNonMatchingInputOpError(self):
+    x = np.random.randn(4, 3, 2)
+
+    with self.test_session() as sess:
+      shape_in, shape_out, feed_dict = self.build_shapes([2, 3], [1, 6, 1,])
+      bijector = Reshape(
+          event_shape_out=shape_out,
+          event_shape_in=shape_in,
+          validate_args=True)
+
+      # Here we pass in a tensor (x) whose shape is compatible with
+      # the output shape, so tf.reshape will throw no error, but
+      # doesn't match the expected input shape.
+      with self.assertRaisesError(
           "Input `event_shape` does not match `event_shape_in`."):
-        sess.run(bijector.forward(x2),
-                 feed_dict={shape_out_ph: [1, 6, 1],
-                            shape_in_ph: [2, 3]})
+        sess.run(bijector.forward(x),
+                 feed_dict=feed_dict)
 
-      with self.assertRaisesOpError(
-          "event_shape_out entries must be positive."):
-        sess.run(bijector.forward(x1),
-                 feed_dict={shape_out_ph: [-1, -1, 6],
-                            shape_in_ph: [2, 3]})
+  def testValidButNonMatchingInputPartiallySpecifiedOpError(self):
+    x = np.random.randn(4, 3, 2)
+
+    with self.test_session() as sess:
+      shape_in, shape_out, feed_dict = self.build_shapes([2, -1], [1, 6, 1,])
+      bijector = Reshape(
+          event_shape_out=shape_out,
+          event_shape_in=shape_in,
+          validate_args=True)
+
+      with self.assertRaisesError(
+          "Input `event_shape` does not match `event_shape_in`."):
+        sess.run(bijector.forward(x),
+                 feed_dict=feed_dict)
+
+  def testInputOutputMismatchOpError(self):
+    x1 = np.random.randn(4, 2, 3)
+    x2 = np.random.randn(4, 1, 1, 5)
+
+    with self.test_session() as sess:
+      shape_in, shape_out, fd_mismatched = self.build_shapes([2, 3],
+                                                             [1, 1, 5])
+      bijector = Reshape(
+          event_shape_out=shape_out,
+          event_shape_in=shape_in,
+          validate_args=True)
 
       # test that *all* methods check basic assertions
-      fd_mismatched = {shape_out_ph: [1, 1, 5], shape_in_ph: [2, 3]}
-      with self.assertRaisesOpError(
-          "Input/output `event_size`s do not match."):
+      with self.assertRaisesError(
+          "Input to reshape is a tensor with"):
         sess.run(bijector.forward(x1), feed_dict=fd_mismatched)
-      with self.assertRaisesOpError(
-          "Input/output `event_size`s do not match."):
-        sess.run(bijector.inverse(x3), feed_dict=fd_mismatched)
-      with self.assertRaisesOpError(
-          "Input/output `event_size`s do not match."):
-        sess.run(bijector.inverse_log_det_jacobian(x3),
-                 feed_dict=fd_mismatched)
-      with self.assertRaisesOpError(
-          "Input/output `event_size`s do not match."):
-        sess.run(bijector.forward_log_det_jacobian(x1),
-                 feed_dict=fd_mismatched)
+      with self.assertRaisesError(
+          "Input to reshape is a tensor with"):
+        sess.run(bijector.inverse(x2), feed_dict=fd_mismatched)
+
+  def testOneShapePartiallySpecified(self):
+    expected_x = np.random.randn(4, 6)
+    expected_y = np.reshape(expected_x, [4, 2, 3])
+
+    with self.test_session() as sess:
+      # one of input/output shapes is partially specified
+      shape_in, shape_out, feed_dict = self.build_shapes([-1,], [2, 3])
+      bijector = Reshape(
+          event_shape_out=shape_out,
+          event_shape_in=shape_in,
+          validate_args=True)
+      (x_,
+       y_,
+      ) = sess.run((
+          bijector.inverse(expected_y),
+          bijector.forward(expected_x),
+      ), feed_dict=feed_dict)
+      self.assertAllClose(expected_y, y_, rtol=1e-6, atol=0)
+      self.assertAllClose(expected_x, x_, rtol=1e-6, atol=0)
+
+  def testBothShapesPartiallySpecified(self):
+    expected_x = np.random.randn(4, 2, 3)
+    expected_y = np.reshape(expected_x, [4, 3, 2])
+    with self.test_session() as sess:
+      shape_in, shape_out, feed_dict = self.build_shapes([-1, 3], [-1, 2])
+      bijector = Reshape(
+          event_shape_out=shape_out,
+          event_shape_in=shape_in,
+          validate_args=True)
+      (x_,
+       y_,
+      ) = sess.run((
+          bijector.inverse(expected_y),
+          bijector.forward(expected_x),
+      ), feed_dict=feed_dict)
+      self.assertAllClose(expected_y, y_, rtol=1e-6, atol=0)
+      self.assertAllClose(expected_x, x_, rtol=1e-6, atol=0)
+
+  def testDefaultVectorShape(self):
+    expected_x = np.random.randn(4, 4)
+    expected_y = np.reshape(expected_x, [4, 2, 2])
+    with self.test_session() as sess:
+      _, shape_out, feed_dict = self.build_shapes([-1,], [-1, 2])
+      bijector = Reshape(shape_out,
+                         validate_args=True)
+      (x_,
+       y_,
+      ) = sess.run((
+          bijector.inverse(expected_y),
+          bijector.forward(expected_x),
+      ), feed_dict=feed_dict)
+      self.assertAllClose(expected_y, y_, rtol=1e-6, atol=0)
+      self.assertAllClose(expected_x, x_, rtol=1e-6, atol=0)
+
+  def build_shapes(self, *args, **kwargs):
+    raise NotImplementedError("Subclass failed to implement `build_shapes`.")
+
+
+class ReshapeBijectorTestStatic(test.TestCase, _ReshapeBijectorTest):
+
+  def build_shapes(self, shape_in, shape_out):
+    shape_in_static = shape_in
+    shape_out_static = shape_out
+    feed_dict = {}
+    return shape_in_static, shape_out_static, feed_dict
+
+  def assertRaisesError(self, msg):
+    return self.assertRaisesRegexp(Exception, msg)
+
+  def testEventShape(self):
+    shape_in_static = tensor_shape.TensorShape([2, 3])
+    shape_out_static = tensor_shape.TensorShape([6,])
+    bijector = Reshape(
+        event_shape_out=shape_out_static,
+        event_shape_in=shape_in_static, validate_args=True)
+
+    # test that forward_ and inverse_event_shape do sensible things
+    # when shapes are statically known.
+    self.assertEqual(
+        bijector.forward_event_shape(shape_in_static),
+        shape_out_static)
+    self.assertEqual(
+        bijector.inverse_event_shape(shape_out_static),
+        shape_in_static)
 
   def testBijectiveAndFinite(self):
     x = np.random.randn(4, 2, 3)
@@ -238,5 +299,32 @@ class ReshapeBijectorTest(test.TestCase):
           validate_args=True)
       assert_bijective_and_finite(bijector, x, y, rtol=1e-6, atol=0)
 
+
+class ReshapeBijectorTestDynamic(test.TestCase, _ReshapeBijectorTest):
+
+  def build_shapes(self, shape_in, shape_out):
+    shape_in_ph = array_ops.placeholder(shape=(len(shape_in),),
+                                        dtype=dtypes.int32)
+    shape_out_ph = array_ops.placeholder(shape=(len(shape_out),),
+                                         dtype=dtypes.int32)
+    feed_dict = {shape_in_ph: shape_in, shape_out_ph: shape_out}
+    return shape_in_ph, shape_out_ph, feed_dict
+
+  def assertRaisesError(self, msg):
+    return self.assertRaisesOpError(msg)
+
+
+class ReshapeBijectorTestDynamicNdims(test.TestCase, _ReshapeBijectorTest):
+
+  def build_shapes(self, shape_in, shape_out):
+    shape_in_ph = array_ops.placeholder(shape=None, dtype=dtypes.int32)
+    shape_out_ph = array_ops.placeholder(shape=None, dtype=dtypes.int32)
+    feed_dict = {shape_in_ph: shape_in, shape_out_ph: shape_out}
+    return shape_in_ph, shape_out_ph, feed_dict
+
+  def assertRaisesError(self, msg):
+    return self.assertRaisesOpError(msg)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py b/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py
index 7f7697357ce7c77b2a50b87271d4ba7b49cbe05e..73747db31c86b67eaad5aeab7d5e80191e12b333 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py
@@ -41,6 +41,7 @@ def try_import(name):  # pylint: disable=invalid-name
     tf_logging.warning("Could not import %s: %s" % (name, str(e)))
   return module
 
+
 stats = try_import("scipy.stats")
 
 
@@ -62,9 +63,9 @@ class CauchyTest(test.TestCase):
       self.assertAllEqual(expected, scale_shape.eval())
       loc = array_ops.zeros(loc_shape)
       scale = array_ops.ones(scale_shape)
-      self.assertAllEqual(
-          expected,
-          array_ops.shape(cauchy_lib.Cauchy(loc, scale).sample()).eval())
+      self.assertAllEqual(expected,
+                          array_ops.shape(
+                              cauchy_lib.Cauchy(loc, scale).sample()).eval())
 
   def _testParamStaticShapes(self, sample_shape, expected):
     param_shapes = cauchy_lib.Cauchy.param_static_shapes(sample_shape)
@@ -92,8 +93,7 @@ class CauchyTest(test.TestCase):
       cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
 
       log_pdf = cauchy.log_prob(x)
-      self.assertAllEqual(cauchy.batch_shape_tensor().eval(),
-                          log_pdf.shape)
+      self.assertAllEqual(cauchy.batch_shape_tensor().eval(), log_pdf.shape)
       self.assertAllEqual(cauchy.batch_shape_tensor().eval(),
                           log_pdf.eval().shape)
       self.assertAllEqual(cauchy.batch_shape, log_pdf.shape)
@@ -115,16 +115,15 @@ class CauchyTest(test.TestCase):
     with self.test_session():
       batch_size = 6
       loc = constant_op.constant([[3.0, -3.0]] * batch_size)
-      scale = constant_op.constant([[np.sqrt(10.0), np.sqrt(15.0)]] *
-                                   batch_size)
+      scale = constant_op.constant(
+          [[np.sqrt(10.0), np.sqrt(15.0)]] * batch_size)
       x = np.array([[-2.5, 2.5, 4.0, 0.0, -1.0, 2.0]], dtype=np.float32).T
       cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
 
       log_pdf = cauchy.log_prob(x)
       log_pdf_values = log_pdf.eval()
       self.assertEqual(log_pdf.shape, (6, 2))
-      self.assertAllEqual(cauchy.batch_shape_tensor().eval(),
-                          log_pdf.shape)
+      self.assertAllEqual(cauchy.batch_shape_tensor().eval(), log_pdf.shape)
       self.assertAllEqual(cauchy.batch_shape_tensor().eval(),
                           log_pdf.eval().shape)
       self.assertAllEqual(cauchy.batch_shape, log_pdf.shape)
@@ -248,8 +247,7 @@ class CauchyTest(test.TestCase):
       cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
 
       entropy = cauchy.entropy()
-      self.assertAllEqual(cauchy.batch_shape_tensor().eval(),
-                          entropy.shape)
+      self.assertAllEqual(cauchy.batch_shape_tensor().eval(), entropy.shape)
       self.assertAllEqual(cauchy.batch_shape_tensor().eval(),
                           entropy.eval().shape)
       self.assertAllEqual(cauchy.batch_shape, entropy.shape)
@@ -257,7 +255,7 @@ class CauchyTest(test.TestCase):
 
       if not stats:
         return
-      expected_entropy = stats.cauchy(loc, scale).entropy()
+      expected_entropy = stats.cauchy(loc, scale[0]).entropy().reshape((1, 3))
       self.assertAllClose(expected_entropy, entropy.eval())
 
   def testCauchyMode(self):
@@ -368,8 +366,8 @@ class CauchyTest(test.TestCase):
       self.assertAllEqual(expected_shape, samples.shape)
       self.assertAllEqual(expected_shape, sample_values.shape)
 
-      expected_shape = (tensor_shape.TensorShape(
-          [n.eval()]).concatenate(cauchy.batch_shape))
+      expected_shape = (
+          tensor_shape.TensorShape([n.eval()]).concatenate(cauchy.batch_shape))
 
       self.assertAllEqual(expected_shape, samples.shape)
       self.assertAllEqual(expected_shape, sample_values.shape)
@@ -385,18 +383,18 @@ class CauchyTest(test.TestCase):
       samples = cauchy.sample(n)
       sample_values = samples.eval()
       self.assertEqual(samples.shape, (100000, batch_size, 2))
-      self.assertAllClose(np.median(sample_values[:, 0, 0]),
-                          loc_v[0], atol=1e-1)
-      self.assertAllClose(np.median(sample_values[:, 0, 1]),
-                          loc_v[1], atol=1e-1)
+      self.assertAllClose(
+          np.median(sample_values[:, 0, 0]), loc_v[0], atol=1e-1)
+      self.assertAllClose(
+          np.median(sample_values[:, 0, 1]), loc_v[1], atol=1e-1)
 
       expected_shape = tensor_shape.TensorShape([n.eval()]).concatenate(
           tensor_shape.TensorShape(cauchy.batch_shape_tensor().eval()))
       self.assertAllEqual(expected_shape, samples.shape)
       self.assertAllEqual(expected_shape, sample_values.shape)
 
-      expected_shape = (tensor_shape.TensorShape(
-          [n.eval()]).concatenate(cauchy.batch_shape))
+      expected_shape = (
+          tensor_shape.TensorShape([n.eval()]).concatenate(cauchy.batch_shape))
       self.assertAllEqual(expected_shape, samples.shape)
       self.assertAllEqual(expected_shape, sample_values.shape)
 
@@ -428,9 +426,12 @@ class CauchyTest(test.TestCase):
       self.assertEqual(cauchy.event_shape, ())
       self.assertAllEqual(cauchy.event_shape_tensor().eval(), [])
       self.assertAllEqual(
-          sess.run(cauchy.batch_shape_tensor(),
-                   feed_dict={loc: 5.0,
-                              scale: [1.0, 2.0]}), [2])
+          sess.run(
+              cauchy.batch_shape_tensor(),
+              feed_dict={
+                  loc: 5.0,
+                  scale: [1.0, 2.0]
+              }), [2])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
index 2d74aa1f320149d0f7ef9e9c52b8c7053c2f74d7..a255d4fc890e67180532e342332a8e3f63a869cd 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
@@ -395,5 +395,110 @@ class MixtureStddevTest(test.TestCase):
     self.assertAllClose(actual_devs, expected_devs)
 
 
+class _PadTest(object):
+
+  def testNegAxisCorrectness(self):
+    x_ = np.float32([[1., 2, 3],
+                     [4, 5, 6]])
+    value_ = np.float32(0.25)
+    count_ = np.int32(2)
+    with self.test_session() as sess:
+      x = array_ops.placeholder_with_default(
+          x_, shape=x_.shape if self.is_static_shape else None)
+      value = (constant_op.constant(value_) if self.is_static_shape
+               else array_ops.placeholder_with_default(value_, shape=None))
+      count = (constant_op.constant(count_) if self.is_static_shape
+               else array_ops.placeholder_with_default(count_, shape=None))
+
+      x0_front = distribution_util.pad(
+          x, axis=-2, value=value, count=count, front=True)
+      x0_back = distribution_util.pad(
+          x, axis=-2, count=count, back=True)
+      x0_both = distribution_util.pad(
+          x, axis=-2, value=value, front=True, back=True)
+
+      if self.is_static_shape:
+        self.assertAllEqual([4, 3], x0_front.shape)
+        self.assertAllEqual([4, 3], x0_back.shape)
+        self.assertAllEqual([4, 3], x0_both.shape)
+
+      [x0_front_, x0_back_, x0_both_] = sess.run([
+          x0_front, x0_back, x0_both])
+
+      self.assertAllClose(
+          np.float32([[value_]*3,
+                      [value_]*3,
+                      [1, 2, 3],
+                      [4, 5, 6]]),
+          x0_front_, atol=0., rtol=1e-6)
+      self.assertAllClose(
+          np.float32([[1, 2, 3],
+                      [4, 5, 6],
+                      [0.]*3,
+                      [0.]*3]),
+          x0_back_, atol=0., rtol=1e-6)
+      self.assertAllClose(
+          np.float32([[value_]*3,
+                      [1, 2, 3],
+                      [4, 5, 6],
+                      [value_]*3]),
+          x0_both_, atol=0., rtol=1e-6)
+
+  def testPosAxisCorrectness(self):
+    x_ = np.float32([[1., 2, 3],
+                     [4, 5, 6]])
+    value_ = np.float32(0.25)
+    count_ = np.int32(2)
+    with self.test_session() as sess:
+      x = array_ops.placeholder_with_default(
+          x_, shape=x_.shape if self.is_static_shape else None)
+      value = (constant_op.constant(value_) if self.is_static_shape
+               else array_ops.placeholder_with_default(value_, shape=None))
+      count = (constant_op.constant(count_) if self.is_static_shape
+               else array_ops.placeholder_with_default(count_, shape=None))
+
+      x1_front = distribution_util.pad(
+          x, axis=1, value=value, count=count, front=True)
+      x1_back = distribution_util.pad(
+          x, axis=1, count=count, back=True)
+      x1_both = distribution_util.pad(
+          x, axis=1, value=value, front=True, back=True)
+
+      if self.is_static_shape:
+        self.assertAllEqual([2, 5], x1_front.shape)
+        self.assertAllEqual([2, 5], x1_back.shape)
+        self.assertAllEqual([2, 5], x1_both.shape)
+
+      [x1_front_, x1_back_, x1_both_] = sess.run([
+          x1_front, x1_back, x1_both])
+
+      self.assertAllClose(
+          np.float32([[value_]*2 + [1, 2, 3],
+                      [value_]*2 + [4, 5, 6]]),
+          x1_front_, atol=0., rtol=1e-6)
+      self.assertAllClose(
+          np.float32([[1, 2, 3] + [0.]*2,
+                      [4, 5, 6] + [0.]*2]),
+          x1_back_, atol=0., rtol=1e-6)
+      self.assertAllClose(
+          np.float32([[value_, 1, 2, 3, value_],
+                      [value_, 4, 5, 6, value_]]),
+          x1_both_, atol=0., rtol=1e-6)
+
+
+class PadStaticTest(_PadTest, test.TestCase):
+
+  @property
+  def is_static_shape(self):
+    return True
+
+
+class PadDynamicTest(_PadTest, test.TestCase):
+
+  @property
+  def is_static_shape(self):
+    return False
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py b/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4e75660083dc2edd1759a3a54e221d9e8a268c3
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py
@@ -0,0 +1,320 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for initializers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import importlib
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops import half_normal as hn_lib
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+stats = try_import("scipy.stats")
+
+
+class HalfNormalTest(test.TestCase):
+
+  def setUp(self):
+    self._rng = np.random.RandomState(123)
+
+  def assertAllFinite(self, tensor):
+    is_finite = np.isfinite(tensor.eval())
+    all_true = np.ones_like(is_finite, dtype=np.bool)
+    self.assertAllEqual(all_true, is_finite)
+
+  def _testParamShapes(self, sample_shape, expected):
+    with self.test_session():
+      param_shapes = hn_lib.HalfNormal.param_shapes(sample_shape)
+      scale_shape = param_shapes["scale"]
+      self.assertAllEqual(expected, scale_shape.eval())
+      scale = array_ops.ones(scale_shape)
+      self.assertAllEqual(
+          expected,
+          array_ops.shape(hn_lib.HalfNormal(scale).sample()).eval())
+
+  def _testParamStaticShapes(self, sample_shape, expected):
+    param_shapes = hn_lib.HalfNormal.param_static_shapes(sample_shape)
+    scale_shape = param_shapes["scale"]
+    self.assertEqual(expected, scale_shape)
+
+  def _testBatchShapes(self, dist, tensor):
+    self.assertAllEqual(dist.batch_shape_tensor().eval(), tensor.shape)
+    self.assertAllEqual(dist.batch_shape_tensor().eval(), tensor.eval().shape)
+    self.assertAllEqual(dist.batch_shape, tensor.shape)
+    self.assertAllEqual(dist.batch_shape, tensor.eval().shape)
+
+  def testParamShapes(self):
+    sample_shape = [10, 3, 4]
+    self._testParamShapes(sample_shape, sample_shape)
+    self._testParamShapes(constant_op.constant(sample_shape), sample_shape)
+
+  def testParamStaticShapes(self):
+    sample_shape = [10, 3, 4]
+    self._testParamStaticShapes(sample_shape, sample_shape)
+    self._testParamStaticShapes(
+        tensor_shape.TensorShape(sample_shape), sample_shape)
+
+  def testHalfNormalLogPDF(self):
+    with self.test_session():
+      batch_size = 6
+      scale = constant_op.constant([3.0] * batch_size)
+      x = np.array([-2.5, 2.5, 4.0, 0.0, -1.0, 2.0], dtype=np.float32)
+      halfnorm = hn_lib.HalfNormal(scale=scale)
+
+      log_pdf = halfnorm.log_prob(x)
+      self._testBatchShapes(halfnorm, log_pdf)
+
+      pdf = halfnorm.prob(x)
+      self._testBatchShapes(halfnorm, pdf)
+
+      if not stats:
+        return
+      expected_log_pdf = stats.halfnorm(scale=scale.eval()).logpdf(x)
+      self.assertAllClose(expected_log_pdf, log_pdf.eval())
+      self.assertAllClose(np.exp(expected_log_pdf), pdf.eval())
+
+  def testHalfNormalLogPDFMultidimensional(self):
+    with self.test_session():
+      batch_size = 6
+      scale = constant_op.constant([[3.0, 1.0]] * batch_size)
+      x = np.array([[-2.5, 2.5, 4.0, 0.0, -1.0, 2.0]], dtype=np.float32).T
+      halfnorm = hn_lib.HalfNormal(scale=scale)
+
+      log_pdf = halfnorm.log_prob(x)
+      self._testBatchShapes(halfnorm, log_pdf)
+
+      pdf = halfnorm.prob(x)
+      self._testBatchShapes(halfnorm, pdf)
+
+      if not stats:
+        return
+      expected_log_pdf = stats.halfnorm(scale=scale.eval()).logpdf(x)
+      self.assertAllClose(expected_log_pdf, log_pdf.eval())
+      self.assertAllClose(np.exp(expected_log_pdf), pdf.eval())
+
+  def testHalfNormalCDF(self):
+    with self.test_session():
+      batch_size = 50
+      scale = self._rng.rand(batch_size) + 1.0
+      x = np.linspace(-8.0, 8.0, batch_size).astype(np.float64)
+      halfnorm = hn_lib.HalfNormal(scale=scale)
+
+      cdf = halfnorm.cdf(x)
+      self._testBatchShapes(halfnorm, cdf)
+
+      log_cdf = halfnorm.log_cdf(x)
+      self._testBatchShapes(halfnorm, log_cdf)
+
+      if not stats:
+        return
+      expected_logcdf = stats.halfnorm(scale=scale).logcdf(x)
+      self.assertAllClose(expected_logcdf, log_cdf.eval(), atol=0)
+      self.assertAllClose(np.exp(expected_logcdf), cdf.eval(), atol=0)
+
+  def testHalfNormalSurvivalFunction(self):
+    with self.test_session():
+      batch_size = 50
+      scale = self._rng.rand(batch_size) + 1.0
+      x = np.linspace(-8.0, 8.0, batch_size).astype(np.float64)
+      halfnorm = hn_lib.HalfNormal(scale=scale)
+
+      sf = halfnorm.survival_function(x)
+      self._testBatchShapes(halfnorm, sf)
+
+      log_sf = halfnorm.log_survival_function(x)
+      self._testBatchShapes(halfnorm, log_sf)
+
+      if not stats:
+        return
+      expected_logsf = stats.halfnorm(scale=scale).logsf(x)
+      self.assertAllClose(expected_logsf, log_sf.eval(), atol=0)
+      self.assertAllClose(np.exp(expected_logsf), sf.eval(), atol=0)
+
+  def testHalfNormalQuantile(self):
+    with self.test_session():
+      batch_size = 50
+      scale = self._rng.rand(batch_size) + 1.0
+      p = np.linspace(0., 1.0, batch_size).astype(np.float64)
+
+      halfnorm = hn_lib.HalfNormal(scale=scale)
+      x = halfnorm.quantile(p)
+      self._testBatchShapes(halfnorm, x)
+
+      if not stats:
+        return
+      expected_x = stats.halfnorm(scale=scale).ppf(p)
+      self.assertAllClose(expected_x, x.eval(), atol=0)
+
+  def testFiniteGradients(self):
+    for dtype in [np.float32, np.float64]:
+      g = ops.Graph()
+      with g.as_default():
+        scale = variables.Variable(dtype(3.0))
+        dist = hn_lib.HalfNormal(scale=scale)
+        x = np.array([0.01, 0.1, 1., 5., 10.]).astype(dtype)
+        for func in [
+            dist.cdf, dist.log_cdf, dist.survival_function,
+            dist.log_prob, dist.prob, dist.log_survival_function,
+        ]:
+          print(func.__name__)
+          value = func(x)
+          grads = gradients_impl.gradients(value, [scale])
+          with self.test_session(graph=g):
+            variables.global_variables_initializer().run()
+            self.assertAllFinite(value)
+            self.assertAllFinite(grads[0])
+
+  def testHalfNormalEntropy(self):
+    with self.test_session():
+      scale = np.array([[1.0, 2.0, 3.0]])
+      halfnorm = hn_lib.HalfNormal(scale=scale)
+
+      # See https://en.wikipedia.org/wiki/Half-normal_distribution for the
+      # entropy formula used here.
+      expected_entropy = 0.5 * np.log(np.pi * scale ** 2.0 / 2.0) + 0.5
+
+      entropy = halfnorm.entropy()
+      self._testBatchShapes(halfnorm, entropy)
+      self.assertAllClose(expected_entropy, entropy.eval())
+
+  def testHalfNormalMeanAndMode(self):
+    with self.test_session():
+      scale = np.array([11., 12., 13.])
+
+      halfnorm = hn_lib.HalfNormal(scale=scale)
+      expected_mean = scale * np.sqrt(2.0) / np.sqrt(np.pi)
+
+      self.assertAllEqual((3,), halfnorm.mean().eval().shape)
+      self.assertAllEqual(expected_mean, halfnorm.mean().eval())
+
+      self.assertAllEqual((3,), halfnorm.mode().eval().shape)
+      self.assertAllEqual([0., 0., 0.], halfnorm.mode().eval())
+
+  def testHalfNormalVariance(self):
+    with self.test_session():
+      scale = np.array([7., 7., 7.])
+      halfnorm = hn_lib.HalfNormal(scale=scale)
+      expected_variance = scale ** 2.0 * (1.0 - 2.0 / np.pi)
+
+      self.assertAllEqual((3,), halfnorm.variance().eval().shape)
+      self.assertAllEqual(expected_variance, halfnorm.variance().eval())
+
+  def testHalfNormalStandardDeviation(self):
+    with self.test_session():
+      scale = np.array([7., 7., 7.])
+      halfnorm = hn_lib.HalfNormal(scale=scale)
+      expected_variance = scale ** 2.0 * (1.0 - 2.0 / np.pi)
+
+      self.assertAllEqual((3,), halfnorm.stddev().shape)
+      self.assertAllEqual(np.sqrt(expected_variance), halfnorm.stddev().eval())
+
+  def testHalfNormalSample(self):
+    with self.test_session():
+      scale = constant_op.constant(3.0)
+      n = constant_op.constant(100000)
+      halfnorm = hn_lib.HalfNormal(scale=scale)
+
+      sample = halfnorm.sample(n)
+
+      self.assertEqual(sample.eval().shape, (100000,))
+      self.assertAllClose(sample.eval().mean(),
+                          3.0 * np.sqrt(2.0) / np.sqrt(np.pi), atol=1e-1)
+
+      expected_shape = tensor_shape.TensorShape([n.eval()]).concatenate(
+          tensor_shape.TensorShape(halfnorm.batch_shape_tensor().eval()))
+      self.assertAllEqual(expected_shape, sample.shape)
+      self.assertAllEqual(expected_shape, sample.eval().shape)
+
+      expected_shape_static = (tensor_shape.TensorShape(
+          [n.eval()]).concatenate(halfnorm.batch_shape))
+      self.assertAllEqual(expected_shape_static, sample.shape)
+      self.assertAllEqual(expected_shape_static, sample.eval().shape)
+
+  def testHalfNormalSampleMultiDimensional(self):
+    with self.test_session():
+      batch_size = 2
+      scale = constant_op.constant([[2.0, 3.0]] * batch_size)
+      n = constant_op.constant(100000)
+      halfnorm = hn_lib.HalfNormal(scale=scale)
+
+      sample = halfnorm.sample(n)
+      self.assertEqual(sample.shape, (100000, batch_size, 2))
+      self.assertAllClose(sample.eval()[:, 0, 0].mean(),
+                          2.0 * np.sqrt(2.0) / np.sqrt(np.pi), atol=1e-1)
+      self.assertAllClose(sample.eval()[:, 0, 1].mean(),
+                          3.0 * np.sqrt(2.0) / np.sqrt(np.pi), atol=1e-1)
+
+      expected_shape = tensor_shape.TensorShape([n.eval()]).concatenate(
+          tensor_shape.TensorShape(halfnorm.batch_shape_tensor().eval()))
+      self.assertAllEqual(expected_shape, sample.shape)
+      self.assertAllEqual(expected_shape, sample.eval().shape)
+
+      expected_shape_static = (tensor_shape.TensorShape(
+          [n.eval()]).concatenate(halfnorm.batch_shape))
+      self.assertAllEqual(expected_shape_static, sample.shape)
+      self.assertAllEqual(expected_shape_static, sample.eval().shape)
+
+  def testNegativeSigmaFails(self):
+    with self.test_session():
+      halfnorm = hn_lib.HalfNormal(scale=[-5.], validate_args=True, name="G")
+      with self.assertRaisesOpError("Condition x > 0 did not hold"):
+        halfnorm.mean().eval()
+
+  def testHalfNormalShape(self):
+    with self.test_session():
+      scale = constant_op.constant([6.0] * 5)
+      halfnorm = hn_lib.HalfNormal(scale=scale)
+
+      self.assertEqual(halfnorm.batch_shape_tensor().eval(), [5])
+      self.assertEqual(halfnorm.batch_shape, tensor_shape.TensorShape([5]))
+      self.assertAllEqual(halfnorm.event_shape_tensor().eval(), [])
+      self.assertEqual(halfnorm.event_shape, tensor_shape.TensorShape([]))
+
+  def testHalfNormalShapeWithPlaceholders(self):
+    scale = array_ops.placeholder(dtype=dtypes.float32)
+    halfnorm = hn_lib.HalfNormal(scale=scale)
+
+    with self.test_session() as sess:
+      # get_batch_shape should return an "<unknown>" tensor.
+      self.assertEqual(halfnorm.batch_shape, tensor_shape.TensorShape(None))
+      self.assertEqual(halfnorm.event_shape, ())
+      self.assertAllEqual(halfnorm.event_shape_tensor().eval(), [])
+      self.assertAllEqual(
+          sess.run(halfnorm.batch_shape_tensor(),
+                   feed_dict={scale: [1.0, 2.0]}), [2])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mixture_same_family_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mixture_same_family_test.py
index ece6bc077d9e21502fdfd01300a9d3e9f2c9c380..ff6092fc260660b512e8123823c63e98a023af6d 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mixture_same_family_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mixture_same_family_test.py
@@ -45,6 +45,17 @@ class MixtureSameFamilyTest(test_util.VectorDistributionTestHelpers,
       self.assertEqual([4, 5], x.shape)
       self.assertEqual([4, 5], log_prob_x.shape)
 
+  def testSampleAndLogProbBatch(self):
+    with self.test_session():
+      gm = mixture_same_family_lib.MixtureSameFamily(
+          mixture_distribution=categorical_lib.Categorical(probs=[[0.3, 0.7]]),
+          components_distribution=normal_lib.Normal(
+              loc=[[-1., 1]], scale=[[0.1, 0.5]]))
+      x = gm.sample([4, 5], seed=42)
+      log_prob_x = gm.log_prob(x)
+      self.assertEqual([4, 5, 1], x.shape)
+      self.assertEqual([4, 5, 1], log_prob_x.shape)
+
   def testSampleAndLogProbShapesBroadcastMix(self):
     mix_probs = np.float32([.3, .7])
     bern_probs = np.float32([[.4, .6], [.25, .75]])
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/poisson_lognormal_test.py b/tensorflow/contrib/distributions/python/kernel_tests/poisson_lognormal_test.py
index 3c0147b8cf6e1b6a2791e85c0c0997992445fa7e..1035cb00f76d95c7c52c3e812e8bb2868d34b890 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/poisson_lognormal_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/poisson_lognormal_test.py
@@ -18,37 +18,40 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
 from tensorflow.contrib.distributions.python.ops import poisson_lognormal
 from tensorflow.contrib.distributions.python.ops import test_util
-from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-class PoissonLogNormalQuadratureCompoundTest(
-    test_util.DiscreteScalarDistributionTestHelpers, test.TestCase):
+class _PoissonLogNormalQuadratureCompoundTest(
+    test_util.DiscreteScalarDistributionTestHelpers):
   """Tests the PoissonLogNormalQuadratureCompoundTest distribution."""
 
   def testSampleProbConsistent(self):
     with self.test_session() as sess:
       pln = poisson_lognormal.PoissonLogNormalQuadratureCompound(
-          loc=-2.,
-          scale=1.1,
-          quadrature_grid_and_probs=(
-              np.polynomial.hermite.hermgauss(deg=10)),
+          loc=array_ops.placeholder_with_default(
+              -2.,
+              shape=[] if self.static_shape else None),
+          scale=array_ops.placeholder_with_default(
+              1.1,
+              shape=[] if self.static_shape else None),
+          quadrature_size=10,
           validate_args=True)
       self.run_test_sample_consistent_log_prob(
-          sess.run, pln, rtol=0.1)
+          sess.run, pln, batch_size=1, rtol=0.1)
 
   def testMeanVariance(self):
     with self.test_session() as sess:
       pln = poisson_lognormal.PoissonLogNormalQuadratureCompound(
-          loc=0.,
-          scale=1.,
-          quadrature_grid_and_probs=(
-              np.polynomial.hermite.hermgauss(deg=10)),
+          loc=array_ops.placeholder_with_default(
+              0.,
+              shape=[] if self.static_shape else None),
+          scale=array_ops.placeholder_with_default(
+              1.,
+              shape=[] if self.static_shape else None),
+          quadrature_size=10,
           validate_args=True)
       self.run_test_sample_consistent_mean_variance(
           sess.run, pln, rtol=0.02)
@@ -56,21 +59,27 @@ class PoissonLogNormalQuadratureCompoundTest(
   def testSampleProbConsistentBroadcastScalar(self):
     with self.test_session() as sess:
       pln = poisson_lognormal.PoissonLogNormalQuadratureCompound(
-          loc=[0., -0.5],
-          scale=1.,
-          quadrature_grid_and_probs=(
-              np.polynomial.hermite.hermgauss(deg=10)),
+          loc=array_ops.placeholder_with_default(
+              [0., -0.5],
+              shape=[2] if self.static_shape else None),
+          scale=array_ops.placeholder_with_default(
+              1.,
+              shape=[] if self.static_shape else None),
+          quadrature_size=10,
           validate_args=True)
       self.run_test_sample_consistent_log_prob(
-          sess.run, pln, rtol=0.1, atol=0.01)
+          sess.run, pln, batch_size=2, rtol=0.1, atol=0.01)
 
   def testMeanVarianceBroadcastScalar(self):
     with self.test_session() as sess:
       pln = poisson_lognormal.PoissonLogNormalQuadratureCompound(
-          loc=[0., -0.5],
-          scale=1.,
-          quadrature_grid_and_probs=(
-              np.polynomial.hermite.hermgauss(deg=10)),
+          loc=array_ops.placeholder_with_default(
+              [0., -0.5],
+              shape=[2] if self.static_shape else None),
+          scale=array_ops.placeholder_with_default(
+              1.,
+              shape=[] if self.static_shape else None),
+          quadrature_size=10,
           validate_args=True)
       self.run_test_sample_consistent_mean_variance(
           sess.run, pln, rtol=0.1, atol=0.01)
@@ -78,38 +87,46 @@ class PoissonLogNormalQuadratureCompoundTest(
   def testSampleProbConsistentBroadcastBoth(self):
     with self.test_session() as sess:
       pln = poisson_lognormal.PoissonLogNormalQuadratureCompound(
-          loc=[[0.], [-0.5]],
-          scale=[[1., 0.9]],
-          quadrature_grid_and_probs=(
-              np.polynomial.hermite.hermgauss(deg=10)),
+          loc=array_ops.placeholder_with_default(
+              [[0.], [-0.5]],
+              shape=[2, 1] if self.static_shape else None),
+          scale=array_ops.placeholder_with_default(
+              [[1., 0.9]],
+              shape=[1, 2] if self.static_shape else None),
+          quadrature_size=10,
           validate_args=True)
       self.run_test_sample_consistent_log_prob(
-          sess.run, pln, rtol=0.1, atol=0.08)
+          sess.run, pln, batch_size=4, rtol=0.1, atol=0.08)
 
   def testMeanVarianceBroadcastBoth(self):
     with self.test_session() as sess:
       pln = poisson_lognormal.PoissonLogNormalQuadratureCompound(
-          loc=[[0.], [-0.5]],
-          scale=[[1., 0.9]],
-          quadrature_grid_and_probs=(
-              np.polynomial.hermite.hermgauss(deg=10)),
+          loc=array_ops.placeholder_with_default(
+              [[0.], [-0.5]],
+              shape=[2, 1] if self.static_shape else None),
+          scale=array_ops.placeholder_with_default(
+              [[1., 0.9]],
+              shape=[1, 2] if self.static_shape else None),
+          quadrature_size=10,
           validate_args=True)
       self.run_test_sample_consistent_mean_variance(
           sess.run, pln, rtol=0.1, atol=0.01)
 
-  def testSampleProbConsistentDynamicQuadrature(self):
-    with self.test_session() as sess:
-      qgrid = array_ops.placeholder(dtype=dtypes.float32)
-      qprobs = array_ops.placeholder(dtype=dtypes.float32)
-      g, p = np.polynomial.hermite.hermgauss(deg=10)
-      pln = poisson_lognormal.PoissonLogNormalQuadratureCompound(
-          loc=-2.,
-          scale=1.1,
-          quadrature_grid_and_probs=(g, p),
-          validate_args=True)
-      self.run_test_sample_consistent_log_prob(
-          lambda x: sess.run(x, feed_dict={qgrid: g, qprobs: p}),
-          pln, rtol=0.1)
+
+class PoissonLogNormalQuadratureCompoundStaticShapeTest(
+    _PoissonLogNormalQuadratureCompoundTest, test.TestCase):
+
+  @property
+  def static_shape(self):
+    return True
+
+
+class PoissonLogNormalQuadratureCompoundDynamicShapeTest(
+    _PoissonLogNormalQuadratureCompoundTest, test.TestCase):
+
+  @property
+  def static_shape(self):
+    return False
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py b/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py
index 595d9f5df755d7defa63d385039bafe4f87aa6ec..4186cf129dbf31724c84133734da3f226817c71a 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py
@@ -23,11 +23,244 @@ import numpy as np
 from tensorflow.contrib.distributions.python.ops import sample_stats
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import spectral_ops_test_util
 from tensorflow.python.platform import test
 
 rng = np.random.RandomState(0)
 
 
+class _AutoCorrelationTest(object):
+
+  @property
+  def use_static_shape(self):
+    raise NotImplementedError("Subclass failed to implement `use_static_shape`")
+
+  @property
+  def dtype(self):
+    raise NotImplementedError("Subclass failed to implement `dtype`.")
+
+  def test_constant_sequence_axis_0_max_lags_none_center_false(self):
+    x_ = np.array([[0., 0., 0.],
+                   [1., 1., 1.]]).astype(self.dtype)
+    x_ph = array_ops.placeholder_with_default(
+        input=x_,
+        shape=x_.shape if self.use_static_shape else None)
+    with spectral_ops_test_util.fft_kernel_label_map():
+      with self.test_session() as sess:
+        # Setting normalize = True means we divide by zero.
+        auto_corr = sample_stats.auto_correlation(
+            x_ph, axis=1, center=False, normalize=False)
+        if self.use_static_shape:
+          self.assertEqual((2, 3), auto_corr.shape)
+        auto_corr_ = sess.run(auto_corr)
+        self.assertAllClose(
+            [[0., 0., 0.],
+             [1., 1., 1.]], auto_corr_)
+
+  def test_constant_sequence_axis_0_max_lags_none_center_true(self):
+    x_ = np.array([[0., 0., 0.],
+                   [1., 1., 1.]]).astype(self.dtype)
+    x_ph = array_ops.placeholder_with_default(
+        input=x_,
+        shape=x_.shape if self.use_static_shape else None)
+    with spectral_ops_test_util.fft_kernel_label_map():
+      with self.test_session() as sess:
+        # Setting normalize = True means we divide by zero.
+        auto_corr = sample_stats.auto_correlation(
+            x_ph, axis=1, normalize=False, center=True)
+        if self.use_static_shape:
+          self.assertEqual((2, 3), auto_corr.shape)
+        auto_corr_ = sess.run(auto_corr)
+        self.assertAllClose(
+            [[0., 0., 0.],
+             [0., 0., 0.]], auto_corr_)
+
+  def check_results_versus_brute_force(
+      self, x, axis, max_lags, center, normalize):
+    """Compute auto-correlation by brute force, then compare to tf result."""
+    # Brute for auto-corr -- avoiding fft and transpositions.
+    axis_len = x.shape[axis]
+    if max_lags is None:
+      max_lags = axis_len - 1
+    else:
+      max_lags = min(axis_len - 1, max_lags)
+    auto_corr_at_lag = []
+    if center:
+      x -= x.mean(axis=axis, keepdims=True)
+    for m in range(max_lags + 1):
+      auto_corr_at_lag.append((
+          np.take(x, indices=range(0, axis_len - m), axis=axis) *
+          np.conj(np.take(x, indices=range(m, axis_len), axis=axis))
+      ).mean(axis=axis, keepdims=True))
+    rxx = np.concatenate(auto_corr_at_lag, axis=axis)
+    if normalize:
+      rxx /= np.take(rxx, [0], axis=axis)
+
+    x_ph = array_ops.placeholder_with_default(
+        x, shape=x.shape if self.use_static_shape else None)
+    with spectral_ops_test_util.fft_kernel_label_map():
+      with self.test_session():
+        auto_corr = sample_stats.auto_correlation(
+            x_ph, axis=axis, max_lags=max_lags, center=center,
+            normalize=normalize)
+        if self.use_static_shape:
+          output_shape = list(x.shape)
+          output_shape[axis] = max_lags + 1
+          self.assertAllEqual(output_shape, auto_corr.shape)
+        self.assertAllClose(rxx, auto_corr.eval(), rtol=1e-5, atol=1e-5)
+
+  def test_axis_n1_center_false_max_lags_none(self):
+    x = rng.randn(2, 3, 4).astype(self.dtype)
+    if self.dtype in [np.complex64]:
+      x = 1j * rng.randn(2, 3, 4).astype(self.dtype)
+    self.check_results_versus_brute_force(
+        x, axis=-1, max_lags=None, center=False, normalize=False)
+
+  def test_axis_n2_center_false_max_lags_none(self):
+    x = rng.randn(3, 4, 5).astype(self.dtype)
+    if self.dtype in [np.complex64]:
+      x = 1j * rng.randn(3, 4, 5).astype(self.dtype)
+    self.check_results_versus_brute_force(
+        x, axis=-2, max_lags=None, center=False, normalize=False)
+
+  def test_axis_n1_center_false_max_lags_none_normalize_true(self):
+    x = rng.randn(2, 3, 4).astype(self.dtype)
+    if self.dtype in [np.complex64]:
+      x = 1j * rng.randn(2, 3, 4).astype(self.dtype)
+    self.check_results_versus_brute_force(
+        x, axis=-1, max_lags=None, center=False, normalize=True)
+
+  def test_axis_n2_center_false_max_lags_none_normalize_true(self):
+    x = rng.randn(3, 4, 5).astype(self.dtype)
+    if self.dtype in [np.complex64]:
+      x = 1j * rng.randn(3, 4, 5).astype(self.dtype)
+    self.check_results_versus_brute_force(
+        x, axis=-2, max_lags=None, center=False, normalize=True)
+
+  def test_axis_0_center_true_max_lags_none(self):
+    x = rng.randn(3, 4, 5).astype(self.dtype)
+    if self.dtype in [np.complex64]:
+      x = 1j * rng.randn(3, 4, 5).astype(self.dtype)
+    self.check_results_versus_brute_force(
+        x, axis=0, max_lags=None, center=True, normalize=False)
+
+  def test_axis_2_center_true_max_lags_1(self):
+    x = rng.randn(3, 4, 5).astype(self.dtype)
+    if self.dtype in [np.complex64]:
+      x = 1j * rng.randn(3, 4, 5).astype(self.dtype)
+    self.check_results_versus_brute_force(
+        x, axis=2, max_lags=1, center=True, normalize=False)
+
+  def test_axis_2_center_true_max_lags_100(self):
+    # There are less than 100 elements in axis 2, so expect we get back an array
+    # the same size as x, despite having asked for 100 lags.
+    x = rng.randn(3, 4, 5).astype(self.dtype)
+    if self.dtype in [np.complex64]:
+      x = 1j * rng.randn(3, 4, 5).astype(self.dtype)
+    self.check_results_versus_brute_force(
+        x, axis=2, max_lags=100, center=True, normalize=False)
+
+  def test_long_orthonormal_sequence_has_corr_length_0(self):
+    l = 10000
+    x = rng.randn(l).astype(self.dtype)
+    x_ph = array_ops.placeholder_with_default(
+        x, shape=(l,) if self.use_static_shape else None)
+    with spectral_ops_test_util.fft_kernel_label_map():
+      with self.test_session():
+        rxx = sample_stats.auto_correlation(
+            x_ph, max_lags=l // 2, center=True, normalize=False)
+        if self.use_static_shape:
+          self.assertAllEqual((l // 2 + 1,), rxx.shape)
+        rxx_ = rxx.eval()
+        # OSS CPU FFT has some accuracy issues is not the most accurate.
+        # So this tolerance is a bit bad.
+        self.assertAllClose(1., rxx_[0], rtol=0.05)
+        # The maximal error in the rest of the sequence is not great.
+        self.assertAllClose(np.zeros(l // 2), rxx_[1:], atol=0.1)
+        # The mean error in the rest is ok, actually 0.008 when I tested it.
+        self.assertLess(np.abs(rxx_[1:]).mean(), 0.02)
+
+  def test_step_function_sequence(self):
+    # x jumps to new random value every 10 steps.  So correlation length = 10.
+    x = (rng.randint(-10, 10, size=(1000, 1))
+         * np.ones((1, 10))).ravel().astype(self.dtype)
+    x_ph = array_ops.placeholder_with_default(
+        x, shape=(1000 * 10,) if self.use_static_shape else None)
+    with spectral_ops_test_util.fft_kernel_label_map():
+      with self.test_session():
+        rxx = sample_stats.auto_correlation(
+            x_ph, max_lags=1000 * 10 // 2, center=True, normalize=False)
+        if self.use_static_shape:
+          self.assertAllEqual((1000 * 10 // 2 + 1,), rxx.shape)
+        rxx_ = rxx.eval()
+        rxx_ /= rxx_[0]
+        # Expect positive correlation for the first 10 lags, then significantly
+        # smaller negative.
+        self.assertGreater(rxx_[:10].min(), 0)
+        self.assertGreater(rxx_[9], 5 * rxx_[10:20].mean())
+        # RXX should be decreasing for the first 10 lags.
+        diff = np.diff(rxx_)
+        self.assertLess(diff[:10].max(), 0)
+
+  def test_normalization(self):
+    l = 10000
+    x = 3 * rng.randn(l).astype(self.dtype)
+    x_ph = array_ops.placeholder_with_default(
+        x, shape=(l,) if self.use_static_shape else None)
+    with spectral_ops_test_util.fft_kernel_label_map():
+      with self.test_session():
+        rxx = sample_stats.auto_correlation(
+            x_ph, max_lags=l // 2, center=True, normalize=True)
+        if self.use_static_shape:
+          self.assertAllEqual((l // 2 + 1,), rxx.shape)
+        rxx_ = rxx.eval()
+        # Note that RXX[0] = 1, despite the fact that E[X^2] = 9, and this is
+        # due to normalize=True.
+        # OSS CPU FFT has some accuracy issues is not the most accurate.
+        # So this tolerance is a bit bad.
+        self.assertAllClose(1., rxx_[0], rtol=0.05)
+        # The maximal error in the rest of the sequence is not great.
+        self.assertAllClose(np.zeros(l // 2), rxx_[1:], atol=0.1)
+        # The mean error in the rest is ok, actually 0.008 when I tested it.
+        self.assertLess(np.abs(rxx_[1:]).mean(), 0.02)
+
+
+class AutoCorrelationTestStaticShapeFloat32(test.TestCase,
+                                            _AutoCorrelationTest):
+
+  @property
+  def dtype(self):
+    return np.float32
+
+  @property
+  def use_static_shape(self):
+    return True
+
+
+class AutoCorrelationTestStaticShapeComplex64(test.TestCase,
+                                              _AutoCorrelationTest):
+
+  @property
+  def dtype(self):
+    return np.complex64
+
+  @property
+  def use_static_shape(self):
+    return True
+
+
+class AutoCorrelationTestDynamicShapeFloat32(test.TestCase,
+                                             _AutoCorrelationTest):
+
+  @property
+  def dtype(self):
+    return np.float32
+
+  @property
+  def use_static_shape(self):
+    return False
+
+
 class PercentileTestWithLowerInterpolation(test.TestCase):
 
   _interpolation = "lower"
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py
index 103d8e186221e879d1734a097114708429f725bd..cbaf74d3f66253ae5727e1ba579e2d49235b748e 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py
@@ -200,6 +200,27 @@ class TransformedDistributionTest(test.TestCase):
       self.assertAllEqual([2], multi_logit_normal.event_shape)
       self.assertAllEqual([2], multi_logit_normal.event_shape_tensor().eval())
 
+  def testCastLogDetJacobian(self):
+    """Test log_prob when Jacobian and log_prob dtypes do not match."""
+
+    with self.test_session():
+      # Create an identity bijector whose jacobians have dtype int32
+      int_identity = bs.Inline(
+          forward_fn=array_ops.identity,
+          inverse_fn=array_ops.identity,
+          inverse_log_det_jacobian_fn=lambda x: math_ops.cast(0, dtypes.int32),
+          forward_log_det_jacobian_fn=lambda x: math_ops.cast(0, dtypes.int32),
+          is_constant_jacobian=True)
+      normal = self._cls()(
+          distribution=ds.Normal(loc=0., scale=1.),
+          bijector=int_identity,
+          validate_args=True)
+
+      y = normal.sample()
+      normal.log_prob(y).eval()
+      normal.prob(y).eval()
+      normal.entropy().eval()
+
   def testEntropy(self):
     with self.test_session():
       shift = np.array([[-1, 0, 1], [-1, -2, -3]], dtype=np.float32)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/vector_diffeomixture_test.py b/tensorflow/contrib/distributions/python/kernel_tests/vector_diffeomixture_test.py
index de4a221f7badca8267a81d612a57137c676ff052..d292b04665e34196670ee4f1c1655f805e04e06a 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/vector_diffeomixture_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/vector_diffeomixture_test.py
@@ -21,9 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.distributions.python.ops import test_util
-from tensorflow.contrib.distributions.python.ops import vector_diffeomixture as vector_diffeomixture_lib
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import array_ops
+from tensorflow.contrib.distributions.python.ops import vector_diffeomixture as vdm_lib
 from tensorflow.python.ops.distributions import normal as normal_lib
 from tensorflow.python.ops.linalg import linear_operator_diag as linop_diag_lib
 from tensorflow.python.ops.linalg import linear_operator_identity as linop_identity_lib
@@ -37,7 +35,7 @@ class VectorDiffeomixtureTest(
   def testSampleProbConsistentBroadcastMixNoBatch(self):
     with self.test_session() as sess:
       dims = 4
-      vdm = vector_diffeomixture_lib.VectorDiffeomixture(
+      vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[[0.], [1.]],
           mix_scale=[1.],
           distribution=normal_lib.Normal(0., 1.),
@@ -54,18 +52,19 @@ class VectorDiffeomixtureTest(
                   diag=np.linspace(2.5, 3.5, dims, dtype=np.float32),
                   is_positive_definite=True),
           ],
+          quadrature_size=8,
           validate_args=True)
       # Ball centered at component0's mean.
       self.run_test_sample_consistent_log_prob(
-          sess.run, vdm, radius=2., center=0., rtol=0.005)
+          sess.run, vdm, radius=2., center=0., rtol=0.015)
       # Larger ball centered at component1's mean.
       self.run_test_sample_consistent_log_prob(
-          sess.run, vdm, radius=4., center=2., rtol=0.005)
+          sess.run, vdm, radius=4., center=2., rtol=0.015)
 
   def testSampleProbConsistentBroadcastMixNonStandardBase(self):
     with self.test_session() as sess:
       dims = 4
-      vdm = vector_diffeomixture_lib.VectorDiffeomixture(
+      vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[[0.], [1.]],
           mix_scale=[1.],
           distribution=normal_lib.Normal(1., 1.5),
@@ -82,18 +81,19 @@ class VectorDiffeomixtureTest(
                   diag=np.linspace(2.5, 3.5, dims, dtype=np.float32),
                   is_positive_definite=True),
           ],
+          quadrature_size=8,
           validate_args=True)
       # Ball centered at component0's mean.
       self.run_test_sample_consistent_log_prob(
-          sess.run, vdm, radius=2., center=1., rtol=0.006)
+          sess.run, vdm, radius=2., center=1., rtol=0.015)
       # Larger ball centered at component1's mean.
       self.run_test_sample_consistent_log_prob(
-          sess.run, vdm, radius=4., center=3., rtol=0.009)
+          sess.run, vdm, radius=4., center=3., rtol=0.01)
 
   def testSampleProbConsistentBroadcastMixBatch(self):
     with self.test_session() as sess:
       dims = 4
-      vdm = vector_diffeomixture_lib.VectorDiffeomixture(
+      vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[[0.], [1.]],
           mix_scale=[1.],
           distribution=normal_lib.Normal(0., 1.),
@@ -113,18 +113,19 @@ class VectorDiffeomixtureTest(
                   ]),
                   is_positive_definite=True),
           ],
+          quadrature_size=8,
           validate_args=True)
       # Ball centered at component0's mean.
       self.run_test_sample_consistent_log_prob(
-          sess.run, vdm, radius=2., center=0., rtol=0.005)
+          sess.run, vdm, radius=2., center=0., rtol=0.01)
       # Larger ball centered at component1's mean.
       self.run_test_sample_consistent_log_prob(
-          sess.run, vdm, radius=4., center=2., rtol=0.005)
+          sess.run, vdm, radius=4., center=2., rtol=0.01)
 
   def testMeanCovarianceNoBatch(self):
     with self.test_session() as sess:
       dims = 3
-      vdm = vector_diffeomixture_lib.VectorDiffeomixture(
+      vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[[0.], [4.]],
           mix_scale=[10.],
           distribution=normal_lib.Normal(0., 1.),
@@ -141,14 +142,15 @@ class VectorDiffeomixtureTest(
                   diag=np.linspace(2.5, 3.5, dims, dtype=np.float32),
                   is_positive_definite=True),
           ],
+          quadrature_size=8,
           validate_args=True)
       self.run_test_sample_consistent_mean_covariance(
-          sess.run, vdm, rtol=0.02, cov_rtol=0.06)
+          sess.run, vdm, rtol=0.02, cov_rtol=0.08)
 
   def testMeanCovarianceNoBatchUncenteredNonStandardBase(self):
     with self.test_session() as sess:
       dims = 3
-      vdm = vector_diffeomixture_lib.VectorDiffeomixture(
+      vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[[0.], [4.]],
           mix_scale=[10.],
           distribution=normal_lib.Normal(-1., 1.5),
@@ -165,6 +167,7 @@ class VectorDiffeomixtureTest(
                   diag=np.linspace(2.5, 3.5, dims, dtype=np.float32),
                   is_positive_definite=True),
           ],
+          quadrature_size=8,
           validate_args=True)
       self.run_test_sample_consistent_mean_covariance(
           sess.run, vdm, num_samples=int(1e6), rtol=0.01, cov_atol=0.025)
@@ -172,7 +175,7 @@ class VectorDiffeomixtureTest(
   def testMeanCovarianceBatch(self):
     with self.test_session() as sess:
       dims = 3
-      vdm = vector_diffeomixture_lib.VectorDiffeomixture(
+      vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[[0.], [4.]],
           mix_scale=[10.],
           distribution=normal_lib.Normal(0., 1.),
@@ -192,18 +195,16 @@ class VectorDiffeomixtureTest(
                   ]),
                   is_positive_definite=True),
           ],
+          quadrature_size=8,
           validate_args=True)
       self.run_test_sample_consistent_mean_covariance(
-          sess.run, vdm, rtol=0.02, cov_rtol=0.06)
+          sess.run, vdm, rtol=0.02, cov_rtol=0.07)
 
-  def testSampleProbConsistentDynamicQuadrature(self):
+  def testSampleProbConsistentQuadrature(self):
     with self.test_session() as sess:
-      qgrid = array_ops.placeholder(dtype=dtypes.float32)
-      qprobs = array_ops.placeholder(dtype=dtypes.float32)
-      g, p = np.polynomial.hermite.hermgauss(deg=8)
       dims = 4
-      vdm = vector_diffeomixture_lib.VectorDiffeomixture(
-          mix_loc=[[0.], [1.]],
+      vdm = vdm_lib.VectorDiffeomixture(
+          mix_loc=[0.],
           mix_scale=[1.],
           distribution=normal_lib.Normal(0., 1.),
           loc=[
@@ -219,15 +220,14 @@ class VectorDiffeomixtureTest(
                   diag=np.linspace(2.5, 3.5, dims, dtype=np.float32),
                   is_positive_definite=True),
           ],
-          quadrature_grid_and_probs=(g, p),
+          quadrature_size=3,
           validate_args=True)
       # Ball centered at component0's mean.
-      sess_run_fn = lambda x: sess.run(x, feed_dict={qgrid: g, qprobs: p})
       self.run_test_sample_consistent_log_prob(
-          sess_run_fn, vdm, radius=2., center=0., rtol=0.005)
+          sess.run, vdm, radius=2., center=0., rtol=0.015)
       # Larger ball centered at component1's mean.
       self.run_test_sample_consistent_log_prob(
-          sess_run_fn, vdm, radius=4., center=2., rtol=0.005)
+          sess.run, vdm, radius=4., center=2., rtol=0.005)
 
   # TODO(jvdillon): We've tested that (i) .sample and .log_prob are consistent,
   # (ii) .mean, .stddev etc... and .sample are consistent. However, we haven't
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value.py b/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value.py
index 6049419818e18c54209f0be95d41fcecf6627b7e..0fe9f6aa78fbe845b99d0668f075b0162ec2a9f7 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value.py
@@ -18,12 +18,117 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.absolute_value_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
 
-_allowed_symbols = ["AbsoluteValue"]
+__all__ = [
+    "AbsoluteValue",
+]
 
-remove_undocumented(__name__, _allowed_symbols)
+
+class AbsoluteValue(bijector.Bijector):
+  """Computes `Y = g(X) = Abs(X)`, element-wise.
+
+  This non-injective bijector allows for transformations of scalar distributions
+  with the absolute value function, which maps `(-inf, inf)` to `[0, inf)`.
+
+  * For `y in (0, inf)`, `AbsoluteValue.inverse(y)` returns the set inverse
+    `{x in (-inf, inf) : |x| = y}` as a tuple, `-y, y`.
+  * `AbsoluteValue.inverse(0)` returns `0, 0`, which is not the set inverse
+    (the set inverse is the singleton `{0}`), but "works" in conjunction with
+    `TransformedDistribution` to produce a left semi-continuous pdf.
+  * For `y < 0`, `AbsoluteValue.inverse(y)` happily returns the
+    wrong thing, `-y, y`.  This is done for efficiency.  If
+    `validate_args == True`, `y < 0` will raise an exception.
+
+
+  ```python
+  tfd = tf.contrib.distributions
+
+  abs = tfd.bijectors.AbsoluteValue()
+
+  abs.forward([-1., 0., 1.])
+  ==> [1., 0.,  1.]
+
+  abs.inverse(1.)
+  ==> [-1., 1.]
+
+  # The |dX/dY| is constant, == 1.  So Log|dX/dY| == 0.
+  abs.inverse_log_det_jacobian(1.)
+  ==> [0., 0.]
+
+  # Special case handling of 0.
+  abs.inverse(0.)
+  ==> [0., 0.]
+
+  abs.inverse_log_det_jacobian(0.)
+  ==> [0., 0.]
+  ```
+
+  """
+
+  def __init__(self, event_ndims=0, validate_args=False, name="absolute_value"):
+    """Instantiates the `AbsoluteValue` bijector.
+
+    Args:
+      event_ndims: Python scalar indicating the number of dimensions associated
+        with a particular draw from the distribution.  Currently only zero is
+        supported.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness, in particular whether inputs to `inverse` and
+        `inverse_log_det_jacobian` are non-negative.
+      name: Python `str` name given to ops managed by this object.
+
+    Raises:
+      ValueError:  If `event_ndims` is not zero.
+    """
+    self._graph_parents = []
+    self._name = name
+
+    event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
+    event_ndims_const = tensor_util.constant_value(event_ndims)
+    if event_ndims_const is not None and event_ndims_const not in (0,):
+      raise ValueError("event_ndims(%s) was not 0" % event_ndims_const)
+    else:
+      if validate_args:
+        event_ndims = control_flow_ops.with_dependencies(
+            [check_ops.assert_equal(
+                event_ndims, 0, message="event_ndims was not 0")],
+            event_ndims)
+
+    with self._name_scope("init"):
+      super(AbsoluteValue, self).__init__(
+          event_ndims=event_ndims,
+          validate_args=validate_args,
+          name=name)
+
+  def _forward(self, x):
+    return math_ops.abs(x)
+
+  def _inverse(self, y):
+    if self.validate_args:
+      y = control_flow_ops.with_dependencies(
+          [check_ops.assert_non_negative(y, message="Argument y was negative")],
+          y)
+    return -y, y
+
+  def _inverse_log_det_jacobian(self, y):
+    # If event_ndims = 2,
+    # F^{-1}(y) = (-y, y), so DF^{-1}(y) = (-1, 1),
+    # so Log|DF^{-1}(y)| = Log[1, 1] = [0, 0].
+    batch_shape = array_ops.shape(y)[:array_ops.rank(y) - self.event_ndims]
+    zeros = array_ops.zeros(batch_shape, dtype=y.dtype)
+    if self.validate_args:
+      zeros = control_flow_ops.with_dependencies(
+          [check_ops.assert_non_negative(y, message="Argument y was negative")],
+          zeros)
+    return zeros, zeros
+
+  @property
+  def _is_injective(self):
+    return False
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value_impl.py
deleted file mode 100644
index b84502003ab6c0c4ffdda21eea162f441509e1fa..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value_impl.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""AbsoluteValue bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector
-
-__all__ = [
-    "AbsoluteValue",
-]
-
-
-class AbsoluteValue(bijector.Bijector):
-  """Computes `Y = g(X) = Abs(X)`, element-wise.
-
-  This non-injective bijector allows for transformations of scalar distributions
-  with the absolute value function, which maps `(-inf, inf)` to `[0, inf)`.
-
-  * For `y in (0, inf)`, `AbsoluteValue.inverse(y)` returns the set inverse
-    `{x in (-inf, inf) : |x| = y}` as a tuple, `-y, y`.
-  * `AbsoluteValue.inverse(0)` returns `0, 0`, which is not the set inverse
-    (the set inverse is the singleton `{0}`), but "works" in conjunction with
-    `TransformedDistribution` to produce a left semi-continuous pdf.
-  * For `y < 0`, `AbsoluteValue.inverse(y)` happily returns the
-    wrong thing, `-y, y`.  This is done for efficiency.  If
-    `validate_args == True`, `y < 0` will raise an exception.
-
-
-  ```python
-  abs = ds.bijectors.AbsoluteValue()
-
-  abs.forward([-1., 0., 1.])
-  ==> [1., 0.,  1.]
-
-  abs.inverse(1.)
-  ==> [-1., 1.]
-
-  # The |dX/dY| is constant, == 1.  So Log|dX/dY| == 0.
-  abs.inverse_log_det_jacobian(1.)
-  ==> [0., 0.]
-
-  # Special case handling of 0.
-  abs.inverse(0.)
-  ==> [0., 0.]
-
-  abs.inverse_log_det_jacobian(0.)
-  ==> [0., 0.]
-  ```
-
-  """
-
-  def __init__(self, event_ndims=0, validate_args=False, name="absolute_value"):
-    """Instantiates the `AbsoluteValue` bijector.
-
-    Args:
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.  Currently only zero is
-        supported.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness, in particular whether inputs to `inverse` and
-        `inverse_log_det_jacobian` are non-negative.
-      name: Python `str` name given to ops managed by this object.
-
-    Raises:
-      ValueError:  If `event_ndims` is not zero.
-    """
-    self._graph_parents = []
-    self._name = name
-
-    event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-    event_ndims_const = tensor_util.constant_value(event_ndims)
-    if event_ndims_const is not None and event_ndims_const not in (0,):
-      raise ValueError("event_ndims(%s) was not 0" % event_ndims_const)
-    else:
-      if validate_args:
-        event_ndims = control_flow_ops.with_dependencies(
-            [check_ops.assert_equal(
-                event_ndims, 0, message="event_ndims was not 0")],
-            event_ndims)
-
-    with self._name_scope("init"):
-      super(AbsoluteValue, self).__init__(
-          event_ndims=event_ndims,
-          validate_args=validate_args,
-          name=name)
-
-  def _forward(self, x):
-    return math_ops.abs(x)
-
-  def _inverse(self, y):
-    if self.validate_args:
-      y = control_flow_ops.with_dependencies(
-          [check_ops.assert_non_negative(y, message="Argument y was negative")],
-          y)
-    return -y, y
-
-  def _inverse_log_det_jacobian(self, y):
-    # If event_ndims = 2,
-    # F^{-1}(y) = (-y, y), so DF^{-1}(y) = (-1, 1),
-    # so Log|DF^{-1}(y)| = Log[1, 1] = [0, 0].
-    batch_shape = array_ops.shape(y)[:array_ops.rank(y) - self.event_ndims]
-    zeros = array_ops.zeros(batch_shape, dtype=y.dtype)
-    if self.validate_args:
-      zeros = control_flow_ops.with_dependencies(
-          [check_ops.assert_non_negative(y, message="Argument y was negative")],
-          zeros)
-    return zeros, zeros
-
-  @property
-  def _is_injective(self):
-    return False
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine.py
index 940cceff04e77cfc2f7caae5a798d135f7601b95..05bb9c2f9bdf35e222c94db3491157893da64ebd 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine.py
@@ -18,12 +18,386 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.affine_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.contrib import linalg
+from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.contrib.distributions.python.ops.shape import _DistributionShape
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
 
-_allowed_symbols = ["Affine"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "Affine",
+]
+
+
+def _as_tensor(x, name):
+  """Convenience to convert to `Tensor` or leave as `None`."""
+  return None if x is None else ops.convert_to_tensor(x, name=name)
+
+
+class Affine(bijector.Bijector):
+  """Compute `Y = g(X; shift, scale) = scale @ X + shift`.
+
+  Here `scale = c * I + diag(D1) + tril(L) + V @ diag(D2) @ V.T`.
+
+  In TF parlance, the `scale` term is logically equivalent to:
+
+  ```python
+  scale = (
+    scale_identity_multiplier * tf.diag(tf.ones(d)) +
+    tf.diag(scale_diag) +
+    scale_tril +
+    scale_perturb_factor @ diag(scale_perturb_diag) @
+      tf.transpose([scale_perturb_factor])
+  )
+  ```
+
+  The `scale` term is applied without necessarily materializing constituent
+  matrices, i.e., the matmul is [matrix-free](
+  https://en.wikipedia.org/wiki/Matrix-free_methods) when possible.
+
+  Examples:
+
+  ```python
+  # Y = X
+  b = Affine()
+
+  # Y = X + shift
+  b = Affine(shift=[1., 2, 3])
+
+  # Y = 2 * I @ X.T + shift
+  b = Affine(shift=[1., 2, 3],
+             scale_identity_multiplier=2.)
+
+  # Y = tf.diag(d1) @ X.T + shift
+  b = Affine(shift=[1., 2, 3],
+             scale_diag=[-1., 2, 1])         # Implicitly 3x3.
+
+  # Y = (I + v * v.T) @ X.T + shift
+  b = Affine(shift=[1., 2, 3],
+             scale_perturb_factor=[[1., 0],
+                                   [0, 1],
+                                   [1, 1]])
+
+  # Y = (diag(d1) + v * diag(d2) * v.T) @ X.T + shift
+  b = Affine(shift=[1., 2, 3],
+             scale_diag=[1., 3, 3],          # Implicitly 3x3.
+             scale_perturb_diag=[2., 1],     # Implicitly 2x2.
+             scale_perturb_factor=[[1., 0],
+                                   [0, 1],
+                                   [1, 1]])
+
+  ```
+
+  """
+
+  def __init__(self,
+               shift=None,
+               scale_identity_multiplier=None,
+               scale_diag=None,
+               scale_tril=None,
+               scale_perturb_factor=None,
+               scale_perturb_diag=None,
+               event_ndims=1,
+               validate_args=False,
+               name="affine"):
+    """Instantiates the `Affine` bijector.
+
+    This `Bijector` is initialized with `shift` `Tensor` and `scale` arguments,
+    giving the forward operation:
+
+    ```none
+    Y = g(X) = scale @ X + shift
+    ```
+
+    where the `scale` term is logically equivalent to:
+
+    ```python
+    scale = (
+      scale_identity_multiplier * tf.diag(tf.ones(d)) +
+      tf.diag(scale_diag) +
+      scale_tril +
+      scale_perturb_factor @ diag(scale_perturb_diag) @
+        tf.transpose([scale_perturb_factor])
+    )
+    ```
+
+    If none of `scale_identity_multiplier`, `scale_diag`, or `scale_tril` are
+    specified then `scale += IdentityMatrix`. Otherwise specifying a
+    `scale` argument has the semantics of `scale += Expand(arg)`, i.e.,
+    `scale_diag != None` means `scale += tf.diag(scale_diag)`.
+
+    Args:
+      shift: Floating-point `Tensor`. If this is set to `None`, no shift is
+        applied.
+      scale_identity_multiplier: floating point rank 0 `Tensor` representing a
+        scaling done to the identity matrix.
+        When `scale_identity_multiplier = scale_diag = scale_tril = None` then
+        `scale += IdentityMatrix`. Otherwise no scaled-identity-matrix is added
+        to `scale`.
+      scale_diag: Floating-point `Tensor` representing the diagonal matrix.
+        `scale_diag` has shape [N1, N2, ...  k], which represents a k x k
+        diagonal matrix.
+        When `None` no diagonal term is added to `scale`.
+      scale_tril: Floating-point `Tensor` representing the diagonal matrix.
+        `scale_diag` has shape [N1, N2, ...  k, k], which represents a k x k
+        lower triangular matrix.
+        When `None` no `scale_tril` term is added to `scale`.
+        The upper triangular elements above the diagonal are ignored.
+      scale_perturb_factor: Floating-point `Tensor` representing factor matrix
+        with last two dimensions of shape `(k, r)`. When `None`, no rank-r
+        update is added to `scale`.
+      scale_perturb_diag: Floating-point `Tensor` representing the diagonal
+        matrix. `scale_perturb_diag` has shape [N1, N2, ...  r], which
+        represents an `r x r` diagonal matrix. When `None` low rank updates will
+        take the form `scale_perturb_factor * scale_perturb_factor.T`.
+      event_ndims: Scalar `int` `Tensor` indicating the number of dimensions
+        associated with a particular draw from the distribution. Must be 0 or 1.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+
+    Raises:
+      ValueError: if `perturb_diag` is specified but not `perturb_factor`.
+      TypeError: if `shift` has different `dtype` from `scale` arguments.
+    """
+    self._graph_parents = []
+    self._name = name
+    self._validate_args = validate_args
+
+    # Ambiguous definition of low rank update.
+    if scale_perturb_diag is not None and scale_perturb_factor is None:
+      raise ValueError("When scale_perturb_diag is specified, "
+                       "scale_perturb_factor must be specified.")
+
+    # Special case, only handling a scaled identity matrix. We don't know its
+    # dimensions, so this is special cased.
+    # We don't check identity_multiplier, since below we set it to 1. if all
+    # other scale args are None.
+    self._is_only_identity_multiplier = (scale_tril is None and
+                                         scale_diag is None and
+                                         scale_perturb_factor is None)
+
+    with self._name_scope("init", values=[
+        shift, scale_identity_multiplier, scale_diag, scale_tril,
+        scale_perturb_diag, scale_perturb_factor]):
+      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
+      event_ndims_const = tensor_util.constant_value(event_ndims)
+      if event_ndims_const is not None and event_ndims_const not in (0, 1):
+        raise ValueError("event_ndims(%s) was not 0 or 1" % event_ndims_const)
+      else:
+        if validate_args:
+          # Shape tool will catch if event_ndims is negative.
+          event_ndims = control_flow_ops.with_dependencies(
+              [check_ops.assert_less(
+                  event_ndims, 2, message="event_ndims must be 0 or 1")],
+              event_ndims)
+
+      if event_ndims_const == 0 and not self._is_only_identity_multiplier:
+        raise ValueError(
+            "If event_ndims == 0, the only scale argument you can pass is "
+            "scale_identity_multiplier.  All others operate on vectors.")
+
+      # In the absence of `loc` and `scale`, we'll assume `dtype` is `float32`.
+      dtype = dtypes.float32
+
+      if shift is not None:
+        shift = ops.convert_to_tensor(shift, name="shift")
+        dtype = shift.dtype.base_dtype
+      self._shift = shift
+
+      # When no args are specified, pretend the scale matrix is the identity
+      # matrix.
+      if (self._is_only_identity_multiplier and
+          scale_identity_multiplier is None):
+        scale_identity_multiplier = ops.convert_to_tensor(1., dtype=dtype)
+
+      # self._create_scale_operator returns a LinearOperator in all cases
+      # except if self._is_only_identity_multiplier; in which case it
+      # returns a scalar Tensor.
+      scale = self._create_scale_operator(
+          identity_multiplier=scale_identity_multiplier,
+          diag=scale_diag,
+          tril=scale_tril,
+          perturb_diag=scale_perturb_diag,
+          perturb_factor=scale_perturb_factor,
+          shift=shift,
+          validate_args=validate_args)
+
+      if scale.dtype is not None:
+        dtype = scale.dtype.base_dtype
+
+      if scale is not None and not self._is_only_identity_multiplier:
+        if (shift is not None and
+            shift.dtype.base_dtype != scale.dtype.base_dtype):
+          raise TypeError(
+              "shift.dtype({}) is incompatible with scale.dtype({}).".format(
+                  shift.dtype, scale.dtype))
+
+        if scale.tensor_rank is not None:
+          batch_ndims = scale.tensor_rank - 2
+        else:
+          batch_ndims = scale.tensor_rank_tensor() - 2
+      else:
+        # We won't need shape inference when scale is None or when scale is a
+        # scalar.
+        batch_ndims = 0
+      self._scale = scale
+      self._shaper = _DistributionShape(
+          batch_ndims=batch_ndims,
+          event_ndims=event_ndims,
+          validate_args=validate_args)
+      super(Affine, self).__init__(
+          event_ndims=event_ndims,
+          graph_parents=(
+              [event_ndims] +
+              [self._scale] if tensor_util.is_tensor(self._scale)
+              else self._scale.graph_parents +
+              [self._shift] if self._shift is not None else []),
+          is_constant_jacobian=True,
+          dtype=dtype,
+          validate_args=validate_args,
+          name=name)
+
+  def _create_scale_operator(self, identity_multiplier, diag, tril,
+                             perturb_diag, perturb_factor, shift,
+                             validate_args):
+    """Construct `scale` from various components.
+
+    Args:
+      identity_multiplier: floating point rank 0 `Tensor` representing a scaling
+        done to the identity matrix.
+      diag: Floating-point `Tensor` representing the diagonal matrix.
+        `scale_diag` has shape [N1, N2, ...  k], which represents a k x k
+        diagonal matrix.
+      tril: Floating-point `Tensor` representing the diagonal matrix.
+        `scale_tril` has shape [N1, N2, ...  k], which represents a k x k lower
+        triangular matrix.
+      perturb_diag: Floating-point `Tensor` representing the diagonal matrix of
+        the low rank update.
+      perturb_factor: Floating-point `Tensor` representing factor matrix.
+      shift: Floating-point `Tensor` representing `shift in `scale @ X + shift`.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+
+    Returns:
+      scale. In the case of scaling by a constant, scale is a
+      floating point `Tensor`. Otherwise, scale is a `LinearOperator`.
+
+    Raises:
+      ValueError: if all of `tril`, `diag` and `identity_multiplier` are `None`.
+    """
+    identity_multiplier = _as_tensor(identity_multiplier, "identity_multiplier")
+    diag = _as_tensor(diag, "diag")
+    tril = _as_tensor(tril, "tril")
+    perturb_diag = _as_tensor(perturb_diag, "perturb_diag")
+    perturb_factor = _as_tensor(perturb_factor, "perturb_factor")
+
+    # If possible, use the low rank update to infer the shape of
+    # the identity matrix, when scale represents a scaled identity matrix
+    # with a low rank update.
+    shape_hint = None
+    if perturb_factor is not None:
+      shape_hint = distribution_util.dimension_size(perturb_factor, axis=-2)
+
+    if self._is_only_identity_multiplier:
+      if validate_args:
+        return control_flow_ops.with_dependencies(
+            [check_ops.assert_none_equal(
+                identity_multiplier,
+                array_ops.zeros([], identity_multiplier.dtype),
+                ["identity_multiplier should be non-zero."])],
+            identity_multiplier)
+      return identity_multiplier
+
+    scale = distribution_util.make_tril_scale(
+        loc=shift,
+        scale_tril=tril,
+        scale_diag=diag,
+        scale_identity_multiplier=identity_multiplier,
+        validate_args=validate_args,
+        assert_positive=False,
+        shape_hint=shape_hint)
+
+    if perturb_factor is not None:
+      return linalg.LinearOperatorLowRankUpdate(
+          scale,
+          u=perturb_factor,
+          diag_update=perturb_diag,
+          is_diag_update_positive=perturb_diag is None,
+          is_non_singular=True,  # Implied by is_positive_definite=True.
+          is_self_adjoint=True,
+          is_positive_definite=True,
+          is_square=True)
+
+    return scale
+
+  @property
+  def shift(self):
+    """The `shift` `Tensor` in `Y = scale @ X + shift`."""
+    return self._shift
+
+  @property
+  def scale(self):
+    """The `scale` `LinearOperator` in `Y = scale @ X + shift`."""
+    return self._scale
+
+  def _forward(self, x):
+    y = x
+    if self._is_only_identity_multiplier:
+      y *= self._scale
+      if self.shift is not None:
+        return y + self.shift
+      return y
+    y, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
+        y, expand_batch_dim=False)
+    with ops.control_dependencies(self._maybe_check_scale() if
+                                  self.validate_args else []):
+      y = self.scale.matmul(y)
+    y = self._shaper.undo_make_batch_of_event_sample_matrices(
+        y, sample_shape, expand_batch_dim=False)
+    if self.shift is not None:
+      y += self.shift
+    return y
+
+  def _inverse(self, y):
+    x = y
+    if self.shift is not None:
+      x -= self.shift
+    if self._is_only_identity_multiplier:
+      return x / self._scale
+
+    x, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
+        x, expand_batch_dim=False)
+    # Solve fails if the op is singular so we may safely skip this assertion.
+    x = self.scale.solve(x)
+    x = self._shaper.undo_make_batch_of_event_sample_matrices(
+        x, sample_shape, expand_batch_dim=False)
+    return x
+
+  def _inverse_log_det_jacobian(self, y):
+    return -self._forward_log_det_jacobian(y)
+
+  def _forward_log_det_jacobian(self, x):
+    if self._is_only_identity_multiplier:
+      # We don't pad in this case and instead let the fldj be applied
+      # via broadcast.
+      event_size = distribution_util.pick_vector(
+          math_ops.equal(self._shaper.event_ndims, 0),
+          [1], array_ops.shape(x))[-1]
+      event_size = math_ops.cast(event_size, dtype=self._scale.dtype)
+      return math_ops.log(math_ops.abs(self._scale)) * event_size
+    return self.scale.log_abs_determinant()
+
+  def _maybe_check_scale(self):
+    try:
+      return [self.scale.assert_non_singular()]
+    except NotImplementedError:
+      pass
+    return []
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py
deleted file mode 100644
index 05bb9c2f9bdf35e222c94db3491157893da64ebd..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py
+++ /dev/null
@@ -1,403 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Affine bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib import linalg
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops.shape import _DistributionShape
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector
-
-
-__all__ = [
-    "Affine",
-]
-
-
-def _as_tensor(x, name):
-  """Convenience to convert to `Tensor` or leave as `None`."""
-  return None if x is None else ops.convert_to_tensor(x, name=name)
-
-
-class Affine(bijector.Bijector):
-  """Compute `Y = g(X; shift, scale) = scale @ X + shift`.
-
-  Here `scale = c * I + diag(D1) + tril(L) + V @ diag(D2) @ V.T`.
-
-  In TF parlance, the `scale` term is logically equivalent to:
-
-  ```python
-  scale = (
-    scale_identity_multiplier * tf.diag(tf.ones(d)) +
-    tf.diag(scale_diag) +
-    scale_tril +
-    scale_perturb_factor @ diag(scale_perturb_diag) @
-      tf.transpose([scale_perturb_factor])
-  )
-  ```
-
-  The `scale` term is applied without necessarily materializing constituent
-  matrices, i.e., the matmul is [matrix-free](
-  https://en.wikipedia.org/wiki/Matrix-free_methods) when possible.
-
-  Examples:
-
-  ```python
-  # Y = X
-  b = Affine()
-
-  # Y = X + shift
-  b = Affine(shift=[1., 2, 3])
-
-  # Y = 2 * I @ X.T + shift
-  b = Affine(shift=[1., 2, 3],
-             scale_identity_multiplier=2.)
-
-  # Y = tf.diag(d1) @ X.T + shift
-  b = Affine(shift=[1., 2, 3],
-             scale_diag=[-1., 2, 1])         # Implicitly 3x3.
-
-  # Y = (I + v * v.T) @ X.T + shift
-  b = Affine(shift=[1., 2, 3],
-             scale_perturb_factor=[[1., 0],
-                                   [0, 1],
-                                   [1, 1]])
-
-  # Y = (diag(d1) + v * diag(d2) * v.T) @ X.T + shift
-  b = Affine(shift=[1., 2, 3],
-             scale_diag=[1., 3, 3],          # Implicitly 3x3.
-             scale_perturb_diag=[2., 1],     # Implicitly 2x2.
-             scale_perturb_factor=[[1., 0],
-                                   [0, 1],
-                                   [1, 1]])
-
-  ```
-
-  """
-
-  def __init__(self,
-               shift=None,
-               scale_identity_multiplier=None,
-               scale_diag=None,
-               scale_tril=None,
-               scale_perturb_factor=None,
-               scale_perturb_diag=None,
-               event_ndims=1,
-               validate_args=False,
-               name="affine"):
-    """Instantiates the `Affine` bijector.
-
-    This `Bijector` is initialized with `shift` `Tensor` and `scale` arguments,
-    giving the forward operation:
-
-    ```none
-    Y = g(X) = scale @ X + shift
-    ```
-
-    where the `scale` term is logically equivalent to:
-
-    ```python
-    scale = (
-      scale_identity_multiplier * tf.diag(tf.ones(d)) +
-      tf.diag(scale_diag) +
-      scale_tril +
-      scale_perturb_factor @ diag(scale_perturb_diag) @
-        tf.transpose([scale_perturb_factor])
-    )
-    ```
-
-    If none of `scale_identity_multiplier`, `scale_diag`, or `scale_tril` are
-    specified then `scale += IdentityMatrix`. Otherwise specifying a
-    `scale` argument has the semantics of `scale += Expand(arg)`, i.e.,
-    `scale_diag != None` means `scale += tf.diag(scale_diag)`.
-
-    Args:
-      shift: Floating-point `Tensor`. If this is set to `None`, no shift is
-        applied.
-      scale_identity_multiplier: floating point rank 0 `Tensor` representing a
-        scaling done to the identity matrix.
-        When `scale_identity_multiplier = scale_diag = scale_tril = None` then
-        `scale += IdentityMatrix`. Otherwise no scaled-identity-matrix is added
-        to `scale`.
-      scale_diag: Floating-point `Tensor` representing the diagonal matrix.
-        `scale_diag` has shape [N1, N2, ...  k], which represents a k x k
-        diagonal matrix.
-        When `None` no diagonal term is added to `scale`.
-      scale_tril: Floating-point `Tensor` representing the diagonal matrix.
-        `scale_diag` has shape [N1, N2, ...  k, k], which represents a k x k
-        lower triangular matrix.
-        When `None` no `scale_tril` term is added to `scale`.
-        The upper triangular elements above the diagonal are ignored.
-      scale_perturb_factor: Floating-point `Tensor` representing factor matrix
-        with last two dimensions of shape `(k, r)`. When `None`, no rank-r
-        update is added to `scale`.
-      scale_perturb_diag: Floating-point `Tensor` representing the diagonal
-        matrix. `scale_perturb_diag` has shape [N1, N2, ...  r], which
-        represents an `r x r` diagonal matrix. When `None` low rank updates will
-        take the form `scale_perturb_factor * scale_perturb_factor.T`.
-      event_ndims: Scalar `int` `Tensor` indicating the number of dimensions
-        associated with a particular draw from the distribution. Must be 0 or 1.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str` name given to ops managed by this object.
-
-    Raises:
-      ValueError: if `perturb_diag` is specified but not `perturb_factor`.
-      TypeError: if `shift` has different `dtype` from `scale` arguments.
-    """
-    self._graph_parents = []
-    self._name = name
-    self._validate_args = validate_args
-
-    # Ambiguous definition of low rank update.
-    if scale_perturb_diag is not None and scale_perturb_factor is None:
-      raise ValueError("When scale_perturb_diag is specified, "
-                       "scale_perturb_factor must be specified.")
-
-    # Special case, only handling a scaled identity matrix. We don't know its
-    # dimensions, so this is special cased.
-    # We don't check identity_multiplier, since below we set it to 1. if all
-    # other scale args are None.
-    self._is_only_identity_multiplier = (scale_tril is None and
-                                         scale_diag is None and
-                                         scale_perturb_factor is None)
-
-    with self._name_scope("init", values=[
-        shift, scale_identity_multiplier, scale_diag, scale_tril,
-        scale_perturb_diag, scale_perturb_factor]):
-      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-      event_ndims_const = tensor_util.constant_value(event_ndims)
-      if event_ndims_const is not None and event_ndims_const not in (0, 1):
-        raise ValueError("event_ndims(%s) was not 0 or 1" % event_ndims_const)
-      else:
-        if validate_args:
-          # Shape tool will catch if event_ndims is negative.
-          event_ndims = control_flow_ops.with_dependencies(
-              [check_ops.assert_less(
-                  event_ndims, 2, message="event_ndims must be 0 or 1")],
-              event_ndims)
-
-      if event_ndims_const == 0 and not self._is_only_identity_multiplier:
-        raise ValueError(
-            "If event_ndims == 0, the only scale argument you can pass is "
-            "scale_identity_multiplier.  All others operate on vectors.")
-
-      # In the absence of `loc` and `scale`, we'll assume `dtype` is `float32`.
-      dtype = dtypes.float32
-
-      if shift is not None:
-        shift = ops.convert_to_tensor(shift, name="shift")
-        dtype = shift.dtype.base_dtype
-      self._shift = shift
-
-      # When no args are specified, pretend the scale matrix is the identity
-      # matrix.
-      if (self._is_only_identity_multiplier and
-          scale_identity_multiplier is None):
-        scale_identity_multiplier = ops.convert_to_tensor(1., dtype=dtype)
-
-      # self._create_scale_operator returns a LinearOperator in all cases
-      # except if self._is_only_identity_multiplier; in which case it
-      # returns a scalar Tensor.
-      scale = self._create_scale_operator(
-          identity_multiplier=scale_identity_multiplier,
-          diag=scale_diag,
-          tril=scale_tril,
-          perturb_diag=scale_perturb_diag,
-          perturb_factor=scale_perturb_factor,
-          shift=shift,
-          validate_args=validate_args)
-
-      if scale.dtype is not None:
-        dtype = scale.dtype.base_dtype
-
-      if scale is not None and not self._is_only_identity_multiplier:
-        if (shift is not None and
-            shift.dtype.base_dtype != scale.dtype.base_dtype):
-          raise TypeError(
-              "shift.dtype({}) is incompatible with scale.dtype({}).".format(
-                  shift.dtype, scale.dtype))
-
-        if scale.tensor_rank is not None:
-          batch_ndims = scale.tensor_rank - 2
-        else:
-          batch_ndims = scale.tensor_rank_tensor() - 2
-      else:
-        # We won't need shape inference when scale is None or when scale is a
-        # scalar.
-        batch_ndims = 0
-      self._scale = scale
-      self._shaper = _DistributionShape(
-          batch_ndims=batch_ndims,
-          event_ndims=event_ndims,
-          validate_args=validate_args)
-      super(Affine, self).__init__(
-          event_ndims=event_ndims,
-          graph_parents=(
-              [event_ndims] +
-              [self._scale] if tensor_util.is_tensor(self._scale)
-              else self._scale.graph_parents +
-              [self._shift] if self._shift is not None else []),
-          is_constant_jacobian=True,
-          dtype=dtype,
-          validate_args=validate_args,
-          name=name)
-
-  def _create_scale_operator(self, identity_multiplier, diag, tril,
-                             perturb_diag, perturb_factor, shift,
-                             validate_args):
-    """Construct `scale` from various components.
-
-    Args:
-      identity_multiplier: floating point rank 0 `Tensor` representing a scaling
-        done to the identity matrix.
-      diag: Floating-point `Tensor` representing the diagonal matrix.
-        `scale_diag` has shape [N1, N2, ...  k], which represents a k x k
-        diagonal matrix.
-      tril: Floating-point `Tensor` representing the diagonal matrix.
-        `scale_tril` has shape [N1, N2, ...  k], which represents a k x k lower
-        triangular matrix.
-      perturb_diag: Floating-point `Tensor` representing the diagonal matrix of
-        the low rank update.
-      perturb_factor: Floating-point `Tensor` representing factor matrix.
-      shift: Floating-point `Tensor` representing `shift in `scale @ X + shift`.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-
-    Returns:
-      scale. In the case of scaling by a constant, scale is a
-      floating point `Tensor`. Otherwise, scale is a `LinearOperator`.
-
-    Raises:
-      ValueError: if all of `tril`, `diag` and `identity_multiplier` are `None`.
-    """
-    identity_multiplier = _as_tensor(identity_multiplier, "identity_multiplier")
-    diag = _as_tensor(diag, "diag")
-    tril = _as_tensor(tril, "tril")
-    perturb_diag = _as_tensor(perturb_diag, "perturb_diag")
-    perturb_factor = _as_tensor(perturb_factor, "perturb_factor")
-
-    # If possible, use the low rank update to infer the shape of
-    # the identity matrix, when scale represents a scaled identity matrix
-    # with a low rank update.
-    shape_hint = None
-    if perturb_factor is not None:
-      shape_hint = distribution_util.dimension_size(perturb_factor, axis=-2)
-
-    if self._is_only_identity_multiplier:
-      if validate_args:
-        return control_flow_ops.with_dependencies(
-            [check_ops.assert_none_equal(
-                identity_multiplier,
-                array_ops.zeros([], identity_multiplier.dtype),
-                ["identity_multiplier should be non-zero."])],
-            identity_multiplier)
-      return identity_multiplier
-
-    scale = distribution_util.make_tril_scale(
-        loc=shift,
-        scale_tril=tril,
-        scale_diag=diag,
-        scale_identity_multiplier=identity_multiplier,
-        validate_args=validate_args,
-        assert_positive=False,
-        shape_hint=shape_hint)
-
-    if perturb_factor is not None:
-      return linalg.LinearOperatorLowRankUpdate(
-          scale,
-          u=perturb_factor,
-          diag_update=perturb_diag,
-          is_diag_update_positive=perturb_diag is None,
-          is_non_singular=True,  # Implied by is_positive_definite=True.
-          is_self_adjoint=True,
-          is_positive_definite=True,
-          is_square=True)
-
-    return scale
-
-  @property
-  def shift(self):
-    """The `shift` `Tensor` in `Y = scale @ X + shift`."""
-    return self._shift
-
-  @property
-  def scale(self):
-    """The `scale` `LinearOperator` in `Y = scale @ X + shift`."""
-    return self._scale
-
-  def _forward(self, x):
-    y = x
-    if self._is_only_identity_multiplier:
-      y *= self._scale
-      if self.shift is not None:
-        return y + self.shift
-      return y
-    y, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
-        y, expand_batch_dim=False)
-    with ops.control_dependencies(self._maybe_check_scale() if
-                                  self.validate_args else []):
-      y = self.scale.matmul(y)
-    y = self._shaper.undo_make_batch_of_event_sample_matrices(
-        y, sample_shape, expand_batch_dim=False)
-    if self.shift is not None:
-      y += self.shift
-    return y
-
-  def _inverse(self, y):
-    x = y
-    if self.shift is not None:
-      x -= self.shift
-    if self._is_only_identity_multiplier:
-      return x / self._scale
-
-    x, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
-        x, expand_batch_dim=False)
-    # Solve fails if the op is singular so we may safely skip this assertion.
-    x = self.scale.solve(x)
-    x = self._shaper.undo_make_batch_of_event_sample_matrices(
-        x, sample_shape, expand_batch_dim=False)
-    return x
-
-  def _inverse_log_det_jacobian(self, y):
-    return -self._forward_log_det_jacobian(y)
-
-  def _forward_log_det_jacobian(self, x):
-    if self._is_only_identity_multiplier:
-      # We don't pad in this case and instead let the fldj be applied
-      # via broadcast.
-      event_size = distribution_util.pick_vector(
-          math_ops.equal(self._shaper.event_ndims, 0),
-          [1], array_ops.shape(x))[-1]
-      event_size = math_ops.cast(event_size, dtype=self._scale.dtype)
-      return math_ops.log(math_ops.abs(self._scale)) * event_size
-    return self.scale.log_abs_determinant()
-
-  def _maybe_check_scale(self):
-    try:
-      return [self.scale.assert_non_singular()]
-    except NotImplementedError:
-      pass
-    return []
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py
index aca04a89df7c3ee09d5f7cc10f6779e33fa7aa66..89043b1410370074f11f2cfa59b6b6663fa62521 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py
@@ -18,12 +18,214 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.affine_linear_operator_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.contrib.distributions.python.ops.shape import _DistributionShape
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops.distributions import bijector
+from tensorflow.python.ops.linalg import linear_operator
 
-_allowed_symbols = ["AffineLinearOperator"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "AffineLinearOperator",
+]
+
+
+class AffineLinearOperator(bijector.Bijector):
+  """Compute `Y = g(X; shift, scale) = scale @ X + shift`.
+
+  `shift` is a numeric `Tensor` and `scale` is a `LinearOperator`.
+
+  If `X` is a scalar then the forward transformation is: `scale * X + shift`
+  where `*` denotes the scalar product.
+
+  Note: we don't always simply transpose `X` (but write it this way for
+  brevity). Actually the input `X` undergoes the following transformation
+  before being premultiplied by `scale`:
+
+  1. If there are no sample dims, we call `X = tf.expand_dims(X, 0)`, i.e.,
+     `new_sample_shape = [1]`. Otherwise do nothing.
+  2. The sample shape is flattened to have one dimension, i.e.,
+     `new_sample_shape = [n]` where `n = tf.reduce_prod(old_sample_shape)`.
+  3. The sample dim is cyclically rotated left by 1, i.e.,
+     `new_shape = [B1,...,Bb, k, n]` where `n` is as above, `k` is the
+     event_shape, and `B1,...,Bb` are the batch shapes for each of `b` batch
+     dimensions.
+
+  (For more details see `shape.make_batch_of_event_sample_matrices`.)
+
+  The result of the above transformation is that `X` can be regarded as a batch
+  of matrices where each column is a draw from the distribution. After
+  premultiplying by `scale`, we take the inverse of this procedure. The input
+  `Y` also undergoes the same transformation before/after premultiplying by
+  `inv(scale)`.
+
+  Example Use:
+
+  ```python
+  linalg = tf.linalg
+
+  x = [1., 2, 3]
+
+  shift = [-1., 0., 1]
+  diag = [1., 2, 3]
+  scale = linalg.LinearOperatorDiag(diag)
+  affine = AffineLinearOperator(shift, scale)
+  # In this case, `forward` is equivalent to:
+  # y = scale @ x + shift
+  y = affine.forward(x)  # [0., 4, 10]
+
+  shift = [2., 3, 1]
+  tril = [[1., 0, 0],
+          [2, 1, 0],
+          [3, 2, 1]]
+  scale = linalg.LinearOperatorLowerTriangular(tril)
+  affine = AffineLinearOperator(shift, scale)
+  # In this case, `forward` is equivalent to:
+  # np.squeeze(np.matmul(tril, np.expand_dims(x, -1)), -1) + shift
+  y = affine.forward(x)  # [3., 7, 11]
+  ```
+
+  """
+
+  def __init__(self,
+               shift=None,
+               scale=None,
+               event_ndims=1,
+               validate_args=False,
+               name="affine_linear_operator"):
+    """Instantiates the `AffineLinearOperator` bijector.
+
+    Args:
+      shift: Floating-point `Tensor`.
+      scale:  Subclass of `LinearOperator`. Represents the (batch) positive
+        definite matrix `M` in `R^{k x k}`.
+      event_ndims: Scalar `integer` `Tensor` indicating the number of dimensions
+        associated with a particular draw from the distribution. Must be 0 or 1.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+
+    Raises:
+      ValueError: if `event_ndims` is not 0 or 1.
+      TypeError: if `scale` is not a `LinearOperator`.
+      TypeError: if `shift.dtype` does not match `scale.dtype`.
+      ValueError: if not `scale.is_non_singular`.
+    """
+    self._graph_parents = []
+    self._name = name
+    self._validate_args = validate_args
+    graph_parents = []
+    with self._name_scope("init", values=[shift]):
+      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
+      if tensor_util.constant_value(event_ndims) is not None:
+        event_ndims = tensor_util.constant_value(event_ndims)
+        if event_ndims not in (0, 1):
+          raise ValueError("event_ndims({}) was not 0 or 1".format(event_ndims))
+      else:
+        if validate_args:
+          # Shape tool will catch if event_ndims is negative.
+          event_ndims = control_flow_ops.with_dependencies(
+              [check_ops.assert_less(
+                  event_ndims, 2, message="event_ndims must be 0 or 1")],
+              event_ndims)
+        graph_parents += [event_ndims]
+
+      # In the absence of `loc` and `scale`, we'll assume `dtype` is `float32`.
+      dtype = dtypes.float32
+
+      if shift is not None:
+        shift = ops.convert_to_tensor(shift, name="shift")
+        graph_parents += [shift]
+        dtype = shift.dtype.base_dtype
+      self._shift = shift
+
+      if scale is not None:
+        if (shift is not None and
+            shift.dtype.base_dtype != scale.dtype.base_dtype):
+          raise TypeError(
+              "shift.dtype({}) is incompatible with scale.dtype({}).".format(
+                  shift.dtype, scale.dtype))
+        if not isinstance(scale, linear_operator.LinearOperator):
+          raise TypeError("scale is not an instance of tf.LinearOperator")
+        if validate_args and not scale.is_non_singular:
+          raise ValueError("Scale matrix must be non-singular.")
+        graph_parents += scale.graph_parents
+        if scale.tensor_rank is not None:
+          batch_ndims = scale.tensor_rank - 2
+        else:
+          batch_ndims = scale.tensor_rank_tensor() - 2
+          graph_parents += [batch_ndims]
+        if scale.dtype is not None:
+          dtype = scale.dtype.base_dtype
+      else:
+        batch_ndims = 0  # We won't need shape inference when scale is None.
+      self._scale = scale
+      self._shaper = _DistributionShape(
+          batch_ndims=batch_ndims,
+          event_ndims=event_ndims,
+          validate_args=validate_args)
+      super(AffineLinearOperator, self).__init__(
+          event_ndims=event_ndims,
+          graph_parents=graph_parents,
+          is_constant_jacobian=True,
+          dtype=dtype,
+          validate_args=validate_args,
+          name=name)
+
+  @property
+  def shift(self):
+    """The `shift` `Tensor` in `Y = scale @ X + shift`."""
+    return self._shift
+
+  @property
+  def scale(self):
+    """The `scale` `LinearOperator` in `Y = scale @ X + shift`."""
+    return self._scale
+
+  def _forward(self, x):
+    y = x
+    if self.scale is not None:
+      y, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
+          y, expand_batch_dim=False)
+      with ops.control_dependencies(self._maybe_collect_assertions() if
+                                    self.validate_args else []):
+        y = self.scale.matmul(y)
+      y = self._shaper.undo_make_batch_of_event_sample_matrices(
+          y, sample_shape, expand_batch_dim=False)
+    if self.shift is not None:
+      y += self.shift
+    return y
+
+  def _inverse(self, y):
+    x = y
+    if self.shift is not None:
+      x -= self.shift
+    if self.scale is not None:
+      x, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
+          x, expand_batch_dim=False)
+      # Solve fails if the op is singular so we may safely skip this assertion.
+      x = self.scale.solve(x)
+      x = self._shaper.undo_make_batch_of_event_sample_matrices(
+          x, sample_shape, expand_batch_dim=False)
+    return x
+
+  def _inverse_log_det_jacobian(self, y):
+    return -self._forward_log_det_jacobian(y)
+
+  def _forward_log_det_jacobian(self, x):  # pylint: disable=unused-argument
+    if self.scale is None:
+      return constant_op.constant(0, dtype=x.dtype.base_dtype)
+    with ops.control_dependencies(self._maybe_collect_assertions() if
+                                  self.validate_args else []):
+      return self.scale.log_abs_determinant()
+
+  def _maybe_collect_assertions(self):
+    try:
+      return [self.scale.assert_non_singular()]
+    except NotImplementedError:
+      pass
+    return []
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator_impl.py
deleted file mode 100644
index 89043b1410370074f11f2cfa59b6b6663fa62521..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator_impl.py
+++ /dev/null
@@ -1,231 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""AffineLinearOperator bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.distributions.python.ops.shape import _DistributionShape
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops.distributions import bijector
-from tensorflow.python.ops.linalg import linear_operator
-
-
-__all__ = [
-    "AffineLinearOperator",
-]
-
-
-class AffineLinearOperator(bijector.Bijector):
-  """Compute `Y = g(X; shift, scale) = scale @ X + shift`.
-
-  `shift` is a numeric `Tensor` and `scale` is a `LinearOperator`.
-
-  If `X` is a scalar then the forward transformation is: `scale * X + shift`
-  where `*` denotes the scalar product.
-
-  Note: we don't always simply transpose `X` (but write it this way for
-  brevity). Actually the input `X` undergoes the following transformation
-  before being premultiplied by `scale`:
-
-  1. If there are no sample dims, we call `X = tf.expand_dims(X, 0)`, i.e.,
-     `new_sample_shape = [1]`. Otherwise do nothing.
-  2. The sample shape is flattened to have one dimension, i.e.,
-     `new_sample_shape = [n]` where `n = tf.reduce_prod(old_sample_shape)`.
-  3. The sample dim is cyclically rotated left by 1, i.e.,
-     `new_shape = [B1,...,Bb, k, n]` where `n` is as above, `k` is the
-     event_shape, and `B1,...,Bb` are the batch shapes for each of `b` batch
-     dimensions.
-
-  (For more details see `shape.make_batch_of_event_sample_matrices`.)
-
-  The result of the above transformation is that `X` can be regarded as a batch
-  of matrices where each column is a draw from the distribution. After
-  premultiplying by `scale`, we take the inverse of this procedure. The input
-  `Y` also undergoes the same transformation before/after premultiplying by
-  `inv(scale)`.
-
-  Example Use:
-
-  ```python
-  linalg = tf.linalg
-
-  x = [1., 2, 3]
-
-  shift = [-1., 0., 1]
-  diag = [1., 2, 3]
-  scale = linalg.LinearOperatorDiag(diag)
-  affine = AffineLinearOperator(shift, scale)
-  # In this case, `forward` is equivalent to:
-  # y = scale @ x + shift
-  y = affine.forward(x)  # [0., 4, 10]
-
-  shift = [2., 3, 1]
-  tril = [[1., 0, 0],
-          [2, 1, 0],
-          [3, 2, 1]]
-  scale = linalg.LinearOperatorLowerTriangular(tril)
-  affine = AffineLinearOperator(shift, scale)
-  # In this case, `forward` is equivalent to:
-  # np.squeeze(np.matmul(tril, np.expand_dims(x, -1)), -1) + shift
-  y = affine.forward(x)  # [3., 7, 11]
-  ```
-
-  """
-
-  def __init__(self,
-               shift=None,
-               scale=None,
-               event_ndims=1,
-               validate_args=False,
-               name="affine_linear_operator"):
-    """Instantiates the `AffineLinearOperator` bijector.
-
-    Args:
-      shift: Floating-point `Tensor`.
-      scale:  Subclass of `LinearOperator`. Represents the (batch) positive
-        definite matrix `M` in `R^{k x k}`.
-      event_ndims: Scalar `integer` `Tensor` indicating the number of dimensions
-        associated with a particular draw from the distribution. Must be 0 or 1.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str` name given to ops managed by this object.
-
-    Raises:
-      ValueError: if `event_ndims` is not 0 or 1.
-      TypeError: if `scale` is not a `LinearOperator`.
-      TypeError: if `shift.dtype` does not match `scale.dtype`.
-      ValueError: if not `scale.is_non_singular`.
-    """
-    self._graph_parents = []
-    self._name = name
-    self._validate_args = validate_args
-    graph_parents = []
-    with self._name_scope("init", values=[shift]):
-      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-      if tensor_util.constant_value(event_ndims) is not None:
-        event_ndims = tensor_util.constant_value(event_ndims)
-        if event_ndims not in (0, 1):
-          raise ValueError("event_ndims({}) was not 0 or 1".format(event_ndims))
-      else:
-        if validate_args:
-          # Shape tool will catch if event_ndims is negative.
-          event_ndims = control_flow_ops.with_dependencies(
-              [check_ops.assert_less(
-                  event_ndims, 2, message="event_ndims must be 0 or 1")],
-              event_ndims)
-        graph_parents += [event_ndims]
-
-      # In the absence of `loc` and `scale`, we'll assume `dtype` is `float32`.
-      dtype = dtypes.float32
-
-      if shift is not None:
-        shift = ops.convert_to_tensor(shift, name="shift")
-        graph_parents += [shift]
-        dtype = shift.dtype.base_dtype
-      self._shift = shift
-
-      if scale is not None:
-        if (shift is not None and
-            shift.dtype.base_dtype != scale.dtype.base_dtype):
-          raise TypeError(
-              "shift.dtype({}) is incompatible with scale.dtype({}).".format(
-                  shift.dtype, scale.dtype))
-        if not isinstance(scale, linear_operator.LinearOperator):
-          raise TypeError("scale is not an instance of tf.LinearOperator")
-        if validate_args and not scale.is_non_singular:
-          raise ValueError("Scale matrix must be non-singular.")
-        graph_parents += scale.graph_parents
-        if scale.tensor_rank is not None:
-          batch_ndims = scale.tensor_rank - 2
-        else:
-          batch_ndims = scale.tensor_rank_tensor() - 2
-          graph_parents += [batch_ndims]
-        if scale.dtype is not None:
-          dtype = scale.dtype.base_dtype
-      else:
-        batch_ndims = 0  # We won't need shape inference when scale is None.
-      self._scale = scale
-      self._shaper = _DistributionShape(
-          batch_ndims=batch_ndims,
-          event_ndims=event_ndims,
-          validate_args=validate_args)
-      super(AffineLinearOperator, self).__init__(
-          event_ndims=event_ndims,
-          graph_parents=graph_parents,
-          is_constant_jacobian=True,
-          dtype=dtype,
-          validate_args=validate_args,
-          name=name)
-
-  @property
-  def shift(self):
-    """The `shift` `Tensor` in `Y = scale @ X + shift`."""
-    return self._shift
-
-  @property
-  def scale(self):
-    """The `scale` `LinearOperator` in `Y = scale @ X + shift`."""
-    return self._scale
-
-  def _forward(self, x):
-    y = x
-    if self.scale is not None:
-      y, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
-          y, expand_batch_dim=False)
-      with ops.control_dependencies(self._maybe_collect_assertions() if
-                                    self.validate_args else []):
-        y = self.scale.matmul(y)
-      y = self._shaper.undo_make_batch_of_event_sample_matrices(
-          y, sample_shape, expand_batch_dim=False)
-    if self.shift is not None:
-      y += self.shift
-    return y
-
-  def _inverse(self, y):
-    x = y
-    if self.shift is not None:
-      x -= self.shift
-    if self.scale is not None:
-      x, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
-          x, expand_batch_dim=False)
-      # Solve fails if the op is singular so we may safely skip this assertion.
-      x = self.scale.solve(x)
-      x = self._shaper.undo_make_batch_of_event_sample_matrices(
-          x, sample_shape, expand_batch_dim=False)
-    return x
-
-  def _inverse_log_det_jacobian(self, y):
-    return -self._forward_log_det_jacobian(y)
-
-  def _forward_log_det_jacobian(self, x):  # pylint: disable=unused-argument
-    if self.scale is None:
-      return constant_op.constant(0, dtype=x.dtype.base_dtype)
-    with ops.control_dependencies(self._maybe_collect_assertions() if
-                                  self.validate_args else []):
-      return self.scale.log_abs_determinant()
-
-  def _maybe_collect_assertions(self):
-    try:
-      return [self.scale.assert_non_singular()]
-    except NotImplementedError:
-      pass
-    return []
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/chain.py b/tensorflow/contrib/distributions/python/ops/bijectors/chain.py
index 0db10fb75c8483a8209f39370362b05a03d047ca..3ce7c26213034c7345a20faa803c94a1bfa8d579 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/chain.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/chain.py
@@ -18,12 +18,151 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.chain_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+import itertools
 
-_allowed_symbols = ["Chain"]
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops.distributions import bijector
 
-remove_undocumented(__name__, _allowed_symbols)
+
+__all__ = [
+    "Chain",
+]
+
+
+class Chain(bijector.Bijector):
+  """Bijector which applies a sequence of bijectors.
+
+  Example Use:
+
+  ```python
+  chain = Chain([Exp(), Softplus()], name="one_plus_exp")
+  ```
+
+  Results in:
+
+  * Forward:
+
+   ```python
+   exp = Exp()
+   softplus = Softplus()
+   Chain([exp, softplus]).forward(x)
+   = exp.forward(softplus.forward(x))
+   = tf.exp(tf.log(1. + tf.exp(x)))
+   = 1. + tf.exp(x)
+   ```
+
+  * Inverse:
+
+   ```python
+   exp = Exp()
+   softplus = Softplus()
+   Chain([exp, softplus]).inverse(y)
+   = softplus.inverse(exp.inverse(y))
+   = tf.log(tf.exp(tf.log(y)) - 1.)
+   = tf.log(y - 1.)
+   ```
+
+  """
+
+  def __init__(self, bijectors=None, validate_args=False, name=None):
+    """Instantiates `Chain` bijector.
+
+    Args:
+      bijectors: Python `list` of bijector instances. An empty list makes this
+        bijector equivalent to the `Identity` bijector.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str`, name given to ops managed by this object. Default:
+        E.g., `Chain([Exp(), Softplus()]).name == "chain_of_exp_of_softplus"`.
+
+    Raises:
+      ValueError: if bijectors have different dtypes.
+    """
+    if bijectors is None:
+      bijectors = ()
+    self._bijectors = bijectors
+
+    for a_bijector in bijectors:
+      if not a_bijector._is_injective:  # pylint: disable=protected-access
+        raise NotImplementedError(
+            "Invert is not implemented for non-injective bijector ({})".format(
+                a_bijector.name))
+
+    dtype = list(set([b.dtype for b in bijectors]))
+    if len(dtype) > 2:
+      raise ValueError("incompatible dtypes: %s" % dtype)
+    elif len(dtype) == 2:
+      dtype = dtype[1] if dtype[0] is None else dtype[0]
+      event_ndims = bijectors[0].event_ndims
+    elif len(dtype) == 1:
+      dtype = dtype[0]
+      event_ndims = bijectors[0].event_ndims
+    else:
+      dtype = None
+      event_ndims = None
+
+    super(Chain, self).__init__(
+        graph_parents=list(itertools.chain.from_iterable(
+            b.graph_parents for b in bijectors)),
+        is_constant_jacobian=all(b.is_constant_jacobian for b in bijectors),
+        validate_args=validate_args,
+        dtype=dtype,
+        event_ndims=event_ndims,
+        name=name or ("identity" if not bijectors else
+                      "_of_".join(["chain"] + [b.name for b in bijectors])))
+
+  @property
+  def bijectors(self):
+    return self._bijectors
+
+  def _shape_helper(self, func_name, input_shape, reverse):
+    new_shape = input_shape
+    for b in reversed(self.bijectors) if reverse else self.bijectors:
+      func = getattr(b, func_name, None)
+      if func is None:
+        raise ValueError("unable to call %s on bijector %s (%s)" %
+                         (func_name, b.name, func))
+      new_shape = func(new_shape)
+    return new_shape
+
+  def _forward_event_shape(self, input_shape):
+    return self._shape_helper("forward_event_shape", input_shape,
+                              reverse=True)
+
+  def _forward_event_shape_tensor(self, input_shape):
+    return self._shape_helper(
+        "forward_event_shape_tensor", input_shape, reverse=True)
+
+  def _inverse_event_shape(self, output_shape):
+    return self._shape_helper("inverse_event_shape", output_shape,
+                              reverse=False)
+
+  def _inverse_event_shape_tensor(self, output_shape):
+    return self._shape_helper("inverse_event_shape_tensor", output_shape,
+                              reverse=False)
+
+  def _inverse(self, y, **kwargs):
+    for b in self.bijectors:
+      y = b.inverse(y, **kwargs.get(b.name, {}))
+    return y
+
+  def _inverse_log_det_jacobian(self, y, **kwargs):
+    ildj = constant_op.constant(0., dtype=y.dtype,
+                                name="inverse_log_det_jacobian")
+    for b in self.bijectors:
+      ildj += b.inverse_log_det_jacobian(y, **kwargs.get(b.name, {}))
+      y = b.inverse(y, **kwargs.get(b.name, {}))
+    return ildj
+
+  def _forward(self, x, **kwargs):
+    for b in reversed(self.bijectors):
+      x = b.forward(x, **kwargs.get(b.name, {}))
+    return x
+
+  def _forward_log_det_jacobian(self, x, **kwargs):
+    fldj = constant_op.constant(0., dtype=x.dtype,
+                                name="forward_log_det_jacobian")
+    for b in reversed(self.bijectors):
+      fldj += b.forward_log_det_jacobian(x, **kwargs.get(b.name, {}))
+      x = b.forward(x, **kwargs.get(b.name, {}))
+    return fldj
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/chain_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/chain_impl.py
deleted file mode 100644
index 3ce7c26213034c7345a20faa803c94a1bfa8d579..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/chain_impl.py
+++ /dev/null
@@ -1,168 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Chain bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import itertools
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.ops.distributions import bijector
-
-
-__all__ = [
-    "Chain",
-]
-
-
-class Chain(bijector.Bijector):
-  """Bijector which applies a sequence of bijectors.
-
-  Example Use:
-
-  ```python
-  chain = Chain([Exp(), Softplus()], name="one_plus_exp")
-  ```
-
-  Results in:
-
-  * Forward:
-
-   ```python
-   exp = Exp()
-   softplus = Softplus()
-   Chain([exp, softplus]).forward(x)
-   = exp.forward(softplus.forward(x))
-   = tf.exp(tf.log(1. + tf.exp(x)))
-   = 1. + tf.exp(x)
-   ```
-
-  * Inverse:
-
-   ```python
-   exp = Exp()
-   softplus = Softplus()
-   Chain([exp, softplus]).inverse(y)
-   = softplus.inverse(exp.inverse(y))
-   = tf.log(tf.exp(tf.log(y)) - 1.)
-   = tf.log(y - 1.)
-   ```
-
-  """
-
-  def __init__(self, bijectors=None, validate_args=False, name=None):
-    """Instantiates `Chain` bijector.
-
-    Args:
-      bijectors: Python `list` of bijector instances. An empty list makes this
-        bijector equivalent to the `Identity` bijector.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str`, name given to ops managed by this object. Default:
-        E.g., `Chain([Exp(), Softplus()]).name == "chain_of_exp_of_softplus"`.
-
-    Raises:
-      ValueError: if bijectors have different dtypes.
-    """
-    if bijectors is None:
-      bijectors = ()
-    self._bijectors = bijectors
-
-    for a_bijector in bijectors:
-      if not a_bijector._is_injective:  # pylint: disable=protected-access
-        raise NotImplementedError(
-            "Invert is not implemented for non-injective bijector ({})".format(
-                a_bijector.name))
-
-    dtype = list(set([b.dtype for b in bijectors]))
-    if len(dtype) > 2:
-      raise ValueError("incompatible dtypes: %s" % dtype)
-    elif len(dtype) == 2:
-      dtype = dtype[1] if dtype[0] is None else dtype[0]
-      event_ndims = bijectors[0].event_ndims
-    elif len(dtype) == 1:
-      dtype = dtype[0]
-      event_ndims = bijectors[0].event_ndims
-    else:
-      dtype = None
-      event_ndims = None
-
-    super(Chain, self).__init__(
-        graph_parents=list(itertools.chain.from_iterable(
-            b.graph_parents for b in bijectors)),
-        is_constant_jacobian=all(b.is_constant_jacobian for b in bijectors),
-        validate_args=validate_args,
-        dtype=dtype,
-        event_ndims=event_ndims,
-        name=name or ("identity" if not bijectors else
-                      "_of_".join(["chain"] + [b.name for b in bijectors])))
-
-  @property
-  def bijectors(self):
-    return self._bijectors
-
-  def _shape_helper(self, func_name, input_shape, reverse):
-    new_shape = input_shape
-    for b in reversed(self.bijectors) if reverse else self.bijectors:
-      func = getattr(b, func_name, None)
-      if func is None:
-        raise ValueError("unable to call %s on bijector %s (%s)" %
-                         (func_name, b.name, func))
-      new_shape = func(new_shape)
-    return new_shape
-
-  def _forward_event_shape(self, input_shape):
-    return self._shape_helper("forward_event_shape", input_shape,
-                              reverse=True)
-
-  def _forward_event_shape_tensor(self, input_shape):
-    return self._shape_helper(
-        "forward_event_shape_tensor", input_shape, reverse=True)
-
-  def _inverse_event_shape(self, output_shape):
-    return self._shape_helper("inverse_event_shape", output_shape,
-                              reverse=False)
-
-  def _inverse_event_shape_tensor(self, output_shape):
-    return self._shape_helper("inverse_event_shape_tensor", output_shape,
-                              reverse=False)
-
-  def _inverse(self, y, **kwargs):
-    for b in self.bijectors:
-      y = b.inverse(y, **kwargs.get(b.name, {}))
-    return y
-
-  def _inverse_log_det_jacobian(self, y, **kwargs):
-    ildj = constant_op.constant(0., dtype=y.dtype,
-                                name="inverse_log_det_jacobian")
-    for b in self.bijectors:
-      ildj += b.inverse_log_det_jacobian(y, **kwargs.get(b.name, {}))
-      y = b.inverse(y, **kwargs.get(b.name, {}))
-    return ildj
-
-  def _forward(self, x, **kwargs):
-    for b in reversed(self.bijectors):
-      x = b.forward(x, **kwargs.get(b.name, {}))
-    return x
-
-  def _forward_log_det_jacobian(self, x, **kwargs):
-    fldj = constant_op.constant(0., dtype=x.dtype,
-                                name="forward_log_det_jacobian")
-    for b in reversed(self.bijectors):
-      fldj += b.forward_log_det_jacobian(x, **kwargs.get(b.name, {}))
-      x = b.forward(x, **kwargs.get(b.name, {}))
-    return fldj
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
index 4686af8bc42a3232cb3a34f2cfcce8323c5896dd..cbd60f92a60612c6cf791b2c7708a3310c6e2b6b 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
@@ -18,12 +18,219 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.cholesky_outer_product_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+import numpy as np
 
-_allowed_symbols = ["CholeskyOuterProduct"]
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
+from tensorflow.python.ops.distributions import util as distribution_util
 
-remove_undocumented(__name__, _allowed_symbols)
+
+__all__ = [
+    "CholeskyOuterProduct",
+]
+
+
+class CholeskyOuterProduct(bijector.Bijector):
+  """Compute `g(X) = X @ X.T`; X is lower-triangular, positive-diagonal matrix.
+
+  `event_ndims` must be 0 or 2, i.e., scalar or matrix.
+
+  Note: the upper-triangular part of X is ignored (whether or not its zero).
+
+  The surjectivity of g as a map from  the set of n x n positive-diagonal
+  lower-triangular matrices to the set of SPD matrices follows immediately from
+  executing the Cholesky factorization algorithm on an SPD matrix A to produce a
+  positive-diagonal lower-triangular matrix L such that `A = L @ L.T`.
+
+  To prove the injectivity of g, suppose that L_1 and L_2 are lower-triangular
+  with positive diagonals and satisfy `A = L_1 @ L_1.T = L_2 @ L_2.T`. Then
+    `inv(L_1) @ A @ inv(L_1).T = [inv(L_1) @ L_2] @ [inv(L_1) @ L_2].T = I`.
+  Setting `L_3 := inv(L_1) @ L_2`, that L_3 is a positive-diagonal
+  lower-triangular matrix follows from `inv(L_1)` being positive-diagonal
+  lower-triangular (which follows from the diagonal of a triangular matrix being
+  its spectrum), and that the product of two positive-diagonal lower-triangular
+  matrices is another positive-diagonal lower-triangular matrix.
+
+  A simple inductive argument (proceding one column of L_3 at a time) shows
+  that, if `I = L_3 @ L_3.T`, with L_3 being lower-triangular with positive-
+  diagonal, then `L_3 = I`. Thus, `L_1 = L_2`, proving injectivity of g.
+
+  Examples:
+
+  ```python
+  bijector.CholeskyOuterProduct(event_ndims=2).forward(x=[[1., 0], [2, 1]])
+  # Result: [[1., 2], [2, 5]], i.e., x @ x.T
+
+  bijector.CholeskyOuterProduct(event_ndims=2).inverse(y=[[1., 2], [2, 5]])
+  # Result: [[1., 0], [2, 1]], i.e., cholesky(y).
+  ```
+
+  """
+
+  def __init__(self, event_ndims=2, validate_args=False,
+               name="cholesky_outer_product"):
+    """Instantiates the `CholeskyOuterProduct` bijector.
+
+    Args:
+      event_ndims: `constant` `int32` scalar `Tensor` indicating the number of
+        dimensions associated with a particular draw from the distribution. Must
+        be 0 or 2.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+
+    Raises:
+      ValueError: if event_ndims is neither 0 or 2.
+    """
+    self._graph_parents = []
+    self._name = name
+    with self._name_scope("init", values=[event_ndims]):
+      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
+      event_ndims = tensor_util.constant_value(event_ndims)
+    if event_ndims is None or event_ndims not in [0, 2]:
+      raise ValueError("`event_ndims` must be a TF constant which is 0 or 2")
+    self._static_event_ndims = event_ndims
+    super(CholeskyOuterProduct, self).__init__(
+        event_ndims=event_ndims,
+        validate_args=validate_args,
+        name=name)
+
+  def _forward(self, x):
+    if self._static_event_ndims == 0:
+      return math_ops.square(x)
+    if self.validate_args:
+      is_matrix = check_ops.assert_rank_at_least(x, 2)
+      shape = array_ops.shape(x)
+      is_square = check_ops.assert_equal(shape[-2], shape[-1])
+      x = control_flow_ops.with_dependencies([is_matrix, is_square], x)
+    # For safety, explicitly zero-out the upper triangular part.
+    x = array_ops.matrix_band_part(x, -1, 0)
+    return math_ops.matmul(x, x, adjoint_b=True)
+
+  def _inverse(self, y):
+    return (math_ops.sqrt(y) if self._static_event_ndims == 0
+            else linalg_ops.cholesky(y))
+
+  def _inverse_log_det_jacobian(self, y):
+    return -self._forward_log_det_jacobian(x=self._inverse(y))
+
+  def _forward_log_det_jacobian(self, x):
+    # Let Y be a symmetric, positive definite matrix and write:
+    #   Y = X X.T
+    # where X is lower-triangular.
+    #
+    # Observe that,
+    #   dY[i,j]/dX[a,b]
+    #   = d/dX[a,b] { X[i,:] X[j,:] }
+    #   = sum_{d=1}^p { I[i=a] I[d=b] X[j,d] + I[j=a] I[d=b] X[i,d] }
+    #
+    # To compute the Jacobian dX/dY we must represent X,Y as vectors. Since Y is
+    # symmetric and X is lower-triangular, we need vectors of dimension:
+    #   d = p (p + 1) / 2
+    # where X, Y are p x p matrices, p > 0. We use a row-major mapping, i.e.,
+    #   k = { i (i + 1) / 2 + j   i>=j
+    #       { undef               i<j
+    # and assume zero-based indexes. When k is undef, the element is dropped.
+    # Example:
+    #           j      k
+    #        0 1 2 3  /
+    #    0 [ 0 . . . ]
+    # i  1 [ 1 2 . . ]
+    #    2 [ 3 4 5 . ]
+    #    3 [ 6 7 8 9 ]
+    # Write vec[.] to indicate transforming a matrix to vector via k(i,j). (With
+    # slight abuse: k(i,j)=undef means the element is dropped.)
+    #
+    # We now show d vec[Y] / d vec[X] is lower triangular. Assuming both are
+    # defined, observe that k(i,j) < k(a,b) iff (1) i<a or (2) i=a and j<b.
+    # In both cases dvec[Y]/dvec[X]@[k(i,j),k(a,b)] = 0 since:
+    # (1) j<=i<a thus i,j!=a.
+    # (2) i=a>j  thus i,j!=a.
+    #
+    # Since the Jacobian is lower-triangular, we need only compute the product
+    # of diagonal elements:
+    #   d vec[Y] / d vec[X] @[k(i,j), k(i,j)]
+    #   = X[j,j] + I[i=j] X[i,j]
+    #   = 2 X[j,j].
+    # Since there is a 2 X[j,j] term for every lower-triangular element of X we
+    # conclude:
+    #   |Jac(d vec[Y]/d vec[X])| = 2^p prod_{j=0}^{p-1} X[j,j]^{p-j}.
+    if self._static_event_ndims == 0:
+      if self.validate_args:
+        is_positive = check_ops.assert_positive(
+            x, message="All elements must be positive.")
+        x = control_flow_ops.with_dependencies([is_positive], x)
+      return np.log(2.) + math_ops.log(x)
+
+    diag = array_ops.matrix_diag_part(x)
+
+    # We now ensure diag is columnar. Eg, if `diag = [1, 2, 3]` then the output
+    # is `[[1], [2], [3]]` and if `diag = [[1, 2, 3], [4, 5, 6]]` then the
+    # output is unchanged.
+    diag = self._make_columnar(diag)
+
+    if self.validate_args:
+      is_matrix = check_ops.assert_rank_at_least(
+          x, 2, message="Input must be a (batch of) matrix.")
+      shape = array_ops.shape(x)
+      is_square = check_ops.assert_equal(
+          shape[-2], shape[-1],
+          message="Input must be a (batch of) square matrix.")
+      # Assuming lower-triangular means we only need check diag>0.
+      is_positive_definite = check_ops.assert_positive(
+          diag, message="Input must be positive definite.")
+      x = control_flow_ops.with_dependencies(
+          [is_matrix, is_square, is_positive_definite], x)
+
+    # Create a vector equal to: [p, p-1, ..., 2, 1].
+    if x.get_shape().ndims is None or x.get_shape()[-1].value is None:
+      p_int = array_ops.shape(x)[-1]
+      p_float = math_ops.cast(p_int, dtype=x.dtype)
+    else:
+      p_int = x.get_shape()[-1].value
+      p_float = np.array(p_int, dtype=x.dtype.as_numpy_dtype)
+    exponents = math_ops.linspace(p_float, 1., p_int)
+
+    sum_weighted_log_diag = array_ops.squeeze(
+        math_ops.matmul(math_ops.log(diag),
+                        exponents[..., array_ops.newaxis]),
+        squeeze_dims=-1)
+    fldj = p_float * np.log(2.) + sum_weighted_log_diag
+
+    return fldj
+
+  def _make_columnar(self, x):
+    """Ensures non-scalar input has at least one column.
+
+    Example:
+      If `x = [1, 2, 3]` then the output is `[[1], [2], [3]]`.
+
+      If `x = [[1, 2, 3], [4, 5, 6]]` then the output is unchanged.
+
+      If `x = 1` then the output is unchanged.
+
+    Args:
+      x: `Tensor`.
+
+    Returns:
+      columnar_x: `Tensor` with at least two dimensions.
+    """
+    if x.get_shape().ndims is not None:
+      if x.get_shape().ndims == 1:
+        x = x[array_ops.newaxis, :]
+      return x
+    shape = array_ops.shape(x)
+    maybe_expanded_shape = array_ops.concat([
+        shape[:-1],
+        distribution_util.pick_vector(
+            math_ops.equal(array_ops.rank(x), 1),
+            [1], np.array([], dtype=np.int32)),
+        shape[-1:],
+    ], 0)
+    return array_ops.reshape(x, maybe_expanded_shape)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product_impl.py
deleted file mode 100644
index cbd60f92a60612c6cf791b2c7708a3310c6e2b6b..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product_impl.py
+++ /dev/null
@@ -1,236 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""CholeskyOuterProduct bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import linalg_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector
-from tensorflow.python.ops.distributions import util as distribution_util
-
-
-__all__ = [
-    "CholeskyOuterProduct",
-]
-
-
-class CholeskyOuterProduct(bijector.Bijector):
-  """Compute `g(X) = X @ X.T`; X is lower-triangular, positive-diagonal matrix.
-
-  `event_ndims` must be 0 or 2, i.e., scalar or matrix.
-
-  Note: the upper-triangular part of X is ignored (whether or not its zero).
-
-  The surjectivity of g as a map from  the set of n x n positive-diagonal
-  lower-triangular matrices to the set of SPD matrices follows immediately from
-  executing the Cholesky factorization algorithm on an SPD matrix A to produce a
-  positive-diagonal lower-triangular matrix L such that `A = L @ L.T`.
-
-  To prove the injectivity of g, suppose that L_1 and L_2 are lower-triangular
-  with positive diagonals and satisfy `A = L_1 @ L_1.T = L_2 @ L_2.T`. Then
-    `inv(L_1) @ A @ inv(L_1).T = [inv(L_1) @ L_2] @ [inv(L_1) @ L_2].T = I`.
-  Setting `L_3 := inv(L_1) @ L_2`, that L_3 is a positive-diagonal
-  lower-triangular matrix follows from `inv(L_1)` being positive-diagonal
-  lower-triangular (which follows from the diagonal of a triangular matrix being
-  its spectrum), and that the product of two positive-diagonal lower-triangular
-  matrices is another positive-diagonal lower-triangular matrix.
-
-  A simple inductive argument (proceding one column of L_3 at a time) shows
-  that, if `I = L_3 @ L_3.T`, with L_3 being lower-triangular with positive-
-  diagonal, then `L_3 = I`. Thus, `L_1 = L_2`, proving injectivity of g.
-
-  Examples:
-
-  ```python
-  bijector.CholeskyOuterProduct(event_ndims=2).forward(x=[[1., 0], [2, 1]])
-  # Result: [[1., 2], [2, 5]], i.e., x @ x.T
-
-  bijector.CholeskyOuterProduct(event_ndims=2).inverse(y=[[1., 2], [2, 5]])
-  # Result: [[1., 0], [2, 1]], i.e., cholesky(y).
-  ```
-
-  """
-
-  def __init__(self, event_ndims=2, validate_args=False,
-               name="cholesky_outer_product"):
-    """Instantiates the `CholeskyOuterProduct` bijector.
-
-    Args:
-      event_ndims: `constant` `int32` scalar `Tensor` indicating the number of
-        dimensions associated with a particular draw from the distribution. Must
-        be 0 or 2.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str` name given to ops managed by this object.
-
-    Raises:
-      ValueError: if event_ndims is neither 0 or 2.
-    """
-    self._graph_parents = []
-    self._name = name
-    with self._name_scope("init", values=[event_ndims]):
-      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-      event_ndims = tensor_util.constant_value(event_ndims)
-    if event_ndims is None or event_ndims not in [0, 2]:
-      raise ValueError("`event_ndims` must be a TF constant which is 0 or 2")
-    self._static_event_ndims = event_ndims
-    super(CholeskyOuterProduct, self).__init__(
-        event_ndims=event_ndims,
-        validate_args=validate_args,
-        name=name)
-
-  def _forward(self, x):
-    if self._static_event_ndims == 0:
-      return math_ops.square(x)
-    if self.validate_args:
-      is_matrix = check_ops.assert_rank_at_least(x, 2)
-      shape = array_ops.shape(x)
-      is_square = check_ops.assert_equal(shape[-2], shape[-1])
-      x = control_flow_ops.with_dependencies([is_matrix, is_square], x)
-    # For safety, explicitly zero-out the upper triangular part.
-    x = array_ops.matrix_band_part(x, -1, 0)
-    return math_ops.matmul(x, x, adjoint_b=True)
-
-  def _inverse(self, y):
-    return (math_ops.sqrt(y) if self._static_event_ndims == 0
-            else linalg_ops.cholesky(y))
-
-  def _inverse_log_det_jacobian(self, y):
-    return -self._forward_log_det_jacobian(x=self._inverse(y))
-
-  def _forward_log_det_jacobian(self, x):
-    # Let Y be a symmetric, positive definite matrix and write:
-    #   Y = X X.T
-    # where X is lower-triangular.
-    #
-    # Observe that,
-    #   dY[i,j]/dX[a,b]
-    #   = d/dX[a,b] { X[i,:] X[j,:] }
-    #   = sum_{d=1}^p { I[i=a] I[d=b] X[j,d] + I[j=a] I[d=b] X[i,d] }
-    #
-    # To compute the Jacobian dX/dY we must represent X,Y as vectors. Since Y is
-    # symmetric and X is lower-triangular, we need vectors of dimension:
-    #   d = p (p + 1) / 2
-    # where X, Y are p x p matrices, p > 0. We use a row-major mapping, i.e.,
-    #   k = { i (i + 1) / 2 + j   i>=j
-    #       { undef               i<j
-    # and assume zero-based indexes. When k is undef, the element is dropped.
-    # Example:
-    #           j      k
-    #        0 1 2 3  /
-    #    0 [ 0 . . . ]
-    # i  1 [ 1 2 . . ]
-    #    2 [ 3 4 5 . ]
-    #    3 [ 6 7 8 9 ]
-    # Write vec[.] to indicate transforming a matrix to vector via k(i,j). (With
-    # slight abuse: k(i,j)=undef means the element is dropped.)
-    #
-    # We now show d vec[Y] / d vec[X] is lower triangular. Assuming both are
-    # defined, observe that k(i,j) < k(a,b) iff (1) i<a or (2) i=a and j<b.
-    # In both cases dvec[Y]/dvec[X]@[k(i,j),k(a,b)] = 0 since:
-    # (1) j<=i<a thus i,j!=a.
-    # (2) i=a>j  thus i,j!=a.
-    #
-    # Since the Jacobian is lower-triangular, we need only compute the product
-    # of diagonal elements:
-    #   d vec[Y] / d vec[X] @[k(i,j), k(i,j)]
-    #   = X[j,j] + I[i=j] X[i,j]
-    #   = 2 X[j,j].
-    # Since there is a 2 X[j,j] term for every lower-triangular element of X we
-    # conclude:
-    #   |Jac(d vec[Y]/d vec[X])| = 2^p prod_{j=0}^{p-1} X[j,j]^{p-j}.
-    if self._static_event_ndims == 0:
-      if self.validate_args:
-        is_positive = check_ops.assert_positive(
-            x, message="All elements must be positive.")
-        x = control_flow_ops.with_dependencies([is_positive], x)
-      return np.log(2.) + math_ops.log(x)
-
-    diag = array_ops.matrix_diag_part(x)
-
-    # We now ensure diag is columnar. Eg, if `diag = [1, 2, 3]` then the output
-    # is `[[1], [2], [3]]` and if `diag = [[1, 2, 3], [4, 5, 6]]` then the
-    # output is unchanged.
-    diag = self._make_columnar(diag)
-
-    if self.validate_args:
-      is_matrix = check_ops.assert_rank_at_least(
-          x, 2, message="Input must be a (batch of) matrix.")
-      shape = array_ops.shape(x)
-      is_square = check_ops.assert_equal(
-          shape[-2], shape[-1],
-          message="Input must be a (batch of) square matrix.")
-      # Assuming lower-triangular means we only need check diag>0.
-      is_positive_definite = check_ops.assert_positive(
-          diag, message="Input must be positive definite.")
-      x = control_flow_ops.with_dependencies(
-          [is_matrix, is_square, is_positive_definite], x)
-
-    # Create a vector equal to: [p, p-1, ..., 2, 1].
-    if x.get_shape().ndims is None or x.get_shape()[-1].value is None:
-      p_int = array_ops.shape(x)[-1]
-      p_float = math_ops.cast(p_int, dtype=x.dtype)
-    else:
-      p_int = x.get_shape()[-1].value
-      p_float = np.array(p_int, dtype=x.dtype.as_numpy_dtype)
-    exponents = math_ops.linspace(p_float, 1., p_int)
-
-    sum_weighted_log_diag = array_ops.squeeze(
-        math_ops.matmul(math_ops.log(diag),
-                        exponents[..., array_ops.newaxis]),
-        squeeze_dims=-1)
-    fldj = p_float * np.log(2.) + sum_weighted_log_diag
-
-    return fldj
-
-  def _make_columnar(self, x):
-    """Ensures non-scalar input has at least one column.
-
-    Example:
-      If `x = [1, 2, 3]` then the output is `[[1], [2], [3]]`.
-
-      If `x = [[1, 2, 3], [4, 5, 6]]` then the output is unchanged.
-
-      If `x = 1` then the output is unchanged.
-
-    Args:
-      x: `Tensor`.
-
-    Returns:
-      columnar_x: `Tensor` with at least two dimensions.
-    """
-    if x.get_shape().ndims is not None:
-      if x.get_shape().ndims == 1:
-        x = x[array_ops.newaxis, :]
-      return x
-    shape = array_ops.shape(x)
-    maybe_expanded_shape = array_ops.concat([
-        shape[:-1],
-        distribution_util.pick_vector(
-            math_ops.equal(array_ops.rank(x), 1),
-            [1], np.array([], dtype=np.int32)),
-        shape[-1:],
-    ], 0)
-    return array_ops.reshape(x, maybe_expanded_shape)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector.py b/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector.py
index d254b635d28099a09a2054536f04ffee3a355b2f..ccb1f029277bc07011df7be047a075274f2b3a27 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector.py
@@ -18,12 +18,38 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.conditional_bijector_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.ops.distributions import bijector
+from tensorflow.python.ops.distributions import util as distribution_util
 
-_allowed_symbols = ["ConditionalBijector"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = ["ConditionalBijector"]
+
+
+class ConditionalBijector(bijector.Bijector):
+  """Conditional Bijector is a Bijector that allows intrinsic conditioning."""
+
+  @distribution_util.AppendDocstring(kwargs_dict={
+      "**condition_kwargs":
+      "Named arguments forwarded to subclass implementation."})
+  def forward(self, x, name="forward", **condition_kwargs):
+    return self._call_forward(x, name, **condition_kwargs)
+
+  @distribution_util.AppendDocstring(kwargs_dict={
+      "**condition_kwargs":
+      "Named arguments forwarded to subclass implementation."})
+  def inverse(self, y, name="inverse", **condition_kwargs):
+    return self._call_inverse(y, name, **condition_kwargs)
+
+  @distribution_util.AppendDocstring(kwargs_dict={
+      "**condition_kwargs":
+      "Named arguments forwarded to subclass implementation."})
+  def inverse_log_det_jacobian(
+      self, y, name="inverse_log_det_jacobian", **condition_kwargs):
+    return self._call_inverse_log_det_jacobian(y, name, **condition_kwargs)
+
+  @distribution_util.AppendDocstring(kwargs_dict={
+      "**condition_kwargs":
+      "Named arguments forwarded to subclass implementation."})
+  def forward_log_det_jacobian(
+      self, x, name="forward_log_det_jacobian", **condition_kwargs):
+    return self._call_forward_log_det_jacobian(x, name, **condition_kwargs)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector_impl.py
deleted file mode 100644
index ccb1f029277bc07011df7be047a075274f2b3a27..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector_impl.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""ConditionalBijector base."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.ops.distributions import bijector
-from tensorflow.python.ops.distributions import util as distribution_util
-
-
-__all__ = ["ConditionalBijector"]
-
-
-class ConditionalBijector(bijector.Bijector):
-  """Conditional Bijector is a Bijector that allows intrinsic conditioning."""
-
-  @distribution_util.AppendDocstring(kwargs_dict={
-      "**condition_kwargs":
-      "Named arguments forwarded to subclass implementation."})
-  def forward(self, x, name="forward", **condition_kwargs):
-    return self._call_forward(x, name, **condition_kwargs)
-
-  @distribution_util.AppendDocstring(kwargs_dict={
-      "**condition_kwargs":
-      "Named arguments forwarded to subclass implementation."})
-  def inverse(self, y, name="inverse", **condition_kwargs):
-    return self._call_inverse(y, name, **condition_kwargs)
-
-  @distribution_util.AppendDocstring(kwargs_dict={
-      "**condition_kwargs":
-      "Named arguments forwarded to subclass implementation."})
-  def inverse_log_det_jacobian(
-      self, y, name="inverse_log_det_jacobian", **condition_kwargs):
-    return self._call_inverse_log_det_jacobian(y, name, **condition_kwargs)
-
-  @distribution_util.AppendDocstring(kwargs_dict={
-      "**condition_kwargs":
-      "Named arguments forwarded to subclass implementation."})
-  def forward_log_det_jacobian(
-      self, x, name="forward_log_det_jacobian", **condition_kwargs):
-    return self._call_forward_log_det_jacobian(x, name, **condition_kwargs)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/exp.py b/tensorflow/contrib/distributions/python/ops/bijectors/exp.py
index 399d713098eb7223601beb9518dc51dd6160ad64..b1ff840d62a73c941a4d67dec73b5c9f4d5353f9 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/exp.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/exp.py
@@ -18,12 +18,49 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.exp_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.contrib.distributions.python.ops.bijectors import power_transform
 
-_allowed_symbols = ["Exp"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "Exp",
+]
+
+
+class Exp(power_transform.PowerTransform):
+  """Compute `Y = g(X) = exp(X)`.
+
+    Example Use:
+
+    ```python
+    # Create the Y=g(X)=exp(X) transform which works only on Tensors with 1
+    # batch ndim and 2 event ndims (i.e., vector of matrices).
+    exp = Exp(event_ndims=2)
+    x = [[[1., 2],
+           [3, 4]],
+          [[5, 6],
+           [7, 8]]]
+    exp(x) == exp.forward(x)
+    log(x) == exp.inverse(x)
+    ```
+
+    Note: the exp(.) is applied element-wise but the Jacobian is a reduction
+    over the event space.
+  """
+
+  def __init__(self,
+               event_ndims=0,
+               validate_args=False,
+               name="exp"):
+    """Instantiates the `Exp` bijector.
+
+    Args:
+      event_ndims: Scalar `int32` `Tensor` indicating the number of dimensions
+        associated with a particular draw from the distribution.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+    """
+    super(Exp, self).__init__(
+        event_ndims=event_ndims,
+        validate_args=validate_args,
+        name=name)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/exp_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/exp_impl.py
deleted file mode 100644
index b1ff840d62a73c941a4d67dec73b5c9f4d5353f9..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/exp_impl.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Exp bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.distributions.python.ops.bijectors import power_transform
-
-
-__all__ = [
-    "Exp",
-]
-
-
-class Exp(power_transform.PowerTransform):
-  """Compute `Y = g(X) = exp(X)`.
-
-    Example Use:
-
-    ```python
-    # Create the Y=g(X)=exp(X) transform which works only on Tensors with 1
-    # batch ndim and 2 event ndims (i.e., vector of matrices).
-    exp = Exp(event_ndims=2)
-    x = [[[1., 2],
-           [3, 4]],
-          [[5, 6],
-           [7, 8]]]
-    exp(x) == exp.forward(x)
-    log(x) == exp.inverse(x)
-    ```
-
-    Note: the exp(.) is applied element-wise but the Jacobian is a reduction
-    over the event space.
-  """
-
-  def __init__(self,
-               event_ndims=0,
-               validate_args=False,
-               name="exp"):
-    """Instantiates the `Exp` bijector.
-
-    Args:
-      event_ndims: Scalar `int32` `Tensor` indicating the number of dimensions
-        associated with a particular draw from the distribution.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str` name given to ops managed by this object.
-    """
-    super(Exp, self).__init__(
-        event_ndims=event_ndims,
-        validate_args=validate_args,
-        name=name)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/gumbel.py b/tensorflow/contrib/distributions/python/ops/bijectors/gumbel.py
index cf37aa51115ed98ab263bc03bcb297a03432a7ae..67f39785563255be0fe154aca3cbcf01c6a01e73 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/gumbel.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/gumbel.py
@@ -18,12 +18,107 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.gumbel_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
 
-_allowed_symbols = ["Gumbel"]
+__all__ = [
+    "Gumbel",
+]
 
-remove_undocumented(__name__, _allowed_symbols)
+
+class Gumbel(bijector.Bijector):
+  """Compute `Y = g(X) = exp(-exp(-(X - loc) / scale))`.
+
+  This bijector maps inputs from `[-inf, inf]` to [0, 1]`. The inverse of the
+  bijector applied to a uniform random variable `X ~ U(0, 1) gives back a
+  random variable with the
+  [Gumbel distribution](https://en.wikipedia.org/wiki/Gumbel_distribution):
+
+  ```none
+  Y ~ Gumbel(loc, scale)
+  pdf(y; loc, scale) = exp(
+    -( (y - loc) / scale + exp(- (y - loc) / scale) ) ) / scale
+  ```
+  """
+
+  def __init__(self,
+               loc=0.,
+               scale=1.,
+               event_ndims=0,
+               validate_args=False,
+               name="gumbel"):
+    """Instantiates the `Gumbel` bijector.
+
+    Args:
+      loc: Float-like `Tensor` that is the same dtype and is
+        broadcastable with `scale`.
+        This is `loc` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`.
+      scale: Positive Float-like `Tensor` that is the same dtype and is
+        broadcastable with `loc`.
+        This is `scale` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`.
+      event_ndims: Python scalar indicating the number of dimensions associated
+        with a particular draw from the distribution.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+    """
+    self._graph_parents = []
+    self._name = name
+    self._validate_args = validate_args
+    with self._name_scope("init", values=[loc, scale]):
+      self._loc = ops.convert_to_tensor(loc, name="loc")
+      self._scale = ops.convert_to_tensor(scale, name="scale")
+      check_ops.assert_same_float_dtype([self._loc, self._scale])
+      if validate_args:
+        self._scale = control_flow_ops.with_dependencies([
+            check_ops.assert_positive(
+                self._scale, message="Argument scale was not positive")
+        ], self._scale)
+
+    super(Gumbel, self).__init__(
+        event_ndims=event_ndims, validate_args=validate_args, name=name)
+
+  @property
+  def loc(self):
+    """The `loc` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`."""
+    return self._loc
+
+  @property
+  def scale(self):
+    """This is `scale` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`."""
+    return self._scale
+
+  def _forward(self, x):
+    z = (x - self.loc) / self.scale
+    return math_ops.exp(-math_ops.exp(-z))
+
+  def _inverse(self, y):
+    y = self._maybe_assert_valid_y(y)
+    return self.loc - self.scale * math_ops.log(-math_ops.log(y))
+
+  def _inverse_log_det_jacobian(self, y):
+    y = self._maybe_assert_valid_y(y)
+    event_dims = self._event_dims_tensor(y)
+    return math_ops.reduce_sum(
+        math_ops.log(self.scale / (-math_ops.log(y) * y)), axis=event_dims)
+
+  def _forward_log_det_jacobian(self, x):
+    event_dims = self._event_dims_tensor(x)
+    z = (x - self.loc) / self.scale
+    return math_ops.reduce_sum(
+        -z - math_ops.exp(-z) - math_ops.log(self.scale), axis=event_dims)
+
+  def _maybe_assert_valid_y(self, y):
+    if not self.validate_args:
+      return y
+    is_positive = check_ops.assert_non_negative(
+        y, message="Inverse transformation input must be greater than 0.")
+    less_than_one = check_ops.assert_less_equal(
+        y,
+        constant_op.constant(1., y.dtype),
+        message="Inverse transformation input must be less than or equal to 1.")
+    return control_flow_ops.with_dependencies([is_positive, less_than_one], y)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/gumbel_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/gumbel_impl.py
deleted file mode 100644
index 67f39785563255be0fe154aca3cbcf01c6a01e73..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/gumbel_impl.py
+++ /dev/null
@@ -1,124 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Gumbel bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector
-
-__all__ = [
-    "Gumbel",
-]
-
-
-class Gumbel(bijector.Bijector):
-  """Compute `Y = g(X) = exp(-exp(-(X - loc) / scale))`.
-
-  This bijector maps inputs from `[-inf, inf]` to [0, 1]`. The inverse of the
-  bijector applied to a uniform random variable `X ~ U(0, 1) gives back a
-  random variable with the
-  [Gumbel distribution](https://en.wikipedia.org/wiki/Gumbel_distribution):
-
-  ```none
-  Y ~ Gumbel(loc, scale)
-  pdf(y; loc, scale) = exp(
-    -( (y - loc) / scale + exp(- (y - loc) / scale) ) ) / scale
-  ```
-  """
-
-  def __init__(self,
-               loc=0.,
-               scale=1.,
-               event_ndims=0,
-               validate_args=False,
-               name="gumbel"):
-    """Instantiates the `Gumbel` bijector.
-
-    Args:
-      loc: Float-like `Tensor` that is the same dtype and is
-        broadcastable with `scale`.
-        This is `loc` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`.
-      scale: Positive Float-like `Tensor` that is the same dtype and is
-        broadcastable with `loc`.
-        This is `scale` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str` name given to ops managed by this object.
-    """
-    self._graph_parents = []
-    self._name = name
-    self._validate_args = validate_args
-    with self._name_scope("init", values=[loc, scale]):
-      self._loc = ops.convert_to_tensor(loc, name="loc")
-      self._scale = ops.convert_to_tensor(scale, name="scale")
-      check_ops.assert_same_float_dtype([self._loc, self._scale])
-      if validate_args:
-        self._scale = control_flow_ops.with_dependencies([
-            check_ops.assert_positive(
-                self._scale, message="Argument scale was not positive")
-        ], self._scale)
-
-    super(Gumbel, self).__init__(
-        event_ndims=event_ndims, validate_args=validate_args, name=name)
-
-  @property
-  def loc(self):
-    """The `loc` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`."""
-    return self._loc
-
-  @property
-  def scale(self):
-    """This is `scale` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`."""
-    return self._scale
-
-  def _forward(self, x):
-    z = (x - self.loc) / self.scale
-    return math_ops.exp(-math_ops.exp(-z))
-
-  def _inverse(self, y):
-    y = self._maybe_assert_valid_y(y)
-    return self.loc - self.scale * math_ops.log(-math_ops.log(y))
-
-  def _inverse_log_det_jacobian(self, y):
-    y = self._maybe_assert_valid_y(y)
-    event_dims = self._event_dims_tensor(y)
-    return math_ops.reduce_sum(
-        math_ops.log(self.scale / (-math_ops.log(y) * y)), axis=event_dims)
-
-  def _forward_log_det_jacobian(self, x):
-    event_dims = self._event_dims_tensor(x)
-    z = (x - self.loc) / self.scale
-    return math_ops.reduce_sum(
-        -z - math_ops.exp(-z) - math_ops.log(self.scale), axis=event_dims)
-
-  def _maybe_assert_valid_y(self, y):
-    if not self.validate_args:
-      return y
-    is_positive = check_ops.assert_non_negative(
-        y, message="Inverse transformation input must be greater than 0.")
-    less_than_one = check_ops.assert_less_equal(
-        y,
-        constant_op.constant(1., y.dtype),
-        message="Inverse transformation input must be less than or equal to 1.")
-    return control_flow_ops.with_dependencies([is_positive, less_than_one], y)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/inline.py b/tensorflow/contrib/distributions/python/ops/bijectors/inline.py
index db10c3fc3a9135b4c408ada74622ba9b360f9ec1..fab1b22fbf92e7b92a5ec86ec62d66bec71a8c94 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/inline.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/inline.py
@@ -18,12 +18,124 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.inline_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.ops.distributions import bijector
 
-_allowed_symbols = ["Inline"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "Inline",
+]
+
+
+class Inline(bijector.Bijector):
+  """Bijector constructed from custom callables.
+
+  Example Use:
+
+  ```python
+  exp = Inline(
+    forward_fn=tf.exp,
+    inverse_fn=tf.log,
+    inverse_log_det_jacobian_fn=(
+      lambda y: -tf.reduce_sum(tf.log(y), axis=-1)),
+    name="exp")
+  ```
+
+  The above example is equivalent to the `Bijector` `Exp(event_ndims=1)`.
+  """
+
+  def __init__(self,
+               forward_fn=None,
+               inverse_fn=None,
+               inverse_log_det_jacobian_fn=None,
+               forward_log_det_jacobian_fn=None,
+               forward_event_shape_fn=None,
+               forward_event_shape_tensor_fn=None,
+               inverse_event_shape_fn=None,
+               inverse_event_shape_tensor_fn=None,
+               is_constant_jacobian=False,
+               validate_args=False,
+               name="inline"):
+    """Creates a `Bijector` from callables.
+
+    Args:
+      forward_fn: Python callable implementing the forward transformation.
+      inverse_fn: Python callable implementing the inverse transformation.
+      inverse_log_det_jacobian_fn: Python callable implementing the
+        log o det o jacobian of the inverse transformation.
+      forward_log_det_jacobian_fn: Python callable implementing the
+        log o det o jacobian of the forward transformation.
+      forward_event_shape_fn: Python callable implementing non-identical
+        static event shape changes. Default: shape is assumed unchanged.
+      forward_event_shape_tensor_fn: Python callable implementing non-identical
+        event shape changes. Default: shape is assumed unchanged.
+      inverse_event_shape_fn: Python callable implementing non-identical
+        static event shape changes. Default: shape is assumed unchanged.
+      inverse_event_shape_tensor_fn: Python callable implementing non-identical
+        event shape changes. Default: shape is assumed unchanged.
+      is_constant_jacobian: Python `bool` indicating that the Jacobian is
+        constant for all input arguments.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str`, name given to ops managed by this object.
+    """
+    super(Inline, self).__init__(
+        event_ndims=0,
+        is_constant_jacobian=is_constant_jacobian,
+        validate_args=validate_args,
+        name=name)
+    self._forward_fn = forward_fn
+    self._inverse_fn = inverse_fn
+    self._inverse_log_det_jacobian_fn = inverse_log_det_jacobian_fn
+    self._forward_log_det_jacobian_fn = forward_log_det_jacobian_fn
+    self._forward_event_shape_fn = forward_event_shape_fn
+    self._forward_event_shape_tensor_fn = forward_event_shape_tensor_fn
+    self._inverse_event_shape_fn = inverse_event_shape_fn
+    self._inverse_event_shape_tensor_fn = inverse_event_shape_tensor_fn
+
+  def _forward_event_shape(self, input_shape):
+    if self._forward_event_shape_fn is None:
+      # By default assume shape doesn't change.
+      return input_shape
+    return self._forward_event_shape_fn(input_shape)
+
+  def _forward_event_shape_tensor(self, input_shape):
+    if self._forward_event_shape_tensor_fn is None:
+      # By default assume shape doesn't change.
+      return input_shape
+    return self._forward_event_shape_tensor_fn(input_shape)
+
+  def _inverse_event_shape(self, output_shape):
+    if self._inverse_event_shape_fn is None:
+      # By default assume shape doesn't change.
+      return output_shape
+    return self._inverse_event_shape_fn(output_shape)
+
+  def _inverse_event_shape_tensor(self, output_shape):
+    if self._inverse_event_shape_tensor_fn is None:
+      # By default assume shape doesn't change.
+      return output_shape
+    return self._inverse_event_shape_tensor_fn(output_shape)
+
+  def _forward(self, x, **kwargs):
+    if not callable(self._forward_fn):
+      raise NotImplementedError(
+          "forward_fn is not a callable function.")
+    return self._forward_fn(x, **kwargs)
+
+  def _inverse(self, y, **kwargs):
+    if not callable(self._inverse_fn):
+      raise NotImplementedError(
+          "inverse_fn is not a callable function.")
+    return self._inverse_fn(y, **kwargs)
+
+  def _inverse_log_det_jacobian(self, y, **kwargs):
+    if not callable(self._inverse_log_det_jacobian_fn):
+      raise NotImplementedError(
+          "inverse_log_det_jacobian_fn is not a callable function.")
+    return self._inverse_log_det_jacobian_fn(y, **kwargs)
+
+  def _forward_log_det_jacobian(self, y, **kwargs):
+    if not callable(self._forward_log_det_jacobian_fn):
+      raise NotImplementedError(
+          "forward_log_det_jacobian_fn is not a callable function.")
+    return self._forward_log_det_jacobian_fn(y, **kwargs)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/inline_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/inline_impl.py
deleted file mode 100644
index fab1b22fbf92e7b92a5ec86ec62d66bec71a8c94..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/inline_impl.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Inline bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.ops.distributions import bijector
-
-
-__all__ = [
-    "Inline",
-]
-
-
-class Inline(bijector.Bijector):
-  """Bijector constructed from custom callables.
-
-  Example Use:
-
-  ```python
-  exp = Inline(
-    forward_fn=tf.exp,
-    inverse_fn=tf.log,
-    inverse_log_det_jacobian_fn=(
-      lambda y: -tf.reduce_sum(tf.log(y), axis=-1)),
-    name="exp")
-  ```
-
-  The above example is equivalent to the `Bijector` `Exp(event_ndims=1)`.
-  """
-
-  def __init__(self,
-               forward_fn=None,
-               inverse_fn=None,
-               inverse_log_det_jacobian_fn=None,
-               forward_log_det_jacobian_fn=None,
-               forward_event_shape_fn=None,
-               forward_event_shape_tensor_fn=None,
-               inverse_event_shape_fn=None,
-               inverse_event_shape_tensor_fn=None,
-               is_constant_jacobian=False,
-               validate_args=False,
-               name="inline"):
-    """Creates a `Bijector` from callables.
-
-    Args:
-      forward_fn: Python callable implementing the forward transformation.
-      inverse_fn: Python callable implementing the inverse transformation.
-      inverse_log_det_jacobian_fn: Python callable implementing the
-        log o det o jacobian of the inverse transformation.
-      forward_log_det_jacobian_fn: Python callable implementing the
-        log o det o jacobian of the forward transformation.
-      forward_event_shape_fn: Python callable implementing non-identical
-        static event shape changes. Default: shape is assumed unchanged.
-      forward_event_shape_tensor_fn: Python callable implementing non-identical
-        event shape changes. Default: shape is assumed unchanged.
-      inverse_event_shape_fn: Python callable implementing non-identical
-        static event shape changes. Default: shape is assumed unchanged.
-      inverse_event_shape_tensor_fn: Python callable implementing non-identical
-        event shape changes. Default: shape is assumed unchanged.
-      is_constant_jacobian: Python `bool` indicating that the Jacobian is
-        constant for all input arguments.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str`, name given to ops managed by this object.
-    """
-    super(Inline, self).__init__(
-        event_ndims=0,
-        is_constant_jacobian=is_constant_jacobian,
-        validate_args=validate_args,
-        name=name)
-    self._forward_fn = forward_fn
-    self._inverse_fn = inverse_fn
-    self._inverse_log_det_jacobian_fn = inverse_log_det_jacobian_fn
-    self._forward_log_det_jacobian_fn = forward_log_det_jacobian_fn
-    self._forward_event_shape_fn = forward_event_shape_fn
-    self._forward_event_shape_tensor_fn = forward_event_shape_tensor_fn
-    self._inverse_event_shape_fn = inverse_event_shape_fn
-    self._inverse_event_shape_tensor_fn = inverse_event_shape_tensor_fn
-
-  def _forward_event_shape(self, input_shape):
-    if self._forward_event_shape_fn is None:
-      # By default assume shape doesn't change.
-      return input_shape
-    return self._forward_event_shape_fn(input_shape)
-
-  def _forward_event_shape_tensor(self, input_shape):
-    if self._forward_event_shape_tensor_fn is None:
-      # By default assume shape doesn't change.
-      return input_shape
-    return self._forward_event_shape_tensor_fn(input_shape)
-
-  def _inverse_event_shape(self, output_shape):
-    if self._inverse_event_shape_fn is None:
-      # By default assume shape doesn't change.
-      return output_shape
-    return self._inverse_event_shape_fn(output_shape)
-
-  def _inverse_event_shape_tensor(self, output_shape):
-    if self._inverse_event_shape_tensor_fn is None:
-      # By default assume shape doesn't change.
-      return output_shape
-    return self._inverse_event_shape_tensor_fn(output_shape)
-
-  def _forward(self, x, **kwargs):
-    if not callable(self._forward_fn):
-      raise NotImplementedError(
-          "forward_fn is not a callable function.")
-    return self._forward_fn(x, **kwargs)
-
-  def _inverse(self, y, **kwargs):
-    if not callable(self._inverse_fn):
-      raise NotImplementedError(
-          "inverse_fn is not a callable function.")
-    return self._inverse_fn(y, **kwargs)
-
-  def _inverse_log_det_jacobian(self, y, **kwargs):
-    if not callable(self._inverse_log_det_jacobian_fn):
-      raise NotImplementedError(
-          "inverse_log_det_jacobian_fn is not a callable function.")
-    return self._inverse_log_det_jacobian_fn(y, **kwargs)
-
-  def _forward_log_det_jacobian(self, y, **kwargs):
-    if not callable(self._forward_log_det_jacobian_fn):
-      raise NotImplementedError(
-          "forward_log_det_jacobian_fn is not a callable function.")
-    return self._forward_log_det_jacobian_fn(y, **kwargs)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/invert.py b/tensorflow/contrib/distributions/python/ops/bijectors/invert.py
index c134e10109ce5065eb58de1d847e3c487258954c..2c603fe61f36dd27f4984fe6c13c11f2fb534321 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/invert.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/invert.py
@@ -18,12 +18,85 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.invert_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.ops.distributions import bijector as bijector_lib
 
-_allowed_symbols = ["Invert"]
+__all__ = [
+    "Invert",
+]
 
-remove_undocumented(__name__, _allowed_symbols)
+
+class Invert(bijector_lib.Bijector):
+  """Bijector which inverts another Bijector.
+
+  Example Use: [ExpGammaDistribution (see Background & Context)](
+  https://reference.wolfram.com/language/ref/ExpGammaDistribution.html)
+  models `Y=log(X)` where `X ~ Gamma`.
+
+  ```python
+  exp_gamma_distribution = TransformedDistribution(
+    distribution=Gamma(concentration=1., rate=2.),
+    bijector=bijector.Invert(bijector.Exp())
+  ```
+
+  """
+
+  def __init__(self, bijector, validate_args=False, name=None):
+    """Creates a `Bijector` which swaps the meaning of `inverse` and `forward`.
+
+    Note: An inverted bijector's `inverse_log_det_jacobian` is often more
+    efficient if the base bijector implements `_forward_log_det_jacobian`. If
+    `_forward_log_det_jacobian` is not implemented then the following code is
+    used:
+
+    ```python
+    y = self.inverse(x, **kwargs)
+    return -self.inverse_log_det_jacobian(y, **kwargs)
+    ```
+
+    Args:
+      bijector: Bijector instance.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str`, name given to ops managed by this object.
+    """
+
+    if not bijector._is_injective:  # pylint: disable=protected-access
+      raise NotImplementedError(
+          "Invert is not implemented for non-injective bijectors.")
+
+    self._bijector = bijector
+    super(Invert, self).__init__(
+        event_ndims=bijector.event_ndims,
+        graph_parents=bijector.graph_parents,
+        is_constant_jacobian=bijector.is_constant_jacobian,
+        validate_args=validate_args,
+        dtype=bijector.dtype,
+        name=name or "_".join(["invert", bijector.name]))
+
+  def _forward_event_shape(self, input_shape):
+    return self.bijector._inverse_event_shape(input_shape)  # pylint: disable=protected-access
+
+  def _forward_event_shape_tensor(self, input_shape):
+    return self.bijector._inverse_event_shape_tensor(input_shape)  # pylint: disable=protected-access
+
+  def _inverse_event_shape(self, output_shape):
+    return self.bijector._forward_event_shape(output_shape)  # pylint: disable=protected-access
+
+  def _inverse_event_shape_tensor(self, output_shape):
+    return self.bijector._forward_event_shape_tensor(output_shape)  # pylint: disable=protected-access
+
+  @property
+  def bijector(self):
+    return self._bijector
+
+  def _forward(self, x, **kwargs):
+    return self.bijector._inverse(x, **kwargs)  # pylint: disable=protected-access
+
+  def _inverse(self, y, **kwargs):
+    return self.bijector._forward(y, **kwargs)  # pylint: disable=protected-access
+
+  def _inverse_log_det_jacobian(self, y, **kwargs):
+    return self.bijector._forward_log_det_jacobian(y, **kwargs)  # pylint: disable=protected-access
+
+  def _forward_log_det_jacobian(self, x, **kwargs):
+    return self.bijector._inverse_log_det_jacobian(x, **kwargs)  # pylint: disable=protected-access
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/invert_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/invert_impl.py
deleted file mode 100644
index 2c603fe61f36dd27f4984fe6c13c11f2fb534321..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/invert_impl.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Invert bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.ops.distributions import bijector as bijector_lib
-
-__all__ = [
-    "Invert",
-]
-
-
-class Invert(bijector_lib.Bijector):
-  """Bijector which inverts another Bijector.
-
-  Example Use: [ExpGammaDistribution (see Background & Context)](
-  https://reference.wolfram.com/language/ref/ExpGammaDistribution.html)
-  models `Y=log(X)` where `X ~ Gamma`.
-
-  ```python
-  exp_gamma_distribution = TransformedDistribution(
-    distribution=Gamma(concentration=1., rate=2.),
-    bijector=bijector.Invert(bijector.Exp())
-  ```
-
-  """
-
-  def __init__(self, bijector, validate_args=False, name=None):
-    """Creates a `Bijector` which swaps the meaning of `inverse` and `forward`.
-
-    Note: An inverted bijector's `inverse_log_det_jacobian` is often more
-    efficient if the base bijector implements `_forward_log_det_jacobian`. If
-    `_forward_log_det_jacobian` is not implemented then the following code is
-    used:
-
-    ```python
-    y = self.inverse(x, **kwargs)
-    return -self.inverse_log_det_jacobian(y, **kwargs)
-    ```
-
-    Args:
-      bijector: Bijector instance.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str`, name given to ops managed by this object.
-    """
-
-    if not bijector._is_injective:  # pylint: disable=protected-access
-      raise NotImplementedError(
-          "Invert is not implemented for non-injective bijectors.")
-
-    self._bijector = bijector
-    super(Invert, self).__init__(
-        event_ndims=bijector.event_ndims,
-        graph_parents=bijector.graph_parents,
-        is_constant_jacobian=bijector.is_constant_jacobian,
-        validate_args=validate_args,
-        dtype=bijector.dtype,
-        name=name or "_".join(["invert", bijector.name]))
-
-  def _forward_event_shape(self, input_shape):
-    return self.bijector._inverse_event_shape(input_shape)  # pylint: disable=protected-access
-
-  def _forward_event_shape_tensor(self, input_shape):
-    return self.bijector._inverse_event_shape_tensor(input_shape)  # pylint: disable=protected-access
-
-  def _inverse_event_shape(self, output_shape):
-    return self.bijector._forward_event_shape(output_shape)  # pylint: disable=protected-access
-
-  def _inverse_event_shape_tensor(self, output_shape):
-    return self.bijector._forward_event_shape_tensor(output_shape)  # pylint: disable=protected-access
-
-  @property
-  def bijector(self):
-    return self._bijector
-
-  def _forward(self, x, **kwargs):
-    return self.bijector._inverse(x, **kwargs)  # pylint: disable=protected-access
-
-  def _inverse(self, y, **kwargs):
-    return self.bijector._forward(y, **kwargs)  # pylint: disable=protected-access
-
-  def _inverse_log_det_jacobian(self, y, **kwargs):
-    return self.bijector._forward_log_det_jacobian(y, **kwargs)  # pylint: disable=protected-access
-
-  def _forward_log_det_jacobian(self, x, **kwargs):
-    return self.bijector._inverse_log_det_jacobian(x, **kwargs)  # pylint: disable=protected-access
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
index 132dc570f94719b6c71fb269866c943774481b7e..06c7c61ec3dc3980e0d12a984739dca5a925ac9f 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
@@ -18,16 +18,459 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+import numpy as np
 
-_allowed_symbols = [
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.layers import core as layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import template as template_ops
+from tensorflow.python.ops import variable_scope as variable_scope_lib
+from tensorflow.python.ops.distributions import bijector as bijector_lib
+
+
+__all__ = [
     "MaskedAutoregressiveFlow",
-    "masked_dense",
     "masked_autoregressive_default_template",
+    "masked_dense",
 ]
 
-remove_undocumented(__name__, _allowed_symbols)
+
+class MaskedAutoregressiveFlow(bijector_lib.Bijector):
+  """Affine MaskedAutoregressiveFlow bijector for vector-valued events.
+
+  The affine autoregressive flow [1] provides a relatively simple framework for
+  user-specified (deep) architectures to learn a distribution over vector-valued
+  events. Regarding terminology,
+
+    "Autoregressive models decompose the joint density as a product of
+    conditionals, and model each conditional in turn. Normalizing flows
+    transform a base density (e.g. a standard Gaussian) into the target density
+    by an invertible transformation with tractable Jacobian." [1]
+
+  In other words, the "autoregressive property" is equivalent to the
+  decomposition, `p(x) = prod{ p(x[i] | x[0:i]) : i=0, ..., d }`. The provided
+  `shift_and_log_scale_fn`, `masked_autoregressive_default_template`, achieves
+  this property by zeroing out weights in its `masked_dense` layers.
+
+  In the `tf.distributions` framework, a "normalizing flow" is implemented as a
+  `tf.distributions.bijectors.Bijector`. The `forward` "autoregression"
+  is implemented using a `tf.while_loop` and a deep neural network (DNN) with
+  masked weights such that the autoregressive property is automatically met in
+  the `inverse`.
+
+  A `TransformedDistribution` using `MaskedAutoregressiveFlow(...)` uses the
+  (expensive) forward-mode calculation to draw samples and the (cheap)
+  reverse-mode calculation to compute log-probabilities. Conversely, a
+  `TransformedDistribution` using `Invert(MaskedAutoregressiveFlow(...))` uses
+  the (expensive) forward-mode calculation to compute log-probabilities and the
+  (cheap) reverse-mode calculation to compute samples.  See "Example Use"
+  [below] for more details.
+
+  Given a `shift_and_log_scale_fn`, the forward and inverse transformations are
+  (a sequence of) affine transformations. A "valid" `shift_and_log_scale_fn`
+  must compute each `shift` (aka `loc` or "mu" [2]) and `log(scale)` (aka
+  "alpha" [2]) such that each are broadcastable with the arguments to `forward`
+  and `inverse`, i.e., such that the calculations in `forward`, `inverse`
+  [below] are possible.
+
+  For convenience, `masked_autoregressive_default_template` is offered as a
+  possible `shift_and_log_scale_fn` function. It implements the MADE
+  architecture [2]. MADE is a feed-forward network that computes a `shift` and
+  `log(scale)` using `masked_dense` layers in a deep neural network. Weights are
+  masked to ensure the autoregressive property. It is possible that this
+  architecture is suboptimal for your task. To build alternative networks,
+  either change the arguments to `masked_autoregressive_default_template`, use
+  the `masked_dense` function to roll-out your own, or use some other
+  architecture, e.g., using `tf.layers`.
+
+  Warning: no attempt is made to validate that the `shift_and_log_scale_fn`
+  enforces the "autoregressive property".
+
+  Assuming `shift_and_log_scale_fn` has valid shape and autoregressive
+  semantics, the forward transformation is,
+
+  ```python
+  def forward(x):
+    y = zeros_like(x)
+    event_size = x.shape[-1]
+    for _ in range(event_size):
+      shift, log_scale = shift_and_log_scale_fn(y)
+      y = x * math_ops.exp(log_scale) + shift
+    return y
+  ```
+
+  and the inverse transformation is,
+
+  ```python
+  def inverse(y):
+    shift, log_scale = shift_and_log_scale_fn(y)
+    return (y - shift) / math_ops.exp(log_scale)
+  ```
+
+  Notice that the `inverse` does not need a for-loop. This is because in the
+  forward pass each calculation of `shift` and `log_scale` is based on the `y`
+  calculated so far (not `x`). In the `inverse`, the `y` is fully known, thus is
+  equivalent to the scaling used in `forward` after `event_size` passes, i.e.,
+  the "last" `y` used to compute `shift`, `log_scale`. (Roughly speaking, this
+  also proves the transform is bijective.)
+
+  #### Example Use
+
+  ```python
+  tfd = tf.contrib.distributions
+  tfb = tfd.bijectors
+
+  dims = 5
+
+  # A common choice for a normalizing flow is to use a Gaussian for the base
+  # distribution. (However, any continuous distribution would work.) E.g.,
+  maf = tfd.TransformedDistribution(
+      distribution=tfd.Normal(loc=0., scale=1.),
+      bijector=tfb.MaskedAutoregressiveFlow(
+          shift_and_log_scale_fn=tfb.masked_autoregressive_default_template(
+              hidden_layers=[512, 512])),
+      event_shape=[dims])
+
+  x = maf.sample()  # Expensive; uses `tf.while_loop`, no Bijector caching.
+  maf.log_prob(x)   # Almost free; uses Bijector caching.
+  maf.log_prob(0.)  # Cheap; no `tf.while_loop` despite no Bijector caching.
+
+  # [1] also describes an "Inverse Autoregressive Flow", e.g.,
+  iaf = tfd.TransformedDistribution(
+      distribution=tfd.Normal(loc=0., scale=1.),
+      bijector=tfb.Invert(tfb.MaskedAutoregressiveFlow(
+          shift_and_log_scale_fn=tfb.masked_autoregressive_default_template(
+              hidden_layers=[512, 512]))),
+      event_shape=[dims])
+
+  x = iaf.sample()  # Cheap; no `tf.while_loop` despite no Bijector caching.
+  iaf.log_prob(x)   # Almost free; uses Bijector caching.
+  iaf.log_prob(0.)  # Expensive; uses `tf.while_loop`, no Bijector caching.
+
+  # In many (if not most) cases the default `shift_and_log_scale_fn` will be a
+  # poor choice. Here's an example of using a "shift only" version and with a
+  # different number/depth of hidden layers.
+  shift_only = True
+  maf_no_scale_hidden2 = tfd.TransformedDistribution(
+      distribution=tfd.Normal(loc=0., scale=1.),
+      bijector=tfb.MaskedAutoregressiveFlow(
+          tfb.masked_autoregressive_default_template(
+              hidden_layers=[32],
+              shift_only=shift_only),
+          is_constant_jacobian=shift_only),
+      event_shape=[dims])
+  ```
+
+  [1]: "Masked Autoregressive Flow for Density Estimation."
+       George Papamakarios, Theo Pavlakou, Iain Murray. Arxiv. 2017.
+       https://arxiv.org/abs/1705.07057
+
+  [2]: "MADE: Masked Autoencoder for Distribution Estimation."
+       Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
+       https://arxiv.org/abs/1502.03509
+
+  """
+
+  def __init__(self,
+               shift_and_log_scale_fn,
+               is_constant_jacobian=False,
+               validate_args=False,
+               name=None):
+    """Creates the MaskedAutoregressiveFlow bijector.
+
+    Args:
+      shift_and_log_scale_fn: Python `callable` which computes `shift` and
+        `log_scale` from both the forward domain (`x`) and the inverse domain
+        (`y`). Calculation must respect the "autoregressive property" (see class
+        docstring). Suggested default
+        `masked_autoregressive_default_template(hidden_layers=...)`.
+        Typically the function contains `tf.Variables` and is wrapped using
+        `tf.make_template`. Returning `None` for either (both) `shift`,
+        `log_scale` is equivalent to (but more efficient than) returning zero.
+      is_constant_jacobian: Python `bool`. Default: `False`. When `True` the
+        implementation assumes `log_scale` does not depend on the forward domain
+        (`x`) or inverse domain (`y`) values. (No validation is made;
+        `is_constant_jacobian=False` is always safe but possibly computationally
+        inefficient.)
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str`, name given to ops managed by this object.
+    """
+    name = name or "masked_autoregressive_flow"
+    self._shift_and_log_scale_fn = shift_and_log_scale_fn
+    super(MaskedAutoregressiveFlow, self).__init__(
+        is_constant_jacobian=is_constant_jacobian,
+        validate_args=validate_args,
+        name=name)
+
+  def _forward(self, x):
+    event_size = array_ops.shape(x)[-1]
+    y0 = array_ops.zeros_like(x, name="y0")
+    # call the template once to ensure creation
+    _ = self._shift_and_log_scale_fn(y0)
+    def _loop_body(index, y0):
+      """While-loop body for autoregression calculation."""
+      # Set caching device to avoid re-getting the tf.Variable for every while
+      # loop iteration.
+      with variable_scope_lib.variable_scope(
+          variable_scope_lib.get_variable_scope()) as vs:
+        if vs.caching_device is None:
+          vs.set_caching_device(lambda op: op.device)
+        shift, log_scale = self._shift_and_log_scale_fn(y0)
+      y = x
+      if log_scale is not None:
+        y *= math_ops.exp(log_scale)
+      if shift is not None:
+        y += shift
+      return index + 1, y
+    _, y = control_flow_ops.while_loop(
+        cond=lambda index, _: index < event_size,
+        body=_loop_body,
+        loop_vars=[0, y0])
+    return y
+
+  def _inverse(self, y):
+    shift, log_scale = self._shift_and_log_scale_fn(y)
+    x = y
+    if shift is not None:
+      x -= shift
+    if log_scale is not None:
+      x *= math_ops.exp(-log_scale)
+    return x
+
+  def _inverse_log_det_jacobian(self, y):
+    _, log_scale = self._shift_and_log_scale_fn(y)
+    if log_scale is None:
+      return constant_op.constant(0., dtype=y.dtype, name="ildj")
+    return -math_ops.reduce_sum(log_scale, axis=-1)
+
+
+MASK_INCLUSIVE = "inclusive"
+MASK_EXCLUSIVE = "exclusive"
+
+
+def _gen_slices(num_blocks, n_in, n_out, mask_type=MASK_EXCLUSIVE):
+  """Generate the slices for building an autoregressive mask."""
+  # TODO(b/67594795): Better support of dynamic shape.
+  slices = []
+  col = 0
+  d_in = n_in // num_blocks
+  d_out = n_out // num_blocks
+  row = d_out if mask_type == MASK_EXCLUSIVE else 0
+  for _ in range(num_blocks):
+    row_slice = slice(row, None)
+    col_slice = slice(col, col + d_in)
+    slices.append([row_slice, col_slice])
+    col += d_in
+    row += d_out
+  return slices
+
+
+def _gen_mask(num_blocks,
+              n_in,
+              n_out,
+              mask_type=MASK_EXCLUSIVE,
+              dtype=dtypes.float32):
+  """Generate the mask for building an autoregressive dense layer."""
+  # TODO(b/67594795): Better support of dynamic shape.
+  mask = np.zeros([n_out, n_in], dtype=dtype.as_numpy_dtype())
+  slices = _gen_slices(num_blocks, n_in, n_out, mask_type=mask_type)
+  for [row_slice, col_slice] in slices:
+    mask[row_slice, col_slice] = 1
+  return mask
+
+
+def masked_dense(inputs,
+                 units,
+                 num_blocks=None,
+                 exclusive=False,
+                 kernel_initializer=None,
+                 reuse=None,
+                 name=None,
+                 *args,
+                 **kwargs):
+  """A autoregressively masked dense layer. Analogous to `tf.layers.dense`.
+
+  See [1] for detailed explanation.
+
+  [1]: "MADE: Masked Autoencoder for Distribution Estimation."
+       Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
+       https://arxiv.org/abs/1502.03509
+
+  Arguments:
+    inputs: Tensor input.
+    units: Python `int` scalar representing the dimensionality of the output
+      space.
+    num_blocks: Python `int` scalar representing the number of blocks for the
+      MADE masks.
+    exclusive: Python `bool` scalar representing whether to zero the diagonal of
+      the mask, used for the first layer of a MADE.
+    kernel_initializer: Initializer function for the weight matrix.
+      If `None` (default), weights are initialized using the
+      `tf.glorot_random_initializer`.
+    reuse: Python `bool` scalar representing whether to reuse the weights of a
+      previous layer by the same name.
+    name: Python `str` used to describe ops managed by this function.
+    *args: `tf.layers.dense` arguments.
+    **kwargs: `tf.layers.dense` keyword arguments.
+
+  Returns:
+    Output tensor.
+
+  Raises:
+    NotImplementedError: if rightmost dimension of `inputs` is unknown prior to
+      graph execution.
+  """
+  # TODO(b/67594795): Better support of dynamic shape.
+  input_depth = inputs.shape.with_rank_at_least(1)[-1].value
+  if input_depth is None:
+    raise NotImplementedError(
+        "Rightmost dimension must be known prior to graph execution.")
+
+  mask = _gen_mask(num_blocks, input_depth, units,
+                   MASK_EXCLUSIVE if exclusive else MASK_INCLUSIVE).T
+
+  if kernel_initializer is None:
+    kernel_initializer = init_ops.glorot_normal_initializer()
+
+  def masked_initializer(shape, dtype=None, partition_info=None):
+    return mask * kernel_initializer(shape, dtype, partition_info)
+
+  with ops.name_scope(name, "masked_dense", [inputs, units, num_blocks]):
+    layer = layers.Dense(
+        units,
+        kernel_initializer=masked_initializer,
+        kernel_constraint=lambda x: mask * x,
+        name=name,
+        dtype=inputs.dtype.base_dtype,
+        _scope=name,
+        _reuse=reuse,
+        *args,
+        **kwargs)
+    return layer.apply(inputs)
+
+
+def masked_autoregressive_default_template(
+    hidden_layers,
+    shift_only=False,
+    activation=nn_ops.relu,
+    log_scale_min_clip=-5.,
+    log_scale_max_clip=3.,
+    log_scale_clip_gradient=False,
+    name=None,
+    *args,
+    **kwargs):
+  """Build the MADE Model [1].
+
+  This will be wrapped in a make_template to ensure the variables are only
+  created once. It takes the input and returns the `loc` ("mu" [1]) and
+  `log_scale` ("alpha" [1]) from the MADE network.
+
+  Warning: This function uses `masked_dense` to create randomly initialized
+  `tf.Variables`. It is presumed that these will be fit, just as you would any
+  other neural architecture which uses `tf.layers.dense`.
+
+  #### About Hidden Layers:
+
+  Each element of `hidden_layers` should be greater than the `input_depth`
+  (i.e., `input_depth = tf.shape(input)[-1]` where `input` is the input to the
+  neural network). This is necessary to ensure the autoregressivity property.
+
+  #### About Clipping:
+
+  This function also optionally clips the `log_scale` (but possibly not its
+  gradient). This is useful because if `log_scale` is too small/large it might
+  underflow/overflow making it impossible for the `MaskedAutoregressiveFlow`
+  bijector to implement a bijection. Additionally, the `log_scale_clip_gradient`
+  `bool` indicates whether the gradient should also be clipped. The default does
+  not clip the gradient; this is useful because it still provides gradient
+  information (for fitting) yet solves the numerical stability problem. I.e.,
+  `log_scale_clip_gradient = False` means
+  `grad[exp(clip(x))] = grad[x] exp(clip(x))` rather than the usual
+  `grad[clip(x)] exp(clip(x))`.
+
+  [1]: "MADE: Masked Autoencoder for Distribution Estimation."
+       Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
+       https://arxiv.org/abs/1502.03509
+
+  Arguments:
+    hidden_layers: Python `list`-like of non-negative integer, scalars
+      indicating the number of units in each hidden layer. Default: `[512, 512].
+    shift_only: Python `bool` indicating if only the `shift` term shall be
+      computed. Default: `False`.
+    activation: Activation function (callable). Explicitly setting to `None`
+      implies a linear activation.
+    log_scale_min_clip: `float`-like scalar `Tensor`, or a `Tensor` with the
+      same shape as `log_scale`. The minimum value to clip by. Default: -5.
+    log_scale_max_clip: `float`-like scalar `Tensor`, or a `Tensor` with the
+      same shape as `log_scale`. The maximum value to clip by. Default: 3.
+    log_scale_clip_gradient: Python `bool` indicating that the gradient of
+      `tf.clip_by_value` should be preserved. Default: `False`.
+    name: A name for ops managed by this function. Default:
+      "masked_autoregressive_default_template".
+    *args: `tf.layers.dense` arguments.
+    **kwargs: `tf.layers.dense` keyword arguments.
+
+  Returns:
+    shift: `Float`-like `Tensor` of shift terms (the "mu" in [2]).
+    log_scale: `Float`-like `Tensor` of log(scale) terms (the "alpha" in [2]).
+
+  Raises:
+    NotImplementedError: if rightmost dimension of `inputs` is unknown prior to
+      graph execution.
+  """
+
+  with ops.name_scope(name, "masked_autoregressive_default_template",
+                      values=[log_scale_min_clip, log_scale_max_clip]):
+    def _fn(x):
+      """MADE parameterized via `masked_autoregressive_default_template`."""
+      # TODO(b/67594795): Better support of dynamic shape.
+      input_depth = x.shape.with_rank_at_least(1)[-1].value
+      if input_depth is None:
+        raise NotImplementedError(
+            "Rightmost dimension must be known prior to graph execution.")
+      input_shape = (np.int32(x.shape.as_list()) if x.shape.is_fully_defined()
+                     else array_ops.shape(x))
+      for i, units in enumerate(hidden_layers):
+        x = masked_dense(
+            inputs=x,
+            units=units,
+            num_blocks=input_depth,
+            exclusive=True if i == 0 else False,
+            activation=activation,
+            *args,
+            **kwargs)
+      x = masked_dense(
+          inputs=x,
+          units=(1 if shift_only else 2) * input_depth,
+          num_blocks=input_depth,
+          activation=None,
+          *args,
+          **kwargs)
+      if shift_only:
+        x = array_ops.reshape(x, shape=input_shape)
+        return x, None
+      x = array_ops.reshape(
+          x, shape=array_ops.concat([input_shape, [2]], axis=0))
+      shift, log_scale = array_ops.unstack(x, num=2, axis=-1)
+      which_clip = (math_ops.clip_by_value if log_scale_clip_gradient
+                    else _clip_by_value_preserve_grad)
+      log_scale = which_clip(log_scale, log_scale_min_clip, log_scale_max_clip)
+      return shift, log_scale
+    return template_ops.make_template(
+        "masked_autoregressive_default_template", _fn)
+
+
+def _clip_by_value_preserve_grad(x, clip_value_min, clip_value_max, name=None):
+  """Clips input while leaving gradient unaltered."""
+  with ops.name_scope(name, "clip_by_value_preserve_grad",
+                      [x, clip_value_min, clip_value_max]):
+    clip_x = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
+    return x + array_ops.stop_gradient(clip_x - x)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive_impl.py
deleted file mode 100644
index ae142883931274b594dbbafbe86bd71e75c621bc..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive_impl.py
+++ /dev/null
@@ -1,473 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""MaskedAutoregressiveFlow bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.layers import core as layers
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import clip_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import template as template_ops
-from tensorflow.python.ops import variable_scope as variable_scope_lib
-from tensorflow.python.ops.distributions import bijector as bijector_lib
-
-
-__all__ = [
-    "MaskedAutoregressiveFlow",
-    "masked_autoregressive_default_template",
-    "masked_dense",
-]
-
-
-class MaskedAutoregressiveFlow(bijector_lib.Bijector):
-  """Affine MaskedAutoregressiveFlow bijector for vector-valued events.
-
-  The affine autoregressive flow [1] provides a relatively simple framework for
-  user-specified (deep) architectures to learn a distribution over vector-valued
-  events. Regarding terminology,
-
-    "Autoregressive models decompose the joint density as a product of
-    conditionals, and model each conditional in turn. Normalizing flows
-    transform a base density (e.g. a standard Gaussian) into the target density
-    by an invertible transformation with tractable Jacobian." [1]
-
-  In other words, the "autoregressive property" is equivalent to the
-  decomposition, `p(x) = prod{ p(x[i] | x[0:i]) : i=0, ..., d }`. The provided
-  `shift_and_log_scale_fn`, `masked_autoregressive_default_template`, achieves
-  this property by zeroing out weights in its `masked_dense` layers.
-
-  In the `tf.distributions` framework, a "normalizing flow" is implemented as a
-  `tf.distributions.bijectors.Bijector`. The `forward` "autoregression"
-  is implemented using a `tf.while_loop` and a deep neural network (DNN) with
-  masked weights such that the autoregressive property is automatically met in
-  the `inverse`.
-
-  A `TransformedDistribution` using `MaskedAutoregressiveFlow(...)` uses the
-  (expensive) forward-mode calculation to draw samples and the (cheap)
-  reverse-mode calculation to compute log-probabilities. Conversely, a
-  `TransformedDistribution` using `Invert(MaskedAutoregressiveFlow(...))` uses
-  the (expensive) forward-mode calculation to compute log-probabilities and the
-  (cheap) reverse-mode calculation to compute samples.  See "Example Use"
-  [below] for more details.
-
-  Given a `shift_and_log_scale_fn`, the forward and inverse transformations are
-  (a sequence of) affine transformations. A "valid" `shift_and_log_scale_fn`
-  must compute each `shift` (aka `loc` or "mu" [2]) and `log(scale)` (aka
-  "alpha" [2]) such that each are broadcastable with the arguments to `forward`
-  and `inverse`, i.e., such that the calculations in `forward`, `inverse`
-  [below] are possible.
-
-  For convenience, `masked_autoregressive_default_template` is offered as a
-  possible `shift_and_log_scale_fn` function. It implements the MADE
-  architecture [2]. MADE is a feed-forward network that computes a `shift` and
-  `log(scale)` using `masked_dense` layers in a deep neural network. Weights are
-  masked to ensure the autoregressive property. It is possible that this
-  architecture is suboptimal for your task. To build alternative networks,
-  either change the arguments to `masked_autoregressive_default_template`, use
-  the `masked_dense` function to roll-out your own, or use some other
-  architecture, e.g., using `tf.layers`.
-
-  Warning: no attempt is made to validate that the `shift_and_log_scale_fn`
-  enforces the "autoregressive property".
-
-  Assuming `shift_and_log_scale_fn` has valid shape and autoregressive
-  semantics, the forward transformation is,
-
-  ```python
-  def forward(x):
-    y = zeros_like(x)
-    event_size = x.shape[-1]
-    for _ in range(event_size):
-      shift, log_scale = shift_and_log_scale_fn(y)
-      y = x * math_ops.exp(log_scale) + shift
-    return y
-  ```
-
-  and the inverse transformation is,
-
-  ```python
-  def inverse(y):
-    shift, log_scale = shift_and_log_scale_fn(y)
-    return (y - shift) / math_ops.exp(log_scale)
-  ```
-
-  Notice that the `inverse` does not need a for-loop. This is because in the
-  forward pass each calculation of `shift` and `log_scale` is based on the `y`
-  calculated so far (not `x`). In the `inverse`, the `y` is fully known, thus is
-  equivalent to the scaling used in `forward` after `event_size` passes, i.e.,
-  the "last" `y` used to compute `shift`, `log_scale`. (Roughly speaking, this
-  also proves the transform is bijective.)
-
-  #### Example Use
-
-  ```python
-  ds = tf.contrib.distributions
-  bs = tf.contrib.distributions.bijectors
-
-  dims = 5
-
-  # A common choice for a normalizing flow is to use a Gaussian for the base
-  # distribution. (However, any continuous distribution would work.) E.g.,
-  maf = ds.TransformedDistribution(
-      distribution=ds.Normal(loc=0., scale=1.),
-      bijector=bs.MaskedAutoregressiveFlow(
-          shift_and_log_scale_fn=bs.masked_autoregressive_default_template(
-              hidden_layers=[512, 512])),
-      event_shape=[dims])
-
-  x = maf.sample()  # Expensive; uses `tf.while_loop`, no Bijector caching.
-  maf.log_prob(x)   # Almost free; uses Bijector caching.
-  maf.log_prob(0.)  # Cheap; no `tf.while_loop` despite no Bijector caching.
-
-  # [1] also describes an "Inverse Autoregressive Flow", e.g.,
-  iaf = ds.TransformedDistribution(
-      distribution=ds.Normal(loc=0., scale=1.),
-      bijector=bs.Invert(bs.MaskedAutoregressiveFlow(
-          shift_and_log_scale_fn=bs.masked_autoregressive_default_template(
-              hidden_layers=[512, 512]))),
-      event_shape=[dims])
-
-  x = iaf.sample()  # Cheap; no `tf.while_loop` despite no Bijector caching.
-  iaf.log_prob(x)   # Almost free; uses Bijector caching.
-  iaf.log_prob(0.)  # Expensive; uses `tf.while_loop`, no Bijector caching.
-
-  # In many (if not most) cases the default `shift_and_log_scale_fn` will be a
-  # poor choice. Here's an example of using a "shift only" version and with a
-  # different number/depth of hidden layers.
-  shift_only = True
-  maf_no_scale_hidden2 = ds.TransformedDistribution(
-      distribution=ds.Normal(loc=0., scale=1.),
-      bijector=bs.MaskedAutoregressiveFlow(
-          bs.masked_autoregressive_default_template(
-              hidden_layers=[32],
-              shift_only=shift_only),
-          is_constant_jacobian=shift_only),
-      event_shape=[dims])
-  ```
-
-  [1]: "Masked Autoregressive Flow for Density Estimation."
-       George Papamakarios, Theo Pavlakou, Iain Murray. Arxiv. 2017.
-       https://arxiv.org/abs/1705.07057
-
-  [2]: "MADE: Masked Autoencoder for Distribution Estimation."
-       Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
-       https://arxiv.org/abs/1502.03509
-
-  """
-
-  def __init__(self,
-               shift_and_log_scale_fn,
-               is_constant_jacobian=False,
-               validate_args=False,
-               name=None):
-    """Creates the MaskedAutoregressiveFlow bijector.
-
-    Args:
-      shift_and_log_scale_fn: Python `callable` which computes `shift` and
-        `log_scale` from both the forward domain (`x`) and the inverse domain
-        (`y`). Calculation must respect the "autoregressive property" (see class
-        docstring). Suggested default
-        `masked_autoregressive_default_template(hidden_layers=...)`.
-        Typically the function contains `tf.Variables` and is wrapped using
-        `tf.make_template`. Returning `None` for either (both) `shift`,
-        `log_scale` is equivalent to (but more efficient than) returning zero.
-      is_constant_jacobian: Python `bool`. Default: `False`. When `True` the
-        implementation assumes `log_scale` does not depend on the forward domain
-        (`x`) or inverse domain (`y`) values. (No validation is made;
-        `is_constant_jacobian=False` is always safe but possibly computationally
-        inefficient.)
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str`, name given to ops managed by this object.
-    """
-    name = name or "masked_autoregressive_flow"
-    self._shift_and_log_scale_fn = shift_and_log_scale_fn
-    super(MaskedAutoregressiveFlow, self).__init__(
-        is_constant_jacobian=is_constant_jacobian,
-        validate_args=validate_args,
-        name=name)
-
-  def _forward(self, x):
-    event_size = array_ops.shape(x)[-1]
-    def _loop_body(index, y0):
-      """While-loop body for autoregression calculation."""
-      # Set caching device to avoid re-getting the tf.Variable for every while
-      # loop iteration.
-      with variable_scope_lib.variable_scope(
-          variable_scope_lib.get_variable_scope()) as vs:
-        if vs.caching_device is None:
-          vs.set_caching_device(lambda op: op.device)
-        shift, log_scale = self._shift_and_log_scale_fn(y0)
-      y = x
-      if log_scale is not None:
-        y *= math_ops.exp(log_scale)
-      if shift is not None:
-        y += shift
-      return index + 1, y
-    _, y = control_flow_ops.while_loop(
-        cond=lambda index, _: index < event_size,
-        body=_loop_body,
-        loop_vars=[0, array_ops.zeros_like(x, name="y0")])
-    return y
-
-  def _inverse(self, y):
-    shift, log_scale = self._shift_and_log_scale_fn(y)
-    x = y
-    if shift is not None:
-      x -= shift
-    if log_scale is not None:
-      x *= math_ops.exp(-log_scale)
-    return x
-
-  def _inverse_log_det_jacobian(self, y):
-    _, log_scale = self._shift_and_log_scale_fn(y)
-    if log_scale is None:
-      return constant_op.constant(0., dtype=y.dtype, name="ildj")
-    return -math_ops.reduce_sum(log_scale, axis=-1)
-
-
-MASK_INCLUSIVE = "inclusive"
-MASK_EXCLUSIVE = "exclusive"
-
-
-def _gen_slices(num_blocks, n_in, n_out, mask_type=MASK_EXCLUSIVE):
-  """Generate the slices for building an autoregressive mask."""
-  # TODO(b/67594795): Better support of dynamic shape.
-  slices = []
-  col = 0
-  d_in = n_in // num_blocks
-  d_out = n_out // num_blocks
-  row = d_out if mask_type == MASK_EXCLUSIVE else 0
-  for _ in range(num_blocks):
-    row_slice = slice(row, None)
-    col_slice = slice(col, col + d_in)
-    slices.append([row_slice, col_slice])
-    col += d_in
-    row += d_out
-  return slices
-
-
-def _gen_mask(num_blocks,
-              n_in,
-              n_out,
-              mask_type=MASK_EXCLUSIVE,
-              dtype=dtypes.float32):
-  """Generate the mask for building an autoregressive dense layer."""
-  # TODO(b/67594795): Better support of dynamic shape.
-  mask = np.zeros([n_out, n_in], dtype=dtype.as_numpy_dtype())
-  slices = _gen_slices(num_blocks, n_in, n_out, mask_type=mask_type)
-  for [row_slice, col_slice] in slices:
-    mask[row_slice, col_slice] = 1
-  return mask
-
-
-def masked_dense(inputs,
-                 units,
-                 num_blocks=None,
-                 exclusive=False,
-                 kernel_initializer=None,
-                 reuse=None,
-                 name=None,
-                 *args,
-                 **kwargs):
-  """A autoregressively masked dense layer. Analogous to `tf.layers.dense`.
-
-  See [1] for detailed explanation.
-
-  [1]: "MADE: Masked Autoencoder for Distribution Estimation."
-       Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
-       https://arxiv.org/abs/1502.03509
-
-  Arguments:
-    inputs: Tensor input.
-    units: Python `int` scalar representing the dimensionality of the output
-      space.
-    num_blocks: Python `int` scalar representing the number of blocks for the
-      MADE masks.
-    exclusive: Python `bool` scalar representing whether to zero the diagonal of
-      the mask, used for the first layer of a MADE.
-    kernel_initializer: Initializer function for the weight matrix.
-      If `None` (default), weights are initialized using the
-      `tf.glorot_random_initializer`.
-    reuse: Python `bool` scalar representing whether to reuse the weights of a
-      previous layer by the same name.
-    name: Python `str` used to describe ops managed by this function.
-    *args: `tf.layers.dense` arguments.
-    **kwargs: `tf.layers.dense` keyword arguments.
-
-  Returns:
-    Output tensor.
-
-  Raises:
-    NotImplementedError: if rightmost dimension of `inputs` is unknown prior to
-      graph execution.
-  """
-  # TODO(b/67594795): Better support of dynamic shape.
-  input_depth = inputs.shape.with_rank_at_least(1)[-1].value
-  if input_depth is None:
-    raise NotImplementedError(
-        "Rightmost dimension must be known prior to graph execution.")
-
-  mask = _gen_mask(num_blocks, input_depth, units,
-                   MASK_EXCLUSIVE if exclusive else MASK_INCLUSIVE).T
-
-  if kernel_initializer is None:
-    kernel_initializer = init_ops.glorot_normal_initializer()
-
-  def masked_initializer(shape, dtype=None, partition_info=None):
-    return mask * kernel_initializer(shape, dtype, partition_info)
-
-  with ops.name_scope(name, "masked_dense", [inputs, units, num_blocks]):
-    layer = layers.Dense(
-        units,
-        kernel_initializer=masked_initializer,
-        kernel_constraint=lambda x: mask * x,
-        name=name,
-        dtype=inputs.dtype.base_dtype,
-        _scope=name,
-        _reuse=reuse,
-        *args,
-        **kwargs)
-    return layer.apply(inputs)
-
-
-def masked_autoregressive_default_template(
-    hidden_layers,
-    shift_only=False,
-    activation=nn_ops.relu,
-    log_scale_min_clip=-5.,
-    log_scale_max_clip=3.,
-    log_scale_clip_gradient=False,
-    name=None,
-    *args,
-    **kwargs):
-  """Build the MADE Model [1].
-
-  This will be wrapped in a make_template to ensure the variables are only
-  created once. It takes the input and returns the `loc` ("mu" [1]) and
-  `log_scale` ("alpha" [1]) from the MADE network.
-
-  Warning: This function uses `masked_dense` to create randomly initialized
-  `tf.Variables`. It is presumed that these will be fit, just as you would any
-  other neural architecture which uses `tf.layers.dense`.
-
-  #### About Hidden Layers:
-
-  Each element of `hidden_layers` should be greater than the `input_depth`
-  (i.e., `input_depth = tf.shape(input)[-1]` where `input` is the input to the
-  neural network). This is necessary to ensure the autoregressivity property.
-
-  #### About Clipping:
-
-  This function also optionally clips the `log_scale` (but possibly not its
-  gradient). This is useful because if `log_scale` is too small/large it might
-  underflow/overflow making it impossible for the `MaskedAutoregressiveFlow`
-  bijector to implement a bijection. Additionally, the `log_scale_clip_gradient`
-  `bool` indicates whether the gradient should also be clipped. The default does
-  not clip the gradient; this is useful because it still provides gradient
-  information (for fitting) yet solves the numerical stability problem. I.e.,
-  `log_scale_clip_gradient = False` means
-  `grad[exp(clip(x))] = grad[x] exp(clip(x))` rather than the usual
-  `grad[clip(x)] exp(clip(x))`.
-
-  [1]: "MADE: Masked Autoencoder for Distribution Estimation."
-       Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
-       https://arxiv.org/abs/1502.03509
-
-  Arguments:
-    hidden_layers: Python `list`-like of non-negative integer, scalars
-      indicating the number of units in each hidden layer. Default: `[512, 512].
-    shift_only: Python `bool` indicating if only the `shift` term shall be
-      computed. Default: `False`.
-    activation: Activation function (callable). Explicitly setting to `None`
-      implies a linear activation.
-    log_scale_min_clip: `float`-like scalar `Tensor`, or a `Tensor` with the
-      same shape as `log_scale`. The minimum value to clip by. Default: -5.
-    log_scale_max_clip: `float`-like scalar `Tensor`, or a `Tensor` with the
-      same shape as `log_scale`. The maximum value to clip by. Default: 3.
-    log_scale_clip_gradient: Python `bool` indicating that the gradient of
-      `tf.clip_by_value` should be preserved. Default: `False`.
-    name: A name for ops managed by this function. Default:
-      "masked_autoregressive_default_template".
-    *args: `tf.layers.dense` arguments.
-    **kwargs: `tf.layers.dense` keyword arguments.
-
-  Returns:
-    shift: `Float`-like `Tensor` of shift terms (the "mu" in [2]).
-    log_scale: `Float`-like `Tensor` of log(scale) terms (the "alpha" in [2]).
-
-  Raises:
-    NotImplementedError: if rightmost dimension of `inputs` is unknown prior to
-      graph execution.
-  """
-
-  with ops.name_scope(name, "masked_autoregressive_default_template",
-                      values=[log_scale_min_clip, log_scale_max_clip]):
-    def _fn(x):
-      """MADE parameterized via `masked_autoregressive_default_template`."""
-      # TODO(b/67594795): Better support of dynamic shape.
-      input_depth = x.shape.with_rank_at_least(1)[-1].value
-      if input_depth is None:
-        raise NotImplementedError(
-            "Rightmost dimension must be known prior to graph execution.")
-      input_shape = (np.int32(x.shape.as_list()) if x.shape.is_fully_defined()
-                     else array_ops.shape(x))
-      for i, units in enumerate(hidden_layers):
-        x = masked_dense(
-            inputs=x,
-            units=units,
-            num_blocks=input_depth,
-            exclusive=True if i == 0 else False,
-            activation=activation,
-            *args,
-            **kwargs)
-      x = masked_dense(
-          inputs=x,
-          units=(1 if shift_only else 2) * input_depth,
-          num_blocks=input_depth,
-          activation=None,
-          *args,
-          **kwargs)
-      if shift_only:
-        x = array_ops.reshape(x, shape=input_shape)
-        return x, None
-      x = array_ops.reshape(
-          x, shape=array_ops.concat([input_shape, [2]], axis=0))
-      shift, log_scale = array_ops.unstack(x, num=2, axis=-1)
-      which_clip = (math_ops.clip_by_value if log_scale_clip_gradient
-                    else _clip_by_value_preserve_grad)
-      log_scale = which_clip(log_scale, log_scale_min_clip, log_scale_max_clip)
-      return shift, log_scale
-    return template_ops.make_template(
-        "masked_autoregressive_default_template", _fn)
-
-
-def _clip_by_value_preserve_grad(x, clip_value_min, clip_value_max, name=None):
-  """Clips input while leaving gradient unaltered."""
-  with ops.name_scope(name, "clip_by_value_preserve_grad",
-                      [x, clip_value_min, clip_value_max]):
-    clip_x = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
-    return x + array_ops.stop_gradient(clip_x - x)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/permute.py b/tensorflow/contrib/distributions/python/ops/bijectors/permute.py
index a187ce22d686ee1203802ae2bfe64b0e1a3ea850..8654cc39d0c41ec4f1b85cd5fc4366ceaf4b224d 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/permute.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/permute.py
@@ -12,18 +12,127 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Permute bijector."""
+"""Permutation bijectors."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.permute_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+import numpy as np
 
-_allowed_symbols = ["Permute"]
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import bijector as bijector_lib
 
-remove_undocumented(__name__, _allowed_symbols)
+
+__all__ = [
+    "Permute",
+]
+
+
+class Permute(bijector_lib.Bijector):
+  """Permutes the rightmost dimension of a `Tensor`.
+
+  ```python
+  tfd = tf.contrib.distributions
+
+  reverse = tfd.bijectors.Permute(permutation=[2, 1, 0])
+
+  reverse.forward([-1., 0., 1.])
+  # ==> [1., 0., -1]
+
+  reverse.inverse([1., 0., -1])
+  # ==> [-1., 0., 1.]
+
+  reverse.forward_log_det_jacobian(any_value)
+  # ==> 0.
+
+  reverse.inverse_log_det_jacobian(any_value)
+  # ==> 0.
+  ```
+
+  Warning: `tf.estimator` may repeatedly build the graph thus
+  `Permute(np.random.permutation(event_size)).astype("int32"))` is not a
+  reliable parameterization (nor would it be even if using `tf.constant`). A
+  safe alternative is to use `tf.get_variable` to achieve "init once" behavior,
+  i.e.,
+
+  ```python
+  def init_once(x, name):
+    return tf.get_variable(name, initializer=x, trainable=False)
+
+  Permute(permutation=init_once(
+      np.random.permutation(event_size).astype("int32"),
+      name="permutation"))
+  ```
+
+  """
+
+  def __init__(self, permutation, validate_args=False, name=None):
+    """Creates the `Permute` bijector.
+
+    Args:
+      permutation: An `int`-like vector-shaped `Tensor` representing the
+        permutation to apply to the rightmost dimension of the transformed
+        `Tensor`.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str`, name given to ops managed by this object.
+
+    Raises:
+      TypeError: if `not permutation.dtype.is_integer`.
+      ValueError: if `permutation` does not contain exactly one of each of
+        `{0, 1, ..., d}`.
+    """
+    with ops.name_scope(name, "permute", values=[permutation]):
+      permutation = ops.convert_to_tensor(
+          permutation,
+          name="permutation")
+      if not permutation.dtype.is_integer:
+        raise TypeError("permutation.dtype ({}) should be `int`-like.".format(
+            permutation.dtype.name))
+      p = tensor_util.constant_value(permutation)
+      if p is not None:
+        if set(p) != set(np.arange(p.size)):
+          raise ValueError("Permutation over `d` must contain exactly one of "
+                           "each of `{0, 1, ..., d}`.")
+      elif validate_args:
+        p, _ = nn_ops.top_k(-permutation,
+                            k=array_ops.shape(permutation)[-1],
+                            sorted=True)
+        permutation = control_flow_ops.with_dependencies([
+            check_ops.assert_equal(
+                -p, math_ops.range(array_ops.size(p)),
+                message=("Permutation over `d` must contain exactly one of "
+                         "each of `{0, 1, ..., d}`.")),
+        ], permutation)
+      self._permutation = permutation
+      super(Permute, self).__init__(
+          is_constant_jacobian=True,
+          validate_args=validate_args,
+          name=name or "permute")
+
+  @property
+  def permutation(self):
+    return self._permutation
+
+  def _forward(self, x):
+    return array_ops.gather(x, self.permutation, axis=-1)
+
+  def _inverse(self, y):
+    return array_ops.gather(
+        y,
+        array_ops.invert_permutation(self.permutation),
+        axis=-1)
+
+  def _inverse_log_det_jacobian(self, y):
+    return constant_op.constant(0., dtype=y.dtype)
+
+  def _forward_log_det_jacobian(self, x):
+    return constant_op.constant(0., dtype=x.dtype)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/permute_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/permute_impl.py
deleted file mode 100644
index b1d8f2f41b28a88208a19824377f93882b767f03..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/permute_impl.py
+++ /dev/null
@@ -1,138 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Permutation bijectors."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops.distributions import bijector as bijector_lib
-
-
-__all__ = [
-    "Permute",
-]
-
-
-class Permute(bijector_lib.Bijector):
-  """Permutes the rightmost dimension of a `Tensor`.
-
-  ```python
-  bs = tf.contrib.distributions.bijectors
-
-  reverse = bs.Permute(permutation=[2, 1, 0])
-
-  reverse.forward([-1., 0., 1.])
-  # ==> [1., 0., -1]
-
-  reverse.inverse([1., 0., -1])
-  # ==> [-1., 0., 1.]
-
-  reverse.forward_log_det_jacobian(any_value)
-  # ==> 0.
-
-  reverse.inverse_log_det_jacobian(any_value)
-  # ==> 0.
-  ```
-
-  Warning: `tf.estimator` may repeatedly build the graph thus
-  `Permute(np.random.permutation(event_size)).astype("int32"))` is not a
-  reliable parameterization (nor would it be even if using `tf.constant`). A
-  safe alternative is to use `tf.get_variable` to achieve "init once" behavior,
-  i.e.,
-
-  ```python
-  def init_once(x, name):
-    return tf.get_variable(name, initializer=x, trainable=False)
-
-  Permute(permutation=init_once(
-      np.random.permutation(event_size).astype("int32"),
-      name="permutation"))
-  ```
-
-  """
-
-  def __init__(self, permutation, validate_args=False, name=None):
-    """Creates the `Permute` bijector.
-
-    Args:
-      permutation: An `int`-like vector-shaped `Tensor` representing the
-        permutation to apply to the rightmost dimension of the transformed
-        `Tensor`.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str`, name given to ops managed by this object.
-
-    Raises:
-      TypeError: if `not permutation.dtype.is_integer`.
-      ValueError: if `permutation` does not contain exactly one of each of
-        `{0, 1, ..., d}`.
-    """
-    with ops.name_scope(name, "permute", values=[permutation]):
-      permutation = ops.convert_to_tensor(
-          permutation,
-          name="permutation")
-      if not permutation.dtype.is_integer:
-        raise TypeError("permutation.dtype ({}) should be `int`-like.".format(
-            permutation.dtype.name))
-      p = tensor_util.constant_value(permutation)
-      if p is not None:
-        if set(p) != set(np.arange(p.size)):
-          raise ValueError("Permutation over `d` must contain exactly one of "
-                           "each of `{0, 1, ..., d}`.")
-      elif validate_args:
-        p, _ = nn_ops.top_k(-permutation,
-                            k=array_ops.shape(permutation)[-1],
-                            sorted=True)
-        permutation = control_flow_ops.with_dependencies([
-            check_ops.assert_equal(
-                -p, math_ops.range(array_ops.size(p)),
-                message=("Permutation over `d` must contain exactly one of "
-                         "each of `{0, 1, ..., d}`.")),
-        ], permutation)
-      self._permutation = permutation
-      super(Permute, self).__init__(
-          is_constant_jacobian=True,
-          validate_args=validate_args,
-          name=name or "permute")
-
-  @property
-  def permutation(self):
-    return self._permutation
-
-  def _forward(self, x):
-    return array_ops.gather(x, self.permutation, axis=-1)
-
-  def _inverse(self, y):
-    return array_ops.gather(
-        y,
-        array_ops.invert_permutation(self.permutation),
-        axis=-1)
-
-  def _inverse_log_det_jacobian(self, y):
-    return constant_op.constant(0., dtype=y.dtype)
-
-  def _forward_log_det_jacobian(self, x):
-    return constant_op.constant(0., dtype=x.dtype)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/power_transform.py b/tensorflow/contrib/distributions/python/ops/bijectors/power_transform.py
index a83199549cd16101ab7b39b43d19a17bc66f03df..c37db61720d10949f294ff7b2e9778ba6efa57f0 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/power_transform.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/power_transform.py
@@ -18,12 +18,110 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.power_transform_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
 
-_allowed_symbols = ["PowerTransform"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "PowerTransform",
+]
+
+
+class PowerTransform(bijector.Bijector):
+  """Compute `Y = g(X) = (1 + X * c)**(1 / c), X >= -1 / c`.
+
+  The [power transform](https://en.wikipedia.org/wiki/Power_transform) maps
+  inputs from `[0, inf]` to `[-1/c, inf]`; this is equivalent to the `inverse`
+  of this bijector.
+
+  This bijector is equivalent to the `Exp` bijector when `c=0`.
+  """
+
+  def __init__(self,
+               power=0.,
+               event_ndims=0,
+               validate_args=False,
+               name="power_transform"):
+    """Instantiates the `PowerTransform` bijector.
+
+    Args:
+      power: Python `float` scalar indicating the transform power, i.e.,
+        `Y = g(X) = (1 + X * c)**(1 / c)` where `c` is the `power`.
+      event_ndims: Python scalar indicating the number of dimensions associated
+        with a particular draw from the distribution.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+
+    Raises:
+      ValueError: if `power < 0` or is not known statically.
+    """
+    self._graph_parents = []
+    self._name = name
+    self._validate_args = validate_args
+    with self._name_scope("init", values=[power]):
+      power = tensor_util.constant_value(
+          ops.convert_to_tensor(power, name="power"))
+    if power is None or power < 0:
+      raise ValueError("`power` must be a non-negative TF constant.")
+    self._power = power
+    super(PowerTransform, self).__init__(
+        event_ndims=event_ndims,
+        validate_args=validate_args,
+        name=name)
+
+  @property
+  def power(self):
+    """The `c` in: `Y = g(X) = (1 + X * c)**(1 / c)`."""
+    return self._power
+
+  def _forward(self, x):
+    x = self._maybe_assert_valid_x(x)
+    if self.power == 0.:
+      return math_ops.exp(x)
+    # If large x accuracy is an issue, consider using:
+    # (1. + x * self.power)**(1. / self.power) when x >> 1.
+    return math_ops.exp(math_ops.log1p(x * self.power) / self.power)
+
+  def _inverse(self, y):
+    y = self._maybe_assert_valid_y(y)
+    if self.power == 0.:
+      return math_ops.log(y)
+    # If large y accuracy is an issue, consider using:
+    # (y**self.power - 1.) / self.power when y >> 1.
+    return math_ops.expm1(math_ops.log(y) * self.power) / self.power
+
+  def _inverse_log_det_jacobian(self, y):
+    y = self._maybe_assert_valid_y(y)
+    event_dims = self._event_dims_tensor(y)
+    return (self.power - 1.) * math_ops.reduce_sum(
+        math_ops.log(y), axis=event_dims)
+
+  def _forward_log_det_jacobian(self, x):
+    x = self._maybe_assert_valid_x(x)
+    event_dims = self._event_dims_tensor(x)
+    if self.power == 0.:
+      return math_ops.reduce_sum(x, axis=event_dims)
+    return (1. / self.power - 1.) * math_ops.reduce_sum(
+        math_ops.log1p(x * self.power),
+        axis=event_dims)
+
+  def _maybe_assert_valid_x(self, x):
+    if not self.validate_args or self.power == 0.:
+      return x
+    is_valid = check_ops.assert_non_negative(
+        1. + self.power * x,
+        message="Forward transformation input must be at least {}.".format(
+            -1. / self.power))
+    return control_flow_ops.with_dependencies([is_valid], x)
+
+  def _maybe_assert_valid_y(self, y):
+    if not self.validate_args:
+      return y
+    is_valid = check_ops.assert_positive(
+        y, message="Inverse transformation input must be greater than 0.")
+    return control_flow_ops.with_dependencies([is_valid], y)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/power_transform_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/power_transform_impl.py
deleted file mode 100644
index c37db61720d10949f294ff7b2e9778ba6efa57f0..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/power_transform_impl.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""PowerTransform bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector
-
-
-__all__ = [
-    "PowerTransform",
-]
-
-
-class PowerTransform(bijector.Bijector):
-  """Compute `Y = g(X) = (1 + X * c)**(1 / c), X >= -1 / c`.
-
-  The [power transform](https://en.wikipedia.org/wiki/Power_transform) maps
-  inputs from `[0, inf]` to `[-1/c, inf]`; this is equivalent to the `inverse`
-  of this bijector.
-
-  This bijector is equivalent to the `Exp` bijector when `c=0`.
-  """
-
-  def __init__(self,
-               power=0.,
-               event_ndims=0,
-               validate_args=False,
-               name="power_transform"):
-    """Instantiates the `PowerTransform` bijector.
-
-    Args:
-      power: Python `float` scalar indicating the transform power, i.e.,
-        `Y = g(X) = (1 + X * c)**(1 / c)` where `c` is the `power`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str` name given to ops managed by this object.
-
-    Raises:
-      ValueError: if `power < 0` or is not known statically.
-    """
-    self._graph_parents = []
-    self._name = name
-    self._validate_args = validate_args
-    with self._name_scope("init", values=[power]):
-      power = tensor_util.constant_value(
-          ops.convert_to_tensor(power, name="power"))
-    if power is None or power < 0:
-      raise ValueError("`power` must be a non-negative TF constant.")
-    self._power = power
-    super(PowerTransform, self).__init__(
-        event_ndims=event_ndims,
-        validate_args=validate_args,
-        name=name)
-
-  @property
-  def power(self):
-    """The `c` in: `Y = g(X) = (1 + X * c)**(1 / c)`."""
-    return self._power
-
-  def _forward(self, x):
-    x = self._maybe_assert_valid_x(x)
-    if self.power == 0.:
-      return math_ops.exp(x)
-    # If large x accuracy is an issue, consider using:
-    # (1. + x * self.power)**(1. / self.power) when x >> 1.
-    return math_ops.exp(math_ops.log1p(x * self.power) / self.power)
-
-  def _inverse(self, y):
-    y = self._maybe_assert_valid_y(y)
-    if self.power == 0.:
-      return math_ops.log(y)
-    # If large y accuracy is an issue, consider using:
-    # (y**self.power - 1.) / self.power when y >> 1.
-    return math_ops.expm1(math_ops.log(y) * self.power) / self.power
-
-  def _inverse_log_det_jacobian(self, y):
-    y = self._maybe_assert_valid_y(y)
-    event_dims = self._event_dims_tensor(y)
-    return (self.power - 1.) * math_ops.reduce_sum(
-        math_ops.log(y), axis=event_dims)
-
-  def _forward_log_det_jacobian(self, x):
-    x = self._maybe_assert_valid_x(x)
-    event_dims = self._event_dims_tensor(x)
-    if self.power == 0.:
-      return math_ops.reduce_sum(x, axis=event_dims)
-    return (1. / self.power - 1.) * math_ops.reduce_sum(
-        math_ops.log1p(x * self.power),
-        axis=event_dims)
-
-  def _maybe_assert_valid_x(self, x):
-    if not self.validate_args or self.power == 0.:
-      return x
-    is_valid = check_ops.assert_non_negative(
-        1. + self.power * x,
-        message="Forward transformation input must be at least {}.".format(
-            -1. / self.power))
-    return control_flow_ops.with_dependencies([is_valid], x)
-
-  def _maybe_assert_valid_y(self, y):
-    if not self.validate_args:
-      return y
-    is_valid = check_ops.assert_positive(
-        y, message="Inverse transformation input must be greater than 0.")
-    return control_flow_ops.with_dependencies([is_valid], y)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py b/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
index 8997f7ab6929745275edb38712a5bbb0a9b25ddb..55eca063126797d577653f0d6bcdfddf8192bdb5 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
@@ -12,18 +12,303 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Reshape bijector."""
+"""Reshape bijectors."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.reshape_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+import numpy as np
 
-_allowed_symbols = ["Reshape"]
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector as bijector_lib
 
-remove_undocumented(__name__, _allowed_symbols)
+
+__all__ = [
+    "Reshape",
+]
+
+
+def _static_ndims_from_shape(shape):
+  return shape.shape.with_rank_at_least(1)[0].value
+
+
+def _ndims_from_shape(shape):
+  return array_ops.shape(shape)[0]
+
+
+class Reshape(bijector_lib.Bijector):
+  """Reshapes the `event_shape` of a `Tensor`.
+
+  The semantics generally follow that of `tf.reshape()`, with
+  a few differences:
+
+  * The user must provide both the input and output shape, so that
+    the transformation can be inverted. If an input shape is not
+    specified, the default assumes a vector-shaped input, i.e.,
+    event_shape_in = (-1,).
+  * The `Reshape` bijector automatically broadcasts over the leftmost
+    dimensions of its input (`sample_shape` and `batch_shape`); only
+    the rightmost `event_ndims_in` dimensions are reshaped. The
+    number of dimensions to reshape is inferred from the provided
+    `event_shape_in` (`event_ndims_in = len(event_shape_in)`).
+
+  Example usage:
+  ```python
+
+  tfd = tf.contrib.distributions
+
+  r = tfd.bijectors.Reshape(event_shape_out=[1, -1])
+
+  r.forward([3., 4.])    # shape [2]
+  # ==> [[3., 4.]]       # shape [1, 2]
+
+  r.forward([[1., 2.], [3., 4.]])  # shape [2, 2]
+  # ==> [[[1., 2.]],
+  #      [[3., 4.]]]   # shape [2, 1, 2]
+
+  r.inverse([[3., 4.]])  # shape [1,2]
+  # ==> [3., 4.]         # shape [2]
+
+  r.forward_log_det_jacobian(any_value)
+  # ==> 0.
+
+  r.inverse_log_det_jacobian(any_value)
+  # ==> 0.
+  ```
+
+  """
+
+  def __init__(self, event_shape_out, event_shape_in=(-1,),
+               validate_args=False, name=None):
+    """Creates a `Reshape` bijector.
+
+    Args:
+      event_shape_out: An `int`-like vector-shaped `Tensor`
+        representing the event shape of the transformed output.
+      event_shape_in: An optional `int`-like vector-shape `Tensor`
+        representing the event shape of the input. This is required in
+        order to define inverse operations; the default of (-1,)
+        assumes a vector-shaped input.
+      validate_args: Python `bool` indicating whether arguments should
+        be checked for correctness.
+      name: Python `str`, name given to ops managed by this object.
+
+    Raises:
+      TypeError: if either `event_shape_in` or `event_shape_out` has
+        non-integer `dtype`.
+      ValueError: if either of `event_shape_in` or `event_shape_out`
+       has non-vector shape (`rank > 1`), or if their sizes do not
+       match.
+    """
+    with ops.name_scope(name, "reshape",
+                        values=[event_shape_out, event_shape_in]):
+
+      event_shape_out = ops.convert_to_tensor(event_shape_out,
+                                              name="event_shape_out",
+                                              preferred_dtype=dtypes.int32)
+      event_shape_in = ops.convert_to_tensor(event_shape_in,
+                                             name="event_shape_in",
+                                             preferred_dtype=dtypes.int32)
+
+      assertions = []
+      assertions.extend(self._maybe_check_valid_shape(
+          event_shape_out, validate_args))
+      assertions.extend(self._maybe_check_valid_shape(
+          event_shape_in, validate_args))
+
+      self._assertions = assertions
+      self._event_shape_in = event_shape_in
+      self._event_shape_out = event_shape_out
+
+      super(Reshape, self).__init__(is_constant_jacobian=True,
+                                    validate_args=validate_args,
+                                    name=name or "reshape")
+
+  def _maybe_check_valid_shape(self, shape, validate_args):
+    """Check that a shape Tensor is int-type and otherwise sane."""
+    if not shape.dtype.is_integer:
+      raise TypeError("{} dtype ({}) should be `int`-like.".format(
+          shape.op.name, shape.dtype.name))
+
+    assertions = []
+
+    ndims = array_ops.rank(shape)
+    ndims_ = tensor_util.constant_value(ndims)
+    if ndims_ is not None and ndims_ > 1:
+      raise ValueError("`{}` rank ({}) should be <= 1.".format(
+          shape.op.name, ndims_))
+    elif validate_args:
+      assertions.append(check_ops.assert_less_equal(
+          ndims, 1, message="`{}` rank should be <= 1.".format(shape.op.name)))
+
+    shape_ = tensor_util.constant_value_as_shape(shape)
+    if shape_.is_fully_defined():
+      es = np.int32(shape_.as_list())
+      if sum(es == -1) > 1:
+        raise ValueError(
+            "`{}` must have at most one `-1` (given {})"
+            .format(shape.op.name, es))
+      if np.any(es < -1):
+        raise ValueError(
+            "`{}` elements must be either positive integers or `-1`"
+            "(given {})."
+            .format(shape.op.name, es))
+    elif validate_args:
+      assertions.extend([
+          check_ops.assert_less_equal(
+              math_ops.reduce_sum(
+                  math_ops.cast(math_ops.equal(shape, -1), dtypes.int32)),
+              1,
+              message="`{}` elements must have at most one `-1`."
+              .format(shape.op.name)),
+          check_ops.assert_greater_equal(
+              shape, -1,
+              message="`{}` elements must be either positive integers or `-1`."
+              .format(shape.op.name)),
+      ])
+    return assertions
+
+  def _reshape_helper(self, x, event_shape_in, event_shape_out):
+    """Reshape only the event_shape of an input `Tensor`."""
+
+    event_ndims_in_ = _static_ndims_from_shape(event_shape_in)
+    event_ndims_in = _ndims_from_shape(event_shape_in)
+    x_ndims_, x_ndims = x.shape.ndims, array_ops.rank(x)
+
+    assertions = []
+
+    # Ensure x.event_shape is compatible with event_shape_in.
+    if (event_ndims_in_ is not None
+        and x_ndims_ is not None
+        and x.shape.with_rank_at_least(event_ndims_in_)[
+            x_ndims_-event_ndims_in_:].is_fully_defined()):
+      x_event_shape_, x_event_shape = [  # pylint: disable=unbalanced-tuple-unpacking
+          np.int32(x.shape[x_ndims_-event_ndims_in_:])]*2
+    else:
+      x_event_shape_, x_event_shape = (
+          None, array_ops.shape(x)[x_ndims-event_ndims_in:])
+
+    event_shape_in_ = tensor_util.constant_value(event_shape_in)
+
+    if x_event_shape_ is not None and event_shape_in_ is not None:
+      # Compare the shape dimensions that are fully specified in the
+      # input (i.e., for which event_shape_in is not -1). If x_event_shape
+      # matches along all of these dimensions, it is compatible with
+      # the desired input shape and any further mismatches (i.e.,
+      # imcompatibility with the desired *output* shape) will be
+      # caught inside of array_ops.reshape() below.
+      x_event_shape_specified_ = x_event_shape_[event_shape_in_ >= 0]
+      event_shape_in_specified_ = event_shape_in_[event_shape_in_ >= 0]
+      if not np.equal(x_event_shape_specified_,
+                      event_shape_in_specified_).all():
+        raise ValueError(
+            "Input `event_shape` does not match `event_shape_in` ({} vs {}).".
+            format(x_event_shape_, event_shape_in_))
+    elif self.validate_args:
+      # Similarly to the static case, we compare the shape dimensions
+      # that are fully specified in the input. We extract these
+      # dimensions using boolean_mask(), which requires that the mask
+      # have known ndims. We can assume that shape Tensors always have
+      # ndims==1 (this assumption is verified inside of
+      # _maybe_check_valid_shape), so the reshape operation is just a
+      # no-op that formally encodes this fact to make boolean_mask()
+      # happy.
+      event_shape_mask = array_ops.reshape(event_shape_in >= 0, [-1])
+      x_event_shape_specified = array_ops.boolean_mask(x_event_shape,
+                                                       event_shape_mask)
+      event_shape_in_specified = array_ops.boolean_mask(event_shape_in,
+                                                        event_shape_mask)
+      assertions.append(check_ops.assert_equal(
+          x_event_shape_specified, event_shape_in_specified,
+          message="Input `event_shape` does not match `event_shape_in`."))
+
+    if assertions:
+      x = control_flow_ops.with_dependencies(assertions, x)
+
+    # get the parts of shape(x) that will not change
+    sample_and_batch_shape = array_ops.shape(x)
+
+    ndims = (x.shape.ndims if x.shape.ndims is not None
+             else array_ops.rank(x))
+    sample_and_batch_shape = sample_and_batch_shape[
+        :(ndims - math_ops.abs(event_ndims_in))]
+
+    if (event_ndims_in_ is not None
+        and x_ndims_ is not None
+        and event_ndims_in_ == x_ndims_):
+      # Hack to allow forward/inverse_event_shape to do shape
+      # inference by calling this helper method with a dummy Tensor of
+      # shape event_shape_in. In this special case,
+      # sample_and_batch_shape will be empty so we can preserve static
+      # shape information by avoiding the concat operation below
+      # (which would be a no-op).
+      new_shape = event_shape_out
+    else:
+      new_shape = array_ops.concat(
+          [sample_and_batch_shape, event_shape_out], axis=0)
+
+    return array_ops.reshape(x, new_shape)
+
+  def _forward(self, x):
+    with ops.control_dependencies(self._assertions):
+      return self._reshape_helper(x,
+                                  self._event_shape_in,
+                                  self._event_shape_out)
+
+  def _inverse(self, y):
+    with ops.control_dependencies(self._assertions):
+      return self._reshape_helper(y,
+                                  self._event_shape_out,
+                                  self._event_shape_in)
+
+  def _inverse_log_det_jacobian(self, y):
+    with ops.control_dependencies(self._assertions):
+      return constant_op.constant(0., dtype=y.dtype)
+
+  def _forward_log_det_jacobian(self, x):
+    with ops.control_dependencies(self._assertions):
+      return constant_op.constant(0., dtype=x.dtype)
+
+  def _forward_event_shape(self, input_shape):
+    # NOTE: this method and the other *_event_shape* methods
+    # compute shape by explicit transformation of a dummy
+    # variable. This approach is not generally recommended because it
+    # bloats the graph and could in general trigger side effects.
+    #
+    # In this particular case of the Reshape bijector, the
+    # forward and inverse transforms have no side effects, and we
+    # believe the reduction in code complexity from delegating the
+    # heavy lifting to tf.reshape() is worth the added graph ops.
+    # However, you should think hard before implementing this approach
+    # in other Bijectors; it is strongly preferred to compute
+    # shapes explicitly whenever it's feasible to do so.
+    with ops.control_dependencies(self._assertions):
+      dummy = array_ops.zeros(dtype=dtypes.float32, shape=input_shape)
+      dummy_reshaped = self.forward(dummy)
+      return dummy_reshaped.shape
+
+  def _inverse_event_shape(self, output_shape):
+    with ops.control_dependencies(self._assertions):
+      dummy = array_ops.zeros(dtype=dtypes.float32, shape=output_shape)
+      dummy_reshaped = self.inverse(dummy)
+      return dummy_reshaped.shape
+
+  def _forward_event_shape_tensor(self, input_shape):
+    with ops.control_dependencies(self._assertions):
+      dummy = array_ops.zeros(dtype=dtypes.float32, shape=input_shape)
+      dummy_reshaped = self.forward(dummy)
+      return array_ops.shape(dummy_reshaped)
+
+  def _inverse_event_shape_tensor(self, output_shape):
+    with ops.control_dependencies(self._assertions):
+      dummy = array_ops.zeros(dtype=dtypes.float32, shape=output_shape)
+      dummy_reshaped = self.inverse(dummy)
+      return array_ops.shape(dummy_reshaped)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/reshape_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/reshape_impl.py
deleted file mode 100644
index 93682639aa3be3b8f59a369dedb6ee773c468130..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/reshape_impl.py
+++ /dev/null
@@ -1,297 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Reshape bijectors."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector as bijector_lib
-
-
-__all__ = [
-    "Reshape",
-]
-
-
-class Reshape(bijector_lib.Bijector):
-  """Reshapes the `event_shape` of a `Tensor`.
-
-  The semantics generally follow that of `tf.reshape()`, with
-  a few differences:
-   * The user must provide both the input and output shape, so that
-     the transformation can be inverted.
-   * The `Reshape` bijector automatically broadcasts over the leftmost
-     dimensions of its input (`sample_shape` and `batch_shape`); only
-     the rightmost `event_ndims_in` dimensions are reshaped. The
-     number of dimensions to reshape is inferred from the provided
-     `event_shape_in` (`event_ndims_in = len(event_shape_in)`).
-   * The `Reshape` bijector does not currently support
-     partially-specified shapes, i.e., those with a dimension
-     implicitly specified by `-1`.
-
-  Example usage:
-  ```python
-
-  bs = tf.contrib.distributions.bijectors
-
-  reverse = bs.Reshape(event_shape_out=[1,2],
-                       event_shape_in=[2,])
-
-  reverse.forward([1., 2.])    # shape [2,]
-  # ==> [[1., 2.]]             # shape [1,2]
-
-  reverse.forward([[1., 2.], [3., 4.]])  # shape [2, 2]
-  # ==> [[[1., 2.]], [[3., 4.]]]         # shape [2, 1, 2]
-
-  reverse.inverse([[1., 2.]])  # shape [1,2]
-  # ==> [1., 2.]               # shape [2,]
-
-  reverse.forward_log_det_jacobian(any_value)
-  # ==> 0.
-
-  reverse.inverse_log_det_jacobian(any_value)
-  # ==> 0.
-  ```
-
-  """
-
-  def __init__(self, event_shape_out, event_shape_in,
-               validate_args=False, name=None):
-    """Creates a `Reshape` bijector.
-
-    Args:
-      event_shape_out: An `int`-like vector-shaped `Tensor`
-        representing the fully specified (no -1's) event shape of the
-        transformed output.
-      event_shape_in: An `int`-like vector-shaped `Tensor`
-        representing the fully specified (no -1's) event shape of the
-        input.
-      validate_args: Python `bool` indicating whether arguments should
-        be checked for correctness.
-      name: Python `str`, name given to ops managed by this object.
-
-    Raises:
-      TypeError: if either `event_shape_in` or `event_shape_out` has
-       non-vector shape (`rank > 1`), or non-integer `dtype`.
-      ValueError: if either `event_shape_in` or `event_shape_out`
-       contains non-positive entries, or if their sizes do not match
-       (`prod(event_shape_in)` != `prod(event_shape_out)`), or if
-       their dimensionality(s) cannot be statically inferred.
-    """
-    with ops.name_scope(name, "reshape",
-                        values=[event_shape_out, event_shape_in]):
-
-      event_shape_out = ops.convert_to_tensor(event_shape_out,
-                                              name="event_shape_out",
-                                              preferred_dtype=dtypes.int32)
-      event_shape_in = ops.convert_to_tensor(event_shape_in,
-                                             name="event_shape_in",
-                                             preferred_dtype=dtypes.int32)
-
-      # check that input shapes are positive integers
-      assertions = []
-      assertions += self._maybe_check_valid_shape(
-          event_shape_out, "event_shape_out",
-          validate_args=validate_args)
-      assertions += self._maybe_check_valid_shape(
-          event_shape_in, "event_shape_in", validate_args=validate_args)
-
-      # check that prod(event_shape_in) = prod(event_shape_out)
-      assertions += self._maybe_check_matching_sizes(
-          event_shape_in, event_shape_out, validate_args=validate_args)
-
-      self._assertions = assertions
-      self._event_shape_in = event_shape_in
-      self._event_shape_out = event_shape_out
-      self._event_shape_in_static = tensor_util.constant_value_as_shape(
-          event_shape_in)
-      self._event_shape_out_static = tensor_util.constant_value_as_shape(
-          event_shape_out)
-
-      super(Reshape, self).__init__(is_constant_jacobian=True,
-                                    validate_args=validate_args,
-                                    name=name or "reshape")
-
-  def _maybe_check_valid_shape(self, shape_tensor, label,
-                               validate_args=False):
-    """Check that a shape Tensor is int-type and positive."""
-
-    assertions = []
-
-    if not shape_tensor.dtype.is_integer:
-      raise TypeError("{} dtype ({}) should be `int`-like.".format(
-          label, shape_tensor.dtype.name))
-
-    shape_rank = tensor_util.constant_value(array_ops.rank(shape_tensor))
-    if shape_rank is not None and shape_rank > 1:
-      raise ValueError("{} rank should be <= 1.".format(label))
-
-    s = tensor_util.constant_value(shape_tensor)
-    if s is not None:
-      if (s <= 0).any():
-        raise ValueError("{} entries must be positive, but found {}".format(
-            label, s))
-    elif validate_args:
-      assertions.append(check_ops.assert_positive(
-          shape_tensor, message="{} entries must be positive".format(label)))
-
-    return assertions
-
-  def _maybe_check_matching_sizes(self, event_shape_in, event_shape_out,
-                                  validate_args=False):
-    """Check that prod(event_shape_in)==prod(event_shape_out)."""
-
-    def _get_size_from_shape(shape):
-      """Computes size from a shape `Tensor`, statically if possible."""
-      s = tensor_util.constant_value(shape)
-      if s is not None:
-        return [np.int32(np.prod(s))]*2
-      return None, math_ops.reduce_prod(shape, name="size")
-
-    # Ensure `event_shape_in` is compatible with `event_shape_out`.
-    event_size_in_, event_size_in = _get_size_from_shape(  # pylint: disable=unbalanced-tuple-unpacking
-        event_shape_in)
-    event_size_out_, event_size_out = _get_size_from_shape(  # pylint: disable=unbalanced-tuple-unpacking
-        event_shape_out)
-
-    assertions = []
-    if event_size_in_ is not None and event_size_out_ is not None:
-      if event_size_in_ != event_size_out_:
-        raise ValueError(
-            "Input `event_size` ({}) does not match output `event_size` ({}).".
-            format(event_size_in, event_size_out_))
-    elif validate_args:
-      assertions.append(check_ops.assert_equal(
-          event_size_in, event_size_out,
-          message="Input/output `event_size`s do not match."))
-
-    return assertions
-
-  def _reshape_helper(self, x, event_shape_in, event_shape_out):
-    """Reshape only the event_shape of an input `Tensor`."""
-
-    def _get_rank_from_shape(shape):
-      """Computes rank from a shape `Tensor`, statically if possible."""
-      # Uses fact that rank is "shape of shape".
-      ndims = shape.shape.with_rank_at_least(1)[0].value
-      if ndims is not None:
-        return ndims, ndims
-      return None, array_ops.shape(shape)[0]
-
-    event_ndims_in_, event_ndims_in = _get_rank_from_shape(event_shape_in)
-
-    assertions = []
-    # Ensure x.event_shape is compatible with event_shape_in.
-    if x.shape.ndims is not None:
-      x_ndims_, x_ndims = [x.shape.ndims]*2
-    else:
-      x_ndims_, x_ndims = None, array_ops.rank(x)
-
-    if (event_ndims_in_ is not None
-        and x_ndims_ is not None
-        and x.shape.with_rank_at_least(event_ndims_in_)[
-            x_ndims_-event_ndims_in_:].is_fully_defined()):
-      x_event_shape_, x_event_shape = [  # pylint: disable=unbalanced-tuple-unpacking
-          np.int32(x.shape[x_ndims_-event_ndims_in_:])]*2
-    else:
-      x_event_shape_, x_event_shape = (
-          None, array_ops.shape(x)[x_ndims-event_ndims_in:])
-
-    event_shape_in_ = tensor_util.constant_value(event_shape_in)
-
-    if x_event_shape_ is not None and event_shape_in_ is not None:
-      if not np.equal(x_event_shape_, event_shape_in_).all():
-        raise ValueError(
-            "Input `event_shape` ({}) does not match `event_shape_in` ({}).".
-            format(x_event_shape_, event_shape_in_))
-    elif self.validate_args:
-      assertions.append(check_ops.assert_equal(
-          x_event_shape, event_shape_in,
-          message="Input `event_shape` does not match `event_shape_in`."))
-
-    if assertions:
-      x = control_flow_ops.with_dependencies(assertions, x)
-
-    # get the parts of shape(x) that will not change
-    sample_and_batch_shape = array_ops.shape(x)
-
-    ndims = (x.shape.ndims if x.shape.ndims is not None
-             else array_ops.rank(x))
-    sample_and_batch_shape = sample_and_batch_shape[
-        :(ndims - math_ops.abs(event_ndims_in))]
-
-    new_shape = array_ops.concat(
-        [sample_and_batch_shape, event_shape_out], axis=0)
-
-    return array_ops.reshape(x, new_shape)
-
-  def _forward(self, x):
-    with ops.control_dependencies(self._assertions):
-      return self._reshape_helper(x,
-                                  self._event_shape_in,
-                                  self._event_shape_out)
-
-  def _inverse(self, y):
-    with ops.control_dependencies(self._assertions):
-      return self._reshape_helper(y,
-                                  self._event_shape_out,
-                                  self._event_shape_in)
-
-  def _inverse_log_det_jacobian(self, y):
-    with ops.control_dependencies(self._assertions):
-      return constant_op.constant(0., dtype=y.dtype)
-
-  def _forward_log_det_jacobian(self, x):
-    with ops.control_dependencies(self._assertions):
-      return constant_op.constant(0., dtype=x.dtype)
-
-  def _forward_event_shape(self, input_shape):
-    self._event_shape_in_static.assert_is_compatible_with(input_shape)
-    return self._event_shape_out_static
-
-  def _inverse_event_shape(self, output_shape):
-    self._event_shape_out_static.assert_is_compatible_with(output_shape)
-    return self._event_shape_in_static
-
-  def _forward_event_shape_tensor(self, input_shape):
-    input_assertions = self._maybe_check_valid_shape(
-        input_shape, "input event shape", validate_args=self.validate_args)
-    input_assertions += self._maybe_check_matching_sizes(
-        input_shape, self._event_shape_out,
-        validate_args=self.validate_args)
-
-    return control_flow_ops.with_dependencies(
-        input_assertions + self._assertions, self._event_shape_out)
-
-  def _inverse_event_shape_tensor(self, output_shape):
-
-    output_assertions = self._maybe_check_valid_shape(
-        output_shape, "output event shape", validate_args=self.validate_args)
-    output_assertions += self._maybe_check_matching_sizes(
-        output_shape, self._event_shape_in, validate_args=self.validate_args)
-
-    return control_flow_ops.with_dependencies(
-        output_assertions + self._assertions, self._event_shape_in)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid.py b/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid.py
index c20e76c0b7367369865faf973377201c8b8b17e6..a640dfe7dfbcce96261589c7fc49107deaefdd54 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid.py
@@ -18,12 +18,31 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import bijector
 
-_allowed_symbols = ["Sigmoid"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "Sigmoid",
+]
+
+
+class Sigmoid(bijector.Bijector):
+  """Bijector which computes `Y = g(X) = 1 / (1 + exp(-X))`."""
+
+  def __init__(self, validate_args=False, name="sigmoid"):
+    super(Sigmoid, self).__init__(
+        event_ndims=0, validate_args=validate_args, name=name)
+
+  def _forward(self, x):
+    return math_ops.sigmoid(x)
+
+  def _inverse(self, y):
+    return math_ops.log(y) - math_ops.log1p(-y)
+
+  def _inverse_log_det_jacobian(self, y):
+    return -math_ops.log(y) - math_ops.log1p(-y)
+
+  def _forward_log_det_jacobian(self, x):
+    return -nn_ops.softplus(-x) - nn_ops.softplus(x)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered.py b/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered.py
index 448125230d24066697624bce03fed71a2c2f00b1..223bc9d042c69be05b0e578835a31ed6e83c0c97 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered.py
@@ -18,12 +18,22 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid_centered_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.contrib.distributions.python.ops.bijectors import softmax_centered
 
-_allowed_symbols = ["SigmoidCentered"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "SigmoidCentered",
+]
+
+
+class SigmoidCentered(softmax_centered.SoftmaxCentered):
+  """Bijector which computes Y = g(X) = exp([X 0]) / (1 + exp(-X)).
+
+  Equivalent to: `bijector.SoftmaxCentered(event_ndims=0)`.
+
+  See `bijector.SoftmaxCentered` for more details.
+  """
+
+  def __init__(self, validate_args=False, name="sigmoid_centered"):
+    super(SigmoidCentered, self).__init__(
+        event_ndims=0, validate_args=validate_args, name=name)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py b/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py
index b3cf03c24612f5c618c71c0a8615f272acdf2d10..3a75e4ae9495793901b0da91a5aa3982aab35852 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py
@@ -18,12 +18,162 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.sinh_arcsinh_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+import numpy as np
 
-_allowed_symbols = ["SinhArcsinh"]
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "SinhArcsinh",
+]
+
+
+def _sqrtx2p1(x):
+  """Implementation of `sqrt(1 + x**2)` which is stable despite large `x`."""
+  return array_ops.where(
+      math_ops.abs(x) * np.sqrt(np.finfo(x.dtype.as_numpy_dtype).eps) <= 1.,
+      math_ops.sqrt(x**2. + 1.),
+      # For large x, calculating x**2 can overflow. This can be alleviated by
+      # considering:
+      # sqrt(1 + x**2)
+      # = exp(0.5 log(1 + x**2))
+      # = exp(0.5 log(x**2 * (1 + x**-2)))
+      # = exp(log(x) + 0.5 * log(1 + x**-2))
+      # = |x| * exp(0.5 log(1 + x**-2))
+      # = |x| * sqrt(1 + x**-2)
+      # We omit the last term in this approximation.
+      # When |x| > 1 / sqrt(machineepsilon), the second term will be 1,
+      # due to sqrt(1 + x**-2) = 1. This is also true with the gradient term,
+      # and higher order gradients, since the first order derivative of
+      # sqrt(1 + x**-2) is -2 * x**-3 / (1 + x**-2) = -2 / (x**3 + x),
+      # and all nth-order derivatives will be O(x**-(n + 2)). This makes any
+      # gradient terms that contain any derivatives of sqrt(1 + x**-2) vanish.
+      math_ops.abs(x))
+
+
+class SinhArcsinh(bijector.Bijector):
+  """Compute `Y = g(X) = Sinh( (Arcsinh(X) + skewness) * tailweight )`.
+
+  For `skewness in (-inf, inf)` and `tailweight in (0, inf)`, this
+  transformation is a
+  diffeomorphism of the real line `(-inf, inf)`.  The inverse transform is
+  `X = g^{-1}(Y) = Sinh( ArcSinh(Y) / tailweight - skewness )`.
+
+  The `SinhArcsinh` transformation of the Normal is described in
+  [Sinh-arcsinh distributions](https://www.jstor.org/stable/27798865)
+  This Bijector allows a similar transformation of any distribution supported on
+  `(-inf, inf)`.
+
+  #### Meaning of the parameters
+
+  * If `skewness = 0` and `tailweight = 1`, this transform is the identity.
+  * Positive (negative) `skewness` leads to positive (negative) skew.
+    * positive skew means, for unimodal `X` centered at zero, the mode of `Y` is
+      "tilted" to the right.
+    * positive skew means positive values of `Y` become more likely, and
+      negative values become less likely.
+  * Larger (smaller) `tailweight` leads to fatter (thinner) tails.
+    * Fatter tails mean larger values of `|Y|` become more likely.
+    * If `X` is a unit Normal, `tailweight < 1` leads to a distribution that is
+      "flat" around `Y = 0`, and a very steep drop-off in the tails.
+    * If `X` is a unit Normal, `tailweight > 1` leads to a distribution more
+      peaked at the mode with heavier tails.
+
+  To see the argument about the tails, note that for `|X| >> 1` and
+  `|X| >> (|skewness| * tailweight)**tailweight`, we have
+  `Y approx 0.5 X**tailweight e**(sign(X) skewness * tailweight)`.
+  """
+
+  def __init__(self,
+               skewness=None,
+               tailweight=None,
+               event_ndims=0,
+               validate_args=False,
+               name="SinhArcsinh"):
+    """Instantiates the `SinhArcsinh` bijector.
+
+    Args:
+      skewness:  Skewness parameter.  Float-type `Tensor`.  Default is `0`
+        of type `float32`.
+      tailweight:  Tailweight parameter.  Positive `Tensor` of same `dtype` as
+        `skewness` and broadcastable `shape`.  Default is `1` of type `float32`.
+      event_ndims: Python scalar indicating the number of dimensions associated
+        with a particular draw from the distribution.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+    """
+    self._graph_parents = []
+    self._name = name
+    self._validate_args = validate_args
+    with self._name_scope("init", values=[skewness, tailweight]):
+      tailweight = 1. if tailweight is None else tailweight
+      skewness = 0. if skewness is None else skewness
+      self._skewness = ops.convert_to_tensor(
+          skewness, name="skewness")
+      self._tailweight = ops.convert_to_tensor(
+          tailweight, name="tailweight", dtype=self._skewness.dtype)
+      check_ops.assert_same_float_dtype([self._skewness, self._tailweight])
+      if validate_args:
+        self._tailweight = control_flow_ops.with_dependencies([
+            check_ops.assert_positive(
+                self._tailweight,
+                message="Argument tailweight was not positive")
+        ], self._tailweight)
+    super(SinhArcsinh, self).__init__(
+        event_ndims=event_ndims, validate_args=validate_args, name=name)
+
+  @property
+  def skewness(self):
+    """The `skewness` in: `Y  = Sinh((Arcsinh(X) + skewness) * tailweight)`."""
+    return self._skewness
+
+  @property
+  def tailweight(self):
+    """The `tailweight` in: `Y = Sinh((Arcsinh(X) + skewness) * tailweight)`."""
+    return self._tailweight
+
+  def _forward(self, x):
+    return math_ops.sinh((math_ops.asinh(x) + self.skewness) * self.tailweight)
+
+  def _inverse(self, y):
+    return math_ops.sinh(math_ops.asinh(y) / self.tailweight - self.skewness)
+
+  def _inverse_log_det_jacobian(self, y):
+    # x = sinh(arcsinh(y) / tailweight - skewness)
+    # Using sinh' = cosh, arcsinh'(y) = 1 / sqrt(y**2 + 1),
+    # dx/dy
+    # = cosh(arcsinh(y) / tailweight - skewness)
+    #     / (tailweight * sqrt(y**2 + 1))
+    event_dims = self._event_dims_tensor(y)
+    return math_ops.reduce_sum(
+        # This is computed inside the log to avoid catastrophic cancellations
+        # from cosh((arcsinh(y) / tailweight) - skewness) and sqrt(x**2 + 1).
+        math_ops.log(math_ops.cosh(
+            math_ops.asinh(y) / self.tailweight - self.skewness)
+                     # TODO(srvasude): Consider using cosh(arcsinh(x)) in cases
+                     # where (arcsinh(x) / tailweight) - skewness ~= arcsinh(x).
+                     / _sqrtx2p1(y))
+        - math_ops.log(self.tailweight),
+        axis=event_dims)
+
+  def _forward_log_det_jacobian(self, x):
+    # y = sinh((arcsinh(x) + skewness) * tailweight)
+    # Using sinh' = cosh, arcsinh'(x) = 1 / sqrt(x**2 + 1),
+    # dy/dx
+    # = cosh((arcsinh(x) + skewness) * tailweight) * tailweight / sqrt(x**2 + 1)
+    event_dims = self._event_dims_tensor(x)
+    return math_ops.reduce_sum(
+        # This is computed inside the log to avoid catastrophic cancellations
+        # from cosh((arcsinh(x) + skewness) * tailweight) and sqrt(x**2 + 1).
+        math_ops.log(math_ops.cosh(
+            (math_ops.asinh(x) + self.skewness) * self.tailweight)
+                     # TODO(srvasude): Consider using cosh(arcsinh(x)) in cases
+                     # where (arcsinh(x) + skewness) * tailweight ~= arcsinh(x).
+                     / _sqrtx2p1(x))
+        + math_ops.log(self.tailweight),
+        axis=event_dims)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh_impl.py
deleted file mode 100644
index 3a75e4ae9495793901b0da91a5aa3982aab35852..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh_impl.py
+++ /dev/null
@@ -1,179 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""SinhArcsinh bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector
-
-__all__ = [
-    "SinhArcsinh",
-]
-
-
-def _sqrtx2p1(x):
-  """Implementation of `sqrt(1 + x**2)` which is stable despite large `x`."""
-  return array_ops.where(
-      math_ops.abs(x) * np.sqrt(np.finfo(x.dtype.as_numpy_dtype).eps) <= 1.,
-      math_ops.sqrt(x**2. + 1.),
-      # For large x, calculating x**2 can overflow. This can be alleviated by
-      # considering:
-      # sqrt(1 + x**2)
-      # = exp(0.5 log(1 + x**2))
-      # = exp(0.5 log(x**2 * (1 + x**-2)))
-      # = exp(log(x) + 0.5 * log(1 + x**-2))
-      # = |x| * exp(0.5 log(1 + x**-2))
-      # = |x| * sqrt(1 + x**-2)
-      # We omit the last term in this approximation.
-      # When |x| > 1 / sqrt(machineepsilon), the second term will be 1,
-      # due to sqrt(1 + x**-2) = 1. This is also true with the gradient term,
-      # and higher order gradients, since the first order derivative of
-      # sqrt(1 + x**-2) is -2 * x**-3 / (1 + x**-2) = -2 / (x**3 + x),
-      # and all nth-order derivatives will be O(x**-(n + 2)). This makes any
-      # gradient terms that contain any derivatives of sqrt(1 + x**-2) vanish.
-      math_ops.abs(x))
-
-
-class SinhArcsinh(bijector.Bijector):
-  """Compute `Y = g(X) = Sinh( (Arcsinh(X) + skewness) * tailweight )`.
-
-  For `skewness in (-inf, inf)` and `tailweight in (0, inf)`, this
-  transformation is a
-  diffeomorphism of the real line `(-inf, inf)`.  The inverse transform is
-  `X = g^{-1}(Y) = Sinh( ArcSinh(Y) / tailweight - skewness )`.
-
-  The `SinhArcsinh` transformation of the Normal is described in
-  [Sinh-arcsinh distributions](https://www.jstor.org/stable/27798865)
-  This Bijector allows a similar transformation of any distribution supported on
-  `(-inf, inf)`.
-
-  #### Meaning of the parameters
-
-  * If `skewness = 0` and `tailweight = 1`, this transform is the identity.
-  * Positive (negative) `skewness` leads to positive (negative) skew.
-    * positive skew means, for unimodal `X` centered at zero, the mode of `Y` is
-      "tilted" to the right.
-    * positive skew means positive values of `Y` become more likely, and
-      negative values become less likely.
-  * Larger (smaller) `tailweight` leads to fatter (thinner) tails.
-    * Fatter tails mean larger values of `|Y|` become more likely.
-    * If `X` is a unit Normal, `tailweight < 1` leads to a distribution that is
-      "flat" around `Y = 0`, and a very steep drop-off in the tails.
-    * If `X` is a unit Normal, `tailweight > 1` leads to a distribution more
-      peaked at the mode with heavier tails.
-
-  To see the argument about the tails, note that for `|X| >> 1` and
-  `|X| >> (|skewness| * tailweight)**tailweight`, we have
-  `Y approx 0.5 X**tailweight e**(sign(X) skewness * tailweight)`.
-  """
-
-  def __init__(self,
-               skewness=None,
-               tailweight=None,
-               event_ndims=0,
-               validate_args=False,
-               name="SinhArcsinh"):
-    """Instantiates the `SinhArcsinh` bijector.
-
-    Args:
-      skewness:  Skewness parameter.  Float-type `Tensor`.  Default is `0`
-        of type `float32`.
-      tailweight:  Tailweight parameter.  Positive `Tensor` of same `dtype` as
-        `skewness` and broadcastable `shape`.  Default is `1` of type `float32`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str` name given to ops managed by this object.
-    """
-    self._graph_parents = []
-    self._name = name
-    self._validate_args = validate_args
-    with self._name_scope("init", values=[skewness, tailweight]):
-      tailweight = 1. if tailweight is None else tailweight
-      skewness = 0. if skewness is None else skewness
-      self._skewness = ops.convert_to_tensor(
-          skewness, name="skewness")
-      self._tailweight = ops.convert_to_tensor(
-          tailweight, name="tailweight", dtype=self._skewness.dtype)
-      check_ops.assert_same_float_dtype([self._skewness, self._tailweight])
-      if validate_args:
-        self._tailweight = control_flow_ops.with_dependencies([
-            check_ops.assert_positive(
-                self._tailweight,
-                message="Argument tailweight was not positive")
-        ], self._tailweight)
-    super(SinhArcsinh, self).__init__(
-        event_ndims=event_ndims, validate_args=validate_args, name=name)
-
-  @property
-  def skewness(self):
-    """The `skewness` in: `Y  = Sinh((Arcsinh(X) + skewness) * tailweight)`."""
-    return self._skewness
-
-  @property
-  def tailweight(self):
-    """The `tailweight` in: `Y = Sinh((Arcsinh(X) + skewness) * tailweight)`."""
-    return self._tailweight
-
-  def _forward(self, x):
-    return math_ops.sinh((math_ops.asinh(x) + self.skewness) * self.tailweight)
-
-  def _inverse(self, y):
-    return math_ops.sinh(math_ops.asinh(y) / self.tailweight - self.skewness)
-
-  def _inverse_log_det_jacobian(self, y):
-    # x = sinh(arcsinh(y) / tailweight - skewness)
-    # Using sinh' = cosh, arcsinh'(y) = 1 / sqrt(y**2 + 1),
-    # dx/dy
-    # = cosh(arcsinh(y) / tailweight - skewness)
-    #     / (tailweight * sqrt(y**2 + 1))
-    event_dims = self._event_dims_tensor(y)
-    return math_ops.reduce_sum(
-        # This is computed inside the log to avoid catastrophic cancellations
-        # from cosh((arcsinh(y) / tailweight) - skewness) and sqrt(x**2 + 1).
-        math_ops.log(math_ops.cosh(
-            math_ops.asinh(y) / self.tailweight - self.skewness)
-                     # TODO(srvasude): Consider using cosh(arcsinh(x)) in cases
-                     # where (arcsinh(x) / tailweight) - skewness ~= arcsinh(x).
-                     / _sqrtx2p1(y))
-        - math_ops.log(self.tailweight),
-        axis=event_dims)
-
-  def _forward_log_det_jacobian(self, x):
-    # y = sinh((arcsinh(x) + skewness) * tailweight)
-    # Using sinh' = cosh, arcsinh'(x) = 1 / sqrt(x**2 + 1),
-    # dy/dx
-    # = cosh((arcsinh(x) + skewness) * tailweight) * tailweight / sqrt(x**2 + 1)
-    event_dims = self._event_dims_tensor(x)
-    return math_ops.reduce_sum(
-        # This is computed inside the log to avoid catastrophic cancellations
-        # from cosh((arcsinh(x) + skewness) * tailweight) and sqrt(x**2 + 1).
-        math_ops.log(math_ops.cosh(
-            (math_ops.asinh(x) + self.skewness) * self.tailweight)
-                     # TODO(srvasude): Consider using cosh(arcsinh(x)) in cases
-                     # where (arcsinh(x) + skewness) * tailweight ~= arcsinh(x).
-                     / _sqrtx2p1(x))
-        + math_ops.log(self.tailweight),
-        axis=event_dims)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
index be6608f97880ae68e10b17c815bf2d8438293261..a9dcce6c526600f3b26c6bceb730417000917ce7 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
@@ -18,12 +18,223 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.softmax_centered_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+import numpy as np
 
-_allowed_symbols = ["SoftmaxCentered"]
+from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import bijector
 
-remove_undocumented(__name__, _allowed_symbols)
+
+__all__ = [
+    "SoftmaxCentered",
+]
+
+
+class SoftmaxCentered(bijector.Bijector):
+  """Bijector which computes `Y = g(X) = exp([X 0]) / sum(exp([X 0]))`.
+
+  To implement [softmax](https://en.wikipedia.org/wiki/Softmax_function) as a
+  bijection, the forward transformation appends a value to the input and the
+  inverse removes this coordinate. The appended coordinate represents a pivot,
+  e.g., `softmax(x) = exp(x-c) / sum(exp(x-c))` where `c` is the implicit last
+  coordinate.
+
+  Because we append a coordinate, this bijector only supports `event_ndim in [0,
+  1]`, i.e., scalars and vectors.
+
+  Example Use:
+
+  ```python
+  bijector.SoftmaxCentered(event_ndims=1).forward(tf.log([2, 3, 4]))
+  # Result: [0.2, 0.3, 0.4, 0.1]
+  # Extra result: 0.1
+
+  bijector.SoftmaxCentered(event_ndims=1).inverse([0.2, 0.3, 0.4, 0.1])
+  # Result: tf.log([2, 3, 4])
+  # Extra coordinate removed.
+  ```
+
+  At first blush it may seem like the [Invariance of domain](
+  https://en.wikipedia.org/wiki/Invariance_of_domain) theorem implies this
+  implementation is not a bijection. However, the appended dimension
+  makes the (forward) image non-open and the theorem does not directly apply.
+  """
+
+  def __init__(self,
+               event_ndims=0,
+               validate_args=False,
+               name="softmax_centered"):
+    self._graph_parents = []
+    self._name = name
+    with self._name_scope("init", values=[event_ndims]):
+      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
+      event_ndims = tensor_util.constant_value(event_ndims)
+      if event_ndims is None or event_ndims not in [0, 1]:
+        raise ValueError("`event_ndims` must be a TF constant which is 0 or 1")
+    self._static_event_ndims = event_ndims
+    super(SoftmaxCentered, self).__init__(
+        event_ndims=event_ndims,
+        validate_args=validate_args,
+        name=name)
+
+  def _forward_event_shape(self, input_shape):
+    if input_shape.ndims is None:
+      return input_shape
+    if input_shape.ndims != self._static_event_ndims:
+      raise ValueError("input_shape.dims = %d != %d" %
+                       (input_shape.ndims, self._static_event_ndims))
+    if input_shape.ndims == 0:
+      return tensor_shape.TensorShape([2])
+    if input_shape.ndims == 1:
+      return tensor_shape.TensorShape(input_shape[0] + 1)
+    # Unreachable code:
+    raise ValueError("event_ndims = %d must be 0 or 1" % input_shape.ndims)
+
+  def _forward_event_shape_tensor(self, input_shape):
+    ndims = array_ops.shape(input_shape)
+    if self.validate_args:
+      # It is not possible for a negative shape so we need only check <= 1.
+      is_zero_or_one = check_ops.assert_equal(
+          ndims, 0 if self._static_event_ndims == 0 else 1,
+          message="event_ndims must be 0 or 1")
+      ndims = control_flow_ops.with_dependencies([is_zero_or_one], ndims)
+    if self._static_event_ndims == 0:
+      return ops.convert_to_tensor(
+          [2], dtype=dtypes.int32, name="output_shape")
+    return input_shape + 1
+
+  def _inverse_event_shape(self, output_shape):
+    if output_shape.ndims is None:
+      return output_shape
+    if output_shape.ndims != 1:
+      raise ValueError("output_shape.ndims = %d != 1" % output_shape.ndims)
+    if self._static_event_ndims == 0:
+      return tensor_shape.TensorShape([])
+    return tensor_shape.TensorShape(output_shape[0] - 1)
+
+  def _inverse_event_shape_tensor(self, output_shape):
+    ndims = array_ops.shape(output_shape)[0]
+    if self.validate_args:
+      # It is not possible for a negative shape so we need only check <= 1.
+      is_one = check_ops.assert_equal(
+          ndims, 1, message="event_ndims must be 1")
+      ndims = control_flow_ops.with_dependencies([is_one], ndims)
+    if self._static_event_ndims == 0:
+      return ops.convert_to_tensor([], dtype=dtypes.int32, name="output_shape")
+    return array_ops.expand_dims(output_shape[0] - 1, dim=0)
+
+  def _forward(self, x):
+    # Pad the last dim with a zeros vector. We need this because it lets us
+    # infer the scale in the inverse function.
+    y = array_ops.expand_dims(x, dim=-1) if self._static_event_ndims == 0 else x
+    y = distribution_util.pad(y, axis=-1, back=True)
+
+    # Set shape hints.
+    if x.shape.ndims is not None:
+      shape = x.shape.as_list()
+      if self._static_event_ndims == 0:
+        shape += [2]
+      elif shape[-1] is not None:
+        shape[-1] += 1
+      shape = tensor_shape.TensorShape(shape)
+      y.shape.assert_is_compatible_with(shape)
+      y.set_shape(shape)
+
+    # Since we only support event_ndims in [0, 1] and we do padding, we always
+    # reduce over the last dimension, i.e., dim=-1 (which is the default).
+    return nn_ops.softmax(y)
+
+  def _inverse(self, y):
+    # To derive the inverse mapping note that:
+    #   y[i] = exp(x[i]) / normalization
+    # and
+    #   y[end] = 1 / normalization.
+    # Thus:
+    # x[i] = log(exp(x[i])) - log(y[end]) - log(normalization)
+    #      = log(exp(x[i])/normalization) - log(y[end])
+    #      = log(y[i]) - log(y[end])
+    shape = (np.asarray(y.shape.as_list(), dtype=np.int32)
+             if y.shape.is_fully_defined()
+             else array_ops.shape(y, name="shape"))
+    ndims = distribution_util.prefer_static_rank(y)
+
+    # Do this first to make sure CSE catches that it'll happen again in
+    # _inverse_log_det_jacobian.
+    x = math_ops.log(y)
+
+    # We now extract the last coordinate of the rightmost dimension.
+    # Our trick is to slice from [0,0,...,shape[-1]-1] to shape[:-1]+[1].
+    begin = array_ops.one_hot(indices=ndims-1,
+                              depth=ndims,
+                              on_value=shape[-1]-np.array(1, dtype=shape.dtype),
+                              dtype=shape.dtype)
+    size = array_ops.concat([shape[:-1], np.asarray([1], dtype=shape.dtype)], 0)
+    log_normalization = -array_ops.strided_slice(x, begin, begin + size)
+
+    # Here we slice out all but the last coordinate; see above for idea.
+    begin = array_ops.zeros_like(shape)
+    size = array_ops.concat([shape[:-1], [shape[-1] - 1]], 0)
+    x = array_ops.strided_slice(x, begin, begin + size)
+
+    x += log_normalization
+
+    if self._static_event_ndims == 0:
+      x = array_ops.squeeze(x, squeeze_dims=[ndims-1])
+
+    # Set shape hints.
+    if y.shape.ndims is not None:
+      shape = y.shape.as_list()
+      if self._static_event_ndims == 0:
+        shape = shape[:-1]
+      elif shape[-1] is not None:
+        shape[-1] -= 1
+      shape = tensor_shape.TensorShape(shape)
+      x.shape.assert_is_compatible_with(shape)
+      x.set_shape(shape)
+
+    return x
+
+  def _inverse_log_det_jacobian(self, y):
+    # WLOG, consider the vector case:
+    #   x = log(y[:-1]) - log(y[-1])
+    # where,
+    #   y[-1] = 1 - sum(y[:-1]).
+    # We have:
+    #   det{ dX/dY } = det{ diag(1 ./ y[:-1]) + 1 / y[-1] }
+    #                = det{ inv{ diag(y[:-1]) - y[:-1]' y[:-1] } }   (1)
+    #                = 1 / det{ diag(y[:-1]) - y[:-1]' y[:-1] }
+    #                = 1 / { (1 + y[:-1]' inv(diag(y[:-1])) y[:-1]) *
+    #                        det(diag(y[:-1])) }                     (2)
+    #                = 1 / { y[-1] prod(y[:-1]) }
+    #                = 1 / prod(y)
+    # (1) - https://en.wikipedia.org/wiki/Sherman%E2%80%93Morrison_formula
+    #       or by noting that det{ dX/dY } = 1 / det{ dY/dX } from Bijector
+    #       docstring "Tip".
+    # (2) - https://en.wikipedia.org/wiki/Matrix_determinant_lemma
+    return -math_ops.reduce_sum(math_ops.log(y), axis=-1)
+
+  def _forward_log_det_jacobian(self, x):
+    if self._static_event_ndims == 0:
+      return x - 2. * nn_ops.softplus(x)
+    else:
+      # This code is similar to nn_ops.log_softmax but different because we have
+      # an implicit zero column to handle. I.e., instead of:
+      #   reduce_sum(logits - reduce_sum(exp(logits), dim))
+      # we must do:
+      #   log_normalization = 1 + reduce_sum(exp(logits))
+      #   -log_normalization + reduce_sum(logits - log_normalization)
+      log_normalization = nn_ops.softplus(
+          math_ops.reduce_logsumexp(x, axis=-1, keep_dims=True))
+      fldj = (-log_normalization +
+              math_ops.reduce_sum(x - log_normalization,
+                                  axis=-1,
+                                  keep_dims=True))
+      return array_ops.squeeze(fldj, squeeze_dims=-1)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered_impl.py
deleted file mode 100644
index 8645cc1b6b04be75a419342591272f07a4a1711c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered_impl.py
+++ /dev/null
@@ -1,245 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""SoftmaxCentered bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops.distributions import bijector
-
-
-__all__ = [
-    "SoftmaxCentered",
-]
-
-
-class SoftmaxCentered(bijector.Bijector):
-  """Bijector which computes `Y = g(X) = exp([X 0]) / sum(exp([X 0]))`.
-
-  To implement [softmax](https://en.wikipedia.org/wiki/Softmax_function) as a
-  bijection, the forward transformation appends a value to the input and the
-  inverse removes this coordinate. The appended coordinate represents a pivot,
-  e.g., `softmax(x) = exp(x-c) / sum(exp(x-c))` where `c` is the implicit last
-  coordinate.
-
-  Because we append a coordinate, this bijector only supports `event_ndim in [0,
-  1]`, i.e., scalars and vectors.
-
-  Example Use:
-
-  ```python
-  bijector.SoftmaxCentered(event_ndims=1).forward(tf.log([2, 3, 4]))
-  # Result: [0.2, 0.3, 0.4, 0.1]
-  # Extra result: 0.1
-
-  bijector.SoftmaxCentered(event_ndims=1).inverse([0.2, 0.3, 0.4, 0.1])
-  # Result: tf.log([2, 3, 4])
-  # Extra coordinate removed.
-  ```
-
-  At first blush it may seem like the [Invariance of domain](
-  https://en.wikipedia.org/wiki/Invariance_of_domain) theorem implies this
-  implementation is not a bijection. However, the appended dimension
-  makes the (forward) image non-open and the theorem does not directly apply.
-  """
-
-  def __init__(self,
-               event_ndims=0,
-               validate_args=False,
-               name="softmax_centered"):
-    self._graph_parents = []
-    self._name = name
-    with self._name_scope("init", values=[event_ndims]):
-      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-      event_ndims = tensor_util.constant_value(event_ndims)
-      if event_ndims is None or event_ndims not in [0, 1]:
-        raise ValueError("`event_ndims` must be a TF constant which is 0 or 1")
-    self._static_event_ndims = event_ndims
-    super(SoftmaxCentered, self).__init__(
-        event_ndims=event_ndims,
-        validate_args=validate_args,
-        name=name)
-
-  def _forward_event_shape(self, input_shape):
-    if input_shape.ndims is None:
-      return input_shape
-    if input_shape.ndims != self._static_event_ndims:
-      raise ValueError("input_shape.dims = %d != %d" %
-                       (input_shape.ndims, self._static_event_ndims))
-    if input_shape.ndims == 0:
-      return tensor_shape.TensorShape([2])
-    if input_shape.ndims == 1:
-      return tensor_shape.TensorShape(input_shape[0] + 1)
-    # Unreachable code:
-    raise ValueError("event_ndims = %d must be 0 or 1" % input_shape.ndims)
-
-  def _forward_event_shape_tensor(self, input_shape):
-    ndims = array_ops.shape(input_shape)
-    if self.validate_args:
-      # It is not possible for a negative shape so we need only check <= 1.
-      is_zero_or_one = check_ops.assert_equal(
-          ndims, 0 if self._static_event_ndims == 0 else 1,
-          message="event_ndims must be 0 or 1")
-      ndims = control_flow_ops.with_dependencies([is_zero_or_one], ndims)
-    if self._static_event_ndims == 0:
-      return ops.convert_to_tensor(
-          [2], dtype=dtypes.int32, name="output_shape")
-    return input_shape + 1
-
-  def _inverse_event_shape(self, output_shape):
-    if output_shape.ndims is None:
-      return output_shape
-    if output_shape.ndims != 1:
-      raise ValueError("output_shape.ndims = %d != 1" % output_shape.ndims)
-    if self._static_event_ndims == 0:
-      return tensor_shape.TensorShape([])
-    return tensor_shape.TensorShape(output_shape[0] - 1)
-
-  def _inverse_event_shape_tensor(self, output_shape):
-    ndims = array_ops.shape(output_shape)[0]
-    if self.validate_args:
-      # It is not possible for a negative shape so we need only check <= 1.
-      is_one = check_ops.assert_equal(
-          ndims, 1, message="event_ndims must be 1")
-      ndims = control_flow_ops.with_dependencies([is_one], ndims)
-    if self._static_event_ndims == 0:
-      return ops.convert_to_tensor([], dtype=dtypes.int32, name="output_shape")
-    return array_ops.expand_dims(output_shape[0] - 1, dim=0)
-
-  def _forward(self, x):
-    # Pad the last dim with a zeros vector. We need this because it lets us
-    # infer the scale in the inverse function.
-    y = array_ops.expand_dims(x, dim=-1) if self._static_event_ndims == 0 else x
-    ndims = (y.get_shape().ndims if y.get_shape().ndims is not None
-             else array_ops.rank(y))
-    y = array_ops.pad(y,
-                      paddings=array_ops.concat(
-                          (array_ops.zeros(
-                              (ndims - 1, 2), dtype=dtypes.int32), [[0, 1]]),
-                          0))
-
-    # Set shape hints.
-    if x.get_shape().ndims is not None:
-      shape = x.get_shape().as_list()
-      if self._static_event_ndims == 0:
-        shape += [2]
-      elif shape[-1] is not None:
-        shape[-1] += 1
-      shape = tensor_shape.TensorShape(shape)
-      y.get_shape().assert_is_compatible_with(shape)
-      y.set_shape(shape)
-
-    # Since we only support event_ndims in [0, 1] and we do padding, we always
-    # reduce over the last dimension, i.e., dim=-1 (which is the default).
-    return nn_ops.softmax(y)
-
-  def _inverse(self, y):
-    # To derive the inverse mapping note that:
-    #   y[i] = exp(x[i]) / normalization
-    # and
-    #   y[end] = 1 / normalization.
-    # Thus:
-    # x[i] = log(exp(x[i])) - log(y[end]) - log(normalization)
-    #      = log(exp(x[i])/normalization) - log(y[end])
-    #      = log(y[i]) - log(y[end])
-    shape = (np.asarray(y.get_shape().as_list(), dtype=np.int32)
-             if y.get_shape().is_fully_defined()
-             else array_ops.shape(y, name="shape"))
-    ndims = y.get_shape().ndims or math_ops.rank(y, name="ndims")
-
-    # Do this first to make sure CSE catches that it'll happen again in
-    # _inverse_log_det_jacobian.
-    x = math_ops.log(y)
-
-    # We now extract the last coordinate of the rightmost dimension.
-    # Our trick is to slice from [0,0,...,shape[-1]-1] to shape[:-1]+[1].
-    begin = array_ops.one_hot(indices=ndims-1,
-                              depth=ndims,
-                              on_value=shape[-1]-np.array(1, dtype=shape.dtype),
-                              dtype=shape.dtype)
-    size = array_ops.concat([shape[:-1], np.asarray([1], dtype=shape.dtype)], 0)
-    log_normalization = -array_ops.strided_slice(x, begin, begin + size)
-
-    # Here we slice out all but the last coordinate; see above for idea.
-    begin = array_ops.zeros_like(shape)
-    size = array_ops.concat([shape[:-1], [shape[-1] - 1]], 0)
-    x = array_ops.strided_slice(x, begin, begin + size)
-
-    x += log_normalization
-
-    if self._static_event_ndims == 0:
-      x = array_ops.squeeze(x, squeeze_dims=[ndims-1])
-
-    # Set shape hints.
-    if y.get_shape().ndims is not None:
-      shape = y.get_shape().as_list()
-      if self._static_event_ndims == 0:
-        shape = shape[:-1]
-      elif shape[-1] is not None:
-        shape[-1] -= 1
-      shape = tensor_shape.TensorShape(shape)
-      x.get_shape().assert_is_compatible_with(shape)
-      x.set_shape(shape)
-
-    return x
-
-  def _inverse_log_det_jacobian(self, y):
-    # WLOG, consider the vector case:
-    #   x = log(y[:-1]) - log(y[-1])
-    # where,
-    #   y[-1] = 1 - sum(y[:-1]).
-    # We have:
-    #   det{ dX/dY } = det{ diag(1 ./ y[:-1]) + 1 / y[-1] }
-    #                = det{ inv{ diag(y[:-1]) - y[:-1]' y[:-1] } }   (1)
-    #                = 1 / det{ diag(y[:-1]) - y[:-1]' y[:-1] }
-    #                = 1 / { (1 + y[:-1]' inv(diag(y[:-1])) y[:-1]) *
-    #                        det(diag(y[:-1])) }                     (2)
-    #                = 1 / { y[-1] prod(y[:-1]) }
-    #                = 1 / prod(y)
-    # (1) - https://en.wikipedia.org/wiki/Sherman%E2%80%93Morrison_formula
-    #       or by noting that det{ dX/dY } = 1 / det{ dY/dX } from Bijector
-    #       docstring "Tip".
-    # (2) - https://en.wikipedia.org/wiki/Matrix_determinant_lemma
-    return -math_ops.reduce_sum(math_ops.log(y), axis=-1)
-
-  def _forward_log_det_jacobian(self, x):
-    if self._static_event_ndims == 0:
-      return x - 2. * nn_ops.softplus(x)
-    else:
-      # This code is similar to nn_ops.log_softmax but different because we have
-      # an implicit zero column to handle. I.e., instead of:
-      #   reduce_sum(logits - reduce_sum(exp(logits), dim))
-      # we must do:
-      #   log_normalization = 1 + reduce_sum(exp(logits))
-      #   -log_normalization + reduce_sum(logits - log_normalization)
-      log_normalization = nn_ops.softplus(
-          math_ops.reduce_logsumexp(x, axis=-1, keep_dims=True))
-      fldj = (-log_normalization +
-              math_ops.reduce_sum(x - log_normalization,
-                                  axis=-1,
-                                  keep_dims=True))
-      return array_ops.squeeze(fldj, squeeze_dims=-1)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softplus.py b/tensorflow/contrib/distributions/python/ops/bijectors/softplus.py
index 250a1144b53bb43271ff7ee494604d9bae6feda8..81957fcf78922fa15fd20a25d144071f431161ae 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/softplus.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/softplus.py
@@ -18,12 +18,127 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.softplus_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import bijector
+from tensorflow.python.ops.distributions import util as distribution_util
 
-_allowed_symbols = ["Softplus"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "Softplus",
+]
+
+
+class Softplus(bijector.Bijector):
+  """Bijector which computes `Y = g(X) = Log[1 + exp(X)]`.
+
+  The softplus `Bijector` has the following two useful properties:
+
+  * The domain is the positive real numbers
+  * `softplus(x) approx x`, for large `x`, so it does not overflow as easily as
+    the `Exp` `Bijector`.
+
+  The optional nonzero `hinge_softness` parameter changes the transition at
+  zero.  With `hinge_softness = c`, the bijector is:
+
+    ```f_c(x) := c * g(x / c) = c * Log[1 + exp(x / c)].```
+
+  For large `x >> 1`, `c * Log[1 + exp(x / c)] approx c * Log[exp(x / c)] = x`,
+  so the behavior for large `x` is the same as the standard softplus.
+
+  As `c > 0` approaches 0 from the right, `f_c(x)` becomes less and less soft,
+  approaching `max(0, x)`.
+
+  * `c = 1` is the default.
+  * `c > 0` but small means `f(x) approx ReLu(x) = max(0, x)`.
+  * `c < 0` flips sign and reflects around the `y-axis`: `f_{-c}(x) = -f_c(-x)`.
+  * `c = 0` results in a non-bijective transformation and triggers an exception.
+
+    Example Use:
+
+    ```python
+    # Create the Y=g(X)=softplus(X) transform which works only on Tensors with 1
+    # batch ndim and 2 event ndims (i.e., vector of matrices).
+    softplus = Softplus(event_ndims=2)
+    x = [[[1., 2],
+          [3, 4]],
+         [[5, 6],
+          [7, 8]]]
+    log(1 + exp(x)) == softplus.forward(x)
+    log(exp(x) - 1) == softplus.inverse(x)
+    ```
+
+    Note: log(.) and exp(.) are applied element-wise but the Jacobian is a
+    reduction over the event space.
+  """
+
+  @distribution_util.AppendDocstring(
+      kwargs_dict={
+          "hinge_softness": (
+              "Nonzero floating point `Tensor`.  Controls the softness of what "
+              "would otherwise be a kink at the origin.  Default is 1.0")})
+  def __init__(self,
+               event_ndims=0,
+               hinge_softness=None,
+               validate_args=False,
+               name="softplus"):
+    with ops.name_scope(name, values=[hinge_softness]):
+      if hinge_softness is not None:
+        self._hinge_softness = ops.convert_to_tensor(
+            hinge_softness, name="hinge_softness")
+      else:
+        self._hinge_softness = None
+      if validate_args:
+        nonzero_check = check_ops.assert_none_equal(
+            ops.convert_to_tensor(
+                0, dtype=self.hinge_softness.dtype),
+            self.hinge_softness,
+            message="hinge_softness must be non-zero")
+        self._hinge_softness = control_flow_ops.with_dependencies(
+            [nonzero_check], self.hinge_softness)
+
+    super(Softplus, self).__init__(
+        event_ndims=event_ndims,
+        validate_args=validate_args,
+        name=name)
+
+  def _forward(self, x):
+    if self.hinge_softness is None:
+      return nn_ops.softplus(x)
+    hinge_softness = math_ops.cast(self.hinge_softness, x.dtype)
+    return hinge_softness * nn_ops.softplus(x / hinge_softness)
+
+  def _inverse(self, y):
+    if self.hinge_softness is None:
+      return distribution_util.softplus_inverse(y)
+    hinge_softness = math_ops.cast(self.hinge_softness, y.dtype)
+    return hinge_softness * distribution_util.softplus_inverse(
+        y / hinge_softness)
+
+  def _inverse_log_det_jacobian(self, y):
+    # Could also do:
+    #   ildj = math_ops.reduce_sum(y - distribution_util.softplus_inverse(y),
+    #                              axis=event_dims)
+    # but the following is more numerically stable. Ie,
+    # Y = Log[1 + exp{X}] ==> X = Log[exp{Y} - 1]
+    # ==> dX/dY = exp{Y} / (exp{Y} - 1)
+    #           = 1 / (1 - exp{-Y}),
+    # which is the most stable for large Y > 0. For small Y, we use
+    # 1 - exp{-Y} approx Y.
+    if self.hinge_softness is not None:
+      y /= math_ops.cast(self.hinge_softness, y.dtype)
+    return -math_ops.reduce_sum(math_ops.log(-math_ops.expm1(-y)),
+                                axis=self._event_dims_tensor(y))
+
+  def _forward_log_det_jacobian(self, x):
+    if self.hinge_softness is not None:
+      x /= math_ops.cast(self.hinge_softness, x.dtype)
+    return -math_ops.reduce_sum(nn_ops.softplus(-x),
+                                axis=self._event_dims_tensor(x))
+
+  @property
+  def hinge_softness(self):
+    return self._hinge_softness
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softplus_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/softplus_impl.py
deleted file mode 100644
index 81957fcf78922fa15fd20a25d144071f431161ae..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/softplus_impl.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Softplus bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops.distributions import bijector
-from tensorflow.python.ops.distributions import util as distribution_util
-
-
-__all__ = [
-    "Softplus",
-]
-
-
-class Softplus(bijector.Bijector):
-  """Bijector which computes `Y = g(X) = Log[1 + exp(X)]`.
-
-  The softplus `Bijector` has the following two useful properties:
-
-  * The domain is the positive real numbers
-  * `softplus(x) approx x`, for large `x`, so it does not overflow as easily as
-    the `Exp` `Bijector`.
-
-  The optional nonzero `hinge_softness` parameter changes the transition at
-  zero.  With `hinge_softness = c`, the bijector is:
-
-    ```f_c(x) := c * g(x / c) = c * Log[1 + exp(x / c)].```
-
-  For large `x >> 1`, `c * Log[1 + exp(x / c)] approx c * Log[exp(x / c)] = x`,
-  so the behavior for large `x` is the same as the standard softplus.
-
-  As `c > 0` approaches 0 from the right, `f_c(x)` becomes less and less soft,
-  approaching `max(0, x)`.
-
-  * `c = 1` is the default.
-  * `c > 0` but small means `f(x) approx ReLu(x) = max(0, x)`.
-  * `c < 0` flips sign and reflects around the `y-axis`: `f_{-c}(x) = -f_c(-x)`.
-  * `c = 0` results in a non-bijective transformation and triggers an exception.
-
-    Example Use:
-
-    ```python
-    # Create the Y=g(X)=softplus(X) transform which works only on Tensors with 1
-    # batch ndim and 2 event ndims (i.e., vector of matrices).
-    softplus = Softplus(event_ndims=2)
-    x = [[[1., 2],
-          [3, 4]],
-         [[5, 6],
-          [7, 8]]]
-    log(1 + exp(x)) == softplus.forward(x)
-    log(exp(x) - 1) == softplus.inverse(x)
-    ```
-
-    Note: log(.) and exp(.) are applied element-wise but the Jacobian is a
-    reduction over the event space.
-  """
-
-  @distribution_util.AppendDocstring(
-      kwargs_dict={
-          "hinge_softness": (
-              "Nonzero floating point `Tensor`.  Controls the softness of what "
-              "would otherwise be a kink at the origin.  Default is 1.0")})
-  def __init__(self,
-               event_ndims=0,
-               hinge_softness=None,
-               validate_args=False,
-               name="softplus"):
-    with ops.name_scope(name, values=[hinge_softness]):
-      if hinge_softness is not None:
-        self._hinge_softness = ops.convert_to_tensor(
-            hinge_softness, name="hinge_softness")
-      else:
-        self._hinge_softness = None
-      if validate_args:
-        nonzero_check = check_ops.assert_none_equal(
-            ops.convert_to_tensor(
-                0, dtype=self.hinge_softness.dtype),
-            self.hinge_softness,
-            message="hinge_softness must be non-zero")
-        self._hinge_softness = control_flow_ops.with_dependencies(
-            [nonzero_check], self.hinge_softness)
-
-    super(Softplus, self).__init__(
-        event_ndims=event_ndims,
-        validate_args=validate_args,
-        name=name)
-
-  def _forward(self, x):
-    if self.hinge_softness is None:
-      return nn_ops.softplus(x)
-    hinge_softness = math_ops.cast(self.hinge_softness, x.dtype)
-    return hinge_softness * nn_ops.softplus(x / hinge_softness)
-
-  def _inverse(self, y):
-    if self.hinge_softness is None:
-      return distribution_util.softplus_inverse(y)
-    hinge_softness = math_ops.cast(self.hinge_softness, y.dtype)
-    return hinge_softness * distribution_util.softplus_inverse(
-        y / hinge_softness)
-
-  def _inverse_log_det_jacobian(self, y):
-    # Could also do:
-    #   ildj = math_ops.reduce_sum(y - distribution_util.softplus_inverse(y),
-    #                              axis=event_dims)
-    # but the following is more numerically stable. Ie,
-    # Y = Log[1 + exp{X}] ==> X = Log[exp{Y} - 1]
-    # ==> dX/dY = exp{Y} / (exp{Y} - 1)
-    #           = 1 / (1 - exp{-Y}),
-    # which is the most stable for large Y > 0. For small Y, we use
-    # 1 - exp{-Y} approx Y.
-    if self.hinge_softness is not None:
-      y /= math_ops.cast(self.hinge_softness, y.dtype)
-    return -math_ops.reduce_sum(math_ops.log(-math_ops.expm1(-y)),
-                                axis=self._event_dims_tensor(y))
-
-  def _forward_log_det_jacobian(self, x):
-    if self.hinge_softness is not None:
-      x /= math_ops.cast(self.hinge_softness, x.dtype)
-    return -math_ops.reduce_sum(nn_ops.softplus(-x),
-                                axis=self._event_dims_tensor(x))
-
-  @property
-  def hinge_softness(self):
-    return self._hinge_softness
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py b/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py
index d439f28884d8bd7f2b808317e10c5b5e44bfcfa2..00520bcda85e9527767e6342bf75f10667c264a8 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py
@@ -18,12 +18,132 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.weibull_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
 
-_allowed_symbols = ["Weibull"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "Weibull",
+]
+
+
+class Weibull(bijector.Bijector):
+  """Compute `Y = g(X) = 1 - exp((-X / scale) ** concentration), X >= 0`.
+
+  This bijector maps inputs from `[0, inf]` to [0, 1]`. The inverse of the
+  bijector applied to a uniform random variable `X ~ U(0, 1) gives back a
+  random variable with the
+  [Weibull distribution](https://en.wikipedia.org/wiki/Weibull_distribution):
+
+  ```none
+  Y ~ Weibull(scale, concentration)
+  pdf(y; scale, concentration, y >= 0) = (scale / concentration) * (
+    scale / concentration) ** (concentration - 1) * exp(
+      -(y / scale) ** concentration)
+  ```
+  """
+
+  def __init__(self,
+               scale=1.,
+               concentration=1.,
+               event_ndims=0,
+               validate_args=False,
+               name="weibull"):
+    """Instantiates the `Weibull` bijector.
+
+    Args:
+      scale: Positive Float-type `Tensor` that is the same dtype and is
+        broadcastable with `concentration`.
+        This is `l` in `Y = g(X) = 1 - exp((-x / l) ** k)`.
+      concentration: Positive Float-type `Tensor` that is the same dtype and is
+        broadcastable with `scale`.
+        This is `k` in `Y = g(X) = 1 - exp((-x / l) ** k)`.
+      event_ndims: Python scalar indicating the number of dimensions associated
+        with a particular draw from the distribution.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+    """
+    self._graph_parents = []
+    self._name = name
+    self._validate_args = validate_args
+    with self._name_scope("init", values=[scale, concentration]):
+      self._scale = ops.convert_to_tensor(scale, name="scale")
+      self._concentration = ops.convert_to_tensor(
+          concentration, name="concentration")
+      check_ops.assert_same_float_dtype([self._scale, self._concentration])
+      if validate_args:
+        self._scale = control_flow_ops.with_dependencies([
+            check_ops.assert_positive(
+                self._scale,
+                message="Argument scale was not positive")
+        ], self._scale)
+        self._concentration = control_flow_ops.with_dependencies([
+            check_ops.assert_positive(
+                self._concentration,
+                message="Argument concentration was not positive")
+        ], self._concentration)
+
+    super(Weibull, self).__init__(
+        event_ndims=event_ndims,
+        validate_args=validate_args,
+        name=name)
+
+  @property
+  def scale(self):
+    """The `l` in `Y = g(X) = 1 - exp((-x / l) ** k)`."""
+    return self._scale
+
+  @property
+  def concentration(self):
+    """The `k` in `Y = g(X) = 1 - exp((-x / l) ** k)`."""
+    return self._concentration
+
+  def _forward(self, x):
+    x = self._maybe_assert_valid_x(x)
+    return -math_ops.expm1(-((x / self.scale) ** self.concentration))
+
+  def _inverse(self, y):
+    y = self._maybe_assert_valid_y(y)
+    return self.scale * (-math_ops.log1p(-y)) ** (1 / self.concentration)
+
+  def _inverse_log_det_jacobian(self, y):
+    y = self._maybe_assert_valid_y(y)
+    event_dims = self._event_dims_tensor(y)
+    return math_ops.reduce_sum(
+        -math_ops.log1p(-y) +
+        (1 / self.concentration - 1) * math_ops.log(-math_ops.log1p(-y)) +
+        math_ops.log(self.scale / self.concentration),
+        axis=event_dims)
+
+  def _forward_log_det_jacobian(self, x):
+    x = self._maybe_assert_valid_x(x)
+    event_dims = self._event_dims_tensor(x)
+    return math_ops.reduce_sum(
+        -(x / self.scale) ** self.concentration +
+        (self.concentration - 1) * math_ops.log(x) +
+        math_ops.log(self.concentration) +
+        -self.concentration * math_ops.log(self.scale),
+        axis=event_dims)
+
+  def _maybe_assert_valid_x(self, x):
+    if not self.validate_args:
+      return x
+    is_valid = check_ops.assert_non_negative(
+        x,
+        message="Forward transformation input must be at least {}.".format(0))
+    return control_flow_ops.with_dependencies([is_valid], x)
+
+  def _maybe_assert_valid_y(self, y):
+    if not self.validate_args:
+      return y
+    is_positive = check_ops.assert_non_negative(
+        y, message="Inverse transformation input must be greater than 0.")
+    less_than_one = check_ops.assert_less_equal(
+        y, constant_op.constant(1., y.dtype),
+        message="Inverse transformation input must be less than or equal to 1.")
+    return control_flow_ops.with_dependencies([is_positive, less_than_one], y)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/weibull_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/weibull_impl.py
deleted file mode 100644
index 00520bcda85e9527767e6342bf75f10667c264a8..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/weibull_impl.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Weibull bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector
-
-
-__all__ = [
-    "Weibull",
-]
-
-
-class Weibull(bijector.Bijector):
-  """Compute `Y = g(X) = 1 - exp((-X / scale) ** concentration), X >= 0`.
-
-  This bijector maps inputs from `[0, inf]` to [0, 1]`. The inverse of the
-  bijector applied to a uniform random variable `X ~ U(0, 1) gives back a
-  random variable with the
-  [Weibull distribution](https://en.wikipedia.org/wiki/Weibull_distribution):
-
-  ```none
-  Y ~ Weibull(scale, concentration)
-  pdf(y; scale, concentration, y >= 0) = (scale / concentration) * (
-    scale / concentration) ** (concentration - 1) * exp(
-      -(y / scale) ** concentration)
-  ```
-  """
-
-  def __init__(self,
-               scale=1.,
-               concentration=1.,
-               event_ndims=0,
-               validate_args=False,
-               name="weibull"):
-    """Instantiates the `Weibull` bijector.
-
-    Args:
-      scale: Positive Float-type `Tensor` that is the same dtype and is
-        broadcastable with `concentration`.
-        This is `l` in `Y = g(X) = 1 - exp((-x / l) ** k)`.
-      concentration: Positive Float-type `Tensor` that is the same dtype and is
-        broadcastable with `scale`.
-        This is `k` in `Y = g(X) = 1 - exp((-x / l) ** k)`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str` name given to ops managed by this object.
-    """
-    self._graph_parents = []
-    self._name = name
-    self._validate_args = validate_args
-    with self._name_scope("init", values=[scale, concentration]):
-      self._scale = ops.convert_to_tensor(scale, name="scale")
-      self._concentration = ops.convert_to_tensor(
-          concentration, name="concentration")
-      check_ops.assert_same_float_dtype([self._scale, self._concentration])
-      if validate_args:
-        self._scale = control_flow_ops.with_dependencies([
-            check_ops.assert_positive(
-                self._scale,
-                message="Argument scale was not positive")
-        ], self._scale)
-        self._concentration = control_flow_ops.with_dependencies([
-            check_ops.assert_positive(
-                self._concentration,
-                message="Argument concentration was not positive")
-        ], self._concentration)
-
-    super(Weibull, self).__init__(
-        event_ndims=event_ndims,
-        validate_args=validate_args,
-        name=name)
-
-  @property
-  def scale(self):
-    """The `l` in `Y = g(X) = 1 - exp((-x / l) ** k)`."""
-    return self._scale
-
-  @property
-  def concentration(self):
-    """The `k` in `Y = g(X) = 1 - exp((-x / l) ** k)`."""
-    return self._concentration
-
-  def _forward(self, x):
-    x = self._maybe_assert_valid_x(x)
-    return -math_ops.expm1(-((x / self.scale) ** self.concentration))
-
-  def _inverse(self, y):
-    y = self._maybe_assert_valid_y(y)
-    return self.scale * (-math_ops.log1p(-y)) ** (1 / self.concentration)
-
-  def _inverse_log_det_jacobian(self, y):
-    y = self._maybe_assert_valid_y(y)
-    event_dims = self._event_dims_tensor(y)
-    return math_ops.reduce_sum(
-        -math_ops.log1p(-y) +
-        (1 / self.concentration - 1) * math_ops.log(-math_ops.log1p(-y)) +
-        math_ops.log(self.scale / self.concentration),
-        axis=event_dims)
-
-  def _forward_log_det_jacobian(self, x):
-    x = self._maybe_assert_valid_x(x)
-    event_dims = self._event_dims_tensor(x)
-    return math_ops.reduce_sum(
-        -(x / self.scale) ** self.concentration +
-        (self.concentration - 1) * math_ops.log(x) +
-        math_ops.log(self.concentration) +
-        -self.concentration * math_ops.log(self.scale),
-        axis=event_dims)
-
-  def _maybe_assert_valid_x(self, x):
-    if not self.validate_args:
-      return x
-    is_valid = check_ops.assert_non_negative(
-        x,
-        message="Forward transformation input must be at least {}.".format(0))
-    return control_flow_ops.with_dependencies([is_valid], x)
-
-  def _maybe_assert_valid_y(self, y):
-    if not self.validate_args:
-      return y
-    is_positive = check_ops.assert_non_negative(
-        y, message="Inverse transformation input must be greater than 0.")
-    less_than_one = check_ops.assert_less_equal(
-        y, constant_op.constant(1., y.dtype),
-        message="Inverse transformation input must be less than or equal to 1.")
-    return control_flow_ops.with_dependencies([is_positive, less_than_one], y)
diff --git a/tensorflow/contrib/distributions/python/ops/cauchy.py b/tensorflow/contrib/distributions/python/ops/cauchy.py
index a17bb091f69b651d21f70a25c5aab61b203e62de..6f5d724a2a945ed8f9c159d8314327c6f994d1db 100644
--- a/tensorflow/contrib/distributions/python/ops/cauchy.py
+++ b/tensorflow/contrib/distributions/python/ops/cauchy.py
@@ -30,7 +30,6 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
 
-
 __all__ = [
     "Cauchy",
 ]
@@ -44,16 +43,17 @@ class Cauchy(distribution.Distribution):
   The probability density function (pdf) is,
 
   ```none
-  pdf(x; loc, scale) = 1 / (pi * scale * (1 + ((x - loc) / scale)**2))
+  pdf(x; loc, scale) = 1 / (pi scale (1 + z**2))
+  z = (x - loc) / scale
   ```
   where `loc` is the location, and `scale` is the scale.
 
   The Cauchy distribution is a member of the [location-scale family](
   https://en.wikipedia.org/wiki/Location-scale_family), i.e.
+  `Y ~ Cauchy(loc, scale)` is equivalent to,
 
   ```none
   X ~ Cauchy(loc=0, scale=1)
-  Y ~ Cauchy(loc=loc, scale=scale)
   Y = loc + scale * X
   ```
 
@@ -62,14 +62,16 @@ class Cauchy(distribution.Distribution):
   Examples of initialization of one or a batch of distributions.
 
   ```python
+  tfd = tf.contrib.distributions
+
   # Define a single scalar Cauchy distribution.
-  dist = Cauchy(loc=0., scale=3.)
+  dist = tfd.Cauchy(loc=0., scale=3.)
 
   # Evaluate the cdf at 1, returning a scalar.
   dist.cdf(1.)
 
   # Define a batch of two scalar valued Cauchy distributions.
-  dist = Cauchy(loc=[1, 2.], scale=[11, 22.])
+  dist = tfd.Cauchy(loc=[1, 2.], scale=[11, 22.])
 
   # Evaluate the pdf of the first distribution on 0, and the second on 1.5,
   # returning a length two tensor.
@@ -77,18 +79,17 @@ class Cauchy(distribution.Distribution):
 
   # Get 3 samples, returning a 3 x 2 tensor.
   dist.sample([3])
-  ```
-
-  Arguments are broadcast when possible.
 
-  ```python
+  # Arguments are broadcast when possible.
   # Define a batch of two scalar valued Cauchy distributions.
   # Both have median 1, but different scales.
-  dist = tf.contrib.distributions.Cauchy(loc=1., scale=[11, 22.])
+  dist = tfd.Cauchy(loc=1., scale=[11, 22.])
+
   # Evaluate the pdf of both distributions on the same point, 3.0,
   # returning a length 2 tensor.
-  dist.prob(3.0)
+  dist.prob(3.)
   ```
+
   """
 
   def __init__(self,
@@ -97,7 +98,7 @@ class Cauchy(distribution.Distribution):
                validate_args=False,
                allow_nan_stats=True,
                name="Cauchy"):
-    """Construct Cauchy distributions with loc and and scale `loc` and `scale`.
+    """Construct Cauchy distributions.
 
     The parameters `loc` and `scale` must be shaped in a way that supports
     broadcasting (e.g. `loc + scale` is a valid operation).
@@ -121,8 +122,8 @@ class Cauchy(distribution.Distribution):
     """
     parameters = locals()
     with ops.name_scope(name, values=[loc, scale]):
-      with ops.control_dependencies([check_ops.assert_positive(scale)] if
-                                    validate_args else []):
+      with ops.control_dependencies([check_ops.assert_positive(scale)]
+                                    if validate_args else []):
         self._loc = array_ops.identity(loc, name="loc")
         self._scale = array_ops.identity(scale, name="scale")
         check_ops.assert_same_float_dtype([self._loc, self._scale])
@@ -138,8 +139,8 @@ class Cauchy(distribution.Distribution):
   @staticmethod
   def _param_shapes(sample_shape):
     return dict(
-        zip(("loc", "scale"), ([ops.convert_to_tensor(
-            sample_shape, dtype=dtypes.int32)] * 2)))
+        zip(("loc", "scale"),
+            ([ops.convert_to_tensor(sample_shape, dtype=dtypes.int32)] * 2)))
 
   @property
   def loc(self):
@@ -153,13 +154,10 @@ class Cauchy(distribution.Distribution):
 
   def _batch_shape_tensor(self):
     return array_ops.broadcast_dynamic_shape(
-        array_ops.shape(self.loc),
-        array_ops.shape(self.scale))
+        array_ops.shape(self.loc), array_ops.shape(self.scale))
 
   def _batch_shape(self):
-    return array_ops.broadcast_static_shape(
-        self.loc.shape,
-        self.scale.shape)
+    return array_ops.broadcast_static_shape(self.loc.shape, self.scale.shape)
 
   def _event_shape_tensor(self):
     return constant_op.constant([], dtype=dtypes.int32)
diff --git a/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py b/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py
index 599c855cda434d9249187d5d154d50a8a8c49a6c..1d4c5660d8d73b7b6a7e758fc834ccfddeb5c8ea 100644
--- a/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py
+++ b/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py
@@ -121,7 +121,7 @@ class ConditionalTransformedDistribution(
     log_prob = self.distribution.log_prob(x, **distribution_kwargs)
     if self._is_maybe_event_override:
       log_prob = math_ops.reduce_sum(log_prob, self._reduce_event_indices)
-    return ildj + log_prob
+    return math_ops.cast(ildj, log_prob.dtype) + log_prob
 
   @distribution_util.AppendDocstring(kwargs_dict=_condition_kwargs_dict)
   def _prob(self, y, bijector_kwargs=None, distribution_kwargs=None):
@@ -143,7 +143,7 @@ class ConditionalTransformedDistribution(
     prob = self.distribution.prob(x, **distribution_kwargs)
     if self._is_maybe_event_override:
       prob = math_ops.reduce_prod(prob, self._reduce_event_indices)
-    return math_ops.exp(ildj) * prob
+    return math_ops.exp(math_ops.cast(ildj, prob.dtype)) * prob
 
   @distribution_util.AppendDocstring(kwargs_dict=_condition_kwargs_dict)
   def _log_cdf(self, y, bijector_kwargs=None, distribution_kwargs=None):
diff --git a/tensorflow/contrib/distributions/python/ops/deterministic.py b/tensorflow/contrib/distributions/python/ops/deterministic.py
index 850d08d1bd69ebc7661557d648e2bffe77e6a908..8049522e9f5dc26b244b7e710a9ae8b981efd6b6 100644
--- a/tensorflow/contrib/distributions/python/ops/deterministic.py
+++ b/tensorflow/contrib/distributions/python/ops/deterministic.py
@@ -290,8 +290,10 @@ class VectorDeterministic(_BaseDeterministic):
   #### Examples
 
   ```python
+  tfd = tf.contrib.distributions
+
   # Initialize a single VectorDeterministic supported at [0., 2.] in R^2.
-  constant = tf.contrib.distributions.Deterministic([0., 2.])
+  constant = tfd.Deterministic([0., 2.])
   constant.prob([0., 2.])
   ==> 1.
   constant.prob([0., 3.])
@@ -299,7 +301,7 @@ class VectorDeterministic(_BaseDeterministic):
 
   # Initialize a [3] batch of constants on R^2.
   loc = [[0., 1.], [2., 3.], [4., 5.]]
-  constant = constant_lib.VectorDeterministic(loc)
+  constant = tfd.VectorDeterministic(loc)
   constant.prob([[0., 1.], [1.9, 3.], [3.99, 5.]])
   ==> [1., 0., 0.]
   ```
diff --git a/tensorflow/contrib/distributions/python/ops/distribution_util.py b/tensorflow/contrib/distributions/python/ops/distribution_util.py
index 869b5698e57d199755ce1686a74a1eafe3b73e7d..a4d249d41ec9733721a3583d3708e0da56db1733 100644
--- a/tensorflow/contrib/distributions/python/ops/distribution_util.py
+++ b/tensorflow/contrib/distributions/python/ops/distribution_util.py
@@ -19,9 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib import linalg
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
@@ -330,54 +328,14 @@ def shapes_from_loc_and_scale(loc, scale, name="shapes_from_loc_and_scale"):
       else:
         loc_batch_shape = ops.convert_to_tensor(loc_batch_shape,
                                                 name="loc_batch_shape")
+      # This is defined in the core util module.
+      # pylint: disable=undefined-variable
       batch_shape = prefer_static_broadcast_shape(batch_shape, loc_batch_shape)
+      # pylint: enable=undefined-variable
 
   return batch_shape, event_shape
 
 
-def prefer_static_broadcast_shape(
-    shape1, shape2, name="prefer_static_broadcast_shape"):
-  """Convenience function which statically broadcasts shape when possible.
-
-  Args:
-    shape1:  `1-D` integer `Tensor`.  Already converted to tensor!
-    shape2:  `1-D` integer `Tensor`.  Already converted to tensor!
-    name:  A string name to prepend to created ops.
-
-  Returns:
-    The broadcast shape, either as `TensorShape` (if broadcast can be done
-      statically), or as a `Tensor`.
-  """
-  with ops.name_scope(name, values=[shape1, shape2]):
-    def make_shape_tensor(x):
-      return ops.convert_to_tensor(x, name="shape", dtype=dtypes.int32)
-
-    def get_tensor_shape(s):
-      if isinstance(s, tensor_shape.TensorShape):
-        return s
-      s_ = tensor_util.constant_value(make_shape_tensor(s))
-      if s_ is not None:
-        return tensor_shape.TensorShape(s_)
-      return None
-
-    def get_shape_tensor(s):
-      if not isinstance(s, tensor_shape.TensorShape):
-        return make_shape_tensor(s)
-      if s.is_fully_defined():
-        return make_shape_tensor(s.as_list())
-      raise ValueError("Cannot broadcast from partially "
-                       "defined `TensorShape`.")
-
-    shape1_ = get_tensor_shape(shape1)
-    shape2_ = get_tensor_shape(shape2)
-    if shape1_ is not None and shape2_ is not None:
-      return array_ops.broadcast_static_shape(shape1_, shape2_)
-
-    shape1_ = get_shape_tensor(shape1)
-    shape2_ = get_shape_tensor(shape2)
-    return array_ops.broadcast_dynamic_shape(shape1_, shape2_)
-
-
 def get_broadcast_shape(*tensors):
   """Get broadcast shape as a Python list of integers (preferred) or `Tensor`.
 
diff --git a/tensorflow/contrib/distributions/python/ops/gumbel.py b/tensorflow/contrib/distributions/python/ops/gumbel.py
index ba8d3c639b397422f0f6210ba9f48650f0da1e3e..d0efaefb8e78ddf4436e9e5a112d2c1cdddaf3b5 100644
--- a/tensorflow/contrib/distributions/python/ops/gumbel.py
+++ b/tensorflow/contrib/distributions/python/ops/gumbel.py
@@ -62,15 +62,17 @@ class _Gumbel(distribution.Distribution):
   Examples of initialization of one or a batch of distributions.
 
   ```python
+  tfd = tf.contrib.distributions
+
   # Define a single scalar Gumbel distribution.
-  dist = tf.contrib.distributions.Gumbel(loc=0., scale=3.)
+  dist = tfd.Gumbel(loc=0., scale=3.)
 
   # Evaluate the cdf at 1, returning a scalar.
   dist.cdf(1.)
 
   # Define a batch of two scalar valued Gumbels.
   # The first has mean 1 and scale 11, the second 2 and 22.
-  dist = tf.contrib.distributions.Gumbel(loc=[1, 2.], scale=[11, 22.])
+  dist = tfd.Gumbel(loc=[1, 2.], scale=[11, 22.])
 
   # Evaluate the pdf of the first distribution on 0, and the second on 1.5,
   # returning a length two tensor.
@@ -85,7 +87,7 @@ class _Gumbel(distribution.Distribution):
   ```python
   # Define a batch of two scalar valued Logistics.
   # Both have mean 1, but different scales.
-  dist = tf.contrib.distributions.Gumbel(loc=1., scale=[11, 22.])
+  dist = tfd.Gumbel(loc=1., scale=[11, 22.])
 
   # Evaluate the pdf of both distributions on the same point, 3.0,
   # returning a length 2 tensor.
diff --git a/tensorflow/contrib/distributions/python/ops/half_normal.py b/tensorflow/contrib/distributions/python/ops/half_normal.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc0751a6e0b78cb3d79bd3478e740bb05cd26428
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/half_normal.py
@@ -0,0 +1,171 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The Half Normal distribution class."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import special_math
+
+
+__all__ = [
+    "HalfNormal",
+]
+
+
+class HalfNormal(distribution.Distribution):
+  """The Half Normal distribution with scale `scale`.
+
+  #### Mathematical details
+
+  The half normal is a transformation of a centered normal distribution.
+  If some random variable `X` has normal distribution,
+  ```none
+  X ~ Normal(0.0, scale)
+  Y = |X|
+  ```
+  Then `Y` will have half normal distribution. The probability density
+  function (pdf) is:
+
+  ```none
+  pdf(x; scale, x > 0) = sqrt(2) / (scale * sqrt(pi)) *
+    exp(- 1/2 * (x / scale) ** 2)
+  )
+  ```
+  Where `scale = sigma` is the standard deviation of the underlying normal
+  distribution.
+
+  #### Examples
+
+  Examples of initialization of one or a batch of distributions.
+
+  ```python
+  # Define a single scalar HalfNormal distribution.
+  dist = tf.contrib.distributions.HalfNormal(scale=3.0)
+
+  # Evaluate the cdf at 1, returning a scalar.
+  dist.cdf(1.)
+
+  # Define a batch of two scalar valued HalfNormals.
+  # The first has scale 11.0, the second 22.0
+  dist = tf.contrib.distributions.HalfNormal(scale=[11.0, 22.0])
+
+  # Evaluate the pdf of the first distribution on 1.0, and the second on 1.5,
+  # returning a length two tensor.
+  dist.prob([1.0, 1.5])
+
+  # Get 3 samples, returning a 3 x 2 tensor.
+  dist.sample([3])
+  ```
+
+  """
+
+  def __init__(self,
+               scale,
+               validate_args=False,
+               allow_nan_stats=True,
+               name="HalfNormal"):
+    """Construct HalfNormals with scale `scale`.
+
+    Args:
+      scale: Floating point tensor; the scales of the distribution(s).
+        Must contain only positive values.
+      validate_args: Python `bool`, default `False`. When `True` distribution
+        parameters are checked for validity despite possibly degrading runtime
+        performance. When `False` invalid inputs may silently render incorrect
+        outputs.
+      allow_nan_stats: Python `bool`, default `True`. When `True`,
+        statistics (e.g., mean, mode, variance) use the value "`NaN`" to
+        indicate the result is undefined. When `False`, an exception is raised
+        if one or more of the statistic's batch members are undefined.
+      name: Python `str` name prefixed to Ops created by this class.
+    """
+    parameters = locals()
+    with ops.name_scope(name, values=[scale]):
+      with ops.control_dependencies([check_ops.assert_positive(scale)] if
+                                    validate_args else []):
+        self._scale = array_ops.identity(scale, name="scale")
+    super(HalfNormal, self).__init__(
+        dtype=self._scale.dtype,
+        reparameterization_type=distribution.FULLY_REPARAMETERIZED,
+        validate_args=validate_args,
+        allow_nan_stats=allow_nan_stats,
+        parameters=parameters,
+        graph_parents=[self._scale],
+        name=name)
+
+  @staticmethod
+  def _param_shapes(sample_shape):
+    return {"scale": ops.convert_to_tensor(sample_shape, dtype=dtypes.int32)}
+
+  @property
+  def scale(self):
+    """Distribution parameter for the scale."""
+    return self._scale
+
+  def _batch_shape_tensor(self):
+    return array_ops.shape(self.scale)
+
+  def _batch_shape(self):
+    return self.scale.shape
+
+  def _event_shape_tensor(self):
+    return constant_op.constant([], dtype=dtypes.int32)
+
+  def _event_shape(self):
+    return tensor_shape.scalar()
+
+  def _sample_n(self, n, seed=None):
+    shape = array_ops.concat([[n], self.batch_shape_tensor()], 0)
+    sampled = random_ops.random_normal(
+        shape=shape, mean=0., stddev=1., dtype=self.dtype, seed=seed)
+    return math_ops.abs(sampled * self.scale)
+
+  def _prob(self, x):
+    coeff = np.sqrt(2) / self.scale / np.sqrt(np.pi)
+    pdf = coeff * math_ops.exp(- 0.5 * (x / self.scale) ** 2)
+    return pdf * math_ops.cast(x >= 0, self.dtype)
+
+  def _cdf(self, x):
+    truncated_x = nn.relu(x)
+    return math_ops.erf(truncated_x / self.scale / np.sqrt(2.0))
+
+  def _entropy(self):
+    return 0.5 * math_ops.log(np.pi * self.scale ** 2.0 / 2.0) + 0.5
+
+  def _mean(self):
+    return self.scale * np.sqrt(2.0) / np.sqrt(np.pi)
+
+  def _quantile(self, p):
+    return np.sqrt(2.0) * self.scale * special_math.erfinv(p)
+
+  def _mode(self):
+    return array_ops.zeros(self.batch_shape_tensor())
+
+  def _variance(self):
+    return self.scale ** 2.0 * (1.0 - 2.0 / np.pi)
diff --git a/tensorflow/contrib/distributions/python/ops/independent.py b/tensorflow/contrib/distributions/python/ops/independent.py
index 6a74ca9a0ae1ad30081d21cc15a65be052a99e2a..cbce005013281ff3c58c94d525d5ce7a865d725a 100644
--- a/tensorflow/contrib/distributions/python/ops/independent.py
+++ b/tensorflow/contrib/distributions/python/ops/independent.py
@@ -68,11 +68,11 @@ class Independent(distribution_lib.Distribution):
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
 
   # Make independent distribution from a 2-batch Normal.
-  ind = ds.Independent(
-      distribution=ds.Normal(loc=[-1., 1], scale=[0.1, 0.5]),
+  ind = tfd.Independent(
+      distribution=tfd.Normal(loc=[-1., 1], scale=[0.1, 0.5]),
       reinterpreted_batch_ndims=1)
 
   # All batch dims have been "absorbed" into event dims.
@@ -80,8 +80,8 @@ class Independent(distribution_lib.Distribution):
   ind.event_shape  # ==> [2]
 
   # Make independent distribution from a 2-batch bivariate Normal.
-  ind = ds.Independent(
-      distribution=ds.MultivariateNormalDiag(
+  ind = tfd.Independent(
+      distribution=tfd.MultivariateNormalDiag(
           loc=[[-1., 1], [1, -1]],
           scale_identity_multiplier=[1., 0.5]),
       reinterpreted_batch_ndims=1)
diff --git a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
index 956dee38a378813434656a28a69c89b6ec1e8b72..ee4d86867d48b20e97757bcec57d452085814b80 100644
--- a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
+++ b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
@@ -88,8 +88,9 @@ class InverseGamma(distribution.Distribution):
   #### Examples
 
   ```python
-  dist = InverseGamma(concentration=3.0, rate=2.0)
-  dist2 = InverseGamma(concentration=[3.0, 4.0], rate=[2.0, 3.0])
+  tfd = tf.contrib.distributions
+  dist = tfd.InverseGamma(concentration=3.0, rate=2.0)
+  dist2 = tfd.InverseGamma(concentration=[3.0, 4.0], rate=[2.0, 3.0])
   ```
 
   """
diff --git a/tensorflow/contrib/distributions/python/ops/logistic.py b/tensorflow/contrib/distributions/python/ops/logistic.py
index 48794a48828fe796e233e968d8c755136ce166ad..473677f8d91b184e029f345bb05f5c5d63df7a40 100644
--- a/tensorflow/contrib/distributions/python/ops/logistic.py
+++ b/tensorflow/contrib/distributions/python/ops/logistic.py
@@ -60,15 +60,17 @@ class Logistic(distribution.Distribution):
   Examples of initialization of one or a batch of distributions.
 
   ```python
+  tfd = tf.contrib.distributions
+
   # Define a single scalar Logistic distribution.
-  dist = tf.contrib.distributions.Logistic(loc=0., scale=3.)
+  dist = tfd.Logistic(loc=0., scale=3.)
 
   # Evaluate the cdf at 1, returning a scalar.
   dist.cdf(1.)
 
   # Define a batch of two scalar valued Logistics.
   # The first has mean 1 and scale 11, the second 2 and 22.
-  dist = tf.contrib.distributions.Logistic(loc=[1, 2.], scale=[11, 22.])
+  dist = tfd.Logistic(loc=[1, 2.], scale=[11, 22.])
 
   # Evaluate the pdf of the first distribution on 0, and the second on 1.5,
   # returning a length two tensor.
@@ -76,14 +78,11 @@ class Logistic(distribution.Distribution):
 
   # Get 3 samples, returning a 3 x 2 tensor.
   dist.sample([3])
-  ```
 
-  Arguments are broadcast when possible.
-
-  ```python
+  # Arguments are broadcast when possible.
   # Define a batch of two scalar valued Logistics.
   # Both have mean 1, but different scales.
-  dist = tf.contrib.distributions.Logistic(loc=1., scale=[11, 22.])
+  dist = tfd.Logistic(loc=1., scale=[11, 22.])
 
   # Evaluate the pdf of both distributions on the same point, 3.0,
   # returning a length 2 tensor.
diff --git a/tensorflow/contrib/distributions/python/ops/mixture.py b/tensorflow/contrib/distributions/python/ops/mixture.py
index e676931d9145e72907d990148ee2d180e0da0258..f2d492f5489a197157558ae727416b51db04793e 100644
--- a/tensorflow/contrib/distributions/python/ops/mixture.py
+++ b/tensorflow/contrib/distributions/python/ops/mixture.py
@@ -49,13 +49,13 @@ class Mixture(distribution.Distribution):
 
   ```python
   # Create a mixture of two Gaussians:
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
   mix = 0.3
-  bimix_gauss = ds.Mixture(
-    cat=ds.Categorical(probs=[mix, 1.-mix]),
+  bimix_gauss = tfd.Mixture(
+    cat=tfd.Categorical(probs=[mix, 1.-mix]),
     components=[
-      ds.Normal(loc=-1., scale=0.1),
-      ds.Normal(loc=+1., scale=0.5),
+      tfd.Normal(loc=-1., scale=0.1),
+      tfd.Normal(loc=+1., scale=0.5),
   ])
 
   # Plot the PDF.
diff --git a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
index 5558ef0f255db684b229d129666634e50c625887..0ca236c3761f9d3a0fcc79ff9db792319108db0d 100644
--- a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
+++ b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
@@ -43,15 +43,14 @@ class MixtureSameFamily(distribution.Distribution):
   #### Examples
 
   ```python
-  import matplotlib.pyplot as plt
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
 
   ### Create a mixture of two scalar Gaussians:
 
-  gm = ds.MixtureSameFamily(
-      mixture_distribution=ds.Categorical(
+  gm = tfd.MixtureSameFamily(
+      mixture_distribution=tfd.Categorical(
           probs=[0.3, 0.7]),
-      components_distribution=ds.Normal(
+      components_distribution=tfd.Normal(
         loc=[-1., 1],       # One for each component.
         scale=[0.1, 0.5]))  # And same here.
 
@@ -63,14 +62,15 @@ class MixtureSameFamily(distribution.Distribution):
 
   # Plot PDF.
   x = np.linspace(-2., 3., int(1e4), dtype=np.float32)
+  import matplotlib.pyplot as plt
   plt.plot(x, gm.prob(x).eval());
 
   ### Create a mixture of two Bivariate Gaussians:
 
-  gm = ds.MixtureSameFamily(
-      mixture_distribution=ds.Categorical(
+  gm = tfd.MixtureSameFamily(
+      mixture_distribution=tfd.Categorical(
           probs=[0.3, 0.7]),
-      components_distribution=ds.MultivariateNormalDiag(
+      components_distribution=tfd.MultivariateNormalDiag(
           loc=[[-1., 1],  # component 1
                [1, -1]],  # component 2
           scale_identity_multiplier=[.3, .6]))
@@ -320,13 +320,14 @@ class MixtureSameFamily(distribution.Distribution):
         return array_ops.shape(d.batch_shape_tensor())[0]
       dist_batch_ndims = _get_ndims(self)
       cat_batch_ndims = _get_ndims(self.mixture_distribution)
-      bnd = distribution_util.pick_vector(
+      pad_ndims = array_ops.where(
           self.mixture_distribution.is_scalar_batch(),
-          [dist_batch_ndims], [cat_batch_ndims])[0]
+          dist_batch_ndims,
+          dist_batch_ndims - cat_batch_ndims)
       s = array_ops.shape(x)
       x = array_ops.reshape(x, shape=array_ops.concat([
           s[:-1],
-          array_ops.ones([bnd], dtype=dtypes.int32),
+          array_ops.ones([pad_ndims], dtype=dtypes.int32),
           s[-1:],
           array_ops.ones([self._event_ndims], dtype=dtypes.int32),
       ], axis=0))
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_diag.py b/tensorflow/contrib/distributions/python/ops/mvn_diag.py
index 163cf75d990d5fe7ec1e3aaf0040fc71f61774a7..e862552880f4073c8fa8e90134d0633e7484b0bf 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_diag.py
@@ -84,10 +84,10 @@ class MultivariateNormalDiag(
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
 
   # Initialize a single 2-variate Gaussian.
-  mvn = ds.MultivariateNormalDiag(
+  mvn = tfd.MultivariateNormalDiag(
       loc=[1., -1],
       scale_diag=[1, 2.])
 
@@ -101,7 +101,7 @@ class MultivariateNormalDiag(
   mvn.prob([-1., 0]).eval()  # shape: []
 
   # Initialize a 3-batch, 2-variate scaled-identity Gaussian.
-  mvn = ds.MultivariateNormalDiag(
+  mvn = tfd.MultivariateNormalDiag(
       loc=[1., -1],
       scale_identity_multiplier=[1, 2., 3])
 
@@ -119,7 +119,7 @@ class MultivariateNormalDiag(
   mvn.prob([-1., 0]).eval()  # shape: [3]
 
   # Initialize a 2-batch of 3-variate Gaussians.
-  mvn = ds.MultivariateNormalDiag(
+  mvn = tfd.MultivariateNormalDiag(
       loc=[[1., 2, 3],
            [11, 22, 33]]           # shape: [2, 3]
       scale_diag=[[1., 2, 3],
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py b/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
index 040bc230722194316b8a74627344e315a2578281..413e88f03ae0286c294f3404549a73e1a47dcff7 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
@@ -86,7 +86,7 @@ class MultivariateNormalDiagPlusLowRank(
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
 
   # Initialize a single 3-variate Gaussian with covariance `cov = S @ S.T`,
   # `S = diag(d) + U @ diag(m) @ U.T`. The perturbation, `U @ diag(m) @ U.T`, is
@@ -97,7 +97,7 @@ class MultivariateNormalDiagPlusLowRank(
        [-1, 1],
        [2, -0.5]]        # shape: [3, 2]
   m = [4., 5]            # shape: [2]
-  mvn = ds.MultivariateNormalDiagPlusLowRank(
+  mvn = tfd.MultivariateNormalDiagPlusLowRank(
       loc=mu
       scale_diag=d
       scale_perturb_factor=U,
@@ -118,7 +118,7 @@ class MultivariateNormalDiagPlusLowRank(
   m = [[0.1, 0.2],
        [0.4, 0.5]]         # shape: [b, r] = [2, 2]
 
-  mvn = ds.MultivariateNormalDiagPlusLowRank(
+  mvn = tfd.MultivariateNormalDiagPlusLowRank(
       loc=mu,
       scale_perturb_factor=U,
       scale_perturb_diag=m)
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py b/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py
index f9952b2069d6dfd2593e6bd71ede0badf44cdf98..00a18569fce0175ee39e433dfad796e5f21fe8a4 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py
@@ -18,12 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.contrib.distributions.python.ops import mvn_tril
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
 
 
 __all__ = [
@@ -73,14 +76,14 @@ class MultivariateNormalFullCovariance(mvn_tril.MultivariateNormalTriL):
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
 
   # Initialize a single 3-variate Gaussian.
   mu = [1., 2, 3]
   cov = [[ 0.36,  0.12,  0.06],
          [ 0.12,  0.29, -0.13],
          [ 0.06, -0.13,  0.26]]
-  mvn = ds.MultivariateNormalFullCovariance(
+  mvn = tfd.MultivariateNormalFullCovariance(
       loc=mu,
       covariance_matrix=cov)
 
@@ -100,7 +103,7 @@ class MultivariateNormalFullCovariance(mvn_tril.MultivariateNormalTriL):
   mu = [[1., 2, 3],
         [11, 22, 33]]              # shape: [2, 3]
   covariance_matrix = ...  # shape: [2, 3, 3], symmetric, positive definite.
-  mvn = ds.MultivariateNormalFullCovariance(
+  mvn = tfd.MultivariateNormalFullCovariance(
       loc=mu,
       covariance=covariance_matrix)
 
@@ -167,9 +170,12 @@ class MultivariateNormalFullCovariance(mvn_tril.MultivariateNormalTriL):
           covariance_matrix = ops.convert_to_tensor(
               covariance_matrix, name="covariance_matrix")
           if validate_args:
-            assert_symmetric = check_ops.assert_equal(
-                covariance_matrix,
-                array_ops.matrix_transpose(covariance_matrix),
+            tol = np.finfo(covariance_matrix.dtype.as_numpy_dtype).eps * 10
+            diff = math_ops.abs(
+                covariance_matrix
+                - array_ops.matrix_transpose(covariance_matrix))
+            assert_symmetric = check_ops.assert_less(
+                diff, tol + tol * math_ops.abs(covariance_matrix),
                 message="Matrix was not symmetric.")
             covariance_matrix = control_flow_ops.with_dependencies(
                 [assert_symmetric], covariance_matrix)
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
index 300bdd5f6064a1cc9c336689ac4fae04338edb30..a7399792892f4c179c05168184d76ec95c168b51 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
@@ -90,8 +90,7 @@ class MultivariateNormalLinearOperator(
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
-  la = tf.linalg
+  tfd = tf.contrib.distributions
 
   # Initialize a single 3-variate Gaussian.
   mu = [1., 2, 3]
@@ -103,9 +102,9 @@ class MultivariateNormalLinearOperator(
   #      [ 0.2,  0.5,  0. ],
   #      [ 0.1, -0.3,  0.4]])
 
-  mvn = ds.MultivariateNormalLinearOperator(
+  mvn = tfd.MultivariateNormalLinearOperator(
       loc=mu,
-      scale=la.LinearOperatorLowerTriangular(scale))
+      scale=tf.linalg.LinearOperatorLowerTriangular(scale))
 
   # Covariance agrees with cholesky(cov) parameterization.
   mvn.covariance().eval()
@@ -122,9 +121,9 @@ class MultivariateNormalLinearOperator(
   scale_diag = [[1., 2, 3],
                 [0.5, 1, 1.5]]     # shape: [2, 3]
 
-  mvn = ds.MultivariateNormalLinearOperator(
+  mvn = tfd.MultivariateNormalLinearOperator(
       loc=mu,
-      scale=la.LinearOperatorDiag(scale_diag))
+      scale=tf.linalg.LinearOperatorDiag(scale_diag))
 
   # Compute the pdf of two `R^3` observations; return a length-2 vector.
   x = [[-0.9, 0, 0.1],
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_tril.py b/tensorflow/contrib/distributions/python/ops/mvn_tril.py
index 260dcc18f513d5440d3d39368539274c03faa72a..6c7dc4ca7aaf5b3a20b072e9360d15528ad10556 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_tril.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_tril.py
@@ -76,12 +76,13 @@ class MultivariateNormalTriL(
   ```
 
   Trainable (batch) lower-triangular matrices can be created with
-  `ds.matrix_diag_transform()` and/or `ds.fill_triangular()`
+  `tf.contrib.distributions.matrix_diag_transform()` and/or
+  `tf.contrib.distributions.fill_triangular()`
 
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
 
   # Initialize a single 3-variate Gaussian.
   mu = [1., 2, 3]
@@ -92,7 +93,7 @@ class MultivariateNormalTriL(
   # ==> [[ 0.6,  0. ,  0. ],
   #      [ 0.2,  0.5,  0. ],
   #      [ 0.1, -0.3,  0.4]])
-  mvn = ds.MultivariateNormalTriL(
+  mvn = tfd.MultivariateNormalTriL(
       loc=mu,
       scale_tril=scale)
 
@@ -112,7 +113,7 @@ class MultivariateNormalTriL(
   mu = [[1., 2, 3],
         [11, 22, 33]]              # shape: [2, 3]
   tril = ...  # shape: [2, 3, 3], lower triangular, non-zero diagonal.
-  mvn = ds.MultivariateNormalTriL(
+  mvn = tfd.MultivariateNormalTriL(
       loc=mu,
       scale_tril=tril)
 
@@ -124,9 +125,9 @@ class MultivariateNormalTriL(
   # Instantiate a "learnable" MVN.
   dims = 4
   with tf.variable_scope("model"):
-    mvn = ds.MultivariateNormalTriL(
+    mvn = tfd.MultivariateNormalTriL(
         loc=tf.get_variable(shape=[dims], dtype=tf.float32, name="mu"),
-        scale_tril=ds.fill_triangular(
+        scale_tril=tfd.fill_triangular(
             tf.get_variable(shape=[dims * (dims + 1) / 2],
                             dtype=tf.float32, name="chol_Sigma")))
   ```
diff --git a/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py b/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
index 8a95038a3c8eccf8a75fea79d0a62f9883b4f13a..92f2bba1828696248c9d9460566a08ba372c3358 100644
--- a/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
+++ b/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
@@ -22,21 +22,135 @@ import numpy as np
 
 from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.contrib.distributions.python.ops import poisson as poisson_lib
+from tensorflow.contrib.distributions.python.ops.bijectors.exp import Exp
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import categorical as categorical_lib
 from tensorflow.python.ops.distributions import distribution as distribution_lib
+from tensorflow.python.ops.distributions import normal as normal_lib
+from tensorflow.python.ops.distributions import transformed_distribution as transformed_lib
 
 
 __all__ = [
     "PoissonLogNormalQuadratureCompound",
+    "quadrature_scheme_lognormal_gauss_hermite",
+    "quadrature_scheme_lognormal_quantiles",
 ]
 
 
+def quadrature_scheme_lognormal_gauss_hermite(
+    loc, scale, quadrature_size,
+    validate_args=False, name=None):  # pylint: disable=unused-argument
+  """Use Gauss-Hermite quadrature to form quadrature on positive-reals.
+
+  Note: for a given `quadrature_size`, this method is generally less accurate
+  than `quadrature_scheme_lognormal_quantiles`.
+
+  Args:
+    loc: `float`-like (batch of) scalar `Tensor`; the location parameter of
+      the LogNormal prior.
+    scale: `float`-like (batch of) scalar `Tensor`; the scale parameter of
+      the LogNormal prior.
+    quadrature_size: Python `int` scalar representing the number of quadrature
+      points.
+    validate_args: Python `bool`, default `False`. When `True` distribution
+      parameters are checked for validity despite possibly degrading runtime
+      performance. When `False` invalid inputs may silently render incorrect
+      outputs.
+    name: Python `str` name prefixed to Ops created by this class.
+
+  Returns:
+    grid: (Batch of) length-`quadrature_size` vectors representing the
+      `log_rate` parameters of a `Poisson`.
+    probs: (Batch of) length-`quadrature_size` vectors representing the
+      weight associate with each `grid` value.
+  """
+  with ops.name_scope(name, "vector_diffeomixture_quadrature_gauss_hermite",
+                      [loc, scale]):
+    grid, probs = np.polynomial.hermite.hermgauss(deg=quadrature_size)
+    grid = grid.astype(loc.dtype.as_numpy_dtype)
+    probs = probs.astype(loc.dtype.as_numpy_dtype)
+    probs /= np.linalg.norm(probs, ord=1, keepdims=True)
+    probs = ops.convert_to_tensor(probs, name="probs", dtype=loc.dtype)
+    # The following maps the broadcast of `loc` and `scale` to each grid
+    # point, i.e., we are creating several log-rates that correspond to the
+    # different Gauss-Hermite quadrature points and (possible) batches of
+    # `loc` and `scale`.
+    grid = (loc[..., array_ops.newaxis]
+            + np.sqrt(2.) * scale[..., array_ops.newaxis] * grid)
+    return grid, probs
+
+
+def quadrature_scheme_lognormal_quantiles(
+    loc, scale, quadrature_size,
+    validate_args=False, name=None):
+  """Use LogNormal quantiles to form quadrature on positive-reals.
+
+  Args:
+    loc: `float`-like (batch of) scalar `Tensor`; the location parameter of
+      the LogNormal prior.
+    scale: `float`-like (batch of) scalar `Tensor`; the scale parameter of
+      the LogNormal prior.
+    quadrature_size: Python `int` scalar representing the number of quadrature
+      points.
+    validate_args: Python `bool`, default `False`. When `True` distribution
+      parameters are checked for validity despite possibly degrading runtime
+      performance. When `False` invalid inputs may silently render incorrect
+      outputs.
+    name: Python `str` name prefixed to Ops created by this class.
+
+  Returns:
+    grid: (Batch of) length-`quadrature_size` vectors representing the
+      `log_rate` parameters of a `Poisson`.
+    probs: (Batch of) length-`quadrature_size` vectors representing the
+      weight associate with each `grid` value.
+  """
+  with ops.name_scope(name, "quadrature_scheme_lognormal_quantiles",
+                      [loc, scale]):
+    # Create a LogNormal distribution.
+    dist = transformed_lib.TransformedDistribution(
+        distribution=normal_lib.Normal(loc=loc, scale=scale),
+        bijector=Exp(event_ndims=0),
+        validate_args=validate_args)
+    batch_ndims = dist.batch_shape.ndims
+    if batch_ndims is None:
+      batch_ndims = array_ops.shape(dist.batch_shape_tensor())[0]
+
+    def _compute_quantiles():
+      """Helper to build quantiles."""
+      # Omit {0, 1} since they might lead to Inf/NaN.
+      zero = array_ops.zeros([], dtype=dist.dtype)
+      edges = math_ops.linspace(zero, 1., quadrature_size + 3)[1:-1]
+      # Expand edges so its broadcast across batch dims.
+      edges = array_ops.reshape(edges, shape=array_ops.concat([
+          [-1], array_ops.ones([batch_ndims], dtype=dtypes.int32)], axis=0))
+      quantiles = dist.quantile(edges)
+      # Cyclically permute left by one.
+      perm = array_ops.concat([
+          math_ops.range(1, 1 + batch_ndims), [0]], axis=0)
+      quantiles = array_ops.transpose(quantiles, perm)
+      return quantiles
+    quantiles = _compute_quantiles()
+
+    # Compute grid as quantile midpoints.
+    grid = (quantiles[..., :-1] + quantiles[..., 1:]) / 2.
+    # Set shape hints.
+    grid.set_shape(dist.batch_shape.concatenate([quadrature_size]))
+
+    # By construction probs is constant, i.e., `1 / quadrature_size`. This is
+    # important, because non-constant probs leads to non-reparameterizable
+    # samples.
+    probs = array_ops.fill(
+        dims=[quadrature_size],
+        value=1. / math_ops.cast(quadrature_size, dist.dtype))
+
+    return grid, probs
+
+
 class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
   """`PoissonLogNormalQuadratureCompound` distribution.
 
@@ -47,30 +161,18 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
   ```none
   p(k|loc, scale)
   = int_{R_+} dl LogNormal(l | loc, scale) Poisson(k | l)
-  = int_{R} dz ((lambda(z) sqrt(2) scale)
-                * exp(-z**2) / (lambda(z) sqrt(2 pi) sigma)
-                * Poisson(k | lambda(z)))
-  = int_{R} dz exp(-z**2) / sqrt(pi) Poisson(k | lambda(z))
   approx= sum{ prob[d] Poisson(k | lambda(grid[d])) : d=0, ..., deg-1 }
   ```
 
-  where `lambda(z) = exp(sqrt(2) scale z + loc)` and the `prob,grid` terms
-  are from [numerical quadrature](
-  https://en.wikipedia.org/wiki/Numerical_integration) (default:
-  [Gauss--Hermite quadrature](
-  https://en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature)). Note that
-  the second line made the substitution:
-  `z(l) = (log(l) - loc) / (sqrt(2) scale)` which implies `lambda(z)` [above]
-  and `dl = sqrt(2) scale lambda(z) dz`
+  By default, the `grid` is chosen as quantiles of the `LogNormal` distribution
+  parameterized by `loc`, `scale` and the `prob` vector is
+  `[1. / quadrature_size]*quadrature_size`.
 
   In the non-approximation case, a draw from the LogNormal prior represents the
   Poisson rate parameter. Unfortunately, the non-approximate distribution lacks
   an analytical probability density function (pdf). Therefore the
   `PoissonLogNormalQuadratureCompound` class implements an approximation based
-  on [numerical quadrature](
-  https://en.wikipedia.org/wiki/Numerical_integration) (default:
-  [Gauss--Hermite quadrature](
-  https://en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature)).
+  on [quadrature](https://en.wikipedia.org/wiki/Numerical_integration).
 
   Note: although the `PoissonLogNormalQuadratureCompound` is approximately the
   Poisson-LogNormal compound distribution, it is itself a valid distribution.
@@ -84,10 +186,8 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
   https://en.wikipedia.org/wiki/Compound_probability_distribution). Using
   variable-substitution and [numerical quadrature](
   https://en.wikipedia.org/wiki/Numerical_integration) (default:
-  [Gauss--Hermite quadrature](
-  https://en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature)) we can
-  redefine the distribution to be a parameter-less convex combination of `deg`
-  different Poisson samples.
+  based on `LogNormal` quantiles) we can redefine the distribution to be a
+  parameter-less convex combination of `deg` different Poisson samples.
 
   That is, defined over positive integers, this distribution is parameterized
   by a (batch of) `loc` and `scale` scalars.
@@ -96,46 +196,51 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
 
   ```none
   pdf(k | loc, scale, deg)
-    = sum{ prob[d] Poisson(k | lambda=exp(sqrt(2) scale grid[d] + loc))
+    = sum{ prob[d] Poisson(k | lambda=exp(grid[d]))
           : d=0, ..., deg-1 }
   ```
 
-  where, [e.g., `grid, w = numpy.polynomial.hermite.hermgauss(deg)`](
-  https://docs.scipy.org/doc/numpy-1.10.0/reference/generated/numpy.polynomial.hermite.hermgauss.html)
-  and `prob = w / sqrt(pi)`.
-
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
+
   # Create two batches of PoissonLogNormalQuadratureCompounds, one with
   # prior `loc = 0.` and another with `loc = 1.` In both cases `scale = 1.`
-  pln = ds.PoissonLogNormalQuadratureCompound(
+  pln = tfd.PoissonLogNormalQuadratureCompound(
       loc=[0., -0.5],
       scale=1.,
-      quadrature_grid_and_probs=(
-        np.polynomial.hermite.hermgauss(deg=10)),
+      quadrature_size=10,
       validate_args=True)
   """
 
   def __init__(self,
                loc,
                scale,
-               quadrature_grid_and_probs=None,
+               quadrature_size=8,
+               quadrature_fn=quadrature_scheme_lognormal_quantiles,
                validate_args=False,
                allow_nan_stats=True,
                name="PoissonLogNormalQuadratureCompound"):
-    """Constructs the PoissonLogNormalQuadratureCompound on `R**k`.
+    """Constructs the PoissonLogNormalQuadratureCompound`.
+
+    Note: `probs` returned by (optional) `quadrature_fn` are presumed to be
+    either a length-`quadrature_size` vector or a batch of vectors in 1-to-1
+    correspondence with the returned `grid`. (I.e., broadcasting is only
+    partially supported.)
 
     Args:
       loc: `float`-like (batch of) scalar `Tensor`; the location parameter of
         the LogNormal prior.
       scale: `float`-like (batch of) scalar `Tensor`; the scale parameter of
         the LogNormal prior.
-      quadrature_grid_and_probs: Python pair of `float`-like `Tensor`s
-        representing the sample points and the corresponding (possibly
-        normalized) weight.  When `None`, defaults to:
-        `np.polynomial.hermite.hermgauss(deg=8)`.
+      quadrature_size: Python `int` scalar representing the number of quadrature
+        points.
+      quadrature_fn: Python callable taking `loc`, `scale`,
+        `quadrature_size`, `validate_args` and returning `tuple(grid, probs)`
+        representing the LogNormal grid and corresponding normalized weight.
+        normalized) weight.
+        Default value: `quadrature_scheme_lognormal_quantiles`.
       validate_args: Python `bool`, default `False`. When `True` distribution
         parameters are checked for validity despite possibly degrading runtime
         performance. When `False` invalid inputs may silently render incorrect
@@ -147,47 +252,41 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
 
     Raises:
-      TypeError: if `loc.dtype != scale[0].dtype`.
+      TypeError: if `quadrature_grid` and `quadrature_probs` have different base
+        `dtype`.
     """
     parameters = locals()
     with ops.name_scope(name, values=[loc, scale]):
-      loc = ops.convert_to_tensor(loc, name="loc")
-      self._loc = loc
+      if loc is not None:
+        loc = ops.convert_to_tensor(loc, name="loc")
+      if scale is not None:
+        scale = ops.convert_to_tensor(
+            scale, dtype=None if loc is None else loc.dtype, name="scale")
+      self._quadrature_grid, self._quadrature_probs = tuple(quadrature_fn(
+          loc, scale, quadrature_size, validate_args))
+
+      dt = self._quadrature_grid.dtype
+      if dt.base_dtype != self._quadrature_probs.dtype.base_dtype:
+        raise TypeError("Quadrature grid dtype ({}) does not match quadrature "
+                        "probs dtype ({}).".format(
+                            dt.name, self._quadrature_probs.dtype.name))
 
-      scale = ops.convert_to_tensor(scale, name="scale")
-      self._scale = scale
-
-      dtype = loc.dtype.base_dtype
-      if dtype != scale.dtype.base_dtype:
-        raise TypeError(
-            "loc.dtype(\"{}\") does not match scale.dtype(\"{}\")".format(
-                loc.dtype.name, scale.dtype.name))
-
-      grid, probs = distribution_util.process_quadrature_grid_and_probs(
-          quadrature_grid_and_probs, dtype, validate_args)
-      self._quadrature_grid = grid
-      self._quadrature_probs = probs
-      self._quadrature_size = distribution_util.dimension_size(probs, axis=0)
+      self._distribution = poisson_lib.Poisson(
+          log_rate=self._quadrature_grid,
+          validate_args=validate_args,
+          allow_nan_stats=allow_nan_stats)
 
       self._mixture_distribution = categorical_lib.Categorical(
           logits=math_ops.log(self._quadrature_probs),
           validate_args=validate_args,
           allow_nan_stats=allow_nan_stats)
 
-      # The following maps the broadcast of `loc` and `scale` to each grid
-      # point, i.e., we are creating several log-rates that correspond to the
-      # different Gauss-Hermite quadrature points and (possible) batches of
-      # `loc` and `scale`.
-      self._log_rate = (loc[..., array_ops.newaxis]
-                        + np.sqrt(2.) * scale[..., array_ops.newaxis] * grid)
-
-      self._distribution = poisson_lib.Poisson(
-          log_rate=self._log_rate,
-          validate_args=validate_args,
-          allow_nan_stats=allow_nan_stats)
+      self._loc = loc
+      self._scale = scale
+      self._quadrature_size = quadrature_size
 
       super(PoissonLogNormalQuadratureCompound, self).__init__(
-          dtype=dtype,
+          dtype=dt,
           reparameterization_type=distribution_lib.NOT_REPARAMETERIZED,
           validate_args=validate_args,
           allow_nan_stats=allow_nan_stats,
@@ -197,12 +296,12 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
 
   @property
   def mixture_distribution(self):
-    """Distribution which randomly selects a Poisson with Gauss-Hermite rate."""
+    """Distribution which randomly selects a Poisson with quadrature param."""
     return self._mixture_distribution
 
   @property
   def distribution(self):
-    """Base Poisson parameterized by a Gauss-Hermite grid of rates."""
+    """Base Poisson parameterized by a quadrature grid."""
     return self._distribution
 
   @property
@@ -216,24 +315,18 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
     return self._scale
 
   @property
-  def quadrature_grid(self):
-    """Quadrature grid points."""
-    return self._quadrature_grid
-
-  @property
-  def quadrature_probs(self):
-    """Quadrature normalized weights."""
-    return self._quadrature_probs
+  def quadrature_size(self):
+    return self._quadrature_size
 
   def _batch_shape_tensor(self):
     return array_ops.broadcast_dynamic_shape(
-        array_ops.shape(self.loc),
-        array_ops.shape(self.scale))
+        self.distribution.batch_shape_tensor(),
+        array_ops.shape(self.mixture_distribution.logits))[:-1]
 
   def _batch_shape(self):
     return array_ops.broadcast_static_shape(
-        self.loc.shape,
-        self.scale.shape)
+        self.distribution.batch_shape,
+        self.mixture_distribution.logits.shape)[:-1]
 
   def _event_shape(self):
     return tensor_shape.scalar()
@@ -241,18 +334,31 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
   def _sample_n(self, n, seed=None):
     # Get ids as a [n, batch_size]-shaped matrix, unless batch_shape=[] then get
     # ids as a [n]-shaped vector.
-    batch_size = (np.prod(self.batch_shape.as_list(), dtype=np.int32)
-                  if self.batch_shape.is_fully_defined()
-                  else math_ops.reduce_prod(self.batch_shape_tensor()))
+    batch_size = self.batch_shape.num_elements()
+    if batch_size is None:
+      batch_size = math_ops.reduce_prod(self.batch_shape_tensor())
+    # We need to "sample extra" from the mixture distribution if it doesn't
+    # already specify a probs vector for each batch coordinate.
+    # We only support this kind of reduced broadcasting, i.e., there is exactly
+    # one probs vector for all batch dims or one for each.
     ids = self._mixture_distribution.sample(
         sample_shape=concat_vectors(
             [n],
             distribution_util.pick_vector(
-                self.is_scalar_batch(),
-                np.int32([]),
-                [batch_size])),
+                self.mixture_distribution.is_scalar_batch(),
+                [batch_size],
+                np.int32([]))),
         seed=distribution_util.gen_new_seed(
             seed, "poisson_lognormal_quadrature_compound"))
+    # We need to flatten batch dims in case mixture_distribution has its own
+    # batch dims.
+    ids = array_ops.reshape(ids, shape=concat_vectors(
+        [n],
+        distribution_util.pick_vector(
+            self.is_scalar_batch(),
+            np.int32([]),
+            np.int32([-1]))))
+
     # Stride `quadrature_size` for `batch_size` number of times.
     offset = math_ops.range(start=0,
                             limit=batch_size * self._quadrature_size,
@@ -275,7 +381,7 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
   def _mean(self):
     return math_ops.exp(
         math_ops.reduce_logsumexp(
-            self.mixture_distribution.logits + self._log_rate,
+            self.mixture_distribution.logits + self.distribution.log_rate,
             axis=-1))
 
   def _variance(self):
@@ -292,7 +398,7 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
     # where,
     #
     # Z|v ~ interpolate_affine[v](distribution)
-    # V ~ mixture_distrubution
+    # V ~ mixture_distribution
     #
     # thus,
     #
@@ -300,7 +406,7 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
     # Var[E[Z | V]] = sum{ prob[d] (Mean[d] - Mean)**2 : d=0, ..., deg-1 }
     v = array_ops.stack([
         # log(self.distribution.variance()) = log(Var[d]) = log(rate[d])
-        self._log_rate,
+        self.distribution.log_rate,
         # log((Mean[d] - Mean)**2)
         2. * math_ops.log(
             math_ops.abs(self.distribution.mean()
@@ -311,14 +417,9 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
         axis=[-2, -1])
 
 
-def static_value(x):
-  """Returns the static value of a `Tensor` or `None`."""
-  return tensor_util.constant_value(ops.convert_to_tensor(x))
-
-
 def concat_vectors(*args):
   """Concatenates input vectors, statically if possible."""
-  args_ = [static_value(x) for x in args]
+  args_ = [distribution_util.static_value(x) for x in args]
   if any(vec is None for vec in args_):
     return array_ops.concat(args, axis=0)
   return [val for vec in args_ for val in vec]
diff --git a/tensorflow/contrib/distributions/python/ops/sample_stats.py b/tensorflow/contrib/distributions/python/ops/sample_stats.py
index 2a4b92c72900f79785e7e34b77179d3decbace5b..dfc813361977c159d8d48f9d5b9ff03db5b4acdc 100644
--- a/tensorflow/contrib/distributions/python/ops/sample_stats.py
+++ b/tensorflow/contrib/distributions/python/ops/sample_stats.py
@@ -28,12 +28,190 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import spectral_ops
+from tensorflow.python.ops.distributions import util
 
 __all__ = [
+    "auto_correlation",
     "percentile",
 ]
 
 
+# TODO(langmore) Write separate versions of this for real/complex dtype, taking
+# advantage of optimized real-fft ops.
+def auto_correlation(
+    x,
+    axis=-1,
+    max_lags=None,
+    center=True,
+    normalize=True,
+    name="auto_correlation"):
+  """Auto correlation along one axis.
+
+  Given a `1-D` wide sense stationary (WSS) sequence `X`, the auto correlation
+  `RXX` may be defined as  (with `E` expectation and `Conj` complex conjugate)
+
+  ```
+  RXX[m] := E{ W[m] Conj(W[0]) } = E{ W[0] Conj(W[-m]) },
+  W[n]   := (X[n] - MU) / S,
+  MU     := E{ X[0] },
+  S**2   := E{ (X[0] - MU) Conj(X[0] - MU) }.
+  ```
+
+  This function takes the viewpoint that `x` is (along one axis) a finite
+  sub-sequence of a realization of (WSS) `X`, and then uses `x` to produce an
+  estimate of `RXX[m]` as follows:
+
+  After extending `x` from length `L` to `inf` by zero padding, the auto
+  correlation estimate `rxx[m]` is computed for `m = 0, 1, ..., max_lags` as
+
+  ```
+  rxx[m] := (L - m)**-1 sum_n w[n + m] Conj(w[n]),
+  w[n]   := (x[n] - mu) / s,
+  mu     := L**-1 sum_n x[n],
+  s**2   := L**-1 sum_n (x[n] - mu) Conj(x[n] - mu)
+  ```
+
+  The error in this estimate is proportional to `1 / sqrt(len(x) - m)`, so users
+  often set `max_lags` small enough so that the entire output is meaningful.
+
+  Note that since `mu` is an imperfect estimate of `E{ X[0] }`, and we divide by
+  `len(x) - m` rather than `len(x) - m - 1`, our estimate of auto correlation
+  contains a slight bias, which goes to zero as `len(x) - m --> infinity`.
+
+  Args:
+    x:  `float32` or `complex64` `Tensor`.
+    axis:  Python `int`. The axis number along which to compute correlation.
+      Other dimensions index different batch members.
+    max_lags:  Positive `int` tensor.  The maximum value of `m` to consider
+      (in equation above).  If `max_lags >= x.shape[axis]`, we effectively
+      re-set `max_lags` to `x.shape[axis] - 1`.
+    center:  Python `bool`.  If `False`, do not subtract the mean estimate `mu`
+      from `x[n]` when forming `w[n]`.
+    normalize:  Python `bool`.  If `False`, do not divide by the variance
+      estimate `s**2` when forming `w[n]`.
+    name:  `String` name to prepend to created ops.
+
+  Returns:
+    `rxx`: `Tensor` of same `dtype` as `x`.  `rxx.shape[i] = x.shape[i]` for
+      `i != axis`, and `rxx.shape[axis] = max_lags + 1`.
+
+  Raises:
+    TypeError:  If `x` is not a supported type.
+  """
+  # Implementation details:
+  # Extend length N / 2 1-D array x to length N by zero padding onto the end.
+  # Then, set
+  #   F[x]_k := sum_n x_n exp{-i 2 pi k n / N }.
+  # It is not hard to see that
+  #   F[x]_k Conj(F[x]_k) = F[R]_k, where
+  #   R_m := sum_n x_n Conj(x_{(n - m) mod N}).
+  # One can also check that R_m / (N / 2 - m) is an unbiased estimate of RXX[m].
+
+  # Since F[x] is the DFT of x, this leads us to a zero-padding and FFT/IFFT
+  # based version of estimating RXX.
+  # Note that this is a special case of the Wiener-Khinchin Theorem.
+  with ops.name_scope(name, values=[x]):
+    x = ops.convert_to_tensor(x, name="x")
+
+    # Rotate dimensions of x in order to put axis at the rightmost dim.
+    # FFT op requires this.
+    rank = util.prefer_static_rank(x)
+    if axis < 0:
+      axis = rank + axis
+    shift = rank - 1 - axis
+    # Suppose x.shape[axis] = T, so there are T "time" steps.
+    #   ==> x_rotated.shape = B + [T],
+    # where B is x_rotated's batch shape.
+    x_rotated = util.rotate_transpose(x, shift)
+
+    if center:
+      x_rotated -= math_ops.reduce_mean(x_rotated, axis=-1, keepdims=True)
+
+    # x_len = N / 2 from above explanation.  The length of x along axis.
+    # Get a value for x_len that works in all cases.
+    x_len = util.prefer_static_shape(x_rotated)[-1]
+
+    # TODO(langmore) Investigate whether this zero padding helps or hurts.  At
+    # the moment is is necessary so that all FFT implementations work.
+    # Zero pad to the next power of 2 greater than 2 * x_len, which equals
+    # 2**(ceil(Log_2(2 * x_len))).  Note: Log_2(X) = Log_e(X) / Log_e(2).
+    x_len_float64 = math_ops.cast(x_len, np.float64)
+    target_length = math_ops.pow(
+        np.float64(2.),
+        math_ops.ceil(math_ops.log(x_len_float64 * 2) / np.log(2.)))
+    pad_length = math_ops.cast(target_length - x_len_float64, np.int32)
+
+    # We should have:
+    # x_rotated_pad.shape = x_rotated.shape[:-1] + [T + pad_length]
+    #                     = B + [T + pad_length]
+    x_rotated_pad = util.pad(x_rotated, axis=-1, back=True, count=pad_length)
+
+    dtype = x.dtype
+    if not dtype.is_complex:
+      if not dtype.is_floating:
+        raise TypeError("Argument x must have either float or complex dtype"
+                        " found: {}".format(dtype))
+      x_rotated_pad = math_ops.complex(x_rotated_pad,
+                                       dtype.real_dtype.as_numpy_dtype(0.))
+
+    # Autocorrelation is IFFT of power-spectral density (up to some scaling).
+    fft_x_rotated_pad = spectral_ops.fft(x_rotated_pad)
+    spectral_density = fft_x_rotated_pad * math_ops.conj(fft_x_rotated_pad)
+    # shifted_product is R[m] from above detailed explanation.
+    # It is the inner product sum_n X[n] * Conj(X[n - m]).
+    shifted_product = spectral_ops.ifft(spectral_density)
+
+    # Cast back to real-valued if x was real to begin with.
+    shifted_product = math_ops.cast(shifted_product, dtype)
+
+    # Figure out if we can deduce the final static shape, and set max_lags.
+    # Use x_rotated as a reference, because it has the time dimension in the far
+    # right, and was created before we performed all sorts of crazy shape
+    # manipulations.
+    know_static_shape = True
+    if not x_rotated.shape.is_fully_defined():
+      know_static_shape = False
+    if max_lags is None:
+      max_lags = x_len - 1
+    else:
+      max_lags = ops.convert_to_tensor(max_lags, name="max_lags")
+      max_lags_ = tensor_util.constant_value(max_lags)
+      if max_lags_ is None or not know_static_shape:
+        know_static_shape = False
+        max_lags = math_ops.minimum(x_len - 1, max_lags)
+      else:
+        max_lags = min(x_len - 1, max_lags_)
+
+    # Chop off the padding.
+    # We allow users to provide a huge max_lags, but cut it off here.
+    # shifted_product_chopped.shape = x_rotated.shape[:-1] + [max_lags]
+    shifted_product_chopped = shifted_product[..., :max_lags + 1]
+
+    # If possible, set shape.
+    if know_static_shape:
+      chopped_shape = x_rotated.shape.as_list()
+      chopped_shape[-1] = min(x_len, max_lags + 1)
+      shifted_product_chopped.set_shape(chopped_shape)
+
+    # Recall R[m] is a sum of N / 2 - m nonzero terms x[n] Conj(x[n - m]).  The
+    # other terms were zeros arising only due to zero padding.
+    # `denominator = (N / 2 - m)` (defined below) is the proper term to
+    # divide by by to make this an unbiased estimate of the expectation
+    # E[X[n] Conj(X[n - m])].
+    x_len = math_ops.cast(x_len, dtype.real_dtype)
+    max_lags = math_ops.cast(max_lags, dtype.real_dtype)
+    denominator = x_len - math_ops.range(0., max_lags + 1.)
+    denominator = math_ops.cast(denominator, dtype)
+    shifted_product_rotated = shifted_product_chopped / denominator
+
+    if normalize:
+      shifted_product_rotated /= shifted_product_rotated[..., :1]
+
+    # Transpose dimensions back to those of x.
+    return util.rotate_transpose(shifted_product_rotated, -shift)
+
+
 # TODO(langmore) To make equivalent to numpy.percentile:
 #  Make work with a sequence of floats or single float for 'q'.
 #  Make work with "linear", "midpoint" interpolation. (linear should be default)
diff --git a/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py b/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
index b05f15771a3a94779ffddea8f16ad2fa4ea2fdd1..c4b8f055b7fbc3f0835b503eddd7617610326d8c 100644
--- a/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
+++ b/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
@@ -115,7 +115,7 @@ class SinhArcsinh(transformed_distribution.TransformedDistribution):
       tailweight:  Tailweight parameter. Default is `1.0` (unchanged tailweight)
       distribution: `tf.Distribution`-like instance. Distribution that is
         transformed to produce this distribution.
-        Default is `ds.Normal(0., 1.)`.
+        Default is `tf.distributions.Normal(0., 1.)`.
         Must be a scalar-batch, scalar-event distribution.  Typically
         `distribution.reparameterization_type = FULLY_REPARAMETERIZED` or it is
         a function of non-trainable parameters. WARNING: If you backprop through
diff --git a/tensorflow/contrib/distributions/python/ops/test_util.py b/tensorflow/contrib/distributions/python/ops/test_util.py
index 77f2a39273dc365a4ac202d846dd2bc364655c86..bfc727450f5e48ecbf98bf8ab0475ec67c9e7137 100644
--- a/tensorflow/contrib/distributions/python/ops/test_util.py
+++ b/tensorflow/contrib/distributions/python/ops/test_util.py
@@ -40,6 +40,7 @@ class DiscreteScalarDistributionTestHelpers(object):
   def run_test_sample_consistent_log_prob(
       self, sess_run_fn, dist,
       num_samples=int(1e5), num_threshold=int(1e3), seed=42,
+      batch_size=None,
       rtol=1e-2, atol=0.):
     """Tests that sample/log_prob are consistent with each other.
 
@@ -66,6 +67,8 @@ class DiscreteScalarDistributionTestHelpers(object):
       seed: Python `int` indicating the seed to use when sampling from `dist`.
         In general it is not recommended to use `None` during a test as this
         increases the likelihood of spurious test failure.
+      batch_size: Hint for unpacking result of samples. Default: `None` means
+        batch_size is inferred.
       rtol: Python `float`-type indicating the admissible relative error between
         analytical and sample statistics.
       atol: Python `float`-type indicating the admissible absolute error between
@@ -80,10 +83,11 @@ class DiscreteScalarDistributionTestHelpers(object):
     # Histogram only supports vectors so we call it once per batch coordinate.
     y = dist.sample(num_samples, seed=seed)
     y = array_ops.reshape(y, shape=[num_samples, -1])
-    batch_size = math_ops.reduce_prod(dist.batch_shape_tensor())
+    if batch_size is None:
+      batch_size = math_ops.reduce_prod(dist.batch_shape_tensor())
     batch_dims = array_ops.shape(dist.batch_shape_tensor())[0]
     edges_expanded_shape = 1 + array_ops.pad([-2], paddings=[[0, batch_dims]])
-    for b, x in enumerate(array_ops.unstack(y, axis=1)):
+    for b, x in enumerate(array_ops.unstack(y, num=batch_size, axis=1)):
       counts, edges = self.histogram(x)
       edges = array_ops.reshape(edges, edges_expanded_shape)
       probs = math_ops.exp(dist.log_prob(edges))
diff --git a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
index 92043d6a08833888c36009261addca0d14949ea8..7ce8a83fd91e2dfaa0ccef633f803b3ae595e646 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
@@ -22,30 +22,176 @@ import numpy as np
 
 from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.contrib.distributions.python.ops.bijectors.affine_linear_operator import AffineLinearOperator
+from tensorflow.contrib.distributions.python.ops.bijectors.softmax_centered import SoftmaxCentered
 from tensorflow.contrib.linalg.python.ops import linear_operator_addition as linop_add_lib
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops.distributions import categorical as categorical_lib
 from tensorflow.python.ops.distributions import distribution as distribution_lib
+from tensorflow.python.ops.distributions import normal as normal_lib
 from tensorflow.python.ops.linalg import linear_operator_diag as linop_diag_lib
 from tensorflow.python.ops.linalg import linear_operator_full_matrix as linop_full_lib
 from tensorflow.python.ops.linalg import linear_operator_identity as linop_identity_lib
 from tensorflow.python.ops.linalg import linear_operator_lower_triangular as linop_tril_lib
 
-static_value = distribution_util.static_value
-
 
 __all__ = [
     "VectorDiffeomixture",
+    "quadrature_scheme_softmaxnormal_gauss_hermite",
+    "quadrature_scheme_softmaxnormal_quantiles",
 ]
 
 
+def quadrature_scheme_softmaxnormal_gauss_hermite(
+    loc, scale, quadrature_size,
+    validate_args=False, name=None):
+  """Use Gauss-Hermite quadrature to form quadrature on `K - 1` simplex.
+
+  Note: for a given `quadrature_size`, this method is generally less accurate
+  than `quadrature_scheme_softmaxnormal_quantiles`.
+
+  Args:
+    loc: `float`-like `Tensor` with shape `[b1, ..., bB, K-1]`, B>=0.
+      Represents the `location` parameter of the SoftmaxNormal used for
+      selecting one of the `K` affine transformations.
+    scale: `float`-like `Tensor` with shape `[b1, ..., bB, K-1]`, B>=0.
+      Represents the `scale` parameter of the SoftmaxNormal used for
+      selecting one of the `K` affine transformations.
+    quadrature_size: Python `int` scalar representing the number of quadrature
+      points.
+    validate_args: Python `bool`, default `False`. When `True` distribution
+      parameters are checked for validity despite possibly degrading runtime
+      performance. When `False` invalid inputs may silently render incorrect
+      outputs.
+    name: Python `str` name prefixed to Ops created by this class.
+
+  Returns:
+    grid: Shape `[b1, ..., bB, K, quadrature_size]` `Tensor` representing the
+      convex combination of affine parameters for `K` components.
+      `grid[..., :, n]` is the `n`-th grid point, living in the `K - 1` simplex.
+    probs:  Shape `[b1, ..., bB, K, quadrature_size]` `Tensor` representing the
+      associated with each grid point.
+  """
+  with ops.name_scope(name, "quadrature_scheme_softmaxnormal_gauss_hermite",
+                      [loc, scale]):
+    loc = ops.convert_to_tensor(loc, name="loc")
+    dt = loc.dtype.base_dtype
+    scale = ops.convert_to_tensor(scale, dtype=dt, name="scale")
+
+    loc = maybe_check_quadrature_param(loc, "loc", validate_args)
+    scale = maybe_check_quadrature_param(scale, "scale", validate_args)
+
+    grid, probs = np.polynomial.hermite.hermgauss(deg=quadrature_size)
+    grid = grid.astype(loc.dtype.as_numpy_dtype)
+    probs = probs.astype(loc.dtype.as_numpy_dtype)
+    probs /= np.linalg.norm(probs, ord=1, keepdims=True)
+    probs = ops.convert_to_tensor(probs, name="probs", dtype=loc.dtype)
+
+    grid = softmax(
+        -distribution_util.pad(
+            (loc[..., array_ops.newaxis] +
+             np.sqrt(2.) * scale[..., array_ops.newaxis] * grid),
+            axis=-2,
+            front=True),
+        axis=-2)  # shape: [B, components, deg]
+
+    return grid, probs
+
+
+def quadrature_scheme_softmaxnormal_quantiles(
+    loc, scale, quadrature_size,
+    validate_args=False, name=None):
+  """Use SoftmaxNormal quantiles to form quadrature on `K - 1` simplex.
+
+  Args:
+    loc: `float`-like `Tensor` with shape `[b1, ..., bB, K-1]`, B>=0.
+      Represents the `location` parameter of the SoftmaxNormal used for
+      selecting one of the `K` affine transformations.
+    scale: `float`-like `Tensor` with shape `[b1, ..., bB, K-1]`, B>=0.
+      Represents the `scale` parameter of the SoftmaxNormal used for
+      selecting one of the `K` affine transformations.
+    quadrature_size: Python scalar `int` representing the number of quadrature
+      points.
+    validate_args: Python `bool`, default `False`. When `True` distribution
+      parameters are checked for validity despite possibly degrading runtime
+      performance. When `False` invalid inputs may silently render incorrect
+      outputs.
+    name: Python `str` name prefixed to Ops created by this class.
+
+  Returns:
+    grid: Shape `[b1, ..., bB, K, quadrature_size]` `Tensor` representing the
+      convex combination of affine parameters for `K` components.
+      `grid[..., :, n]` is the `n`-th grid point, living in the `K - 1` simplex.
+    probs:  Shape `[b1, ..., bB, K, quadrature_size]` `Tensor` representing the
+      associated with each grid point.
+  """
+  with ops.name_scope(name, "softmax_normal_grid_and_probs", [loc, scale]):
+    loc = ops.convert_to_tensor(loc, name="loc")
+    dt = loc.dtype.base_dtype
+    scale = ops.convert_to_tensor(scale, dtype=dt, name="scale")
+
+    loc = maybe_check_quadrature_param(loc, "loc", validate_args)
+    scale = maybe_check_quadrature_param(scale, "scale", validate_args)
+
+    dist = normal_lib.Normal(loc=loc, scale=scale)
+
+    def _get_batch_ndims():
+      """Helper to get dist.batch_shape.ndims, statically if possible."""
+      ndims = dist.batch_shape.ndims
+      if ndims is None:
+        ndims = array_ops.shape(dist.batch_shape_tensor())[0]
+      return ndims
+    batch_ndims = _get_batch_ndims()
+
+    def _get_final_shape(qs):
+      """Helper to build `TensorShape`."""
+      bs = dist.batch_shape.with_rank_at_least(1)
+      num_components = bs[-1].value
+      if num_components is not None:
+        num_components += 1
+      tail = tensor_shape.TensorShape([num_components, qs])
+      return bs[:-1].concatenate(tail)
+
+    def _compute_quantiles():
+      """Helper to build quantiles."""
+      # Omit {0, 1} since they might lead to Inf/NaN.
+      zero = array_ops.zeros([], dtype=dist.dtype)
+      edges = math_ops.linspace(zero, 1., quadrature_size + 3)[1:-1]
+      # Expand edges so its broadcast across batch dims.
+      edges = array_ops.reshape(edges, shape=array_ops.concat([
+          [-1], array_ops.ones([batch_ndims], dtype=dtypes.int32)], axis=0))
+      quantiles = dist.quantile(edges)
+      quantiles = SoftmaxCentered(event_ndims=1).forward(quantiles)
+      # Cyclically permute left by one.
+      perm = array_ops.concat([
+          math_ops.range(1, 1 + batch_ndims), [0]], axis=0)
+      quantiles = array_ops.transpose(quantiles, perm)
+      quantiles.set_shape(_get_final_shape(quadrature_size + 1))
+      return quantiles
+    quantiles = _compute_quantiles()
+
+    # Compute grid as quantile midpoints.
+    grid = (quantiles[..., :-1] + quantiles[..., 1:]) / 2.
+    # Set shape hints.
+    grid.set_shape(_get_final_shape(quadrature_size))
+
+    # By construction probs is constant, i.e., `1 / quadrature_size`. This is
+    # important, because non-constant probs leads to non-reparameterizable
+    # samples.
+    probs = array_ops.fill(
+        dims=[quadrature_size],
+        value=1. / math_ops.cast(quadrature_size, dist.dtype))
+
+    return grid, probs
+
+
 class VectorDiffeomixture(distribution_lib.Distribution):
   """VectorDiffeomixture distribution.
 
@@ -188,8 +334,7 @@ class VectorDiffeomixture(distribution_lib.Distribution):
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
-  la = tf.linalg
+  tfd = tf.contrib.distributions
 
   # Create two batches of VectorDiffeomixtures, one with mix_loc=[0.] and
   # another with mix_loc=[1]. In both cases, `K=2` and the affine
@@ -197,20 +342,20 @@ class VectorDiffeomixture(distribution_lib.Distribution):
   # k=0: loc=zeros(dims)  scale=LinearOperatorScaledIdentity
   # k=1: loc=[2.]*dims    scale=LinOpDiag
   dims = 5
-  vdm = ds.VectorDiffeomixture(
+  vdm = tfd.VectorDiffeomixture(
       mix_loc=[[0.], [1]],
       mix_scale=[1.],
-      distribution=ds.Normal(loc=0., scale=1.),
+      distribution=tfd.Normal(loc=0., scale=1.),
       loc=[
           None,  # Equivalent to `np.zeros(dims, dtype=np.float32)`.
           np.float32([2.]*dims),
       ],
       scale=[
-          la.LinearOperatorScaledIdentity(
+          tf.linalg.LinearOperatorScaledIdentity(
             num_rows=dims,
             multiplier=np.float32(1.1),
             is_positive_definite=True),
-          la.LinearOperatorDiag(
+          tf.linalg.LinearOperatorDiag(
             diag=np.linspace(2.5, 3.5, dims, dtype=np.float32),
             is_positive_definite=True),
       ],
@@ -223,17 +368,20 @@ class VectorDiffeomixture(distribution_lib.Distribution):
                distribution,
                loc=None,
                scale=None,
-               quadrature_grid_and_probs=None,
+               quadrature_size=8,
+               quadrature_fn=quadrature_scheme_softmaxnormal_quantiles,
                validate_args=False,
                allow_nan_stats=True,
                name="VectorDiffeomixture"):
-    """Constructs the VectorDiffeomixture on `R**k`.
+    """Constructs the VectorDiffeomixture on `R**d`.
 
     Args:
-      mix_loc: `float`-like `Tensor`. Represents the `location` parameter of the
-        SoftmaxNormal used for selecting one of the `K` affine transformations.
-      mix_scale: `float`-like `Tensor`. Represents the `scale` parameter of the
-        SoftmaxNormal used for selecting one of the `K` affine transformations.
+      mix_loc: `float`-like `Tensor` with shape `[b1, ..., bB, K-1]`. Represents
+        the `location` parameter of the SoftmaxNormal used for selecting one of
+        the `K` affine transformations.
+      mix_scale: `float`-like `Tensor` with shape `[b1, ..., bB, K-1]`.
+        Represents the `scale` parameter of the SoftmaxNormal used for selecting
+        one of the `K` affine transformations.
       distribution: `tf.Distribution`-like instance. Distribution from which `d`
         iid samples are used as input to the selected affine transformation.
         Must be a scalar-batch, scalar-event distribution.  Typically
@@ -252,10 +400,13 @@ class VectorDiffeomixture(distribution_lib.Distribution):
         `k`-th element represents the `scale` used for the `k`-th affine
         transformation. `LinearOperator`s must have shape `[B1, ..., Bb, d, d]`,
         `b >= 0`, i.e., characterizes `b`-batches of `d x d` matrices
-      quadrature_grid_and_probs: Python pair of `float`-like `Tensor`s
-        representing the sample points and the corresponding (possibly
-        normalized) weight.  When `None`, defaults to:
-        `np.polynomial.hermite.hermgauss(deg=8)`.
+      quadrature_size: Python `int` scalar representing number of
+        quadrature points.
+      quadrature_fn: Python callable taking `mix_loc`, `mix_scale`,
+        `quadrature_size`, `validate_args` and returning `tuple(grid, probs)`
+        representing the SoftmaxNormal grid and corresponding normalized weight.
+        normalized) weight.
+        Default value: `quadrature_scheme_softmaxnormal_quantiles`.
       validate_args: Python `bool`, default `False`. When `True` distribution
         parameters are checked for validity despite possibly degrading runtime
         performance. When `False` invalid inputs may silently render incorrect
@@ -322,11 +473,8 @@ class VectorDiffeomixture(distribution_lib.Distribution):
         raise NotImplementedError("Currently only bimixtures are supported; "
                                   "len(scale)={} is not 2.".format(len(scale)))
 
-      grid, probs = distribution_util.process_quadrature_grid_and_probs(
-          quadrature_grid_and_probs, dtype, validate_args)
-      self._quadrature_grid = grid
-      self._quadrature_probs = probs
-      self._quadrature_size = distribution_util.dimension_size(probs, axis=0)
+      self._grid, probs = tuple(quadrature_fn(
+          mix_loc, mix_scale, quadrature_size, validate_args))
 
       # Note: by creating the logits as `log(prob)` we ensure that
       # `self.mixture_distribution.logits` is equivalent to
@@ -336,22 +484,13 @@ class VectorDiffeomixture(distribution_lib.Distribution):
           validate_args=validate_args,
           allow_nan_stats=allow_nan_stats)
 
-      mix_loc = maybe_check_mix_param(
-          mix_loc, "mix_loc", dtype, validate_args)
-      mix_scale = maybe_check_mix_param(
-          mix_scale, "mix_scale", dtype, validate_args)
-
       asserts = distribution_util.maybe_check_scalar_distribution(
           distribution, dtype, validate_args)
       if asserts:
-        mix_loc = control_flow_ops.with_dependencies(asserts, mix_loc)
+        self._grid = control_flow_ops.with_dependencies(
+            asserts, self._grid)
       self._distribution = distribution
 
-      # shape: [B, deg]
-      self._interpolate_weight = math_ops.sigmoid(
-          mix_loc
-          + np.sqrt(2.) * mix_scale * grid)
-
       self._interpolated_affine = [
           AffineLinearOperator(shift=loc_,
                                scale=scale_,
@@ -359,15 +498,16 @@ class VectorDiffeomixture(distribution_lib.Distribution):
                                validate_args=validate_args,
                                name="interpolated_affine_{}".format(k))
           for k, (loc_, scale_) in enumerate(zip(
-              interpolate_loc(self._quadrature_size,
-                              self._interpolate_weight,
-                              loc),
-              interpolate_scale(self._quadrature_size,
-                                self._interpolate_weight,
-                                scale)))]
+              interpolate_loc(self._grid, loc),
+              interpolate_scale(self._grid, scale)))]
 
-      self._batch_shape_, self._event_shape_ = determine_batch_event_shapes(
-          mix_loc, mix_scale, self._endpoint_affine)
+      [
+          self._batch_shape_,
+          self._batch_shape_tensor_,
+          self._event_shape_,
+          self._event_shape_tensor_,
+      ] = determine_batch_event_shapes(self._grid,
+                                       self._endpoint_affine)
 
       super(VectorDiffeomixture, self).__init__(
           dtype=dtype,
@@ -386,8 +526,7 @@ class VectorDiffeomixture(distribution_lib.Distribution):
           allow_nan_stats=allow_nan_stats,
           parameters=parameters,
           graph_parents=(
-              [mix_loc, mix_scale]
-              + distribution._graph_parents  # pylint: disable=protected-access
+              distribution._graph_parents  # pylint: disable=protected-access
               + [loc_ for loc_ in loc if loc_ is not None]
               + [p for scale_ in scale for p in scale_.graph_parents]),
           name=name)
@@ -403,9 +542,9 @@ class VectorDiffeomixture(distribution_lib.Distribution):
     return self._distribution
 
   @property
-  def interpolate_weight(self):
+  def grid(self):
     """Grid of mixing probabilities, one for each grid point."""
-    return self._interpolate_weight
+    return self._grid
 
   @property
   def endpoint_affine(self):
@@ -417,27 +556,17 @@ class VectorDiffeomixture(distribution_lib.Distribution):
     """Affine transformation for each convex combination of `K` components."""
     return self._interpolated_affine
 
-  @property
-  def quadrature_grid(self):
-    """Quadrature grid points."""
-    return self._quadrature_grid
-
-  @property
-  def quadrature_probs(self):
-    """Quadrature normalized weights."""
-    return self._quadrature_probs
-
   def _batch_shape_tensor(self):
-    return self._batch_shape_
+    return self._batch_shape_tensor_
 
   def _batch_shape(self):
-    return tensor_shape.TensorShape(static_value(self._batch_shape_))
+    return self._batch_shape_
 
   def _event_shape_tensor(self):
-    return self._event_shape_
+    return self._event_shape_tensor_
 
   def _event_shape(self):
-    return tensor_shape.TensorShape(static_value(self._event_shape_))
+    return self._event_shape_
 
   def _sample_n(self, n, seed=None):
     x = self.distribution.sample(
@@ -450,25 +579,44 @@ class VectorDiffeomixture(distribution_lib.Distribution):
 
     # Get ids as a [n, batch_size]-shaped matrix, unless batch_shape=[] then get
     # ids as a [n]-shaped vector.
-    batch_size = reduce_prod(self.batch_shape_tensor())
-    ids = self._mixture_distribution.sample(
+    batch_size = self.batch_shape.num_elements()
+    if batch_size is None:
+      batch_size = array_ops.reduce_prod(self.batch_shape_tensor())
+    mix_batch_size = self.mixture_distribution.batch_shape.num_elements()
+    if mix_batch_size is None:
+      mix_batch_size = math_ops.reduce_prod(
+          self.mixture_distribution.batch_shape_tensor())
+    ids = self.mixture_distribution.sample(
         sample_shape=concat_vectors(
             [n],
             distribution_util.pick_vector(
                 self.is_scalar_batch(),
                 np.int32([]),
-                [batch_size])),
+                [batch_size // mix_batch_size])),
         seed=distribution_util.gen_new_seed(
             seed, "vector_diffeomixture"))
-
-    # Stride `quadrature_size` for `batch_size` number of times.
+    # We need to flatten batch dims in case mixture_distribution has its own
+    # batch dims.
+    ids = array_ops.reshape(ids, shape=concat_vectors(
+        [n],
+        distribution_util.pick_vector(
+            self.is_scalar_batch(),
+            np.int32([]),
+            np.int32([-1]))))
+
+    # Stride `components * quadrature_size` for `batch_size` number of times.
+    stride = self.grid.shape.with_rank_at_least(
+        2)[-2:].num_elements()
+    if stride is None:
+      stride = array_ops.reduce_prod(
+          array_ops.shape(self.grid)[-2:])
     offset = math_ops.range(start=0,
-                            limit=batch_size * self._quadrature_size,
-                            delta=self._quadrature_size,
+                            limit=batch_size * stride,
+                            delta=stride,
                             dtype=ids.dtype)
 
     weight = array_ops.gather(
-        array_ops.reshape(self.interpolate_weight, shape=[-1]),
+        array_ops.reshape(self.grid, shape=[-1]),
         ids + offset)
     weight = weight[..., array_ops.newaxis]
 
@@ -500,10 +648,7 @@ class VectorDiffeomixture(distribution_lib.Distribution):
         self.mixture_distribution.logits - fldj + log_prob, axis=-1)
 
   def _mean(self):
-    # Since we created logits to already be scaled, we can use exp which is
-    # slightly cheaper than `self.mixture_distribution.probs`.
-    p = math_ops.exp(self.mixture_distribution.logits)
-
+    p = self._expand_mix_distribution_probs()
     m = self._expand_base_distribution_mean()
     mean = None
     for k, aff in enumerate(self.interpolated_affine):
@@ -537,9 +682,7 @@ class VectorDiffeomixture(distribution_lib.Distribution):
         self._covariance_of_mean_given_quadrature_component(diag_only=True))
 
   def _mean_of_covariance_given_quadrature_component(self, diag_only):
-    # Since we created logits to already be scaled, we can use exp which is
-    # slightly cheaper than `self.mixture_distribution.probs`.
-    p = math_ops.exp(self.mixture_distribution.logits)
+    p = self.mixture_distribution.probs
 
     # To compute E[Cov(Z|V)], we'll add matrices within three categories:
     # scaled-identity, diagonal, and full. Then we'll combine these at the end.
@@ -611,10 +754,9 @@ class VectorDiffeomixture(distribution_lib.Distribution):
   def _covariance_of_mean_given_quadrature_component(self, diag_only):
     square = math_ops.square if diag_only else vec_osquare
 
-    # Since we created logits to already be scaled, we can use exp which is
-    # slightly cheaper than `self.mixture_distribution.probs`.
-    p = math_ops.exp(self.mixture_distribution.logits)
-
+    p = self._expand_mix_distribution_probs()
+    if not diag_only:
+      p = p[..., array_ops.newaxis, :]  # Assuming event.ndims=1.
     m = self._expand_base_distribution_mean()
 
     cov_e_z_given_v = None
@@ -638,17 +780,25 @@ class VectorDiffeomixture(distribution_lib.Distribution):
     m.set_shape(self.batch_shape.concatenate(self.event_shape))
     return m
 
-
-def maybe_check_mix_param(param, name, expected_base_dtype, validate_args):
-  """Helper which checks validity of `mix_loc` and `mix_scale` init args."""
+  def _expand_mix_distribution_probs(self):
+    p = self.mixture_distribution.probs  # [B, deg]
+    deg = p.shape.with_rank_at_least(1)[-1].value
+    if deg is None:
+      deg = array_ops.shape(p)[-1]
+    event_ndims = self.event_shape.ndims
+    if event_ndims is None:
+      event_ndims = array_ops.shape(self.event_shape_tensor())[0]
+    expand_shape = array_ops.concat([
+        self.mixture_distribution.batch_shape_tensor(),
+        array_ops.ones([event_ndims], dtype=dtypes.int32),
+        [deg],
+    ], axis=0)
+    return array_ops.reshape(p, shape=expand_shape)
+
+
+def maybe_check_quadrature_param(param, name, validate_args):
+  """Helper which checks validity of `loc` and `scale` init args."""
   with ops.name_scope(name="check_" + name, values=[param]):
-    param = ops.convert_to_tensor(param, dtype=expected_base_dtype, name=name)
-
-    if param.dtype.base_dtype != expected_base_dtype:
-      raise TypeError(
-          "dtype mismatch; {}.base_dtype=\"{}\" is not \"{}\".".format(
-              name, param.dtype.base_dtype.name, expected_base_dtype.name))
-
     assertions = []
     if param.shape.ndims is not None:
       if param.shape.ndims == 0:
@@ -679,79 +829,84 @@ def maybe_check_mix_param(param, name, expected_base_dtype, validate_args):
     return param
 
 
-def determine_batch_event_shapes(mix_loc, mix_scale, endpoint_affine):
+def determine_batch_event_shapes(grid, endpoint_affine):
   """Helper to infer batch_shape and event_shape."""
   with ops.name_scope(name="determine_batch_event_shapes"):
-    mix_batch_shape = distribution_util.prefer_static_broadcast_shape(
-        array_ops.shape(mix_loc, name="mix_loc_shape"),
-        array_ops.shape(mix_scale, name="mix_scale_shape"))
-    if isinstance(mix_batch_shape, tensor_shape.TensorShape):
-      mix_batch_shape = mix_batch_shape.with_rank_at_least(1)[:-1]
-    else:
-      s = static_value(mix_batch_shape)
-      if s is not None:
-        mix_batch_shape = ops.convert_to_tensor(
-            s[:-1], dtype=dtypes.int32, name="mix_batch_shape")
-      else:
-        mix_batch_shape = mix_batch_shape[:-1]
-
-    # We broadcast with a 1D constant to automatically make the result a
-    # TensorShape if possible.
-    batch_shape = distribution_util.prefer_static_broadcast_shape(
-        mix_batch_shape,
-        constant_op.constant([], dtype=dtypes.int32, name="batch_shape"))
-    event_shape = constant_op.constant(
-        [], dtype=dtypes.int32, name="event_shape")
+    # grid  # shape: [B, k, q]
+    # endpoint_affine     # len=k, shape: [B, d, d]
+    batch_shape = grid.shape[:-2]
+    batch_shape_tensor = array_ops.shape(grid)[:-2]
+    event_shape = None
+    event_shape_tensor = None
+
+    def _set_event_shape(shape, shape_tensor):
+      if event_shape is None:
+        return shape, shape_tensor
+      return (array_ops.broadcast_static_shape(event_shape, shape),
+              array_ops.broadcast_dynamic_shape(
+                  event_shape_tensor, shape_tensor))
+
     for aff in endpoint_affine:
-      b, e = distribution_util.shapes_from_loc_and_scale(aff.shift, aff.scale)
-      if batch_shape is None:
-        batch_shape = distribution_util.prefer_static_broadcast_shape(
-            mix_batch_shape, b)
-      else:
-        batch_shape = distribution_util.prefer_static_broadcast_shape(
-            batch_shape, b)
-      event_shape = distribution_util.prefer_static_broadcast_shape(
-          event_shape, e)
-    if isinstance(batch_shape, tensor_shape.TensorShape):
-      batch_shape = ops.convert_to_tensor(
-          batch_shape.as_list(), dtype=dtypes.int32, name="batch_shape")
-    if isinstance(event_shape, tensor_shape.TensorShape):
-      event_shape = ops.convert_to_tensor(
-          event_shape.as_list(), dtype=dtypes.int32, name="event_shape")
-    return batch_shape, event_shape
-
-
-def interpolate_loc(deg, interpolate_weight, loc):
+      if aff.shift is not None:
+        batch_shape = array_ops.broadcast_static_shape(
+            batch_shape, aff.shift.shape[:-1])
+        batch_shape_tensor = array_ops.broadcast_dynamic_shape(
+            batch_shape_tensor, array_ops.shape(aff.shift)[:-1])
+        event_shape, event_shape_tensor = _set_event_shape(
+            aff.shift.shape[-1:], array_ops.shape(aff.shift)[-1:])
+
+      if aff.scale is not None:
+        batch_shape = array_ops.broadcast_static_shape(
+            batch_shape, aff.scale.batch_shape)
+        batch_shape_tensor = array_ops.broadcast_dynamic_shape(
+            batch_shape_tensor, aff.scale.batch_shape_tensor())
+        event_shape, event_shape_tensor = _set_event_shape(
+            tensor_shape.TensorShape([aff.scale.range_dimension]),
+            aff.scale.range_dimension_tensor()[array_ops.newaxis])
+
+    return batch_shape, batch_shape_tensor, event_shape, event_shape_tensor
+
+
+def interpolate_loc(grid, loc):
   """Helper which interpolates between two locs."""
   if len(loc) != 2:
     raise NotImplementedError("Currently only bimixtures are supported; "
                               "len(scale)={} is not 2.".format(len(loc)))
-  with ops.name_scope("interpolate_loc", values=[interpolate_weight, loc]):
+  deg = grid.shape.with_rank_at_least(1)[-1].value
+  if deg is None:
+    raise ValueError("Num quadrature grid points must be known prior "
+                     "to graph execution.")
+  with ops.name_scope("interpolate_loc", values=[grid, loc]):
     if loc is None or loc[0] is None and loc[1] is None:
       return [None]*deg
-    w = interpolate_weight[..., array_ops.newaxis, :]  # shape: [B, 1, deg]
+    # shape: [B, 1, k, deg]
+    w = grid[..., array_ops.newaxis, :, :]
     loc = [x[..., array_ops.newaxis]                   # shape: [B, e, 1]
            if x is not None else None for x in loc]
     if loc[0] is None:
-      x = (1. - w) * loc[1]                            # shape: [B, e, deg]
+      x = w[..., 1, :] * loc[1]                        # shape: [B, e, deg]
     elif loc[1] is None:
-      x = w * loc[0]                                   # shape: [B, e, deg]
+      x = w[..., 0, :] * loc[0]                        # shape: [B, e, deg]
     else:
       delta = loc[0] - loc[1]
-      x = w * delta + loc[1]                           # shape: [B, e, deg]
+      x = w[..., 0, :] * delta + loc[1]                # shape: [B, e, deg]
     return [x[..., k] for k in range(deg)]             # list(shape:[B, e])
 
 
-def interpolate_scale(deg, interpolate_weight, scale):
+def interpolate_scale(grid, scale):
   """Helper which interpolates between two scales."""
   if len(scale) != 2:
     raise NotImplementedError("Currently only bimixtures are supported; "
                               "len(scale)={} is not 2.".format(len(scale)))
-  with ops.name_scope("interpolate_scale", values=[interpolate_weight]):
+  deg = grid.shape.with_rank_at_least(1)[-1].value
+  if deg is None:
+    raise ValueError("Num quadrature grid points must be known prior "
+                     "to graph execution.")
+  with ops.name_scope("interpolate_scale", values=[grid]):
     return [linop_add_lib.add_operators([
-        linop_scale(interpolate_weight[..., k], scale[0]),
-        linop_scale(1. - interpolate_weight[..., k], scale[1]),
-    ])[0] for k in range(deg)]
+        linop_scale(grid[..., k, q], s)
+        for k, s in enumerate(scale)
+    ])[0] for q in range(deg)]
 
 
 def linop_scale(w, op):
@@ -791,39 +946,12 @@ def linop_scale(w, op):
 
 def concat_vectors(*args):
   """Concatenates input vectors, statically if possible."""
-  args_ = [static_value(x) for x in args]
+  args_ = [distribution_util.static_value(x) for x in args]
   if any(vec is None for vec in args_):
     return array_ops.concat(args, axis=0)
   return [val for vec in args_ for val in vec]
 
 
-def reduce_prod(x):
-  """Same as `math_ops.reduce_prod` but statically if possible."""
-  x_ = static_value(x)
-  if x_ is not None:
-    return np.prod(x_, dtype=x.dtype.as_numpy_dtype)
-  return array_ops.reduce_prod(x)
-
-
-def ndims_from_shape(shape):
-  """Returns `Tensor`'s `rank` implied by a `Tensor` shape."""
-  if shape.shape.ndims not in (None, 1):
-    raise ValueError("input is not a valid shape: not 1D")
-  if not shape.dtype.is_integer:
-    raise TypeError("input is not a valid shape: wrong dtype")
-  if shape.shape.is_fully_defined():
-    return shape.shape.as_list()[0]
-  return array_ops.shape(shape)[0]
-
-
-def ndims(x):
-  """Returns rank, statically if possible."""
-  x = ops.convert_to_tensor(x)
-  if x.shape.ndims is not None:
-    return x.shape.ndims
-  return array_ops.rank(x)
-
-
 def add(x, y):
   """Adds inputs; interprets `None` as zero."""
   if x is None:
@@ -836,3 +964,18 @@ def add(x, y):
 def vec_osquare(x):
   """Computes the outer-product of a (batch of) vector, i.e., x.T x."""
   return x[..., :, array_ops.newaxis] * x[..., array_ops.newaxis, :]
+
+
+def softmax(x, axis, name=None):
+  """Equivalent to tf.nn.softmax but works around b/70297725."""
+  with ops.name_scope(name, "softmax", [x, axis]):
+    x = ops.convert_to_tensor(x, name="x")
+    ndims = (x.shape.ndims if x.shape.ndims is not None
+             else array_ops.rank(x, name="ndims"))
+    axis = ops.convert_to_tensor(axis, dtype=dtypes.int32, name="axis")
+    axis_ = tensor_util.constant_value(axis)
+    if axis_ is not None:
+      axis = np.int(ndims + axis_ if axis_ < 0 else axis_)
+    else:
+      axis = array_ops.where(axis < 0, ndims + axis, axis)
+  return nn_ops.softmax(x, axis=axis)
diff --git a/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py b/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py
index 356d78b67a8107750f68f7f84d73d1231f5b2b03..526fe2d39aef9aed833b889de80e849c469435e7 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py
@@ -89,14 +89,13 @@ class VectorExponentialDiag(
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
-  la = tf.linalg
+  tfd = tf.contrib.distributions
 
   # Initialize a single 2-variate VectorExponential, supported on
   # {(x, y) in R^2 : x > 0, y > 0}.
 
   # The first component has pdf exp{-x}, the second 0.5 exp{-x / 2}
-  vex = ds.VectorExponentialDiag(scale_diag=[1., 2.])
+  vex = tfd.VectorExponentialDiag(scale_diag=[1., 2.])
 
   # Compute the pdf of an`R^2` observation; return a scalar.
   vex.prob([3., 4.]).eval()  # shape: []
@@ -107,7 +106,7 @@ class VectorExponentialDiag(
   scale_diag = [[1., 2, 3],
                 [0.5, 1, 1.5]]     # shape: [2, 3]
 
-  vex = ds.VectorExponentialDiag(loc, scale_diag)
+  vex = tfd.VectorExponentialDiag(loc, scale_diag)
 
   # Compute the pdf of two `R^3` observations; return a length-2 vector.
   x = [[1.9, 2.2, 3.1],
diff --git a/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py b/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py
index b313a851b381e5b3a057fd17e6c2ef4eb0fc34f1..9d5fd9ac4178a1ae29b1ce32f304b22fd3d234dc 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py
@@ -107,16 +107,15 @@ class VectorExponentialLinearOperator(
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
-  la = tf.linalg
+  tfd = tf.contrib.distributions
 
   # Initialize a single 2-variate VectorExponential, supported on
   # {(x, y) in R^2 : x > 0, y > 0}.
   mat = [[1.0, 0.1],
          [0.1, 1.0]]
 
-  vex = ds.VectorExponentialLinearOperator(
-      scale=la.LinearOperatorFullMatrix(mat))
+  vex = tfd.VectorExponentialLinearOperator(
+      scale=tf.linalg.LinearOperatorFullMatrix(mat))
 
   # Compute the pdf of an`R^2` observation; return a scalar.
   vex.prob([1., 2.]).eval()  # shape: []
@@ -127,9 +126,9 @@ class VectorExponentialLinearOperator(
   scale_diag = [[1., 2, 3],
                 [0.5, 1, 1.5]]     # shape: [2, 3]
 
-  vex = ds.VectorExponentialLinearOperator(
+  vex = tfd.VectorExponentialLinearOperator(
       loc=mu,
-      scale=la.LinearOperatorDiag(scale_diag))
+      scale=tf.linalg.LinearOperatorDiag(scale_diag))
 
   # Compute the pdf of two `R^3` observations; return a length-2 vector.
   x = [[1.9, 2.2, 3.1],
diff --git a/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py b/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py
index 0e3867809a820f49cfa7f5282c47f786626481a6..8dd983b750d9b39775e570800006011f4968f7f3 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py
@@ -101,10 +101,10 @@ class VectorLaplaceDiag(
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
 
   # Initialize a single 2-variate VectorLaplace.
-  vla = ds.VectorLaplaceDiag(
+  vla = tfd.VectorLaplaceDiag(
       loc=[1., -1],
       scale_diag=[1, 2.])
 
@@ -118,7 +118,7 @@ class VectorLaplaceDiag(
   vla.prob([-1., 0]).eval()  # shape: []
 
   # Initialize a 3-batch, 2-variate scaled-identity VectorLaplace.
-  vla = ds.VectorLaplaceDiag(
+  vla = tfd.VectorLaplaceDiag(
       loc=[1., -1],
       scale_identity_multiplier=[1, 2., 3])
 
@@ -136,7 +136,7 @@ class VectorLaplaceDiag(
   vla.prob([-1., 0]).eval()  # shape: [3]
 
   # Initialize a 2-batch of 3-variate VectorLaplace's.
-  vla = ds.VectorLaplaceDiag(
+  vla = tfd.VectorLaplaceDiag(
       loc=[[1., 2, 3],
            [11, 22, 33]]           # shape: [2, 3]
       scale_diag=[[1., 2, 3],
diff --git a/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py b/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py
index c7abdbb4caf9bee4cbd5991eb5d652f20dd0f8d1..ec485c95c15da2794b67d2699d2bdd9db97bb6c4 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py
@@ -109,8 +109,7 @@ class VectorLaplaceLinearOperator(
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
-  la = tf.linalg
+  tfd = tf.contrib.distributions
 
   # Initialize a single 3-variate VectorLaplace with some desired covariance.
   mu = [1., 2, 3]
@@ -124,9 +123,9 @@ class VectorLaplaceLinearOperator(
   #      [ 0.1, -0.3,  0.4]])
 
   # Divide scale by sqrt(2) so that the final covariance will be what we want.
-  vla = ds.VectorLaplaceLinearOperator(
+  vla = tfd.VectorLaplaceLinearOperator(
       loc=mu,
-      scale=la.LinearOperatorLowerTriangular(scale / tf.sqrt(2)))
+      scale=tf.linalg.LinearOperatorLowerTriangular(scale / tf.sqrt(2.)))
 
   # Covariance agrees with cholesky(cov) parameterization.
   vla.covariance().eval()
@@ -143,9 +142,9 @@ class VectorLaplaceLinearOperator(
   scale_diag = [[1., 2, 3],
                 [0.5, 1, 1.5]]     # shape: [2, 3]
 
-  vla = ds.VectorLaplaceLinearOperator(
+  vla = tfd.VectorLaplaceLinearOperator(
       loc=mu,
-      scale=la.LinearOperatorDiag(scale_diag))
+      scale=tf.linalg.LinearOperatorDiag(scale_diag))
 
   # Compute the pdf of two `R^3` observations; return a length-2 vector.
   x = [[-0.9, 0, 0.1],
diff --git a/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py b/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
index 544a8710709a0afb56c6ae6f36d35de892e8e420..e1ccf116457a97261b9ce3965552764771d3bdd2 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
@@ -143,7 +143,7 @@ class VectorSinhArcsinhDiag(transformed_distribution.TransformedDistribution):
         broadcastable with `event_shape`.
       distribution: `tf.Distribution`-like instance. Distribution from which `k`
         iid samples are used as input to transformation `F`.  Default is
-        `ds.Normal(0., 1.)`.
+        `tf.distributions.Normal(loc=0., scale=1.)`.
         Must be a scalar-batch, scalar-event distribution.  Typically
         `distribution.reparameterization_type = FULLY_REPARAMETERIZED` or it is
         a function of non-trainable parameters. WARNING: If you backprop through
diff --git a/tensorflow/contrib/distributions/python/ops/vector_student_t.py b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
index 29d41ab81c62d621c3c3533e1449341e9a085645..8c67647a618d22a58428d78865c4ebf7d98bdf9e 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_student_t.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
@@ -91,14 +91,14 @@ class _VectorStudentT(transformed_distribution.TransformedDistribution):
   Extra leading dimensions, if provided, allow for batches.
 
   ```python
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
 
   # Initialize a single 3-variate vector Student's t-distribution.
   mu = [1., 2, 3]
   chol = [[1., 0, 0.],
           [1, 3, 0],
           [1, 2, 3]]
-  vt = ds.VectorStudentT(df=2, loc=mu, scale_tril=chol)
+  vt = tfd.VectorStudentT(df=2, loc=mu, scale_tril=chol)
 
   # Evaluate this on an observation in R^3, returning a scalar.
   vt.prob([-1., 0, 1])
@@ -107,7 +107,7 @@ class _VectorStudentT(transformed_distribution.TransformedDistribution):
   mu = [[1., 2, 3],
         [11, 22, 33]]
   chol = ...  # shape 2 x 3 x 3, lower triangular, positive diagonal.
-  vt = ds.VectorStudentT(loc=mu, scale_tril=chol)
+  vt = tfd.VectorStudentT(loc=mu, scale_tril=chol)
 
   # Evaluate this on a two observations, each in R^3, returning a length two
   # tensor.
diff --git a/tensorflow/contrib/eager/README.md b/tensorflow/contrib/eager/README.md
index dcc370cd00d5f93cd5b145a31fd58ef5041a86a8..09242ee47ddd044dfc99e22d5b7751a989c86485 100644
--- a/tensorflow/contrib/eager/README.md
+++ b/tensorflow/contrib/eager/README.md
@@ -76,3 +76,6 @@ For an introduction to eager execution in TensorFlow, see:
 ## Changelog
 
 - 2017/10/31: Initial preview release.
+- 2017/12/01: Example of dynamic neural network:
+  [SPINN: Stack-augmented Parser-Interpreter Neural Network](https://arxiv.org/abs/1603.06021).
+  See [README.md](python/examples/spinn/README.md) for details.
diff --git a/tensorflow/contrib/eager/proto/BUILD b/tensorflow/contrib/eager/proto/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..aedfec8924e7314addd22349c0576a84a58d9aa3
--- /dev/null
+++ b/tensorflow/contrib/eager/proto/BUILD
@@ -0,0 +1,24 @@
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+tf_proto_library(
+    name = "checkpointable_object_graph_proto",
+    srcs = [
+        "checkpointable_object_graph.proto",
+    ],
+    visibility = ["//tensorflow/contrib/eager/python:__subpackages__"],
+)
diff --git a/tensorflow/contrib/eager/proto/checkpointable_object_graph.proto b/tensorflow/contrib/eager/proto/checkpointable_object_graph.proto
new file mode 100644
index 0000000000000000000000000000000000000000..c962638aa11c06dcd5be6a794314e029ae84e572
--- /dev/null
+++ b/tensorflow/contrib/eager/proto/checkpointable_object_graph.proto
@@ -0,0 +1,56 @@
+syntax = "proto3";
+
+option cc_enable_arenas = true;
+
+package tensorflow.contrib.eager;
+
+// Prototype for an addition to BundleHeaderProto which saves extra information
+// about the objects which own variables, allowing for more robust checkpoint
+// loading into modified programs.
+
+message CheckpointableObjectGraph {
+  message Object {
+    message ObjectReference {
+      // An index into `CheckpointableObjectGraph.nodes`, indicating the object
+      // being referenced.
+      int32 node_id = 1;
+      // A numeric identifier for this object within its parent.
+      int32 local_uid = 2;
+      // A user-provided name for the edge. May be blank/omitted, in which case
+      // there is no explicitly provided local name; fall back on local_uid.
+      string local_name = 3;
+    }
+
+    message VariableReference {
+      // A name for the variable which is unique within the object which owns
+      // it. Does not include a name_scope or variable_scope prefix.
+      string local_name = 1;
+      // The full name of the variable. Used to allow name-based loading of
+      // checkpoints which were saved using an object-based API.
+      string full_name = 2;
+    }
+
+    message SlotVariableReference {
+      // An index into `CheckpointableObjectGraph.nodes`, indicating the object
+      // which created the variable that this variable is slotting for.
+      int32 original_variable_node_id = 1;
+      // The local name of the variable being slotted for within the object that
+      // owns it.
+      string original_variable_local_name = 2;
+      // The name of the slot (e.g. "m"/"v").
+      string slot_name = 3;
+      // The full name of the slot variable. Used to allow name-based loading of
+      // checkpoints which were saved using an object-based API.
+      string full_name = 4;
+    }
+
+    // Objects which this object depends on.
+    repeated ObjectReference children = 1;
+    // Non-slot variables owned by this object.
+    repeated VariableReference variables = 2;
+    // Slot variables owned by this object.
+    repeated SlotVariableReference slot_variables = 3;
+  }
+
+  repeated Object nodes = 1;
+}
diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD
index bf2e883bc53c3281ef89d1200f5a089305ef3e72..086315464c99811371d836aed290b5068729adb0 100644
--- a/tensorflow/contrib/eager/python/BUILD
+++ b/tensorflow/contrib/eager/python/BUILD
@@ -19,6 +19,7 @@ py_library(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:numerics",
         "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:script_ops",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:backprop",
@@ -103,37 +104,6 @@ cuda_py_test(
     ],
 )
 
-py_library(
-    name = "summary_writer",
-    srcs = ["summary_writer.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/summary:gen_summary_ops",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:summary_op_util",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/eager:context",
-    ],
-)
-
-cuda_py_test(
-    name = "summary_writer_test",
-    srcs = ["summary_writer_test.py"],
-    additional_deps = [
-        ":summary_writer",
-        "//third_party/py/numpy",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python/eager:context",
-        "//tensorflow/python/eager:test",
-    ],
-)
-
 py_library(
     name = "metrics",
     srcs = [
@@ -232,6 +202,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":network",
+        "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_test_lib",
@@ -246,6 +217,39 @@ py_test(
     ],
 )
 
+py_library(
+    name = "checkpointable",
+    srcs = ["checkpointable.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/contrib/eager/proto:checkpointable_object_graph_proto_py",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+    ],
+)
+
+py_test(
+    name = "checkpointable_test",
+    srcs = ["checkpointable_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":checkpointable",
+        ":network",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+        "@six_archive//:six",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/eager/python/checkpointable.py b/tensorflow/contrib/eager/python/checkpointable.py
new file mode 100644
index 0000000000000000000000000000000000000000..b141ffb2bc03b8e38f8481bc044c3aae7e156c15
--- /dev/null
+++ b/tensorflow/contrib/eager/python/checkpointable.py
@@ -0,0 +1,392 @@
+"""An object-local variable management scheme."""
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import re
+
+from tensorflow.contrib.eager.proto import checkpointable_object_graph_pb2
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import optimizer as optimizer_lib
+from tensorflow.python.training import saver as saver_lib
+
+_CheckpointableReference = collections.namedtuple(
+    "_CheckpointableReference",
+    [
+        "name",  # The local name if explicitly specified, else None.
+        "local_uid",  # 0 for the first dependency, 1 for the next, ... Used for
+        # routing checkpointed variables to their correct
+        # Checkpointables when "name" is not set (see docstring of
+        # `track_checkpointable`).
+        "ref"  # The Checkpointable object being referenced.
+    ])
+
+_OwnedVariable = collections.namedtuple(
+    "_OwnedVariable",
+    [
+        "name",  # The variable's (local) name.
+        "variable"  # The owned variable object.
+    ])
+
+# Validation regular expression for the local names of Checkpointable
+# objects. In particular, disallows "/" in names, and reserves
+# underscore-prefixed names.
+_VALID_LOCAL_NAME = re.compile(r"^[A-Za-z0-9.][A-Za-z0-9_.-]*$")
+
+# Keyword for identifying that the next bit of a checkpoint variable name is a
+# slot name. May not be the local name of a checkpointable. Checkpoint names for
+# slot variables look like:
+#
+#   <path to variable>/<_OPTIMIZER_SLOTS_NAME>/<path to optimizer>/<slot name>
+#
+# Where <path to variable> is a full path from the checkpoint root to the
+# variable being slotted for.
+_OPTIMIZER_SLOTS_NAME = "_OPTIMIZER_SLOT"
+
+
+class Checkpointable(object):
+  """Manages variables and dependencies on other objects.
+
+  To make reliable checkpoints, all `Checkpointable`s on which this object
+  depends must be registered in the constructor using `track_checkpointable` in
+  a deterministic order, and if possible they should be named. Variables may be
+  created using `add_variable` outside of the constructor and in any order, but
+  only these variables will be saved.
+  """
+
+  def __init__(self):
+    # Basically less useful OrderedDicts but without the reference cycles.
+    # TODO(allenl): Switch these to OrderedDict once TensorFlow supports only
+    # Python 3.6+.
+    self._checkpoint_dependencies = []  # A list of _CheckpointableReference
+    # objects.
+    self._dependency_names = set()
+    self._owned_variables = []  # A list of _OwnedVariable objects.
+    self._owned_variable_names = set()
+
+  def add_variable(self, name, shape, dtype=None, initializer=None, **kwargs):
+    """Create a new variable object to be saved with this `Checkpointable`.
+
+    If the user has requested that this object or another `Checkpointable` which
+    depends on this object be restored from a checkpoint (deferred loading
+    before variable object creation), `initializer` may be ignored and the value
+    from the checkpoint used instead.
+
+    Args:
+      name: A name for the variable. Must be unique within this object.
+      shape: The shape of the variable.
+      dtype: The data type of the variable.
+      initializer: The initializer to use. Ignored if deferred loading has been
+        requested.
+      **kwargs: Passed to get_variable.
+
+    Returns:
+      The new variable object.
+
+    Raises:
+      ValueError: If the variable name is not unique.
+    """
+    if name in self._owned_variable_names:
+      raise ValueError(
+          ("A variable named '%s' already exists in this Checkpointable, but "
+           "Checkpointable.add_variable called to create another with "
+           "that name. Variable names must be unique within a Checkpointable "
+           "object.") % (name,))
+    if "getter" in kwargs:
+      # Allow the getter to be overridden, typically because there is a need for
+      # compatibility with some other variable creation mechanism. This should
+      # be relatively uncommon in user code.
+      getter = kwargs.pop("getter")
+    else:
+      getter = variable_scope.get_variable
+    # TODO(allenl): handle deferred loading
+    new_variable = getter(
+        name=name, shape=shape, dtype=dtype, initializer=initializer, **kwargs)
+    self._owned_variables.append(
+        _OwnedVariable(name=name, variable=new_variable))
+    self._owned_variable_names.add(name)
+    return new_variable
+
+  def track_checkpointable(self, checkpointable, name=None):
+    """Declare a dependency on another `Checkpointable` object.
+
+    Indicates that checkpoints for this object should include variables from
+    `checkpointable`.
+
+    Variables in a checkpoint are mapped to `Checkpointable`s based on names if
+    provided when the checkpoint was written, but otherwise use the order those
+    `Checkpointable`s were declared as dependencies. Both `name` arguments and
+    the dependency declaration order should be deterministic.
+
+    There are two sufficient conditions to avoid breaking existing checkpoints
+    when modifying a class: (1) New dependencies must be declared after existing
+    dependencies, and (2) dependencies which were previously declared may never
+    be removed (a trivial placeholder with the same name may be used instead).
+
+    Args:
+      checkpointable: A `Checkpointable` which this object depends on.
+      name: A local name for `checkpointable`, used for loading checkpoints into
+        the correct objects. If provided, it must be unique within this
+        `Checkpointable`. If None, dependency declaration order is used instead.
+
+    Returns:
+      `checkpointable`, for convenience when declaring a dependency and
+      assigning to a member variable in one statement.
+
+    Raises:
+      RuntimeError: If __init__ was not called.
+      TypeError: If `checkpointable` does not inherit from `Checkpointable`.
+      ValueError: For invalid names.
+    """
+    if not hasattr(self, "_checkpoint_dependencies"):
+      raise RuntimeError("Need to call Checkpointable.__init__ before calling "
+                         "Checkpointable.track_checkpointable().")
+    if not isinstance(checkpointable, Checkpointable):
+      raise TypeError(
+          ("Checkpointable.track_checkpointable() passed type %s, not a "
+           "Checkpointable.") % (type(checkpointable),))
+    if name is not None:
+      if not _VALID_LOCAL_NAME.match(name):
+        raise ValueError(
+            ("Checkpointable names must match the regular expression '%s', but "
+             "got an invalid name '%s' instead.") % (_VALID_LOCAL_NAME.pattern,
+                                                     name))
+      if name in self._dependency_names:
+        raise ValueError(
+            ("Called Checkpointable.track_checkpointable() with name='%s', but "
+             "a Checkpointable with this name is already declared as a "
+             "dependency. If provided, names must be unique.") % (name,))
+      self._dependency_names.add(name)
+    self._checkpoint_dependencies.append(
+        _CheckpointableReference(
+            name=name,
+            ref=checkpointable,
+            # TODO(allenl): Should this be exposed to allow users to stop
+            # depending on things and still load checkpoints when not using
+            # names?
+            local_uid=len(self._checkpoint_dependencies)))
+    return checkpointable
+
+  @property
+  def checkpoint_dependencies(self):
+    """Other `Checkpointable` objects on which this object depends."""
+    return self._checkpoint_dependencies
+
+
+def _breadth_first_checkpointable_traversal(root_checkpointable):
+  """Find shortest paths to all variables owned by dependencies of root."""
+  bfs_sorted = []
+  root_checkpointable_reference = _CheckpointableReference(
+      name=None, local_uid=0, ref=root_checkpointable)
+  to_visit = collections.deque([root_checkpointable_reference])
+  path_to_root = {root_checkpointable_reference: ()}
+  while to_visit:
+    current_checkpointable = to_visit.popleft()
+    bfs_sorted.append(current_checkpointable)
+    for child_checkpointable in (
+        current_checkpointable.ref.checkpoint_dependencies):
+      if child_checkpointable not in path_to_root:
+        path_to_root[child_checkpointable] = (
+            path_to_root[current_checkpointable] + (child_checkpointable,))
+        to_visit.append(child_checkpointable)
+  return bfs_sorted, path_to_root
+
+
+def _object_prefix_from_path(path_to_root):
+  return "/".join((checkpointable.name if checkpointable.name else "_%d" % (
+      checkpointable.local_uid,)) for checkpointable in path_to_root)
+
+
+def _escape_variable_name(variable_name):
+  # We need to support slashes in variable names for compatibility, since this
+  # naming scheme is being patched in to things like Layer.add_variable where
+  # slashes were previously accepted. We also want to use slashes to indicate
+  # edges traversed to reach the variable, so we escape forward slashes in
+  # variable names.
+  return variable_name.replace("_S_", "_S_.").replace(r"/", r"_S__")
+
+
+def _variable_naming_for_object(path_to_root):
+  """Make a function for naming variables in an object."""
+  # Name non-slot variables:
+  #
+  #   <path to node>/<local variable name>
+  #
+  # <path to node> is not necessarily unique, but this is fine since we also
+  # save the graph of `Checkpointable`s with the checkpoint. Even if this path
+  # no longer exists because of a change in the Python program, we can look up
+  # the `Checkpointable` which owns the variable in the checkpoint's graph and
+  # use another path if one still exists.
+
+  object_prefix = _object_prefix_from_path(path_to_root)
+  if object_prefix:
+    object_prefix += "/"
+
+  def _name_single_variable(owned_variable):
+    """Names a variable within an object."""
+    return object_prefix + _escape_variable_name(owned_variable.name)
+
+  return _name_single_variable
+
+
+def _slot_variable_naming_for_optimizer(optimizer, path_to_root):
+  """Make a function for naming slot variables in an optimizer."""
+  # Name slot variables:
+  #
+  #   <variable name>/<_OPTIMIZER_SLOTS_NAME>/<optimizer path>/<slot name>
+  #
+  # where <variable name> is exactly the checkpoint name used for the original
+  # variable, including the path from the checkpoint root and the local name in
+  # the object which owns it. Note that we only save slot variables if the
+  # variable it's slotting for is also being saved.
+
+  optimizer_identifier = "/%s/%s/" % (_OPTIMIZER_SLOTS_NAME,
+                                      _object_prefix_from_path(path_to_root))
+
+  def _name_slot_variable(variable_path, slot_name):
+    """With an optimizer specified, name a slot variable."""
+
+    if not _VALID_LOCAL_NAME.match(slot_name):
+      # Slot variable names include the name of the slot. We need to
+      # validate that part of the name to be sure that the checkpoint name
+      # is a valid name scope name.
+      raise ValueError(
+          ("Could not save slot variables for optimizer %s, because its "
+           "slot name has invalid characters (got '%s', was expecting it "
+           "to match the regular expression '%s').") %
+          (optimizer, slot_name, _VALID_LOCAL_NAME.pattern))
+
+    return variable_path + optimizer_identifier + slot_name
+
+  return _name_slot_variable
+
+
+def _serialize_non_slot_variables(checkpointable_objects, path_to_root,
+                                  object_graph_proto):
+  """Name non-slot variables and add them to `object_graph_proto`."""
+  named_variables = {}
+  non_slot_variables = []
+  checkpoint_node_ids = {}
+
+  for checkpoint_id, checkpointable in enumerate(checkpointable_objects):
+    checkpoint_node_ids[checkpointable] = checkpoint_id
+
+  for checkpoint_id, checkpointable in enumerate(checkpointable_objects):
+    naming_scheme = _variable_naming_for_object(path_to_root[checkpointable])
+    object_proto = object_graph_proto.nodes.add()
+    for owned_variable in checkpointable.ref._owned_variables:  # pylint: disable=protected-access
+      variable_name = naming_scheme(owned_variable)
+      named_variables[variable_name] = owned_variable.variable
+      non_slot_variables.append((
+          variable_name,  # The variable's full checkpoint name
+          owned_variable,  # The variable's _OwnedVariable object
+          checkpoint_id))  # The checkpoint ID of the node which owns this
+      # variable.
+      variable_proto = object_proto.variables.add()
+      variable_proto.local_name = owned_variable.name
+      # Figure out the name-based Saver's name for this variable.
+      saver_dict = saver_lib.BaseSaverBuilder.OpListToDict(
+          [owned_variable.variable], convert_variable_to_tensor=False)
+      variable_full_name, = saver_dict.keys()
+      variable_proto.full_name = variable_full_name
+
+    for child in checkpointable.ref.checkpoint_dependencies:
+      child_proto = object_proto.children.add()
+      child_proto.node_id = checkpoint_node_ids[child]
+      child_proto.local_uid = child.local_uid
+      if child.name is not None:
+        child_proto.local_name = child.name
+  return named_variables, non_slot_variables
+
+
+def _serialize_slot_variables(checkpointable_objects, path_to_root,
+                              non_slot_variables, object_graph_proto):
+  """Name slot variables and add them to `object_graph_proto`."""
+  named_slot_variables = {}
+  for optimizer_checkpoint_id, checkpointable_ref in enumerate(
+      checkpointable_objects):
+    if isinstance(checkpointable_ref.ref, optimizer_lib.Optimizer):
+      optimizer_object_proto = object_graph_proto.nodes[optimizer_checkpoint_id]
+      naming_scheme = _slot_variable_naming_for_optimizer(
+          optimizer=checkpointable_ref.ref,
+          path_to_root=path_to_root[checkpointable_ref])
+      slot_names = checkpointable_ref.ref.get_slot_names()
+      for (variable_path, owned_variable,
+           original_node_checkpoint_id) in non_slot_variables:
+        for slot_name in slot_names:
+          slot_variable = checkpointable_ref.ref.get_slot(
+              owned_variable.variable, slot_name)
+          if slot_variable is not None:
+            checkpoint_name = naming_scheme(
+                variable_path=variable_path, slot_name=slot_name)
+            named_slot_variables[checkpoint_name] = slot_variable
+            slot_variable_proto = optimizer_object_proto.slot_variables.add()
+            slot_variable_proto.slot_name = slot_name
+            # Figure out the name-based Saver's name for this variable.
+            saver_dict = saver_lib.BaseSaverBuilder.OpListToDict(
+                [slot_variable], convert_variable_to_tensor=False)
+            slot_variable_full_name, = saver_dict.keys()
+            slot_variable_proto.full_name = slot_variable_full_name
+            slot_variable_proto.original_variable_local_name = (
+                owned_variable.name)
+            slot_variable_proto.original_variable_node_id = (
+                original_node_checkpoint_id)
+  return named_slot_variables
+
+
+# TODO(allenl): Convenience utility for saving multiple objects (i.e. construct
+# a root Checkpointable if passed a list of Checkpointables).
+def _serialize_object_graph(root_checkpointable):
+  """Determine checkpoint keys for variables and build a serialized graph.
+
+  Non-slot variables are keyed based on a shortest path from the root saveable
+  to the object which owns the variable (i.e. the one which called
+  `Checkpointable.add_variable` to create it).
+
+  Slot variables are keyed based on a shortest path to the variable being
+  slotted for, a shortest path to their optimizer, and the slot name.
+
+  Args:
+    root_checkpointable: A `Checkpointable` object whose variables (including
+      the variables of dependencies, recursively) should be saved.
+
+  Returns:
+    A tuple of (named_variables, object_graph_proto):
+      named_variables: A dictionary mapping names to variable objects.
+      object_graph_proto: A CheckpointableObjectGraph protocol buffer containing
+        the serialized object graph and variable references.
+
+  Raises:
+    ValueError: If there are invalid characters in an optimizer's slot names.
+  """
+  checkpointable_objects, path_to_root = (
+      _breadth_first_checkpointable_traversal(root_checkpointable))
+  object_graph_proto = (
+      checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+
+  # Gather non-slot variables.
+  named_variables, non_slot_variables = _serialize_non_slot_variables(
+      checkpointable_objects, path_to_root, object_graph_proto)
+
+  # Gather slot variables which are associated with variables gathered above.
+  named_slot_variables = _serialize_slot_variables(
+      checkpointable_objects, path_to_root, non_slot_variables,
+      object_graph_proto)
+
+  named_variables.update(named_slot_variables)
+  return named_variables, object_graph_proto
diff --git a/tensorflow/contrib/eager/python/checkpointable_test.py b/tensorflow/contrib/eager/python/checkpointable_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f820990bbe5fe6c9b4cdf890680aaad0847010c0
--- /dev/null
+++ b/tensorflow/contrib/eager/python/checkpointable_test.py
@@ -0,0 +1,277 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import six
+
+from tensorflow.contrib.eager.python import checkpointable
+from tensorflow.contrib.eager.python import network as network_lib
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.layers import core
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.training import adam
+from tensorflow.python.training import training_util
+
+
+class CheckpointableDenseLayer(core.Dense, checkpointable.Checkpointable):
+
+  def __init__(self, *args, **kwargs):
+    checkpointable.Checkpointable.__init__(self)
+    core.Dense.__init__(self, *args, **kwargs)
+
+  def add_variable(self, name, shape, **kwargs):
+    # Calls both Checkpointable.add_variable and Layer.add_variable. Eventually
+    # Layer.add_variable should inherit from Checkpointable and simply call
+    # super and then do post-processing.
+    return checkpointable.Checkpointable.add_variable(
+        self,
+        name=name,
+        shape=shape,
+        getter=functools.partial(core.Dense.add_variable, self),
+        **kwargs)
+
+
+# pylint: disable=not-callable
+class CheckpointableNetwork(network_lib.Network, checkpointable.Checkpointable):
+
+  def __init__(self):
+    network_lib.Network.__init__(self)
+    checkpointable.Checkpointable.__init__(self)
+
+  def track_layer(self, layer, name=None):
+    self.track_checkpointable(layer, name=name)
+    return super(CheckpointableNetwork, self).track_layer(layer)
+
+
+class CheckpointableAdam(adam.AdamOptimizer, checkpointable.Checkpointable):
+
+  def __init__(self, *args, **kwargs):
+    checkpointable.Checkpointable.__init__(self)
+    adam.AdamOptimizer.__init__(self, *args, **kwargs)
+
+  # NOTE: Copied from AdamOptimizer with modifications to use add_variable
+  # for non-slot variables. These contortions are necessary to maintain
+  # checkpoint compatibility with variable.name based saving.
+  def _create_slots(self, var_list):
+    # Create the beta1 and beta2 accumulators on the same device as the first
+    # variable. Sort the var_list to make sure this device is consistent across
+    # workers (these need to go on the same PS, otherwise some updates are
+    # silently ignored).
+    first_var = min(var_list, key=lambda x: x.name)
+
+    create_new = self._beta1_power is None
+    if not create_new and context.in_graph_mode():
+      create_new = (self._beta1_power.graph is not first_var.graph)
+
+    if create_new:
+      with ops.colocate_with(first_var):
+
+        def _variable_getter(name, shape, dtype, initializer):
+          del shape, dtype  # not used, but there for compatibility
+          return variable_scope.variable(
+              name=name, initial_value=initializer, trainable=False)
+
+        self._beta1_power = self.add_variable(
+            name="beta1_power",
+            shape=[],
+            initializer=self._beta1,
+            getter=_variable_getter)
+        self._beta2_power = self.add_variable(
+            name="beta2_power",
+            shape=[],
+            initializer=self._beta2,
+            getter=_variable_getter)
+    # Create slots for the first and second moments.
+    for v in var_list:
+      self._zeros_slot(v, "m", self._name)
+      self._zeros_slot(v, "v", self._name)
+
+  # TODO(allenl): Override slot variable creation (_get_or_make_slot,
+  # _get_or_make_slot_with_initializer, _zeros_slot) to allow deferred
+  # loading. Likely no need to run this through add_variable, since gathering
+  # slot variables is special cased anyway.
+
+
+class MyNetwork(CheckpointableNetwork):
+  """A concrete Network for testing."""
+
+  def __init__(self):
+    super(MyNetwork, self).__init__()
+    self._named = self.track_layer(
+        CheckpointableDenseLayer(1, use_bias=True), name="named_dense")
+    self._unnamed = self.track_layer(
+        CheckpointableDenseLayer(1, use_bias=False))
+
+  def call(self, values):
+    return self._unnamed(self._named(values))
+
+
+class Root(checkpointable.Checkpointable):
+  """A stand-in for a Trainer class."""
+
+  def __init__(self, optimizer, network):
+    super(Root, self).__init__()
+    self.track_checkpointable(optimizer, name="optimizer")
+    self.track_checkpointable(network, name="network")
+    self._global_step = None
+
+  @property
+  def global_step(self):
+    if self._global_step is None:
+      # Get the default create_global_step utility to actually call
+      # self.add_variable, by setting a custom getter.
+      def _owned_variable_as_custom_getter(getter, *args, **kwargs):
+        return self.add_variable(*args, getter=getter, **kwargs)
+
+      with variable_scope.variable_scope(
+          "", custom_getter=_owned_variable_as_custom_getter):
+        self._global_step = training_util.create_global_step()
+    return self._global_step
+
+
+class CheckpointNamingTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testNamingWithOptimizer(self):
+    input_value = constant_op.constant([[3.]])
+    network = MyNetwork()
+    # A nuisance Network using the same optimizer. Its slot variables should not
+    # go in the checkpoint, since it is never depended on.
+    other_network = MyNetwork()
+    optimizer = CheckpointableAdam(0.001)
+    root_checkpointable = Root(optimizer=optimizer, network=network)
+    if context.in_eager_mode():
+      optimizer.minimize(
+          lambda: network(input_value),
+          global_step=root_checkpointable.global_step)
+      optimizer.minimize(
+          lambda: other_network(input_value),
+          global_step=root_checkpointable.global_step)
+    else:
+      train_op = optimizer.minimize(
+          network(input_value), global_step=root_checkpointable.global_step)
+      optimizer.minimize(
+          other_network(input_value),
+          global_step=root_checkpointable.global_step)
+      self.evaluate(variables.global_variables_initializer())
+      self.evaluate(train_op)
+    named_variables, serialized_graph = checkpointable._serialize_object_graph(
+        root_checkpointable)
+    expected_checkpoint_names = (
+        # Created in the root node, so no prefix.
+        "global_step",
+        # No name provided to track_checkpointable(), so the position (1, after
+        # the named track_checkpointable() which is 0) is used instead.
+        "network/_1/kernel",
+        # track_checkpointable() with a name provided, so that's used
+        "network/named_dense/kernel",
+        "network/named_dense/bias",
+        # The optimizer creates two non-slot variables
+        "optimizer/beta1_power",
+        "optimizer/beta2_power",
+        # Slot variables
+        "network/_1/kernel/_OPTIMIZER_SLOT/optimizer/m",
+        "network/_1/kernel/_OPTIMIZER_SLOT/optimizer/v",
+        "network/named_dense/kernel/_OPTIMIZER_SLOT/optimizer/m",
+        "network/named_dense/kernel/_OPTIMIZER_SLOT/optimizer/v",
+        "network/named_dense/bias/_OPTIMIZER_SLOT/optimizer/m",
+        "network/named_dense/bias/_OPTIMIZER_SLOT/optimizer/v",
+    )
+    six.assertCountEqual(self, expected_checkpoint_names,
+                         named_variables.keys())
+    # Check that we've mapped to the right variable objects (not exhaustive)
+    self.assertEqual("global_step:0", named_variables["global_step"].name)
+    self.assertEqual("my_network/checkpointable_dense_layer_1/kernel:0",
+                     named_variables["network/_1/kernel"].name)
+    self.assertEqual("my_network/checkpointable_dense_layer/kernel:0",
+                     named_variables["network/named_dense/kernel"].name)
+    self.assertEqual("beta1_power:0",
+                     named_variables["optimizer/beta1_power"].name)
+    self.assertEqual("beta2_power:0",
+                     named_variables["optimizer/beta2_power"].name)
+    # Spot check the generated protocol buffers.
+    self.assertEqual(0, serialized_graph.nodes[0].children[0].local_uid)
+    self.assertEqual("optimizer",
+                     serialized_graph.nodes[0].children[0].local_name)
+    optimizer_node = serialized_graph.nodes[serialized_graph.nodes[0].children[
+        0].node_id]
+    self.assertEqual("beta1_power", optimizer_node.variables[0].local_name)
+    self.assertEqual("beta1_power", optimizer_node.variables[0].full_name)
+    self.assertEqual(
+        "kernel", optimizer_node.slot_variables[0].original_variable_local_name)
+    original_variable_owner = serialized_graph.nodes[
+        optimizer_node.slot_variables[0].original_variable_node_id]
+    self.assertEqual("kernel", original_variable_owner.variables[0].local_name)
+    self.assertEqual("m", optimizer_node.slot_variables[0].slot_name)
+    # We strip off the :0 suffix, as variable.name-based saving does.
+    self.assertEqual("my_network/checkpointable_dense_layer/kernel/Adam",
+                     optimizer_node.slot_variables[0].full_name)
+    self.assertEqual("my_network/checkpointable_dense_layer/kernel/Adam:0",
+                     optimizer.get_slot(
+                         var=named_variables["network/named_dense/kernel"],
+                         name="m").name)
+
+  def _get_checkpoint_name(self, name):
+    root = checkpointable.Checkpointable()
+    with variable_scope.variable_scope("get_checkpoint_name"):
+      # Create the variable in a variable scope so that we get more relaxed
+      # naming rules (variables outside a scope may not start with "_", "/" or
+      # "-"). Since we don't use the scope part of the name, these cases are
+      # somewhat annoying.
+      root.add_variable(name=name, shape=[1, 2], dtype=dtypes.float64)
+    named_variables, _ = checkpointable._serialize_object_graph(root)
+    checkpoint_name, = named_variables.keys()
+    with ops.name_scope("root/" + checkpoint_name):
+      pass  # Make sure we can use this as an op name if we prefix it.
+    return checkpoint_name
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testVariableNameEscaping(self):
+    self.assertEqual(r"a_S__b_S__c", self._get_checkpoint_name(r"a/b/c"))
+    self.assertEqual(r"", self._get_checkpoint_name(r""))
+    self.assertEqual(r"_S__", self._get_checkpoint_name(r"/"))
+    self.assertEqual(r"_S___S_._", self._get_checkpoint_name(r"/_S__"))
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testNumberedPath(self):
+    root = checkpointable.Checkpointable()
+    leaf = checkpointable.Checkpointable()
+    root.track_checkpointable(leaf)
+    leaf.add_variable(name="v", shape=[])
+    named_variables, _ = checkpointable._serialize_object_graph(root)
+    variable_name, = named_variables.keys()
+    self.assertEqual(r"_0/v", variable_name)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testLocalNameValidation(self):
+    root = checkpointable.Checkpointable()
+    leaf = checkpointable.Checkpointable()
+    with self.assertRaisesRegexp(ValueError, "invalid name"):
+      # Leading underscores are reserved, which avoids conflicts with
+      # un-named edges in paths and the optimizer slots identifier.
+      root.track_checkpointable(leaf, name="_12")
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/eager/python/evaluator.py b/tensorflow/contrib/eager/python/evaluator.py
index bd0ab02ecf7ae6025e08dde1c3ddc634db9255c1..3faaeef5903615ea122800a6690117dde682e830 100644
--- a/tensorflow/contrib/eager/python/evaluator.py
+++ b/tensorflow/contrib/eager/python/evaluator.py
@@ -110,7 +110,7 @@ class Evaluator(object):
         return self._all_metric_results()
     else:
       def f():
-        with summary_ops.create_summary_file_writer(
+        with summary_ops.create_file_writer(
             summary_logdir).as_default(), summary_ops.always_record_summaries():
           return self._all_metric_results()
       if context.in_eager_mode():
diff --git a/tensorflow/contrib/eager/python/examples/BUILD b/tensorflow/contrib/eager/python/examples/BUILD
index aa21a6ab994acf929890ecebc07a86cf7ebf97db..6aef010a2139c4cd2ae19c008aa21d4e3592ca98 100644
--- a/tensorflow/contrib/eager/python/examples/BUILD
+++ b/tensorflow/contrib/eager/python/examples/BUILD
@@ -11,5 +11,6 @@ py_library(
         "//tensorflow/contrib/eager/python/examples/resnet50",
         "//tensorflow/contrib/eager/python/examples/rnn_colorbot",
         "//tensorflow/contrib/eager/python/examples/rnn_ptb",
+        "//tensorflow/contrib/eager/python/examples/spinn:data",
     ],
 )
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
index d0130ebd118dbaff4f0161c8b2528764c6103e02..7bc5007c5655bed81b5600ee283c35bd332a1ebe 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
@@ -85,7 +85,7 @@ def fit(model, dataset, optimizer, verbose=False, logdir=None):
   if logdir:
     # Support for TensorBoard summaries. Once training has started, use:
     #   tensorboard --logdir=<logdir>
-    summary_writer = tf.contrib.summary.create_summary_file_writer(logdir)
+    summary_writer = tf.contrib.summary.create_file_writer(logdir)
 
   # Training loop.
   for i, (xs, ys) in enumerate(tfe.Iterator(dataset)):
diff --git a/tensorflow/contrib/eager/python/examples/mnist/mnist.py b/tensorflow/contrib/eager/python/examples/mnist/mnist.py
index bfb7d5a9002787f6544d383de58150661ac2bde3..bb121c7704b4772dde520ddc928a13c50ec8bb18 100644
--- a/tensorflow/contrib/eager/python/examples/mnist/mnist.py
+++ b/tensorflow/contrib/eager/python/examples/mnist/mnist.py
@@ -190,9 +190,9 @@ def main(_):
   else:
     train_dir = None
     test_dir = None
-  summary_writer = tf.contrib.summary.create_summary_file_writer(
+  summary_writer = tf.contrib.summary.create_file_writer(
       train_dir, flush_millis=10000)
-  test_summary_writer = tf.contrib.summary.create_summary_file_writer(
+  test_summary_writer = tf.contrib.summary.create_file_writer(
       test_dir, flush_millis=10000, name='test')
   checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')
 
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py
index 14c82c87a72457d414c4a1d3c53d4d1a68a400e6..23317886e712323f4b520000e0fd372734fc53a1 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py
@@ -73,7 +73,7 @@ class ResNet50GraphTest(tf.test.TestCase):
       tf.train.get_or_create_global_step()
       logdir = tempfile.mkdtemp()
       with tf.contrib.summary.always_record_summaries():
-        with tf.contrib.summary.create_summary_file_writer(
+        with tf.contrib.summary.create_file_writer(
             logdir, max_queue=0,
             name='t0').as_default():
           model = resnet50.ResNet50(data_format())
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
index 582f4837c6f3197081cb558063e963866d173f29..d8d8644dde10498e5fd480f92b69656fca1558dd 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
@@ -95,7 +95,7 @@ class ResNet50Test(tf.test.TestCase):
     model = resnet50.ResNet50(data_format)
     tf.train.get_or_create_global_step()
     logdir = tempfile.mkdtemp()
-    with tf.contrib.summary.create_summary_file_writer(
+    with tf.contrib.summary.create_file_writer(
         logdir, max_queue=0,
         name='t0').as_default(), tf.contrib.summary.always_record_summaries():
       with tf.device(device):
diff --git a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
index 609cbd28772c3ae8da70648ca5b1b264a8a255e2..40919f2d4cf511eb35fac954719286366aef6c7c 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
@@ -247,9 +247,9 @@ def main(_):
 
   log_dir = os.path.join(FLAGS.dir, "summaries")
   tf.gfile.MakeDirs(log_dir)
-  train_summary_writer = tf.contrib.summary.create_summary_file_writer(
+  train_summary_writer = tf.contrib.summary.create_file_writer(
       os.path.join(log_dir, "train"), flush_millis=10000)
-  test_summary_writer = tf.contrib.summary.create_summary_file_writer(
+  test_summary_writer = tf.contrib.summary.create_file_writer(
       os.path.join(log_dir, "eval"), flush_millis=10000, name="eval")
 
   with tf.device(device):
diff --git a/tensorflow/contrib/eager/python/examples/spinn/BUILD b/tensorflow/contrib/eager/python/examples/spinn/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a1f8a759e2a556bc219f0aa13942f293c4f34cfa
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/spinn/BUILD
@@ -0,0 +1,42 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow:internal"])
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_library(
+    name = "data",
+    srcs = ["data.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = ["//third_party/py/numpy"],
+)
+
+py_test(
+    name = "data_test",
+    size = "small",
+    srcs = ["data_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":data",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+cuda_py_test(
+    name = "spinn_test",
+    size = "medium",
+    srcs = ["spinn_test.py"],
+    additional_deps = [
+        ":data",
+        "//third_party/examples/eager/spinn",
+        "//third_party/py/numpy",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/contrib/summary:summary_test_util",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+    ],
+    tags = ["no_pip"],  # because spinn.py is under third_party/.
+)
diff --git a/tensorflow/contrib/eager/python/examples/spinn/README.md b/tensorflow/contrib/eager/python/examples/spinn/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..eb0637df473e22e5d39ca1b0816464cb2b7c6435
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/spinn/README.md
@@ -0,0 +1,13 @@
+# SPINN: Dynamic neural network with TensorFlow eager execution
+
+This directory contains files supporting the
+[spinn.py model in third_party/examples/eager/spinn/](../../../../../../third_party/examples/eager/spinn/spinn.py),
+including
+
+- `data.py`: Utility library for loading and preprocessing the SNLI and GloVe
+  data.
+- `data_test.py` and `spinn_test.py`: Unit tests for the data and model modules.
+
+See the [README.md in third_party/examples/eager/spinn/](../../../../../../third_party/examples/eager/spinn/README.md)
+for detailed background, license and usage information regarding the SPINN code.
+
diff --git a/tensorflow/contrib/eager/python/examples/spinn/data.py b/tensorflow/contrib/eager/python/examples/spinn/data.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6e046320f78541bef4e091e97f08fd51857af83
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/spinn/data.py
@@ -0,0 +1,350 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities of SNLI data and GloVe word vectors for SPINN model.
+
+See more details about the SNLI data set at:
+  https://nlp.stanford.edu/projects/snli/
+
+See more details about the GloVe pretrained word embeddings at:
+  https://nlp.stanford.edu/projects/glove/
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import glob
+import math
+import os
+import random
+
+import numpy as np
+
+POSSIBLE_LABELS = ("entailment", "contradiction", "neutral")
+
+UNK_CODE = 0   # Code for unknown word tokens.
+PAD_CODE = 1   # Code for padding tokens.
+
+SHIFT_CODE = 3
+REDUCE_CODE = 2
+
+WORD_VECTOR_LEN = 300  # Embedding dimensions.
+
+LEFT_PAREN = "("
+RIGHT_PAREN = ")"
+PARENTHESES = (LEFT_PAREN, RIGHT_PAREN)
+
+
+def get_non_parenthesis_words(items):
+  """Get the non-parenthesis items from a SNLI parsed sentence.
+
+  Args:
+    items: Data items from a parsed SNLI setence, with parentheses. E.g.,
+      ["(", "Man", "(", "(", "(", "(", "(", "wearing", "pass", ")", ...
+
+  Returns:
+    A list of non-parenthis word items, all converted to lower case. E.g.,
+      ["man", "wearing", "pass", ...
+  """
+  return [x.lower() for x in items if x not in PARENTHESES and x]
+
+
+def get_shift_reduce(items):
+  """Obtain shift-reduce vector from a list of items from the SNLI data.
+
+  Args:
+    items: Data items as a list of str, e.g.,
+       ["(", "Man", "(", "(", "(", "(", "(", "wearing", "pass", ")", ...
+
+  Returns:
+    A list of shift-reduce transitions, encoded as `SHIFT_CODE` for shift and
+      `REDUCE_CODE` for reduce. See code above for the values of `SHIFT_CODE`
+      and `REDUCE_CODE`.
+  """
+  trans = []
+  for item in items:
+    if item == LEFT_PAREN:
+      continue
+    elif item == RIGHT_PAREN:
+      trans.append(REDUCE_CODE)
+    else:
+      trans.append(SHIFT_CODE)
+  return trans
+
+
+def pad_and_reverse_word_ids(sentences):
+  """Pad a list of sentences to the common maximum length + 1.
+
+  Args:
+    sentences: A list of sentences as a list of list of integers. Each integer
+      is a word ID. Each list of integer corresponds to one sentence.
+
+  Returns:
+    A numpy.ndarray of shape (num_sentences, max_length + 1), wherein max_length
+      is the maximum sentence length (in # of words). Each sentence is reversed
+      and then padded with an extra one at head, as required by the model.
+  """
+  max_len = max(len(sent) for sent in sentences)
+  for sent in sentences:
+    if len(sent) < max_len:
+      sent.extend([PAD_CODE] * (max_len - len(sent)))
+  # Reverse in time order and pad an extra one.
+  sentences = np.fliplr(np.array(sentences, dtype=np.int64))
+  sentences = np.concatenate(
+      [np.ones([sentences.shape[0], 1], dtype=np.int64), sentences], axis=1)
+  return sentences
+
+
+def pad_transitions(sentences_transitions):
+  """Pad a list of shift-reduce transitions to the maximum length."""
+  max_len = max(len(transitions) for transitions in sentences_transitions)
+  for transitions in sentences_transitions:
+    if len(transitions) < max_len:
+      transitions.extend([PAD_CODE] * (max_len - len(transitions)))
+  return np.array(sentences_transitions, dtype=np.int64)
+
+
+def load_vocabulary(data_root):
+  """Load vocabulary from SNLI data files.
+
+  Args:
+    data_root: Root directory of the data. It is assumed that the SNLI data
+      files have been downloaded and extracted to the "snli/snli_1.0"
+      subdirectory of it.
+
+  Returns:
+    Vocabulary as a set of strings.
+
+  Raises:
+    ValueError: If SNLI data files cannot be found.
+  """
+  snli_path = os.path.join(data_root, "snli")
+  snli_glob_pattern = os.path.join(snli_path, "snli_1.0/snli_1.0_*.txt")
+  file_names = glob.glob(snli_glob_pattern)
+  if not file_names:
+    raise ValueError(
+        "Cannot find SNLI data files at %s. "
+        "Please download and extract SNLI data first." % snli_glob_pattern)
+
+  print("Loading vocabulary...")
+  vocab = set()
+  for file_name in file_names:
+    with open(os.path.join(snli_path, file_name), "rt") as f:
+      for i, line in enumerate(f):
+        if i == 0:
+          continue
+        items = line.split("\t")
+        premise_words = get_non_parenthesis_words(items[1].split(" "))
+        hypothesis_words = get_non_parenthesis_words(items[2].split(" "))
+        vocab.update(premise_words)
+        vocab.update(hypothesis_words)
+  return vocab
+
+
+def load_word_vectors(data_root, vocab):
+  """Load GloVe word vectors for words present in the vocabulary.
+
+  Args:
+    data_root: Data root directory. It is assumed that the GloVe file
+     has been downloaded and extracted at the "glove/" subdirectory of it.
+    vocab: A `set` of words, representing the vocabulary.
+
+  Returns:
+    1. word2index: A dict from lower-case word to row index in the embedding
+       matrix, i.e, `embed` below.
+    2. embed: The embedding matrix as a float32 numpy array. Its shape is
+       [vocabulary_size, WORD_VECTOR_LEN]. vocabulary_size is len(vocab).
+       WORD_VECTOR_LEN is the embedding dimension (300).
+
+  Raises:
+    ValueError: If GloVe embedding file cannot be found.
+  """
+  glove_path = os.path.join(data_root, "glove/glove.42B.300d.txt")
+  if not os.path.isfile(glove_path):
+    raise ValueError(
+        "Cannot find GloVe embedding file at %s. "
+        "Please download and extract GloVe embeddings first." % glove_path)
+
+  print("Loading word vectors...")
+
+  word2index = dict()
+  embed = []
+
+  embed.append([0] * WORD_VECTOR_LEN)  # <unk>
+  embed.append([0] * WORD_VECTOR_LEN)  # <pad>
+  word2index["<unk>"] = UNK_CODE
+  word2index["<pad>"] = PAD_CODE
+
+  with open(glove_path, "rt") as f:
+    for line in f:
+      items = line.split(" ")
+      word = items[0]
+      if word in vocab and word not in word2index:
+        word2index[word] = len(embed)
+        vector = np.array([float(item) for item in items[1:]])
+        assert (WORD_VECTOR_LEN,) == vector.shape
+        embed.append(vector)
+  embed = np.array(embed, dtype=np.float32)
+  return word2index, embed
+
+
+def calculate_bins(length2count, min_bin_size):
+  """Cacluate bin boundaries given a histogram of lengths and mininum bin size.
+
+  Args:
+    length2count: A `dict` mapping length to sentence count.
+    min_bin_size: Minimum bin size in terms of total number of sentence pairs
+      in the bin.
+
+  Returns:
+    A `list` representing the right bin boundaries, starting from the inclusive
+    right boundary of the first bin. For example, if the output is
+      [10, 20, 35],
+    it means there are three bins: [1, 10], [11, 20] and [21, 35].
+  """
+  bounds = []
+  lengths = sorted(length2count.keys())
+  cum_count = 0
+  for length in lengths:
+    cum_count += length2count[length]
+    if cum_count >= min_bin_size:
+      bounds.append(length)
+      cum_count = 0
+  if bounds[-1] != lengths[-1]:
+    bounds.append(lengths[-1])
+  return bounds
+
+
+class SnliData(object):
+  """A split of SNLI data."""
+
+  def __init__(self, data_file, word2index, sentence_len_limit=-1):
+    """SnliData constructor.
+
+    Args:
+      data_file: Full path to the data file, e.g.,
+        "/tmp/spinn-data/snli/snli_1.0/snli_1.0.train.txt"
+      word2index: A dict from lower-case word to row index in the embedding
+        matrix (see `load_word_vectors()` for details).
+      sentence_len_limit: Maximum allowed sentence length (# of words).
+        A value of <= 0 means unlimited. Sentences longer than this limit
+        are currently discarded, not truncated.
+    """
+
+    self._labels = []
+    self._premises = []
+    self._premise_transitions = []
+    self._hypotheses = []
+    self._hypothesis_transitions = []
+
+    with open(data_file, "rt") as f:
+      for i, line in enumerate(f):
+        if i == 0:
+          # Skip header line.
+          continue
+        items = line.split("\t")
+        if items[0] not in POSSIBLE_LABELS:
+          continue
+
+        premise_items = items[1].split(" ")
+        hypothesis_items = items[2].split(" ")
+        premise_words = get_non_parenthesis_words(premise_items)
+        hypothesis_words = get_non_parenthesis_words(hypothesis_items)
+
+        if (sentence_len_limit > 0 and
+            (len(premise_words) > sentence_len_limit or
+             len(hypothesis_words) > sentence_len_limit)):
+          # TODO(cais): Maybe truncate; do not discard.
+          continue
+
+        premise_ids = [
+            word2index.get(word, UNK_CODE) for word in premise_words]
+        hypothesis_ids = [
+            word2index.get(word, UNK_CODE) for word in hypothesis_words]
+
+        self._premises.append(premise_ids)
+        self._hypotheses.append(hypothesis_ids)
+        self._premise_transitions.append(get_shift_reduce(premise_items))
+        self._hypothesis_transitions.append(get_shift_reduce(hypothesis_items))
+        assert (len(self._premise_transitions[-1]) ==
+                2 * len(premise_words) - 1)
+        assert (len(self._hypothesis_transitions[-1]) ==
+                2 * len(hypothesis_words) - 1)
+
+        self._labels.append(POSSIBLE_LABELS.index(items[0]) + 1)
+
+    assert len(self._labels) == len(self._premises)
+    assert len(self._labels) == len(self._hypotheses)
+    assert len(self._labels) == len(self._premise_transitions)
+    assert len(self._labels) == len(self._hypothesis_transitions)
+
+  def num_batches(self, batch_size):
+    """Calculate number of batches given batch size."""
+    return int(math.ceil(len(self._labels) / batch_size))
+
+  def get_generator(self, batch_size):
+    """Obtain a generator for batched data.
+
+    All examples of this SnliData object are randomly shuffled, sorted
+    according to the maximum sentence length of the premise and hypothesis
+    sentences in the pair, and batched.
+
+    Args:
+      batch_size: Desired batch size.
+
+    Returns:
+      A generator for data batches. The generator yields a 5-tuple:
+        label: An array of the shape (batch_size,).
+        premise: An array of the shape (max_premise_len, batch_size), wherein
+          max_premise_len is the maximum length of the (padded) premise
+          sentence in the batch.
+        premise_transitions: An array of the shape (2 * max_premise_len -3,
+          batch_size).
+        hypothesis: Same as `premise`, but for hypothesis sentences.
+        hypothesis_transitions: Same as `premise_transitions`, but for
+          hypothesis sentences.
+      All the elements of the 5-tuple have dtype `int64`.
+    """
+    # Randomly shuffle examples.
+    zipped = list(zip(
+        self._labels, self._premises, self._premise_transitions,
+        self._hypotheses, self._hypothesis_transitions))
+    random.shuffle(zipped)
+    # Then sort the examples by maximum of the premise and hypothesis sentence
+    # lengths in the pair. During training, the batches are expected to be
+    # shuffled. So it is okay to leave them sorted by max length here.
+    (labels, premises, premise_transitions, hypotheses,
+     hypothesis_transitions) = zip(
+         *sorted(zipped, key=lambda x: max(len(x[1]), len(x[3]))))
+
+    def _generator():
+      begin = 0
+      while begin < len(labels):
+        # The sorting above and the batching here makes sure that sentences of
+        # similar max lengths are batched together, minimizing the inefficiency
+        # due to uneven max lengths. The sentences are batched differently in
+        # each call to get_generator() due to the shuffling before sotring
+        # above. The pad_and_reverse_word_ids() and pad_transitions() functions
+        # take care of any remaning unevenness of the max sentence lengths.
+        end = min(begin + batch_size, len(labels))
+        # Transpose, because the SPINN model requires time-major, instead of
+        # batch-major.
+        yield (labels[begin:end],
+               pad_and_reverse_word_ids(premises[begin:end]).T,
+               pad_transitions(premise_transitions[begin:end]).T,
+               pad_and_reverse_word_ids(hypotheses[begin:end]).T,
+               pad_transitions(hypothesis_transitions[begin:end]).T)
+        begin = end
+    return _generator
diff --git a/tensorflow/contrib/eager/python/examples/spinn/data_test.py b/tensorflow/contrib/eager/python/examples/spinn/data_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4f0b37c5099e45b7e3b258b258c0a203c36b3b7
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/spinn/data_test.py
@@ -0,0 +1,243 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Unit tests for SPINN data module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import tempfile
+
+import tensorflow as tf
+
+from tensorflow.contrib.eager.python.examples.spinn import data
+
+
+class DataTest(tf.test.TestCase):
+
+  def setUp(self):
+    super(DataTest, self).setUp()
+    self._temp_data_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    shutil.rmtree(self._temp_data_dir)
+    super(DataTest, self).tearDown()
+
+  def testGenNonParenthesisWords(self):
+    seq_with_parse = (
+        "( Man ( ( ( ( ( wearing pass ) ( on ( a lanyard ) ) ) and "
+        ") ( standing ( in ( ( a crowd ) ( of people ) ) ) ) ) . ) )")
+    self.assertEqual(
+        ["man", "wearing", "pass", "on", "a", "lanyard", "and", "standing",
+         "in", "a", "crowd", "of", "people", "."],
+        data.get_non_parenthesis_words(seq_with_parse.split(" ")))
+
+  def testGetShiftReduce(self):
+    seq_with_parse = (
+        "( Man ( ( ( ( ( wearing pass ) ( on ( a lanyard ) ) ) and "
+        ") ( standing ( in ( ( a crowd ) ( of people ) ) ) ) ) . ) )")
+    self.assertEqual(
+        [3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 3, 2, 3, 3, 3, 3, 2, 3, 3, 2, 2, 2, 2, 2,
+         3, 2, 2], data.get_shift_reduce(seq_with_parse.split(" ")))
+
+  def testPadAndReverseWordIds(self):
+    id_sequences = [[0, 2, 3, 4, 5],
+                    [6, 7, 8],
+                    [9, 10, 11, 12, 13, 14, 15, 16]]
+    self.assertAllClose(
+        [[1, 1, 1, 1, 5, 4, 3, 2, 0],
+         [1, 1, 1, 1, 1, 1, 8, 7, 6],
+         [1, 16, 15, 14, 13, 12, 11, 10, 9]],
+        data.pad_and_reverse_word_ids(id_sequences))
+
+  def testPadTransitions(self):
+    unpadded = [[3, 3, 3, 2, 2, 2, 2],
+                [3, 3, 2, 2, 2]]
+    self.assertAllClose(
+        [[3, 3, 3, 2, 2, 2, 2],
+         [3, 3, 2, 2, 2, 1, 1]],
+        data.pad_transitions(unpadded))
+
+  def testCalculateBins(self):
+    length2count = {
+        1: 10,
+        2: 15,
+        3: 25,
+        4: 40,
+        5: 35,
+        6: 10}
+    self.assertEqual([2, 3, 4, 5, 6],
+                     data.calculate_bins(length2count, 20))
+    self.assertEqual([3, 4, 6], data.calculate_bins(length2count, 40))
+    self.assertEqual([4, 6], data.calculate_bins(length2count, 60))
+
+  def testLoadVoacbulary(self):
+    snli_1_0_dir = os.path.join(self._temp_data_dir, "snli/snli_1.0")
+    fake_train_file = os.path.join(snli_1_0_dir, "snli_1.0_train.txt")
+    fake_dev_file = os.path.join(snli_1_0_dir, "snli_1.0_dev.txt")
+    os.makedirs(snli_1_0_dir)
+
+    with open(fake_train_file, "wt") as f:
+      f.write("gold_label\tsentence1_binary_parse\tsentence2_binary_parse\t"
+              "sentence1_parse\tsentence2_parse\tsentence1\tsentence2\t"
+              "captionID\tpairID\tlabel1\tlabel2\tlabel3\tlabel4\tlabel5\n")
+      f.write("neutral\t( ( Foo bar ) . )\t( ( foo baz ) . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+    with open(fake_dev_file, "wt") as f:
+      f.write("gold_label\tsentence1_binary_parse\tsentence2_binary_parse\t"
+              "sentence1_parse\tsentence2_parse\tsentence1\tsentence2\t"
+              "captionID\tpairID\tlabel1\tlabel2\tlabel3\tlabel4\tlabel5\n")
+      f.write("neutral\t( ( Quux quuz ) ? )\t( ( Corge grault ) ! )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Quux quuz?\t.Corge grault!\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+
+    vocab = data.load_vocabulary(self._temp_data_dir)
+    self.assertSetEqual(
+        {".", "?", "!", "foo", "bar", "baz", "quux", "quuz", "corge", "grault"},
+        vocab)
+
+  def testLoadVoacbularyWithoutFileRaisesError(self):
+    with self.assertRaisesRegexp(ValueError, "Cannot find SNLI data files at"):
+      data.load_vocabulary(self._temp_data_dir)
+
+    os.makedirs(os.path.join(self._temp_data_dir, "snli"))
+    with self.assertRaisesRegexp(ValueError, "Cannot find SNLI data files at"):
+      data.load_vocabulary(self._temp_data_dir)
+
+    os.makedirs(os.path.join(self._temp_data_dir, "snli/snli_1.0"))
+    with self.assertRaisesRegexp(ValueError, "Cannot find SNLI data files at"):
+      data.load_vocabulary(self._temp_data_dir)
+
+  def testLoadWordVectors(self):
+    glove_dir = os.path.join(self._temp_data_dir, "glove")
+    os.makedirs(glove_dir)
+    glove_file = os.path.join(glove_dir, "glove.42B.300d.txt")
+
+    words = [".", ",", "foo", "bar", "baz"]
+    with open(glove_file, "wt") as f:
+      for i, word in enumerate(words):
+        f.write("%s " % word)
+        for j in range(data.WORD_VECTOR_LEN):
+          f.write("%.5f" % (i * 0.1))
+          if j < data.WORD_VECTOR_LEN - 1:
+            f.write(" ")
+          else:
+            f.write("\n")
+
+    vocab = {"foo", "bar", "baz", "qux", "."}
+    # Notice that "qux" is not present in `words`.
+    word2index, embed = data.load_word_vectors(self._temp_data_dir, vocab)
+
+    self.assertEqual(6, len(word2index))
+    self.assertEqual(0, word2index["<unk>"])
+    self.assertEqual(1, word2index["<pad>"])
+    self.assertEqual(2, word2index["."])
+    self.assertEqual(3, word2index["foo"])
+    self.assertEqual(4, word2index["bar"])
+    self.assertEqual(5, word2index["baz"])
+    self.assertEqual((6, data.WORD_VECTOR_LEN), embed.shape)
+    self.assertAllClose([0.0] * data.WORD_VECTOR_LEN, embed[0, :])
+    self.assertAllClose([0.0] * data.WORD_VECTOR_LEN, embed[1, :])
+    self.assertAllClose([0.0] * data.WORD_VECTOR_LEN, embed[2, :])
+    self.assertAllClose([0.2] * data.WORD_VECTOR_LEN, embed[3, :])
+    self.assertAllClose([0.3] * data.WORD_VECTOR_LEN, embed[4, :])
+    self.assertAllClose([0.4] * data.WORD_VECTOR_LEN, embed[5, :])
+
+  def testLoadWordVectorsWithoutFileRaisesError(self):
+    vocab = {"foo", "bar", "baz", "qux", "."}
+    with self.assertRaisesRegexp(
+        ValueError, "Cannot find GloVe embedding file at"):
+      data.load_word_vectors(self._temp_data_dir, vocab)
+
+    os.makedirs(os.path.join(self._temp_data_dir, "glove"))
+    with self.assertRaisesRegexp(
+        ValueError, "Cannot find GloVe embedding file at"):
+      data.load_word_vectors(self._temp_data_dir, vocab)
+
+  def testSnliData(self):
+    """Unit test for SnliData objects."""
+    snli_1_0_dir = os.path.join(self._temp_data_dir, "snli/snli_1.0")
+    fake_train_file = os.path.join(snli_1_0_dir, "snli_1.0_train.txt")
+    os.makedirs(snli_1_0_dir)
+
+    # Four sentences in total.
+    with open(fake_train_file, "wt") as f:
+      f.write("gold_label\tsentence1_binary_parse\tsentence2_binary_parse\t"
+              "sentence1_parse\tsentence2_parse\tsentence1\tsentence2\t"
+              "captionID\tpairID\tlabel1\tlabel2\tlabel3\tlabel4\tlabel5\n")
+      f.write("neutral\t( ( Foo bar ) . )\t( ( foo . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+      f.write("contradiction\t( ( Bar foo ) . )\t( ( baz . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+      f.write("entailment\t( ( Quux quuz ) . )\t( ( grault . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+      f.write("entailment\t( ( Quuz quux ) . )\t( ( garply . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+
+    glove_dir = os.path.join(self._temp_data_dir, "glove")
+    os.makedirs(glove_dir)
+    glove_file = os.path.join(glove_dir, "glove.42B.300d.txt")
+
+    words = [".", "foo", "bar", "baz", "quux", "quuz", "grault", "garply"]
+    with open(glove_file, "wt") as f:
+      for i, word in enumerate(words):
+        f.write("%s " % word)
+        for j in range(data.WORD_VECTOR_LEN):
+          f.write("%.5f" % (i * 0.1))
+          if j < data.WORD_VECTOR_LEN - 1:
+            f.write(" ")
+          else:
+            f.write("\n")
+
+    vocab = data.load_vocabulary(self._temp_data_dir)
+    word2index, _ = data.load_word_vectors(self._temp_data_dir, vocab)
+
+    train_data = data.SnliData(fake_train_file, word2index)
+    self.assertEqual(4, train_data.num_batches(1))
+    self.assertEqual(2, train_data.num_batches(2))
+    self.assertEqual(2, train_data.num_batches(3))
+    self.assertEqual(1, train_data.num_batches(4))
+
+    generator = train_data.get_generator(2)()
+    for i in range(2):
+      label, prem, prem_trans, hypo, hypo_trans = next(generator)
+      self.assertEqual(2, len(label))
+      self.assertEqual((4, 2), prem.shape)
+      self.assertEqual((5, 2), prem_trans.shape)
+      self.assertEqual((3, 2), hypo.shape)
+      self.assertEqual((3, 2), hypo_trans.shape)
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..84e25cf81a2223800c47994b26d000caddee6b01
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
@@ -0,0 +1,409 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import gc
+import glob
+import os
+import shutil
+import tempfile
+import time
+
+import numpy as np
+import tensorflow as tf
+
+# pylint: disable=g-bad-import-order
+import tensorflow.contrib.eager as tfe
+from tensorflow.contrib.eager.python.examples.spinn import data
+from third_party.examples.eager.spinn import spinn
+from tensorflow.contrib.summary import summary_test_util
+from tensorflow.python.eager import test
+from tensorflow.python.framework import test_util
+# pylint: enable=g-bad-import-order
+
+
+def _generate_synthetic_snli_data_batch(sequence_length,
+                                        batch_size,
+                                        vocab_size):
+  """Generate a fake batch of SNLI data for testing."""
+  with tf.device("cpu:0"):
+    labels = tf.random_uniform([batch_size], minval=1, maxval=4, dtype=tf.int64)
+    prem = tf.random_uniform(
+        (sequence_length, batch_size), maxval=vocab_size, dtype=tf.int64)
+    prem_trans = tf.constant(np.array(
+        [[3, 3, 2, 3, 3, 3, 2, 2, 2, 3, 3, 3,
+          2, 3, 3, 2, 2, 3, 3, 3, 2, 2, 2, 2,
+          3, 2, 2]] * batch_size, dtype=np.int64).T)
+    hypo = tf.random_uniform(
+        (sequence_length, batch_size), maxval=vocab_size, dtype=tf.int64)
+    hypo_trans = tf.constant(np.array(
+        [[3, 3, 2, 3, 3, 3, 2, 2, 2, 3, 3, 3,
+          2, 3, 3, 2, 2, 3, 3, 3, 2, 2, 2, 2,
+          3, 2, 2]] * batch_size, dtype=np.int64).T)
+  if tfe.num_gpus():
+    labels = labels.gpu()
+    prem = prem.gpu()
+    prem_trans = prem_trans.gpu()
+    hypo = hypo.gpu()
+    hypo_trans = hypo_trans.gpu()
+  return labels, prem, prem_trans, hypo, hypo_trans
+
+
+def _test_spinn_config(d_embed, d_out, logdir=None):
+  config_tuple = collections.namedtuple(
+      "Config", ["d_hidden", "d_proj", "d_tracker", "predict",
+                 "embed_dropout", "mlp_dropout", "n_mlp_layers", "d_mlp",
+                 "d_out", "projection", "lr", "batch_size", "epochs",
+                 "force_cpu", "logdir", "log_every", "dev_every", "save_every",
+                 "lr_decay_every", "lr_decay_by"])
+  return config_tuple(
+      d_hidden=d_embed,
+      d_proj=d_embed * 2,
+      d_tracker=8,
+      predict=False,
+      embed_dropout=0.1,
+      mlp_dropout=0.1,
+      n_mlp_layers=2,
+      d_mlp=32,
+      d_out=d_out,
+      projection=True,
+      lr=2e-2,
+      batch_size=2,
+      epochs=10,
+      force_cpu=False,
+      logdir=logdir,
+      log_every=1,
+      dev_every=2,
+      save_every=2,
+      lr_decay_every=1,
+      lr_decay_by=0.75)
+
+
+class SpinnTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    super(SpinnTest, self).setUp()
+    self._test_device = "gpu:0" if tfe.num_gpus() else "cpu:0"
+    self._temp_data_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    shutil.rmtree(self._temp_data_dir)
+    super(SpinnTest, self).tearDown()
+
+  def testBundle(self):
+    with tf.device(self._test_device):
+      lstm_iter = [np.array([[0, 1], [2, 3]], dtype=np.float32),
+                   np.array([[0, -1], [-2, -3]], dtype=np.float32),
+                   np.array([[0, 2], [4, 6]], dtype=np.float32),
+                   np.array([[0, -2], [-4, -6]], dtype=np.float32)]
+      out = spinn._bundle(lstm_iter)
+
+      self.assertEqual(2, len(out))
+      self.assertEqual(tf.float32, out[0].dtype)
+      self.assertEqual(tf.float32, out[1].dtype)
+      self.assertAllEqual(np.array([[0, 2, 0, -2, 0, 4, 0, -4]]).T,
+                          out[0].numpy())
+      self.assertAllEqual(np.array([[1, 3, -1, -3, 2, 6, -2, -6]]).T,
+                          out[1].numpy())
+
+  def testUnbunbdle(self):
+    with tf.device(self._test_device):
+      state = [np.array([[0, 1, 2], [3, 4, 5]], dtype=np.float32),
+               np.array([[0, -1, -2], [-3, -4, -5]], dtype=np.float32)]
+      out = spinn._unbundle(state)
+
+      self.assertEqual(2, len(out))
+      self.assertEqual(tf.float32, out[0].dtype)
+      self.assertEqual(tf.float32, out[1].dtype)
+      self.assertAllEqual(np.array([[0, 1, 2, 0, -1, -2]]),
+                          out[0].numpy())
+      self.assertAllEqual(np.array([[3, 4, 5, -3, -4, -5]]),
+                          out[1].numpy())
+
+  def testReducer(self):
+    with tf.device(self._test_device):
+      batch_size = 3
+      size = 10
+      tracker_size = 8
+      reducer = spinn.Reducer(size, tracker_size=tracker_size)
+
+      left_in = []
+      right_in = []
+      tracking = []
+      for _ in range(batch_size):
+        left_in.append(tf.random_normal((1, size * 2)))
+        right_in.append(tf.random_normal((1, size * 2)))
+        tracking.append(tf.random_normal((1, tracker_size * 2)))
+
+      out = reducer(left_in, right_in, tracking=tracking)
+      self.assertEqual(batch_size, len(out))
+      self.assertEqual(tf.float32, out[0].dtype)
+      self.assertEqual((1, size * 2), out[0].shape)
+
+  def testReduceTreeLSTM(self):
+    with tf.device(self._test_device):
+      size = 10
+      tracker_size = 8
+      reducer = spinn.Reducer(size, tracker_size=tracker_size)
+
+      lstm_in = np.array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+                          [0, -1, -2, -3, -4, -5, -6, -7, -8, -9]],
+                         dtype=np.float32)
+      c1 = np.array([[0, 1], [2, 3]], dtype=np.float32)
+      c2 = np.array([[0, -1], [-2, -3]], dtype=np.float32)
+
+      h, c = reducer._tree_lstm(c1, c2, lstm_in)
+      self.assertEqual(tf.float32, h.dtype)
+      self.assertEqual(tf.float32, c.dtype)
+      self.assertEqual((2, 2), h.shape)
+      self.assertEqual((2, 2), c.shape)
+
+  def testTracker(self):
+    with tf.device(self._test_device):
+      batch_size = 2
+      size = 10
+      tracker_size = 8
+      buffer_length = 18
+      stack_size = 3
+
+      tracker = spinn.Tracker(tracker_size, False)
+      tracker.reset_state()
+
+      # Create dummy inputs for testing.
+      bufs = []
+      buf = []
+      for _ in range(buffer_length):
+        buf.append(tf.random_normal((batch_size, size * 2)))
+      bufs.append(buf)
+      self.assertEqual(1, len(bufs))
+      self.assertEqual(buffer_length, len(bufs[0]))
+      self.assertEqual((batch_size, size * 2), bufs[0][0].shape)
+
+      stacks = []
+      stack = []
+      for _ in range(stack_size):
+        stack.append(tf.random_normal((batch_size, size * 2)))
+      stacks.append(stack)
+      self.assertEqual(1, len(stacks))
+      self.assertEqual(3, len(stacks[0]))
+      self.assertEqual((batch_size, size * 2), stacks[0][0].shape)
+
+      for _ in range(2):
+        out1, out2 = tracker(bufs, stacks)
+        self.assertIsNone(out2)
+        self.assertEqual(batch_size, len(out1))
+        self.assertEqual(tf.float32, out1[0].dtype)
+        self.assertEqual((1, tracker_size * 2), out1[0].shape)
+
+        self.assertEqual(tf.float32, tracker.state.c.dtype)
+        self.assertEqual((batch_size, tracker_size), tracker.state.c.shape)
+        self.assertEqual(tf.float32, tracker.state.h.dtype)
+        self.assertEqual((batch_size, tracker_size), tracker.state.h.shape)
+
+  def testSPINN(self):
+    with tf.device(self._test_device):
+      embedding_dims = 10
+      d_tracker = 8
+      sequence_length = 15
+      num_transitions = 27
+
+      config_tuple = collections.namedtuple(
+          "Config", ["d_hidden", "d_proj", "d_tracker", "predict"])
+      config = config_tuple(
+          embedding_dims, embedding_dims * 2, d_tracker, False)
+      s = spinn.SPINN(config)
+
+      # Create some fake data.
+      buffers = tf.random_normal((sequence_length, 1, config.d_proj))
+      transitions = tf.constant(
+          [[3], [3], [2], [3], [3], [3], [2], [2], [2], [3], [3], [3],
+           [2], [3], [3], [2], [2], [3], [3], [3], [2], [2], [2], [2],
+           [3], [2], [2]], dtype=tf.int64)
+      self.assertEqual(tf.int64, transitions.dtype)
+      self.assertEqual((num_transitions, 1), transitions.shape)
+
+      out = s(buffers, transitions, training=True)
+      self.assertEqual(tf.float32, out.dtype)
+      self.assertEqual((1, embedding_dims), out.shape)
+
+  def testSNLIClassifierAndTrainer(self):
+    with tf.device(self._test_device):
+      vocab_size = 40
+      batch_size = 2
+      d_embed = 10
+      sequence_length = 15
+      d_out = 4
+
+      config = _test_spinn_config(d_embed, d_out)
+
+      # Create fake embedding matrix.
+      embed = tf.random_normal((vocab_size, d_embed))
+
+      model = spinn.SNLIClassifier(config, embed)
+      trainer = spinn.SNLIClassifierTrainer(model, config.lr)
+
+      (labels, prem, prem_trans, hypo,
+       hypo_trans) = _generate_synthetic_snli_data_batch(sequence_length,
+                                                         batch_size,
+                                                         vocab_size)
+
+      # Invoke model under non-training mode.
+      logits = model(prem, prem_trans, hypo, hypo_trans, training=False)
+      self.assertEqual(tf.float32, logits.dtype)
+      self.assertEqual((batch_size, d_out), logits.shape)
+
+      # Invoke model under training model.
+      logits = model(prem, prem_trans, hypo, hypo_trans, training=True)
+      self.assertEqual(tf.float32, logits.dtype)
+      self.assertEqual((batch_size, d_out), logits.shape)
+
+      # Calculate loss.
+      loss1 = trainer.loss(labels, logits)
+      self.assertEqual(tf.float32, loss1.dtype)
+      self.assertEqual((), loss1.shape)
+
+      loss2, logits = trainer.train_batch(
+          labels, prem, prem_trans, hypo, hypo_trans)
+      self.assertEqual(tf.float32, loss2.dtype)
+      self.assertEqual((), loss2.shape)
+      self.assertEqual(tf.float32, logits.dtype)
+      self.assertEqual((batch_size, d_out), logits.shape)
+      # Training on the batch should have led to a change in the loss value.
+      self.assertNotEqual(loss1.numpy(), loss2.numpy())
+
+  def testTrainSpinn(self):
+    """Test with fake toy SNLI data and GloVe vectors."""
+
+    # 1. Create and load a fake SNLI data file and a fake GloVe embedding file.
+    snli_1_0_dir = os.path.join(self._temp_data_dir, "snli/snli_1.0")
+    fake_train_file = os.path.join(snli_1_0_dir, "snli_1.0_train.txt")
+    os.makedirs(snli_1_0_dir)
+
+    # Four sentences in total.
+    with open(fake_train_file, "wt") as f:
+      f.write("gold_label\tsentence1_binary_parse\tsentence2_binary_parse\t"
+              "sentence1_parse\tsentence2_parse\tsentence1\tsentence2\t"
+              "captionID\tpairID\tlabel1\tlabel2\tlabel3\tlabel4\tlabel5\n")
+      f.write("neutral\t( ( Foo bar ) . )\t( ( foo . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+      f.write("contradiction\t( ( Bar foo ) . )\t( ( baz . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+      f.write("entailment\t( ( Quux quuz ) . )\t( ( grault . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+      f.write("entailment\t( ( Quuz quux ) . )\t( ( garply . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+
+    glove_dir = os.path.join(self._temp_data_dir, "glove")
+    os.makedirs(glove_dir)
+    glove_file = os.path.join(glove_dir, "glove.42B.300d.txt")
+
+    words = [".", "foo", "bar", "baz", "quux", "quuz", "grault", "garply"]
+    with open(glove_file, "wt") as f:
+      for i, word in enumerate(words):
+        f.write("%s " % word)
+        for j in range(data.WORD_VECTOR_LEN):
+          f.write("%.5f" % (i * 0.1))
+          if j < data.WORD_VECTOR_LEN - 1:
+            f.write(" ")
+          else:
+            f.write("\n")
+
+    vocab = data.load_vocabulary(self._temp_data_dir)
+    word2index, embed = data.load_word_vectors(self._temp_data_dir, vocab)
+
+    train_data = data.SnliData(fake_train_file, word2index)
+    dev_data = data.SnliData(fake_train_file, word2index)
+    test_data = data.SnliData(fake_train_file, word2index)
+    print(embed)
+
+    # 2. Create a fake config.
+    config = _test_spinn_config(
+        data.WORD_VECTOR_LEN, 4,
+        logdir=os.path.join(self._temp_data_dir, "logdir"))
+
+    # 3. Test training of a SPINN model.
+    spinn.train_spinn(embed, train_data, dev_data, test_data, config)
+
+    # 4. Load train loss values from the summary files and verify that they
+    #    decrease with training.
+    summary_file = glob.glob(os.path.join(config.logdir, "events.out.*"))[0]
+    events = summary_test_util.events_from_file(summary_file)
+    train_losses = [event.summary.value[0].simple_value for event in events
+                    if event.summary.value
+                    and event.summary.value[0].tag == "train/loss"]
+    self.assertEqual(config.epochs, len(train_losses))
+    self.assertLess(train_losses[-1], train_losses[0])
+
+
+class EagerSpinnSNLIClassifierBenchmark(test.Benchmark):
+
+  def benchmarkEagerSpinnSNLIClassifier(self):
+    test_device = "gpu:0" if tfe.num_gpus() else "cpu:0"
+    with tf.device(test_device):
+      burn_in_iterations = 2
+      benchmark_iterations = 10
+
+      vocab_size = 1000
+      batch_size = 128
+      sequence_length = 15
+      d_embed = 200
+      d_out = 4
+
+      embed = tf.random_normal((vocab_size, d_embed))
+
+      config = _test_spinn_config(d_embed, d_out)
+      model = spinn.SNLIClassifier(config, embed)
+      trainer = spinn.SNLIClassifierTrainer(model, config.lr)
+
+      (labels, prem, prem_trans, hypo,
+       hypo_trans) = _generate_synthetic_snli_data_batch(sequence_length,
+                                                         batch_size,
+                                                         vocab_size)
+
+      for _ in range(burn_in_iterations):
+        trainer.train_batch(labels, prem, prem_trans, hypo, hypo_trans)
+
+      gc.collect()
+      start_time = time.time()
+      for _ in xrange(benchmark_iterations):
+        trainer.train_batch(labels, prem, prem_trans, hypo, hypo_trans)
+      wall_time = time.time() - start_time
+      # Named "examples"_per_sec to conform with other benchmarks.
+      extras = {"examples_per_sec": benchmark_iterations / wall_time}
+      self.report_benchmark(
+          name="Eager_SPINN_SNLIClassifier_Benchmark",
+          iters=benchmark_iterations,
+          wall_time=wall_time,
+          extras=extras)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/eager/python/g3doc/guide.md b/tensorflow/contrib/eager/python/g3doc/guide.md
index 147b7047f42b7ccba5829b61370e82e217ce5838..0095ffa0db99d46d25654d73504d0d7d41c18b6f 100644
--- a/tensorflow/contrib/eager/python/g3doc/guide.md
+++ b/tensorflow/contrib/eager/python/g3doc/guide.md
@@ -757,7 +757,7 @@ For example, to record summaries once every 100 global steps, use:
 
 ```python
 tf.train.get_or_create_global_step()  # Ensuring the global step variable exists
-writer = tf.contrib.summary.create_summary_file_writer(logdir)
+writer = tf.contrib.summary.create_file_writer(logdir)
 
 for _ in range(iterations):
   with writer.as_default():
diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py
index aa359b7a0d7d89e8788c323d1621798d1a22b658..2f8016ede3caee6dbb6fd8f5226f1464b5c3976b 100644
--- a/tensorflow/contrib/eager/python/metrics_impl.py
+++ b/tensorflow/contrib/eager/python/metrics_impl.py
@@ -73,7 +73,7 @@ class Metric(object):
   * `result()`: Computes and returns a final value for the metric
     from the variables in `self`.
 
-  Decendants may override `aggregate()`, but usually won't need to.  It
+  Descendants may override `aggregate()`, but usually won't need to.  It
   adds in the state from a list of metrics of the same type as `self`.
   (Default is to sum all the variables.) Note that users should not call
   `aggregate()`, it is for use by TensorFlow infrastructure.
diff --git a/tensorflow/contrib/eager/python/metrics_test.py b/tensorflow/contrib/eager/python/metrics_test.py
index 96eb1b4f2a0e4c4af1f3310a2801b1b6aee285d6..1055f4563cd4608189281450aed512fbf5f31de1 100644
--- a/tensorflow/contrib/eager/python/metrics_test.py
+++ b/tensorflow/contrib/eager/python/metrics_test.py
@@ -67,7 +67,7 @@ class MetricsTest(test.TestCase):
     m([1, 10, 100])
     training_util.get_or_create_global_step()
     logdir = tempfile.mkdtemp()
-    with summary_ops.create_summary_file_writer(
+    with summary_ops.create_file_writer(
         logdir, max_queue=0,
         name="t0").as_default(), summary_ops.always_record_summaries():
       m.result()  # As a side-effect will write summaries.
diff --git a/tensorflow/contrib/eager/python/network.py b/tensorflow/contrib/eager/python/network.py
index 97eded7dca2c0594321a006fecb360e26675a005..e3c13cbd2e8ccd2ab79da74e0e97905c6ed5c02d 100644
--- a/tensorflow/contrib/eager/python/network.py
+++ b/tensorflow/contrib/eager/python/network.py
@@ -54,16 +54,81 @@ def _network_name_scope_naming(current_variable_scope):
 class Network(base.Layer):
   """Represents the composition of a set of Layers.
 
-  TODO(josh11b,ashankar):
-  - Should "trainable" be changeable on the Network object?
-  - Do we allow add_variable in Network?
-  - Detect layers used in __call__ that weren't registered with track_layer.
-  - Convert inputs to __call__ to tensors.
-  - Prevent variables from being created after the first __call__?
-    (Think about restoring from a checkpoint).
+  `Network` implements the `Layer` interface and adds convenience methods for
+  managing sub-`Layer`s, such as listing variables.
+
+  `Layer`s (including other `Network`s) should be added via `track_layer`. They
+  can then be used when overriding the `Network.call` method:
+
+  ```python
+  class TwoLayerNetwork(tfe.Network):
+
+    def __init__(self, name):
+      super(TwoLayerNetwork, self).__init__(name=name)
+      self.layer_one = self.track_layer(tf.layers.Dense(16, input_shape=(8,)))
+      self.layer_two = self.track_layer(tf.layers.Dense(1, input_shape=(16,)))
+
+    def call(self, inputs):
+      return self.layer_two(self.layer_one(inputs))
+  ```
+
+  After constructing an object and calling the `Network`, a list of variables
+  created by tracked `Layer`s is available via `Network.variables`:
+
+  ```python
+  net = TwoLayerNetwork(name="net")
+  output = net(tf.ones([1, 8]))
+  print([v.name for v in net.variables])
+  ```
+
+  This example prints variable names, one kernel and one bias per
+  `tf.layers.Dense` layer:
+
+  ```
+  ['net/dense/kernel:0',
+   'net/dense/bias:0',
+   'net/dense_1/kernel:0',
+   'net/dense_1/bias:0']
+  ```
+
+  These variables can be passed to a `Saver` (`tf.train.Saver`, or
+  `tf.contrib.eager.Saver` when executing eagerly) to save or restore the
+  `Network`, typically alongside a global step and `tf.train.Optimizer`
+  variables when checkpointing during training.
+
+  Note that the semantics of calling a `Network` with graph execution (i.e. not
+  executing eagerly) may change slightly in the future. Currently stateful ops
+  are pruned from the graph unless they or something that depends on them is
+  executed in a session, but this behavior is not consistent with eager
+  execution (where stateful ops are executed eagerly). `Layer`s from `tf.layers`
+  do not depend on this pruning and so will not be affected, but `Network`s
+  which rely on stateful ops being added to the graph but not executed (e.g. via
+  custom `Layer`s which manage stateful ops) may break with this change.
   """
+  # TODO(josh11b,ashankar,allenl):
+  # - Should 'trainable' be changeable on the Network object?
+  # - Do we allow add_variable in Network?
+  # - Detect layers used in __call__ that weren't registered with track_layer.
+  # - Convert inputs to __call__ to tensors.
 
   def __init__(self, name=None):
+    """Configure the `Network`.
+
+    Args:
+      name: The name to use for this `Network`. If specified, it must be unique
+        in the context where this `Network` is first
+         (1) added to another `Network` (in which case it must not share a name
+           with other `Layers` added to that `Network`), or
+         (2) built/called (in which case no other 'top-level' `Network`s may
+          share this name).
+        If unspecified or None, the `Network` will be named using its class
+        name, with a number appended if necessary for uniqueness (e.g. MyNetwork
+        -> 'my_network_1').
+
+    Raises:
+      ValueError: If `name` is not valid. Note that some naming errors will
+        instead be raised when the `Network` is called.
+    """
     if isinstance(name, variable_scope.VariableScope):
       raise ValueError("VariableScopes are not valid Network names.")
     if name is not None and "/" in name:
@@ -386,8 +451,30 @@ class Network(base.Layer):
         "at https://github.com/tensorflow/tensorflow/issues/new if this is "
         "important to you")
 
-  # TODO(josh11b): Support other Layer methods needed for graph mode, such as for
-  # losses and updates
+  def add_loss(self, losses, inputs=None):
+    raise RuntimeError(
+        "add_loss is not supported in Network class yet. Please file an issue "
+        "at https://github.com/tensorflow/tensorflow/issues/new if this is "
+        "important to you")
+
+  @property
+  def losses(self):
+    """Gather losses from `Layer`s in the `Network`.
+
+    Note that when executing eagerly, `Layer.losses` evaluates
+    regularizers. When using graph execution, variable regularization ops have
+    already been created and are simply returned here.
+
+    Returns:
+      A list of tensors.
+    """
+    layer_losses = []
+    for layer in self.layers:
+      layer_losses.extend(layer.losses)
+    return layer_losses
+
+  # TODO(allenl): Support other Layer methods needed for graph mode, such as for
+  # updates
 
 
 class Sequential(Network):
diff --git a/tensorflow/contrib/eager/python/network_test.py b/tensorflow/contrib/eager/python/network_test.py
index e7835a63e6db926aa2d4b6c76c681c8a301757bd..3eb4f5f8b3954a7ed04d2ef1d4f119ad137e1e65 100644
--- a/tensorflow/contrib/eager/python/network_test.py
+++ b/tensorflow/contrib/eager/python/network_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 import gc
 
 from tensorflow.contrib.eager.python import network
+from tensorflow.contrib.layers.python.layers import regularizers
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.eager import test
@@ -45,6 +46,22 @@ class MyNetwork(network.Network):
     return self.l1(x)
 
 
+class RegularizedNetwork(network.Network):
+
+  def __init__(self):
+    super(RegularizedNetwork, self).__init__()
+    self.l1 = self.track_layer(core.Dense(
+        1,
+        bias_regularizer=regularizers.l1_regularizer(2.0),
+        kernel_regularizer=regularizers.l1_regularizer(2.0)))
+    self.l2 = self.track_layer(core.Dense(
+        1,
+        bias_regularizer=regularizers.l1_regularizer(2.0)))
+
+  def call(self, values):
+    return self.l2(self.l1(values))
+
+
 class NetworkTest(test.TestCase):
 
   def _save_modify_load_network_built(self, net, global_step=None):
@@ -484,6 +501,18 @@ class NetworkTest(test.TestCase):
       _check_op_prefixes(expected_prefix="my_network_1/dense/",
                          checked_ops=checked_ops)
 
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testVariableRegularizers(self):
+    net = RegularizedNetwork()
+    net(constant_op.constant([[1.]]))
+    self.evaluate(net.variables[0].assign([[2.]]))
+    self.evaluate(net.variables[1].assign([3.]))
+    self.evaluate(net.variables[2].assign([[-2.]]))
+    self.evaluate(net.variables[3].assign([4.]))
+    self.assertAllEqual([4., 6., 8.], self.evaluate(net.losses))
+    self.evaluate(net.variables[3].assign([5.]))
+    self.assertAllEqual([4., 6., 10.], self.evaluate(net.losses))
+
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testDuplicateNameError(self):
     one = constant_op.constant([[1.]])
diff --git a/tensorflow/contrib/eager/python/summary_writer.py b/tensorflow/contrib/eager/python/summary_writer.py
deleted file mode 100644
index 5d8c41b545b3c9fd03af85f302ba05a394f085a4..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/eager/python/summary_writer.py
+++ /dev/null
@@ -1,242 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""TensorBoard Summary Writer for TensorFlow Eager Execution."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import uuid
-
-from tensorflow.contrib.summary import gen_summary_ops
-from tensorflow.python.eager import context
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import summary_op_util
-from tensorflow.python.ops import variable_scope
-
-
-def _maybe_cpu(v):
-  if isinstance(v, (ops.EagerTensor, ops.Tensor)):
-    return v.cpu()
-  else:
-    return v
-
-
-def _summary_writer_function(name, tensor, function, family=None):
-  def record():
-    with summary_op_util.summary_scope(
-        name, family, values=[tensor]) as (tag, scope):
-      function(tag, scope)
-      return True
-  return record
-
-
-class SummaryWriter(object):
-  """Writes summaries for TensorBoard, compatible with eager execution.
-
-  This class is the supported way of writing TensorBoard summaries under
-  eager execution.
-  """
-
-  _CPU_DEVICE = "cpu:0"
-
-  def __init__(self,
-               logdir,
-               max_queue=10,
-               flush_secs=120,
-               filename_suffix=""):
-    """Summary writer for TensorBoard, compatible with eager execution.
-
-    If necessary, multiple instances of `SummaryWriter` can be created, with
-    distinct `logdir`s and `name`s. Each `SummaryWriter` instance will retain
-    its independent `global_step` counter and data writing destination.
-
-    Example:
-    ```python
-    writer = tfe.SummaryWriter("my_model")
-
-    # ... Code that sets up the model and data batches ...
-
-    for _ in xrange(train_iters):
-      loss = model.train_batch(batch)
-      writer.scalar("loss", loss)
-      writer.step()
-    ```
-
-    Args:
-      logdir: Directory in which summary files will be written.
-      max_queue: Number of summary items to buffer before flushing to
-        filesystem. If 0, summaries will be flushed immediately.
-      flush_secs: Number of secondsbetween forced commits to disk.
-      filename_suffix: Suffix of the event protobuf files in which the summary
-        data are stored.
-
-    Raises:
-      ValueError: If this constructor is called not under eager execution.
-    """
-    # TODO(apassos, ashankar): Make this class and the underlying
-    # contrib.summary_ops compatible with graph model and remove this check.
-    if not context.in_eager_mode():
-      raise ValueError(
-          "Use of SummaryWriter is currently supported only with eager "
-          "execution enabled. File an issue at "
-          "https://github.com/tensorflow/tensorflow/issues/new to express "
-          "interest in fixing this.")
-
-    # TODO(cais): Consider adding name keyword argument, which if None or empty,
-    # will register the global global_step that training_util.get_global_step()
-    # can find.
-    with context.device(self._CPU_DEVICE):
-      self._name = uuid.uuid4().hex
-      self._global_step = 0
-      self._global_step_tensor = variable_scope.get_variable(
-          "global_step/summary_writer/" + self._name,
-          shape=[], dtype=dtypes.int64,
-          initializer=init_ops.zeros_initializer())
-      self._global_step_dirty = False
-      self._resource = gen_summary_ops.summary_writer(shared_name=self._name)
-      gen_summary_ops.create_summary_file_writer(
-          self._resource, logdir, max_queue, flush_secs, filename_suffix)
-      # Delete the resource when this object is deleted
-      self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
-          handle=self._resource, handle_device=self._CPU_DEVICE)
-
-  def step(self):
-    """Increment the global step counter of this SummaryWriter instance."""
-    self._global_step += 1
-    self._global_step_dirty = True
-
-  @property
-  def global_step(self):
-    """Obtain the current global_step value of this SummaryWriter instance.
-
-    Returns:
-      An `int` representing the current value of the global_step of this
-       `SummaryWriter` instance.
-    """
-    return self._global_step
-
-  def _update_global_step_tensor(self):
-    with context.device(self._CPU_DEVICE):
-      if self._global_step_dirty:
-        self._global_step_dirty = False
-        return state_ops.assign(self._global_step_tensor, self._global_step)
-      else:
-        return self._global_step_tensor
-
-  def generic(self, name, tensor, metadata, family=None):
-    """Write a generic-type summary.
-
-    Args:
-      name: A name for the generated node. Will also serve as the series name in
-        TensorBoard.
-      tensor: A `Tensor` or compatible value type containing the value of the
-        summary.
-      metadata: Metadata about the summary.
-      family: Optional; if provided, used as the prefix of the summary tag name,
-        which controls the tab name used for display on Tensorboard.
-    """
-    with context.device(self._CPU_DEVICE):
-      with summary_op_util.summary_scope(
-          name, family, values=[tensor]) as (tag, scope):
-        gen_summary_ops.write_summary(
-            self._resource,
-            self._update_global_step_tensor(),
-            _maybe_cpu(tensor),
-            tag,
-            _maybe_cpu(metadata),
-            name=scope)
-
-  def scalar(self, name, tensor, family=None):
-    """Write a scalar summary.
-
-    Args:
-      name: A name for the generated node. Will also serve as the series name in
-        TensorBoard.
-      tensor: A real numeric `Tensor` or compatible value type containing a
-        single value.
-      family: Optional; if provided, used as the prefix of the summary tag name,
-        which controls the tab name used for display on Tensorboard.
-
-    Returns:
-      A summary writer function for scalars.
-    """
-    with context.device(self._CPU_DEVICE):
-      with summary_op_util.summary_scope(
-          name, family, values=[tensor]) as (tag, scope):
-        gen_summary_ops.write_scalar_summary(
-            self._resource, self._update_global_step_tensor(),
-            tag, _maybe_cpu(tensor), name=scope)
-
-  def histogram(self, name, tensor, family=None):
-    """Write a histogram summary.
-
-    Args:
-      name: A name for the generated node. Will also serve as a series name in
-        TensorBoard.
-      tensor: A real numeric `Tensor` or compatible value type. Any shape.
-        Values to use to build the histogram.
-      family: Optional; if provided, used as the prefix of the summary tag name,
-        which controls the tab name used for display on Tensorboard.
-    """
-    with context.device(self._CPU_DEVICE):
-      with summary_op_util.summary_scope(
-          name, family, values=[tensor]) as (tag, scope):
-        gen_summary_ops.write_histogram_summary(
-            self._resource, self._update_global_step_tensor(),
-            tag, _maybe_cpu(tensor), name=scope)
-
-  def image(self, name, tensor, bad_color=None, max_images=3, family=None):
-    """Write an image summary."""
-    with context.device(self._CPU_DEVICE):
-      if bad_color is None:
-        bad_color_ = constant_op.constant([255, 0, 0, 255], dtype=dtypes.uint8)
-      with summary_op_util.summary_scope(
-          name, family, values=[tensor]) as (tag, scope):
-        gen_summary_ops.write_image_summary(
-            self._resource, self._update_global_step_tensor(),
-            tag, _maybe_cpu(tensor), bad_color_, max_images,
-            name=scope)
-
-  def audio(self, name, tensor, sample_rate, max_outputs, family=None):
-    """Write an audio summary.
-
-    Args:
-      name: A name for the generated node. Will also serve as a series name in
-        TensorBoard.
-      tensor: A 3-D `float32` `Tensor` of shape `[batch_size, frames, channels]`
-        or a 2-D `float32` `Tensor` of shape `[batch_size, frames]`, or
-        compatible value type.
-      sample_rate: A Scalar `float32` `Tensor` indicating the sample rate of the
-        signal in hertz.
-      max_outputs: Max number of batch elements to generate audio for.
-      family: Optional; if provided, used as the prefix of the summary tag name,
-        which controls the tab name used for display on Tensorboard.
-    """
-    with context.device(self._CPU_DEVICE):
-      with summary_op_util.summary_scope(
-          name, family, values=[tensor]) as (tag, scope):
-        gen_summary_ops.write_audio_summary(
-            self._resource, self._update_global_step_tensor(),
-            tag,
-            _maybe_cpu(tensor),
-            sample_rate=_maybe_cpu(sample_rate),
-            max_outputs=max_outputs,
-            name=scope)
diff --git a/tensorflow/contrib/eager/python/summary_writer_test.py b/tensorflow/contrib/eager/python/summary_writer_test.py
deleted file mode 100644
index 5ebb36d04fcba8f4558fa1c09716314af42f559f..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/eager/python/summary_writer_test.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Unit tests for eager execution SummaryWriter."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import shutil
-import tempfile
-
-import numpy as np
-
-from tensorflow.contrib.eager.python import summary_writer
-from tensorflow.core.util import event_pb2
-from tensorflow.python.eager import context
-from tensorflow.python.eager import test
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.lib.io import tf_record
-from tensorflow.python.platform import gfile
-
-
-class SummaryWriterTest(test.TestCase):
-
-  def setUp(self):
-    super(SummaryWriterTest, self).setUp()
-    self._test_device = "gpu:0" if context.num_gpus() else "cpu:0"
-    self._tmp_logdir = tempfile.mkdtemp()
-    with context.device(self._test_device):
-      # Use max_queue=0 so that summaries are immediately flushed to filesystem,
-      # making testing easier.
-      self._writer = summary_writer.SummaryWriter(self._tmp_logdir, max_queue=0)
-
-  def tearDown(self):
-    if os.path.isdir(self._tmp_logdir):
-      shutil.rmtree(self._tmp_logdir)
-    super(SummaryWriterTest, self).tearDown()
-
-  def _readLastEvent(self, logdir=None):
-    if not logdir:
-      logdir = self._tmp_logdir
-    files = [f for f in gfile.ListDirectory(logdir)
-             if not gfile.IsDirectory(os.path.join(logdir, f))]
-    file_path = os.path.join(logdir, files[0])
-    records = list(tf_record.tf_record_iterator(file_path))
-    event = event_pb2.Event()
-    event.ParseFromString(records[-1])
-    return event
-
-  def testGlobalStep(self):
-    with context.device(self._test_device):
-      orig_step = self._writer.global_step
-      self._writer.step()
-      self.assertEqual(orig_step + 1, self._writer.global_step)
-      self.assertEqual(orig_step + 1, self._writer.global_step)
-      self._writer.step()
-      self._writer.step()
-      self.assertEqual(orig_step + 3, self._writer.global_step)
-
-  def testGenericSummary(self):
-    with context.device(self._test_device):
-      x = constant_op.constant(1337.0)
-      with context.device("cpu:0"):
-        metadata = constant_op.constant("foo")
-      self._writer.generic("x", x, metadata)
-      event = self._readLastEvent()
-      self.assertEqual("x", event.summary.value[0].tag)
-
-  def testScalarSummary(self):
-    with context.device(self._test_device):
-      x = constant_op.constant(1337.0)
-      self._writer.scalar("x", x)
-      event = self._readLastEvent()
-      self.assertTrue("x", event.summary.value[0].tag)
-      self.assertEqual(1337.0, event.summary.value[0].simple_value)
-
-  def testHistogramSummary(self):
-    with context.device(self._test_device):
-      y = constant_op.constant([1.0, 3.0, 3.0, 7.0])
-      self._writer.histogram("y", y)
-      event = self._readLastEvent()
-      self.assertEqual("y", event.summary.value[0].tag)
-      self.assertTrue(event.summary.value[0].histo)
-
-  def testImageSummary(self):
-    with context.device(self._test_device):
-      a = constant_op.constant([[10.0, 20.0], [-20.0, -10.0]])
-      self._writer.histogram("image1", a)
-      event = self._readLastEvent()
-      self.assertEqual("image1", event.summary.value[0].tag)
-      self.assertTrue(event.summary.value[0].image)
-
-  def testAudioSummary(self):
-    with context.device(self._test_device):
-      w = constant_op.constant(np.random.rand(3, 10, 2), dtype=dtypes.float32)
-      fs = constant_op.constant(44100.0, dtype=dtypes.float32)
-      max_outputs = 1
-      self._writer.audio("audio1", w, fs, max_outputs)
-      event = self._readLastEvent()
-      self.assertTrue(event.summary.value[0].audio)
-
-  def testTwoSummaryWritersGlobalStepsWorkWithoutCrosstalk(self):
-    tmp_logdir2 = os.path.join(self._tmp_logdir, "_writer2_")
-    writer2 = summary_writer.SummaryWriter(tmp_logdir2, max_queue=0)
-
-    self.assertEqual(0, writer2.global_step)
-    self._writer.step()
-    self.assertEqual(0, writer2.global_step)
-    writer2.step()
-    writer2.step()
-    writer2.step()
-    self.assertEqual(3, writer2.global_step)
-
-    x = constant_op.constant(1337.0)
-    writer_orig_step = self._writer.global_step
-    self._writer.step()
-    self._writer.scalar("x", x)
-
-    event = self._readLastEvent()
-    self.assertEqual(writer_orig_step + 1, event.step)
-
-    writer2.scalar("x", x)
-    event = self._readLastEvent(tmp_logdir2)
-    self.assertEqual(3, event.step)
-
-    self._writer.step()
-    self._writer.scalar("x", x)
-
-    event = self._readLastEvent()
-    self.assertEqual(writer_orig_step + 2, event.step)
-
-
-# TODO(cais): Add performance benchmark for SummaryWriter.
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py
index 1697c879def8af5c05f3c9b11d318d570785d6de..770a7e3e7a01f3351c229b7fb53383240dd1f1c8 100644
--- a/tensorflow/contrib/eager/python/tfe.py
+++ b/tensorflow/contrib/eager/python/tfe.py
@@ -23,6 +23,7 @@ To use, at program startup, call `tfe.enable_eager_execution()`.
 @@list_devices
 @@num_gpus
 
+@@py_func
 @@defun
 @@implicit_gradients
 @@implicit_value_and_gradients
@@ -101,8 +102,10 @@ from tensorflow.python.framework.test_util import IsolateTest
 from tensorflow.python.framework.test_util import run_in_graph_and_eager_modes as run_test_in_graph_and_eager_modes
 from tensorflow.python.ops.resource_variable_ops import ResourceVariable as Variable
 from tensorflow.python.ops.variable_scope import EagerVariableStore
+from tensorflow.python.ops import script_ops
 from tensorflow.python.util.all_util import remove_undocumented
 
+py_func = script_ops.eager_py_func
 defun = function.defun
 implicit_gradients = backprop.implicit_grad
 implicit_value_and_gradients = backprop.implicit_val_and_grad
diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index 008ca7a5d17437213ad64a54dddd40ad37e81df0..bd65ece85d2bfc6b38ba3507d3e702241eaf6067 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -27,8 +27,10 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":dnn",
+        ":dnn_linear_combined",
         ":extenders",
         ":head",
+        ":linear",
         ":logit_fns",
         ":multi_head",
         ":replicate_model_fn",
@@ -73,6 +75,46 @@ py_test(
     ],
 )
 
+py_library(
+    name = "dnn_linear_combined",
+    srcs = ["python/estimator/dnn_linear_combined.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:nn",
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:dnn_linear_combined",
+    ],
+)
+
+py_test(
+    name = "dnn_linear_combined_test",
+    size = "medium",
+    srcs = ["python/estimator/dnn_linear_combined_test.py"],
+    shard_count = 3,
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+        "notsan",
+    ],
+    deps = [
+        ":dnn_linear_combined",
+        ":head",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:summary",
+        "//tensorflow/python/estimator:dnn_testing_utils",
+        "//tensorflow/python/estimator:export_export",
+        "//tensorflow/python/estimator:linear_testing_utils",
+        "//tensorflow/python/estimator:numpy_io",
+        "//tensorflow/python/estimator:prediction_keys",
+        "//tensorflow/python/feature_column",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
 py_library(
     name = "extenders",
     srcs = [
@@ -169,6 +211,42 @@ py_test(
     ],
 )
 
+py_library(
+    name = "linear",
+    srcs = ["python/estimator/linear.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:linear",
+    ],
+)
+
+py_test(
+    name = "linear_test",
+    size = "small",
+    srcs = ["python/estimator/linear_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+        "notsan",
+    ],
+    deps = [
+        ":head",
+        ":linear",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:summary",
+        "//tensorflow/python/estimator:export_export",
+        "//tensorflow/python/estimator:linear_testing_utils",
+        "//tensorflow/python/estimator:numpy_io",
+        "//tensorflow/python/estimator:prediction_keys",
+        "//tensorflow/python/feature_column",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
 py_library(
     name = "logit_fns",
     srcs = [
@@ -253,23 +331,24 @@ py_library(
         "//tensorflow/python:device",
         "//tensorflow/python:device_lib",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:gradients",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
         "//tensorflow/python/estimator:export_output",
         "//tensorflow/python/estimator:model_fn",
         "//tensorflow/python/estimator:util",
+        "//tensorflow/python/ops/losses",
         "@six_archive//:six",
     ],
 )
 
 cuda_py_test(
     name = "replicate_model_fn_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/estimator/replicate_model_fn_test.py"],
     additional_deps = [
         "//tensorflow/python/estimator",
@@ -297,5 +376,5 @@ cuda_py_test(
         "//tensorflow/python:variables",
         ":replicate_model_fn",
     ],
-    tags = ["requires-gpu-sm35"],
+    tags = ["multi_gpu"],
 )
diff --git a/tensorflow/contrib/estimator/__init__.py b/tensorflow/contrib/estimator/__init__.py
index cf727264cd5116915f6bd7f285e470cbc2e2742a..28c1f8b1809d27db697365b7bb50441f7820d2b4 100644
--- a/tensorflow/contrib/estimator/__init__.py
+++ b/tensorflow/contrib/estimator/__init__.py
@@ -20,10 +20,13 @@ from __future__ import print_function
 
 # pylint: disable=unused-import,line-too-long,wildcard-import
 from tensorflow.contrib.estimator.python.estimator.dnn import *
+from tensorflow.contrib.estimator.python.estimator.dnn_linear_combined import *
 from tensorflow.contrib.estimator.python.estimator.extenders import *
 from tensorflow.contrib.estimator.python.estimator.head import *
+from tensorflow.contrib.estimator.python.estimator.linear import *
 from tensorflow.contrib.estimator.python.estimator.logit_fns import *
 from tensorflow.contrib.estimator.python.estimator.multi_head import *
+from tensorflow.contrib.estimator.python.estimator.replicate_model_fn import *
 
 from tensorflow.python.util.all_util import remove_undocumented
 # pylint: enable=unused-import,line-too-long,wildcard-import
@@ -38,9 +41,12 @@ _allowed_symbols = [
     'multi_label_head',
     'regression_head',
     'DNNEstimator',
+    'DNNLinearCombinedEstimator',
+    'LinearEstimator',
     'call_logit_fn',
     'dnn_logit_fn_builder',
     'linear_logit_fn_builder',
+    'replicate_model_fn',
 ]
 
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py b/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccaf1128bf23af734f7a5722a4dd8c1f0304fab7
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py
@@ -0,0 +1,164 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TensorFlow estimator for Linear and DNN joined training models."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator.canned import dnn_linear_combined as dnn_linear_combined_lib
+from tensorflow.python.ops import nn
+
+
+class DNNLinearCombinedEstimator(estimator.Estimator):
+  """An estimator for TensorFlow Linear and DNN joined models with custom head.
+
+  Note: This estimator is also known as wide-n-deep.
+
+  Example:
+
+  ```python
+  numeric_feature = numeric_column(...)
+  categorical_column_a = categorical_column_with_hash_bucket(...)
+  categorical_column_b = categorical_column_with_hash_bucket(...)
+
+  categorical_feature_a_x_categorical_feature_b = crossed_column(...)
+  categorical_feature_a_emb = embedding_column(
+      categorical_column=categorical_feature_a, ...)
+  categorical_feature_b_emb = embedding_column(
+      categorical_column=categorical_feature_b, ...)
+
+  estimator = DNNLinearCombinedEstimator(
+      head=tf.contrib.estimator.multi_label_head(n_classes=3),
+      # wide settings
+      linear_feature_columns=[categorical_feature_a_x_categorical_feature_b],
+      linear_optimizer=tf.train.FtrlOptimizer(...),
+      # deep settings
+      dnn_feature_columns=[
+          categorical_feature_a_emb, categorical_feature_b_emb,
+          numeric_feature],
+      dnn_hidden_units=[1000, 500, 100],
+      dnn_optimizer=tf.train.ProximalAdagradOptimizer(...))
+
+  # To apply L1 and L2 regularization, you can set optimizers as follows:
+  tf.train.ProximalAdagradOptimizer(
+      learning_rate=0.1,
+      l1_regularization_strength=0.001,
+      l2_regularization_strength=0.001)
+  # It is same for FtrlOptimizer.
+
+  # Input builders
+  def input_fn_train: # returns x, y
+    pass
+  estimator.train(input_fn=input_fn_train, steps=100)
+
+  def input_fn_eval: # returns x, y
+    pass
+  metrics = estimator.evaluate(input_fn=input_fn_eval, steps=10)
+  def input_fn_predict: # returns x, None
+    pass
+  predictions = estimator.predict(input_fn=input_fn_predict)
+  ```
+
+  Input of `train` and `evaluate` should have following features,
+  otherwise there will be a `KeyError`:
+
+  * for each `column` in `dnn_feature_columns` + `linear_feature_columns`:
+    - if `column` is a `_CategoricalColumn`, a feature with `key=column.name`
+      whose `value` is a `SparseTensor`.
+    - if `column` is a `_WeightedCategoricalColumn`, two features: the first
+      with `key` the id column name, the second with `key` the weight column
+      name. Both features' `value` must be a `SparseTensor`.
+    - if `column` is a `_DenseColumn`, a feature with `key=column.name`
+      whose `value` is a `Tensor`.
+
+  Loss is calculated by using mean squared error.
+
+  @compatibility(eager)
+  Estimators are not compatible with eager execution.
+  @end_compatibility
+  """
+
+  def __init__(self,
+               head,
+               model_dir=None,
+               linear_feature_columns=None,
+               linear_optimizer='Ftrl',
+               dnn_feature_columns=None,
+               dnn_optimizer='Adagrad',
+               dnn_hidden_units=None,
+               dnn_activation_fn=nn.relu,
+               dnn_dropout=None,
+               input_layer_partitioner=None,
+               config=None):
+    """Initializes a DNNLinearCombinedEstimator instance.
+
+    Args:
+      head: A `_Head` instance constructed with a method such as
+        `tf.contrib.estimator.multi_label_head`.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator
+        to continue training a previously saved model.
+      linear_feature_columns: An iterable containing all the feature columns
+        used by linear part of the model. All items in the set must be
+        instances of classes derived from `FeatureColumn`.
+      linear_optimizer: An instance of `tf.Optimizer` used to apply gradients to
+        the linear part of the model. Defaults to FTRL optimizer.
+      dnn_feature_columns: An iterable containing all the feature columns used
+        by deep part of the model. All items in the set must be instances of
+        classes derived from `FeatureColumn`.
+      dnn_optimizer: An instance of `tf.Optimizer` used to apply gradients to
+        the deep part of the model. Defaults to Adagrad optimizer.
+      dnn_hidden_units: List of hidden units per layer. All layers are fully
+        connected.
+      dnn_activation_fn: Activation function applied to each layer. If None,
+        will use `tf.nn.relu`.
+      dnn_dropout: When not None, the probability we will drop out
+        a given coordinate.
+      input_layer_partitioner: Partitioner for input layer. Defaults to
+        `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
+      config: RunConfig object to configure the runtime settings.
+
+    Raises:
+      ValueError: If both linear_feature_columns and dnn_features_columns are
+        empty at the same time.
+    """
+    linear_feature_columns = linear_feature_columns or []
+    dnn_feature_columns = dnn_feature_columns or []
+    self._feature_columns = (
+        list(linear_feature_columns) + list(dnn_feature_columns))
+    if not self._feature_columns:
+      raise ValueError('Either linear_feature_columns or dnn_feature_columns '
+                       'must be defined.')
+
+    def _model_fn(features, labels, mode, config):
+      return dnn_linear_combined_lib._dnn_linear_combined_model_fn(  # pylint: disable=protected-access
+          features=features,
+          labels=labels,
+          mode=mode,
+          head=head,
+          linear_feature_columns=linear_feature_columns,
+          linear_optimizer=linear_optimizer,
+          dnn_feature_columns=dnn_feature_columns,
+          dnn_optimizer=dnn_optimizer,
+          dnn_hidden_units=dnn_hidden_units,
+          dnn_activation_fn=dnn_activation_fn,
+          dnn_dropout=dnn_dropout,
+          input_layer_partitioner=input_layer_partitioner,
+          config=config)
+
+    super(DNNLinearCombinedEstimator, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir, config=config)
diff --git a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined_test.py b/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5e4d34dc70ccaa4806ae8b8ed5001bd971ee7b4
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined_test.py
@@ -0,0 +1,220 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for dnn_linear_combined.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import shutil
+import tempfile
+
+import numpy as np
+import six
+
+from tensorflow.contrib.estimator.python.estimator import dnn_linear_combined
+from tensorflow.contrib.estimator.python.estimator import head as head_lib
+from tensorflow.python.estimator.canned import dnn_testing_utils
+from tensorflow.python.estimator.canned import linear_testing_utils
+from tensorflow.python.estimator.canned import prediction_keys
+from tensorflow.python.estimator.export import export
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import nn
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+from tensorflow.python.summary.writer import writer_cache
+
+
+def _dnn_only_estimator_fn(
+    hidden_units,
+    feature_columns,
+    model_dir=None,
+    label_dimension=1,
+    weight_column=None,
+    optimizer='Adagrad',
+    activation_fn=nn.relu,
+    dropout=None,
+    input_layer_partitioner=None,
+    config=None):
+  return dnn_linear_combined.DNNLinearCombinedEstimator(
+      head=head_lib.regression_head(
+          weight_column=weight_column, label_dimension=label_dimension),
+      model_dir=model_dir,
+      dnn_feature_columns=feature_columns,
+      dnn_optimizer=optimizer,
+      dnn_hidden_units=hidden_units,
+      dnn_activation_fn=activation_fn,
+      dnn_dropout=dropout,
+      input_layer_partitioner=input_layer_partitioner,
+      config=config)
+
+
+class DNNOnlyEstimatorEvaluateTest(
+    dnn_testing_utils.BaseDNNRegressorEvaluateTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    dnn_testing_utils.BaseDNNRegressorEvaluateTest.__init__(
+        self, _dnn_only_estimator_fn)
+
+
+class DNNOnlyEstimatorPredictTest(
+    dnn_testing_utils.BaseDNNRegressorPredictTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    dnn_testing_utils.BaseDNNRegressorPredictTest.__init__(
+        self, _dnn_only_estimator_fn)
+
+
+class DNNOnlyEstimatorTrainTest(
+    dnn_testing_utils.BaseDNNRegressorTrainTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    dnn_testing_utils.BaseDNNRegressorTrainTest.__init__(
+        self, _dnn_only_estimator_fn)
+
+
+def _linear_only_estimator_fn(
+    feature_columns,
+    model_dir=None,
+    label_dimension=1,
+    weight_column=None,
+    optimizer='Ftrl',
+    config=None,
+    partitioner=None):
+  return dnn_linear_combined.DNNLinearCombinedEstimator(
+      head=head_lib.regression_head(
+          weight_column=weight_column, label_dimension=label_dimension),
+      model_dir=model_dir,
+      linear_feature_columns=feature_columns,
+      linear_optimizer=optimizer,
+      input_layer_partitioner=partitioner,
+      config=config)
+
+
+class LinearOnlyEstimatorEvaluateTest(
+    linear_testing_utils.BaseLinearRegressorEvaluationTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearRegressorEvaluationTest.__init__(
+        self, _linear_only_estimator_fn)
+
+
+class LinearOnlyEstimatorPredictTest(
+    linear_testing_utils.BaseLinearRegressorPredictTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearRegressorPredictTest.__init__(
+        self, _linear_only_estimator_fn)
+
+
+class LinearOnlyEstimatorTrainTest(
+    linear_testing_utils.BaseLinearRegressorTrainingTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearRegressorTrainingTest.__init__(
+        self, _linear_only_estimator_fn)
+
+
+class DNNLinearCombinedEstimatorIntegrationTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def _test_complete_flow(
+      self, train_input_fn, eval_input_fn, predict_input_fn, input_dimension,
+      label_dimension, batch_size):
+    linear_feature_columns = [
+        feature_column.numeric_column('x', shape=(input_dimension,))]
+    dnn_feature_columns = [
+        feature_column.numeric_column('x', shape=(input_dimension,))]
+    feature_columns = linear_feature_columns + dnn_feature_columns
+    est = dnn_linear_combined.DNNLinearCombinedEstimator(
+        head=head_lib.regression_head(label_dimension=label_dimension),
+        linear_feature_columns=linear_feature_columns,
+        dnn_feature_columns=dnn_feature_columns,
+        dnn_hidden_units=(2, 2),
+        model_dir=self._model_dir)
+
+    # TRAIN
+    num_steps = 10
+    est.train(train_input_fn, steps=num_steps)
+
+    # EVALUTE
+    scores = est.evaluate(eval_input_fn)
+    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
+    self.assertIn('loss', six.iterkeys(scores))
+
+    # PREDICT
+    predictions = np.array([
+        x[prediction_keys.PredictionKeys.PREDICTIONS]
+        for x in est.predict(predict_input_fn)
+    ])
+    self.assertAllEqual((batch_size, label_dimension), predictions.shape)
+
+    # EXPORT
+    feature_spec = feature_column.make_parse_example_spec(feature_columns)
+    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
+        feature_spec)
+    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
+                                       serving_input_receiver_fn)
+    self.assertTrue(gfile.Exists(export_dir))
+
+  def test_numpy_input_fn(self):
+    """Tests complete flow with numpy_input_fn."""
+    label_dimension = 2
+    batch_size = 10
+    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
+    data = data.reshape(batch_size, label_dimension)
+    # learn y = x
+    train_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        y=data,
+        batch_size=batch_size,
+        num_epochs=None,
+        shuffle=True)
+    eval_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        y=data,
+        batch_size=batch_size,
+        shuffle=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        batch_size=batch_size,
+        shuffle=False)
+
+    self._test_complete_flow(
+        train_input_fn=train_input_fn,
+        eval_input_fn=eval_input_fn,
+        predict_input_fn=predict_input_fn,
+        input_dimension=label_dimension,
+        label_dimension=label_dimension,
+        batch_size=batch_size)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/linear.py b/tensorflow/contrib/estimator/python/estimator/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bf4abe83d54504d55de73b63f369cceaf149dd2
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/linear.py
@@ -0,0 +1,118 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Linear estimator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator.canned import linear as linear_lib
+
+
+class LinearEstimator(estimator.Estimator):
+  """An estimator for TensorFlow linear models with user-specified head.
+
+  Example:
+
+  ```python
+  categorical_column_a = categorical_column_with_hash_bucket(...)
+  categorical_column_b = categorical_column_with_hash_bucket(...)
+
+  categorical_feature_a_x_categorical_feature_b = crossed_column(...)
+
+  # Estimator using the default optimizer.
+  estimator = LinearEstimator(
+      head=tf.contrib.estimator.multi_label_head(n_classes=3),
+      feature_columns=[categorical_column_a,
+                       categorical_feature_a_x_categorical_feature_b])
+
+  # Or estimator using the FTRL optimizer with regularization.
+  estimator = LinearEstimator(
+      head=tf.contrib.estimator.multi_label_head(n_classes=3),
+      feature_columns=[categorical_column_a,
+                       categorical_feature_a_x_categorical_feature_b])
+      optimizer=tf.train.FtrlOptimizer(
+          learning_rate=0.1,
+          l1_regularization_strength=0.001
+      ))
+
+  def input_fn_train: # returns x, y (where y represents label's class index).
+    ...
+  estimator.train(input_fn=input_fn_train, steps=100)
+  def input_fn_eval: # returns x, y (where y represents label's class index).
+    ...
+  metrics = estimator.evaluate(input_fn=input_fn_eval, steps=10)
+  def input_fn_predict: # returns x, None
+    ...
+  predictions = estimator.predict(input_fn=input_fn_predict)
+  ```
+
+  Input of `train` and `evaluate` should have following features,
+  otherwise there will be a `KeyError`:
+
+  * if `weight_column` is not `None`, a feature with
+    `key=weight_column` whose value is a `Tensor`.
+  * for each `column` in `feature_columns`:
+    - if `column` is a `_CategoricalColumn`, a feature with `key=column.name`
+      whose `value` is a `SparseTensor`.
+    - if `column` is a `_WeightedCategoricalColumn`, two features: the first
+      with `key` the id column name, the second with `key` the weight column
+      name. Both features' `value` must be a `SparseTensor`.
+    - if `column` is a `_DenseColumn`, a feature with `key=column.name`
+      whose `value` is a `Tensor`.
+
+  Loss and predicted output are determined by the specified head.
+
+  @compatibility(eager)
+  Estimators are not compatible with eager execution.
+  @end_compatibility
+  """
+
+  def __init__(self,
+               head,
+               feature_columns,
+               model_dir=None,
+               optimizer='Ftrl',
+               config=None,
+               partitioner=None):
+    """Initializes a `LinearEstimator` instance.
+
+    Args:
+      head: A `_Head` instance constructed with a method such as
+        `tf.contrib.estimator.multi_label_head`.
+      feature_columns: An iterable containing all the feature columns used by
+        the model. All items in the set should be instances of classes derived
+        from `FeatureColumn`.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator
+        to continue training a previously saved model.
+      optimizer: An instance of `tf.Optimizer` used to train the model. Defaults
+        to FTRL optimizer.
+      config: `RunConfig` object to configure the runtime settings.
+      partitioner: Optional. Partitioner for input layer.
+    """
+    def _model_fn(features, labels, mode, config):
+      return linear_lib._linear_model_fn(  # pylint: disable=protected-access
+          features=features,
+          labels=labels,
+          mode=mode,
+          head=head,
+          feature_columns=tuple(feature_columns or []),
+          optimizer=optimizer,
+          partitioner=partitioner,
+          config=config)
+    super(LinearEstimator, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir, config=config)
diff --git a/tensorflow/contrib/estimator/python/estimator/linear_test.py b/tensorflow/contrib/estimator/python/estimator/linear_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c63514eb688af48577f0a3b7ce9e7478309f2c30
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/linear_test.py
@@ -0,0 +1,153 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for linear.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import shutil
+import tempfile
+
+import numpy as np
+import six
+
+from tensorflow.contrib.estimator.python.estimator import head as head_lib
+from tensorflow.contrib.estimator.python.estimator import linear
+from tensorflow.python.estimator.canned import linear_testing_utils
+from tensorflow.python.estimator.canned import prediction_keys
+from tensorflow.python.estimator.export import export
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+from tensorflow.python.summary.writer import writer_cache
+
+
+def _linear_estimator_fn(
+    weight_column=None, label_dimension=1, *args, **kwargs):
+  """Returns a LinearEstimator that uses regression_head."""
+  return linear.LinearEstimator(
+      head=head_lib.regression_head(
+          weight_column=weight_column, label_dimension=label_dimension),
+      *args, **kwargs)
+
+
+class LinearEstimatorEvaluateTest(
+    linear_testing_utils.BaseLinearRegressorEvaluationTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearRegressorEvaluationTest.__init__(
+        self, _linear_estimator_fn)
+
+
+class LinearEstimatorPredictTest(
+    linear_testing_utils.BaseLinearRegressorPredictTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearRegressorPredictTest.__init__(
+        self, _linear_estimator_fn)
+
+
+class LinearEstimatorTrainTest(
+    linear_testing_utils.BaseLinearRegressorTrainingTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearRegressorTrainingTest.__init__(
+        self, _linear_estimator_fn)
+
+
+class LinearEstimatorIntegrationTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def _test_complete_flow(
+      self, train_input_fn, eval_input_fn, predict_input_fn, input_dimension,
+      label_dimension, batch_size):
+    feature_columns = [
+        feature_column.numeric_column('x', shape=(input_dimension,))]
+    est = linear.LinearEstimator(
+        head=head_lib.regression_head(label_dimension=label_dimension),
+        feature_columns=feature_columns,
+        model_dir=self._model_dir)
+
+    # TRAIN
+    num_steps = 10
+    est.train(train_input_fn, steps=num_steps)
+
+    # EVALUTE
+    scores = est.evaluate(eval_input_fn)
+    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
+    self.assertIn('loss', six.iterkeys(scores))
+
+    # PREDICT
+    predictions = np.array([
+        x[prediction_keys.PredictionKeys.PREDICTIONS]
+        for x in est.predict(predict_input_fn)
+    ])
+    self.assertAllEqual((batch_size, label_dimension), predictions.shape)
+
+    # EXPORT
+    feature_spec = feature_column.make_parse_example_spec(feature_columns)
+    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
+        feature_spec)
+    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
+                                       serving_input_receiver_fn)
+    self.assertTrue(gfile.Exists(export_dir))
+
+  def test_numpy_input_fn(self):
+    """Tests complete flow with numpy_input_fn."""
+    label_dimension = 2
+    batch_size = 10
+    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
+    data = data.reshape(batch_size, label_dimension)
+    # learn y = x
+    train_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        y=data,
+        batch_size=batch_size,
+        num_epochs=None,
+        shuffle=True)
+    eval_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        y=data,
+        batch_size=batch_size,
+        shuffle=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        batch_size=batch_size,
+        shuffle=False)
+
+    self._test_complete_flow(
+        train_input_fn=train_input_fn,
+        eval_input_fn=eval_input_fn,
+        predict_input_fn=predict_input_fn,
+        input_dimension=label_dimension,
+        label_dimension=label_dimension,
+        batch_size=batch_size)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
index d9c83aa86577aa129458c56887ff4668c103d0db..598bd549c5cef7edde6bf94605aa8839b611e185 100644
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
@@ -41,20 +41,25 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import tf_logging
+from tensorflow.python.training import device_setter as device_setter_lib
 from tensorflow.python.training import training_util
 
 
-def replicate_model_fn(model_fn, optimizer_fn, devices=None):
+def replicate_model_fn(model_fn,
+                       optimizer_fn,
+                       loss_reduction=losses.Reduction.SUM,
+                       devices=None):
   """Replicate `Estimator.model_fn` over GPUs within a single host.
 
   The given `model_fn` specifies a single forward pass of a model.  To replicate
   such a model over GPUs, each GPU gets its own instance of the forward pass
   (a.k.a. a tower).  The input features and labels get sharded into the chunks
-  that correspond to the number of GPUs.  Each tower computes its own loss based
+  that correspond to the number of GPUs.  Each tower computes a loss based
   on its input.  For each such loss, gradients are computed.  After that, the
-  available losses are summed to form aggregated loss.  The available
-  gradients are summed too.  Then, they update weights using the specified
+  available losses are aggregated to form aggregated loss.  Available
+  gradients are summed.  Then, they update weights using the specified
   optimizer.
 
   If `devices` are `None`, then all available GPUs are going to be used for
@@ -101,7 +106,7 @@ def replicate_model_fn(model_fn, optimizer_fn, devices=None):
   On reduction algorithms:
   Certain algorithms were chosen for aggregating results of computations on
   multiple towers:
-    - Losses from all towers are reduced using sum.
+    - Losses from all towers are reduced according to `loss_reduction`.
     - Gradients are reduced using sum for each trainable variable.
     - `eval_metrics_ops` are reduced per metric using `reduce_mean`.
     - `EstimatorSpec.predictions` and `EstimatorSpec.export_outputs` are
@@ -109,7 +114,7 @@ def replicate_model_fn(model_fn, optimizer_fn, devices=None):
     - For all other fields of `EstimatorSpec` the values of the first tower
       are taken.
 
-  On replication of variables:
+  On distribution of variables:
   Variables are not duplicated between towers.  Instead, they are placed on a
   single device as defined above and shared across towers.
 
@@ -123,6 +128,7 @@ def replicate_model_fn(model_fn, optimizer_fn, devices=None):
     optimizer_fn: a function that returns an optimizer instance.  The function
       may accept one `params` argument.  This is the `params` argument as
       defined by `Estimator`.  See  the `Estimator` documentation for details.
+    loss_reduction: controls whether losses are summed or averaged.
     devices: Optional list of devices to replicate the model across.  This
       argument can be used to replice only on the subset of available GPUs.
       If `None`, then all available GPUs are going to be used for replication.
@@ -133,39 +139,91 @@ def replicate_model_fn(model_fn, optimizer_fn, devices=None):
       conforms to the requirements of `Estimator`'s `model_fn` and can be used
       instead of the supplied `model_fn`.
   """
+  return _replicate_model_fn_with_mode(
+      model_fn,
+      optimizer_fn,
+      loss_reduction,
+      devices,
+      # TODO(isaprykin): Query the system configuration to choose modes other
+      # than `SHARED_LOCAL_PARAMETER_SERVER`, even though it is often
+      # appropriate.
+      mode=_VariableDistributionMode.SHARED_LOCAL_PARAMETER_SERVER)
+
+
+class _VariableDistributionMode(object):
+  """Modes for variable distribution used for forcing a particular one.
+
+  Forcing a mode is meant for performance experimentation purposes rather than
+  for general use cases.
+  """
+
+  SHARED_LOCAL_PARAMETER_SERVER = 1
+  """Variables are placed on a single device and shared across all devices.
+
+  Two ways to achieve this distribution over available GPUs are supported:
+    1)  If exactly 1 GPU is detected, then variables and operations are placed
+        onto GPU.
+    2)  If more than 1 GPU is detected, then variables are going to be placed on
+        the CPU.  Replicas of operations are placed on each individual GPU.
+  """
+
+  SHARED_ROUND_ROBIN = 2
+  """Variables are placed on all devices in a round-robin fashion.
+
+  Every subsequent variable is placed on the next device.  There is only one
+  copy of each variable that is shared across all devices.
+  """
+
+
+def _replicate_model_fn_with_mode(
+    model_fn,
+    optimizer_fn,
+    loss_reduction=losses.Reduction.SUM,
+    devices=None,
+    mode=_VariableDistributionMode.SHARED_LOCAL_PARAMETER_SERVER):
+  """A version of `replicate_model_fn` that allows to specify a `mode`."""
+  if loss_reduction == losses.Reduction.NONE:
+    raise ValueError('Tower losses need to be reduced in some way, yet {} '
+                     'reduction is specified.'.format(loss_reduction))
   if not devices:
     devices = _get_local_devices('GPU') or _get_local_devices('CPU')
 
   is_a_single_gpu_case = len(devices) == 1 and 'GPU' in devices[0]
-  local_ps_device = '/{}:0'.format('GPU' if is_a_single_gpu_case else 'CPU')
+  consolidation_device = '/{}:0'.format('GPU'
+                                        if is_a_single_gpu_case else 'CPU')
+
+  ps_devices = [consolidation_device]
+  if mode == _VariableDistributionMode.SHARED_ROUND_ROBIN:
+    ps_devices = devices
 
-  tf_logging.info('Replicating the `model_fn` across {}.  Local parameter '
-                  'server device is going to be {}.'.format(
-                      devices, local_ps_device))
+  tf_logging.info('Replicating the `model_fn` across {}.  Variables are going '
+                  'to be placed on {}.  Consolidation device is going to be {}.'
+                  .format(devices, ps_devices, consolidation_device))
 
   def replicated_model_fn(features, labels, mode, params=None, config=None):
     """Replicated version of `model_fn` to be used instead."""
     feature_shards, label_shards = _split_batch(
-        features, labels, len(devices), device=local_ps_device)
+        features, labels, len(devices), device=consolidation_device)
     tower_specs = _get_loss_towers(
         model_fn=model_fn,
         mode=mode,
         features=feature_shards,
         labels=label_shards,
         params=params,
+        loss_reduction=loss_reduction,
         config=config,
         devices=devices,
-        local_ps_device=local_ps_device)
+        local_ps_devices=ps_devices)
 
     if mode == model_fn_lib.ModeKeys.TRAIN:
       train_op = _minimize_towers(tower_specs,
                                   _call_optimizer_fn(optimizer_fn, params))
       return _train_spec(
-          tower_specs, train_op, aggregation_device=local_ps_device)
+          tower_specs, train_op, aggregation_device=consolidation_device)
     elif mode == model_fn_lib.ModeKeys.EVAL:
-      return _eval_spec(tower_specs, aggregation_device=local_ps_device)
+      return _eval_spec(tower_specs, aggregation_device=consolidation_device)
     elif mode == model_fn_lib.ModeKeys.PREDICT:
-      return _predict_spec(tower_specs, aggregation_device=local_ps_device)
+      return _predict_spec(tower_specs, aggregation_device=consolidation_device)
 
   return replicated_model_fn
 
@@ -222,7 +280,8 @@ def _get_loss_towers(model_fn,
                      params,
                      config,
                      devices,
-                     local_ps_device,
+                     local_ps_devices,
+                     loss_reduction=losses.Reduction.SUM,
                      name_scope_pattern=_DEFAULT_NAME_SCOPE_PATTERN):
   """Replicate the loss computation across devices."""
   tower_specs = []
@@ -234,15 +293,22 @@ def _get_loss_towers(model_fn,
   if 'config' in model_fn_args:
     optional_params['config'] = copy.deepcopy(config)
 
+  # pylint: disable=protected-access
+  round_robin_strategy = device_setter_lib._RoundRobinStrategy(
+      num_tasks=len(local_ps_devices))
+  # pylint: enable=protected-access
+
   for i, device in enumerate(devices):
     is_the_first_tower = (i == 0)
 
     device_setter = _local_device_setter(
-        worker_device=device, ps_device=local_ps_device)
+        worker_device=device,
+        ps_devices=local_ps_devices,
+        ps_strategy=round_robin_strategy)
 
-    # We would like to preserve the names of the variables and ops that a user
-    # might be relying on. Names with prefix are going to resolve to variables
-    # and ops of the first tower.
+    # We would like to preserve the names of the variables and ops that the user
+    # might be relying on. Names without a prefix are going to resolve to
+    # variables and ops of the first tower.
     name_scope = name_scope_pattern
     if is_the_first_tower:
       name_scope = ''
@@ -254,16 +320,19 @@ def _get_loss_towers(model_fn,
           if labels:
             labels_shard = labels[i]
 
-          tower_specs.append(
-              model_fn(
-                  mode=mode,
-                  features=features[i],
-                  labels=labels_shard,
-                  **optional_params))
+          tower_spec = model_fn(
+              mode=mode,
+              features=features[i],
+              labels=labels_shard,
+              **optional_params)
+          if loss_reduction != losses.Reduction.SUM:
+            tower_spec = _scale_tower_loss(
+                tower_spec, number_of_towers=len(devices))
+          tower_specs.append(tower_spec)
   return tower_specs
 
 
-def _local_device_setter(ps_device, worker_device):
+def _local_device_setter(worker_device, ps_devices, ps_strategy):
   """A device setter that puts distributes Var/Ops to PS/workers."""
   ps_ops = ['Variable', 'VariableV2', 'VarHandleOp']
 
@@ -273,7 +342,7 @@ def _local_device_setter(ps_device, worker_device):
     node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def
     if node_def.op in ps_ops:
       ps_device_spec = framework_device.DeviceSpec.from_string(
-          '{}'.format(ps_device))
+          '{}'.format(ps_devices[ps_strategy(op)]))
 
       ps_device_spec.merge_from(current_device)
       return ps_device_spec.to_string()
@@ -286,6 +355,17 @@ def _local_device_setter(ps_device, worker_device):
   return local_device_chooser
 
 
+def _scale_tower_loss(tower_spec, number_of_towers):
+  """Scale down the loss for arriving at the average loss by summing."""
+  if tower_spec.loss is None:
+    return tower_spec
+
+  estimator_spec = _asdict(tower_spec)
+  estimator_spec['loss'] = math_ops.div(
+      tower_spec.loss, 1.0 * number_of_towers, name='averaged_loss')
+  return model_fn_lib.EstimatorSpec(**estimator_spec)
+
+
 def _minimize_towers(tower_specs, optimizer):
   """Aggregate and apply gradients for computed losses."""
   grad_lists = {}
@@ -335,7 +415,7 @@ def _train_spec(tower_specs,
                 aggregation_device,
                 aggregated_loss_name='loss'):
   """Populate replicated EstimatorSpec for `GraphKeys.TRAIN`."""
-  estimator_spec = tower_specs[0]._asdict()
+  estimator_spec = _asdict(tower_specs[0])
   estimator_spec['mode'] = model_fn_lib.ModeKeys.TRAIN
   estimator_spec['train_op'] = train_op
   estimator_spec['loss'] = _compute_sum_on_device(
@@ -346,7 +426,7 @@ def _train_spec(tower_specs,
 
 def _eval_spec(tower_specs, aggregation_device, aggregated_loss_name='loss'):
   """Populate replicated EstimatorSpec for `GraphKeys.EVAL`."""
-  estimator_spec = tower_specs[0]._asdict()
+  estimator_spec = _asdict(tower_specs[0])
   estimator_spec['mode'] = model_fn_lib.ModeKeys.EVAL
   estimator_spec['loss'] = _compute_sum_on_device(
       [spec.loss for spec in tower_specs], aggregation_device,
@@ -414,7 +494,7 @@ def _reduce_metric_variables(number_of_towers):
 
 def _predict_spec(tower_specs, aggregation_device):
   """Populate replicated EstimatorSpec for `GraphKeys.PREDICT`."""
-  estimator_spec = tower_specs[0]._asdict()
+  estimator_spec = _asdict(tower_specs[0])
   estimator_spec['mode'] = model_fn_lib.ModeKeys.PREDICT
 
   with ops_lib.device(aggregation_device):
@@ -474,3 +554,19 @@ def _dict_concat(*dicts):
     for k, v in six.iteritems(d):
       list_dict.setdefault(k, []).append(v)
   return list_dict
+
+
+def _asdict(namedtuple):
+  """Returns a namedtuple as a dictionary.
+
+  This is required because `_asdict()` in Python 3.x.x is broken in classes
+  that inherit from `collections.namedtuple`. See
+  https://bugs.python.org/issue24931 for more details.
+
+  Args:
+    namedtuple: An object that inherits from `collections.namedtuple`.
+
+  Returns:
+    A dictionary version of the tuple.
+  """
+  return {k: getattr(namedtuple, k) for k in namedtuple._fields}
diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
index 5a1982f5eb52f685a6998ae64a30b29a8aa2ce11..b452e5c7359a973bea670f5760b229cf72d032f5 100644
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
@@ -40,6 +40,7 @@ from tensorflow.python.framework import ops as ops_lib
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import losses
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics as metrics_lib
 from tensorflow.python.ops import variable_scope
@@ -49,15 +50,30 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.summary.writer import writer_cache
+from tensorflow.python.training import device_setter
 from tensorflow.python.training import gradient_descent
 
 
+# TODO(isaprykin):  Parametrize all the tests on
+#   replicate_model_fn._VariableDistributionMode when it's supported.
 class DNNClassifierIntegrationTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
     self._model_dir = tempfile.mkdtemp()
 
-  def test_complete_flow(self):
+  def test_complete_flow_with_public_version(self):
+    return self._complete_flow_with_mode(mode=None)
+
+  def test_complete_flow_with_mode_local_ps_server(self):
+    return self._complete_flow_with_mode(
+        replicate_model_fn._VariableDistributionMode.
+        SHARED_LOCAL_PARAMETER_SERVER)
+
+  def test_complete_flow_with_mode_round_robin(self):
+    return self._complete_flow_with_mode(
+        replicate_model_fn._VariableDistributionMode.SHARED_ROUND_ROBIN)
+
+  def _complete_flow_with_mode(self, mode):
     n_classes = 3
     input_dimension = 2
     batch_size = 12
@@ -105,11 +121,20 @@ class DNNClassifierIntegrationTest(test_util.TensorFlowTestCase):
     def optimizer_fn():
       return optimizers.get_optimizer_instance('Adagrad', learning_rate=0.05)
 
+    if not mode:  # Use the public `replicate_model_fn`.
+      model_fn = replicate_model_fn.replicate_model_fn(
+          estimator.model_fn,
+          optimizer_fn,
+          devices=['/gpu:0', '/gpu:1', '/gpu:2'])
+    else:
+      model_fn = replicate_model_fn._replicate_model_fn_with_mode(
+          estimator.model_fn,
+          optimizer_fn,
+          devices=['/gpu:0', '/gpu:1', '/gpu:2'],
+          mode=mode)
+
     estimator = estimator_lib.Estimator(
-        model_fn=replicate_model_fn.replicate_model_fn(
-            estimator.model_fn,
-            optimizer_fn,
-            devices=['/gpu:0', '/gpu:1', '/gpu:2']),
+        model_fn=model_fn,
         model_dir=estimator.model_dir,
         config=estimator.config,
         params=estimator.params)
@@ -197,13 +222,40 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
       total_loss = (1.0 * 10 - 1.0) + (2.0 * 10 - 2.0)
       self.assertEqual(total_loss, session.run(estimator_spec.loss))
 
-      # loss' of c is 3.
+      # derivative of loss = (1*c - 1) + (2*c - 2) is 3.
       # new value of c = 10 - learning rate * 3 = 7.0.
       session.run(estimator_spec.train_op)
       with variable_scope.variable_scope('', reuse=True):
         c = variable_scope.get_variable('c', dtype=dtypes.float64)
         self.assertEqual(7.0, session.run(c))
 
+  def test_train_with_mean_reduction(self):
+    features = np.array([[1.0], [2.0]])
+    labels = np.array([[1.0], [2.0]])
+
+    with self.test_session() as session:
+      replicated_model_fn = replicate_model_fn.replicate_model_fn(
+          self.model_fn,
+          self.optimizer_fn,
+          losses.Reduction.MEAN,
+          devices=['/gpu:0', '/gpu:1'])
+      estimator_spec = replicated_model_fn(
+          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
+      session.run(variables.global_variables_initializer())
+
+      # loss = feature * c - label
+      total_loss = ((1.0 * 10 - 1.0) + (2.0 * 10 - 2.0)) / 2.0
+      self.assertEqual(total_loss, session.run(estimator_spec.loss))
+
+      # derivative of loss = (1*c - 1)/2 + (2*c - 2)/2 is 1.5.
+      # It's the same computation as without mean reduction, but the
+      # loss from every tower is scaled by 1/<number of towers>.
+      # new value of c = 10 - learning rate * 1.5 = 8.5
+      session.run(estimator_spec.train_op)
+      with variable_scope.variable_scope('', reuse=True):
+        c = variable_scope.get_variable('c', dtype=dtypes.float64)
+        self.assertEqual(8.5, session.run(c))
+
   def test_train_spec_with_optimizer_without_params(self):
 
     def optimizer_fn_without_params():
@@ -252,6 +304,38 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
       self.assertEqual(0, auc)
       self.assertNear(total_loss, session.run(estimator_spec.loss), 0.01)
 
+  def test_eval_with_mean_reduction(self):
+    features = np.array([[0.01], [0.002]])
+    labels = np.array([[0.01], [0.02]])
+
+    with self.test_session() as session:
+      replicated_model_fn = replicate_model_fn.replicate_model_fn(
+          self.model_fn,
+          self.optimizer_fn,
+          losses.Reduction.MEAN,
+          devices=['/gpu:0', '/gpu:1'])
+      estimator_spec = replicated_model_fn(
+          features, labels, model_fn_lib.ModeKeys.EVAL, self.params)
+      session.run(variables.local_variables_initializer())
+      session.run(variables.global_variables_initializer())
+
+      accuracy, a = estimator_spec.eval_metric_ops['accuracy']
+      auc, b = estimator_spec.eval_metric_ops['auc']
+
+      session.run([a, b])
+      accuracy = session.run(accuracy)
+      auc = session.run(auc)
+
+      # loss[i] = features[i] * 10 - labels[i].
+      # Accuracy is 0.0 (no match) in the first tower.
+      # Accuracy is 1.0 (match) in the second tower, since the feature
+      # times weight "c" happened to be equal to the label.
+      total_loss = ((0.01 * 10 - 0.01) + (0.002 * 10 - 0.02)) / 2.0
+
+      self.assertNear((0.0 + 1.0) / 2.0, accuracy, 0.01)
+      self.assertEqual(0, auc)
+      self.assertNear(total_loss, session.run(estimator_spec.loss), 0.01)
+
   def test_predict(self):
     features = np.array([[0.01], [0.002]])
     labels = np.array([[0.01], [0.02]])
@@ -273,7 +357,7 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
 
     with self.test_session() as session:
       replicated_model_fn = replicate_model_fn.replicate_model_fn(
-          self.model_fn, self.optimizer_fn)
+          self.model_fn, self.optimizer_fn, devices=['/gpu:0'])
       estimator_spec = replicated_model_fn(
           features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
       session.run(variables.global_variables_initializer())
@@ -332,6 +416,11 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
           'probabilities': np.array([[0.1], [0.02]])
       }, session.run(estimator_spec.predictions))
 
+  def test_unsupported_loss_reduction(self):
+    with self.assertRaisesRegexp(ValueError, ''):
+      _ = replicate_model_fn.replicate_model_fn(
+          self.model_fn, self.optimizer_fn, losses.Reduction.NONE)
+
 
 class GetLossTowersTest(test_util.TensorFlowTestCase):
 
@@ -359,7 +448,7 @@ class GetLossTowersTest(test_util.TensorFlowTestCase):
           params=None,
           config=None,
           devices=['/gpu:0', '/gpu:1'],
-          local_ps_device='/gpu:0',
+          local_ps_devices=['/gpu:0'],
           name_scope_pattern='test_tower_{}')
       session.run(variables.global_variables_initializer())
 
@@ -382,6 +471,88 @@ class GetLossTowersTest(test_util.TensorFlowTestCase):
         c = variable_scope.get_variable('c', dtype=dtypes.float64)
         self.assertEqual(0.25, session.run(c))
 
+  def test_gradients_are_computed_with_mean_reduction(self):
+    with self.test_session() as session:
+      tower_specs = replicate_model_fn._get_loss_towers(
+          self.model_fn,
+          mode=model_fn_lib.ModeKeys.EVAL,
+          features=[[0.6], [1.6]],
+          labels=[[0.6], [0.6]],
+          params=None,
+          loss_reduction=losses.Reduction.MEAN,
+          config=None,
+          devices=['/gpu:0', '/gpu:1'],
+          local_ps_devices=['/gpu:0'],
+          name_scope_pattern='test_tower_{}')
+      session.run(variables.global_variables_initializer())
+
+      self.assertEqual(len(tower_specs), 2)
+
+      self.assertEqual('/device:GPU:0', tower_specs[0].loss.device)
+      self.assertEqual('averaged_loss:0', tower_specs[0].loss.name)
+      self.assertEqual(0.5, session.run(tower_specs[0].loss))
+
+      self.assertEqual('/device:GPU:1', tower_specs[1].loss.device)
+      self.assertEqual('test_tower_1/averaged_loss:0', tower_specs[1].loss.name)
+      # The input batch for the second tower had a loss that is 1.0
+      # bigger: 0.6 vs 1.6.
+      self.assertEqual(1.0, session.run(tower_specs[1].loss))
+
+      self.assertEqual(1, len(variables.global_variables()))
+      self.assertEqual(1, len(variables.trainable_variables()))
+
+      with variable_scope.variable_scope('', reuse=True):
+        c = variable_scope.get_variable('c', dtype=dtypes.float64)
+        self.assertEqual(0.25, session.run(c))
+
+  def test_variables_are_round_robined_correctly(self):
+    """Test that creates multiple variables and tests round-robin placement."""
+
+    def model_fn(mode, features, labels, params):
+      del params
+      for variable_name in ['a', 'b', 'c', 'd']:
+        c = variable_scope.get_variable(
+            variable_name,
+            initializer=constant_op.constant(0.25, dtype=dtypes.float64),
+            dtype=dtypes.float64)
+
+      predictions = math_ops.add(np.array([0.1, 0.2, 0.3, features[0]]), c)
+      labels = np.array([0.1, 0.2, 0.3, labels[0]])
+      loss = losses.absolute_difference(
+          labels=labels,
+          predictions=predictions,
+          reduction=losses.Reduction.SUM)
+      return model_fn_lib.EstimatorSpec(
+          mode=mode, loss=math_ops.reduce_sum(loss))
+
+    with self.test_session() as session:
+      tower_specs = replicate_model_fn._get_loss_towers(
+          model_fn,
+          mode=None,
+          features=[[0.6], [1.6], [2.6]],
+          labels=[[0.6], [0.6], [2.6]],
+          params=None,
+          config=None,
+          devices=['/gpu:0', '/gpu:1', '/gpu:3'],
+          local_ps_devices=['/gpu:0', '/gpu:1', '/gpu:3'],
+          name_scope_pattern='test_tower_{}')
+      session.run(variables.global_variables_initializer())
+
+      self.assertEqual(len(tower_specs), 3)
+      self.assertEqual('/device:GPU:0', tower_specs[0].loss.device)
+      self.assertEqual('/device:GPU:1', tower_specs[1].loss.device)
+      self.assertEqual('/device:GPU:3', tower_specs[2].loss.device)
+
+      with variable_scope.variable_scope('', reuse=True):
+        a = variable_scope.get_variable('a', dtype=dtypes.float64)
+        self.assertEqual('/device:GPU:0', a.device)
+        b = variable_scope.get_variable('b', dtype=dtypes.float64)
+        self.assertEqual('/device:GPU:1', b.device)
+        c = variable_scope.get_variable('c', dtype=dtypes.float64)
+        self.assertEqual('/device:GPU:3', c.device)
+        d = variable_scope.get_variable('d', dtype=dtypes.float64)
+        self.assertEqual('/device:GPU:0', d.device)
+
 
 class SplitBatchTest(test_util.TensorFlowTestCase):
 
@@ -604,7 +775,7 @@ class PredictSpecTest(test_util.TensorFlowTestCase):
           params=None,
           config=None,
           devices=['/gpu:0', '/gpu:1'],
-          local_ps_device='/gpu:0',
+          local_ps_devices=['/gpu:0'],
       )
       session.run(variables.global_variables_initializer())
 
@@ -843,33 +1014,73 @@ class GetLocalDevicesTest(test_util.TensorFlowTestCase):
         replicate_model_fn._get_local_devices('XPU'))  # XPU doesn't exist.
 
   def test_whether_there_is_a_gpu(self):
-    self.assertEqual(
-        len(replicate_model_fn._get_local_devices('GPU')),
-        test.is_gpu_available())
+    if test.is_gpu_available():
+      self.assertTrue(len(replicate_model_fn._get_local_devices('GPU')))
 
 
 class LocalDeviceSetterTest(test_util.TensorFlowTestCase):
 
   def test_vars_are_on_ps_but_ops_are_on_workers(self):
+    ps_devices = ['/device:GPU:3']
+    round_robin = device_setter._RoundRobinStrategy(num_tasks=len(ps_devices))
+
     local_device_setter = replicate_model_fn._local_device_setter(
-        ps_device='/device:GPU:3', worker_device='/device:GPU:2')
+        ps_devices=ps_devices,
+        ps_strategy=round_robin,
+        worker_device='/device:GPU:2')
 
     with ops_lib.device(local_device_setter):
-      c = variables.Variable(0.01)
+      a = variables.Variable(0.01)
+      self.assertEqual('/device:GPU:3', a.device)
+
+      b = variables.Variable(0.02)
+      self.assertEqual('/device:GPU:3', b.device)
+
+      c = variables.Variable(0.03)
       self.assertEqual('/device:GPU:3', c.device)
 
-      cc = variables.Variable(0.02)
-      self.assertEqual('/device:GPU:3', cc.device)
+      a_op = array_ops.concat(a, axis=0)
+      self.assertEqual('/device:GPU:2', a_op.device)
 
-      ccc = variables.Variable(0.03)
-      self.assertEqual('/device:GPU:3', ccc.device)
+      b_op = array_ops.concat(b, axis=0)
+      self.assertEqual('/device:GPU:2', b_op.device)
+
+  def test_round_robin_placement(self):
+    ps_devices = [
+        '/device:GPU:0', '/device:GPU:1', '/device:GPU:3', '/device:GPU:4'
+    ]
+    round_robin = device_setter._RoundRobinStrategy(num_tasks=len(ps_devices))
+
+    local_device_setter = replicate_model_fn._local_device_setter(
+        ps_devices=ps_devices,
+        ps_strategy=round_robin,
+        worker_device='/device:GPU:2')
+
+    with ops_lib.device(local_device_setter):
+      a = variables.Variable(0.01)
+      self.assertEqual('/device:GPU:0', a.device)
+
+      b = variables.Variable(0.02)
+      self.assertEqual('/device:GPU:1', b.device)
+
+      c = variables.Variable(0.03)
+      self.assertEqual('/device:GPU:3', c.device)
+
+      a_op = array_ops.concat(a, axis=0)
+      self.assertEqual('/device:GPU:2', a_op.device)
+
+      b_op = array_ops.concat(b, axis=0)
+      self.assertEqual('/device:GPU:2', b_op.device)
+
+      c = variables.Variable(0.03)
+      self.assertEqual('/device:GPU:4', c.device)
+
+      d = variables.Variable(0.03)
+      self.assertEqual('/device:GPU:0', d.device)
 
       c_op = array_ops.concat(c, axis=0)
       self.assertEqual('/device:GPU:2', c_op.device)
 
-      cc_op = array_ops.concat(cc, axis=0)
-      self.assertEqual('/device:GPU:2', cc_op.device)
-
 
 class ComputeSumWithDevicePlacementTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/contrib/factorization/python/ops/gmm.py b/tensorflow/contrib/factorization/python/ops/gmm.py
index 0d67e09f8151b48c97094b6b48f26e63443707ef..f72280c4ecf19e33278ffe74061f44bbb7b21709 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm.py
@@ -24,7 +24,7 @@ import numpy as np
 from tensorflow.contrib import framework
 from tensorflow.contrib.factorization.python.ops import gmm_ops
 from tensorflow.contrib.framework.python.framework import checkpoint_utils
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
 from tensorflow.python.framework import constant_op
@@ -167,7 +167,7 @@ class GMM(estimator.Estimator):
                                      self._num_clusters, self._random_seed,
                                      self._covariance_type,
                                      self._params)
-      incr_step = state_ops.assign_add(variables.get_global_step(), 1)
+      incr_step = state_ops.assign_add(training_util.get_global_step(), 1)
       loss = math_ops.reduce_sum(losses)
       training_op = with_dependencies([training_op, incr_step], loss)
       training_hooks = [_InitializeClustersHook(
diff --git a/tensorflow/contrib/factorization/python/ops/wals.py b/tensorflow/contrib/factorization/python/ops/wals.py
index b2f22eb2fce89415b6cc60ecbbc5c86da97ba40b..4fe22ea26ec5f5a43f1c99d1fee518b1d326c5c9 100644
--- a/tensorflow/contrib/factorization/python/ops/wals.py
+++ b/tensorflow/contrib/factorization/python/ops/wals.py
@@ -77,6 +77,7 @@ class _SweepHook(session_run_hook.SessionRunHook):
       logging.info("SweepHook running init op.")
       sess.run(self._init_op)
     if is_sweep_done:
+      logging.info("SweepHook starting the next sweep.")
       sess.run(self._switch_op)
     is_row_sweep = sess.run(self._is_row_sweep_var)
     if is_sweep_done or not self._is_initialized:
@@ -91,6 +92,22 @@ class _SweepHook(session_run_hook.SessionRunHook):
         fetches=[self._row_train_op if is_row_sweep else self._col_train_op])
 
 
+class _IncrementGlobalStepHook(session_run_hook.SessionRunHook):
+  """Hook that increments the global step."""
+
+  def __init__(self):
+    global_step = training_util.get_global_step()
+    if global_step:
+      self._global_step_incr_op = state_ops.assign_add(
+          global_step, 1, name="global_step_incr").op
+    else:
+      self._global_step_incr_op = None
+
+  def before_run(self, run_context):
+    if self._global_step_incr_op:
+      run_context.session.run(self._global_step_incr_op)
+
+
 class _StopAtSweepHook(session_run_hook.SessionRunHook):
   """Hook that requests stop at a given sweep."""
 
@@ -166,7 +183,7 @@ def _wals_factorization_model_function(features, labels, mode, params):
 
   # TRAIN mode:
   if mode == model_fn.ModeKeys.TRAIN:
-    # Training consists of the folowing ops (controlled using a SweepHook).
+    # Training consists of the following ops (controlled using a SweepHook).
     # Before a row sweep:
     #   row_update_prep_gramian_op
     #   initialize_row_update_op
@@ -210,14 +227,6 @@ def _wals_factorization_model_function(features, labels, mode, params):
     summary.scalar("root_weighted_squared_error", rwse_var)
     summary.scalar("completed_sweeps", completed_sweeps_var)
 
-    # Increments global step.
-    global_step = training_util.get_global_step()
-    if global_step:
-      global_step_incr_op = state_ops.assign_add(
-          global_step, 1, name="global_step_incr").op
-    else:
-      global_step_incr_op = control_flow_ops.no_op()
-
     def create_axis_ops(sp_input, num_items, update_fn, axis_name):
       """Creates book-keeping and training ops for a given axis.
 
@@ -246,9 +255,6 @@ def _wals_factorization_model_function(features, labels, mode, params):
             collections=[ops.GraphKeys.GLOBAL_VARIABLES],
             trainable=False,
             name="processed_" + axis_name)
-      reset_processed_items_op = state_ops.assign(
-          processed_items, processed_items_init,
-          name="reset_processed_" + axis_name)
       _, update_op, loss, reg, sum_weights = update_fn(sp_input)
       input_indices = sp_input.indices[:, 0]
       with ops.control_dependencies([
@@ -264,13 +270,12 @@ def _wals_factorization_model_function(features, labels, mode, params):
         with ops.control_dependencies([update_processed_items]):
           is_sweep_done = math_ops.reduce_all(processed_items)
           axis_train_op = control_flow_ops.group(
-              global_step_incr_op,
               state_ops.assign(is_sweep_done_var, is_sweep_done),
               state_ops.assign_add(
                   completed_sweeps_var,
                   math_ops.cast(is_sweep_done, dtypes.int32)),
               name="{}_sweep_train_op".format(axis_name))
-      return reset_processed_items_op, axis_train_op
+      return processed_items.initializer, axis_train_op
 
     reset_processed_rows_op, row_train_op = create_axis_ops(
         input_rows,
@@ -296,7 +301,8 @@ def _wals_factorization_model_function(features, labels, mode, params):
     sweep_hook = _SweepHook(
         is_row_sweep_var, is_sweep_done_var, init_op,
         row_prep_ops, col_prep_ops, row_train_op, col_train_op, switch_op)
-    training_hooks = [sweep_hook]
+    global_step_hook = _IncrementGlobalStepHook()
+    training_hooks = [sweep_hook, global_step_hook]
     if max_sweeps is not None:
       training_hooks.append(_StopAtSweepHook(max_sweeps))
 
diff --git a/tensorflow/contrib/ffmpeg/BUILD b/tensorflow/contrib/ffmpeg/BUILD
index dc5a04a0b15870babbc98cf104e109caf829901c..eccce99071dc1477cf4f3bb152f3304b3b0fc35a 100644
--- a/tensorflow/contrib/ffmpeg/BUILD
+++ b/tensorflow/contrib/ffmpeg/BUILD
@@ -155,7 +155,10 @@ tf_py_test(
     data = [
         ":test_data",
     ],
-    tags = ["manual"],
+    tags = [
+        "manual",
+        "notap",
+    ],
 )
 
 py_library(
diff --git a/tensorflow/contrib/ffmpeg/__init__.py b/tensorflow/contrib/ffmpeg/__init__.py
index 871dff7bbe4912f0daf2bc184d6b0f12510abee7..daba965a98893b992abdc598ec713f13020d6e91 100644
--- a/tensorflow/contrib/ffmpeg/__init__.py
+++ b/tensorflow/contrib/ffmpeg/__init__.py
@@ -26,6 +26,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_audio
+from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video
 from tensorflow.contrib.ffmpeg.ffmpeg_ops import encode_audio
 from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video
 
diff --git a/tensorflow/contrib/ffmpeg/decode_video_op_test.py b/tensorflow/contrib/ffmpeg/decode_video_op_test.py
index 4d1fac4ef8afbf44cd45bae065f8a95b0527079a..b43b6b8919223bd7731209d5423b142601396ea5 100644
--- a/tensorflow/contrib/ffmpeg/decode_video_op_test.py
+++ b/tensorflow/contrib/ffmpeg/decode_video_op_test.py
@@ -20,11 +20,9 @@ from __future__ import print_function
 
 import os.path
 
-import six
+import six  # pylint: disable=unused-import
 
 from tensorflow.contrib import ffmpeg
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import image_ops
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.platform import test
@@ -32,7 +30,8 @@ from tensorflow.python.platform import test
 
 class DecodeVideoOpTest(test.TestCase):
 
-  def _loadFileAndTest(self, filename, width, height, frames, bmp_filename, index):
+  def _loadFileAndTest(self, filename, width, height, frames, bmp_filename,
+                       index):
     """Loads an video file and validates the output tensor.
 
     Args:
@@ -40,6 +39,8 @@ class DecodeVideoOpTest(test.TestCase):
       width: The width of the video.
       height: The height of the video.
       frames: The frames of the video.
+      bmp_filename: The filename for the bmp file.
+      index: Index location inside the video.
     """
     with self.test_session():
       path = os.path.join(resource_loader.get_data_files_path(), 'testdata',
@@ -48,7 +49,7 @@ class DecodeVideoOpTest(test.TestCase):
         contents = f.read()
 
       bmp_path = os.path.join(resource_loader.get_data_files_path(), 'testdata',
-                          bmp_filename)
+                              bmp_filename)
       with open(bmp_path, 'rb') as f:
         bmp_contents = f.read()
 
@@ -58,7 +59,7 @@ class DecodeVideoOpTest(test.TestCase):
       video_op = ffmpeg.decode_video(contents)
       video = video_op.eval()
       self.assertEqual(video.shape, (frames, height, width, 3))
-      self.assertAllEqual(video[index,:,:,:], image)
+      self.assertAllEqual(video[index, :, :, :], image)
 
   def testMp4(self):
     self._loadFileAndTest('small.mp4', 560, 320, 166, 'small_100.bmp', 99)
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
index 201774e1d011f35df9c3803f2ed8818cc9b1c1c2..1e8af1458cea13b2ddb89b7d93a4ffb8b974ecd2 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
@@ -49,7 +49,8 @@ std::vector<string> FfmpegAudioCommandLine(const string& input_filename,
           "-nostdin",             // No interactive commands accepted.
           "-f", input_format_id,  // eg: "mp3"
           "-probesize", StrCat(kDefaultProbeSize), "-i", input_filename,
-          "-loglevel", "info",  // Enable verbose logging to support debugging.
+          "-loglevel", "error",   // Print errors only.
+          "-hide_banner",         // Skip printing build options, version, etc.
           "-map_metadata", "-1",  // Copy global metadata from input to output.
           "-vn",                  // No video recording.
           "-ac:a:0", StrCat(channel_count), "-ar:a:0",
@@ -72,7 +73,8 @@ std::vector<string> FfmpegVideoCommandLine(const string& input_filename,
           "-probesize",
           StrCat(kDefaultProbeSize),
           "-loglevel",
-          "info",  // Enable verbose logging to support debugging.
+          "error",  // Print errors only.
+          "-hide_banner",  // Skip printing build options, version, etc.
           "-vcodec",
           "rawvideo",
           "-pix_fmt",
@@ -220,7 +222,8 @@ string BuildWavFile(int32 samples_per_second, int32 channel_count,
 Status ReadInfoFile(const string& filename, uint32* width, uint32* height,
                     uint32* frames) {
   string data;
-  ReadFileToString(Env::Default(), filename, &data);
+  TF_QCHECK_OK(ReadFileToString(Env::Default(), filename, &data))
+      << "Could not read FFmpeg file: " << filename;
   bool in_output = false;
   bool in_mapping = false;
   uint32 frames_value = 0;
@@ -377,7 +380,7 @@ Status ReadVideoFile(const string& filename, std::vector<uint8>* output_data,
         open(stderr_filename.c_str(), O_RDWR | O_CREAT | O_APPEND, 0600);
     if (fd < 0) {
       const int error = errno;
-      LOG(ERROR) << "FFmpeg stderr file coule not be created: "
+      LOG(ERROR) << "FFmpeg stderr file could not be created: "
                  << strerror(error);
       ::_exit(error);
     }
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc
index 2871c1462894c6a4ddef63e9178272df0d14824c..85b61b26163d87a10d4e316720b4f633e038bbec 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc
@@ -39,7 +39,7 @@ const char kTestMp3Filename[] =
 
 // Set to true via a command line flag iff the test is expected to have FFmpeg
 // installed.
-mutex mu;
+mutex mu(LINKER_INITIALIZED);
 bool should_ffmpeg_be_installed GUARDED_BY(mu) = false;
 
 string ParseTestFlags(int* argc, char** argv) {
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc
index 39e7e90cccf1012eb42261bde55d0dc3b7f278ef..36fc71794b06e0f3cb86c40b325ce50e8999c667 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc
@@ -23,6 +23,7 @@
 
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/test.h"
diff --git a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py
index 78ead471d2cf9f0654a06dc022d7cc592d14c710..08b5a6ea48c2d4959af68a2ee9d27d21c6245457 100644
--- a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py
+++ b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.ffmpeg.ops import gen_decode_audio_op_py
+from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py
 from tensorflow.contrib.ffmpeg.ops import gen_encode_audio_op_py
 from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py
 from tensorflow.contrib.util import loader
diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD
index e8dad886a1409babdf4ea47b9cd05def1f1ce25e..5b659ddaa1386736eb8cc05a203ed1827ccd160e 100644
--- a/tensorflow/contrib/framework/BUILD
+++ b/tensorflow/contrib/framework/BUILD
@@ -276,6 +276,7 @@ py_test(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:partitioned_variables",
         "//tensorflow/python:platform",
+        "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:session",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index 3f592611830e40a30392239c85486a2fad15a2a2..4edc77f86ba786ca547b8d3842e2cf02833fbbac 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -65,6 +65,7 @@ See the @{$python/contrib.framework} guide.
 @@get_variable_full_name
 @@get_variables_to_restore
 @@get_variables
+@@global_variable
 @@local_variable
 @@model_variable
 @@variable
diff --git a/tensorflow/contrib/framework/python/framework/graph_util.py b/tensorflow/contrib/framework/python/framework/graph_util.py
index 8ab8711db4650921e0d366a91adfe2f68b5a42f9..a18ff2320d99726bb355ff6179fc97a070c2fec7 100644
--- a/tensorflow/contrib/framework/python/framework/graph_util.py
+++ b/tensorflow/contrib/framework/python/framework/graph_util.py
@@ -24,12 +24,14 @@ import six
 # pylint: disable=unused-import
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
+from tensorflow.python.framework import ops
 from tensorflow.python.framework.graph_util_impl import _assert_nodes_are_present
 from tensorflow.python.framework.graph_util_impl import _bfs_for_reachable_nodes
 from tensorflow.python.framework.graph_util_impl import _extract_graph_summary
 from tensorflow.python.framework.graph_util_impl import _node_name
 
-__all__ = ["fuse_op"]
+
+__all__ = ["fuse_op", "get_placeholders"]
 
 
 def fuse_op(graph_def, input_nodes, output_nodes, output_dtypes,
@@ -91,7 +93,7 @@ def fuse_op(graph_def, input_nodes, output_nodes, output_dtypes,
                             (n, cur_node))
           if cur_node not in input_nodes_set:
             next_to_visit += name_to_input_name[cur_node]
-    else:
+    elif n not in reachable_by_input:
       nodes_post_output.append(n)
 
   # Add all nodes upto the input nodes
@@ -126,3 +128,27 @@ def fuse_op(graph_def, input_nodes, output_nodes, output_dtypes,
   out.library.CopyFrom(graph_def.library)
   out.versions.CopyFrom(graph_def.versions)
   return out
+
+
+def get_placeholders(graph):
+  """Get placeholders of a graph.
+
+  Args:
+    graph: A tf.Graph.
+  Returns:
+    A list contains all placeholders of given graph.
+
+  Raises:
+    TypeError: If `graph` is not a tensorflow graph.
+  """
+
+  if not isinstance(graph, ops.Graph):
+    raise TypeError("Input graph needs to be a Graph: %s" % graph)
+
+  # For each placeholder() call, there is a corresponding
+  # operation of type 'Placeholder' registered to the graph.
+  # The return value (a Tensor) of placeholder() is the
+  # first output of this operation in fact.
+  operations = graph.get_operations()
+  result = [i.outputs[0] for i in operations if i.type == "Placeholder"]
+  return result
diff --git a/tensorflow/contrib/framework/python/framework/graph_util_test.py b/tensorflow/contrib/framework/python/framework/graph_util_test.py
index 87b992e22e1ad3aa20389d0834eeb3a5972c676e..b8a6d109e19211d271c2b15bac66ddacd38fe395 100644
--- a/tensorflow/contrib/framework/python/framework/graph_util_test.py
+++ b/tensorflow/contrib/framework/python/framework/graph_util_test.py
@@ -21,6 +21,9 @@ from tensorflow.contrib.framework.python.framework import graph_util
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
 from tensorflow.core.framework import types_pb2
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
@@ -56,6 +59,41 @@ class GraphUtilTest(test.TestCase):
     self.assertEqual(fused_graph_def.node[2].name, 'D')
     self.assertEqual(fused_graph_def.node[3].name, 'E')
 
+  def testGraphUtilArtificialDependencyInjection(self):
+    graph_def = graph_pb2.GraphDef()
+    node_a = GetNewNode('A', 'Placeholder', [])
+    node_a1 = GetNewNode('A1', 'Placeholder', [])
+    node_b = GetNewNode('B', 'Op1', ['A'])
+    node_c = GetNewNode('C', 'Op1', ['B'])
+    node_d = GetNewNode('D', 'Op1', ['C'])
+    node_e = GetNewNode('E', 'Op1', ['D'])
+    graph_def.node.extend([node_a, node_a1, node_b, node_c, node_d, node_e])
+    fused_graph_def = graph_util.fuse_op(graph_def, ['A', 'A1'], ['D'],
+                                         [types_pb2.DT_FLOAT], True, 'FusedOp',
+                                         'Op2')
+    self.assertEqual(len(fused_graph_def.node), 5)
+    self.assertEqual(fused_graph_def.node[0].name, 'A')
+    self.assertEqual(fused_graph_def.node[1].name, 'A1')
+    self.assertEqual(fused_graph_def.node[2].name, 'FusedOp')
+    self.assertEqual(fused_graph_def.node[2].input[0], 'A')
+    self.assertEqual(fused_graph_def.node[2].op, 'Op2')
+    self.assertEqual(fused_graph_def.node[2].attr['_output_quantized'].b, True)
+    self.assertEqual(fused_graph_def.node[2].attr['_output_types'].list.type,
+                     [types_pb2.DT_FLOAT])
+    self.assertEqual(fused_graph_def.node[3].name, 'D')
+    self.assertEqual(fused_graph_def.node[4].name, 'E')
+
+
+class GetPlaceholdersTest(test.TestCase):
+
+  def test_get_placeholders(self):
+    with ops.Graph().as_default() as g:
+      placeholders = [array_ops.placeholder(dtypes.float32) for _ in range(5)]
+      results = graph_util.get_placeholders(g)
+      self.assertEqual(
+          sorted(placeholders, key=lambda x: x._id),  # pylint: disable=protected-access
+          sorted(results, key=lambda x: x._id))  # pylint: disable=protected-access
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/framework/python/ops/accumulate_n_v2.py b/tensorflow/contrib/framework/python/ops/accumulate_n_v2.py
index a0667bd489213cf366e27114a91e8699ed9e7428..2375ee4f550616ff60d20b87b5773704d8fbbe1e 100644
--- a/tensorflow/contrib/framework/python/ops/accumulate_n_v2.py
+++ b/tensorflow/contrib/framework/python/ops/accumulate_n_v2.py
@@ -48,7 +48,7 @@ def accumulate_n_v2(inputs, shape=None, tensor_dtype=None, name=None):
   tf.accumulate_n_v2([a, b, a])  # [[7, 4], [6, 14]]
 
   # Explicitly pass shape and type
-  tf.accumulate_n_v2([a, b, a], shape=[2, 2], tensor_dtype=tf.int32)  
+  tf.accumulate_n_v2([a, b, a], shape=[2, 2], tensor_dtype=tf.int32)
                                                                    # [[7,  4],
                                                                    #  [6, 14]]
   ```
@@ -93,7 +93,7 @@ def accumulate_n_v2(inputs, shape=None, tensor_dtype=None, name=None):
   elif len(inputs) == 1 and name is not None:
     return array_ops.identity(inputs[0], name=name)
   elif context.in_eager_mode():
-    # TemporaryVariable not currently supported in eager mode; fall back 
+    # TemporaryVariable not currently supported in eager mode; fall back
     # onto AddN for now.
     # TODO(frreiss) remove this once the lifetime of eager variables gets
     # addressed
@@ -101,7 +101,7 @@ def accumulate_n_v2(inputs, shape=None, tensor_dtype=None, name=None):
   else:
     return gen_math_ops._accumulate_nv2(inputs, name=name, shape=shape)
 
-# The following code should eventually be merged into 
+# The following code should eventually be merged into
 # tensorflow/python/ops/math_grad.py
 @ops.RegisterGradient("AccumulateNV2")
 def _AddNGrad(op, grad):
diff --git a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_eager_test.py b/tensorflow/contrib/framework/python/ops/accumulate_n_v2_eager_test.py
index c2229bb8ad3d5b38321d16f150ed94175ab9bdbe..8f44698da851b48abf831e957c80fa1643a58bda 100644
--- a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_eager_test.py
+++ b/tensorflow/contrib/framework/python/ops/accumulate_n_v2_eager_test.py
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for new version of accumulate_n op that will eventually go into 
+"""Tests for new version of accumulate_n op that will eventually go into
 `ops.math_ops`.
 
-These test cases spefically exercise the `eager` APIs. They need to be in a 
+These test cases spefically exercise the `eager` APIs. They need to be in a
 separate file from the remaining tests because eager mode is currently something
 you can turn on but can't turn off for the lifetime of the current process."""
 from __future__ import absolute_import
@@ -64,7 +64,7 @@ class AccumulateNV2EagerTest(test_util.TensorFlowTestCase):
     np.random.seed(42)
     num_inputs = 3
     input_vars = [
-        resource_variable_ops.ResourceVariable(10.0 * np.random.random(), 
+        resource_variable_ops.ResourceVariable(10.0 * np.random.random(),
                                                name="t%d" % i)
         for i in range(0, num_inputs)
     ]
@@ -72,7 +72,7 @@ class AccumulateNV2EagerTest(test_util.TensorFlowTestCase):
     def fn(first, second, third):
       return av2.accumulate_n_v2([first, second, third])
 
-    grad_fn = backprop.gradients_function(fn)      
+    grad_fn = backprop.gradients_function(fn)
     grad = grad_fn(input_vars[0], input_vars[1], input_vars[2])
     self.assertAllEqual(np.repeat(1.0, num_inputs), # d/dx (x + y + ...) = 1
                         [elem.numpy() for elem in grad])
diff --git a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py b/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py
index 3386e849d5cb8516ab3b1f6cb0429be3fc2fc960..b5e9f8df79262635bf579a6bf2260bc40c140c6f 100644
--- a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py
+++ b/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for new version of accumulate_n op that will eventually go into 
+"""Tests for new version of accumulate_n op that will eventually go into
 `ops.math_ops`."""
 from __future__ import absolute_import
 from __future__ import division
@@ -102,21 +102,21 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase):
       with self.assertRaises(ValueError):
         a = variables.Variable(np.array([0.1,0.2]))
         b = variables.Variable(np.array([[0.3],[0.4]]))
-        tf_val = av2.accumulate_n_v2([a,b]) 
+        tf_val = av2.accumulate_n_v2([a,b])
 
   def testWrongType(self):
     with self.test_session():
       with self.assertRaises(TypeError):
         a = variables.Variable(0.2, dtype=np.float32)
         b = variables.Variable(0.1, dtype=np.float32)
-        tf_val = av2.accumulate_n_v2([a,b], tensor_dtype=np.int32) 
+        tf_val = av2.accumulate_n_v2([a,b], tensor_dtype=np.int32)
 
   def testWrongTypeOneInput(self):
     # Scenario that used to trigger a bug, even when testWrongType() worked
     with self.test_session():
       with self.assertRaises(TypeError):
         a = variables.Variable(0.2, dtype=np.float32)
-        tf_val = av2.accumulate_n_v2([a], tensor_dtype=np.int32) 
+        tf_val = av2.accumulate_n_v2([a], tensor_dtype=np.int32)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/framework/python/ops/variables.py b/tensorflow/contrib/framework/python/ops/variables.py
index b7668379686b4f0ba2a3e415ddb44b287659baaa..3f1ece4510578b5ac39849c577fffbb2a3be45a7 100644
--- a/tensorflow/contrib/framework/python/ops/variables.py
+++ b/tensorflow/contrib/framework/python/ops/variables.py
@@ -60,6 +60,7 @@ __all__ = ['add_model_variable',
            'get_variable_full_name',
            'get_variables_to_restore',
            'get_variables',
+           'global_variable',
            'local_variable',
            'model_variable',
            'variable',
@@ -147,20 +148,48 @@ def get_or_create_global_step(graph=None):
   return training_util.get_or_create_global_step(graph)
 
 
-def local_variable(initial_value, validate_shape=True, name=None):
-  """Create variable and add it to `GraphKeys.LOCAL_VARIABLES` collection.
+def local_variable(initial_value,
+                   validate_shape=True,
+                   name=None,
+                   use_resource=None):
+  """Create a variable with a value and add it to `GraphKeys.LOCAL_VARIABLES`.
 
   Args:
     initial_value: See variables.Variable.__init__.
     validate_shape: See variables.Variable.__init__.
     name: See variables.Variable.__init__.
+    use_resource: If `True` use a ResourceVariable instead of a Variable.
   Returns:
     New variable.
   """
   return variable_scope.variable(
       initial_value, trainable=False,
       collections=[ops.GraphKeys.LOCAL_VARIABLES],
-      validate_shape=validate_shape, name=name)
+      validate_shape=validate_shape,
+      use_resource=use_resource,
+      name=name)
+
+
+def global_variable(initial_value,
+                    validate_shape=True,
+                    name=None,
+                    use_resource=None):
+  """Create a variable with a value and add it to `GraphKeys.GLOBAL_VARIABLES`.
+
+  Args:
+    initial_value: See variables.Variable.__init__.
+    validate_shape: See variables.Variable.__init__.
+    name: See variables.Variable.__init__.
+    use_resource: If `True` use a ResourceVariable instead of a Variable.
+  Returns:
+    New variable.
+  """
+  return variable_scope.variable(
+      initial_value, trainable=False,
+      collections=[ops.GraphKeys.GLOBAL_VARIABLES],
+      validate_shape=validate_shape,
+      use_resource=use_resource,
+      name=name)
 
 
 @contrib_add_arg_scope
@@ -412,7 +441,7 @@ def get_unique_variable(var_op_name):
   """
   candidates = get_variables(scope=var_op_name)
   if not candidates:
-    raise ValueError('Couldnt find variable %s' % var_op_name)
+    raise ValueError('Couldn\'t find variable %s' % var_op_name)
 
   for candidate in candidates:
     if candidate.op.name == var_op_name:
diff --git a/tensorflow/contrib/framework/python/ops/variables_test.py b/tensorflow/contrib/framework/python/ops/variables_test.py
index 6a74e4e8666e98ca3c97dc9ddd8a6c11613f708e..2f06df93acb0a4c0b36c68839ff531e3c22c5ee3 100644
--- a/tensorflow/contrib/framework/python/ops/variables_test.py
+++ b/tensorflow/contrib/framework/python/ops/variables_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import gfile
@@ -102,6 +103,82 @@ class LocalVariableTest(test.TestCase):
       sess.run(variables_lib.local_variables_initializer())
       self.assertAllEqual(a.eval(), [0] * 5)
 
+  def testResourceVariable(self):
+    a = variables_lib2.local_variable(0)
+    b = variables_lib2.local_variable(0, use_resource=True)
+    self.assertEqual(type(a), variables_lib.Variable)
+    self.assertEqual(type(b), resource_variable_ops.ResourceVariable)
+
+
+class GlobalVariableTest(test.TestCase):
+
+  def test_global_variable(self):
+    with self.test_session() as sess:
+      self.assertEquals([], variables_lib.global_variables())
+      value0 = 42
+      variables_lib2.global_variable(value0)
+      value1 = 43
+      variables_lib2.global_variable(value1)
+      variables = variables_lib.global_variables()
+      self.assertEquals(2, len(variables))
+      with self.assertRaisesOpError(
+          'Attempting to use uninitialized value Variable'):
+        sess.run(variables)
+      variables_lib.variables_initializer(variables).run()
+      self.assertAllEqual(set([value0, value1]), set(sess.run(variables)))
+
+  def testVariableNameAndShape(self):
+    with self.test_session():
+      with variable_scope.variable_scope('A'):
+        a = variables_lib2.global_variable([1, 1, 1, 1, 1], name='a')
+        self.assertEquals(a.op.name, 'A/a')
+        self.assertListEqual(a.get_shape().as_list(), [5])
+        self.assertListEqual([a], variables_lib.global_variables())
+
+  def testGlobalVariableNotInLocalVariables(self):
+    with self.test_session():
+      with variable_scope.variable_scope('A'):
+        a = variables_lib2.global_variable(0)
+        self.assertFalse(a in variables_lib.local_variables())
+        self.assertTrue(a in variables_lib.global_variables())
+
+  def testGlobalVariableInVariablesToRestore(self):
+    with self.test_session():
+      with variable_scope.variable_scope('A'):
+        a = variables_lib2.global_variable(0)
+        self.assertFalse(a in variables_lib.local_variables())
+        self.assertTrue(a in variables_lib2.get_variables_to_restore())
+
+  def testGetVariablesReturnsThem(self):
+    with self.test_session():
+      with variable_scope.variable_scope('A'):
+        a = variables_lib2.global_variable(0)
+      with variable_scope.variable_scope('B'):
+        b = variables_lib2.global_variable(0)
+      self.assertEquals([a], variables_lib2.get_variables('A'))
+      self.assertEquals([b], variables_lib2.get_variables('B'))
+
+  def testGetLocalVariablesDontReturnsThem(self):
+    with self.test_session():
+      with variable_scope.variable_scope('A'):
+        variables_lib2.global_variable(0)
+      with variable_scope.variable_scope('B'):
+        variables_lib2.global_variable(0)
+      self.assertEquals([], variables_lib2.get_local_variables('A'))
+      self.assertEquals([], variables_lib2.get_local_variables('B'))
+
+  def testInitializedVariableValue(self):
+    with self.test_session() as sess:
+      a = variables_lib2.global_variable([0, 0, 0, 0, 0], name='a')
+      sess.run(variables_lib.global_variables_initializer())
+      self.assertAllEqual(a.eval(), [0] * 5)
+
+  def testResourceVariable(self):
+    a = variables_lib2.global_variable(0)
+    b = variables_lib2.global_variable(0, use_resource=True)
+    self.assertEqual(type(a), variables_lib.Variable)
+    self.assertEqual(type(b), resource_variable_ops.ResourceVariable)
+
 
 class GlobalStepTest(test.TestCase):
 
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
index 88306094ab9947c9c78b03c0013f6afc88316803..5fec69ea4361a97c79ddc3188469e7ffb327f6cc 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
@@ -493,6 +493,8 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
       {{conv_input_rows, conv_input_cols}},
       output_depth,
       {{filter_rows, filter_cols}},
+      // TODO(yangzihao): Add support for arbitrary dilations for fused conv.
+      {{1, 1}},  // dilation_rows, dilation_cols
       {{row_stride, col_stride}},
       {{padding_rows, padding_cols}},
       conv_input->dtype(),
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h b/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h
index dc43af11580ce5fda74ee25da6c151a5b89c7aee..fa7a3c03aa35c756252b22a004be91fa24c10e41 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h
@@ -30,11 +30,12 @@ class FusedConvParameters : public ConvParameters {
  public:
   FusedConvParameters(int64 batch, int64 in_depths, const SpatialArray& in,
                       int64 out_depths, const SpatialArray& filter,
-                      const SpatialArray& stride, const SpatialArray& padding,
-                      DataType dtype, int device_id, bool has_side_input,
+                      const SpatialArray& dilation, const SpatialArray& stride,
+                      const SpatialArray& padding, DataType dtype,
+                      int device_id, bool has_side_input,
                       ActivationMode activation_mode)
-      : ConvParameters(batch, in_depths, in, out_depths, filter, stride,
-                       padding, dtype, device_id),
+      : ConvParameters(batch, in_depths, in, out_depths, filter, dilation,
+                       stride, padding, dtype, device_id),
         activation_mode_(activation_mode),
         has_side_input_(has_side_input) {
     hash_code_ = Hash64Combine(hash_code_, has_side_input);
diff --git a/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc
index 887ebc5a6c35379476fa1a643c866d38e2b25699..6a56237f67c844a3daa546eb02d64c9e2658f639 100644
--- a/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc
@@ -52,6 +52,7 @@ REGISTER_OP("FusedConv2DBiasActivation")
     .Attr("data_format: {'NHWC', 'NCHW', 'NCHW_VECT_C'} = 'NHWC'")
     .Attr("filter_format: {'HWIO', 'OIHW', 'OIHW_VECT_I'} = 'HWIO'")
     .Attr("activation_mode: {'Relu'} = 'Relu'")
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       using shape_inference::ShapeHandle;
       using shape_inference::DimensionHandle;
@@ -151,6 +152,11 @@ REGISTER_OP("FusedConv2DBiasActivation")
                      kernel_height, kernel_width, input_channels % 4 ]`
     activation_mode: The activation applied to the output.
         Currently must be "Relu".
+    dilations: 1-D tensor of length 4.  The dilation factor for each dimension
+        of `input`. If set to k > 1, there will be k-1 skipped cells between
+        each filter element on that dimension. The dimension order is determined
+        by the value of `data_format`, see above for details. Dilations in the
+        batch and depth dimensions must be 1.
 )doc");
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD
index 1418c87023af0dbff890f46e10f0140d5b89e4b7..b355a79b1a5d967eb82a30d41c073bbb52e0364c 100644
--- a/tensorflow/contrib/gan/BUILD
+++ b/tensorflow/contrib/gan/BUILD
@@ -56,6 +56,7 @@ py_test(
     srcs = ["python/train_test.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":features",
         ":namedtuples",
         ":train",
         "//tensorflow/contrib/framework:framework_py",
@@ -82,6 +83,7 @@ py_library(
     deps = [
         ":classifier_metrics",
         ":eval_utils",
+        ":sliced_wasserstein",
         ":summaries",
         "//tensorflow/python:util",
     ],
@@ -116,6 +118,7 @@ py_library(
     deps = [
         ":clip_weights",
         ":conditioning_utils",
+        ":random_tensor_pool",
         ":virtual_batchnorm",
         "//tensorflow/python:util",
     ],
@@ -219,6 +222,37 @@ py_test(
     ],
 )
 
+py_library(
+    name = "random_tensor_pool",
+    srcs = [
+        "python/features/python/random_tensor_pool.py",
+        "python/features/python/random_tensor_pool_impl.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_test(
+    name = "random_tensor_pool_test",
+    srcs = ["python/features/python/random_tensor_pool_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":random_tensor_pool",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_library(
     name = "virtual_batchnorm",
     srcs = [
@@ -470,6 +504,41 @@ py_test(
     ],
 )
 
+py_library(
+    name = "sliced_wasserstein",
+    srcs = [
+        "python/eval/python/sliced_wasserstein.py",
+        "python/eval/python/sliced_wasserstein_impl.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:linalg_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python:util",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "sliced_wasserstein_test",
+    srcs = ["python/eval/python/sliced_wasserstein_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":sliced_wasserstein",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:random_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/gan/README.md b/tensorflow/contrib/gan/README.md
index 3ab84780705b35567169bd76fd3485ad355ba9d8..4ead66ca13e74bacc0e4679a8d5c4e0f23d04b69 100644
--- a/tensorflow/contrib/gan/README.md
+++ b/tensorflow/contrib/gan/README.md
@@ -8,7 +8,8 @@ explicitly model the distribution and without writing an explicit loss. For
 example, the generator could learn to draw samples from the distribution of
 natural images. For more details on this technique, see
 ['Generative Adversarial Networks'](https://arxiv.org/abs/1406.2661) by
-Goodfellow et al.
+Goodfellow et al. See [tensorflow/models](https://github.com/tensorflow/models/tree/master/research/gan/) for examples, and [this tutorial](https://github.com/tensorflow/models/tree/master/research/gan/tutorial.ipynb) for an
+introduction.
 
 #### Usage
 ```python
@@ -23,8 +24,8 @@ mix TFGAN, native TF, and other custom frameworks
 * Use already implemented [GAN losses and penalties](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/losses/python/losses_impl.py) (ex Wasserstein loss, gradient penalty, mutual information penalty, etc)
 * [Monitor and visualize](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/eval/python/summaries_impl.py) GAN progress during training, and [evaluate](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py) them
 * Use already-implemented [tricks](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/features/python/) to stabilize and improve training
-* Develop based on examples of common GAN setups
-* Use the TFGAN-backed tf.Learn Estimator to easily train a GAN model
+* Develop based on examples of [common GAN setups](https://github.com/tensorflow/models/tree/master/research/gan/)
+* Use the TFGAN-backed [GANEstimator](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py) to easily train a GAN model
 * Improvements in TFGAN infrastructure will automatically benefit your TFGAN project
 * Stay up-to-date with research as we add more algorithms
 
@@ -51,7 +52,7 @@ network to evaluate your unconditional generative model. You can also use
 your own pretrained classifier for more specific performance numbers, or use
 other methods for evaluating conditional generative models.
 
-* examples (coming soon):
+* [examples](https://github.com/tensorflow/models/tree/master/research/gan/) and [tutorial](https://github.com/tensorflow/models/tree/master/research/gan/tutorial.ipynb):
 See examples of how to use TFGAN to make GAN training easier, or use the more complicated examples to jumpstart your
 own project. These include unconditional and conditional GANs, InfoGANs,
 adversarial losses on existing networks, and image-to-image translation.
@@ -98,8 +99,8 @@ gan_model = tfgan.gan_model(
 # Build the GAN loss.
 gan_loss = tfgan.gan_loss(
     gan_model,
-    generator_loss_fn=tfgan_losses.wasserstein_generator_loss,
-    discriminator_loss_fn=tfgan_losses.wasserstein_discriminator_loss)
+    generator_loss_fn=tfgan.losses.wasserstein_generator_loss,
+    discriminator_loss_fn=tfgan.losses.wasserstein_discriminator_loss)
 
 # Create the train ops, which calculate gradients and apply updates to weights.
 train_ops = tfgan.gan_train_ops(
@@ -160,8 +161,8 @@ gan_model = tfgan.gan_model(
 # Build the GAN loss and standard pixel loss.
 gan_loss = tfgan.gan_loss(
     gan_model,
-    generator_loss_fn=tfgan_losses.wasserstein_generator_loss,
-    discriminator_loss_fn=tfgan_losses.wasserstein_discriminator_loss,
+    generator_loss_fn=tfgan.losses.wasserstein_generator_loss,
+    discriminator_loss_fn=tfgan.losses.wasserstein_discriminator_loss,
     gradient_penalty=1.0)
 l1_pixel_loss = tf.norm(gan_model.real_data - gan_model.generated_data, ord=1)
 
@@ -192,8 +193,8 @@ gan_model = tfgan.gan_model(
 # Build the GAN loss and standard pixel loss.
 gan_loss = tfgan.gan_loss(
     gan_model,
-    generator_loss_fn=tfgan_losses.least_squares_generator_loss,
-    discriminator_loss_fn=tfgan_losses.least_squares_discriminator_loss)
+    generator_loss_fn=tfgan.losses.least_squares_generator_loss,
+    discriminator_loss_fn=tfgan.losses.least_squares_discriminator_loss)
 l1_pixel_loss = tf.norm(gan_model.real_data - gan_model.generated_data, ord=1)
 
 # Modify the loss tuple to include the pixel loss.
@@ -222,8 +223,8 @@ gan_model = tfgan.infogan_model(
 # Build the GAN loss with mutual information penalty.
 gan_loss = tfgan.gan_loss(
     gan_model,
-    generator_loss_fn=tfgan_losses.wasserstein_generator_loss,
-    discriminator_loss_fn=tfgan_losses.wasserstein_discriminator_loss,
+    generator_loss_fn=tfgan.losses.wasserstein_generator_loss,
+    discriminator_loss_fn=tfgan.losses.wasserstein_discriminator_loss,
     gradient_penalty=1.0,
     mutual_information_penalty_weight=1.0)
 
diff --git a/tensorflow/contrib/gan/__init__.py b/tensorflow/contrib/gan/__init__.py
index dff361fdc42708ea69999c2def4721f9d49fcf14..f1946c7f925660eae3aaa650c437e03da1f33d6c 100644
--- a/tensorflow/contrib/gan/__init__.py
+++ b/tensorflow/contrib/gan/__init__.py
@@ -12,7 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN grouped API. Please see README.md for details and usage."""
+"""TFGAN is a lightweight library for training and evaluating GANs.
+
+In addition to providing the infrastructure for easily training and evaluating
+GANS, this library contains modules for a TFGAN-backed Estimator,
+evaluation metrics, features (such as virtual batch normalization), and losses.
+Please see README.md for details and usage.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/estimator/__init__.py b/tensorflow/contrib/gan/python/estimator/__init__.py
index 8c4a18228039cb4f2c06e0333f4b8408f1f631e9..c9f7bc61b25230e4159cf8cbc7c9cceead0aa706 100644
--- a/tensorflow/contrib/gan/python/estimator/__init__.py
+++ b/tensorflow/contrib/gan/python/estimator/__init__.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN grouped API. Please see README.md for details and usage."""
+"""TFGAN estimator module.
+
+GANEstimator provides all the infrastructure support of a TensorFlow Estimator
+with the feature support of TFGAN.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
index 0824ecf616caa91938c365d0c117287ed9ea8f32..d3dca3d9e75fe1ef3be67143e18c0b51e84ad24c 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import enum
 
 from tensorflow.contrib.framework.python.ops import variables as variable_lib
@@ -29,6 +30,7 @@ from tensorflow.python.estimator import estimator
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.util import tf_inspect as inspect
 
 
 __all__ = [
@@ -105,6 +107,7 @@ class GANEstimator(estimator.Estimator):
                discriminator_loss_fn=None,
                generator_optimizer=None,
                discriminator_optimizer=None,
+               get_hooks_fn=None,
                add_summaries=None,
                use_loss_summaries=True,
                config=None):
@@ -116,7 +119,10 @@ class GANEstimator(estimator.Estimator):
         to continue training a previously saved model.
       generator_fn: A python function that takes a Tensor, Tensor list, or
         Tensor dictionary as inputs and returns the outputs of the GAN
-        generator. See `TFGAN` for more details and examples.
+        generator. See `TFGAN` for more details and examples. Additionally, if
+        it has an argument called `mode`, the Estimator's `mode` will be passed
+        in (ex TRAIN, EVAL, PREDICT). This is useful for things like batch
+        normalization.
       discriminator_fn: A python function that takes the output of
         `generator_fn` or real data in the GAN setup, and `generator_inputs`.
         Outputs a Tensor in the range [-inf, inf]. See `TFGAN` for more details
@@ -132,6 +138,10 @@ class GANEstimator(estimator.Estimator):
         work.
       discriminator_optimizer: Same as `generator_optimizer`, but for the
         discriminator updates.
+      get_hooks_fn: A function that takes a `GANTrainOps` tuple and returns a
+        list of hooks. These hooks are run on the generator and discriminator
+        train ops, and can be used to implement the GAN training scheme.
+        Defaults to `train.get_sequential_train_hooks()`.
       add_summaries: `None`, a single `SummaryType`, or a list of `SummaryType`.
       use_loss_summaries: If `True`, add loss summaries. If `False`, does not.
         If `None`, uses defaults.
@@ -146,7 +156,7 @@ class GANEstimator(estimator.Estimator):
               else discriminator_optimizer)
       gan_head = head_lib.gan_head(
           generator_loss_fn, discriminator_loss_fn, gopt, dopt,
-          use_loss_summaries)
+          use_loss_summaries, get_hooks_fn=get_hooks_fn)
       return _gan_model_fn(
           features, labels, mode, generator_fn, discriminator_fn, gan_head,
           add_summaries)
@@ -155,11 +165,6 @@ class GANEstimator(estimator.Estimator):
         model_fn=_model_fn, model_dir=model_dir, config=config)
 
 
-def _use_check_shapes(real_data):
-  """Determines whether TFGAN should check Tensor shapes."""
-  return isinstance(real_data, ops.Tensor)
-
-
 def _gan_model_fn(
     features,
     labels,
@@ -225,16 +230,19 @@ def _gan_model_fn(
       labels=None)
 
 
-def _make_train_gan_model(generator_fn, discriminator_fn, real_data,
-                          generator_inputs, generator_scope, add_summaries):
-  """Make a `GANModel` for training."""
+def _make_gan_model(generator_fn, discriminator_fn, real_data,
+                    generator_inputs, generator_scope, add_summaries, mode):
+  """Make a `GANModel`, and optionally pass in `mode`."""
+  # If `generator_fn` has an argument `mode`, pass mode to it.
+  if 'mode' in inspect.getargspec(generator_fn).args:
+    generator_fn = functools.partial(generator_fn, mode=mode)
   gan_model = tfgan_train.gan_model(
       generator_fn,
       discriminator_fn,
       real_data,
       generator_inputs,
       generator_scope=generator_scope,
-      check_shapes=_use_check_shapes(real_data))
+      check_shapes=False)
   if add_summaries:
     if not isinstance(add_summaries, (tuple, list)):
       add_summaries = [add_summaries]
@@ -245,15 +253,28 @@ def _make_train_gan_model(generator_fn, discriminator_fn, real_data,
   return gan_model
 
 
+def _make_train_gan_model(generator_fn, discriminator_fn, real_data,
+                          generator_inputs, generator_scope, add_summaries):
+  """Make a `GANModel` for training."""
+  return _make_gan_model(generator_fn, discriminator_fn, real_data,
+                         generator_inputs, generator_scope, add_summaries,
+                         model_fn_lib.ModeKeys.TRAIN)
+
+
 def _make_eval_gan_model(generator_fn, discriminator_fn, real_data,
                          generator_inputs, generator_scope, add_summaries):
   """Make a `GANModel` for evaluation."""
-  return _make_train_gan_model(generator_fn, discriminator_fn, real_data,
-                               generator_inputs, generator_scope, add_summaries)
+  return _make_gan_model(generator_fn, discriminator_fn, real_data,
+                         generator_inputs, generator_scope, add_summaries,
+                         model_fn_lib.ModeKeys.EVAL)
 
 
 def _make_prediction_gan_model(generator_inputs, generator_fn, generator_scope):
   """Make a `GANModel` from just the generator."""
+  # If `generator_fn` has an argument `mode`, pass mode to it.
+  if 'mode' in inspect.getargspec(generator_fn).args:
+    generator_fn = functools.partial(generator_fn,
+                                     mode=model_fn_lib.ModeKeys.PREDICT)
   with variable_scope.variable_scope(generator_scope) as gen_scope:
     generator_inputs = tfgan_train._convert_tensor_or_l_or_d(generator_inputs)  # pylint:disable=protected-access
     generated_data = generator_fn(generator_inputs)
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
index 1bfdce9ee94d4d05d5186cd999361662bc0e3f85..e752f0bcccda418b79d4fdabb27807394cbbb425 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
@@ -48,7 +48,8 @@ from tensorflow.python.training import training
 from tensorflow.python.training import training_util
 
 
-def generator_fn(noise_dict):
+def generator_fn(noise_dict, mode):
+  del mode
   noise = noise_dict['x']
   return layers.fully_connected(noise, noise.shape[1].value)
 
@@ -90,7 +91,6 @@ def mock_head(testcase, expected_generator_inputs, expected_real_data,
         generator_var_names,
         set([x.name for x in gan_model.generator_variables]))
     testcase.assertEqual(generator_scope_name, gan_model.generator_scope.name)
-    testcase.assertEqual(generator_fn, gan_model.generator_fn)
     testcase.assertEqual(_or_none(expected_real_data), gan_model.real_data)
     # TODO(joelshor): Add check on `discriminator_real_outputs`.
     # TODO(joelshor): Add check on `discriminator_gen_outputs`.
diff --git a/tensorflow/contrib/gan/python/estimator/python/head_impl.py b/tensorflow/contrib/gan/python/estimator/python/head_impl.py
index 204c646e194319c0e63599da0b2a4909ef270ef3..a21358c50bbdb4a1a929b0c5bc322cec4c9923b5 100644
--- a/tensorflow/contrib/gan/python/estimator/python/head_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/head_impl.py
@@ -71,7 +71,7 @@ class GANHead(head._Head):  # pylint: disable=protected-access
   def __init__(self, generator_loss_fn, discriminator_loss_fn,
                generator_optimizer, discriminator_optimizer,
                use_loss_summaries=True,
-               get_hooks_fn=tfgan_train.get_sequential_train_hooks(),
+               get_hooks_fn=None,
                name=None):
     """`Head` for GAN training.
 
@@ -86,10 +86,12 @@ class GANHead(head._Head):  # pylint: disable=protected-access
       use_loss_summaries: If `True`, add loss summaries. If `False`, does not.
         If `None`, uses defaults.
       get_hooks_fn: A function that takes a GANTrainOps tuple and returns a list
-        of hooks.
+        of hooks. Defaults to `train.get_sequential_train_hooks()`
       name: name of the head. If provided, summary and metrics keys will be
         suffixed by `"/" + name`.
     """
+    if get_hooks_fn is None:
+      get_hooks_fn = tfgan_train.get_sequential_train_hooks()
     # TODO(joelshor): Validate inputs.
 
     if use_loss_summaries in [True, False]:
diff --git a/tensorflow/contrib/gan/python/eval/__init__.py b/tensorflow/contrib/gan/python/eval/__init__.py
index bb8046187807d0cc584f7174eb9aac578855c110..f86b8513053a45f9830411f7df2c32d1f36a97b2 100644
--- a/tensorflow/contrib/gan/python/eval/__init__.py
+++ b/tensorflow/contrib/gan/python/eval/__init__.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN grouped API. Please see README.md for details and usage."""
+"""TFGAN evaluation module.
+
+This module supports techniques such as Inception Score, Frechet Inception
+distance, and Sliced Wasserstein distance.
+"""
 # pylint: disable=,wildcard-import,unused-import
 
 from __future__ import absolute_import
@@ -22,10 +26,12 @@ from __future__ import print_function
 # Collapse eval into a single namespace.
 from tensorflow.contrib.gan.python.eval.python import classifier_metrics
 from tensorflow.contrib.gan.python.eval.python import eval_utils
+from tensorflow.contrib.gan.python.eval.python import sliced_wasserstein
 from tensorflow.contrib.gan.python.eval.python import summaries
 
 from tensorflow.contrib.gan.python.eval.python.classifier_metrics import *
 from tensorflow.contrib.gan.python.eval.python.eval_utils import *
+from tensorflow.contrib.gan.python.eval.python.sliced_wasserstein import *
 from tensorflow.contrib.gan.python.eval.python.summaries import *
 # pylint: enable=wildcard-import,unused-import
 
@@ -33,7 +39,10 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
     'classifier_metrics',
+    'sliced_wasserstein_distance',
     'summaries',
     'eval_utils',
-] + classifier_metrics.__all__ + summaries.__all__ + eval_utils.__all__
+] + (
+    classifier_metrics.__all__ + sliced_wasserstein.__all__ +
+    summaries.__all__ + eval_utils.__all__)
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
index bb65f05b5a17e9a872e41d1dcb05aeb3cd6f6f40..82293b575aefa198a618ae7286ca24ebabd6987d 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
@@ -57,8 +57,10 @@ __all__ = [
     'run_inception',
     'inception_score',
     'classifier_score',
+    'classifier_score_from_logits',
     'frechet_inception_distance',
     'frechet_classifier_distance',
+    'frechet_classifier_distance_from_activations',
     'INCEPTION_DEFAULT_IMAGE_SIZE',
 ]
 
@@ -222,13 +224,13 @@ def run_inception(images,
     image_size: Required image width and height. See unit tests for the default
       values.
     input_tensor: Name of input Tensor.
-    output_tensor: Name of output Tensor. This function will compute activations
-      at the specified layer. Examples include INCEPTION_V3_OUTPUT and
-      INCEPTION_V3_FINAL_POOL which would result in this function computing
+    output_tensor: Name or list of output Tensors. This function will compute
+      activations at the specified layer. Examples include INCEPTION_V3_OUTPUT
+      and INCEPTION_V3_FINAL_POOL which would result in this function computing
       the final logits or the penultimate pooling layer.
 
   Returns:
-    Logits.
+    Tensor or Tensors corresponding to computed `output_tensor`.
 
   Raises:
     ValueError: If images are not the correct size.
@@ -244,8 +246,14 @@ def run_inception(images,
 
   activations = run_image_classifier(images, graph_def, input_tensor,
                                      output_tensor)
-  if array_ops.rank(activations) != 2:
-    activations = layers.flatten(activations)
+  if isinstance(activations, list):
+    for i, activation in enumerate(activations):
+      if array_ops.rank(activation) != 2:
+        activations[i] = layers.flatten(activation)
+  else:
+    if array_ops.rank(activations) != 2:
+      activations = layers.flatten(activations)
+
   return activations
 
 
@@ -257,23 +265,26 @@ def run_image_classifier(tensor, graph_def, input_tensor,
     tensor: An Input tensor.
     graph_def: A GraphDef proto.
     input_tensor: Name of input tensor in graph def.
-    output_tensor: Name of output tensor in graph def.
+    output_tensor: A tensor name or list of tensor names in graph def.
     scope: Name scope for classifier.
 
   Returns:
-    Classifier output. Shape depends on the classifier used, but is often
-    [batch, classes].
+    Classifier output if `output_tensor` is a string, or a list of outputs if
+    `output_tensor` is a list.
 
   Raises:
-    ValueError: If `image_size` is not `None`, and `tensor` are not the correct
-      size.
+    ValueError: If `input_tensor` or `output_tensor` aren't in the graph_def.
   """
   input_map = {input_tensor: tensor}
-  return_elements = [output_tensor]
-  classifier_output = importer.import_graph_def(
-      graph_def, input_map, return_elements, name=scope)[0]
+  is_singleton = isinstance(output_tensor, str)
+  if is_singleton:
+    output_tensor = [output_tensor]
+  classifier_outputs = importer.import_graph_def(
+      graph_def, input_map, output_tensor, name=scope)
+  if is_singleton:
+    classifier_outputs = classifier_outputs[0]
 
-  return classifier_output
+  return classifier_outputs
 
 
 def classifier_score(images, classifier_fn, num_batches=1):
@@ -312,6 +323,30 @@ def classifier_score(images, classifier_fn, num_batches=1):
       swap_memory=True,
       name='RunClassifier')
   logits = array_ops.concat(array_ops.unstack(logits), 0)
+
+  return classifier_score_from_logits(logits)
+
+
+def classifier_score_from_logits(logits):
+  """Classifier score for evaluating a conditional generative model.
+
+  This is based on the Inception Score, but for an arbitrary classifier.
+
+  This technique is described in detail in https://arxiv.org/abs/1606.03498. In
+  summary, this function calculates
+
+  exp( E[ KL(p(y|x) || p(y)) ] )
+
+  which captures how different the network's classification prediction is from
+  the prior distribution over classes.
+
+  Args:
+    logits: A 2D Tensor of logits.
+
+  Returns:
+    The classifier score. A floating-point scalar of the same type as the output
+    of `logits`.
+  """
   logits.shape.assert_has_rank(2)
 
   # Use maximum precision for best results.
@@ -436,31 +471,71 @@ def frechet_classifier_distance(real_images,
       swap_memory=True,
       name='RunClassifier')
 
-  activations_dtype = activations.dtype
   # Split the activations by the real and generated images.
   real_a, gen_a = array_ops.split(activations, [num_batches, num_batches], 0)
 
   # Ensure the activations have the right shapes.
   real_a = array_ops.concat(array_ops.unstack(real_a), 0)
   gen_a = array_ops.concat(array_ops.unstack(gen_a), 0)
-  if activations_dtype != dtypes.float64:
-    real_a = math_ops.to_double(real_a)
-    gen_a = math_ops.to_double(gen_a)
 
-  real_a.shape.assert_has_rank(2)
-  gen_a.shape.assert_has_rank(2)
+  return frechet_classifier_distance_from_activations(real_a, gen_a)
+
+
+def frechet_classifier_distance_from_activations(
+    real_activations, generated_activations):
+  """Classifier distance for evaluating a generative model.
+
+  This is based on the Frechet Inception distance, but for an arbitrary
+  classifier.
+
+  This technique is described in detail in https://arxiv.org/abs/1706.08500.
+  Given two Gaussian distribution with means m and m_w and covariance matrices
+  C and C_w, this function calcuates
+
+  |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))
+
+  which captures how different the distributions of real images and generated
+  images (or more accurately, their visual features) are. Note that unlike the
+  Inception score, this is a true distance and utilizes information about real
+  world images.
+
+  Note that when computed using sample means and sample covariance matrices,
+  Frechet distance is biased. It is more biased for small sample sizes. (e.g.
+  even if the two distributions are the same, for a small sample size, the
+  expected Frechet distance is large). It is important to use the same
+  sample size to compute frechet classifier distance when comparing two
+  generative models.
+
+  Args:
+    real_activations: Real images to use to compute Frechet Inception distance.
+    generated_activations: Generated images to use to compute Frechet Inception
+      distance.
+
+  Returns:
+    The Frechet Inception distance. A floating-point scalar of the same type
+    as the output of the activations.
+  """
+  real_activations.shape.assert_has_rank(2)
+  generated_activations.shape.assert_has_rank(2)
+
+  activations_dtype = real_activations.dtype
+  if activations_dtype != dtypes.float64:
+    real_activations = math_ops.to_double(real_activations)
+    generated_activations = math_ops.to_double(generated_activations)
 
   # Compute mean and covariance matrices of activations.
-  m = math_ops.reduce_mean(real_a, 0)
-  m_v = math_ops.reduce_mean(gen_a, 0)
-  num_examples = math_ops.to_double(array_ops.shape(real_a)[0])
+  m = math_ops.reduce_mean(real_activations, 0)
+  m_v = math_ops.reduce_mean(generated_activations, 0)
+  num_examples = math_ops.to_double(array_ops.shape(real_activations)[0])
 
   # sigma = (1 / (n - 1)) * (X - mu) (X - mu)^T
+  real_centered = real_activations - m
   sigma = math_ops.matmul(
-      real_a - m, real_a - m, transpose_a=True) / (num_examples - 1)
+      real_centered, real_centered, transpose_a=True) / (num_examples - 1)
 
+  gen_centered = generated_activations - m_v
   sigma_v = math_ops.matmul(
-      gen_a - m_v, gen_a - m_v, transpose_a=True) / (num_examples - 1)
+      gen_centered, gen_centered, transpose_a=True) / (num_examples - 1)
 
   # Find the Tr(sqrt(sigma sigma_v)) component of FID
   sqrt_trace_component = trace_sqrt_product(sigma, sigma_v)
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
index 92e0a995748c1c4c2ddfff0daae59be5a6eaefb4..1e18c699ba93b5f524341c65d0a2db84556b65a2 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
@@ -190,6 +190,23 @@ class ClassifierMetricsTest(test.TestCase):
     # Check that none of the model variables are trainable.
     self.assertListEqual([], variables.trainable_variables())
 
+  def test_run_inception_multiple_outputs(self):
+    """Test `run_inception` graph construction with multiple outputs."""
+    batch_size = 3
+    img = array_ops.ones([batch_size, 299, 299, 3])
+    logits, pool = _run_with_mock(
+        classifier_metrics.run_inception, img,
+        output_tensor=[classifier_metrics.INCEPTION_OUTPUT,
+                       classifier_metrics.INCEPTION_FINAL_POOL])
+
+    self.assertTrue(isinstance(logits, ops.Tensor))
+    self.assertTrue(isinstance(pool, ops.Tensor))
+    logits.shape.assert_is_compatible_with([batch_size, 1001])
+    pool.shape.assert_is_compatible_with([batch_size, 2048])
+
+    # Check that none of the model variables are trainable.
+    self.assertListEqual([], variables.trainable_variables())
+
   def test_inception_score_graph(self):
     """Test `inception_score` graph construction."""
     score = _run_with_mock(classifier_metrics.inception_score,
diff --git a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein.py b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein.py
new file mode 100644
index 0000000000000000000000000000000000000000..523968bed91f1021ae629bf52c405cf5c2d7b917
--- /dev/null
+++ b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein.py
@@ -0,0 +1,28 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model evaluation tools for TFGAN."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.gan.python.eval.python import sliced_wasserstein_impl
+# pylint: disable=wildcard-import
+from tensorflow.contrib.gan.python.eval.python.sliced_wasserstein_impl import *
+# pylint: enable=wildcard-import
+from tensorflow.python.util.all_util import remove_undocumented
+
+__all__ = sliced_wasserstein_impl.__all__
+remove_undocumented(__name__, __all__)
diff --git a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bebcacbe46d85fc4226c4275b71b3ecbde57a97
--- /dev/null
+++ b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py
@@ -0,0 +1,282 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of Sliced Wasserstein Distance.
+
+Proposed in https://arxiv.org/abs/1710.10196 and the official Theano
+implementation that we used as reference can be found here:
+https://github.com/tkarras/progressive_growing_of_gans
+
+Note: this is not an exact distance but an approximation through random
+projections.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import script_ops
+
+__all__ = ['sliced_wasserstein_distance']
+_GAUSSIAN_FILTER = np.float32([[1, 4, 6, 4, 1], [4, 16, 24, 16, 4], [
+    6, 24, 36, 24, 6
+], [4, 16, 24, 16, 4], [1, 4, 6, 4, 1]]).reshape([5, 5, 1, 1]) / 256.0
+
+
+def _laplacian_pyramid(batch, num_levels):
+  """Compute a Laplacian pyramid.
+
+  Args:
+      batch: (tensor) The batch of images (batch, height, width, channels).
+      num_levels: (int) Desired number of hierarchical levels.
+  Returns:
+      List of tensors from the highest to lowest resolution.
+  """
+  gaussian_filter = constant_op.constant(_GAUSSIAN_FILTER)
+
+  def spatial_conv(batch, gain):
+    s = array_ops.shape(batch)
+    padded = array_ops.pad(batch, [[0, 0], [2, 2], [2, 2], [0, 0]], 'REFLECT')
+    xt = array_ops.transpose(padded, [0, 3, 1, 2])
+    xt = array_ops.reshape(xt, [s[0] * s[3], s[1] + 4, s[2] + 4, 1])
+    conv_out = nn_ops.conv2d(xt, gaussian_filter * gain, [1] * 4, 'VALID')
+    conv_xt = array_ops.reshape(conv_out, [s[0], s[3], s[1], s[2]])
+    conv_xt = array_ops.transpose(conv_xt, [0, 2, 3, 1])
+    return conv_xt
+
+  def pyr_down(batch):  # matches cv2.pyrDown()
+    return spatial_conv(batch, 1)[:, ::2, ::2]
+
+  def pyr_up(batch):  # matches cv2.pyrUp()
+    s = array_ops.shape(batch)
+    zeros = array_ops.zeros([3 * s[0], s[1], s[2], s[3]])
+    res = array_ops.concat([batch, zeros], 0)
+    res = array_ops.batch_to_space(res, crops=[[0, 0], [0, 0]], block_size=2)
+    res = spatial_conv(res, 4)
+    return res
+
+  pyramid = [math_ops.to_float(batch)]
+  for _ in range(1, num_levels):
+    pyramid.append(pyr_down(pyramid[-1]))
+    pyramid[-2] -= pyr_up(pyramid[-1])
+  return pyramid
+
+
+def _batch_to_patches(batch, patches_per_image, patch_size):
+  """Extract patches from a batch.
+
+  Args:
+      batch: (tensor) The batch of images (batch, height, width, channels).
+      patches_per_image: (int) Number of patches to extract per image.
+      patch_size: (int) Size of the patches (size, size, channels) to extract.
+  Returns:
+      Tensor (batch*patches_per_image, patch_size, patch_size, channels) of
+      patches.
+  """
+
+  def py_func_random_patches(batch):
+    """Numpy wrapper."""
+    batch_size, height, width, channels = batch.shape
+    patch_count = patches_per_image * batch_size
+    hs = patch_size // 2
+    # Randomly pick patches.
+    patch_id, y, x, chan = np.ogrid[0:patch_count, -hs:hs + 1, -hs:hs + 1, 0:3]
+    img_id = patch_id // patches_per_image
+    # pylint: disable=g-no-augmented-assignment
+    # Need explicit addition for broadcast to work properly.
+    y = y + np.random.randint(hs, height - hs, size=(patch_count, 1, 1, 1))
+    x = x + np.random.randint(hs, width - hs, size=(patch_count, 1, 1, 1))
+    # pylint: enable=g-no-augmented-assignment
+    idx = ((img_id * height + y) * width + x) * channels + chan
+    patches = batch.flat[idx]
+    return patches
+
+  patches = script_ops.py_func(
+      py_func_random_patches, [batch], batch.dtype, stateful=False)
+  return patches
+
+
+def _normalize_patches(patches):
+  """Normalize patches by their mean and standard deviation.
+
+  Args:
+      patches: (tensor) The batch of patches (batch, size, size, channels).
+  Returns:
+      Tensor (batch, size, size, channels) of the normalized patches.
+  """
+  patches = array_ops.concat(patches, 0)
+  mean, variance = nn.moments(patches, [1, 2, 3], keep_dims=True)
+  patches = (patches - mean) / math_ops.sqrt(variance)
+  return array_ops.reshape(patches, [array_ops.shape(patches)[0], -1])
+
+
+def _sort_rows(matrix, num_rows):
+  """Sort matrix rows by the last column.
+
+  Args:
+      matrix: a matrix of values (row,col).
+      num_rows: (int) number of sorted rows to return from the matrix.
+  Returns:
+      Tensor (num_rows, col) of the sorted matrix top K rows.
+  """
+  tmatrix = array_ops.transpose(matrix, [1, 0])
+  sorted_tmatrix = nn_ops.top_k(tmatrix, num_rows)[0]
+  return array_ops.transpose(sorted_tmatrix, [1, 0])
+
+
+def _sliced_wasserstein(a, b, random_sampling_count, random_projection_dim):
+  """Compute the approximate sliced Wasserstein distance.
+
+  Args:
+      a: (matrix) Distribution "a" of samples (row, col).
+      b: (matrix) Distribution "b" of samples (row, col).
+      random_sampling_count: (int) Number of random projections to average.
+      random_projection_dim: (int) Dimension of the random projection space.
+  Returns:
+      Float containing the approximate distance between "a" and "b".
+  """
+  s = array_ops.shape(a)
+  means = []
+  for _ in range(random_sampling_count):
+    # Random projection matrix.
+    proj = random_ops.random_normal(
+        [array_ops.shape(a)[1], random_projection_dim])
+    proj *= math_ops.rsqrt(
+        math_ops.reduce_sum(math_ops.square(proj), 0, keep_dims=True))
+    # Project both distributions and sort them.
+    proj_a = math_ops.matmul(a, proj)
+    proj_b = math_ops.matmul(b, proj)
+    proj_a = _sort_rows(proj_a, s[0])
+    proj_b = _sort_rows(proj_b, s[0])
+    # Pairwise Wasserstein distance.
+    wdist = math_ops.reduce_mean(math_ops.abs(proj_a - proj_b))
+    means.append(wdist)
+  return math_ops.reduce_mean(means)
+
+
+def _sliced_wasserstein_svd(a, b):
+  """Compute the approximate sliced Wasserstein distance using an SVD.
+
+  This is not part of the paper, it's a variant with possibly more accurate
+  measure.
+
+  Args:
+      a: (matrix) Distribution "a" of samples (row, col).
+      b: (matrix) Distribution "b" of samples (row, col).
+  Returns:
+      Float containing the approximate distance between "a" and "b".
+  """
+  s = array_ops.shape(a)
+  # Random projection matrix.
+  sig, u = linalg_ops.svd(array_ops.concat([a, b], 0))[:2]
+  proj_a, proj_b = array_ops.split(u * sig, 2, axis=0)
+  proj_a = _sort_rows(proj_a[:, ::-1], s[0])
+  proj_b = _sort_rows(proj_b[:, ::-1], s[0])
+  # Pairwise Wasserstein distance.
+  wdist = math_ops.reduce_mean(math_ops.abs(proj_a - proj_b))
+  return wdist
+
+
+def sliced_wasserstein_distance(real_images,
+                                fake_images,
+                                resolution_min=16,
+                                patches_per_image=64,
+                                patch_size=7,
+                                random_sampling_count=1,
+                                random_projection_dim=7 * 7 * 3,
+                                use_svd=False):
+  """Compute the Wasserstein distance between two distributions of images.
+
+  Note that measure vary with the number of images. Use 8192 images to get
+  numbers comparable to the ones in the original paper.
+
+  Args:
+      real_images: (tensor) Real images (batch, height, width, channels).
+      fake_images: (tensor) Fake images (batch, height, width, channels).
+      resolution_min: (int) Minimum resolution for the Laplacion pyramid.
+      patches_per_image: (int) Number of patches to extract per image per
+        Laplacian level.
+      patch_size: (int) Width of a square patch.
+      random_sampling_count: (int) Number of random projections to average.
+      random_projection_dim: (int) Dimension of the random projection space.
+      use_svd: experimental method to compute a more accurate distance.
+  Returns:
+      List of tuples (distance_real, distance_fake) for each level of the
+      Laplacian pyramid from the highest resoluion to the lowest.
+        distance_real is the Wasserstein distance between real images
+        distance_fake is the Wasserstein distance between real and fake images.
+  Raises:
+      ValueError: If the inputs shapes are incorrect. Input tensor dimensions
+      (batch, height, width, channels) are expected to be known at graph
+      construction time. In addition height and width must be the same and the
+      number of colors should be exactly 3. Real and fake images must have the
+      same size.
+  """
+  height = real_images.shape[1]
+  real_images.shape.assert_is_compatible_with([None, None, height, 3])
+  fake_images.shape.assert_is_compatible_with(real_images.shape)
+
+  # Select resolutions.
+  resolution_full = int(height)
+  resolution_min = min(resolution_min, resolution_full)
+  resolution_max = resolution_full
+  # Base loss of detail.
+  resolutions = [
+      2**i
+      for i in range(
+          int(np.log2(resolution_max)),
+          int(np.log2(resolution_min)) - 1, -1)
+  ]
+
+  # Gather patches for each level of the Laplacian pyramids.
+  patches_real, patches_fake, patches_test = (
+      [[] for _ in resolutions] for _ in range(3))
+  for lod, level in enumerate(
+      _laplacian_pyramid(real_images, len(resolutions))):
+    patches_real[lod].append(
+        _batch_to_patches(level, patches_per_image, patch_size))
+    patches_test[lod].append(
+        _batch_to_patches(level, patches_per_image, patch_size))
+
+  for lod, level in enumerate(
+      _laplacian_pyramid(fake_images, len(resolutions))):
+    patches_fake[lod].append(
+        _batch_to_patches(level, patches_per_image, patch_size))
+
+  for lod in range(len(resolutions)):
+    for patches in [patches_real, patches_test, patches_fake]:
+      patches[lod] = _normalize_patches(patches[lod])
+
+  # Evaluate scores.
+  scores = []
+  for lod in range(len(resolutions)):
+    if not use_svd:
+      scores.append(
+          (_sliced_wasserstein(patches_real[lod], patches_test[lod],
+                               random_sampling_count, random_projection_dim),
+           _sliced_wasserstein(patches_real[lod], patches_fake[lod],
+                               random_sampling_count, random_projection_dim)))
+    else:
+      scores.append(
+          (_sliced_wasserstein_svd(patches_real[lod], patches_test[lod]),
+           _sliced_wasserstein_svd(patches_real[lod], patches_fake[lod])))
+  return scores
diff --git a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_test.py b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b960af28eaa969079b72c7aabcde2ad6cd1f5c68
--- /dev/null
+++ b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_test.py
@@ -0,0 +1,131 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Sliced Wasserstein Distance."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from scipy import ndimage
+from tensorflow.contrib.gan.python.eval.python import sliced_wasserstein_impl as swd
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.platform import test
+
+
+class ClassifierMetricsTest(test.TestCase):
+
+  def test_laplacian_pyramid(self):
+    # The numpy/scipy code for reference estimation comes from:
+    # https://github.com/tkarras/progressive_growing_of_gans
+    gaussian_filter = np.float32([[1, 4, 6, 4, 1], [4, 16, 24, 16, 4], [
+        6, 24, 36, 24, 6
+    ], [4, 16, 24, 16, 4], [1, 4, 6, 4, 1]]) / 256.0
+
+    def np_pyr_down(minibatch):  # matches cv2.pyrDown()
+      assert minibatch.ndim == 4
+      return ndimage.convolve(
+          minibatch,
+          gaussian_filter[np.newaxis, np.newaxis, :, :],
+          mode='mirror')[:, :, ::2, ::2]
+
+    def np_pyr_up(minibatch):  # matches cv2.pyrUp()
+      assert minibatch.ndim == 4
+      s = minibatch.shape
+      res = np.zeros((s[0], s[1], s[2] * 2, s[3] * 2), minibatch.dtype)
+      res[:, :, ::2, ::2] = minibatch
+      return ndimage.convolve(
+          res,
+          gaussian_filter[np.newaxis, np.newaxis, :, :] * 4.0,
+          mode='mirror')
+
+    def np_laplacian_pyramid(minibatch, num_levels):
+      # Note: there's a bug in the original SWD, fixed repeatability.
+      pyramid = [minibatch.astype('f').copy()]
+      for _ in range(1, num_levels):
+        pyramid.append(np_pyr_down(pyramid[-1]))
+        pyramid[-2] -= np_pyr_up(pyramid[-1])
+      return pyramid
+
+    data = np.random.normal(size=[256, 3, 32, 32]).astype('f')
+    pyramid = np_laplacian_pyramid(data, 3)
+    data_tf = array_ops.placeholder(dtypes.float32, [256, 32, 32, 3])
+    pyramid_tf = swd._laplacian_pyramid(data_tf, 3)
+    with self.test_session() as sess:
+      pyramid_tf = sess.run(
+          pyramid_tf, feed_dict={
+              data_tf: data.transpose(0, 2, 3, 1)
+          })
+    for x in range(3):
+      self.assertAllClose(
+          pyramid[x].transpose(0, 2, 3, 1), pyramid_tf[x], atol=1e-6)
+
+  def test_sliced_wasserstein_distance(self):
+    """Test the distance."""
+    d1 = random_ops.random_uniform([256, 32, 32, 3])
+    d2 = random_ops.random_normal([256, 32, 32, 3])
+    wfunc = swd.sliced_wasserstein_distance(d1, d2)
+    with self.test_session() as sess:
+      wscores = [sess.run(x) for x in wfunc]
+    self.assertAllClose(
+        np.array([0.014, 0.014], 'f'),
+        np.array([x[0] for x in wscores], 'f'),
+        rtol=0.1)
+    self.assertAllClose(
+        np.array([0.014, 0.020], 'f'),
+        np.array([x[1] for x in wscores], 'f'),
+        rtol=0.1)
+
+  def test_sliced_wasserstein_distance_svd(self):
+    """Test the distance."""
+    d1 = random_ops.random_uniform([256, 32, 32, 3])
+    d2 = random_ops.random_normal([256, 32, 32, 3])
+    wfunc = swd.sliced_wasserstein_distance(d1, d2, use_svd=True)
+    with self.test_session() as sess:
+      wscores = [sess.run(x) for x in wfunc]
+    self.assertAllClose(
+        np.array([0.013, 0.013], 'f'),
+        np.array([x[0] for x in wscores], 'f'),
+        rtol=0.15)
+    self.assertAllClose(
+        np.array([0.014, 0.019], 'f'),
+        np.array([x[1] for x in wscores], 'f'),
+        rtol=0.15)
+
+  def test_swd_mismatched(self):
+    """Test the inputs mismatched shapes are detected."""
+    d1 = random_ops.random_uniform([256, 32, 32, 3])
+    d2 = random_ops.random_normal([256, 32, 31, 3])
+    d3 = random_ops.random_normal([256, 31, 32, 3])
+    d4 = random_ops.random_normal([255, 32, 32, 3])
+    with self.assertRaises(ValueError):
+      swd.sliced_wasserstein_distance(d1, d2)
+    with self.assertRaises(ValueError):
+      swd.sliced_wasserstein_distance(d1, d3)
+    with self.assertRaises(ValueError):
+      swd.sliced_wasserstein_distance(d1, d4)
+
+  def test_swd_not_rgb(self):
+    """Test that only RGB is supported."""
+    d1 = random_ops.random_uniform([256, 32, 32, 1])
+    d2 = random_ops.random_normal([256, 32, 32, 1])
+    with self.assertRaises(ValueError):
+      swd.sliced_wasserstein_distance(d1, d2)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/gan/python/features/__init__.py b/tensorflow/contrib/gan/python/features/__init__.py
index 6d0972f8db418d6fcf517cc6f7e96093ae08a9e4..4816daf760143af9f1502873b123ffad8e5ec8ce 100644
--- a/tensorflow/contrib/gan/python/features/__init__.py
+++ b/tensorflow/contrib/gan/python/features/__init__.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN grouped API. Please see README.md for details and usage."""
+"""TFGAN features module.
+
+This module includes support for virtual batch normalization, buffer replay,
+conditioning, etc.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -22,10 +26,12 @@ from __future__ import print_function
 # pylint: disable=unused-import,wildcard-import
 from tensorflow.contrib.gan.python.features.python import clip_weights
 from tensorflow.contrib.gan.python.features.python import conditioning_utils
+from tensorflow.contrib.gan.python.features.python import random_tensor_pool
 from tensorflow.contrib.gan.python.features.python import virtual_batchnorm
 
 from tensorflow.contrib.gan.python.features.python.clip_weights import *
 from tensorflow.contrib.gan.python.features.python.conditioning_utils import *
+from tensorflow.contrib.gan.python.features.python.random_tensor_pool import *
 from tensorflow.contrib.gan.python.features.python.virtual_batchnorm import *
 # pylint: enable=unused-import,wildcard-import
 
@@ -33,5 +39,6 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = clip_weights.__all__
 _allowed_symbols += conditioning_utils.__all__
+_allowed_symbols += random_tensor_pool.__all__
 _allowed_symbols += virtual_batchnorm.__all__
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/gan/python/features/python/clip_weights_test.py b/tensorflow/contrib/gan/python/features/python/clip_weights_test.py
index 030e37ec679ec58e3b534fd3644ffe1d23173404..2b7bb5f14e7f3d1b3f913d3426efaaae19079ffb 100644
--- a/tensorflow/contrib/gan/python/features/python/clip_weights_test.py
+++ b/tensorflow/contrib/gan/python/features/python/clip_weights_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for tfgan.python.features.clip_weights."""
+"""Tests for features.clip_weights."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -31,17 +31,18 @@ class ClipWeightsTest(test.TestCase):
   """Tests for `discriminator_weight_clip`."""
 
   def setUp(self):
+    super(ClipWeightsTest, self).setUp()
     self.variables = [variables.Variable(2.0)]
     self.tuple = collections.namedtuple(
         'VarTuple', ['discriminator_variables'])(self.variables)
 
   def _test_weight_clipping_helper(self, use_tuple):
-    loss = self.variables[0] * 2.0
+    loss = self.variables[0]
     opt = training.GradientDescentOptimizer(1.0)
     if use_tuple:
-      opt_clip = clip_weights.weight_clip(opt, self.variables, 0.1)
+      opt_clip = clip_weights.clip_variables(opt, self.variables, 0.1)
     else:
-      opt_clip = clip_weights.discriminator_weight_clip(opt, self.tuple, 0.1)
+      opt_clip = clip_weights.clip_discriminator_weights(opt, self.tuple, 0.1)
 
     train_op1 = opt.minimize(loss, var_list=self.variables)
     train_op2 = opt_clip.minimize(loss, var_list=self.variables)
@@ -72,10 +73,14 @@ class ClipWeightsTest(test.TestCase):
         clip_weights.clip_discriminator_weights(opt, self.tuple, weight_clip=-1)
     else:
       with self.assertRaisesRegexp(ValueError, 'must be positive'):
-        clip_weights.clip_weights(opt, self.variables, weight_clip=-1)
+        clip_weights.clip_variables(opt, self.variables, weight_clip=-1)
 
   def test_incorrect_weight_clip_value_argsonly(self):
     self._test_incorrect_weight_clip_value_helper(False)
 
   def test_incorrect_weight_clip_value_tuple(self):
     self._test_incorrect_weight_clip_value_helper(True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/gan/python/features/python/random_tensor_pool.py b/tensorflow/contrib/gan/python/features/python/random_tensor_pool.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca904971fa8cb0440d3e0c9060f13cc214c9eaad
--- /dev/null
+++ b/tensorflow/contrib/gan/python/features/python/random_tensor_pool.py
@@ -0,0 +1,35 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A tensor pool stores values from an input tensor and returns a stored one.
+
+See the following papers for more details.
+1) `Learning from simulated and unsupervised images through adversarial
+    training` (https://arxiv.org/abs/1612.07828).
+2) `Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial
+    Networks` (https://arxiv.org/abs/1703.10593).
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.gan.python.features.python import random_tensor_pool_impl
+# pylint: disable=wildcard-import
+from tensorflow.contrib.gan.python.features.python.random_tensor_pool_impl import *
+# pylint: enable=wildcard-import
+from tensorflow.python.util.all_util import remove_undocumented
+
+__all__ = random_tensor_pool_impl.__all__
+remove_undocumented(__name__, __all__)
diff --git a/tensorflow/contrib/gan/python/features/python/random_tensor_pool_impl.py b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d10db0f5a3d09dc4dd7d8b1c97c16c29808547c
--- /dev/null
+++ b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_impl.py
@@ -0,0 +1,135 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A tensor pool stores values from an input tensor and returns a stored one.
+
+We use this to keep a history of values created by a generator, such that
+a discriminator can randomly be trained on some older samples, not just the
+current one. This can help to not let the discriminator get too far ahead of the
+generator and also to keep the system from oscilating, if the discriminator
+forgets too fast what past samples from the generator looked like.
+
+See the following papers for more details.
+1) `Learning from simulated and unsupervised images through adversarial
+    training` (https://arxiv.org/abs/1612.07828).
+2) `Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial
+    Networks` (https://arxiv.org/abs/1703.10593).
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import random_ops
+
+__all__ = [
+    'tensor_pool',
+]
+
+
+def _to_tuple(x):
+  if isinstance(x, (list, tuple)):
+    return tuple(x)
+  return (x,)
+
+
+def tensor_pool(input_values,
+                pool_size=50,
+                pooling_probability=0.5,
+                name='tensor_pool'):
+  """Queue storing input values and returning random previously stored ones.
+
+  Every time the returned `output_value` is evaluated, `input_value` is
+  evaluated and its value either directly returned (with
+  `1-pooling_probability`) or stored in the pool and a random one of the samples
+  currently in the pool is popped and returned. As long as the pool in not fully
+  filled, the input_value is always directly returned, as well as stored in the
+  pool. Note during inference / testing, it may be appropriate to set
+  `pool_size` = 0 or `pooling_probability` = 0.
+
+  Args:
+    input_values: A `Tensor`, or a list or tuple of `Tensor`s from which to read
+      values to be pooled.
+    pool_size: An integer specifying the maximum size of the pool. Defaults to
+      50.
+    pooling_probability: A float `Tensor` specifying the probability of getting
+      a value from the pool, as opposed to just the current input.
+    name: A string prefix for the name scope for all tensorflow ops.
+
+  Returns:
+    A `Tensor`, or a list or tuple of `Tensor`s (according to the type ofx
+    `input_values`) which is with given probability either the `input_values` or
+    a randomly chosen sample that was previously inserted in the pool.
+
+  Raises:
+    ValueError: If `pool_size` is negative.
+  """
+  pool_size = int(pool_size)
+  if pool_size < 0:
+    raise ValueError('`pool_size` is negative.')
+  elif pool_size == 0:
+    return input_values
+
+  original_input_values = input_values
+  input_values = _to_tuple(input_values)
+
+  with ops.name_scope(
+      '{}_pool_queue'.format(name),
+      values=input_values + (pooling_probability,)):
+    pool_queue = data_flow_ops.RandomShuffleQueue(
+        capacity=pool_size,
+        min_after_dequeue=0,
+        dtypes=[v.dtype for v in input_values],
+        shapes=None)
+
+    # In pseudeo code this code does the following:
+    # if not pool_full:
+    #   enqueue(input_values)
+    #   return input_values
+    # else
+    #   dequeue_values = dequeue_random_sample()
+    #   enqueue(input_values)
+    #   if rand() < pooling_probability:
+    #     return dequeue_values
+    #   else
+    #     return input_values
+
+    def _get_input_value_pooled():
+      enqueue_op = pool_queue.enqueue(input_values)
+      with ops.control_dependencies([enqueue_op]):
+        return tuple(array_ops.identity(v) for v in input_values)
+
+    def _get_random_pool_value_and_enqueue_input():
+      dequeue_values = _to_tuple(pool_queue.dequeue())
+      with ops.control_dependencies(dequeue_values):
+        enqueue_op = pool_queue.enqueue(input_values)
+        with ops.control_dependencies([enqueue_op]):
+          prob = random_ops.random_uniform(
+              (), dtype=dtypes.float32) < pooling_probability
+          return control_flow_ops.cond(prob, lambda: dequeue_values,
+                                       lambda: input_values)
+
+    output_values = _to_tuple(control_flow_ops.cond(
+        pool_queue.size() < pool_size, _get_input_value_pooled,
+        _get_random_pool_value_and_enqueue_input))
+
+  if isinstance(original_input_values, list):
+    return list(output_values)
+  elif isinstance(original_input_values, tuple):
+    return output_values
+  return output_values[0]
diff --git a/tensorflow/contrib/gan/python/features/python/random_tensor_pool_test.py b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..cef3a87ab34f9754099073eefcb3f1b1c97a3762
--- /dev/null
+++ b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_test.py
@@ -0,0 +1,110 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tf.contrib.gan.python.features.random_tensor_pool."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.gan.python.features.python.random_tensor_pool_impl import tensor_pool
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class TensorPoolTest(test.TestCase):
+
+  def test_pool_unknown_input_shape(self):
+    """Checks that `input_value` can have unknown shape."""
+    input_value = array_ops.placeholder(
+        dtype=dtypes.int32, shape=[None, None, 3])
+    output_value = tensor_pool(input_value, pool_size=10)
+
+    with self.test_session(use_gpu=True) as session:
+      for i in range(10):
+        session.run(output_value, {input_value: [[[i] * 3]]})
+        session.run(output_value, {input_value: [[[i] * 3] * 2]})
+        session.run(output_value, {input_value: [[[i] * 3] * 5] * 2})
+
+  def test_pool_sequence(self):
+    """Checks that values are pooled and returned maximally twice."""
+    input_value = array_ops.placeholder(dtype=dtypes.int32, shape=[])
+    output_value = tensor_pool(input_value, pool_size=10)
+
+    with self.test_session(use_gpu=True) as session:
+      outs = []
+      for i in range(50):
+        out = session.run(output_value, {input_value: i})
+        outs.append(out)
+        self.assertLessEqual(out, i)
+
+      _, counts = np.unique(outs, return_counts=True)
+      # Check that each value is returned maximally twice.
+      self.assertTrue((counts <= 2).all())
+
+  def test_never_pool(self):
+    """Checks that setting `pooling_probability` to zero works."""
+    input_value = array_ops.placeholder(dtype=dtypes.int32, shape=[])
+    output_value = tensor_pool(
+        input_value, pool_size=10, pooling_probability=0.0)
+
+    with self.test_session(use_gpu=True) as session:
+      for i in range(50):
+        out = session.run(output_value, {input_value: i})
+        self.assertEqual(out, i)
+
+  def test_pooling_probability(self):
+    """Checks that `pooling_probability` works."""
+    input_value = array_ops.placeholder(dtype=dtypes.int32, shape=[])
+    pool_size = 10
+    pooling_probability = 0.2
+    output_value = tensor_pool(
+        input_value,
+        pool_size=pool_size,
+        pooling_probability=pooling_probability)
+
+    with self.test_session(use_gpu=True) as session:
+      not_pooled = 0
+      total = 1000
+      for i in range(total):
+        out = session.run(output_value, {input_value: i})
+        if out == i:
+          not_pooled += 1
+      self.assertAllClose(
+          (not_pooled - pool_size) / (total - pool_size),
+          1 - pooling_probability,
+          atol=0.03)
+
+  def test_input_values_tuple(self):
+    """Checks that `input_values` can be a tuple."""
+    input_values = (array_ops.placeholder(dtype=dtypes.int32, shape=[]),
+                    array_ops.placeholder(dtype=dtypes.int32, shape=[]))
+    output_values = tensor_pool(input_values, pool_size=3)
+    self.assertEqual(len(output_values), len(input_values))
+
+    with self.test_session(use_gpu=True) as session:
+      for i in range(10):
+        outs = session.run(output_values, {
+            input_values[0]: i,
+            input_values[1]: i + 1
+        })
+        self.assertEqual(len(outs), len(input_values))
+        self.assertEqual(outs[1] - outs[0], 1)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/gan/python/losses/__init__.py b/tensorflow/contrib/gan/python/losses/__init__.py
index 290ff867a1e443f20a63e27fd97f53fed8a6cc11..d9bf8ebfdf65dfc76e4569dcaf26e0e51c7fc107 100644
--- a/tensorflow/contrib/gan/python/losses/__init__.py
+++ b/tensorflow/contrib/gan/python/losses/__init__.py
@@ -12,7 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN grouped API. Please see README.md for details and usage."""
+"""TFGAN losses and penalties.
+
+Losses can be used with individual arguments or with GANModel tuples.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/namedtuples.py b/tensorflow/contrib/gan/python/namedtuples.py
index 48f5e8e47dbcd5d32c23806b967a0d1e7403d2f7..3d4e315ebd0bd52b3b5e3e4a8655df8bfe9cebe8 100644
--- a/tensorflow/contrib/gan/python/namedtuples.py
+++ b/tensorflow/contrib/gan/python/namedtuples.py
@@ -79,6 +79,7 @@ class InfoGANModel(
     collections.namedtuple('InfoGANModel', GANModel._fields + (
         'structured_generator_inputs',
         'predicted_distributions',
+        'discriminator_and_aux_fn',
     ))):
   """An InfoGANModel contains all the pieces needed for InfoGAN training.
 
@@ -91,6 +92,8 @@ class InfoGANModel(
     predicted_distributions: A list of tf.Distributions. Predicted by the
       recognizer, and used to evaluate the likelihood of the structured noise.
       List length should match `structured_generator_inputs`.
+    discriminator_and_aux_fn: The original discriminator function that returns
+      a tuple of (logits, `predicted_distributions`).
   """
 
 
diff --git a/tensorflow/contrib/gan/python/train.py b/tensorflow/contrib/gan/python/train.py
index ad2d5eb86cdab89273efbd4ddce45f6657b54406..edd0113977ff4ddc672b0ec134be1a48c621b579 100644
--- a/tensorflow/contrib/gan/python/train.py
+++ b/tensorflow/contrib/gan/python/train.py
@@ -215,7 +215,8 @@ def infogan_model(
       disc_scope,
       lambda x, y: discriminator_fn(x, y)[0],  # conform to non-InfoGAN API
       structured_generator_inputs,
-      predicted_distributions)
+      predicted_distributions,
+      discriminator_fn)
 
 
 def acgan_model(
@@ -326,6 +327,53 @@ def _use_aux_loss(aux_loss_weight):
     return False
 
 
+def _tensor_pool_adjusted_model(model, tensor_pool_fn):
+  """Adjusts model using `tensor_pool_fn`.
+
+  Args:
+    model: A GANModel tuple.
+    tensor_pool_fn: A function that takes (generated_data, generator_inputs),
+      stores them in an internal pool and returns a previously stored
+      (generated_data, generator_inputs) with some probability. For example
+      tfgan.features.tensor_pool.
+
+  Returns:
+    A new GANModel tuple where discriminator outputs are adjusted by taking
+    pooled generator outputs as inputs. Returns the original model if
+    `tensor_pool_fn` is None.
+
+  Raises:
+    ValueError: If tensor pool does not suport the `model`.
+  """
+  if tensor_pool_fn is None:
+    return model
+
+  pooled_generated_data, pooled_generator_inputs = tensor_pool_fn(
+      (model.generated_data, model.generator_inputs))
+
+  if isinstance(model, namedtuples.GANModel):
+    dis_gen_outputs = model.discriminator_fn(pooled_generated_data,
+                                             pooled_generator_inputs)
+    return model._replace(discriminator_gen_outputs=dis_gen_outputs)
+  elif isinstance(model, namedtuples.ACGANModel):
+    (dis_pooled_gen_outputs,
+     dis_pooled_gen_classification_logits) = model.discriminator_fn(
+         pooled_generated_data, pooled_generator_inputs)
+    return model._replace(
+        discriminator_gen_outputs=dis_pooled_gen_outputs,
+        discriminator_gen_classification_logits=
+        dis_pooled_gen_classification_logits)
+  elif isinstance(model, namedtuples.InfoGANModel):
+    (dis_pooled_gen_outputs,
+     pooled_predicted_distributions) = model.discriminator_and_aux_fn(
+         pooled_generated_data, pooled_generator_inputs)
+    return model._replace(
+        discriminator_gen_outputs=dis_pooled_gen_outputs,
+        predicted_distributions=pooled_predicted_distributions)
+  else:
+    raise ValueError('Tensor pool does not support `model`: %s.' % type(model))
+
+
 def gan_loss(
     # GANModel.
     model,
@@ -338,6 +386,7 @@ def gan_loss(
     mutual_information_penalty_weight=None,
     aux_cond_generator_weight=None,
     aux_cond_discriminator_weight=None,
+    tensor_pool_fn=None,
     # Options.
     add_summaries=True):
   """Returns losses necessary to train generator and discriminator.
@@ -363,6 +412,10 @@ def gan_loss(
       https://arxiv.org/abs/1610.09585
     aux_cond_discriminator_weight: If not None: add a classification loss as in
       https://arxiv.org/abs/1610.09585
+    tensor_pool_fn: A function that takes (generated_data, generator_inputs),
+      stores them in an internal pool and returns previous stored
+      (generated_data, generator_inputs). For example
+      `tf.gan.features.tensor_pool`. Defaults to None (not using tensor pool).
     add_summaries: Whether or not to add summaries for the losses.
 
   Returns:
@@ -402,7 +455,9 @@ def gan_loss(
 
   # Create standard losses.
   gen_loss = generator_loss_fn(model, add_summaries=add_summaries)
-  dis_loss = discriminator_loss_fn(model, add_summaries=add_summaries)
+  dis_loss = discriminator_loss_fn(
+      _tensor_pool_adjusted_model(model, tensor_pool_fn),
+      add_summaries=add_summaries)
 
   # Add optional extra losses.
   if _use_aux_loss(gradient_penalty_weight):
@@ -422,7 +477,7 @@ def gan_loss(
     ac_disc_loss = tfgan_losses.acgan_discriminator_loss(
         model, add_summaries=add_summaries)
     dis_loss += aux_cond_discriminator_weight * ac_disc_loss
-  # Gathers auxilliary losses.
+  # Gathers auxiliary losses.
   if model.generator_scope:
     gen_reg_loss = losses.get_regularization_loss(model.generator_scope.name)
   else:
diff --git a/tensorflow/contrib/gan/python/train_test.py b/tensorflow/contrib/gan/python/train_test.py
index 6b27b6926102b6e5a7ff134ceed75c23459a6534..519d101e07f4f28d684017b86102fce8fa7677ef 100644
--- a/tensorflow/contrib/gan/python/train_test.py
+++ b/tensorflow/contrib/gan/python/train_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.contrib.framework.python.ops import variables as variables_lib
 from tensorflow.contrib.gan.python import namedtuples
 from tensorflow.contrib.gan.python import train
+from tensorflow.contrib.gan.python.features.python import random_tensor_pool
 from tensorflow.contrib.slim.python.slim import learning as slim_learning
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -145,14 +146,16 @@ def get_infogan_model():
   return namedtuples.InfoGANModel(
       *get_gan_model(),
       structured_generator_inputs=[constant_op.constant(0)],
-      predicted_distributions=[categorical.Categorical([1.0])])
+      predicted_distributions=[categorical.Categorical([1.0])],
+      discriminator_and_aux_fn=infogan_discriminator_model)
 
 
 def get_callable_infogan_model():
   return namedtuples.InfoGANModel(
       *get_callable_gan_model(),
       structured_generator_inputs=[constant_op.constant(0)],
-      predicted_distributions=[categorical.Categorical([1.0])])
+      predicted_distributions=[categorical.Categorical([1.0])],
+      discriminator_and_aux_fn=infogan_discriminator_model)
 
 
 def create_infogan_model():
@@ -409,6 +412,51 @@ class GANLossTest(test.TestCase):
   def test_callable_acgan(self):
     self._test_acgan_helper(create_callable_acgan_model)
 
+  # Test tensor pool.
+  def _test_tensor_pool_helper(self, create_gan_model_fn):
+    model = create_gan_model_fn()
+    if isinstance(model, namedtuples.InfoGANModel):
+
+      def tensor_pool_fn_impl(input_values):
+        generated_data, generator_inputs = input_values
+        output_values = random_tensor_pool.tensor_pool(
+            [generated_data] + generator_inputs, pool_size=5)
+        return output_values[0], output_values[1:]
+
+      tensor_pool_fn = tensor_pool_fn_impl
+    else:
+
+      def tensor_pool_fn_impl(input_values):
+        return random_tensor_pool.tensor_pool(input_values, pool_size=5)
+
+      tensor_pool_fn = tensor_pool_fn_impl
+    loss = train.gan_loss(model, tensor_pool_fn=tensor_pool_fn)
+    self.assertTrue(isinstance(loss, namedtuples.GANLoss))
+
+    # Check values.
+    with self.test_session(use_gpu=True) as sess:
+      variables.global_variables_initializer().run()
+      for _ in range(10):
+        sess.run([loss.generator_loss, loss.discriminator_loss])
+
+  def test_tensor_pool_gan(self):
+    self._test_tensor_pool_helper(create_gan_model)
+
+  def test_tensor_pool_callable_gan(self):
+    self._test_tensor_pool_helper(create_callable_gan_model)
+
+  def test_tensor_pool_infogan(self):
+    self._test_tensor_pool_helper(create_infogan_model)
+
+  def test_tensor_pool_callable_infogan(self):
+    self._test_tensor_pool_helper(create_callable_infogan_model)
+
+  def test_tensor_pool_acgan(self):
+    self._test_tensor_pool_helper(create_acgan_model)
+
+  def test_tensor_pool_callable_acgan(self):
+    self._test_tensor_pool_helper(create_callable_acgan_model)
+
   def test_doesnt_crash_when_in_nested_scope(self):
     with variable_scope.variable_scope('outer_scope'):
       gan_model = train.gan_model(
diff --git a/tensorflow/contrib/image/BUILD b/tensorflow/contrib/image/BUILD
index 157e97d237021d95c935a6be66aa57842b97125c..54502cfc6eecb9d064ffde9773e97d893a24133a 100755
--- a/tensorflow/contrib/image/BUILD
+++ b/tensorflow/contrib/image/BUILD
@@ -9,6 +9,7 @@ package(default_visibility = ["//visibility:public"])
 
 load(
     "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
     "tf_custom_op_library",
     "tf_gen_op_libs",
     "tf_gen_op_wrapper_py",
@@ -106,10 +107,33 @@ tf_custom_op_library(
     name = "python/ops/_distort_image_ops.so",
     srcs = [
         "kernels/adjust_hsv_in_yiq_op.cc",
+        "kernels/adjust_hsv_in_yiq_op.h",
         "ops/distort_image_ops.cc",
     ],
+    gpu_srcs = [
+        "kernels/adjust_hsv_in_yiq_op_gpu.cu.cc",
+        "kernels/adjust_hsv_in_yiq_op.h",
+    ],
     deps = [
-        "@protobuf_archive//:protobuf",
+        "//tensorflow/core/kernels:gpu_util_hdrs",
+    ],
+)
+
+tf_cc_test(
+    name = "adjust_hsv_in_yiq_op_test",
+    size = "small",
+    srcs = [
+        "kernels/adjust_hsv_in_yiq_op.h",
+        "kernels/adjust_hsv_in_yiq_op_test.cc",
+    ],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:ops_testutil",
+        "//tensorflow/core/kernels:ops_util",
+        "//third_party/eigen3",
     ],
 )
 
@@ -122,19 +146,6 @@ tf_gen_op_wrapper_py(
     deps = [":distort_image_ops_op_lib"],
 )
 
-cc_library(
-    name = "distort_image_ops_cc",
-    srcs = [
-        "kernels/adjust_hsv_in_yiq_op.cc",
-    ],
-    deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//third_party/eigen3",
-    ],
-    alwayslink = 1,
-)
-
 py_library(
     name = "distort_image_py",
     srcs = [
diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc
index f4962ed69dc68d4bad06ef29d7a167e0ba8ae044..478b716d88321101c971789f36c0ff8ecd3f418e 100644
--- a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc
+++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc
@@ -12,14 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include <cmath>
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif
+
+#include "tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.h"
 #include <memory>
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/work_sharder.h"
@@ -36,10 +37,10 @@ class AdjustHsvInYiqOpBase : public OpKernel {
 
   struct ComputeOptions {
     const Tensor* input = nullptr;
+    Tensor* output = nullptr;
     const Tensor* delta_h = nullptr;
     const Tensor* scale_s = nullptr;
     const Tensor* scale_v = nullptr;
-    Tensor* output = nullptr;
     int64 channel_count = 0;
   };
 
@@ -65,7 +66,7 @@ class AdjustHsvInYiqOpBase : public OpKernel {
                                         scale_v.shape().DebugString()));
     auto channels = input.dim_size(input.dims() - 1);
     OP_REQUIRES(
-        context, channels == 3,
+        context, channels == kChannelSize,
         errors::InvalidArgument("input must have 3 channels but instead has ",
                                 channels, " channels."));
 
@@ -101,53 +102,21 @@ class AdjustHsvInYiqOp<CPUDevice> : public AdjustHsvInYiqOpBase {
     const Tensor* input = options.input;
     Tensor* output = options.output;
     const int64 channel_count = options.channel_count;
-    static const int kChannelSize = 3;
     auto input_data = input->shaped<float, 2>({channel_count, kChannelSize});
     const float delta_h = options.delta_h->scalar<float>()();
     const float scale_s = options.scale_s->scalar<float>()();
     const float scale_v = options.scale_v->scalar<float>()();
     auto output_data = output->shaped<float, 2>({channel_count, kChannelSize});
+    float tranformation_matrix[kChannelSize * kChannelSize] = {0};
+    internal::compute_tranformation_matrix<kChannelSize * kChannelSize>(
+        delta_h, scale_s, scale_v, tranformation_matrix);
     const int kCostPerChannel = 10;
     const DeviceBase::CpuWorkerThreads& worker_threads =
         *context->device()->tensorflow_cpu_worker_threads();
     Shard(worker_threads.num_threads, worker_threads.workers, channel_count,
           kCostPerChannel,
-          [channel_count, &input_data, &output_data, delta_h, scale_s, scale_v](
+          [channel_count, &input_data, &output_data, &tranformation_matrix](
               int64 start_channel, int64 end_channel) {
-            // Using approximate linear transfomation described in:
-            // https://beesbuzz.biz/code/hsv_color_transforms.php
-            /** Get the constants from sympy
-             from sympy import Matrix
-             from sympy.abc import u, w
-             # Projection matrix to YIQ. http://en.wikipedia.org/wiki/YIQ
-             tyiq = Matrix([[0.299, 0.587, 0.114],
-                            [0.596, -0.274, -0.322],
-                            [0.211, -0.523, 0.312]])
-             # Hue rotation matrix in YIQ space.
-             hue_proj = Matrix(3,3, [v, 0, 0, 0, vsu, -vsw, 0, vsw, vsu])
-             m = tyiq.inv() * hue_proj * tyiq
-             **/
-            // TODO(huangyp): directly compute the projection matrix from tyiq.
-            static const float t[kChannelSize][kChannelSize][kChannelSize] = {
-                {{.299, .701, .16862179492229},
-                 {.587, -.587, .329804745287403},
-                 {.114, -.114, -0.498426540209694}},
-                {{.299, -.299, -.327963394172371},
-                 {.587, .413, .0346106879248821},
-                 {.114, -.114, .293352706247489}},
-                {{.299, -.299, 1.24646136576682},
-                 {.587, -.587, -1.04322888291964},
-                 {.114, .886, -.203232482847173}}};
-            float m[kChannelSize][kChannelSize] = {{0.}};
-            float su = scale_s * std::cos(delta_h);
-            float sw = scale_s * std::sin(delta_h);
-            for (int q_index = 0; q_index < kChannelSize; q_index++) {
-              for (int p_index = 0; p_index < kChannelSize; p_index++) {
-                m[q_index][p_index] = scale_v * (t[q_index][p_index][0] +
-                                                 t[q_index][p_index][1] * su +
-                                                 t[q_index][p_index][2] * sw);
-              }
-            }
             // Applying projection matrix to input RGB vectors.
             const float* p = input_data.data() + start_channel * kChannelSize;
             float* q = output_data.data() + start_channel * kChannelSize;
@@ -155,7 +124,9 @@ class AdjustHsvInYiqOp<CPUDevice> : public AdjustHsvInYiqOpBase {
               for (int q_index = 0; q_index < kChannelSize; q_index++) {
                 q[q_index] = 0;
                 for (int p_index = 0; p_index < kChannelSize; p_index++) {
-                  q[q_index] += m[q_index][p_index] * p[p_index];
+                  q[q_index] +=
+                      p[p_index] *
+                      tranformation_matrix[q_index + kChannelSize * p_index];
                 }
               }
               p += kChannelSize;
@@ -165,8 +136,33 @@ class AdjustHsvInYiqOp<CPUDevice> : public AdjustHsvInYiqOpBase {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("AdjustHsvInYiq").Device(DEVICE_CPU),
-                        AdjustHsvInYiqOp<CPUDevice>);
+REGISTER_KERNEL_BUILDER(
+    Name("AdjustHsvInYiq").Device(DEVICE_CPU).TypeConstraint<float>("T"),
+    AdjustHsvInYiqOp<CPUDevice>);
+
+#if GOOGLE_CUDA
+template <>
+class AdjustHsvInYiqOp<GPUDevice> : public AdjustHsvInYiqOpBase {
+ public:
+  explicit AdjustHsvInYiqOp(OpKernelConstruction* context)
+      : AdjustHsvInYiqOpBase(context) {}
+
+  void DoCompute(OpKernelContext* ctx, const ComputeOptions& options) override {
+    const int64 number_of_elements = options.input->NumElements();
+    if (number_of_elements <= 0) {
+      return;
+    }
+    const float* delta_h = options.delta_h->flat<float>().data();
+    const float* scale_s = options.scale_s->flat<float>().data();
+    const float* scale_v = options.scale_v->flat<float>().data();
+    functor::AdjustHsvInYiqGPU()(ctx, options.channel_count, options.input,
+                                 delta_h, scale_s, scale_v, options.output);
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("AdjustHsvInYiq").Device(DEVICE_GPU).TypeConstraint<float>("T"),
+    AdjustHsvInYiqOp<GPUDevice>);
+#endif
 
-// TODO(huangyp): add the GPU kernel
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.h b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..194ae2ba47456cac66c01989a78ab4ce607d1295
--- /dev/null
+++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.h
@@ -0,0 +1,87 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_IMAGE_KERNELS_ADJUST_HSV_IN_YIQ_OP_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_IMAGE_KERNELS_ADJUST_HSV_IN_YIQ_OP_H_
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif  // GOOGLE_CUDA
+
+#include <cmath>
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+
+namespace tensorflow {
+
+static constexpr int kChannelSize = 3;
+
+namespace internal {
+
+template <int MATRIX_SIZE>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void compute_tranformation_matrix(
+    const float delta_h, const float scale_s, const float scale_v,
+    float* matrix) {
+  static_assert(MATRIX_SIZE == kChannelSize * kChannelSize,
+                "Size of matrix should be 9.");
+  // Projection matrix from RGB to YIQ. Numbers from wikipedia
+  // https://en.wikipedia.org/wiki/YIQ
+  Eigen::Matrix3f yiq;
+  /* clang-format off */
+  yiq << 0.299, 0.587, 0.114,
+         0.596, -0.274, -0.322,
+         0.211, -0.523, 0.312;
+  Eigen::Matrix3f yiq_inverse;
+  yiq_inverse << 1, 0.95617069, 0.62143257,
+                 1, -0.2726886, -0.64681324,
+                 1, -1.103744, 1.70062309;
+  /* clang-format on */
+  // Construct hsv linear transformation matrix in YIQ space.
+  // https://beesbuzz.biz/code/hsv_color_transforms.php
+  float vsu = scale_v * scale_s * std::cos(delta_h);
+  float vsw = scale_v * scale_s * std::sin(delta_h);
+  Eigen::Matrix3f hsv_transform;
+  /* clang-format off */
+  hsv_transform << scale_v, 0, 0,
+                   0, vsu, -vsw,
+                   0, vsw, vsu;
+  /* clang-format on */
+  // Compute final transformation matrix = inverse_yiq * hsv_transform * yiq
+  Eigen::Map<Eigen::Matrix<float, 3, 3, Eigen::ColMajor>> eigen_matrix(matrix);
+  eigen_matrix = yiq_inverse * hsv_transform * yiq;
+}
+}  // namespace internal
+
+#if GOOGLE_CUDA
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+
+struct AdjustHsvInYiqGPU {
+  void operator()(OpKernelContext* ctx, int channel_count,
+                  const Tensor* const input, const float* const delta_h,
+                  const float* const scale_s, const float* const scale_v,
+                  Tensor* const output);
+};
+
+}  // namespace functor
+
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_IMAGE_KERNELS_ADJUST_HSV_IN_YIQ_OP_H_
diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b71ff9cd507faac66b3a33d3c02ec9b5901d814a
--- /dev/null
+++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
@@ -0,0 +1,84 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.h"
+#include "tensorflow/core/kernels/gpu_utils.h"
+#include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+namespace tensorflow {
+
+namespace internal {
+
+__global__ void compute_tranformation_matrix_cuda(const float* const delta_h,
+                                                  const float* const scale_s,
+                                                  const float* const scale_v,
+                                                  float* const matrix,
+                                                  const int matrix_size) {
+  if (matrix_size == kChannelSize * kChannelSize) {
+    compute_tranformation_matrix<kChannelSize * kChannelSize>(
+        *delta_h, *scale_s, *scale_v, matrix);
+  }
+}
+}  // namespace internal
+
+namespace functor {
+
+void AdjustHsvInYiqGPU::operator()(OpKernelContext* ctx, int channel_count,
+                                   const Tensor* const input,
+                                   const float* const delta_h,
+                                   const float* const scale_s,
+                                   const float* const scale_v,
+                                   Tensor* const output) {
+  const uint64 m = channel_count;
+  const uint64 k = kChannelSize;
+  const uint64 n = kChannelSize;
+  auto* cu_stream = ctx->eigen_device<GPUDevice>().stream();
+  OP_REQUIRES(ctx, cu_stream, errors::Internal("No GPU stream available."));
+  Tensor tranformation_matrix;
+  OP_REQUIRES_OK(ctx, ctx->allocate_temp(
+                          DT_FLOAT, TensorShape({kChannelSize * kChannelSize}),
+                          &tranformation_matrix));
+  // TODO(huangyp): It takes about 3.5 us to comute tranformation_matrix
+  // with one thread. Improve its performance if necessary.
+  internal::compute_tranformation_matrix_cuda<<<1, 1, 0, cu_stream>>>(
+      delta_h, scale_s, scale_v, tranformation_matrix.flat<float>().data(),
+      tranformation_matrix.flat<float>().size());
+  // Call cuBlas C = A * B directly.
+  auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+  auto a_ptr =
+      AsDeviceMemory(input->flat<float>().data(), input->flat<float>().size());
+  auto b_ptr = AsDeviceMemory(tranformation_matrix.flat<float>().data(),
+                              tranformation_matrix.flat<float>().size());
+  auto c_ptr = AsDeviceMemory(output->flat<float>().data(),
+                              output->flat<float>().size());
+  auto* stream = ctx->op_device_context()->stream();
+  OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
+  // TODO(huangyp): share/use autotune cublas algorithms in Matmul.op.
+  bool blas_launch_status =
+      stream
+          ->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, b_ptr, n,
+                         a_ptr, k, 0.0f, &c_ptr, n)
+          .ok();
+  if (!blas_launch_status) {
+    ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
+                                    ", n=", n, ", k=", k));
+  }
+}
+}  // namespace functor
+}  // namespace tensorflow
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_test.cc b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4cbbd277840133c9419f9ce3d945b7d099679dc0
--- /dev/null
+++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_test.cc
@@ -0,0 +1,48 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+class AdjustHsvInYiqOpTest : public OpsTestBase {
+ protected:
+};
+
+TEST_F(AdjustHsvInYiqOpTest, IdentiyTransformMatrix) {
+  Tensor matrix(allocator(), DT_FLOAT, TensorShape({9}));
+  internal::compute_tranformation_matrix<9>(0.0, 1.0, 1.0,
+                                            matrix.flat<float>().data());
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({9}));
+  test::FillValues<float>(&expected, {1, 0, 0, 0, 1, 0, 0, 0, 1});
+  test::ExpectClose(matrix, expected);
+}
+
+TEST_F(AdjustHsvInYiqOpTest, ScaleValueTransformMatrix) {
+  float scale_v = 2.3;
+  Tensor matrix(allocator(), DT_FLOAT, TensorShape({9}));
+  internal::compute_tranformation_matrix<9>(0.0, 1.0, scale_v,
+                                            matrix.flat<float>().data());
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({9}));
+  test::FillValues<float>(&expected,
+                          {scale_v, 0, 0, 0, scale_v, 0, 0, 0, scale_v});
+  test::ExpectClose(matrix, expected);
+}
+
+}  // end namespace tensorflow
diff --git a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
index 2b6799213827537f77deda4e052bb7ec16f46343..f8b56ab1c5400694b3aa8d4a0c19c7769aa8cbce 100755
--- a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
+++ b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
@@ -40,7 +40,7 @@ REGISTER_OP("SingleImageRandomDotStereograms")
     .Doc(R"doc(
 Outputs a single image random dot stereogram for export via encode_PNG/JPG OP.
 
-Given the 2-D tensor 'depth_values' with encoded Z values, this operation will 
+Given the 2-D tensor 'depth_values' with encoded Z values, this operation will
 encode 3-D data into a 2-D image.  The output of this Op is suitable for the
 encode_PNG/JPG ops.  Be careful with image compression as this may corrupt the
 encode 3-D data witin the image.
@@ -68,14 +68,14 @@ with open('picture_out.png', 'wb') as f:
     f.write(png)
 ```
 
-depth_values: Z values of data to encode into 'output_data_window' window, 
+depth_values: Z values of data to encode into 'output_data_window' window,
   lower values are further away {0.0 floor(far), 1.0 ceiling(near) after normalization}, must be 2-D tensor
 hidden_surface_removal: Activate hidden surface removal
 convergence_dots_size: Black dot size in pixels to help view converge image, drawn on bottom of image
 dots_per_inch: Output device in dots/inch
 eye_separation: Separation between eyes in inches
 mu: Depth of field, Fraction of viewing distance (eg. 1/3 = .3333)
-normalize: Normalize input data to [0.0, 1.0] 
+normalize: Normalize input data to [0.0, 1.0]
 normalize_max: Fix MAX value for Normalization - if < MIN, autoscale
 normalize_min: Fix MIN value for Normalization - if > MAX, autoscale
 border_level: Value of border depth 0.0 {far} to 1.0 {near}
diff --git a/tensorflow/contrib/image/python/kernel_tests/distort_image_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/distort_image_ops_test.py
index b85f19d29b79defa10493bdbaa4a1b237cb2a9ee..a495b58b7f6481d4cdedf73f23615d0390eb6a45 100644
--- a/tensorflow/contrib/image/python/kernel_tests/distort_image_ops_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/distort_image_ops_test.py
@@ -172,7 +172,7 @@ class AdjustValueInYiqTest(test_util.TensorFlowTestCase):
           raise AssertionError('Invalid test style: %s' % (test_style))
         y_np = self._adjust_value_in_yiq_np(x_np, scale)
         y_tf = self._adjust_value_in_yiq_tf(x_np, scale)
-        self.assertAllClose(y_tf, y_np, rtol=2e-5, atol=1e-5)
+        self.assertAllClose(y_tf, y_np, rtol=2e-4, atol=1e-4)
 
   def test_invalid_shapes(self):
     x_np = np.random.rand(2, 3) * 255.
@@ -237,7 +237,7 @@ class AdjustSaturationInYiqTest(test_util.TensorFlowTestCase):
             raise AssertionError('Invalid test style: %s' % (test_style))
           y_baseline = self._adjust_saturation_in_yiq_np(x_np, scale)
           y_tf = self._adjust_saturation_in_yiq_tf(x_np, scale)
-          self.assertAllClose(y_tf, y_baseline, rtol=2e-5, atol=1e-5)
+          self.assertAllClose(y_tf, y_baseline, rtol=2e-4, atol=1e-4)
 
   def test_invalid_shapes(self):
     x_np = np.random.rand(2, 3) * 255.
@@ -291,6 +291,9 @@ class AdjustHueInYiqBenchmark(test.Benchmark):
   def benchmark_adjust_hue_in_yiqCpuAll(self):
     self._benchmark_adjust_hue_in_yiq('/cpu:0', None)
 
+  def benchmark_adjust_hue_in_yiq_gpu_all(self):
+    self._benchmark_adjust_hue_in_yiq(test.gpu_device_name(), None)
+
 
 class AdjustSaturationInYiqBenchmark(test.Benchmark):
 
@@ -333,6 +336,9 @@ class AdjustSaturationInYiqBenchmark(test.Benchmark):
   def benchmark_adjust_saturation_in_yiq_cpu_all(self):
     self._benchmark_adjust_saturation_in_yiq('/cpu:0', None)
 
+  def benchmark_adjust_saturation_in_yiq_gpu_all(self):
+    self._benchmark_adjust_saturation_in_yiq(test.gpu_device_name(), None)
+
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
index 5cccf26028ca6bf269dbc67a33075351edecb407..bb766e59d2cee648042cc08be466796d9233ad66 100755
--- a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
+++ b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
@@ -68,7 +68,7 @@ def single_image_random_dot_stereograms(
   ```
 
   Args:
-    depth_values: A `Tensor`. Must be one of the following types: 
+    depth_values: A `Tensor`. Must be one of the following types:
       `float64`, `float32`, `int64`, `int32`.  Z values of data to encode
       into 'output_data_window' window, lower further away {0.0 floor(far),
       1.0 ceiling(near) after norm}, must be 2-D tensor
@@ -84,17 +84,17 @@ def single_image_random_dot_stereograms(
     mu: An optional `float`. Defaults to `0.3333`.
       Depth of field, Fraction of viewing distance (eg. 1/3 = 0.3333)
     normalize: An optional `bool`. Defaults to `True`.
-      Normalize input data to [0.0, 1.0] 
+      Normalize input data to [0.0, 1.0]
     normalize_max: An optional `float`. Defaults to `-100`.
       Fix MAX value for Normalization (0.0) - if < MIN, autoscale
     normalize_min: An optional `float`. Defaults to `100`.
       Fix MIN value for Normalization (0.0) - if > MAX, autoscale
     border_level: An optional `float`. Defaults to `0`.
-      Value of bord in depth 0.0 {far} to 1.0 {near} 
+      Value of bord in depth 0.0 {far} to 1.0 {near}
     number_colors: An optional `int`. Defaults to `256`. 2 (Black &
       White), 256 (grayscale), and Numbers > 256 (Full Color) are
       supported
-    output_image_shape: An optional `tf.TensorShape` or list of `ints`. 
+    output_image_shape: An optional `tf.TensorShape` or list of `ints`.
       Defaults to shape `[1024, 768, 1]`. Defines output shape of returned
       image in '[X,Y, Channels]' 1-grayscale, 3 color; channels will be
       updated to 3 if number_colors > 256
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/BUILD b/tensorflow/contrib/kfac/python/kernel_tests/BUILD
index 7d65ac9a43dd777baa020fe0453af65e69e6c509..95fba59e3c96ae3c69e0b154740785b0d2bcb3c9 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/kfac/python/kernel_tests/BUILD
@@ -16,6 +16,7 @@ py_test(
         "//tensorflow/contrib/kfac/python/ops:utils",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:init_ops",
@@ -33,6 +34,7 @@ py_test(
         "//tensorflow/contrib/kfac/python/ops:fisher_factors",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:gradients",
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py b/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py
index b52a7b52a7efd4292ad514c5a744c4da07082142..9b28c45c7263208d21b1514ae5f05b7e81e315a3 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.contrib.kfac.python.ops import estimator
 from tensorflow.contrib.kfac.python.ops import layer_collection as lc
 from tensorflow.contrib.kfac.python.ops import utils
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -33,6 +34,30 @@ from tensorflow.python.platform import test
 _ALL_ESTIMATION_MODES = ["gradients", "empirical", "curvature_prop", "exact"]
 
 
+class DeviceContextGeneratorTest(test.TestCase):
+
+  def testNoDevice(self):
+    device_context_generator = estimator._DeviceContextGenerator(None)
+    with ops.device("/device:CPU:0"):  # This is what will be used
+      with device_context_generator():  # Does nothing
+        a = constant_op.constant([2.0], name="a")
+    self.assertEqual("/device:CPU:0", a.op.device)
+
+  def testTwoDevices(self):
+    device_context_generator = estimator._DeviceContextGenerator(
+        ["/device:GPU:0", "/device:GPU:1"])
+    with ops.device("/device:CPU:0"):  # Will be over-ridden by the inner scopes
+      with device_context_generator():
+        a = constant_op.constant([2.0], name="a")
+      with device_context_generator():
+        b = constant_op.constant([2.0], name="b")
+      with device_context_generator():
+        c = constant_op.constant([2.0], name="c")
+    self.assertEqual("/device:GPU:0", a.op.device)
+    self.assertEqual("/device:GPU:1", b.op.device)
+    self.assertEqual("/device:GPU:0", c.op.device)
+
+
 class EstimatorTest(test.TestCase):
 
   def setUp(self):
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py b/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py
index 5f2b5c6cace9cd18f4cc5590ff55a9b39680a381..2d9b28185ce0db32d5cd7d84737fdf96e2c98851 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py
@@ -40,6 +40,21 @@ def _make_psd(dim):
   return array_ops.constant(mat)
 
 
+class UtilsTest(test.TestCase):
+
+  def testComputePiTracenorm(self):
+    with ops.Graph().as_default(), self.test_session() as sess:
+      random_seed.set_random_seed(200)
+      left_factor = array_ops.diag([1., 2., 0., 1.])
+      right_factor = array_ops.ones([2., 2.])
+
+      # pi is the sqrt of the left trace norm divided by the right trace norm
+      pi = fb._compute_pi_tracenorm(left_factor, right_factor)
+
+      pi_val = sess.run(pi)
+      self.assertEqual(1., pi_val)
+
+
 class FullFBTest(test.TestCase):
 
   def testFullFBInitSingleTensor(self):
@@ -301,8 +316,7 @@ class FullyConnectedDiagonalFB(test.TestCase):
     multiply_result_big, multiply_inverse_result_big = self.runFisherBlockOps(
         self.w, [self.inputs], [self.outputs], [self.output_grads])
     multiply_result_small, multiply_inverse_result_small = (
-        self.runFisherBlockOps(self.w,
-                               np.split(self.inputs, 2),
+        self.runFisherBlockOps(self.w, np.split(self.inputs, 2),
                                np.split(self.outputs, 2),
                                np.split(self.output_grads, 2)))
 
@@ -584,8 +598,7 @@ class ConvDiagonalFBTest(test.TestCase):
     multiply_result_big, multiply_inverse_result_big = self.runFisherBlockOps(
         self.w, [self.inputs], [self.outputs], [self.output_grads])
     multiply_result_small, multiply_inverse_result_small = (
-        self.runFisherBlockOps(self.w,
-                               np.split(self.inputs, 2),
+        self.runFisherBlockOps(self.w, np.split(self.inputs, 2),
                                np.split(self.outputs, 2),
                                np.split(self.output_grads, 2)))
 
@@ -608,8 +621,9 @@ class ConvDiagonalFBTest(test.TestCase):
         self.kernel_size, self.kernel_size, self.input_channels + 1,
         self.output_channels
     ])
-    expected_result = (expected_result[:, :, 0:-1, :], np.reshape(
-        expected_result[:, :, -1, :], [self.output_channels]))
+    expected_result = (expected_result[:, :, 0:-1, :],
+                       np.reshape(expected_result[:, :, -1, :],
+                                  [self.output_channels]))
 
     self.assertEqual(len(result), 2)
     self.assertAllClose(expected_result[0], result[0])
@@ -692,8 +706,8 @@ class ConvKFCBasicFBTest(test.TestCase):
       sess.run(block._input_factor.make_inverse_update_ops())
       sess.run(block._output_factor.make_inverse_update_ops())
 
-      vector = (np.arange(1, 15).reshape(7, 2).astype(np.float32), np.arange(
-          2, 4).reshape(2, 1).astype(np.float32))
+      vector = (np.arange(1, 15).reshape(7, 2).astype(np.float32),
+                np.arange(2, 4).reshape(2, 1).astype(np.float32))
       output = block.multiply_inverse((array_ops.constant(vector[0]),
                                        array_ops.constant(vector[1])))
 
@@ -776,11 +790,50 @@ class ConvKFCBasicFBTest(test.TestCase):
       self.assertAllClose(output_flat, explicit)
 
 
+class FullyConnectedSeriesFBTest(test.TestCase):
+
+  def testFullyConnectedSeriesFBInit(self):
+    with ops.Graph().as_default():
+      random_seed.set_random_seed(200)
+      inputs = array_ops.constant([1., 2.])
+      outputs = array_ops.constant([3., 4.])
+      block = fb.FullyConnectedSeriesFB(
+          lc.LayerCollection(), inputs=[inputs], outputs=[outputs])
+      self.assertAllEqual([outputs], block.tensors_to_compute_grads())
+
+  def testInstantiateFactorsHasBias(self):
+    with ops.Graph().as_default():
+      random_seed.set_random_seed(200)
+      inputs = array_ops.constant([[1., 2.], [3., 4.]])
+      outputs = array_ops.constant([[3., 4.], [5., 6.]])
+      block = fb.FullyConnectedSeriesFB(
+          lc.LayerCollection(),
+          inputs=[inputs],
+          outputs=[outputs],
+          has_bias=True)
+      grads = outputs**2
+      block.instantiate_factors(((grads,),), 0.5)
+
+  def testInstantiateFactorsNoBias(self):
+    with ops.Graph().as_default():
+      random_seed.set_random_seed(200)
+      inputs = array_ops.constant([[1., 2.], [3., 4.]])
+      outputs = array_ops.constant([[3., 4.], [5., 6.]])
+      block = fb.FullyConnectedSeriesFB(
+          lc.LayerCollection(),
+          inputs=[inputs],
+          outputs=[outputs],
+          has_bias=False)
+      grads = outputs**2
+      block.instantiate_factors(((grads,),), 0.5)
+
+
 def as_tensors(tensor_or_tuple):
   """Converts a potentially nested tuple of np.array to Tensors."""
   if isinstance(tensor_or_tuple, (tuple, list)):
     return tuple(as_tensors(t) for t in tensor_or_tuple)
   return ops.convert_to_tensor(tensor_or_tuple)
 
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py b/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py
index fbb3d219139a4bc05253841a89e73645ef37dddd..70e56db055078bd4399b03e4d4a877e34249cc5e 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py
@@ -22,6 +22,7 @@ import numpy as np
 import numpy.random as npr
 
 from tensorflow.contrib.kfac.python.ops import fisher_factors as ff
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops as tf_ops
 from tensorflow.python.framework import random_seed
@@ -32,6 +33,25 @@ from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import test
 
 
+class MaybeColocateTest(test.TestCase):
+
+  def testFalse(self):
+    with tf_ops.Graph().as_default():
+      a = constant_op.constant([2.0], name='a')
+      with ff._maybe_colocate_with(a, False):
+        b = constant_op.constant(3.0, name='b')
+      self.assertEqual([b'loc:@a'], a.op.colocation_groups())
+      self.assertEqual([b'loc:@b'], b.op.colocation_groups())
+
+  def testTrue(self):
+    with tf_ops.Graph().as_default():
+      a = constant_op.constant([2.0], name='a')
+      with ff._maybe_colocate_with(a, True):
+        b = constant_op.constant(3.0, name='b')
+      self.assertEqual([b'loc:@a'], a.op.colocation_groups())
+      self.assertEqual([b'loc:@a'], b.op.colocation_groups())
+
+
 class FisherFactorTestingDummy(ff.FisherFactor):
   """Dummy class to test the non-abstract methods on ff.FisherFactor."""
 
@@ -47,12 +67,19 @@ class FisherFactorTestingDummy(ff.FisherFactor):
   def _num_sources(self):
     return 1
 
+  @property
+  def _dtype(self):
+    return dtypes.float32
+
   def _compute_new_cov(self):
     raise NotImplementedError
 
   def instantiate_covariance(self):
     pass
 
+  def make_inverse_update_ops(self):
+    return []
+
 
 class InverseProvidingFactorTestingDummy(ff.InverseProvidingFactor):
   """Dummy class to test the non-abstract methods on ff.InverseProvidingFactor.
@@ -74,6 +101,10 @@ class InverseProvidingFactorTestingDummy(ff.InverseProvidingFactor):
   def _num_sources(self):
     return 1
 
+  @property
+  def _dtype(self):
+    return dtypes.float32
+
   def _compute_new_cov(self):
     raise NotImplementedError
 
@@ -101,7 +132,7 @@ class NumericalUtilsTest(test.TestCase):
 
       normalizer = 10.
       x = npr.randn(100, 3)
-      cov = ff._compute_cov(array_ops.constant(x), normalizer)
+      cov = ff._compute_cov(array_ops.constant(x), normalizer=normalizer)
       np_cov = np.dot(x.T, x) / normalizer
 
       self.assertAllClose(sess.run(cov), np_cov)
@@ -247,13 +278,13 @@ class InverseProvidingFactorTest(test.TestCase):
       for i in range(1, ff.EIGENVALUE_DECOMPOSITION_THRESHOLD + 1):
         factor.register_damped_inverse(1. / i)
       ops = factor.make_inverse_update_ops()
-      self.assertEqual(ff.EIGENVALUE_DECOMPOSITION_THRESHOLD, len(ops))
+      self.assertEqual(1, len(ops))
 
       sess.run(tf_variables.global_variables_initializer())
       new_invs = []
+      sess.run(ops)
       for i in range(1, ff.EIGENVALUE_DECOMPOSITION_THRESHOLD + 1):
         # The inverse op will assign the damped inverse of cov to the inv var.
-        sess.run(ops[i - 1])
         new_invs.append(sess.run(factor._inverses_by_damping[1. / i]))
       # We want to see that the new invs are all different from each other.
       for i in range(len(new_invs)):
@@ -311,6 +342,16 @@ class FullFactorTest(test.TestCase):
       factor = ff.FullFactor((tensor,), 32)
       self.assertEqual([6, 6], factor.get_cov().get_shape().as_list())
 
+  def testFullFactorInitFloat64(self):
+    with tf_ops.Graph().as_default():
+      dtype = dtypes.float64_ref
+      random_seed.set_random_seed(200)
+      tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
+      factor = ff.FullFactor((tensor,), 32)
+      cov = factor.get_cov()
+      self.assertEqual(cov.dtype, dtype)
+      self.assertEqual([6, 6], cov.get_shape().as_list())
+
   def testMakeCovarianceUpdateOp(self):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
@@ -331,6 +372,16 @@ class NaiveDiagonalFactorTest(test.TestCase):
       factor = ff.NaiveDiagonalFactor((tensor,), 32)
       self.assertEqual([6, 1], factor.get_cov().get_shape().as_list())
 
+  def testNaiveDiagonalFactorInitFloat64(self):
+    with tf_ops.Graph().as_default():
+      dtype = dtypes.float64_ref
+      random_seed.set_random_seed(200)
+      tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
+      factor = ff.NaiveDiagonalFactor((tensor,), 32)
+      cov = factor.get_cov()
+      self.assertEqual(cov.dtype, dtype)
+      self.assertEqual([6, 1], cov.get_shape().as_list())
+
   def testMakeCovarianceUpdateOp(self):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
@@ -344,18 +395,25 @@ class NaiveDiagonalFactorTest(test.TestCase):
 
 class FullyConnectedKroneckerFactorTest(test.TestCase):
 
-  def _testFullyConnectedKroneckerFactorInit(self, has_bias, final_shape):
+  def _testFullyConnectedKroneckerFactorInit(self,
+                                             has_bias,
+                                             final_shape,
+                                             dtype=dtypes.float32_ref):
     with tf_ops.Graph().as_default():
       random_seed.set_random_seed(200)
-      tensor = array_ops.ones((2, 3), name='a/b/c')
+      tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
       factor = ff.FullyConnectedKroneckerFactor((tensor,), has_bias=has_bias)
-      self.assertEqual(final_shape, factor.get_cov().get_shape().as_list())
+      cov = factor.get_cov()
+      self.assertEqual(cov.dtype, dtype)
+      self.assertEqual(final_shape, cov.get_shape().as_list())
 
   def testFullyConnectedKroneckerFactorInitNoBias(self):
-    self._testFullyConnectedKroneckerFactorInit(False, [3, 3])
+    for dtype in (dtypes.float32_ref, dtypes.float64_ref):
+      self._testFullyConnectedKroneckerFactorInit(False, [3, 3], dtype=dtype)
 
   def testFullyConnectedKroneckerFactorInitWithBias(self):
-    self._testFullyConnectedKroneckerFactorInit(True, [4, 4])
+    for dtype in (dtypes.float32_ref, dtypes.float64_ref):
+      self._testFullyConnectedKroneckerFactorInit(True, [4, 4], dtype=dtype)
 
   def testMakeCovarianceUpdateOpWithBias(self):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
@@ -398,6 +456,18 @@ class ConvInputKroneckerFactorTest(test.TestCase):
       self.assertEqual([1 * 2 * 3 + 1, 1 * 2 * 3 + 1],
                        factor.get_cov().get_shape().as_list())
 
+  def testConvInputKroneckerFactorInitFloat64(self):
+    with tf_ops.Graph().as_default():
+      dtype = dtypes.float64_ref
+      random_seed.set_random_seed(200)
+      tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
+      factor = ff.ConvInputKroneckerFactor(
+          tensor, (1, 2, 3, 4), 3, 2, has_bias=True)
+      cov = factor.get_cov()
+      self.assertEqual(cov.dtype, dtype)
+      self.assertEqual([1 * 2 * 3 + 1, 1 * 2 * 3 + 1],
+                       cov.get_shape().as_list())
+
   def testMakeCovarianceUpdateOpWithBias(self):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
@@ -433,6 +503,16 @@ class ConvOutputKroneckerFactorTest(test.TestCase):
       factor = ff.ConvOutputKroneckerFactor((tensor,))
       self.assertEqual([5, 5], factor.get_cov().get_shape().as_list())
 
+  def testConvOutputKroneckerFactorInitFloat64(self):
+    with tf_ops.Graph().as_default():
+      dtype = dtypes.float64_ref
+      random_seed.set_random_seed(200)
+      tensor = array_ops.ones((2, 3, 4, 5), dtype=dtype, name='a/b/c')
+      factor = ff.ConvOutputKroneckerFactor((tensor,))
+      cov = factor.get_cov()
+      self.assertEqual(cov.dtype, dtype)
+      self.assertEqual([5, 5], cov.get_shape().as_list())
+
   def testConvOutputKroneckerFactorInitNotEnoughDims(self):
     with tf_ops.Graph().as_default():
       random_seed.set_random_seed(200)
@@ -451,5 +531,49 @@ class ConvOutputKroneckerFactorTest(test.TestCase):
       self.assertAllClose([[43, 46.5], [46.5, 51.5]], new_cov)
 
 
+class FullyConnectedMultiKFTest(test.TestCase):
+
+  def testFullyConnectedMultiKFInit(self):
+    with tf_ops.Graph().as_default():
+      random_seed.set_random_seed(200)
+      tensor = array_ops.ones((2, 3), name='a/b/c')
+      tensor_list = [tensor]
+      factor = ff.FullyConnectedMultiKF((tensor_list,), has_bias=False)
+      self.assertEqual([3, 3], factor.get_cov().get_shape().as_list())
+
+  def testFullyConnectedMultiKFInitFloat64(self):
+    with tf_ops.Graph().as_default():
+      dtype = dtypes.float64_ref
+      random_seed.set_random_seed(200)
+      tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
+      tensor_list = [tensor]
+      factor = ff.FullyConnectedMultiKF((tensor_list,), has_bias=False)
+      cov = factor.get_cov()
+      self.assertEqual(cov.dtype, dtype)
+      self.assertEqual([3, 3], cov.get_shape().as_list())
+
+  def testMakeCovarianceUpdateOpWithBias(self):
+    with tf_ops.Graph().as_default(), self.test_session() as sess:
+      random_seed.set_random_seed(200)
+      tensor = array_ops.constant([[1., 2.], [3., 4.]], name='a/b/c')
+      tensor_list = [tensor]
+      factor = ff.FullyConnectedMultiKF((tensor_list,), has_bias=True)
+
+      sess.run(tf_variables.global_variables_initializer())
+      new_cov = sess.run(factor.make_covariance_update_op(.5))
+      self.assertAllClose([[3, 3.5, 1], [3.5, 5.5, 1.5], [1, 1.5, 1]], new_cov)
+
+  def testMakeCovarianceUpdateOpNoBias(self):
+    with tf_ops.Graph().as_default(), self.test_session() as sess:
+      random_seed.set_random_seed(200)
+      tensor = array_ops.constant([[1., 2.], [3., 4.]], name='a/b/c')
+      tensor_list = [tensor]
+      factor = ff.FullyConnectedMultiKF((tensor_list,))
+
+      sess.run(tf_variables.global_variables_initializer())
+      new_cov = sess.run(factor.make_covariance_update_op(.5))
+      self.assertAllClose([[3, 3.5], [3.5, 5.5]], new_cov)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/layer_collection_test.py b/tensorflow/contrib/kfac/python/kernel_tests/layer_collection_test.py
index c5ad90d1dc7807ae5214523d4a443fb2430d202f..b8ccbeadd0a9d69edb41fef50e3edb090457adf2 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/layer_collection_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/layer_collection_test.py
@@ -128,8 +128,9 @@ class LayerCollectionTest(test.TestCase):
       key = array_ops.constant(1)
       lc.register_fully_connected(key, array_ops.constant(2),
                                   array_ops.constant(3))
-      with self.assertRaises(ValueError):
+      with self.assertRaises(ValueError) as cm:
         lc.register_generic(key, 16)
+      self.assertIn('already in LayerCollection', str(cm.exception))
 
   def testRegisterSingleParamNotRegistered(self):
     x = variable_scope.get_variable('x', initializer=array_ops.constant(1,))
@@ -144,16 +145,18 @@ class LayerCollectionTest(test.TestCase):
     x = variable_scope.get_variable('x', initializer=array_ops.constant(1,))
     lc = layer_collection.LayerCollection()
     lc.fisher_blocks = {x: '1'}
-    with self.assertRaises(ValueError):
+    with self.assertRaises(ValueError) as cm:
       lc.register_block(x, 'foo')
+    self.assertIn('already in LayerCollection', str(cm.exception))
 
   def testRegisterSingleParamRegisteredInTuple(self):
     x = variable_scope.get_variable('x', initializer=array_ops.constant(1,))
     y = variable_scope.get_variable('y', initializer=array_ops.constant(1,))
     lc = layer_collection.LayerCollection()
     lc.fisher_blocks = {(x, y): '1'}
-    lc.register_block(x, 'foo')
-    self.assertEqual(set(['1']), set(lc.get_blocks()))
+    with self.assertRaises(ValueError) as cm:
+      lc.register_block(x, 'foo')
+    self.assertIn('was already registered', str(cm.exception))
 
   def testRegisterTupleParamNotRegistered(self):
     x = variable_scope.get_variable('x', initializer=array_ops.constant(1,))
@@ -173,8 +176,9 @@ class LayerCollectionTest(test.TestCase):
     lc = layer_collection.LayerCollection()
     lc.fisher_blocks = {(x, y): '1'}
 
-    with self.assertRaises(ValueError):
+    with self.assertRaises(ValueError) as cm:
       lc.register_block((x, y), 'foo')
+    self.assertIn('already in LayerCollection', str(cm.exception))
 
   def testRegisterTupleParamRegisteredInSuperset(self):
     x = variable_scope.get_variable('x', initializer=array_ops.constant(1,))
@@ -183,8 +187,9 @@ class LayerCollectionTest(test.TestCase):
     lc = layer_collection.LayerCollection()
     lc.fisher_blocks = {(x, y, z): '1'}
 
-    lc.register_block((x, y), 'foo')
-    self.assertEqual(set(['1']), set(lc.get_blocks()))
+    with self.assertRaises(ValueError) as cm:
+      lc.register_block((x, y), 'foo')
+    self.assertIn('was already registered', str(cm.exception))
 
   def testRegisterTupleParamSomeRegistered(self):
     x = variable_scope.get_variable('x', initializer=array_ops.constant(1,))
@@ -193,10 +198,9 @@ class LayerCollectionTest(test.TestCase):
     lc = layer_collection.LayerCollection()
     lc.fisher_blocks = {x: MockFisherBlock('1'), z: MockFisherBlock('2')}
 
-    lc.register_block((x, y), MockFisherBlock('foo'))
-    self.assertEqual(
-        set([MockFisherBlock('2'), MockFisherBlock('foo')]), set(
-            lc.get_blocks()))
+    with self.assertRaises(ValueError) as cm:
+      lc.register_block((x, y), MockFisherBlock('foo'))
+    self.assertIn('was already registered', str(cm.exception))
 
   def testRegisterTupleVarSomeRegisteredInOtherTuples(self):
     x = variable_scope.get_variable('x', initializer=array_ops.constant(1,))
@@ -206,8 +210,9 @@ class LayerCollectionTest(test.TestCase):
     lc = layer_collection.LayerCollection()
     lc.fisher_blocks = {(x, z): '1', (z, w): '2'}
 
-    with self.assertRaises(ValueError):
+    with self.assertRaises(ValueError) as cm:
       lc.register_block((x, y), 'foo')
+    self.assertIn('was already registered', str(cm.exception))
 
   def testRegisterCategoricalPredictiveDistribution(self):
     with ops.Graph().as_default(), self.test_session() as sess:
@@ -427,6 +432,23 @@ class LayerCollectionTest(test.TestCase):
 
       self.ensureLayerReuseWorks(register_fn)
 
+  def testReuseWithInvalidRegistration(self):
+    """Invalid registrations shouldn't overwrite existing blocks."""
+    with ops.Graph().as_default():
+      inputs = array_ops.ones([2, 5, 5, 10])
+      outputs = array_ops.zeros([2, 5, 5, 3])
+      w = variable_scope.get_variable('w', [1, 1, 10, 3])
+      b = variable_scope.get_variable('b', [3])
+      lc = layer_collection.LayerCollection()
+      lc.register_fully_connected(w, inputs, outputs)
+      self.assertEqual(lc.fisher_blocks[w].num_registered_minibatches, 1)
+      with self.assertRaises(KeyError):
+        lc.register_fully_connected((w, b), inputs, outputs, reuse=True)
+      self.assertNotIn((w, b), lc.fisher_blocks)
+      self.assertEqual(lc.fisher_blocks[w].num_registered_minibatches, 1)
+      lc.register_fully_connected(w, inputs, outputs, reuse=True)
+      self.assertEqual(lc.fisher_blocks[w].num_registered_minibatches, 2)
+
   def testMakeOrGetFactor(self):
     with ops.Graph().as_default():
       random_seed.set_random_seed(200)
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/utils_test.py b/tensorflow/contrib/kfac/python/kernel_tests/utils_test.py
index 55fe38e3e9aab2dbd70a45cdc8fa0c208b036db0..d255a6e7160386d8eb6fca00765eea8a318f4eaa 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/utils_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/utils_test.py
@@ -222,18 +222,6 @@ class UtilsTest(test.TestCase):
       self.assertAllClose(b, np.array([4., 5.]))
       self.assertAllClose(c, np.array([[6.], [7.], [8.], [9.]]))
 
-  def testComputePi(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      left_factor = array_ops.diag([1., 2., 0., 1.])
-      right_factor = array_ops.ones([2., 2.])
-
-      # pi is the sqrt of the left trace norm divided by the right trace norm
-      pi = utils.compute_pi(left_factor, right_factor)
-
-      pi_val = sess.run(pi)
-      self.assertEqual(1., pi_val)
-
   def testPosDefInvCholesky(self):
     with ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
diff --git a/tensorflow/contrib/kfac/python/ops/BUILD b/tensorflow/contrib/kfac/python/ops/BUILD
index de4b8920b849dbf2117657de6e7c26f94f4d0363..3d731c7bc206d6f168e9b8f29b66bf4f1dbe8542 100644
--- a/tensorflow/contrib/kfac/python/ops/BUILD
+++ b/tensorflow/contrib/kfac/python/ops/BUILD
@@ -38,6 +38,7 @@ py_library(
         ":utils",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
         "//tensorflow/python:linalg_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:special_math_ops",
@@ -171,6 +172,7 @@ py_library(
     deps = [
         ":utils",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:gradients",
         "//tensorflow/python:util",
         "//third_party/py/numpy",
diff --git a/tensorflow/contrib/kfac/python/ops/estimator.py b/tensorflow/contrib/kfac/python/ops/estimator.py
index ce4e776324bbde1b8f214d89daa876032d8a21ff..5e1680967c184bf19f2a2578219db07a48264dc9 100644
--- a/tensorflow/contrib/kfac/python/ops/estimator.py
+++ b/tensorflow/contrib/kfac/python/ops/estimator.py
@@ -18,16 +18,53 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math
+import contextlib
+import itertools
 
 import numpy as np
 
 from tensorflow.contrib.kfac.python.ops import utils
+from tensorflow.python.framework import ops as tf_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.util import nest
 
 
+class _DeviceContextGenerator(object):
+  """Class for generating device contexts in a round-robin fashion."""
+
+  def __init__(self, devices):
+    """Creates a _DeviceContextGenerator object.
+
+    Example usage:
+
+    ```python
+    dcg = _DeviceContextGenerator(['/gpu:0', 'gpu:1'])
+    with dcg():
+      # All operations in this context will be placed on GPU 0
+      ...
+    with dcg():
+      # All operations in this context will be placed on GPU 1
+      ...
+    ```
+
+    Args:
+      devices: An iterable of device strings (or None). Successive calls to
+          __call__ will give contexts which place devices on these devices in
+          a round-robin fashion.
+    """
+    self._cycle = None if devices is None else itertools.cycle(devices)
+
+  @contextlib.contextmanager
+  def __call__(self):
+    """Returns a context manager specifying the default device."""
+    if self._cycle is None:
+      yield
+    else:
+      with tf_ops.device(next(self._cycle)):
+        yield
+
+
 class FisherEstimator(object):
   """Fisher estimator class supporting various approximations of the Fisher."""
 
@@ -36,7 +73,10 @@ class FisherEstimator(object):
                cov_ema_decay,
                damping,
                layer_collection,
-               estimation_mode="gradients"):
+               estimation_mode="gradients",
+               colocate_gradients_with_ops=False,
+               cov_devices=None,
+               inv_devices=None):
     """Create a FisherEstimator object.
 
     Args:
@@ -54,7 +94,7 @@ class FisherEstimator(object):
           blocks, kronecker factors, and losses associated with the
           graph.
       estimation_mode: The type of estimator to use for the Fishers.  Can be
-          'gradients', 'empirical', 'curvature_propagation', or 'exact'.
+          'gradients', 'empirical', 'curvature_prop', or 'exact'.
           (Default: 'gradients').  'gradients' is the basic estimation approach
           from the original K-FAC paper.  'empirical' computes the 'empirical'
           Fisher information matrix (which uses the data's distribution for the
@@ -69,6 +109,14 @@ class FisherEstimator(object):
           for each coordinate of the output instead of using 1/-1 vectors.  It
           is more expensive to compute than the other three options by a factor
           equal to the output dimension, roughly speaking.
+      colocate_gradients_with_ops: Whether we should request gradients be
+          colocated with their respective ops.
+      cov_devices: Iterable of device strings (e.g. '/gpu:0'). Covariance
+          computations will be placed on these devices in a round-robin fashion.
+          Can be None, which means that no devices are specified.
+      inv_devices: Iterable of device strings (e.g. '/gpu:0'). Inversion
+          computations will be placed on these devices in a round-robin fashion.
+          Can be None, which means that no devices are specified.
 
     Raises:
       ValueError: If no losses have been registered with layer_collection.
@@ -79,13 +127,19 @@ class FisherEstimator(object):
     self._estimation_mode = estimation_mode
     self._layers = layer_collection
     self._layers.create_subgraph()
-    self._check_registration(variables)
+    self._layers.check_registration(variables)
     self._gradient_fns = {
         "gradients": self._get_grads_lists_gradients,
         "empirical": self._get_grads_lists_empirical,
         "curvature_prop": self._get_grads_lists_curvature_prop,
         "exact": self._get_grads_lists_exact
     }
+    self._colocate_gradients_with_ops = colocate_gradients_with_ops
+    self._cov_device_context_generator = _DeviceContextGenerator(cov_devices)
+    if inv_devices == cov_devices:
+      self._inv_device_context_generator = self._cov_device_context_generator
+    else:
+      self._inv_device_context_generator = _DeviceContextGenerator(inv_devices)
     setup = self._setup(cov_ema_decay)
     self.cov_update_op, self.inv_update_op, self.inv_updates_dict = setup
 
@@ -148,49 +202,6 @@ class FisherEstimator(object):
     return self._apply_transformation(vecs_and_vars,
                                       lambda fb, vec: fb.multiply(vec))
 
-  def _check_registration(self, variables):
-    """Checks that all variable uses have been registered properly.
-
-    Args:
-      variables: List of variables.
-
-    Raises:
-      ValueError: If any registered variables are not included in the list.
-      ValueError: If any variable in the list is not registered.
-      ValueError: If any variable in the list is registered with the wrong
-          number of "uses" in the subgraph recorded (vs the number of times that
-          variable is actually used in the subgraph).
-    """
-    # Note that overlapping parameters (i.e. those that share variables) will
-    # be caught by layer_collection.LayerParametersDict during registration.
-
-    reg_use_map = self._layers.get_use_count_map()
-
-    error_messages = []
-
-    for var in variables:
-      total_uses = self._layers.subgraph.variable_uses(var)
-      reg_uses = reg_use_map[var]
-
-      if reg_uses == 0:
-        error_messages.append("Variable {} not registered.".format(var))
-      elif (not math.isinf(reg_uses)) and reg_uses != total_uses:
-        error_messages.append(
-            "Variable {} registered with wrong number of uses ({} "
-            "registrations vs {} uses).".format(var, reg_uses, total_uses))
-
-    num_get_vars = len(reg_use_map)
-
-    if num_get_vars > len(variables):
-      error_messages.append("{} registered variables were not included in list."
-                            .format(num_get_vars - len(variables)))
-
-    if error_messages:
-      error_messages = [
-          "Found the following errors with variable registration:"
-      ] + error_messages
-      raise ValueError("\n\t".join(error_messages))
-
   def _setup(self, cov_ema_decay):
     """Sets up the various operations.
 
@@ -219,8 +230,13 @@ class FisherEstimator(object):
       raise ValueError("Unrecognized value {} for estimation_mode.".format(
           self._estimation_mode))
 
+    # TODO(b/68033310): This loop round-robins the "concat" operations which
+    # gather the inputs for the cov_updates. In future, we might do these
+    # computations locally then communicate the results, which would require a
+    # modification to this code.
     for grads_list, fb in zip(grads_lists, fisher_blocks_list):
-      fb.instantiate_factors(grads_list, self.damping)
+      with self._cov_device_context_generator():
+        fb.instantiate_factors(grads_list, self.damping)
 
     cov_updates = [
         factor.make_covariance_update_op(cov_ema_decay)
@@ -233,18 +249,23 @@ class FisherEstimator(object):
 
   def _get_all_inverse_update_ops(self):
     for factor in self._layers.get_factors():
-      for op in factor.make_inverse_update_ops():
-        yield op
+      with self._inv_device_context_generator():
+        for op in factor.make_inverse_update_ops():
+          yield op
 
   def _get_grads_lists_gradients(self, tensors):
-    grads_flat = gradients_impl.gradients(self._layers.total_sampled_loss(),
-                                          nest.flatten(tensors))
+    grads_flat = gradients_impl.gradients(
+        self._layers.total_sampled_loss(),
+        nest.flatten(tensors),
+        colocate_gradients_with_ops=self._colocate_gradients_with_ops)
     grads_all = nest.pack_sequence_as(tensors, grads_flat)
     return tuple((grad,) for grad in grads_all)
 
   def _get_grads_lists_empirical(self, tensors):
-    grads_flat = gradients_impl.gradients(self._layers.total_loss(),
-                                          nest.flatten(tensors))
+    grads_flat = gradients_impl.gradients(
+        self._layers.total_loss(),
+        nest.flatten(tensors),
+        colocate_gradients_with_ops=self._colocate_gradients_with_ops)
     grads_all = nest.pack_sequence_as(tensors, grads_flat)
     return tuple((grad,) for grad in grads_all)
 
@@ -262,11 +283,13 @@ class FisherEstimator(object):
     grads_flat = gradients_impl.gradients(
         nest.flatten(loss_inputs),
         nest.flatten(tensors),
-        grad_ys=nest.flatten(transformed_random_signs))
+        grad_ys=nest.flatten(transformed_random_signs),
+        colocate_gradients_with_ops=self._colocate_gradients_with_ops)
     grads_all = nest.pack_sequence_as(tensors, grads_flat)
     return tuple((grad,) for grad in grads_all)
 
   def _get_grads_lists_exact(self, tensors):
+    """No docstring required."""
     # Loop over all coordinates of all losses.
     grads_all = []
     for loss in self._layers.losses:
@@ -274,6 +297,9 @@ class FisherEstimator(object):
         transformed_one_hot = loss.multiply_fisher_factor_replicated_one_hot(
             index)
         grads_flat = gradients_impl.gradients(
-            loss.inputs, nest.flatten(tensors), grad_ys=transformed_one_hot)
+            loss.inputs,
+            nest.flatten(tensors),
+            grad_ys=transformed_one_hot,
+            colocate_gradients_with_ops=self._colocate_gradients_with_ops)
         grads_all.append(nest.pack_sequence_as(tensors, grads_flat))
     return zip(*grads_all)
diff --git a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
index e822a1213a4132522be8031401609c78572cb1a6..1ccb9e040f2bb6bcfd217886918abd40e3cc1cfb 100644
--- a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
+++ b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
@@ -38,6 +38,7 @@ from __future__ import division
 from __future__ import print_function
 
 import abc
+import enum  # pylint: disable=g-bad-import-order
 
 import six
 
@@ -52,14 +53,54 @@ from tensorflow.python.ops import math_ops
 #   damping /= num_replications ** NORMALIZE_DAMPING_POWER
 NORMALIZE_DAMPING_POWER = 1.0
 
+# Methods for adjusting damping for FisherBlocks. See
+# _compute_pi_adjusted_damping() for details.
+PI_OFF_NAME = "off"
+PI_TRACENORM_NAME = "tracenorm"
+PI_TYPE = PI_TRACENORM_NAME
 
-def set_global_constants(normalize_damping_power=None):
+
+def set_global_constants(normalize_damping_power=None, pi_type=None):
   """Sets various global constants used by the classes in this module."""
   global NORMALIZE_DAMPING_POWER
+  global PI_TYPE
 
   if normalize_damping_power is not None:
     NORMALIZE_DAMPING_POWER = normalize_damping_power
 
+  if pi_type is not None:
+    PI_TYPE = pi_type
+
+
+def _compute_pi_tracenorm(left_cov, right_cov):
+  """Computes the scalar constant pi for Tikhonov regularization/damping.
+
+  pi = sqrt( (trace(A) / dim(A)) / (trace(B) / dim(B)) )
+  See section 6.3 of https://arxiv.org/pdf/1503.05671.pdf for details.
+
+  Args:
+    left_cov: The left Kronecker factor "covariance".
+    right_cov: The right Kronecker factor "covariance".
+
+  Returns:
+    The computed scalar constant pi for these Kronecker Factors (as a Tensor).
+  """
+  # Instead of dividing by the dim of the norm, we multiply by the dim of the
+  # other norm. This works out the same in the ratio.
+  left_norm = math_ops.trace(left_cov) * right_cov.shape.as_list()[0]
+  right_norm = math_ops.trace(right_cov) * left_cov.shape.as_list()[0]
+  return math_ops.sqrt(left_norm / right_norm)
+
+
+def _compute_pi_adjusted_damping(left_cov, right_cov, damping):
+
+  if PI_TYPE == PI_TRACENORM_NAME:
+    pi = _compute_pi_tracenorm(left_cov, right_cov)
+    return (damping * pi, damping / pi)
+
+  elif PI_TYPE == PI_OFF_NAME:
+    return (damping, damping)
+
 
 @six.add_metaclass(abc.ABCMeta)
 class FisherBlock(object):
@@ -153,7 +194,7 @@ class FullFB(FisherBlock):
     self._factor.register_damped_inverse(damping)
 
   def multiply_inverse(self, vector):
-    inverse = self._factor.get_inverse(self._damping)
+    inverse = self._factor.get_damped_inverse(self._damping)
     out_flat = math_ops.matmul(inverse, utils.tensors_to_column(vector))
     return utils.column_to_tensors(vector, out_flat)
 
@@ -411,7 +452,7 @@ class ConvDiagonalFB(FisherBlock):
         (self._strides[1] * self._strides[2]))
 
     if NORMALIZE_DAMPING_POWER:
-      damping /= self._num_locations ** NORMALIZE_DAMPING_POWER
+      damping /= self._num_locations**NORMALIZE_DAMPING_POWER
     self._damping = damping
 
     self._factor = self._layer_collection.make_or_get_factor(
@@ -465,11 +506,10 @@ class KroneckerProductFB(FisherBlock):
     Args:
       damping: The base damping factor (float or Tensor) for the damped inverse.
     """
-    pi = utils.compute_pi(self._input_factor.get_cov(),
-                          self._output_factor.get_cov())
-
-    self._input_damping = (damping**0.5) * pi
-    self._output_damping = (damping**0.5) / pi
+    self._input_damping, self._output_damping = _compute_pi_adjusted_damping(
+        self._input_factor.get_cov(),
+        self._output_factor.get_cov(),
+        damping**0.5)
 
     self._input_factor.register_damped_inverse(self._input_damping)
     self._output_factor.register_damped_inverse(self._output_damping)
@@ -487,8 +527,9 @@ class KroneckerProductFB(FisherBlock):
     return 1.0
 
   def multiply_inverse(self, vector):
-    left_factor_inv = self._input_factor.get_inverse(self._input_damping)
-    right_factor_inv = self._output_factor.get_inverse(self._output_damping)
+    left_factor_inv = self._input_factor.get_damped_inverse(self._input_damping)
+    right_factor_inv = self._output_factor.get_damped_inverse(
+        self._output_damping)
     reshaped_vector = utils.layer_params_to_mat2d(vector)
     reshaped_out = math_ops.matmul(left_factor_inv,
                                    math_ops.matmul(reshaped_vector,
@@ -720,3 +761,260 @@ def _concat_along_batch_dim(tensor_list):
 def _num_conv_locations(input_shape, strides):
   """Returns the number of locations a Conv kernel is applied to."""
   return input_shape[1] * input_shape[2] // (strides[1] * strides[2])
+
+
+class FullyConnectedMultiIndepFB(KroneckerProductFB):
+  """FisherBlock for fully-connected layers that share parameters.
+  """
+
+  def __init__(self, layer_collection, inputs, outputs, has_bias=False):
+    """Creates a FullyConnectedMultiIndepFB block.
+
+    Args:
+      layer_collection: LayerCollection instance.
+      inputs: list or tuple of Tensors. Each Tensor has shape [batch_size,
+        inputs_size].
+      outputs: list or tuple of Tensors. Each Tensor has shape [batch_size,
+        outputs_size].
+      has_bias: bool. If True, estimates Fisher with respect to a bias
+        parameter as well as the layer's parameters.
+    """
+
+    assert len(inputs) == len(outputs)
+    # We need to make sure inputs and outputs are tuples and not lists so that
+    # they get hashed by layer_collection.make_or_get_factor properly.
+    self._inputs = tuple(inputs)
+    self._outputs = tuple(outputs)
+    self._has_bias = has_bias
+    self._num_uses = len(inputs)
+
+    super(FullyConnectedMultiIndepFB, self).__init__(layer_collection)
+
+  @property
+  def num_registered_minibatches(self):
+    # TODO(b/69411207): Add support for registering additional minibatches.
+    return 1
+
+  def instantiate_factors(self, grads_list, damping):
+
+    self._input_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.FullyConnectedMultiKF,
+        ((self._inputs,), self._has_bias))
+
+    self._output_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.FullyConnectedMultiKF, (grads_list,))
+
+    if NORMALIZE_DAMPING_POWER:
+      damping /= self._num_uses**NORMALIZE_DAMPING_POWER
+
+    self._register_damped_input_and_output_inverses(damping)
+
+  @property
+  def _renorm_coeff(self):
+    return self._num_uses
+
+  def tensors_to_compute_grads(self):
+    return self._outputs
+
+  def num_inputs(self):
+    return len(self._inputs)
+
+
+class SeriesFBApproximation(enum.IntEnum):
+  """See FullyConnectedSeriesFB.__init__ for description and usage."""
+  option1 = 1
+  option2 = 2
+
+
+class FullyConnectedSeriesFB(FisherBlock):
+  """FisherBlock for fully-connected layers that share parameters across time.
+
+  See the following preprint for details:
+    https://openreview.net/pdf?id=HyMTkQZAb
+
+  See the end of the appendix of the paper for a pseudo-code of the
+  algorithm being implemented by multiply_inverse here.  Note that we are
+  using pre-computed versions of certain matrix-matrix products to speed
+  things up.  This is explicitly explained wherever it is done.
+  """
+
+  def __init__(self,
+               layer_collection,
+               inputs,
+               outputs,
+               has_bias=False,
+               option=SeriesFBApproximation.option2):
+    """Constructs a new `FullyConnectedSeriesFB`.
+
+    Args:
+      layer_collection: The collection of all layers in the K-FAC approximate
+        Fisher information matrix to which this FisherBlock belongs.
+      inputs: List of tensors of shape [batch_size, input_size].
+        Inputs to the layer.
+      outputs: List of tensors of shape [batch_size, input_size].
+        Outputs of the layer (before activations).
+      has_bias: Whether the layer includes a bias parameter.
+      option: A `SeriesFBApproximation` specifying the simplifying assumption
+        to be used in this block. `option1` approximates the cross-covariance
+        over time as a symmetric matrix, while `option2` makes
+        the assumption that training sequences are infinitely long. See section
+        3.5 of the paper for more details.
+    """
+
+    assert len(inputs) == len(outputs)
+    # We need to make sure inputs and outputs are tuples and not lists so that
+    # they get hashed by layer_collection.make_or_get_factor properly.
+    self._inputs = tuple(inputs)
+    self._outputs = tuple(outputs)
+    self._has_bias = has_bias
+    self._num_timesteps = len(inputs)
+    self._option = option
+
+    super(FullyConnectedSeriesFB, self).__init__(layer_collection)
+
+  @property
+  def num_registered_minibatches(self):
+    # TODO(b/69411207): Add support for registering additional minibatches.
+    return 1
+
+  def instantiate_factors(self, grads_list, damping):
+
+    self._input_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.FullyConnectedMultiKF, ((self._inputs,), self._has_bias))
+
+    self._output_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.FullyConnectedMultiKF, (grads_list,))
+
+    if NORMALIZE_DAMPING_POWER:
+      damping /= self._num_timesteps**NORMALIZE_DAMPING_POWER
+
+    self._damping_input, self._damping_output = _compute_pi_adjusted_damping(
+        self._input_factor.get_cov(),
+        self._output_factor.get_cov(),
+        damping**0.5)
+
+    if self._option == SeriesFBApproximation.option1:
+      self._input_factor.register_option1quants(self._damping_input)
+      self._output_factor.register_option1quants(self._damping_output)
+    elif self._option == SeriesFBApproximation.option2:
+      self._input_factor.register_option2quants(self._damping_input)
+      self._output_factor.register_option2quants(self._damping_output)
+    else:
+      raise ValueError(
+          "Unrecognized FullyConnectedSeriesFB approximation: {}".format(
+              self._option))
+
+  def multiply_inverse(self, vector):
+    # pylint: disable=invalid-name
+
+    Z = utils.layer_params_to_mat2d(vector)
+
+    # Derivations were done for "batch_dim==1" case so we need to convert to
+    # that orientation:
+    Z = array_ops.transpose(Z)
+
+    if self._option == SeriesFBApproximation.option1:
+
+      # Note that L_A = A0^(-1/2) * U_A and L_G = G0^(-1/2) * U_G.
+      L_A, psi_A = self._input_factor.get_option1quants(self._damping_input)
+      L_G, psi_G = self._output_factor.get_option1quants(self._damping_output)
+
+      def gamma(x):
+        # We are assuming that each case has the same number of time-steps.
+        # If this stops being the case one shouldn't simply replace this T
+        # with its average value.  Instead, one needs to go back to the
+        # definition of the gamma function from the paper.
+        T = self._num_timesteps
+        return (1 - x)**2 / (T * (1 - x**2) - 2 * x * (1 - x**T))
+
+      # Y = gamma( psi_G*psi_A^T ) (computed element-wise)
+      # Even though Y is Z-independent we are recomputing it from the psi's
+      # each since Y depends on both A and G quantities, and it is relatively
+      # cheap to compute.
+      Y = gamma(array_ops.reshape(psi_G, [int(psi_G.shape[0]), -1]) * psi_A)
+
+      # Z = L_G^T * Z * L_A
+      # This is equivalent to the following computation from the original
+      # pseudo-code:
+      # Z = G0^(-1/2) * Z * A0^(-1/2)
+      # Z = U_G^T * Z * U_A
+      Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A), transpose_a=True)
+
+      # Z = Z .* Y
+      Z *= Y
+
+      # Z = L_G * Z * L_A^T
+      # This is equivalent to the following computation from the original
+      # pseudo-code:
+      # Z = U_G * Z * U_A^T
+      # Z = G0^(-1/2) * Z * A0^(-1/2)
+      Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A, transpose_b=True))
+
+    elif self._option == SeriesFBApproximation.option2:
+
+      # Note that P_A = A_1^T * A_0^(-1) and P_G = G_1^T * G_0^(-1),
+      # and K_A = A_0^(-1/2) * E_A and K_G = G_0^(-1/2) * E_G.
+      P_A, K_A, mu_A = self._input_factor.get_option2quants(self._damping_input)
+      P_G, K_G, mu_G = self._output_factor.get_option2quants(
+          self._damping_output)
+
+      # Our approach differs superficially from the pseudo-code in the paper
+      # in order to reduce the total number of matrix-matrix multiplies.
+      # In particular, the first three computations in the pseudo code are
+      # Z = G0^(-1/2) * Z * A0^(-1/2)
+      # Z = Z - hPsi_G^T * Z * hPsi_A
+      # Z = E_G^T * Z * E_A
+      # Noting that hPsi = C0^(-1/2) * C1 * C0^(-1/2), so that
+      # C0^(-1/2) * hPsi = C0^(-1) * C1 * C0^(-1/2) = P^T * C0^(-1/2)
+      # the entire computation can be written as
+      # Z = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
+      #     - hPsi_G^T * G0^(-1/2) * Z * A0^(-1/2) * hPsi_A) * E_A
+      #   = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
+      #     - G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2)) * E_A
+      #   = E_G^T * G0^(-1/2) * Z * A0^(-1/2) * E_A
+      #     -  E_G^T* G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2) * E_A
+      #   = K_G^T * Z * K_A  -  K_G^T * P_G * Z * P_A^T * K_A
+      # This final expression is computed by the following two lines:
+      # Z = Z - P_G * Z * P_A^T
+      Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A, transpose_b=True))
+      # Z = K_G^T * Z * K_A
+      Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A), transpose_a=True)
+
+      # Z = Z ./ (1*1^T - mu_G*mu_A^T)
+      # Be careful with the outer product.  We don't want to accidentally
+      # make it an inner-product instead.
+      tmp = 1.0 - array_ops.reshape(mu_G, [int(mu_G.shape[0]), -1]) * mu_A
+      # Prevent some numerical issues by setting any 0.0 eigs to 1.0
+      tmp += 1.0 * math_ops.cast(math_ops.equal(tmp, 0.0), dtype=tmp.dtype)
+      Z /= tmp
+
+      # We now perform the transpose/reverse version of the operations
+      # derived above, whose derivation from the original pseudo-code is
+      # analgous.
+      # Z = K_G * Z * K_A^T
+      Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A, transpose_b=True))
+
+      # Z = Z - P_G^T * Z * P_A
+      Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A), transpose_a=True)
+
+      # Z = normalize (1/E[T]) * Z
+      # Note that this normalization is done because we compute the statistics
+      # by averaging, not summing, over time. (And the gradient is presumably
+      # summed over time, not averaged, and thus their scales are different.)
+      Z /= math_ops.cast(self._num_timesteps, Z.dtype)
+
+    # Convert back to the "batch_dim==0" orientation.
+    Z = array_ops.transpose(Z)
+
+    return utils.mat2d_to_layer_params(vector, Z)
+
+    # pylint: enable=invalid-name
+
+  def multiply(self, vector):
+    raise NotImplementedError
+
+  def tensors_to_compute_grads(self):
+    return self._outputs
+
+  def num_inputs(self):
+    return len(self._inputs)
diff --git a/tensorflow/contrib/kfac/python/ops/fisher_factors.py b/tensorflow/contrib/kfac/python/ops/fisher_factors.py
index 4e36813369e69de1d6f13ddb00566bda912244f6..5a6d1a93ff217c3922f45a047b4d548086ac5258 100644
--- a/tensorflow/contrib/kfac/python/ops/fisher_factors.py
+++ b/tensorflow/contrib/kfac/python/ops/fisher_factors.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import abc
+import contextlib
 
 import numpy as np
 import six
@@ -26,6 +27,8 @@ import six
 from tensorflow.contrib.kfac.python.ops import utils
 from tensorflow.python.framework import ops as tf_ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import special_math_ops
@@ -50,7 +53,22 @@ EIGENVALUE_DECOMPOSITION_THRESHOLD = 2
 EIGENVALUE_CLIPPING_THRESHOLD = 0.0
 
 
-def set_global_constants(init_covariances_at_zero=None, zero_debias=None,
+@contextlib.contextmanager
+def _maybe_colocate_with(op, colocate_cov_ops_with_inputs):
+  """Context to colocate with `op` if `colocate_cov_ops_with_inputs`."""
+  if colocate_cov_ops_with_inputs:
+    if isinstance(op, (list, tuple)):
+      with tf_ops.colocate_with(op[0]):
+        yield
+    else:
+      with tf_ops.colocate_with(op):
+        yield
+  else:
+    yield
+
+
+def set_global_constants(init_covariances_at_zero=None,
+                         zero_debias=None,
                          eigenvalue_decomposition_threshold=None,
                          eigenvalue_clipping_threshold=None):
   """Sets various global constants used by the classes in this module."""
@@ -85,7 +103,7 @@ def diagonal_covariance_initializer(shape, dtype, partition_info):  # pylint: di
   return array_ops.ones(shape, dtype)
 
 
-def _compute_cov(tensor, normalizer=None):
+def _compute_cov(tensor, tensor_right=None, normalizer=None):
   """Compute the empirical second moment of the rows of a 2D Tensor.
 
   This function is meant to be applied to random matrices for which the true row
@@ -93,6 +111,8 @@ def _compute_cov(tensor, normalizer=None):
 
   Args:
     tensor: A 2D Tensor.
+    tensor_right: An optional 2D Tensor. If provided, this function computes
+      the matrix product tensor^T * tensor_right instead of tensor^T * tensor.
     normalizer: optional scalar for the estimator (by default, the normalizer is
         the number of rows of tensor).
 
@@ -101,9 +121,14 @@ def _compute_cov(tensor, normalizer=None):
   """
   if normalizer is None:
     normalizer = array_ops.shape(tensor)[0]
-  cov = (math_ops.matmul(tensor, tensor, transpose_a=True) / math_ops.cast(
-      normalizer, tensor.dtype))
-  return (cov + array_ops.transpose(cov)) / math_ops.cast(2, cov.dtype)
+  if tensor_right is None:
+    cov = (
+        math_ops.matmul(tensor, tensor, transpose_a=True) / math_ops.cast(
+            normalizer, tensor.dtype))
+    return (cov + array_ops.transpose(cov)) / math_ops.cast(2.0, cov.dtype)
+  else:
+    return (math_ops.matmul(tensor, tensor_right, transpose_a=True) /
+            math_ops.cast(normalizer, tensor.dtype))
 
 
 def _append_homog(tensor):
@@ -119,7 +144,7 @@ def _append_homog(tensor):
   rank = len(tensor.shape.as_list())
   shape = array_ops.concat([array_ops.shape(tensor)[:-1], [1]], axis=0)
   ones = array_ops.ones(shape, dtype=tensor.dtype)
-  return array_ops.concat([tensor, ones], axis=rank-1)
+  return array_ops.concat([tensor, ones], axis=rank - 1)
 
 
 def scope_string_from_params(params):
@@ -157,8 +182,8 @@ def scope_string_from_params(params):
     elif isinstance(param, (tf_ops.Tensor, variables.Variable)):
       name_parts.append(scope_string_from_name(param))
     else:
-      raise ValueError(
-          "Encountered an unsupported param type {}".format(type(param)))
+      raise ValueError("Encountered an unsupported param type {}".format(
+          type(param)))
   return "_".join(name_parts)
 
 
@@ -209,6 +234,10 @@ class FisherFactor(object):
     """
     pass
 
+  @abc.abstractproperty
+  def _dtype(self):
+    pass
+
   @property
   def _cov_initializer(self):
     return covariance_initializer
@@ -220,7 +249,8 @@ class FisherFactor(object):
           "cov",
           initializer=self._cov_initializer,
           shape=self._cov_shape,
-          trainable=False)
+          trainable=False,
+          dtype=self._dtype)
 
   @abc.abstractmethod
   def _compute_new_cov(self, idx=0):
@@ -240,9 +270,10 @@ class FisherFactor(object):
     return moving_averages.assign_moving_average(
         self._cov, new_cov, ema_decay, zero_debias=ZERO_DEBIAS)
 
+  @abc.abstractmethod
   def make_inverse_update_ops(self):
     """Create and return update ops corresponding to registered computations."""
-    return []
+    pass
 
   def get_cov(self):
     return self._cov
@@ -257,6 +288,13 @@ class InverseProvidingFactor(FisherFactor):
   _cov_shape properties.
   """
 
+  # TODO(b/69108481): This class (and its subclasses) should be refactored to
+  # serve the matrix quantities it computes as both (potentially stale)
+  # variables, updated by the inverse update ops, and fresh values stored in
+  # tensors that recomputed once every session.run() call.  Currently matpower
+  # and damp_inverse have the former behavior, while eigendecomposition has
+  # the latter.
+
   def __init__(self):
     self._inverses_by_damping = {}
     self._matpower_by_exp_and_damping = {}
@@ -267,6 +305,10 @@ class InverseProvidingFactor(FisherFactor):
   def register_damped_inverse(self, damping):
     """Registers a damped inverse needed by a FisherBlock.
 
+    This creates a variable and signals make_inverse_update_ops to make the
+    corresponding update op.  The variable can be read via the method
+    get_inverse.
+
     Args:
       damping: The damping value (float or Tensor) for this factor.
     """
@@ -277,12 +319,17 @@ class InverseProvidingFactor(FisherFactor):
             "inv_damp{}".format(damping_string),
             initializer=inverse_initializer,
             shape=self._cov_shape,
-            trainable=False)
+            trainable=False,
+            dtype=self._dtype)
       self._inverses_by_damping[damping] = inv
 
   def register_matpower(self, exp, damping):
     """Registers a matrix power needed by a FisherBlock.
 
+    This creates a variable and signals make_inverse_update_ops to make the
+    corresponding update op.  The variable can be read via the method
+    get_matpower.
+
     Args:
       exp: The exponent (float or Tensor) to raise the matrix to.
       damping: The damping value (float or Tensor).
@@ -295,57 +342,78 @@ class InverseProvidingFactor(FisherFactor):
             "matpower_exp{}_damp{}".format(exp_string, damping_string),
             initializer=inverse_initializer,
             shape=self._cov_shape,
-            trainable=False)
+            trainable=False,
+            dtype=self._dtype)
       self._matpower_by_exp_and_damping[(exp, damping)] = matpower
 
   def register_eigendecomp(self):
-    """Registers that an eigendecomposition is needed by a FisherBlock."""
+    """Registers an eigendecomposition.
+
+    Unlike register_damp_inverse and register_matpower this doesn't create
+    any variables or inverse ops.  Instead it merely makes tensors containing
+    the eigendecomposition available to anyone that wants them.  They will be
+    recomputed (once) for each session.run() call (when they needed by some op).
+    """
     if not self._eigendecomp:
-      self._eigendecomp = linalg_ops.self_adjoint_eig(self._cov)
+      eigenvalues, eigenvectors = linalg_ops.self_adjoint_eig(self._cov)
+
+      # The matrix self._cov is positive semidefinite by construction, but the
+      # numerical eigenvalues could be negative due to numerical errors, so here
+      # we clip them to be at least FLAGS.eigenvalue_clipping_threshold
+      clipped_eigenvalues = math_ops.maximum(eigenvalues,
+                                             EIGENVALUE_CLIPPING_THRESHOLD)
+      self._eigendecomp = (clipped_eigenvalues, eigenvectors)
 
   def make_inverse_update_ops(self):
     """Create and return update ops corresponding to registered computations."""
-    ops = super(InverseProvidingFactor, self).make_inverse_update_ops()
+    ops = []
 
     num_inverses = len(self._inverses_by_damping)
     matrix_power_registered = bool(self._matpower_by_exp_and_damping)
-    use_eig = (self._eigendecomp or matrix_power_registered or
-               num_inverses >= EIGENVALUE_DECOMPOSITION_THRESHOLD)
+    use_eig = (
+        self._eigendecomp or matrix_power_registered or
+        num_inverses >= EIGENVALUE_DECOMPOSITION_THRESHOLD)
 
     if use_eig:
       self.register_eigendecomp()  # ensures self._eigendecomp is set
       eigenvalues, eigenvectors = self._eigendecomp  # pylint: disable=unpacking-non-sequence
 
-      # The matrix self._cov is positive semidefinite by construction, but the
-      # numerical eigenvalues could be negative due to numerical errors, so here
-      # we clip them to be at least EIGENVALUE_CLIPPING_THRESHOLD.
-      clipped_eigenvalues = math_ops.maximum(eigenvalues,
-                                             EIGENVALUE_CLIPPING_THRESHOLD)
-
       for damping, inv in self._inverses_by_damping.items():
         ops.append(
             inv.assign(
-                math_ops.matmul(eigenvectors / (clipped_eigenvalues + damping),
+                math_ops.matmul(eigenvectors / (eigenvalues + damping),
                                 array_ops.transpose(eigenvectors))))
 
       for (exp, damping), matpower in self._matpower_by_exp_and_damping.items():
         ops.append(
             matpower.assign(
-                math_ops.matmul(eigenvectors * (clipped_eigenvalues + damping)**
-                                exp, array_ops.transpose(eigenvectors))))
+                math_ops.matmul(eigenvectors *
+                                (eigenvalues + damping)**exp,
+                                array_ops.transpose(eigenvectors))))
+      # These ops share computation and should be run on a single device.
+      ops = [control_flow_ops.group(*ops)]
     else:
       for damping, inv in self._inverses_by_damping.items():
         ops.append(inv.assign(utils.posdef_inv(self._cov, damping)))
 
     return ops
 
-  def get_inverse(self, damping):
+  def get_damped_inverse(self, damping):
+    # Note that this function returns a variable which gets updated by the
+    # inverse ops.  It may be stale / inconsistent with the latest value of
+    # get_cov().
     return self._inverses_by_damping[damping]
 
   def get_matpower(self, exp, damping):
+    # Note that this function returns a variable which gets updated by the
+    # inverse ops.  It may be stale / inconsistent with the latest value of
+    # get_cov().
     return self._matpower_by_exp_and_damping[(exp, damping)]
 
   def get_eigendecomp(self):
+    # Unlike get_inverse and get_matpower this doesn't retrieve a stored
+    # variable, but instead always computes a fresh version from the current
+    # value of get_cov().
     return self._eigendecomp
 
 
@@ -356,12 +424,21 @@ class FullFactor(InverseProvidingFactor):
   to any type of parameter in principle, but has very high variance.
   """
 
-  def __init__(self, params_grads, batch_size):
+  def __init__(self,
+               params_grads,
+               batch_size,
+               colocate_cov_ops_with_inputs=False):
     self._batch_size = batch_size
+    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
     self._orig_params_grads_name = scope_string_from_params(
         [params_grads, self._batch_size])
-    self._params_grads_flat = tuple(
-        utils.tensors_to_column(params_grad) for params_grad in params_grads)
+    params_grads_flat = []
+    for params_grad in params_grads:
+      with _maybe_colocate_with(params_grad,
+                                self._colocate_cov_ops_with_inputs):
+        col = utils.tensors_to_column(params_grad)
+        params_grads_flat.append(col)
+    self._params_grads_flat = tuple(params_grads_flat)
     super(FullFactor, self).__init__()
 
   @property
@@ -377,11 +454,17 @@ class FullFactor(InverseProvidingFactor):
   def _num_sources(self):
     return len(self._params_grads_flat)
 
+  @property
+  def _dtype(self):
+    return self._params_grads_flat[0].dtype
+
   def _compute_new_cov(self, idx=0):
     # This will be a very basic rank 1 estimate
-    return ((self._params_grads_flat[idx] * array_ops.transpose(
-        self._params_grads_flat[idx])) / math_ops.cast(
-            self._batch_size, self._params_grads_flat[idx].dtype))
+    with _maybe_colocate_with(self._params_grads_flat[idx],
+                              self._colocate_cov_ops_with_inputs):
+      return ((self._params_grads_flat[idx] * array_ops.transpose(
+          self._params_grads_flat[idx])) / math_ops.cast(
+              self._batch_size, self._params_grads_flat[idx].dtype))
 
 
 class DiagonalFactor(FisherFactor):
@@ -394,6 +477,9 @@ class DiagonalFactor(FisherFactor):
   def _cov_initializer(self):
     return diagonal_covariance_initializer
 
+  def make_inverse_update_ops(self):
+    return []
+
 
 class NaiveDiagonalFactor(DiagonalFactor):
   """FisherFactor for a diagonal approximation of any type of param's Fisher.
@@ -402,10 +488,19 @@ class NaiveDiagonalFactor(DiagonalFactor):
   to any type of parameter in principle, but has very high variance.
   """
 
-  def __init__(self, params_grads, batch_size):
+  def __init__(self,
+               params_grads,
+               batch_size,
+               colocate_cov_ops_with_inputs=False):
     self._batch_size = batch_size
-    self._params_grads = tuple(
-        utils.tensors_to_column(params_grad) for params_grad in params_grads)
+    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
+    params_grads_flat = []
+    for params_grad in params_grads:
+      with _maybe_colocate_with(params_grad,
+                                self._colocate_cov_ops_with_inputs):
+        col = utils.tensors_to_column(params_grad)
+        params_grads_flat.append(col)
+    self._params_grads = tuple(params_grads_flat)
     self._orig_params_grads_name = scope_string_from_params(
         [self._params_grads, self._batch_size])
     super(NaiveDiagonalFactor, self).__init__()
@@ -422,9 +517,15 @@ class NaiveDiagonalFactor(DiagonalFactor):
   def _num_sources(self):
     return len(self._params_grads)
 
+  @property
+  def _dtype(self):
+    return self._params_grads[0].dtype
+
   def _compute_new_cov(self, idx=0):
-    return (math_ops.square(self._params_grads[idx]) / math_ops.cast(
-        self._batch_size, self._params_grads[idx].dtype))
+    with _maybe_colocate_with(self._params_grads[idx],
+                              self._colocate_cov_ops_with_inputs):
+      return (math_ops.square(self._params_grads[idx]) / math_ops.cast(
+          self._batch_size, self._params_grads[idx].dtype))
 
 
 class FullyConnectedDiagonalFactor(DiagonalFactor):
@@ -440,7 +541,11 @@ class FullyConnectedDiagonalFactor(DiagonalFactor):
 
   # TODO(jamesmartens): add units tests for this class
 
-  def __init__(self, inputs, outputs_grads, has_bias=False):
+  def __init__(self,
+               inputs,
+               outputs_grads,
+               has_bias=False,
+               colocate_cov_ops_with_inputs=False):
     """Instantiate FullyConnectedDiagonalFactor.
 
     Args:
@@ -449,18 +554,22 @@ class FullyConnectedDiagonalFactor(DiagonalFactor):
       outputs_grads: List of Tensors of shape [batch_size, output_size].
         Gradient of loss with respect to layer's preactivations.
       has_bias: bool. If True, append '1' to each input.
+      colocate_cov_ops_with_inputs: Whether to colocate cov_update ops with
+          their inputs.
     """
     self._outputs_grads = outputs_grads
+    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
     self._batch_size = array_ops.shape(inputs)[0]
-    self._orig_tensors_name = scope_string_from_params((inputs,) +
-                                                       tuple(outputs_grads))
+    self._orig_tensors_name = scope_string_from_params(
+        (inputs,) + tuple(outputs_grads))
 
     # Note that we precompute the required operations on the inputs since the
     # inputs don't change with the 'idx' argument to _compute_new_cov.  (Only
     # the target entry of _outputs_grads changes with idx.)
-    if has_bias:
-      inputs = _append_homog(inputs)
-    self._squared_inputs = math_ops.square(inputs)
+    with _maybe_colocate_with(inputs, self._colocate_cov_ops_with_inputs):
+      if has_bias:
+        inputs = _append_homog(inputs)
+      self._squared_inputs = math_ops.square(inputs)
 
     super(FullyConnectedDiagonalFactor, self).__init__()
 
@@ -476,17 +585,23 @@ class FullyConnectedDiagonalFactor(DiagonalFactor):
   def _num_sources(self):
     return len(self._outputs_grads)
 
+  @property
+  def _dtype(self):
+    return self._outputs_grads[0].dtype
+
   def _compute_new_cov(self, idx=0):
     # The well-known special formula that uses the fact that the entry-wise
     # square of an outer product is the outer-product of the entry-wise squares.
     # The gradient is the outer product of the input and the output gradients,
     # so we just square both and then take their outer-product.
-    new_cov = math_ops.matmul(
-        self._squared_inputs,
-        math_ops.square(self._outputs_grads[idx]),
-        transpose_a=True)
-    new_cov /= math_ops.cast(self._batch_size, new_cov.dtype)
-    return new_cov
+    with _maybe_colocate_with(self._squared_inputs,
+                              self._colocate_cov_ops_with_inputs):
+      new_cov = math_ops.matmul(
+          self._squared_inputs,
+          math_ops.square(self._outputs_grads[idx]),
+          transpose_a=True)
+      new_cov /= math_ops.cast(self._batch_size, new_cov.dtype)
+      return new_cov
 
 
 class ConvDiagonalFactor(DiagonalFactor):
@@ -494,8 +609,14 @@ class ConvDiagonalFactor(DiagonalFactor):
 
   # TODO(jamesmartens): add units tests for this class
 
-  def __init__(self, inputs, outputs_grads, filter_shape, strides, padding,
-               has_bias=False):
+  def __init__(self,
+               inputs,
+               outputs_grads,
+               filter_shape,
+               strides,
+               padding,
+               has_bias=False,
+               colocate_cov_ops_with_inputs=False):
     """Creates a ConvDiagonalFactor object.
 
     Args:
@@ -510,29 +631,36 @@ class ConvDiagonalFactor(DiagonalFactor):
       padding: The padding in this layer (1-D of Tensor length 4).
       has_bias: Python bool. If True, the layer is assumed to have a bias
         parameter in addition to its filter parameter.
+      colocate_cov_ops_with_inputs: Whether to colocate cov_update ops with
+          their inputs.
     """
     self._filter_shape = filter_shape
     self._has_bias = has_bias
     self._outputs_grads = outputs_grads
+    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
 
-    self._orig_tensors_name = scope_string_from_name((inputs,)
-                                                     + tuple(outputs_grads))
+    self._orig_tensors_name = scope_string_from_name(
+        (inputs,) + tuple(outputs_grads))
 
     # Note that we precompute the required operations on the inputs since the
     # inputs don't change with the 'idx' argument to _compute_new_cov.  (Only
     # the target entry of _outputs_grads changes with idx.)
-    filter_height, filter_width, _, _ = self._filter_shape
-    patches = array_ops.extract_image_patches(
-        inputs,
-        ksizes=[1, filter_height, filter_width, 1],
-        strides=strides,
-        rates=[1, 1, 1, 1],
-        padding=padding)
+    with _maybe_colocate_with(inputs, self._colocate_cov_ops_with_inputs):
+      filter_height, filter_width, _, _ = self._filter_shape
 
-    if has_bias:
-      patches = _append_homog(patches)
+      # TODO(b/64144716): there is potential here for a big savings in terms of
+      # memory use.
+      patches = array_ops.extract_image_patches(
+          inputs,
+          ksizes=[1, filter_height, filter_width, 1],
+          strides=strides,
+          rates=[1, 1, 1, 1],
+          padding=padding)
+
+      if has_bias:
+        patches = _append_homog(patches)
 
-    self._patches = patches
+      self._patches = patches
 
     super(ConvDiagonalFactor, self).__init__()
 
@@ -543,21 +671,29 @@ class ConvDiagonalFactor(DiagonalFactor):
   @property
   def _cov_shape(self):
     filter_height, filter_width, in_channels, out_channels = self._filter_shape
-    return [filter_height * filter_width * in_channels + self._has_bias,
-            out_channels]
+    return [
+        filter_height * filter_width * in_channels + self._has_bias,
+        out_channels
+    ]
 
   @property
   def _num_sources(self):
     return len(self._outputs_grads)
 
+  @property
+  def _dtype(self):
+    return self._outputs_grads[0].dtype
+
   def _compute_new_cov(self, idx=0):
-    outputs_grad = self._outputs_grads[idx]
-    batch_size = array_ops.shape(self._patches)[0]
+    with _maybe_colocate_with(self._outputs_grads[idx],
+                              self._colocate_cov_ops_with_inputs):
+      outputs_grad = self._outputs_grads[idx]
+      batch_size = array_ops.shape(self._patches)[0]
 
-    new_cov = self._convdiag_sum_of_squares(self._patches, outputs_grad)
-    new_cov /= math_ops.cast(batch_size, new_cov.dtype)
+      new_cov = self._convdiag_sum_of_squares(self._patches, outputs_grad)
+      new_cov /= math_ops.cast(batch_size, new_cov.dtype)
 
-    return new_cov
+      return new_cov
 
   def _convdiag_sum_of_squares(self, patches, outputs_grad):
     # This computes the sum of the squares of the per-training-case "gradients".
@@ -572,19 +708,24 @@ class FullyConnectedKroneckerFactor(InverseProvidingFactor):
   """Kronecker factor for the input or output side of a fully-connected layer.
   """
 
-  def __init__(self, tensors, has_bias=False):
+  def __init__(self,
+               tensors,
+               has_bias=False,
+               colocate_cov_ops_with_inputs=False):
     """Instantiate FullyConnectedKroneckerFactor.
 
     Args:
       tensors: List of Tensors of shape [batch_size, n]. Represents either a
         layer's inputs or its output's gradients.
-      has_bias: bool. If True, assume this factor is for the layer's inputs and
-        append '1' to each row.
+      has_bias: bool. If True, append '1' to each row.
+      colocate_cov_ops_with_inputs: Whether to colocate cov_update ops with
+          their inputs.
     """
     # The tensor argument is either a tensor of input activations or a tensor of
     # output pre-activation gradients.
     self._has_bias = has_bias
     self._tensors = tensors
+    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
     super(FullyConnectedKroneckerFactor, self).__init__()
 
   @property
@@ -601,11 +742,17 @@ class FullyConnectedKroneckerFactor(InverseProvidingFactor):
   def _num_sources(self):
     return len(self._tensors)
 
+  @property
+  def _dtype(self):
+    return self._tensors[0].dtype
+
   def _compute_new_cov(self, idx=0):
-    tensor = self._tensors[idx]
-    if self._has_bias:
-      tensor = _append_homog(tensor)
-    return _compute_cov(tensor)
+    with _maybe_colocate_with(self._tensors[idx],
+                              self._colocate_cov_ops_with_inputs):
+      tensor = self._tensors[idx]
+      if self._has_bias:
+        tensor = _append_homog(tensor)
+      return _compute_cov(tensor)
 
 
 class ConvInputKroneckerFactor(InverseProvidingFactor):
@@ -618,7 +765,13 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
   Section 3.1 Estimating the factors.
   """
 
-  def __init__(self, inputs, filter_shape, strides, padding, has_bias=False):
+  def __init__(self,
+               inputs,
+               filter_shape,
+               strides,
+               padding,
+               has_bias=False,
+               colocate_cov_ops_with_inputs=False):
     """Initializes ConvInputKroneckerFactor.
 
     Args:
@@ -630,12 +783,15 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
         width_stride, in_channel_stride].
       padding: str. Padding method for layer. "SAME" or "VALID".
       has_bias: bool. If True, append 1 to in_channel.
+      colocate_cov_ops_with_inputs: Whether to colocate cov_update ops with
+          their inputs.
     """
     self._filter_shape = filter_shape
     self._strides = strides
     self._padding = padding
     self._has_bias = has_bias
     self._inputs = inputs
+    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
     super(ConvInputKroneckerFactor, self).__init__()
 
   @property
@@ -655,26 +811,34 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
   def _num_sources(self):
     return 1
 
+  @property
+  def _dtype(self):
+    return self._inputs.dtype
+
   def _compute_new_cov(self, idx=0):
     if idx != 0:
       raise ValueError("ConvInputKroneckerFactor only supports idx = 0")
 
     # TODO(jamesmartens): factor this patches stuff out into a utility function
-    filter_height, filter_width, in_channels, _ = self._filter_shape
-    patches = array_ops.extract_image_patches(
-        self._inputs,
-        ksizes=[1, filter_height, filter_width, 1],
-        strides=self._strides,
-        rates=[1, 1, 1, 1],
-        padding=self._padding)
+    with _maybe_colocate_with(self._inputs, self._colocate_cov_ops_with_inputs):
+      filter_height, filter_width, in_channels, _ = self._filter_shape
 
-    flatten_size = (filter_height * filter_width * in_channels)
-    patches_flat = array_ops.reshape(patches, [-1, flatten_size])
+      # TODO(b/64144716): there is potential here for a big savings in terms of
+      # memory use.
+      patches = array_ops.extract_image_patches(
+          self._inputs,
+          ksizes=[1, filter_height, filter_width, 1],
+          strides=self._strides,
+          rates=[1, 1, 1, 1],
+          padding=self._padding)
 
-    if self._has_bias:
-      patches_flat = _append_homog(patches_flat)
+      flatten_size = (filter_height * filter_width * in_channels)
+      patches_flat = array_ops.reshape(patches, [-1, flatten_size])
 
-    return _compute_cov(patches_flat)
+      if self._has_bias:
+        patches_flat = _append_homog(patches_flat)
+
+      return _compute_cov(patches_flat)
 
 
 class ConvOutputKroneckerFactor(InverseProvidingFactor):
@@ -688,15 +852,18 @@ class ConvOutputKroneckerFactor(InverseProvidingFactor):
   Section 3.1 Estimating the factors.
   """
 
-  def __init__(self, outputs_grads):
+  def __init__(self, outputs_grads, colocate_cov_ops_with_inputs=False):
     """Initializes ConvOutputKroneckerFactor.
 
     Args:
       outputs_grads: list of Tensors. Each Tensor is of shape
-        [batch_size, height, width, out_channels].
+          [batch_size, height, width, out_channels].
+      colocate_cov_ops_with_inputs: Whether to colocate cov_update ops with
+          their inputs.
     """
     self._out_channels = outputs_grads[0].shape.as_list()[3]
     self._outputs_grads = outputs_grads
+    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
     super(ConvOutputKroneckerFactor, self).__init__()
 
   @property
@@ -712,7 +879,286 @@ class ConvOutputKroneckerFactor(InverseProvidingFactor):
   def _num_sources(self):
     return len(self._outputs_grads)
 
+  @property
+  def _dtype(self):
+    return self._outputs_grads[0].dtype
+
   def _compute_new_cov(self, idx=0):
-    reshaped_tensor = array_ops.reshape(self._outputs_grads[idx],
-                                        [-1, self._out_channels])
-    return _compute_cov(reshaped_tensor)
+    with _maybe_colocate_with(self._outputs_grads[idx],
+                              self._colocate_cov_ops_with_inputs):
+      reshaped_tensor = array_ops.reshape(self._outputs_grads[idx],
+                                          [-1, self._out_channels])
+      return _compute_cov(reshaped_tensor)
+
+
+class FullyConnectedMultiKF(InverseProvidingFactor):
+  """Kronecker factor for a fully connected recurrent layer."""
+
+  def __init__(self,
+               tensor_lists,
+               has_bias=False,
+               colocate_cov_ops_with_inputs=False):
+    """Constructs a new `FullyConnectedMultiKF`.
+
+    Args:
+      tensor_lists: List of lists of Tensors of shape [batch_size, n].
+      has_bias: bool. If True, '1' is appended to each row.
+      colocate_cov_ops_with_inputs: Whether to colocate cov_update ops with
+        their inputs.
+    """
+
+    self._orig_tensors_name = scope_string_from_params(tensor_lists)
+    self._batch_size = array_ops.shape(tensor_lists[0][0])[0]
+    self._num_timesteps = len(tensor_lists[0])
+
+    tensors = tuple(
+        array_ops.concat(tensor_list, 0) for tensor_list in tensor_lists)
+    if has_bias:
+      tensors = tuple(_append_homog(tensor) for tensor in tensors)
+    self._tensors = tensors
+
+    self._cov_dt1 = None
+    self._option1quants_by_damping = {}
+    self._option2quants_by_damping = {}
+    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
+
+    super(FullyConnectedMultiKF, self).__init__()
+
+  @property
+  def _var_scope(self):
+    return "ff_fc_multi/" + self._orig_tensors_name
+
+  @property
+  def _num_sources(self):
+    return len(self._tensors)
+
+  @property
+  def _dtype(self):
+    return self._tensors[0].dtype
+
+  def make_covariance_update_op(self, ema_decay):
+    with _maybe_colocate_with(self._tensors,
+                              self._colocate_cov_ops_with_inputs):
+      op = super(FullyConnectedMultiKF,
+                 self).make_covariance_update_op(ema_decay)
+
+      if self._cov_dt1 is not None:
+        new_cov_dt1 = math_ops.add_n(
+            tuple(
+                self._compute_new_cov_dt1(idx)
+                for idx in range(self._num_sources)))
+        op2 = moving_averages.assign_moving_average(
+            self._cov_dt1, new_cov_dt1, ema_decay, zero_debias=ZERO_DEBIAS)
+
+        # TODO(b/69112164):
+        # It's important that _cov and _cov_dt1 remain consistent with each
+        # other while the inverse ops are happening. How can we ensure this?
+        # We will need to add explicit synchronization for this to
+        # work with asynchronous training.
+        op = control_flow_ops.group(op, op2)
+
+    return op
+
+  def _compute_new_cov(self, idx=0):
+    tensor = self._tensors[idx]
+    normalizer = self._num_timesteps * self._batch_size
+    return _compute_cov(tensor, normalizer=normalizer)
+
+  def _compute_new_cov_dt1(self, idx=0):
+    tensor = self._tensors[idx]
+    normalizer = self._num_timesteps * self._batch_size
+    tensor_present = tensor[:-self._batch_size, :]
+    tensor_future = tensor[self._batch_size:, :]
+    return _compute_cov(
+        tensor_future, tensor_right=tensor_present, normalizer=normalizer)
+
+  @property
+  def _cov_shape(self):
+    size = self._tensors[0].shape[1]
+    return [size, size]
+
+  @property
+  def _vec_shape(self):
+    size = self._tensors[0].shape[1]
+    return [size]
+
+  def get_option1quants(self, damping):
+    return self._option1quants_by_damping[damping]
+
+  def get_option2quants(self, damping):
+    return self._option2quants_by_damping[damping]
+
+  def get_cov_dt1(self):
+    assert self._cov_dt1 is not None
+    return self._cov_dt1
+
+  def register_cov_dt1(self):
+    """Create a variable representing temporal cross-covariance.
+
+    (This is technically the second moment, not covariance, since it's
+    not mean subtracted.)
+    """
+    if self._cov_dt1 is None:
+      with variable_scope.variable_scope(self._var_scope):
+        self._cov_dt1 = variable_scope.get_variable(
+            "cov_dt1",
+            initializer=init_ops.zeros_initializer,
+            shape=self._cov_shape,
+            trainable=False,
+            dtype=self._dtype)
+
+  def register_option1quants(self, damping):
+
+    self.register_eigendecomp()
+    self.register_cov_dt1()
+
+    if damping not in self._option1quants_by_damping:
+      # It's questionable as to whether we should initialize with stuff like
+      # this at all.  Ideally these values should never be used until they are
+      # updated at least once.
+      damping_string = scalar_or_tensor_to_string(damping)
+      with variable_scope.variable_scope(self._var_scope):
+        Lmat = variable_scope.get_variable(  # pylint: disable=invalid-name
+            "Lmat_damp{}".format(damping_string),
+            initializer=inverse_initializer,
+            shape=self._cov_shape,
+            trainable=False,
+            dtype=self._dtype)
+        psi = variable_scope.get_variable(
+            "psi_damp{}".format(damping_string),
+            initializer=init_ops.ones_initializer,
+            shape=self._vec_shape,
+            trainable=False,
+            dtype=self._dtype)
+
+      self._option1quants_by_damping[damping] = (Lmat, psi)
+
+  def register_option2quants(self, damping):
+
+    self.register_eigendecomp()
+    self.register_cov_dt1()
+
+    if damping not in self._option2quants_by_damping:
+      # It's questionable as to whether we should initialize with stuff like
+      # this at all.  Ideally these values should never be used until they are
+      # updated at least once.
+      damping_string = scalar_or_tensor_to_string(damping)
+      with variable_scope.variable_scope(self._var_scope):
+        Pmat = variable_scope.get_variable(  # pylint: disable=invalid-name
+            "Lmat_damp{}".format(damping_string),
+            initializer=inverse_initializer,
+            shape=self._cov_shape,
+            trainable=False,
+            dtype=self._dtype)
+        Kmat = variable_scope.get_variable(  # pylint: disable=invalid-name
+            "Kmat_damp{}".format(damping_string),
+            initializer=inverse_initializer,
+            shape=self._cov_shape,
+            trainable=False,
+            dtype=self._dtype)
+        mu = variable_scope.get_variable(
+            "mu_damp{}".format(damping_string),
+            initializer=init_ops.ones_initializer,
+            shape=self._vec_shape,
+            trainable=False,
+            dtype=self._dtype)
+
+      self._option2quants_by_damping[damping] = (Pmat, Kmat, mu)
+
+  def make_inverse_update_ops(self):
+    """Create and return update ops corresponding to registered computations."""
+    # TODO(b/69918258): Add correctness tests for this method.
+    # pylint: disable=invalid-name
+
+    ops = super(FullyConnectedMultiKF, self).make_inverse_update_ops()
+
+    if (len(self._option1quants_by_damping) +
+        len(self._option2quants_by_damping)):
+
+      # Note that C0 and C1 are stand-ins for A0 and A1, or G0 and G1, from
+      # the pseudo-code in the original paper.  Because the computations for
+      # the A and G case are essentially the same they can both be performed by
+      # the same class (this one).
+
+      C1 = self.get_cov_dt1()
+
+      # Get the eigendecomposition of C0  (= self.get_cov())
+      eigen_e, eigen_V = self.get_eigendecomp()
+
+      # TODO(b/69678661): Note, there is an implicit assumption here that C1
+      # and C0 (as represented here by its eigen-decomp) are consistent.  This
+      # could fail to be the case if self._cov and self._cov_dt1 are not updated
+      # consistently, or are somehow read between or during the cov updates.
+      # Can this possibly happen?  Is there a way to prevent it?
+
+      for damping, (Lmat_var,
+                    psi_var) in self._option1quants_by_damping.items():
+
+        invsqrtC0 = math_ops.matmul(
+            eigen_V * (eigen_e + damping)**(-0.5), eigen_V, transpose_b=True)
+
+        # Might need to enforce symmetry lost due to numerical issues.
+        invsqrtC0 = (invsqrtC0 + array_ops.transpose(invsqrtC0)) / 2.0
+
+        # The following line imposses the symmetry assumed by "Option 1" on C1.
+        # Stangely the code can work okay with this line commented out,
+        # depending on how psd_eig is defined.  I'm not sure why.
+        C1 = (C1 + array_ops.transpose(C1)) / 2.0
+
+        # hPsi = C0^(-1/2) * C1 * C0^(-1/2)  (hPsi means \hat{Psi})
+        hPsi = math_ops.matmul(math_ops.matmul(invsqrtC0, C1), invsqrtC0)
+
+        # Compute the decomposition U*diag(psi)*U^T = hPsi
+        psi, U = utils.posdef_eig(hPsi)
+
+        # L = C0^(-1/2) * U
+        Lmat = math_ops.matmul(invsqrtC0, U)
+
+        ops.append(Lmat_var.assign(Lmat))
+        ops.append(psi_var.assign(psi))
+
+      for damping, (Pmat_var, Kmat_var,
+                    mu_var) in self._option2quants_by_damping.items():
+
+        # compute C0^(-1/2)
+        invsqrtC0 = math_ops.matmul(
+            eigen_V * (eigen_e + damping)**(-0.5), eigen_V, transpose_b=True)
+
+        # Might need to enforce symmetry lost due to numerical issues.
+        invsqrtC0 = (invsqrtC0 + array_ops.transpose(invsqrtC0)) / 2.0
+
+        # Compute the product C0^(-1/2) * C1
+        invsqrtC0C1 = math_ops.matmul(invsqrtC0, C1)
+
+        # hPsi = C0^(-1/2) * C1 * C0^(-1/2)  (hPsi means \hat{Psi})
+        hPsi = math_ops.matmul(invsqrtC0C1, invsqrtC0)
+
+        # Compute the decomposition E*diag(mu)*E^T = hPsi^T * hPsi
+        # Note that we using the notation mu instead of "m" for the eigenvalues.
+        # Instead of computing the product hPsi^T * hPsi and then doing an
+        # eigen-decomposition of this we just compute the SVD of hPsi and then
+        # square the singular values to get the eigenvalues. For a justification
+        # of this approach, see:
+        # https://en.wikipedia.org/wiki/Singular-value_decomposition#Relation_to_eigenvalue_decomposition
+        sqrtmu, _, E = linalg_ops.svd(hPsi)
+        mu = math_ops.square(sqrtmu)
+
+        # Mathematically, the eigenvalues should not should not exceed 1.0, but
+        # due to numerical issues, or possible issues with inconsistent
+        # values of C1 and (the eigen-decomposition of) C0 they might. So
+        # we enforce this condition.
+        mu = math_ops.minimum(mu, 1.0)
+
+        # P = (C0^(-1/2) * C1)^T * C0^(-1/2) = C_1^T * C_0^(-1)
+        Pmat = math_ops.matmul(invsqrtC0C1, invsqrtC0, transpose_a=True)
+
+        # K = C_0^(-1/2) * E
+        Kmat = math_ops.matmul(invsqrtC0, E)
+
+        ops.append(Pmat_var.assign(Pmat))
+        ops.append(Kmat_var.assign(Kmat))
+        ops.append(mu_var.assign(mu))
+
+    return [control_flow_ops.group(*ops)]
+
+    # pylint: enable=invalid-name
diff --git a/tensorflow/contrib/kfac/python/ops/layer_collection.py b/tensorflow/contrib/kfac/python/ops/layer_collection.py
index 2139a261e05e33bcb650f31d5d9e85f592009ba6..ca42afe6fb2f5c7d7de8b5b087dc11be30a75d5e 100644
--- a/tensorflow/contrib/kfac/python/ops/layer_collection.py
+++ b/tensorflow/contrib/kfac/python/ops/layer_collection.py
@@ -26,7 +26,9 @@ from __future__ import print_function
 
 from collections import defaultdict
 from collections import OrderedDict
+from functools import partial
 
+import math
 import six
 
 from tensorflow.contrib.kfac.python.ops import fisher_blocks as fb
@@ -35,7 +37,6 @@ from tensorflow.contrib.kfac.python.ops import utils
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 
 # Names for various approximations that can be requested for Fisher blocks.
@@ -58,12 +59,22 @@ _CONV2D_APPROX_TO_BLOCK_TYPES = {
     APPROX_DIAGONAL_NAME: fb.ConvDiagonalFB,
 }
 
+APPROX_KRONECKER_INDEP_NAME = "kron_indep"
+APPROX_KRONECKER_SERIES_1_NAME = "kron_series_1"
+APPROX_KRONECKER_SERIES_2_NAME = "kron_series_2"
+
+_FULLY_CONNECTED_MULTI_APPROX_TO_BLOCK_TYPES = {
+    APPROX_KRONECKER_INDEP_NAME: fb.FullyConnectedMultiIndepFB,
+    APPROX_KRONECKER_SERIES_1_NAME: partial(fb.FullyConnectedSeriesFB,
+                                            option=1),
+    APPROX_KRONECKER_SERIES_2_NAME: partial(fb.FullyConnectedSeriesFB,
+                                            option=2)
+}
+
 # Possible value for 'reuse' keyword argument. Sets 'reuse' to
 # tf.get_variable_scope().reuse.
 VARIABLE_SCOPE = "VARIABLE_SCOPE"
 
-# TODO(jamesmartens): need to add find_canonical_output back into this somewhere
-
 
 def ensure_sequence(obj):
   """If `obj` isn't a tuple or list, return a tuple containing `obj`."""
@@ -129,7 +140,10 @@ class LayerCollection(object):
         sum.
   """
 
-  def __init__(self, graph=None, name="LayerCollection"):
+  def __init__(self,
+               graph=None,
+               colocate_cov_ops_with_inputs=False,
+               name="LayerCollection"):
     self.fisher_blocks = LayerParametersDict()
     self.fisher_factors = OrderedDict()
     self._linked_parameters = dict(
@@ -140,6 +154,9 @@ class LayerCollection(object):
     self._default_generic_approximation = APPROX_FULL_NAME
     self._default_fully_connected_approximation = APPROX_KRONECKER_NAME
     self._default_convolution_2d_approximation = APPROX_KRONECKER_NAME
+    self._default_fully_connected_multi_approximation = (
+        APPROX_KRONECKER_SERIES_2_NAME)
+    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
 
     with variable_scope.variable_scope(None, default_name=name) as scope:
       self._var_scope = scope.name
@@ -149,19 +166,13 @@ class LayerCollection(object):
     """LossFunctions registered with this LayerCollection."""
     return list(self._loss_dict.values())
 
-  def is_variable_registered(self, variable):
-    """Checks whether the variable has already been registered.
-
-    Args:
-      variable: A single variable or tensor.
-    Returns:
-      True if the variable has been registered either by itself or as part of a
-      tuple.
-    """
-    return any([
-        variable in key if isinstance(key, (tuple, list)) else variable == key
-        for key in self.fisher_blocks.keys()
-    ])
+  @property
+  def registered_variables(self):
+    """A tuple of all of the variables currently registered."""
+    tuple_of_tuples = (ensure_sequence(key) for key, block
+                       in six.iteritems(self.fisher_blocks))
+    flat_tuple = tuple(item for tuple_ in tuple_of_tuples for item in tuple_)
+    return flat_tuple
 
   @property
   def linked_parameters(self):
@@ -181,8 +192,7 @@ class LayerCollection(object):
   def default_generic_approximation(self):
     return self._default_generic_approximation
 
-  @default_generic_approximation.setter
-  def default_generic_approximation(self, value):
+  def set_default_generic_approximation(self, value):
     if value not in _GENERIC_APPROX_TO_BLOCK_TYPES:
       raise ValueError(
           "{} is not a valid approximation for generic variables.".format(
@@ -193,8 +203,7 @@ class LayerCollection(object):
   def default_fully_connected_approximation(self):
     return self._default_fully_connected_approximation
 
-  @default_fully_connected_approximation.setter
-  def default_fully_connected_approximation(self, value):
+  def set_default_fully_connected_approximation(self, value):
     if value not in _FULLY_CONNECTED_APPROX_TO_BLOCK_TYPES:
       raise ValueError(
           "{} is not a valid approximation for fully connected layers.".format(
@@ -205,50 +214,44 @@ class LayerCollection(object):
   def default_conv2d_approximation(self):
     return self._default_convolution_2d_approximation
 
-  @default_conv2d_approximation.setter
-  def default_conv2d_approximation(self, value):
+  def set_default_conv2d_approximation(self, value):
     if value not in _CONV2D_APPROX_TO_BLOCK_TYPES:
       raise ValueError(
           "{} is not a valid approximation for 2d convolutional layers.".format(
               value))
     self._default_convolution_2d_approximation = value
 
+  @property
+  def default_fully_connected_multi_approximation(self):
+    return self._default_fully_connected_multi_approximation
+
+  def set_default_fully_connected_multi_approximation(self, value):
+    if value not in _FULLY_CONNECTED_MULTI_APPROX_TO_BLOCK_TYPES:
+      raise ValueError("{} is not a valid approximation for a fully-connected "
+                       "multi layer.".format(value))
+    self._default_fully_connected_multi_approximation = value
+
   def register_block(self, layer_key, fisher_block, reuse=VARIABLE_SCOPE):
     """Validates and registers the layer_key associated with the fisher_block.
 
-    Validation consists of checking whether the key was already registered or
-    if any of the elements of layer_key (if it's a tuple) were already
-    registered as part of another tuple (throws an error if so). If any of the
-    elements were registered by themselves, or as part of tuples that are
-    subsets of this layer_key, those registrations are first removed.
-
-    If the layer_key is a subset of an existing registration, registration of
-    the new, smaller layer_key is skipped.
-
-    e.g. If registrations include {'a': foo, ('b', 'c'): bar}, then
-      - register_layer('a', baz) -> ValueError
-      - register_layer(('b', 'c', 'd'), baz) ->
-        {'a': foo, ('b', 'c', 'd'): baz}
-      - register_layer('b', baz) ->
-        {'a': foo, ('b', 'c'): bar} (No change)
-      - register_layer(('a', 'd'), baz) ->
-        {('a', 'd'): baz, ('b', 'c'): bar}
-      - register_layer(('b', 'd'), baz) -> ValueError
-
     Args:
-      layer_key: The key to check for in existing registrations and to register
-          if valid.
-      fisher_block: The associated fisher block.
-      reuse: Method to use for inserting new FisherBlocks. One of True, False,
-        or VARIABLE_SCOPE.
+      layer_key: A variable or tuple of variables. The key to check for in
+          existing registrations and to register if valid.
+      fisher_block: The associated `FisherBlock`.
+      reuse: Method to use for inserting new `FisherBlock`s. One of True, False,
+        or 'VARIABLE_SCOPE'.
 
     Raises:
-      ValueError: If the layer_key was already registered, or if a subset of the
-          layer_key has already been registered as part of a different tuple.
+      ValueError: If `layer_key` was already registered and reuse is `False`,
+        if `layer_key` was registered with a different block type, or if
+        `layer_key` shares any variables with but is not equal to a previously
+        registered key.
+      KeyError: If `reuse` is `True` but `layer_key` was not previously
+        registered.
 
     Returns:
-      FisherBlock registered under 'layer_key'. May or may not be the same as
-      'fisher_block'.
+      The `FisherBlock` registered under `layer_key`. If `layer_key` was already
+      registered, this will be the previously registered `FisherBlock`.
     """
     if reuse is VARIABLE_SCOPE:
       reuse = variable_scope.get_variable_scope().reuse
@@ -268,110 +271,84 @@ class LayerCollection(object):
     # Insert fisher_block into self.fisher_blocks.
     if layer_key in self.fisher_blocks:
       raise ValueError("Duplicate registration: {}".format(layer_key))
-    if isinstance(layer_key, (tuple, list)):
-      return self._register_block_with_sequence_key(layer_key, fisher_block)
-    else:
-      return self._register_block_with_nonsequence_key(layer_key, fisher_block)
-
-  def _register_block_with_sequence_key(self, layer_key, fisher_block):
-    """Validates and registers the layer_key if it's a sequence."""
-    # Find all keys that are either supersets or subsets of 'layer_key'.
-    inclusions = {
-        fisher_elt
-        for layer_elt in layer_key
-        for fisher_elt in self.fisher_blocks
-        if self._equal_or_subset(layer_elt, fisher_elt)
+    # Raise an error if any variable in layer_key has been registered in any
+    # other blocks.
+    variable_to_block = {
+        var: (params, block)
+        for (params, block) in self.fisher_blocks.items()
+        for var in ensure_sequence(params)
     }
-
-    if not inclusions:
-      self.fisher_blocks[layer_key] = fisher_block
-      return fisher_block
-
-    result_key = None
-    for key in inclusions:
-      fisher_block_key = key if isinstance(key, (tuple, list)) else (key,)
-      in_existing_only = set(fisher_block_key) - set(layer_key)
-      in_new_only = set(layer_key) - set(fisher_block_key)
-
-      if in_existing_only and in_new_only:
-        # Existing and new key have an intersection but neither is a subset of
-        # the other. This is an error.
+    for variable in ensure_sequence(layer_key):
+      if variable in variable_to_block:
+        prev_key, prev_block = variable_to_block[variable]
         raise ValueError(
-            "Inconsistent registration, expected new key to be a subset or "
-            "superset of the existing key: existing is {}, new is {}".format(
-                key, layer_key))
-      elif in_existing_only and not in_new_only:
-        # Existing key is strict superset of new key. Return existing
-        # FisherBlock.
-        logging.warning("Graph Registration Warning: tried to register "
-                        "a subset ({}) of an already registered tuple "
-                        "({}), skipping".format(layer_key, fisher_block_key))
-        assert result_key is None
-        result_key = key
-      elif in_new_only and not in_existing_only:
-        # Existing key is a strict subset of new key. Replace existing
-        # FisherBlock with new one.
-        #
-        # TODO(b/68715045): This is dangerous. If there are existing
-        # registrations for a minibatch from elsewhere in the graph, they won't
-        # be re-registered with this new FisherBlock. The type of FisherBlock
-        # could also change here.
-        logging.warning(
-            "Replacing existing FisherBlock for key {} with new FisherBlock "
-            "for key {}. {} registered minibatches from the existing "
-            "FisherBlock will not be migrated.".format(
-                key, layer_key,
-                self.fisher_blocks[key].num_registered_minibatches))
-        self.fisher_blocks.pop(key)
-        self.fisher_blocks[layer_key] = fisher_block
-        assert result_key is None
-        result_key = layer_key
-      elif not in_new_only and not in_existing_only:
-        # Existing and new are identical. Reuse the old FisherBlock.
-        #
-        # TODO(b/68715045): This is dangerous. If the new FisherBlock has
-        # existing registered minibatches, they will not be migrated to the
-        # existing FisherBlock.
-        assert result_key is None
-        result_key = key
-      else:
-        raise ValueError("Unexpected layer key conflict: {} vs. {}".format(
-            layer_key, key))
-
-    return self.fisher_blocks[result_key]
-
-  def _register_block_with_nonsequence_key(self, layer_key, fisher_block):
-    """Validates and registers the layer_key if it's not a sequence."""
-    inclusions = {
-        fisher_elt
-        for fisher_elt in self.fisher_blocks
-        if self._equal_or_subset(layer_key, fisher_elt)
-    }
-
-    if not inclusions:
-      self.fisher_blocks[layer_key] = fisher_block
-    else:
-      logging.warning("Graph Registration Warning: tried to register "
-                      "variable ({}) but a containing tuple was already "
-                      "registered ({}), skipping".format(layer_key, inclusions))
-
+            "Attempted to register layer_key {} with block {}, but variable {}"
+            " was already registered in key {} with block {}.".format(
+                layer_key, fisher_block, variable, prev_key, prev_block))
+    self.fisher_blocks[layer_key] = fisher_block
     return fisher_block
 
-  def _equal_or_subset(self, elt1, elt2):
-    """Checks if the elements are equal or one is contained in the other."""
-    return (elt1 == elt2 or (isinstance(elt1,
-                                        (tuple, list)) and elt2 in elt1) or
-            (isinstance(elt2, (tuple, list)) and elt1 in elt2))
-
   def get_use_count_map(self):
     """Returns a dict of variables to their number of registrations."""
+    # TODO(b/70283403): Reimplement this in the old way, where each
+    # registration function would be responsible for incrementing the count.
+    # Also, this version has a bug: it won't do the right thing for generic
+    # registration for parameters that are shared.  i.e. it won't set the use
+    # count to infinity.
     vars_to_uses = defaultdict(int)
     for key, block in six.iteritems(self.fisher_blocks):
-      key = key if isinstance(key, (tuple, list)) else (key,)
+      n = (
+          block.num_inputs()*block.num_registered_minibatches if isinstance(
+              block, (fb.FullyConnectedSeriesFB, fb.FullyConnectedMultiIndepFB))
+          else block.num_registered_minibatches)
+      key = ensure_sequence(key)
       for k in key:
-        vars_to_uses[k] += block.num_registered_minibatches
+        vars_to_uses[k] += n
     return vars_to_uses
 
+  def check_registration(self, variables):
+    """Checks that all variable uses have been registered properly.
+
+    Args:
+      variables: List of variables.
+
+    Raises:
+      ValueError: If any registered variables are not included in the list.
+      ValueError: If any variable in the list is not registered.
+      ValueError: If any variable in the list is registered with the wrong
+          number of "uses" in the subgraph recorded (vs the number of times that
+          variable is actually used in the subgraph).
+    """
+    # Note that overlapping parameters (i.e. those that share variables) will
+    # be caught by layer_collection.LayerParametersDict during registration.
+
+    reg_use_map = self.get_use_count_map()
+
+    error_messages = []
+
+    for var in variables:
+      total_uses = self.subgraph.variable_uses(var)
+      reg_uses = reg_use_map[var]
+
+      if reg_uses == 0:
+        error_messages.append("Variable {} not registered.".format(var))
+      elif (not math.isinf(reg_uses)) and reg_uses != total_uses:
+        error_messages.append(
+            "Variable {} registered with wrong number of uses ({} "
+            "registrations vs {} uses).".format(var, reg_uses, total_uses))
+
+    num_get_vars = len(reg_use_map)
+
+    if num_get_vars > len(variables):
+      error_messages.append("{} registered variables were not included in list."
+                            .format(num_get_vars - len(variables)))
+
+    if error_messages:
+      error_messages = [
+          "Found the following errors with variable registration:"
+      ] + error_messages
+      raise ValueError("\n\t".join(error_messages))
+
   def get_blocks(self):
     return self.fisher_blocks.values()
 
@@ -463,11 +440,11 @@ class LayerCollection(object):
         this layer. Weight matrix should have shape [input_size, output_size].
         Bias should have shape [output_size].
       inputs: Tensor of shape [batch_size, input_size]. Inputs to layer.
-      outputs: Tensor of shape [batch_size, output_size]. Preactivations
+      outputs: Tensor of shape [batch_size, output_size]. Outputs
         produced by layer.
-      approx: str. One of APPROX_KRONECKER_NAME or APPROX_DIAGONAL_NAME.
+      approx: str. One of "kron" or "diagonal".
       reuse: bool or str.  If True, reuse an existing FisherBlock. If False,
-        create a new FisherBlock.  If VARIABLE_SCOPE, use
+        create a new FisherBlock.  If "VARIABLE_SCOPE", use
         tf.get_variable_scope().reuse.
 
     Raises:
@@ -509,10 +486,10 @@ class LayerCollection(object):
       inputs: Tensor of shape [batch_size, height, width, in_channels]. Inputs
         to layer.
       outputs: Tensor of shape [batch_size, height, width, out_channels].
-        Preactivations produced by layer.
-      approx: str. One of APPROX_KRONECKER_NAME or APPROX_DIAGONAL_NAME.
+        Output produced by layer.
+      approx: str. One of "kron" or "diagonal".
       reuse: bool or str.  If True, reuse an existing FisherBlock. If False,
-        create a new FisherBlock.  If VARIABLE_SCOPE, use
+        create a new FisherBlock.  If "VARIABLE_SCOPE", use
         tf.get_variable_scope().reuse.
 
     Raises:
@@ -542,14 +519,11 @@ class LayerCollection(object):
     """Registers a generic layer.
 
     Args:
-      params: Tensor or 2-tuple of Tensors corresponding to weight and bias of
-        this layer. Weight matrix should have shape [kernel_height,
-        kernel_width, in_channels, out_channels].  Bias should have shape
-        [out_channels].
+      params: Tensor or tuple of Tensors corresponding to the parameters.
       batch_size: 0-D Tensor. Size of the minibatch.
-      approx: str. One of APPROX_KRONECKER_NAME or APPROX_DIAGONAL_NAME.
+      approx: str. One of "full" or "diagonal".
       reuse: bool or str.  If True, reuse an existing FisherBlock. If False,
-        create a new FisherBlock.  If VARIABLE_SCOPE, use
+        create a new FisherBlock.  If "VARIABLE_SCOPE", use
         tf.get_variable_scope().reuse.
 
     Raises:
@@ -570,6 +544,47 @@ class LayerCollection(object):
     block = self.register_block(params, block_type(self, params), reuse=reuse)
     block.register_additional_minibatch(batch_size)
 
+  def register_fully_connected_multi(self, params, inputs, outputs,
+                                     approx=None):
+    """Register fully connected layers with shared parameters.
+
+    This can handle general fully-connected layers with shared parameters, but
+    has specialized approximations to deal with the case where there is a
+    meaningful linear order to the share instances (such as in an RNN).
+
+    Args:
+      params: Tensor or 2-tuple of Tensors corresponding to weight and bias of
+        this layer. Weight matrix should have shape [input_size, output_size].
+        Bias should have shape [output_size].
+      inputs: A list of tensors, each of shape [batch_size, input_size]. Inputs
+        to layer. In the case of RNNs, one Tensor per time step.
+      outputs: A list of tensors, the same length as 'inputs', each of shape
+        [batch_size, output_size]. Outputs produced by layer. In the case of
+        RNNs, one Tensor per time step.
+      approx: str. One of "kron_indep", "kron_series_1", or "kron_series_2".
+
+    Raises:
+      ValueError: For improper value to 'approx'.
+    """
+    if approx is None:
+      approx = self._get_linked_approx(params)
+      if approx is None:
+        approx = self.default_fully_connected_multi_approximation
+    has_bias = isinstance(params, (tuple, list))
+
+    # TODO(b/70283649): something along the lines of find_canonical_output
+    # should be added back in here (and for the other block types, arguably).
+
+    if approx not in _FULLY_CONNECTED_MULTI_APPROX_TO_BLOCK_TYPES:
+      raise ValueError("Bad value {} for approx.".format(approx))
+    block_type = _FULLY_CONNECTED_MULTI_APPROX_TO_BLOCK_TYPES[approx]
+
+    # For now we don't support multiple minibatches for this type of layer, so
+    # we set reuse=False
+    self.register_block(params,
+                        block_type(self, inputs, outputs, has_bias=has_bias),
+                        reuse=False)
+
   def register_categorical_predictive_distribution(self,
                                                    logits,
                                                    seed=None,
@@ -710,6 +725,9 @@ class LayerCollection(object):
            "LayerCollection.fisher_factors. The pair cannot be hashed.").format(
                cls, args))
 
-    with variable_scope.variable_scope(self._var_scope):
-      return utils.setdefault(self.fisher_factors, (cls, args),
-                              lambda: cls(*args))
+    key = cls, args
+    if key not in self.fisher_factors:
+      colo = self._colocate_cov_ops_with_inputs
+      with variable_scope.variable_scope(self._var_scope):
+        self.fisher_factors[key] = cls(*args, colocate_cov_ops_with_inputs=colo)
+    return self.fisher_factors[key]
diff --git a/tensorflow/contrib/kfac/python/ops/loss_functions.py b/tensorflow/contrib/kfac/python/ops/loss_functions.py
index e2e5bc3ffea3e52087c24802948bc8260e3b199a..d449abcfa78b361b9d4774ca5c2e936f14f65433 100644
--- a/tensorflow/contrib/kfac/python/ops/loss_functions.py
+++ b/tensorflow/contrib/kfac/python/ops/loss_functions.py
@@ -91,13 +91,13 @@ class LossFunction(object):
 
   @abc.abstractmethod
   def _evaluate(self, targets):
-    """Evaluates the log probability of the targets.
+    """Evaluates the negative log probability of the targets.
 
     Args:
       targets: Tensor that distribution can calculate log_prob() of.
 
     Returns:
-      log probability of each target, summed across all targets.
+      negative log probability of each target, summed across all targets.
     """
     pass
 
diff --git a/tensorflow/contrib/kfac/python/ops/optimizer.py b/tensorflow/contrib/kfac/python/ops/optimizer.py
index 88299e495cb3069280cd3ae33d1cdd65f653a01b..ecf7f3e4e5ab7d9c151f760fdab733bc3830e37b 100644
--- a/tensorflow/contrib/kfac/python/ops/optimizer.py
+++ b/tensorflow/contrib/kfac/python/ops/optimizer.py
@@ -35,17 +35,20 @@ from tensorflow.python.training import gradient_descent
 class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
   """The KFAC Optimizer (https://arxiv.org/abs/1503.05671)."""
 
-  def __init__(
-      self,
-      learning_rate,
-      cov_ema_decay,
-      damping,
-      layer_collection,
-      momentum=0.,
-      momentum_type="regular",
-      norm_constraint=None,
-      name="KFAC",
-      estimation_mode="gradients"):
+  def __init__(self,
+               learning_rate,
+               cov_ema_decay,
+               damping,
+               layer_collection,
+               var_list=None,
+               momentum=0.,
+               momentum_type="regular",
+               norm_constraint=None,
+               name="KFAC",
+               estimation_mode="gradients",
+               colocate_gradients_with_ops=False,
+               cov_devices=None,
+               inv_devices=None):
     """Initializes the KFAC optimizer with the given settings.
 
     Args:
@@ -64,6 +67,9 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
           blocks, kronecker factors, and losses associated with the
           graph.  The layer_collection cannot be modified after KfacOptimizer's
           initialization.
+      var_list: Optional list or tuple of variables to train. Defaults to the
+          list of variables collected in the graph under the key
+          `GraphKeys.TRAINABLE_VARIABLES`.
       momentum: The momentum value for this optimizer. Only applies when
           momentum_type is 'regular' or 'adam'. (Default: 0)
       momentum_type: The type of momentum to use in this optimizer, one of
@@ -77,6 +83,14 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
           'gradients', 'empirical', 'curvature_propagation', or 'exact'.
           (Default: 'gradients'). See the doc-string for FisherEstimator for
           more a more detailed description of these options.
+      colocate_gradients_with_ops: Whether we should request gradients we
+          compute in the estimator be colocated with their respective ops.
+      cov_devices: Iterable of device strings (e.g. '/gpu:0'). Covariance
+          computations will be placed on these devices in a round-robin fashion.
+          Can be None, which means that no devices are specified.
+      inv_devices: Iterable of device strings (e.g. '/gpu:0'). Inversion
+          computations will be placed on these devices in a round-robin fashion.
+          Can be None, which means that no devices are specified.
 
     Raises:
       ValueError: If the momentum type is unsupported.
@@ -86,13 +100,19 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
           or 'adam'.
     """
 
-    # We may consider determining the set of variables some other way, but for
-    # now it's just all the trainable variables.
-    variables = tf_variables.trainable_variables()
+    variables = var_list
+    if variables is None:
+      variables = tf_variables.trainable_variables()
 
-    self._fisher_est = est.FisherEstimator(variables, cov_ema_decay, damping,
-                                           layer_collection,
-                                           estimation_mode=estimation_mode)
+    self._fisher_est = est.FisherEstimator(
+        variables,
+        cov_ema_decay,
+        damping,
+        layer_collection,
+        estimation_mode=estimation_mode,
+        colocate_gradients_with_ops=colocate_gradients_with_ops,
+        cov_devices=cov_devices,
+        inv_devices=inv_devices)
 
     momentum_type = momentum_type.lower()
     legal_momentum_types = ["regular", "adam", "qmodel"]
@@ -107,7 +127,7 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
       raise ValueError("Momentum must be unspecified if using a momentum_type "
                        "other than 'regular' or 'adam'.")
 
-    self._momentum = ops.convert_to_tensor(momentum, name="momentum")
+    self._momentum = momentum
     self._momentum_type = momentum_type
     self._norm_constraint = norm_constraint
 
@@ -131,16 +151,24 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
     return self._fisher_est.damping
 
   def minimize(self, *args, **kwargs):
-
-    if "var_list" not in kwargs:
-      kwargs["var_list"] = tf_variables.trainable_variables()
-
+    kwargs["var_list"] = kwargs.get("var_list") or self.variables
     if set(kwargs["var_list"]) != set(self.variables):
       raise ValueError("var_list doesn't match with set of Fisher-estimating "
                        "variables.")
-
     return super(KfacOptimizer, self).minimize(*args, **kwargs)
 
+  def compute_gradients(self, *args, **kwargs):
+    # args[1] could be our var_list
+    if len(args) > 1:
+      var_list = args[1]
+    else:
+      kwargs["var_list"] = kwargs.get("var_list") or self.variables
+      var_list = kwargs["var_list"]
+    if set(var_list) != set(self.variables):
+      raise ValueError("var_list doesn't match with set of Fisher-estimating "
+                       "variables.")
+    return super(KfacOptimizer, self).compute_gradients(*args, **kwargs)
+
   def apply_gradients(self, grads_and_vars, *args, **kwargs):
     """Applies gradients to variables.
 
@@ -297,14 +325,17 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
         self._batch_size, dtype=fft_precon_grads[0].dtype)
 
     # compute the entries of the 2x2 matrix
-    m_11 = (_inner_product_list(fft_precon_grads, fft_precon_grads) / batch_size
-            + self.damping * _inner_product_list(precon_grads, precon_grads))
+    m_11 = (
+        _inner_product_list(fft_precon_grads, fft_precon_grads) / batch_size +
+        self.damping * _inner_product_list(precon_grads, precon_grads))
 
-    m_21 = (_inner_product_list(fft_prev_updates, fft_precon_grads) / batch_size
-            + self.damping * _inner_product_list(prev_updates, precon_grads))
+    m_21 = (
+        _inner_product_list(fft_prev_updates, fft_precon_grads) / batch_size +
+        self.damping * _inner_product_list(prev_updates, precon_grads))
 
-    m_22 = (_inner_product_list(fft_prev_updates, fft_prev_updates) / batch_size
-            + self.damping * _inner_product_list(prev_updates, prev_updates))
+    m_22 = (
+        _inner_product_list(fft_prev_updates, fft_prev_updates) / batch_size +
+        self.damping * _inner_product_list(prev_updates, prev_updates))
 
     def non_zero_prevupd_case():
       r"""Computes optimal (alpha, mu) given non-zero previous update.
@@ -390,8 +421,8 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
       grads = list(grad for (grad, _) in grads_and_vars)
       variables = list(var for (_, var) in grads_and_vars)
       # previous updates are the negative velocities (up to scaling by LR)
-      prev_updates = list(-self._zeros_slot(var, "velocity", self._name)
-                          for var in variables)
+      prev_updates = list(
+          -self._zeros_slot(var, "velocity", self._name) for var in variables)
 
       # Compute optimal velocity update parameters according to quadratic model
       alpha, mu, _ = self._compute_qmodel_hyperparams(
diff --git a/tensorflow/contrib/kfac/python/ops/utils.py b/tensorflow/contrib/kfac/python/ops/utils.py
index 0fd7f5147739f0f46d2ab6a1c284c6dc75f53cc2..cec018e406bc51c07f5cafcc2c38efe7e9601618 100644
--- a/tensorflow/contrib/kfac/python/ops/utils.py
+++ b/tensorflow/contrib/kfac/python/ops/utils.py
@@ -28,9 +28,9 @@ from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 
-
 # Method used for inverting matrices.
 POSDEF_INV_METHOD = "cholesky"
+POSDEF_EIG_METHOD = "self_adjoint"
 
 
 def set_global_constants(posdef_inv_method=None):
@@ -64,13 +64,6 @@ class SequenceDict(object):
     return list(self._dict.items())
 
 
-def setdefault(dct, key, thunk):
-  """Like dict.setdefault but delays evaluation of the value to be set."""
-  if key not in dct:
-    dct[key] = thunk()
-  return dct[key]
-
-
 def tensors_to_column(tensors):
   """Converts a tensor or list of tensors to a column vector.
 
@@ -169,33 +162,11 @@ def mat2d_to_layer_params(vector_template, mat2d):
     return array_ops.reshape(mat2d, vector_template.shape)
 
 
-def compute_pi(left_factor, right_factor):
-  """Computes the scalar constant pi for Tikhonov regularization/damping.
-
-  pi = sqrt( (trace(A) / dim(A)) / (trace(B) / dim(B)) )
-  See section 6.3 of https://arxiv.org/pdf/1503.05671.pdf for details.
-
-  Args:
-    left_factor: The left Kronecker factor Tensor.
-    right_factor: The right Kronecker factor Tensor.
-
-  Returns:
-    The computed scalar constant pi for these Kronecker Factors (as a Tensor).
-  """
-  # Instead of dividing by the dim of the norm, we multiply by the dim of the
-  # other norm. This works out the same in the ratio.
-  left_norm = math_ops.trace(left_factor) * right_factor.get_shape().as_list()[
-      0]
-  right_norm = math_ops.trace(right_factor) * left_factor.get_shape().as_list()[
-      0]
-  return math_ops.sqrt(left_norm / right_norm)
-
-
 def posdef_inv(tensor, damping):
   """Computes the inverse of tensor + damping * identity."""
   identity = linalg_ops.eye(tensor.shape.as_list()[0], dtype=tensor.dtype)
   damping = math_ops.cast(damping, dtype=tensor.dtype)
-  return posdef_inv_funcs[POSDEF_INV_METHOD](tensor, identity, damping)
+  return posdef_inv_functions[POSDEF_INV_METHOD](tensor, identity, damping)
 
 
 def posdef_inv_matrix_inverse(tensor, identity, damping):
@@ -209,9 +180,44 @@ def posdef_inv_cholesky(tensor, identity, damping):
   return linalg_ops.cholesky_solve(chol, identity)
 
 
-posdef_inv_funcs = {
+def posdef_inv_eig(tensor, identity, damping):
+  """Computes inverse(tensor + damping * identity) with eigendecomposition."""
+  eigenvalues, eigenvectors = linalg_ops.self_adjoint_eig(
+      tensor + damping * identity)
+  return math_ops.matmul(
+      eigenvectors / eigenvalues, eigenvectors, transpose_b=True)
+
+
+posdef_inv_functions = {
     "matrix_inverse": posdef_inv_matrix_inverse,
     "cholesky": posdef_inv_cholesky,
+    "eig": posdef_inv_eig,
+}
+
+
+def posdef_eig(mat):
+  """Computes the eigendecomposition of a positive semidefinite matrix."""
+  return posdef_eig_functions[POSDEF_EIG_METHOD](mat)
+
+
+def posdef_eig_svd(mat):
+  """Computes the singular values and left singular vectors of a matrix."""
+  evals, evecs, _ = linalg_ops.svd(mat)
+
+  return evals, evecs
+
+
+def posdef_eig_self_adjoint(mat):
+  """Computes eigendecomposition using self_adjoint_eig."""
+  evals, evecs = linalg_ops.self_adjoint_eig(mat)
+  evals = math_ops.abs(evals)  # Should be equivalent to svd approach.
+
+  return evals, evecs
+
+
+posdef_eig_functions = {
+    "self_adjoint": posdef_eig_self_adjoint,
+    "svd": posdef_eig_svd,
 }
 
 
@@ -268,8 +274,8 @@ def fwd_gradients(ys, xs, grad_xs=None, stop_gradients=None):
   # generated by the first gradients_impl.gradients call.
 
   us = [array_ops.zeros_like(y) + float("nan") for y in ys]
-  dydxs = gradients_impl.gradients(ys, xs, grad_ys=us,
-                                   stop_gradients=stop_gradients)
+  dydxs = gradients_impl.gradients(
+      ys, xs, grad_ys=us, stop_gradients=stop_gradients)
 
   # Deal with strange types that gradients_impl.gradients returns but can't
   # deal with.
@@ -285,3 +291,6 @@ def fwd_gradients(ys, xs, grad_xs=None, stop_gradients=None):
   dysdx = gradients_impl.gradients(dydxs, us, grad_ys=grad_xs)
 
   return dysdx
+
+# TODO(b/69623235): Add a function for finding tensors that share gradients
+# to eliminate redundant fisher factor computations.
diff --git a/tensorflow/contrib/kfac/python/ops/utils_lib.py b/tensorflow/contrib/kfac/python/ops/utils_lib.py
index ddbb4485ce6967082f1844c6d798c078f1cc303b..8903c90fbce6a890aa419d89b3b79d75f69509fc 100644
--- a/tensorflow/contrib/kfac/python/ops/utils_lib.py
+++ b/tensorflow/contrib/kfac/python/ops/utils_lib.py
@@ -25,13 +25,11 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
     "SequenceDict",
-    "setdefault",
     "tensors_to_column",
     "column_to_tensors",
     "kronecker_product",
     "layer_params_to_mat2d",
     "mat2d_to_layer_params",
-    "compute_pi",
     "posdef_inv",
     "posdef_inv_matrix_inverse",
     "posdef_inv_cholesky",
diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD
index 2f1f283811b6cb9e8bfb52ab2052afac1de700cb..852d06e1e3cc8f8deecd15b7436cd4e4a393ad66 100644
--- a/tensorflow/contrib/layers/BUILD
+++ b/tensorflow/contrib/layers/BUILD
@@ -61,6 +61,7 @@ tf_custom_op_py_library(
         "python/layers/normalization.py",
         "python/layers/optimizers.py",
         "python/layers/regularizers.py",
+        "python/layers/rev_block_lib.py",
         "python/layers/summaries.py",
         "python/layers/target_column.py",
         "python/layers/utils.py",
@@ -376,6 +377,20 @@ py_test(
     ],
 )
 
+py_test(
+    name = "rev_block_lib_test",
+    size = "small",
+    srcs = ["python/layers/rev_block_lib_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":layers_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/layers/__init__.py b/tensorflow/contrib/layers/__init__.py
index d309ba958ded86afdc1e4bba2ff471a5181cda4e..6c624929f20503054e0258aad8a843f4a201be64 100644
--- a/tensorflow/contrib/layers/__init__.py
+++ b/tensorflow/contrib/layers/__init__.py
@@ -42,6 +42,9 @@ See the @{$python/contrib.layers} guide.
 @@relu
 @@relu6
 @@repeat
+@@recompute_grad
+@@RevBlock
+@@rev_block
 @@safe_embedding_lookup_sparse
 @@scale_gradient
 @@separable_conv2d
diff --git a/tensorflow/contrib/layers/python/layers/__init__.py b/tensorflow/contrib/layers/python/layers/__init__.py
index 03337f9a5d11784316124442125bb498c4ce9603..f1ae2de68be33880a6fc09957f4d857973902b26 100644
--- a/tensorflow/contrib/layers/python/layers/__init__.py
+++ b/tensorflow/contrib/layers/python/layers/__init__.py
@@ -28,6 +28,7 @@ from tensorflow.contrib.layers.python.layers.layers import *
 from tensorflow.contrib.layers.python.layers.normalization import *
 from tensorflow.contrib.layers.python.layers.optimizers import *
 from tensorflow.contrib.layers.python.layers.regularizers import *
+from tensorflow.contrib.layers.python.layers.rev_block_lib import *
 from tensorflow.contrib.layers.python.layers.summaries import *
 from tensorflow.contrib.layers.python.layers.target_column import *
 from tensorflow.contrib.layers.python.ops.bucketization_op import *
diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py
index 226d933d85d91600e36ffb84212703e10455bfbb..8d2931b4867938024a494459c77976e1e714de5a 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column.py
@@ -156,6 +156,10 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import deprecation
 
 
+# Imports the core `InputLayer` symbol in contrib during development.
+InputLayer = fc_core.InputLayer  # pylint: disable=invalid-name
+
+
 class _LinearEmbeddingLookupArguments(
     collections.namedtuple("_LinearEmbeddingLookupArguments",
                            ["input_tensor",
@@ -521,7 +525,7 @@ def sparse_column_with_integerized_feature(column_name,
 
   Args:
     column_name: A string defining sparse column name.
-    bucket_size: An int that is > 1. The number of buckets. It should be bigger
+    bucket_size: An int that is >= 1. The number of buckets. It should be bigger
       than maximum feature. In other words features in this column should be an
       int64 in range [0, bucket_size)
     combiner: A string specifying how to reduce if the sparse column is
@@ -539,7 +543,7 @@ def sparse_column_with_integerized_feature(column_name,
     An integerized _SparseColumn definition.
 
   Raises:
-    ValueError: bucket_size is not greater than 1.
+    ValueError: bucket_size is less than 1.
     ValueError: dtype is not integer.
   """
   return _SparseColumnIntegerized(
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops.py b/tensorflow/contrib/layers/python/layers/feature_column_ops.py
index fa0047f05d893f6543ddb1680824a32469e13293..78affea44cbfb92523063968dbc1be98841854db 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_ops.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_ops.py
@@ -97,10 +97,13 @@ def _input_from_feature_columns(columns_to_tensors,
                                 trainable,
                                 scope,
                                 output_rank,
-                                default_name):
+                                default_name,
+                                cols_to_outs=None):
   """Implementation of `input_from(_sequence)_feature_columns`."""
   columns_to_tensors = columns_to_tensors.copy()
   check_feature_columns(feature_columns)
+  if cols_to_outs is not None and not isinstance(cols_to_outs, dict):
+    raise ValueError('cols_to_outs must be a dict unless None')
   with variable_scope.variable_scope(scope,
                                      default_name=default_name,
                                      values=columns_to_tensors.values()):
@@ -144,6 +147,8 @@ def _input_from_feature_columns(columns_to_tensors,
           except ValueError as e:
             raise ValueError('Error creating input layer for column: {}.\n'
                              '{}, {}'.format(column.name, e, ee))
+        if cols_to_outs is not None:
+          cols_to_outs[column] = output_tensors[-1]
     return array_ops.concat(output_tensors, output_rank - 1)
 
 
@@ -151,7 +156,8 @@ def input_from_feature_columns(columns_to_tensors,
                                feature_columns,
                                weight_collections=None,
                                trainable=True,
-                               scope=None):
+                               scope=None,
+                               cols_to_outs=None):
   """A tf.contrib.layers style input layer builder based on FeatureColumns.
 
   Generally a single example in training data is described with feature columns.
@@ -196,6 +202,8 @@ def input_from_feature_columns(columns_to_tensors,
     trainable: If `True` also add variables to the graph collection
       `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
     scope: Optional scope for variable_scope.
+    cols_to_outs: Optional dict from feature column to output tensor,
+      which is concatenated into the returned tensor.
 
   Returns:
     A Tensor which can be consumed by hidden layers in the neural network.
@@ -209,7 +217,8 @@ def input_from_feature_columns(columns_to_tensors,
                                      trainable,
                                      scope,
                                      output_rank=2,
-                                     default_name='input_from_feature_columns')
+                                     default_name='input_from_feature_columns',
+                                     cols_to_outs=cols_to_outs)
 
 
 @experimental
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
index fbfa0e32de55edab3c90189ddfe05ab826ac9167..e6bbd86ab722c4e853a59f816bed8a8ac1fe9ede 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
@@ -607,6 +607,31 @@ class CreateInputLayersForDNNsTest(test.TestCase):
       # Verify cross compatibility: Core builder output should equal to contrib.
       self.assertAllEqual(output.eval().shape, output_core.eval().shape)
 
+  def testAllDNNColumnsWithColumnwiseOutputs(self):
+    sparse_column = feature_column.sparse_column_with_keys(
+        "ids", ["a", "b", "c", "unseen"])
+    real_valued_column = feature_column.real_valued_column("income", 2)
+    one_hot_column = feature_column.one_hot_column(sparse_column)
+    embedding_column = feature_column.embedding_column(sparse_column, 10)
+    features = {
+        "ids":
+            sparse_tensor.SparseTensor(
+                values=["c", "b", "a"],
+                indices=[[0, 0], [1, 0], [2, 0]],
+                dense_shape=[3, 1]),
+        "income":
+            constant_op.constant([[20.3, 10], [110.3, 0.4], [-3.0, 30.4]]),
+    }
+    columns = [one_hot_column, embedding_column, real_valued_column]
+    cols_to_outs = {}
+    feature_column_ops.input_from_feature_columns(
+        features, columns, cols_to_outs=cols_to_outs)
+    with self.test_session():
+      variables_lib.global_variables_initializer().run()
+      lookup_ops.tables_initializer().run()
+      for column in columns:
+        self.assertTrue(column in cols_to_outs)
+
   def testRealValuedColumn(self):
     real_valued = feature_column.real_valued_column("price")
     features = {"price": constant_op.constant([[20.], [110], [-3]])}
diff --git a/tensorflow/contrib/layers/python/layers/initializers.py b/tensorflow/contrib/layers/python/layers/initializers.py
index b12a882d9ae88f7cf4f920cfa5872e5de1c67290..51610f21b24f1d40f26630cc1e69ca723d130639 100644
--- a/tensorflow/contrib/layers/python/layers/initializers.py
+++ b/tensorflow/contrib/layers/python/layers/initializers.py
@@ -79,7 +79,8 @@ def variance_scaling_initializer(factor=2.0, mode='FAN_IN', uniform=False,
   ```
 
   * To get [Delving Deep into Rectifiers](
-     http://arxiv.org/pdf/1502.01852v1.pdf), use (Default):<br/>
+     http://arxiv.org/pdf/1502.01852v1.pdf) (also know as the "MSRA 
+     initialization"), use (Default):<br/>
     `factor=2.0 mode='FAN_IN' uniform=False`
   * To get [Convolutional Architecture for Fast Feature Embedding](
      http://arxiv.org/abs/1408.5093), use:<br/>
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 30630852181e8f4fdf6f8dd83fb852759806b36b..0d25a09852544a7eb1ed5eb9c2f3402d9064d91a 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -309,7 +309,6 @@ def _fused_batch_norm(inputs,
         new_shape = [-1, channels, 1, 1]
       inputs = array_ops.reshape(inputs, new_shape)
     inputs_shape = inputs.get_shape()
-    dtype = inputs.dtype.base_dtype
     if data_format == DATA_FORMAT_NHWC:
       params_shape = inputs_shape[-1:]
     else:
@@ -2562,7 +2561,10 @@ def separable_convolution2d(
           regularizer=weights_regularizer,
           trainable=trainable,
           collections=weights_collections)
-      strides = [1, 1, stride_h, stride_w] if data_format.startswith('NC') else [1, stride_h, stride_w, 1]
+      strides = [1, 1, stride_h,
+                 stride_w] if data_format.startswith('NC') else [
+                     1, stride_h, stride_w, 1
+                 ]
 
       outputs = nn.depthwise_conv2d(inputs, depthwise_weights, strides, padding,
                                     rate=utils.two_element_tuple(rate),
@@ -2652,51 +2654,52 @@ def spatial_softmax(features,
     ValueError: If unexpected data_format specified.
     ValueError: If num_channels dimension is unspecified.
   """
-  shape = array_ops.shape(features)
-  static_shape = features.shape
-  if data_format == DATA_FORMAT_NHWC:
-    height, width, num_channels = shape[1], shape[2], static_shape[3]
-  elif data_format == DATA_FORMAT_NCHW:
-    num_channels, height, width = static_shape[1], shape[2], shape[3]
-  else:
-    raise ValueError('data_format has to be either NCHW or NHWC.')
-  if num_channels.value is None:
-    raise ValueError('The num_channels dimension of the inputs to '
-                     '`spatial_softmax` should be defined. Found `None`.')
-
-  with ops.name_scope(name, 'spatial_softmax', [features]) as name:
-    # Create tensors for x and y coordinate values, scaled to range [-1, 1].
-    pos_x, pos_y = array_ops.meshgrid(math_ops.lin_space(-1., 1., num=height),
-                                      math_ops.lin_space(-1., 1., num=width),
-                                      indexing='ij')
-    pos_x = array_ops.reshape(pos_x, [height * width])
-    pos_y = array_ops.reshape(pos_y, [height * width])
-    if temperature is None:
-      temperature_collections = utils.get_variable_collections(
-          variables_collections, 'temperature')
-      temperature = variables.model_variable(
-          'temperature',
-          shape=(),
-          dtype=dtypes.float32,
-          initializer=init_ops.ones_initializer(),
-          collections=temperature_collections,
-          trainable=trainable)
-    if data_format == 'NCHW':
-      features = array_ops.reshape(features, [-1, height * width])
+  with variable_scope.variable_scope(name, 'spatial_softmax'):
+    shape = array_ops.shape(features)
+    static_shape = features.shape
+    if data_format == DATA_FORMAT_NHWC:
+      height, width, num_channels = shape[1], shape[2], static_shape[3]
+    elif data_format == DATA_FORMAT_NCHW:
+      num_channels, height, width = static_shape[1], shape[2], shape[3]
     else:
-      features = array_ops.reshape(
-          array_ops.transpose(features, [0, 3, 1, 2]), [-1, height * width])
-
-    softmax_attention = nn.softmax(features/temperature)
-    expected_x = math_ops.reduce_sum(
-        pos_x * softmax_attention, [1], keep_dims=True)
-    expected_y = math_ops.reduce_sum(
-        pos_y * softmax_attention, [1], keep_dims=True)
-    expected_xy = array_ops.concat([expected_x, expected_y], 1)
-    feature_keypoints = array_ops.reshape(
-        expected_xy, [-1, num_channels.value * 2])
-    feature_keypoints.set_shape([None, num_channels.value * 2])
-    return feature_keypoints
+      raise ValueError('data_format has to be either NCHW or NHWC.')
+    if num_channels.value is None:
+      raise ValueError('The num_channels dimension of the inputs to '
+                       '`spatial_softmax` should be defined. Found `None`.')
+
+    with ops.name_scope('spatial_softmax_op', 'spatial_softmax_op', [features]):
+      # Create tensors for x and y coordinate values, scaled to range [-1, 1].
+      pos_x, pos_y = array_ops.meshgrid(math_ops.lin_space(-1., 1., num=height),
+                                        math_ops.lin_space(-1., 1., num=width),
+                                        indexing='ij')
+      pos_x = array_ops.reshape(pos_x, [height * width])
+      pos_y = array_ops.reshape(pos_y, [height * width])
+      if temperature is None:
+        temperature_collections = utils.get_variable_collections(
+            variables_collections, 'temperature')
+        temperature = variables.model_variable(
+            'temperature',
+            shape=(),
+            dtype=dtypes.float32,
+            initializer=init_ops.ones_initializer(),
+            collections=temperature_collections,
+            trainable=trainable)
+      if data_format == 'NCHW':
+        features = array_ops.reshape(features, [-1, height * width])
+      else:
+        features = array_ops.reshape(
+            array_ops.transpose(features, [0, 3, 1, 2]), [-1, height * width])
+
+      softmax_attention = nn.softmax(features/temperature)
+      expected_x = math_ops.reduce_sum(
+          pos_x * softmax_attention, [1], keep_dims=True)
+      expected_y = math_ops.reduce_sum(
+          pos_y * softmax_attention, [1], keep_dims=True)
+      expected_xy = array_ops.concat([expected_x, expected_y], 1)
+      feature_keypoints = array_ops.reshape(
+          expected_xy, [-1, num_channels.value * 2])
+      feature_keypoints.set_shape([None, num_channels.value * 2])
+  return feature_keypoints
 
 
 def stack(inputs, layer, stack_args, **kwargs):
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index 9019d3a60991fa0274de10c95986a61c21223bd7..ae64b75d939ce0ffab300b01d3cfcb67a9d0da1c 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -1779,7 +1779,8 @@ class BatchNormTest(test.TestCase):
       dtype = dtypes.float32
     height, width = 3, 3
     with self.test_session():
-      images = np.random.uniform(size=(5, height, width, 3)).astype(dtype.as_numpy_dtype)
+      images = np.random.uniform(size=(5, height, width, 3)).astype(
+          dtype.as_numpy_dtype)
       output = _layers.batch_norm(images, fused=fused)
       expected_name = ('BatchNorm/FusedBatchNorm' if fused else
                        'BatchNorm/batchnorm')
@@ -2665,18 +2666,18 @@ class BatchNormTest(test.TestCase):
     # Test case for 11673
     with self.test_session() as sess:
       a_32 = array_ops.placeholder(dtypes.float32, shape=(10, 10, 10, 10))
-      b_32 = _layers.batch_norm(a_32, center=False, data_format='NCHW',
-                                zero_debias_moving_mean=True)
+      _layers.batch_norm(
+          a_32, center=False, data_format='NCHW', zero_debias_moving_mean=True)
       a_16 = array_ops.placeholder(dtypes.float16, shape=(10, 10, 10, 10))
-      b_16 = _layers.batch_norm(a_16, center=False, data_format='NCHW',
-                                zero_debias_moving_mean=True)
+      _layers.batch_norm(
+          a_16, center=False, data_format='NCHW', zero_debias_moving_mean=True)
       sess.run(variables_lib.global_variables_initializer())
 
   def testVariablesAreFloat32(self):
     height, width = 3, 3
     with self.test_session():
-      images = random_ops.random_uniform((5, height, width, 3),
-                                         seed=1, dtype=dtypes.float16)
+      images = random_ops.random_uniform(
+          (5, height, width, 3), seed=1, dtype=dtypes.float16)
       _layers.batch_norm(images, scale=True)
       beta = variables.get_variables_by_name('beta')[0]
       gamma = variables.get_variables_by_name('gamma')[0]
@@ -2691,17 +2692,13 @@ class BatchNormTest(test.TestCase):
     channels = shape[1]
     images = np.arange(np.product(shape), dtype=dtype).reshape(shape)
     beta = init_ops.constant_initializer(
-        np.arange(
-            2, channels + 2, dtype=np.float32))
+        np.arange(2, channels + 2, dtype=np.float32))
     gamma = init_ops.constant_initializer(
-        np.arange(
-            10, channels + 10, dtype=np.float32) * 2.0)
+        np.arange(10, channels + 10, dtype=np.float32) * 2.0)
     mean = init_ops.constant_initializer(
-        np.arange(
-            3, channels + 3, dtype=np.float32) * 5.0)
+        np.arange(3, channels + 3, dtype=np.float32) * 5.0)
     variance = init_ops.constant_initializer(
-        np.arange(
-            1, channels + 1, dtype=np.float32) * 4.0)
+        np.arange(1, channels + 1, dtype=np.float32) * 4.0)
     output = _layers.batch_norm(
         images,
         fused=True,
@@ -2726,7 +2723,6 @@ class BatchNormTest(test.TestCase):
       res_16 = self._runFusedBatchNorm(shape, np.float16)
       self.assertAllClose(res_32, res_16, rtol=1e-3)
 
-
   def testAdjustmentCreated(self):
     # Tests that the adjustment is appropriately passed to and used by the core
     # BN layer.
@@ -3336,11 +3332,18 @@ class SeparableConv2dTest(test.TestCase):
         batch, height, width = 4, 10, 12
         kernel_dim, stride = 3, 2
         images = random_ops.random_uniform((batch, 3, height, width), seed=1)
-        output = layers_lib.separable_conv2d(images, num_outputs=num_filters, kernel_size=[kernel_dim, kernel_dim],
-                                             depth_multiplier=2, stride=stride, padding='VALID', data_format='NCHW')
-        self.assertListEqual(
-            output.get_shape().as_list(), [batch, correct_output_filters,
-                                           (height - kernel_dim + 1) // stride, (width - kernel_dim + 1) // stride])
+        output = layers_lib.separable_conv2d(
+            images,
+            num_outputs=num_filters,
+            kernel_size=[kernel_dim, kernel_dim],
+            depth_multiplier=2,
+            stride=stride,
+            padding='VALID',
+            data_format='NCHW')
+        self.assertListEqual(output.get_shape().as_list(), [
+            batch, correct_output_filters, (height - kernel_dim + 1) // stride,
+            (width - kernel_dim + 1) // stride
+        ])
 
 
 class ScaleGradientTests(test.TestCase):
diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib.py b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
new file mode 100644
index 0000000000000000000000000000000000000000..123275e1fde047cd3772528641b2e3b09742fbdc
--- /dev/null
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
@@ -0,0 +1,583 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Reversible Residual Block.
+
+From
+[The Reversible Residual Network: Backpropagation Without Storing
+Activations](https://arxiv.org/abs/1707.04585).
+
+Also contains the @recompute_grad decorator, which recomputes the forward
+function on the backwards pass.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import re
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.contrib.framework.python import ops as contrib_framework_ops
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops as framework_ops
+from tensorflow.python.layers import base
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import nest
+
+__all__ = ["rev_block", "RevBlock", "recompute_grad"]
+
+LAYER_RE = re.compile(".*revlayer_([0-9]*)/([fg])/.*")
+
+
+def _acc_grads(*lists_of_grads):
+  """Accumulates lists of gradients."""
+  acc_grads = []
+  for grads in zip(*lists_of_grads):
+    grads = [g for g in grads if g is not None]
+    if grads:
+      acc_grads.append(math_ops.add_n(grads))
+    else:
+      acc_grads.append(None)
+  return acc_grads
+
+
+def _rev_layer_forward(xs, f, g, f_side_input, g_side_input,
+                       gate_outputs=False):
+  """Forward for 1 reversible layer."""
+  x1, x2 = xs
+  y1 = x1 + (f(x2, f_side_input) if f_side_input else f(x2))
+  y2 = x2 + (g(y1, g_side_input) if g_side_input else g(y1))
+  if gate_outputs:
+    return control_flow_ops.tuple([y1, y2])
+  else:
+    return (y1, y2)
+
+
+def _rev_layer_backward(ys, grad_ys, f, g, f_vars, f_side_input, g_vars,
+                        g_side_input):
+  """Backprop for 1 layer."""
+  y1, y2 = ys
+  grad_y1, grad_y2 = grad_ys
+
+  # Reconstruct intermediates and inputs (x1, x2)
+  # stop_gradients required on fn inputs to prevent infinite recursion into this
+  # grad function on the calls to gradients.
+  y1_stop = array_ops.stop_gradient(y1)
+  g_side_input = [array_ops.stop_gradient(t) for t in g_side_input]
+  gy1 = g(y1_stop, g_side_input) if g_side_input else g(y1_stop)
+
+  x2 = y2 - gy1
+  x2_stop = array_ops.stop_gradient(x2)
+  f_side_input = [array_ops.stop_gradient(t) for t in f_side_input]
+  fx2 = f(x2_stop, f_side_input) if f_side_input else f(x2_stop)
+
+  x1 = y1 - fx2
+
+  # Compute gradients wrt to inputs
+  # dL/dy2 * dG(y1)/y1
+  grad_gy1_y2 = gradients_impl.gradients(gy1, y1_stop, grad_y2)[0]
+  grad_x1 = grad_y1 + grad_gy1_y2
+  grad_x2 = (
+      gradients_impl.gradients(fx2, x2_stop, grad_y1)[0] + grad_y2 +
+      gradients_impl.gradients(fx2, x2_stop, grad_gy1_y2)[0])
+
+  # Compute gradients wrt to vars and side inputs in f and g
+  grads1 = gradients_impl.gradients(gy1, g_vars + g_side_input, grad_y2)
+  grad_g_vars, grad_g_side = grads1[:len(g_vars)], grads1[len(g_vars):]
+  grads2 = gradients_impl.gradients(fx2, f_vars + f_side_input, grad_y1)
+  grad_f_y1, grad_f_side1 = grads2[:len(f_vars)], grads2[len(f_vars):]
+  grads3 = gradients_impl.gradients(fx2, f_vars + f_side_input, grad_gy1_y2)
+  grad_f_y2, grad_f_side2 = grads3[:len(f_vars)], grads3[len(f_vars):]
+  grad_f_vars = _acc_grads(grad_f_y1, grad_f_y2)
+
+  grad_f_side = _acc_grads(grad_f_side1, grad_f_side2)
+
+  # Put returns in a tuple to ensure a constant memory budget (i.e. don't want
+  # the subsequent layer to start computing and consuming memory based on a
+  # subset of these values).
+  outputs = ((x1, x2), (grad_x1, grad_x2), (grad_f_vars, grad_f_side),
+             (grad_g_vars, grad_g_side))
+  tupled = control_flow_ops.tuple(nest.flatten(outputs))
+  return nest.pack_sequence_as(outputs, tupled)
+
+
+def _rev_block_forward(x1,
+                       x2,
+                       f,
+                       g,
+                       num_layers=1,
+                       f_side_input=None,
+                       g_side_input=None,
+                       gate_outputs=False):
+  """Forward for a series of reversible layers."""
+  out = (x1, x2)
+  for i in xrange(num_layers):
+    out = _rev_layer_forward(
+        out, f[i], g[i], f_side_input, g_side_input, gate_outputs=gate_outputs)
+
+  y1, y2 = out
+  return y1, y2
+
+
+def _scope_wrap(fn, scope):
+
+  @functools.wraps(fn)
+  def wrap(*args, **kwargs):
+    with variable_scope.variable_scope(scope):
+      return fn(*args, **kwargs)
+
+  return wrap
+
+
+class RevBlock(base.Layer):
+  """Block of reversible layers. See rev_block."""
+
+  def __init__(self,
+               f,
+               g,
+               num_layers=1,
+               f_side_input=None,
+               g_side_input=None,
+               use_efficient_backprop=True,
+               name="revblock",
+               **kwargs):
+    super(RevBlock, self).__init__(name=name, **kwargs)
+
+    if isinstance(f, list):
+      assert len(f) == num_layers
+    else:
+      f = [f] * num_layers
+
+    if isinstance(g, list):
+      assert len(g) == num_layers
+    else:
+      g = [g] * num_layers
+
+    f = [_scope_wrap(fn, "revlayer_%d/f" % i) for i, fn in enumerate(f)]
+    g = [_scope_wrap(fn, "revlayer_%d/g" % i) for i, fn in enumerate(g)]
+
+    self.f = f
+    self.g = g
+
+    self.num_layers = num_layers
+    self.f_side_input = f_side_input or []
+    self.g_side_input = g_side_input or []
+
+    self._use_efficient_backprop = use_efficient_backprop
+
+  def call(self, inputs, forward=True):
+    vs = variable_scope.get_variable_scope()
+    vars_before = vs.global_variables()
+
+    if forward:
+      x1, x2 = inputs
+      out = self._forward(x1, x2)
+    else:
+      y1, y2 = inputs
+      out = self._backward(y1, y2)
+
+    # Add any created variables to the Layer's variable stores
+    new_vars = vs.global_variables()[len(vars_before):]
+    train_vars = vs.trainable_variables()
+    for new_var in new_vars:
+      if new_var in train_vars:
+        self._trainable_weights.append(new_var)
+      else:
+        self._non_trainable_weights.append(new_var)
+
+    return out
+
+  def forward(self, x1, x2):
+    return self.apply([x1, x2])
+
+  def backward(self, y1, y2):
+    return self.apply([y1, y2], forward=False)
+
+  def build(self, _):
+    logging.warn("RevBlock constructs its variables on first call, not on "
+                 "build.")
+    self.built = True
+
+  def _efficient_grad_fn(self, inputs, variables, ys, grad_ys):
+    """Custom gradient fn for a block of reversible residual layers."""
+    side_inputs = inputs[2:]
+    f_side_idxs = [None] * len(self.f_side_input)
+    g_side_idxs = [None] * len(self.g_side_input)
+    assert len(side_inputs) == len(self.f_side_input) + len(self.g_side_input)
+
+    for i, t in enumerate(side_inputs):
+      if t in self.f_side_input:
+        f_side_idxs[self.f_side_input.index(t)] = i
+      elif t in self.g_side_input:
+        g_side_idxs[self.g_side_input.index(t)] = i
+      else:
+        assert False
+
+    f_vars = [[] for _ in range(self.num_layers)]
+    g_vars = [[] for _ in range(self.num_layers)]
+    f_vars_idxs = [[] for _ in range(self.num_layers)]
+    g_vars_idxs = [[] for _ in range(self.num_layers)]
+
+    for i, t in enumerate(variables):
+      ref = _underlying_variable_ref(t)
+
+      # Use the name to identify the layer number and function (f or g)
+      regex = LAYER_RE.match(ref.name)
+      layer_no = int(regex.group(1))
+      fn_name = regex.group(2)
+      if fn_name == "f":
+        f_vars[layer_no].append(ref)
+        f_vars_idxs[layer_no].append(i)
+      else:
+        assert fn_name == "g"
+        g_vars[layer_no].append(ref)
+        g_vars_idxs[layer_no].append(i)
+
+    f_var_grads = []
+    g_var_grads = []
+    f_side_grads = []
+    g_side_grads = []
+
+    # Reverse variable containers to go backward
+    f_vars.reverse()
+    g_vars.reverse()
+    f = list(self.f)
+    g = list(self.g)
+    f.reverse()
+    g.reverse()
+
+    with variable_scope.variable_scope(self.scope_name, reuse=True):
+      for i in xrange(self.num_layers):
+        ys, grad_ys, f_ret, g_ret = _rev_layer_backward(
+            ys, grad_ys, f[i], g[i], f_vars[i], self.f_side_input, g_vars[i],
+            self.g_side_input)
+
+        grad_f_vars, grad_f_side = f_ret
+        grad_g_vars, grad_g_side = g_ret
+        f_var_grads.append(grad_f_vars)
+        g_var_grads.append(grad_g_vars)
+        f_side_grads.append(grad_f_side)
+        g_side_grads.append(grad_g_side)
+
+    # Accumulate layer gradients for f_side_input and g_side_input
+    acc_f_side_grads = _acc_grads(*f_side_grads)
+    acc_g_side_grads = _acc_grads(*g_side_grads)
+
+    # Use the stored idxs to put gradients in the passed-in order.
+    side_input_grads = [None] * len(side_inputs)
+    variable_grads = [None] * len(variables)
+
+    # Variable gradients were collected in reverse layer order. Reverse to match
+    # idxs.
+    f_var_grads.reverse()
+    g_var_grads.reverse()
+    for idxs, grads in list(zip(f_vars_idxs, f_var_grads)) + list(
+        zip(g_vars_idxs, g_var_grads)):
+      for i, grad in zip(idxs, grads):
+        variable_grads[i] = grad
+
+    for i, grad in zip(f_side_idxs, acc_f_side_grads):
+      side_input_grads[i] = grad
+    for i, grad in zip(g_side_idxs, acc_g_side_grads):
+      side_input_grads[i] = grad
+
+    grad_x1, grad_x2 = grad_ys
+    return [grad_x1, grad_x2] + side_input_grads, variable_grads
+
+  def _forward(self, x1, x2):
+    """Run forward through the reversible layers."""
+
+    side_inputs = [self.f_side_input, self.g_side_input]
+    flat_side_inputs = nest.flatten(side_inputs)
+
+    custom_grad_fn = (
+        self._efficient_grad_fn if self._use_efficient_backprop else None)
+
+    @_fn_with_custom_grad(custom_grad_fn)
+    def _forward_wrap(x1_, x2_, *flat_side_inputs):
+      f_side, g_side = nest.pack_sequence_as(side_inputs, flat_side_inputs)
+      return _rev_block_forward(
+          x1_,
+          x2_,
+          self.f,
+          self.g,
+          num_layers=self.num_layers,
+          f_side_input=f_side,
+          g_side_input=g_side,
+          gate_outputs=self._use_efficient_backprop)
+
+    return _forward_wrap(x1, x2, *flat_side_inputs)
+
+  def _backward(self, y1, y2):
+    """Run backward through the reversible layers."""
+
+    f = list(self.f)
+    g = list(self.g)
+    f.reverse()
+    g.reverse()
+
+    for i in xrange(self.num_layers):
+      gy1 = g[i](y1, self.g_side_input) if self.g_side_input else g[i](y1)
+      x2 = y2 - gy1
+      fx2 = f[i](x2, self.f_side_input) if self.f_side_input else f[i](x2)
+      x1 = y1 - fx2
+
+      y1, y2 = x1, x2
+
+    return x1, x2
+
+
+def rev_block(x1,
+              x2,
+              f,
+              g,
+              num_layers=1,
+              f_side_input=None,
+              g_side_input=None,
+              is_training=True):
+  """A block of reversible residual layers.
+
+  A reversible residual layer is defined as:
+
+  ```
+  y1 = x1 + f(x2, f_side_input)
+  y2 = x2 + g(y1, g_side_input)
+  ```
+
+  A reversible residual block, defined here, is a series of reversible residual
+  layers.
+
+  Limitations:
+  * f and g must not close over any Tensors; all side inputs to f and g should
+    be passed in with f_side_input and g_side_input which will be forwarded to
+    f and g.
+  * f and g must not change the dimensionality of their inputs in order for the
+    addition in the equations above to work.
+
+  Args:
+    x1: a float Tensor.
+    x2: a float Tensor.
+    f: a function, (Tensor) -> (Tensor) (or list of such of length num_layers).
+      Should not change the shape of the Tensor. Can make calls to get_variable.
+      See f_side_input if there are side inputs.
+    g: a function, (Tensor) -> (Tensor) (or list of such of length num_layers).
+      Should not change the shape of the Tensor. Can make calls to get_variable.
+      See g_side_input if there are side inputs.
+    num_layers: int, number of reversible residual layers. Each layer will
+      apply f and g according to the equations above, with new variables in each
+      layer.
+    f_side_input: list of Tensors, side input to f. If not None, signature of f
+      should be (Tensor, list<Tensor>) -> (Tensor).
+    g_side_input: list of Tensors, side input to g. If not None, signature of g
+      should be (Tensor, list<Tensor>) -> (Tensor).
+    is_training: bool, whether to actually use the efficient backprop codepath.
+
+  Returns:
+    y1, y2: tuple of float Tensors.
+  """
+  block = RevBlock(
+      f=f,
+      g=g,
+      num_layers=num_layers,
+      f_side_input=f_side_input,
+      g_side_input=g_side_input,
+      use_efficient_backprop=is_training,
+      _reuse=variable_scope.get_variable_scope().reuse)
+  return block.forward(x1, x2)
+
+
+def recompute_grad(fn):
+  """Decorator that recomputes the function on the backwards pass.
+
+  Args:
+    fn: a function that takes Tensors (all as positional arguments) and returns
+      a tuple of Tensors.
+
+  Returns:
+    A wrapped fn that is identical to fn when called, but its activations will
+    be discarded and recomputed on the backwards pass (i.e. on a call to
+    tf.gradients).
+  """
+
+  @functools.wraps(fn)
+  def wrapped(*args):
+    return _recompute_grad(fn, args)
+
+  return wrapped
+
+
+def _recompute_grad(fn, args):
+  """See recompute_grad."""
+
+  cached_vs = []
+  cached_arg_scope = []
+
+  def grad_fn(inputs, variables, outputs, output_grads):
+    """Recompute outputs for gradient computation."""
+    del outputs
+    # Recompute outputs
+    with framework_ops.control_dependencies(output_grads):
+      with contrib_framework_ops.arg_scope(cached_arg_scope[0]):
+        with variable_scope.variable_scope(cached_vs[0], reuse=True):
+          outputs = fn(*inputs)
+
+    if not (isinstance(outputs, list) or isinstance(outputs, tuple)):
+      outputs = [outputs]
+    outputs = list(outputs)
+    grads = gradients_impl.gradients(outputs, inputs + variables, output_grads)
+    grad_inputs = grads[:len(inputs)]
+    grad_vars = grads[len(inputs):]
+    return grad_inputs, grad_vars
+
+  @_fn_with_custom_grad(grad_fn)
+  def fn_with_recompute(*args):
+    cached_vs.append(variable_scope.get_variable_scope())
+    # TODO(rsepassi): Rm conditional in TF 1.4
+    if hasattr(contrib_framework_ops, "current_arg_scope"):
+      cached_arg_scope.append(contrib_framework_ops.current_arg_scope())
+    else:
+      cached_arg_scope.append({})
+    return fn(*args)
+
+  return fn_with_recompute(*args)
+
+
+def _underlying_variable_ref(t):
+  """Find the underlying variable ref.
+
+  Traverses through Identity, ReadVariableOp, and Enter ops.
+  Stops when op type has Variable or VarHandle in name.
+
+  Args:
+    t: a Tensor
+
+  Returns:
+    a Tensor that is a variable ref, or None on error.
+  """
+  while t.op.type in ["Identity", "ReadVariableOp", "Enter"]:
+    t = t.op.inputs[0]
+
+  op_type = t.op.type
+  if "Variable" in op_type or "VarHandle" in op_type:
+    return t
+  else:
+    return None
+
+
+def _fn_with_custom_grad(grad_fn, use_global_vars=False):
+  """Decorator to create a subgraph with a custom gradient function.
+
+  The subgraph created by the decorated function is NOT put in a Defun and so
+  does not suffer from the limitations of the Defun (all subgraph ops on the
+  same device, no summaries).
+
+  Args:
+    grad_fn: function with signature
+      (inputs, variables, outputs, output_grads) -> (grad_inputs, grad_vars),
+      all of which are lists of Tensors.
+    use_global_vars: if True, variables will be the global variables created.
+      If False, will be the trainable variables.
+
+  Returns:
+    Decorator for function such that the gradient is defined by grad_fn.
+  """
+
+  def dec(fn):
+
+    @functools.wraps(fn)
+    def wrapped(*args):
+      return _fn_with_custom_grad_internal(
+          fn, args, grad_fn, use_global_vars=use_global_vars)
+
+    return wrapped
+
+  return dec
+
+
+def _fn_with_custom_grad_internal(fn, inputs, grad_fn, use_global_vars=False):
+  """Create a subgraph with a custom gradient.
+
+  Args:
+    fn: function that takes inputs as arguments and produces 1 or more Tensors.
+    inputs: list<Tensor>, will be passed as fn(*inputs).
+    grad_fn: function with signature
+      (inputs, vars, outputs, output_grads) -> (grad_inputs, grad_vars),
+      all of which are lists of Tensors.
+    use_global_vars: if True, variables will be the global variables created.
+      If False, will be the trainable variables.
+
+  Returns:
+    fn(*inputs)
+  """
+  vs = variable_scope.get_variable_scope()
+  get_vars_fn = (
+      vs.global_variables if use_global_vars else vs.trainable_variables)
+  len_before_vars = len(get_vars_fn())
+  inputs = list(inputs)
+  outputs = fn(*inputs)
+  train_vars = get_vars_fn()[len_before_vars:]
+
+  if grad_fn is None:
+    return outputs
+
+  if not (isinstance(outputs, tuple) or isinstance(outputs, list)):
+    outputs = [outputs]
+  outputs = list(outputs)
+
+  defun_inputs = [inputs, train_vars, outputs]
+
+  def custom_grad_fn(op, *dys):
+    """Custom grad fn applying grad_fn for identity Defun."""
+    fn_inputs, fn_vars, fn_outputs = nest.pack_sequence_as(
+        defun_inputs, list(op.inputs))
+    dys = list(dys)
+    assert len(fn_outputs) == len(outputs)
+    assert len(fn_outputs) == len(dys)
+
+    grad_inputs, grad_vars = grad_fn(fn_inputs, fn_vars, fn_outputs, dys)
+    grad_outputs = [None] * len(fn_outputs)
+    return tuple(grad_inputs + grad_vars + grad_outputs)
+
+  # The Defun takes as input the original inputs, the trainable variables
+  # created in fn, and the outputs. In the forward it passes through the
+  # outputs. In the backwards, it produces gradients for the original inputs
+  # and the trainable variables.
+  in_types = [t.dtype for t in inputs]
+  out_types = [t.dtype for t in outputs]
+  var_types = [t.dtype for t in train_vars]
+
+  # Get a unique name for the Defun
+  with framework_ops.name_scope("identity_custom_grad") as ns:
+    defun_name = ns
+
+  @function.Defun(
+      *(in_types + var_types + out_types),
+      func_name=defun_name,
+      python_grad_func=custom_grad_fn,
+      shape_func=lambda _: [t.get_shape() for t in outputs])
+  def identity(*args):
+    _, _, outs = nest.pack_sequence_as(defun_inputs, args)
+    return tuple([array_ops.identity(t) for t in outs])
+
+  flat_inputs = nest.flatten(defun_inputs)
+  id_out = identity(*flat_inputs)
+  return id_out
diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbcbcd75114a522b95631e4e7e95c1641b0a9987
--- /dev/null
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
@@ -0,0 +1,364 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for RevBlock."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.layers.python.layers import layers
+from tensorflow.contrib.layers.python.layers import rev_block_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import random_seed
+from tensorflow.python.layers import convolutional
+from tensorflow.python.layers import core as core_layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class RevBlockTest(test.TestCase):
+  CHANNELS = 8
+  NUM_LAYERS = 4
+  BATCH_SIZE = 16
+
+  def testForwardBackward(self):
+
+    def f(x):
+      return core_layers.dense(x, self.CHANNELS // 2, use_bias=True)
+
+    def g(x):
+      return core_layers.dense(x, self.CHANNELS // 2, use_bias=True)
+
+    x = random_ops.random_uniform(
+        [self.BATCH_SIZE, self.CHANNELS], dtype=dtypes.float32)
+    x1, x2 = array_ops.split(x, 2, axis=-1)
+
+    block = rev_block_lib.RevBlock(f, g, num_layers=3)
+    y1, y2 = block.forward(x1, x2)
+    x1_inv, x2_inv = block.backward(y1, y2)
+
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      x1, x2, x1_inv, x2_inv = sess.run([x1, x2, x1_inv, x2_inv])
+
+      self.assertAllClose(x1, x1_inv)
+      self.assertAllClose(x2, x2_inv)
+
+  def testBackwardForward(self):
+
+    def f(x):
+      return core_layers.dense(x, self.CHANNELS // 2, use_bias=True)
+
+    def g(x):
+      return core_layers.dense(x, self.CHANNELS // 2, use_bias=True)
+
+    y = random_ops.random_uniform(
+        [self.BATCH_SIZE, self.CHANNELS], dtype=dtypes.float32)
+    y1, y2 = array_ops.split(y, 2, axis=-1)
+
+    block = rev_block_lib.RevBlock(f, g, num_layers=3)
+    x1, x2 = block.backward(y1, y2)
+    y1_inv, y2_inv = block.forward(x1, x2)
+
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      y1, y2, y1_inv, y2_inv = sess.run([y1, y2, y1_inv, y2_inv])
+
+      self.assertAllClose(y1, y1_inv)
+      self.assertAllClose(y2, y2_inv)
+
+  def _testRevBlock(self,
+                    x=None,
+                    f=None,
+                    g=None,
+                    f_side_input=None,
+                    g_side_input=None):
+    random_seed.set_random_seed(1234)
+
+    if f is None:
+
+      def f(x):  # pylint: disable=function-redefined
+        return core_layers.dense(x, self.CHANNELS // 2, use_bias=True)
+
+    if g is None:
+
+      def g(x):  # pylint: disable=function-redefined
+        return core_layers.dense(x, self.CHANNELS // 2, use_bias=True)
+
+    if f_side_input is None:
+      f_side_input = []
+
+    if g_side_input is None:
+      g_side_input = []
+
+    if x is None:
+      x = random_ops.random_uniform(
+          [self.BATCH_SIZE, self.CHANNELS], dtype=dtypes.float32)
+    x1, x2 = array_ops.split(x, 2, axis=-1)
+
+    with variable_scope.variable_scope("rev_test") as vs:
+      y1_rev, y2_rev = rev_block_lib.rev_block(
+          x1,
+          x2,
+          f,
+          g,
+          f_side_input=f_side_input,
+          g_side_input=g_side_input,
+          num_layers=self.NUM_LAYERS)
+      y_rev = array_ops.concat([y1_rev, y2_rev], axis=1)
+      fg_vars = vs.trainable_variables()
+
+    num_vars = len(variables.global_variables())
+    with variable_scope.variable_scope(vs, reuse=True):
+      y1, y2 = rev_block_lib.rev_block(
+          x1,
+          x2,
+          f,
+          g,
+          f_side_input=f_side_input,
+          g_side_input=g_side_input,
+          num_layers=self.NUM_LAYERS,
+          is_training=False)
+      y = array_ops.concat([y1, y2], axis=1)
+    # Ensure no new vars were created - full reuse
+    assert len(variables.global_variables()) == num_vars
+
+    loss_rev = math_ops.reduce_mean(y_rev + 10.)
+    loss = math_ops.reduce_mean(y + 10.)
+
+    wrt = [x] + f_side_input + g_side_input + fg_vars
+    grads_rev = gradients_impl.gradients(loss_rev, wrt)
+    grads = gradients_impl.gradients(loss, wrt)
+
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      y_val, yd_val, gd_val, g_val = sess.run([y, y_rev, grads_rev, grads])
+      self.assertAllClose(y_val, yd_val)
+      for g1, g2 in zip(gd_val, g_val):
+        self.assertAllClose(g1, g2)
+
+  def testRevBlock(self):
+    self._testRevBlock()
+
+  def testSideInput(self):
+    f_side_input = random_ops.random_uniform(
+        [self.BATCH_SIZE, self.CHANNELS // 2])
+
+    def f(x, side_input):
+      return core_layers.dense(
+          x, self.CHANNELS // 2, use_bias=True) + side_input[0]
+
+    self._testRevBlock(f=f, f_side_input=[f_side_input])
+
+  def testMultipleFns(self):
+
+    def f1(x):
+      return core_layers.dense(x, self.CHANNELS // 2)
+
+    def f2(x):
+      return core_layers.dense(x, self.CHANNELS // 2, activation=nn_ops.relu)
+
+    self._testRevBlock(f=[f1, f2, f1, f2])
+
+  # TODO(rsepassi): Recent change to conv seems to have broken this test. Find
+  # out why.
+  def _testConvAndBatchNorm(self):
+
+    x = random_ops.random_uniform(
+        [self.BATCH_SIZE, 10, self.CHANNELS], dtype=dtypes.float32)
+
+    def f(x):
+      x = convolutional.conv1d(x, self.CHANNELS // 2, 3, padding="same")
+      x = layers.batch_norm(x, is_training=True)
+      x = convolutional.conv1d(x, self.CHANNELS // 2, 3, padding="same")
+      x = layers.batch_norm(x, is_training=True)
+      return x
+
+    self._testRevBlock(x=x, f=f)
+
+  def testReuse(self):
+
+    def f(x):
+      return core_layers.dense(x, self.CHANNELS // 2)
+
+    def g(x):
+      return core_layers.dense(x, self.CHANNELS // 2)
+
+    x = random_ops.random_uniform(
+        [self.BATCH_SIZE, self.CHANNELS], dtype=dtypes.float32)
+    x1, x2 = array_ops.split(x, 2, axis=-1)
+
+    with variable_scope.variable_scope("test"):
+      y1, y2 = rev_block_lib.rev_block(x1, x2, f, g, num_layers=self.NUM_LAYERS)
+
+    num_vars_before = len(variables.global_variables())
+
+    with variable_scope.variable_scope("test", reuse=True):
+      y1, y2 = rev_block_lib.rev_block(x1, x2, f, g, num_layers=self.NUM_LAYERS)
+
+    num_vars_after = len(variables.global_variables())
+    self.assertEqual(num_vars_before, num_vars_after)
+
+    loss = math_ops.reduce_mean(y1 + y2)
+    _ = gradients_impl.gradients(loss,
+                                 [x] + variables.trainable_variables())
+
+    with variable_scope.variable_scope("test", reuse=True):
+      y1, y2 = rev_block_lib.rev_block(x1, x2, f, g, num_layers=self.NUM_LAYERS)
+
+    num_vars_after = len(variables.global_variables())
+    self.assertEqual(num_vars_before, num_vars_after)
+
+
+class RecomputeTest(test.TestCase):
+
+  def testRecompute(self):
+
+    def layer(x, name=None):
+      with variable_scope.variable_scope(name, default_name="layer"):
+        x = layers.layer_norm(x)
+        x = convolutional.conv1d(
+            x,
+            10,
+            1,
+            use_bias=False,
+            kernel_initializer=init_ops.constant_initializer(42.42))
+        x = nn_ops.relu(x)
+        return x
+
+    def fn(x):
+      out = x
+      for _ in range(3):
+        out = layer(out)
+      return out
+
+    @rev_block_lib.recompute_grad
+    def fn_recompute(x):
+      return fn(x)
+
+    x = random_ops.random_uniform((3, 1, 3))
+    recompute_vars = None
+    with variable_scope.variable_scope("recompute") as vs:
+      out1 = math_ops.reduce_sum(fn_recompute(x))
+      recompute_vars = vs.trainable_variables()
+    reg_vars = None
+    with variable_scope.variable_scope("regular") as vs:
+      out2 = math_ops.reduce_sum(fn(x))
+      reg_vars = vs.trainable_variables()
+
+    grad1 = gradients_impl.gradients(out1, recompute_vars)
+    grad2 = gradients_impl.gradients(out2, reg_vars)
+
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      outs = sess.run([out1, out2, grad1, grad2])
+      self.assertAllClose(outs[0], outs[1])
+      for g1, g2 in zip(outs[2], outs[3]):
+        self.assertAllClose(g1, g2)
+
+
+class FnWithCustomGradTest(test.TestCase):
+
+  def testCorrectness(self):
+
+    w = random_ops.random_uniform([6, 10])
+
+    def fn(a, b, c):
+      return core_layers.dense(
+          a,
+          10,
+          use_bias=False,
+          kernel_initializer=lambda shape, dtype, partition_info: w
+      ) + math_ops.matmul(b, c)
+
+    def grad_fn(inputs, trainable_variables, outputs, grad_outputs):
+      outputs = outputs[0]
+      grad_outputs = grad_outputs[0]
+      grad_inputs = gradients_impl.gradients(
+          outputs, inputs, grad_ys=grad_outputs)
+      grad_vars = gradients_impl.gradients(
+          outputs, trainable_variables, grad_ys=grad_outputs)
+      return grad_inputs, grad_vars
+
+    custom_fn = rev_block_lib._fn_with_custom_grad(grad_fn)(fn)
+
+    a = random_ops.random_uniform([11, 6])
+    b = random_ops.random_uniform([11, 7])
+    c = random_ops.random_uniform([7, 10])
+
+    out = fn(a, b, c)
+    custom_out = custom_fn(a, b, c)
+    self.assertEqual(out.get_shape().as_list(),
+                     custom_out.get_shape().as_list())
+
+    loss = math_ops.reduce_mean(out)
+    custom_loss = math_ops.reduce_mean(custom_out)
+
+    grads = gradients_impl.gradients(
+        loss, [a, b, c] + [variables.trainable_variables()[0]])
+    custom_grads = gradients_impl.gradients(
+        custom_loss, [a, b, c] + [variables.trainable_variables()[1]])
+
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      out_val, custom_out_val, grads_val, custom_grads_val = sess.run(
+          [out, custom_out, grads, custom_grads])
+      self.assertAllClose(out_val, custom_out_val)
+      for g1, g2 in zip(grads_val, custom_grads_val):
+        self.assertAllClose(g1, g2)
+
+  def testCustomGrad(self):
+
+    def fn(a, b, c):
+      return core_layers.dense(a, 10, use_bias=False) + math_ops.matmul(b, c)
+
+    def grad_fn(inputs, trainable_variables, unused_outputs,
+                unused_grad_outputs):
+      grad_inputs = [
+          array_ops.ones_like(t) * (i + 1.) for i, t in enumerate(inputs)
+      ]
+      grad_vars = [
+          array_ops.ones_like(t) * (i + len(inputs) + 1.)
+          for i, t in enumerate(trainable_variables)
+      ]
+      return grad_inputs, grad_vars
+
+    a = random_ops.random_uniform([11, 6])
+    b = random_ops.random_uniform([11, 7])
+    c = random_ops.random_uniform([7, 10])
+    w = random_ops.random_uniform([6, 10])
+    out = rev_block_lib._fn_with_custom_grad(grad_fn)(fn)(a, b, c)
+    loss = math_ops.reduce_mean(out)
+    grads = gradients_impl.gradients(
+        loss, [a, b, c, variables.trainable_variables()[0]])
+    expected_grads = [
+        array_ops.ones_like(t) * (i + 1.) for i, t in enumerate([a, b, c, w])
+    ]
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      g_val, eg_val = sess.run([grads, expected_grads])
+      for g1, g2 in zip(g_val, eg_val):
+        self.assertAllClose(g1, g2)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index 94920db574e07529c28313a78e0128676fcc7970..5df2c77249b81434125d838f896f0ace2a5ee130 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -10,7 +10,7 @@ package(default_visibility = [
     "//tensorflow:internal",
 ])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "py_test", "tf_py_test")
 
 py_library(
     name = "learn",
@@ -154,12 +154,11 @@ py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "experiment_test",
     size = "medium",
     srcs = ["python/learn/experiment_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":learn",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/core:protos_all_py",
@@ -346,6 +345,7 @@ py_test(
     srcs = ["python/learn/estimators/dnn_linear_combined_test.py"],
     shard_count = 4,
     srcs_version = "PY2AND3",
+    tags = ["no_oss"],  # flaky b/70524820
     deps = [
         ":learn",
         "//tensorflow/contrib/layers:layers_py",
@@ -461,6 +461,7 @@ py_test(
     size = "medium",
     srcs = ["python/learn/estimators/state_saving_rnn_estimator_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["noasan"],
     deps = [
         ":learn",
         "//tensorflow/contrib/layers:layers_py",
@@ -715,12 +716,11 @@ py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "graph_io_test",
     size = "small",
     srcs = ["python/learn/learn_io/graph_io_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":learn",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
@@ -736,6 +736,7 @@ py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
+    grpc_enabled = True,
 )
 
 py_test(
diff --git a/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py b/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py
index 14750961efa30128708430fac038498de0a42118..ef5e620e8f08cffa7c2b945089aa5d150baefefc 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.layers.python.layers import feature_column
 from tensorflow.contrib.learn.python.learn.datasets import base
 from tensorflow.contrib.learn.python.learn.estimators import composable_model
@@ -55,7 +55,7 @@ def _base_model_fn(features, labels, mode, params):
     raise NotImplementedError
 
   def _train_op_fn(loss):
-    global_step = contrib_variables.get_global_step()
+    global_step = training_util.get_global_step()
     assert global_step
     train_step = model.get_train_step(loss)
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn.py b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
index cb15ef23e95d27c737d8ae08065b804bafd39a07..c17b41c0f767e19d9c3635a8f60347a49b297cfb 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
@@ -23,7 +23,7 @@ import six
 from tensorflow.contrib import layers
 from tensorflow.contrib.framework import deprecated
 from tensorflow.contrib.framework import deprecated_arg_values
-from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.layers.python.layers import feature_column
 from tensorflow.contrib.layers.python.layers import optimizers
 from tensorflow.contrib.learn.python.learn import metric_spec
@@ -189,7 +189,7 @@ def _dnn_model_fn(features, labels, mode, params, config=None):
       """Returns the op to optimize the loss."""
       return optimizers.optimize_loss(
           loss=loss,
-          global_step=contrib_variables.get_global_step(),
+          global_step=training_util.get_global_step(),
           learning_rate=_LEARNING_RATE,
           optimizer=_get_optimizer(optimizer),
           gradient_multipliers=(
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
index 788d2d0b1a58fad16712c968593b40de0d3979f0..05ed8b3409e68ae54e5ef89b3a1592a6f285565b 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -30,7 +30,6 @@ import six
 
 from google.protobuf import message
 from tensorflow.contrib import layers
-from tensorflow.contrib import metrics as metrics_lib
 from tensorflow.contrib.framework import deprecated
 from tensorflow.contrib.framework import deprecated_args
 from tensorflow.contrib.framework import list_variables
@@ -60,6 +59,7 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import metrics as metrics_lib
 from tensorflow.python.ops import resources
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
@@ -1230,7 +1230,7 @@ class Estimator(BaseEstimator):
 
     if metric_key.MetricKey.LOSS not in model_fn_ops.eval_metric_ops:
       model_fn_ops.eval_metric_ops[metric_key.MetricKey.LOSS] = (
-          metrics_lib.streaming_mean(model_fn_ops.loss))
+          metrics_lib.mean(model_fn_ops.loss))
     return model_fn_ops
 
   def _get_predict_ops(self, features):
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py
index 248c6c733ffca351c848ba07110ba89928634a23..9d7c1a099aa4be64ca0296fa5b870597dabec7b4 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py
@@ -23,7 +23,7 @@ import tempfile
 
 import numpy as np
 
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.layers.python.layers import optimizers
 from tensorflow.contrib.learn.python.learn import metric_spec
 from tensorflow.contrib.learn.python.learn import models
@@ -114,7 +114,7 @@ def linear_model_params_fn(features, labels, mode, params):
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
       loss,
-      variables.get_global_step(),
+      training_util.get_global_step(),
       optimizer='Adagrad',
       learning_rate=params['learning_rate'])
   return prediction, loss, train_op
@@ -129,7 +129,7 @@ def linear_model_fn(features, labels, mode):
     (_, features), = features.items()
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
   return prediction, loss, train_op
 
 
@@ -139,7 +139,7 @@ def linear_model_fn_with_model_fn_ops(features, labels, mode):
                   model_fn.ModeKeys.INFER)
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
   return model_fn.ModelFnOps(
       mode=mode, predictions=prediction, loss=loss, train_op=train_op)
 
@@ -150,7 +150,7 @@ def logistic_model_no_mode_fn(features, labels):
   labels = array_ops.one_hot(labels, 3, 1, 0)
   prediction, loss = (models.logistic_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
   return {
       'class': math_ops.argmax(prediction, 1),
       'prob': prediction
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
index be2b0cb3ca959323b4de095ca072278f028be301..2a13a84627df35a68a4f04b25ab26ceecad0db0d 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
@@ -32,7 +32,7 @@ from google.protobuf import text_format
 
 from tensorflow.contrib import learn
 from tensorflow.contrib import lookup
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.layers.python.layers import feature_column as feature_column_lib
 from tensorflow.contrib.layers.python.layers import optimizers
 from tensorflow.contrib.learn.python.learn import experiment
@@ -132,7 +132,7 @@ def linear_model_params_fn(features, labels, mode, params):
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
       loss,
-      variables.get_global_step(),
+      training_util.get_global_step(),
       optimizer='Adagrad',
       learning_rate=params['learning_rate'])
   return prediction, loss, train_op
@@ -147,7 +147,7 @@ def linear_model_fn(features, labels, mode):
     (_, features), = features.items()
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
   return prediction, loss, train_op
 
 
@@ -157,7 +157,7 @@ def linear_model_fn_with_model_fn_ops(features, labels, mode):
                   model_fn.ModeKeys.INFER)
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
   return model_fn.ModelFnOps(
       mode=mode, predictions=prediction, loss=loss, train_op=train_op)
 
@@ -168,7 +168,7 @@ def logistic_model_no_mode_fn(features, labels):
   labels = array_ops.one_hot(labels, 3, 1, 0)
   prediction, loss = (models.logistic_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
   return {
       'class': math_ops.argmax(prediction, 1),
       'prob': prediction
@@ -241,7 +241,7 @@ def _build_estimator_for_resource_export_test():
     const = constant_op.constant(-1, dtype=dtypes.int64)
     table = lookup.MutableHashTable(
         dtypes.string, dtypes.int64, const, name='LookupTableModel')
-    update_global_step = variables.get_global_step().assign_add(1)
+    update_global_step = training_util.get_global_step().assign_add(1)
     if mode in (model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL):
       key = constant_op.constant(['key'])
       value = constant_op.constant([42], dtype=dtypes.int64)
@@ -306,7 +306,7 @@ def _model_fn_ops(
         mode=mode,
         predictions=constant_op.constant(0.),
         loss=constant_op.constant(0.),
-        train_op=variables.get_global_step().assign_add(1))
+        train_op=training_util.get_global_step().assign_add(1))
 
 
 def _make_input_fn(features, labels):
@@ -389,7 +389,7 @@ class EstimatorModelFnTest(test.TestCase):
       self.assertEqual(expected_param, params)
       self.assertEqual(model_dir, expected_model_dir)
       return (constant_op.constant(0.), constant_op.constant(0.),
-              variables.get_global_step().assign_add(1))
+              training_util.get_global_step().assign_add(1))
     est = estimator.Estimator(model_fn=_argument_checker,
                               params=expected_param,
                               model_dir=expected_model_dir)
@@ -400,7 +400,7 @@ class EstimatorModelFnTest(test.TestCase):
     def _invalid_model_fn(features, labels):
       # pylint: disable=unused-argument
       w = variables_lib.Variable(42.0, 'weight')
-      update_global_step = variables.get_global_step().assign_add(1)
+      update_global_step = training_util.get_global_step().assign_add(1)
       with ops.control_dependencies([update_global_step]):
         loss = 100.0 - w
       return None, loss, None
@@ -415,7 +415,7 @@ class EstimatorModelFnTest(test.TestCase):
       # pylint: disable=unused-argument
       w = variables_lib.Variable(42.0, 'weight')
       loss = 100.0 - w
-      update_global_step = variables.get_global_step().assign_add(1)
+      update_global_step = training_util.get_global_step().assign_add(1)
       with ops.control_dependencies([update_global_step]):
         train_op = w.assign_add(loss / 100.0)
       predictions = loss
@@ -434,7 +434,7 @@ class EstimatorModelFnTest(test.TestCase):
       # pylint: disable=unused-argument
       w = variables_lib.Variable(42.0, 'weight')
       loss = 100.0 - w
-      update_global_step = variables.get_global_step().assign_add(1)
+      update_global_step = training_util.get_global_step().assign_add(1)
       with ops.control_dependencies([update_global_step]):
         train_op = w.assign_add(loss / 100.0)
       return None, loss, train_op
@@ -464,7 +464,7 @@ class EstimatorModelFnTest(test.TestCase):
           mode=mode,
           predictions=constant_op.constant(0.),
           loss=constant_op.constant(0.),
-          train_op=variables.get_global_step().assign_add(1),
+          train_op=training_util.get_global_step().assign_add(1),
           scaffold=monitored_session.Scaffold(init_fn=_init_fn))
 
     est = estimator.Estimator(model_fn=_model_fn_scaffold)
@@ -483,7 +483,7 @@ class EstimatorModelFnTest(test.TestCase):
           mode=mode,
           predictions=constant_op.constant([[1.]]),
           loss=constant_op.constant(0.),
-          train_op=variables.get_global_step().assign_add(1),
+          train_op=training_util.get_global_step().assign_add(1),
           scaffold=monitored_session.Scaffold(saver=self.mock_saver))
 
     def input_fn():
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimators_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimators_test.py
index 1d89dfb55b10b032cab7dcf434d396404d4eb83b..8131e0fde6fea5501cacc4714f53ed8d867ca70f 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimators_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimators_test.py
@@ -22,7 +22,7 @@ import random
 
 import numpy as np
 
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.learn.python import learn
 from tensorflow.contrib.learn.python.learn import datasets
 from tensorflow.contrib.learn.python.learn import metric_spec
@@ -62,7 +62,7 @@ class FeatureEngineeringFunctionTest(test.TestCase):
       _ = labels
       predictions = features["transformed_x"]
       loss = constant_op.constant([2.])
-      update_global_step = variables.get_global_step().assign_add(1)
+      update_global_step = training_util.get_global_step().assign_add(1)
       return predictions, loss, update_global_step
 
     estimator = estimator_lib.Estimator(
@@ -100,7 +100,7 @@ class FeatureEngineeringFunctionTest(test.TestCase):
       _ = labels
       predictions = features["x"]
       loss = constant_op.constant([2.])
-      update_global_step = variables.get_global_step().assign_add(1)
+      update_global_step = training_util.get_global_step().assign_add(1)
       return predictions, loss, update_global_step
 
     estimator = estimator_lib.Estimator(
@@ -139,7 +139,7 @@ class FeatureEngineeringFunctionTest(test.TestCase):
       _ = labels
       predictions = features["x"]
       loss = constant_op.constant([2.])
-      update_global_step = variables.get_global_step().assign_add(1)
+      update_global_step = training_util.get_global_step().assign_add(1)
       return predictions, loss, update_global_step
 
     estimator_with_fe_fn = estimator_lib.Estimator(
diff --git a/tensorflow/contrib/learn/python/learn/estimators/kmeans.py b/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
index 992b804f59ecd88fedc2fba10d3079f93c4fe83d..8f9d6fc318a357853bdb8e3264f6691b410006b1 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
@@ -28,7 +28,7 @@ import time
 import numpy as np
 
 from tensorflow.contrib.factorization.python.ops import clustering_ops
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators.model_fn import ModelFnOps
 from tensorflow.python.framework import ops
@@ -128,7 +128,7 @@ def _kmeans_clustering_model_fn(features, labels, mode, params, config):
        random_seed=params.get('random_seed'),
        kmeans_plus_plus_num_retries=params.get(
            'kmeans_plus_plus_num_retries')).training_graph()
-  incr_step = state_ops.assign_add(variables.get_global_step(), 1)
+  incr_step = state_ops.assign_add(training_util.get_global_step(), 1)
   loss = math_ops.reduce_sum(losses, name=KMeansClustering.LOSS_OP_NAME)
   summary.scalar('loss/raw', loss)
   training_op = with_dependencies([training_op, incr_step], loss)
diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear.py b/tensorflow/contrib/learn/python/learn/estimators/linear.py
index f5445ad4e728dbd3904279573771de9454b5d17c..37aa8b339622415d082933cdf66d2472a4119b48 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/linear.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py
@@ -26,7 +26,7 @@ import six
 from tensorflow.contrib import layers
 from tensorflow.contrib.framework import deprecated
 from tensorflow.contrib.framework import deprecated_arg_values
-from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.layers.python.layers import feature_column
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
@@ -170,7 +170,7 @@ def _linear_model_fn(features, labels, mode, params, config=None):
           weight_collections=[parent_scope])
 
     def _train_op_fn(loss):
-      global_step = contrib_variables.get_global_step()
+      global_step = training_util.get_global_step()
       my_vars = ops.get_collection(parent_scope)
       grads = gradients.gradients(loss, my_vars)
       if gradient_clip_norm:
@@ -252,7 +252,7 @@ def sdca_model_fn(features, labels, mode, params):
     _add_bias_column(feature_columns, features, bias, columns_to_variables)
 
   def _train_op_fn(unused_loss):
-    global_step = contrib_variables.get_global_step()
+    global_step = training_util.get_global_step()
     sdca_model, train_op = optimizer.get_train_step(columns_to_variables,
                                                     weight_column_name,
                                                     loss_type, features,
diff --git a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py
index 93c62f87e8495f299a8c456574c7b40534186304..656d68b76888d9319c0b9be481f9b0478ac4314c 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py
@@ -21,7 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib import layers
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.layers.python.layers import optimizers
 from tensorflow.contrib.learn.python.learn.datasets import base
 from tensorflow.contrib.learn.python.learn.estimators import logistic_regressor
@@ -57,7 +57,7 @@ def _logistic_regression_model_fn(features, labels, mode):
   predictions = math_ops.sigmoid(logits)
   loss = losses.sigmoid_cross_entropy(labels, logits)
   train_op = optimizers.optimize_loss(
-      loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
   return predictions, loss, train_op
 
 
diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py
index 307db76afe20a7743df16d169270a6f319497eb6..9576ff21c243022276bb0641882dfaf0decf05c0 100644
--- a/tensorflow/contrib/learn/python/learn/experiment.py
+++ b/tensorflow/contrib/learn/python/learn/experiment.py
@@ -35,6 +35,7 @@ from tensorflow.contrib.learn.python.learn import trainable
 from tensorflow.contrib.learn.python.learn.estimators import run_config
 from tensorflow.contrib.tpu.python.tpu import tpu_estimator
 from tensorflow.python.estimator import estimator as core_estimator
+from tensorflow.python.estimator import util as estimator_util
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import basic_session_run_hooks
@@ -46,6 +47,18 @@ from tensorflow.python.util import compat
 __all__ = ["Experiment"]
 
 
+def _get_standardized_predicate_fn(predicate_fn):
+  pred_fn_args = estimator_util.fn_args(predicate_fn)
+  if "checkpoint_path" not in pred_fn_args:
+    # pylint: disable=unused-argument
+    def _pred_fn_wrapper(eval_results, checkpoint_path):
+      return predicate_fn(eval_results)
+
+    return _pred_fn_wrapper
+  else:
+    return predicate_fn
+
+
 class _EvalAndExportListener(basic_session_run_hooks.CheckpointSaverListener):
   """Listener that evaluates and exports a model after creating a checkpoint.
 
@@ -140,7 +153,8 @@ class Experiment(object):
                delay_workers_by_global_step=False,
                export_strategies=None,
                train_steps_per_iteration=None,
-               checkpoint_and_export=False):
+               checkpoint_and_export=False,
+               saving_listeners=None):
     """Constructor for `Experiment`.
 
     Creates an Experiment instance. None of the functions passed to this
@@ -200,6 +214,9 @@ class Experiment(object):
         `save_checkpoints_steps`. Also, this parameter leads to the creation of
         a default `CheckpointSaverHook` instead of a `ValidationMonitor`, so the
         provided `train_monitors` will need to be adjusted accordingly.
+      saving_listeners: list of `CheckpointSaverListener` objects. Used by
+        tf.estimator.Estimator for callbacks that run immediately before or
+        after checkpoint savings.
 
     Raises:
       ValueError: if `estimator` does not implement Estimator interface,
@@ -221,6 +238,9 @@ class Experiment(object):
         raise ValueError(
             "`estimator` must implement `tf.contrib.learn.Trainable`"
             "or `tf.estimator.`Estimator`.")
+      if saving_listeners is not None:
+        raise ValueError("`saving_listeners` must be `None` with "
+                         "`tf.contrib.learn.Estimator`.")
 
     if isinstance(estimator, tpu_estimator.TPUEstimator):
       logging.warn(
@@ -242,6 +262,7 @@ class Experiment(object):
     self._eval_delay_secs = eval_delay_secs
     self._continuous_eval_throttle_secs = continuous_eval_throttle_secs
     self._checkpoint_and_export = checkpoint_and_export
+    self._saving_listeners = saving_listeners
     # Using 1 on a non-cached file system requires a lot of overhead to
     # read the checkpoint state file. This is particular bad on GCS, so
     # we use a different default. This is a temporary band-aid, to be
@@ -362,9 +383,11 @@ class Experiment(object):
       logging.info("Waiting %d secs before starting training.", remaining)
       time.sleep(delay_secs)
 
-    return self._call_train(input_fn=self._train_input_fn,
-                            max_steps=self._train_steps,
-                            hooks=self._train_monitors + extra_hooks)
+    return self._call_train(
+        input_fn=self._train_input_fn,
+        max_steps=self._train_steps,
+        hooks=self._train_monitors + extra_hooks,
+        saving_listeners=self._saving_listeners)
 
   def evaluate(self, delay_secs=None, name=None):
     """Evaluate on the evaluation data.
@@ -436,22 +459,33 @@ class Experiment(object):
       evaluate_checkpoint_only_once: Whether to skip evaluation of checkpoints
         that have already been evaluated. Default is `True`.
       continuous_eval_predicate_fn: A predicate function determining whether to
-        continue eval after each iteration. `predicate_fn` takes the evaluation
-        results as arguments. At the beginning of evaluation, the passed eval
-        results will be None so it's expected that the predicate function
-        handles that gracefully. When `predicate_fn` is not specified,
-        continuous eval will run in an infinite loop (if `train_steps` is None)
-        or exit once global step reaches `train_steps`.
+        continue eval after each iteration. A `predicate_fn` has one of the
+        following signatures:
+          * (eval_results) -> boolean
+          * (eval_results, checkpoint_path) -> boolean
+        Where `eval_results` is the dictionary of metric evaluations and
+        checkpoint_path is the path to the checkpoint containing the parameters
+        on which that evaluation was based.
+        At the beginning of evaluation, the passed `eval_results` will be None
+        so it's expected that the predicate function handles that gracefully.
+        When `predicate_fn` is not specified, continuous eval will run in an
+        infinite loop (if `train_steps` is None). or exit once global step
+        reaches `train_steps`.
+
       export: Whether to export from this step. Default is 'True'.
 
     Raises:
       ValueError: if `continuous_eval_predicate_fn` is neither None nor
         callable.
     """
-    if (continuous_eval_predicate_fn is not None and
-        not callable(continuous_eval_predicate_fn)):
-      raise ValueError(
-          "`continuous_eval_predicate_fn` must be a callable, or None.")
+    if continuous_eval_predicate_fn is not None:
+      if not callable(continuous_eval_predicate_fn):
+        raise ValueError(
+            "`continuous_eval_predicate_fn` must be a callable, or None.")
+      predicate_fn = _get_standardized_predicate_fn(
+          continuous_eval_predicate_fn)
+    else:
+      predicate_fn = None
 
     if delay_secs is None:
       delay_secs = self._eval_delay_secs
@@ -465,8 +499,10 @@ class Experiment(object):
     previous_path = None
     eval_result = None
     last_warning_time = 0
-    while (not continuous_eval_predicate_fn or
-           continuous_eval_predicate_fn(eval_result)):
+    while (not predicate_fn or
+           predicate_fn(
+               eval_result,
+               checkpoint_path=previous_path if eval_result else None)):
       # Exit if we have already reached number of steps to train.
       if self._has_training_stopped(eval_result):
         logging.info("Exiting continuous eval, global_step=%s >= "
@@ -672,11 +708,19 @@ class Experiment(object):
 
     Args:
       continuous_eval_predicate_fn: A predicate function determining whether to
-        continue after each iteration. `predicate_fn` takes the evaluation
-        results as its arguments. At the beginning of evaluation, the passed
-        eval results will be None so it's expected that the predicate function
-        handles that gracefully. When `predicate_fn` is not specified, this will
-        run in an infinite loop or exit when global_step reaches `train_steps`.
+        continue eval after each iteration. A `predicate_fn` has one of the
+        following signatures:
+          * (eval_results) -> boolean
+          * (eval_results, checkpoint_path) -> boolean
+        Where `eval_results` is the dictionary of metric evaluations and
+        checkpoint_path is the path to the checkpoint containing the parameters
+        on which that evaluation was based.
+        At the beginning of evaluation, the passed `eval_results` and
+        `checkpoint_path` will be None so it's expected that the predicate
+        function handles that gracefully.
+        When `predicate_fn` is not specified, continuous eval will run in an
+        infinite loop (if `train_steps` is None). or exit once global step
+        reaches `train_steps`.
 
     Returns:
       A tuple of the result of the `evaluate` call to the `Estimator` and the
@@ -687,13 +731,18 @@ class Experiment(object):
         callable.
     """
 
-    if (continuous_eval_predicate_fn is not None and
-        not callable(continuous_eval_predicate_fn)):
-      raise ValueError(
-          "`continuous_eval_predicate_fn` must be a callable, or None.")
+    if continuous_eval_predicate_fn is not None:
+      if not callable(continuous_eval_predicate_fn):
+        raise ValueError(
+            "`continuous_eval_predicate_fn` must be a callable, or None.")
+      predicate_fn = _get_standardized_predicate_fn(
+          continuous_eval_predicate_fn)
+    else:
+      predicate_fn = None
 
-    eval_result = None
     export_results = None
+    latest_checkpoint = None
+    eval_result = None
 
     # Set the default value for train_steps_per_iteration, which will be
     # overridden by other settings.
@@ -703,8 +752,10 @@ class Experiment(object):
     elif self._train_steps is not None:
       train_steps_per_iteration = int(self._train_steps / 10)
 
-    while (not continuous_eval_predicate_fn or
-           continuous_eval_predicate_fn(eval_result)):
+    while (not predicate_fn or
+           predicate_fn(
+               eval_result,
+               checkpoint_path=latest_checkpoint if eval_result else None)):
 
       if self._has_training_stopped(eval_result):
         # Exits once max steps of training is satisfied.
@@ -712,16 +763,21 @@ class Experiment(object):
         break
 
       logging.info("Training model for %s steps", train_steps_per_iteration)
-      self._call_train(input_fn=self._train_input_fn,
-                       steps=train_steps_per_iteration,
-                       hooks=self._train_monitors)
+      self._call_train(
+          input_fn=self._train_input_fn,
+          steps=train_steps_per_iteration,
+          hooks=self._train_monitors,
+          saving_listeners=self._saving_listeners)
 
       logging.info("Evaluating model now.")
-      eval_result = self._call_evaluate(input_fn=self._eval_input_fn,
-                                        steps=self._eval_steps,
-                                        metrics=self._eval_metrics,
-                                        name="one_pass",
-                                        hooks=self._eval_hooks)
+      latest_checkpoint = saver.latest_checkpoint(self._estimator.model_dir)
+      eval_result = self._call_evaluate(
+          input_fn=self._eval_input_fn,
+          steps=self._eval_steps,
+          metrics=self._eval_metrics,
+          name="one_pass",
+          checkpoint_path=latest_checkpoint,
+          hooks=self._eval_hooks)
       export_results = self._maybe_export(eval_result)
 
     return eval_result, export_results
@@ -762,9 +818,11 @@ class Experiment(object):
     Returns:
       The result of the `evaluate` call to the `Estimator`.
     """
-    self._call_train(input_fn=self._train_input_fn,
-                     steps=1,
-                     hooks=self._train_monitors)
+    self._call_train(
+        input_fn=self._train_input_fn,
+        steps=1,
+        hooks=self._train_monitors,
+        saving_listeners=self._saving_listeners)
 
     eval_result = self._call_evaluate(input_fn=self._eval_input_fn,
                                       steps=1,
@@ -792,7 +850,8 @@ class Experiment(object):
     return server
 
   def _call_train(self, _sentinel=None,  # pylint: disable=invalid-name,
-                  input_fn=None, steps=None, hooks=None, max_steps=None):
+                  input_fn=None, steps=None, hooks=None, max_steps=None,
+                  saving_listeners=None):
     if _sentinel is not None:
       raise ValueError("_call_train should be called with keyword args only")
 
@@ -801,10 +860,12 @@ class Experiment(object):
     # safe to convert for both cases.
     hooks = monitors.replace_monitors_with_hooks(hooks, self._estimator)
     if self._core_estimator_used:
-      return self._estimator.train(input_fn=input_fn,
-                                   steps=steps,
-                                   max_steps=max_steps,
-                                   hooks=hooks)
+      return self._estimator.train(
+          input_fn=input_fn,
+          steps=steps,
+          max_steps=max_steps,
+          hooks=hooks,
+          saving_listeners=saving_listeners)
     else:
       return self._estimator.fit(input_fn=input_fn,
                                  steps=steps,
diff --git a/tensorflow/contrib/learn/python/learn/experiment_test.py b/tensorflow/contrib/learn/python/learn/experiment_test.py
index fe40d27c445d4f560c96fc9b50ceb0daed30ee93..545d7d8924c0c10544e6113e2968b7ae3d2090fc 100644
--- a/tensorflow/contrib/learn/python/learn/experiment_test.py
+++ b/tensorflow/contrib/learn/python/learn/experiment_test.py
@@ -232,14 +232,19 @@ class ExperimentTest(test.TestCase):
 
   def test_train(self):
     for est in self._estimators_for_tests():
-      eval_metrics = 'eval_metrics' if not isinstance(
-          est, core_estimator.Estimator) else None
+      if isinstance(est, core_estimator.Estimator):
+        eval_metrics = None
+        saving_listeners = 'saving_listeners'
+      else:
+        eval_metrics = 'eval_metrics'
+        saving_listeners = None
       ex = experiment.Experiment(
           est,
           train_input_fn='train_input',
           train_steps='train_steps',
           eval_input_fn='eval_input',
-          eval_metrics=eval_metrics)
+          eval_metrics=eval_metrics,
+          saving_listeners=saving_listeners)
       fit_args = ex.train(delay_secs=0)
       self.assertEqual(1, est.fit_count)
       self.assertIn(('max_steps', 'train_steps'), fit_args)
@@ -487,6 +492,33 @@ class ExperimentTest(test.TestCase):
       self.assertEqual(3, est.eval_count)
       self.assertEqual([noop_hook], est.eval_hooks)
 
+  def test_continuous_eval_predicate_fn_with_checkpoint(self):
+    for est in self._estimators_for_tests():
+      eval_metrics = 'eval_metrics' if not isinstance(
+          est, core_estimator.Estimator) else None
+      est.fake_checkpoint()
+      noop_hook = _NoopHook()
+
+      def _predicate_fn(eval_result, checkpoint_path):
+        self.assertEqual(not eval_result,
+                         checkpoint_path is None)
+        return est.eval_count < 3  # pylint: disable=cell-var-from-loop
+
+      ex = experiment.Experiment(
+          est,
+          train_input_fn='train_input',
+          eval_input_fn='eval_input',
+          eval_metrics=eval_metrics,
+          eval_hooks=[noop_hook],
+          eval_delay_secs=0,
+          continuous_eval_throttle_secs=0)
+      ex.continuous_eval(
+          evaluate_checkpoint_only_once=False,
+          continuous_eval_predicate_fn=_predicate_fn)
+      self.assertEqual(0, est.fit_count)
+      self.assertEqual(3, est.eval_count)
+      self.assertEqual([noop_hook], est.eval_hooks)
+
   def test_run_local(self):
     for est in self._estimators_for_tests():
       eval_metrics = 'eval_metrics' if not isinstance(
@@ -675,8 +707,12 @@ class ExperimentTest(test.TestCase):
 
   def test_continuous_train_and_eval(self):
     for est in self._estimators_for_tests(eval_dict={'global_step': 100}):
-      eval_metrics = 'eval_metrics' if not isinstance(
-          est, core_estimator.Estimator) else None
+      if isinstance(est, core_estimator.Estimator):
+        eval_metrics = None
+        saving_listeners = 'saving_listeners'
+      else:
+        eval_metrics = 'eval_metrics'
+        saving_listeners = None
       noop_hook = _NoopHook()
       export_strategy = saved_model_export_utils.make_export_strategy(
           est,
@@ -690,7 +726,8 @@ class ExperimentTest(test.TestCase):
           eval_hooks=[noop_hook],
           train_steps=100,
           eval_steps=100,
-          export_strategies=export_strategy)
+          export_strategies=export_strategy,
+          saving_listeners=saving_listeners)
       ex.continuous_train_and_eval()
       self.assertEqual(1, est.fit_count)
       self.assertEqual(1, est.eval_count)
@@ -742,9 +779,10 @@ class ExperimentTest(test.TestCase):
     ex.continuous_train_and_eval(continuous_eval_predicate_fn=predicate_fn)
     mock_estimator.train.assert_called_once_with(
         input_fn='train_input',
-        steps=int(total_steps/10),
+        steps=int(total_steps / 10),
         max_steps=test.mock.ANY,
-        hooks=test.mock.ANY)
+        hooks=test.mock.ANY,
+        saving_listeners=test.mock.ANY)
 
   def test_continuous_train_and_eval_with_steps_per_iteration_from_user(self):
     mock_estimator = test.mock.Mock(core_estimator.Estimator)
@@ -768,7 +806,8 @@ class ExperimentTest(test.TestCase):
         input_fn='train_input',
         steps=1234,
         max_steps=test.mock.ANY,
-        hooks=test.mock.ANY)
+        hooks=test.mock.ANY,
+        saving_listeners=test.mock.ANY)
 
   def test_continuous_train_and_eval_with_default_steps_per_iteration(self):
     mock_estimator = test.mock.Mock(core_estimator.Estimator)
@@ -791,7 +830,8 @@ class ExperimentTest(test.TestCase):
         input_fn='train_input',
         steps=1000,
         max_steps=test.mock.ANY,
-        hooks=test.mock.ANY)
+        hooks=test.mock.ANY,
+        saving_listeners=test.mock.ANY)
 
   def test_continuous_train_and_eval_with_invalid_predicate_fn(self):
     for est in self._estimators_for_tests():
@@ -857,11 +897,19 @@ class ExperimentTest(test.TestCase):
           est,
           None if isinstance(est, core_estimator.Estimator) else 'export_input',
           exports_to_keep=None)
+      if isinstance(est, core_estimator.Estimator):
+        eval_metrics = None
+        saving_listeners = 'saving_listeners'
+      else:
+        eval_metrics = 'eval_metrics'
+        saving_listeners = None
       ex = experiment.Experiment(
           est,
           train_input_fn='train_input',
           eval_input_fn='eval_input',
-          export_strategies=(exp_strategy,))
+          export_strategies=(exp_strategy,),
+          eval_metrics=eval_metrics,
+          saving_listeners=saving_listeners)
       ex.test()
       self.assertEqual(1, est.fit_count)
       self.assertEqual(1, est.eval_count)
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
index db18ebf05d5fb98e28e767be7bcccdf992a56fd8..f36a778b529a83f158241ddb060959c4b33e2e95 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
@@ -28,7 +28,6 @@ import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import tf_logging as logging
@@ -369,10 +368,11 @@ class DataFeeder(object):
     if x_is_dict:
       num_samples = list(self._x.values())[0].shape[0]
     elif tensor_util.is_tensor(self._x):
-      num_samples = self._x.shape[0].value  # shape will be a Dimension, extract an int
+      num_samples = self._x.shape[
+          0].value  # shape will be a Dimension, extract an int
     else:
       num_samples = self._x.shape[0]
-      
+
     if self._shuffle:
       self.indices = self.random_state.permutation(num_samples)
     else:
@@ -857,8 +857,8 @@ class DaskDataFeeder(object):
     """Returns a function, that will sample data and provide it to placeholders.
 
     Args:
-      input_placeholder: tf.Placeholder for input features mini batch.
-      output_placeholder: tf.Placeholder for output labels.
+      input_placeholder: tf.placeholder for input features mini batch.
+      output_placeholder: tf.placeholder for output labels.
 
     Returns:
       A function that when called samples a random subset of batch size
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py b/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
index 4b34fc62849766370979bb2002d42ee03ea7161a..3a46c239688017f9204d2c6182a6f81cd325a417 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.layers import utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import io_ops
@@ -280,14 +281,33 @@ def _get_file_names(file_pattern, randomize_input):
 
 def _get_examples(file_name_queue, reader, num_threads, read_batch_size,
                   filter_fn, parse_fn):
+  """Get example filenames matching.
+
+  Args:
+    file_name_queue: A queue implementation that dequeues elements in
+      first-in first-out order.
+    reader: A function or class that returns an object with
+      `read` method, (filename tensor) -> (example tensor).
+    num_threads: The number of threads enqueuing examples.
+    read_batch_size: An int or scalar `Tensor` specifying the number of
+      records to read at once.
+    filter_fn: Filtering function, takes both keys as well as an `Example`
+      Tensors and returns a boolean mask of the same shape as the input Tensors
+      to be applied for filtering. If `None`, no filtering is done.
+    parse_fn: Parsing function, takes `Example` Tensor returns parsed
+      representation. If `None`, no parsing is done.
+
+  Returns:
+    List of example file names matching `file_name_queue`.
+  """
   with ops.name_scope('read'):
     example_list = []
     for _ in range(num_threads):
-      if read_batch_size > 1:
-        keys, examples_proto = reader().read_up_to(file_name_queue,
-                                                   read_batch_size)
-      else:
-        keys, examples_proto = reader().read(file_name_queue)
+      keys, examples_proto = utils.smart_cond(
+          read_batch_size > 1,
+          lambda: reader().read_up_to(file_name_queue, read_batch_size),
+          lambda: reader().read(file_name_queue))
+
       if filter_fn:
         mask = filter_fn(keys, examples_proto)
         keys = array_ops.boolean_mask(keys, mask)
@@ -379,14 +399,15 @@ def _read_keyed_batch_examples_helper(file_pattern,
             capacity=1, dtypes=[dtypes.string], shapes=[[]])
         enqueue_op = file_name_queue.enqueue(
             input_pipeline_ops.seek_next(
-                file_names, shuffle=randomize_input, num_epochs=num_epochs,
+                file_names,
+                shuffle=randomize_input,
+                num_epochs=num_epochs,
                 seed=seed))
         queue_runner.add_queue_runner(
             queue_runner.QueueRunner(file_name_queue, [enqueue_op]))
       else:
         file_name_queue = input_ops.string_input_producer(
-            constant_op.constant(
-                file_names, name='input'),
+            constant_op.constant(file_names, name='input'),
             shuffle=randomize_input,
             num_epochs=num_epochs,
             name=file_name_queue_scope,
@@ -496,7 +517,8 @@ def read_keyed_batch_features(file_pattern,
   """
 
   with ops.name_scope(name, 'read_batch_features', [file_pattern]) as scope:
-    if read_batch_size is None: read_batch_size = batch_size
+    if read_batch_size is None:
+      read_batch_size = batch_size
     keys, examples = read_keyed_batch_examples(
         file_pattern,
         batch_size,
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py b/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
index 6f0fd9a2976d37d1c701a96f50c2b987562cb191..e11e8b698adc113486bbb45572c8129e964cc931 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
@@ -204,8 +204,7 @@ class GraphIOTest(test.TestCase):
     shape = (0,)
     features = {
         "feature":
-            parsing_ops.FixedLenFeature(
-                shape=shape, dtype=dtypes_lib.float32)
+            parsing_ops.FixedLenFeature(shape=shape, dtype=dtypes_lib.float32)
     }
 
     with ops.Graph().as_default() as g, self.test_session(graph=g) as sess:
@@ -255,8 +254,8 @@ class GraphIOTest(test.TestCase):
       self.assertAllEqual((None,), inputs.get_shape().as_list())
       self.assertEqual("%s:1" % name, inputs.name)
       file_name_queue_name = "%s/file_name_queue" % name
-      file_name_queue_limit_name = ("%s/limit_epochs/epochs" %
-                                    file_name_queue_name)
+      file_name_queue_limit_name = (
+          "%s/limit_epochs/epochs" % file_name_queue_name)
       file_names_name = "%s/input" % file_name_queue_name
       example_queue_name = "%s/random_shuffle_queue" % name
       op_nodes = test_util.assert_ops_in_graph({
@@ -354,8 +353,8 @@ class GraphIOTest(test.TestCase):
     json_lines = [
         "".join([
             '{"features": { "feature": { "sequence": {',
-            '"bytes_list": { "value": ["', base64.b64encode(l).decode("ascii"),
-            '"]}}}}}\n'
+            '"bytes_list": { "value": ["',
+            base64.b64encode(l).decode("ascii"), '"]}}}}}\n'
         ]) for l in lines
     ]
     return self._create_temp_file("".join(json_lines))
@@ -823,6 +822,31 @@ class GraphIOTest(test.TestCase):
       coord.request_stop()
       coord.join(threads)
 
+  def test_read_keyed_batch_features_shared_queue(self):
+    batch_size = 17
+    shape = (0,)
+    fixed_feature = parsing_ops.FixedLenFeature(
+        shape=shape, dtype=dtypes_lib.float32)
+    feature = {"feature": fixed_feature}
+    reader = io_ops.TFRecordReader
+
+    _, queued_feature = graph_io.read_keyed_batch_features_shared_queue(
+        _VALID_FILE_PATTERN, batch_size, feature, reader)
+
+    with ops.Graph().as_default() as g, self.test_session(graph=g) as session:
+      features_result = graph_io.read_batch_features(
+          _VALID_FILE_PATTERN, batch_size, feature, reader)
+      session.run(variables.local_variables_initializer())
+
+    self.assertAllEqual(
+        queued_feature.get("feature").get_shape().as_list(),
+        features_result.get("feature").get_shape().as_list())
+
+  def test_get_file_names_errors(self):
+    # Raise bad file_pattern.
+    with self.assertRaises(ValueError):
+      graph_io._get_file_names([], True)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/learn/python/learn/metric_spec.py b/tensorflow/contrib/learn/python/learn/metric_spec.py
index ed6683abedbb8ae76ba364405158eb52cbb6d762..6440bc204b8e339ff51311dcc87b36f556b94092 100644
--- a/tensorflow/contrib/learn/python/learn/metric_spec.py
+++ b/tensorflow/contrib/learn/python/learn/metric_spec.py
@@ -42,10 +42,8 @@ def _args(fn):
   """
   if hasattr(fn, 'func') and hasattr(fn, 'keywords'):
     # Handle functools.partial and similar objects.
-    return tuple([
-        arg for arg in tf_inspect.getargspec(fn.func).args
-        if arg not in set(fn.keywords.keys())
-    ])
+    return tuple(
+        [arg for arg in _args(fn.func) if arg not in set(fn.keywords.keys())])
   # Handle function.
   return tuple(tf_inspect.getargspec(fn).args)
 
diff --git a/tensorflow/contrib/learn/python/learn/utils/export.py b/tensorflow/contrib/learn/python/learn/utils/export.py
index 6af2287761299f6725f9547917101c18b0cc0164..cb34cb1d26b6812c7f3f39e9f965615de5a8ef07 100644
--- a/tensorflow/contrib/learn/python/learn/utils/export.py
+++ b/tensorflow/contrib/learn/python/learn/utils/export.py
@@ -20,7 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.framework import deprecated
-from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.session_bundle import exporter
 from tensorflow.contrib.session_bundle import gc
 from tensorflow.python.client import session as tf_session
@@ -78,7 +78,7 @@ def _export_graph(graph, saver, checkpoint_path, export_dir,
           default_graph_signature=default_graph_signature,
           named_graph_signatures=named_graph_signatures,
           assets_collection=ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS))
-      return export.export(export_dir, contrib_variables.get_global_step(),
+      return export.export(export_dir, training_util.get_global_step(),
                            session, exports_to_keep=exports_to_keep)
 
 
@@ -295,7 +295,7 @@ def _export_estimator(estimator,
   checkpoint_path = (checkpoint_path or
                      tf_saver.latest_checkpoint(estimator._model_dir))
   with ops.Graph().as_default() as g:
-    contrib_variables.create_global_step(g)
+    training_util.create_global_step(g)
 
     if use_deprecated_input_fn:
       examples = array_ops.placeholder(dtype=dtypes.string,
diff --git a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
index 6ffd2a133995a6ff8b35540221fb5676bf5de19f..4b404a8e20e33a17a0d5f857e4220f90c7bc799f 100644
--- a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
+++ b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
@@ -33,7 +33,6 @@ from __future__ import division
 from __future__ import print_function
 
 import os
-import tempfile
 import time
 
 from tensorflow.contrib.layers.python.layers import feature_column
@@ -682,22 +681,36 @@ def extend_export_strategy(base_export_strategy,
       ValueError: If `estimator` is a ${tf.estimator.Estimator} instance
         and `default_output_alternative_key` was specified or if post_export_fn
         does not return a valid directory.
+      RuntimeError: If unable to create temporary or final export directory.
     """
-    tmp_base_export_dir = tempfile.mkdtemp()
+    tmp_base_export_folder = 'temp-base-export-' + str(int(time.time()))
+    tmp_base_export_dir = os.path.join(export_dir_base, tmp_base_export_folder)
+    if gfile.Exists(tmp_base_export_dir):
+      raise RuntimeError('Failed to obtain base export directory')
+    gfile.MakeDirs(tmp_base_export_dir)
     tmp_base_export = base_export_strategy.export(
         estimator, tmp_base_export_dir, checkpoint_path)
-    tmp_post_export_dir = tempfile.mkdtemp()
+
+    tmp_post_export_folder = 'temp-post-export-' + str(int(time.time()))
+    tmp_post_export_dir = os.path.join(export_dir_base, tmp_post_export_folder)
+    if gfile.Exists(tmp_post_export_dir):
+      raise RuntimeError('Failed to obtain temp export directory')
+
+    gfile.MakeDirs(tmp_post_export_dir)
     tmp_post_export = post_export_fn(tmp_base_export, tmp_post_export_dir)
 
     if not tmp_post_export.startswith(tmp_post_export_dir):
       raise ValueError('post_export_fn must return a sub-directory of {}'
                        .format(tmp_post_export_dir))
-    export_relpath = os.path.relpath(tmp_post_export, tmp_post_export_dir)
-
-    gfile.Rename(
-        os.path.join(tmp_post_export_dir, export_relpath),
-        os.path.join(export_dir_base, export_relpath))
-    return os.path.join(export_dir_base, export_relpath)
+    post_export_relpath = os.path.relpath(tmp_post_export, tmp_post_export_dir)
+    post_export = os.path.join(export_dir_base, post_export_relpath)
+    if gfile.Exists(post_export):
+      raise RuntimeError('Failed to obtain final export directory')
+    gfile.Rename(tmp_post_export, post_export)
+
+    gfile.DeleteRecursively(tmp_base_export_dir)
+    gfile.DeleteRecursively(tmp_post_export_dir)
+    return post_export
 
   name = post_export_name if post_export_name else base_export_strategy.name
   return export_strategy.ExportStrategy(name, export_fn)
diff --git a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py
index ec3a88003f01b3b62591c13472029601b11ba491..628eb254c3b1129648c453dc47f0c0919891de6f 100644
--- a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py
+++ b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py
@@ -766,10 +766,11 @@ class SavedModelExportUtilsTest(test.TestCase):
 
     test_estimator = TestEstimator()
     tmpdir = tempfile.mkdtemp()
-    final_path = final_export_strategy.export(test_estimator, tmpdir,
-                                              os.path.join(
-                                                  tmpdir, "checkpoint"))
-    self.assertEqual(os.path.join(tmpdir, "rewrite"), final_path)
+    export_model_dir = os.path.join(tmpdir, "model")
+    checkpoint_path = os.path.join(tmpdir, "checkpoint")
+    final_path = final_export_strategy.export(test_estimator, export_model_dir,
+                                              checkpoint_path)
+    self.assertEqual(os.path.join(export_model_dir, "rewrite"), final_path)
 
   def test_extend_export_strategy_same_name(self):
 
@@ -795,10 +796,11 @@ class SavedModelExportUtilsTest(test.TestCase):
 
     test_estimator = TestEstimator()
     tmpdir = tempfile.mkdtemp()
-    final_path = final_export_strategy.export(test_estimator, tmpdir,
-                                              os.path.join(
-                                                  tmpdir, "checkpoint"))
-    self.assertEqual(os.path.join(tmpdir, "rewrite"), final_path)
+    export_model_dir = os.path.join(tmpdir, "model")
+    checkpoint_path = os.path.join(tmpdir, "checkpoint")
+    final_path = final_export_strategy.export(test_estimator, export_model_dir,
+                                              checkpoint_path)
+    self.assertEqual(os.path.join(export_model_dir, "rewrite"), final_path)
 
   def test_extend_export_strategy_raises_error(self):
 
diff --git a/tensorflow/contrib/libsvm/BUILD b/tensorflow/contrib/libsvm/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..df96402a4ffd51840f77d58d8066487030362340
--- /dev/null
+++ b/tensorflow/contrib/libsvm/BUILD
@@ -0,0 +1,102 @@
+package(
+    default_visibility = ["//visibility:private"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_library")
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs")
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+
+tf_custom_op_library(
+    name = "python/ops/_libsvm_ops.so",
+    srcs = [
+        "kernels/decode_libsvm_op.cc",
+        "ops/libsvm_ops.cc",
+    ],
+    deps = [
+        "//tensorflow/core/kernels:bounds_check_lib",
+    ],
+)
+
+tf_kernel_library(
+    name = "libsvm_kernels",
+    srcs = ["kernels/decode_libsvm_op.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:bounds_check_lib",
+    ],
+)
+
+tf_gen_op_libs(
+    op_lib_names = ["libsvm_ops"],
+    deps = [
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "libsvm_ops",
+    deps = [":libsvm_ops_op_lib"],
+)
+
+tf_custom_op_py_library(
+    name = "libsvm",
+    srcs = [
+        "__init__.py",
+        "python/ops/libsvm_ops.py",
+    ],
+    dso = [
+        ":python/ops/_libsvm_ops.so",
+    ],
+    kernels = [
+        ":libsvm_kernels",
+        ":libsvm_ops_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":libsvm_ops",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+    ],
+)
+
+tf_py_test(
+    name = "decode_libsvm_op_test",
+    srcs = ["python/kernel_tests/decode_libsvm_op_test.py"],
+    additional_deps = [
+        ":libsvm",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered_impl.py b/tensorflow/contrib/libsvm/__init__.py
similarity index 58%
rename from tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered_impl.py
rename to tensorflow/contrib/libsvm/__init__.py
index 223bc9d042c69be05b0e578835a31ed6e83c0c97..a875863caab29eb59a1834ca9184a5e272cb6656 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered_impl.py
+++ b/tensorflow/contrib/libsvm/__init__.py
@@ -12,28 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""SigmoidCentered bijector."""
+"""Libsvm decoder.
+
+@@decode_libsvm
+"""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops.bijectors import softmax_centered
+from tensorflow.contrib.libsvm.python.ops.libsvm_ops import decode_libsvm
 
+from tensorflow.python.util.all_util import remove_undocumented
 
-__all__ = [
-    "SigmoidCentered",
+_allowed_symbols = [
+    "decode_libsvm",
 ]
 
-
-class SigmoidCentered(softmax_centered.SoftmaxCentered):
-  """Bijector which computes Y = g(X) = exp([X 0]) / (1 + exp(-X)).
-
-  Equivalent to: `bijector.SoftmaxCentered(event_ndims=0)`.
-
-  See `bijector.SoftmaxCentered` for more details.
-  """
-
-  def __init__(self, validate_args=False, name="sigmoid_centered"):
-    super(SigmoidCentered, self).__init__(
-        event_ndims=0, validate_args=validate_args, name=name)
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/libsvm/kernels/decode_libsvm_op.cc b/tensorflow/contrib/libsvm/kernels/decode_libsvm_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fc7889b27cd9ec50d8d2f7d34975ec8cd16c258f
--- /dev/null
+++ b/tensorflow/contrib/libsvm/kernels/decode_libsvm_op.cc
@@ -0,0 +1,178 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace tensorflow {
+namespace {
+template <typename T>
+bool ConvertHelper(const string& s, T* value);
+}
+
+template <typename T, typename Tlabel>
+class DecodeLibsvmOp : public OpKernel {
+ public:
+  explicit DecodeLibsvmOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("num_features", &num_features_));
+    OP_REQUIRES(ctx, (num_features_ >= 1),
+                errors::InvalidArgument("Invalid number of features \"",
+                                        num_features_, "\""));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* input_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor));
+    const auto& input_flat = input_tensor->flat<string>();
+
+    Tensor* label_tensor;
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_output(0, input_tensor->shape(), &label_tensor));
+    auto label = label_tensor->flat<Tlabel>();
+
+    std::vector<T> out_values;
+    std::vector<std::pair<int64, int64>> out_indices;
+    for (int i = 0; i < input_flat.size(); ++i) {
+      std::vector<string> entries =
+          str_util::Split(input_flat(i), " ", str_util::SkipEmpty());
+      OP_REQUIRES(ctx, (entries.size() > 0),
+                  errors::InvalidArgument("No entries found for input[", i,
+                                          "]: \"", input_flat(i), "\""));
+      Tlabel label_value;
+      OP_REQUIRES(
+          ctx, ConvertHelper<Tlabel>(entries[0].c_str(), &label_value),
+          errors::InvalidArgument("Label format incorrect: ", entries[0]));
+      label(i) = label_value;
+      for (int j = 1; j < entries.size(); j++) {
+        std::vector<string> pair = str_util::Split(entries[j], ":");
+        OP_REQUIRES(
+            ctx, (pair.size() == 2),
+            errors::InvalidArgument("Invalid feature \"", entries[j], "\""));
+        int64 feature_index;
+        OP_REQUIRES(
+            ctx, strings::safe_strto64(pair[0].c_str(), &feature_index),
+            errors::InvalidArgument("Feature format incorrect: ", entries[j]));
+        OP_REQUIRES(ctx, (feature_index >= 0),
+                    errors::InvalidArgument(
+                        "Feature index should be >= 0, got ", feature_index));
+        T feature_value;
+        OP_REQUIRES(
+            ctx, ConvertHelper<T>(pair[1], &feature_value),
+            errors::InvalidArgument("Feature format incorrect: ", entries[j]));
+        out_values.emplace_back(feature_value);
+        out_indices.emplace_back(std::pair<int64, int64>(i, feature_index));
+      }
+    }
+
+    Tensor* indices_tensor;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(
+                            1,
+                            TensorShape({static_cast<int64>(out_indices.size()),
+                                         input_tensor->shape().dims() + 1}),
+                            &indices_tensor));
+    auto indices = indices_tensor->matrix<int64>();
+    // Translate flat index to shaped index like np.unravel_index
+    // Calculate factors for each dimension
+    std::vector<int64> factors(input_tensor->shape().dims());
+    factors[input_tensor->shape().dims() - 1] = 1;
+    for (int j = input_tensor->shape().dims() - 2; j >= 0; j--) {
+      factors[j] = factors[j + 1] * input_tensor->shape().dim_size(j + 1);
+    }
+    for (int i = 0; i < out_indices.size(); i++) {
+      indices(i, 0) = out_indices[i].first;
+      int64 value = out_indices[i].first;
+      for (int j = 0; j < input_tensor->shape().dims(); j++) {
+        indices(i, j) = value / factors[j];
+        value = value % factors[j];
+      }
+      indices(i, input_tensor->shape().dims()) = out_indices[i].second;
+    }
+
+    Tensor* values_tensor;
+    OP_REQUIRES_OK(ctx,
+                   ctx->allocate_output(
+                       2, TensorShape({static_cast<int64>(out_values.size())}),
+                       &values_tensor));
+    auto values = values_tensor->vec<T>();
+    std::copy_n(out_values.begin(), out_values.size(), &values(0));
+
+    Tensor* shape_tensor;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(
+                            3, TensorShape({input_tensor->shape().dims() + 1}),
+                            &shape_tensor));
+    auto shape = shape_tensor->flat<int64>();
+    for (int i = 0; i < input_tensor->shape().dims(); i++) {
+      shape(i) = input_tensor->shape().dim_size(i);
+    }
+    shape(input_tensor->shape().dims()) = num_features_;
+  }
+
+ private:
+  int64 num_features_;
+};
+
+namespace {
+template <>
+bool ConvertHelper<float>(const string& s, float* value) {
+  return strings::safe_strtof(s.c_str(), value);
+}
+template <>
+bool ConvertHelper<double>(const string& s, double* value) {
+  return strings::safe_strtod(s.c_str(), value);
+}
+template <>
+bool ConvertHelper<int32>(const string& s, int32* value) {
+  return strings::safe_strto32(s.c_str(), value);
+}
+template <>
+bool ConvertHelper<int64>(const string& s, int64* value) {
+  return strings::safe_strto64(s.c_str(), value);
+}
+}  // namespace
+
+#define REGISTER_KERNEL(type)                                         \
+  REGISTER_KERNEL_BUILDER(Name("DecodeLibsvm")                        \
+                              .Device(DEVICE_CPU)                     \
+                              .TypeConstraint<type>("dtype")          \
+                              .TypeConstraint<int32>("label_dtype"),  \
+                          DecodeLibsvmOp<type, int32>);               \
+  REGISTER_KERNEL_BUILDER(Name("DecodeLibsvm")                        \
+                              .Device(DEVICE_CPU)                     \
+                              .TypeConstraint<type>("dtype")          \
+                              .TypeConstraint<int64>("label_dtype"),  \
+                          DecodeLibsvmOp<type, int64>);               \
+  REGISTER_KERNEL_BUILDER(Name("DecodeLibsvm")                        \
+                              .Device(DEVICE_CPU)                     \
+                              .TypeConstraint<type>("dtype")          \
+                              .TypeConstraint<float>("label_dtype"),  \
+                          DecodeLibsvmOp<type, float>);               \
+  REGISTER_KERNEL_BUILDER(Name("DecodeLibsvm")                        \
+                              .Device(DEVICE_CPU)                     \
+                              .TypeConstraint<type>("dtype")          \
+                              .TypeConstraint<double>("label_dtype"), \
+                          DecodeLibsvmOp<type, double>);
+
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+REGISTER_KERNEL(int32);
+REGISTER_KERNEL(int64);
+#undef REGISTER_KERNEL
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/libsvm/ops/libsvm_ops.cc b/tensorflow/contrib/libsvm/ops/libsvm_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4c65e676291d1244f5224c43d32a321ae72ffe41
--- /dev/null
+++ b/tensorflow/contrib/libsvm/ops/libsvm_ops.cc
@@ -0,0 +1,59 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
+
+REGISTER_OP("DecodeLibsvm")
+    .Input("input: string")
+    .Output("label: label_dtype")
+    .Output("feature_indices: int64")
+    .Output("feature_values: dtype")
+    .Output("feature_shape: int64")
+    .Attr("dtype: {float, double, int32, int64} = DT_FLOAT")
+    .Attr("label_dtype: {float, double, int32, int64} = DT_INT64")
+    .Attr("num_features: int >= 1")
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->input(0));
+
+      c->set_output(1, c->Matrix(InferenceContext::kUnknownDim,
+                                 InferenceContext::kUnknownDim));
+      c->set_output(2, c->Vector(InferenceContext::kUnknownDim));
+      c->set_output(3, c->Vector(InferenceContext::kUnknownDim));
+
+      return Status::OK();
+    })
+
+    .Doc(R"doc(
+Convert LibSVM input to tensors. The output consists of
+a label and a feature tensor. The shape of the label tensor
+is the same as input and the shape of the feature tensor is
+`[input_shape, num_features]`.
+
+input: Each string is a record in the LibSVM.
+label: A tensor of the same shape as input.
+feature_indices: A 2-D int64 tensor of dense_shape [N, ndims].
+feature_values: A 1-D tensor of any type and dense_shape [N].
+feature_shape: A 1-D int64 tensor of dense_shape [ndims].
+num_features: The number of features.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/libsvm/python/kernel_tests/decode_libsvm_op_test.py b/tensorflow/contrib/libsvm/python/kernel_tests/decode_libsvm_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d9d5ceed393c03e6baa0872950670cf1ff71d3f
--- /dev/null
+++ b/tensorflow/contrib/libsvm/python/kernel_tests/decode_libsvm_op_test.py
@@ -0,0 +1,72 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for DecodeLibsvm op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.libsvm.python.ops import libsvm_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.platform import test
+
+
+class DecodeLibsvmOpTest(test.TestCase):
+
+  def testBasic(self):
+    with self.test_session() as sess:
+      content = ["1 1:3.4 2:0.5 4:0.231",
+                 "1 2:2.5 3:inf 5:0.503",
+                 "2 3:2.5 2:nan 1:0.105"]
+      sparse_features, labels = libsvm_ops.decode_libsvm(content,
+                                                         num_features=6)
+      features = sparse_ops.sparse_tensor_to_dense(sparse_features,
+                                                   validate_indices=False)
+
+      self.assertAllEqual(labels.get_shape().as_list(), [3])
+
+      features, labels = sess.run([features, labels])
+      self.assertAllEqual(labels, [1, 1, 2])
+      self.assertAllClose(features, [[0, 3.4, 0.5, 0, 0.231, 0],
+                                     [0, 0, 2.5, np.inf, 0, 0.503],
+                                     [0, 0.105, np.nan, 2.5, 0, 0]])
+
+  def testNDimension(self):
+    with self.test_session() as sess:
+      content = [["1 1:3.4 2:0.5 4:0.231", "1 1:3.4 2:0.5 4:0.231"],
+                 ["1 2:2.5 3:inf 5:0.503", "1 2:2.5 3:inf 5:0.503"],
+                 ["2 3:2.5 2:nan 1:0.105", "2 3:2.5 2:nan 1:0.105"]]
+      sparse_features, labels = libsvm_ops.decode_libsvm(
+          content, num_features=6, label_dtype=dtypes.float64)
+      features = sparse_ops.sparse_tensor_to_dense(sparse_features,
+                                                   validate_indices=False)
+
+      self.assertAllEqual(labels.get_shape().as_list(), [3, 2])
+
+      features, labels = sess.run([features, labels])
+      self.assertAllEqual(labels, [[1, 1], [1, 1], [2, 2]])
+      self.assertAllClose(features, [[[0, 3.4, 0.5, 0, 0.231, 0],
+                                     [0, 3.4, 0.5, 0, 0.231, 0]],
+                                    [[0, 0, 2.5, np.inf, 0, 0.503],
+                                     [0, 0, 2.5, np.inf, 0, 0.503]],
+                                    [[0, 0.105, np.nan, 2.5, 0, 0],
+                                     [0, 0.105, np.nan, 2.5, 0, 0]]])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/libsvm/python/ops/libsvm_ops.py b/tensorflow/contrib/libsvm/python/ops/libsvm_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c133e7e7f048966f222a5e3a1a61c5ac7c723eb
--- /dev/null
+++ b/tensorflow/contrib/libsvm/python/ops/libsvm_ops.py
@@ -0,0 +1,51 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Libsvm decoder."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.libsvm.ops import gen_libsvm_ops
+from tensorflow.contrib.util import loader
+from tensorflow.python.framework import common_shapes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import io_ops
+from tensorflow.python.platform import resource_loader
+
+
+_libsvm_ops_so = loader.load_op_library(
+    resource_loader.get_path_to_datafile("_libsvm_ops.so"))
+
+def decode_libsvm(content, num_features, dtype=None, label_dtype=None):
+  """Convert Libsvm records to a tensor of label and a tensor of feature.
+
+  Args:
+    content: A `Tensor` of type `string`. Each string is a record/row in
+      the Libsvm format.
+    num_features: The number of features.
+    dtype: The type of the output feature tensor. Default to tf.float32.
+    label_dtype: The type of the output label tensor. Default to tf.int64.
+
+  Returns:
+    features: A `SparseTensor` of the shape `[input_shape, num_features]`.
+    labels: A `Tensor` of the same shape as content.
+  """
+  labels, indices, values, shape = gen_libsvm_ops.decode_libsvm(
+      content, num_features, dtype=dtype, label_dtype=label_dtype)
+  return sparse_tensor.SparseTensor(indices, values, shape), labels
+
+
+ops.NotDifferentiable('DecodeLibSVM')
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
index 86d848439191aeb0dfa88bbe0fb9b3b654499423..7526f3ae0dbdb3d6827e9d7f690090b8438e4f6e 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
@@ -251,8 +251,9 @@ class SdcaModel(object):
 
       result_dense = 0.0
       for i in range(len(dense_variables)):
-        result_dense += math_ops.matmul(
-            dense_features[i], array_ops.expand_dims(dense_variables[i], -1))
+        result_dense += math_ops.matmul(dense_features[i],
+                                        array_ops.expand_dims(
+                                            dense_variables[i], -1))
 
     # Reshaping to allow shape inference at graph construction time.
     return array_ops.reshape(result_dense, [-1]) + result_sparse
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py b/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
index 701fc1c0597d1de0b0189e86feafbd1c5bbdc818..05794a42c5f2d0eece6adab36fb5610078cece31 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
@@ -19,7 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib import layers
-from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
@@ -154,7 +154,7 @@ def sdca_model_fn(features, labels, mode, params, config=None):
     _add_bias_column(feature_columns, features, bias, columns_to_variables)
 
   def _train_op_fn(unused_loss):
-    global_step = contrib_variables.get_global_step()
+    global_step = training_util.get_global_step()
     sdca_model, train_op = optimizer.get_train_step(
         columns_to_variables, weight_column_name, loss_type, features, labels,
         global_step)
diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD
index 96a9e281ad11009e8406bb6ccd583adba09f9f0d..3f1b0be1a73a3ff1da3452f4ee1a9125f9e26178 100644
--- a/tensorflow/contrib/lite/BUILD
+++ b/tensorflow/contrib/lite/BUILD
@@ -111,6 +111,7 @@ cc_test(
     deps = [
         ":framework",
         ":string_util",
+        "//tensorflow/contrib/lite/testing:util",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -134,6 +135,7 @@ cc_test(
     srcs = ["simple_memory_arena_test.cc"],
     deps = [
         ":framework",
+        "//tensorflow/contrib/lite/testing:util",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -152,6 +154,7 @@ cc_test(
     ],
     deps = [
         ":framework",
+        "//tensorflow/contrib/lite/testing:util",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -163,6 +166,7 @@ cc_test(
     srcs = ["context_test.cc"],
     deps = [
         ":framework",
+        "//tensorflow/contrib/lite/testing:util",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -191,6 +195,9 @@ filegroup(
         exclude = [
             "**/METADATA",
             "**/OWNERS",
+            "downloads",
+            "examples",
+            "gen",
         ],
     ),
     visibility = ["//tensorflow:__subpackages__"],
diff --git a/tensorflow/contrib/lite/README.md b/tensorflow/contrib/lite/README.md
index 827c5d0baa90b73a72c3565e23c417c24b1d06d8..852284cbc7f33b5d9c0f7774bca89c1dff3fa3ec 100644
--- a/tensorflow/contrib/lite/README.md
+++ b/tensorflow/contrib/lite/README.md
@@ -1,10 +1,10 @@
 # TensorFlow Lite
-TensorFlow Lite is TensorFlow’s lightweight solution for mobile and embedded devices. It enables low-latency inference of on-device machine learning models with a small binary size and fast performance supporting hardware acceleration.
+TensorFlow Lite is TensorFlow's lightweight solution for mobile and embedded devices. It enables low-latency inference of on-device machine learning models with a small binary size and fast performance supporting hardware acceleration.
 
 TensorFlow Lite uses many techniques for achieving low latency like optimizing the kernels for specific mobile apps, pre-fused activations, quantized kernels that allow smaller and faster (fixed-point math) models, and in the future, leverage specialized machine learning hardware to get the best possible performance for a particular model on a particular device.
 
 ![image](g3doc/TFLite-Architecture.jpg)
-# Getting Started with a Demo App
+# Getting Started with an Android Demo App
 
 This section contains an example application using TensorFlow Lite for Android devices. The demo is a sample camera app that classifies images continuously using a quantized Mobilenet model. A device running Android 5.0 ( API 21) or higher is required to run the demo.
 
@@ -17,21 +17,21 @@ There are 3 ways to get the demo app to your device
 In the demo app, inference is done using the TensorFlow Lite Java API. The demo app classifies frames in real-time, displaying the top most probable classifications. It also displays the time taken to detect the object.
 
 ## Downloading the pre-built binary
-The  fastest path to trying the demo, is to download the pre-built binary
+The fastest path to trying the demo, is to download the pre-built binary
 [TfLiteCameraDemo.apk](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk)
 
-Once the apk is installed, click the app icon to start the app. The first-time the app is opened, the app asks for runtime permissions to access the device camera. The demo app opens the back-camera of the device and recognizes the objects in the camera’s field of view. At the bottom of the image (or at the left of the image if the device is in landscape mode), it shows the latency of classification and the top three objects classified.
+Once the apk is installed, click the app icon to start the app. The first-time the app is opened, the app asks for runtime permissions to access the device camera. The demo app opens the back-camera of the device and recognizes the objects in the camera's field of view. At the bottom of the image (or at the left of the image if the device is in landscape mode), it shows the latency of classification and the top three objects classified.
 
 ## Building in Android Studio using TensorFlow Lite AAR from JCenter
 The simplest way to compile the demo app, and try out changes to the project code is to use AndroidStudio.
 
  - Install the latest version of Android Studio 3 as specified [here](https://developer.android.com/studio/index.html).
  - Make sure the Android SDK version is greater than 26 and NDK version is greater than 14 (in the Android Studio Settings).
- - Import the tensorflow/contrib/lite/java/demo directory as a new Android Studio project.
+ - Import the `tensorflow/contrib/lite/java/demo` directory as a new Android Studio project.
  - Click through installing all the Gradle extensions it requests.
  - Download the quantized Mobilenet TensorFlow Lite model from [here](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip)
      - unzip and copy mobilenet_quant_v1_224.tflite to the assets directory:
-       tensorflow/contrib/lite/java/demo/app/src/main/assets/
+       `tensorflow/contrib/lite/java/demo/app/src/main/assets/`
  - Build and run the demo app
 
 ## Building TensorFlow Lite and the demo app from source
@@ -43,7 +43,7 @@ The simplest way to compile the demo app, and try out changes to the project cod
 ### Install Bazel
 If bazel is not installed on your system, install it now by following [these directions](https://bazel.build/versions/master/docs/install.html)
 
-NOTE: Bazel does not currently support building for Android on Windows. Full support for gradle/cmake builds is coming soon, but in the meantime Windows users should download the [prebuilt binary](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk) instead.
+NOTE: Bazel does not fully support building Android on Windows yet. Full support for Gradle/CMake builds is coming soon, but in the meantime Windows users should download the [prebuilt binary](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk) instead.
 
 ### Install Android NDK and SDK
 Bazel is the primary build system for TensorFlow. Bazel and the Android NDK and SDK must be installed on your system.
@@ -53,25 +53,30 @@ Bazel is the primary build system for TensorFlow. Bazel and the Android NDK and
  - In the root of the TensorFlow repository update the `WORKSPACE` file with the `api_level` and location of the SDK and NDK. If you installed it with AndroidStudio the SDK path can be found in the SDK manager, and the default NDK path is:`{SDK path}/ndk-bundle.`
 
 ```
- Android_sdk_repository (
-   name = "androidsdk",
-   api_level = 23,
-   build_tools_version = "23.0.2",
-   path = "/home/xxxx/android-sdk-linux/", )
+android_sdk_repository (
+    name = "androidsdk",
+    api_level = 23,
+    build_tools_version = "23.0.2",
+    path = "/home/xxxx/android-sdk-linux/",
+)
 
 android_ndk_repository(
-  name="androidndk",
-  path="/home/xxxx/android-ndk-r10e/",
-  api_level=19)
-
+    name = "androidndk",
+    path = "/home/xxxx/android-ndk-r10e/",
+    api_level = 19,
+)
 ```
+
 Additional details on building with Android can be found [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/README.md).
 
-### Build the  source code
+### Build the source code
 Run bazel with the following command to build the demo.
 
 Build the demo app:
-bazel build --cxxopt='--std=c++11' //tensorflow/contrib/lite/java/demo/app/src/main:TfLiteCameraDemo
+
+```
+bazel build --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/demo/app/src/main:TfLiteCameraDemo
+```
 
 ### Note
 
@@ -81,6 +86,17 @@ environment (due to a Bazel bug).
 ### More about the demo
 The demo is resizing each camera image frame to (224 width * 224 height) to match the  quantized Mobilenet model being used. The resized image is converted into a ByteBuffer row by row of size 1 * 224 * 224 * 3 bytes, where 1 is the number of images in a batch 224 * 224 is the width and height of the image 3 bytes represents three colors of a pixel. This demo uses the TensorFlow Lite Java inference API for models which take a single input and provide a single output. This outputs a two-dimensional array, with the first dimension being the category index and the second dimension being the confidence of classification. The Mobilenet model has 1001 unique categories and the app sorts the probabilities of all the categories and displays the top three. The Mobilenet quantized model is bundled within the assets directory of the app.
 
+# iOS Demo App
+
+Similar to the Android demo app, there's an iOS camera app that uses exactly the same model (224 * 224 quantized Mobilenet).
+
+This demo app requires a camera so it doesn't work with simulators. It need to be executed on a real iOS device. Follow the instructions to build and run the demo app:
+
+1.   Follow the Building section [here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/ios.md#building) to build the universal iOS library for TensorFlow Lite.
+1.   Install [CocoaPods](https://cocoapods.org/) if it wasn't installed yet: `sudo gem install cocoapods`.
+1.   Run `pod install` in `tensorflow/contrib/lite/examples/ios/camera` to generate the workspace file.
+1.   Open the project by running `open tflite_camera_example.xcworkspace`, and build the app in XCode.
+
 # TensorFlow Lite Quick Start
 
 ## Step 1. Decide which GraphDef to use
@@ -105,7 +121,7 @@ The [TensorFlow for Poets](https://codelabs.developers.google.com/codelabs/tenso
 
 
 ### Train a custom model
-A developer may choose to train a custom model using Tensorflow. TensorFlow documentation has [several tutorials](https://www.tensorflow.org/tutorials/) for building and training models. If the user has written a model using TensorFlow’s Slim Framework the first step is to export this to a GraphDef file. This is necessary because Slim does not store the model structure outside the code, so to communicate with other parts of the framework it needs to be exported. Documentation for the export can be found [here](https://github.com/tensorflow/models/tree/master/research/slim#Export). The output of this step will be a .pb file for the custom model.
+A developer may choose to train a custom model using Tensorflow. TensorFlow documentation has [several tutorials](https://www.tensorflow.org/tutorials/) for building and training models. If the user has written a model using TensorFlow's Slim Framework the first step is to export this to a GraphDef file. This is necessary because Slim does not store the model structure outside the code, so to communicate with other parts of the framework it needs to be exported. Documentation for the export can be found [here](https://github.com/tensorflow/models/tree/master/research/slim#Export). The output of this step will be a .pb file for the custom model.
 
 TensorFlow Lite currently supports a subset of TensorFlow operators. Please refer to [this document](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md) for details of supported operators and their usage. This
 set will continue to expand in future releases of Tensorflow Lite.
@@ -129,7 +145,7 @@ Since we employ several formats, the following definitions may be useful:
  - TensorFlow lite model (.lite) - a serialized flatbuffer, containing TensorFlow lite operators and Tensors for the TensorFlow lite interpreter. This is most analogous to TensorFlow frozen GraphDefs.
 
 ### Freeze Graph
-To use this .pb GraphDef file within TensorFlow Lite, the application developer will need checkpoints containing trained weight parameters. The .pb contains only the structure of the graph. The process of merging the checkpoint values with the graph structure is known as “freezing” the graph.
+To use this .pb GraphDef file within TensorFlow Lite, the application developer will need checkpoints containing trained weight parameters. The .pb contains only the structure of the graph. The process of merging the checkpoint values with the graph structure is known as "freezing" the graph.
 
 The developer should know where the checkpoints folder is present or checkpoints can also be downloaded for a pre-trained model (Example: Here is a link to the [MobileNets](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md)).
 
@@ -151,12 +167,13 @@ graphviz, or [in tensorboard](https://codelabs.developers.google.com/codelabs/te
 This frozen Graphdef is now ready to be converted to flatbuffer format (.lite) for use on Android or iOS.  On Android users have the flexibility to use either the float or quantized versions of the frozen graphdef, if available, using the Tensorflow Optimizing Converter tool.
 
 Here is a sample command line to convert the frozen Graphdef to '.lite' format for  The Tensorflow Optimizing Converter supports both float and quantized models, however, different configuration parameters are needed depending on whether a FLOAT or QUANTIZED mode is being used.
+(Here is a link to the pb [file](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz)).
 
 ```
 bazel build tensorflow/contrib/lite/toco:toco
 
-bazel run --config=opt tensorflow/contrib/lite/toco:toco -- \
-  --input_file=(pwd)/mobilenet_v1_1.0_224/frozen_graph.pb \
+bazel-bin/tensorflow/contrib/lite/toco/toco -- \
+  --input_file=$(pwd)/mobilenet_v1_1.0_224/frozen_graph.pb \
   --input_format=TENSORFLOW_GRAPHDEF  --output_format=TFLITE \
   --output_file=/tmp/mobilenet_v1_1.0_224.lite --inference_type=FLOAT \
   --input_type=FLOAT --input_arrays=input \
@@ -169,7 +186,7 @@ bazel run --config=opt tensorflow/contrib/lite/toco:toco -- \
 - Setting the input_array, output_array and input_shape arguments are a bit trickier. The easiest way to find these values is to explore the graph in tensorboard .  The user should reuse the arguments that were used for specifying the output nodes for inference in the `freeze_graph`step.
 
 Note, it is also possible to use the Tensorflow Optimizing Converter through protos either from Python or from the command line see the
-documentation [here](https://github.com/tensorflow/tensorflow/tree/mastertensorflow/contrib/lite/python:toco_from_protos target) A developer can then integrate the conversion step into their model design workflow to ensure that a model will be easily convertible to a mobile inference graph. For example,
+documentation [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/toco/python/toco_from_protos.py). A developer can then integrate the conversion step into their model design workflow to ensure that a model will be easily convertible to a mobile inference graph. For example,
 
 ```
 import tensorflow as tf
@@ -184,7 +201,7 @@ with tf.Session() as sess:
 ```
 For detailed instructions on how to use the Tensorflow Optimizing Converter, please see [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md).
 
-You may refer to the [Ops compatibility guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md) for troubleshooting help. If that doesn’t help, please file an [issue](https://github.com/tensorflow/tensorflow/issues).
+You may refer to the [Ops compatibility guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md) for troubleshooting help. If that doesn't help, please file an [issue](https://github.com/tensorflow/tensorflow/issues).
 
 ## Step 3. Use the TensorFlow Lite model for inference in a mobile app
 
@@ -193,9 +210,13 @@ After completion of Step 2 the developer should have a .lite model.
 ### For Android
 Because Android apps need to be written in Java, and core TensorFlow is in C++, a JNI library is provided to interface between the two. Its interface is aimed only at inference, so it provides the ability to load a graph, set up inputs, and run the model to calculate particular outputs. The full documentation for the set of methods can be seen [here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/). The demo app is also open sourced on [github](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app).
 
-The [demo app](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app)  uses this interface, so it’s a good place to look for example usage. You can also download the prebuilt binary [here](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
+The [demo app](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app)  uses this interface, so it's a good place to look for example usage. You can also download the prebuilt binary [here](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
 
-Note that you’d need to follow instructions for installing TensorFlow on Android, setting up bazel and Android Studio outlined [here](https://www.tensorflow.org/mobile/android_build).
+Note that you'd need to follow instructions for installing TensorFlow on Android, setting up bazel and Android Studio outlined [here](https://www.tensorflow.org/mobile/android_build).
 
 ### For iOS
-Follow the documentation [here](https://github.com/TensorFlow/TensorFlow/blob/master/TensorFlow/contrib/lite/g3doc/ios.md) to get integrate a TFLite model into your app.
+Follow the documentation [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/ios.md) to get integrate a TFLite model into your app.
+
+## Core ML support
+
+Core ML is a machine learning framework used across Apple products. In addition to using Tensorflow Lite models directly in their applications, developers have the option to convert their trained Tensorflow models to the [CoreML](https://developer.apple.com/machine-learning/) format for use on Apple devices. For information on how to use the converter please refer to the [Tensorflow-CoreML converter documentation](https://github.com/tf-coreml/tf-coreml).
diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl
index e3c9cdd99beb93e356c148298dcbe6498fbe0306..d1fcdce70a34393defce0f2d0f6d5bb53f21c45e 100644
--- a/tensorflow/contrib/lite/build_def.bzl
+++ b/tensorflow/contrib/lite/build_def.bzl
@@ -89,6 +89,7 @@ def tflite_jni_linkopts():
   return tflite_jni_linkopts_unstripped() + select({
       "//tensorflow:android": [
           "-s",  # Omit symbol table.
+          "-latomic",  # Required for some uses of ISO C++11 <atomic> in x86.
       ],
       "//conditions:default": [],
   })
@@ -223,11 +224,12 @@ def gen_selected_ops(name, model):
   """
   out = name + "_registration.cc"
   tool = "//tensorflow/contrib/lite/tools:generate_op_registrations"
+  tflite_path = "//tensorflow/contrib/lite"
   native.genrule(
       name = name,
       srcs = [model],
       outs = [out],
-      cmd = ("$(location %s) --input_model=$(location %s) --output_registration=$(location %s)")
-      % (tool, model, out),
+      cmd = ("$(location %s) --input_model=$(location %s) --output_registration=$(location %s) --tflite_path=%s")
+      % (tool, model, out, tflite_path[2:]),
       tools = [tool],
   )
diff --git a/tensorflow/contrib/lite/build_ios_universal_lib.sh b/tensorflow/contrib/lite/build_ios_universal_lib.sh
index e0f2ef768bfed544ed8acd6c0e3a5823e61a1e8c..cbc96e6edd4358f6666731caa4c208c77d9c6c54 100755
--- a/tensorflow/contrib/lite/build_ios_universal_lib.sh
+++ b/tensorflow/contrib/lite/build_ios_universal_lib.sh
@@ -1,4 +1,19 @@
 #!/bin/bash -x
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
 set -e
 make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=x86_64 -j 8
 make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=i386 -j 8
diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
index 93072bf90bd8a18d9011a74c2eec95d86dbdce8a..5c6f3016b1c7d06ba35229faeff9cec32e168ef2 100644
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -104,6 +104,17 @@ typedef struct {
   TfLiteFusedActivation activation;
 } TfLiteAddParams;
 
+typedef struct {
+  // Number of spatial dimensions.
+  // For now only NHWC is supported, and the value should always be 2.
+  int num_spatial_dimensions;
+  // TODO(ahentz): We can't have dynamic data in this struct, at least not yet.
+  // For now we will fix the maximum possible number of dimensions.
+  int block_shape[2];
+  int before_crops[2];
+  int after_crops[2];
+} TfLiteBatchToSpaceNDParams;
+
 typedef struct {
   TfLiteFusedActivation activation;
 } TfLiteMulParams;
@@ -130,6 +141,14 @@ typedef struct {
   int new_width;
 } TfLiteResizeBilinearParams;
 
+typedef struct {
+  // TODO(ahentz): We can't have dynamic data in this struct, at least not yet.
+  // For now we will fix the maximum possible number of dimensions.
+  int before_padding[8];
+  int after_padding[8];
+  int num_dimensions;
+} TfLitePadParams;
+
 typedef struct {
   // TODO(ahentz): We can't have dynamic data in this struct, at least not yet.
   // For now we will fix the maximum possible number of dimensions.
@@ -157,6 +176,10 @@ typedef struct {
   TfLiteCombinerType combiner;
 } TfLiteEmbeddingLookupSparseParams;
 
+typedef struct {
+  int axis;
+} TfLiteGatherParams;
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/contrib/lite/context_test.cc b/tensorflow/contrib/lite/context_test.cc
index d0a104f43d9b9d148d80ce26b8ecf732d51ef110..20d6f69a25e9f0bb4323cf5d067b8ebd37bb3c23 100644
--- a/tensorflow/contrib/lite/context_test.cc
+++ b/tensorflow/contrib/lite/context_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/contrib/lite/context.h"
 #include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/testing/util.h"
 
 namespace tflite {
 
@@ -68,7 +69,7 @@ TEST(IntArray, TestIntArrayEqual) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/download_dependencies.sh b/tensorflow/contrib/lite/download_dependencies.sh
index 41480c20077f4b31928cf17ff02e357f5dea6851..7fce1ba3461066e6dada95246781440258d844c1 100755
--- a/tensorflow/contrib/lite/download_dependencies.sh
+++ b/tensorflow/contrib/lite/download_dependencies.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,6 +19,13 @@ set -e
 DOWNLOADS_DIR=tensorflow/contrib/lite/downloads
 BZL_FILE_PATH=tensorflow/workspace.bzl
 
+# Ensure it is being run from repo root
+if [ ! -f $BZL_FILE_PATH ]; then
+  echo "Could not find ${BZL_FILE_PATH}":
+  echo "Likely you are not running this from the root directory of the repository.";
+  exit 1;
+fi
+
 EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
 GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
 GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
@@ -56,11 +63,19 @@ download_and_extract() {
   elif [[ "${url}" == *zip ]]; then
     tempdir=$(mktemp -d)
     tempdir2=$(mktemp -d)
-    wget -P ${tempdir} ${url}
-    unzip ${tempdir}/* -d ${tempdir2}
-    # unzip has no strip components, so unzip to a temp dir, and move the files
-    # we want from the tempdir to destination.
-    echo cp `find ${tempdir2} -type f` ${dir}/
+
+    curl -L ${url} > ${tempdir}/zipped.zip
+    unzip ${tempdir}/zipped.zip -d ${tempdir2}
+
+    # If the zip file contains nested directories, extract the files from the
+    # inner directory.
+    if ls ${tempdir2}/*/* 1> /dev/null 2>&1; then
+      # unzip has no strip components, so unzip to a temp dir, and move the
+      # files we want from the tempdir to destination.
+      cp -R ${tempdir2}/*/* ${dir}/
+    else
+      cp -R ${tempdir2}/* ${dir}/
+    fi
     rm -rf ${tempdir2} ${tempdir}
   fi
 
diff --git a/tensorflow/contrib/lite/error_reporter.cc b/tensorflow/contrib/lite/error_reporter.cc
index 6ba5384a94dbf9de03fb2e4e2f63074525eafa2d..03fcd5409ceab1895cea3b9e0e4fcb5a127e6a45 100644
--- a/tensorflow/contrib/lite/error_reporter.cc
+++ b/tensorflow/contrib/lite/error_reporter.cc
@@ -39,7 +39,9 @@ int ErrorReporter::ReportError(void*, const char* format, ...) {
 }
 
 int StderrReporter::Report(const char* format, va_list args) {
-  return vfprintf(stderr, format, args);
+  const int result = vfprintf(stderr, format, args);
+  fputc('\n', stderr);
+  return result;
 }
 
 ErrorReporter* DefaultErrorReporter() {
diff --git a/tensorflow/contrib/lite/error_reporter.h b/tensorflow/contrib/lite/error_reporter.h
index 637d456ce7a754c7da34e551869e49b4efd18e3b..d5715e4f90aead79a617fe4576bfe5100d5e121a 100644
--- a/tensorflow/contrib/lite/error_reporter.h
+++ b/tensorflow/contrib/lite/error_reporter.h
@@ -25,10 +25,10 @@ namespace tflite {
 //
 // Usage:
 //  ErrorReporter foo;
-//  foo.Report("test %d\n", 5);
+//  foo.Report("test %d", 5);
 // or
 //  va_list args;
-//  foo.Report("test %d\n", args); // where args is va_list
+//  foo.Report("test %d", args); // where args is va_list
 //
 // Sublclass ErrorReporter to provide another reporting destination.
 // For example, if you have a GUI program, you might redirect to a buffer
diff --git a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm
index ea398ad14e8be4c5a0021befc7cc076549b47e23..10f31bb6f17242c9f7f70f0648ec643f99c5ac86 100644
--- a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm
+++ b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm
@@ -123,7 +123,11 @@ static void GetTopN(const uint8_t* prediction, const int prediction_size, const
   AVCaptureDevice* device = [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeVideo];
   AVCaptureDeviceInput* deviceInput =
       [AVCaptureDeviceInput deviceInputWithDevice:device error:&error];
-  assert(error == nil);
+
+  if (error != nil) {
+    NSLog(@"Failed to initialize AVCaptureDeviceInput. Note: This app doesn't work with simulator");
+    assert(NO);
+  }
 
   if ([session canAddInput:deviceInput]) [session addInput:deviceInput];
 
diff --git a/tensorflow/contrib/lite/examples/ios/camera/data/.gitignore b/tensorflow/contrib/lite/examples/ios/camera/data/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tensorflow/contrib/lite/examples/ios/simple/AppDelegate.h b/tensorflow/contrib/lite/examples/ios/simple/AppDelegate.h
index 75b1f1da384b527e8332dfba08fec87c65eff8b1..94046d9728258901091f018fd0d081651145f400 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/AppDelegate.h
+++ b/tensorflow/contrib/lite/examples/ios/simple/AppDelegate.h
@@ -14,8 +14,8 @@
 
 #import <UIKit/UIKit.h>
 
-@interface AppDelegate : UIResponder <UIApplicationDelegate>
+@interface AppDelegate : UIResponder<UIApplicationDelegate>
 
-@property (strong, nonatomic) UIWindow *window;
+@property(strong, nonatomic) UIWindow *window;
 
 @end
diff --git a/tensorflow/contrib/lite/examples/ios/simple/AppDelegate.mm b/tensorflow/contrib/lite/examples/ios/simple/AppDelegate.mm
index 1e808eb976ff3eeda4cf6f81b3c1794c6a037dc8..d1215fa0bffd978b4aaadbd8bc13b07723703c9a 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/AppDelegate.mm
+++ b/tensorflow/contrib/lite/examples/ios/simple/AppDelegate.mm
@@ -22,8 +22,7 @@
     didFinishLaunchingWithOptions:(NSDictionary *)launchOptions {
 
   UITabBarController *bar = [[UITabBarController alloc] init];
-  [bar setViewControllers:
-      @[[[RunModelViewController alloc] init]]];
+  [bar setViewControllers:@[ [[RunModelViewController alloc] init] ]];
   bar.selectedIndex = 0;
   self.window = [[UIWindow alloc] initWithFrame:[[UIScreen mainScreen] bounds]];
   self.window.rootViewController = bar;
@@ -31,14 +30,19 @@
   return YES;
 }
 
-- (void)applicationWillResignActive:(UIApplication *)application {}
+- (void)applicationWillResignActive:(UIApplication *)application {
+}
 
-- (void)applicationDidEnterBackground:(UIApplication *)application {}
+- (void)applicationDidEnterBackground:(UIApplication *)application {
+}
 
-- (void)applicationWillEnterForeground:(UIApplication *)application {}
+- (void)applicationWillEnterForeground:(UIApplication *)application {
+}
 
-- (void)applicationDidBecomeActive:(UIApplication *)application {}
+- (void)applicationDidBecomeActive:(UIApplication *)application {
+}
 
-- (void)applicationWillTerminate:(UIApplication *)application {}
+- (void)applicationWillTerminate:(UIApplication *)application {
+}
 
 @end
diff --git a/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.h b/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.h
index 4e1a83ccf5a12c609baadab7359c55ec4f464ed8..a4b358b4eb7f6ba109638405091b798d30bd1768 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.h
+++ b/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.h
@@ -18,7 +18,7 @@
 
 - (IBAction)getUrl:(id)sender;
 
-@property (weak, nonatomic) IBOutlet UITextView *urlContentTextView;
-@property (weak, nonatomic) IBOutlet UITextField *urlTextField;
+@property(weak, nonatomic) IBOutlet UITextView *urlContentTextView;
+@property(weak, nonatomic) IBOutlet UITextField *urlTextField;
 
 @end
diff --git a/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.mm b/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.mm
index 965d83010516c6db72c9e8b1c33079b3eda204de..0dafb1f61e19f46bb3b17f07c55e09f5813ed560 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.mm
+++ b/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.mm
@@ -14,10 +14,10 @@
 
 #import "RunModelViewController.h"
 
-#include <fstream>
-#include <iostream>
 #include <pthread.h>
 #include <unistd.h>
+#include <fstream>
+#include <iostream>
 #include <queue>
 #include <sstream>
 #include <string>
@@ -30,7 +30,11 @@
 #include "ios_image_load.h"
 
 #define LOG(x) std::cerr
-#define CHECK(x) if (!(x)) { LOG(ERROR) << #x << "failed"; exit(1); }
+#define CHECK(x)                  \
+  if (!(x)) {                     \
+    LOG(ERROR) << #x << "failed"; \
+    exit(1);                      \
+  }
 
 NSString* RunInferenceOnImage();
 
@@ -49,15 +53,12 @@ NSString* RunInferenceOnImage();
 
 // Returns the top N confidence values over threshold in the provided vector,
 // sorted by confidence in descending order.
-static void GetTopN(
-    const float* prediction,
-    const int prediction_size,
-    const int num_results, const float threshold,
-    std::vector<std::pair<float, int> >* top_results) {
+static void GetTopN(const float* prediction, const int prediction_size, const int num_results,
+                    const float threshold, std::vector<std::pair<float, int> >* top_results) {
   // Will contain top N results in ascending order.
-  std::priority_queue<std::pair<float, int>,
-      std::vector<std::pair<float, int> >,
-      std::greater<std::pair<float, int> > > top_result_pq;
+  std::priority_queue<std::pair<float, int>, std::vector<std::pair<float, int> >,
+                      std::greater<std::pair<float, int> > >
+      top_result_pq;
 
   const long count = prediction_size;
   for (int i = 0; i < count; ++i) {
@@ -88,8 +89,8 @@ static void GetTopN(
 NSString* FilePathForResourceName(NSString* name, NSString* extension) {
   NSString* file_path = [[NSBundle mainBundle] pathForResource:name ofType:extension];
   if (file_path == NULL) {
-    LOG(FATAL) << "Couldn't find '" << [name UTF8String] << "."
-	       << [extension UTF8String] << "' in bundle.";
+    LOG(FATAL) << "Couldn't find '" << [name UTF8String] << "." << [extension UTF8String]
+               << "' in bundle.";
   }
   return file_path;
 }
@@ -102,7 +103,8 @@ NSString* RunInferenceOnImage() {
 
   NSString* graph_path = FilePathForResourceName(@"mobilenet_v1_1.0_224", @"tflite");
 
-  std::unique_ptr<tflite::FlatBufferModel> model(tflite::FlatBufferModel::BuildFromFile([graph_path UTF8String]));
+  std::unique_ptr<tflite::FlatBufferModel> model(
+      tflite::FlatBufferModel::BuildFromFile([graph_path UTF8String]));
   if (!model) {
     LOG(FATAL) << "Failed to mmap model " << graph;
   }
@@ -143,7 +145,7 @@ NSString* RunInferenceOnImage() {
   std::ifstream t;
   t.open([labels_path UTF8String]);
   std::string line;
-  while(t){
+  while (t) {
     std::getline(t, line);
     label_strings.push_back(line);
   }
@@ -154,7 +156,8 @@ NSString* RunInferenceOnImage() {
   int image_width;
   int image_height;
   int image_channels;
-  std::vector<uint8_t> image_data = LoadImageFromFile([image_path UTF8String], &image_width, &image_height, &image_channels);
+  std::vector<uint8_t> image_data =
+      LoadImageFromFile([image_path UTF8String], &image_width, &image_height, &image_channels);
   const int wanted_width = 224;
   const int wanted_height = 224;
   const int wanted_channels = 3;
@@ -212,8 +215,7 @@ NSString* RunInferenceOnImage() {
 
   std::string predictions = ss.str();
   NSString* result = @"";
-  result = [NSString stringWithFormat: @"%@ - %s", result,
-            predictions.c_str()];
-  
+  result = [NSString stringWithFormat:@"%@ - %s", result, predictions.c_str()];
+
   return result;
 }
diff --git a/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.h b/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.h
index 7287d0d63d5b4c0b9c9a528578b6341cdb9c9954..98934ce41d349b33d4fc010a39a956e52f3d5721 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.h
+++ b/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.h
@@ -17,9 +17,7 @@
 
 #include <vector>
 
-std::vector<uint8_t> LoadImageFromFile(const char* file_name,
-						 int* out_width,
-						 int* out_height,
-						 int* out_channels);
+std::vector<uint8_t> LoadImageFromFile(const char* file_name, int* out_width,
+                                       int* out_height, int* out_channels);
 
 #endif  // TENSORFLOW_EXAMPLES_IOS_IOS_IMAGE_LOAD_H_
diff --git a/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.mm b/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.mm
index 789522d2a9900b136f91f77c4ada682f1a316848..cb0fe1a7650c572d3745066431f2759daa94ffc9 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.mm
+++ b/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.mm
@@ -14,17 +14,16 @@
 
 #include "ios_image_load.h"
 
-#include <stdlib.h>
-#include <string.h>
 #include <assert.h>
 #include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
 #import <CoreImage/CoreImage.h>
 #import <ImageIO/ImageIO.h>
 
-std::vector<uint8_t> LoadImageFromFile(const char* file_name,
-				     int* out_width, int* out_height,
-				     int* out_channels) {
+std::vector<uint8_t> LoadImageFromFile(const char* file_name, int* out_width, int* out_height,
+                                       int* out_channels) {
   FILE* file_handle = fopen(file_name, "rb");
   fseek(file_handle, 0, SEEK_END);
   const size_t bytes_in_file = ftell(file_handle);
@@ -32,11 +31,10 @@ std::vector<uint8_t> LoadImageFromFile(const char* file_name,
   std::vector<uint8_t> file_data(bytes_in_file);
   fread(file_data.data(), 1, bytes_in_file, file_handle);
   fclose(file_handle);
-  CFDataRef file_data_ref = CFDataCreateWithBytesNoCopy(NULL, file_data.data(),
-						      bytes_in_file,
-						      kCFAllocatorNull);
-  CGDataProviderRef image_provider =
-    CGDataProviderCreateWithCFData(file_data_ref);
+
+  CFDataRef file_data_ref =
+      CFDataCreateWithBytesNoCopy(NULL, file_data.data(), bytes_in_file, kCFAllocatorNull);
+  CGDataProviderRef image_provider = CGDataProviderCreateWithCFData(file_data_ref);
 
   const char* suffix = strrchr(file_name, '.');
   if (!suffix || suffix == file_name) {
@@ -44,12 +42,10 @@ std::vector<uint8_t> LoadImageFromFile(const char* file_name,
   }
   CGImageRef image;
   if (strcasecmp(suffix, ".png") == 0) {
-    image = CGImageCreateWithPNGDataProvider(image_provider, NULL, true,
-					     kCGRenderingIntentDefault);
-  } else if ((strcasecmp(suffix, ".jpg") == 0) ||
-    (strcasecmp(suffix, ".jpeg") == 0)) {
-    image = CGImageCreateWithJPEGDataProvider(image_provider, NULL, true,
-					      kCGRenderingIntentDefault);
+    image = CGImageCreateWithPNGDataProvider(image_provider, NULL, true, kCGRenderingIntentDefault);
+  } else if ((strcasecmp(suffix, ".jpg") == 0) || (strcasecmp(suffix, ".jpeg") == 0)) {
+    image =
+        CGImageCreateWithJPEGDataProvider(image_provider, NULL, true, kCGRenderingIntentDefault);
   } else {
     CFRelease(image_provider);
     CFRelease(file_data_ref);
@@ -68,9 +64,10 @@ std::vector<uint8_t> LoadImageFromFile(const char* file_name,
   const int bytes_in_image = (bytes_per_row * height);
   std::vector<uint8_t> result(bytes_in_image);
   const int bits_per_component = 8;
-  CGContextRef context = CGBitmapContextCreate(result.data(), width, height,
-    bits_per_component, bytes_per_row, color_space,
-    kCGImageAlphaPremultipliedLast | kCGBitmapByteOrder32Big);
+
+  CGContextRef context =
+      CGBitmapContextCreate(result.data(), width, height, bits_per_component, bytes_per_row,
+                            color_space, kCGImageAlphaPremultipliedLast | kCGBitmapByteOrder32Big);
   CGColorSpaceRelease(color_space);
   CGContextDrawImage(context, CGRectMake(0, 0, width, height), image);
   CGContextRelease(context);
diff --git a/tensorflow/contrib/lite/examples/ios/simple/main.mm b/tensorflow/contrib/lite/examples/ios/simple/main.mm
index d70550a730720e5d6799a186c1beb3cfa04b0b9d..05cb55ddd7a230593863e64b351f6aac31a1b4d7 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/main.mm
+++ b/tensorflow/contrib/lite/examples/ios/simple/main.mm
@@ -14,7 +14,7 @@
 
 #import <UIKit/UIKit.h>
 
-int main(int argc, char * argv[]) {
+int main(int argc, char *argv[]) {
   @autoreleasepool {
     NSString *delegateClassName = @"AppDelegate";
     return UIApplicationMain(argc, argv, nil, delegateClassName);
diff --git a/tensorflow/contrib/lite/g3doc/apis.md b/tensorflow/contrib/lite/g3doc/apis.md
index 662ae2032c990b649fc6d34dcf915d58796c0665..fe208e47d1ac10995881e55c8596ae14ff4242df 100644
--- a/tensorflow/contrib/lite/g3doc/apis.md
+++ b/tensorflow/contrib/lite/g3doc/apis.md
@@ -52,7 +52,7 @@ typedef enum {
 Failures can be easily verified with:
 ```c++
 if (status != kTfLiteOk) {
-  // ... error handling here ... 
+  // ... error handling here ...
 }
 ```
 
diff --git a/tensorflow/contrib/lite/g3doc/ios.md b/tensorflow/contrib/lite/g3doc/ios.md
index ce8b37fbf9b0db5dee60784e85a3cbf0326fddb6..a359b8d4b481dbc15cc86db14eabda5433722b8b 100644
--- a/tensorflow/contrib/lite/g3doc/ios.md
+++ b/tensorflow/contrib/lite/g3doc/ios.md
@@ -45,6 +45,10 @@ into a universal file containing armv7, armv7s, arm64, i386, and x86_64
 architectures. The resulting library is in
 `tensorflow/contrib/lite/gen/lib/libtensorflow-lite.a`.
 
+If you get an error such as `no such file or directory: 'x86_64'` when running 
+`build_ios_universal_lib.sh`: open Xcode > Preferences > Locations, and ensure 
+a value is selected in the "Command Line Tools" dropdown.
+
 ## Using in your own application
 
 You'll need to update various settings in your app to link against TensorFlow
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index 8bf60e91f769338aa0751761c2dc0df417ee0943..65c61e44bee48535f884a3afaddc691972f5e04b 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/error_reporter.h"
 #include "tensorflow/contrib/lite/simple_memory_arena.h"
-#include "tensorflow/core/platform/platform.h"
 
 namespace tflite {
 
@@ -232,7 +231,6 @@ class Interpreter {
   // If you know that your sizes are not changing, you need not call this.
 
   // Returns status of success or failure.
-  // TODO(aselle): Madde
   TfLiteStatus AllocateTensors();
 
   // Invoke the interpreter (run the whole graph in dependency order).
diff --git a/tensorflow/contrib/lite/java/demo/README.md b/tensorflow/contrib/lite/java/demo/README.md
index 71b633c5774d93684f651821adad13c378a8243c..2e818f728ef208d30b0eeb27ffd7e3fa0c7c1a2d 100644
--- a/tensorflow/contrib/lite/java/demo/README.md
+++ b/tensorflow/contrib/lite/java/demo/README.md
@@ -8,7 +8,12 @@
      It's easiest with Android Studio.
 
       - You'll need at least SDK version 23.
+      - Make sure to install the latest version of Bazel. Some distributions
+        ship with Bazel 0.5.4, which is too old.
       - Bazel requires Android Build Tools `26.0.1` or higher.
+      - **Bazel is incompatible with NDK revisions 15 and above,** with revision
+        16 being a compile-breaking change. [Download an older version manually
+        instead of using the SDK Manager.](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#install-bazel-and-android-prerequisites)
       - You also need to install the Android Support Repository, available
         through Android Studio under `Android SDK Manager -> SDK Tools ->
         Android Support Repository`.
@@ -16,10 +21,15 @@
   2. [Edit your `WORKSPACE`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#edit-workspace)
      to add SDK and NDK targets.
 
+     NOTE: As long as you have the SDK and NDK installed, the `./configure`
+     script will create these rules for you. Answer "Yes" when the script asks
+     to automatically configure the `./WORKSPACE`.
+
       - Make sure the `api_level` in `WORKSPACE` is set to an SDK version that
         you have installed.
       - By default, Android Studio will install the SDK to `~/Android/Sdk` and
-        the NDK to `~/Android/Sdk/ndk-bundle`.
+        the NDK to `~/Android/Sdk/ndk-bundle` (but the NDK should be a manual
+        download until Bazel supports NDK 16. See bullet points under (1)).
 
 2. Build the app with Bazel. The demo needs C++11:
 
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/base-strings.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/base-strings.xml
index ab7d3fd496376ae702ca75a8c496863b1ff93a90..0a71dbd0e8010f5e3a176de1f7e8321331289f7c 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/base-strings.xml
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/base-strings.xml
@@ -19,12 +19,12 @@
     <string name="app_name">TfLiteCameraDemo</string>
     <string name="intro_message">
         <![CDATA[
-        
-            
+
+
             This sample demonstrates the basic use of TfLite API. Check the source code to see how
             you can use TfLite for efficient, on-device inference with trained TensorFlow models.
-            
-        
+
+
         ]]>
     </string>
 </resources>
diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index bbbfa3e7415bfd7a34dfc7d764da55cac22e7d42..cc02cddb3d6cce3787fd15ee1734a490389fb9b3 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -32,6 +32,7 @@ cc_library(
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:schema_fbs_version",
         "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/testing:util",
         "//tensorflow/core:lib",
         "@com_google_googletest//:gtest",
     ],
@@ -76,12 +77,14 @@ cc_library(
         "activations.cc",
         "add.cc",
         "basic_rnn.cc",
+        "batch_to_space_nd.cc",
         "concatenation.cc",
         "conv.cc",
         "depthwise_conv.cc",
         "embedding_lookup.cc",
         "embedding_lookup_sparse.cc",
         "fully_connected.cc",
+        "gather.cc",
         "hashtable_lookup.cc",
         "kernel_util.cc",
         "l2norm.cc",
@@ -89,6 +92,7 @@ cc_library(
         "lsh_projection.cc",
         "lstm.cc",
         "mul.cc",
+        "pad.cc",
         "pooling.cc",
         "register.cc",
         "reshape.cc",
@@ -96,6 +100,7 @@ cc_library(
         "skip_gram.cc",
         "space_to_depth.cc",
         "svdf.cc",
+        "unidirectional_sequence_rnn.cc",
     ],
     hdrs = [
         "kernel_util.h",
@@ -152,6 +157,18 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "batch_to_space_nd_test",
+    size = "small",
+    srcs = ["batch_to_space_nd_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tf_cc_test(
     name = "concatenation_test",
     size = "small",
@@ -200,6 +217,18 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "unidirectional_sequence_rnn_test",
+    size = "small",
+    srcs = ["unidirectional_sequence_rnn_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tf_cc_test(
     name = "l2norm_test",
     size = "small",
@@ -224,6 +253,18 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "pad_test",
+    size = "small",
+    srcs = ["pad_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tf_cc_test(
     name = "reshape_test",
     size = "small",
@@ -236,6 +277,19 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "gather_test",
+    size = "small",
+    srcs = ["gather_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:builtin_op_data",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tf_cc_test(
     name = "resize_bilinear_test",
     size = "small",
diff --git a/tensorflow/contrib/lite/kernels/activations_test.cc b/tensorflow/contrib/lite/kernels/activations_test.cc
index f10aee70170d4a94ed54376fa410b22a60f109af..33ca56e745c043efd12b851af14f273fb273d577 100644
--- a/tensorflow/contrib/lite/kernels/activations_test.cc
+++ b/tensorflow/contrib/lite/kernels/activations_test.cc
@@ -317,7 +317,7 @@ TEST(QuantizedActivationsOpTest, Softmax2D) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/add_test.cc b/tensorflow/contrib/lite/kernels/add_test.cc
index 8e12a837c4954832ff37a6d1ab377bee9e8d5763..ddf45bb576755d57d50c9e6e01bf50f15612c56d 100644
--- a/tensorflow/contrib/lite/kernels/add_test.cc
+++ b/tensorflow/contrib/lite/kernels/add_test.cc
@@ -164,8 +164,7 @@ TEST(QuantizedAddOpModel, QuantizedVariousInputShapes) {
 }  // namespace
 }  // namespace tflite
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
-  tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/basic_rnn_test.cc b/tensorflow/contrib/lite/kernels/basic_rnn_test.cc
index dfa75655bcfe7762c6cc4c9a98a71d529028c03a..5ecccb985e91238f1183c8f94a2b5f468758ce55 100644
--- a/tensorflow/contrib/lite/kernels/basic_rnn_test.cc
+++ b/tensorflow/contrib/lite/kernels/basic_rnn_test.cc
@@ -261,7 +261,7 @@ TEST(FullyConnectedOpTest, BlackBoxTest) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0eed680fdcc2afc4bc72be55a5e7722310fa4538
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
@@ -0,0 +1,161 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include <vector>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace batch_to_space_nd {
+
+// This file has two implementations of BatchToSpaceND.
+enum KernelType {
+  kReference,
+  kGenericOptimized,
+};
+
+struct BatchToSpaceNDContext {
+  BatchToSpaceNDContext(TfLiteContext* context, TfLiteNode* node) {
+    params = reinterpret_cast<TfLiteBatchToSpaceNDParams*>(node->builtin_data);
+    input = GetInput(context, node, 0);
+    output = GetOutput(context, node, 0);
+  }
+  TfLiteBatchToSpaceNDParams* params;
+  TfLiteTensor* input;
+  TfLiteTensor* output;
+};
+
+// Currently, only 4D NHWC input/output op_context are supported.
+// The 4D array need to have exactly 2 spatial dimensions.
+// TODO(ycling): Support arbitrary dimension in BatchToSpaceND.
+const int kInputDimensionNum = 4;
+const int kOutputDimensionNum = 4;
+const int kSpatialDimensionNum = 2;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  // The 2nd tensor (block_shape) and the 3rd tensor (crops) are ignored now.
+  TF_LITE_ENSURE(context, NumInputs(node) >= 1 && NumInputs(node) <= 3);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  BatchToSpaceNDContext op_context(context, node);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.input),
+                    kInputDimensionNum);
+  TF_LITE_ENSURE_EQ(context, op_context.params->num_spatial_dimensions,
+                    kSpatialDimensionNum);
+  TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
+
+  const TfLiteIntArray* input_size = op_context.input->dims;
+  const int* block_shape = op_context.params->block_shape;
+
+  // Number of batch must be multiple of (block_shape[0] * block_shape[1]).
+  TF_LITE_ENSURE_EQ(context,
+                    input_size->data[0] % (block_shape[0] * block_shape[1]), 0);
+
+  const int output_batch_size =
+      input_size->data[0] / (block_shape[0] * block_shape[1]);
+  const int output_height = input_size->data[1] * block_shape[0];
+  const int output_width = input_size->data[2] * block_shape[1];
+  const int output_channel_size = input_size->data[3];
+
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(kOutputDimensionNum);
+  output_size->data[0] = output_batch_size;
+  output_size->data[1] = output_height;
+  output_size->data[2] = output_width;
+  output_size->data[3] = output_channel_size;
+
+  return context->ResizeTensor(context, op_context.output, output_size);
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  BatchToSpaceNDContext op_context(context, node);
+
+  int block_shape_dims_array[1] = {kSpatialDimensionNum};
+  Dims<4> block_shape_dims = GetTensorDims(block_shape_dims_array, 1);
+
+#define TF_LITE_BATCH_TO_SPACE_ND(type, scalar)                          \
+  type::BatchToSpaceND(GetTensorData<scalar>(op_context.input),          \
+                       GetTensorDims(op_context.input),                  \
+                       op_context.params->block_shape, block_shape_dims, \
+                       GetTensorData<scalar>(op_context.output),         \
+                       GetTensorDims(op_context.output))
+  switch (op_context.input->type) {  // Already know in/out types are same.
+    case kTfLiteFloat32:
+      if (kernel_type == kReference) {
+        TF_LITE_BATCH_TO_SPACE_ND(reference_ops, float);
+      } else {
+        TF_LITE_BATCH_TO_SPACE_ND(optimized_ops, float);
+      }
+      break;
+    case kTfLiteUInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_BATCH_TO_SPACE_ND(reference_ops, uint8_t);
+      } else {
+        TF_LITE_BATCH_TO_SPACE_ND(optimized_ops, uint8_t);
+      }
+      break;
+    case kTfLiteInt32:
+      if (kernel_type == kReference) {
+        TF_LITE_BATCH_TO_SPACE_ND(reference_ops, int32_t);
+      } else {
+        TF_LITE_BATCH_TO_SPACE_ND(optimized_ops, int32_t);
+      }
+      break;
+    case kTfLiteInt64:
+      if (kernel_type == kReference) {
+        TF_LITE_BATCH_TO_SPACE_ND(reference_ops, int64_t);
+      } else {
+        TF_LITE_BATCH_TO_SPACE_ND(optimized_ops, int64_t);
+      }
+      break;
+    default:
+      context->ReportError(context,
+                           "Type is currently not supported by BatchToSpace.");
+      return kTfLiteError;
+  }
+#undef TF_LITE_BATCH_TO_SPACE_ND
+  return kTfLiteOk;
+}
+
+}  // namespace batch_to_space_nd
+
+TfLiteRegistration* Register_BATCH_TO_SPACE_ND_REF() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, batch_to_space_nd::Prepare,
+      batch_to_space_nd::Eval<batch_to_space_nd::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_BATCH_TO_SPACE_ND_GENERIC_OPT() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, batch_to_space_nd::Prepare,
+      batch_to_space_nd::Eval<batch_to_space_nd::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_BATCH_TO_SPACE_ND() {
+  return Register_BATCH_TO_SPACE_ND_GENERIC_OPT();
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc b/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3ec4efbebcef9d55d0042d93007018c9f6ee3b58
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc
@@ -0,0 +1,78 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BatchToSpaceNDOpModel : public SingleOpModel {
+ public:
+  BatchToSpaceNDOpModel(std::initializer_list<int> input_shape,
+                        std::initializer_list<int> block_shape,
+                        std::initializer_list<int> before_crops,
+                        std::initializer_list<int> after_crops) {
+    input_ = AddInput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(BuiltinOperator_BATCH_TO_SPACE_ND,
+                 BuiltinOptions_BatchToSpaceNDOptions,
+                 CreateBatchToSpaceNDOptions(
+                     builder_, builder_.CreateVector<int>(block_shape),
+                     builder_.CreateVector<int>(before_crops),
+                     builder_.CreateVector<int>(after_crops))
+                     .Union());
+    BuildInterpreter({input_shape});
+  }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor<float>(input_, data);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int output_;
+};
+
+TEST(BatchToSpaceNDOpTest, SimpleTest) {
+  BatchToSpaceNDOpModel m({4, 2, 2, 1}, {2, 2}, {0, 0}, {0, 0});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 5, 2, 6, 9, 13, 10, 14, 3, 7,
+                                               4, 8, 11, 15, 12, 16}));
+}
+
+TEST(BatchToSpaceNDOpTest, InvalidShapeTest) {
+  EXPECT_DEATH(BatchToSpaceNDOpModel({3, 2, 2, 1}, {2, 2}, {0, 0}, {0, 0}),
+               "Cannot allocate tensors");
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/concatenation_test.cc b/tensorflow/contrib/lite/kernels/concatenation_test.cc
index 94e5b2acdcabeedb4652baa1a008b22bf6bc8433..499856a93cbbfbf9aa1a326912e52ce32bbbdf83 100644
--- a/tensorflow/contrib/lite/kernels/concatenation_test.cc
+++ b/tensorflow/contrib/lite/kernels/concatenation_test.cc
@@ -156,7 +156,7 @@ TEST(ConcatenationOpTest, FourInputsQuantized) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/conv_test.cc b/tensorflow/contrib/lite/kernels/conv_test.cc
index 18d7a31d594efb6a05fe7292a0194ea17599a65b..1d0a81c3135625c07a3566f5f9a8e5401f0d4db7 100644
--- a/tensorflow/contrib/lite/kernels/conv_test.cc
+++ b/tensorflow/contrib/lite/kernels/conv_test.cc
@@ -434,7 +434,7 @@ TEST(ConvolutionOpTest, SimpleTestQuantizedWithAnisotropicStrides) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/depthwise_conv_test.cc b/tensorflow/contrib/lite/kernels/depthwise_conv_test.cc
index 39227b2811e2be719a0be77f89793bcf9366d513..1439c8bce14ad127ed68dc54991aed8b8bb39383 100644
--- a/tensorflow/contrib/lite/kernels/depthwise_conv_test.cc
+++ b/tensorflow/contrib/lite/kernels/depthwise_conv_test.cc
@@ -180,7 +180,7 @@ TEST(QuantizedDepthwiseConvolutionOpTest, SimpleTestQuantized) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc b/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc
index 69d9c5cc7dec13a65f1c5050f2f1c56812ad5aa1..dcdc5fffad9ceac1a9d23a4e91637a9ff92a8dda 100644
--- a/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc
+++ b/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc
@@ -158,9 +158,7 @@ TEST(EmbeddingLookupOpTest, Indices3DTest) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-#ifdef OS_LINUX
-  tflite::LogToStderr();
-#endif
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup_test.cc b/tensorflow/contrib/lite/kernels/embedding_lookup_test.cc
index 8c030b06772ac0c6af34a45897f03ebc4637d4de..9b501878f196216a61568bfa36e6615f4dd07478 100644
--- a/tensorflow/contrib/lite/kernels/embedding_lookup_test.cc
+++ b/tensorflow/contrib/lite/kernels/embedding_lookup_test.cc
@@ -88,7 +88,7 @@ TEST(EmbeddingLookupOpTest, SimpleTest) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/fully_connected_test.cc b/tensorflow/contrib/lite/kernels/fully_connected_test.cc
index 112e3f1ba01a428023eea5ee8410fb76c1d67de6..a0f766c4f4580d7679275c0b63aa200410fcb5ad 100644
--- a/tensorflow/contrib/lite/kernels/fully_connected_test.cc
+++ b/tensorflow/contrib/lite/kernels/fully_connected_test.cc
@@ -370,8 +370,7 @@ TEST(FullyConnectedOpTest, BlackBoxTest) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
-  tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/gather.cc b/tensorflow/contrib/lite/kernels/gather.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f8df797daf7338e33b16508c21fc61cd9836db1e
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/gather.cc
@@ -0,0 +1,130 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/contrib/lite/string_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace gather {
+constexpr int kInputTensor = 0;
+constexpr int kInputPositions = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const auto* params =
+      reinterpret_cast<const TfLiteGatherParams*>(node->builtin_data);
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* positions = GetInput(context, node, kInputPositions);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  // Only INT32 positions are supported.
+  TF_LITE_ENSURE_EQ(context, positions->type, kTfLiteInt32);
+  // Check that input and output types match.
+  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+  // TODO(mgubin): only 1D positions are currently supported.
+  TF_LITE_ENSURE_EQ(context, NumDimensions(positions), 1);
+  // TODO(mgubin): Only default axis == 0 is supported.
+  // Check conditions for different types.
+  switch (input->type) {
+    case kTfLiteFloat32:
+    case kTfLiteUInt8:
+    case kTfLiteInt32: {
+      // Fully supported by reference_ops::Gather.
+    } break;
+
+    case kTfLiteString: {
+      // Only 1D input is supported.
+      TF_LITE_ENSURE_EQ(context, NumDimensions(input), 1);
+    } break;
+    default:
+      context->ReportError(context,
+                           "Only float32 and string types are supported");
+      return kTfLiteError;
+  }
+  const int num_dimensions =
+      NumDimensions(input) + NumDimensions(positions) - 1;
+  TF_LITE_ENSURE(context, params->axis < num_dimensions);
+  TfLiteIntArray* output_shape = TfLiteIntArrayCreate(num_dimensions);
+  int output_index = 0;
+  for (int i = 0; i < params->axis; ++i) {
+    output_shape->data[output_index++] = input->dims->data[i];
+  }
+  for (int i = 0; i < positions->dims->size; ++i) {
+    output_shape->data[output_index++] = positions->dims->data[i];
+  }
+  for (int i = params->axis + 1; i < input->dims->size; ++i) {
+    output_shape->data[output_index++] = input->dims->data[i];
+  }
+  return context->ResizeTensor(context, output, output_shape);
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* positions = GetInput(context, node, kInputPositions);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  const int input_rank = NumDimensions(input);
+#define TF_LITE_GATHER(data_type, index_type)                            \
+  optimized_ops::Gather(                                                 \
+      GetTensorData<data_type>(input), GetTensorDims(input), input_rank, \
+      GetTensorData<index_type>(positions), GetTensorDims(positions),    \
+      GetTensorData<data_type>(output), GetTensorDims(output));
+  switch (input->type) {
+    case kTfLiteFloat32:
+      TF_LITE_GATHER(float, int32_t);
+      break;
+    case kTfLiteUInt8:
+      TF_LITE_GATHER(uint8_t, int32_t);
+      break;
+    case kTfLiteInt32:
+      TF_LITE_GATHER(int32_t, int32_t);
+      break;
+    case kTfLiteString: {
+      DynamicBuffer buffer;
+      const int32* indexes = positions->data.i32;
+      const int num_strings = GetStringCount(input);
+      for (int i = 0; i < positions->dims->data[0]; ++i) {
+        const int pos = indexes[i];
+        TF_LITE_ENSURE(context, pos < num_strings);
+        const auto string_ref = GetString(input, pos);
+        buffer.AddString(string_ref.str, string_ref.len);
+      }
+      buffer.WriteToTensor(output);
+    } break;
+    default:
+      return kTfLiteError;
+  }
+#undef TF_LITE_GATHER
+  return kTfLiteOk;
+}
+}  // namespace gather
+
+TfLiteRegistration* Register_GATHER() {
+  static TfLiteRegistration r = {nullptr, nullptr, gather::Prepare,
+                                 gather::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/gather_test.cc b/tensorflow/contrib/lite/kernels/gather_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6343d3b4ef20ae3e030396ec1b6adbcf83a3e45f
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/gather_test.cc
@@ -0,0 +1,121 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class GatherOpModel : public SingleOpModel {
+ public:
+  GatherOpModel(std::initializer_list<int> input_shape, TensorType input_type,
+                std::initializer_list<int> positions_shape) {
+    input_ = AddInput(input_type);
+    positions_ = AddInput(TensorType_INT32);
+    output_ = AddOutput(input_type);
+    SetBuiltinOp(BuiltinOperator_GATHER, BuiltinOptions_GatherOptions,
+                 CreateGatherOptions(builder_, 0).Union());
+    BuildInterpreter({input_shape, positions_shape});
+  }
+
+  void SetInputFloat(std::initializer_list<float> data) {
+    PopulateTensor<float>(input_, data);
+  }
+
+  void SetInputUint8(std::initializer_list<uint8_t> data) {
+    PopulateTensor<uint8_t>(input_, data);
+  }
+
+  void SetInput(std::initializer_list<string> data) {
+    PopulateStringTensor(input_, data);
+  }
+
+  void SetPositions(std::initializer_list<int32> data) {
+    PopulateTensor<int32>(positions_, data);
+  }
+
+  std::vector<float> GetOutputFloat() { return ExtractVector<float>(output_); }
+  std::vector<uint8_t> GetOutputUint8() {
+    return ExtractVector<uint8_t>(output_);
+  }
+  std::vector<string> GetOutputString() {
+    return ExtractVector<string>(output_);
+  }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int input_;
+  int positions_;
+  int output_;
+};
+
+TEST(GatherOpTest, Shuffle) {
+  GatherOpModel m({2, 2}, TensorType_FLOAT32, {2});
+  m.SetInputFloat({-2.0, 0.2, 0.7, 0.8});
+  m.SetPositions({1, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputFloat(),
+              ElementsAreArray(ArrayFloatNear({0.7, 0.8, -2, 0.2})));
+}
+
+TEST(FloatGatherOpTest, Duplicate) {
+  GatherOpModel m({1, 2, 2}, TensorType_FLOAT32, {2});
+  m.SetInputFloat({-2.0, 0.2, 0.7, 0.8});
+  m.SetPositions({0, 0});
+  m.Invoke();
+  EXPECT_THAT(
+      m.GetOutputFloat(),
+      ElementsAreArray(ArrayFloatNear({-2, 0.2, 0.7, 0.8, -2, 0.2, 0.7, 0.8})));
+}
+
+TEST(FloatGatherOpTest, Slice) {
+  GatherOpModel m({4, 1}, TensorType_FLOAT32, {2});
+  m.SetInputFloat({-2.0, 0.2, 0.7, 0.8});
+  m.SetPositions({1, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray(ArrayFloatNear({0.2, 0.8})));
+}
+
+TEST(Uint8tGatherOpTest, Shuffle) {
+  GatherOpModel m({2, 2}, TensorType_UINT8, {2});
+  m.SetInputUint8({133, 134, 14, 15});
+  m.SetPositions({1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutputUint8(), ElementsAreArray({14, 15, 133, 134}));
+}
+
+TEST(GatherOpTest, SimpleString) {
+  GatherOpModel m({3}, TensorType_STRING, {2});
+  m.SetInput({"A", "B", "C"});
+  m.SetPositions({0, 2});
+  m.Invoke();
+  ASSERT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutputString(), ElementsAreArray({"A", "C"}));
+}
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc b/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc
index 916a23225e2ad3c5645a7809169677a7a8880535..cb6038f9009a3865661e7b4f075c3033166d0f91 100644
--- a/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc
+++ b/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc
@@ -170,7 +170,7 @@ TEST(HashtableLookupOpTest, TestString) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
index 288534099b9e090ce0c223a401b4152ca6ffb61f..a3ecb2ebf6a889729954d1e447997c510e8ff6d4 100644
--- a/tensorflow/contrib/lite/kernels/internal/BUILD
+++ b/tensorflow/contrib/lite/kernels/internal/BUILD
@@ -124,6 +124,13 @@ config_setting(
     },
 )
 
+config_setting(
+    name = "freebsd",
+    values = {
+        "cpu": "freebsd",
+    },
+)
+
 cc_library(
     name = "optimized_base",
     srcs = [],
@@ -147,6 +154,7 @@ cc_library(
         ":x86": tflite_deps_intel,
         ":x86_64": tflite_deps_intel,
         ":darwin": tflite_deps_intel,
+        ":freebsd": tflite_deps_intel,
         "//conditions:default": [],
     }),
 )
@@ -224,6 +232,7 @@ cc_library(
         ":x86": tflite_deps_intel,
         ":x86_64": tflite_deps_intel,
         ":darwin": tflite_deps_intel,
+        ":freebsd": tflite_deps_intel,
         "//conditions:default": [],
     }),
 )
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
index 974611f52ac74cec275f978c5af5bd561688db78..da34c8aef94b1c69e661bd33fcb518e73034c4bd 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
@@ -311,6 +311,9 @@ struct FloatDepthwiseConvKernel<true, 0, 8> {
   }
 };
 
+// Note this implementation is very slow for input_depths < 8
+// (e.g. comparable to reference implementation) see, specializations for
+// input_depth=3 below.
 template <>
 struct FloatDepthwiseConvKernel<true, 0, 2> {
   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
@@ -417,6 +420,74 @@ struct FloatDepthwiseConvKernel<true, 0, 2> {
   }
 };
 
+template <>
+struct FloatDepthwiseConvKernel<true, 3, 2> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Load the filters
+    float32x2_t filter[3];
+    for (int i = 0; i < 3; i++) {
+      filter[i] = vld1_f32(filter_ptr + 2 * i);
+    }
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      const float32x2_t input01 = vld1_f32(input_ptr);
+      const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
+      // Load the accumulators from acc_buffer
+      float32x2_t acc[3];
+      for (int i = 0; i < 3; i++) {
+        acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
+      }
+      // Multiply-accumulate for each input channel there 2 outputs
+      acc[0] = vmla_lane_f32(acc[0], filter[0], input01, 0);
+      acc[1] = vmla_lane_f32(acc[1], filter[1], input01, 1);
+      acc[2] = vmla_lane_f32(acc[2], filter[2], input2, 0);
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 3; i++) {
+        vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
+      }
+      acc_buffer_ptr += 6;
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 3, 4> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Load the filters
+    float32x4_t filter[3];
+    for (int i = 0; i < 3; i++) {
+      filter[i] = vld1q_f32(filter_ptr + 4 * i);
+    }
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      // NOTE: we only want 3 values, so we read it as two ops where
+      // the second op just duplicates the lane
+      const float32x2_t input01 = vld1_f32(input_ptr);
+      const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
+      // Load the accumulators from acc_buffer
+      float32x4_t acc[3];
+      for (int i = 0; i < 3; i++) {
+        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate all outputs.
+      acc[0] = vmlaq_lane_f32(acc[0], filter[0], input01, 0);
+      acc[1] = vmlaq_lane_f32(acc[1], filter[1], input01, 1);
+      acc[2] = vmlaq_lane_f32(acc[2], filter[2], input2, 0);
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 3; i++) {
+        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 12;
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
 template <>
 struct FloatDepthwiseConvKernel<true, 1, 8> {
   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
@@ -857,6 +928,8 @@ inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 2)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 4)
   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)
 
   // Finally, the kernels allowing a variable input depth,
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index cd565c16a1ee7226f83c19f0020beed75e401497..2df919e579efaaa283f191df91cd433374b31567 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -3704,6 +3704,43 @@ void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
   auto max_value = input2_data[0];
   output_map.array() = input1_map.array().max(max_value);
 }
+
+template <typename T1, typename T2, typename T3>
+void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims,
+            T2* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("ArgMax");
+
+  // The current ArgMax implemention can only determine the index of the maximum
+  // value in the last dimension. So the axis argument is ignored.
+  TFLITE_DCHECK_EQ(axis[0], 3);
+
+  // For ArgMax, the number of output dimensions = (number of input dimensions -
+  // 1). For the sake of simplicity, the output dimensions are equal to the
+  // input dimensions here. We enforce the constraint that the last dimension
+  // must always be 1.
+  TFLITE_DCHECK_EQ(ArraySize(output_dims, 0), 1);
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = ArraySize(input_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        auto max_value = input_data[Offset(input_dims, 0, x, y, b)];
+        int max_index = 0;
+        for (int d = 1; d < depth; ++d) {
+          const auto& curr_value = input_data[Offset(input_dims, d, x, y, b)];
+          if (curr_value > max_value) {
+            max_value = curr_value;
+            max_index = d;
+          }
+        }
+        output_data[Offset(output_dims, 0, x, y, b)] = max_index;
+      }
+    }
+  }
+}
+
 }  // namespace optimized_ops
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
index c2ab78000b81485f037c507933cd024e70f39850..7f90d731b8454a020ab273e6b5591ed90aab14c7 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
@@ -22,7 +22,7 @@ limitations under the License.
 namespace tflite {
 namespace tensor_utils {
 
-// Limit a float input f betweeen +abs_limit and -abs_limit.
+// Limit a float input f between +abs_limit and -abs_limit.
 float PortableClip(float f, float abs_limit);
 
 // Multiply a matrix by a batch vector, and store results in a batch-size
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index b9ca3d5c626dff4ea8ba52949e8fea8e9b43689f..14c430258740b65dce65816f7c5c41fccf6dd5cf 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -2449,6 +2449,40 @@ void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
   }
 }
 
+template <typename T1, typename T2, typename T3>
+void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims,
+            T2* output_data, const Dims<4>& output_dims) {
+  // The current ArgMax implemention can only determine the index of the maximum
+  // value in the last dimension. So the axis argument is ignored.
+  TFLITE_DCHECK_EQ(axis[0], 3);
+
+  // For ArgMax, the number of output dimensions = (number of input dimensions -
+  // 1). For the sake of simplicity, the output dimensions are equal to the
+  // input dimensions here. We enforce the constraint that the last dimension
+  // must always be 1.
+  TFLITE_DCHECK_EQ(ArraySize(output_dims, 0), 1);
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = ArraySize(input_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        auto max_value = input_data[Offset(input_dims, 0, x, y, b)];
+        int max_index = 0;
+        for (int d = 1; d < depth; ++d) {
+          const auto& curr_value = input_data[Offset(input_dims, d, x, y, b)];
+          if (curr_value > max_value) {
+            max_value = curr_value;
+            max_index = d;
+          }
+        }
+        output_data[Offset(output_dims, 0, x, y, b)] = max_index;
+      }
+    }
+  }
+}
+
 }  // namespace reference_ops
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/tensor_utils.h
index 0e69ef5982f01e364d865684652d1dfecab6fee3..e7e2994397650004c7ba442fa1803290e6b12302 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor_utils.h
+++ b/tensorflow/contrib/lite/kernels/internal/tensor_utils.h
@@ -20,7 +20,7 @@ limitations under the License.
 namespace tflite {
 namespace tensor_utils {
 
-// Limit a float input f betweeen +abs_limit and -abs_limit.
+// Limit a float input f between +abs_limit and -abs_limit.
 float Clip(float f, float abs_limit);
 
 // Multiply a matrix by a batch vector, and store results in a batch-size
diff --git a/tensorflow/contrib/lite/kernels/l2norm_test.cc b/tensorflow/contrib/lite/kernels/l2norm_test.cc
index b1db89b8bd3474ac868d7215e4a0de12088c48ef..30e103f3303484c339ef98e6a68e0438291c102f 100644
--- a/tensorflow/contrib/lite/kernels/l2norm_test.cc
+++ b/tensorflow/contrib/lite/kernels/l2norm_test.cc
@@ -57,7 +57,7 @@ TEST(L2NormOpTest, SimpleTest) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/local_response_norm_test.cc b/tensorflow/contrib/lite/kernels/local_response_norm_test.cc
index 63a8b0a3d0186def7da2c9f31481721f1a55281c..d75ce258a04c820d8f82735988c01d0154ef36f2 100644
--- a/tensorflow/contrib/lite/kernels/local_response_norm_test.cc
+++ b/tensorflow/contrib/lite/kernels/local_response_norm_test.cc
@@ -95,7 +95,7 @@ TEST(LocalResponseNormOpTest, SmallRadius) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/lsh_projection_test.cc b/tensorflow/contrib/lite/kernels/lsh_projection_test.cc
index 1011927848d586c8541fb694914b5eee123cb8dc..414d728dfc153058ec878d3c766f58e86815cd3f 100644
--- a/tensorflow/contrib/lite/kernels/lsh_projection_test.cc
+++ b/tensorflow/contrib/lite/kernels/lsh_projection_test.cc
@@ -117,7 +117,7 @@ TEST(LSHProjectionOpTest2, Sparse3DInputs) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/lstm_test.cc b/tensorflow/contrib/lite/kernels/lstm_test.cc
index be4c7ddbf88fc902368cda13aff72f5aecb9dac4..c068286b0d84bcb51ebb0e239350a42863de6523 100644
--- a/tensorflow/contrib/lite/kernels/lstm_test.cc
+++ b/tensorflow/contrib/lite/kernels/lstm_test.cc
@@ -1081,8 +1081,7 @@ TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
-  tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/mul_test.cc b/tensorflow/contrib/lite/kernels/mul_test.cc
index 4b858e1f396252e7f7bdc231bc1e00f47277f08a..4255cfe18a043c55f3ce7292afdedb6e988a28a2 100644
--- a/tensorflow/contrib/lite/kernels/mul_test.cc
+++ b/tensorflow/contrib/lite/kernels/mul_test.cc
@@ -120,8 +120,7 @@ TEST(QuantizedMulOpTest, NoActivation) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
-  tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/op_macros.h b/tensorflow/contrib/lite/kernels/op_macros.h
index 7535afaf8ea52d855e2e4773e56ce2118a16447c..63670efcb1e6349317aa5c75756707fb7a7fa2aa 100644
--- a/tensorflow/contrib/lite/kernels/op_macros.h
+++ b/tensorflow/contrib/lite/kernels/op_macros.h
@@ -15,6 +15,8 @@ limitations under the License.
 #ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_OP_UTIL_H_
 #define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_OP_UTIL_H_
 
+#include <cstdio>
+
 #define TF_LITE_FATAL(msg)          \
   do {                              \
     fprintf(stderr, "%s\n", (msg)); \
diff --git a/tensorflow/contrib/lite/kernels/optional_tensor_test.cc b/tensorflow/contrib/lite/kernels/optional_tensor_test.cc
index 8e9cc07656c8bea83f7cb78ca0b6cc5de7ad1b73..17166715ca30ff3d8ba3d384110e403f8910e39d 100644
--- a/tensorflow/contrib/lite/kernels/optional_tensor_test.cc
+++ b/tensorflow/contrib/lite/kernels/optional_tensor_test.cc
@@ -334,8 +334,7 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
-  tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/pad.cc b/tensorflow/contrib/lite/kernels/pad.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5e90282a43b1b6caf7918b3874fd4273f59e31b7
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/pad.cc
@@ -0,0 +1,139 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include <vector>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace pad {
+
+// This file has two implementations of Pad.
+enum KernelType {
+  kReference,
+  kGenericOptimized,
+};
+
+// TODO(nupurgarg): Padding represented as a tensor is ignored. Only use the
+// `left_padding` and `right_padding` specified in `params`.
+struct PadContext {
+  PadContext(TfLiteContext* context, TfLiteNode* node) {
+    params = reinterpret_cast<TfLitePadParams*>(node->builtin_data);
+    input = GetInput(context, node, 0);
+    output = GetOutput(context, node, 0);
+  }
+  TfLitePadParams* params;
+  TfLiteTensor* input;
+  TfLiteTensor* output;
+};
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE(context, NumInputs(node) == 1 || NumInputs(node) == 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  // Determines size of output tensor.
+  PadContext op_context(context, node);
+  int dims = NumDimensions(op_context.input);
+  TF_LITE_ENSURE_EQ(context, dims, op_context.params->num_dimensions);
+
+  // TODO(nupurgarg): Our current implementations rely on the inputs being 4D.
+  TF_LITE_ENSURE_EQ(context, dims, 4);
+
+  const TfLiteIntArray* input_size = op_context.input->dims;
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(dims);
+  for (int idx = 0; idx < dims; ++idx) {
+    TF_LITE_ENSURE_MSG(context,
+                       (op_context.params->before_padding[idx] >= 0 &&
+                        op_context.params->after_padding[idx] >= 0),
+                       "Pad value has to be greater than equal to 0.");
+    output_size->data[idx] =
+        (input_size->data[idx] + op_context.params->before_padding[idx] +
+         op_context.params->after_padding[idx]);
+  }
+
+  return context->ResizeTensor(context, op_context.output, output_size);
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  PadContext op_context(context, node);
+
+  // TODO(nupurgarg): Support different data types.
+  if (op_context.output->type == kTfLiteFloat32) {
+    std::vector<int> before_padding(
+        op_context.params->before_padding,
+        op_context.params->before_padding + op_context.params->num_dimensions);
+    std::vector<int> after_padding(
+        op_context.params->after_padding,
+        op_context.params->after_padding + op_context.params->num_dimensions);
+
+    // TODO(nupurgarg): Change TOCO's implementation to use padding arrays
+    // in forward order (depth, width, height, batch).
+    // Converts from int[] = {depth, width, height, batch} to int[] = {batch,
+    // height, width, depth} to match TOCO's implementation of pad in
+    // referenced_ops.h and optimized_ops.h.
+    std::reverse(before_padding.begin(), before_padding.end());
+    std::reverse(after_padding.begin(), after_padding.end());
+
+#define TF_LITE_PAD(type)                                                   \
+  type::Pad(GetTensorData<float>(op_context.input),                         \
+            GetTensorDims(op_context.input), before_padding, after_padding, \
+            GetTensorData<float>(op_context.output),                        \
+            GetTensorDims(op_context.output))
+
+    if (kernel_type == kReference) {
+      TF_LITE_PAD(reference_ops);
+    }
+    if (kernel_type == kGenericOptimized) {
+      TF_LITE_PAD(optimized_ops);
+    }
+#undef TF_LITE_PAD
+  } else {
+    context->ReportError(context, "Inputs and outputs not all float types.");
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace pad
+
+TfLiteRegistration* Register_PAD_REF() {
+  static TfLiteRegistration r = {nullptr, nullptr, pad::Prepare,
+                                 pad::Eval<pad::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_PAD_GENERIC_OPT() {
+  static TfLiteRegistration r = {nullptr, nullptr, pad::Prepare,
+                                 pad::Eval<pad::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_PAD() {
+  return Register_PAD_GENERIC_OPT();
+  // return Register_PAD_REF();
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/pad_test.cc b/tensorflow/contrib/lite/kernels/pad_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f3ea9417df0e61dcff7a877726ab91c9b22691ba
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/pad_test.cc
@@ -0,0 +1,99 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class PadOpModel : public SingleOpModel {
+ public:
+  PadOpModel(std::initializer_list<int> input_shape,
+             std::initializer_list<int> before_padding,
+             std::initializer_list<int> after_padding) {
+    input_ = AddInput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(
+        BuiltinOperator_PAD, BuiltinOptions_PadOptions,
+        CreatePadOptions(builder_, builder_.CreateVector<int>(before_padding),
+                         builder_.CreateVector<int>(after_padding))
+            .Union());
+    BuildInterpreter({input_shape});
+  }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor<float>(input_, data);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int output_;
+};
+
+TEST(PadOpTest, TooManyDimensions) {
+  EXPECT_DEATH(
+      PadOpModel({1, 2, 3, 4, 5, 6, 7, 8, 9}, {1, 2, 3, 4, 5, 6, 7, 8, 9},
+                 {1, 2, 3, 4, 5, 6, 7, 8, 9}),
+      "dims != 4");
+}
+
+// TODO(nupurgarg): Test case where before padding and after padding arrays
+// don't contain the same number of dimensions.
+TEST(PadOpTest, UnequalDimensions) {
+  EXPECT_DEATH(PadOpModel({1, 1, 2, 1}, {1, 2, 3}, {1, 2, 3}),
+               "dims != op_context.params->num_dimensions");
+}
+
+TEST(PadOpTest, InvalidPadValue) {
+  EXPECT_DEATH(PadOpModel({1, 1, 2, 1}, {0, 1, 2, 0}, {0, -1, -1, 0}),
+               "Pad value has to be greater than equal to 0.");
+}
+
+TEST(PadOpTest, SimpleTest) {
+  PadOpModel m({1, 2, 2, 1}, {0, 1, 1, 0}, {0, 1, 1, 0});
+  m.SetInput({1, 2, 3, 4});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4,
+                                               0, 0, 0, 0, 0}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST(PadOpTest, AdvancedTest) {
+  // The padding is input in the order of batch, height, width, depth.
+  PadOpModel m({1, 2, 3, 1}, {0, 0, 1, 0}, {0, 2, 3, 0});
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0,
+                                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/pooling_test.cc b/tensorflow/contrib/lite/kernels/pooling_test.cc
index e1b51ec7d5141bf2a41e7ede3e90ff20ec523819..01c91b2ba905e249c36af19f175c68a7e7f17f6d 100644
--- a/tensorflow/contrib/lite/kernels/pooling_test.cc
+++ b/tensorflow/contrib/lite/kernels/pooling_test.cc
@@ -155,7 +155,7 @@ TEST(FloatPoolingOpTest, L2Pool) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
index ca7a0dd1949a3a31d26be770a7df781cc5fe7533..d4e7503f48debbdc092ad7950ee4c0e52854c432 100644
--- a/tensorflow/contrib/lite/kernels/register.cc
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -31,6 +31,7 @@ TfLiteRegistration* Register_CONV_2D();
 TfLiteRegistration* Register_DEPTHWISE_CONV_2D();
 TfLiteRegistration* Register_SVDF();
 TfLiteRegistration* Register_RNN();
+TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_RNN();
 TfLiteRegistration* Register_EMBEDDING_LOOKUP();
 TfLiteRegistration* Register_EMBEDDING_LOOKUP_SPARSE();
 TfLiteRegistration* Register_FULLY_CONNECTED();
@@ -39,14 +40,17 @@ TfLiteRegistration* Register_HASHTABLE_LOOKUP();
 TfLiteRegistration* Register_SOFTMAX();
 TfLiteRegistration* Register_CONCATENATION();
 TfLiteRegistration* Register_ADD();
+TfLiteRegistration* Register_BATCH_TO_SPACE_ND();
 TfLiteRegistration* Register_MUL();
 TfLiteRegistration* Register_L2_NORMALIZATION();
 TfLiteRegistration* Register_LOCAL_RESPONSE_NORMALIZATION();
 TfLiteRegistration* Register_LSTM();
+TfLiteRegistration* Register_PAD();
 TfLiteRegistration* Register_RESHAPE();
 TfLiteRegistration* Register_RESIZE_BILINEAR();
 TfLiteRegistration* Register_SKIP_GRAM();
 TfLiteRegistration* Register_SPACE_TO_DEPTH();
+TfLiteRegistration* Register_GATHER();
 
 BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_RELU, Register_RELU());
@@ -61,6 +65,8 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, Register_DEPTHWISE_CONV_2D());
   AddBuiltin(BuiltinOperator_SVDF, Register_SVDF());
   AddBuiltin(BuiltinOperator_RNN, Register_RNN());
+  AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN,
+             Register_UNIDIRECTIONAL_SEQUENCE_RNN());
   AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP, Register_EMBEDDING_LOOKUP());
   AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP_SPARSE,
              Register_EMBEDDING_LOOKUP_SPARSE());
@@ -70,15 +76,18 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_SOFTMAX, Register_SOFTMAX());
   AddBuiltin(BuiltinOperator_CONCATENATION, Register_CONCATENATION());
   AddBuiltin(BuiltinOperator_ADD, Register_ADD());
+  AddBuiltin(BuiltinOperator_BATCH_TO_SPACE_ND, Register_BATCH_TO_SPACE_ND());
   AddBuiltin(BuiltinOperator_MUL, Register_MUL());
   AddBuiltin(BuiltinOperator_L2_NORMALIZATION, Register_L2_NORMALIZATION());
   AddBuiltin(BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION,
              Register_LOCAL_RESPONSE_NORMALIZATION());
   AddBuiltin(BuiltinOperator_LSTM, Register_LSTM());
+  AddBuiltin(BuiltinOperator_PAD, Register_PAD());
   AddBuiltin(BuiltinOperator_RESHAPE, Register_RESHAPE());
   AddBuiltin(BuiltinOperator_RESIZE_BILINEAR, Register_RESIZE_BILINEAR());
   AddBuiltin(BuiltinOperator_SKIP_GRAM, Register_SKIP_GRAM());
   AddBuiltin(BuiltinOperator_SPACE_TO_DEPTH, Register_SPACE_TO_DEPTH());
+  AddBuiltin(BuiltinOperator_GATHER, Register_GATHER());
 }
 
 TfLiteRegistration* BuiltinOpResolver::FindOp(
diff --git a/tensorflow/contrib/lite/kernels/reshape_test.cc b/tensorflow/contrib/lite/kernels/reshape_test.cc
index 59ce7d5648c04f78123b16a195d3a4928d28394b..0fbcf6e6aa311d2cac491336ee54ccf58bbda8fd 100644
--- a/tensorflow/contrib/lite/kernels/reshape_test.cc
+++ b/tensorflow/contrib/lite/kernels/reshape_test.cc
@@ -83,8 +83,7 @@ TEST(ReshapeOpTest, WithStretchDimension) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
-  tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc b/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
index 0257c0b557feb352413bcc33cb4e2ecdb32c5111..314a71e210d9b5ea75bb137ef228273ef48f28b5 100644
--- a/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
+++ b/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
@@ -111,7 +111,7 @@ TEST(ResizeBilinearOpTest, ThreeDimensionalResize) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/skip_gram_test.cc b/tensorflow/contrib/lite/kernels/skip_gram_test.cc
index e7f6bc904be5e4c23a88f5b4ae7e199346c78ab2..185b64cb44969b57588ea5d0b40f55b6ddf8e11f 100644
--- a/tensorflow/contrib/lite/kernels/skip_gram_test.cc
+++ b/tensorflow/contrib/lite/kernels/skip_gram_test.cc
@@ -251,7 +251,7 @@ TEST(SkipGramTest, TestInputWithExtraSpace) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/softmax_test.cc b/tensorflow/contrib/lite/kernels/softmax_test.cc
index ec8ec03b0d0279cad8543352b1dbaf34c88a7957..6c5338ff0fd26337c9adc8e0b94a0a88edfde37f 100644
--- a/tensorflow/contrib/lite/kernels/softmax_test.cc
+++ b/tensorflow/contrib/lite/kernels/softmax_test.cc
@@ -136,8 +136,7 @@ TEST(SoftmaxOpTest, CompareWithTFminiBetaNotEq1) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
-  tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/space_to_depth_test.cc b/tensorflow/contrib/lite/kernels/space_to_depth_test.cc
index 911f08a92ccd6a97bee414c87bd79091808f0ed1..997f354861a235fb511235e4d64544dc8c3ddb34 100644
--- a/tensorflow/contrib/lite/kernels/space_to_depth_test.cc
+++ b/tensorflow/contrib/lite/kernels/space_to_depth_test.cc
@@ -95,8 +95,7 @@ TEST(SpaceToDepthOpModel, Int64) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
-  tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/svdf_test.cc b/tensorflow/contrib/lite/kernels/svdf_test.cc
index d956025e9dfc9b6c03e55657023fb042c8ac485d..4de2ceaf053df31a4bc857fb250db416c071e80f 100644
--- a/tensorflow/contrib/lite/kernels/svdf_test.cc
+++ b/tensorflow/contrib/lite/kernels/svdf_test.cc
@@ -306,7 +306,7 @@ TEST(SVDFOpTest, BlackBoxTestRank2) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/test_util.cc b/tensorflow/contrib/lite/kernels/test_util.cc
index f716ba8741fd469e7ee405ac300924b53c5c48e5..b69f2b3e4bc66c94fdfc7ed4c244151be63a1711 100644
--- a/tensorflow/contrib/lite/kernels/test_util.cc
+++ b/tensorflow/contrib/lite/kernels/test_util.cc
@@ -180,4 +180,17 @@ int32_t SingleOpModel::GetTensorSize(int index) const {
   return total_size;
 }
 
+template <>
+std::vector<string> SingleOpModel::ExtractVector(int index) {
+  TfLiteTensor* tensor_ptr = interpreter_->tensor(index);
+  CHECK(tensor_ptr != nullptr);
+  const int num_strings = GetStringCount(tensor_ptr);
+  std::vector<string> result;
+  result.reserve(num_strings);
+  for (int i = 0; i < num_strings; ++i) {
+    const auto str = GetString(tensor_ptr, i);
+    result.emplace_back(str.str, str.len);
+  }
+  return result;
+}
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/test_util.h b/tensorflow/contrib/lite/kernels/test_util.h
index e68e49466119c50ec123edb84f1b1b6390a15a60..531c1366a87e20e140e779b767e29b1fd1111f97 100644
--- a/tensorflow/contrib/lite/kernels/test_util.h
+++ b/tensorflow/contrib/lite/kernels/test_util.h
@@ -24,16 +24,11 @@ limitations under the License.
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/model.h"
 #include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/contrib/lite/testing/util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tflite {
 
-inline void LogToStderr() {
-#ifdef PLATFORM_GOOGLE
-  FLAGS_logtostderr = true;
-#endif
-}
-
 // A gmock matcher that check that elements of a float vector match to a given
 // tolerance.
 std::vector<::testing::Matcher<float>> ArrayFloatNear(
@@ -197,6 +192,9 @@ class SingleOpModel {
   std::map<string, std::function<TfLiteRegistration*()>> custom_registrations_;
 };
 
+// Strings have a special implementation that is in test_util.cc
+template <>
+std::vector<string> SingleOpModel::ExtractVector(int index);
 }  // namespace tflite
 
 #endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_TEST_UTIL_H_
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
new file mode 100644
index 0000000000000000000000000000000000000000..85e09049eea5f66a2bb854990bf80e9ed5dcc88a
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
@@ -0,0 +1,169 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <cstdio>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace unidirectional_sequence_rnn {
+
+constexpr int kInputTensor = 0;
+constexpr int kWeightsTensor = 1;
+constexpr int kRecurrentWeightsTensor = 2;
+constexpr int kBiasTensor = 3;
+constexpr int KHiddenStateTensor = 0;
+constexpr int kOutputTensor = 1;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  // Check we have all the inputs and outputs we need.
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, 4);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 2);
+
+  TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]];
+  TfLiteTensor* input_weights =
+      &context->tensors[node->inputs->data[kWeightsTensor]];
+  TfLiteTensor* recurrent_weights =
+      &context->tensors[node->inputs->data[kRecurrentWeightsTensor]];
+  TfLiteTensor* bias = &context->tensors[node->inputs->data[kBiasTensor]];
+
+  // Check all the parameters of tensor match within themselves and match the
+  // input configuration.
+  const int batch_size = input->dims->data[0];
+  const int max_time = input->dims->data[1];
+  const int num_units = input_weights->dims->data[0];
+  TF_LITE_ASSERT_EQ(input->dims->data[2], input_weights->dims->data[1]);
+  TF_LITE_ASSERT_EQ(input_weights->dims->data[0], bias->dims->data[0]);
+  TF_LITE_ASSERT_EQ(recurrent_weights->dims->data[0], bias->dims->data[0]);
+  TF_LITE_ASSERT_EQ(recurrent_weights->dims->data[1], bias->dims->data[0]);
+
+  TfLiteTensor* hidden_state =
+      &context->tensors[node->outputs->data[KHiddenStateTensor]];
+  TfLiteTensor* output = &context->tensors[node->outputs->data[kOutputTensor]];
+
+  // Resize state.
+  TfLiteIntArray* hidden_state_size_array = TfLiteIntArrayCreate(2);
+  hidden_state_size_array->data[0] = batch_size;
+  hidden_state_size_array->data[1] = num_units;
+  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, hidden_state,
+                                                   hidden_state_size_array));
+
+  // Mark hidden state as a persistent tensor.
+  hidden_state->allocation_type = kTfLiteArenaRwPersistent;
+
+  // Resize output.
+  TfLiteIntArray* output_size_array = TfLiteIntArrayCreate(3);
+  output_size_array->data[0] = batch_size;
+  output_size_array->data[1] = max_time;
+  output_size_array->data[2] = num_units;
+  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, output,
+                                                   output_size_array));
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteRNNParams*>(node->builtin_data);
+
+  TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]];
+  TfLiteTensor* input_weights =
+      &context->tensors[node->inputs->data[kWeightsTensor]];
+  TfLiteTensor* recurrent_weights =
+      &context->tensors[node->inputs->data[kRecurrentWeightsTensor]];
+  TfLiteTensor* bias = &context->tensors[node->inputs->data[kBiasTensor]];
+  TfLiteTensor* hidden_state =
+      &context->tensors[node->outputs->data[KHiddenStateTensor]];
+  TfLiteTensor* output = &context->tensors[node->outputs->data[kOutputTensor]];
+
+  // Initialize the pointer bias.
+  const float* bias_ptr = bias->data.f;
+
+  const int batch_size = input->dims->data[0];
+  const int max_time = input->dims->data[1];
+  const int num_units = input_weights->dims->data[0];
+  const int input_size = input->dims->data[2];
+  const int input_weights_stride = input_weights->dims->data[1];
+  const int recurrent_weights_stride = recurrent_weights->dims->data[1];
+
+  // For each batch
+  for (int b = 0; b < batch_size; b++) {
+    // Initialize the pointer to hidden state.
+    float* hidden_state_ptr_batch = hidden_state->data.f + b * num_units;
+    for (int s = 0; s < max_time; s++) {
+      // Initialize the pointer to input and output.
+      const float* input_ptr_batch =
+          input->data.f + b * input_size * max_time + s * input_size;
+     float* output_ptr_batch =
+         output->data.f + b * num_units * max_time + s * num_units;
+
+      // Initialize input_weights and recurrent_weights.
+      const float* input_weights_ptr = input_weights->data.f;
+      const float* recurrent_weights_ptr = recurrent_weights->data.f;
+
+      // Output = bias
+      for (int o = 0; o < num_units; o++) {
+        output_ptr_batch[o] = bias_ptr[o];
+      }
+
+      // Output += input * input_weights
+      for (int o = 0; o < num_units; o++) {
+        for (int i = 0; i < input_size; i++) {
+          output_ptr_batch[o] += input_ptr_batch[i] * input_weights_ptr[i];
+        }
+        input_weights_ptr += input_weights_stride;
+      }
+
+      // Output += recurrent_weights * hidden_state
+      for (int o = 0; o < num_units; o++) {
+        for (int h = 0; h < num_units; h++) {
+          output_ptr_batch[o] +=
+              hidden_state_ptr_batch[h] * recurrent_weights_ptr[h];
+        }
+        recurrent_weights_ptr += recurrent_weights_stride;
+      }
+
+      // Output = activation(Output) and update hidden_state
+      for (int o = 0; o < num_units; o++) {
+        output_ptr_batch[o] =
+            (ActivationFunctor(params->activation))(output_ptr_batch[o]);
+        hidden_state_ptr_batch[o] = output_ptr_batch[o];
+      }
+    }
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace unidirectional_sequence_rnn
+
+TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_RNN() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 unidirectional_sequence_rnn::Prepare,
+                                 unidirectional_sequence_rnn::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a1c1eda16034f83ca5c79fc18f4fa495a3e73f90
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc
@@ -0,0 +1,270 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite RNN op.
+
+#include <vector>
+#include <iomanip>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+static float rnn_input[] = {
+    0.23689353,   0.285385,     0.037029743, -0.19858193,  -0.27569133,
+    0.43773448,   0.60379338,   0.35562468,  -0.69424844,  -0.93421471,
+    -0.87287879,  0.37144363,   -0.62476718, 0.23791671,   0.40060222,
+    0.1356622,    -0.99774903,  -0.98858172, -0.38952237,  -0.47685933,
+    0.31073618,   0.71511042,   -0.63767755, -0.31729108,  0.33468103,
+    0.75801885,   0.30660987,   -0.37354088, 0.77002847,   -0.62747043,
+    -0.68572164,  0.0069220066, 0.65791464,  0.35130811,   0.80834007,
+    -0.61777675,  -0.21095741,  0.41213346,  0.73784804,   0.094794154,
+    0.47791874,   0.86496925,   -0.53376222, 0.85315156,   0.10288584,
+    0.86684,      -0.011186242, 0.10513687,  0.87825835,   0.59929144,
+    0.62827742,   0.18899453,   0.31440187,  0.99059987,   0.87170351,
+    -0.35091716,  0.74861872,   0.17831337,  0.2755419,    0.51864719,
+    0.55084288,   0.58982027,   -0.47443086, 0.20875752,   -0.058871567,
+    -0.66609079,  0.59098077,   0.73017097,  0.74604273,   0.32882881,
+    -0.17503482,  0.22396147,   0.19379807,  0.29120302,   0.077113032,
+    -0.70331609,  0.15804303,   -0.93407321, 0.40182066,   0.036301374,
+    0.66521823,   0.0300982,    -0.7747041,  -0.02038002,  0.020698071,
+    -0.90300065,  0.62870288,   -0.23068321, 0.27531278,   -0.095755219,
+    -0.712036,    -0.17384434,  -0.50593495, -0.18646687,  -0.96508682,
+    0.43519354,   0.14744234,   0.62589407,  0.1653645,    -0.10651493,
+    -0.045277178, 0.99032974,   -0.88255352, -0.85147917,  0.28153265,
+    0.19455957,   -0.55479527,  -0.56042433, 0.26048636,   0.84702539,
+    0.47587705,   -0.074295521, -0.12287641, 0.70117295,   0.90532446,
+    0.89782166,   0.79817224,   0.53402734,  -0.33286154,  0.073485017,
+    -0.56172788,  -0.044897556, 0.89964068,  -0.067662835, 0.76863563,
+    0.93455386,   -0.6324693,   -0.083922029};
+
+static float rnn_golden_output[] = {
+    0.496726,   0,          0.965996,  0,         0.0584254, 0,
+    0,          0.12315,    0,         0,         0.612266,  0.456601,
+    0,          0.52286,    1.16099,   0.0291232,
+
+    0,          0,          0.524901,  0,         0,         0,
+    0,          1.02116,    0,         1.35762,   0,         0.356909,
+    0.436415,   0.0355727,  0,         0,
+
+    0,          0,          0,         0.262335,  0,         0,
+    0,          1.33992,    0,         2.9739,    0,         0,
+    1.31914,    2.66147,    0,         0,
+
+    0.942568,   0,          0,         0,         0.025507,  0,
+    0,          0,          0.321429,  0.569141,  1.25274,   1.57719,
+    0.8158,     1.21805,    0.586239,  0.25427,
+
+    1.04436,    0,          0.630725,  0,         0.133801,  0.210693,
+    0.363026,   0,          0.533426,  0,         1.25926,   0.722707,
+    0,          1.22031,    1.30117,   0.495867,
+
+    0.222187,   0,          0.72725,   0,         0.767003,  0,
+    0,          0.147835,   0,         0,         0,         0.608758,
+    0.469394,   0.00720298, 0.927537,  0,
+
+    0.856974,   0.424257,   0,         0,         0.937329,  0,
+    0,          0,          0.476425,  0,         0.566017,  0.418462,
+    0.141911,   0.996214,   1.13063,   0,
+
+    0.967899,   0,          0,         0,         0.0831304, 0,
+    0,          1.00378,    0,         0,         0,         1.44818,
+    1.01768,    0.943891,   0.502745,  0,
+
+    0.940135,   0,          0,         0,         0,         0,
+    0,          2.13243,    0,         0.71208,   0.123918,  1.53907,
+    1.30225,    1.59644,    0.70222,   0,
+
+    0.804329,   0,          0.430576,  0,         0.505872,  0.509603,
+    0.343448,   0,          0.107756,  0.614544,  1.44549,   1.52311,
+    0.0454298,  0.300267,   0.562784,  0.395095,
+
+    0.228154,   0,          0.675323,  0,         1.70536,   0.766217,
+    0,          0,          0,         0.735363,  0.0759267, 1.91017,
+    0.941888,   0,          0,         0,
+
+    0,          0,          1.5909,    0,         0,         0,
+    0,          0.5755,     0,         0.184687,  0,         1.56296,
+    0.625285,   0,          0,         0,
+
+    0,          0,          0.0857888, 0,         0,         0,
+    0,          0.488383,   0.252786,  0,         0,         0,
+    1.02817,    1.85665,    0,         0,
+
+    0.00981836, 0,          1.06371,   0,         0,         0,
+    0,          0,          0,         0.290445,  0.316406,  0,
+    0.304161,   1.25079,    0.0707152, 0,
+
+    0.986264,   0.309201,   0,         0,         0,         0,
+    0,          1.64896,    0.346248,  0,         0.918175,  0.78884,
+    0.524981,   1.92076,    2.07013,   0.333244,
+
+    0.415153,   0.210318,   0,         0,         0,         0,
+    0,          2.02616,    0,         0.728256,  0.84183,   0.0907453,
+    0.628881,   3.58099,    1.49974,   0
+};
+
+class UnidirectionalRNNOpModel : public SingleOpModel {
+ public:
+  UnidirectionalRNNOpModel(int batches, int sequence_len, int units, int size)
+      : batches_(batches),
+        sequence_len_(sequence_len),
+        units_(units),
+        input_size_(size) {
+    input_ = AddInput(TensorType_FLOAT32);
+    weights_ = AddInput(TensorType_FLOAT32);
+    recurrent_weights_ = AddInput(TensorType_FLOAT32);
+    bias_ = AddInput(TensorType_FLOAT32);
+    hidden_state_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(
+        BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN, BuiltinOptions_RNNOptions,
+        CreateRNNOptions(builder_, ActivationFunctionType_RELU).Union());
+    BuildInterpreter({{batches_, sequence_len_, input_size_},
+                      {units_, input_size_},
+                      {units_, units_},
+                      {units_}});
+  }
+
+  void SetBias(std::initializer_list<float> f) { PopulateTensor(bias_, f); }
+
+  void SetWeights(std::initializer_list<float> f) {
+    PopulateTensor(weights_, f);
+  }
+
+  void SetRecurrentWeights(std::initializer_list<float> f) {
+    PopulateTensor(recurrent_weights_, f);
+  }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+
+  void SetInput(int offset, float* begin, float* end) {
+    PopulateTensor(input_, offset, begin, end);
+  }
+
+  void ResetHiddenState() {
+    const int zero_buffer_size = units_ * batches_;
+    std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
+    memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
+    PopulateTensor(hidden_state_, 0, zero_buffer.get(),
+                   zero_buffer.get() + zero_buffer_size);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+  int input_size() { return input_size_; }
+  int num_units() { return units_; }
+  int num_batches() { return batches_; }
+  int sequence_len() { return sequence_len_; }
+
+ private:
+  int input_;
+  int weights_;
+  int recurrent_weights_;
+  int bias_;
+  int hidden_state_;
+  int output_;
+
+  int batches_;
+  int sequence_len_;
+  int units_;
+  int input_size_;
+};
+
+// TODO(mirkov): add another test which directly compares to TF once TOCO
+// supports the conversion from dynamic_rnn with BasicRNNCell.
+TEST(FullyConnectedOpTest, BlackBoxTest) {
+  UnidirectionalRNNOpModel rnn(2, 16, 16, 8);
+  rnn.SetWeights(
+      {0.461459,    0.153381,   0.529743,    -0.00371218, 0.676267,   -0.211346,
+       0.317493,    0.969689,   -0.343251,   0.186423,    0.398151,   0.152399,
+       0.448504,    0.317662,   0.523556,    -0.323514,   0.480877,   0.333113,
+       -0.757714,   -0.674487,  -0.643585,   0.217766,    -0.0251462, 0.79512,
+       -0.595574,   -0.422444,  0.371572,    -0.452178,   -0.556069,  -0.482188,
+       -0.685456,   -0.727851,  0.841829,    0.551535,    -0.232336,  0.729158,
+       -0.00294906, -0.69754,   0.766073,    -0.178424,   0.369513,   -0.423241,
+       0.548547,    -0.0152023, -0.757482,   -0.85491,    0.251331,   -0.989183,
+       0.306261,    -0.340716,  0.886103,    -0.0726757,  -0.723523,  -0.784303,
+       0.0354295,   0.566564,   -0.485469,   -0.620498,   0.832546,   0.697884,
+       -0.279115,   0.294415,   -0.584313,   0.548772,    0.0648819,  0.968726,
+       0.723834,    -0.0080452, -0.350386,   -0.272803,   0.115121,   -0.412644,
+       -0.824713,   -0.992843,  -0.592904,   -0.417893,   0.863791,   -0.423461,
+       -0.147601,   -0.770664,  -0.479006,   0.654782,    0.587314,   -0.639158,
+       0.816969,    -0.337228,  0.659878,    0.73107,     0.754768,   -0.337042,
+       0.0960841,   0.368357,   0.244191,    -0.817703,   -0.211223,  0.442012,
+       0.37225,     -0.623598,  -0.405423,   0.455101,    0.673656,   -0.145345,
+       -0.511346,   -0.901675,  -0.81252,    -0.127006,   0.809865,   -0.721884,
+       0.636255,    0.868989,   -0.347973,   -0.10179,    -0.777449,  0.917274,
+       0.819286,    0.206218,   -0.00785118, 0.167141,    0.45872,    0.972934,
+       -0.276798,   0.837861,   0.747958,    -0.0151566,  -0.330057,  -0.469077,
+       0.277308,    0.415818});
+
+  rnn.SetBias({0.065691948, -0.69055247, 0.1107955, -0.97084129, -0.23957068,
+               -0.23566568, -0.389184, 0.47481549, -0.4791103, 0.29931796,
+               0.10463274, 0.83918178, 0.37197268, 0.61957061, 0.3956964,
+               -0.37609905});
+
+  rnn.SetRecurrentWeights({0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1});
+
+  rnn.ResetHiddenState();
+  const int input_sequence_size = rnn.input_size() * rnn.sequence_len();
+  float* batch_start = rnn_input;
+  float* batch_end = batch_start + input_sequence_size;
+  rnn.SetInput(0, batch_start, batch_end);
+  rnn.SetInput(input_sequence_size, batch_start, batch_end);
+
+  rnn.Invoke();
+
+  float* golden_start = rnn_golden_output;
+  float* golden_end = golden_start + rnn.num_units() * rnn.sequence_len();
+  std::vector<float> expected;
+  expected.insert(expected.end(), golden_start, golden_end);
+  expected.insert(expected.end(), golden_start, golden_end);
+
+  EXPECT_THAT(rnn.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  // On Linux, add: tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index e2f3560e61baae88a4afaafaa202cde784063efc..94e22b265964b300c862a9ee52511d479c20c64d 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -60,6 +60,14 @@ std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromBuffer(
   return model;
 }
 
+std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromModel(
+    const tflite::Model* model_spec, ErrorReporter* error_reporter) {
+  std::unique_ptr<FlatBufferModel> model;
+  model.reset(new FlatBufferModel(model_spec, error_reporter));
+  if (!model->initialized()) model.reset();
+  return model;
+}
+
 FlatBufferModel::FlatBufferModel(const char* filename, bool mmap_file,
                                  ErrorReporter* error_reporter, bool use_nnapi)
     : error_reporter_(error_reporter ? error_reporter
@@ -99,6 +107,13 @@ FlatBufferModel::FlatBufferModel(const char* ptr, size_t num_bytes,
   model_ = VerifyAndGetModel(allocation_->base(), allocation_->bytes());
 }
 
+FlatBufferModel::FlatBufferModel(const Model* model,
+                                 ErrorReporter* error_reporter)
+    : error_reporter_(error_reporter ? error_reporter
+                                     : DefaultErrorReporter()) {
+  model_ = model;
+}
+
 FlatBufferModel::~FlatBufferModel() { delete allocation_; }
 
 InterpreterBuilder::InterpreterBuilder(const FlatBufferModel& model,
@@ -160,6 +175,27 @@ std::vector<int> FlatBufferIntArrayToVector(T* flat_array) {
   return ret;
 }
 
+// Copies the contents from the flatbuffer int vector `flatbuffer` into the
+// int array `buffer`. `flat_vector` and `buffer` represent the same
+// configuration operation for a given operation.
+void FlatBufferIntVectorToArray(int max_size_of_buffer,
+                                const flatbuffers::Vector<int32_t>* flat_vector,
+                                int* buffer, ErrorReporter* error_reporter) {
+  if (!flat_vector) {
+    error_reporter->Report("Input array not provided for operation.\n");
+  } else {
+    int num_dimensions = flat_vector->Length();
+    if (num_dimensions > max_size_of_buffer / sizeof(int)) {
+      error_reporter->Report(
+          "Found too many dimensions in the operation's input array.\n");
+    } else {
+      for (int i = 0; i < num_dimensions; ++i) {
+        buffer[i] = flat_vector->Get(i);
+      }
+    }
+  }
+}
+
 // Allocate a structure using C malloc, but make sure the structure is a
 // POD structure that doesn't require constructors to run. The reason we do
 // this, is that Interpreter's C extension part will take ownership and wants
@@ -175,6 +211,9 @@ T* MallocPOD() {
 // This handles builtin data explicitly as there are flatbuffer schemas.
 //
 // Returns memory that must be feed.
+//
+// TODO(nupurgarg): Pass in void ** and return TfLiteStatus to ensure program
+// crashes if error reporter is called.
 void* ParseOpData(const Operator* op, BuiltinOperator op_type,
                   ErrorReporter* error_reporter) {
   auto parse_padding = [](Padding padding) {
@@ -301,6 +340,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN:
     case BuiltinOperator_RNN: {
       TfLiteRNNParams* params = MallocPOD<TfLiteRNNParams>();
       if (auto* rnn_params = op->builtin_options_as_RNNOptions()) {
@@ -417,23 +457,35 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_PAD: {
+      auto* params = MallocPOD<TfLitePadParams>();
+      if (auto* schema_params = op->builtin_options_as_PadOptions()) {
+        auto* before_padding = schema_params->before_padding();
+        FlatBufferIntVectorToArray(sizeof(params->before_padding),
+                                   before_padding, params->before_padding,
+                                   error_reporter);
+
+        auto* after_padding = schema_params->after_padding();
+        FlatBufferIntVectorToArray(sizeof(params->after_padding), after_padding,
+                                   params->after_padding, error_reporter);
+
+        if (before_padding->Length() != after_padding->Length()) {
+          error_reporter->Report(
+              "Before padding and after padding arrays need to contain the "
+              "same number of dimensions.\n");
+        }
+        params->num_dimensions = after_padding->Length();
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
     case BuiltinOperator_RESHAPE: {
       auto* params = MallocPOD<TfLiteReshapeParams>();
       if (auto* schema_params = op->builtin_options_as_ReshapeOptions()) {
         auto* new_shape = schema_params->new_shape();
-        if (!new_shape) {
-          error_reporter->Report("No new_shape provided for Reshape\n");
-        } else {
-          params->num_dimensions = new_shape->Length();
-          if (params->num_dimensions > sizeof(params->shape) / sizeof(int)) {
-            error_reporter->Report(
-                "Found too many dimensions in Reshape's new_shape\n");
-          } else {
-            for (int i = 0; i < params->num_dimensions; ++i) {
-              params->shape[i] = new_shape->Get(i);
-            }
-          }
-        }
+        FlatBufferIntVectorToArray(sizeof(params->shape), new_shape,
+                                   params->shape, error_reporter);
+        params->num_dimensions = new_shape->Length();
       }
       builtin_data = reinterpret_cast<void*>(params);
       break;
@@ -456,6 +508,34 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_GATHER: {
+      TfLiteGatherParams* params = MallocPOD<TfLiteGatherParams>();
+      params->axis = 0;
+      if (auto* gather_params = op->builtin_options_as_GatherOptions()) {
+        params->axis = gather_params->axis();
+      }
+
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_BATCH_TO_SPACE_ND: {
+      auto* params = MallocPOD<TfLiteBatchToSpaceNDParams>();
+      if (auto* schema_params =
+              op->builtin_options_as_BatchToSpaceNDOptions()) {
+        const auto& block_shape = schema_params->block_shape();
+        FlatBufferIntVectorToArray(sizeof(params->block_shape), block_shape,
+                                   params->block_shape, error_reporter);
+        const auto& before_crops = schema_params->before_crops();
+        FlatBufferIntVectorToArray(sizeof(params->before_crops), before_crops,
+                                   params->before_crops, error_reporter);
+        const auto& after_crops = schema_params->after_crops();
+        FlatBufferIntVectorToArray(sizeof(params->after_crops), after_crops,
+                                   params->after_crops, error_reporter);
+        params->num_spatial_dimensions = block_shape->Length();
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
   }
   return builtin_data;
 }
diff --git a/tensorflow/contrib/lite/model.h b/tensorflow/contrib/lite/model.h
index 15659d33f37dfb2f119480ed88d2e1b81f34c145..e0c96f7f0480cd3146f95a22957477809cf0096d 100644
--- a/tensorflow/contrib/lite/model.h
+++ b/tensorflow/contrib/lite/model.h
@@ -45,18 +45,25 @@ namespace tflite {
 // or mmapped. This uses flatbuffers as the serialization format.
 class FlatBufferModel {
  public:
-  // Build a model based on a file. Return a nullptr in case of failure.
+  // Builds a model based on a file. Returns a nullptr in case of failure.
   static std::unique_ptr<FlatBufferModel> BuildFromFile(
       const char* filename,
       ErrorReporter* error_reporter = DefaultErrorReporter());
 
-  // Build a model based on a pre-loaded flatbuffer. The caller retains
+  // Builds a model based on a pre-loaded flatbuffer. The caller retains
   // ownership of the buffer and should keep it alive until the returned object
-  // is destroyed. Return a nullptr in case of failure.
+  // is destroyed. Returns a nullptr in case of failure.
   static std::unique_ptr<FlatBufferModel> BuildFromBuffer(
       const char* buffer, size_t buffer_size,
       ErrorReporter* error_reporter = DefaultErrorReporter());
 
+  // Builds a model directly from a flatbuffer pointer. The caller retains
+  // ownership of the buffer and should keep it alive until the returned object
+  // is destroyed. Returns a nullptr in case of failure.
+  static std::unique_ptr<FlatBufferModel> BuildFromModel(
+      const tflite::Model* model_spec,
+      ErrorReporter* error_reporter = DefaultErrorReporter());
+
   // Releases memory or unmaps mmaped meory.
   ~FlatBufferModel();
 
@@ -75,7 +82,7 @@ class FlatBufferModel {
   bool CheckModelIdentifier() const;
 
  private:
-  // Load a model from `filename`. If `mmap_file` is true then use mmap,
+  // Loads a model from `filename`. If `mmap_file` is true then use mmap,
   // otherwise make a copy of the model in a buffer.
   //
   // Note, if `error_reporter` is null, then a DefaultErrorReporter() will be
@@ -85,8 +92,8 @@ class FlatBufferModel {
       ErrorReporter* error_reporter = DefaultErrorReporter(),
       bool use_nnapi = false);
 
-  // Load a model from `ptr` and `num_bytes` of the model file. The `ptr` has to
-  // remain alive and unchanged until the end of this flatbuffermodel's
+  // Loads a model from `ptr` and `num_bytes` of the model file. The `ptr` has
+  // to remain alive and unchanged until the end of this flatbuffermodel's
   // lifetime.
   //
   // Note, if `error_reporter` is null, then a DefaultErrorReporter() will be
@@ -94,6 +101,10 @@ class FlatBufferModel {
   FlatBufferModel(const char* ptr, size_t num_bytes,
                   ErrorReporter* error_reporter = DefaultErrorReporter());
 
+  // Loads a model from Model flatbuffer. The `model` has to remain alive and
+  // unchanged until the end of this flatbuffermodel's lifetime.
+  FlatBufferModel(const Model* model, ErrorReporter* error_reporter);
+
   // Flatbuffer traverser pointer. (Model* is a pointer that is within the
   // allocated memory of the data allocated by allocation's internals.
   const tflite::Model* model_ = nullptr;
@@ -106,9 +117,9 @@ class FlatBufferModel {
 // model are mapped to executable function pointers (TfLiteRegistrations).
 class OpResolver {
  public:
-  // Find the op registration for a builtin operator by enum code.
+  // Finds the op registration for a builtin operator by enum code.
   virtual TfLiteRegistration* FindOp(tflite::BuiltinOperator op) const = 0;
-  // Find the op registration of a custom operator by op name.
+  // Finds the op registration of a custom operator by op name.
   virtual TfLiteRegistration* FindOp(const char* op) const = 0;
   virtual ~OpResolver() {}
 };
@@ -131,7 +142,7 @@ class InterpreterBuilder {
  public:
   InterpreterBuilder(const FlatBufferModel& model,
                      const OpResolver& op_resolver);
-  // Build an interpreter given only the raw flatbuffer Model object (instead
+  // Builds an interpreter given only the raw flatbuffer Model object (instead
   // of a FlatBufferModel). Mostly used for testing.
   // If `error_reporter` is null, then DefaultErrorReporter() is used.
   InterpreterBuilder(const ::tflite::Model* model,
diff --git a/tensorflow/contrib/lite/model_test.cc b/tensorflow/contrib/lite/model_test.cc
index 61043866420752b552281e353be9a2b41a6aadc8..5330c8f594593655b2a8776cf6b399c0d16cdc19 100644
--- a/tensorflow/contrib/lite/model_test.cc
+++ b/tensorflow/contrib/lite/model_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 
 #include <gtest/gtest.h>
 #include "tensorflow/contrib/lite/error_reporter.h"
+#include "tensorflow/contrib/lite/testing/util.h"
 
 // Comparison for TfLiteRegistration. Since TfLiteRegistration is a C object,
 // we must declare this in global namespace, so argument-dependent operator
@@ -254,6 +255,28 @@ TEST(BasicFlatBufferModel, TestBuildModelFromCorruptedData) {
   ASSERT_FALSE(model);
 }
 
+// Test that loading model directly from a Model flatbuffer works.
+TEST(BasicFlatBufferModel, TestBuildFromModel) {
+  TestErrorReporter reporter;
+  FileCopyAllocation model_allocation(
+      "tensorflow/contrib/lite/testdata/test_model.bin", &reporter);
+  ASSERT_TRUE(model_allocation.valid());
+  ::flatbuffers::Verifier verifier(
+      reinterpret_cast<const uint8_t*>(model_allocation.base()),
+      model_allocation.bytes());
+  ASSERT_TRUE(VerifyModelBuffer(verifier));
+  const Model* model_fb = ::tflite::GetModel(model_allocation.base());
+
+  auto model = FlatBufferModel::BuildFromModel(model_fb);
+  ASSERT_TRUE(model);
+
+  std::unique_ptr<Interpreter> interpreter;
+  ASSERT_EQ(
+      InterpreterBuilder(*model, TrivialResolver(&dummy_reg))(&interpreter),
+      kTfLiteOk);
+  ASSERT_NE(interpreter, nullptr);
+}
+
 // TODO(aselle): Add tests for serialization of builtin op data types.
 // These tests will occur with the evaluation tests of individual operators,
 // not here.
@@ -261,7 +284,7 @@ TEST(BasicFlatBufferModel, TestBuildModelFromCorruptedData) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/models/smartreply/BUILD b/tensorflow/contrib/lite/models/smartreply/BUILD
index fbdf19f2054cf01aec44e3fcb13d0d0a2ff6f914..733c3f4c7fa0605f24a1e6b4c458e34310c079c4 100644
--- a/tensorflow/contrib/lite/models/smartreply/BUILD
+++ b/tensorflow/contrib/lite/models/smartreply/BUILD
@@ -1,7 +1,92 @@
 package(default_visibility = ["//visibility:public"])
 
+load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts", "gen_selected_ops")
+
 licenses(["notice"])  # Apache 2.0
 
+gen_selected_ops(
+    name = "smartreply_ops",
+    model = "@tflite_smartreply//:smartreply.tflite",
+)
+
+cc_library(
+    name = "custom_ops",
+    srcs = [
+        "ops/extract_feature.cc",
+        "ops/normalize.cc",
+        "ops/predict.cc",
+        ":smartreply_ops",
+    ],
+    copts = tflite_copts(),
+    deps = [
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        "//tensorflow/contrib/lite/tools:mutable_op_resolver",
+        "@com_google_absl//absl/strings",
+        "@com_googlesource_code_re2//:re2",
+        "@farmhash_archive//:farmhash",
+    ],
+)
+
+cc_library(
+    name = "predictor_lib",
+    srcs = ["predictor.cc"],
+    hdrs = ["predictor.h"],
+    copts = tflite_copts(),
+    deps = [
+        ":custom_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        "//tensorflow/contrib/lite/tools:mutable_op_resolver",
+        "@com_google_absl//absl/strings",
+        "@com_googlesource_code_re2//:re2",
+    ],
+)
+
+cc_test(
+    name = "extract_feature_op_test",
+    size = "small",
+    srcs = ["ops/extract_feature_test.cc"],
+    deps = [
+        ":custom_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+        "@farmhash_archive//:farmhash",
+    ],
+)
+
+cc_test(
+    name = "normalize_op_test",
+    size = "small",
+    srcs = ["ops/normalize_test.cc"],
+    deps = [
+        ":custom_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_test(
+    name = "predict_op_test",
+    size = "small",
+    srcs = ["ops/predict_test.cc"],
+    deps = [
+        ":custom_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/AndroidManifest.xml b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/AndroidManifest.xml
new file mode 100644
index 0000000000000000000000000000000000000000..75ed9432c8fcdfd77a64d3c659e6336c977cdda2
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/AndroidManifest.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2017 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+  package="com.example.android.smartreply" >
+
+  <uses-sdk
+      android:minSdkVersion="15"
+      android:targetSdkVersion="24" />
+
+  <application android:label="TfLite SmartReply Demo">
+    <activity
+        android:name="com.example.android.smartreply.MainActivity"
+        android:configChanges="orientation|keyboardHidden|screenSize"
+        android:windowSoftInputMode="stateUnchanged|adjustPan"
+        android:label="TfLite SmartReply Demo"
+        android:screenOrientation="portrait" >
+      <intent-filter>
+        <action android:name="android.intent.action.MAIN" />
+        <category android:name="android.intent.category.LAUNCHER" />
+      </intent-filter>
+    </activity>
+  </application>
+
+</manifest>
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/BUILD b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f8767b443a2aa64b666c3b6bfb7db30cc0be62ea
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/BUILD
@@ -0,0 +1,65 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow/contrib/lite:build_def.bzl",
+    "tflite_copts",
+    "tflite_jni_binary",
+)
+
+filegroup(
+    name = "assets",
+    srcs = [
+        "@tflite_smartreply//:model_files",
+    ],
+)
+
+android_binary(
+    name = "SmartReplyDemo",
+    srcs = glob(["java/**/*.java"]),
+    assets = [":assets"],
+    assets_dir = "",
+    custom_package = "com.example.android.smartreply",
+    manifest = "AndroidManifest.xml",
+    nocompress_extensions = [
+        ".tflite",
+    ],
+    resource_files = glob(["res/**"]),
+    tags = ["manual"],
+    deps = [
+        ":smartreply_runtime",
+        "@androidsdk//com.android.support:support-v13-25.2.0",
+        "@androidsdk//com.android.support:support-v4-25.2.0",
+    ],
+)
+
+cc_library(
+    name = "smartreply_runtime",
+    srcs = ["libsmartreply_jni.so"],
+    visibility = ["//visibility:public"],
+)
+
+tflite_jni_binary(
+    name = "libsmartreply_jni.so",
+    deps = [
+        ":smartreply_jni_lib",
+    ],
+)
+
+cc_library(
+    name = "smartreply_jni_lib",
+    srcs = [
+        "smartreply_jni.cc",
+    ],
+    copts = tflite_copts(),
+    linkopts = [
+        "-lm",
+        "-ldl",
+    ],
+    deps = [
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/models/smartreply:predictor_lib",
+    ],
+    alwayslink = 1,
+)
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/assets/BUILD b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/assets/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..3c882ffc43fde577801428151a43b592e8faaed1
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/assets/BUILD
@@ -0,0 +1,15 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(glob(["*"]))
+
+filegroup(
+    name = "assets_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "BUILD",
+        ],
+    ),
+)
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/assets/backoff_response.txt b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/assets/backoff_response.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a0a5b46b5f8d5fd6a0297c8056bb2fb9b6ad9ada
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/assets/backoff_response.txt
@@ -0,0 +1,16 @@
+Ok
+Yes
+No
+👍
+☺
+😟
+❤️
+Lol
+Thanks
+Got it
+Done
+Nice
+I don't know
+What?
+Why?
+What's up?
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/MainActivity.java b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/MainActivity.java
new file mode 100644
index 0000000000000000000000000000000000000000..02fec9ae5e971ad756ae6c2b0149a6aacfa27cad
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/MainActivity.java
@@ -0,0 +1,99 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package com.example.android.smartreply;
+
+import android.app.Activity;
+import android.os.Bundle;
+import android.os.Handler;
+import android.util.Log;
+import android.view.View;
+import android.widget.Button;
+import android.widget.EditText;
+import android.widget.TextView;
+
+/**
+ * The main (and only) activity of this demo app. Displays a text box which updates as messages are
+ * received.
+ */
+public class MainActivity extends Activity {
+  private static final String TAG = "SmartReplyDemo";
+  private SmartReplyClient client;
+
+  private Button sendButton;
+  private TextView messageTextView;
+  private EditText messageInput;
+
+  private Handler handler;
+
+  @Override
+  protected void onCreate(Bundle savedInstanceState) {
+    super.onCreate(savedInstanceState);
+    Log.v(TAG, "onCreate");
+    setContentView(R.layout.main_activity);
+
+    client = new SmartReplyClient(getApplicationContext());
+    handler = new Handler();
+
+    sendButton = (Button) findViewById(R.id.send_button);
+    sendButton.setOnClickListener(
+        (View v) -> {
+          send(messageInput.getText().toString());
+        });
+
+    messageTextView = (TextView) findViewById(R.id.message_text);
+    messageInput = (EditText) findViewById(R.id.message_input);
+  }
+
+  @Override
+  protected void onStart() {
+    super.onStart();
+    Log.v(TAG, "onStart");
+    handler.post(
+        () -> {
+          client.loadModel();
+        });
+  }
+
+  @Override
+  protected void onStop() {
+    super.onStop();
+    Log.v(TAG, "onStop");
+    handler.post(
+        () -> {
+          client.unloadModel();
+        });
+  }
+
+  private void send(final String message) {
+    handler.post(
+        () -> {
+          messageTextView.append("Input: " + message + "\n");
+
+          SmartReply[] ans = client.predict(new String[] {message});
+          for (SmartReply reply : ans) {
+            appendMessage("Reply: " + reply.getText());
+          }
+          appendMessage("------");
+        });
+  }
+
+  private void appendMessage(final String message) {
+    handler.post(
+        () -> {
+          messageTextView.append(message + "\n");
+        });
+  }
+}
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReply.java b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReply.java
new file mode 100644
index 0000000000000000000000000000000000000000..3357fd17c11f870d1b0998bb26ffa9abf149686b
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReply.java
@@ -0,0 +1,44 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package com.example.android.smartreply;
+
+import android.support.annotation.Keep;
+
+/**
+ * SmartReply contains predicted message, and confidence.
+ *
+ * <p>NOTE: this class used by JNI, class name and constructor should not be obfuscated.
+ */
+@Keep
+public class SmartReply {
+
+  private final String text;
+  private final float score;
+
+  @Keep
+  public SmartReply(String text, float score) {
+    this.text = text;
+    this.score = score;
+  }
+
+  public String getText() {
+    return text;
+  }
+
+  public float getScore() {
+    return score;
+  }
+}
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReplyClient.java b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReplyClient.java
new file mode 100644
index 0000000000000000000000000000000000000000..d5b1ac0ffbc47283aa0c1bf68c0a85ad6228cdcc
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReplyClient.java
@@ -0,0 +1,129 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package com.example.android.smartreply;
+
+import android.content.Context;
+import android.content.res.AssetFileDescriptor;
+import android.support.annotation.Keep;
+import android.support.annotation.WorkerThread;
+import android.util.Log;
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.util.ArrayList;
+import java.util.List;
+
+/** Interface to load TfLite model and provide predictions. */
+public class SmartReplyClient implements AutoCloseable {
+  private static final String TAG = "SmartReplyDemo";
+  private static final String MODEL_PATH = "smartreply.tflite";
+  private static final String BACKOFF_PATH = "backoff_response.txt";
+  private static final String JNI_LIB = "smartreply_jni";
+
+  private final Context context;
+  private long storage;
+  private MappedByteBuffer model;
+
+  private volatile boolean isLibraryLoaded;
+
+  public SmartReplyClient(Context context) {
+    this.context = context;
+  }
+
+  public boolean isLoaded() {
+    return storage != 0;
+  }
+
+  @WorkerThread
+  public synchronized void loadModel() {
+    if (!isLibraryLoaded) {
+      System.loadLibrary(JNI_LIB);
+      isLibraryLoaded = true;
+    }
+
+    try {
+      model = loadModelFile();
+      String[] backoff = loadBackoffList();
+      storage = loadJNI(model, backoff);
+    } catch (IOException e) {
+      Log.e(TAG, "Fail to load model", e);
+      return;
+    }
+  }
+
+  @WorkerThread
+  public synchronized SmartReply[] predict(String[] input) {
+    if (storage != 0) {
+      return predictJNI(storage, input);
+    } else {
+      return new SmartReply[] {};
+    }
+  }
+
+  @WorkerThread
+  public synchronized void unloadModel() {
+    close();
+  }
+
+  @Override
+  public synchronized void close() {
+    if (storage != 0) {
+      unloadJNI(storage);
+      storage = 0;
+    }
+  }
+
+  private MappedByteBuffer loadModelFile() throws IOException {
+    AssetFileDescriptor fileDescriptor = context.getAssets().openFd(MODEL_PATH);
+    FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor());
+    try {
+      FileChannel fileChannel = inputStream.getChannel();
+      long startOffset = fileDescriptor.getStartOffset();
+      long declaredLength = fileDescriptor.getDeclaredLength();
+      return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
+    } finally {
+      inputStream.close();
+    }
+  }
+
+  private String[] loadBackoffList() throws IOException {
+    List<String> labelList = new ArrayList<String>();
+    BufferedReader reader =
+        new BufferedReader(new InputStreamReader(context.getAssets().open(BACKOFF_PATH)));
+    String line;
+    while ((line = reader.readLine()) != null) {
+      if (!line.isEmpty()) {
+        labelList.add(line);
+      }
+    }
+    reader.close();
+    String[] ans = new String[labelList.size()];
+    labelList.toArray(ans);
+    return ans;
+  }
+
+  @Keep
+  private native long loadJNI(MappedByteBuffer buffer, String[] backoff);
+
+  @Keep
+  private native SmartReply[] predictJNI(long storage, String[] text);
+
+  @Keep
+  private native void unloadJNI(long storage);
+}
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/res/layout/main_activity.xml b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/res/layout/main_activity.xml
new file mode 100644
index 0000000000000000000000000000000000000000..23b4cadc007a4457d33b8c8fecf9b1e7b7436320
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/res/layout/main_activity.xml
@@ -0,0 +1,44 @@
+<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:orientation="vertical">
+
+    <LinearLayout
+        android:layout_width="fill_parent"
+        android:layout_height="0dp"
+        android:padding="5dip"
+        android:layout_weight="3">
+
+        <TextView
+            android:id="@+id/message_text"
+            android:layout_width="fill_parent"
+            android:layout_height="fill_parent"
+            android:scrollbars="vertical"
+            android:gravity="bottom"/>
+    </LinearLayout>
+
+    <LinearLayout
+        android:layout_width="fill_parent"
+        android:layout_height="0dp"
+        android:padding="5dip"
+        android:layout_weight="1">
+
+        <EditText
+            android:id="@+id/message_input"
+            android:layout_width="0dp"
+            android:layout_height="fill_parent"
+            android:layout_weight="6"
+            android:scrollbars="vertical"
+            android:hint="Enter Text"
+            android:gravity="top"
+            android:inputType="text"/>
+        <Button
+            android:id="@+id/send_button"
+            android:layout_width="0dp"
+            android:layout_height="fill_parent"
+            android:layout_weight="2"
+            android:text="Send" />
+    </LinearLayout>
+
+</LinearLayout>
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/smartreply_jni.cc b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/smartreply_jni.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f158cc511a9bee0710aee13cd04f77b6f95fb868
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/smartreply_jni.cc
@@ -0,0 +1,129 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <jni.h>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/models/smartreply/predictor.h"
+
+const char kIllegalStateException[] = "java/lang/IllegalStateException";
+
+using tflite::custom::smartreply::GetSegmentPredictions;
+using tflite::custom::smartreply::PredictorResponse;
+
+template <typename T>
+T CheckNotNull(JNIEnv* env, T&& t) {
+  if (t == nullptr) {
+    env->ThrowNew(env->FindClass(kIllegalStateException), "");
+    return nullptr;
+  }
+  return std::forward<T>(t);
+}
+
+std::vector<std::string> jniStringArrayToVector(JNIEnv* env,
+                                                jobjectArray string_array) {
+  int count = env->GetArrayLength(string_array);
+  std::vector<std::string> result;
+  for (int i = 0; i < count; i++) {
+    auto jstr =
+        reinterpret_cast<jstring>(env->GetObjectArrayElement(string_array, i));
+    const char* raw_str = env->GetStringUTFChars(jstr, JNI_FALSE);
+    result.emplace_back(std::string(raw_str));
+    env->ReleaseStringUTFChars(jstr, raw_str);
+  }
+  return result;
+}
+
+struct JNIStorage {
+  std::vector<std::string> backoff_list;
+  std::unique_ptr<::tflite::FlatBufferModel> model;
+};
+
+extern "C" JNIEXPORT jlong JNICALL
+Java_com_example_android_smartreply_SmartReplyClient_loadJNI(
+    JNIEnv* env, jobject thiz, jobject model_buffer,
+    jobjectArray backoff_list) {
+  const char* buf =
+      static_cast<char*>(env->GetDirectBufferAddress(model_buffer));
+  jlong capacity = env->GetDirectBufferCapacity(model_buffer);
+
+  JNIStorage* storage = new JNIStorage;
+  storage->model = tflite::FlatBufferModel::BuildFromBuffer(
+      buf, static_cast<size_t>(capacity));
+  storage->backoff_list = jniStringArrayToVector(env, backoff_list);
+
+  if (!storage->model) {
+    delete storage;
+    env->ThrowNew(env->FindClass(kIllegalStateException), "");
+    return 0;
+  }
+  return reinterpret_cast<jlong>(storage);
+}
+
+extern "C" JNIEXPORT jobjectArray JNICALL
+Java_com_example_android_smartreply_SmartReplyClient_predictJNI(
+    JNIEnv* env, jobject /*thiz*/, jlong storage_ptr, jobjectArray input_text) {
+  // Predict
+  if (storage_ptr == 0) {
+    return nullptr;
+  }
+  JNIStorage* storage = reinterpret_cast<JNIStorage*>(storage_ptr);
+  if (storage == nullptr) {
+    return nullptr;
+  }
+  std::vector<PredictorResponse> responses;
+  GetSegmentPredictions(jniStringArrayToVector(env, input_text),
+                        *storage->model, {storage->backoff_list}, &responses);
+
+  // Create a SmartReply[] to return back to Java
+  jclass smart_reply_class = CheckNotNull(
+      env, env->FindClass("com/example/android/smartreply/SmartReply"));
+  if (env->ExceptionCheck()) {
+    return nullptr;
+  }
+  jmethodID smart_reply_ctor = CheckNotNull(
+      env,
+      env->GetMethodID(smart_reply_class, "<init>", "(Ljava/lang/String;F)V"));
+  if (env->ExceptionCheck()) {
+    return nullptr;
+  }
+  jobjectArray array = CheckNotNull(
+      env, env->NewObjectArray(responses.size(), smart_reply_class, nullptr));
+  if (env->ExceptionCheck()) {
+    return nullptr;
+  }
+  for (int i = 0; i < responses.size(); i++) {
+    jstring text =
+        CheckNotNull(env, env->NewStringUTF(responses[i].GetText().data()));
+    if (env->ExceptionCheck()) {
+      return nullptr;
+    }
+    jobject reply = env->NewObject(smart_reply_class, smart_reply_ctor, text,
+                                   responses[i].GetScore());
+    env->SetObjectArrayElement(array, i, reply);
+  }
+  return array;
+}
+
+extern "C" JNIEXPORT void JNICALL
+Java_com_example_android_smartreply_SmartReplyClient_unloadJNI(
+    JNIEnv* env, jobject thiz, jlong storage_ptr) {
+  if (storage_ptr != 0) {
+    JNIStorage* storage = reinterpret_cast<JNIStorage*>(storage_ptr);
+    delete storage;
+  }
+}
diff --git a/tensorflow/contrib/lite/models/smartreply/ops/extract_feature.cc b/tensorflow/contrib/lite/models/smartreply/ops/extract_feature.cc
index 1c422b659abc0871a346b8cffc260df4b22a4f9d..f97a6486d6c11cf0184622f515fe5b1e096c6257 100644
--- a/tensorflow/contrib/lite/models/smartreply/ops/extract_feature.cc
+++ b/tensorflow/contrib/lite/models/smartreply/ops/extract_feature.cc
@@ -23,7 +23,7 @@ limitations under the License.
 
 #include <algorithm>
 #include <map>
-#include "re2/re2.h"
+
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/kernels/kernel_util.h"
 #include "tensorflow/contrib/lite/string_util.h"
@@ -81,7 +81,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* label = GetOutput(context, node, 0);
   TfLiteTensor* weight = GetOutput(context, node, 1);
 
-  std::map<int64, int> feature_id_counts;
+  std::map<int64_t, int> feature_id_counts;
   for (int i = 0; i < num_strings; i++) {
     // Use fingerprint of feature name as id.
     auto strref = tflite::GetString(input, i);
@@ -91,10 +91,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       continue;
     }
 
-    int64 feature_id =
+    int64_t feature_id =
         ::util::Fingerprint64(strref.str, strref.len) % kMaxDimension;
-
-    label->data.i32[i] = static_cast<int32>(feature_id);
+    label->data.i32[i] = static_cast<int32_t>(feature_id);
     weight->data.f[i] =
         std::count(strref.str, strref.str + strref.len, ' ') + 1;
   }
diff --git a/tensorflow/contrib/lite/models/smartreply/ops/normalize.cc b/tensorflow/contrib/lite/models/smartreply/ops/normalize.cc
index d0dc2a35a7cc527bef0b24508f207da8eec17fc0..c55ac9f52f7293a8ba5baf17f2052e11a7422074 100644
--- a/tensorflow/contrib/lite/models/smartreply/ops/normalize.cc
+++ b/tensorflow/contrib/lite/models/smartreply/ops/normalize.cc
@@ -21,7 +21,10 @@ limitations under the License.
 // Output:
 //     Output[0]: Normalized sentence. string[1]
 //
-#include "absl/strings/ascii.h"
+
+#include <algorithm>
+#include <string>
+
 #include "absl/strings/str_cat.h"
 #include "absl/strings/strip.h"
 #include "re2/re2.h"
@@ -50,7 +53,7 @@ const std::map<string, string>* kRegexTransforms =
 
 static const char kStartToken[] = "<S>";
 static const char kEndToken[] = "<E>";
-static const int32 kMaxInputChars = 300;
+static const int32_t kMaxInputChars = 300;
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   tflite::StringRef input = tflite::GetString(GetInput(context, node, 0), 0);
diff --git a/tensorflow/contrib/lite/models/smartreply/predictor.cc b/tensorflow/contrib/lite/models/smartreply/predictor.cc
index a28222213ea8c66a1e9288ba9ae06aea7653f108..6da5cc8eecc0920850f666b0992c4d9598c55b6c 100644
--- a/tensorflow/contrib/lite/models/smartreply/predictor.cc
+++ b/tensorflow/contrib/lite/models/smartreply/predictor.cc
@@ -30,7 +30,7 @@ namespace custom {
 namespace smartreply {
 
 // Split sentence into segments (using punctuation).
-std::vector<string> SplitSentence(const string& input) {
+std::vector<std::string> SplitSentence(const std::string& input) {
   string result(input);
 
   RE2::GlobalReplace(&result, "([?.!,])+", " \\1");
@@ -38,12 +38,13 @@ std::vector<string> SplitSentence(const string& input) {
   RE2::GlobalReplace(&result, "[ ]+", " ");
   RE2::GlobalReplace(&result, "\t+$", "");
 
-  return strings::Split(result, '\t');
+  return absl::StrSplit(result, '\t');
 }
 
 // Predict with TfLite model.
-void ExecuteTfLite(const string& sentence, ::tflite::Interpreter* interpreter,
-                   std::map<string, float>* response_map) {
+void ExecuteTfLite(const std::string& sentence,
+                   ::tflite::Interpreter* interpreter,
+                   std::map<std::string, float>* response_map) {
   {
     TfLiteTensor* input = interpreter->tensor(interpreter->inputs()[0]);
     tflite::DynamicBuffer buf;
@@ -67,8 +68,8 @@ void ExecuteTfLite(const string& sentence, ::tflite::Interpreter* interpreter,
 }
 
 void GetSegmentPredictions(
-    const std::vector<string>& input, const ::tflite::FlatBufferModel& model,
-    const SmartReplyConfig& config,
+    const std::vector<std::string>& input,
+    const ::tflite::FlatBufferModel& model, const SmartReplyConfig& config,
     std::vector<PredictorResponse>* predictor_responses) {
   // Initialize interpreter
   std::unique_ptr<::tflite::Interpreter> interpreter;
@@ -82,10 +83,10 @@ void GetSegmentPredictions(
   }
 
   // Execute Tflite Model
-  std::map<string, float> response_map;
-  std::vector<string> sentences;
-  for (const string& str : input) {
-    std::vector<string> splitted_str = SplitSentence(str);
+  std::map<std::string, float> response_map;
+  std::vector<std::string> sentences;
+  for (const std::string& str : input) {
+    std::vector<std::string> splitted_str = SplitSentence(str);
     sentences.insert(sentences.end(), splitted_str.begin(), splitted_str.end());
   }
   for (const auto& sentence : sentences) {
diff --git a/tensorflow/contrib/lite/models/smartreply/predictor.h b/tensorflow/contrib/lite/models/smartreply/predictor.h
index 3b9a2b32e17f93f7ebbf35e77ec1e238fe14b020..d17323a3f9a0ea80ad5e215b0a4700e625d0c590 100644
--- a/tensorflow/contrib/lite/models/smartreply/predictor.h
+++ b/tensorflow/contrib/lite/models/smartreply/predictor.h
@@ -34,7 +34,7 @@ struct SmartReplyConfig;
 // With a given string as input, predict the response with a Tflite model.
 // When config.backoff_response is not empty, predictor_responses will be filled
 // with messagees from backoff response.
-void GetSegmentPredictions(const std::vector<string>& input,
+void GetSegmentPredictions(const std::vector<std::string>& input,
                            const ::tflite::FlatBufferModel& model,
                            const SmartReplyConfig& config,
                            std::vector<PredictorResponse>* predictor_responses);
@@ -43,17 +43,17 @@ void GetSegmentPredictions(const std::vector<string>& input,
 // It includes messages, and confidence.
 class PredictorResponse {
  public:
-  PredictorResponse(const string& response_text, float score) {
+  PredictorResponse(const std::string& response_text, float score) {
     response_text_ = response_text;
     prediction_score_ = score;
   }
 
   // Accessor methods.
-  const string& GetText() const { return response_text_; }
+  const std::string& GetText() const { return response_text_; }
   float GetScore() const { return prediction_score_; }
 
  private:
-  string response_text_ = "";
+  std::string response_text_ = "";
   float prediction_score_ = 0.0;
 };
 
@@ -65,9 +65,9 @@ struct SmartReplyConfig {
   float backoff_confidence;
   // Backoff responses are used when predicted responses cannot fulfill the
   // list.
-  const std::vector<string>& backoff_responses;
+  const std::vector<std::string>& backoff_responses;
 
-  SmartReplyConfig(std::vector<string> backoff_responses)
+  SmartReplyConfig(std::vector<std::string> backoff_responses)
       : num_response(kDefaultNumResponse),
         backoff_confidence(kDefaultBackoffConfidence),
         backoff_responses(backoff_responses) {}
diff --git a/tensorflow/contrib/lite/models/smartreply/predictor_test.cc b/tensorflow/contrib/lite/models/smartreply/predictor_test.cc
index 2fa9923bc93d7e559884b6880187637b78f4b217..97d3c650e21c3cb4bef1db09df93f4bf24f38ba5 100644
--- a/tensorflow/contrib/lite/models/smartreply/predictor_test.cc
+++ b/tensorflow/contrib/lite/models/smartreply/predictor_test.cc
@@ -18,12 +18,12 @@ limitations under the License.
 #include <fstream>
 #include <unordered_set>
 
-#include "base/logging.h"
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_split.h"
 #include "tensorflow/contrib/lite/models/test_utils.h"
+#include "tensorflow/contrib/lite/string_util.h"
 
 namespace tflite {
 namespace custom {
@@ -65,7 +65,6 @@ TEST_F(PredictorTest, GetSegmentPredictions) {
 
   float max = 0;
   for (const auto &item : predictions) {
-    LOG(INFO) << "Response: " << item.GetText();
     if (item.GetScore() > max) {
       max = item.GetScore();
     }
@@ -86,7 +85,6 @@ TEST_F(PredictorTest, TestTwoSentences) {
 
   float max = 0;
   for (const auto &item : predictions) {
-    LOG(INFO) << "Response: " << item.GetText();
     if (item.GetScore() > max) {
       max = item.GetScore();
     }
@@ -119,7 +117,7 @@ TEST_F(PredictorTest, BatchTest) {
   string line;
   std::ifstream fin(StrCat(TestDataPath(), "/", kSamples));
   while (std::getline(fin, line)) {
-    const std::vector<string> &fields = strings::Split(line, '\t');
+    const std::vector<string> fields = absl::StrSplit(line, '\t');
     if (fields.empty()) {
       continue;
     }
@@ -139,9 +137,8 @@ TEST_F(PredictorTest, BatchTest) {
                                   fields.begin() + 1, fields.end())));
   }
 
-  LOG(INFO) << "Responses: " << total_responses << " / " << total_items;
-  LOG(INFO) << "Triggers: " << total_triggers << " / " << total_items;
   EXPECT_EQ(total_triggers, total_items);
+  EXPECT_GE(total_responses, total_triggers);
 }
 
 }  // namespace
diff --git a/tensorflow/contrib/lite/models/speech_terse_am_model_test.cc b/tensorflow/contrib/lite/models/speech_asr_am_model_test.cc
similarity index 93%
rename from tensorflow/contrib/lite/models/speech_terse_am_model_test.cc
rename to tensorflow/contrib/lite/models/speech_asr_am_model_test.cc
index 30d89a135403db2ef6e4533ddcc321206bf8bd5e..bf95b313f31c2f76046727353a9a7b0658dbf067 100644
--- a/tensorflow/contrib/lite/models/speech_terse_am_model_test.cc
+++ b/tensorflow/contrib/lite/models/speech_asr_am_model_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-// Unit test for speech TERSE AM model using TFLite Ops.
+// Unit test for speech ASR AM model using TFLite Ops.
 
 #include <string.h>
 
@@ -45,10 +45,10 @@ constexpr int kLstmLayer5OutputStateTensor = 103;
 constexpr int kLstmLayer5CellStateTensor = 104;
 constexpr int kModelOutputTensor = 109;
 
-TEST(SpeechTerseAm, RandomIOTest) {
+TEST(SpeechAsrAm, RandomIOTest) {
   // Read the model.
   string tflite_file_path =
-      file::JoinPath(TestDataPath(), "speech_terse_am_model.tflite");
+      file::JoinPath(TestDataPath(), "speech_asr_am_model.tflite");
   auto model = FlatBufferModel::BuildFromFile(tflite_file_path.c_str());
   CHECK(model) << "Failed to mmap model " << tflite_file_path;
 
@@ -62,13 +62,13 @@ TEST(SpeechTerseAm, RandomIOTest) {
   // Load the input frames.
   Frames input_frames;
   const string input_file_path =
-      file::JoinPath(TestDataPath(), "speech_terse_am_model_in.csv");
+      file::JoinPath(TestDataPath(), "speech_asr_am_model_in.csv");
   ReadFrames(input_file_path, &input_frames);
 
   // Load the golden output results.
   Frames output_frames;
   const string output_file_path =
-      file::JoinPath(TestDataPath(), "speech_terse_am_model_out.csv");
+      file::JoinPath(TestDataPath(), "speech_asr_am_model_out.csv");
   ReadFrames(output_file_path, &output_frames);
 
   const int speech_batch_size =
diff --git a/tensorflow/contrib/lite/models/speech_asr_lm_model_test.cc b/tensorflow/contrib/lite/models/speech_asr_lm_model_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..53f2b66da492f8fe56fa9e234f0951cf61c35037
--- /dev/null
+++ b/tensorflow/contrib/lite/models/speech_asr_lm_model_test.cc
@@ -0,0 +1,122 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for speech ASR LM model using TFLite Ops.
+
+#include <string.h>
+
+#include <memory>
+#include <string>
+
+#include "base/logging.h"
+#include "file/base/path.h"
+#include "testing/base/public/googletest.h"
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/models/test_utils.h"
+
+namespace tflite {
+namespace models {
+
+constexpr int kModelInput1Tensor = 0;
+constexpr int kModelInput2Tensor = 66;
+constexpr int kLstmLayer1OutputStateTensor = 21;
+constexpr int kLstmLayer1CellStateTensor = 22;
+constexpr int kLstmLayer2OutputStateTensor = 42;
+constexpr int kLstmLayer2CellStateTensor = 43;
+constexpr int kLstmLayer3OutputStateTensor = 63;
+constexpr int kLstmLayer3CellStateTensor = 64;
+constexpr int kModelOutputTensor = 75;
+
+static void ClearLstmStates(Interpreter* interpreter) {
+  memset(interpreter->tensor(kLstmLayer1OutputStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer1OutputStateTensor)->bytes);
+  memset(interpreter->tensor(kLstmLayer1CellStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer1CellStateTensor)->bytes);
+
+  memset(interpreter->tensor(kLstmLayer2OutputStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer2OutputStateTensor)->bytes);
+  memset(interpreter->tensor(kLstmLayer2CellStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer2CellStateTensor)->bytes);
+
+  memset(interpreter->tensor(kLstmLayer3OutputStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer3OutputStateTensor)->bytes);
+  memset(interpreter->tensor(kLstmLayer3CellStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer3CellStateTensor)->bytes);
+}
+
+TEST(SpeechAsrLm, EndToEndTest) {
+  // Read the model.
+  string tflite_file_path =
+      file::JoinPath(TestDataPath(), "speech_asr_lm_model.tflite");
+  auto model = FlatBufferModel::BuildFromFile(tflite_file_path.c_str());
+  CHECK(model) << "Failed to mmap model " << tflite_file_path;
+
+  // Initialize the interpreter.
+  ops::builtin::BuiltinOpResolver builtins;
+  std::unique_ptr<Interpreter> interpreter;
+  InterpreterBuilder(*model, builtins)(&interpreter);
+  CHECK(interpreter != nullptr);
+  interpreter->AllocateTensors();
+
+  // Load the input frames.
+  Frames input_frames;
+  const string input_file_path =
+      file::JoinPath(TestDataPath(), "speech_asr_lm_model_in.csv");
+  ReadFrames(input_file_path, &input_frames);
+
+  // Load the golden output results.
+  Frames output_frames;
+  const string output_file_path =
+      file::JoinPath(TestDataPath(), "speech_asr_lm_model_out.csv");
+  ReadFrames(output_file_path, &output_frames);
+
+  CHECK_EQ(interpreter->tensor(kModelInput1Tensor)->dims->size, 1);
+  const int input1_size =
+      interpreter->tensor(kModelInput1Tensor)->dims->data[0];
+  CHECK_EQ(input1_size, 1);
+  CHECK_EQ(interpreter->tensor(kModelInput2Tensor)->dims->size, 1);
+  const int output_size =
+      interpreter->tensor(kModelOutputTensor)->dims->data[0];
+  CHECK_EQ(output_size, 1);
+
+  int* input_lookup_ptr = interpreter->tensor(kModelInput1Tensor)->data.i32;
+  int* output_lookup_ptr = interpreter->tensor(kModelInput2Tensor)->data.i32;
+  float* output_ptr = interpreter->tensor(kModelOutputTensor)->data.f;
+
+
+  for (int i = 0; i < input_frames.size(); i++) {
+    float output_score = 0.0f;
+    // Reset LSTM states for each sequence.
+    ClearLstmStates(interpreter.get());
+    // For subsequent inputs feed them sequentially, one-by-one.
+    for (int k = 1; k < input_frames[i].size(); k++) {
+      // Feed the inputs to model.
+      input_lookup_ptr[0] = static_cast<int32>(input_frames[i][k - 1]);
+      output_lookup_ptr[0] = static_cast<int32>(input_frames[i][k]);
+      // Run the model.
+      interpreter->Invoke();
+      // Sum up the outputs.
+      output_score += output_ptr[0];
+    }
+    // Validate the output.
+    ASSERT_NEAR(output_score, output_frames[i][0], 1.4e-5);
+  }
+}
+
+}  // namespace models
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/models/speech_hotword_model_test.cc b/tensorflow/contrib/lite/models/speech_hotword_model_test.cc
index 0b8266447adf758184fe3b1ad6a77f1ac6045193..f69cae8d2cb08678f9eec8c9b9d653cfce55bd2e 100644
--- a/tensorflow/contrib/lite/models/speech_hotword_model_test.cc
+++ b/tensorflow/contrib/lite/models/speech_hotword_model_test.cc
@@ -73,8 +73,8 @@ void RunTest(int model_input_tensor, int svdf_layer_state_tensor,
   float* output_ptr = interpreter->tensor(model_output_tensor)->data.f;
 
   // The first layer (SVDF) input size is 40 (speech_input_size). Each speech
-  // input frames for this model is 1280 floats, which can be fed to input in a
-  // sequence of size 32 (input_sequence_size).
+  // input frames for this model is 1600 floats, which can be fed to input in a
+  // sequence of size 40 (input_sequence_size).
   for (int i = 0; i < TestInputSize(input_frames); i++) {
     int frame_ptr = 0;
     for (int s = 0; s < input_sequence_size; s++) {
diff --git a/tensorflow/contrib/lite/models/testdata/g3doc/README.md b/tensorflow/contrib/lite/models/testdata/g3doc/README.md
index 77fe8b3f84f7a3b0a3c9433b79b7c4ba7c5adac7..46b24248f002b8a1a30a2ac614c3874dfd2207db 100644
--- a/tensorflow/contrib/lite/models/testdata/g3doc/README.md
+++ b/tensorflow/contrib/lite/models/testdata/g3doc/README.md
@@ -61,6 +61,20 @@ the corresponding parameters as shown in the figure.
 
 ![asr_am_model](asr_am.svg "ASR AM model")
 
+### Automatic Speech Recognizer (ASR) Language Model (LM)
+
+The language model for automatic speech recognition is the neural network model
+for predicting the probability of a word given previous words in a sentence.
+It generates posterior probabilities of the next word based from a sequence of
+words. The words are encoded as indices in a fixed size dictionary.
+The model has two inputs both of size one (integer): the current word index and
+next word index, an output size of one (float): the log probability. It consits
+of three embedding layer, three LSTM layers, followed by a multiplication, a
+fully connected layers and an addition.
+The corresponding parameters as shown in the figure.
+
+![asr_lm_model](asr_lm.svg "ASR LM model")
+
 ## Speech models test input/output generation
 
 As mentioned above the input to models are generated from a pre-processing
@@ -72,25 +86,34 @@ same input.
 
 ### Models:
 
-[Speech hotword model (Svdf rank=1)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank1_2017_11_14.tflite)
+[Speech hotword model (Svdf
+rank=1)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank1_2017_11_14.tflite)
 
-[Speech hotword model (Svdf rank=2)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank2_2017_11_14.tflite)
+[Speech hotword model (Svdf
+rank=2)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank2_2017_11_14.tflite)
 
-[Speaker-id model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_speakerid_model_2017_11_14.tflite)
+[Speaker-id
+model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_speakerid_model_2017_11_14.tflite)
 
-[TTS model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_tts_model_2017_11_14.tflite)
+[TTS
+model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_tts_model_2017_11_14.tflite)
 
-[ASR AM model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_terse_am_model_2017_11_14.tflite)
+[ASR AM
+model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_terse_am_model_2017_11_14.tflite)
 
 ### Test benches
 
-[Speech hotword model test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_hotword_model_test.cc)
+[Speech hotword model
+test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_hotword_model_test.cc)
 
-[Speaker-id model test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_speakerid_model_test.cc)
+[Speaker-id model
+test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_speakerid_model_test.cc)
 
-[TTS model test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_tts_model_test.cc)
+[TTS model
+test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_tts_model_test.cc)
 
-[ASR AM model test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_terse_am_model_test.cc)
+[ASR AM model
+test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_terse_am_model_test.cc)
 
 ## Android Support
 The models have been tested on Android phones, using the following tests:
diff --git a/tensorflow/contrib/lite/models/testdata/g3doc/asr_am.svg b/tensorflow/contrib/lite/models/testdata/g3doc/asr_am.svg
index ca9655642211bbb68587fed84ddc6951f5d35e79..9f841c219b1ff247231939106d0a6ba47bf6d305 100644
--- a/tensorflow/contrib/lite/models/testdata/g3doc/asr_am.svg
+++ b/tensorflow/contrib/lite/models/testdata/g3doc/asr_am.svg
@@ -1,4 +1,4 @@
 <?xml version="1.0" standalone="yes"?>
 
-<svg version="1.1" viewBox="0.0 0.0 960.0 720.0" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><clipPath id="p.0"><path d="m0 0l960.0 0l0 720.0l-960.0 0l0 -720.0z" clip-rule="nonzero"></path></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l960.0 0l0 720.0l-960.0 0z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m392.0 30.700842l166.01575 0l0 42.110237l-166.01575 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m392.0 30.700842l166.01575 0l0 42.110237l-166.01575 0z" fill-rule="evenodd"></path><path fill="#000000" d="m404.43954 57.620842l0 -13.59375l1.8125 0l0 13.59375l-1.8125 0zm4.6676636 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375732 3.78125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm15.313202 4.875l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.5788574 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm9.897858 5.5q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.353302 -6.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.254181 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm23.074646 -2.125l-8.96875 0l0 -1.5625l8.96875 0l0 1.5625zm0 4.125l-8.96875 0l0 -1.546875l8.96875 0l0 1.546875zm6.8439026 0.28125l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm19.141296 1.984375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm1.5944824 -5.09375q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm10.219482 10.703125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m359.0 102.02362l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m359.0 102.02362l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m401.82367 128.94362l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844482 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.8803406 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm21.212677 0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.918396 4.0q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.572052 -7.59375l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm19.141357 1.984375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm1.5944519 -5.09375q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.016357 6.703125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.9844055 0l-3.3906555 4.640625l3.6562805 5.21875l-2.0469055 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm9.9687805 -3.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375671 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656982 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm10.219421 10.703125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" d="m395.9714 154.72487l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm8.844452 4.875l0 -9.859375l1.5 0l0 1.5q0.578125 -1.046875 1.0625 -1.375q0.484375 -0.34375 1.078125 -0.34375q0.84375 0 1.71875 0.546875l-0.578125 1.546875q-0.609375 -0.359375 -1.234375 -0.359375q-0.546875 0 -0.984375 0.328125q-0.421875 0.328125 -0.609375 0.90625q-0.28125 0.890625 -0.28125 1.953125l0 5.15625l-1.671875 0zm5.603302 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 -6.734375l0 -1.9375l1.65625 0l0 1.9375l-1.65625 0zm-2.125 15.484375l0.3125 -1.421875q0.5 0.125 0.796875 0.125q0.515625 0 0.765625 -0.34375q0.25 -0.328125 0.25 -1.6875l0 -10.359375l1.65625 0l0 10.390625q0 1.828125 -0.46875 2.546875q-0.59375 0.921875 -2.0 0.921875q-0.671875 0 -1.3125 -0.171875zm13.019806 -7.0l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547607 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426636 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.5042114 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281952 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm14.887146 -2.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2542114 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.7187805 0.21875q-0.40625 1.5 -1.5156555 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.2344055 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.3437805 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm9.578827 -2.078125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm0 7.953125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm18.210388 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm1.5944824 -5.09375q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656921 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m385.80054 657.01575l180.0 0l0 42.11023l-180.0 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m385.80054 657.01575l180.0 0l0 42.11023l-180.0 0z" fill-rule="evenodd"></path><path fill="#000000" d="m402.3206 677.3107q0 -3.390625 1.8125 -5.296875q1.828125 -1.921875 4.703125 -1.921875q1.875 0 3.390625 0.90625q1.515625 0.890625 2.296875 2.5q0.796875 1.609375 0.796875 3.65625q0 2.0625 -0.84375 3.703125q-0.828125 1.625 -2.359375 2.46875q-1.53125 0.84375 -3.296875 0.84375q-1.921875 0 -3.4375 -0.921875q-1.5 -0.9375 -2.28125 -2.53125q-0.78125 -1.609375 -0.78125 -3.40625zm1.859375 0.03125q0 2.453125 1.3125 3.875q1.328125 1.40625 3.3125 1.40625q2.03125 0 3.34375 -1.421875q1.3125 -1.4375 1.3125 -4.0625q0 -1.65625 -0.5625 -2.890625q-0.546875 -1.234375 -1.640625 -1.921875q-1.078125 -0.6875 -2.421875 -0.6875q-1.90625 0 -3.28125 1.3125q-1.375 1.3125 -1.375 4.390625zm19.433289 6.59375l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.5788574 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5270386 5.28125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm15.313232 4.875l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.578827 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm9.897858 5.5q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.353302 -6.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2542114 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm23.074646 -2.125l-8.96875 0l0 -1.5625l8.96875 0l0 1.5625zm0 4.125l-8.96875 0l0 -1.546875l8.96875 0l0 1.546875zm12.187622 3.875l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm13.797607 3.171875l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm3.1569824 5.609375l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m475.09448 161.01575l0 24.724411" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m475.09448 161.01575l0 18.724411" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m473.44275 179.74016l1.6517334 4.538101l1.6517334 -4.538101z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m475.09448 244.72906l0 25.29132" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m475.09448 244.72906l0 19.291351" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m473.44275 264.02042l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m475.00787 72.81108l0.09448242 29.196846" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m475.00787 72.81108l0.07510376 23.196877" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m473.4312 96.013306l1.6664124 4.5327225l1.6370544 -4.543419z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m359.0 526.4199l232.18896 0l0 42.11029l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m359.0 526.4199l232.18896 0l0 42.11029l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m372.43524 553.33997l0 -13.59375l9.171875 0l0 1.59375l-7.375 0l0 4.21875l6.375 0l0 1.609375l-6.375 0l0 6.171875l-1.796875 0zm17.53659 0l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm3.8913574 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.144806 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.0979614 3.796875l-0.171875 -1.5625q0.546875 0.140625 0.953125 0.140625q0.546875 0 0.875 -0.1875q0.34375 -0.1875 0.5625 -0.515625q0.15625 -0.25 0.5 -1.25q0.046875 -0.140625 0.15625 -0.40625l-3.734375 -9.875l1.796875 0l2.046875 5.71875q0.40625 1.078125 0.71875 2.28125q0.28125 -1.15625 0.6875 -2.25l2.09375 -5.75l1.671875 0l-3.75 10.03125q-0.59375 1.625 -0.9375 2.234375q-0.4375 0.828125 -1.015625 1.203125q-0.578125 0.390625 -1.375 0.390625q-0.484375 0 -1.078125 -0.203125zm19.328125 -8.5625l1.796875 0.453125q-0.5625 2.21875 -2.03125 3.390625q-1.46875 1.15625 -3.59375 1.15625q-2.203125 0 -3.578125 -0.890625q-1.375 -0.90625 -2.09375 -2.59375q-0.71875 -1.703125 -0.71875 -3.65625q0 -2.125 0.796875 -3.703125q0.8125 -1.578125 2.3125 -2.390625q1.5 -0.828125 3.296875 -0.828125q2.046875 0 3.4375 1.046875q1.390625 1.03125 1.9375 2.90625l-1.765625 0.421875q-0.46875 -1.484375 -1.375 -2.15625q-0.90625 -0.6875 -2.265625 -0.6875q-1.5625 0 -2.625 0.75q-1.046875 0.75 -1.484375 2.03125q-0.421875 1.265625 -0.421875 2.609375q0 1.734375 0.5 3.03125q0.515625 1.28125 1.578125 1.921875q1.078125 0.640625 2.3125 0.640625q1.515625 0 2.5625 -0.859375q1.046875 -0.875 1.421875 -2.59375zm2.926056 -0.15625q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375702 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm17.125732 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547577 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm8.277069 -1.671875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.500702 5.875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm17.637146 8.921875q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.556427 -7.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375732 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656982 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.016357 6.703125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm15.328125 0l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm13.797546 3.171875l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm3.1569824 5.609375l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m475.09448 413.32974l0 24.125977" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m475.09448 413.3297l0 18.126007" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m473.44275 431.45572l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m475.09448 329.01575l0 25.322845" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m475.09448 329.01575l0 19.322845" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m473.44275 348.3386l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m475.09448 496.44235l0 29.984283" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m475.09448 496.44238l0 23.984253" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m473.44275 520.42664l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m359.0 185.73694l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m359.0 185.73694l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m401.82367 212.65694l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844482 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.8803406 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm23.697052 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm10.434021 5.609375q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.556427 -7.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375732 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656952 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.016357 6.703125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.9844055 0l-3.3906555 4.640625l3.6562805 5.21875l-2.0469055 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm9.9687805 -3.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375671 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656982 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm10.219421 10.703125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" d="m395.9714 238.43819l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm8.844452 4.875l0 -9.859375l1.5 0l0 1.5q0.578125 -1.046875 1.0625 -1.375q0.484375 -0.34375 1.078125 -0.34375q0.84375 0 1.71875 0.546875l-0.578125 1.546875q-0.609375 -0.359375 -1.234375 -0.359375q-0.546875 0 -0.984375 0.328125q-0.421875 0.328125 -0.609375 0.90625q-0.28125 0.890625 -0.28125 1.953125l0 5.15625l-1.671875 0zm5.603302 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 -6.734375l0 -1.9375l1.65625 0l0 1.9375l-1.65625 0zm-2.125 15.484375l0.3125 -1.421875q0.5 0.125 0.796875 0.125q0.515625 0 0.765625 -0.34375q0.25 -0.328125 0.25 -1.6875l0 -10.359375l1.65625 0l0 10.390625q0 1.828125 -0.46875 2.546875q-0.59375 0.921875 -2.0 0.921875q-0.671875 0 -1.3125 -0.171875zm13.019806 -7.0l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547607 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426636 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.5042114 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281952 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm14.887146 -2.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2542114 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.7187805 0.21875q-0.40625 1.5 -1.5156555 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.2344055 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.3437805 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm9.578827 -2.078125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm0 7.953125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm18.210388 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm1.5944824 -5.09375q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656921 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m359.0 270.02362l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m359.0 270.02362l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m401.82367 296.94363l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844482 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.8803406 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm23.697052 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm10.434021 5.609375q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.556427 -7.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375732 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656952 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.016357 6.703125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.9844055 0l-3.3906555 4.640625l3.6562805 5.21875l-2.0469055 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm9.9687805 -3.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375671 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656982 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm10.219421 10.703125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" d="m395.9714 322.72488l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm8.844452 4.875l0 -9.859375l1.5 0l0 1.5q0.578125 -1.046875 1.0625 -1.375q0.484375 -0.34375 1.078125 -0.34375q0.84375 0 1.71875 0.546875l-0.578125 1.546875q-0.609375 -0.359375 -1.234375 -0.359375q-0.546875 0 -0.984375 0.328125q-0.421875 0.328125 -0.609375 0.90625q-0.28125 0.890625 -0.28125 1.953125l0 5.15625l-1.671875 0zm5.603302 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 -6.734375l0 -1.9375l1.65625 0l0 1.9375l-1.65625 0zm-2.125 15.484375l0.3125 -1.421875q0.5 0.125 0.796875 0.125q0.515625 0 0.765625 -0.34375q0.25 -0.328125 0.25 -1.6875l0 -10.359375l1.65625 0l0 10.390625q0 1.828125 -0.46875 2.546875q-0.59375 0.921875 -2.0 0.921875q-0.671875 0 -1.3125 -0.171875zm13.019806 -7.0l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547607 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426636 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.5042114 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281952 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm14.887146 -2.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2542114 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.7187805 0.21875q-0.40625 1.5 -1.5156555 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.2344055 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.3437805 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm9.578827 -2.078125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm0 7.953125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm18.210388 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm1.5944824 -5.09375q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656921 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m359.0 354.33762l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m359.0 354.33762l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m401.82367 381.2576l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844482 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.8803406 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm23.697052 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm10.434021 5.609375q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.556427 -7.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375732 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656952 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.016357 6.703125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.9844055 0l-3.3906555 4.640625l3.6562805 5.21875l-2.0469055 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm9.9687805 -3.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375671 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656982 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm10.219421 10.703125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" d="m395.9714 407.03885l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm8.844452 4.875l0 -9.859375l1.5 0l0 1.5q0.578125 -1.046875 1.0625 -1.375q0.484375 -0.34375 1.078125 -0.34375q0.84375 0 1.71875 0.546875l-0.578125 1.546875q-0.609375 -0.359375 -1.234375 -0.359375q-0.546875 0 -0.984375 0.328125q-0.421875 0.328125 -0.609375 0.90625q-0.28125 0.890625 -0.28125 1.953125l0 5.15625l-1.671875 0zm5.603302 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 -6.734375l0 -1.9375l1.65625 0l0 1.9375l-1.65625 0zm-2.125 15.484375l0.3125 -1.421875q0.5 0.125 0.796875 0.125q0.515625 0 0.765625 -0.34375q0.25 -0.328125 0.25 -1.6875l0 -10.359375l1.65625 0l0 10.390625q0 1.828125 -0.46875 2.546875q-0.59375 0.921875 -2.0 0.921875q-0.671875 0 -1.3125 -0.171875zm13.019806 -7.0l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547607 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426636 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.5042114 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281952 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm14.887146 -2.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2542114 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.7187805 0.21875q-0.40625 1.5 -1.5156555 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.2344055 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.3437805 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm9.578827 -2.078125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm0 7.953125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm18.210388 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm1.5944824 -5.09375q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656921 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m359.0 437.45026l232.18896 0l0 58.992096l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m359.0 437.45026l232.18896 0l0 58.992096l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m401.82367 464.37024l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844482 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.8803406 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm23.697052 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm10.434021 5.609375q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.556427 -7.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375732 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656952 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.016357 6.703125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.9844055 0l-3.3906555 4.640625l3.6562805 5.21875l-2.0469055 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm9.9687805 -3.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375671 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656982 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm10.219421 10.703125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" d="m395.9714 490.1515l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm8.844452 4.875l0 -9.859375l1.5 0l0 1.5q0.578125 -1.046875 1.0625 -1.375q0.484375 -0.34375 1.078125 -0.34375q0.84375 0 1.71875 0.546875l-0.578125 1.546875q-0.609375 -0.359375 -1.234375 -0.359375q-0.546875 0 -0.984375 0.328125q-0.421875 0.328125 -0.609375 0.90625q-0.28125 0.890625 -0.28125 1.953125l0 5.15625l-1.671875 0zm5.603302 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 -6.734375l0 -1.9375l1.65625 0l0 1.9375l-1.65625 0zm-2.125 15.484375l0.3125 -1.421875q0.5 0.125 0.796875 0.125q0.515625 0 0.765625 -0.34375q0.25 -0.328125 0.25 -1.6875l0 -10.359375l1.65625 0l0 10.390625q0 1.828125 -0.46875 2.546875q-0.59375 0.921875 -2.0 0.921875q-0.671875 0 -1.3125 -0.171875zm13.019806 -7.0l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547607 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426636 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.5042114 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281952 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm14.887146 -2.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2542114 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.7187805 0.21875q-0.40625 1.5 -1.5156555 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.2344055 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.3437805 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm9.578827 -2.078125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm0 7.953125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm18.210388 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm1.5944824 -5.09375q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656921 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m405.46194 594.54596l140.06302 0l0 42.11023l-140.06302 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m405.46194 594.54596l140.06302 0l0 42.11023l-140.06302 0z" fill-rule="evenodd"></path><path fill="#000000" d="m442.13754 617.09094l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm12.209198 -0.546875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.688232 4.921875l0 -8.546875l-1.484375 0l0 -1.3125l1.484375 0l0 -1.046875q0 -0.984375 0.171875 -1.46875q0.234375 -0.65625 0.84375 -1.046875q0.609375 -0.40625 1.703125 -0.40625q0.703125 0 1.5624695 0.15625l-0.25 1.46875q-0.5155945 -0.09375 -0.9843445 -0.09375q-0.765625 0 -1.078125 0.328125q-0.3125 0.3125 -0.3125 1.203125l0 0.90625l1.921875 0l0 1.3125l-1.921875 0l0 8.546875l-1.65625 0zm8.433289 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5270386 1.5l0 -9.859375l1.5 0l0 1.390625q0.453125 -0.71875 1.21875 -1.15625q0.78125 -0.453125 1.765625 -0.453125q1.09375 0 1.796875 0.453125q0.703125 0.453125 0.984375 1.28125q1.171875 -1.734375 3.046875 -1.734375q1.46875 0 2.25 0.8125q0.796875 0.8125 0.796875 2.5l0 6.765625l-1.671875 0l0 -6.203125q0 -1.0 -0.15625 -1.4375q-0.15625 -0.453125 -0.59375 -0.71875q-0.421875 -0.265625 -1.0 -0.265625q-1.03125 0 -1.71875 0.6875q-0.6875 0.6875 -0.6875 2.21875l0 5.71875l-1.671875 0l0 -6.40625q0 -1.109375 -0.40625 -1.65625q-0.40625 -0.5625 -1.34375 -0.5625q-0.703125 0 -1.3125 0.375q-0.59375 0.359375 -0.859375 1.078125q-0.265625 0.71875 -0.265625 2.0625l0 5.109375l-1.671875 0zm21.978302 -1.21875q-0.9375 0.796875 -1.796875 1.125q-0.859375 0.3125 -1.84375 0.3125q-1.609375 0 -2.484375 -0.78125q-0.875 -0.796875 -0.875 -2.03125q0 -0.734375 0.328125 -1.328125q0.328125 -0.59375 0.859375 -0.953125q0.53125 -0.359375 1.203125 -0.546875q0.5 -0.140625 1.484375 -0.25q2.03125 -0.25 2.984375 -0.578125q0 -0.34375 0 -0.4375q0 -1.015625 -0.46875 -1.4375q-0.640625 -0.5625 -1.90625 -0.5625q-1.171875 0 -1.734375 0.40625q-0.5625 0.40625 -0.828125 1.46875l-1.640625 -0.234375q0.234375 -1.046875 0.734375 -1.6875q0.515625 -0.640625 1.46875 -0.984375q0.96875 -0.359375 2.25 -0.359375q1.265625 0 2.046875 0.296875q0.78125 0.296875 1.15625 0.75q0.375 0.453125 0.515625 1.140625q0.09375 0.421875 0.09375 1.53125l0 2.234375q0 2.328125 0.09375 2.953125q0.109375 0.609375 0.4375 1.171875l-1.75 0q-0.265625 -0.515625 -0.328125 -1.21875zm-0.140625 -3.71875q-0.90625 0.359375 -2.734375 0.625q-1.03125 0.140625 -1.453125 0.328125q-0.421875 0.1875 -0.65625 0.546875q-0.234375 0.359375 -0.234375 0.796875q0 0.671875 0.5 1.125q0.515625 0.4375 1.484375 0.4375q0.96875 0 1.71875 -0.421875q0.75 -0.4375 1.109375 -1.15625q0.265625 -0.578125 0.265625 -1.671875l0 -0.609375zm2.9694824 4.9375l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m475.09448 568.5302l0.40945435 26.015747" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m475.09448 568.5302l0.31506348 20.01648" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m473.758 588.5727l1.7229309 4.5115356l1.5801086 -4.5635376z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m475.49344 636.6562l0.31497192 20.346436" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m475.49344 636.6562l0.22210693 14.347168" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m474.064 651.02893l1.7217712 4.511963l1.5812988 -4.5631104z" fill-rule="evenodd"></path></g></svg>
+<svg version="1.1" viewBox="0.0 0.0 703.0 722.8005249343832" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><clipPath id="p.0"><path d="m0 0l703.0 0l0 722.80054l-703.0 0l0 -722.80054z" clip-rule="nonzero"></path></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l703.0 0l0 722.80054l-703.0 0z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m256.0 30.700842l166.01575 0l0 42.110237l-166.01575 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m256.0 30.700842l166.01575 0l0 42.110237l-166.01575 0z" fill-rule="evenodd"></path><path fill="#000000" d="m268.43954 57.620842l0 -13.59375l1.8125 0l0 13.59375l-1.8125 0zm4.6676636 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375732 3.78125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm15.313202 4.875l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.5788574 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm9.897858 5.5q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.353302 -6.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.254181 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm23.074646 -2.125l-8.96875 0l0 -1.5625l8.96875 0l0 1.5625zm0 4.125l-8.96875 0l0 -1.546875l8.96875 0l0 1.546875zm6.8439026 0.28125l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm19.141327 1.984375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm1.5944824 -5.09375q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm10.219452 10.703125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m223.0 102.02362l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m223.0 102.02362l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m265.82367 128.94362l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844482 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.8803406 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm21.212677 0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.918396 4.0q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.572052 -7.59375l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm19.141357 1.984375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm1.5944519 -5.09375q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.016357 6.703125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm9.96875 -3.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375702 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656982 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm10.219452 10.703125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" d="m259.9714 154.72487l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm8.844452 4.875l0 -9.859375l1.5 0l0 1.5q0.578125 -1.046875 1.0625 -1.375q0.484375 -0.34375 1.078125 -0.34375q0.84375 0 1.71875 0.546875l-0.578125 1.546875q-0.609375 -0.359375 -1.234375 -0.359375q-0.546875 0 -0.984375 0.328125q-0.421875 0.328125 -0.609375 0.90625q-0.28125 0.890625 -0.28125 1.953125l0 5.15625l-1.671875 0zm5.603302 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 -6.734375l0 -1.9375l1.65625 0l0 1.9375l-1.65625 0zm-2.125 15.484375l0.3125 -1.421875q0.5 0.125 0.796875 0.125q0.515625 0 0.765625 -0.34375q0.25 -0.328125 0.25 -1.6875l0 -10.359375l1.65625 0l0 10.390625q0 1.828125 -0.46875 2.546875q-0.59375 0.921875 -2.0 0.921875q-0.671875 0 -1.3125 -0.171875zm13.019806 -7.0l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547607 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426636 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.5042114 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281952 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm14.887146 -2.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2542114 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm9.578827 -2.078125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm0 7.953125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm18.210358 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm1.5944824 -5.09375q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656952 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m249.80052 657.01575l180.00002 0l0 42.11023l-180.00002 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m249.80052 657.01575l180.00002 0l0 42.11023l-180.00002 0z" fill-rule="evenodd"></path><path fill="#000000" d="m266.3206 677.3107q0 -3.390625 1.8125 -5.296875q1.828125 -1.921875 4.703125 -1.921875q1.875 0 3.390625 0.90625q1.515625 0.890625 2.296875 2.5q0.796875 1.609375 0.796875 3.65625q0 2.0625 -0.84375 3.703125q-0.828125 1.625 -2.359375 2.46875q-1.53125 0.84375 -3.296875 0.84375q-1.921875 0 -3.4375 -0.921875q-1.5 -0.9375 -2.28125 -2.53125q-0.78125 -1.609375 -0.78125 -3.40625zm1.859375 0.03125q0 2.453125 1.3125 3.875q1.328125 1.40625 3.3125 1.40625q2.03125 0 3.34375 -1.421875q1.3125 -1.4375 1.3125 -4.0625q0 -1.65625 -0.5625 -2.890625q-0.546875 -1.234375 -1.640625 -1.921875q-1.078125 -0.6875 -2.421875 -0.6875q-1.90625 0 -3.28125 1.3125q-1.375 1.3125 -1.375 4.390625zm19.433289 6.59375l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.5788574 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5270386 5.28125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm15.313232 4.875l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.578827 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm9.897858 5.5q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.353302 -6.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2542114 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm23.074646 -2.125l-8.96875 0l0 -1.5625l8.96875 0l0 1.5625zm0 4.125l-8.96875 0l0 -1.546875l8.96875 0l0 1.546875zm12.187653 3.875l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm13.797577 3.171875l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm3.1569824 5.609375l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m339.09448 161.01575l0 24.724411" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m339.09448 161.01575l0 18.724411" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m337.44275 179.74016l1.6517334 4.538101l1.6517334 -4.538101z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m339.09448 244.72906l0 25.29132" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m339.09448 244.72906l0 19.291351" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m337.44275 264.02042l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m339.00787 72.81108l0.09448242 29.196846" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m339.00787 72.81108l0.07507324 23.196877" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m337.4312 96.013306l1.6664124 4.5327225l1.6370544 -4.543419z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m223.0 526.4199l232.18896 0l0 42.11029l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m223.0 526.4199l232.18896 0l0 42.11029l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m236.43524 553.33997l0 -13.59375l9.171875 0l0 1.59375l-7.375 0l0 4.21875l6.375 0l0 1.609375l-6.375 0l0 6.171875l-1.796875 0zm17.53659 0l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm3.8913574 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.144806 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.0979614 3.796875l-0.171875 -1.5625q0.546875 0.140625 0.953125 0.140625q0.546875 0 0.875 -0.1875q0.34375 -0.1875 0.5625 -0.515625q0.15625 -0.25 0.5 -1.25q0.046875 -0.140625 0.15625 -0.40625l-3.734375 -9.875l1.796875 0l2.046875 5.71875q0.40625 1.078125 0.71875 2.28125q0.28125 -1.15625 0.6875 -2.25l2.09375 -5.75l1.671875 0l-3.75 10.03125q-0.59375 1.625 -0.9375 2.234375q-0.4375 0.828125 -1.015625 1.203125q-0.578125 0.390625 -1.375 0.390625q-0.484375 0 -1.078125 -0.203125zm19.328125 -8.5625l1.796875 0.453125q-0.5625 2.21875 -2.03125 3.390625q-1.46875 1.15625 -3.59375 1.15625q-2.203125 0 -3.578125 -0.890625q-1.375 -0.90625 -2.09375 -2.59375q-0.71875 -1.703125 -0.71875 -3.65625q0 -2.125 0.796875 -3.703125q0.8125 -1.578125 2.3125 -2.390625q1.5 -0.828125 3.296875 -0.828125q2.046875 0 3.4375 1.046875q1.390625 1.03125 1.9375 2.90625l-1.765625 0.421875q-0.46875 -1.484375 -1.375 -2.15625q-0.90625 -0.6875 -2.265625 -0.6875q-1.5625 0 -2.625 0.75q-1.046875 0.75 -1.484375 2.03125q-0.421875 1.265625 -0.421875 2.609375q0 1.734375 0.5 3.03125q0.515625 1.28125 1.578125 1.921875q1.078125 0.640625 2.3125 0.640625q1.515625 0 2.5625 -0.859375q1.046875 -0.875 1.421875 -2.59375zm2.926056 -0.15625q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375702 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm17.125732 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547577 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm8.277069 -1.671875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.500702 5.875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm17.637146 8.921875q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.556427 -7.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375732 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656952 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.016357 6.703125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm15.328125 0l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm13.797577 3.171875l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm3.1569824 5.609375l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m339.09448 413.32974l0 24.125977" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m339.09448 413.3297l0 18.126007" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m337.44275 431.45572l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m339.09448 329.01575l0 25.322845" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m339.09448 329.01575l0 19.322845" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m337.44275 348.3386l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m339.09448 496.44235l0 29.984283" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m339.09448 496.44238l0 23.984253" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m337.44275 520.42664l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m223.0 185.73694l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m223.0 185.73694l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m265.82367 212.65694l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844482 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.8803406 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm23.697052 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm10.434021 5.609375q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.556427 -7.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375732 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656952 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.016357 6.703125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm9.96875 -3.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375702 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656982 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm10.219452 10.703125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" d="m259.9714 238.43819l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm8.844452 4.875l0 -9.859375l1.5 0l0 1.5q0.578125 -1.046875 1.0625 -1.375q0.484375 -0.34375 1.078125 -0.34375q0.84375 0 1.71875 0.546875l-0.578125 1.546875q-0.609375 -0.359375 -1.234375 -0.359375q-0.546875 0 -0.984375 0.328125q-0.421875 0.328125 -0.609375 0.90625q-0.28125 0.890625 -0.28125 1.953125l0 5.15625l-1.671875 0zm5.603302 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 -6.734375l0 -1.9375l1.65625 0l0 1.9375l-1.65625 0zm-2.125 15.484375l0.3125 -1.421875q0.5 0.125 0.796875 0.125q0.515625 0 0.765625 -0.34375q0.25 -0.328125 0.25 -1.6875l0 -10.359375l1.65625 0l0 10.390625q0 1.828125 -0.46875 2.546875q-0.59375 0.921875 -2.0 0.921875q-0.671875 0 -1.3125 -0.171875zm13.019806 -7.0l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547607 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426636 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.5042114 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281952 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm14.887146 -2.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2542114 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm9.578827 -2.078125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm0 7.953125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm18.210358 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm1.5944824 -5.09375q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656952 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m223.0 270.02362l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m223.0 270.02362l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m265.82367 296.94363l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844482 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.8803406 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm14.931427 -3.59375l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm19.199646 7.59375q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.556427 -7.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375732 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656952 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.016357 6.703125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm9.96875 -3.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375702 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656982 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm10.219452 10.703125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" d="m259.9714 322.72488l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm8.844452 4.875l0 -9.859375l1.5 0l0 1.5q0.578125 -1.046875 1.0625 -1.375q0.484375 -0.34375 1.078125 -0.34375q0.84375 0 1.71875 0.546875l-0.578125 1.546875q-0.609375 -0.359375 -1.234375 -0.359375q-0.546875 0 -0.984375 0.328125q-0.421875 0.328125 -0.609375 0.90625q-0.28125 0.890625 -0.28125 1.953125l0 5.15625l-1.671875 0zm5.603302 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 -6.734375l0 -1.9375l1.65625 0l0 1.9375l-1.65625 0zm-2.125 15.484375l0.3125 -1.421875q0.5 0.125 0.796875 0.125q0.515625 0 0.765625 -0.34375q0.25 -0.328125 0.25 -1.6875l0 -10.359375l1.65625 0l0 10.390625q0 1.828125 -0.46875 2.546875q-0.59375 0.921875 -2.0 0.921875q-0.671875 0 -1.3125 -0.171875zm13.019806 -7.0l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547607 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426636 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.5042114 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281952 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm14.887146 -2.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2542114 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm9.578827 -2.078125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm0 7.953125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm18.210358 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm1.5944824 -5.09375q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656952 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m223.0 354.33762l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m223.0 354.33762l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m265.82367 381.2576l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844482 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.8803406 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm20.275177 0l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm13.855896 8.78125q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.556427 -7.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375732 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656952 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.016357 6.703125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm9.96875 -3.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375702 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656982 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm10.219452 10.703125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" d="m259.9714 407.03885l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm8.844452 4.875l0 -9.859375l1.5 0l0 1.5q0.578125 -1.046875 1.0625 -1.375q0.484375 -0.34375 1.078125 -0.34375q0.84375 0 1.71875 0.546875l-0.578125 1.546875q-0.609375 -0.359375 -1.234375 -0.359375q-0.546875 0 -0.984375 0.328125q-0.421875 0.328125 -0.609375 0.90625q-0.28125 0.890625 -0.28125 1.953125l0 5.15625l-1.671875 0zm5.603302 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 -6.734375l0 -1.9375l1.65625 0l0 1.9375l-1.65625 0zm-2.125 15.484375l0.3125 -1.421875q0.5 0.125 0.796875 0.125q0.515625 0 0.765625 -0.34375q0.25 -0.328125 0.25 -1.6875l0 -10.359375l1.65625 0l0 10.390625q0 1.828125 -0.46875 2.546875q-0.59375 0.921875 -2.0 0.921875q-0.671875 0 -1.3125 -0.171875zm13.019806 -7.0l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547607 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426636 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.5042114 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281952 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm14.887146 -2.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2542114 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm9.578827 -2.078125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm0 7.953125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm18.210358 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm1.5944824 -5.09375q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656952 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m223.0 437.45026l232.18896 0l0 58.992096l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m223.0 437.45026l232.18896 0l0 58.992096l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m265.82367 464.37024l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844482 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.8803406 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm14.915802 -3.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm19.215271 7.5625q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.556427 -7.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375732 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656952 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.016357 6.703125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm9.96875 -3.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375702 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656982 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm10.219452 10.703125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" d="m259.9714 490.1515l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm8.844452 4.875l0 -9.859375l1.5 0l0 1.5q0.578125 -1.046875 1.0625 -1.375q0.484375 -0.34375 1.078125 -0.34375q0.84375 0 1.71875 0.546875l-0.578125 1.546875q-0.609375 -0.359375 -1.234375 -0.359375q-0.546875 0 -0.984375 0.328125q-0.421875 0.328125 -0.609375 0.90625q-0.28125 0.890625 -0.28125 1.953125l0 5.15625l-1.671875 0zm5.603302 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 -6.734375l0 -1.9375l1.65625 0l0 1.9375l-1.65625 0zm-2.125 15.484375l0.3125 -1.421875q0.5 0.125 0.796875 0.125q0.515625 0 0.765625 -0.34375q0.25 -0.328125 0.25 -1.6875l0 -10.359375l1.65625 0l0 10.390625q0 1.828125 -0.46875 2.546875q-0.59375 0.921875 -2.0 0.921875q-0.671875 0 -1.3125 -0.171875zm13.019806 -7.0l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547607 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426636 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.5042114 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281952 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm14.887146 -2.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2542114 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm9.578827 -2.078125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm0 7.953125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm18.210358 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm1.5944824 -5.09375q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656952 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m269.46194 594.54596l140.06299 0l0 42.11023l-140.06299 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m269.46194 594.54596l140.06299 0l0 42.11023l-140.06299 0z" fill-rule="evenodd"></path><path fill="#000000" d="m306.13754 617.09094l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm12.209198 -0.546875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.688232 4.921875l0 -8.546875l-1.484375 0l0 -1.3125l1.484375 0l0 -1.046875q0 -0.984375 0.171875 -1.46875q0.234375 -0.65625 0.84375 -1.046875q0.609375 -0.40625 1.703125 -0.40625q0.703125 0 1.5624695 0.15625l-0.25 1.46875q-0.5155945 -0.09375 -0.9843445 -0.09375q-0.765625 0 -1.078125 0.328125q-0.3125 0.3125 -0.3125 1.203125l0 0.90625l1.921875 0l0 1.3125l-1.921875 0l0 8.546875l-1.65625 0zm8.433289 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5270386 1.5l0 -9.859375l1.5 0l0 1.390625q0.453125 -0.71875 1.21875 -1.15625q0.78125 -0.453125 1.765625 -0.453125q1.09375 0 1.796875 0.453125q0.703125 0.453125 0.984375 1.28125q1.171875 -1.734375 3.046875 -1.734375q1.46875 0 2.25 0.8125q0.796875 0.8125 0.796875 2.5l0 6.765625l-1.671875 0l0 -6.203125q0 -1.0 -0.15625 -1.4375q-0.15625 -0.453125 -0.59375 -0.71875q-0.421875 -0.265625 -1.0 -0.265625q-1.03125 0 -1.71875 0.6875q-0.6875 0.6875 -0.6875 2.21875l0 5.71875l-1.671875 0l0 -6.40625q0 -1.109375 -0.40625 -1.65625q-0.40625 -0.5625 -1.34375 -0.5625q-0.703125 0 -1.3125 0.375q-0.59375 0.359375 -0.859375 1.078125q-0.265625 0.71875 -0.265625 2.0625l0 5.109375l-1.671875 0zm21.978302 -1.21875q-0.9375 0.796875 -1.796875 1.125q-0.859375 0.3125 -1.84375 0.3125q-1.609375 0 -2.484375 -0.78125q-0.875 -0.796875 -0.875 -2.03125q0 -0.734375 0.328125 -1.328125q0.328125 -0.59375 0.859375 -0.953125q0.53125 -0.359375 1.203125 -0.546875q0.5 -0.140625 1.484375 -0.25q2.03125 -0.25 2.984375 -0.578125q0 -0.34375 0 -0.4375q0 -1.015625 -0.46875 -1.4375q-0.640625 -0.5625 -1.90625 -0.5625q-1.171875 0 -1.734375 0.40625q-0.5625 0.40625 -0.828125 1.46875l-1.640625 -0.234375q0.234375 -1.046875 0.734375 -1.6875q0.515625 -0.640625 1.46875 -0.984375q0.96875 -0.359375 2.25 -0.359375q1.265625 0 2.046875 0.296875q0.78125 0.296875 1.15625 0.75q0.375 0.453125 0.515625 1.140625q0.09375 0.421875 0.09375 1.53125l0 2.234375q0 2.328125 0.09375 2.953125q0.109375 0.609375 0.4375 1.171875l-1.75 0q-0.265625 -0.515625 -0.328125 -1.21875zm-0.140625 -3.71875q-0.90625 0.359375 -2.734375 0.625q-1.03125 0.140625 -1.453125 0.328125q-0.421875 0.1875 -0.65625 0.546875q-0.234375 0.359375 -0.234375 0.796875q0 0.671875 0.5 1.125q0.515625 0.4375 1.484375 0.4375q0.96875 0 1.71875 -0.421875q0.75 -0.4375 1.109375 -1.15625q0.265625 -0.578125 0.265625 -1.671875l0 -0.609375zm2.9694824 4.9375l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m339.09448 568.5302l0.40945435 26.015747" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m339.09448 568.5302l0.31503296 20.01648" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m337.758 588.5727l1.7229309 4.5115356l1.5801086 -4.5635376z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m339.49344 636.6562l0.31497192 20.346436" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m339.49344 636.6562l0.22210693 14.347168" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m338.064 651.02893l1.7217712 4.511963l1.5812988 -4.5631104z" fill-rule="evenodd"></path></g></svg>
 
diff --git a/tensorflow/contrib/lite/models/testdata/g3doc/asr_lm.svg b/tensorflow/contrib/lite/models/testdata/g3doc/asr_lm.svg
new file mode 100644
index 0000000000000000000000000000000000000000..2662f772693197ed21197463175961bf9b65a1f4
--- /dev/null
+++ b/tensorflow/contrib/lite/models/testdata/g3doc/asr_lm.svg
@@ -0,0 +1,4 @@
+<?xml version="1.0" standalone="yes"?>
+
+<svg version="1.1" viewBox="0.0 0.0 742.6010498687664 753.6010498687664" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><clipPath id="p.0"><path d="m0 0l742.6011 0l0 753.6011l-742.6011 0l0 -753.6011z" clip-rule="nonzero"></path></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l742.6011 0l0 753.6011l-742.6011 0z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m136.0 30.700842l166.01575 0l0 42.110237l-166.01575 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m136.0 30.700842l166.01575 0l0 42.110237l-166.01575 0z" fill-rule="evenodd"></path><path fill="#000000" d="m153.6274 57.620842l0 -13.59375l1.8125 0l0 13.59375l-1.8125 0zm4.667679 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375717 3.78125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm15.313217 4.875l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.578842 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm7.355179 1.5l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.918396 4.0q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.353302 -6.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2541962 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm23.07463 -2.125l-8.968735 0l0 -1.5625l8.968735 0l0 1.5625zm0 4.125l-8.968735 0l0 -1.546875l8.968735 0l0 1.546875zm13.125153 3.875l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm5.641327 4.0l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m103.0 180.96326l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m103.0 180.96326l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m151.01154 207.88326l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844467 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.880356 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm21.212677 0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.918396 4.0q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.556427 -7.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm16.672592 3.5625l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.860092 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm0.95384216 1.609375l3.5937347 -5.125l-3.3281097 -4.734375l2.09375 0l1.5156097 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.5937347 3.890625l-2.015625 0zm16.26561 0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.750732 -10.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm10.078857 8.40625l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" d="m145.15926 233.6645l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm8.844467 4.875l0 -9.859375l1.5 0l0 1.5q0.578125 -1.046875 1.0625 -1.375q0.484375 -0.34375 1.078125 -0.34375q0.84375 0 1.71875 0.546875l-0.578125 1.546875q-0.609375 -0.359375 -1.234375 -0.359375q-0.546875 0 -0.984375 0.328125q-0.421875 0.328125 -0.609375 0.90625q-0.28125 0.890625 -0.28125 1.953125l0 5.15625l-1.671875 0zm5.603302 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281967 -6.734375l0 -1.9375l1.65625 0l0 1.9375l-1.65625 0zm-2.125 15.484375l0.3125 -1.421875q0.5 0.125 0.796875 0.125q0.515625 0 0.765625 -0.34375q0.25 -0.328125 0.25 -1.6875l0 -10.359375l1.65625 0l0 10.390625q0 1.828125 -0.46875 2.546875q-0.59375 0.921875 -2.0 0.921875q-0.671875 0 -1.3125 -0.171875zm13.019821 -7.0l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547592 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426788 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.5041962 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281967 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm14.887146 -2.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2541962 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.95311 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.4218597 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.2812347 -1.375 3.3281097 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.3437347 0q0.09375 1.625 0.92185974 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.4843597 -2.703125l5.4999847 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78123474 0.765625 -0.85935974 2.046875zm9.578842 -2.078125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm0 7.953125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm9.444733 -3.59375l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm19.141327 1.984375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m129.09448 653.0184l180.0 0l0 42.11023l-180.0 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m129.09448 653.0184l180.0 0l0 42.11023l-180.0 0z" fill-rule="evenodd"></path><path fill="#000000" d="m150.8024 673.31335q0 -3.390625 1.8125 -5.296875q1.828125 -1.921875 4.703125 -1.921875q1.875 0 3.390625 0.90625q1.515625 0.890625 2.296875 2.5q0.796875 1.609375 0.796875 3.65625q0 2.0625 -0.84375 3.703125q-0.828125 1.625 -2.359375 2.46875q-1.53125 0.84375 -3.296875 0.84375q-1.921875 0 -3.4375 -0.921875q-1.5 -0.9375 -2.28125 -2.53125q-0.78125 -1.609375 -0.78125 -3.40625zm1.859375 0.03125q0 2.453125 1.3125 3.875q1.328125 1.40625 3.3125 1.40625q2.03125 0 3.34375 -1.421875q1.3125 -1.4375 1.3125 -4.0625q0 -1.65625 -0.5625 -2.890625q-0.546875 -1.234375 -1.640625 -1.921875q-1.078125 -0.6875 -2.421875 -0.6875q-1.90625 0 -3.28125 1.3125q-1.375 1.3125 -1.375 4.390625zm19.433304 6.59375l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.578842 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5270538 5.28125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm15.313217 4.875l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.578842 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm9.897858 5.5q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.353302 -6.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2541962 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm23.074646 -2.125l-8.96875 0l0 -1.5625l8.96875 0l0 1.5625zm0 4.125l-8.96875 0l0 -1.546875l8.96875 0l0 1.546875zm13.125153 3.875l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm5.641327 4.0l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m219.09448 239.95538l0 21.543304" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m219.09448 239.95538l0 15.543304" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m217.44275 255.49869l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m219.09448 320.48557l0 21.543304" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m219.09448 320.48557l0 15.543304" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m217.44275 336.02887l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m219.00787 72.81108l0.09448242 25.732285" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m219.00787 72.81108l0.07246399 19.732315" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m217.4286 92.54946l1.668396 4.5320053l1.6350555 -4.544136z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m219.09448 401.01575l0 19.40158" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m219.09448 401.01575l0 13.401581" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m217.44275 414.41733l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m103.0 261.49344l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m103.0 261.49344l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m145.82367 288.41342l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844482 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.8803406 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm23.697052 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm10.434021 5.609375q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.556427 -7.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm16.672607 3.5625l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.860077 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm0.9538574 1.609375l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm9.96875 -3.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm16.672577 3.5625l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.860107 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm3.156952 5.609375l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" d="m145.15926 314.19467l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm8.844467 4.875l0 -9.859375l1.5 0l0 1.5q0.578125 -1.046875 1.0625 -1.375q0.484375 -0.34375 1.078125 -0.34375q0.84375 0 1.71875 0.546875l-0.578125 1.546875q-0.609375 -0.359375 -1.234375 -0.359375q-0.546875 0 -0.984375 0.328125q-0.421875 0.328125 -0.609375 0.90625q-0.28125 0.890625 -0.28125 1.953125l0 5.15625l-1.671875 0zm5.603302 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281967 -6.734375l0 -1.9375l1.65625 0l0 1.9375l-1.65625 0zm-2.125 15.484375l0.3125 -1.421875q0.5 0.125 0.796875 0.125q0.515625 0 0.765625 -0.34375q0.25 -0.328125 0.25 -1.6875l0 -10.359375l1.65625 0l0 10.390625q0 1.828125 -0.46875 2.546875q-0.59375 0.921875 -2.0 0.921875q-0.671875 0 -1.3125 -0.171875zm13.019821 -7.0l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547592 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426788 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.5041962 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281967 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm14.887146 -2.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2541962 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.95311 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.4218597 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.2812347 -1.375 3.3281097 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.3437347 0q0.09375 1.625 0.92185974 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.4843597 -2.703125l5.4999847 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78123474 0.765625 -0.85935974 2.046875zm9.578842 -2.078125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm0 7.953125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm9.444733 -3.59375l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm19.141327 1.984375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m103.0 342.02362l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m103.0 342.02362l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m145.82367 368.94363l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844482 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.8803406 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm14.931427 -3.59375l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm19.199646 7.59375q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.556427 -7.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm16.672607 3.5625l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.860077 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm0.9538574 1.609375l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm9.96875 -3.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm16.672577 3.5625l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.860107 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm3.156952 5.609375l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" d="m145.15926 394.72488l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm8.844467 4.875l0 -9.859375l1.5 0l0 1.5q0.578125 -1.046875 1.0625 -1.375q0.484375 -0.34375 1.078125 -0.34375q0.84375 0 1.71875 0.546875l-0.578125 1.546875q-0.609375 -0.359375 -1.234375 -0.359375q-0.546875 0 -0.984375 0.328125q-0.421875 0.328125 -0.609375 0.90625q-0.28125 0.890625 -0.28125 1.953125l0 5.15625l-1.671875 0zm5.603302 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281967 -6.734375l0 -1.9375l1.65625 0l0 1.9375l-1.65625 0zm-2.125 15.484375l0.3125 -1.421875q0.5 0.125 0.796875 0.125q0.515625 0 0.765625 -0.34375q0.25 -0.328125 0.25 -1.6875l0 -10.359375l1.65625 0l0 10.390625q0 1.828125 -0.46875 2.546875q-0.59375 0.921875 -2.0 0.921875q-0.671875 0 -1.3125 -0.171875zm13.019821 -7.0l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547592 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426788 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.5041962 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281967 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm14.887146 -2.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2541962 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.95311 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.4218597 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.2812347 -1.375 3.3281097 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.3437347 0q0.09375 1.625 0.92185974 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.4843597 -2.703125l5.4999847 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78123474 0.765625 -0.85935974 2.046875zm9.578842 -2.078125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm0 7.953125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm9.444733 -3.59375l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm19.141327 1.984375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m219.09448 618.4042l0 34.614197" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m219.09448 618.4042l0 28.614197" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m217.44275 647.0184l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m103.0 98.54593l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m103.0 98.54593l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m143.32318 125.46593l0 -13.59375l9.84375 0l0 1.59375l-8.046875 0l0 4.171875l7.53125 0l0 1.59375l-7.53125 0l0 4.625l8.359375 0l0 1.609375l-10.15625 0zm12.193573 0l0 -9.859375l1.5 0l0 1.390625q0.453125 -0.71875 1.21875 -1.15625q0.78125 -0.453125 1.765625 -0.453125q1.09375 0 1.796875 0.453125q0.703125 0.453125 0.984375 1.28125q1.171875 -1.734375 3.046875 -1.734375q1.46875 0 2.25 0.8125q0.796875 0.8125 0.796875 2.5l0 6.765625l-1.671875 0l0 -6.203125q0 -1.0 -0.15625 -1.4375q-0.15625 -0.453125 -0.59375 -0.71875q-0.421875 -0.265625 -1.0 -0.265625q-1.03125 0 -1.71875 0.6875q-0.6875 0.6875 -0.6875 2.21875l0 5.71875l-1.671875 0l0 -6.40625q0 -1.109375 -0.40625 -1.65625q-0.40625 -0.5625 -1.34375 -0.5625q-0.703125 0 -1.3125 0.375q-0.59375 0.359375 -0.859375 1.078125q-0.265625 0.71875 -0.265625 2.0625l0 5.109375l-1.671875 0zm17.087677 0l-1.546875 0l0 -13.59375l1.65625 0l0 4.84375q1.0625 -1.328125 2.703125 -1.328125q0.90625 0 1.71875 0.375q0.8125 0.359375 1.328125 1.03125q0.53125 0.65625 0.828125 1.59375q0.296875 0.9375 0.296875 2.0q0 2.53125 -1.25 3.921875q-1.25 1.375 -3.0 1.375q-1.75 0 -2.734375 -1.453125l0 1.234375zm-0.015625 -5.0q0 1.765625 0.46875 2.5625q0.796875 1.28125 2.140625 1.28125q1.09375 0 1.890625 -0.9375q0.796875 -0.953125 0.796875 -2.84375q0 -1.921875 -0.765625 -2.84375q-0.765625 -0.921875 -1.84375 -0.921875q-1.09375 0 -1.890625 0.953125q-0.796875 0.953125 -0.796875 2.75zm15.594467 1.828125l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.500717 5.875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm15.656967 4.921875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm9.281967 -6.765625l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm4.129196 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.078842 0.8125l1.609375 0.25q0.109375 0.75 0.578125 1.09375q0.609375 0.453125 1.6875 0.453125q1.171875 0 1.796875 -0.46875q0.625 -0.453125 0.859375 -1.28125q0.125 -0.515625 0.109375 -2.15625q-1.09375 1.296875 -2.71875 1.296875q-2.03125 0 -3.15625 -1.46875q-1.109375 -1.46875 -1.109375 -3.515625q0 -1.40625 0.515625 -2.59375q0.515625 -1.203125 1.484375 -1.84375q0.96875 -0.65625 2.265625 -0.65625q1.75 0 2.875 1.40625l0 -1.1875l1.546875 0l0 8.515625q0 2.3125 -0.46875 3.265625q-0.46875 0.96875 -1.484375 1.515625q-1.015625 0.5625 -2.5 0.5625q-1.765625 0 -2.859375 -0.796875q-1.078125 -0.796875 -1.03125 -2.390625zm1.375 -5.921875q0 1.953125 0.765625 2.84375q0.78125 0.890625 1.9375 0.890625q1.140625 0 1.921875 -0.890625q0.78125 -0.890625 0.78125 -2.78125q0 -1.8125 -0.8125 -2.71875q-0.796875 -0.921875 -1.921875 -0.921875q-1.109375 0 -1.890625 0.90625q-0.78125 0.890625 -0.78125 2.671875zm14.449646 5.109375l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm3.5510712 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm8.656967 0q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.297607 4.921875l0 -13.59375l1.671875 0l0 7.75l3.953125 -4.015625l2.15625 0l-3.765625 3.65625l4.140625 6.203125l-2.0625 0l-3.25 -5.03125l-1.171875 1.125l0 3.90625l-1.671875 0zm15.765625 0l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm3.922577 3.78125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625z" fill-rule="nonzero"></path><path fill="#000000" d="m176.34024 151.46593q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm11.228302 -14.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm8.531967 0.8125l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm10.625717 0.453125l1.59375 -0.15625q0.203125 1.140625 0.78125 1.65625q0.578125 0.5 1.484375 0.5q0.765625 0 1.34375 -0.34375q0.578125 -0.359375 0.953125 -0.953125q0.375 -0.59375 0.625 -1.59375q0.25 -1.0 0.25 -2.03125q0 -0.109375 -0.015625 -0.34375q-0.5 0.796875 -1.375 1.296875q-0.859375 0.5 -1.875 0.5q-1.6875 0 -2.859375 -1.21875q-1.171875 -1.234375 -1.171875 -3.234375q0 -2.078125 1.21875 -3.328125q1.234375 -1.265625 3.0625 -1.265625q1.328125 0 2.421875 0.71875q1.109375 0.703125 1.671875 2.03125q0.578125 1.328125 0.578125 3.828125q0 2.609375 -0.578125 4.15625q-0.5625 1.546875 -1.6875 2.359375q-1.109375 0.796875 -2.609375 0.796875q-1.59375 0 -2.609375 -0.890625q-1.0 -0.890625 -1.203125 -2.484375zm6.828125 -6.0q0 -1.4375 -0.765625 -2.28125q-0.765625 -0.859375 -1.84375 -0.859375q-1.109375 0 -1.9375 0.921875q-0.828125 0.90625 -0.828125 2.34375q0 1.3125 0.78125 2.125q0.796875 0.796875 1.9375 0.796875q1.171875 0 1.90625 -0.796875q0.75 -0.8125 0.75 -2.25zm5.860092 1.765625q-1.046875 -0.375 -1.546875 -1.078125q-0.5 -0.71875 -0.5 -1.703125q0 -1.484375 1.0625 -2.484375q1.078125 -1.015625 2.84375 -1.015625q1.78125 0 2.859375 1.03125q1.09375 1.03125 1.09375 2.515625q0 0.953125 -0.5 1.65625q-0.484375 0.703125 -1.5 1.078125q1.25 0.40625 1.90625 1.3125q0.65625 0.90625 0.65625 2.171875q0 1.75 -1.234375 2.9375q-1.234375 1.1875 -3.25 1.1875q-2.015625 0 -3.25 -1.1875q-1.234375 -1.203125 -1.234375 -2.984375q0 -1.328125 0.671875 -2.21875q0.671875 -0.890625 1.921875 -1.21875zm-0.328125 -2.828125q0 0.96875 0.609375 1.578125q0.625 0.609375 1.625 0.609375q0.953125 0 1.5625 -0.609375q0.625 -0.609375 0.625 -1.484375q0 -0.921875 -0.640625 -1.546875q-0.625 -0.625 -1.578125 -0.625q-0.953125 0 -1.578125 0.609375q-0.625 0.609375 -0.625 1.46875zm-0.546875 6.28125q0 0.71875 0.328125 1.390625q0.34375 0.65625 1.015625 1.03125q0.671875 0.359375 1.4375 0.359375q1.203125 0 1.984375 -0.765625q0.78125 -0.78125 0.78125 -1.96875q0 -1.203125 -0.8125 -1.984375q-0.796875 -0.796875 -2.0 -0.796875q-1.1875 0 -1.96875 0.78125q-0.765625 0.78125 -0.765625 1.953125zm8.688217 0.328125l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm9.719467 3.59375l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm16.265625 0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.750717 -10.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm10.078827 8.40625l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m219.09448 157.53806l0 23.433075" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m219.09448 157.53806l0 17.433075" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m217.44275 174.97113l1.6517334 4.538101l1.6517334 -4.538101z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m395.48425 30.700842l166.01575 0l0 42.110237l-166.01575 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m395.48425 30.700842l166.01575 0l0 42.110237l-166.01575 0z" fill-rule="evenodd"></path><path fill="#000000" d="m413.11163 57.620842l0 -13.59375l1.8125 0l0 13.59375l-1.8125 0zm4.667694 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375702 3.78125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm15.313232 4.875l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.578827 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm9.839569 -0.109375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm10.434021 5.609375q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.353302 -6.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.254181 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm23.074646 -2.125l-8.96875 0l0 -1.5625l8.96875 0l0 1.5625zm0 4.125l-8.96875 0l0 -1.546875l8.96875 0l0 1.546875zm13.125122 3.875l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm5.6413574 4.0l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m388.49344 411.97638l179.99997 0l0 58.992126l-179.99997 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m388.49344 411.97638l179.99997 0l0 58.992126l-179.99997 0z" fill-rule="evenodd"></path><path fill="#000000" d="m402.72214 438.89636l0 -13.59375l9.84375 0l0 1.59375l-8.046875 0l0 4.171875l7.53125 0l0 1.59375l-7.53125 0l0 4.625l8.359375 0l0 1.609375l-10.15625 0zm12.193573 0l0 -9.859375l1.5 0l0 1.390625q0.453125 -0.71875 1.21875 -1.15625q0.78125 -0.453125 1.765625 -0.453125q1.09375 0 1.796875 0.453125q0.703125 0.453125 0.984375 1.28125q1.171875 -1.734375 3.046875 -1.734375q1.46875 0 2.25 0.8125q0.796875 0.8125 0.796875 2.5l0 6.765625l-1.671875 0l0 -6.203125q0 -1.0 -0.15625 -1.4375q-0.15625 -0.453125 -0.59375 -0.71875q-0.421875 -0.265625 -1.0 -0.265625q-1.03125 0 -1.71875 0.6875q-0.6875 0.6875 -0.6875 2.21875l0 5.71875l-1.671875 0l0 -6.40625q0 -1.109375 -0.40625 -1.65625q-0.40625 -0.5625 -1.34375 -0.5625q-0.703125 0 -1.3125 0.375q-0.59375 0.359375 -0.859375 1.078125q-0.265625 0.71875 -0.265625 2.0625l0 5.109375l-1.671875 0zm17.087677 0l-1.546875 0l0 -13.59375l1.65625 0l0 4.84375q1.0625 -1.328125 2.703125 -1.328125q0.90625 0 1.71875 0.375q0.8125 0.359375 1.328125 1.03125q0.53125 0.65625 0.828125 1.59375q0.296875 0.9375 0.296875 2.0q0 2.53125 -1.25 3.921875q-1.25 1.375 -3.0 1.375q-1.75 0 -2.734375 -1.453125l0 1.234375zm-0.015625 -5.0q0 1.765625 0.46875 2.5625q0.796875 1.28125 2.140625 1.28125q1.09375 0 1.890625 -0.9375q0.796875 -0.953125 0.796875 -2.84375q0 -1.921875 -0.765625 -2.84375q-0.765625 -0.921875 -1.84375 -0.921875q-1.09375 0 -1.890625 0.953125q-0.796875 0.953125 -0.796875 2.75zm15.594452 1.828125l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.500732 5.875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm15.656952 4.921875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm9.281982 -6.765625l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm4.129181 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.078857 0.8125l1.609375 0.25q0.109375 0.75 0.578125 1.09375q0.609375 0.453125 1.6875 0.453125q1.171875 0 1.796875 -0.46875q0.625 -0.453125 0.859375 -1.28125q0.125 -0.515625 0.109375 -2.15625q-1.09375 1.296875 -2.71875 1.296875q-2.03125 0 -3.15625 -1.46875q-1.109375 -1.46875 -1.109375 -3.515625q0 -1.40625 0.515625 -2.59375q0.515625 -1.203125 1.484375 -1.84375q0.96875 -0.65625 2.265625 -0.65625q1.75 0 2.875 1.40625l0 -1.1875l1.546875 0l0 8.515625q0 2.3125 -0.46875 3.265625q-0.46875 0.96875 -1.484375 1.515625q-1.015625 0.5625 -2.5 0.5625q-1.765625 0 -2.859375 -0.796875q-1.078125 -0.796875 -1.03125 -2.390625zm1.375 -5.921875q0 1.953125 0.765625 2.84375q0.78125 0.890625 1.9375 0.890625q1.140625 0 1.921875 -0.890625q0.78125 -0.890625 0.78125 -2.78125q0 -1.8125 -0.8125 -2.71875q-0.796875 -0.921875 -1.921875 -0.921875q-1.109375 0 -1.890625 0.90625q-0.78125 0.890625 -0.78125 2.671875zm14.449646 5.109375l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm3.551056 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.0312805 0 3.3125305 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.0781555 0.59375 -2.3750305 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625305 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.8281555 -0.9375 -2.0625305 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm8.656952 0q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.297607 4.921875l0 -13.59375l1.671875 0l0 7.75l3.953125 -4.015625l2.15625 0l-3.765625 3.65625l4.140625 6.203125l-2.0625 0l-3.25 -5.03125l-1.171875 1.125l0 3.90625l-1.671875 0zm15.765625 0l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm3.9226074 3.78125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625z" fill-rule="nonzero"></path><path fill="#000000" d="m435.7392 464.89636q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm11.228302 -14.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm8.531952 0.8125l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm10.625732 0.453125l1.59375 -0.15625q0.203125 1.140625 0.78125 1.65625q0.578125 0.5 1.484375 0.5q0.765625 0 1.34375 -0.34375q0.578125 -0.359375 0.953125 -0.953125q0.375 -0.59375 0.625 -1.59375q0.25 -1.0 0.25 -2.03125q0 -0.109375 -0.015625 -0.34375q-0.5 0.796875 -1.375 1.296875q-0.859375 0.5 -1.875 0.5q-1.6875 0 -2.859375 -1.21875q-1.171875 -1.234375 -1.171875 -3.234375q0 -2.078125 1.21875 -3.328125q1.234375 -1.265625 3.0625 -1.265625q1.328125 0 2.421875 0.71875q1.109375 0.703125 1.671875 2.03125q0.578125 1.328125 0.578125 3.828125q0 2.609375 -0.578125 4.15625q-0.5625 1.546875 -1.6875 2.359375q-1.109375 0.796875 -2.609375 0.796875q-1.59375 0 -2.609375 -0.890625q-1.0 -0.890625 -1.203125 -2.484375zm6.828125 -6.0q0 -1.4375 -0.765625 -2.28125q-0.765625 -0.859375 -1.84375 -0.859375q-1.109375 0 -1.9375 0.921875q-0.828125 0.90625 -0.828125 2.34375q0 1.3125 0.78125 2.125q0.796875 0.796875 1.9375 0.796875q1.171875 0 1.90625 -0.796875q0.75 -0.8125 0.75 -2.25zm5.860077 1.765625q-1.046875 -0.375 -1.546875 -1.078125q-0.5 -0.71875 -0.5 -1.703125q0 -1.484375 1.0625 -2.484375q1.078125 -1.015625 2.84375 -1.015625q1.78125 0 2.859375 1.03125q1.09375 1.03125 1.09375 2.515625q0 0.953125 -0.5 1.65625q-0.484375 0.703125 -1.5 1.078125q1.25 0.40625 1.90625 1.3125q0.65625 0.90625 0.65625 2.171875q0 1.75 -1.234375 2.9375q-1.234375 1.1875 -3.25 1.1875q-2.015625 0 -3.25 -1.1875q-1.234375 -1.203125 -1.234375 -2.984375q0 -1.328125 0.671875 -2.21875q0.671875 -0.890625 1.921875 -1.21875zm-0.328125 -2.828125q0 0.96875 0.609375 1.578125q0.625 0.609375 1.625 0.609375q0.953125 0 1.5625 -0.609375q0.625 -0.609375 0.625 -1.484375q0 -0.921875 -0.640625 -1.546875q-0.625 -0.625 -1.578125 -0.625q-0.953125 0 -1.578125 0.609375q-0.625 0.609375 -0.625 1.46875zm-0.546875 6.28125q0 0.71875 0.328125 1.390625q0.34375 0.65625 1.015625 1.03125q0.671875 0.359375 1.4375 0.359375q1.203125 0 1.984375 -0.765625q0.78125 -0.78125 0.78125 -1.96875q0 -1.203125 -0.8125 -1.984375q-0.796875 -0.796875 -2.0 -0.796875q-1.1875 0 -1.96875 0.78125q-0.765625 0.78125 -0.765625 1.953125zm8.688232 0.328125l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm9.719452 3.59375l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm9.984375 -3.59375l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm19.141357 1.984375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm3.1569214 5.609375l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m388.49344 567.8504l179.99997 0l0 58.992126l-179.99997 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m388.49344 567.8504l179.99997 0l0 58.992126l-179.99997 0z" fill-rule="evenodd"></path><path fill="#000000" d="m402.72214 594.7704l0 -13.59375l9.84375 0l0 1.59375l-8.046875 0l0 4.171875l7.53125 0l0 1.59375l-7.53125 0l0 4.625l8.359375 0l0 1.609375l-10.15625 0zm12.193573 0l0 -9.859375l1.5 0l0 1.390625q0.453125 -0.71875 1.21875 -1.15625q0.78125 -0.453125 1.765625 -0.453125q1.09375 0 1.796875 0.453125q0.703125 0.453125 0.984375 1.28125q1.171875 -1.734375 3.046875 -1.734375q1.46875 0 2.25 0.8125q0.796875 0.8125 0.796875 2.5l0 6.765625l-1.671875 0l0 -6.203125q0 -1.0 -0.15625 -1.4375q-0.15625 -0.453125 -0.59375 -0.71875q-0.421875 -0.265625 -1.0 -0.265625q-1.03125 0 -1.71875 0.6875q-0.6875 0.6875 -0.6875 2.21875l0 5.71875l-1.671875 0l0 -6.40625q0 -1.109375 -0.40625 -1.65625q-0.40625 -0.5625 -1.34375 -0.5625q-0.703125 0 -1.3125 0.375q-0.59375 0.359375 -0.859375 1.078125q-0.265625 0.71875 -0.265625 2.0625l0 5.109375l-1.671875 0zm17.087677 0l-1.546875 0l0 -13.59375l1.65625 0l0 4.84375q1.0625 -1.328125 2.703125 -1.328125q0.90625 0 1.71875 0.375q0.8125 0.359375 1.328125 1.03125q0.53125 0.65625 0.828125 1.59375q0.296875 0.9375 0.296875 2.0q0 2.53125 -1.25 3.921875q-1.25 1.375 -3.0 1.375q-1.75 0 -2.734375 -1.453125l0 1.234375zm-0.015625 -5.0q0 1.765625 0.46875 2.5625q0.796875 1.28125 2.140625 1.28125q1.09375 0 1.890625 -0.9375q0.796875 -0.953125 0.796875 -2.84375q0 -1.921875 -0.765625 -2.84375q-0.765625 -0.921875 -1.84375 -0.921875q-1.09375 0 -1.890625 0.953125q-0.796875 0.953125 -0.796875 2.75zm15.594452 1.828125l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.500732 5.875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm15.656952 4.921875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm9.281982 -6.765625l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm4.129181 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.078857 0.8125l1.609375 0.25q0.109375 0.75 0.578125 1.09375q0.609375 0.453125 1.6875 0.453125q1.171875 0 1.796875 -0.46875q0.625 -0.453125 0.859375 -1.28125q0.125 -0.515625 0.109375 -2.15625q-1.09375 1.296875 -2.71875 1.296875q-2.03125 0 -3.15625 -1.46875q-1.109375 -1.46875 -1.109375 -3.515625q0 -1.40625 0.515625 -2.59375q0.515625 -1.203125 1.484375 -1.84375q0.96875 -0.65625 2.265625 -0.65625q1.75 0 2.875 1.40625l0 -1.1875l1.546875 0l0 8.515625q0 2.3125 -0.46875 3.265625q-0.46875 0.96875 -1.484375 1.515625q-1.015625 0.5625 -2.5 0.5625q-1.765625 0 -2.859375 -0.796875q-1.078125 -0.796875 -1.03125 -2.390625zm1.375 -5.921875q0 1.953125 0.765625 2.84375q0.78125 0.890625 1.9375 0.890625q1.140625 0 1.921875 -0.890625q0.78125 -0.890625 0.78125 -2.78125q0 -1.8125 -0.8125 -2.71875q-0.796875 -0.921875 -1.921875 -0.921875q-1.109375 0 -1.890625 0.90625q-0.78125 0.890625 -0.78125 2.671875zm14.449646 5.109375l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm3.551056 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.0312805 0 3.3125305 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.0781555 0.59375 -2.3750305 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625305 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.8281555 -0.9375 -2.0625305 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm8.656952 0q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.297607 4.921875l0 -13.59375l1.671875 0l0 7.75l3.953125 -4.015625l2.15625 0l-3.765625 3.65625l4.140625 6.203125l-2.0625 0l-3.25 -5.03125l-1.171875 1.125l0 3.90625l-1.671875 0zm15.765625 0l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm3.9226074 3.78125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625z" fill-rule="nonzero"></path><path fill="#000000" d="m440.92703 620.7704q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm11.228302 -14.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm8.531982 0.8125l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm10.625702 0.453125l1.59375 -0.15625q0.203125 1.140625 0.78125 1.65625q0.578125 0.5 1.484375 0.5q0.765625 0 1.34375 -0.34375q0.578125 -0.359375 0.953125 -0.953125q0.375 -0.59375 0.625 -1.59375q0.25 -1.0 0.25 -2.03125q0 -0.109375 -0.015625 -0.34375q-0.5 0.796875 -1.375 1.296875q-0.859375 0.5 -1.875 0.5q-1.6875 0 -2.859375 -1.21875q-1.171875 -1.234375 -1.171875 -3.234375q0 -2.078125 1.21875 -3.328125q1.234375 -1.265625 3.0625 -1.265625q1.328125 0 2.421875 0.71875q1.109375 0.703125 1.671875 2.03125q0.578125 1.328125 0.578125 3.828125q0 2.609375 -0.578125 4.15625q-0.5625 1.546875 -1.6875 2.359375q-1.109375 0.796875 -2.609375 0.796875q-1.59375 0 -2.609375 -0.890625q-1.0 -0.890625 -1.203125 -2.484375zm6.828125 -6.0q0 -1.4375 -0.765625 -2.28125q-0.765625 -0.859375 -1.84375 -0.859375q-1.109375 0 -1.9375 0.921875q-0.828125 0.90625 -0.828125 2.34375q0 1.3125 0.78125 2.125q0.796875 0.796875 1.9375 0.796875q1.171875 0 1.90625 -0.796875q0.75 -0.8125 0.75 -2.25zm5.8601074 1.765625q-1.046875 -0.375 -1.546875 -1.078125q-0.5 -0.71875 -0.5 -1.703125q0 -1.484375 1.0625 -2.484375q1.078125 -1.015625 2.84375 -1.015625q1.78125 0 2.859375 1.03125q1.09375 1.03125 1.09375 2.515625q0 0.953125 -0.5 1.65625q-0.484375 0.703125 -1.5 1.078125q1.25 0.40625 1.90625 1.3125q0.65625 0.90625 0.65625 2.171875q0 1.75 -1.234375 2.9375q-1.234375 1.1875 -3.25 1.1875q-2.015625 0 -3.25 -1.1875q-1.234375 -1.203125 -1.234375 -2.984375q0 -1.328125 0.671875 -2.21875q0.671875 -0.890625 1.921875 -1.21875zm-0.328125 -2.828125q0 0.96875 0.609375 1.578125q0.625 0.609375 1.625 0.609375q0.953125 0 1.5625 -0.609375q0.625 -0.609375 0.625 -1.484375q0 -0.921875 -0.640625 -1.546875q-0.625 -0.625 -1.578125 -0.625q-0.953125 0 -1.578125 0.609375q-0.625 0.609375 -0.625 1.46875zm-0.546875 6.28125q0 0.71875 0.328125 1.390625q0.34375 0.65625 1.015625 1.03125q0.671875 0.359375 1.4375 0.359375q1.203125 0 1.984375 -0.765625q0.78125 -0.78125 0.78125 -1.96875q0 -1.203125 -0.8125 -1.984375q-0.796875 -0.796875 -2.0 -0.796875q-1.1875 0 -1.96875 0.78125q-0.765625 0.78125 -0.765625 1.953125zm8.688202 0.328125l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm9.719482 3.59375l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm16.265625 0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm5.6413574 4.0l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m129.09448 420.41733l180.0 0l0 42.11023l-180.0 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m129.09448 420.41733l180.0 0l0 42.11023l-180.0 0z" fill-rule="evenodd"></path><path fill="#000000" d="m147.40158 447.3373l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm21.837677 0l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm3.8913422 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm7.832321 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426788 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm4.129196 3.78125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm8.828842 4.875l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.191696 -11.6875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm10.566696 -3.609375l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm9.328125 2.390625q-0.9375 0.796875 -1.796875 1.125q-0.859375 0.3125 -1.84375 0.3125q-1.609375 0 -2.484375 -0.78125q-0.875 -0.796875 -0.875 -2.03125q0 -0.734375 0.328125 -1.328125q0.328125 -0.59375 0.859375 -0.953125q0.53125 -0.359375 1.203125 -0.546875q0.5 -0.140625 1.484375 -0.25q2.03125 -0.25 2.984375 -0.578125q0 -0.34375 0 -0.4375q0 -1.015625 -0.46875 -1.4375q-0.640625 -0.5625 -1.90625 -0.5625q-1.171875 0 -1.734375 0.40625q-0.5625 0.40625 -0.828125 1.46875l-1.640625 -0.234375q0.234375 -1.046875 0.734375 -1.6875q0.515625 -0.640625 1.46875 -0.984375q0.96875 -0.359375 2.25 -0.359375q1.265625 0 2.046875 0.296875q0.78125 0.296875 1.15625 0.75q0.375 0.453125 0.515625 1.140625q0.09375 0.421875 0.09375 1.53125l0 2.234375q0 2.328125 0.09375 2.953125q0.109375 0.609375 0.4375 1.171875l-1.75 0q-0.265625 -0.515625 -0.328125 -1.21875zm-0.140625 -3.71875q-0.90625 0.359375 -2.734375 0.625q-1.03125 0.140625 -1.453125 0.328125q-0.421875 0.1875 -0.65625 0.546875q-0.234375 0.359375 -0.234375 0.796875q0 0.671875 0.5 1.125q0.515625 0.4375 1.484375 0.4375q0.96875 0 1.71875 -0.421875q0.75 -0.4375 1.109375 -1.15625q0.265625 -0.578125 0.265625 -1.671875l0 -0.609375zm7.735092 3.4375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426788 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.5041962 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281967 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm18.746506 4.0q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.572052 -7.59375l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm19.141357 1.984375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm3.156952 5.609375l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m129.09448 576.29395l180.0 0l0 42.11023l-180.0 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m129.09448 576.29395l180.0 0l0 42.11023l-180.0 0z" fill-rule="evenodd"></path><path fill="#000000" d="m171.36136 603.214l5.234375 -13.59375l1.9375 0l5.5625 13.59375l-2.046875 0l-1.59375 -4.125l-5.6875 0l-1.484375 4.125l-1.921875 0zm3.921875 -5.578125l4.609375 0l-1.40625 -3.78125q-0.65625 -1.703125 -0.96875 -2.8125q-0.265625 1.3125 -0.734375 2.59375l-1.5 4.0zm16.193573 5.578125l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm15.656967 4.921875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm9.281967 -6.765625l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm7.785446 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426788 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.5041962 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281967 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm18.746521 4.0q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm8.853302 -4.0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm5.641327 4.0l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m388.49344 597.34644l-79.40158 0" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m388.49344 597.34644l-73.40158 0" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m315.09186 595.6947l-4.538086 1.6517334l4.538086 1.6517334z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m388.49344 441.47244l-79.40158 0" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m388.49344 441.47244l-73.40158 0" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m315.09186 439.8207l-4.538086 1.6517334l4.538086 1.6517334z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m219.09448 462.52756l0 31.84253" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m219.09448 462.52756l0 25.84253" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m217.44275 488.3701l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m561.5 51.755962l31.99347 0l0 545.57477l-25.001343 0" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m561.5 51.755962l31.99347 0l0 545.57477l-25.001343 0" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m478.49213 72.81108l0 339.1496" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m478.49213 72.81108l0 333.1496" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m476.8404 405.96066l1.6517334 4.5381165l1.6517334 -4.5381165z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m590.00525 597.4094l-21.51184 -0.06298828" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m590.00525 597.4094l-15.511841 -0.045410156" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m574.4982 595.7123l-4.5429077 1.6384277l4.533264 1.6650391z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m109.09449 494.357l220.0 0l0 42.11023l-220.0 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m109.09449 494.357l220.0 0l0 42.11023l-220.0 0z" fill-rule="evenodd"></path><path fill="#000000" d="m126.81095 521.277l0 -13.59375l9.171867 0l0 1.59375l-7.375 0l0 4.21875l6.375 0l0 1.609375l-6.375 0l0 6.171875l-1.7968674 0zm17.536598 0l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm3.8913422 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.144821 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.097946 3.796875l-0.171875 -1.5625q0.546875 0.140625 0.953125 0.140625q0.546875 0 0.875 -0.1875q0.34375 -0.1875 0.5625 -0.515625q0.15625 -0.25 0.5 -1.25q0.046875 -0.140625 0.15625 -0.40625l-3.734375 -9.875l1.796875 0l2.046875 5.71875q0.40625 1.078125 0.71875 2.28125q0.28125 -1.15625 0.6875 -2.25l2.09375 -5.75l1.671875 0l-3.75 10.03125q-0.59375 1.625 -0.9375 2.234375q-0.4375 0.828125 -1.015625 1.203125q-0.578125 0.390625 -1.375 0.390625q-0.484375 0 -1.078125 -0.203125zm19.328125 -8.5625l1.796875 0.453125q-0.5625 2.21875 -2.03125 3.390625q-1.46875 1.15625 -3.59375 1.15625q-2.203125 0 -3.578125 -0.890625q-1.375 -0.90625 -2.09375 -2.59375q-0.71875 -1.703125 -0.71875 -3.65625q0 -2.125 0.796875 -3.703125q0.8125 -1.578125 2.3125 -2.390625q1.5 -0.828125 3.296875 -0.828125q2.046875 0 3.4375 1.046875q1.390625 1.03125 1.9375 2.90625l-1.765625 0.421875q-0.46875 -1.484375 -1.375 -2.15625q-0.90625 -0.6875 -2.265625 -0.6875q-1.5625 0 -2.625 0.75q-1.046875 0.75 -1.484375 2.03125q-0.421875 1.265625 -0.421875 2.609375q0 1.734375 0.5 3.03125q0.515625 1.28125 1.578125 1.921875q1.078125 0.640625 2.3125 0.640625q1.515625 0 2.5625 -0.859375q1.046875 -0.875 1.421875 -2.59375zm2.9260712 -0.15625q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281967 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375717 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm17.125717 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547592 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm8.277054 -1.671875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.500717 5.875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm17.637161 8.921875q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.572052 -7.59375l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm19.141327 1.984375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm0.9538574 1.609375l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm16.265625 0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm5.641327 4.0l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m219.09448 536.4672l0 39.811035" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m219.09448 536.4672l0 33.811035" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m217.44275 570.27826l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path></g></svg>
+
diff --git a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
index b78e958e7f3a99993ab5e2cf487cfa73de8a74e8..80668890786becd161f9fd07317970b199ddb044 100644
--- a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
+++ b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
@@ -108,7 +108,7 @@ enum {
  * The type of operations that can be added to a model.
  */
 enum {
-  /** Adds two tensors, elment-wise.
+  /** Adds two tensors, element-wise.
    *
    * Takes two input tensors of identical type and compatible dimensions. The
    * output is the sum of both input tensors, optionally modified by an
@@ -743,7 +743,7 @@ enum {
    */
   ANEURALNETWORKS_MAX_POOL_2D = 17,
 
-  /** Multiplies two tensors, elment-wise.
+  /** Multiplies two tensors, element-wise.
    *
    * Takes two input tensors of identical type and compatible dimensions. The
    * output is the product of both input tensors, optionally modified by an
@@ -1454,9 +1454,9 @@ inline int ANeuralNetworksModel_finish(ANeuralNetworksModel* model) {
  * {@link ANeuralNetworksExecution_setOutputFromMemory} and
  * {@link ANeuralNetworksExecution_setOperandValue}.
  *
- * To build a model that can accommodate inputs of various sizes, as you may want
- * to do for a CNN, set the size of the dimensions that will vary at run time to
- * 0. If you do so, provide the full dimensions when calling
+ * To build a model that can accommodate inputs of various sizes, as you may
+ * want to do for a CNN, set the size of the dimensions that will vary at run
+ * time to 0. If you do so, provide the full dimensions when calling
  * {@link ANeuralNetworksExecution_setInput} or {@link
  * ANeuralNetworksExecution_setInputFromMemory}.
  *
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
index 6a199cc8406c73f822b813603e55b0ba1994a235..5cb0afcea0286b847ca8365548db419ad619d3e6 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -208,6 +208,11 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       add_scalar_float32(builtin->beta);
     };
 
+    auto add_space_to_depth_params = [&add_scalar_int32](void* data) {
+      auto builtin = reinterpret_cast<TfLiteSpaceToDepthParams*>(data);
+      add_scalar_int32(builtin->block_size);
+    };
+
 #if 0
     auto add_reshape_params = [&](void* data) {
       auto builtin = reinterpret_cast<TfLiteReshapeParams*>(data);
@@ -280,22 +285,29 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
         nn_op_type = ANEURALNETWORKS_RESHAPE;
         // add_reshape_params(node.builtin_data);
         break;
+      case tflite::BuiltinOperator_SPACE_TO_DEPTH:
+        add_space_to_depth_params(node.builtin_data);
+        nn_op_type = ANEURALNETWORKS_SPACE_TO_DEPTH;
+        break;
       case tflite::BuiltinOperator_CONCAT_EMBEDDINGS:
       case tflite::BuiltinOperator_LSH_PROJECTION:
       case tflite::BuiltinOperator_SVDF:
       case tflite::BuiltinOperator_HASHTABLE_LOOKUP:
       case tflite::BuiltinOperator_RNN:
+      case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN:
       case tflite::BuiltinOperator_EMBEDDING_LOOKUP:
       case tflite::BuiltinOperator_EMBEDDING_LOOKUP_SPARSE:
       case tflite::BuiltinOperator_LSTM:
       case tflite::BuiltinOperator_L2_NORMALIZATION:
       case tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION:
       case tflite::BuiltinOperator_MUL:
+      case tflite::BuiltinOperator_PAD:
       case tflite::BuiltinOperator_RESIZE_BILINEAR:
       case tflite::BuiltinOperator_CALL:
       case tflite::BuiltinOperator_SKIP_GRAM:
       case tflite::BuiltinOperator_RELU1:
-      case tflite::BuiltinOperator_SPACE_TO_DEPTH:
+      case tflite::BuiltinOperator_GATHER:
+      case tflite::BuiltinOperator_BATCH_TO_SPACE_ND:
         FATAL("Op code %d is currently not delegated to NNAPI", builtin);
         nn_op_type = -1;  // set to invalid
         break;
diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD
index 89e8693490dcec79e7a117073696e57a9060e68f..3d6a3ec0fd4c673f601254b19452bbf8b9454e27 100644
--- a/tensorflow/contrib/lite/python/BUILD
+++ b/tensorflow/contrib/lite/python/BUILD
@@ -24,6 +24,7 @@ py_test(
     name = "lite_test",
     srcs = ["lite_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_oss"],
     deps = [
         ":lite",
         "//tensorflow/python:array_ops",
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index 759677121f5621d0327841e98658142e89726acc..95309478a6f9791e3510736c45f9c5cfab88703b 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -50,7 +50,7 @@ GRAPHVIZ_DOT = _toco_flags_pb2.GRAPHVIZ_DOT
 # to protect against crashes. However, it breaks some dependent targets because
 # it forces us to depend on an external py_binary. The experimental API doesn't
 # have that drawback.
-EXPERIMENTAL_USE_TOCO_API_DIRECTLY = True
+EXPERIMENTAL_USE_TOCO_API_DIRECTLY = False
 
 # Find the toco_from_protos binary using the resource loader if using from
 # bazel, otherwise we are in a pip where console_scripts already has
@@ -164,8 +164,8 @@ def toco_convert(input_data,
   toco = _toco_flags_pb2.TocoFlags()
   toco.input_format = input_format
   toco.output_format = output_format
+  toco.drop_control_dependency = drop_control_dependency
   model = _model_flags_pb2.ModelFlags()
-  model.drop_control_dependency = drop_control_dependency
   toco.inference_type = inference_type
   for idx, input_tensor in enumerate(input_tensors):
     if input_tensor.dtype == _dtypes.float32:
@@ -187,8 +187,8 @@ def toco_convert(input_data,
       input_array.mean, input_array.std = quantized_input_stats[idx]
 
     input_array.name = _tensor_name(input_tensor)
-    input_array.shape.extend(map(int, input_tensor.get_shape()))
-    toco.input_types.append(tflite_input_type)
+    input_array.shape.dims.extend(map(int, input_tensor.get_shape()))
+    toco.inference_input_type = tflite_input_type
 
   for output_tensor in output_tensors:
     model.output_arrays.append(_tensor_name(output_tensor))
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index ddb2ab792c520eb245445532f534ebce8a9f1280..cc31e03dfc9c0449cb404ee64d6a9964738d7bfb 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -104,6 +104,10 @@ enum BuiltinOperator : byte {
   CALL = 31,
   CUSTOM = 32,
   EMBEDDING_LOOKUP_SPARSE = 33,
+  PAD = 34,
+  UNIDIRECTIONAL_SEQUENCE_RNN = 35,
+  GATHER = 36,
+  BATCH_TO_SPACE_ND = 37,
 }
 
 // Options for the builtin operators.
@@ -129,6 +133,9 @@ union BuiltinOptions {
   SpaceToDepthOptions,
   EmbeddingLookupSparseOptions,
   MulOptions,
+  PadOptions,
+  GatherOptions,
+  BatchToSpaceNDOptions,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -244,10 +251,21 @@ table CallOptions {
   subgraph:uint;
 }
 
+table PadOptions {
+  before_padding:[int];
+  after_padding:[int];
+}
+
 table ReshapeOptions {
   new_shape:[int];
 }
 
+table BatchToSpaceNDOptions {
+  block_shape:[int];
+  before_crops:[int];
+  after_crops:[int];
+}
+
 table SkipGramOptions {
   ngram_size: int;
   max_skip_size: int;
@@ -268,6 +286,10 @@ table EmbeddingLookupSparseOptions {
   combiner:CombinerType;
 }
 
+table GatherOptions {
+  axis: int;
+}
+
 // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
 // builtin, or a string if the operator is custom.
 table OperatorCode {
@@ -343,4 +365,3 @@ table Model {
 }
 
 root_type Model;
-
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index df460ab9a32f1d80c0788649e799778db8050b7f..aa169198fe96b538addf091a8c7569e9127d0a01 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -1,5 +1,20 @@
-// automatically generated by the FlatBuffers compiler, do not modify
 
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// automatically generated by the FlatBuffers compiler, do not modify
 
 #ifndef FLATBUFFERS_GENERATED_SCHEMA_TFLITE_H_
 #define FLATBUFFERS_GENERATED_SCHEMA_TFLITE_H_
@@ -65,9 +80,15 @@ struct ResizeBilinearOptionsT;
 struct CallOptions;
 struct CallOptionsT;
 
+struct PadOptions;
+struct PadOptionsT;
+
 struct ReshapeOptions;
 struct ReshapeOptionsT;
 
+struct BatchToSpaceNDOptions;
+struct BatchToSpaceNDOptionsT;
+
 struct SkipGramOptions;
 struct SkipGramOptionsT;
 
@@ -77,6 +98,9 @@ struct SpaceToDepthOptionsT;
 struct EmbeddingLookupSparseOptions;
 struct EmbeddingLookupSparseOptionsT;
 
+struct GatherOptions;
+struct GatherOptionsT;
+
 struct OperatorCode;
 struct OperatorCodeT;
 
@@ -104,27 +128,15 @@ enum TensorType {
 };
 
 inline TensorType (&EnumValuesTensorType())[6] {
-  static TensorType values[] = {
-    TensorType_FLOAT32,
-    TensorType_FLOAT16,
-    TensorType_INT32,
-    TensorType_UINT8,
-    TensorType_INT64,
-    TensorType_STRING
-  };
+  static TensorType values[] = {TensorType_FLOAT32, TensorType_FLOAT16,
+                                TensorType_INT32,   TensorType_UINT8,
+                                TensorType_INT64,   TensorType_STRING};
   return values;
 }
 
 inline const char **EnumNamesTensorType() {
-  static const char *names[] = {
-    "FLOAT32",
-    "FLOAT16",
-    "INT32",
-    "UINT8",
-    "INT64",
-    "STRING",
-    nullptr
-  };
+  static const char *names[] = {"FLOAT32", "FLOAT16", "INT32", "UINT8",
+                                "INT64",   "STRING",  nullptr};
   return names;
 }
 
@@ -165,85 +177,94 @@ enum BuiltinOperator {
   BuiltinOperator_CALL = 31,
   BuiltinOperator_CUSTOM = 32,
   BuiltinOperator_EMBEDDING_LOOKUP_SPARSE = 33,
+  BuiltinOperator_PAD = 34,
+  BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN = 35,
+  BuiltinOperator_GATHER = 36,
+  BuiltinOperator_BATCH_TO_SPACE_ND = 37,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_EMBEDDING_LOOKUP_SPARSE
+  BuiltinOperator_MAX = BuiltinOperator_BATCH_TO_SPACE_ND
 };
 
-inline BuiltinOperator (&EnumValuesBuiltinOperator())[31] {
+inline BuiltinOperator (&EnumValuesBuiltinOperator())[35] {
   static BuiltinOperator values[] = {
-    BuiltinOperator_ADD,
-    BuiltinOperator_AVERAGE_POOL_2D,
-    BuiltinOperator_CONCATENATION,
-    BuiltinOperator_CONV_2D,
-    BuiltinOperator_DEPTHWISE_CONV_2D,
-    BuiltinOperator_EMBEDDING_LOOKUP,
-    BuiltinOperator_FULLY_CONNECTED,
-    BuiltinOperator_HASHTABLE_LOOKUP,
-    BuiltinOperator_L2_NORMALIZATION,
-    BuiltinOperator_L2_POOL_2D,
-    BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION,
-    BuiltinOperator_LOGISTIC,
-    BuiltinOperator_LSH_PROJECTION,
-    BuiltinOperator_LSTM,
-    BuiltinOperator_MAX_POOL_2D,
-    BuiltinOperator_MUL,
-    BuiltinOperator_RELU,
-    BuiltinOperator_RELU1,
-    BuiltinOperator_RELU6,
-    BuiltinOperator_RESHAPE,
-    BuiltinOperator_RESIZE_BILINEAR,
-    BuiltinOperator_RNN,
-    BuiltinOperator_SOFTMAX,
-    BuiltinOperator_SPACE_TO_DEPTH,
-    BuiltinOperator_SVDF,
-    BuiltinOperator_TANH,
-    BuiltinOperator_CONCAT_EMBEDDINGS,
-    BuiltinOperator_SKIP_GRAM,
-    BuiltinOperator_CALL,
-    BuiltinOperator_CUSTOM,
-    BuiltinOperator_EMBEDDING_LOOKUP_SPARSE
-  };
+      BuiltinOperator_ADD,
+      BuiltinOperator_AVERAGE_POOL_2D,
+      BuiltinOperator_CONCATENATION,
+      BuiltinOperator_CONV_2D,
+      BuiltinOperator_DEPTHWISE_CONV_2D,
+      BuiltinOperator_EMBEDDING_LOOKUP,
+      BuiltinOperator_FULLY_CONNECTED,
+      BuiltinOperator_HASHTABLE_LOOKUP,
+      BuiltinOperator_L2_NORMALIZATION,
+      BuiltinOperator_L2_POOL_2D,
+      BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION,
+      BuiltinOperator_LOGISTIC,
+      BuiltinOperator_LSH_PROJECTION,
+      BuiltinOperator_LSTM,
+      BuiltinOperator_MAX_POOL_2D,
+      BuiltinOperator_MUL,
+      BuiltinOperator_RELU,
+      BuiltinOperator_RELU1,
+      BuiltinOperator_RELU6,
+      BuiltinOperator_RESHAPE,
+      BuiltinOperator_RESIZE_BILINEAR,
+      BuiltinOperator_RNN,
+      BuiltinOperator_SOFTMAX,
+      BuiltinOperator_SPACE_TO_DEPTH,
+      BuiltinOperator_SVDF,
+      BuiltinOperator_TANH,
+      BuiltinOperator_CONCAT_EMBEDDINGS,
+      BuiltinOperator_SKIP_GRAM,
+      BuiltinOperator_CALL,
+      BuiltinOperator_CUSTOM,
+      BuiltinOperator_EMBEDDING_LOOKUP_SPARSE,
+      BuiltinOperator_PAD,
+      BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN,
+      BuiltinOperator_GATHER,
+      BuiltinOperator_BATCH_TO_SPACE_ND};
   return values;
 }
 
 inline const char **EnumNamesBuiltinOperator() {
-  static const char *names[] = {
-    "ADD",
-    "AVERAGE_POOL_2D",
-    "CONCATENATION",
-    "CONV_2D",
-    "DEPTHWISE_CONV_2D",
-    "",
-    "",
-    "EMBEDDING_LOOKUP",
-    "",
-    "FULLY_CONNECTED",
-    "HASHTABLE_LOOKUP",
-    "L2_NORMALIZATION",
-    "L2_POOL_2D",
-    "LOCAL_RESPONSE_NORMALIZATION",
-    "LOGISTIC",
-    "LSH_PROJECTION",
-    "LSTM",
-    "MAX_POOL_2D",
-    "MUL",
-    "RELU",
-    "RELU1",
-    "RELU6",
-    "RESHAPE",
-    "RESIZE_BILINEAR",
-    "RNN",
-    "SOFTMAX",
-    "SPACE_TO_DEPTH",
-    "SVDF",
-    "TANH",
-    "CONCAT_EMBEDDINGS",
-    "SKIP_GRAM",
-    "CALL",
-    "CUSTOM",
-    "EMBEDDING_LOOKUP_SPARSE",
-    nullptr
-  };
+  static const char *names[] = {"ADD",
+                                "AVERAGE_POOL_2D",
+                                "CONCATENATION",
+                                "CONV_2D",
+                                "DEPTHWISE_CONV_2D",
+                                "",
+                                "",
+                                "EMBEDDING_LOOKUP",
+                                "",
+                                "FULLY_CONNECTED",
+                                "HASHTABLE_LOOKUP",
+                                "L2_NORMALIZATION",
+                                "L2_POOL_2D",
+                                "LOCAL_RESPONSE_NORMALIZATION",
+                                "LOGISTIC",
+                                "LSH_PROJECTION",
+                                "LSTM",
+                                "MAX_POOL_2D",
+                                "MUL",
+                                "RELU",
+                                "RELU1",
+                                "RELU6",
+                                "RESHAPE",
+                                "RESIZE_BILINEAR",
+                                "RNN",
+                                "SOFTMAX",
+                                "SPACE_TO_DEPTH",
+                                "SVDF",
+                                "TANH",
+                                "CONCAT_EMBEDDINGS",
+                                "SKIP_GRAM",
+                                "CALL",
+                                "CUSTOM",
+                                "EMBEDDING_LOOKUP_SPARSE",
+                                "PAD",
+                                "UNIDIRECTIONAL_SEQUENCE_RNN",
+                                "GATHER",
+                                "BATCH_TO_SPACE_ND",
+                                nullptr};
   return names;
 }
 
@@ -275,64 +296,70 @@ enum BuiltinOptions {
   BuiltinOptions_SpaceToDepthOptions = 19,
   BuiltinOptions_EmbeddingLookupSparseOptions = 20,
   BuiltinOptions_MulOptions = 21,
+  BuiltinOptions_PadOptions = 22,
+  BuiltinOptions_GatherOptions = 23,
+  BuiltinOptions_BatchToSpaceNDOptions = 24,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_MulOptions
+  BuiltinOptions_MAX = BuiltinOptions_BatchToSpaceNDOptions
 };
 
-inline BuiltinOptions (&EnumValuesBuiltinOptions())[22] {
+inline BuiltinOptions (&EnumValuesBuiltinOptions())[25] {
   static BuiltinOptions values[] = {
-    BuiltinOptions_NONE,
-    BuiltinOptions_Conv2DOptions,
-    BuiltinOptions_DepthwiseConv2DOptions,
-    BuiltinOptions_ConcatEmbeddingsOptions,
-    BuiltinOptions_LSHProjectionOptions,
-    BuiltinOptions_Pool2DOptions,
-    BuiltinOptions_SVDFOptions,
-    BuiltinOptions_RNNOptions,
-    BuiltinOptions_FullyConnectedOptions,
-    BuiltinOptions_SoftmaxOptions,
-    BuiltinOptions_ConcatenationOptions,
-    BuiltinOptions_AddOptions,
-    BuiltinOptions_L2NormOptions,
-    BuiltinOptions_LocalResponseNormalizationOptions,
-    BuiltinOptions_LSTMOptions,
-    BuiltinOptions_ResizeBilinearOptions,
-    BuiltinOptions_CallOptions,
-    BuiltinOptions_ReshapeOptions,
-    BuiltinOptions_SkipGramOptions,
-    BuiltinOptions_SpaceToDepthOptions,
-    BuiltinOptions_EmbeddingLookupSparseOptions,
-    BuiltinOptions_MulOptions
-  };
+      BuiltinOptions_NONE,
+      BuiltinOptions_Conv2DOptions,
+      BuiltinOptions_DepthwiseConv2DOptions,
+      BuiltinOptions_ConcatEmbeddingsOptions,
+      BuiltinOptions_LSHProjectionOptions,
+      BuiltinOptions_Pool2DOptions,
+      BuiltinOptions_SVDFOptions,
+      BuiltinOptions_RNNOptions,
+      BuiltinOptions_FullyConnectedOptions,
+      BuiltinOptions_SoftmaxOptions,
+      BuiltinOptions_ConcatenationOptions,
+      BuiltinOptions_AddOptions,
+      BuiltinOptions_L2NormOptions,
+      BuiltinOptions_LocalResponseNormalizationOptions,
+      BuiltinOptions_LSTMOptions,
+      BuiltinOptions_ResizeBilinearOptions,
+      BuiltinOptions_CallOptions,
+      BuiltinOptions_ReshapeOptions,
+      BuiltinOptions_SkipGramOptions,
+      BuiltinOptions_SpaceToDepthOptions,
+      BuiltinOptions_EmbeddingLookupSparseOptions,
+      BuiltinOptions_MulOptions,
+      BuiltinOptions_PadOptions,
+      BuiltinOptions_GatherOptions,
+      BuiltinOptions_BatchToSpaceNDOptions};
   return values;
 }
 
 inline const char **EnumNamesBuiltinOptions() {
-  static const char *names[] = {
-    "NONE",
-    "Conv2DOptions",
-    "DepthwiseConv2DOptions",
-    "ConcatEmbeddingsOptions",
-    "LSHProjectionOptions",
-    "Pool2DOptions",
-    "SVDFOptions",
-    "RNNOptions",
-    "FullyConnectedOptions",
-    "SoftmaxOptions",
-    "ConcatenationOptions",
-    "AddOptions",
-    "L2NormOptions",
-    "LocalResponseNormalizationOptions",
-    "LSTMOptions",
-    "ResizeBilinearOptions",
-    "CallOptions",
-    "ReshapeOptions",
-    "SkipGramOptions",
-    "SpaceToDepthOptions",
-    "EmbeddingLookupSparseOptions",
-    "MulOptions",
-    nullptr
-  };
+  static const char *names[] = {"NONE",
+                                "Conv2DOptions",
+                                "DepthwiseConv2DOptions",
+                                "ConcatEmbeddingsOptions",
+                                "LSHProjectionOptions",
+                                "Pool2DOptions",
+                                "SVDFOptions",
+                                "RNNOptions",
+                                "FullyConnectedOptions",
+                                "SoftmaxOptions",
+                                "ConcatenationOptions",
+                                "AddOptions",
+                                "L2NormOptions",
+                                "LocalResponseNormalizationOptions",
+                                "LSTMOptions",
+                                "ResizeBilinearOptions",
+                                "CallOptions",
+                                "ReshapeOptions",
+                                "SkipGramOptions",
+                                "SpaceToDepthOptions",
+                                "EmbeddingLookupSparseOptions",
+                                "MulOptions",
+                                "PadOptions",
+                                "GatherOptions",
+                                "BatchToSpaceNDOptions",
+                                nullptr};
   return names;
 }
 
@@ -341,114 +368,166 @@ inline const char *EnumNameBuiltinOptions(BuiltinOptions e) {
   return EnumNamesBuiltinOptions()[index];
 }
 
-template<typename T> struct BuiltinOptionsTraits {
+template <typename T>
+struct BuiltinOptionsTraits {
   static const BuiltinOptions enum_value = BuiltinOptions_NONE;
 };
 
-template<> struct BuiltinOptionsTraits<Conv2DOptions> {
+template <>
+struct BuiltinOptionsTraits<Conv2DOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_Conv2DOptions;
 };
 
-template<> struct BuiltinOptionsTraits<DepthwiseConv2DOptions> {
-  static const BuiltinOptions enum_value = BuiltinOptions_DepthwiseConv2DOptions;
+template <>
+struct BuiltinOptionsTraits<DepthwiseConv2DOptions> {
+  static const BuiltinOptions enum_value =
+      BuiltinOptions_DepthwiseConv2DOptions;
 };
 
-template<> struct BuiltinOptionsTraits<ConcatEmbeddingsOptions> {
-  static const BuiltinOptions enum_value = BuiltinOptions_ConcatEmbeddingsOptions;
+template <>
+struct BuiltinOptionsTraits<ConcatEmbeddingsOptions> {
+  static const BuiltinOptions enum_value =
+      BuiltinOptions_ConcatEmbeddingsOptions;
 };
 
-template<> struct BuiltinOptionsTraits<LSHProjectionOptions> {
+template <>
+struct BuiltinOptionsTraits<LSHProjectionOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_LSHProjectionOptions;
 };
 
-template<> struct BuiltinOptionsTraits<Pool2DOptions> {
+template <>
+struct BuiltinOptionsTraits<Pool2DOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_Pool2DOptions;
 };
 
-template<> struct BuiltinOptionsTraits<SVDFOptions> {
+template <>
+struct BuiltinOptionsTraits<SVDFOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_SVDFOptions;
 };
 
-template<> struct BuiltinOptionsTraits<RNNOptions> {
+template <>
+struct BuiltinOptionsTraits<RNNOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_RNNOptions;
 };
 
-template<> struct BuiltinOptionsTraits<FullyConnectedOptions> {
+template <>
+struct BuiltinOptionsTraits<FullyConnectedOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_FullyConnectedOptions;
 };
 
-template<> struct BuiltinOptionsTraits<SoftmaxOptions> {
+template <>
+struct BuiltinOptionsTraits<SoftmaxOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_SoftmaxOptions;
 };
 
-template<> struct BuiltinOptionsTraits<ConcatenationOptions> {
+template <>
+struct BuiltinOptionsTraits<ConcatenationOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_ConcatenationOptions;
 };
 
-template<> struct BuiltinOptionsTraits<AddOptions> {
+template <>
+struct BuiltinOptionsTraits<AddOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_AddOptions;
 };
 
-template<> struct BuiltinOptionsTraits<L2NormOptions> {
+template <>
+struct BuiltinOptionsTraits<L2NormOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_L2NormOptions;
 };
 
-template<> struct BuiltinOptionsTraits<LocalResponseNormalizationOptions> {
-  static const BuiltinOptions enum_value = BuiltinOptions_LocalResponseNormalizationOptions;
+template <>
+struct BuiltinOptionsTraits<LocalResponseNormalizationOptions> {
+  static const BuiltinOptions enum_value =
+      BuiltinOptions_LocalResponseNormalizationOptions;
 };
 
-template<> struct BuiltinOptionsTraits<LSTMOptions> {
+template <>
+struct BuiltinOptionsTraits<LSTMOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_LSTMOptions;
 };
 
-template<> struct BuiltinOptionsTraits<ResizeBilinearOptions> {
+template <>
+struct BuiltinOptionsTraits<ResizeBilinearOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_ResizeBilinearOptions;
 };
 
-template<> struct BuiltinOptionsTraits<CallOptions> {
+template <>
+struct BuiltinOptionsTraits<CallOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_CallOptions;
 };
 
-template<> struct BuiltinOptionsTraits<ReshapeOptions> {
+template <>
+struct BuiltinOptionsTraits<ReshapeOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_ReshapeOptions;
 };
 
-template<> struct BuiltinOptionsTraits<SkipGramOptions> {
+template <>
+struct BuiltinOptionsTraits<SkipGramOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_SkipGramOptions;
 };
 
-template<> struct BuiltinOptionsTraits<SpaceToDepthOptions> {
+template <>
+struct BuiltinOptionsTraits<SpaceToDepthOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_SpaceToDepthOptions;
 };
 
-template<> struct BuiltinOptionsTraits<EmbeddingLookupSparseOptions> {
-  static const BuiltinOptions enum_value = BuiltinOptions_EmbeddingLookupSparseOptions;
+template <>
+struct BuiltinOptionsTraits<EmbeddingLookupSparseOptions> {
+  static const BuiltinOptions enum_value =
+      BuiltinOptions_EmbeddingLookupSparseOptions;
 };
 
-template<> struct BuiltinOptionsTraits<MulOptions> {
+template <>
+struct BuiltinOptionsTraits<MulOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_MulOptions;
 };
 
+template <>
+struct BuiltinOptionsTraits<PadOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_PadOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<GatherOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_GatherOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<BatchToSpaceNDOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_BatchToSpaceNDOptions;
+};
+
 struct BuiltinOptionsUnion {
   BuiltinOptions type;
   void *value;
 
   BuiltinOptionsUnion() : type(BuiltinOptions_NONE), value(nullptr) {}
-  BuiltinOptionsUnion(BuiltinOptionsUnion&& u) FLATBUFFERS_NOEXCEPT :
-    type(BuiltinOptions_NONE), value(nullptr)
-    { std::swap(type, u.type); std::swap(value, u.value); }
+  BuiltinOptionsUnion(BuiltinOptionsUnion &&u) FLATBUFFERS_NOEXCEPT
+      : type(BuiltinOptions_NONE),
+        value(nullptr) {
+    std::swap(type, u.type);
+    std::swap(value, u.value);
+  }
   BuiltinOptionsUnion(const BuiltinOptionsUnion &) FLATBUFFERS_NOEXCEPT;
-  BuiltinOptionsUnion &operator=(const BuiltinOptionsUnion &u) FLATBUFFERS_NOEXCEPT
-    { BuiltinOptionsUnion t(u); std::swap(type, t.type); std::swap(value, t.value); return *this; }
-  BuiltinOptionsUnion &operator=(BuiltinOptionsUnion &&u) FLATBUFFERS_NOEXCEPT
-    { std::swap(type, u.type); std::swap(value, u.value); return *this; }
+  BuiltinOptionsUnion &operator=(const BuiltinOptionsUnion &u)
+      FLATBUFFERS_NOEXCEPT {
+    BuiltinOptionsUnion t(u);
+    std::swap(type, t.type);
+    std::swap(value, t.value);
+    return *this;
+  }
+  BuiltinOptionsUnion &operator=(BuiltinOptionsUnion &&u) FLATBUFFERS_NOEXCEPT {
+    std::swap(type, u.type);
+    std::swap(value, u.value);
+    return *this;
+  }
   ~BuiltinOptionsUnion() { Reset(); }
 
   void Reset();
 
 #ifndef FLATBUFFERS_CPP98_STL
   template <typename T>
-  void Set(T&& val) {
+  void Set(T &&val) {
     Reset();
     type = BuiltinOptionsTraits<typename T::TableType>::enum_value;
     if (type != BuiltinOptions_NONE) {
@@ -457,181 +536,262 @@ struct BuiltinOptionsUnion {
   }
 #endif  // FLATBUFFERS_CPP98_STL
 
-  static void *UnPack(const void *obj, BuiltinOptions type, const flatbuffers::resolver_function_t *resolver);
-  flatbuffers::Offset<void> Pack(flatbuffers::FlatBufferBuilder &_fbb, const flatbuffers::rehasher_function_t *_rehasher = nullptr) const;
+  static void *UnPack(const void *obj, BuiltinOptions type,
+                      const flatbuffers::resolver_function_t *resolver);
+  flatbuffers::Offset<void> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr) const;
 
   Conv2DOptionsT *AsConv2DOptions() {
-    return type == BuiltinOptions_Conv2DOptions ?
-      reinterpret_cast<Conv2DOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_Conv2DOptions
+               ? reinterpret_cast<Conv2DOptionsT *>(value)
+               : nullptr;
   }
   const Conv2DOptionsT *AsConv2DOptions() const {
-    return type == BuiltinOptions_Conv2DOptions ?
-      reinterpret_cast<const Conv2DOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_Conv2DOptions
+               ? reinterpret_cast<const Conv2DOptionsT *>(value)
+               : nullptr;
   }
   DepthwiseConv2DOptionsT *AsDepthwiseConv2DOptions() {
-    return type == BuiltinOptions_DepthwiseConv2DOptions ?
-      reinterpret_cast<DepthwiseConv2DOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_DepthwiseConv2DOptions
+               ? reinterpret_cast<DepthwiseConv2DOptionsT *>(value)
+               : nullptr;
   }
   const DepthwiseConv2DOptionsT *AsDepthwiseConv2DOptions() const {
-    return type == BuiltinOptions_DepthwiseConv2DOptions ?
-      reinterpret_cast<const DepthwiseConv2DOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_DepthwiseConv2DOptions
+               ? reinterpret_cast<const DepthwiseConv2DOptionsT *>(value)
+               : nullptr;
   }
   ConcatEmbeddingsOptionsT *AsConcatEmbeddingsOptions() {
-    return type == BuiltinOptions_ConcatEmbeddingsOptions ?
-      reinterpret_cast<ConcatEmbeddingsOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_ConcatEmbeddingsOptions
+               ? reinterpret_cast<ConcatEmbeddingsOptionsT *>(value)
+               : nullptr;
   }
   const ConcatEmbeddingsOptionsT *AsConcatEmbeddingsOptions() const {
-    return type == BuiltinOptions_ConcatEmbeddingsOptions ?
-      reinterpret_cast<const ConcatEmbeddingsOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_ConcatEmbeddingsOptions
+               ? reinterpret_cast<const ConcatEmbeddingsOptionsT *>(value)
+               : nullptr;
   }
   LSHProjectionOptionsT *AsLSHProjectionOptions() {
-    return type == BuiltinOptions_LSHProjectionOptions ?
-      reinterpret_cast<LSHProjectionOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_LSHProjectionOptions
+               ? reinterpret_cast<LSHProjectionOptionsT *>(value)
+               : nullptr;
   }
   const LSHProjectionOptionsT *AsLSHProjectionOptions() const {
-    return type == BuiltinOptions_LSHProjectionOptions ?
-      reinterpret_cast<const LSHProjectionOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_LSHProjectionOptions
+               ? reinterpret_cast<const LSHProjectionOptionsT *>(value)
+               : nullptr;
   }
   Pool2DOptionsT *AsPool2DOptions() {
-    return type == BuiltinOptions_Pool2DOptions ?
-      reinterpret_cast<Pool2DOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_Pool2DOptions
+               ? reinterpret_cast<Pool2DOptionsT *>(value)
+               : nullptr;
   }
   const Pool2DOptionsT *AsPool2DOptions() const {
-    return type == BuiltinOptions_Pool2DOptions ?
-      reinterpret_cast<const Pool2DOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_Pool2DOptions
+               ? reinterpret_cast<const Pool2DOptionsT *>(value)
+               : nullptr;
   }
   SVDFOptionsT *AsSVDFOptions() {
-    return type == BuiltinOptions_SVDFOptions ?
-      reinterpret_cast<SVDFOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_SVDFOptions
+               ? reinterpret_cast<SVDFOptionsT *>(value)
+               : nullptr;
   }
   const SVDFOptionsT *AsSVDFOptions() const {
-    return type == BuiltinOptions_SVDFOptions ?
-      reinterpret_cast<const SVDFOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_SVDFOptions
+               ? reinterpret_cast<const SVDFOptionsT *>(value)
+               : nullptr;
   }
   RNNOptionsT *AsRNNOptions() {
-    return type == BuiltinOptions_RNNOptions ?
-      reinterpret_cast<RNNOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_RNNOptions
+               ? reinterpret_cast<RNNOptionsT *>(value)
+               : nullptr;
   }
   const RNNOptionsT *AsRNNOptions() const {
-    return type == BuiltinOptions_RNNOptions ?
-      reinterpret_cast<const RNNOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_RNNOptions
+               ? reinterpret_cast<const RNNOptionsT *>(value)
+               : nullptr;
   }
   FullyConnectedOptionsT *AsFullyConnectedOptions() {
-    return type == BuiltinOptions_FullyConnectedOptions ?
-      reinterpret_cast<FullyConnectedOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_FullyConnectedOptions
+               ? reinterpret_cast<FullyConnectedOptionsT *>(value)
+               : nullptr;
   }
   const FullyConnectedOptionsT *AsFullyConnectedOptions() const {
-    return type == BuiltinOptions_FullyConnectedOptions ?
-      reinterpret_cast<const FullyConnectedOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_FullyConnectedOptions
+               ? reinterpret_cast<const FullyConnectedOptionsT *>(value)
+               : nullptr;
   }
   SoftmaxOptionsT *AsSoftmaxOptions() {
-    return type == BuiltinOptions_SoftmaxOptions ?
-      reinterpret_cast<SoftmaxOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_SoftmaxOptions
+               ? reinterpret_cast<SoftmaxOptionsT *>(value)
+               : nullptr;
   }
   const SoftmaxOptionsT *AsSoftmaxOptions() const {
-    return type == BuiltinOptions_SoftmaxOptions ?
-      reinterpret_cast<const SoftmaxOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_SoftmaxOptions
+               ? reinterpret_cast<const SoftmaxOptionsT *>(value)
+               : nullptr;
   }
   ConcatenationOptionsT *AsConcatenationOptions() {
-    return type == BuiltinOptions_ConcatenationOptions ?
-      reinterpret_cast<ConcatenationOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_ConcatenationOptions
+               ? reinterpret_cast<ConcatenationOptionsT *>(value)
+               : nullptr;
   }
   const ConcatenationOptionsT *AsConcatenationOptions() const {
-    return type == BuiltinOptions_ConcatenationOptions ?
-      reinterpret_cast<const ConcatenationOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_ConcatenationOptions
+               ? reinterpret_cast<const ConcatenationOptionsT *>(value)
+               : nullptr;
   }
   AddOptionsT *AsAddOptions() {
-    return type == BuiltinOptions_AddOptions ?
-      reinterpret_cast<AddOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_AddOptions
+               ? reinterpret_cast<AddOptionsT *>(value)
+               : nullptr;
   }
   const AddOptionsT *AsAddOptions() const {
-    return type == BuiltinOptions_AddOptions ?
-      reinterpret_cast<const AddOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_AddOptions
+               ? reinterpret_cast<const AddOptionsT *>(value)
+               : nullptr;
   }
   L2NormOptionsT *AsL2NormOptions() {
-    return type == BuiltinOptions_L2NormOptions ?
-      reinterpret_cast<L2NormOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_L2NormOptions
+               ? reinterpret_cast<L2NormOptionsT *>(value)
+               : nullptr;
   }
   const L2NormOptionsT *AsL2NormOptions() const {
-    return type == BuiltinOptions_L2NormOptions ?
-      reinterpret_cast<const L2NormOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_L2NormOptions
+               ? reinterpret_cast<const L2NormOptionsT *>(value)
+               : nullptr;
   }
   LocalResponseNormalizationOptionsT *AsLocalResponseNormalizationOptions() {
-    return type == BuiltinOptions_LocalResponseNormalizationOptions ?
-      reinterpret_cast<LocalResponseNormalizationOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_LocalResponseNormalizationOptions
+               ? reinterpret_cast<LocalResponseNormalizationOptionsT *>(value)
+               : nullptr;
   }
-  const LocalResponseNormalizationOptionsT *AsLocalResponseNormalizationOptions() const {
-    return type == BuiltinOptions_LocalResponseNormalizationOptions ?
-      reinterpret_cast<const LocalResponseNormalizationOptionsT *>(value) : nullptr;
+  const LocalResponseNormalizationOptionsT *
+  AsLocalResponseNormalizationOptions() const {
+    return type == BuiltinOptions_LocalResponseNormalizationOptions
+               ? reinterpret_cast<const LocalResponseNormalizationOptionsT *>(
+                     value)
+               : nullptr;
   }
   LSTMOptionsT *AsLSTMOptions() {
-    return type == BuiltinOptions_LSTMOptions ?
-      reinterpret_cast<LSTMOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_LSTMOptions
+               ? reinterpret_cast<LSTMOptionsT *>(value)
+               : nullptr;
   }
   const LSTMOptionsT *AsLSTMOptions() const {
-    return type == BuiltinOptions_LSTMOptions ?
-      reinterpret_cast<const LSTMOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_LSTMOptions
+               ? reinterpret_cast<const LSTMOptionsT *>(value)
+               : nullptr;
   }
   ResizeBilinearOptionsT *AsResizeBilinearOptions() {
-    return type == BuiltinOptions_ResizeBilinearOptions ?
-      reinterpret_cast<ResizeBilinearOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_ResizeBilinearOptions
+               ? reinterpret_cast<ResizeBilinearOptionsT *>(value)
+               : nullptr;
   }
   const ResizeBilinearOptionsT *AsResizeBilinearOptions() const {
-    return type == BuiltinOptions_ResizeBilinearOptions ?
-      reinterpret_cast<const ResizeBilinearOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_ResizeBilinearOptions
+               ? reinterpret_cast<const ResizeBilinearOptionsT *>(value)
+               : nullptr;
   }
   CallOptionsT *AsCallOptions() {
-    return type == BuiltinOptions_CallOptions ?
-      reinterpret_cast<CallOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_CallOptions
+               ? reinterpret_cast<CallOptionsT *>(value)
+               : nullptr;
   }
   const CallOptionsT *AsCallOptions() const {
-    return type == BuiltinOptions_CallOptions ?
-      reinterpret_cast<const CallOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_CallOptions
+               ? reinterpret_cast<const CallOptionsT *>(value)
+               : nullptr;
   }
   ReshapeOptionsT *AsReshapeOptions() {
-    return type == BuiltinOptions_ReshapeOptions ?
-      reinterpret_cast<ReshapeOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_ReshapeOptions
+               ? reinterpret_cast<ReshapeOptionsT *>(value)
+               : nullptr;
   }
   const ReshapeOptionsT *AsReshapeOptions() const {
-    return type == BuiltinOptions_ReshapeOptions ?
-      reinterpret_cast<const ReshapeOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_ReshapeOptions
+               ? reinterpret_cast<const ReshapeOptionsT *>(value)
+               : nullptr;
   }
   SkipGramOptionsT *AsSkipGramOptions() {
-    return type == BuiltinOptions_SkipGramOptions ?
-      reinterpret_cast<SkipGramOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_SkipGramOptions
+               ? reinterpret_cast<SkipGramOptionsT *>(value)
+               : nullptr;
   }
   const SkipGramOptionsT *AsSkipGramOptions() const {
-    return type == BuiltinOptions_SkipGramOptions ?
-      reinterpret_cast<const SkipGramOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_SkipGramOptions
+               ? reinterpret_cast<const SkipGramOptionsT *>(value)
+               : nullptr;
   }
   SpaceToDepthOptionsT *AsSpaceToDepthOptions() {
-    return type == BuiltinOptions_SpaceToDepthOptions ?
-      reinterpret_cast<SpaceToDepthOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_SpaceToDepthOptions
+               ? reinterpret_cast<SpaceToDepthOptionsT *>(value)
+               : nullptr;
   }
   const SpaceToDepthOptionsT *AsSpaceToDepthOptions() const {
-    return type == BuiltinOptions_SpaceToDepthOptions ?
-      reinterpret_cast<const SpaceToDepthOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_SpaceToDepthOptions
+               ? reinterpret_cast<const SpaceToDepthOptionsT *>(value)
+               : nullptr;
   }
   EmbeddingLookupSparseOptionsT *AsEmbeddingLookupSparseOptions() {
-    return type == BuiltinOptions_EmbeddingLookupSparseOptions ?
-      reinterpret_cast<EmbeddingLookupSparseOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_EmbeddingLookupSparseOptions
+               ? reinterpret_cast<EmbeddingLookupSparseOptionsT *>(value)
+               : nullptr;
   }
   const EmbeddingLookupSparseOptionsT *AsEmbeddingLookupSparseOptions() const {
-    return type == BuiltinOptions_EmbeddingLookupSparseOptions ?
-      reinterpret_cast<const EmbeddingLookupSparseOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_EmbeddingLookupSparseOptions
+               ? reinterpret_cast<const EmbeddingLookupSparseOptionsT *>(value)
+               : nullptr;
   }
   MulOptionsT *AsMulOptions() {
-    return type == BuiltinOptions_MulOptions ?
-      reinterpret_cast<MulOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_MulOptions
+               ? reinterpret_cast<MulOptionsT *>(value)
+               : nullptr;
   }
   const MulOptionsT *AsMulOptions() const {
-    return type == BuiltinOptions_MulOptions ?
-      reinterpret_cast<const MulOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_MulOptions
+               ? reinterpret_cast<const MulOptionsT *>(value)
+               : nullptr;
+  }
+  PadOptionsT *AsPadOptions() {
+    return type == BuiltinOptions_PadOptions
+               ? reinterpret_cast<PadOptionsT *>(value)
+               : nullptr;
+  }
+  const PadOptionsT *AsPadOptions() const {
+    return type == BuiltinOptions_PadOptions
+               ? reinterpret_cast<const PadOptionsT *>(value)
+               : nullptr;
+  }
+  GatherOptionsT *AsGatherOptions() {
+    return type == BuiltinOptions_GatherOptions
+               ? reinterpret_cast<GatherOptionsT *>(value)
+               : nullptr;
+  }
+  const GatherOptionsT *AsGatherOptions() const {
+    return type == BuiltinOptions_GatherOptions
+               ? reinterpret_cast<const GatherOptionsT *>(value)
+               : nullptr;
+  }
+  BatchToSpaceNDOptionsT *AsBatchToSpaceNDOptions() {
+    return type == BuiltinOptions_BatchToSpaceNDOptions
+               ? reinterpret_cast<BatchToSpaceNDOptionsT *>(value)
+               : nullptr;
+  }
+  const BatchToSpaceNDOptionsT *AsBatchToSpaceNDOptions() const {
+    return type == BuiltinOptions_BatchToSpaceNDOptions
+               ? reinterpret_cast<const BatchToSpaceNDOptionsT *>(value)
+               : nullptr;
   }
 };
 
-bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type);
-bool VerifyBuiltinOptionsVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector<flatbuffers::Offset<void>> *values, const flatbuffers::Vector<uint8_t> *types);
+bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj,
+                          BuiltinOptions type);
+bool VerifyBuiltinOptionsVector(
+    flatbuffers::Verifier &verifier,
+    const flatbuffers::Vector<flatbuffers::Offset<void>> *values,
+    const flatbuffers::Vector<uint8_t> *types);
 
 enum Padding {
   Padding_SAME = 0,
@@ -641,19 +801,12 @@ enum Padding {
 };
 
 inline Padding (&EnumValuesPadding())[2] {
-  static Padding values[] = {
-    Padding_SAME,
-    Padding_VALID
-  };
+  static Padding values[] = {Padding_SAME, Padding_VALID};
   return values;
 }
 
 inline const char **EnumNamesPadding() {
-  static const char *names[] = {
-    "SAME",
-    "VALID",
-    nullptr
-  };
+  static const char *names[] = {"SAME", "VALID", nullptr};
   return names;
 }
 
@@ -675,26 +828,15 @@ enum ActivationFunctionType {
 
 inline ActivationFunctionType (&EnumValuesActivationFunctionType())[6] {
   static ActivationFunctionType values[] = {
-    ActivationFunctionType_NONE,
-    ActivationFunctionType_RELU,
-    ActivationFunctionType_RELU1,
-    ActivationFunctionType_RELU6,
-    ActivationFunctionType_TANH,
-    ActivationFunctionType_SIGN_BIT
-  };
+      ActivationFunctionType_NONE,  ActivationFunctionType_RELU,
+      ActivationFunctionType_RELU1, ActivationFunctionType_RELU6,
+      ActivationFunctionType_TANH,  ActivationFunctionType_SIGN_BIT};
   return values;
 }
 
 inline const char **EnumNamesActivationFunctionType() {
-  static const char *names[] = {
-    "NONE",
-    "RELU",
-    "RELU1",
-    "RELU6",
-    "TANH",
-    "SIGN_BIT",
-    nullptr
-  };
+  static const char *names[] = {"NONE", "RELU",     "RELU1", "RELU6",
+                                "TANH", "SIGN_BIT", nullptr};
   return names;
 }
 
@@ -712,21 +854,14 @@ enum LSHProjectionType {
 };
 
 inline LSHProjectionType (&EnumValuesLSHProjectionType())[3] {
-  static LSHProjectionType values[] = {
-    LSHProjectionType_UNKNOWN,
-    LSHProjectionType_SPARSE,
-    LSHProjectionType_DENSE
-  };
+  static LSHProjectionType values[] = {LSHProjectionType_UNKNOWN,
+                                       LSHProjectionType_SPARSE,
+                                       LSHProjectionType_DENSE};
   return values;
 }
 
 inline const char **EnumNamesLSHProjectionType() {
-  static const char *names[] = {
-    "UNKNOWN",
-    "SPARSE",
-    "DENSE",
-    nullptr
-  };
+  static const char *names[] = {"UNKNOWN", "SPARSE", "DENSE", nullptr};
   return names;
 }
 
@@ -744,21 +879,13 @@ enum CombinerType {
 };
 
 inline CombinerType (&EnumValuesCombinerType())[3] {
-  static CombinerType values[] = {
-    CombinerType_SUM,
-    CombinerType_MEAN,
-    CombinerType_SQRTN
-  };
+  static CombinerType values[] = {CombinerType_SUM, CombinerType_MEAN,
+                                  CombinerType_SQRTN};
   return values;
 }
 
 inline const char **EnumNamesCombinerType() {
-  static const char *names[] = {
-    "SUM",
-    "MEAN",
-    "SQRTN",
-    nullptr
-  };
+  static const char *names[] = {"SUM", "MEAN", "SQRTN", nullptr};
   return names;
 }
 
@@ -774,17 +901,12 @@ enum CustomOptionsFormat {
 };
 
 inline CustomOptionsFormat (&EnumValuesCustomOptionsFormat())[1] {
-  static CustomOptionsFormat values[] = {
-    CustomOptionsFormat_FLEXBUFFERS
-  };
+  static CustomOptionsFormat values[] = {CustomOptionsFormat_FLEXBUFFERS};
   return values;
 }
 
 inline const char **EnumNamesCustomOptionsFormat() {
-  static const char *names[] = {
-    "FLEXBUFFERS",
-    nullptr
-  };
+  static const char *names[] = {"FLEXBUFFERS", nullptr};
   return names;
 }
 
@@ -799,18 +921,13 @@ struct QuantizationParametersT : public flatbuffers::NativeTable {
   std::vector<float> max;
   std::vector<float> scale;
   std::vector<int64_t> zero_point;
-  QuantizationParametersT() {
-  }
+  QuantizationParametersT() {}
 };
 
-struct QuantizationParameters FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+struct QuantizationParameters FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
   typedef QuantizationParametersT NativeTableType;
-  enum {
-    VT_MIN = 4,
-    VT_MAX = 6,
-    VT_SCALE = 8,
-    VT_ZERO_POINT = 10
-  };
+  enum { VT_MIN = 4, VT_MAX = 6, VT_SCALE = 8, VT_ZERO_POINT = 10 };
   const flatbuffers::Vector<float> *min() const {
     return GetPointer<const flatbuffers::Vector<float> *>(VT_MIN);
   }
@@ -824,20 +941,20 @@ struct QuantizationParameters FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tab
     return GetPointer<const flatbuffers::Vector<int64_t> *>(VT_ZERO_POINT);
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyOffset(verifier, VT_MIN) &&
-           verifier.Verify(min()) &&
-           VerifyOffset(verifier, VT_MAX) &&
-           verifier.Verify(max()) &&
-           VerifyOffset(verifier, VT_SCALE) &&
-           verifier.Verify(scale()) &&
-           VerifyOffset(verifier, VT_ZERO_POINT) &&
-           verifier.Verify(zero_point()) &&
-           verifier.EndTable();
-  }
-  QuantizationParametersT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(QuantizationParametersT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<QuantizationParameters> Pack(flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+    return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_MIN) &&
+           verifier.Verify(min()) && VerifyOffset(verifier, VT_MAX) &&
+           verifier.Verify(max()) && VerifyOffset(verifier, VT_SCALE) &&
+           verifier.Verify(scale()) && VerifyOffset(verifier, VT_ZERO_POINT) &&
+           verifier.Verify(zero_point()) && verifier.EndTable();
+  }
+  QuantizationParametersT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      QuantizationParametersT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<QuantizationParameters> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct QuantizationParametersBuilder {
@@ -852,14 +969,16 @@ struct QuantizationParametersBuilder {
   void add_scale(flatbuffers::Offset<flatbuffers::Vector<float>> scale) {
     fbb_.AddOffset(QuantizationParameters::VT_SCALE, scale);
   }
-  void add_zero_point(flatbuffers::Offset<flatbuffers::Vector<int64_t>> zero_point) {
+  void add_zero_point(
+      flatbuffers::Offset<flatbuffers::Vector<int64_t>> zero_point) {
     fbb_.AddOffset(QuantizationParameters::VT_ZERO_POINT, zero_point);
   }
   explicit QuantizationParametersBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
-  QuantizationParametersBuilder &operator=(const QuantizationParametersBuilder &);
+  QuantizationParametersBuilder &operator=(
+      const QuantizationParametersBuilder &);
   flatbuffers::Offset<QuantizationParameters> Finish() {
     const auto end = fbb_.EndTable(start_);
     auto o = flatbuffers::Offset<QuantizationParameters>(end);
@@ -881,21 +1000,23 @@ inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(
   return builder_.Finish();
 }
 
-inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParametersDirect(
+inline flatbuffers::Offset<QuantizationParameters>
+CreateQuantizationParametersDirect(
     flatbuffers::FlatBufferBuilder &_fbb,
     const std::vector<float> *min = nullptr,
     const std::vector<float> *max = nullptr,
     const std::vector<float> *scale = nullptr,
     const std::vector<int64_t> *zero_point = nullptr) {
   return tflite::CreateQuantizationParameters(
-      _fbb,
-      min ? _fbb.CreateVector<float>(*min) : 0,
+      _fbb, min ? _fbb.CreateVector<float>(*min) : 0,
       max ? _fbb.CreateVector<float>(*max) : 0,
       scale ? _fbb.CreateVector<float>(*scale) : 0,
       zero_point ? _fbb.CreateVector<int64_t>(*zero_point) : 0);
 }
 
-flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(
+    flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct TensorT : public flatbuffers::NativeTable {
   typedef Tensor TableType;
@@ -904,10 +1025,7 @@ struct TensorT : public flatbuffers::NativeTable {
   uint32_t buffer;
   std::string name;
   std::unique_ptr<QuantizationParametersT> quantization;
-  TensorT()
-      : type(TensorType_FLOAT32),
-        buffer(0) {
-  }
+  TensorT() : type(TensorType_FLOAT32), buffer(0) {}
 };
 
 struct Tensor FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
@@ -925,9 +1043,7 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   TensorType type() const {
     return static_cast<TensorType>(GetField<int8_t>(VT_TYPE, 0));
   }
-  uint32_t buffer() const {
-    return GetField<uint32_t>(VT_BUFFER, 0);
-  }
+  uint32_t buffer() const { return GetField<uint32_t>(VT_BUFFER, 0); }
   const flatbuffers::String *name() const {
     return GetPointer<const flatbuffers::String *>(VT_NAME);
   }
@@ -935,20 +1051,20 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
     return GetPointer<const QuantizationParameters *>(VT_QUANTIZATION);
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyOffset(verifier, VT_SHAPE) &&
-           verifier.Verify(shape()) &&
-           VerifyField<int8_t>(verifier, VT_TYPE) &&
+    return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_SHAPE) &&
+           verifier.Verify(shape()) && VerifyField<int8_t>(verifier, VT_TYPE) &&
            VerifyField<uint32_t>(verifier, VT_BUFFER) &&
-           VerifyOffset(verifier, VT_NAME) &&
-           verifier.Verify(name()) &&
+           VerifyOffset(verifier, VT_NAME) && verifier.Verify(name()) &&
            VerifyOffset(verifier, VT_QUANTIZATION) &&
-           verifier.VerifyTable(quantization()) &&
-           verifier.EndTable();
-  }
-  TensorT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(TensorT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<Tensor> Pack(flatbuffers::FlatBufferBuilder &_fbb, const TensorT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+           verifier.VerifyTable(quantization()) && verifier.EndTable();
+  }
+  TensorT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(TensorT *_o, const flatbuffers::resolver_function_t *_resolver =
+                                 nullptr) const;
+  static flatbuffers::Offset<Tensor> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const TensorT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct TensorBuilder {
@@ -966,11 +1082,11 @@ struct TensorBuilder {
   void add_name(flatbuffers::Offset<flatbuffers::String> name) {
     fbb_.AddOffset(Tensor::VT_NAME, name);
   }
-  void add_quantization(flatbuffers::Offset<QuantizationParameters> quantization) {
+  void add_quantization(
+      flatbuffers::Offset<QuantizationParameters> quantization) {
     fbb_.AddOffset(Tensor::VT_QUANTIZATION, quantization);
   }
-  explicit TensorBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+  explicit TensorBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   TensorBuilder &operator=(const TensorBuilder &);
@@ -984,8 +1100,7 @@ struct TensorBuilder {
 inline flatbuffers::Offset<Tensor> CreateTensor(
     flatbuffers::FlatBufferBuilder &_fbb,
     flatbuffers::Offset<flatbuffers::Vector<int32_t>> shape = 0,
-    TensorType type = TensorType_FLOAT32,
-    uint32_t buffer = 0,
+    TensorType type = TensorType_FLOAT32, uint32_t buffer = 0,
     flatbuffers::Offset<flatbuffers::String> name = 0,
     flatbuffers::Offset<QuantizationParameters> quantization = 0) {
   TensorBuilder builder_(_fbb);
@@ -1000,20 +1115,17 @@ inline flatbuffers::Offset<Tensor> CreateTensor(
 inline flatbuffers::Offset<Tensor> CreateTensorDirect(
     flatbuffers::FlatBufferBuilder &_fbb,
     const std::vector<int32_t> *shape = nullptr,
-    TensorType type = TensorType_FLOAT32,
-    uint32_t buffer = 0,
+    TensorType type = TensorType_FLOAT32, uint32_t buffer = 0,
     const char *name = nullptr,
     flatbuffers::Offset<QuantizationParameters> quantization = 0) {
   return tflite::CreateTensor(
-      _fbb,
-      shape ? _fbb.CreateVector<int32_t>(*shape) : 0,
-      type,
-      buffer,
-      name ? _fbb.CreateString(name) : 0,
-      quantization);
+      _fbb, shape ? _fbb.CreateVector<int32_t>(*shape) : 0, type, buffer,
+      name ? _fbb.CreateString(name) : 0, quantization);
 }
 
-flatbuffers::Offset<Tensor> CreateTensor(flatbuffers::FlatBufferBuilder &_fbb, const TensorT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<Tensor> CreateTensor(
+    flatbuffers::FlatBufferBuilder &_fbb, const TensorT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct Conv2DOptionsT : public flatbuffers::NativeTable {
   typedef Conv2DOptions TableType;
@@ -1025,8 +1137,7 @@ struct Conv2DOptionsT : public flatbuffers::NativeTable {
       : padding(Padding_SAME),
         stride_w(0),
         stride_h(0),
-        fused_activation_function(ActivationFunctionType_NONE) {
-  }
+        fused_activation_function(ActivationFunctionType_NONE) {}
 };
 
 struct Conv2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
@@ -1040,14 +1151,11 @@ struct Conv2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   Padding padding() const {
     return static_cast<Padding>(GetField<int8_t>(VT_PADDING, 0));
   }
-  int32_t stride_w() const {
-    return GetField<int32_t>(VT_STRIDE_W, 0);
-  }
-  int32_t stride_h() const {
-    return GetField<int32_t>(VT_STRIDE_H, 0);
-  }
+  int32_t stride_w() const { return GetField<int32_t>(VT_STRIDE_W, 0); }
+  int32_t stride_h() const { return GetField<int32_t>(VT_STRIDE_H, 0); }
   ActivationFunctionType fused_activation_function() const {
-    return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
@@ -1057,16 +1165,22 @@ struct Conv2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            verifier.EndTable();
   }
-  Conv2DOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(Conv2DOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<Conv2DOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  Conv2DOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      Conv2DOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<Conv2DOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct Conv2DOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
   void add_padding(Padding padding) {
-    fbb_.AddElement<int8_t>(Conv2DOptions::VT_PADDING, static_cast<int8_t>(padding), 0);
+    fbb_.AddElement<int8_t>(Conv2DOptions::VT_PADDING,
+                            static_cast<int8_t>(padding), 0);
   }
   void add_stride_w(int32_t stride_w) {
     fbb_.AddElement<int32_t>(Conv2DOptions::VT_STRIDE_W, stride_w, 0);
@@ -1074,11 +1188,13 @@ struct Conv2DOptionsBuilder {
   void add_stride_h(int32_t stride_h) {
     fbb_.AddElement<int32_t>(Conv2DOptions::VT_STRIDE_H, stride_h, 0);
   }
-  void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
-    fbb_.AddElement<int8_t>(Conv2DOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(Conv2DOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
   }
   explicit Conv2DOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   Conv2DOptionsBuilder &operator=(const Conv2DOptionsBuilder &);
@@ -1090,11 +1206,10 @@ struct Conv2DOptionsBuilder {
 };
 
 inline flatbuffers::Offset<Conv2DOptions> CreateConv2DOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    Padding padding = Padding_SAME,
-    int32_t stride_w = 0,
-    int32_t stride_h = 0,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) {
+    flatbuffers::FlatBufferBuilder &_fbb, Padding padding = Padding_SAME,
+    int32_t stride_w = 0, int32_t stride_h = 0,
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
   Conv2DOptionsBuilder builder_(_fbb);
   builder_.add_stride_h(stride_h);
   builder_.add_stride_w(stride_w);
@@ -1103,7 +1218,9 @@ inline flatbuffers::Offset<Conv2DOptions> CreateConv2DOptions(
   return builder_.Finish();
 }
 
-flatbuffers::Offset<Conv2DOptions> CreateConv2DOptions(flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<Conv2DOptions> CreateConv2DOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct Pool2DOptionsT : public flatbuffers::NativeTable {
   typedef Pool2DOptions TableType;
@@ -1119,8 +1236,7 @@ struct Pool2DOptionsT : public flatbuffers::NativeTable {
         stride_h(0),
         filter_width(0),
         filter_height(0),
-        fused_activation_function(ActivationFunctionType_NONE) {
-  }
+        fused_activation_function(ActivationFunctionType_NONE) {}
 };
 
 struct Pool2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
@@ -1136,20 +1252,15 @@ struct Pool2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   Padding padding() const {
     return static_cast<Padding>(GetField<int8_t>(VT_PADDING, 0));
   }
-  int32_t stride_w() const {
-    return GetField<int32_t>(VT_STRIDE_W, 0);
-  }
-  int32_t stride_h() const {
-    return GetField<int32_t>(VT_STRIDE_H, 0);
-  }
-  int32_t filter_width() const {
-    return GetField<int32_t>(VT_FILTER_WIDTH, 0);
-  }
+  int32_t stride_w() const { return GetField<int32_t>(VT_STRIDE_W, 0); }
+  int32_t stride_h() const { return GetField<int32_t>(VT_STRIDE_H, 0); }
+  int32_t filter_width() const { return GetField<int32_t>(VT_FILTER_WIDTH, 0); }
   int32_t filter_height() const {
     return GetField<int32_t>(VT_FILTER_HEIGHT, 0);
   }
   ActivationFunctionType fused_activation_function() const {
-    return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
@@ -1161,16 +1272,22 @@ struct Pool2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            verifier.EndTable();
   }
-  Pool2DOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(Pool2DOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<Pool2DOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const Pool2DOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  Pool2DOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      Pool2DOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<Pool2DOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const Pool2DOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct Pool2DOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
   void add_padding(Padding padding) {
-    fbb_.AddElement<int8_t>(Pool2DOptions::VT_PADDING, static_cast<int8_t>(padding), 0);
+    fbb_.AddElement<int8_t>(Pool2DOptions::VT_PADDING,
+                            static_cast<int8_t>(padding), 0);
   }
   void add_stride_w(int32_t stride_w) {
     fbb_.AddElement<int32_t>(Pool2DOptions::VT_STRIDE_W, stride_w, 0);
@@ -1184,11 +1301,13 @@ struct Pool2DOptionsBuilder {
   void add_filter_height(int32_t filter_height) {
     fbb_.AddElement<int32_t>(Pool2DOptions::VT_FILTER_HEIGHT, filter_height, 0);
   }
-  void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
-    fbb_.AddElement<int8_t>(Pool2DOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(Pool2DOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
   }
   explicit Pool2DOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   Pool2DOptionsBuilder &operator=(const Pool2DOptionsBuilder &);
@@ -1200,13 +1319,11 @@ struct Pool2DOptionsBuilder {
 };
 
 inline flatbuffers::Offset<Pool2DOptions> CreatePool2DOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    Padding padding = Padding_SAME,
-    int32_t stride_w = 0,
-    int32_t stride_h = 0,
-    int32_t filter_width = 0,
+    flatbuffers::FlatBufferBuilder &_fbb, Padding padding = Padding_SAME,
+    int32_t stride_w = 0, int32_t stride_h = 0, int32_t filter_width = 0,
     int32_t filter_height = 0,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) {
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
   Pool2DOptionsBuilder builder_(_fbb);
   builder_.add_filter_height(filter_height);
   builder_.add_filter_width(filter_width);
@@ -1217,7 +1334,9 @@ inline flatbuffers::Offset<Pool2DOptions> CreatePool2DOptions(
   return builder_.Finish();
 }
 
-flatbuffers::Offset<Pool2DOptions> CreatePool2DOptions(flatbuffers::FlatBufferBuilder &_fbb, const Pool2DOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<Pool2DOptions> CreatePool2DOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const Pool2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct DepthwiseConv2DOptionsT : public flatbuffers::NativeTable {
   typedef DepthwiseConv2DOptions TableType;
@@ -1231,11 +1350,11 @@ struct DepthwiseConv2DOptionsT : public flatbuffers::NativeTable {
         stride_w(0),
         stride_h(0),
         depth_multiplier(0),
-        fused_activation_function(ActivationFunctionType_NONE) {
-  }
+        fused_activation_function(ActivationFunctionType_NONE) {}
 };
 
-struct DepthwiseConv2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+struct DepthwiseConv2DOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
   typedef DepthwiseConv2DOptionsT NativeTableType;
   enum {
     VT_PADDING = 4,
@@ -1247,17 +1366,14 @@ struct DepthwiseConv2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tab
   Padding padding() const {
     return static_cast<Padding>(GetField<int8_t>(VT_PADDING, 0));
   }
-  int32_t stride_w() const {
-    return GetField<int32_t>(VT_STRIDE_W, 0);
-  }
-  int32_t stride_h() const {
-    return GetField<int32_t>(VT_STRIDE_H, 0);
-  }
+  int32_t stride_w() const { return GetField<int32_t>(VT_STRIDE_W, 0); }
+  int32_t stride_h() const { return GetField<int32_t>(VT_STRIDE_H, 0); }
   int32_t depth_multiplier() const {
     return GetField<int32_t>(VT_DEPTH_MULTIPLIER, 0);
   }
   ActivationFunctionType fused_activation_function() const {
-    return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
@@ -1268,16 +1384,22 @@ struct DepthwiseConv2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tab
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            verifier.EndTable();
   }
-  DepthwiseConv2DOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(DepthwiseConv2DOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<DepthwiseConv2DOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const DepthwiseConv2DOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  DepthwiseConv2DOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      DepthwiseConv2DOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<DepthwiseConv2DOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const DepthwiseConv2DOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct DepthwiseConv2DOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
   void add_padding(Padding padding) {
-    fbb_.AddElement<int8_t>(DepthwiseConv2DOptions::VT_PADDING, static_cast<int8_t>(padding), 0);
+    fbb_.AddElement<int8_t>(DepthwiseConv2DOptions::VT_PADDING,
+                            static_cast<int8_t>(padding), 0);
   }
   void add_stride_w(int32_t stride_w) {
     fbb_.AddElement<int32_t>(DepthwiseConv2DOptions::VT_STRIDE_W, stride_w, 0);
@@ -1286,16 +1408,21 @@ struct DepthwiseConv2DOptionsBuilder {
     fbb_.AddElement<int32_t>(DepthwiseConv2DOptions::VT_STRIDE_H, stride_h, 0);
   }
   void add_depth_multiplier(int32_t depth_multiplier) {
-    fbb_.AddElement<int32_t>(DepthwiseConv2DOptions::VT_DEPTH_MULTIPLIER, depth_multiplier, 0);
+    fbb_.AddElement<int32_t>(DepthwiseConv2DOptions::VT_DEPTH_MULTIPLIER,
+                             depth_multiplier, 0);
   }
-  void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
-    fbb_.AddElement<int8_t>(DepthwiseConv2DOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(
+        DepthwiseConv2DOptions::VT_FUSED_ACTIVATION_FUNCTION,
+        static_cast<int8_t>(fused_activation_function), 0);
   }
   explicit DepthwiseConv2DOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
-  DepthwiseConv2DOptionsBuilder &operator=(const DepthwiseConv2DOptionsBuilder &);
+  DepthwiseConv2DOptionsBuilder &operator=(
+      const DepthwiseConv2DOptionsBuilder &);
   flatbuffers::Offset<DepthwiseConv2DOptions> Finish() {
     const auto end = fbb_.EndTable(start_);
     auto o = flatbuffers::Offset<DepthwiseConv2DOptions>(end);
@@ -1304,12 +1431,10 @@ struct DepthwiseConv2DOptionsBuilder {
 };
 
 inline flatbuffers::Offset<DepthwiseConv2DOptions> CreateDepthwiseConv2DOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    Padding padding = Padding_SAME,
-    int32_t stride_w = 0,
-    int32_t stride_h = 0,
-    int32_t depth_multiplier = 0,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) {
+    flatbuffers::FlatBufferBuilder &_fbb, Padding padding = Padding_SAME,
+    int32_t stride_w = 0, int32_t stride_h = 0, int32_t depth_multiplier = 0,
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
   DepthwiseConv2DOptionsBuilder builder_(_fbb);
   builder_.add_depth_multiplier(depth_multiplier);
   builder_.add_stride_h(stride_h);
@@ -1319,33 +1444,34 @@ inline flatbuffers::Offset<DepthwiseConv2DOptions> CreateDepthwiseConv2DOptions(
   return builder_.Finish();
 }
 
-flatbuffers::Offset<DepthwiseConv2DOptions> CreateDepthwiseConv2DOptions(flatbuffers::FlatBufferBuilder &_fbb, const DepthwiseConv2DOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<DepthwiseConv2DOptions> CreateDepthwiseConv2DOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const DepthwiseConv2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct ConcatEmbeddingsOptionsT : public flatbuffers::NativeTable {
   typedef ConcatEmbeddingsOptions TableType;
   int32_t num_channels;
   std::vector<int32_t> num_columns_per_channel;
   std::vector<int32_t> embedding_dim_per_channel;
-  ConcatEmbeddingsOptionsT()
-      : num_channels(0) {
-  }
+  ConcatEmbeddingsOptionsT() : num_channels(0) {}
 };
 
-struct ConcatEmbeddingsOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+struct ConcatEmbeddingsOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
   typedef ConcatEmbeddingsOptionsT NativeTableType;
   enum {
     VT_NUM_CHANNELS = 4,
     VT_NUM_COLUMNS_PER_CHANNEL = 6,
     VT_EMBEDDING_DIM_PER_CHANNEL = 8
   };
-  int32_t num_channels() const {
-    return GetField<int32_t>(VT_NUM_CHANNELS, 0);
-  }
+  int32_t num_channels() const { return GetField<int32_t>(VT_NUM_CHANNELS, 0); }
   const flatbuffers::Vector<int32_t> *num_columns_per_channel() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_NUM_COLUMNS_PER_CHANNEL);
+    return GetPointer<const flatbuffers::Vector<int32_t> *>(
+        VT_NUM_COLUMNS_PER_CHANNEL);
   }
   const flatbuffers::Vector<int32_t> *embedding_dim_per_channel() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_EMBEDDING_DIM_PER_CHANNEL);
+    return GetPointer<const flatbuffers::Vector<int32_t> *>(
+        VT_EMBEDDING_DIM_PER_CHANNEL);
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
@@ -1353,31 +1479,43 @@ struct ConcatEmbeddingsOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Ta
            VerifyOffset(verifier, VT_NUM_COLUMNS_PER_CHANNEL) &&
            verifier.Verify(num_columns_per_channel()) &&
            VerifyOffset(verifier, VT_EMBEDDING_DIM_PER_CHANNEL) &&
-           verifier.Verify(embedding_dim_per_channel()) &&
-           verifier.EndTable();
-  }
-  ConcatEmbeddingsOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(ConcatEmbeddingsOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<ConcatEmbeddingsOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const ConcatEmbeddingsOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+           verifier.Verify(embedding_dim_per_channel()) && verifier.EndTable();
+  }
+  ConcatEmbeddingsOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      ConcatEmbeddingsOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<ConcatEmbeddingsOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const ConcatEmbeddingsOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct ConcatEmbeddingsOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
   void add_num_channels(int32_t num_channels) {
-    fbb_.AddElement<int32_t>(ConcatEmbeddingsOptions::VT_NUM_CHANNELS, num_channels, 0);
+    fbb_.AddElement<int32_t>(ConcatEmbeddingsOptions::VT_NUM_CHANNELS,
+                             num_channels, 0);
   }
-  void add_num_columns_per_channel(flatbuffers::Offset<flatbuffers::Vector<int32_t>> num_columns_per_channel) {
-    fbb_.AddOffset(ConcatEmbeddingsOptions::VT_NUM_COLUMNS_PER_CHANNEL, num_columns_per_channel);
+  void add_num_columns_per_channel(
+      flatbuffers::Offset<flatbuffers::Vector<int32_t>>
+          num_columns_per_channel) {
+    fbb_.AddOffset(ConcatEmbeddingsOptions::VT_NUM_COLUMNS_PER_CHANNEL,
+                   num_columns_per_channel);
   }
-  void add_embedding_dim_per_channel(flatbuffers::Offset<flatbuffers::Vector<int32_t>> embedding_dim_per_channel) {
-    fbb_.AddOffset(ConcatEmbeddingsOptions::VT_EMBEDDING_DIM_PER_CHANNEL, embedding_dim_per_channel);
+  void add_embedding_dim_per_channel(
+      flatbuffers::Offset<flatbuffers::Vector<int32_t>>
+          embedding_dim_per_channel) {
+    fbb_.AddOffset(ConcatEmbeddingsOptions::VT_EMBEDDING_DIM_PER_CHANNEL,
+                   embedding_dim_per_channel);
   }
   explicit ConcatEmbeddingsOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
-  ConcatEmbeddingsOptionsBuilder &operator=(const ConcatEmbeddingsOptionsBuilder &);
+  ConcatEmbeddingsOptionsBuilder &operator=(
+      const ConcatEmbeddingsOptionsBuilder &);
   flatbuffers::Offset<ConcatEmbeddingsOptions> Finish() {
     const auto end = fbb_.EndTable(start_);
     auto o = flatbuffers::Offset<ConcatEmbeddingsOptions>(end);
@@ -1385,11 +1523,13 @@ struct ConcatEmbeddingsOptionsBuilder {
   }
 };
 
-inline flatbuffers::Offset<ConcatEmbeddingsOptions> CreateConcatEmbeddingsOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    int32_t num_channels = 0,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> num_columns_per_channel = 0,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> embedding_dim_per_channel = 0) {
+inline flatbuffers::Offset<ConcatEmbeddingsOptions>
+CreateConcatEmbeddingsOptions(flatbuffers::FlatBufferBuilder &_fbb,
+                              int32_t num_channels = 0,
+                              flatbuffers::Offset<flatbuffers::Vector<int32_t>>
+                                  num_columns_per_channel = 0,
+                              flatbuffers::Offset<flatbuffers::Vector<int32_t>>
+                                  embedding_dim_per_channel = 0) {
   ConcatEmbeddingsOptionsBuilder builder_(_fbb);
   builder_.add_embedding_dim_per_channel(embedding_dim_per_channel);
   builder_.add_num_columns_per_channel(num_columns_per_channel);
@@ -1397,54 +1537,61 @@ inline flatbuffers::Offset<ConcatEmbeddingsOptions> CreateConcatEmbeddingsOption
   return builder_.Finish();
 }
 
-inline flatbuffers::Offset<ConcatEmbeddingsOptions> CreateConcatEmbeddingsOptionsDirect(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    int32_t num_channels = 0,
+inline flatbuffers::Offset<ConcatEmbeddingsOptions>
+CreateConcatEmbeddingsOptionsDirect(
+    flatbuffers::FlatBufferBuilder &_fbb, int32_t num_channels = 0,
     const std::vector<int32_t> *num_columns_per_channel = nullptr,
     const std::vector<int32_t> *embedding_dim_per_channel = nullptr) {
   return tflite::CreateConcatEmbeddingsOptions(
-      _fbb,
-      num_channels,
-      num_columns_per_channel ? _fbb.CreateVector<int32_t>(*num_columns_per_channel) : 0,
-      embedding_dim_per_channel ? _fbb.CreateVector<int32_t>(*embedding_dim_per_channel) : 0);
+      _fbb, num_channels,
+      num_columns_per_channel
+          ? _fbb.CreateVector<int32_t>(*num_columns_per_channel)
+          : 0,
+      embedding_dim_per_channel
+          ? _fbb.CreateVector<int32_t>(*embedding_dim_per_channel)
+          : 0);
 }
 
-flatbuffers::Offset<ConcatEmbeddingsOptions> CreateConcatEmbeddingsOptions(flatbuffers::FlatBufferBuilder &_fbb, const ConcatEmbeddingsOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<ConcatEmbeddingsOptions> CreateConcatEmbeddingsOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ConcatEmbeddingsOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct LSHProjectionOptionsT : public flatbuffers::NativeTable {
   typedef LSHProjectionOptions TableType;
   LSHProjectionType type;
-  LSHProjectionOptionsT()
-      : type(LSHProjectionType_UNKNOWN) {
-  }
+  LSHProjectionOptionsT() : type(LSHProjectionType_UNKNOWN) {}
 };
 
-struct LSHProjectionOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+struct LSHProjectionOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
   typedef LSHProjectionOptionsT NativeTableType;
-  enum {
-    VT_TYPE = 4
-  };
+  enum { VT_TYPE = 4 };
   LSHProjectionType type() const {
     return static_cast<LSHProjectionType>(GetField<int8_t>(VT_TYPE, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
-           VerifyField<int8_t>(verifier, VT_TYPE) &&
-           verifier.EndTable();
-  }
-  LSHProjectionOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(LSHProjectionOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<LSHProjectionOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const LSHProjectionOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+           VerifyField<int8_t>(verifier, VT_TYPE) && verifier.EndTable();
+  }
+  LSHProjectionOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      LSHProjectionOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<LSHProjectionOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const LSHProjectionOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct LSHProjectionOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
   void add_type(LSHProjectionType type) {
-    fbb_.AddElement<int8_t>(LSHProjectionOptions::VT_TYPE, static_cast<int8_t>(type), 0);
+    fbb_.AddElement<int8_t>(LSHProjectionOptions::VT_TYPE,
+                            static_cast<int8_t>(type), 0);
   }
   explicit LSHProjectionOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   LSHProjectionOptionsBuilder &operator=(const LSHProjectionOptionsBuilder &);
@@ -1463,29 +1610,25 @@ inline flatbuffers::Offset<LSHProjectionOptions> CreateLSHProjectionOptions(
   return builder_.Finish();
 }
 
-flatbuffers::Offset<LSHProjectionOptions> CreateLSHProjectionOptions(flatbuffers::FlatBufferBuilder &_fbb, const LSHProjectionOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<LSHProjectionOptions> CreateLSHProjectionOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const LSHProjectionOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct SVDFOptionsT : public flatbuffers::NativeTable {
   typedef SVDFOptions TableType;
   int32_t rank;
   ActivationFunctionType fused_activation_function;
   SVDFOptionsT()
-      : rank(0),
-        fused_activation_function(ActivationFunctionType_NONE) {
-  }
+      : rank(0), fused_activation_function(ActivationFunctionType_NONE) {}
 };
 
 struct SVDFOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef SVDFOptionsT NativeTableType;
-  enum {
-    VT_RANK = 4,
-    VT_FUSED_ACTIVATION_FUNCTION = 6
-  };
-  int32_t rank() const {
-    return GetField<int32_t>(VT_RANK, 0);
-  }
+  enum { VT_RANK = 4, VT_FUSED_ACTIVATION_FUNCTION = 6 };
+  int32_t rank() const { return GetField<int32_t>(VT_RANK, 0); }
   ActivationFunctionType fused_activation_function() const {
-    return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
@@ -1493,9 +1636,14 @@ struct SVDFOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            verifier.EndTable();
   }
-  SVDFOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(SVDFOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<SVDFOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const SVDFOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  SVDFOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      SVDFOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SVDFOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const SVDFOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct SVDFOptionsBuilder {
@@ -1504,11 +1652,13 @@ struct SVDFOptionsBuilder {
   void add_rank(int32_t rank) {
     fbb_.AddElement<int32_t>(SVDFOptions::VT_RANK, rank, 0);
   }
-  void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
-    fbb_.AddElement<int8_t>(SVDFOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(SVDFOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
   }
   explicit SVDFOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   SVDFOptionsBuilder &operator=(const SVDFOptionsBuilder &);
@@ -1520,51 +1670,57 @@ struct SVDFOptionsBuilder {
 };
 
 inline flatbuffers::Offset<SVDFOptions> CreateSVDFOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    int32_t rank = 0,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) {
+    flatbuffers::FlatBufferBuilder &_fbb, int32_t rank = 0,
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
   SVDFOptionsBuilder builder_(_fbb);
   builder_.add_rank(rank);
   builder_.add_fused_activation_function(fused_activation_function);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<SVDFOptions> CreateSVDFOptions(flatbuffers::FlatBufferBuilder &_fbb, const SVDFOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<SVDFOptions> CreateSVDFOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SVDFOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct RNNOptionsT : public flatbuffers::NativeTable {
   typedef RNNOptions TableType;
   ActivationFunctionType fused_activation_function;
-  RNNOptionsT()
-      : fused_activation_function(ActivationFunctionType_NONE) {
-  }
+  RNNOptionsT() : fused_activation_function(ActivationFunctionType_NONE) {}
 };
 
 struct RNNOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef RNNOptionsT NativeTableType;
-  enum {
-    VT_FUSED_ACTIVATION_FUNCTION = 4
-  };
+  enum { VT_FUSED_ACTIVATION_FUNCTION = 4 };
   ActivationFunctionType fused_activation_function() const {
-    return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            verifier.EndTable();
   }
-  RNNOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(RNNOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<RNNOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const RNNOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  RNNOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      RNNOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<RNNOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const RNNOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct RNNOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
-    fbb_.AddElement<int8_t>(RNNOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(RNNOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
   }
   explicit RNNOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   RNNOptionsBuilder &operator=(const RNNOptionsBuilder &);
@@ -1577,48 +1733,57 @@ struct RNNOptionsBuilder {
 
 inline flatbuffers::Offset<RNNOptions> CreateRNNOptions(
     flatbuffers::FlatBufferBuilder &_fbb,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) {
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
   RNNOptionsBuilder builder_(_fbb);
   builder_.add_fused_activation_function(fused_activation_function);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<RNNOptions> CreateRNNOptions(flatbuffers::FlatBufferBuilder &_fbb, const RNNOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<RNNOptions> CreateRNNOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const RNNOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct FullyConnectedOptionsT : public flatbuffers::NativeTable {
   typedef FullyConnectedOptions TableType;
   ActivationFunctionType fused_activation_function;
   FullyConnectedOptionsT()
-      : fused_activation_function(ActivationFunctionType_NONE) {
-  }
+      : fused_activation_function(ActivationFunctionType_NONE) {}
 };
 
-struct FullyConnectedOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+struct FullyConnectedOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
   typedef FullyConnectedOptionsT NativeTableType;
-  enum {
-    VT_FUSED_ACTIVATION_FUNCTION = 4
-  };
+  enum { VT_FUSED_ACTIVATION_FUNCTION = 4 };
   ActivationFunctionType fused_activation_function() const {
-    return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            verifier.EndTable();
   }
-  FullyConnectedOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(FullyConnectedOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<FullyConnectedOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const FullyConnectedOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  FullyConnectedOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      FullyConnectedOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<FullyConnectedOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const FullyConnectedOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct FullyConnectedOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
-    fbb_.AddElement<int8_t>(FullyConnectedOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(FullyConnectedOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
   }
   explicit FullyConnectedOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   FullyConnectedOptionsBuilder &operator=(const FullyConnectedOptionsBuilder &);
@@ -1631,38 +1796,39 @@ struct FullyConnectedOptionsBuilder {
 
 inline flatbuffers::Offset<FullyConnectedOptions> CreateFullyConnectedOptions(
     flatbuffers::FlatBufferBuilder &_fbb,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) {
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
   FullyConnectedOptionsBuilder builder_(_fbb);
   builder_.add_fused_activation_function(fused_activation_function);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<FullyConnectedOptions> CreateFullyConnectedOptions(flatbuffers::FlatBufferBuilder &_fbb, const FullyConnectedOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<FullyConnectedOptions> CreateFullyConnectedOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const FullyConnectedOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct SoftmaxOptionsT : public flatbuffers::NativeTable {
   typedef SoftmaxOptions TableType;
   float beta;
-  SoftmaxOptionsT()
-      : beta(0.0f) {
-  }
+  SoftmaxOptionsT() : beta(0.0f) {}
 };
 
 struct SoftmaxOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef SoftmaxOptionsT NativeTableType;
-  enum {
-    VT_BETA = 4
-  };
-  float beta() const {
-    return GetField<float>(VT_BETA, 0.0f);
-  }
+  enum { VT_BETA = 4 };
+  float beta() const { return GetField<float>(VT_BETA, 0.0f); }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
-           VerifyField<float>(verifier, VT_BETA) &&
-           verifier.EndTable();
-  }
-  SoftmaxOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(SoftmaxOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<SoftmaxOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const SoftmaxOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+           VerifyField<float>(verifier, VT_BETA) && verifier.EndTable();
+  }
+  SoftmaxOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      SoftmaxOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SoftmaxOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const SoftmaxOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct SoftmaxOptionsBuilder {
@@ -1672,7 +1838,7 @@ struct SoftmaxOptionsBuilder {
     fbb_.AddElement<float>(SoftmaxOptions::VT_BETA, beta, 0.0f);
   }
   explicit SoftmaxOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   SoftmaxOptionsBuilder &operator=(const SoftmaxOptionsBuilder &);
@@ -1684,36 +1850,32 @@ struct SoftmaxOptionsBuilder {
 };
 
 inline flatbuffers::Offset<SoftmaxOptions> CreateSoftmaxOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    float beta = 0.0f) {
+    flatbuffers::FlatBufferBuilder &_fbb, float beta = 0.0f) {
   SoftmaxOptionsBuilder builder_(_fbb);
   builder_.add_beta(beta);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<SoftmaxOptions> CreateSoftmaxOptions(flatbuffers::FlatBufferBuilder &_fbb, const SoftmaxOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<SoftmaxOptions> CreateSoftmaxOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SoftmaxOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct ConcatenationOptionsT : public flatbuffers::NativeTable {
   typedef ConcatenationOptions TableType;
   int32_t axis;
   ActivationFunctionType fused_activation_function;
   ConcatenationOptionsT()
-      : axis(0),
-        fused_activation_function(ActivationFunctionType_NONE) {
-  }
+      : axis(0), fused_activation_function(ActivationFunctionType_NONE) {}
 };
 
-struct ConcatenationOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+struct ConcatenationOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
   typedef ConcatenationOptionsT NativeTableType;
-  enum {
-    VT_AXIS = 4,
-    VT_FUSED_ACTIVATION_FUNCTION = 6
-  };
-  int32_t axis() const {
-    return GetField<int32_t>(VT_AXIS, 0);
-  }
+  enum { VT_AXIS = 4, VT_FUSED_ACTIVATION_FUNCTION = 6 };
+  int32_t axis() const { return GetField<int32_t>(VT_AXIS, 0); }
   ActivationFunctionType fused_activation_function() const {
-    return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
@@ -1721,9 +1883,14 @@ struct ConcatenationOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            verifier.EndTable();
   }
-  ConcatenationOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(ConcatenationOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<ConcatenationOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const ConcatenationOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  ConcatenationOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      ConcatenationOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<ConcatenationOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const ConcatenationOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct ConcatenationOptionsBuilder {
@@ -1732,11 +1899,13 @@ struct ConcatenationOptionsBuilder {
   void add_axis(int32_t axis) {
     fbb_.AddElement<int32_t>(ConcatenationOptions::VT_AXIS, axis, 0);
   }
-  void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
-    fbb_.AddElement<int8_t>(ConcatenationOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(ConcatenationOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
   }
   explicit ConcatenationOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   ConcatenationOptionsBuilder &operator=(const ConcatenationOptionsBuilder &);
@@ -1748,51 +1917,57 @@ struct ConcatenationOptionsBuilder {
 };
 
 inline flatbuffers::Offset<ConcatenationOptions> CreateConcatenationOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    int32_t axis = 0,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) {
+    flatbuffers::FlatBufferBuilder &_fbb, int32_t axis = 0,
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
   ConcatenationOptionsBuilder builder_(_fbb);
   builder_.add_axis(axis);
   builder_.add_fused_activation_function(fused_activation_function);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<ConcatenationOptions> CreateConcatenationOptions(flatbuffers::FlatBufferBuilder &_fbb, const ConcatenationOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<ConcatenationOptions> CreateConcatenationOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ConcatenationOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct AddOptionsT : public flatbuffers::NativeTable {
   typedef AddOptions TableType;
   ActivationFunctionType fused_activation_function;
-  AddOptionsT()
-      : fused_activation_function(ActivationFunctionType_NONE) {
-  }
+  AddOptionsT() : fused_activation_function(ActivationFunctionType_NONE) {}
 };
 
 struct AddOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef AddOptionsT NativeTableType;
-  enum {
-    VT_FUSED_ACTIVATION_FUNCTION = 4
-  };
+  enum { VT_FUSED_ACTIVATION_FUNCTION = 4 };
   ActivationFunctionType fused_activation_function() const {
-    return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            verifier.EndTable();
   }
-  AddOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(AddOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<AddOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const AddOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  AddOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      AddOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<AddOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const AddOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct AddOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
-    fbb_.AddElement<int8_t>(AddOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(AddOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
   }
   explicit AddOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   AddOptionsBuilder &operator=(const AddOptionsBuilder &);
@@ -1805,48 +1980,55 @@ struct AddOptionsBuilder {
 
 inline flatbuffers::Offset<AddOptions> CreateAddOptions(
     flatbuffers::FlatBufferBuilder &_fbb,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) {
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
   AddOptionsBuilder builder_(_fbb);
   builder_.add_fused_activation_function(fused_activation_function);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<AddOptions> CreateAddOptions(flatbuffers::FlatBufferBuilder &_fbb, const AddOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<AddOptions> CreateAddOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const AddOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct MulOptionsT : public flatbuffers::NativeTable {
   typedef MulOptions TableType;
   ActivationFunctionType fused_activation_function;
-  MulOptionsT()
-      : fused_activation_function(ActivationFunctionType_NONE) {
-  }
+  MulOptionsT() : fused_activation_function(ActivationFunctionType_NONE) {}
 };
 
 struct MulOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef MulOptionsT NativeTableType;
-  enum {
-    VT_FUSED_ACTIVATION_FUNCTION = 4
-  };
+  enum { VT_FUSED_ACTIVATION_FUNCTION = 4 };
   ActivationFunctionType fused_activation_function() const {
-    return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            verifier.EndTable();
   }
-  MulOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(MulOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<MulOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const MulOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  MulOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      MulOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<MulOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const MulOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct MulOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
-    fbb_.AddElement<int8_t>(MulOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(MulOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
   }
   explicit MulOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   MulOptionsBuilder &operator=(const MulOptionsBuilder &);
@@ -1859,48 +2041,55 @@ struct MulOptionsBuilder {
 
 inline flatbuffers::Offset<MulOptions> CreateMulOptions(
     flatbuffers::FlatBufferBuilder &_fbb,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) {
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
   MulOptionsBuilder builder_(_fbb);
   builder_.add_fused_activation_function(fused_activation_function);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<MulOptions> CreateMulOptions(flatbuffers::FlatBufferBuilder &_fbb, const MulOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<MulOptions> CreateMulOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const MulOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct L2NormOptionsT : public flatbuffers::NativeTable {
   typedef L2NormOptions TableType;
   ActivationFunctionType fused_activation_function;
-  L2NormOptionsT()
-      : fused_activation_function(ActivationFunctionType_NONE) {
-  }
+  L2NormOptionsT() : fused_activation_function(ActivationFunctionType_NONE) {}
 };
 
 struct L2NormOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef L2NormOptionsT NativeTableType;
-  enum {
-    VT_FUSED_ACTIVATION_FUNCTION = 4
-  };
+  enum { VT_FUSED_ACTIVATION_FUNCTION = 4 };
   ActivationFunctionType fused_activation_function() const {
-    return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            verifier.EndTable();
   }
-  L2NormOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(L2NormOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<L2NormOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const L2NormOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  L2NormOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      L2NormOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<L2NormOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const L2NormOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct L2NormOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
-    fbb_.AddElement<int8_t>(L2NormOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(L2NormOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
   }
   explicit L2NormOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   L2NormOptionsBuilder &operator=(const L2NormOptionsBuilder &);
@@ -1913,13 +2102,16 @@ struct L2NormOptionsBuilder {
 
 inline flatbuffers::Offset<L2NormOptions> CreateL2NormOptions(
     flatbuffers::FlatBufferBuilder &_fbb,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) {
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
   L2NormOptionsBuilder builder_(_fbb);
   builder_.add_fused_activation_function(fused_activation_function);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<L2NormOptions> CreateL2NormOptions(flatbuffers::FlatBufferBuilder &_fbb, const L2NormOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<L2NormOptions> CreateL2NormOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const L2NormOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct LocalResponseNormalizationOptionsT : public flatbuffers::NativeTable {
   typedef LocalResponseNormalizationOptions TableType;
@@ -1928,66 +2120,61 @@ struct LocalResponseNormalizationOptionsT : public flatbuffers::NativeTable {
   float alpha;
   float beta;
   LocalResponseNormalizationOptionsT()
-      : radius(0),
-        bias(0.0f),
-        alpha(0.0f),
-        beta(0.0f) {
-  }
+      : radius(0), bias(0.0f), alpha(0.0f), beta(0.0f) {}
 };
 
-struct LocalResponseNormalizationOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+struct LocalResponseNormalizationOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
   typedef LocalResponseNormalizationOptionsT NativeTableType;
-  enum {
-    VT_RADIUS = 4,
-    VT_BIAS = 6,
-    VT_ALPHA = 8,
-    VT_BETA = 10
-  };
-  int32_t radius() const {
-    return GetField<int32_t>(VT_RADIUS, 0);
-  }
-  float bias() const {
-    return GetField<float>(VT_BIAS, 0.0f);
-  }
-  float alpha() const {
-    return GetField<float>(VT_ALPHA, 0.0f);
-  }
-  float beta() const {
-    return GetField<float>(VT_BETA, 0.0f);
-  }
+  enum { VT_RADIUS = 4, VT_BIAS = 6, VT_ALPHA = 8, VT_BETA = 10 };
+  int32_t radius() const { return GetField<int32_t>(VT_RADIUS, 0); }
+  float bias() const { return GetField<float>(VT_BIAS, 0.0f); }
+  float alpha() const { return GetField<float>(VT_ALPHA, 0.0f); }
+  float beta() const { return GetField<float>(VT_BETA, 0.0f); }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<int32_t>(verifier, VT_RADIUS) &&
            VerifyField<float>(verifier, VT_BIAS) &&
            VerifyField<float>(verifier, VT_ALPHA) &&
-           VerifyField<float>(verifier, VT_BETA) &&
-           verifier.EndTable();
-  }
-  LocalResponseNormalizationOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(LocalResponseNormalizationOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<LocalResponseNormalizationOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const LocalResponseNormalizationOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+           VerifyField<float>(verifier, VT_BETA) && verifier.EndTable();
+  }
+  LocalResponseNormalizationOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      LocalResponseNormalizationOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<LocalResponseNormalizationOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb,
+      const LocalResponseNormalizationOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct LocalResponseNormalizationOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
   void add_radius(int32_t radius) {
-    fbb_.AddElement<int32_t>(LocalResponseNormalizationOptions::VT_RADIUS, radius, 0);
+    fbb_.AddElement<int32_t>(LocalResponseNormalizationOptions::VT_RADIUS,
+                             radius, 0);
   }
   void add_bias(float bias) {
-    fbb_.AddElement<float>(LocalResponseNormalizationOptions::VT_BIAS, bias, 0.0f);
+    fbb_.AddElement<float>(LocalResponseNormalizationOptions::VT_BIAS, bias,
+                           0.0f);
   }
   void add_alpha(float alpha) {
-    fbb_.AddElement<float>(LocalResponseNormalizationOptions::VT_ALPHA, alpha, 0.0f);
+    fbb_.AddElement<float>(LocalResponseNormalizationOptions::VT_ALPHA, alpha,
+                           0.0f);
   }
   void add_beta(float beta) {
-    fbb_.AddElement<float>(LocalResponseNormalizationOptions::VT_BETA, beta, 0.0f);
+    fbb_.AddElement<float>(LocalResponseNormalizationOptions::VT_BETA, beta,
+                           0.0f);
   }
-  explicit LocalResponseNormalizationOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+  explicit LocalResponseNormalizationOptionsBuilder(
+      flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
-  LocalResponseNormalizationOptionsBuilder &operator=(const LocalResponseNormalizationOptionsBuilder &);
+  LocalResponseNormalizationOptionsBuilder &operator=(
+      const LocalResponseNormalizationOptionsBuilder &);
   flatbuffers::Offset<LocalResponseNormalizationOptions> Finish() {
     const auto end = fbb_.EndTable(start_);
     auto o = flatbuffers::Offset<LocalResponseNormalizationOptions>(end);
@@ -1995,12 +2182,10 @@ struct LocalResponseNormalizationOptionsBuilder {
   }
 };
 
-inline flatbuffers::Offset<LocalResponseNormalizationOptions> CreateLocalResponseNormalizationOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    int32_t radius = 0,
-    float bias = 0.0f,
-    float alpha = 0.0f,
-    float beta = 0.0f) {
+inline flatbuffers::Offset<LocalResponseNormalizationOptions>
+CreateLocalResponseNormalizationOptions(flatbuffers::FlatBufferBuilder &_fbb,
+                                        int32_t radius = 0, float bias = 0.0f,
+                                        float alpha = 0.0f, float beta = 0.0f) {
   LocalResponseNormalizationOptionsBuilder builder_(_fbb);
   builder_.add_beta(beta);
   builder_.add_alpha(alpha);
@@ -2009,7 +2194,11 @@ inline flatbuffers::Offset<LocalResponseNormalizationOptions> CreateLocalRespons
   return builder_.Finish();
 }
 
-flatbuffers::Offset<LocalResponseNormalizationOptions> CreateLocalResponseNormalizationOptions(flatbuffers::FlatBufferBuilder &_fbb, const LocalResponseNormalizationOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<LocalResponseNormalizationOptions>
+CreateLocalResponseNormalizationOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const LocalResponseNormalizationOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct LSTMOptionsT : public flatbuffers::NativeTable {
   typedef LSTMOptions TableType;
@@ -2019,43 +2208,41 @@ struct LSTMOptionsT : public flatbuffers::NativeTable {
   LSTMOptionsT()
       : fused_activation_function(ActivationFunctionType_NONE),
         cell_clip(0.0f),
-        proj_clip(0.0f) {
-  }
+        proj_clip(0.0f) {}
 };
 
 struct LSTMOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef LSTMOptionsT NativeTableType;
-  enum {
-    VT_FUSED_ACTIVATION_FUNCTION = 4,
-    VT_CELL_CLIP = 6,
-    VT_PROJ_CLIP = 8
-  };
+  enum { VT_FUSED_ACTIVATION_FUNCTION = 4, VT_CELL_CLIP = 6, VT_PROJ_CLIP = 8 };
   ActivationFunctionType fused_activation_function() const {
-    return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
-  }
-  float cell_clip() const {
-    return GetField<float>(VT_CELL_CLIP, 0.0f);
-  }
-  float proj_clip() const {
-    return GetField<float>(VT_PROJ_CLIP, 0.0f);
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
+  float cell_clip() const { return GetField<float>(VT_CELL_CLIP, 0.0f); }
+  float proj_clip() const { return GetField<float>(VT_PROJ_CLIP, 0.0f); }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            VerifyField<float>(verifier, VT_CELL_CLIP) &&
-           VerifyField<float>(verifier, VT_PROJ_CLIP) &&
-           verifier.EndTable();
-  }
-  LSTMOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(LSTMOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<LSTMOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+           VerifyField<float>(verifier, VT_PROJ_CLIP) && verifier.EndTable();
+  }
+  LSTMOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      LSTMOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<LSTMOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct LSTMOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
-    fbb_.AddElement<int8_t>(LSTMOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(LSTMOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
   }
   void add_cell_clip(float cell_clip) {
     fbb_.AddElement<float>(LSTMOptions::VT_CELL_CLIP, cell_clip, 0.0f);
@@ -2064,7 +2251,7 @@ struct LSTMOptionsBuilder {
     fbb_.AddElement<float>(LSTMOptions::VT_PROJ_CLIP, proj_clip, 0.0f);
   }
   explicit LSTMOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   LSTMOptionsBuilder &operator=(const LSTMOptionsBuilder &);
@@ -2077,9 +2264,9 @@ struct LSTMOptionsBuilder {
 
 inline flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(
     flatbuffers::FlatBufferBuilder &_fbb,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE,
-    float cell_clip = 0.0f,
-    float proj_clip = 0.0f) {
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE,
+    float cell_clip = 0.0f, float proj_clip = 0.0f) {
   LSTMOptionsBuilder builder_(_fbb);
   builder_.add_proj_clip(proj_clip);
   builder_.add_cell_clip(cell_clip);
@@ -2087,52 +2274,50 @@ inline flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(
   return builder_.Finish();
 }
 
-flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct ResizeBilinearOptionsT : public flatbuffers::NativeTable {
   typedef ResizeBilinearOptions TableType;
   int32_t new_height;
   int32_t new_width;
-  ResizeBilinearOptionsT()
-      : new_height(0),
-        new_width(0) {
-  }
+  ResizeBilinearOptionsT() : new_height(0), new_width(0) {}
 };
 
-struct ResizeBilinearOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+struct ResizeBilinearOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
   typedef ResizeBilinearOptionsT NativeTableType;
-  enum {
-    VT_NEW_HEIGHT = 4,
-    VT_NEW_WIDTH = 6
-  };
-  int32_t new_height() const {
-    return GetField<int32_t>(VT_NEW_HEIGHT, 0);
-  }
-  int32_t new_width() const {
-    return GetField<int32_t>(VT_NEW_WIDTH, 0);
-  }
+  enum { VT_NEW_HEIGHT = 4, VT_NEW_WIDTH = 6 };
+  int32_t new_height() const { return GetField<int32_t>(VT_NEW_HEIGHT, 0); }
+  int32_t new_width() const { return GetField<int32_t>(VT_NEW_WIDTH, 0); }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<int32_t>(verifier, VT_NEW_HEIGHT) &&
-           VerifyField<int32_t>(verifier, VT_NEW_WIDTH) &&
-           verifier.EndTable();
-  }
-  ResizeBilinearOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(ResizeBilinearOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<ResizeBilinearOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+           VerifyField<int32_t>(verifier, VT_NEW_WIDTH) && verifier.EndTable();
+  }
+  ResizeBilinearOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      ResizeBilinearOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<ResizeBilinearOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct ResizeBilinearOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
   void add_new_height(int32_t new_height) {
-    fbb_.AddElement<int32_t>(ResizeBilinearOptions::VT_NEW_HEIGHT, new_height, 0);
+    fbb_.AddElement<int32_t>(ResizeBilinearOptions::VT_NEW_HEIGHT, new_height,
+                             0);
   }
   void add_new_width(int32_t new_width) {
     fbb_.AddElement<int32_t>(ResizeBilinearOptions::VT_NEW_WIDTH, new_width, 0);
   }
   explicit ResizeBilinearOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   ResizeBilinearOptionsBuilder &operator=(const ResizeBilinearOptionsBuilder &);
@@ -2144,8 +2329,7 @@ struct ResizeBilinearOptionsBuilder {
 };
 
 inline flatbuffers::Offset<ResizeBilinearOptions> CreateResizeBilinearOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    int32_t new_height = 0,
+    flatbuffers::FlatBufferBuilder &_fbb, int32_t new_height = 0,
     int32_t new_width = 0) {
   ResizeBilinearOptionsBuilder builder_(_fbb);
   builder_.add_new_width(new_width);
@@ -2153,32 +2337,32 @@ inline flatbuffers::Offset<ResizeBilinearOptions> CreateResizeBilinearOptions(
   return builder_.Finish();
 }
 
-flatbuffers::Offset<ResizeBilinearOptions> CreateResizeBilinearOptions(flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<ResizeBilinearOptions> CreateResizeBilinearOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct CallOptionsT : public flatbuffers::NativeTable {
   typedef CallOptions TableType;
   uint32_t subgraph;
-  CallOptionsT()
-      : subgraph(0) {
-  }
+  CallOptionsT() : subgraph(0) {}
 };
 
 struct CallOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef CallOptionsT NativeTableType;
-  enum {
-    VT_SUBGRAPH = 4
-  };
-  uint32_t subgraph() const {
-    return GetField<uint32_t>(VT_SUBGRAPH, 0);
-  }
+  enum { VT_SUBGRAPH = 4 };
+  uint32_t subgraph() const { return GetField<uint32_t>(VT_SUBGRAPH, 0); }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
-           VerifyField<uint32_t>(verifier, VT_SUBGRAPH) &&
-           verifier.EndTable();
-  }
-  CallOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(CallOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<CallOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const CallOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+           VerifyField<uint32_t>(verifier, VT_SUBGRAPH) && verifier.EndTable();
+  }
+  CallOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      CallOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<CallOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const CallOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct CallOptionsBuilder {
@@ -2188,7 +2372,7 @@ struct CallOptionsBuilder {
     fbb_.AddElement<uint32_t>(CallOptions::VT_SUBGRAPH, subgraph, 0);
   }
   explicit CallOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   CallOptionsBuilder &operator=(const CallOptionsBuilder &);
@@ -2200,49 +2384,130 @@ struct CallOptionsBuilder {
 };
 
 inline flatbuffers::Offset<CallOptions> CreateCallOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    uint32_t subgraph = 0) {
+    flatbuffers::FlatBufferBuilder &_fbb, uint32_t subgraph = 0) {
   CallOptionsBuilder builder_(_fbb);
   builder_.add_subgraph(subgraph);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<CallOptions> CreateCallOptions(flatbuffers::FlatBufferBuilder &_fbb, const CallOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<CallOptions> CreateCallOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const CallOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct PadOptionsT : public flatbuffers::NativeTable {
+  typedef PadOptions TableType;
+  std::vector<int32_t> before_padding;
+  std::vector<int32_t> after_padding;
+  PadOptionsT() {}
+};
+
+struct PadOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef PadOptionsT NativeTableType;
+  enum { VT_BEFORE_PADDING = 4, VT_AFTER_PADDING = 6 };
+  const flatbuffers::Vector<int32_t> *before_padding() const {
+    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_BEFORE_PADDING);
+  }
+  const flatbuffers::Vector<int32_t> *after_padding() const {
+    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_AFTER_PADDING);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyOffset(verifier, VT_BEFORE_PADDING) &&
+           verifier.Verify(before_padding()) &&
+           VerifyOffset(verifier, VT_AFTER_PADDING) &&
+           verifier.Verify(after_padding()) && verifier.EndTable();
+  }
+  PadOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      PadOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<PadOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const PadOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct PadOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_before_padding(
+      flatbuffers::Offset<flatbuffers::Vector<int32_t>> before_padding) {
+    fbb_.AddOffset(PadOptions::VT_BEFORE_PADDING, before_padding);
+  }
+  void add_after_padding(
+      flatbuffers::Offset<flatbuffers::Vector<int32_t>> after_padding) {
+    fbb_.AddOffset(PadOptions::VT_AFTER_PADDING, after_padding);
+  }
+  explicit PadOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  PadOptionsBuilder &operator=(const PadOptionsBuilder &);
+  flatbuffers::Offset<PadOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<PadOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<PadOptions> CreatePadOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    flatbuffers::Offset<flatbuffers::Vector<int32_t>> before_padding = 0,
+    flatbuffers::Offset<flatbuffers::Vector<int32_t>> after_padding = 0) {
+  PadOptionsBuilder builder_(_fbb);
+  builder_.add_after_padding(after_padding);
+  builder_.add_before_padding(before_padding);
+  return builder_.Finish();
+}
+
+inline flatbuffers::Offset<PadOptions> CreatePadOptionsDirect(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const std::vector<int32_t> *before_padding = nullptr,
+    const std::vector<int32_t> *after_padding = nullptr) {
+  return tflite::CreatePadOptions(
+      _fbb, before_padding ? _fbb.CreateVector<int32_t>(*before_padding) : 0,
+      after_padding ? _fbb.CreateVector<int32_t>(*after_padding) : 0);
+}
+
+flatbuffers::Offset<PadOptions> CreatePadOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const PadOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct ReshapeOptionsT : public flatbuffers::NativeTable {
   typedef ReshapeOptions TableType;
   std::vector<int32_t> new_shape;
-  ReshapeOptionsT() {
-  }
+  ReshapeOptionsT() {}
 };
 
 struct ReshapeOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef ReshapeOptionsT NativeTableType;
-  enum {
-    VT_NEW_SHAPE = 4
-  };
+  enum { VT_NEW_SHAPE = 4 };
   const flatbuffers::Vector<int32_t> *new_shape() const {
     return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_NEW_SHAPE);
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyOffset(verifier, VT_NEW_SHAPE) &&
-           verifier.Verify(new_shape()) &&
-           verifier.EndTable();
-  }
-  ReshapeOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(ReshapeOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<ReshapeOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const ReshapeOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+    return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_NEW_SHAPE) &&
+           verifier.Verify(new_shape()) && verifier.EndTable();
+  }
+  ReshapeOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      ReshapeOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<ReshapeOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const ReshapeOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct ReshapeOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_new_shape(flatbuffers::Offset<flatbuffers::Vector<int32_t>> new_shape) {
+  void add_new_shape(
+      flatbuffers::Offset<flatbuffers::Vector<int32_t>> new_shape) {
     fbb_.AddOffset(ReshapeOptions::VT_NEW_SHAPE, new_shape);
   }
   explicit ReshapeOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   ReshapeOptionsBuilder &operator=(const ReshapeOptionsBuilder &);
@@ -2265,11 +2530,107 @@ inline flatbuffers::Offset<ReshapeOptions> CreateReshapeOptionsDirect(
     flatbuffers::FlatBufferBuilder &_fbb,
     const std::vector<int32_t> *new_shape = nullptr) {
   return tflite::CreateReshapeOptions(
-      _fbb,
-      new_shape ? _fbb.CreateVector<int32_t>(*new_shape) : 0);
+      _fbb, new_shape ? _fbb.CreateVector<int32_t>(*new_shape) : 0);
+}
+
+flatbuffers::Offset<ReshapeOptions> CreateReshapeOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ReshapeOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct BatchToSpaceNDOptionsT : public flatbuffers::NativeTable {
+  typedef BatchToSpaceNDOptions TableType;
+  std::vector<int32_t> block_shape;
+  std::vector<int32_t> before_crops;
+  std::vector<int32_t> after_crops;
+  BatchToSpaceNDOptionsT() {}
+};
+
+struct BatchToSpaceNDOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
+  typedef BatchToSpaceNDOptionsT NativeTableType;
+  enum { VT_BLOCK_SHAPE = 4, VT_BEFORE_CROPS = 6, VT_AFTER_CROPS = 8 };
+  const flatbuffers::Vector<int32_t> *block_shape() const {
+    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_BLOCK_SHAPE);
+  }
+  const flatbuffers::Vector<int32_t> *before_crops() const {
+    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_BEFORE_CROPS);
+  }
+  const flatbuffers::Vector<int32_t> *after_crops() const {
+    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_AFTER_CROPS);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyOffset(verifier, VT_BLOCK_SHAPE) &&
+           verifier.Verify(block_shape()) &&
+           VerifyOffset(verifier, VT_BEFORE_CROPS) &&
+           verifier.Verify(before_crops()) &&
+           VerifyOffset(verifier, VT_AFTER_CROPS) &&
+           verifier.Verify(after_crops()) && verifier.EndTable();
+  }
+  BatchToSpaceNDOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      BatchToSpaceNDOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<BatchToSpaceNDOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const BatchToSpaceNDOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct BatchToSpaceNDOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_block_shape(
+      flatbuffers::Offset<flatbuffers::Vector<int32_t>> block_shape) {
+    fbb_.AddOffset(BatchToSpaceNDOptions::VT_BLOCK_SHAPE, block_shape);
+  }
+  void add_before_crops(
+      flatbuffers::Offset<flatbuffers::Vector<int32_t>> before_crops) {
+    fbb_.AddOffset(BatchToSpaceNDOptions::VT_BEFORE_CROPS, before_crops);
+  }
+  void add_after_crops(
+      flatbuffers::Offset<flatbuffers::Vector<int32_t>> after_crops) {
+    fbb_.AddOffset(BatchToSpaceNDOptions::VT_AFTER_CROPS, after_crops);
+  }
+  explicit BatchToSpaceNDOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  BatchToSpaceNDOptionsBuilder &operator=(const BatchToSpaceNDOptionsBuilder &);
+  flatbuffers::Offset<BatchToSpaceNDOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<BatchToSpaceNDOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<BatchToSpaceNDOptions> CreateBatchToSpaceNDOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    flatbuffers::Offset<flatbuffers::Vector<int32_t>> block_shape = 0,
+    flatbuffers::Offset<flatbuffers::Vector<int32_t>> before_crops = 0,
+    flatbuffers::Offset<flatbuffers::Vector<int32_t>> after_crops = 0) {
+  BatchToSpaceNDOptionsBuilder builder_(_fbb);
+  builder_.add_after_crops(after_crops);
+  builder_.add_before_crops(before_crops);
+  builder_.add_block_shape(block_shape);
+  return builder_.Finish();
 }
 
-flatbuffers::Offset<ReshapeOptions> CreateReshapeOptions(flatbuffers::FlatBufferBuilder &_fbb, const ReshapeOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+inline flatbuffers::Offset<BatchToSpaceNDOptions>
+CreateBatchToSpaceNDOptionsDirect(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const std::vector<int32_t> *block_shape = nullptr,
+    const std::vector<int32_t> *before_crops = nullptr,
+    const std::vector<int32_t> *after_crops = nullptr) {
+  return tflite::CreateBatchToSpaceNDOptions(
+      _fbb, block_shape ? _fbb.CreateVector<int32_t>(*block_shape) : 0,
+      before_crops ? _fbb.CreateVector<int32_t>(*before_crops) : 0,
+      after_crops ? _fbb.CreateVector<int32_t>(*after_crops) : 0);
+}
+
+flatbuffers::Offset<BatchToSpaceNDOptions> CreateBatchToSpaceNDOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const BatchToSpaceNDOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct SkipGramOptionsT : public flatbuffers::NativeTable {
   typedef SkipGramOptions TableType;
@@ -2277,22 +2638,13 @@ struct SkipGramOptionsT : public flatbuffers::NativeTable {
   int32_t max_skip_size;
   bool include_all_ngrams;
   SkipGramOptionsT()
-      : ngram_size(0),
-        max_skip_size(0),
-        include_all_ngrams(false) {
-  }
+      : ngram_size(0), max_skip_size(0), include_all_ngrams(false) {}
 };
 
 struct SkipGramOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef SkipGramOptionsT NativeTableType;
-  enum {
-    VT_NGRAM_SIZE = 4,
-    VT_MAX_SKIP_SIZE = 6,
-    VT_INCLUDE_ALL_NGRAMS = 8
-  };
-  int32_t ngram_size() const {
-    return GetField<int32_t>(VT_NGRAM_SIZE, 0);
-  }
+  enum { VT_NGRAM_SIZE = 4, VT_MAX_SKIP_SIZE = 6, VT_INCLUDE_ALL_NGRAMS = 8 };
+  int32_t ngram_size() const { return GetField<int32_t>(VT_NGRAM_SIZE, 0); }
   int32_t max_skip_size() const {
     return GetField<int32_t>(VT_MAX_SKIP_SIZE, 0);
   }
@@ -2306,9 +2658,14 @@ struct SkipGramOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
            VerifyField<uint8_t>(verifier, VT_INCLUDE_ALL_NGRAMS) &&
            verifier.EndTable();
   }
-  SkipGramOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(SkipGramOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<SkipGramOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const SkipGramOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  SkipGramOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      SkipGramOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SkipGramOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const SkipGramOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct SkipGramOptionsBuilder {
@@ -2318,13 +2675,15 @@ struct SkipGramOptionsBuilder {
     fbb_.AddElement<int32_t>(SkipGramOptions::VT_NGRAM_SIZE, ngram_size, 0);
   }
   void add_max_skip_size(int32_t max_skip_size) {
-    fbb_.AddElement<int32_t>(SkipGramOptions::VT_MAX_SKIP_SIZE, max_skip_size, 0);
+    fbb_.AddElement<int32_t>(SkipGramOptions::VT_MAX_SKIP_SIZE, max_skip_size,
+                             0);
   }
   void add_include_all_ngrams(bool include_all_ngrams) {
-    fbb_.AddElement<uint8_t>(SkipGramOptions::VT_INCLUDE_ALL_NGRAMS, static_cast<uint8_t>(include_all_ngrams), 0);
+    fbb_.AddElement<uint8_t>(SkipGramOptions::VT_INCLUDE_ALL_NGRAMS,
+                             static_cast<uint8_t>(include_all_ngrams), 0);
   }
   explicit SkipGramOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   SkipGramOptionsBuilder &operator=(const SkipGramOptionsBuilder &);
@@ -2336,10 +2695,8 @@ struct SkipGramOptionsBuilder {
 };
 
 inline flatbuffers::Offset<SkipGramOptions> CreateSkipGramOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    int32_t ngram_size = 0,
-    int32_t max_skip_size = 0,
-    bool include_all_ngrams = false) {
+    flatbuffers::FlatBufferBuilder &_fbb, int32_t ngram_size = 0,
+    int32_t max_skip_size = 0, bool include_all_ngrams = false) {
   SkipGramOptionsBuilder builder_(_fbb);
   builder_.add_max_skip_size(max_skip_size);
   builder_.add_ngram_size(ngram_size);
@@ -2347,32 +2704,33 @@ inline flatbuffers::Offset<SkipGramOptions> CreateSkipGramOptions(
   return builder_.Finish();
 }
 
-flatbuffers::Offset<SkipGramOptions> CreateSkipGramOptions(flatbuffers::FlatBufferBuilder &_fbb, const SkipGramOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<SkipGramOptions> CreateSkipGramOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SkipGramOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct SpaceToDepthOptionsT : public flatbuffers::NativeTable {
   typedef SpaceToDepthOptions TableType;
   int32_t block_size;
-  SpaceToDepthOptionsT()
-      : block_size(0) {
-  }
+  SpaceToDepthOptionsT() : block_size(0) {}
 };
 
-struct SpaceToDepthOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+struct SpaceToDepthOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
   typedef SpaceToDepthOptionsT NativeTableType;
-  enum {
-    VT_BLOCK_SIZE = 4
-  };
-  int32_t block_size() const {
-    return GetField<int32_t>(VT_BLOCK_SIZE, 0);
-  }
+  enum { VT_BLOCK_SIZE = 4 };
+  int32_t block_size() const { return GetField<int32_t>(VT_BLOCK_SIZE, 0); }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
-           VerifyField<int32_t>(verifier, VT_BLOCK_SIZE) &&
-           verifier.EndTable();
-  }
-  SpaceToDepthOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(SpaceToDepthOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<SpaceToDepthOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const SpaceToDepthOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+           VerifyField<int32_t>(verifier, VT_BLOCK_SIZE) && verifier.EndTable();
+  }
+  SpaceToDepthOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      SpaceToDepthOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SpaceToDepthOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const SpaceToDepthOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct SpaceToDepthOptionsBuilder {
@@ -2382,7 +2740,7 @@ struct SpaceToDepthOptionsBuilder {
     fbb_.AddElement<int32_t>(SpaceToDepthOptions::VT_BLOCK_SIZE, block_size, 0);
   }
   explicit SpaceToDepthOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   SpaceToDepthOptionsBuilder &operator=(const SpaceToDepthOptionsBuilder &);
@@ -2394,52 +2752,58 @@ struct SpaceToDepthOptionsBuilder {
 };
 
 inline flatbuffers::Offset<SpaceToDepthOptions> CreateSpaceToDepthOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    int32_t block_size = 0) {
+    flatbuffers::FlatBufferBuilder &_fbb, int32_t block_size = 0) {
   SpaceToDepthOptionsBuilder builder_(_fbb);
   builder_.add_block_size(block_size);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<SpaceToDepthOptions> CreateSpaceToDepthOptions(flatbuffers::FlatBufferBuilder &_fbb, const SpaceToDepthOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<SpaceToDepthOptions> CreateSpaceToDepthOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SpaceToDepthOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct EmbeddingLookupSparseOptionsT : public flatbuffers::NativeTable {
   typedef EmbeddingLookupSparseOptions TableType;
   CombinerType combiner;
-  EmbeddingLookupSparseOptionsT()
-      : combiner(CombinerType_SUM) {
-  }
+  EmbeddingLookupSparseOptionsT() : combiner(CombinerType_SUM) {}
 };
 
-struct EmbeddingLookupSparseOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+struct EmbeddingLookupSparseOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
   typedef EmbeddingLookupSparseOptionsT NativeTableType;
-  enum {
-    VT_COMBINER = 4
-  };
+  enum { VT_COMBINER = 4 };
   CombinerType combiner() const {
     return static_cast<CombinerType>(GetField<int8_t>(VT_COMBINER, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
-           VerifyField<int8_t>(verifier, VT_COMBINER) &&
-           verifier.EndTable();
-  }
-  EmbeddingLookupSparseOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(EmbeddingLookupSparseOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<EmbeddingLookupSparseOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const EmbeddingLookupSparseOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+           VerifyField<int8_t>(verifier, VT_COMBINER) && verifier.EndTable();
+  }
+  EmbeddingLookupSparseOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      EmbeddingLookupSparseOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<EmbeddingLookupSparseOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb,
+      const EmbeddingLookupSparseOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct EmbeddingLookupSparseOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
   void add_combiner(CombinerType combiner) {
-    fbb_.AddElement<int8_t>(EmbeddingLookupSparseOptions::VT_COMBINER, static_cast<int8_t>(combiner), 0);
+    fbb_.AddElement<int8_t>(EmbeddingLookupSparseOptions::VT_COMBINER,
+                            static_cast<int8_t>(combiner), 0);
   }
-  explicit EmbeddingLookupSparseOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+  explicit EmbeddingLookupSparseOptionsBuilder(
+      flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
-  EmbeddingLookupSparseOptionsBuilder &operator=(const EmbeddingLookupSparseOptionsBuilder &);
+  EmbeddingLookupSparseOptionsBuilder &operator=(
+      const EmbeddingLookupSparseOptionsBuilder &);
   flatbuffers::Offset<EmbeddingLookupSparseOptions> Finish() {
     const auto end = fbb_.EndTable(start_);
     auto o = flatbuffers::Offset<EmbeddingLookupSparseOptions>(end);
@@ -2447,31 +2811,83 @@ struct EmbeddingLookupSparseOptionsBuilder {
   }
 };
 
-inline flatbuffers::Offset<EmbeddingLookupSparseOptions> CreateEmbeddingLookupSparseOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    CombinerType combiner = CombinerType_SUM) {
+inline flatbuffers::Offset<EmbeddingLookupSparseOptions>
+CreateEmbeddingLookupSparseOptions(flatbuffers::FlatBufferBuilder &_fbb,
+                                   CombinerType combiner = CombinerType_SUM) {
   EmbeddingLookupSparseOptionsBuilder builder_(_fbb);
   builder_.add_combiner(combiner);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<EmbeddingLookupSparseOptions> CreateEmbeddingLookupSparseOptions(flatbuffers::FlatBufferBuilder &_fbb, const EmbeddingLookupSparseOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<EmbeddingLookupSparseOptions>
+CreateEmbeddingLookupSparseOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const EmbeddingLookupSparseOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct GatherOptionsT : public flatbuffers::NativeTable {
+  typedef GatherOptions TableType;
+  int32_t axis;
+  GatherOptionsT() : axis(0) {}
+};
+
+struct GatherOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef GatherOptionsT NativeTableType;
+  enum { VT_AXIS = 4 };
+  int32_t axis() const { return GetField<int32_t>(VT_AXIS, 0); }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int32_t>(verifier, VT_AXIS) && verifier.EndTable();
+  }
+  GatherOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      GatherOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<GatherOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const GatherOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct GatherOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_axis(int32_t axis) {
+    fbb_.AddElement<int32_t>(GatherOptions::VT_AXIS, axis, 0);
+  }
+  explicit GatherOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  GatherOptionsBuilder &operator=(const GatherOptionsBuilder &);
+  flatbuffers::Offset<GatherOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<GatherOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<GatherOptions> CreateGatherOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, int32_t axis = 0) {
+  GatherOptionsBuilder builder_(_fbb);
+  builder_.add_axis(axis);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<GatherOptions> CreateGatherOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const GatherOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct OperatorCodeT : public flatbuffers::NativeTable {
   typedef OperatorCode TableType;
   BuiltinOperator builtin_code;
   std::string custom_code;
-  OperatorCodeT()
-      : builtin_code(BuiltinOperator_ADD) {
-  }
+  OperatorCodeT() : builtin_code(BuiltinOperator_ADD) {}
 };
 
 struct OperatorCode FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef OperatorCodeT NativeTableType;
-  enum {
-    VT_BUILTIN_CODE = 4,
-    VT_CUSTOM_CODE = 6
-  };
+  enum { VT_BUILTIN_CODE = 4, VT_CUSTOM_CODE = 6 };
   BuiltinOperator builtin_code() const {
     return static_cast<BuiltinOperator>(GetField<int8_t>(VT_BUILTIN_CODE, 0));
   }
@@ -2482,25 +2898,30 @@ struct OperatorCode FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
     return VerifyTableStart(verifier) &&
            VerifyField<int8_t>(verifier, VT_BUILTIN_CODE) &&
            VerifyOffset(verifier, VT_CUSTOM_CODE) &&
-           verifier.Verify(custom_code()) &&
-           verifier.EndTable();
-  }
-  OperatorCodeT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(OperatorCodeT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<OperatorCode> Pack(flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+           verifier.Verify(custom_code()) && verifier.EndTable();
+  }
+  OperatorCodeT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      OperatorCodeT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<OperatorCode> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct OperatorCodeBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
   void add_builtin_code(BuiltinOperator builtin_code) {
-    fbb_.AddElement<int8_t>(OperatorCode::VT_BUILTIN_CODE, static_cast<int8_t>(builtin_code), 0);
+    fbb_.AddElement<int8_t>(OperatorCode::VT_BUILTIN_CODE,
+                            static_cast<int8_t>(builtin_code), 0);
   }
   void add_custom_code(flatbuffers::Offset<flatbuffers::String> custom_code) {
     fbb_.AddOffset(OperatorCode::VT_CUSTOM_CODE, custom_code);
   }
   explicit OperatorCodeBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   OperatorCodeBuilder &operator=(const OperatorCodeBuilder &);
@@ -2526,12 +2947,12 @@ inline flatbuffers::Offset<OperatorCode> CreateOperatorCodeDirect(
     BuiltinOperator builtin_code = BuiltinOperator_ADD,
     const char *custom_code = nullptr) {
   return tflite::CreateOperatorCode(
-      _fbb,
-      builtin_code,
-      custom_code ? _fbb.CreateString(custom_code) : 0);
+      _fbb, builtin_code, custom_code ? _fbb.CreateString(custom_code) : 0);
 }
 
-flatbuffers::Offset<OperatorCode> CreateOperatorCode(flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<OperatorCode> CreateOperatorCode(
+    flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct OperatorT : public flatbuffers::NativeTable {
   typedef Operator TableType;
@@ -2543,8 +2964,7 @@ struct OperatorT : public flatbuffers::NativeTable {
   CustomOptionsFormat custom_options_format;
   OperatorT()
       : opcode_index(0),
-        custom_options_format(CustomOptionsFormat_FLEXBUFFERS) {
-  }
+        custom_options_format(CustomOptionsFormat_FLEXBUFFERS) {}
 };
 
 struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
@@ -2568,189 +2988,316 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
     return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_OUTPUTS);
   }
   BuiltinOptions builtin_options_type() const {
-    return static_cast<BuiltinOptions>(GetField<uint8_t>(VT_BUILTIN_OPTIONS_TYPE, 0));
+    return static_cast<BuiltinOptions>(
+        GetField<uint8_t>(VT_BUILTIN_OPTIONS_TYPE, 0));
   }
   const void *builtin_options() const {
     return GetPointer<const void *>(VT_BUILTIN_OPTIONS);
   }
-  template<typename T> const T *builtin_options_as() const;
+  template <typename T>
+  const T *builtin_options_as() const;
   const Conv2DOptions *builtin_options_as_Conv2DOptions() const {
-    return builtin_options_type() == BuiltinOptions_Conv2DOptions ? static_cast<const Conv2DOptions *>(builtin_options()) : nullptr;
-  }
-  const DepthwiseConv2DOptions *builtin_options_as_DepthwiseConv2DOptions() const {
-    return builtin_options_type() == BuiltinOptions_DepthwiseConv2DOptions ? static_cast<const DepthwiseConv2DOptions *>(builtin_options()) : nullptr;
-  }
-  const ConcatEmbeddingsOptions *builtin_options_as_ConcatEmbeddingsOptions() const {
-    return builtin_options_type() == BuiltinOptions_ConcatEmbeddingsOptions ? static_cast<const ConcatEmbeddingsOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_Conv2DOptions
+               ? static_cast<const Conv2DOptions *>(builtin_options())
+               : nullptr;
+  }
+  const DepthwiseConv2DOptions *builtin_options_as_DepthwiseConv2DOptions()
+      const {
+    return builtin_options_type() == BuiltinOptions_DepthwiseConv2DOptions
+               ? static_cast<const DepthwiseConv2DOptions *>(builtin_options())
+               : nullptr;
+  }
+  const ConcatEmbeddingsOptions *builtin_options_as_ConcatEmbeddingsOptions()
+      const {
+    return builtin_options_type() == BuiltinOptions_ConcatEmbeddingsOptions
+               ? static_cast<const ConcatEmbeddingsOptions *>(builtin_options())
+               : nullptr;
   }
   const LSHProjectionOptions *builtin_options_as_LSHProjectionOptions() const {
-    return builtin_options_type() == BuiltinOptions_LSHProjectionOptions ? static_cast<const LSHProjectionOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_LSHProjectionOptions
+               ? static_cast<const LSHProjectionOptions *>(builtin_options())
+               : nullptr;
   }
   const Pool2DOptions *builtin_options_as_Pool2DOptions() const {
-    return builtin_options_type() == BuiltinOptions_Pool2DOptions ? static_cast<const Pool2DOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_Pool2DOptions
+               ? static_cast<const Pool2DOptions *>(builtin_options())
+               : nullptr;
   }
   const SVDFOptions *builtin_options_as_SVDFOptions() const {
-    return builtin_options_type() == BuiltinOptions_SVDFOptions ? static_cast<const SVDFOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_SVDFOptions
+               ? static_cast<const SVDFOptions *>(builtin_options())
+               : nullptr;
   }
   const RNNOptions *builtin_options_as_RNNOptions() const {
-    return builtin_options_type() == BuiltinOptions_RNNOptions ? static_cast<const RNNOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_RNNOptions
+               ? static_cast<const RNNOptions *>(builtin_options())
+               : nullptr;
   }
-  const FullyConnectedOptions *builtin_options_as_FullyConnectedOptions() const {
-    return builtin_options_type() == BuiltinOptions_FullyConnectedOptions ? static_cast<const FullyConnectedOptions *>(builtin_options()) : nullptr;
+  const FullyConnectedOptions *builtin_options_as_FullyConnectedOptions()
+      const {
+    return builtin_options_type() == BuiltinOptions_FullyConnectedOptions
+               ? static_cast<const FullyConnectedOptions *>(builtin_options())
+               : nullptr;
   }
   const SoftmaxOptions *builtin_options_as_SoftmaxOptions() const {
-    return builtin_options_type() == BuiltinOptions_SoftmaxOptions ? static_cast<const SoftmaxOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_SoftmaxOptions
+               ? static_cast<const SoftmaxOptions *>(builtin_options())
+               : nullptr;
   }
   const ConcatenationOptions *builtin_options_as_ConcatenationOptions() const {
-    return builtin_options_type() == BuiltinOptions_ConcatenationOptions ? static_cast<const ConcatenationOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_ConcatenationOptions
+               ? static_cast<const ConcatenationOptions *>(builtin_options())
+               : nullptr;
   }
   const AddOptions *builtin_options_as_AddOptions() const {
-    return builtin_options_type() == BuiltinOptions_AddOptions ? static_cast<const AddOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_AddOptions
+               ? static_cast<const AddOptions *>(builtin_options())
+               : nullptr;
   }
   const L2NormOptions *builtin_options_as_L2NormOptions() const {
-    return builtin_options_type() == BuiltinOptions_L2NormOptions ? static_cast<const L2NormOptions *>(builtin_options()) : nullptr;
-  }
-  const LocalResponseNormalizationOptions *builtin_options_as_LocalResponseNormalizationOptions() const {
-    return builtin_options_type() == BuiltinOptions_LocalResponseNormalizationOptions ? static_cast<const LocalResponseNormalizationOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_L2NormOptions
+               ? static_cast<const L2NormOptions *>(builtin_options())
+               : nullptr;
+  }
+  const LocalResponseNormalizationOptions *
+  builtin_options_as_LocalResponseNormalizationOptions() const {
+    return builtin_options_type() ==
+                   BuiltinOptions_LocalResponseNormalizationOptions
+               ? static_cast<const LocalResponseNormalizationOptions *>(
+                     builtin_options())
+               : nullptr;
   }
   const LSTMOptions *builtin_options_as_LSTMOptions() const {
-    return builtin_options_type() == BuiltinOptions_LSTMOptions ? static_cast<const LSTMOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_LSTMOptions
+               ? static_cast<const LSTMOptions *>(builtin_options())
+               : nullptr;
   }
-  const ResizeBilinearOptions *builtin_options_as_ResizeBilinearOptions() const {
-    return builtin_options_type() == BuiltinOptions_ResizeBilinearOptions ? static_cast<const ResizeBilinearOptions *>(builtin_options()) : nullptr;
+  const ResizeBilinearOptions *builtin_options_as_ResizeBilinearOptions()
+      const {
+    return builtin_options_type() == BuiltinOptions_ResizeBilinearOptions
+               ? static_cast<const ResizeBilinearOptions *>(builtin_options())
+               : nullptr;
   }
   const CallOptions *builtin_options_as_CallOptions() const {
-    return builtin_options_type() == BuiltinOptions_CallOptions ? static_cast<const CallOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_CallOptions
+               ? static_cast<const CallOptions *>(builtin_options())
+               : nullptr;
   }
   const ReshapeOptions *builtin_options_as_ReshapeOptions() const {
-    return builtin_options_type() == BuiltinOptions_ReshapeOptions ? static_cast<const ReshapeOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_ReshapeOptions
+               ? static_cast<const ReshapeOptions *>(builtin_options())
+               : nullptr;
   }
   const SkipGramOptions *builtin_options_as_SkipGramOptions() const {
-    return builtin_options_type() == BuiltinOptions_SkipGramOptions ? static_cast<const SkipGramOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_SkipGramOptions
+               ? static_cast<const SkipGramOptions *>(builtin_options())
+               : nullptr;
   }
   const SpaceToDepthOptions *builtin_options_as_SpaceToDepthOptions() const {
-    return builtin_options_type() == BuiltinOptions_SpaceToDepthOptions ? static_cast<const SpaceToDepthOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_SpaceToDepthOptions
+               ? static_cast<const SpaceToDepthOptions *>(builtin_options())
+               : nullptr;
   }
-  const EmbeddingLookupSparseOptions *builtin_options_as_EmbeddingLookupSparseOptions() const {
-    return builtin_options_type() == BuiltinOptions_EmbeddingLookupSparseOptions ? static_cast<const EmbeddingLookupSparseOptions *>(builtin_options()) : nullptr;
+  const EmbeddingLookupSparseOptions *
+  builtin_options_as_EmbeddingLookupSparseOptions() const {
+    return builtin_options_type() == BuiltinOptions_EmbeddingLookupSparseOptions
+               ? static_cast<const EmbeddingLookupSparseOptions *>(
+                     builtin_options())
+               : nullptr;
   }
   const MulOptions *builtin_options_as_MulOptions() const {
-    return builtin_options_type() == BuiltinOptions_MulOptions ? static_cast<const MulOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_MulOptions
+               ? static_cast<const MulOptions *>(builtin_options())
+               : nullptr;
+  }
+  const PadOptions *builtin_options_as_PadOptions() const {
+    return builtin_options_type() == BuiltinOptions_PadOptions
+               ? static_cast<const PadOptions *>(builtin_options())
+               : nullptr;
+  }
+  const GatherOptions *builtin_options_as_GatherOptions() const {
+    return builtin_options_type() == BuiltinOptions_GatherOptions
+               ? static_cast<const GatherOptions *>(builtin_options())
+               : nullptr;
+  }
+  const BatchToSpaceNDOptions *builtin_options_as_BatchToSpaceNDOptions()
+      const {
+    return builtin_options_type() == BuiltinOptions_BatchToSpaceNDOptions
+               ? static_cast<const BatchToSpaceNDOptions *>(builtin_options())
+               : nullptr;
   }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
   }
   CustomOptionsFormat custom_options_format() const {
-    return static_cast<CustomOptionsFormat>(GetField<int8_t>(VT_CUSTOM_OPTIONS_FORMAT, 0));
+    return static_cast<CustomOptionsFormat>(
+        GetField<int8_t>(VT_CUSTOM_OPTIONS_FORMAT, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<uint32_t>(verifier, VT_OPCODE_INDEX) &&
-           VerifyOffset(verifier, VT_INPUTS) &&
-           verifier.Verify(inputs()) &&
-           VerifyOffset(verifier, VT_OUTPUTS) &&
-           verifier.Verify(outputs()) &&
+           VerifyOffset(verifier, VT_INPUTS) && verifier.Verify(inputs()) &&
+           VerifyOffset(verifier, VT_OUTPUTS) && verifier.Verify(outputs()) &&
            VerifyField<uint8_t>(verifier, VT_BUILTIN_OPTIONS_TYPE) &&
            VerifyOffset(verifier, VT_BUILTIN_OPTIONS) &&
-           VerifyBuiltinOptions(verifier, builtin_options(), builtin_options_type()) &&
+           VerifyBuiltinOptions(verifier, builtin_options(),
+                                builtin_options_type()) &&
            VerifyOffset(verifier, VT_CUSTOM_OPTIONS) &&
            verifier.Verify(custom_options()) &&
            VerifyField<int8_t>(verifier, VT_CUSTOM_OPTIONS_FORMAT) &&
            verifier.EndTable();
   }
-  OperatorT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(OperatorT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<Operator> Pack(flatbuffers::FlatBufferBuilder &_fbb, const OperatorT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  OperatorT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      OperatorT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<Operator> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const OperatorT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
-template<> inline const Conv2DOptions *Operator::builtin_options_as<Conv2DOptions>() const {
+template <>
+inline const Conv2DOptions *Operator::builtin_options_as<Conv2DOptions>()
+    const {
   return builtin_options_as_Conv2DOptions();
 }
 
-template<> inline const DepthwiseConv2DOptions *Operator::builtin_options_as<DepthwiseConv2DOptions>() const {
+template <>
+inline const DepthwiseConv2DOptions *
+Operator::builtin_options_as<DepthwiseConv2DOptions>() const {
   return builtin_options_as_DepthwiseConv2DOptions();
 }
 
-template<> inline const ConcatEmbeddingsOptions *Operator::builtin_options_as<ConcatEmbeddingsOptions>() const {
+template <>
+inline const ConcatEmbeddingsOptions *
+Operator::builtin_options_as<ConcatEmbeddingsOptions>() const {
   return builtin_options_as_ConcatEmbeddingsOptions();
 }
 
-template<> inline const LSHProjectionOptions *Operator::builtin_options_as<LSHProjectionOptions>() const {
+template <>
+inline const LSHProjectionOptions *
+Operator::builtin_options_as<LSHProjectionOptions>() const {
   return builtin_options_as_LSHProjectionOptions();
 }
 
-template<> inline const Pool2DOptions *Operator::builtin_options_as<Pool2DOptions>() const {
+template <>
+inline const Pool2DOptions *Operator::builtin_options_as<Pool2DOptions>()
+    const {
   return builtin_options_as_Pool2DOptions();
 }
 
-template<> inline const SVDFOptions *Operator::builtin_options_as<SVDFOptions>() const {
+template <>
+inline const SVDFOptions *Operator::builtin_options_as<SVDFOptions>() const {
   return builtin_options_as_SVDFOptions();
 }
 
-template<> inline const RNNOptions *Operator::builtin_options_as<RNNOptions>() const {
+template <>
+inline const RNNOptions *Operator::builtin_options_as<RNNOptions>() const {
   return builtin_options_as_RNNOptions();
 }
 
-template<> inline const FullyConnectedOptions *Operator::builtin_options_as<FullyConnectedOptions>() const {
+template <>
+inline const FullyConnectedOptions *
+Operator::builtin_options_as<FullyConnectedOptions>() const {
   return builtin_options_as_FullyConnectedOptions();
 }
 
-template<> inline const SoftmaxOptions *Operator::builtin_options_as<SoftmaxOptions>() const {
+template <>
+inline const SoftmaxOptions *Operator::builtin_options_as<SoftmaxOptions>()
+    const {
   return builtin_options_as_SoftmaxOptions();
 }
 
-template<> inline const ConcatenationOptions *Operator::builtin_options_as<ConcatenationOptions>() const {
+template <>
+inline const ConcatenationOptions *
+Operator::builtin_options_as<ConcatenationOptions>() const {
   return builtin_options_as_ConcatenationOptions();
 }
 
-template<> inline const AddOptions *Operator::builtin_options_as<AddOptions>() const {
+template <>
+inline const AddOptions *Operator::builtin_options_as<AddOptions>() const {
   return builtin_options_as_AddOptions();
 }
 
-template<> inline const L2NormOptions *Operator::builtin_options_as<L2NormOptions>() const {
+template <>
+inline const L2NormOptions *Operator::builtin_options_as<L2NormOptions>()
+    const {
   return builtin_options_as_L2NormOptions();
 }
 
-template<> inline const LocalResponseNormalizationOptions *Operator::builtin_options_as<LocalResponseNormalizationOptions>() const {
+template <>
+inline const LocalResponseNormalizationOptions *
+Operator::builtin_options_as<LocalResponseNormalizationOptions>() const {
   return builtin_options_as_LocalResponseNormalizationOptions();
 }
 
-template<> inline const LSTMOptions *Operator::builtin_options_as<LSTMOptions>() const {
+template <>
+inline const LSTMOptions *Operator::builtin_options_as<LSTMOptions>() const {
   return builtin_options_as_LSTMOptions();
 }
 
-template<> inline const ResizeBilinearOptions *Operator::builtin_options_as<ResizeBilinearOptions>() const {
+template <>
+inline const ResizeBilinearOptions *
+Operator::builtin_options_as<ResizeBilinearOptions>() const {
   return builtin_options_as_ResizeBilinearOptions();
 }
 
-template<> inline const CallOptions *Operator::builtin_options_as<CallOptions>() const {
+template <>
+inline const CallOptions *Operator::builtin_options_as<CallOptions>() const {
   return builtin_options_as_CallOptions();
 }
 
-template<> inline const ReshapeOptions *Operator::builtin_options_as<ReshapeOptions>() const {
+template <>
+inline const ReshapeOptions *Operator::builtin_options_as<ReshapeOptions>()
+    const {
   return builtin_options_as_ReshapeOptions();
 }
 
-template<> inline const SkipGramOptions *Operator::builtin_options_as<SkipGramOptions>() const {
+template <>
+inline const SkipGramOptions *Operator::builtin_options_as<SkipGramOptions>()
+    const {
   return builtin_options_as_SkipGramOptions();
 }
 
-template<> inline const SpaceToDepthOptions *Operator::builtin_options_as<SpaceToDepthOptions>() const {
+template <>
+inline const SpaceToDepthOptions *
+Operator::builtin_options_as<SpaceToDepthOptions>() const {
   return builtin_options_as_SpaceToDepthOptions();
 }
 
-template<> inline const EmbeddingLookupSparseOptions *Operator::builtin_options_as<EmbeddingLookupSparseOptions>() const {
+template <>
+inline const EmbeddingLookupSparseOptions *
+Operator::builtin_options_as<EmbeddingLookupSparseOptions>() const {
   return builtin_options_as_EmbeddingLookupSparseOptions();
 }
 
-template<> inline const MulOptions *Operator::builtin_options_as<MulOptions>() const {
+template <>
+inline const MulOptions *Operator::builtin_options_as<MulOptions>() const {
   return builtin_options_as_MulOptions();
 }
 
-struct OperatorBuilder {
-  flatbuffers::FlatBufferBuilder &fbb_;
-  flatbuffers::uoffset_t start_;
-  void add_opcode_index(uint32_t opcode_index) {
+template <>
+inline const PadOptions *Operator::builtin_options_as<PadOptions>() const {
+  return builtin_options_as_PadOptions();
+}
+
+template <>
+inline const GatherOptions *Operator::builtin_options_as<GatherOptions>()
+    const {
+  return builtin_options_as_GatherOptions();
+}
+
+template <>
+inline const BatchToSpaceNDOptions *
+Operator::builtin_options_as<BatchToSpaceNDOptions>() const {
+  return builtin_options_as_BatchToSpaceNDOptions();
+}
+
+struct OperatorBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_opcode_index(uint32_t opcode_index) {
     fbb_.AddElement<uint32_t>(Operator::VT_OPCODE_INDEX, opcode_index, 0);
   }
   void add_inputs(flatbuffers::Offset<flatbuffers::Vector<int32_t>> inputs) {
@@ -2760,19 +3307,21 @@ struct OperatorBuilder {
     fbb_.AddOffset(Operator::VT_OUTPUTS, outputs);
   }
   void add_builtin_options_type(BuiltinOptions builtin_options_type) {
-    fbb_.AddElement<uint8_t>(Operator::VT_BUILTIN_OPTIONS_TYPE, static_cast<uint8_t>(builtin_options_type), 0);
+    fbb_.AddElement<uint8_t>(Operator::VT_BUILTIN_OPTIONS_TYPE,
+                             static_cast<uint8_t>(builtin_options_type), 0);
   }
   void add_builtin_options(flatbuffers::Offset<void> builtin_options) {
     fbb_.AddOffset(Operator::VT_BUILTIN_OPTIONS, builtin_options);
   }
-  void add_custom_options(flatbuffers::Offset<flatbuffers::Vector<uint8_t>> custom_options) {
+  void add_custom_options(
+      flatbuffers::Offset<flatbuffers::Vector<uint8_t>> custom_options) {
     fbb_.AddOffset(Operator::VT_CUSTOM_OPTIONS, custom_options);
   }
   void add_custom_options_format(CustomOptionsFormat custom_options_format) {
-    fbb_.AddElement<int8_t>(Operator::VT_CUSTOM_OPTIONS_FORMAT, static_cast<int8_t>(custom_options_format), 0);
+    fbb_.AddElement<int8_t>(Operator::VT_CUSTOM_OPTIONS_FORMAT,
+                            static_cast<int8_t>(custom_options_format), 0);
   }
-  explicit OperatorBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+  explicit OperatorBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   OperatorBuilder &operator=(const OperatorBuilder &);
@@ -2784,14 +3333,14 @@ struct OperatorBuilder {
 };
 
 inline flatbuffers::Offset<Operator> CreateOperator(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    uint32_t opcode_index = 0,
+    flatbuffers::FlatBufferBuilder &_fbb, uint32_t opcode_index = 0,
     flatbuffers::Offset<flatbuffers::Vector<int32_t>> inputs = 0,
     flatbuffers::Offset<flatbuffers::Vector<int32_t>> outputs = 0,
     BuiltinOptions builtin_options_type = BuiltinOptions_NONE,
     flatbuffers::Offset<void> builtin_options = 0,
     flatbuffers::Offset<flatbuffers::Vector<uint8_t>> custom_options = 0,
-    CustomOptionsFormat custom_options_format = CustomOptionsFormat_FLEXBUFFERS) {
+    CustomOptionsFormat custom_options_format =
+        CustomOptionsFormat_FLEXBUFFERS) {
   OperatorBuilder builder_(_fbb);
   builder_.add_custom_options(custom_options);
   builder_.add_builtin_options(builtin_options);
@@ -2804,26 +3353,25 @@ inline flatbuffers::Offset<Operator> CreateOperator(
 }
 
 inline flatbuffers::Offset<Operator> CreateOperatorDirect(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    uint32_t opcode_index = 0,
+    flatbuffers::FlatBufferBuilder &_fbb, uint32_t opcode_index = 0,
     const std::vector<int32_t> *inputs = nullptr,
     const std::vector<int32_t> *outputs = nullptr,
     BuiltinOptions builtin_options_type = BuiltinOptions_NONE,
     flatbuffers::Offset<void> builtin_options = 0,
     const std::vector<uint8_t> *custom_options = nullptr,
-    CustomOptionsFormat custom_options_format = CustomOptionsFormat_FLEXBUFFERS) {
+    CustomOptionsFormat custom_options_format =
+        CustomOptionsFormat_FLEXBUFFERS) {
   return tflite::CreateOperator(
-      _fbb,
-      opcode_index,
-      inputs ? _fbb.CreateVector<int32_t>(*inputs) : 0,
-      outputs ? _fbb.CreateVector<int32_t>(*outputs) : 0,
-      builtin_options_type,
+      _fbb, opcode_index, inputs ? _fbb.CreateVector<int32_t>(*inputs) : 0,
+      outputs ? _fbb.CreateVector<int32_t>(*outputs) : 0, builtin_options_type,
       builtin_options,
       custom_options ? _fbb.CreateVector<uint8_t>(*custom_options) : 0,
       custom_options_format);
 }
 
-flatbuffers::Offset<Operator> CreateOperator(flatbuffers::FlatBufferBuilder &_fbb, const OperatorT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<Operator> CreateOperator(
+    flatbuffers::FlatBufferBuilder &_fbb, const OperatorT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct SubGraphT : public flatbuffers::NativeTable {
   typedef SubGraph TableType;
@@ -2832,8 +3380,7 @@ struct SubGraphT : public flatbuffers::NativeTable {
   std::vector<int32_t> outputs;
   std::vector<std::unique_ptr<OperatorT>> operators;
   std::string name;
-  SubGraphT() {
-  }
+  SubGraphT() {}
 };
 
 struct SubGraph FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
@@ -2846,7 +3393,8 @@ struct SubGraph FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
     VT_NAME = 12
   };
   const flatbuffers::Vector<flatbuffers::Offset<Tensor>> *tensors() const {
-    return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<Tensor>> *>(VT_TENSORS);
+    return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<Tensor>> *>(
+        VT_TENSORS);
   }
   const flatbuffers::Vector<int32_t> *inputs() const {
     return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_INPUTS);
@@ -2855,36 +3403,41 @@ struct SubGraph FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
     return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_OUTPUTS);
   }
   const flatbuffers::Vector<flatbuffers::Offset<Operator>> *operators() const {
-    return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<Operator>> *>(VT_OPERATORS);
+    return GetPointer<
+        const flatbuffers::Vector<flatbuffers::Offset<Operator>> *>(
+        VT_OPERATORS);
   }
   const flatbuffers::String *name() const {
     return GetPointer<const flatbuffers::String *>(VT_NAME);
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyOffset(verifier, VT_TENSORS) &&
+    return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_TENSORS) &&
            verifier.Verify(tensors()) &&
            verifier.VerifyVectorOfTables(tensors()) &&
-           VerifyOffset(verifier, VT_INPUTS) &&
-           verifier.Verify(inputs()) &&
-           VerifyOffset(verifier, VT_OUTPUTS) &&
-           verifier.Verify(outputs()) &&
+           VerifyOffset(verifier, VT_INPUTS) && verifier.Verify(inputs()) &&
+           VerifyOffset(verifier, VT_OUTPUTS) && verifier.Verify(outputs()) &&
            VerifyOffset(verifier, VT_OPERATORS) &&
            verifier.Verify(operators()) &&
            verifier.VerifyVectorOfTables(operators()) &&
-           VerifyOffset(verifier, VT_NAME) &&
-           verifier.Verify(name()) &&
+           VerifyOffset(verifier, VT_NAME) && verifier.Verify(name()) &&
            verifier.EndTable();
   }
-  SubGraphT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(SubGraphT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<SubGraph> Pack(flatbuffers::FlatBufferBuilder &_fbb, const SubGraphT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  SubGraphT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      SubGraphT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SubGraph> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const SubGraphT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct SubGraphBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_tensors(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Tensor>>> tensors) {
+  void add_tensors(
+      flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Tensor>>>
+          tensors) {
     fbb_.AddOffset(SubGraph::VT_TENSORS, tensors);
   }
   void add_inputs(flatbuffers::Offset<flatbuffers::Vector<int32_t>> inputs) {
@@ -2893,14 +3446,15 @@ struct SubGraphBuilder {
   void add_outputs(flatbuffers::Offset<flatbuffers::Vector<int32_t>> outputs) {
     fbb_.AddOffset(SubGraph::VT_OUTPUTS, outputs);
   }
-  void add_operators(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Operator>>> operators) {
+  void add_operators(
+      flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Operator>>>
+          operators) {
     fbb_.AddOffset(SubGraph::VT_OPERATORS, operators);
   }
   void add_name(flatbuffers::Offset<flatbuffers::String> name) {
     fbb_.AddOffset(SubGraph::VT_NAME, name);
   }
-  explicit SubGraphBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+  explicit SubGraphBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   SubGraphBuilder &operator=(const SubGraphBuilder &);
@@ -2913,10 +3467,12 @@ struct SubGraphBuilder {
 
 inline flatbuffers::Offset<SubGraph> CreateSubGraph(
     flatbuffers::FlatBufferBuilder &_fbb,
-    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Tensor>>> tensors = 0,
+    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Tensor>>>
+        tensors = 0,
     flatbuffers::Offset<flatbuffers::Vector<int32_t>> inputs = 0,
     flatbuffers::Offset<flatbuffers::Vector<int32_t>> outputs = 0,
-    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Operator>>> operators = 0,
+    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Operator>>>
+        operators = 0,
     flatbuffers::Offset<flatbuffers::String> name = 0) {
   SubGraphBuilder builder_(_fbb);
   builder_.add_name(name);
@@ -2939,36 +3495,38 @@ inline flatbuffers::Offset<SubGraph> CreateSubGraphDirect(
       tensors ? _fbb.CreateVector<flatbuffers::Offset<Tensor>>(*tensors) : 0,
       inputs ? _fbb.CreateVector<int32_t>(*inputs) : 0,
       outputs ? _fbb.CreateVector<int32_t>(*outputs) : 0,
-      operators ? _fbb.CreateVector<flatbuffers::Offset<Operator>>(*operators) : 0,
+      operators ? _fbb.CreateVector<flatbuffers::Offset<Operator>>(*operators)
+                : 0,
       name ? _fbb.CreateString(name) : 0);
 }
 
-flatbuffers::Offset<SubGraph> CreateSubGraph(flatbuffers::FlatBufferBuilder &_fbb, const SubGraphT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<SubGraph> CreateSubGraph(
+    flatbuffers::FlatBufferBuilder &_fbb, const SubGraphT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct BufferT : public flatbuffers::NativeTable {
   typedef Buffer TableType;
   std::vector<uint8_t> data;
-  BufferT() {
-  }
+  BufferT() {}
 };
 
 struct Buffer FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef BufferT NativeTableType;
-  enum {
-    VT_DATA = 4
-  };
+  enum { VT_DATA = 4 };
   const flatbuffers::Vector<uint8_t> *data() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_DATA);
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyOffset(verifier, VT_DATA) &&
-           verifier.Verify(data()) &&
-           verifier.EndTable();
-  }
-  BufferT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(BufferT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<Buffer> Pack(flatbuffers::FlatBufferBuilder &_fbb, const BufferT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+    return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_DATA) &&
+           verifier.Verify(data()) && verifier.EndTable();
+  }
+  BufferT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(BufferT *_o, const flatbuffers::resolver_function_t *_resolver =
+                                 nullptr) const;
+  static flatbuffers::Offset<Buffer> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const BufferT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct BufferBuilder {
@@ -2977,8 +3535,7 @@ struct BufferBuilder {
   void add_data(flatbuffers::Offset<flatbuffers::Vector<uint8_t>> data) {
     fbb_.AddOffset(Buffer::VT_DATA, data);
   }
-  explicit BufferBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+  explicit BufferBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   BufferBuilder &operator=(const BufferBuilder &);
@@ -3000,12 +3557,13 @@ inline flatbuffers::Offset<Buffer> CreateBuffer(
 inline flatbuffers::Offset<Buffer> CreateBufferDirect(
     flatbuffers::FlatBufferBuilder &_fbb,
     const std::vector<uint8_t> *data = nullptr) {
-  return tflite::CreateBuffer(
-      _fbb,
-      data ? _fbb.CreateVector<uint8_t>(*data) : 0);
+  return tflite::CreateBuffer(_fbb,
+                              data ? _fbb.CreateVector<uint8_t>(*data) : 0);
 }
 
-flatbuffers::Offset<Buffer> CreateBuffer(flatbuffers::FlatBufferBuilder &_fbb, const BufferT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<Buffer> CreateBuffer(
+    flatbuffers::FlatBufferBuilder &_fbb, const BufferT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct ModelT : public flatbuffers::NativeTable {
   typedef Model TableType;
@@ -3014,9 +3572,7 @@ struct ModelT : public flatbuffers::NativeTable {
   std::vector<std::unique_ptr<SubGraphT>> subgraphs;
   std::string description;
   std::vector<std::unique_ptr<BufferT>> buffers;
-  ModelT()
-      : version(0) {
-  }
+  ModelT() : version(0) {}
 };
 
 struct Model FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
@@ -3028,20 +3584,24 @@ struct Model FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
     VT_DESCRIPTION = 10,
     VT_BUFFERS = 12
   };
-  uint32_t version() const {
-    return GetField<uint32_t>(VT_VERSION, 0);
-  }
-  const flatbuffers::Vector<flatbuffers::Offset<OperatorCode>> *operator_codes() const {
-    return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<OperatorCode>> *>(VT_OPERATOR_CODES);
+  uint32_t version() const { return GetField<uint32_t>(VT_VERSION, 0); }
+  const flatbuffers::Vector<flatbuffers::Offset<OperatorCode>> *operator_codes()
+      const {
+    return GetPointer<
+        const flatbuffers::Vector<flatbuffers::Offset<OperatorCode>> *>(
+        VT_OPERATOR_CODES);
   }
   const flatbuffers::Vector<flatbuffers::Offset<SubGraph>> *subgraphs() const {
-    return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<SubGraph>> *>(VT_SUBGRAPHS);
+    return GetPointer<
+        const flatbuffers::Vector<flatbuffers::Offset<SubGraph>> *>(
+        VT_SUBGRAPHS);
   }
   const flatbuffers::String *description() const {
     return GetPointer<const flatbuffers::String *>(VT_DESCRIPTION);
   }
   const flatbuffers::Vector<flatbuffers::Offset<Buffer>> *buffers() const {
-    return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<Buffer>> *>(VT_BUFFERS);
+    return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<Buffer>> *>(
+        VT_BUFFERS);
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
@@ -3054,14 +3614,16 @@ struct Model FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
            verifier.VerifyVectorOfTables(subgraphs()) &&
            VerifyOffset(verifier, VT_DESCRIPTION) &&
            verifier.Verify(description()) &&
-           VerifyOffset(verifier, VT_BUFFERS) &&
-           verifier.Verify(buffers()) &&
-           verifier.VerifyVectorOfTables(buffers()) &&
-           verifier.EndTable();
-  }
-  ModelT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(ModelT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<Model> Pack(flatbuffers::FlatBufferBuilder &_fbb, const ModelT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+           VerifyOffset(verifier, VT_BUFFERS) && verifier.Verify(buffers()) &&
+           verifier.VerifyVectorOfTables(buffers()) && verifier.EndTable();
+  }
+  ModelT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(ModelT *_o, const flatbuffers::resolver_function_t *_resolver =
+                                nullptr) const;
+  static flatbuffers::Offset<Model> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const ModelT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct ModelBuilder {
@@ -3070,20 +3632,26 @@ struct ModelBuilder {
   void add_version(uint32_t version) {
     fbb_.AddElement<uint32_t>(Model::VT_VERSION, version, 0);
   }
-  void add_operator_codes(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<OperatorCode>>> operator_codes) {
+  void add_operator_codes(
+      flatbuffers::Offset<
+          flatbuffers::Vector<flatbuffers::Offset<OperatorCode>>>
+          operator_codes) {
     fbb_.AddOffset(Model::VT_OPERATOR_CODES, operator_codes);
   }
-  void add_subgraphs(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<SubGraph>>> subgraphs) {
+  void add_subgraphs(
+      flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<SubGraph>>>
+          subgraphs) {
     fbb_.AddOffset(Model::VT_SUBGRAPHS, subgraphs);
   }
   void add_description(flatbuffers::Offset<flatbuffers::String> description) {
     fbb_.AddOffset(Model::VT_DESCRIPTION, description);
   }
-  void add_buffers(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Buffer>>> buffers) {
+  void add_buffers(
+      flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Buffer>>>
+          buffers) {
     fbb_.AddOffset(Model::VT_BUFFERS, buffers);
   }
-  explicit ModelBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+  explicit ModelBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   ModelBuilder &operator=(const ModelBuilder &);
@@ -3095,12 +3663,14 @@ struct ModelBuilder {
 };
 
 inline flatbuffers::Offset<Model> CreateModel(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    uint32_t version = 0,
-    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<OperatorCode>>> operator_codes = 0,
-    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<SubGraph>>> subgraphs = 0,
+    flatbuffers::FlatBufferBuilder &_fbb, uint32_t version = 0,
+    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<OperatorCode>>>
+        operator_codes = 0,
+    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<SubGraph>>>
+        subgraphs = 0,
     flatbuffers::Offset<flatbuffers::String> description = 0,
-    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Buffer>>> buffers = 0) {
+    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Buffer>>>
+        buffers = 0) {
   ModelBuilder builder_(_fbb);
   builder_.add_buffers(buffers);
   builder_.add_description(description);
@@ -3111,890 +3681,1693 @@ inline flatbuffers::Offset<Model> CreateModel(
 }
 
 inline flatbuffers::Offset<Model> CreateModelDirect(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    uint32_t version = 0,
-    const std::vector<flatbuffers::Offset<OperatorCode>> *operator_codes = nullptr,
+    flatbuffers::FlatBufferBuilder &_fbb, uint32_t version = 0,
+    const std::vector<flatbuffers::Offset<OperatorCode>> *operator_codes =
+        nullptr,
     const std::vector<flatbuffers::Offset<SubGraph>> *subgraphs = nullptr,
     const char *description = nullptr,
     const std::vector<flatbuffers::Offset<Buffer>> *buffers = nullptr) {
   return tflite::CreateModel(
-      _fbb,
-      version,
-      operator_codes ? _fbb.CreateVector<flatbuffers::Offset<OperatorCode>>(*operator_codes) : 0,
-      subgraphs ? _fbb.CreateVector<flatbuffers::Offset<SubGraph>>(*subgraphs) : 0,
+      _fbb, version,
+      operator_codes ? _fbb.CreateVector<flatbuffers::Offset<OperatorCode>>(
+                           *operator_codes)
+                     : 0,
+      subgraphs ? _fbb.CreateVector<flatbuffers::Offset<SubGraph>>(*subgraphs)
+                : 0,
       description ? _fbb.CreateString(description) : 0,
       buffers ? _fbb.CreateVector<flatbuffers::Offset<Buffer>>(*buffers) : 0);
 }
 
-flatbuffers::Offset<Model> CreateModel(flatbuffers::FlatBufferBuilder &_fbb, const ModelT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<Model> CreateModel(
+    flatbuffers::FlatBufferBuilder &_fbb, const ModelT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
-inline QuantizationParametersT *QuantizationParameters::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline QuantizationParametersT *QuantizationParameters::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new QuantizationParametersT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void QuantizationParameters::UnPackTo(QuantizationParametersT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void QuantizationParameters::UnPackTo(
+    QuantizationParametersT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = min(); if (_e) { _o->min.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->min[_i] = _e->Get(_i); } } };
-  { auto _e = max(); if (_e) { _o->max.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->max[_i] = _e->Get(_i); } } };
-  { auto _e = scale(); if (_e) { _o->scale.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->scale[_i] = _e->Get(_i); } } };
-  { auto _e = zero_point(); if (_e) { _o->zero_point.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->zero_point[_i] = _e->Get(_i); } } };
+  {
+    auto _e = min();
+    if (_e) {
+      _o->min.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->min[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = max();
+    if (_e) {
+      _o->max.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->max[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = scale();
+    if (_e) {
+      _o->scale.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->scale[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = zero_point();
+    if (_e) {
+      _o->zero_point.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->zero_point[_i] = _e->Get(_i);
+      }
+    }
+  };
 }
 
-inline flatbuffers::Offset<QuantizationParameters> QuantizationParameters::Pack(flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<QuantizationParameters> QuantizationParameters::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateQuantizationParameters(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(
+    flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const QuantizationParametersT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const QuantizationParametersT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _min = _o->min.size() ? _fbb.CreateVector(_o->min) : 0;
   auto _max = _o->max.size() ? _fbb.CreateVector(_o->max) : 0;
   auto _scale = _o->scale.size() ? _fbb.CreateVector(_o->scale) : 0;
-  auto _zero_point = _o->zero_point.size() ? _fbb.CreateVector(_o->zero_point) : 0;
-  return tflite::CreateQuantizationParameters(
-      _fbb,
-      _min,
-      _max,
-      _scale,
-      _zero_point);
+  auto _zero_point =
+      _o->zero_point.size() ? _fbb.CreateVector(_o->zero_point) : 0;
+  return tflite::CreateQuantizationParameters(_fbb, _min, _max, _scale,
+                                              _zero_point);
 }
 
-inline TensorT *Tensor::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline TensorT *Tensor::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new TensorT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void Tensor::UnPackTo(TensorT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void Tensor::UnPackTo(
+    TensorT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = shape(); if (_e) { _o->shape.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->shape[_i] = _e->Get(_i); } } };
-  { auto _e = type(); _o->type = _e; };
-  { auto _e = buffer(); _o->buffer = _e; };
-  { auto _e = name(); if (_e) _o->name = _e->str(); };
-  { auto _e = quantization(); if (_e) _o->quantization = std::unique_ptr<QuantizationParametersT>(_e->UnPack(_resolver)); };
+  {
+    auto _e = shape();
+    if (_e) {
+      _o->shape.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->shape[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = type();
+    _o->type = _e;
+  };
+  {
+    auto _e = buffer();
+    _o->buffer = _e;
+  };
+  {
+    auto _e = name();
+    if (_e) _o->name = _e->str();
+  };
+  {
+    auto _e = quantization();
+    if (_e)
+      _o->quantization =
+          std::unique_ptr<QuantizationParametersT>(_e->UnPack(_resolver));
+  };
 }
 
-inline flatbuffers::Offset<Tensor> Tensor::Pack(flatbuffers::FlatBufferBuilder &_fbb, const TensorT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<Tensor> Tensor::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const TensorT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateTensor(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<Tensor> CreateTensor(flatbuffers::FlatBufferBuilder &_fbb, const TensorT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<Tensor> CreateTensor(
+    flatbuffers::FlatBufferBuilder &_fbb, const TensorT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const TensorT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const TensorT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _shape = _o->shape.size() ? _fbb.CreateVector(_o->shape) : 0;
   auto _type = _o->type;
   auto _buffer = _o->buffer;
   auto _name = _o->name.empty() ? 0 : _fbb.CreateString(_o->name);
-  auto _quantization = _o->quantization ? CreateQuantizationParameters(_fbb, _o->quantization.get(), _rehasher) : 0;
-  return tflite::CreateTensor(
-      _fbb,
-      _shape,
-      _type,
-      _buffer,
-      _name,
-      _quantization);
+  auto _quantization = _o->quantization
+                           ? CreateQuantizationParameters(
+                                 _fbb, _o->quantization.get(), _rehasher)
+                           : 0;
+  return tflite::CreateTensor(_fbb, _shape, _type, _buffer, _name,
+                              _quantization);
 }
 
-inline Conv2DOptionsT *Conv2DOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline Conv2DOptionsT *Conv2DOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new Conv2DOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void Conv2DOptions::UnPackTo(Conv2DOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void Conv2DOptions::UnPackTo(
+    Conv2DOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = padding(); _o->padding = _e; };
-  { auto _e = stride_w(); _o->stride_w = _e; };
-  { auto _e = stride_h(); _o->stride_h = _e; };
-  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  {
+    auto _e = padding();
+    _o->padding = _e;
+  };
+  {
+    auto _e = stride_w();
+    _o->stride_w = _e;
+  };
+  {
+    auto _e = stride_h();
+    _o->stride_h = _e;
+  };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
 }
 
-inline flatbuffers::Offset<Conv2DOptions> Conv2DOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<Conv2DOptions> Conv2DOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateConv2DOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<Conv2DOptions> CreateConv2DOptions(flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<Conv2DOptions> CreateConv2DOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const Conv2DOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const Conv2DOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _padding = _o->padding;
   auto _stride_w = _o->stride_w;
   auto _stride_h = _o->stride_h;
   auto _fused_activation_function = _o->fused_activation_function;
-  return tflite::CreateConv2DOptions(
-      _fbb,
-      _padding,
-      _stride_w,
-      _stride_h,
-      _fused_activation_function);
+  return tflite::CreateConv2DOptions(_fbb, _padding, _stride_w, _stride_h,
+                                     _fused_activation_function);
 }
 
-inline Pool2DOptionsT *Pool2DOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline Pool2DOptionsT *Pool2DOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new Pool2DOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void Pool2DOptions::UnPackTo(Pool2DOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void Pool2DOptions::UnPackTo(
+    Pool2DOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = padding(); _o->padding = _e; };
-  { auto _e = stride_w(); _o->stride_w = _e; };
-  { auto _e = stride_h(); _o->stride_h = _e; };
-  { auto _e = filter_width(); _o->filter_width = _e; };
-  { auto _e = filter_height(); _o->filter_height = _e; };
-  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  {
+    auto _e = padding();
+    _o->padding = _e;
+  };
+  {
+    auto _e = stride_w();
+    _o->stride_w = _e;
+  };
+  {
+    auto _e = stride_h();
+    _o->stride_h = _e;
+  };
+  {
+    auto _e = filter_width();
+    _o->filter_width = _e;
+  };
+  {
+    auto _e = filter_height();
+    _o->filter_height = _e;
+  };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
 }
 
-inline flatbuffers::Offset<Pool2DOptions> Pool2DOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const Pool2DOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<Pool2DOptions> Pool2DOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const Pool2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreatePool2DOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<Pool2DOptions> CreatePool2DOptions(flatbuffers::FlatBufferBuilder &_fbb, const Pool2DOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<Pool2DOptions> CreatePool2DOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const Pool2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const Pool2DOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const Pool2DOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _padding = _o->padding;
   auto _stride_w = _o->stride_w;
   auto _stride_h = _o->stride_h;
   auto _filter_width = _o->filter_width;
   auto _filter_height = _o->filter_height;
   auto _fused_activation_function = _o->fused_activation_function;
-  return tflite::CreatePool2DOptions(
-      _fbb,
-      _padding,
-      _stride_w,
-      _stride_h,
-      _filter_width,
-      _filter_height,
-      _fused_activation_function);
+  return tflite::CreatePool2DOptions(_fbb, _padding, _stride_w, _stride_h,
+                                     _filter_width, _filter_height,
+                                     _fused_activation_function);
 }
 
-inline DepthwiseConv2DOptionsT *DepthwiseConv2DOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline DepthwiseConv2DOptionsT *DepthwiseConv2DOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new DepthwiseConv2DOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void DepthwiseConv2DOptions::UnPackTo(DepthwiseConv2DOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void DepthwiseConv2DOptions::UnPackTo(
+    DepthwiseConv2DOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = padding(); _o->padding = _e; };
-  { auto _e = stride_w(); _o->stride_w = _e; };
-  { auto _e = stride_h(); _o->stride_h = _e; };
-  { auto _e = depth_multiplier(); _o->depth_multiplier = _e; };
-  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  {
+    auto _e = padding();
+    _o->padding = _e;
+  };
+  {
+    auto _e = stride_w();
+    _o->stride_w = _e;
+  };
+  {
+    auto _e = stride_h();
+    _o->stride_h = _e;
+  };
+  {
+    auto _e = depth_multiplier();
+    _o->depth_multiplier = _e;
+  };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
 }
 
-inline flatbuffers::Offset<DepthwiseConv2DOptions> DepthwiseConv2DOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const DepthwiseConv2DOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<DepthwiseConv2DOptions> DepthwiseConv2DOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const DepthwiseConv2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateDepthwiseConv2DOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<DepthwiseConv2DOptions> CreateDepthwiseConv2DOptions(flatbuffers::FlatBufferBuilder &_fbb, const DepthwiseConv2DOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<DepthwiseConv2DOptions> CreateDepthwiseConv2DOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const DepthwiseConv2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const DepthwiseConv2DOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const DepthwiseConv2DOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _padding = _o->padding;
   auto _stride_w = _o->stride_w;
   auto _stride_h = _o->stride_h;
   auto _depth_multiplier = _o->depth_multiplier;
   auto _fused_activation_function = _o->fused_activation_function;
-  return tflite::CreateDepthwiseConv2DOptions(
-      _fbb,
-      _padding,
-      _stride_w,
-      _stride_h,
-      _depth_multiplier,
-      _fused_activation_function);
+  return tflite::CreateDepthwiseConv2DOptions(_fbb, _padding, _stride_w,
+                                              _stride_h, _depth_multiplier,
+                                              _fused_activation_function);
 }
 
-inline ConcatEmbeddingsOptionsT *ConcatEmbeddingsOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline ConcatEmbeddingsOptionsT *ConcatEmbeddingsOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new ConcatEmbeddingsOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void ConcatEmbeddingsOptions::UnPackTo(ConcatEmbeddingsOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void ConcatEmbeddingsOptions::UnPackTo(
+    ConcatEmbeddingsOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = num_channels(); _o->num_channels = _e; };
-  { auto _e = num_columns_per_channel(); if (_e) { _o->num_columns_per_channel.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->num_columns_per_channel[_i] = _e->Get(_i); } } };
-  { auto _e = embedding_dim_per_channel(); if (_e) { _o->embedding_dim_per_channel.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->embedding_dim_per_channel[_i] = _e->Get(_i); } } };
+  {
+    auto _e = num_channels();
+    _o->num_channels = _e;
+  };
+  {
+    auto _e = num_columns_per_channel();
+    if (_e) {
+      _o->num_columns_per_channel.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->num_columns_per_channel[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = embedding_dim_per_channel();
+    if (_e) {
+      _o->embedding_dim_per_channel.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->embedding_dim_per_channel[_i] = _e->Get(_i);
+      }
+    }
+  };
 }
 
-inline flatbuffers::Offset<ConcatEmbeddingsOptions> ConcatEmbeddingsOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ConcatEmbeddingsOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<ConcatEmbeddingsOptions>
+ConcatEmbeddingsOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const ConcatEmbeddingsOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateConcatEmbeddingsOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<ConcatEmbeddingsOptions> CreateConcatEmbeddingsOptions(flatbuffers::FlatBufferBuilder &_fbb, const ConcatEmbeddingsOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<ConcatEmbeddingsOptions>
+CreateConcatEmbeddingsOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ConcatEmbeddingsOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const ConcatEmbeddingsOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const ConcatEmbeddingsOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _num_channels = _o->num_channels;
-  auto _num_columns_per_channel = _o->num_columns_per_channel.size() ? _fbb.CreateVector(_o->num_columns_per_channel) : 0;
-  auto _embedding_dim_per_channel = _o->embedding_dim_per_channel.size() ? _fbb.CreateVector(_o->embedding_dim_per_channel) : 0;
-  return tflite::CreateConcatEmbeddingsOptions(
-      _fbb,
-      _num_channels,
-      _num_columns_per_channel,
-      _embedding_dim_per_channel);
-}
-
-inline LSHProjectionOptionsT *LSHProjectionOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _num_columns_per_channel =
+      _o->num_columns_per_channel.size()
+          ? _fbb.CreateVector(_o->num_columns_per_channel)
+          : 0;
+  auto _embedding_dim_per_channel =
+      _o->embedding_dim_per_channel.size()
+          ? _fbb.CreateVector(_o->embedding_dim_per_channel)
+          : 0;
+  return tflite::CreateConcatEmbeddingsOptions(_fbb, _num_channels,
+                                               _num_columns_per_channel,
+                                               _embedding_dim_per_channel);
+}
+
+inline LSHProjectionOptionsT *LSHProjectionOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new LSHProjectionOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void LSHProjectionOptions::UnPackTo(LSHProjectionOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void LSHProjectionOptions::UnPackTo(
+    LSHProjectionOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = type(); _o->type = _e; };
+  {
+    auto _e = type();
+    _o->type = _e;
+  };
 }
 
-inline flatbuffers::Offset<LSHProjectionOptions> LSHProjectionOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const LSHProjectionOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<LSHProjectionOptions> LSHProjectionOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const LSHProjectionOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateLSHProjectionOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<LSHProjectionOptions> CreateLSHProjectionOptions(flatbuffers::FlatBufferBuilder &_fbb, const LSHProjectionOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<LSHProjectionOptions> CreateLSHProjectionOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const LSHProjectionOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const LSHProjectionOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const LSHProjectionOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _type = _o->type;
-  return tflite::CreateLSHProjectionOptions(
-      _fbb,
-      _type);
+  return tflite::CreateLSHProjectionOptions(_fbb, _type);
 }
 
-inline SVDFOptionsT *SVDFOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline SVDFOptionsT *SVDFOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new SVDFOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void SVDFOptions::UnPackTo(SVDFOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void SVDFOptions::UnPackTo(
+    SVDFOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = rank(); _o->rank = _e; };
-  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  {
+    auto _e = rank();
+    _o->rank = _e;
+  };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
 }
 
-inline flatbuffers::Offset<SVDFOptions> SVDFOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const SVDFOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<SVDFOptions> SVDFOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const SVDFOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateSVDFOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<SVDFOptions> CreateSVDFOptions(flatbuffers::FlatBufferBuilder &_fbb, const SVDFOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<SVDFOptions> CreateSVDFOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SVDFOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const SVDFOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const SVDFOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _rank = _o->rank;
   auto _fused_activation_function = _o->fused_activation_function;
-  return tflite::CreateSVDFOptions(
-      _fbb,
-      _rank,
-      _fused_activation_function);
+  return tflite::CreateSVDFOptions(_fbb, _rank, _fused_activation_function);
 }
 
-inline RNNOptionsT *RNNOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline RNNOptionsT *RNNOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new RNNOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void RNNOptions::UnPackTo(RNNOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void RNNOptions::UnPackTo(
+    RNNOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
 }
 
-inline flatbuffers::Offset<RNNOptions> RNNOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const RNNOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<RNNOptions> RNNOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const RNNOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateRNNOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<RNNOptions> CreateRNNOptions(flatbuffers::FlatBufferBuilder &_fbb, const RNNOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<RNNOptions> CreateRNNOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const RNNOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const RNNOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const RNNOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _fused_activation_function = _o->fused_activation_function;
-  return tflite::CreateRNNOptions(
-      _fbb,
-      _fused_activation_function);
+  return tflite::CreateRNNOptions(_fbb, _fused_activation_function);
 }
 
-inline FullyConnectedOptionsT *FullyConnectedOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline FullyConnectedOptionsT *FullyConnectedOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new FullyConnectedOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void FullyConnectedOptions::UnPackTo(FullyConnectedOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void FullyConnectedOptions::UnPackTo(
+    FullyConnectedOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
 }
 
-inline flatbuffers::Offset<FullyConnectedOptions> FullyConnectedOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const FullyConnectedOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<FullyConnectedOptions> FullyConnectedOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const FullyConnectedOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateFullyConnectedOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<FullyConnectedOptions> CreateFullyConnectedOptions(flatbuffers::FlatBufferBuilder &_fbb, const FullyConnectedOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<FullyConnectedOptions> CreateFullyConnectedOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const FullyConnectedOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const FullyConnectedOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const FullyConnectedOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _fused_activation_function = _o->fused_activation_function;
-  return tflite::CreateFullyConnectedOptions(
-      _fbb,
-      _fused_activation_function);
+  return tflite::CreateFullyConnectedOptions(_fbb, _fused_activation_function);
 }
 
-inline SoftmaxOptionsT *SoftmaxOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline SoftmaxOptionsT *SoftmaxOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new SoftmaxOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void SoftmaxOptions::UnPackTo(SoftmaxOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void SoftmaxOptions::UnPackTo(
+    SoftmaxOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = beta(); _o->beta = _e; };
+  {
+    auto _e = beta();
+    _o->beta = _e;
+  };
 }
 
-inline flatbuffers::Offset<SoftmaxOptions> SoftmaxOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const SoftmaxOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<SoftmaxOptions> SoftmaxOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const SoftmaxOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateSoftmaxOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<SoftmaxOptions> CreateSoftmaxOptions(flatbuffers::FlatBufferBuilder &_fbb, const SoftmaxOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<SoftmaxOptions> CreateSoftmaxOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SoftmaxOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const SoftmaxOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const SoftmaxOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _beta = _o->beta;
-  return tflite::CreateSoftmaxOptions(
-      _fbb,
-      _beta);
+  return tflite::CreateSoftmaxOptions(_fbb, _beta);
 }
 
-inline ConcatenationOptionsT *ConcatenationOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline ConcatenationOptionsT *ConcatenationOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new ConcatenationOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void ConcatenationOptions::UnPackTo(ConcatenationOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void ConcatenationOptions::UnPackTo(
+    ConcatenationOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = axis(); _o->axis = _e; };
-  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  {
+    auto _e = axis();
+    _o->axis = _e;
+  };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
 }
 
-inline flatbuffers::Offset<ConcatenationOptions> ConcatenationOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ConcatenationOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<ConcatenationOptions> ConcatenationOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const ConcatenationOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateConcatenationOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<ConcatenationOptions> CreateConcatenationOptions(flatbuffers::FlatBufferBuilder &_fbb, const ConcatenationOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<ConcatenationOptions> CreateConcatenationOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ConcatenationOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const ConcatenationOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const ConcatenationOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _axis = _o->axis;
   auto _fused_activation_function = _o->fused_activation_function;
-  return tflite::CreateConcatenationOptions(
-      _fbb,
-      _axis,
-      _fused_activation_function);
+  return tflite::CreateConcatenationOptions(_fbb, _axis,
+                                            _fused_activation_function);
 }
 
-inline AddOptionsT *AddOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline AddOptionsT *AddOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new AddOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void AddOptions::UnPackTo(AddOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void AddOptions::UnPackTo(
+    AddOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
 }
 
-inline flatbuffers::Offset<AddOptions> AddOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const AddOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<AddOptions> AddOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const AddOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateAddOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<AddOptions> CreateAddOptions(flatbuffers::FlatBufferBuilder &_fbb, const AddOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<AddOptions> CreateAddOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const AddOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const AddOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const AddOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _fused_activation_function = _o->fused_activation_function;
-  return tflite::CreateAddOptions(
-      _fbb,
-      _fused_activation_function);
+  return tflite::CreateAddOptions(_fbb, _fused_activation_function);
 }
 
-inline MulOptionsT *MulOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline MulOptionsT *MulOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new MulOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void MulOptions::UnPackTo(MulOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void MulOptions::UnPackTo(
+    MulOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
 }
 
-inline flatbuffers::Offset<MulOptions> MulOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const MulOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<MulOptions> MulOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const MulOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateMulOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<MulOptions> CreateMulOptions(flatbuffers::FlatBufferBuilder &_fbb, const MulOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<MulOptions> CreateMulOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const MulOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const MulOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const MulOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _fused_activation_function = _o->fused_activation_function;
-  return tflite::CreateMulOptions(
-      _fbb,
-      _fused_activation_function);
+  return tflite::CreateMulOptions(_fbb, _fused_activation_function);
 }
 
-inline L2NormOptionsT *L2NormOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline L2NormOptionsT *L2NormOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new L2NormOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void L2NormOptions::UnPackTo(L2NormOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void L2NormOptions::UnPackTo(
+    L2NormOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
 }
 
-inline flatbuffers::Offset<L2NormOptions> L2NormOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const L2NormOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<L2NormOptions> L2NormOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const L2NormOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateL2NormOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<L2NormOptions> CreateL2NormOptions(flatbuffers::FlatBufferBuilder &_fbb, const L2NormOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<L2NormOptions> CreateL2NormOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const L2NormOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const L2NormOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const L2NormOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _fused_activation_function = _o->fused_activation_function;
-  return tflite::CreateL2NormOptions(
-      _fbb,
-      _fused_activation_function);
+  return tflite::CreateL2NormOptions(_fbb, _fused_activation_function);
 }
 
-inline LocalResponseNormalizationOptionsT *LocalResponseNormalizationOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline LocalResponseNormalizationOptionsT *
+LocalResponseNormalizationOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new LocalResponseNormalizationOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void LocalResponseNormalizationOptions::UnPackTo(LocalResponseNormalizationOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void LocalResponseNormalizationOptions::UnPackTo(
+    LocalResponseNormalizationOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = radius(); _o->radius = _e; };
-  { auto _e = bias(); _o->bias = _e; };
-  { auto _e = alpha(); _o->alpha = _e; };
-  { auto _e = beta(); _o->beta = _e; };
+  {
+    auto _e = radius();
+    _o->radius = _e;
+  };
+  {
+    auto _e = bias();
+    _o->bias = _e;
+  };
+  {
+    auto _e = alpha();
+    _o->alpha = _e;
+  };
+  {
+    auto _e = beta();
+    _o->beta = _e;
+  };
 }
 
-inline flatbuffers::Offset<LocalResponseNormalizationOptions> LocalResponseNormalizationOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const LocalResponseNormalizationOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<LocalResponseNormalizationOptions>
+LocalResponseNormalizationOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const LocalResponseNormalizationOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateLocalResponseNormalizationOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<LocalResponseNormalizationOptions> CreateLocalResponseNormalizationOptions(flatbuffers::FlatBufferBuilder &_fbb, const LocalResponseNormalizationOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<LocalResponseNormalizationOptions>
+CreateLocalResponseNormalizationOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const LocalResponseNormalizationOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const LocalResponseNormalizationOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const LocalResponseNormalizationOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _radius = _o->radius;
   auto _bias = _o->bias;
   auto _alpha = _o->alpha;
   auto _beta = _o->beta;
-  return tflite::CreateLocalResponseNormalizationOptions(
-      _fbb,
-      _radius,
-      _bias,
-      _alpha,
-      _beta);
+  return tflite::CreateLocalResponseNormalizationOptions(_fbb, _radius, _bias,
+                                                         _alpha, _beta);
 }
 
-inline LSTMOptionsT *LSTMOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline LSTMOptionsT *LSTMOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new LSTMOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void LSTMOptions::UnPackTo(LSTMOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void LSTMOptions::UnPackTo(
+    LSTMOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
-  { auto _e = cell_clip(); _o->cell_clip = _e; };
-  { auto _e = proj_clip(); _o->proj_clip = _e; };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
+  {
+    auto _e = cell_clip();
+    _o->cell_clip = _e;
+  };
+  {
+    auto _e = proj_clip();
+    _o->proj_clip = _e;
+  };
 }
 
-inline flatbuffers::Offset<LSTMOptions> LSTMOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<LSTMOptions> LSTMOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateLSTMOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const LSTMOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const LSTMOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _fused_activation_function = _o->fused_activation_function;
   auto _cell_clip = _o->cell_clip;
   auto _proj_clip = _o->proj_clip;
-  return tflite::CreateLSTMOptions(
-      _fbb,
-      _fused_activation_function,
-      _cell_clip,
-      _proj_clip);
+  return tflite::CreateLSTMOptions(_fbb, _fused_activation_function, _cell_clip,
+                                   _proj_clip);
 }
 
-inline ResizeBilinearOptionsT *ResizeBilinearOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline ResizeBilinearOptionsT *ResizeBilinearOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new ResizeBilinearOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void ResizeBilinearOptions::UnPackTo(ResizeBilinearOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void ResizeBilinearOptions::UnPackTo(
+    ResizeBilinearOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = new_height(); _o->new_height = _e; };
-  { auto _e = new_width(); _o->new_width = _e; };
+  {
+    auto _e = new_height();
+    _o->new_height = _e;
+  };
+  {
+    auto _e = new_width();
+    _o->new_width = _e;
+  };
 }
 
-inline flatbuffers::Offset<ResizeBilinearOptions> ResizeBilinearOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<ResizeBilinearOptions> ResizeBilinearOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateResizeBilinearOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<ResizeBilinearOptions> CreateResizeBilinearOptions(flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<ResizeBilinearOptions> CreateResizeBilinearOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const ResizeBilinearOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const ResizeBilinearOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _new_height = _o->new_height;
   auto _new_width = _o->new_width;
-  return tflite::CreateResizeBilinearOptions(
-      _fbb,
-      _new_height,
-      _new_width);
+  return tflite::CreateResizeBilinearOptions(_fbb, _new_height, _new_width);
 }
 
-inline CallOptionsT *CallOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline CallOptionsT *CallOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new CallOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void CallOptions::UnPackTo(CallOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void CallOptions::UnPackTo(
+    CallOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = subgraph(); _o->subgraph = _e; };
+  {
+    auto _e = subgraph();
+    _o->subgraph = _e;
+  };
 }
 
-inline flatbuffers::Offset<CallOptions> CallOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const CallOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<CallOptions> CallOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const CallOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateCallOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<CallOptions> CreateCallOptions(flatbuffers::FlatBufferBuilder &_fbb, const CallOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<CallOptions> CreateCallOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const CallOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const CallOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const CallOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _subgraph = _o->subgraph;
-  return tflite::CreateCallOptions(
-      _fbb,
-      _subgraph);
+  return tflite::CreateCallOptions(_fbb, _subgraph);
+}
+
+inline PadOptionsT *PadOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new PadOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void PadOptions::UnPackTo(
+    PadOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = before_padding();
+    if (_e) {
+      _o->before_padding.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->before_padding[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = after_padding();
+    if (_e) {
+      _o->after_padding.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->after_padding[_i] = _e->Get(_i);
+      }
+    }
+  };
 }
 
-inline ReshapeOptionsT *ReshapeOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline flatbuffers::Offset<PadOptions> PadOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const PadOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreatePadOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<PadOptions> CreatePadOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const PadOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const PadOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _before_padding =
+      _o->before_padding.size() ? _fbb.CreateVector(_o->before_padding) : 0;
+  auto _after_padding =
+      _o->after_padding.size() ? _fbb.CreateVector(_o->after_padding) : 0;
+  return tflite::CreatePadOptions(_fbb, _before_padding, _after_padding);
+}
+
+inline ReshapeOptionsT *ReshapeOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new ReshapeOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void ReshapeOptions::UnPackTo(ReshapeOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void ReshapeOptions::UnPackTo(
+    ReshapeOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = new_shape(); if (_e) { _o->new_shape.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->new_shape[_i] = _e->Get(_i); } } };
+  {
+    auto _e = new_shape();
+    if (_e) {
+      _o->new_shape.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->new_shape[_i] = _e->Get(_i);
+      }
+    }
+  };
 }
 
-inline flatbuffers::Offset<ReshapeOptions> ReshapeOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ReshapeOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<ReshapeOptions> ReshapeOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const ReshapeOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateReshapeOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<ReshapeOptions> CreateReshapeOptions(flatbuffers::FlatBufferBuilder &_fbb, const ReshapeOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<ReshapeOptions> CreateReshapeOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ReshapeOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const ReshapeOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const ReshapeOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _new_shape = _o->new_shape.size() ? _fbb.CreateVector(_o->new_shape) : 0;
-  return tflite::CreateReshapeOptions(
-      _fbb,
-      _new_shape);
+  return tflite::CreateReshapeOptions(_fbb, _new_shape);
+}
+
+inline BatchToSpaceNDOptionsT *BatchToSpaceNDOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new BatchToSpaceNDOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void BatchToSpaceNDOptions::UnPackTo(
+    BatchToSpaceNDOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = block_shape();
+    if (_e) {
+      _o->block_shape.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->block_shape[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = before_crops();
+    if (_e) {
+      _o->before_crops.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->before_crops[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = after_crops();
+    if (_e) {
+      _o->after_crops.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->after_crops[_i] = _e->Get(_i);
+      }
+    }
+  };
 }
 
-inline SkipGramOptionsT *SkipGramOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline flatbuffers::Offset<BatchToSpaceNDOptions> BatchToSpaceNDOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const BatchToSpaceNDOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateBatchToSpaceNDOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<BatchToSpaceNDOptions> CreateBatchToSpaceNDOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const BatchToSpaceNDOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const BatchToSpaceNDOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _block_shape =
+      _o->block_shape.size() ? _fbb.CreateVector(_o->block_shape) : 0;
+  auto _before_crops =
+      _o->before_crops.size() ? _fbb.CreateVector(_o->before_crops) : 0;
+  auto _after_crops =
+      _o->after_crops.size() ? _fbb.CreateVector(_o->after_crops) : 0;
+  return tflite::CreateBatchToSpaceNDOptions(_fbb, _block_shape, _before_crops,
+                                             _after_crops);
+}
+
+inline SkipGramOptionsT *SkipGramOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new SkipGramOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void SkipGramOptions::UnPackTo(SkipGramOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void SkipGramOptions::UnPackTo(
+    SkipGramOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = ngram_size(); _o->ngram_size = _e; };
-  { auto _e = max_skip_size(); _o->max_skip_size = _e; };
-  { auto _e = include_all_ngrams(); _o->include_all_ngrams = _e; };
+  {
+    auto _e = ngram_size();
+    _o->ngram_size = _e;
+  };
+  {
+    auto _e = max_skip_size();
+    _o->max_skip_size = _e;
+  };
+  {
+    auto _e = include_all_ngrams();
+    _o->include_all_ngrams = _e;
+  };
 }
 
-inline flatbuffers::Offset<SkipGramOptions> SkipGramOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const SkipGramOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<SkipGramOptions> SkipGramOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const SkipGramOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateSkipGramOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<SkipGramOptions> CreateSkipGramOptions(flatbuffers::FlatBufferBuilder &_fbb, const SkipGramOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<SkipGramOptions> CreateSkipGramOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SkipGramOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const SkipGramOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const SkipGramOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _ngram_size = _o->ngram_size;
   auto _max_skip_size = _o->max_skip_size;
   auto _include_all_ngrams = _o->include_all_ngrams;
-  return tflite::CreateSkipGramOptions(
-      _fbb,
-      _ngram_size,
-      _max_skip_size,
-      _include_all_ngrams);
+  return tflite::CreateSkipGramOptions(_fbb, _ngram_size, _max_skip_size,
+                                       _include_all_ngrams);
 }
 
-inline SpaceToDepthOptionsT *SpaceToDepthOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline SpaceToDepthOptionsT *SpaceToDepthOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new SpaceToDepthOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void SpaceToDepthOptions::UnPackTo(SpaceToDepthOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void SpaceToDepthOptions::UnPackTo(
+    SpaceToDepthOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = block_size(); _o->block_size = _e; };
+  {
+    auto _e = block_size();
+    _o->block_size = _e;
+  };
 }
 
-inline flatbuffers::Offset<SpaceToDepthOptions> SpaceToDepthOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const SpaceToDepthOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<SpaceToDepthOptions> SpaceToDepthOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const SpaceToDepthOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateSpaceToDepthOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<SpaceToDepthOptions> CreateSpaceToDepthOptions(flatbuffers::FlatBufferBuilder &_fbb, const SpaceToDepthOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<SpaceToDepthOptions> CreateSpaceToDepthOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SpaceToDepthOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const SpaceToDepthOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const SpaceToDepthOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _block_size = _o->block_size;
-  return tflite::CreateSpaceToDepthOptions(
-      _fbb,
-      _block_size);
+  return tflite::CreateSpaceToDepthOptions(_fbb, _block_size);
 }
 
-inline EmbeddingLookupSparseOptionsT *EmbeddingLookupSparseOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline EmbeddingLookupSparseOptionsT *EmbeddingLookupSparseOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new EmbeddingLookupSparseOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void EmbeddingLookupSparseOptions::UnPackTo(EmbeddingLookupSparseOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void EmbeddingLookupSparseOptions::UnPackTo(
+    EmbeddingLookupSparseOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = combiner(); _o->combiner = _e; };
+  {
+    auto _e = combiner();
+    _o->combiner = _e;
+  };
 }
 
-inline flatbuffers::Offset<EmbeddingLookupSparseOptions> EmbeddingLookupSparseOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const EmbeddingLookupSparseOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<EmbeddingLookupSparseOptions>
+EmbeddingLookupSparseOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const EmbeddingLookupSparseOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateEmbeddingLookupSparseOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<EmbeddingLookupSparseOptions> CreateEmbeddingLookupSparseOptions(flatbuffers::FlatBufferBuilder &_fbb, const EmbeddingLookupSparseOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<EmbeddingLookupSparseOptions>
+CreateEmbeddingLookupSparseOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const EmbeddingLookupSparseOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const EmbeddingLookupSparseOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const EmbeddingLookupSparseOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _combiner = _o->combiner;
-  return tflite::CreateEmbeddingLookupSparseOptions(
-      _fbb,
-      _combiner);
+  return tflite::CreateEmbeddingLookupSparseOptions(_fbb, _combiner);
+}
+
+inline GatherOptionsT *GatherOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new GatherOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void GatherOptions::UnPackTo(
+    GatherOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = axis();
+    _o->axis = _e;
+  };
+}
+
+inline flatbuffers::Offset<GatherOptions> GatherOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const GatherOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateGatherOptions(_fbb, _o, _rehasher);
 }
 
-inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline flatbuffers::Offset<GatherOptions> CreateGatherOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const GatherOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const GatherOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _axis = _o->axis;
+  return tflite::CreateGatherOptions(_fbb, _axis);
+}
+
+inline OperatorCodeT *OperatorCode::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new OperatorCodeT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void OperatorCode::UnPackTo(OperatorCodeT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void OperatorCode::UnPackTo(
+    OperatorCodeT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = builtin_code(); _o->builtin_code = _e; };
-  { auto _e = custom_code(); if (_e) _o->custom_code = _e->str(); };
+  {
+    auto _e = builtin_code();
+    _o->builtin_code = _e;
+  };
+  {
+    auto _e = custom_code();
+    if (_e) _o->custom_code = _e->str();
+  };
 }
 
-inline flatbuffers::Offset<OperatorCode> OperatorCode::Pack(flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<OperatorCode> OperatorCode::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateOperatorCode(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<OperatorCode> CreateOperatorCode(flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<OperatorCode> CreateOperatorCode(
+    flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const OperatorCodeT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const OperatorCodeT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _builtin_code = _o->builtin_code;
-  auto _custom_code = _o->custom_code.empty() ? 0 : _fbb.CreateString(_o->custom_code);
-  return tflite::CreateOperatorCode(
-      _fbb,
-      _builtin_code,
-      _custom_code);
+  auto _custom_code =
+      _o->custom_code.empty() ? 0 : _fbb.CreateString(_o->custom_code);
+  return tflite::CreateOperatorCode(_fbb, _builtin_code, _custom_code);
 }
 
-inline OperatorT *Operator::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline OperatorT *Operator::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new OperatorT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void Operator::UnPackTo(OperatorT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void Operator::UnPackTo(
+    OperatorT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = opcode_index(); _o->opcode_index = _e; };
-  { auto _e = inputs(); if (_e) { _o->inputs.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->inputs[_i] = _e->Get(_i); } } };
-  { auto _e = outputs(); if (_e) { _o->outputs.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->outputs[_i] = _e->Get(_i); } } };
-  { auto _e = builtin_options_type(); _o->builtin_options.type = _e; };
-  { auto _e = builtin_options(); if (_e) _o->builtin_options.value = BuiltinOptionsUnion::UnPack(_e, builtin_options_type(), _resolver); };
-  { auto _e = custom_options(); if (_e) { _o->custom_options.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->custom_options[_i] = _e->Get(_i); } } };
-  { auto _e = custom_options_format(); _o->custom_options_format = _e; };
+  {
+    auto _e = opcode_index();
+    _o->opcode_index = _e;
+  };
+  {
+    auto _e = inputs();
+    if (_e) {
+      _o->inputs.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->inputs[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = outputs();
+    if (_e) {
+      _o->outputs.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->outputs[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = builtin_options_type();
+    _o->builtin_options.type = _e;
+  };
+  {
+    auto _e = builtin_options();
+    if (_e)
+      _o->builtin_options.value =
+          BuiltinOptionsUnion::UnPack(_e, builtin_options_type(), _resolver);
+  };
+  {
+    auto _e = custom_options();
+    if (_e) {
+      _o->custom_options.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->custom_options[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = custom_options_format();
+    _o->custom_options_format = _e;
+  };
 }
 
-inline flatbuffers::Offset<Operator> Operator::Pack(flatbuffers::FlatBufferBuilder &_fbb, const OperatorT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<Operator> Operator::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const OperatorT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateOperator(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<Operator> CreateOperator(flatbuffers::FlatBufferBuilder &_fbb, const OperatorT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<Operator> CreateOperator(
+    flatbuffers::FlatBufferBuilder &_fbb, const OperatorT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const OperatorT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const OperatorT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _opcode_index = _o->opcode_index;
   auto _inputs = _o->inputs.size() ? _fbb.CreateVector(_o->inputs) : 0;
   auto _outputs = _o->outputs.size() ? _fbb.CreateVector(_o->outputs) : 0;
   auto _builtin_options_type = _o->builtin_options.type;
   auto _builtin_options = _o->builtin_options.Pack(_fbb);
-  auto _custom_options = _o->custom_options.size() ? _fbb.CreateVector(_o->custom_options) : 0;
+  auto _custom_options =
+      _o->custom_options.size() ? _fbb.CreateVector(_o->custom_options) : 0;
   auto _custom_options_format = _o->custom_options_format;
-  return tflite::CreateOperator(
-      _fbb,
-      _opcode_index,
-      _inputs,
-      _outputs,
-      _builtin_options_type,
-      _builtin_options,
-      _custom_options,
-      _custom_options_format);
+  return tflite::CreateOperator(_fbb, _opcode_index, _inputs, _outputs,
+                                _builtin_options_type, _builtin_options,
+                                _custom_options, _custom_options_format);
 }
 
-inline SubGraphT *SubGraph::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline SubGraphT *SubGraph::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new SubGraphT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void SubGraph::UnPackTo(SubGraphT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void SubGraph::UnPackTo(
+    SubGraphT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = tensors(); if (_e) { _o->tensors.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->tensors[_i] = std::unique_ptr<TensorT>(_e->Get(_i)->UnPack(_resolver)); } } };
-  { auto _e = inputs(); if (_e) { _o->inputs.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->inputs[_i] = _e->Get(_i); } } };
-  { auto _e = outputs(); if (_e) { _o->outputs.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->outputs[_i] = _e->Get(_i); } } };
-  { auto _e = operators(); if (_e) { _o->operators.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->operators[_i] = std::unique_ptr<OperatorT>(_e->Get(_i)->UnPack(_resolver)); } } };
-  { auto _e = name(); if (_e) _o->name = _e->str(); };
+  {
+    auto _e = tensors();
+    if (_e) {
+      _o->tensors.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->tensors[_i] =
+            std::unique_ptr<TensorT>(_e->Get(_i)->UnPack(_resolver));
+      }
+    }
+  };
+  {
+    auto _e = inputs();
+    if (_e) {
+      _o->inputs.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->inputs[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = outputs();
+    if (_e) {
+      _o->outputs.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->outputs[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = operators();
+    if (_e) {
+      _o->operators.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->operators[_i] =
+            std::unique_ptr<OperatorT>(_e->Get(_i)->UnPack(_resolver));
+      }
+    }
+  };
+  {
+    auto _e = name();
+    if (_e) _o->name = _e->str();
+  };
 }
 
-inline flatbuffers::Offset<SubGraph> SubGraph::Pack(flatbuffers::FlatBufferBuilder &_fbb, const SubGraphT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<SubGraph> SubGraph::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const SubGraphT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateSubGraph(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<SubGraph> CreateSubGraph(flatbuffers::FlatBufferBuilder &_fbb, const SubGraphT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<SubGraph> CreateSubGraph(
+    flatbuffers::FlatBufferBuilder &_fbb, const SubGraphT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const SubGraphT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _tensors = _o->tensors.size() ? _fbb.CreateVector<flatbuffers::Offset<Tensor>> (_o->tensors.size(), [](size_t i, _VectorArgs *__va) { return CreateTensor(*__va->__fbb, __va->__o->tensors[i].get(), __va->__rehasher); }, &_va ) : 0;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const SubGraphT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _tensors =
+      _o->tensors.size()
+          ? _fbb.CreateVector<flatbuffers::Offset<Tensor>>(
+                _o->tensors.size(),
+                [](size_t i, _VectorArgs *__va) {
+                  return CreateTensor(*__va->__fbb, __va->__o->tensors[i].get(),
+                                      __va->__rehasher);
+                },
+                &_va)
+          : 0;
   auto _inputs = _o->inputs.size() ? _fbb.CreateVector(_o->inputs) : 0;
   auto _outputs = _o->outputs.size() ? _fbb.CreateVector(_o->outputs) : 0;
-  auto _operators = _o->operators.size() ? _fbb.CreateVector<flatbuffers::Offset<Operator>> (_o->operators.size(), [](size_t i, _VectorArgs *__va) { return CreateOperator(*__va->__fbb, __va->__o->operators[i].get(), __va->__rehasher); }, &_va ) : 0;
+  auto _operators = _o->operators.size()
+                        ? _fbb.CreateVector<flatbuffers::Offset<Operator>>(
+                              _o->operators.size(),
+                              [](size_t i, _VectorArgs *__va) {
+                                return CreateOperator(
+                                    *__va->__fbb, __va->__o->operators[i].get(),
+                                    __va->__rehasher);
+                              },
+                              &_va)
+                        : 0;
   auto _name = _o->name.empty() ? 0 : _fbb.CreateString(_o->name);
-  return tflite::CreateSubGraph(
-      _fbb,
-      _tensors,
-      _inputs,
-      _outputs,
-      _operators,
-      _name);
+  return tflite::CreateSubGraph(_fbb, _tensors, _inputs, _outputs, _operators,
+                                _name);
 }
 
-inline BufferT *Buffer::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline BufferT *Buffer::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new BufferT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void Buffer::UnPackTo(BufferT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void Buffer::UnPackTo(
+    BufferT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = data(); if (_e) { _o->data.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->data[_i] = _e->Get(_i); } } };
+  {
+    auto _e = data();
+    if (_e) {
+      _o->data.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->data[_i] = _e->Get(_i);
+      }
+    }
+  };
 }
 
-inline flatbuffers::Offset<Buffer> Buffer::Pack(flatbuffers::FlatBufferBuilder &_fbb, const BufferT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<Buffer> Buffer::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const BufferT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateBuffer(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<Buffer> CreateBuffer(flatbuffers::FlatBufferBuilder &_fbb, const BufferT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<Buffer> CreateBuffer(
+    flatbuffers::FlatBufferBuilder &_fbb, const BufferT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const BufferT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const BufferT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _data = _o->data.size() ? _fbb.CreateVector(_o->data) : 0;
-  return tflite::CreateBuffer(
-      _fbb,
-      _data);
+  return tflite::CreateBuffer(_fbb, _data);
 }
 
-inline ModelT *Model::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline ModelT *Model::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new ModelT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void Model::UnPackTo(ModelT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void Model::UnPackTo(
+    ModelT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = version(); _o->version = _e; };
-  { auto _e = operator_codes(); if (_e) { _o->operator_codes.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->operator_codes[_i] = std::unique_ptr<OperatorCodeT>(_e->Get(_i)->UnPack(_resolver)); } } };
-  { auto _e = subgraphs(); if (_e) { _o->subgraphs.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->subgraphs[_i] = std::unique_ptr<SubGraphT>(_e->Get(_i)->UnPack(_resolver)); } } };
-  { auto _e = description(); if (_e) _o->description = _e->str(); };
-  { auto _e = buffers(); if (_e) { _o->buffers.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->buffers[_i] = std::unique_ptr<BufferT>(_e->Get(_i)->UnPack(_resolver)); } } };
+  {
+    auto _e = version();
+    _o->version = _e;
+  };
+  {
+    auto _e = operator_codes();
+    if (_e) {
+      _o->operator_codes.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->operator_codes[_i] =
+            std::unique_ptr<OperatorCodeT>(_e->Get(_i)->UnPack(_resolver));
+      }
+    }
+  };
+  {
+    auto _e = subgraphs();
+    if (_e) {
+      _o->subgraphs.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->subgraphs[_i] =
+            std::unique_ptr<SubGraphT>(_e->Get(_i)->UnPack(_resolver));
+      }
+    }
+  };
+  {
+    auto _e = description();
+    if (_e) _o->description = _e->str();
+  };
+  {
+    auto _e = buffers();
+    if (_e) {
+      _o->buffers.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->buffers[_i] =
+            std::unique_ptr<BufferT>(_e->Get(_i)->UnPack(_resolver));
+      }
+    }
+  };
 }
 
-inline flatbuffers::Offset<Model> Model::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ModelT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<Model> Model::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const ModelT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateModel(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<Model> CreateModel(flatbuffers::FlatBufferBuilder &_fbb, const ModelT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<Model> CreateModel(
+    flatbuffers::FlatBufferBuilder &_fbb, const ModelT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const ModelT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const ModelT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _version = _o->version;
-  auto _operator_codes = _o->operator_codes.size() ? _fbb.CreateVector<flatbuffers::Offset<OperatorCode>> (_o->operator_codes.size(), [](size_t i, _VectorArgs *__va) { return CreateOperatorCode(*__va->__fbb, __va->__o->operator_codes[i].get(), __va->__rehasher); }, &_va ) : 0;
-  auto _subgraphs = _o->subgraphs.size() ? _fbb.CreateVector<flatbuffers::Offset<SubGraph>> (_o->subgraphs.size(), [](size_t i, _VectorArgs *__va) { return CreateSubGraph(*__va->__fbb, __va->__o->subgraphs[i].get(), __va->__rehasher); }, &_va ) : 0;
-  auto _description = _o->description.empty() ? 0 : _fbb.CreateString(_o->description);
-  auto _buffers = _o->buffers.size() ? _fbb.CreateVector<flatbuffers::Offset<Buffer>> (_o->buffers.size(), [](size_t i, _VectorArgs *__va) { return CreateBuffer(*__va->__fbb, __va->__o->buffers[i].get(), __va->__rehasher); }, &_va ) : 0;
-  return tflite::CreateModel(
-      _fbb,
-      _version,
-      _operator_codes,
-      _subgraphs,
-      _description,
-      _buffers);
-}
-
-inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type) {
+  auto _operator_codes =
+      _o->operator_codes.size()
+          ? _fbb.CreateVector<flatbuffers::Offset<OperatorCode>>(
+                _o->operator_codes.size(),
+                [](size_t i, _VectorArgs *__va) {
+                  return CreateOperatorCode(*__va->__fbb,
+                                            __va->__o->operator_codes[i].get(),
+                                            __va->__rehasher);
+                },
+                &_va)
+          : 0;
+  auto _subgraphs = _o->subgraphs.size()
+                        ? _fbb.CreateVector<flatbuffers::Offset<SubGraph>>(
+                              _o->subgraphs.size(),
+                              [](size_t i, _VectorArgs *__va) {
+                                return CreateSubGraph(
+                                    *__va->__fbb, __va->__o->subgraphs[i].get(),
+                                    __va->__rehasher);
+                              },
+                              &_va)
+                        : 0;
+  auto _description =
+      _o->description.empty() ? 0 : _fbb.CreateString(_o->description);
+  auto _buffers =
+      _o->buffers.size()
+          ? _fbb.CreateVector<flatbuffers::Offset<Buffer>>(
+                _o->buffers.size(),
+                [](size_t i, _VectorArgs *__va) {
+                  return CreateBuffer(*__va->__fbb, __va->__o->buffers[i].get(),
+                                      __va->__rehasher);
+                },
+                &_va)
+          : 0;
+  return tflite::CreateModel(_fbb, _version, _operator_codes, _subgraphs,
+                             _description, _buffers);
+}
+
+inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier,
+                                 const void *obj, BuiltinOptions type) {
   switch (type) {
     case BuiltinOptions_NONE: {
       return true;
@@ -4048,7 +5421,8 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       return verifier.VerifyTable(ptr);
     }
     case BuiltinOptions_LocalResponseNormalizationOptions: {
-      auto ptr = reinterpret_cast<const LocalResponseNormalizationOptions *>(obj);
+      auto ptr =
+          reinterpret_cast<const LocalResponseNormalizationOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
     case BuiltinOptions_LSTMOptions: {
@@ -4083,22 +5457,40 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const MulOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
-    default: return false;
+    case BuiltinOptions_PadOptions: {
+      auto ptr = reinterpret_cast<const PadOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_GatherOptions: {
+      auto ptr = reinterpret_cast<const GatherOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_BatchToSpaceNDOptions: {
+      auto ptr = reinterpret_cast<const BatchToSpaceNDOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    default:
+      return false;
   }
 }
 
-inline bool VerifyBuiltinOptionsVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector<flatbuffers::Offset<void>> *values, const flatbuffers::Vector<uint8_t> *types) {
+inline bool VerifyBuiltinOptionsVector(
+    flatbuffers::Verifier &verifier,
+    const flatbuffers::Vector<flatbuffers::Offset<void>> *values,
+    const flatbuffers::Vector<uint8_t> *types) {
   if (values->size() != types->size()) return false;
   for (flatbuffers::uoffset_t i = 0; i < values->size(); ++i) {
-    if (!VerifyBuiltinOptions(
-        verifier,  values->Get(i), types->GetEnum<BuiltinOptions>(i))) {
+    if (!VerifyBuiltinOptions(verifier, values->Get(i),
+                              types->GetEnum<BuiltinOptions>(i))) {
       return false;
     }
   }
   return true;
 }
 
-inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, const flatbuffers::resolver_function_t *resolver) {
+inline void *BuiltinOptionsUnion::UnPack(
+    const void *obj, BuiltinOptions type,
+    const flatbuffers::resolver_function_t *resolver) {
   switch (type) {
     case BuiltinOptions_Conv2DOptions: {
       auto ptr = reinterpret_cast<const Conv2DOptions *>(obj);
@@ -4149,7 +5541,8 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       return ptr->UnPack(resolver);
     }
     case BuiltinOptions_LocalResponseNormalizationOptions: {
-      auto ptr = reinterpret_cast<const LocalResponseNormalizationOptions *>(obj);
+      auto ptr =
+          reinterpret_cast<const LocalResponseNormalizationOptions *>(obj);
       return ptr->UnPack(resolver);
     }
     case BuiltinOptions_LSTMOptions: {
@@ -4184,11 +5577,26 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const MulOptions *>(obj);
       return ptr->UnPack(resolver);
     }
-    default: return nullptr;
+    case BuiltinOptions_PadOptions: {
+      auto ptr = reinterpret_cast<const PadOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_GatherOptions: {
+      auto ptr = reinterpret_cast<const GatherOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_BatchToSpaceNDOptions: {
+      auto ptr = reinterpret_cast<const BatchToSpaceNDOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    default:
+      return nullptr;
   }
 }
 
-inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBufferBuilder &_fbb, const flatbuffers::rehasher_function_t *_rehasher) const {
+inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const flatbuffers::rehasher_function_t *_rehasher) const {
   switch (type) {
     case BuiltinOptions_Conv2DOptions: {
       auto ptr = reinterpret_cast<const Conv2DOptionsT *>(value);
@@ -4239,8 +5647,10 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       return CreateL2NormOptions(_fbb, ptr, _rehasher).Union();
     }
     case BuiltinOptions_LocalResponseNormalizationOptions: {
-      auto ptr = reinterpret_cast<const LocalResponseNormalizationOptionsT *>(value);
-      return CreateLocalResponseNormalizationOptions(_fbb, ptr, _rehasher).Union();
+      auto ptr =
+          reinterpret_cast<const LocalResponseNormalizationOptionsT *>(value);
+      return CreateLocalResponseNormalizationOptions(_fbb, ptr, _rehasher)
+          .Union();
     }
     case BuiltinOptions_LSTMOptions: {
       auto ptr = reinterpret_cast<const LSTMOptionsT *>(value);
@@ -4274,26 +5684,44 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const MulOptionsT *>(value);
       return CreateMulOptions(_fbb, ptr, _rehasher).Union();
     }
-    default: return 0;
+    case BuiltinOptions_PadOptions: {
+      auto ptr = reinterpret_cast<const PadOptionsT *>(value);
+      return CreatePadOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_GatherOptions: {
+      auto ptr = reinterpret_cast<const GatherOptionsT *>(value);
+      return CreateGatherOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_BatchToSpaceNDOptions: {
+      auto ptr = reinterpret_cast<const BatchToSpaceNDOptionsT *>(value);
+      return CreateBatchToSpaceNDOptions(_fbb, ptr, _rehasher).Union();
+    }
+    default:
+      return 0;
   }
 }
 
-inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FLATBUFFERS_NOEXCEPT : type(u.type), value(nullptr) {
+inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u)
+    FLATBUFFERS_NOEXCEPT : type(u.type),
+                           value(nullptr) {
   switch (type) {
     case BuiltinOptions_Conv2DOptions: {
       value = new Conv2DOptionsT(*reinterpret_cast<Conv2DOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_DepthwiseConv2DOptions: {
-      value = new DepthwiseConv2DOptionsT(*reinterpret_cast<DepthwiseConv2DOptionsT *>(u.value));
+      value = new DepthwiseConv2DOptionsT(
+          *reinterpret_cast<DepthwiseConv2DOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_ConcatEmbeddingsOptions: {
-      value = new ConcatEmbeddingsOptionsT(*reinterpret_cast<ConcatEmbeddingsOptionsT *>(u.value));
+      value = new ConcatEmbeddingsOptionsT(
+          *reinterpret_cast<ConcatEmbeddingsOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_LSHProjectionOptions: {
-      value = new LSHProjectionOptionsT(*reinterpret_cast<LSHProjectionOptionsT *>(u.value));
+      value = new LSHProjectionOptionsT(
+          *reinterpret_cast<LSHProjectionOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_Pool2DOptions: {
@@ -4309,15 +5737,18 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       break;
     }
     case BuiltinOptions_FullyConnectedOptions: {
-      value = new FullyConnectedOptionsT(*reinterpret_cast<FullyConnectedOptionsT *>(u.value));
+      value = new FullyConnectedOptionsT(
+          *reinterpret_cast<FullyConnectedOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_SoftmaxOptions: {
-      value = new SoftmaxOptionsT(*reinterpret_cast<SoftmaxOptionsT *>(u.value));
+      value =
+          new SoftmaxOptionsT(*reinterpret_cast<SoftmaxOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_ConcatenationOptions: {
-      value = new ConcatenationOptionsT(*reinterpret_cast<ConcatenationOptionsT *>(u.value));
+      value = new ConcatenationOptionsT(
+          *reinterpret_cast<ConcatenationOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_AddOptions: {
@@ -4329,7 +5760,8 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       break;
     }
     case BuiltinOptions_LocalResponseNormalizationOptions: {
-      value = new LocalResponseNormalizationOptionsT(*reinterpret_cast<LocalResponseNormalizationOptionsT *>(u.value));
+      value = new LocalResponseNormalizationOptionsT(
+          *reinterpret_cast<LocalResponseNormalizationOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_LSTMOptions: {
@@ -4337,7 +5769,8 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       break;
     }
     case BuiltinOptions_ResizeBilinearOptions: {
-      value = new ResizeBilinearOptionsT(*reinterpret_cast<ResizeBilinearOptionsT *>(u.value));
+      value = new ResizeBilinearOptionsT(
+          *reinterpret_cast<ResizeBilinearOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_CallOptions: {
@@ -4345,25 +5778,42 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       break;
     }
     case BuiltinOptions_ReshapeOptions: {
-      value = new ReshapeOptionsT(*reinterpret_cast<ReshapeOptionsT *>(u.value));
+      value =
+          new ReshapeOptionsT(*reinterpret_cast<ReshapeOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_SkipGramOptions: {
-      value = new SkipGramOptionsT(*reinterpret_cast<SkipGramOptionsT *>(u.value));
+      value =
+          new SkipGramOptionsT(*reinterpret_cast<SkipGramOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_SpaceToDepthOptions: {
-      value = new SpaceToDepthOptionsT(*reinterpret_cast<SpaceToDepthOptionsT *>(u.value));
+      value = new SpaceToDepthOptionsT(
+          *reinterpret_cast<SpaceToDepthOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_EmbeddingLookupSparseOptions: {
-      value = new EmbeddingLookupSparseOptionsT(*reinterpret_cast<EmbeddingLookupSparseOptionsT *>(u.value));
+      value = new EmbeddingLookupSparseOptionsT(
+          *reinterpret_cast<EmbeddingLookupSparseOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_MulOptions: {
       value = new MulOptionsT(*reinterpret_cast<MulOptionsT *>(u.value));
       break;
     }
+    case BuiltinOptions_PadOptions: {
+      value = new PadOptionsT(*reinterpret_cast<PadOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_GatherOptions: {
+      value = new GatherOptionsT(*reinterpret_cast<GatherOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_BatchToSpaceNDOptions: {
+      value = new BatchToSpaceNDOptionsT(
+          *reinterpret_cast<BatchToSpaceNDOptionsT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -4476,7 +5926,23 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
-    default: break;
+    case BuiltinOptions_PadOptions: {
+      auto ptr = reinterpret_cast<PadOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_GatherOptions: {
+      auto ptr = reinterpret_cast<GatherOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_BatchToSpaceNDOptions: {
+      auto ptr = reinterpret_cast<BatchToSpaceNDOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    default:
+      break;
   }
   value = nullptr;
   type = BuiltinOptions_NONE;
@@ -4486,33 +5952,25 @@ inline const tflite::Model *GetModel(const void *buf) {
   return flatbuffers::GetRoot<tflite::Model>(buf);
 }
 
-inline const char *ModelIdentifier() {
-  return "TFL3";
-}
+inline const char *ModelIdentifier() { return "TFL3"; }
 
 inline bool ModelBufferHasIdentifier(const void *buf) {
-  return flatbuffers::BufferHasIdentifier(
-      buf, ModelIdentifier());
+  return flatbuffers::BufferHasIdentifier(buf, ModelIdentifier());
 }
 
-inline bool VerifyModelBuffer(
-    flatbuffers::Verifier &verifier) {
+inline bool VerifyModelBuffer(flatbuffers::Verifier &verifier) {
   return verifier.VerifyBuffer<tflite::Model>(ModelIdentifier());
 }
 
-inline const char *ModelExtension() {
-  return "tflite";
-}
+inline const char *ModelExtension() { return "tflite"; }
 
-inline void FinishModelBuffer(
-    flatbuffers::FlatBufferBuilder &fbb,
-    flatbuffers::Offset<tflite::Model> root) {
+inline void FinishModelBuffer(flatbuffers::FlatBufferBuilder &fbb,
+                              flatbuffers::Offset<tflite::Model> root) {
   fbb.Finish(root, ModelIdentifier());
 }
 
 inline std::unique_ptr<ModelT> UnPackModel(
-    const void *buf,
-    const flatbuffers::resolver_function_t *res = nullptr) {
+    const void *buf, const flatbuffers::resolver_function_t *res = nullptr) {
   return std::unique_ptr<ModelT>(GetModel(buf)->UnPack(res));
 }
 
diff --git a/tensorflow/contrib/lite/schema/upgrade_schema_test.py b/tensorflow/contrib/lite/schema/upgrade_schema_test.py
index 754400e88871ae911f1fd5ae2aa0429f0e23987f..b5002e6f7576b6de533046aaad37fe06746d3644 100644
--- a/tensorflow/contrib/lite/schema/upgrade_schema_test.py
+++ b/tensorflow/contrib/lite/schema/upgrade_schema_test.py
@@ -252,7 +252,7 @@ def JsonDumpAndFlush(data, fp):
 
 class TestSchemaUpgrade(test_util.TensorFlowTestCase):
 
-  def testNonExistantFile(self):
+  def testNonExistentFile(self):
     converter = upgrade_schema_lib.Converter()
     non_existent = tempfile.mktemp(suffix=".json")
     with self.assertRaisesRegexp(IOError, "No such file or directory"):
diff --git a/tensorflow/contrib/lite/simple_memory_arena_test.cc b/tensorflow/contrib/lite/simple_memory_arena_test.cc
index ac676092c6d5d8982b65cd35c2b9770d10ea37b2..4444f642eb75c563c57762d095e454ac63d836c6 100644
--- a/tensorflow/contrib/lite/simple_memory_arena_test.cc
+++ b/tensorflow/contrib/lite/simple_memory_arena_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/testing/util.h"
 
 namespace tflite {
 namespace {
@@ -85,7 +86,7 @@ TEST(SimpleMemoryArenaTest, TestAfterClear) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/string.h b/tensorflow/contrib/lite/string.h
index ecd6f04ec2ac91ee2ae9b3c30c524686bf61cc90..7f8f4e851ee69aa86b7f3eaec6383e17fa6a734c 100644
--- a/tensorflow/contrib/lite/string.h
+++ b/tensorflow/contrib/lite/string.h
@@ -17,11 +17,10 @@ limitations under the License.
 #define _THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_STRING_H_
 
 #include <string>
-#include "tensorflow/core/platform/platform.h"
 
 namespace tflite {
 
-#ifndef PLATFORM_GOOGLE
+#ifndef HAS_GLOBAL_STRING
 using std::string;
 #endif
 
diff --git a/tensorflow/contrib/lite/string_util_test.cc b/tensorflow/contrib/lite/string_util_test.cc
index 5c351638dc2fad0e64fda6d3a9cb14dfc45375af..d53fec7512f902fb277524100640f4a6a2aaf130 100644
--- a/tensorflow/contrib/lite/string_util_test.cc
+++ b/tensorflow/contrib/lite/string_util_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/testing/util.h"
 
 namespace tflite {
 
@@ -111,7 +112,7 @@ TEST(StringUtil, TestEmptyList) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
index ecddb4b807bf1dddec10adfcbab6db6cca85247a..96800304e56a2517b7c1222f47d0207725edc15f 100644
--- a/tensorflow/contrib/lite/testing/BUILD
+++ b/tensorflow/contrib/lite/testing/BUILD
@@ -18,6 +18,7 @@ gen_zipped_test_files(
     files = [
         "add.zip",
         "avg_pool.zip",
+        "batch_to_space_nd.zip",
         "concat.zip",
         "constant.zip",
         "control_dep.zip",
@@ -25,12 +26,14 @@ gen_zipped_test_files(
         "depthwiseconv.zip",
         "fully_connected.zip",
         "fused_batch_norm.zip",
+        "gather.zip",
         "global_batch_norm.zip",
         "l2_pool.zip",
         "l2norm.zip",
         "local_response_norm.zip",
         "max_pool.zip",
         "mul.zip",
+        "pad.zip",
         "relu.zip",
         "relu1.zip",
         "relu6.zip",
@@ -160,6 +163,12 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "util",
+    testonly = 1,
+    hdrs = ["util.h"],
+)
+
 cc_test(
     name = "test_runner_test",
     srcs = ["test_runner_test.cc"],
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index b122818221e81e6898dc92f8f8d336f7fc924b75..02f59438cdb08180ded1098181613067ec4c7c67 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -40,6 +40,7 @@ from six import StringIO
 # TODO(aselle): Disable GPU for now
 os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
 
+# pylint: disable=g-import-not-at-top
 import tensorflow as tf
 from google.protobuf import text_format
 # TODO(aselle): switch to TensorFlow's resource_loader
@@ -93,6 +94,12 @@ KNOWN_BUGS = {
     r"softmax.*input_shape=\[1,3,4,3\]": "67749831",
     # SpaceToDepth only supports float32.
     r"space_to_depth.*(float16|int32|uint8|int64)": "68018134",
+    # Gather doesn't support int64 indices.
+    r"gather.*indices_dtype=int64": "XXXX",
+    # BatchToSpaceND doesn't support cropping.
+    r"batch_to_space_nd.*crops=\[\[1,1\],\[1,1\]\]": "70594634",
+    # BatchToSpaceND only supports 4D tensors.
+    r"batch_to_space_nd.*input_shape=\[8,2,2,2,1,1\]": "70594733",
 }
 
 
@@ -119,7 +126,7 @@ def toco_options(data_types,
   # to change
   if data_types[0] == "QUANTIZED_UINT8":
     inference_type = "QUANTIZED_UINT8"
-  s = (" --input_types=%s" % ",".join(data_types) +
+  s = (" --input_data_types=%s" % ",".join(data_types) +
        " --inference_type=%s" % inference_type +
        " --input_format=TENSORFLOW_GRAPHDEF" + " --output_format=TFLITE" +
        " --input_arrays=%s" % ",".join(input_arrays) +
@@ -383,7 +390,7 @@ def make_zip_of_tests(zip_path,
         report["toco_log"] = ""
         tf.reset_default_graph()
 
-        with tf.device('/cpu:0'):
+        with tf.device("/cpu:0"):
           try:
             inputs, outputs = make_graph(param_dict_real)
           except (tf.errors.UnimplementedError, tf.errors.InvalidArgumentError,
@@ -703,6 +710,46 @@ def make_mul_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_gather_tests(zip_path):
+  """Make a set of tests to do gather."""
+
+  test_parameters = [{
+      # TODO(mgubin): add string tests when they are supported by Toco.
+      # TODO(mgubin): add tests for Nd indices when they are supported by
+      # TfLite.
+      # TODO(mgubin): add tests for axis != 0 when it is supported by TfLite.
+      "params_dtype": [tf.float32, tf.int32],
+      "params_shape": [[10], [1, 2, 20]],
+      "indices_dtype": [tf.int32],
+      "indices_shape": [[3], [5]],
+      "axis": [0],  # axis!=0 is GatherV2
+  }]
+
+  def build_graph(parameters):
+    """Build the gather op testing graph."""
+    params = tf.placeholder(
+        dtype=parameters["params_dtype"],
+        name="params",
+        shape=parameters["params_shape"])
+    indices = tf.placeholder(
+        dtype=parameters["indices_dtype"],
+        name="indices",
+        shape=parameters["indices_shape"])
+    out = tf.gather(params, indices, axis=parameters["axis"])
+    return [params, indices], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    params = create_tensor_data(parameters["params_dtype"],
+                                parameters["params_shape"])
+    indices = create_tensor_data(parameters["indices_dtype"],
+                                 parameters["indices_shape"], 0,
+                                 parameters["params_shape"][0] - 1)
+    return [params, indices], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [params, indices])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_global_batch_norm_tests(zip_path):
   """Make a set of tests to do batch_norm_with_global_normalization."""
 
@@ -998,6 +1045,37 @@ def make_local_response_norm_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_pad_tests(zip_path):
+  """Make a set of tests to do pad."""
+
+  test_parameters = [{
+      "dtype": [tf.int32, tf.float32],
+      "input_shape": [[1, 1, 2, 1], [2, 1, 1, 1]],
+      "paddings": [[[0, 0], [0, 1], [2, 3], [0, 0]], [[0, 1], [0, 0], [0, 0],
+                                                      [2, 3]]],
+  }, {
+      "dtype": [tf.int32, tf.float32],
+      "input_shape": [[1, 2], [0, 1, 2]],
+      "paddings": [[[0, 1], [2, 3]]],
+  }]
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(
+        dtype=parameters["dtype"],
+        name="input",
+        shape=parameters["input_shape"])
+    out = tf.pad(input_tensor, paddings=parameters["paddings"])
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = create_tensor_data(parameters["dtype"],
+                                      parameters["input_shape"])
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_reshape_tests(zip_path):
   """Make a set of tests to do reshape."""
 
@@ -1124,6 +1202,43 @@ def make_space_to_depth_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_batch_to_space_nd_tests(zip_path):
+  """Make a set of tests to do batch_to_space_nd."""
+
+  test_parameters = [
+      {
+          "dtype": [tf.float32, tf.int64, tf.int32],
+          "input_shape": [[12, 2, 2, 1]],
+          "block_shape": [[1, 4], [2, 2], [3, 4]],
+          "crops": [[[0, 0], [0, 0]], [[1, 1], [1, 1]]],
+      },
+      # Non-4D use case: 1 bath dimension, 3 spatial dimensions, 2 others.
+      {
+          "dtype": [tf.float32],
+          "input_shape": [[8, 2, 2, 2, 1, 1]],
+          "block_shape": [[2, 2, 2]],
+          "crops": [[[0, 0], [0, 0], [0, 0]]],
+      },
+  ]
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(
+        dtype=parameters["dtype"],
+        name="input",
+        shape=parameters["input_shape"])
+    out = tf.batch_to_space_nd(input_tensor, parameters["block_shape"],
+                               parameters["crops"])
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = create_tensor_data(parameters["dtype"],
+                                      parameters["input_shape"])
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_l2_pool(input_tensor, ksize, strides, padding, data_format):
   """Given an input perform a sequence of TensorFlow ops to produce l2pool."""
   return tf.sqrt(tf.nn.avg_pool(
@@ -1152,12 +1267,14 @@ def main(unused_args):
     dispatch = {
         "control_dep.zip": make_control_dep_tests,
         "add.zip": make_add_tests,
+        "batch_to_space_nd.zip": make_batch_to_space_nd_tests,
         "conv.zip": make_conv_tests,
         "constant.zip": make_constant_tests,
         "depthwiseconv.zip": make_depthwiseconv_tests,
         "concat.zip": make_concatenation_tests,
         "fully_connected.zip": make_fully_connected_tests,
         "global_batch_norm.zip": make_global_batch_norm_tests,
+        "gather.zip": make_gather_tests,
         "fused_batch_norm.zip": make_fused_batch_norm_tests,
         "l2norm.zip": make_l2norm_tests,
         "local_response_norm.zip": make_local_response_norm_tests,
@@ -1168,6 +1285,7 @@ def main(unused_args):
         "l2_pool.zip": make_pool_tests(make_l2_pool),
         "avg_pool.zip": make_pool_tests(tf.nn.avg_pool),
         "max_pool.zip": make_pool_tests(tf.nn.max_pool),
+        "pad.zip": make_pad_tests,
         "reshape.zip": make_reshape_tests,
         "resize_bilinear.zip": make_resize_bilinear_tests,
         "sigmoid.zip": make_sigmoid_tests,
diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
index e7df97ee54cc631c29a3a6f63a85894236f08157..4c05979e245e48ac9f389ca7a430ec761ef6764e 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -59,6 +59,11 @@ std::map<string, string> kBrokenTests = {
     // more than 1 element.
     {R"(constant.*input_shape=\[(2|2,2,2,2)\])", "68721522"},
 
+    // Pad only supports 4D float32 tensors.
+    {R"(paddtype=.*,input_shape=\[.,.\],paddings=\[\[.,.\],\[.,.\]\])",
+     "70527055"},
+    {R"(padd.*int32)", "70527055"},
+
     // L2Norm only supports 4D tensors.
     {R"(l2normdim=.*,epsilon=.*,input_shape=\[.,.\])", "67963684"},
     {R"(l2normdim=.*,epsilon=.*,input_shape=\[.,.,.,.,.*\])", "67963684"},
@@ -96,13 +101,13 @@ class ZipEnvironment : public ::testing::Environment {
   }
 
   // Unzip `zip` file into a new temporary directory  `out_dir`.
-  tensorflow::Status UnZip(const std::string& zip, std::string* out_dir) {
+  tensorflow::Status UnZip(const string& zip, string* out_dir) {
     string dir;
     TF_CHECK_OK(MakeTemporaryDirectory(&dir));
     tensorflow::SubProcess proc;
-    std::string unzip_binary =
+    string unzip_binary =
         "/usr/bin/unzip";
-    proc.SetProgram(unzip_binary, {"unzip", "-d", dir, zip.c_str()});
+    proc.SetProgram(unzip_binary, {"unzip", "-d", dir, zip});
     proc.SetChannelAction(tensorflow::CHAN_STDOUT, tensorflow::ACTION_PIPE);
     proc.SetChannelAction(tensorflow::CHAN_STDERR, tensorflow::ACTION_PIPE);
     if (!proc.Start())
@@ -144,48 +149,48 @@ ZipEnvironment* zip_environment() {
 // the temporary directory where the zip file has been unarchived and
 // `test_paths` is the list of test prefixes that were in the manifest.
 // Note, it is an error for a manifest to contain no tests.
-tensorflow::Status ReadManifest(const std::string& original_file,
-                                const std::string& dir,
-                                std::vector<std::string>* test_paths) {
+tensorflow::Status ReadManifest(const string& original_file, const string& dir,
+                                std::vector<string>* test_paths) {
   // Read the newline delimited list of entries in the manifest.
   std::ifstream manifest_fp(dir + "/manifest.txt");
-  std::string manifest((std::istreambuf_iterator<char>(manifest_fp)),
-                       std::istreambuf_iterator<char>());
+  string manifest((std::istreambuf_iterator<char>(manifest_fp)),
+                  std::istreambuf_iterator<char>());
   size_t pos = 0;
   int added = 0;
   while (true) {
     size_t end_pos = manifest.find("\n", pos);
-    if (end_pos == std::string::npos) break;
-    std::string filename = manifest.substr(pos, end_pos - pos);
+    if (end_pos == string::npos) break;
+    string filename = manifest.substr(pos, end_pos - pos);
     test_paths->push_back(dir + "/" + filename);
     pos = end_pos + 1;
     added += 1;
   }
   if (!added) {
-    std::string message = "Test had no examples: " + original_file;
+    string message = "Test had no examples: " + original_file;
     return tensorflow::Status(tensorflow::error::UNKNOWN, message.c_str());
   }
   return tensorflow::Status::OK();
 }
 
 // Get a list of tests from a zip file `zip_file_name`.
-std::vector<std::string> UnarchiveZipAndFindTestNames(
-    const std::string& zip_file_name) {
-  std::string zip_file = ::tensorflow::testing::TensorFlowSrcRoot() +
-                         "/contrib/lite/testing/optest/" + zip_file_name;
-  std::string decompress_tmp_dir;
+std::vector<string> UnarchiveZipAndFindTestNames(const string& zip_file_name) {
+  string zip_file = ::tensorflow::testing::TensorFlowSrcRoot() +
+                    "/contrib/lite/testing/optest/" + zip_file_name;
+  string decompress_tmp_dir;
   TF_CHECK_OK(zip_environment()->UnZip(zip_file, &decompress_tmp_dir));
-  std::vector<std::string> stuff;
+  std::vector<string> stuff;
   TF_CHECK_OK(ReadManifest(zip_file, decompress_tmp_dir, &stuff));
   return stuff;
 }
 
-class OpsTest : public ::testing::TestWithParam<std::string> {};
+class OpsTest : public ::testing::TestWithParam<string> {};
 
 TEST_P(OpsTest, RunStuff) {
-  std::string test_path = GetParam();
-  std::string tflite_file = test_path + ".bin";
-  std::string tflite_examples = test_path + ".inputs";
+  string test_path = GetParam();
+  string tflite_file = test_path + ".bin";
+  string tflite_examples = test_path + ".inputs";
+  string test_name = test_path.substr(test_path.find_last_of('/'));
+
   auto model = tflite::FlatBufferModel::BuildFromFile(tflite_file.c_str());
   std::unique_ptr<tflite::Interpreter> interpreter;
 
@@ -199,7 +204,7 @@ TEST_P(OpsTest, RunStuff) {
 
   string bug_number;
   for (const auto& p : kBrokenTests) {
-    if (RE2::PartialMatch(test_path, p.first)) {
+    if (RE2::PartialMatch(test_name, p.first)) {
       bug_number = p.second;
     }
   }
@@ -218,7 +223,7 @@ TEST_P(OpsTest, RunStuff) {
     } else {
       if (FLAGS_ignore_known_bugs) {
         ASSERT_EQ(result, kTfLiteError)
-            << "Not failing as expected dut to http://b/" << bug_number;
+            << "Not failing as expected due to http://b/" << bug_number;
       } else {
         ASSERT_EQ(result, kTfLiteOk)
             << "Possibly due to http://b/" << bug_number;
@@ -236,6 +241,7 @@ TEST_P(OpsTest, RunStuff) {
 
 INSTANTIATE_TESTS(add)
 INSTANTIATE_TESTS(avg_pool)
+INSTANTIATE_TESTS(batch_to_space_nd)
 INSTANTIATE_TESTS(concat)
 INSTANTIATE_TESTS(constant)
 INSTANTIATE_TESTS(control_dep)
@@ -243,12 +249,14 @@ INSTANTIATE_TESTS(conv)
 INSTANTIATE_TESTS(depthwiseconv)
 INSTANTIATE_TESTS(fully_connected)
 INSTANTIATE_TESTS(fused_batch_norm)
+INSTANTIATE_TESTS(gather)
 INSTANTIATE_TESTS(global_batch_norm)
 INSTANTIATE_TESTS(l2norm)
 INSTANTIATE_TESTS(l2_pool)
 INSTANTIATE_TESTS(local_response_norm)
 INSTANTIATE_TESTS(max_pool)
 INSTANTIATE_TESTS(mul)
+INSTANTIATE_TESTS(pad)
 INSTANTIATE_TESTS(relu)
 INSTANTIATE_TESTS(relu1)
 INSTANTIATE_TESTS(relu6)
diff --git a/tensorflow/contrib/lite/testing/util.h b/tensorflow/contrib/lite/testing/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..4d4304f022187027950f58050ececae73dedffb6
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/util.h
@@ -0,0 +1,28 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_UTIL_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_UTIL_H_
+
+namespace tflite {
+
+inline void LogToStderr() {
+#ifdef PLATFORM_GOOGLE
+  FLAGS_logtostderr = true;
+#endif
+}
+
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_UTIL_H_
diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index 0bf8d067a3f21a01fc1b384bba2a1703f9367733..7556a402f9bc0eeadfb6af21207a1ce3cbb83d28 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -159,21 +159,17 @@ cc_library(
         "toco_types.h",
     ],
     deps = [
+        # Placeholder for internal file dependency.
         "//tensorflow/core:framework_lite",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-    ] + select({
-        "//tensorflow:android": [],
-        "//tensorflow:darwin": [],
-        "//tensorflow:ios": [],
-        "//conditions:default": [],
-        "//tensorflow:dummy_disabled_internal": [],
-    }),
+    ],
 )
 
 cc_library(
     name = "graph_transformations",
     srcs = [
+        "graph_transformations/convert_expanddims_to_reshape.cc",
         "graph_transformations/convert_pure_conv_to_depthwise.cc",
         "graph_transformations/create_im2col_arrays.cc",
         "graph_transformations/dequantize.cc",
@@ -206,6 +202,7 @@ cc_library(
         "graph_transformations/remove_trivial_reshape.cc",
         "graph_transformations/remove_unused_op.cc",
         "graph_transformations/resolve_batch_normalization.cc",
+        "graph_transformations/resolve_batch_to_space_nd_attributes.cc",
         "graph_transformations/resolve_constant_binary.cc",
         "graph_transformations/resolve_constant_concatenation.cc",
         "graph_transformations/resolve_constant_fake_quant.cc",
diff --git a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
index 2f4454d7c849c49c853e1379cbdd8241062ba348..62e7282d16aa9aa02d6ebe131ead569282518753 100644
--- a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
+++ b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
@@ -218,7 +218,8 @@ void AllocateTransientArrays(Model* model,
   // just guard this assumption with a CHECK:
   bool batchless_input_shapes = true;
   for (const auto& input_array : model->flags.input_arrays()) {
-    if (input_array.shape().empty() || input_array.shape(0) != 1) {
+    if (!input_array.has_shape() || input_array.shape().dims().empty() ||
+        input_array.shape().dims(0) != 1) {
       batchless_input_shapes = false;
       break;
     }
diff --git a/tensorflow/contrib/lite/toco/args.h b/tensorflow/contrib/lite/toco/args.h
index 28661d4ff0d0b34370374d79f4b7f019b2b0d1c8..a2f80fae9b2706d77652c89901430e7f77b90814 100644
--- a/tensorflow/contrib/lite/toco/args.h
+++ b/tensorflow/contrib/lite/toco/args.h
@@ -191,16 +191,20 @@ struct ParsedModelFlags {
   Arg<string> mean_values;
   Arg<float> std_value = Arg<float>(1.f);
   Arg<string> std_values;
+  Arg<string> input_data_type;
+  Arg<string> input_data_types;
   Arg<bool> variable_batch = Arg<bool>(false);
-  Arg<bool> drop_control_dependency = Arg<bool>(false);
   Arg<toco::IntList> input_shape;
   Arg<toco::StringMapList> rnn_states;
   Arg<toco::StringMapList> model_checks;
-  // Debugging output options
+  // Debugging output options.
+  // TODO(benoitjacob): these shouldn't be ModelFlags.
   Arg<string> graphviz_first_array;
   Arg<string> graphviz_last_array;
   Arg<string> dump_graphviz;
   Arg<bool> dump_graphviz_video = Arg<bool>(false);
+  Arg<bool> allow_nonexistent_arrays = Arg<bool>(false);
+  Arg<bool> allow_nonascii_arrays = Arg<bool>(false);
 };
 
 // Flags that describe the operation you would like to do (what conversion
@@ -213,12 +217,15 @@ struct ParsedTocoFlags {
   // TODO(aselle): command_line_flags  doesn't support doubles
   Arg<float> default_ranges_min = Arg<float>(0.);
   Arg<float> default_ranges_max = Arg<float>(0.);
-  Arg<string> input_type;
-  Arg<string> input_types;
   Arg<string> inference_type;
+  Arg<string> inference_input_type;
   Arg<bool> drop_fake_quant = Arg<bool>(false);
   Arg<bool> reorder_across_fake_quant = Arg<bool>(false);
   Arg<bool> allow_custom_ops = Arg<bool>(false);
+  // Deprecated flags
+  Arg<string> input_type;
+  Arg<string> input_types;
+  Arg<bool> drop_control_dependency = Arg<bool>(false);
 };
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/dump_graphviz.cc b/tensorflow/contrib/lite/toco/dump_graphviz.cc
index f5e2868dc05306d9f08d585e54900a3f873e6079..47d5e96825c47844a27f6f9005c5621f2fdb46d2 100644
--- a/tensorflow/contrib/lite/toco/dump_graphviz.cc
+++ b/tensorflow/contrib/lite/toco/dump_graphviz.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/strings/str_replace.h"
+#include "absl/strings/strip.h"
 #include "tensorflow/contrib/lite/toco/model_flags.pb.h"
 #include "tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h"
 #include "tensorflow/contrib/lite/toco/toco_port.h"
@@ -105,6 +106,51 @@ Color GetColorForArray(const Model& model, const string& array_name) {
   return Color(0xF5, 0xF5, 0xF5);
 }
 
+bool ArrayIsScalarFloat(Model const* model, const std::string& name,
+                        float* val) {
+  const auto& op_array = model->GetArray(name);
+  if (!op_array.buffer || op_array.buffer->type != ArrayDataType::kFloat ||
+      RequiredBufferSizeForShape(op_array.shape()) != 1) {
+    return false;
+  }
+  const auto& data = op_array.GetBuffer<ArrayDataType::kFloat>().data;
+  if (data.empty()) {
+    return false;
+  }
+  *val = data[0];
+  return true;
+}
+
+bool ArrayIsScalarInt(Model const* model, const std::string& name, int* val) {
+  const auto& op_array = model->GetArray(name);
+  if (!op_array.buffer || RequiredBufferSizeForShape(op_array.shape()) != 1) {
+    return false;
+  }
+
+  if (op_array.buffer->type == ArrayDataType::kUint8) {
+    const auto& data = op_array.GetBuffer<ArrayDataType::kUint8>().data;
+    if (data.empty()) {
+      return false;
+    }
+    *val = data[0];
+  } else if (op_array.buffer->type == ArrayDataType::kInt32) {
+    const auto& data = op_array.GetBuffer<ArrayDataType::kInt32>().data;
+    if (data.empty()) {
+      return false;
+    }
+    *val = data[0];
+  } else if (op_array.buffer->type == ArrayDataType::kInt64) {
+    const auto& data = op_array.GetBuffer<ArrayDataType::kInt64>().data;
+    if (data.empty()) {
+      return false;
+    }
+    *val = data[0];
+  } else {
+    return false;
+  }
+  return true;
+}
+
 NodeProperties GetPropertiesForArray(const Model& model,
                                      const string& array_name) {
   NodeProperties node_properties;
@@ -129,12 +175,20 @@ NodeProperties GetPropertiesForArray(const Model& model,
       if (id == 0) {
         AppendF(&node_properties.label, "%d", array_shape.dims(id));
       } else {
-        AppendF(&node_properties.label, "x%d", array_shape.dims(id));
+        // 00D7 is multiplication symbol in unicode
+        AppendF(&node_properties.label, "\u00D7%d", array_shape.dims(id));
       }
     }
     node_properties.label += "]";
+    float flt_val;
+    if (ArrayIsScalarFloat(&model, array_name, &flt_val)) {
+      AppendF(&node_properties.label, " = %.3f", flt_val);
+    }
+    int int_val;
+    if (ArrayIsScalarInt(&model, array_name, &int_val)) {
+      AppendF(&node_properties.label, " = %d", int_val);
+    }
   }
-
   if (array.minmax) {
     AppendF(&node_properties.label, "\\nMinMax: [%.3g, %.3g]",
             array.minmax->min, array.minmax->max);
@@ -160,7 +214,21 @@ NodeProperties GetPropertiesForOperator(const Operator& op) {
     node_properties.label =
         static_cast<const TensorFlowUnsupportedOperator&>(op).tensorflow_op;
   } else {
-    node_properties.label = OperatorTypeName(op.type);
+    node_properties.label =
+        string(absl::StripPrefix(OperatorTypeName(op.type), "TensorFlow"));
+  }
+  switch (op.fused_activation_function) {
+    case FusedActivationFunctionType::kRelu:
+      AppendF(&node_properties.label, "\\nReLU");
+      break;
+    case FusedActivationFunctionType::kRelu6:
+      AppendF(&node_properties.label, "\\nReLU6");
+      break;
+    case FusedActivationFunctionType::kRelu1:
+      AppendF(&node_properties.label, "\\nReLU1");
+      break;
+    default:
+      break;
   }
   // Additional information for some of the operators.
   switch (op.type) {
@@ -259,6 +327,10 @@ void DumpGraphviz(const Model& model, string* output_file_contents) {
             op_properties.color.TextColorString().c_str());
     // Add nodes and edges for all inputs of the operator.
     for (const auto& input : op.inputs) {
+      if (model.arrays.count(input) == 0) {
+        // Arrays should _always_ exist. Except, perhaps, during development.
+        continue;
+      }
       auto array_properties = GetPropertiesForArray(model, input);
       if (!already_added_arrays.count(input)) {
         AppendF(output_file_contents, kNodeFormat, input,
@@ -271,6 +343,10 @@ void DumpGraphviz(const Model& model, string* output_file_contents) {
     }
     // Add nodes and edges for all outputs of the operator.
     for (const auto& output : op.outputs) {
+      if (model.arrays.count(output) == 0) {
+        // Arrays should _always_ exist. Except, perhaps, during development.
+        continue;
+      }
       auto array_properties = GetPropertiesForArray(model, output);
       if (!already_added_arrays.count(output)) {
         AppendF(output_file_contents, kNodeFormat, output,
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index 16b9fa226055dde80e4d89e46ec775f59392333e..51d76e44a03fdf75248ce3b73998f52a00e723e4 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -35,8 +35,11 @@ limitations under the License.
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/platform/logging.h"
 
+using tensorflow::DT_BOOL;
 using tensorflow::DT_FLOAT;
 using tensorflow::DT_INT32;
+using tensorflow::DT_INT64;
+using tensorflow::DT_UINT8;
 using tensorflow::GraphDef;
 using tensorflow::TensorProto;
 
@@ -777,13 +780,12 @@ void ConvertConcatenationOperator(const Model& model,
   auto* dc_op = tensorflow_graph->add_node();
   dc_op->set_op("ConcatV2");
   dc_op->set_name(src_op.outputs[0]);
-  const string dummy_concat_dim = src_op.outputs[0] + "/concat_dim";
-  CreateDummyConcatDimTensorConst(dummy_concat_dim, src_op.concat_dim,
-                                  tensorflow_graph);
+  const string dummy_axis = src_op.outputs[0] + "/axis";
+  CreateDummyConcatDimTensorConst(dummy_axis, src_op.axis, tensorflow_graph);
   for (const auto& input : src_op.inputs) {
     *dc_op->add_input() = input;
   }
-  *dc_op->add_input() = dummy_concat_dim;
+  *dc_op->add_input() = dummy_axis;
   (*dc_op->mutable_attr())["T"].set_type(DT_FLOAT);
   (*dc_op->mutable_attr())["Tidx"].set_type(DT_INT32);
   (*dc_op->mutable_attr())["N"].set_i(src_op.inputs.size());
@@ -897,13 +899,15 @@ tensorflow::DataType GetTensorFlowDataType(const Model& model,
                                            const string& array_name) {
   auto& dtype = model.GetArray(array_name).data_type;
   CHECK(dtype == ArrayDataType::kFloat || dtype == ArrayDataType::kInt32 ||
-        dtype == ArrayDataType::kUint8);
+        dtype == ArrayDataType::kUint8 || dtype == ArrayDataType::kInt64);
   if (dtype == ArrayDataType::kFloat) {
     return tensorflow::DT_FLOAT;
   } else if (dtype == ArrayDataType::kInt32) {
     return tensorflow::DT_INT32;
   } else if (dtype == ArrayDataType::kUint8) {
     return tensorflow::DT_UINT8;
+  } else if (dtype == ArrayDataType::kInt64) {
+    return tensorflow::DT_INT64;
   } else {
     LOG(FATAL) << "Wrong data type";
   }
@@ -947,6 +951,22 @@ void ConvertGatherOperator(const Model& model, const GatherOperator& src_op,
   (*gather_op->mutable_attr())["Tparams"].set_type(params_type);
 }
 
+void ConvertArgMaxOperator(const Model& model, const ArgMaxOperator& src_op,
+                           GraphDef* tensorflow_graph) {
+  auto* argmax_op = tensorflow_graph->add_node();
+  argmax_op->set_op("ArgMax");
+  argmax_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 2);
+  *argmax_op->add_input() = src_op.inputs[0];
+  *argmax_op->add_input() = src_op.inputs[1];
+  (*argmax_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[0]));
+  (*argmax_op->mutable_attr())["Tidx"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[1]));
+  (*argmax_op->mutable_attr())["output_type"].set_type(
+      GetTensorFlowDataType(model, src_op.outputs[0]));
+}
+
 void ConvertResizeBilinearOperator(const Model& model,
                                    const ResizeBilinearOperator& src_op,
                                    GraphDef* tensorflow_graph) {
@@ -990,22 +1010,21 @@ void ConvertLstmCellOperator(const Model& model, const LstmCellOperator& src_op,
   const string concat_output = base + "basic_lstm_cell/concat";
   // Op names have been chosen to match the tf.slim LSTM naming
   // as closely as possible.
-  const int concat_dim =
+  const int axis =
       model.arrays.at(src_op.inputs[LstmCellOperator::PREV_ACTIV_INPUT])
           ->shape()
           .dimensions_count() -
       1;
   // Note that DATA_INPUT may have extra size 1 dimensions, but TF concat
   // works the same since the tensor has the same underlying data layout.
-  const string concat_dim_output = concat_output + "/concat_dim";
-  CreateDummyConcatDimTensorConst(concat_dim_output, concat_dim,
-                                  tensorflow_graph);
+  const string axis_output = concat_output + "/axis";
+  CreateDummyConcatDimTensorConst(axis_output, axis, tensorflow_graph);
   auto* concat_op = tensorflow_graph->add_node();
   concat_op->set_op("ConcatV2");
   concat_op->set_name(concat_output);
   *concat_op->add_input() = src_op.inputs[LstmCellOperator::DATA_INPUT];
   *concat_op->add_input() = src_op.inputs[LstmCellOperator::PREV_ACTIV_INPUT];
-  *concat_op->add_input() = concat_dim_output;
+  *concat_op->add_input() = axis_output;
   (*concat_op->mutable_attr())["T"].set_type(DT_FLOAT);
   (*concat_op->mutable_attr())["Tidx"].set_type(DT_INT32);
   (*concat_op->mutable_attr())["N"].set_i(2);  // Number of inputs
@@ -1066,8 +1085,7 @@ void ConvertLstmCellOperator(const Model& model, const LstmCellOperator& src_op,
   // Split
   string split_dim_output = base + "split/split_dim";
   // The dimension is the same as the concatenation dimension
-  CreateDummyConcatDimTensorConst(split_dim_output, concat_dim,
-                                  tensorflow_graph);
+  CreateDummyConcatDimTensorConst(split_dim_output, axis, tensorflow_graph);
   string split_output = base + "split";
   auto* split_op = tensorflow_graph->add_node();
   split_op->set_op("Split");
@@ -1283,6 +1301,10 @@ void ConvertMeanOperator(const Model& model, const MeanOperator& src_op,
   const auto params_type = GetTensorFlowDataType(model, src_op.inputs[0]);
   (*new_op->mutable_attr())["T"].set_type(params_type);
 
+  if (src_op.keep_dims) {
+    (*new_op->mutable_attr())["keep_dims"].set_b(true);
+  }
+
   // Create the params tensor.
   auto* params_op = tensorflow_graph->add_node();
   params_op->set_op("Const");
@@ -1291,11 +1313,11 @@ void ConvertMeanOperator(const Model& model, const MeanOperator& src_op,
   auto* tensor = (*params_op->mutable_attr())["value"].mutable_tensor();
   tensor->set_dtype(DT_INT32);
 
-  for (int i = 0; i < src_op.reduction_indices.size(); ++i) {
-    tensor->add_int_val(src_op.reduction_indices[i]);
+  for (int i = 0; i < src_op.axis.size(); ++i) {
+    tensor->add_int_val(src_op.axis[i]);
   }
   auto* shape = tensor->mutable_tensor_shape();
-  shape->add_dim()->set_size(src_op.reduction_indices.size());
+  shape->add_dim()->set_size(src_op.axis.size());
 }
 
 void ConvertSqueezeOperator(const Model& model, const SqueezeOperator& src_op,
@@ -1491,15 +1513,37 @@ void ConvertOperator(const Model& model, const Operator& src_op,
   } else if (src_op.type == OperatorType::kSlice) {
     ConvertSliceOperator(model, static_cast<const SliceOperator&>(src_op),
                          tensorflow_graph);
+  } else if (src_op.type == OperatorType::kArgMax) {
+    ConvertArgMaxOperator(model, static_cast<const ArgMaxOperator&>(src_op),
+                          tensorflow_graph);
   } else {
     LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(src_op.type);
   }
 }
 
-void AddPlaceholder(const string& name, GraphDef* tensorflow_graph) {
+void AddPlaceholder(const string& name, ArrayDataType type,
+                    GraphDef* tensorflow_graph) {
   auto* placeholder = tensorflow_graph->add_node();
   placeholder->set_op("Placeholder");
-  (*placeholder->mutable_attr())["dtype"].set_type(DT_FLOAT);
+  switch (type) {
+    case ArrayDataType::kBool:
+      (*placeholder->mutable_attr())["dtype"].set_type(DT_BOOL);
+      break;
+    case ArrayDataType::kFloat:
+      (*placeholder->mutable_attr())["dtype"].set_type(DT_FLOAT);
+      break;
+    case ArrayDataType::kUint8:
+      (*placeholder->mutable_attr())["dtype"].set_type(DT_UINT8);
+      break;
+    case ArrayDataType::kInt32:
+      (*placeholder->mutable_attr())["dtype"].set_type(DT_INT32);
+      break;
+    case ArrayDataType::kInt64:
+      (*placeholder->mutable_attr())["dtype"].set_type(DT_INT64);
+      break;
+    default:
+      LOG(FATAL) << "Unexpected data type in array \"" << name << "\"";
+  }
   placeholder->set_name(name);
 }
 
@@ -1527,7 +1571,9 @@ void AddPlaceholderForRNNState(const Model& model, const string& name, int size,
 void ExportTensorFlowGraphDefImplementation(const Model& model,
                                             GraphDef* tensorflow_graph) {
   for (const auto& input_array : model.flags.input_arrays()) {
-    AddPlaceholder(input_array.name(), tensorflow_graph);
+    AddPlaceholder(input_array.name(),
+                   model.arrays.at(input_array.name())->data_type,
+                   tensorflow_graph);
   }
   for (const auto& rnn_state : model.flags.rnn_states()) {
     AddPlaceholderForRNNState(model, rnn_state.state_array(), rnn_state.size(),
diff --git a/tensorflow/contrib/lite/toco/format_port.h b/tensorflow/contrib/lite/toco/format_port.h
index 3bc3295d0494482f306f3af00795a3c00e3153bf..0e999001e0e35fb916b11db199dbf28572685f3d 100644
--- a/tensorflow/contrib/lite/toco/format_port.h
+++ b/tensorflow/contrib/lite/toco/format_port.h
@@ -36,7 +36,7 @@ inline const char* IdentityOrConvertStringToRaw(const std::string& foo) {
   return foo.c_str();
 }
 
-#if defined(PLATFORM_GOOGLE)
+#if defined(PLATFORM_GOOGLE) && defined(HAS_GLOBAL_STRING)
 // Overloaded case where we return string.
 inline const char* IdentityOrConvertStringToRaw(const string& foo) {
   return foo.c_str();
diff --git a/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md b/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
index b9f8c8d152e7f0f856bfdf0b141c240882d447c4..7e152f5ba887088c98055596f8245b82fbc86eaa 100644
--- a/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
+++ b/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
@@ -26,7 +26,6 @@ bazel run --config=opt \
   --output_file=/tmp/foo.lite \
   --input_format=TENSORFLOW_GRAPHDEF \
   --output_format=TFLITE \
-  --input_type=FLOAT \
   --inference_type=FLOAT \
   --input_shape=1,128,128,3 \
   --input_array=input \
@@ -58,19 +57,9 @@ To explain each of these flags:
     allowing to defer the specification of the input shape until runtime. The
     format of `input_shape` is always a comma-separated list of dimensions,
     always in TensorFlow convention.
-*   `--input_type` specifies what should be the type of the input arrays in the
-    **output** file. `--input_type` does not describe a property of the input
-    file: the type of input arrays is already encoded in the input graph.
-    Rather, `--input_type` is how you specify what should be the type of the
-    inputs to be provided to the output converted graph. This only affects
-    arrays of real numbers: this flag allows to quantized/dequantize
-    real-numbers inputs, switching between floating-point and quantized forms.
-    This flag has no incidence on all other types of input arrays, such as plain
-    integers or strings.
 *   `--inference_type` specifies what type of arithmetic the output file should
     be relying on. It implies in particular the choice of type of the output
-    arrays in the output file. Like `--input_type`, `--inference_type` does not
-    describe a property of the input file.
+    arrays in the output file.
 
 ## Just optimize a TensorFlow GraphDef
 
@@ -94,11 +83,11 @@ bazel run --config=opt \
   --output_array=MobilenetV1/Predictions/Reshape_1
 ```
 
-Here we did not pass `--input_type` and `--inference_type` because they are
-considered not applicable to the TensorFlow GraphDef format (as far as we are
-concerned, TensorFlow GraphDefs are technically always float, and the only
-flavor of "quantized" GraphDef that the converter deals with is "FakeQuantized"
-graphs that are still technically float graphs).
+Here we did not pass `--inference_type` because it is not considered applicable
+to the TensorFlow GraphDef format (as far as we are concerned, TensorFlow
+GraphDefs are technically always float, and the only flavor of "quantized"
+GraphDef that the converter deals with is "FakeQuantized" graphs that are still
+technically float graphs).
 
 Below in the section about passing arbitrary input/output arrays we give another
 example, using the converter to extract just a sub-graph from a TensorFlow
@@ -144,7 +133,6 @@ bazel run --config=opt \
   --output_file=/tmp/foo.lite \
   --input_format=TENSORFLOW_GRAPHDEF \
   --output_format=TFLITE \
-  --input_type=QUANTIZED_UINT8 \
   --inference_type=QUANTIZED_UINT8 \
   --input_shape=1,128,128,3 \
   --input_array=input \
@@ -156,11 +144,9 @@ bazel run --config=opt \
 Here, besides changing `--input_file` to point to a (fake-)quantized GraphDef,
 the only other changes are:
 
-*   To change `--input_type` and `--inference_type` to `QUANTIZED_UINT8`. This
-    effectively tells the converter to generate an output file that can take a
-    quantized uint8 array as input (`--input_type=QUANTIZED_UINT8`), and have
-    quantized uint8 internal and output arrays as well
-    (`--inference_type=QUANTIZED_UINT8`).
+*   To change `--inference_type` to `QUANTIZED_UINT8`. This effectively tells
+    the converter to generate an output file that performs quantized inference
+    on a quantized input.
 *   To pass `--mean_value` and `--std_value` flags to describe how the quantized
     uint8 input array values are to be interpreted as the mathematical real
     numbers that the graph is concerned with (keep in mind that even a
@@ -195,7 +181,6 @@ bazel run --config=opt \
   --output_file=/tmp/foo.cc \
   --input_format=TENSORFLOW_GRAPHDEF \
   --output_format=TFLITE \
-  --input_type=QUANTIZED_UINT8 \
   --inference_type=QUANTIZED_UINT8 \
   --input_shape=1,128,128,3 \
   --input_array=input \
@@ -225,7 +210,6 @@ bazel run --config=opt \
   --output_file=/tmp/foo.lite \
   --input_format=TENSORFLOW_GRAPHDEF \
   --output_format=TFLITE \
-  --input_type=FLOAT \
   --inference_type=FLOAT \
   --input_shape=1,224,224,3 \
   --input_array=input \
@@ -254,7 +238,6 @@ bazel run --config=opt \
   --output_file=/tmp/foo.lite \
   --input_format=TENSORFLOW_GRAPHDEF \
   --output_format=TFLITE \
-  --input_type=FLOAT \
   --inference_type=FLOAT \
   --input_shapes=1,28,28,96:1,28,28,16:1,28,28,192:1,28,28,64 \
   --input_arrays=InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_3/MaxPool_0a_3x3/MaxPool,InceptionV1/InceptionV1/Mixed_3b/Branch_0/Conv2d_0a_1x1/Relu \
@@ -328,7 +311,6 @@ bazel run --config=opt \
   --output_file=/tmp/foo.lite \
   --input_format=TENSORFLOW_GRAPHDEF \
   --output_format=TFLITE \
-  --input_type=FLOAT \
   --inference_type=FLOAT \
   --input_shape=1,128,128,3 \
   --input_array=input \
@@ -436,7 +418,6 @@ bazel run --config=opt \
   --output_file=/tmp/foo.lite \
   --input_format=TENSORFLOW_GRAPHDEF \
   --output_format=TFLITE \
-  --input_type=FLOAT \
   --inference_type=FLOAT \
   --input_shape=1,128,128,3 \
   --input_array=input \
diff --git a/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md b/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
index cc6d416959c2a4d3a06d95b44e5bb333224838c0..4776741ab9273cf3b2ef0c63a6dbfdea5475b057 100644
--- a/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
+++ b/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
@@ -38,23 +38,27 @@ on the input and output formats, additional flags may be allowed or mandatory:
         file are to be considered the input activations.
     *   `--input_shape` or `--input_shapes` specify the shapes of the input
         arrays.
+    *   `--input_data_type` or `--input_data_types` specify the data types of
+        input arrays, which can be used if the input file does not already
+        specify them.
     *   `--mean_value` or `--mean_values`, and `--std_value` or `--std_values`,
         give the dequantization parameters of the input arrays, for the case
         when the output file will accept quantized input arrays.
 *   *Transformation flags* specify options of the transformations to be applied
     to the graph, i.e. they specify requested properties that the output file
     should have.
-    *   `--input_type` specifies the type that the input arrays should have
-        after transformations, in the output file. This is where you choose
-        whether you want runtime inference code to accept float or quantized
-        inputs. This flag only applies to float or quantized inputs, and allows
-        to convert between the two. This flag has no effect on all other types
-        of inputs, such as ordinary integer arrays.
-    *   `--inference_type` or `--inference_types` specify the type that generic
-        intermediate and output activation arrays should have after
-        transformations, in the output file. This is where you choose whether
-        you want runtime inference code to perform float or quantized inference
-        arithmetic.
+    *   `--inference_type` specifies the type of real-numbers arrays in the
+        output file. This only affects arrays of real numbers and allows to
+        control their quantization or dequantization, effectively switching
+        between floating-point and quantized arithmetic for the inference
+        workload, as far as real numbers are concerned. Other data types are
+        unaffected (e.g. plain integers, and strings).
+    *   `--inference_input_type` is like `--inference_type` but specifically
+        controlling input arrays, separately from other arrays. If not
+        specified, then `--inference_type` is used. The use case for specifying
+        `--inference_input_type` is when one wants to perform floating-point
+        inference on a quantized input, as is common in image models operating
+        on bitmap image inputs.
     *   Some transformation flags allow to carry on with quantization when the
         input graph is not properly quantized: `--default_ranges_min`,
         `--default_ranges_max`, `--drop_fake_quant`,
@@ -77,8 +81,6 @@ on the input and output formats, additional flags may be allowed or mandatory:
     *   `TFLITE` &mdash; The TensorFlow Lite flatbuffers format.
         *   Whether a float or quantized TensorFlow Lite file will be produced
             depends on the `--inference_type` flag.
-        *   Whether the produced TensorFlow Lite file will accept a float or
-            quantized input depends on the `--input_type` flag.
     *   `GRAPHVIZ_DOT` &mdash; The GraphViz `.dot` format. This asks the
         converter to generate a reasonable graphical representation of the graph
         after simplification by a generic set of transformation.
@@ -126,9 +128,7 @@ additional information about the single input array:
         next innermost dimension after 'depth').
 *   `--mean_value` and `--std_value`. Type: floating-point. The decimal point
     character is always the dot (`.`) regardless of the locale. These specify
-    the (de-)quantization parameters of the input array, to use when the output
-    file will take a quantized input array (that is, when passing
-    `--input_type=QUANTIZED_UINT8`).
+    the (de-)quantization parameters of the input array, when it is quantized.
     *   The meaning of mean_value and std_value is as follows: each quantized
         value in the quantized input array will be interpreted as a mathematical
         real number (i.e. as an input activation value) according to the
@@ -162,33 +162,48 @@ additional information about the multiple input arrays:
 
 ### Transformation flags
 
-*   `--input_type`. Type: string. Specifies what should be the type of the
-    entries in the input array(s) in the output file, after transformations, for
-    those input arrays that are originally either floating-point or quantized
-    real numbers in the input file. If there are multiple such input arrays,
-    then they all use this type. Input arrays of other types, such as arrays of
-    plain integers or strings, are not concerned with this flag. Allowed values:
-    *   `FLOAT` &mdash; Keep floating-point input arrays as such. Dequantize any
-        quantized input array. entries ("float32").
-    *   `QUANTIZED_UINT8` &mdash; Quantize floating-point input arrays, to have
-        8-bit unsigned integer entries. The quantization params are specified by
-        `--mean_value`, `--std_value` flags as explained in the documentation of
-        these flags.
-*   `--inference_type`. Type: string. Specifies what to do with floating-point
-    arrays found in the input file, besides input arrays. In other words, this
-    controls the possible quantization of floating-point weights, intermediate
-    activations, and output activations. Has no effect on arrays that aren't
-    floating-point in the input file. Allowed values:
-    *   `FLOAT` &mdash; Keep floating-point arrays as floating-point in the
-        output file. This corresponds to what is commonly called "floating-point
-        inference".
-    *   `QUANTIZED_UINT8` &mdash; Quantize floating-point arrays, changing their
-        storage data type from float to some integer type:
-        *   All float activations are quantized as `uint8`.
-        *   Almost all float weights are quantized as `uint8`.
-            *   A few exceptions exist. In particular, the bias-vectors in
-                "Conv" and "FullyConnected" layers are quantized as `int32`
-                instead for technical reasons.
+*   `--inference_type`. Type: string. Sets the type of real-number arrays in the
+    output file, that is, controls the representation (quantization) of real
+    numbers in the output file, except for input arrays, which are controlled by
+    `--inference_input_type`.
+
+    This flag only impacts real-number arrays. By "real-number" we mean float
+    arrays, and quantized arrays. This excludes plain integer arrays, strings
+    arrays, and every other data type.
+
+    For real-number arrays, the impact of this flag is to allow the output file
+    to choose a different real-numbers representation (quantization) from what
+    the input file used. For any other types of arrays, changing the data type
+    would not make sense.
+
+    Specifically:
+
+    *   If `FLOAT`, then real-numbers arrays will be of type float in the output
+        file. If they were quantized in the input file, then they get
+        dequantized.
+    *   If `QUANTIZED_UINT8`, then real-numbers arrays will be quantized as
+        uint8 in the output file. If they were float in the input file, then
+        they get quantized.
+    *   If not set, then all real-numbers arrays retain the same type in the
+        output file as they have in the input file.
+
+*   `--inference_input_type`. Type: string. Similar to inference_type, but
+    allows to control specifically the quantization of input arrays, separately
+    from other arrays.
+
+    If not set, then the value of `--inference_type` is implicitly used, i.e. by
+    default input arrays are quantized like other arrays.
+
+    Like `--inference_type`, this only affects real-number arrays. By
+    "real-number" we mean float arrays, and quantized arrays. This excludes
+    plain integer arrays, strings arrays, and every other data type.
+
+    The typical use for this flag is for vision models taking a bitmap as input,
+    typically with uint8 channels, yet still requiring floating-point inference.
+    For such image models, the uint8 input is quantized, i.e. the uint8 values
+    are interpreted as real numbers, and the quantization parameters used for
+    such input arrays are their `mean_value`, `std_value` parameters.
+
 *   `--default_ranges_min`, `--default_ranges_max`. Type: floating-point. The
     decimal point character is always the dot (`.`) regardless of the locale.
     These flags enable what is called "dummy quantization". If defined, their
@@ -198,9 +213,11 @@ additional information about the multiple input arrays:
     incorrectly-quantized input files. This enables easy performance prototyping
     ("how fast would my model run if I quantized it?") but should never be used
     in production as the resulting quantized arithmetic is inaccurate.
+
 *   `--drop_fake_quant`. Type: boolean. Default: false. Causes fake-quantization
     nodes to be dropped from the graph. This may be used to recover a plain
     float graph from a fake-quantized graph.
+
 *   `--reorder_across_fake_quant`. Type: boolean. Default: false. Normally,
     fake-quantization nodes must be strict boundaries for graph transformations,
     in order to ensure that quantized inference has the exact same arithmetic
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_expanddims_to_reshape.cc b/tensorflow/contrib/lite/toco/graph_transformations/convert_expanddims_to_reshape.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3bde9b0169ddfb7fc37657122e2e8eb65ccbdf6d
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/convert_expanddims_to_reshape.cc
@@ -0,0 +1,101 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ConvertExpandDimsToReshape::Run(Model* model, std::size_t op_index) {
+  auto expand_it = model->operators.begin() + op_index;
+  if (expand_it->get()->type != OperatorType::kExpandDims) {
+    return false;
+  }
+  ExpandDimsOperator* expand_op =
+      static_cast<ExpandDimsOperator*>(expand_it->get());
+  CHECK_EQ(expand_op->inputs.size(), 2);
+  CHECK_EQ(expand_op->outputs.size(), 1);
+
+  const auto& input_array = *model->arrays[expand_op->inputs[0]];
+  if (!input_array.has_shape()) {
+    // Yield until input dims have been resolved.
+    return false;
+  }
+  if (input_array.shape().dimensions_count() == 0) {
+    // Input array cannot be 0-D.
+    // (Unsure if this is TF behavior, but was required to get a test to pass.)
+    return false;
+  }
+
+  const auto& axis_array = *model->arrays[expand_op->inputs[1]];
+  if (!axis_array.has_shape()) {
+    // Yield until input axis array shape has been resolved.
+    return false;
+  }
+  CHECK_EQ(RequiredBufferSizeForShape(axis_array.shape()), 1);
+  if (!axis_array.buffer) {
+    // Yield until the input axis array is constant
+    return false;
+  }
+  int axis = axis_array.GetBuffer<ArrayDataType::kInt32>().data[0];
+  std::vector<int> reshape_dims(input_array.shape().dims());
+  if (axis < 0) {
+    axis = reshape_dims.size();
+  }
+  reshape_dims.insert(reshape_dims.begin() + axis, 1);
+
+  // The input tensor has shape, and the axis input is constant. We can now
+  // replace ExpandDims with a Reshape.
+  auto* reshape_op = new TensorFlowReshapeOperator;
+
+  // Copy inputs
+  reshape_op->inputs.push_back(expand_op->inputs[0]);
+  reshape_op->outputs = expand_op->outputs;
+
+  // Create a new input array
+  string axis_array_name = expand_op->inputs[1];
+  string shape_array_name = toco::AvailableArrayName(*model, axis_array_name);
+  Array& shape_array = model->GetOrCreateArray(shape_array_name);
+  *(shape_array.mutable_shape()->mutable_dims()) = {
+      1, static_cast<int>(reshape_dims.size())};
+  reshape_op->inputs.push_back(shape_array_name);
+  shape_array.data_type = ArrayDataType::kInt32;
+  auto& shape_buffer = shape_array.GetMutableBuffer<ArrayDataType::kInt32>();
+  shape_buffer.data = reshape_dims;
+
+  // Delete axis array if unused
+  if (IsDiscardableArray(*model, axis_array_name) &&
+      CountOpsWithInput(*model, axis_array_name) == 1 &&
+      !GetOpWithOutput(*model, axis_array_name)) {
+    model->arrays.erase(axis_array_name);
+  }
+
+  // Replace the operator in the graph.
+  const auto reshape_it = model->operators.emplace(expand_it, reshape_op);
+  expand_it = reshape_it + 1;
+  CHECK_EQ(expand_it->get(), expand_op);
+  model->operators.erase(expand_it);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc b/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc
index 7a865100259f79af998c9d7faa224dff75cb3c57..d129b5ecf2615434b8ff8387a04af9561fe617a4 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc
@@ -71,7 +71,8 @@ bool FuseActivationFunctions::Run(Model* model, std::size_t op_index) {
   // TODO(dkalenichenko): Great many ops don't support activation function
   // fusing. Switch to the whilelist approach instead.
   if (op->type == OperatorType::kConcatenation ||
-      op->type == OperatorType::kSlice) {
+      op->type == OperatorType::kSlice ||
+      op->type == OperatorType::kTensorFlowSplit) {
     AddMessageF(
         "Not fusing activation function because the %s op doesn't support it",
         LogName(*op));
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc
index 323fec6cf864a798a02aecdbbbf7c2e7bb904d2b..3a7611a6683206eb3a9f6779668158292274a7fe 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc
@@ -41,6 +41,97 @@ void PrintModelStats(const string& label, const Model& model) {
             << " quantized)";
 }
 
+// Some graphs have RNN back-edges that are discardable, having been
+// created typically by TensorFlow import rather than specified by the user.
+// Such graphs might have cycles (closed by RNN back-edges) that may be pruned.
+// Local graph transformations can't identify such global features,
+// so this function performs this global transformation.
+//
+// The other (and related) thing that is peculiar about RNN back-edges
+// is that they do not prevent the arrays that they touch, from being
+// pruned. Thus, they may refer to array names which no longer exist.
+// The intent is for that to result in the eventual pruning of such
+// 'dangling' RNN back-edges. We perform this pruning at the end of this
+// function, as the pruning of connected components done here may leave
+// more RNN back-edges dangling.
+void DiscardUselessConnectedComponentsAndRNNBackEdges(Model* model) {
+  // Identify the set of arrays that are in 'useful' connected components
+  // of the graph, which means connected to output arrays.
+  std::unordered_set<string> useful_arrays;
+  for (const string& output_array : model->flags.output_arrays()) {
+    useful_arrays.insert(output_array);
+  }
+  bool found_new_useful_arrays;
+  do {
+    found_new_useful_arrays = false;
+    for (const auto& op : model->operators) {
+      bool op_touches_useful_arrays = false;
+      for (const string& output : op->outputs) {
+        op_touches_useful_arrays |= useful_arrays.count(output);
+      }
+      if (op_touches_useful_arrays) {
+        for (const string& input : op->inputs) {
+          found_new_useful_arrays |= !useful_arrays.count(input);
+          useful_arrays.insert(input);
+        }
+        for (const string& output : op->outputs) {
+          found_new_useful_arrays |= !useful_arrays.count(output);
+          useful_arrays.insert(output);
+        }
+      }
+    }
+    for (const auto& rnn_state : model->flags.rnn_states()) {
+      bool rnn_back_edge_touches_useful_arrays =
+          useful_arrays.count(rnn_state.state_array());
+      if (rnn_back_edge_touches_useful_arrays) {
+        found_new_useful_arrays |=
+            !useful_arrays.count(rnn_state.back_edge_source_array());
+        useful_arrays.insert(rnn_state.back_edge_source_array());
+      }
+    }
+  } while (found_new_useful_arrays);
+  // Erase arrays that aren't useful, and that are discardable.
+  for (auto it = model->arrays.begin(); it != model->arrays.end();) {
+    if (useful_arrays.count(it->first) ||
+        !IsDiscardableArray(*model, it->first)) {
+      ++it;
+    } else {
+      it = model->arrays.erase(it);
+    }
+  }
+  // Erase operators that do not produce a useful output array.
+  for (auto it = model->operators.begin(); it != model->operators.end();) {
+    // Only need to test the first output, as we simultaneously added all of
+    // an operator's outputs to the list of output arrays.
+    if (useful_arrays.count((*it)->outputs[0])) {
+      ++it;
+    } else {
+      for (const string& output : (*it)->outputs) {
+        CHECK(!useful_arrays.count(output));
+      }
+      it = model->operators.erase(it);
+    }
+  }
+  // Erase RNN back-edges that are 'dangling' i.e. that touch an array
+  // that no longer exists. This should only happen for discardable RNN
+  // back-edges.
+  std::vector<RnnState> rnn_states_to_keep;
+  for (const auto& rnn_state : model->flags.rnn_states()) {
+    const bool dangling =
+        !model->arrays.count(rnn_state.back_edge_source_array()) ||
+        !model->arrays.count(rnn_state.state_array());
+    if (dangling) {
+      CHECK(rnn_state.discardable());
+    } else {
+      rnn_states_to_keep.push_back(rnn_state);
+    }
+  }
+  model->flags.clear_rnn_states();
+  for (const auto& rnn_state : rnn_states_to_keep) {
+    *model->flags.add_rnn_states() = rnn_state;
+  }
+}
+
 bool GraphTransformationsPass(int increment, Model* model,
                               const GraphTransformationsSet& transformations) {
   CHECK(increment == 1 || increment == -1);
@@ -86,6 +177,7 @@ bool GraphTransformationsPass(int increment, Model* model,
       op_index += increment;
     }
   }
+  DiscardUselessConnectedComponentsAndRNNBackEdges(model);
   return changed;
 }
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index 2cc24ff361a4c2b9c5c444d8a7fc12b6889a2ce1..2eb244ee0895cb96ebb5fc00cceb8dc81fc55c71 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -108,10 +108,11 @@ void RunGraphTransformations(Model* model, const string& message,
   class GTName : public GraphTransformation {              \
    public:                                                 \
     bool Run(Model* model, std::size_t op_index) override; \
-    const char* Name() const { return #GTName; }           \
+    const char* Name() const override { return #GTName; }  \
   };
 
 // List of all graph transformations
+DECLARE_GRAPH_TRANSFORMATION(ConvertExpandDimsToReshape)
 DECLARE_GRAPH_TRANSFORMATION(ConvertPureConvToDepthwise)
 DECLARE_GRAPH_TRANSFORMATION(EnsureBiasVectors)
 DECLARE_GRAPH_TRANSFORMATION(FuseActivationFunctions)
@@ -151,6 +152,7 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveConstantFakeQuant)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantConcatenation)
 DECLARE_GRAPH_TRANSFORMATION(DropFakeQuant)
 DECLARE_GRAPH_TRANSFORMATION(UnfuseActivationFunctions)
+DECLARE_GRAPH_TRANSFORMATION(ResolveBatchToSpaceNDAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolvePadAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveStridedSliceAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveSliceAttributes)
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
index 9cb26c8752c0d27a3d1138b9ad32e60f34177520..9689b205cd137904504d87906cb691d0ed8235bf 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
@@ -143,7 +143,7 @@ bool HardcodeMinMaxForAverageOrMaxPool(Model* model, Operator* op) {
   return true;
 }
 
-bool HardcodeMinMaxForReshapeOrSqueeze(Model* model, Operator* op) {
+bool HardcodeMinMaxFromFirstInput(Model* model, Operator* op) {
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.minmax) {
     return false;
@@ -203,7 +203,8 @@ bool HardcodeMinMax::Run(Model* model, std::size_t op_index) {
 
     case OperatorType::kSqueeze:
     case OperatorType::kTensorFlowReshape:
-      changed = HardcodeMinMaxForReshapeOrSqueeze(model, op);
+    case OperatorType::kPad:
+      changed = HardcodeMinMaxFromFirstInput(model, op);
       break;
 
     case OperatorType::kLogistic:
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
index 1ff4e827aa043cbbb0515e10a6ae9bd33e6d819c..4fe127544be6f4439b184e1fcf4436eda4a53cc5 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
@@ -24,19 +24,6 @@ limitations under the License.
 namespace toco {
 
 namespace {
-
-ArrayDataType CommonDataTypeOfAllInputs(const Model& model,
-                                        const Operator& op) {
-  CHECK_GT(op.inputs.size(), 0);
-  const ArrayDataType data_type = model.GetArray(op.inputs[0]).data_type;
-  for (const auto& input : op.inputs) {
-    const auto& array = model.GetArray(input);
-    CHECK(array.data_type == data_type)
-        << " Unexpected: this operator has inputs with different data types.";
-  }
-  return data_type;
-}
-
 void SetDataTypeForAllOutputs(Model* model, Operator* op,
                               ArrayDataType data_type) {
   for (const auto& output : op->outputs) {
@@ -72,41 +59,15 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
              op->type == OperatorType::kTensorFlowGreaterEqual) {
     // These operators unconditionally produce bool outputs
     SetDataTypeForAllOutputs(model, op, ArrayDataType::kBool);
-  } else if (op->type == OperatorType::kTensorFlowShape) {
+  } else if (op->type == OperatorType::kRank ||
+             op->type == OperatorType::kTensorFlowShape) {
     // These operators are assumed to produce int32 outputs.
     SetDataTypeForAllOutputs(model, op, ArrayDataType::kInt32);
-  } else if (op->type == OperatorType::kAveragePool ||
-             op->type == OperatorType::kMaxPool ||
-             op->type == OperatorType::kL2Pool ||
-             op->type == OperatorType::kConv ||
-             op->type == OperatorType::kDepthwiseConv ||
-             op->type == OperatorType::kFullyConnected ||
-             op->type == OperatorType::kTensorFlowMax ||
-             op->type == OperatorType::kTensorFlowMin ||
-             op->type == OperatorType::kPad ||
-             op->type == OperatorType::kStridedSlice ||
-             op->type == OperatorType::kTensorFlowReshape ||
-             op->type == OperatorType::kSlice ||
-             op->type == OperatorType::kSqueeze ||
-             op->type == OperatorType::kTensorFlowSum ||
-             op->type == OperatorType::kTensorFlowSwitch ||
-             op->type == OperatorType::kTensorFlowTile ||
-             op->type == OperatorType::kTensorFlowAll ||
-             op->type == OperatorType::kReorderAxes ||
-             op->type == OperatorType::kTensorFlowConcatV2 ||
-             op->type == OperatorType::kFloor ||
-             op->type == OperatorType::kGather ||
-             op->type == OperatorType::kSpaceToBatchND ||
-             op->type == OperatorType::kBatchToSpaceND ||
-             op->type == OperatorType::kMean) {
-    // These operators produce outputs with the same type as their 1st input
-    CHECK_GT(op->inputs.size(), 0);
-    const ArrayDataType data_type = model->arrays[op->inputs[0]]->data_type;
-    SetDataTypeForAllOutputs(model, op, data_type);
   } else if (op->type == OperatorType::kTensorFlowSplit ||
-             op->type == OperatorType::kTensorFlowConcat) {
+             op->type == OperatorType::kTensorFlowConcat ||
+             op->type == OperatorType::kFill) {
     // These operators produce an output with the same type as their 2nd input
-    CHECK_GT(op->inputs.size(), 1);
+    CHECK_GE(op->inputs.size(), 2);
     const ArrayDataType data_type = model->arrays[op->inputs[1]]->data_type;
     SetDataTypeForAllOutputs(model, op, data_type);
   } else if (op->type == OperatorType::kCast) {
@@ -114,6 +75,11 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
     CHECK_EQ(op->outputs.size(), 1);
     auto* cast_op = static_cast<CastOperator*>(op);
     model->arrays[op->outputs[0]]->data_type = cast_op->dst_data_type;
+  } else if (op->type == OperatorType::kArgMax) {
+    // Data type of the ArgMax op is specified.
+    CHECK_EQ(op->outputs.size(), 1);
+    auto* argmax_op = static_cast<ArgMaxOperator*>(op);
+    model->arrays[op->outputs[0]]->data_type = argmax_op->output_data_type;
   } else if (op->type == OperatorType::kTensorFlowUnsupported) {
     auto* unsupported_op = static_cast<TensorFlowUnsupportedOperator*>(op);
     if (unsupported_op->output_data_types.size() != op->outputs.size()) {
@@ -124,10 +90,13 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
       auto data_type = unsupported_op->output_data_types[i];
       model->arrays[output]->data_type = data_type;
     }
+  } else if (op->type == OperatorType::kExpandDims) {
+    // Yield on ExpandDim until it is converted to Reshape
+    return false;
   } else {
-    // These operators produce an output with the same type as any of their
-    // inputs, which must always have the same type.
-    const ArrayDataType data_type = CommonDataTypeOfAllInputs(*model, *op);
+    // These operators produce outputs with the same type as their 1st input
+    CHECK_GT(op->inputs.size(), 0);
+    const ArrayDataType data_type = model->arrays[op->inputs[0]]->data_type;
     SetDataTypeForAllOutputs(model, op, data_type);
   }
   // Return true if any output data type changed, false if none changed.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 82a43bc2ce9aa4eb90b520bbf2227d2b5eef839b..5a95b9961f572e6e4380d34920c81146f7411d8f 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "absl/strings/str_join.h"
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/tooling_util.h"
@@ -48,10 +49,10 @@ void ComputeConvSizes(const Shape& input_shape, int output_depth, int kwidth,
     LOG(FATAL) << "Only supporting SAME or VALID padding";
   }
 
-  fixed_padding->height =
-      ((output_height - 1) * stride_height + kheight - input_height) / 2;
-  fixed_padding->width =
-      ((output_width - 1) * stride_width + kwidth - input_width) / 2;
+  fixed_padding->height = std::max(
+      0, ((output_height - 1) * stride_height + kheight - input_height) / 2);
+  fixed_padding->width = std::max(
+      0, ((output_width - 1) * stride_width + kwidth - input_width) / 2);
 
   // Actually had to debug a situation where those were negative due to bad
   // propagation of placeholder -1 sizes in TensorFlowReshape.
@@ -367,23 +368,40 @@ void ProcessSimpleBinaryOperator(Model* model, Operator* op) {
                                   &output_array);
 }
 
+bool KeepDims(const Operator& op) {
+  switch (op.type) {
+    case OperatorType::kTensorFlowMin:
+      return static_cast<const TensorFlowMinOperator&>(op).keep_dims;
+    case OperatorType::kTensorFlowMax:
+      return static_cast<const TensorFlowMaxOperator&>(op).keep_dims;
+    case OperatorType::kTensorFlowSum:
+      return static_cast<const TensorFlowSumOperator&>(op).keep_dims;
+    case OperatorType::kMean:
+      return static_cast<const MeanOperator&>(op).keep_dims;
+    default:
+      LOG(FATAL) << "Not a reduction operator!";
+      return false;
+  }
+}
+
 void ProcessTensorFlowReductionOperator(Model* model, Operator* op) {
   CHECK_LE(op->inputs.size(), 2);
   auto& output_array = *model->arrays[op->outputs[0]];
   if (output_array.has_shape()) {
     return;
   }
+  const auto& input_array = *model->arrays[op->inputs[0]];
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+  const bool keep_dims = KeepDims(*op);
   if (op->inputs.size() == 2) {
     // There is a reduction_indices input.
-    const auto& input_array = *model->arrays[op->inputs[0]];
     const auto& reduction_array = *model->arrays[op->inputs[1]];
     if (!reduction_array.buffer) {
       return;
     }
-    if (!input_array.has_shape()) {
-      return;
-    }
-    auto& input_shape = input_array.shape();
     CHECK(reduction_array.buffer->type == ArrayDataType::kInt32);
     const auto& reduction_array_vals =
         reduction_array.GetBuffer<ArrayDataType::kInt32>().data;
@@ -398,11 +416,17 @@ void ProcessTensorFlowReductionOperator(Model* model, Operator* op) {
       }
       if (!is_reduction_dim) {
         output_dims.push_back(input_shape.dims(i));
+      } else if (keep_dims) {
+        output_dims.push_back(1);
       }
     }
   } else {
     // No reduction_indices means complete reduction to a single scalar.
-    output_array.copy_shape(Shape({}));
+    if (keep_dims) {
+      output_array.copy_shape(input_shape);
+    } else {
+      output_array.copy_shape(Shape({}));
+    }
   }
 }
 
@@ -474,14 +498,14 @@ void ProcessConcatenationOperator(Model* model, ConcatenationOperator* op) {
     CHECK_EQ(input_array.shape().dimensions_count(),
              output_array.shape().dimensions_count());
     const std::vector<int>& input_dims = input_array.shape().dims();
-    CHECK_LT(op->concat_dim, input_dims.size());
-    concat_size += input_dims[op->concat_dim];
+    CHECK_LT(op->axis, input_dims.size());
+    concat_size += input_dims[op->axis];
   }
   // Write out the concat_size on the output array shape.
   auto& output_shape = *output_array.mutable_shape();
   auto& output_dims = *output_shape.mutable_dims();
-  CHECK_LT(op->concat_dim, output_shape.dimensions_count());
-  output_dims[op->concat_dim] = concat_size;
+  CHECK_LT(op->axis, output_shape.dimensions_count());
+  output_dims[op->axis] = concat_size;
 }
 
 void ProcessTensorFlowSplitOperator(Model* model, TensorFlowSplitOperator* op) {
@@ -675,7 +699,10 @@ void ProcessSpaceToBatchNDOperator(Model* model, SpaceToBatchNDOperator* op) {
     return;
   }
   const auto& input_shape = input_array.shape();
-  CHECK_EQ(input_shape.dimensions_count(), 4);
+  if (input_shape.dimensions_count() != 4) {
+    // This method only handles input dimensions of 4
+    return;
+  }
   const auto input_height = input_shape.dims(1);
   const auto input_width = input_shape.dims(2);
 
@@ -794,6 +821,7 @@ void ProcessGatherOperator(Model* model, GatherOperator* op) {
 
   // Copy the input dimensions to the output except for dimension 0,
   // where the dimension of indices_shape is used.
+  // TODO(mgubin): if axis != 0 this is not true, change when it's supported.
   auto output_dims = output_array.mutable_shape()->mutable_dims();
   output_dims->push_back(indices_shape.dims(0));
   for (int dim = 1; dim < input_shape.dimensions_count(); dim++) {
@@ -827,33 +855,6 @@ void ProcessPadOperator(Model* model, PadOperator* op) {
   output_array.copy_shape(output_shape);
 }
 
-void ProcessMeanOperator(Model* model, MeanOperator* op) {
-  CHECK_EQ(op->inputs.size(), 2);
-  CHECK_EQ(op->outputs.size(), 1);
-
-  const auto& input_array = *model->arrays[op->inputs[0]];
-
-  // Yield until input dims have been resolved.
-  if (!input_array.has_shape()) return;
-  const std::vector<int>& indices = op->reduction_indices;
-  if (indices.empty()) return;
-
-  auto& output_array = *model->arrays[op->outputs[0]];
-  if (output_array.has_shape()) return;
-
-  const std::vector<int>& input_dims = input_array.shape().dims();
-  std::vector<int> output_dims;
-  for (int i = 0; i < input_dims.size(); ++i) {
-    if (std::find(indices.begin(), indices.end(), i) == indices.end()) {
-      output_dims.push_back(input_dims[i]);
-    }
-  }
-  CHECK(!output_dims.empty());
-  CHECK_EQ(output_dims.size(), 2);
-
-  *output_array.mutable_shape()->mutable_dims() = output_dims;
-}
-
 void ProcessStridedSliceOperator(Model* model, StridedSliceOperator* op) {
   CHECK_EQ(op->inputs.size(), 4);
   CHECK_EQ(op->outputs.size(), 1);
@@ -939,6 +940,34 @@ void ProcessSvdfOperator(Model* model, SvdfOperator* op) {
   auto& output_array = model->GetArray(op->outputs[1]);
   output_array.mutable_shape()->ReplaceDims({batch_size, num_units});
 }
+
+void ProcessArgMaxOperator(Model* model, ArgMaxOperator* op) {
+  CHECK_EQ(op->inputs.size(), 2);
+  const auto& input_array = *model->arrays[op->inputs[0]];
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+
+  // The current ArgMax implementation only supports 4-dimensional inputs with
+  // the last dimension as the axis to perform ArgMax for.
+  const std::vector<int>& input_dims = input_array.shape().dims();
+  CHECK_EQ(input_dims.size(), 4);
+  std::vector<int> output_dims;
+
+  output_dims.reserve(input_dims.size() - 1);
+  for (int i = 0; i < input_dims.size() - 1; ++i) {
+    output_dims.push_back(input_dims[i]);
+  }
+  output_dims.push_back(1);
+  const string& output_name = op->outputs[0];
+  auto& output_array = *model->arrays[output_name];
+  if (output_array.has_shape()) {
+    return;
+  }
+  *output_array.mutable_shape()->mutable_dims() = output_dims;
+}
+
 }  // namespace
 
 bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
@@ -964,6 +993,7 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kLocalResponseNormalization:
     case OperatorType::kTensorFlowIdentity:
     case OperatorType::kFakeQuant:
+    case OperatorType::kNeg:
     case OperatorType::kTensorFlowRsqrt:
     case OperatorType::kTensorFlowSqrt:
     case OperatorType::kTensorFlowSquare:
@@ -981,6 +1011,8 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kSub:
     case OperatorType::kMul:
     case OperatorType::kDiv:
+    case OperatorType::kFloorDiv:
+    case OperatorType::kFloorMod:
     case OperatorType::kTensorFlowLess:
     case OperatorType::kTensorFlowLessEqual:
     case OperatorType::kTensorFlowGreater:
@@ -992,6 +1024,10 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kConv:
       ProcessConvOperator(model, static_cast<ConvOperator*>(op));
       break;
+    case OperatorType::kTransposeConv:
+      // Unimplemented, hopefully another graph transformation will drop it or
+      // rewrite it.
+      break;
     case OperatorType::kDepthwiseConv:
       ProcessDepthwiseConvOperator(model,
                                    static_cast<DepthwiseConvOperator*>(op));
@@ -1024,6 +1060,7 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kTensorFlowMin:
     case OperatorType::kTensorFlowMax:
     case OperatorType::kTensorFlowSum:
+    case OperatorType::kMean:
       ProcessTensorFlowReductionOperator(model, op);
       break;
 
@@ -1065,8 +1102,14 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
       // a more general non-depth concatenation that will hopefully be dropped,
       // or else at the moment we will abort.
       break;
+    case OperatorType::kExpandDims:
+    case OperatorType::kFill:
+    case OperatorType::kRange:
+    case OperatorType::kRank:
     case OperatorType::kTensorFlowShape:
-      // Unimplemented, hopefully another graph transformation will drop it or
+    case OperatorType::kStack:
+    case OperatorType::kTranspose:
+      // Unimplemented. Hopefully another graph transformation will drop it or
       // rewrite it.
       break;
     case OperatorType::kReorderAxes:
@@ -1098,13 +1141,13 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kPad:
       ProcessPadOperator(model, static_cast<PadOperator*>(op));
       break;
-    case OperatorType::kMean:
-      ProcessMeanOperator(model, static_cast<MeanOperator*>(op));
-      break;
     case OperatorType::kStridedSlice:
       ProcessStridedSliceOperator(model,
                                   static_cast<StridedSliceOperator*>(op));
       break;
+    case OperatorType::kArgMax:
+      ProcessArgMaxOperator(model, static_cast<ArgMaxOperator*>(op));
+      break;
     case OperatorType::kTensorFlowUnsupported:
       break;
     case OperatorType::kSvdf:
@@ -1120,6 +1163,8 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
   for (const auto& output : op->outputs) {
     if (model->arrays[output]->has_shape() &&
         (old_output_dims[output] != model->arrays[output]->shape().dims())) {
+      AddMessageF("Set shape of %s to [%s]", output,
+                  absl::StrJoin(model->arrays[output]->shape().dims(), ","));
       return true;
     }
   }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
index d33597d38144278dfca66edbdd9b3da68fbaa32c..56082b965a7cbd9d61cca2e26f7d76764c0e54aa 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
@@ -42,7 +42,7 @@ bool SupportsQuantization(const Operator& op) {
          type == OperatorType::kL2Normalization || type == OperatorType::kAdd ||
          type == OperatorType::kAveragePool || type == OperatorType::kMaxPool ||
          type == OperatorType::kLogistic || type == OperatorType::kSoftmax ||
-         type == OperatorType::kSqueeze ||
+         type == OperatorType::kSqueeze || type == OperatorType::kPad ||
          type == OperatorType::kTensorFlowReshape ||
          type == OperatorType::kMul || type == OperatorType::kSpaceToDepth ||
          type == OperatorType::kDepthToSpace;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_concatenation_input.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_concatenation_input.cc
index b6037357047fc699ffb15cb40d539be148a0b637..23a5c857e8b19f7edbb48f2c004d03e21008833d 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_concatenation_input.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_concatenation_input.cc
@@ -57,7 +57,8 @@ bool RemoveTrivialConcatenationInput::Run(Model* model, std::size_t op_index) {
 
   // Drop trivial inputs.
   for (const string& input : trivial_inputs) {
-    if (CountOpsWithInput(*model, input) == 1) {
+    if (IsDiscardableArray(*model, input) &&
+        CountOpsWithInput(*model, input) == 1) {
       model->arrays.erase(input);
     }
   }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
index a0d1338298431848ce5ebc8ae1d166959c320aef..047389f69a1d8987b52b07478b0d3eaf46f433ba 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
@@ -19,13 +19,12 @@ limitations under the License.
 
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
 #include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
 #include "tensorflow/contrib/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
-
 namespace {
+
 // Reroute all edges involving a given discardable array to another
 // array instead. from_array is assumed to be discardable, and consequently
 // this only updates operator edges (since discardable arrays only
@@ -46,7 +45,7 @@ void RerouteEdges(const string& from_array, const string& to_array,
   }
 }
 
-}  // end anonymous namespace
+}  // namespace
 
 bool RemoveTrivialPassthroughOp(GraphTransformation* transformation,
                                 Model* model, std::size_t op_index) {
@@ -64,19 +63,28 @@ bool RemoveTrivialPassthroughOp(GraphTransformation* transformation,
       main_input_array_index = i;
     }
   }
-  CHECK_LE(count_nonconstant_input_arrays, 1);
 
   const string main_input_name = passthru_op->inputs[main_input_array_index];
   const string output_name = passthru_op->outputs[0];
+
+  // Build the list of all input and output arrays of the passthrough node
+  // that we are considering removing. Any of these arrays is a candidate
+  // for being removed as well, if nothing else references it. Doing that
+  // arrays-removal together with the passthrough-node-removal proved too
+  // error-prone.
+  std::vector<string> removal_candidates;
+  for (const string& input : passthru_op->inputs) {
+    removal_candidates.push_back(input);
+  }
+  removal_candidates.push_back(output_name);
+
   if (IsDiscardableArray(*model, output_name)) {
     transformation->AddMessageF(
         "Removing %s, keeping its non-constant input array",
         LogName(*passthru_op));
-    model->arrays.erase(output_name);
     for (const string& input : passthru_op->inputs) {
       if (IsDiscardableArray(*model, input) && input != main_input_name &&
           CountOpsWithInput(*model, input) == 1) {
-        model->arrays.erase(input);
       }
     }
     RerouteEdges(output_name, main_input_name, model);
@@ -86,13 +94,12 @@ bool RemoveTrivialPassthroughOp(GraphTransformation* transformation,
     for (const string& input : passthru_op->inputs) {
       if (IsDiscardableArray(*model, input) &&
           (input == main_input_name || CountOpsWithInput(*model, input) == 1)) {
-        model->arrays.erase(input);
       }
     }
     RerouteEdges(main_input_name, output_name, model);
   } else {
     transformation->AddMessageF(
-        "Cannot remove %s, neither its nonconstant input nor its output may be "
+        "Cannot remove %s, neither its main input nor its output may be "
         "discarded",
         LogName(*passthru_op));
     return false;
@@ -101,6 +108,26 @@ bool RemoveTrivialPassthroughOp(GraphTransformation* transformation,
   // Remove the pass-through node.
   model->operators.erase(passthru_it);
 
+  // Remove any array that is no longer used.
+  for (const string& removal_candidate : removal_candidates) {
+    bool is_referenced = false;
+    for (const auto& op : model->operators) {
+      for (const string& input : op->inputs) {
+        if (input == removal_candidate) {
+          is_referenced = true;
+        }
+      }
+      for (const string& output : op->outputs) {
+        if (output == removal_candidate) {
+          is_referenced = true;
+        }
+      }
+    }
+    if (!is_referenced) {
+      model->arrays.erase(removal_candidate);
+    }
+  }
+
   return true;
 }
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h
index b72c85c0e577ffe6d53c89bf35236192771efde2..a06181ca0b5f1cbb930fa4295fec3d6adf66440d 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h
@@ -21,10 +21,12 @@ limitations under the License.
 namespace toco {
 
 // A "passthrough op" is an op that satisfies the following conditions:
-//   1. It has at most one non-constant input (it may have other constant
-//   inputs).
+//   1. One of its inputs is (per the semantics of that op) its "main input"
+//      for some notion of "main input" that is operator-specific; for example,
+//      for a Reshape op, the main input is the array being reshaped, not the
+//      other input which gives the new shape.
 //   2. It has exactly one output.
-//   3. It forwards exactly its single non-constant input to its single output.
+//   3. It forwards exactly its main input to its single output.
 //
 // Examples include:
 //   1. TensorFlow Identity ops. (Have one input).
@@ -34,7 +36,7 @@ namespace toco {
 //      where one of its inputs is a constant array filled with zeros.
 //
 // A passthrough op is "trivial" and can be removed when it is possible to
-// discard either its single non-constant input or output array, rerouting any
+// discard either its main input or output array, rerouting any
 // edge involving it to the other of these two arrays.
 //
 // It is only possible to discard such an array if it is not explicitly
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc
index 1f1f1f69488e5ec17f5a1507cf0b01b6d62657b5..e6cca8acf36745d989fb731aa948f257375d7e90 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc
@@ -47,10 +47,7 @@ bool RemoveUnusedOp::Run(Model* model, std::size_t op_index) {
     bool found_output_as_rnn_state_array = false;
     for (const auto& rnn_state : model->flags.rnn_states()) {
       if (output == rnn_state.state_array()) {
-        CHECK(op->type == OperatorType::kTensorFlowUnsupported);
-        CHECK_EQ(static_cast<const TensorFlowUnsupportedOperator*>(op)
-                     ->tensorflow_op,
-                 "Fill");
+        CHECK(op->type == OperatorType::kFill);
         found_output_as_rnn_state_array = true;
         break;
       }
@@ -65,7 +62,12 @@ bool RemoveUnusedOp::Run(Model* model, std::size_t op_index) {
     }
     for (const auto& rnn_state : model->flags.rnn_states()) {
       if (output == rnn_state.back_edge_source_array()) {
-        return false;
+        // The output is consumed by a RNN back-edge..
+        if (!IsDiscardableArray(*model, rnn_state.back_edge_source_array()) ||
+            !IsDiscardableArray(*model, rnn_state.state_array()) ||
+            CountOpsWithInput(*model, rnn_state.state_array())) {
+          return false;
+        }
       }
     }
     if (CountOpsWithInput(*model, output)) {
@@ -88,7 +90,8 @@ bool RemoveUnusedOp::Run(Model* model, std::size_t op_index) {
   // Remove any input array that is not used by anything else,
   // and that is not the output of some other operator.
   for (const auto& input : op->inputs) {
-    if (CountOpsWithInput(*model, input) == 1 &&
+    if (IsDiscardableArray(*model, input) &&
+        CountOpsWithInput(*model, input) == 1 &&
         !GetOpWithOutput(*model, input)) {
       model->arrays.erase(input);
     }
@@ -98,7 +101,7 @@ bool RemoveUnusedOp::Run(Model* model, std::size_t op_index) {
   for (const auto& output : op->outputs) {
     // If the output array is the model's input array, don't remove that.
     // That's the case when cropping a model at a given --input_array.
-    if (IsInputArray(*model, output)) {
+    if (!IsDiscardableArray(*model, output)) {
       continue;
     }
     // Likewise, if the output array is a RNN state array, don't remove that.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_to_space_nd_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_to_space_nd_attributes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a4f198e92f8d5e4ef9d43d84ab3aa63b6e1ac5f7
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_to_space_nd_attributes.cc
@@ -0,0 +1,70 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ResolveBatchToSpaceNDAttributes::Run(Model* model, std::size_t op_index) {
+  const auto op_it = model->operators.begin() + op_index;
+  if (op_it->get()->type != OperatorType::kBatchToSpaceND) return false;
+
+  auto* op = static_cast<BatchToSpaceNDOperator*>(op_it->get());
+
+  // The attributes are resolved only when the 3 attributes (block_shape,
+  // before_crops, after_crops) are all constant.
+  if (!op->block_shape.empty()) {
+    return false;
+  }
+
+  CHECK_EQ(op->inputs.size(), 3);
+  if (!IsConstantParameterArray(*model, op->inputs[1]) or
+      !IsConstantParameterArray(*model, op->inputs[2]))
+    return false;
+
+  // Handling block_shape.
+  const auto& block_shape_array = *model->arrays[op->inputs[1]];
+  if (!block_shape_array.has_shape()) return false;
+  const std::vector<int>& block_shape_dims = block_shape_array.shape().dims();
+  CHECK_EQ(block_shape_dims.size(), 1);
+  std::vector<int> block_shape_buffer =
+      block_shape_array.GetBuffer<ArrayDataType::kInt32>().data;
+  for (int i = 0; i < block_shape_dims[0]; ++i) {
+    op->block_shape.push_back(block_shape_buffer[i]);
+  }
+
+  // Handling crops.
+  const auto& crops_array = *model->arrays[op->inputs[2]];
+  if (!crops_array.has_shape()) return false;
+  const std::vector<int>& crops_dims = crops_array.shape().dims();
+  CHECK_EQ(crops_dims.size(), 2);
+  std::vector<int> crops_buffer =
+      crops_array.GetBuffer<ArrayDataType::kInt32>().data;
+  for (int i = 0; i < crops_dims[0]; ++i) {
+    op->before_crops.push_back(crops_buffer[i * 2]);
+    op->after_crops.push_back(crops_buffer[i * 2 + 1]);
+  }
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
index 0983c438498fed28903f8facf8db239ec1a7c2c4..86033275a0fa3d93aafba13faf6efa21d7a9814f 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
@@ -151,7 +151,7 @@ bool ResolveConstantConcatenation::Run(Model* model, std::size_t op_index) {
     if (!IsDiscardableArray(*model, input_name)) return false;
   }
 
-  const int concatenation_axis = concat_op->concat_dim;
+  const int concatenation_axis = concat_op->axis;
 
   CHECK_EQ(concat_op->outputs.size(), 1);
   string concatenated_array_name = concat_op->outputs[0];
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
index bb9bda3c82cc9e9d3526efdabbb2c478fb172d80..26ff9d887b40651559ad030cd41a824679d6dd15 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
@@ -32,7 +32,9 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
   const auto unary_it = model->operators.begin() + op_index;
   const auto* unary_op = unary_it->get();
   // Test for unary ops of types that we know how to resolve
-  if (unary_op->type != OperatorType::kTensorFlowRsqrt &&
+  if (unary_op->type != OperatorType::kCast &&
+      unary_op->type != OperatorType::kNeg &&
+      unary_op->type != OperatorType::kTensorFlowRsqrt &&
       unary_op->type != OperatorType::kTensorFlowSqrt &&
       unary_op->type != OperatorType::kTensorFlowSquare &&
       unary_op->type != OperatorType::kTensorFlowSum &&
@@ -56,6 +58,12 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
     }
   }
 
+  auto& output_array = model->GetArray(unary_op->outputs[0]);
+  if (!output_array.has_shape()) {
+    // Yield until the output array dims have been resolved.
+    return false;
+  }
+
   // At the moment we don't want to care about fused activation functions.
   // The idea is that we should do the present constants-propagation before
   // activation functions get fused.
@@ -67,48 +75,76 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
         LogName(*unary_op));
     return false;
   }
+
   const auto& input_array = model->GetArray(unary_op->inputs[0]);
   // We have already tested above for existence of buffers (synonymous to being
   // a constant param).
   CHECK(input_array.buffer);
-  // At the moment we only support float buffers.
-  if (input_array.buffer->type != ArrayDataType::kFloat) {
-    return false;
-  }
-  const auto& input_float_data =
-      input_array.GetBuffer<ArrayDataType::kFloat>().data;
-  // Create the float buffer on the output array, effectively turning it into
-  // a constant parameter
-  const auto& output_name = unary_op->outputs[0];
-  auto& output_array = model->GetArray(output_name);
-  // Yield until the output array dims have been resolved.
-  if (!output_array.has_shape()) {
-    return false;
+  std::vector<DataType<ArrayDataType::kFloat>> const* input_float_data;
+  if (unary_op->type == OperatorType::kCast) {
+    CastOperator const* cast_op = static_cast<CastOperator const*>(unary_op);
+    if (cast_op->dst_data_type != ArrayDataType::kFloat) {
+      AddMessageF(
+          "Not resolving constant %s because we currently only support casting "
+          "to float",
+          LogName(*unary_op));
+      return false;
+    }
+    if (cast_op->src_data_type != input_array.buffer->type) {
+      AddMessageF(
+          "Not resolving constant %s because cast op source type does not "
+          "match input type",
+          LogName(*unary_op));
+    }
+  } else {
+    if (input_array.buffer->type != ArrayDataType::kFloat) {
+      return false;
+    }
+    input_float_data = &(input_array.GetBuffer<ArrayDataType::kFloat>().data);
   }
 
-  int input_buffer_size = RequiredBufferSizeForShape(input_array.shape());
-  int output_buffer_size = RequiredBufferSizeForShape(output_array.shape());
-  const Shape& input_shape = input_array.shape();
+  // Create a float buffer on the output array, which are always constant.
   const Shape& output_shape = output_array.shape();
-
+  const int output_dims_count = output_shape.dimensions_count();
+  const int output_buffer_size = RequiredBufferSizeForShape(output_shape);
   auto& output_float_data =
       output_array.GetMutableBuffer<ArrayDataType::kFloat>().data;
   output_float_data.resize(output_buffer_size);
 
-  const int output_dims_count = output_shape.dimensions_count();
-  if (unary_op->type == OperatorType::kTensorFlowReshape) {
+  const Shape& input_shape = input_array.shape();
+  const int input_buffer_size = RequiredBufferSizeForShape(input_shape);
+  if (unary_op->type == OperatorType::kCast) {
+    for (int i = 0; i < output_buffer_size; i++) {
+      float outval = 0.0f;
+      if (input_array.buffer->type == ArrayDataType::kFloat) {
+        outval = static_cast<float>(
+            input_array.GetBuffer<ArrayDataType::kFloat>().data[i]);
+      } else if (input_array.buffer->type == ArrayDataType::kUint8) {
+        outval = static_cast<float>(
+            input_array.GetBuffer<ArrayDataType::kUint8>().data[i]);
+      } else if (input_array.buffer->type == ArrayDataType::kInt32) {
+        outval = static_cast<float>(
+            input_array.GetBuffer<ArrayDataType::kInt32>().data[i]);
+      } else if (input_array.buffer->type == ArrayDataType::kInt64) {
+        outval = static_cast<float>(
+            input_array.GetBuffer<ArrayDataType::kInt64>().data[i]);
+      } else {
+        LOG(FATAL) << "Unsupported cast op input type";
+      }
+      output_float_data[i] = outval;
+    }
+  } else if (unary_op->type == OperatorType::kTensorFlowReshape) {
     CHECK(input_buffer_size == output_buffer_size);
-    memcpy(output_float_data.data(), input_float_data.data(),
-           input_buffer_size * sizeof(input_float_data[0]));
+    memcpy(output_float_data.data(), (*input_float_data).data(),
+           output_buffer_size * sizeof(output_float_data[0]));
   } else if (unary_op->type == OperatorType::kTensorFlowSum) {
     // At the moment only full reduction across all dimensions is supported.
     for (int i = 0; i < output_dims_count; i++) {
       CHECK_EQ(output_shape.dims(i), 1);
     }
     float sum = 0.f;
-    const int input_size = RequiredBufferSizeForShape(input_shape);
-    for (int i = 0; i < input_size; i++) {
-      sum += input_float_data[i];
+    for (int i = 0; i < input_buffer_size; i++) {
+      sum += (*input_float_data)[i];
     }
     output_float_data[0] = sum;
   } else if (unary_op->type == OperatorType::kTensorFlowMin) {
@@ -117,10 +153,9 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
     for (int i = 0; i < output_dims_count; i++) {
       CHECK_EQ(output_shape.dims(i), 1);
     }
-    float min = input_float_data[0];
-    const int input_size = RequiredBufferSizeForShape(input_shape);
-    for (int i = 0; i < input_size; i++) {
-      min = std::min(min, input_float_data[i]);
+    float min = (*input_float_data)[0];
+    for (int i = 0; i < input_buffer_size; i++) {
+      min = std::min(min, (*input_float_data)[i]);
     }
     output_float_data[0] = min;
   } else if (unary_op->type == OperatorType::kTensorFlowMax) {
@@ -129,25 +164,26 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
     for (int i = 0; i < output_dims_count; i++) {
       CHECK_EQ(output_shape.dims(i), 1);
     }
-    float max = input_float_data[0];
-    const int input_size = RequiredBufferSizeForShape(input_shape);
-    for (int i = 0; i < input_size; i++) {
-      max = std::max(max, input_float_data[i]);
+    float max = (*input_float_data)[0];
+    for (int i = 0; i < input_buffer_size; i++) {
+      max = std::max(max, (*input_float_data)[i]);
     }
     output_float_data[0] = max;
-  } else if (unary_op->type == OperatorType::kTensorFlowRsqrt ||
+  } else if (unary_op->type == OperatorType::kNeg ||
+             unary_op->type == OperatorType::kTensorFlowRsqrt ||
              unary_op->type == OperatorType::kTensorFlowSqrt ||
              unary_op->type == OperatorType::kTensorFlowSquare) {
     // Element-wise ops. Should have perfectly matching sizes here.
-    const int input_size = RequiredBufferSizeForShape(input_shape);
     for (int i = 0; i < output_dims_count; i++) {
       CHECK_EQ(output_shape.dims(i), input_shape.dims(i));
     }
 
-    for (int i = 0; i < input_size; i++) {
-      const float val = input_float_data[i];
+    for (int i = 0; i < output_buffer_size; i++) {
+      const float val = (*input_float_data)[i];
       float outval = 0.f;
-      if (unary_op->type == OperatorType::kTensorFlowRsqrt) {
+      if (unary_op->type == OperatorType::kNeg) {
+        outval = -val;
+      } else if (unary_op->type == OperatorType::kTensorFlowRsqrt) {
         outval = 1.0f / std::sqrt(val);
       } else if (unary_op->type == OperatorType::kTensorFlowSqrt) {
         outval = std::sqrt(val);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_mean_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_mean_attributes.cc
index d25c773f195cea407251bf046f0b1f1924e01968..444f59d14bc916e306186eaa031f04493af63bb8 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_mean_attributes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_mean_attributes.cc
@@ -29,22 +29,26 @@ bool ResolveMeanAttributes::Run(Model* model, std::size_t op_index) {
   if (mean_op->type != OperatorType::kMean) return false;
   auto* op = static_cast<MeanOperator*>(mean_op);
 
-  if (!op->reduction_indices.empty()) return false;
+  if (!op->axis.empty()) {
+    // Attributes already resolved
+    return false;
+  }
   if (op->inputs.size() != 2) return false;
   if (!IsConstantParameterArray(*model, op->inputs[1])) return false;
 
   const auto& indices_array = *model->arrays[op->inputs[1]];
   if (!indices_array.has_shape()) return false;
 
-  op->reduction_indices = indices_array.GetBuffer<ArrayDataType::kInt32>().data;
-
-  // At the moment, we only support simultaneous reduction over width and
-  // height. This is mainly limited by the fact that currently, the runtime
-  // arrays are always 4-dimensional.
-  CHECK_EQ(op->reduction_indices.size(), 2);
-  CHECK((op->reduction_indices[0] == 1 && op->reduction_indices[1] == 2) ||
-        (op->reduction_indices[0] == 2 && op->reduction_indices[1] == 1));
+  // We only support simultaneous reduction over width and height.
+  std::vector<int> axis = indices_array.GetBuffer<ArrayDataType::kInt32>().data;
+  if (axis.size() != 2) {
+    return false;
+  }
+  if (!((axis[0] == 1 && axis[1] == 2) || (axis[0] == 2 && axis[1] == 1))) {
+    return false;
+  }
 
+  op->axis = axis;
   return true;
 }
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc
index 5fc3b25bc12b0644ce2fcd3f7ee5e793791d54d5..97946182ef07b0c3d826cafa95b2bb47fbaf0125 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc
@@ -39,6 +39,10 @@ bool ResolveStridedSliceAttributes::Run(Model* model, std::size_t op_index) {
 
   const auto& start_array = *model->arrays[op->inputs[1]];
   if (!start_array.has_shape()) return false;
+  if (toco::RequiredBufferSizeForShape(start_array.shape()) != 4) {
+    // Only 4D arrays are supported for now.
+    return false;
+  }
 
   const auto& stop_array = *model->arrays[op->inputs[2]];
   if (!stop_array.has_shape()) return false;
@@ -50,11 +54,6 @@ bool ResolveStridedSliceAttributes::Run(Model* model, std::size_t op_index) {
   op->stop_indices = stop_array.GetBuffer<ArrayDataType::kInt32>().data;
   op->strides = stride_array.GetBuffer<ArrayDataType::kInt32>().data;
 
-  // Only 4D arrays are supported for now.
-  CHECK_EQ(op->start_indices.size(), 4);
-  CHECK_EQ(op->stop_indices.size(), 4);
-  CHECK_EQ(op->strides.size(), 4);
-
   // TODO(dkalenichenko): Delete the extra inputs?
 
   return true;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_concat.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_concat.cc
index b482f5cf51f7bde67e76792439203487402b75ce..c6723a880ed0e51cc5828f77742a6c8eb70fa864 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_concat.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_concat.cc
@@ -35,37 +35,36 @@ bool ResolveTensorFlowConcat::Run(Model* model, std::size_t op_index) {
 
   CHECK_GE(tf_concat_op->inputs.size(), 2);
   // TensorFlow Concat and ConcatV2 nodes only differ by the ordering
-  // of inputs: in Concat, the concat_dim is the first input, while in
+  // of inputs: in Concat,the axis is the first input, while in
   // ConcatV2, it is the last input.
-  std::size_t concat_dim_pos = 0;
+  std::size_t axis_pos = 0;
   if (tf_concat_op->type == OperatorType::kTensorFlowConcatV2) {
-    concat_dim_pos = tf_concat_op->inputs.size() - 1;
+    axis_pos = tf_concat_op->inputs.size() - 1;
   }
-  const string concat_dim_name = tf_concat_op->inputs[concat_dim_pos];
+  const string axis_name = tf_concat_op->inputs[axis_pos];
   std::vector<string> concat_input_names;
   for (std::size_t i = 0; i < tf_concat_op->inputs.size(); i++) {
-    if (i != concat_dim_pos) {
+    if (i != axis_pos) {
       concat_input_names.push_back(tf_concat_op->inputs[i]);
     }
   }
-  // If the concat_dim array hasn't been resolved to a constant yet,
+  // If the axis array hasn't been resolved to a constant yet,
   // we need to yield.
-  const auto& concat_dim_array = model->GetArray(concat_dim_name);
-  if (!concat_dim_array.buffer) {
-    AddMessageF("Waiting for the concat_dim of %s to be resolved to a constant",
+  const auto& axis_array = model->GetArray(axis_name);
+  if (!axis_array.buffer) {
+    AddMessageF("Waiting for the axis of %s to be resolved to a constant",
                 LogName(*tf_concat_op));
     return false;
   }
 
-  CHECK(concat_dim_array.data_type == ArrayDataType::kInt32);
-  const auto& concat_dim_data =
-      concat_dim_array.GetBuffer<ArrayDataType::kInt32>().data;
-  CHECK_EQ(concat_dim_data.size(), 1);
-  const int concat_dim = concat_dim_data[0];
+  CHECK(axis_array.data_type == ArrayDataType::kInt32);
+  const auto& axis_data = axis_array.GetBuffer<ArrayDataType::kInt32>().data;
+  CHECK_EQ(axis_data.size(), 1);
+  const int axis = axis_data[0];
 
   // Create the Concatenation op replacing the TensorFlowConcat op.
   auto* concatenation_op = new ConcatenationOperator;
-  concatenation_op->concat_dim = concat_dim;
+  concatenation_op->axis = axis;
   concatenation_op->inputs = concat_input_names;
   concatenation_op->outputs = {tf_concat_op->outputs[0]};
   auto depth_concat_it = model->operators.emplace(concat_it, concatenation_op);
@@ -74,9 +73,9 @@ bool ResolveTensorFlowConcat::Run(Model* model, std::size_t op_index) {
   concat_it = depth_concat_it + 1;
   CHECK_EQ(concat_it->get(), tf_concat_op);
 
-  // Remove the concat_dim array if it is not used by anything else.
-  if (CountOpsWithInput(*model, concat_dim_name) == 1) {
-    model->arrays.erase(concat_dim_name);
+  // Remove the axis array if it is not used by anything else.
+  if (CountOpsWithInput(*model, axis_name) == 1) {
+    model->arrays.erase(axis_name);
   }
   // Remove the TensorFlowConcat op
   model->operators.erase(concat_it);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc
index 55adfca03739deb35cbeb50c67222768f8a02164..150cf53da3099227c5c637ee58c44512d5a41d4f 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc
@@ -59,7 +59,7 @@ bool ResolveTensorFlowSwitch::Run(Model* model, std::size_t op_index) {
   // From the TensorFlow docs on .switch() in
   // third_party/tensorflow/python/ops/control_flow_ops.py
   //
-  //    If `pred` is false, the `data` input is forwared to the first output.
+  //    If `pred` is false, the `data` input is forwarded to the first output.
   //    Otherwise, the data goes to the second output.
   //
   // Note that this comment used to say the opposite and was recently fixed:
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc b/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc
index c6705ad305ac85f7098f40469ebc54fc6fa1b3ab..a14016e8e2705a66c392118899335eb3997fa1de 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc
@@ -109,7 +109,7 @@ class ResolveConstantConcatenationTest : public ::testing::Test {
   // Prepare a hypothetical TOCO model with one Concatenation operator in it
   // together with 4 arrays as its inputs.
   // It receives the dimension of concatenation as input.
-  void PrepareModel(Model* model, int concat_dim) {
+  void PrepareModel(Model* model, int axis) {
     std::vector<string> concat_input_names = {"array0", "array1", "array2",
                                               "array3"};
 
@@ -142,7 +142,7 @@ class ResolveConstantConcatenationTest : public ::testing::Test {
       cnt++;
     }
     auto* concatenation_op = new ConcatenationOperator;
-    concatenation_op->concat_dim = concat_dim;
+    concatenation_op->axis = axis;
     concatenation_op->inputs = concat_input_names;
     concatenation_op->outputs = {"concat_op_outputs"};
     Array& out_array = model->GetOrCreateArray(concatenation_op->outputs[0]);
@@ -151,7 +151,7 @@ class ResolveConstantConcatenationTest : public ::testing::Test {
     std::vector<int>* out_array_shape_dim = out_array_shape->mutable_dims();
     out_array_shape_dim->resize(kDim);
     for (int i = 0; i < kDim; i++) {
-      if (i == concat_dim) {
+      if (i == axis) {
         (*out_array_shape_dim)[i] = kNumArrays * kElementPerDim;
       } else {
         (*out_array_shape_dim)[i] = kElementPerDim;
@@ -163,8 +163,8 @@ class ResolveConstantConcatenationTest : public ::testing::Test {
 
 TEST_F(ResolveConstantConcatenationTest, ConcatAtAxis0) {
   Model model;
-  const int concat_dim = 0;
-  PrepareModel(&model, concat_dim);
+  const int axis = 0;
+  PrepareModel(&model, axis);
 
   GraphTransformationsSet graph_transformation_set;
   graph_transformation_set.Add(new toco::ResolveConstantConcatenation);
@@ -182,8 +182,8 @@ TEST_F(ResolveConstantConcatenationTest, ConcatAtAxis0) {
 
 TEST_F(ResolveConstantConcatenationTest, ConcatAtAxis1) {
   Model model;
-  const int concat_dim = 1;
-  PrepareModel(&model, concat_dim);
+  const int axis = 1;
+  PrepareModel(&model, axis);
 
   GraphTransformationsSet graph_transformation_set;
   graph_transformation_set.Add(new toco::ResolveConstantConcatenation);
@@ -201,8 +201,8 @@ TEST_F(ResolveConstantConcatenationTest, ConcatAtAxis1) {
 
 TEST_F(ResolveConstantConcatenationTest, ConcatAtAxis2) {
   Model model;
-  const int concat_dim = 2;
-  PrepareModel(&model, concat_dim);
+  const int axis = 2;
+  PrepareModel(&model, axis);
 
   GraphTransformationsSet graph_transformation_set;
   graph_transformation_set.Add(new toco::ResolveConstantConcatenation);
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index c889149ada395697cbc574f747e6d186fb1e75c6..31eee12ffca84cc74aae7be48f69ea22bd1e5395 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/contrib/lite/toco/import_tensorflow.h"
+
 #include <memory>
 #include <string>
 #include <utility>
@@ -19,6 +21,7 @@ limitations under the License.
 
 #include "google/protobuf/map.h"
 #include "google/protobuf/text_format.h"
+#include "absl/strings/match.h"
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_split.h"
@@ -233,14 +236,14 @@ void ImportInt64Array(const TensorProto& input_tensor, Array* output_array) {
   }
 }
 
-// Count the number of inputs of a given node. If `drop_control_dependency` is
-// true, count the number of non-control-dependency inputs.
-size_t GetInputsCount(const NodeDef& node, bool drop_control_dependency) {
-  if (drop_control_dependency) {
+// Count the number of inputs of a given node. If
+// `tf_import_flags.drop_control_dependency` is true, count the number of
+// non-control-dependency inputs.
+int GetInputsCount(const NodeDef& node,
+                   const TensorFlowImportFlags& tf_import_flags) {
+  if (tf_import_flags.drop_control_dependency) {
     for (size_t i = 0; i < node.input_size(); ++i) {
       if (node.input(i)[0] == '^') {
-        LOG(INFO) << "Reached first control dependency input: "
-                  << node.input(i);
         return i;
       }
     }
@@ -250,7 +253,9 @@ size_t GetInputsCount(const NodeDef& node, bool drop_control_dependency) {
   }
 }
 
-void ConvertConstOperator(const NodeDef& node, Model* model) {
+void ConvertConstOperator(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
   CHECK_EQ(node.op(), "Const");
   const auto& tensor = GetTensorAttr(node, "value");
   const auto dtype = GetDataTypeAttr(node, "dtype");
@@ -276,9 +281,11 @@ void ConvertConstOperator(const NodeDef& node, Model* model) {
   }
 }
 
-void ConvertConvOperator(const NodeDef& node, Model* model) {
+void ConvertConvOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
   CHECK_EQ(node.op(), "Conv2D");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 2);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
 
   // We only support NHWC, which is the default data_format.
   // So if data_format is not defined, we're all good.
@@ -327,9 +334,11 @@ void ConvertConvOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(conv);
 }
 
-void ConvertDepthwiseConvOperator(const NodeDef& node, Model* model) {
+void ConvertDepthwiseConvOperator(const NodeDef& node,
+                                  const TensorFlowImportFlags& tf_import_flags,
+                                  Model* model) {
   CHECK_EQ(node.op(), "DepthwiseConv2dNative");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 2);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
 
   // We only support NHWC, which is the default data_format.
   // So if data_format is not defined, we're all good.
@@ -378,9 +387,11 @@ void ConvertDepthwiseConvOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(conv);
 }
 
-void ConvertDepthToSpaceOperator(const NodeDef& node, Model* model) {
+void ConvertDepthToSpaceOperator(const NodeDef& node,
+                                 const TensorFlowImportFlags& tf_import_flags,
+                                 Model* model) {
   CHECK_EQ(node.op(), "DepthToSpace");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 1);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
   CHECK_EQ(GetDataTypeAttr(node, "T"), DT_FLOAT);
   auto* op = new DepthToSpaceOperator;
   op->inputs.push_back(node.input(0));
@@ -390,9 +401,11 @@ void ConvertDepthToSpaceOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertSpaceToDepthOperator(const NodeDef& node, Model* model) {
+void ConvertSpaceToDepthOperator(const NodeDef& node,
+                                 const TensorFlowImportFlags& tf_import_flags,
+                                 Model* model) {
   CHECK_EQ(node.op(), "SpaceToDepth");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 1);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
   CHECK_EQ(GetDataTypeAttr(node, "T"), DT_FLOAT);
   auto* op = new SpaceToDepthOperator;
   op->inputs.push_back(node.input(0));
@@ -402,9 +415,11 @@ void ConvertSpaceToDepthOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertBiasAddOperator(const NodeDef& node, Model* model) {
+void ConvertBiasAddOperator(const NodeDef& node,
+                            const TensorFlowImportFlags& tf_import_flags,
+                            Model* model) {
   CHECK_EQ(node.op(), "BiasAdd");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 2);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
   const auto& input_name = node.input(0);
   const auto& bias_name = node.input(1);
   CHECK_EQ(GetDataTypeAttr(node, "T"), DT_FLOAT);
@@ -415,9 +430,11 @@ void ConvertBiasAddOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(biasadd);
 }
 
-void ConvertReluOperator(const NodeDef& node, Model* model) {
+void ConvertReluOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
   CHECK_EQ(node.op(), "Relu");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 1);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
   const auto& input_name = node.input(0);
   auto* relu = new ReluOperator;
   relu->inputs.push_back(input_name);
@@ -425,9 +442,11 @@ void ConvertReluOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(relu);
 }
 
-void ConvertRelu6Operator(const NodeDef& node, Model* model) {
+void ConvertRelu6Operator(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
   CHECK_EQ(node.op(), "Relu6");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 1);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
   const auto& input_name = node.input(0);
   auto* op = new Relu6Operator;
   op->inputs.push_back(input_name);
@@ -435,9 +454,11 @@ void ConvertRelu6Operator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertLogisticOperator(const NodeDef& node, Model* model) {
+void ConvertLogisticOperator(const NodeDef& node,
+                             const TensorFlowImportFlags& tf_import_flags,
+                             Model* model) {
   CHECK_EQ(node.op(), "Sigmoid");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 1);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
   const auto& input_name = node.input(0);
   auto* op = new LogisticOperator;
   op->inputs.push_back(input_name);
@@ -445,9 +466,11 @@ void ConvertLogisticOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertTanhOperator(const NodeDef& node, Model* model) {
+void ConvertTanhOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
   CHECK_EQ(node.op(), "Tanh");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 1);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
   const auto& input_name = node.input(0);
   auto* op = new TanhOperator;
   op->inputs.push_back(input_name);
@@ -455,9 +478,11 @@ void ConvertTanhOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertDivOperator(const NodeDef& node, Model* model) {
+void ConvertDivOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
   CHECK(node.op() == "Div" || node.op() == "RealDiv");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 2);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
   auto* op = new DivOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -465,9 +490,11 @@ void ConvertDivOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertIdentityOperator(const NodeDef& node, Model* model) {
+void ConvertIdentityOperator(const NodeDef& node,
+                             const TensorFlowImportFlags& tf_import_flags,
+                             Model* model) {
   CHECK(node.op() == "Identity" || node.op() == "CheckNumerics" ||
-        node.op() == "PlaceholderWithDefault");
+        node.op() == "PlaceholderWithDefault" || node.op() == "StopGradient");
   auto* op = new TensorFlowIdentityOperator;
   // Amazingly, some TensorFlow graphs (at least rajeev_lstm.pb) have
   // identity nodes with multiple inputs, but the other inputs seem
@@ -481,9 +508,11 @@ void ConvertIdentityOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertFakeQuantWithMinMaxArgs(const NodeDef& node, Model* model) {
+void ConvertFakeQuantWithMinMaxArgs(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
   CHECK_EQ(node.op(), "FakeQuantWithMinMaxArgs");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 1);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
   auto* op = new FakeQuantOperator;
   op->inputs.push_back(node.input(0));
   op->minmax.reset(new MinMax);
@@ -494,10 +523,11 @@ void ConvertFakeQuantWithMinMaxArgs(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertFakeQuantWithMinMaxVars(const NodeDef& node, Model* model) {
+void ConvertFakeQuantWithMinMaxVars(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
   CHECK_EQ(node.op(), "FakeQuantWithMinMaxVars");
-  const int num_inputs =
-      GetInputsCount(node, model->flags.drop_control_dependency());
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
   CHECK(num_inputs == 3 || num_inputs == 4);
   auto* op = new FakeQuantOperator;
   for (int i = 0; i < 3; i++) {
@@ -507,27 +537,44 @@ void ConvertFakeQuantWithMinMaxVars(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertRsqrtOperator(const NodeDef& node, Model* model) {
+void ConvertNegOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
+  CHECK_EQ(node.op(), "Neg");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  auto* op = new NegOperator;
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertRsqrtOperator(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
   CHECK_EQ(node.op(), "Rsqrt");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 1);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
   auto* op = new TensorFlowRsqrtOperator;
   op->inputs.push_back(node.input(0));
   op->outputs.push_back(node.name());
   model->operators.emplace_back(op);
 }
 
-void ConvertSqrtOperator(const NodeDef& node, Model* model) {
+void ConvertSqrtOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
   CHECK_EQ(node.op(), "Sqrt");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 1);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
   auto* op = new TensorFlowSqrtOperator;
   op->inputs.push_back(node.input(0));
   op->outputs.push_back(node.name());
   model->operators.emplace_back(op);
 }
 
-void ConvertSqueezeOperator(const NodeDef& node, Model* model) {
+void ConvertSqueezeOperator(const NodeDef& node,
+                            const TensorFlowImportFlags& tf_import_flags,
+                            Model* model) {
   CHECK_EQ(node.op(), "Squeeze");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 1);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
   auto* op = new SqueezeOperator;
   op->inputs.push_back(node.input(0));
   op->outputs.push_back(node.name());
@@ -540,18 +587,22 @@ void ConvertSqueezeOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertSquareOperator(const NodeDef& node, Model* model) {
+void ConvertSquareOperator(const NodeDef& node,
+                           const TensorFlowImportFlags& tf_import_flags,
+                           Model* model) {
   CHECK_EQ(node.op(), "Square");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 1);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
   auto* op = new TensorFlowSquareOperator;
   op->inputs.push_back(node.input(0));
   op->outputs.push_back(node.name());
   model->operators.emplace_back(op);
 }
 
-void ConvertAddOperator(const NodeDef& node, Model* model) {
+void ConvertAddOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
   CHECK_EQ(node.op(), "Add");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 2);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
   auto* op = new AddOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -559,9 +610,11 @@ void ConvertAddOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertMulOperator(const NodeDef& node, Model* model) {
+void ConvertMulOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
   CHECK_EQ(node.op(), "Mul");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 2);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
   auto* op = new MulOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -569,9 +622,11 @@ void ConvertMulOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertSubOperator(const NodeDef& node, Model* model) {
+void ConvertSubOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
   CHECK_EQ(node.op(), "Sub");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 2);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
   auto* op = new SubOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -579,19 +634,26 @@ void ConvertSubOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertSumOperator(const NodeDef& node, Model* model) {
+void ConvertSumOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
   CHECK_EQ(node.op(), "Sum");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 2);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
   auto* op = new TensorFlowSumOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
   op->outputs.push_back(node.name());
   model->operators.emplace_back(op);
+  if (HasAttr(node, "keep_dims")) {
+    op->keep_dims = GetBoolAttr(node, "keep_dims");
+  }
 }
 
-void ConvertTileOperator(const NodeDef& node, Model* model) {
+void ConvertTileOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
   CHECK_EQ(node.op(), "Tile");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 2);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
   auto* op = new TensorFlowTileOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -599,9 +661,11 @@ void ConvertTileOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertSliceOperator(const NodeDef& node, Model* model) {
+void ConvertSliceOperator(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
   CHECK_EQ(node.op(), "Slice");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 3);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 3);
   auto* op = new SliceOperator;
   for (int i = 0; i < 3; ++i) {
     op->inputs.push_back(node.input(i));
@@ -610,9 +674,11 @@ void ConvertSliceOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertPadOperator(const NodeDef& node, Model* model) {
+void ConvertPadOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
   CHECK_EQ(node.op(), "Pad");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 2);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
   auto* op = new PadOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -620,18 +686,22 @@ void ConvertPadOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertShapeOperator(const NodeDef& node, Model* model) {
+void ConvertShapeOperator(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
   CHECK_EQ(node.op(), "Shape");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 1);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
   auto* op = new TensorFlowShapeOperator;
   op->inputs.push_back(node.input(0));
   op->outputs.push_back(node.name());
   model->operators.emplace_back(op);
 }
 
-void ConvertSplitOperator(const NodeDef& node, Model* model) {
+void ConvertSplitOperator(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
   CHECK_EQ(node.op(), "Split");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 2);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
   auto* op = new TensorFlowSplitOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -644,9 +714,11 @@ void ConvertSplitOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertMergeOperator(const NodeDef& node, Model* model) {
+void ConvertMergeOperator(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
   CHECK_EQ(node.op(), "Merge");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 2);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
   auto* op = new TensorFlowMergeOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -654,9 +726,11 @@ void ConvertMergeOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertSwitchOperator(const NodeDef& node, Model* model) {
+void ConvertSwitchOperator(const NodeDef& node,
+                           const TensorFlowImportFlags& tf_import_flags,
+                           Model* model) {
   CHECK_EQ(node.op(), "Switch");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 2);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
   auto* op = new TensorFlowSwitchOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -665,9 +739,11 @@ void ConvertSwitchOperator(const NodeDef& node, Model* model) {
   op->outputs.push_back(node.name() + ":1");
   model->operators.emplace_back(op);
 }
-void ConvertSoftmaxOperator(const NodeDef& node, Model* model) {
+void ConvertSoftmaxOperator(const NodeDef& node,
+                            const TensorFlowImportFlags& tf_import_flags,
+                            Model* model) {
   CHECK_EQ(node.op(), "Softmax");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 1);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
   const auto& input_name = node.input(0);
   auto* softmax = new SoftmaxOperator;
   softmax->inputs.push_back(input_name);
@@ -678,9 +754,11 @@ void ConvertSoftmaxOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(softmax);
 }
 
-void ConvertLRNOperator(const NodeDef& node, Model* model) {
+void ConvertLRNOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
   CHECK_EQ(node.op(), "LRN");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 1);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
   const auto& input_name = node.input(0);
   auto* lrn = new LocalResponseNormalizationOperator;
   lrn->inputs.push_back(input_name);
@@ -692,10 +770,17 @@ void ConvertLRNOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(lrn);
 }
 
-void ConvertMaxPoolOperator(const NodeDef& node, Model* model) {
+void ConvertMaxPoolOperator(const NodeDef& node,
+                            const TensorFlowImportFlags& tf_import_flags,
+                            Model* model) {
   CHECK_EQ(node.op(), "MaxPool");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 1);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
   const auto& input_name = node.input(0);
+  // We only support NHWC, which is the default data_format.
+  // So if data_format is not defined, we're all good.
+  if (node.attr().count("data_format")) {
+    CHECK_EQ(GetStringAttr(node, "data_format"), "NHWC");
+  }
   if (HasAttr(node, "T")) {
     CHECK_EQ(GetDataTypeAttr(node, "T"), DT_FLOAT);
   } else {
@@ -727,10 +812,17 @@ void ConvertMaxPoolOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(maxpool);
 }
 
-void ConvertAvgPoolOperator(const NodeDef& node, Model* model) {
+void ConvertAvgPoolOperator(const NodeDef& node,
+                            const TensorFlowImportFlags& tf_import_flags,
+                            Model* model) {
   CHECK_EQ(node.op(), "AvgPool");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 1);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
   const auto& input_name = node.input(0);
+  // We only support NHWC, which is the default data_format.
+  // So if data_format is not defined, we're all good.
+  if (node.attr().count("data_format")) {
+    CHECK_EQ(GetStringAttr(node, "data_format"), "NHWC");
+  }
   CHECK_EQ(GetDataTypeAttr(node, "T"), DT_FLOAT);
   auto* avgpool = new AveragePoolOperator;
   avgpool->inputs.push_back(input_name);
@@ -758,9 +850,11 @@ void ConvertAvgPoolOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(avgpool);
 }
 
-void ConvertReshapeOperator(const NodeDef& node, Model* model) {
+void ConvertReshapeOperator(const NodeDef& node,
+                            const TensorFlowImportFlags& tf_import_flags,
+                            Model* model) {
   CHECK_EQ(node.op(), "Reshape");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 2);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
   auto* op = new TensorFlowReshapeOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -768,13 +862,27 @@ void ConvertReshapeOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertMatMulOperator(const NodeDef& node, Model* model) {
-  CHECK_EQ(node.op(), "MatMul");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 2);
-  // Transpose flags should be easy to support, but we don't have a
-  // GraphDef with them to test on at the moment.
-  CHECK_EQ(GetBoolAttr(node, "transpose_a"), false);
-  CHECK_EQ(GetBoolAttr(node, "transpose_b"), false);
+void ConvertMatMulOperator(const NodeDef& node,
+                           const TensorFlowImportFlags& tf_import_flags,
+                           Model* model) {
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  if (node.op() == "MatMul") {
+    // Transpose flags should be easy to support, but we don't have a
+    // GraphDef with them to test on at the moment.
+    CHECK_EQ(GetBoolAttr(node, "transpose_a"), false);
+    CHECK_EQ(GetBoolAttr(node, "transpose_b"), false);
+    CHECK(!HasAttr(node, "adjoint_a") ||
+          (GetBoolAttr(node, "adjoint_a") == false));
+    CHECK(!HasAttr(node, "adjoint_b") ||
+          (GetBoolAttr(node, "adjoint_b") == false));
+  } else if (node.op() == "BatchMatMul") {
+    // https://www.tensorflow.org/versions/r0.12/api_docs/python/math_ops/matrix_math_functions
+    CHECK(!HasAttr(node, "adj_a") || (GetBoolAttr(node, "adj_a") == false));
+    CHECK(!HasAttr(node, "adj_b") || (GetBoolAttr(node, "adj_b") == false));
+  } else {
+    LOG(FATAL) << "op must be 'MatMul' or 'BatchMatMul'";
+  }
+
   const auto& input_name = node.input(0);
   const auto& weights_name = node.input(1);
   const auto& reordered_weights_name = weights_name + "_reordered";
@@ -801,7 +909,9 @@ void ConvertMatMulOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(matmul);
 }
 
-void ConvertConcatOperator(const NodeDef& node, Model* model) {
+void ConvertConcatOperator(const NodeDef& node,
+                           const TensorFlowImportFlags& tf_import_flags,
+                           Model* model) {
   Operator* op = nullptr;
   if (node.op() == "Concat") {
     op = new TensorFlowConcatOperator;
@@ -810,8 +920,7 @@ void ConvertConcatOperator(const NodeDef& node, Model* model) {
   } else {
     LOG(FATAL) << "Expected Concat or ConcatV2";
   }
-  const int num_inputs =
-      GetInputsCount(node, model->flags.drop_control_dependency());
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
   CHECK_GE(num_inputs, 2);
   CHECK_EQ(num_inputs, 1 + GetIntAttr(node, "N"));
   for (int i = 0; i < num_inputs; ++i) {
@@ -821,11 +930,12 @@ void ConvertConcatOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertAllOperator(const NodeDef& node, Model* model) {
+void ConvertAllOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
   CHECK_EQ(node.op(), "All");
   auto* op = new TensorFlowAllOperator;
-  const int num_inputs =
-      GetInputsCount(node, model->flags.drop_control_dependency());
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
   for (int i = 0; i < num_inputs; ++i) {
     op->inputs.push_back(node.input(i));
   }
@@ -833,11 +943,12 @@ void ConvertAllOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertAssertOperator(const NodeDef& node, Model* model) {
+void ConvertAssertOperator(const NodeDef& node,
+                           const TensorFlowImportFlags& tf_import_flags,
+                           Model* model) {
   CHECK_EQ(node.op(), "Assert");
   auto* op = new TensorFlowAssertOperator;
-  const int num_inputs =
-      GetInputsCount(node, model->flags.drop_control_dependency());
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
   for (int i = 0; i < num_inputs; ++i) {
     op->inputs.push_back(node.input(i));
   }
@@ -845,11 +956,12 @@ void ConvertAssertOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertLessOperator(const NodeDef& node, Model* model) {
+void ConvertLessOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
   CHECK_EQ(node.op(), "Less");
   auto* op = new TensorFlowLessOperator;
-  const int num_inputs =
-      GetInputsCount(node, model->flags.drop_control_dependency());
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
   for (int i = 0; i < num_inputs; ++i) {
     op->inputs.push_back(node.input(i));
   }
@@ -857,11 +969,12 @@ void ConvertLessOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertLessEqualOperator(const NodeDef& node, Model* model) {
+void ConvertLessEqualOperator(const NodeDef& node,
+                              const TensorFlowImportFlags& tf_import_flags,
+                              Model* model) {
   CHECK_EQ(node.op(), "LessEqual");
   auto* op = new TensorFlowLessEqualOperator;
-  const int num_inputs =
-      GetInputsCount(node, model->flags.drop_control_dependency());
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
   for (int i = 0; i < num_inputs; ++i) {
     op->inputs.push_back(node.input(i));
   }
@@ -869,11 +982,12 @@ void ConvertLessEqualOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertGreaterOperator(const NodeDef& node, Model* model) {
+void ConvertGreaterOperator(const NodeDef& node,
+                            const TensorFlowImportFlags& tf_import_flags,
+                            Model* model) {
   CHECK_EQ(node.op(), "Greater");
   auto* op = new TensorFlowGreaterOperator;
-  const int num_inputs =
-      GetInputsCount(node, model->flags.drop_control_dependency());
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
   for (int i = 0; i < num_inputs; ++i) {
     op->inputs.push_back(node.input(i));
   }
@@ -881,11 +995,12 @@ void ConvertGreaterOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertGreaterEqualOperator(const NodeDef& node, Model* model) {
+void ConvertGreaterEqualOperator(const NodeDef& node,
+                                 const TensorFlowImportFlags& tf_import_flags,
+                                 Model* model) {
   CHECK_EQ(node.op(), "GreaterEqual");
   auto* op = new TensorFlowGreaterEqualOperator;
-  const int num_inputs =
-      GetInputsCount(node, model->flags.drop_control_dependency());
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
   for (int i = 0; i < num_inputs; ++i) {
     op->inputs.push_back(node.input(i));
   }
@@ -893,29 +1008,41 @@ void ConvertGreaterEqualOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertMaxOperator(const NodeDef& node, Model* model) {
+void ConvertMaxOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
   CHECK_EQ(node.op(), "Max");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 2);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
   auto* op = new TensorFlowMaxOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
   op->outputs.push_back(node.name());
   model->operators.emplace_back(op);
+  if (HasAttr(node, "keep_dims")) {
+    op->keep_dims = GetBoolAttr(node, "keep_dims");
+  }
 }
 
-void ConvertMinOperator(const NodeDef& node, Model* model) {
+void ConvertMinOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
   CHECK_EQ(node.op(), "Min");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 2);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
   auto* op = new TensorFlowMinOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
   op->outputs.push_back(node.name());
   model->operators.emplace_back(op);
+  if (HasAttr(node, "keep_dims")) {
+    op->keep_dims = GetBoolAttr(node, "keep_dims");
+  }
 }
 
-void ConvertMaximumOperator(const NodeDef& node, Model* model) {
+void ConvertMaximumOperator(const NodeDef& node,
+                            const TensorFlowImportFlags& tf_import_flags,
+                            Model* model) {
   CHECK_EQ(node.op(), "Maximum");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 2);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
   auto* op = new TensorFlowMaximumOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -923,9 +1050,11 @@ void ConvertMaximumOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertMinimumOperator(const NodeDef& node, Model* model) {
+void ConvertMinimumOperator(const NodeDef& node,
+                            const TensorFlowImportFlags& tf_import_flags,
+                            Model* model) {
   CHECK_EQ(node.op(), "Minimum");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 2);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
   auto* op = new TensorFlowMinimumOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -933,11 +1062,12 @@ void ConvertMinimumOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertUnsupportedOperator(const NodeDef& node, Model* model) {
+void ConvertUnsupportedOperator(const NodeDef& node,
+                                const TensorFlowImportFlags& tf_import_flags,
+                                Model* model) {
   LOG(INFO) << "Converting unsupported operation: " << node.op();
   auto* op = new TensorFlowUnsupportedOperator;
-  const int num_inputs =
-      GetInputsCount(node, model->flags.drop_control_dependency());
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
   for (int i = 0; i < num_inputs; ++i) {
     op->inputs.push_back(node.input(i));
   }
@@ -956,7 +1086,9 @@ void ConvertUnsupportedOperator(const NodeDef& node, Model* model) {
   }
 }
 
-void ConvertStridedSliceOperator(const NodeDef& node, Model* model) {
+void ConvertStridedSliceOperator(const NodeDef& node,
+                                 const TensorFlowImportFlags& tf_import_flags,
+                                 Model* model) {
   CHECK_EQ(node.op(), "StridedSlice");
   CHECK_EQ(node.input_size(), 4);
 
@@ -971,7 +1103,7 @@ void ConvertStridedSliceOperator(const NodeDef& node, Model* model) {
       // Only 4D tensors are supported.
       GetIntAttr(node, "begin_mask") > 15 ||
       GetIntAttr(node, "end_mask") > 15) {
-    ConvertUnsupportedOperator(node, model);
+    ConvertUnsupportedOperator(node, tf_import_flags, model);
     return;
   }
 
@@ -989,10 +1121,12 @@ void ConvertStridedSliceOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertPlaceholderOperator(const NodeDef& node, Model* model) {
+void ConvertPlaceholderOperator(const NodeDef& node,
+                                const TensorFlowImportFlags& tf_import_flags,
+                                Model* model) {
   CHECK(node.op() == "Placeholder" || node.op() == "LegacyFedInput");
   if (node.op() == "Placeholder") {
-    CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 0);
+    CHECK_EQ(GetInputsCount(node, tf_import_flags), 0);
   }
   auto& array = model->GetOrCreateArray(node.name());
   if (node.attr().count("dtype")) {
@@ -1019,42 +1153,30 @@ void ConvertPlaceholderOperator(const NodeDef& node, Model* model) {
   }
 }
 
-void ConvertNoOpOperator(const NodeDef& node, Model* model) {}
-
-ArrayDataType GetArrayDataType(tensorflow::DataType tf_data_type) {
-  if (tf_data_type == DT_UINT8) {
-    return ArrayDataType::kUint8;
-  } else if (tf_data_type == DT_INT32) {
-    return ArrayDataType::kInt32;
-  } else if (tf_data_type == DT_FLOAT) {
-    return ArrayDataType::kFloat;
-  } else {
-    return ArrayDataType::kNone;
-  }
-}
+void ConvertNoOpOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {}
 
-void ConvertCastOperator(const NodeDef& node, Model* model) {
+void ConvertCastOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
   CHECK_EQ(node.op(), "Cast");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 1);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
   const auto tf_src_dtype = GetDataTypeAttr(node, "SrcT");
   const auto tf_dst_dtype = GetDataTypeAttr(node, "DstT");
-  CHECK(tf_src_dtype == DT_UINT8 || tf_src_dtype == DT_INT32 ||
-        tf_src_dtype == DT_FLOAT);
-  CHECK(tf_dst_dtype == DT_UINT8 || tf_dst_dtype == DT_INT32 ||
-        tf_dst_dtype == DT_FLOAT);
-  CHECK_NE(tf_src_dtype, tf_dst_dtype)
-      << "Same input and output data type. No need to cast.";
   auto* op = new CastOperator;
-  op->src_data_type = GetArrayDataType(tf_src_dtype);
-  op->dst_data_type = GetArrayDataType(tf_dst_dtype);
+  op->src_data_type = ConvertDataType(tf_src_dtype);
+  op->dst_data_type = ConvertDataType(tf_dst_dtype);
   op->inputs.push_back(node.input(0));
   op->outputs.push_back(node.name());
   model->operators.emplace_back(op);
 }
 
-void ConvertFloorOperator(const NodeDef& node, Model* model) {
+void ConvertFloorOperator(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
   CHECK_EQ(node.op(), "Floor");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 1);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
   const auto data_type = GetDataTypeAttr(node, "T");
   CHECK(data_type == DT_FLOAT);
   auto* op = new FloorOperator;
@@ -1063,9 +1185,11 @@ void ConvertFloorOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertGatherOperator(const NodeDef& node, Model* model) {
+void ConvertGatherOperator(const NodeDef& node,
+                           const TensorFlowImportFlags& tf_import_flags,
+                           Model* model) {
   CHECK_EQ(node.op(), "Gather");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 2);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
   const auto indices_data_type = GetDataTypeAttr(node, "Tindices");
   CHECK(indices_data_type == DT_INT32);
   auto* op = new GatherOperator;
@@ -1075,9 +1199,28 @@ void ConvertGatherOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertResizeBilinearOperator(const NodeDef& node, Model* model) {
+void ConvertArgMaxOperator(const NodeDef& node,
+                           const TensorFlowImportFlags& tf_import_flags,
+                           Model* model) {
+  CHECK_EQ(node.op(), "ArgMax");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  const auto axis_data_type = GetDataTypeAttr(node, "Tidx");
+  const auto output_type = GetDataTypeAttr(node, "output_type");
+  CHECK(axis_data_type == DT_INT64 || axis_data_type == DT_INT32);
+  CHECK(output_type == DT_INT64 || output_type == DT_INT32);
+  auto* op = new ArgMaxOperator;
+  op->output_data_type = ConvertDataType(output_type);
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertResizeBilinearOperator(const NodeDef& node,
+                                   const TensorFlowImportFlags& tf_import_flags,
+                                   Model* model) {
   CHECK_EQ(node.op(), "ResizeBilinear");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 2);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
   auto* op = new ResizeBilinearOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -1085,10 +1228,11 @@ void ConvertResizeBilinearOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertBatchNormWithGlobalNormalizationOperator(const NodeDef& node,
-                                                     Model* model) {
+void ConvertBatchNormWithGlobalNormalizationOperator(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
   CHECK_EQ(node.op(), "BatchNormWithGlobalNormalization");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 5);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 5);
 
   // TODO(ahentz): to really match tensorflow we need to add variance_epsilon
   // to the input, before feeding it into TensorFlowRsqrtOperator.
@@ -1133,7 +1277,9 @@ void ConvertBatchNormWithGlobalNormalizationOperator(const NodeDef& node,
   model->operators.emplace_back(op);
 }
 
-void ConvertFusedBatchNormOperator(const NodeDef& node, Model* model) {
+void ConvertFusedBatchNormOperator(const NodeDef& node,
+                                   const TensorFlowImportFlags& tf_import_flags,
+                                   Model* model) {
   CHECK_EQ(node.op(), "FusedBatchNorm");
   CHECK_EQ(node.input_size(), 5);
 
@@ -1187,9 +1333,11 @@ void ConvertFusedBatchNormOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertSpaceToBatchNDOperator(const NodeDef& node, Model* model) {
+void ConvertSpaceToBatchNDOperator(const NodeDef& node,
+                                   const TensorFlowImportFlags& tf_import_flags,
+                                   Model* model) {
   CHECK_EQ(node.op(), "SpaceToBatchND");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 3);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 3);
   CHECK_EQ(GetDataTypeAttr(node, "Tblock_shape"), DT_INT32);
   CHECK_EQ(GetDataTypeAttr(node, "Tpaddings"), DT_INT32);
   auto* op = new SpaceToBatchNDOperator;
@@ -1200,9 +1348,11 @@ void ConvertSpaceToBatchNDOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertBatchToSpaceNDOperator(const NodeDef& node, Model* model) {
+void ConvertBatchToSpaceNDOperator(const NodeDef& node,
+                                   const TensorFlowImportFlags& tf_import_flags,
+                                   Model* model) {
   CHECK_EQ(node.op(), "BatchToSpaceND");
-  CHECK_EQ(GetInputsCount(node, model->flags.drop_control_dependency()), 3);
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 3);
   CHECK_EQ(GetDataTypeAttr(node, "Tblock_shape"), DT_INT32);
   CHECK_EQ(GetDataTypeAttr(node, "Tcrops"), DT_INT32);
   auto* op = new BatchToSpaceNDOperator;
@@ -1213,7 +1363,9 @@ void ConvertBatchToSpaceNDOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
-void ConvertMeanOperator(const NodeDef& node, Model* model) {
+void ConvertMeanOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
   CHECK_EQ(node.op(), "Mean");
   CHECK_EQ(node.input_size(), 2);
   auto* op = new MeanOperator;
@@ -1221,9 +1373,14 @@ void ConvertMeanOperator(const NodeDef& node, Model* model) {
   op->inputs.push_back(node.input(1));
   op->outputs.push_back(node.name());
   model->operators.emplace_back(op);
+  if (HasAttr(node, "keep_dims")) {
+    op->keep_dims = GetBoolAttr(node, "keep_dims");
+  }
 }
 
-void ConvertSvdfOperator(const NodeDef& node, Model* model) {
+void ConvertSvdfOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
   CHECK_EQ(node.op(), "Svdf");
   bool has_bias = (node.input_size() == 4);
   auto* op = new SvdfOperator;
@@ -1244,6 +1401,167 @@ void ConvertSvdfOperator(const NodeDef& node, Model* model) {
   model->operators.emplace_back(op);
 }
 
+// This is just bare bones support to get the shapes to propagate.
+void ConvertTransposeConvOperator(const NodeDef& node,
+                                  const TensorFlowImportFlags& tf_import_flags,
+                                  Model* model) {
+  CHECK_EQ(node.op(), "Conv2DBackpropInput");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 3);
+  auto* op = new TransposeConvOperator;
+  op->inputs.push_back(node.input(2));
+  op->inputs.push_back(node.input(1));
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  const auto& strides = GetListAttr(node, "strides");
+  CHECK_EQ(strides.i_size(), 4);
+  CHECK_EQ(strides.i(0), 1);
+  op->stride_height = strides.i(1);
+  op->stride_width = strides.i(2);
+  CHECK_EQ(strides.i(3), 1);
+  auto const& padding = GetStringAttr(node, "padding");
+  if (padding == "SAME") {
+    op->padding.type = PaddingType::kSame;
+  } else if (padding == "VALID") {
+    op->padding.type = PaddingType::kValid;
+  } else {
+    LOG(FATAL) << "Only SAME and VALID padding supported on "
+                  "Conv2DBackpropInput nodes.";
+  }
+  model->operators.emplace_back(op);
+}
+
+void ConvertExpandDimsOperator(const NodeDef& node,
+                               const TensorFlowImportFlags& tf_import_flags,
+                               Model* model) {
+  CHECK_EQ(node.op(), "ExpandDims");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new ExpandDimsOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertFillOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
+  CHECK_EQ(node.op(), "Fill");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new FillOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertFloorDivOperator(const NodeDef& node,
+                             const TensorFlowImportFlags& tf_import_flags,
+                             Model* model) {
+  CHECK_EQ(node.op(), "FloorDiv");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new FloorDivOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertFloorModOperator(const NodeDef& node,
+                             const TensorFlowImportFlags& tf_import_flags,
+                             Model* model) {
+  CHECK(node.op() == "FloorMod");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new FloorModOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertRangeOperator(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
+  CHECK_EQ(node.op(), "Range");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 3);
+  auto* op = new RangeOperator;
+  if (HasAttr(node, "Tidx")) {
+    const auto dtype = toco::GetDataTypeAttr(node, "Tidx");
+    CHECK(dtype == DT_UINT8 || dtype == DT_INT32 || dtype == DT_INT64 ||
+          dtype == DT_FLOAT);
+    op->dtype = ConvertDataType(dtype);
+  }
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->inputs.push_back(node.input(2));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertRankOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
+  CHECK_EQ(node.op(), "Rank");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  auto* op = new RankOperator;
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertStackOperator(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
+  CHECK((node.op() == "Stack") || (node.op() == "Pack"));
+  auto* op = new StackOperator;
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
+  CHECK_GE(num_inputs, 1);
+  CHECK_EQ(num_inputs, GetIntAttr(node, "N"));
+  for (int i = 0; i < num_inputs; ++i) {
+    op->inputs.push_back(node.input(i));
+  }
+  // Both "Stack" and "Pack" have the "axis" attribute.
+  op->axis = GetIntAttr(node, "axis");
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertTransposeOperator(const NodeDef& node,
+                              const TensorFlowImportFlags& tf_import_flags,
+                              Model* model) {
+  CHECK_EQ(node.op(), "Transpose");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new TransposeOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+// Some TensorFlow ops only occur in graph cycles, representing
+// control flow. We do not currently support control flow, so we wouldn't
+// be able to fully support such graphs, including performing inference,
+// anyway. However, rather than erroring out early on graphs being cyclic,
+// it helps to at least support these just enough to allow getting a
+// graph visualization. This is not trivial, as we require graphs to be
+// acyclic aside from RNN back-edges. The solution is to special-case
+// such ops as RNN back-edges, which is technically incorrect (does not
+// allow representing the op's semantics) but good enough to get a
+// graph visualization.
+void ConvertOperatorSpecialCasedAsRNNBackEdge(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  // At the moment, the only type of operator special-cased in this way is
+  // NextIteration, occurring only in control-flow cycles.
+  CHECK_EQ(node.op(), "NextIteration");
+  CHECK_EQ(node.input_size(), 1);
+  auto* rnn_state = model->flags.add_rnn_states();
+  // This RNN state is not explicitly created by the user, so it's
+  // OK for some later graph transformation to discard it.
+  rnn_state->set_discardable(true);
+  rnn_state->set_state_array(node.name());
+  rnn_state->set_back_edge_source_array(node.input(0));
+}
+
 void StripCaretFromArrayNames(Model* model) {
   for (auto& op : model->operators) {
     for (auto& input : op->inputs) {
@@ -1260,26 +1578,67 @@ void StripCaretFromArrayNames(Model* model) {
   }
 }
 
-void AddExtraOutputsFedIntoOtherOps(Model* model) {
+void StripZeroOutputIndexFromInputs(NodeDef* node) {
+  for (auto& input : *node->mutable_input()) {
+    input = string(absl::StripSuffix(input, ":0"));
+  }
+}
+
+// In TensorFlow GraphDef, when a node has multiple outputs, they are named
+// name:0, name:1, ...
+// where 'name' is the node's name(). Just 'name' is an equivalent shorthand
+// form for name:0.
+// A TensorFlow GraphDef does not explicitly list all the outputs of each node
+// (unlike inputs), it being implied by the node's name and operator type
+// (the latter implies the number of outputs).
+// This makes it non-trivial for us to reconstruct the list of all arrays
+// present in the graph and, for each operator, the list of its outputs.
+// We do that by taking advantage of the fact that
+// at least each node lists explicitly its inputs, so after we've loaded
+// all nodes, we can use that information.
+void AddExtraOutputs(Model* model) {
+  // Construct the list of all arrays consumed by anything in the graph.
+  std::vector<string> consumed_arrays;
+  // Add arrays consumed by an op.
   for (const auto& consumer_op : model->operators) {
     for (const string& input : consumer_op->inputs) {
-      const std::vector<string>& split = absl::StrSplit(input, ':');
-      if (split.size() != 2) {
-        continue;
-      }
-      int output_index = 0;
-      if (!absl::SimpleAtoi(split[1], &output_index)) {
-        continue;
-      }
-      auto* producer_op = GetOpWithOutput(*model, split[0]);
-      if (!producer_op) {
-        continue;
-      }
-      while (producer_op->outputs.size() <= output_index) {
-        using toco::port::StringF;
-        producer_op->outputs.push_back(
-            StringF("%s:%d", split[0], producer_op->outputs.size()));
-      }
+      consumed_arrays.push_back(input);
+    }
+  }
+  // Add global outputs of the model.
+  for (const string& output_array : model->flags.output_arrays()) {
+    consumed_arrays.push_back(output_array);
+  }
+  // Add arrays consumed by a RNN back-edge.
+  for (const auto& rnn_state : model->flags.rnn_states()) {
+    consumed_arrays.push_back(rnn_state.back_edge_source_array());
+  }
+  // Now add operator outputs so that all arrays that are consumed,
+  // are produced.
+  for (const string& consumed_array : consumed_arrays) {
+    // Split the consumed array name into the form name:output_index.
+    const std::vector<string>& split = absl::StrSplit(consumed_array, ':');
+    // If not of the form name:output_index, then this is not an additional
+    // output of a node with multiple outputs, so nothing to do here.
+    if (split.size() != 2) {
+      continue;
+    }
+    int output_index = 0;
+    if (!absl::SimpleAtoi(split[1], &output_index)) {
+      continue;
+    }
+    // Each op is initially recorded as producing at least the array that
+    // has its name. We use that to identify the producer node.
+    auto* producer_op = GetOpWithOutput(*model, split[0]);
+    if (!producer_op) {
+      continue;
+    }
+    // Add extra outputs to that producer node, all the way to the
+    // output_index.
+    while (producer_op->outputs.size() <= output_index) {
+      using toco::port::StringF;
+      producer_op->outputs.push_back(
+          StringF("%s:%d", split[0], producer_op->outputs.size()));
     }
   }
 }
@@ -1320,26 +1679,32 @@ bool InlineAllFunctions(GraphDef* graphdef) {
 
   tensorflow::Graph graph(fld);
   tensorflow::GraphConstructorOptions gc_opts;
-  TF_CHECK_OK(
-      tensorflow::ConvertGraphDefToGraph(gc_opts, graphdef_copy, &graph));
+  const auto& tf_convert_status =
+      tensorflow::ConvertGraphDefToGraph(gc_opts, graphdef_copy, &graph);
+  if (!tf_convert_status.ok()) {
+    LOG(ERROR) << "tensorflow::ConvertGraphDefToGraph failed with status: "
+               << tf_convert_status.ToString();
+    return false;
+  }
 
   // Iterate over the graph until there are no more nodes to be inlined.
   bool graph_modified = false;
   while (tensorflow::ExpandInlineFunctions(flr, &graph)) {
     graph_modified = true;
-    LOG(INFO) << "Found functions that were inlined.";
   }
 
   // Output inlined graph
   if (graph_modified) {
+    LOG(INFO) << "Found and inlined TensorFlow functions.";
     graph.ToGraphDef(graphdef);
   }
   return graph_modified;
 }
 }  // namespace
 
-std::unique_ptr<Model> ImportTensorFlowGraphDef(const ModelFlags& model_flags,
-                                                const GraphDef& tf_graph) {
+std::unique_ptr<Model> ImportTensorFlowGraphDef(
+    const ModelFlags& model_flags, const TensorFlowImportFlags& tf_import_flags,
+    const GraphDef& tf_graph) {
   LogDumpGraphDef(kLogLevelModelChanged, "AT IMPORT", tf_graph);
 
   GraphDef inlined_graph(tf_graph);
@@ -1347,139 +1712,178 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(const ModelFlags& model_flags,
     LogDumpGraphDef(kLogLevelModelChanged, "AFTER INLINING", inlined_graph);
   }
 
+  // Check input and output specification.
+  for (const auto& specified_input_array : model_flags.input_arrays()) {
+    CHECK(!absl::EndsWith(specified_input_array.name(), ":0"))
+        << "Unsupported explicit zero output index: "
+        << specified_input_array.name();
+  }
+  for (const string& specified_output_array : model_flags.output_arrays()) {
+    CHECK(!absl::EndsWith(specified_output_array, ":0"))
+        << "Unsupported explicit zero output index: " << specified_output_array;
+  }
+
   Model* model = new Model;
-  ResolveModelFlags(model_flags, model);
 
-  for (const auto& node : inlined_graph.node()) {
+  for (auto node : inlined_graph.node()) {
+    StripZeroOutputIndexFromInputs(&node);
     if (node.op() == "Const") {
-      ConvertConstOperator(node, model);
+      ConvertConstOperator(node, tf_import_flags, model);
     } else if (node.op() == "Conv2D") {
-      ConvertConvOperator(node, model);
+      ConvertConvOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Conv2DBackpropInput") {
+      ConvertTransposeConvOperator(node, tf_import_flags, model);
     } else if (node.op() == "DepthwiseConv2dNative") {
-      ConvertDepthwiseConvOperator(node, model);
+      ConvertDepthwiseConvOperator(node, tf_import_flags, model);
     } else if (node.op() == "DepthToSpace") {
-      ConvertDepthToSpaceOperator(node, model);
+      ConvertDepthToSpaceOperator(node, tf_import_flags, model);
     } else if (node.op() == "SpaceToDepth") {
-      ConvertSpaceToDepthOperator(node, model);
+      ConvertSpaceToDepthOperator(node, tf_import_flags, model);
     } else if (node.op() == "BiasAdd") {
-      ConvertBiasAddOperator(node, model);
+      ConvertBiasAddOperator(node, tf_import_flags, model);
     } else if (node.op() == "Relu") {
-      ConvertReluOperator(node, model);
+      ConvertReluOperator(node, tf_import_flags, model);
     } else if (node.op() == "Relu6") {
-      ConvertRelu6Operator(node, model);
+      ConvertRelu6Operator(node, tf_import_flags, model);
     } else if (node.op() == "Sigmoid") {
-      ConvertLogisticOperator(node, model);
+      ConvertLogisticOperator(node, tf_import_flags, model);
     } else if (node.op() == "Tanh") {
-      ConvertTanhOperator(node, model);
+      ConvertTanhOperator(node, tf_import_flags, model);
     } else if (node.op() == "MaxPool") {
-      ConvertMaxPoolOperator(node, model);
+      ConvertMaxPoolOperator(node, tf_import_flags, model);
     } else if (node.op() == "AvgPool") {
-      ConvertAvgPoolOperator(node, model);
+      ConvertAvgPoolOperator(node, tf_import_flags, model);
     } else if (node.op() == "Reshape") {
-      ConvertReshapeOperator(node, model);
-    } else if (node.op() == "MatMul") {
-      ConvertMatMulOperator(node, model);
+      ConvertReshapeOperator(node, tf_import_flags, model);
+    } else if (node.op() == "MatMul" || node.op() == "BatchMatMul") {
+      ConvertMatMulOperator(node, tf_import_flags, model);
     } else if (node.op() == "Div" || node.op() == "RealDiv") {
-      ConvertDivOperator(node, model);
-    } else if (node.op() == "Identity" || node.op() == "CheckNumerics") {
-      ConvertIdentityOperator(node, model);
+      ConvertDivOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Identity" || node.op() == "CheckNumerics" ||
+               node.op() == "StopGradient") {
+      ConvertIdentityOperator(node, tf_import_flags, model);
     } else if (node.op() == "FakeQuantWithMinMaxVars") {
-      ConvertFakeQuantWithMinMaxVars(node, model);
+      ConvertFakeQuantWithMinMaxVars(node, tf_import_flags, model);
     } else if (node.op() == "FakeQuantWithMinMaxArgs") {
-      ConvertFakeQuantWithMinMaxArgs(node, model);
+      ConvertFakeQuantWithMinMaxArgs(node, tf_import_flags, model);
+    } else if (node.op() == "Neg") {
+      ConvertNegOperator(node, tf_import_flags, model);
     } else if (node.op() == "Rsqrt") {
-      ConvertRsqrtOperator(node, model);
+      ConvertRsqrtOperator(node, tf_import_flags, model);
     } else if (node.op() == "Squeeze") {
-      ConvertSqueezeOperator(node, model);
+      ConvertSqueezeOperator(node, tf_import_flags, model);
     } else if (node.op() == "Sqrt") {
-      ConvertSqrtOperator(node, model);
+      ConvertSqrtOperator(node, tf_import_flags, model);
     } else if (node.op() == "Square") {
-      ConvertSquareOperator(node, model);
+      ConvertSquareOperator(node, tf_import_flags, model);
     } else if (node.op() == "Add") {
-      ConvertAddOperator(node, model);
+      ConvertAddOperator(node, tf_import_flags, model);
     } else if (node.op() == "Mul") {
-      ConvertMulOperator(node, model);
+      ConvertMulOperator(node, tf_import_flags, model);
     } else if (node.op() == "Sub") {
-      ConvertSubOperator(node, model);
+      ConvertSubOperator(node, tf_import_flags, model);
     } else if (node.op() == "Sum") {
-      ConvertSumOperator(node, model);
+      ConvertSumOperator(node, tf_import_flags, model);
     } else if (node.op() == "Tile") {
-      ConvertTileOperator(node, model);
+      ConvertTileOperator(node, tf_import_flags, model);
     } else if (node.op() == "Concat" || node.op() == "ConcatV2") {
-      ConvertConcatOperator(node, model);
+      ConvertConcatOperator(node, tf_import_flags, model);
     } else if (node.op() == "LRN") {
-      ConvertLRNOperator(node, model);
+      ConvertLRNOperator(node, tf_import_flags, model);
     } else if (node.op() == "Softmax") {
-      ConvertSoftmaxOperator(node, model);
+      ConvertSoftmaxOperator(node, tf_import_flags, model);
     } else if (node.op() == "All") {
-      ConvertAllOperator(node, model);
+      ConvertAllOperator(node, tf_import_flags, model);
     } else if (node.op() == "Assert") {
-      ConvertAssertOperator(node, model);
+      ConvertAssertOperator(node, tf_import_flags, model);
     } else if (node.op() == "Less") {
-      ConvertLessOperator(node, model);
+      ConvertLessOperator(node, tf_import_flags, model);
     } else if (node.op() == "LessEqual") {
-      ConvertLessEqualOperator(node, model);
+      ConvertLessEqualOperator(node, tf_import_flags, model);
     } else if (node.op() == "Greater") {
-      ConvertGreaterOperator(node, model);
+      ConvertGreaterOperator(node, tf_import_flags, model);
     } else if (node.op() == "GreaterEqual") {
-      ConvertGreaterEqualOperator(node, model);
+      ConvertGreaterEqualOperator(node, tf_import_flags, model);
     } else if (node.op() == "Max") {
-      ConvertMaxOperator(node, model);
+      ConvertMaxOperator(node, tf_import_flags, model);
     } else if (node.op() == "Min") {
-      ConvertMinOperator(node, model);
+      ConvertMinOperator(node, tf_import_flags, model);
     } else if (node.op() == "Maximum") {
-      ConvertMaximumOperator(node, model);
+      ConvertMaximumOperator(node, tf_import_flags, model);
     } else if (node.op() == "Minimum") {
-      ConvertMinimumOperator(node, model);
+      ConvertMinimumOperator(node, tf_import_flags, model);
     } else if (node.op() == "Merge") {
-      ConvertMergeOperator(node, model);
+      ConvertMergeOperator(node, tf_import_flags, model);
     } else if (node.op() == "Pad") {
-      ConvertPadOperator(node, model);
+      ConvertPadOperator(node, tf_import_flags, model);
     } else if (node.op() == "StridedSlice") {
-      ConvertStridedSliceOperator(node, model);
+      ConvertStridedSliceOperator(node, tf_import_flags, model);
     } else if (node.op() == "Shape") {
-      ConvertShapeOperator(node, model);
+      ConvertShapeOperator(node, tf_import_flags, model);
     } else if (node.op() == "Slice") {
-      ConvertSliceOperator(node, model);
+      ConvertSliceOperator(node, tf_import_flags, model);
     } else if (node.op() == "Split") {
-      ConvertSplitOperator(node, model);
+      ConvertSplitOperator(node, tf_import_flags, model);
     } else if (node.op() == "Switch") {
-      ConvertSwitchOperator(node, model);
+      ConvertSwitchOperator(node, tf_import_flags, model);
     } else if (node.op() == "Placeholder") {
-      ConvertPlaceholderOperator(node, model);
+      ConvertPlaceholderOperator(node, tf_import_flags, model);
     } else if (node.op() == "PlaceholderWithDefault") {
-      ConvertIdentityOperator(node, model);
+      ConvertIdentityOperator(node, tf_import_flags, model);
     } else if (node.op() == "LegacyFedInput") {
-      ConvertPlaceholderOperator(node, model);
+      ConvertPlaceholderOperator(node, tf_import_flags, model);
     } else if (node.op() == "NoOp") {
-      ConvertNoOpOperator(node, model);
+      ConvertNoOpOperator(node, tf_import_flags, model);
     } else if (node.op() == "Cast") {
-      ConvertCastOperator(node, model);
+      ConvertCastOperator(node, tf_import_flags, model);
     } else if (node.op() == "Floor") {
-      ConvertFloorOperator(node, model);
+      ConvertFloorOperator(node, tf_import_flags, model);
     } else if (node.op() == "Gather") {
-      ConvertGatherOperator(node, model);
+      ConvertGatherOperator(node, tf_import_flags, model);
     } else if (node.op() == "ResizeBilinear") {
-      ConvertResizeBilinearOperator(node, model);
+      ConvertResizeBilinearOperator(node, tf_import_flags, model);
     } else if (node.op() == "BatchNormWithGlobalNormalization") {
-      ConvertBatchNormWithGlobalNormalizationOperator(node, model);
+      ConvertBatchNormWithGlobalNormalizationOperator(node, tf_import_flags,
+                                                      model);
     } else if (node.op() == "FusedBatchNorm") {
-      ConvertFusedBatchNormOperator(node, model);
+      ConvertFusedBatchNormOperator(node, tf_import_flags, model);
     } else if (node.op() == "SpaceToBatchND") {
-      ConvertSpaceToBatchNDOperator(node, model);
+      ConvertSpaceToBatchNDOperator(node, tf_import_flags, model);
     } else if (node.op() == "BatchToSpaceND") {
-      ConvertBatchToSpaceNDOperator(node, model);
+      ConvertBatchToSpaceNDOperator(node, tf_import_flags, model);
     } else if (node.op() == "Mean") {
-      ConvertMeanOperator(node, model);
+      ConvertMeanOperator(node, tf_import_flags, model);
     } else if (node.op() == "Svdf") {
-      ConvertSvdfOperator(node, model);
+      ConvertSvdfOperator(node, tf_import_flags, model);
+    } else if (node.op() == "NextIteration") {
+      ConvertOperatorSpecialCasedAsRNNBackEdge(node, tf_import_flags, model);
+    } else if (node.op() == "ExpandDims") {
+      ConvertExpandDimsOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Fill") {
+      ConvertFillOperator(node, tf_import_flags, model);
+    } else if (node.op() == "FloorDiv") {
+      ConvertFloorDivOperator(node, tf_import_flags, model);
+    } else if (node.op() == "FloorMod") {
+      ConvertFloorModOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Range") {
+      ConvertRangeOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Rank") {
+      ConvertRankOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Stack" || node.op() == "Pack") {
+      ConvertStackOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Transpose") {
+      ConvertTransposeOperator(node, tf_import_flags, model);
+    } else if (node.op() == "ArgMax") {
+      ConvertArgMaxOperator(node, tf_import_flags, model);
     } else {
-      ConvertUnsupportedOperator(node, model);
+      ConvertUnsupportedOperator(node, tf_import_flags, model);
     }
   }
 
+  ResolveModelFlags(model_flags, model);
+
   StripCaretFromArrayNames(model);
-  AddExtraOutputsFedIntoOtherOps(model);
+  AddExtraOutputs(model);
   FixNoMissingArray(model);
   FixNoOrphanedArray(model);
   FixOperatorOrdering(model);
@@ -1494,7 +1898,8 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(const ModelFlags& model_flags,
 }
 
 std::unique_ptr<Model> ImportTensorFlowGraphDef(
-    const ModelFlags& model_flags, const string& input_file_contents) {
+    const ModelFlags& model_flags, const TensorFlowImportFlags& tf_import_flags,
+    const string& input_file_contents) {
   std::unique_ptr<GraphDef> tf_graph(new GraphDef);
   CHECK(ParseFromStringEitherTextOrBinary(input_file_contents, tf_graph.get()));
 
@@ -1503,6 +1908,6 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
   if (pruned_graph) {
     tf_graph = std::move(pruned_graph);
   }
-  return ImportTensorFlowGraphDef(model_flags, *tf_graph);
+  return ImportTensorFlowGraphDef(model_flags, tf_import_flags, *tf_graph);
 }
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.h b/tensorflow/contrib/lite/toco/import_tensorflow.h
index d2eb423ca43ce7feb0dd0e09b7b007fde5605493..312e3b8f17cfaa012bf25696937f97d396802bb2 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.h
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.h
@@ -23,11 +23,19 @@ limitations under the License.
 
 namespace toco {
 
+struct TensorFlowImportFlags {
+  // If true, control dependencies will be dropped immediately
+  // during the import of the TensorFlow GraphDef.
+  bool drop_control_dependency = false;
+};
+
 std::unique_ptr<Model> ImportTensorFlowGraphDef(
-    const ModelFlags& model_flags, const tensorflow::GraphDef& graph_def);
+    const ModelFlags& model_flags, const TensorFlowImportFlags& tf_import_flags,
+    const tensorflow::GraphDef& graph_def);
 
 std::unique_ptr<Model> ImportTensorFlowGraphDef(
-    const ModelFlags& model_flags, const string& input_file_contents);
+    const ModelFlags& model_flags, const TensorFlowImportFlags& tf_import_flags,
+    const string& input_file_contents);
 
 }  // namespace toco
 
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 63953a1e28fcb3bba34b878e8590f738129c4dbb..253b163649f98c00928c6ac5333561197b633b55 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -41,6 +41,10 @@ enum class OperatorType {
   kSpaceToDepth,
   kDequantize,
   kDiv,
+  kExpandDims,
+  kFill,
+  kFloorDiv,
+  kFloorMod,
   kFullyConnected,
   kL2Normalization,
   kL2Pool,
@@ -50,23 +54,28 @@ enum class OperatorType {
   kMaxPool,
   kFakeQuant,
   kMul,
+  kRange,
+  kRank,
   kRelu,
   kRelu1,
   kRelu6,
   kSoftmax,
   kSub,
   kTanh,
+  kTransposeConv,
   kCast,
   kFloor,
   kGather,
   kResizeBilinear,
   kSpaceToBatchND,
+  kStack,
   kBatchToSpaceND,
   kPad,
   kStridedSlice,
   kSlice,
   kSqueeze,
   kMean,
+  kArgMax,
   // The SVDF Op is a decomposition of a densely connected Op into
   // low rank filters. For details:
   // https://research.google.com/pubs/pub43813.html
@@ -89,6 +98,7 @@ enum class OperatorType {
   kTensorFlowMinimum,
   kTensorFlowMatMul,
   kTensorFlowMerge,
+  kNeg,
   kTensorFlowReshape,
   kTensorFlowRsqrt,
   kTensorFlowShape,
@@ -98,6 +108,7 @@ enum class OperatorType {
   kTensorFlowSum,
   kTensorFlowSwitch,
   kTensorFlowTile,
+  kTranspose,
   // An unsupported TF operation. It's only needed to be able to represent TF
   // graph internally and is expected to be dropped by graph transformations.
   kTensorFlowUnsupported,
@@ -302,6 +313,10 @@ struct ConvOperator : Operator {
   Padding padding;
   int stride_width = 0;
   int stride_height = 0;
+  // A dilation_rate of 0 is invalid and this field is an optional attribute.
+  // Thus initializing it to 1 to allow default conv behavior when the
+  // attribute is not present.
+  int dilation_rate = 1;
 };
 
 // Depthwise-separable convolution operator.
@@ -533,7 +548,7 @@ struct AddOperator : Operator {
 };
 
 // Concatenation operator: concatenates its inputs
-// along the concat_dim dimension.
+// along the axis.
 //
 // Inputs: this operator accepts any number >= 1 of inputs.
 //   inputs[i]: the i-th array to concatenate.
@@ -541,7 +556,7 @@ struct AddOperator : Operator {
 // TensorFlow equivalent: Concat.
 struct ConcatenationOperator : Operator {
   ConcatenationOperator() : Operator(OperatorType::kConcatenation) {}
-  int concat_dim = 0;
+  int axis = 0;
 };
 
 // Reordering dimensions. Used only during tooling to transform graphs from
@@ -754,6 +769,112 @@ struct SqueezeOperator : Operator {
   std::vector<int> squeeze_dims;
 };
 
+// Inputs:
+//   inputs[0]: required: the input activations array
+//   inputs[1]: required: the Conv weights
+//   channel.
+//
+// Outputs:
+//   outputs[0]: required: the output activations array
+//
+// TensorFlow equivalent: Conv2DBackpropInput
+struct TransposeConvOperator : Operator {
+  TransposeConvOperator() : Operator(OperatorType::kTransposeConv) {}
+  Padding padding;
+  int stride_width = 0;
+  int stride_height = 0;
+};
+
+// Given a tensor input, this operation inserts a dimension of 1 at the
+// dimension index axis of input's shape. The dimension index axis starts at
+// zero; if you specify a negative number for axis it is counted backward from
+// the end.
+//
+// Inputs:
+//   inputs[0]: required: input tensor
+//   inputs[1]: required: 0-D (scalar). Specifies the dimension index at which
+//   to expand the shape of input
+//
+// TensorFlow equivalent: ExpandDims
+struct ExpandDimsOperator : Operator {
+  ExpandDimsOperator() : Operator(OperatorType::kExpandDims) {}
+};
+
+// Ceates a tensor of shape dims and fills it with the given scalar value.
+// Output type will be the same as the given scalar value.
+//
+// Inputs:
+//   inputs[0]: required: 1-D (int32) - the shape of the output tensor
+//   inputs[1]: required: 0-D (scalar) - value to fill the tensor with
+//
+// TensorFlow equivalent: Fill
+struct FillOperator : Operator {
+  FillOperator() : Operator(OperatorType::kFill) {}
+};
+
+// Element-wise floor division operator.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side array
+//   inputs[1]: required: the right-hand side array
+//
+// TensorFlow equivalent: FloorDiv
+struct FloorDivOperator : Operator {
+  FloorDivOperator() : Operator(OperatorType::kFloorDiv) {}
+};
+
+// Element-wise floor mod operator.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side array
+//   inputs[1]: required: the right-hand side array
+//
+// TensorFlow equivalent: FloorMod
+struct FloorModOperator : Operator {
+  FloorModOperator() : Operator(OperatorType::kFloorMod) {}
+};
+
+// Creates a sequence of numbers that begins at start and extends by increments
+// of delta up to but not including limit.
+//
+// The dtype of the resulting tensor is inferred from the inputs unless it is
+// provided explicitly.
+//
+// Inputs:
+//   inputs[0]: required: the start
+//   inputs[1]: required: the limit
+//   inputs[2]: required: the delta
+//
+// TensorFlow equivalent: Range
+struct RangeOperator : Operator {
+  RangeOperator() : Operator(OperatorType::kRange) {}
+  ArrayDataType dtype = ArrayDataType::kNone;
+};
+
+// Rank operator. Extracts the rank of the tensor.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// This operation outputs a 0-D integer tensor representing the rank of
+// the input.
+//
+// TensorFlow equivalent: Rank.  We currently assume that the output is int32
+// and not int64.  The output type could be stored herein.
+struct RankOperator : Operator {
+  RankOperator() : Operator(OperatorType::kRank) {}
+};
+
+// Element-wise negation (-x) operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Neg
+struct NegOperator : Operator {
+  NegOperator() : Operator(OperatorType::kNeg) {}
+};
+
 // Element-wise reciprocal-square-root (x^-0.5) operator.
 //
 // Inputs:
@@ -764,6 +885,21 @@ struct TensorFlowRsqrtOperator : Operator {
   TensorFlowRsqrtOperator() : Operator(OperatorType::kTensorFlowRsqrt) {}
 };
 
+// Stacks a list of rank-R tensors into one rank-(R+1) tensor.
+//
+// Packs the list of tensors in values into a tensor with rank one higher than
+// each tensor in values, by packing them along the axis dimension. Given a list
+// of length N of tensors of shape (A, B, C);.
+//
+// Inputs: this operator accepts any number >= 1 of inputs.
+//   inputs[i]: the i-th array to merge.
+//
+// TensorFlow equivalent: Stack or Pack
+struct StackOperator : Operator {
+  StackOperator() : Operator(OperatorType::kStack) {}
+  int axis = 0;
+};
+
 // Shape operator. Extracts the shape of the tensor.
 //
 // Inputs:
@@ -798,6 +934,19 @@ struct TensorFlowSquareOperator : Operator {
   TensorFlowSquareOperator() : Operator(OperatorType::kTensorFlowSquare) {}
 };
 
+// Transposes a tensor.
+//
+// By default, this operation performs a regular matrix transpose on 2-D input
+// tensors.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Transpose
+struct TransposeOperator : Operator {
+  TransposeOperator() : Operator(OperatorType::kTranspose) {}
+};
+
 // Element-wise subtraction operator.
 //
 // Inputs:
@@ -819,6 +968,7 @@ struct SubOperator : Operator {
 // of global reduction across all dimensions.
 struct TensorFlowSumOperator : Operator {
   TensorFlowSumOperator() : Operator(OperatorType::kTensorFlowSum) {}
+  bool keep_dims = false;
 };
 
 // TensorFlow Tile equivalent. Refer to TensorFlow documentation for details.
@@ -971,6 +1121,7 @@ struct TensorFlowGreaterEqualOperator : Operator {
 // of global reduction across all dimensions.
 struct TensorFlowMaxOperator : Operator {
   TensorFlowMaxOperator() : Operator(OperatorType::kTensorFlowMax) {}
+  bool keep_dims = false;
 };
 
 // Global min reduction: computes the min of all of entries in the input array.
@@ -983,6 +1134,7 @@ struct TensorFlowMaxOperator : Operator {
 // of global reduction across all dimensions.
 struct TensorFlowMinOperator : Operator {
   TensorFlowMinOperator() : Operator(OperatorType::kTensorFlowMin) {}
+  bool keep_dims = false;
 };
 
 // Element-wise maximum operator. Currently it only supports scalar as
@@ -1068,7 +1220,19 @@ struct FloorOperator : Operator {
 // TensorFlow equivalent: Gather
 struct GatherOperator : Operator {
   GatherOperator() : Operator(OperatorType::kGather) {}
-  int input_rank;
+  int axis = 0;
+  int input_rank = 0;
+};
+
+// ArgMax operator. It returns the index of the maximum value along axis.
+//
+// Inputs:
+//   inputs[0]: required: the input tensor
+//
+// TensorFlow equivalent: ArgMax
+struct ArgMaxOperator : Operator {
+  ArgMaxOperator() : Operator(OperatorType::kArgMax) {}
+  ArrayDataType output_data_type = ArrayDataType::kInt64;
 };
 
 // ResizeBilinear operator. It resizes input images with bilinear interpolation.
@@ -1109,6 +1273,10 @@ struct SpaceToBatchNDOperator : Operator {
 // TensorFlow equivalent: BatchToSpaceND
 struct BatchToSpaceNDOperator : Operator {
   BatchToSpaceNDOperator() : Operator(OperatorType::kBatchToSpaceND) {}
+
+  std::vector<int> block_shape;
+  std::vector<int> before_crops;
+  std::vector<int> after_crops;
 };
 
 // Mean operator.
@@ -1120,7 +1288,8 @@ struct BatchToSpaceNDOperator : Operator {
 struct MeanOperator : Operator {
   MeanOperator() : Operator(OperatorType::kMean) {}
 
-  std::vector<int> reduction_indices;
+  std::vector<int> axis;
+  bool keep_dims = false;
 };
 
 // Svdf operator:
diff --git a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
index 699c95753fab7a2b7dd373e123402af01759cfc7..790b3443cef1c577e19bafc5e087ca42e6fce60a 100644
--- a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
@@ -17,7 +17,6 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "absl/strings/ascii.h"
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_join.h"
 #include "absl/strings/str_split.h"
@@ -28,6 +27,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/toco/toco_port.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/command_line_flags.h"
+
 // "batch" flag only exists internally
 #ifdef PLATFORM_GOOGLE
 #include "base/commandlineflags.h"
@@ -43,7 +43,8 @@ bool ParseModelFlagsFromCommandLineFlags(
   std::vector<tensorflow::Flag> flags = {
       Flag("input_array", parsed_flags.input_array.bind(),
            parsed_flags.input_array.default_value(),
-           "Name of the input array. If not specified, will try to read "
+           "Deprecated: use --input_arrays instead. Name of the input array. "
+           "If not specified, will try to read "
            "that information from the input file."),
       Flag("input_arrays", parsed_flags.input_arrays.bind(),
            parsed_flags.input_arrays.default_value(),
@@ -51,7 +52,8 @@ bool ParseModelFlagsFromCommandLineFlags(
            "will try to read that information from the input file."),
       Flag("output_array", parsed_flags.output_array.bind(),
            parsed_flags.output_array.default_value(),
-           "Name of the output array, when specifying a unique output array. "
+           "Deprecated: use --output_arrays instead. Name of the output array, "
+           "when specifying a unique output array. "
            "If not specified, will try to read that information from the "
            "input file."),
       Flag("output_arrays", parsed_flags.output_arrays.bind(),
@@ -60,8 +62,9 @@ bool ParseModelFlagsFromCommandLineFlags(
            "If not specified, will try to read "
            "that information from the input file."),
       Flag("input_shape", parsed_flags.input_shape.bind(),
-           parsed_flags.output_arrays.default_value(),
-           "Input array shape. For many models the shape takes the form "
+           parsed_flags.input_shape.default_value(),
+           "Deprecated: use --input_shapes instead. Input array shape. For "
+           "many models the shape takes the form "
            "batch size, input array height, input array width, input array "
            "depth."),
       Flag("input_shapes", parsed_flags.input_shapes.bind(),
@@ -69,9 +72,22 @@ bool ParseModelFlagsFromCommandLineFlags(
            "Shapes corresponding to --input_arrays, colon-separated. For "
            "many models each shape takes the form batch size, input array "
            "height, input array width, input array depth."),
+      Flag("input_data_type", parsed_flags.input_data_type.bind(),
+           parsed_flags.input_data_type.default_value(),
+           "Deprecated: use --input_data_types instead. Input array type, if "
+           "not already provided in the graph. "
+           "Typically needs to be specified when passing arbitrary arrays "
+           "to --input_array."),
+      Flag("input_data_types", parsed_flags.input_data_types.bind(),
+           parsed_flags.input_data_types.default_value(),
+           "Input arrays types, comma-separated, if not already provided in "
+           "the graph. "
+           "Typically needs to be specified when passing arbitrary arrays "
+           "to --input_arrays."),
       Flag("mean_value", parsed_flags.mean_value.bind(),
            parsed_flags.mean_value.default_value(),
-           "mean_value parameter for image models, used to compute input "
+           "Deprecated: use --mean_values instead. mean_value parameter for "
+           "image models, used to compute input "
            "activations from input pixel data."),
       Flag("mean_values", parsed_flags.mean_values.bind(),
            parsed_flags.mean_values.default_value(),
@@ -81,7 +97,8 @@ bool ParseModelFlagsFromCommandLineFlags(
            "--input_arrays."),
       Flag("std_value", parsed_flags.std_value.bind(),
            parsed_flags.std_value.default_value(),
-           "std_value parameter for image models, used to compute input "
+           "Deprecated: use --std_values instead. std_value parameter for "
+           "image models, used to compute input "
            "activations from input pixel data."),
       Flag("std_values", parsed_flags.std_values.bind(),
            parsed_flags.std_values.default_value(),
@@ -95,13 +112,6 @@ bool ParseModelFlagsFromCommandLineFlags(
            "exclusive "
            "with the 'batch' field: at most one of these two fields can be "
            "set."),
-      Flag(
-          "drop_control_dependency",
-          parsed_flags.drop_control_dependency.bind(),
-          parsed_flags.drop_control_dependency.default_value(),
-          "If true, ignore control dependency requirements in input TensorFlow "
-          "GraphDef. Otherwise an error will be raised upon control dependency "
-          "inputs."),
       Flag("rnn_states", parsed_flags.rnn_states.bind(),
            parsed_flags.rnn_states.default_value(), ""),
       Flag("model_checks", parsed_flags.model_checks.bind(),
@@ -124,6 +134,20 @@ bool ParseModelFlagsFromCommandLineFlags(
            parsed_flags.dump_graphviz_video.default_value(),
            "If true, will dump graphviz at each "
            "graph transformation, which may be used to generate a video."),
+      Flag("allow_nonexistent_arrays",
+           parsed_flags.allow_nonexistent_arrays.bind(),
+           parsed_flags.allow_nonexistent_arrays.default_value(),
+           "If true, will allow passing inexistent arrays in --input_arrays "
+           "and --output_arrays. This makes little sense, is only useful to "
+           "more easily get graph visualizations."),
+      Flag("allow_nonascii_arrays", parsed_flags.allow_nonascii_arrays.bind(),
+           parsed_flags.allow_nonascii_arrays.default_value(),
+           "If true, will allow passing non-ascii-printable characters in "
+           "--input_arrays and --output_arrays. By default (if false), only "
+           "ascii printable characters are allowed, i.e. character codes "
+           "ranging from 32 to 127. This is disallowed by default so as to "
+           "catch common copy-and-paste issues where invisible unicode "
+           "characters are unwittingly added to these strings."),
   };
   bool asked_for_help =
       *argc == 2 && (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-help"));
@@ -232,16 +256,33 @@ void ReadModelFlagsFromCommandLineFlags(
       CHECK(last != std_values[i].data());
     }
   }
+  if (parsed_model_flags.input_data_type.specified()) {
+    QCHECK(uses_single_input_flags);
+    IODataType type;
+    QCHECK(IODataType_Parse(parsed_model_flags.input_data_type.value(), &type));
+    model_flags->mutable_input_arrays(0)->set_data_type(type);
+  }
+  if (parsed_model_flags.input_data_types.specified()) {
+    QCHECK(uses_multi_input_flags);
+    std::vector<string> input_data_types =
+        absl::StrSplit(parsed_model_flags.input_data_types.value(), ',');
+    QCHECK(input_data_types.size() == model_flags->input_arrays_size());
+    for (int i = 0; i < input_data_types.size(); ++i) {
+      IODataType type;
+      QCHECK(IODataType_Parse(input_data_types[i], &type));
+      model_flags->mutable_input_arrays(i)->set_data_type(type);
+    }
+  }
   if (parsed_model_flags.input_shape.specified()) {
     QCHECK(uses_single_input_flags);
     if (model_flags->input_arrays().empty()) {
       model_flags->add_input_arrays();
     }
     auto* shape = model_flags->mutable_input_arrays(0)->mutable_shape();
-    shape->Clear();
+    shape->clear_dims();
     const IntList& list = parsed_model_flags.input_shape.value();
     for (auto& dim : list.elements) {
-      shape->Add(dim);
+      shape->add_dims(dim);
     }
   }
   if (parsed_model_flags.input_shapes.specified()) {
@@ -251,25 +292,12 @@ void ReadModelFlagsFromCommandLineFlags(
     QCHECK(input_shapes.size() == model_flags->input_arrays_size());
     for (int i = 0; i < input_shapes.size(); ++i) {
       auto* shape = model_flags->mutable_input_arrays(i)->mutable_shape();
-      shape->Clear();
-      if (input_shapes[i].empty()) {
-        // empty i.e. 0-dimensional input shape.
-        // Unfortunately, the current toco::InputArray
-        // proto does not allow to distinguish between a known 0-D shape,
-        // and an unknown shape. Indeed, shape is currently a plain array,
-        // and it being empty means unknown shape. So here, we import a
-        // 0-D shape as a 1-D shape of size.
-        // TODO(benoitjacob): fix toco::InputArray to allow 0-D shape,
-        // probably by making shape an optional message,
-        // encapsulating the array.
-        shape->Add(1);
-      } else {
-        for (const auto& dim_str : absl::StrSplit(input_shapes[i], ',')) {
-          int size;
-          CHECK(absl::SimpleAtoi(dim_str, &size))
-              << "Failed to parse input_shape: " << input_shapes[i];
-          shape->Add(size);
-        }
+      shape->clear_dims();
+      for (const auto& dim_str : absl::StrSplit(input_shapes[i], ',')) {
+        int size;
+        CHECK(absl::SimpleAtoi(dim_str, &size))
+            << "Failed to parse input_shape: " << input_shapes[i];
+        shape->add_dims(size);
       }
     }
   }
@@ -282,7 +310,6 @@ void ReadModelFlagsFromCommandLineFlags(
   } while (false)
 
   READ_MODEL_FLAG(variable_batch);
-  READ_MODEL_FLAG(drop_control_dependency);
 
 #undef READ_MODEL_FLAG
 
@@ -336,6 +363,11 @@ void ReadModelFlagsFromCommandLineFlags(
       }
     }
   }
+
+  model_flags->set_allow_nonascii_arrays(
+      parsed_model_flags.allow_nonascii_arrays.value());
+  model_flags->set_allow_nonexistent_arrays(
+      parsed_model_flags.allow_nonexistent_arrays.value());
 }
 
 ParsedModelFlags* UncheckedGlobalParsedModelFlags(bool must_already_exist) {
diff --git a/tensorflow/contrib/lite/toco/model_flags.proto b/tensorflow/contrib/lite/toco/model_flags.proto
index b016f34621286aa3127e4c31916440969c80de0c..13fea29a07ed9ea75ebe1b9b046f2a68d814c649 100644
--- a/tensorflow/contrib/lite/toco/model_flags.proto
+++ b/tensorflow/contrib/lite/toco/model_flags.proto
@@ -16,7 +16,11 @@ import "tensorflow/contrib/lite/toco/types.proto";
 
 package toco;
 
-// Next ID to USE: 5.
+message InputArrayShape {
+  repeated int32 dims = 2;
+}
+
+// Next ID to USE: 7.
 message InputArray {
   // Name of the input arrays, i.e. the arrays from which input activations
   // will be read.
@@ -28,7 +32,7 @@ message InputArray {
   //
   // The last dimension is typically called 'depth' or 'channels'. For example,
   // for an image model taking RGB images as input, this would have the value 3.
-  repeated int32 shape = 2;
+  optional InputArrayShape shape = 6;
 
   // mean_value and std_value parameters control the interpretation of raw input
   // activation values (elements of the input array) as real numbers. The
@@ -46,6 +50,50 @@ message InputArray {
   // (TensorFlow via LegacyFedInput).
   optional float mean_value = 3;
   optional float std_value = 4 [default = 1.];
+
+  // Data type of the input.
+  //
+  // In many graphs, the input arrays already have defined data types,
+  // e.g. Placeholder nodes in a TensorFlow GraphDef have a dtype attribute.
+  // In those cases, it is not needed to specify this data_type flag.
+  // The purpose of this flag is only to define the data type of input
+  // arrays whose type isn't defined in the input graph file. For example,
+  // when specifying an arbitrary (not Placeholder) --input_array into
+  // a TensorFlow GraphDef.
+  //
+  // When this data_type is quantized (e.g. QUANTIZED_UINT8), the
+  // corresponding quantization parameters are the mean_value, std_value
+  // fields.
+  //
+  // It is also important to understand the nuance between this data_type
+  // flag and the inference_input_type in TocoFlags. The basic difference
+  // is that this data_type (like all ModelFlags) describes a property
+  // of the input graph, while inference_input_type (like all TocoFlags)
+  // describes an aspect of the toco transformation process and thus of
+  // the output file. The types of input arrays may be different between
+  // the input and output files if quantization or dequantization occurred.
+  // Such differences can only occur for real-number data i.e. only
+  // between FLOAT and quantized types (e.g. QUANTIZED_UINT8).
+  optional IODataType data_type = 5;
+}
+
+message RnnState {
+  optional string state_array = 1;
+  optional string back_edge_source_array = 2;
+  optional bool discardable = 5;
+  // TODO(benoitjacob): drop the 'size' field. Should be redundant with
+  // --input_shapes and shapes propagation.
+  optional int32 size = 3;
+  // TODO(benoitjacob): manually_create is a temporary hack:
+  // due to discrepancies between the current toco dims tracking and
+  // TensorFlow shapes, for some models we need to manually create RNN state
+  // arrays with a specified shape.
+  // Maybe we should actually implement back-edges as operators of their own,
+  // which would remove the need for much special-casing, including here,
+  // we could probably consistently let PropagateFixedSizes handle state
+  // arrays.
+  // TODO(benoitjacob): should really drop manually_create now.
+  optional bool manually_create = 4;
 }
 
 // ModelFlags encodes properties of a model that, depending on the file
@@ -69,7 +117,7 @@ message InputArray {
 //   optional int32 input_dims = 11 [ default = 4];
 //   repeated int32 input_shape = 13;
 //
-// Next ID to USE: 16.
+// Next ID to USE: 18.
 message ModelFlags {
   // Information about the input arrays, i.e. the arrays from which input
   // activations will be read.
@@ -83,20 +131,6 @@ message ModelFlags {
   // the 'batch' field: at most one of these two fields can be set.
   optional bool variable_batch = 10;
 
-  message RnnState {
-    optional string state_array = 1;
-    optional string back_edge_source_array = 2;
-    optional int32 size = 3;
-    // TODO(benoitjacob): manually_create is a temporary hack:
-    // due to discrepancies between the current toco dims tracking and
-    // TensorFlow shapes, for some models we need to manually create RNN state
-    // arrays with a specified shape.
-    // Maybe we should actually implement back-edges as operators of their own,
-    // which would remove the need for much special-casing, including here,
-    // we could probably consistently let PropagateFixedSizes handle state
-    // arrays.
-    optional bool manually_create = 4;
-  }
   repeated RnnState rnn_states = 12;
 
   // Checks applied to the model, typically after toco's comprehensive
@@ -114,7 +148,16 @@ message ModelFlags {
   }
   repeated ModelCheck model_checks = 14;
 
-  // If true, ignore control dependency requirements in input TensorFlow
-  // GraphDef. Otherwise an error will be raised upon control dependency inputs.
-  optional bool drop_control_dependency = 15;
+  // If true, will allow passing inexistent arrays in --input_arrays
+  // and --output_arrays. This makes little sense, is only useful to
+  // more easily get graph visualizations.
+  optional bool allow_nonexistent_arrays = 16;
+
+  // If true, will allow passing non-ascii-printable characters in
+  // --input_arrays and --output_arrays. By default (if false), only
+  // ascii printable characters are allowed, i.e. character codes
+  // ranging from 32 to 127. This is disallowed by default so as to
+  // catch common copy-and-paste issues where invisible unicode
+  // characters are unwittingly added to these strings.
+  optional bool allow_nonascii_arrays = 17;
 }
diff --git a/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py b/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py
index ce19b7efbe087a0372a906195148f71339f228da..c35b6f99259b762aa83d92d21512169a7ab50b70 100644
--- a/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py
+++ b/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py
@@ -48,12 +48,12 @@ class TocoFromProtosTest(googletest.TestCase):
     toco_flags = toco_flags_pb2.TocoFlags()
     toco_flags.input_format = toco_flags_pb2.TENSORFLOW_GRAPHDEF
     toco_flags.output_format = toco_flags_pb2.TFLITE
-    toco_flags.input_types.append(types_pb2.FLOAT)
+    toco_flags.inference_input_type = types_pb2.FLOAT
     toco_flags.inference_type = types_pb2.FLOAT
     model_flags = model_flags_pb2.ModelFlags()
     input_array = model_flags.input_arrays.add()
     input_array.name = TensorName(in_tensor)
-    input_array.shape.extend(map(int, in_tensor.get_shape()))
+    input_array.shape.dims.extend(map(int, in_tensor.get_shape()))
     model_flags.output_arrays.append(TensorName(out_tensor))
     # Shell out to run toco (in case it crashes)
     with tempfile.NamedTemporaryFile() as fp_toco, \
diff --git a/tensorflow/contrib/lite/toco/tflite/BUILD b/tensorflow/contrib/lite/toco/tflite/BUILD
index e910e3957f77fcf28ab379026bae4cc33ed00bc5..332253a092aff812fb18601862c66bc0423599c2 100644
--- a/tensorflow/contrib/lite/toco/tflite/BUILD
+++ b/tensorflow/contrib/lite/toco/tflite/BUILD
@@ -1,3 +1,8 @@
+package(
+    # To suppress build cleaner error about inclusion of schema_generate.h.
+    features = ["-layering_check"],
+)
+
 licenses(["notice"])  # Apache 2.0
 
 load(
@@ -93,6 +98,7 @@ tf_cc_test(
     ],
     deps = [
         ":export",
+        "//tensorflow/contrib/lite/schema:schema_fbs",
         "@com_google_googletest//:gtest_main",
     ],
 )
diff --git a/tensorflow/contrib/lite/toco/tflite/export.cc b/tensorflow/contrib/lite/toco/tflite/export.cc
index beda710614fd607a2e373582620d24dc3656fcf4..bec694a23377c7c70684000069e9c08ee446b6c0 100644
--- a/tensorflow/contrib/lite/toco/tflite/export.cc
+++ b/tensorflow/contrib/lite/toco/tflite/export.cc
@@ -188,19 +188,26 @@ Offset<Vector<Offset<OperatorCode>>> ExportOperatorCodes(
     const details::OperatorKey operator_key = GetOperatorKey(*op);
     int op_index = operators_map.at(operator_key);
 
-    if (ops_by_type.count(op->type) == 0) {
-      LOG(FATAL) << "Unsupported operator: " << HelpfulOperatorTypeName(*op);
+    string name = HelpfulOperatorTypeName(*op);
+    bool is_builtin = false;
+    if (ops_by_type.count(op->type) != 0) {
+      name = ops_by_type.at(op->type)->name();
+      is_builtin = (builtin_ops.count(name) > 0);
     }
 
-    string name = ops_by_type.at(op->type)->name();
-    if (builtin_ops.count(name) > 0) {
+    if (is_builtin) {
       ordered_opcodes[op_index] =
           CreateOperatorCode(*builder, builtin_ops[name], 0);
     } else {
-      // If use the custom operation code if it's available in the OperatorKey.
+      // This could be a kTensorFlowUnsupported, in which case we should be
+      // able to retrieve the original Tensorflow name from the OperatorKey, or
+      // this could be a proper TOCO operator that is completely unknown to TF
+      // Lite.
       if (!operator_key.custom_code.empty()) {
         name = operator_key.custom_code;
       }
+      // Either way, this is an operator that is not supported by TF Lite,
+      // so we output it as a custom op and add it to the error summary.
       if (error_summary) {
         error_summary->insert(name);
       }
@@ -226,11 +233,6 @@ Offset<Vector<Offset<Operator>>> ExportOperators(
   // The operators are in execution order, so we just follow tf.mini order.
   std::vector<Offset<Operator>> op_vector;
   for (const auto& op : model.operators) {
-    if (ops_by_type.count(op->type) == 0) {
-      LOG(FATAL) << "Op type '" << OperatorTypeName(op->type)
-                 << "' not supported";
-    }
-
     std::vector<int32_t> inputs;
     for (const string& input : op->inputs) {
       inputs.push_back(tensors_map.at(input));
@@ -241,8 +243,15 @@ Offset<Vector<Offset<Operator>>> ExportOperators(
       outputs.push_back(tensors_map.at(output));
     }
 
-    auto options = ops_by_type.at(op->type)->Serialize(*op, builder);
     int op_index = operators_map.at(GetOperatorKey(*op));
+
+    // This is a custom op unless we can find it in ops_by_type, and even then
+    // it could be a custom op (such as kTensorFlowUnsupported).
+
+    auto options = Options::Custom(0);
+    if (ops_by_type.count(op->type) != 0) {
+      options = ops_by_type.at(op->type)->Serialize(*op, builder);
+    }
     // The only supported CustomOptionFormat is FLEXBUFFERS now.
     op_vector.push_back(CreateOperator(
         *builder, op_index, builder->CreateVector(inputs),
diff --git a/tensorflow/contrib/lite/toco/tflite/export_test.cc b/tensorflow/contrib/lite/toco/tflite/export_test.cc
index e395645383144f663fa108f05ca9930a56cf26a6..d4c4612d62c4eb5b14898eb8846314246ecbb815 100644
--- a/tensorflow/contrib/lite/toco/tflite/export_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/export_test.cc
@@ -16,12 +16,14 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
 
 namespace toco {
-
 namespace tflite {
 namespace {
 
+using ::testing::ElementsAre;
+
 class ExportTest : public ::testing::Test {
  protected:
   // This is a very simplistic model. We are not interested in testing all the
@@ -31,11 +33,20 @@ class ExportTest : public ::testing::Test {
   void BuildTestModel() {
     input_model_.GetOrCreateArray("tensor_one");
     input_model_.GetOrCreateArray("tensor_two");
-    input_model_.operators.emplace_back(new ConvOperator);
+    {
+      auto* op = new ConvOperator;
+      op->padding.type = PaddingType::kSame;
+      input_model_.operators.emplace_back(op);
+    }
     input_model_.operators.emplace_back(new AddOperator);
-    auto unsupported_operator = new TensorFlowUnsupportedOperator;
-    unsupported_operator->tensorflow_op = "MyCrazyOp";
-    input_model_.operators.emplace_back(unsupported_operator);
+    {
+      auto* op = new TensorFlowUnsupportedOperator;
+      op->tensorflow_op = "MyCrazyOp";
+      input_model_.operators.emplace_back(op);
+    }
+    // Note that Sub is not know to TF Lite, so it gets exported as a custom
+    // op (and no options).
+    input_model_.operators.emplace_back(new SubOperator);
   }
 
   Model input_model_;
@@ -57,13 +68,44 @@ TEST_F(ExportTest, LoadOperatorsMap) {
   details::LoadOperatorsMap(input_model_, &operators);
   EXPECT_EQ(0, operators[details::OperatorKey(OperatorType::kAdd, "")]);
   EXPECT_EQ(1, operators[details::OperatorKey(OperatorType::kConv, "")]);
-  EXPECT_EQ(2, operators[details::OperatorKey(
+  EXPECT_EQ(2, operators[details::OperatorKey(OperatorType::kSub, "")]);
+  EXPECT_EQ(3, operators[details::OperatorKey(
                    OperatorType::kTensorFlowUnsupported, "MyCrazyOp")]);
 }
 
+TEST_F(ExportTest, Export) {
+  BuildTestModel();
+
+  string result;
+  Export(input_model_, true, &result);
+
+  auto* model = ::tflite::GetModel(result.data());
+
+  std::vector<string> names;
+  for (const ::tflite::OperatorCode* opcode : *model->operator_codes()) {
+    if (opcode->builtin_code() != ::tflite::BuiltinOperator_CUSTOM) {
+      names.push_back(string("builtin:") + ::tflite::EnumNameBuiltinOperator(
+                                               opcode->builtin_code()));
+    } else {
+      names.push_back(string("custom:") + opcode->custom_code()->c_str());
+    }
+  }
+
+  EXPECT_THAT(names, ElementsAre("builtin:ADD", "builtin:CONV_2D", "custom:Sub",
+                                 "custom:MyCrazyOp"));
+
+  std::vector<uint32_t> indices;
+  auto operators = (*model->subgraphs())[0]->operators();
+  EXPECT_EQ(operators->Length(), 4);
+  for (const auto* op : *operators) {
+    indices.push_back(op->opcode_index());
+  }
+
+  EXPECT_THAT(indices, ElementsAre(1, 0, 3, 2));
+}
+
 // TODO(ahentz): tests for tensors, inputs, outpus, opcodes and operators.
 
 }  // namespace
 }  // namespace tflite
-
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index 8a33500ddcda67d97e68158ce40d8d7e086a27cc..ede6df88ab3a48f25c93f19c3d84a3ce5afc0450 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -130,6 +130,37 @@ class Add : public BuiltinOperator<AddOperator, ::tflite::AddOptions,
   }
 };
 
+class BatchToSpaceND
+    : public BuiltinOperator<BatchToSpaceNDOperator,
+                             ::tflite::BatchToSpaceNDOptions,
+                             ::tflite::BuiltinOptions_BatchToSpaceNDOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    auto block_shape = builder->CreateVector(op.block_shape);
+    auto before_crops = builder->CreateVector(op.before_crops);
+    auto after_crops = builder->CreateVector(op.after_crops);
+    return ::tflite::CreateBatchToSpaceNDOptions(*builder, block_shape,
+                                                 before_crops, after_crops);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->block_shape.insert(op->block_shape.end(),
+                           options.block_shape()->begin(),
+                           options.block_shape()->end());
+    op->before_crops.insert(op->before_crops.end(),
+                            options.before_crops()->begin(),
+                            options.before_crops()->end());
+    op->after_crops.insert(op->after_crops.end(),
+                           options.after_crops()->begin(),
+                           options.after_crops()->end());
+  }
+};
+
 class Cast : public CustomOperator<CastOperator> {
  public:
   using CustomOperator::CustomOperator;
@@ -153,12 +184,12 @@ class Concatenation
   flatbuffers::Offset<TfLiteOptions> WriteOptions(
       const TocoOperator& op,
       flatbuffers::FlatBufferBuilder* builder) const override {
-    return ::tflite::CreateConcatenationOptions(*builder, op.concat_dim);
+    return ::tflite::CreateConcatenationOptions(*builder, op.axis);
   }
 
   void ReadOptions(const TfLiteOptions& options,
                    TocoOperator* op) const override {
-    op->concat_dim = options.axis();
+    op->axis = options.axis();
   }
 };
 
@@ -211,6 +242,22 @@ class FullyConnected
   }
 };
 
+class Gather : public BuiltinOperator<GatherOperator, ::tflite::GatherOptions,
+                                      ::tflite::BuiltinOptions_GatherOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateGatherOptions(*builder, op.axis);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->axis = options.axis();
+  }
+};
+
 class Svdf : public BuiltinOperator<SvdfOperator, ::tflite::SVDFOptions,
                                     ::tflite::BuiltinOptions_SVDFOptions> {
  public:
@@ -348,6 +395,30 @@ class Mul : public BuiltinOperator<MulOperator, ::tflite::MulOptions,
   }
 };
 
+class Pad : public BuiltinOperator<PadOperator, ::tflite::PadOptions,
+                                   ::tflite::BuiltinOptions_PadOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    auto before_padding = builder->CreateVector(op.left_padding);
+    auto after_padding = builder->CreateVector(op.right_padding);
+    return ::tflite::CreatePadOptions(*builder, before_padding, after_padding);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->left_padding.insert(op->left_padding.end(),
+                            options.before_padding()->begin(),
+                            options.before_padding()->end());
+    op->right_padding.insert(op->right_padding.end(),
+                             options.after_padding()->begin(),
+                             options.after_padding()->end());
+  }
+};
+
 class Reshape
     : public BuiltinOperator<TensorFlowReshapeOperator,
                              ::tflite::ReshapeOptions,
@@ -531,6 +602,9 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
   ops.emplace_back(new Add(::tflite::BuiltinOperator_ADD, OperatorType::kAdd));
   ops.emplace_back(new AveragePool(::tflite::BuiltinOperator_AVERAGE_POOL_2D,
                                    OperatorType::kAveragePool));
+  ops.emplace_back(
+      new BatchToSpaceND(::tflite::BuiltinOperator_BATCH_TO_SPACE_ND,
+                         OperatorType::kBatchToSpaceND));
   ops.emplace_back(new Concatenation(::tflite::BuiltinOperator_CONCATENATION,
                                      OperatorType::kConcatenation));
   ops.emplace_back(
@@ -540,6 +614,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
                                OperatorType::kDepthwiseConv));
   ops.emplace_back(new FullyConnected(::tflite::BuiltinOperator_FULLY_CONNECTED,
                                       OperatorType::kFullyConnected));
+  ops.emplace_back(
+      new Gather(::tflite::BuiltinOperator_GATHER, OperatorType::kGather));
   ops.emplace_back(
       new L2Normalization(::tflite::BuiltinOperator_L2_NORMALIZATION,
                           OperatorType::kL2Normalization));
@@ -551,6 +627,7 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
   ops.emplace_back(new MaxPool(::tflite::BuiltinOperator_MAX_POOL_2D,
                                OperatorType::kMaxPool));
   ops.emplace_back(new Mul(::tflite::BuiltinOperator_MUL, OperatorType::kMul));
+  ops.emplace_back(new Pad(::tflite::BuiltinOperator_PAD, OperatorType::kPad));
   ops.emplace_back(new Reshape(::tflite::BuiltinOperator_RESHAPE,
                                OperatorType::kTensorFlowReshape));
   ops.emplace_back(
@@ -571,6 +648,7 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
 
   // There operators are supported by Toco, but not by TF Lite, and has no
   // attributes.
+  ops.emplace_back(new SimpleOperator<NegOperator>("NEG", OperatorType::kNeg));
   ops.emplace_back(new SimpleOperator<TensorFlowRsqrtOperator>(
       "RSQRT", OperatorType::kTensorFlowRsqrt));
   ops.emplace_back(
@@ -581,8 +659,6 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
       "DEQUANTIZE", OperatorType::kDequantize));
   ops.emplace_back(
       new SimpleOperator<FloorOperator>("FLOOR", OperatorType::kFloor));
-  ops.emplace_back(
-      new SimpleOperator<GatherOperator>("GATHER", OperatorType::kGather));
   ops.emplace_back(
       new SimpleOperator<ReluOperator>("RELU", OperatorType::kRelu));
   ops.emplace_back(
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index 8e77c56d8aaa88d5c801ae246e1ee63e40b6f955..735eea4ddcb3906b8fdc2fa4576b55736ad4382f 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -101,7 +101,6 @@ TEST_F(OperatorTest, SimpleOperators) {
   CheckSimpleOperator<DequantizeOperator>("DEQUANTIZE",
                                           OperatorType::kDequantize);
   CheckSimpleOperator<FloorOperator>("FLOOR", OperatorType::kFloor);
-  CheckSimpleOperator<GatherOperator>("GATHER", OperatorType::kGather);
   CheckSimpleOperator<ReluOperator>("RELU", OperatorType::kRelu);
   CheckSimpleOperator<Relu1Operator>("RELU1", OperatorType::kRelu1);
   CheckSimpleOperator<Relu6Operator>("RELU6", OperatorType::kRelu6);
@@ -120,6 +119,19 @@ TEST_F(OperatorTest, BuiltinAdd) {
             output_toco_op->fused_activation_function);
 }
 
+TEST_F(OperatorTest, BuiltinBatchToSpaceND) {
+  BatchToSpaceNDOperator op;
+  op.block_shape = {2, 2};
+  op.before_crops = {1, 2};
+  op.after_crops = {3, 4};
+
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("BATCH_TO_SPACE_ND", OperatorType::kBatchToSpaceND), op);
+  EXPECT_EQ(op.block_shape, output_toco_op->block_shape);
+  EXPECT_EQ(op.before_crops, output_toco_op->before_crops);
+  EXPECT_EQ(op.after_crops, output_toco_op->after_crops);
+}
+
 TEST_F(OperatorTest, CustomCast) {
   CastOperator op;
   op.src_data_type = ArrayDataType::kFloat;
@@ -132,10 +144,10 @@ TEST_F(OperatorTest, CustomCast) {
 
 TEST_F(OperatorTest, CustomConcatenation) {
   ConcatenationOperator op;
-  op.concat_dim = 123;
+  op.axis = 123;
   auto output_toco_op = SerializeAndDeserialize(
       GetOperator("CONCATENATION", OperatorType::kConcatenation), op);
-  EXPECT_EQ(op.concat_dim, output_toco_op->concat_dim);
+  EXPECT_EQ(op.axis, output_toco_op->axis);
 }
 
 TEST_F(OperatorTest, CustomDepthToSpace) {
@@ -167,6 +179,13 @@ TEST_F(OperatorTest, CustomFullyConnected) {
             output_toco_op->fused_activation_function);
 }
 
+TEST_F(OperatorTest, BuiltinGather) {
+  GatherOperator op;
+  auto output_toco_op =
+      SerializeAndDeserialize(GetOperator("GATHER", OperatorType::kGather), op);
+  ASSERT_NE(nullptr, output_toco_op.get());
+}
+
 TEST_F(OperatorTest, BuiltinL2Pool) {
   L2PoolOperator op;
   op.stride_width = 123;
@@ -215,6 +234,16 @@ TEST_F(OperatorTest, BuiltinMaxPool) {
   EXPECT_EQ(op.kheight, output_toco_op->kheight);
 }
 
+TEST_F(OperatorTest, BuiltinPad) {
+  PadOperator op;
+  op.left_padding = {1, 2, 3};
+  op.right_padding = {1, 2, 3};
+  auto output_toco_op =
+      SerializeAndDeserialize(GetOperator("PAD", OperatorType::kPad), op);
+  EXPECT_EQ(op.left_padding, output_toco_op->left_padding);
+  EXPECT_EQ(op.right_padding, output_toco_op->right_padding);
+}
+
 TEST_F(OperatorTest, BuiltinReshape) {
   TensorFlowReshapeOperator op;
   op.shape = {1, 2, 4, 5, 8};
diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
index d43c3b4a8ee59893d7d294b76bbe7238a64dc609..f8281f3a5725283d472e5e1a36e4d904b4dc1c49 100644
--- a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
@@ -55,25 +55,25 @@ bool ParseTocoFlagsFromCommandLineFlags(
            parsed_flags.default_ranges_max.default_value(),
            "If defined, will be used as the default value for the max bound "
            "of min/max ranges used for quantization."),
+      Flag("inference_type", parsed_flags.inference_type.bind(),
+           parsed_flags.inference_type.default_value(),
+           "Target data type of arrays in the output file (for input_arrays, "
+           "this may be overridden by inference_input_type)."),
+      Flag("inference_input_type", parsed_flags.inference_input_type.bind(),
+           parsed_flags.inference_input_type.default_value(),
+           "Target data type of input arrays. If not specified, inference_type "
+           "is used."),
       Flag("input_type", parsed_flags.input_type.bind(),
            parsed_flags.input_type.default_value(),
-           "Data type of the input array in the "
-           "output file. "),
+           "Deprecated ambiguous flag that set both --input_data_types and "
+           "--inference_input_type."),
       Flag("input_types", parsed_flags.input_types.bind(),
            parsed_flags.input_types.default_value(),
-           "Data types of the input arrays in the "
-           "output file. "
-           "Comma-separated list matching the enumeration order of "
-           "input_arrays."),
-      Flag("inference_type", parsed_flags.inference_type.bind(),
-           parsed_flags.inference_type.default_value(),
-           "Data type, in the output file, of internal and output arrays "
-           "that are FLOAT in the input file. Thus, the value FLOAT means "
-           "keep doing floating-point inference, while the value "
-           "QUANTIZED_UINT8 means replace all internal floating-point "
-           "arithmetic by integer arithmetic producing 8-bit integer "
-           "activations instead of float activations --- which we call "
-           "\'quantized inference\'."),
+           "Deprecated ambiguous flag that set both --input_data_types and "
+           "--inference_input_type. Was meant to be a "
+           "comma-separated list, but this was deprecated before "
+           "multiple-input-types was ever properly supported."),
+
       Flag("drop_fake_quant", parsed_flags.drop_fake_quant.bind(),
            parsed_flags.drop_fake_quant.default_value(),
            "Ignore and discard FakeQuant nodes. For instance, that can be used "
@@ -105,6 +105,13 @@ bool ParseTocoFlagsFromCommandLineFlags(
            parsed_flags.allow_custom_ops.default_value(),
            "If true, allow TOCO to create TF Lite Custom operators for all the"
            "unsupported Tensorflow ops."),
+      Flag(
+          "drop_control_dependency",
+          parsed_flags.drop_control_dependency.bind(),
+          parsed_flags.drop_control_dependency.default_value(),
+          "If true, ignore control dependency requirements in input TensorFlow "
+          "GraphDef. Otherwise an error will be raised upon control dependency "
+          "inputs."),
   };
   bool asked_for_help =
       *argc == 2 && (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-help"));
@@ -135,7 +142,6 @@ void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags,
           << #name;                                                          \
     }                                                                        \
   } while (false)
-
 #define READ_TOCO_FLAG(name, requirement)                     \
   ENFORCE_FLAG_REQUIREMENT(name, requirement);                \
   do {                                                        \
@@ -158,49 +164,49 @@ void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags,
 
   PARSE_TOCO_FLAG(FileFormat, input_format, FlagRequirement::kMustBeSpecified);
   PARSE_TOCO_FLAG(FileFormat, output_format, FlagRequirement::kMustBeSpecified);
-  FlagRequirement tflite_flags_requirement =
-      toco_flags->output_format() == TFLITE
-          ? FlagRequirement::kMustBeSpecified
-          : FlagRequirement::kMustNotBeSpecified;
-  PARSE_TOCO_FLAG(IODataType, inference_type, tflite_flags_requirement);
+  PARSE_TOCO_FLAG(IODataType, inference_type, FlagRequirement::kNone);
+  PARSE_TOCO_FLAG(IODataType, inference_input_type, FlagRequirement::kNone);
   READ_TOCO_FLAG(default_ranges_min, FlagRequirement::kNone);
   READ_TOCO_FLAG(default_ranges_max, FlagRequirement::kNone);
   READ_TOCO_FLAG(drop_fake_quant, FlagRequirement::kNone);
   READ_TOCO_FLAG(reorder_across_fake_quant, FlagRequirement::kNone);
   READ_TOCO_FLAG(allow_custom_ops, FlagRequirement::kNone);
+  READ_TOCO_FLAG(drop_control_dependency, FlagRequirement::kNone);
 
-#undef READ_TOCO_FLAG
-#undef PARSE_TOCO_FLAG
-
-  const bool input_type_specified = parsed_toco_flags.input_type.specified();
-  const bool input_types_specified = parsed_toco_flags.input_types.specified();
-  if (toco_flags->output_format() == TFLITE) {
-    QCHECK(input_type_specified || input_types_specified)
-        << "When output_format=TFLITE, either input_type or input_types needs "
-           "to be specified.";
-  } else {
-    QCHECK(!input_type_specified && !input_types_specified)
-        << "With this output_format, neither input_type nor input_types must "
-           "be specified.";
-  }
-  QCHECK(!(input_type_specified && input_types_specified))
-      << "input_type and input_types are mutually exclusive";
-  if (input_type_specified) {
-    IODataType type;
-    QCHECK(IODataType_Parse(parsed_toco_flags.input_type.value(), &type))
-        << "Unrecognized input_type: " << parsed_toco_flags.input_type.value();
-    toco_flags->add_input_types(type);
+  // Deprecated flag handling.
+  if (parsed_toco_flags.input_type.specified()) {
+    LOG(WARNING)
+        << "--input_type is deprecated. It was an ambiguous flag that set both "
+           "--input_data_types and --inference_input_type. If you are trying "
+           "to complement the input file with information about the type of "
+           "input arrays, use --input_data_type. If you are trying to control "
+           "the quantization/dequantization of real-numbers input arrays in "
+           "the output file, use --inference_input_type.";
+    toco::IODataType input_type;
+    QCHECK(toco::IODataType_Parse(parsed_toco_flags.input_type.value(),
+                                  &input_type));
+    toco_flags->set_inference_input_type(input_type);
   }
-  if (input_types_specified) {
+  if (parsed_toco_flags.input_types.specified()) {
+    LOG(WARNING)
+        << "--input_types is deprecated. It was an ambiguous flag that set "
+           "both --input_data_types and --inference_input_type. If you are "
+           "trying to complement the input file with information about the "
+           "type of input arrays, use --input_data_type. If you are trying to "
+           "control the quantization/dequantization of real-numbers input "
+           "arrays in the output file, use --inference_input_type.";
     std::vector<string> input_types =
         absl::StrSplit(parsed_toco_flags.input_types.value(), ',');
-    for (const string& t : input_types) {
-      IODataType type;
-      QCHECK(IODataType_Parse(t, &type))
-          << "Unrecognized input_types value " << t
-          << " in input_types=" << parsed_toco_flags.input_types.value();
-      toco_flags->add_input_types(type);
+    QCHECK(!input_types.empty());
+    for (int i = 1; i < input_types.size(); i++) {
+      QCHECK_EQ(input_types[i], input_types[0]);
     }
+    toco::IODataType input_type;
+    QCHECK(toco::IODataType_Parse(input_types[0], &input_type));
+    toco_flags->set_inference_input_type(input_type);
   }
+
+#undef READ_TOCO_FLAG
+#undef PARSE_TOCO_FLAG
 }
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/toco_flags.proto b/tensorflow/contrib/lite/toco/toco_flags.proto
index e900e1a25aa0ec20db5d09cef252d6d8143b4cab..3b9d7e22570b66aef2c9fc819e5ab4ec38e179f5 100644
--- a/tensorflow/contrib/lite/toco/toco_flags.proto
+++ b/tensorflow/contrib/lite/toco/toco_flags.proto
@@ -36,7 +36,7 @@ enum FileFormat {
 // are not normally encoded in model files and in general may not be thought
 // of as properties of models, instead describing how models are to be
 // processed in the context of the present tooling job.
-// Next Id: 11
+// Next Id: 13
 message TocoFlags {
   // Input file format
   optional FileFormat input_format = 1;
@@ -44,23 +44,47 @@ message TocoFlags {
   // Output file format
   optional FileFormat output_format = 2;
 
-  // Numeric data types of the input arrays in the output format.
-  // This controls what input types the output file will be expecting.
-  // This is not a description of the input types of the input file.
-  // For example, the input file may have a float input placeholder,
-  // but we may want to generate a quantized TFLite file from it,
-  // or a float TFLite file taking a quantized input.
+  // Similar to inference_type, but allows to control specifically the
+  // quantization of input arrays, separately from other arrays.
   //
-  // The length of this list should match the length of the input_arrays
-  // list in ModelFlags.
-  repeated IODataType input_types = 9;
+  // If not set, then the value of inference_type is implicitly used, i.e.
+  // by default input arrays are quantized like other arrays.
+  //
+  // Like inference_type, this only affects real-number arrays. By "real-number"
+  // we mean float arrays, and quantized arrays. This excludes plain
+  // integer arrays, strings arrays, and every other data type.
+  //
+  // The typical use for this flag is for vision models taking a bitmap
+  // as input, typically with uint8 channels, yet still requiring floating-point
+  // inference. For such image models, the uint8 input is quantized, i.e.
+  // the uint8 values are interpreted as real numbers, and the quantization
+  // parameters used for such input arrays are their mean_value, std_value
+  // parameters.
+  optional IODataType inference_input_type = 11;
 
-  // Numeric data type of the internal activations array and output array.
+  // Sets the type of real-number arrays in the output file, that is, controls
+  // the representation (quantization) of real numbers in the output file,
+  // except for input arrays, which are controlled by inference_input_type.
+  //
+  // NOTE: this flag only impacts real-number arrays. By "real-number"
+  // we mean float arrays, and quantized arrays. This excludes plain
+  // integer arrays, strings arrays, and every other data type.
+  //
+  // For real-number arrays, the impact of this flag is to allow the output
+  // file to choose a different real-numbers representation (quantization)
+  // from what the input file used. For any other types of arrays, changing
+  // the data type would not make sense.
+  //
+  // Specifically:
+  //    - If FLOAT, then real-numbers arrays will be of type float in
+  //      the output file. If they were quantized in the input file, then
+  //      they get dequantized.
+  //    - If QUANTIZED_UINT8, then real-numbers arrays will be quantized
+  //      as uint8 in the output file. If they were float in the input file,
+  //      then they get quantized.
+  //    - If not set, then all real-numbers arrays retain the same type in the
+  //      output file as they have in the input file.
   //
-  // As a matter of implementation detail, most model
-  // parameter arrays (weights, etc) will tend to also use this data type.
-  // Not all will, though: for instance, bias vectors will typically
-  // get quantized as int32 when weights and activations get quantized as uint8.
   optional IODataType inference_type = 4;
 
   // default_ranges_min and default_ranges_max are helpers to experiment
@@ -104,4 +128,12 @@ message TocoFlags {
   // If true, allow TOCO to create TF Lite Custom operators for all the
   // unsupported Tensorflow ops.
   optional bool allow_custom_ops = 10;
+
+  // Applies only to the case when the input format is TENSORFLOW_GRAPHDEF.
+  // If true, then control dependencies will be immediately dropped during
+  // import.
+  // If not set, the default behavior is as follows:
+  //    - Default to false if the output format is TENSORFLOW_GRAPHDEF.
+  //    - Default to true in all other cases.
+  optional bool drop_control_dependency = 12;
 }
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index 232538a84123050c722929536f94d780d8da624e..d6652b7a41b5e3694bccd855e3f632c4c9351a03 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -51,6 +51,7 @@ void CheckUnsupportedOperations(const Model& model) {
 void MakeGeneralGraphTransformationsSet(
     GraphTransformationsSet* transformations) {
   CHECK(transformations->empty());
+  transformations->Add(new ConvertExpandDimsToReshape);
   transformations->Add(new ResolveReshapeAttributes);
   transformations->Add(new PropagateArrayDataTypes);
   transformations->Add(new PropagateFixedSizes);
@@ -77,6 +78,7 @@ void MakeGeneralGraphTransformationsSet(
   transformations->Add(new IdentifyRelu1);
   transformations->Add(new RemoveTrivialBinaryOperator);
   transformations->Add(new ReadFakeQuantMinMax);
+  transformations->Add(new ResolveBatchToSpaceNDAttributes);
   transformations->Add(new ResolvePadAttributes);
   transformations->Add(new ResolveStridedSliceAttributes);
   transformations->Add(new ResolveSliceAttributes);
@@ -85,40 +87,57 @@ void MakeGeneralGraphTransformationsSet(
   transformations->Add(new MakeInitialDequantizeOperator);
 }
 
-void SetArrayFinalDataTypes(const TocoFlags& toco_flags, Model* model) {
-  const bool output_is_tflite = toco_flags.output_format() == TFLITE;
-
-  if (output_is_tflite) {
-    if (!toco_flags.input_types().empty()) {
-      for (int i = 0; i < model->flags.input_arrays_size(); i++) {
-        int input_types_index = toco_flags.input_types_size() == 1 ? 0 : i;
-        const auto input_type = toco_flags.input_types(input_types_index);
-        ArrayDataType final_data_type = ArrayDataType::kNone;
-        switch (input_type) {
-          case FLOAT:
-            final_data_type = ArrayDataType::kFloat;
-            break;
-          case QUANTIZED_UINT8:
-            final_data_type = ArrayDataType::kUint8;
-            break;
-          case INT32:
-            final_data_type = ArrayDataType::kInt32;
-            break;
-          case INT64:
-            final_data_type = ArrayDataType::kInt64;
-            break;
-          default:
-            LOG(FATAL) << "Unknown data type";
-        }
-        model->arrays[model->flags.input_arrays(i).name()]->final_data_type =
-            final_data_type;
-      }
-    }
+bool SupportsQuantization(FileFormat format) {
+  return (format == GRAPHVIZ_DOT || format == TFLITE);
+  ;
+}
+
+bool SupportsFusedActivationFunction(FileFormat format) {
+  return (format == GRAPHVIZ_DOT || format == TFLITE);
+}
+
+bool SupportsLstmCell(FileFormat format) {
+  return (format == TENSORFLOW_GRAPHDEF || format == GRAPHVIZ_DOT);
+}
+
+bool SupportsPreallocatedWorkspace(FileFormat format) {
+  return (format == TFLITE);
+}
+
+bool IsRealValued(toco::ArrayDataType type) {
+  return static_cast<bool>(type == toco::ArrayDataType::kFloat ||
+                           type == toco::ArrayDataType::kUint8);
+}
+
+void SetFinalDataTypeOnInputs(const TocoFlags& toco_flags, Model* model) {
+  const FileFormat output_format = toco_flags.output_format();
+  ArrayDataType type;
+  if (toco_flags.has_inference_input_type()) {
+    type = ConvertIODataTypeToArrayDataType(toco_flags.inference_input_type());
+  } else if (toco_flags.has_inference_type()) {
+    type = ConvertIODataTypeToArrayDataType(toco_flags.inference_type());
+  } else if (!SupportsQuantization(output_format)) {
+    // Data type is implicitly float for non-quantized formats
+    type = ArrayDataType::kFloat;
   } else {
-    for (int i = 0; i < model->flags.input_arrays_size(); i++) {
-      model->arrays[model->flags.input_arrays(i).name()]->final_data_type =
-          ArrayDataType::kFloat;
+    // Nothing to do. Data types stay as-is.
+    return;
+  }
+
+  for (int i = 0; i < model->flags.input_arrays_size(); i++) {
+    string const& array_name = model->flags.input_arrays(i).name();
+    auto* array = model->arrays[array_name].get();
+    // Note that the notion of changing data types only applies to real-numbers
+    // arrays (see the documentation for inference_input_type).
+    // TODO(benoitjacob) this is assuming that uint8 arrays are quantized,
+    // i.e. represent real numbers by means of quantization parameters,
+    // and not plain integer uint8 input arrays.
+    if (!IsRealValued(array->data_type)) {
+      // Ignore non-real data types.
+      continue;
     }
+
+    array->final_data_type = type;
   }
 }
 
@@ -129,9 +148,16 @@ std::unique_ptr<Model> Import(const TocoFlags& toco_flags,
                               const string& input_file_contents) {
   std::unique_ptr<Model> model;
   switch (toco_flags.input_format()) {
-    case TENSORFLOW_GRAPHDEF:
-      model = ImportTensorFlowGraphDef(model_flags, input_file_contents);
+    case TENSORFLOW_GRAPHDEF: {
+      TensorFlowImportFlags tf_import_flags;
+      tf_import_flags.drop_control_dependency =
+          toco_flags.has_drop_control_dependency()
+              ? toco_flags.drop_control_dependency()
+              : (toco_flags.output_format() != TENSORFLOW_GRAPHDEF);
+      model = ImportTensorFlowGraphDef(model_flags, tf_import_flags,
+                                       input_file_contents);
       break;
+    }
     case TFLITE:
       model = toco::tflite::Import(model_flags, input_file_contents);
       ResolveModelFlags(model_flags, model.get());
@@ -150,31 +176,21 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
   const FileFormat output_format = toco_flags.output_format();
   const IODataType inference_type = toco_flags.inference_type();
 
-  const bool output_is_tflite = output_format == TFLITE;
-
-  const bool output_is_tflite_quantized =
-      output_is_tflite && inference_type == QUANTIZED_UINT8;
+  const bool quantize_output =
+      SupportsQuantization(output_format) && inference_type == QUANTIZED_UINT8;
 
-  if (output_is_tflite) {
-    QCHECK(toco_flags.input_types_size() == 1 ||
-           toco_flags.input_types_size() == model->flags.input_arrays_size())
-        << "Mismatched numbers of input_arrays and input_types";
-  }
-
-  if (output_is_tflite_quantized) {
-    for (const auto& input_type : toco_flags.input_types()) {
-      QCHECK_NE(input_type, FLOAT)
-          << "Quantized inference is not allowed with float inputs.";
-    }
+  if (quantize_output) {
+    QCHECK_NE(toco_flags.inference_input_type(), FLOAT)
+        << "Quantized inference is not allowed with float inputs.";
   }
 
-  SetArrayFinalDataTypes(toco_flags, model);
+  SetFinalDataTypeOnInputs(toco_flags, model);
 
   GraphTransformationsSet transformations;
   MakeGeneralGraphTransformationsSet(&transformations);
   auto* remove_trivial_reshape = new RemoveTrivialReshape;
   transformations.Add(remove_trivial_reshape);
-  if (output_format == TFLITE) {
+  if (SupportsFusedActivationFunction(output_format)) {
     transformations.Add(new FuseActivationFunctions);
   } else {
     transformations.Add(new UnfuseActivationFunctions);
@@ -188,34 +204,33 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
     // See the doc for --reorder_across_fake_quant: that flag is needed to
     // support some existing models, e.g. WordLens, that have FakeQuant
     // nodes in the wrong places.
-    // We currently unconditionally enable that behavior when the output
-    // format is DarwiNN because the DarwiNN test code does not make it
-    // easy to pass a new toco flag. Once that is resolved on the DarwiNN
-    // tests side, the special-casing of DarwiNN here can go away.
-    // TODO(benoitjacob): so drop it when we can.
-    if ((output_is_tflite_quantized &&
-         toco_flags.reorder_across_fake_quant())) {
+    // TODO(benoitjacob): drop special casing when we can.
+    if ((quantize_output && toco_flags.reorder_across_fake_quant())) {
       transformations.Add(new DropFakeQuant);
     }
   }
   transformations.Add(new ConvertPureConvToDepthwise);
   // TFLite export does not yet support fused LSTM cell.
-  if (output_format == TENSORFLOW_GRAPHDEF) {
+  if (SupportsLstmCell(output_format)) {
     transformations.Add(new IdentifyLstmCell);
   }
   transformations.Add(new ResolveConstantConcatenation);
   RunGraphTransformations(model, "general graph transformations",
                           transformations);
-  if (output_is_tflite_quantized) {
+  if (quantize_output) {
     RunGraphTransformations(model, "pre-quantization graph transformations",
                             {new HardcodeMinMax, new DropFakeQuant});
   }
 
-  if (output_is_tflite_quantized) {
+  if (quantize_output) {
     if (toco_flags.has_default_ranges_min() &&
         toco_flags.has_default_ranges_max()) {
       UseDefaultMinMaxRangeValues(model, toco_flags.default_ranges_min(),
                                   toco_flags.default_ranges_max());
+      // The new MinMax info may need to be propagated a bit.
+      RunGraphTransformations(
+          model, "default min-max range propagation graph transformations",
+          {new HardcodeMinMax});
     }
     CheckIsReadyForQuantization(*model);
     RunGraphTransformations(
@@ -242,7 +257,7 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
     CheckUnsupportedOperations(*model);
   }
 
-  if (output_is_tflite) {
+  if (SupportsPreallocatedWorkspace(output_format)) {
     AllocateTransientArrays(model, kDefaultTransientDataAlignment);
     LogDump(kLogLevelModelChanged, "AFTER ALLOCATION", *model);
   }
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index bcbfed62d305fd05c1ad162d74d587ce28c7fbbe..381168d15a5dff7a64a28906c183486d5149106a 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <unordered_set>
 #include <utility>
 
+#include "absl/strings/ascii.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
 #include "absl/strings/str_replace.h"
@@ -30,7 +31,6 @@ limitations under the License.
 #include "tensorflow/contrib/lite/toco/toco_port.h"
 #include "tensorflow/core/platform/logging.h"
 
-
 namespace toco {
 
 string LogName(const Operator& op) {
@@ -223,6 +223,10 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(Tanh)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowAll)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowAssert)
+    HANDLE_OPERATORTYPENAME_CASE(ExpandDims)
+    HANDLE_OPERATORTYPENAME_CASE(Fill)
+    HANDLE_OPERATORTYPENAME_CASE(FloorMod)
+    HANDLE_OPERATORTYPENAME_CASE(FloorDiv)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowGreater)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowGreaterEqual)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowIdentity)
@@ -234,8 +238,12 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowMerge)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowMin)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowMinimum)
+    HANDLE_OPERATORTYPENAME_CASE(Neg)
     HANDLE_OPERATORTYPENAME_CASE(Pad)
     HANDLE_OPERATORTYPENAME_CASE(StridedSlice)
+    HANDLE_OPERATORTYPENAME_CASE(Stack)
+    HANDLE_OPERATORTYPENAME_CASE(Range)
+    HANDLE_OPERATORTYPENAME_CASE(Rank)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowReshape)
     HANDLE_OPERATORTYPENAME_CASE(Squeeze)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowRsqrt)
@@ -248,6 +256,8 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(Sub)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowSum)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowTile)
+    HANDLE_OPERATORTYPENAME_CASE(Transpose)
+    HANDLE_OPERATORTYPENAME_CASE(TransposeConv)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowConcat)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowConcatV2)
     HANDLE_OPERATORTYPENAME_CASE(Cast)
@@ -258,6 +268,7 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(BatchToSpaceND)
     HANDLE_OPERATORTYPENAME_CASE(Mean)
     HANDLE_OPERATORTYPENAME_CASE(Svdf)
+    HANDLE_OPERATORTYPENAME_CASE(ArgMax)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowUnsupported)
     default:
       LOG(FATAL) << "Unhandled op type";
@@ -294,6 +305,7 @@ void LogArray(int log_level, const Model& model, const string& name) {
   VLOG(log_level) << "Array: " << name;
   switch (array.data_type) {
     case ArrayDataType::kNone:
+      VLOG(log_level) << "  Data type:";
       break;
     case ArrayDataType::kFloat:
       VLOG(log_level) << "  Data type: kFloat";
@@ -309,6 +321,24 @@ void LogArray(int log_level, const Model& model, const string& name) {
                       << static_cast<int>(array.data_type) << ")";
       break;
   }
+  switch (array.final_data_type) {
+    case ArrayDataType::kNone:
+      VLOG(log_level) << "  Final type:";
+      break;
+    case ArrayDataType::kFloat:
+      VLOG(log_level) << "  Final type: kFloat";
+      break;
+    case ArrayDataType::kInt32:
+      VLOG(log_level) << "  Final type: kInt32";
+      break;
+    case ArrayDataType::kUint8:
+      VLOG(log_level) << "  Final type: kUint8";
+      break;
+    default:
+      VLOG(log_level) << "  Final type: other (numerical value: "
+                      << static_cast<int>(array.data_type) << ")";
+      break;
+  }
   if (array.buffer) {
     VLOG(log_level) << "  Constant Buffer";
   }
@@ -536,15 +566,65 @@ bool IsConstantParameterArray(const Model& model, const string& name) {
   return !!model.arrays.at(name)->buffer;
 }
 
-void CheckNoMissingArray(const Model& model) {
-  for (const auto& op : model.operators) {
-    for (const auto& input : op->inputs) {
-      CHECK(model.arrays.count(input));
+namespace {
+void CheckInputArraysAreNotOutputArrays(const ModelFlags& model_flags) {
+  for (const auto& input_array : model_flags.input_arrays()) {
+    for (const string& output_array : model_flags.output_arrays()) {
+      QCHECK_NE(input_array.name(), output_array)
+          << "The array " << output_array
+          << " is listed in both --input_arrays and --output_arrays.";
     }
-    for (const auto& output : op->outputs) {
-      CHECK(model.arrays.count(output));
+  }
+}
+
+bool IsAsciiPrintable(const string& name) {
+  for (char c : name) {
+    if (!absl::ascii_isprint(c)) {
+      return false;
     }
   }
+  return true;
+}
+
+string DumpAscii(const string& name) {
+  string result;
+  port::AppendF(&result, "ASCII | Hex\n");
+  port::AppendF(&result, "------+----\n");
+  for (char c : name) {
+    if (absl::ascii_isprint(c)) {
+      port::AppendF(&result, "%c     | %x\n", c, c);
+    } else {
+      port::AppendF(&result, "      | %x   Not ASCII printable!\n", c);
+    }
+  }
+  return result;
+}
+
+void CheckNonAsciiIOArrays(const ModelFlags& model_flags) {
+  if (model_flags.allow_nonascii_arrays()) {
+    return;
+  }
+  for (const auto& input_array : model_flags.input_arrays()) {
+    QCHECK(IsAsciiPrintable(input_array.name()))
+        << "Non-ASCII-printable character found in --input_arrays: "
+        << input_array.name()
+        << ". Pass --allow_nonascii_arrays to allow that. "
+        << "Here is a dump of the string:\n\n"
+        << DumpAscii(input_array.name());
+  }
+  for (const string& output_array : model_flags.output_arrays()) {
+    QCHECK(IsAsciiPrintable(output_array))
+        << "Non-ASCII-printable character found in --output_arrays: "
+        << output_array << ". Pass --allow_nonascii_arrays to allow that. "
+        << "Here is a dump of the string:\n\n"
+        << DumpAscii(output_array);
+  }
+}
+
+void CheckNonExistentIOArrays(const Model& model) {
+  if (model.flags.allow_nonexistent_arrays()) {
+    return;
+  }
   for (const auto& input_array : model.flags.input_arrays()) {
     CHECK(model.arrays.count(input_array.name()))
         << "Input array not found: " << input_array.name();
@@ -554,9 +634,24 @@ void CheckNoMissingArray(const Model& model) {
         << "Output array not found: " << output_array;
   }
   for (const auto& rnn_state : model.flags.rnn_states()) {
-    CHECK(model.arrays.count(rnn_state.state_array()));
-    CHECK(model.arrays.count(rnn_state.back_edge_source_array()));
+    if (!rnn_state.discardable()) {
+      CHECK(model.arrays.count(rnn_state.state_array()));
+      CHECK(model.arrays.count(rnn_state.back_edge_source_array()));
+    }
+  }
+}
+}  // namespace
+
+void CheckNoMissingArray(const Model& model) {
+  for (const auto& op : model.operators) {
+    for (const auto& input : op->inputs) {
+      CHECK(model.arrays.count(input));
+    }
+    for (const auto& output : op->outputs) {
+      CHECK(model.arrays.count(output));
+    }
   }
+  CheckNonExistentIOArrays(model);
 }
 
 void FixNoMissingArray(Model* model) {
@@ -572,17 +667,23 @@ void FixNoMissingArray(Model* model) {
       }
     }
   }
-  for (const string& output_array : model->flags.output_arrays()) {
-    if (!model->arrays.count(output_array)) {
+  if (model->flags.allow_nonexistent_arrays()) {
+    for (const string& output_array : model->flags.output_arrays()) {
       model->GetOrCreateArray(output_array);
     }
+    for (const auto& rnn_state : model->flags.rnn_states()) {
+      model->GetOrCreateArray(rnn_state.state_array());
+      model->GetOrCreateArray(rnn_state.back_edge_source_array());
+    }
   }
 }
 
 void CheckNoOrphanedArray(const Model& model) {
   std::unordered_set<string> arrays_without_known_use;
   for (const auto& array : model.arrays) {
-    arrays_without_known_use.insert(array.first);
+    if (IsDiscardableArray(model, array.first)) {
+      arrays_without_known_use.insert(array.first);
+    }
   }
   for (const auto& op : model.operators) {
     for (const auto& input : op->inputs) {
@@ -592,6 +693,10 @@ void CheckNoOrphanedArray(const Model& model) {
       arrays_without_known_use.erase(output);
     }
   }
+  for (const auto& rnn_state : model.flags.rnn_states()) {
+    arrays_without_known_use.erase(rnn_state.state_array());
+    arrays_without_known_use.erase(rnn_state.back_edge_source_array());
+  }
   if (!arrays_without_known_use.empty()) {
     for (const auto& array : arrays_without_known_use) {
       LOG(INFO) << "Error: Orphaned array: " << array;
@@ -613,8 +718,14 @@ void FixNoOrphanedArray(Model* model) {
       arrays_without_known_use.erase(output);
     }
   }
+  for (const auto& rnn_state : model->flags.rnn_states()) {
+    arrays_without_known_use.erase(rnn_state.state_array());
+    arrays_without_known_use.erase(rnn_state.back_edge_source_array());
+  }
   for (const auto& array : arrays_without_known_use) {
-    model->arrays.erase(array);
+    if (IsDiscardableArray(*model, array)) {
+      model->arrays.erase(array);
+    }
   }
 }
 
@@ -772,48 +883,13 @@ void FixOperatorOrdering(Model* model) {
       << "the above code should have generated a FATAL error already!";
 }
 
-// Checks that the --input_arrays of the Model are actually used by at least
-// one of the --output_arrays i.e. that the graph contains a path from each one
-// of the inputs to at least one of the outputs. This catches cases where the
-// user passed the wrong --input_arrays or --output_arrays, which otherwise may
-// result in cryptic error messages.
-void CheckInputUsedByOutputs(const Model& model) {
-  std::set<string> used_arrays;
-  for (const string& output : model.flags.output_arrays()) {
-    used_arrays.insert(output);
-  }
-  for (int i = model.operators.size() - 1; i >= 0; i--) {
-    bool is_op_used = false;
-    for (const string& op_output : model.operators[i]->outputs) {
-      if (used_arrays.count(op_output)) {
-        is_op_used = true;
-        break;
-      }
-    }
-    if (!is_op_used) {
-      continue;
-    }
-    for (const string& op_input : model.operators[i]->inputs) {
-      used_arrays.insert(op_input);
-    }
-  }
-  for (const auto& input_array : model.flags.input_arrays()) {
-    QCHECK(used_arrays.count(input_array.name()))
-        << "The graph does not connect the input (" << input_array.name()
-        << ") specified by --input_arrays to any of the specified "
-        << "--output_arrays ("
-        << absl::StrJoin(model.flags.output_arrays(), ", ")
-        << "). Did you pass the wrong flags for this model, "
-        << "or is that model's graph actually incomplete?";
-  }
-}
-
 void CheckInvariants(const Model& model) {
+  CheckInputArraysAreNotOutputArrays(model.flags);
+  CheckNonAsciiIOArrays(model.flags);
   CheckNoMissingArray(model);
   CheckNoOrphanedArray(model);
   CheckArrayFieldsConsistent(model);
   CheckOperatorOrdering(model);
-  CheckInputUsedByOutputs(model);
 }
 
 void CheckCountInRange(const ::toco::ModelFlags::ModelCheck& model_check,
@@ -891,9 +967,9 @@ void CreateOrCheckRnnStateArray(const string& name, int size, Model* model) {
     // Pick 'num_dims' and 'batch' from the first input_arrays, unless we find
     // a better match by name.
     if (input_array.name() == name || num_dims == -1) {
-      num_dims = input_array.shape_size();
-      if (num_dims != 0) {
-        batch = input_array.shape(0);
+      num_dims = input_array.shape().dims_size();
+      if (num_dims > 0) {
+        batch = input_array.shape().dims(0);
       }
     }
   }
@@ -962,35 +1038,39 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
     RESOLVE_MODEL_FLAG(mean_value);
 #undef RESOLVE_MODEL_FLAG
 
-    if (!specified_input_array.shape().empty()) {
-      if (!dst_input_array->shape().empty()) {
-        QCHECK_EQ(specified_input_array.shape().size(),
-                  dst_input_array->shape().size())
+    if (specified_input_array.has_shape()) {
+      if (dst_input_array->has_shape()) {
+        QCHECK_EQ(specified_input_array.shape().dims_size(),
+                  dst_input_array->shape().dims_size())
             << "For input array '" << specified_input_array.name() << "', "
             << "size of specified input shape flag with size: "
-            << specified_input_array.shape().size()
+            << specified_input_array.shape().dims_size()
             << " does not agree with already defined input shape"
                " of this model, with size: "
-            << dst_input_array->shape().size();
+            << dst_input_array->shape().dims_size();
         // We treat the first dimension as a special case, since it is often
         // a batch size and the input_shape flag is effectively overriding
         // the model.
-        for (int i = 1; i < specified_input_array.shape().size(); i++) {
-          QCHECK_EQ(specified_input_array.shape().Get(i),
-                    dst_input_array->shape().Get(i))
+        for (int i = 1; i < specified_input_array.shape().dims_size(); i++) {
+          QCHECK_EQ(specified_input_array.shape().dims(i),
+                    dst_input_array->shape().dims(i))
               << "At dimension number " << i << " of input array "
               << specified_input_array.name() << ", the specified shape's "
               << "dimension flag with dimension: "
-              << specified_input_array.shape().Get(i)
+              << specified_input_array.shape().dims(i)
               << " does not agree with already defined shape"
               << " of this model, with dimension: "
-              << dst_input_array->shape().Get(i);
+              << dst_input_array->shape().dims(i);
         }
       } else {
-        dst_input_array->mutable_shape()->CopyFrom(
-            specified_input_array.shape());
+        *dst_input_array->mutable_shape() = specified_input_array.shape();
       }
     }
+
+    if (specified_input_array.has_data_type()) {
+      QCHECK(!dst_input_array->has_data_type());
+      dst_input_array->set_data_type(specified_input_array.data_type());
+    }
   }
 
   if (model_flags.output_arrays_size() > 0) {
@@ -1011,41 +1091,37 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
   }
 
   RESOLVE_MODEL_FLAG(variable_batch)
-  RESOLVE_MODEL_FLAG(drop_control_dependency)
 
 #undef RESOLVE_MODEL_FLAG
 
-  if (model->flags.rnn_states_size() == 0) {
+  if (!model_flags.rnn_states().empty()) {
     model->flags.mutable_rnn_states()->CopyFrom(model_flags.rnn_states());
-  } else {
-    CHECK_EQ(model->flags.rnn_states_size(), model_flags.rnn_states_size());
-    for (int i = 0; i < model->flags.rnn_states_size(); i++) {
-      CHECK_EQ(model->flags.rnn_states(i).state_array(),
-               model_flags.rnn_states(i).state_array());
-      CHECK_EQ(model->flags.rnn_states(i).back_edge_source_array(),
-               model_flags.rnn_states(i).back_edge_source_array());
-    }
   }
 
   if (model->flags.model_checks_size() == 0) {
     model->flags.mutable_model_checks()->CopyFrom(model_flags.model_checks());
   }
 
-  QCHECK_GT(model->flags.input_arrays_size(), 0)
-      << "This model does not define input arrays, so a "
-         "--input_arrays flag must be given on the command-line.";
   QCHECK_GT(model->flags.output_arrays_size(), 0)
       << "This model does not define output arrays, so a "
          "--output_arrays flag must be given on the command-line.";
 
   for (const auto& input_array_proto : model->flags.input_arrays()) {
-    QCHECK(!input_array_proto.shape().empty())
-        << "This model does not have shape defined for input array "
-        << input_array_proto.name()
-        << ", so one must be specified by a non-empty --input_shape "
-           "command-line flag.";
-
     auto& input_array = model->GetOrCreateArray(input_array_proto.name());
+    if (input_array_proto.has_data_type()) {
+      const ArrayDataType specified_type =
+          ConvertIODataTypeToArrayDataType(input_array_proto.data_type());
+      QCHECK(specified_type != ArrayDataType::kNone);
+      if (input_array.data_type != ArrayDataType::kNone) {
+        QCHECK(specified_type == input_array.data_type)
+            << "For input array " << input_array_proto.name()
+            << " the specified input data type "
+            << IODataType_Name(input_array_proto.data_type())
+            << " conflicts with the existing type.";
+      }
+      input_array.data_type = specified_type;
+    }
+
     if (input_array.data_type == ArrayDataType::kNone) {
       // We start out with a float input array;
       // that may get replaced by a uint8 array later, by
@@ -1055,16 +1131,25 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
 
     // Compare/merge the model->flags describing the input_shape with
     // the actual input array's shape.
-    auto& input_array_dims = *input_array.mutable_shape()->mutable_dims();
-    if (input_array_dims.empty()) {
-      for (auto dim : input_array_proto.shape()) {
-        CHECK_GE(dim, 1);
-        input_array_dims.push_back(dim);
+    if (!input_array.has_shape()) {
+      if (input_array_proto.has_shape()) {
+        auto& input_array_dims = *input_array.mutable_shape()->mutable_dims();
+        for (auto dim : input_array_proto.shape().dims()) {
+          CHECK_GE(dim, 1);
+          input_array_dims.push_back(dim);
+        }
       }
     } else {
-      CHECK_EQ(input_array_dims.size(), input_array_proto.shape_size());
-      for (int i = 0; i < input_array_dims.size(); i++) {
-        CHECK_EQ(input_array_dims[i], input_array_proto.shape(i));
+      if (input_array_proto.has_shape()) {
+        // If an input shape was specified on the flags ensure that it matches
+        // the actual shape in the model.
+        const auto& input_array_dims =
+            *input_array.mutable_shape()->mutable_dims();
+        CHECK_EQ(input_array_dims.size(),
+                 input_array_proto.shape().dims_size());
+        for (int i = 0; i < input_array_dims.size(); i++) {
+          CHECK_EQ(input_array_dims[i], input_array_proto.shape().dims(i));
+        }
       }
     }
 
@@ -1093,6 +1178,16 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
     CreateOrCheckRnnStateArray(rnn_state.state_array(), rnn_state.size(),
                                model);
   }
+
+  for (const auto& input_array : model->flags.input_arrays()) {
+    if (input_array.has_shape()) {
+      CHECK(input_array.shape().dims_size());
+    }
+  }
+
+  model->flags.set_allow_nonascii_arrays(model_flags.allow_nonascii_arrays());
+  model->flags.set_allow_nonexistent_arrays(
+      model_flags.allow_nonexistent_arrays());
 }
 
 void CheckIsReadyForQuantization(const Model& model) {
@@ -1156,6 +1251,8 @@ int ElementSize(ArrayDataType data_type) {
       return 4;
     case ArrayDataType::kUint8:
       return 1;
+    case ArrayDataType::kInt64:
+      return 8;
     default:
       LOG(FATAL) << "Should not get here.";
       return 0;
@@ -1530,11 +1627,13 @@ bool IsDiscardableArray(const Model& model, const string& array_name) {
     }
   }
   for (const auto& rnn_state : model.flags.rnn_states()) {
-    if (array_name == rnn_state.state_array()) {
-      return false;
-    }
-    if (array_name == rnn_state.back_edge_source_array()) {
-      return false;
+    if (!rnn_state.discardable()) {
+      if (array_name == rnn_state.state_array()) {
+        return false;
+      }
+      if (array_name == rnn_state.back_edge_source_array()) {
+        return false;
+      }
     }
   }
   return true;
@@ -1544,9 +1643,28 @@ void CheckFinalDataTypesSatisfied(const Model& model) {
   for (const auto& array_entry : model.arrays) {
     const auto& array = *array_entry.second;
     if (array.final_data_type != ArrayDataType::kNone) {
-      CHECK(array.final_data_type == array.data_type);
+      CHECK(array.final_data_type == array.data_type)
+          << "Array \"" << array_entry.first
+          << "\" has mis-matching actual and final data types ("
+          << static_cast<int>(array.data_type) << ","
+          << static_cast<int>(array.final_data_type) << ").";
     }
   }
 }
 
+ArrayDataType ConvertIODataTypeToArrayDataType(IODataType type) {
+  switch (type) {
+    case FLOAT:
+      return ArrayDataType::kFloat;
+    case QUANTIZED_UINT8:
+      return ArrayDataType::kUint8;
+    case INT32:
+      return ArrayDataType::kInt32;
+    case INT64:
+      return ArrayDataType::kInt64;
+    default:
+      return ArrayDataType::kNone;
+  }
+}
+
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index e863996d7b685e4a8741553cba90afe98568ea08..d820d619d0de425407e88076082a3e0f8d4783a9 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -288,6 +288,8 @@ bool IsDiscardableArray(const Model& model, const string& array_name);
 
 void CheckFinalDataTypesSatisfied(const Model& model);
 
+ArrayDataType ConvertIODataTypeToArrayDataType(IODataType type);
+
 }  // namespace toco
 
 #endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOOLING_UTIL_H_
diff --git a/tensorflow/contrib/lite/tools/BUILD b/tensorflow/contrib/lite/tools/BUILD
index 21b32d8434204ca625ba0c5d3f371ee8061b77d7..751682215bce37a8e4b8befe70b5288617053b54 100644
--- a/tensorflow/contrib/lite/tools/BUILD
+++ b/tensorflow/contrib/lite/tools/BUILD
@@ -13,6 +13,7 @@ tf_cc_binary(
         "//tensorflow/contrib/lite/tools:gen_op_registration",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
diff --git a/tensorflow/contrib/lite/tools/benchmark_model.cc b/tensorflow/contrib/lite/tools/benchmark_model.cc
index f80949b23e417d074e070a28608688d8863765b5..6ae3ab57294a92162b15f326630ac202a9ba2a82 100644
--- a/tensorflow/contrib/lite/tools/benchmark_model.cc
+++ b/tensorflow/contrib/lite/tools/benchmark_model.cc
@@ -31,7 +31,12 @@ void RegisterSelectedOps(::tflite::MutableOpResolver* resolver);
 #endif
 
 #define LOG(x) std::cerr
-#define CHECK(x) if (!(x)) { LOG(ERROR) << #x << "failed"; exit(1); }
+
+#define CHECK(x)                  \
+  if (!(x)) {                     \
+    LOG(ERROR) << #x << "failed"; \
+    exit(1);                      \
+  }
 
 namespace tensorflow {
 namespace benchmark_tflite_model {
diff --git a/tensorflow/contrib/lite/tools/gen_op_registration_main.cc b/tensorflow/contrib/lite/tools/gen_op_registration_main.cc
index 1b28b8bcd97125a67bdf8eecb2c61a999a72425d..17b514c9169817479e18eecf5799ea4371f3b051 100644
--- a/tensorflow/contrib/lite/tools/gen_op_registration_main.cc
+++ b/tensorflow/contrib/lite/tools/gen_op_registration_main.cc
@@ -13,30 +13,50 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <cassert>
 #include <fstream>
+#include <map>
 #include <sstream>
 #include <string>
 #include <vector>
 
+#include "absl/strings/strip.h"
 #include "tensorflow/contrib/lite/tools/gen_op_registration.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/util/command_line_flags.h"
 
+const char kInputModelFlag[] = "input_model";
+const char kOutputRegistrationFlag[] = "output_registration";
+const char kTfLitePathFlag[] = "tflite_path";
+
 using tensorflow::Flag;
 using tensorflow::Flags;
 using tensorflow::string;
 
+void ParseFlagAndInit(int argc, char** argv, string* input_model,
+                      string* output_registration, string* tflite_path) {
+  std::vector<tensorflow::Flag> flag_list = {
+      Flag(kInputModelFlag, input_model, "path to the tflite model"),
+      Flag(kOutputRegistrationFlag, output_registration,
+           "filename for generated registration code"),
+      Flag(kTfLitePathFlag, tflite_path, "Path to tensorflow lite dir"),
+  };
+
+  Flags::Parse(&argc, argv, flag_list);
+  tensorflow::port::InitMain(argv[0], &argc, &argv);
+}
+
 namespace {
 
-void GenerateFileContent(const string& filename,
+void GenerateFileContent(const std::string& tflite_path,
+                         const std::string& filename,
                          const std::vector<string>& builtin_ops,
                          const std::vector<string>& custom_ops) {
   std::ofstream fout(filename);
 
-  fout << "#include "
-          "\"third_party/tensorflow/contrib/lite/model.h\"\n";
-  fout << "#include "
-          "\"third_party/tensorflow/contrib/lite/tools/mutable_op_resolver.h\"\n";
+  fout << "#include \"" << tflite_path << "/model.h\"\n";
+  fout << "#include \"" << tflite_path << "/tools/mutable_op_resolver.h\"\n";
+
   fout << "namespace tflite {\n";
   fout << "namespace ops {\n";
   if (!builtin_ops.empty()) {
@@ -78,22 +98,20 @@ void GenerateFileContent(const string& filename,
 int main(int argc, char** argv) {
   string input_model;
   string output_registration;
-  std::vector<tensorflow::Flag> flag_list = {
-      Flag("input_model", &input_model, "path to the tflite model"),
-      Flag("output_registration", &output_registration,
-           "filename for generated registration code"),
-  };
-  Flags::Parse(&argc, argv, flag_list);
+  string tflite_path;
+  ParseFlagAndInit(argc, argv, &input_model, &output_registration,
+                   &tflite_path);
 
-  tensorflow::port::InitMain(argv[0], &argc, &argv);
   std::vector<string> builtin_ops;
   std::vector<string> custom_ops;
-
   std::ifstream fin(input_model);
   std::stringstream content;
   content << fin.rdbuf();
-  const ::tflite::Model* model = ::tflite::GetModel(content.str().data());
+  // Need to store content data first, otherwise, it won't work in bazel.
+  string content_str = content.str();
+  const ::tflite::Model* model = ::tflite::GetModel(content_str.data());
   ::tflite::ReadOpsFromModel(model, &builtin_ops, &custom_ops);
-  GenerateFileContent(output_registration, builtin_ops, custom_ops);
+  GenerateFileContent(tflite_path, output_registration, builtin_ops,
+                      custom_ops);
   return 0;
 }
diff --git a/tensorflow/contrib/lite/tools/mutable_op_resolver.h b/tensorflow/contrib/lite/tools/mutable_op_resolver.h
index 8206a5481d7c43a9c8fb8445d056dbc7f022cfcc..906553da570720a0c4b90bbd2eebb6d8bdea6bb8 100644
--- a/tensorflow/contrib/lite/tools/mutable_op_resolver.h
+++ b/tensorflow/contrib/lite/tools/mutable_op_resolver.h
@@ -20,15 +20,14 @@ limitations under the License.
 #include "tensorflow/contrib/lite/model.h"
 
 // Needed to resolve unordered_set hash on older compilers.
-namespace std
-{
-template<>
-  struct hash<tflite::BuiltinOperator> {
-    size_t operator()(const tflite::BuiltinOperator &op) const {
-      return std::hash<int>()(op);
-    }
-  };
-}
+namespace std {
+template <>
+struct hash<tflite::BuiltinOperator> {
+  size_t operator()(const tflite::BuiltinOperator& op) const {
+    return std::hash<int>()(op);
+  }
+};
+}  // namespace std
 
 namespace tflite {
 
@@ -47,7 +46,7 @@ class MutableOpResolver : public OpResolver {
   void AddCustom(const char* name, TfLiteRegistration* registration);
 
  private:
-  std::map<tflite::BuiltinOperator, TfLiteRegistration*> builtins_;
+  std::map<int, TfLiteRegistration*> builtins_;
   std::map<std::string, TfLiteRegistration*> custom_ops_;
 };
 
diff --git a/tensorflow/contrib/lookup/BUILD b/tensorflow/contrib/lookup/BUILD
index b7b5418fe91e496f021b44fc32a33d2a549782e5..8ca03f4193f260ce32f942ccaf76a8260b282156 100644
--- a/tensorflow/contrib/lookup/BUILD
+++ b/tensorflow/contrib/lookup/BUILD
@@ -7,7 +7,7 @@ exports_files(["LICENSE"])
 
 package(default_visibility = ["//tensorflow:internal"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
 # TODO(yleon): Refactor after one we switching to the V2 kernels.
 py_library(
@@ -26,13 +26,14 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "lookup_ops_test",
     size = "small",
     srcs = ["lookup_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":lookup_py",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
@@ -43,9 +44,8 @@ py_test(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
     ],
+    grpc_enabled = True,
 )
 
 filegroup(
diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index e2e6c055912ccc1bfad70e88d65308225964822a..ee84b5b4c8a9e41fe07b4e9dfdc93e31f807d35d 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -300,7 +300,7 @@ ifeq ($(TARGET),ANDROID)
 	ifeq ($(ANDROID_ARCH),x86_64)
 		TOOLCHAIN := x86_64-4.9
 		SYSROOT_ARCH := x86_64
-		BIN_PREFIX := x86-64-linux-android
+		BIN_PREFIX := x86_64-linux-android
 		MARCH_OPTION :=
 	endif
     
diff --git a/tensorflow/contrib/makefile/README.md b/tensorflow/contrib/makefile/README.md
index 65bd60c12a9f511b729352c99cb0bf6cd1c74507..9345303ff11462a447ed6299b0ac3cba558ea68b 100644
--- a/tensorflow/contrib/makefile/README.md
+++ b/tensorflow/contrib/makefile/README.md
@@ -16,17 +16,17 @@ This static library will not contain:
 
  - Python or other language bindings
  - GPU support
- 
+
 You can target:
 - iOS
 - OS X (macOS)
 - Android
 - Raspberry-PI
- 
+
 You will compile tensorflow and protobuf libraries that you can link into other
 applications.  You will also compile the [benchmark](../../tools/benchmark/)
 application that will let you check your application.
- 
+
 ## Before you start (all platforms)
 
 First, clone this TensorFlow repository.
@@ -58,9 +58,9 @@ You should then be able to run the `build_all_linux.sh` script to compile:
 tensorflow/contrib/makefile/build_all_linux.sh
 ```
 
-This should compile a static library in 
-`tensorflow/contrib/makefile/gen/lib/libtensorflow-core.a`, 
-and create an example executable at `tensorflow/contrib/makefile/gen/bin/benchmark`. 
+This should compile a static library in
+`tensorflow/contrib/makefile/gen/lib/libtensorflow-core.a`,
+and create an example executable at `tensorflow/contrib/makefile/gen/bin/benchmark`.
 
 Get the graph file, if you have not already:
 
@@ -201,7 +201,7 @@ library in a simple app.
 ### Building by hand
 
 This section covers each step of building.  For all the code in one place, see
-[build_all_ios.sh](build_all_ios.sh). 
+[build_all_ios.sh](build_all_ios.sh).
 
 If you have not already, you will need to download dependencies:
 
@@ -232,7 +232,7 @@ make -f tensorflow/contrib/makefile/Makefile \
 
 This creates a library in
 `tensorflow/contrib/makefile/gen/lib/libtensorflow-core.a` that you can link any
-xcode project against. 
+xcode project against.
 
 To see TensorFlow running on iOS, the example Xcode project in
 [tensorflow/examples/ios](../../examples/ios/) shows how to use the static
@@ -258,15 +258,15 @@ tensorflow/contrib/makefile/compile_ios_tensorflow.sh -f "-O3" -h tensorflow/con
 
 In XCode, you will need to use -force_load in the linker flags
 section of the build settings to pull in the global constructors that are used
-to register ops and kernels. 
+to register ops and kernels.
 
 #### Optimization
- 
+
 The `compile_ios_tensorflow.sh` script can take optional command-line arguments.
 The first argument will be passed as a C++ optimization flag and defaults to
 debug mode. If you are concerned about performance or are working on a release
 build, you would likely want a higher optimization setting, like so:
- 
+
 ```bash
 compile_ios_tensorflow.sh -f "-Os"
 ```
@@ -330,7 +330,7 @@ what you need for your desired system.
 ## Dependency Management
 
 The Makefile loads in a list of dependencies stored in text files. These files
-are generated from the main Bazel build by running 
+are generated from the main Bazel build by running
 `tensorflow/contrib/makefile/gen_file_lists.sh`. You'll need to re-run this i
 you make changes to the files that are included in the build.
 
@@ -361,10 +361,10 @@ codebase can sometimes break the makefile build process. If you find that tests
 relying on this makefile are failing with a change you're involved in, here are
 some trouble-shooting steps:
 
- - Try to reproduce the issue on your platform. If you're on Linux, running 
+ - Try to reproduce the issue on your platform. If you're on Linux, running
  `make -f tensorflow/contrib/makefile/Makefile` should be enough to recreate
   most issues. For other platforms, see the sections earlier in this document.
-  
+
  - The most common cause of breakages are files that have been added to the
   Bazel build scripts, but that the makefile isn't aware of. Typical symptoms
   of this include linker errors mentioning missing symbols or protobuf headers
@@ -377,11 +377,11 @@ some trouble-shooting steps:
   `tensorflow/core/BUILD`, so if you change the wildcards there to include new
   files you'll need to also update `CORE_CC_ALL_SRCS` and `CORE_CC_EXCLUDE_SRCS`
   in the makefile.
-  
+
  - Some of the supported platforms use clang instead of gcc as their compiler,
   so if you're hitting compile errors you may need to tweak your code to be more
   friendly to different compilers by avoiding gcc extensions or idioms.
-  
+
 These are the most common reasons for makefile breakages, but it's also
 possible you may hit something unusual, like a platform incompatibility. For
 those, you'll need to see if you can reproduce the issue on that particular
diff --git a/tensorflow/contrib/makefile/compile_ios_protobuf.sh b/tensorflow/contrib/makefile/compile_ios_protobuf.sh
index 43e5809dd255d8eeb39e9a113a52808d6414dc8f..8fa20213633414d134d6c6a50e151cce2ac8a368 100755
--- a/tensorflow/contrib/makefile/compile_ios_protobuf.sh
+++ b/tensorflow/contrib/makefile/compile_ios_protobuf.sh
@@ -270,7 +270,7 @@ case "$1" in
         echo "Unknown ARCH"
         exit 1
         ;;
-esac 
+esac
 }
 
 for build_element in "${build_targets[@]}"
diff --git a/tensorflow/contrib/makefile/compile_nsync.sh b/tensorflow/contrib/makefile/compile_nsync.sh
index 930e6b8dea723aad91e3fdae10cf3b58cdd0fa46..7927997678f077a716d81749561068f259d9744f 100755
--- a/tensorflow/contrib/makefile/compile_nsync.sh
+++ b/tensorflow/contrib/makefile/compile_nsync.sh
@@ -28,7 +28,7 @@ usage="usage: $prog [-t linux|ios|android|macos|native]
         [-a architecture] [-v android_api_version]
 
 A script to build nsync for tensorflow.
-This script can be run on Linux or MacOS host platforms, and can target 
+This script can be run on Linux or MacOS host platforms, and can target
 Linux, MacOS, iOS, or Android.
 
 Options:
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index a2b444d53aeb5738786483a451bcc529686a92fd..b61044130897cf0dddc37e460b4e1618c3a7e2e9 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -19,13 +19,20 @@ set -e
 DOWNLOADS_DIR=tensorflow/contrib/makefile/downloads
 BZL_FILE_PATH=tensorflow/workspace.bzl
 
-EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
+# Ensure it is being run from repo root
+if [ ! -f $BZL_FILE_PATH ]; then
+  echo "Could not find ${BZL_FILE_PATH}":
+  echo "Likely you are not running this from the root directory of the repository.";
+  exit 1;
+fi
+
+EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
 GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
 GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
 NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
 PROTOBUF_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
 RE2_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
-FFT2D_URL="$(grep -o 'http.*fft\.tgz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
+FFT2D_URL="$(grep -o 'http.*fft\.tgz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
 ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)"
 
 # TODO(petewarden): Some new code in Eigen triggers a clang bug with iOS arm64,
diff --git a/tensorflow/contrib/makefile/rename_protobuf.sh b/tensorflow/contrib/makefile/rename_protobuf.sh
index b3bff2d5032fd817f34546b1a0663f255d8d3f77..8d52c1a1694b79f24e6a85a7757df1c35c9a99b5 100755
--- a/tensorflow/contrib/makefile/rename_protobuf.sh
+++ b/tensorflow/contrib/makefile/rename_protobuf.sh
@@ -38,7 +38,7 @@
 #
 # Note that this script modifies the source code in-place, so once it's been run
 # it's no longer suitable for further manual modifications, since the difference
-# with the top of tree will already be large. 
+# with the top of tree will already be large.
 
 mv tensorflow/contrib/makefile/downloads/protobuf/src/google/protobuf \
  tensorflow/contrib/makefile/downloads/protobuf//src/google/protobuf3
@@ -71,7 +71,7 @@ sed -i '' 's%::google::protobuf;%google::protobuf3;%' \
 
 # Fix up a couple of special build scripts that look for particular files.
 sed -i '' 's%src/google/protobuf/message.cc%src/google/protobuf3/message.cc%' \
- tensorflow/contrib/makefile/downloads/protobuf/configure.ac 
+ tensorflow/contrib/makefile/downloads/protobuf/configure.ac
 sed -i '' 's%src/google/protobuf/stubs/common.h%src/google/protobuf3/stubs/common.h%' \
  tensorflow/contrib/makefile/downloads/protobuf/autogen.sh
 
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index fbcda0421e38a48b090f58ae30dffac95a7d7614..5f275663986f9d480659880ab601eeb5c41037be 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -68,6 +68,8 @@ tensorflow/core/kernels/scatter_nd_op_cpu_impl_2.cc
 tensorflow/core/kernels/scatter_nd_op_cpu_impl_3.cc
 tensorflow/core/kernels/scatter_nd_op_cpu_impl_4.cc
 tensorflow/core/kernels/scatter_nd_op_cpu_impl_5.cc
+tensorflow/core/kernels/scatter_nd_op_cpu_impl_6.cc
+tensorflow/core/kernels/scatter_nd_op_cpu_impl_7.cc
 tensorflow/core/kernels/scatter_nd_op.cc
 tensorflow/core/kernels/save_restore_tensor.cc
 tensorflow/core/kernels/save_restore_v2_ops.cc
@@ -78,6 +80,7 @@ tensorflow/core/kernels/reverse_op.cc
 tensorflow/core/kernels/restore_op.cc
 tensorflow/core/kernels/resize_nearest_neighbor_op.cc
 tensorflow/core/kernels/resize_bilinear_op.cc
+tensorflow/core/kernels/reshape_util.cc
 tensorflow/core/kernels/reshape_op.cc
 tensorflow/core/kernels/relu_op.cc
 tensorflow/core/kernels/reduction_ops_sum.cc
@@ -131,6 +134,8 @@ tensorflow/core/kernels/gather_nd_op_cpu_impl_2.cc
 tensorflow/core/kernels/gather_nd_op_cpu_impl_3.cc
 tensorflow/core/kernels/gather_nd_op_cpu_impl_4.cc
 tensorflow/core/kernels/gather_nd_op_cpu_impl_5.cc
+tensorflow/core/kernels/gather_nd_op_cpu_impl_6.cc
+tensorflow/core/kernels/gather_nd_op_cpu_impl_7.cc
 tensorflow/core/kernels/fused_batch_norm_op.cc
 tensorflow/core/kernels/function_ops.cc
 tensorflow/core/kernels/fill_functor.cc
@@ -143,6 +148,7 @@ tensorflow/core/kernels/dynamic_stitch_op.cc
 tensorflow/core/kernels/dynamic_partition_op.cc
 tensorflow/core/kernels/decode_bmp_op.cc
 tensorflow/core/kernels/depthtospace_op.cc
+tensorflow/core/kernels/data_format_ops.cc
 tensorflow/core/kernels/spacetodepth_op.cc
 tensorflow/core/kernels/dense_update_ops.cc
 tensorflow/core/kernels/deep_conv2d.cc
@@ -284,3 +290,4 @@ tensorflow/core/kernels/spacetobatch_op.cc
 tensorflow/core/kernels/batchtospace_op.cc
 tensorflow/core/kernels/warn_about_ints.cc
 tensorflow/core/kernels/segment_reduction_ops.cc
+tensorflow/core/kernels/batch_util.cc
diff --git a/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc b/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc
index 7e2e96e160167ae68d3bdabacbbbeb45df61778f..39c0d5af45b4a81fa4dde0b5deac14a3af372cbb 100644
--- a/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc
+++ b/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc
@@ -59,7 +59,7 @@ REGISTER_KERNEL_BUILDER(Name("BytesInUse").Device(DEVICE_GPU).HostMemory("out"),
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(
-    Name("BytesInUse").Device(DEVICE_SYCL).HostMemory("out"), MaxBytesInUseOp);
+    Name("BytesInUse").Device(DEVICE_SYCL).HostMemory("out"), BytesInUseOp);
 #endif  // TENSORFLOW_USE_SYCL
 
 // Op that measures the total memory (in bytes) of a device.
diff --git a/tensorflow/contrib/memory_stats/python/kernel_tests/memory_stats_ops_test.py b/tensorflow/contrib/memory_stats/python/kernel_tests/memory_stats_ops_test.py
index d1b430b8039fcf7e10bcb842c3f34b960b9026b3..02c2ac06fb7dc0c930deaaa4c21a6971d96f19a1 100644
--- a/tensorflow/contrib/memory_stats/python/kernel_tests/memory_stats_ops_test.py
+++ b/tensorflow/contrib/memory_stats/python/kernel_tests/memory_stats_ops_test.py
@@ -77,8 +77,9 @@ class MemoryStatsOpsTest(test_util.TensorFlowTestCase):
         bytes_in_use_op = memory_stats_ops.BytesInUse()
       with ops.control_dependencies([bytes_in_use_op]):
         b = random_ops.random_uniform(matrix_shape, dtype=dtype)
+        c = math_ops.matmul(a, b)
 
-      _, bytes_in_use, max_bytes_in_use = sess.run([a, bytes_in_use_op,
+      _, bytes_in_use, max_bytes_in_use = sess.run([c, bytes_in_use_op,
                                                     max_bytes_in_use_op])
 
       # intermediate result allocates 1 matrix, max usage is at least 2
diff --git a/tensorflow/contrib/metrics/README.md b/tensorflow/contrib/metrics/README.md
index 247ebac5bb6eabbd87ca9d5dc1a18fa9dbe95aca..e0f2d74fa3270e68acadda026a28e9e5c71e0671 100644
--- a/tensorflow/contrib/metrics/README.md
+++ b/tensorflow/contrib/metrics/README.md
@@ -4,7 +4,7 @@
 
 Metrics are used in evaluation to assess the quality of a model. Most are
 "streaming" ops, meaning they create variables to accumulate a running total,
-and return an update tensor to update these variables, and a value tensor to 
+and return an update tensor to update these variables, and a value tensor to
 read the accumulated value. Example:
 
 value, update_op = metrics.streaming_mean_squared_error(
diff --git a/tensorflow/contrib/metrics/__init__.py b/tensorflow/contrib/metrics/__init__.py
index 8eed45c4b38873e02237aaf7193242497af6a101..27dad5379a2e56b91960a1f2274610e4f2568dbc 100644
--- a/tensorflow/contrib/metrics/__init__.py
+++ b/tensorflow/contrib/metrics/__init__.py
@@ -67,6 +67,7 @@ See the @{$python/contrib.metrics} guide.
 @@set_size
 @@set_union
 @@count
+@@precision_recall_at_equal_thresholds
 @@recall_at_precision
 
 """
@@ -82,6 +83,7 @@ from tensorflow.contrib.metrics.python.ops.histogram_ops import auc_using_histog
 from tensorflow.contrib.metrics.python.ops.metric_ops import aggregate_metric_map
 from tensorflow.contrib.metrics.python.ops.metric_ops import aggregate_metrics
 from tensorflow.contrib.metrics.python.ops.metric_ops import count
+from tensorflow.contrib.metrics.python.ops.metric_ops import precision_recall_at_equal_thresholds
 from tensorflow.contrib.metrics.python.ops.metric_ops import recall_at_precision
 from tensorflow.contrib.metrics.python.ops.metric_ops import sparse_recall_at_top_k
 from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_accuracy
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 6e2190cb7af974e5e1fc70e1741e81cf040f5fb2..2f2798563481cc0c53360944f967e6b31991057d 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -927,7 +927,7 @@ def streaming_curve_points(labels=None,
       tuple.
 
   TODO(chizeng): Consider rewriting this method to make use of logic within the
-  streaming_precision_recall_at_equal_thresholds method (to improve run time).
+  precision_recall_at_equal_thresholds method (to improve run time).
   """
   with variable_scope.variable_scope(name, 'curve_points',
                                      (labels, predictions, weights)):
@@ -1196,12 +1196,12 @@ def streaming_dynamic_auc(labels,
       return auc, update_op
 
 
-def streaming_precision_recall_at_equal_thresholds(predictions,
-                                                   labels,
-                                                   num_thresholds=None,
-                                                   weights=None,
-                                                   name=None,
-                                                   use_locking=None):
+def precision_recall_at_equal_thresholds(labels,
+                                         predictions,
+                                         weights=None,
+                                         num_thresholds=None,
+                                         use_locking=None,
+                                         name=None):
   """A helper method for creating metrics related to precision-recall curves.
 
   These values are true positives, false negatives, true negatives, false
@@ -1222,20 +1222,20 @@ def streaming_precision_recall_at_equal_thresholds(predictions,
   reweight certain values, or more commonly used for masking values.
 
   Args:
+    labels: A bool `Tensor` whose shape matches `predictions`.
     predictions: A floating point `Tensor` of arbitrary shape and whose values
       are in the range `[0, 1]`.
-    labels: A bool `Tensor` whose shape matches `predictions`.
+    weights: Optional; If provided, a `Tensor` that has the same dtype as,
+      and broadcastable to, `predictions`. This tensor is multplied by counts.
     num_thresholds: Optional; Number of thresholds, evenly distributed in
       `[0, 1]`. Should be `>= 2`. Defaults to 201. Note that the number of bins
       is 1 less than `num_thresholds`. Using an even `num_thresholds` value
       instead of an odd one may yield unfriendly edges for bins.
-    weights: Optional; If provided, a `Tensor` that has the same dtype as,
-      and broadcastable to, `predictions`. This tensor is multplied by counts.
-    name: Optional; variable_scope name. If not provided, the string
-      'precision_recall_at_equal_threshold' is used.
     use_locking: Optional; If True, the op will be protected by a lock.
       Otherwise, the behavior is undefined, but may exhibit less contention.
       Defaults to True.
+    name: Optional; variable_scope name. If not provided, the string
+      'precision_recall_at_equal_threshold' is used.
 
   Returns:
     result: A named tuple (See PrecisionRecallData within the implementation of
@@ -2268,7 +2268,7 @@ def recall_at_precision(labels,
     thresholds = [0.0 - _EPSILON] + thresholds + [1.0 + _EPSILON]
 
     values, update_ops = _streaming_confusion_matrix_at_thresholds(
-        labels, predictions, thresholds, weights)
+        predictions, labels, thresholds, weights)
 
     recall = _compute_recall_at_precision(values['tp'], values['fp'],
                                           values['fn'], precision, 'value')
@@ -3301,6 +3301,7 @@ __all__ = [
     'aggregate_metric_map',
     'aggregate_metrics',
     'count',
+    'precision_recall_at_equal_thresholds',
     'recall_at_precision',
     'sparse_recall_at_top_k',
     'streaming_accuracy',
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
index 5d0463e1f74832e3ed4c2cd3c5ee4aeded4f8aa9..f05ae394e6b46809f9f3f963733076f1a3933059 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
@@ -2218,11 +2218,11 @@ class StreamingPrecisionRecallAtEqualThresholdsTest(test.TestCase):
       if weights:
         weights_tensor = constant_op.constant(weights, dtype=dtypes_lib.float32)
       gotten_result, update_op = (
-          metric_ops.streaming_precision_recall_at_equal_thresholds(
-              predictions=predictions_tensor,
+          metric_ops.precision_recall_at_equal_thresholds(
               labels=labels_tensor,
-              num_thresholds=3,
-              weights=weights_tensor))
+              predictions=predictions_tensor,
+              weights=weights_tensor,
+              num_thresholds=3))
 
       sess.run(variables.local_variables_initializer())
       sess.run(update_op)
@@ -2230,17 +2230,17 @@ class StreamingPrecisionRecallAtEqualThresholdsTest(test.TestCase):
       self._testResultsEqual(expected_result, gotten_result)
 
   def testVars(self):
-    metric_ops.streaming_precision_recall_at_equal_thresholds(
-        predictions=constant_op.constant([0.42], dtype=dtypes_lib.float32),
-        labels=constant_op.constant([True], dtype=dtypes_lib.bool))
+    metric_ops.precision_recall_at_equal_thresholds(
+        labels=constant_op.constant([True], dtype=dtypes_lib.bool),
+        predictions=constant_op.constant([0.42], dtype=dtypes_lib.float32))
     _assert_metric_variables(
         self, ('precision_recall_at_equal_thresholds/variables/tp_buckets:0',
                'precision_recall_at_equal_thresholds/variables/fp_buckets:0'))
 
   def testVarsWithName(self):
-    metric_ops.streaming_precision_recall_at_equal_thresholds(
-        predictions=constant_op.constant([0.42], dtype=dtypes_lib.float32),
+    metric_ops.precision_recall_at_equal_thresholds(
         labels=constant_op.constant([True], dtype=dtypes_lib.bool),
+        predictions=constant_op.constant([0.42], dtype=dtypes_lib.float32),
         name='foo')
     _assert_metric_variables(
         self, ('foo/variables/tp_buckets:0', 'foo/variables/fp_buckets:0'))
@@ -2251,9 +2251,8 @@ class StreamingPrecisionRecallAtEqualThresholdsTest(test.TestCase):
     labels = constant_op.constant(
         np.random.uniform(size=(10, 3)) > 0.5, dtype=dtypes_lib.bool)
 
-    result, update_op = (
-        metric_ops.streaming_precision_recall_at_equal_thresholds(
-            predictions=predictions, labels=labels))
+    result, update_op = metric_ops.precision_recall_at_equal_thresholds(
+        labels=labels, predictions=predictions)
 
     with self.test_session() as sess:
       # Run several updates.
@@ -3163,7 +3162,7 @@ class RecallAtPrecisionTest(test.TestCase):
     labels = random_ops.random_uniform(
         (10, 3), maxval=2, dtype=dtypes_lib.int64, seed=2)
     recall, update_op = metrics.recall_at_precision(
-        predictions, labels, precision=0.7)
+        labels, predictions, precision=0.7)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -3183,7 +3182,7 @@ class RecallAtPrecisionTest(test.TestCase):
     predictions = constant_op.constant(inputs, dtype=dtypes_lib.float32)
     labels = constant_op.constant(inputs)
     recall, update_op = metrics.recall_at_precision(
-        predictions, labels, precision=1.0)
+        labels, predictions, precision=1.0)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -3198,7 +3197,7 @@ class RecallAtPrecisionTest(test.TestCase):
         predictions_values, dtype=dtypes_lib.float32)
     labels = constant_op.constant(labels_values)
     recall, update_op = metrics.recall_at_precision(
-        predictions, labels, precision=0.8)
+        labels, predictions, precision=0.8)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -3213,7 +3212,7 @@ class RecallAtPrecisionTest(test.TestCase):
         predictions_values, dtype=dtypes_lib.float32)
     labels = constant_op.constant(labels_values)
     recall, update_op = metrics.recall_at_precision(
-        predictions, labels, precision=0.4)
+        labels, predictions, precision=0.4)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -3231,7 +3230,7 @@ class RecallAtPrecisionTest(test.TestCase):
     labels = constant_op.constant(labels_values)
     weights = constant_op.constant(weights_values)
     recall, update_op = metrics.recall_at_precision(
-        predictions, labels, weights=weights, precision=0.4)
+        labels, predictions, weights=weights, precision=0.4)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
diff --git a/tensorflow/contrib/model_pruning/README.md b/tensorflow/contrib/model_pruning/README.md
index 764e126e0d64d5e6c6caf0a9f0d43a87995447eb..d286750c257e9a78a82c95c1fc872b3ca6972203 100644
--- a/tensorflow/contrib/model_pruning/README.md
+++ b/tensorflow/contrib/model_pruning/README.md
@@ -42,10 +42,13 @@ The pruning library allows for specification of the following hyper parameters:
 | name | string | model_pruning | Name of the pruning specification. Used for adding summaries and ops under a common tensorflow name_scope |
 | begin_pruning_step | integer | 0 | The global step at which to begin pruning |
 | end_pruning_step   | integer | -1 | The global step at which to terminate pruning. Defaults to -1 implying that pruning continues till  the training stops |
-| do_not_prune | list of strings | [""] | list of layers strings that are not pruned |
+| do_not_prune | list of strings | [""] | list of layers names that are not pruned |
 | threshold_decay | float | 0.9 | The decay factor to use for exponential decay of the thresholds |
 | pruning_frequency | integer | 10 | How often should the masks be updated? (in # of global_steps) |
 | nbins | integer | 255 | Number of bins to use for histogram computation |
+| block_height|integer | 1 | Number of rows in a block for block sparse matrices|
+| block_width |integer | 1 | Number of cols in a block for block sparse matrices|
+| block_pooling_function| string | AVG | The function to use to pool weight values in a block: average (AVG) or max (MAX)|
 | initial_sparsity | float | 0.0 | Initial sparsity value |
 | target_sparsity | float | 0.5 | Target sparsity value |
 | sparsity_function_begin_step | integer | 0 | The global step at this which the gradual sparsity function begins to take effect |
@@ -128,3 +131,12 @@ Eval:
 ```shell
 $ bazel-bin/$examples_dir/cifar10/cifar10_eval --run_once
 ```
+
+### Block Sparsity
+
+For some hardware architectures, it may be beneficial to induce spatially correlated sparsity. To train models in which the weight tensors have block sparse structure, set *block_height* and *block_width* hyperparameters to the desired block configuration (2x2, 4x4, 4x1, 1x8, etc). Currently, block sparsity is supported for weight tensors with rank 2 only. The matrix is partitioned into non-overlapping blocks of size *[block_height, block_dim]* and the either the average or max absolute value in this block is taken as a proxy for the entire block (set by *block_pooling_function* hyperparameter).
+The convolution layer tensors are always pruned used block dimensions of [1,1].
+
+## References
+
+Michael Zhu and Suyog Gupta, “To prune, or not to prune: exploring the efficacy of pruning for model compression”, *2017 NIPS Workshop on Machine Learning of Phones and other Consumer Devices* (https://arxiv.org/pdf/1710.01878.pdf)
diff --git a/tensorflow/contrib/model_pruning/python/layers/core_layers.py b/tensorflow/contrib/model_pruning/python/layers/core_layers.py
index ae60d8b1e189335ec93e2b8e50edcf8b41bc6725..95dfd8f4213a8729f5954eb0626f28ecc9265bbb 100644
--- a/tensorflow/contrib/model_pruning/python/layers/core_layers.py
+++ b/tensorflow/contrib/model_pruning/python/layers/core_layers.py
@@ -72,8 +72,8 @@ class _MaskedConv(base.Layer):
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, no bias will
-      be applied.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
     kernel_regularizer: Optional regularizer for the convolution kernel.
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Regularizer function for the output.
@@ -279,8 +279,8 @@ class MaskedConv2D(_MaskedConv):
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, no bias will
-      be applied.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
     kernel_regularizer: Optional regularizer for the convolution kernel.
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Regularizer function for the output.
diff --git a/tensorflow/contrib/model_pruning/python/pruning.py b/tensorflow/contrib/model_pruning/python/pruning.py
index 42d91a71fde41d8681d7a0c439d6c49325730418..d16af9da19816211ee22f6ea48a347f0b9a4e612 100644
--- a/tensorflow/contrib/model_pruning/python/pruning.py
+++ b/tensorflow/contrib/model_pruning/python/pruning.py
@@ -72,8 +72,10 @@ from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_impl
+from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary
 from tensorflow.python.training import training_util
@@ -129,6 +131,23 @@ def _weight_threshold_variable(var, scope):
     return threshold
 
 
+def _kronecker_product(mat1, mat2):
+  """Computes the Kronecker product of two matrices mat1 and mat2.
+
+  Args:
+    mat1: A matrix of size m x n
+    mat2: A matrix of size p x q
+  Returns:
+    Kronecker product of matrices mat1 and mat2 of size mp x nq
+  """
+
+  m1, n1 = mat1.get_shape().as_list()
+  mat1_rsh = array_ops.reshape(mat1, [m1, 1, n1, 1])
+  m2, n2 = mat2.get_shape().as_list()
+  mat2_rsh = array_ops.reshape(mat2, [1, m2, 1, n2])
+  return array_ops.reshape(mat1_rsh * mat2_rsh, [m1 * m2, n1 * n2])
+
+
 def _histogram(values, value_range, nbins=100, dtype=np.int32, name=None):
   """Return histogram of values.
 
@@ -297,6 +316,13 @@ def get_pruning_hparams():
       How often should the masks be updated? (in # of global_steps)
     nbins: integer
       number of bins to use for histogram computation
+    block_height: integer
+      number of rows in a block (defaults to 1)
+    block_width: integer
+      number of cols in a block (defaults to 1)
+    block_pooling_function: string
+      Whether to perform average (AVG) or max (MAX) pooling in the block
+      (default: AVG)
     initial_sparsity: float
       initial sparsity value
     target_sparsity: float
@@ -332,6 +358,9 @@ def get_pruning_hparams():
       threshold_decay=0.9,
       pruning_frequency=10,
       nbins=255,
+      block_height=1,
+      block_width=1,
+      block_pooling_function='AVG',
       initial_sparsity=0,
       target_sparsity=0.5,
       sparsity_function_begin_step=0,
@@ -341,11 +370,7 @@ def get_pruning_hparams():
 
 class Pruning(object):
 
-  def __init__(self,
-               spec=None,
-               global_step=None,
-               sparsity=None,
-               partitioner=None):
+  def __init__(self, spec=None, global_step=None, sparsity=None):
     """Set up the specification for model pruning.
 
     If a spec is provided, the sparsity is set up based on the sparsity_function
@@ -358,8 +383,6 @@ class Pruning(object):
       global_step: A tensorflow variable that is used while setting up the
         sparsity function
       sparsity: A tensorflow scalar variable storing the sparsity
-      partitioner: The tensorflow partitioner function used to distribute
-        parameters across shards
     """
     # Pruning specification
     self._spec = spec if spec else get_pruning_hparams()
@@ -373,9 +396,6 @@ class Pruning(object):
     # Built using self._setup_sparsity() or provided externally
     self._sparsity = sparsity if sparsity else self._setup_sparsity()
 
-    # Stores the partitioner function uses to partition variables across tasks/
-    self._partitioner = partitioner
-
     # List of tensorflow assignments ops for new masks and thresholds
     self._assign_ops = []
 
@@ -383,6 +403,12 @@ class Pruning(object):
     # were updated
     self._last_update_step = self._setup_last_update_step()
 
+    # Block dimensions
+    self._block_dim = [self._spec.block_height, self._spec.block_width]
+
+    # Block pooling function
+    self._block_pooling_function = self._spec.block_pooling_function
+
   def _setup_global_step(self, global_step):
     graph_global_step = global_step
     if graph_global_step is None:
@@ -457,9 +483,10 @@ class Pruning(object):
 
     Returns:
       new_threshold: The new value of the threshold based on weights, and
-        desired_sparsity
-      new_mask: A n-D numpy array containing 0 or 1 to indicate which of the
-        values in weights falls below the threshold
+        sparsity at the current global_step
+      new_mask: A numpy array of the same size and shape as weights containing
+        0 or 1 to indicate which of the values in weights falls below
+        the threshold
 
     Raises:
       ValueError: if sparsity is not defined
@@ -492,6 +519,63 @@ class Pruning(object):
           math_ops.greater(abs_weights, smoothed_threshold), np.float32)
     return smoothed_threshold, new_mask
 
+  def _maybe_update_block_mask(self, weights, threshold):
+    """Performs block-granular masking of the weights.
+
+    Block pruning occurs only if the block_height or block_width is > 1 and
+    if the weight tensor has ndims = 2. Otherwise, elementwise pruning occurs.
+    Args:
+      weights: The weight tensor that needs to be masked.
+      threshold: The current threshold value. The function will compute a new
+        threshold and return the exponential moving average using the current
+        value of threshold
+
+    Returns:
+      new_threshold: The new value of the threshold based on weights, and
+        sparsity at the current global_step
+      new_mask: A numpy array of the same size and shape as weights containing
+        0 or 1 to indicate which of the values in weights falls below
+        the threshold
+
+    Raises:
+      ValueError: if block pooling function is not AVG or MAX
+    """
+    if weights.get_shape().ndims != 2 or self._block_dim == [1, 1]:
+      return self._update_mask(weights, threshold)
+
+    if self._block_pooling_function not in ['AVG', 'MAX']:
+      raise ValueError('Unknown pooling function for block sparsity: %s' %
+                       self._block_pooling_function)
+
+    with ops.name_scope(weights.op.name + '_pruning_ops'):
+      abs_weights = math_ops.abs(
+          array_ops.reshape(
+              weights, [1, weights.get_shape()[0],
+                        weights.get_shape()[1], 1]))
+      pool_window = [self._block_dim[0], self._block_dim[1]]
+      pooled_weights = nn_ops.pool(
+          abs_weights,
+          window_shape=pool_window,
+          pooling_type=self._block_pooling_function,
+          strides=pool_window,
+          padding='SAME',
+          name=weights.op.name + '_pooled')
+
+      smoothed_threshold, new_mask = self._update_mask(pooled_weights,
+                                                       threshold)
+
+      reshaped_mask = array_ops.reshape(
+          new_mask,
+          [pooled_weights.get_shape()[1],
+           pooled_weights.get_shape()[2]])
+      updated_mask = _kronecker_product(reshaped_mask,
+                                        array_ops.ones(self._block_dim))
+      sliced_mask = array_ops.slice(
+          updated_mask, [0, 0],
+          [weights.get_shape()[0],
+           weights.get_shape()[1]])
+    return smoothed_threshold, sliced_mask
+
   def _get_mask_assign_ops(self):
     # Make sure the assignment ops have not already been added to the list
     if self._assign_ops:
@@ -509,18 +593,21 @@ class Pruning(object):
 
     for index, mask in enumerate(masks):
       threshold = thresholds[index]
-      weight = weights[index] if self._partitioner is None else weights[
-          index].as_tensor()
+      weight = weights[index]
+      is_partitioned = isinstance(weight, variables.PartitionedVariable)
+      if is_partitioned:
+        weight = weight.as_tensor()
 
       if self._spec.do_not_prune:
         if self._exists_in_do_not_prune_list(mask.name):
           continue
 
-      new_threshold, new_mask = self._update_mask(weight, threshold)
+      new_threshold, new_mask = self._maybe_update_block_mask(weight, threshold)
       self._assign_ops.append(_variable_assign(threshold, new_threshold))
+
       self._assign_ops.append(
-          _variable_assign(mask, new_mask) if self._partitioner is None else
-          _partitioned_variable_assign(mask, new_mask))
+          _partitioned_variable_assign(mask, new_mask)
+          if is_partitioned else _variable_assign(mask, new_mask))
 
   def mask_update_op(self):
     with ops.name_scope(self._spec.name):
diff --git a/tensorflow/contrib/model_pruning/python/pruning_test.py b/tensorflow/contrib/model_pruning/python/pruning_test.py
index c23fd649ce1fc72a2e8d516bfa3750b7ced1b111..1767b4bb94a9bb56bc6a4933423ad27d8cf3ed35 100644
--- a/tensorflow/contrib/model_pruning/python/pruning_test.py
+++ b/tensorflow/contrib/model_pruning/python/pruning_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.model_pruning.python import pruning
+from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import random_ops
@@ -111,6 +112,39 @@ class PruningTest(test.TestCase):
       masked_weights_val = masked_weights.eval()
       self.assertAllEqual(np.count_nonzero(masked_weights_val), 51)
 
+  def _blockMasking(self, hparams, weights, expected_mask):
+
+    threshold = variables.Variable(0.0, name="threshold")
+    sparsity = variables.Variable(0.51, name="sparsity")
+    test_spec = ",".join(hparams)
+    pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)
+
+    # Set up pruning
+    p = pruning.Pruning(pruning_hparams, sparsity=sparsity)
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      _, new_mask = p._maybe_update_block_mask(weights, threshold)
+      # Check if the mask is the same size as the weights
+      self.assertAllEqual(new_mask.get_shape(), weights.get_shape())
+      mask_val = new_mask.eval()
+      self.assertAllEqual(mask_val, expected_mask)
+
+  def testBlockMasking(self):
+    param_list = ["block_height=2", "block_width=2", "threshold_decay=0"]
+
+    weights_avg = constant_op.constant(
+        [[0.1, 0.1, 0.2, 0.2], [0.1, 0.1, 0.2, 0.2], [0.3, 0.3, 0.4, 0.4],
+         [0.3, 0.3, 0.4, 0.4]])
+    weights_max = constant_op.constant(
+        [[0.1, 0.0, 0.2, 0.0], [0.0, -0.1, 0.0, -0.2], [0.3, 0.0, 0.4, 0.0],
+         [0.0, -0.3, 0.0, -0.4]])
+    expected_mask = [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]]
+
+    self._blockMasking(param_list + ["block_pooling_function=MAX"], weights_max,
+                       expected_mask)
+    self._blockMasking(param_list + ["block_pooling_function=AVG"],
+                       weights_avg, expected_mask)
+
   def testPartitionedVariableMasking(self):
     partitioner = partitioned_variables.variable_axis_size_partitioner(40)
     with self.test_session() as session:
@@ -120,7 +154,7 @@ class PruningTest(test.TestCase):
             "weights", initializer=math_ops.linspace(1.0, 100.0, 100))
         masked_weights = pruning.apply_mask(
             weights, scope=variable_scope.get_variable_scope())
-      p = pruning.Pruning(sparsity=sparsity, partitioner=partitioner)
+      p = pruning.Pruning(sparsity=sparsity)
       p._spec.threshold_decay = 0.0
       mask_update_op = p.mask_update_op()
       variables.global_variables_initializer().run()
diff --git a/tensorflow/contrib/mpi/README.md b/tensorflow/contrib/mpi/README.md
index b0d03d05a26312273ae65415547d498ca866638c..75cb8230483a7648e771904c7087e2848929d2b4 100644
--- a/tensorflow/contrib/mpi/README.md
+++ b/tensorflow/contrib/mpi/README.md
@@ -23,7 +23,7 @@ The following environment variables can be set to modify the behavior at runtime
 
 **MPI_DISABLED=[0,1]**
 
-This environment variable allows you to disable the MPI path before launch (e.g. for performance or correctness testing). 
+This environment variable allows you to disable the MPI path before launch (e.g. for performance or correctness testing).
 
 **MPI_OPTIMAL_PATH=[0,1]**
 
@@ -34,10 +34,10 @@ This path is disabled by default as it requires that the MPI library can directl
 
 ## Known problems
 
-For certain complex neural nets the implementation sometimes crashes inside the MPI libraries. This seems to be related to memory allocations/routines that register the memory for the Infiniband transfers. (The crashes do not happen when all MPI processes are within the same physical machine). 
+For certain complex neural nets the implementation sometimes crashes inside the MPI libraries. This seems to be related to memory allocations/routines that register the memory for the Infiniband transfers. (The crashes do not happen when all MPI processes are within the same physical machine).
 
 **MVAPICH**
-- The problem manifests itself with a segmentation fault inside a memory copy routine and during startup you will get the following warning: "WARNING: Error in initializing MVAPICH2 ptmalloc library. Continuing without InfiniBand registration cache support." 
+- The problem manifests itself with a segmentation fault inside a memory copy routine and during startup you will get the following warning: "WARNING: Error in initializing MVAPICH2 ptmalloc library. Continuing without InfiniBand registration cache support."
 
 **OpenMPI**
 - With OpenMPI corrupt data will be received resulting in an assertion or the MPI library will print an error and exit. The error is "Attempt to free memory that is still in use by an ongoing MPI communication.  MPI job will now abort."
@@ -58,11 +58,11 @@ Once a request has arrived from a remote process the request is forwarded to the
 * Receive tensor request
 The MPI thread will check if there are any incoming tensor request messages on the communication lines using MPI_Iprobe. Once a request has been received it will be passed on to the standard TensorFlow code and eventually will be placed on the sendQueue.
 
-* Receive tensor 
+* Receive tensor
 At some point after a request has been sent the remote process will transmit the tensor. This tensor will be received and we look-up the callback that is associated with this tensor in our request table and execute the callback on the received data.
 
 
-In the implementation all send operations are non-blocking, all probe operations are non-blocking and all receive-operations are blocking. The receive-operations are only executed after the probe has determined that there is something to receive. 
+In the implementation all send operations are non-blocking, all probe operations are non-blocking and all receive-operations are blocking. The receive-operations are only executed after the probe has determined that there is something to receive.
 The MPI processes identify each other using an MPI process ID. The TensorFlow gRPC processes identify each other using a name. During launch we create a mapping between the TensorFlow process name and the MPI process ID to allow the processes to communicate with the correct destinations when using MPI operations.
 
 
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.cc b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
index 1eb1481675e08ffc6c952fe4811785ac94f6b0b4..913935b38246f1c5c0f7da4c1ea1f986bc00891b 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
@@ -258,9 +258,37 @@ NcclManager::Communicator* NcclManager::GetCommunicator(
     devices[i] = collective->participants[i]->gpu_device_id;
   }
 
+  int device_count = num_devices;
+#if NCCL_MAJOR >= 2
+  // NCCL2 prevents InitAll for more communicators than devices (but doesn't
+  // check that device ids are unique). Work around it by initializing each
+  // rank individually.
+  cudaGetDeviceCount(&device_count);
+#endif
   std::vector<ncclComm_t> nccl_comms(num_devices);
-  auto result = ncclCommInitAll(nccl_comms.data(), num_devices, devices.data());
-  CHECK_EQ(result, ncclSuccess) << ncclGetErrorString(result);
+  if (num_devices <= device_count) {
+    auto result =
+        ncclCommInitAll(nccl_comms.data(), num_devices, devices.data());
+    CHECK_EQ(result, ncclSuccess) << ncclGetErrorString(result);
+  } else {
+    int savedDevice = 0;
+    CHECK_EQ(cudaGetDevice(&savedDevice), cudaSuccess);
+    ncclUniqueId commId;
+    ncclGetUniqueId(&commId);
+#if NCCL_MAJOR >= 2
+    CHECK_EQ(ncclGroupStart(), ncclSuccess);
+#endif
+    for (int rank = 0; rank < num_devices; ++rank) {
+      cudaSetDevice(devices[rank]);
+      auto result =
+          ncclCommInitRank(nccl_comms.data() + rank, num_devices, commId, rank);
+      CHECK_EQ(result, ncclSuccess) << ncclGetErrorString(result);
+    }
+#if NCCL_MAJOR >= 2
+    CHECK_EQ(ncclGroupEnd(), ncclSuccess);
+#endif
+    cudaSetDevice(savedDevice);
+  }
   for (int rank = 0; rank < num_devices; ++rank) {
     members[rank].nccl_comm = nccl_comms[rank];
   }
@@ -370,7 +398,7 @@ void NcclManager::AddParticipant(int num_devices, const string& key,
 }
 
 void NcclManager::RunCollective(const string& key, Collective* collective) {
-  static mutex collective_mu;
+  static mutex collective_mu(LINKER_INITIALIZED);
 
   auto* communicator = GetCommunicator(collective);
   collective->communicator = communicator;
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc b/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
index 505c4b0d71028c64b5075cff7ea010597b4263b3..658baf18d3c706d3b7fbba1ec9d02a1f0cda638e 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
@@ -30,6 +30,8 @@ namespace tensorflow {
 static std::vector<BaseGPUDevice*> GetGPUDevices() {
   std::vector<Device*> devices;
   SessionOptions session_options;
+  session_options.config.mutable_gpu_options()
+      ->set_per_process_gpu_memory_fraction(0.1);
   session_options.env = Env::Default();
   Status s = DeviceFactory::GetFactory(DEVICE_GPU)
                  ->AddDevices(session_options, "", &devices);
@@ -173,7 +175,7 @@ class NcclManagerTest : public ::testing::Test {
       auto out_gpu_mem = AsDeviceMemory(out_gpu.flat<float>().data());
       stream->ThenMemcpy(out_cpu.flat<float>().data(), out_gpu_mem,
                          out_cpu.TotalBytes());
-      stream->BlockHostUntilDone();
+      SE_ASSERT_OK(stream->BlockHostUntilDone());
       test::ExpectTensorEqual<float>(test_case->expected, out_cpu);
     }
   }
@@ -234,7 +236,7 @@ TEST_F(NcclManagerTest, MultipleCallers) {
     for (int i = 0; i < num_ranks; ++i) {
       auto* device = devices->at(i % devices->size());
       auto* stream = device->tensorflow_gpu_device_info()->stream;
-      stream->BlockHostUntilDone();
+      SE_ASSERT_OK(stream->BlockHostUntilDone());
     }
 
     std::random_shuffle(case_and_device_num.begin(), case_and_device_num.end());
diff --git a/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc b/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc
index 62ee6630ac613c80a56d4e854cf7af4ae19f6faa..2b412fac9a621f01bd21c6b4391da3c462dd78b3 100644
--- a/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc
+++ b/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc
@@ -45,16 +45,16 @@ class HyperplaneLSHProbesOp : public OpKernel {
     const Tensor& products_tensor = context->input(0);
     OP_REQUIRES(context, products_tensor.dims() == 2,
                 InvalidArgument("Need a two-dimensional products tensor, got ",
-                                products_tensor.dims(), " dimensions."))
+                                products_tensor.dims(), " dimensions."));
 
     const Tensor& num_tables_tensor = context->input(1);
     OP_REQUIRES(context, num_tables_tensor.dims() == 0,
                 InvalidArgument("Need a scalar num_tables tensor, got ",
-                                num_tables_tensor.dims(), " dimensions."))
+                                num_tables_tensor.dims(), " dimensions."));
     int num_tables = num_tables_tensor.scalar<int32>()();
     OP_REQUIRES(context, num_tables >= 1,
                 InvalidArgument("num_tables must be at least 1 but got ",
-                                num_tables, "."))
+                                num_tables, "."));
     OP_REQUIRES(context, num_tables <= 1000,
                 InvalidArgument("Need num_tables <= 1000, got ", num_tables,
                                 ". This is mostly to protect against incorrect "
@@ -66,12 +66,13 @@ class HyperplaneLSHProbesOp : public OpKernel {
                 InvalidArgument("Need a scalar num_hyperplanes_per_table "
                                 "tensor, got ",
                                 num_hyperplanes_per_table_tensor.dims(),
-                                " dimensions."))
+                                " dimensions."));
     int num_hyperplanes_per_table =
         num_hyperplanes_per_table_tensor.scalar<int32>()();
     OP_REQUIRES(context, num_hyperplanes_per_table >= 1,
                 InvalidArgument("num_hyperplanes_per_table must be at least 1 "
-                                "but got ", num_hyperplanes_per_table, "."))
+                                "but got ",
+                                num_hyperplanes_per_table, "."));
     OP_REQUIRES(context, num_hyperplanes_per_table <= 30,
                 InvalidArgument("Need num_hyperplanes_per_table <= 30, got ",
                                 num_hyperplanes_per_table, ". "
@@ -81,10 +82,10 @@ class HyperplaneLSHProbesOp : public OpKernel {
     const Tensor& num_probes_tensor = context->input(3);
     OP_REQUIRES(context, num_probes_tensor.dims() == 0,
                 InvalidArgument("Need a scalar num_probes tensor, got ",
-                                num_probes_tensor.dims(), " dimensions."))
+                                num_probes_tensor.dims(), " dimensions."));
     int num_probes = num_probes_tensor.scalar<int32>()();
     OP_REQUIRES(context, num_probes >= 1,
-                InvalidArgument("num_probes must be at least 1."))
+                InvalidArgument("num_probes must be at least 1."));
 
     int expected_num_hyperplanes = num_tables * num_hyperplanes_per_table;
     OP_REQUIRES(
diff --git a/tensorflow/contrib/nn/__init__.py b/tensorflow/contrib/nn/__init__.py
index 0bc133a00e619930f1d5fe4c7a8996556b833ddf..96d60e149809aff6fcb7eff77edc23737db177e8 100644
--- a/tensorflow/contrib/nn/__init__.py
+++ b/tensorflow/contrib/nn/__init__.py
@@ -21,6 +21,7 @@
 @@deprecated_flipped_sigmoid_cross_entropy_with_logits
 @@nth_element
 @@rank_sampled_softmax_loss
+@@sampled_sparse_softmax_loss
 @@scaled_softplus
 """
 
diff --git a/tensorflow/contrib/nn/python/ops/sampling_ops.py b/tensorflow/contrib/nn/python/ops/sampling_ops.py
index 2ae529e0155f5ad9b40391c2f728c5c594e72dc9..63fc487dca69a4777821595a0366d0ae0b393ce2 100644
--- a/tensorflow/contrib/nn/python/ops/sampling_ops.py
+++ b/tensorflow/contrib/nn/python/ops/sampling_ops.py
@@ -24,6 +24,8 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_impl
+from tensorflow.python.ops import nn_ops
 
 
 def _rank_resample(weights, biases, inputs, sampled_values, num_resampled,
@@ -34,7 +36,7 @@ def _rank_resample(weights, biases, inputs, sampled_values, num_resampled,
 
       log(sum_j exp((w_i * x_j + b_i) / resampling_temperature))
 
-  where w_i, b_i are the weight and bias of the i-th class, repsectively,
+  where w_i, b_i are the weight and bias of the i-th class, respectively,
   and j ranges over the rows of `inputs`. For efficiency, we rearrange the
   computation to
 
@@ -240,3 +242,101 @@ def rank_sampled_softmax_loss(weights,
         remove_accidental_hits=remove_accidental_hits,
         partition_strategy=partition_strategy,
         name=name)
+
+
+def sampled_sparse_softmax_loss(weights,
+                                biases,
+                                labels,
+                                inputs,
+                                num_sampled,
+                                num_classes,
+                                sampled_values=None,
+                                remove_accidental_hits=True,
+                                partition_strategy="mod",
+                                name="sampled_sparse_softmax_loss"):
+  """Computes and returns the sampled sparse softmax training loss.
+
+  This is a faster way to train a softmax classifier over a huge number of
+  classes.
+
+  This operation is for training only.  It is generally an underestimate of
+  the full softmax loss.
+
+  A common use case is to use this method for training, and calculate the full
+  softmax loss for evaluation or inference. In this case, you must set
+  `partition_strategy="div"` for the two losses to be consistent, as in the
+  following example:
+
+  ```python
+  if mode == "train":
+    loss = tf.nn.sampled_sparse_softmax_loss(
+        weights=weights,
+        biases=biases,
+        labels=labels,
+        inputs=inputs,
+        ...,
+        partition_strategy="div")
+  elif mode == "eval":
+    logits = tf.matmul(inputs, tf.transpose(weights))
+    logits = tf.nn.bias_add(logits, biases)
+    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
+        labels=tf.squeeze(labels),
+        logits=logits)
+  ```
+
+  See our [Candidate Sampling Algorithms Reference]
+  (https://www.tensorflow.org/extras/candidate_sampling.pdf)
+
+  Also see Section 3 of [Jean et al., 2014](http://arxiv.org/abs/1412.2007)
+  ([pdf](http://arxiv.org/pdf/1412.2007.pdf)) for the math.
+
+  Args:
+    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
+        objects whose concatenation along dimension 0 has shape
+        [num_classes, dim].  The (possibly-sharded) class embeddings.
+    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
+    labels: A `Tensor` of type `int64` and shape `[batch_size, 1]`.
+        The index of the single target class for each row of logits.  Note that
+        this format differs from the `labels` argument of
+        `nn.sparse_softmax_cross_entropy_with_logits`.
+    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
+        activations of the input network.
+    num_sampled: An `int`.  The number of classes to randomly sample per batch.
+    num_classes: An `int`. The number of possible classes.
+    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
+        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
+        (if None, we default to `log_uniform_candidate_sampler`)
+    remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
+        where a sampled class equals one of the target classes.  Default is
+        True.
+    partition_strategy: A string specifying the partitioning strategy, relevant
+        if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
+        Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `batch_size` 1-D tensor of per-example sampled softmax losses.
+
+  """
+  logits, _ = nn_impl._compute_sampled_logits(
+      weights=weights,
+      biases=biases,
+      labels=labels,
+      inputs=inputs,
+      num_sampled=num_sampled,
+      num_classes=num_classes,
+      num_true=1,
+      sampled_values=sampled_values,
+      subtract_log_q=True,
+      remove_accidental_hits=remove_accidental_hits,
+      partition_strategy=partition_strategy,
+      name=name)
+
+  # There is only one true label. _compute_sampled_logits puts the true logit
+  # at index 0.
+  labels = array_ops.zeros([array_ops.shape(logits)[0], 1], dtype=dtypes.int64)
+
+  sampled_losses = nn_ops.sparse_softmax_cross_entropy_with_logits(
+      labels=array_ops.squeeze(labels), logits=logits)
+  # sampled_losses is a [batch_size] tensor.
+  return sampled_losses
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index 022e5ab06fa9d90b216149055a1ffe91e87f4137..9c961f2b9c828f7406516860b7e3fd3dc343d993 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -14,12 +14,16 @@ py_library(
     name = "opt_py",
     srcs = [
         "__init__.py",
+        "python/training/addsign.py",
         "python/training/drop_stale_gradient_optimizer.py",
+        "python/training/elastic_average_optimizer.py",
         "python/training/external_optimizer.py",
         "python/training/lazy_adam_optimizer.py",
         "python/training/moving_average_optimizer.py",
         "python/training/multitask_optimizer_wrapper.py",
         "python/training/nadam_optimizer.py",
+        "python/training/powersign.py",
+        "python/training/sign_decay.py",
         "python/training/variable_clipping_optimizer.py",
     ],
     srcs_version = "PY2AND3",
@@ -77,22 +81,22 @@ py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "variable_clipping_optimizer_test",
     srcs = ["python/training/variable_clipping_optimizer_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "manual",  # Flaky: b/29892493
-        "notap",  # data race due to b/62910646
-    ],
-    deps = [
+    additional_deps = [
         ":opt_py",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:session",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
-        "//third_party/py/numpy",
+    ],
+    grpc_enabled = True,
+    tags = [
+        "manual",  # Flaky: b/29892493
+        "notap",  # data race due to b/62910646
     ],
 )
 
@@ -165,11 +169,78 @@ tf_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
+    grpc_enabled = True,
     tags = [
         "no_oss",  # Flaky due to port collisions
     ],
 )
 
+tf_py_test(
+    name = "elastic_average_optimizer_test",
+    srcs = ["python/training/elastic_average_optimizer_test.py"],
+    additional_deps = [
+        ":opt_py",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:variables",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:training",
+        "//tensorflow/python:ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "sign_decay_test",
+    srcs = ["python/training/sign_decay_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":opt_py",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "addsign_test",
+    srcs = ["python/training/addsign_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":opt_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "powersign_test",
+    srcs = ["python/training/powersign_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":opt_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/opt/__init__.py b/tensorflow/contrib/opt/__init__.py
index af47e3937a0c5adc8854f055f82c92372b20bd56..ef20a132fb4e2d5203b5d080856afa32c3016bc0 100644
--- a/tensorflow/contrib/opt/__init__.py
+++ b/tensorflow/contrib/opt/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+    # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,24 +19,36 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=wildcard-import
+from tensorflow.contrib.opt.python.training.addsign import *
 from tensorflow.contrib.opt.python.training.drop_stale_gradient_optimizer import *
 from tensorflow.contrib.opt.python.training.external_optimizer import *
 from tensorflow.contrib.opt.python.training.lazy_adam_optimizer import *
-from tensorflow.contrib.opt.python.training.nadam_optimizer import *
 from tensorflow.contrib.opt.python.training.moving_average_optimizer import *
 from tensorflow.contrib.opt.python.training.multitask_optimizer_wrapper import *
+from tensorflow.contrib.opt.python.training.nadam_optimizer import *
+from tensorflow.contrib.opt.python.training.powersign import *
 from tensorflow.contrib.opt.python.training.variable_clipping_optimizer import *
+from tensorflow.contrib.opt.python.training.elastic_average_optimizer import *
 # pylint: enable=wildcard-import
 
 from tensorflow.python.util.all_util import remove_undocumented
 
 
 _allowed_symbols = [
+    'PowerSignOptimizer',
+    'AddSignOptimizer'
     'DelayCompensatedGradientDescentOptimizer',
-    'DropStaleGradientOptimizer', 'ExternalOptimizerInterface',
-    'LazyAdamOptimizer', 'NadamOptimizer', 'MovingAverageOptimizer',
-    'ScipyOptimizerInterface', 'VariableClippingOptimizer',
-    'MultitaskOptimizerWrapper', 'clip_gradients_by_global_norm',
+    'DropStaleGradientOptimizer',
+    'ExternalOptimizerInterface',
+    'LazyAdamOptimizer',
+    'NadamOptimizer',
+    'MovingAverageOptimizer',
+    'ScipyOptimizerInterface',
+    'VariableClippingOptimizer',
+    'MultitaskOptimizerWrapper',
+    'clip_gradients_by_global_norm',
+    'ElasticAverageOptimizer',
+    'ElasticAverageCustomGetter'
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/opt/python/training/addsign.py b/tensorflow/contrib/opt/python/training/addsign.py
new file mode 100644
index 0000000000000000000000000000000000000000..729e59cb0aab97e6cd657571647fc45a44ae0ab1
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/addsign.py
@@ -0,0 +1,169 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of AddSign."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import training_ops
+
+
+class AddSignOptimizer(optimizer.Optimizer):
+  """Optimizer that implements the AddSign update.
+
+  See  Neural Optimizer Search with Reinforcement Learning
+  [Bello et al., ICML2017].
+  """
+
+  def __init__(self,
+               learning_rate=0.1,
+               alpha=1.0,
+               beta=0.9,
+               sign_decay_fn=None,
+               use_locking=False,
+               name='AddSignOptimizer'):
+    """Constructs a new AddSignOptimizer object.
+
+    Initialization:
+
+    m_0 <- 0 (Initialize initial 1st moment vector)
+    t <- 0 (Initialize timestep)
+    ```
+
+    Update:
+
+    ```
+    t <- t + 1
+    m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+    sign_decay <- sign_decay(t)
+    update <- (alpha + sign_decay * sign(g) *sign(m)) * g
+    variable <- variable - lr_t * update
+    ```
+
+    Example for AddSign-ld (AddSign with linear sign decay)
+    ```
+    decay_steps = 1000
+    linear_decay_fn = sign_decays.get_linear_decay_fn(decay_steps)
+    opt = AddSignOptimizer(learning_rate=0.1, sign_decay_fn=linear_decay_fn)
+    ```
+
+    Args:
+      learning_rate: learning_rate used when taking a step.
+      alpha: alpha used in optimizer.
+      beta: decay used for computing the moving average m.
+      sign_decay_fn: decay function applied to the sign(g*m) quantity.
+          Takes global_step as an argument and returns the quantity to multiply
+          the sign(g*m) by.
+        compute (1.0 + alpha * decay * sign(g) * sign(m)) * m.
+      use_locking: If True use locks for update operations.
+      name: Optional name for the operations created when applying gradients.
+        Defaults to "AddSignOptimizer".
+    """
+    super(AddSignOptimizer, self).__init__(use_locking, name)
+    self._lr = learning_rate
+    self._alpha = alpha
+    self._beta = beta
+
+    self._sign_decay_fn = sign_decay_fn
+
+    # Tensor versions of the constructor arguments, created in _prepare().
+    self._lr_t = None
+    self._alpha_t = None
+    self._beta_t = None
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    if self._sign_decay_fn is not None:
+      self._sign_decay_t = ops.convert_to_tensor(
+          self._sign_decay_fn(global_step), name='sign_decay')
+    return super(AddSignOptimizer, self).apply_gradients(
+        grads_and_vars, global_step=global_step, name=name)
+
+  def _create_slots(self, var_list):
+    # Create slots for the first moment.
+    for v in var_list:
+      self._zeros_slot(v, 'm', self._name)
+
+  def _prepare(self):
+    self._lr_t = ops.convert_to_tensor(self._lr, name='learning_rate')
+    self._beta_t = ops.convert_to_tensor(self._beta, name='beta')
+    self._alpha_t = ops.convert_to_tensor(self._alpha, name='alpha')
+    if self._sign_decay_fn is None:
+      self._sign_decay_t = ops.convert_to_tensor(1.0, name='sign_decay')
+
+  def _apply_dense(self, grad, var):
+    m = self.get_slot(var, 'm')
+    return training_ops.apply_add_sign(
+        var,
+        m,
+        math_ops.cast(self._lr_t, var.dtype.base_dtype),
+        math_ops.cast(self._alpha_t, var.dtype.base_dtype),
+        math_ops.cast(self._sign_decay_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta_t, var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking).op
+
+  def _resource_apply_dense(self, grad, var):
+    m = self.get_slot(var, 'm')
+    return training_ops.resource_apply_add_sign(
+        var.handle,
+        m.handle,
+        math_ops.cast(self._lr_t, var.dtype.base_dtype),
+        math_ops.cast(self._alpha_t, var.dtype.base_dtype),
+        math_ops.cast(self._sign_decay_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta_t, var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _apply_sparse(self, grad, var):
+    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
+    alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype)
+    beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype)
+
+    m = self.get_slot(var, 'm')
+    m_t = state_ops.assign(
+        m, (m * beta_t) + (grad * (1 - beta_t)), use_locking=self._use_locking)
+
+    sign_g = ops.IndexedSlices(
+        math_ops.sign(grad.values), grad.indices, dense_shape=grad.dense_shape)
+    sign_gm = ops.IndexedSlices(
+        array_ops.gather(math_ops.sign(m_t), sign_g.indices) * sign_g.values,
+        sign_g.indices,
+        dense_shape=sign_g.dense_shape)
+
+    sign_decayed = math_ops.cast(
+        self._sign_decay_t, var.dtype.base_dtype)
+    multiplier_values = alpha_t + sign_decayed * sign_gm.values
+    multiplier = ops.IndexedSlices(
+        multiplier_values, sign_gm.indices, dense_shape=sign_gm.dense_shape)
+
+    final_update = ops.IndexedSlices(
+        lr_t * multiplier.values * grad.values,
+        multiplier.indices,
+        dense_shape=multiplier.dense_shape)
+
+    var_update = state_ops.scatter_sub(
+        var,
+        final_update.indices,
+        final_update.values,
+        use_locking=self._use_locking)
+
+    return control_flow_ops.group(* [var_update, m_t])
diff --git a/tensorflow/contrib/opt/python/training/addsign_test.py b/tensorflow/contrib/opt/python/training/addsign_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd19ee3e7ac514448c6d79272abb86a154f55e9a
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/addsign_test.py
@@ -0,0 +1,262 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for AddSign."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.opt.python.training import addsign
+from tensorflow.contrib.opt.python.training import sign_decay
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def py_linear_decay_fn(decay_steps):
+  def linear_decay(step):
+    step = min(step, decay_steps)
+    return float(decay_steps - step) / decay_steps
+  return linear_decay
+
+
+def addsign_update_numpy(params,
+                         g_t,
+                         m,
+                         lr,
+                         alpha=1.0,
+                         beta=0.9,
+                         py_sign_decay_fn=None,
+                         t=None):
+  m_t = beta * m + (1 - beta) * g_t
+  if py_sign_decay_fn is None:
+    sign_decayed = 1.0
+  else:
+    sign_decayed = py_sign_decay_fn(t-1)
+  multiplier = alpha + sign_decayed * np.sign(g_t) * np.sign(m_t)
+  params_t = params - lr * multiplier * g_t
+  return params_t, m_t
+
+
+class AddSignTest(test.TestCase):
+
+  def _testDense(self,
+                 use_resource=False,
+                 learning_rate=0.1,
+                 sign_decay_fn=None,
+                 py_sign_decay_fn=None,
+                 alpha=1.0,
+                 beta=0.9):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session(use_gpu=True):
+        # Initialize variables for numpy implementation.
+        m0, m1 = 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+          global_step = resource_variable_ops.ResourceVariable(
+              0, trainable=False)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+          global_step = variables.Variable(
+              0, trainable=False)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        opt = addsign.AddSignOptimizer(
+            learning_rate=learning_rate,
+            alpha=alpha,
+            beta=beta,
+            sign_decay_fn=sign_decay_fn,
+        )
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                     global_step=global_step)
+        neg_update = opt.apply_gradients(zip([-grads0, -grads1], [var0, var1]),
+                                         global_step=global_step)
+        if context.in_graph_mode():
+          self.evaluate(variables.global_variables_initializer())
+          # Fetch params to validate initial values
+          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+          self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        # Run 7 steps of AddSign
+        # first 4 steps with positive gradient
+        # last 3 steps with negative gradient (sign(gm) should be -1)
+        for t in range(1, 8):
+          if t < 5:
+            if context.in_graph_mode():
+              self.evaluate(update)
+            elif t > 1:
+              opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                  global_step=global_step)
+          else:
+            if context.in_graph_mode():
+              self.evaluate(neg_update)
+            elif t > 1:
+              opt.apply_gradients(zip([-grads0, -grads1], [var0, var1]),
+                                  global_step=global_step)
+
+          var0_np, m0 = addsign_update_numpy(
+              var0_np,
+              grads0_np if t < 5 else -grads0_np,
+              m0,
+              learning_rate,
+              alpha=alpha,
+              beta=beta,
+              py_sign_decay_fn=py_sign_decay_fn,
+              t=t,
+          )
+          var1_np, m1 = addsign_update_numpy(
+              var1_np,
+              grads1_np if t < 5 else -grads1_np,
+              m1,
+              learning_rate,
+              alpha=alpha,
+              beta=beta,
+              py_sign_decay_fn=py_sign_decay_fn,
+              t=t,
+          )
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  def testDense(self):
+    decay_steps = 10
+    sign_decay_fn = sign_decay.get_linear_decay_fn(decay_steps)
+    py_sign_decay_fn = py_linear_decay_fn(decay_steps)
+    self._testDense(use_resource=False)
+    self._testDense(use_resource=False, learning_rate=0.01, alpha=0.1, beta=0.8)
+    self._testDense(use_resource=False,
+                    sign_decay_fn=sign_decay_fn,
+                    py_sign_decay_fn=py_sign_decay_fn)
+
+    self._testDense(use_resource=True)
+    self._testDense(use_resource=True, learning_rate=0.01, alpha=0.1, beta=0.8)
+    self._testDense(use_resource=True,
+                    sign_decay_fn=sign_decay_fn,
+                    py_sign_decay_fn=py_sign_decay_fn)
+
+  def _testSparse(self,
+                  use_resource=False,
+                  learning_rate=0.1,
+                  sign_decay_fn=None,
+                  py_sign_decay_fn=None,
+                  alpha=1.0,
+                  beta=0.9):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session(use_gpu=True):
+        # Initialize variables for numpy implementation.
+        m0, m1 = 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+          global_step = resource_variable_ops.ResourceVariable(
+              0, trainable=False)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+          global_step = variables.Variable(
+              0, trainable=False)
+        grads0_np_indices = np.array([0, 1], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([2]))
+        grads1_np_indices = np.array([0, 1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([2]))
+        opt = addsign.AddSignOptimizer(
+            learning_rate=learning_rate,
+            alpha=alpha,
+            beta=beta,
+            sign_decay_fn=sign_decay_fn,
+        )
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                     global_step=global_step)
+        neg_update = opt.apply_gradients(zip([-grads0, -grads1], [var0, var1]),
+                                         global_step=global_step)
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        # Run 7 steps of AddSign
+        # first 4 steps with positive gradient
+        # last 3 steps with negative gradient (sign(gm) should be -1)
+        for t in range(1, 4):
+          if t < 5:
+            update.run()
+          else:
+            neg_update.run()
+
+          var0_np, m0 = addsign_update_numpy(
+              var0_np,
+              grads0_np,
+              m0,
+              learning_rate,
+              alpha=alpha,
+              beta=beta,
+              py_sign_decay_fn=py_sign_decay_fn,
+              t=t,
+          )
+          var1_np, m1 = addsign_update_numpy(
+              var1_np,
+              grads1_np,
+              m1,
+              learning_rate,
+              alpha=alpha,
+              beta=beta,
+              py_sign_decay_fn=py_sign_decay_fn,
+              t=t,
+          )
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSparse(self):
+    decay_steps = 10
+    sign_decay_fn = sign_decay.get_linear_decay_fn(decay_steps)
+    py_sign_decay_fn = py_linear_decay_fn(decay_steps)
+    self._testSparse(use_resource=False)
+    self._testSparse(use_resource=False,
+                     learning_rate=0.01,
+                     alpha=0.1,
+                     beta=0.8)
+    self._testSparse(use_resource=False,
+                     sign_decay_fn=sign_decay_fn,
+                     py_sign_decay_fn=py_sign_decay_fn)
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer.py b/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer.py
index f20c172ee376d0a808a21fe96bec80367bf2e9f4..4a905b1b2a0c3b7c4002451f37102eb2abdc5a2b 100644
--- a/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer.py
@@ -78,10 +78,11 @@ class DropStaleGradientOptimizer(optimizer.Optimizer):
   def apply_gradients(self, grads_and_vars, global_step=None, name=None):
     gradients = []
     # Number of stale gradients.
-    stale_counter = variable_scope.get_variable(
-        "stale_counter", [],
-        initializer=init_ops.zeros_initializer(),
-        trainable=False)
+    with ops.colocate_with(global_step):
+      stale_counter = variable_scope.get_variable(
+          "stale_counter", [],
+          initializer=init_ops.zeros_initializer(),
+          trainable=False)
 
     def _AcceptGradientOp():
       with ops.control_dependencies(
diff --git a/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..6132cba1f5aecbafd8ca820ecda39355dd768847
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
@@ -0,0 +1,344 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Wrapper optimizer for Elastic Average SGD """
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+
+from tensorflow.python.ops import gen_nn_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import session_run_hook
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import constant_op
+
+LOCAL_VARIABLE_NAME = 'local_center_variable'
+GLOBAL_VARIABLE_NAME = 'global_center_variable'
+
+
+class ElasticAverageCustomGetter(object):
+  """Custom_getter class is used to do:
+  1. Change trainable variables to local collection and place them at worker
+    device
+  2. Generate global variables(global center variables)
+  3. Generate local variables(local center variables) which record the global
+    variables and place them at worker device
+    Notice that the class should be used with tf.replica_device_setter,
+    so that the global center variables and global step variable can be placed
+    at ps device. Besides, use 'tf.get_variable' instead of 'tf.Variable' to
+    use this custom getter.
+
+  For example,
+  ea_custom_getter = ElasticAverageCustomGetter(worker_device)
+  with tf.device(
+    tf.train.replica_device_setter(
+      worker_device=worker_device,
+      ps_device="/job:ps/cpu:0",
+      cluster=cluster)),
+    tf.variable_scope('',custom_getter=ea_custom_getter):
+    hid_w = tf.get_variable(
+      initializer=tf.truncated_normal(
+          [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],
+          stddev=1.0 / IMAGE_PIXELS),
+      name="hid_w")
+    hid_b = tf.get_variable(initializer=tf.zeros([FLAGS.hidden_units]),
+                            name="hid_b")
+  """
+
+  def __init__(self, worker_device):
+    """Create a new `ElasticAverageCustomGetter`.
+
+    Args:
+      worker_device: String.  Name of the `worker` job.
+    """
+    self._worker_device = worker_device
+    self._local_map = {}
+    self._global_map = {}
+
+  def __call__(self, getter, name, trainable, collections, *args, **kwargs):
+    if trainable:
+      with ops.device(self._worker_device):
+        local_var = getter(name, trainable=True,
+                           collections=[ops.GraphKeys.LOCAL_VARIABLES],
+                           *args, **kwargs)
+      global_center_variable = variable_scope.variable(
+        name='%s/%s' %
+             (GLOBAL_VARIABLE_NAME,
+              name),
+        initial_value=local_var.initialized_value(),
+        trainable=False,
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES])
+
+      with ops.device(self._worker_device):
+        local_center_variable = variable_scope.variable(
+          name='%s/%s' % (LOCAL_VARIABLE_NAME, name),
+          initial_value=local_var.initialized_value(),
+          trainable=False,
+          collections=[ops.GraphKeys.LOCAL_VARIABLES])
+
+      self._local_map[local_var] = local_center_variable
+      self._global_map[local_var] = global_center_variable
+      return local_var
+    else:
+      return getter(name, trainable, collections, *args, **kwargs)
+
+
+class ElasticAverageOptimizer(optimizer.Optimizer):
+  """Wrapper optimizer that implements the Elastic Average SGD algorithm.
+  This is an async optimizer. During the training, Each worker will update
+  the local variables and maintains its own local_step, which starts from 0
+  and is incremented by 1 after each update of local variables. Whenever
+  the communication period divides the local step, the worker requests
+  the current global center variables and then computed the elastic difference
+  between global center variables and local variables. The elastic difference
+  then be used to update both local variables and global variables.
+  """
+
+  # Default value as paper described
+  BETA = 0.9
+
+  def __init__(
+      self,
+      opt,
+      num_worker,
+      ea_custom_getter,
+      communication_period=10,
+      moving_rate=None,
+      rho=None,
+      use_locking=True,
+      name="ElasticAverageOptimizer"):
+    """Construct a new gradient descent optimizer.
+
+    Args:
+      opt: The actual optimizer that will be used to update local variables.
+        Must be one of the Optimizer classes.
+      num_worker: The number of workers
+      ea_custom_getter: The ElasticAverageCustomGetter
+      communication_period: An int point value to controls the frequency
+        of the communication between every worker and the ps.
+      moving_rate: A floating point value to control the elastic difference.
+      rho: the amount of exploration we allow ine the model. The default
+        value is moving_rate/learning_rate
+      use_locking: If True use locks for update operations.
+      name: Optional name prefix for the operations created when applying
+        gradients. Defaults to "ElasticAverageOptimizer".
+    """
+    super(ElasticAverageOptimizer, self).__init__(use_locking, name)
+    self._opt = opt
+    self._num_worker = num_worker
+    self._period = communication_period
+    self._local_map = ea_custom_getter._local_map
+    self._global_map = ea_custom_getter._global_map
+
+    if moving_rate is None:
+      self._moving_rate = BETA / communication_period / num_worker
+    else:
+      self._moving_rate = moving_rate
+    if rho is None:
+      self._rho = self._moving_rate / self._opt._learning_rate
+    else:
+      self._rho = rho
+
+    self._local_step = variable_scope.get_variable(
+      initializer=0,
+      trainable=False,
+      collections=[ops.GraphKeys.LOCAL_VARIABLES],
+      name="local_step")
+    self._opt._prepare()
+
+  def compute_gradients(self, loss, var_list=None,
+                        gate_gradients=optimizer.Optimizer.GATE_OP,
+                        aggregation_method=None,
+                        colocate_gradients_with_ops=False,
+                        grad_loss=None):
+    """Compute gradients of `loss` for the variables in `var_list`.
+
+    Add rho*elastic_difference to loss to control the exploration
+    This is the first part of `minimize()`.  It returns a list
+    of (gradient, variable) pairs where "gradient" is the gradient
+    for "variable".  Note that "gradient" can be a `Tensor`, an
+    `IndexedSlices`, or `None` if there is no gradient for the
+    given variable.
+
+    Args:
+      loss: A Tensor containing the value to minimize.
+      var_list: Optional list or tuple of `tf.Variable` to update to minimize
+        `loss`.  Defaults to the list of variables collected in the graph
+        under the key `GraphKey.TRAINABLE_VARIABLES`.
+      gate_gradients: How to gate the computation of gradients.  Can be
+        `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`.
+      aggregation_method: Specifies the method used to combine gradient terms.
+        Valid values are defined in the class `AggregationMethod`.
+      colocate_gradients_with_ops: If True, try colocating gradients with
+        the corresponding op.
+      grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
+
+    Returns:
+      A list of (gradient, variable) pairs. Variable is always present, but
+      gradient can be `None`.
+
+    Raises:
+      TypeError: If `var_list` contains anything else than `Variable` objects.
+      ValueError: If some arguments are invalid.
+    """
+    if not var_list:
+      var_list = variables.trainable_variables()
+
+    elastic_difference = [math_ops.subtract(v, lv) for v, lv in zip(
+      variables.trainable_variables(),
+      [self._local_map[var] for var in var_list])]
+
+    distance_loss = self._rho * math_ops.add_n(
+                      [gen_nn_ops.l2_loss(ed) for ed in elastic_difference])
+
+    total_loss = loss + distance_loss
+    return self._opt.compute_gradients(total_loss, var_list,
+                                       gate_gradients, aggregation_method,
+                                       colocate_gradients_with_ops, grad_loss)
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """Apply gradients to global variables.
+
+    This is the second part of `minimize()`. It returns an `Operation` that
+    applies gradients.
+
+    Args:
+      grads_and_vars: List of (gradient, variable) pairs as returned by
+        `compute_gradients()`.
+      global_step: Optional `Variable` to increment by one after the
+        variables have been updated.
+      name: Optional name for the returned operation.  Default to the
+        name passed to the `Optimizer` constructor.
+
+    Returns:
+      An `Operation` that applies the specified gradients. If `global_step`
+      was not None, that operation also increments `global_step`.
+
+    Raises:
+      TypeError: If `grads_and_vars` is malformed.
+      ValueError: If none of the variables have gradients.
+    """
+    apply_updates = self._opt.apply_gradients(grads_and_vars)
+    with ops.control_dependencies([apply_updates]):
+      local_update = state_ops.assign_add(
+        self._local_step, 1, name='local_step_update').op
+
+    # update global variables.
+    def _Update_global_variables():
+      local_vars = [v for g, v in grads_and_vars if g is not None]
+      global_center_vars = [self._global_map[var] for var in local_vars]
+      local_center_vars = [self._local_map[var] for var in local_vars]
+      local_center_vars_update = []
+      for lvar, var in zip(local_center_vars, global_center_vars):
+        local_center_vars_update.append(lvar.assign(var))
+      update_ops = []
+      differences = []
+      with ops.control_dependencies(local_center_vars_update):
+        for v, lv in zip(local_vars, local_center_vars):
+          with ops.device(v.device):
+            differences.append(math_ops.subtract(v, lv))
+        for lvar, diff in zip(local_vars, differences):
+          with ops.device(lvar.device):
+            update_ops.append(state_ops.assign_sub(lvar, math_ops.multiply(
+              self._moving_rate, diff)))
+        for var, diff in zip(global_center_vars, differences):
+          with ops.device(var.device):
+            update_ops.append(state_ops.assign_add(var, math_ops.multiply(
+              self._moving_rate, diff)))
+        if global_step:
+          with ops.colocate_with(global_step):
+            update_ops.append(state_ops.assign_add(global_step, 1))
+      variable_update = control_flow_ops.group(*(update_ops))
+      return variable_update
+
+    with ops.control_dependencies([local_update]):
+      condition = math_ops.equal(math_ops.mod(
+        self._local_step, self._period), 0)
+      conditional_update = control_flow_ops.cond(
+        condition, _Update_global_variables, control_flow_ops.no_op)
+    return conditional_update
+
+  def get_init_op(self, task_index):
+    """Returns the op to let all the local variables and local center
+    variables equal to the global center variables before the training begins"""
+
+    def _Add_sync_queues_and_barrier(enqueue_after_list):
+      """Adds ops to enqueu on all worker queues"""
+      sync_queues = [
+        data_flow_ops.FIFOQueue(self._num_worker, [dtypes.bool], shapes=[[]],
+                                shared_name='%s%s' % (
+                                  'variable_init_sync_queue', i)) for i in
+        range(self._num_worker)]
+      queue_ops = []
+      # For each other worker, add an entry in a queue
+      token = constant_op.constant(False)
+      with ops.control_dependencies(enqueue_after_list):
+        for i, q in enumerate(sync_queues):
+          if i == task_index:
+            queue_ops.append(control_flow_ops.no_op())
+          else:
+            queue_ops.append(q.enqueue(token))
+      queue_ops.append(
+        sync_queues[task_index].dequeue_many(len(sync_queues) - 1))
+      return control_flow_ops.group(*queue_ops)
+
+    init_ops = []
+    local_vars = variables.trainable_variables()
+    global_center_vars = [self._global_map[var] for var in local_vars]
+    local_center_vars = [self._local_map[var] for var in local_vars]
+    if not (local_vars and global_center_vars and local_center_vars):
+      raise ValueError(
+        'The lists of local_variables, global_center_variables, '
+        'local_center_variables should not be empty  ')
+    for lvar, gc_var, lc_var in zip(
+        local_vars, global_center_vars, local_center_vars):
+      init_ops.append(state_ops.assign(lvar, gc_var))
+      init_ops.append(state_ops.assign(lc_var, gc_var))
+
+    init_op = control_flow_ops.group(*(init_ops))
+    sync_queue_op = _Add_sync_queues_and_barrier([init_op])
+    return sync_queue_op
+
+  def make_session_run_hook(self, is_chief, task_index):
+    """Creates a hook to handle ElasticAverageOptimizerHook ops such as initialization."""
+    return _ElasticAverageOptimizerHook(self, is_chief, task_index)
+
+
+class _ElasticAverageOptimizerHook(session_run_hook.SessionRunHook):
+  def __init__(self, ea_optimizer, is_chief, task_index):
+    """Creates hook to handle ElasticAverageOptimizer initialization ops.
+
+    Args:
+      ea_optimizer: `ElasticAverageOptimizer` which this hook will initialize.
+      is_chief: `Bool`, whether is this a chief replica or not.
+    """
+    self._ea_optimizer = ea_optimizer
+    self._is_chief = is_chief
+    self._task_index = task_index
+
+  def begin(self):
+    self._local_init_op = variables.local_variables_initializer()
+    self._global_init_op = None
+    if self._is_chief:
+      self._global_init_op = variables.global_variables_initializer()
+    self._variable_init_op = self._ea_optimizer.get_init_op(self._task_index)
diff --git a/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..446e91018d477d75116f6b78a2443ed79ed3b3ef
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py
@@ -0,0 +1,225 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ElasticAverageOptimizer."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import portpicker
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
+from tensorflow.python.training import server_lib
+from tensorflow.python.training import training
+from tensorflow.python.training import training_util
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import device_setter
+
+from tensorflow.contrib.opt.python.training.elastic_average_optimizer import \
+  ElasticAverageOptimizer, ElasticAverageCustomGetter, GLOBAL_VARIABLE_NAME
+
+
+def create_local_cluster(num_workers, num_ps, protocol="grpc"):
+  """Create local GRPC servers and return them."""
+  worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
+  ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)]
+  cluster_dict = {
+    "worker": ["localhost:%s" % port for port in worker_ports],
+    "ps": ["localhost:%s" % port for port in ps_ports]
+  }
+  cs = server_lib.ClusterSpec(cluster_dict)
+
+  workers = [
+    server_lib.Server(
+      cs, job_name="worker", protocol=protocol, task_index=ix, start=True)
+    for ix in range(num_workers)
+  ]
+  ps_servers = [
+    server_lib.Server(
+      cs, job_name="ps", protocol=protocol, task_index=ix, start=True)
+    for ix in range(num_ps)
+  ]
+
+  return cluster_dict, workers, ps_servers
+
+
+# Creates the workers and return their sessions, graphs, train_ops.
+# Cheif worker will update at last
+def _get_workers(num_workers, period, workers, moving_rate):
+  sessions = []
+  graphs = []
+  train_ops = []
+  for worker_id in range(num_workers):
+    graph = ops.Graph()
+    is_chief = (worker_id == 0)
+    with graph.as_default():
+      worker_device = "/job:worker/task:%d/cpu:0" % (worker_id)
+      ea_coustom = ElasticAverageCustomGetter(
+        worker_device=worker_device)
+      with variable_scope.variable_scope('',
+                                         custom_getter=ea_coustom), ops.device(
+        device_setter.replica_device_setter(worker_device=worker_device,
+                                            ps_device="/job:ps/task:0/cpu:0",
+                                            ps_tasks=1)):
+        global_step = variables.Variable(0, name='global_step',
+                                         trainable=False)
+        var_0 = variable_scope.get_variable(initializer=0.0, name="v0")
+        var_1 = variable_scope.get_variable(initializer=1.0, name="v1")
+
+      with ops.device("/job:worker/task:" + str(worker_id)):
+        grads_0 = constant_op.constant(-1.0)
+        grads_1 = constant_op.constant(-1.0)
+
+        sgd_opt = gradient_descent.GradientDescentOptimizer(1.0)
+        opt = ElasticAverageOptimizer(
+          opt=sgd_opt,
+          num_worker=num_workers,
+          moving_rate=moving_rate,
+          communication_period=period,
+          ea_custom_getter=ea_coustom
+        )
+        train_op = [
+          opt.apply_gradients(
+            ([grads_0, var_0],
+             [grads_1, var_1]), global_step)
+        ]
+        easgd_hook = opt.make_session_run_hook(is_chief, worker_id)
+      # Creates MonitoredSession
+      sess = training.MonitoredTrainingSession(workers[worker_id].target,
+                                               hooks=[easgd_hook])
+
+    sessions.append(sess)
+    graphs.append(graph)
+    train_ops.append(train_op)
+
+  return sessions, graphs, train_ops
+
+
+class ElasticAverageOptimizerTest(test.TestCase):
+  def _run(self, train_op, sess):
+    sess.run(train_op)
+
+  def test1Workers2Period(self):
+    num_workers = 1
+    communication_period = 2
+    num_ps = 1
+    cluster, workers, _ = create_local_cluster(num_workers=num_workers,
+                                               num_ps=num_ps)
+
+    sessions, graphs, train_ops = _get_workers(num_workers,
+                                               communication_period,
+                                               workers, 1.0)
+
+    var_0 = graphs[0].get_tensor_by_name('v0:0')
+    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    global_step = training_util.get_global_step(graphs[0])
+    var_0_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v0:0")
+    var_1_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v1:0")
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(var_0_g))
+    self.assertAllEqual(1.0, sessions[0].run(var_1_g))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    sessions[0].run(train_ops[0])
+
+    self.assertAllEqual(1.0, sessions[0].run(var_0))
+    self.assertAllEqual(2.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(var_0_g))
+    self.assertAllEqual(1.0, sessions[0].run(var_1_g))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    # iteration 2, global variable update
+    sessions[0].run(train_ops[0])
+
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(2.0, sessions[0].run(var_0_g))
+    self.assertAllEqual(3.0, sessions[0].run(var_1_g))
+    self.assertAllEqual(1, sessions[0].run(global_step))
+
+    # iteration 3
+    sessions[0].run(train_ops[0])
+
+    self.assertAllEqual(1.0, sessions[0].run(var_0))
+    self.assertAllEqual(2.0, sessions[0].run(var_1))
+    self.assertAllEqual(2.0, sessions[0].run(var_0_g))
+    self.assertAllEqual(3.0, sessions[0].run(var_1_g))
+    self.assertAllEqual(1, sessions[0].run(global_step))
+
+  def test2Worker1Period(self):
+    num_workers = 2
+    communication_period = 1
+    num_ps = 2
+    cluster, workers, _ = create_local_cluster(num_workers=num_workers,
+                                               num_ps=num_ps)
+
+    sessions, graphs, train_ops = _get_workers(num_workers,
+                                               communication_period,
+                                               workers, 0.5)
+
+    var_0 = graphs[0].get_tensor_by_name('v0:0')
+    var_1 = graphs[0].get_tensor_by_name('v1:0')
+
+    var_0_1 = graphs[1].get_tensor_by_name('v0:0')
+    var_1_1 = graphs[1].get_tensor_by_name('v1:0')
+
+    var_0_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v0:0")
+    var_1_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v1:0")
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[1].run(var_0_1))
+    self.assertAllEqual(1.0, sessions[1].run(var_1_1))
+    self.assertAllEqual(0.0, sessions[0].run(var_0_g))
+    self.assertAllEqual(1.0, sessions[0].run(var_1_g))
+
+    sessions[0].run(train_ops[0])
+    sessions[1].run(train_ops[1])
+
+    self.assertAllEqual(0.5, sessions[0].run(var_0))
+    self.assertAllEqual(1.5, sessions[0].run(var_1))
+    self.assertAllEqual(0.75, sessions[0].run(var_0_g))
+    self.assertAllEqual(1.75, sessions[0].run(var_1_g))
+    self.assertAllEqual(0.75, sessions[1].run(var_0_1))
+    self.assertAllEqual(1.75, sessions[1].run(var_1_1))
+
+  def testPS2TasksWithClusterSpecClass(self):
+    cluster_spec = server_lib.ClusterSpec({
+      "ps": ["ps0:2222", "ps1:2222"],
+      "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+    })
+    ea_coustom = ElasticAverageCustomGetter(
+      worker_device="/job:worker/task:0")
+    from tensorflow.python.training import device_setter
+    with ops.device(
+        device_setter.replica_device_setter(cluster=cluster_spec,
+                                            worker_device="/job:worker/task:0",
+                                            ps_device="/job:ps")), \
+         variable_scope.variable_scope('', custom_getter=ea_coustom):
+      v = variable_scope.get_variable(initializer=[1, 2], name="v")
+      w = variable_scope.get_variable(initializer=[2, 1], name='w')
+      v_g, w_g = ea_coustom._global_map[v],ea_coustom._global_map[w]
+      self.assertDeviceEqual("/job:worker/task:0", v.device)
+      self.assertDeviceEqual("job:ps/task:0", v_g.device)
+      self.assertDeviceEqual("/job:worker/task:0", w.device)
+      self.assertDeviceEqual("job:ps/task:1", w_g.device)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/opt/python/training/moving_average_optimizer.py b/tensorflow/contrib/opt/python/training/moving_average_optimizer.py
index c48494585eb66c40e69a87439265b9cd08d51712..d68ad23d65500cc2348459cdc53030c2ea08373a 100644
--- a/tensorflow/contrib/opt/python/training/moving_average_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/moving_average_optimizer.py
@@ -86,6 +86,9 @@ class MovingAverageOptimizer(optimizer.Optimizer):
     self._variable_map = None
     self._sequential_update = sequential_update
 
+  def compute_gradients(self, *args, **kwargs):
+    return self._optimizer.compute_gradients(*args, **kwargs)
+
   def apply_gradients(self, grads_and_vars, global_step=None, name=None):
     train_op = self._optimizer.apply_gradients(
         grads_and_vars, global_step=global_step, name=name)
diff --git a/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
index a4ffbfe1c6bf8a63b10593e6c783047c99cad523..60929add198f2e69b5acc2eb5516dafc82b1f3ba 100644
--- a/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
@@ -116,6 +116,37 @@ class MovingAverageOptimizerTest(test.TestCase):
       with self.assertRaises(RuntimeError):
         _ = opt.swapping_saver([var])
 
+  def testCorrectOverride(self):
+
+    class WrapperOptimizer(gradient_descent.GradientDescentOptimizer):
+
+      def compute_gradients(self, *args, **kwargs):
+        self.compute_gradients_called = True
+        return super(WrapperOptimizer, self).compute_gradients(
+            *args, **kwargs)
+
+      def apply_gradients(self, *args, **kwargs):
+        self.apply_gradients_called = True
+        return super(WrapperOptimizer, self).apply_gradients(*args, **kwargs)
+
+    with self.test_session() as sess:
+      var = variables.Variable([1.2], name='var', dtype=dtypes.float32)
+      loss = var ** 2
+      wrapper_opt = WrapperOptimizer(learning_rate=2.0)
+      opt = moving_average_optimizer.MovingAverageOptimizer(wrapper_opt)
+      train_op = opt.minimize(loss)
+
+      # Check that both methods are called on the underlying optimizer.
+      self.assertTrue(wrapper_opt.compute_gradients_called)
+      self.assertTrue(wrapper_opt.apply_gradients_called)
+
+      # Run train_op once, and verify that we've updated the variable.
+      variables.global_variables_initializer().run()
+      sess.run(train_op)
+      var_value = sess.run(var)
+      # Started at 1.2, gradient is 2*1.2=2.4, lr=2, so should now be -3.6.
+      self.assertNear(-3.6, var_value, 1e-6)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper.py b/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper.py
index c26037935d9756d56b6778cbabffebda4c274a47..cb6c77a86feedde3285d75092511c8eb1e63b2a5 100644
--- a/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper.py
+++ b/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper.py
@@ -12,9 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
-"""An optimizer wrapper that ensures correct behaviour
-of stateful optimizers with multitask loss."""
+"""An optimizer wrapper for stateful optimizers with multitask loss."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -30,26 +28,27 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer
 
-__all__ = ["MultitaskOptimizerWrapper",
-           "clip_gradients_by_global_norm"]
+__all__ = ['MultitaskOptimizerWrapper', 'clip_gradients_by_global_norm']
+
 
 def _is_all_zeros(grad):
   all_zeros = math_ops.equal(math_ops.count_nonzero(grad), 0)
   return all_zeros
 
+
 def _get_wrapper(fn, opt):
+
   def wrapper(self, grad, *args, **kwargs):  # pylint: disable=unused-argument
     all_zeros = _is_all_zeros(grad)
-    return control_flow_ops.cond(
-        all_zeros,
-        control_flow_ops.no_op,
-        lambda: fn(grad, *args, **kwargs))
+    return control_flow_ops.cond(all_zeros, control_flow_ops.no_op,
+                                 lambda: fn(grad, *args, **kwargs))
+
   wrapper = types.MethodType(wrapper, opt)
   return wrapper
 
+
 class MultitaskOptimizerWrapper(object):
-  """Optimizer wrapper that ensures that
-  all-zero gradients don't affect the optimizer state.
+  """Optimizer wrapper making all-zero gradients harmless.
 
   This might be useful when a multi-task loss is used,
   and some components of the loss might be
@@ -88,20 +87,20 @@ class MultitaskOptimizerWrapper(object):
     gradvars_clipped, global_step=batch)
   ```
   """
+
   def __init__(self, opt):
-    """
+    """Constructor.
+
     Args:
-    opt: an instance of a class that implements tf.train.Optimizer.
+      opt: an instance of a class that implements tf.train.Optimizer.
     """
     if not isinstance(opt, optimizer.Optimizer):
       raise TypeError(
-          "Supplied optimizer must be an instance of tf.train.Optimizer")
+          'Supplied optimizer must be an instance of tf.train.Optimizer')
     self._opt = opt
-    overriden_methods = ('_apply_dense',
-                         '_resource_apply_dense',
-                         '_apply_sparse',
-                         '_resource_apply_sparse')
-    for name in overriden_methods:
+    overridden_methods = ('_apply_dense', '_resource_apply_dense',
+                          '_apply_sparse', '_resource_apply_sparse')
+    for name in overridden_methods:
       fn = getattr(self._opt, name)
       wrapper = _get_wrapper(fn, self._opt)
       setattr(self._opt, name, wrapper)
@@ -112,27 +111,30 @@ class MultitaskOptimizerWrapper(object):
 
 def clip_gradients_by_global_norm(gradients_variables, clip_norm=20.):
   """Clips gradients of a multitask loss by their global norm.
+
   Ignores all-zero tensors when computing the global norm.
 
   Args:
-  gradients_variables: a list of pairs (gradient, variable).
-  clip_norm: a float Tensor, the global norm to clip on. Default is 20.0.
+    gradients_variables: a list of pairs (gradient, variable).
+    clip_norm: a float Tensor, the global norm to clip on. Default is 20.0.
 
   Returns:
-  list: A list of pairs of the same type as gradients_variables,.
-  fixed_global_norm: A 0-D (scalar) Tensor representing the global norm.
+    list: A list of pairs of the same type as gradients_variables,.
+    fixed_global_norm: A 0-D (scalar) Tensor representing the global norm.
   """
   gradients, variables = six.moves.zip(*gradients_variables)
+
   def _replace_nonexisting_grad(grad):
     if grad is None:
       return grad
     all_zeros = _is_all_zeros(grad)
-    return control_flow_ops.cond(all_zeros,
-                                 lambda: array_ops.zeros(
-                                     [], dtype=dtypes.as_dtype(grad.dtype)),
-                                 lambda: grad)
+    return control_flow_ops.cond(
+        all_zeros,
+        lambda: array_ops.zeros([], dtype=dtypes.as_dtype(grad.dtype)),
+        lambda: grad)
+
   nonzero_gradients = [_replace_nonexisting_grad(g) for g in gradients]
   fixed_global_norm = clip_ops.global_norm(nonzero_gradients)
-  gradients, _ = clip_ops.clip_by_global_norm(gradients, clip_norm,
-                                              use_norm=fixed_global_norm)
+  gradients, _ = clip_ops.clip_by_global_norm(
+      gradients, clip_norm, use_norm=fixed_global_norm)
   return list(six.moves.zip(gradients, variables)), fixed_global_norm
diff --git a/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper_test.py b/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper_test.py
index b06213f71594c96051fd0bccc28402e8ae2bd208..618d8eb18d2e9b738d2c2f5b8e563aeffdf82988 100644
--- a/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper_test.py
+++ b/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper_test.py
@@ -18,6 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+import six
+
 from tensorflow.contrib.opt.python.training import multitask_optimizer_wrapper
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -25,13 +28,11 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import momentum
 
-import numpy as np
-import six
 
 class MultitaskOptimizerWrapperTest(test.TestCase):
+  """Tests for the multitask optimizer wrapper.
   """
-  Tests for the multitask optimizer wrapper.
-  """
+
   def testWrapper(self):
     with self.test_session():
       var0 = variables.Variable([1.0, 2.0], dtype=dtypes.float32)
@@ -39,12 +40,10 @@ class MultitaskOptimizerWrapperTest(test.TestCase):
       grads0 = constant_op.constant([0.1, 0.1], dtype=dtypes.float32)
       grads1 = constant_op.constant([0.01, 0.01], dtype=dtypes.float32)
       grads_allzero = constant_op.constant([0.0, 0.0], dtype=dtypes.float32)
-      mom_opt_impl = momentum.MomentumOptimizer(
-          learning_rate=2.0, momentum=0.9)
+      mom_opt_impl = momentum.MomentumOptimizer(learning_rate=2.0, momentum=0.9)
       mom_opt = multitask_optimizer_wrapper.MultitaskOptimizerWrapper(
           mom_opt_impl)
-      mom_update = mom_opt.apply_gradients(
-          zip([grads0, grads1], [var0, var1]))
+      mom_update = mom_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       mom_update_partial = mom_opt.apply_gradients(
           zip([grads_allzero, grads1], [var0, var1]))
       mom_update_no_action = mom_opt.apply_gradients(
@@ -63,14 +62,13 @@ class MultitaskOptimizerWrapperTest(test.TestCase):
       # Step 1: normal momentum update.
       self.evaluate(mom_update)
       # Check that the momentum accumulators have been updated.
-      self.assertAllCloseAccordingToType(np.array([0.1, 0.1]),
-                                         self.evaluate(slot0))
-      self.assertAllCloseAccordingToType(np.array([0.01, 0.01]),
-                                         self.evaluate(slot1))
+      self.assertAllCloseAccordingToType(
+          np.array([0.1, 0.1]), self.evaluate(slot0))
+      self.assertAllCloseAccordingToType(
+          np.array([0.01, 0.01]), self.evaluate(slot1))
       # Check that the parameters have been updated.
       self.assertAllCloseAccordingToType(
-          np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
-          self.evaluate(var0))
+          np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), self.evaluate(var0))
       self.assertAllCloseAccordingToType(
           np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
           self.evaluate(var1))
@@ -78,8 +76,8 @@ class MultitaskOptimizerWrapperTest(test.TestCase):
       # Step 2: momentum update that changes only slot1 but not slot0.
       self.evaluate(mom_update_partial)
       # Check that only the relevant momentum accumulator has been updated.
-      self.assertAllCloseAccordingToType(np.array([0.1, 0.1]),
-                                         self.evaluate(slot0))
+      self.assertAllCloseAccordingToType(
+          np.array([0.1, 0.1]), self.evaluate(slot0))
       self.assertAllCloseAccordingToType(
           np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
           self.evaluate(slot1))
@@ -87,8 +85,8 @@ class MultitaskOptimizerWrapperTest(test.TestCase):
       # Step 3: momentum update that does not change anything.
       self.evaluate(mom_update_no_action)
       # Check that the momentum accumulators have *NOT* been updated.
-      self.assertAllCloseAccordingToType(np.array([0.1, 0.1]),
-                                         self.evaluate(slot0))
+      self.assertAllCloseAccordingToType(
+          np.array([0.1, 0.1]), self.evaluate(slot0))
       self.assertAllCloseAccordingToType(
           np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
           self.evaluate(slot1))
@@ -105,8 +103,9 @@ class MultitaskOptimizerWrapperTest(test.TestCase):
       grads3 = None
       varlist = [var0, var1, var2, var3]
       gradients = [grads0, grads1, grads2, grads3]
-      clipped_gradvars, global_norm = multitask_optimizer_wrapper.clip_gradients_by_global_norm(
-          six.moves.zip(gradients, varlist), clip_norm=1.0)
+      clipped_gradvars, global_norm = (
+          multitask_optimizer_wrapper.clip_gradients_by_global_norm(
+              six.moves.zip(gradients, varlist), clip_norm=1.0))
       clipped_grads = list(six.moves.zip(*clipped_gradvars))[0]
       reference_global_norm = np.sqrt(np.sum(np.square([10.0, 15.0, 0.0, 5.0])))
       self.assertAllCloseAccordingToType(
@@ -115,5 +114,6 @@ class MultitaskOptimizerWrapperTest(test.TestCase):
           self.evaluate(clipped_grads[2]), np.array([0., 0.]))
       self.assertEqual(clipped_grads[3], None)
 
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/opt/python/training/powersign.py b/tensorflow/contrib/opt/python/training/powersign.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f7521581fd685c7a65119e2bd2b4af64aafcd69
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/powersign.py
@@ -0,0 +1,173 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of PowerSign."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import training_ops
+
+
+class PowerSignOptimizer(optimizer.Optimizer):
+  """Optimizer that implements the PowerSign update.
+
+  See  Neural Optimizer Search with Reinforcement Learning
+  [Bello et al., ICML2017].
+  """
+
+  def __init__(self,
+               learning_rate=0.1,
+               base=math.e,
+               beta=0.9,
+               sign_decay_fn=None,
+               use_locking=False,
+               name='PowerSignOptimizer'):
+    """Constructs a new PowerSignOptimizer object.
+
+    Initialization:
+
+    ```
+    m_0 <- 0 (Initialize initial 1st moment vector)
+    t <- 0 (Initialize timestep)
+    ```
+
+    Update:
+
+    ```
+    t <- t + 1
+    m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+    sign_decay <- sign_decay(t)
+    update <- base ** (sign_decay * sign(g) * sign(m)) * g
+    variable <- variable - lr_t * update
+    ```
+
+    Example usage for PowerSign-cd (PowerSign with cosine sign decay)
+    ```
+    decay_steps = 1000
+    linear_decay_fn = sign_decays.get_linear_decay_fn(decay_steps)
+    opt = PowerSignOptimizer(learning_rate=0.1, sign_decay_fn=linear_decay_fn)
+    ```
+
+    Args:
+      learning_rate: learning_rate used when taking a step.
+      base: base used in optimizer.
+      beta: decay used for computing the moving average m.
+      sign_decay_fn: decay function applied to the sign(g*m) quantity.
+          Takes global_step as an argument and returns the quantity to multiply
+          the sign(g*m) by.
+      use_locking: If True use locks for update operations.
+      name: Optional name for the operations created iwhen applying gradients.
+        Defaults to "PowerSignOptimizer".
+    """
+    super(PowerSignOptimizer, self).__init__(use_locking, name)
+    self._lr = learning_rate
+    self._beta = beta
+    self._logbase = math.log(base)
+
+    self._sign_decay_fn = sign_decay_fn
+
+    # Tensor versions of the constructor arguments, created in _prepare().
+    self._lr_t = None
+    self._beta_t = None
+    self._logbase_t = None
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    if self._sign_decay_fn is not None:
+      self._sign_decay_t = ops.convert_to_tensor(
+          self._sign_decay_fn(global_step), name='sign_decay')
+    return super(PowerSignOptimizer, self).apply_gradients(
+        grads_and_vars, global_step=global_step, name=name)
+
+  def _create_slots(self, var_list):
+    # Create slots for the first moment.
+    for v in var_list:
+      self._zeros_slot(v, 'm', self._name)
+
+  def _prepare(self):
+    self._lr_t = ops.convert_to_tensor(self._lr, name='learning_rate')
+    self._beta_t = ops.convert_to_tensor(self._beta, name='beta')
+    self._logbase_t = ops.convert_to_tensor(self._logbase, name='logbase')
+    if self._sign_decay_fn is None:
+      self._sign_decay_t = ops.convert_to_tensor(1.0, name='sign_decay')
+
+  def _apply_dense(self, grad, var):
+    m = self.get_slot(var, 'm')
+    return training_ops.apply_power_sign(
+        var,
+        m,
+        math_ops.cast(self._lr_t, var.dtype.base_dtype),
+        math_ops.cast(self._logbase_t, var.dtype.base_dtype),
+        math_ops.cast(self._sign_decay_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta_t, var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking).op
+
+  def _resource_apply_dense(self, grad, var):
+    m = self.get_slot(var, 'm')
+    return training_ops.resource_apply_power_sign(
+        var.handle,
+        m.handle,
+        math_ops.cast(self._lr_t, var.dtype.base_dtype),
+        math_ops.cast(self._logbase_t, var.dtype.base_dtype),
+        math_ops.cast(self._sign_decay_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta_t, var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _apply_sparse(self, grad, var):
+    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
+    beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype)
+    logbase_t = math_ops.cast(self._logbase_t, var.dtype.base_dtype)
+    e_t = math_ops.cast(math.e, var.dtype.base_dtype)
+
+    m = self.get_slot(var, 'm')
+    m_t = state_ops.assign(
+        m, (m * beta_t) + (grad * (1 - beta_t)), use_locking=self._use_locking)
+
+    sign_g = ops.IndexedSlices(
+        math_ops.sign(grad.values), grad.indices, dense_shape=grad.dense_shape)
+    sign_gm = ops.IndexedSlices(
+        array_ops.gather(math_ops.sign(m_t), sign_g.indices) * sign_g.values,
+        sign_g.indices,
+        dense_shape=sign_g.dense_shape)
+
+    sign_decayed = math_ops.cast(
+        self._sign_decay_t, var.dtype.base_dtype)
+    multiplier_values = math_ops.pow(
+        e_t, logbase_t * sign_decayed * sign_gm.values)
+    multiplier = ops.IndexedSlices(
+        multiplier_values, sign_gm.indices, dense_shape=sign_gm.dense_shape)
+
+    final_update = ops.IndexedSlices(
+        lr_t * multiplier.values * grad.values,
+        multiplier.indices,
+        dense_shape=multiplier.dense_shape)
+
+    var_update = state_ops.scatter_sub(
+        var,
+        final_update.indices,
+        final_update.values,
+        use_locking=self._use_locking)
+
+    return control_flow_ops.group(* [var_update, m_t])
diff --git a/tensorflow/contrib/opt/python/training/powersign_test.py b/tensorflow/contrib/opt/python/training/powersign_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff7b1a72d47d8ef54980905323bcaf358c988a82
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/powersign_test.py
@@ -0,0 +1,268 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for PowerSign."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import numpy as np
+
+from tensorflow.contrib.opt.python.training import powersign
+from tensorflow.contrib.opt.python.training import sign_decay
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def py_linear_decay_fn(decay_steps):
+  def linear_decay(step):
+    step = min(step, decay_steps)
+    return float(decay_steps - step) / decay_steps
+  return linear_decay
+
+
+def powersign_update_numpy(params,
+                           g_t,
+                           m,
+                           lr,
+                           base=math.e,
+                           beta=0.9,
+                           py_sign_decay_fn=None,
+                           t=None):
+  m_t = beta * m + (1 - beta) * g_t
+  if py_sign_decay_fn is None:
+    sign_decayed = 1.0
+  else:
+    sign_decayed = py_sign_decay_fn(t-1)
+  multiplier = base ** (sign_decayed * np.sign(g_t) * np.sign(m_t))
+  params_t = params - lr * multiplier * g_t
+  return params_t, m_t
+
+
+class PowerSignTest(test.TestCase):
+
+  def _testDense(self,
+                 use_resource=False,
+                 learning_rate=0.1,
+                 sign_decay_fn=None,
+                 py_sign_decay_fn=None,
+                 base=math.e,
+                 beta=0.9):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session(use_gpu=True):
+        # Initialize variables for numpy implementation.
+        m0, m1 = 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+          global_step = resource_variable_ops.ResourceVariable(
+              0, trainable=False)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+          global_step = variables.Variable(
+              0, trainable=False)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        opt = powersign.PowerSignOptimizer(
+            learning_rate=learning_rate,
+            base=base,
+            beta=beta,
+            sign_decay_fn=sign_decay_fn,
+        )
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                     global_step=global_step)
+        neg_update = opt.apply_gradients(zip([-grads0, -grads1], [var0, var1]),
+                                         global_step=global_step)
+
+        if context.in_graph_mode():
+          self.evaluate(variables.global_variables_initializer())
+          # Fetch params to validate initial values
+          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+          self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        # Run 7 steps of powersign
+        # first 4 steps with positive gradient
+        # last 3 steps with negative gradient (sign(gm) should be -1)
+        for t in range(1, 8):
+          if t < 5:
+            if context.in_graph_mode():
+              self.evaluate(update)
+            elif t > 1:
+              opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                  global_step=global_step)
+          else:
+            if context.in_graph_mode():
+              self.evaluate(neg_update)
+            elif t > 1:
+              opt.apply_gradients(zip([-grads0, -grads1], [var0, var1]),
+                                  global_step=global_step)
+
+          var0_np, m0 = powersign_update_numpy(
+              var0_np,
+              grads0_np if t < 5 else -grads0_np,
+              m0,
+              learning_rate,
+              base=base,
+              beta=beta,
+              py_sign_decay_fn=py_sign_decay_fn,
+              t=t,
+          )
+          var1_np, m1 = powersign_update_numpy(
+              var1_np,
+              grads1_np if t < 5 else -grads1_np,
+              m1,
+              learning_rate,
+              base=base,
+              beta=beta,
+              py_sign_decay_fn=py_sign_decay_fn,
+              t=t,
+          )
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  def testDense(self):
+    decay_steps = 10
+    sign_decay_fn = sign_decay.get_linear_decay_fn(decay_steps)
+    py_sign_decay_fn = py_linear_decay_fn(decay_steps)
+    self._testDense(use_resource=False)
+    self._testDense(use_resource=False,
+                    learning_rate=0.1,
+                    base=10.0,
+                    beta=0.8)
+    self._testDense(use_resource=False,
+                    sign_decay_fn=sign_decay_fn,
+                    py_sign_decay_fn=py_sign_decay_fn)
+
+    self._testDense(use_resource=True)
+    self._testDense(use_resource=True, learning_rate=0.1, base=10.0, beta=0.8)
+    self._testDense(use_resource=True,
+                    sign_decay_fn=sign_decay_fn,
+                    py_sign_decay_fn=py_sign_decay_fn)
+
+  def _testSparse(self,
+                  use_resource=False,
+                  learning_rate=0.1,
+                  sign_decay_fn=None,
+                  py_sign_decay_fn=None,
+                  base=math.e,
+                  beta=0.9):
+    with self.test_session(use_gpu=True):
+      for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+        # Initialize variables for numpy implementation.
+        m0, m1 = 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+          global_step = resource_variable_ops.ResourceVariable(
+              0, trainable=False)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+          global_step = variables.Variable(
+              0, trainable=False)
+        grads0_np_indices = np.array([0, 1], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([2]))
+        grads1_np_indices = np.array([0, 1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([2]))
+        opt = powersign.PowerSignOptimizer(
+            learning_rate=learning_rate,
+            base=base,
+            beta=beta,
+            sign_decay_fn=sign_decay_fn,
+        )
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                     global_step=global_step)
+        neg_update = opt.apply_gradients(zip([-grads0, -grads1], [var0, var1]),
+                                         global_step=global_step)
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        # Run 3 steps of powersign
+        # first 4 steps with positive gradient
+        # last 3 steps with negative gradient (sign(gm) should be -1)
+        for t in range(1, 8):
+          if t < 5:
+            update.run()
+          else:
+            neg_update.run()
+
+          var0_np, m0 = powersign_update_numpy(
+              var0_np,
+              grads0_np if t < 5 else -grads0_np,
+              m0,
+              learning_rate,
+              base=base,
+              beta=beta,
+              py_sign_decay_fn=py_sign_decay_fn,
+              t=t,
+          )
+          var1_np, m1 = powersign_update_numpy(
+              var1_np,
+              grads1_np if t < 5 else -grads1_np,
+              m1,
+              learning_rate,
+              base=base,
+              beta=beta,
+              py_sign_decay_fn=py_sign_decay_fn,
+              t=t,
+          )
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSparse(self):
+    decay_steps = 10
+    sign_decay_fn = sign_decay.get_linear_decay_fn(decay_steps)
+    py_sign_decay_fn = py_linear_decay_fn(decay_steps)
+    self._testSparse(use_resource=False)
+    self._testSparse(use_resource=False,
+                     learning_rate=0.01,
+                     base=2.0,
+                     beta=0.8)
+    self._testSparse(use_resource=False,
+                     sign_decay_fn=sign_decay_fn,
+                     py_sign_decay_fn=py_sign_decay_fn)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/opt/python/training/sign_decay.py b/tensorflow/contrib/opt/python/training/sign_decay.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8870c072110da145c0bb78e20c3584083438ea0
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/sign_decay.py
@@ -0,0 +1,158 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of the sign decay functions used in PowerSign and AddSign.
+
+See [Bello et al., ICML 2017] Neural Optimizer Search with Reinforcement
+Learning for details.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+
+def get_linear_decay_fn(decay_steps):
+  """Returns a function that computes a linear decay.
+
+  This decay computes linear annealing:
+    max(0, (decay_steps - global_step) / decay_steps)
+
+  Example usage:
+  ```
+  decay_steps = 1000
+  linear_decay_fn = get_linear_decay_fn(decay_steps)
+  decayed = linear_decay_fn(global_step)
+  x *= decayed
+  ```
+  Args:
+    decay_steps: number of steps to decay over.
+  Returns:
+    linear_decay_fn: a function that computes the linear decay.
+  """
+  # pylint:disable=missing-docstring
+  def linear_decay_fn(global_step):
+    if global_step is None:
+      raise ValueError("global_step is required for linear_decay.")
+    global_step = math_ops.minimum(global_step, decay_steps)
+    remaining_steps = math_ops.to_int32(decay_steps) - math_ops.to_int32(
+        global_step)
+    decayed = math_ops.to_float(remaining_steps) / math_ops.to_float(
+        decay_steps)
+    return math_ops.maximum(0.0, decayed)
+  # pylint:enable=missing-docstring
+  return linear_decay_fn
+
+
+def get_cosine_decay_fn(decay_steps, num_periods=0.5, zero_after=None):
+  """Returns a function that computes a cosine decay.
+
+  This decay computes cosine annealing:
+    0.5 * (1.0 + cos(2.0 * pi * num_periods * global_step / decay_steps))
+
+  This decay can be used to decay the sign quantity in the AddSign and PowerSign
+  optimizers discovered in
+  [Bello et al., ICML 2017] Neural Optimizer Search with RL.
+
+  Example usage:
+  ```
+  decay_steps = 1000
+  num_periods = 2
+  cosine_decay_fn = get_cosine_decay_fn(decay_steps, num_periods=num_periods)
+  decayed = cosine_decay_fn(global_step)
+  x *= decayed
+  ```
+  Args:
+    decay_steps: number of steps to decay over.
+    num_periods: number of periods for cosine signal. 0.5 by default,
+      which maps the last decay step to 0.
+    zero_after: if not None, number after which the decay function
+      will just return 0.
+  Returns:
+    cosine_decay_fn: a function that computes the cosine decay.
+  """
+  # pylint:disable=missing-docstring
+  def cosine_decay_fn(global_step):
+    if global_step is None:
+      raise ValueError("global_step is required for cosine_decay.")
+    global_step = math_ops.minimum(global_step, decay_steps)
+    completed_fraction = math_ops.to_float(global_step) / math_ops.to_float(
+        decay_steps)
+    fraction = 2.0 * num_periods * completed_fraction
+    decayed = 0.5 * (
+        1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
+    if zero_after is not None:
+      decayed = array_ops.where(
+          math_ops.greater_equal(fraction, 2 * zero_after), 0.0, decayed)
+    return decayed
+  # pylint:enable=missing-docstring
+  return cosine_decay_fn
+
+
+def get_restart_decay_fn(decay_steps, num_periods=1, zero_after=None):
+  """Returns a function that computes a restart decay.
+
+  This decay computes
+    0.5 * (1.0 + cos(pi * (num_periods * global_step) % num_training_steps))
+
+  This is a simplified version of the restart decay introduced in
+  "SGDR: Stochastic Gradient Descent with Warm Restarts"
+  by Ilya Loshchilov & Frank Hutter, Proceedings of
+  ICLR'2017, available at https://arxiv.org/pdf/1608.03983.pdf
+
+  This decay can be used to decay the sign quantity in the AddSign and PowerSign
+  optimizers discovered in
+  [Bello et al., ICML 2017] Neural Optimizer Search with RL.
+
+  Example usage:
+  ```
+  decay_steps = 1000
+  num_periods = 2.0
+  restart_decay_fn = get_restart_decay_fn(decay_steps,
+                                          num_periods=num_periods)
+  decayed = restart_decay_fn(global_step)
+  x *= decayed
+  ```
+  Args:
+    decay_steps: number of steps to decay over.
+    num_periods: number of periods for cosine signal. 1 by default,
+      which maps the last decay step to 0.
+    zero_after: if not None, number after which the decay function
+      will return 0.
+  Returns:
+    restart_decay_fn: a function that computes the restart decay.
+  """
+  # pylint:disable=missing-docstring
+  def restart_decay_fn(global_step):
+    if global_step is None:
+      raise ValueError("global_step is required for cosine_decay.")
+    global_step = math_ops.minimum(global_step, decay_steps)
+    num = math_ops.mod(num_periods * math_ops.to_float(global_step),
+                       decay_steps)
+    fraction = num / math_ops.to_float(decay_steps)
+    decayed = 0.5 * (
+        1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
+    if zero_after is not None:
+      tmp = math_ops.to_float(
+          num_periods * global_step) / math_ops.to_float(decay_steps)
+      decayed = array_ops.where(
+          math_ops.greater_equal(tmp, zero_after), 0.0, decayed)
+    return decayed
+  # pylint:enable=missing-docstring
+  return restart_decay_fn
diff --git a/tensorflow/contrib/opt/python/training/sign_decay_test.py b/tensorflow/contrib/opt/python/training/sign_decay_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c31cb924eacfc8feea6bbd1f5c9ae903442b04b1
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/sign_decay_test.py
@@ -0,0 +1,110 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for sign_decay."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+from tensorflow.contrib.opt.python.training import sign_decay
+from tensorflow.python.platform import test
+
+
+def py_linear_decay_fn(decay_steps):
+
+  def linear_decay(step):
+    step = min(step, decay_steps)
+    return float(decay_steps - step) / decay_steps
+
+  return linear_decay
+
+
+def py_cosine_decay_fn(decay_steps, num_periods=0.5, zero_after=None):
+
+  def cosine_decay(step):
+    step = min(step, decay_steps)
+    fraction = 2.0 * num_periods * step / float(decay_steps)
+    if zero_after is not None and fraction >= 2 * zero_after:
+      return 0.0
+    return 0.5 * (1.0 + math.cos(math.pi * fraction))
+
+  return cosine_decay
+
+
+def py_restart_decay_fn(decay_steps, num_periods=1, zero_after=None):
+
+  def restart_decay(step):
+    step = min(step, decay_steps)
+    tmp = num_periods * step / float(decay_steps)
+    fraction = (
+        num_periods * step % decay_steps) / float(decay_steps)
+    if zero_after is not None and tmp >= zero_after:
+      return 0
+    return 0.5 * (1.0 + math.cos(math.pi * fraction))
+
+  return restart_decay
+
+
+class SignDecaysTest(test.TestCase):
+
+  def testLinearDecay(self):
+    num_training_steps = 1000
+    linear_decay_fn = sign_decay.get_linear_decay_fn(num_training_steps)
+
+    for step in range(0, 1000, 100):
+      with self.test_session():
+        tf_decayed = linear_decay_fn(step).eval()
+        py_decayed = py_linear_decay_fn(num_training_steps)(step)
+        self.assertAlmostEqual(tf_decayed, py_decayed, places=4)
+
+  def testCosineDecay(self):
+    num_training_steps = 1000
+    cosine_decay_fn = sign_decay.get_cosine_decay_fn(num_training_steps)
+    cosine_decay_2_fn = sign_decay.get_cosine_decay_fn(
+        num_training_steps, num_periods=5, zero_after=2)
+
+    for step in range(0, 1000, 100):
+      with self.test_session():
+        tf_decayed = cosine_decay_fn(step).eval()
+        py_decayed = py_cosine_decay_fn(num_training_steps)(step)
+        self.assertAlmostEqual(tf_decayed, py_decayed, places=4)
+
+        tf_decayed = cosine_decay_2_fn(step).eval()
+        py_decayed = py_cosine_decay_fn(
+            num_training_steps, num_periods=5, zero_after=2)(step)
+        self.assertAlmostEqual(tf_decayed, py_decayed, places=4)
+
+  def testRestartDecay(self):
+    num_training_steps = 1000
+    restart_decay_fn = sign_decay.get_restart_decay_fn(num_training_steps)
+    restart_decay_2_fn = sign_decay.get_restart_decay_fn(
+        num_training_steps, num_periods=5, zero_after=2)
+
+    for step in range(0, 1000, 100):
+      with self.test_session():
+        tf_decayed = restart_decay_fn(step).eval()
+        py_decayed = py_restart_decay_fn(num_training_steps)(step)
+        self.assertAlmostEqual(tf_decayed, py_decayed, places=4)
+
+        tf_decayed = restart_decay_2_fn(step).eval()
+        py_decayed = py_restart_decay_fn(
+            num_training_steps, num_periods=5, zero_after=2)(step)
+        self.assertAlmostEqual(tf_decayed, py_decayed, places=4)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/periodic_resample/BUILD b/tensorflow/contrib/periodic_resample/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..71582f9c9a01eb221666e2c71c4a2edb18e7cb98
--- /dev/null
+++ b/tensorflow/contrib/periodic_resample/BUILD
@@ -0,0 +1,113 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_gen_op_libs",
+    "tf_custom_op_library",
+    "tf_custom_op_py_library",
+    "tf_gen_op_wrapper_py",
+)
+
+cc_library(
+    name = "all_ops",
+    srcs = [":custom_op_sources"],
+    hdrs = [":custom_op_headers"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+    alwayslink = 1,
+)
+
+tf_custom_op_library(
+    name = "python/ops/_periodic_resample_op.so",
+    srcs = [
+        ":custom_op_headers",
+        ":custom_op_sources",
+    ],
+)
+
+tf_gen_op_libs(
+    op_lib_names = ["array_ops"],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_periodic_resample_op_py",
+    out = "python/ops/gen_periodic_resample_op.py",
+    deps = [":array_ops_op_lib"],
+)
+
+tf_custom_op_py_library(
+    name = "periodic_resample_op_py",
+    srcs = ["python/ops/periodic_resample_op.py"],
+    dso = ["python/ops/_periodic_resample_op.so"],
+    kernels = [
+        ":array_ops_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":gen_periodic_resample_op_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:framework_for_generated_wrappers",
+    ],
+)
+
+py_library(
+    name = "init_py",
+    srcs = [
+        "__init__.py",
+        "python/__init__.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":periodic_resample_op_py",
+    ],
+)
+
+# py_library(
+#     name = "periodic_resample_op_py",
+#     srcs = ["python/ops/periodic_resample_op.py"],
+#     data = ["python/ops/_periodic_resample_op.so"],
+#     srcs_version = "PY2AND3",
+# )
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+filegroup(
+    name = "custom_op_sources",
+    srcs = glob(
+        [
+            "ops/*.cc",
+            "kernels/*.cc",
+        ],
+        exclude = [
+            "ops/*_test.cc",
+            "kernels/*_test.cc",
+        ],
+    ),
+)
+
+filegroup(
+    name = "custom_op_headers",
+    srcs = glob(
+        [
+            "kernels/*.h",
+            "ops/*.h",
+        ],
+    ),
+)
diff --git a/tensorflow/contrib/periodic_resample/__init__.py b/tensorflow/contrib/periodic_resample/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fde9091b88f96da8f880ea341c8fd809b619c807
--- /dev/null
+++ b/tensorflow/contrib/periodic_resample/__init__.py
@@ -0,0 +1,27 @@
+# =============================================================================
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Custom op used by periodic_resample."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.periodic_resample.python.ops.periodic_resample_op import periodic_resample
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = ["periodic_resample"]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9cee405cef25f54fd064f8002265c42016c4fa50
--- /dev/null
+++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc
@@ -0,0 +1,26 @@
+// =============================================================================
+// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h"
+
+namespace tensorflow {
+
+REGISTER_KERNEL_BUILDER(Name("PeriodicResample")
+                            .Device(DEVICE_CPU),
+                        PeriodicResampleOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..bef21f7a5c8a27011f95eb7fae8451ca944d3cde
--- /dev/null
+++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h
@@ -0,0 +1,230 @@
+// =============================================================================
+// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#ifndef TENSORFLOW_KERNELS_PERIODICRESAMPLE_OP_H_
+#define TENSORFLOW_KERNELS_PERIODICRESAMPLE_OP_H_
+
+#include <cmath>
+#include <type_traits>
+#include <vector>
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace {
+
+template <class IndexVecT, class IndexT>
+IndexT compute_input_index(
+    IndexVecT* target_dimensions, const IndexT& output_index,
+    const IndexVecT& original_dimensions, const int& adjustable_dimension,
+    const std::vector<tensorflow::int64>& dimension_ceiling,
+    const std::vector<tensorflow::int64>& cumulative_dimensions, IndexT* result,
+    std::vector<IndexT>* output_indices, const int& rank) {
+  *result = 0;
+  output_indices->clear();
+
+  // un-rasterize the output index
+  auto last_reduced_i = output_index;
+  for (auto r = rank - 1; r >= 0; --r) {
+    (*output_indices)[r] = last_reduced_i % (*target_dimensions)[r];
+    last_reduced_i =
+        (last_reduced_i - (*output_indices)[r]) / (*target_dimensions)[r];
+  }
+
+  // rasterize the input index
+  IndexT last_index_factor = 1;
+  for (auto r = rank - 1; r >= 0; --r) {
+    IndexT index = 0;
+    if (r != adjustable_dimension)
+      index = (*output_indices)[r] / dimension_ceiling[r];
+    else {
+      for (int qi = 0; qi < rank; ++qi) {
+        if (qi == adjustable_dimension) continue;
+        index += cumulative_dimensions[qi] *
+                 ((*output_indices)[qi] % dimension_ceiling[qi]);
+      }
+      index *= (*target_dimensions)[adjustable_dimension];
+      index += (*output_indices)[r];
+    }
+    *result += last_index_factor * index;
+    last_index_factor *= original_dimensions[r];
+  }
+
+  return *result;
+}
+
+template <class InputDataT,
+          class IndexVecT>  // both types are needed here b/c IndexVecT and
+                            // InputDataT are not related
+                            void
+                            fill_periodic_tensor(
+                                tensorflow::OpKernelContext* context,
+                                const IndexVecT& desired_shape,
+                                const tensorflow::Tensor& input_tensor) {
+  // input is a strided array (last index is fastest, C-ordered)
+  auto input = input_tensor.flat<InputDataT>();
+  const int rank = input_tensor.dims();
+  // original and target dimensions
+  std::vector<tensorflow::int64> original_dimensions(rank),
+      target_dimensions(rank);
+  tensorflow::int64 total_size(input_tensor.NumElements()), new_sliced_size(1);
+  // factors by which original_dimensions increases/decreases w.r.t.
+  // target_dimensions
+  std::vector<tensorflow::int64> dimension_ceiling(rank),
+      cumulative_dimensions(rank);
+  // index of adjustable dimension
+  int adjustable_dimension;
+  tensorflow::TensorShape output_shape;
+
+  // requires that the rank of the input tensor and length of the desired shape
+  // are equal
+  OP_REQUIRES(context, rank == desired_shape.size(),
+              tensorflow::errors::InvalidArgument(
+                  "periodic_resample expects the rank of the input tensor, ",
+                  rank, ", to be the same as the length of the desired shape, ",
+                  desired_shape.size(), "."));
+
+  bool found = false;
+  for (int i = 0; i < rank; ++i) {
+    // if (desired_shape(i) < 1) {
+    if (desired_shape[i] < 1) {
+      // only one index can be adjustable
+      OP_REQUIRES(context, !found,
+                  tensorflow::errors::InvalidArgument(
+                      "periodic_resample expects only "
+                      "one index to be marked as adjustable."));
+      adjustable_dimension = i;
+      found = true;
+    } else {
+      // target_dimensions[i] = desired_shape(i);
+      target_dimensions[i] = desired_shape[i];
+      new_sliced_size *= target_dimensions[i];
+    }
+  }
+  // at least one index needs to be adjustable
+  OP_REQUIRES(context, found,
+              tensorflow::errors::InvalidArgument(
+                  "periodic_resample expects at least "
+                  "one index to be marked as adjustable."));
+
+  int count = 0;
+  for (const auto dim_info : input_tensor.shape()) {
+    original_dimensions[count] = dim_info.size;
+    ++count;
+  }
+
+  target_dimensions[adjustable_dimension] = total_size / new_sliced_size;
+
+  count = 0;
+  for (int i = 0; i < input_tensor.shape().dims(); ++i) {
+    dimension_ceiling[count] = tensorflow::int64(std::ceil(
+        float(target_dimensions[count]) / float(original_dimensions[count])));
+    if (count == 0)
+      cumulative_dimensions[count] = 1;
+    else
+      cumulative_dimensions[count] =
+          cumulative_dimensions[count - 1] * dimension_ceiling[count - 1];
+    ++count;
+  }
+
+  // ensure that the new dimension is greater than zero
+  OP_REQUIRES(context, target_dimensions[adjustable_dimension] > 0,
+              tensorflow::errors::InvalidArgument(
+                  "periodic_resample found that the "
+                  "adjustable dimension, ",
+                  adjustable_dimension, ", isn't greater than zero, ",
+                  target_dimensions[adjustable_dimension], "."));
+  for (int i = 0; i < rank; ++i) {
+    output_shape.AddDim(target_dimensions[i]);
+  }
+  const auto new_size =
+      new_sliced_size * target_dimensions[adjustable_dimension];
+
+  // Create an output tensor and attach it to the current context
+  tensorflow::Tensor* output_tensor = nullptr;
+  OP_REQUIRES_OK(context,
+                 context->allocate_output(0, output_shape, &output_tensor));
+  auto output = output_tensor->flat<InputDataT>();
+
+  // memory is allocated for these variables outside the inner loop for
+  // efficiency (although, I could create a separate class scope for
+  // this purpose instead)
+  tensorflow::int64 result = 0;
+  std::vector<tensorflow::int64> output_indices(target_dimensions.size());
+
+  // Fill output tensor with periodically resampled input tensor values
+  for (tensorflow::int64 output_index = 0; output_index < new_size;
+       ++output_index) {
+    output(output_index) = input(compute_input_index(
+        &target_dimensions, output_index, original_dimensions,
+        adjustable_dimension, dimension_ceiling, cumulative_dimensions, &result,
+        &output_indices, rank));
+  }
+}
+
+void create_output_tensor(
+    tensorflow::OpKernelContext* context,
+    const tensorflow::Tensor& input_tensor,
+    const tensorflow::DataType& input_tensor_type,
+    const tensorflow::PartialTensorShape& desired_shape_tensor) {
+  auto desired_shape = desired_shape_tensor.dim_sizes();
+
+  // obligatory type switch
+  switch (input_tensor_type) {
+    case tensorflow::DataTypeToEnum<float>::value:
+      fill_periodic_tensor<float>(context, desired_shape, input_tensor);
+      break;
+    case tensorflow::DataTypeToEnum<double>::value:
+      fill_periodic_tensor<double>(context, desired_shape, input_tensor);
+      break;
+    case tensorflow::DataTypeToEnum<tensorflow::int32>::value:
+      fill_periodic_tensor<tensorflow::int32>(context, desired_shape,
+                                              input_tensor);
+      break;
+    case tensorflow::DataTypeToEnum<tensorflow::int64>::value:
+      fill_periodic_tensor<tensorflow::int64>(context, desired_shape,
+                                              input_tensor);
+      break;
+    default:;
+  }
+}
+
+}  // namespace
+
+class PeriodicResampleOp : public tensorflow::OpKernel {
+ public:
+  explicit PeriodicResampleOp(tensorflow::OpKernelConstruction* context)
+      : tensorflow::OpKernel(context) {
+    // Get the desired shape
+    OP_REQUIRES_OK(context, context->GetAttr("shape", &desired_shape));
+  }
+
+  void Compute(tensorflow::OpKernelContext* context) override {
+    // Grab the input tensor
+    const tensorflow::Tensor& input_tensor = context->input(0);
+    const tensorflow::DataType input_tensor_type = context->input_dtype(0);
+
+    create_output_tensor(context, input_tensor, input_tensor_type,
+                         desired_shape);
+  }
+
+ private:
+  tensorflow::PartialTensorShape desired_shape;
+};
+
+#endif  // TENSORFLOW_KERNELS_PERIODICRESAMPLE_OP_H_
diff --git a/tensorflow/contrib/periodic_resample/ops/array_ops.cc b/tensorflow/contrib/periodic_resample/ops/array_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c90fc06c7fb9d79e8fd7a937e786a34947d8c1cb
--- /dev/null
+++ b/tensorflow/contrib/periodic_resample/ops/array_ops.cc
@@ -0,0 +1,90 @@
+// =============================================================================
+// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_OP("PeriodicResample")
+    .Attr("T: numbertype")
+    .Input("values: T")
+    .Attr("shape: shape")
+    .Output("output: T")
+    .SetShapeFn(shape_inference::ExplicitShape)
+    .Doc(R"doc(
+Periodically resample elements of a tensor to conform to `shape`.
+
+This function implements a slightly more generic version of the subpixel
+convolutions found in this [paper](https://arxiv.org/abs/1609.05158).
+
+The formula for computing the elements in the `output` tensor is as follows:
+  `T` = `values` tensor of rank `R`
+  `S` = desired `shape` of output tensor (vector of length `R`)
+  `P` = `output` tensor of rank `R`
+  \((T_1,\ldots,T_R)\) = shape(`T`)
+  \([S_1,\ldots,S_q,\ldots,S_R]\) = elements of vector `S`
+
+  A single element in `S` is left unspecified (denoted \(S_q=-1\)).
+  Let \(f_i\) denote the (possibly non-integer) factor that relates the original
+  dimension to the desired dimensions, \(S_i=f_i T_i\), for \(i\neq q\) where
+  \(f_i>0\).
+  Define the following:
+    \(g_i=\lceil f_i\rceil\)
+    \(t=\prod_i T_i\)
+    \(s=\prod_{i\neq q} S_i\)
+  \(S_q\) can then be defined as by \(S_q=\lfloor t/s\rfloor\).
+  The elements of the resulting tensor are defined as
+  \(P_{s_1,\ldots,s_R}=T_{h_1,\ldots,h_q,\ldots,h_R}\).
+  The \(h_i\) (\(i\neq q\)) are defined by \(h_i=\lfloor s_i/g_i\rfloor\).
+  \(h_q=S_q\sum_{j\neq q}^{q-1}G_j \mathrm{mod}(s_j,g_j) + s_q\), where
+  \(G_j=\prod_{i}^{j-1}g_i\) (\(G_0=1\)).
+
+One drawback of this method is that whenever the output dimensions are slightly
+less than integer multiples of the input dimensions, many of the tensor elements
+are repeated in an inefficient way. This is resolved by specifying that all
+desired dimensions are integer multiples of the input tensor.
+
+For example:
+
+```prettyprint
+`input` is [[ 0  1  2  3]
+            [ 4  5  6  7]
+            [ 8  9 10 11]]
+
+tf.periodic_resample(input, [6, None]) ==> [[ 0  1]
+                                            [ 2  3]
+                                            [ 4  5]
+                                            [ 6  7]
+                                            [ 8  9]
+                                            [10 11]]
+```
+
+values: The tensor of rank `R` to periodic_resample
+shape: A 1-D tensor representing the desired shape of the output tensor.
+  Exactly one element of this tensor must have the value `None` which represents
+  that this dimension of `values` can be adjusted downward in order to
+  accommodate increases in other dimensions. The specified sizes of the
+  non-adjustable dimensions must by at least as large as in the `values` tensor.
+output: Periodically resampled tensor that has dimensions specified as in
+  `shape` except that the dimension specified as `None` will be minimally
+  decreased as necessary.
+
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/periodic_resample/python/__init__.py b/tensorflow/contrib/periodic_resample/python/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8b6ead0f594ad23e73901254857313635fbd1c5
--- /dev/null
+++ b/tensorflow/contrib/periodic_resample/python/__init__.py
@@ -0,0 +1,20 @@
+# =============================================================================
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Public API of periodic_resample."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d727870f652f3606218928983ea18e990d0afe6
--- /dev/null
+++ b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py
@@ -0,0 +1,101 @@
+# =============================================================================
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy
+import tensorflow
+from tensorflow.contrib.periodic_resample import periodic_resample
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+
+
+class PeriodicResampleTest(test_util.TensorFlowTestCase):
+
+  def testPeriodicResampleBasic2D(self):
+
+    input_tensor = numpy.arange(12).reshape((3, 4))
+    desired_shape = numpy.array([6, None])
+    output_tensor = input_tensor.reshape((6, 2))
+
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      result = periodic_resample(input_tensor, desired_shape).eval()
+      self.assertAllEqual(result, output_tensor)
+
+  def testPeriodicResampleTruncatedBasic2D(self):
+
+    input_tensor = numpy.arange(12).reshape((3, 4))
+    desired_shape = numpy.array([5, None])
+    output_tensor = input_tensor.reshape((6, 2))[:-1]
+
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      result = periodic_resample(input_tensor, desired_shape).eval()
+      self.assertAllEqual(result, output_tensor)
+
+  def testPeriodicResampleBasic3D(self):
+
+    input_tensor = numpy.arange(2*2*4).reshape((2, 2, 4))
+    desired_shape = numpy.array([4, 4, None])
+    output_tensor = numpy.array([[[0], [2], [4], [6]],
+                                 [[1], [3], [5], [7]],
+                                 [[8], [10], [12], [14]],
+                                 [[9], [11], [13], [15]]])
+
+    # NOTE: output_tensor != input_tensor.reshape((4, 4, -1))
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      result = periodic_resample(input_tensor, desired_shape).eval()
+      # input_tensor[0, 0, 0] == result[0, 0, 0]
+      # input_tensor[0, 0, 1] == result[1, 0, 0]
+      # input_tensor[0, 0, 2] == result[0, 1, 0]
+      # input_tensor[0, 0, 3] == result[1, 1, 0]
+      self.assertAllEqual(result, output_tensor)
+
+  def testPeriodicResampleBasic4D(self):
+
+    input_tensor = numpy.arange(2*2*2*8).reshape((2, 2, 2, 8))
+    desired_shape = numpy.array([4, 4, 4, None])
+    output_tensor = numpy.array([[[[0], [4], [8], [12]],
+                                  [[2], [6], [10], [14]],
+                                  [[16], [20], [24], [28]],
+                                  [[18], [22], [26], [30]]],
+                                 [[[1], [5], [9], [13]],
+                                  [[3], [7], [11], [15]],
+                                  [[17], [21], [25], [29]],
+                                  [[19], [23], [27], [31]]],
+                                 [[[32], [36], [40], [44]],
+                                  [[34], [38], [42], [46]],
+                                  [[48], [52], [56], [60]],
+                                  [[50], [54], [58], [62]]],
+                                 [[[33], [37], [41], [45]],
+                                  [[35], [39], [43], [47]],
+                                  [[49], [53], [57], [61]],
+                                  [[51], [55], [59], [63]]]])
+
+    # NOTE: output_tensor != input_tensor.reshape((4, 4, 4, -1))
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      result = periodic_resample(input_tensor, desired_shape).eval()
+      self.assertAllEqual(result, output_tensor)
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py b/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a09f70f442131da7da2a4e98a238f21c3ccb6ec
--- /dev/null
+++ b/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py
@@ -0,0 +1,30 @@
+# =============================================================================
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+from tensorflow.contrib.periodic_resample.python.ops import gen_periodic_resample_op
+
+from tensorflow.contrib.periodic_resample.python.ops.gen_periodic_resample_op import periodic_resample
+
+from tensorflow.contrib.util import loader
+from tensorflow.python.platform import resource_loader
+
+_periodic_resample_op = loader.load_op_library(
+    resource_loader.get_path_to_datafile('_periodic_resample_op.so'))
diff --git a/tensorflow/contrib/pi_examples/README.md b/tensorflow/contrib/pi_examples/README.md
index f550228083712da4ddc725cd233c1eb7bbffeb25..177357bca64b51fe82360095d677cdddc11ec948 100644
--- a/tensorflow/contrib/pi_examples/README.md
+++ b/tensorflow/contrib/pi_examples/README.md
@@ -13,7 +13,7 @@ sudo apt-get install -y libjpeg-dev
 ```
 
  - To download the example model you'll need, run these commands:
- 
+
 ```bash
 curl https://storage.googleapis.com/download.tensorflow.org/models/inception_dec_2015_stripped.zip \
 -o /tmp/inception_dec_2015_stripped.zip
diff --git a/tensorflow/contrib/pi_examples/camera/Makefile b/tensorflow/contrib/pi_examples/camera/Makefile
index 578f1336f3282f647b18d1622b85905d53b3ebfa..b354c03b6e563c98347ad901bf07430d1fd17b49 100644
--- a/tensorflow/contrib/pi_examples/camera/Makefile
+++ b/tensorflow/contrib/pi_examples/camera/Makefile
@@ -76,7 +76,7 @@ $(EXECUTABLE_NAME): $(EXECUTABLE_OBJS) $(TFLIBS)
 	$(LIBFLAGS) $(LIB_PATH) $(LDFLAGS) $(LIBS)
 
 # Matches on C++ source files.
-$(OBJDIR)%.o: %.cc 
+$(OBJDIR)%.o: %.cc
 	@mkdir -p $(dir $@)
 	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
 
diff --git a/tensorflow/contrib/pi_examples/label_image/Makefile b/tensorflow/contrib/pi_examples/label_image/Makefile
index 19652e581d2403cf8e4dbd7b9e10b7c386959069..9d054a3133a44e8a612ecad1e95adffa09e4a352 100644
--- a/tensorflow/contrib/pi_examples/label_image/Makefile
+++ b/tensorflow/contrib/pi_examples/label_image/Makefile
@@ -75,7 +75,7 @@ $(EXECUTABLE_NAME): $(EXECUTABLE_OBJS) $(TFLIBS)
 	$(LIBFLAGS) $(LIB_PATH) $(LDFLAGS) $(LIBS)
 
 # Matches on C++ source files.
-$(OBJDIR)%.o: %.cc 
+$(OBJDIR)%.o: %.cc
 	@mkdir -p $(dir $@)
 	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
 
diff --git a/tensorflow/contrib/pi_examples/label_image/label_image.cc b/tensorflow/contrib/pi_examples/label_image/label_image.cc
index 7817cd0c6459aad88836503857301ec6334d486b..0b18045789f3a87ceb228033407d6b696bdb33f6 100644
--- a/tensorflow/contrib/pi_examples/label_image/label_image.cc
+++ b/tensorflow/contrib/pi_examples/label_image/label_image.cc
@@ -89,7 +89,7 @@ Status LoadJpegFile(string file_name, std::vector<tensorflow::uint8>* data,
   FILE * infile;
   JSAMPARRAY buffer;
   int row_stride;
-  
+
   if ((infile = fopen(file_name.c_str(), "rb")) == NULL) {
     LOG(ERROR) << "Can't open " << file_name;
     return tensorflow::errors::NotFound("JPEG file ", file_name,
@@ -105,7 +105,7 @@ Status LoadJpegFile(string file_name, std::vector<tensorflow::uint8>* data,
     fclose(infile);
     return tensorflow::errors::Unknown("JPEG decoding failed");
   }
-  
+
   jpeg_create_decompress(&cinfo);
   jpeg_stdio_src(&cinfo, infile);
   jpeg_read_header(&cinfo, TRUE);
@@ -119,14 +119,14 @@ Status LoadJpegFile(string file_name, std::vector<tensorflow::uint8>* data,
   buffer = (*cinfo.mem->alloc_sarray)
     ((j_common_ptr) &cinfo, JPOOL_IMAGE, row_stride, 1);
   while (cinfo.output_scanline < cinfo.output_height) {
-    tensorflow::uint8* row_address = &((*data)[cinfo.output_scanline * row_stride]); 
+    tensorflow::uint8* row_address = &((*data)[cinfo.output_scanline * row_stride]);
     jpeg_read_scanlines(&cinfo, buffer, 1);
     memcpy(row_address, buffer[0], row_stride);
   }
 
   jpeg_finish_decompress(&cinfo);
   jpeg_destroy_decompress(&cinfo);
-  fclose(infile);  
+  fclose(infile);
   return Status::OK();
 }
 
@@ -167,7 +167,7 @@ Status ReadTensorFromImageFile(string file_name, const int wanted_height,
     const int top_y_index = static_cast<int>(floorf(in_y));
     const int bottom_y_index =
       std::min(static_cast<int>(ceilf(in_y)), (image_height - 1));
-    const float y_lerp = in_y - top_y_index; 
+    const float y_lerp = in_y - top_y_index;
     tensorflow::uint8* in_top_row = in + (top_y_index * image_rowlen);
     tensorflow::uint8* in_bottom_row = in + (bottom_y_index * image_rowlen);
     float *out_row = out + (y * wanted_width * wanted_channels);
@@ -186,7 +186,7 @@ Status ReadTensorFromImageFile(string file_name, const int wanted_height,
 	in_bottom_row + (right_x_index * wanted_channels);
       const float x_lerp = in_x - left_x_index;
       float *out_pixel = out_row + (x * wanted_channels);
-      for (int c = 0; c < wanted_channels; ++c) {	
+      for (int c = 0; c < wanted_channels; ++c) {
 	const float top_left((in_top_left_pixel[c] - input_mean) / input_std);
 	const float top_right((in_top_right_pixel[c] - input_mean) / input_std);
 	const float bottom_left((in_bottom_left_pixel[c] - input_mean) / input_std);
@@ -198,7 +198,7 @@ Status ReadTensorFromImageFile(string file_name, const int wanted_height,
       }
     }
   }
-  
+
   out_tensors->push_back(image_tensor);
   return Status::OK();
 }
diff --git a/tensorflow/contrib/predictor/BUILD b/tensorflow/contrib/predictor/BUILD
index 82cd7b4c8aeb64cf461d9244c5aaf32a91691a5a..d7c3d6c3be5c38bc89e0f65549306c0e411fb97a 100644
--- a/tensorflow/contrib/predictor/BUILD
+++ b/tensorflow/contrib/predictor/BUILD
@@ -136,6 +136,17 @@ py_test(
     ],
 )
 
+py_test(
+    name = "predictor_factories_test",
+    srcs = ["predictor_factories_test.py"],
+    data = [":test_export_dir"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":predictor_factories",
+    ],
+)
+
 py_test(
     name = "core_estimator_predictor_test",
     srcs = ["core_estimator_predictor_test.py"],
diff --git a/tensorflow/contrib/predictor/predictor_factories.py b/tensorflow/contrib/predictor/predictor_factories.py
index e3f30d917d637d2e2d821a727e12b8d0b54942df..9485187c5d54737120f94c8e0ec0c4a57bc1ef62 100644
--- a/tensorflow/contrib/predictor/predictor_factories.py
+++ b/tensorflow/contrib/predictor/predictor_factories.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Factory functions for `Predictor`s."""
 
 from __future__ import absolute_import
@@ -59,9 +58,9 @@ def from_contrib_estimator(estimator,
   return contrib_estimator_predictor.ContribEstimatorPredictor(
       estimator,
       prediction_input_fn,
-      input_alternative_key,
-      output_alternative_key,
-      graph)
+      input_alternative_key=input_alternative_key,
+      output_alternative_key=output_alternative_key,
+      graph=graph)
 
 
 def from_estimator(estimator,
@@ -92,10 +91,7 @@ def from_estimator(estimator,
                     'tf.contrib.learn.Estimator. You likely want to call '
                     'from_contrib_estimator.')
   return core_estimator_predictor.CoreEstimatorPredictor(
-      estimator,
-      serving_input_receiver_fn,
-      output_key,
-      graph)
+      estimator, serving_input_receiver_fn, output_key=output_key, graph=graph)
 
 
 def from_saved_model(export_dir,
@@ -125,8 +121,9 @@ def from_saved_model(export_dir,
     ValueError: More than one of `signature_def_key` and `signature_def` is
       specified.
   """
-  return saved_model_predictor.SavedModelPredictor(export_dir,
-                                                   signature_def_key,
-                                                   signature_def,
-                                                   tags,
-                                                   graph)
+  return saved_model_predictor.SavedModelPredictor(
+      export_dir,
+      signature_def_key=signature_def_key,
+      signature_def=signature_def,
+      tags=tags,
+      graph=graph)
diff --git a/tensorflow/contrib/predictor/predictor_factories_test.py b/tensorflow/contrib/predictor/predictor_factories_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..60ffeec6536e785f8830b37715222491c3a8a6fc
--- /dev/null
+++ b/tensorflow/contrib/predictor/predictor_factories_test.py
@@ -0,0 +1,51 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for predictor.predictor_factories."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.predictor import predictor_factories
+from tensorflow.python.platform import test
+
+MODEL_DIR_NAME = 'contrib/predictor/test_export_dir'
+
+
+class PredictorFactoriesTest(test.TestCase):
+
+  @classmethod
+  def setUpClass(cls):
+    # Load a saved model exported from the arithmetic `Estimator`.
+    # See `testing_common.py`.
+    cls._export_dir = test.test_src_dir_path(MODEL_DIR_NAME)
+
+  def testFromSavedModel(self):
+    """Test loading from_saved_model."""
+    predictor_factories.from_saved_model(self._export_dir)
+
+  def testFromSavedModelWithTags(self):
+    """Test loading from_saved_model with tags."""
+    predictor_factories.from_saved_model(self._export_dir, tags='serve')
+
+  def testFromSavedModelWithBadTags(self):
+    """Test that loading fails for bad tags."""
+    bad_tags_regex = ('.*? could not be found in SavedModel')
+    with self.assertRaisesRegexp(RuntimeError, bad_tags_regex):
+      predictor_factories.from_saved_model(self._export_dir, tags='bad_tag')
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/README.md b/tensorflow/contrib/py2tf/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cd50675ad57316b9c749c137e6acd30b91c10073
--- /dev/null
+++ b/tensorflow/contrib/py2tf/README.md
@@ -0,0 +1,4 @@
+# Py2TF
+
+A compiler for generating TensorFlow numeric and control flow ops from Python
+code.
diff --git a/tensorflow/contrib/py2tf/__init__.py b/tensorflow/contrib/py2tf/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7fd8725e0a57737aa8294f7d1389060706697fe
--- /dev/null
+++ b/tensorflow/contrib/py2tf/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Py2TF compiles Python code into equivalent TensorFlow code.
+
+Equivalent here means that they have the same effect when executed.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+
+_allowed_symbols = []
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/quantize/README.md b/tensorflow/contrib/quantize/README.md
index 782232e85ff57076927ac724d9ceebb2280bddb9..40541729da5fd9d0ae75579e11f20999337de124 100644
--- a/tensorflow/contrib/quantize/README.md
+++ b/tensorflow/contrib/quantize/README.md
@@ -13,7 +13,7 @@ through estimator [2]. Note that during back propagation, the parameters are
 updated at high precision as this is needed to ensure sufficient precision in
 accumulating tiny adjustments to the parameters. However, for the forward pass,
 the parameters and activations are quantized to the desired lower precision.
- 
+
 ![drawing](g3doc/drawings/Fake_Quantization.jpg)
 
 ###Forward pass
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py
index 647d4044001f7be701037d07dc46db86c0aa3a0e..7df5fa8372c16d10d6462bd7e5ed524a8b18ad3d 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py
@@ -66,23 +66,26 @@ def _FoldFusedBatchNorms(graph):
     # `scope`.
     with graph.as_default(), graph.name_scope(scope + sep), ops.device(
         match.bn_op.device):
-      # new weights = old weights * gamma / sqrt(variance + epsilon)
-      # new biases = -mean * gamma / sqrt(variance + epsilon) + beta
-      multiplier_tensor = match.gamma_tensor * math_ops.rsqrt(
-          match.variance_tensor + match.bn_op.get_attr('epsilon'))
-      bias_tensor = math_ops.subtract(
-          match.beta_tensor, match.mean_tensor * multiplier_tensor, name='bias')
-
-      # The shape of depthwise weights is different, so we need to reshape the
-      # multiplier_tensor to ensure that the scaled_weight_tensor has the
-      # expected shape.
-      if match.layer_op.type == 'DepthwiseConv2dNative':
-        new_shape = [
-            match.weight_tensor.get_shape().as_list()[2],
-            match.weight_tensor.get_shape().as_list()[3]
-        ]
-        multiplier_tensor = array_ops.reshape(
-            multiplier_tensor, new_shape, name='scale_reshape')
+      with graph.name_scope(scope + sep + 'BatchNorm_Fold' + sep):
+        # new weights = old weights * gamma / sqrt(variance + epsilon)
+        # new biases = -mean * gamma / sqrt(variance + epsilon) + beta
+        multiplier_tensor = match.gamma_tensor * math_ops.rsqrt(
+            match.variance_tensor + match.bn_op.get_attr('epsilon'))
+        bias_tensor = math_ops.subtract(
+            match.beta_tensor,
+            match.mean_tensor * multiplier_tensor,
+            name='bias')
+
+        # The shape of depthwise weights is different, so we need to reshape the
+        # multiplier_tensor to ensure that the scaled_weight_tensor has the
+        # expected shape.
+        if match.layer_op.type == 'DepthwiseConv2dNative':
+          new_shape = [
+              match.weight_tensor.get_shape().as_list()[2],
+              match.weight_tensor.get_shape().as_list()[3]
+          ]
+          multiplier_tensor = array_ops.reshape(
+              multiplier_tensor, new_shape, name='scale_reshape')
 
       # TODO(suharshs): This naming of the following ops needs to carefully
       # follow the naming expected by quantize.py. Generalize the quantize code
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
index 2cecf6851467f82675bd67bf1fb108e9a39df1b0..4dc5994885530b631bb5ef73d5e464cd26b8c526 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
@@ -284,16 +284,20 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
 
     folded_mul = g.get_operation_by_name(scope + '/mul_fold')
     self.assertEqual(folded_mul.type, 'Mul')
+    if fused_batch_norm:
+      scale_reshape_op_name = scope + '/BatchNorm_Fold/scale_reshape'
+    else:
+      scale_reshape_op_name = scope + '/scale_reshape'
     self._AssertInputOpsAre(folded_mul,
                             [scope + '/depthwise_weights/read',
-                             scope + '/scale_reshape'])
+                             scale_reshape_op_name])
     self._AssertOutputGoesToOps(folded_mul, g, [scope + '/depthwise_Fold'])
 
-    scale_reshape = g.get_operation_by_name(scope + '/scale_reshape')
+    scale_reshape = g.get_operation_by_name(scale_reshape_op_name)
     self.assertEqual(scale_reshape.type, 'Reshape')
     self._AssertInputOpsAre(scale_reshape, [
         self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm),
-        scope + '/scale_reshape/shape'
+        scale_reshape_op_name + '/shape'
     ])
     self._AssertOutputGoesToOps(scale_reshape, g, [scope + '/mul_fold'])
 
@@ -326,13 +330,13 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
   def _BatchNormMultiplierName(self, scope, has_scaling, fused):
     if has_scaling:
       if fused:
-        return scope + '/mul'
+        return scope + '/BatchNorm_Fold/mul'
       return scope + '/BatchNorm/batchnorm/mul'
     return scope + '/BatchNorm/batchnorm/Rsqrt'
 
   def _BathNormBiasName(self, scope, fused):
     if fused:
-      return scope + '/bias'
+      return scope + '/BatchNorm_Fold/bias'
     return scope + '/BatchNorm/batchnorm/sub'
 
   def _WeightInit(self, stddev):
diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py
index 7db2d863aa4b16ddcb630603c0a960ccb81f3c71..50a2b4c91c9e7a2681f6041646a023a4225fb0c5 100644
--- a/tensorflow/contrib/quantize/python/quantize.py
+++ b/tensorflow/contrib/quantize/python/quantize.py
@@ -164,7 +164,10 @@ class _QuantizeContext(object):
 
   def QuantizeAddContexts(self):
     """Quantizes all add ops in self.add_contexts."""
-    for add_context in self.add_contexts:
+    # Loop through sorted self.add_contexts so that op creation is
+    # deterministic. This is needed when using multiple worker replicas so that
+    # the ops can be initialized consistently.
+    for add_context in sorted(self.add_contexts):
       add_op = self.GetOperationByNamesDontThrow([
           add_context + '/Add', add_context + '/add'])
       if add_op is not None:
diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops.cc b/tensorflow/contrib/resampler/kernels/resampler_ops.cc
index 7d9ef14cefc578e9401d95db9a625428cc0e2605..e02c1b6a2bd9daf9e1f81059f7c1f92106cebc8f 100644
--- a/tensorflow/contrib/resampler/kernels/resampler_ops.cc
+++ b/tensorflow/contrib/resampler/kernels/resampler_ops.cc
@@ -406,10 +406,10 @@ class ResamplerGradOp : public ::tensorflow::OpKernel {
                                    data_channels);
     OP_REQUIRES(ctx, grad_output_shape == resampler_output_shape,
                 ::tensorflow::errors::InvalidArgument(
-                   "grad_output shape is not consistent with data and warp "
-                   "shapes; it should be ",
-                   resampler_output_shape.DebugString(), " but is ",
-                   grad_output_shape.DebugString()))
+                    "grad_output shape is not consistent with data and warp "
+                    "shapes; it should be ",
+                    resampler_output_shape.DebugString(), " but is ",
+                    grad_output_shape.DebugString()));
     const int num_sampling_points = warp.NumElements() / batch_size / 2;
     ::tensorflow::Tensor* grad_data = nullptr;
     ::tensorflow::Tensor* grad_warp = nullptr;
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
index 16b6d145e3fd3e4e5bb34481cc61eb5706cf1772..63155faf1e48d05f3abb9c17612b8646ccad612d 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.contrib import rnn as contrib_rnn
 from tensorflow.contrib.rnn.python.ops import core_rnn_cell
+from tensorflow.contrib.rnn.python.ops import rnn_cell as contrib_rnn_cell
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -42,7 +43,6 @@ from tensorflow.python.framework import test_util
 from tensorflow.contrib.rnn.python.ops import rnn_cell as contrib_rnn_cell
 
 
-
 # pylint: enable=protected-access
 Linear = core_rnn_cell._Linear  # pylint: disable=invalid-name
 
@@ -374,19 +374,20 @@ class RNNCellTest(test.TestCase):
         h = array_ops.zeros([batch_size, num_proj])
         state = rnn_cell_impl.LSTMStateTuple(c, h)
         cell = contrib_rnn_cell.LayerNormLSTMCell(
-          num_units=num_units,
-          num_proj=num_proj,
-          forget_bias=1.0,
-          layer_norm=True,
-          norm_gain=1.0,
-          norm_shift=0.0)
+            num_units=num_units,
+            num_proj=num_proj,
+            forget_bias=1.0,
+            layer_norm=True,
+            norm_gain=1.0,
+            norm_shift=0.0)
         g, out_m = cell(x, state)
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g, out_m], {
-          x.name: np.ones((batch_size, input_size)),
-          c.name: 0.1 * np.ones((batch_size, num_units)),
-          h.name: 0.1 * np.ones((batch_size, num_proj))
-        })
+        res = sess.run(
+            [g, out_m], {
+                x.name: np.ones((batch_size, input_size)),
+                c.name: 0.1 * np.ones((batch_size, num_units)),
+                h.name: 0.1 * np.ones((batch_size, num_proj))
+            })
         self.assertEqual(len(res), 2)
         # The numbers in results were not calculated, this is mostly just a
         # smoke test.
@@ -396,9 +397,9 @@ class RNNCellTest(test.TestCase):
         # Different inputs so different outputs and states
         for i in range(1, batch_size):
           self.assertTrue(
-            float(np.linalg.norm((res[0][0, :] - res[0][i, :]))) < 1e-6)
+              float(np.linalg.norm((res[0][0, :] - res[0][i, :]))) < 1e-6)
           self.assertTrue(
-            float(np.linalg.norm((res[1][0, :] - res[1][i, :]))) < 1e-6)
+              float(np.linalg.norm((res[1][0, :] - res[1][i, :]))) < 1e-6)
 
   def testOutputProjectionWrapper(self):
     with self.test_session() as sess:
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
index 9cea2ec79a982e4fb362ec564eb72b3894917842..0258d7202df20a536ae4240a532249b6b5e7e641 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
@@ -2175,9 +2175,9 @@ class DeviceWrapperCell(rnn_cell.RNNCell):
   def __call__(self, input_, state, scope=None):
     if self._device is not None:
       with ops_lib.device(self._device):
-        return self._cell(input_, state, scope)
+        return self._cell(input_, state, scope=scope)
     else:
-      return self._cell(input_, state, scope)
+      return self._cell(input_, state, scope=scope)
 
 
 class TensorArrayOnCorrectDeviceTest(test.TestCase):
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py b/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py
index a288072ae5da0751f1999128029f38bea933490e..7957edf68cc8a1461fccfc2de93ad5250dc9fdb5 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py
@@ -49,6 +49,7 @@ def blocks_match(sess, use_peephole):
     inp = ops.convert_to_tensor(
         np.random.randn(batch_size, input_size), dtype=dtypes.float32)
     inputs.append(inp)
+  stacked_inputs = array_ops.stack(inputs)
 
   initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=19890212)
 
@@ -72,23 +73,6 @@ def blocks_match(sess, use_peephole):
         dtype=dtypes.float32,
         initializer=init_ops.zeros_initializer())
 
-    if use_peephole:
-      wci_block = variable_scope.get_variable(
-          "rnn/lstm_cell/lstm_block_wrapper/w_i_diag",
-          initializer=wci.initialized_value())
-      wcf_block = variable_scope.get_variable(
-          "rnn/lstm_cell/lstm_block_wrapper/w_f_diag",
-          initializer=wcf.initialized_value())
-      wco_block = variable_scope.get_variable(
-          "rnn/lstm_cell/lstm_block_wrapper/w_o_diag",
-          initializer=wco.initialized_value())
-    w_block = variable_scope.get_variable(
-        "rnn/lstm_cell/lstm_block_wrapper/kernel",
-        initializer=w.initialized_value())
-    b_block = variable_scope.get_variable(
-        "rnn/lstm_cell/lstm_block_wrapper/bias",
-        initializer=b.initialized_value())
-
     basic_cell = rnn_cell.LSTMCell(
         cell_size, use_peepholes=use_peephole, state_is_tuple=True, reuse=True)
     basic_outputs_op, basic_state_op = rnn.static_rnn(
@@ -113,11 +97,11 @@ def blocks_match(sess, use_peephole):
           b,
           cell_clip=0)
 
-    with variable_scope.variable_scope("rnn/lstm_cell", reuse=True):
-      fused_cell = lstm_ops.LSTMBlockFusedCell(
-          cell_size, cell_clip=0, use_peephole=use_peephole)
-      fused_outputs_op, fused_state_op = fused_cell(
-          inputs, dtype=dtypes.float32)
+    fused_cell = lstm_ops.LSTMBlockFusedCell(
+        cell_size, cell_clip=0, use_peephole=use_peephole, reuse=True,
+        name="rnn/lstm_cell")
+    fused_outputs_op, fused_state_op = fused_cell(
+        stacked_inputs, dtype=dtypes.float32)
 
     sess.run([variables.global_variables_initializer()])
     basic_outputs, basic_state = sess.run([basic_outputs_op, basic_state_op[0]])
@@ -131,9 +115,9 @@ def blocks_match(sess, use_peephole):
     block_grads = sess.run(gradients_impl.gradients(block_outputs_op, inputs))
     block_wgrads = sess.run(gradients_impl.gradients(block_outputs_op, xs))
 
-    xs = [w_block, b_block]
+    xs = [w, b]
     if use_peephole:
-      xs += [wci_block, wcf_block, wco_block]
+      xs += [wci, wcf, wco]
     fused_outputs, fused_state = sess.run([fused_outputs_op, fused_state_op[0]])
     fused_grads = sess.run(gradients_impl.gradients(fused_outputs_op, inputs))
     fused_wgrads = sess.run(gradients_impl.gradients(fused_outputs_op, xs))
@@ -216,7 +200,7 @@ class LSTMBlockCellTest(test.TestCase):
     with self.test_session(use_gpu=True, graph=ops.Graph()):
       cell = lstm_ops.LSTMBlockFusedCell(10)
       pcell = lstm_ops.LSTMBlockFusedCell(10, use_peephole=True)
-      inputs = [array_ops.zeros([4, 5])] * 6
+      inputs = array_ops.stack([array_ops.zeros([4, 5])] * 6)
       cell(inputs, dtype=dtypes.float32, scope="basic/lstm_cell")
       pcell(inputs, dtype=dtypes.float32, scope="peephole/lstm_cell")
       fused_names = {
@@ -380,13 +364,13 @@ class LSTMBlockCellTest(test.TestCase):
             np.random.randn(batch_size, input_size), dtype=dtypes.float32)
         inputs.append(inp)
       seq_lengths = constant_op.constant([3, 4, 5])
+      cell_inputs = array_ops.stack(inputs)
 
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=19890213)
 
-      with variable_scope.variable_scope(
-          "lstm_block_wrapper", initializer=initializer):
-        # magic naming so that the cells pick up these variables and resuse them
+      with variable_scope.variable_scope("lstm_cell", initializer=initializer):
+        # magic naming so that the cells pick up these variables and reuse them
         variable_scope.get_variable(
             "kernel",
             shape=[input_size + cell_size, cell_size * 4],
@@ -398,13 +382,12 @@ class LSTMBlockCellTest(test.TestCase):
             dtype=dtypes.float32,
             initializer=init_ops.zeros_initializer())
 
-      with variable_scope.variable_scope(
-          variable_scope.get_variable_scope(), reuse=True):
-        cell = lstm_ops.LSTMBlockFusedCell(
-            cell_size, cell_clip=0, use_peephole=False)
+      cell = lstm_ops.LSTMBlockFusedCell(
+          cell_size, cell_clip=0, use_peephole=False, reuse=True,
+          name="lstm_cell")
 
-        fused_outputs_op, fused_state_op = cell(
-            inputs, dtype=dtypes.float32, sequence_length=seq_lengths)
+      fused_outputs_op, fused_state_op = cell(
+          cell_inputs, dtype=dtypes.float32, sequence_length=seq_lengths)
 
       cell_vars = [
           v for v in variables.trainable_variables()
@@ -420,7 +403,7 @@ class LSTMBlockCellTest(test.TestCase):
         for i, inp in enumerate(inputs):
           lengths = [int(i < l) for l in seq_lengths.eval()]
           output, state = cell(
-              [inp],
+              array_ops.expand_dims(inp, 0),
               initial_state=state,
               dtype=dtypes.float32,
               sequence_length=lengths)
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index b4a5f2d7ebaaa7fd916fb7129db7e2bdbee19706..46823fa3643c5b4a3d857fa38d1a70792d97ca40 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -996,26 +996,19 @@ class RNNCellTest(test.TestCase):
         output, state = cell(x, hidden)
 
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([output, state], {
-            hidden[0].name:
-                np.array([[[[[1.],[1.]], 
-                            [[1.],[1.]]],
-                           [[[1.],[1.]],
-                            [[1.],[1.]]]], 
-                          [[[[2.],[2.]],
-                            [[2.],[2.]]],
-                           [[[2.],[2.]],
-                            [[2.],[2.]]]]]),
-            x.name:
-                np.array([[[[[1.],[1.]],
-                            [[1.],[1.]]],
-                           [[[1.],[1.]],
-                            [[1.],[1.]]]],
-                          [[[[2.],[2.]],
-                            [[2.],[2.]]],
-                           [[[2.],[2.]],
-                           [[2.],[2.]]]]])
-        })
+        res = sess.run(
+            [output, state], {
+                hidden[0].name:
+                    np.array([[[[[1.], [1.]], [[1.], [1.]]], [[[1.], [1.]], [[
+                        1.
+                    ], [1.]]]], [[[[2.], [2.]], [[2.], [2.]]],
+                                 [[[2.], [2.]], [[2.], [2.]]]]]),
+                x.name:
+                    np.array([[[[[1.], [1.]], [[1.], [1.]]], [[[1.], [1.]], [[
+                        1.
+                    ], [1.]]]], [[[[2.], [2.]], [[2.], [2.]]], [[[2.], [2.]],
+                                                                [[2.], [2.]]]]])
+            })
         # This is a smoke test, making sure expected values are unchanged.
         self.assertEqual(len(res), 2)
         self.assertAllClose(res[0], res[1].h)
@@ -1276,10 +1269,8 @@ class LayerNormBasicLSTMCellTest(test.TestCase):
         self.assertAllClose(res[2].c, expected_c1, 1e-5)
         self.assertAllClose(res[2].h, expected_h1, 1e-5)
 
-
   def testBasicLSTMCellWithStateTupleLayerNorm(self):
-    """The results of LSTMCell and LayerNormBasicLSTMCell 
-    should be same. """
+    """The results of LSTMCell and LayerNormBasicLSTMCell should be the same."""
     with self.test_session() as sess:
       with variable_scope.variable_scope(
           "root", initializer=init_ops.constant_initializer(0.5)):
@@ -1290,21 +1281,21 @@ class LayerNormBasicLSTMCellTest(test.TestCase):
         c1 = array_ops.zeros([1, 2])
         h1 = array_ops.zeros([1, 2])
         state1 = rnn_cell_impl.LSTMStateTuple(c1, h1)
-        cell = rnn_cell_impl.MultiRNNCell(
-          [contrib_rnn_cell.LayerNormLSTMCell(
-              2,
-              layer_norm=True,
-              norm_gain=1.0,
-              norm_shift=0.0) for _ in range(2)])
+        cell = rnn_cell_impl.MultiRNNCell([
+            contrib_rnn_cell.LayerNormLSTMCell(
+                2, layer_norm=True, norm_gain=1.0, norm_shift=0.0)
+            for _ in range(2)
+        ])
         h, (s0, s1) = cell(x, (state0, state1))
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([h, s0, s1], {
-          x.name: np.array([[1., 1.]]),
-          c0.name: 0.1 * np.asarray([[0, 1]]),
-          h0.name: 0.1 * np.asarray([[2, 3]]),
-          c1.name: 0.1 * np.asarray([[4, 5]]),
-          h1.name: 0.1 * np.asarray([[6, 7]]),
-        })
+        res = sess.run(
+            [h, s0, s1], {
+                x.name: np.array([[1., 1.]]),
+                c0.name: 0.1 * np.asarray([[0, 1]]),
+                h0.name: 0.1 * np.asarray([[2, 3]]),
+                c1.name: 0.1 * np.asarray([[4, 5]]),
+                h1.name: 0.1 * np.asarray([[6, 7]]),
+            })
 
         expected_h = np.array([[-0.38079708, 0.38079708]])
         expected_h0 = np.array([[-0.38079708, 0.38079708]])
diff --git a/tensorflow/contrib/rnn/python/ops/gru_ops.py b/tensorflow/contrib/rnn/python/ops/gru_ops.py
index 75536e3f5f8cbe44231f19d4d455537e654f7a08..4c964ec201f153d6c8293d3bf93bc231ff8f751d 100644
--- a/tensorflow/contrib/rnn/python/ops/gru_ops.py
+++ b/tensorflow/contrib/rnn/python/ops/gru_ops.py
@@ -20,18 +20,20 @@ from __future__ import print_function
 from tensorflow.contrib.rnn.ops import gen_gru_ops
 from tensorflow.contrib.util import loader
 from tensorflow.python.framework import ops
+from tensorflow.python.layers import base as base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import rnn_cell_impl
-from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.util.deprecation import deprecated_args
 
 _gru_ops_so = loader.load_op_library(
     resource_loader.get_path_to_datafile("_gru_ops.so"))
 
+LayerRNNCell = rnn_cell_impl._LayerRNNCell  # pylint: disable=invalid-name,protected-access
+
 
 @ops.RegisterGradient("GRUBlockCell")
 def _GRUBlockCellGrad(op, *grad):
@@ -95,7 +97,7 @@ def _GRUBlockCellGrad(op, *grad):
   return d_x, d_h_prev, d_w_ru, d_w_c, d_b_ru, d_b_c
 
 
-class GRUBlockCell(rnn_cell_impl.RNNCell):
+class GRUBlockCell(LayerRNNCell):
   r"""Block GRU cell implementation.
 
   Deprecated: use GRUBlockCellV2 instead.
@@ -132,22 +134,37 @@ class GRUBlockCell(rnn_cell_impl.RNNCell):
 
   @deprecated_args(None, "cell_size is deprecated, use num_units instead",
                    "cell_size")
-  def __init__(self, num_units=None, cell_size=None):
+  def __init__(self,
+               num_units=None,
+               cell_size=None,
+               reuse=None,
+               name="gru_cell"):
     """Initialize the Block GRU cell.
 
     Args:
       num_units: int, The number of units in the GRU cell.
       cell_size: int, The old (deprecated) name for `num_units`.
+      reuse: (optional) boolean describing whether to reuse variables in an
+        existing scope.  If not `True`, and the existing scope already has the
+        given variables, an error is raised.
+      name: String, the name of the layer. Layers with the same name will
+        share weights, but to avoid mistakes we require reuse=True in such
+        cases.  By default this is "lstm_cell", for variable-name compatibility
+        with `tf.nn.rnn_cell.GRUCell`.
 
     Raises:
       ValueError: if both cell_size and num_units are not None;
         or both are None.
     """
+    super(GRUBlockCell, self).__init__(_reuse=reuse, name=name)
     if (cell_size is None) == (num_units is None):
-      raise ValueError("Exactly one of num_units or cell_size must be provided.")
+      raise ValueError(
+          "Exactly one of num_units or cell_size must be provided.")
     if num_units is None:
       num_units = cell_size
     self._cell_size = num_units
+    # Inputs must be 2-dimensional.
+    self.input_spec = base_layer.InputSpec(ndim=2)
 
   @property
   def state_size(self):
@@ -157,40 +174,43 @@ class GRUBlockCell(rnn_cell_impl.RNNCell):
   def output_size(self):
     return self._cell_size
 
-  def __call__(self, x, h_prev, scope=None):
+  def build(self, input_shape):
+    # Check if the input size exist.
+    input_size = input_shape[1].value
+    if input_size is None:
+      raise ValueError("Expecting input_size to be set.")
+
+    self._gate_kernel = self.add_variable(
+        "w_ru", [input_size + self._cell_size, self._cell_size * 2])
+    self._gate_bias = self.add_variable(
+        "b_ru", [self._cell_size * 2],
+        initializer=init_ops.constant_initializer(1.0))
+    self._candidate_kernel = self.add_variable(
+        "w_c", [input_size + self._cell_size, self._cell_size])
+    self._candidate_bias = self.add_variable(
+        "b_c", [self._cell_size],
+        initializer=init_ops.constant_initializer(0.0))
+
+    self.built = True
+
+  def call(self, inputs, h_prev):
     """GRU cell."""
-    with vs.variable_scope(scope or type(self).__name__):
-      input_size = x.get_shape().with_rank(2)[1]
-
-      # Check if the input size exist.
-      if input_size is None:
-        raise ValueError("Expecting input_size to be set.")
-
-      # Check cell_size == state_size from h_prev.
-      cell_size = h_prev.get_shape().with_rank(2)[1]
-      if cell_size != self._cell_size:
-        raise ValueError("Shape of h_prev[1] incorrect: cell_size %i vs %s" %
-                         (self._cell_size, cell_size))
-
-      if cell_size is None:
-        raise ValueError("cell_size from `h_prev` should not be None.")
-
-      w_ru = vs.get_variable("w_ru", [input_size + self._cell_size,
-                                      self._cell_size * 2])
-      b_ru = vs.get_variable(
-          "b_ru", [self._cell_size * 2],
-          initializer=init_ops.constant_initializer(1.0))
-      w_c = vs.get_variable("w_c",
-                            [input_size + self._cell_size, self._cell_size])
-      b_c = vs.get_variable(
-          "b_c", [self._cell_size],
-          initializer=init_ops.constant_initializer(0.0))
+    # Check cell_size == state_size from h_prev.
+    cell_size = h_prev.get_shape().with_rank(2)[1]
+    if cell_size != self._cell_size:
+      raise ValueError("Shape of h_prev[1] incorrect: cell_size %i vs %s" %
+                       (self._cell_size, cell_size))
 
-      _gru_block_cell = gen_gru_ops.gru_block_cell  # pylint: disable=invalid-name
-      _, _, _, new_h = _gru_block_cell(
-          x=x, h_prev=h_prev, w_ru=w_ru, w_c=w_c, b_ru=b_ru, b_c=b_c)
+    _gru_block_cell = gen_gru_ops.gru_block_cell  # pylint: disable=invalid-name
+    _, _, _, new_h = _gru_block_cell(
+        x=inputs,
+        h_prev=h_prev,
+        w_ru=self._gate_kernel,
+        w_c=self._candidate_kernel,
+        b_ru=self._gate_bias,
+        b_c=self._candidate_bias)
 
-      return new_h, new_h
+    return new_h, new_h
 
 
 class GRUBlockCellV2(GRUBlockCell):
@@ -199,39 +219,21 @@ class GRUBlockCellV2(GRUBlockCell):
   Only differs from GRUBlockCell by variable names.
   """
 
-  def __call__(self, x, h_prev, scope=None):
+  def build(self, input_shape):
     """GRU cell."""
-    with vs.variable_scope(scope or type(self).__name__):
-      input_size = x.get_shape().with_rank(2)[1]
-
-      # Check if the input size exist.
-      if input_size is None:
-        raise ValueError("Expecting input_size to be set.")
-
-      # Check cell_size == state_size from h_prev.
-      cell_size = h_prev.get_shape().with_rank(2)[1]
-      if cell_size != self._cell_size:
-        raise ValueError("Shape of h_prev[1] incorrect: cell_size %i vs %s" %
-                         (self._cell_size, cell_size))
-
-      if cell_size is None:
-        raise ValueError("cell_size from `h_prev` should not be None.")
-
-      with vs.variable_scope("gates"):
-        w_ru = vs.get_variable("kernel", [input_size + self._cell_size,
-                                          self._cell_size * 2])
-        b_ru = vs.get_variable(
-            "bias", [self._cell_size * 2],
-            initializer=init_ops.constant_initializer(1.0))
-      with vs.variable_scope("candidate"):
-        w_c = vs.get_variable("kernel",
-                              [input_size + self._cell_size, self._cell_size])
-        b_c = vs.get_variable(
-            "bias", [self._cell_size],
-            initializer=init_ops.constant_initializer(0.0))
-
-      _gru_block_cell = gen_gru_ops.gru_block_cell  # pylint: disable=invalid-name
-      _, _, _, new_h = _gru_block_cell(
-          x=x, h_prev=h_prev, w_ru=w_ru, w_c=w_c, b_ru=b_ru, b_c=b_c)
-
-      return new_h, new_h
+    input_size = input_shape[1].value
+    if input_size is None:
+      raise ValueError("Expecting input_size to be set.")
+
+    self._gate_kernel = self.add_variable(
+        "gates/kernel", [input_size + self._cell_size, self._cell_size * 2])
+    self._gate_bias = self.add_variable(
+        "gates/bias", [self._cell_size * 2],
+        initializer=init_ops.constant_initializer(1.0))
+    self._candidate_kernel = self.add_variable(
+        "candidate/kernel", [input_size + self._cell_size, self._cell_size])
+    self._candidate_bias = self.add_variable(
+        "candidate/bias", [self._cell_size],
+        initializer=init_ops.constant_initializer(0.0))
+
+    self.built = True
diff --git a/tensorflow/contrib/rnn/python/ops/lstm_ops.py b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
index df910a3423083972bdee42bec10733e37b8e5f96..04f342cd18271425068b2b02c2937236c900c5e2 100644
--- a/tensorflow/contrib/rnn/python/ops/lstm_ops.py
+++ b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
@@ -20,21 +20,22 @@ from __future__ import print_function
 import abc
 
 from tensorflow.contrib.rnn.ops import gen_lstm_ops
-from tensorflow.contrib.rnn.python.ops import fused_rnn_cell
 from tensorflow.contrib.util import loader
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.layers import base as base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import rnn_cell_impl
-from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import resource_loader
 
 _lstm_ops_so = loader.load_op_library(
     resource_loader.get_path_to_datafile("_lstm_ops.so"))
 
+LayerRNNCell = rnn_cell_impl._LayerRNNCell  # pylint: disable=invalid-name,protected-access
+
 
 # pylint: disable=invalid-name
 def _lstm_block_cell(x,
@@ -327,7 +328,7 @@ def _BlockLSTMGrad(op, *grad):
   ]
 
 
-class LSTMBlockCell(rnn_cell_impl.RNNCell):
+class LSTMBlockCell(LayerRNNCell):
   """Basic LSTM recurrent network cell.
 
   The implementation is based on: http://arxiv.org/abs/1409.2329.
@@ -345,7 +346,8 @@ class LSTMBlockCell(rnn_cell_impl.RNNCell):
                forget_bias=1.0,
                cell_clip=None,
                use_peephole=False,
-               reuse=None):
+               reuse=None,
+               name="lstm_cell"):
     """Initialize the basic LSTM cell.
 
     Args:
@@ -356,11 +358,15 @@ class LSTMBlockCell(rnn_cell_impl.RNNCell):
       reuse: (optional) boolean describing whether to reuse variables in an
         existing scope.  If not `True`, and the existing scope already has the
         given variables, an error is raised.
+      name: String, the name of the layer. Layers with the same name will
+        share weights, but to avoid mistakes we require reuse=True in such
+        cases.  By default this is "lstm_cell", for variable-name compatibility
+        with `tf.nn.rnn_cell.LSTMCell`.
 
       When restoring from CudnnLSTM-trained checkpoints, must use
       CudnnCompatibleLSTMBlockCell instead.
     """
-    super(LSTMBlockCell, self).__init__(_reuse=reuse)
+    super(LSTMBlockCell, self).__init__(_reuse=reuse, name=name)
     self._num_units = num_units
     self._forget_bias = forget_bias
     self._use_peephole = use_peephole
@@ -373,6 +379,8 @@ class LSTMBlockCell(rnn_cell_impl.RNNCell):
         "wco": "w_o_diag",
         "scope": "lstm_cell"
     }
+    # Inputs must be 2-dimensional.
+    self.input_spec = base_layer.InputSpec(ndim=2)
 
   @property
   def state_size(self):
@@ -382,45 +390,54 @@ class LSTMBlockCell(rnn_cell_impl.RNNCell):
   def output_size(self):
     return self._num_units
 
-  def __call__(self, x, states_prev, scope=None):
+  def build(self, inputs_shape):
+    if not inputs_shape[1].value:
+      raise ValueError(
+          "Expecting inputs_shape[1] to be set: %s" % str(inputs_shape))
+    input_size = inputs_shape[1].value
+    self._kernel = self.add_variable(
+        self._names["W"], [input_size + self._num_units, self._num_units * 4])
+    self._bias = self.add_variable(
+        self._names["b"], [self._num_units * 4],
+        initializer=init_ops.constant_initializer(0.0))
+    if self._use_peephole:
+      self._w_i_diag = self.add_variable(self._names["wci"], [self._num_units])
+      self._w_f_diag = self.add_variable(self._names["wcf"], [self._num_units])
+      self._w_o_diag = self.add_variable(self._names["wco"], [self._num_units])
+
+    self.built = True
+
+  def call(self, inputs, state):
     """Long short-term memory cell (LSTM)."""
-    with vs.variable_scope(scope or self._names["scope"]):
-      x_shape = x.get_shape().with_rank(2)
-      if not x_shape[1].value:
-        raise ValueError("Expecting x_shape[1] to be set: %s" % str(x_shape))
-      if len(states_prev) != 2:
-        raise ValueError("Expecting states_prev to be a tuple with length 2.")
-      input_size = x_shape[1].value
-      w = vs.get_variable(self._names["W"], [input_size + self._num_units,
-                                             self._num_units * 4])
-      b = vs.get_variable(
-          self._names["b"], [w.get_shape().with_rank(2)[1].value],
-          initializer=init_ops.constant_initializer(0.0))
-      if self._use_peephole:
-        wci = vs.get_variable(self._names["wci"], [self._num_units])
-        wcf = vs.get_variable(self._names["wcf"], [self._num_units])
-        wco = vs.get_variable(self._names["wco"], [self._num_units])
-      else:
-        wci = wcf = wco = array_ops.zeros([self._num_units])
-      (cs_prev, h_prev) = states_prev
-      (_, cs, _, _, _, _, h) = _lstm_block_cell(
-          x,
-          cs_prev,
-          h_prev,
-          w,
-          b,
-          wci=wci,
-          wcf=wcf,
-          wco=wco,
-          forget_bias=self._forget_bias,
-          cell_clip=self._cell_clip,
-          use_peephole=self._use_peephole)
-
-      new_state = rnn_cell_impl.LSTMStateTuple(cs, h)
-      return h, new_state
-
-
-class LSTMBlockWrapper(fused_rnn_cell.FusedRNNCell):
+    if len(state) != 2:
+      raise ValueError("Expecting state to be a tuple with length 2.")
+
+    if self._use_peephole:
+      wci = self._w_i_diag
+      wcf = self._w_f_diag
+      wco = self._w_o_diag
+    else:
+      wci = wcf = wco = array_ops.zeros([self._num_units])
+
+    (cs_prev, h_prev) = state
+    (_, cs, _, _, _, _, h) = _lstm_block_cell(
+        inputs,
+        cs_prev,
+        h_prev,
+        self._kernel,
+        self._bias,
+        wci=wci,
+        wcf=wcf,
+        wco=wco,
+        forget_bias=self._forget_bias,
+        cell_clip=self._cell_clip,
+        use_peephole=self._use_peephole)
+
+    new_state = rnn_cell_impl.LSTMStateTuple(cs, h)
+    return h, new_state
+
+
+class LSTMBlockWrapper(base_layer.Layer):
   """This is a helper class that provides housekeeping for LSTM cells.
 
   This may be useful for alternative LSTM and similar type of cells.
@@ -459,12 +476,7 @@ class LSTMBlockWrapper(fused_rnn_cell.FusedRNNCell):
     """
     pass
 
-  def __call__(self,
-               inputs,
-               initial_state=None,
-               dtype=None,
-               sequence_length=None,
-               scope=None):
+  def call(self, inputs, initial_state=None, dtype=None, sequence_length=None):
     """Run this LSTM on inputs, starting from the given state.
 
     Args:
@@ -480,7 +492,6 @@ class LSTMBlockWrapper(fused_rnn_cell.FusedRNNCell):
         `int32` or `int64` vector (tensor) size `[batch_size]`, values in `[0,
         time_len).`
         Defaults to `time_len` for each element.
-      scope: `VariableScope` for the created subgraph; defaults to class name.
 
     Returns:
       A pair containing:
@@ -493,75 +504,71 @@ class LSTMBlockWrapper(fused_rnn_cell.FusedRNNCell):
     Raises:
       ValueError: in case of shape mismatches
     """
-    with vs.variable_scope(scope or "lstm_block_wrapper"):
-      is_list = isinstance(inputs, list)
-      if is_list:
-        inputs = array_ops.stack(inputs)
-      inputs_shape = inputs.get_shape().with_rank(3)
-      if not inputs_shape[2]:
-        raise ValueError("Expecting inputs_shape[2] to be set: %s" %
-                         inputs_shape)
-      batch_size = inputs_shape[1].value
-      if batch_size is None:
-        batch_size = array_ops.shape(inputs)[1]
-      time_len = inputs_shape[0].value
-      if time_len is None:
-        time_len = array_ops.shape(inputs)[0]
-
-      # Provide default values for initial_state and dtype
-      if initial_state is None:
-        if dtype is None:
-          raise ValueError(
-              "Either initial_state or dtype needs to be specified")
-        z = array_ops.zeros(
-            array_ops.stack([batch_size, self.num_units]), dtype=dtype)
-        initial_state = z, z
-      else:
-        if len(initial_state) != 2:
-          raise ValueError(
-              "Expecting initial_state to be a tuple with length 2 or None")
-        if dtype is None:
-          dtype = initial_state[0].dtype
-
-      # create the actual cell
-      if sequence_length is not None:
-        sequence_length = ops.convert_to_tensor(sequence_length)
-      initial_cell_state, initial_output = initial_state  # pylint: disable=unpacking-non-sequence
-      cell_states, outputs = self._call_cell(inputs, initial_cell_state,
-                                             initial_output, dtype,
-                                             sequence_length)
-
-      if sequence_length is not None:
-        # Mask out the part beyond sequence_length
-        mask = array_ops.transpose(
-            array_ops.sequence_mask(
-                sequence_length, time_len, dtype=dtype), [1, 0])
-        mask = array_ops.tile(
-            array_ops.expand_dims(mask, [-1]), [1, 1, self.num_units])
-        outputs *= mask
-        # Prepend initial states to cell_states and outputs for indexing to work
-        # correctly,since we want to access the last valid state at
-        # sequence_length - 1, which can even be -1, corresponding to the
-        # initial state.
-        mod_cell_states = array_ops.concat(
-            [array_ops.expand_dims(initial_cell_state, [0]), cell_states], 0)
-        mod_outputs = array_ops.concat(
-            [array_ops.expand_dims(initial_output, [0]), outputs], 0)
-        final_cell_state = self._gather_states(mod_cell_states, sequence_length,
-                                               batch_size)
-        final_output = self._gather_states(mod_outputs, sequence_length,
-                                           batch_size)
-      else:
-        # No sequence_lengths used: final state is the last state
-        final_cell_state = cell_states[-1]
-        final_output = outputs[-1]
-
-      if is_list:
-        # Input was a list, so return a list
-        outputs = array_ops.unstack(outputs)
-
-      final_state = rnn_cell_impl.LSTMStateTuple(final_cell_state, final_output)
-      return outputs, final_state
+    is_list = isinstance(inputs, list)
+    if is_list:
+      inputs = array_ops.stack(inputs)
+    inputs_shape = inputs.get_shape().with_rank(3)
+    if not inputs_shape[2]:
+      raise ValueError("Expecting inputs_shape[2] to be set: %s" % inputs_shape)
+    batch_size = inputs_shape[1].value
+    if batch_size is None:
+      batch_size = array_ops.shape(inputs)[1]
+    time_len = inputs_shape[0].value
+    if time_len is None:
+      time_len = array_ops.shape(inputs)[0]
+
+    # Provide default values for initial_state and dtype
+    if initial_state is None:
+      if dtype is None:
+        raise ValueError("Either initial_state or dtype needs to be specified")
+      z = array_ops.zeros(
+          array_ops.stack([batch_size, self.num_units]), dtype=dtype)
+      initial_state = z, z
+    else:
+      if len(initial_state) != 2:
+        raise ValueError(
+            "Expecting initial_state to be a tuple with length 2 or None")
+      if dtype is None:
+        dtype = initial_state[0].dtype
+
+    # create the actual cell
+    if sequence_length is not None:
+      sequence_length = ops.convert_to_tensor(sequence_length)
+    initial_cell_state, initial_output = initial_state  # pylint: disable=unpacking-non-sequence
+    cell_states, outputs = self._call_cell(
+        inputs, initial_cell_state, initial_output, dtype, sequence_length)
+
+    if sequence_length is not None:
+      # Mask out the part beyond sequence_length
+      mask = array_ops.transpose(
+          array_ops.sequence_mask(sequence_length, time_len, dtype=dtype),
+          [1, 0])
+      mask = array_ops.tile(
+          array_ops.expand_dims(mask, [-1]), [1, 1, self.num_units])
+      outputs *= mask
+      # Prepend initial states to cell_states and outputs for indexing to work
+      # correctly,since we want to access the last valid state at
+      # sequence_length - 1, which can even be -1, corresponding to the
+      # initial state.
+      mod_cell_states = array_ops.concat(
+          [array_ops.expand_dims(initial_cell_state, [0]), cell_states], 0)
+      mod_outputs = array_ops.concat(
+          [array_ops.expand_dims(initial_output, [0]), outputs], 0)
+      final_cell_state = self._gather_states(mod_cell_states, sequence_length,
+                                             batch_size)
+      final_output = self._gather_states(mod_outputs, sequence_length,
+                                         batch_size)
+    else:
+      # No sequence_lengths used: final state is the last state
+      final_cell_state = cell_states[-1]
+      final_output = outputs[-1]
+
+    if is_list:
+      # Input was a list, so return a list
+      outputs = array_ops.unstack(outputs)
+
+    final_state = rnn_cell_impl.LSTMStateTuple(final_cell_state, final_output)
+    return outputs, final_state
 
   def _gather_states(self, data, indices, batch_size):
     """Produce `out`, s.t. out(i, j) = data(indices(i), i, j)."""
@@ -589,7 +596,9 @@ class LSTMBlockFusedCell(LSTMBlockWrapper):
                num_units,
                forget_bias=1.0,
                cell_clip=None,
-               use_peephole=False):
+               use_peephole=False,
+               reuse=None,
+               name="lstm_fused_cell"):
     """Initialize the LSTM cell.
 
     Args:
@@ -597,19 +606,48 @@ class LSTMBlockFusedCell(LSTMBlockWrapper):
       forget_bias: float, The bias added to forget gates (see above).
       cell_clip: clip the cell to this value. Default is no cell clipping.
       use_peephole: Whether to use peephole connections or not.
+      reuse: (optional) boolean describing whether to reuse variables in an
+        existing scope.  If not `True`, and the existing scope already has the
+        given variables, an error is raised.
+      name: String, the name of the layer. Layers with the same name will
+        share weights, but to avoid mistakes we require reuse=True in such
+        cases.  By default this is "lstm_cell", for variable-name compatibility
+        with `tf.nn.rnn_cell.LSTMCell`.
     """
+    super(LSTMBlockFusedCell, self).__init__(_reuse=reuse, name=name)
     self._num_units = num_units
     self._forget_bias = forget_bias
     self._cell_clip = cell_clip if cell_clip is not None else -1
     self._use_peephole = use_peephole
 
+    # Inputs must be 3-dimensional.
+    self.input_spec = base_layer.InputSpec(ndim=3)
+
   @property
   def num_units(self):
     """Number of units in this cell (output dimension)."""
     return self._num_units
 
-  def _call_cell(self, inputs, initial_cell_state, initial_output, dtype,
-                 sequence_length):
+  def build(self, input_shape):
+    input_size = input_shape[2].value
+    self._kernel = self.add_variable(
+        "kernel", [input_size + self._num_units, self._num_units * 4])
+    self._bias = self.add_variable(
+        "bias", [self._num_units * 4],
+        initializer=init_ops.constant_initializer(0.0))
+    if self._use_peephole:
+      self._w_i_diag = self.add_variable("w_i_diag", [self._num_units])
+      self._w_f_diag = self.add_variable("w_f_diag", [self._num_units])
+      self._w_o_diag = self.add_variable("w_o_diag", [self._num_units])
+
+    self.built = True
+
+  def _call_cell(self,
+                 inputs,
+                 initial_cell_state=None,
+                 initial_output=None,
+                 dtype=None,
+                 sequence_length=None):
     """Run this LSTM on inputs, starting from the given state.
 
     Args:
@@ -636,18 +674,11 @@ class LSTMBlockFusedCell(LSTMBlockWrapper):
     time_len = inputs_shape[0].value
     if time_len is None:
       time_len = array_ops.shape(inputs)[0]
-    input_size = inputs_shape[2].value
-    w = vs.get_variable(
-        "kernel",
-        [input_size + self._num_units, self._num_units * 4], dtype=dtype)
-    b = vs.get_variable(
-        "bias", [w.get_shape().with_rank(2)[1]],
-        initializer=init_ops.constant_initializer(0.0),
-        dtype=dtype)
+
     if self._use_peephole:
-      wci = vs.get_variable("w_i_diag", [self._num_units], dtype=dtype)
-      wcf = vs.get_variable("w_f_diag", [self._num_units], dtype=dtype)
-      wco = vs.get_variable("w_o_diag", [self._num_units], dtype=dtype)
+      wci = self._w_i_diag
+      wco = self._w_o_diag
+      wcf = self._w_f_diag
     else:
       wci = wcf = wco = array_ops.zeros([self._num_units], dtype=dtype)
 
@@ -661,11 +692,11 @@ class LSTMBlockFusedCell(LSTMBlockWrapper):
         x=inputs,
         cs_prev=initial_cell_state,
         h_prev=initial_output,
-        w=w,
+        w=self._kernel,
         wci=wci,
         wcf=wcf,
         wco=wco,
-        b=b,
+        b=self._bias,
         forget_bias=self._forget_bias,
         cell_clip=self._cell_clip,
         use_peephole=self._use_peephole)
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 5e85c125df8ca0d632fa9b0db86d942bb354631e..c6b131604359f358d155f7870559ec8f0bcfa862 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -115,7 +115,6 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
 
   The class uses optional peep-hole connections, and an optional projection
   layer.
-  
   Layer normalization implementation is based on:
 
     https://arxiv.org/abs/1607.06450.
@@ -124,15 +123,24 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
   Jimmy Lei Ba, Jamie Ryan Kiros, Geoffrey E. Hinton
 
   and is applied before the internal nonlinearities.
-  
+
   """
 
-  def __init__(self, num_units, use_peepholes=False,
-               initializer=None, num_proj=None, proj_clip=None,
-               num_unit_shards=1, num_proj_shards=1,
-               forget_bias=1.0, state_is_tuple=True,
-               activation=math_ops.tanh, reuse=None,
-               layer_norm=False, norm_gain=1.0, norm_shift=0.0):
+  def __init__(self,
+               num_units,
+               use_peepholes=False,
+               initializer=None,
+               num_proj=None,
+               proj_clip=None,
+               num_unit_shards=1,
+               num_proj_shards=1,
+               forget_bias=1.0,
+               state_is_tuple=True,
+               activation=math_ops.tanh,
+               reuse=None,
+               layer_norm=False,
+               norm_gain=1.0,
+               norm_shift=0.0):
     """Initialize the parameters for an LSTM cell.
 
     Args:
@@ -164,8 +172,6 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
         `layer_norm` has been set to `False`, this argument will be ignored.
       norm_shift: float, The layer normalization shift initial value. If
         `layer_norm` has been set to `False`, this argument will be ignored.
-        
-        
     """
     super(CoupledInputForgetGateLSTMCell, self).__init__(_reuse=reuse)
     if not state_is_tuple:
@@ -1816,7 +1822,7 @@ class CompiledWrapper(rnn_cell_impl.RNNCell):
         return not _REGISTERED_OPS[node_def.op].is_stateful
 
     with jit.experimental_jit_scope(compile_ops=compile_ops):
-      return self._cell(inputs, state, scope)
+      return self._cell(inputs, state, scope=scope)
 
 
 def _random_exp_initializer(minval,
@@ -2049,8 +2055,8 @@ class ConvLSTMCell(rnn_cell_impl.RNNCell):
     if self._skip_connection:
       self._total_output_channels += self._input_shape[-1]
 
-    state_size = tensor_shape.TensorShape(self._input_shape[:-1] 
-                                          + [self._output_channels])
+    state_size = tensor_shape.TensorShape(
+        self._input_shape[:-1] + [self._output_channels])
     self._state_size = rnn_cell_impl.LSTMStateTuple(state_size, state_size)
     self._output_size = tensor_shape.TensorShape(self._input_shape[:-1]
                                                  + [self._total_output_channels])
@@ -2110,14 +2116,11 @@ class Conv3DLSTMCell(ConvLSTMCell):
     """Construct Conv3DLSTM. See `ConvLSTMCell` for more details."""
     super(Conv3DLSTMCell, self).__init__(conv_ndims=3, **kwargs)
 
-def _conv(args, 
-          filter_size,
-          num_features,
-          bias,
-          bias_start=0.0):
+
+def _conv(args, filter_size, num_features, bias, bias_start=0.0):
   """convolution:
   Args:
-    args: a Tensor or a list of Tensors of dimension 3D, 4D or 5D, 
+    args: a Tensor or a list of Tensors of dimension 3D, 4D or 5D,
     batch x n, Tensors.
     filter_size: int tuple of filter height and width.
     num_features: int, number of features.
@@ -2211,7 +2214,7 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
         has the given variables, an error is raised.
 
     Raises:
-      ValueError: If `num_units` or `num_proj` is not divisible by 
+      ValueError: If `num_units` or `num_proj` is not divisible by
         `number_of_groups`.
     """
     super(GLSTMCell, self).__init__(_reuse=reuse)
@@ -2391,12 +2394,19 @@ class LayerNormLSTMCell(rnn_cell_impl.RNNCell):
 
   """
 
-  def __init__(self, num_units,
-               use_peepholes=False, cell_clip=None,
-               initializer=None, num_proj=None, proj_clip=None,
+  def __init__(self,
+               num_units,
+               use_peepholes=False,
+               cell_clip=None,
+               initializer=None,
+               num_proj=None,
+               proj_clip=None,
                forget_bias=1.0,
-               activation=None, layer_norm=False,
-               norm_gain=1.0, norm_shift=0.0, reuse=None):
+               activation=None,
+               layer_norm=False,
+               norm_gain=1.0,
+               norm_shift=0.0,
+               reuse=None):
     """Initialize the parameters for an LSTM cell.
 
     Args:
@@ -2457,7 +2467,6 @@ class LayerNormLSTMCell(rnn_cell_impl.RNNCell):
   def output_size(self):
     return self._output_size
 
-
   def _linear(self,
               args,
               output_size,
@@ -2507,9 +2516,9 @@ class LayerNormLSTMCell(rnn_cell_impl.RNNCell):
     scope = vs.get_variable_scope()
     with vs.variable_scope(scope) as outer_scope:
       weights = vs.get_variable(
-        "kernel", [total_arg_size, output_size],
-        dtype=dtype,
-        initializer=kernel_initializer)
+          "kernel", [total_arg_size, output_size],
+          dtype=dtype,
+          initializer=kernel_initializer)
       if len(args) == 1:
         res = math_ops.matmul(args[0], weights)
       else:
@@ -2521,9 +2530,7 @@ class LayerNormLSTMCell(rnn_cell_impl.RNNCell):
         if bias_initializer is None:
           bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype)
         biases = vs.get_variable(
-          "bias", [output_size],
-          dtype=dtype,
-          initializer=bias_initializer)
+            "bias", [output_size], dtype=dtype, initializer=bias_initializer)
 
     if not layer_norm:
       res = nn_ops.bias_add(res, biases)
@@ -2554,7 +2561,6 @@ class LayerNormLSTMCell(rnn_cell_impl.RNNCell):
       ValueError: If input size cannot be inferred from inputs via
         static shape inference.
     """
-    num_proj = self._num_units if self._num_proj is None else self._num_proj
     sigmoid = math_ops.sigmoid
 
     (c_prev, m_prev) = state
@@ -2567,10 +2573,14 @@ class LayerNormLSTMCell(rnn_cell_impl.RNNCell):
     with vs.variable_scope(scope, initializer=self._initializer) as unit_scope:
 
       # i = input_gate, j = new_input, f = forget_gate, o = output_gate
-      lstm_matrix = self._linear([inputs, m_prev], 4 * self._num_units, bias=True,
-                            bias_initializer=None, layer_norm=self._layer_norm)
+      lstm_matrix = self._linear(
+          [inputs, m_prev],
+          4 * self._num_units,
+          bias=True,
+          bias_initializer=None,
+          layer_norm=self._layer_norm)
       i, j, f, o = array_ops.split(
-        value=lstm_matrix, num_or_size_splits=4, axis=1)
+          value=lstm_matrix, num_or_size_splits=4, axis=1)
 
       if self._layer_norm:
         i = _norm(self._norm_gain, self._norm_shift, i, "input")
@@ -2580,20 +2590,22 @@ class LayerNormLSTMCell(rnn_cell_impl.RNNCell):
 
       # Diagonal connections
       if self._use_peepholes:
-        with vs.variable_scope(unit_scope) as projection_scope:
+        with vs.variable_scope(unit_scope):
           w_f_diag = vs.get_variable(
-            "w_f_diag", shape=[self._num_units], dtype=dtype)
+              "w_f_diag", shape=[self._num_units], dtype=dtype)
           w_i_diag = vs.get_variable(
-            "w_i_diag", shape=[self._num_units], dtype=dtype)
+              "w_i_diag", shape=[self._num_units], dtype=dtype)
           w_o_diag = vs.get_variable(
-            "w_o_diag", shape=[self._num_units], dtype=dtype)
+              "w_o_diag", shape=[self._num_units], dtype=dtype)
 
       if self._use_peepholes:
-        c = (sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
-             sigmoid(i + w_i_diag * c_prev) * self._activation(j))
+        c = (
+            sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
+            sigmoid(i + w_i_diag * c_prev) * self._activation(j))
       else:
-        c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) *
-             self._activation(j))
+        c = (
+            sigmoid(f + self._forget_bias) * c_prev +
+            sigmoid(i) * self._activation(j))
 
       if self._layer_norm:
         c = _norm(self._norm_gain, self._norm_shift, c, "state")
@@ -2608,7 +2620,7 @@ class LayerNormLSTMCell(rnn_cell_impl.RNNCell):
         m = sigmoid(o) * self._activation(c)
 
       if self._num_proj is not None:
-        with vs.variable_scope("projection") as proj_scope:
+        with vs.variable_scope("projection"):
           m = self._linear(m, self._num_proj, bias=False)
 
         if self._proj_clip is not None:
diff --git a/tensorflow/contrib/saved_model/python/saved_model/reader_test.py b/tensorflow/contrib/saved_model/python/saved_model/reader_test.py
index a8331cbc8f04f74294675d7ceb57412e1f0b6170..d10ec9cf0cad56930ed1e101bf60cea6cad9d7a4 100644
--- a/tensorflow/contrib/saved_model/python/saved_model/reader_test.py
+++ b/tensorflow/contrib/saved_model/python/saved_model/reader_test.py
@@ -86,6 +86,13 @@ class ReaderTest(test.TestCase):
       self._init_and_validate_variable(sess, "v", 44)
       builder.add_meta_graph([tag_constants.SERVING, tag_constants.GPU])
 
+    # Graph that updates the single variable. SavedModel is invoked:
+    # - to add the model (weights are not updated).
+    # - multiple predefined tags for serving on TPU.
+    with self.test_session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 44)
+      builder.add_meta_graph([tag_constants.SERVING, tag_constants.TPU])
+
     # Graph that updates the single variable. SavedModel is invoked:
     # - to add the model (weights are not updated).
     # - multiple custom tags.
@@ -97,7 +104,8 @@ class ReaderTest(test.TestCase):
     builder.save()
 
     actual_tags = reader.get_saved_model_tag_sets(saved_model_dir)
-    expected_tags = [["train"], ["serve"], ["serve", "gpu"], ["foo", "bar"]]
+    expected_tags = [["train"], ["serve"], ["serve", "gpu"], ["serve", "tpu"],
+                     ["foo", "bar"]]
     self.assertEqual(expected_tags, actual_tags)
 
 
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index 01a5540121ae9ebf22de0493daadff6c7710d29a..e5d591788fa6350bb59458d12e9bb01c910ba37d 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -254,6 +254,8 @@ class AttentionWrapperTest(test.TestCase):
         time=3,
         alignments=ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.125),
         alignment_history=())
     expected_final_alignment_history = ResultSummary(
         shape=(3, 5, 8), dtype=dtype('float32'), mean=0.12500001)
@@ -286,6 +288,8 @@ class AttentionWrapperTest(test.TestCase):
         time=3,
         alignments=ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.125),
         alignment_history=())
 
     self._testWithAttention(
@@ -313,6 +317,8 @@ class AttentionWrapperTest(test.TestCase):
         time=3,
         alignments=ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.125),
         alignment_history=())
 
     self._testWithAttention(
@@ -342,6 +348,8 @@ class AttentionWrapperTest(test.TestCase):
         time=3,
         alignments=ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.125),
         alignment_history=())
 
     self._testWithAttention(
@@ -370,6 +378,8 @@ class AttentionWrapperTest(test.TestCase):
         time=3,
         alignments=ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.125),
         alignment_history=())
 
     self._testWithAttention(
@@ -545,6 +555,8 @@ class AttentionWrapperTest(test.TestCase):
         time=3,
         alignments=ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.032228071),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.032228071),
         alignment_history=())
     expected_final_alignment_history = ResultSummary(
         shape=(3, 5, 8), dtype=dtype('float32'), mean=0.050430927)
@@ -578,6 +590,8 @@ class AttentionWrapperTest(test.TestCase):
         time=3,
         alignments=ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.028698336),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.028698336),
         alignment_history=())
     expected_final_alignment_history = ResultSummary(
         shape=(3, 5, 8), dtype=dtype('float32'), mean=0.046009291)
@@ -599,7 +613,8 @@ class AttentionWrapperTest(test.TestCase):
           random_ops.random_normal((b, t, u)),
           mode='hard')
       # Just feed previous attention as [1, 0, 0, ...]
-      attn = a(random_ops.random_normal((b, d)), array_ops.one_hot([0]*b, t))
+      attn, unused_state = a(
+          random_ops.random_normal((b, d)), array_ops.one_hot([0]*b, t))
       sess.run(variables.global_variables_initializer())
       attn_out = attn.eval()
       # All values should be 0 or 1
@@ -629,6 +644,8 @@ class AttentionWrapperTest(test.TestCase):
         time=3,
         alignments=ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.032198936),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.032198936),
         alignment_history=())
     expected_final_alignment_history = ResultSummary(
         shape=(3, 5, 8), dtype=dtype('float32'), mean=0.050387777)
@@ -663,6 +680,8 @@ class AttentionWrapperTest(test.TestCase):
         time=3,
         alignments=ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.032198936),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.032198936),
         alignment_history=())
     expected_final_alignment_history = ResultSummary(
         shape=(3, 5, 8), dtype=dtype('float32'), mean=0.050387777)
@@ -697,6 +716,9 @@ class AttentionWrapperTest(test.TestCase):
         alignments=(
             ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),
             ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)),
+        attention_state=(
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)),
         alignment_history=())
 
     expected_final_alignment_history = (
@@ -723,7 +745,8 @@ class AttentionWrapperTest(test.TestCase):
           random_ops.random_normal((b, t, u)),
           mode='hard')
       # Just feed previous attention as [1, 0, 0, ...]
-      attn = a(random_ops.random_normal((b, d)), array_ops.one_hot([0]*b, t))
+      attn, unused_state = a(
+          random_ops.random_normal((b, d)), array_ops.one_hot([0]*b, t))
       sess.run(variables.global_variables_initializer())
       attn_out = attn.eval()
       # All values should be 0 or 1
@@ -753,6 +776,9 @@ class AttentionWrapperTest(test.TestCase):
         alignments=(
             ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),
             ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)),
+        attention_state=(
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)),
         alignment_history=())
     expected_final_alignment_history = (
         ResultSummary(shape=(3, 5, 8), dtype=dtype('float32'), mean=0.125),
@@ -787,6 +813,8 @@ class AttentionWrapperTest(test.TestCase):
         time=3,
         alignments=(
             ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),),
+        attention_state=(
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),),
         alignment_history=())
 
     expected_final_alignment_history = (
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index c3b180d9f49e6a7379741809bd6087fdab4c7093..36bfc5685d51350acb86c270626cce5cec9647f8 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -61,7 +61,14 @@ _zero_state_tensors = rnn_cell_impl._zero_state_tensors  # pylint: disable=prote
 
 
 class AttentionMechanism(object):
-  pass
+
+  @property
+  def alignments_size(self):
+    raise NotImplementedError
+
+  @property
+  def state_size(self):
+    raise NotImplementedError
 
 
 def _prepare_memory(memory, memory_sequence_length, check_inner_dims_defined):
@@ -161,7 +168,7 @@ class _BaseAttentionMechanism(AttentionMechanism):
         tensor should be shaped `[batch_size, max_time, ...]`.
       probability_fn: A `callable`.  Converts the score and previous alignments
         to probabilities. Its signature should be:
-        `probabilities = probability_fn(score, previous_alignments)`.
+        `probabilities = probability_fn(score, state)`.
       memory_sequence_length (optional): Sequence lengths for the batch entries
         in memory.  If provided, the memory tensor rows are masked with zeros
         for values past the respective sequence lengths.
@@ -192,7 +199,8 @@ class _BaseAttentionMechanism(AttentionMechanism):
       raise TypeError("probability_fn must be callable, saw type: %s" %
                       type(probability_fn).__name__)
     if score_mask_value is None:
-      score_mask_value = dtypes.as_dtype(self._memory_layer.dtype).as_numpy_dtype(-np.inf)
+      score_mask_value = dtypes.as_dtype(
+          self._memory_layer.dtype).as_numpy_dtype(-np.inf)
     self._probability_fn = lambda score, prev: (  # pylint:disable=g-long-lambda
         probability_fn(
             _maybe_mask_score(score, memory_sequence_length, score_mask_value),
@@ -234,6 +242,10 @@ class _BaseAttentionMechanism(AttentionMechanism):
   def alignments_size(self):
     return self._alignments_size
 
+  @property
+  def state_size(self):
+    return self._alignments_size
+
   def initial_alignments(self, batch_size, dtype):
     """Creates the initial alignment values for the `AttentionWrapper` class.
 
@@ -253,6 +265,23 @@ class _BaseAttentionMechanism(AttentionMechanism):
     max_time = self._alignments_size
     return _zero_state_tensors(max_time, batch_size, dtype)
 
+  def initial_state(self, batch_size, dtype):
+    """Creates the initial state values for the `AttentionWrapper` class.
+
+    This is important for AttentionMechanisms that use the previous alignment
+    to calculate the alignment at the next time step (e.g. monotonic attention).
+
+    The default behavior is to return the same output as initial_alignments.
+
+    Args:
+      batch_size: `int32` scalar, the batch_size.
+      dtype: The `dtype`.
+
+    Returns:
+      A structure of all-zero tensors with shapes as described by `state_size`.
+    """
+    return self.initial_alignments(batch_size, dtype)
+
 
 def _luong_score(query, keys, scale):
   """Implements Luong-style (multiplicative) scoring function.
@@ -380,13 +409,13 @@ class LuongAttention(_BaseAttentionMechanism):
     self._scale = scale
     self._name = name
 
-  def __call__(self, query, previous_alignments):
+  def __call__(self, query, state):
     """Score the query based on the keys and values.
 
     Args:
       query: Tensor of dtype matching `self.values` and shape
         `[batch_size, query_depth]`.
-      previous_alignments: Tensor of dtype matching `self.values` and shape
+      state: Tensor of dtype matching `self.values` and shape
         `[batch_size, alignments_size]`
         (`alignments_size` is memory's `max_time`).
 
@@ -397,8 +426,9 @@ class LuongAttention(_BaseAttentionMechanism):
     """
     with variable_scope.variable_scope(None, "luong_attention", [query]):
       score = _luong_score(query, self._keys, self._scale)
-    alignments = self._probability_fn(score, previous_alignments)
-    return alignments
+    alignments = self._probability_fn(score, state)
+    next_state = alignments
+    return alignments, next_state
 
 
 def _bahdanau_score(processed_query, keys, normalize):
@@ -525,13 +555,13 @@ class BahdanauAttention(_BaseAttentionMechanism):
     self._normalize = normalize
     self._name = name
 
-  def __call__(self, query, previous_alignments):
+  def __call__(self, query, state):
     """Score the query based on the keys and values.
 
     Args:
       query: Tensor of dtype matching `self.values` and shape
         `[batch_size, query_depth]`.
-      previous_alignments: Tensor of dtype matching `self.values` and shape
+      state: Tensor of dtype matching `self.values` and shape
         `[batch_size, alignments_size]`
         (`alignments_size` is memory's `max_time`).
 
@@ -543,8 +573,9 @@ class BahdanauAttention(_BaseAttentionMechanism):
     with variable_scope.variable_scope(None, "bahdanau_attention", [query]):
       processed_query = self.query_layer(query) if self.query_layer else query
       score = _bahdanau_score(processed_query, self._keys, self._normalize)
-    alignments = self._probability_fn(score, previous_alignments)
-    return alignments
+    alignments = self._probability_fn(score, state)
+    next_state = alignments
+    return alignments, next_state
 
 
 def safe_cumprod(x, *args, **kwargs):
@@ -804,13 +835,13 @@ class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism):
     self._name = name
     self._score_bias_init = score_bias_init
 
-  def __call__(self, query, previous_alignments):
+  def __call__(self, query, state):
     """Score the query based on the keys and values.
 
     Args:
       query: Tensor of dtype matching `self.values` and shape
         `[batch_size, query_depth]`.
-      previous_alignments: Tensor of dtype matching `self.values` and shape
+      state: Tensor of dtype matching `self.values` and shape
         `[batch_size, alignments_size]`
         (`alignments_size` is memory's `max_time`).
 
@@ -827,8 +858,9 @@ class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism):
           "attention_score_bias", dtype=processed_query.dtype,
           initializer=self._score_bias_init)
       score += score_bias
-    alignments = self._probability_fn(score, previous_alignments)
-    return alignments
+    alignments = self._probability_fn(score, state)
+    next_state = alignments
+    return alignments, next_state
 
 
 class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
@@ -905,13 +937,13 @@ class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
     self._score_bias_init = score_bias_init
     self._name = name
 
-  def __call__(self, query, previous_alignments):
+  def __call__(self, query, state):
     """Score the query based on the keys and values.
 
     Args:
       query: Tensor of dtype matching `self.values` and shape
         `[batch_size, query_depth]`.
-      previous_alignments: Tensor of dtype matching `self.values` and shape
+      state: Tensor of dtype matching `self.values` and shape
         `[batch_size, alignments_size]`
         (`alignments_size` is memory's `max_time`).
 
@@ -927,14 +959,15 @@ class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
           "attention_score_bias", dtype=query.dtype,
           initializer=self._score_bias_init)
       score += score_bias
-    alignments = self._probability_fn(score, previous_alignments)
-    return alignments
+    alignments = self._probability_fn(score, state)
+    next_state = alignments
+    return alignments, next_state
 
 
 class AttentionWrapperState(
     collections.namedtuple("AttentionWrapperState",
                            ("cell_state", "attention", "time", "alignments",
-                            "alignment_history"))):
+                            "alignment_history", "attention_state"))):
   """`namedtuple` storing the state of a `AttentionWrapper`.
 
   Contains:
@@ -948,6 +981,9 @@ class AttentionWrapperState(
     - `alignment_history`: (if enabled) a single or tuple of `TensorArray`(s)
        containing alignment matrices from all time steps for each attention
        mechanism. Call `stack()` on each to convert to a `Tensor`.
+    - `attention_state`: A single or tuple of nested objects
+       containing attention mechanism state for each attention mechanism.
+       The objects may contain Tensors or TensorArrays.
   """
 
   def clone(self, **kwargs):
@@ -992,11 +1028,11 @@ def hardmax(logits, name=None):
         math_ops.argmax(logits, -1), depth, dtype=logits.dtype)
 
 
-def _compute_attention(attention_mechanism, cell_output, previous_alignments,
+def _compute_attention(attention_mechanism, cell_output, attention_state,
                        attention_layer):
   """Computes the attention and alignments for a given attention_mechanism."""
-  alignments = attention_mechanism(
-      cell_output, previous_alignments=previous_alignments)
+  alignments, next_attention_state = attention_mechanism(
+      cell_output, state=attention_state)
 
   # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time]
   expanded_alignments = array_ops.expand_dims(alignments, 1)
@@ -1017,7 +1053,7 @@ def _compute_attention(attention_mechanism, cell_output, previous_alignments,
   else:
     attention = context
 
-  return attention, alignments
+  return attention, alignments, next_attention_state
 
 
 class AttentionWrapper(rnn_cell_impl.RNNCell):
@@ -1145,7 +1181,9 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
             % (len(attention_layer_sizes), len(attention_mechanisms)))
       self._attention_layers = tuple(
           layers_core.Dense(
-              attention_layer_size, name="attention_layer", use_bias=False,
+              attention_layer_size,
+              name="attention_layer",
+              use_bias=False,
               dtype=attention_mechanisms[i].dtype)
           for i, attention_layer_size in enumerate(attention_layer_sizes))
       self._attention_layer_size = sum(attention_layer_sizes)
@@ -1226,6 +1264,8 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
         attention=self._attention_layer_size,
         alignments=self._item_or_tuple(
             a.alignments_size for a in self._attention_mechanisms),
+        attention_state=self._item_or_tuple(
+            a.state_size for a in self._attention_mechanisms),
         alignment_history=self._item_or_tuple(
             () for _ in self._attention_mechanisms))  # sometimes a TensorArray
 
@@ -1275,6 +1315,9 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
           alignments=self._item_or_tuple(
               attention_mechanism.initial_alignments(batch_size, dtype)
               for attention_mechanism in self._attention_mechanisms),
+          attention_state=self._item_or_tuple(
+              attention_mechanism.initial_state(batch_size, dtype)
+              for attention_mechanism in self._attention_mechanisms),
           alignment_history=self._item_or_tuple(
               tensor_array_ops.TensorArray(dtype=dtype, size=0,
                                            dynamic_size=True)
@@ -1336,33 +1379,36 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
           cell_output, name="checked_cell_output")
 
     if self._is_multi:
-      previous_alignments = state.alignments
+      previous_attention_state = state.attention_state
       previous_alignment_history = state.alignment_history
     else:
-      previous_alignments = [state.alignments]
+      previous_attention_state = [state.attention_state]
       previous_alignment_history = [state.alignment_history]
 
     all_alignments = []
     all_attentions = []
-    all_histories = []
+    all_attention_states = []
+    maybe_all_histories = []
     for i, attention_mechanism in enumerate(self._attention_mechanisms):
-      attention, alignments = _compute_attention(
-          attention_mechanism, cell_output, previous_alignments[i],
+      attention, alignments, next_attention_state = _compute_attention(
+          attention_mechanism, cell_output, previous_attention_state[i],
           self._attention_layers[i] if self._attention_layers else None)
       alignment_history = previous_alignment_history[i].write(
           state.time, alignments) if self._alignment_history else ()
 
+      all_attention_states.append(next_attention_state)
       all_alignments.append(alignments)
-      all_histories.append(alignment_history)
       all_attentions.append(attention)
+      maybe_all_histories.append(alignment_history)
 
     attention = array_ops.concat(all_attentions, 1)
     next_state = AttentionWrapperState(
         time=state.time + 1,
         cell_state=next_cell_state,
         attention=attention,
+        attention_state=self._item_or_tuple(all_attention_states),
         alignments=self._item_or_tuple(all_alignments),
-        alignment_history=self._item_or_tuple(all_histories))
+        alignment_history=self._item_or_tuple(maybe_all_histories))
 
     if self._output_attention:
       return attention, next_state
diff --git a/tensorflow/contrib/seq2seq/python/ops/helper.py b/tensorflow/contrib/seq2seq/python/ops/helper.py
index b55d90cbabcc0bb63aaff86ba74c9fa2c6c917cf..ef3722ee41bb0b49e5f81d4d6514e2f40d2ad9f1 100644
--- a/tensorflow/contrib/seq2seq/python/ops/helper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/helper.py
@@ -540,8 +540,7 @@ class GreedyEmbeddingHelper(Helper):
     if not isinstance(outputs, ops.Tensor):
       raise TypeError("Expected outputs to be a single Tensor, got: %s" %
                       type(outputs))
-    sample_ids = math_ops.cast(
-        math_ops.argmax(outputs, axis=-1), dtypes.int32)
+    sample_ids = math_ops.argmax(outputs, axis=-1, output_type=dtypes.int32)
     return sample_ids
 
   def next_inputs(self, time, outputs, state, sample_ids, name=None):
diff --git a/tensorflow/contrib/slim/README.md b/tensorflow/contrib/slim/README.md
index f7a85557ca3df6325502da1052c96beff3c5ae08..dc92ae0c859394f44ba83d814adbef7d324a9ada 100644
--- a/tensorflow/contrib/slim/README.md
+++ b/tensorflow/contrib/slim/README.md
@@ -441,7 +441,8 @@ module. Consider the simple case where we want to train the VGG network:
 
 ```python
 import tensorflow as tf
-vgg = tf.contrib.slim.nets.vgg
+import tensorflow.contrib.slim.nets as nets
+vgg = nets.vgg
 
 # Load the images and labels.
 images, labels = ...
@@ -559,9 +560,10 @@ examine the following sample of training the VGG network:
 
 ```python
 import tensorflow as tf
+import tensorflow.contrib.slim.nets as nets
 
 slim = tf.contrib.slim
-vgg = tf.contrib.slim.nets.vgg
+vgg = nets.vgg
 
 ...
 
@@ -809,9 +811,10 @@ Putting it all together:
 
 ```python
 import tensorflow as tf
+import tensorflow.contrib.slim.nets as nets
 
 slim = tf.contrib.slim
-vgg = tf.contrib.slim.nets.vgg
+vgg = nets.vgg
 
 
 # Load the data
diff --git a/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py b/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py
index 82c6b5a619662ba5cbaba1b3a238045a8d9a2cd2..c42c7b3391db40fd0aad89c45f449487f484f371 100644
--- a/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py
+++ b/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py
@@ -62,7 +62,9 @@ class DatasetDataProvider(data_provider.DataProvider):
                seed=None,
                scope=None):
     """Creates a DatasetDataProvider.
-
+    Note: if `num_epochs` is not `None`,  local counter `epochs` will be created
+    by relevant function. Use `local_variables_initializer()` to initialize
+    local variables.
     Args:
       dataset: An instance of the Dataset class.
       num_readers: The number of parallel readers to use.
@@ -96,12 +98,12 @@ class DatasetDataProvider(data_provider.DataProvider):
     items = dataset.decoder.list_items()
     tensors = dataset.decoder.decode(data, items)
 
-    if record_key in items:
+    items_to_tensors = dict(zip(items, tensors))
+    if record_key in items_to_tensors:
       raise ValueError('The item name used for `record_key` cannot also be '
                        'used for a dataset item: %s', record_key)
-    items.append(record_key)
-    tensors.append(key)
+    items_to_tensors[record_key] = key
 
     super(DatasetDataProvider, self).__init__(
-        items_to_tensors=dict(zip(items, tensors)),
+        items_to_tensors=items_to_tensors,
         num_samples=dataset.num_samples)
diff --git a/tensorflow/contrib/slim/python/slim/evaluation.py b/tensorflow/contrib/slim/python/slim/evaluation.py
index cdb720b36ba2b01b4d42d0c0a657b00405c33519..3caf4e02da3aa2d7e586c4e76807a11f84585ea6 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation.py
@@ -34,7 +34,7 @@ the metrics and finally call the `evaluation` method:
       "mse": slim.metrics.mean_squared_error(predictions, labels),
   })
 
-  inital_op = tf.group(
+  initial_op = tf.group(
       tf.global_variables_initializer(),
       tf.local_variables_initializer())
 
@@ -42,7 +42,7 @@ the metrics and finally call the `evaluation` method:
     metric_values = slim.evaluation(
         sess,
         num_evals=1,
-        inital_op=initial_op,
+        initial_op=initial_op,
         eval_op=names_to_updates.values(),
         final_op=name_to_values.values())
 
diff --git a/tensorflow/contrib/slim/python/slim/nets/inception_v3.py b/tensorflow/contrib/slim/python/slim/nets/inception_v3.py
index e3c0c036d90c95a5f371bef2ca9f960926d82166..432e1f79f12ed5536887738339de845f6343e5ba 100644
--- a/tensorflow/contrib/slim/python/slim/nets/inception_v3.py
+++ b/tensorflow/contrib/slim/python/slim/nets/inception_v3.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.contrib import layers
 from tensorflow.contrib.framework.python.ops import arg_scope
+from tensorflow.contrib.layers.python.layers import initializers
 from tensorflow.contrib.layers.python.layers import layers as layers_lib
 from tensorflow.contrib.layers.python.layers import regularizers
 from tensorflow.python.framework import ops
@@ -675,14 +676,12 @@ def _reduced_kernel_size_for_small_input(input_tensor, kernel_size):
 
 
 def inception_v3_arg_scope(weight_decay=0.00004,
-                           stddev=0.1,
                            batch_norm_var_collection='moving_vars',
                            use_fused_batchnorm=True):
   """Defines the default InceptionV3 arg scope.
 
   Args:
     weight_decay: The weight decay to use for regularizing the model.
-    stddev: The standard deviation of the trunctated normal weight initializer.
     batch_norm_var_collection: The name of the collection for the batch norm
       variables.
     use_fused_batchnorm: Enable fused batchnorm.
@@ -714,8 +713,7 @@ def inception_v3_arg_scope(weight_decay=0.00004,
       weights_regularizer=regularizers.l2_regularizer(weight_decay)):
     with arg_scope(
         [layers.conv2d],
-        weights_initializer=init_ops.truncated_normal_initializer(
-            stddev=stddev),
+        weights_initializer=initializers.variance_scaling_initializer(),
         activation_fn=nn_ops.relu,
         normalizer_fn=layers_lib.batch_norm,
         normalizer_params=batch_norm_params) as sc:
diff --git a/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py b/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py
index cd4d46aa07bfa92b8243f2f168fd1e4682ad70e2..bea6341cfdcf7d56f255bec275b7861228e44e12 100644
--- a/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py
+++ b/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py
@@ -69,16 +69,17 @@ class StatelessOpsTest(test.TestCase):
   def testDeterminism(self):
     # Stateless values should be equal iff the seeds are equal (roughly)
     with self.test_session(use_gpu=True):
-      seed_t = array_ops.placeholder(dtypes.int64, shape=[2])
-      seeds = [(x, y) for x in range(5) for y in range(5)] * 3
-      for stateless_op, _ in CASES:
-        for shape in (), (3,), (2, 5):
-          pure = stateless_op(shape, seed=seed_t)
-          values = [(seed, pure.eval(feed_dict={seed_t: seed}))
-                    for seed in seeds]
-          for s0, v0 in values:
-            for s1, v1 in values:
-              self.assertEqual(s0 == s1, np.all(v0 == v1))
+      for seed_type in [dtypes.int32, dtypes.int64]:
+        seed_t = array_ops.placeholder(seed_type, shape=[2])
+        seeds = [(x, y) for x in range(5) for y in range(5)] * 3
+        for stateless_op, _ in CASES:
+          for shape in (), (3,), (2, 5):
+            pure = stateless_op(shape, seed=seed_t)
+            values = [(seed, pure.eval(feed_dict={seed_t: seed}))
+                      for seed in seeds]
+            for s0, v0 in values:
+              for s1, v1 in values:
+                self.assertEqual(s0 == s1, np.all(v0 == v1))
 
   def testShapeType(self):
     with self.test_session(use_gpu=True):
diff --git a/tensorflow/contrib/summary/BUILD b/tensorflow/contrib/summary/BUILD
index 3892654f2578f89e4c0fd13312cc78a6f1fe54c0..5ee5f1ae763db0ede9df464a08a9f1c7341b7cab 100644
--- a/tensorflow/contrib/summary/BUILD
+++ b/tensorflow/contrib/summary/BUILD
@@ -13,10 +13,7 @@ load(
 tf_gen_op_wrapper_py(
     name = "gen_summary_ops",
     out = "gen_summary_ops.py",
-    visibility = ["//tensorflow:internal"],
-    deps = [
-        "//tensorflow/core:summary_ops_op_lib",
-    ],
+    deps = ["//tensorflow/core:summary_ops_op_lib"],
 )
 
 py_test(
@@ -25,7 +22,6 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":summary_ops",
-        ":summary_test_internal",
         ":summary_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:errors",
@@ -46,11 +42,16 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":summary_ops",
-        ":summary_test_internal",
+        ":summary_test_util",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:ops",
-        "//tensorflow/python:platform",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:training",
+        "@six_archive//:six",
     ],
 )
 
@@ -61,6 +62,7 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         ":gen_summary_ops",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
@@ -73,6 +75,7 @@ py_library(
         "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python/eager:context",
+        "@six_archive//:six",
     ],
 )
 
@@ -111,15 +114,3 @@ py_library(
         "//tensorflow/python:platform",
     ],
 )
-
-py_library(
-    name = "summary_test_internal",
-    testonly = 1,
-    srcs = ["summary_test_internal.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:private"],
-    deps = [
-        "//tensorflow/python:lib",
-        "//tensorflow/python:platform",
-    ],
-)
diff --git a/tensorflow/contrib/summary/summary.py b/tensorflow/contrib/summary/summary.py
index f783179f61495f33c80b897d00aecb46743fddd9..7d3b8b7437a9ff5aaa0834db79bca8883cd679c8 100644
--- a/tensorflow/contrib/summary/summary.py
+++ b/tensorflow/contrib/summary/summary.py
@@ -28,9 +28,11 @@ from __future__ import print_function
 from tensorflow.contrib.summary.summary_ops import all_summary_ops
 from tensorflow.contrib.summary.summary_ops import always_record_summaries
 from tensorflow.contrib.summary.summary_ops import audio
-from tensorflow.contrib.summary.summary_ops import create_summary_db_writer
+from tensorflow.contrib.summary.summary_ops import create_db_writer
+from tensorflow.contrib.summary.summary_ops import create_file_writer
 from tensorflow.contrib.summary.summary_ops import create_summary_file_writer
 from tensorflow.contrib.summary.summary_ops import eval_dir
+from tensorflow.contrib.summary.summary_ops import flush
 from tensorflow.contrib.summary.summary_ops import generic
 from tensorflow.contrib.summary.summary_ops import graph
 from tensorflow.contrib.summary.summary_ops import histogram
diff --git a/tensorflow/contrib/summary/summary_ops.py b/tensorflow/contrib/summary/summary_ops.py
index a72c0c80aabcbdb931df891ab1570db84f177a91..ee661dfdc11451bb72bc2741b0b54ebf5c1e6543 100644
--- a/tensorflow/contrib/summary/summary_ops.py
+++ b/tensorflow/contrib/summary/summary_ops.py
@@ -38,14 +38,15 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import summary_op_util
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import training_util
 from tensorflow.python.util import tf_contextlib
 
+
 # Name for a collection which is expected to have at most a single boolean
 # Tensor. If this tensor is True the summary ops will record summaries.
 _SHOULD_RECORD_SUMMARIES_NAME = "ShouldRecordSummaries"
 
-_SUMMARY_COLLECTION_NAME = "_SUMMARY_V2"
 _SUMMARY_WRITER_INIT_COLLECTION_NAME = "_SUMMARY_WRITER_V2"
 
 _EXPERIMENT_NAME_PATTERNS = re.compile(r"^[^\x00-\x1F<>]{0,256}$")
@@ -70,7 +71,7 @@ def should_record_summaries():
 def record_summaries_every_n_global_steps(n, global_step=None):
   """Sets the should_record_summaries Tensor to true if global_step % n == 0."""
   if global_step is None:
-    global_step = training_util.get_global_step()
+    global_step = training_util.get_or_create_global_step()
   collection_ref = ops.get_collection_ref(_SHOULD_RECORD_SUMMARIES_NAME)
   old = collection_ref[:]
   with ops.device("cpu:0"):
@@ -103,8 +104,8 @@ class SummaryWriter(object):
   """Encapsulates a stateful summary writer resource.
 
   See also:
-  - @{tf.contrib.summary.create_summary_file_writer}
-  - @{tf.contrib.summary.create_summary_db_writer}
+  - @{tf.contrib.summary.create_file_writer}
+  - @{tf.contrib.summary.create_db_writer}
   """
 
   def  __init__(self, resource):
@@ -170,11 +171,11 @@ def initialize(
     session.run(_graph(x, 0), feed_dict={x: data})
 
 
-def create_summary_file_writer(logdir,
-                               max_queue=None,
-                               flush_millis=None,
-                               filename_suffix=None,
-                               name=None):
+def create_file_writer(logdir,
+                       max_queue=None,
+                       flush_millis=None,
+                       filename_suffix=None,
+                       name=None):
   """Creates a summary file writer in the current context.
 
   Args:
@@ -211,11 +212,11 @@ def create_summary_file_writer(logdir,
         filename_suffix=filename_suffix)
 
 
-def create_summary_db_writer(db_uri,
-                             experiment_name=None,
-                             run_name=None,
-                             user_name=None,
-                             name=None):
+def create_db_writer(db_uri,
+                     experiment_name=None,
+                     run_name=None,
+                     user_name=None,
+                     name=None):
   """Creates a summary database writer in the current context.
 
   This can be used to write tensors from the execution graph directly
@@ -298,7 +299,7 @@ def all_summary_ops():
   if context.in_eager_mode():
     raise RuntimeError(
         "tf.contrib.summary.all_summary_ops is only supported in graph mode.")
-  return ops.get_collection(_SUMMARY_COLLECTION_NAME)
+  return ops.get_collection(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
 
 
 def summary_writer_initializer_op():
@@ -340,14 +341,13 @@ def summary_writer_function(name, tensor, function, family=None):
   with ops.device("cpu:0"):
     op = utils.smart_cond(
         should_record_summaries(), record, _nothing, name="")
-    ops.add_to_collection(_SUMMARY_COLLECTION_NAME, op)
+    ops.add_to_collection(ops.GraphKeys._SUMMARY_COLLECTION, op)  # pylint: disable=protected-access
   return op
 
 
-def generic(name, tensor, metadata=None, family=None, global_step=None):
+def generic(name, tensor, metadata=None, family=None, step=None):
   """Writes a tensor summary if possible."""
-  if global_step is None:
-    global_step = training_util.get_global_step()
+
   def function(tag, scope):
     if metadata is None:
       serialized_metadata = constant_op.constant("")
@@ -358,67 +358,88 @@ def generic(name, tensor, metadata=None, family=None, global_step=None):
     # Note the identity to move the tensor to the CPU.
     return gen_summary_ops.write_summary(
         context.context().summary_writer_resource,
-        global_step, array_ops.identity(tensor),
-        tag, serialized_metadata, name=scope)
+        _choose_step(step),
+        array_ops.identity(tensor),
+        tag,
+        serialized_metadata,
+        name=scope)
   return summary_writer_function(name, tensor, function, family=family)
 
 
-def scalar(name, tensor, family=None, global_step=None):
-  """Writes a scalar summary if possible."""
-  if global_step is None:
-    global_step = training_util.get_global_step()
+def scalar(name, tensor, family=None, step=None):
+  """Writes a scalar summary if possible.
+
+  Unlike @{tf.contrib.summary.generic} this op may change the dtype
+  depending on the writer, for both practical and efficiency concerns.
+
+  Args:
+    name: An arbitrary name for this summary.
+    tensor: A @{tf.Tensor} Must be one of the following types:
+      `float32`, `float64`, `int32`, `int64`, `uint8`, `int16`,
+      `int8`, `uint16`, `half`, `uint32`, `uint64`.
+    family: Optional, the summary's family.
+    step: The `int64` monotonic step variable, which defaults
+      to @{tf.train.get_global_step}.
+
+  Returns:
+    The created @{tf.Operation} or a @{tf.no_op} if summary writing has
+    not been enabled for this context.
+  """
+
   def function(tag, scope):
     # Note the identity to move the tensor to the CPU.
     return gen_summary_ops.write_scalar_summary(
         context.context().summary_writer_resource,
-        global_step, tag, array_ops.identity(tensor),
+        _choose_step(step),
+        tag,
+        array_ops.identity(tensor),
         name=scope)
 
   return summary_writer_function(name, tensor, function, family=family)
 
 
-def histogram(name, tensor, family=None, global_step=None):
+def histogram(name, tensor, family=None, step=None):
   """Writes a histogram summary if possible."""
-  if global_step is None:
-    global_step = training_util.get_global_step()
+
   def function(tag, scope):
     # Note the identity to move the tensor to the CPU.
     return gen_summary_ops.write_histogram_summary(
         context.context().summary_writer_resource,
-        global_step, tag, array_ops.identity(tensor),
+        _choose_step(step),
+        tag,
+        array_ops.identity(tensor),
         name=scope)
 
   return summary_writer_function(name, tensor, function, family=family)
 
 
-def image(name, tensor, bad_color=None, max_images=3, family=None,
-          global_step=None):
+def image(name, tensor, bad_color=None, max_images=3, family=None, step=None):
   """Writes an image summary if possible."""
-  if global_step is None:
-    global_step = training_util.get_global_step()
+
   def function(tag, scope):
     bad_color_ = (constant_op.constant([255, 0, 0, 255], dtype=dtypes.uint8)
                   if bad_color is None else bad_color)
     # Note the identity to move the tensor to the CPU.
     return gen_summary_ops.write_image_summary(
         context.context().summary_writer_resource,
-        global_step, tag, array_ops.identity(tensor),
+        _choose_step(step),
+        tag,
+        array_ops.identity(tensor),
         bad_color_,
-        max_images, name=scope)
+        max_images,
+        name=scope)
 
   return summary_writer_function(name, tensor, function, family=family)
 
 
-def audio(name, tensor, sample_rate, max_outputs, family=None,
-          global_step=None):
+def audio(name, tensor, sample_rate, max_outputs, family=None, step=None):
   """Writes an audio summary if possible."""
-  if global_step is None:
-    global_step = training_util.get_global_step()
+
   def function(tag, scope):
     # Note the identity to move the tensor to the CPU.
     return gen_summary_ops.write_audio_summary(
         context.context().summary_writer_resource,
-        global_step,
+        _choose_step(step),
         tag,
         array_ops.identity(tensor),
         sample_rate=sample_rate,
@@ -465,15 +486,13 @@ def graph(param, step=None, name=None):
   if writer is None:
     return control_flow_ops.no_op()
   with ops.device("cpu:0"):
-    if step is None:
-      step = training_util.get_global_step()
-    else:
-      step = ops.convert_to_tensor(step, dtypes.int64)
     if isinstance(param, (ops.Graph, graph_pb2.GraphDef)):
       tensor = ops.convert_to_tensor(_serialize_graph(param), dtypes.string)
     else:
       tensor = array_ops.identity(param)
-    return gen_summary_ops.write_graph_summary(writer, step, tensor, name=name)
+    return gen_summary_ops.write_graph_summary(
+        writer, _choose_step(step), tensor, name=name)
+
 
 _graph = graph  # for functions with a graph parameter
 
@@ -481,7 +500,7 @@ _graph = graph  # for functions with a graph parameter
 def import_event(tensor, name=None):
   """Writes a @{tf.Event} binary proto.
 
-  When using create_summary_db_writer(), this can be used alongside
+  When using create_db_writer(), this can be used alongside
   @{tf.TFRecordReader} to load event logs into the database. Please
   note that this is lower level than the other summary functions and
   will ignore any conditions set by methods like
@@ -499,13 +518,49 @@ def import_event(tensor, name=None):
       context.context().summary_writer_resource, tensor, name=name)
 
 
+def flush(writer=None, name=None):
+  """Forces summary writer to send any buffered data to storage.
+
+  This operation blocks until that finishes.
+
+  Args:
+    writer: The @{tf.contrib.summary.SummaryWriter} resource to flush.
+      The thread default will be used if this parameter is None.
+      Otherwise a @{tf.no_op} is returned.
+    name: A name for the operation (optional).
+
+  Returns:
+    The created @{tf.Operation}.
+  """
+  if writer is None:
+    writer = context.context().summary_writer_resource
+    if writer is None:
+      return control_flow_ops.no_op()
+  return gen_summary_ops.flush_summary_writer(writer, name=name)
+
+
 def eval_dir(model_dir, name=None):
   """Construct a logdir for an eval summary writer."""
   return os.path.join(model_dir, "eval" if not name else "eval_" + name)
 
 
+def create_summary_file_writer(*args, **kwargs):
+  """Please use @{tf.contrib.summary.create_file_writer}."""
+  logging.warning("Deprecation Warning: create_summary_file_writer was renamed "
+                  "to create_file_writer")
+  return create_file_writer(*args, **kwargs)
+
+
 def _serialize_graph(arbitrary_graph):
   if isinstance(arbitrary_graph, ops.Graph):
     return arbitrary_graph.as_graph_def(add_shapes=True).SerializeToString()
   else:
     return arbitrary_graph.SerializeToString()
+
+
+def _choose_step(step):
+  if step is None:
+    return training_util.get_or_create_global_step()
+  if not isinstance(step, ops.Tensor):
+    return ops.convert_to_tensor(step, dtypes.int64)
+  return step
diff --git a/tensorflow/contrib/summary/summary_ops_graph_test.py b/tensorflow/contrib/summary/summary_ops_graph_test.py
index 8f85f67a2580ebbc4307c9d128ba10a5b991a31a..2b7806f80d020e0064b0f5cf32fd765a9ee993d1 100644
--- a/tensorflow/contrib/summary/summary_ops_graph_test.py
+++ b/tensorflow/contrib/summary/summary_ops_graph_test.py
@@ -16,20 +16,27 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import tempfile
+
 import six
 
 from tensorflow.contrib.summary import summary_ops
-from tensorflow.contrib.summary import summary_test_internal
+from tensorflow.contrib.summary import summary_test_util
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training import training_util
 
-get_all = summary_test_internal.get_all
+get_all = summary_test_util.get_all
 
 
-class DbTest(summary_test_internal.SummaryDbTest):
+class DbTest(summary_test_util.SummaryDbTest):
 
   def testGraphPassedToGraph_isForbiddenForThineOwnSafety(self):
     with self.assertRaises(TypeError):
@@ -42,11 +49,85 @@ class DbTest(summary_test_internal.SummaryDbTest):
     name = 'hi'
     graph = graph_pb2.GraphDef(node=(node_def_pb2.NodeDef(name=name),))
     with self.test_session():
-      with self.create_summary_db_writer().as_default():
+      with self.create_db_writer().as_default():
         summary_ops.initialize(graph=graph)
     six.assertCountEqual(self, [name],
                          get_all(self.db, 'SELECT node_name FROM Nodes'))
 
+  def testScalarSummary(self):
+    """Test record_summaries_every_n_global_steps and all_summaries()."""
+    with ops.Graph().as_default(), self.test_session() as sess:
+      global_step = training_util.get_or_create_global_step()
+      global_step.initializer.run()
+      with ops.device('/cpu:0'):
+        step_increment = state_ops.assign_add(global_step, 1)
+      sess.run(step_increment)  # Increment global step from 0 to 1
+
+      logdir = tempfile.mkdtemp()
+      with summary_ops.create_file_writer(logdir, max_queue=0,
+                                          name='t2').as_default():
+        with summary_ops.record_summaries_every_n_global_steps(2):
+          summary_ops.initialize()
+          summary_op = summary_ops.scalar('my_scalar', 2.0)
+
+          # Neither of these should produce a summary because
+          # global_step is 1 and "1 % 2 != 0"
+          sess.run(summary_ops.all_summary_ops())
+          sess.run(summary_op)
+          events = summary_test_util.events_from_logdir(logdir)
+          self.assertEqual(len(events), 1)
+
+          # Increment global step from 1 to 2 and check that the summary
+          # is now written
+          sess.run(step_increment)
+          sess.run(summary_ops.all_summary_ops())
+          events = summary_test_util.events_from_logdir(logdir)
+          self.assertEqual(len(events), 2)
+          self.assertEqual(events[1].summary.value[0].tag, 'my_scalar')
+
+  def testSummaryGraphModeCond(self):
+    with ops.Graph().as_default(), self.test_session():
+      training_util.get_or_create_global_step()
+      logdir = tempfile.mkdtemp()
+      with summary_ops.create_file_writer(
+          logdir, max_queue=0,
+          name='t2').as_default(), summary_ops.always_record_summaries():
+        summary_ops.initialize()
+        training_util.get_or_create_global_step().initializer.run()
+        def f():
+          summary_ops.scalar('scalar', 2.0)
+          return constant_op.constant(True)
+        pred = array_ops.placeholder(dtypes.bool)
+        x = control_flow_ops.cond(pred, f,
+                                  lambda: constant_op.constant(False))
+        x.eval(feed_dict={pred: True})
+
+      events = summary_test_util.events_from_logdir(logdir)
+      self.assertEqual(len(events), 2)
+      self.assertEqual(events[1].summary.value[0].tag, 'cond/scalar')
+
+  def testSummaryGraphModeWhile(self):
+    with ops.Graph().as_default(), self.test_session():
+      training_util.get_or_create_global_step()
+      logdir = tempfile.mkdtemp()
+      with summary_ops.create_file_writer(
+          logdir, max_queue=0,
+          name='t2').as_default(), summary_ops.always_record_summaries():
+        summary_ops.initialize()
+        training_util.get_or_create_global_step().initializer.run()
+        def body(unused_pred):
+          summary_ops.scalar('scalar', 2.0)
+          return constant_op.constant(False)
+        def cond(pred):
+          return pred
+        pred = array_ops.placeholder(dtypes.bool)
+        x = control_flow_ops.while_loop(cond, body, [pred])
+        x.eval(feed_dict={pred: True})
+
+      events = summary_test_util.events_from_logdir(logdir)
+      self.assertEqual(len(events), 2)
+      self.assertEqual(events[1].summary.value[0].tag, 'while/scalar')
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/summary/summary_ops_test.py b/tensorflow/contrib/summary/summary_ops_test.py
index c5ca054f77f9648bddb9deda9290ee54f31800a5..4ef03434b76ee04ce1bb0bd09c27a46db115bab3 100644
--- a/tensorflow/contrib/summary/summary_ops_test.py
+++ b/tensorflow/contrib/summary/summary_ops_test.py
@@ -21,7 +21,6 @@ import tempfile
 import six
 
 from tensorflow.contrib.summary import summary_ops
-from tensorflow.contrib.summary import summary_test_internal
 from tensorflow.contrib.summary import summary_test_util
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
@@ -35,8 +34,8 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.training import training_util
 
-get_all = summary_test_internal.get_all
-get_one = summary_test_internal.get_one
+get_all = summary_test_util.get_all
+get_one = summary_test_util.get_one
 
 
 class TargetTest(test_util.TensorFlowTestCase):
@@ -45,7 +44,7 @@ class TargetTest(test_util.TensorFlowTestCase):
     logdir = '/tmp/apath/that/doesnt/exist'
     self.assertFalse(gfile.Exists(logdir))
     with self.assertRaises(errors.NotFoundError):
-      summary_ops.create_summary_file_writer(logdir, max_queue=0, name='t0')
+      summary_ops.create_file_writer(logdir, max_queue=0, name='t0')
 
   def testShouldRecordSummary(self):
     self.assertFalse(summary_ops.should_record_summaries())
@@ -55,7 +54,7 @@ class TargetTest(test_util.TensorFlowTestCase):
   def testSummaryOps(self):
     training_util.get_or_create_global_step()
     logdir = tempfile.mkdtemp()
-    with summary_ops.create_summary_file_writer(
+    with summary_ops.create_file_writer(
         logdir, max_queue=0,
         name='t0').as_default(), summary_ops.always_record_summaries():
       summary_ops.generic('tensor', 1, '')
@@ -70,7 +69,7 @@ class TargetTest(test_util.TensorFlowTestCase):
   def testDefunSummarys(self):
     training_util.get_or_create_global_step()
     logdir = tempfile.mkdtemp()
-    with summary_ops.create_summary_file_writer(
+    with summary_ops.create_file_writer(
         logdir, max_queue=0,
         name='t1').as_default(), summary_ops.always_record_summaries():
 
@@ -86,7 +85,7 @@ class TargetTest(test_util.TensorFlowTestCase):
   def testSummaryName(self):
     training_util.get_or_create_global_step()
     logdir = tempfile.mkdtemp()
-    with summary_ops.create_summary_file_writer(
+    with summary_ops.create_file_writer(
         logdir, max_queue=0,
         name='t2').as_default(), summary_ops.always_record_summaries():
 
@@ -97,23 +96,51 @@ class TargetTest(test_util.TensorFlowTestCase):
       self.assertEqual(events[1].summary.value[0].tag, 'scalar')
 
   def testSummaryGlobalStep(self):
-    global_step = training_util.get_or_create_global_step()
+    step = training_util.get_or_create_global_step()
     logdir = tempfile.mkdtemp()
-    with summary_ops.create_summary_file_writer(
+    with summary_ops.create_file_writer(
         logdir, max_queue=0,
         name='t2').as_default(), summary_ops.always_record_summaries():
 
-      summary_ops.scalar('scalar', 2.0, global_step=global_step)
+      summary_ops.scalar('scalar', 2.0, step=step)
 
       events = summary_test_util.events_from_logdir(logdir)
       self.assertEqual(len(events), 2)
       self.assertEqual(events[1].summary.value[0].tag, 'scalar')
 
-
-class DbTest(summary_test_internal.SummaryDbTest):
+  def testMaxQueue(self):
+    logs = tempfile.mkdtemp()
+    with summary_ops.create_file_writer(
+        logs, max_queue=2, flush_millis=999999,
+        name='lol').as_default(), summary_ops.always_record_summaries():
+      get_total = lambda: len(summary_test_util.events_from_logdir(logs))
+      # Note: First tf.Event is always file_version.
+      self.assertEqual(1, get_total())
+      summary_ops.scalar('scalar', 2.0, step=1)
+      self.assertEqual(1, get_total())
+      summary_ops.scalar('scalar', 2.0, step=2)
+      self.assertEqual(3, get_total())
+
+  def testFlush(self):
+    logs = tempfile.mkdtemp()
+    with summary_ops.create_file_writer(
+        logs, max_queue=999999, flush_millis=999999,
+        name='lol').as_default(), summary_ops.always_record_summaries():
+      get_total = lambda: len(summary_test_util.events_from_logdir(logs))
+      # Note: First tf.Event is always file_version.
+      self.assertEqual(1, get_total())
+      summary_ops.scalar('scalar', 2.0, step=1)
+      summary_ops.scalar('scalar', 2.0, step=2)
+      self.assertEqual(1, get_total())
+      summary_ops.flush()
+      self.assertEqual(3, get_total())
+
+
+class DbTest(summary_test_util.SummaryDbTest):
 
   def testIntegerSummaries(self):
     step = training_util.create_global_step()
+    writer = self.create_db_writer()
 
     def adder(x, y):
       state_ops.assign_add(step, 1)
@@ -124,7 +151,7 @@ class DbTest(summary_test_internal.SummaryDbTest):
       return sum_
 
     with summary_ops.always_record_summaries():
-      with self.create_summary_db_writer().as_default():
+      with writer.as_default():
         self.assertEqual(5, adder(int64(2), int64(3)).numpy())
 
     six.assertCountEqual(self, [1, 1, 1],
@@ -136,7 +163,7 @@ class DbTest(summary_test_internal.SummaryDbTest):
     sum_id = get_one(self.db, 'SELECT tag_id FROM Tags WHERE tag_name = "sum"')
 
     with summary_ops.always_record_summaries():
-      with self.create_summary_db_writer().as_default():
+      with writer.as_default():
         self.assertEqual(9, adder(int64(4), int64(5)).numpy())
 
     six.assertCountEqual(self, [1, 1, 1, 2, 2, 2],
@@ -159,26 +186,26 @@ class DbTest(summary_test_internal.SummaryDbTest):
 
   def testBadExperimentName(self):
     with self.assertRaises(ValueError):
-      self.create_summary_db_writer(experiment_name='\0')
+      self.create_db_writer(experiment_name='\0')
 
   def testBadRunName(self):
     with self.assertRaises(ValueError):
-      self.create_summary_db_writer(run_name='\0')
+      self.create_db_writer(run_name='\0')
 
   def testBadUserName(self):
     with self.assertRaises(ValueError):
-      self.create_summary_db_writer(user_name='-hi')
+      self.create_db_writer(user_name='-hi')
     with self.assertRaises(ValueError):
-      self.create_summary_db_writer(user_name='hi-')
+      self.create_db_writer(user_name='hi-')
     with self.assertRaises(ValueError):
-      self.create_summary_db_writer(user_name='@')
+      self.create_db_writer(user_name='@')
 
   def testGraphSummary(self):
     training_util.get_or_create_global_step()
     name = 'hi'
     graph = graph_pb2.GraphDef(node=(node_def_pb2.NodeDef(name=name),))
     with summary_ops.always_record_summaries():
-      with self.create_summary_db_writer().as_default():
+      with self.create_db_writer().as_default():
         summary_ops.graph(graph)
     six.assertCountEqual(self, [name],
                          get_all(self.db, 'SELECT node_name FROM Nodes'))
diff --git a/tensorflow/contrib/summary/summary_test_internal.py b/tensorflow/contrib/summary/summary_test_internal.py
deleted file mode 100644
index 54233f2f50bdf3312bdc2b3e033aa71539fe467c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/summary/summary_test_internal.py
+++ /dev/null
@@ -1,59 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Internal helpers for tests in this directory."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import functools
-import os
-import sqlite3
-
-from tensorflow.contrib.summary import summary_ops
-from tensorflow.python.framework import test_util
-
-
-class SummaryDbTest(test_util.TensorFlowTestCase):
-  """Helper for summary database testing."""
-
-  def setUp(self):
-    super(SummaryDbTest, self).setUp()
-    self.db_path = os.path.join(self.get_temp_dir(), 'DbTest.sqlite')
-    if os.path.exists(self.db_path):
-      os.unlink(self.db_path)
-    self.db = sqlite3.connect(self.db_path)
-    self.create_summary_db_writer = functools.partial(
-        summary_ops.create_summary_db_writer,
-        db_uri=self.db_path,
-        experiment_name='experiment',
-        run_name='run',
-        user_name='user')
-
-  def tearDown(self):
-    self.db.close()
-    super(SummaryDbTest, self).tearDown()
-
-
-def get_one(db, q, *p):
-  return db.execute(q, p).fetchone()[0]
-
-
-def get_all(db, q, *p):
-  return unroll(db.execute(q, p).fetchall())
-
-
-def unroll(list_of_tuples):
-  return sum(list_of_tuples, ())
diff --git a/tensorflow/contrib/summary/summary_test_util.py b/tensorflow/contrib/summary/summary_test_util.py
index 794c5b8bab11f92474615cce40bb701e69b55f9f..bda57e6a0ca8e1ddb979a80de276911c7738f0aa 100644
--- a/tensorflow/contrib/summary/summary_test_util.py
+++ b/tensorflow/contrib/summary/summary_test_util.py
@@ -19,13 +19,38 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import os
+import sqlite3
 
+from tensorflow.contrib.summary import summary_ops
 from tensorflow.core.util import event_pb2
+from tensorflow.python.framework import test_util
 from tensorflow.python.lib.io import tf_record
 from tensorflow.python.platform import gfile
 
 
+class SummaryDbTest(test_util.TensorFlowTestCase):
+  """Helper for summary database testing."""
+
+  def setUp(self):
+    super(SummaryDbTest, self).setUp()
+    self.db_path = os.path.join(self.get_temp_dir(), 'DbTest.sqlite')
+    if os.path.exists(self.db_path):
+      os.unlink(self.db_path)
+    self.db = sqlite3.connect(self.db_path)
+    self.create_db_writer = functools.partial(
+        summary_ops.create_db_writer,
+        db_uri=self.db_path,
+        experiment_name='experiment',
+        run_name='run',
+        user_name='user')
+
+  def tearDown(self):
+    self.db.close()
+    super(SummaryDbTest, self).tearDown()
+
+
 def events_from_file(filepath):
   """Returns all events in a single event file.
 
@@ -58,5 +83,17 @@ def events_from_logdir(logdir):
   """
   assert gfile.Exists(logdir)
   files = gfile.ListDirectory(logdir)
-  assert len(files) == 1, "Found not exactly one file in logdir: %s" % files
+  assert len(files) == 1, 'Found not exactly one file in logdir: %s' % files
   return events_from_file(os.path.join(logdir, files[0]))
+
+
+def get_one(db, q, *p):
+  return db.execute(q, p).fetchone()[0]
+
+
+def get_all(db, q, *p):
+  return unroll(db.execute(q, p).fetchall())
+
+
+def unroll(list_of_tuples):
+  return sum(list_of_tuples, ())
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
index 09b83e2af1f2038665ac6abc1fedd99426066d02..66aa293dc1cb93b82f06d838ad7b0f9c09761585 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
@@ -70,7 +70,7 @@ REGISTER_OP("StochasticHardRoutingFunction")
       return Status::OK();
     })
     .Doc(R"doc(
-  Samples a path for each instance in `input_data` and returns the 
+  Samples a path for each instance in `input_data` and returns the
   probability of the path and the path taken.
 
   tree_depth: The depth of the decision tree.
diff --git a/tensorflow/contrib/tensor_forest/hybrid/python/models/decisions_to_data_then_nn_test.py b/tensorflow/contrib/tensor_forest/hybrid/python/models/decisions_to_data_then_nn_test.py
index cccf444db809df5032877f026f8f89363ca085bc..a56beeeb2c13cd17082531877670475a16396ca6 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/python/models/decisions_to_data_then_nn_test.py
+++ b/tensorflow/contrib/tensor_forest/hybrid/python/models/decisions_to_data_then_nn_test.py
@@ -80,7 +80,7 @@ class DecisionsToDataThenNNTest(test_util.TensorFlowTestCase):
         isinstance(self.params.num_trees, tensor_forest.ForestHParams))
 
     with variable_scope.variable_scope(
-        "DecisionsToDataThenNNTest_testContructionPollution"):
+        "DecisionsToDataThenNNTest_testConstructionPollution"):
       graph_builder = decisions_to_data_then_nn.DecisionsToDataThenNN(
           self.params)
 
@@ -95,7 +95,7 @@ class DecisionsToDataThenNNTest(test_util.TensorFlowTestCase):
          for _ in range(100)])
 
     with variable_scope.variable_scope(
-        "DecisionsToDataThenNNTest_testInferenceContruction"):
+        "DecisionsToDataThenNNTest_testInferenceConstruction"):
       graph_builder = decisions_to_data_then_nn.DecisionsToDataThenNN(
           self.params)
       graph = graph_builder.inference_graph(data, None)
@@ -111,7 +111,7 @@ class DecisionsToDataThenNNTest(test_util.TensorFlowTestCase):
     labels = [1 for _ in range(100)]
 
     with variable_scope.variable_scope(
-        "DecisionsToDataThenNNTest_testTrainingContruction"):
+        "DecisionsToDataThenNNTest_testTrainingConstruction"):
       graph_builder = decisions_to_data_then_nn.DecisionsToDataThenNN(
           self.params)
       graph = graph_builder.training_graph(data, labels, None)
diff --git a/tensorflow/contrib/tensorboard/db/schema.cc b/tensorflow/contrib/tensorboard/db/schema.cc
index d63b2c6cc23248c2dc5bdd4433047d3fa58c1d14..fd024d692c3feddea2e5cbd29380686e8a0e9839 100644
--- a/tensorflow/contrib/tensorboard/db/schema.cc
+++ b/tensorflow/contrib/tensorboard/db/schema.cc
@@ -21,6 +21,48 @@ class SqliteSchema {
  public:
   explicit SqliteSchema(std::shared_ptr<Sqlite> db) : db_(std::move(db)) {}
 
+  /// \brief Creates Ids table.
+  ///
+  /// This table must be used to randomly allocate Permanent IDs for
+  /// all top-level tables, in order to maintain an invariant where
+  /// foo_id != bar_id for all IDs of any two tables.
+  ///
+  /// A row should only be deleted from this table if it can be
+  /// guaranteed that it exists absolutely nowhere else in the entire
+  /// system.
+  ///
+  /// Fields:
+  ///   id: An ID that was allocated globally. This must be in the
+  ///     range [1,2**47). 0 is assigned the same meaning as NULL and
+  ///     shouldn't be stored; 2**63-1 is reserved for statically
+  ///     allocating space in a page to UPDATE later; and all other
+  ///     int64 values are reserved for future use.
+  Status CreateIdsTable() {
+    return Run(R"sql(
+      CREATE TABLE IF NOT EXISTS Ids (
+        id INTEGER PRIMARY KEY
+      )
+    )sql");
+  }
+
+  /// \brief Creates Descriptions table.
+  ///
+  /// This table allows TensorBoard to associate Markdown text with any
+  /// object in the database that has a Permanent ID.
+  ///
+  /// Fields:
+  ///   id: The Permanent ID of the associated object. This is also the
+  ///     SQLite rowid.
+  ///   description: Arbitrary Markdown text.
+  Status CreateDescriptionsTable() {
+    return Run(R"sql(
+      CREATE TABLE IF NOT EXISTS Descriptions (
+        id INTEGER PRIMARY KEY,
+        description TEXT
+      )
+    )sql");
+  }
+
   /// \brief Creates Tensors table.
   ///
   /// Fields:
@@ -83,15 +125,15 @@ class SqliteSchema {
   ///
   /// Fields:
   ///   rowid: Ephemeral b-tree ID dictating locality.
-  ///   tag_id: Permanent >0 unique ID.
+  ///   tag_id: The Permanent ID of the Tag.
   ///   run_id: Optional ID of associated Run.
   ///   tag_name: The tag field in summary.proto, unique across Run.
   ///   inserted_time: Float UNIX timestamp with µs precision. This is
   ///     always the wall time of when the row was inserted into the
   ///     DB. It may be used as a hint for an archival job.
-  ///   metadata: Optional BLOB of SummaryMetadata proto.
   ///   display_name: Optional for GUI and defaults to tag_name.
-  ///   summary_description: Optional markdown information.
+  ///   plugin_name: Arbitrary TensorBoard plugin name for dispatch.
+  ///   plugin_data: Arbitrary data that plugin wants.
   Status CreateTagsTable() {
     return Run(R"sql(
       CREATE TABLE IF NOT EXISTS Tags (
@@ -100,28 +142,31 @@ class SqliteSchema {
         tag_id INTEGER NOT NULL,
         tag_name TEXT,
         inserted_time DOUBLE,
-        metadata BLOB,
         display_name TEXT,
-        description TEXT
+        plugin_name TEXT,
+        plugin_data BLOB
       )
     )sql");
   }
 
   /// \brief Creates Runs table.
   ///
-  /// This table stores information about runs. Each row usually
+  /// This table stores information about Runs. Each row usually
   /// represents a single attempt at training or testing a TensorFlow
   /// model, with a given set of hyper-parameters, whose summaries are
   /// written out to a single event logs directory with a monotonic step
   /// counter.
   ///
-  /// When a run is deleted from this table, TensorBoard should treat all
-  /// information associated with it as deleted, even if those rows in
-  /// different tables still exist.
-  ///
   /// Fields:
   ///   rowid: Ephemeral b-tree ID dictating locality.
-  ///   run_id: Permanent >0 unique ID.
+  ///   run_id: The Permanent ID of the Run. This has a 1:1 mapping
+  ///     with a SummaryWriter instance. If two writers spawn for a
+  ///     given (user_name, run_name, run_name) then each should
+  ///     allocate its own run_id and whichever writer puts it in the
+  ///     database last wins. The Tags / Tensors associated with the
+  ///     previous invocations will then enter limbo, where they may be
+  ///     accessible for certain operations, but should be garbage
+  ///     collected eventually.
   ///   experiment_id: Optional ID of associated Experiment.
   ///   run_name: User-supplied string, unique across Experiment.
   ///   inserted_time: Float UNIX timestamp with µs precision. This is
@@ -134,7 +179,10 @@ class SqliteSchema {
   ///     started, from the perspective of whichever machine talks to
   ///     the database. This field will be mutated if the run is
   ///     restarted.
-  ///   description: Optional markdown information.
+  ///   finished_time: Float UNIX timestamp with µs precision of when
+  ///     SummaryWriter resource that created this run was destroyed.
+  ///     Once this value becomes non-NULL a Run and its Tags and
+  ///     Tensors should be regarded as immutable.
   ///   graph_id: ID of associated Graphs row.
   Status CreateRunsTable() {
     return Run(R"sql(
@@ -145,7 +193,7 @@ class SqliteSchema {
         run_name TEXT,
         inserted_time REAL,
         started_time REAL,
-        description TEXT,
+        finished_time REAL,
         graph_id INTEGER
       )
     )sql");
@@ -159,15 +207,15 @@ class SqliteSchema {
   /// Fields:
   ///   rowid: Ephemeral b-tree ID dictating locality.
   ///   user_id: Optional ID of associated User.
-  ///   experiment_id: Permanent >0 unique ID.
+  ///   experiment_id: The Permanent ID of the Experiment.
   ///   experiment_name: User-supplied string, unique across User.
   ///   inserted_time: Float UNIX timestamp with µs precision. This is
   ///     always the time the row was inserted into the database. It
   ///     does not change.
   ///   started_time: Float UNIX timestamp with µs precision. This is
   ///     the MIN(experiment.started_time, run.started_time) of each
-  ///     Run added to the database.
-  ///   description: Optional markdown information.
+  ///     Run added to the database, including Runs which have since
+  ///     been overwritten.
   Status CreateExperimentsTable() {
     return Run(R"sql(
       CREATE TABLE IF NOT EXISTS Experiments (
@@ -176,8 +224,7 @@ class SqliteSchema {
         experiment_id INTEGER NOT NULL,
         experiment_name TEXT,
         inserted_time REAL,
-        started_time REAL,
-        description TEXT
+        started_time REAL
       )
     )sql");
   }
@@ -186,7 +233,7 @@ class SqliteSchema {
   ///
   /// Fields:
   ///   rowid: Ephemeral b-tree ID dictating locality.
-  ///   user_id: Permanent >0 unique ID.
+  ///   user_id: The Permanent ID of the User.
   ///   user_name: Unique user name.
   ///   email: Optional unique email address.
   ///   inserted_time: Float UNIX timestamp with µs precision. This is
@@ -208,7 +255,7 @@ class SqliteSchema {
   ///
   /// Fields:
   ///   rowid: Ephemeral b-tree ID dictating locality.
-  ///   graph_id: Permanent >0 unique ID.
+  ///   graph_id: The Permanent ID of the Graph.
   ///   inserted_time: Float UNIX timestamp with µs precision. This is
   ///     always the wall time of when the row was inserted into the
   ///     DB. It may be used as a hint for an archival job.
@@ -229,7 +276,7 @@ class SqliteSchema {
   ///
   /// Fields:
   ///   rowid: Ephemeral b-tree ID dictating locality.
-  ///   graph_id: Permanent >0 unique ID.
+  ///   graph_id: The Permanent ID of the associated Graph.
   ///   node_id: ID for this node. This is more like a 0-index within
   ///     the Graph. Please note indexes are allowed to be removed.
   ///   node_name: Unique name for this Node within Graph. This is
@@ -258,7 +305,7 @@ class SqliteSchema {
   ///
   /// Fields:
   ///   rowid: Ephemeral b-tree ID dictating locality.
-  ///   graph_id: Permanent >0 unique ID.
+  ///   graph_id: The Permanent ID of the associated Graph.
   ///   node_id: Index of Node in question. This can be considered the
   ///     'to' vertex.
   ///   idx: Used for ordering inputs on a given Node.
@@ -420,6 +467,8 @@ class SqliteSchema {
 
 Status SetupTensorboardSqliteDb(std::shared_ptr<Sqlite> db) {
   SqliteSchema s(std::move(db));
+  TF_RETURN_IF_ERROR(s.CreateIdsTable());
+  TF_RETURN_IF_ERROR(s.CreateDescriptionsTable());
   TF_RETURN_IF_ERROR(s.CreateTensorsTable());
   TF_RETURN_IF_ERROR(s.CreateTensorChunksTable());
   TF_RETURN_IF_ERROR(s.CreateTagsTable());
diff --git a/tensorflow/contrib/tensorboard/db/summary_db_writer.cc b/tensorflow/contrib/tensorboard/db/summary_db_writer.cc
index ae063d24efef3fd1127f45473b4ed1be4507042d..04b9c8e457bd52ff476ed8b13ff9608bdc8a933e 100644
--- a/tensorflow/contrib/tensorboard/db/summary_db_writer.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_db_writer.cc
@@ -29,22 +29,25 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+// https://www.sqlite.org/fileformat.html#record_format
+const uint64 kIdTiers[] = {
+    0x7fffffULL,        // 23-bit (3 bytes on disk)
+    0x7fffffffULL,      // 31-bit (4 bytes on disk)
+    0x7fffffffffffULL,  // 47-bit (5 bytes on disk)
+                        // Remaining bits reserved for future use.
+};
+const int kMaxIdTier = sizeof(kIdTiers) / sizeof(uint64);
+const int kIdCollisionDelayMicros = 10;
+const int kMaxIdCollisions = 21;  // sum(2**i*10µs for i in range(21))~=21s
+const int64 kAbsent = 0LL;
+const int64 kReserved = 0x7fffffffffffffffLL;
+
 double GetWallTime(Env* env) {
   // TODO(@jart): Follow precise definitions for time laid out in schema.
   // TODO(@jart): Use monotonic clock from gRPC codebase.
   return static_cast<double>(env->NowMicros()) / 1.0e6;
 }
 
-int64 MakeRandomId() {
-  // TODO(@jart): Try generating ID in 2^24 space, falling back to 2^63
-  //              https://sqlite.org/src4/doc/trunk/www/varint.wiki
-  int64 id = static_cast<int64>(random::New64() & ((1ULL << 63) - 1));
-  if (id == 0) {
-    ++id;
-  }
-  return id;
-}
-
 Status Serialize(const protobuf::MessageLite& proto, string* output) {
   output->clear();
   if (!proto.SerializeToString(output)) {
@@ -67,7 +70,7 @@ Status BindProto(SqliteStatement* stmt, int parameter,
   TF_RETURN_IF_ERROR(Serialize(proto, &serialized));
   string compressed;
   TF_RETURN_IF_ERROR(Compress(serialized, &compressed));
-  stmt->BindBlobUnsafe(parameter, compressed);
+  stmt->BindBlob(parameter, compressed);
   return Status::OK();
 }
 
@@ -81,54 +84,118 @@ Status BindTensor(SqliteStatement* stmt, int parameter, const Tensor& t) {
   return BindProto(stmt, parameter, p);
 }
 
-class Transactor {
+// Tries to fudge shape and dtype to something with smaller storage.
+Status CoerceScalar(const Tensor& t, Tensor* out) {
+  switch (t.dtype()) {
+    case DT_DOUBLE:
+      *out = t;
+      break;
+    case DT_INT64:
+      *out = t;
+      break;
+    case DT_FLOAT:
+      *out = {DT_DOUBLE, {}};
+      out->scalar<double>()() = t.scalar<float>()();
+      break;
+    case DT_HALF:
+      *out = {DT_DOUBLE, {}};
+      out->scalar<double>()() = static_cast<double>(t.scalar<Eigen::half>()());
+      break;
+    case DT_INT32:
+      *out = {DT_INT64, {}};
+      out->scalar<int64>()() = t.scalar<int32>()();
+      break;
+    case DT_INT16:
+      *out = {DT_INT64, {}};
+      out->scalar<int64>()() = t.scalar<int16>()();
+      break;
+    case DT_INT8:
+      *out = {DT_INT64, {}};
+      out->scalar<int64>()() = t.scalar<int8>()();
+      break;
+    case DT_UINT32:
+      *out = {DT_INT64, {}};
+      out->scalar<int64>()() = t.scalar<uint32>()();
+      break;
+    case DT_UINT16:
+      *out = {DT_INT64, {}};
+      out->scalar<int64>()() = t.scalar<uint16>()();
+      break;
+    case DT_UINT8:
+      *out = {DT_INT64, {}};
+      out->scalar<int64>()() = t.scalar<uint8>()();
+      break;
+    default:
+      return errors::Unimplemented("Scalar summary for dtype ",
+                                   DataTypeString(t.dtype()),
+                                   " is not supported.");
+  }
+  return Status::OK();
+}
+
+/// \brief Generates unique IDs randomly in the [1,2**63-2] range.
+///
+/// This class starts off generating IDs in the [1,2**23-1] range,
+/// because it's human friendly and occupies 4 bytes max on disk with
+/// SQLite's zigzag varint encoding. Then, each time a collision
+/// happens, the random space is increased by 8 bits.
+///
+/// This class uses exponential back-off so writes will slow down as
+/// the ID space becomes exhausted.
+class IdAllocator {
  public:
-  explicit Transactor(std::shared_ptr<Sqlite> db)
-      : db_(std::move(db)),
-        begin_(db_->Prepare("BEGIN TRANSACTION")),
-        commit_(db_->Prepare("COMMIT TRANSACTION")),
-        rollback_(db_->Prepare("ROLLBACK TRANSACTION")) {}
-
-  template <typename T, typename... Args>
-  Status Transact(T callback, Args&&... args) {
-    TF_RETURN_IF_ERROR(begin_.StepAndReset());
-    Status s = callback(std::forward<Args>(args)...);
-    if (s.ok()) {
-      TF_RETURN_IF_ERROR(commit_.StepAndReset());
-    } else {
-      TF_RETURN_WITH_CONTEXT_IF_ERROR(rollback_.StepAndReset(), s.ToString());
+  IdAllocator(Env* env, Sqlite* db)
+      : env_{env}, inserter_{db->Prepare("INSERT INTO Ids (id) VALUES (?)")} {}
+
+  Status CreateNewId(int64* id) {
+    Status s;
+    for (int i = 0; i < kMaxIdCollisions; ++i) {
+      int64 tid = MakeRandomId();
+      inserter_.BindInt(1, tid);
+      s = inserter_.StepAndReset();
+      if (s.ok()) {
+        *id = tid;
+        break;
+      }
+      // SQLITE_CONSTRAINT maps to INVALID_ARGUMENT in sqlite.cc
+      if (s.code() != error::INVALID_ARGUMENT) break;
+      if (tier_ < kMaxIdTier) {
+        LOG(INFO) << "IdAllocator collision at tier " << tier_ << " (of "
+                  << kMaxIdTier << ") so auto-adjusting to a higher tier";
+        ++tier_;
+      } else {
+        LOG(WARNING) << "IdAllocator (attempt #" << i << ") "
+                     << "resulted in a collision at the highest tier; this "
+                        "is problematic if it happens often; you can try "
+                        "pruning the Ids table; you can also file a bug "
+                        "asking for the ID space to be increased; otherwise "
+                        "writes will gradually slow down over time until they "
+                        "become impossible";
+      }
+      env_->SleepForMicroseconds((1 << i) * kIdCollisionDelayMicros);
     }
     return s;
   }
 
  private:
-  std::shared_ptr<Sqlite> db_;
-  SqliteStatement begin_;
-  SqliteStatement commit_;
-  SqliteStatement rollback_;
+  int64 MakeRandomId() {
+    int64 id = static_cast<int64>(random::New64() & kIdTiers[tier_]);
+    if (id == kAbsent) ++id;
+    if (id == kReserved) --id;
+    return id;
+  }
+
+  Env* env_;
+  SqliteStatement inserter_;
+  int tier_ = 0;
 };
 
 class GraphSaver {
  public:
-  static Status SaveToRun(Env* env, Sqlite* db, GraphDef* graph, int64 run_id) {
-    auto get = db->Prepare("SELECT graph_id FROM Runs WHERE run_id = ?");
-    get.BindInt(1, run_id);
-    bool is_done;
-    TF_RETURN_IF_ERROR(get.Step(&is_done));
-    int64 graph_id = is_done ? 0 : get.ColumnInt(0);
-    if (graph_id == 0) {
-      graph_id = MakeRandomId();
-      // TODO(@jart): Check for ID collision.
-      auto set = db->Prepare("UPDATE Runs SET graph_id = ? WHERE run_id = ?");
-      set.BindInt(1, graph_id);
-      set.BindInt(2, run_id);
-      TF_RETURN_IF_ERROR(set.StepAndReset());
-    }
-    return Save(env, db, graph, graph_id);
-  }
-
-  static Status Save(Env* env, Sqlite* db, GraphDef* graph, int64 graph_id) {
-    GraphSaver saver{env, db, graph, graph_id};
+  static Status Save(Env* env, Sqlite* db, IdAllocator* id_allocator,
+                     GraphDef* graph, int64* graph_id) {
+    TF_RETURN_IF_ERROR(id_allocator->CreateNewId(graph_id));
+    GraphSaver saver{env, db, graph, *graph_id};
     saver.MapNameToNodeId();
     TF_RETURN_IF_ERROR(saver.SaveNodeInputs());
     TF_RETURN_IF_ERROR(saver.SaveNodes());
@@ -153,9 +220,6 @@ class GraphSaver {
   }
 
   Status SaveNodeInputs() {
-    auto purge = db_->Prepare("DELETE FROM NodeInputs WHERE graph_id = ?");
-    purge.BindInt(1, graph_id_);
-    TF_RETURN_IF_ERROR(purge.StepAndReset());
     auto insert = db_->Prepare(R"sql(
       INSERT INTO NodeInputs (graph_id, node_id, idx, input_node_id, is_control)
       VALUES (?, ?, ?, ?, ?)
@@ -184,9 +248,6 @@ class GraphSaver {
   }
 
   Status SaveNodes() {
-    auto purge = db_->Prepare("DELETE FROM Nodes WHERE graph_id = ?");
-    purge.BindInt(1, graph_id_);
-    TF_RETURN_IF_ERROR(purge.StepAndReset());
     auto insert = db_->Prepare(R"sql(
       INSERT INTO Nodes (graph_id, node_id, node_name, op, device, node_def)
       VALUES (?, ?, ?, ?, ?, ?)
@@ -214,7 +275,7 @@ class GraphSaver {
 
   Status SaveGraph() {
     auto insert = db_->Prepare(R"sql(
-      INSERT OR REPLACE INTO Graphs (graph_id, inserted_time, graph_def)
+      INSERT INTO Graphs (graph_id, inserted_time, graph_def)
       VALUES (?, ?, ?)
     )sql");
     insert.BindInt(1, graph_id_);
@@ -229,95 +290,292 @@ class GraphSaver {
   GraphDef* graph_;
   int64 graph_id_;
   std::vector<string> name_copies_;
-  std::unordered_map<StringPiece, int64, StringPiece::Hasher> name_to_node_id_;
+  std::unordered_map<StringPiece, int64, StringPieceHasher> name_to_node_id_;
 };
 
-class SummaryDbWriter : public SummaryWriterInterface {
+class RunWriter {
  public:
-  SummaryDbWriter(Env* env, std::shared_ptr<Sqlite> db)
-      : SummaryWriterInterface(),
-        env_(env),
-        db_(std::move(db)),
-        txn_(db_),
-        run_id_{0LL} {}
-  ~SummaryDbWriter() override {}
+  RunWriter(Env* env, std::shared_ptr<Sqlite> db, const string& experiment_name,
+            const string& run_name, const string& user_name)
+      : env_{env},
+        db_{std::move(db)},
+        id_allocator_{env_, db_.get()},
+        experiment_name_{experiment_name},
+        run_name_{run_name},
+        user_name_{user_name},
+        insert_tensor_{db_->Prepare(R"sql(
+          INSERT OR REPLACE INTO Tensors (tag_id, step, computed_time, tensor)
+          VALUES (?, ?, ?, ?)
+        )sql")} {}
+
+  ~RunWriter() {
+    if (run_id_ == kAbsent) return;
+    auto update = db_->Prepare(R"sql(
+      UPDATE Runs SET finished_time = ? WHERE run_id = ?
+    )sql");
+    update.BindDouble(1, GetWallTime(env_));
+    update.BindInt(2, run_id_);
+    Status s = update.StepAndReset();
+    if (!s.ok()) {
+      LOG(ERROR) << "Failed to set Runs[" << run_id_
+                 << "].finish_time: " << s.ToString();
+    }
+  }
 
-  Status Initialize(const string& experiment_name, const string& run_name,
-                    const string& user_name) {
-    mutex_lock ml(mu_);
-    insert_tensor_ = db_->Prepare(R"sql(
-      INSERT OR REPLACE INTO Tensors (tag_id, step, computed_time, tensor)
-      VALUES (?, ?, ?, ?)
+  Status InsertTensor(int64 tag_id, int64 step, double computed_time,
+                      Tensor t) {
+    insert_tensor_.BindInt(1, tag_id);
+    insert_tensor_.BindInt(2, step);
+    insert_tensor_.BindDouble(3, computed_time);
+    if (t.shape().dims() == 0 && t.dtype() == DT_INT64) {
+      insert_tensor_.BindInt(4, t.scalar<int64>()());
+    } else if (t.shape().dims() == 0 && t.dtype() == DT_DOUBLE) {
+      insert_tensor_.BindDouble(4, t.scalar<double>()());
+    } else {
+      TF_RETURN_IF_ERROR(BindTensor(&insert_tensor_, 4, t));
+    }
+    return insert_tensor_.StepAndReset();
+  }
+
+  Status InsertGraph(std::unique_ptr<GraphDef> g, double computed_time) {
+    TF_RETURN_IF_ERROR(InitializeRun(computed_time));
+    int64 graph_id;
+    TF_RETURN_IF_ERROR(
+        GraphSaver::Save(env_, db_.get(), &id_allocator_, g.get(), &graph_id));
+    if (run_id_ != kAbsent) {
+      auto set = db_->Prepare("UPDATE Runs SET graph_id = ? WHERE run_id = ?");
+      set.BindInt(1, graph_id);
+      set.BindInt(2, run_id_);
+      TF_RETURN_IF_ERROR(set.StepAndReset());
+    }
+    return Status::OK();
+  }
+
+  Status GetTagId(double computed_time, const string& tag_name,
+                  const SummaryMetadata& metadata, int64* tag_id) {
+    TF_RETURN_IF_ERROR(InitializeRun(computed_time));
+    auto e = tag_ids_.find(tag_name);
+    if (e != tag_ids_.end()) {
+      *tag_id = e->second;
+      return Status::OK();
+    }
+    TF_RETURN_IF_ERROR(id_allocator_.CreateNewId(tag_id));
+    tag_ids_[tag_name] = *tag_id;
+    if (!metadata.summary_description().empty()) {
+      SqliteStatement insert_description = db_->Prepare(R"sql(
+        INSERT INTO Descriptions (id, description) VALUES (?, ?)
+      )sql");
+      insert_description.BindInt(1, *tag_id);
+      insert_description.BindText(2, metadata.summary_description());
+      TF_RETURN_IF_ERROR(insert_description.StepAndReset());
+    }
+    SqliteStatement insert = db_->Prepare(R"sql(
+      INSERT INTO Tags (
+        run_id,
+        tag_id,
+        tag_name,
+        inserted_time,
+        display_name,
+        plugin_name,
+        plugin_data
+      ) VALUES (?, ?, ?, ?, ?, ?, ?)
+    )sql");
+    if (run_id_ != kAbsent) insert.BindInt(1, run_id_);
+    insert.BindInt(2, *tag_id);
+    insert.BindText(3, tag_name);
+    insert.BindDouble(4, GetWallTime(env_));
+    if (!metadata.display_name().empty()) {
+      insert.BindText(5, metadata.display_name());
+    }
+    if (!metadata.plugin_data().plugin_name().empty()) {
+      insert.BindText(6, metadata.plugin_data().plugin_name());
+    }
+    if (!metadata.plugin_data().content().empty()) {
+      insert.BindBlob(7, metadata.plugin_data().content());
+    }
+    return insert.StepAndReset();
+  }
+
+ private:
+  Status InitializeUser() {
+    if (user_id_ != kAbsent || user_name_.empty()) return Status::OK();
+    SqliteStatement get = db_->Prepare(R"sql(
+      SELECT user_id FROM Users WHERE user_name = ?
     )sql");
-    update_metadata_ = db_->Prepare(R"sql(
-      UPDATE Tags SET metadata = ? WHERE tag_id = ?
+    get.BindText(1, user_name_);
+    bool is_done;
+    TF_RETURN_IF_ERROR(get.Step(&is_done));
+    if (!is_done) {
+      user_id_ = get.ColumnInt(0);
+      return Status::OK();
+    }
+    TF_RETURN_IF_ERROR(id_allocator_.CreateNewId(&user_id_));
+    SqliteStatement insert = db_->Prepare(R"sql(
+      INSERT INTO Users (user_id, user_name, inserted_time) VALUES (?, ?, ?)
     )sql");
-    experiment_name_ = experiment_name;
-    run_name_ = run_name;
-    user_name_ = user_name;
+    insert.BindInt(1, user_id_);
+    insert.BindText(2, user_name_);
+    insert.BindDouble(3, GetWallTime(env_));
+    TF_RETURN_IF_ERROR(insert.StepAndReset());
+    return Status::OK();
+  }
+
+  Status InitializeExperiment(double computed_time) {
+    if (experiment_name_.empty()) return Status::OK();
+    if (experiment_id_ == kAbsent) {
+      TF_RETURN_IF_ERROR(InitializeUser());
+      SqliteStatement get = db_->Prepare(R"sql(
+        SELECT
+          experiment_id,
+          started_time
+        FROM
+          Experiments
+        WHERE
+          user_id IS ?
+          AND experiment_name = ?
+      )sql");
+      if (user_id_ != kAbsent) get.BindInt(1, user_id_);
+      get.BindText(2, experiment_name_);
+      bool is_done;
+      TF_RETURN_IF_ERROR(get.Step(&is_done));
+      if (!is_done) {
+        experiment_id_ = get.ColumnInt(0);
+        experiment_started_time_ = get.ColumnInt(1);
+      } else {
+        TF_RETURN_IF_ERROR(id_allocator_.CreateNewId(&experiment_id_));
+        experiment_started_time_ = computed_time;
+        SqliteStatement insert = db_->Prepare(R"sql(
+          INSERT INTO Experiments (
+            user_id,
+            experiment_id,
+            experiment_name,
+            inserted_time,
+            started_time
+          ) VALUES (?, ?, ?, ?, ?)
+        )sql");
+        if (user_id_ != kAbsent) insert.BindInt(1, user_id_);
+        insert.BindInt(2, experiment_id_);
+        insert.BindText(3, experiment_name_);
+        insert.BindDouble(4, GetWallTime(env_));
+        insert.BindDouble(5, computed_time);
+        TF_RETURN_IF_ERROR(insert.StepAndReset());
+      }
+    }
+    if (computed_time < experiment_started_time_) {
+      experiment_started_time_ = computed_time;
+      SqliteStatement update = db_->Prepare(R"sql(
+        UPDATE Experiments SET started_time = ? WHERE experiment_id = ?
+      )sql");
+      update.BindDouble(1, computed_time);
+      update.BindInt(2, experiment_id_);
+      TF_RETURN_IF_ERROR(update.StepAndReset());
+    }
+    return Status::OK();
+  }
+
+  Status InitializeRun(double computed_time) {
+    if (run_name_.empty()) return Status::OK();
+    TF_RETURN_IF_ERROR(InitializeExperiment(computed_time));
+    if (run_id_ == kAbsent) {
+      TF_RETURN_IF_ERROR(id_allocator_.CreateNewId(&run_id_));
+      run_started_time_ = computed_time;
+      SqliteStatement insert = db_->Prepare(R"sql(
+        INSERT OR REPLACE INTO Runs (
+          experiment_id,
+          run_id,
+          run_name,
+          inserted_time,
+          started_time
+        ) VALUES (?, ?, ?, ?, ?)
+      )sql");
+      if (experiment_id_ != kAbsent) insert.BindInt(1, experiment_id_);
+      insert.BindInt(2, run_id_);
+      insert.BindText(3, run_name_);
+      insert.BindDouble(4, GetWallTime(env_));
+      insert.BindDouble(5, computed_time);
+      TF_RETURN_IF_ERROR(insert.StepAndReset());
+    }
+    if (computed_time < run_started_time_) {
+      run_started_time_ = computed_time;
+      SqliteStatement update = db_->Prepare(R"sql(
+        UPDATE Runs SET started_time = ? WHERE run_id = ?
+      )sql");
+      update.BindDouble(1, computed_time);
+      update.BindInt(2, run_id_);
+      TF_RETURN_IF_ERROR(update.StepAndReset());
+    }
     return Status::OK();
   }
 
-  // TODO(@jart): Use transactions that COMMIT on Flush()
-  // TODO(@jart): Retry Commit() on SQLITE_BUSY with exponential back-off.
+  Env* env_;
+  std::shared_ptr<Sqlite> db_;
+  IdAllocator id_allocator_;
+  const string experiment_name_;
+  const string run_name_;
+  const string user_name_;
+  int64 experiment_id_ = kAbsent;
+  int64 run_id_ = kAbsent;
+  int64 user_id_ = kAbsent;
+  std::unordered_map<string, int64> tag_ids_;
+  double experiment_started_time_ = 0.0;
+  double run_started_time_ = 0.0;
+  SqliteStatement insert_tensor_;
+};
+
+class SummaryDbWriter : public SummaryWriterInterface {
+ public:
+  SummaryDbWriter(Env* env, std::shared_ptr<Sqlite> db,
+                  const string& experiment_name, const string& run_name,
+                  const string& user_name)
+      : SummaryWriterInterface(),
+        env_{env},
+        run_writer_{env, std::move(db), experiment_name, run_name, user_name} {}
+  ~SummaryDbWriter() override {}
+
   Status Flush() override { return Status::OK(); }
 
   Status WriteTensor(int64 global_step, Tensor t, const string& tag,
                      const string& serialized_metadata) override {
     mutex_lock ml(mu_);
-    TF_RETURN_IF_ERROR(InitializeParents());
-    // TODO(@jart): Memoize tag_id.
-    int64 tag_id;
-    TF_RETURN_IF_ERROR(GetTagId(run_id_, tag, &tag_id));
+    SummaryMetadata metadata;
     if (!serialized_metadata.empty()) {
-      // TODO(@jart): Only update metadata for first tensor.
-      update_metadata_.BindBlobUnsafe(1, serialized_metadata);
-      update_metadata_.BindInt(2, tag_id);
-      TF_RETURN_IF_ERROR(update_metadata_.StepAndReset());
-    }
-    // TODO(@jart): Lease blocks of rowids and *_ids to minimize fragmentation.
-    // TODO(@jart): Check for random ID collisions without needing txn retry.
-    insert_tensor_.BindInt(1, tag_id);
-    insert_tensor_.BindInt(2, global_step);
-    insert_tensor_.BindDouble(3, GetWallTime(env_));
-    switch (t.dtype()) {
-      case DT_INT64:
-        insert_tensor_.BindInt(4, t.scalar<int64>()());
-        break;
-      case DT_DOUBLE:
-        insert_tensor_.BindDouble(4, t.scalar<double>()());
-        break;
-      default:
-        TF_RETURN_IF_ERROR(BindTensor(&insert_tensor_, 4, t));
-        break;
+      metadata.ParseFromString(serialized_metadata);
     }
-    return insert_tensor_.StepAndReset();
+    double now = GetWallTime(env_);
+    int64 tag_id;
+    TF_RETURN_IF_ERROR(run_writer_.GetTagId(now, tag, metadata, &tag_id));
+    return run_writer_.InsertTensor(tag_id, global_step, now, t);
+  }
+
+  Status WriteScalar(int64 global_step, Tensor t, const string& tag) override {
+    Tensor t2;
+    TF_RETURN_IF_ERROR(CoerceScalar(t, &t2));
+    // TODO(jart): Generate scalars plugin metadata on this value.
+    return WriteTensor(global_step, std::move(t2), tag, "");
   }
 
   Status WriteGraph(int64 global_step, std::unique_ptr<GraphDef> g) override {
     mutex_lock ml(mu_);
-    TF_RETURN_IF_ERROR(InitializeParents());
-    return txn_.Transact(GraphSaver::SaveToRun, env_, db_.get(), g.get(),
-                         run_id_);
+    return run_writer_.InsertGraph(std::move(g), GetWallTime(env_));
   }
 
   Status WriteEvent(std::unique_ptr<Event> e) override {
     switch (e->what_case()) {
       case Event::WhatCase::kSummary: {
         mutex_lock ml(mu_);
-        TF_RETURN_IF_ERROR(InitializeParents());
-        const Summary& summary = e->summary();
-        for (int i = 0; i < summary.value_size(); ++i) {
-          TF_RETURN_IF_ERROR(WriteSummary(e.get(), summary.value(i)));
+        Status s;
+        for (const auto& value : e->summary().value()) {
+          s.Update(WriteSummary(e.get(), value));
         }
-        return Status::OK();
+        return s;
       }
       case Event::WhatCase::kGraphDef: {
+        mutex_lock ml(mu_);
         std::unique_ptr<GraphDef> graph{new GraphDef};
         if (!ParseProtoUnlimited(graph.get(), e->graph_def())) {
           return errors::DataLoss("parse event.graph_def failed");
         }
-        return WriteGraph(e->step(), std::move(graph));
+        return run_writer_.InsertGraph(std::move(graph), e->wall_time());
       }
       default:
         // TODO(@jart): Handle other stuff.
@@ -325,15 +583,6 @@ class SummaryDbWriter : public SummaryWriterInterface {
     }
   }
 
-  Status WriteScalar(int64 global_step, Tensor t, const string& tag) override {
-    // TODO(@jart): Unlike WriteTensor, this method would be granted leniency
-    //              to change the dtype if it saves storage space. For example,
-    //              DT_UINT32 would be stored in the database as an INTEGER
-    //              rather than a serialized BLOB. But when reading it back,
-    //              the dtype would become DT_INT64.
-    return errors::Unimplemented("WriteScalar");
-  }
-
   Status WriteHistogram(int64 global_step, Tensor t,
                         const string& tag) override {
     return errors::Unimplemented(
@@ -358,128 +607,26 @@ class SummaryDbWriter : public SummaryWriterInterface {
   string DebugString() override { return "SummaryDbWriter"; }
 
  private:
-  Status InitializeParents() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    if (run_id_ > 0) {
-      return Status::OK();
-    }
-    int64 user_id;
-    TF_RETURN_IF_ERROR(GetUserId(user_name_, &user_id));
-    int64 experiment_id;
-    TF_RETURN_IF_ERROR(
-        GetExperimentId(user_id, experiment_name_, &experiment_id));
-    TF_RETURN_IF_ERROR(GetRunId(experiment_id, run_name_, &run_id_));
-    return Status::OK();
-  }
-
-  Status GetUserId(const string& user_name, int64* user_id)
-      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    if (user_name.empty()) {
-      *user_id = 0LL;
-      return Status::OK();
-    }
-    SqliteStatement get_user_id = db_->Prepare(R"sql(
-      SELECT user_id FROM Users WHERE user_name = ?
-    )sql");
-    get_user_id.BindText(1, user_name);
-    bool is_done;
-    TF_RETURN_IF_ERROR(get_user_id.Step(&is_done));
-    if (!is_done) {
-      *user_id = get_user_id.ColumnInt(0);
-    } else {
-      *user_id = MakeRandomId();
-      SqliteStatement insert_user = db_->Prepare(R"sql(
-        INSERT INTO Users (user_id, user_name, inserted_time) VALUES (?, ?, ?)
-      )sql");
-      insert_user.BindInt(1, *user_id);
-      insert_user.BindText(2, user_name);
-      insert_user.BindDouble(3, GetWallTime(env_));
-      TF_RETURN_IF_ERROR(insert_user.StepAndReset());
-    }
-    return Status::OK();
-  }
-
-  Status GetExperimentId(int64 user_id, const string& experiment_name,
-                         int64* experiment_id) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    // TODO(@jart): Compute started_time.
-    return GetId("Experiments", "user_id", user_id, "experiment_name",
-                 experiment_name, "experiment_id", experiment_id);
-  }
-
-  Status GetRunId(int64 experiment_id, const string& run_name, int64* run_id)
-      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    // TODO(@jart): Compute started_time.
-    return GetId("Runs", "experiment_id", experiment_id, "run_name", run_name,
-                 "run_id", run_id);
-  }
-
-  Status GetTagId(int64 run_id, const string& tag_name, int64* tag_id)
-      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    return GetId("Tags", "run_id", run_id, "tag_name", tag_name, "tag_id",
-                 tag_id);
-  }
-
-  Status GetId(const char* table, const char* parent_id_field, int64 parent_id,
-               const char* name_field, const string& name, const char* id_field,
-               int64* id) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    if (name.empty()) {
-      *id = 0LL;
-      return Status::OK();
-    }
-    SqliteStatement select = db_->Prepare(
-        strings::Printf("SELECT %s FROM %s WHERE %s = ? AND %s = ?", id_field,
-                        table, parent_id_field, name_field));
-    if (parent_id > 0) {
-      select.BindInt(1, parent_id);
-    }
-    select.BindText(2, name);
-    bool is_done;
-    TF_RETURN_IF_ERROR(select.Step(&is_done));
-    if (!is_done) {
-      *id = select.ColumnInt(0);
-    } else {
-      *id = MakeRandomId();
-      SqliteStatement insert = db_->Prepare(strings::Printf(
-          "INSERT INTO %s (%s, %s, %s, inserted_time) VALUES (?, ?, ?, ?)",
-          table, parent_id_field, id_field, name_field));
-      if (parent_id > 0) {
-        insert.BindInt(1, parent_id);
-      }
-      insert.BindInt(2, *id);
-      insert.BindText(3, name);
-      insert.BindDouble(4, GetWallTime(env_));
-      TF_RETURN_IF_ERROR(insert.StepAndReset());
-    }
-    return Status::OK();
-  }
-
   Status WriteSummary(const Event* e, const Summary::Value& summary)
       EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    int64 tag_id;
-    TF_RETURN_IF_ERROR(GetTagId(run_id_, summary.tag(), &tag_id));
-    insert_tensor_.BindInt(1, tag_id);
-    insert_tensor_.BindInt(2, e->step());
-    insert_tensor_.BindDouble(3, e->wall_time());
     switch (summary.value_case()) {
-      case Summary::Value::ValueCase::kSimpleValue:
-        insert_tensor_.BindDouble(4, summary.simple_value());
-        break;
+      case Summary::Value::ValueCase::kSimpleValue: {
+        int64 tag_id;
+        TF_RETURN_IF_ERROR(run_writer_.GetTagId(e->wall_time(), summary.tag(),
+                                                summary.metadata(), &tag_id));
+        Tensor t{DT_DOUBLE, {}};
+        t.scalar<double>()() = summary.simple_value();
+        return run_writer_.InsertTensor(tag_id, e->step(), e->wall_time(), t);
+      }
       default:
         // TODO(@jart): Handle the rest.
         return Status::OK();
     }
-    return insert_tensor_.StepAndReset();
   }
 
   mutex mu_;
   Env* env_;
-  std::shared_ptr<Sqlite> db_ GUARDED_BY(mu_);
-  Transactor txn_ GUARDED_BY(mu_);
-  SqliteStatement insert_tensor_ GUARDED_BY(mu_);
-  SqliteStatement update_metadata_ GUARDED_BY(mu_);
-  string user_name_ GUARDED_BY(mu_);
-  string experiment_name_ GUARDED_BY(mu_);
-  string run_name_ GUARDED_BY(mu_);
-  int64 run_id_ GUARDED_BY(mu_);
+  RunWriter run_writer_ GUARDED_BY(mu_);
 };
 
 }  // namespace
@@ -489,14 +636,8 @@ Status CreateSummaryDbWriter(std::shared_ptr<Sqlite> db,
                              const string& run_name, const string& user_name,
                              Env* env, SummaryWriterInterface** result) {
   TF_RETURN_IF_ERROR(SetupTensorboardSqliteDb(db));
-  SummaryDbWriter* w = new SummaryDbWriter(env, std::move(db));
-  const Status s = w->Initialize(experiment_name, run_name, user_name);
-  if (!s.ok()) {
-    w->Unref();
-    *result = nullptr;
-    return s;
-  }
-  *result = w;
+  *result = new SummaryDbWriter(env, std::move(db), experiment_name, run_name,
+                                user_name);
   return Status::OK();
 }
 
diff --git a/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc b/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc
index 3431842ca212435f02bbc7f725c6a0d46d54bc5f..5ea844b6685d15ac4c0549816770060c6f25ce38 100644
--- a/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc
@@ -101,6 +101,7 @@ TEST_F(SummaryDbWriterTest, NothingWritten_NoRowsCreated) {
   TF_ASSERT_OK(writer_->Flush());
   writer_->Unref();
   writer_ = nullptr;
+  EXPECT_EQ(0LL, QueryInt("SELECT COUNT(*) FROM Ids"));
   EXPECT_EQ(0LL, QueryInt("SELECT COUNT(*) FROM Users"));
   EXPECT_EQ(0LL, QueryInt("SELECT COUNT(*) FROM Experiments"));
   EXPECT_EQ(0LL, QueryInt("SELECT COUNT(*) FROM Runs"));
@@ -109,13 +110,24 @@ TEST_F(SummaryDbWriterTest, NothingWritten_NoRowsCreated) {
 }
 
 TEST_F(SummaryDbWriterTest, TensorsWritten_RowsGetInitialized) {
+  SummaryMetadata metadata;
+  metadata.set_display_name("display_name");
+  metadata.set_summary_description("description");
+  metadata.mutable_plugin_data()->set_plugin_name("plugin_name");
+  metadata.mutable_plugin_data()->set_content("plugin_data");
+  SummaryMetadata metadata_nope;
+  metadata_nope.set_display_name("nope");
+  metadata_nope.set_summary_description("nope");
+  metadata_nope.mutable_plugin_data()->set_plugin_name("nope");
+  metadata_nope.mutable_plugin_data()->set_content("nope");
   TF_ASSERT_OK(CreateSummaryDbWriter(db_, "mad-science", "train", "jart", &env_,
                                      &writer_));
   env_.AdvanceByMillis(23);
   TF_ASSERT_OK(writer_->WriteTensor(1, MakeScalarInt64(123LL), "taggy",
-                                    "this-is-metaaa"));
+                                    metadata.SerializeAsString()));
   env_.AdvanceByMillis(23);
-  TF_ASSERT_OK(writer_->WriteTensor(2, MakeScalarInt64(314LL), "taggy", ""));
+  TF_ASSERT_OK(writer_->WriteTensor(2, MakeScalarInt64(314LL), "taggy",
+                                    metadata_nope.SerializeAsString()));
   TF_ASSERT_OK(writer_->Flush());
 
   ASSERT_EQ(1LL, QueryInt("SELECT COUNT(*) FROM Users"));
@@ -148,27 +160,28 @@ TEST_F(SummaryDbWriterTest, TensorsWritten_RowsGetInitialized) {
   EXPECT_EQ(run_id, QueryInt("SELECT run_id FROM Tags"));
   EXPECT_EQ("taggy", QueryString("SELECT tag_name FROM Tags"));
   EXPECT_EQ(0.023, QueryDouble("SELECT inserted_time FROM Tags"));
-  EXPECT_EQ("this-is-metaaa", QueryString("SELECT metadata FROM Tags"));
+
+  EXPECT_EQ("display_name", QueryString("SELECT display_name FROM Tags"));
+  EXPECT_EQ("plugin_name", QueryString("SELECT plugin_name FROM Tags"));
+  EXPECT_EQ("plugin_data", QueryString("SELECT plugin_data FROM Tags"));
+  EXPECT_EQ("description", QueryString("SELECT description FROM Descriptions"));
 
   EXPECT_EQ(tag_id, QueryInt("SELECT tag_id FROM Tensors WHERE step = 1"));
   EXPECT_EQ(0.023,
             QueryDouble("SELECT computed_time FROM Tensors WHERE step = 1"));
-  EXPECT_EQ("this-is-metaaa", QueryString("SELECT metadata FROM Tags"));
   EXPECT_FALSE(
       QueryString("SELECT tensor FROM Tensors WHERE step = 1").empty());
 
   EXPECT_EQ(tag_id, QueryInt("SELECT tag_id FROM Tensors WHERE step = 2"));
   EXPECT_EQ(0.046,
             QueryDouble("SELECT computed_time FROM Tensors WHERE step = 2"));
-  EXPECT_EQ("this-is-metaaa", QueryString("SELECT metadata FROM Tags"));
   EXPECT_FALSE(
       QueryString("SELECT tensor FROM Tensors WHERE step = 2").empty());
 }
 
 TEST_F(SummaryDbWriterTest, EmptyParentNames_NoParentsCreated) {
   TF_ASSERT_OK(CreateSummaryDbWriter(db_, "", "", "", &env_, &writer_));
-  TF_ASSERT_OK(writer_->WriteTensor(1, MakeScalarInt64(123LL), "taggy",
-                                    "this-is-metaaa"));
+  TF_ASSERT_OK(writer_->WriteTensor(1, MakeScalarInt64(123LL), "taggy", ""));
   TF_ASSERT_OK(writer_->Flush());
   ASSERT_EQ(0LL, QueryInt("SELECT COUNT(*) FROM Users"));
   ASSERT_EQ(0LL, QueryInt("SELECT COUNT(*) FROM Experiments"));
@@ -290,5 +303,66 @@ TEST_F(SummaryDbWriterTest, WriteGraph) {
   EXPECT_EQ(1LL, QueryInt("SELECT is_control FROM NodeInputs WHERE idx = 2"));
 }
 
+TEST_F(SummaryDbWriterTest, WriteScalarInt32_CoercesToInt64) {
+  TF_ASSERT_OK(CreateSummaryDbWriter(db_, "", "", "", &env_, &writer_));
+  Tensor t(DT_INT32, {});
+  t.scalar<int32>()() = -17;
+  TF_ASSERT_OK(writer_->WriteScalar(1, t, "t"));
+  TF_ASSERT_OK(writer_->Flush());
+  ASSERT_EQ(-17LL, QueryInt("SELECT tensor FROM Tensors"));
+}
+
+TEST_F(SummaryDbWriterTest, WriteScalarInt8_CoercesToInt64) {
+  TF_ASSERT_OK(CreateSummaryDbWriter(db_, "", "", "", &env_, &writer_));
+  Tensor t(DT_INT8, {});
+  t.scalar<int8>()() = static_cast<int8>(-17);
+  TF_ASSERT_OK(writer_->WriteScalar(1, t, "t"));
+  TF_ASSERT_OK(writer_->Flush());
+  ASSERT_EQ(-17LL, QueryInt("SELECT tensor FROM Tensors"));
+}
+
+TEST_F(SummaryDbWriterTest, WriteScalarUint8_CoercesToInt64) {
+  TF_ASSERT_OK(CreateSummaryDbWriter(db_, "", "", "", &env_, &writer_));
+  Tensor t(DT_UINT8, {});
+  t.scalar<uint8>()() = static_cast<uint8>(254);
+  TF_ASSERT_OK(writer_->WriteScalar(1, t, "t"));
+  TF_ASSERT_OK(writer_->Flush());
+  ASSERT_EQ(254LL, QueryInt("SELECT tensor FROM Tensors"));
+}
+
+TEST_F(SummaryDbWriterTest, UsesIdsTable) {
+  SummaryMetadata metadata;
+  TF_ASSERT_OK(CreateSummaryDbWriter(db_, "mad-science", "train", "jart", &env_,
+                                     &writer_));
+  env_.AdvanceByMillis(23);
+  TF_ASSERT_OK(writer_->WriteTensor(1, MakeScalarInt64(123LL), "taggy",
+                                    metadata.SerializeAsString()));
+  TF_ASSERT_OK(writer_->Flush());
+  ASSERT_EQ(4LL, QueryInt("SELECT COUNT(*) FROM Ids"));
+  EXPECT_EQ(4LL, QueryInt(strings::StrCat(
+                     "SELECT COUNT(*) FROM Ids WHERE id IN (",
+                     QueryInt("SELECT user_id FROM Users"), ", ",
+                     QueryInt("SELECT experiment_id FROM Experiments"), ", ",
+                     QueryInt("SELECT run_id FROM Runs"), ", ",
+                     QueryInt("SELECT tag_id FROM Tags"), ")")));
+}
+
+TEST_F(SummaryDbWriterTest, SetsRunFinishedTime) {
+  SummaryMetadata metadata;
+  TF_ASSERT_OK(CreateSummaryDbWriter(db_, "mad-science", "train", "jart", &env_,
+                                     &writer_));
+  env_.AdvanceByMillis(23);
+  TF_ASSERT_OK(writer_->WriteTensor(1, MakeScalarInt64(123LL), "taggy",
+                                    metadata.SerializeAsString()));
+  TF_ASSERT_OK(writer_->Flush());
+  ASSERT_EQ(0.023, QueryDouble("SELECT started_time FROM Runs"));
+  ASSERT_EQ(0.0, QueryDouble("SELECT finished_time FROM Runs"));
+  env_.AdvanceByMillis(23);
+  writer_->Unref();
+  writer_ = nullptr;
+  ASSERT_EQ(0.023, QueryDouble("SELECT started_time FROM Runs"));
+  ASSERT_EQ(0.046, QueryDouble("SELECT finished_time FROM Runs"));
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/timeseries/examples/BUILD b/tensorflow/contrib/timeseries/examples/BUILD
index 755b0657e9fb29c167911407cee340ac7e3e9b7a..bb86ecb2209f9bed3ad6c37f4b23bc7b361e1bd6 100644
--- a/tensorflow/contrib/timeseries/examples/BUILD
+++ b/tensorflow/contrib/timeseries/examples/BUILD
@@ -103,6 +103,7 @@ py_test(
     deps = [
         ":lstm",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python/estimator:estimator_py",
     ],
 )
 
diff --git a/tensorflow/contrib/timeseries/examples/lstm.py b/tensorflow/contrib/timeseries/examples/lstm.py
index 3ba823f638da8f750981bc910d960706ff652fb7..c7193cef6915f9d0caf5b52fc084129cbc736994 100644
--- a/tensorflow/contrib/timeseries/examples/lstm.py
+++ b/tensorflow/contrib/timeseries/examples/lstm.py
@@ -165,12 +165,13 @@ class _LSTMModel(ts_model.SequentialTimeSeriesModel):
         "Exogenous inputs are not implemented for this example.")
 
 
-def train_and_predict(csv_file_name=_DATA_FILE, training_steps=200):
+def train_and_predict(
+    csv_file_name=_DATA_FILE, training_steps=200, estimator_config=None):
   """Train and predict using a custom time series model."""
   # Construct an Estimator from our LSTM model.
   estimator = ts_estimators.TimeSeriesRegressor(
       model=_LSTMModel(num_features=5, num_units=128),
-      optimizer=tf.train.AdamOptimizer(0.001))
+      optimizer=tf.train.AdamOptimizer(0.001), config=estimator_config)
   reader = tf.contrib.timeseries.CSVReader(
       csv_file_name,
       column_names=((tf.contrib.timeseries.TrainEvalFeatures.TIMES,)
diff --git a/tensorflow/contrib/timeseries/examples/lstm_test.py b/tensorflow/contrib/timeseries/examples/lstm_test.py
index 56daa1e10d9d1e7e96d71f33afc72671512dbaf8..3cace567266d497b12d836f44a335bbe5d916949 100644
--- a/tensorflow/contrib/timeseries/examples/lstm_test.py
+++ b/tensorflow/contrib/timeseries/examples/lstm_test.py
@@ -20,14 +20,23 @@ from __future__ import print_function
 
 from tensorflow.contrib.timeseries.examples import lstm
 
+from tensorflow.python.estimator import estimator_lib
 from tensorflow.python.platform import test
 
 
+class _SeedRunConfig(estimator_lib.RunConfig):
+
+  @property
+  def tf_random_seed(self):
+    return 3
+
+
 class LSTMExampleTest(test.TestCase):
 
   def test_periodicity_learned(self):
     (observed_times, observed_values,
-     all_times, predicted_values) = lstm.train_and_predict(training_steps=100)
+     all_times, predicted_values) = lstm.train_and_predict(
+         training_steps=100, estimator_config=_SeedRunConfig())
     self.assertAllEqual([100], observed_times.shape)
     self.assertAllEqual([100, 5], observed_values.shape)
     self.assertAllEqual([200], all_times.shape)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/head.py b/tensorflow/contrib/timeseries/python/timeseries/head.py
index 5896fc2a206bc747688b5b012e0f87465592dd8a..f0330bfbbd6e8067e5d085376acdf2e6bcaccb6a 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/head.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/head.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 
 import re
 
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.layers.python.layers import optimizers
 
 from tensorflow.contrib.timeseries.python.timeseries import feature_keys
@@ -79,7 +79,7 @@ class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-acc
 
     train_op = optimizers.optimize_loss(
         model_outputs.loss,
-        global_step=variables.get_global_step(),
+        global_step=training_util.get_global_step(),
         optimizer=self.optimizer,
         # Learning rate is set in the Optimizer object
         learning_rate=None)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/g3doc/periodic_multires_derivation.md b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/g3doc/periodic_multires_derivation.md
index b174bb6af323da62afda2a74a397f25e977a48d0..872474aee1149d36671f660f33f63a204ef8ca43 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/g3doc/periodic_multires_derivation.md
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/g3doc/periodic_multires_derivation.md
@@ -66,7 +66,7 @@ def make_eigval_mat_fn(to_power=1):
         if i == j:
             number = j // 2 + 1
             powersign = ((j + 1) % 2) * 2 - 1
-            return root_of_unity(matsize + 1, number=number, 
+            return root_of_unity(matsize + 1, number=number,
                                  to_power=powersign*to_power)
         else:
             return 0
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index 64e9d0e765063a662c846e187dcad57f098ef64d..a34c7f91f275ea544a3114e85d53f4258f683ebc 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -31,21 +31,6 @@ cc_library(
     ],
 )
 
-py_library(
-    name = "tpu_test_util",
-    srcs = ["python/tpu/test_util.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":tpu_lib",
-        ":tpu_py",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:session",
-        "//tensorflow/python:variables",
-    ],
-)
-
 py_library(
     name = "tpu_estimator",
     srcs = [
@@ -155,6 +140,8 @@ py_library(
     name = "tpu_lib",
     srcs = [
         "python/tpu/__init__.py",
+        "python/tpu/device_assignment.py",
+        "python/tpu/topology.py",
         "python/tpu/tpu.py",
         "python/tpu/tpu_feed.py",
         "python/tpu/tpu_function.py",
@@ -166,6 +153,7 @@ py_library(
     deps = [
         ":profiler",
         ":tpu_py",
+        "//tensorflow/contrib/tpu/proto:topology_proto_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
diff --git a/tensorflow/contrib/tpu/__init__.py b/tensorflow/contrib/tpu/__init__.py
index ec4c4e1be6f178595e937e9b66202daf942d2528..ea6e874f2d952b03e8cdabeee00ccfe1b076a0d0 100644
--- a/tensorflow/contrib/tpu/__init__.py
+++ b/tensorflow/contrib/tpu/__init__.py
@@ -23,6 +23,7 @@
 
 @@initialize_system
 @@shutdown_system
+@@device_assignment
 @@core
 @@replicate
 @@shard
@@ -33,6 +34,9 @@
 
 @@InfeedQueue
 
+@@DeviceAssignment
+@@Topology
+
 @@while_loop
 @@repeat
 
@@ -49,6 +53,8 @@ from __future__ import print_function
 # pylint: disable=wildcard-import,unused-import
 from tensorflow.contrib.tpu.python import profiler
 from tensorflow.contrib.tpu.python.ops.tpu_ops import *
+from tensorflow.contrib.tpu.python.tpu.device_assignment import *
+from tensorflow.contrib.tpu.python.tpu.topology import *
 from tensorflow.contrib.tpu.python.tpu.tpu import *
 from tensorflow.contrib.tpu.python.tpu.tpu_config import *
 from tensorflow.contrib.tpu.python.tpu.tpu_estimator import *
diff --git a/tensorflow/contrib/tpu/ops/cross_replica_ops.cc b/tensorflow/contrib/tpu/ops/cross_replica_ops.cc
index cbbd19800eb2e336fc343671fb82bb3ed631c129..d389050e67f9a9e48b91583e5088058ec4e2832f 100644
--- a/tensorflow/contrib/tpu/ops/cross_replica_ops.cc
+++ b/tensorflow/contrib/tpu/ops/cross_replica_ops.cc
@@ -22,7 +22,7 @@ namespace tensorflow {
 REGISTER_OP("CrossReplicaSum")
     .Input("input: T")
     .Output("output: T")
-    .Attr("T: {float}")
+    .Attr("T: {bfloat16, float}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 An Op to sum inputs across replicated TPU instances. Each
diff --git a/tensorflow/contrib/tpu/ops/outfeed_ops.cc b/tensorflow/contrib/tpu/ops/outfeed_ops.cc
index ed5756cc540a202148a02747bc62001ee363be9d..5900c61a38726551391c212f92b9b9eacd4a465b 100644
--- a/tensorflow/contrib/tpu/ops/outfeed_ops.cc
+++ b/tensorflow/contrib/tpu/ops/outfeed_ops.cc
@@ -39,7 +39,7 @@ REGISTER_OP("OutfeedEnqueueTuple")
     .Doc(R"doc(
 An op which emits multiple Tensor values from an XLA computation.
 
-inputs: A list of tensors that will be inserted into the outfeed queue as an 
+inputs: A list of tensors that will be inserted into the outfeed queue as an
 XLA tuple.
 )doc");
 
diff --git a/tensorflow/contrib/tpu/ops/replication_ops.cc b/tensorflow/contrib/tpu/ops/replication_ops.cc
index b40dac471708793d5a033279e2d2f4b4a0dac480..cba71c6b98e1079de6c6c4c32fa2ffc44a9ce71e 100644
--- a/tensorflow/contrib/tpu/ops/replication_ops.cc
+++ b/tensorflow/contrib/tpu/ops/replication_ops.cc
@@ -24,7 +24,9 @@ using shape_inference::ShapeHandle;
 
 REGISTER_OP("TPUReplicateMetadata")
     .Attr("num_replicas: int >= 0")
-    .Attr("global_tpu_id: list(int) = []")
+    .Attr("topology: string = \"\"")
+    .Attr("device_assignment: list(int) = []")
+    .Attr("computation_shape: list(int) = []")
     .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("TPUReplicatedInput")
@@ -64,14 +66,18 @@ REGISTER_OP("TPUReplicatedOutput")
 REGISTER_OP("TPUReplicate")
     .Attr("computation: func")
     .Attr("num_replicas: int >= 1")
-    .Attr("global_tpu_id: list(int) = []")
+    .Attr("topology: string = \"\"")
+    .Attr("device_assignment: list(int) = []")
+    .Attr("computation_shape: list(int) = []")
     .Attr("Tinputs: list(type) >= 0")
     .Attr("Tbroadcast_inputs: list(type) >= 0")
     .Attr("NumVariables: int >= 0")
+    .Attr("Tguaranteed_constants: list(type) >= 0")
     .Attr("output_types: list(type) >= 0")
     .Input("inputs: Tinputs")
     .Input("broadcast_inputs: Tbroadcast_inputs")
     .Input("variables: NumVariables * resource")
+    .Input("guaranteed_constants: Tguaranteed_constants")
     .Output("outputs: output_types")
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
@@ -79,14 +85,25 @@ Runs replicated computations on a distributed TPU system.
 
 computation: a function containing the computation to run.
 num_replicas: the number of replicas of the computation to run.
-global_tpu_id: map from device to global tpu id.
+topology: A serialized tensorflow.tpu.TopologyProto that describes the TPU
+topology.
+computation_shape: a [mesh_dimension] array describing the shape of each
+  computation replica in numbers of cores in the TPU mesh.
+device_assignment: a flattened array with shape
+  [replica] + computation_shape + [mesh_dimension] that maps the coordinates of
+  logical cores in each replica of a computation to physical coordinates in
+  the TPU topology.
 Tinputs: the types of the arguments to 'computation'.
 inputs: the inputs to 'computation', flattened, in replica-major order.
 Tbroadcast_inputs: the types of the additional arguments to broadcast to all
   replicas.
+Tguaranteed_constants: the types of the arguments to 'guaranteed_constants'.
 broadcast_inputs: additional arguments to broadcast to all replicas. The
   broadcast inputs are appended to the per-replica inputs when calling
   computation.
+guaranteed_constants: arguments which have been guaranteed to not
+change their values during the session lifetime. These contain tensors marked as
+constant using the GuaranteeConstOp.
 output_types: the types of the outputs of 'computation'.
 outputs: the outputs of 'computation'.
 )doc");
diff --git a/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc b/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc
index 8a87a91056efeba5d094503cfa68df104e310f30..8c4fe5538d832f390845fe2d31aa6a08342b280b 100644
--- a/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc
+++ b/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc
@@ -107,7 +107,7 @@ in a host.
 
 REGISTER_OP("_WaitForDistributedTPU")
     .Input("inputs: N * int32")
-    .Output("global_tpu_array: int32")
+    .Output("topology: string")
     .Attr("host_specs: list(string)")
     .Attr("startup_timeout_sec: int = 20")
     .Attr("N: int")
@@ -118,7 +118,7 @@ REGISTER_OP("_WaitForDistributedTPU")
       for (int i = 0; i < c->num_inputs(); ++i) {
         TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &input));
       }
-      c->set_output(0, c->UnknownShapeOfRank(2));
+      c->set_output(0, c->Scalar());
       return ::tensorflow::Status::OK();
     })
     .Doc(R"doc(
@@ -129,30 +129,26 @@ _InitializeHostForDistributedTPU Ops.
 
 inputs: For each initialized host, a vector giving the global TPU id
 of each TPU on the host.
-global_tpu_array: A two-dimensional array. For each host (the outer
-dimension) the array lists the global ids of the TPUs on that host.
-host_specs: For each initialized host, the partial device specification
-indicating job, replica, and task. Combining this spec with
-'/device:TPU:k' gives the full device name of the k'th TPU on the
-host.
+topology: A serialized tensorflow.tpu.TopologyProto that describes the TPU
+topology.
 startup_timeout_sec: The number of seconds to wait for the TPU system
 to stabilize.
 )doc");
 
 REGISTER_OP("_SetGlobalTPUArray")
-    .Input("global_tpu_array: int32")
+    .Input("topology: string")
     .SetIsStateful()
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &input));
       return ::tensorflow::Status::OK();
     })
     .Doc(R"doc(
 An op that informs a host of the global ids of all the of TPUs in the
 system.
 
-global_tpu_array: A two-dimensional array. For each host (the outer
-dimension) the array lists the global ids of the TPUs on that host.
+topology: A serialized tensorflow.tpu.TopologyProto that describes the TPU
+topology.
 )doc");
 
 REGISTER_OP("_ShutdownDistributedTPU")
@@ -198,7 +194,7 @@ chips on the host.
 )doc");
 
 REGISTER_OP("ConfigureDistributedTPU")
-    .Output("global_tpu_array: int32")
+    .Output("topology: string")
     .Attr("embedding_config: string = ''")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnknownShape)
@@ -206,9 +202,8 @@ REGISTER_OP("ConfigureDistributedTPU")
 An op that sets up the centralized structures for a distributed TPU
 system.
 
-global_tpu_array: A two-dimensional array. For each host (the outer
-dimension) the array lists the global ids of the TPUs on that host.
-embedding_config: Internal use.
+topology: A serialized tensorflow.tpu.TopologyProto that describes the TPU
+topology.
 )doc");
 
 REGISTER_OP("ShutdownDistributedTPU")
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
index 3bdd475fade39baeea67333a55fdd548fb235672..7970c20a2693cbbe91a136080240f676d29f2053 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
@@ -24,22 +24,18 @@ import sys
 
 import tensorflow as tf
 
-
 tf.flags.DEFINE_string('service_addr', '',
                        'Address of TPU profiler service e.g. localhost:8466')
-
-
 tf.flags.DEFINE_string('logdir', '',
                        'Path of TensorBoard log directory e.g. /tmp/tb_log')
-
-
 tf.flags.DEFINE_integer('duration_ms', 2000, 'Duration of tracing in ms.')
 
-
 FLAGS = tf.flags.FLAGS
+EXECUTABLE = 'data/capture_tpu_profile'
 
 
-EXECUTABLE = 'data/capture_tpu_profile'
+def run_main():
+  tf.app.run(main)
 
 
 def main(unused_argv=None):
@@ -54,4 +50,4 @@ def main(unused_argv=None):
 
 
 if __name__ == '__main__':
-  tf.app.run(main)
+  run_main()
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/setup.py b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
index e77cae4695daa54f690f11982ece44ea6a2a3fc4..179d29602b9f970fb450bc057332fa092066255c 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/setup.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
@@ -23,7 +23,7 @@ from setuptools import setup
 _VERSION = '1.3.0-a1'
 
 CONSOLE_SCRIPTS = [
-    'capture_tpu_profile=cloud_tpu_profiler.main:main',
+    'capture_tpu_profile=cloud_tpu_profiler.main:run_main',
 ]
 
 REQUIRED_PACKAGES = [
@@ -70,7 +70,7 @@ setup(
         'Topic :: Scientific/Engineering :: Mathematics',
         'Topic :: Scientific/Engineering :: Artificial Intelligence',
         'Topic :: Software Development',
-        'Topic :: Software Development :: Libraries',  
+        'Topic :: Software Development :: Libraries',
         'Topic :: Software Development :: Libraries :: Python Modules',
     ],
     license='Apache 2.0',
diff --git a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
index 2d2207a43fed8fe184b238be9708f9199b92d63d..5440bbbfdd75207bd209c19d5cc42dc69504d39b 100644
--- a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
+++ b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
@@ -104,6 +104,8 @@ message HloExtraInfoResult {
   optional string category = 1;
   // The long name of the HLO that includes the dimensions.
   optional string long_name = 2;
+  // The per-TPU-core batch size inferred from this HLO.
+  optional int64 per_core_batch_size = 3;
 }
 
 // Result proto for HloExtraInfoMap.
@@ -112,6 +114,42 @@ message HloExtraInfoMapResult {
   map<string, HloExtraInfoResult> hlo_extrainfo_map = 1;
 }
 
+// Result proto for host-independent job information.
+message HostIndependentJobInfoResult {
+  // The change-list number of this build.
+  optional int64 change_list = 1;
+  // The time of this build.
+  optional int64 build_time = 2;
+  // The target of this build.
+  optional string build_target = 3;
+}
+
+// Result proto for host-dependent job information.
+message HostDependentJobInfoResult {
+  // This ID of the host where the job was run on.
+  optional string host_id = 1;
+  // The command line used to run the job.
+  optional string command_line = 2;
+  // The start time of the job on this host.
+  optional int64 start_time = 3;
+}
+
+// Result proto for RunEnvironment (the run environment of a profiling session).
+message RunEnvironmentResult {
+  // Number of hosts used.
+  optional int32 host_count = 1;
+  // The type of TPU used.
+  optional string tpu_type = 2;
+  // The number of TPU cores used.
+  optional int32 tpu_core_count = 3;
+  // The per-TPU-core batch size.
+  optional int32 per_core_batch_size = 4;
+  // Host-independent job information.
+  optional HostIndependentJobInfoResult host_independent_job_info = 5;
+  // Host-dependent job information.
+  repeated HostDependentJobInfoResult host_dependent_job_info = 6;
+}
+
 // Result proto for TfStatsHelper.
 message TfOpStats {
   // The result for the TF-metric database.
@@ -126,4 +164,6 @@ message TfOpStats {
   optional HloExtraInfoMapResult hlo_extrainfo_map = 5;
   // Overall matrix unit utilization in percentage.
   optional double matrix_unit_utilization_percent = 6;
+  // The run environment of this profiling session.
+  optional RunEnvironmentResult run_environment = 7;
 }
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
index 9c3fd45fd1ec9736b638b45907e585165d4d9057..bf30d2ce091302eaf361a0018464d3b7de94ea6d 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
+++ b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
@@ -40,7 +40,7 @@ message ProfileToolData {
 }
 
 message ProfileResponse {
-  uint64 xprof_response_size = 1;  // Placeholder: return something meaningful.
+  reserved 1;  // was uint64 placeholder for returning something meaningful.
   // Graphs of programs executed on TPUs during the profiling period.
   repeated GraphDef computation_graph = 2;
 
diff --git a/tensorflow/contrib/tpu/proto/BUILD b/tensorflow/contrib/tpu/proto/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..79a79efb6b62d3e98127558e951ceefd276b580c
--- /dev/null
+++ b/tensorflow/contrib/tpu/proto/BUILD
@@ -0,0 +1,25 @@
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+tf_proto_library(
+    name = "topology_proto",
+    srcs = [
+        "topology.proto",
+    ],
+    cc_api_version = 2,
+    visibility = ["//visibility:public"],
+)
diff --git a/tensorflow/contrib/tpu/proto/topology.proto b/tensorflow/contrib/tpu/proto/topology.proto
new file mode 100644
index 0000000000000000000000000000000000000000..17064ee5a2ee241824573d51c8c433c3c6c390b7
--- /dev/null
+++ b/tensorflow/contrib/tpu/proto/topology.proto
@@ -0,0 +1,27 @@
+syntax = "proto3";
+
+option cc_enable_arenas = true;
+
+package tensorflow.tpu;
+
+// Describes the geometry of a TPU mesh.
+message TopologyProto {
+  // The dimensions of the TPU topology, in cores. Typically, this is a 3D
+  // topology [x, y, core], where the major dimensions correspond to TPU chips,
+  // and the minor dimension describes the number of cores on a multicore chip.
+  repeated int32 mesh_shape = 1;
+
+  // Number of TensorFlow tasks in the cluster.
+  int32 num_tasks = 2;
+
+  // Number of TPU devices per task.
+  int32 num_tpu_devices_per_task = 3;
+
+  // A flattened rank 3 int32 array with shape
+  // [num_tasks, num_tpu_devices_per_task, len(mesh_shape)].
+  // `tasks` is the number of tasks in the TPU cluster, `devices` is the number
+  // of TPU devices per task, and the minor dimension corresponds to a position
+  // in the TPU mesh topology. Each entry [task, device, axis] gives the
+  // `axis`-th coordinate in the topology of a task/device pair.
+  repeated int32 device_coordinates = 4;
+}
diff --git a/tensorflow/contrib/tpu/python/tpu/device_assignment.py b/tensorflow/contrib/tpu/python/tpu/device_assignment.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee202610a8a8a1406363b3010771e7806d5d84bf
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/device_assignment.py
@@ -0,0 +1,299 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ======================================
+"""Library of TPU helper functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.contrib.tpu.python.tpu.topology import Topology
+
+
+def _tpu_device_name(job, task, device):
+  """Returns the device name for the TPU `device` on `task` of `job`."""
+  if job is None:
+    return "/task:%d/device:TPU:%d" % (task, device)
+  else:
+    return "/job:%s/task:%d/device:TPU:%d" % (job, task, device)
+
+
+def _tpu_host_device_name(job, task):
+  """Returns the device name for the CPU device on `task` of `job`."""
+  if job is None:
+    return "/task:%d/device:CPU:0" % task
+  else:
+    return "/job:%s/task:%d/device:CPU:0" % (job, task)
+
+
+class DeviceAssignment(object):
+  """Mapping from logical cores in a computation to the physical TPU topology.
+
+  Prefer to use the `device_assignment()` helper to construct a
+  `DeviceAssignment`; it is easier if less flexible than constructing a
+  `DeviceAssignment` directly.
+  """
+
+  def __init__(self, topology, core_assignment):
+    """Constructs a `DeviceAssignment` object.
+
+    Args:
+      topology: A `Topology` object that describes the physical TPU topology.
+      core_assignment: A logical to physical core mapping, represented as a
+        rank 3 numpy array. See the description of the `core_assignment`
+        property for more details.
+
+    Raises:
+      ValueError: If `topology` is not `Topology` object.
+      ValueError: If `core_assignment` is not a rank 3 numpy array.
+    """
+    if not isinstance(topology, Topology):
+      raise ValueError("topology must be a Topology object, got {}".format(
+          type(topology)))
+    core_assignment = np.asarray(core_assignment, dtype=np.int32)
+
+    self._topology = topology
+    self._topology_tasks, self._topology_devices = (
+        self._invert_topology(topology))
+
+    topology_rank = self._topology_tasks.ndim
+    if core_assignment.ndim != topology_rank + 2:
+      raise ValueError("core_assignment must be a rank {} numpy array".format(
+          topology_rank + 2))
+
+    self._num_replicas = core_assignment.shape[0]
+    self._computation_shape = np.array(
+        core_assignment.shape[1:-1], dtype=np.int32)
+
+    if core_assignment.shape[-1] != topology_rank:
+      raise ValueError(
+          "minor dimension of core_assignment must have size equal to topology "
+          "rank ({}), got shape {}".format(topology_rank,
+                                           core_assignment.shape))
+
+    self._core_assignment = core_assignment
+
+  def _invert_topology(self, topology):
+    """Inverts a [task,device,axis] topology to [x,y,z] -> task/device maps."""
+    mesh_shape = topology.mesh_shape
+    tasks = np.full(list(mesh_shape), -1, dtype=np.int32)
+    devices = np.full(list(mesh_shape), -1, dtype=np.int32)
+    for task in xrange(topology.device_coordinates.shape[0]):
+      for device in xrange(topology.device_coordinates.shape[1]):
+        x, y, z = topology.device_coordinates[task, device, :]
+        tasks[x, y, z] = task
+        devices[x, y, z] = device
+    return tasks, devices
+
+  @property
+  def topology(self):
+    """A `Topology` that describes the TPU topology."""
+    return self._topology
+
+  @property
+  def computation_shape(self):
+    """The computation shape.
+
+    Returns:
+      A rank-1 int32 numpy array with size equal to the TPU topology rank.
+      Describes the logical shape in numbers of core of each replica of the
+      computation in the TPU topology.
+
+    Returns:
+      The computation shape.
+    """
+    return self._computation_shape
+
+  @property
+  def num_replicas(self):
+    """The number of replicas of the computation."""
+    return self._num_replicas
+
+  @property
+  def core_assignment(self):
+    """The logical to physical core mapping.
+
+    Returns:
+      A numpy array of rank `topology_rank + 2`, with shape
+      `[num_replicas] + computation_shape + [topology_rank]`. Maps
+      (replica, logical core coordinates) pairs to physical topology
+      coordinates.
+    """
+    return self._core_assignment
+
+  def _coordinates(self, replica, logical_core):
+    """Returns the physical topology coordinates of a logical core."""
+    if logical_core is None:
+      logical_core = np.array([0, 0, 0], np.int32)
+
+    if any(logical_core < 0) or any(logical_core >= self.computation_shape):
+      raise ValueError("Invalid core {}; computation shape is {}".format(
+          logical_core, self.computation_shape))
+
+    logical_offset = tuple([replica] + logical_core.tolist() + [slice(3)])
+    return tuple(self.core_assignment[logical_offset])
+
+  def tpu_ordinal(self, replica=0, logical_core=None):
+    """Returns the ordinal of the TPU device assigned to a logical core."""
+    coordinates = self._coordinates(replica, logical_core)
+    return self._topology_devices[coordinates]
+
+  def host_device(self, replica=0, logical_core=None, job=None):
+    """Returns the CPU device attached to a logical core."""
+    coordinates = self._coordinates(replica, logical_core)
+    return _tpu_host_device_name(job, self._topology_tasks[coordinates])
+
+  def tpu_device(self, replica=0, logical_core=None, job=None):
+    """Returns the name of the TPU device assigned to a logical core."""
+    coordinates = self._coordinates(replica, logical_core)
+    return _tpu_device_name(job, self._topology_tasks[coordinates],
+                            self._topology_devices[coordinates])
+
+
+def device_assignment(topology,
+                      computation_shape=None,
+                      computation_stride=None,
+                      num_replicas=1):
+  """Computes a device_assignment of a computation across a TPU topology.
+
+  Returns a `DeviceAssignment` that describes the cores in the topology assigned
+  to each core of each replica.
+
+  `computation_shape` and `computation_stride` values should be powers of 2 for
+  optimal packing.
+
+  Args:
+    topology: A `Topology` object that describes the TPU cluster topology.
+      To obtain a TPU topology, evaluate the `Tensor` returned by
+      `initialize_system` using `Session.run`. Either a serialized
+      `TopologyProto` or a `Topology` object may be passed. Note: you must
+      evaluate the `Tensor` first; you cannot pass an unevaluated `Tensor` here.
+    computation_shape: A rank 1 int32 numpy array of size 3, describing the
+      shape of the computation's block of cores. If None, the
+      `computation_shape` is `[1, 1, 1]`.
+    computation_stride: A rank 1 int32 numpy array of size 3, describing the
+      inter-core spacing of the `computation_shape` cores in the TPU topology.
+      If None, the `computation_stride` is `[1, 1, 1]`.
+    num_replicas: The number of computation replicas to run. The replicas will
+      be packed into the free spaces of the topology.
+
+  Returns:
+    A DeviceAssignment object, which describes the mapping between the logical
+    cores in each computation replica and the physical cores in the TPU
+    topology.
+
+  Raises:
+    ValueError: If `topology` is not a valid `Topology` object.
+    ValueError: If `computation_shape` or `computation_stride` are not 1D int32
+      numpy arrays with shape [3] where all values are positive.
+    ValueError: If computation's replicas cannot fit into the TPU topology.
+  """
+  # Deserialize the Topology proto, if it is a string.
+  if isinstance(topology, bytes):
+    topology = Topology(serialized=topology)
+
+  if not isinstance(topology, Topology):
+    raise ValueError("`topology` is not a Topology object; got {}".format(
+        type(topology)))
+
+  topology_rank = len(topology.mesh_shape)
+  mesh_shape = topology.mesh_shape
+  if computation_shape is None:
+    computation_shape = np.array([1, 1, 1], dtype=np.int32)
+  else:
+    computation_shape = np.asarray(computation_shape, dtype=np.int32)
+
+  if computation_stride is None:
+    computation_stride = np.array([1, 1, 1], dtype=np.int32)
+  else:
+    computation_stride = np.asarray(computation_stride, dtype=np.int32)
+
+  if computation_shape.shape != (3,):
+    raise ValueError("computation_shape must have shape [3]; got {}".format(
+        computation_shape.shape))
+  if computation_stride.shape != (3,):
+    raise ValueError("computation_stride must have shape [3]; got {}".format(
+        computation_stride.shape))
+
+  if any(computation_shape < 1):
+    raise ValueError(
+        "computation_shape must be positive; got computation_shape={}".format(
+            computation_shape))
+  if any(computation_stride < 1):
+    raise ValueError(
+        "computation_stride must be positive; got computation_stride={}".format(
+            computation_stride))
+
+  # Computes the physical size of one computation instance.
+  computation_footprint = computation_shape * computation_stride
+  if any(computation_footprint > mesh_shape):
+    raise ValueError(
+        "computation footprint {} does not fit in TPU topology shape {}".format(
+            computation_footprint, mesh_shape))
+
+  # Computes how many copies of the computation footprint fit in the mesh.
+  block_counts = mesh_shape // computation_footprint
+
+  replica_counts = block_counts * computation_stride
+  max_replicas = np.prod(replica_counts)
+  if num_replicas > max_replicas:
+    raise ValueError(
+        "requested {} replicas but only {} replicas with shape {} and "
+        "computation_stride {} fit in a TPU mesh of shape {}".format(
+            num_replicas, max_replicas, computation_shape, computation_stride,
+            mesh_shape))
+
+  # Choose a compact layout for the cores. Choose the smaller dimension in the
+  # topology to be close to the square root of the number of replicas.
+  num_chips = int(math.ceil(num_replicas / replica_counts[2]))
+  target_size = int(math.ceil(math.sqrt(num_chips)))
+
+  # Prefer an even size, if possible. Odd numbered rows head back towards the
+  # first column, so it's best if the last row has an odd index.
+  if target_size % 2 != 0:
+    target_size -= 1
+  y_size = min(replica_counts[1], target_size)
+  if y_size * replica_counts[0] < num_chips:
+    y_size = replica_counts[1]
+
+  # Assigns an offset to each replica such that no two replicas overlap.
+  replica_offsets = np.full([num_replicas, 3], -1, dtype=np.int32)
+  for replica in xrange(num_replicas):
+    # Chooses a replica number in X/Y/Z axes.
+    z = replica % replica_counts[2]
+    t = replica // replica_counts[2]
+    y = t % y_size
+    x = t // y_size
+    replica_pos = np.array([x, y, z], dtype=np.int32)
+
+    # Determines where that replica starts in each axis.
+    outer = replica_pos // computation_stride
+    inner = replica_pos % computation_stride
+    replica_offsets[replica, :] = outer * computation_footprint + inner
+
+  # Computes a complete logical core -> physical core mapping for each replica.
+  indices = [
+      np.arange(0, computation_shape[i] * computation_stride[i],
+                computation_stride[i]) for i in xrange(topology_rank)
+  ]
+  indices = np.concatenate(
+      [i[..., np.newaxis] for i in np.meshgrid(*indices, indexing="ij")],
+      axis=-1)
+  assignment = (
+      indices + replica_offsets[:, np.newaxis, np.newaxis, np.newaxis, :])
+  return DeviceAssignment(topology, core_assignment=assignment)
diff --git a/tensorflow/contrib/tpu/python/tpu/test_util.py b/tensorflow/contrib/tpu/python/tpu/test_util.py
deleted file mode 100644
index a5d4ff972277cda0bd6f5b3ecdb4bef59a2f8d0e..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tpu/python/tpu/test_util.py
+++ /dev/null
@@ -1,296 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ===================================================================
-"""Utilities to ease testing on TPU devices."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os.path
-import pickle
-import tempfile
-
-import numpy as np
-
-from tensorflow.contrib.tpu.python.tpu import tpu
-from tensorflow.contrib.tpu.python.tpu import tpu_config
-from tensorflow.contrib.tpu.python.tpu import tpu_estimator
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.client import session as tf_session
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import random_seed
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import gen_array_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import saver as tf_saver
-
-
-def has_tpu():
-  """Check if a TPU device is available.
-
-  Device enumeration via `device_lib` currently fails for TPU systems.
-  (http://b/68333779).  To work around this, we determine the existence of a
-  TPU by a successful call to `initialize_system`.
-
-  Returns:
-    boolean, True if a TPU device is available, otherwise False.
-  """
-
-  def _check():
-    with tf_session.Session() as sess:
-      sess.run(tpu.initialize_system())
-      sess.run(tpu.shutdown_system())
-
-  try:
-    _check()
-    return True
-  except errors.OpError as _:
-    return False
-
-
-def _available_devices():
-  devices = ["cpu"]
-  if not test_util.gpu_device_name():
-    devices.append("gpu")
-
-  if has_tpu():
-    devices.append("tpu")
-
-  return tuple(devices)
-
-
-def copy_dir(src, tgt):
-  """Copy src to tgt."""
-  gfile.MakeDirs(tgt)
-  seen_dirs = set()
-  for dirname, _, files in gfile.Walk(src):
-    for f in files:
-      src_f = os.path.join(dirname, f)
-      tgt_f = src_f.replace(src, tgt)
-      tgt_d = os.path.dirname(tgt_f)
-      if tgt_d not in seen_dirs:
-        gfile.MkDir(tgt_d)
-        seen_dirs.add(tgt_d)
-      gfile.Copy(src_f, tgt_f, overwrite=True)
-
-
-def compare_model(model_fn,
-                  input_fn,
-                  params,
-                  master="local",
-                  temp_dir=None,
-                  num_shards=2,
-                  tolerance=1e-4):
-  """Compare the results of running `model_fn` on the TPU and CPU."""
-  if not temp_dir:
-    temp_dir = tempfile.mkdtemp()
-
-  cpu_model_dir = "%s/cpu-model" % temp_dir
-  tpu_model_dir = "%s/tpu-model" % temp_dir
-  initial_model_dir = "%s/initial-model" % temp_dir
-
-  logging.info("Checkpoints and weights will be written to %s", temp_dir)
-
-  num_steps = 1
-
-  def _model_adapter(features, labels, mode, params):
-    """Run users model function with random seeds fixed to known values."""
-    random_seed.set_random_seed(0)
-    np.random.seed(0)
-    return model_fn(features, labels, mode, params)
-
-  def _input_adapter(params):
-    random_seed.set_random_seed(0)
-    np.random.seed(0)
-    return input_fn(params)
-
-  def _make_run_config(model_dir):
-    return tpu_config.RunConfig(
-        master=master,
-        model_dir=model_dir,
-        save_checkpoints_secs=10000,
-        session_config=config_pb2.ConfigProto(
-            allow_soft_placement=True, log_device_placement=False),
-        tpu_config=tpu_config.TPUConfig(
-            iterations_per_loop=num_steps,
-            num_shards=num_shards,
-        ),
-    )
-
-  def _make_estimator(use_tpu, model_dir):
-    return tpu_estimator.TPUEstimator(
-        model_fn=_model_adapter,
-        use_tpu=use_tpu,
-        config=_make_run_config(model_dir),
-        train_batch_size=num_shards,
-        params=dict(params, use_tpu=use_tpu),
-    )
-
-  def _extract_weights(checkpoint):
-    """Extract model weights from the given checkpoint file."""
-    weights = {}
-    graph = ops.Graph()
-    with graph.as_default():
-      features, labels = _input_adapter(dict(params, batch_size=num_shards))
-      model_fn(
-          features, labels,
-          params=dict(params, use_tpu=False),
-          mode=model_fn_lib.ModeKeys.TRAIN)
-      saver = tf_saver.Saver()
-      with tf_session.Session(graph=graph) as sess:
-        saver.restore(sess, checkpoint)
-        all_vars = []
-        all_vars.extend(graph.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
-        all_vars.extend(graph.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
-        all_vars.extend(graph.get_collection(ops.GraphKeys.MODEL_VARIABLES))
-
-        for var in all_vars:
-          weights[var.name] = sess.run(var)
-    return weights
-
-  def _run_step(use_tpu, model_dir):
-    """Create an estimator and run a single step on the given device."""
-    tf_session.Session.reset(target=master)
-
-    logging.info("Running step.  TPU=%d.  model_dir=%s", use_tpu, model_dir)
-    est = _make_estimator(use_tpu=use_tpu, model_dir=model_dir)
-    est.train(input_fn=_input_adapter, steps=num_steps)
-    weights = _extract_weights(est.latest_checkpoint())
-    with gfile.Open(os.path.join(temp_dir, "tpu-%d.weights" % use_tpu),
-                    "wb") as f:
-      f.write(pickle.dumps(weights))
-    return weights
-
-  # initialize models to the same weights by running a single step on the CPU
-  _run_step(use_tpu=False, model_dir=initial_model_dir)
-
-  copy_dir(initial_model_dir, cpu_model_dir)
-  copy_dir(initial_model_dir, tpu_model_dir)
-
-  cpu_weights = _run_step(use_tpu=False, model_dir=cpu_model_dir)
-  tpu_weights = _run_step(use_tpu=True, model_dir=tpu_model_dir)
-
-  bad_weights = False
-  for k in cpu_weights:
-    if k not in tpu_weights:
-      raise KeyError("Missing weight %s from TPU checkpoint.", k)
-
-    if not np.allclose(
-        cpu_weights[k], tpu_weights[k], rtol=tolerance, atol=tolerance):
-      bad_weights = True
-      logging.error("Weights for layer %s have diverged.", k)
-
-  if bad_weights:
-    raise ValueError("Some weights have diverged.  Output pickle files have "
-                     "been written to %s for inspection." % temp_dir)
-
-
-class TPUTestCase(test_util.TensorFlowTestCase):
-  """Adds helpers for testing on TPU devices to `TensorFlowTestCase`.
-
-  Example usage:
-
-  ```
-  def model_fn(features):
-    return tf.reduce_sum(features * 2)
-
-  class ModelTests(test_util.TPUTestCase):
-    def test_sum(self):
-      v = np.random.randn(10, 10).astype("float32")
-      self.assert_device_output(model_fn, [v], (v*2).sum(),
-                                devices=("cpu", "tpu"))
-  ```
-  """
-
-  def __init__(self, methodName="runTest"):  # pylint: disable=invalid-name
-    super(TPUTestCase, self).__init__(methodName)
-    self._available_devices = _available_devices()
-
-  def run_on_device(self, model_fn, model_inputs, device):
-    """Runs `model_fn` on the given device.
-
-    Raises an exception if no such device is available.  `model_fn` should
-    return one or more tensors as a list or tuple.
-
-    Args:
-      model_fn: Function returning one or more tensors.
-      model_inputs: An iterable of Numpy arrays or scalars.
-                    These will be passed as arguments to `model_fn`.
-      device: Device to run on.  One of ("tpu", "gpu", "cpu").
-
-    Returns:
-      Output from the model function.
-    """
-
-    def _make_placeholders():
-      return dict([(gen_array_ops.placeholder_with_default(v, v.shape), v)
-                   for v in model_inputs])
-
-    if device == "tpu":
-      with self.test_session(graph=ops.Graph()) as sess:
-        placeholders = _make_placeholders()
-        tpu_computation = tpu.rewrite(model_fn, placeholders.keys())
-        sess.run(tpu.initialize_system())
-        sess.run(variables.global_variables_initializer())
-        result = sess.run(tpu_computation, placeholders)
-        sess.run(tpu.shutdown_system())
-        # TODO(b/36891278): supports non-flat returns lists in tpu.rewrite().
-        if len(result) == 1:
-          return result[0]
-        return result
-    elif device == "gpu":
-      with self.test_session(graph=ops.Graph(), use_gpu=True) as sess:
-        placeholders = _make_placeholders()
-        sess.run(variables.global_variables_initializer())
-        return sess.run(model_fn(placeholders.keys()), placeholders)
-    elif device == "cpu":
-      # TODO(power) -- will this interact poorly with cached GPU sessions?
-      with self.test_session(graph=ops.Graph(), use_gpu=False) as sess:
-        placeholders = _make_placeholders()
-        sess.run(variables.global_variables_initializer())
-        return sess.run(model_fn(placeholders.keys()), placeholders)
-
-  def _compare_values(self, actual_outputs, expected_outputs):
-    if isinstance(expected_outputs, (list, tuple)):
-      for a, b in zip(actual_outputs, expected_outputs):
-        self.assertAllCloseAccordingToType(a, b)
-    else:
-      self.assertAllCloseAccordingToType(actual_outputs, expected_outputs)
-
-  def assert_device_output(self,
-                           model_fn,
-                           model_inputs,
-                           expected_outputs,
-                           devices=("cpu", "gpu", "tpu")):
-    """Run `model_fn` on the given devices.
-
-    Results are compared via `assertAllCloseAccordingToType`.
-
-    Args:
-      model_fn: Function returning one or more tensors
-      model_inputs: Numpy arrays or scalars passed as arguments to model_fn
-      expected_outputs: Numpy arrays or scalars to compare against.
-      devices: Set of devices to run on.  If a device is not available, tests
-               will be skipped for that device.
-    """
-    devices = set(devices).intersection(self._available_devices)
-
-    for device in devices:
-      device_out = self.run_on_device(model_fn, model_inputs, device=device)
-      self._compare_values(device_out, expected_outputs)
diff --git a/tensorflow/contrib/tpu/python/tpu/topology.py b/tensorflow/contrib/tpu/python/tpu/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..cda9a63f204ed686b527c95dd5b4fd7786ac60cf
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/topology.py
@@ -0,0 +1,137 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ======================================
+"""Defines the `Topology` class, that describes a TPU fabric topology."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tpu.proto import topology_pb2
+
+
+class Topology(object):
+  """Describes a set of TPU devices.
+
+  Represents both the shape of the physical mesh, and the mapping between
+  TensorFlow TPU devices to physical mesh coordinates.
+  """
+
+  def __init__(self, serialized=None, mesh_shape=None, device_coordinates=None):
+    """Builds a Topology object.
+
+    If `serialized` is not `None`, the topology is parsed from `serialized` and
+    the other arguments are ignored. Otherwise, the topology is computed from
+    `mesh_shape` and `device_coordinates`.
+
+    Args:
+      serialized: A serialized `TopologyProto`, or `None`. If not `None`, the
+        serialized proto is parsed to discover the topology.
+      mesh_shape: A sequence of 3 positive integers, or `None`. If not `None`,
+        the shape of the TPU topology, in number of cores. Ignored if
+        `serialized` is not `None`.
+      device_coordinates: A rank 3 numpy array that describes the mapping from
+        TensorFlow TPU devices to TPU fabric coordinates, or `None`. Ignored
+        if `serialized is not `None`.
+
+    Raises:
+      ValueError: If `serialized` does not describe a well-formed topology.
+      ValueError: If `serialized` is `None` and `mesh_shape` is not a sequence
+        of 3 positive integers.
+      ValueError: If `serialized` is `None` and `device_coordinates` is not a
+        rank 3 numpy int32 array that describes a valid coordinate mapping.
+    """
+
+    if serialized:
+      self._serialized = serialized
+      self._parse_topology(serialized)
+    else:
+      self._mesh_shape = np.asarray(mesh_shape, dtype=np.int32)
+      self._device_coordinates = np.asarray(device_coordinates, np.int32)
+      if len(self._mesh_shape) != 3 or any(self._mesh_shape < 1):
+        raise ValueError("`mesh_shape` must be a sequence of 3 positive "
+                         "entries; got {}".format(self._mesh_shape))
+
+      if (len(self._device_coordinates.shape) != 3 or
+          self._device_coordinates.shape[2] != len(self._mesh_shape)):
+        raise ValueError("`device_coordinates` must be a rank 3 int32 array "
+                         "with minor dimension equal to the mesh shape rank")
+
+  def _parse_topology(self, serialized):
+    """Parses a serialized `TopologyProto` into `self`."""
+    proto = topology_pb2.TopologyProto()
+    proto.ParseFromString(serialized)
+
+    self._mesh_shape = np.array(proto.mesh_shape, dtype=np.int32)
+    if len(self._mesh_shape) != 3 or any(self._mesh_shape < 1):
+      raise ValueError("`mesh_shape` must be a vector of size 3 with positive "
+                       "entries; got {}".format(self._mesh_shape))
+
+    if proto.num_tasks < 0:
+      raise ValueError("`num_tasks` must be >= 0; got {}".format(
+          proto.num_tasks))
+    if proto.num_tpu_devices_per_task < 0:
+      raise ValueError("`num_tpu_devices_per_task` must be >= 0; got {}".format(
+          proto.num_tpu_devices_per_task))
+
+    expected_coordinates_size = (
+        proto.num_tasks * proto.num_tpu_devices_per_task * len(
+            proto.mesh_shape))
+    if len(proto.device_coordinates) != expected_coordinates_size:
+      raise ValueError("`device_coordinates` must have shape num_tasks ({}) * "
+                       "num_tpu_devices_per_task ({}) * len(mesh_shape) ({}); "
+                       "got shape {}".format(proto.num_tasks,
+                                             proto.num_tpu_devices_per_task,
+                                             proto.mesh_shape,
+                                             len(proto.device_coordinates)))
+
+    coords = np.array(proto.device_coordinates, dtype=np.int32)
+    if any(coords < 0):
+      raise ValueError("`device_coordinates` must be >= 0")
+    coords = coords.reshape((proto.num_tasks, proto.num_tpu_devices_per_task,
+                             len(proto.mesh_shape)))
+    self._device_coordinates = coords
+
+  @property
+  def mesh_shape(self):
+    """A rank 1 int32 array describing the shape of the TPU topology."""
+    return self._mesh_shape
+
+  @property
+  def device_coordinates(self):
+    """Describes the mapping from TPU devices to topology coordinates.
+
+    Returns:
+      A rank 3 int32 array with shape `[tasks, devices, axis]`.
+      `tasks` is the number of tasks in the TPU cluster, `devices` is the number
+      of TPU devices per task, and `axis` is the number of axes in the TPU
+      cluster topology. Each entry gives the `axis`-th coordinate in the
+      topology of a task/device pair. TPU topologies are 3-dimensional, with
+      dimensions `(x, y, core number)`.
+    """
+    return self._device_coordinates
+
+  def serialized(self):
+    """Returns the serialized form of the topology."""
+    if self._serialized is None:
+      proto = topology_pb2.TopologyProto()
+      proto.mesh_shape[:] = list(self._mesh_shape)
+      proto.num_tasks = self._device_coordinates.shape[0]
+      proto.num_tpu_devices_per_task = self._device_coordinates.shape[1]
+      proto.device_coordinates = list(self._device_coordinates.flatten())
+      self._serialized = proto.SerializeToString()
+
+    return self._serialized
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index f3ddc097544b62a3bce813aa4fd3c58c3b1d7aa2..24596bdb0af66314d402a7f6e21a8f00ca06dfbe 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -32,15 +32,42 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
 
 
-_SUMMARY_OPS = ("ScalarSummary",)
-_PLACEHOLDER_OPS = ("Placeholder",)
+# Operations that indicate some error in the users graph, e.g. a placeholder
+# that's introduced outside of the infeed.
+_BLACKLISTED_OPS = set([
+    "Placeholder",
+])
+
+# These operations will currently fail to compile, but we should be able to
+# support them eventually via CPU offload or extending our operation set.
+_NOT_IMPLEMENTED_OPS = set([
+    "AudioSummary",
+    "AudioSummaryV2",
+    "HistogramSummary",
+    "ImageSummary",
+    "MergeSummary",
+    "Print",
+    "ScalarSummary",
+    "TensorSummary",
+    "TensorSummaryV2",
+    ])
+
+_TPU_REPLICATE_ATTR = "_tpu_replicate"
+
+
+def _tpu_system_device_name(job):
+  """Returns the device name for the TPU_SYSTEM device of `job`."""
+  if job is None:
+    return "/device:TPU_SYSTEM:0"
+  else:
+    return "/job:%s/device:TPU_SYSTEM:0" % job
 
 
 def initialize_system(embedding_config=None, job=None):
   """Initializes a distributed TPU system for use with TensorFlow.
 
   Args:
-    embedding_config: If not None, an EmbeddingLayerConfiguration proto
+    embedding_config: If not None, an `EmbeddingLayerConfiguration` proto
       describing the desired configuration of the hardware embedding lookup
       tables. If embedding_config is None, no hardware embeddings can be used.
     job: The job (the XXX in TensorFlow device specification /job:XXX)
@@ -48,27 +75,18 @@ def initialize_system(embedding_config=None, job=None):
       it is assumed there is only one job in the TensorFlow flock, and an
       error will be returned if this assumption does not hold.
   Returns:
-    Op which, when executed, will initialize the system.
+    A serialized `TopologyProto` that describes the TPU system. Note:
+      the topology must be evaluated using `Session.run` before it can be used.
   """
-  if job is None:
-    device_name = "/device:TPU_SYSTEM:0"
-  else:
-    device_name = "/job:%s/device:TPU_SYSTEM:0" % job
   config_string = ("" if embedding_config is None else
                    embedding_config.SerializeToString())
-  with ops.device(device_name):
-    init_distributed_tpu = tpu_ops.configure_distributed_tpu(
-        embedding_config=config_string)
-  return init_distributed_tpu
+  with ops.device(_tpu_system_device_name(job)):
+    return tpu_ops.configure_distributed_tpu(embedding_config=config_string)
 
 
 def shutdown_system(job=None):
   """Shuts down a running a distributed TPU system."""
-  if job is None:
-    device_name = "/device:TPU_SYSTEM:0"
-  else:
-    device_name = "/job:%s/device:TPU_SYSTEM:0" % job
-  with ops.device(device_name):
+  with ops.device(_tpu_system_device_name(job)):
     shutdown_distributed_tpu = tpu_ops.shutdown_distributed_tpu()
   return shutdown_distributed_tpu
 
@@ -80,23 +98,24 @@ def core(num):
     num: the virtual core number within each replica to which operators should
     be assigned.
   Returns:
-    A device name, suitable for passing to tf.device().
+    A device name, suitable for passing to `tf.device()`.
   """
   return "device:TPU_REPLICATED_CORE:{}".format(num)
 
 
 class TPUReplicateContext(control_flow_ops.ControlFlowContext):
-  """A ControlFlowContext for nodes inside a TPU computation.
+  """A `ControlFlowContext` for nodes inside a TPU computation.
 
-  The primary role of TPUReplicateContext is to mark operators inside a
+  The primary role of `TPUReplicateContext` is to mark operators inside a
   tpu.replicate() computation with the attribute "_tpu_replicate=XYZ", where XYZ
   is a unique name.
 
-  We use a ControlFlowContext to perform the annotation since it
+  We use a `ControlFlowContext` to perform the annotation since it
   integrates with Tensorflow constructs like ResourceVariables. For example,
-  if a ResourceVariable is constructed inside a tpu.replicate() block, the
-  ResourceVariable implementation can use "with ops.control_dependencies(None)"
-  to build the variable's definition outside the replicated computation.
+  if a `ResourceVariable` is constructed inside a tpu.replicate() block, the
+  `ResourceVariable` implementation can use
+  `with ops.control_dependencies(None)` to build the variable's definition
+  outside the replicated computation.
   """
 
   def __init__(self, name):
@@ -108,21 +127,22 @@ class TPUReplicateContext(control_flow_ops.ControlFlowContext):
 
   def _AddOpInternal(self, op):
     # pylint: disable=protected-access
-    if op.type in _PLACEHOLDER_OPS:
-      raise ValueError("Placeholder %s is not supported." % op.name)
+    if op.type in _BLACKLISTED_OPS:
+      raise ValueError("Operation of type %s (%s) is not supported on the TPU" %
+                       (op.type, op.name))
 
-    if op.type in _SUMMARY_OPS:
+    if op.type in _NOT_IMPLEMENTED_OPS:
       logging.warning(
-          "Summary operations are not currently supported (%s)" % op.name)
+          "Operation %s (%s) is not currently supported", op.type, op.name)
 
     if any(x.dtype._is_ref_dtype for x in op.inputs):
       raise NotImplementedError(
           "Non-resource Variables are not supported inside TPU computations "
           "(operator name: %s)" % op.name)
     # pylint: enable=protected-access
-    if "_tpu_replicate" in op.node_def.attr:
+    if _TPU_REPLICATE_ATTR in op.node_def.attr:
       raise ValueError("TPU computations cannot be nested")
-    op.node_def.attr["_tpu_replicate"].s = self._name
+    op.node_def.attr[_TPU_REPLICATE_ATTR].s = self._name
     op.graph.prevent_feeding(op)
     op.graph.prevent_fetching(op)
 
@@ -149,37 +169,47 @@ class TPUReplicateContext(control_flow_ops.ControlFlowContext):
 def replicate(computation,
               inputs=None,
               infeed_queue=None,
-              global_tpu_id=None,
+              device_assignment=None,
               name=None):
   """Builds a graph operator that runs a replicated TPU computation.
 
   Args:
-    computation: a Python function that builds the computation to replicate.
-    inputs: a list of lists of input tensors or None (equivalent to
-      [[]]), indexed by [replica_num][input_num]. All replicas must
+    computation: A Python function that builds the computation to replicate.
+    inputs: A list of lists of input tensors or `None` (equivalent to
+      `[[]]`), indexed by `[replica_num][input_num]`. All replicas must
       have the same number of inputs.
-    infeed_queue: if not None, the InfeedQueue from which to append a tuple
+    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
       of arguments as inputs to computation.
-    global_tpu_id: if not None, a Numpy 2D array indicating the global
-      id of each TPU device in the system. The outer dimension of the
-      array is host task id, and the inner dimension is device ordinal,
-      so e.g., global_tpu_id[x][y] indicates the global id of device
-      /task:x/device:TPU_NODE:y.
-    name: name of the operator.
+    device_assignment: If not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. Uses a default device assignment if `None`. The
+      `DeviceAssignment` may be omitted if each replica of the computation uses
+      only one core, and there is either only one replica, or the number of
+      replicas is equal to the number of cores in the TPU system.
+    name: The name of the operator.
   Returns:
-    A list of lists of output tensors, indexed by [replica_num][output_num].
+    A list of lists of output tensors, indexed by `[replica_num][output_num]`.
   Raises:
-    ValueError: if all replicas do not have equal numbers of input tensors.
-    ValueError: if the number of inputs per replica does not match
+    ValueError: If all replicas do not have equal numbers of input tensors.
+    ValueError: If the number of inputs per replica does not match
       the number of formal parameters to `computation`.
   """
   if name is None:
     name = "TPUReplicate"
   inputs = [[]] if inputs is None else inputs
 
-  if global_tpu_id is not None:
-    # Turn the Numpy array into a flattened list.
-    global_tpu_id = global_tpu_id.flatten().tolist()
+  metadata_kwargs = {}
+  if device_assignment is not None:
+    # Turn the Numpy array into a flattened list so we can pass it as an
+    # operator attribute.
+    metadata_kwargs = {
+        "topology":
+            device_assignment.topology.serialized(),
+        "device_assignment":
+            device_assignment.core_assignment.flatten().tolist(),
+        "computation_shape":
+            device_assignment.computation_shape.tolist()
+    }
 
   if ((not isinstance(inputs, list)) or
       any(not isinstance(inp, (list, tuple)) for inp in inputs)):
@@ -242,7 +272,7 @@ def replicate(computation,
       context.Enter()
 
       metadata = tpu_ops.tpu_replicate_metadata(
-          num_replicas=num_replicas, global_tpu_id=global_tpu_id)
+          num_replicas=num_replicas, **metadata_kwargs)
 
       with tpu_function.tpu_shard_context(
           num_replicas), ops.control_dependencies([metadata]):
@@ -349,7 +379,7 @@ def shard(computation,
           outputs_from_all_shards=True,
           output_shard_axes=None,
           infeed_queue=None,
-          global_tpu_id=None,
+          device_assignment=None,
           name=None):
   """Shards `computation` for parallel execution.
 
@@ -377,39 +407,40 @@ def shard(computation,
   Inputs and outputs of the computation must be at least rank-1 Tensors.
 
   Args:
-    computation: a Python function that builds a computation to apply to each
+    computation: A Python function that builds a computation to apply to each
       shard of the input.
-    inputs: a list of input tensors or None (equivalent to an empty
+    inputs: A list of input tensors or None (equivalent to an empty
       list). Each input tensor has a corresponding shard axes, given
       by `input_shard_axes`, which must have size divisible by
       `num_shards`.
-    num_shards: the number of shards.
-    input_shard_axes: a list of dimensions along which to shard `inputs`, or
+    num_shards: The number of shards.
+    input_shard_axes: A list of dimensions along which to shard `inputs`, or
       `None`. `None` means "shard all inputs along dimension 0". If not `None`,
       there must be one dimension per input.
-    outputs_from_all_shards: boolean or list of boolean. For each output, if
+    outputs_from_all_shards: Boolean or list of boolean. For each output, if
       `True`, outputs from all shards are concatenated along the corresponding
       `output_shard_axes` entry. Otherwise, each output is taken
       from an arbitrary shard. If the argument is a boolean, the argument's
       value is used for each output.
-    output_shard_axes: a list of dimensions along which to concatenate the
+    output_shard_axes: A list of dimensions along which to concatenate the
       outputs of `computation`, or `None`. `None` means "concatenate all outputs
       along dimension 0". If not `None`, there must be one dimension per output.
       Ignored if `outputs_from_all_shards` is False.
-    infeed_queue: if not None, the InfeedQueue to use to augment the inputs of
-      `computation`.
-    global_tpu_id: if not None, a Numpy 2D array indicating the global
-      id of each TPU device in the system. The outer dimension of the
-      array is host task id, and the inner dimension is device ordinal,
-      so e.g., global_tpu_id[x][y] indicates the global id of device
-      /task:x/device:TPU_NODE:y.
-    name: name of the operator.
+    infeed_queue: If not `None`, the `InfeedQueue` to use to augment the inputs
+      of `computation`.
+    device_assignment: If not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. Uses a default device assignment if `None`. The
+      `DeviceAssignment` may be omitted if each shard of the computation uses
+      only one core, and there is either only one shard, or the number of shards
+      is equal to the number of cores in the TPU system.
+    name: The name of the operator.
   Returns:
     A list of output tensors.
   Raises:
-    ValueError: if num_shards <= 0
-    ValueError: if len(input_shard_axes) != len(inputs)
-    ValueError: if len(output_shard_axes) != len(outputs from `computation`)
+    ValueError: If num_shards <= 0
+    ValueError: If len(input_shard_axes) != len(inputs)
+    ValueError: If len(output_shard_axes) != len(outputs from `computation`)
   """
 
   if num_shards <= 0:
@@ -440,7 +471,7 @@ def shard(computation,
       computation,
       transposed_inputs,
       infeed_queue=infeed_queue,
-      global_tpu_id=global_tpu_id,
+      device_assignment=device_assignment,
       name=name)
 
   # There must be at least one shard since num_shards > 0.
@@ -494,7 +525,7 @@ def batch_parallel(computation,
                    inputs=None,
                    num_shards=1,
                    infeed_queue=None,
-                   global_tpu_id=None,
+                   device_assignment=None,
                    name=None):
   """Shards `computation` along the batch dimension for parallel execution.
 
@@ -518,55 +549,55 @@ def batch_parallel(computation,
   Inputs and outputs of the computation must be at least rank-1 Tensors.
 
   Args:
-    computation: a Python function that builds a computation to apply to each
+    computation: A Python function that builds a computation to apply to each
       shard of the input.
-    inputs: a list of input tensors or None (equivalent to an empty
+    inputs: A list of input tensors or None (equivalent to an empty
       list). The 0-th dimension of each Tensor must have size
       divisible by `num_shards`.
-    num_shards: the number of shards.
-    infeed_queue: if not None, the InfeedQueue from which to append a tuple
+    num_shards: The number of shards.
+    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
       of arguments as inputs to `computation`.
-    global_tpu_id: if not None, a Numpy 2D array indicating the global
-      id of each TPU device in the system. The outer dimension of the
-      array is host task id, and the inner dimension is device ordinal,
-      so e.g., global_tpu_id[x][y] indicates the global id of device
-      /task:x/device:TPU_NODE:y.
-    name: name of the operator.
+    device_assignment: If not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. Uses a default device assignment if `None`. The
+      `DeviceAssignment` may be omitted if each shard of the computation uses
+      only one core, and there is either only one shard, or the number of shards
+      is equal to the number of cores in the TPU system.
+    name: The name of the operator.
   Returns:
     A list of output tensors.
   Raises:
-    ValueError: if num_shards <= 0
+    ValueError: If `num_shards <= 0`
   """
   return shard(
       computation,
       inputs,
       num_shards=num_shards,
       infeed_queue=infeed_queue,
-      global_tpu_id=global_tpu_id,
+      device_assignment=device_assignment,
       name=name)
 
 
 def rewrite(computation,
             inputs=None,
             infeed_queue=None,
-            global_tpu_id=None,
+            device_assignment=None,
             name=None):
   """Rewrites `computation` for execution on a TPU system.
 
   Args:
-    computation: a Python function that builds a computation to apply
+    computation: A Python function that builds a computation to apply
       to the input. If the function takes n inputs, 'inputs' should be
       a list of n tensors. If the function returns m outputs, rewrite
       will return a list of m tensors.
-    inputs: a list of input tensors or None (equivalent to an empty list).
-    infeed_queue: if not None, the InfeedQueue from which to append a tuple
+    inputs: A list of input tensors or `None` (equivalent to an empty list).
+    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
       of arguments as inputs to `computation`.
-    global_tpu_id: if not None, a Numpy 2D array indicating the global
-      id of each TPU device in the system. The outer dimension of the
-      array is host task id, and the inner dimension is device ordinal,
-      so e.g., global_tpu_id[x][y] indicates the global id of device
-      /task:x/device:TPU_NODE:y.
-    name: name of the operator.
+    device_assignment: if not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. May be omitted for a single-core computation, in which
+      case the core attached to task 0, TPU device 0 is used.
+    name: The name of the operator.
   Returns:
     A list of output tensors.
   """
@@ -579,6 +610,6 @@ def rewrite(computation,
       computation,
       None if inputs is None else [inputs],
       infeed_queue=infeed_queue,
-      global_tpu_id=global_tpu_id,
+      device_assignment=device_assignment,
       name=name)[0]
   # pylint: enable=indexing-exception
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
index 916b9b3082fc197694933bdd6042706891be115c..77ce38991bb6c952b7e378f9fa8a527421d45803 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
@@ -31,6 +31,7 @@ class TPUConfig(
         'num_shards',
         'per_host_input_for_training',
         'tpu_job_name',
+        'initial_infeed_sleep_secs',
     ])):
   """TPU related configuration required by `TPUEstimator`.
 
@@ -50,13 +51,17 @@ class TPUConfig(
       within TPUEstimator, however when using ClusterSpec propagation in more
       esoteric cluster configurations, you may need to specify the job name as a
       string.
+    initial_infeed_sleep_secs: The number of seconds the infeed thread should
+      wait before enqueueing the first batch. This helps avoid timeouts for
+      models that require a long compilation time.
   """
 
   def __new__(cls,
               iterations_per_loop=2,
               num_shards=2,
               per_host_input_for_training=True,
-              tpu_job_name=None):
+              tpu_job_name=None,
+              initial_infeed_sleep_secs=None):
 
     # Check iterations_per_loop.
     util_lib.check_positive_integer(iterations_per_loop,
@@ -64,12 +69,18 @@ class TPUConfig(
 
     # Check num_shards.
     util_lib.check_positive_integer(num_shards, 'TPUConfig num_shards')
+
+    # Check initial_infeed_sleep_secs.
+    if initial_infeed_sleep_secs:
+      util_lib.check_positive_integer(initial_infeed_sleep_secs,
+                                      'TPUConfig initial_infeed_sleep_secs')
     return super(TPUConfig, cls).__new__(
         cls,
         iterations_per_loop=iterations_per_loop,
         num_shards=num_shards,
         per_host_input_for_training=per_host_input_for_training,
-        tpu_job_name=tpu_job_name)
+        tpu_job_name=tpu_job_name,
+        initial_infeed_sleep_secs=initial_infeed_sleep_secs)
 
 
 class RunConfig(run_config_lib.RunConfig):
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 97b2d25e0cf81b1dbf72bc97f5e6ee9c04b8c690..6bf11e1ae535f26cfa8326e224f2b8d4e0124b13 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -23,6 +23,8 @@ import collections
 from contextlib import contextmanager
 import copy
 import threading
+import time
+
 import six
 from six.moves import queue as Queue  # pylint: disable=redefined-builtin
 
@@ -30,7 +32,6 @@ from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.contrib.tpu.python.tpu import tpu_config
 from tensorflow.contrib.tpu.python.tpu import tpu_feed
-from tensorflow.contrib.tpu.python.tpu import tpu_function
 from tensorflow.contrib.tpu.python.tpu import training_loop
 from tensorflow.contrib.tpu.python.tpu import util as util_lib
 
@@ -363,13 +364,17 @@ class TPUEstimatorSpec(collections.namedtuple('TPUEstimatorSpec', [
     'loss',
     'train_op',
     'eval_metrics',
-    'export_outputs'])):
+    'export_outputs',
+    'scaffold_fn'])):
   """Ops and objects returned from a `model_fn` and passed to `TPUEstimator`.
 
   See `EstimatorSpec` for `mode`, 'predictions, 'loss', 'train_op', and
   'export_outputs`.
 
-  TPU evaluation expects a slightly different signature from the
+  For evaluation, `eval_metrics `is a tuple of `metric_fn` and `tensors`, where
+  `metric_fn` runs on CPU to generate metrics and `tensors` represents the
+  `Tensor`s transferred from TPU system to CPU host and passed to `metric_fn`.
+  To be precise, TPU evaluation expects a slightly different signature from the
   ${tf.estimator.Estimator}. While `EstimatorSpec.eval_metric_ops` expects a
   dict, `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`.
   The `tensors` could be a list of `Tensor`s or dict of names to `Tensor`s. The
@@ -380,9 +385,11 @@ class TPUEstimatorSpec(collections.namedtuple('TPUEstimatorSpec', [
   to the `metric_fn` if `tensors` is list or keyword arguments if `tensors` is
   dict. `metric_fn` takes the `tensors` and returns a dict from metric string
   name to the result of calling a metric function, namely a `(metric_tensor,
-  update_op)` tuple.
+  update_op)` tuple. See `TPUEstimator` for MNIST example how to specify the
+  `eval_metrics`.
 
-  See `TPUEstimator` for MNIST example how to specify the `eval_metrics`.
+  `scaffold_fn` is a function running on CPU to generate the `Scaffold`. This
+  function should not capture any Tensors in `model_fn`.
   """
 
   def __new__(cls,
@@ -391,7 +398,8 @@ class TPUEstimatorSpec(collections.namedtuple('TPUEstimatorSpec', [
               loss=None,
               train_op=None,
               eval_metrics=None,
-              export_outputs=None):
+              export_outputs=None,
+              scaffold_fn=None):
     """Creates a validated `TPUEstimatorSpec` instance."""
     if eval_metrics is not None:
       _EvalMetrics.validate(eval_metrics)
@@ -401,18 +409,21 @@ class TPUEstimatorSpec(collections.namedtuple('TPUEstimatorSpec', [
                                                 loss=loss,
                                                 train_op=train_op,
                                                 eval_metrics=eval_metrics,
-                                                export_outputs=export_outputs)
+                                                export_outputs=export_outputs,
+                                                scaffold_fn=scaffold_fn)
 
   def as_estimator_spec(self):
     """Creates an equivalent `EstimatorSpec` used by CPU train/eval."""
     eval_metric_ops = _EvalMetrics.to_metric_metric_ops_for_cpu(
         self.eval_metrics)
+    scaffold = self.scaffold_fn() if self.scaffold_fn else None
     return model_fn_lib.EstimatorSpec(mode=self.mode,
                                       predictions=self.predictions,
                                       loss=self.loss,
                                       train_op=self.train_op,
                                       eval_metric_ops=eval_metric_ops,
-                                      export_outputs=self.export_outputs)
+                                      export_outputs=self.export_outputs,
+                                      scaffold=scaffold)
 
 
 class _InfeedOutfeedThreadBaseController(object):
@@ -465,13 +476,20 @@ class _OutfeedThreadController(_InfeedOutfeedThreadBaseController):
 class _InfeedThreadController(_InfeedOutfeedThreadBaseController):
   """This wraps the infeed thread and stops when Estimator finishes."""
 
-  def __init__(self, session, enqueue_ops):
+  def __init__(self, session, enqueue_ops, initial_infeed_sleep_secs):
     super(_InfeedThreadController, self).__init__(
-        threading.Thread(target=self._input_thread_fn_for_loading,
-                         args=(session, enqueue_ops)))
+        threading.Thread(
+            target=self._input_thread_fn_for_loading,
+            args=(session, enqueue_ops, initial_infeed_sleep_secs)))
 
-  def _input_thread_fn_for_loading(self, session, enqueue_ops):
+  def _input_thread_fn_for_loading(self, session, enqueue_ops,
+                                   initial_infeed_sleep_secs):
     count = 0
+    if initial_infeed_sleep_secs:
+      logging.info('Infeed thread sleeping for %d seconds.',
+                   initial_infeed_sleep_secs)
+      time.sleep(initial_infeed_sleep_secs)
+      logging.info('Infeed thread starting after sleep')
     try:
       while True:
         signal = self._signal_queue.get()
@@ -490,11 +508,29 @@ class _InfeedThreadController(_InfeedOutfeedThreadBaseController):
           count += 1
 
     except Exception:  # pylint: disable=broad-except
+      # Close the session to avoid the main thread from hanging. If input
+      # pipeline triggers any error, the infeed thread dies but the main thread
+      # for TPU computation waits for the infeed enqueue forever. Close the
+      # Session to cancel the main thread Session.run execution.
+      #
+      # However, sleep for 2 minutes before explicit closing to give some time
+      # for the TPU compilation error, if any, propagating, from TPU to CPU
+      # host. Compilation errors should be reported by the main thread so that
+      # the program can be interrupted and users can take action.  Due to a race
+      # condition, the infeed thread might see an error first.  Closing the
+      # session here immediately would result in a session cancellation
+      # exception in the main thread, instead of the expected compile error.
+      # User code that depends on having the proper exception type will
+      # therefore be confused.
       logging.error(
           'Failed running infeed, closing session.\n'
-          'You may see an exception from your main session after this.',
+          'You may see an exception from your main session after this. '
+          'Sleep for 2 minutes before close Session from infeed thread to '
+          'allow the main thread returning an error first, if any.',
           exc_info=1
       )
+      time.sleep(120)
+      logging.error('Closing the failed session.')
       session.close()
 
   def join(self):
@@ -515,6 +551,8 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
     self._master_job = ctx.master_job
     self._enqueue_ops = enqueue_ops
     self._dequeue_ops = dequeue_ops
+    self._initial_infeed_sleep_secs = (
+        ctx.config.tpu_config.initial_infeed_sleep_secs)
 
   def begin(self):
     logging.info('TPU job name %s', self._master_job)
@@ -529,7 +567,7 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
 
     logging.info('Start infeed thread controller')
     self._infeed_thd_controller = _InfeedThreadController(
-        session, self._enqueue_ops)
+        session, self._enqueue_ops, self._initial_infeed_sleep_secs)
 
     if self._dequeue_ops is not None:
       logging.info('Start outfeed thread controller')
@@ -650,7 +688,7 @@ class _SetEvalIterationsHook(session_run_hook.SessionRunHook):
 def generate_per_core_enqueue_ops_fn_for_host(
     ctx, input_fn, inputs_structure_recorder):
   """Generates infeed enqueue ops for per-core input_fn on a single host."""
-  infeed_queue_holder = {'instance': None}
+  captured_infeed_queue = _CapturedObject()
 
   def enqueue_ops_fn():
     """A fn returns enqueue_ops."""
@@ -673,7 +711,7 @@ def generate_per_core_enqueue_ops_fn_for_host(
 
     infeed_queue = tpu_feed.InfeedQueue(
         number_of_tuple_elements=len(per_host_sharded_inputs[0]))
-    infeed_queue_holder['instance'] = infeed_queue
+    captured_infeed_queue.capture(infeed_queue)
     infeed_queue.set_configuration_from_sharded_input_tensors(
         per_host_sharded_inputs)
 
@@ -681,13 +719,13 @@ def generate_per_core_enqueue_ops_fn_for_host(
         per_host_sharded_inputs,
         tpu_ordinal_function=ctx.tpu_ordinal_function)
     return per_host_enqueue_ops
-  return enqueue_ops_fn, (lambda: infeed_queue_holder['instance'])
+  return enqueue_ops_fn, captured_infeed_queue
 
 
 def generate_per_host_enqueue_ops_fn_for_host(
     ctx, input_fn, inputs_structure_recorder, batch_axis, device):
   """Generates infeed enqueue ops for per-host input_fn on a single host."""
-  infeed_queue_holder = {'instance': None}
+  captured_infeed_queue = _CapturedObject()
 
   def enqueue_ops_fn():
     with ops.device(device):
@@ -707,7 +745,7 @@ def generate_per_host_enqueue_ops_fn_for_host(
           tuple_types=[t.dtype for t in unsharded_tensor_list],
           tuple_shapes=[t.shape for t in unsharded_tensor_list],
           shard_dimensions=batch_axis)
-      infeed_queue_holder['instance'] = infeed_queue
+      captured_infeed_queue.capture(infeed_queue)
       infeed_queue.set_number_of_shards(num_cores_per_host)
 
       per_host_enqueue_ops = (
@@ -715,7 +753,7 @@ def generate_per_host_enqueue_ops_fn_for_host(
               unsharded_tensor_list,
               placement_function=lambda x: device))
       return per_host_enqueue_ops
-  return enqueue_ops_fn, (lambda: infeed_queue_holder['instance'])
+  return enqueue_ops_fn, captured_infeed_queue
 
 
 class _InputPipeline(object):
@@ -905,7 +943,7 @@ class _InputPipeline(object):
         host_device = tpu_host_placement_fn(host_id=host_id)
         with ops.device(host_device):
           with ops.name_scope('input_pipeline_task%d' % (host_id)):
-            enqueue_ops_fn, infeed_queue_getter = (
+            enqueue_ops_fn, captured_infeed_queue = (
                 generate_per_core_enqueue_ops_fn_for_host(
                     self._ctx, self._input_fn, self._inputs_structure_recorder))
 
@@ -915,14 +953,14 @@ class _InputPipeline(object):
             else:
               enqueue_ops.append(enqueue_ops_fn())
             # Infeed_queue_getter must be called after enqueue_ops_fn is called.
-            infeed_queues.append(infeed_queue_getter())
+            infeed_queues.append(captured_infeed_queue.get())
 
     else:
       for host_id in range(num_hosts):
         host_device = tpu_host_placement_fn(host_id=host_id)
         with ops.device(host_device):
           with ops.name_scope('input_pipeline_task%d' % (host_id)):
-            enqueue_ops_fn, infeed_queue_getter = (
+            enqueue_ops_fn, captured_infeed_queue = (
                 generate_per_host_enqueue_ops_fn_for_host(
                     self._ctx, self._input_fn, self._inputs_structure_recorder,
                     self._batch_axis, host_device))
@@ -932,7 +970,7 @@ class _InputPipeline(object):
                   device=host_device, op_fn=enqueue_ops_fn))
             else:
               enqueue_ops.append(enqueue_ops_fn())
-            infeed_queues.append(infeed_queue_getter())
+            infeed_queues.append(captured_infeed_queue.get())
     # infeed_queue is used to generate dequeue ops. The only thing it uses for
     # dequeue is dtypes and types. So, any one can be used. Here, grab the
     # first one.
@@ -973,10 +1011,7 @@ class _ModelFnWrapper(object):
     self._ctx = ctx
 
   def call_without_tpu(self, features, labels):
-    # Let CrossShardOptimizer be called without TPU in model_fn, since it's
-    # common to set the train_op even when running evaluate() or predict().
-    with tpu_function.tpu_shard_context(1):
-      return self._call_model_fn(features, labels)
+    return self._call_model_fn(features, labels)
 
   def convert_to_single_tpu_train_step(self, dequeue_fn):
     """Converts user provided model_fn` as a single train step on TPU.
@@ -1000,6 +1035,8 @@ class _ModelFnWrapper(object):
       A Fn representing the train step for TPU.
     """
 
+    captured_scaffold_fn = _CapturedObject()
+
     def train_step(loss):
       """Training step function for use inside a while loop."""
       del loss  # unused; required in function signature.
@@ -1008,9 +1045,15 @@ class _ModelFnWrapper(object):
       estimator_spec = self._verify_estimator_spec(
           self._call_model_fn(features, labels))
       loss, train_op = estimator_spec.loss, estimator_spec.train_op
+
+      if isinstance(estimator_spec, TPUEstimatorSpec):
+        captured_scaffold_fn.capture(estimator_spec.scaffold_fn)
+      else:
+        captured_scaffold_fn.capture(None)
+
       with ops.control_dependencies([train_op]):
         return array_ops.identity(loss)
-    return train_step
+    return train_step, captured_scaffold_fn
 
   def convert_to_single_tpu_eval_step(self, dequeue_fn):
     """Converts user provided model_fn` as a single eval step on TPU.
@@ -1039,6 +1082,7 @@ class _ModelFnWrapper(object):
       step for TPU. and eval_metrics is an `_EvalMetrics` instance.
     """
     eval_metrics = _EvalMetrics(self._ctx)
+    captured_scaffold_fn = _CapturedObject()
 
     def eval_step(total_loss):
       """Evaluation step function for use inside a while loop."""
@@ -1051,12 +1095,13 @@ class _ModelFnWrapper(object):
             '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec)))
 
       loss = tpu_estimator_spec.loss
+      captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
       eval_metrics.record(tpu_estimator_spec)
       outfeed_ops = tpu_ops.outfeed_enqueue_tuple(eval_metrics.outfeed_tensors)
 
       with ops.control_dependencies([outfeed_ops]):
         return math_ops.add(total_loss, loss)
-    return eval_step, eval_metrics
+    return eval_step, eval_metrics, captured_scaffold_fn
 
   def _call_model_fn(self, features, labels):
     """Calls the model_fn with required parameters."""
@@ -1110,6 +1155,10 @@ class _ModelFnWrapper(object):
       raise ValueError(err_msg.format('training_hooks'))
     if estimator_spec.evaluation_hooks:
       raise ValueError(err_msg.format('evaluation_hooks'))
+
+    if estimator_spec.scaffold:
+      logging.warning('EstimatorSpec.Scaffold is ignored by TPU train/eval. '
+                      'Please use TPUEstimatorSpec.')
     return estimator_spec
 
 
@@ -1578,7 +1627,8 @@ class TPUEstimator(estimator_lib.Estimator):
             input_holders.generate_infeed_enqueue_ops_and_dequeue_fn())
 
         if mode == model_fn_lib.ModeKeys.TRAIN:
-          loss = _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn)
+          loss, scaffold = (
+              _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
           hooks = [
               TPUInfeedOutfeedSessionHook(ctx, enqueue_ops),
               training.LoggingTensorHook(
@@ -1597,10 +1647,11 @@ class TPUEstimator(estimator_lib.Estimator):
               mode,
               loss=loss,
               training_hooks=hooks,
-              train_op=control_flow_ops.group(*update_ops))
+              train_op=control_flow_ops.group(*update_ops),
+              scaffold=scaffold)
 
         # Now eval.
-        total_loss, eval_metric_ops = _eval_on_tpu_system(
+        total_loss, eval_metric_ops, scaffold = _eval_on_tpu_system(
             ctx, model_fn_wrapper, dequeue_fn)
         iterations_per_loop_var = _create_or_get_iterations_per_loop()
         mean_loss = math_ops.div(
@@ -1631,7 +1682,8 @@ class TPUEstimator(estimator_lib.Estimator):
             mode,
             loss=mean_loss,
             evaluation_hooks=hooks,
-            eval_metric_ops=eval_metric_ops)
+            eval_metric_ops=eval_metric_ops,
+            scaffold=scaffold)
     return _model_fn
 
 
@@ -1640,7 +1692,7 @@ def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   num_cores = ctx.num_cores
   iterations_per_loop_var = _create_or_get_iterations_per_loop()
 
-  single_tpu_eval_step, eval_metric_ops = (
+  single_tpu_eval_step, eval_metric_ops, captured_scaffold_fn = (
       model_fn_wrapper.convert_to_single_tpu_eval_step(dequeue_fn))
 
   def multi_tpu_eval_steps_on_single_shard():
@@ -1653,7 +1705,9 @@ def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
                       inputs=[],
                       num_shards=num_cores,
                       outputs_from_all_shards=False)
-  return loss, eval_metric_ops
+
+  scaffold = _get_scaffold(captured_scaffold_fn)
+  return loss, eval_metric_ops, scaffold
 
 
 def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
@@ -1661,8 +1715,8 @@ def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   num_cores = ctx.num_cores
   iterations_per_loop_var = _create_or_get_iterations_per_loop()
 
-  single_tpu_train_step = model_fn_wrapper.convert_to_single_tpu_train_step(
-      dequeue_fn)
+  single_tpu_train_step, captured_scaffold_fn = (
+      model_fn_wrapper.convert_to_single_tpu_train_step(dequeue_fn))
 
   def multi_tpu_train_steps_on_single_shard():
     return training_loop.repeat(
@@ -1675,7 +1729,9 @@ def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
                       inputs=[],
                       num_shards=num_cores,
                       outputs_from_all_shards=False)
-  return loss
+
+  scaffold = _get_scaffold(captured_scaffold_fn)
+  return loss, scaffold
 
 
 def _wrap_computation_in_while_loop(device, op_fn):
@@ -1711,3 +1767,76 @@ def _validate_tpu_training_graph():
         'CrossShardOptimizer must be used for model training on TPUs.')
 
 
+class _CapturedObject(object):
+  """A placeholder to capture an object.
+
+  This is useful when we need to capture a Python object in the Tensorflow
+  control flow body function and use it outside the control flow.
+  """
+
+  def __init__(self):
+    self._object = None
+    self._captured = False
+
+  def capture(self, o):
+    if self._captured:
+      raise RuntimeError(
+          'InternalError: Object can be captured only. Please file bug .')
+
+    self._captured = True
+    self._object = o
+
+  def get(self):
+    if not self._captured:
+      raise RuntimeError(
+          'InternalError: Object is not captured properly before `get`. '
+          'Please file bug .')
+    return self._object
+
+
+def _get_scaffold(captured_scaffold_fn):
+  """Retrieves the Scaffold from `captured_scaffold_fn`."""
+  with _CapturingContext(message='Inside scaffold_fn'):
+    scaffold_fn = captured_scaffold_fn.get()
+    if scaffold_fn:
+      scaffold = scaffold_fn()
+      if scaffold is None:
+        raise ValueError(
+            'TPUEstimatorSpec.scaffold_fn returns None, which is not allowed')
+    else:
+      scaffold = None
+
+  if scaffold:
+    wrapped_finalize = scaffold.finalize
+    def _finalize():
+      with _CapturingContext('Inside Scaffold.finalize'):
+        wrapped_finalize()
+    scaffold.finalize = _finalize
+  return scaffold
+
+
+class _CapturingContext(control_flow_ops.ControlFlowContext):
+  """Tracks references to Tensors defined in TPU replication."""
+
+  def __init__(self, message):
+    control_flow_ops.ControlFlowContext.__init__(self)
+    self._message = message
+
+  def AddOp(self, op):  # pylint: disable=invalid-name
+    for c in op.inputs:
+      if tpu._TPU_REPLICATE_ATTR in c.op.node_def.attr:  # pylint: disable=protected-access
+        raise ValueError(
+            '{}: Op {} depends on TPU computation {}, '
+            'which is not allowed.'.format(self._message, op, c))
+
+  def __enter__(self):
+    # pylint: disable=protected-access
+    self._g = ops.get_default_graph()
+    self._old = self._g._get_control_flow_context()
+    self._g._set_control_flow_context(self)
+    # pylint: enable=protected-access
+
+  def __exit__(self, _, __, ___):  # pylint: disable=invalid-name
+    self._g._set_control_flow_context(self._old)  # pylint: disable=protected-access
+
+
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
index 1c8ea63f00ba4b2298abd8053a7fe8702b6fc0bc..42ac6eb680437ec82287468bcba2b770ac0e5749 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
@@ -513,7 +513,7 @@ class InfeedQueue(object):
   # for automatic placement of input pipelines.
   def split_inputs_and_generate_enqueue_ops(self,
                                             inputs,
-                                            global_tpu_id=None,
+                                            device_assignment=None,
                                             placement_function=None,
                                             tpu_ordinal_function=None):
     """POORLY-PERFORMING ON MULTI-HOST SYSTEMS.
@@ -536,14 +536,12 @@ class InfeedQueue(object):
     Args:
       inputs: a list of Tensors which indicates the types and shapes of the
         queue tuple.
-     global_tpu_id: if not None, a Numpy 2D array indicating the global
-        id of each TPU device in the system. The outer dimension of the
-        array is host task id, and the inner dimension is device ordinal,
-        so e.g., global_tpu_id[x][y] indicates the global id of device
-        /task:x/device:TPU_NODE:y. If global_tpu_id is not None, but
-        placement_function and ordinal_function are None, then global_tpu_id
-        will be used to place infeed on the TPUs with the first k global ids,
-        where k is the number of shards in the queue.
+     device_assignment: if not `None`, a TPU `DeviceAssignment`. If
+        device_assignment is not `None`, but `placement_function` and
+        `ordinal_function` are None, then `device_assignment` will be used to
+        place infeeds on the first k TPU shards, where k is the number of shards
+        in the queue. If all three are `None`, then default placement and
+        ordinal functions are used.
       placement_function: if not None, a function that takes the shard
         index as input and returns a device string indicating which
         device the shard's infeed should be placed on. If placement_function
@@ -567,22 +565,18 @@ class InfeedQueue(object):
         types of the elements of inputs are not compatible with the frozen
         configuration.
     """
-    if global_tpu_id is None:
+    if device_assignment is None:
       if placement_function is None:
         placement_function = self._default_placement_function
       if tpu_ordinal_function is None:
         tpu_ordinal_function = self._default_ordinal_function
     else:
-      global_id_map = {}
-      for host, devices in enumerate(global_tpu_id):
-        for ordinal, global_id in enumerate(devices):
-          global_id_map[global_id] = (host, ordinal)
 
       def _placement_function_from_map(index):
-        return "/task:%d/device:CPU:0" % global_id_map[index][0]
+        return device_assignment.host_device(replica=index)
 
       def _ordinal_function_from_map(index):
-        return global_id_map[index][1]
+        return device_assignment.tpu_ordinal(replica=index)
 
       if placement_function is None:
         placement_function = _placement_function_from_map
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py b/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py
index a00fd1d0869ab4403d879d2fc08f2bba0a13a7a8..e76cf83e4ddcd86ab3971bcecefe2e2dc979bf63 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py
@@ -22,6 +22,7 @@ from __future__ import print_function
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu_function
 from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import optimizer
 
 
@@ -74,8 +75,10 @@ class CrossShardOptimizer(optimizer.Optimizer):
     """
     num_shards = tpu_function.get_tpu_context().number_of_shards
     if num_shards is None:
-      raise ValueError("CrossShardOptimizer must be used within a "
-                       "tpu_shard_context.")
+      logging.warning(
+          "CrossShardOptimizer should be used within a tpu_shard_context, but "
+          "got unset number_of_shards. Assuming 1.")
+      num_shards = 1
     if num_shards > 1 and self._reduction == losses.Reduction.MEAN:
       scale = 1.0 / num_shards
       loss *= scale
diff --git a/tensorflow/contrib/training/python/training/hparam.py b/tensorflow/contrib/training/python/training/hparam.py
index 7db625cdd59a2a110809d305c7b43cc110a93534..80de0f6eb7e36a1c86f7d44e4053a9757b09f0ae 100644
--- a/tensorflow/contrib/training/python/training/hparam.py
+++ b/tensorflow/contrib/training/python/training/hparam.py
@@ -550,13 +550,26 @@ class HParams(object):
   def get_model_structure(self):
     return self._model_structure
 
-  def to_json(self):
+  def to_json(self, indent=None, separators=None, sort_keys=False):
     """Serializes the hyperparameters into JSON.
 
+    Args:
+      indent: If a non-negative integer, JSON array elements and object members
+        will be pretty-printed with that indent level. An indent level of 0, or
+        negative, will only insert newlines. `None` (the default) selects the
+        most compact representation.
+      separators: Optional `(item_separator, key_separator)` tuple. Default is
+        `(', ', ': ')`.
+      sort_keys: If `True`, the output dictionaries will be sorted by key.
+
     Returns:
       A JSON string.
     """
-    return json.dumps(self.values())
+    return json.dumps(
+        self.values(),
+        indent=indent,
+        separators=separators,
+        sort_keys=sort_keys)
 
   def parse_json(self, values_json):
     """Override hyperparameter values, parsing new values from a json object.
@@ -582,6 +595,33 @@ class HParams(object):
     """
     return {n: getattr(self, n) for n in self._hparam_types.keys()}
 
+  def get(self, key, default=None):
+    """Returns the value of `key` if it exists, else `default`."""
+    if key in self._hparam_types:
+      # Ensure that default is compatible with the parameter type.
+      if default is not None:
+        param_type, is_param_list = self._hparam_types[key]
+        type_str = 'list<%s>' % param_type if is_param_list else str(param_type)
+        fail_msg = ("Hparam '%s' of type '%s' is incompatible with "
+                    'default=%s' % (key, type_str, default))
+
+        is_default_list = isinstance(default, list)
+        if is_param_list != is_default_list:
+          raise ValueError(fail_msg)
+
+        try:
+          if is_default_list:
+            for value in default:
+              _cast_to_type_if_compatible(key, param_type, value)
+          else:
+            _cast_to_type_if_compatible(key, param_type, default)
+        except ValueError as e:
+          raise ValueError('%s. %s' % (fail_msg, e))
+
+      return getattr(self, key)
+
+    return default
+
   def __contains__(self, key):
     return key in self._hparam_types
 
diff --git a/tensorflow/contrib/training/python/training/hparam_test.py b/tensorflow/contrib/training/python/training/hparam_test.py
index 949c262f5bbc11657347fefcff175147fa13059a..28e4b4d01eda9bef07ff7929f74894e09a3e987c 100644
--- a/tensorflow/contrib/training/python/training/hparam_test.py
+++ b/tensorflow/contrib/training/python/training/hparam_test.py
@@ -292,6 +292,16 @@ class HParamsTest(test.TestCase):
     self.assertEqual('relu4', hparams2.c_c)
     self.assertEqual(False, hparams2.d)
 
+    hparams3 = hparam.HParams(aaa=123)
+    self.assertEqual('{"aaa": 123}', hparams3.to_json())
+    self.assertEqual('{\n  "aaa": 123\n}', hparams3.to_json(indent=2))
+    self.assertEqual('{"aaa"=123}', hparams3.to_json(separators=(';', '=')))
+
+    hparams4 = hparam.HParams(aaa=123, b='hello', c_c=False)
+    self.assertEqual(
+        '{"aaa": 123, "b": "hello", "c_c": false}',
+        hparams4.to_json(sort_keys=True))
+
   def testSetHParam(self):
     hparams = hparam.HParams(aaa=1, b=2.0, c_c='relu6', d=True)
     self.assertDictEqual({
@@ -364,6 +374,49 @@ class HParamsTest(test.TestCase):
     with self.assertRaisesRegexp(AssertionError, ''):
       hparam.HParams(hparam_def=[1, 2, 3])
 
+  def testGet(self):
+    hparams = hparam.HParams(aaa=1, b=2.0, c_c='relu6', d=True, e=[5.0, 6.0])
+
+    # Existing parameters with default=None.
+    self.assertEqual(1, hparams.get('aaa'))
+    self.assertEqual(2.0, hparams.get('b'))
+    self.assertEqual('relu6', hparams.get('c_c'))
+    self.assertEqual(True, hparams.get('d'))
+    self.assertEqual([5.0, 6.0], hparams.get('e', None))
+
+    # Existing parameters with compatible defaults.
+    self.assertEqual(1, hparams.get('aaa', 2))
+    self.assertEqual(2.0, hparams.get('b', 3.0))
+    self.assertEqual(2.0, hparams.get('b', 3))
+    self.assertEqual('relu6', hparams.get('c_c', 'default'))
+    self.assertEqual(True, hparams.get('d', True))
+    self.assertEqual([5.0, 6.0], hparams.get('e', [1.0, 2.0, 3.0]))
+    self.assertEqual([5.0, 6.0], hparams.get('e', [1, 2, 3]))
+
+    # Existing parameters with incompatible defaults.
+    with self.assertRaises(ValueError):
+      hparams.get('aaa', 2.0)
+
+    with self.assertRaises(ValueError):
+      hparams.get('b', False)
+
+    with self.assertRaises(ValueError):
+      hparams.get('c_c', [1, 2, 3])
+
+    with self.assertRaises(ValueError):
+      hparams.get('d', 'relu')
+
+    with self.assertRaises(ValueError):
+      hparams.get('e', 123.0)
+
+    with self.assertRaises(ValueError):
+      hparams.get('e', ['a', 'b', 'c'])
+
+    # Nonexistent parameters.
+    self.assertEqual(None, hparams.get('unknown'))
+    self.assertEqual(123, hparams.get('unknown', 123))
+    self.assertEqual([1, 2, 3], hparams.get('unknown', [1, 2, 3]))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay.py b/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay.py
index 0ef5f111b2a467fcca76b5d80c24c525345a9ae4..ed0f398e30a7f3c0b1b9378f8fc5d5bfbea1536a 100644
--- a/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay.py
+++ b/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay.py
@@ -28,7 +28,7 @@ from tensorflow.python.ops import math_ops, control_flow_ops
 def sgdr_decay(learning_rate, global_step, initial_period_steps,
                t_mul=2.0, m_mul=1.0, name=None):
   """Implements Stochastic Gradient Descent with Warm Restarts (SGDR).
-  
+
   As described in "SGDR: Stochastic Gradient Descent
   with Warm Restarts" by Ilya Loshchilov & Frank Hutter, Proceedings of
   ICLR'2017, available at https://arxiv.org/pdf/1608.03983.pdf
@@ -48,7 +48,7 @@ def sgdr_decay(learning_rate, global_step, initial_period_steps,
   where `t_0` = `initial_period_steps` is the user-defined number of batch
   iterations (not epochs as in the paper) to be performed before the first
   restart is launched.
-  
+
   Then, we perform the first restart (i=1) by setting the learning rate to
   `learning_rate*(m_mul^i)`, where `m_mul in [0,1]` (set to 1 by default).
   The i-th restart runs for `t_i=t_0*(t_mul^i)` steps, i.e., every new
@@ -73,7 +73,7 @@ def sgdr_decay(learning_rate, global_step, initial_period_steps,
       Training dataset size: 10000
       If the user wants the first decay period to span across 5 epochs, then
       `initial_period_steps` = 5 * 10000/100 = 500
-  
+
       Train for 10000 batch iterations with the initial learning rate set to
       0.1, then restart to run 2 times longer, i.e, for 20000 batch iterations
       and with the initial learning rate 0.05, then restart again and again,
diff --git a/tensorflow/contrib/training/python/training/training.py b/tensorflow/contrib/training/python/training/training.py
index eee2b8881230125335753b54e757a5045ade0a43..f72e0a3f831f9e9c61a2e9d77828ffb12d8428b1 100644
--- a/tensorflow/contrib/training/python/training/training.py
+++ b/tensorflow/contrib/training/python/training/training.py
@@ -244,7 +244,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.framework.python.ops import variables
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -255,6 +254,7 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary
 from tensorflow.python.training import monitored_session
 from tensorflow.python.training import optimizer as tf_optimizer
+from tensorflow.python.training import training_util
 
 # TODO(nsilberman): move add_gradients_summaries, clip_gradient_norms and
 # multiply_gradients into contrib/summaries and contrib/optimizers.py
@@ -409,7 +409,7 @@ def create_train_op(total_loss,
       loss value.
   """
   if global_step is _USE_GLOBAL_STEP:
-    global_step = variables.get_or_create_global_step()
+    global_step = training_util.get_or_create_global_step()
 
   # Update ops use GraphKeys.UPDATE_OPS collection if update_ops is None.
   global_update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
diff --git a/tensorflow/contrib/verbs/BUILD b/tensorflow/contrib/verbs/BUILD
index 746ff38b37fd6ba012f1791bfa35209e84305f5c..38a84ffb10e594568a18dbd06debf32545cb2229 100644
--- a/tensorflow/contrib/verbs/BUILD
+++ b/tensorflow/contrib/verbs/BUILD
@@ -7,6 +7,8 @@ package(default_visibility = [
 
 licenses(["notice"])  # Apache 2.0
 
+load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
+
 exports_files(["LICENSE"])
 
 filegroup(
@@ -97,7 +99,7 @@ cc_library(
     alwayslink = 1,
 )
 
-cc_library(
+tf_cuda_library(
     name = "rdma_rendezvous_mgr",
     srcs = ["rdma_rendezvous_mgr.cc"],
     hdrs = ["rdma_rendezvous_mgr.h"],
@@ -130,7 +132,7 @@ cc_library(
     ],
 )
 
-cc_library(
+tf_cuda_library(
     name = "rdma",
     srcs = ["rdma.cc"],
     hdrs = ["rdma.h"],
diff --git a/tensorflow/contrib/verbs/README.md b/tensorflow/contrib/verbs/README.md
index dcb390b0a5e343157dfd04ef8b18b7f723da27e0..7c1c8ea45912be8c471efbe42f43e083639e91fc 100644
--- a/tensorflow/contrib/verbs/README.md
+++ b/tensorflow/contrib/verbs/README.md
@@ -38,7 +38,7 @@ The following improvements can be made in the future. First, conversion to Tenso
 * **RDMA channel:** Responsible for RDMA connection to a particular node. It manages multiple buffers. A channel has a callback table which stores all the callbacks for the requested tensors.
 * **RDMA buffer:** Responsible for sending or receiving data. It has a fixed size memory to store the data. It has a queue to store the pending jobs. There are three types of buffers, message buffer, ACK buffer and tensor buffer. A channel has two message buffers, two ack buffers and many tensor buffers.
 * **RDMA manager:** Manages the adapter and channels, including channel creation, channel setup via GRPC service, channel lookup, etc.
-* **RDMA rendezvous manager:** manages multiple rdma rendezvous. 
+* **RDMA rendezvous manager:** manages multiple rdma rendezvous.
 * **RDMA rendezvous:** a derived class of BaseRemoteRendezvous. This class is the back end for "send" and "recv" ops. When the sendrecv_op wants to send or receive a tensor, it calls the rendezvous' "send" and "recv" functions respectively. Rendezvous are identified by "step_id", a random number, so that tensors for different iterations don't get mixed up.
 
 ### The SEND operation
diff --git a/tensorflow/contrib/verbs/rdma.cc b/tensorflow/contrib/verbs/rdma.cc
index 331943a3ef059329a28372edbfd2f2ffc0931f58..ae9a384565a6ad0e63a6cf3acf07c591c65f0637 100644
--- a/tensorflow/contrib/verbs/rdma.cc
+++ b/tensorflow/contrib/verbs/rdma.cc
@@ -16,13 +16,16 @@ limitations under the License.
 #ifdef TENSORFLOW_USE_VERBS
 
 #include "tensorflow/contrib/verbs/rdma.h"
+#include <fcntl.h>
 #include <cstdlib>
 #include <fcntl.h>
 #include "tensorflow/contrib/verbs/verbs_util.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
+#if GOOGLE_CUDA
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
 #include "tensorflow/core/common_runtime/gpu/process_state.h"
+#endif
 #include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
 #include "tensorflow/core/framework/rendezvous.h"
@@ -31,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 
 namespace tensorflow {
 
@@ -137,7 +141,7 @@ ibv_device* set_device() {
   if (!env_p_rdma_device.empty()) {
     for (device_index = 0; device_index < dev_num; device_index++) {
       if (!env_p_rdma_device.compare(
-               ibv_get_device_name(dev_list[device_index]))) {
+              ibv_get_device_name(dev_list[device_index]))) {
         CHECK(get_dev_active_port_count(dev_list[device_index]) != 0)
             << "Device " << ibv_get_device_name(dev_list[device_index])
             << " has no active ports";
@@ -147,7 +151,7 @@ ibv_device* set_device() {
     // check validity of input device
     CHECK(false) << "The device " << env_p_rdma_device << " wasn't found";
   } else {
-  // set default device
+    // set default device
     str_port_num = get_env_var("RDMA_DEVICE_PORT");
     CHECK(str_port_num.empty())
         << "RDMA_DEVICE should be provided if RDMA_DEVICE_PORT is set by user";
@@ -177,7 +181,7 @@ ibv_device* set_device() {
 // Returns:
 //   port to use
 uint8_t set_port(ibv_context* context) {
-  uint8_t port_num = 0; //0 is illegal port number
+  uint8_t port_num = 0;  // 0 is illegal port number
   string str_port_num;
   ibv_device_attr device_att;
   ibv_port_attr port_attr;
@@ -199,9 +203,7 @@ uint8_t set_port(ibv_context* context) {
     // check if port id active
     CHECK(port_attr.state == IBV_PORT_ACTIVE)
         << "Selected RDMA_DEVICE_PORT is not active";
-  }
-  // set default port
-  else {
+  } else {  // set default port
     for (port_index = 1; port_index <= device_att.phys_port_cnt; port_index++) {
       rc = ibv_query_port(context, port_index, &port_attr);
       CHECK(!rc) << "Failed to query the port" << port_index;
@@ -269,7 +271,7 @@ bool is_gid_type_roce_v2(ibv_context* context, uint8_t port_num,
 // Function to set GID index.
 // If the port link is IB, no GID index should be selected.
 // If Ethernet but RDMA_GID_INDEX not set gid index that supports
-//   RoCE V2 will be chosen(fails if more then one IP is configured)
+//   RoCE V2 will be chosen(fails if more than one IP is configured)
 // Args:
 //   context - device context
 //   port_num - port number
@@ -302,7 +304,7 @@ uint8_t set_gid(uint8_t port_num, ibv_context* context) {
     }
   }
   switch (port_attr.link_layer) {
-    case(IBV_LINK_LAYER_ETHERNET) :
+    case (IBV_LINK_LAYER_ETHERNET):
       gid_str = get_env_var("RDMA_GID_INDEX");
       if (!gid_str.empty()) {
         gid_index = stoi(gid_str);
@@ -313,7 +315,7 @@ uint8_t set_gid(uint8_t port_num, ibv_context* context) {
             << "More than one IP is available, please specify GID_INDEX";
       }
       break;
-    case(IBV_LINK_LAYER_INFINIBAND) :  // no need in GID index
+    case (IBV_LINK_LAYER_INFINIBAND):  // no need in GID index
       break;
     default:
       LOG(INFO) << "Unknown port link layer. Currently supporting Ethernet and "
@@ -374,7 +376,8 @@ enum ibv_mtu set_mtu(uint8_t port_num, ibv_context* context) {
         break;
       default:
         CHECK(0) << "Error: MTU input value must be one of the following: 256, "
-                    "512, 1024, 2048, 4096. MTU " << mtu << " is invalid\n";
+                    "512, 1024, 2048, 4096. MTU "
+                 << mtu << " is invalid\n";
         break;
     }
     CHECK(mtu < port_attr.active_mtu)
@@ -419,9 +422,6 @@ RdmaAdapter::RdmaAdapter(const WorkerEnv* worker_env)
                       0);
   CHECK(cq_) << "Failed to create completion queue";
   CHECK(!ibv_req_notify_cq(cq_, 0)) << "Failed to request CQ notification";
-  polling_thread_.reset(Env::Default()->StartThread(
-      ThreadOptions(), "RdmaAdapterCQThread", [this] { Process_CQ(); }));
-  VLOG(2) << "Start RdmaAdapter: " << name();
 }
 
 RdmaAdapter::~RdmaAdapter() {
@@ -433,6 +433,12 @@ RdmaAdapter::~RdmaAdapter() {
   CHECK(!ibv_close_device(context_)) << "Failed to release context";
 }
 
+void RdmaAdapter::StartPolling() {
+  polling_thread_.reset(Env::Default()->StartThread(
+      ThreadOptions(), "RdmaAdapterCQThread", [this] { Process_CQ(); }));
+  VLOG(2) << "Start RdmaAdapter: " << name();
+}
+
 string RdmaAdapter::name() const { return string(context_->device->name); }
 
 // Function to process incoming messages
@@ -558,9 +564,44 @@ void RdmaAdapter::Process_CQ() {
   }
 }
 
+int RdmaChannel::PingPostRecv() {
+  struct ibv_recv_wr wr, *bad_wr;
+  memset(&wr, 0, sizeof(wr));
+  wr.sg_list = &ping_sge_list_;
+  wr.num_sge = 1;
+  wr.wr_id = kPingRecvWrid;
+
+  return ibv_post_recv(qp_, &wr, &bad_wr);
+}
+
+int RdmaChannel::PingPostSend() {
+  struct ibv_send_wr wr, *bad_wr;
+  memset(&wr, 0, sizeof(wr));
+  wr.wr_id = (uint64_t) this;
+  wr.sg_list = &ping_sge_list_;
+  wr.num_sge = 1;
+  wr.opcode = IBV_WR_SEND;
+  wr.send_flags = IBV_SEND_SIGNALED;
+
+  return ibv_post_send(qp_, &wr, &bad_wr);
+}
+
 RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name,
                          const string remote_name)
     : adapter_(adapter), local_name_(local_name), remote_name_(remote_name) {
+
+  struct ibv_sge list;
+
+  mr_ = ibv_reg_mr(adapter_->pd_, ping_buff_, kPingBuffSize,
+                   IBV_ACCESS_LOCAL_WRITE);
+  CHECK(mr_) << "Failed to register memory region";
+
+  memset(&list, 0, sizeof(list));
+  list.addr = (uintptr_t)ping_buff_;
+  list.length = kPingBuffSize;
+  list.lkey = mr_->lkey;
+
+  ping_sge_list_ = list;
   // Create queue pair
   {
     struct ibv_qp_init_attr attr;
@@ -633,15 +674,13 @@ RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name,
       buffer_index_name_table_.insert({index, buffer_names[i]});
       buffer_name_index_table_.insert({buffer_names[i], index});
     }
-
-    // Initiate recv
-    for (int i = 0; i < 100; i++) {
-      Recv();
-    }
   }
+  CHECK(PingPostRecv() == 0) << "Couldn't post receive from " << remote_name_
+                             << " with error " << std::strerror(errno);
 }
 
 RdmaChannel::~RdmaChannel() {
+  ibv_dereg_mr(mr_);
   CHECK(!ibv_destroy_qp(qp_)) << "Failed to destroy QP";
   delete tx_message_buffer_;
   delete rx_message_buffer_;
@@ -1026,6 +1065,7 @@ Rendezvous::DoneCallback RdmaTensorBuffer::getRecvTensorCallback(
     TensorProto proto;
     if (src_dev->tensorflow_gpu_device_info() &&
         (!send_args.alloc_attrs.on_host())) {
+#if GOOGLE_CUDA
       CHECK(send_args.device_context) << "send dev name: " << src_dev->name()
                                       << " gpu_info: "
                                       << src_dev->tensorflow_gpu_device_info();
@@ -1064,6 +1104,7 @@ Rendezvous::DoneCallback RdmaTensorBuffer::getRecvTensorCallback(
                                  &proto, NULL, send_args, recv_args);
             });
       }
+#endif  // GOOGLE_CUDA
     } else {
       // tensor is in CPU memory.
       StringPiece copy_buf;
diff --git a/tensorflow/contrib/verbs/rdma.h b/tensorflow/contrib/verbs/rdma.h
index 52d92a7c5bb6f21e2449e06792d8d40c9bcbf9bd..fea2327d77ffff67c4b3c45835a81f790bbd1574 100644
--- a/tensorflow/contrib/verbs/rdma.h
+++ b/tensorflow/contrib/verbs/rdma.h
@@ -107,6 +107,7 @@ class RdmaAdapter {
   ~RdmaAdapter();
   // Adapter name, e.g. mlx5_0.
   string name() const;
+  void StartPolling();
   void Process_CQ();
 
  protected:
@@ -161,6 +162,15 @@ class RdmaChannel {
   void RemoveRecvCallback(const string& key);
   void RunRecvCallback(const string& key);
   static const int kNumMessageBuffers = 4;
+  static const int kPingRecvWrid = 0;
+
+ private:
+  static const int kPingBuffSize = 1024;
+  char ping_buff_[kPingBuffSize];
+  struct ibv_mr* mr_;
+  struct ibv_sge ping_sge_list_;
+  int PingPostRecv();
+  int PingPostSend();
 
  protected:
   const RdmaAdapter* adapter_;
diff --git a/tensorflow/contrib/verbs/rdma_mgr.cc b/tensorflow/contrib/verbs/rdma_mgr.cc
index 09b878843f52c910f78f3769522d1fa80319c7d7..9cb307bcfa06cfdf5ecb9b4faa1d3710e5701080 100644
--- a/tensorflow/contrib/verbs/rdma_mgr.cc
+++ b/tensorflow/contrib/verbs/rdma_mgr.cc
@@ -115,6 +115,57 @@ void RdmaMgr::SetupChannels() {
   }
 }
 
+// Check connectivity by pinging every channel
+bool RdmaMgr::ConnectivityCheck() {
+  int i, rcnt = 0, scnt = 0;
+
+  for (const auto& p : channel_table_) {
+    string worker_name = p.first;
+    RdmaChannel* rc = p.second;
+
+    VLOG(2) << "Ping to " << worker_name;
+    CHECK(rc->PingPostSend() == 0) << "Couldn't post send  to " << worker_name
+                                   << " with error: " << std::strerror(errno);
+    for (i = 0; i < rc->adapter_->params_.queue_depth - 1; i++) {
+      rc->Recv();
+    }
+  }
+
+  while (rcnt < num_remote_workers_ || scnt < num_remote_workers_) {
+    int ne;
+    do {
+      ne = ibv_poll_cq(rdma_adapter_->cq_, 2 * num_remote_workers_,
+                       rdma_adapter_->wc_);
+      CHECK(ne >= 0) << "poll CQ failed " << ne << "with error"
+                     << std::strerror(errno);
+    } while (ne < 1);
+
+    for (i = 0; i < ne; ++i) {
+      ibv_wc_status s = rdma_adapter_->wc_[i].status;
+      // recv complete
+      if ((int)rdma_adapter_->wc_[i].wr_id == RdmaChannel::kPingRecvWrid) {
+        CHECK(s == IBV_WC_SUCCESS) << ": " << ibv_wc_status_str(
+                                                  rdma_adapter_->wc_[i].status)
+                                   << "(" << rdma_adapter_->wc_[i].status
+                                   << ") for PING_RECV_WRID";
+        ++rcnt;
+        // send complete
+      } else {
+        RdmaChannel* rc =
+            reinterpret_cast<RdmaChannel*>(rdma_adapter_->wc_[i].wr_id);
+        CHECK(s == IBV_WC_SUCCESS) << ": " << ibv_wc_status_str(
+                                                  rdma_adapter_->wc_[i].status)
+                                   << "(" << rdma_adapter_->wc_[i].status
+                                   << ") to " << rc->remote_name_;
+        ++scnt;
+      }
+    }  // for
+  }    // while
+  CHECK(rcnt == scnt) << "Connectivity check failed!";
+  rdma_adapter_->StartPolling();
+  return (num_remote_workers_ == rcnt) && (num_remote_workers_ == scnt);
+}
+
 RdmaMgr::~RdmaMgr() {
   for (const auto& p : channel_table_) delete p.second;
   channel_table_.clear();
diff --git a/tensorflow/contrib/verbs/rdma_mgr.h b/tensorflow/contrib/verbs/rdma_mgr.h
index b156f64096c113bb0ac3780b0f64fd1e6bd7cb89..e711e604788b12ff0c1a0977a90db21f9f8fa50e 100644
--- a/tensorflow/contrib/verbs/rdma_mgr.h
+++ b/tensorflow/contrib/verbs/rdma_mgr.h
@@ -28,12 +28,16 @@ limitations under the License.
 namespace tensorflow {
 
 class RdmaMgr {
+  friend class RdmaChannel;
+  friend class RdmaAdapter;
+
  public:
   explicit RdmaMgr(const WorkerEnv* const worker_env,
                    GrpcChannelCache* const channel_cache);
   ~RdmaMgr();
   RdmaChannel* FindChannel(const string& key);
   void SetupChannels();
+  bool ConnectivityCheck();
   const string& local_worker() { return local_worker_; }
 
  private:
@@ -44,7 +48,6 @@ class RdmaMgr {
   RdmaAdapter* rdma_adapter_;
   typedef std::unordered_map<string, RdmaChannel*> ChannelTable;
   ChannelTable channel_table_;
-
   TF_DISALLOW_COPY_AND_ASSIGN(RdmaMgr);
 };
 
diff --git a/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
index ce82ca288307a73295368501ad68f88b60c9623c..74f6681af3c29f370d6cdb37d64e10a30cbb7b84 100644
--- a/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
+++ b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
@@ -21,8 +21,10 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
+#if GOOGLE_CUDA
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
 #include "tensorflow/core/common_runtime/gpu/process_state.h"
+#endif  // GOOGLE_CUDA
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -58,20 +60,13 @@ void RdmaRemoteRendezvous::RecvFromRemoteAsync(
   // parse src_name and dst_name
   string src_name, dst_name, unused;
   if (!DeviceNameUtils::SplitDeviceName(parsed.src_device, &src_name,
+                                        &unused) ||
+      !DeviceNameUtils::SplitDeviceName(parsed.dst_device, &dst_name,
                                         &unused)) {
-    s = errors::Internal("Could not parse src name.");
+    s = errors::Internal("Could not parse src or dst name.");
   }
-  CHECK(s.ok()) << "s is not ok, error code " << s.error_message();
-  if (!s.ok()) {
-    done(s, Args(), recv_args, Tensor{}, false);
-    return;
-  }
-  if (!DeviceNameUtils::SplitDeviceName(parsed.dst_device, &dst_name,
-                                        &unused)) {
-    s = errors::Internal("Could not parse dst name.");
-  }
-  CHECK(s.ok()) << "s is not ok, error code " << s.error_message();
   if (!s.ok()) {
+    LOG(ERROR) << "s is not ok, error code " << s.error_message();
     done(s, Args(), recv_args, Tensor{}, false);
     return;
   }
@@ -82,18 +77,13 @@ void RdmaRemoteRendezvous::RecvFromRemoteAsync(
   // insert callback
   rc->InsertRecvCallback(key_with_step_id, [this, key, key_with_step_id, rc,
                                             recv_args, parsed, done]() {
-    Status s;
-    Device* src_dev;
-    s = env_->device_mgr->LookupDevice("CPU:0", &src_dev);
-    CHECK(s.ok()) << "s is not ok, error code " << s.error_message();
-    if (!s.ok()) {
-      done(s, Args(), recv_args, Tensor(), true);
-      return;
-    }
-    Device* dst_dev;
-    s = env_->device_mgr->LookupDevice(parsed.dst_device, &dst_dev);
-    CHECK(s.ok()) << "s is not ok, error code " << s.error_message();
-    if (!s.ok()) {
+    Status src_s, dst_s, s;
+    Device* src_dev, *dst_dev;
+    src_s = env_->device_mgr->LookupDevice("CPU:0", &src_dev);
+    dst_s = env_->device_mgr->LookupDevice(parsed.dst_device, &dst_dev);
+    if (!src_s.ok() || !dst_s.ok()) {
+      s = src_s.ok() ? dst_s : src_s;
+      LOG(ERROR) << "s is not ok, error code " << s.error_message();
       done(s, Args(), recv_args, Tensor(), true);
       return;
     }
@@ -110,9 +100,10 @@ void RdmaRemoteRendezvous::RecvFromRemoteAsync(
       if (can_memcpy) {
         if (dst_dev->tensorflow_gpu_device_info() &&
             (!recv_args.alloc_attrs.on_host())) {
+#if GOOGLE_CUDA
           CHECK(recv_args.device_context)
-            << "send dev name: " << src_dev->name()
-            << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
+              << "send dev name: " << src_dev->name()
+              << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
           Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
           Tensor copy(alloc, rm.data_type_, rm.tensor_shape_);
           memcpy(DMAHelper::base(&copy), input, rm.tensor_bytes_);
@@ -122,14 +113,15 @@ void RdmaRemoteRendezvous::RecvFromRemoteAsync(
 
           GPUUtil::CopyCPUTensorToGPU(
               &copy, recv_args.device_context, dst_dev, &gpu_copy,
-              [this, gpu_copy, key, key_with_step_id, recv_args, done, rm,
-               rc](const Status& s) {
+              [this, gpu_copy, key, key_with_step_id, recv_args, done, rm, rc](
+                  const Status& s) {
                 CHECK(s.ok()) << "copy tensor to gpu sync";
                 Tensor val;
                 val = std::move(gpu_copy);
                 RecvPostCopyOps(key, key_with_step_id, recv_args, done, rm, rc,
                                 val, s);
               });
+#endif  // GOOGLE_CUDA
           return;
         } else {
           AllocatorAttributes host_alloc_attrs;
diff --git a/tensorflow/contrib/verbs/verbs_server_lib.cc b/tensorflow/contrib/verbs/verbs_server_lib.cc
index 6d1c79c0fb2f75a9cae835d78fbbe0b40774482b..a606ef75a42069b3c32eb13a69e981a5c4c8f83c 100644
--- a/tensorflow/contrib/verbs/verbs_server_lib.cc
+++ b/tensorflow/contrib/verbs/verbs_server_lib.cc
@@ -49,8 +49,8 @@ VerbsServer::~VerbsServer() {
 Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
                                         GrpcChannelCache** channel_cache) {
   string name_prefix =
-      strings::StrCat("/job:", server_def.job_name(), "/replica:0",
-                      "/task:", server_def.task_index());
+      strings::StrCat("/job:", server_def.job_name(), "/replica:0", "/task:",
+                      server_def.task_index());
 
   GrpcChannelSpec channel_spec;
   TF_RETURN_IF_ERROR(ParseChannelSpec(server_def, &channel_spec));
@@ -103,6 +103,7 @@ Status VerbsServer::Start() {
           ThreadOptions(), "TF_verbs_service",
           [this] { verbs_service_->HandleRPCsLoop(); }));
       rdma_mgr_->SetupChannels();
+      CHECK(rdma_mgr_->ConnectivityCheck()) << "Connectivity check failed!";
       verbs_state_ = CONNECTED;
     }
   }
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 206ccc1c72f5539a595fb586653f2845667f83c3..ae38025942b711c4ff9f635066f4429d965a1984 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -127,9 +127,9 @@ load(
     "tf_additional_verbs_lib_defines",
     "tf_additional_mpi_lib_defines",
     "tf_additional_gdr_lib_defines",
-    "tf_additional_gpu_tracer_srcs",
-    "tf_additional_gpu_tracer_deps",
-    "tf_additional_gpu_tracer_cuda_deps",
+    "tf_additional_device_tracer_srcs",
+    "tf_additional_device_tracer_deps",
+    "tf_additional_device_tracer_cuda_deps",
     "tf_pyclif_proto_library",
     "tf_jspb_proto_library",
     "tf_nano_proto_library",
@@ -575,6 +575,7 @@ cc_library(
 
 # Generates library per group of ops.
 tf_gen_op_libs(
+    is_external = False,
     op_lib_names = [
         "bitwise_ops",
         "candidate_sampling_ops",
@@ -752,6 +753,7 @@ tf_cuda_library(
     name = "core_cpu",
     hdrs = [
         "common_runtime/device.h",
+        "common_runtime/device_factory.h",
         "common_runtime/optimization_registry.h",
         "common_runtime/shape_refiner.h",
         "graph/algorithm.h",
@@ -1013,7 +1015,7 @@ filegroup(
 cc_library(
     name = "android_tensorflow_lib_lite",
     srcs = if_android(["//tensorflow/core:android_srcs"]),
-    copts = tf_copts() + if_not_android_mips_and_mips64(["-Os"]),
+    copts = tf_copts(android_optimization_level_override = None),
     linkopts = ["-lz"],
     tags = [
         "manual",
@@ -1101,8 +1103,7 @@ cc_library(
 cc_library(
     name = "android_tensorflow_lib_selective_registration",
     srcs = if_android(["//tensorflow/core:android_srcs"]),
-    copts = tf_copts() + [
-        "-Os",
+    copts = tf_copts(android_optimization_level_override = None) + [
         "-DSUPPORT_SELECTIVE_REGISTRATION",
     ],
     tags = [
@@ -1123,8 +1124,7 @@ cc_library(
 cc_library(
     name = "android_tensorflow_lib_selective_registration_nortti",
     srcs = if_android(["//tensorflow/core:android_srcs"]),
-    copts = tf_copts() + tf_opts_nortti_if_android() + [
-        "-Os",
+    copts = tf_copts(android_optimization_level_override = None) + tf_opts_nortti_if_android() + [
         "-DSUPPORT_SELECTIVE_REGISTRATION",
     ],
     tags = [
@@ -1203,7 +1203,7 @@ cc_library(
         "framework/tensor_testutil.h",
         "util/reporter.h",
     ],
-    copts = tf_copts() + ["-Os"],
+    copts = tf_copts(android_optimization_level_override = None),
     tags = [
         "manual",
         "notap",
@@ -1467,7 +1467,7 @@ cc_library(
             "lib/jpeg/**/*",
             "platform/**/env_time.cc",
             "platform/**/cuda_libdevice_path.cc",
-            "platform/**/gpu_tracer.cc",
+            "platform/**/device_tracer.cc",
             "platform/variant_coding.cc",
             "platform/**/variant_cord_coding.cc",
         ],
@@ -1478,7 +1478,7 @@ cc_library(
             "platform/**/cuda_libdevice_path.cc",
             "platform/**/stream_executor.h",
             "platform/**/env_time.cc",
-            "platform/**/gpu_tracer.cc",
+            "platform/**/device_tracer.cc",
             "platform/variant_coding.cc",
             "platform/**/variant_cord_coding.cc",
         ] +
@@ -1854,11 +1854,13 @@ cc_library(
     deps = ["//tensorflow/core/platform/default/build_config:protos_cc"],
 )
 
-CORE_CPU_BASE_HDRS = [
-    "common_runtime/device.h",
-    "common_runtime/graph_runner.h",
-    "common_runtime/shape_refiner.h",
-    "framework/versions.h",
+# Library containing all of the graph construction code that is
+# independent of the runtime.
+#
+# TODO(mrry): Refactor graph_constructor.cc so that it does not depend on code
+# in "common_runtime/", and then the entire "graph/" directory can be included
+# in this library.
+GRAPH_HDRS = [
     "graph/algorithm.h",
     "graph/colors.h",
     "graph/control_flow.h",
@@ -1866,7 +1868,7 @@ CORE_CPU_BASE_HDRS = [
     "graph/default_device.h",
     "graph/edgeset.h",
     "graph/graph.h",
-    "graph/graph_constructor.h",
+    "graph/graph_constructor.h",  # NOTE(mrry): Don't include the .cc since it depends on common_runtime.
     "graph/graph_def_builder.h",
     "graph/graph_partition.h",
     "graph/mkl_layout_pass.h",
@@ -1882,16 +1884,12 @@ CORE_CPU_BASE_HDRS = [
 ]
 
 tf_cuda_library(
-    name = "core_cpu_base",
+    name = "graph",
     srcs = [
-        "common_runtime/shape_refiner.cc",
-        "common_runtime/shape_refiner.h",
-        "framework/versions.h",
         "graph/algorithm.cc",
         "graph/colors.cc",
         "graph/control_flow.cc",
         "graph/costmodel.cc",
-        "graph/graph_constructor.cc",
         "graph/graph_def_builder.cc",
         "graph/graph_partition.cc",
         "graph/node_builder.cc",
@@ -1899,6 +1897,33 @@ tf_cuda_library(
         "graph/subgraph.cc",
         "graph/tensor_id.cc",
         "graph/validate.cc",
+    ],
+    hdrs = GRAPH_HDRS,
+    deps = [
+        ":framework",
+        ":framework_internal",
+        ":lib",
+        ":lib_internal",
+        ":proto_text",
+        ":protos_all_cc",
+        "//third_party/eigen3",
+    ],
+)
+
+CORE_CPU_BASE_HDRS = GRAPH_HDRS + [
+    "common_runtime/device.h",
+    "common_runtime/graph_runner.h",
+    "common_runtime/shape_refiner.h",
+    "framework/versions.h",
+]
+
+tf_cuda_library(
+    name = "core_cpu_base",
+    srcs = [
+        "common_runtime/shape_refiner.cc",
+        "common_runtime/shape_refiner.h",
+        "framework/versions.h",
+        "graph/graph_constructor.cc",  # Depends on common_runtime.
         "public/session.h",
         "public/session_options.h",
         "public/version.h",
@@ -1906,6 +1931,7 @@ tf_cuda_library(
     hdrs = CORE_CPU_BASE_HDRS,
     copts = tf_copts(),
     deps = [
+        ":graph",
         ":framework",
         ":framework_internal",
         ":lib",
@@ -2009,6 +2035,7 @@ tf_cuda_library(
     hdrs = CORE_CPU_LIB_HEADERS,
     copts = tf_copts(),
     deps = [
+        ":graph",
         ":framework",
         ":framework_internal",
         ":lib",
@@ -2050,6 +2077,7 @@ tf_cuda_library(
         ":function_ops_op_lib",
         ":functional_grad",
         ":functional_ops_op_lib",
+        ":graph",
         ":lib",
         ":lib_internal",
         ":proto_text",
@@ -2091,13 +2119,11 @@ tf_cuda_library(
         "util/env_var.h",
     ],
     copts = tf_copts(),
-    cuda_deps = [
-        ":gpu_tracer",
-    ],
-    linkstatic = 1,
     deps = [
         ":core_cpu_internal",
+        ":device_tracer",
         ":framework",
+        ":graph",
         ":lib",
         ":lib_internal",
         ":proto_text",
@@ -2128,18 +2154,18 @@ cc_library(
 )
 
 tf_cuda_library(
-    name = "gpu_tracer",
-    srcs = tf_additional_gpu_tracer_srcs(),
+    name = "device_tracer",
+    srcs = tf_additional_device_tracer_srcs(),
     hdrs = [
-        "platform/gpu_tracer.h",
+        "platform/device_tracer.h",
     ],
     copts = tf_copts(),
-    cuda_deps = tf_additional_cupti_wrapper_deps() + tf_additional_gpu_tracer_cuda_deps(),
+    cuda_deps = tf_additional_cupti_wrapper_deps() + tf_additional_device_tracer_cuda_deps(),
     deps = [
         ":core_cpu_internal",
         ":lib",
         ":protos_all_cc",
-    ] + tf_additional_gpu_tracer_deps(),
+    ] + tf_additional_device_tracer_deps(),
 )
 
 GPU_RUNTIME_HEADERS = [
@@ -2147,12 +2173,15 @@ GPU_RUNTIME_HEADERS = [
     "common_runtime/gpu/gpu_cudamalloc_allocator.h",
     "common_runtime/gpu/gpu_debug_allocator.h",
     "common_runtime/gpu/gpu_device.h",
+    "common_runtime/gpu/gpu_id.h",
+    "common_runtime/gpu/gpu_id_utils.h",
     "common_runtime/gpu/gpu_init.h",
     "common_runtime/gpu/gpu_managed_allocator.h",
     "common_runtime/gpu/gpu_stream_util.h",
     "common_runtime/gpu/gpu_util.h",
     "common_runtime/gpu/pool_allocator.h",
     "common_runtime/gpu/process_state.h",
+    "common_runtime/gpu_device_context.h",
 ]
 
 tf_cuda_library(
@@ -2163,13 +2192,13 @@ tf_cuda_library(
         "common_runtime/gpu/gpu_debug_allocator.cc",
         "common_runtime/gpu/gpu_device.cc",
         "common_runtime/gpu/gpu_device_factory.cc",
+        "common_runtime/gpu/gpu_id_utils.cc",
         "common_runtime/gpu/gpu_managed_allocator.cc",
         "common_runtime/gpu/gpu_stream_util.cc",
         "common_runtime/gpu/gpu_util.cc",
         "common_runtime/gpu/gpu_util_platform_specific.cc",
         "common_runtime/gpu/pool_allocator.cc",
         "common_runtime/gpu/process_state.cc",
-        "common_runtime/gpu_device_context.h",
     ],
     hdrs = GPU_RUNTIME_HEADERS,
     copts = tf_copts(),
@@ -2180,6 +2209,7 @@ tf_cuda_library(
         ":framework_internal",
         ":gpu_init_impl",
         ":gpu_lib",
+        ":graph",
         ":lib",
         ":lib_internal",
         ":protos_all_cc",
@@ -2757,11 +2787,25 @@ tf_cc_test_mkl(
     ]),
 )
 
+tf_cc_tests_gpu(
+    name = "gpu_device_on_non_gpu_machine_test",
+    size = "small",
+    srcs = ["common_runtime/gpu/gpu_device_on_non_gpu_machine_test.cc"],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    deps = [
+        ":gpu_headers_lib",
+        ":gpu_runtime",
+        ":test",
+    ],
+)
+
 tf_cc_tests_gpu(
     name = "gpu_related_tests",
     size = "small",
     srcs = glob(["user_ops/**/*_test.cc"]) + [
         "common_runtime/gpu/gpu_bfc_allocator_test.cc",
+        "common_runtime/gpu/gpu_device_test.cc",
+        "common_runtime/gpu/gpu_id_utils_test.cc",
         "common_runtime/gpu/gpu_event_mgr_test.cc",
         "common_runtime/gpu/pool_allocator_test.cc",
     ],
@@ -3126,6 +3170,7 @@ tf_cc_test(
         "//tensorflow/core/kernels:cwise_op",
         "//tensorflow/core/kernels:function_ops",
         "//tensorflow/core/kernels:matmul_op",
+        "//tensorflow/core/kernels:random_ops",
         "//tensorflow/core/kernels:shape_ops",
         "//third_party/eigen3",
     ],
@@ -3378,12 +3423,12 @@ tf_cc_test(
 
 filegroup(
     name = "base_api_def",
-    data = glob(["api_def/base_api/*"]),
+    srcs = glob(["api_def/base_api/*"]),
 )
 
 filegroup(
     name = "python_api_def",
-    data = glob(["api_def/python_api/*"]),
+    srcs = glob(["api_def/python_api/*"]),
 )
 
 tf_cc_test(
@@ -3393,10 +3438,6 @@ tf_cc_test(
         ":base_api_def",
         "//tensorflow/cc:ops/op_gen_overrides.pbtxt",
     ],
-    tags = [
-        "manual",
-        "notap",
-    ],
     deps = [
         ":framework",
         ":framework_internal",
@@ -3412,9 +3453,9 @@ tf_cc_test(
 )
 
 tf_cc_test_gpu(
-    name = "gpu_tracer_test",
+    name = "device_tracer_test",
     size = "small",
-    srcs = ["platform/gpu_tracer_test.cc"],
+    srcs = ["platform/device_tracer_test.cc"],
     args = ["--heap_check=local"],
     linkstatic = tf_kernel_tests_linkstatic(),
     tags = tf_cuda_tests_tags() + ["nomac"],
@@ -3422,12 +3463,12 @@ tf_cc_test_gpu(
         ":all_kernels",
         ":core_cpu",
         ":core_cpu_internal",
+        ":device_tracer",
         ":direct_session",
         ":direct_session_internal",
         ":framework",
         ":framework_internal",
         ":gpu_runtime",
-        ":gpu_tracer",
         ":lib",
         ":lib_internal",
         ":protos_all_cc",
diff --git a/tensorflow/core/api_def/api_test.cc b/tensorflow/core/api_def/api_test.cc
index f222d345abec2254434e1e221eefb0ca7f40ccbe..2cdc14843f61a2585b61e214527e0a0b5bdea446 100644
--- a/tensorflow/core/api_def/api_test.cc
+++ b/tensorflow/core/api_def/api_test.cc
@@ -221,9 +221,18 @@ std::unordered_map<string, ApiDefs> GenerateApiDef(
 
   std::unordered_map<string, ApiDefs> api_defs_map;
 
+  // These ops are included in OpList only if TF_NEED_GCP
+  // is set to true. So, we skip them for now so that this test passes
+  // whether TF_NEED_GCP is set or not.
+  const std::unordered_set<string> ops_to_exclude = {
+      "BigQueryReader", "GenerateBigQueryReaderPartitions"};
   for (const auto& op : ops.op()) {
     CHECK(!op.name().empty())
         << "Encountered empty op name: %s" << op.DebugString();
+    if (ops_to_exclude.find(op.name()) != ops_to_exclude.end()) {
+      LOG(INFO) << "Skipping " << op.name();
+      continue;
+    }
     string file_path = io::JoinPath(api_def_dir, kApiDefFileFormat);
     file_path = strings::Printf(file_path.c_str(), op.name().c_str());
     ApiDef* api_def = api_defs_map[file_path].add_op();
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAddSign.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAddSign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dd4609525262d1e03af7d945cdacac7ea32f0546
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyAddSign.pbtxt
@@ -0,0 +1,65 @@
+op {
+  graph_op_name: "ApplyAddSign"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "m"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "alpha"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "sign_decay"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and m tensors is
+protected by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the AddSign update."
+  description: <<END
+m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+update <- (alpha + sign_decay * sign(g) *sign(m)) * g
+variable <- variable - lr_t * update
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyPowerSign.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyPowerSign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cfa5619b87944a80c2915c196e4ae10a4cccb25f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyPowerSign.pbtxt
@@ -0,0 +1,65 @@
+op {
+  graph_op_name: "ApplyPowerSign"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "m"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "logbase"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "sign_decay"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and m tensors is
+protected by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the AddSign update."
+  description: <<END
+m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
+variable <- variable - lr_t * update
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BytesProducedStatsDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_BytesProducedStatsDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..73df11b2f75f82fad174fb7e77eccbef35c2c7d1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BytesProducedStatsDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BytesProducedStatsDataset"
+  summary: "Records the bytes size of each element of `input_dataset` in a StatsAggregator."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv2D.pbtxt
index 6522ce976f2b507c4c66d4d3709427b5fa8222e9..070d6adb978e4a62e7209f299dba08515aa21e83 100644
--- a/tensorflow/core/api_def/base_api/api_def_Conv2D.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Conv2D.pbtxt
@@ -26,7 +26,7 @@ END
     description: <<END
 1-D tensor of length 4.  The stride of the sliding window for each
 dimension of `input`. The dimension order is determined by the value of
-  `data_format`, see below for details.
+`data_format`, see below for details.
 END
   }
   attr {
@@ -43,6 +43,16 @@ default format "NHWC", the data is stored in the order of:
     [batch, height, width, channels].
 Alternatively, the format could be "NCHW", the data storage order of:
     [batch, channels, height, width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 4.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each
+filter element on that dimension. The dimension order is determined by the
+value of `data_format`, see above for details. Dilations in the batch and
+depth dimensions must be 1.
 END
   }
   summary: "Computes a 2-D convolution given 4-D `input` and `filter` tensors."
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropFilter.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropFilter.pbtxt
index 4ea3374dbbc8c690143a3a7a5fb9e67aca5bf1b0..ff2d9d71db646a27a88763f79bb6beb6b5ede44b 100644
--- a/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropFilter.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropFilter.pbtxt
@@ -51,6 +51,16 @@ default format "NHWC", the data is stored in the order of:
     [batch, in_height, in_width, in_channels].
 Alternatively, the format could be "NCHW", the data storage order of:
     [batch, in_channels, in_height, in_width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 4.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each filter
+element on that dimension. The dimension order is determined by the value of
+`data_format`, see above for details. Dilations in the batch and depth
+dimensions must be 1.
 END
   }
   summary: "Computes the gradients of convolution with respect to the filter."
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropInput.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropInput.pbtxt
index 4420073e384c1c24d3109b8c6c4cadb59e9ed9d0..2de38b4263a380b5d0aec45270b9b67347c7021d 100644
--- a/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropInput.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropInput.pbtxt
@@ -50,6 +50,16 @@ default format "NHWC", the data is stored in the order of:
     [batch, in_height, in_width, in_channels].
 Alternatively, the format could be "NCHW", the data storage order of:
     [batch, in_channels, in_height, in_width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 4.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each filter
+element on that dimension. The dimension order is determined by the value of
+`data_format`, see above for details. Dilations in the batch and depth
+dimensions must be 1.
 END
   }
   summary: "Computes the gradients of convolution with respect to the input."
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv3D.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv3D.pbtxt
index 8f3cd4493c7af152c7a4eab78d1f96e02e325bbc..d26564097e976013fbb7f026c6a403cf6bd808e0 100644
--- a/tensorflow/core/api_def/base_api/api_def_Conv3D.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Conv3D.pbtxt
@@ -34,6 +34,16 @@ default format "NDHWC", the data is stored in the order of:
     [batch, in_depth, in_height, in_width, in_channels].
 Alternatively, the format could be "NCDHW", the data storage order is:
     [batch, in_channels, in_depth, in_height, in_width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 5.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each
+filter element on that dimension. The dimension order is determined by the
+value of `data_format`, see above for details. Dilations in the batch and
+depth dimensions must be 1.
 END
   }
   summary: "Computes a 3-D convolution given 5-D `input` and `filter` tensors."
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropFilterV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropFilterV2.pbtxt
index 6f9b917237b5748ac91c0a3bfbe35a21954dfd9d..937c9c8eadaaeceaadc180ad44f35a12ba9a2dfb 100644
--- a/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropFilterV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropFilterV2.pbtxt
@@ -43,6 +43,16 @@ default format "NDHWC", the data is stored in the order of:
     [batch, in_depth, in_height, in_width, in_channels].
 Alternatively, the format could be "NCDHW", the data storage order is:
     [batch, in_channels, in_depth, in_height, in_width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 5.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each
+filter element on that dimension. The dimension order is determined by the
+value of `data_format`, see above for details. Dilations in the batch and
+depth dimensions must be 1.
 END
   }
   summary: "Computes the gradients of 3-D convolution with respect to the filter."
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropInputV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropInputV2.pbtxt
index 19aba156d5907eb79d1438c16f866dfbd99ed548..414e418dc5a91e55f22dc5eec93d16fabad3d8fb 100644
--- a/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropInputV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropInputV2.pbtxt
@@ -43,6 +43,16 @@ default format "NDHWC", the data is stored in the order of:
     [batch, in_depth, in_height, in_width, in_channels].
 Alternatively, the format could be "NCDHW", the data storage order is:
     [batch, in_channels, in_depth, in_height, in_width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 5.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each
+filter element on that dimension. The dimension order is determined by the
+value of `data_format`, see above for details. Dilations in the batch and
+depth dimensions must be 1.
 END
   }
   summary: "Computes the gradients of 3-D convolution with respect to the input."
diff --git a/tensorflow/core/api_def/base_api/api_def_DataFormatDimMap.pbtxt b/tensorflow/core/api_def/base_api/api_def_DataFormatDimMap.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..62098acd38239f0ee29198796415cd33a627a5a5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DataFormatDimMap.pbtxt
@@ -0,0 +1,31 @@
+op {
+  graph_op_name: "DataFormatDimMap"
+  in_arg {
+    name: "x"
+    description: <<END
+Scalar. Dimension index in source data format. Must be in the range [-4, 4).
+END
+  }
+  out_arg {
+    name: "y"
+    description: <<END
+Scalar. Dimension index in destination data format.
+END
+  }
+  attr {
+    name: "src_format"
+    description: <<END
+source data format.
+END
+  }
+  attr {
+    name: "dst_format"
+    description: <<END
+destination data format.
+END
+  }
+  summary: "Returns the dimension index in the destination data format given the one in"
+  description: <<END
+the source data format.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DataFormatVecPermute.pbtxt b/tensorflow/core/api_def/base_api/api_def_DataFormatVecPermute.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d87c088899e26bdd8a86f41c07681fa5aa49a07a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DataFormatVecPermute.pbtxt
@@ -0,0 +1,31 @@
+op {
+  graph_op_name: "DataFormatVecPermute"
+  in_arg {
+    name: "x"
+    description: <<END
+Vector of size 4 or Tensor of shape (4, 2) in source data format.
+END
+  }
+  out_arg {
+    name: "y"
+    description: <<END
+Vector of size 4 or Tensor of shape (4, 2) in destination data format.
+END
+  }
+  attr {
+    name: "src_format"
+    description: <<END
+source data format.
+END
+  }
+  attr {
+    name: "dst_format"
+    description: <<END
+destination data format.
+END
+  }
+  summary: "Returns the permuted vector/tensor in the destination data format given the"
+  description: <<END
+one in the source data format.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DecodeCompressed.pbtxt b/tensorflow/core/api_def/base_api/api_def_DecodeCompressed.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9babd822938dce8609a91816bcfb3988dd6a06d4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DecodeCompressed.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "DecodeCompressed"
+  in_arg {
+    name: "bytes"
+    description: <<END
+A Tensor of string which is compressed.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A Tensor with the same shape as input `bytes`, uncompressed
+from bytes.
+END
+  }
+  attr {
+    name: "compression_type"
+    description: <<END
+A scalar containing either (i) the empty string (no
+compression), (ii) "ZLIB", or (iii) "GZIP".
+END
+  }
+  summary: "Decompress strings."
+  description: <<END
+This op decompresses each element of the `bytes` input `Tensor`, which
+is assumed to be compressed using the given `compression_type`.
+
+The `output` is a string `Tensor` of the same shape as `bytes`,
+each element containing the decompressed data from the corresponding
+element in `bytes`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DepthToSpace.pbtxt b/tensorflow/core/api_def/base_api/api_def_DepthToSpace.pbtxt
index e7a18cd6b474d34bcc839f51fd13218c76c61294..d20b47a3ed50f9a8bb65f0cd6c332d03172e6bd0 100644
--- a/tensorflow/core/api_def/base_api/api_def_DepthToSpace.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_DepthToSpace.pbtxt
@@ -28,7 +28,7 @@ with the following options:
   "NHWC": `[ batch, height, width, channels ]`
   "NCHW": `[ batch, channels, height, width ]`
   "NCHW_VECT_C":
-      `qint8 [ batch, channels / 4, height, width, channels % 4 ]`
+      `qint8 [ batch, channels / 4, height, width, 4 ]`
 
 It is useful to consider the operation as transforming a 6-D Tensor.
 e.g. for data_format = NHWC,
diff --git a/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNative.pbtxt b/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNative.pbtxt
index cc10ebe923870426bc9076ca6c96f0497bce1d51..3c313f7be6b38317ab7721a0d494fec42bdb52f4 100644
--- a/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNative.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNative.pbtxt
@@ -21,6 +21,16 @@ default format "NHWC", the data is stored in the order of:
     [batch, height, width, channels].
 Alternatively, the format could be "NCHW", the data storage order of:
     [batch, channels, height, width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 4.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each filter
+element on that dimension. The dimension order is determined by the value of
+`data_format`, see above for details. Dilations in the batch and depth
+dimensions must be 1.
 END
   }
   summary: "Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors."
diff --git a/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt b/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt
index 9126be2afa9bafb3372cfe38fe43f73239e86c72..e66aa3b70707c2216ff5195b9d2dda407c50ec74 100644
--- a/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt
@@ -54,6 +54,16 @@ default format "NHWC", the data is stored in the order of:
     [batch, height, width, channels].
 Alternatively, the format could be "NCHW", the data storage order of:
     [batch, channels, height, width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 4.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each filter
+element on that dimension. The dimension order is determined by the value of
+`data_format`, see above for details. Dilations in the batch and depth
+dimensions must be 1.
 END
   }
   summary: "Computes the gradients of depthwise convolution with respect to the filter."
diff --git a/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt b/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt
index f1d16858dbf17e2974f6f1487857b63a40c99b91..f501ad21b35b6ad8d3ee16650919b1ff897cdccb 100644
--- a/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt
@@ -54,6 +54,16 @@ default format "NHWC", the data is stored in the order of:
     [batch, height, width, channels].
 Alternatively, the format could be "NCHW", the data storage order of:
     [batch, channels, height, width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 4.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each filter
+element on that dimension. The dimension order is determined by the value of
+`data_format`, see above for details. Dilations in the batch and depth
+dimensions must be 1.
 END
   }
   summary: "Computes the gradients of depthwise convolution with respect to the input."
diff --git a/tensorflow/core/api_def/base_api/api_def_DeserializeSparse.pbtxt b/tensorflow/core/api_def/base_api/api_def_DeserializeSparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dfaa531cbcc8adf46e5c6c57164fa7f674cda18d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DeserializeSparse.pbtxt
@@ -0,0 +1,60 @@
+op {
+  graph_op_name: "DeserializeSparse"
+  in_arg {
+    name: "serialized_sparse"
+    description: <<END
+The serialized `SparseTensor` objects. The last dimension
+must have 3 columns.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The `dtype` of the serialized `SparseTensor` objects.
+END
+  }
+  summary: "Deserialize `SparseTensor` objects."
+  description: <<END
+The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
+the last dimension stores serialized `SparseTensor` objects and the other N
+dimensions (N >= 0) correspond to a batch. The ranks of the original
+`SparseTensor` objects must all match. When the final `SparseTensor` is
+created, its rank is the rank of the incoming `SparseTensor` objects plus N;
+the sparse tensors have been concatenated along new dimensions, one for each
+batch.
+
+The output `SparseTensor` object's shape values for the original dimensions
+are the max across the input `SparseTensor` objects' shape values for the
+corresponding dimensions. The new dimensions match the size of the batch.
+
+The input `SparseTensor` objects' indices are assumed ordered in
+standard lexicographic order.  If this is not the case, after this
+step run `SparseReorder` to restore index ordering.
+
+For example, if the serialized input is a `[2 x 3]` matrix representing two
+original `SparseTensor` objects:
+
+    index = [ 0]
+            [10]
+            [20]
+    values = [1, 2, 3]
+    shape = [50]
+
+and
+
+    index = [ 2]
+            [10]
+    values = [4, 5]
+    shape = [30]
+
+then the final deserialized `SparseTensor` will be:
+
+    index = [0  0]
+            [0 10]
+            [0 20]
+            [1  2]
+            [1 10]
+    values = [1, 2, 3, 4, 5]
+    shape = [2 50]
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_EagerPyFunc.pbtxt b/tensorflow/core/api_def/base_api/api_def_EagerPyFunc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9231368e1654d6bb710a128e076e93005f31116d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EagerPyFunc.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "EagerPyFunc"
+  summary: "Eagerly executes a python function to compute func(input)->output. The"
+  description: <<END
+semantics of the input, output, and attributes are the same as those for
+PyFunc.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_GenerateVocabRemapping.pbtxt b/tensorflow/core/api_def/base_api/api_def_GenerateVocabRemapping.pbtxt
index 085acf7ff18c3b638641a7fcf4dc421b98f7343d..662e4c54b6c29124dd39ae6e14f1af20c48a0b41 100644
--- a/tensorflow/core/api_def/base_api/api_def_GenerateVocabRemapping.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_GenerateVocabRemapping.pbtxt
@@ -36,6 +36,13 @@ END
     name: "num_new_vocab"
     description: <<END
 Number of entries in the new vocab file to remap.
+END
+  }
+  attr {
+    name: "old_vocab_size"
+    description: <<END
+Number of entries in the old vocab file to consider.  If -1,
+use the entire old vocabulary.
 END
   }
   summary: "Given a path to new and old vocabulary files, returns a remapping Tensor of"
@@ -43,7 +50,11 @@ END
 length `num_new_vocab`, where `remapping[i]` contains the row number in the old
 vocabulary that corresponds to row `i` in the new vocabulary (starting at line
 `new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`
-in the new vocabulary is not in the old vocabulary.  `num_vocab_offset` enables
+in the new vocabulary is not in the old vocabulary.  The old vocabulary is
+constrained to the first `old_vocab_size` entries if `old_vocab_size` is not the
+default value of -1.
+
+`num_vocab_offset` enables
 use in the partitioned variable case, and should generally be set through
 examining partitioning info.  The format of the files should be a text file,
 with each line containing a single entity within the vocabulary.
diff --git a/tensorflow/core/api_def/base_api/api_def_GuaranteeConst.pbtxt b/tensorflow/core/api_def/base_api/api_def_GuaranteeConst.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b2a2e1aaef84f8c978f8c9312cc52b9bdcd35ca8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_GuaranteeConst.pbtxt
@@ -0,0 +1,12 @@
+op {
+  graph_op_name: "GuaranteeConst"
+  summary: "Gives a guarantee to the TF runtime that the input tensor is a constant."
+  description: <<END
+The runtime is then free to make optimizations based on this.
+
+Only accepts value typed tensors as inputs and rejects resource variable handles
+as input.
+
+Returns the input tensor without modification.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_IteratorSetStatsAggregator.pbtxt b/tensorflow/core/api_def/base_api/api_def_IteratorSetStatsAggregator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c6f2212cd4fa5fe81ecc97c33ebe17d18ac7c616
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_IteratorSetStatsAggregator.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "IteratorSetStatsAggregator"
+  summary: "Associates the given iterator with the given statistics aggregator."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LatencyStatsDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_LatencyStatsDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..78d946b0b47044855ff145e9492fdb3721ff0044
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LatencyStatsDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LatencyStatsDataset"
+  summary: "Records the latency of producing `input_dataset` elements in a StatsAggregator."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MatrixExponential.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatrixExponential.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0d680f653121677e97d88655979521c67d566882
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MatrixExponential.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "MatrixExponential"
+  in_arg {
+    name: "input"
+    description: <<END
+Shape is `[..., M, M]`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Shape is `[..., M, M]`.
+
+@compatibility(scipy)
+Equivalent to scipy.linalg.expm
+@end_compatibility
+END
+  }
+  summary: "Computes the matrix exponential of one or more square matrices:"
+  description: <<END
+exp(A) = \sum_{n=0}^\infty A^n/n!
+
+The exponential is computed using a combination of the scaling and squaring
+method and the Pade approximation. Details can be founds in:
+Nicholas J. Higham, "The scaling and squaring method for the matrix exponential
+revisited," SIAM J. Matrix Anal. Applic., 26:1179-1193, 2005.
+
+The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+form square matrices. The output is a tensor of the same shape as the input
+containing the exponential for all input submatrices `[..., :, :]`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_NthElement.pbtxt b/tensorflow/core/api_def/base_api/api_def_NthElement.pbtxt
index 9ef20a26db80ce3199041142938e953ac827d750..2f5d8496190c87ef7d037f6e6ab5a6c44af100f4 100644
--- a/tensorflow/core/api_def/base_api/api_def_NthElement.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_NthElement.pbtxt
@@ -26,7 +26,7 @@ When set to True, find the nth-largest value in the vector and vice
 versa.
 END
   }
-  summary: "Finds values of the `n`-th order statistic for the last dmension."
+  summary: "Finds values of the `n`-th order statistic for the last dimension."
   description: <<END
 If the input is a vector (rank-1), finds the entries which is the nth-smallest
 value in the vector and outputs their values as scalar tensor.
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2D.pbtxt
index b19bbeab12db322064dcbf31779ce01adffadeb9..d18bafdce9b3aaccfae6eff0c489e133b492f26d 100644
--- a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2D.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2D.pbtxt
@@ -53,6 +53,16 @@ END
     name: "padding"
     description: <<END
 The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 4.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each
+filter element on that dimension. The dimension order is determined by the
+value of `data_format`, see above for details. Dilations in the batch and
+depth dimensions must be 1.
 END
   }
   summary: "Computes a 2D convolution given quantized 4D input and filter tensors."
diff --git a/tensorflow/core/api_def/base_api/api_def_RandomDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_RandomDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0466b40f85eb118c94404e2f0d7670392bc7afdf
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RandomDataset.pbtxt
@@ -0,0 +1,18 @@
+op {
+  graph_op_name: "RandomDataset"
+  in_arg {
+    name: "seed"
+    description: <<END
+A scalar seed for the random number generator. If either seed or
+seed2 is set to be non-zero, the random number generator is seeded
+by the given seed.  Otherwise, a random seed is used.
+END
+  }
+  in_arg {
+    name: "seed2"
+    description: <<END
+A second scalar seed to avoid seed collision.
+END
+  }
+  summary: "Creates a Dataset that returns pseudorandom numbers."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RecordInput.pbtxt b/tensorflow/core/api_def/base_api/api_def_RecordInput.pbtxt
index 7efc8cd8334e80be3b1cc8ba5b50c2259931b1b6..333144d76e3f78204a8e35cbbf195871bbed3aef 100644
--- a/tensorflow/core/api_def/base_api/api_def_RecordInput.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_RecordInput.pbtxt
@@ -41,6 +41,13 @@ END
     name: "batch_size"
     description: <<END
 The batch size.
+END
+  }
+  attr {
+    name: "compression_type"
+    description: <<END
+The type of compression for the file. Currently ZLIB and
+GZIP are supported. Defaults to none.
 END
   }
   summary: "Emits randomized records."
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAddSign.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAddSign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..94ba3a8d81abdc40e781c8c76c43123b38567c6b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAddSign.pbtxt
@@ -0,0 +1,59 @@
+op {
+  graph_op_name: "ResourceApplyAddSign"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "m"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "alpha"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "sign_decay"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and m tensors is
+protected by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the AddSign update."
+  description: <<END
+m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+update <- (alpha + sign_decay * sign(g) *sign(m)) * g
+variable <- variable - lr_t * update
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyPowerSign.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyPowerSign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..909861e668a7b6911523861624c37657df5a154f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyPowerSign.pbtxt
@@ -0,0 +1,59 @@
+op {
+  graph_op_name: "ResourceApplyPowerSign"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "m"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "logbase"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "sign_decay"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and m tensors is
+protected by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the AddSign update."
+  description: <<END
+m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
+variable <- variable - lr_t * update
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdUpdate.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b07ee9fda94851b7bc64a02dbf748b74eb63cdee
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdUpdate.pbtxt
@@ -0,0 +1,69 @@
+op {
+  graph_op_name: "ResourceScatterNdUpdate"
+  in_arg {
+    name: "ref"
+    description: <<END
+A resource handle. Must be from a VarHandleOp.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A Tensor. Must be one of the following types: int32, int64.
+A tensor of indices into ref.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A Tensor. Must have the same type as ref. A tensor of updated
+values to add to ref.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+An optional bool. Defaults to True. If True, the assignment will
+be protected by a lock; otherwise the behavior is undefined,
+but may exhibit less contention.
+END
+  }
+  summary: "Applies sparse `updates` to individual values or slices within a given"
+  description: <<END
+variable according to `indices`.
+
+`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+`indices` must be integer tensor, containing indices into `ref`.
+It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+The innermost dimension of `indices` (with length `K`) corresponds to
+indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+dimension of `ref`.
+
+`updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+```
+[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+```
+
+For example, say we want to update 4 scattered elements to a rank-1 tensor to
+8 elements. In Python, that update would look like this:
+
+```python
+    ref = tfe.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+    indices = tf.constant([[4], [3], [1] ,[7]])
+    updates = tf.constant([9, 10, 11, 12])
+    update = tf.scatter_nd_update(ref, indices, updates)
+    with tf.Session() as sess:
+      print sess.run(update)
+```
+
+The resulting update to ref would look like this:
+
+    [1, 11, 3, 10, 9, 6, 7, 12]
+
+See @{tf.scatter_nd} for more details about how to make updates to
+slices.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SerializeManySparse.pbtxt b/tensorflow/core/api_def/base_api/api_def_SerializeManySparse.pbtxt
index 0010bca0b017bcaac5552f7aa9462b0c56d4c01a..d46b4b20eeb58ef1cc261372d69acfe5a70668fe 100644
--- a/tensorflow/core/api_def/base_api/api_def_SerializeManySparse.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SerializeManySparse.pbtxt
@@ -18,7 +18,14 @@ END
 1-D.  The `shape` of the minibatch `SparseTensor`.
 END
   }
-  summary: "Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` string `Tensor`."
+  attr {
+    name: "out_type"
+    description: <<END
+The `dtype` to use for serialization; the supported types are `string`
+(default) and `variant`.
+END
+  }
+  summary: "Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object."
   description: <<END
 The `SparseTensor` must have rank `R` greater than 1, and the first dimension
 is treated as the minibatch dimension.  Elements of the `SparseTensor`
diff --git a/tensorflow/core/api_def/base_api/api_def_SerializeSparse.pbtxt b/tensorflow/core/api_def/base_api/api_def_SerializeSparse.pbtxt
index bb4a352d489c597b6e953bc79e307b0d74042e14..491f69fda088edb8a051b81e65d581094823ca5a 100644
--- a/tensorflow/core/api_def/base_api/api_def_SerializeSparse.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SerializeSparse.pbtxt
@@ -18,5 +18,12 @@ END
 1-D.  The `shape` of the `SparseTensor`.
 END
   }
-  summary: "Serialize a `SparseTensor` into a string 3-vector (1-D `Tensor`) object."
+  attr {
+    name: "out_type"
+    description: <<END
+The `dtype` to use for serialization; the supported types are `string`
+(default) and `variant`.
+END
+  }
+  summary: "Serialize a `SparseTensor` into a `[3]` `Tensor` object."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_ShuffleAndRepeatDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ShuffleAndRepeatDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fb425b24a4134366df1129df63dc0361537dd746
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ShuffleAndRepeatDataset.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "ShuffleAndRepeatDataset"
+  in_arg {
+    name: "buffer_size"
+    description: <<END
+The number of output elements to buffer in an iterator over
+this dataset. Compare with the `min_after_dequeue` attr when creating a
+`RandomShuffleQueue`.
+END
+  }
+  in_arg {
+    name: "seed"
+    description: <<END
+A scalar seed for the random number generator. If either `seed` or
+`seed2` is set to be non-zero, the random number generator is seeded
+by the given seed.  Otherwise, a random seed is used.
+END
+  }
+  in_arg {
+    name: "seed2"
+    description: <<END
+A second scalar seed to avoid seed collision.
+END
+  }
+  in_arg {
+    name: "count"
+    description: <<END
+A scalar representing the number of times the underlying dataset
+should be repeated. The default is `-1`, which results in infinite repetition.
+END
+  }
+  summary: "Creates a dataset that shuffles and repeats elements from `input_dataset`"
+  description: <<END
+pseudorandomly.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ShuffleDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ShuffleDataset.pbtxt
index b12d3af9d74411fb46fb50d7dba57b7e60bbe933..ea5c52c0ee3826076b855ca243f03cb940b8e0b2 100644
--- a/tensorflow/core/api_def/base_api/api_def_ShuffleDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ShuffleDataset.pbtxt
@@ -11,8 +11,8 @@ END
   in_arg {
     name: "seed"
     description: <<END
-A scalar seed for the random number generator. If either seed or
-seed2 is set to be non-zero, the random number generator is seeded
+A scalar seed for the random number generator. If either `seed` or
+`seed2` is set to be non-zero, the random number generator is seeded
 by the given seed.  Otherwise, a random seed is used.
 END
   }
diff --git a/tensorflow/core/api_def/base_api/api_def_Snapshot.pbtxt b/tensorflow/core/api_def/base_api/api_def_Snapshot.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..49b7f5798cd58d7c96c9b0a582a6d79df4dab5a6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Snapshot.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Snapshot"
+  summary: "Returns a copy of the input tensor."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SpaceToDepth.pbtxt b/tensorflow/core/api_def/base_api/api_def_SpaceToDepth.pbtxt
index 8fd3966f7038a507ea3402e300f9362bd4f3d54b..b808ff5f9cf9072bdb95e779589668160d909b8f 100644
--- a/tensorflow/core/api_def/base_api/api_def_SpaceToDepth.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SpaceToDepth.pbtxt
@@ -25,7 +25,7 @@ with the following options:
   "NHWC": `[ batch, height, width, channels ]`
   "NCHW": `[ batch, channels, height, width ]`
   "NCHW_VECT_C":
-      `qint8 [ batch, channels / 4, height, width, channels % 4 ]`
+      `qint8 [ batch, channels / 4, height, width, 4 ]`
 
 It is useful to consider the operation as transforming a 6-D Tensor.
 e.g. for data_format = NHWC,
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d6e105400307b178720a3b1e04955aaad61c9931
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "SparseSegmentMeanWithNumSegments"
+  in_arg {
+    name: "indices"
+    description: <<END
+A 1-D tensor. Has same rank as `segment_ids`.
+END
+  }
+  in_arg {
+    name: "segment_ids"
+    description: <<END
+A 1-D tensor. Values should be sorted and can be repeated.
+END
+  }
+  in_arg {
+    name: "num_segments"
+    description: <<END
+Should equal the number of distinct segment IDs.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has same shape as data, except for dimension 0 which has size
+`num_segments`.
+END
+  }
+  summary: "Computes the mean along sparse segments of a tensor."
+  description: <<END
+Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
+misisng, the `output` tensor at that position will be zeroed.
+
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9ba98b81911cc85d942d91a0f689cb075fc987e9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
@@ -0,0 +1,38 @@
+op {
+  graph_op_name: "SparseSegmentSqrtNWithNumSegments"
+  in_arg {
+    name: "indices"
+    description: <<END
+A 1-D tensor. Has same rank as `segment_ids`.
+END
+  }
+  in_arg {
+    name: "segment_ids"
+    description: <<END
+A 1-D tensor. Values should be sorted and can be repeated.
+END
+  }
+  in_arg {
+    name: "num_segments"
+    description: <<END
+Should equal the number of distinct segment IDs.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has same shape as data, except for dimension 0 which
+has size `k`, the number of segments.
+END
+  }
+  summary: "Computes the sum along sparse segments of a tensor divided by the sqrt of N."
+  description: <<END
+N is the size of the segment being reduced.
+
+Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
+misisng, the `output` tensor at that position will be zeroed.
+
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3aeaba38e9447d175e33eae4cf6168679129bc8d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
@@ -0,0 +1,57 @@
+op {
+  graph_op_name: "SparseSegmentSumWithNumSegments"
+  in_arg {
+    name: "indices"
+    description: <<END
+A 1-D tensor. Has same rank as `segment_ids`.
+END
+  }
+  in_arg {
+    name: "segment_ids"
+    description: <<END
+A 1-D tensor. Values should be sorted and can be repeated.
+END
+  }
+  in_arg {
+    name: "num_segments"
+    description: <<END
+Should equal the number of distinct segment IDs.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has same shape as data, except for dimension 0 which
+has size `num_segments`.
+END
+  }
+  summary: "Computes the sum along sparse segments of a tensor."
+  description: <<END
+Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
+misisng, the `output` tensor at that position will be zeroed.
+
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+
+For example:
+
+```python
+c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+
+tf.sparse_segment_sum_with_num_segments(
+    c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
+# => [[0 0 0 0]
+#     [0 0 0 0]
+#     [0 0 0 0]]
+
+tf.sparse_segment_sum_with_num_segments(c,
+                                        tf.constant([0, 1]),
+                                        tf.constant([0, 2],
+                                        num_segments=4))
+# => [[ 1  2  3  4]
+#     [ 0  0  0  0]
+#     [-1 -2 -3 -4]
+#     [ 0  0  0  0]]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StatsAggregatorHandle.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatsAggregatorHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9b30d64afe18a71fbbe73b397979796b8b844faa
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StatsAggregatorHandle.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StatsAggregatorHandle"
+  summary: "Creates a statistics manager resource."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StatsAggregatorSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatsAggregatorSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bcaf9fea1af5123848b2d6267b3ef0f7279a7230
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StatsAggregatorSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StatsAggregatorSummary"
+  summary: "Produces a summary of any statistics recorded by the given statistics manager."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayV3.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayV3.pbtxt
index d1de753ee59372fcb30fba7006f4864a33d3980f..48ac6f5e7def2e19434660f96798aa1bc834c25e 100644
--- a/tensorflow/core/api_def/base_api/api_def_TensorArrayV3.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayV3.pbtxt
@@ -48,6 +48,17 @@ END
 If true (default), Tensors in the TensorArray are cleared
 after being read.  This disables multiple read semantics but allows early
 release of memory.
+END
+  }
+  attr {
+    name: "identical_element_shapes"
+    description: <<END
+If true (default is false), then all
+elements in the TensorArray will be expected to have have identical shapes.
+This allows certain behaviors, like dynamically checking for
+consistent shapes on write, and being able to fill in properly
+shaped zero tensors on stack -- even if the element_shape attribute
+is not fully defined.
 END
   }
   attr {
diff --git a/tensorflow/core/api_def/base_api/api_def_UniqueV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_UniqueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cd7ec6e5518c5a7788bb4fff88a38b74295e9df4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UniqueV2.pbtxt
@@ -0,0 +1,47 @@
+op {
+  graph_op_name: "UniqueV2"
+  in_arg {
+    name: "x"
+    description: <<END
+A `Tensor`.
+END
+  }
+  in_arg {
+    name: "axis"
+    description: <<END
+A `Tensor` of type `int64` (default: 0). The axis of the Tensor to
+find the unique elements.
+END
+  }
+  out_arg {
+    name: "y"
+    description: <<END
+A `Tensor`. Unique elements along the `axis` of `Tensor` x.
+END
+  }
+  out_arg {
+    name: "idx"
+    description: <<END
+A 1-D Tensor. Has the same type as x that contains the index of each
+value of x in the output y.
+END
+  }
+  summary: "Finds unique elements in a 1-D tensor."
+  description: <<END
+This operation returns a tensor `y` containing all of the unique elements of `x`
+sorted in the same order that they occur in `x`. This operation also returns a
+tensor `idx` the same size as `x` that contains the index of each value of `x`
+in the unique output `y`. In other words:
+
+`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+
+For example:
+
+```
+# tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+y, idx = unique(x)
+y ==> [1, 2, 4, 7, 8]
+idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
index 0a3355cdbc0ba594205a765f2e975ff87078ec71..77a96d1e03d577ca0f6dfd69c51d2551d1ad4b2a 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
@@ -26,6 +26,8 @@ need not be sorted and need not cover all values in the full
 range of valid values.
 
 If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+If the given segment ID `i` is negative, the value is dropped and will not be
+added to the sum of the segment.
 
 `num_segments` should equal the number of distinct segment IDs.
 
diff --git a/tensorflow/core/api_def/python_api/api_def_DeserializeSparse.pbtxt b/tensorflow/core/api_def/python_api/api_def_DeserializeSparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d0679907809ebc9f7762b2fdb4b1184d21259e3c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DeserializeSparse.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DeserializeSparse"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_EagerPyFunc.pbtxt b/tensorflow/core/api_def/python_api/api_def_EagerPyFunc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ee0f95dacbc09702039da97fccd98a2d8bb83b1b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_EagerPyFunc.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "EagerPyFunc"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MatrixExponential.pbtxt b/tensorflow/core/api_def/python_api/api_def_MatrixExponential.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d215b86c7256145fa4ada58c7d2b54d418f8ac7d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MatrixExponential.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MatrixExponential"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Snapshot.pbtxt b/tensorflow/core/api_def/python_api/api_def_Snapshot.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ea9ccee39765b659cc27e04a48cffc1caf97d5af
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Snapshot.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Snapshot"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/common_runtime/accumulate_n_optimizer.cc b/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
index 81cd44870e3031313ca8202ab67a333e1d6eca38..a1e3b21e4f2d6af1b7e3c68d82a77f96bd34e613 100644
--- a/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
+++ b/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
@@ -35,7 +35,7 @@ Tensor make_zeros(const DataType& dtype, const TensorShapeProto& shape) {
 // Replaces occurrences of the "AccumulateNV2" stub operator with a graph of
 // lower-level ops. The graph is equivalent (modulo certain corner cases)
 // to the semantics of the original accumulate_n() Python op in math_ops.py.
-// Implementing the op with a rewrite allows this new variant of accumulate_n 
+// Implementing the op with a rewrite allows this new variant of accumulate_n
 // to be differentiable.
 //
 // The binary code that generates AccumulateNV2 stub ops is located in a
diff --git a/tensorflow/core/common_runtime/copy_tensor.cc b/tensorflow/core/common_runtime/copy_tensor.cc
index 9084081119b2285eee5c9b2b250be464ca562843..e35548729b993c68f6e58180e0c2dc18b4eea801 100644
--- a/tensorflow/core/common_runtime/copy_tensor.cc
+++ b/tensorflow/core/common_runtime/copy_tensor.cc
@@ -90,8 +90,7 @@ void CopyHostToDevice(const Tensor* input, Allocator* cpu_allocator,
     Status s_copy_init;
     for (int64 i = 0; i < input->NumElements(); ++i) {
       s_copy_init = VariantDeviceCopy(
-          VariantDeviceCopyDirection::HOST_TO_DEVICE, v[i], &v_out[i],
-          (input->NumElements() == 1) ? std::move(copier) : copier);
+          VariantDeviceCopyDirection::HOST_TO_DEVICE, v[i], &v_out[i], copier);
       if (!s_copy_init.ok()) {
         status_cb->UpdateStatus(s_copy_init);
         break;
@@ -149,8 +148,7 @@ void CopyDeviceToHost(const Tensor* input, Allocator* cpu_allocator,
     Status s_copy_init;
     for (int64 i = 0; i < input->NumElements(); ++i) {
       s_copy_init = VariantDeviceCopy(
-          VariantDeviceCopyDirection::DEVICE_TO_HOST, v[i], &v_out[i],
-          (input->NumElements() == 1) ? std::move(copier) : copier);
+          VariantDeviceCopyDirection::DEVICE_TO_HOST, v[i], &v_out[i], copier);
       if (!s_copy_init.ok()) {
         status_cb->UpdateStatus(s_copy_init);
         break;
@@ -213,9 +211,9 @@ void CopyDeviceToDevice(CopyTensor::CopyFunction copy_function,
     Variant* v_out = copy.flat<Variant>().data();
     Status s_copy_init;
     for (int64 i = 0; i < input->NumElements(); ++i) {
-      s_copy_init = VariantDeviceCopy(
-          VariantDeviceCopyDirection::DEVICE_TO_DEVICE, v[i], &v_out[i],
-          (input->NumElements() == 1) ? std::move(copier) : copier);
+      s_copy_init =
+          VariantDeviceCopy(VariantDeviceCopyDirection::DEVICE_TO_DEVICE, v[i],
+                            &v_out[i], copier);
       if (!s_copy_init.ok()) {
         status_cb->UpdateStatus(s_copy_init);
         break;
diff --git a/tensorflow/core/common_runtime/device.h b/tensorflow/core/common_runtime/device.h
index 3912cd177b6ceee11ea89bd933989db42d4d333d..d5a452a796d67400d56ca08c675e0386348dea13 100644
--- a/tensorflow/core/common_runtime/device.h
+++ b/tensorflow/core/common_runtime/device.h
@@ -131,7 +131,7 @@ class Device : public DeviceBase {
   OpSegment* op_segment() { return &op_seg_; }
 
   // Returns the resource manager associated w/ this device.
-  ResourceMgr* resource_manager() { return rmgr_; }
+  virtual ResourceMgr* resource_manager() { return rmgr_; }
 
   // Summarizes the status of this Device, for debugging.
   string DebugString() const { return ProtoDebugString(device_attributes_); }
diff --git a/tensorflow/core/common_runtime/device_factory.cc b/tensorflow/core/common_runtime/device_factory.cc
index fa12c48fb90064ed2de68a6d018a17551ec3390a..b43c718817558f0e44eff5f5e5d5ec3a81d25ddd 100644
--- a/tensorflow/core/common_runtime/device_factory.cc
+++ b/tensorflow/core/common_runtime/device_factory.cc
@@ -32,7 +32,7 @@ namespace tensorflow {
 namespace {
 
 static mutex* get_device_factory_lock() {
-  static mutex device_factory_lock;
+  static mutex device_factory_lock(LINKER_INITIALIZED);
   return &device_factory_lock;
 }
 
diff --git a/tensorflow/core/common_runtime/device_mgr.h b/tensorflow/core/common_runtime/device_mgr.h
index d16681ac59d3bc34a54f63b8b55f372c661591b4..cd93f76324b937046f61b305a65fb53c2c133ab7 100644
--- a/tensorflow/core/common_runtime/device_mgr.h
+++ b/tensorflow/core/common_runtime/device_mgr.h
@@ -68,7 +68,7 @@ class DeviceMgr {
 
   StringPiece CopyToBackingStore(StringPiece s);
 
-  std::unordered_map<StringPiece, Device*, StringPiece::Hasher> device_map_;
+  std::unordered_map<StringPiece, Device*, StringPieceHasher> device_map_;
   core::Arena name_backing_store_;  // Storage for keys in device_map_
   std::unordered_map<string, int> device_type_counts_;
 
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 2f57164dcd8d676fc4269a31258e44f014dd4960..6e243c4b7c31ceb1b3c21b40981fde96a9c173b1 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -54,15 +54,13 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/device_tracer.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/device_name_utils.h"
 #include "tensorflow/core/util/env_var.h"
 
-#if GOOGLE_CUDA
-#include "tensorflow/core/platform/gpu_tracer.h"
-#endif  // GOOGLE_CUDA
 
 namespace tensorflow {
 
@@ -523,9 +521,7 @@ Status DirectSession::Run(const RunOptions& run_options,
 
   args.rendezvous = run_state.rendez;
   args.cancellation_manager = &step_cancellation_manager;
-  args.runner = [this, pool](Executor::Args::Closure c) {
-    SchedClosure(pool, std::move(c));
-  };
+
   args.session_state = &session_state_;
   args.tensor_store = &run_state.tensor_store;
   args.step_container = &run_state.step_container;
@@ -555,15 +551,19 @@ Status DirectSession::Run(const RunOptions& run_options,
     args.stats_collector = run_state.collector.get();
   }
 
-#if GOOGLE_CUDA
-  std::unique_ptr<GPUTracer> tracer;
+  std::unique_ptr<DeviceTracer> tracer;
   if (run_options.trace_level() >= RunOptions::HARDWARE_TRACE) {
-    tracer = CreateGPUTracer();
-    // tracer will be NULL on non-GPU platforms.
-    // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
-    if (tracer) tracer->Start().IgnoreError();
+    tracer = CreateDeviceTracer();
+    // tracer may be NULL on platforms without accelerators.
+    if (tracer) {
+      Status s = tracer->Start();
+      if (!s.ok()) {
+        run_state.executors_done.Notify();
+        delete barrier;
+        return s;
+      }
+    }
   }
-#endif  // GOOGLE_CUDA
 
   // Register this step with session's cancellation manager, so that
   // `Session::Close()` will cancel the step.
@@ -582,7 +582,23 @@ Status DirectSession::Run(const RunOptions& run_options,
     return errors::Cancelled("Run call was cancelled");
   }
 
+  Executor::Args::Runner default_runner = [this,
+                                           pool](Executor::Args::Closure c) {
+    SchedClosure(pool, std::move(c));
+  };
   for (const auto& item : executors_and_keys->items) {
+    // TODO(zhengxq): support partial run.
+    // TODO(zhengxq): if the device picks its own threadpool, we need to assign
+    //     less threads to the main compute pool by default.
+    thread::ThreadPool* device_thread_pool =
+        item.device->tensorflow_device_thread_pool();
+    if (!device_thread_pool) {
+      args.runner = default_runner;
+    } else {
+      args.runner = [this, device_thread_pool](Executor::Args::Closure c) {
+        SchedClosure(device_thread_pool, std::move(c));
+      };
+    }
     item.executor->RunAsync(args, barrier->Get());
   }
 
@@ -598,13 +614,10 @@ Status DirectSession::Run(const RunOptions& run_options,
     run_state.status.Update(errors::Cancelled("Run call was cancelled"));
   }
 
-#if GOOGLE_CUDA
   if (tracer) {
-    // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
-    tracer->Stop().IgnoreError();
-    tracer->Collect(args.stats_collector).IgnoreError();
+    TF_RETURN_IF_ERROR(tracer->Stop());
+    TF_RETURN_IF_ERROR(tracer->Collect(args.stats_collector));
   }
-#endif  // GOOGLE_CUDA
 
   {
     mutex_lock l(run_state.mu_);
@@ -1136,7 +1149,7 @@ Status DirectSession::GetOrCreateExecutors(
 
   if (run_state_args->is_partial_run) {
     ek->graph = std::move(run_state_args->graph);
-    std::unordered_set<StringPiece, StringPiece::Hasher> names;
+    std::unordered_set<StringPiece, StringPieceHasher> names;
     for (const string& input : inputs) {
       TensorId id(ParseTensorName(input));
       names.emplace(id.first);
@@ -1187,8 +1200,14 @@ Status DirectSession::GetOrCreateExecutors(
     auto opseg = device->op_segment();
     params.create_kernel = [this, lib, opseg](const NodeDef& ndef,
                                               OpKernel** kernel) {
-      // Caches the kernel only if the node is stateful.
-      if (!lib->IsStateful(ndef.op())) {
+      // We do not share the kernel via the OpSegment if the node is
+      // stateless, or a function.
+      // NOTE(mrry): We must not share function kernels (implemented
+      // using `CallOp`) between subgraphs, because `CallOp::handle_`
+      // is tied to a particular subgraph. Even if the function itself
+      // is stateful, the `CallOp` that invokes it is not.
+      if (!lib->IsStateful(ndef.op()) ||
+          lib->GetFunctionLibraryDefinition()->Find(ndef.op()) != nullptr) {
         return lib->CreateKernel(ndef, kernel);
       }
       auto create_fn = [lib, &ndef](OpKernel** kernel) {
@@ -1223,6 +1242,7 @@ Status DirectSession::GetOrCreateExecutors(
     // NewLocalExecutor takes ownership of partition_graph.
     item->graph = partition_graph.get();
     item->executor = nullptr;
+    item->device = device;
     Executor* executor;
     TF_RETURN_IF_ERROR(
         NewLocalExecutor(params, partition_graph.release(), &executor));
diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h
index 7fbabf6d818f3f8ace64235724f35740fee5cec0..ab768b97c48420e92beb360dc6aa97f42e59ca61 100644
--- a/tensorflow/core/common_runtime/direct_session.h
+++ b/tensorflow/core/common_runtime/direct_session.h
@@ -64,8 +64,7 @@ class DirectSession : public Session {
   ~DirectSession() override;
 
   typedef std::vector<std::pair<string, Tensor>> NamedTensorList;
-  typedef std::unordered_map<StringPiece, Node*, StringPiece::Hasher>
-      NameNodeMap;
+  typedef std::unordered_map<StringPiece, Node*, StringPieceHasher> NameNodeMap;
 
   ::tensorflow::Status Create(const GraphDef& graph) override;
   ::tensorflow::Status Extend(const GraphDef& graph) override;
@@ -113,6 +112,7 @@ class DirectSession : public Session {
   // every partition.
   struct PerPartitionExecutorsAndLib {
     Graph* graph = nullptr;                  // not owned.
+    Device* device = nullptr;                // not owned.
     FunctionLibraryRuntime* flib = nullptr;  // not owned.
     std::unique_ptr<Executor> executor;
   };
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index 1896baaf668864fc1b29ac3ea6c9b1ab6eaaaeaa..fe1cf1b12e0c62e560e5bcac0cf3c203ba091af8 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -1188,7 +1188,7 @@ class ExecutorState {
   // QUESTION: Make it a checkpoint::TensorSliceReaderCacheWrapper
   // instead of a pointer?  (avoids having to delete).
   checkpoint::TensorSliceReaderCacheWrapper* slice_reader_cache_;
-  FunctionCallFrame* call_frame_;
+  CallFrameInterface* call_frame_;
   const ExecutorImpl* impl_;
   CancellationManager* cancellation_manager_;
   Executor::Args::Runner runner_;
diff --git a/tensorflow/core/common_runtime/executor.h b/tensorflow/core/common_runtime/executor.h
index e09dc4e34630fc0ab22615b7204bd0ec2d117d35..b5f4ebb00532670f06d1182088395f46c3481ed6 100644
--- a/tensorflow/core/common_runtime/executor.h
+++ b/tensorflow/core/common_runtime/executor.h
@@ -84,7 +84,7 @@ class Executor {
     int64 step_id = 0;
     Rendezvous* rendezvous = nullptr;
     StepStatsCollector* stats_collector = nullptr;
-    FunctionCallFrame* call_frame = nullptr;
+    CallFrameInterface* call_frame = nullptr;
     CancellationManager* cancellation_manager = nullptr;
     SessionState* session_state = nullptr;
     TensorStore* tensor_store = nullptr;
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index 23d0f331c5d096cfb944da48e9b5ce58e04daf65..b921cbcafca3569909e493cc608f4ecdb2a50d75 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -153,12 +153,20 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
   Status Instantiate(const string& function_name, AttrSlice attrs,
                      Handle* handle) override;
 
+  Status ReleaseHandle(Handle handle) override;
+
   const FunctionBody* GetFunctionBody(Handle handle) override;
 
   Status CreateKernel(const NodeDef& ndef, OpKernel** kernel) override;
 
   void Run(const Options& opts, Handle handle, gtl::ArraySlice<Tensor> args,
            std::vector<Tensor>* rets, DoneCallback done) override;
+  // NOTE(mrry): This overload is currently only implemented for local function
+  // execution.
+  // TODO(b/70346412): Implement support for remote function execution when
+  // passing a call frame.
+  void Run(const Options& opts, Handle handle, CallFrameInterface* frame,
+           DoneCallback done) override;
 
   bool IsStateful(const string& function) override;
 
@@ -190,18 +198,21 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
 
   mutable mutex mu_;
 
-  // func_graphs_ never shrinks or reorders its members.
-  std::vector<FunctionBody*> func_graphs_ GUARDED_BY(mu_);
+  int next_handle_ GUARDED_BY(mu_);
 
   // The instantiated and transformed function is encoded as a Graph
   // object, and an executor is created for the graph.
   struct Item : public core::RefCounted {
     const Graph* graph = nullptr;  // Owned by exec.
+    FunctionBody* func_graph = nullptr;
     Executor* exec = nullptr;
 
-    ~Item() override { delete this->exec; }
+    ~Item() override {
+      delete this->func_graph;
+      delete this->exec;
+    }
   };
-  std::vector<Item*> items_;
+  std::unordered_map<Handle, Item*> items_ GUARDED_BY(mu_);
 
   ProcessFunctionLibraryRuntime* parent_ = nullptr;  // not owned.
 
@@ -236,6 +247,7 @@ FunctionLibraryRuntimeImpl::FunctionLibraryRuntimeImpl(
       device_name_(device_ == nullptr
                        ? ProcessFunctionLibraryRuntime::kDefaultFLRDevice
                        : device_->name()),
+      next_handle_(0),
       parent_(parent) {
   get_func_sig_ = [this](const string& op, const OpDef** sig) {
     return lib_def_->LookUpOpDef(op, sig);
@@ -246,9 +258,9 @@ FunctionLibraryRuntimeImpl::FunctionLibraryRuntimeImpl(
 }
 
 FunctionLibraryRuntimeImpl::~FunctionLibraryRuntimeImpl() {
-  for (FunctionBody* p : func_graphs_) delete p;
-  for (Item* item : items_)
-    if (item) item->Unref();
+  for (auto item : items_) {
+    if (item.second) item.second->Unref();
+  }
 }
 
 // An asynchronous op kernel which executes an instantiated function
@@ -309,9 +321,8 @@ const FunctionBody* FunctionLibraryRuntimeImpl::GetFunctionBody(Handle h) {
   }
 
   mutex_lock l(mu_);
-  CHECK_LE(0, local_handle);
-  CHECK_LT(local_handle, func_graphs_.size());
-  return func_graphs_[local_handle];
+  CHECK_EQ(1, items_.count(local_handle));
+  return items_[local_handle]->func_graph;
 }
 
 Status FunctionLibraryRuntimeImpl::CreateKernel(const NodeDef& ndef,
@@ -337,7 +348,7 @@ Status FunctionLibraryRuntimeImpl::CreateKernel(const NodeDef& ndef,
                                  kernel);
   }
 
-  // Try to instantiate this function for the func/attr. Maybe its
+  // Try to instantiate this function for the func/attr. Maybe it's
   // cached already.
   Handle handle;
   TF_RETURN_IF_ERROR(Instantiate(ndef.op(), AttrSlice(&ndef.attr()), &handle));
@@ -478,14 +489,32 @@ Status FunctionLibraryRuntimeImpl::Instantiate(const string& function_name,
     if (*handle != kInvalidHandle) {
       delete fbody;
     } else {
-      *handle = parent_->AddHandle(key, device_name_, func_graphs_.size());
-      func_graphs_.push_back(fbody);
-      items_.resize(func_graphs_.size());
+      *handle = parent_->AddHandle(key, device_name_, next_handle_);
+      Item* item = new Item;
+      item->func_graph = fbody;
+      items_.insert({next_handle_, item});
+      next_handle_++;
     }
   }
   return Status::OK();
 }
 
+Status FunctionLibraryRuntimeImpl::ReleaseHandle(Handle handle) {
+  if (!parent_->IsInstantiatedOnDevice(device_name_, handle)) {
+    return parent_->ReleaseHandle(handle);
+  }
+
+  LocalHandle h = parent_->GetHandleOnDevice(device_name_, handle);
+  mutex_lock l(mu_);
+  CHECK_EQ(1, items_.count(h));
+  Item* item = items_[h];
+  if (item->Unref()) {
+    items_.erase(h);
+    TF_RETURN_IF_ERROR(parent_->RemoveHandle(handle));
+  }
+  return Status::OK();
+}
+
 void DumpGraph(StringPiece label, const Graph* g) {
   // TODO(zhifengc): Change Graph to record #nodes.
   VLOG(1) << "Graph " << label << " #nodes " << g->num_nodes() << " #edges "
@@ -506,12 +535,39 @@ void OptimizeGraph(FunctionLibraryRuntime* lib, std::unique_ptr<Graph>* g) {
   optimizer.Optimize(lib, lib->env(), lib->device(), g, /*shape_map=*/nullptr);
 }
 
+namespace {
+// Removes all stateless nodes that do not contribute to a return
+// value from the function body.  Unlike `RemoveDeadNodes()`, which is
+// triggered by `OptimizerOptions.do_function_inlining`, this pass
+// ignores the SINK node, from which (by definition) all nodes are
+// reverse reachable.
+void PruneFunctionBody(Graph* g) {
+  VLOG(2) << "Pruning function body";
+  std::unordered_set<const Node*> nodes;
+  for (auto n : g->nodes()) {
+    // NOTE(mrry): "_Retval" nodes are stateful, and so will be added
+    // to the seed set of `nodes`.
+    // TODO(mrry): Investigate whether the `n->IsControlFlow()` test is
+    // still needed. It would be preferable to prune entire loops and/or
+    // conditionals if they are not used in the graph.
+    if (n->IsControlFlow() || n->op_def().is_stateful()) {
+      nodes.insert(n);
+    }
+  }
+  bool changed = PruneForReverseReachability(g, std::move(nodes));
+  if (changed) {
+    FixupSourceAndSinkEdges(g);
+  }
+}
+}  // namespace
+
 Status FunctionLibraryRuntimeImpl::CreateItem(Handle handle, Item** item) {
   const FunctionBody* fbody = GetFunctionBody(handle);
   CHECK_NOTNULL(fbody);
   std::unique_ptr<Graph> g(new Graph(lib_def_));
   CopyGraph(*fbody->graph, g.get());
 
+  PruneFunctionBody(g.get());
   optimizer_.Optimize(this, env(), device(), &g, /*shape_map=*/nullptr);
   TF_RETURN_IF_ERROR(EnsureMemoryTypes(DeviceType(device()->device_type()),
                                        device()->name(), g.get()));
@@ -529,9 +585,16 @@ Status FunctionLibraryRuntimeImpl::CreateItem(Handle handle, Item** item) {
   Executor* exec;
   TF_RETURN_IF_ERROR(NewLocalExecutor(params, g.release(), &exec));
 
-  *item = new Item;
-  (*item)->graph = graph;
-  (*item)->exec = exec;
+  {
+    // Guard item since it is already inserted in items_.
+    mutex_lock l(mu_);
+    if ((*item)->exec) {
+      delete exec;
+    } else {
+      (*item)->graph = graph;
+      (*item)->exec = exec;
+    }
+  }
   return Status::OK();
 }
 
@@ -539,29 +602,18 @@ Status FunctionLibraryRuntimeImpl::GetOrCreateItem(Handle handle, Item** item) {
   LocalHandle local_handle = parent_->GetHandleOnDevice(device_name_, handle);
   {
     mutex_lock l(mu_);
-    if (local_handle >= items_.size()) {
+    if (items_.count(local_handle) == 0) {
       return errors::NotFound("Function handle ", handle,
                               " is not valid. Likely an internal error.");
     }
     *item = items_[local_handle];
-    if (*item != nullptr) {
-      (*item)->Ref();
+    if ((*item)->exec != nullptr) {
       return Status::OK();
     }
   }
   // NOTE: We need to call CreateItem out of mu_ because creating an
   // executor needs to call CreateKernel.
-  TF_RETURN_IF_ERROR(CreateItem(handle, item));
-
-  {
-    mutex_lock l(mu_);
-    if (items_[local_handle] == nullptr) {
-      // Install *item in items_.
-      items_[local_handle] = *item;
-      (*item)->Ref();
-    }
-  }
-  return Status::OK();
+  return CreateItem(handle, item);
 }
 
 void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
@@ -569,14 +621,13 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
                                            std::vector<Tensor>* rets,
                                            Executor::Args* exec_args,
                                            Item* item, DoneCallback done) {
-  FunctionCallFrame* frame = exec_args->call_frame;
+  DCHECK(exec_args->call_frame == nullptr);
   string target_device = parent_->GetDeviceName(handle);
   string source_device = opts.source_device;
   Rendezvous* rendezvous = opts.rendezvous;
   DeviceContext* device_context;
   Status s = parent_->GetDeviceContext(target_device, &device_context);
   if (!s.ok()) {
-    delete frame;
     delete exec_args;
     done(s);
     return;
@@ -584,6 +635,16 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
   int64 src_incarnation, target_incarnation;
   s = parent_->GetDeviceIncarnation(source_device, &src_incarnation);
   s.Update(parent_->GetDeviceIncarnation(target_device, &target_incarnation));
+  if (!s.ok()) {
+    delete exec_args;
+    done(s);
+    return;
+  }
+
+  const FunctionBody* fbody = GetFunctionBody(handle);
+  FunctionCallFrame* frame =
+      new FunctionCallFrame(fbody->arg_types, fbody->ret_types);
+  exec_args->call_frame = frame;
   if (!s.ok()) {
     delete frame;
     delete exec_args;
@@ -617,7 +678,6 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
             *exec_args, [item, frame, rets, done, source_device, target_device,
                          target_incarnation, rendezvous, device_context,
                          remote_args, exec_args](const Status& status) {
-              item->Unref();
               Status s = status;
               if (s.ok()) {
                 s = frame->ConsumeRetvals(rets);
@@ -661,17 +721,7 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
     parent_->Run(run_opts, handle, args, rets, done);
     return;
   }
-  const FunctionBody* fbody = GetFunctionBody(handle);
-  FunctionCallFrame* frame =
-      new FunctionCallFrame(fbody->arg_types, fbody->ret_types);
 
-  Item* item = nullptr;
-  Status s = GetOrCreateItem(handle, &item);
-  if (!s.ok()) {
-    delete frame;
-    done(s);
-    return;
-  }
   DCHECK(run_opts.runner != nullptr);
 
   Executor::Args* exec_args = new Executor::Args;
@@ -679,16 +729,28 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
   exec_args->step_id = run_opts.step_id;
   exec_args->rendezvous = run_opts.rendezvous;
   exec_args->stats_collector = run_opts.stats_collector;
-  exec_args->call_frame = frame;
   exec_args->cancellation_manager = run_opts.cancellation_manager;
   exec_args->step_container = run_opts.step_container;
   exec_args->runner = *run_opts.runner;
 
+  Item* item = nullptr;
+  Status s = GetOrCreateItem(handle, &item);
+  if (!s.ok()) {
+    delete exec_args;
+    done(s);
+    return;
+  }
+
   if (run_opts.remote_execution) {
+    // NOTE(mrry): `RunRemote()` will set `exec_args->call_frame` for us.
     RunRemote(run_opts, handle, args, rets, exec_args, item, done);
     return;
   }
 
+  const FunctionBody* fbody = GetFunctionBody(handle);
+  FunctionCallFrame* frame =
+      new FunctionCallFrame(fbody->arg_types, fbody->ret_types);
+  exec_args->call_frame = frame;
   s = frame->SetArgs(args);
   if (!s.ok()) {
     delete frame;
@@ -696,12 +758,12 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
     done(s);
     return;
   }
+
   item->exec->RunAsync(
       // Executor args
       *exec_args,
       // Done callback.
       [item, frame, rets, done, exec_args](const Status& status) {
-        item->Unref();
         Status s = status;
         if (s.ok()) {
           s = frame->ConsumeRetvals(rets);
@@ -712,6 +774,66 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
       });
 }
 
+void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
+                                     CallFrameInterface* frame,
+                                     DoneCallback done) {
+  if (opts.cancellation_manager && opts.cancellation_manager->IsCancelled()) {
+    done(errors::Cancelled(""));
+    return;
+  }
+  if (!parent_->IsInstantiatedOnDevice(device_name_, handle) ||
+      opts.remote_execution) {
+    done(errors::Unimplemented("Remote calling with CallFrameInterface"));
+    return;
+  }
+
+  Options run_opts = opts;
+  if (opts.create_rendezvous) {
+    Rendezvous* rendezvous = new IntraProcessRendezvous(device_mgr_);
+    run_opts.rendezvous = rendezvous;
+    run_opts.create_rendezvous = false;
+    done = std::bind(
+        [rendezvous](DoneCallback done,
+                     // Begin unbound arguments.
+                     const Status& status) {
+          rendezvous->Unref();
+          done(status);
+        },
+        std::move(done), std::placeholders::_1);
+  }
+
+  Item* item = nullptr;
+  Status s = GetOrCreateItem(handle, &item);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
+  DCHECK(run_opts.runner != nullptr);
+
+  Executor::Args* exec_args = new Executor::Args;
+  // Inherit the step_id from the caller.
+  exec_args->step_id = run_opts.step_id;
+  exec_args->rendezvous = run_opts.rendezvous;
+  exec_args->stats_collector = run_opts.stats_collector;
+  exec_args->cancellation_manager = run_opts.cancellation_manager;
+  exec_args->step_container = run_opts.step_container;
+  exec_args->runner = *run_opts.runner;
+  exec_args->call_frame = frame;
+
+  item->exec->RunAsync(
+      // Executor args
+      *exec_args,
+      // Done callback.
+      std::bind(
+          [item, frame, exec_args](DoneCallback done,
+                                   // Start unbound arguments.
+                                   const Status& status) {
+            delete exec_args;
+            done(status);
+          },
+          std::move(done), std::placeholders::_1));
+}
+
 bool FunctionLibraryRuntimeImpl::IsStateful(const string& func) {
   const OpDef* op_def;
   const Status s = lib_def_->LookUpOpDef(func, &op_def);
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index d183bf7c978f1a39882b6f2f0a94386e25e5f0cd..7b553c2dcde43b1f4442bb85d2d8ad98aae144ec 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/executor.h"
 #include "tensorflow/core/common_runtime/function_testlib.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/common_runtime/step_stats_collector.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/op.h"
@@ -207,7 +208,83 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
       return status;
     }
     FunctionLibraryRuntime::Options opts;
-    return Run(flr, handle, opts, args, std::move(rets));
+    status = Run(flr, handle, opts, args, rets);
+    if (!status.ok()) return status;
+
+    // Release the handle and try running again. It should not succeed.
+    status = flr->ReleaseHandle(handle);
+    if (!status.ok()) return status;
+
+    Status status2 = Run(flr, handle, opts, args, std::move(rets));
+    EXPECT_TRUE(errors::IsInvalidArgument(status2));
+    EXPECT_TRUE(
+        StringPiece(status2.error_message()).contains("remote execution."));
+
+    return status;
+  }
+
+  Status Run(FunctionLibraryRuntime* flr, FunctionLibraryRuntime::Handle handle,
+             FunctionLibraryRuntime::Options opts, CallFrameInterface* frame) {
+    std::atomic<int32> call_count(0);
+    std::function<void(std::function<void()>)> runner =
+        [&call_count](std::function<void()> fn) {
+          ++call_count;
+          test::function::FunctionTestSchedClosure(fn);
+        };
+
+    Notification done;
+    opts.runner = &runner;
+    std::vector<Tensor> out;
+    Status status;
+    flr->Run(opts, handle, frame, [&status, &done](const Status& s) {
+      status = s;
+      done.Notify();
+    });
+    done.WaitForNotification();
+    if (!status.ok()) {
+      return status;
+    }
+
+    EXPECT_GE(call_count, 1);  // Test runner is used.
+
+    return Status::OK();
+  }
+
+  Status InstantiateAndRunViaCallFrameInterface(FunctionLibraryRuntime* flr,
+                                                const string& name,
+                                                test::function::Attrs attrs,
+                                                const std::vector<Tensor>& args,
+                                                std::vector<Tensor*> rets) {
+    FunctionLibraryRuntime::Handle handle;
+    Status status = flr->Instantiate(name, attrs, &handle);
+    if (!status.ok()) {
+      return status;
+    }
+    const FunctionBody* fbody = flr->GetFunctionBody(handle);
+    FunctionCallFrame frame(fbody->arg_types, fbody->ret_types);
+    TF_RETURN_IF_ERROR(frame.SetArgs(args));
+
+    FunctionLibraryRuntime::Options opts;
+    status = Run(flr, handle, opts, &frame);
+    if (!status.ok()) return status;
+
+    std::vector<Tensor> retvals;
+    TF_RETURN_IF_ERROR(frame.GetRetvals(&retvals));
+    CHECK_EQ(rets.size(), retvals.size());
+    for (size_t i = 0; i < rets.size(); ++i) {
+      *rets[i] = retvals[i];
+    }
+
+    // Release the handle and try running again. It should not succeed.
+    status = flr->ReleaseHandle(handle);
+    if (!status.ok()) return status;
+
+    Status status2 = Run(flr, handle, opts, args, std::move(rets));
+    EXPECT_TRUE(errors::IsInvalidArgument(status2));
+    EXPECT_TRUE(
+        StringPiece(status2.error_message()).contains("remote execution."));
+
+    return status;
   }
 
   std::unique_ptr<Graph> GetFuncBody(FunctionLibraryRuntime* flr,
@@ -268,6 +345,9 @@ TEST_F(FunctionLibraryRuntimeTest, XTimesTwo) {
   TF_CHECK_OK(
       InstantiateAndRun(flr0_, "XTimesTwo", {{"T", DT_FLOAT}}, {x}, {&y}));
   test::ExpectTensorEqual<float>(y, test::AsTensor<float>({2, 4, 6, 8}));
+  TF_CHECK_OK(InstantiateAndRunViaCallFrameInterface(
+      flr0_, "XTimesTwo", {{"T", DT_FLOAT}}, {x}, {&y}));
+  test::ExpectTensorEqual<float>(y, test::AsTensor<float>({2, 4, 6, 8}));
 }
 
 TEST_F(FunctionLibraryRuntimeTest, XTimesN) {
@@ -487,6 +567,65 @@ TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctionsWithControlDeps) {
   }
 }
 
+TEST_F(FunctionLibraryRuntimeTest, PruneBody) {
+  auto T = DT_INT32;
+  FunctionDef stateful_func = FDH::Define(
+      // Name
+      "SquareAndAddOneWithStatefulNodes",
+      // Args
+      {"x: int32"},
+      // Return values
+      {"y: int32"},
+      // Attrs
+      {},
+      // Nodes
+      {// a = Square<T>(x)
+       {{"a"}, "Square", {"x"}, {{"T", T}}},
+       // 1
+       FDH::Const("o", 1),
+       // A bunch of extra arithmetic that y doesn't depend on
+       {{"x1"}, "Add", {"o", "o"}, {{"T", T}}},
+       {{"x2"}, "Mul", {"a", "x1"}, {{"T", T}}},
+       {{"x3"}, "Mul", {"x1", "x2"}, {{"T", T}}},
+       FDH::Const<int32>("shape", {1, 2}),
+       // A stateful node.
+       {{"keep_me"},
+        "RandomUniform",
+        {"shape"},
+        {{"T", T}, {"dtype", DT_FLOAT}}},
+       // y = Add<T>(a, o)
+       {{"y"}, "Add", {"a", "o"}, {{"T", T}}}});
+  Init({stateful_func});
+
+  auto x = test::AsTensor<int32>({1, 2, 3, 4});
+  Tensor y;
+
+  FunctionLibraryRuntime::Handle handle;
+  TF_CHECK_OK(
+      Instantiate(flr0_, "SquareAndAddOneWithStatefulNodes", {}, &handle));
+
+  StepStats stats;
+  StepStatsCollector stats_collector(&stats);
+  FunctionLibraryRuntime::Options opts;
+  opts.stats_collector = &stats_collector;
+  TF_CHECK_OK(Run(flr0_, handle, opts, {x}, {&y}));
+
+  TF_CHECK_OK(InstantiateAndRun(flr0_, "SquareAndAddOneWithStatefulNodes", {},
+                                {x}, {&y}));
+  test::ExpectTensorEqual<int>(y, test::AsTensor<int32>({2, 5, 10, 17}));
+
+  stats_collector.FinalizeAndSwap(&stats);
+
+  // Note that we do not expect the nodes named "x1", "x2", or "x3" to execute.
+  std::set<string> expected_node_names(
+      {"_SOURCE", "shape", "x", "o", "a", "keep_me", "y", "y_RetVal"});
+  std::set<string> executed_node_names;
+  for (const auto& node_stats : stats.dev_stats()[0].node_stats()) {
+    executed_node_names.insert(node_stats.node_name());
+  }
+  EXPECT_EQ(expected_node_names, executed_node_names);
+}
+
 TEST_F(FunctionLibraryRuntimeTest, OptimizeGraph) {
   Init({test::function::XTimesTwo(), test::function::XTimesFour(),
         test::function::XTimes16()});
@@ -498,7 +637,7 @@ TEST_F(FunctionLibraryRuntimeTest, OptimizeGraph) {
     Scope s = Scope::NewRootScope();
     auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
     auto x4_x2_scale = ops::Const<float>(
-        s.WithOpName("x4/x2/scale/_12__cf__2")
+        s.WithOpName("x4/x2/scale/_12__cf__4")
             .WithDevice("/job:localhost/replica:0/task:0/device:CPU:0"),
         2.0f);
     auto x4_x2_y = ops::Mul(s.WithOpName("x4/x2/y"), x, x4_x2_scale);
@@ -694,13 +833,13 @@ TEST_F(FunctionLibraryRuntimeTest, Gradient_XTimesTwo) {
     auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
     auto func0 = ops::_Arg(s.WithOpName("Func/_0"), DT_FLOAT, 1);
     auto scale = ops::Const(
-        s.WithOpName("scale/_5__cf__6")
+        s.WithOpName("scale/_5__cf__8")
             .WithDevice("/job:localhost/replica:0/task:0/device:CPU:0"),
         2.0f);
     auto func1_gx = ops::Mul(s.WithOpName("Func/_1/gx"), func0, scale);
     auto func1_sx = ops::Shape(s.WithOpName("Func/_1/sx"), x);
     auto const0 = ops::Const(
-        s.WithOpName("Func/_1/sy/_6__cf__7")
+        s.WithOpName("Func/_1/sy/_6__cf__9")
             .WithDevice("/job:localhost/replica:0/task:0/device:CPU:0"),
         0, {0});
     auto func1_rx = ops::internal::BroadcastGradientArgs(
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
index 646cd88a3a340a7ce3d85f19cb55fea27d9dc1b2..2f7fbbbec2a285976701b94c426bc3f870c65cf5 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
@@ -15,20 +15,23 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
 
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
 namespace tensorflow {
 
-GPUBFCAllocator::GPUBFCAllocator(int device_id, size_t total_memory)
-    : GPUBFCAllocator(device_id, total_memory, GPUOptions()) {}
+GPUBFCAllocator::GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
+                                 const string& name)
+    : GPUBFCAllocator(cuda_gpu_id, total_memory, GPUOptions(), name) {}
 
-GPUBFCAllocator::GPUBFCAllocator(int device_id, size_t total_memory,
-                                 const GPUOptions& gpu_options)
+GPUBFCAllocator::GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
+                                 const GPUOptions& gpu_options,
+                                 const string& name)
     : BFCAllocator(
           new GPUMemAllocator(
-              GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie()),
-          total_memory, gpu_options.allow_growth(),
-          strings::StrCat("GPU_", device_id, "_bfc")) {}
+              GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie()),
+          total_memory, gpu_options.allow_growth(), name) {}
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
index 2c23340b6d3fb26a38304fd2d0544dcdfcdeeb5e..c2c0b020c7409e7be168d42e83579a2ff3c29a60 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/allocator_retry.h"
 #include "tensorflow/core/common_runtime/bfc_allocator.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
@@ -36,11 +37,12 @@ namespace tensorflow {
 // algorithm.
 class GPUBFCAllocator : public BFCAllocator {
  public:
-  // 'device_id' refers to the StreamExecutor ID of the device within
+  // 'cuda_gpu_id' refers to the ID of the GPU device within
   // the process and must reference a valid ID in the process.
-  GPUBFCAllocator(int device_id, size_t total_memory);
-  GPUBFCAllocator(int device_id, size_t total_memory,
-                  const GPUOptions& gpu_options);
+  GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
+                  const string& name);
+  GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
+                  const GPUOptions& gpu_options, const string& name);
   virtual ~GPUBFCAllocator() {}
 
   TF_DISALLOW_COPY_AND_ASSIGN(GPUBFCAllocator);
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
index 00ef130d34bbbe06ad9dabae124ff3fa0d38450a..9e4b617d2bd5b070f5b8bdeedabb15b94d212743 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <algorithm>
 #include <vector>
 
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -45,7 +46,7 @@ static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
 }
 
 TEST(GPUBFCAllocatorTest, NoDups) {
-  GPUBFCAllocator a(0, 1 << 30);
+  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
   CheckStats(&a, 0, 0, 0, 0);
 
   // Allocate a lot of raw pointers
@@ -74,7 +75,7 @@ TEST(GPUBFCAllocatorTest, NoDups) {
 }
 
 TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
-  GPUBFCAllocator a(0, 1 << 30);
+  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
   // Allocate 256 raw pointers of sizes between 100 bytes and about
   // a meg
   random::PhiloxRandom philox(123, 17);
@@ -132,7 +133,7 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
 }
 
 TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
-  GPUBFCAllocator a(0, 1 << 30);
+  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
   CheckStats(&a, 0, 0, 0, 0);
 
   float* first_ptr = a.Allocate<float>(1024);
@@ -166,18 +167,18 @@ TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
 }
 
 TEST(GPUBFCAllocatorTest, AllocateZeroBufSize) {
-  GPUBFCAllocator a(0, 1 << 30);
+  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
   float* ptr = a.Allocate<float>(0);
   EXPECT_EQ(nullptr, ptr);
 }
 
 TEST(GPUBFCAllocatorTest, TracksSizes) {
-  GPUBFCAllocator a(0, 1 << 30);
+  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
   EXPECT_EQ(true, a.TracksAllocationSizes());
 }
 
 TEST(GPUBFCAllocatorTest, AllocatedVsRequested) {
-  GPUBFCAllocator a(0, 1 << 30);
+  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
   float* t1 = a.Allocate<float>(1);
   EXPECT_EQ(4, a.RequestedSize(t1));
   EXPECT_EQ(256, a.AllocatedSize(t1));
@@ -186,7 +187,7 @@ TEST(GPUBFCAllocatorTest, AllocatedVsRequested) {
 
 TEST(GPUBFCAllocatorTest, TestCustomMemoryLimit) {
   // Configure a 1MiB byte limit
-  GPUBFCAllocator a(0, 1 << 20);
+  GPUBFCAllocator a(CudaGpuId(0), 1 << 20, "GPU_0_bfc");
 
   float* first_ptr = a.Allocate<float>(1 << 6);
   float* second_ptr = a.Allocate<float>(1 << 20);
@@ -201,7 +202,7 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
   options.set_allow_growth(true);
 
   // Max of 2GiB, but starts out small.
-  GPUBFCAllocator a(0, 1LL << 31, options);
+  GPUBFCAllocator a(CudaGpuId(0), 1LL << 31, options, "GPU_0_bfc");
 
   // Allocate 10 raw pointers of sizes between 100 bytes and about
   // 64 megs.
@@ -262,8 +263,8 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
 }
 
 TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
-  GPUBFCAllocator a(0, 1UL << 60);
-  GPUBFCAllocator b(0, 1UL << 60);
+  GPUBFCAllocator a(CudaGpuId(0), 1UL << 60, "GPU_0_bfc");
+  GPUBFCAllocator b(CudaGpuId(0), 1UL << 60, "GPU_0_bfc");
   void* amem = a.AllocateRaw(1, 1);
   void* bmem = b.AllocateRaw(1, 1 << 30);
   a.DeallocateRaw(amem);
@@ -271,7 +272,7 @@ TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
 }
 
 static void BM_Allocation(int iters) {
-  GPUBFCAllocator a(0, 1uLL << 33);
+  GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc");
   // Exercise a few different allocation sizes
   std::vector<size_t> sizes = {256,        4096,      16384,    524288,
                                512,        1048576,   10485760, 104857600,
@@ -287,7 +288,7 @@ static void BM_Allocation(int iters) {
 BENCHMARK(BM_Allocation);
 
 static void BM_AllocationThreaded(int iters, int num_threads) {
-  GPUBFCAllocator a(0, 1uLL << 33);
+  GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc");
   thread::ThreadPool pool(Env::Default(), "test", num_threads);
   std::atomic_int_fast32_t count(iters);
   mutex done_lock;
@@ -323,7 +324,7 @@ BENCHMARK(BM_AllocationThreaded)->Arg(1)->Arg(4)->Arg(16);
 // A more complex benchmark that defers deallocation of an object for
 // "delay" allocations.
 static void BM_AllocationDelayed(int iters, int delay) {
-  GPUBFCAllocator a(0, 1 << 30);
+  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
   // Exercise a few different allocation sizes
   std::vector<int> sizes = {256, 4096, 16384, 4096, 512, 1024, 1024};
   int size_index = 0;
@@ -361,7 +362,7 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
   // only methods inside this class can access private members of BFCAllocator.
 
   void TestBinDebugInfo() {
-    GPUBFCAllocator a(0, 1 << 30);
+    GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
 
     std::vector<void*> initial_ptrs;
     std::vector<size_t> initial_ptrs_allocated_sizes;
@@ -439,7 +440,7 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
   }
 
   void TestLog2FloorNonZeroSlow() {
-    GPUBFCAllocator a(0 /* device_id */, 1 /* total_memory */);
+    GPUBFCAllocator a(CudaGpuId(0), 1 /* total_memory */, "GPU_0_bfc");
     EXPECT_EQ(-1, a.Log2FloorNonZeroSlow(0));
     EXPECT_EQ(0, a.Log2FloorNonZeroSlow(1));
     EXPECT_EQ(1, a.Log2FloorNonZeroSlow(2));
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
index 70c2d96763e72909bd1d58ae637d8393f1368197..7c09451a8afde4216e59ca627a1b405278d1be67 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
@@ -20,6 +20,8 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h"
 
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/platform/stream_executor.h"
 
@@ -28,9 +30,9 @@ namespace gpu = ::perftools::gputools;
 namespace tensorflow {
 
 GPUcudaMallocAllocator::GPUcudaMallocAllocator(VisitableAllocator* allocator,
-                                               int device_id)
+                                               CudaGpuId cuda_gpu_id)
     : base_allocator_(allocator) {
-  stream_exec_ = GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+  stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
 }
 
 GPUcudaMallocAllocator::~GPUcudaMallocAllocator() { delete base_allocator_; }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
index 23552b809a8a735aaeb8ac9643eccd0b0542f03b..208697361d2dfc4f3b8290ea511d15c9bd86857b 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <memory>
 
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor.h"
@@ -30,7 +31,8 @@ namespace tensorflow {
 // allocated memory.
 class GPUcudaMallocAllocator : public VisitableAllocator {
  public:
-  explicit GPUcudaMallocAllocator(VisitableAllocator* allocator, int device_id);
+  explicit GPUcudaMallocAllocator(VisitableAllocator* allocator,
+                                  CudaGpuId cuda_gpu_id);
   ~GPUcudaMallocAllocator() override;
   string Name() override { return "gpu_debug"; }
   void* AllocateRaw(size_t alignment, size_t num_bytes) override;
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
index 6480f0b256b2fe05db5ac5bc8037b4fa216682ac..45e97fdbf043d469966c3206fd0558195dabeae0 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
@@ -16,6 +16,9 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"
 
 #include <vector>
+
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/platform/stream_executor.h"
 
@@ -75,9 +78,9 @@ void InitMask(perftools::gputools::StreamExecutor* exec, void* ptr,
 // GPUDebugAllocator
 // -----------------------------------------------------------------------------
 GPUDebugAllocator::GPUDebugAllocator(VisitableAllocator* allocator,
-                                     int device_id)
+                                     CudaGpuId cuda_gpu_id)
     : base_allocator_(allocator) {
-  stream_exec_ = GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+  stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
 }
 
 GPUDebugAllocator::~GPUDebugAllocator() { delete base_allocator_; }
@@ -154,9 +157,9 @@ bool GPUDebugAllocator::CheckFooter(void* ptr) {
 // GPUNanResetAllocator
 // -----------------------------------------------------------------------------
 GPUNanResetAllocator::GPUNanResetAllocator(VisitableAllocator* allocator,
-                                           int device_id)
+                                           CudaGpuId cuda_gpu_id)
     : base_allocator_(allocator) {
-  stream_exec_ = GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+  stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
 }
 
 GPUNanResetAllocator::~GPUNanResetAllocator() { delete base_allocator_; }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
index 9fbaf64f8a296fe012511cc91d845566f52f13d5..a990f5ce7cf8bb866c04bff3b0ca2311a8b96d7f 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <string>
 #include <unordered_map>
 
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor.h"
@@ -32,7 +33,8 @@ namespace tensorflow {
 // allocated memory.
 class GPUDebugAllocator : public VisitableAllocator {
  public:
-  explicit GPUDebugAllocator(VisitableAllocator* allocator, int device_id);
+  explicit GPUDebugAllocator(VisitableAllocator* allocator,
+                             CudaGpuId cuda_gpu_id);
   ~GPUDebugAllocator() override;
   string Name() override { return "gpu_debug"; }
   void* AllocateRaw(size_t alignment, size_t num_bytes) override;
@@ -62,7 +64,8 @@ class GPUDebugAllocator : public VisitableAllocator {
 // user forgets to initialize the memory.
 class GPUNanResetAllocator : public VisitableAllocator {
  public:
-  explicit GPUNanResetAllocator(VisitableAllocator* allocator, int device_id);
+  explicit GPUNanResetAllocator(VisitableAllocator* allocator,
+                                CudaGpuId cuda_gpu_id);
   ~GPUNanResetAllocator() override;
   string Name() override { return "gpu_nan_reset"; }
   void* AllocateRaw(size_t alignment, size_t num_bytes) override;
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
index 14d8591731f0d544976a661c591920fb937f0cbd..ca4b93815c637613d196cc39546d148d92bd7216 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
@@ -21,6 +21,8 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/platform/logging.h"
@@ -33,10 +35,10 @@ namespace gpu = ::perftools::gputools;
 namespace tensorflow {
 
 TEST(GPUDebugAllocatorTest, OverwriteDetection_None) {
-  const int device_id = 0;
-  GPUDebugAllocator a(new GPUBFCAllocator(device_id, 1 << 30), device_id);
-  auto stream_exec =
-      GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+  const CudaGpuId cuda_gpu_id(0);
+  GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+                      cuda_gpu_id);
+  auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
 
   for (int s : {8}) {
     std::vector<int64> cpu_array(s);
@@ -57,11 +59,11 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Header) {
   for (int s : {8, 211}) {
     EXPECT_DEATH(
         {
-          const int device_id = 0;
-          GPUDebugAllocator a(new GPUBFCAllocator(device_id, 1 << 30),
-                              device_id);
+          const CudaGpuId cuda_gpu_id(0);
+          GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+                              cuda_gpu_id);
           auto stream_exec =
-              GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+              GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
 
           std::vector<int64> cpu_array(s);
           memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
@@ -90,11 +92,11 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
   for (int s : {8, 22}) {
     EXPECT_DEATH(
         {
-          const int device_id = 0;
-          GPUDebugAllocator a(new GPUBFCAllocator(device_id, 1 << 30),
-                              device_id);
+          const CudaGpuId cuda_gpu_id(0);
+          GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+                              cuda_gpu_id);
           auto stream_exec =
-              GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+              GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
 
           std::vector<int64> cpu_array(s);
           memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
@@ -120,10 +122,10 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
 }
 
 TEST(GPUDebugAllocatorTest, ResetToNan) {
-  const int device_id = 0;
-  GPUNanResetAllocator a(new GPUBFCAllocator(device_id, 1 << 30), device_id);
-  auto stream_exec =
-      GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+  const CudaGpuId cuda_gpu_id(0);
+  GPUNanResetAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+                         cuda_gpu_id);
+  auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
 
   std::vector<float> cpu_array(1024);
   std::vector<float> cpu_array_result(1024);
@@ -160,13 +162,13 @@ TEST(GPUDebugAllocatorTest, ResetToNan) {
 }
 
 TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
-  const int device_id = 0;
+  const CudaGpuId cuda_gpu_id(0);
   // NaN reset must be the outer-most allocator.
   GPUNanResetAllocator a(
-      new GPUDebugAllocator(new GPUBFCAllocator(device_id, 1 << 30), device_id),
-      device_id);
-  auto stream_exec =
-      GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+      new GPUDebugAllocator(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+                            cuda_gpu_id),
+      cuda_gpu_id);
+  auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
 
   std::vector<float> cpu_array(1024);
   std::vector<float> cpu_array_result(1024);
@@ -203,13 +205,18 @@ TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
 }
 
 TEST(GPUDebugAllocatorTest, TracksSizes) {
-  GPUDebugAllocator a(new GPUBFCAllocator(0, 1 << 30), 0);
+  const CudaGpuId cuda_gpu_id(0);
+  GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+                      cuda_gpu_id);
   EXPECT_EQ(true, a.TracksAllocationSizes());
 }
 
 TEST(GPUDebugAllocatorTest, AllocatedVsRequested) {
+  const CudaGpuId cuda_gpu_id(0);
   GPUNanResetAllocator a(
-      new GPUDebugAllocator(new GPUBFCAllocator(0, 1 << 30), 0), 0);
+      new GPUDebugAllocator(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+                            cuda_gpu_id),
+      cuda_gpu_id);
   float* t1 = a.Allocate<float>(1);
   EXPECT_EQ(4, a.RequestedSize(t1));
   EXPECT_EQ(256, a.AllocatedSize(t1));
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 5a7d96445e0ca0db7a90dec004adeafe69600279..1390810c288b4834c3c13e458104e2acb5215e67 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -32,6 +32,8 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_stream_util.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
@@ -60,6 +62,7 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/env_var.h"
 #include "tensorflow/core/util/stream_executor_util.h"
 
 namespace tensorflow {
@@ -84,7 +87,8 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
   }
   ~EigenCudaStreamDevice() override {}
   void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
-                    int gpu_id, ::tensorflow::Allocator* alloc, char* scratch) {
+                    TfGpuId tf_gpu_id, ::tensorflow::Allocator* alloc,
+                    char* scratch) {
     if (LogMemory::IsEnabled()) {
       operation_ = context->op_kernel().name() + "/EigenAllocator";
       step_id_ = context->step_id();
@@ -95,7 +99,8 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
         reinterpret_cast<unsigned int*>(scratch + Eigen::kCudaScratchSize);
     stream_ = cuda_stream;
     allocator_ = alloc;
-    device_prop_ = &Eigen::m_deviceProperties[gpu_id];
+    const int cuda_gpu_id = GpuIdUtil::TfToCudaGpuId(tf_gpu_id).value();
+    device_prop_ = &Eigen::m_deviceProperties[cuda_gpu_id];
   }
 
   const cudaStream_t& stream() const override { return *stream_; }
@@ -185,13 +190,15 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
 class BaseGPUDevice::StreamGroupFactory {
  public:
   // Returns the unique stream group for use with the stream defined by
-  // {gpu_id, stream_group_within_gpu}, creating it if it does not yet exist.
+  // {tf_gpu_id, stream_group_within_gpu}, creating it if it does not yet
+  // exist.
   // This function is thread safe.
-  BaseGPUDevice::StreamGroup* GetOrCreate(int gpu_id,
+  BaseGPUDevice::StreamGroup* GetOrCreate(TfGpuId tf_gpu_id,
                                           int stream_group_within_gpu,
                                           gpu::StreamExecutor* executor) {
     mutex_lock guard(lock_);
-    StreamGroup* group = &streams_[key_type(gpu_id, stream_group_within_gpu)];
+    StreamGroup* group =
+        &streams_[key_type(tf_gpu_id.value(), stream_group_within_gpu)];
     if (!group->compute) {
       group->compute = new gpu::Stream(executor);
       group->compute->Init();
@@ -236,7 +243,8 @@ class BaseGPUDevice::StreamGroupFactory {
 
 BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
                              Bytes memory_limit, const DeviceLocality& locality,
-                             int gpu_id, const string& physical_device_desc,
+                             TfGpuId tf_gpu_id,
+                             const string& physical_device_desc,
                              Allocator* gpu_allocator, Allocator* cpu_allocator,
                              bool sync_every_op, int32 max_streams)
     : LocalDevice(options, Device::BuildDeviceAttributes(name, DEVICE_GPU,
@@ -244,7 +252,7 @@ BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
                                                          physical_device_desc)),
       gpu_allocator_(gpu_allocator),
       cpu_allocator_(cpu_allocator),
-      gpu_id_(gpu_id),
+      tf_gpu_id_(tf_gpu_id),
       sync_every_op_(sync_every_op),
       max_streams_(max_streams) {
   ProcessState::singleton()->EnableGPUDevice();
@@ -256,10 +264,10 @@ BaseGPUDevice::~BaseGPUDevice() {
 }
 
 Status BaseGPUDevice::Init(const SessionOptions& options) {
-  auto executor_status = GPUMachineManager()->ExecutorForDevice(gpu_id_);
+  auto executor_status = GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id_);
   if (!executor_status.status().ok()) {
     return errors::Internal("Failed to get StreamExecutor for device ",
-                            gpu_id_);
+                            tf_gpu_id_.value());
   }
 
   executor_ = executor_status.ValueOrDie();
@@ -272,14 +280,14 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
   // Create the specified number of GPU streams
   for (int i = 0; i < max_streams_; i++) {
     streams_.push_back(
-        StreamGroupFactory::Global().GetOrCreate(gpu_id_, i, executor_));
+        StreamGroupFactory::Global().GetOrCreate(tf_gpu_id_, i, executor_));
 
     size_t scratch_buffer_size = Eigen::kCudaScratchSize + sizeof(unsigned int);
     void* scratch_buffer = gpu_allocator_->AllocateRaw(
         Allocator::kAllocatorAlignment, scratch_buffer_size);
     if (scratch_buffer == nullptr) {
       return errors::FailedPrecondition(
-          "Failed to allocate scratch buffer for device ", gpu_id_);
+          "Failed to allocate scratch buffer for device ", tf_gpu_id_.value());
     }
     scratch_.push_back(static_cast<char*>(scratch_buffer));
 
@@ -291,7 +299,8 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
         &mem, Eigen::kCudaScratchSize + sizeof(unsigned int));
     if (!ok) {
       return errors::FailedPrecondition(
-          "Failed to memcopy into scratch buffer for device ", gpu_id_);
+          "Failed to memcopy into scratch buffer for device ",
+          tf_gpu_id_.value());
     }
 
     device_contexts_.push_back(new GPUDeviceContext(
@@ -302,9 +311,49 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
   gpu_device_info_->stream = streams_[0]->compute;
   gpu_device_info_->default_context = device_contexts_[0];
   gpu_device_info_->event_mgr = em_.get();
-  gpu_device_info_->gpu_id = gpu_id_;
+  gpu_device_info_->gpu_id = GpuIdUtil::TfToCudaGpuId(tf_gpu_id_).value();
   set_tensorflow_gpu_device_info(gpu_device_info_);
 
+  // Whether and how the GPU device uses its own threadpool.
+  // This option is experimental. Once we confirm the best setting, we
+  // may change the default behavior and completely remove this flag.
+  // Default values might change in future releases.
+  // Possible values:
+  //   * global: GPU uses threads shared with CPU in the main compute
+  //          thread-pool. This is currently the default.
+  //   * gpu_private: GPU uses threads dedicated to this device.
+  //   * gpu_shared: All GPUs share a dedicated thread pool.
+  string gpu_thread_mode;
+  TF_RETURN_IF_ERROR(
+      ReadStringFromEnvVar("TF_GPU_THREAD_MODE", "global", &gpu_thread_mode));
+  gpu_thread_mode = str_util::Lowercase(gpu_thread_mode);
+  if (gpu_thread_mode != "global") {
+    int64 gpu_thread_count = -1;
+    // Default to two threads. One for device compute and another for memory
+    // copies.
+    TF_RETURN_IF_ERROR(
+        ReadInt64FromEnvVar("TF_GPU_THREAD_COUNT", 2, &gpu_thread_count));
+    if (gpu_thread_mode == "gpu_private") {
+      // TODO(zhengxq): since these threads only serve a single GPU device,
+      //   we should set the device context once for each thread, and avoid
+      //   setting them for each kernel.
+      // TODO(zhengxq): pin the thread to the same socket of the target GPU.
+      thread_pool_.reset(new thread::ThreadPool(
+          options.env, strings::StrCat("gpu_private_", tf_gpu_id_.value()),
+          static_cast<int32>(gpu_thread_count)));
+      set_tensorflow_device_thread_pool(thread_pool_.get());
+    } else if (gpu_thread_mode == "gpu_shared") {
+      static thread::ThreadPool* thread_pool = new thread::ThreadPool(
+          options.env, "gpu_shared", static_cast<int32>(gpu_thread_count));
+      set_tensorflow_device_thread_pool(thread_pool);
+    } else {
+      string error_message =
+          strings::StrCat("Invalid gpu_thread_mode: ", gpu_thread_mode);
+      LOG(WARNING) << error_message;
+      return errors::InvalidArgument(error_message);
+    }
+  }
+
   return Status::OK();
 }
 
@@ -394,7 +443,7 @@ void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel,
 
   if (vlog_1) {
     VLOG(1) << "GpuDevice::Compute " << op_kernel->name() << " op "
-            << op_kernel->type_string() << " on GPU" << gpu_id_ << " stream["
+            << op_kernel->type_string() << " on GPU" << tf_gpu_id_ << " stream["
             << stream_id << "]";
   }
 
@@ -469,7 +518,7 @@ void BaseGPUDevice::ComputeAsync(AsyncOpKernel* op_kernel,
   const auto stream_id = gpu_device_context->stream_id();
 
   VLOG(1) << "GpuDevice::ComputeAsync " << op_kernel->name() << " op "
-          << op_kernel->type_string() << " on GPU" << gpu_id_ << " stream["
+          << op_kernel->type_string() << " on GPU" << tf_gpu_id_ << " stream["
           << stream_id << "]";
 
   // When TraceMe profiling is off (which is the default), the
@@ -561,10 +610,8 @@ Status BaseGPUDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
     };
     Status s;
     for (int64 ix = 0; ix < parsed.NumElements(); ++ix) {
-      s = VariantDeviceCopy(
-          VariantDeviceCopyDirection::HOST_TO_DEVICE, from[ix],
-          &copy_variant[ix],
-          parsed.NumElements() == 1 ? std::move(copier) : copier);
+      s = VariantDeviceCopy(VariantDeviceCopyDirection::HOST_TO_DEVICE,
+                            from[ix], &copy_variant[ix], copier);
       if (!s.ok()) {
         break;
       }
@@ -596,8 +643,9 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice {
   ConcretePerOpGpuDevice() : device_(&stream_device_) {}
 
   void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
-                    int gpu_id, Allocator* base_allocator, char* scratch) {
-    stream_device_.Reinitialize(context, cuda_stream, gpu_id, base_allocator,
+                    TfGpuId tf_gpu_id, Allocator* base_allocator,
+                    char* scratch) {
+    stream_device_.Reinitialize(context, cuda_stream, tf_gpu_id, base_allocator,
                                 scratch);
   }
 
@@ -607,6 +655,146 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice {
   EigenCudaStreamDevice stream_device_;
   Eigen::GpuDevice device_;
 };
+
+// Parse 'visible_device_list' into a list of CUDA GPU ids.
+Status ParseVisibleDeviceList(const string& visible_device_list,
+                              std::vector<CudaGpuId>* visible_gpu_order) {
+  visible_gpu_order->clear();
+  gpu::Platform* gpu_manager = GPUMachineManager();
+
+  // If the user wants to remap the visible to virtual GPU mapping,
+  // check for that here.
+  if (visible_device_list.empty()) {
+    visible_gpu_order->resize(gpu_manager->VisibleDeviceCount());
+    // By default, visible to virtual mapping is unchanged.
+    int deviceNo = 0;
+    std::generate(visible_gpu_order->begin(), visible_gpu_order->end(),
+                  [&deviceNo] { return deviceNo++; });
+  } else {
+    const std::vector<string> order_str =
+        str_util::Split(visible_device_list, ',');
+    for (const string& cuda_gpu_id_str : order_str) {
+      int32 cuda_gpu_id;
+      if (!strings::safe_strto32(cuda_gpu_id_str, &cuda_gpu_id)) {
+        return errors::InvalidArgument(
+            "Could not parse entry in 'visible_device_list': '",
+            cuda_gpu_id_str, "'. visible_device_list = ", visible_device_list);
+      }
+      if (cuda_gpu_id < 0 || cuda_gpu_id >= gpu_manager->VisibleDeviceCount()) {
+        return errors::InvalidArgument(
+            "'visible_device_list' listed an invalid GPU id '", cuda_gpu_id,
+            "' but visible device count is ",
+            gpu_manager->VisibleDeviceCount());
+      }
+      visible_gpu_order->push_back(CudaGpuId(cuda_gpu_id));
+    }
+  }
+
+  // Validate no repeats.
+  std::set<CudaGpuId> visible_device_set(visible_gpu_order->begin(),
+                                         visible_gpu_order->end());
+  if (visible_device_set.size() != visible_gpu_order->size()) {
+    return errors::InvalidArgument(
+        "visible_device_list contained a duplicate entry: ",
+        visible_device_list);
+  }
+  return Status::OK();
+}
+
+Status VerifyVirtualDeviceSettings(
+    const size_t num_gpus_to_use, const GPUOptions& gpu_options,
+    const std::vector<CudaGpuId>& visible_gpu_order,
+    const std::vector<CudaGpuId>& valid_cuda_gpu_ids) {
+  const auto& virtual_devices = gpu_options.experimental().virtual_devices();
+  CHECK(!virtual_devices.empty());
+  if (gpu_options.per_process_gpu_memory_fraction() > 0) {
+    return errors::InvalidArgument(
+        "It's invalid to set per_process_gpu_memory_fraction when "
+        "virtual_devices is set.");
+  }
+  if (num_gpus_to_use < virtual_devices.size()) {
+    return errors::Unknown(
+        "Not enough GPUs to create virtual devices."
+        " num_gpus_to_use: ",
+        num_gpus_to_use, " #virtual_devices: ", virtual_devices.size());
+  }
+  if (!gpu_options.visible_device_list().empty() &&
+      visible_gpu_order.size() != virtual_devices.size()) {
+    return errors::InvalidArgument(
+        "The number of GPUs in visible_device_list doesn't match the number "
+        "of elements in the virtual_devices list.",
+        " #GPUs in visible_device_list: ", visible_gpu_order.size(),
+        " virtual_devices.size(): ", virtual_devices.size());
+  }
+  if (valid_cuda_gpu_ids.size() != virtual_devices.size()) {
+    return errors::Unknown(
+        "The number of valid GPUs doesn't match the number of elements in "
+        "the virtual_devices list.",
+        " #valid GPUs: ", valid_cuda_gpu_ids.size(),
+        " virtual_devices.size(): ", virtual_devices.size());
+  }
+  return Status::OK();
+}
+
+int64 MinSystemMemory(int64 available_memory) {
+  // We use the following heuristic for now:
+  //
+  // If the available_memory is < 2GiB, we allocate 225MiB to system memory.
+  // Otherwise, allocate max(300MiB, 0.05 * available_memory) to system memory.
+  //
+  // In the future we could be more sophisticated by using a table of devices.
+  int64 min_system_memory;
+  if (available_memory < (1LL << 31)) {
+    // 225MiB
+    min_system_memory = 225 * 1024 * 1024;
+  } else {
+    // max(300 MiB, 0.05 * available_memory)
+    min_system_memory =
+        std::max(314572800LL, static_cast<int64>(available_memory * 0.05));
+  }
+#if defined(__GNUC__) && defined(__OPTIMIZE__)
+// Do nothing
+#elif !defined(__GNUC__) && defined(NDEBUG)
+// Do nothing
+#else
+  // Double the amount of available GPU memory in non-opt builds (debug
+  // builds in windows); because in non-opt builds more system memory
+  // is necessary.
+  min_system_memory *= 2;
+#endif
+  return min_system_memory;
+}
+
+// Get the memory limit for the virtual device being created on GPU with
+// 'cuda_gpu_id', when that virtual device is the only virtual device being
+// created on that GPU.
+Status SingleVirtualDeviceMemoryLimit(const GPUOptions& gpu_options,
+                                      CudaGpuId cuda_gpu_id,
+                                      int64* memory_limit) {
+  int64 total_memory = 0;
+  int64 available_memory = 0;
+  gpu::StreamExecutor* se =
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
+  if (!se->DeviceMemoryUsage(&available_memory, &total_memory)) {
+    return errors::Unknown("Failed to query available memory for GPU ",
+                           cuda_gpu_id.value());
+  }
+
+  int64 allocated_memory = 0;
+  const double per_process_gpu_memory_fraction =
+      gpu_options.per_process_gpu_memory_fraction();
+  if (per_process_gpu_memory_fraction == 0) {
+    allocated_memory = available_memory;
+    const int64 min_system_memory = MinSystemMemory(available_memory);
+    if (min_system_memory < allocated_memory) {
+      allocated_memory -= min_system_memory;
+    }
+  } else {
+    allocated_memory = total_memory * per_process_gpu_memory_fraction;
+  }
+  *memory_limit = allocated_memory;
+  return Status::OK();
+}
 }  // namespace
 
 void BaseGPUDevice::ReinitializeDevice(OpKernelContext* context,
@@ -617,7 +805,7 @@ void BaseGPUDevice::ReinitializeDevice(OpKernelContext* context,
   DCHECK(concrete_device);
   const cudaStream_t* cuda_stream = reinterpret_cast<const cudaStream_t*>(
       streams_[stream_id]->compute->implementation()->CudaStreamMemberHack());
-  concrete_device->Reinitialize(context, cuda_stream, gpu_id_, allocator,
+  concrete_device->Reinitialize(context, cuda_stream, tf_gpu_id_, allocator,
                                 scratch_[stream_id]);
 }
 
@@ -644,18 +832,32 @@ void BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
 Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
                                            const string& name_prefix,
                                            std::vector<Device*>* devices) {
-  size_t n = INT_MAX;
+  TF_RETURN_IF_ERROR(ValidateGPUMachineManager());
+  gpu::Platform* gpu_manager = GPUMachineManager();
+  if (gpu_manager == nullptr) {
+    return Status::OK();
+  }
+  // If there are no GPUs visible, do nothing.
+  if (gpu_manager->VisibleDeviceCount() <= 0) {
+    return Status::OK();
+  }
+
+  size_t num_gpus_to_use = INT_MAX;
   auto iter = options.config.device_count().find("GPU");
   if (iter != options.config.device_count().end()) {
-    n = iter->second;
+    num_gpus_to_use = iter->second;
   }
-  std::vector<int> valid_gpu_ids;
-  TF_RETURN_IF_ERROR(GetValidDeviceIds(
-      options.config.gpu_options().visible_device_list(), &valid_gpu_ids));
-  if (static_cast<size_t>(n) > valid_gpu_ids.size()) {
-    n = valid_gpu_ids.size();
+  const auto& gpu_options = options.config.gpu_options();
+  std::vector<CudaGpuId> visible_gpu_order;
+  TF_RETURN_IF_ERROR(ParseVisibleDeviceList(gpu_options.visible_device_list(),
+                                            &visible_gpu_order));
+
+  std::vector<CudaGpuId> valid_cuda_gpu_ids;
+  TF_RETURN_IF_ERROR(GetValidDeviceIds(visible_gpu_order, &valid_cuda_gpu_ids));
+  if (num_gpus_to_use > valid_cuda_gpu_ids.size()) {
+    num_gpus_to_use = valid_cuda_gpu_ids.size();
   }
-  if (!valid_gpu_ids.empty()) {
+  if (!valid_cuda_gpu_ids.empty()) {
     // Save the original device.
     int original_device = 0;
     cudaError_t err = cudaGetDevice(&original_device);
@@ -665,16 +867,16 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
     }
     // Force to implicitly initialize CUDA runtime on each valid GPU before
     // CreateGPUDevice().
-    for (int gpu_id : valid_gpu_ids) {
-      err = cudaSetDevice(gpu_id);
+    for (CudaGpuId cuda_gpu_id : valid_cuda_gpu_ids) {
+      err = cudaSetDevice(cuda_gpu_id.value());
       if (err != cudaSuccess) {
-        return errors::Internal("cudaSetDevice() on GPU:", gpu_id,
+        return errors::Internal("cudaSetDevice() on GPU:", cuda_gpu_id.value(),
                                 " failed. Status: ", cudaGetErrorString(err));
       }
       err = cudaFree(nullptr);
       if (err != cudaSuccess) {
         return errors::Internal(
-            "CUDA runtime implicit initialization on GPU:", gpu_id,
+            "CUDA runtime implicit initialization on GPU:", cuda_gpu_id.value(),
             " failed. Status: ", cudaGetErrorString(err));
       }
     }
@@ -685,51 +887,45 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
                               " failed. Status: ", cudaGetErrorString(err));
     }
   }
-  for (int i = 0; i < n; i++) {
-    BaseGPUDevice* gpu_device;
-    TF_RETURN_IF_ERROR(CreateGPUDevice(
-        options, strings::StrCat(name_prefix, "/device:GPU:", i),
-        valid_gpu_ids[i], &gpu_device));
-    TF_RETURN_IF_ERROR(gpu_device->Init(options));
-    devices->push_back(gpu_device);
-  }
-
-  return Status::OK();
-}
 
-namespace {
-int64 MinSystemMemory(int64 available_memory) {
-  // We use the following heuristic for now:
-  //
-  // If the available_memory is < 2GiB, we allocate 225MiB to system memory.
-  // Otherwise, allocate max(300MiB, 0.05 * available_memory) to system memory.
-  //
-  // In the future we could be more sophisticated by using a table of devices.
-  int64 min_system_memory;
-  if (available_memory < (1LL << 31)) {
-    // 225MiB
-    min_system_memory = 225 * 1024 * 1024;
-  } else {
-    // max(300 MiB, 0.05 * available_memory)
-    min_system_memory =
-        std::max(314572800LL, static_cast<int64>(available_memory * 0.05));
+  const auto& virtual_devices = gpu_options.experimental().virtual_devices();
+  if (!virtual_devices.empty()) {
+    TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings(
+        num_gpus_to_use, gpu_options, visible_gpu_order, valid_cuda_gpu_ids));
+    // We've verified that num_gpus_to_use >= virtual_devices.size().
+    num_gpus_to_use = virtual_devices.size();
+    CHECK(gpu_options.visible_device_list().empty() ||
+          valid_cuda_gpu_ids == visible_gpu_order);
   }
-#if defined(__GNUC__) && defined(__OPTIMIZE__)
-// Do nothing
-#elif !defined(__GNUC__) && defined(NDEBUG)
-// Do nothing
-#else
-  // Double the amount of available GPU memory in non-opt builds (debug
-  // builds in windows); because in non-opt builds more system memory
-  // is necessary.
-  min_system_memory *= 2;
-#endif
-  return min_system_memory;
+  int next_tf_gpu_id = 0;
+  for (int i = 0; i < num_gpus_to_use; ++i) {
+    const CudaGpuId cuda_gpu_id = valid_cuda_gpu_ids[i];
+    std::vector<int64> memory_limit_bytes;
+    if (virtual_devices.empty() ||
+        virtual_devices.Get(i).memory_limit_mb_size() == 0) {
+      int64 single_virtual_device_memory_limit = 0;
+      TF_RETURN_IF_ERROR(SingleVirtualDeviceMemoryLimit(
+          gpu_options, cuda_gpu_id, &single_virtual_device_memory_limit));
+      memory_limit_bytes.push_back(single_virtual_device_memory_limit);
+    } else {
+      const auto& memory_limit_mb = virtual_devices.Get(i).memory_limit_mb();
+      std::transform(memory_limit_mb.begin(), memory_limit_mb.end(),
+                     std::back_inserter(memory_limit_bytes), [](float mb) {
+                       return static_cast<int64>(mb) * (1ll << 20);
+                     });
+    }
+    for (int64 bytes : memory_limit_bytes) {
+      TfGpuId tf_gpu_id(next_tf_gpu_id);
+      ++next_tf_gpu_id;
+      GpuIdUtil::InsertTfCudaGpuIdPair(tf_gpu_id, cuda_gpu_id);
+      TF_RETURN_IF_ERROR(
+          CreateGPUDevice(options, name_prefix, tf_gpu_id, bytes, devices));
+    }
+  }
+  return Status::OK();
 }
 
-}  // namespace
-
-static string GetShortDeviceDescription(int device_id,
+static string GetShortDeviceDescription(CudaGpuId cuda_gpu_id,
                                         const gpu::DeviceDescription& desc) {
   int cc_major;
   int cc_minor;
@@ -738,22 +934,26 @@ static string GetShortDeviceDescription(int device_id,
     cc_minor = 0;
   }
   // LINT.IfChange
-  return strings::StrCat("device: ", device_id, ", name: ", desc.name(),
+  return strings::StrCat("device: ", cuda_gpu_id.value(),
+                         ", name: ", desc.name(),
                          ", pci bus id: ", desc.pci_bus_id(),
                          ", compute capability: ", cc_major, ".", cc_minor);
   // LINT.ThenChange(//tensorflow/python/platform/test.py)
 }
 
 Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
-                                             const string& name, int gpu_id,
-                                             BaseGPUDevice** out_device) {
-  CHECK_GE(gpu_id, 0);
+                                             const string& name_prefix,
+                                             TfGpuId tf_gpu_id,
+                                             int64 memory_limit,
+                                             std::vector<Device*>* devices) {
+  CHECK_GE(tf_gpu_id.value(), 0);
+  const string device_name =
+      strings::StrCat(name_prefix, "/device:GPU:", tf_gpu_id.value());
 
   // Look up the device, to see its attributes.
-  gpu::Platform* gpu_platform = GPUMachineManager();
-  CHECK_LT(gpu_id, gpu_platform->VisibleDeviceCount());
+  GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
   gpu::StreamExecutor* se =
-      gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
+      GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id).ValueOrDie();
   const gpu::DeviceDescription& desc = se->GetDeviceDescription();
   int numa_node = desc.numa_node();
   if (numa_node < 0) {
@@ -763,60 +963,49 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
     // may run into trouble later with data transfer operations.  The
     // trouble may manifest as slower than expected performance, or
     // outright failures.
-    LOG(INFO) << "Could not identify NUMA node of " << name
+    LOG(INFO) << "Could not identify NUMA node of " << device_name
               << ", defaulting to 0.  Your kernel may not have been built "
               << "with NUMA support.";
     numa_node = 0;
   }
-
-  int64 total_memory, available_memory;
-  if (!se->DeviceMemoryUsage(&available_memory, &total_memory)) {
-    return errors::Unknown(
-        strings::StrCat("Failed to query available memory for GPU ", gpu_id));
-  }
-
-  int64 allocated_memory;
-  double config_memory_fraction =
-      options.config.gpu_options().per_process_gpu_memory_fraction();
-  if (config_memory_fraction == 0) {
-    allocated_memory = available_memory;
-    const int64 min_system_memory = MinSystemMemory(available_memory);
-    if (min_system_memory < allocated_memory) {
-      allocated_memory -= min_system_memory;
-    }
-  } else {
-    allocated_memory = total_memory * config_memory_fraction;
-  }
-
-  Bytes allocated_bytes = static_cast<Bytes>(allocated_memory);
+  Bytes allocated_bytes = static_cast<Bytes>(memory_limit);
 
   // Get GPU bus_id from its reported NUMA affinity.  Because GPUs are
   // virtualized in some environments, we can't just use the GPU id.
   // NUMA locales are indexed from 0, buses are indexed from 1.
   DeviceLocality dev_locality;
   dev_locality.set_bus_id(numa_node + 1);
-  VLOG(1) << "GPUDevice id " << gpu_id << " on bus " << dev_locality.bus_id()
-          << " numa: " << numa_node << " pci: " << desc.pci_bus_id();
-
+  const CudaGpuId cuda_gpu_id = GpuIdUtil::TfToCudaGpuId(tf_gpu_id);
+  VLOG(1) << "GPUDevice id " << cuda_gpu_id << " on bus "
+          << dev_locality.bus_id() << " numa: " << numa_node
+          << " pci: " << desc.pci_bus_id();
+
+  LOG(INFO) << "Creating TensorFlow device (" << device_name << " with "
+            << (memory_limit >> 20) << " MB memory) -> physical GPU ("
+            << GetShortDeviceDescription(cuda_gpu_id, desc) << ")";
   ProcessState* process_state = ProcessState::singleton();
-  *out_device = CreateGPUDevice(
-      options, name, allocated_bytes, dev_locality, gpu_id,
-      GetShortDeviceDescription(gpu_id, desc),
-      process_state->GetGPUAllocator(options.config.gpu_options(), gpu_id,
-                                     allocated_memory),
+  BaseGPUDevice* gpu_device = CreateGPUDevice(
+      options, device_name, allocated_bytes, dev_locality, tf_gpu_id,
+      GetShortDeviceDescription(cuda_gpu_id, desc),
+      process_state->GetGPUAllocator(options.config.gpu_options(), tf_gpu_id,
+                                     memory_limit),
       process_state->GetCPUAllocator(numa_node));
+  TF_RETURN_IF_ERROR(gpu_device->Init(options));
+  devices->push_back(gpu_device);
 
   return Status::OK();
 }
 
 static int GetDefaultMinGPUMultiprocessorCount(
-    gpu::Platform* gpu_manager, const std::vector<int>& visible_gpu_order) {
+    gpu::Platform* gpu_manager,
+    const std::vector<CudaGpuId>& visible_gpu_order) {
   static const int kDefaultMinGPUMultiprocessorCount = 8;
 
   // Find the highest multi-processor count across all visible GPUs.
   int max_count = -1;
   for (int i = 0; i < visible_gpu_order.size(); ++i) {
-    auto exec_status = gpu_manager->ExecutorForDevice(visible_gpu_order[i]);
+    auto exec_status =
+        GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, visible_gpu_order[i]);
     if (!exec_status.ok()) {
       continue;
     }
@@ -834,7 +1023,8 @@ static int GetDefaultMinGPUMultiprocessorCount(
 }
 
 static int GetMinGPUMultiprocessorCount(
-    gpu::Platform* gpu_manager, const std::vector<int>& visible_gpu_order) {
+    gpu::Platform* gpu_manager,
+    const std::vector<CudaGpuId>& visible_gpu_order) {
   const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT");
 
   if (tf_min_gpu_core_count == nullptr ||
@@ -912,17 +1102,17 @@ std::vector<CudaVersion> GetSupportedCudaComputeCapabilities() {
 }
 
 std::unique_ptr<std::map<std::pair<int, int>, bool>> GetPeerAccessMap(
-    gpu::Platform* platform, const std::vector<int>& visible_gpu_order) {
+    gpu::Platform* platform, const std::vector<CudaGpuId>& visible_gpu_order) {
   std::unique_ptr<std::map<std::pair<int, int>, bool>> map(
       new std::map<std::pair<int, int>, bool>);
   for (int i = 0; i < visible_gpu_order.size(); ++i) {
-    const int i_gpu_id = visible_gpu_order[i];
+    const CudaGpuId i_gpu_id = visible_gpu_order[i];
     for (int j = 0; j < visible_gpu_order.size(); ++j) {
-      const int j_gpu_id = visible_gpu_order[j];
+      const CudaGpuId j_gpu_id = visible_gpu_order[j];
       gpu::StreamExecutor* from =
-          platform->ExecutorForDevice(i_gpu_id).ValueOrDie();
+          GpuIdUtil::ExecutorForCudaGpuId(platform, i_gpu_id).ValueOrDie();
       gpu::StreamExecutor* to =
-          platform->ExecutorForDevice(j_gpu_id).ValueOrDie();
+          GpuIdUtil::ExecutorForCudaGpuId(platform, j_gpu_id).ValueOrDie();
       (*map)[{i, j}] = from->CanEnablePeerAccessTo(to);
     }
   }
@@ -931,19 +1121,18 @@ std::unique_ptr<std::map<std::pair<int, int>, bool>> GetPeerAccessMap(
 }
 
 Status EnablePeerAccess(gpu::Platform* platform,
-                        const std::vector<int>& visible_gpu_order) {
+                        const std::vector<CudaGpuId>& visible_gpu_order) {
   int possible_peer_count = 0;
   int enabled_peer_count = 0;
   for (int i = 0; i < visible_gpu_order.size(); ++i) {
-    const int i_gpu_id = visible_gpu_order[i];
+    const CudaGpuId i_gpu_id = visible_gpu_order[i];
     for (int j = 0; j < visible_gpu_order.size(); ++j) {
-      const int j_gpu_id = visible_gpu_order[j];
-      // We have already validated that ExecutorForDevice() calls
-      // return OK.
+      const CudaGpuId j_gpu_id = visible_gpu_order[j];
+      // We have already validated that ExecutorForDevice() calls return OK.
       gpu::StreamExecutor* from =
-          platform->ExecutorForDevice(i_gpu_id).ValueOrDie();
+          GpuIdUtil::ExecutorForCudaGpuId(platform, i_gpu_id).ValueOrDie();
       gpu::StreamExecutor* to =
-          platform->ExecutorForDevice(j_gpu_id).ValueOrDie();
+          GpuIdUtil::ExecutorForCudaGpuId(platform, j_gpu_id).ValueOrDie();
 
       if (from->CanEnablePeerAccessTo(to)) {
         ++possible_peer_count;
@@ -951,7 +1140,7 @@ Status EnablePeerAccess(gpu::Platform* platform,
         if (!status.ok()) {
           LOG(WARNING)
               << "Unable to enable peer access between device ordinals "
-              << i_gpu_id << " and " << j_gpu_id;
+              << i_gpu_id << " and " << j_gpu_id << ", status: " << status;
         } else {
           ++enabled_peer_count;
         }
@@ -974,73 +1163,22 @@ Status EnablePeerAccess(gpu::Platform* platform,
 }  // namespace
 
 Status BaseGPUDeviceFactory::GetValidDeviceIds(
-    const string& visible_device_list, std::vector<int>* ids) {
-  TF_RETURN_IF_ERROR(ValidateGPUMachineManager());
-
+    const std::vector<CudaGpuId>& visible_gpu_order,
+    std::vector<CudaGpuId>* ids) {
   gpu::Platform* gpu_manager = GPUMachineManager();
-  if (gpu_manager == nullptr) {
-    return Status::OK();
-  }
-
-  // If there are no GPUs visible, do nothing.
-  if (gpu_manager->VisibleDeviceCount() <= 0) {
-    return Status::OK();
-  }
-
-  // If the user wants to remap the visible to virtual GPU mapping,
-  // check for that here.
-  std::vector<int> visible_gpu_order;
-  if (visible_device_list.empty()) {
-    visible_gpu_order.resize(gpu_manager->VisibleDeviceCount());
-    // By default, visible to virtual mapping is unchanged.
-    int deviceNo = 0;
-    std::generate(visible_gpu_order.begin(), visible_gpu_order.end(),
-                  [&deviceNo] { return deviceNo++; });
-  } else {
-    std::vector<string> order_str = str_util::Split(visible_device_list, ',');
-    for (int i = 0; i < order_str.size(); ++i) {
-      const string& gpu_id_str = order_str[i];
-      int32 gpu_id;
-      if (!strings::safe_strto32(gpu_id_str, &gpu_id)) {
-        return errors::InvalidArgument(
-            "Could not parse entry in 'visible_device_list': '", gpu_id_str,
-            "'.  visible_device_list = ", visible_device_list);
-      }
-
-      if (gpu_id < 0 || gpu_id >= gpu_manager->VisibleDeviceCount()) {
-        return errors::InvalidArgument(
-            "'visible_device_list' listed an invalid GPU id '", gpu_id,
-            "' but visible device count is ",
-            gpu_manager->VisibleDeviceCount());
-      }
-
-      visible_gpu_order.push_back(gpu_id);
-    }
-  }
-
-  // Validate no repeats.
-  std::set<int> visible_device_set(visible_gpu_order.begin(),
-                                   visible_gpu_order.end());
-  if (visible_device_set.size() != visible_gpu_order.size()) {
-    return errors::InvalidArgument(
-        "visible_device_list contained "
-        "a duplicate entry: ",
-        visible_device_list);
-  }
-
   bool new_gpu_found = false;
   for (int i = 0; i < visible_gpu_order.size(); ++i) {
-    int gpu_id = visible_gpu_order[i];
+    const CudaGpuId cuda_gpu_id = visible_gpu_order[i];
 
-    // Only perform this once per visible gpu id.
-    if (visible_gpu_initialized_[gpu_id]) {
+    // Only perform this once per visible cuda gpu id.
+    if (visible_gpu_initialized_[cuda_gpu_id.value()]) {
       continue;
     }
 
-    visible_gpu_initialized_[gpu_id] = true;
+    visible_gpu_initialized_[cuda_gpu_id.value()] = true;
     new_gpu_found = true;
 
-    auto executor = gpu_manager->ExecutorForDevice(gpu_id);
+    auto executor = GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, cuda_gpu_id);
     if (!executor.ok()) {
       return StreamExecutorUtil::ConvertStatus(executor.status());
     }
@@ -1080,11 +1218,11 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
     auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order);
     string line_buf = "DMA: ";
     for (int i = 0; i < visible_gpu_order.size(); ++i) {
-      strings::StrAppend(&line_buf, visible_gpu_order[i], " ");
+      strings::StrAppend(&line_buf, visible_gpu_order[i].value(), " ");
     }
     LOG(INFO) << line_buf;
     for (int i = 0; i < visible_gpu_order.size(); ++i) {
-      line_buf = strings::StrCat(visible_gpu_order[i], ":   ");
+      line_buf = strings::StrCat(visible_gpu_order[i].value(), ":   ");
       for (int j = 0; j < visible_gpu_order.size(); ++j) {
         if ((*access_map)[{i, j}]) {
           line_buf.append("Y ");
@@ -1109,9 +1247,13 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
 
   // Filter out devices that don't have the right capability or power.
   for (int i = 0; i < visible_gpu_order.size(); ++i) {
-    const int32 visible_gpu_id = visible_gpu_order[i];
-    auto exec_status = gpu_manager->ExecutorForDevice(visible_gpu_id);
+    const CudaGpuId visible_gpu_id = visible_gpu_order[i];
+    auto exec_status =
+        GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, visible_gpu_id);
     if (!exec_status.ok()) {
+      LOG(INFO) << "Ignoring visible gpu device " << visible_gpu_id
+                << " whose executor is in invalid state: "
+                << exec_status.status().ToString();
       continue;
     }
     gpu::StreamExecutor* se = exec_status.ValueOrDie();
@@ -1119,6 +1261,10 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
     CudaVersion device_capability;
     if (!desc.cuda_compute_capability(&device_capability.major_part,
                                       &device_capability.minor_part)) {
+      LOG(INFO) << "Ignoring visible gpu device "
+                << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
+                << ") "
+                << "whose CUDA compute capability is not available.";
       continue;
     }
     // Only GPUs with no less than the minimum supported compute capability is
@@ -1138,7 +1284,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
     // multiprocessors. If the TF_MIN_GPU_MULTIPROCESSOR_COUNT environment
     // variable is set, its value will be used to filter out GPUs.
     if (desc.core_count() < min_gpu_core_count) {
-      LOG(INFO) << "Ignoring gpu device "
+      LOG(INFO) << "Ignoring visible gpu device "
                 << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
                 << ") "
                 << "with Cuda multiprocessor count: " << desc.core_count()
@@ -1147,12 +1293,8 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
                    "TF_MIN_GPU_MULTIPROCESSOR_COUNT.";
       continue;
     }
-
-    size_t new_id = ids->size();
+    LOG(INFO) << "Adding visible gpu device " << visible_gpu_id;
     ids->push_back(visible_gpu_id);
-
-    LOG(INFO) << "Creating TensorFlow device (/device:GPU:" << new_id << ") -> "
-              << "(" << GetShortDeviceDescription(visible_gpu_id, desc) << ")";
   }
 
   return Status::OK();
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index 442496437af5f4796f6d216f7c688d31f2f457d7..41e60b4884673673f2e791cbbafa4ef0091bdf8f 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -28,6 +28,8 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu_device_context.h"
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "tensorflow/core/framework/allocator.h"
@@ -45,10 +47,10 @@ namespace tensorflow {
 class BaseGPUDevice : public LocalDevice {
  public:
   BaseGPUDevice(const SessionOptions& options, const string& name,
-                Bytes memory_limit, const DeviceLocality& locality, int gpu_id,
-                const string& physical_device_desc, Allocator* gpu_allocator,
-                Allocator* cpu_allocator, bool sync_every_op,
-                int32 max_streams);
+                Bytes memory_limit, const DeviceLocality& locality,
+                TfGpuId tf_gpu_id, const string& physical_device_desc,
+                Allocator* gpu_allocator, Allocator* cpu_allocator,
+                bool sync_every_op, int32 max_streams);
 
   ~BaseGPUDevice() override;
 
@@ -84,9 +86,9 @@ class BaseGPUDevice : public LocalDevice {
   void ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
                              DeviceContext* dc, Allocator* allocator) override;
 
-  // Returns the id of this device within the native driver system; e.g., for
-  // CUDA this is the ordinal of the GPU within the system.
-  int gpu_id() const { return gpu_id_; }
+  // Returns the CUDA GPU id of this device within the native driver system;
+  // e.g., for CUDA this is the ordinal of the GPU within the system.
+  int gpu_id() const { return GpuIdUtil::TfToCudaGpuId(tf_gpu_id_).value(); }
 
   // The executor that provides control for the device; e.g., for CUDA this
   // corresponds to the cuda context.
@@ -112,10 +114,11 @@ class BaseGPUDevice : public LocalDevice {
   std::vector<GPUDeviceContext*> device_contexts_;
   GpuDeviceInfo* gpu_device_info_ = nullptr;
   mutex trace_mu_;
-  int gpu_id_ = -1;
+  TfGpuId tf_gpu_id_;
   const bool sync_every_op_ = false;
   const int32 max_streams_;
   std::unique_ptr<EventMgr> em_;
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
 
   void ReinitializeDevice(OpKernelContext* context, PerOpGpuDevice* device,
                           int stream_id, Allocator* allocator);
@@ -138,25 +141,30 @@ class BaseGPUDeviceFactory : public DeviceFactory {
                        std::vector<Device*>* devices) override;
 
  private:
-  Status CreateGPUDevice(const SessionOptions& options, const string& name,
-                         int gpu_id, BaseGPUDevice** out_device);
+  // Creates a BaseGPUDevice associated with 'tf_gpu_id', allocates (strictly)
+  // 'memory_limit' bytes of GPU memory to it, and adds it to the 'devices'
+  // vector.
+  Status CreateGPUDevice(const SessionOptions& options,
+                         const string& name_prefix, TfGpuId tf_gpu_id,
+                         int64 memory_limit, std::vector<Device*>* devices);
 
   virtual BaseGPUDevice* CreateGPUDevice(const SessionOptions& options,
                                          const string& name, Bytes memory_limit,
                                          const DeviceLocality& locality,
-                                         int gpu_id,
+                                         TfGpuId tf_gpu_id,
                                          const string& physical_device_desc,
                                          Allocator* gpu_allocator,
                                          Allocator* cpu_allocator) = 0;
 
-  // Returns into 'ids' the list of valid GPU ids, in the order that
-  // they should map to logical gpu ids "/device:GPU:0", "/device:GPU:1", etc, based
-  // upon 'visible_device_list', a comma-separated list of 'visible
-  // gpu ids'.
-  Status GetValidDeviceIds(const string& visible_device_list,
-                           std::vector<int>* ids);
+  // Returns into 'ids' the list of valid CUDA GPU ids, in the order that
+  // they should map to TF GPU ids "/device:GPU:0", "/device:GPU:1", etc,
+  // based upon 'visible_gpu_order' which was generated by parsing
+  // GPUOptions::visible_device_list which is a comma-separated list of CUDA GPU
+  // ids.
+  Status GetValidDeviceIds(const std::vector<CudaGpuId>& visible_gpu_order,
+                           std::vector<CudaGpuId>* ids);
 
-  // visible_gpu_initialized_[gpu_id] is true if visible GPU gpu_id
+  // visible_gpu_initialized_[cuda_gpu_id] is true if visible GPU cuda_gpu_id
   // has been initialized by the process.
   std::unordered_map<int, bool> visible_gpu_initialized_;
 };
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
index 63ac3daba142b0076407110509034a512b00ff37..9a000749c6e677743ea700eb941f4147646ddc55 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #define EIGEN_USE_GPU
 
 #include "tensorflow/core/common_runtime/gpu/gpu_device.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/gpu/process_state.h"
 #include "tensorflow/core/common_runtime/threadpool_device.h"
 
@@ -26,10 +27,10 @@ namespace tensorflow {
 class GPUDevice : public BaseGPUDevice {
  public:
   GPUDevice(const SessionOptions& options, const string& name,
-            Bytes memory_limit, const DeviceLocality& locality, int gpu_id,
-            const string& physical_device_desc, Allocator* gpu_allocator,
-            Allocator* cpu_allocator)
-      : BaseGPUDevice(options, name, memory_limit, locality, gpu_id,
+            Bytes memory_limit, const DeviceLocality& locality,
+            TfGpuId tf_gpu_id, const string& physical_device_desc,
+            Allocator* gpu_allocator, Allocator* cpu_allocator)
+      : BaseGPUDevice(options, name, memory_limit, locality, tf_gpu_id,
                       physical_device_desc, gpu_allocator, cpu_allocator,
                       false /* sync every op */, 1 /* max_streams */) {
     if (options.config.has_gpu_options()) {
@@ -59,11 +60,12 @@ class GPUDeviceFactory : public BaseGPUDeviceFactory {
  private:
   BaseGPUDevice* CreateGPUDevice(const SessionOptions& options,
                                  const string& name, Bytes memory_limit,
-                                 const DeviceLocality& locality, int gpu_id,
+                                 const DeviceLocality& locality,
+                                 TfGpuId tf_gpu_id,
                                  const string& physical_device_desc,
                                  Allocator* gpu_allocator,
                                  Allocator* cpu_allocator) override {
-    return new GPUDevice(options, name, memory_limit, locality, gpu_id,
+    return new GPUDevice(options, name, memory_limit, locality, tf_gpu_id,
                          physical_device_desc, gpu_allocator, cpu_allocator);
   }
 };
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_on_non_gpu_machine_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_on_non_gpu_machine_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..75be6d60b86af101fb9de7497490e72c523d632b
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_on_non_gpu_machine_test.cc
@@ -0,0 +1,54 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/test.h"
+
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/common_runtime/gpu/gpu_device.h"
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/platform.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+
+TEST(GPUDeviceOnNonGPUMachineTest, CreateGPUDevicesOnNonGPUMachine) {
+  SessionOptions opts;
+  std::vector<tensorflow::Device*> devices;
+  TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, "/job:localhost/replica:0/task:0", &devices));
+  EXPECT_TRUE(devices.empty());
+}
+
+}  // namespace
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
+
+int main(int argc, char** argv) {
+#if GOOGLE_CUDA
+  // Sets CUDA_VISIBLE_DEVICES to empty string to simulate non-gpu environment.
+  setenv("CUDA_VISIBLE_DEVICES", "", 1);
+#endif  // GOOGLE_CUDA
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ff46be9c015ac3d0ad59e302f53d52c4bd3e25ea
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
@@ -0,0 +1,189 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/common_runtime/gpu/gpu_device.h"
+
+#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+const char* kDeviceNamePrefix = "/job:localhost/replica:0/task:0";
+
+static SessionOptions MakeSessionOptions(
+    const string& visible_device_list = "",
+    double per_process_gpu_memory_fraction = 0, int gpu_device_count = 1,
+    const std::vector<std::vector<float>>& memory_limit_mb = {}) {
+  SessionOptions options;
+  ConfigProto* config = &options.config;
+  (*config->mutable_device_count())["GPU"] = gpu_device_count;
+  GPUOptions* gpu_options = config->mutable_gpu_options();
+  gpu_options->set_visible_device_list(visible_device_list);
+  gpu_options->set_per_process_gpu_memory_fraction(
+      per_process_gpu_memory_fraction);
+  for (const auto& v : memory_limit_mb) {
+    auto virtual_devices =
+        gpu_options->mutable_experimental()->add_virtual_devices();
+    for (float mb : v) {
+      virtual_devices->add_memory_limit_mb(mb);
+    }
+  }
+  return options;
+}
+
+static bool StartsWith(const string& lhs, const string& rhs) {
+  if (rhs.length() > lhs.length()) return false;
+  return lhs.substr(0, rhs.length()) == rhs;
+}
+
+TEST(GPUDeviceTest, FailedToParseVisibleDeviceList) {
+  SessionOptions opts = MakeSessionOptions("0,abc");
+  std::vector<tensorflow::Device*> devices;
+  Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices);
+  EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
+  EXPECT_TRUE(StartsWith(status.error_message(), "Could not parse entry"))
+      << status;
+}
+
+TEST(GPUDeviceTest, InvalidGpuId) {
+  SessionOptions opts = MakeSessionOptions("100");
+  std::vector<tensorflow::Device*> devices;
+  Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices);
+  EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
+  EXPECT_TRUE(StartsWith(status.error_message(),
+                         "'visible_device_list' listed an invalid GPU id"))
+      << status;
+}
+
+TEST(GPUDeviceTest, DuplicateEntryInVisibleDeviceList) {
+  SessionOptions opts = MakeSessionOptions("0,0");
+  std::vector<tensorflow::Device*> devices;
+  Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices);
+  EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
+  EXPECT_TRUE(StartsWith(status.error_message(),
+                         "visible_device_list contained a duplicate entry"))
+      << status;
+}
+
+TEST(GPUDeviceTest, VirtualDeviceConfigConflictsWithMemoryFractionSettings) {
+  SessionOptions opts = MakeSessionOptions("0", 0.1, 1, {{}});
+  std::vector<tensorflow::Device*> devices;
+  Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices);
+  EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
+  EXPECT_TRUE(StartsWith(status.error_message(),
+                         "It's invalid to set per_process_gpu_memory_fraction"))
+      << status;
+}
+
+TEST(GPUDeviceTest, GpuDeviceCountTooSmall) {
+  // device_count is 0, but with one entry in visible_device_list and one
+  // (empty) VirtualDevices messages.
+  SessionOptions opts = MakeSessionOptions("0", 0, 0, {{}});
+  std::vector<tensorflow::Device*> devices;
+  Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices);
+  EXPECT_EQ(status.code(), error::UNKNOWN);
+  EXPECT_TRUE(StartsWith(status.error_message(),
+                         "Not enough GPUs to create virtual devices."))
+      << status;
+}
+
+TEST(GPUDeviceTest, NotEnoughGpuInVisibleDeviceList) {
+  // Single entry in visible_device_list with two (empty) VirtualDevices
+  // messages.
+  SessionOptions opts = MakeSessionOptions("0", 0, 8, {{}, {}});
+  std::vector<tensorflow::Device*> devices;
+  Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices);
+  EXPECT_EQ(status.code(), error::UNKNOWN);
+  EXPECT_TRUE(StartsWith(status.error_message(),
+                         "Not enough GPUs to create virtual devices."))
+      << status;
+}
+
+TEST(GPUDeviceTest, VirtualDeviceConfigConflictsWithVisibleDeviceList) {
+  // This test requires at least two visible GPU hardware.
+  if (GPUMachineManager()->VisibleDeviceCount() < 2) return;
+  // Three entries in visible_device_list with two (empty) VirtualDevices
+  // messages.
+  SessionOptions opts = MakeSessionOptions("0,1", 0, 8, {{}});
+  std::vector<tensorflow::Device*> devices;
+  Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices);
+  EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
+  EXPECT_TRUE(StartsWith(status.error_message(),
+                         "The number of GPUs in visible_device_list doesn't "
+                         "match the number of elements in the virtual_devices "
+                         "list."))
+      << status;
+}
+
+TEST(GPUDeviceTest, EmptyVirtualDeviceConfig) {
+  // It'll create single virtual device when the virtual device config is empty.
+  SessionOptions opts = MakeSessionOptions("0");
+  std::vector<tensorflow::Device*> devices;
+  TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices));
+  EXPECT_EQ(1, devices.size());
+  EXPECT_GE(devices[0]->attributes().memory_limit(), 0);
+  for (auto d : devices) delete d;
+}
+
+TEST(GPUDeviceTest, SingleVirtualDeviceWithNoMemoryLimit) {
+  // It'll create single virtual device for the gpu in question when
+  // memory_limit_mb is unset.
+  SessionOptions opts = MakeSessionOptions("0", 0, 1, {{}});
+  std::vector<tensorflow::Device*> devices;
+  TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices));
+  EXPECT_EQ(1, devices.size());
+  EXPECT_GE(devices[0]->attributes().memory_limit(), 0);
+  for (auto d : devices) delete d;
+}
+
+TEST(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimit) {
+  SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123}});
+  std::vector<tensorflow::Device*> devices;
+  TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices));
+  EXPECT_EQ(1, devices.size());
+  EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit());
+  for (auto d : devices) delete d;
+}
+
+TEST(GPUDeviceTest, MultipleVirtualDevices) {
+  SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}});
+  std::vector<tensorflow::Device*> devices;
+  TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices));
+  EXPECT_EQ(2, devices.size());
+  EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit());
+  EXPECT_EQ(456 << 20, devices[1]->attributes().memory_limit());
+  for (auto d : devices) delete d;
+}
+
+}  // namespace
+}  // namespace tensorflow
+
+#endif
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id.h b/tensorflow/core/common_runtime/gpu/gpu_id.h
new file mode 100644
index 0000000000000000000000000000000000000000..ff81ccd4325e0ad22636cd78ba99e0bff6a03347
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_id.h
@@ -0,0 +1,88 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_H_
+
+#include "tensorflow/core/lib/gtl/int_type.h"
+
+namespace tensorflow {
+
+// There are three types of GPU ids:
+// - *physical* GPU id: this is the integer index of a GPU hardware in the
+//   physical machine, it can be filtered by CUDA environment variable
+//   CUDA_VISIBLE_DEVICES. Note that this id is not visible to Tensorflow, but
+//   result after filtering by CUDA_VISIBLE_DEVICES is visible to TF and is
+//   called CUDA GPU id as below. See
+//   http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars
+//   for more details.
+// - CUDA GPU id (also called *visible* GPU id in
+//   third_party/tensorflow/core/protobuf/config.proto): this is the id that is
+//   visible to Tensorflow after filtering by CUDA_VISIBLE_DEVICES, and is
+//   generated by the CUDA GPU driver. It starts from 0 and is used for CUDA API
+//   calls like cuDeviceGet().
+// - TF GPU id (also called *virtual* GPU id in
+//   third_party/tensorflow/core/protobuf/config.proto): this is the id that
+//   Tensorflow generates and exposes to its users. It is the id in the <id>
+//   field of the device name "/device:GPU:<id>", and is also the identifier of
+//   a BaseGPUDevice. Note that the configuration allows us to create multiple
+//   BaseGPUDevice per GPU hardware in order to use multi CUDA streams on the
+//   hardware, so the mapping between TF GPU id and CUDA GPU id is not a 1:1
+//   mappping, see the example below.
+//
+// For example, assuming that in the machine we have GPU device with index 0, 1,
+// 2 and 3 (physical GPU id). Setting "CUDA_VISIBLE_DEVICES=1,2,3" will create
+// the following mapping between CUDA GPU id and physical GPU id:
+//
+//        CUDA GPU id ->  physical GPU id
+//                 0  ->  1
+//                 1  ->  2
+//                 2  ->  3
+//
+// Note that physical GPU id 0 is invisible to TF so there is no mapping entry
+// for it.
+//
+// Assuming we configure the Session to create one BaseGPUDevice per GPU
+// hardware, then setting GPUOptions::visible_device_list to "2,0" will create
+// the following mappting between TF GPU id and CUDA GPU id:
+//
+//                  TF GPU id  ->  CUDA GPU ID
+//      0 (i.e. /device:GPU:0) ->  2
+//      1 (i.e. /device:GPU:1) ->  0
+//
+// Note that CUDA GPU id 1 is filtered out by GPUOptions::visible_device_list,
+// so it won't be used by the TF process.
+//
+// On the other hand, if we configure it to create 2 BaseGPUDevice per GPU
+// hardware, then setting GPUOptions::visible_device_list to "2,0" will create
+// the following mappting between TF GPU id and CUDA GPU id:
+//
+//                  TF GPU id  ->  CUDA GPU ID
+//      0 (i.e. /device:GPU:0) ->  2
+//      1 (i.e. /device:GPU:1) ->  2
+//      2 (i.e. /device:GPU:2) ->  0
+//      3 (i.e. /device:GPU:3) ->  0
+//
+// We create strong-typed integer classes for both TF GPU id and CUDA GPU id to
+// minimize programming errors and improve code readability. Except for the
+// StreamExecutor interface (as we don't change its API), whenever we need a
+// TF GPU id (or CUDA GPU id) we should use TfGpuId (or CudaGpuId) instead of a
+// raw integer.
+TF_LIB_GTL_DEFINE_INT_TYPE(TfGpuId, int32);
+TF_LIB_GTL_DEFINE_INT_TYPE(CudaGpuId, int32);
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_utils.cc b/tensorflow/core/common_runtime/gpu/gpu_id_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..92cd19453f14c886c0d105a5c1809b7fdbcafc9b
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_id_utils.cc
@@ -0,0 +1,74 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
+
+#include <unordered_map>
+
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+namespace {
+// Manages the map between TfGpuId and CUDA GPU id.
+class GpuIdManager {
+ public:
+  static GpuIdManager* singleton() {
+    static auto* manager = new GpuIdManager;
+    return manager;
+  }
+
+  void InsertOrDie(TfGpuId tf_gpu_id, CudaGpuId cuda_gpu_id)
+      LOCKS_EXCLUDED(mu_) {
+    std::pair<IdMapType::iterator, bool> result;
+    {
+      mutex_lock lock(mu_);
+      result = id_map_.insert({tf_gpu_id.value(), cuda_gpu_id.value()});
+    }
+    if (!result.second) {
+      CHECK_EQ(cuda_gpu_id.value(), result.first->second)
+          << "Mapping the same TfGpuId to a different CUDA GPU id."
+          << " TfGpuId: " << tf_gpu_id
+          << " Existing mapped CUDA GPU id: " << result.first->second
+          << " CUDA GPU id being tried to map to: " << cuda_gpu_id;
+    }
+  }
+
+  int32 FindOrDie(TfGpuId tf_gpu_id) const LOCKS_EXCLUDED(mu_) {
+    mutex_lock lock(mu_);
+    auto result = id_map_.find(tf_gpu_id.value());
+    CHECK(result != id_map_.end())
+        << "Could not find the mapping for TfGpuId: " << tf_gpu_id;
+    return result->second;
+  }
+
+ private:
+  using IdMapType = std::unordered_map<int32, int32>;
+  mutable mutex mu_;
+  IdMapType id_map_ GUARDED_BY(mu_);
+};
+}  // namespace
+
+void GpuIdUtil::InsertTfCudaGpuIdPair(TfGpuId tf_gpu_id,
+                                      CudaGpuId cuda_gpu_id) {
+  GpuIdManager::singleton()->InsertOrDie(tf_gpu_id, cuda_gpu_id);
+}
+
+CudaGpuId GpuIdUtil::TfToCudaGpuId(TfGpuId tf_gpu_id) {
+  return CudaGpuId(GpuIdManager::singleton()->FindOrDie(tf_gpu_id));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_utils.h b/tensorflow/core/common_runtime/gpu/gpu_id_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..78e51c84c146693dfc02ce445bda030797de6c07
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_id_utils.h
@@ -0,0 +1,61 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_UTILS_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_UTILS_H_
+
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/lib/gtl/int_type.h"
+#include "tensorflow/core/platform/stream_executor.h"
+
+namespace tensorflow {
+namespace gpu = ::perftools::gputools;
+
+// Utility methods for translation between Tensorflow GPU ids and CUDA GPU ids.
+class GpuIdUtil {
+ public:
+  static void InsertTfCudaGpuIdPair(TfGpuId tf_gpu_id, CudaGpuId cuda_gpu_id);
+  static CudaGpuId TfToCudaGpuId(TfGpuId tf_gpu_id);
+
+  // Convenient methods for getting the associated executor given a TfGpuId or
+  // CudaGpuId.
+  static gpu::port::StatusOr<gpu::StreamExecutor*> ExecutorForCudaGpuId(
+      gpu::Platform* gpu_manager, CudaGpuId cuda_gpu_id) {
+    return gpu_manager->ExecutorForDevice(cuda_gpu_id.value());
+  }
+  static gpu::port::StatusOr<gpu::StreamExecutor*> ExecutorForCudaGpuId(
+      CudaGpuId cuda_gpu_id) {
+    return ExecutorForCudaGpuId(GPUMachineManager(), cuda_gpu_id);
+  }
+  static gpu::port::StatusOr<gpu::StreamExecutor*> ExecutorForTfGpuId(
+      TfGpuId tf_gpu_id) {
+    return ExecutorForCudaGpuId(GpuIdUtil::TfToCudaGpuId(tf_gpu_id));
+  }
+
+  // Verify that the cuda_gpu_id associated with a TfGpuId is legitimate.
+  static void CheckValidTfGpuId(TfGpuId tf_gpu_id) {
+    const CudaGpuId cuda_gpu_id = GpuIdUtil::TfToCudaGpuId(tf_gpu_id);
+    const int visible_device_count = GPUMachineManager()->VisibleDeviceCount();
+    CHECK_LT(cuda_gpu_id.value(), visible_device_count)
+        << "cuda_gpu_id is outside discovered device range."
+        << " TF GPU id: " << tf_gpu_id << " CUDA GPU id: " << cuda_gpu_id
+        << " visible device count: " << visible_device_count;
+  }
+};
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_UTILS_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_utils_test.cc b/tensorflow/core/common_runtime/gpu/gpu_id_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bebe00a4317becdba1fc6146b4eb188b93933fff
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_id_utils_test.cc
@@ -0,0 +1,55 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
+
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace test {
+
+TEST(GpuIdTest, Basics) {
+  TfGpuId key_0(0);
+  CudaGpuId value_0(0);
+  GpuIdUtil::InsertTfCudaGpuIdPair(key_0, value_0);
+  EXPECT_EQ(value_0, GpuIdUtil::TfToCudaGpuId(key_0));
+
+  // Multiple calls to map the same value is ok.
+  GpuIdUtil::InsertTfCudaGpuIdPair(key_0, value_0);
+  EXPECT_EQ(value_0, GpuIdUtil::TfToCudaGpuId(key_0));
+
+  // Map a different TfGpuId to a different value.
+  TfGpuId key_1(3);
+  CudaGpuId value_1(2);
+  GpuIdUtil::InsertTfCudaGpuIdPair(key_1, value_1);
+  EXPECT_EQ(value_1, GpuIdUtil::TfToCudaGpuId(key_1));
+
+  // Mapping a different TfGpuId to the same value is ok.
+  TfGpuId key_2(10);
+  GpuIdUtil::InsertTfCudaGpuIdPair(key_2, value_1);
+  EXPECT_EQ(value_1, GpuIdUtil::TfToCudaGpuId(key_2));
+
+  // Mapping the same TfGpuId to a different value will crash the program.
+  ASSERT_DEATH(GpuIdUtil::InsertTfCudaGpuIdPair(key_2, value_0),
+               "Mapping the same TfGpuId to a different CUDA GPU id");
+
+  // Getting an nonexistent mapping will crash the program.
+  ASSERT_DEATH(GpuIdUtil::TfToCudaGpuId(TfGpuId(100)),
+               "Could not find the mapping for TfGpuId");
+}
+
+}  // namespace test
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.cc b/tensorflow/core/common_runtime/gpu/gpu_util.cc
index 657bdf0601bcc721c36209060654a19c3b6afb8a..a0f5877d62f0c889c2a598b8e03771e4bb49e0a9 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.cc
@@ -352,11 +352,7 @@ Status GPUUtil::Sync(Device* gpu_device) {
   if (!dev_info) {
     return errors::Internal("Failed to find dest device GPUDeviceInfo");
   }
-  dev_info->stream->BlockHostUntilDone();
-  if (!dev_info->stream->ok()) {
-    return errors::Internal("GPU sync failed");
-  }
-  return Status::OK();
+  return dev_info->stream->BlockHostUntilDone();
 }
 
 Status GPUUtil::SyncAll(Device* gpu_device) {
diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc
index 0675dbf3fcdc772f4d45025d296eaddbf4397271..8a3220ce2bb72ee32cd5b3b4d50d568e34c201e5 100644
--- a/tensorflow/core/common_runtime/gpu/process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/process_state.cc
@@ -20,6 +20,8 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/common_runtime/gpu/pool_allocator.h"
 #include "tensorflow/core/framework/allocator.h"
@@ -107,23 +109,20 @@ ProcessState::MemDesc ProcessState::PtrType(const void* ptr) {
   return MemDesc();
 }
 
-Allocator* ProcessState::GetGPUAllocator(const GPUOptions& options, int gpu_id,
+Allocator* ProcessState::GetGPUAllocator(const GPUOptions& options,
+                                         TfGpuId tf_gpu_id,
                                          size_t total_bytes) {
 #if GOOGLE_CUDA
   const string& allocator_type = options.allocator_type();
   mutex_lock lock(mu_);
-  gpu::Platform* gpu_platform = GPUMachineManager();
+  GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
 
-  // Verify that gpu_id is legitimate.
-  CHECK_LT(gpu_id, gpu_platform->VisibleDeviceCount())
-      << "gpu_id is outside discovered device range";
-
-  if (gpu_id >= static_cast<int64>(gpu_allocators_.size())) {
-    gpu_allocators_.resize(gpu_id + 1);
-    if (FLAGS_brain_gpu_record_mem_types) gpu_al_.resize(gpu_id + 1);
+  if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
+    gpu_allocators_.resize(tf_gpu_id.value() + 1);
+    if (FLAGS_brain_gpu_record_mem_types) gpu_al_.resize(tf_gpu_id.value() + 1);
   }
 
-  if (gpu_allocators_[gpu_id] == nullptr) {
+  if (gpu_allocators_[tf_gpu_id.value()] == nullptr) {
     VisitableAllocator* gpu_allocator;
 
     // Validate allocator types.
@@ -132,45 +131,49 @@ Allocator* ProcessState::GetGPUAllocator(const GPUOptions& options, int gpu_id,
       return nullptr;
     }
 
-    gpu_allocator = new GPUBFCAllocator(gpu_id, total_bytes, options);
+    const CudaGpuId cuda_gpu_id = GpuIdUtil::TfToCudaGpuId(tf_gpu_id);
+    gpu_allocator =
+        new GPUBFCAllocator(cuda_gpu_id, total_bytes, options,
+                            strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc"));
 
     // If true, checks for memory overwrites by writing
     // distinctive patterns on both ends of allocated memory.
     if (useCudaMemoryGuardAllocator()) {
-      gpu_allocator = new GPUDebugAllocator(gpu_allocator, gpu_id);
-      gpu_allocator = new GPUNanResetAllocator(gpu_allocator, gpu_id);
+      gpu_allocator = new GPUDebugAllocator(gpu_allocator, cuda_gpu_id);
+      gpu_allocator = new GPUNanResetAllocator(gpu_allocator, cuda_gpu_id);
     } else if (useCudaMallocAllocator()) {
       // If true, passes all allocation requests through to cudaMalloc
       // useful for doing memory debugging with tools like cuda-memcheck
       // **WARNING** probably will not work in a multi-gpu scenario
-      gpu_allocator = new GPUcudaMallocAllocator(gpu_allocator, gpu_id);
+      gpu_allocator = new GPUcudaMallocAllocator(gpu_allocator, cuda_gpu_id);
     }
-    gpu_allocators_[gpu_id] = gpu_allocator;
+    gpu_allocators_[tf_gpu_id.value()] = gpu_allocator;
 
     // If there are any pending AllocVisitors for this bus, add
     // them now.
     gpu::StreamExecutor* se =
-        gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
+        GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id).ValueOrDie();
     int bus_id = se->GetDeviceDescription().numa_node();
     if (bus_id >= 0 && bus_id < static_cast<int64>(gpu_visitors_.size())) {
       for (const auto& v : gpu_visitors_[bus_id]) {
-        gpu_allocators_[gpu_id]->AddAllocVisitor(v);
+        gpu_allocator->AddAllocVisitor(v);
       }
     }
     if (FLAGS_brain_gpu_record_mem_types) {
       MemDesc md;
       md.loc = MemDesc::GPU;
-      md.dev_index = gpu_id;
+      md.dev_index = cuda_gpu_id.value();
       md.gpu_registered = false;
       md.nic_registered = true;
-      if (static_cast<int64>(gpu_al_.size()) <= gpu_id)
-        gpu_al_.resize(gpu_id + 1);
-      gpu_al_[gpu_id] = new internal::RecordingAllocator(
-          &mem_desc_map_, gpu_allocators_[gpu_id], md, &mu_);
+      if (static_cast<int64>(gpu_al_.size()) <= tf_gpu_id.value()) {
+        gpu_al_.resize(tf_gpu_id.value() + 1);
+      }
+      gpu_al_[tf_gpu_id.value()] = new internal::RecordingAllocator(
+          &mem_desc_map_, gpu_allocator, md, &mu_);
     }
   }
-  if (FLAGS_brain_gpu_record_mem_types) return gpu_al_[gpu_id];
-  return gpu_allocators_[gpu_id];
+  if (FLAGS_brain_gpu_record_mem_types) return gpu_al_[tf_gpu_id.value()];
+  return gpu_allocators_[tf_gpu_id.value()];
 #else
   LOG(FATAL) << "GPUAllocator unavailable. Not compiled with --config=cuda.";
   return nullptr;
@@ -246,7 +249,7 @@ Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) {
   gpu::StreamExecutor* se = nullptr;
   for (int i = 0; i < static_cast<int>(gpu_allocators_.size()); ++i) {
     if (gpu_allocators_[i] != nullptr) {
-      se = GPUMachineManager()->ExecutorForDevice(i).ValueOrDie();
+      se = GpuIdUtil::ExecutorForTfGpuId(TfGpuId(i)).ValueOrDie();
       break;
     }
   }
@@ -290,14 +293,12 @@ Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) {
 void ProcessState::AddGPUAllocVisitor(int bus_id, AllocVisitor visitor) {
 #if GOOGLE_CUDA
   mutex_lock lock(mu_);
-  gpu::Platform* gpu_platform = GPUMachineManager();
-  for (int gpu_id = 0; gpu_id < static_cast<int64>(gpu_allocators_.size());
-       ++gpu_id) {
+  for (int i = 0; i < static_cast<int64>(gpu_allocators_.size()); ++i) {
     gpu::StreamExecutor* se =
-        gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
-    if (gpu_allocators_[gpu_id] &&
+        GpuIdUtil::ExecutorForTfGpuId(TfGpuId(i)).ValueOrDie();
+    if (gpu_allocators_[i] &&
         (se->GetDeviceDescription().numa_node() + 1) == bus_id) {
-      gpu_allocators_[gpu_id]->AddAllocVisitor(visitor);
+      gpu_allocators_[i]->AddAllocVisitor(visitor);
     }
   }
   while (bus_id >= static_cast<int64>(gpu_visitors_.size())) {
diff --git a/tensorflow/core/common_runtime/gpu/process_state.h b/tensorflow/core/common_runtime/gpu/process_state.h
index 319c508b92f539cdac04ff5acfa4740b0697bcd5..fa1e3fd785a991fc961b2ff937c08fb3654da9f4 100644
--- a/tensorflow/core/common_runtime/gpu/process_state.h
+++ b/tensorflow/core/common_runtime/gpu/process_state.h
@@ -17,9 +17,11 @@ limitations under the License.
 #define TENSORFLOW_COMMON_RUNTIME_GPU_PROCESS_STATE_H_
 
 #include <functional>
+#include <map>
 #include <unordered_map>
 #include <vector>
 
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/thread_annotations.h"
@@ -80,17 +82,17 @@ class ProcessState {
   //
   // 'total_bytes' is the total number of bytes that should be made
   // available to the allocator.  The first call to this function for
-  // a given gpu_id creates the allocator, so only the total_bytes
+  // a given tf_gpu_id creates the allocator, so only the total_bytes
   // used on that first call is used.
   //
   // "Allocator type" describes the type of algorithm to use for the
   // underlying allocator.  REQUIRES: Must be a valid type (see
   // config.proto for the list of supported strings.).
   //
-  // REQUIRES: gpu_id must be a valid ordinal for a GPU available in the
+  // REQUIRES: tf_gpu_id must be a valid id for a BaseGPUDevice available in the
   // current system environment.  Otherwise returns nullptr.
-  virtual Allocator* GetGPUAllocator(const GPUOptions& options, int gpu_id,
-                                     size_t total_bytes);
+  virtual Allocator* GetGPUAllocator(const GPUOptions& options,
+                                     TfGpuId tf_gpu_id, size_t total_bytes);
 
   virtual Allocator* GetCUDAHostAllocator(int numa_node);
 
diff --git a/tensorflow/core/common_runtime/pending_counts.h b/tensorflow/core/common_runtime/pending_counts.h
index 9e39b6b7b93a8e35ad3b47c1c637f7d906649823..5707f5259228c0e54d6d858652a8c50986c0c49b 100644
--- a/tensorflow/core/common_runtime/pending_counts.h
+++ b/tensorflow/core/common_runtime/pending_counts.h
@@ -44,7 +44,7 @@ namespace tensorflow {
 
 //    PendingCounts counts(layout);
 //    ...
-//    counts.decrement_panding(h[id], 1);
+//    counts.decrement_pending(h[id], 1);
 class PendingCounts {
  public:
   // The state machine for a node's execution.
diff --git a/tensorflow/core/common_runtime/placer.cc b/tensorflow/core/common_runtime/placer.cc
index 73fdf60fd5f1669c5c4e0d0c64b37d983c7601fd..54f082e823d463301fc5f437781d01ce96741568 100644
--- a/tensorflow/core/common_runtime/placer.cc
+++ b/tensorflow/core/common_runtime/placer.cc
@@ -129,7 +129,7 @@ class ColocationGraph {
     // 'string' values stored in NodeDef attribute lists, as well as StringPiece
     // values that refer to 'string' values from NodeDef::name(), without
     // performing any string allocations.
-    std::unordered_map<StringPiece, const Node*, StringPiece::Hasher>
+    std::unordered_map<StringPiece, const Node*, StringPieceHasher>
         colocation_group_root;
 
     for (Node* node : graph_->nodes()) {
@@ -171,7 +171,7 @@ class ColocationGraph {
   }
 
   Status ColocateNodeToGroup(
-      std::unordered_map<StringPiece, const Node*, StringPiece::Hasher>*
+      std::unordered_map<StringPiece, const Node*, StringPieceHasher>*
           colocation_group_root,
       Node* node, StringPiece colocation_group) {
     const Node*& root_node = (*colocation_group_root)[colocation_group];
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc
index 142ff2339b90d56381e211c4c7b73009c8134949..53a14121d478edccbcacc12916de2ee2e12602b5 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc
@@ -30,7 +30,10 @@ ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
     const FunctionLibraryDefinition* lib_def,
     const OptimizerOptions& optimizer_options,
     DistributedFunctionLibraryRuntime* parent)
-    : device_mgr_(device_mgr), lib_def_(lib_def), parent_(parent) {
+    : device_mgr_(device_mgr),
+      lib_def_(lib_def),
+      next_handle_(0),
+      parent_(parent) {
   if (device_mgr == nullptr) {
     flr_map_[nullptr] =
         NewFunctionLibraryRuntime(nullptr, env, nullptr, graph_def_version,
@@ -50,7 +53,10 @@ ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
     const OptimizerOptions& optimizer_options,
     CustomKernelCreator custom_kernel_creator,
     DistributedFunctionLibraryRuntime* parent)
-    : device_mgr_(device_mgr), lib_def_(lib_def), parent_(parent) {
+    : device_mgr_(device_mgr),
+      lib_def_(lib_def),
+      next_handle_(0),
+      parent_(parent) {
   if (device_mgr == nullptr) {
     flr_map_[nullptr] = NewFunctionLibraryRuntime(
         nullptr, env, nullptr, graph_def_version, lib_def, optimizer_options,
@@ -185,30 +191,38 @@ FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::AddHandle(
   FunctionLibraryRuntime::Handle h =
       gtl::FindWithDefault(table_, function_key, kInvalidHandle);
   if (h != kInvalidHandle) {
-    return h;
+    if (function_data_.count(h) != 0) return h;
   }
-  h = function_data_.size();
-  function_data_.emplace_back(device_name, local_handle);
+  h = next_handle_;
+  function_data_.insert({h, FunctionData(device_name, local_handle)});
   table_[function_key] = h;
+  next_handle_++;
   return h;
 }
 
 FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::GetHandle(
     const string& function_key) const {
   mutex_lock l(mu_);
-  return gtl::FindWithDefault(table_, function_key, kInvalidHandle);
+  FunctionLibraryRuntime::Handle h =
+      gtl::FindWithDefault(table_, function_key, kInvalidHandle);
+  if (h != kInvalidHandle) {
+    if (function_data_.count(h) == 0) return kInvalidHandle;
+  }
+  return h;
 }
 
 bool ProcessFunctionLibraryRuntime::IsInstantiatedOnDevice(
     const string& device_name, FunctionLibraryRuntime::Handle handle) {
-  return GetHandleOnDevice(device_name, handle) != -1;
+  return GetHandleOnDevice(device_name, handle) != kInvalidHandle;
 }
 
 FunctionLibraryRuntime::LocalHandle
 ProcessFunctionLibraryRuntime::GetHandleOnDevice(
     const string& device_name, FunctionLibraryRuntime::Handle handle) {
   mutex_lock l(mu_);
-  CHECK_LE(handle, function_data_.size());
+  if (function_data_.count(handle) == 0) {
+    return kInvalidLocalHandle;
+  }
   const FunctionData& function_data = function_data_[handle];
   if (function_data.target_device != device_name) {
     return kInvalidLocalHandle;
@@ -219,7 +233,7 @@ ProcessFunctionLibraryRuntime::GetHandleOnDevice(
 string ProcessFunctionLibraryRuntime::GetDeviceName(
     FunctionLibraryRuntime::Handle handle) {
   mutex_lock l(mu_);
-  CHECK_LE(handle, function_data_.size());
+  CHECK_EQ(1, function_data_.count(handle));
   const FunctionData& function_data = function_data_[handle];
   return function_data.target_device;
 }
@@ -245,6 +259,29 @@ Status ProcessFunctionLibraryRuntime::Instantiate(
   return Status::OK();
 }
 
+Status ProcessFunctionLibraryRuntime::RemoveHandle(
+    FunctionLibraryRuntime::Handle handle) {
+  mutex_lock l(mu_);
+  function_data_.erase(handle);
+  return Status::OK();
+}
+
+Status ProcessFunctionLibraryRuntime::ReleaseHandle(
+    FunctionLibraryRuntime::Handle handle) {
+  FunctionLibraryRuntime* flr = nullptr;
+  string target_device;
+  {
+    mutex_lock l(mu_);
+    CHECK_EQ(1, function_data_.count(handle));
+    target_device = function_data_[handle].target_device;
+  }
+  flr = GetFLR(target_device);
+  if (flr != nullptr) {
+    return flr->ReleaseHandle(handle);
+  }
+  return errors::InvalidArgument("Handle not found: ", handle);
+}
+
 void ProcessFunctionLibraryRuntime::Run(
     const FunctionLibraryRuntime::Options& opts,
     FunctionLibraryRuntime::Handle handle, gtl::ArraySlice<Tensor> args,
@@ -261,7 +298,10 @@ void ProcessFunctionLibraryRuntime::Run(
   FunctionLibraryRuntime::LocalHandle local_handle;
   {
     mutex_lock l(mu_);
-    CHECK_LE(handle, function_data_.size());
+    if (function_data_.count(handle) == 0) {
+      done(errors::NotFound("Handle: ", handle, " not found."));
+      return;
+    }
     target_device = function_data_[handle].target_device;
     local_handle = function_data_[handle].local_handle;
   }
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h
index a267bc3601f990206f7fb5202f6186543e42eb19..3aa7b87286f4875740738b573e8f454cc1331a20 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.h
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.h
@@ -123,6 +123,12 @@ class ProcessFunctionLibraryRuntime {
   Status Instantiate(const string& function_name, AttrSlice attrs,
                      FunctionLibraryRuntime::Handle* handle);
 
+  // Delegates to the local FLR that owns state corresponding to `handle` and
+  // tells it to release it. If the `handle` isnt' needed at all, the local FLR
+  // might call RemoveHandle on this to get rid of the state owned by the Proc
+  // FLR.
+  Status ReleaseHandle(FunctionLibraryRuntime::Handle handle);
+
   // Runs the function with given `handle`. Function could have been
   // instantiated on any device. More details in framework/function.h
   void Run(const FunctionLibraryRuntime::Options& opts,
@@ -140,6 +146,9 @@ class ProcessFunctionLibraryRuntime {
   // of the device where the function is registered.
   string GetDeviceName(FunctionLibraryRuntime::Handle handle);
 
+  // Removes handle from the state owned by this object.
+  Status RemoveHandle(FunctionLibraryRuntime::Handle handle);
+
   friend class FunctionLibraryRuntimeImpl;
 
   mutable mutex mu_;
@@ -151,6 +160,7 @@ class ProcessFunctionLibraryRuntime {
     FunctionData(const string& target_device,
                  FunctionLibraryRuntime::LocalHandle local_handle)
         : target_device(target_device), local_handle(local_handle) {}
+    FunctionData() : FunctionData("", -1) {}
   };
 
   const DeviceMgr* const device_mgr_;
@@ -158,8 +168,10 @@ class ProcessFunctionLibraryRuntime {
   // Holds all the function invocations here.
   std::unordered_map<string, FunctionLibraryRuntime::Handle> table_
       GUARDED_BY(mu_);
-  std::vector<FunctionData> function_data_ GUARDED_BY(mu_);
+  std::unordered_map<FunctionLibraryRuntime::Handle, FunctionData>
+      function_data_ GUARDED_BY(mu_);
   std::unordered_map<Device*, std::unique_ptr<FunctionLibraryRuntime>> flr_map_;
+  int next_handle_ GUARDED_BY(mu_);
   DistributedFunctionLibraryRuntime* const parent_;
 };
 
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
index 6bc8f980c7ab508f80a7c85a8e557880b8a4ab58..270e46dfe901a985629b452a2747fa654cb4135d 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
@@ -82,6 +82,22 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
 
     EXPECT_GE(call_count, 1);  // Test runner is used.
 
+    // Release the handle and then try running the function. It shouldn't
+    // succeed.
+    status = proc_flr_->ReleaseHandle(handle);
+    if (!status.ok()) {
+      return status;
+    }
+    Notification done2;
+    proc_flr_->Run(opts, handle, args, &out,
+                   [&status, &done2](const Status& s) {
+                     status = s;
+                     done2.Notify();
+                   });
+    done2.WaitForNotification();
+    EXPECT_TRUE(errors::IsNotFound(status));
+    EXPECT_TRUE(StringPiece(status.error_message()).contains("not found."));
+
     return Status::OK();
   }
 
diff --git a/tensorflow/core/common_runtime/renamed_device.cc b/tensorflow/core/common_runtime/renamed_device.cc
index fa9713735edd05c36e1787be0e8c89e69c043fb2..56766a8df4526cb2d6fb20c5dcd461a65d2a994b 100644
--- a/tensorflow/core/common_runtime/renamed_device.cc
+++ b/tensorflow/core/common_runtime/renamed_device.cc
@@ -21,7 +21,8 @@ namespace tensorflow {
 /* static */
 Device* RenamedDevice::NewRenamedDevice(const string& new_base,
                                         Device* underlying,
-                                        bool owns_underlying) {
+                                        bool owns_underlying,
+                                        bool isolate_session_state) {
   DeviceNameUtils::ParsedName parsed_name;
   CHECK(DeviceNameUtils::ParseFullName(new_base, &parsed_name));
   DeviceNameUtils::ParsedName underlying_parsed_name =
@@ -35,15 +36,17 @@ Device* RenamedDevice::NewRenamedDevice(const string& new_base,
                                           parsed_name.id);
   DeviceAttributes attributes(underlying->attributes());
   attributes.set_name(name);
-  return new RenamedDevice(underlying, attributes, owns_underlying);
+  return new RenamedDevice(underlying, attributes, owns_underlying,
+                           isolate_session_state);
 }
 
 RenamedDevice::RenamedDevice(Device* underlying,
                              const DeviceAttributes& attributes,
-                             bool owns_underlying)
+                             bool owns_underlying, bool isolate_session_state)
     : Device(underlying->env(), attributes),
       underlying_(underlying),
-      owns_underlying_(owns_underlying) {}
+      owns_underlying_(owns_underlying),
+      isolate_session_state_(isolate_session_state) {}
 
 RenamedDevice::~RenamedDevice() {
   if (owns_underlying_) {
diff --git a/tensorflow/core/common_runtime/renamed_device.h b/tensorflow/core/common_runtime/renamed_device.h
index 3103ca07512d206b0a62dccb69e56266052d88a2..c5c204d4faff8c5016cc0a48fec266b06409b668 100644
--- a/tensorflow/core/common_runtime/renamed_device.h
+++ b/tensorflow/core/common_runtime/renamed_device.h
@@ -29,7 +29,9 @@ namespace tensorflow {
 class RenamedDevice : public Device {
  public:
   static Device* NewRenamedDevice(const string& new_base, Device* underlying,
-                                  bool owns_underlying);
+                                  bool owns_underlying,
+                                  bool isolate_session_state);
+
   ~RenamedDevice() override;
 
   // Below are virtual methods defined on DeviceBase
@@ -113,11 +115,21 @@ class RenamedDevice : public Device {
     return underlying_->FillContextMap(graph, device_context_map);
   }
 
+  // Returns the resource manager associated w/ this device.
+  ResourceMgr* resource_manager() override {
+    if (isolate_session_state_) {
+      return Device::resource_manager();
+    } else {
+      return underlying_->resource_manager();
+    }
+  }
+
  private:
   RenamedDevice(Device* underlying, const DeviceAttributes& attributes,
-                bool owns_underlying);
+                bool owns_underlying, bool isolate_session_state);
   Device* const underlying_;
   const bool owns_underlying_;
+  const bool isolate_session_state_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/rendezvous_util.cc b/tensorflow/core/common_runtime/rendezvous_util.cc
index a1e31016c2bc93aeae76175320255e0d43602265..92dc03812e9941e07500a9dc26baa7c1227430dc 100644
--- a/tensorflow/core/common_runtime/rendezvous_util.cc
+++ b/tensorflow/core/common_runtime/rendezvous_util.cc
@@ -32,6 +32,10 @@ Status SendTensorsToRendezvous(
         "; alloc_attrs.size() = ", alloc_attrs.size());
   }
 
+  if (!rendezvous) {
+    return errors::InvalidArgument("Rendezvous is null.");
+  }
+
   Rendezvous::ParsedKey parsed;
   for (int i = 0; i < keys.size(); ++i) {
     Rendezvous::Args rendez_args;
diff --git a/tensorflow/core/common_runtime/session_factory.cc b/tensorflow/core/common_runtime/session_factory.cc
index dba7a9253e9cc8837a1a471dab621475b1405a49..0234d4c37250d8ed3c645759dd17f94093e57df0 100644
--- a/tensorflow/core/common_runtime/session_factory.cc
+++ b/tensorflow/core/common_runtime/session_factory.cc
@@ -29,7 +29,7 @@ namespace tensorflow {
 namespace {
 
 static mutex* get_session_factory_lock() {
-  static mutex session_factory_lock;
+  static mutex session_factory_lock(LINKER_INITIALIZED);
   return &session_factory_lock;
 }
 
diff --git a/tensorflow/core/common_runtime/shape_refiner.cc b/tensorflow/core/common_runtime/shape_refiner.cc
index 10901da192f6ad9382f8b2e8dbcde2c2a3c53575..3ae52f414faf5c47531d6e64fd8666906ce0159a 100644
--- a/tensorflow/core/common_runtime/shape_refiner.cc
+++ b/tensorflow/core/common_runtime/shape_refiner.cc
@@ -127,7 +127,7 @@ Status InferShapesForFunctionSubNode(const Node* node, ShapeRefiner* refiner,
 //
 // NOTE: Recursive user-defined functions are not supported.
 // Maybe we won't support recursive functions at all in TF, because of
-// other maintanabilty issues.
+// other maintainability issues.
 Status ShapeRefiner::InferShapesForFunction(
     const tensorflow::FunctionDef* function_def, bool keep_nested_shapes,
     ExtendedInferenceContext* outer_context) {
@@ -335,10 +335,14 @@ Status ShapeRefiner::UpdateNode(const Node* node, bool relax, bool* refined) {
     InferenceContext* c = iter->second->get_context();
     DCHECK_GE(dst_input, 0);
     ShapeHandle existing_input = node_context->input(dst_input);
-    if (!relax && node_context->MergeInput(dst_input, c->output(src_output)) &&
-        !existing_input.SameHandle(node_context->input(dst_input))) {
-      *refined = true;
-    } else if (relax) {
+    if (!relax) {
+      if (node_context->MergeInput(dst_input, c->output(src_output))) {
+        if (!SameDefinedShape(node_context, node_context->input(dst_input),
+                              existing_input)) {
+          *refined = true;
+        }
+      }
+    } else {
       if (node_context->RelaxInput(dst_input, c->output(src_output))) {
         if (!SameDefinedShape(node_context, node_context->input(dst_input),
                               existing_input)) {
@@ -703,6 +707,8 @@ Status ShapeRefiner::ConstantPartialShape(InferenceContext* target_context,
     *result = target_context->Scalar();
   } else if (src_op == "Shape") {
     *result = src_context->input(0);
+  } else if (src_op == "ShapeN") {
+    *result = src_context->input(input_edge->src_output());
   } else if (src_op == "Pack") {
     std::vector<DimensionHandle> dims;
     // Pack is concatenating its input scalars to form the shape tensor vector.
@@ -865,15 +871,22 @@ Status ShapeRefiner::RunShapeFn(const Node* node,
 
 bool ShapeRefiner::SameDefinedShape(InferenceContext* c, ShapeHandle s0,
                                     ShapeHandle s1) {
-  if (!c->RankKnown(s0)) {
-    return !c->RankKnown(s1);
-  } else if (!c->RankKnown(s1) || c->Rank(s0) != c->Rank(s1)) {
+  if (s0.SameHandle(s1)) {
+    return true;
+  }
+  if (c->Rank(s0) != c->Rank(s1)) {
+    return false;
+  }
+  if (!c->RankKnown(s0) && !c->RankKnown(s1)) {
     return false;
   }
-
   for (int i = 0; i < c->Rank(s0); ++i) {
-    if (c->Value(c->Dim(s0, i)) != c->Value(c->Dim(s1, i))) {
-      return false;
+    if (!c->Dim(s0, i).SameHandle(c->Dim(s1, i))) {
+      int64 val0 = c->Value(c->Dim(s0, i));
+      int64 val1 = c->Value(c->Dim(s1, i));
+      if (val0 < 0 || val1 < 0 || val0 != val1) {
+        return false;
+      }
     }
   }
 
diff --git a/tensorflow/core/common_runtime/shape_refiner_test.cc b/tensorflow/core/common_runtime/shape_refiner_test.cc
index ff32e855d591707f822d4c8f6fc3c1adac3ac7de..e4eef1dbe28bc79d2838b90ba6595a04ad1e4e2e 100644
--- a/tensorflow/core/common_runtime/shape_refiner_test.cc
+++ b/tensorflow/core/common_runtime/shape_refiner_test.cc
@@ -1161,11 +1161,13 @@ TEST_F(ShapeRefinerTest, SameDefinedShape) {
   auto s_unknown_2 = ctx->MakeShape({-1, 2});
   auto s_unknown_2_b = ctx->MakeShape({-1, 2});
 
-  EXPECT_TRUE(SameDefinedShape(ctx, unknown, unknown_b));
+  EXPECT_TRUE(SameDefinedShape(ctx, unknown, unknown));
+  EXPECT_FALSE(SameDefinedShape(ctx, unknown, unknown_b));
   EXPECT_FALSE(SameDefinedShape(ctx, unknown, s_1_2));
   EXPECT_TRUE(SameDefinedShape(ctx, s_1_2, s_1_2_b));
   EXPECT_FALSE(SameDefinedShape(ctx, s_1_2, s_2_2));
-  EXPECT_TRUE(SameDefinedShape(ctx, s_unknown_2, s_unknown_2_b));
+  EXPECT_TRUE(SameDefinedShape(ctx, s_unknown_2, s_unknown_2));
+  EXPECT_FALSE(SameDefinedShape(ctx, s_unknown_2, s_unknown_2_b));
 }
 
 TEST_F(ShapeRefinerTest, IsUpdatedShapesOrTypes) {
@@ -1178,14 +1180,15 @@ TEST_F(ShapeRefinerTest, IsUpdatedShapesOrTypes) {
   TF_ASSERT_OK(m.AddNode(test));
   shape_inference::InferenceContext* ctx = m.GetContext(test);
 
+  shape_inference::ShapeHandle unknown = ctx->UnknownShape();
   std::vector<shape_inference::ShapeAndType> t0{
       {ctx->MakeShape({1, 2, 3}), DT_FLOAT},
-      {ctx->UnknownShape(), DT_INVALID},
+      {unknown, DT_INVALID},
       {ctx->MakeShape({4, 3, 2, 1}), DT_INT32}};
 
   std::vector<shape_inference::ShapeAndType> t1{
       {ctx->MakeShape({1, 2, 3}), DT_FLOAT},
-      {ctx->UnknownShape(), DT_INVALID},
+      {unknown, DT_INVALID},
       {ctx->MakeShape({4, 3, 2, 1}), DT_INT32}};
 
   std::vector<shape_inference::ShapeAndType> t2{
@@ -1256,10 +1259,10 @@ TEST_F(ShapeRefinerTest, IncrementalUpdates) {
       0, std::vector<shape_inference::ShapeAndType>{{shp, DT_FLOAT}});
   refined = false;
   TF_ASSERT_OK(m.UpdateNode(dequeue, true /* relax */, &refined));
-  EXPECT_FALSE(refined);
+  EXPECT_TRUE(refined);
   ctx = m.GetContext(dequeue);
   EXPECT_EQ("[?,7]", ctx->DebugString(ctx->output(0)));
-  EXPECT_FALSE(SameHandle(ctx->Dim(ctx->output(0), 0), ctx->Dim(shp, 0)));
+  EXPECT_TRUE(SameHandle(ctx->Dim(ctx->output(0), 0), ctx->Dim(shp, 0)));
 
   // Inject a shape of the same handle and expect refined to not change.
   ctx = m.GetContext(queue);
diff --git a/tensorflow/core/common_runtime/step_stats_collector.cc b/tensorflow/core/common_runtime/step_stats_collector.cc
index bfe7a32b1b46739ce2b000765c2563fc937a280a..d7e01144c9ef3aa09ddd212947eafe48ccff555b 100644
--- a/tensorflow/core/common_runtime/step_stats_collector.cc
+++ b/tensorflow/core/common_runtime/step_stats_collector.cc
@@ -150,7 +150,7 @@ void StepStatsCollector::BuildCostModel(
     const DeviceStepStats* hardware_stats;
   };
 
-  std::unordered_map<StringPiece, DeviceStats, StringPiece::Hasher>
+  std::unordered_map<StringPiece, DeviceStats, StringPieceHasher>
       per_device_stats;
   std::unordered_map<int, const DeviceStepStats*> gpu_hardware_stats;
 
@@ -190,7 +190,7 @@ void StepStatsCollector::BuildCostModel(
     CostModel* cm = cost_model_manager->FindOrCreateCostModel(graph);
     cm->IncrementUpdateTimes();
 
-    std::unordered_map<StringPiece, Node*, StringPiece::Hasher> name_to_node;
+    std::unordered_map<StringPiece, Node*, StringPieceHasher> name_to_node;
     for (Node* n : graph->nodes()) {
       name_to_node.emplace(n->name(), n);
     }
diff --git a/tensorflow/core/debug/BUILD b/tensorflow/core/debug/BUILD
index 525f96a3de3a9d7f9b4929d22d8db45bac4c5174..a32badef6dfdb8b62662da880c99842b1cafd13c 100644
--- a/tensorflow/core/debug/BUILD
+++ b/tensorflow/core/debug/BUILD
@@ -56,6 +56,7 @@ tf_proto_library(
     cc_grpc_version = 1,
     protodeps = [
         ":debugger_event_metadata_proto",
+        "//tensorflow/core/profiler:protos_all",
     ] + tf_additional_all_protos(),
     visibility = ["//tensorflow:__subpackages__"],
 )
@@ -89,9 +90,9 @@ tf_cuda_library(
     deps = [
         ":debug",
         "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:device_tracer",
         "//tensorflow/core:direct_session_internal",
         "//tensorflow/core:framework",
-        "//tensorflow/core:gpu_tracer",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:proto_text",
@@ -123,6 +124,7 @@ tf_cuda_library(
     deps = [
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:proto_text",
@@ -144,6 +146,7 @@ tf_cuda_library(
         ":debugger_event_metadata_proto_cc",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:proto_text",
diff --git a/tensorflow/core/debug/debug_gateway_test.cc b/tensorflow/core/debug/debug_gateway_test.cc
index 3903040e4d936dd23f97a7c82c06d7524b9c98a2..57583349069a0b4deb137cb09564cdbb3909a4b0 100644
--- a/tensorflow/core/debug/debug_gateway_test.cc
+++ b/tensorflow/core/debug/debug_gateway_test.cc
@@ -40,6 +40,9 @@ std::unique_ptr<DirectSession> CreateSession() {
   options.config.mutable_graph_options()
       ->mutable_rewrite_options()
       ->set_constant_folding(RewriterConfig::OFF);
+  options.config.mutable_graph_options()
+      ->mutable_rewrite_options()
+      ->set_dependency_optimization(RewriterConfig::OFF);
 
   return std::unique_ptr<DirectSession>(
       dynamic_cast<DirectSession*>(NewSession(options)));
@@ -55,7 +58,7 @@ class SessionDebugMinusAXTest : public ::testing::Test {
 #elif defined(TENSORFLOW_USE_SYCL)
     const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0";
 #else
-    const string kDeviceName = "/job:localhost/replica:0/task:0/cpu:0";
+    const string kDeviceName = "/job:localhost/replica:0/task:0/device:CPU:0";
 #endif
 
     Tensor a_tensor(DT_FLOAT, TensorShape({2, 2}));
@@ -503,7 +506,7 @@ TEST_F(SessionDebugMinusAXTest,
 }
 #endif
 
-class SessionDebugOutputSlotWithoutOngoingEdgeTest : public ::testing::Test {
+class SessionDebugOutputSlotWithoutOutgoingEdgeTest : public ::testing::Test {
  public:
   void Initialize() {
     Graph graph(OpRegistry::Global());
@@ -513,7 +516,7 @@ class SessionDebugOutputSlotWithoutOngoingEdgeTest : public ::testing::Test {
 #elif defined(TENSORFLOW_USE_SYCL)
     const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0";
 #else
-    const string kDeviceName = "/job:localhost/replica:0/task:0/cpu:0";
+    const string kDeviceName = "/job:localhost/replica:0/task:0/device:CPU:0";
 #endif
 
     Tensor a_tensor(DT_FLOAT, TensorShape({1, 1}));
@@ -540,7 +543,7 @@ class SessionDebugOutputSlotWithoutOngoingEdgeTest : public ::testing::Test {
   GraphDef def_;
 };
 
-TEST_F(SessionDebugOutputSlotWithoutOngoingEdgeTest,
+TEST_F(SessionDebugOutputSlotWithoutOutgoingEdgeTest,
        WatchSlotWithoutOutgoingEdge) {
   Initialize();
   auto session = CreateSession();
@@ -615,7 +618,7 @@ class SessionDebugVariableTest : public ::testing::Test {
 #elif defined(TENSORFLOW_USE_SYCL)
     const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0";
 #else
-    const string kDeviceName = "/job:localhost/replica:0/task:0/cpu:0";
+    const string kDeviceName = "/job:localhost/replica:0/task:0/device:CPU:0";
 #endif
 
     // Define variable node.
diff --git a/tensorflow/core/debug/debug_io_utils.cc b/tensorflow/core/debug/debug_io_utils.cc
index 85d04daa6592afbfd024d1aeec07ad43088db19b..f81445c20bd2ba56a6d7d3bb4ddefc71f5199784 100644
--- a/tensorflow/core/debug/debug_io_utils.cc
+++ b/tensorflow/core/debug/debug_io_utils.cc
@@ -736,7 +736,7 @@ Status DebugGrpcChannel::ReceiveServerRepliesAndClose() {
   }
 }
 
-mutex DebugGrpcIO::streams_mu;
+mutex DebugGrpcIO::streams_mu(LINKER_INITIALIZED);
 
 int64 DebugGrpcIO::channel_connection_timeout_micros = 900 * 1000 * 1000;
 // TODO(cais): Make this configurable?
diff --git a/tensorflow/core/debug/debug_service.proto b/tensorflow/core/debug/debug_service.proto
index 547c0576f08769f9e373a98231caf172a9312937..4bef74dfc5706b0033ff91b5e6cf09bb119d657d 100644
--- a/tensorflow/core/debug/debug_service.proto
+++ b/tensorflow/core/debug/debug_service.proto
@@ -18,6 +18,8 @@ syntax = "proto3";
 package tensorflow;
 
 import "tensorflow/core/framework/tensor.proto";
+import "tensorflow/core/profiler/tfprof_log.proto";
+import "tensorflow/core/protobuf/debug.proto";
 import "tensorflow/core/util/event.proto";
 
 // Reply message from EventListener to the client, i.e., to the source of the
@@ -46,6 +48,38 @@ message EventReply {
   // during debugging.
 }
 
+// Data on the traceback of a debugged call, e.g., a Session.run() call, or the
+// execution of an eager operation.
+message CallTraceback {
+  enum CallType {
+    UNSPECIFIED = 0;
+    GRAPH_EXECUTION = 1;
+    EAGER_EXECUTION = 2;
+  }
+
+  CallType call_type = 1;
+
+  // A key for the call. For example, for graph execution, this is a key
+  // consisting of the names of the fed and fetched tensors.
+  string call_key = 2;
+
+  // Traceback stack for the origin of the call event.
+  // For graph execution, this is the stack of the Session.run() call.
+  // For eager execution, this is the stack of the Python line that invokes
+  // the execution of the eager op.
+  tfprof.CodeDef origin_stack = 3;
+
+  // Keeps track of the mapping from integer IDs in `origin_stack` to actual
+  // string values (e.g., file paths, function names).
+  map<int64, string> origin_id_to_string = 4;
+
+  // Traceback for the graph (if any) involved in the call.
+  tfprof.OpLogProto graph_traceback = 5;
+
+  // Version of the graph in `graph_traceback` (if any).
+  int64 graph_version = 6;
+}
+
 // EventListener: Receives Event protos, e.g., from debugged TensorFlow
 // runtime(s).
 service EventListener {
@@ -57,4 +91,10 @@ service EventListener {
   //      ops that get executed immediately after the beginning of the graph
   //      execution.
   rpc SendEvents(stream Event) returns (stream EventReply);
+
+  // Send the tracebacks of a TensorFlow execution call.
+  rpc SendTracebacks(CallTraceback) returns (EventReply);
+
+  // Send a collection of source code files being debugged.
+  rpc SendSourceFiles(DebuggedSourceFiles) returns (EventReply);
 }
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index 93adc7ef4f047beeee0a002f362415c70fb82456..2db7ebd7952c9e1edf374267ee33f697eb846885 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -140,6 +140,7 @@ cc_library(
     hdrs = ["session_mgr.h"],
     deps = [
         ":graph_mgr",
+        ":worker_cache_wrapper",
         ":worker_session",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
@@ -263,6 +264,16 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "worker_cache_wrapper",
+    hdrs = ["worker_cache_wrapper.h"],
+    deps = [
+        ":worker_cache",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
 cc_library(
     name = "remote_device",
     srcs = ["remote_device.cc"],
@@ -323,6 +334,7 @@ cc_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:master_proto_cc",
@@ -360,6 +372,7 @@ cc_library(
     deps = [
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:tensorflow_opensource",
     ],
 )
@@ -403,6 +416,7 @@ cc_library(
         ":worker_env",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc
index 593fe0e363edc543a74572ed51128777e048a47d..d84b69d06b77b03dee6e1041e7189ec6f3fb8682 100644
--- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc
+++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc
@@ -105,6 +105,7 @@ Status ClusterFunctionLibraryRuntime::ConstructFunctionGraph(
         Rendezvous::CreateKey(target, 1 /* src_incarnation */, target,
                               out.name(), FrameAndIter(0, 0));
     recv_keys->push_back(key);
+    ++i;
   }
   return Status::OK();
 }
@@ -124,8 +125,11 @@ Status ClusterFunctionLibraryRuntime::Instantiate(
   WorkerInterface* wi = worker_session_->worker_cache->CreateWorker(target);
 
   if (wi == nullptr) {
-    return errors::InvalidArgument("Could not find worker with target: ",
-                                   target);
+    std::vector<string> workers;
+    worker_session_->worker_cache->ListWorkers(&workers);
+    return errors::InvalidArgument(
+        "Could not find worker with target: ", target,
+        " Available workers: ", str_util::Join(workers, ", "));
   }
 
   // Make RPC and obtain a graph handle.
diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc b/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc
index 04587dd8ca8638d031d840b0b53b5168bdab63c2..6dd8b9ec73778baea0ed2876ac5111e9fd331dcf 100644
--- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc
+++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc
@@ -103,14 +103,54 @@ TEST_F(ClusterFunctionLibraryRuntimeTest, ConstructFunctionGraph) {
   GraphDef actual;
   std::vector<string> send_keys, recv_keys;
   TF_CHECK_OK(ConstructFunctionGraphHelper(
-      test::function::XTimesTwo().signature(),
+      test::function::Swap().signature(),
       {{"T", DT_FLOAT}, {"_target", "/job:a/replica:0/task:0/cpu:0"}}, &actual,
       &send_keys, &recv_keys));
-
   GraphDef expected;
   protobuf::TextFormat::ParseFromString(R"(
 node {
-  name: "_recv_x_0"
+  name: "_recv_i0_0"
+  op: "_Recv"
+  device: "/job:a/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "client_terminated"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "recv_device"
+    value {
+      s: "/job:a/replica:0/task:0/device:CPU:0"
+    }
+  }
+  attr {
+    key: "send_device"
+    value {
+      s: "/job:a/replica:0/task:0/device:CPU:0"
+    }
+  }
+  attr {
+    key: "send_device_incarnation"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "tensor_name"
+    value {
+      s: "i0"
+    }
+  }
+  attr {
+    key: "tensor_type"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "_recv_i1_1"
   op: "_Recv"
   device: "/job:a/replica:0/task:0/device:CPU:0"
   attr {
@@ -140,7 +180,7 @@ node {
   attr {
     key: "tensor_name"
     value {
-      s: "x"
+      s: "i1"
     }
   }
   attr {
@@ -151,9 +191,10 @@ node {
   }
 }
 node {
-  name: "XTimesTwo"
-  op: "XTimesTwo"
-  input: "_recv_x_0"
+  name: "Swap"
+  op: "Swap"
+  input: "_recv_i0_0"
+  input: "_recv_i1_1"
   device: "/job:a/replica:0/task:0/device:CPU:0"
   attr {
     key: "T"
@@ -163,15 +204,57 @@ node {
   }
   attr {
     key: "_target"
+    value {
+      s: "/job:a/replica:0/task:0/cpu:0"
+    }
+  }
+}
+node {
+  name: "_send_o0_0"
+  op: "_Send"
+  input: "Swap"
+  device: "/job:a/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "client_terminated"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "recv_device"
+    value {
+      s: "/job:a/replica:0/task:0/device:CPU:0"
+    }
+  }
+  attr {
+    key: "send_device"
     value {
       s: "/job:a/replica:0/task:0/device:CPU:0"
     }
   }
+  attr {
+    key: "send_device_incarnation"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "tensor_name"
+    value {
+      s: "o0"
+    }
+  }
 }
 node {
-  name: "_send_y_0"
+  name: "_send_o1_1"
   op: "_Send"
-  input: "XTimesTwo"
+  input: "Swap:1"
   device: "/job:a/replica:0/task:0/device:CPU:0"
   attr {
     key: "T"
@@ -206,10 +289,11 @@ node {
   attr {
     key: "tensor_name"
     value {
-      s: "y"
+      s: "o1"
     }
   }
-})",
+}
+)",
                                         &expected);
   TF_EXPECT_GRAPH_EQ(expected, actual);
 }
@@ -234,16 +318,18 @@ TEST_F(ClusterFunctionLibraryRuntimeTest, DISABLED_InstantiateAndRun) {
 TEST_F(ClusterFunctionLibraryRuntimeTest,
        DISABLED_InstantiateAndRunAttrSubstitution) {
   FunctionDefLibrary proto;
-  *(proto.add_function()) = test::function::XTimesTwo();
+  *(proto.add_function()) = test::function::Swap();
   FunctionLibraryDefinition lib_def(OpRegistry::Global(), proto);
 
-  Tensor y;
-  auto x = test::AsTensor<float>({1, 2, 3, 4});
+  Tensor y1, y2;
+  auto x1 = test::AsTensor<float>({1, 2, 3, 4});
+  auto x2 = test::AsTensor<float>({4, 3, 2, 1});
   TF_EXPECT_OK(InstantiateAndRun(
-      "XTimesTwo", lib_def,
+      "Swap", lib_def,
       {{"T", DT_FLOAT}, {"_target", "/job:localhost/replica:0/task:1/cpu:0"}},
-      {x}, {&y}));
-  test::ExpectTensorEqual<float>(y, test::AsTensor<float>({2, 4, 6, 8}));
+      {x1, x2}, {&y1, &y2}));
+  test::ExpectTensorEqual<float>(y1, test::AsTensor<float>({4, 3, 2, 1}));
+  test::ExpectTensorEqual<float>(y2, test::AsTensor<float>({1, 2, 3, 4}));
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/graph_mgr.cc b/tensorflow/core/distributed_runtime/graph_mgr.cc
index 60d58af61dad56fbb09df041fb5ca1429fd451ad..0120f612ac8bee32999304b1a6f63fff3802606a 100644
--- a/tensorflow/core/distributed_runtime/graph_mgr.cc
+++ b/tensorflow/core/distributed_runtime/graph_mgr.cc
@@ -228,8 +228,14 @@ Status GraphMgr::InitItem(const string& session, const GraphDef& gdef,
     params.function_library = lib;
     params.create_kernel = [session, lib, opseg](const NodeDef& ndef,
                                                  OpKernel** kernel) {
-      // Caches the kernel only if the node is stateful.
-      if (!lib->IsStateful(ndef.op())) {
+      // We do not share the kernel via the OpSegment if the node is
+      // stateless, or a function.
+      // NOTE(mrry): We must not share function kernels (implemented
+      // using `CallOp`) between subgraphs, because `CallOp::handle_`
+      // is tied to a particular subgraph. Even if the function itself
+      // is stateful, the `CallOp` that invokes it is not.
+      if (!lib->IsStateful(ndef.op()) ||
+          lib->GetFunctionLibraryDefinition()->Find(ndef.op()) != nullptr) {
         return lib->CreateKernel(ndef, kernel);
       }
       auto create_fn = [lib, &ndef](OpKernel** kernel) {
@@ -475,8 +481,18 @@ void GraphMgr::StartParallelExecutors(const string& handle, int64 step_id,
   using std::placeholders::_1;
   // Line below is equivalent to this code, but does one less indirect call:
   //  args.runner = [pool](std::function<void()> fn) { pool->Schedule(fn); };
-  args.runner = std::bind(&thread::ThreadPool::Schedule, pool, _1);
+  auto default_runner = std::bind(&thread::ThreadPool::Schedule, pool, _1);
   for (const auto& unit : item->units) {
+    // TODO(zhengxq): if the device picks its own threadpool, we need to assign
+    //     less threads to the main compute pool by default.
+    thread::ThreadPool* device_thread_pool =
+        unit.device->tensorflow_device_thread_pool();
+    if (!device_thread_pool) {
+      args.runner = default_runner;
+    } else {
+      args.runner =
+          std::bind(&thread::ThreadPool::Schedule, device_thread_pool, _1);
+    }
     unit.root->RunAsync(args, barrier->Get());
   }
 }
diff --git a/tensorflow/core/distributed_runtime/local_master.cc b/tensorflow/core/distributed_runtime/local_master.cc
index c7ba7abeaffc654b24adfcc320ed45990cf5bc77..aaa4cfa7341c42bf9f7302e8ef30a28b68e6213c 100644
--- a/tensorflow/core/distributed_runtime/local_master.cc
+++ b/tensorflow/core/distributed_runtime/local_master.cc
@@ -159,7 +159,7 @@ Status LocalMaster::Reset(CallOptions* call_options,
 
 namespace {
 mutex* get_local_master_registry_lock() {
-  static mutex local_master_registry_lock;
+  static mutex local_master_registry_lock(LINKER_INITIALIZED);
   return &local_master_registry_lock;
 }
 
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 91a1fa7d1e1292b9c1149a456a212dc14712aec0..03b65d8cba9112e272f52518ca6050ce5f16eb5d 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -67,13 +67,14 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
                     const SessionOptions& session_opts,
                     const StatsPublisherFactory& stats_publisher_factory,
                     GraphExecutionState* execution_state, bool is_partial,
-                    WorkerCacheInterface* worker_cache)
+                    WorkerCacheInterface* worker_cache, bool should_deregister)
       : session_handle_(handle),
         client_graph_(std::move(cg)),
         session_opts_(session_opts),
         is_partial_(is_partial),
         debug_opts_(bopts.debug_options),
-        worker_cache_(worker_cache) {
+        worker_cache_(worker_cache),
+        should_deregister_(should_deregister) {
     VLOG(1) << "Created ReffedClientGraph for node with "
             << client_graph()->graph.num_node_ids();
 
@@ -85,7 +86,11 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
     }
   }
 
-  ~ReffedClientGraph() override { DeregisterPartitions(); }
+  ~ReffedClientGraph() override {
+    if (should_deregister_) {
+      DeregisterPartitions();
+    }
+  }
 
   const ClientGraph* client_graph() { return client_graph_.get(); }
 
@@ -208,7 +213,8 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
   const bool is_partial_;
   const DebugOptions& debug_opts_;
   WorkerCacheInterface* const worker_cache_;  // Not owned.
-  std::unordered_map<StringPiece, Node*, StringPiece::Hasher> name_to_node_;
+  std::unordered_map<StringPiece, Node*, StringPieceHasher> name_to_node_;
+  const bool should_deregister_;
 
   // Graph partitioned into per-location subgraphs.
   struct Part {
@@ -486,7 +492,7 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
   VLOG(2) << "RunPartitions step_id " << step_id << " execution_count "
           << execution_count;
   // Maps the names of fed tensors to their index in `req`.
-  std::unordered_map<StringPiece, size_t, StringPiece::Hasher> feeds(3);
+  std::unordered_map<StringPiece, size_t, StringPieceHasher> feeds(3);
 
   for (size_t i = 0; i < req.num_feeds(); ++i) {
     if (!feeds.insert({req.feed_name(i), i}).second) {
@@ -1043,7 +1049,10 @@ Status MasterSession::Create(GraphDef* graph_def,
     TF_RETURN_IF_ERROR(GraphExecutionState::MakeForBaseGraph(
         graph_def, execution_options, &execution_state_));
   }
-  if (options.cluster_def != nullptr) {
+  // TODO(b/36574172): Remove these conditions when ClusterSpec
+  // propagation is supported in all servers.
+  if (options.cluster_def != nullptr ||
+      session_opts_.config.isolate_session_state()) {
     should_delete_worker_sessions_ = true;
     return CreateWorkerSessions(options);
   }
@@ -1052,10 +1061,9 @@ Status MasterSession::Create(GraphDef* graph_def,
 
 Status MasterSession::CreateWorkerSessions(
     const WorkerCacheFactoryOptions& options) {
-  CHECK(worker_cache_) << "CreateWorkerSessions should be called only with "
-                       << "dynamic cluster membership.";
   std::vector<string> worker_names;
-  worker_cache_->ListWorkers(&worker_names);
+  WorkerCacheInterface* worker_cache = get_worker_cache();
+  worker_cache->ListWorkers(&worker_names);
 
   struct WorkerGroup {
     // The worker name. (Not owned.)
@@ -1073,10 +1081,10 @@ Status MasterSession::CreateWorkerSessions(
   std::vector<WorkerGroup> workers(worker_names.size());
 
   // Release the workers.
-  auto cleanup = gtl::MakeCleanup([this, &workers] {
+  auto cleanup = gtl::MakeCleanup([this, &workers, worker_cache] {
     for (auto&& worker_group : workers) {
       if (worker_group.worker != nullptr) {
-        worker_cache_->ReleaseWorker(*worker_group.name, worker_group.worker);
+        worker_cache->ReleaseWorker(*worker_group.name, worker_group.worker);
       }
     }
   });
@@ -1085,11 +1093,19 @@ Status MasterSession::CreateWorkerSessions(
   // Create all the workers & kick off the computations.
   for (size_t i = 0; i < worker_names.size(); ++i) {
     workers[i].name = &worker_names[i];
-    workers[i].worker = worker_cache_->CreateWorker(worker_names[i]);
+    workers[i].worker = worker_cache->CreateWorker(worker_names[i]);
     workers[i].request.set_session_handle(handle_);
-    *workers[i].request.mutable_server_def()->mutable_cluster() =
-        *options.cluster_def;
-    workers[i].request.mutable_server_def()->set_protocol(*options.protocol);
+    if (options.cluster_def) {
+      *workers[i].request.mutable_server_def()->mutable_cluster() =
+          *options.cluster_def;
+      workers[i].request.mutable_server_def()->set_protocol(*options.protocol);
+      // Session state is always isolated when ClusterSpec propagation
+      // is in use.
+      workers[i].request.set_isolate_session_state(true);
+    } else {
+      workers[i].request.set_isolate_session_state(
+          session_opts_.config.isolate_session_state());
+    }
 
     DeviceNameUtils::ParsedName name;
     if (!DeviceNameUtils::ParseFullName(worker_names[i], &name)) {
@@ -1156,7 +1172,7 @@ Status MasterSession::DeleteWorkerSessions() {
   // Create all the workers & kick off the computations.
   for (size_t i = 0; i < worker_names.size(); ++i) {
     workers[i].name = &worker_names[i];
-    workers[i].worker = worker_cache_->CreateWorker(worker_names[i]);
+    workers[i].worker = worker_cache->CreateWorker(worker_names[i]);
     workers[i].request.set_session_handle(handle_);
   }
 
@@ -1262,7 +1278,7 @@ Status MasterSession::StartStep(const BuildGraphOptions& opts, int64* count,
       auto entry = new ReffedClientGraph(
           handle_, opts, std::move(client_graph), session_opts_,
           stats_publisher_factory_, execution_state_.get(), is_partial,
-          worker_cache);
+          worker_cache, !should_delete_worker_sessions_);
       iter = m->insert({hash, entry}).first;
       VLOG(1) << "Preparing to execute new graph";
     }
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.cc b/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.cc
index 5639691804f58f8dfaa0a2d0eba5e1095ffb1534..e51894b4c756b6f4cfc09fe0adf57e06cb22ee0f 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.cc
@@ -214,22 +214,13 @@ void EncodeTensorToByteBuffer(bool is_dead, const Tensor& val,
 
     if (tensor_data_is_large) {
       // (E) Encode tensor data, but by sharing backing store
-
-      // TODO(vpai): Use the pure C++ ::grpc::Slice constructor that uses
-      // grpc_slice_new_with_user_data once TensorFlow pins a version of gRPC
-      // that includes https://github.com/grpc/grpc/pull/12065
-
       const TensorBuffer* buf = DMAHelper::buffer(&val);
       buf->Ref();
       slices[1] = ::grpc::Slice(
-          grpc_slice_new_with_user_data(
-              const_cast<void*>(static_cast<const void*>(tdata.data())),
-              tdata.size(),
-              [](void* backing) {
-                static_cast<TensorBuffer*>(backing)->Unref();
-              },
-              const_cast<TensorBuffer*>(buf)),
-          ::grpc::Slice::STEAL_REF);
+          const_cast<void*>(static_cast<const void*>(tdata.data())),
+          tdata.size(),
+          [](void* backing) { static_cast<TensorBuffer*>(backing)->Unref(); },
+          const_cast<TensorBuffer*>(buf));
       num_slices += 1;
     }
     size_t total_bytes = 0;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util.h b/tensorflow/core/distributed_runtime/rpc/grpc_util.h
index 0ddcd89130b3b1b1209c255b6200d8ce88d4cb7c..ac0a33a2b9cbe2ba415a0f6cd7d94aee1fb142ac 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_util.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util.h
@@ -28,10 +28,30 @@ limitations under the License.
 
 namespace tensorflow {
 
+constexpr char kStreamRemovedMessage[] = "Stream removed";
+
+// Identify if the given grpc::Status corresponds to an HTTP stream removed
+// error (see chttp2_transport.cc).
+//
+// When auto-reconnecting to a remote TensorFlow worker after it restarts, gRPC
+// can return an UNKNOWN error code with a "Stream removed" error message.
+// This should not be treated as an unrecoverable error.
+//
+// N.B. This is dependent on the error message from grpc remaining consistent.
+inline bool IsStreamRemovedError(const ::grpc::Status& s) {
+  return !s.ok() && s.error_code() == ::grpc::StatusCode::UNKNOWN &&
+         s.error_message() == kStreamRemovedMessage;
+}
+
 inline Status FromGrpcStatus(const ::grpc::Status& s) {
   if (s.ok()) {
     return Status::OK();
   } else {
+    // Convert "UNKNOWN" stream removed errors into unavailable, to allow
+    // for retry upstream.
+    if (IsStreamRemovedError(s)) {
+      return Status(tensorflow::error::UNAVAILABLE, s.error_message());
+    }
     return Status(static_cast<tensorflow::error::Code>(s.error_code()),
                   s.error_message());
   }
diff --git a/tensorflow/core/distributed_runtime/server_lib.cc b/tensorflow/core/distributed_runtime/server_lib.cc
index 0b7fed79cd8fc8f987858beb957b64b461b6272a..7d308bb723a71e23482b6f52fa6d8fa53f89dda8 100644
--- a/tensorflow/core/distributed_runtime/server_lib.cc
+++ b/tensorflow/core/distributed_runtime/server_lib.cc
@@ -24,7 +24,7 @@ namespace tensorflow {
 
 namespace {
 mutex* get_server_factory_lock() {
-  static mutex server_factory_lock;
+  static mutex server_factory_lock(LINKER_INITIALIZED);
   return &server_factory_lock;
 }
 
diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc
index b97749dc41e46e500da2e656406a6b0362013969..fabcbd00f5e59a68a8db54c441dcc74377c44617 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr.cc
@@ -20,7 +20,10 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/renamed_device.h"
 #include "tensorflow/core/distributed_runtime/graph_mgr.h"
+#include "tensorflow/core/distributed_runtime/worker_cache_wrapper.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/protobuf/cluster.pb.h"
+#include "tensorflow/core/protobuf/tensorflow_server.pb.h"
 
 namespace tensorflow {
 
@@ -29,7 +32,10 @@ SessionMgr::SessionMgr(
     std::unique_ptr<WorkerCacheInterface> default_worker_cache,
     WorkerCacheFactory worker_cache_factory)
     : worker_env_(worker_env),
-      legacy_session_("", default_worker_name, std::move(default_worker_cache),
+      default_worker_cache_(std::move(default_worker_cache)),
+      legacy_session_("", default_worker_name,
+                      std::unique_ptr<WorkerCacheInterface>(
+                          new WorkerCacheWrapper(default_worker_cache_.get())),
                       std::unique_ptr<DeviceMgr>(worker_env->device_mgr),
                       std::unique_ptr<GraphMgr>(
                           new GraphMgr(worker_env, worker_env->device_mgr))),
@@ -41,7 +47,8 @@ string SessionMgr::WorkerNameFromServerDef(const ServerDef& server_def) {
 }
 
 Status SessionMgr::CreateSession(const string& session,
-                                 const ServerDef& server_def) {
+                                 const ServerDef& server_def,
+                                 bool isolate_session_state) {
   mutex_lock l(mu_);
   if (session.empty()) {
     return errors::InvalidArgument("Session must be non-empty.");
@@ -50,12 +57,18 @@ Status SessionMgr::CreateSession(const string& session,
   const string worker_name = WorkerNameFromServerDef(server_def);
 
   WorkerCacheInterface* worker_cache = nullptr;
-  TF_RETURN_IF_ERROR(worker_cache_factory_(server_def, &worker_cache));
+  if (server_def.cluster().job().empty()) {
+    worker_cache = new WorkerCacheWrapper(default_worker_cache_.get());
+  } else {
+    TF_RETURN_IF_ERROR(worker_cache_factory_(server_def, &worker_cache));
+  }
 
+  CHECK(!worker_env_->local_devices.empty())
+      << "The WorkerEnv must have at least one device in `local_devices`.";
   std::vector<Device*> renamed_devices;
   for (Device* d : worker_env_->local_devices) {
-    renamed_devices.push_back(
-        RenamedDevice::NewRenamedDevice(worker_name, d, false));
+    renamed_devices.push_back(RenamedDevice::NewRenamedDevice(
+        worker_name, d, false, isolate_session_state));
   }
   std::unique_ptr<DeviceMgr> device_mgr(new DeviceMgr(renamed_devices));
 
diff --git a/tensorflow/core/distributed_runtime/session_mgr.h b/tensorflow/core/distributed_runtime/session_mgr.h
index c44bca7b7a407957b1a36d7659f2b35ea0b30d07..d85b6c305941014fb52c4b4da6d646a707054c3a 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.h
+++ b/tensorflow/core/distributed_runtime/session_mgr.h
@@ -45,7 +45,8 @@ class SessionMgr {
   ~SessionMgr() {}
 
   // Allocates state for a new session.
-  Status CreateSession(const string& session, const ServerDef& server_def);
+  Status CreateSession(const string& session, const ServerDef& server_def,
+                       bool isolate_session_state);
 
   // Locates the worker session for a given session handle
   WorkerSession* WorkerSessionForSession(const string& session);
@@ -71,6 +72,7 @@ class SessionMgr {
   // legacy_session_ is deleted. Further, we must ensure that WorkerSession's
   // device_mgr is deleted after WorkerSession's graph_mgr.
 
+  std::unique_ptr<WorkerCacheInterface> default_worker_cache_;
   WorkerSession legacy_session_;
 
   const WorkerCacheFactory worker_cache_factory_;
diff --git a/tensorflow/core/distributed_runtime/session_mgr_test.cc b/tensorflow/core/distributed_runtime/session_mgr_test.cc
index 7132f123a5943d0680743f3cc3bc17470f49d65d..ffe4809f2b10398ca4c7dc503dd82236cbc8dd18 100644
--- a/tensorflow/core/distributed_runtime/session_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr_test.cc
@@ -22,14 +22,36 @@ limitations under the License.
 
 namespace tensorflow {
 
+class FakeDevice : public Device {
+ private:
+  explicit FakeDevice(const DeviceAttributes& device_attributes)
+      : Device(nullptr, device_attributes) {}
+
+ public:
+  Status Sync() override { return errors::Unimplemented("FakeDevice::Sync()"); }
+
+  Allocator* GetAllocator(AllocatorAttributes attr) override { return nullptr; }
+
+  static std::unique_ptr<Device> MakeCPU(const string& name) {
+    DeviceAttributes device_attributes;
+    device_attributes.set_name(name);
+    device_attributes.set_device_type(DeviceType("FakeCPU").type());
+    return std::unique_ptr<Device>(new FakeDevice(device_attributes));
+  }
+};
+
 class SessionMgrTest : public ::testing::Test {
  protected:
   SessionMgrTest()
-      : mgr_(&env_, "/job:mnist/replica:0/task:0",
-             std::unique_ptr<WorkerCacheInterface>(),
-             factory_),
-        legacy_session_(mgr_.WorkerSessionForSession("novel_session_id")) {}
+      : device_(FakeDevice::MakeCPU(
+            "/job:mnist/replica:0/task:0/device:fakecpu:0")),
+        mgr_(&env_, "/job:mnist/replica:0/task:0",
+             std::unique_ptr<WorkerCacheInterface>(), factory_),
+        legacy_session_(mgr_.WorkerSessionForSession("novel_session_id")) {
+    env_.local_devices = {device_.get()};
+  }
 
+  std::unique_ptr<Device> device_;
   WorkerEnv env_;
   SessionMgr::WorkerCacheFactory factory_ =
       [](const ServerDef& server_def, WorkerCacheInterface** worker_cache) {
@@ -42,14 +64,48 @@ class SessionMgrTest : public ::testing::Test {
 
 TEST_F(SessionMgrTest, CreateSessionSimple) {
   ServerDef server_def;
+  server_def.set_job_name("worker");
+  server_def.set_task_index(3);
+
   string session_handle = "test_session_handle";
-  TF_EXPECT_OK(mgr_.CreateSession(session_handle, server_def));
+  TF_EXPECT_OK(mgr_.CreateSession(session_handle, server_def, true));
   WorkerSession* session = mgr_.WorkerSessionForSession(session_handle);
   EXPECT_NE(nullptr, session) << "Session for " << session_handle << "was null";
   EXPECT_NE(mgr_.LegacySession(), session);
   TF_EXPECT_OK(mgr_.DeleteSession(session_handle));
 }
 
+TEST_F(SessionMgrTest, CreateSessionIsolateSessionState) {
+  ServerDef server_def;
+  server_def.set_job_name("worker");
+  server_def.set_task_index(3);
+
+  TF_EXPECT_OK(mgr_.CreateSession("handle_1", server_def, false));
+  WorkerSession* session_1 = mgr_.WorkerSessionForSession("handle_1");
+  std::vector<Device*> devices_1 = session_1->device_mgr->ListDevices();
+  EXPECT_EQ(1, devices_1.size());
+
+  TF_EXPECT_OK(mgr_.CreateSession("handle_2", server_def, false));
+  WorkerSession* session_2 = mgr_.WorkerSessionForSession("handle_2");
+  std::vector<Device*> devices_2 = session_2->device_mgr->ListDevices();
+  EXPECT_EQ(1, devices_2.size());
+
+  TF_EXPECT_OK(mgr_.CreateSession("handle_3", server_def, true));
+  WorkerSession* session_3 = mgr_.WorkerSessionForSession("handle_3");
+  std::vector<Device*> devices_3 = session_3->device_mgr->ListDevices();
+  EXPECT_EQ(1, devices_3.size());
+
+  TF_EXPECT_OK(mgr_.CreateSession("handle_4", server_def, true));
+  WorkerSession* session_4 = mgr_.WorkerSessionForSession("handle_4");
+  std::vector<Device*> devices_4 = session_4->device_mgr->ListDevices();
+  EXPECT_EQ(1, devices_4.size());
+
+  EXPECT_EQ(devices_1[0]->resource_manager(), devices_2[0]->resource_manager());
+  EXPECT_NE(devices_1[0]->resource_manager(), devices_3[0]->resource_manager());
+  EXPECT_NE(devices_1[0]->resource_manager(), devices_4[0]->resource_manager());
+  EXPECT_NE(devices_3[0]->resource_manager(), devices_4[0]->resource_manager());
+}
+
 TEST_F(SessionMgrTest, LegacySession) {
   ServerDef server_def;
   string session_handle = "";
diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc
index 8bf87923ed4a8bd93b11b698908113a016e8e788..6cd92f5fe7a9edaef1ed7db0926281d1a91cdcf2 100644
--- a/tensorflow/core/distributed_runtime/worker.cc
+++ b/tensorflow/core/distributed_runtime/worker.cc
@@ -44,7 +44,8 @@ void Worker::CreateWorkerSessionAsync(const CreateWorkerSessionRequest* request,
                                       CreateWorkerSessionResponse* response,
                                       StatusCallback done) {
   Status s = env_->session_mgr->CreateSession(request->session_handle(),
-                                              request->server_def());
+                                              request->server_def(),
+                                              request->isolate_session_state());
   done(s);
 }
 
diff --git a/tensorflow/core/distributed_runtime/worker_cache_wrapper.h b/tensorflow/core/distributed_runtime/worker_cache_wrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..43c3b6285b9d1a76d5207537ccd1343928c59010
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/worker_cache_wrapper.h
@@ -0,0 +1,90 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_CACHE_WRAPPER_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_CACHE_WRAPPER_H_
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/distributed_runtime/worker_cache.h"
+
+namespace tensorflow {
+
+class WorkerCacheWrapper : public WorkerCacheInterface {
+ public:
+  WorkerCacheWrapper(WorkerCacheInterface* wrapped) : wrapped_(wrapped) {}
+
+  // Updates *workers with strings naming the remote worker tasks to
+  // which open channels have been established.
+  virtual void ListWorkers(std::vector<string>* workers) const {
+    return wrapped_->ListWorkers(workers);
+  }
+
+  // If "target" names a remote task for which an RPC channel exists
+  // or can be constructed, returns a pointer to a WorkerInterface object
+  // wrapping that channel. The returned value must be destroyed by
+  // calling `this->ReleaseWorker(target, ret)`
+  // TODO(mrry): rename this to GetOrCreateWorker() or something that
+  // makes it more obvious that this method returns a potentially
+  // shared object.
+  virtual WorkerInterface* CreateWorker(const string& target) {
+    return wrapped_->CreateWorker(target);
+  }
+
+  // Release a worker previously returned by this->CreateWorker(target).
+  //
+  // TODO(jeff,sanjay): Consider moving target into WorkerInterface.
+  // TODO(jeff,sanjay): Unify all worker-cache impls and factor out a
+  //                    per-rpc-subsystem WorkerInterface creator.
+  virtual void ReleaseWorker(const string& target, WorkerInterface* worker) {
+    return wrapped_->ReleaseWorker(target, worker);
+  }
+
+  // Set *locality with the DeviceLocality of the specified remote device
+  // within its local environment.  Returns true if *locality
+  // was set, using only locally cached data.  Returns false
+  // if status data for that device was not available.  Never blocks.
+  virtual bool GetDeviceLocalityNonBlocking(const string& device,
+                                            DeviceLocality* locality) {
+    return wrapped_->GetDeviceLocalityNonBlocking(device, locality);
+  }
+
+  // Set *locality with the DeviceLocality of the specified remote device
+  // within its local environment.  Callback gets Status::OK if *locality
+  // was set.
+  virtual void GetDeviceLocalityAsync(const string& device,
+                                      DeviceLocality* locality,
+                                      StatusCallback done) {
+    return wrapped_->GetDeviceLocalityAsync(device, locality, std::move(done));
+  }
+
+  // Start/stop logging activity.
+  virtual void SetLogging(bool active) { wrapped_->SetLogging(active); }
+
+  // Discard any saved log data.
+  virtual void ClearLogs() { wrapped_->ClearLogs(); }
+
+  // Return logs for the identified step in *ss.  Any returned data will no
+  // longer be stored.
+  virtual bool RetrieveLogs(int64 step_id, StepStats* ss) {
+    return wrapped_->RetrieveLogs(step_id, ss);
+  }
+
+ private:
+  WorkerCacheInterface* wrapped_;  // Not owned.
+};
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_CACHE_WRAPPER_H_
diff --git a/tensorflow/core/framework/bfloat16.cc b/tensorflow/core/framework/bfloat16.cc
index a5ac0e1a8df1ce0e6e622ae62d2ee8012fff58b7..0efe43fde2dadd42aa03d3bf2968d2cbfb113e8d 100644
--- a/tensorflow/core/framework/bfloat16.cc
+++ b/tensorflow/core/framework/bfloat16.cc
@@ -21,13 +21,13 @@ void FloatToBFloat16(const float* src, bfloat16* dst, int64 size) {
   const uint16_t* p = reinterpret_cast<const uint16_t*>(src);
   uint16_t* q = reinterpret_cast<uint16_t*>(dst);
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-    for (; size != 0; p += 2, q++, size--) {  
-      *q = p[0];  
-    }  
+    for (; size != 0; p += 2, q++, size--) {
+      *q = p[0];
+    }
 #else
-    for (; size != 0; p += 2, q++, size--) {  
-     *q = p[1];  
-    }  
+    for (; size != 0; p += 2, q++, size--) {
+     *q = p[1];
+    }
 #endif
 }
 
@@ -35,15 +35,15 @@ void BFloat16ToFloat(const bfloat16* src, float* dst, int64 size) {
   const uint16_t* p = reinterpret_cast<const uint16_t*>(src);
   uint16_t* q = reinterpret_cast<uint16_t*>(dst);
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-    for (; size != 0; p++, q += 2, size--) {  
-      q[0] = *p;  
-      q[1] = 0;  
+    for (; size != 0; p++, q += 2, size--) {
+      q[0] = *p;
+      q[1] = 0;
+    }
+#else
+    for (; size != 0; p++, q += 2, size--) {
+      q[0] = 0;
+      q[1] = *p;
     }
-#else  
-    for (; size != 0; p++, q += 2, size--) {  
-      q[0] = 0;  
-      q[1] = *p;  
-    } 
 #endif
 }
 
diff --git a/tensorflow/core/framework/bfloat16.h b/tensorflow/core/framework/bfloat16.h
index b936e899d4ce71de91af0934ccec982013dde658..968c18bdd2159fee4eb6982c62697951d79b706c 100644
--- a/tensorflow/core/framework/bfloat16.h
+++ b/tensorflow/core/framework/bfloat16.h
@@ -19,9 +19,9 @@ limitations under the License.
 #include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/platform/types.h"
 
-#if defined(PLATFORM_WINDOWS)  
-#include "tensorflow/core/platform/windows/cpu_info.h"  
-#endif  
+#if defined(PLATFORM_WINDOWS)
+#include "tensorflow/core/platform/windows/cpu_info.h"
+#endif
 
 // Compact 16-bit encoding of floating point numbers. This representation uses
 // 1 bit for the sign, 8 bits for the exponent and 7 bits for the mantissa.  It
diff --git a/tensorflow/core/framework/bfloat16_test.cc b/tensorflow/core/framework/bfloat16_test.cc
index 6e4533875160120229877664cff7429cfaf71d43..17e6209f8e5ad5240dfc8ca1def75c178da45c27 100644
--- a/tensorflow/core/framework/bfloat16_test.cc
+++ b/tensorflow/core/framework/bfloat16_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/bfloat16.h"
 
+#include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -104,6 +105,17 @@ TEST(Bfloat16Test, Conversion) {
   }
 }
 
+TEST(Bfloat16Test, Epsilon) {
+  EXPECT_LT(1.0f, static_cast<float>(bfloat16::epsilon() + bfloat16(1.0f)));
+  EXPECT_EQ(1.0f, static_cast<float>((bfloat16::epsilon() / bfloat16(2.0f)) +
+                                     bfloat16(1.0f)));
+}
+
+TEST(Bfloat16Test, Negate) {
+  EXPECT_EQ(-3.0f, static_cast<float>(-bfloat16(3.0f)));
+  EXPECT_EQ(4.5f, static_cast<float>(-bfloat16(-4.5f)));
+}
+
 static void BM_FloatToBFloat16(int iters) {
   testing::StopTiming();
   static const int N = 32 << 20;
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index ea66863bed3f3c9d91587a64370f635766d0794d..7ab8e3ec188a223e35b47b6f9517abd9327b23f8 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -397,6 +397,15 @@ Status Conv2DShape(shape_inference::InferenceContext* c) {
   TF_RETURN_IF_ERROR(
       CheckFormatConstraintsOnShape(data_format, filter_shape, "filter", c));
 
+  std::vector<int32> dilations;
+  TF_RETURN_IF_ERROR(c->GetAttr("dilations", &dilations));
+
+  if (dilations.size() != 4) {
+    return errors::InvalidArgument(
+        "Conv2D requires the dilation attribute to contain 4 values, but got: ",
+        dilations.size());
+  }
+
   std::vector<int32> strides;
   TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides));
 
@@ -410,6 +419,8 @@ Status Conv2DShape(shape_inference::InferenceContext* c) {
 
   const int32 stride_rows = GetTensorDim(strides, data_format, 'H');
   const int32 stride_cols = GetTensorDim(strides, data_format, 'W');
+  const int32 dilation_rows = GetTensorDim(dilations, data_format, 'H');
+  const int32 dilation_cols = GetTensorDim(dilations, data_format, 'W');
 
   DimensionHandle batch_size_dim;
   DimensionHandle input_depth_dim;
@@ -447,12 +458,12 @@ Status Conv2DShape(shape_inference::InferenceContext* c) {
   TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding));
 
   DimensionHandle output_rows, output_cols;
-  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDims(c, input_spatial_dims[0],
-                                                   filter_rows_dim, stride_rows,
-                                                   padding, &output_rows));
-  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDims(c, input_spatial_dims[1],
-                                                   filter_cols_dim, stride_cols,
-                                                   padding, &output_cols));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
+      c, input_spatial_dims[0], filter_rows_dim, dilation_rows, stride_rows,
+      padding, &output_rows));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
+      c, input_spatial_dims[1], filter_cols_dim, dilation_cols, stride_cols,
+      padding, &output_cols));
 
   ShapeHandle output_shape;
   TF_RETURN_IF_ERROR(
@@ -1114,16 +1125,20 @@ Status ConcatShapeHelper(InferenceContext* c, int start_value_index,
     for (int i = start_value_index; i < end_value_index; ++i) {
       if (rank == InferenceContext::kUnknownRank) rank = c->Rank(c->input(i));
       if (rank != InferenceContext::kUnknownRank) {
-        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), rank, &unused));
+        break;
       }
     }
     if (rank == InferenceContext::kUnknownRank) {
       c->set_output(0, c->UnknownShape());
       return Status::OK();
-    }
-    if (rank == 0) {
+    } else if (rank == 0) {
       return errors::InvalidArgument(
           "Can't concatenate scalars (use tf.stack instead)");
+    } else {
+      for (int i = start_value_index; i < end_value_index; ++i) {
+        // Check that all the inputs are of the correct rank.
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), rank, &unused));
+      }
     }
     // Build result of <rank> different unknown dims.
     std::vector<DimensionHandle> dims;
@@ -1307,6 +1322,9 @@ Status ValidateSparseTensor(InferenceContext* c, ShapeHandle indices_shape,
 
 Status ScatterNdUpdateShape(InferenceContext* c) {
   ShapeHandle input_shape = c->input(0);
+  if (c->input_handle_shapes_and_types(0) != nullptr) {
+    input_shape = (*c->input_handle_shapes_and_types(0))[0].shape;
+  }
   ShapeHandle indices_shape;
   TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 1, &indices_shape));
   ShapeHandle updates_shape;
@@ -1361,7 +1379,9 @@ Status ScatterNdUpdateShape(InferenceContext* c) {
     }
   }
 
-  c->set_output(0, input_shape);
+  if (c->input_handle_shapes_and_types(0) == nullptr) {
+    c->set_output(0, input_shape);
+  }
   return Status::OK();
 }
 
diff --git a/tensorflow/core/framework/common_shape_fns_test.cc b/tensorflow/core/framework/common_shape_fns_test.cc
index ec9746b2af1ed0da348fbe7459c5d93d842b25d9..5f3e5ad45731750bfd73181c41cd029f23aab55f 100644
--- a/tensorflow/core/framework/common_shape_fns_test.cc
+++ b/tensorflow/core/framework/common_shape_fns_test.cc
@@ -423,6 +423,15 @@ TEST(CommonShapeFnsTest, Conv2DShapeTest) {
                     .Finalize(&op.node_def));
   };
 
+  // Invalid rank for input
+  INFER_ERROR("must be rank 4", op, "[4,4];[2,1,1,1]");
+  // Invalid rank for filter
+  INFER_ERROR("must be rank 4", op, "[1,4,4,1];[2,1,1]");
+
+  // Invalid value for strides
+  set_op({{1, 1, 0, 1}}, "VALID", "NHWC", "HWIO");
+  INFER_ERROR("must be > 0", op, "[1,2,2,1];[1,1,1,1]");
+
   // 1x1 filter
   set_op({{1, 1, 1, 1}}, "VALID", "NHWC", "HWIO");
   INFER_OK(op, "[1,2,2,1];[1,1,1,1]", "[d0_0,2,2,d1_3]");
@@ -443,11 +452,6 @@ TEST(CommonShapeFnsTest, Conv2DShapeTest) {
   set_op({{1, 1, 2, 1}}, "VALID", "NHWC", "HWIO");
   INFER_OK(op, "[1,4,4,1];[2,1,1,1]", "[d0_0,3,2,d1_3]");
 
-  // Invalid rank for input
-  INFER_ERROR("must be rank 4", op, "[4,4];[2,1,1,1]");
-  // Invalid rank for filter
-  INFER_ERROR("must be rank 4", op, "[1,4,4,1];[2,1,1]");
-
   // Unknown dims in the critical fields lead to partial inference.
   INFER_OK(op, "[1,4,4,1];[2,1,1,1]", "[d0_0,3,2,d1_3]");
   INFER_OK(op, "[1,?,4,1];[2,1,1,1]", "[d0_0,?,2,d1_3]");
@@ -538,6 +542,98 @@ TEST(CommonShapeFnsTest, Conv2DShapeTest) {
   INFER_OK(op, "[1,4,4,?];[?,?,?,?]", "[d0_0,2,2,d1_3]");
 }
 
+TEST(CommonShapeFnsTest, Conv2DDilatedShapeTest) {
+  ShapeInferenceTestOp op("Conv2D");
+  auto set_op = [&op](const std::vector<int32>& dilations,
+                      const std::vector<int32>& strides, const string& padding,
+                      const string& data_format) {
+    TF_CHECK_OK(NodeDefBuilder("test", "Conv2D")
+                    .Input("input", 0, DT_FLOAT)
+                    .Input("filter", 0, DT_FLOAT)
+                    .Attr("dilations", dilations)
+                    .Attr("strides", strides)
+                    .Attr("padding", padding)
+                    .Attr("data_format", data_format)
+                    .Finalize(&op.node_def));
+  };
+
+  // Invalid rank for dilation
+  set_op({{1, 2, 1}}, {{1, 1, 1, 1}}, "VALID", "NHWC");
+  INFER_ERROR("contain 4 values", op, "[1,2,2,1];[1,1,1,1]");
+
+  // Invalid value for dilation
+  set_op({{1, 0, 1, 1}}, {{1, 1, 1, 1}}, "VALID", "NHWC");
+  INFER_ERROR("must be >= 1", op, "[1,2,2,1];[1,1,1,1]");
+
+  // Tests for NHWC
+  // 1x1 filter, 2x1 dilations, 1x1 strides
+  set_op({{1, 2, 1, 1}}, {{1, 1, 1, 1}}, "VALID", "NHWC");
+  INFER_OK(op, "[1,2,2,1];[1,1,1,1]", "[d0_0,2,2,d1_3]");
+
+  // 1x1 filter, 2x1 dilations, 2x1 strides
+  set_op({{1, 2, 1, 1}}, {{1, 2, 1, 1}}, "VALID", "NHWC");
+  INFER_OK(op, "[1,4,4,1];[1,1,1,1]", "[d0_0,2,4,d1_3]");
+
+  // 1x1 filter, 2x1 dilations, 2x2 strides
+  set_op({{1, 2, 1, 1}}, {{1, 2, 2, 1}}, "VALID", "NHWC");
+  INFER_OK(op, "[1,4,4,1];[1,1,1,1]", "[d0_0,2,2,d1_3]");
+
+  // 3x3 filter, 2x1 dilations, 1x1 strides
+  set_op({{1, 2, 1, 1}}, {{1, 1, 1, 1}}, "VALID", "NHWC");
+  INFER_OK(op, "[1,5,5,1];[3,3,1,1]", "[d0_0,1,3,d1_3]");
+
+  // 3x3 filter, 2x1 dilations, 2x1 strides
+  set_op({{1, 2, 1, 1}}, {{1, 2, 1, 1}}, "VALID", "NHWC");
+  INFER_OK(op, "[1,5,5,1];[3,3,1,1]", "[d0_0,1,3,d1_3]");
+
+  // 3x3 filter, 1x2 dilations, 2x2 strides
+  set_op({{1, 1, 2, 1}}, {{1, 2, 2, 1}}, "VALID", "NHWC");
+  INFER_OK(op, "[1,5,5,1];[3,3,1,1]", "[d0_0,2,1,d1_3]");
+
+  // Tests for NCHW
+  // 1x1 filter, 2x1 dilations, 1x1 strides
+  set_op({{1, 1, 2, 1}}, {{1, 1, 1, 1}}, "VALID", "NCHW");
+  INFER_OK(op, "[1,1,2,2];[1,1,1,1]", "[d0_0,d1_3,2,2]");
+
+  // 1x1 filter, 2x1 dilations, 2x1 strides
+  set_op({{1, 1, 2, 1}}, {{1, 1, 2, 1}}, "VALID", "NCHW");
+  INFER_OK(op, "[1,1,4,4];[1,1,1,1]", "[d0_0,d1_3,2,4]");
+
+  // 1x1 filter, 2x1 dilations, 2x2 strides
+  set_op({{1, 1, 2, 1}}, {{1, 1, 2, 2}}, "VALID", "NCHW");
+  INFER_OK(op, "[1,1,4,4];[1,1,1,1]", "[d0_0,d1_3,2,2]");
+
+  // 3x3 filter, 2x1 dilations, 1x1 strides
+  set_op({{1, 1, 2, 1}}, {{1, 1, 1, 1}}, "VALID", "NCHW");
+  INFER_OK(op, "[1,1,5,5];[3,3,1,1]", "[d0_0,d1_3,1,3]");
+
+  // 3x3 filter, 2x1 dilations, 2x1 strides
+  set_op({{1, 1, 2, 1}}, {{1, 1, 2, 1}}, "VALID", "NCHW");
+  INFER_OK(op, "[1,1,5,5];[3,3,1,1]", "[d0_0,d1_3,1,3]");
+
+  // 3x3 filter, 1x2 dilations, 2x2 strides
+  set_op({{1, 1, 1, 2}}, {{1, 1, 2, 2}}, "VALID", "NCHW");
+  INFER_OK(op, "[1,1,5,5];[3,3,1,1]", "[d0_0,d1_3,2,1]");
+
+  // Some tests for "SAME" padding
+
+  // 4x4 input, 1x1 filter, 2x1 dilations, 1x1 stride
+  set_op({{1, 2, 1, 1}}, {{1, 1, 1, 1}}, "SAME", "NHWC");
+  INFER_OK(op, "[1,4,4,1];[1,1,1,1]", "[d0_0,d0_1,d0_2,d1_3]");
+
+  // 3x3 input, 2x2 filter, 2x2 dilations, 1x1 stride
+  set_op({{1, 2, 2, 1}}, {{1, 1, 1, 1}}, "SAME", "NHWC");
+  INFER_OK(op, "[1,3,3,1];[2,2,1,1]", "[d0_0,d0_1,d0_2,d1_3]");
+
+  // 4x4 input, 2x2 filter, 1x2 dilations, 2x2 stride
+  set_op({{1, 1, 2, 1}}, {{1, 2, 2, 1}}, "SAME", "NHWC");
+  INFER_OK(op, "[1,4,4,1];[2,2,1,1]", "[d0_0,2,2,d1_3]");
+
+  // 4x4 input, 2x2 filter, 2x2 dilations, 1x1 stride
+  set_op({{1, 2, 2, 1}}, {{1, 1, 1, 1}}, "SAME", "NHWC");
+  INFER_OK(op, "[1,4,4,1];[2,2,1,1]", "[d0_0,d0_1,d0_2,d1_3]");
+}
+
 TEST(CommonShapeFnsTest, Conv3DShapeTest) {
   ShapeInferenceTestOp op("Conv3D");
   auto set_op = [&op](const std::vector<int32>& strides,
diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h
index 33bd5d250cd6b5df8c933e3f353efd9a1eee592c..1838a8ad02d2bd5522ce3162fea53e3f5afc0309 100644
--- a/tensorflow/core/framework/device_base.h
+++ b/tensorflow/core/framework/device_base.h
@@ -145,6 +145,12 @@ class DeviceBase {
     return gpu_device_info_;
   }
 
+  // The preferred thread pool for this device. If it is nullptr, the system
+  // automatically assigns a thread pool for execution.
+  virtual thread::ThreadPool* tensorflow_device_thread_pool() {
+    return device_thread_pool_;
+  }
+
   // Does not take ownership.
   void set_eigen_cpu_device(Eigen::ThreadPoolDevice* d) {
     eigen_cpu_device_ = d;
@@ -215,10 +221,17 @@ class DeviceBase {
     return errors::Internal("Device does not implement MakeTensorFromProto()");
   }
 
+ protected:
+  // Does not take ownership.
+  void set_tensorflow_device_thread_pool(thread::ThreadPool* thread_pool) {
+    device_thread_pool_ = thread_pool;
+  }
+
  private:
   Env* const env_;
   CpuWorkerThreads* cpu_worker_threads_ = nullptr;
   GpuDeviceInfo* gpu_device_info_ = nullptr;
+  thread::ThreadPool* device_thread_pool_ = nullptr;
   Eigen::ThreadPoolDevice* eigen_cpu_device_ = nullptr;
 #ifdef TENSORFLOW_USE_SYCL
   Eigen::SyclDevice* eigen_sycl_device_ = nullptr;
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index 305b140a446171ddc4b249c97967057aa3e00152..1a579ab63125ff5abc2f76d06187482234a54b9c 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -243,13 +243,24 @@ uint64 FunctionDefHash(const FunctionDef& fdef);
 // address spaces.
 string Canonicalize(const string& funcname, AttrSlice attrs);
 
+class CallFrameInterface {
+ public:
+  virtual ~CallFrameInterface() {}
+
+  virtual size_t num_args() const = 0;
+  virtual size_t num_retvals() const = 0;
+
+  virtual Status GetArg(int index, Tensor* val) const = 0;
+  virtual Status SetRetval(int index, const Tensor& val) = 0;
+};
+
 // Represents a function call frame. I.e., the data structure used to
 // pass arguments to a function and retrieve its results.
 //
 // Runtime must arrange accesses to one FunctionCallFrame s.t.
 //   1. SetArgs() happens before any GetArg();
 //   2. GetRetvals happens after all SetRetval();
-class FunctionCallFrame {
+class FunctionCallFrame : public CallFrameInterface {
  public:
   FunctionCallFrame(DataTypeSlice arg_types, DataTypeSlice ret_types);
   ~FunctionCallFrame();
@@ -259,9 +270,12 @@ class FunctionCallFrame {
   Status GetRetvals(std::vector<Tensor>* rets) const;
   Status ConsumeRetvals(std::vector<Tensor>* rets);
 
+  size_t num_args() const override { return arg_types_.size(); }
+  size_t num_retvals() const override { return ret_types_.size(); }
+
   // Callee methods.
-  Status GetArg(int index, Tensor* val) const;
-  Status SetRetval(int index, const Tensor& val);
+  Status GetArg(int index, Tensor* val) const override;
+  Status SetRetval(int index, const Tensor& val) override;
 
  private:
   DataTypeVector arg_types_;
@@ -408,6 +422,9 @@ class FunctionLibraryRuntime {
   virtual Status Instantiate(const string& function_name, AttrSlice attrs,
                              Handle* handle) = 0;
 
+  // Releases state associated with the handle.
+  virtual Status ReleaseHandle(Handle handle) = 0;
+
   // Returns the function body for the instantiated function given its
   // handle 'h'. Returns nullptr if "h" is not found.
   //
@@ -453,6 +470,8 @@ class FunctionLibraryRuntime {
   virtual void Run(const Options& opts, Handle handle,
                    gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets,
                    DoneCallback done) = 0;
+  virtual void Run(const Options& opts, Handle handle,
+                   CallFrameInterface* call_frame, DoneCallback done) = 0;
 
   // Creates a "kernel" for the given node def "ndef".
   //
diff --git a/tensorflow/core/framework/load_library.cc b/tensorflow/core/framework/load_library.cc
index f825335300881a0bf506ec461b6e6313fefe8cdd..b9e33b148f71cd6b1856cf55436a7e73df9df059 100644
--- a/tensorflow/core/framework/load_library.cc
+++ b/tensorflow/core/framework/load_library.cc
@@ -45,7 +45,7 @@ struct Library {
 // perform initialization again, so the OpList would be empty.
 Status LoadLibrary(const char* library_filename, void** result,
                    const void** buf, size_t* len) {
-  static mutex mu;
+  static mutex mu(LINKER_INITIALIZED);
   static std::unordered_map<string, Library> loaded_libs;
   Env* env = Env::Default();
   Library library;
diff --git a/tensorflow/core/framework/memory_types.cc b/tensorflow/core/framework/memory_types.cc
index 6a2eed94b94971d20faffa1608627290c1109d66..270118bb678e110269be9aa67a3904e36c34c512 100644
--- a/tensorflow/core/framework/memory_types.cc
+++ b/tensorflow/core/framework/memory_types.cc
@@ -61,7 +61,8 @@ void MemoryTypesHelper(const NameRangeMap& name_map,
 }
 
 MemoryType MTypeFromDType(const DataType dtype) {
-  return (dtype == DT_INT32) ? HOST_MEMORY : DEVICE_MEMORY;
+  return (dtype == DT_INT32 || DataTypeAlwaysOnHost(dtype)) ? HOST_MEMORY
+                                                            : DEVICE_MEMORY;
 }
 
 }  // namespace
@@ -118,6 +119,20 @@ Status MemoryTypesForNode(const OpRegistryInterface* op_registry,
         "HostMemory args '", str_util::Join(host_memory_args, "', '"),
         "' not found in OpDef: ", SummarizeOpDef(*op_def));
   }
+  CHECK_LE(inp_mtypes->size(), inp_dtypes.size());
+  CHECK_LE(out_mtypes->size(), out_dtypes.size());
+
+  // Mark e.g. all resource and string types as host memory.
+  for (int i = 0; i < inp_mtypes->size(); ++i) {
+    if (DataTypeAlwaysOnHost(inp_dtypes[i])) {
+      (*inp_mtypes)[i] = HOST_MEMORY;
+    }
+  }
+  for (int i = 0; i < out_mtypes->size(); ++i) {
+    if (DataTypeAlwaysOnHost(out_dtypes[i])) {
+      (*out_mtypes)[i] = HOST_MEMORY;
+    }
+  }
 
   std::vector<int32> hostmem_attr;
   if (GetNodeAttr(ndef, "_input_hostmem", &hostmem_attr).ok()) {
diff --git a/tensorflow/core/framework/memory_types_test.cc b/tensorflow/core/framework/memory_types_test.cc
index 4704da9a119c2b06db5c8b1a3874417a0b1c3617..3126ea8e5f8974cb11f88301de613eb5b920830f 100644
--- a/tensorflow/core/framework/memory_types_test.cc
+++ b/tensorflow/core/framework/memory_types_test.cc
@@ -36,11 +36,13 @@ REGISTER_OP("HostMemoryTest")
     .Input("b: T")
     .Input("c: N * string")
     .Input("d: Tlist")
+    .Input("e: Rlist")
     .Output("o: N * T")
     .Output("p: Tlist")
     .Attr("T: type")
     .Attr("N: int")
-    .Attr("Tlist: list(type)");
+    .Attr("Tlist: list(type)")
+    .Attr("Rlist: list(type)");
 REGISTER_KERNEL_BUILDER(Name("HostMemoryTest").Device(DEVICE_CPU), DummyKernel);
 REGISTER_KERNEL_BUILDER(Name("HostMemoryTest")
                             .Device(DEVICE_GPU)
@@ -57,15 +59,20 @@ TEST(MemoryTypesForNode, Simple) {
                    .Input(FakeInput(DT_BOOL))
                    .Input(FakeInput(3))
                    .Input(FakeInput({DT_INT32, DT_FLOAT, DT_INT32}))
+                   .Input(FakeInput({DT_RESOURCE, DT_STRING, DT_RESOURCE}))
                    .Finalize(&node_def));
   MemoryTypeVector input, output;
 
   TF_EXPECT_OK(MemoryTypesForNode(OpRegistry::Global(), DEVICE_CPU, node_def,
                                   &input, &output));
-  EXPECT_EQ(MemoryTypeVector({DEVICE_MEMORY, DEVICE_MEMORY, DEVICE_MEMORY,
-                              DEVICE_MEMORY, DEVICE_MEMORY, DEVICE_MEMORY,
-                              DEVICE_MEMORY, DEVICE_MEMORY}),
-            input);
+  // a:float, b:bool, c:3*string, d:(int32, float, int32),
+  // e:(resource, string, resource)
+  EXPECT_EQ(
+      MemoryTypeVector({DEVICE_MEMORY, DEVICE_MEMORY, HOST_MEMORY, HOST_MEMORY,
+                        HOST_MEMORY, DEVICE_MEMORY, DEVICE_MEMORY,
+                        DEVICE_MEMORY, HOST_MEMORY, HOST_MEMORY, HOST_MEMORY}),
+      input);
+  // o:3*bool, p:(int32, float, int32)
   EXPECT_EQ(MemoryTypeVector({DEVICE_MEMORY, DEVICE_MEMORY, DEVICE_MEMORY,
                               DEVICE_MEMORY, DEVICE_MEMORY, DEVICE_MEMORY}),
             output);
@@ -74,7 +81,8 @@ TEST(MemoryTypesForNode, Simple) {
                                   &input, &output));
   EXPECT_EQ(
       MemoryTypeVector({HOST_MEMORY, DEVICE_MEMORY, HOST_MEMORY, HOST_MEMORY,
-                        HOST_MEMORY, HOST_MEMORY, HOST_MEMORY, HOST_MEMORY}),
+                        HOST_MEMORY, HOST_MEMORY, HOST_MEMORY, HOST_MEMORY,
+                        HOST_MEMORY, HOST_MEMORY, HOST_MEMORY}),
       input);
   EXPECT_EQ(MemoryTypeVector({HOST_MEMORY, HOST_MEMORY, HOST_MEMORY,
                               DEVICE_MEMORY, DEVICE_MEMORY, DEVICE_MEMORY}),
diff --git a/tensorflow/core/framework/numeric_types.h b/tensorflow/core/framework/numeric_types.h
index 2b080e13fdb8308f71c967ab14c6ed71ccd8f357..650aa4203ec73ffe123f2f41bd5a13cc88b291ad 100644
--- a/tensorflow/core/framework/numeric_types.h
+++ b/tensorflow/core/framework/numeric_types.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint"
 // clang-format on
 
+#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -46,6 +47,10 @@ struct bfloat16 {
   EIGEN_DEVICE_FUNC bfloat16() {}
 
   EIGEN_DEVICE_FUNC explicit bfloat16(const float v) {
+    if (Eigen::numext::isnan(v)) {
+      value = NAN_VALUE;
+      return;
+    }
     const uint16_t* p = reinterpret_cast<const uint16_t*>(&v);
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
     value = p[0];
@@ -54,11 +59,19 @@ struct bfloat16 {
 #endif
   }
 
+  // Following the convention of numpy, converting between complex and
+  // float will lead to loss of imag value.
+  explicit EIGEN_DEVICE_FUNC bfloat16(const complex64& val)
+      : bfloat16(val.real()) {}
+
+  explicit EIGEN_DEVICE_FUNC bfloat16(const complex128& val)
+      : bfloat16(static_cast<float>(val.real())) {}
+
   template <class T>
   explicit EIGEN_DEVICE_FUNC bfloat16(const T& val)
       : bfloat16(static_cast<float>(val)) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
+  EIGEN_DEVICE_FUNC explicit operator float() const {
     float result;
 
     uint16_t* q = reinterpret_cast<uint16_t*>(&result);
@@ -89,6 +102,10 @@ struct bfloat16 {
     return static_cast<int>(float(*this));
   }
 
+  EIGEN_DEVICE_FUNC explicit operator long() const {
+    return static_cast<long>(float(*this));
+  }
+
   EIGEN_DEVICE_FUNC explicit operator char() const {
     return static_cast<char>(float(*this));
   }
@@ -121,17 +138,76 @@ struct bfloat16 {
     return static_cast<double>(float(*this));
   }
 
+  EIGEN_DEVICE_FUNC explicit operator complex64() const {
+    return complex64(float(*this), float(0.0));
+  }
+
+  EIGEN_DEVICE_FUNC explicit operator complex128() const {
+    return complex128(double(*this), double(0.0));
+  }
+
+  static bfloat16 epsilon() {
+    bfloat16 x;
+    x.value = 0x3c00;  // 0x1.0p-7
+    return x;
+  }
+
   uint16_t value;
+
+  // A value that represents "not a number".
+  static const uint16_t NAN_VALUE = 0x7FC0;
 };
 
-inline bool operator==(const bfloat16 a, const bfloat16 b) {
-  return a.value == b.value;
+inline bfloat16 operator+(bfloat16 a, bfloat16 b) {
+  return bfloat16(static_cast<float>(a) + static_cast<float>(b));
 }
-
-inline bool operator!=(const bfloat16 a, const bfloat16 b) {
-  return a.value != b.value;
+inline bfloat16 operator-(bfloat16 a, bfloat16 b) {
+  return bfloat16(static_cast<float>(a) - static_cast<float>(b));
+}
+inline bfloat16 operator*(bfloat16 a, bfloat16 b) {
+  return bfloat16(static_cast<float>(a) * static_cast<float>(b));
+}
+inline bfloat16 operator/(bfloat16 a, bfloat16 b) {
+  return bfloat16(static_cast<float>(a) / static_cast<float>(b));
+}
+inline bfloat16 operator-(bfloat16 a) {
+  a.value ^= 0x8000;
+  return a;
+}
+inline bool operator<(bfloat16 a, bfloat16 b) {
+  return static_cast<float>(a) < static_cast<float>(b);
+}
+inline bool operator<=(bfloat16 a, bfloat16 b) {
+  return static_cast<float>(a) <= static_cast<float>(b);
+}
+inline bool operator==(bfloat16 a, bfloat16 b) {
+  return static_cast<float>(a) == static_cast<float>(b);
+}
+inline bool operator!=(bfloat16 a, bfloat16 b) {
+  return static_cast<float>(a) != static_cast<float>(b);
+}
+inline bool operator>(bfloat16 a, bfloat16 b) {
+  return static_cast<float>(a) > static_cast<float>(b);
+}
+inline bool operator>=(bfloat16 a, bfloat16 b) {
+  return static_cast<float>(a) >= static_cast<float>(b);
+}
+inline bfloat16& operator+=(bfloat16& a, bfloat16 b) {
+  a = a + b;
+  return a;
+}
+inline bfloat16& operator-=(bfloat16& a, bfloat16 b) {
+  a = a - b;
+  return a;
+}
+inline bfloat16& operator*=(bfloat16& a, bfloat16 b) {
+  a = a * b;
+  return a;
+}
+inline bfloat16& operator/=(bfloat16& a, bfloat16 b) {
+  a = a / b;
+  return a;
 }
-
 }  // end namespace tensorflow
 
 namespace Eigen {
diff --git a/tensorflow/core/framework/op.cc b/tensorflow/core/framework/op.cc
index 4f5a1f80a025744f4b2189aa3216304a36b99044..fadb60d744217daa0c569601c437146a70f9b4d5 100644
--- a/tensorflow/core/framework/op.cc
+++ b/tensorflow/core/framework/op.cc
@@ -63,26 +63,32 @@ Status OpRegistry::LookUp(const string& op_type_name,
   const OpRegistrationData* res = nullptr;
 
   bool first_call = false;
+  bool first_unregistered = false;
   {  // Scope for lock.
     mutex_lock lock(mu_);
     first_call = MustCallDeferred();
     res = gtl::FindWithDefault(registry_, op_type_name, nullptr);
+
+    static bool unregistered_before = false;
+    first_unregistered = !unregistered_before && (res == nullptr);
+    if (first_unregistered) {
+      unregistered_before = true;
+    }
     // Note: Can't hold mu_ while calling Export() below.
   }
   if (first_call) {
     TF_QCHECK_OK(ValidateKernelRegistrations(*this));
   }
   if (res == nullptr) {
-    static bool first_unregistered = true;
     if (first_unregistered) {
       OpList op_list;
       Export(true, &op_list);
       if (VLOG_IS_ON(3)) {
-         LOG(INFO) << "All registered Ops:";
-         for (const auto& op : op_list.op())
-            LOG(INFO) << SummarizeOpDef(op);
+        LOG(INFO) << "All registered Ops:";
+        for (const auto& op : op_list.op()) {
+          LOG(INFO) << SummarizeOpDef(op);
+        }
       }
-      first_unregistered = false;
     }
     Status status =
         errors::NotFound("Op type not registered '", op_type_name,
diff --git a/tensorflow/core/framework/op_def_builder_test.cc b/tensorflow/core/framework/op_def_builder_test.cc
index c1511ebe340d99fc67f588596e028cca92e23250..9b24e3aa00425321eda2e196b1e7b243a552c730 100644
--- a/tensorflow/core/framework/op_def_builder_test.cc
+++ b/tensorflow/core/framework/op_def_builder_test.cc
@@ -124,22 +124,23 @@ TEST_F(OpDefBuilderTest, AttrWithRestrictions) {
       "attr: { name: 'a' type: 'type' allowed_values { list { type: "
       "[DT_HALF, DT_FLOAT, DT_DOUBLE, DT_INT64, DT_INT32, DT_UINT8, DT_INT16, "
       "DT_UINT16, DT_INT8, DT_COMPLEX64, DT_COMPLEX128, DT_QINT8, DT_QUINT8, "
-      "DT_QINT32, DT_UINT32, DT_UINT64] } } }");
+      "DT_QINT32, DT_UINT32, DT_UINT64, DT_BFLOAT16] } } }");
   ExpectSuccess(
       b().Attr("a:{numbertype, variant}"),
       "attr: { name: 'a' type: 'type' allowed_values { list { type: "
       "[DT_HALF, DT_FLOAT, DT_DOUBLE, DT_INT64, DT_INT32, DT_UINT8, DT_INT16, "
       "DT_UINT16, DT_INT8, DT_COMPLEX64, DT_COMPLEX128, DT_QINT8, DT_QUINT8, "
-      "DT_QINT32, DT_UINT32, DT_UINT64, DT_VARIANT] } } }");
+      "DT_QINT32, DT_UINT32, DT_UINT64, DT_BFLOAT16, DT_VARIANT] } } }");
   ExpectSuccess(b().Attr("a:realnumbertype"),
                 "attr: { name: 'a' type: 'type' allowed_values { list { type: "
                 "[DT_HALF, DT_FLOAT, DT_DOUBLE, DT_INT64, DT_INT32, DT_UINT8, "
-                "DT_INT16, DT_UINT16, DT_INT8, DT_UINT32, DT_UINT64] } } }");
+                "DT_INT16, DT_UINT16, DT_INT8, DT_UINT32, DT_UINT64, "
+                "DT_BFLOAT16] } } }");
   ExpectSuccess(b().Attr("a:{realnumbertype,  variant , string, }"),
                 "attr: { name: 'a' type: 'type' allowed_values { list { type: "
                 "[DT_HALF, DT_FLOAT, DT_DOUBLE, DT_INT64, DT_INT32, DT_UINT8, "
                 "DT_INT16, DT_UINT16, DT_INT8, DT_UINT32, DT_UINT64, "
-                "DT_VARIANT, DT_STRING] } } }");
+                "DT_BFLOAT16, DT_VARIANT, DT_STRING] } } }");
   ExpectSuccess(b().Attr("a:quantizedtype"),
                 "attr: { name: 'a' type: 'type' allowed_values { list { type: "
                 "[DT_QINT8, DT_QUINT8, DT_QINT32, DT_QINT16, DT_QUINT16]} } }");
@@ -216,12 +217,14 @@ TEST_F(OpDefBuilderTest, AttrListOfRestricted) {
       b().Attr("a:list(realnumbertype)"),
       "attr: { name: 'a' type: 'list(type)' allowed_values { list { type: "
       "[DT_FLOAT, DT_DOUBLE, DT_INT64, DT_INT32, DT_UINT8, DT_INT16, "
-      "DT_UINT16, DT_INT8, DT_HALF, DT_UINT32, DT_UINT64] } } }");
+      "DT_UINT16, DT_INT8, DT_HALF, DT_BFLOAT16, DT_UINT32, DT_UINT64"
+      "] } } }");
   ExpectSuccess(
       b().Attr("a:list({realnumbertype, variant})"),
       "attr: { name: 'a' type: 'list(type)' allowed_values { list { type: "
       "[DT_FLOAT, DT_DOUBLE, DT_INT64, DT_INT32, DT_UINT8, DT_INT16, "
-      "DT_UINT16, DT_INT8, DT_HALF, DT_UINT32, DT_UINT64, DT_VARIANT] } } }");
+      "DT_UINT16, DT_INT8, DT_HALF, DT_BFLOAT16, DT_UINT32, DT_UINT64, "
+      "DT_VARIANT] } } }");
   ExpectSuccess(
       b().Attr("a:list(quantizedtype)"),
       "attr: { name: 'a' type: 'list(type)' allowed_values { list { type: "
diff --git a/tensorflow/core/framework/op_def_util.cc b/tensorflow/core/framework/op_def_util.cc
index f7d4166f970097a077b6e2a4595728758c65592f..29feda499fd2646a00c1f5bc9fc7223e9f134af9 100644
--- a/tensorflow/core/framework/op_def_util.cc
+++ b/tensorflow/core/framework/op_def_util.cc
@@ -332,7 +332,7 @@ Status CheckOpDeprecation(const OpDef& op_def, int graph_def_version) {
           ". ", dep.explanation(), ".");
     } else {
       // Warn only once for each op name, and do it in a threadsafe manner.
-      static mutex mu;
+      static mutex mu(LINKER_INITIALIZED);
       static std::unordered_set<string> warned;
       bool warn;
       {
diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc
index d84d5431e981a97ac49f9a2a3662cc6ca954d714..acff74070da92cc7f298560b7bb81a812924cb0f 100644
--- a/tensorflow/core/framework/op_gen_lib.cc
+++ b/tensorflow/core/framework/op_gen_lib.cc
@@ -281,6 +281,9 @@ static void StringReplace(const string& from, const string& to, string* s) {
     } else {
       split.push_back(s->substr(pos, found - pos));
       pos = found + from.size();
+      if (pos == s->size()) {  // handle case where `from` is at the very end.
+        split.push_back("");
+      }
     }
   }
   // Join the pieces back together with a new delimiter.
@@ -316,6 +319,36 @@ static void RenameInDocs(const string& from, const string& to, OpDef* op_def) {
   }
 }
 
+static void RenameInDocs(const string& from, const string& to,
+                         ApiDef* api_def) {
+  const string from_quoted = strings::StrCat("`", from, "`");
+  const string to_quoted = strings::StrCat("`", to, "`");
+  for (int i = 0; i < api_def->in_arg_size(); ++i) {
+    if (!api_def->in_arg(i).description().empty()) {
+      StringReplace(from_quoted, to_quoted,
+                    api_def->mutable_in_arg(i)->mutable_description());
+    }
+  }
+  for (int i = 0; i < api_def->out_arg_size(); ++i) {
+    if (!api_def->out_arg(i).description().empty()) {
+      StringReplace(from_quoted, to_quoted,
+                    api_def->mutable_out_arg(i)->mutable_description());
+    }
+  }
+  for (int i = 0; i < api_def->attr_size(); ++i) {
+    if (!api_def->attr(i).description().empty()) {
+      StringReplace(from_quoted, to_quoted,
+                    api_def->mutable_attr(i)->mutable_description());
+    }
+  }
+  if (!api_def->summary().empty()) {
+    StringReplace(from_quoted, to_quoted, api_def->mutable_summary());
+  }
+  if (!api_def->description().empty()) {
+    StringReplace(from_quoted, to_quoted, api_def->mutable_description());
+  }
+}
+
 const OpGenOverride* OpGenOverrideMap::ApplyOverride(OpDef* op_def) const {
   // Look up
   const auto iter = map_.find(op_def->name());
@@ -521,6 +554,7 @@ Status MergeApiDefs(ApiDef* base_api_def, const ApiDef& new_api_def) {
           ". All elements in arg_order override must match base arg_order: ",
           str_util::Join(base_api_def->arg_order(), ", "));
     }
+
     base_api_def->clear_arg_order();
     std::copy(
         new_api_def.arg_order().begin(), new_api_def.arg_order().end(),
@@ -595,19 +629,42 @@ Status ApiDefMap::LoadApiDef(const string& api_def_file_contents) {
   ApiDefs api_defs;
   protobuf::TextFormat::ParseFromString(contents, &api_defs);
   for (const auto& api_def : api_defs.op()) {
-    // Check if the op definition is already loaded.
+    // Check if the op definition is loaded. If op definition is not
+    // loaded, then we just skip this ApiDef.
     if (map_.find(api_def.graph_op_name()) != map_.end()) {
       // Overwrite current api def with data in api_def.
       TF_RETURN_IF_ERROR(MergeApiDefs(&map_[api_def.graph_op_name()], api_def));
-    } else {
-      return errors::FailedPrecondition(
-          "Unexpected ApiDef override: ", api_def.graph_op_name(),
-          " is not defined in base ApiDef.");
     }
   }
   return Status::OK();
 }
 
+void ApiDefMap::UpdateDocs() {
+  for (auto& name_and_api_def : map_) {
+    auto& api_def = name_and_api_def.second;
+    CHECK_GT(api_def.endpoint_size(), 0);
+    const string canonical_name = api_def.endpoint(0).name();
+    if (api_def.graph_op_name() != canonical_name) {
+      RenameInDocs(api_def.graph_op_name(), canonical_name, &api_def);
+    }
+    for (const auto& in_arg : api_def.in_arg()) {
+      if (in_arg.name() != in_arg.rename_to()) {
+        RenameInDocs(in_arg.name(), in_arg.rename_to(), &api_def);
+      }
+    }
+    for (const auto& out_arg : api_def.out_arg()) {
+      if (out_arg.name() != out_arg.rename_to()) {
+        RenameInDocs(out_arg.name(), out_arg.rename_to(), &api_def);
+      }
+    }
+    for (const auto& attr : api_def.attr()) {
+      if (attr.name() != attr.rename_to()) {
+        RenameInDocs(attr.name(), attr.rename_to(), &api_def);
+      }
+    }
+  }
+}
+
 const tensorflow::ApiDef* ApiDefMap::GetApiDef(const string& name) const {
   return gtl::FindOrNull(map_, name);
 }
diff --git a/tensorflow/core/framework/op_gen_lib.h b/tensorflow/core/framework/op_gen_lib.h
index efb287477bedde9bfbdef8e318bf6804e79f1ac5..1ede3af8d7cf8f591ba3927f7fc99d646629109d 100644
--- a/tensorflow/core/framework/op_gen_lib.h
+++ b/tensorflow/core/framework/op_gen_lib.h
@@ -106,6 +106,12 @@ class ApiDefMap {
   // passed to the constructor.
   Status LoadApiDef(const string& api_def_file_contents);
 
+  // Updates ApiDef docs. For example, if ApiDef renames an argument
+  // or attribute, applies these renames to descriptions as well.
+  // UpdateDocs should only be called once after all ApiDefs are loaded
+  // since it replaces original op names.
+  void UpdateDocs();
+
   // Look up ApiDef proto based on the given graph op name.
   // If graph op name is not in this ApiDefMap, returns nullptr.
   //
diff --git a/tensorflow/core/framework/op_gen_lib_test.cc b/tensorflow/core/framework/op_gen_lib_test.cc
index da9b4dfbb1738c855c0bfc4752853d5d501d80a8..857b1c8dbcac66899f98bb4f2ef87f65f7442f6b 100644
--- a/tensorflow/core/framework/op_gen_lib_test.cc
+++ b/tensorflow/core/framework/op_gen_lib_test.cc
@@ -410,8 +410,8 @@ op {
 
   ApiDefMap api_map(op_list);
   TF_CHECK_OK(api_map.LoadApiDef(kTestApiDef));
-  auto status = api_map.LoadApiDef(api_def1);
-  ASSERT_EQ(tensorflow::error::FAILED_PRECONDITION, status.code());
+  TF_CHECK_OK(api_map.LoadApiDef(api_def1));
+  ASSERT_EQ(nullptr, api_map.GetApiDef("different_testop"));
 }
 
 TEST(OpGenLibTest, ApiDefInvalidArgOrder) {
@@ -455,5 +455,62 @@ op {
   status = api_map.LoadApiDef(api_def3);
   ASSERT_EQ(tensorflow::error::FAILED_PRECONDITION, status.code());
 }
+
+TEST(OpGenLibTest, ApiDefUpdateDocs) {
+  const string op_list1 = R"(op {
+  name: "testop"
+  input_arg {
+    name: "arg_a"
+    description: "`arg_a`, `arg_c`, `attr_a`, `testop`"
+  }
+  output_arg {
+    name: "arg_c"
+    description: "`arg_a`, `arg_c`, `attr_a`, `testop`"
+  }
+  attr {
+    name: "attr_a"
+    description: "`arg_a`, `arg_c`, `attr_a`, `testop`"
+  }
+  description: "`arg_a`, `arg_c`, `attr_a`, `testop`"
+}
+)";
+
+  const string api_def1 = R"(
+op {
+  graph_op_name: "testop"
+  endpoint {
+    name: "testop2"
+  }
+  in_arg {
+    name: "arg_a"
+    rename_to: "arg_aa"
+  }
+  out_arg {
+    name: "arg_c"
+    rename_to: "arg_cc"
+    description: "New description: `arg_a`, `arg_c`, `attr_a`, `testop`"
+  }
+  attr {
+    name: "attr_a"
+    rename_to: "attr_aa"
+  }
+}
+)";
+  OpList op_list;
+  protobuf::TextFormat::ParseFromString(op_list1, &op_list);  // NOLINT
+  ApiDefMap api_map(op_list);
+  TF_CHECK_OK(api_map.LoadApiDef(api_def1));
+  api_map.UpdateDocs();
+
+  const string expected_description =
+      "`arg_aa`, `arg_cc`, `attr_aa`, `testop2`";
+  EXPECT_EQ(expected_description, api_map.GetApiDef("testop")->description());
+  EXPECT_EQ(expected_description,
+            api_map.GetApiDef("testop")->in_arg(0).description());
+  EXPECT_EQ("New description: " + expected_description,
+            api_map.GetApiDef("testop")->out_arg(0).description());
+  EXPECT_EQ(expected_description,
+            api_map.GetApiDef("testop")->attr(0).description());
+}
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index da0dc549435a35cb1dec25b9e8e5ddbea7b904b3..3a9a6121c05b02e0f7724dc77adbddca22f0ff19 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -61,7 +61,7 @@ class TensorSliceReaderCacheWrapper;
 }  // namespace checkpoint
 
 class AsyncOpKernel;
-class FunctionCallFrame;
+class CallFrameInterface;
 class FunctionLibraryRuntime;
 class OpKernelConstruction;  // declared below
 class OpKernelContext;       // declared below
@@ -548,7 +548,7 @@ class OpKernelContext {
     FrameAndIter frame_iter;
 
     // Function call supports.
-    FunctionCallFrame* call_frame = nullptr;
+    CallFrameInterface* call_frame = nullptr;
     FunctionLibraryRuntime* function_library = nullptr;
     std::function<void(std::function<void()>)>* runner = nullptr;
     StepStatsCollector* stats_collector = nullptr;
@@ -930,7 +930,7 @@ class OpKernelContext {
   //
   // If this kernel invocation is within a function execution,
   // call_frame() returns the call frame for the function call.
-  FunctionCallFrame* call_frame() const { return params_->call_frame; }
+  CallFrameInterface* call_frame() const { return params_->call_frame; }
 
   // If not nullptr, the kernel invoke functions defined in the
   // library. E.g., CHECK_NOTNULL(function_library())->Run("Foo", ...).
@@ -1492,10 +1492,12 @@ inline void OpOutputList::set_ref(int i, mutex* mu, Tensor* tensor_for_ref) {
 // }
 
 #define OP_REQUIRES(CTX, EXP, STATUS) \
-  if (!TF_PREDICT_TRUE(EXP)) {        \
-    (CTX)->CtxFailure((STATUS));      \
-    return;                           \
-  }
+  do {                                \
+    if (!TF_PREDICT_TRUE(EXP)) {      \
+      (CTX)->CtxFailure((STATUS));    \
+      return;                         \
+    }                                 \
+  } while (0)
 
 #define OP_REQUIRES_OK(CTX, ...)          \
   do {                                    \
@@ -1507,11 +1509,13 @@ inline void OpOutputList::set_ref(int i, mutex* mu, Tensor* tensor_for_ref) {
   } while (0)
 
 #define OP_REQUIRES_ASYNC(CTX, EXP, STATUS, CALLBACK) \
-  if (!TF_PREDICT_TRUE(EXP)) {                        \
-    (CTX)->CtxFailure((STATUS));                      \
-    (CALLBACK)();                                     \
-    return;                                           \
-  }
+  do {                                                \
+    if (!TF_PREDICT_TRUE(EXP)) {                      \
+      (CTX)->CtxFailure((STATUS));                    \
+      (CALLBACK)();                                   \
+      return;                                         \
+    }                                                 \
+  } while (0)
 
 #define OP_REQUIRES_OK_ASYNC(CTX, STATUS, CALLBACK) \
   do {                                              \
diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc
index fe0742e1db5be2725d8f437e01d65f5811af608c..c13f13a126f148fa6d23dcb80c2fae8e8ecbcf3c 100644
--- a/tensorflow/core/framework/shape_inference.cc
+++ b/tensorflow/core/framework/shape_inference.cc
@@ -342,8 +342,8 @@ Status InferenceContext::WithRank(ShapeHandle shape, int64 rank,
     for (int i = 0; i < rank; ++i) {
       dims.push_back(UnknownDim());
     }
-    *out = shape_manager_.MakeShape(dims);
-    return Status::OK();
+    ShapeHandle shp = shape_manager_.MakeShape(dims);
+    return Merge(shape, shp, out);
   }
   *out = nullptr;
 
@@ -357,13 +357,10 @@ Status InferenceContext::WithRankAtLeast(ShapeHandle shape, int64 rank,
     return errors::InvalidArgument("Rank cannot exceed kint32max");
   }
   const int32 existing = Rank(shape);
-  if (existing >= rank) {
+  if (existing >= rank || existing == kUnknownRank) {
     *out = shape;
     return Status::OK();
   }
-  if (existing == kUnknownRank) {
-    return ReturnUnknownShape(out);
-  }
   *out = nullptr;
   return errors::InvalidArgument("Shape must be at least rank ", rank,
                                  " but is rank ", existing);
@@ -375,10 +372,7 @@ Status InferenceContext::WithRankAtMost(ShapeHandle shape, int64 rank,
     return errors::InvalidArgument("Rank cannot exceed kint32max");
   }
   const int32 existing = Rank(shape);
-  if (existing == kUnknownRank) {
-    return ReturnUnknownShape(out);
-  }
-  if (existing <= rank) {
+  if (existing <= rank || existing == kUnknownRank) {
     *out = shape;
     return Status::OK();
   }
@@ -395,23 +389,36 @@ Status InferenceContext::WithValue(DimensionHandle dim, int64 value,
     return Status::OK();
   }
   if (existing == kUnknownDim) {
-    *out = MakeDim(value);
-    return Status::OK();
+    DimensionHandle d = MakeDim(value);
+    return Merge(dim, d, out);
   }
   *out = nullptr;
   return errors::InvalidArgument("Dimension must be ", value, " but is ",
                                  existing);
 }
 
-void InferenceContext::Relax(DimensionHandle d0, DimensionHandle d1,
+void InferenceContext::Relax(DimensionHandle d_old, DimensionHandle d_new,
                              DimensionHandle* out) {
-  if (d0.SameHandle(d1)) {
-    *out = d0;
-  } else if (!ValueKnown(d0) || !ValueKnown(d1)) {
-    *out = UnknownDim();
-  } else if (Value(d0) == Value(d1)) {
-    *out = d0;
+  if (d_old.SameHandle(d_new)) {
+    *out = d_old;
+  } else if (!ValueKnown(d_old) && !ValueKnown(d_new)) {
+    // The node will be fed by the dimension d_new instead of d_old: any
+    // equality assertion between d_old and other input dimension on this node
+    // may not be true anymore, so forget them all.
+    ForgetMerges();
+    // Return the new shape handle to force the relaxation to propagate to the
+    // fanout of the context.
+    *out = d_new;
+  } else if (!ValueKnown(d_new)) {
+    ForgetMerges();
+    *out = d_new;
+  } else if (Value(d_old) == Value(d_new)) {
+    // Return the old shape handle. This will stop the relaxation in the fanout
+    // of the context.
+    *out = d_old;
   } else {
+    // Return a new handle that encodes a different unknown dim.
+    ForgetMerges();
     *out = UnknownDim();
   }
 }
@@ -463,45 +470,48 @@ Status InferenceContext::MergePrefix(ShapeHandle s, ShapeHandle prefix,
   return Status::OK();
 }
 
-void InferenceContext::Relax(ShapeHandle s0, ShapeHandle s1, ShapeHandle* out) {
-  if (s0.SameHandle(s1)) {
-    *out = s0;
+void InferenceContext::Relax(ShapeHandle s_old, ShapeHandle s_new,
+                             ShapeHandle* out) {
+  if (s_old.SameHandle(s_new)) {
+    *out = s_old;
     return;
-  } else if (!RankKnown(s0) || !RankKnown(s1)) {
-    *out = UnknownShape();
+  } else if (!RankKnown(s_new) || !s_old.IsSet()) {
+    ForgetMerges();
+    *out = s_new;
     return;
   }
 
-  const int32 rank = Rank(s0);
-  if (rank != Rank(s1)) {
+  const int32 rank = Rank(s_old);
+  if (rank != Rank(s_new)) {
+    ForgetMerges();
     *out = UnknownShape();
     return;
   }
 
-  bool return_s0 = true;
+  bool return_s_old = true;
   for (int i = 0; i < rank; ++i) {
-    auto d0 = Dim(s0, i);
-    auto d1 = Dim(s1, i);
+    auto d0 = Dim(s_old, i);
+    auto d1 = Dim(s_new, i);
     if (d0.SameHandle(d1)) continue;
 
     auto v0 = Value(d0);
     auto v1 = Value(d1);
     if (v0 == kUnknownDim || v1 == kUnknownDim || v0 != v1) {
-      return_s0 = false;
+      return_s_old = false;
       break;
     }
   }
-  if (return_s0) {
-    *out = s0;
+  if (return_s_old) {
+    *out = s_old;
     return;
   }
 
   // Relax dims.
   std::vector<DimensionHandle> dims(rank);
   for (int i = 0; i < rank; ++i) {
-    // Invariant for relax was checked earlier, so CHECK is ok.
-    Relax(Dim(s0, i), Dim(s1, i), &dims[i]);
+    Relax(Dim(s_old, i), Dim(s_new, i), &dims[i]);
   }
+  ForgetMerges();
   *out = MakeShape(dims);
 }
 
@@ -905,7 +915,7 @@ Status InferenceContext::Add(DimensionHandle first, DimensionOrConstant second,
   if (first_value == 0) {
     *out = MakeDim(second);
   } else if (second_value == 0) {
-    *out = MakeDim(first);
+    *out = first;
   } else if (first_value == kUnknownDim || second_value == kUnknownDim) {
     *out = UnknownDim();
   } else {
@@ -930,7 +940,7 @@ Status InferenceContext::Subtract(DimensionHandle first,
   const int64 second_value = Value(second);
   // Special cases.
   if (second_value == 0) {
-    *out = MakeDim(first);
+    *out = first;
   } else if (first_value == kUnknownDim || second_value == kUnknownDim) {
     *out = UnknownDim();
   } else {
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index b12d37b4c037f0af6bfe99fa6f743daf28c0cc98..4a4ef12635f867fccb594d50a2c9e8f3059ce337 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -62,7 +62,7 @@ class DimensionHandle {
  private:
   DimensionHandle(const Dimension* dim) { ptr_ = dim; }
 
-  const Dimension* operator->() { return ptr_; }
+  const Dimension* operator->() const { return ptr_; }
   bool IsSet() const { return ptr_ != nullptr; }
 
   const Dimension* ptr_ = nullptr;
@@ -104,7 +104,7 @@ class ShapeHandle {
 
  private:
   ShapeHandle(const Shape* shape) { ptr_ = shape; }
-  const Shape* operator->() { return ptr_; }
+  const Shape* operator->() const { return ptr_; }
   bool IsSet() const { return ptr_ != nullptr; }
 
   const Shape* ptr_ = nullptr;
@@ -678,14 +678,17 @@ class InferenceContext {
   // Adds additional context to the given status.
   Status AttachContext(const Status& status);
 
-  // Relaxes <d0> and <d1> and returns the relaxed dimension in <*out>. If <d0>
-  // and <d1> have incompatible values, returns an error.
+  // Relaxes an existing value <d_old> with a new value <d_new> and returns the
+  // relaxed dimension in <*out>. If <d_old> and <d_new> have incompatible
+  // values, returns an error.
   //
-  // Note that <*out> may be set to <d0> or <d1>.
-  void Relax(DimensionHandle d0, DimensionHandle d1, DimensionHandle* out);
-  // Relaxes <s0> and <s1> and returns the relaxed shape in <*out>. See
-  // 'RelaxInput' function for full details and examples.
-  void Relax(ShapeHandle s0, ShapeHandle s1, ShapeHandle* out);
+  // Note that <*out> may be set to <d_old> or <d_new>.
+  void Relax(DimensionHandle d_old, DimensionHandle d_new,
+             DimensionHandle* out);
+  // Relaxes an existing shape <s_old> with a new shape <s_new> and returns the
+  // relaxed shape in <*out>. See 'RelaxInput' function for full details and
+  // examples.
+  void Relax(ShapeHandle s_old, ShapeHandle s_new, ShapeHandle* out);
 
   // Used to implement MergeInputHandleShapesAndTypes and
   // MergeOutputHandleShapesAndTypes.
@@ -698,6 +701,12 @@ class InferenceContext {
       const std::vector<ShapeAndType>& shapes_and_types,
       std::vector<ShapeAndType>* to_update) TF_MUST_USE_RESULT;
 
+  // Forget all the previous merged shapes and dims.
+  void ForgetMerges() {
+    merged_shapes_.clear();
+    merged_dims_.clear();
+  }
+
   ShapeManager shape_manager_;
 
   // inputs_, outputs_, and input_tensors_as_shapes_ refer to values from
diff --git a/tensorflow/core/framework/shape_inference_test.cc b/tensorflow/core/framework/shape_inference_test.cc
index d03cc8ce6dd5f1f70270c05dfaf11f8271a6e45b..a9b63ca60e4574bb0d59c4b939ac157e62f317e8 100644
--- a/tensorflow/core/framework/shape_inference_test.cc
+++ b/tensorflow/core/framework/shape_inference_test.cc
@@ -359,11 +359,11 @@ TEST_F(ShapeInferenceTest, WithRankAtMost) {
   // WithRankAtMost on a shape with unknown dimensionality always succeeds.
   EXPECT_TRUE(c.WithRankAtMost(in0, 1, &s1).ok());
   EXPECT_EQ("?", c.DebugString(s1));
-  EXPECT_FALSE(SameHandle(in0, s1));
+  EXPECT_TRUE(SameHandle(in0, s1));
 
   EXPECT_TRUE(c.WithRankAtMost(in0, 2, &s2).ok());
   EXPECT_EQ("?", c.DebugString(s2));
-  EXPECT_FALSE(SameHandle(s1, s2));
+  EXPECT_TRUE(SameHandle(s1, s2));
 
   // WithRankAtMost on shape with known dimensionality.
   s1 = in1;
@@ -398,11 +398,11 @@ TEST_F(ShapeInferenceTest, WithRankAtLeast) {
   // WithRankAtLeast on a shape with unknown dimensionality always succeeds.
   EXPECT_TRUE(c.WithRankAtLeast(in0, 1, &s1).ok());
   EXPECT_EQ("?", c.DebugString(s1));
-  EXPECT_FALSE(SameHandle(in0, s1));
+  EXPECT_TRUE(SameHandle(in0, s1));
 
   EXPECT_TRUE(c.WithRankAtLeast(in0, 2, &s2).ok());
   EXPECT_EQ("?", c.DebugString(s2));
-  EXPECT_FALSE(SameHandle(s1, s2));
+  EXPECT_TRUE(SameHandle(s1, s2));
 
   // WithRankAtLeast on shape with known dimensionality.
   s1 = in1;
@@ -544,9 +544,10 @@ TEST_F(ShapeInferenceTest, RelaxDim) {
   auto d_unknown_b = c.Dim(c.input(0), 4);
   DimensionHandle out;
 
-  // Relaxing anything with unknown returns a new unknown.
+  // Relaxing anything with unknown returns a new unknown or the existing
+  // unknown.
   Relax(&c, d2, d_unknown, &out);
-  EXPECT_FALSE(SameHandle(d_unknown, out));
+  EXPECT_TRUE(SameHandle(d_unknown, out));
   EXPECT_FALSE(SameHandle(d_unknown_b, out));
   EXPECT_EQ(InferenceContext::kUnknownDim, c.Value(out));
   Relax(&c, d_unknown, d2, &out);
@@ -554,7 +555,7 @@ TEST_F(ShapeInferenceTest, RelaxDim) {
   EXPECT_EQ(InferenceContext::kUnknownDim, c.Value(out));
   Relax(&c, d_unknown, d_unknown_b, &out);
   EXPECT_FALSE(SameHandle(d_unknown, out));
-  EXPECT_FALSE(SameHandle(d_unknown_b, out));
+  EXPECT_TRUE(SameHandle(d_unknown_b, out));
   EXPECT_EQ(InferenceContext::kUnknownDim, c.Value(out));
 
   // Relaxing with self returns self.
@@ -602,7 +603,7 @@ TEST_F(ShapeInferenceTest, RelaxShape) {
   EXPECT_EQ("?", c.DebugString(out));
   Relax(&c, s_unknown, s_unknown_b, &out);
   EXPECT_FALSE(SameHandle(s_unknown, out));
-  EXPECT_FALSE(SameHandle(s_unknown_b, out));
+  EXPECT_TRUE(SameHandle(s_unknown_b, out));
   EXPECT_EQ("?", c.DebugString(out));
 
   // Relaxing with self returns self.
@@ -623,7 +624,7 @@ TEST_F(ShapeInferenceTest, RelaxShape) {
   Relax(&c, s_u_2, s_1_u, &out);
   EXPECT_EQ("[?,?]", c.DebugString(out));
   EXPECT_FALSE(SameHandle(c.Dim(s_u_2, 0), c.Dim(out, 0)));
-  EXPECT_FALSE(SameHandle(c.Dim(s_1_u, 1), c.Dim(out, 1)));
+  EXPECT_TRUE(SameHandle(c.Dim(s_1_u, 1), c.Dim(out, 1)));
   auto s_u1 = c.UnknownShapeOfRank(1);
   auto s_u2 = c.UnknownShapeOfRank(1);
   Relax(&c, s_u1, s_u2, &out);
@@ -637,7 +638,7 @@ TEST_F(ShapeInferenceTest, RelaxShape) {
   EXPECT_EQ("[?,?]", c.DebugString(out));
   out = s_unknown;
   Relax(&c, s_1_3, s_u_2, &out);
-  EXPECT_FALSE(SameHandle(c.Dim(s_u_2, 0), c.Dim(out, 0)));
+  EXPECT_TRUE(SameHandle(c.Dim(s_u_2, 0), c.Dim(out, 0)));
   EXPECT_EQ("[?,?]", c.DebugString(out));
   out = s_unknown;
 
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index 24b7b08ebcb8371dfa5d46c788a3146ca727da3f..4f08cdc1d7c130bd351de7b5f7574ea199977804 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -415,18 +415,10 @@ struct ProtoHelper<qint32> {
 
 template <>
 struct ProtoHelper<bfloat16> {
-  typedef Helper<float>::RepeatedFieldType FieldType;
-  static const bfloat16* Begin(const TensorProto& proto) {
-    // TODO: Isn't this wrong, given that int_val is 32 bits long?
-    return reinterpret_cast<const bfloat16*>(proto.int_val().data());
-  }
-  static size_t NumElements(const TensorProto& proto) {
-    return proto.int_val().size();
-  }
   static void Fill(const bfloat16* data, size_t n, TensorProto* proto) {
-    proto->mutable_int_val()->Reserve(n);
+    proto->mutable_half_val()->Reserve(n);
     for (size_t i = 0; i < n; ++i) {
-      proto->mutable_int_val()->AddAlreadyReserved(data[i].value);
+      proto->mutable_half_val()->AddAlreadyReserved(data[i].value);
     }
   }
 };
@@ -529,9 +521,9 @@ TensorBuffer* FromProtoField<Variant>(Allocator* a, const TensorProto& in,
   return buf;
 }
 
-// fp16 is opaque to the protobuf, so we deserialize these identical to uint16
-// but with data stored in half_val instead of int_val (ie., we don't use
-// ProtoHelper<uint16>).
+// fp16 and bfloat16 are opaque to the protobuf, so we deserialize these
+// identical to uint16 but with data stored in half_val instead of int_val (ie.,
+// we don't use ProtoHelper<uint16>).
 template <>
 TensorBuffer* FromProtoField<Eigen::half>(Allocator* a, const TensorProto& in,
                                           int64 n) {
@@ -556,6 +548,30 @@ TensorBuffer* FromProtoField<Eigen::half>(Allocator* a, const TensorProto& in,
   return buf;
 }
 
+template <>
+TensorBuffer* FromProtoField<bfloat16>(Allocator* a, const TensorProto& in,
+                                       int64 n) {
+  CHECK_GT(n, 0);
+  Buffer<bfloat16>* buf = new Buffer<bfloat16>(a, n);
+  uint16* data = buf->template base<uint16>();
+  if (data == nullptr) {
+    buf->Unref();
+    return nullptr;
+  }
+  const int64 in_n = in.half_val().size();
+  auto begin = in.half_val().begin();
+  if (n <= in_n) {
+    std::copy_n(begin, n, data);
+  } else if (in_n > 0) {
+    std::copy_n(begin, in_n, data);
+    const uint16 last = *(data + in_n - 1);
+    std::fill_n(data + in_n, n - in_n, last);
+  } else {
+    std::fill_n(data, n, 0);
+  }
+  return buf;
+}
+
 // Copies T[n] stored in the buffer "in" into the repeated field in
 // "out" corresponding to type T.
 template <typename T>
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index 3a7df6a4781ba9b1f98a9a9918bfb7ae0b655599..92d10f0d8cf452264885917bc0c897e03527a782 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -42,6 +42,9 @@ class TensorCApi;
 class TensorDescription;
 class TensorProto;
 class VariantTensorData;
+namespace batch_util {
+Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index);
+}  // namespace batch_util
 
 /// @ingroup core
 /// Represents an n-dimensional array of values.
@@ -421,7 +424,8 @@ class Tensor {
   typename TTypes<T, NDIMS>::ConstTensor flat_outer_dims() const;
 
   template <typename T, size_t NDIMS = 3>
-  typename TTypes<T, NDIMS>::ConstTensor flat_inner_outer_dims(int64 begin) const;
+  typename TTypes<T, NDIMS>::ConstTensor flat_inner_outer_dims(
+      int64 begin) const;
 
   /// Render the first `max_entries` values in `*this` into a string.
   string SummarizeValue(int64 max_entries) const;
@@ -461,10 +465,6 @@ class Tensor {
   void CheckTypeAndIsAligned(DataType expected_dtype) const;
   void CheckIsAlignedAndSingleElement() const;
   void set_dtype(DataType t) { shape_.set_data_type(t); }
-  template <size_t NDIMS>
-  void FillDimsAndValidateCompatibleShape(
-      gtl::ArraySlice<int64> new_sizes,
-      Eigen::array<Eigen::DenseIndex, NDIMS>* dims) const;
 
   // TensorShape's InlineVector.
   static gtl::InlinedVector<int64, 4> ComputeFlatInnerDims(
@@ -487,6 +487,9 @@ class Tensor {
   template <typename Device, typename T>
   friend Status PrepareToUpdateVariable(
       OpKernelContext* ctx, Tensor* tensor);  // For access to RefCountIsOne().
+  friend Status batch_util::CopyElementToSlice(
+      Tensor element, Tensor* parent,
+      int64 index);                // For access to RefCountIsOne().
   friend class NumpyTensorBuffer;  // For access to the private constructor
                                    // taking the buffer.
 
@@ -514,8 +517,13 @@ class Tensor {
 
   template <size_t NDIMS>
   void FillDimsAndValidateCompatibleShape(
-      Eigen::array<Eigen::DenseIndex, NDIMS>* dims,
-      gtl::ArraySlice<int64> new_sizes) const;
+      gtl::ArraySlice<int64> new_sizes,
+      Eigen::array<Eigen::DenseIndex, NDIMS>* dims) const;
+
+  template <typename T, size_t NDIMS>
+  void FillDimsAndValidateCompatibleShape(
+      gtl::ArraySlice<int64> new_sizes,
+      Eigen::array<Eigen::DenseIndex, NDIMS>* dims) const;
 };
 
 // Implementation details
@@ -625,12 +633,36 @@ void Tensor::FillDimsAndValidateCompatibleShape(
   CHECK_EQ(new_num_elements, NumElements());
 }
 
+template <typename T, size_t NDIMS>
+void Tensor::FillDimsAndValidateCompatibleShape(
+    gtl::ArraySlice<int64> new_sizes,
+    Eigen::array<Eigen::DenseIndex, NDIMS>* dims) const {
+  CHECK_EQ(NDIMS, new_sizes.size());
+  int64 new_num_elements = 1;
+  for (size_t d = 0; d < NDIMS; d++) {
+    new_num_elements *= new_sizes[d];
+    (*dims)[d] = new_sizes[d];
+  }
+  const int element_size = DataTypeSize(BaseType(dtype()));
+  if (element_size > 0) {
+    CHECK_EQ(new_num_elements * sizeof(T), NumElements() * element_size);
+  } else {
+    // DataTypeSize() returns 0 for some data types. In this case, assume that T
+    // has the same size as the buffer type.
+    // NOTE: If we can be sure that DataTypeSize() does not return 0 for all POD
+    // types, then we should check DataTypeToEnum<T>::v() == dtype(). Or simply
+    // check if `element_size > 0` to err when bit cast is attempted on Tensor
+    // of unknown data type size.
+    CHECK_EQ(new_num_elements, NumElements());
+  }
+}
+
 template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::Tensor Tensor::shaped(
     gtl::ArraySlice<int64> new_sizes) {
   CheckTypeAndIsAligned(DataTypeToEnum<T>::v());
   Eigen::array<Eigen::DenseIndex, NDIMS> dims;
-  FillDimsAndValidateCompatibleShape<NDIMS>(new_sizes, &dims);
+  FillDimsAndValidateCompatibleShape(new_sizes, &dims);
   return typename TTypes<T, NDIMS>::Tensor(base<T>(), dims);
 }
 
@@ -639,7 +671,7 @@ typename TTypes<T, NDIMS>::Tensor Tensor::bit_casted_shaped(
     gtl::ArraySlice<int64> new_sizes) {
   CHECK(IsAligned());
   Eigen::array<Eigen::DenseIndex, NDIMS> dims;
-  FillDimsAndValidateCompatibleShape<NDIMS>(new_sizes, &dims);
+  FillDimsAndValidateCompatibleShape<T>(new_sizes, &dims);
   return typename TTypes<T, NDIMS>::Tensor(base<T>(), dims);
 }
 
@@ -648,29 +680,16 @@ typename TTypes<T, NDIMS>::UnalignedTensor Tensor::unaligned_shaped(
     gtl::ArraySlice<int64> new_sizes) {
   CheckType(DataTypeToEnum<T>::v());
   Eigen::array<Eigen::DenseIndex, NDIMS> dims;
-  FillDimsAndValidateCompatibleShape<NDIMS>(new_sizes, &dims);
+  FillDimsAndValidateCompatibleShape(new_sizes, &dims);
   return typename TTypes<T, NDIMS>::UnalignedTensor(base<T>(), dims);
 }
 
-template <size_t NDIMS>
-void Tensor::FillDimsAndValidateCompatibleShape(
-    Eigen::array<Eigen::DenseIndex, NDIMS>* dims,
-    gtl::ArraySlice<int64> new_sizes) const {
-  CHECK_EQ(NDIMS, new_sizes.size());
-  int64 new_num_elements = 1;
-  for (size_t d = 0; d < NDIMS; d++) {
-    new_num_elements *= new_sizes[d];
-    (*dims)[d] = new_sizes[d];
-  }
-  CHECK_EQ(new_num_elements, NumElements());
-}
-
 template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::ConstTensor Tensor::shaped(
     gtl::ArraySlice<int64> new_sizes) const {
   CheckTypeAndIsAligned(DataTypeToEnum<T>::v());
   Eigen::array<Eigen::DenseIndex, NDIMS> dims;
-  FillDimsAndValidateCompatibleShape(&dims, new_sizes);
+  FillDimsAndValidateCompatibleShape(new_sizes, &dims);
   return typename TTypes<T, NDIMS>::ConstTensor(base<T>(), dims);
 }
 
@@ -679,7 +698,7 @@ typename TTypes<T, NDIMS>::ConstTensor Tensor::bit_casted_shaped(
     gtl::ArraySlice<int64> new_sizes) const {
   CHECK(IsAligned());
   Eigen::array<Eigen::DenseIndex, NDIMS> dims;
-  FillDimsAndValidateCompatibleShape(&dims, new_sizes);
+  FillDimsAndValidateCompatibleShape<T>(new_sizes, &dims);
   return typename TTypes<T, NDIMS>::ConstTensor(base<T>(), dims);
 }
 
@@ -688,7 +707,7 @@ typename TTypes<T, NDIMS>::UnalignedConstTensor Tensor::unaligned_shaped(
     gtl::ArraySlice<int64> new_sizes) const {
   CheckType(DataTypeToEnum<T>::v());
   Eigen::array<Eigen::DenseIndex, NDIMS> dims;
-  FillDimsAndValidateCompatibleShape(&dims, new_sizes);
+  FillDimsAndValidateCompatibleShape(new_sizes, &dims);
   return typename TTypes<T, NDIMS>::UnalignedConstTensor(base<T>(), dims);
 }
 
@@ -716,8 +735,8 @@ typename TTypes<T, NDIMS>::Tensor Tensor::flat_outer_dims() {
 
 template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::Tensor Tensor::flat_inner_outer_dims(int64 begin) {
-  gtl::InlinedVector<int64,4> flat_outer = ComputeFlatOuterDims(
-      shape_.dim_sizes(), begin + NDIMS);
+  gtl::InlinedVector<int64, 4> flat_outer =
+      ComputeFlatOuterDims(shape_.dim_sizes(), begin + NDIMS);
   return shaped<T, NDIMS>(ComputeFlatInnerDims(flat_outer, NDIMS));
 }
 
@@ -732,9 +751,10 @@ typename TTypes<T, NDIMS>::ConstTensor Tensor::flat_outer_dims() const {
 }
 
 template <typename T, size_t NDIMS>
-typename TTypes<T, NDIMS>::ConstTensor Tensor::flat_inner_outer_dims(int64 begin) const {
-  gtl::InlinedVector<int64,4> flat_outer = ComputeFlatOuterDims(
-      shape_.dim_sizes(), begin + NDIMS);
+typename TTypes<T, NDIMS>::ConstTensor Tensor::flat_inner_outer_dims(
+    int64 begin) const {
+  gtl::InlinedVector<int64, 4> flat_outer =
+      ComputeFlatOuterDims(shape_.dim_sizes(), begin + NDIMS);
   return shaped<T, NDIMS>(ComputeFlatInnerDims(flat_outer, NDIMS));
 }
 
diff --git a/tensorflow/core/framework/tensor.proto b/tensorflow/core/framework/tensor.proto
index 6dab325969bacbda15552a79eb3c0862dbde20a1..abbf16e8103326011525feb0017922474ff8d2cf 100644
--- a/tensorflow/core/framework/tensor.proto
+++ b/tensorflow/core/framework/tensor.proto
@@ -40,8 +40,8 @@ message TensorProto {
   // be set.  The values hold the flattened representation of the tensor in
   // row major order.
 
-  // DT_HALF. Note that since protobuf has no int16 type, we'll have some
-  // pointless zero padding for each value here.
+  // DT_HALF, DT_BFLOAT16. Note that since protobuf has no int16 type, we'll
+  // have some pointless zero padding for each value here.
   repeated int32 half_val = 13 [packed = true];
 
   // DT_FLOAT.
diff --git a/tensorflow/core/framework/tensor_shape_test.cc b/tensorflow/core/framework/tensor_shape_test.cc
index 06c576c7d41e5bf48f9db6754e5814142632a371..d8a9c0bac5b950157044dae07771b6733481ac9e 100644
--- a/tensorflow/core/framework/tensor_shape_test.cc
+++ b/tensorflow/core/framework/tensor_shape_test.cc
@@ -359,7 +359,8 @@ Status TensorShapeOld::IsValidShape(const TensorShapeProto& proto) {
   for (const auto& d : proto.dim()) {
     if (d.size() < 0) {
       return errors::InvalidArgument("Shape ", DebugString(proto),
-                                     " has negative dimensions");
+                                     " has negative dimensions; ",
+                                     "perhaps an un-fed placeholder?");
     }
     num_elements *= d.size();
     if (num_elements > kMaxElements) {
diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc
index 47ff29fbe1a4d118e52c8faaa04019f88db0e1ae..14828804285a8115dd49f596b4aea38f7f6af1ff 100644
--- a/tensorflow/core/framework/tensor_test.cc
+++ b/tensorflow/core/framework/tensor_test.cc
@@ -175,6 +175,28 @@ void TestCopies(const Tensor& t) {
   }
 }
 
+TEST(Tensor_Half, Simple) {
+  Tensor t(DT_HALF, TensorShape({5, 7}));
+  EXPECT_TRUE(t.shape().IsSameSize(TensorShape({5, 7})));
+  for (int64 a = 0; a < t.shape().dim_size(0); a++) {
+    for (int64 b = 0; b < t.shape().dim_size(1); b++) {
+      t.matrix<Eigen::half>()(a, b) = static_cast<Eigen::half>(a * b);
+    }
+  }
+  TestCopies<Eigen::half>(t);
+}
+
+TEST(Tensor_Bfloat16, Simple) {
+  Tensor t(DT_BFLOAT16, TensorShape({5, 7}));
+  EXPECT_TRUE(t.shape().IsSameSize(TensorShape({5, 7})));
+  for (int64 a = 0; a < t.shape().dim_size(0); a++) {
+    for (int64 b = 0; b < t.shape().dim_size(1); b++) {
+      t.matrix<bfloat16>()(a, b) = static_cast<bfloat16>(a * b);
+    }
+  }
+  TestCopies<bfloat16>(t);
+}
+
 TEST(Tensor_Float, Simple) {
   Tensor t(DT_FLOAT, TensorShape({10, 20}));
   EXPECT_TRUE(t.shape().IsSameSize(TensorShape({10, 20})));
@@ -334,41 +356,126 @@ class TensorReshapeTest : public ::testing::Test {
     tensor(0, 0, 0, 0) = 0.01f;
     tensor(1, 2, 3, 4) = 0.02f;
   }
-};
 
-TEST_F(TensorReshapeTest, Reshape) {
-  LOG(INFO) << "shaped";
-  {
-    auto shaped = t.shaped<float, 1>({120});
-    EXPECT_EQ(120, shaped.dimension(0));
-    EXPECT_EQ(shaped(0), 0.01f);
-    EXPECT_EQ(shaped(119), 0.02f);
-  }
-  {
-    auto shaped = t.shaped<float, 2>({6, 20});
-    EXPECT_EQ(6, shaped.dimension(0));
-    EXPECT_EQ(20, shaped.dimension(1));
-    EXPECT_EQ(shaped(0, 0), 0.01f);
-    EXPECT_EQ(shaped(5, 19), 0.02f);
+  template <typename T>
+  using ReshapeFunc = T (Tensor::*)(gtl::ArraySlice<int64>);
+  template <typename T>
+  using ConstReshapeFunc = T (Tensor::*)(gtl::ArraySlice<int64>) const;
+
+  template <typename T, ReshapeFunc<T> Func>
+  void TestReshape(std::initializer_list<int64> sizes) {
+    T shaped = (t.*Func)(sizes);
+    TestReshapeImpl(shaped, sizes);
   }
-  {
-    auto shaped = t.shaped<float, 3>({6, 4, 5});
-    EXPECT_EQ(6, shaped.dimension(0));
-    EXPECT_EQ(4, shaped.dimension(1));
-    EXPECT_EQ(5, shaped.dimension(2));
-    EXPECT_EQ(shaped(0, 0, 0), 0.01f);
-    EXPECT_EQ(shaped(5, 3, 4), 0.02f);
+
+  template <typename T, ConstReshapeFunc<T> Func>
+  void TestReshape(std::initializer_list<int64> sizes) {
+    T shaped = (static_cast<const Tensor&>(t).*Func)(sizes);
+    TestReshapeImpl(shaped, sizes);
   }
-  {
-    auto shaped = t.shaped<float, 4>({2, 3, 4, 5});
-    EXPECT_EQ(2, shaped.dimension(0));
-    EXPECT_EQ(3, shaped.dimension(1));
-    EXPECT_EQ(4, shaped.dimension(2));
-    EXPECT_EQ(5, shaped.dimension(3));
 
-    EXPECT_EQ(shaped(0, 0, 0, 0), 0.01f);
-    EXPECT_EQ(shaped(1, 2, 3, 4), 0.02f);
+  template <typename T>
+  void TestReshapeImpl(T shaped, std::initializer_list<int64> sizes) {
+    auto iter = sizes.begin();
+    for (int i = 0; i < shaped.rank(); ++i, ++iter) {
+      EXPECT_EQ(*iter, shaped.dimension(i));
+    }
+
+    using Index = typename T::Index;
+    using Scalar = typename T::Scalar;
+    constexpr int N = T::NumIndices;
+
+    // To handle the cast when `shaped` is bit casted into a different type.
+    const float expected_first = 0.01f;
+    Eigen::DSizes<Index, N> coord;
+    EXPECT_EQ(shaped(coord), *reinterpret_cast<const Scalar*>(&expected_first));
+
+    for (int i = 0; i < N; ++i) {
+      coord[i] = shaped.dimension(i) - 1;
+    }
+    const float expected_last = 0.02f;
+    constexpr int kNumScalarPerFloat =
+        sizeof(float) / sizeof(Scalar);  // Assuming even divide.
+    EXPECT_EQ(shaped(coord), reinterpret_cast<const Scalar*>(
+                                 &expected_last)[kNumScalarPerFloat - 1]);
   }
+};
+
+TEST_F(TensorReshapeTest, Reshape) {
+  LOG(INFO) << "shaped";
+
+#define TEST_RESHAPE(...)                                                  \
+  {                                                                        \
+    constexpr int N = (sizeof((int[]){__VA_ARGS__}) / sizeof(int));        \
+    TestReshape<TTypes<float, N>::Tensor, &Tensor::shaped<float, N>>(      \
+        {__VA_ARGS__});                                                    \
+    TestReshape<TTypes<float, N>::ConstTensor, &Tensor::shaped<float, N>>( \
+        {__VA_ARGS__});                                                    \
+    TestReshape<TTypes<float, N>::UnalignedTensor,                         \
+                &Tensor::unaligned_shaped<float, N>>({__VA_ARGS__});       \
+    TestReshape<TTypes<float, N>::UnalignedConstTensor,                    \
+                &Tensor::unaligned_shaped<float, N>>({__VA_ARGS__});       \
+    TestReshape<TTypes<float, N>::Tensor,                                  \
+                &Tensor::bit_casted_shaped<float, N>>({__VA_ARGS__});      \
+    TestReshape<TTypes<float, N>::ConstTensor,                             \
+                &Tensor::bit_casted_shaped<float, N>>({__VA_ARGS__});      \
+    TestReshape<TTypes<int32, N>::Tensor,                                  \
+                &Tensor::bit_casted_shaped<int32, N>>({__VA_ARGS__});      \
+    TestReshape<TTypes<int32, N>::ConstTensor,                             \
+                &Tensor::bit_casted_shaped<int32, N>>({__VA_ARGS__});      \
+  }
+
+  TEST_RESHAPE(120);
+  TEST_RESHAPE(6, 20);
+  TEST_RESHAPE(6, 4, 5);
+  TEST_RESHAPE(2, 3, 4, 5);
+#undef TEST_RESHAPE
+}
+
+TEST_F(TensorReshapeTest, BitcastReshapeDifferentSize) {
+#define TEST_BITCAST8_RESHAPE(...)                                    \
+  {                                                                   \
+    constexpr int N = (sizeof((int[]){__VA_ARGS__}) / sizeof(int));   \
+    TestReshape<TTypes<uint8, N>::Tensor,                             \
+                &Tensor::bit_casted_shaped<uint8, N>>({__VA_ARGS__}); \
+  }
+
+  TEST_BITCAST8_RESHAPE(480);
+  TEST_BITCAST8_RESHAPE(24, 20);
+  TEST_BITCAST8_RESHAPE(6, 16, 5);
+  TEST_BITCAST8_RESHAPE(2, 3, 4, 20);
+#undef TEST_BITCAST8_RESHAPE
+#define TEST_BITCAST16_RESHAPE(...)                                   \
+  {                                                                   \
+    constexpr int N = (sizeof((int[]){__VA_ARGS__}) / sizeof(int));   \
+    TestReshape<TTypes<int16, N>::Tensor,                             \
+                &Tensor::bit_casted_shaped<int16, N>>({__VA_ARGS__}); \
+  }
+
+  TEST_BITCAST16_RESHAPE(240);
+  TEST_BITCAST16_RESHAPE(6, 40);
+  TEST_BITCAST16_RESHAPE(12, 4, 5);
+  TEST_BITCAST16_RESHAPE(2, 3, 8, 5);
+  TEST_BITCAST16_RESHAPE(2, 3, 4, 1, 10);
+#undef TEST_BITCAST16_RESHAPE
+}
+
+TEST_F(TensorReshapeTest, ReshapeError) {
+  EXPECT_DEATH((t.shaped<float, 0>({})), "1 vs. 120");
+  EXPECT_DEATH((t.shaped<float, 1>({119})), "119 vs. 120");
+  EXPECT_DEATH((t.shaped<float, 4>({2, 3, 4, 6})), "144 vs. 120");
+
+  EXPECT_DEATH((t.unaligned_shaped<float, 0>({})), "1 vs. 120");
+  EXPECT_DEATH((t.unaligned_shaped<float, 1>({119})), "119 vs. 120");
+  EXPECT_DEATH((t.unaligned_shaped<float, 4>({2, 3, 4, 6})), "144 vs. 120");
+
+  EXPECT_DEATH((t.bit_casted_shaped<float, 0>({})), "4 vs. 480");
+  EXPECT_DEATH((t.bit_casted_shaped<float, 1>({119})), "476 vs. 480");
+  EXPECT_DEATH((t.bit_casted_shaped<float, 4>({2, 3, 4, 6})), "576 vs. 480");
+
+  Tensor string_tensor{DT_STRING, {10}};
+  // Note that the error message compare # of elements, not # of bytes.
+  EXPECT_DEATH((string_tensor.bit_casted_shaped<string, 1>({9})), "9 vs. 10");
 }
 
 TEST_F(TensorReshapeTest, Flat) {
diff --git a/tensorflow/core/framework/types.cc b/tensorflow/core/framework/types.cc
index faae19585d9dd2bc5f351772af93723daaa3b8be..58354d6f4edea1f29ba033f2579324d400a532ab 100644
--- a/tensorflow/core/framework/types.cc
+++ b/tensorflow/core/framework/types.cc
@@ -206,18 +206,18 @@ string DataTypeSliceString(const DataTypeSlice types) {
 }
 
 DataTypeVector AllTypes() {
-  return {DT_FLOAT,   DT_DOUBLE, DT_INT32,  DT_UINT8,     DT_INT16,
-          DT_UINT16,  DT_INT8,   DT_STRING, DT_COMPLEX64, DT_COMPLEX128,
-          DT_INT64,   DT_BOOL,   DT_QINT8,  DT_QUINT8,    DT_QINT16,
-          DT_QUINT16, DT_QINT32, DT_HALF,   DT_RESOURCE,  DT_VARIANT,
-          DT_UINT32,  DT_UINT64};
+  return {DT_FLOAT,   DT_DOUBLE, DT_INT32,   DT_UINT8,     DT_INT16,
+          DT_UINT16,  DT_INT8,   DT_STRING,  DT_COMPLEX64, DT_COMPLEX128,
+          DT_INT64,   DT_BOOL,   DT_QINT8,   DT_QUINT8,    DT_QINT16,
+          DT_QUINT16, DT_QINT32, DT_HALF,    DT_RESOURCE,  DT_VARIANT,
+          DT_UINT32,  DT_UINT64, DT_BFLOAT16};
 }
 
 #if !defined(IS_MOBILE_PLATFORM) || defined(SUPPORT_SELECTIVE_REGISTRATION)
 
 DataTypeVector RealNumberTypes() {
-  return {DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64,  DT_UINT8, DT_INT16,
-          DT_INT8,  DT_UINT16, DT_HALF,  DT_UINT32, DT_UINT64};
+  return {DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64,  DT_UINT8,  DT_INT16,
+          DT_INT8,  DT_UINT16, DT_HALF,  DT_UINT32, DT_UINT64, DT_BFLOAT16};
 }
 
 DataTypeVector QuantizedTypes() {
@@ -227,14 +227,14 @@ DataTypeVector QuantizedTypes() {
 DataTypeVector RealAndQuantizedTypes() {
   return {DT_FLOAT,  DT_DOUBLE,  DT_INT32,  DT_INT64, DT_UINT8,
           DT_UINT16, DT_UINT16,  DT_INT8,   DT_QINT8, DT_QUINT8,
-          DT_QINT16, DT_QUINT16, DT_QINT32, DT_HALF};
+          DT_QINT16, DT_QUINT16, DT_QINT32, DT_HALF,  DT_BFLOAT16};
 }
 
 DataTypeVector NumberTypes() {
-  return {DT_FLOAT,     DT_DOUBLE,     DT_INT64,  DT_INT32,
-          DT_UINT8,     DT_UINT16,     DT_INT16,  DT_INT8,
-          DT_COMPLEX64, DT_COMPLEX128, DT_QINT8,  DT_QUINT8,
-          DT_QINT32,    DT_HALF,       DT_UINT32, DT_UINT64};
+  return {DT_FLOAT,  DT_DOUBLE,  DT_INT64,  DT_INT32,     DT_UINT8,
+          DT_UINT16, DT_INT16,   DT_INT8,   DT_COMPLEX64, DT_COMPLEX128,
+          DT_QINT8,  DT_QUINT8,  DT_QINT32, DT_HALF,      DT_UINT32,
+          DT_UINT64, DT_BFLOAT16};
 }
 
 #elif defined(__ANDROID_TYPES_FULL__)
@@ -306,6 +306,40 @@ bool DataTypeCanUseMemcpy(DataType dt) {
   }
 }
 
+bool DataTypeAlwaysOnHost(DataType dt) {
+  // Includes DT_STRING and DT_RESOURCE.
+  switch (dt) {
+    case DT_STRING:
+    case DT_STRING_REF:
+    case DT_RESOURCE:
+      return true;
+    default:
+      return false;
+  }
+}
+
+bool DataTypeIsFloating(DataType dt) {
+  switch (dt) {
+    case DT_HALF:
+    case DT_BFLOAT16:
+    case DT_FLOAT:
+    case DT_DOUBLE:
+      return true;
+    default:
+      return false;
+  }
+}
+
+bool DataTypeIsComplex(DataType dt) {
+  switch (dt) {
+    case DT_COMPLEX64:
+    case DT_COMPLEX128:
+      return true;
+    default:
+      return false;
+  }
+}
+
 bool DataTypeIsQuantized(DataType dt) {
   switch (dt) {
     case DT_QINT8:
diff --git a/tensorflow/core/framework/types.h b/tensorflow/core/framework/types.h
index dc53ed41780d90448872b1bd98e97f5e16d49592..27005c0e93267ff4f91d470a011be6d673fe8cc2 100644
--- a/tensorflow/core/framework/types.h
+++ b/tensorflow/core/framework/types.h
@@ -222,6 +222,12 @@ static_assert(IsValidDataType<int32>::value, "Incorrect impl for int32");
 
 bool DataTypeCanUseMemcpy(DataType dt);
 
+// Returns true iff 'dt' is a real, non-quantized floating point type.
+bool DataTypeIsFloating(DataType dt);
+
+// Returns true iff 'dt' is a complex type.
+bool DataTypeIsComplex(DataType dt);
+
 bool DataTypeIsQuantized(DataType dt);
 
 // Is the dtype nonquantized integral?
@@ -233,6 +239,11 @@ bool DataTypeIsUnsigned(DataType dt);
 // Returns a 0 on failure
 int DataTypeSize(DataType dt);
 
+// Types that always sit on host: DT_STRING, DT_STRING_REF, DT_RESOURCE.
+// For DT_RESOURCE, the handle always sits on host (even if the underlying
+// object has device-allocated resources).
+bool DataTypeAlwaysOnHost(DataType dt);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_FRAMEWORK_TYPES_H_
diff --git a/tensorflow/core/framework/types_test.cc b/tensorflow/core/framework/types_test.cc
index bc57740469f96fee28de1cea8920cc0431511db1..5ddc9865633623561760bbcb06d1edf4eecec7a6 100644
--- a/tensorflow/core/framework/types_test.cc
+++ b/tensorflow/core/framework/types_test.cc
@@ -130,6 +130,13 @@ TEST(TypesTest, QuantizedTypes) {
   EXPECT_FALSE(DataTypeIsQuantized(DT_BFLOAT16));
 }
 
+TEST(TypesTest, ComplexTypes) {
+  EXPECT_TRUE(DataTypeIsComplex(DT_COMPLEX64));
+  EXPECT_TRUE(DataTypeIsComplex(DT_COMPLEX128));
+  EXPECT_FALSE(DataTypeIsComplex(DT_FLOAT));
+  EXPECT_FALSE(DataTypeIsComplex(DT_DOUBLE));
+}
+
 TEST(TypesTest, IntegerTypes) {
   for (auto dt : AllTypes()) {
     const string name = DataTypeString(dt);
diff --git a/tensorflow/core/framework/variant_encode_decode.h b/tensorflow/core/framework/variant_encode_decode.h
index 09ebf6257bdffc314e09a124db70e33801ae338d..5a84f9d94385a7048a0f4adfe78e1805b367f02d 100644
--- a/tensorflow/core/framework/variant_encode_decode.h
+++ b/tensorflow/core/framework/variant_encode_decode.h
@@ -233,6 +233,7 @@ void EncodeVariant(const T& value, string* buf) {
   VariantTensorData data;
   EncodeVariantImpl(value, TypeResolver<T>(), &data);
   data.set_type_name(TypeNameVariant(value));
+  DCHECK(buf != nullptr);
   data.SerializeToString(buf);
 }
 
diff --git a/tensorflow/core/framework/variant_op_registry.h b/tensorflow/core/framework/variant_op_registry.h
index 831dbd3dffe33db3b5fab2ca8feb4225121bc0c7..13f6908cae1ed1b1964bf827dce0fcb2bee4e6d1 100644
--- a/tensorflow/core/framework/variant_op_registry.h
+++ b/tensorflow/core/framework/variant_op_registry.h
@@ -145,9 +145,8 @@ class UnaryVariantOpRegistry {
   static std::unordered_set<string>* PersistentStringStorage();
 
  private:
-  std::unordered_map<StringPiece, VariantShapeFn, StringPiece::Hasher>
-      shape_fns;
-  std::unordered_map<StringPiece, VariantDecodeFn, StringPiece::Hasher>
+  std::unordered_map<StringPiece, VariantShapeFn, StringPieceHasher> shape_fns;
+  std::unordered_map<StringPiece, VariantDecodeFn, StringPieceHasher>
       decode_fns;
 
   // Map std::pair<Direction, type_name> to function.
@@ -159,7 +158,7 @@ class UnaryVariantOpRegistry {
       ret = Hash64Combine(ret, sp_hasher_(std::get<1>(x)));
       return ret;
     }
-    StringPiece::Hasher sp_hasher_;
+    StringPieceHasher sp_hasher_;
   };
 
   std::unordered_map<std::pair<VariantDeviceCopyDirection, StringPiece>,
@@ -177,7 +176,7 @@ class UnaryVariantOpRegistry {
       ret = Hash64Combine(ret, sp_hasher_(std::get<2>(x)));
       return ret;
     }
-    StringPiece::Hasher sp_hasher_;
+    StringPieceHasher sp_hasher_;
   };
   std::unordered_map<std::tuple<VariantUnaryOp, StringPiece, StringPiece>,
                      VariantUnaryOpFn, TupleHash>
diff --git a/tensorflow/core/graph/algorithm.cc b/tensorflow/core/graph/algorithm.cc
index 6ef51aa7dfcd48f840f80040f068a766a33ff5bf..4652fbe40691a01e0567c7df2fba0ca2ea482fe1 100644
--- a/tensorflow/core/graph/algorithm.cc
+++ b/tensorflow/core/graph/algorithm.cc
@@ -83,13 +83,16 @@ void ReverseDFS(const Graph& g, const std::function<void(Node*)>& enter,
   ReverseDFSFrom(g, {g.sink_node()}, enter, leave, stable_comparator);
 }
 
-void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
-                    const std::function<void(Node*)>& enter,
-                    const std::function<void(Node*)>& leave,
-                    const NodeComparator& stable_comparator) {
+namespace {
+
+template <typename T>
+void ReverseDFSFromHelper(const Graph& g, gtl::ArraySlice<T> start,
+                          const std::function<void(T)>& enter,
+                          const std::function<void(T)>& leave,
+                          const NodeComparator& stable_comparator) {
   // Stack of work to do.
   struct Work {
-    Node* node;
+    T node;
     bool leave;  // Are we entering or leaving n?
   };
   std::vector<Work> stack(start.size());
@@ -102,7 +105,7 @@ void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
     Work w = stack.back();
     stack.pop_back();
 
-    Node* n = w.node;
+    T n = w.node;
     if (w.leave) {
       leave(n);
       continue;
@@ -117,7 +120,7 @@ void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
 
     gtl::iterator_range<NeighborIter> nodes = n->in_nodes();
 
-    auto add_work = [&visited, &stack](Node* out) {
+    auto add_work = [&visited, &stack](T out) {
       if (!visited[out->id()]) {
         // Note; we must not mark as visited until we actually process it.
         stack.push_back(Work{out, false});
@@ -125,22 +128,38 @@ void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
     };
 
     if (stable_comparator) {
-      std::vector<Node*> nodes_sorted;
-      for (Node* in : nodes) {
+      std::vector<T> nodes_sorted;
+      for (T in : nodes) {
         nodes_sorted.emplace_back(in);
       }
       std::sort(nodes_sorted.begin(), nodes_sorted.end(), stable_comparator);
-      for (Node* in : nodes_sorted) {
+      for (T in : nodes_sorted) {
         add_work(in);
       }
     } else {
-      for (Node* in : nodes) {
+      for (T in : nodes) {
         add_work(in);
       }
     }
   }
 }
 
+}  // namespace
+
+void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<const Node*> start,
+                    const std::function<void(const Node*)>& enter,
+                    const std::function<void(const Node*)>& leave,
+                    const NodeComparator& stable_comparator) {
+  ReverseDFSFromHelper(g, start, enter, leave, stable_comparator);
+}
+
+void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
+                    const std::function<void(Node*)>& enter,
+                    const std::function<void(Node*)>& leave,
+                    const NodeComparator& stable_comparator) {
+  ReverseDFSFromHelper(g, start, enter, leave, stable_comparator);
+}
+
 void GetPostOrder(const Graph& g, std::vector<Node*>* order,
                   const NodeComparator& stable_comparator) {
   order->clear();
diff --git a/tensorflow/core/graph/algorithm.h b/tensorflow/core/graph/algorithm.h
index 5bb6041d98b6aebd3036b68fffeed32afda85e50..ac4a099013b67e0d256a9310495e4b585eb40e0a 100644
--- a/tensorflow/core/graph/algorithm.h
+++ b/tensorflow/core/graph/algorithm.h
@@ -69,6 +69,10 @@ extern void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
                            const std::function<void(Node*)>& enter,
                            const std::function<void(Node*)>& leave,
                            const NodeComparator& stable_comparator = {});
+extern void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<const Node*> start,
+                           const std::function<void(const Node*)>& enter,
+                           const std::function<void(const Node*)>& leave,
+                           const NodeComparator& stable_comparator = {});
 
 // Stores in *order the post-order numbering of all nodes
 // in graph found via a depth first search starting at the source node.
diff --git a/tensorflow/core/graph/costmodel.h b/tensorflow/core/graph/costmodel.h
index a908a4843ca0a3fadeca088f8019d2a1cb228cb4..8afa4971ad054b31eeb63d0dadaa1a2937c47a6e 100644
--- a/tensorflow/core/graph/costmodel.h
+++ b/tensorflow/core/graph/costmodel.h
@@ -30,7 +30,7 @@ limitations under the License.
 #include "tensorflow/core/platform/protobuf.h"
 
 namespace tensorflow {
-typedef std::unordered_map<StringPiece, int32, StringPiece::Hasher>
+typedef std::unordered_map<StringPiece, int32, StringPieceHasher>
     NodeNameToCostIdMap;
 
 class StepStats;
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index d0dba6e1f0198b853c66e43ea7c8baa409390cdb..b620127d9072a845721f97112f4bad107412b06f 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -455,7 +455,6 @@ class Graph {
   // the corresponding NodeDef to reflect the change.
   // REQUIRES: The control edge must exist.
   void RemoveControlEdge(const Edge* e);
-  
   // Updates the input to a node.  The existing edge to `dst` is removed and an
   // edge from `new_src` to `dst` is created. The NodeDef associated with `dst`
   // is also updated.
diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc
index e45828b7ba0d580f31b271e791fe6ecfbf20175d..e19f4aebba97ac45c21630383e92ace30d6b31f5 100644
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@@ -77,6 +77,7 @@ class GraphConstructor {
                      ? in.prefix
                      : in.prefix + "/"),
           uniquify_names(in.uniquify_names),
+          uniquify_prefix(in.uniquify_prefix),
           input_map(in.input_map),
           skip_mapped_nodes(in.skip_mapped_nodes),
           control_dependencies(in.control_dependencies),
@@ -90,6 +91,7 @@ class GraphConstructor {
 
     string prefix;
     bool uniquify_names;
+    bool uniquify_prefix;
     std::map<TensorId, TensorId> input_map;
     bool skip_mapped_nodes;
     std::vector<string> control_dependencies;
@@ -111,20 +113,20 @@ class GraphConstructor {
   typedef gtl::ArraySlice<const NodeDef*> NodeDefSlice;
 
   // versions and library may be nullptr
-  static Status Construct(const Options& opts, NodeDefSlice node_defs,
-                          const VersionDef* versions,
-                          const FunctionDefLibrary* library, Graph* g,
-                          ShapeRefiner* refiner,
-                          std::vector<std::pair<Node*, int>>* return_tensors,
-                          std::vector<Node*>* return_nodes,
-                          std::vector<TensorId>* unused_input_map_keys) {
+  static Status Construct(
+      const Options& opts, NodeDefSlice node_defs, const VersionDef* versions,
+      const FunctionDefLibrary* library, Graph* g, ShapeRefiner* refiner,
+      std::vector<std::pair<Node*, int>>* return_tensors,
+      std::vector<Node*>* return_nodes,
+      std::vector<TensorId>* missing_unused_input_map_keys) {
     if (versions) {
       TF_RETURN_IF_ERROR(CheckVersions(*versions, TF_GRAPH_DEF_VERSION,
                                        TF_GRAPH_DEF_VERSION_MIN_PRODUCER,
                                        "GraphDef", "graph"));
     }
     GraphConstructor c(opts, node_defs, versions, library, g, refiner,
-                       return_tensors, return_nodes, unused_input_map_keys);
+                       return_tensors, return_nodes,
+                       missing_unused_input_map_keys);
     const Status s = c.TryImport();
     if (!s.ok()) c.Undo();
     return s;
@@ -137,17 +139,18 @@ class GraphConstructor {
                    ShapeRefiner* refiner,
                    std::vector<std::pair<Node*, int>>* return_tensors,
                    std::vector<Node*>* return_nodes,
-                   std::vector<TensorId>* unused_input_map_keys)
+                   std::vector<TensorId>* missing_unused_input_map_keys)
       : opts_(opts),
         node_defs_(node_defs),
         versions_(versions),
         library_(library),
         g_(g),
         original_versions_(g->versions()),
+        prefix_(opts.prefix),
         refiner_(refiner),
         return_tensors_(return_tensors),
         return_nodes_(return_nodes),
-        unused_input_map_keys_(unused_input_map_keys) {}
+        missing_unused_input_map_keys_(missing_unused_input_map_keys) {}
 
   Status TryImport() {
     TF_RETURN_IF_ERROR(EnsureNoNameCollisions());
@@ -159,6 +162,8 @@ class GraphConstructor {
     TF_RETURN_IF_ERROR(UpdateVersionDef());
     TF_RETURN_IF_ERROR(PopulateReturnTensors());
     TF_RETURN_IF_ERROR(PopulateReturnNodes());
+    TF_RETURN_IF_ERROR(PopulateMissingUnusedInputMapKeys());
+    UpdateUniquifiedColocationNames();
     FixupSourceAndSinkEdges(g_);
     return Status::OK();
   }
@@ -172,6 +177,7 @@ class GraphConstructor {
   Status UpdateVersionDef();
   Status PopulateReturnTensors();
   Status PopulateReturnNodes();
+  Status PopulateMissingUnusedInputMapKeys();
 
   void Undo();
 
@@ -201,9 +207,18 @@ class GraphConstructor {
   void UniquifyNames(const std::vector<bool>& input_already_exists,
                      NodeDef* node_def);
 
+  // Updates any constructed nodes' colocation group names if the name has been
+  // updated by UniquifyNames. This is called after all the nodes have been
+  // constructed so all the names have been uniquified if necessary.
+  void UpdateUniquifiedColocationNames();
+
   // Returns true if `name` already exists in `g_` (either as a node name or
   // prefix).
-  bool NameExists(StringPiece name);
+  bool NameExistsInGraph(StringPiece name);
+
+  // Returns true if `name` already exists in the GraphDef being imported
+  // (either as a node name or prefix).
+  bool NameExistsInGraphDef(StringPiece name);
 
   // Returns a unique version of `original_name`, or `original_name` if it's
   // already unique in the graph.
@@ -217,6 +232,9 @@ class GraphConstructor {
   Graph* g_;
   const VersionDef original_versions_;
 
+  // A copy of opts_.prefix, possibly uniquified.
+  string prefix_;
+
   ShapeRefiner* refiner_;
 
   // May be null. Not owned.
@@ -226,9 +244,10 @@ class GraphConstructor {
   std::vector<Node*>* return_nodes_;
 
   // May be null. Not owned.
-  std::vector<TensorId>* unused_input_map_keys_;
+  std::vector<TensorId>* missing_unused_input_map_keys_;
 
-  // Intermediate datastructure used to populate `unused_input_map_keys_`.
+  // Intermediate datastructure used to populate
+  // `missing_unused_input_map_keys_`.
   std::set<TensorId> used_input_map_keys_;
 
   // Mapping from node name to the index within node_defs_.
@@ -241,13 +260,16 @@ class GraphConstructor {
   };
   // TODO(vrv): Profile this data structure to see if we should use an
   // alternative implementation of std::unordered_map.
-  std::unordered_map<StringPiece, NodeInfo, StringPiece::Hasher> gdef_nodes_;
+  std::unordered_map<StringPiece, NodeInfo, StringPieceHasher> gdef_nodes_;
+
+  // Prefixes already used in the GraphDef being imported.
+  std::unordered_set<StringPiece, StringPieceHasher> gdef_prefixes_;
 
   // Mapping from node name to the existing node in g_.
-  std::unordered_map<StringPiece, Node*, StringPiece::Hasher> existing_nodes_;
+  std::unordered_map<StringPiece, Node*, StringPieceHasher> existing_nodes_;
 
   // Prefixes already used in the graph.
-  std::unordered_set<StringPiece, StringPiece::Hasher> existing_prefixes_;
+  std::unordered_set<StringPiece, StringPieceHasher> existing_prefixes_;
 
   // Imported node names that have been uniquified. The key is the original
   // name, the value is the new unique name.
@@ -305,6 +327,16 @@ bool NodeNameInValues(const std::vector<string>& control_dependencies,
                    node_name) != control_dependencies.end();
 }
 
+// Adds any prefixes of `node_name` (not including the full name itself) to
+// `prefixes`.
+void AddPrefixes(StringPiece node_name,
+                 std::unordered_set<StringPiece, StringPieceHasher>* prefixes) {
+  size_t idx = -1;
+  while ((idx = node_name.find('/', idx + 1)) != StringPiece::npos) {
+    prefixes->insert(node_name.substr(0, idx));
+  }
+}
+
 Status GraphConstructor::EnsureNoNameCollisions() {
   existing_nodes_.reserve(g_->num_nodes());
   // Populate existing_nodes_ and existing_prefixes_.
@@ -323,34 +355,32 @@ Status GraphConstructor::EnsureNoNameCollisions() {
             n->name(), "'");
       }
     }
-    // Add all of node's prefixes to existing_prefixes_ (if it has any).
-    size_t idx = -1;
-    while ((idx = n->name().find('/', idx + 1)) != string::npos) {
-      StringPiece name(n->name());
-      existing_prefixes_.insert(name.substr(0, idx));
-    }
+    AddPrefixes(n->name(), &existing_prefixes_);
   }
-  if (opts_.prefix.empty() && opts_.importing && !opts_.uniquify_names) {
+  if (prefix_.empty() && opts_.importing && !opts_.uniquify_names) {
     for (const NodeDef* n : node_defs_) {
       const string& name = n->name();
-      if (NameExists(name)) {
+      if (NameExistsInGraph(name)) {
         return errors::InvalidArgument("Node name '", name,
                                        "' already exists in the Graph");
       }
     }
-  } else if (!opts_.prefix.empty()) {
-    StringPiece prefix_no_slash(opts_.prefix);
+  } else if (!prefix_.empty()) {
+    StringPiece prefix_no_slash(prefix_);
     prefix_no_slash.remove_suffix(1);
     if (!IsValidNodeName(prefix_no_slash, false)) {
-      return errors::InvalidArgument("Imported node name prefix '",
-                                     opts_.prefix,
+      return errors::InvalidArgument("Imported node name prefix '", prefix_,
                                      "' would lead to invalid node names");
     }
-    if (NameExists(prefix_no_slash)) {
-      return errors::InvalidArgument("Import node name prefix '",
-                                     prefix_no_slash,
-                                     "' conflicts with "
-                                     "name already used in the graph");
+    if (NameExistsInGraph(prefix_no_slash)) {
+      if (opts_.uniquify_prefix) {
+        prefix_ = strings::StrCat(FindUniqueName(prefix_no_slash), "/");
+      } else {
+        return errors::InvalidArgument("Import node name prefix '",
+                                       prefix_no_slash,
+                                       "' conflicts with "
+                                       "name already used in the graph");
+      }
     }
   }
   return Status::OK();
@@ -384,7 +414,7 @@ Status GraphConstructor::ValidateInputMapAndControlDependencies() {
 }
 
 Status GraphConstructor::BuildNodeIndex() {
-  // Validate the node names and add them to gdef_nodes_.
+  // Validate the node names and add them to gdef_nodes_ and gdef_prefixes_.
   for (int n = 0; n < node_defs_.size(); ++n) {
     const NodeDef& node_def = *node_defs_[n];
     if (!IsValidNodeName(node_def.name(), opts_.allow_internal_ops)) {
@@ -419,6 +449,8 @@ Status GraphConstructor::BuildNodeIndex() {
             "': Control dependencies must come after regular dependencies");
       }
     }
+    // Update gdef_prefixes_.
+    AddPrefixes(node_def.name(), &gdef_prefixes_);
   }
   return Status::OK();
 }
@@ -720,8 +752,8 @@ void GraphConstructor::AddControlDependencies(
 
 void GraphConstructor::AddPrefixToNodeDef(
     const std::vector<bool>& input_already_exists, NodeDef* node_def) {
-  if (opts_.prefix.empty()) return;
-  node_def->set_name(strings::StrCat(opts_.prefix, node_def->name()));
+  if (prefix_.empty()) return;
+  node_def->set_name(strings::StrCat(prefix_, node_def->name()));
   // Update names of input nodes
   for (int i = 0; i < node_def->input_size(); ++i) {
     StringPiece input(node_def->input(i));
@@ -729,9 +761,9 @@ void GraphConstructor::AddPrefixToNodeDef(
     // imported).
     if (input_already_exists[i]) continue;
     if (input.Consume("^")) {
-      node_def->set_input(i, strings::StrCat("^", opts_.prefix, input));
+      node_def->set_input(i, strings::StrCat("^", prefix_, input));
     } else {
-      node_def->set_input(i, strings::StrCat(opts_.prefix, input));
+      node_def->set_input(i, strings::StrCat(prefix_, input));
     }
   }
   // Update names of colocation groups
@@ -741,8 +773,7 @@ void GraphConstructor::AddPrefixToNodeDef(
     for (int i = 0; i < list->s_size(); ++i) {
       StringPiece v(list->s(i));
       if (v.Consume(kColocationGroupPrefix)) {
-        list->set_s(i,
-                    strings::StrCat(kColocationGroupPrefix, opts_.prefix, v));
+        list->set_s(i, strings::StrCat(kColocationGroupPrefix, prefix_, v));
       }
     }
   }
@@ -750,10 +781,13 @@ void GraphConstructor::AddPrefixToNodeDef(
 
 void GraphConstructor::UniquifyNames(
     const std::vector<bool>& input_already_exists, NodeDef* node_def) {
-  if (NameExists(node_def->name())) {
+  if (NameExistsInGraph(node_def->name())) {
     string old_name = node_def->name();
     node_def->set_name(FindUniqueName(node_def->name()));
     uniquified_names_[old_name] = node_def->name();
+    // Note that we don't have to update gdef_nodes_ or gdef_prefixes_ with
+    // `name` because we guarantee the original NodeDef names are unique,
+    // meaning we won't generate this name again.
   }
   for (int i = 0; i < node_def->input_size(); ++i) {
     // Skip remapped inputs (which already exist in g_ and are not being
@@ -768,31 +802,52 @@ void GraphConstructor::UniquifyNames(
     id.first = iter->second;
     node_def->set_input(i, id.ToString());
   }
-  // Update names of colocation groups
-  if (node_def->attr().find(kColocationAttrName) != node_def->attr().end()) {
-    auto* list =
-        node_def->mutable_attr()->at(kColocationAttrName).mutable_list();
-    for (int i = 0; i < list->s_size(); ++i) {
-      StringPiece v(list->s(i));
-      if (v.Consume(kColocationGroupPrefix)) {
-        auto iter = uniquified_names_.find(v.ToString());
-        if (iter == uniquified_names_.end()) continue;
-        list->set_s(i, strings::StrCat(kColocationGroupPrefix, iter->second));
+}
+
+void GraphConstructor::UpdateUniquifiedColocationNames() {
+  for (const auto& pair : gdef_nodes_) {
+    Node* node = pair.second.node;
+    if (node == nullptr) continue;
+    std::vector<string> coloc_values;
+    Status status =
+        GetNodeAttr(node->attrs(), kColocationAttrName, &coloc_values);
+    if (!status.ok()) continue;
+    bool updated = false;
+    for (int i = 0; i < coloc_values.size(); ++i) {
+      StringPiece val(coloc_values[i]);
+      if (val.Consume(kColocationGroupPrefix)) {
+        const auto& name_pair = uniquified_names_.find(val.ToString());
+        if (name_pair == uniquified_names_.end()) continue;
+        updated = true;
+        coloc_values[i] =
+            strings::StrCat(kColocationGroupPrefix, name_pair->second);
       }
     }
+    if (updated) {
+      node->AddAttr(kColocationAttrName, coloc_values);
+    }
   }
 }
 
-bool GraphConstructor::NameExists(StringPiece name) {
+bool GraphConstructor::NameExistsInGraph(StringPiece name) {
   if (existing_nodes_.find(name) != existing_nodes_.end()) return true;
-  return existing_prefixes_.find(name) != existing_prefixes_.end();
+  if (existing_prefixes_.find(name) != existing_prefixes_.end()) return true;
+  return false;
+}
+
+bool GraphConstructor::NameExistsInGraphDef(StringPiece name) {
+  if (gdef_nodes_.find(name) != gdef_nodes_.end()) return true;
+  if (gdef_prefixes_.find(name) != gdef_prefixes_.end()) return true;
+  return false;
 }
 
 string GraphConstructor::FindUniqueName(StringPiece original_name) {
   string name = original_name.ToString();
-  int count = 1;
-  while (NameExists(name)) {
-    name = strings::StrCat(original_name, "_", count++);
+  int count = 0;
+  // Check that any generated names don't collide with imported NodeDefs (as
+  // well as nodes in g_).
+  while (NameExistsInGraph(name) || (count > 0 && NameExistsInGraphDef(name))) {
+    name = strings::StrCat(original_name, "_", ++count);
   }
   return name;
 }
@@ -931,7 +986,7 @@ Status GraphConstructor::Convert() {
 
     Node* node;
     if (opts_.importing) {
-      if (!opts_.prefix.empty()) {
+      if (!prefix_.empty()) {
         AddPrefixToNodeDef(input_already_exists, &imported_node_def);
       } else if (opts_.uniquify_names) {
         UniquifyNames(input_already_exists, &imported_node_def);
@@ -972,15 +1027,6 @@ Status GraphConstructor::Convert() {
                                    " nodes in a cycle");
   }
 
-  // Update unused_input_map_keys_
-  if (unused_input_map_keys_ != nullptr) {
-    for (const auto& pair : opts_.input_map) {
-      if (used_input_map_keys_.find(pair.first) == used_input_map_keys_.end()) {
-        unused_input_map_keys_->push_back(pair.first);
-      }
-    }
-  }
-
   return Status::OK();
 }
 
@@ -1070,6 +1116,33 @@ Status GraphConstructor::PopulateReturnNodes() {
   return Status::OK();
 }
 
+Status GraphConstructor::PopulateMissingUnusedInputMapKeys() {
+  if (missing_unused_input_map_keys_ == nullptr) return Status::OK();
+  for (const auto& input_map_pair : opts_.input_map) {
+    TensorId key = input_map_pair.first;
+    if (used_input_map_keys_.count(key) > 0) continue;
+
+    auto pair = gdef_nodes_.find(key.first);
+    if (pair == gdef_nodes_.end()) {
+      // key's node doesn't exist in GraphDef
+      missing_unused_input_map_keys_->push_back(key);
+      continue;
+    }
+
+    // Check that key's index is in bounds. Get the number of outputs from the
+    // NodeDef, rather than the imported Node, since the Node may not exist if
+    // opts_.skip_mapped_nodes is true.
+    const NodeDef* node_def = node_defs_[pair->second.gdef_index];
+    const OpDef* op_def;
+    TF_RETURN_IF_ERROR(g_->op_registry()->LookUpOpDef(node_def->op(), &op_def));
+    if (key.second >= op_def->output_arg_size()) {
+      // key's index out of bounds
+      missing_unused_input_map_keys_->push_back(key);
+    }
+  }
+  return Status::OK();
+}
+
 void GraphConstructor::Undo() {
   for (const auto& iter : gdef_nodes_) {
     if (iter.second.node != nullptr) {
@@ -1101,7 +1174,7 @@ Status ConvertGraphDefToGraph(const GraphConstructorOptions& opts,
   return GraphConstructor::Construct(
       opts, gdef.node(), &gdef.versions(), &gdef.library(), g, &refiner,
       /*return_tensors=*/nullptr, /*return_nodes=*/nullptr,
-      /*unused_input_map_keys=*/nullptr);
+      /*missing_unused_input_map_keys=*/nullptr);
 }
 
 Status ConvertNodeDefsToGraph(const GraphConstructorOptions& opts,
@@ -1115,7 +1188,7 @@ Status ConvertNodeDefsToGraph(const GraphConstructorOptions& opts,
   return GraphConstructor::Construct(opts, node_defs, nullptr, nullptr, g,
                                      &refiner, /*return_tensors=*/nullptr,
                                      /*return_nodes=*/nullptr,
-                                     /*unused_input_map_keys=*/nullptr);
+                                     /*missing_unused_input_map_keys=*/nullptr);
 }
 
 Status ImportGraphDef(const ImportGraphDefOptions& opts, const GraphDef& gdef,
@@ -1144,7 +1217,7 @@ Status ImportGraphDef(const ImportGraphDefOptions& opts, const GraphDef& gdef,
 
   if (results != nullptr) {
     if (!results->return_tensors.empty() || !results->return_nodes.empty() ||
-        !results->unused_input_map_keys.empty()) {
+        !results->missing_unused_input_map_keys.empty()) {
       return errors::InvalidArgument(
           "All fields in results argument to ImportGraphDef() must be empty.");
     }
@@ -1187,7 +1260,7 @@ Status ImportGraphDef(const ImportGraphDefOptions& opts, const GraphDef& gdef,
     return GraphConstructor::Construct(
         opts, gdef.node(), &gdef.versions(), &gdef.library(), g, refiner,
         &results->return_tensors, &results->return_nodes,
-        &results->unused_input_map_keys);
+        &results->missing_unused_input_map_keys);
   }
 }
 
diff --git a/tensorflow/core/graph/graph_constructor.h b/tensorflow/core/graph/graph_constructor.h
index 4b418b862290d23f6838f6a1f43345adee467884..07814b2ef743b27b17e98bba55ea1e6417642c06 100644
--- a/tensorflow/core/graph/graph_constructor.h
+++ b/tensorflow/core/graph/graph_constructor.h
@@ -54,7 +54,10 @@ extern Status ConvertNodeDefsToGraph(const GraphConstructorOptions& opts,
 
 // Options for calling ImportGraphDef().
 struct ImportGraphDefOptions {
-  ImportGraphDefOptions() : uniquify_names(false), skip_mapped_nodes(false) {}
+  ImportGraphDefOptions()
+      : uniquify_names(false),
+        uniquify_prefix(false),
+        skip_mapped_nodes(false) {}
 
   // Name prefix to use for nodes imported from the GraphDef.  For example, if
   // prefix="animals" and GraphDef contains a node "bunny" then the node will be
@@ -68,6 +71,11 @@ struct ImportGraphDefOptions {
   // will guarantee all node names are unique.
   bool uniquify_names;
 
+  // If true, `prefix` will be modified if it already exists as a node name or
+  // prefix in the graph. If false, a conflicting prefix will be treated as an
+  // error. This option has no effect if `prefix` isn't specified.
+  bool uniquify_prefix;
+
   // Maps tensors in `gdef` to existing tensors in `g`. Inputs in `gdef`
   // corresponding to `input_map` keys will be remapped to the nodes in `g`
   // corresponding to the values.
@@ -140,9 +148,10 @@ struct ImportGraphDefResults {
   // The requested nodes associated with ImportGraphDefOptions::return_nodes.
   std::vector<Node*> return_nodes;
 
-  // Keys in ImportGraphDefOptions::input_map that weren't used as an input to
-  // any node in`gdef`.
-  std::vector<TensorId> unused_input_map_keys;
+  // Keys in ImportGraphDefOptions::input_map that don't appear in `gdef` and
+  // weren't used as an input to any node in `gdef`. These keys are likely due
+  // to typos, and callers may wish to treat their existence as an error.
+  std::vector<TensorId> missing_unused_input_map_keys;
 };
 
 // Adds the graph in GraphDef `gdef` into an existing Graph `*g`.
diff --git a/tensorflow/core/graph/graph_constructor_test.cc b/tensorflow/core/graph/graph_constructor_test.cc
index 0f88c80b85a4b05c21f76713a3406c72354cba0c..01bb1ac748fd512dcd1d715d949de8eb6e77142d 100644
--- a/tensorflow/core/graph/graph_constructor_test.cc
+++ b/tensorflow/core/graph/graph_constructor_test.cc
@@ -1433,7 +1433,7 @@ TEST_F(GraphConstructorTest, ImportGraphDef_InputMapDuplicateNodeNames) {
       &refiner);
 }
 
-TEST_F(GraphConstructorTest, ImportGraphDef_InputMapUnusedKeys) {
+TEST_F(GraphConstructorTest, ImportGraphDef_InputMapMissingUnusedKeys) {
   ShapeRefiner refiner(TF_GRAPH_DEF_VERSION, graph_.op_registry());
 
   // No input map
@@ -1443,10 +1443,10 @@ TEST_F(GraphConstructorTest, ImportGraphDef_InputMapUnusedKeys) {
       "node { name: 'W1' op: 'TestParams' }"
       "node { name: 'input' op: 'TestInput' }",
       opts, &refiner, &results);
-  EXPECT_TRUE(results.unused_input_map_keys.empty());
+  EXPECT_TRUE(results.missing_unused_input_map_keys.empty());
 
-  // Non-empty unused_input_map_keys
-  results.unused_input_map_keys.push_back(TensorId());
+  // Non-empty missing_unused_input_map_keys
+  results.missing_unused_input_map_keys.push_back(TensorId());
   ExpectError(
       "node { name: 'W2' op: 'TestParams' }", opts,
       {"All fields in results argument to ImportGraphDef() must be empty."},
@@ -1454,13 +1454,16 @@ TEST_F(GraphConstructorTest, ImportGraphDef_InputMapUnusedKeys) {
 
   // Input map with some used, some unused keys
   const int kControlSlot = Graph::kControlSlot;
-  results.unused_input_map_keys.clear();
+  results.missing_unused_input_map_keys.clear();
   opts.input_map[TensorId("W2", kControlSlot)] = TensorId("W1", kControlSlot);
   opts.input_map[TensorId("new_input", 0)] = TensorId("input", 0);
   opts.input_map[TensorId("new_input", 1)] = TensorId("input", 0);
-  opts.input_map[TensorId("new_input", kControlSlot)] =
-      TensorId("input", kControlSlot);
-  opts.input_map[TensorId("t1", 1)] = TensorId("input", 0);
+  // Unused and missing (nonexistent index)
+  opts.input_map[TensorId("new_input", 3)] = TensorId("input", 0);
+  // Unused and missing (nonexistent node)
+  opts.input_map[TensorId("DNE", 0)] = TensorId("input", 0);
+  // Unused but not missing
+  opts.input_map[TensorId("t1", 0)] = TensorId("W1", 0);
   ExpectOK(
       R"EOF(
       node { name: 'W2' op: 'TestParams' }
@@ -1470,9 +1473,36 @@ TEST_F(GraphConstructorTest, ImportGraphDef_InputMapUnusedKeys) {
       )EOF",
       opts, &refiner, &results);
 
-  std::vector<TensorId> expected_unused_keys = {
-      TensorId("new_input", kControlSlot), TensorId("t1", 1)};
-  EXPECT_EQ(results.unused_input_map_keys, expected_unused_keys);
+  std::set<TensorId> expected_unused_keys = {TensorId("new_input", 3),
+                                             TensorId("DNE", 0)};
+  ASSERT_EQ(results.missing_unused_input_map_keys.size(),
+            expected_unused_keys.size());
+
+  std::set<TensorId> actual_unused_keys(
+      results.missing_unused_input_map_keys.begin(),
+      results.missing_unused_input_map_keys.end());
+  EXPECT_EQ(actual_unused_keys, expected_unused_keys);
+
+  // Test edge case: node isn't imported due to skip_mapped_nodes, but we still
+  // have a bad input_map key involving it.
+  opts = ImportGraphDefOptions();
+  opts.input_map[TensorId("new_input", 0)] = TensorId("input", 0);
+  opts.input_map[TensorId("new_input", 1)] = TensorId("input", 1);
+  // Index out of bounds
+  opts.input_map[TensorId("new_input", 2)] = TensorId("input", 1);
+  opts.skip_mapped_nodes = true;
+  opts.prefix = "import";
+  results = ImportGraphDefResults();
+  ExpectOK(
+      R"EOF(
+      node { name: 'W2' op: 'TestParams' }
+      node { name: 'new_input' op: 'TestInput' input: [ '^W2' ] }
+      node { name: 't1' op: 'TestMul' input: [ 'new_input:0', 'new_input:1' ] }
+      )EOF",
+      opts, &refiner, &results);
+
+  ASSERT_EQ(results.missing_unused_input_map_keys.size(), 1);
+  EXPECT_EQ(results.missing_unused_input_map_keys[0], TensorId("new_input", 2));
 }
 
 TEST_F(GraphConstructorTest, ImportGraphDef_InputMapWithUnboundInput) {
@@ -1709,7 +1739,7 @@ TEST_F(GraphConstructorTest, ImportGraphDef_ReturnNodes) {
   // Check return tensors
   ASSERT_EQ(results.return_nodes.size(), 2);
   EXPECT_EQ(results.return_tensors.size(), 0);
-  EXPECT_EQ(results.unused_input_map_keys.size(), 0);
+  EXPECT_EQ(results.missing_unused_input_map_keys.size(), 0);
   EXPECT_EQ(results.return_nodes[0]->name(), "input");
   EXPECT_EQ(results.return_nodes[1]->name(), "t1");
 
@@ -1806,6 +1836,21 @@ TEST_F(GraphConstructorTest, ImportGraphDef_UniquifyNames) {
   EXPECT_EQ(results.return_nodes[1]->name(), "B_2");
   EXPECT_EQ(results.return_nodes[1]->def().input(0), "A_2:0");
 
+  // Import with an already-used prefix
+  opts.prefix = "A";
+  opts.uniquify_prefix = true;
+  results = ImportGraphDefResults();
+  ExpectOK(graph_def_str, opts, &refiner, &results);
+
+  ASSERT_EQ(results.return_nodes.size(), 2);
+  EXPECT_EQ(results.return_nodes[0]->name(), "A_3/A");
+  EXPECT_EQ(results.return_nodes[1]->name(), "A_3/B");
+  EXPECT_EQ(results.return_nodes[1]->def().input(0), "A_3/A");
+
+  // Create B_3 node to keep the A/B numbering in sync
+  opts = ImportGraphDefOptions();
+  ExpectOK("node { name: 'B_3' op: 'TestInput' }");
+
   // Import with existing de-duped node names
   opts = ImportGraphDefOptions();
   opts.uniquify_names = true;
@@ -1822,6 +1867,30 @@ TEST_F(GraphConstructorTest, ImportGraphDef_UniquifyNames) {
   EXPECT_EQ(results.return_nodes[1]->name(), "B_1_1");
   EXPECT_EQ(results.return_nodes[1]->def().input(0), "A_1_1:0");
 
+  // Import with node names that must be de-duped from names and prefixes that
+  // exist in both the existing graph and the GraphDef being imported.
+  opts = ImportGraphDefOptions();
+  opts.uniquify_names = true;
+  opts.return_nodes.push_back("A");
+  opts.return_nodes.push_back("A_4");
+  opts.return_nodes.push_back("B");
+  opts.return_nodes.push_back("B_4/B");
+  results = ImportGraphDefResults();
+  ExpectOK(
+      "node { name: 'A' op: 'TestInput' }"
+      "node { name: 'A_4' op: 'TestInput' }"
+      "node { name: 'B' op: 'TestOneInputTwoOutputs' input: ['A'] }"
+      "node { name: 'B_4/B' op: 'TestOneInputTwoOutputs' input: ['A_4'] }",
+      opts, &refiner, &results);
+
+  ASSERT_EQ(results.return_nodes.size(), 4);
+  EXPECT_EQ(results.return_nodes[0]->name(), "A_5");
+  EXPECT_EQ(results.return_nodes[1]->name(), "A_4");
+  EXPECT_EQ(results.return_nodes[2]->name(), "B_5");
+  EXPECT_EQ(results.return_nodes[2]->def().input(0), "A_5:0");
+  EXPECT_EQ(results.return_nodes[3]->name(), "B_4/B");
+  EXPECT_EQ(results.return_nodes[3]->def().input(0), "A_4");
+
   // Create node with prefix and then import node with same name
   ExpectOK("node { name: 'foo/abc' op: 'ABC' }");
   opts = ImportGraphDefOptions();
@@ -1871,16 +1940,25 @@ TEST_F(GraphConstructorTest, ImportGraphDef_UniquifyNames) {
   ExpectOK(graph_def_str, opts, &refiner, &results);
 
   ASSERT_EQ(results.return_nodes.size(), 2);
-  EXPECT_EQ(results.return_nodes[0]->name(), "A_3");
-  EXPECT_EQ(results.return_nodes[1]->name(), "B_3");
+  EXPECT_EQ(results.return_nodes[0]->name(), "A_6");
+  EXPECT_EQ(results.return_nodes[1]->name(), "B_6");
   EXPECT_EQ(results.return_nodes[1]->def().input(0), "A:0");
+}
+
+TEST_F(GraphConstructorTest, ImportGraphDef_UniquifyNames_ColocationGroups) {
+  ShapeRefiner refiner(TF_GRAPH_DEF_VERSION, graph_.op_registry());
+
+  // Create nodes 'A' and 'b"
+  ExpectOK(
+      "node { name: 'A' op: 'TestInput' }"
+      "node { name: 'B' op: 'TestOneInputTwoOutputs' input: ['A'] }");
 
   // Check that colocation groups are updated
-  opts = ImportGraphDefOptions();
+  ImportGraphDefOptions opts;
   opts.uniquify_names = true;
   opts.return_nodes.push_back("A");
   opts.return_nodes.push_back("B");
-  results = ImportGraphDefResults();
+  ImportGraphDefResults results;
   ExpectOK(
       "node { name: 'A' op: 'TestInput' }"
       "node { name: 'B' op: 'TestOneInputTwoOutputs' input: ['A:0'] "
@@ -1888,14 +1966,48 @@ TEST_F(GraphConstructorTest, ImportGraphDef_UniquifyNames) {
       opts, &refiner, &results);
 
   ASSERT_EQ(results.return_nodes.size(), 2);
-  EXPECT_EQ(results.return_nodes[0]->name(), "A_4");
-  EXPECT_EQ(results.return_nodes[1]->name(), "B_4");
-  EXPECT_EQ(results.return_nodes[1]->def().input(0), "A_4:0");
+  EXPECT_EQ(results.return_nodes[0]->name(), "A_1");
+  EXPECT_EQ(results.return_nodes[1]->name(), "B_1");
   const AttrValue* class_attr =
       results.return_nodes[1]->attrs().Find(kColocationAttrName);
   ASSERT_TRUE(class_attr != nullptr);
   ASSERT_EQ(class_attr->list().s_size(), 1);
-  EXPECT_EQ(class_attr->list().s(0), "loc:@A_4");
+  EXPECT_EQ(class_attr->list().s(0), "loc:@A_1");
+
+  results = ImportGraphDefResults();
+  ExpectOK(
+      "node { name: 'A' op: 'TestInput' "
+      "       attr { key: '_class' value { list { s:'loc:@B' } } } }"
+      "node { name: 'B' op: 'TestOneInputTwoOutputs' input: ['A:0'] }",
+      opts, &refiner, &results);
+
+  ASSERT_EQ(results.return_nodes.size(), 2);
+  EXPECT_EQ(results.return_nodes[0]->name(), "A_2");
+  EXPECT_EQ(results.return_nodes[1]->name(), "B_2");
+  class_attr = results.return_nodes[0]->attrs().Find(kColocationAttrName);
+  ASSERT_TRUE(class_attr != nullptr);
+  ASSERT_EQ(class_attr->list().s_size(), 1);
+  EXPECT_EQ(class_attr->list().s(0), "loc:@B_2");
+
+  results = ImportGraphDefResults();
+  ExpectOK(
+      "node { name: 'A' op: 'TestInput' "
+      "       attr { key: '_class' value { list { s:'loc:@B' } } } }"
+      "node { name: 'B' op: 'TestOneInputTwoOutputs' input: ['A:0'] "
+      "       attr { key: '_class' value { list { s:'loc:@B' } } } }",
+      opts, &refiner, &results);
+
+  ASSERT_EQ(results.return_nodes.size(), 2);
+  EXPECT_EQ(results.return_nodes[0]->name(), "A_3");
+  EXPECT_EQ(results.return_nodes[1]->name(), "B_3");
+  class_attr = results.return_nodes[0]->attrs().Find(kColocationAttrName);
+  ASSERT_TRUE(class_attr != nullptr);
+  ASSERT_EQ(class_attr->list().s_size(), 1);
+  EXPECT_EQ(class_attr->list().s(0), "loc:@B_3");
+  class_attr = results.return_nodes[1]->attrs().Find(kColocationAttrName);
+  ASSERT_TRUE(class_attr != nullptr);
+  ASSERT_EQ(class_attr->list().s_size(), 1);
+  EXPECT_EQ(class_attr->list().s(0), "loc:@B_3");
 }
 
 TEST_F(GraphConstructorTest, ImportGraphDef_WithCycle) {
diff --git a/tensorflow/core/graph/graph_def_builder.h b/tensorflow/core/graph/graph_def_builder.h
index b389cd80531e7458089393a69d32b81d4fb577ce..a2c0c4d553e7229ae7e0f116691d8f717fe77f87 100644
--- a/tensorflow/core/graph/graph_def_builder.h
+++ b/tensorflow/core/graph/graph_def_builder.h
@@ -99,6 +99,10 @@ class GraphDefBuilder {
     // Use this to skip processing that may depend on prior results.
     bool HaveError() const { return status_ != nullptr && !status_->ok(); }
 
+    // Returns a string representation of the status associated with *this.
+    // Returns the string `"OK"` if the status doesn't have any error.
+    string StatusToString() const { return status_->ToString(); }
+
     // Given the Op type name, return a name for a node of that type.
     // Uses the value set in WithName() if that has been called.  Otherwise,
     // returns a name built out of the Op type name.
diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc
index 1924c05d3dd3944d0fa14d53c9ddb2ab14be751d..add80eda23d7887fb06902c0b123c03db8f4cccf 100644
--- a/tensorflow/core/graph/graph_partition.cc
+++ b/tensorflow/core/graph/graph_partition.cc
@@ -1152,7 +1152,7 @@ Status Partition(const PartitionOptions& opts, Graph* g,
     // Add control edges from 'ref_control_inputs' to 'ref_recvs'.
     // NOTE(yuanbyu): Adding these control edges should not introduce
     // deadlocks. 'dst' has implicit "read" nodes that, when we split
-    // across devices, are made explicit; Retargettig the dependencies
+    // across devices, are made explicit; Retargeting the dependencies
     // to 'dst' to those nodes would not introduce cycles if there isn't
     // one before the transformation.
     // NOTE(yuanbyu): This may impact performance because it defers the
diff --git a/tensorflow/core/graph/graph_test.cc b/tensorflow/core/graph/graph_test.cc
index 2aa1b31e155c709abd60067291b66fb9b27c4be7..e2ce0ba046f26b69bdb8f427afeb480727977844 100644
--- a/tensorflow/core/graph/graph_test.cc
+++ b/tensorflow/core/graph/graph_test.cc
@@ -118,11 +118,9 @@ class GraphTest : public ::testing::Test {
     LOG(FATAL) << name;
   }
 
-  bool ControlEdgeExistsInGraphOrNodeDef(const Node* src,
-                                         const Node* dst) {
-    for (const Edge *e : dst->in_edges()) {
-      if (e->IsControlEdge() &&
-          e->src() == src &&
+  bool ControlEdgeExistsInGraphOrNodeDef(const Node* src, const Node* dst) {
+    for (const Edge* e : dst->in_edges()) {
+      if (e->IsControlEdge() && e->src() == src &&
           e->src_output() == Graph::kControlSlot &&
           e->dst_input() == Graph::kControlSlot) {
         return true;
diff --git a/tensorflow/core/graph/mkl_graph_util.h b/tensorflow/core/graph/mkl_graph_util.h
index 880e4e712ef0a0d9378afefd91acd125351992f7..3df981437afed760744ef870fd542d7abdd6e25d 100644
--- a/tensorflow/core/graph/mkl_graph_util.h
+++ b/tensorflow/core/graph/mkl_graph_util.h
@@ -76,12 +76,12 @@ namespace tensorflow {
 namespace mkl_op_registry {
   static const char* kMklOpLabel = "MklOp";
   static const char* kMklOpLabelPattern = "label='MklOp'";
+  // Prefix that we add to Tensorflow op name to construct Mkl op name.
+  static const char* const kMklOpPrefix = "_Mkl";
 
   // Get the name of Mkl op from original TensorFlow op
   // We prefix 'Mkl' to the original op to get Mkl op.
   inline string GetMklOpName(const string& name) {
-    // Prefix that we add to Tensorflow op name to construct Mkl op name.
-    const char* const kMklOpPrefix = "_Mkl";
     return string(kMklOpPrefix) + name;
   }
 
@@ -94,9 +94,6 @@ namespace mkl_op_registry {
     string kernel = KernelsRegisteredForOp(op_name);
     bool result =
         kernel.find(kMklOpLabelPattern) != string::npos && (T == DT_FLOAT);
-    if (result) {
-      VLOG(1) << "mkl_op_registry::" << op_name << " is " << kMklOpLabel;
-    }
     return result;
   }
 
@@ -112,15 +109,12 @@ namespace mkl_op_registry {
     if (!IsMklOp(op_name, T)) {
       return false;
     }
-
     bool result = (0 == op_name.compare(GetMklOpName("Add")) ||
                     0 == op_name.compare(GetMklOpName("Sub")) ||
                     0 == op_name.compare(GetMklOpName("Mul")) ||
                     0 == op_name.compare(GetMklOpName("Maximum")) ||
                     0 == op_name.compare(GetMklOpName("SquaredDifference")));
 
-    VLOG(1) << "mkl_op_registry::" << op_name
-            << " is elementwise MKL op: " << result;
     return result;
   }
 }  // namespace mkl_op_registry
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 912075aa286042319a93bf60495f52af3f940ec8..3beca1e5d2922424972baf564e6b4601a9b3ee5b 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -42,6 +42,8 @@ limitations under the License.
 
 namespace tensorflow {
 
+#ifndef INTEL_MKL_DNN
+
 // This pass implements rewriting of graph to support following scenarios:
 // (A) Merging nodes in the graph
 // (B) Rewriting a node in the graph to a new node
@@ -2213,6 +2215,2087 @@ Status MklLayoutRewritePass::Run(
   return Status::OK();
 }
 
+#else  // INTEL_MKL_DNN
+
+// This pass implements rewriting of graph to support following scenarios:
+// (A) Merging nodes in the graph
+// (B) Rewriting a node in the graph to a new node
+//     Rewrite happens under following scenario:
+//     - Propagating Mkl layout as an additional output tensor
+//        (we will loosely call a tensor that carries Mkl layout as Mkl tensor
+//         henceforth.) from every Mkl supported NN layer.
+//
+// Example of A : Merging nodes in the graph
+// -----------------------------------------
+// Currently, we merge Conv2D+AddBias together. Consider Conv2D and BiasAdd as:
+//
+//           O = Conv2D(A, B)
+//           P = BiasAdd(O, C)
+//
+// We merge them into Conv2DWithBias as:
+//           P = _MklConv2DWithBias(A, A_m, B, B_m, C, C_m)
+//
+// The meaning of A_m, B_m and C_m is explained in B.1.
+//
+// Merge rules:
+//  - The merge for Conv2D and BiasAdd happens when the output of Conv2D _only_
+//    goes to BiasAdd.
+//  - Also, the intersection of attributes of both the nodes must have same
+//    values.
+//  - Both the nodes must have been assigned to same device (if any).
+//
+// Example of B.1 : Rewriting nodes to Mkl nodes
+// ---------------------------------------------
+// Consider a Relu node. Current definition of Relu node looks like:
+//
+//           O = Relu(A)
+//
+// Relu has 1 input (A), and 1 output (O).
+//
+// This rewrite pass will generate a new graph node for Relu (new node is
+// called MklRelu) as:
+//
+//          O, O_m = MklRelu(A, A_m)
+//
+// MklRelu has 2 inputs (A and A_m) and 2 outputs (O and O_m). Here input A is
+// same as input A of Relu; output O is same as output O of Relu. O_m is the
+// additional output tensor that will be set by MklRelu, and it represents
+// Mkl tensor corresponding to O -- in other words, O_m is some kind of
+// metadata for O. A_m is additional input of Relu, and it represents metadata
+// for A - as O_m is metadata for O, A_m is metadata for A. MklRelu receives
+// this metadata from previous node in the graph.
+//
+// When a previous node in the graph is an Mkl node, A_m will represent a valid
+// Mkl tensor. But when a previous node is not an Mkl node, A_m will represent
+// a dummy Mkl tensor.
+//
+// Rewriting rules:
+//  - Selection of a node for rewriting happens by registering the op type of
+//    the node with the rewriting pass. If the op type is not registered, then
+//    all nodes of this op type will not be rewritten.
+//  - Number of inputs after rewriting:
+//      Since for every input Tensorflow tensor, the rewritten node gets Mkl
+//      tensor(s), rewritten node gets 2*N inputs, where N is the number of
+//      inputs for the original node.
+//  - Number of outputs after rewriting:
+//      Since for every output Tensorflow tensor, the rewritten node generates
+//      Mkl tensor(s), the rewritten node generates 2*N outputs, where N is the
+//      number of outputs of the original node.
+//  - Ordering of Tensorflow tensors and Mkl tensors:
+//      Since every rewritten node generates twice the number of inputs and
+//      outputs, one could imagine various orderings among Tensorflow tensors
+//      and Mkl tensors. E.g., assume an op 'Conv2D' that takes (A, B) as
+//      inputs, then the new op '_MklConv2D' can take inputs A, B, A_m and B_m
+//      in A, A_m, B, B_m order or it can also take them in A, B, A_m, B_m
+//      order. Among N inputs one can get N! permutations.
+//
+//      So the question is: which order do we follow? We support 2 types of
+//      orderings: (1) interleaved, and (2) contiguous. Interleaved ordering
+//      follows an intuitive order where an Mkl tensor follows the
+//      corresponding Tensorflow tensor immediately. In the context of the
+//      above example, it will be: A, A_m, B, B_m. Note that the ordering rule
+//      applies to both the inputs and outputs. Contiguous ordering means
+//      all the Tensorflow tensors are contiguous followed by all the Mkl
+//      tensors. We use contiguous ordering as default.
+//
+// Graph rewrite algorithm:
+//      Algorithm: Graph Rewrite
+//      Input: Graph G, Names of the nodes to rewrite and their new names
+//      Output: Modified Graph G' if the nodes are modified, G otherwise.
+//      Start:
+//        N = Topological_Sort(G) // N is a set of nodes in toposort order.
+//        foreach node n in N
+//        do
+//          if (Is_MKL_Op(n))  // Can this node accept an Mkl layout as input.
+//          then
+//            E = set of <incoming edge and its src_output slot> of n
+//            E' = {}   // a new set of edges for rewritten node
+//            foreach <e,s> in E
+//            do
+//              E' U {<e,s>}  // First copy edge which generates Tensorflow
+//                            // tensor as it is
+//              m = Source node of edge e
+//              if Is_Rewritten(m)  // Did we rewrite this node in this pass?
+//              then
+//                E' U {<m,s+1>}    // If yes, then m will generate an Mkl
+//                                  // tensor as an additional output.
+//              else
+//                d = Generate_Dummy_Mkl_Tensor()  // If not, generate a dummy
+//                                                 // Mkl tensor.
+//                E' U {<d,0>}  // The dummy Mkl tensor has only 1 output slot.
+//              fi
+//            done
+//            n' = Build_New_Node(G,new_name,E')
+//            Mark_Rewritten(n')  // Mark the new node as being rewritten.
+//          fi
+//        done
+//
+//      Explanation:
+//        For graph rewrite, we visit nodes of the input graph in the
+//        topological sort order. With this ordering, we visit nodes in the
+//        top-to-bottom fashion. We need this order because while visiting a
+//        node we want that all of its input nodes are visited and rewritten if
+//        applicable. This is because if we need to rewrite a given node
+//        then all of its input nodes need to be fixed (in other words they
+//        cannot be deleted later.)
+//
+//        While visiting a node, we first check if the op type of the node is
+//        an Mkl op. If it is, then we rewrite that node after constructing
+//        new inputs to the node. If the op type of the node is not Mkl op,
+//        then we do not rewrite that node.
+//
+// Handling workspace propagation for certain ops:
+//
+//        Certain backward ops in MKL (MaxPool, LRN and BatchNorm) require
+//        passing of a workspace from their respective forward ops. Workspace
+//        tensors provide memory for storing results of intermediate operations
+//        which are helpful in backward propagation. TensorFlow does not have
+//        a notion of a workspace and as a result does not allow producing
+//        additional outputs from these forward ops. For these ops, we need
+//        to add 2 extra edges between forward ops and their corresponding
+//        backward ops - the first extra edge carries a workspace tensor and
+//        the second one carries an Mkl tensor for the workspace tensor.
+//
+//        Example:
+//
+//        Typical graph for MaxPool and its gradient looks like:
+//
+//        A = MaxPool(T)
+//        B = MaxPoolGrad(X, A, Y)
+//
+//        We will transform this graph to propagate the workspace as:
+//        (with the contiguous ordering)
+//
+//        A, W, A_m, W_m = MklMaxPool(T, T_m)
+//        B, B_m = MklMaxPoolGrad(X, A, Y, W, X_m, A_m, Y_m, W_m)
+//
+//        Here W is the workspace tensor. Transformed tensor names with the
+//        suffix _m are Mkl tensors, and this transformation has been done
+//        using the algorithm discussed earlier. The transformation for
+//        workspace propagation only adds extra outputs (W, W_m) for a forward
+//        op and connects them to the corresponding backward ops.
+//
+//        Terms:
+//
+//        Forward op name = name of the op in the forward pass
+//          where a workspace tensor originates (MaxPool in this example)
+//        Backward op name = name of the op in the backward pass that receives
+//          a workspace tensor from the forward op (MaxPoolGrad in the example)
+//        Slot = Position of the output or input slot that will be
+//               used by the workspace tensor (1 for MklMaxPool as W is the 2nd
+//               output of MaxPool (0 is 1st); 3 for MklMaxPoolGrad)
+//
+//        Question:
+//
+//        How do we associate a backward op to a forward op? There can be more
+//        than one op with the exact same name.
+//
+//        In this example, we associate MaxPoolGrad with MaxPool. But there
+//        could be more than one MaxPool ops. To solve this problem, we look
+//        for _direct_ edge between a forward op and a backward op (tensor A is
+//        flowing along this edge in the example).
+//
+//        How do we transform forward and backward ops when there is no direct
+//        edge between them? In such a case, we generate dummy tensors for
+//        workspace tensors. For the example, transformation of MaxPool will
+//        be exactly same as it would be when there is a direct edge between
+//        the forward and the backward op --- it is just that MaxPool won't
+//        generate any workspace tensor. For MaxPoolGrad, the transformation
+//        will also be same, but instead of connecting W and W_m with the
+//        outputs of MaxPool, we will produce dummy tensors for them, and we
+//        will set workspace_enabled attribute to false.
+//
+class MklLayoutRewritePass : public GraphOptimizationPass {
+ public:
+  MklLayoutRewritePass() {
+    // NOTE: names are alphabetically sorted.
+    csinfo_.addn = "AddN";
+    csinfo_.avg_pool = "AvgPool";
+    csinfo_.avg_pool_grad = "AvgPoolGrad";
+    csinfo_.bias_add = "BiasAdd";
+    csinfo_.bias_add_grad = "BiasAddGrad";
+    csinfo_.concat = "Concat";
+    csinfo_.concatv2 = "ConcatV2";
+    csinfo_.conv2d = "Conv2D";
+    csinfo_.conv2d_with_bias = "__MklDummyConv2DWithBias";
+    csinfo_.conv2d_grad_input = "Conv2DBackpropInput";
+    csinfo_.conv2d_grad_filter = "Conv2DBackpropFilter";
+    csinfo_.conv2d_grad_filter_with_bias =
+                              "__MklDummyConv2DBackpropFilterWithBias";
+    csinfo_.fused_batch_norm = "FusedBatchNorm";
+    csinfo_.fused_batch_norm_grad = "FusedBatchNormGrad";
+    csinfo_.identity = "Identity";
+    csinfo_.lrn = "LRN";
+    csinfo_.lrn_grad = "LRNGrad";
+    csinfo_.matmul = "MatMul";
+    csinfo_.max_pool = "MaxPool";
+    csinfo_.max_pool_grad = "MaxPoolGrad";
+    csinfo_.mkl_conv2d = "_MklConv2D";
+    csinfo_.mkl_conv2d_grad_input = "_MklConv2DBackpropInput";
+    csinfo_.mkl_conv2d_grad_filter = "_MklConv2DBackpropFilter";
+    csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias";
+    csinfo_.mkl_conv2d_grad_filter_with_bias =
+                                   "_MklConv2DBackpropFilterWithBias";
+    csinfo_.relu = "Relu";
+    csinfo_.relu_grad = "ReluGrad";
+    csinfo_.tanh       = "Tanh";
+    csinfo_.tanh_grad  = "TanhGrad";
+    csinfo_.reshape = "Reshape";
+    csinfo_.softmax = "Softmax";
+    csinfo_.split = "Split";
+    // Element-wise ops. Ensure you also add any new ops to IsOpElementWise
+    // in the MklUtil.h (IsMklElementWiseOp method) to ensure that the
+    // MklInputConversion op is added before it.
+    csinfo_.add = "Add";
+    csinfo_.maximum = "Maximum";
+    csinfo_.mul = "Mul";
+    csinfo_.squared_difference = "SquaredDifference";
+    csinfo_.sub = "Sub";
+    // End - element-wise ops. See note above.
+
+    // NOTE: names are alphabetically sorted.
+    rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn),
+                      CopyAttrsAddN, AddNRewrite});
+    rinfo_.push_back({csinfo_.add,
+                      mkl_op_registry::GetMklOpName(csinfo_.add),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.avg_pool,
+                      mkl_op_registry::GetMklOpName(csinfo_.avg_pool),
+                      CopyAttrsPooling, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.avg_pool_grad,
+                      mkl_op_registry::GetMklOpName(csinfo_.avg_pool_grad),
+                      CopyAttrsPooling, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.concat,
+                      mkl_op_registry::GetMklOpName(csinfo_.concat),
+                      CopyAttrsConcat, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.concatv2,
+                      mkl_op_registry::GetMklOpName(csinfo_.concatv2),
+                      CopyAttrsConcatV2, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.conv2d,
+                      mkl_op_registry::GetMklOpName(csinfo_.conv2d),
+                      CopyAttrsConv2D, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.conv2d_with_bias,
+                      csinfo_.mkl_conv2d_with_bias,
+                      CopyAttrsConv2D, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.conv2d_grad_filter,
+                      mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_filter),
+                      CopyAttrsConv2D, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.conv2d_grad_filter_with_bias,
+                      csinfo_.mkl_conv2d_grad_filter_with_bias,
+                      CopyAttrsConv2D, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.conv2d_grad_input,
+                      mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_input),
+                      CopyAttrsConv2D, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.fused_batch_norm,
+                      mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm),
+                      CopyAttrsFusedBatchNorm, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.fused_batch_norm_grad,
+                      mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad),
+                      CopyAttrsFusedBatchNorm, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.identity,
+                      mkl_op_registry::GetMklOpName(csinfo_.identity),
+                      CopyAttrsDataType, AlwaysRewrite});
+    /*
+    rinfo_.push_back({csinfo_.lrn,
+                      mkl_op_registry::GetMklOpName(csinfo_.lrn),
+                      CopyAttrsLRN, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.lrn_grad,
+                      mkl_op_registry::GetMklOpName(csinfo_.lrn_grad),
+                      CopyAttrsLRN, AlwaysRewrite});
+    */
+    rinfo_.push_back({csinfo_.max_pool,
+                      mkl_op_registry::GetMklOpName(csinfo_.max_pool),
+                      CopyAttrsPooling, NonDepthBatchWisePoolRewrite});
+    rinfo_.push_back({csinfo_.max_pool_grad,
+                      mkl_op_registry::GetMklOpName(csinfo_.max_pool_grad),
+                      CopyAttrsPooling, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.maximum,
+                      mkl_op_registry::GetMklOpName(csinfo_.maximum),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.mul,
+                      mkl_op_registry::GetMklOpName(csinfo_.mul),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.relu,
+                      mkl_op_registry::GetMklOpName(csinfo_.relu),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.relu_grad,
+                      mkl_op_registry::GetMklOpName(csinfo_.relu_grad),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.tanh,
+                      mkl_op_registry::GetMklOpName(csinfo_.tanh),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.tanh_grad,
+                      mkl_op_registry::GetMklOpName(csinfo_.tanh_grad),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.reshape,
+                      mkl_op_registry::GetMklOpName(csinfo_.reshape),
+                      CopyAttrsReshape, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.softmax,
+                      mkl_op_registry::GetMklOpName(csinfo_.softmax),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.squared_difference,
+                      mkl_op_registry::GetMklOpName(csinfo_.squared_difference),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.sub,
+                      mkl_op_registry::GetMklOpName(csinfo_.sub),
+                      CopyAttrsDataType, AlwaysRewrite});
+
+    // Add info about which ops to add workspace edge to and the slots.
+    wsinfo_.push_back({csinfo_.lrn, csinfo_.lrn_grad, 0, 2, 1, 3});
+    wsinfo_.push_back({csinfo_.max_pool, csinfo_.max_pool_grad, 0, 1, 1, 3});
+
+    // Add a rule for merging nodes
+    minfo_.push_back({csinfo_.conv2d, csinfo_.bias_add,
+                      csinfo_.conv2d_with_bias,
+                      GetConv2DOrBiasAdd});
+
+    minfo_.push_back({csinfo_.conv2d_grad_filter, csinfo_.bias_add_grad,
+                      csinfo_.conv2d_grad_filter_with_bias,
+                      GetConv2DBackpropFilterOrBiasAddGrad});
+  }
+
+  // Standard interface to run pass
+  Status Run(const GraphOptimizationPassOptions& options);
+
+  // Helper function which does most of heavy lifting for rewriting
+  // Mkl nodes to propagate Mkl tensor as additional output
+  //
+  // Extracts common functionality between Run public interface and
+  // test interface.
+  //
+  // @return true, if and only if graph is mutated; false otherwise.
+  bool RunPass(std::unique_ptr<Graph>* g);
+
+  /// Structure to specify the name of an original node, its new name after
+  /// rewrite, the number of inputs to the original node, the function to
+  /// be used to copy attributes for the op, and the rule (if any) which
+  /// must hold for rewriting the node
+  typedef struct {
+    string name;      // Original name of op of the node in the graph
+    string new_name;  // New name of the op of the node in the graph
+    // A function handler to copy attributes from an old node to a new node.
+    std::function<void(const Node*, NodeBuilder*)> copy_attrs;
+    // A rule under which to rewrite this node
+    std::function<bool(const Node*)> rewrite_rule;
+  } RewriteInfo;
+
+  /// Structure to specify a forward op, a backward op, and the slot numbers
+  /// in the forward and backward ops where we will add a workspace edge.
+  typedef struct {
+    string fwd_op;    // Name of a forward op in the graph
+    string bwd_op;    // Name of a backward op in the graph
+    int fwd_slot;     // Output slot in the forward op node where actual
+                      // output tensor resides
+    int bwd_slot;     // Input slot in the backward op node where actual
+                      // input tensor resides
+    int ws_fwd_slot;  // Output slot in the forward op node where workspace
+                      // edge is added
+    int ws_bwd_slot;  // Input slot in the backward op node where workspace
+                      // edge is added
+  } WorkSpaceInfo;
+
+  /// Structure to specify information used in node merge of 2 operators
+  typedef struct {
+    string op1;       // Node string for one operator.
+    string op2;       // Node string for second operator.
+    string new_node;  // Name of the node after merge
+    // Function that enables user of the node merger to specify how to find
+    // second operator given the first operator.
+    std::function<Node*(const Node*)> get_node_to_be_merged;
+  } MergeInfo;
+
+  /// Structure to store all constant strings
+  /// NOTE: names are alphabetically sorted.
+  typedef struct {
+    string addn;
+    string add;
+    string avg_pool;
+    string avg_pool_grad;
+    string bias_add;
+    string bias_add_grad;
+    string concat;
+    string concatv2;
+    string conv2d;
+    string conv2d_with_bias;
+    string conv2d_grad_input;
+    string conv2d_grad_filter;
+    string conv2d_grad_filter_with_bias;
+    string fused_batch_norm;
+    string fused_batch_norm_grad;
+    string identity;
+    string lrn;
+    string lrn_grad;
+    string matmul;
+    string max_pool;
+    string max_pool_grad;
+    string maximum;
+    string mkl_conv2d;
+    string mkl_conv2d_grad_input;
+    string mkl_conv2d_grad_filter;
+    string mkl_conv2d_grad_filter_with_bias;
+    string mkl_conv2d_with_bias;
+    string mul;
+    string relu;
+    string relu_grad;
+    string tanh;
+    string tanh_grad;
+    string reshape;
+    string softmax;
+    string split;
+    string squared_difference;
+    string sub;
+  } ConstStringsInfo;
+
+ private:
+  /// Maintain info about nodes to rewrite
+  std::vector<RewriteInfo> rinfo_;
+
+  /// Maintain info about nodes to add workspace edge
+  std::vector<WorkSpaceInfo> wsinfo_;
+
+  /// Maintain info about nodes to be merged
+  std::vector<MergeInfo> minfo_;
+
+  /// Maintain structure of constant strings
+  static ConstStringsInfo csinfo_;
+
+ private:
+  // Is OpDef::ArgDef a list type? It could be N * T or list(type).
+  // Refer to opdef.proto for details of list type.
+  inline bool ArgIsList(const OpDef::ArgDef& arg) const {
+    return !arg.type_list_attr().empty() || !arg.number_attr().empty();
+  }
+
+  // Get length of a list in 'n' if 'arg' is of list type. Refer to
+  // description of ArgIsList for definition of list type.
+  inline int GetTensorListLength(const OpDef::ArgDef& arg, Node* n) {
+    CHECK_EQ(ArgIsList(arg), true);
+    int N = 0;
+    const string attr_name = !arg.type_list_attr().empty()
+                                 ? arg.type_list_attr()
+                                 : arg.number_attr();
+    if (!arg.type_list_attr().empty()) {
+      std::vector<DataType> value;
+      TF_CHECK_OK(GetNodeAttr(n->def(), attr_name, &value));
+      N = value.size();
+    } else {
+      TF_CHECK_OK(GetNodeAttr(n->def(), attr_name, &N));
+    }
+    return N;
+  }
+
+  // Can op represented by node 'n' run on DEVICE_CPU?
+  // Op can run on CPU with MKL if the runtime assigned device or the
+  // user requested device contains device CPU, or both are empty.
+  bool CanOpRunOnCPUDevice(const Node* n) {
+    bool result = true;
+    string reason;
+
+    // Substring that should be checked for in device name for CPU device.
+    const char* const kCPUDeviceSubStr = "CPU";
+
+    // If Op has been specifically assigned to a non-CPU device, then No.
+    if (!n->assigned_device_name().empty() &&
+        !StringPiece(n->assigned_device_name()).contains(kCPUDeviceSubStr)) {
+      result = false;
+      reason = "Op has been assigned a runtime device that is not CPU.";
+    }
+
+    // If user has specifically assigned this op to a non-CPU device, then No.
+    if (!n->def().device().empty() &&
+        !StringPiece(n->def().device()).contains(kCPUDeviceSubStr)) {
+      result = false;
+      reason = "User has assigned a device that is not CPU.";
+    }
+
+    if (result == false) {
+      VLOG(1) << "MklLayoutRewritePass: Skipping rewriting of the node "
+              << n->type_string() << ", reason: " << reason;
+    }
+
+    // Otherwise Yes.
+    return result;
+  }
+
+  // Return a node that can be merged with input node 'n'
+  //
+  // @return pointer to the node if we can find such a
+  // node. Otherwise, it returns nullptr.
+  Node* CheckForNodeMerge(const Node* n) const;
+
+  // Merge node 'm' with node 'n'.
+  // Currently, we merge (1) Conv2D with BiasAdd, and (2) BiasAddGrad with
+  // Conv2DBackpropFilter.
+  //
+  // Input nodes m and n may be deleted if the call to
+  // this function is successful. Attempt to use the pointers
+  // after the call to function may result in undefined behaviors.
+  //
+  // @input g - input graph, m - graph node, n - graph node to be merged with m
+  // @return Status::OK(), if merging is successful and supported.
+  //         Returns appropriate Status error code otherwise.
+  //         Graph is updated in case nodes are merged. Otherwise, it is
+  //         not updated.
+  Status MergeNode(std::unique_ptr<Graph>* g, Node* m, Node* n);
+
+  // Helper function to merge different nodes
+  Status MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g, Node* m, Node* n);
+  Status MergeConv2DBackpropFilterWithBiasAddGrad(std::unique_ptr<Graph>* g,
+                                                  Node* m, Node* n);
+
+  // Find BiasAdd or Conv2D node that can be merged with input node 'm'.
+  // If input 'm' is BiasAdd, then check if there exists Conv2D node that can be
+  // merged with 'm'. If input 'm' is Conv2D, then check if there exists BiasAdd
+  // node that can be merged with 'm'.
+  static Node* GetConv2DOrBiasAdd(const Node* m) {
+    CHECK_NOTNULL(m);
+    Node* n = nullptr;
+
+    if (m->type_string() == csinfo_.bias_add) {
+      // If a is BiasAdd, then Conv2D is 0th input of BiasAdd.
+      TF_CHECK_OK(m->input_node(0, &n));
+    } else {
+      CHECK_EQ(m->type_string(), csinfo_.conv2d);
+      // Go over all output edges and search for BiasAdd Node.
+      // 0th input of BiasAdd is Conv2D.
+      for (const Edge* e : m->out_edges()) {
+        if (!e->IsControlEdge() &&
+            e->dst()->type_string() == csinfo_.bias_add &&
+            e->dst_input() == 0) {
+          n = e->dst();
+          break;
+        }
+      }
+    }
+
+    if (n == nullptr) {
+      VLOG(1) << "MklLayoutRewritePass: Could not find matching "
+              << "Conv2D and BiasAdd node for merging. Input node: "
+              << m->DebugString();
+    }
+
+    return n;
+  }
+
+  // Find Conv2DBackpropFilter or BiasAddGrad node that can be merged with input
+  // node 'm'. If input 'm' is Conv2DBackpropFilter, then check if there exists
+  // BiasAddGrad node that can be merged with 'm'. If input 'm' is BiasAddGrad,
+  // then check if there exists Conv2DBackpropFilter node that can be merged
+  // with 'm'.
+  //
+  // Graph that will allow us to connect Conv2DBackpropFilter with BiasAddGrad
+  // would look like:
+  //
+  // _ = Conv2DBackpropFilter(F, _, G)
+  // _ = BiasAddGrad(G)
+  //
+  // So 1st input of BiasAddGrad connects with 3rd input of
+  // Conv2DBackpropFilter and vice versa.
+  static Node* GetConv2DBackpropFilterOrBiasAddGrad(const Node* m) {
+    CHECK_NOTNULL(m);
+    Node* n = nullptr;
+
+    if (m->type_string() == csinfo_.bias_add_grad) {
+      // Get 1st input 'g' of BiasAddGrad.
+      Node* g = nullptr;
+      TF_CHECK_OK(m->input_node(0, &g));
+      // Now traverse all outgoing edges from g that have destination node as
+      // Conv2DBackpropFilter.
+      for (const Edge* e : g->out_edges()) {
+        if (!e->IsControlEdge() &&
+            e->dst()->type_string() == csinfo_.conv2d_grad_filter &&
+            e->dst_input() == 2 /* 3rd input of BackpropFilter */) {
+          n = e->dst();
+          break;
+        }
+      }
+    } else {
+      CHECK_EQ(m->type_string(), csinfo_.conv2d_grad_filter);
+      // Get 3rd input 'g' of Conv2DBackpropFilter.
+      Node* g = nullptr;
+      TF_CHECK_OK(m->input_node(2, &g));
+      // Now traverse all outgoing edges from g that have destination node as
+      // BiasAddGrad.
+      for (const Edge* e : g->out_edges()) {
+        if (!e->IsControlEdge() &&
+            e->dst()->type_string() == csinfo_.bias_add_grad &&
+            e->dst_input() == 0 /* 1st input of BiasAddGrad */) {
+          n = e->dst();
+          break;
+        }
+      }
+    }
+
+    if (n == nullptr) {
+      VLOG(1) << "MklLayoutRewritePass: Could not find matching "
+              << "Conv2DBackpropFilter and BiasAddGrad node for merging. "
+              << "Input node: " << m->DebugString();
+    }
+    return n;
+  }
+
+  // Check if the node 'n' has any applicable rewrite rule
+  // We check for 2 scenarios for rewrite.
+  //
+  // @return RewriteInfo* for the applicable rewrite rule
+  const RewriteInfo* CheckForNodeRewrite(const Node* n) const;
+
+  // Default rewrite rule to be used in scenario 1 for rewrite.
+  // @return - true (since we want to always rewrite)
+  static bool AlwaysRewrite(const Node* n) {
+    return true;
+  }
+
+  // Check if we are performing pooling on depth or batch. If it is, then we
+  // do not rewrite MaxPool node to Mkl version.
+  // @return - true (if it is not a depth/batch wise pooling case);
+  //           false otherwise.
+  static bool NonDepthBatchWisePoolRewrite(const Node* n) {
+    CHECK_NOTNULL(n);
+
+    string data_format_str;
+    TensorFormat data_format;
+    std::vector<int32> ksize, strides;
+    CHECK_EQ(GetNodeAttr(n->def(), "ksize", &ksize).ok(), true);
+    CHECK_EQ(GetNodeAttr(n->def(), "strides", &strides).ok(), true);
+    CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(),
+             true);
+    CHECK_EQ(FormatFromString(data_format_str, &data_format), true);
+
+    // Condition that specifies non-batch-wise and non-depth-wise pooling.
+    if (GetTensorDim(ksize,   data_format, 'N') == 1 &&
+        GetTensorDim(strides, data_format, 'N') == 1 &&
+        GetTensorDim(ksize,   data_format, 'C') == 1 &&
+        GetTensorDim(strides, data_format, 'C') == 1) {
+      return true;
+    }
+
+    return false;
+  }
+
+  static bool AddNRewrite(const Node* n) {
+    CHECK_NOTNULL(n);
+
+    int num;
+    CHECK_EQ(GetNodeAttr(n->def(), "N", &num).ok(), true);
+
+    // Condition that specifies non-batch-wise and non-depth-wise pooling.
+    if (num == 2) {
+      return true;
+    }
+
+    return false;
+  }
+
+  // Rewrites input node to a new node specified by its matching rewrite info.
+  //
+  // Method first searches matching rewrite info for input node and then
+  // uses that info to rewrite.
+  //
+  // Input node may be deleted in case of rewrite. Attempt to use the node
+  // after the call can result in undefined behaviors.
+  //
+  // @input  g - input graph, n - Node to be rewritten,
+  //         ri - matching rewriteinfo
+  // @return Status::OK(), if the input node is rewritten;
+  //         Returns appropriate Status error code otherwise.
+  //         Graph is updated in case the input node is rewritten.
+  //         Otherwise, it is not updated.
+  Status RewriteNode(std::unique_ptr<Graph>* g, Node* n, const RewriteInfo* ri);
+
+  // Get nodes that will feed a list of TF tensors to the new
+  // node that we are constructing.
+  //
+  // @input g - input graph,
+  // @input inputs - inputs to old node that we are using for constructing
+  //                 new inputs,
+  // @input input_idx - the index in the 'inputs' vector pointing to the
+  //                    current input that we have processed so far
+  // @output input_idx - index will be incremented by the number of nodes
+  //                     from 'inputs' that are processed
+  // @input list_length - The expected length of list of TF tensors
+  // @output output_nodes - the list of new nodes creating TF tensors
+  //
+  // @return None
+  void GetNodesProducingTFTensorList(
+      const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+      int* input_idx, int list_length,
+      std::vector<NodeBuilder::NodeOut>* output_nodes);
+
+  // Get nodes that will feed a list of Mkl tensors to the new
+  // node that we are constructing.
+  //
+  // @input g - input graph,
+  // @input orig_node - Original node that we are rewriting
+  // @input inputs - inputs to old node that we are using for constructing
+  //                 new inputs,
+  // @input input_idx - the index in the 'inputs' vector pointing to the
+  //                    current input that we have processed so far
+  // @output input_idx - index will be incremented by the number of nodes
+  //                     from 'inputs' that are processed
+  // @input list_length - The expected length of list of Mkl tensors
+  // @output output_nodes - the list of new nodes creating Mkl tensors
+  //
+  // @return None
+  void GetNodesProducingMklTensorList(std::unique_ptr<Graph>* g,
+    Node* orig_node, const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+    int* input_idx, int list_length,
+    std::vector<NodeBuilder::NodeOut>* output_nodes);
+
+  // Get a node that will feed an Mkl tensor to the new
+  // node that we are constructing. The output node could be (1) 'n'
+  // if it is Mkl layer, or (2) a dummy node producing dummy Mkl tensor
+  // if 'n' is not an Mkl layer.
+  //
+  // @input g - input graph,
+  // @input orig_node - Original node that we are rewriting,
+  // @input n - Node based on which we are creating Mkl node,
+  // @input n_output_slot - the output slot of node 'n'
+  //            which is feeding to the node that we are constructing
+  // @output mkl_node - the new node that will feed Mkl tensor
+  // @output mkl_node_output_slot - the slot number of mkl_node that
+  //                                will feed the tensor
+  // @return None
+  void GetNodeProducingMklTensor(std::unique_ptr<Graph>* g, Node* orig_node,
+    Node* n, int n_output_slot, Node** mkl_node, int* mkl_node_output_slot);
+
+  // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb'
+  // in graph 'g'. Original node is input in 'old_node'. Inputs to 'nb' are
+  // set up in contiguous fashion. 'workspace_tensors' carry graph nodes
+  // producing workspace edges if 'are_workspace_tensors_available' is true.
+  // Otherwise, 'workspace_tensors' is empty vector.
+  //
+  // For details, refer to 'Ordering of inputs after rewriting' section in the
+  // documentation above.
+  //
+  // Returns Status::OK() if setting up inputs is successful, otherwise
+  // returns appropriate status code.
+  int SetUpContiguousInputs(
+      std::unique_ptr<Graph>* g,
+      const gtl::InlinedVector<std::pair<Node*, int>, 4>& old_node_inputs,
+      NodeBuilder* nb, Node* old_node,
+      std::vector<NodeBuilder::NodeOut>* workspace_tensors,
+      bool are_workspace_tensors_available);
+
+  // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb'
+  // in graph 'g'. Original node is input in 'orig_node'.
+  //
+  // For details, refer to 'Ordering of Tensorflow tensors and Mkl tensors'
+  // section in the documentation above.
+  //
+  // Returns Status::OK() if setting up inputs is successful, otherwise
+  // returns appropriate status code.
+  Status SetUpInputs(std::unique_ptr<Graph>* g,
+                     const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+                     NodeBuilder* nb, Node* orig_node);
+
+  // Add workspace edge on the input or output side of Node 'orig_node' by using
+  // NodeBuilder 'nb' for the new node provided. If 'orig_node' does not dictate
+  // adding workspace edge then do not add it. Workspace Tensorflow and Mkl
+  // tensors, if they need to be added, will be set into these tensors.
+  // If we set workspace tensors, then are_ws_tensors_added should be true.
+  void AddWorkSpaceEdgeIfNeeded(std::unique_ptr<Graph>* g, Node* orig_node,
+                                NodeBuilder* nb,
+                                std::vector<NodeBuilder::NodeOut>* ws_tensors,
+                                bool* are_ws_tensors_added);
+
+  // Functions specific to operators to copy attributes
+  // We need operator-specific function to copy attributes because the framework
+  // does not provide any generic function for it.
+  // NOTE: names are alphabetically sorted.
+  static void CopyAttrsAddN(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsBiasAddGrad(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsConcat(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsConcatV2(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsConv2D(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsDataType(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsFusedBatchNorm(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsPooling(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsReshape(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsSplit(const Node* orig_node, NodeBuilder* nb);
+
+  // Generate a graph node in graph 'g' representing a dummy Mkl tensor node,
+  // using node for original node 'orig_node' and return it in '*out'.
+  // TODO(nhasabni) We should move this to mkl_util.h
+  void GetDummyMklTensorNode(std::unique_ptr<Graph>* g, Node** out,
+                             Node* orig_node);
+  void GetDummyWorkspaceTensorNode(std::unique_ptr<Graph>* g, Node** out,
+                                   Node* orig_node);
+};
+
+MklLayoutRewritePass::ConstStringsInfo MklLayoutRewritePass::csinfo_;
+
+// We register Mkl rewrite pass for phase 1 in post partitioning group.
+// We register it here so that we get a complete picture of all users of Mkl
+// nodes. Do not change the ordering of the Mkl passes.
+const OptimizationPassRegistry::Grouping kMklLayoutRewritePassGroup =
+    OptimizationPassRegistry::POST_PARTITIONING;
+REGISTER_OPTIMIZATION(kMklLayoutRewritePassGroup, 1, MklLayoutRewritePass);
+
+//////////////////////////////////////////////////////////////////////////
+//           Helper functions for creating new node
+//////////////////////////////////////////////////////////////////////////
+
+static void FillInputs(const Node* n,
+                       gtl::InlinedVector<Node*, 4>* control_edges,
+                       gtl::InlinedVector<std::pair<Node*, int>, 4>* in) {
+  control_edges->clear();
+  for (const Edge* e : n->in_edges()) {
+    if (e->IsControlEdge()) {
+      control_edges->push_back(e->src());
+    } else {
+      (*in)[e->dst_input()] = std::make_pair(e->src(), e->src_output());
+    }
+  }
+  std::sort(control_edges->begin(), control_edges->end());
+  if (n->op_def().is_commutative()) {
+    // For commutative inputs, we sort the input by the input Node*
+    // to get a canonical ordering (so that add(a,b) and add(b, a) will
+    // hash to the same value if is_commutative is true for 'add').
+    std::sort(in->begin(), in->end());
+  }
+}
+
+void MklLayoutRewritePass::GetNodesProducingTFTensorList(
+    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs, int* input_idx,
+    int list_length, std::vector<NodeBuilder::NodeOut>* output_nodes) {
+  CHECK_LT(*input_idx, inputs.size());
+  CHECK_GT(list_length, 0);
+  CHECK_NOTNULL(output_nodes);
+  output_nodes->reserve(list_length);
+
+  while (list_length != 0) {
+    CHECK_GT(list_length, 0);
+    CHECK_LT(*input_idx, inputs.size());
+    Node* n = inputs[*input_idx].first;
+    int slot = inputs[*input_idx].second;
+    // If input node 'n' is just producing a single tensor at
+    // output slot 'slot' then we just add that single node.
+    output_nodes->push_back(NodeBuilder::NodeOut(n, slot));
+    (*input_idx)++;
+    list_length--;
+  }
+}
+
+// TODO(nhasabni) We should move this to mkl_util.h.
+void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
+                                                 Node** out, Node* orig_node) {
+  // We use a tensor of shape {8} and value 0,0,0,0,0,0,0,0 to represent
+  // dummy Mkl tensor. 8 = 2*size_t.
+  const DataType dt = DataTypeToEnum<uint8>::v();
+  TensorProto proto;
+  proto.set_dtype(dt);
+  uint8 zero[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+  proto.set_tensor_content(const_cast<const void*>(static_cast<void*>(&zero)),
+                           8);
+  TensorShape dummy_shape({8});
+  dummy_shape.AsProto(proto.mutable_tensor_shape());
+  TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
+               .Attr("value", proto)
+               .Attr("dtype", dt)
+               .Device(orig_node->def().device())  // We place this node on
+                                                   // the same device as the
+                                                   // device of the original
+                                                   // node.
+               .Finalize(&**g, out));
+
+  // If number of inputs to the original node is > 0, then we add
+  // control dependency between 1st input (index 0) of the original node and
+  // the dummy Mkl node. This is needed because control-flow ops such as Enter,
+  // Merge, etc, require frame_name of the dummy Mkl node to be same as the
+  // rewritten node. Adding control edge between 1st input of the original node
+  // and the dummy Mkl node ensures that the dummy node is in the same frame
+  // as the original node. Choosing 1st input is not necessary - any input of
+  // the original node is fine because all the inputs of a node are always in
+  // the same frame.
+  if (orig_node->num_inputs() > 0) {
+    Node* orig_input0 = nullptr;
+    TF_CHECK_OK(orig_node->input_node(0,
+                                      const_cast<const Node**>(&orig_input0)));
+    CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out));
+  }
+
+  (*out)->set_assigned_device_name(orig_node->assigned_device_name());
+}
+
+void MklLayoutRewritePass::GetNodesProducingMklTensorList(
+    std::unique_ptr<Graph>* g,
+    Node* orig_node,
+    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+    int* input_idx, int list_length,
+    std::vector<NodeBuilder::NodeOut>* output_nodes) {
+  CHECK_LT(*input_idx, inputs.size());
+  CHECK_GT(list_length, 0);
+  CHECK_NOTNULL(output_nodes);
+  output_nodes->reserve(list_length);
+
+  while (list_length != 0) {
+    CHECK_GT(list_length, 0);
+    CHECK_LT(*input_idx, inputs.size());
+    Node* n = inputs[*input_idx].first;
+    int slot = inputs[*input_idx].second;
+    // If 'n' is producing a single tensor, then create a single Mkl tensor
+    // node.
+    Node* mkl_node = nullptr;
+    int mkl_node_output_slot = 0;
+    GetNodeProducingMklTensor(g, orig_node, n, slot, &mkl_node,
+                              &mkl_node_output_slot);
+    output_nodes->push_back(NodeBuilder::NodeOut(mkl_node,
+                                                mkl_node_output_slot));
+    (*input_idx)++;
+    list_length--;
+  }
+}
+
+// Get an input node that will feed Mkl tensor to the new
+// node that we are constructing. An input node could be (1) 'n'
+// if it is Mkl layer, or (2) a dummy node producing dummy Mkl tensor
+// if 'n' is not an Mkl layer.
+void MklLayoutRewritePass::GetNodeProducingMklTensor(std::unique_ptr<Graph>* g,
+    Node* orig_node, Node* n,
+    int n_output_slot, Node** mkl_node, int* mkl_node_output_slot) {
+  CHECK_NOTNULL(n);
+  CHECK_NOTNULL(mkl_node);
+  CHECK_NOTNULL(mkl_node_output_slot);
+
+  // If this is an MKL op, then it will create extra output for MKL layout.
+  DataType T;
+  if (GetNodeAttr(n->def(), "T", &T).ok() &&
+      mkl_op_registry::IsMklOp(n->type_string(), T)) {
+    // If this is an MKL op, then it will generate an edge that will receive
+    // Mkl tensor from a node.
+    // output slot number for Mkl tensor would be N+slot number of TensorFlow
+    // tensor, where N is total number of TensorFlow tensors.
+    *mkl_node = n;
+    *mkl_node_output_slot =
+        GetTensorMetaDataIndex(n_output_slot, n->num_outputs());
+  } else {
+    // If we have not visited the node and rewritten it, then we need
+    // to create a dummy node that will feed a dummy Mkl tensor to this node.
+    // DummyMklTensor node has no input and generates only 1 output
+    // (dummy Mkl tensor) as output slot number 0.
+    GetDummyMklTensorNode(g, mkl_node, orig_node);
+    CHECK_NOTNULL(*mkl_node);
+    *mkl_node_output_slot = 0;
+  }
+}
+
+int MklLayoutRewritePass::SetUpContiguousInputs(
+    std::unique_ptr<Graph>* g,
+    const gtl::InlinedVector<std::pair<Node*, int>, 4>& old_node_inputs,
+    NodeBuilder* nb, Node* old_node,
+    std::vector<NodeBuilder::NodeOut>* workspace_tensors,
+    bool are_workspace_tensors_available) {
+  CHECK_NOTNULL(workspace_tensors);
+  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+
+  // TODO(nhasabni): Temporary solution to connect filter input of
+  // BackpropInput with the converted filter from Conv2D.
+  bool do_connect_conv2d_backprop_input_filter = false;
+  Node* conv2d_node = nullptr;
+  // Filter node is 2nd input (slot index 1) of Conv2D.
+  int kConv2DFilterInputSlotIdx = 1;
+  int kConv2DBackpropInputFilterInputSlotIdx = 1;
+  int kConv2DFilterOutputSlotIdx = 1;
+  if (old_node->type_string() == csinfo_.conv2d_grad_input) {
+    // We need to find Conv2D node from Conv2DBackpropInput.
+    // For that let's first find filter node that is 2nd input (slot 1)
+    // of BackpropInput.
+    Node* filter_node = nullptr;
+    old_node->input_node(kConv2DBackpropInputFilterInputSlotIdx, &filter_node);
+    CHECK_NOTNULL(filter_node);
+
+    // Now check which nodes receive from filter_node. Filter feeds as
+    // 2nd input (slot 1) of _MklConv2D and _MklConv2DWithBias.
+    for (const Edge* e : filter_node->out_edges()) {
+      if ((e->dst()->type_string() == csinfo_.mkl_conv2d ||
+           e->dst()->type_string() == csinfo_.mkl_conv2d_with_bias) &&
+          e->dst_input() == kConv2DFilterInputSlotIdx
+          /* filter is 2nd input of Conv2D and _MklConv2D. */) {
+        if (conv2d_node != nullptr) {
+          VLOG(1) << "MklLayoutRewritePass: unusual case of same filter"
+                  << " feeding multiple Conv2D nodes: "
+                  << filter_node->DebugString();
+          // We will not connect filter input of Conv2DBackpropInput
+          // to be safe here.
+          do_connect_conv2d_backprop_input_filter = false;
+          break;
+        } else {
+          conv2d_node = e->dst();
+          do_connect_conv2d_backprop_input_filter = true;
+        }
+      }
+    }
+  }
+
+  // Number of input slots to original op
+  // Input slots are represented by .Input() calls in REGISTER_OP.
+  int old_node_input_slots = old_node->op_def().input_arg_size();
+  // Actual number of inputs can be greater than or equal to number
+  // of Input slots because inputs of type list could be unfolded.
+  CHECK_GE(old_node_inputs.size(), old_node_input_slots);
+  int nn_slot_idx = 0;  // slot index for inputs of new node
+
+  // Let's copy all inputs (TF tensors) of original node to new node.
+  int iidx = 0;
+  for (int on_slot_idx = 0; on_slot_idx < old_node_input_slots; on_slot_idx++) {
+    // An input slot could be a single tensor or a list. We need
+    // to handle this case accordingly.
+    CHECK_LT(iidx, old_node_inputs.size());
+    const OpDef::ArgDef& arg = old_node->op_def().input_arg(on_slot_idx);
+    if (ArgIsList(arg)) {
+      std::vector<NodeBuilder::NodeOut> new_node_inputs;
+      int N = GetTensorListLength(arg, old_node);
+      GetNodesProducingTFTensorList(old_node_inputs, &iidx, N,
+                                    &new_node_inputs);
+      nb->Input(new_node_inputs);
+      nn_slot_idx++;
+    } else {
+      // Special case for connecting filter input of Conv2DBackpropInput
+      if (do_connect_conv2d_backprop_input_filter &&
+          iidx == kConv2DBackpropInputFilterInputSlotIdx) {
+        nb->Input(conv2d_node, kConv2DFilterOutputSlotIdx);
+      } else {
+        nb->Input(old_node_inputs[iidx].first, old_node_inputs[iidx].second);
+      }
+      iidx++;
+      nn_slot_idx++;
+    }
+  }
+
+  // If workspace tensors are available for this op and we are using
+  // contiguous ordering then we need to add Tensorflow tensor for
+  // workspace here because Tensorflow tensor for workspace is the
+  // last tensor in the list of Tensorflow tensors.
+  if (are_workspace_tensors_available) {
+    CHECK_EQ(workspace_tensors->size(), 2);
+    // Tensorflow tensor
+    nb->Input((*workspace_tensors)[0].node, (*workspace_tensors)[0].index);
+    nn_slot_idx++;
+  }
+
+  // Let's now setup all Mkl inputs to a new node.
+  // Number of Mkl inputs must be same as number of TF inputs.
+  iidx = 0;
+  for (int on_slot_idx = 0; on_slot_idx < old_node_input_slots; on_slot_idx++) {
+    // An input slot could be a single tensor or a list. We need
+    // to handle this case accordingly.
+    CHECK_LT(iidx, old_node_inputs.size());
+    const OpDef::ArgDef& arg = old_node->op_def().input_arg(on_slot_idx);
+    if (ArgIsList(arg)) {
+      std::vector<NodeBuilder::NodeOut> new_node_inputs;
+      int N = GetTensorListLength(arg, old_node);
+      GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx,
+                                     N, &new_node_inputs);
+      nb->Input(new_node_inputs);
+      nn_slot_idx++;
+    } else {
+      Node* mkl_node = nullptr;
+      int mkl_node_output_slot = 0;
+      // Special case for connecting filter input of Conv2DBackpropInput
+      if (do_connect_conv2d_backprop_input_filter &&
+          iidx == kConv2DBackpropInputFilterInputSlotIdx) {
+        GetNodeProducingMklTensor(g, old_node, conv2d_node,
+                                  kConv2DFilterOutputSlotIdx, &mkl_node,
+                                  &mkl_node_output_slot);
+      } else {
+        GetNodeProducingMklTensor(g, old_node, old_node_inputs[iidx].first,
+                                  old_node_inputs[iidx].second, &mkl_node,
+                                  &mkl_node_output_slot);
+      }
+      nb->Input(mkl_node, mkl_node_output_slot);
+      iidx++;
+      nn_slot_idx++;
+    }
+  }
+
+  // If workspace tensors are available for this op and we are using
+  // contiguous ordering then we need to add Mkl tensor for
+  // workspace here because Mkl tensor for workspace is the
+  // last tensor in the list of Mkl tensors.
+  if (are_workspace_tensors_available) {
+    CHECK_EQ(workspace_tensors->size(), 2);
+    // Mkl tensor
+    nb->Input((*workspace_tensors)[1].node, (*workspace_tensors)[1].index);
+    nn_slot_idx++;
+  }
+
+  return nn_slot_idx;
+}
+
+Status MklLayoutRewritePass::SetUpInputs(
+    std::unique_ptr<Graph>* g,
+    const gtl::InlinedVector<std::pair<Node*, int>, 4>& old_node_inputs,
+    NodeBuilder* nb, Node* old_node) {
+  // Let's check if we need to add workspace tensors for this node.
+  // We add workspace edge only for MaxPool, LRN and BatchNorm.
+  std::vector<NodeBuilder::NodeOut> workspace_tensors;
+  bool are_workspace_tensors_available = false;
+  AddWorkSpaceEdgeIfNeeded(g, old_node, nb, &workspace_tensors,
+                           &are_workspace_tensors_available);
+
+  int new_node_input_slots = 0;
+  if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
+    // TODO(nhasabni): implement this function just for same of completion.
+    // We do not use interleaved ordering right now.
+    return Status(
+        error::Code::UNIMPLEMENTED,
+        "Interleaved ordering of tensors is currently not supported.");
+  } else {
+    CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+    new_node_input_slots = SetUpContiguousInputs(
+        g, old_node_inputs, nb, old_node, &workspace_tensors,
+        are_workspace_tensors_available);
+  }
+
+  // Sanity check
+  int old_node_input_slots = old_node->op_def().input_arg_size();
+  if (!are_workspace_tensors_available) {
+    // If we are not adding workspace tensors for this op, then the total
+    // number of input slots to the new node _must_ be 2 times the number
+    // of input slots to the original node: N original Tensorflow tensors and
+    // N for Mkl tensors corresponding to each Tensorflow tensors.
+    CHECK_EQ(new_node_input_slots, old_node_input_slots * 2);
+  } else {
+    // If we are adding workspace tensors for this op, then the total
+    // The total number of input slots to new node _must_ be 2 times the number
+    // of input slots to the original node: N original Tensorflow tensors and
+    // N for Mkl tensors corresponding to each Tensorflow tensors plus 2
+    // (for workspace Tensorflow tensor and workspace Mkl tensor).
+    CHECK_EQ(new_node_input_slots, old_node_input_slots * 2 + 2);
+  }
+
+  return Status::OK();
+}
+
+//////////////////////////////////////////////////////////////////////////
+//           Helper functions related to workspace pass
+//////////////////////////////////////////////////////////////////////////
+
+// TODO(nhasabni) We should move this to mkl_util.h.
+void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
+    std::unique_ptr<Graph>* g, Node** out, Node* orig_node) {
+  // We use a tensor of shape {1} and value 0 to represent
+  // dummy float tensor. We need this as a dummy workspace tensor.
+  // Workspace tensor has type float.
+  const DataType dt = DataTypeToEnum<float>::v();
+  TensorProto proto;
+  proto.set_dtype(dt);
+  float zero[1] = {0};
+  proto.set_tensor_content(const_cast<const void*>(static_cast<void*>(&zero)),
+                           4);
+  TensorShape dummy_shape({1});
+  dummy_shape.AsProto(proto.mutable_tensor_shape());
+  TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
+                .Attr("value", proto)
+                .Attr("dtype", dt)
+                .Device(orig_node->def().device())  // We place this node on
+                                                    // same the device as the
+                                                    // device of the original
+                                                    // node.
+                .Finalize(&**g, out));
+
+  // If number of inputs to the original node is > 0, then we add
+  // control dependency between 1st input (index 0) of the original node and
+  // the dummy Mkl node. This is needed because control-flow ops such as Enter,
+  // Merge, etc, require frame_name of the dummy Mkl node to be same as the
+  // rewritten node. Adding control edge between 1st input of the original node
+  // and the dummy Mkl node ensures that the dummy node is in the same frame
+  // as the original node. Choosing 1st input is not necessary - any input of
+  // the original node is fine because all the inputs of a node are always in
+  // the same frame.
+  if (orig_node->num_inputs() > 0) {
+    Node* orig_input0 = nullptr;
+    TF_CHECK_OK(orig_node->input_node(0,
+                                      const_cast<const Node**>(&orig_input0)));
+    CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out));
+  }
+
+  (*out)->set_assigned_device_name(orig_node->assigned_device_name());
+}
+
+void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
+    std::unique_ptr<Graph>* g, Node* orig_node, NodeBuilder* nb,
+    std::vector<NodeBuilder::NodeOut>* ws_tensors, bool* are_ws_tensors_added) {
+  bool workspace_edge_added = false;  // Default initializer
+  CHECK_NOTNULL(are_ws_tensors_added);
+  *are_ws_tensors_added = false;  // Default initializer
+
+  DataType T;
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  for (auto ws : wsinfo_) {
+    if (orig_node->type_string() == ws.fwd_op &&
+        mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(
+          orig_node->type_string()), T)) {
+      // If this op is a fwd op, then we need to check if there is an
+      // edge from this node's fwd_slot to bwdop's bwd_slot. If there is
+      // an edge, then we just add an attribute on this node for setting
+      // workspace_passed to true. We don't add actual workspace edge
+      // in this node. Actual workspace edge gets added in the backward
+      // op for this node.
+      for (const Edge* e : orig_node->out_edges()) {
+        if (e->src_output() == ws.fwd_slot &&
+            e->dst()->type_string() == ws.bwd_op &&
+            e->dst_input() == ws.bwd_slot) {
+          nb->Attr("workspace_enabled", true);
+          VLOG(1) << "MklLayoutRewritePass: workspace_enabled for "
+                  << orig_node->type_string();
+          workspace_edge_added = true;
+          // We found the edge that we were looking for, so break.
+          break;
+        }
+      }
+
+      if (!workspace_edge_added) {
+        // If we are here, then we did not find backward operator for this
+        // node.
+        nb->Attr("workspace_enabled", false);
+      }
+    } else if (orig_node->type_string() == ws.bwd_op &&
+               mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(
+                                          orig_node->type_string()), T)) {
+      // If this op is a bwd op, then we need to add workspace edge and
+      // it's Mkl tensor edge between its corresponding fwd op and this
+      // op. Corresponding fwd op is specified in 'fwd_op' field of
+      // workspace info. fwd_slot and bwd_slot in workspace info specify
+      // an edge between which slots connect forward and backward op.
+      // Once all these criteria match, we add a workspace edge between
+      // ws_fwd_slot and ws_bwd_slot. Its corresponding Mkl tensor is
+      // determined by interleaved/contiguous ordering. Function
+      // DataIndexToMetaDataIndex tells us the location of Mkl tensor
+      // from the location of the Tensorflow tensor.
+      for (const Edge* e : orig_node->in_edges()) {
+        if (e->src_output() == ws.fwd_slot &&
+            // We would have rewritten the forward op, so we need to use
+            // GetMklOpName call to get its Mkl name.
+            e->src()->type_string() == mkl_op_registry::GetMklOpName(
+                                                          ws.fwd_op) &&
+            e->dst_input() == ws.bwd_slot) {
+          nb->Attr("workspace_enabled", true);
+          CHECK_NOTNULL(ws_tensors);
+          // Add workspace edge between fwd op and bwd op.
+          ws_tensors->push_back(NodeBuilder::NodeOut(e->src(), ws.ws_fwd_slot));
+          // Add Mkl tensor edge for workspace edge between fwd op and bwd op.
+          ws_tensors->push_back(NodeBuilder::NodeOut(
+              e->src(), DataIndexToMetaDataIndex(ws.ws_fwd_slot,
+                                                 e->src()->num_outputs())));
+          *are_ws_tensors_added = true;
+          // In terms of input ordering, we add these calls to add Input
+          // here because workspace edge (and its Mkl tensor) is the last
+          // edge in the fwdop and bwdop. So all inputs before workspace
+          // tensor have been added by SetUpInputs function.
+          VLOG(1) << "MklLayoutRewritePass: workspace_enabled for "
+                  << orig_node->type_string();
+          workspace_edge_added = true;
+          // We found the edge that we were looking for, so break.
+          break;
+        }
+      }
+
+      // If we are here means we did not find fwd op that feeds to this
+      // bwd op. So in this case, we need to generate dummy tensors for
+      // workspace input and Mkl tensor for workspace, and set
+      // workspace_enabled to false.
+      if (!workspace_edge_added) {
+        nb->Attr("workspace_enabled", false);
+        Node* dmt_ws = nullptr;      // Dummy tensor for workspace
+        Node* dmt_mkl_ws = nullptr;  // Dummy Mkl tensor for workspace
+        GetDummyWorkspaceTensorNode(g, &dmt_ws, orig_node);
+        GetDummyMklTensorNode(g, &dmt_mkl_ws, orig_node);
+        CHECK_NOTNULL(dmt_ws);
+        CHECK_NOTNULL(dmt_mkl_ws);
+        CHECK_NOTNULL(ws_tensors);
+        // We add dummy tensor as workspace tensor.
+        ws_tensors->push_back(NodeBuilder::NodeOut(dmt_ws, 0));
+        // We add dummy tensor as Mkl tensor for workspace tensor.
+        ws_tensors->push_back(NodeBuilder::NodeOut(dmt_mkl_ws, 0));
+        *are_ws_tensors_added = true;
+        VLOG(1) << "MklLayoutRewritePass: dummy workspace_enabled for "
+                << orig_node->type_string();
+      }
+    } else {
+      // If this node does not match any workspace info, then we do not
+      // do anything special for workspace propagation for it.
+    }
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Op-specific functions to copy attributes from old node to new node
+//////////////////////////////////////////////////////////////////////////
+
+void MklLayoutRewritePass::CopyAttrsConv2D(const Node* orig_node,
+                                           NodeBuilder* nb) {
+  DataType T;
+  string data_format;
+  string padding;
+  std::vector<int32> strides;
+  bool use_cudnn_on_gpu;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+  TF_CHECK_OK(
+      GetNodeAttr(orig_node->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("strides", strides);
+  nb->Attr("padding", padding);
+  nb->Attr("data_format", data_format);
+  nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu);
+}
+
+void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node,
+                                         NodeBuilder* nb) {
+  DataType T;
+  int N;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("N", N);
+}
+
+void MklLayoutRewritePass::CopyAttrsBiasAddGrad(const Node* orig_node,
+                                                NodeBuilder* nb) {
+  DataType T;
+  string data_format;
+  std::vector<int32> strides;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("strides", strides);
+  nb->Attr("data_format", data_format);
+}
+
+void MklLayoutRewritePass::CopyAttrsLRN(const Node* orig_node,
+                                        NodeBuilder* nb) {
+  DataType T;
+  int depth_radius;
+  float bias;
+  float alpha;
+  float beta;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "depth_radius", &depth_radius));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "bias", &bias));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "alpha", &alpha));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "beta", &beta));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("depth_radius", depth_radius);
+  nb->Attr("bias", bias);
+  nb->Attr("alpha", alpha);
+  nb->Attr("beta", beta);
+}
+
+void MklLayoutRewritePass::CopyAttrsPooling(const Node* orig_node,
+                                            NodeBuilder* nb) {
+  DataType T;
+  string data_format;
+  string padding;
+  std::vector<int32> ksize, strides;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "ksize", &ksize));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("ksize", ksize);
+  nb->Attr("strides", strides);
+  nb->Attr("padding", padding);
+  nb->Attr("data_format", data_format);
+}
+
+void MklLayoutRewritePass::CopyAttrsDataType(const Node* orig_node,
+                                             NodeBuilder* nb) {
+  DataType T;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+}
+
+void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node,
+                                           NodeBuilder* nb) {
+  DataType T;
+  DataType Tshape;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tshape", &Tshape));
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("Tshape", Tshape);
+}
+
+void MklLayoutRewritePass::CopyAttrsSplit(const Node* orig_node,
+                                          NodeBuilder* nb) {
+  DataType T;
+  string data_format;
+  int num_split;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "num_split", &num_split));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("num_split", num_split);
+  nb->Attr("data_format", data_format);
+}
+
+void MklLayoutRewritePass::CopyAttrsConcat(const Node* orig_node,
+                                           NodeBuilder* nb) {
+  DataType T;
+  int N;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("N", N);
+}
+
+void MklLayoutRewritePass::CopyAttrsConcatV2(const Node* orig_node,
+                                             NodeBuilder* nb) {
+  DataType T;
+  int N;
+  DataType tidx;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tidx", &tidx));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("N", N);
+  nb->Attr("Tidx", tidx);
+}
+
+void MklLayoutRewritePass::CopyAttrsFusedBatchNorm(const Node* orig_node,
+                                                   NodeBuilder* nb) {
+  DataType T;
+  float epsilon;
+  string data_format;
+  bool is_training;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "epsilon", &epsilon));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "is_training", &is_training));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("epsilon", epsilon);
+  nb->Attr("data_format", data_format);
+  nb->Attr("is_training", is_training);
+}
+
+//////////////////////////////////////////////////////////////////////////
+//           Helper functions related to node merge pass
+//////////////////////////////////////////////////////////////////////////
+
+Node* MklLayoutRewritePass::CheckForNodeMerge(const Node* a) const {
+  // TODO(nhasabni) Add check for type of node similar to CheckForNodeRewrite
+  // once we support BiasAddGrad as Mkl layer.
+
+  // Search for all matching mergeinfo.
+  // We allow more than one match for extensibility.
+  std::vector<const MergeInfo*> matching_mi;
+  for (auto mi = minfo_.cbegin(); mi != minfo_.cend(); ++mi) {
+    if (a->type_string() == mi->op1 || a->type_string() == mi->op2) {
+      matching_mi.push_back(&*mi);
+    }
+  }
+
+  for (const MergeInfo* mi : matching_mi) {
+    // Get the operand with which 'a' can be merged.
+    Node* b = nullptr;
+    if ((b = mi->get_node_to_be_merged(a)) == nullptr) {
+      continue;
+    }
+
+    // Get the control edges and input of node
+    const int N_in = a->num_inputs();
+    gtl::InlinedVector<Node*, 4> a_control_edges;
+    gtl::InlinedVector<std::pair<Node*, int>, 4> a_in(N_in);
+    FillInputs(a, &a_control_edges, &a_in);
+
+    const int B_in = b->num_inputs();
+    gtl::InlinedVector<Node*, 4> b_control_edges;
+    gtl::InlinedVector<std::pair<Node*, int>, 4> b_in(B_in);
+    FillInputs(b, &b_control_edges, &b_in);
+
+    // Shouldn't merge if a and b have different control edges.
+    if (a_control_edges != b_control_edges) {
+      continue;
+    } else {
+      // We found a match.
+      return b;
+    }
+  }
+
+  return nullptr;
+}
+
+Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
+                                                    Node* m, Node* n) {
+  CHECK_EQ(((m->type_string() == csinfo_.bias_add &&
+             n->type_string() == csinfo_.conv2d)) ||
+           ((n->type_string() == csinfo_.bias_add &&
+             m->type_string() == csinfo_.conv2d)), true);
+
+  // If 'm' is BiasAdd, then 'n' is Conv2D. Since Conv2D feeds BiasAdd,
+  // BiasAdd is successor node, and Conv2D predecessor node.
+  Node* pred = m->type_string() == csinfo_.bias_add ? n : m;
+  Node* succ = m->type_string() == csinfo_.bias_add ? m : n;
+
+  // 1. Get all attributes from input nodes.
+  DataType T_pred, T_succ;
+  string padding;
+  std::vector<int32> strides;
+  string data_format_pred, data_format_succ;
+  bool use_cudnn_on_gnu;
+  TF_CHECK_OK(GetNodeAttr(pred->def(), "T", &T_pred));
+  TF_CHECK_OK(GetNodeAttr(succ->def(), "T", &T_succ));
+  TF_CHECK_OK(GetNodeAttr(pred->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(pred->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(pred->def(), "data_format", &data_format_pred));
+  TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
+  TF_CHECK_OK(
+      GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu));
+  // We check to ensure that data formats of both succ and pred are same.
+  // We expect them to be same, so we can enforce this as assert.
+  // But assert can be too strict, so we enforce this as a check.
+  // If the check fails, then we do not merge two nodes.
+  // We also do same check for devices.
+  if (data_format_pred != data_format_succ || T_pred != T_succ ||
+      pred->assigned_device_name() != succ->assigned_device_name() ||
+      pred->def().device() != succ->def().device()) {
+    return Status(error::Code::INVALID_ARGUMENT,
+                  "data_format or T attribute or devices of Conv2D and "
+                  "BiasAdd do not match. Will skip node merge optimization");
+  }
+
+  const int succ_num = succ->num_inputs();
+  gtl::InlinedVector<Node*, 4> succ_control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> succ_in(succ_num);
+  FillInputs(succ, &succ_control_edges, &succ_in);
+
+  const int pred_num = pred->num_inputs();
+  gtl::InlinedVector<Node*, 4> pred_control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> pred_in(pred_num);
+  FillInputs(pred, &pred_control_edges, &pred_in);
+
+  // We need to ensure that Conv2D only feeds to BiasAdd (some other operator is
+  // not expecting output of Conv2D). If this is not the case, then we cannot
+  // merge Conv2D with BiasAdd.
+  const int kFirstOutputSlot = 0;
+  for (const Edge* e : pred->out_edges()) {
+    if (e->src_output() == kFirstOutputSlot && e->dst() != succ) {
+      return Status(error::Code::INVALID_ARGUMENT,
+                    "Conv2D does not feed to BiasAdd, or "
+                    "it feeds BiasAdd but has multiple outputs. "
+                    "Will skip node merge optimization");
+    }
+  }
+
+  // 2. Get inputs from both the nodes.
+  // Find the 2 inputs from the conv and the bias from the add Bias.
+  // Get operand 0, 1 of conv2D.
+  CHECK_EQ(pred->in_edges().size(), 2);  // Conv2D must have 2 inputs.
+  // Get operand 1 of add_bias
+  // BiasAdd must have 2 inputs: Conv, bias
+  CHECK_EQ(succ->in_edges().size(), 2);
+
+  // We will use the node name of BiasAdd as the name of new node
+  // Build new node. We use same name as original node, but change the op
+  // name.
+  NodeBuilder nb(succ->name(), csinfo_.conv2d_with_bias);
+  nb.Input(pred_in[0].first, pred_in[0].second);  // In1 of Conv2D
+  // pred_in[1] will be 2nd Tensorflow tensor for Conv2D.
+  nb.Input(pred_in[1].first, pred_in[1].second);  // In2 of Conv2D
+  // In1 of BiasAdd is same as output of Conv2D.
+  nb.Input(succ_in[1].first, succ_in[1].second);  // In2 of BiasAdd
+
+  // Copy attributes from Conv2D to Conv2DWithBias.
+  CopyAttrsConv2D(const_cast<const Node*>(pred), &nb);
+
+  // Copy the device assigned to old node to new node.
+  nb.Device(succ->def().device());
+
+  // Create node.
+  Node* new_node;
+  nb.Finalize(&**g, &new_node);
+  CHECK_NOTNULL(new_node);
+
+  // Incoming data edges from 'pred' node and 'succ' node to new 'new_node'
+  // node are already copied in BuildNode. We handle control edges now.
+  for (const Edge* e : pred->in_edges()) {
+    if (e->IsControlEdge()) {
+      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+    }
+  }
+  for (const Edge* e : succ->in_edges()) {
+    if (e->IsControlEdge()) {
+      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+    }
+  }
+
+  // Incoming edges are fixed, we will fix the outgoing edges now.
+  // First, we will fix outgoing control edges from 'pred' node.
+  for (const Edge* e : pred->out_edges()) {
+    if (e->IsControlEdge()) {
+      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
+    }
+  }
+
+  // Second, we will fix outgoing control and data edges from 'succ' node.
+  for (const Edge* e : succ->out_edges()) {
+    if (e->IsControlEdge()) {
+      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
+    } else {
+      // BiasAdd has only 1 output (at slot 0) and merged node also has only 1
+      // output (at slot 0).
+      const int kConv2DWithBiasOutputSlot = 0;
+      CHECK_NOTNULL((*g)->AddEdge(new_node, kConv2DWithBiasOutputSlot,
+                                    e->dst(), e->dst_input()));
+    }
+  }
+
+  // Copy device assigned to old node to new node.
+  // It's ok to use pred or succ as we have enforced a check that
+  // both have same device assigned.
+  new_node->set_assigned_device_name(pred->assigned_device_name());
+
+  VLOG(1) << "MklLayoutRewritePass: Merged old node:" << pred->DebugString()
+          << ", and node: " << succ->DebugString()
+          << ", into node:" << new_node->DebugString();
+
+  (*g)->RemoveNode(succ);
+  (*g)->RemoveNode(pred);
+
+  return Status::OK();
+}
+
+Status MklLayoutRewritePass::MergeConv2DBackpropFilterWithBiasAddGrad(
+    std::unique_ptr<Graph>* g, Node* m, Node* n) {
+  CHECK_EQ(((m->type_string() == csinfo_.bias_add_grad &&
+             n->type_string() == csinfo_.conv2d_grad_filter)) ||
+           ((n->type_string() == csinfo_.bias_add_grad &&
+             m->type_string() == csinfo_.conv2d_grad_filter)), true);
+
+  // If 'm' is BiasAddGrad, then 'n' is BackpropFilter.
+  Node* badd = m->type_string() == csinfo_.bias_add_grad ? m : n;
+  Node* fltr = m->type_string() == csinfo_.bias_add_grad ? n : m;
+
+  // Sanity check for attributes from input nodes.
+  DataType T_b, T_f;
+  string data_format_b, data_format_f;
+  TF_CHECK_OK(GetNodeAttr(badd->def(), "T", &T_b));
+  TF_CHECK_OK(GetNodeAttr(fltr->def(), "T", &T_f));
+  TF_CHECK_OK(GetNodeAttr(badd->def(), "data_format", &data_format_b));
+  TF_CHECK_OK(GetNodeAttr(fltr->def(), "data_format", &data_format_f));
+  if (data_format_b != data_format_f || T_b != T_f ||
+      badd->assigned_device_name() != fltr->assigned_device_name() ||
+      badd->def().device() != fltr->def().device()) {
+    return Status(error::Code::INVALID_ARGUMENT,
+                  "data_format or T attribute or devices of "
+                  "Conv2DBackpropFilter and BiasAddGrad do not match. "
+                  "Will skip node merge optimization");
+  }
+
+  // We will use the node name of Conv2DBackpropFilter as the name of new node.
+  // This is because BackpropFilterWithBias is going to emit bias output also.
+  NodeBuilder nb(fltr->name(), csinfo_.conv2d_grad_filter_with_bias);
+  // Since Conv2DBackpropFilterWithBias has same number of inputs as
+  // Conv2DBackpropFilter, we can just copy input edges directly. We dont need
+  // to copy any data input of BiasAddGrad because that input also goes to
+  // Conv2DBackpropFilter.
+  const int fltr_ins = fltr->num_inputs();
+  gtl::InlinedVector<Node*, 4> fltr_control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> fltr_in_edges(fltr_ins);
+  FillInputs(fltr, &fltr_control_edges, &fltr_in_edges);
+  for (int idx = 0; idx < fltr_ins; idx++) {
+    nb.Input(fltr_in_edges[idx].first, fltr_in_edges[idx].second);
+  }
+
+  // Copy attributes from Conv2DBackpropFilter.
+  CopyAttrsConv2D(const_cast<const Node*>(fltr), &nb);
+
+  // Copy the device assigned to old node to new node.
+  nb.Device(fltr->def().device());
+
+  // Create node.
+  Node* new_node;
+  nb.Finalize(&**g, &new_node);
+  CHECK_NOTNULL(new_node);
+
+  // Incoming data edges from BiasAddGrad node and Conv2DBackpropFilter node to
+  // new 'new_node' node are already copied in BuildNode. We handle control
+  // edges now.
+  for (const Edge* e : badd->in_edges()) {
+    if (e->IsControlEdge()) {
+      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+    }
+  }
+  for (const Edge* e : fltr->in_edges()) {
+    if (e->IsControlEdge()) {
+      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+    }
+  }
+
+  // Incoming edges are fixed, we will fix the outgoing edges now.
+  // First, we will fix outgoing control edges from 'badd' node.
+  // Conv2DBackpropFilter has 1 output -- filter_grad.
+  // Conv2DBackpropFilterWithBias has 2 outputs -- filter_grad and
+  // bias_grad. But filter_grad is at same slot number (0) in both the
+  // nodes. bias_grad is at slot number 1 in Conv2DBackpropFilterWithBias, while
+  // it is at slot number 0 in BiasAddGrad.
+  const int kMergedNodeFilterGradOutputIdx = 0;
+  const int kMergedNodeBiasGradOutputIdx = 1;
+
+  for (const Edge* e : badd->out_edges()) {
+    if (e->IsControlEdge()) {
+      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
+    } else {
+      CHECK_NOTNULL((*g)->AddEdge(new_node, kMergedNodeBiasGradOutputIdx,
+                                  e->dst(), e->dst_input()));
+    }
+  }
+
+  // Second, we will fix outgoing control and data edges from 'fltr' node.
+  for (const Edge* e : fltr->out_edges()) {
+    if (e->IsControlEdge()) {
+      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
+    } else {
+      CHECK_NOTNULL((*g)->AddEdge(new_node, kMergedNodeFilterGradOutputIdx,
+                                  e->dst(), e->dst_input()));
+    }
+  }
+
+  // Copy device assigned to old node to new node.
+  // It's ok to use badd or fltr as we have enforced a check that
+  // both have same device assigned.
+  new_node->set_assigned_device_name(badd->assigned_device_name());
+
+  VLOG(1) << "MklLayoutRewritePass: Merged old node:" << badd->DebugString()
+          << ", and node: " << fltr->DebugString()
+          << ", into node:" << new_node->DebugString();
+
+  (*g)->RemoveNode(badd);
+  (*g)->RemoveNode(fltr);
+
+  return Status::OK();
+}
+
+Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g, Node* m,
+                                       Node* n) {
+  CHECK_NOTNULL(m);
+  CHECK_NOTNULL(n);
+
+  if (((m->type_string() == csinfo_.bias_add &&
+        n->type_string() == csinfo_.conv2d)) ||
+      ((n->type_string() == csinfo_.bias_add &&
+        m->type_string() == csinfo_.conv2d))) {
+    return this->MergeConv2DWithBiasAdd(g, m, n);
+  }
+
+  if (((m->type_string() == csinfo_.bias_add_grad &&
+        n->type_string() == csinfo_.conv2d_grad_filter)) ||
+      ((n->type_string() == csinfo_.bias_add_grad &&
+        m->type_string() == csinfo_.conv2d_grad_filter))) {
+    return this->MergeConv2DBackpropFilterWithBiasAddGrad(g, m, n);
+  }
+
+  return Status(error::Code::UNIMPLEMENTED,
+                "Unimplemented case for node merge optimization.");
+}
+
+//////////////////////////////////////////////////////////////////////////
+//           Helper functions for node rewrite
+//////////////////////////////////////////////////////////////////////////
+
+Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
+                                         Node* orig_node,
+                                         const RewriteInfo* ri) {
+  CHECK_NOTNULL(ri);
+  CHECK_NOTNULL(orig_node);
+
+  VLOG(1) << "MklLayoutRewritePass: Original node:" << orig_node->DebugString();
+
+  // Get all inputs.
+  int num_inputs = orig_node->in_edges().size();
+
+  // Drop count for control edges from inputs
+  for (const Edge* e : orig_node->in_edges()) {
+    if (e->IsControlEdge()) {
+      num_inputs--;
+    }
+  }
+
+  gtl::InlinedVector<Node*, 4> control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> inputs(num_inputs);
+  FillInputs(orig_node, &control_edges, &inputs);
+
+  // Build new node. We use same name as original node, but change the op name.
+  NodeBuilder nb(orig_node->name().c_str(), ri->new_name.c_str());
+  // Copy user-specified device assigned to original node to new node.
+  nb.Device(orig_node->def().device());
+  // Set up new inputs to the rewritten node.
+  Status s = SetUpInputs(g, inputs, &nb, orig_node);
+  if (s != Status::OK()) {
+    return s;
+  }
+
+  ri->copy_attrs(const_cast<const Node*>(orig_node), &nb);
+  // Set the Mkl layer label for this op.
+  nb.Attr("_kernel", mkl_op_registry::kMklOpLabel);
+
+  // Finalize graph and get new node.
+  Node* new_node = nullptr;
+  TF_CHECK_OK(nb.Finalize(&**g, &new_node));
+  CHECK_NOTNULL(new_node);
+
+  // Incoming data edges from 'orig_node' node to new 'new_node' node are
+  // already copied in BuildNode. We need to handle control edges now.
+  for (const Edge* e : orig_node->in_edges()) {
+    if (e->IsControlEdge()) {
+      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+    }
+  }
+
+  // Copy outgoing edges from 'orig_node' node to new
+  // 'new_node' node, since the output also follows same ordering among
+  // Tensorflow tensors and Mkl tensors. We need to connect Tensorflow
+  // tensors appropriately. Specifically, nth output of the original node
+  // will become 2*nth output of the Mkl node for the interleaved ordering
+  // of the tensors. For the contiguous ordering of the tensors, it will be n.
+  // GetTensorDataIndex provides this mapping function.
+  for (const Edge* e : orig_node->out_edges()) {
+    if (e->IsControlEdge()) {
+      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
+    } else {
+      CHECK_NOTNULL((*g)->AddEdge(new_node, GetTensorDataIndex(e->src_output(),
+                            e->src()->num_outputs()),
+                    e->dst(), e->dst_input()));
+    }
+  }
+
+  // Copy the runtime device assigned from original code to new node.
+  new_node->set_assigned_device_name(orig_node->assigned_device_name());
+
+  // Delete original node and mark new node as rewritten.
+  (*g)->RemoveNode(orig_node);
+
+  VLOG(1) << "MklLayoutRewritePass: New node:" << new_node->DebugString();
+  return Status::OK();
+}
+
+const MklLayoutRewritePass::RewriteInfo*
+MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
+  CHECK_NOTNULL(n);
+
+  // First check if node along with its type is supported by MKL layer.
+  // We do not want to rewrite an op into Mkl op if types are not supported.
+  // E.g., MklRelu does not support INT32. So we cannot rewrite Relu to
+  // MklRelu if type is INT32.
+  DataType T;
+  if (!GetNodeAttr(n->def(), "T", &T).ok()) {
+    return nullptr;
+  }
+
+  // We make an exception for __MklDummyConv2DWithBias and
+  // __MklConv2DBackpropFilterWithBias since their names do not match Mkl node
+  // names.
+  if (n->type_string() != csinfo_.conv2d_with_bias &&
+      n->type_string() != csinfo_.conv2d_grad_filter_with_bias &&
+      !mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(
+                                        n->type_string()), T)) {
+      return nullptr;
+  }
+
+  // For elementwise node, we reuse the Eigen implementation and pass the MKL
+  // metadata tensor through so we can avoid conversions. However, if all
+  // incoming edges are in TF format, we don't need all this overhead, so
+  // replace the elementwise node only if at least one of its parents is a MKL
+  // node.
+  //
+  // Identity nodes can also skip replacement if they are not being served by
+  // any MKL nodes.
+  //
+  // TODO(vrane): Add implementation for element-wise ops that doesn't reuse
+  // eigen code to reduce cross-library dependency.
+  VLOG(1) << "ELEMENTWISE: checking op: " << n->type_string();
+  if (mkl_op_registry::IsMklElementWiseOp(
+        mkl_op_registry::GetMklOpName(n->type_string()), T) ||
+      n->type_string().find("Identity") != string::npos) {
+    VLOG(1) << "ELEMENTWISE: op is elementwise: " << n->type_string();
+    bool incoming_mkl_edge = false;
+    int num_parent = 0;
+    for (auto parent : n->in_edges()) {
+      if (mkl_op_registry::IsMklOp(parent->src()->type_string(), T)) {
+        VLOG(1) << "ELEMENTWISE: parent " << num_parent++ << " is MKL op: "
+                << parent->src()->type_string();
+        incoming_mkl_edge = true;
+        break;
+      } else {
+        VLOG(1) << "ELEMENTWISE: parent " << num_parent++ << " is NON-MKL op: "
+                << parent->src()->type_string();
+      }
+    }
+    if (incoming_mkl_edge == false) {
+      VLOG(1) << "ELEMENTWISE: Skipping replacement of elementwise node which has no MKL "
+                 "parents.";
+      return nullptr;
+    } else {
+      VLOG(1) << "ELEMENTWISE: Replacing elementwise node " << n->type_string() <<
+        " which has MKL parents";
+    }
+  }
+
+  // We now check if rewrite rule applies for this op. If rewrite rule passes
+  // for this op, then we rewrite it to Mkl op.
+  // Find matching RewriteInfo and then check that rewrite rule applies.
+  for (auto ri = rinfo_.cbegin(); ri != rinfo_.cend(); ++ri) {
+    if (n->type_string().compare(ri->name) == 0 &&
+        ri->rewrite_rule(n)) {
+      return &*ri;
+    }
+  }
+
+  // Else return not found.
+  return nullptr;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//              Run function for the pass
+///////////////////////////////////////////////////////////////////////////////
+
+bool MklLayoutRewritePass::RunPass(std::unique_ptr<Graph>* g) {
+  bool result = false;
+  CHECK_NOTNULL(g);
+
+  DumpGraph("Before running MklLayoutRewritePass", &**g);
+
+  std::vector<Node*> order;
+  GetReversePostOrder(**g, &order);  // This will give us topological sort.
+  for (Node* n : order) {
+    // If node is not an op or it cannot run on CPU device, then skip.
+    if (!n->IsOp() || !CanOpRunOnCPUDevice(n)) {
+      continue;
+    }
+
+    Node* m = nullptr;
+    if ((m = CheckForNodeMerge(n)) != nullptr && CanOpRunOnCPUDevice(m)) {
+      // Check if the node 'n' can be merged with any other node. If it can
+      // be 'm' contains the node with which it can be merged.
+      string n1_name = n->name();
+      string n2_name = m->name();
+
+      VLOG(1) << "MklLayoutRewritePass: Scheduled nodes " << n1_name << " and "
+              << n2_name << " for merging";
+
+      if (MergeNode(g, n, m) == Status::OK()) {
+        VLOG(1) << "MklLayoutRewritePass: Merged nodes " << n1_name << " and "
+                << n2_name;
+        result = true;
+      }
+    }
+  }
+
+  DumpGraph("After running MklLayoutRewritePass(NodeMerge)", &**g);
+
+  order.clear();
+  GetReversePostOrder(**g, &order);  // This will give us topological sort.
+  for (Node* n : order) {
+    // If node is not an op or it cannot run on CPU device, then skip.
+    if (!n->IsOp() || !CanOpRunOnCPUDevice(n)) {
+      continue;
+    }
+
+    const RewriteInfo* ri = nullptr;
+    // We will first search if node is to be rewritten.
+    if ((ri = CheckForNodeRewrite(n)) != nullptr) {
+      string node_name = n->name();
+      string op_name = n->type_string();
+
+      VLOG(1) << "MklLayoutRewritePass: Scheduled node " << node_name
+              << " with op " << op_name << " for rewrite using"
+              << " layout optimization.";
+
+      if (RewriteNode(g, n, ri) == Status::OK()) {
+        VLOG(1) << "MklLayoutRewritePass: rewrote node " << node_name
+                << " with op " << op_name << " for Mkl layout optimization.";
+        result = true;
+      }
+    }
+  }
+
+  DumpGraph("After running MklLayoutRewritePass(NodeMerge+Rewrite)", &**g);
+
+  return result;
+}
+
+bool RunMklLayoutRewritePass(std::unique_ptr<Graph>* g) {
+  return MklLayoutRewritePass().RunPass(g);
+}
+
+Status MklLayoutRewritePass::Run(
+  const GraphOptimizationPassOptions& options) {
+  if (options.graph == nullptr && options.partition_graphs == nullptr) {
+    return Status::OK();
+  }
+
+  auto process_graph = [&](std::unique_ptr<Graph>* g) {
+    // Get the ownership of a graph
+    std::unique_ptr<Graph>* ng = std::move(g);
+    RunPass(ng);
+    // Return the ownership of a graph back
+    g->reset(ng->release());
+  };
+
+  if (kMklLayoutRewritePassGroup !=
+      OptimizationPassRegistry::POST_PARTITIONING) {
+    // For any pre-partitioning phase, a graph is stored in options.graph.
+    process_graph(options.graph);
+  } else {
+    // For post partitioning phase, graphs are stored in
+    // options.partition_graphs.
+    for (auto& pg : *options.partition_graphs) {
+      process_graph(&pg.second);
+    }
+  }
+
+  return Status::OK();
+}
+#endif  // INTEL_MKL_DNN
 }  // namespace tensorflow
 
 #endif
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index abc63e4f35aa9fd6f1df127741ae6d10f49024b9..75f7ca2d4d7ce7c86858a40fe34fed6aa707c9e5 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -37,6 +37,9 @@ limitations under the License.
 #include "tensorflow/core/platform/test_benchmark.h"
 
 namespace tensorflow {
+
+#ifndef INTEL_MKL_DNN
+
 namespace {
 
 const char kCPUDevice[] = "/job:a/replica:0/task:0/device:CPU:0";
@@ -1881,6 +1884,1627 @@ static void BM_MklLayoutRewritePass(int iters, int op_nodes) {
 BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000);
 
 }  // namespace
+
+#else  // INTEL_MKL_DNN
+
+namespace {
+
+const char kCPUDevice[] = "/job:a/replica:0/task:0/device:CPU:0";
+const char kGPUDevice[] = "/job:a/replica:0/task:0/device:GPU:0";
+
+static void InitGraph(const string& s, Graph* graph,
+                      const string& device = kCPUDevice) {
+  GraphDef graph_def;
+
+  auto parser = protobuf::TextFormat::Parser();
+  //  parser.AllowRelaxedWhitespace(true);
+  CHECK(parser.MergeFromString(s, &graph_def)) << s;
+  GraphConstructorOptions opts;
+  TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph_def, graph));
+
+  for (Node* node : graph->nodes()) {
+    node->set_assigned_device_name(device);
+  }
+}
+
+class MklLayoutPassTest : public ::testing::Test {
+ public:
+  MklLayoutPassTest() : graph_(OpRegistry::Global()) {}
+
+  void InitGraph(const string& s, const string& device = kCPUDevice) {
+    ::tensorflow::InitGraph(s, &graph_, device);
+    original_ = CanonicalGraphString(&graph_);
+  }
+
+  static bool IncludeNode(const Node* n) { return n->IsOp(); }
+
+  static string EdgeId(const Node* n, int index) {
+    if (index == 0) {
+      return n->name();
+    } else if (index == Graph::kControlSlot) {
+      return strings::StrCat(n->name(), ":control");
+    } else {
+      return strings::StrCat(n->name(), ":", index);
+    }
+  }
+
+  string CanonicalGraphString(Graph* g) {
+    std::vector<string> nodes;
+    std::vector<string> edges;
+    for (const Node* n : g->nodes()) {
+      if (IncludeNode(n)) {
+        nodes.push_back(strings::StrCat(n->name(), "(", n->type_string(), ")"));
+      }
+    }
+    for (const Edge* e : g->edges()) {
+      if (IncludeNode(e->src()) && IncludeNode(e->dst())) {
+        edges.push_back(strings::StrCat(EdgeId(e->src(), e->src_output()), "->",
+                                        EdgeId(e->dst(), e->dst_input())));
+      }
+    }
+    // Canonicalize
+    std::sort(nodes.begin(), nodes.end());
+    std::sort(edges.begin(), edges.end());
+    return strings::StrCat(str_util::Join(nodes, ";"), "|",
+                           str_util::Join(edges, ";"));
+  }
+
+  string DoMklLayoutOptimizationPass() {
+    string before = CanonicalGraphString(&graph_);
+    LOG(ERROR) << "Before MKL layout rewrite pass: " << before;
+
+    std::unique_ptr<Graph>* ug = new std::unique_ptr<Graph>(&graph_);
+    RunMklLayoutRewritePass(ug);
+
+    string result = CanonicalGraphString(&graph_);
+    LOG(ERROR) << "After MKL layout rewrite pass:  " << result;
+    return result;
+  }
+
+  const string& OriginalGraph() const { return original_; }
+
+  Graph graph_;
+  string original_;
+};
+
+REGISTER_OP("Input").Output("o: float").SetIsStateful();
+REGISTER_OP("InputList").Output("o: N * float").Attr("N: int").SetIsStateful();
+REGISTER_OP("HalfInput").Output("o: half").SetIsStateful();
+REGISTER_OP("Int32Input").Output("o: int32").SetIsStateful();
+REGISTER_OP("_MklInput").Output("o: uint8").SetIsStateful();
+REGISTER_OP("_MklInput2").Output("o: uint8")
+                        .Output("o1: uint8").SetIsStateful();
+
+/////////////////////////////////////////////////////////////////////
+//  Unit tests related to node merge optiimization
+/////////////////////////////////////////////////////////////////////
+
+TEST_F(MklLayoutPassTest, Basic) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Zeta);D(Zeta)|"
+            "A->C;A->D;B->C:1;B->D:1");
+}
+
+// Test set 1: Conv2D + AddBias
+
+// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Zeta(E,Y)
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive) {
+  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);E(_MklConv2DWithBias);Y(Input);Z(Zeta)|A->E;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->E:1;D->E:2;DMT/_0->E:3;DMT/_1->E:4;"
+            "DMT/_2->E:5;E->Z;Y->Z:1");
+}
+
+// Graph contains only Conv2D, no AddBias.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_NoAddBias) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);DMT/_0(Const);DMT/_1(Const)|"
+            "A->C;A:control->DMT/_0:control;A:control->DMT/_1:control;B->C:1;"
+            "DMT/_0->C:2;DMT/_1->C:3");
+}
+
+// Conv2D output does not go to BiasAdd.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['D', 'E'] }");  // Output of _MklConv2D does not go to BiasAdd.
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);D(Input);DMT/_0(Const);"
+            "DMT/_1(Const);E(Input);F(BiasAdd)|A->C;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;D->F;DMT/_0->C:2;DMT/_1->C:3;"
+            "E->F:1");
+}
+
+// Conv2D has two outgoing edges: BiasAdd and some other dummy node (Zeta).
+// Merge should not be done in such case.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['D', 'E'] }"  // Conv2D has two outputs.
+                              // No merge should happen.
+      "node { name: 'G' op: 'Zeta'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['C', 'E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);D(Input);DMT/_0(Const);"
+            "DMT/_1(Const);E(Input);F(BiasAdd);G(Zeta)|A->C;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;B->C:1;C->G;"
+            "D->F;DMT/_0->C:2;DMT/_1->C:3;E->F:1;E->G:1");
+}
+
+// data_format attribute value mismatch. Merge should not be done
+// in such case.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_AttrMismatch) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHCW' } }"
+      " input: ['C', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);D(Input);DMT/_0(Const);"
+            "DMT/_1(Const);E(BiasAdd)|A->C;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;C->E;D->E:1;DMT/_0->C:2;"
+            "DMT/_1->C:3");
+}
+
+// Test set 2: BiasAddGrad + Conv2DBackpropFilter fusion tests
+
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C'] }"
+      "node { name: 'E' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);"
+            "D(_MklConv2DBackpropFilterWithBias);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const)|A->D;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;C->D:2;"
+            "DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// BiasAddGrad fusion in the presence of BackpropFilter. But nodes do not match
+// criteria for rewrite. So rewrite should not happen. 3rd input of
+// Conv2DBackpropFilter is different than input to BiasAddGrad.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Negative1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C'] }"
+      "node { name: 'E' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['A'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);"
+            "D(_MklConv2DBackpropFilter);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);E(BiasAddGrad)|A->D;A->E;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;C->D:2;"
+            "DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// BiasAddGrad fusion, but nodes do not match criteria for fusion.
+// Different input formats.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Negative2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C'] }"
+      "node { name: 'E' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHWC' } }"
+      " input: ['A'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);"
+            "D(_MklConv2DBackpropFilter);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);E(BiasAddGrad)|A->D;A->E;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;C->D:2;"
+            "DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// BiasAddGrad fusion in the presence of BackpropFilter only. Fusion is done
+// before node rewrite. Check this ordering.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Negative3) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'O' op: '_MklInput'}"
+      "node { name: 'D' op: '_MklConv2DWithBias'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
+      "node { name: 'E' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['D', 'A']}"
+      "node { name: 'F' op: 'Int32Input'}"
+      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['E', 'F', 'A', 'M', 'N', 'O'] }"
+      "node { name: 'H' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
+            "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);H(BiasAddGrad);"
+            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D;A->E:1;A->G:2;B->D:1;"
+            "C->D:2;D->E;E->G;E->H;F->G:1;M->D:3;M->G:3;N->D:4;N->G:4;O->D:5;"
+            "O->G:5");
+}
+
+// C=Conv2D(A,B); E=BiasAdd(C,D); Y=Zeta(E,X);
+// G=Conv2DBackpropInput(F,B,E)
+// This is a case of node rewrite followed by node merge followed by connecting
+// filter output of Conv2DWithBias to filter input of Conv2DBackpropInput.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_ConvBpropInput_FilterFwd) {
+  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'X' op: 'Input'}"
+      "node { name: 'Y' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'X']}"
+      "node { name: 'F' op: 'Int32Input'}"
+      "node { name: 'G' op: 'Conv2DBackpropInput'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['F', 'B', 'E']}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['G', 'X']}");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);E(_MklConv2DWithBias);F(Int32Input);"
+            "G(_MklConv2DBackpropInput);X(Input);Y(Zeta);Z(Zeta)|"
+            "A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->E:1;D->E:2;DMT/_0->E:3;"
+            "DMT/_1->E:4;DMT/_2->E:5;DMT/_3->G:3;E->G:2;E->Y;E:1->G:1;E:2->G:5;"
+            "E:3->G:4;F->G;F:control->DMT/_3:control;G->Z;X->Y:1;X->Z:1");
+}
+
+/////////////////////////////////////////////////////////////////////
+//  Unit tests related to rewriting node to Mkl node
+/////////////////////////////////////////////////////////////////////
+
+// Single Conv2D Op; No Mkl layer on the input and on the output.
+// We will generate dummy Mkl tensor as 2nd input of Conv2D.
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Basic) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);D(Zeta);DMT/_0(Const);"
+            "DMT/_1(Const)|A->C;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;B->D;C->D:1;DMT/_0->C:2;"
+            "DMT/_1->C:3");
+}
+
+// 2 Conv2D Ops in sequence. Both should get transformed and 1st Conv2D will
+// have 2 outputs, both of which will be inputs to next Conv2D.
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Positive1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);D(_MklConv2D);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->C;A->D;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->C:1;C->D:1;C->E;"
+            "C:2->D:3;D->E:1;DMT/_0->C:2;DMT/_1->C:3;DMT/_2->D:2");
+}
+
+// Conv2D with INT32 which is not supported by Mkl
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Negative_UnsupportedType) {
+  InitGraph(
+      "node { name: 'A' op: 'HalfInput'}"
+      "node { name: 'B' op: 'HalfInput'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_HALF } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_HALF } }"
+      " input: ['B', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(HalfInput);B(HalfInput);C(Conv2D);D(Zeta)|"
+            "A->C;B->C:1;B->D;C->D:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D(_MklConv2DBackpropFilter);"
+            "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|"
+            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
+            "DMT/_1->D:4;DMT/_2->D:5");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradInput_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropInput'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['B', 'A', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D(_MklConv2DBackpropInput);"
+            "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|"
+            "A->D:1;A->E;B->D;B:control->DMT/_0:control;"
+            "B:control->DMT/_1:control;B:control->DMT/_2:control;C->D:2;"
+            "D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// Check that we never rewrite BiasAddGrad.
+TEST_F(MklLayoutPassTest, NodeRewrite_BiasAddGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Polygamma'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['C', 'A']}"
+      "node { name: 'E' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Polygamma);D(Zeta);E(BiasAddGrad)|"
+            "A->C;A->D:1;B->C:1;C->D;D->E");
+}
+
+// Check that we never rewrite BiasAddGrad.
+TEST_F(MklLayoutPassTest, NodeRewrite_BiasAddGrad_Positive1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'MatMul'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'transpose_a'      value { b: false } }"
+      " attr { key: 'transpose_b'      value { b: false } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['C', 'A']}"
+      "node { name: 'E' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(MatMul);D(Zeta);E(BiasAddGrad)|"
+            "A->C;A->D:1;B->C:1;C->D;D->E");
+}
+
+// Check that we never rewrite BiasAddGrad.
+TEST_F(MklLayoutPassTest, NodeRewrite_BiasAddGrad_Positive2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'C' op: '_MklConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'M', 'N']}"
+      "node { name: 'D' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['C', 'A']}"
+      "node { name: 'E' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);D(Zeta);E(BiasAddGrad);"
+            "M(_MklInput);N(_MklInput)|A->C;A->D:1;B->C:1;C->D;D->E;"
+            "M->C:2;N->C:3");
+}
+
+// Concat Op test: Concat with no Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Basic) {
+  InitGraph(
+      "node { name: 'A' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'B' op: 'InputList'"
+      " attr { key: 'N'                value { i: 2 } }}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Concat'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['A', 'B:0', 'B:1']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;"
+            "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// Concat with 2 Mkl layers feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_Mkl) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'F' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['C', 'D']}"
+      "node { name: 'G' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'H' op: 'Concat'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['G', 'E', 'F']}"
+      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'H'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);"
+            "F(_MklConv2D);G(Const);H(_MklConcat);I(Zeta)|A->E;A->I;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+            "B->E:1;C->F;C:control->DMT/_0:control;C:control->DMT/_1:control;"
+            "D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;"
+            "DMT/_4->H:3;E->H:1;E:2->H:4;F->H:2;F:2->H:5;G->H;"
+            "G:control->DMT/_4:control;H->I:1");
+}
+
+// Concat with 1 Mkl and 1 non-Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_MixedMkl) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D']}"
+      "node { name: 'G' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'H' op: 'Concat'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['G', 'E', 'F']}"
+      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'H'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Zeta);G(Const);"
+            "H(_MklConcat);I(Zeta)|A->E;A->I;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;"
+            "DMT/_1->E:3;DMT/_2->H:3;DMT/_3->H:5;E->H:1;E:2->H:4;F->H:2;"
+            "G->H;G:control->DMT/_2:control;G:control->DMT/_3:control;H->I:1");
+}
+
+// ConcatV2 Op test: ConcatV2 with no Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Basic) {
+  InitGraph(
+      "node { name: 'A' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'B' op: 'InputList'"
+      " attr { key: 'N'                value { i: 2 } }}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'ConcatV2'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tidx'             value { type: DT_INT32 } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['B:0', 'B:1', 'A']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Const);B(InputList);C(Input);D(_MklConcatV2);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D:2;B->D;B:1->D:1;"
+            "B:control->DMT/_0:control;B:control->DMT/_1:control;"
+            "B:control->DMT/_2:control;C->E;D->E:1;DMT/_0->D:3;"
+            "DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// ConcatV2 with 2 Mkl layers feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_Mkl) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'F' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['C', 'D']}"
+      "node { name: 'G' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'H' op: 'ConcatV2'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tidx'             value { type: DT_INT32 } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['E', 'F', 'G']}"
+      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'H'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);"
+            "F(_MklConv2D);G(Const);H(_MklConcatV2);I(Zeta)|A->E;A->I;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;B->E:1;C->F;"
+            "C:control->DMT/_0:control;C:control->DMT/_1:control;"
+            "D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;"
+            "DMT/_4->H:5;E->H;E:2->H:3;E:control->DMT/_4:control;F->H:1;"
+            "F:2->H:4;G->H:2;H->I:1");
+}
+
+// ConcatV2 with 1 Mkl and 1 non-Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_MixedMkl) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D']}"
+      "node { name: 'G' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'H' op: 'ConcatV2'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tidx'             value { type: DT_INT32 } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['E', 'F', 'G']}"
+      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'H'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Zeta);G(Const);"
+            "H(_MklConcatV2);I(Zeta)|A->E;A->I;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;"
+            "DMT/_1->E:3;DMT/_2->H:4;DMT/_3->H:5;E->H;E:2->H:3;"
+            "E:control->DMT/_2:control;E:control->DMT/_3:control;F->H:1;"
+            "G->H:2;H->I:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Relu_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Relu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklRelu);C(Zeta);DMT/_0(Const)|A->B;A->C;"
+            "A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'ReluGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklReluGrad);D(Zeta);DMT/_0(Const);"
+            "DMT/_1(Const)|A->C;A->D;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;C->D:1;DMT/_0->C:2;DMT/_1->C:3");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_ReluReluGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Relu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'ReluGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklRelu);C(_MklReluGrad);D(Zeta);DMT/_0(Const);"
+            "DMT/_1(Const)|A->B;A->C;A->D;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;"
+            "DMT/_1->C:2");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'AvgPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklAvgPool);C(Zeta);DMT/_0(Const)|A->B;A->C;"
+            "A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_AvgPoolGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Int32Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'AvgPoolGrad' "
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Int32Input);B(Input);C(_MklAvgPoolGrad);D(Zeta);DMT/_0(Const);"
+            "DMT/_1(Const)|A->C;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;B->D;C->D:1;DMT/_0->C:2;"
+            "DMT/_1->C:3");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_AvgPoolAvgPoolGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'I' op: 'Int32Input'}"
+      "node { name: 'B' op: 'AvgPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'AvgPoolGrad' "
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['I', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklAvgPool);C(_MklAvgPoolGrad);D(Zeta);DMT/_0(Const);"
+            "DMT/_1(Const);I(Int32Input)|A->B;A->D;A:control->DMT/_0:control;"
+            "B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;DMT/_1->C:2;I->C;"
+            "I:control->DMT/_1:control");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNormGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: 'FusedBatchNormGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'epsilon'      value { f: 0.0001 } }"
+      " attr { key: 'is_training'  value { b: true } }"
+      " input: ['A', 'B', 'C', 'D', 'E'] }"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'F'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Input);"
+            "F(_MklFusedBatchNormGrad);G(Zeta)|A->F;A->G;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+            "A:control->DMT/_4:control;B->F:1;C->F:2;D->F:3;"
+            "DMT/_0->F:5;DMT/_1->F:6;DMT/_2->F:7;DMT/_3->F:8;DMT/_4->F:9;"
+            "E->F:4;F->G:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: 'FusedBatchNorm'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'epsilon'      value { f: 0.0001 } }"
+      " attr { key: 'is_training'  value { b: true } }"
+      " input: ['A', 'B', 'C', 'D', 'E'] }"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'F'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Input);"
+            "F(_MklFusedBatchNorm);G(Zeta)|A->F;A->G;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+            "A:control->DMT/_4:control;B->F:1;C->F:2;D->F:3;"
+            "DMT/_0->F:5;DMT/_1->F:6;DMT/_2->F:7;DMT/_3->F:8;DMT/_4->F:9;"
+            "E->F:4;F->G:1");
+}
+
+/////////////////////////////////////////////////////////////////////
+//  Unit tests related to rewriting node for workspace edges
+/////////////////////////////////////////////////////////////////////
+
+/* Test LRN->MaxPool->MaxPoolGrad->LRNGrad replacement by workspace nodes. */
+TEST_F(MklLayoutPassTest, MaxPoolLRN_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'LRN'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['B'] }"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'MaxPoolGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['B', 'C', 'D'] }"
+      "node { name: 'F' op: 'Input'}"
+      "node { name: 'G' op: 'LRNGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['E', 'F', 'B'] }"
+      "node { name: 'H' op: 'Input'}"
+      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['H', 'G'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+      "A(Input);B(_MklLRN);C(_MklMaxPool);D(Input);DMT/_0(Const);DMT/_1(Const);"
+      "DMT/_2(Const);E(_MklMaxPoolGrad);F(Input);G(_MklLRNGrad);H(Input);"
+      "I(Zeta)|A->B;A:control->DMT/_0:control;B->C;B->E;B->G:2;B:1->G:3;"
+      "B:2->C:1;B:2->E:4;B:2->G:6;B:3->G:7;B:control->DMT/_1:control;C->E:1;"
+      "C:1->E:3;C:2->E:5;C:3->E:7;D->E:2;DMT/_0->B:1;DMT/_1->E:6;DMT/_2->G:5;"
+      "E->G;E:1->G:4;E:control->DMT/_2:control;F->G:1;G->I:1;H->I");
+}
+
+/* Test LRN->LRNGrad replacement by workspace nodes. */
+TEST_F(MklLayoutPassTest, LRN_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'LRN'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'LRNGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['C', 'D', 'B'] }"
+      "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklLRN);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);E(_MklLRNGrad);F(Zeta)|"
+            "A->B;A:control->DMT/_0:control;B->E:2;B:1->E:3;B:2->E:6;B:3->E:7;"
+            "C->E;C->F;C:control->DMT/_1:control;C:control->DMT/_2:control;"
+            "D->E:1;DMT/_0->B:1;DMT/_1->E:4;DMT/_2->E:5;E->F:1");
+}
+
+/* Test LRN->LRNGrad replacement when only one of them is present. */
+TEST_F(MklLayoutPassTest, LRN_Negative1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'LRN'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklLRN);C(Zeta);DMT/_0(Const)|"
+            "A->B;A->C;A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
+}
+
+/* Test LRN->LRNGrad replacement when only one of them is present. */
+TEST_F(MklLayoutPassTest, LRN_Negative2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'LRNGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['A', 'B', 'C'] }"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklLRNGrad);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Zeta)|"
+            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+            "A:control->DMT/_4:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
+            "DMT/_1->D:7;DMT/_2->D:4;DMT/_3->D:5;DMT/_4->D:6");
+}
+
+/* Test LRN->LRNGrad negative case, where single LRN feeds
+   2 LRNGrad nodes at different slots. */
+TEST_F(MklLayoutPassTest, LRN_Negative3) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'LRN'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'LRNGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['C', 'D', 'B'] }"
+      "node { name: 'F' op: 'LRNGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['C', 'B', 'D'] }"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['E', 'F'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklLRN);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);DMT/_5(Const);"
+            "DMT/_6(Const);E(_MklLRNGrad);F(_MklLRNGrad);G(Zeta)|A->B;"
+            "A:control->DMT/_0:control;B->E:2;"
+            "B->F:1;B:1->E:3;B:2->E:6;B:2->F:5;B:3->E:7;C->E;C->F;"
+            "C:control->DMT/_1:control;C:control->DMT/_2:control;"
+            "C:control->DMT/_3:control;C:control->DMT/_4:control;"
+            "C:control->DMT/_5:control;C:control->DMT/_6:control;"
+            "D->E:1;D->F:2;DMT/_0->B:1;DMT/_1->F:3;DMT/_2->F:7;DMT/_3->F:4;"
+            "DMT/_4->F:6;DMT/_5->E:4;DMT/_6->E:5;E->G;F->G:1");
+}
+
+/* Test MaxPool->MaxPoolGrad replacement by workspace+rewrite nodes. */
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'MaxPoolGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['C', 'B', 'D'] }"
+      "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklMaxPool);C(Input);D(Input);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);E(_MklMaxPoolGrad);F(Zeta)|"
+            "A->B;A:control->DMT/_0:control;B->E:1;B:1->E:3;B:2->E:5;B:3->E:7;"
+            "C->E;C->F;C:control->DMT/_1:control;C:control->DMT/_2:control;"
+            "D->E:2;DMT/_0->B:1;DMT/_1->E:4;DMT/_2->E:6;E->F:1");
+}
+
+// Test MaxPool>MaxPoolGrad replacement when only one of them is present.
+// In this case, we will rewrite MaxPool node but workspace edges will not
+// be present.
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklMaxPool);C(Zeta);DMT/_0(Const)|"
+            "A->B;A->C;A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
+}
+
+// Test MaxPoolGrad replacement when only one of them is present.
+// In this case, we will rewrite MaxPoolGrad and for workspace tensor and
+// its Mkl part, we will generate dummy tensor.
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'MaxPoolGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['A', 'B', 'C'] }"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklMaxPoolGrad);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Zeta)|"
+            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+            "A:control->DMT/_4:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
+            "DMT/_1->D:7;DMT/_2->D:4;DMT/_3->D:5;DMT/_4->D:6");
+}
+
+// Test MaxPool handling for batch-wise pooling (NCHW)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative3) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 2, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for batch-wise pooling (NCHW)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative4) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 2, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for depth-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative5) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:2, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for depth-wise pooling (NCHW)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative6) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:2, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for batch-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative7) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 2, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for batch-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative8) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 2, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for depth-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative9) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:2} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for depth-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative10) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:2} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+/////////////////////////////////////////////////////////////////////
+
+// Single Conv2D Op on GPU device
+// No rewrite should happen
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Conv2D);D(Zeta)|A->C;B->C:1;B->D;C->D:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'O' op: '_MklInput'}"
+      "node { name: 'D' op: '_MklConv2DWithBias'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
+      "node { name: 'E' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['D', 'A']}"
+      "node { name: 'F' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['E'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
+            "E(Zeta);F(BiasAddGrad);M(_MklInput);N(_MklInput);"
+            "O(_MklInput)|A->D;A->E:1;B->D:1;C->D:2;D->E;E->F;"
+            "M->D:3;N->D:4;O->D:5");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D(Conv2DBackpropFilter);E(Zeta)|"
+            "A->D;A->E;B->D:1;C->D:2;D->E:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Relu_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Relu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Relu);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'ReluGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(ReluGrad);D(Zeta)|A->C;A->D;B->C:1;C->D:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_MaxPool_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'AvgPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(AvgPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Concat Op test: Concat with no Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_Concat_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'B' op: 'InputList'"
+      " attr { key: 'N'                value { i: 2 } }}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Concat'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['A', 'B:0', 'B:1']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Const);B(InputList);C(Input);D(Concat);E(Zeta)|A->D;"
+            "B->D:1;B:1->D:2;C->E;D->E:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'B' op: 'InputList'"
+      " attr { key: 'N'                value { i: 2 } }}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'ConcatV2'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tidx'             value { type: DT_INT32 } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['B:0', 'B:1', 'A']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Const);B(InputList);C(Input);D(ConcatV2);E(Zeta)|"
+            "A->D:2;B->D;B:1->D:1;C->E;D->E:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: 'FusedBatchNorm'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'epsilon'      value { f: 0.0001 } }"
+      " attr { key: 'is_training'  value { b: true } }"
+      " input: ['A', 'B', 'C', 'D', 'E'] }"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'F'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);E(Input);"
+            "F(FusedBatchNorm);G(Zeta)|A->F;A->G;B->F:1;C->F:2;D->F:3;"
+            "E->F:4;F->G:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) {
+  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'C' op: '_MklConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'M', 'N']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);D(Input);E(BiasAdd);"
+            "M(_MklInput);N(_MklInput);Y(Input);Z(Zeta)|A->C;"
+            "B->C:1;C->E;D->E:1;E->Z;M->C:2;N->C:3;Y->Z:1");
+}
+
+/////////////////////////////////////////////////////////////////////
+
+static void BM_MklLayoutRewritePass(int iters, int op_nodes) {
+  testing::StopTiming();
+  string s;
+  for (int in = 0; in < 10; in++) {
+    s += strings::Printf("node { name: 'in%04d' op: 'Input'}", in);
+  }
+  random::PhiloxRandom philox(301, 17);
+  random::SimplePhilox rnd(&philox);
+  for (int op = 0; op < op_nodes; op++) {
+    s += strings::Printf(
+        "node { name: 'op%04d' op: 'Zeta' attr { key: 'T' value { "
+        "type: DT_FLOAT } } input: ['in%04d', 'in%04d' ] }",
+        op, rnd.Uniform(10), rnd.Uniform(10));
+  }
+
+  bool first = true;
+  while (iters > 0) {
+    Graph* graph = new Graph(OpRegistry::Global());
+    InitGraph(s, graph);
+    int N = graph->num_node_ids();
+    if (first) {
+      testing::SetLabel(strings::StrCat("Per graph node.  Nodes: ", N));
+      first = false;
+    }
+    {
+      testing::StartTiming();
+      std::unique_ptr<Graph> ug(graph);
+      RunMklLayoutRewritePass(&ug);
+      testing::StopTiming();
+    }
+    iters -= N;  // Our benchmark units are individual graph nodes,
+                 // not whole graphs
+    // delete graph;
+  }
+}
+BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000);
+
+}  // namespace
+
+#endif  // INTEL_MKL_DNN
+
 }  // namespace tensorflow
 
 #endif /* INTEL_MKL */
diff --git a/tensorflow/core/graph/quantize_training.cc b/tensorflow/core/graph/quantize_training.cc
index d9cb55f4489b67a001f30628c5df8cfb80997063..cb0fc8a1547a8498aa0bd089a2c9395119de2789 100644
--- a/tensorflow/core/graph/quantize_training.cc
+++ b/tensorflow/core/graph/quantize_training.cc
@@ -42,7 +42,7 @@ const float kEMADecay = 0.999;
 
 // Node types to rewrite. Insert quantize_and_dequantize op for their inputs.
 const auto* nodes_to_rewrite =
-    new std::unordered_set<string, StringPiece::Hasher>{"MatMul", "Conv2D"};
+    new std::unordered_set<string, StringPieceHasher>{"MatMul", "Conv2D"};
 
 // Contains necessary parameters to convert an edge.
 struct EdgeToConvert {
@@ -563,7 +563,7 @@ Status ProcessTargetEdges(Graph* graph, const string& quant_op_type,
                           const std::vector<EdgeToConvert>& target_edges) {
   // Remember previously converted ops to avoid duplicated conversion on the
   // same input.
-  std::unordered_map<string, Node*, StringPiece::Hasher> name_index;
+  std::unordered_map<string, Node*, StringPieceHasher> name_index;
   std::vector<Node*> added_variables;
   for (const EdgeToConvert edge : target_edges) {
     Node* convert_node;
diff --git a/tensorflow/core/graph/subgraph.h b/tensorflow/core/graph/subgraph.h
index 8ccc27914bce325469b0e73deacf6a3c44a55246..3c1f8870f57f6d585f795cc92c320927e1a29315 100644
--- a/tensorflow/core/graph/subgraph.h
+++ b/tensorflow/core/graph/subgraph.h
@@ -71,7 +71,7 @@ Status RewriteGraphForExecution(
     const DeviceAttributes& device_info, bool use_function_convention,
     RewriteGraphMetadata* out_metadata);
 
-typedef std::unordered_map<StringPiece, Node*, StringPiece::Hasher> NameIndex;
+typedef std::unordered_map<StringPiece, Node*, StringPieceHasher> NameIndex;
 
 // Augment "*g" by adding special "fetch" nodes that connect to the
 // tensor outputs specified in "fetch_outputs" to retrieve the output
diff --git a/tensorflow/core/grappler/BUILD b/tensorflow/core/grappler/BUILD
index 7b18e79c8d3342b8aaeabaf69371dbe7a5f54abd..2ca9b720ee127b892c06230efb3517f5afabea45 100644
--- a/tensorflow/core/grappler/BUILD
+++ b/tensorflow/core/grappler/BUILD
@@ -21,6 +21,9 @@ cc_library(
     hdrs = ["op_types.h"],
     visibility = ["//visibility:public"],
     deps = [
+        ":utils",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
     ],
 )
@@ -43,6 +46,7 @@ tf_cc_test(
     srcs = ["utils_test.cc"],
     deps = [
         ":utils",
+        "//tensorflow/cc:cc_ops",
         "//tensorflow/core:all_kernels",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
@@ -155,6 +159,7 @@ tf_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
     ],
 )
diff --git a/tensorflow/core/grappler/clusters/cluster.cc b/tensorflow/core/grappler/clusters/cluster.cc
index e2db47b758f588f0a356bde1c9eacc0d5ff7f335..01a618ed7775eee64ce40e283394c09622353157 100644
--- a/tensorflow/core/grappler/clusters/cluster.cc
+++ b/tensorflow/core/grappler/clusters/cluster.cc
@@ -35,6 +35,10 @@ void Cluster::SetNumWarmupSteps(int num_steps) {
       num_steps);
 }
 
+int Cluster::NumWarmupSteps() const {
+  return options_.config.graph_options().build_cost_model_after();
+}
+
 void Cluster::DisableDetailedStats(bool disable) {
   if (disable) {
     options_.config.mutable_graph_options()->set_build_cost_model(0);
diff --git a/tensorflow/core/grappler/clusters/cluster.h b/tensorflow/core/grappler/clusters/cluster.h
index 616ab6ffdcc1e62c4c56f6826a8a5852d51b00d7..d7af50f7dc7e21db189118d84f3181a4e99563b8 100644
--- a/tensorflow/core/grappler/clusters/cluster.h
+++ b/tensorflow/core/grappler/clusters/cluster.h
@@ -64,6 +64,9 @@ class Cluster {
   // before Provision().
   void SetNumWarmupSteps(int num_steps);
 
+  // Returns the number of warmup steps.
+  int NumWarmupSteps() const;
+
   // Disable the collection of detailed statistics. Must be called
   // before Provision().
   void DisableDetailedStats(bool disable);
diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc
index 1a6fad41828c1cc3eaa0d78d12d984dcf5b59692..b39d8c752669f84e763dd13f269f5bd30b7ee3f2 100644
--- a/tensorflow/core/grappler/clusters/single_machine.cc
+++ b/tensorflow/core/grappler/clusters/single_machine.cc
@@ -31,20 +31,13 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-static std::atomic<bool> already_created(false);
+static std::atomic<bool> already_provisioned(false);
 
 SingleMachine::SingleMachine(int timeout_s, int num_cpu_cores, int num_gpus)
     : Cluster(timeout_s),
       num_gpus_(num_gpus),
       expected_init_time_s_(0),
       closing_(false) {
-  // This is really ugly: to avoid leaking variables, we need to reset the tf
-  // session every time we're done processing a grappler item. However,
-  // variables are global, and therefore we can't have more than 1 session alive
-  // at a time. This check detects when more that one cluster is created.
-  CHECK(!already_created);
-  already_created = true;
-
   VLOG(1) << "Number of CPU cores: " << num_cpu_cores
           << " Number of GPUs: " << num_gpus;
   thread_pool_.reset(new thread::ThreadPool(
@@ -71,17 +64,20 @@ SingleMachine::~SingleMachine() {
   // Reset the thread-pool so that there are no outstanding Session::Run(...)s
   // when we delete the session.
   thread_pool_.reset();
-
-  CHECK(already_created);
-  already_created = false;
 }
 
 Status SingleMachine::Provision() {
-  Status status = ResetSession();
-  if (!status.ok()) {
-    return status;
+  // This is really ugly: to avoid leaking variables, we need to reset the tf
+  // session every time we're done processing a grappler item. However,
+  // variables are global, and therefore we can't have more than 1 session alive
+  // at a time. This check detects when more that one cluster is provisioned.
+  if (already_provisioned) {
+    return errors::Unavailable(
+        "Can't provision more than one single cluster at a time");
   }
 
+  TF_RETURN_IF_ERROR(ResetSession());
+
   DeviceProperties attr = GetLocalCPUInfo();
   devices_["/job:localhost/replica:0/task:0/cpu:0"] = GetLocalCPUInfo();
 
@@ -92,6 +88,7 @@ Status SingleMachine::Provision() {
     VLOG(1) << "Adding GPU device " << device_name;
     devices_[device_name] = GetLocalGPUInfo(i);
   }
+  already_provisioned = true;
   return Status::OK();
 }
 
@@ -108,27 +105,12 @@ Status SingleMachine::Initialize(const GrapplerItem& item) {
 }
 
 Status SingleMachine::Shutdown() {
-  TF_RETURN_IF_ERROR(CloseSession(true /*use_timeout*/));
+  TF_RETURN_IF_ERROR(ShutdownSession());
+
+  mutex_lock l(this->last_graph_mu_);
+  last_graph_ = nullptr;
+  already_provisioned = false;
 
-  // Delete the threadpool: this ensures that all the pending closures complete
-  // before we return. Note that if TF deadlocked on us, the closures will
-  // never complete, and the call to thread_pool_.reset() will never return:
-  // therefore we need to delete the threadpool with the background thread.
-  // That thread itself will also never complete, so the user should
-  // abort the process to avoid leaking too many resources.
-  auto n = std::make_shared<Notification>();
-  Env::Default()->SchedClosure([this, n]() {
-    thread_pool_.reset();
-    n->Notify();
-  });
-  int64 timeout_us = 1000000ll * timeout_s_;
-  const bool notified = WaitForNotificationWithTimeout(n.get(), timeout_us);
-  if (!notified) {
-    // Let the caller know that we can't shutdown the session properly since
-    // there are calls to Session::Run() still running.
-    return errors::Unavailable("The session is still running graphs after ",
-                               timeout_s_, " seconds");
-  }
   return Status::OK();
 }
 
@@ -230,7 +212,7 @@ Status SingleMachine::RunWithTimeout(
 }
 
 Status SingleMachine::CloseSession(bool use_timeout) {
-  if (!session_) {
+  if (!session_ || !thread_pool_) {
     return Status::OK();
   }
 
@@ -274,12 +256,38 @@ Status SingleMachine::CloseSession(bool use_timeout) {
   return Status::OK();
 }
 
+Status SingleMachine::ShutdownSession() {
+  TF_RETURN_IF_ERROR(CloseSession(true /*use_timeout*/));
+
+  // Delete the threadpool: this ensures that all the pending closures complete
+  // before we return. Note that if TF deadlocked on us, the closures will
+  // never complete, and the call to thread_pool_.reset() will never return:
+  // therefore we need to delete the threadpool with the background thread.
+  // That thread itself will also never complete, so the user should
+  // abort the process to avoid leaking too many resources.
+  auto n = std::make_shared<Notification>();
+  Env::Default()->SchedClosure([this, n]() {
+    thread_pool_.reset();
+    n->Notify();
+  });
+  int64 timeout_us = 1000000ll * timeout_s_;
+  const bool notified = WaitForNotificationWithTimeout(n.get(), timeout_us);
+  if (!notified) {
+    // Let the caller know that we can't shutdown the session properly since
+    // there are calls to Session::Run() still running.
+    return errors::Unavailable("The session is still running graphs after ",
+                               timeout_s_, " seconds");
+  }
+
+  return Status::OK();
+}
+
 Status SingleMachine::ResetSession() {
   if (session_) {
     LOG(INFO) << "Cleaning up previous session";
 
     // Make sure the session is properly closed
-    TF_RETURN_IF_ERROR(Shutdown());
+    TF_RETURN_IF_ERROR(ShutdownSession());
 
     // Destroying the object deletes all its variables as well. This is only
     // true for DirectSession.
diff --git a/tensorflow/core/grappler/clusters/single_machine.h b/tensorflow/core/grappler/clusters/single_machine.h
index d3efbe3c614580d0502874412697cd5719e28be5..be005a95091de5bca6e193d571dfd2f64dcf095c 100644
--- a/tensorflow/core/grappler/clusters/single_machine.h
+++ b/tensorflow/core/grappler/clusters/single_machine.h
@@ -49,6 +49,7 @@ class SingleMachine : public Cluster {
                         RunMetadata* run_metadata, int64 timeout_s);
   Status ResetSession();
   Status CloseSession(bool use_timeout);
+  Status ShutdownSession();
   void MergeCosts(CostGraphDef* graph_costs, const CostGraphDef& init_costs,
                   const CostGraphDef& queue_costs);
 
diff --git a/tensorflow/core/grappler/clusters/single_machine_test.cc b/tensorflow/core/grappler/clusters/single_machine_test.cc
index f6c325c2a4bb1877f07fbfd034755ff501344f48..df936efad104dd92595bcc7d325e964347b86cb8 100644
--- a/tensorflow/core/grappler/clusters/single_machine_test.cc
+++ b/tensorflow/core/grappler/clusters/single_machine_test.cc
@@ -48,6 +48,9 @@ class SingleMachineTest : public ::testing::Test {
   }
 
   void TearDown() override {
+    if (cluster_) {
+      TF_CHECK_OK(cluster_->Shutdown());
+    }
     cluster_.reset();
   }
 
@@ -178,8 +181,7 @@ TEST_F(SingleMachineTest, GraphOptimizations) {
   // With optimizations turned on, some nodes could have been optimized away,
   // and the cost model could be partial. Restart the cluster with optimizations
   // disabled and make sure we have all the information we're looking for.
-  cluster_.reset();
-  cluster_.reset(new SingleMachine(5, 3, 0));
+  TF_CHECK_OK(cluster_->Shutdown());
   cluster_->DisableOptimizer(true);
   TF_CHECK_OK(cluster_->Provision());
 
@@ -324,7 +326,7 @@ static void RunInfiniteTFLoop() {
 
 TEST_F(SingleMachineTest, InfiniteLoops) {
   // The RunInfiniteTFLoop function creates its own cluster.
-  cluster_.reset();
+  TF_CHECK_OK(cluster_->Shutdown());
 
   EXPECT_EXIT(RunInfiniteTFLoop(), ::testing::ExitedWithCode(0), ".*");
 }
@@ -578,7 +580,8 @@ TEST_F(SingleMachineTest, ReleaseMemoryAfterDestruction) {
   EXPECT_EQ(device_memory.size(), 1);
   EXPECT_GT(device_memory.begin()->second.bytes_in_use, 0);
 
-  // Reset cluster_ would release all memory.
+  // Shutting down the cluster_ would release all memory.
+  TF_CHECK_OK(cluster_->Shutdown());
   cluster_.reset();
   std::unordered_map<string, AllocatorStats> device_memory_after;
   TF_CHECK_OK(GetDeviceMemoryStats(options, &device_memory_after));
diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index f02cb51038a1d34475d9c13b0ca14b7137c41f35..d6ce72639ca3293b057efe70661df4d71dfad437 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -50,6 +50,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
     ],
 )
@@ -132,8 +133,8 @@ tf_cuda_library(
     visibility = ["//visibility:public"],
     deps = [
         ":op_performance_data_cc",
-        "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_proto_parsing",
         "//tensorflow/core:protos_all_cc",
@@ -306,6 +307,7 @@ cc_library(
         ":virtual_placer",
         ":virtual_scheduler",
         "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
index d1f3e36aa8164c4a80537b8affc324503af5488b..1c2c1713834a11d0a7c85247e9a7e4cdf779c592 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
@@ -102,8 +102,14 @@ TEST_F(AnalyticalCostEstimatorTest, SimpleTest) {
   Costs summary;
   TF_ASSERT_OK(estimator.PredictCosts(item.graph, &cost_graph, &summary));
 
-  EXPECT_EQ(Costs::NanoSeconds(9156), summary.execution_time);
-  EXPECT_FALSE(summary.inaccurate);
+  EXPECT_EQ(Costs::NanoSeconds(9150), summary.execution_time);
+
+  // Make this estimate accurate:
+  // TODO(http://b/70031255): Accurate estimator for RandomUniform op needed
+  // TODO(http://b/70031363): Accurate estimator for Softmax needed
+  //
+  // Change to EXPECT_FALSE when the above TODOs are done:
+  EXPECT_TRUE(summary.inaccurate);
 }
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/costs/cost_estimator.h b/tensorflow/core/grappler/costs/cost_estimator.h
index cf9fa4fdaf947cba8c38d6eb3ca67d3a43f35d29..852e69737baa14e0d05de1fdcb6fc24a143f6a2d 100644
--- a/tensorflow/core/grappler/costs/cost_estimator.h
+++ b/tensorflow/core/grappler/costs/cost_estimator.h
@@ -40,6 +40,16 @@ struct Costs {
   // Builds a Costs structure with all zero values, rather than unknowns.
   static inline Costs ZeroCosts();
 
+  struct MilliSeconds : std::chrono::milliseconds {
+    MilliSeconds() : std::chrono::milliseconds(0) {}
+    MilliSeconds(double d) : std::chrono::milliseconds(static_cast<int64>(d)) {}
+    MilliSeconds(const std::chrono::milliseconds& d)
+        : std::chrono::milliseconds(d) {}
+    MilliSeconds& operator=(const std::chrono::milliseconds& d) {
+      std::chrono::milliseconds::operator=(d);
+      return *this;
+    }
+  };
   struct MicroSeconds : std::chrono::microseconds {
     MicroSeconds() : std::chrono::microseconds(0) {}
     MicroSeconds(double d) : std::chrono::microseconds(static_cast<int64>(d)) {}
@@ -49,6 +59,9 @@ struct Costs {
       std::chrono::microseconds::operator=(d);
       return *this;
     }
+    MilliSeconds asMilliSeconds() const {
+      return std::chrono::duration_cast<std::chrono::milliseconds>(*this);
+    }
   };
   struct NanoSeconds : std::chrono::nanoseconds {
     NanoSeconds() : std::chrono::nanoseconds(0) {}
@@ -60,9 +73,10 @@ struct Costs {
       return *this;
     }
     MicroSeconds asMicroSeconds() const {
-      std::chrono::microseconds us =
-          std::chrono::duration_cast<std::chrono::microseconds>(*this);
-      return MicroSeconds(us);
+      return std::chrono::duration_cast<std::chrono::microseconds>(*this);
+    }
+    MilliSeconds asMilliSeconds() const {
+      return std::chrono::duration_cast<std::chrono::milliseconds>(*this);
     }
   };
 
@@ -100,6 +114,10 @@ struct Costs {
   std::unordered_map<string, uint64> estimated_max_memory_per_device;
 };
 
+inline std::ostream& operator<<(std::ostream& os, const Costs::MilliSeconds d) {
+  os << d.count() << "ms";
+  return os;
+}
 inline std::ostream& operator<<(std::ostream& os, const Costs::MicroSeconds d) {
   os << d.count() << "us";
   return os;
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index fc6d02cf15dc6776520bae49c6dd57233248a581..0453ceb6d180de4ea9af86e676efde7716c0297c 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/grappler/costs/utils.h"
+#include "tensorflow/core/grappler/utils.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -249,106 +250,340 @@ typename DisjointSet<Handle>::Rep* DisjointSet<Handle>::Find(Handle value) {
   return root;
 }
 
-// If a Merge node has a NextIteration node as an input then that input will
-// try to forward an UnknownShape at graph construction time. However, the
-// Merge shape function will always propagate an UnknownShape if any of its
-// inputs are UnknownShapes. So we need to ignore the input from NextIteration
-// nodes to propagate any known shape from the Merge node.
-Status ShapeOfMergeNode(const Node* node, InferenceContext* c) {
-  ShapeHandle out = c->input(0);
-  if (!c->RankKnown(out)) {
-    out = c->UnknownShape();
-  } else {
-    int32 rank = c->Rank(out);
-    for (const Edge* e : node->in_edges()) {
-      if (e->src()->IsNextIteration() || e->dst_input() <= 0) {
-        continue;
+bool IsQueue(const Node& node) {
+  StringPiece type(node.type_string());
+  return type.ends_with("QueueV2");
+}
+
+// Returns true if the node is an Enter op AND its input is a Queue.
+bool IsEnterWithQueue(const Node& node) {
+  if (node.IsEnter()) {
+    const Node* in_node;
+    TF_CHECK_OK(node.input_node(0, &in_node));
+    return IsQueue(*in_node);
+  }
+  return false;
+}
+
+bool HasAnyUnknownDimensions(const TensorShapeProto& proto) {
+  if (proto.unknown_rank()) {
+    return true;
+  }
+  for (const auto& dim : proto.dim()) {
+    if (dim.size() < 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
+void VerboseLogUnknownDimensionSources(
+    const Graph& graph,
+    const std::map<string, std::vector<OpInfo::TensorProperties>>&
+        input_properties_map,
+    const std::map<string, std::vector<OpInfo::TensorProperties>>&
+        output_properties_map) {
+  if (!VLOG_IS_ON(2)) {
+    return;
+  }
+
+  VLOG(2) << "Nodes with known inputs, but with unknown output dimensions:";
+
+  // Find all nodes in the graph for which we
+  // do not have any unknown dimensions in their inputs, but
+  // we have some unknown dimensions in their outputs.
+  std::map<string, int> op_to_count;
+  for (const Node* const node : graph.nodes()) {
+    if (node->num_outputs() == 0) {
+      continue;
+    }
+
+    const auto& input_properties = input_properties_map.at(node->name());
+    const auto& output_properties = output_properties_map.at(node->name());
+
+    bool has_unknown_inputs = false;
+    for (int i = 0; i < node->num_inputs(); ++i) {
+      if (HasAnyUnknownDimensions(input_properties[i].shape())) {
+        has_unknown_inputs = true;
+        break;
       }
-      ShapeHandle input = c->input(e->dst_input());
-      if (!c->RankKnown(input) || c->Rank(input) != rank) {
-        out = c->UnknownShape();
+    }
+
+    if (has_unknown_inputs) {
+      continue;
+    }
+
+    for (int i = 0; i < node->num_outputs(); ++i) {
+      if (HasAnyUnknownDimensions(output_properties[i].shape())) {
+        string inputs = "input_shapes=[";
+        for (int i = 0; i < node->num_inputs(); ++i) {
+          inputs +=
+              PartialTensorShape::DebugString(input_properties[i].shape());
+        }
+        inputs += "]";
+
+        string outputs = "output_shapes=[";
+        for (int i = 0; i < node->num_outputs(); ++i) {
+          outputs +=
+              PartialTensorShape::DebugString(output_properties[i].shape());
+        }
+        outputs += "]";
+
+        VLOG(2) << "Node: " << node->name() << ", Op: " << node->def().op()
+                << ", " << inputs << ", " << outputs;
+
+        op_to_count[node->def().op()]++;
+
+        // don't log again for this node
         break;
       }
+    }
+  }
+  VLOG(2) << "Op types with known inputs, but with unknown output dimensions "
+          << "(format: <op_type> (<count>)):";
+  for (const auto& p : op_to_count) {
+    VLOG(2) << p.first << " (" << p.second << ")";
+  }
+}
+
+}  // namespace
 
+// Queue of nodes to process. Nodes can be enqueued in any order, but will be
+// dequeued in (roughly) topological order. Propagating shapes following a
+// topological ordering isn't required for correctness but helps speed things up
+// since it avoids processing the same node multiple times as its inputs
+// information is refined.
+class TopoQueue {
+ public:
+  void push(const Node* n) { queue_.insert(n); }
+  const Node* pop() {
+    CHECK(!empty());
+    auto it = queue_.begin();
+    const Node* n = *it;
+    queue_.erase(it);
+    return n;
+  }
+
+  bool empty() const { return queue_.empty(); }
+  std::size_t size() const { return queue_.size(); }
+
+ private:
+  // Graph nodes are created in (roughly) topological order. Therefore we can
+  // use their id to ensure they're sorted topologically.
+  struct CompareNodes {
+    bool operator()(const Node* lhs, const Node* rhs) const {
+      return lhs->id() > rhs->id();
+    }
+  };
+  std::set<const Node*, CompareNodes> queue_;
+};
+
+// Merge and relax symbolic shapes.
+// Each symbolic shape or dimension is represented by a handle. Unlike the TF
+// shape refiner which creates new handles every time it processes an unknown
+// shape/dimension, the symbolic shape refiner assigns a specific handle to each
+// unknown shape/dimension of a given node.
+class SymbolicShapeRefiner {
+ public:
+  explicit SymbolicShapeRefiner(ShapeRefiner* shape_refiner)
+      : shape_refiner_(shape_refiner) {}
+
+  InferenceContext* GetContext(const Node* node) {
+    return shape_refiner_->GetContext(node);
+  }
+  Status UpdateNode(const Node* node, bool relax, bool* refined) {
+    return shape_refiner_->UpdateNode(node, relax, refined);
+  }
+  Status SetUnknownShape(const Node* node, int output_port) {
+    shape_inference::ShapeHandle shape =
+        GetUnknownOutputShape(node, output_port);
+    InferenceContext* ctx = GetContext(node);
+    if (ctx == nullptr) {
+      return errors::InvalidArgument("Missing context");
+    }
+    ctx->set_output(output_port, shape);
+    return Status::OK();
+  }
+
+  struct ShapeId {
+    const Node* node;
+    int port_id;
+    bool operator==(const ShapeId& other) const {
+      return node == other.node && port_id == other.port_id;
+    }
+  };
+  struct HashShapeId {
+    std::size_t operator()(const ShapeId& shp) const {
+      return std::hash<const Node*>{}(shp.node) + shp.port_id;
+    }
+  };
+
+  struct DimId {
+    const Node* node;
+    int port_id;
+    int dim_index;
+    bool operator==(const DimId& other) const {
+      return node == other.node && port_id == other.port_id &&
+             dim_index == other.dim_index;
+    }
+  };
+
+  struct HashDimId {
+    std::size_t operator()(const DimId& dim) const {
+      return std::hash<const Node*>{}(dim.node) + dim.port_id + dim.dim_index;
+    }
+  };
+
+  // Compute the shape of the tensors outputed by node 'node' at output port
+  // 'port_index' as the intersection of shape1 and shape2.
+  ShapeHandle OutputAsIntersection(const Node* node, int port_index,
+                                   ShapeHandle shape1, ShapeHandle shape2) {
+    if (shape1.SameHandle(shape2)) {
+      return shape1;
+    }
+    InferenceContext* ctx = shape_refiner_->GetContext(node);
+    ShapeHandle merged = shape1;
+    if (!ctx->RankKnown(shape2) && !ctx->RankKnown(shape1)) {
+      // Return either one since they're expected to represent the same value.
+      return shape1;
+    } else if (!ctx->RankKnown(shape2) && ctx->RankKnown(shape1)) {
+      return shape1;
+    } else if (ctx->RankKnown(shape2) && !ctx->RankKnown(shape1)) {
+      return shape2;
+    } else {
+      const int rank = ctx->Rank(shape1);
+      if (ctx->Rank(shape2) != rank) {
+        // We detected an inconsistency, return an unknown shape. This can
+        // happen in the fanout of a merge node since during the initial
+        // propagation we optimistically assume that all the inputs to the merge
+        // node have the same shape.
+        return GetUnknownOutputShape(node, port_index);
+      }
       for (int d = 0; d < rank; ++d) {
-        if (c->Value(c->Dim(input, d)) != c->Value(c->Dim(out, d))) {
-          TF_RETURN_IF_ERROR(c->ReplaceDim(out, d, c->UnknownDim(), &out));
+        if (!ctx->Dim(shape1, d).SameHandle(ctx->Dim(shape2, d))) {
+          if (ctx->Value(ctx->Dim(shape1, d)) !=
+              ctx->Value(ctx->Dim(shape2, d))) {
+            DimensionHandle new_dim;
+            if (ctx->Value(ctx->Dim(shape1, d)) < 0) {
+              new_dim = ctx->Dim(shape2, d);
+            } else if (ctx->Value(ctx->Dim(shape2, d)) < 0) {
+              new_dim = ctx->Dim(shape1, d);
+            } else {
+              new_dim = GetUnknownOutputDim(node, port_index, d);
+            }
+            TF_CHECK_OK(ctx->ReplaceDim(merged, d, new_dim, &merged));
+          }
         }
       }
     }
+    return merged;
   }
-  c->set_output(0, out);
-  c->set_output(1, c->Scalar());
-  return Status::OK();
-}
 
-// Manually propagate the input shape for Enter nodes and update any Merge node
-// outputs.
-Status UpdateEnter(ShapeRefiner* shape_refiner, const Node* node, bool relax,
-                   std::queue<const Node*>* new_shapes) {
-  auto enter_ctx = shape_refiner->GetContext(node);
-  CHECK_NE(enter_ctx, nullptr);
-  for (int i = 0; i < enter_ctx->num_outputs(); i++) {
-    TF_RETURN_IF_ERROR(shape_refiner->SetShape(node, i, enter_ctx->input(0)));
-  }
-  for (const Edge* e : node->out_edges()) {
-    Node* dst = e->dst();
-    if (dst->IsMerge()) {
-      bool updated = false;
-      TF_RETURN_IF_ERROR(shape_refiner->UpdateNode(dst, relax, &updated));
-      if (!updated) {
-        continue;
+  // Compute the shape of the tensors outputed by node 'node' at output port
+  // 'port_index' as the union of shape1 and shape2.
+  ShapeHandle OutputAsUnion(const Node* node, int port_index,
+                            ShapeHandle shape1, ShapeHandle shape2) {
+    if (shape1.SameHandle(shape2)) {
+      return shape1;
+    }
+    InferenceContext* ctx = shape_refiner_->GetContext(node);
+    ShapeHandle relaxed = shape1;
+    const int rank = ctx->Rank(shape1);
+    if (!ctx->RankKnown(shape2) || ctx->Rank(shape2) != rank) {
+      relaxed = GetUnknownOutputShape(node, port_index);
+    } else {
+      for (int d = 0; d < rank; ++d) {
+        if (!ctx->Dim(shape1, d).SameHandle(ctx->Dim(shape2, d))) {
+          int64 val1 = ctx->Value(ctx->Dim(shape1, d));
+          int64 val2 = ctx->Value(ctx->Dim(shape2, d));
+          if (val1 != val2 || (val1 < 0 && val2 < 0)) {
+            DimensionHandle new_dim = GetUnknownOutputDim(node, port_index, d);
+            TF_CHECK_OK(ctx->ReplaceDim(relaxed, d, new_dim, &relaxed));
+          }
+        }
       }
-      InferenceContext* merge_ctx = shape_refiner->GetContext(dst);
-      CHECK_NE(merge_ctx, nullptr);
-      TF_RETURN_IF_ERROR(ShapeOfMergeNode(dst, merge_ctx));
-      new_shapes->push(dst);
     }
+    return relaxed;
   }
-  return Status::OK();
-}
 
-// Propagates the shapes in the transitive fan-out of <new_shapes>.
-Status PropagateShapes(ShapeRefiner* shape_refiner, bool relax,
-                       std::queue<const Node*>* new_shapes) {
-  while (!new_shapes->empty()) {
-    const Node* n = new_shapes->front();
-    new_shapes->pop();
-    for (const Node* fanout : n->out_nodes()) {
-      bool updated = false;
-      TF_RETURN_IF_ERROR(shape_refiner->UpdateNode(fanout, relax, &updated));
-      if (fanout->IsEnter()) {
-        TF_RETURN_IF_ERROR(
-            UpdateEnter(shape_refiner, fanout, relax, new_shapes));
-      } else if (updated) {
-        // We want to avoid propagating through loops on the merge pass because
-        // the shapes are not guaranteed to converge.
-        if (!relax && fanout->IsNextIteration()) {
+  bool EquivalentShapes(ShapeHandle s1, ShapeHandle s2) const {
+    if (s1.SameHandle(s2)) {
+      return true;
+    }
+    if (InferenceContext::Rank(s1) != InferenceContext::Rank(s2)) {
+      return false;
+    }
+    if (!InferenceContext::RankKnown(s1) && !InferenceContext::RankKnown(s2)) {
+      return true;
+    }
+    const int rank = InferenceContext::Rank(s1);
+    for (int i = 0; i < rank; ++i) {
+      if (!InferenceContext::DimKnownRank(s1, i).SameHandle(
+              InferenceContext::DimKnownRank(s2, i))) {
+        int64 val1 =
+            InferenceContext::Value(InferenceContext::DimKnownRank(s1, i));
+        int64 val2 =
+            InferenceContext::Value(InferenceContext::DimKnownRank(s2, i));
+        if (val1 >= 0 && val2 >= 0 && val1 == val2) {
           continue;
         }
-        new_shapes->push(fanout);
+        return false;
       }
     }
+    return true;
   }
-  return Status::OK();
-}
 
-bool IsQueue(const Node& node) {
-  StringPiece type(node.type_string());
-  return type.ends_with("QueueV2");
-}
+  bool EquivalentShapesAndTypes(const std::vector<ShapeAndType>& st1,
+                                const std::vector<ShapeAndType>& st2) const {
+    if (st1.size() != st2.size()) {
+      return false;
+    }
+    for (int i = 0; i < st1.size(); ++i) {
+      const ShapeAndType& s1 = st1[i];
+      const ShapeAndType& s2 = st2[i];
+      if (s1.dtype != s2.dtype) {
+        return false;
+      }
+      if (!EquivalentShapes(s1.shape, s2.shape)) {
+        return false;
+      }
+    }
+    return true;
+  }
 
-// Returns true if the node is an Enter op AND its input is a Queue.
-bool IsEnterWithQueue(const Node& node) {
-  if (node.IsEnter()) {
-    const Node* in_node;
-    TF_CHECK_OK(node.input_node(0, &in_node));
-    return IsQueue(*in_node);
+ private:
+  // Return the one ShapeHandle used to denote a fully unknown shape for a node
+  // output.
+  ShapeHandle GetUnknownOutputShape(const Node* node, int index) {
+    ShapeId id{node, index};
+    auto it = unknown_shapes_.find(id);
+    if (it != unknown_shapes_.end()) {
+      return it->second;
+    }
+    InferenceContext* c = shape_refiner_->GetContext(node);
+    ShapeHandle shp = c->UnknownShape();
+    unknown_shapes_[id] = shp;
+    return shp;
+  }
+  // Return the one ShapeHandle used to denote a fully unknown dimension for a
+  // node output.
+  DimensionHandle GetUnknownOutputDim(const Node* node, int index, int dim_id) {
+    DimId id{node, index, dim_id};
+    auto it = unknown_dims_.find(id);
+    if (it != unknown_dims_.end()) {
+      return it->second;
+    }
+    InferenceContext* c = shape_refiner_->GetContext(node);
+    DimensionHandle dim = c->UnknownDim();
+    unknown_dims_[id] = dim;
+    return dim;
   }
-  return false;
-}
 
-}  // namespace
+  ShapeRefiner* shape_refiner_;
+
+  std::unordered_map<ShapeId, ShapeHandle, HashShapeId> unknown_shapes_;
+  std::unordered_map<DimId, DimensionHandle, HashDimId> unknown_dims_;
+};
 
 // Keep track of shapes and dimensions in a graph.
 // In particular, use disjoint sets to track equivalence between shapes and
@@ -401,24 +636,9 @@ class SymbolicShapeManager {
   DisjointSet<shape_inference::DimensionHandle> dims_;
 };
 
-void GraphProperties::Relax(InferenceContext* c, ShapeHandle s0, ShapeHandle s1,
-                            ShapeHandle* out) {
-  c->Relax(s0, s1, out);
-}
-
-bool GraphProperties::SameDefinedShape(InferenceContext* c, ShapeHandle s0,
-                                       ShapeHandle s1) {
-  return ShapeRefiner::SameDefinedShape(c, s0, s1);
-}
-
-bool GraphProperties::IsUpdatedShapesOrTypes(
-    InferenceContext* c, const std::vector<ShapeAndType>& existing,
-    const std::vector<ShapeAndType>& updated) {
-  return ShapeRefiner::IsUpdatedShapesOrTypes(c, existing, updated);
-}
-
 Status GraphProperties::MergeEnqueueShapesAndTypes(
-    const std::vector<ShapeAndType>& shapes_and_types, InferenceContext* qctx,
+    SymbolicShapeRefiner* shape_refiner, const Node* qnode,
+    const std::vector<ShapeAndType>& shapes_and_types,
     std::vector<ShapeAndType>* queue_shapes_and_types) {
   if (shapes_and_types.size() != queue_shapes_and_types->size()) {
     return errors::InvalidArgument(
@@ -434,13 +654,14 @@ Status GraphProperties::MergeEnqueueShapesAndTypes(
                                      DataTypeString(b.dtype));
     }
 
-    TF_RETURN_IF_ERROR(qctx->Merge(a.shape, b.shape, &b.shape));
+    b.shape = shape_refiner->OutputAsIntersection(qnode, i, a.shape, b.shape);
   }
   return Status::OK();
 }
 
 Status GraphProperties::RelaxEnqueueShapesAndMergeTypes(
-    const std::vector<ShapeAndType>& shapes_and_types, InferenceContext* qctx,
+    SymbolicShapeRefiner* shape_refiner, const Node* qnode,
+    const std::vector<ShapeAndType>& shapes_and_types,
     std::vector<ShapeAndType>* queue_shapes_and_types) {
   if (shapes_and_types.size() != queue_shapes_and_types->size()) {
     return errors::InvalidArgument(
@@ -456,12 +677,246 @@ Status GraphProperties::RelaxEnqueueShapesAndMergeTypes(
                                      DataTypeString(b.dtype));
     }
 
-    Relax(qctx, a.shape, b.shape, &b.shape);
+    b.shape = shape_refiner->OutputAsUnion(qnode, i, a.shape, b.shape);
+  }
+  return Status::OK();
+}
+
+// If a Merge node has a NextIteration node as an input then that input will
+// try to forward an UnknownShape at graph construction time. However, the
+// Merge shape function will always propagate an UnknownShape if any of its
+// inputs are UnknownShapes. So we need to ignore the input from NextIteration
+// nodes to propagate any known shape from the Merge node.
+Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
+                                        const Node* node, bool relax,
+                                        TopoQueue* new_shapes) {
+  InferenceContext* c = shape_refiner->GetContext(node);
+  CHECK_NE(c, nullptr);
+
+  ShapeHandle out;
+  bool out_initialized = false;
+  for (const Edge* e : node->in_edges()) {
+    if (e->IsControlEdge()) {
+      continue;
+    }
+    // Skip back edges during the initial propagation phase. This is equivalent
+    // to assuming that all the inputs to the merge nodes are fed by the same
+    // shape, and will be corrected as needed in the relaxation phase.
+    if (!relax && e->src()->IsNextIteration()) {
+      continue;
+    }
+
+    InferenceContext* in = shape_refiner->GetContext(e->src());
+    ShapeHandle input = in->output(e->src_output());
+    if (relax) {
+      c->RelaxInput(e->dst_input(), input);
+    } else {
+      c->MergeInput(e->dst_input(), input);
+    }
+    if (!out_initialized) {
+      out_initialized = true;
+      out = input;
+      continue;
+    }
+    if (relax) {
+      out = shape_refiner->OutputAsUnion(node, 0, input, out);
+    } else {
+      out = shape_refiner->OutputAsIntersection(node, 0, input, out);
+    }
+  }
+
+  if (!shape_refiner->EquivalentShapes(out, c->output(0))) {
+    c->set_output(0, out);
+    c->set_output(1, c->Scalar());
+    new_shapes->push(node);
+  }
+
+  return Status::OK();
+}
+
+Status GraphProperties::OverwriteFedPorts(
+    SymbolicShapeRefiner* shape_refiner,
+    const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
+    const Node* node, TopoQueue* new_shapes) const {
+  auto it = fed_ports.find(node->name());
+  Status status;
+  if (it != fed_ports.end()) {
+    // It is possible to feed node output ports with tensors of any shape: as a
+    // result, the shape of a fed port is completely unknown.
+    for (const int output_port : it->second) {
+      status.Update(shape_refiner->SetUnknownShape(node, output_port));
+    }
+    new_shapes->push(node);
+  }
+  return status;
+}
+
+// Manually propagate the input shape for Enter nodes and update any Merge node
+// outputs.
+Status GraphProperties::UpdateEnter(SymbolicShapeRefiner* shape_refiner,
+                                    const Node* node, bool relax,
+                                    TopoQueue* new_shapes) {
+  auto enter_ctx = shape_refiner->GetContext(node);
+  CHECK_NE(enter_ctx, nullptr);
+
+  for (const Edge* e : node->in_edges()) {
+    if (e->IsControlEdge()) {
+      continue;
+    }
+    InferenceContext* in = shape_refiner->GetContext(e->src());
+    ShapeHandle input = in->output(e->src_output());
+    if (!enter_ctx->output(0).SameHandle(input)) {
+      if (relax) {
+        enter_ctx->RelaxInput(0, input);
+      } else {
+        enter_ctx->MergeInput(0, input);
+      }
+      enter_ctx->set_output(0, input);
+      new_shapes->push(node);
+    }
+  }
+  return Status::OK();
+}
+
+Status GraphProperties::UpdateShapes(
+    SymbolicShapeRefiner* shape_refiner, bool relax,
+    const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
+    const Node* n, TopoQueue* new_shapes) const {
+  if (n->IsEnter()) {
+    // The Enter shape function always forwards an UnknownShape, so do the right
+    // thing here.
+    TF_RETURN_IF_ERROR(UpdateEnter(shape_refiner, n, relax, new_shapes));
+  } else if (n->IsMerge()) {
+    // Properly handle merge nodes.
+    TF_RETURN_IF_ERROR(UpdateMergeNode(shape_refiner, n, relax, new_shapes));
+  } else {
+    // Rely on regular TF shape refinement for all the other nodes.
+    bool updated = false;
+    TF_RETURN_IF_ERROR(shape_refiner->UpdateNode(n, relax, &updated));
+    if (updated) {
+      // We want to avoid propagating through loops on the merge pass because
+      // the shapes are not guaranteed to converge.
+      if (relax || !n->IsNextIteration()) {
+        new_shapes->push(n);
+      }
+    }
+  }
+  // Nodes can be fed with any shape. The TensorFlow shape inference code can't
+  // handle this properly, so overwrite its behavior here.
+  return OverwriteFedPorts(shape_refiner, fed_ports, n, new_shapes);
+}
+
+// Propagates the shapes in the transitive fan-out of <new_shapes>.
+Status GraphProperties::PropagateShapes(
+    SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes,
+    const std::unordered_map<const Node*, std::unordered_set<const Node*>>&
+        resources,
+    const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
+    int num_loops) const {
+  // Limit the number of iterations to prevent infinite loops in the presence of
+  // incorrect shape functions. The algoritm should converge in at most
+  // num_nested_loops^2 * max_rank. We approximate max_rank with the constant 4.
+  // The same applies to resources.
+  VLOG(1) << "Propagating (relax=" << relax << ") " << new_shapes->size()
+          << " new shapes through " << num_loops << " loops and "
+          << resources.size() << " resources" << std::endl;
+
+  const int64 max_loop_length = item_.graph.node_size();
+  const int64 max_rank = 4;
+  const int64 max_loop_iterations =
+      max_rank * max_loop_length * std::max<int64>(1, num_loops * num_loops);
+  const int64 num_queues = resources.size();
+  const int64 max_resource_iterations = num_queues * num_queues * max_rank;
+
+  int64 num_resource_iterations = 0;
+  do {
+    int64 num_loop_iterations = 0;
+    while (!new_shapes->empty() &&
+           num_loop_iterations++ < max_loop_iterations) {
+      const Node* n = new_shapes->pop();
+      for (const Edge* e : n->out_edges()) {
+        if (!e->IsControlEdge()) {
+          const Node* fanout = e->dst();
+          TF_RETURN_IF_ERROR(UpdateShapes(shape_refiner, relax, fed_ports,
+                                          fanout, new_shapes));
+        }
+      }
+    }
+
+    for (const auto& resource : resources) {
+      // Resources need special handling: since the enqueue nodes are in the
+      // fanout of the queues, we need to manually propagate the shapes from
+      // enqueue node to the corresponding queue.
+      TF_RETURN_IF_ERROR(UpdateResource(resource.first, resource.second,
+                                        shape_refiner, relax, new_shapes));
+    }
+  } while (!new_shapes->empty() &&
+           num_resource_iterations++ < max_resource_iterations);
+
+  if (!new_shapes->empty()) {
+    return errors::Internal("Shape inference failed to converge");
   }
+
   return Status::OK();
 }
 
-Status GraphProperties::InferStatically() {
+Status GraphProperties::UpdateResource(
+    const Node* qnode, const std::unordered_set<const Node*>& queue_inputs,
+    SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes) {
+  // Proceed only if qnode is a queue or an Enter with queue input.
+  if (!IsQueue(*qnode) && !IsEnterWithQueue(*qnode)) {
+    return Status::OK();
+  }
+  auto qctx = shape_refiner->GetContext(qnode);
+  if (!qctx) {
+    return Status::OK();
+  }
+  auto* queue_handle_data = qctx->output_handle_shapes_and_types(0);
+
+  // Merge all inputs into the enqueue node, regardless of which phase we
+  // are in.
+  std::vector<ShapeAndType> queue_shapes_and_types;
+  if (queue_handle_data) {
+    queue_shapes_and_types = *queue_handle_data;
+  }
+  for (const auto& node : queue_inputs) {
+    auto ctx = shape_refiner->GetContext(node);
+    if (!ctx) {
+      continue;
+    }
+    // TODO(bsteiner): handle EnqueueMany as well.
+    if (node->type_string().find("Enqueue") != std::string::npos &&
+        node->type_string().find("EnqueueMany") == std::string::npos) {
+      std::vector<ShapeAndType> shapes_and_types;
+      for (int i = 1; i < ctx->num_inputs(); ++i) {
+        shapes_and_types.push_back({ctx->input(i), node->input_type(i)});
+      }
+      if (queue_shapes_and_types.empty()) {
+        queue_shapes_and_types = shapes_and_types;
+      } else {
+        if (relax) {
+          TF_RETURN_IF_ERROR(RelaxEnqueueShapesAndMergeTypes(
+              shape_refiner, qnode, shapes_and_types, &queue_shapes_and_types));
+        } else {
+          TF_RETURN_IF_ERROR(MergeEnqueueShapesAndTypes(
+              shape_refiner, qnode, shapes_and_types, &queue_shapes_and_types));
+        }
+      }
+    }
+  }
+
+  if (queue_handle_data == nullptr ||
+      !shape_refiner->EquivalentShapesAndTypes(*queue_handle_data,
+                                               queue_shapes_and_types)) {
+    qctx->set_output_handle_shapes_and_types(0, queue_shapes_and_types);
+
+    new_shapes->push(qnode);
+  }
+
+  return Status::OK();
+}
+
+Status GraphProperties::InferStatically(bool assume_valid_feeds) {
   Graph graph(OpRegistry::Global());
   FunctionLibraryDefinition function_library(graph.op_registry(),
                                              item_.graph.library());
@@ -478,11 +933,22 @@ Status GraphProperties::InferStatically() {
   Status s = ImportGraphDef(options, item_.graph, &graph, &shape_refiner);
   TF_RETURN_IF_ERROR(s);
 
+  std::unordered_map<string, std::unordered_set<int>> fed_ports;
+  if (!assume_valid_feeds) {
+    for (const auto& feed : item_.feed) {
+      int port_index = 0;
+      string node_name = ParseNodeName(feed.first, &port_index);
+      fed_ports[node_name].insert(port_index);
+    }
+  }
+
   // List the resources and the nodes using them. Also collect the Enter and
   // Merge nodes.
   std::unordered_map<const Node*, std::unordered_set<const Node*>> resources;
   std::unordered_set<const Node*> enter_nodes;
   std::unordered_set<const Node*> merge_nodes;
+  std::unordered_set<const Node*> fed_nodes;
+  int num_loops = 0;
   for (const Node* const node : graph.nodes()) {
     for (int i = 0; i < node->num_inputs(); ++i) {
       if (node->input_type(i) == DataType::DT_RESOURCE) {
@@ -493,147 +959,46 @@ Status GraphProperties::InferStatically() {
     }
     if (node->IsEnter()) {
       enter_nodes.insert(node);
+    } else if (node->IsMerge()) {
+      merge_nodes.insert(node);
     } else if (node->IsNextIteration()) {
-      for (const Node* output : node->out_nodes()) {
-        if (output->IsMerge()) {
-          merge_nodes.insert(output);
-        }
-      }
+      ++num_loops;
+    }
+    if (fed_ports.find(node->name()) != fed_ports.end()) {
+      fed_nodes.insert(node);
     }
   }
 
-  // Propagate the initial shapes of Enter nodes manually (the Enter shape
-  // function always forwards an UnknownShape).
-  std::queue<const Node*> new_shapes;
-  for (const Node* node : enter_nodes) {
-    TF_RETURN_IF_ERROR(
-        UpdateEnter(&shape_refiner, node, false /* relax */, &new_shapes));
-  }
-  TF_RETURN_IF_ERROR(
-      PropagateShapes(&shape_refiner, false /* relax */, &new_shapes));
+  SymbolicShapeRefiner refiner(&shape_refiner);
 
   // We propagate shapes through the graph in two phases. In the first phase, we
-  // exclusively merge shapes but we do not propagate shapes through loops. Then
-  // on the second phase, we exclusively relax shapes and propagate shapes
-  // through loops until reaching fixed point.
+  // exclusively merge shapes but we do not propagate shapes through the
+  // backedge of loops (i.e. the NextIteration node). Then on the second phase,
+  // we exclusively relax shapes and propagate shapes through loops until
+  // reaching fixed point.
   for (int relax = 0; relax < 2; relax++) {
-    // We don't update Merge nodes with the input of NextIteration nodes on the
-    // merge pass. So we do that at the beginning of the relax pass instead.
-    if (relax) {
-      bool updated = false;
-      for (const Node* node : merge_nodes) {
-        TF_RETURN_IF_ERROR(
-            shape_refiner.UpdateNode(node, false /* relax */, &updated));
-      }
+    TopoQueue new_shapes;
+    // Force the propagation of shapes of Enter nodes manually (the Enter shape
+    // function always forwards an UnknownShape).
+    for (const Node* node : enter_nodes) {
+      TF_RETURN_IF_ERROR(
+          UpdateShapes(&refiner, relax, fed_ports, node, &new_shapes));
     }
-
-    bool done = true;
-    do {
-      if (relax) {
-        // Propagate shapes through any loops in the graph by relaxing.
-        for (const Node* node : merge_nodes) {
-          new_shapes.push(node);
-        }
-        TF_RETURN_IF_ERROR(PropagateShapes(&shape_refiner, relax, &new_shapes));
-      }
-
-      // If we found a resource, try to propagate the shapes through it.
-      new_shapes = std::queue<const Node*>();
-      for (const auto& resource_data : resources) {
-        const Node* qnode = resource_data.first;
-        // Proceed only if qnode is a queue or an Enter with queue input.
-        if (!IsQueue(*qnode) && !IsEnterWithQueue(*qnode)) {
-          continue;
-        }
-        auto qctx = shape_refiner.GetContext(qnode);
-        if (!qctx) {
-          continue;
-        }
-
-        // Check to see if the shape is fully defined.
-        auto* queue_handle_data = qctx->output_handle_shapes_and_types(0);
-        if (queue_handle_data != nullptr) {
-          bool fully_defined = true;
-          for (const auto& shape_and_type : *queue_handle_data) {
-            if (!qctx->FullyDefined(shape_and_type.shape) ||
-                shape_and_type.dtype == DT_INVALID) {
-              fully_defined = false;
-            }
-          }
-          // If we are merging, then we are done. If we are relaxing, then we
-          // could potentially propagate a less specific shape.
-          if (fully_defined && !relax) {
-            continue;
-          }
-        }
-
-        // Merge all inputs into the enqueue node, regardless of which phase we
-        // are in.
-        std::vector<ShapeAndType> queue_shapes_and_types;
-        for (const auto& node : resource_data.second) {
-          auto ctx = shape_refiner.GetContext(node);
-          if (!ctx) {
-            continue;
-          }
-          // TODO(bsteiner): handle EnqueueMany as well.
-          if (node->type_string().find("Enqueue") != std::string::npos &&
-              node->type_string().find("EnqueueMany") == std::string::npos) {
-            std::vector<ShapeAndType> shapes_and_types;
-            for (int i = 1; i < ctx->num_inputs(); ++i) {
-              shapes_and_types.push_back({ctx->input(i), node->input_type(i)});
-            }
-
-            if (queue_shapes_and_types.empty()) {
-              queue_shapes_and_types = shapes_and_types;
-            } else {
-              TF_RETURN_IF_ERROR(MergeEnqueueShapesAndTypes(
-                  shapes_and_types, qctx, &queue_shapes_and_types));
-            }
-          }
-        }
-        // Combine the input shapes with the existing output shape. We either
-        // merge or relax depending on which phase we are in.
-        if (queue_handle_data != nullptr) {
-          if (relax) {
-            TF_RETURN_IF_ERROR(RelaxEnqueueShapesAndMergeTypes(
-                *queue_handle_data, qctx, &queue_shapes_and_types));
-          } else {
-            TF_RETURN_IF_ERROR(MergeEnqueueShapesAndTypes(
-                *queue_handle_data, qctx, &queue_shapes_and_types));
-          }
-        }
-        // Set the output ShapeAndType handles. If we successfully update the
-        // resource node, add its fan-out to the queue.
-        const std::vector<ShapeAndType>* outputs =
-            qctx->output_handle_shapes_and_types(0);
-        std::vector<ShapeAndType> existing_outputs;
-        if (outputs) {
-          existing_outputs = *outputs;
-        }
-        if (!queue_shapes_and_types.empty()) {
-          if (!relax && qctx->MergeOutputHandleShapesAndTypes(
-                            0, queue_shapes_and_types)) {
-            new_shapes.push(qnode);
-          } else if (relax && qctx->RelaxOutputHandleShapesAndMergeTypes(
-                                  0, queue_shapes_and_types)) {
-            if (IsUpdatedShapesOrTypes(
-                    qctx, existing_outputs,
-                    *qctx->output_handle_shapes_and_types(0))) {
-              new_shapes.push(qnode);
-            }
-          }
-        }
-      }
-      // Propagate the shapes in the transitive fan-out of the queue.
-      done = new_shapes.empty();
-      if (!done) {
-        TF_RETURN_IF_ERROR(PropagateShapes(&shape_refiner, relax, &new_shapes));
-      }
-    } while (!done);
+    // Seed the propagation of shapes through merge nodes.
+    for (const Node* node : merge_nodes) {
+      TF_RETURN_IF_ERROR(
+          UpdateShapes(&refiner, relax, fed_ports, node, &new_shapes));
+    }
+    // Also seed the propagation of shapes in the fanout of fed nodes.
+    for (const Node* node : fed_nodes) {
+      TF_RETURN_IF_ERROR(
+          OverwriteFedPorts(&refiner, fed_ports, node, &new_shapes));
+    }
+    // Propagate shapes normally.
+    TF_RETURN_IF_ERROR(PropagateShapes(&refiner, relax, &new_shapes, resources,
+                                       fed_ports, num_loops));
   }
 
-  std::unordered_map<const shape_inference::Dimension*, int> dim_ids;
-
   // Track shapes globally across the graph.
   SymbolicShapeManager shape_manager;
   bool found_error = false;
@@ -642,6 +1007,10 @@ Status GraphProperties::InferStatically() {
     if (!node_ctx) {
       continue;
     }
+    // Skip any information that comes from fed nodes.
+    if (fed_ports.find(node->name()) != fed_ports.end()) {
+      continue;
+    }
     for (const auto& merged_shapes : node_ctx->MergedShapes()) {
       if (!shape_manager.Merge(merged_shapes.first, merged_shapes.second)
                .ok()) {
@@ -664,7 +1033,7 @@ Status GraphProperties::InferStatically() {
   }
 
   for (const Node* const node : graph.nodes()) {
-    VLOG(1) << "<Node> " << node->name();
+    VLOG(3) << "Filling in graph properties for node: " << node->name();
     auto ctx = shape_refiner.GetContext(node);
     if (!ctx) {
       continue;
@@ -684,6 +1053,9 @@ Status GraphProperties::InferStatically() {
                                          &input_properties[i]);
       }
       for (const auto& edge : node->in_edges()) {
+        if (edge->IsControlEdge()) {
+          continue;
+        }
         if (!edge->src()->IsConstant()) {
           continue;
         }
@@ -713,6 +1085,10 @@ Status GraphProperties::InferStatically() {
     }
   }
 
+  // Help trace the unknown dimensions to their origins.
+  VerboseLogUnknownDimensionSources(graph, input_properties_,
+                                    output_properties_);
+
   return Status::OK();
 }
 
@@ -742,6 +1118,9 @@ Status GraphProperties::AnnotateOutputShapes(GraphDef* output_graph_def) const {
 }
 
 Status GraphProperties::InferFromCostGraph(const CostGraphDef& cost_graph) {
+  if (cost_graph.node_size() == 0) {
+    LOG(WARNING) << "cost_graph is empty: nothing can be inferred!";
+  }
   std::unordered_map<string, const CostGraphDef::Node*> name_to_cost;
   std::unordered_map<string, const NodeDef*> name_to_node;  // Empty
   for (auto& node : cost_graph.node()) {
diff --git a/tensorflow/core/grappler/costs/graph_properties.h b/tensorflow/core/grappler/costs/graph_properties.h
index e2fe9f96892f00350587de7f6f8540329e9f6dc9..6fc53a7f2e7da7bae7b6f49c7b32291c981fef53 100644
--- a/tensorflow/core/grappler/costs/graph_properties.h
+++ b/tensorflow/core/grappler/costs/graph_properties.h
@@ -26,17 +26,27 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+class SymbolicShapeRefiner;
+class TopoQueue;
+
 // A TensorFlow model to optimize.
 // Models are represented by the combination of a graph, one of more fetch
 // nodes, and potentially a set of nodes to feed.
 class GraphProperties {
  public:
-  // Factory method for creating a GrapplerShapes from a MetaGraphDef.
-  // Returns nullptr if the given meta_graph cannot be converted.
   explicit GraphProperties(const GrapplerItem& item) : item_(item) {}
 
-  Status InferStatically();
+  // Infer the shapes through abstract interpretation. Feed information can be
+  // incorrect so it should be discarded to ensure correctness of the analysis.
+  // However, it can help infer shapes in the fanout of fed nodes (even though
+  // the correctness of these shapes can't be guaranteed), so in some cases
+  // (such as simulation or scheduling) it makes sense of keep these shapes.
+  Status InferStatically(bool assume_valid_feeds);
+  // Infer the shape by running the graph on the specified cluster and recording
+  // the shapes of the processed tensors.
   Status InferDynamically(Cluster* cluster);
+  // Extract the properties from a cost graph. For testing only since there is
+  // no way to ensure that the cost graph match the item.
   Status InferFromCostGraph(const CostGraphDef& cost_graph);
 
   // Stores `item_.graph` with the inferred output shapes to `output_graph_def`.
@@ -62,39 +72,59 @@ class GraphProperties {
       OpInfo::TensorProperties*);
 
  private:
-  // Inputs
-  GrapplerItem item_;
-  std::map<string, std::vector<OpInfo::TensorProperties>> input_properties_;
-  std::map<string, std::vector<OpInfo::TensorProperties>> output_properties_;
-  const std::vector<OpInfo::TensorProperties> missing_properties_;
-
   // Merges shapes <shapes_and_types>, determined from an EnqueueV2 node, into
   // <*queue_shapes_and_types>.
-  Status MergeEnqueueShapesAndTypes(
+  static Status MergeEnqueueShapesAndTypes(
+      SymbolicShapeRefiner* shape_refiner, const Node* qnode,
       const std::vector<shape_inference::ShapeAndType>& shapes_and_types,
-      shape_inference::InferenceContext* qctx,
       std::vector<shape_inference::ShapeAndType>* queue_shapes_and_types);
   // Relaxes shapes <shapes_and_types>, determined from an EnqueueV2 node, into
   // <*queue_shapes_and_types>.
-  Status RelaxEnqueueShapesAndMergeTypes(
+  static Status RelaxEnqueueShapesAndMergeTypes(
+      SymbolicShapeRefiner* shape_refiner, const Node* qnode,
       const std::vector<shape_inference::ShapeAndType>& shapes_and_types,
-      shape_inference::InferenceContext* qctx,
       std::vector<shape_inference::ShapeAndType>* queue_shapes_and_types);
 
-  // This gives access to private function of InferenceContext.
-  static void Relax(shape_inference::InferenceContext* c,
-                    shape_inference::ShapeHandle s0,
-                    shape_inference::ShapeHandle s1,
-                    shape_inference::ShapeHandle* out);
-
-  // These give access to private functions of ShapeRefiner.
-  static bool SameDefinedShape(shape_inference::InferenceContext* c,
-                               shape_inference::ShapeHandle s0,
-                               shape_inference::ShapeHandle s1);
-  static bool IsUpdatedShapesOrTypes(
-      shape_inference::InferenceContext* c,
-      const std::vector<shape_inference::ShapeAndType>& existing,
-      const std::vector<shape_inference::ShapeAndType>& updated);
+  // Update the shapes for qnode. If output shapes of qnode have changed,
+  // enqueue its fanout in 'new_shapes'.
+  static Status UpdateResource(
+      const Node* qnode, const std::unordered_set<const Node*>& queue_inputs,
+      SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes);
+
+  // Update the output shapes of a Merge node, and enqueue its fanout in
+  // new_shapes if needed.
+  static Status UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
+                                const Node* node, bool relax,
+                                TopoQueue* new_shapes);
+  // Process the Enter node, and enqueue its fanout in new_shapes if needed.
+  static Status UpdateEnter(SymbolicShapeRefiner* shape_refiner,
+                            const Node* node, bool relax,
+                            TopoQueue* new_shapes);
+  // Process a node that is used to feed the model.
+  Status OverwriteFedPorts(
+      SymbolicShapeRefiner* shape_refiner,
+      const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
+      const Node* node, TopoQueue* new_shapes) const;
+  // Update the shapes for node 'n'. If output shapes for n have changed,
+  // enqueue its fanout in 'new_shapes'.
+  Status UpdateShapes(
+      SymbolicShapeRefiner* shape_refiner, bool relax,
+      const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
+      const Node* n, TopoQueue* new_shapes) const;
+  // Propagate the shapes for the nodes enqueued in new_shapes and their
+  // transitive fanout until a fixed point is reached.
+  Status PropagateShapes(
+      SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes,
+      const std::unordered_map<const Node*, std::unordered_set<const Node*>>&
+          resources,
+      const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
+      int num_loops) const;
+
+  // Data members
+  GrapplerItem item_;
+  std::map<string, std::vector<OpInfo::TensorProperties>> input_properties_;
+  std::map<string, std::vector<OpInfo::TensorProperties>> output_properties_;
+  const std::vector<OpInfo::TensorProperties> missing_properties_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index f785f627e12f295717ffe1b61d0367f5c9f13294..5f2ac0c652e601e88e6285358d959c2f6c6d59fe 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -43,7 +43,10 @@ class GraphPropertiesTest : public ::testing::Test {
     TF_CHECK_OK(cluster_->Provision());
   }
 
-  void TearDown() override { cluster_.reset(); }
+  void TearDown() override {
+    TF_CHECK_OK(cluster_->Shutdown());
+    cluster_.reset();
+  }
 
  protected:
   // Returns a string form of <p>, suitable for comparing type and shape.
@@ -73,7 +76,7 @@ TEST_F(GraphPropertiesTest, StaticProperties) {
   CHECK(fake_input.NextItem(&item));
 
   GraphProperties properties(item);
-  Status s = properties.InferStatically();
+  Status s = properties.InferStatically(true);
   TF_CHECK_OK(s);
 
   for (const auto& node : item.graph.node()) {
@@ -179,7 +182,7 @@ TEST_F(GraphPropertiesTest, Variables) {
 
   {
     GraphProperties static_properties(item);
-    TF_CHECK_OK(static_properties.InferStatically());
+    TF_CHECK_OK(static_properties.InferStatically(false));
 
     const auto props = static_properties.GetOutputProperties("Var");
     EXPECT_EQ(1, props.size());
@@ -219,7 +222,7 @@ TEST_F(GraphPropertiesTest, VarHandles) {
                   .Finalize(item.graph.add_node()));
 
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   const auto props = properties.GetOutputProperties("VarRead");
   EXPECT_EQ(1, props.size());
@@ -286,7 +289,7 @@ TEST_F(GraphPropertiesTest, Queues) {
   TF_CHECK_OK(root.ToGraphDef(&item.graph));
 
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   const auto props1 = properties.GetOutputProperties("Dequeue1");
   ASSERT_EQ(1, props1.size());
@@ -335,7 +338,7 @@ TEST_F(GraphPropertiesTest, MergeWithoutLoops) {
                                  "merge_without_loops.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   std::vector<string> nodes{"cond/Merge", "cond/concat", "cond/concat_1"};
   std::vector<string> expected_outputs{"float: [-1,-1,1]", "float: [2,1,1]",
@@ -362,7 +365,7 @@ TEST_F(GraphPropertiesTest, WhileLoop) {
   /*
      with tf.Graph().as_default():
        i0 = tf.constant(0)
-       m0 = tf.ones([2, 2])
+       m0 = tf.placeholder([-1, 2])
        c = lambda i, m: i < 10
        b = lambda i, m: [i+1, tf.concat([m, m], axis=0)]
        r = tf.while_loop(
@@ -377,7 +380,7 @@ TEST_F(GraphPropertiesTest, WhileLoop) {
                                  "while_loop.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   std::vector<string> nodes{"while/Merge_1", "while/NextIteration_1",
                             "while/Exit_1"};
@@ -387,6 +390,14 @@ TEST_F(GraphPropertiesTest, WhileLoop) {
     EXPECT_EQ(DT_FLOAT, prop.dtype());
     EXPECT_EQ("float: [-1,2]", PropToString(prop));
   }
+
+  // The loop outputs batch dim should be different from the input batch dim
+  // since we concatenated along the batch dim.
+  auto shape_in = properties.GetOutputProperties("ones").at(0).shape();
+  auto shape_out = properties.GetOutputProperties("while/Exit_1").at(0).shape();
+  EXPECT_GE(-2, shape_in.dim(0).size());
+  EXPECT_GE(-2, shape_out.dim(0).size());
+  EXPECT_NE(shape_in.dim(0).size(), shape_out.dim(0).size());
 }
 
 TEST_F(GraphPropertiesTest, NestedLoop) {
@@ -427,7 +438,7 @@ TEST_F(GraphPropertiesTest, NestedLoop) {
                                  "nested_loop.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   std::vector<string> outer_nodes{"while/Merge_1", "while/NextIteration_1",
                                   "while/Exit_1"};
@@ -490,7 +501,7 @@ TEST_F(GraphPropertiesTest, LoopsAndQueues) {
                                  "loops_and_queues.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   std::vector<string> outer_nodes{"while/Merge_1", "while/NextIteration_1",
                                   "while/Exit_1"};
@@ -548,7 +559,7 @@ TEST_F(GraphPropertiesTest, LoopsAndResourceVars) {
                                  "loops_and_resource_vars.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   std::vector<string> outer_nodes{"while/Merge_1", "while/NextIteration_1",
                                   "while/Exit_1"};
@@ -600,7 +611,7 @@ TEST_F(GraphPropertiesTest, QueuesAndLoops) {
                                  "queues_and_loops.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   std::vector<string> nodes{"while/Merge_1", "while/NextIteration_1",
                             "while/Exit_1"};
@@ -649,7 +660,7 @@ TEST_F(GraphPropertiesTest, InferRestoreOpShape) {
   item.fetch.push_back("init_restore");
 
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   const auto restore_props = properties.GetOutputProperties("restore");
   const OpInfo::TensorProperties& restore_prop = restore_props[0];
@@ -696,7 +707,7 @@ TEST_F(GraphPropertiesTest, InferRestoreOpShape_WithTwoNodesShareSameOutput) {
   item.fetch.push_back("init2");
 
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   const auto props = properties.GetOutputProperties("restore");
   const OpInfo::TensorProperties& prop = props[0];
@@ -724,7 +735,7 @@ TEST_F(GraphPropertiesTest, FunctionStaticShapeInference) {
                                  "simple_function.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
   const auto props = properties.GetOutputProperties("MyAdd_55e046a8_1");
   const OpInfo::TensorProperties& prop = props[0];
   EXPECT_EQ(DT_FLOAT, prop.dtype());
@@ -732,6 +743,10 @@ TEST_F(GraphPropertiesTest, FunctionStaticShapeInference) {
   EXPECT_EQ(2, prop.shape().dim_size());
   EXPECT_EQ(1, prop.shape().dim(0).size());
   EXPECT_EQ(2, prop.shape().dim(1).size());
+
+  PartialTensorShape shape(prop.shape());
+  EXPECT_TRUE(shape.IsFullyDefined());
+  EXPECT_FALSE(shape.unknown_rank());
 }
 
 TEST_F(GraphPropertiesTest, SymbolicShapes) {
@@ -750,11 +765,15 @@ TEST_F(GraphPropertiesTest, SymbolicShapes) {
   Output e = ops::Add(s.WithOpName("e"), c, d);
   Output f = ops::Add(s.WithOpName("f"), a, c);
 
+  Output zero = ops::Const(s.WithOpName("zero"), 0.0f, {});
+  Output g = ops::Shape(s.WithOpName("g"), c);
+  Output h = ops::Fill(s.WithOpName("h"), g, zero);
+
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
   const auto shape_a = properties.GetOutputProperties("a").at(0).shape();
   const auto shape_c = properties.GetOutputProperties("c").at(0).shape();
   EXPECT_EQ(2, shape_a.dim_size());
@@ -764,6 +783,10 @@ TEST_F(GraphPropertiesTest, SymbolicShapes) {
   EXPECT_GE(-2, shape_a.dim(1).size());
   EXPECT_EQ(shape_a.dim(1).size(), shape_c.dim(1).size());
 
+  PartialTensorShape shape(shape_a);
+  EXPECT_FALSE(shape.IsFullyDefined());
+  EXPECT_FALSE(shape.unknown_rank());
+
   const auto shape_b = properties.GetOutputProperties("b").at(0).shape();
   const auto shape_d = properties.GetOutputProperties("d").at(0).shape();
   EXPECT_EQ(1, shape_b.dim_size());
@@ -773,15 +796,20 @@ TEST_F(GraphPropertiesTest, SymbolicShapes) {
   EXPECT_EQ(shape_b.dim(0).size(), shape_d.dim(0).size());
 
   const auto shape_e = properties.GetOutputProperties("e").at(0).shape();
-  EXPECT_EQ(2, shape_e.dim_size());
+  ASSERT_EQ(2, shape_e.dim_size());
   EXPECT_EQ(shape_e.dim(0).size(), shape_c.dim(0).size());
   EXPECT_NE(shape_e.dim(1).size(), shape_c.dim(1).size());
   EXPECT_NE(shape_e.dim(0).size(), shape_d.dim(0).size());
 
   const auto shape_f = properties.GetOutputProperties("f").at(0).shape();
-  EXPECT_EQ(2, shape_f.dim_size());
+  ASSERT_EQ(2, shape_f.dim_size());
   EXPECT_EQ(shape_f.dim(0).size(), shape_a.dim(0).size());
   EXPECT_EQ(shape_f.dim(1).size(), shape_a.dim(1).size());
+
+  const auto shape_h = properties.GetOutputProperties("h").at(0).shape();
+  ASSERT_EQ(2, shape_f.dim_size());
+  EXPECT_EQ(shape_h.dim(0).size(), shape_c.dim(0).size());
+  EXPECT_EQ(shape_h.dim(1).size(), shape_c.dim(1).size());
 }
 
 TEST_F(GraphPropertiesTest, DoNotValidateColocationConstraints) {
@@ -805,7 +833,92 @@ TEST_F(GraphPropertiesTest, DoNotValidateColocationConstraints) {
   GraphProperties properties(item);
   // This function should return OK, since it doesn't validate the colocation
   // constraints internally.
-  TF_EXPECT_OK(properties.InferStatically());
+  TF_EXPECT_OK(properties.InferStatically(false));
+}
+
+TEST_F(GraphPropertiesTest, ShapeTracking) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a =
+      ops::Placeholder(s.WithOpName("a"), DT_FLOAT,
+                       ops::Placeholder::Shape(PartialTensorShape({-1, -1})));
+  Output b =
+      ops::Placeholder(s.WithOpName("b"), DT_FLOAT,
+                       ops::Placeholder::Shape(PartialTensorShape({-1})));
+  Output zero = ops::Const(s.WithOpName("zero"), 0.0f, {});
+  auto shp = ops::ShapeN(s.WithOpName("shapes"), {a, b});
+  Output o1 = ops::Fill(s.WithOpName("o1"), shp[0], zero);
+  Output o2 = ops::Fill(s.WithOpName("o2"), shp[1], zero);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically(false));
+  const auto shape_a = properties.GetOutputProperties("a").at(0).shape();
+  const auto shape_b = properties.GetOutputProperties("b").at(0).shape();
+  const auto shape_o1 = properties.GetOutputProperties("o1").at(0).shape();
+  const auto shape_o2 = properties.GetOutputProperties("o2").at(0).shape();
+  EXPECT_EQ(shape_a.DebugString(), shape_o1.DebugString());
+  EXPECT_EQ(shape_b.DebugString(), shape_o2.DebugString());
+}
+
+TEST_F(GraphPropertiesTest, FedNodes) {
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false,
+                                          cluster_->GetDeviceNames());
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  {
+    // Conservative shape analysis: the shape of fed ports should be unknown
+    GraphProperties properties(item);
+    Status s = properties.InferStatically(false);
+    TF_CHECK_OK(s);
+    for (const auto& node : item.graph.node()) {
+      if (node.op() == "Const") {
+        continue;
+      }
+      const auto in_props = properties.GetInputProperties(node.name());
+      EXPECT_EQ(1, in_props.size());
+      const OpInfo::TensorProperties& in_prop = in_props[0];
+      const auto out_props = properties.GetOutputProperties(node.name());
+      EXPECT_EQ(1, out_props.size());
+      const OpInfo::TensorProperties& out_prop = out_props[0];
+
+      if (node.name() == "x") {
+        // x is fed: its input should have a known shape, while its output
+        // doesn't
+        EXPECT_FALSE(in_prop.shape().unknown_rank());
+        EXPECT_EQ(1, in_prop.shape().dim_size());
+        EXPECT_EQ(2, in_prop.shape().dim(0).size());
+        EXPECT_TRUE(out_prop.shape().unknown_rank());
+      } else if (node.op() == "Square" || node.op() == "AddN") {
+        // These nodes are in the fanout of x: their shapes should be unknown.
+        EXPECT_TRUE(in_prop.shape().unknown_rank());
+        EXPECT_TRUE(out_prop.shape().unknown_rank());
+      }
+    }
+  }
+  {
+    // Optimistic shape analysis: the shape of fed ports should be derived from
+    // the shape of the fanin.
+    GraphProperties properties(item);
+    Status s = properties.InferStatically(true);
+    TF_CHECK_OK(s);
+    for (const auto& node : item.graph.node()) {
+      if (node.op() == "Square" || node.op() == "AddN") {
+        const auto in_props = properties.GetInputProperties(node.name());
+        EXPECT_EQ(1, in_props.size());
+        const OpInfo::TensorProperties& in_prop = in_props[0];
+        EXPECT_EQ(DT_FLOAT, in_prop.dtype());
+        EXPECT_FALSE(in_prop.shape().unknown_rank());
+        EXPECT_EQ(2, in_prop.shape().dim_size());
+        const auto out_props = properties.GetOutputProperties(node.name());
+        EXPECT_EQ(1, out_props.size());
+        const OpInfo::TensorProperties& out_prop = out_props[0];
+        EXPECT_EQ(in_prop.DebugString(), out_prop.DebugString());
+      }
+    }
+  }
 }
 
 }  // namespace
diff --git a/tensorflow/core/grappler/costs/graph_properties_testdata/while_loop.pbtxt b/tensorflow/core/grappler/costs/graph_properties_testdata/while_loop.pbtxt
index c11833bd1a73a6680b666027398cfd77f335aeff..fbc3659d9a796420f4be5a948b986d457e004cb4 100644
--- a/tensorflow/core/grappler/costs/graph_properties_testdata/while_loop.pbtxt
+++ b/tensorflow/core/grappler/costs/graph_properties_testdata/while_loop.pbtxt
@@ -21,7 +21,7 @@ node {
 }
 node {
   name: "ones"
-  op: "Const"
+  op: "PlaceholderV2"
   attr {
     key: "dtype"
     value {
@@ -29,19 +29,15 @@ node {
     }
   }
   attr {
-    key: "value"
+    key: "shape"
     value {
-      tensor {
-        dtype: DT_FLOAT
-        tensor_shape {
-          dim {
-            size: 2
-          }
-          dim {
-            size: 2
-          }
+      shape {
+        dim {
+          size: -1
+        }
+        dim {
+          size: 2
         }
-        float_val: 1.0
       }
     }
   }
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index bd84331b6740d7693c561647669485160435cf48..6bc136a3f89c9a1dbfd4be15c143d4c893897494 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -25,11 +25,13 @@ namespace tensorflow {
 namespace grappler {
 
 constexpr int kOpsPerMac = 2;
+constexpr char kConst[] = "Const";
 constexpr char kConv2d[] = "Conv2D";
 constexpr char kConv2dBackpropFilter[] = "Conv2DBackpropFilter";
 constexpr char kConv2dBackpropInput[] = "Conv2DBackpropInput";
 constexpr char kMatMul[] = "MatMul";
 constexpr char kSparseMatMul[] = "SparseMatMul";
+constexpr char kPlaceholder[] = "Placeholder";
 constexpr char kIdentity[] = "Identity";
 constexpr char kRefIdentity[] = "RefIdentity";
 constexpr char kNoOp[] = "NoOp";
@@ -159,6 +161,9 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
        wrap(&OpLevelCostEstimator::PredictConv2DBackpropInput)},
       {kMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kSparseMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
+      {kBatchMatMul, wrap(&OpLevelCostEstimator::PredictBatchMatMul)},
+
+      {kPlaceholder, wrap(&OpLevelCostEstimator::PredictNoOp)},
       {kIdentity, wrap(&OpLevelCostEstimator::PredictNoOp)},
       {kRefIdentity, wrap(&OpLevelCostEstimator::PredictNoOp)},
       {kStopGradient, wrap(&OpLevelCostEstimator::PredictNoOp)},
@@ -167,9 +172,10 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
       {kReshape, wrap(&OpLevelCostEstimator::PredictNoOp)},
       {kRecv, wrap(&OpLevelCostEstimator::PredictNoOp)},
       {kSend, wrap(&OpLevelCostEstimator::PredictNoOp)},
+      {kConst, wrap(&OpLevelCostEstimator::PredictNoOp)},
       {kVariable, wrap(&OpLevelCostEstimator::PredictNoOp)},
       {kVariableV2, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kBatchMatMul, wrap(&OpLevelCostEstimator::PredictBatchMatMul)},
+
       {kRank, wrap(&OpLevelCostEstimator::PredictMetadata)},
       {kShape, wrap(&OpLevelCostEstimator::PredictMetadata)},
       {kSize, wrap(&OpLevelCostEstimator::PredictMetadata)}};
@@ -221,6 +227,8 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
                      Eigen::internal::scalar_square_op<float>>::Cost},
       {"Tanh", Eigen::internal::functor_traits<
                    Eigen::internal::scalar_tanh_op<float>>::Cost},
+      {"Relu", Eigen::internal::functor_traits<
+                   Eigen::internal::scalar_max_op<float>>::Cost},
       {"Sigmoid", Eigen::internal::functor_traits<
                       Eigen::internal::scalar_sigmoid_op<float>>::Cost},
       {"Sign", Eigen::internal::functor_traits<
@@ -283,8 +291,10 @@ Costs OpLevelCostEstimator::PredictCosts(const OpContext& op_context) const {
     if (elementwise_ops_.find(op_features.op()) != elementwise_ops_.end()) {
       return PredictCwiseOp(op_context);
     }
-    VLOG(1) << "Missing implementation for op: " << op_features.op();
-    return DummyExecutionTime(op_context);
+
+    VLOG(1) << "Missing accurate estimator for op: " << op_features.op();
+
+    return PredictCostOfAnUnknownOp(op_context);
   }
 
   std::function<Costs(const OpContext&)> estimator = it->second;
@@ -324,7 +334,8 @@ OpLevelCostEstimator::DeviceInfo OpLevelCostEstimator::GetDeviceInfo(
       // Maxwell
       cores_per_multiprocessor = 128;
     } else {
-      // Pascal
+      // Pascal (compute capability version 6) and Volta (compute capability
+      // version 7)
       cores_per_multiprocessor = 64;
     }
     gflops = device.num_cores() * device.frequency() * 1e-3 *
@@ -365,19 +376,27 @@ Costs OpLevelCostEstimator::PredictCwiseOp(const OpContext& op_context) const {
   }
 
   int op_cost = 1;
+  bool is_known_elementwise_op = false;
   auto it = elementwise_ops_.find(op_features.op());
   if (it != elementwise_ops_.end()) {
     op_cost = it->second;
+    is_known_elementwise_op = true;
+  } else {
+    LOG(WARNING) << "Not a cwise op: " << op_features.op();
   }
+
   Costs costs = PredictOpCountBasedCost(op_count * op_cost, op_features);
-  costs.inaccurate = found_unknown_shapes;
+  if (found_unknown_shapes || !is_known_elementwise_op) {
+    costs.inaccurate = true;
+  }
   return costs;
 }
 
-Costs OpLevelCostEstimator::DummyExecutionTime(
+Costs OpLevelCostEstimator::PredictCostOfAnUnknownOp(
     const OpContext& op_context) const {
-  // Use CwiseOp time as an estimation
-  auto costs = PredictCwiseOp(op_context);
+  // Don't assume the operation is cwise, return cost based on input/output size
+  // and admit that it is inaccurate...
+  auto costs = PredictOpCountBasedCost(0, op_context.op_info);
   costs.inaccurate = true;
   return costs;
 }
@@ -390,11 +409,11 @@ Costs OpLevelCostEstimator::PredictOpCountBasedCost(
           << " Execution Time (ns):" << compute_cost.count();
 
   bool found_unknown_shapes = false;
-  double total_input_size =
+  const double total_input_size =
       CalculateInputSize(op_features, &found_unknown_shapes);
-  double total_output_size =
+  const double total_output_size =
       CalculateOutputSize(op_features, &found_unknown_shapes);
-  double total_io_size = total_input_size + total_output_size;
+  const double total_io_size = total_input_size + total_output_size;
 
   Costs::NanoSeconds memory_cost(
       std::ceil(total_io_size / device_perf.gb_per_sec));
@@ -509,7 +528,12 @@ int64 OpLevelCostEstimator::CountMatMulOperations(
     bool* found_unknown_shapes) const {
   double ops = 0;
 
-  // first matrix
+  if (op_features.inputs_size() < 2) {
+    LOG(ERROR) << "Need 2 inputs but got " << op_features.inputs_size();
+    *found_unknown_shapes = true;
+    return 0;
+  }
+
   auto& a_matrix = op_features.inputs(0);
   auto& b_matrix = op_features.inputs(1);
 
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
index 3a8385dd732d1747eca690339e098d741f68effc..5f541ccf04dc74eb868d26365a50d2e3542ea7d9 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@@ -35,7 +35,6 @@ class OpLevelCostEstimator {
 
   virtual Costs PredictCosts(const OpContext& op_context) const;
 
- protected:
   // Basic device performance info, sufficient for roofline estimate.
   struct DeviceInfo {
     double gigaops;     // Billions of operations executed per second.
@@ -45,11 +44,12 @@ class OpLevelCostEstimator {
   // Returns basic device performance info.
   virtual DeviceInfo GetDeviceInfo(const DeviceProperties& device) const;
 
-  // For operations for which we haven't yet built estimates, returns a dummy
-  // value based on input size.
-  Costs DummyExecutionTime(const OpContext& op_context) const;
+ protected:
+  // Predict cost of an op for which no accurate estimator is defined.
+  Costs PredictCostOfAnUnknownOp(const OpContext& op_context) const;
 
-  // Naive cost estimate based on operations divided by device ops/sec.
+  // Naive cost estimate based on operations divided by device ops/sec,
+  // and input/output tensor sizes.
   Costs PredictOpCountBasedCost(double operations,
                                 const OpInfo& op_features) const;
 
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
index f19be4a0ee53609fa0196405da4ecb8b94fa39e6..60fc783472d2b6a1d50eb52e912da1fccbe8cf08 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
@@ -167,8 +167,8 @@ class OpLevelCostEstimatorTest : public ::testing::Test {
 TEST_F(OpLevelCostEstimatorTest, DummyExecutionTime) {
   auto cost = PredictCosts(DescribeOp("Dummy", 1000, 1));
   EXPECT_EQ(Costs::Duration(2000), cost.memory_time);
-  EXPECT_EQ(Costs::Duration(200), cost.compute_time);
-  EXPECT_EQ(Costs::Duration(2200), cost.execution_time);
+  EXPECT_EQ(Costs::Duration(0), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(2000), cost.execution_time);
   EXPECT_TRUE(cost.inaccurate);
 }
 
@@ -176,7 +176,7 @@ TEST_F(OpLevelCostEstimatorTest, ExecutionTimeSumOrMax) {
   SetComputeMemoryOverlap(true);
   auto cost = PredictCosts(DescribeOp("Dummy", 1000, 1));
   EXPECT_EQ(Costs::Duration(2000), cost.memory_time);
-  EXPECT_EQ(Costs::Duration(200), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(0), cost.compute_time);
   EXPECT_EQ(Costs::Duration(2000), cost.execution_time);  // max(2000, 200)
   EXPECT_TRUE(cost.inaccurate);
   SetComputeMemoryOverlap(false);  // Set it back to default.
diff --git a/tensorflow/core/grappler/costs/virtual_placer.h b/tensorflow/core/grappler/costs/virtual_placer.h
index 7ccb1ebb9999989f17548aeb88d1d64abdcc5341..fee5ce0f510014988656f418b857a73b8d68b807 100644
--- a/tensorflow/core/grappler/costs/virtual_placer.h
+++ b/tensorflow/core/grappler/costs/virtual_placer.h
@@ -41,7 +41,7 @@ class VirtualPlacer {
  private:
   // Converts given device name to Lowercase Fully-Qualified Name (LFQN) string.
   // This helps us disambiguate device names internally and simplify matching.
-  // If device_name couldn't be parsed succesfully, returns empty string.
+  // If device_name couldn't be parsed successfully, returns empty string.
   string to_lfqn_or_empty(const string& device_name) const;
 
   // Map based on the cluster info: cluster device name -> device properties.
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index 0bb98d379308248f9681f15fd35b6a84730f2727..1e3da6f525a94db8b1e859f4af3e4f4f6e1fd721 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -43,6 +43,9 @@ Costs CombineCosts(const Costs& left, const Costs& right) {
 
   Costs result = left;
   result.execution_time += right.execution_time;
+  if (right.inaccurate) {
+    result.inaccurate = true;
+  }
   if (right.max_memory != kMemoryUnknown) {
     result.max_memory += right.max_memory;
   }
@@ -88,6 +91,152 @@ struct RecvNodeDescriptorEqual {
 };
 }  // namespace
 
+// ReadyNodeManager
+const NodeDef* LIFOManager::GetCurrNode() {
+  CHECK(!nodes_.empty()) << "GetCurrNode(), but there's no ready node";
+  if (curr_pos_ == nodes_.end()) {
+    curr_pos_ = --(nodes_.rbegin().base());  // Last one in the list.
+  }
+  // Once curr_pos_ is set to a valid entry in the list, we keep using the
+  // cached curr_pos_ until RemoveCurrNode() is called. AddNode() will not
+  // change the GetCurrNode() return value.
+  return *curr_pos_;
+}
+
+void LIFOManager::RemoveCurrNode() {
+  // Make sure we have curr_pos_ ready to be removed.
+  GetCurrNode();
+  // Note curr_pos_ may not be pointing the last element if some nodes are
+  // added.
+  nodes_.erase(curr_pos_);
+
+  curr_pos_ = nodes_.end();  // Reset curr_pos_.
+}
+
+FirstReadyManager::FirstReadyManager(
+    const std::unordered_map<const NodeDef*, NodeState>* node_state)
+    : ReadyNodeManager(), node_state_(node_state) {
+  std::make_heap(nodes_.begin(), nodes_.end());
+  greater_ = [this](const NodeDef* a, const NodeDef* b) -> bool {
+    // Note: we need a node with minimum time_ready, not
+    // maximum; hence, using a > b for comparison function.
+    return node_state_->at(a).time_ready > node_state_->at(b).time_ready;
+  };
+}
+
+const NodeDef* FirstReadyManager::GetCurrNode() {
+  if (nodes_.empty()) {
+    // Nothing in the node_; probably, the very first call. Move
+    // waiting_queue_ to node_.
+    DrainWaitingQueue();
+    CHECK(!nodes_.empty()) << "GetCurrNode(), but there's no ready node";
+  }
+  return nodes_.front();
+}
+
+void FirstReadyManager::RemoveCurrNode() {
+  if (nodes_.empty()) {
+    // Make sure that there is a node to be removed at the front of nodes_.
+    GetCurrNode();
+  }
+  std::pop_heap(nodes_.begin(), nodes_.end(), greater_);
+  nodes_.pop_back();
+  DrainWaitingQueue();
+}
+
+bool FirstReadyManager::Empty() const {
+  return nodes_.empty() && waiting_queue_.empty();
+}
+
+void FirstReadyManager::DrainWaitingQueue() {
+  for (const auto* node : waiting_queue_) {
+    // push_heap in AddNode() and pop_heap in RemoveCurrNode() guarantees that
+    // the first element is the node with minimum time_ready.
+    nodes_.push_back(node);
+    std::push_heap(nodes_.begin(), nodes_.end(), greater_);
+  }
+  waiting_queue_.clear();
+}
+
+CompositeNodeManager::CompositeNodeManager(
+    const std::unordered_map<const NodeDef*, NodeState>* node_state)
+    : ReadyNodeManager(),
+      send_manager_(node_state),
+      recv_manager_(node_state),
+      node_state_(node_state) {
+  curr_node_ = nullptr;
+}
+
+void CompositeNodeManager::AddNode(const NodeDef* node) {
+  if (IsSend(*node)) {
+    send_manager_.AddNode(node);
+  } else if (IsRecv(*node)) {
+    recv_manager_.AddNode(node);
+  } else {
+    const auto& device = node_state_->at(node).device_name;
+    ops_lifo_map_[device].AddNode(node);
+  }
+}
+
+const NodeDef* CompositeNodeManager::GetCurrNode() {
+  if (curr_node_) return curr_node_;
+
+  // Locally (normal ops, not _Send / _Recv) LIFO,
+  // Globally (among the LIFO-selected ops from each device and _Send and
+  // _Recv) FirstReady.
+  std::vector<std::pair<const NodeDef*, Costs::Duration>> candidates;
+  for (auto& ops_lifo : ops_lifo_map_) {
+    if (!ops_lifo.second.Empty()) {
+      const auto* op = ops_lifo.second.GetCurrNode();
+      candidates.emplace_back(op, node_state_->at(op).time_ready);
+    }
+  }
+  if (!send_manager_.Empty()) {
+    const auto* send = send_manager_.GetCurrNode();
+    candidates.emplace_back(send, node_state_->at(send).time_ready);
+  }
+  if (!recv_manager_.Empty()) {
+    const auto* recv = recv_manager_.GetCurrNode();
+    candidates.emplace_back(recv, node_state_->at(recv).time_ready);
+  }
+  CHECK(!candidates.empty());
+  auto first_ready =
+      std::min_element(candidates.begin(), candidates.end(),
+                       [](const std::pair<const NodeDef*, Costs::Duration>& a,
+                          const std::pair<const NodeDef*, Costs::Duration>& b) {
+                         return a.second < b.second;
+                       });
+  // Next time we call GetCurrNode(), it just returns the cached one,
+  // curr_node_ until we call RemovCurrNode().
+  curr_node_ = first_ready->first;
+
+  return curr_node_;
+}
+
+void CompositeNodeManager::RemoveCurrNode() {
+  const auto* node = GetCurrNode();
+  if (IsSend(*node)) {
+    send_manager_.RemoveCurrNode();
+  } else if (IsRecv(*node)) {
+    recv_manager_.RemoveCurrNode();
+  } else {
+    const auto device = node_state_->at(node).device_name;
+    ops_lifo_map_[device].RemoveCurrNode();
+  }
+  // Reset curr_node_ so that GetCurrNode() finds another node.
+  curr_node_ = nullptr;
+}
+
+bool CompositeNodeManager::Empty() const {
+  // Empty if all the ready managers are empty.
+  bool empty = true;
+  for (const auto& ops_lifo : ops_lifo_map_) {
+    empty &= ops_lifo.second.Empty();
+  }
+  return empty && send_manager_.Empty() && recv_manager_.Empty();
+}
+
+// VirtualScheduler
 VirtualScheduler::VirtualScheduler(const GrapplerItem* grappler_item,
                                    const bool use_static_shapes,
                                    Cluster* cluster)
@@ -109,6 +258,8 @@ ReadyNodeManager* VirtualScheduler::ReadyNodeManagerFactory(
     return new LIFOManager();
   } else if (ready_node_manager == "FirstReady") {
     return new FirstReadyManager(GetNodeStates());
+  } else if (ready_node_manager == "Composite") {
+    return new CompositeNodeManager(GetNodeStates());
   }
   LOG(FATAL) << "Not a valid ready node manager: " << ready_node_manager;
 }
@@ -122,7 +273,7 @@ Status VirtualScheduler::Init() {
   // Construct graph properties.
   Status status;
   if (use_static_shapes_) {
-    status = graph_properties_.InferStatically();
+    status = graph_properties_.InferStatically(true);
   } else {
     status = graph_properties_.InferDynamically(cluster_);
   }
@@ -538,7 +689,8 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
   string node_description = GetOpDescription(op_context.op_info);
   op_counts_[node_description] += 1;
   op_costs_[node_description] =
-      node_costs.execution_time.asMicroSeconds().count();
+      std::make_pair(node_costs.execution_time.asMicroSeconds().count(),
+                     !node_costs.inaccurate);
 
   auto& op_cost = FindOrCreateZero(op_name, &op_to_cost_);
   op_cost = CombineCosts(op_cost, node_costs);
@@ -647,8 +799,10 @@ Costs VirtualScheduler::Summary() const {
   for (const auto& op_cost_pair : op_to_cost_) {
     const auto& op = op_cost_pair.first;
     const auto& cost = op_cost_pair.second.execution_time.count();
+    const bool is_op_cost_accurate = !op_cost_pair.second.inaccurate;
     if (cost) {  // Skip printing out zero-cost ops.
-      VLOG(1) << " + " << op << " : " << cost;
+      VLOG(1) << " + " << op << " : " << (is_op_cost_accurate ? "" : "~")
+              << cost;
     }
   }
 
@@ -699,10 +853,16 @@ Costs VirtualScheduler::Summary() const {
           CalculateOutputSize(node_map_.at(node).output_properties, port);
     }
     Costs::NanoSeconds total_compute_time_ns;
+    bool is_total_cost_accurate = true;
     for (const auto& op_cost_pair : state.op_to_cost) {
       const auto& op = op_cost_pair.first;
       const auto& cost = op_cost_pair.second.execution_time.count();
       total_compute_time_ns += op_cost_pair.second.execution_time;
+      const bool is_op_cost_accurate = !op_cost_pair.second.inaccurate;
+      if (!is_op_cost_accurate) {
+        is_total_cost_accurate = false;
+      }
+
       int64 op_mem_usage = 0;
       auto it = op_to_memory.find(op);
       if (it != op_to_memory.end()) {
@@ -714,9 +874,9 @@ Costs VirtualScheduler::Summary() const {
                                : 0.0;
       if (cost || mem_usage_percent > 1.0) {
         // Print out only non-zero cost ops or ops with > 1% memory usage.
-        VLOG(1) << " + " << op << " : " << cost << " ("
-                << strings::HumanReadableNumBytes(op_mem_usage) << " ["
-                << mem_usage_percent << "%] "
+        VLOG(1) << " + " << op << " : " << (is_op_cost_accurate ? "" : "~")
+                << cost << " (" << strings::HumanReadableNumBytes(op_mem_usage)
+                << " [" << mem_usage_percent << "%] "
                 << (persisent_ops.count(op) > 0 ? ": persistent op)" : ")");
       }
     }
@@ -725,8 +885,9 @@ Costs VirtualScheduler::Summary() const {
     if (wall_time_ns.count() > 0) {
       utilization = total_compute_time_ns.count() * 100 / wall_time_ns.count();
     }
-    VLOG(1) << "Device = " << name
-            << ", total_compute_time_ns = " << total_compute_time_ns.count()
+    VLOG(1) << "Device = " << name << ", total_compute_time_ns = "
+            << (is_total_cost_accurate ? "" : "~")
+            << total_compute_time_ns.count()
             << ", utilization = " << utilization << "%";
 
     if (critical_path_costs.execution_time <= state.GetCurrTime()) {
@@ -738,8 +899,11 @@ Costs VirtualScheduler::Summary() const {
     // Also log the op description and their corresponding counts.
     VLOG(2) << "Node description, counts, cost:";
     for (const auto& item : op_counts_) {
+      int cost;
+      bool is_cost_accurate;
+      std::tie(cost, is_cost_accurate) = op_costs_.at(item.first);
       VLOG(2) << "Node: " << item.first << ", Count: " << item.second
-              << ", Individual Cost: " << op_costs_.at(item.first);
+              << ", Individual Cost: " << (is_cost_accurate ? "" : "~") << cost;
     }
   }
 
@@ -752,8 +916,7 @@ Costs VirtualScheduler::Summary(RunMetadata* metadata) {
   if (metadata != nullptr) {
     StepStats* stepstats = metadata->mutable_step_stats();
     for (const auto& device : device_) {
-      GraphDef* device_partition_graph =
-          metadata->mutable_partition_graphs()->Add();
+      GraphDef* device_partition_graph = metadata->add_partition_graphs();
       DeviceStepStats* device_stepstats = stepstats->add_dev_stats();
       device_stepstats->set_device(device.first);
       for (const auto& node_def : device.second.nodes_executed) {
@@ -804,7 +967,7 @@ Costs VirtualScheduler::Summary(RunMetadata* metadata) {
         mem_stats->set_host_persistent_memory_size(host_persistent_memory_size);
         mem_stats->set_device_persistent_memory_size(
             device_persistent_memory_size);
-        *device_partition_graph->mutable_node()->Add() = *node_def;
+        *device_partition_graph->add_node() = *node_def;
       }
     }
   }
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index c74d80c2bee9b99afbcd68cfc8a7d4177e3160bc..74088780cb3d655e2ae402a9e50fadaf5703a098 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -158,25 +158,8 @@ class LIFOManager : public ReadyNodeManager {
   LIFOManager() : ReadyNodeManager() {}
   ~LIFOManager() override {}
   void AddNode(const NodeDef* node) override { nodes_.push_back(node); }
-  const NodeDef* GetCurrNode() override {
-    CHECK(!nodes_.empty()) << "GetCurrNode(), but there's no ready node";
-    if (curr_pos_ == nodes_.end()) {
-      curr_pos_ = --(nodes_.rbegin().base());  // Last one in the list.
-    }
-    // Once curr_pos_ is set to a valid entry in the list, we keep using the
-    // cached curr_pos_ until RemoveCurrNode() is called. AddNode() will not
-    // change the GetCurrNode() return value.
-    return *curr_pos_;
-  }
-  void RemoveCurrNode() override {
-    // Make sure we have curr_pos_ ready to be removed.
-    GetCurrNode();
-    // Note curr_pos_ may not be pointing the last element if some nodes are
-    // added.
-    nodes_.erase(curr_pos_);
-
-    curr_pos_ = nodes_.end();  // Reset curr_pos_.
-  }
+  const NodeDef* GetCurrNode() override;
+  void RemoveCurrNode() override;
   bool Empty() const override { return nodes_.empty(); }
 
  private:
@@ -194,54 +177,16 @@ class LIFOManager : public ReadyNodeManager {
 class FirstReadyManager : public ReadyNodeManager {
  public:
   FirstReadyManager(
-      const std::unordered_map<const NodeDef*, NodeState>* node_state)
-      : ReadyNodeManager(), node_state_(node_state) {
-    std::make_heap(nodes_.begin(), nodes_.end());
-    greater_ = [this](const NodeDef* a, const NodeDef* b) -> bool {
-      // Note: we need a node with minimum time_ready, not
-      // maximum; hence, using a > b for comparison function.
-      return node_state_->at(a).time_ready > node_state_->at(b).time_ready;
-    };
-  }
+      const std::unordered_map<const NodeDef*, NodeState>* node_state);
   ~FirstReadyManager() override {}
-
   void AddNode(const NodeDef* node) override { waiting_queue_.push_back(node); }
-
-  const NodeDef* GetCurrNode() override {
-    if (nodes_.empty()) {
-      // Nothing in the node_; probably, the very first call. Move
-      // waiting_queue_ to node_.
-      _DrainWaitingQueue();
-      CHECK(!nodes_.empty()) << "GetCurrNode(), but there's no ready node";
-    }
-    return nodes_.front();
-  }
-
-  void RemoveCurrNode() override {
-    if (nodes_.empty()) {
-      // Make sure that there is a node to be removed at the front of nodes_.
-      GetCurrNode();
-    }
-    std::pop_heap(nodes_.begin(), nodes_.end(), greater_);
-    nodes_.pop_back();
-    _DrainWaitingQueue();
-  }
-
-  bool Empty() const override {
-    return nodes_.empty() && waiting_queue_.empty();
-  }
+  const NodeDef* GetCurrNode() override;
+  void RemoveCurrNode() override;
+  bool Empty() const override;
 
  private:
   // Move all the nodes in the waiting_queue_ to nodes_.
-  void _DrainWaitingQueue() {
-    for (const auto* node : waiting_queue_) {
-      // push_heap in AddNode() and pop_heap in RemoveCurrNode() guarantees that
-      // the first element is the node with minimum time_ready.
-      nodes_.push_back(node);
-      std::push_heap(nodes_.begin(), nodes_.end(), greater_);
-    }
-    waiting_queue_.clear();
-  }
+  void DrainWaitingQueue();
 
   // nodes_ is the main queue, where we construct heap, and the front is the
   // current node.
@@ -259,6 +204,41 @@ class FirstReadyManager : public ReadyNodeManager {
   const std::unordered_map<const NodeDef*, NodeState>* node_state_;
 };
 
+// CompositeNodeManager has a few other NodeManagers: per-device LIFO for normal
+// ops (neither _Send nor _Recv) and FirstyReadyManagers for _Send ops and _Recv
+// ops, and then it chooses FirstReady among the ops chosen from each
+// internal NodeManagers. The objective is to maximize producer-consumer
+// locality within device, while processing nodes across devices, including
+// _Send and _Recv, fairly, in terms of their time_ready.
+class CompositeNodeManager : public ReadyNodeManager {
+ public:
+  CompositeNodeManager(
+      const std::unordered_map<const NodeDef*, NodeState>* node_state);
+  ~CompositeNodeManager() override {}
+
+  void AddNode(const NodeDef* node) override;
+  const NodeDef* GetCurrNode() override;
+  void RemoveCurrNode() override;
+  bool Empty() const override;
+
+ private:
+  // Internal ready node managers:
+  // LIFO for normal ops to maximize producer consumer locality.
+  // One LIFO per device.
+  std::unordered_map<string, LIFOManager> ops_lifo_map_;
+  // FirstReady for send and recv. Handle send and recv separately ensures that
+  // send and recv do not block previously read ops with LIFO schedule.
+  FirstReadyManager send_manager_;
+  FirstReadyManager recv_manager_;
+
+  // NodeState structure from VirtualScheduler to get time_ready of ready nodes.
+  // Not owned by FirstReadyManager.
+  const std::unordered_map<const NodeDef*, NodeState>* node_state_;
+
+  // Cached curr node. Set back to nullptr from RemoveCurrNode().
+  const NodeDef* curr_node_;
+};
+
 // The virtual scheduler emulates execution of nodes in a graph, considering
 // dependencies, device, etc.
 class VirtualScheduler {
@@ -330,7 +310,10 @@ class VirtualScheduler {
 
   // Stats:
   std::map<string, int> op_counts_;  // Op counts with key with input shape.
-  std::map<string, int> op_costs_;   // Individual op costs (with input shapes).
+  // Individual op costs (with input shapes).
+  // Boolean field for whether the cost is accurate.
+  std::map<string, std::pair<int, bool>> op_costs_;
+
   Costs graph_costs_;                // Graph cost.
   std::map<string, Costs> op_to_cost_;  // Per-op cost.
 
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
index 412b494be730c21bf8b3d8bd791cc42dcbf15794..161e1d4ece0baef6f36d91a511d38a29f06b456a 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
@@ -44,8 +44,15 @@ class VirtualSchedulerTest : public ::testing::Test {
   NodeDef node1_, node2_, node3_, node4_, node5_, node6_;
   std::unordered_map<const NodeDef*, NodeState> node_states_;
 
+  // Device names:
   const string kCPU0 = "/job:localhost/replica:0/task:0/cpu:0";
   const string kCPU1 = "/job:localhost/replica:0/task:0/cpu:1";
+  const string kChannelFrom0To1 = "Channel from CPU0 to CPU1";
+  const string kChannelFrom1To0 = "Channel from CPU1 to CPU0";
+  // Op names:
+  const string kSend = "_Send";
+  const string kRecv = "_Recv";
+  const string kConv2D = "Conv2D";
 
   DeviceProperties GetDummyCPUDevice() {
     // Create CPU with 2 cores, 4 Ghz freq, 2 GB/s mem bandwidth.
@@ -59,29 +66,26 @@ class VirtualSchedulerTest : public ::testing::Test {
     return cpu_device;
   }
 
+  void NodeSetUp(const string& name, const string& op_name,
+                 const string& device_name, const uint64 time_ready,
+                 NodeDef* node) {
+    node->set_name(name);
+    node->set_op(op_name);
+    node->set_device(device_name);
+
+    node_states_[node] = NodeState();
+    node_states_[node].time_ready = time_ready;
+    node_states_[node].device_name = device_name;
+  }
+
   void SetUp() override {
-    // Initializes nodes for manager
-    node1_.set_name("Node1");
-    node2_.set_name("Node2");
-    node3_.set_name("Node3");
-    node4_.set_name("Node4");
-    node5_.set_name("Node5");
-    node6_.set_name("Node6");
-
-    // Initialize node_states, with time_ready in reverse order.
-    node_states_[&node1_] = NodeState();
-    node_states_[&node2_] = NodeState();
-    node_states_[&node3_] = NodeState();
-    node_states_[&node4_] = NodeState();
-    node_states_[&node5_] = NodeState();
-    node_states_[&node6_] = NodeState();
-
-    node_states_[&node6_].time_ready = 1000;
-    node_states_[&node5_].time_ready = 2000;
-    node_states_[&node4_].time_ready = 3000;
-    node_states_[&node3_].time_ready = 4000;
-    node_states_[&node2_].time_ready = 5000;
-    node_states_[&node1_].time_ready = 6000;
+    // node1_ to node6_ on kCPU0, with time_ready in reverse_order.
+    NodeSetUp("Node1", kConv2D, kCPU0, 6000, &node1_);
+    NodeSetUp("Node2", kConv2D, kCPU0, 5000, &node2_);
+    NodeSetUp("Node3", kConv2D, kCPU0, 4000, &node3_);
+    NodeSetUp("Node4", kConv2D, kCPU0, 3000, &node4_);
+    NodeSetUp("Node5", kConv2D, kCPU0, 2000, &node5_);
+    NodeSetUp("Node6", kConv2D, kCPU0, 1000, &node6_);
 
     // Initializes cluster_ and placer_.
     std::unordered_map<string, DeviceProperties> devices;
@@ -1207,15 +1211,9 @@ TEST_F(VirtualSchedulerTest, GetCurrNodeFirstReadyManager) {
   NodeDef node7;
   NodeDef node8;
   NodeDef node9;
-  node7.set_name("Node7");
-  node8.set_name("Node8");
-  node9.set_name("Node9");
-  node_states_[&node7] = NodeState();
-  node_states_[&node8] = NodeState();
-  node_states_[&node9] = NodeState();
-  node_states_[&node7].time_ready = 5;
-  node_states_[&node8].time_ready = 4;
-  node_states_[&node9].time_ready = 3;
+  NodeSetUp("Node7", kConv2D, kCPU0, 5, &node7);
+  NodeSetUp("Node8", kConv2D, kCPU0, 4, &node8);
+  NodeSetUp("Node9", kConv2D, kCPU0, 3, &node9);
 
   manager.AddNode(&node7);
   EXPECT_EQ("Node6", manager.GetCurrNode()->name());
@@ -1249,6 +1247,132 @@ TEST_F(VirtualSchedulerTest, GetCurrNodeFirstReadyManager) {
   EXPECT_TRUE(manager.Empty());
 }
 
+TEST_F(VirtualSchedulerTest, RemoveSingleNodeCompositeNodeManager) {
+  CompositeNodeManager manager = CompositeNodeManager(&node_states_);
+
+  manager.AddNode(&node1_);
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+TEST_F(VirtualSchedulerTest, RemoveSingleNodeComopsiteNodeManager) {
+  CompositeNodeManager manager = CompositeNodeManager(&node_states_);
+
+  manager.AddNode(&node1_);
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+TEST_F(VirtualSchedulerTest, GetAndRemoveMultipleComopsiteNodeManager) {
+  CompositeNodeManager manager = CompositeNodeManager(&node_states_);
+
+  // Add the nodes to LIFOManager.
+  manager.AddNode(&node1_);
+  manager.AddNode(&node2_);
+  manager.AddNode(&node3_);
+  manager.AddNode(&node4_);
+
+  // Keep checking current node as nodes are removed and added.
+  EXPECT_EQ("Node4", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node3", manager.GetCurrNode()->name());
+  manager.AddNode(&node5_);
+  // GetCurrNode()  should return the same node even if some nodes are added,
+  // until RemoveCurrNode() is called.
+  EXPECT_EQ("Node3", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node5", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node2", manager.GetCurrNode()->name());
+  manager.AddNode(&node6_);
+  EXPECT_EQ("Node2", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node6", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node1", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+TEST_F(VirtualSchedulerTest, MultiDeviceSendRecvComopsiteNodeManager) {
+  CompositeNodeManager manager = CompositeNodeManager(&node_states_);
+
+  // Additional nodes on kCPU1
+  NodeDef node7;
+  NodeDef node8;
+  NodeDef node9;
+  NodeSetUp("Node7", kConv2D, kCPU1, 1001, &node7);
+  NodeSetUp("Node8", kConv2D, kCPU1, 2001, &node8);
+  NodeSetUp("Node9", kConv2D, kCPU1, 3001, &node9);
+
+  // Send and Recv nodes.
+  NodeDef send1;
+  NodeDef send2;
+  NodeDef recv1;
+  NodeDef recv2;
+  NodeSetUp("Send1", kSend, kChannelFrom0To1, 2002, &send1);
+  NodeSetUp("Send2", kSend, kChannelFrom1To0, 2005, &send2);
+  NodeSetUp("Recv1", kRecv, kCPU0, 2003, &recv1);
+  NodeSetUp("Recv2", kRecv, kCPU1, 2003, &recv2);
+
+  // Insert nodes.
+  manager.AddNode(&node1_);
+  manager.AddNode(&node2_);
+  manager.AddNode(&node3_);
+  manager.AddNode(&node4_);
+  manager.AddNode(&node5_);
+  manager.AddNode(&node6_);
+  manager.AddNode(&node7);
+  manager.AddNode(&node8);
+  manager.AddNode(&node9);
+  manager.AddNode(&send1);
+  manager.AddNode(&send2);
+  manager.AddNode(&recv1);
+  manager.AddNode(&recv2);
+
+  // on kCPU0; last one is node6_, on kCPU1: last one is node9;
+  // so choose one that has earliest time_ready among node6_, node9,
+  // Send1, Send2, Recv1, and Recv2.
+  EXPECT_EQ("Node6", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  // Then, the next one on kCPU0 is node5_; choose the earliest time_ready node
+  // among node5_, node9, Send1, Send2, Recv1, and Recv2.
+  EXPECT_EQ("Node5", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  // Next, choose among node4_, node9, Send1, Send2, Recv1, and Recv2.
+  EXPECT_EQ("Send1", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  // Next, choose among node4_, node9, Sen2, Recv1, and Recv2.
+  EXPECT_EQ("Recv1", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  // Next, choose among node4_, node9, Send2, and Recv2.
+  EXPECT_EQ("Recv2", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  // Next, choose among node4_, node9, and Send2.
+  EXPECT_EQ("Send2", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  // Next, choose between node4_, node9.
+  EXPECT_EQ("Node4", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  // Next, choose between node3_, node9.
+  EXPECT_EQ("Node9", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  // Next, choose between node3_, node8.
+  EXPECT_EQ("Node8", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  // Next, choose between node3_, node7.
+  EXPECT_EQ("Node7", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  // Then, just the nodes on kCPU1 -- LIFO.
+  EXPECT_EQ("Node3", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node2", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node1", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
 // Create small graph, run predict costs on it, make sure the costs from the
 // summary match the hand-calculated costs.
 TEST_F(VirtualSchedulerTest, SummaryCostTest) {
@@ -1580,7 +1704,7 @@ TEST_F(VirtualSchedulerTest, WhileLoop) {
   EXPECT_NE(exit_start_micro, exit_1_start_micro);
 
   // Check dependency among the nodes; no matter what scheduling mechanism we
-  // use, the scheduled ops should follow these depedency chains.
+  // use, the scheduled ops should follow these dependency chains.
   // Note that currently, VirtualScheduler executes while/Merge twice; hence,
   // we're not testing dependency chains related to while/Merge.
   // TODO(dyoon): after fixing while loop behavior correctly (run nodes in the
@@ -1634,20 +1758,20 @@ TEST_F(VirtualSchedulerTest, InterDeviceTransfer) {
     const auto& name = x.first;
     const auto& node_info = x.second;
     const auto& op = node_info.op_info.op();
-    if (op == "_Recv") {
+    if (op == kRecv) {
       recv_op_names[get_port_num(name)] = name;
-    } else if (op == "_Send") {
+    } else if (op == kSend) {
       send_op_names[get_port_num(name)] = name;
     }
     op_count[op]++;
   }
 
   // Same number of _Send and _Recv.
-  EXPECT_EQ(op_count.at("_Send"), op_count.at("_Recv"));
+  EXPECT_EQ(op_count.at(kSend), op_count.at(kRecv));
 
   // Expect 4 Send and Recvs each: port 0, 1, and, 2, and control dependency.
-  EXPECT_EQ(op_count.at("_Recv"), 4);
-  EXPECT_EQ(op_count.at("_Send"), 4);
+  EXPECT_EQ(op_count.at(kRecv), 4);
+  EXPECT_EQ(op_count.at(kSend), 4);
 
   // Helper lambda for extracting output Tensor size.
   auto get_output_size = [this, ops_executed](const string& name) -> int64 {
diff --git a/tensorflow/core/grappler/grappler_item.cc b/tensorflow/core/grappler/grappler_item.cc
index 844a1fa3283722a5d5c7d4d862eb800224bd744d..149f6fc7353b3c96e9d780c20697873c15bccaa8 100644
--- a/tensorflow/core/grappler/grappler_item.cc
+++ b/tensorflow/core/grappler/grappler_item.cc
@@ -72,9 +72,11 @@ std::vector<const NodeDef*> GrapplerItem::MainVariables() const {
 std::unordered_set<string> GrapplerItem::NodesToPreserve() const {
   std::unordered_set<string> result;
   for (const string& f : fetch) {
+    VLOG(1) << "Add fetch " << f;
     result.insert(NodeName(f));
   }
   for (const auto& f : feed) {
+    VLOG(1) << "Add feed " << f.first;
     result.insert(NodeName(f.first));
   }
   for (const auto& node : init_ops) {
diff --git a/tensorflow/core/grappler/grappler_item_builder.cc b/tensorflow/core/grappler/grappler_item_builder.cc
index 3f6183b6f1ecb92dcc99abccacda74ceaf72cce0..866f87688c89d2c52773846711296c8e1f033a6e 100644
--- a/tensorflow/core/grappler/grappler_item_builder.cc
+++ b/tensorflow/core/grappler/grappler_item_builder.cc
@@ -126,9 +126,6 @@ Status OptimizeGraph(const GraphDef& graph_def_arg, GraphDef* output_graph_def,
   graph_ctor_opts.allow_internal_ops = true;
   graph_ctor_opts.expect_device_spec = false;
   std::unique_ptr<Graph> graphptr(new Graph(function_library));
-  // Populate default attrs to the NodeDefs in the GraphDef.
-  TF_RETURN_IF_ERROR(
-      AddDefaultAttrsToGraphDef(&graph_def, *graphptr->op_registry(), 0));
 
   TF_RETURN_IF_ERROR(
       ConvertGraphDefToGraph(graph_ctor_opts, graph_def, graphptr.get()));
@@ -138,7 +135,10 @@ Status OptimizeGraph(const GraphDef& graph_def_arg, GraphDef* output_graph_def,
   optimizer.Optimize(flr, env, devices[0], &graphptr, /*shape_map=*/nullptr);
   graphptr->ToGraphDef(output_graph_def);
 
-  return Status::OK();
+  // The default values of attributes might have been stripped by the optimizer.
+  // Add them back.
+  return AddDefaultAttrsToGraphDef(output_graph_def, *graphptr->op_registry(),
+                                   0);
 }
 
 // Applies the same graph pruning logic to the graph as Session.Run in TF.
@@ -173,7 +173,7 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
                  << ", skipping this input.";
       return nullptr;
     }
-    LOG(INFO) << "Will use feed node " << feed_name;
+    VLOG(1) << "Will use feed node " << feed_name;
     new_item->feed.emplace_back(feed_name, Tensor());
   }
 
@@ -188,7 +188,7 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
                      << ", skipping this input";
           return nullptr;
         }
-        LOG(INFO) << "Will use fetch node " << name;
+        VLOG(1) << "Will use fetch node " << name;
         new_item->fetch.push_back(name);
       }
     }
@@ -297,7 +297,7 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
   }
 
   for (auto& node : *new_item->graph.mutable_node()) {
-    if (IsPlaceholder(node)) {
+    if (IsPlaceholder(node) && node.op() != "PlaceholderWithDefault") {
       if (node.attr().count("dtype") == 0) {
         LOG(ERROR) << "Unknown type for placeholder " << node.name()
                    << ", skipping this input";
@@ -449,6 +449,18 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
     new_item->save_restore_loc_tensor = saver.filename_tensor_name();
   }
 
+  // Instantiate all the missing attributes with their default values.
+  Status attr_status = AddDefaultAttrsToGraphDef(
+      &new_item->graph,
+      FunctionLibraryDefinition(OpRegistry::Global(),
+                                new_item->graph.library()),
+      0);
+  if (!attr_status.ok()) {
+    LOG(ERROR) << "Failed to instantiate default attribute values: "
+               << attr_status.error_message();
+    return nullptr;
+  }
+
   // Optimize the graph (function inlining, l1 optimizations, etc).
   VLOG(1) << "Number of nodes in graph before OptimizeGraph: "
           << new_item->graph.node_size();
diff --git a/tensorflow/core/grappler/grappler_item_builder_test.cc b/tensorflow/core/grappler/grappler_item_builder_test.cc
index 4272179d3cbef35362dc3330b5d1b3076df9bdb1..09d9aa4ef19d9c2bfb7b60920537b77d9478680b 100644
--- a/tensorflow/core/grappler/grappler_item_builder_test.cc
+++ b/tensorflow/core/grappler/grappler_item_builder_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/cc/gradients/grad_testutil.h"
 #include "tensorflow/cc/ops/functional_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
@@ -253,6 +254,31 @@ TEST_F(GrapplerItemBuilderTest, AssetFilepathOverrideTest_FileNotAccessible) {
   ASSERT_TRUE(item == nullptr);
 }
 
+TEST_F(GrapplerItemBuilderTest, GraphWithFunctions) {
+  MetaGraphDef meta_graph;
+  // y = XTimesTwo(x)
+  constexpr char device[] = "/cpu:0";
+  *meta_graph.mutable_graph_def() = test::function::GDef(
+      {test::function::NDef("x", "Const", {}, {{"dtype", DT_FLOAT}}, device),
+       test::function::NDef("y", "XTimesTwo", {"x"}, {{"T", DT_FLOAT}},
+                            device)},
+      // FunctionLib
+      {
+          test::function::XTimesTwo(),
+      });
+
+  CollectionDef train_op;
+  train_op.mutable_node_list()->add_value("y");
+  (*meta_graph.mutable_collection_def())["train_op"] = train_op;
+
+  ItemConfig cfg;
+  cfg.inline_functions = false;
+
+  std::unique_ptr<GrapplerItem> item =
+      GrapplerItemFromMetaGraphDef("0", meta_graph, cfg);
+  ASSERT_TRUE(item != nullptr);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index acb849814217b97600a9cfc6b730838e0733f86b..e4c1da52ec0051b9ecee17c85ae3cff627e5558f 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -13,24 +13,67 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <unordered_set>
+
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace grappler {
 
-bool IsAddN(const NodeDef& node) {
-  const auto op = node.op();
-  return op == "AddN";
+bool IsAdd(const NodeDef& node) {
+  if (node.op() == "AddV2" || node.op() == "Add") {
+    DataType type = node.attr().at("T").type();
+    return type != DT_STRING;
+  }
+  return false;
 }
 
-bool IsConcat(const NodeDef& node) {
-  const auto op = node.op();
-  return op == "Concat" || op == "ConcatV2";
+bool IsAddN(const NodeDef& node) { return node.op() == "AddN"; }
+
+bool IsAnyDiv(const NodeDef& node) {
+  return node.op() == "RealDiv" || node.op() == "Div" ||
+         node.op() == "FloorDiv" || node.op() == "TruncateDiv";
 }
 
-bool IsConstant(const NodeDef& node) {
-  const auto op = node.op();
-  return op == "Const";
+bool IsAvgPoolGrad(const NodeDef& node) { return node.op() == "AvgPoolGrad"; }
+
+bool IsAssert(const NodeDef& node) { return node.op() == "Assert"; }
+
+bool IsBiasAdd(const NodeDef& node) {
+  return node.op() == "BiasAdd" || node.op() == "BiasAddV1";
+}
+
+bool IsBiasAddGrad(const NodeDef& node) { return node.op() == "BiasAddGrad"; }
+
+bool IsConcatOffset(const NodeDef& node) { return node.op() == "ConcatOffset"; }
+
+bool IsConstant(const NodeDef& node) { return node.op() == "Const"; }
+
+bool IsConv2D(const NodeDef& node) { return node.op() == "Conv2D"; }
+
+bool IsConv2DBackpropFilter(const NodeDef& node) {
+  return node.op() == "Conv2DBackpropFilter";
+}
+
+bool IsConv2DBackpropInput(const NodeDef& node) {
+  return node.op() == "Conv2DBackpropInput";
+}
+
+bool IsDepthwiseConv2dNative(const NodeDef& node) {
+  return node.op() == "DepthwiseConv2dNative";
+}
+
+bool IsDepthwiseConv2dNativeBackpropFilter(const NodeDef& node) {
+  return node.op() == "DepthwiseConv2dNativeBackpropFilter";
+}
+
+bool IsDepthwiseConv2dNativeBackpropInput(const NodeDef& node) {
+  return node.op() == "DepthwiseConv2dNativeBackpropInput";
 }
 
 bool IsDequeueOp(const NodeDef& node) {
@@ -40,6 +83,10 @@ bool IsDequeueOp(const NodeDef& node) {
          op == "QueueDequeueUpToV2" || op == "QueueDequeueUpTo";
 }
 
+bool IsDiv(const NodeDef& node) { return node.op() == "Div"; }
+
+bool IsEluGrad(const NodeDef& node) { return node.op() == "EluGrad"; }
+
 bool IsEnter(const NodeDef& node) {
   const auto& op = node.op();
   return op == "Enter" || op == "RefEnter";
@@ -50,43 +97,65 @@ bool IsExit(const NodeDef& node) {
   return op == "Exit" || op == "RefExit";
 }
 
+bool IsFloorMod(const NodeDef& node) { return node.op() == "FloorMod"; }
+
+bool IsFusedBatchNormGradV1(const NodeDef& node) {
+  return node.op() == "FusedBatchNormGrad";
+}
+
 bool IsIdentity(const NodeDef& node) {
   const auto& op = node.op();
   return op == "Identity" || op == "RefIdentity";
 }
 
+bool IsInvGrad(const NodeDef& node) { return node.op() == "InvGrad"; }
+
+bool IsMatMul(const NodeDef& node) {
+  const auto& op = node.op();
+  return op == "MatMul" || op == "BatchMatMul" || op == "QuantizedMatMul" ||
+         op == "SparseMatMul";
+}
+
 bool IsMerge(const NodeDef& node) {
-  const auto op = node.op();
+  const auto& op = node.op();
   return op == "Merge" || op == "RefMerge";
 }
 
-bool IsNoOp(const NodeDef& node) {
-  const auto op = node.op();
-  return op == "NoOp";
-}
+bool IsMul(const NodeDef& node) { return node.op() == "Mul"; }
+
+bool IsNoOp(const NodeDef& node) { return node.op() == "NoOp"; }
 
 bool IsNextIteration(const NodeDef& node) {
   const auto& op = node.op();
   return op == "NextIteration" || op == "RefNextIteration";
 }
 
+bool IsPad(const NodeDef& node) { return node.op() == "Pad"; }
+
 bool IsPlaceholder(const NodeDef& node) {
-  const auto op = node.op();
+  const auto& op = node.op();
   return op == "Placeholder" || op == "PlaceholderV2" ||
          op == "PlaceholderWithDefault";
 }
 
-bool IsRecv(const NodeDef& node) {
-  const auto op = node.op();
-  return op == "_Recv";
+bool IsRealDiv(const NodeDef& node) { return node.op() == "RealDiv"; }
+
+bool IsReciprocalGrad(const NodeDef& node) {
+  return node.op() == "ReciprocalGrad";
 }
 
+bool IsRecv(const NodeDef& node) { return node.op() == "_Recv"; }
+
 bool IsReduction(const NodeDef& node) {
   const auto& op = node.op();
   return op == "Sum" || op == "Prod" || op == "Min" || op == "Max" ||
          op == "Mean" || op == "Any" || op == "All";
 }
 
+bool IsReluGrad(const NodeDef& node) { return node.op() == "ReluGrad"; }
+
+bool IsRelu6Grad(const NodeDef& node) { return node.op() == "Relu6Grad"; }
+
 bool IsReshape(const NodeDef& node) { return (node.op() == "Reshape"); }
 
 bool IsRestore(const NodeDef& node) {
@@ -94,31 +163,126 @@ bool IsRestore(const NodeDef& node) {
           node.op() == "RestoreSlice");
 }
 
-bool IsSend(const NodeDef& node) {
-  const auto op = node.op();
-  return op == "_Send";
+bool IsRsqrtGrad(const NodeDef& node) { return node.op() == "RsqrtGrad"; }
+
+bool IsSeluGrad(const NodeDef& node) { return node.op() == "SeluGrad"; }
+
+bool IsSend(const NodeDef& node) { return node.op() == "_Send"; }
+
+bool IsShape(const NodeDef& node) { return node.op() == "Shape"; }
+
+bool IsShapeN(const NodeDef& node) { return node.op() == "ShapeN"; }
+
+bool IsSigmoidGrad(const NodeDef& node) { return node.op() == "SigmoidGrad"; }
+
+bool IsSlice(const NodeDef& node) { return node.op() == "Slice"; }
+
+bool IsSoftplusGrad(const NodeDef& node) { return node.op() == "SoftplusGrad"; }
+
+bool IsSoftsignGrad(const NodeDef& node) { return node.op() == "SoftsignGrad"; }
+
+bool IsSplit(const NodeDef& node) { return node.op() == "Split"; }
+
+bool IsSqrtGrad(const NodeDef& node) { return node.op() == "SqrtGrad"; }
+
+bool IsSquaredDifference(const NodeDef& node) {
+  return node.op() == "SquaredDifference";
 }
 
+bool IsSqueeze(const NodeDef& node) { return node.op() == "Squeeze"; }
+
 bool IsStopGradient(const NodeDef& node) {
   const auto& op = node.op();
   return op == "StopGradient" || op == "PreventGradient";
 }
 
+bool IsSub(const NodeDef& node) { return node.op() == "Sub"; }
+
+bool IsSum(const NodeDef& node) { return node.op() == "Sum"; }
+
 bool IsSwitch(const NodeDef& node) {
   const auto& op = node.op();
   return op == "Switch" || op == "RefSwitch";
 }
 
-bool IsTranspose(const NodeDef& node) {
-  const auto op = node.op();
-  return op == "Transpose";
-}
+bool IsTanhGrad(const NodeDef& node) { return node.op() == "TanhGrad"; }
+
+bool IsTranspose(const NodeDef& node) { return node.op() == "Transpose"; }
 
 bool IsVariable(const NodeDef& node) {
-  const auto op = node.op();
+  const auto& op = node.op();
   return op == "Variable" || op == "VariableV2" || op == "AutoReloadVariable" ||
          op == "VarHandleOp" || op == "ReadVariableOp";
 }
 
-}  // end namespace grappler
+namespace {
+bool GetBoolAttr(const NodeDef& node, const string& name) {
+  return node.attr().count(name) > 0 && node.attr().at(name).b();
+}
+}  // namespace
+
+bool IsFreeOfSideEffect(const NodeDef& node) {
+  // Placeholders must be preserved to keep the graph feedable.
+  if (IsPlaceholder(node)) {
+    return false;
+  }
+  const OpDef* op_def = nullptr;
+  Status status = OpRegistry::Global()->LookUpOpDef(node.op(), &op_def);
+  if (!status.ok()) {
+    return false;
+  }
+  if (op_def->is_stateful()) {
+    return false;
+  }
+  // Nodes such as Assign or AssignAdd modify one of their inputs.
+  for (const auto& input : op_def->input_arg()) {
+    if (input.is_ref()) {
+      return false;
+    }
+  }
+  // Some nodes do in-place updates on regular tensor inputs.
+  if (GetBoolAttr(node, "in_place") || GetBoolAttr(node, "inplace")) {
+    return false;
+  }
+  return true;
+}
+
+bool ModifiesFrameInfo(const NodeDef& node) {
+  return IsEnter(node) || IsExit(node) || IsNextIteration(node);
+}
+
+#define OPDEF_PROPERTY_HELPER(PROPERTY_CAP, PROPERTY)                      \
+  bool Is##PROPERTY_CAP(const NodeDef& node) {                             \
+    if (node.op() == "Add") {                                              \
+      /* Workaround for "Add" not being marked is_commutative and */       \
+      /* is_aggregate. (See cl/173915048). */                              \
+      const auto type = GetDataTypeFromAttr(node, "T");                    \
+      return type != DT_INVALID && type != DT_STRING;                      \
+    }                                                                      \
+    const OpDef* op_def = nullptr;                                         \
+    Status status = OpRegistry::Global()->LookUpOpDef(node.op(), &op_def); \
+    return status.ok() && op_def->is_##PROPERTY();                         \
+  }
+
+OPDEF_PROPERTY_HELPER(Aggregate, aggregate)
+OPDEF_PROPERTY_HELPER(Commutative, commutative)
+
+bool IsInvolution(const NodeDef& node) {
+  const std::unordered_set<string> involution_ops{
+      "Conj", "Reciprocal", "Invert", "Neg", "LogicalNot"};
+  return involution_ops.count(node.op()) > 0;
+}
+
+bool IsValuePreserving(const NodeDef& node) {
+  if (NumNonControlInputs(node) == 1 && IsAggregate(node)) {
+    return true;
+  }
+  const std::unordered_set<string> value_preserving_ops{
+      "Transpose",  "Reshape",      "Identity",        "InvertPermutation",
+      "Reverse",    "StopGradient", "PreventGradient", "CheckNumerics",
+      "ExpandDims", "Squeeze"};
+  return value_preserving_ops.count(node.op()) > 0;
+}
+
+}  // namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index 0de954fcb45366e40bb89d3704fec496cd514b41..0e246a661f0749b5c28e501f6e614e250bc71fef 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -17,31 +17,91 @@ limitations under the License.
 #define TENSORFLOW_GRAPPLER_OP_TYPES_H_
 
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace grappler {
 
+bool IsAdd(const NodeDef& node);
 bool IsAddN(const NodeDef& node);
-bool IsConcat(const NodeDef& node);
+bool IsAnyDiv(const NodeDef& node);
+bool IsAvgPoolGrad(const NodeDef& node);
+bool IsAssert(const NodeDef& node);
+bool IsBiasAdd(const NodeDef& node);
+bool IsBiasAddGrad(const NodeDef& node);
+bool IsConcatOffset(const NodeDef& node);
 bool IsConstant(const NodeDef& node);
+bool IsConv2D(const NodeDef& node);
+bool IsConv2DBackpropFilter(const NodeDef& node);
+bool IsConv2DBackpropInput(const NodeDef& node);
+bool IsDepthwiseConv2dNative(const NodeDef& node);
+bool IsDepthwiseConv2dNativeBackpropFilter(const NodeDef& node);
+bool IsDepthwiseConv2dNativeBackpropInput(const NodeDef& node);
 bool IsDequeueOp(const NodeDef& node);
+bool IsDiv(const NodeDef& node);
+bool IsEluGrad(const NodeDef& node);
 bool IsEnter(const NodeDef& node);
 bool IsExit(const NodeDef& node);
+bool IsFloorMod(const NodeDef& node);
+bool IsFusedBatchNormGradV1(const NodeDef& node);
 bool IsIdentity(const NodeDef& node);
+bool IsInvGrad(const NodeDef& node);
 bool IsMerge(const NodeDef& node);
+bool IsMul(const NodeDef& node);
+bool IsMatMul(const NodeDef& node);
 bool IsNextIteration(const NodeDef& node);
+bool IsPad(const NodeDef& node);
 bool IsNoOp(const NodeDef& node);
 bool IsPlaceholder(const NodeDef& node);
+bool IsRealDiv(const NodeDef& node);
+bool IsRelu6Grad(const NodeDef& node);
+bool IsReluGrad(const NodeDef& node);
+bool IsReciprocalGrad(const NodeDef& node);
 bool IsRecv(const NodeDef& node);
 bool IsReduction(const NodeDef& node);
 bool IsReshape(const NodeDef& node);
 bool IsRestore(const NodeDef& node);
+bool IsRsqrtGrad(const NodeDef& node);
+bool IsSeluGrad(const NodeDef& node);
 bool IsSend(const NodeDef& node);
+bool IsSlice(const NodeDef& node);
+bool IsShape(const NodeDef& node);
+bool IsShapeN(const NodeDef& node);
+bool IsSigmoidGrad(const NodeDef& node);
+bool IsSoftplusGrad(const NodeDef& node);
+bool IsSoftsignGrad(const NodeDef& node);
+bool IsSplit(const NodeDef& node);
+bool IsSqrtGrad(const NodeDef& node);
+bool IsSquaredDifference(const NodeDef& node);
+bool IsSqueeze(const NodeDef& node);
 bool IsStopGradient(const NodeDef& node);
+bool IsSub(const NodeDef& node);
+bool IsSum(const NodeDef& node);
 bool IsSwitch(const NodeDef& node);
+bool IsTanhGrad(const NodeDef& node);
 bool IsTranspose(const NodeDef& node);
 bool IsVariable(const NodeDef& node);
 
+// Return true if the op is an aggregation (e.g. Add, AddN).
+// Returns false if it could not be determined to be so.
+bool IsAggregate(const NodeDef& node);
+
+// Return true if the op is commutative (e.g. Mul, Add).
+// Returns false if it could not be determined to be so.
+bool IsCommutative(const NodeDef& node);
+
+bool IsFreeOfSideEffect(const NodeDef& node);
+bool ModifiesFrameInfo(const NodeDef& node);
+
+// Returns true if the op is an element-wise involution, i.e. if it is its
+// own inverse such that f(f(x)) == x.
+bool IsInvolution(const NodeDef& node);
+
+// Returns true if the op in node only rearranges the order of elements in its
+// first input tensor and possible changes its shape. More precisely, this
+// function returns true if the op commutes with all element-wise operations.
+bool IsValuePreserving(const NodeDef& node);
+
 }  // end namespace grappler
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index dbfa8ae503f66fb0bf39ef6cda8bb683d3af2851..e557adc2111608f6a77f292275200a8b5dfab9ec 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -96,6 +96,7 @@ cc_library(
         ":graph_optimizer",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
@@ -202,7 +203,6 @@ cc_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
-        ":arithmetic_optimizer",
         ":constant_folding",
         ":graph_optimizer",
         "//tensorflow/core:framework",
@@ -213,7 +213,7 @@ cc_library(
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
-        "//tensorflow/core/grappler/utils:frame",
+        "//tensorflow/core/grappler/utils:topological_sort",
     ],
 )
 
@@ -232,6 +232,7 @@ tf_cc_test(
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+        "//tensorflow/core/grappler/utils:topological_sort",
     ],
 )
 
@@ -314,6 +315,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":graph_optimizer",
+        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:devices",
@@ -322,6 +324,7 @@ cc_library(
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
         "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/costs:virtual_placer",
         "//tensorflow/core/grappler/utils:frame",
     ],
 )
@@ -332,12 +335,18 @@ tf_cc_test(
     deps = [
         ":layout_optimizer",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/core:all_kernels",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
     ],
 )
 
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 2394c07e18703180bc4109d9de96947c03c83851..d6bc8614f91af85229c2ebb8b7040a218c594c81 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -25,10 +25,12 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/frame.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -37,6 +39,8 @@ limitations under the License.
 #include "tensorflow/core/util/device_name_utils.h"
 #include "tensorflow/core/util/saved_tensor_slice_util.h"
 
+using tensorflow::strings::StrCat;
+
 namespace tensorflow {
 namespace grappler {
 namespace {
@@ -80,23 +84,6 @@ Status SetTensorValue(DataType dtype, int value, Tensor* tensor) {
   return Status::OK();
 }
 
-bool IsInvolution(const NodeDef& node) {
-  const std::unordered_set<string> involution_ops = {
-      "Conj", "Reciprocal", "Invert", "Neg", "LogicalNot"};
-  return involution_ops.count(node.op()) > 0;
-}
-
-// Returns true if the op in node only rearranges the order of elements in an
-// input tensor, or more specifically, if it commutes with all element-wise
-// operations on the values.
-bool IsValuePreserving(const NodeDef& node) {
-  const std::unordered_set<string> value_preserving_ops = {
-      "Transpose",  "Reshape",      "Identity",        "InvertPermutation",
-      "Reverse",    "StopGradient", "PreventGradient", "CheckNumerics",
-      "ExpandDims", "Squeeze"};
-  return value_preserving_ops.count(node.op()) > 0;
-}
-
 template <typename T>
 bool AreInversePermutations(const std::vector<T>& a, const std::vector<T>& b) {
   if (a.size() != b.size()) {
@@ -185,44 +172,6 @@ bool IsInnerMatrixTransposeNode(const NodeDef& transpose_node,
   return false;
 }
 
-bool SimplyReordersData(const NodeDef& node) {
-  return node.op() == "Transpose";
-}
-
-// Follow a chain (through input(0)) of ops starting at `source->input(0)` as
-// long as they
-//  1. preserve the values of their first input,
-//  2. have a single (non-control) output,
-//  3. are not in nodes_to_preserve.
-// Returns the last node in the chain satisfying these properties or source
-// itself if a chain of length zero was found.
-//
-// source <- vp <- vp <- vp <- non_vp
-//                       ^^
-//                   return value
-NodeDef* GetTailOfValuePreservingChain(
-    const NodeDef* source, const NodeMap* node_map,
-    const std::unordered_set<string>& nodes_to_preserve) {
-  const NodeDef* source_parent = source;
-  if (!IsControlInput(source->input(0))) {
-    source = node_map->GetNode(source->input(0));
-    while (IsValuePreserving(*source) &&
-           node_map->GetOutputs(source->name()).size() == 1 &&
-           // Do not skip over preserved nodes, because folding will change
-           // the results of these skipped data-reordering nodes.
-           // TODO(jingyue): A more elegant way is to copy this chain of
-           // data-reordering nodes and modify only the copy.
-           !nodes_to_preserve.count(source->name())) {
-      source_parent = source;
-      if (IsControlInput(source->input(0))) {
-        break;
-      }
-      source = node_map->GetNode(source->input(0));
-    }
-  }
-  return const_cast<NodeDef*>(source_parent);
-}
-
 bool MaybeAddControlInput(const string& new_input, NodeDef* node,
                           GraphDef* graph, NodeMap* node_map) {
   bool already_exists = false;
@@ -253,43 +202,6 @@ int CopyControlInputs(const NodeDef& from, NodeDef* to, GraphDef* graph,
   return num_copied;
 }
 
-// Returns the data type in attribute `attr_name` of `node`. If that attribute
-// doesn't exist, returns DT_INVALID.
-DataType GetDataTypeFromAttr(const NodeDef& node, const string& attr_name) {
-  if (!node.attr().count(attr_name)) {
-    return DT_INVALID;
-  }
-  const auto& attr = node.attr().at(attr_name);
-  if (attr.value_case() != AttrValue::kType) {
-    return DT_INVALID;
-  }
-  return attr.type();
-}
-
-bool IsCommutative(const NodeDef& node) {
-  if (node.op() == "Add" && node.input_size() > 0) {
-    // Workaround for "Add" not being marked is_commutative and is_aggregate.
-    // (See cl/173915048).
-    const auto type = GetDataTypeFromAttr(node, "T");
-    return type != DT_INVALID && type != DT_STRING;
-  }
-  const OpDef* op_def = nullptr;
-  const Status status = OpRegistry::Global()->LookUpOpDef(node.op(), &op_def);
-  return status.ok() && op_def->is_commutative();
-}
-
-bool IsAggregate(const NodeDef& node) {
-  if (node.op() == "Add" && node.input_size() > 0) {
-    // Workaround for "Add" not being marked is_commutative and is_aggregate.
-    // (See cl/173915048).
-    const auto type = GetDataTypeFromAttr(node, "T");
-    return type != DT_INVALID && type != DT_STRING;
-  }
-  const OpDef* op_def = nullptr;
-  const Status status = OpRegistry::Global()->LookUpOpDef(node.op(), &op_def);
-  return status.ok() && op_def->is_aggregate();
-}
-
 void SetDataTypeToAttr(DataType dtype, const string& attr_name, NodeDef* node) {
   (*node->mutable_attr())[attr_name].set_type(dtype);
 }
@@ -341,6 +253,30 @@ bool IsNumberType(DataType dtype) {
 
 const char kOutputShapesAttr[] = "_output_shapes";
 
+PartialTensorShape GetInputShape(const string& input, const NodeMap& node_map) {
+  int output_pos;
+  string node_name = ParseNodeName(input, &output_pos);
+  const NodeDef* input_node = node_map.GetNode(node_name);
+  return input_node->attr().at(kOutputShapesAttr).list().shape(output_pos);
+}
+
+bool ShapesEqual(const string& input_x, const string& input_y,
+                 const NodeMap& node_map) {
+  PartialTensorShape x_shape = GetInputShape(input_x, node_map);
+  PartialTensorShape y_shape = GetInputShape(input_y, node_map);
+  if (x_shape.unknown_rank() || y_shape.unknown_rank() ||
+      x_shape.dims() != y_shape.dims()) {
+    return false;
+  }
+  for (int i = 0; i < x_shape.dims(); ++i) {
+    if (x_shape.dim_size(i) == -1 || y_shape.dim_size(i) == -1 ||
+        x_shape.dim_size(i) != y_shape.dim_size(i)) {
+      return false;
+    }
+  }
+  return true;
+}
+
 // Returns whether `reshape` is an identity op. The tensor that `reshape`
 // reshapes is the `output_pos`-th output of node `input`.
 bool ReshapeIsIdentity(const NodeDef& reshape, const NodeDef& input,
@@ -387,28 +323,16 @@ bool ReshapeIsIdentity(const NodeDef& reshape, const NodeDef& input,
   return true;
 }
 
-// Fix frame dependencies by adding control dependencies from old_input to nodes
-// in new_nodes_for_control_dep, and update frame_map for all nodes in
-// new_nodes.
-void AddFrameControlDeps(const NodeDef* old_node,
-                         const std::vector<NodeDef*>& new_nodes,
-                         const string& source_for_ctrl_dep,
-                         const std::vector<NodeDef*>& sinks_for_control_dep,
-                         GraphDef* graph, NodeMap* node_map,
-                         FrameMap* frame_map) {
-  const auto frame_it = frame_map->find(old_node);
-  if (frame_it != frame_map->end()) {
-    for (auto node : new_nodes) {
-      frame_map->emplace(node, frame_it->second);
-    }
-    if (!source_for_ctrl_dep.empty() && !sinks_for_control_dep.empty()) {
-      const string ctrl_dep = ConstantFolding::AddControlDependency(
-          source_for_ctrl_dep, graph, node_map);
-      for (auto node : sinks_for_control_dep) {
-        node->add_input(ctrl_dep);
-      }
-    }
-  }
+NodeDef* GetTailOfValuePreservingChain(
+    const NodeDef& node, const NodeMap& node_map,
+    const std::unordered_set<string>& nodes_to_preserve) {
+  auto is_value_preserving_non_branching = [&](const NodeDef& node) {
+    return IsValuePreserving(node) &&
+           NumNonControlOutputs(node, node_map) == 1 &&
+           nodes_to_preserve.count(node.name()) == 0;
+  };
+  return GetTailOfChain(node, node_map, /*follow_control_input=*/false,
+                        is_value_preserving_non_branching);
 }
 
 }  // namespace
@@ -516,68 +440,75 @@ bool UniqueNodes::SameNode(const NodeDef& node1, const NodeDef& node2) const {
   return true;
 }
 
-// static
-bool ArithmeticOptimizer::CanDedup(
-    const NodeDef& node, const std::unordered_set<string>& nodes_to_preserve) {
-  if (nodes_to_preserve.find(node.name()) != nodes_to_preserve.end()) {
-    return false;
+NodeDef* ArithmeticOptimizer::AddNode(const string& name,
+                                      const NodeDef* node_to_copy) {
+  NodeDef* new_node = optimized_graph_->add_node();
+  const string name_with_prefix =
+      AddPrefixToNodeName(name, kArithmeticOptimizer);
+  node_map_->AddNode(NodeName(name_with_prefix), new_node);
+  if (node_to_copy != nullptr) {
+    *new_node = *node_to_copy;
   }
-  if (IsEnter(node) || IsExit(node) || IsPlaceholder(node)) {
-    return false;
-  }
-  if (node.device().find("SPU") != string::npos) {
+  new_node->set_name(name_with_prefix);
+  return new_node;
+}
+
+bool ArithmeticOptimizer::OptimizedNodeExists(const string& name) {
+  const string name_with_prefix =
+      AddPrefixToNodeName(name, kArithmeticOptimizer);
+  return node_map_->NodeExists(name_with_prefix);
+}
+
+bool ArithmeticOptimizer::CanDedup(const NodeDef& node) const {
+  if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) {
     return false;
   }
-  const OpDef* op_def = nullptr;
-  Status status = OpRegistry::Global()->LookUpOpDef(node.op(), &op_def);
-  if (!status.ok()) {
+  if (IsEnter(node) || IsExit(node)) {
     return false;
   }
-  if (op_def->is_stateful()) {
+  if (node.device().find("SPU") != string::npos) {
     return false;
   }
-  // Don't consolidate ops such as AssignAdd
-  for (const auto& input : op_def->input_arg()) {
-    if (input.is_ref()) {
-      return false;
-    }
+  // Workaround for Assert mistakenly being labeled as stateful.
+  if (IsAssert(node)) {
+    return true;
   }
-  return true;
+  return IsFreeOfSideEffect(node);
 }
 
-void ArithmeticOptimizer::DedupComputations(GraphDef* optimized_graph) const {
-  NodeMap map(optimized_graph);
+void ArithmeticOptimizer::DedupComputations() {
   bool stop = true;
   std::set<int> duplicates;
   do {
     stop = true;
     UniqueNodes nodes;
-    for (int i = 0; i < optimized_graph->node_size(); ++i) {
+    for (int i = 0; i < optimized_graph_->node_size(); ++i) {
       if (duplicates.find(i) != duplicates.end()) {
         continue;
       }
-      NodeDef* node = optimized_graph->mutable_node(i);
-      if (!CanDedup(*node, nodes_to_preserve_)) {
+      NodeDef* node = optimized_graph_->mutable_node(i);
+      if (!CanDedup(*node)) {
         continue;
       }
       NodeDef* rep = nodes.FindOrAddRepresentative(node);
       if (rep == node) {
         continue;
       }
-      const std::set<NodeDef*>& fanouts = map.GetOutputs(node->name());
+      const std::set<NodeDef*>& fanouts = node_map_->GetOutputs(node->name());
       for (NodeDef* fanout : fanouts) {
         for (string& name : *fanout->mutable_input()) {
           int position;
-          string nodename = ParseNodeName(name, &position);
+          const string nodename = ParseNodeName(name, &position);
           if (nodename == node->name()) {
+            // Update name in-place.
             if (position > 0) {
-              name = strings::StrCat(rep->name(), ":", position);
+              name = StrCat(rep->name(), ":", position);
             } else if (position == 0) {
               name = rep->name();
             } else {
-              name = strings::StrCat("^", rep->name());
+              name = StrCat("^", rep->name());
             }
-            map.AddOutput(rep->name(), fanout->name());
+            node_map_->AddOutput(rep->name(), fanout->name());
           }
         }
       }
@@ -587,21 +518,41 @@ void ArithmeticOptimizer::DedupComputations(GraphDef* optimized_graph) const {
   } while (!stop);
 
   // Delete duplicates
-  if (!duplicates.empty()) {
-    int last = optimized_graph->node_size() - 1;
+  if (fetch_nodes_known_ && !duplicates.empty()) {
+    int last = optimized_graph_->node_size() - 1;
     for (auto it = duplicates.rbegin(); it != duplicates.rend(); ++it) {
       int index = *it;
-      optimized_graph->mutable_node()->SwapElements(index, last);
+      optimized_graph_->mutable_node()->SwapElements(index, last);
       last--;
     }
-    optimized_graph->mutable_node()->DeleteSubrange(last + 1,
-                                                    duplicates.size());
+    optimized_graph_->mutable_node()->DeleteSubrange(last + 1,
+                                                     duplicates.size());
+    // Rebuild the NodeMap which was invalidated by the node  swapping above.
+    node_map_.reset(new NodeMap(optimized_graph_));
+  }
+}
+
+void ArithmeticOptimizer::AddFrameControlDeps(
+    const NodeDef* old_node, const std::vector<NodeDef*>& new_nodes,
+    const string& source_for_ctrl_dep,
+    const std::vector<NodeDef*>& sinks_for_control_dep) {
+  const auto frame_it = frame_map_.find(old_node);
+  if (frame_it != frame_map_.end()) {
+    for (auto node : new_nodes) {
+      frame_map_.emplace(node, frame_it->second);
+    }
+    if (!source_for_ctrl_dep.empty() && !sinks_for_control_dep.empty()) {
+      const string ctrl_dep = ConstantFolding::AddControlDependency(
+          source_for_ctrl_dep, optimized_graph_, node_map_.get());
+      for (auto node : sinks_for_control_dep) {
+        MaybeAddControlInput(ctrl_dep, node, optimized_graph_, node_map_.get());
+      }
+    }
   }
 }
 
 string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
-    const NodeDef* node, GraphDef* graph_def, NodeMap* node_map,
-    std::vector<const NodeDef*>* new_nodes, FrameMap* frame_map) const {
+    const NodeDef* node, SetVector<NodeDef*>* nodes_to_simplify) {
   // Remove involutions applied twice.
   if (IsInvolution(*node)) {
     // An involution is an element-wise function f(x) that is its own inverse,
@@ -611,8 +562,8 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     // the two instances of the involution from the graph, since they cancel
     // each other.
     NodeDef* tail =
-        GetTailOfValuePreservingChain(node, node_map, nodes_to_preserve_);
-    NodeDef* involution = node_map->GetNode(tail->input(0));
+        GetTailOfValuePreservingChain(*node, *node_map_, nodes_to_preserve_);
+    NodeDef* involution = node_map_->GetNode(tail->input(0));
     if (involution->op() == node->op()) {
       // Skip both *node and *involution since they cancel each other.
       if (tail == node) {
@@ -620,8 +571,8 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
         return involution->input(0);
       } else {
         tail->set_input(0, involution->input(0));
-        node_map->UpdateInput(tail->name(), involution->name(),
-                              involution->input(0));
+        node_map_->UpdateInput(tail->name(), involution->name(),
+                               involution->input(0));
         return node->input(0);
       }
     }
@@ -629,10 +580,10 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
 
   // Remove inverse transposes.
   if (node->op() == "Transpose" || node->op() == "ConjugateTranspose") {
-    const NodeDef* input = node_map->GetNode(node->input(0));
+    NodeDef* input = node_map_->GetNode(node->input(0));
     if (input->op() == node->op()) {
-      const NodeDef* node_perm = node_map->GetNode(node->input(1));
-      const NodeDef* input_perm = node_map->GetNode(input->input(1));
+      const NodeDef* node_perm = node_map_->GetNode(node->input(1));
+      const NodeDef* input_perm = node_map_->GetNode(input->input(1));
       // Try 32-bit indices.
       std::vector<int> node_perm_values;
       std::vector<int> input_perm_values;
@@ -669,14 +620,14 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     //      ^      |
     //      |      |
     //    input ---+
-    NodeDef* reshape = node_map->GetNode(node->name());
+    NodeDef* reshape = node_map_->GetNode(node->name());
     int output_pos = 0;
     string input_node_name = ParseNodeName(node->input(0), &output_pos);
-    const NodeDef* input = node_map->GetNode(input_node_name);
+    const NodeDef* input = node_map_->GetNode(input_node_name);
     if (input->op() == "Reshape") {
       reshape->set_input(0, input->input(0));
-      node_map->UpdateInput(reshape->name(), input->name(), input->input(0));
-      new_nodes->push_back(reshape);
+      node_map_->UpdateInput(reshape->name(), input->name(), input->input(0));
+      nodes_to_simplify->PushBack(reshape);
       return reshape->name();
     }
 
@@ -716,38 +667,30 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
                                          &device) &&
         (StringPiece(device).contains(DEVICE_CPU) ||
          StringPiece(device).contains(DEVICE_GPU))) {
-      const NodeDef* cast = node_map->GetNode(transpose->input(0));
+      const NodeDef* cast = node_map_->GetNode(transpose->input(0));
       if (cast->op() == "Cast") {
-        const NodeDef* input = node_map->GetNode(cast->input(0));
+        const NodeDef* input = node_map_->GetNode(cast->input(0));
         const DataType src_type = GetSourceDataType(*cast);
         const DataType dst_type = GetDestinationDataType(*cast);
         if (IsNumberType(src_type) && IsNumberType(dst_type) &&
             DataTypeSize(src_type) < DataTypeSize(dst_type)) {
-          NodeDef* new_transpose = graph_def->add_node();
-          *new_transpose = *transpose;
-          new_transpose->set_name(transpose->name() + "_" +
-                                  DataTypeString(src_type));
+          NodeDef* new_transpose =
+              AddNode(StrCat(transpose->name(), "_", DataTypeString(src_type)),
+                      transpose);
           (*new_transpose->mutable_attr())["T"].set_type(src_type);
-          node_map->AddNode(new_transpose->name(), new_transpose);
-
           new_transpose->set_input(0, cast->input(0));
-          node_map->AddOutput(input->name(), new_transpose->name());
-          node_map->AddOutput(NodeName(new_transpose->input(1)),
-                              new_transpose->name());
-
-          NodeDef* new_cast = graph_def->add_node();
-          *new_cast = *cast;
-          new_cast->set_name(cast->name() + "_new");
-          node_map->AddNode(new_cast->name(), new_cast);
+          node_map_->AddOutput(input->name(), new_transpose->name());
+          node_map_->AddOutput(NodeName(new_transpose->input(1)),
+                               new_transpose->name());
 
+          NodeDef* new_cast = AddNode(StrCat(cast->name(), "_new"), cast);
           new_cast->set_input(0, new_transpose->name());
-          node_map->AddOutput(new_transpose->name(), new_cast->name());
+          node_map_->AddOutput(new_transpose->name(), new_cast->name());
 
-          new_nodes->push_back(new_transpose);
+          nodes_to_simplify->PushBack(new_transpose);
           //  Add frame dependencies that the original node might have had.
           AddFrameControlDeps(node, {new_transpose, new_cast},
-                              new_transpose->input(0), {new_transpose},
-                              graph_def, node_map, frame_map);
+                              new_transpose->input(0), {new_transpose});
 
           return new_cast->name();
         }
@@ -756,20 +699,20 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
   }
 
   if (node->op() == "Bitcast") {
-    NodeDef* bitcast = node_map->GetNode(node->name());
+    NodeDef* bitcast = node_map_->GetNode(node->name());
     // Bypass bitcasts whose source type and destination type are equal.
     if (GetSourceDataType(*bitcast) == GetDestinationDataType(*bitcast)) {
       return bitcast->input(0);
     }
 
-    const NodeDef* operand = node_map->GetNode(bitcast->input(0));
+    const NodeDef* operand = node_map_->GetNode(bitcast->input(0));
     if (operand->op() == bitcast->op()) {
       // Bitcast(Bitcast(x, type1), type2) => Bitcast(x, type2)
       bitcast->set_input(0, operand->input(0));
       SetSourceDataType(GetSourceDataType(*operand), bitcast);
-      node_map->UpdateInput(bitcast->name(), bitcast->input(0),
-                            operand->input(0));
-      new_nodes->push_back(bitcast);
+      node_map_->UpdateInput(bitcast->name(), bitcast->input(0),
+                             operand->input(0));
+      nodes_to_simplify->PushBack(bitcast);
       return bitcast->name();
     }
   }
@@ -811,22 +754,22 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
   // Conv?DBackpropInput.
   if (node->op() == "Conv2D" || node->op() == "Conv3D") {
     NodeDef* conv = const_cast<NodeDef*>(node);
-    const NodeDef* weights = node_map->GetNode(NodeName(conv->input(1)));
+    const NodeDef* weights = node_map_->GetNode(NodeName(conv->input(1)));
     // Fold the multiply to conv only when the weights are constant, so the
     // multiply can be constant-folded. TODO(jingyue): When the weights aren't
     // constant, this should also help performance a bit and memory usage a lot,
     // since the weights tend to be smaller than the activations.
     if (weights->op() == "Const") {
-      const NodeDef* source = node_map->GetNode(
-          GetTailOfValuePreservingChain(node, node_map, nodes_to_preserve_)
+      const NodeDef* source = node_map_->GetNode(
+          GetTailOfValuePreservingChain(*node, *node_map_, nodes_to_preserve_)
               ->input(0));
       if (source->op() == "Mul" &&
-          node_map->GetOutputs(source->name()).size() == 1) {
+          node_map_->GetOutputs(source->name()).size() == 1) {
         const NodeDef* mul = source;
         // `scale` is the scalar multiplier, and `other` is the other operand.
         // TODO(jingyue): handle the case where `scale` is 0-th operand.
-        const NodeDef* scale = node_map->GetNode(mul->input(1));
-        const NodeDef* other = node_map->GetNode(mul->input(0));
+        const NodeDef* scale = node_map_->GetNode(mul->input(1));
+        const NodeDef* other = node_map_->GetNode(mul->input(0));
         if (scale->op() == "Const" && scale->attr().at("dtype").type() ==
                                           weights->attr().at("dtype").type()) {
           const TensorProto& scale_tensor = scale->attr().at("value").tensor();
@@ -834,39 +777,36 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
           if (scale_tensor.has_tensor_shape() &&
               scale_tensor.tensor_shape().dim_size() == 0) {
             // Create new node `scaled_weights`.
-            NodeDef* scaled_weights = graph_def->add_node();
-            scaled_weights->set_name(weights->name() + "_scaled_" +
-                                     conv->name());
+            NodeDef* scaled_weights = AddNode(
+                StrCat(weights->name(), "_scaled_", conv->name()), nullptr);
             scaled_weights->set_op("Mul");
             scaled_weights->set_device(weights->device());
             (*scaled_weights->mutable_attr())["T"] =
                 weights->attr().at("dtype");
-            node_map->AddNode(scaled_weights->name(), scaled_weights);
-            new_nodes->push_back(scaled_weights);
+            nodes_to_simplify->PushBack(scaled_weights);
 
             // Link in its inputs.
             scaled_weights->add_input(conv->input(1));
-            node_map->AddOutput(weights->name(), scaled_weights->name());
+            node_map_->AddOutput(weights->name(), scaled_weights->name());
             scaled_weights->add_input(mul->input(1));
-            node_map->AddOutput(scale->name(), scaled_weights->name());
-            AddFrameControlDeps(node, {scaled_weights}, "", {}, graph_def,
-                                node_map, frame_map);
+            node_map_->AddOutput(scale->name(), scaled_weights->name());
+            AddFrameControlDeps(node, {scaled_weights}, "", {});
 
             // Update `conv`'s weights to `scaled_weights`.
             conv->set_input(1, scaled_weights->name());
-            node_map->UpdateInput(conv->name(), weights->name(),
-                                  scaled_weights->name());
-            new_nodes->push_back(conv);
+            node_map_->UpdateInput(conv->name(), weights->name(),
+                                   scaled_weights->name());
+            nodes_to_simplify->PushBack(conv);
 
             // Update `mul`'s consumer to bypass `mul` because it's folded to
             // the weights.
-            CHECK_EQ(node_map->GetOutputs(mul->name()).size(), 1);
+            CHECK_EQ(node_map_->GetOutputs(mul->name()).size(), 1);
             NodeDef* consumer_of_mul =
-                *node_map->GetOutputs(mul->name()).begin();
+                *node_map_->GetOutputs(mul->name()).begin();
             consumer_of_mul->set_input(0, mul->input(0));
-            node_map->UpdateInput(consumer_of_mul->name(), mul->name(),
-                                  other->name());
-            new_nodes->push_back(consumer_of_mul);
+            node_map_->UpdateInput(consumer_of_mul->name(), mul->name(),
+                                   other->name());
+            nodes_to_simplify->PushBack(consumer_of_mul);
             return conv->name();
           }
         }
@@ -874,7 +814,19 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     }
   }
 
-  if (node->input_size() > 0 && IsAggregate(*node)) {
+  if (node->op() == "Mul" && node->input(0) == node->input(1) &&
+      !OptimizedNodeExists(StrCat(node->name(), "_square"))) {
+    NodeDef* new_square_node =
+        AddNode(strings::StrCat(node->name(), "_square"), node);
+    new_square_node->set_op("Square");
+    for (int i = 1; i < new_square_node->input_size(); ++i) {
+      new_square_node->set_input(i - 1, new_square_node->input(i));
+    }
+    new_square_node->mutable_input()->RemoveLast();
+    return new_square_node->name();
+  }
+
+  if (IsAggregate(*node) && NumNonControlInputs(*node) > 0) {
     // Discard aggregate nodes with a single input.
     if (node->input_size() == 1) {
       return node->input(0);
@@ -900,7 +852,8 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
         break;
       }
     }
-    if (all_equal && node_map->GetNode(node->name() + "_const") == nullptr) {
+    const string mul_node_name = StrCat(node->name(), "_mul");
+    if (all_equal && !OptimizedNodeExists(mul_node_name)) {
       // 1. Create constant node with value N.
       const auto type = GetDataTypeFromAttr(*node, "T");
       Tensor t(type, TensorShape({}));
@@ -911,28 +864,26 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
         return "";
       }
       TensorValue value(&t);
-      NodeDef* new_const_node = graph_def->add_node();
+      NodeDef* new_const_node =
+          AddNode(StrCat(node->name(), "_const"), nullptr);
       *new_const_node =
-          ConstantFolding::CreateNodeDef(node->name() + "_const", value);
+          ConstantFolding::CreateNodeDef(new_const_node->name(), value);
       new_const_node->set_device(node->device());
-      node_map->AddNode(new_const_node->name(), new_const_node);
-      new_nodes->push_back(new_const_node);
+      nodes_to_simplify->PushBack(new_const_node);
 
       // 2. Replace the aggregate node with Mul(Const(N), x).
-      NodeDef* new_mul_node = graph_def->add_node();
-      new_mul_node->set_name(node->name() + "_mul");
+      NodeDef* new_mul_node = AddNode(mul_node_name, nullptr);
       new_mul_node->set_op("Mul");
       new_mul_node->set_device(node->device());
       SetDataTypeToAttr(type, "T", new_mul_node);
-      node_map->AddNode(new_mul_node->name(), new_mul_node);
       new_mul_node->add_input(new_const_node->name());
-      node_map->AddOutput(new_const_node->name(), new_mul_node->name());
+      node_map_->AddOutput(new_const_node->name(), new_mul_node->name());
       new_mul_node->add_input(node->input(0));
-      node_map->AddOutput(node->input(0), new_mul_node->name());
+      node_map_->AddOutput(node->input(0), new_mul_node->name());
 
-      CopyControlInputs(*node, new_mul_node, graph_def, node_map);
+      CopyControlInputs(*node, new_mul_node, optimized_graph_, node_map_.get());
       AddFrameControlDeps(node, {new_const_node, new_mul_node}, node->input(0),
-                          {new_const_node}, graph_def, node_map, frame_map);
+                          {new_const_node});
       return new_mul_node->name();
     }
   }
@@ -941,14 +892,23 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
   // multiplication over addition to hoist common factors out of aggregate nodes
   // where all the inputs are Mul nodes. This pattern occurs frequently in
   // regularization terms for the gradients during training.
-  if (node->input_size() > 1 && IsAggregate(*node) &&
-      node_map->GetNode(node->name() + "_hoist_add") == nullptr) {
+  // For example, we can rewrite an expression of the form:
+  //   AddN(Mul(x, y1), Mul(y2, x), Mul(x, y3), ... Mul(x, yn))
+  // to the following:
+  //   Mul(x, AddN(y1, y2, y3, ... yn))
+  if (opt_level_ == RewriterConfig::AGGRESSIVE && IsAggregate(*node) &&
+      NumNonControlInputs(*node) > 1 &&
+      !OptimizedNodeExists(StrCat(node->name(), "_hoist_add"))) {
     // Determine the set of common factors if the input nodes are all Mul nodes.
     std::set<string> common_factors;
-    int i = 0;
-    while (i < node->input_size() && (i == 0 || !common_factors.empty()) &&
-           !IsControlInput(node->input(i))) {
-      const NodeDef* input = node_map->GetNode(node->input(i));
+    for (int i = 0; i < node->input_size(); ++i) {
+      if (i > 0 && common_factors.empty()) {
+        break;
+      }
+      if (IsControlInput(node->input(i))) {
+        break;
+      }
+      const NodeDef* input = node_map_->GetNode(node->input(i));
       if (input->op() == "Mul") {
         std::set<string> factors_i{input->input(0), input->input(1)};
         if (i == 0) {
@@ -963,58 +923,70 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
         }
       } else {
         common_factors.clear();
-        break;
       }
-      ++i;
     }
     if (common_factors.size() == 1) {
-      // In this case we have an expression of the form
-      //   AddN(Mul(x, y1), Mul(y2, x), Mul(x, y3), ... Mul(x, yn))
-      // that can be rewritten as
-      //   Mul(x, AddN(y1, y2, y3, ... yn))
-      // 1. Hoist non-shared factors up into AddN node.
       const string& common_factor = *common_factors.begin();
-      NodeDef* new_mul_node = graph_def->add_node();
-      NodeDef* new_add_node = graph_def->add_node();
-      *new_add_node = *node;
-      new_add_node->set_name(node->name() + "_hoist_add");
-      new_nodes->push_back(new_add_node);
-      node_map->AddNode(new_add_node->name(), new_add_node);
-      for (int i = 0; i < node->input_size(); ++i) {
+
+      // Gather up the non-shared factors (the y's in the example).
+      // Unless the aggregation is Add, we have to make sure that all the y's
+      // have the same shape since the other aggregation ops do not support
+      // broadcasting.
+      std::vector<string> unique_factors;
+      unique_factors.reserve(node->input_size());
+      bool shapes_match = true;
+      for (int i = 0; i < node->input_size() && shapes_match; ++i) {
         const string& input = node->input(i);
         if (IsControlInput(input)) {
-          MaybeAddControlInput(input, new_add_node, graph_def, node_map);
-          continue;
+          break;
         }
-        NodeDef* mul_node = node_map->GetNode(input);
-        int unique_factor_index = mul_node->input(0) == common_factor ? 1 : 0;
-        const string unique_factor = mul_node->input(unique_factor_index);
-        new_add_node->set_input(i, unique_factor);
-        // 2. Use a copy of the first Mul node for the outer multiplication.
-        if (i == 0) {
-          *new_mul_node = *mul_node;
-          new_mul_node->set_device(node->device());
-          new_mul_node->set_name(node->name() + "_hoist_mul");
-          new_mul_node->set_input(0, common_factor);
-          new_mul_node->set_input(1, new_add_node->name());
-          node_map->AddNode(new_mul_node->name(), new_mul_node);
+        const NodeDef* mul_node = node_map_->GetNode(input);
+        const int unique_factor_index =
+            mul_node->input(0) == common_factor ? 1 : 0;
+        unique_factors.push_back(mul_node->input(unique_factor_index));
+        if (i > 0 && !IsAdd(*node)) {
+          shapes_match = ShapesEqual(unique_factors.front(),
+                                     unique_factors.back(), *node_map_);
         }
       }
 
-      // 3. Add frame dependencies that the original node might have had.
-      AddFrameControlDeps(node, {new_add_node, new_mul_node}, common_factor,
-                          {new_add_node}, graph_def, node_map, frame_map);
+      if (shapes_match) {
+        // 1. Use a copy of the first Mul node for the outer multiplication.
+        NodeDef* new_mul_node = AddNode(StrCat(node->name(), "_hoist_mul"),
+                                        node_map_->GetNode(node->input(0)));
+        NodeDef* new_add_node =
+            AddNode(StrCat(node->name(), "_hoist_add"), node);
+        new_mul_node->set_device(node->device());
+        new_mul_node->set_input(0, common_factor);
+        node_map_->AddOutput(common_factor, new_mul_node->name());
+        new_mul_node->set_input(1, new_add_node->name());
+        node_map_->AddOutput(new_add_node->name(), new_mul_node->name());
+
+        // 2. Hoist non-shared factors up into the new AddN node.
+        nodes_to_simplify->PushBack(new_add_node);
+        for (int i = 0; i < node->input_size(); ++i) {
+          const string& input = node->input(i);
+          if (IsControlInput(input)) {
+            break;
+          }
+          new_add_node->set_input(i, unique_factors[i]);
+        }
 
-      return new_mul_node->name();
+        // 3. Add frame dependencies that the original node might have had.
+        AddFrameControlDeps(node, {new_add_node, new_mul_node}, common_factor,
+                            {new_add_node});
+
+        return new_mul_node->name();
+      }
     }
   }
 
   // Fold Transpose into matrix multiplication.
   if ((node->op() == "MatMul" || node->op() == "SparseMatMul" ||
        node->op() == "BatchMatMul") &&
-      node_map->GetNode(node->name() + "_fused") == nullptr) {
-    const NodeDef* a = node_map->GetNode(node->input(0));
-    const NodeDef* b = node_map->GetNode(node->input(1));
+      !OptimizedNodeExists(StrCat(node->name(), "_fused"))) {
+    const NodeDef* a = node_map_->GetNode(node->input(0));
+    const NodeDef* b = node_map_->GetNode(node->input(1));
     bool is_complex = false;
     if (node->op() != "SparseMatMul") {
       const DataType type = GetDataTypeFromAttr(*node, "T");
@@ -1026,32 +998,27 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
                            ? std::set<string>{"ConjugateTranspose"}
                            : std::set<string>{"Transpose"});
     const bool a_is_foldable = foldable_transpose_ops.count(a->op()) > 0 &&
-                               IsInnerMatrixTransposeNode(*a, node_map);
+                               IsInnerMatrixTransposeNode(*a, node_map_.get());
     const bool b_is_foldable = foldable_transpose_ops.count(b->op()) > 0 &&
-                               IsInnerMatrixTransposeNode(*b, node_map);
+                               IsInnerMatrixTransposeNode(*b, node_map_.get());
     if (a_is_foldable || b_is_foldable) {
-      NodeDef* new_op = graph_def->add_node();
-      *new_op = *node;
-      new_op->set_name(node->name() + "_fused");
-      node_map->AddNode(new_op->name(), new_op);
+      NodeDef* new_op = AddNode(StrCat(node->name(), "_fused"), node);
       if (a_is_foldable) {
         const string attr_a =
             node->op() == "BatchMatMul" ? "adj_x" : "transpose_a";
         FlipBooleanAttr(attr_a, new_op);
         new_op->set_input(0, a->input(0));
-        node_map->UpdateInput(new_op->name(), a->name(), a->input(0));
-        AddFrameControlDeps(node, {new_op}, a->input(0), {new_op}, graph_def,
-                            node_map, frame_map);
+        node_map_->UpdateInput(new_op->name(), a->name(), a->input(0));
+        AddFrameControlDeps(node, {new_op}, a->input(0), {new_op});
       }
       if (b_is_foldable) {
         const string attr_b =
             node->op() == "BatchMatMul" ? "adj_y" : "transpose_b";
         FlipBooleanAttr(attr_b, new_op);
         new_op->set_input(1, b->input(0));
-        node_map->UpdateInput(new_op->name(), b->name(), b->input(0));
+        node_map_->UpdateInput(new_op->name(), b->name(), b->input(0));
         if (!a_is_foldable) {
-          AddFrameControlDeps(node, {new_op}, b->input(0), {new_op}, graph_def,
-                              node_map, frame_map);
+          AddFrameControlDeps(node, {new_op}, b->input(0), {new_op});
         }
       }
     }
@@ -1060,25 +1027,21 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
   // Fold Conj into Transpose or ConjugateTranspose.
   if ((node->op() == "Conj" || node->op() == "Transpose" ||
        node->op() == "ConjugateTranspose") &&
-      node_map->GetNode(node->name() + "_fused") == nullptr) {
-    const NodeDef* input = node_map->GetNode(node->input(0));
+      !OptimizedNodeExists(StrCat(node->name(), "_fused"))) {
+    const NodeDef* input = node_map_->GetNode(node->input(0));
     const NodeDef* transpose_op = node->op() == "Conj" ? input : node;
     const NodeDef* conj_op = node->op() == "Conj" ? node : input;
 
     if ((transpose_op->op() == "Transpose" ||
          transpose_op->op() == "ConjugateTranspose") &&
         conj_op->op() == "Conj") {
-      NodeDef* new_op = graph_def->add_node();
-      *new_op = *transpose_op;
-      new_op->set_name(node->name() + "_fused");
+      NodeDef* new_op = AddNode(StrCat(node->name(), "_fused"), transpose_op);
       // Flip the type of transpose op to absorb the conjugation.
       new_op->set_op(transpose_op->op() == "Transpose" ? "ConjugateTranspose"
                                                        : "Transpose");
       new_op->set_input(0, input->input(0));
-      node_map->AddNode(new_op->name(), new_op);
-      node_map->UpdateInput(new_op->name(), node->name(), input->input(0));
-      AddFrameControlDeps(node, {new_op}, "", {}, graph_def, node_map,
-                          frame_map);
+      node_map_->UpdateInput(new_op->name(), node->name(), input->input(0));
+      AddFrameControlDeps(node, {new_op}, "", {});
       return new_op->name();
     }
   }
@@ -1086,63 +1049,23 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
   return "";
 }
 
-namespace {
-// A vector with a set. The set stores the same elements as the vector, and
-// quickly answers whether a value is in the vector. Duplicated elements are not
-// allowed for now.
-template <class T>
-class SetVector {
- public:
-  // Returns false if value already existed in the set, true otherwise.
-  bool PushBack(const T& value) {
-    if (!set_.insert(value).second) {
-      VLOG(2) << "Value " << value << " is already in the set.";
-      return false;
-    }
-    vector_.push_back(value);
-    return true;
-  }
-
-  T PopBack() {
-    T back = vector_.back();
-    set_.erase(back);
-    vector_.pop_back();
-    return back;
-  }
-
-  bool Exists(const T& value) const { return set_.count(value); }
-
-  bool Empty() const { return vector_.empty(); }
-
- private:
-  std::unordered_set<T> set_;
-  std::vector<T> vector_;
-};
-}  // namespace
-
-Status ArithmeticOptimizer::SimplifyArithmeticOps(
-    GraphDef* optimized_graph) const {
-  NodeMap node_map(optimized_graph);
-  FrameMap frame_map;
-  int num_frames;
-  TF_RETURN_IF_ERROR(IdentifyFramesWithNodeMap(*optimized_graph, node_map,
-                                               &frame_map, &num_frames));
-  SetVector<const NodeDef*> nodes_to_simplify;
-  for (int i = 0; i < optimized_graph->node_size(); ++i) {
-    nodes_to_simplify.PushBack(optimized_graph->mutable_node()->Mutable(i));
+Status ArithmeticOptimizer::SimplifyArithmeticOps() {
+  SetVector<NodeDef*> nodes_to_simplify;
+  nodes_to_simplify.Reserve(optimized_graph_->node_size());
+  for (int i = 0; i < optimized_graph_->node_size(); ++i) {
+    nodes_to_simplify.PushBack(optimized_graph_->mutable_node(i));
   }
   while (!nodes_to_simplify.Empty()) {
     const NodeDef* node = nodes_to_simplify.PopBack();
-    std::vector<const NodeDef*> new_nodes;
-    const string simplified_tensor = TrySimplifyAndReplaceUses(
-        node, optimized_graph, &node_map, &new_nodes, &frame_map);
+    const string simplified_tensor =
+        TrySimplifyAndReplaceUses(node, &nodes_to_simplify);
     if (simplified_tensor.empty()) {
       continue;
     }
 
     if (NodeName(simplified_tensor) != node->name()) {
       // Always consider simplified_tensor for further optimizations.
-      const NodeDef* simplified_node = node_map.GetNode(simplified_tensor);
+      NodeDef* simplified_node = node_map_->GetNode(simplified_tensor);
       if (simplified_node != nullptr) {
         nodes_to_simplify.PushBack(simplified_node);
       }
@@ -1150,7 +1073,7 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(
       // consumers of `node` are already redirected to `simplified_tensor`.
       // Re-push the consumers into `nodes_to_simplify` for further
       // optimizations.
-      std::set<NodeDef*> consumers = node_map.GetOutputs(node->name());
+      std::set<NodeDef*> consumers = node_map_->GetOutputs(node->name());
       for (NodeDef* consumer : consumers) {
         // Update `consumer`'s use of `node` to `input`'s operand.
         for (int i = 0; i < consumer->input_size(); ++i) {
@@ -1163,16 +1086,12 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(
                      ? AsControlDependency(NodeName(simplified_tensor))
                      : simplified_tensor);
           }
-          VLOG(2) << "Update input " << consumer->input(i) << " of "
-                  << consumer->name() << " to " << simplified_tensor;
         }
-        node_map.UpdateInput(consumer->name(), node->name(), simplified_tensor);
+        node_map_->UpdateInput(consumer->name(), node->name(),
+                               simplified_tensor);
         nodes_to_simplify.PushBack(consumer);
       }
     }
-    for (const NodeDef* new_node : new_nodes) {
-      nodes_to_simplify.PushBack(new_node);
-    }
   }
   return Status::OK();
 }
@@ -1180,18 +1099,31 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(
 Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/,
                                      const GrapplerItem& item,
                                      GraphDef* optimized_graph) {
-  *optimized_graph = item.graph;
+  optimized_graph_ = optimized_graph;
+  *optimized_graph_ = item.graph;
+
+  // Set up helper data structures.
   nodes_to_preserve_ = item.NodesToPreserve();
-  GraphProperties graph_properties(item);
-  TF_RETURN_IF_ERROR(graph_properties.InferStatically());
-  TF_RETURN_IF_ERROR(graph_properties.AnnotateOutputShapes(optimized_graph));
+  fetch_nodes_known_ = !item.fetch.empty();
+  node_map_.reset(new NodeMap(optimized_graph_));
+  int num_frames;
+  TF_RETURN_IF_ERROR(IdentifyFramesWithNodeMap(*optimized_graph_, *node_map_,
+                                               &frame_map_, &num_frames));
+  // Shapes are only needed in aggressive mode.
+  if (opt_level_ == RewriterConfig::AGGRESSIVE) {
+    graph_properties_.reset(new GraphProperties(item));
+    TF_RETURN_IF_ERROR(graph_properties_->InferStatically(false));
+    TF_RETURN_IF_ERROR(
+        graph_properties_->AnnotateOutputShapes(optimized_graph_));
+  }
 
-  DedupComputations(optimized_graph);
-  TF_RETURN_IF_ERROR(SimplifyArithmeticOps(optimized_graph));
+  // Perform the optimizations.
+  DedupComputations();
+  TF_RETURN_IF_ERROR(SimplifyArithmeticOps());
 
   // Clear output shapes.
   for (int i = 0; i < optimized_graph->node_size(); ++i) {
-    optimized_graph->mutable_node(i)->mutable_attr()->erase(kOutputShapesAttr);
+    optimized_graph_->mutable_node(i)->mutable_attr()->erase(kOutputShapesAttr);
   }
 
   return Status::OK();
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index c8cc292295ce7dec9b3ab266da910f347bfe628e..ec269792386189e5a590a99af020803810f36b1a 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -17,22 +17,21 @@ limitations under the License.
 #define TENSORFLOW_GRAPPLER_OPTIMIZERS_ARITHMETIC_OPTIMIZER_H_
 
 #include <unordered_set>
+#include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/frame.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
 namespace grappler {
 
+constexpr char kArithmeticOptimizer[] = "ArithmeticOptimizer";
+
 // Optimize TF computations by reducing the arithmetic complexity required to
 // run a model.
 class ArithmeticOptimizer : public GraphOptimizer {
  public:
-  // Returns true if it is safe to dedup node from the graph.
-  // TODO(rmlarsen): Refactor to op_types.{h,cc}.
-  static bool CanDedup(const NodeDef& node,
-                       const std::unordered_set<string>& nodes_to_preserve);
-
   ArithmeticOptimizer() : opt_level_(RewriterConfig::ON) {}
   explicit ArithmeticOptimizer(RewriterConfig::Toggle opt_level)
       : opt_level_(opt_level) {}
@@ -47,10 +46,32 @@ class ArithmeticOptimizer : public GraphOptimizer {
                 const GraphDef& optimized_graph, double result) override;
 
  private:
-  void DedupComputations(GraphDef* optimized_graph) const;
+  // Returns true is a node with given name and the optimizer prefix already
+  // exists.
+  bool OptimizedNodeExists(const string& name);
+
+  // Creates a new node in the graph, prefixed with "ArithmeticOptimizer/",
+  // updates node_map_, and optionally copies *node_to_copy into the new
+  // node, if node_to_copy is not nullptr.
+  NodeDef* AddNode(const string& name, const NodeDef* node_to_copy);
+
+  // Returns true if it is safe to dedup node from the graph.
+  bool CanDedup(const NodeDef& node) const;
+
+  // Dedup redundant nodes in the graph.
+  void DedupComputations();
+
+  // Fix frame dependencies by adding control dependencies from old_input to
+  // nodes in new_nodes_for_control_dep, and update frame_map for all nodes in
+  // new_nodes.
+  void AddFrameControlDeps(const NodeDef* old_node,
+                           const std::vector<NodeDef*>& new_nodes,
+                           const string& source_for_ctrl_dep,
+                           const std::vector<NodeDef*>& sinks_for_control_dep);
+
   // Runs peep-hole optimizations on `optimized_graph`, e.g., removing inverse
   // transposes.
-  Status SimplifyArithmeticOps(GraphDef* optimized_graph) const;
+  Status SimplifyArithmeticOps();
   // Tries to simplify the expression that roots at `node` and replaces the uses
   // of `node` to the simplified expression. Returns the name of the simplified
   // tensor (e.g. "split:1") or an emtpy string if no simplification is
@@ -66,14 +87,17 @@ class ArithmeticOptimizer : public GraphOptimizer {
   // TODO(jingyue): This interface is not suitable for optimizing nodes with
   // multiple output tensors. We should pass in a tensor name instead of a
   // NodeDef.
-  string TrySimplifyAndReplaceUses(
-      const NodeDef* node, GraphDef* graph_def, NodeMap* node_map,
-      std::vector<const NodeDef*>* new_nodes,
-      std::unordered_map<const NodeDef*, std::vector<int>>* frame_map) const;
-
-  std::unordered_set<string> nodes_to_preserve_;
+  string TrySimplifyAndReplaceUses(const NodeDef* node,
+                                   SetVector<NodeDef*>* nodes_to_simplify);
 
   RewriterConfig::Toggle opt_level_;
+
+  bool fetch_nodes_known_;
+  std::unordered_set<string> nodes_to_preserve_;
+  std::unique_ptr<NodeMap> node_map_;
+  FrameMap frame_map_;
+  std::unique_ptr<GraphProperties> graph_properties_;
+  GraphDef* optimized_graph_;  // Not owned.
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index 354a3069052b8175249775b1be26ea0218db5133..da4263ff421d348645d33489428c1edc0bbdf9a0 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -28,6 +28,25 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
+string OptimizedName(const string& name) {
+  return AddPrefixToNodeName(name, kArithmeticOptimizer);
+}
+
+void VerifyGraphsMatch(const GraphDef& original_graph,
+                       const GraphDef& optimized_graph, int line) {
+  EXPECT_EQ(original_graph.node_size(), optimized_graph.node_size()) << line;
+  for (int i = 0; i < original_graph.node_size(); ++i) {
+    const NodeDef& original = original_graph.node(i);
+    const NodeDef& optimized = optimized_graph.node(i);
+    EXPECT_EQ(original.name(), optimized.name()) << line;
+    EXPECT_EQ(original.op(), optimized.op()) << line;
+    EXPECT_EQ(original.input_size(), optimized.input_size()) << line;
+    for (int j = 0; j < original.input_size(); ++j) {
+      EXPECT_EQ(original.input(j), optimized.input(j)) << line;
+    }
+  }
+}
+
 class ArithmeticOptimizerTest : public ::testing::Test {};
 
 TEST_F(ArithmeticOptimizerTest, NoOp) {
@@ -40,27 +59,17 @@ TEST_F(ArithmeticOptimizerTest, NoOp) {
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
-
-  EXPECT_EQ(item.graph.node_size(), output.node_size());
-  for (int i = 0; i < item.graph.node_size(); ++i) {
-    const NodeDef& original = item.graph.node(i);
-    const NodeDef& optimized = output.node(i);
-    EXPECT_EQ(original.name(), optimized.name());
-    EXPECT_EQ(original.op(), optimized.op());
-    EXPECT_EQ(original.input_size(), optimized.input_size());
-    for (int j = 0; j < original.input_size(); ++j) {
-      EXPECT_EQ(original.input(j), optimized.input(j));
-    }
-  }
+  VerifyGraphsMatch(item.graph, output, __LINE__);
 }
 
 TEST_F(ArithmeticOptimizerTest, OpDedupping) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output c1 = ops::Const(s.WithOpName("c1"), {3.14, 2.7}, {1, 2});
   Output c2 = ops::Const(s.WithOpName("c2"), {3.14, 2.7}, {1, 2});
-  Output mul = ops::Mul(s.WithOpName("mul"), c1, c2);
+  Output div = ops::Div(s.WithOpName("div"), c1, c2);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"div"};
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
@@ -74,11 +83,44 @@ TEST_F(ArithmeticOptimizerTest, OpDedupping) {
   EXPECT_EQ(2, output.node_size());
   const NodeDef& new_c1 = output.node(0);
   EXPECT_EQ("c1", new_c1.name());
-  const NodeDef& new_mul = output.node(1);
-  EXPECT_EQ("mul", new_mul.name());
-  EXPECT_EQ(2, new_mul.input_size());
-  EXPECT_EQ("c1", new_mul.input(0));
-  EXPECT_EQ("c1", new_mul.input(1));
+  const NodeDef& new_div = output.node(1);
+  EXPECT_EQ("div", new_div.name());
+  EXPECT_EQ(2, new_div.input_size());
+  EXPECT_EQ("c1", new_div.input(0));
+  EXPECT_EQ("c1", new_div.input(1));
+}
+
+TEST_F(ArithmeticOptimizerTest, OpDeduppingAssertAndCheckNumerics) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output p = ops::Placeholder(s, DT_BOOL, ops::Placeholder::Shape({}));
+  Output c = ops::Const(s.WithOpName("c"), {3.14, 2.7}, {1, 2});
+  auto check1 = ops::CheckNumerics(s.WithOpName("check1"), c, "foo");
+  auto check2 = ops::CheckNumerics(s.WithOpName("check2"), c, "foo");
+  auto assert1 = ops::Assert(s.WithOpName("assert1"), p, {c});
+  auto assert2 = ops::Assert(s.WithOpName("assert2"), p, {c});
+  Output div = ops::Div(s.WithOpName("div").WithControlDependencies(
+                            {assert1.operation, assert2.operation}),
+                        check1, check2);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"div"};
+
+  ArithmeticOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  // Run the optimizer twice to make sure the rewrite is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(5, output.node_size());
+  const NodeDef& new_div = output.node(3);
+  EXPECT_EQ(4, new_div.input_size());
+  EXPECT_EQ("check1", new_div.input(0));
+  EXPECT_EQ("check1", new_div.input(1));
+  EXPECT_EQ("^assert1", new_div.input(2));
+  EXPECT_EQ("^assert1", new_div.input(3));
 }
 
 TEST_F(ArithmeticOptimizerTest, OpDedupCommutative) {
@@ -87,9 +129,10 @@ TEST_F(ArithmeticOptimizerTest, OpDedupCommutative) {
   Output c2 = ops::Const(s.WithOpName("c2"), {3.0f, 4.0f}, {1, 2});
   Output mul1 = ops::Mul(s.WithOpName("mul1"), c1, c2);
   Output mul2 = ops::Mul(s.WithOpName("mul2"), c2, c1);
-  Output mul3 = ops::Mul(s.WithOpName("mul3"), mul1, mul2);
+  Output div1 = ops::Div(s.WithOpName("div1"), mul1, mul2);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"div"};
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
@@ -110,11 +153,35 @@ TEST_F(ArithmeticOptimizerTest, OpDedupCommutative) {
   EXPECT_EQ(2, new_mul1.input_size());
   EXPECT_EQ("c1", new_mul1.input(0));
   EXPECT_EQ("c2", new_mul1.input(1));
-  const NodeDef& new_mul3 = output.node(3);
-  EXPECT_EQ("mul3", new_mul3.name());
-  EXPECT_EQ(2, new_mul3.input_size());
-  EXPECT_EQ("mul1", new_mul3.input(0));
-  EXPECT_EQ("mul1", new_mul3.input(1));
+  const NodeDef& new_div1 = output.node(3);
+  EXPECT_EQ("div1", new_div1.name());
+  EXPECT_EQ(2, new_div1.input_size());
+  EXPECT_EQ("mul1", new_div1.input(0));
+  EXPECT_EQ("mul1", new_div1.input(1));
+}
+
+TEST_F(ArithmeticOptimizerTest, MulToSquare) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output c = ops::Const(s.WithOpName("c"), {1.0f, 2.0f}, {1, 2});
+  Output d = ops::Const(s.WithOpName("d"), {3.0f, 4.0f}, {1, 2});
+  Output mul = ops::Mul(s.WithControlDependencies(d).WithOpName("mul"), c, c);
+  Output id = ops::Identity(s.WithOpName("id"), mul);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ArithmeticOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(5, output.node_size());
+  EXPECT_EQ("id", output.node(3).name());
+  EXPECT_EQ(OptimizedName("mul_square"), output.node(3).input(0));
+  EXPECT_EQ("Square", output.node(4).op());
+  EXPECT_EQ(OptimizedName("mul_square"), output.node(4).name());
+  EXPECT_EQ(2, output.node(4).input_size());
+  EXPECT_EQ("c", output.node(4).input(0));
+  EXPECT_EQ("^d", output.node(4).input(1));
 }
 
 TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsReal) {
@@ -215,17 +282,17 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsSimple) {
 
   EXPECT_EQ(5, output.node_size());
   const NodeDef& new_const = output.node(3);
-  EXPECT_EQ("add_const", new_const.name());
+  EXPECT_EQ(OptimizedName("add_const"), new_const.name());
   EXPECT_EQ("^x", new_const.input(0));
   EXPECT_EQ(std::string("\0\0\0@", 4),
             new_const.attr().at("value").tensor().tensor_content());
   const NodeDef& new_mul = output.node(4);
-  EXPECT_EQ("add_mul", new_mul.name());
-  EXPECT_EQ("add_const", new_mul.input(0));
+  EXPECT_EQ(OptimizedName("add_mul"), new_mul.name());
+  EXPECT_EQ(OptimizedName("add_const"), new_mul.input(0));
   EXPECT_EQ("x", new_mul.input(1));
   const NodeDef& new_id = output.node(2);
   EXPECT_EQ("id", new_id.name());
-  EXPECT_EQ("add_mul", new_id.input(0));
+  EXPECT_EQ(OptimizedName("add_mul"), new_id.input(0));
 }
 
 TEST_F(ArithmeticOptimizerTest, TrivialSumsSimpleWithControlDep) {
@@ -249,18 +316,18 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsSimpleWithControlDep) {
 
   EXPECT_EQ(6, output.node_size());
   const NodeDef& new_const = output.node(4);
-  EXPECT_EQ("add_const", new_const.name());
+  EXPECT_EQ(OptimizedName("add_const"), new_const.name());
   EXPECT_EQ("^x", new_const.input(0));
   EXPECT_EQ(std::string("\0\0\0@", 4),
             new_const.attr().at("value").tensor().tensor_content());
   const NodeDef& new_mul = output.node(5);
-  EXPECT_EQ("add_mul", new_mul.name());
-  EXPECT_EQ("add_const", new_mul.input(0));
+  EXPECT_EQ(OptimizedName("add_mul"), new_mul.name());
+  EXPECT_EQ(OptimizedName("add_const"), new_mul.input(0));
   EXPECT_EQ("x", new_mul.input(1));
   EXPECT_EQ("^y", new_mul.input(2));
   const NodeDef& new_id = output.node(3);
   EXPECT_EQ("id", new_id.name());
-  EXPECT_EQ("add_mul", new_id.input(0));
+  EXPECT_EQ(OptimizedName("add_mul"), new_id.input(0));
 }
 
 TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
@@ -283,7 +350,7 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
   for (int i = 0; i < item.graph.node_size(); ++i) {
     item.graph.mutable_node(i)->set_device(devices[i]);
   }
-  ArithmeticOptimizer optimizer;
+  ArithmeticOptimizer optimizer(RewriterConfig::AGGRESSIVE);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -297,38 +364,39 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
   // Mul(p,
   //     Add(Add(Const(2), Const(2)),
   //         Add(Const(2), Const(2))))
+  EXPECT_EQ(17, output.node_size());
   for (const auto& node : output.node()) {
     if ("id" == node.name()) {
       EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("Add_6_hoist_mul", node.input(0));
-    } else if ("Add_6_hoist_mul" == node.name()) {
+      EXPECT_EQ(OptimizedName("Add_6_hoist_mul"), node.input(0));
+    } else if (OptimizedName("Add_6_hoist_mul") == node.name()) {
       EXPECT_EQ("Mul", node.op());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("Placeholder", node.input(0));
-      EXPECT_EQ("Add_6_hoist_add", node.input(1));
-    } else if ("Add_6_hoist_add" == node.name()) {
+      EXPECT_EQ(OptimizedName("Add_6_hoist_add"), node.input(1));
+    } else if (OptimizedName("Add_6_hoist_add") == node.name()) {
       EXPECT_EQ("Add", node.op());
       EXPECT_EQ(3, node.input_size());
-      EXPECT_EQ("Add_4_hoist_add", node.input(0));
-      EXPECT_EQ("Add_5_hoist_add", node.input(1));
+      EXPECT_EQ(OptimizedName("Add_4_hoist_add"), node.input(0));
+      EXPECT_EQ(OptimizedName("Add_5_hoist_add"), node.input(1));
       EXPECT_EQ("^Placeholder", node.input(2));
-    } else if ("Add_4_hoist_add" == node.name()) {
+    } else if (OptimizedName("Add_4_hoist_add") == node.name()) {
       EXPECT_EQ("Add", node.op());
       EXPECT_EQ(3, node.input_size());
-      EXPECT_EQ("Add_const", node.input(0));
-      EXPECT_EQ("Add_1_const", node.input(1));
+      EXPECT_EQ(OptimizedName("Add_const"), node.input(0));
+      EXPECT_EQ(OptimizedName("Add_1_const"), node.input(1));
       EXPECT_EQ("^Placeholder", node.input(2));
-    } else if ("Add_5_hoist_add" == node.name()) {
+    } else if (OptimizedName("Add_5_hoist_add") == node.name()) {
       EXPECT_EQ("Add", node.op());
       EXPECT_EQ(3, node.input_size());
-      EXPECT_EQ("Add_const", node.input(0));
-      EXPECT_EQ("Add_1_const", node.input(1));
+      EXPECT_EQ(OptimizedName("Add_const"), node.input(0));
+      EXPECT_EQ(OptimizedName("Add_1_const"), node.input(1));
       EXPECT_EQ("^Placeholder", node.input(2));
-    } else if ("Add_const" == node.name()) {
+    } else if (OptimizedName("Add_const") == node.name()) {
       EXPECT_EQ("Const", node.op());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("^Placeholder", node.input(0));
-    } else if ("Add_1_const" == node.name()) {
+    } else if (OptimizedName("Add_1_const") == node.name()) {
       EXPECT_EQ("Const", node.op());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("^Placeholder", node.input(0));
@@ -337,39 +405,51 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
 }
 
 TEST_F(ArithmeticOptimizerTest, HoistFactor) {
-  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
-  Output y1 = ops::Const(s.WithOpName("y1"), {3.0f, 4.0f}, {1, 2});
-  Output y2 = ops::Const(s.WithOpName("y2"), {5.0f, 6.0f}, {1, 2});
-  Output mul1 = ops::Mul(s.WithOpName("mul1"), x, y1);
-  Output mul2 = ops::Mul(s.WithOpName("mul2"), y2, x);
-  Output add = ops::Add(s.WithOpName("add"), mul1, mul2);
-  Output id = ops::Identity(s.WithOpName("id"), add);
-
-  GrapplerItem item;
-  TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
-  ArithmeticOptimizer optimizer;
-  GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-
-  EXPECT_EQ(9, output.node_size());
-  const NodeDef& new_add = output.node(8);
-  EXPECT_EQ("add_hoist_add", new_add.name());
-  EXPECT_EQ("y1", new_add.input(0));
-  EXPECT_EQ("y2", new_add.input(1));
-  const NodeDef& new_mul = output.node(7);
-  EXPECT_EQ("add_hoist_mul", new_mul.name());
-  EXPECT_EQ("x", new_mul.input(0));
-  EXPECT_EQ("add_hoist_add", new_mul.input(1));
-  const NodeDef& new_id = output.node(6);
-  EXPECT_EQ("id", new_id.name());
-  EXPECT_EQ("add_hoist_mul", new_id.input(0));
+  for (bool matching_shapes : {true, false}) {
+    for (bool use_addn : {true, false}) {
+      tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+      Output x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
+      Output y1 = ops::Const(s.WithOpName("y1"), {3.0f, 4.0f}, {1, 2});
+      Output y2 = matching_shapes
+                      ? ops::Const(s.WithOpName("y2"), {5.0f, 6.0f}, {1, 2})
+                      : ops::Const(s.WithOpName("y2"), {5.0f}, {1, 1});
+      Output mul1 = ops::Mul(s.WithOpName("mul1"), x, y1);
+      Output mul2 = ops::Mul(s.WithOpName("mul2"), y2, x);
+      Output id =
+          use_addn ? ops::Identity(s.WithOpName("id"),
+                                   ops::AddN(s.WithOpName("add"), {mul1, mul2}))
+                   : ops::Identity(s.WithOpName("id"),
+                                   ops::Add(s.WithOpName("add"), mul1, mul2));
+
+      GrapplerItem item;
+      TF_CHECK_OK(s.ToGraphDef(&item.graph));
+      ArithmeticOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+      GraphDef output;
+      Status status = optimizer.Optimize(nullptr, item, &output);
+      TF_EXPECT_OK(status);
+      // Run the optimizer twice to make sure the rewrite is idempotent.
+      item.graph.Swap(&output);
+      status = optimizer.Optimize(nullptr, item, &output);
+      TF_EXPECT_OK(status);
+
+      if (use_addn && !matching_shapes) {
+        VerifyGraphsMatch(item.graph, output, __LINE__);
+      } else {
+        EXPECT_EQ(9, output.node_size());
+        const NodeDef& new_add = output.node(8);
+        EXPECT_EQ(OptimizedName("add_hoist_add"), new_add.name());
+        EXPECT_EQ("y1", new_add.input(0));
+        EXPECT_EQ("y2", new_add.input(1));
+        const NodeDef& new_mul = output.node(7);
+        EXPECT_EQ(OptimizedName("add_hoist_mul"), new_mul.name());
+        EXPECT_EQ("x", new_mul.input(0));
+        EXPECT_EQ(OptimizedName("add_hoist_add"), new_mul.input(1));
+        const NodeDef& new_id = output.node(6);
+        EXPECT_EQ("id", new_id.name());
+        EXPECT_EQ(OptimizedName("add_hoist_mul"), new_id.input(0));
+      }
+    }
+  }
 }
 
 TEST_F(ArithmeticOptimizerTest, FuseConjAndTranspose) {
@@ -393,7 +473,7 @@ TEST_F(ArithmeticOptimizerTest, FuseConjAndTranspose) {
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(7, output.node_size());
-  EXPECT_EQ("trans_fused", output.node(6).name());
+  EXPECT_EQ(OptimizedName("trans_fused"), output.node(6).name());
   EXPECT_EQ("ConjugateTranspose", output.node(6).op());
   EXPECT_EQ("z", output.node(6).input(0));
   EXPECT_EQ("perm", output.node(6).input(1));
@@ -417,7 +497,7 @@ TEST_F(ArithmeticOptimizerTest, FuseConjAndConjugateTranspose) {
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(7, output.node_size());
-  EXPECT_EQ("conjugate_trans_fused", output.node(6).name());
+  EXPECT_EQ(OptimizedName("conjugate_trans_fused"), output.node(6).name());
   EXPECT_EQ("Transpose", output.node(6).op());
   EXPECT_EQ("z", output.node(6).input(0));
   EXPECT_EQ("perm", output.node(6).input(1));
@@ -444,7 +524,7 @@ TEST_F(ArithmeticOptimizerTest, FuseTransposeAndConj) {
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(7, output.node_size());
-  EXPECT_EQ("conj_fused", output.node(6).name());
+  EXPECT_EQ(OptimizedName("conj_fused"), output.node(6).name());
   EXPECT_EQ("ConjugateTranspose", output.node(6).op());
   EXPECT_EQ("z", output.node(6).input(0));
   EXPECT_EQ("perm", output.node(6).input(1));
@@ -480,7 +560,7 @@ TEST_F(ArithmeticOptimizerTest, FoldTransposeIntoMatMul) {
     TF_EXPECT_OK(status);
 
     EXPECT_EQ(7, output.node_size());
-    EXPECT_EQ("matmul_fused", output.node(6).name());
+    EXPECT_EQ(OptimizedName("matmul_fused"), output.node(6).name());
     EXPECT_EQ("a", output.node(6).input(0));
     EXPECT_EQ("b", output.node(6).input(1));
     if (matmul_type == "BatchMatMul") {
@@ -518,7 +598,7 @@ TEST_F(ArithmeticOptimizerTest, FoldConjugateTransposeIntoBatchMatMul) {
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(11, output.node_size());
-  EXPECT_EQ("matmul_fused", output.node(10).name());
+  EXPECT_EQ(OptimizedName("matmul_fused"), output.node(10).name());
   EXPECT_EQ("a", output.node(10).input(0));
   EXPECT_EQ("b", output.node(10).input(1));
   EXPECT_TRUE(output.node(10).attr().at("adj_x").b());
@@ -964,10 +1044,11 @@ TEST_F(ArithmeticOptimizerTest, OptimizeCastMulTransposeConv) {
   NodeMap node_map(&output);
   const NodeDef* inputs_node = CHECK_NOTNULL(node_map.GetNode("Placeholder"));
   const NodeDef* transpose_node =
-      CHECK_NOTNULL(node_map.GetNode("Transpose_uint8"));
-  const NodeDef* cast_node = CHECK_NOTNULL(node_map.GetNode("Cast_new"));
+      CHECK_NOTNULL(node_map.GetNode(OptimizedName("Transpose_uint8")));
+  const NodeDef* cast_node =
+      CHECK_NOTNULL(node_map.GetNode(OptimizedName("Cast_new")));
   const NodeDef* weights_node =
-      CHECK_NOTNULL(node_map.GetNode("weights_scaled_Conv2D"));
+      CHECK_NOTNULL(node_map.GetNode(OptimizedName("weights_scaled_Conv2D")));
   const NodeDef* conv_node = CHECK_NOTNULL(node_map.GetNode("Conv2D"));
 
   EXPECT_EQ(output.node_size(), 7);
@@ -1011,11 +1092,11 @@ TEST_F(ArithmeticOptimizerTest, OptimizeMultipleMulTransposeConv) {
 
   NodeMap node_map(&output);
   const NodeDef* weights_node =
-      CHECK_NOTNULL(node_map.GetNode("weights_scaled_Conv2D"));
+      CHECK_NOTNULL(node_map.GetNode(OptimizedName("weights_scaled_Conv2D")));
   const NodeDef* conv_node = CHECK_NOTNULL(node_map.GetNode("Conv2D"));
 
   const NodeDef* weights_node_1 =
-      CHECK_NOTNULL(node_map.GetNode("weights_scaled_Conv2D_1"));
+      CHECK_NOTNULL(node_map.GetNode(OptimizedName("weights_scaled_Conv2D_1")));
   const NodeDef* conv_node_1 = CHECK_NOTNULL(node_map.GetNode("Conv2D_1"));
   EXPECT_EQ(conv_node->input(1), weights_node->name());
   EXPECT_EQ(conv_node_1->input(1), weights_node_1->name());
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index 8ae0d57068a4f9277ee3d5d040544c4eb7284272..59df49c245a26a4e062dd815f298cbc65aa752dc 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -30,13 +30,16 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/tensor_coding.h"
 #include "tensorflow/core/public/version.h"
 #include "tensorflow/core/util/bcast.h"
+#include "tensorflow/core/util/saved_tensor_slice_util.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -95,7 +98,38 @@ class DeviceSimple : public DeviceBase {
   std::unique_ptr<Eigen::ThreadPoolDevice> eigen_device_;
 };
 
+template <typename T>
+bool AllValuesAre(const TensorProto& tensor, const T& value) {
+  // TensorProto represents the content of the tensor in either <type>_val or
+  // tensor_content.
+  typename checkpoint::SaveTypeTraits<T>::RepeatedField* tensor_values =
+      checkpoint::MutableTensorProtoData<T>(const_cast<TensorProto*>(&tensor));
+  if (!tensor_values->empty()) {
+    for (const T& tensor_value : *tensor_values) {
+      if (tensor_value != value) {
+        return false;
+      }
+    }
+    return true;
+  }
+  const auto tensor_content_size = tensor.tensor_content().size();
+  if (tensor_content_size > 0) {
+    CHECK_EQ(0, tensor_content_size % sizeof(T));
+    std::vector<T> raw_values(tensor_content_size / sizeof(T));
+    port::CopyToArray(tensor.tensor_content(),
+                      reinterpret_cast<char*>(raw_values.data()));
+    for (int i = 0; i < tensor_content_size / sizeof(T); ++i) {
+      if (raw_values[i] != value) {
+        return false;
+      }
+    }
+    return true;
+  }
+  return false;
+}
+
 }  // namespace
+
 ConstantFolding::ConstantFolding(RewriterConfig::Toggle opt_level,
                                  DeviceBase* cpu_device)
     : opt_level_(opt_level), cpu_device_(cpu_device) {
@@ -190,23 +224,48 @@ Status ConvertShapeToConstant(const string& op, const DataType& type,
   return Status::OK();
 }
 
-Status ConstantFolding::MaterializeShapes(const GrapplerItem& item,
-                                          const GraphProperties& properties) {
+// TODO(rmlarsen): Perhaps we should move this to the GraphOptimizer base class.
+bool ConstantFolding::OptimizedNodeExists(const NodeDef& node,
+                                          StringPiece suffix) const {
+  return node_map_->NodeExists(OptimizedNodeName(node, suffix));
+}
+
+string ConstantFolding::OptimizedNodeName(const NodeDef& node) const {
+  return OptimizedNodeName(node, "");
+}
+string ConstantFolding::OptimizedNodeName(const NodeDef& node,
+                                          StringPiece suffix) const {
+  return AddPrefixToNodeName(strings::StrCat(node.name(), suffix),
+                             kConstantFoldingConst);
+}
+
+bool ConstantFolding::IsReallyConstant(const NodeDef& node) const {
+  if (!IsConstant(node)) {
+    return false;
+  }
+  // If the node is fed it's not constant anymore.
+  return feed_nodes_.find(node.name()) == feed_nodes_.end();
+}
+
+Status ConstantFolding::MaterializeShapes(const GraphProperties& properties) {
   // We may add some nodes to the graph to encode control dependencies: there is
   // no need to process these, so only iterate over the nodes of the input
   // graph.
-  const int node_count = graph_.node_size();
+  const int node_count = graph_->node_size();
   for (int i = 0; i < node_count; ++i) {
-    NodeDef& node = *graph_.mutable_node(i);
+    NodeDef& node = *graph_->mutable_node(i);
     const string op = node.op();
     if (op != "Shape" && op != "Size" && op != "Rank" && op != "ShapeN") {
       continue;
     }
 
-    std::vector<OpInfo::TensorProperties> output =
+    const std::vector<OpInfo::TensorProperties>& output =
         properties.GetOutputProperties(node.name());
-    std::vector<OpInfo::TensorProperties> input =
+    const std::vector<OpInfo::TensorProperties>& input =
         properties.GetInputProperties(node.name());
+    if (input.empty() || output.empty()) {
+      continue;
+    }
     if (op == "Shape" || op == "Size" || op == "Rank") {
       CHECK_EQ(1, output.size());
       CHECK_EQ(1, input.size());
@@ -241,7 +300,7 @@ Status ConstantFolding::MaterializeShapes(const GrapplerItem& item,
           // cases where the shape/rank/size would have been run in
           // the original graph. Additional inputs are extra control
           string ctrl_dep =
-              AddControlDependency(node.input(0), &graph_, node_map_.get());
+              AddControlDependency(node.input(0), graph_, node_map_.get());
           node.set_input(0, ctrl_dep);
           node_map_->AddOutput(NodeName(ctrl_dep), node.name());
         } else {
@@ -252,11 +311,10 @@ Status ConstantFolding::MaterializeShapes(const GrapplerItem& item,
               string node_name = ParseNodeName(output->input(k), &port);
               if (node_name == node.name() && port == j) {
                 // Create a const node as ShapeN's output if not already.
-                string const_name =
-                    AddPrefixToNodeName(strings::StrCat(node.name(), "-", j),
-                                        kConstantFoldingConst);
+                const string const_name =
+                    OptimizedNodeName(node, strings::StrCat("-", j));
                 if (node_map_->GetNode(const_name) == nullptr) {
-                  NodeDef* added_node = graph_.add_node();
+                  NodeDef* added_node = graph_->add_node();
                   added_node->set_name(const_name);
                   added_node->set_op("Const");
                   added_node->set_device(node.device());
@@ -267,7 +325,7 @@ Status ConstantFolding::MaterializeShapes(const GrapplerItem& item,
                   // We add a control dependency to the original ShapeN node,
                   // so that the node will only be run if all inputs of the
                   // original ShapeN node are run.
-                  string ctrl_dep = AddControlDependency(node.name(), &graph_,
+                  string ctrl_dep = AddControlDependency(node.name(), graph_,
                                                          node_map_.get());
                   *added_node->add_input() = ctrl_dep;
                   node_map_->AddOutput(NodeName(ctrl_dep), added_node->name());
@@ -285,6 +343,7 @@ Status ConstantFolding::MaterializeShapes(const GrapplerItem& item,
   return Status::OK();
 }
 
+namespace {
 bool ShapesEqual(const TensorShapeProto& shape1,
                  const TensorShapeProto& shape2) {
   if (shape1.unknown_rank() || shape2.unknown_rank()) {
@@ -297,11 +356,13 @@ bool ShapesEqual(const TensorShapeProto& shape1,
     if (shape1.dim(i).size() != shape2.dim(i).size()) {
       return false;
     }
+    if (shape1.dim(i).size() == -1 || shape2.dim(i).size() == -1) {
+      return false;
+    }
   }
   return true;
 }
 
-namespace {
 bool ExtractShape(const NodeDef& shape_node, const GraphProperties& properties,
                   BCast::Vec* shape, int64* min_id) {
   if (shape_node.op() == "Shape") {
@@ -344,11 +405,12 @@ Status ConstantFolding::MaterializeBroadcastGradientArgs(
   const NodeDef* shape_node1 = node_map_->GetNode(node.input(0));
   const NodeDef* shape_node2 = node_map_->GetNode(node.input(1));
   if (shape_node1 == nullptr ||
-      (shape_node1->op() != "Shape" && shape_node1->op() != "Const") ||
+      (shape_node1->op() != "Shape" && !IsReallyConstant(*shape_node1)) ||
       shape_node2 == nullptr ||
-      (shape_node2->op() != "Shape" && shape_node2->op() != "Const")) {
+      (shape_node2->op() != "Shape" && !IsReallyConstant(*shape_node2))) {
     return Status::OK();
   }
+
   int64 min_id = 0;
   BCast::Vec shape1;
   if (!ExtractShape(*shape_node1, properties, &shape1, &min_id)) {
@@ -375,6 +437,9 @@ Status ConstantFolding::MaterializeBroadcastGradientArgs(
   if (!bcast.IsValid()) {
     return Status::OK();
   }
+  // Beware: the reduction dimensions are valid iff we assume that two distinct
+  // symbolic dimensions can't be equal. This is often but not always true, so
+  // this optimization isn't safe.
   BCast::Vec reduce_dims[2];
   reduce_dims[0] = bcast.grad_x_reduce_idx();
   reduce_dims[1] = bcast.grad_y_reduce_idx();
@@ -388,25 +453,24 @@ Status ConstantFolding::MaterializeBroadcastGradientArgs(
       // which case there would be no reduction.
       out[j] = nullptr;
     } else {
-      string const_name = AddPrefixToNodeName(
-          strings::StrCat(node.name(), "-", j), kConstantFoldingConst);
+      string const_name = OptimizedNodeName(node, strings::StrCat("-", j));
       out[j] = node_map_->GetNode(const_name);
       if (out[j] == nullptr) {
-        out[j] = graph_.add_node();
+        out[j] = graph_->add_node();
         Tensor value(type, TensorShape({0}));
         *out[j] = CreateNodeDef(const_name, TensorValue(&value));
         out[j]->set_device(node.device());
         node_map_->AddNode(const_name, out[j]);
         string ctrl_dep =
-            AddControlDependency(node.name(), &graph_, node_map_.get());
+            AddControlDependency(node.name(), graph_, node_map_.get());
         *out[j]->add_input() = ctrl_dep;
         node_map_->AddOutput(NodeName(ctrl_dep), const_name);
       }
     }
   }
 
-  auto outputs = node_map_->GetOutputs(node.name());
-  for (const auto& output : outputs) {
+  const std::set<NodeDef*> outputs = node_map_->GetOutputs(node.name());
+  for (NodeDef* output : outputs) {
     for (int k = 0; k < output->input_size(); ++k) {
       int port;
       string node_name = ParseNodeName(output->input(k), &port);
@@ -426,13 +490,17 @@ Status ConstantFolding::MaterializeReductionIndices(
     return Status::OK();
   }
   const NodeDef* indices = node_map_->GetNode(node->input(1));
-  if (!indices || IsConstant(*indices)) {
+  if (!indices || IsReallyConstant(*indices)) {
     // The reduction indices are already constant, there's nothing to do.
     return Status::OK();
   }
 
-  const OpInfo::TensorProperties& input_prop =
-      properties.GetInputProperties(node->name())[0];
+  const std::vector<OpInfo::TensorProperties>& input_props =
+      properties.GetInputProperties(node->name());
+  if (input_props.size() != 2) {
+    return Status::OK();
+  }
+  const OpInfo::TensorProperties& input_prop = input_props[0];
   if (input_prop.shape().unknown_rank()) {
     // We can't do anything if we don't know the rank of the input.
     return Status::OK();
@@ -442,17 +510,31 @@ Status ConstantFolding::MaterializeReductionIndices(
     // Unexpected graph, don't try to change it.
     return Status::OK();
   }
-  const OpInfo::TensorProperties& output_prop =
-      properties.GetOutputProperties(node->name())[0];
+  const std::vector<OpInfo::TensorProperties>& output_props =
+      properties.GetOutputProperties(node->name());
+  if (output_props.size() != 1) {
+    return Status::OK();
+  }
+  const bool keep_dims =
+      node->attr().count("keep_dims") && node->attr().at("keep_dims").b();
+  const OpInfo::TensorProperties& output_prop = output_props[0];
   PartialTensorShape output_shape(output_prop.shape());
   if (output_shape.num_elements() != 1) {
     bool full_reduction = false;
     for (const NodeDef* fanout : node_map_->GetOutputs(node->name())) {
-      if (!IsReshape(*fanout)) {
-        continue;
+      if (!IsReshape(*fanout) && !keep_dims) {
+        // Depending on how it's setup, a full reduction will generate a tensor
+        // of shape [], [1], [1, 1], [1, 1, ...]. If keep_dims isn't true, we
+        // rely on the existence of a reshape node following the reduction to
+        // ensure that the fanout is fed a scalar of the right shape.
+        return Status::OK();
+      }
+      const std::vector<OpInfo::TensorProperties>& reshape_props =
+          properties.GetOutputProperties(fanout->name());
+      if (reshape_props.size() != 1) {
+        return Status::OK();
       }
-      const OpInfo::TensorProperties& reshape_prop =
-          properties.GetOutputProperties(fanout->name())[0];
+      const OpInfo::TensorProperties& reshape_prop = reshape_props[0];
       PartialTensorShape shape(reshape_prop.shape());
       if (shape.num_elements() != 1) {
         return Status::OK();
@@ -465,21 +547,18 @@ Status ConstantFolding::MaterializeReductionIndices(
     }
   }
 
-  const OpInfo::TensorProperties& reduction_prop =
-      properties.GetInputProperties(node->name())[1];
+  const OpInfo::TensorProperties& reduction_prop = input_props[1];
   DataType dtype = reduction_prop.dtype();
   if (dtype != DT_INT32 && dtype != DT_INT64) {
     return Status::OK();
   }
   // We know it's a full reduction. We can generate the set of indices to
   // reduce.
-  string const_name =
-      AddPrefixToNodeName(strings::StrCat(node->name(), "-reduction_indices"),
-                          kConstantFoldingConst);
+  string const_name = OptimizedNodeName(*node, "-reduction_indices");
   if (node_map_->GetNode(const_name)) {
     return Status::OK();
   }
-  NodeDef* reduction_indices = graph_.add_node();
+  NodeDef* reduction_indices = graph_->add_node();
   Tensor value(dtype, TensorShape({rank}));
   for (int i = 0; i < rank; ++i) {
     if (dtype == DT_INT32) {
@@ -491,7 +570,7 @@ Status ConstantFolding::MaterializeReductionIndices(
   *reduction_indices = CreateNodeDef(const_name, TensorValue(&value));
   reduction_indices->set_device(node->device());
   string ctrl_dep =
-      AddControlDependency(node->input(1), &graph_, node_map_.get());
+      AddControlDependency(node->input(1), graph_, node_map_.get());
   *reduction_indices->add_input() = ctrl_dep;
   node_map_->AddNode(const_name, reduction_indices);
   node_map_->AddOutput(NodeName(ctrl_dep), const_name);
@@ -504,12 +583,13 @@ Status ConstantFolding::MaterializeReductionIndices(
 }
 
 Status ConstantFolding::MaterializeConstants(
-    const GrapplerItem& item, const GraphProperties& properties) {
-  const int node_count = graph_.node_size();
+    const GraphProperties& properties) {
+  const bool is_aggressive = opt_level_ == RewriterConfig::AGGRESSIVE;
+  const int node_count = graph_->node_size();
   for (int i = 0; i < node_count; ++i) {
-    NodeDef& node = *graph_.mutable_node(i);
+    NodeDef& node = *graph_->mutable_node(i);
     const string& op = node.op();
-    if (op == "BroadcastGradientArgs") {
+    if (is_aggressive && op == "BroadcastGradientArgs") {
       TF_RETURN_IF_ERROR(MaterializeBroadcastGradientArgs(node, properties));
     } else if (IsReduction(node)) {
       TF_RETURN_IF_ERROR(MaterializeReductionIndices(&node, properties));
@@ -523,24 +603,23 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
   if (node.input().empty()) {
     return false;
   }
-
   // Skips nodes that must be preserved except whitelisted nodes.
   if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end() &&
       nodes_whitelist_.find(node.name()) == nodes_whitelist_.end()) {
     return false;
   }
-
-  // Skips ops that don't benefit from folding.
-  const string& op = node.op();
-  // Skip constants, they're already folded
-  if (op == "Const") {
+  // Skip control flow nodes, they can't be folded
+  if (ModifiesFrameInfo(node)) {
     return false;
   }
-  // Skip constrol flow nodes, they can't be folded
-  if (op == "Enter" || op == "RefEnter" || op == "Exit" || op == "RefExit" ||
-      op == "NextIteration" || op == "RefNextIteration") {
+  // Skip constants, they're already folded
+  if (IsConstant(node)) {
     return false;
   }
+
+  // Skips ops that don't benefit from folding.
+  const string& op = node.op();
+
   if (op.find("Placeholder") == 0) {
     return false;
   }
@@ -594,7 +673,7 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
     if (!input_node) {
       return false;
     }
-    bool is_const = IsConstant(*input_node);
+    bool is_const = IsReallyConstant(*input_node);
     if (!is_const && !is_merge) {
       return false;
     }
@@ -612,6 +691,37 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
   return true;
 }
 
+namespace {
+
+#define SET_TENSOR_VAL_CASE(DTYPE, TYPE, NAME)     \
+  case DTYPE:                                      \
+    t->add_##NAME##_val(static_cast<TYPE>(value)); \
+    break;
+
+Status CreateConstantTensorAttrValue(DataType type, double value,
+                                     const TensorShapeProto& shape,
+                                     AttrValue* attr_tensor) {
+  TensorProto* t = attr_tensor->mutable_tensor();
+  t->set_dtype(type);
+  *t->mutable_tensor_shape() = shape;
+  switch (type) {
+    SET_TENSOR_VAL_CASE(DT_FLOAT, float, float);
+    SET_TENSOR_VAL_CASE(DT_DOUBLE, double, double);
+    SET_TENSOR_VAL_CASE(DT_INT64, int64, int64);
+    SET_TENSOR_VAL_CASE(DT_INT32, int32, int);
+    SET_TENSOR_VAL_CASE(DT_INT16, int32, int);
+    SET_TENSOR_VAL_CASE(DT_INT8, int32, int);
+    SET_TENSOR_VAL_CASE(DT_UINT8, int32, int);
+    SET_TENSOR_VAL_CASE(DT_BOOL, bool, bool);
+    default:
+      return errors::InvalidArgument("Unsupported type: ", type);
+  }
+  return Status::OK();
+}
+
+#undef SET_TENSOR_CAL_CASE
+}  // namespace
+
 // static
 NodeDef ConstantFolding::CreateNodeDef(const string& name,
                                        const TensorValue& tensor) {
@@ -652,6 +762,14 @@ NodeDef ConstantFolding::CreateNodeDef(const string& name,
       POPULATE_TENSOR_PROTO(tensor, t, int64, int64)
     } else if (tensor->dtype() == DT_INT32) {
       POPULATE_TENSOR_PROTO(tensor, t, int32, int)
+    } else if (tensor->dtype() == DT_INT16) {
+      POPULATE_TENSOR_PROTO(tensor, t, int16, int)
+    } else if (tensor->dtype() == DT_INT8) {
+      POPULATE_TENSOR_PROTO(tensor, t, int8, int)
+    } else if (tensor->dtype() == DT_UINT8) {
+      POPULATE_TENSOR_PROTO(tensor, t, uint8, int)
+    } else if (tensor->dtype() == DT_BOOL) {
+      POPULATE_TENSOR_PROTO(tensor, t, bool, bool)
     }
   }
   if (optimized) {
@@ -720,7 +838,7 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
       break;
     }
     const NodeDef* input_node = node_map_->GetNode(input);
-    if (!IsConstant(*input_node)) {
+    if (!IsReallyConstant(*input_node)) {
       return Status(error::INVALID_ARGUMENT,
                     strings::StrCat("Can't fold ", node.name(), ", its ", input,
                                     " isn't constant"));
@@ -737,7 +855,7 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
   }
 
   for (size_t i = 0; i < output_tensors.size(); i++) {
-    string node_name = AddPrefixToNodeName(node.name(), kConstantFoldingConst);
+    string node_name = OptimizedNodeName(node, "");
     if (output_tensors.size() > 1) {
       node_name = strings::StrCat(node_name, "-", i);
     }
@@ -774,7 +892,7 @@ Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph) {
         continue;
       }
       NodeDef* input_node = node_map_->GetNode(input);
-      if (!IsConstant(*input_node)) {
+      if (!IsReallyConstant(*input_node)) {
         continue;
       }
       bool valid_input = true;
@@ -789,10 +907,8 @@ Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph) {
         continue;
       }
 
-      string const_out_name =
-          AddPrefixToNodeName(node->name(), kConstantFoldingConst);
-      string const_index_name = AddPrefixToNodeName(
-          strings::StrCat(node->name(), "_index"), kConstantFoldingConst);
+      string const_out_name = OptimizedNodeName(*node);
+      string const_index_name = OptimizedNodeName(*node, "_index");
       if (node_map_->GetNode(const_out_name) ||
           node_map_->GetNode(const_index_name)) {
         // Intended name already exists.
@@ -955,8 +1071,8 @@ Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph) {
 Status ConstantFolding::FoldGraph(GraphDef* output) {
   std::unordered_set<string> processed_nodes;
   std::deque<NodeDef*> queue;
-  for (int i = 0; i < graph_.node_size(); i++) {
-    auto node = graph_.mutable_node(i);
+  for (int i = 0; i < graph_->node_size(); i++) {
+    auto node = graph_->mutable_node(i);
     if (IsFoldable(*node)) {
       queue.push_back(node);
     }
@@ -969,6 +1085,7 @@ Status ConstantFolding::FoldGraph(GraphDef* output) {
     }
     // We need to record a copy of output nodes before FoldNode() modifies it.
     std::set<NodeDef*> outputs = node_map_->GetOutputs(node->name());
+
     Status s = FoldNode(node, output);
     processed_nodes.insert(node->name());
     if (!s.ok()) {
@@ -995,7 +1112,7 @@ Status ConstantFolding::FoldGraph(GraphDef* output) {
   output->mutable_node()->DeleteSubrange(last + 1,
                                          output->node_size() - last - 1);
 
-  for (const auto& node : graph_.node()) {
+  for (const auto& node : graph_->node()) {
     // If no fetch nodes is provided, we conservatively
     // keep all nodes in the original graph in case users need to fetch
     // their values.
@@ -1016,7 +1133,7 @@ bool ConstantFolding::IsSimplifiableReduction(const NodeDef& node) const {
   if (IsReduction(node)) {
     CHECK_LE(2, node.input_size());
     const NodeDef* reductions_indices = node_map_->GetNode(node.input(1));
-    if (IsConstant(*reductions_indices)) {
+    if (IsReallyConstant(*reductions_indices)) {
       TensorVector output;
       Status s = EvaluateNode(*reductions_indices, TensorVector(), &output);
       if (!s.ok()) {
@@ -1040,7 +1157,7 @@ bool ConstantFolding::IsSimplifiableReshape(
   }
   CHECK_LE(2, node.input_size());
   const NodeDef* new_shape = node_map_->GetNode(node.input(1));
-  if (!IsConstant(*new_shape)) {
+  if (!IsReallyConstant(*new_shape)) {
     return false;
   }
   TensorVector outputs;
@@ -1090,49 +1207,331 @@ bool ConstantFolding::IsSimplifiableReshape(
   return shape.IsCompatibleWith(new_dims);
 }
 
+#define IS_VALUE_CASE(DTYPE, VALUE)                   \
+  case DTYPE:                                         \
+    return AllValuesAre<EnumToDataType<DTYPE>::Type>( \
+        node.attr().at("value").tensor(), EnumToDataType<DTYPE>::Type(VALUE))
+
+#define IS_ONES_CASE(TYPE) IS_VALUE_CASE(TYPE, 1)
+#define IS_ZEROS_CASE(TYPE) IS_VALUE_CASE(TYPE, 0)
+
+bool ConstantFolding::IsOnes(const NodeDef& node) const {
+  if (feed_nodes_.find(node.name()) != feed_nodes_.end()) {
+    return false;
+  }
+  if (node.op() == "OnesLike") {
+    return true;
+  }
+  if (node.op() != "Const") {
+    return false;
+  }
+  const auto dtype = node.attr().at("dtype").type();
+  switch (dtype) {
+    //    IS_ONES_CASE(DT_HALF);
+    IS_ONES_CASE(DT_FLOAT);
+    IS_ONES_CASE(DT_DOUBLE);
+    IS_ONES_CASE(DT_UINT8);
+    IS_ONES_CASE(DT_INT8);
+    IS_ONES_CASE(DT_UINT16);
+    IS_ONES_CASE(DT_INT16);
+    IS_ONES_CASE(DT_INT32);
+    IS_ONES_CASE(DT_INT64);
+    IS_ONES_CASE(DT_COMPLEX64);
+    IS_ONES_CASE(DT_COMPLEX128);
+    default:
+      LOG(ERROR) << "Unexpected type " << DataTypeString(dtype);
+      return false;
+  }
+  return false;
+}
+
+bool ConstantFolding::IsZeros(const NodeDef& node) const {
+  if (feed_nodes_.find(node.name()) != feed_nodes_.end()) {
+    return false;
+  }
+  if (node.op() == "ZerosLike") {
+    return true;
+  }
+  if (!IsConstant(node)) {
+    return false;
+  }
+  const auto dtype = node.attr().at("dtype").type();
+  switch (dtype) {
+    //    IS_ZEROS_CASE(DT_HALF);
+    IS_ZEROS_CASE(DT_FLOAT);
+    IS_ZEROS_CASE(DT_DOUBLE);
+    IS_ZEROS_CASE(DT_UINT8);
+    IS_ZEROS_CASE(DT_INT8);
+    IS_ZEROS_CASE(DT_UINT16);
+    IS_ZEROS_CASE(DT_INT16);
+    IS_ZEROS_CASE(DT_INT32);
+    IS_ZEROS_CASE(DT_INT64);
+    IS_ZEROS_CASE(DT_COMPLEX64);
+    IS_ZEROS_CASE(DT_COMPLEX128);
+    default:
+      LOG(ERROR) << "Unexpected type " << DataTypeString(dtype);
+      return false;
+  }
+  return false;
+}
+
+void ConstantFolding::ReplaceOperationWithIdentity(int input_to_forward,
+                                                   NodeDef* node) {
+  node->set_op("Identity");
+  // Propagate the designated input through the identity.
+  node->mutable_input()->SwapElements(0, input_to_forward);
+  // Add all other inputs as control dependencies.
+  for (int i = 1; i < node->input_size(); ++i) {
+    node->set_input(i, AsControlDependency(node->input(i)));
+  }
+  graph_modified_ = true;
+}
+
+void ConstantFolding::ReplaceDivisionOfOnesByReciprocal(NodeDef* node) {
+  node->set_op("Reciprocal");
+  node->mutable_input()->SwapElements(0, 1);
+  node->set_input(1, AsControlDependency(node->input(1)));
+  graph_modified_ = true;
+}
+
+Status ConstantFolding::ReplaceOperationWithConstant(
+    double value, const TensorShapeProto& shape, NodeDef* node) {
+  AttrValue tensor_attr;
+  AttrValue dtype_attr = node->attr().at("T");
+  TF_RETURN_IF_ERROR(CreateConstantTensorAttrValue(dtype_attr.type(), value,
+                                                   shape, &tensor_attr));
+  node->clear_attr();
+  node->mutable_attr()->insert({"dtype", dtype_attr});
+  node->mutable_attr()->insert({"value", tensor_attr});
+  node->set_op("Const");
+  // Convert all inputs to control dependencies.
+  for (int i = 0; i < node->input_size(); ++i) {
+    if (IsControlInput(node->input(i))) {
+      break;
+    }
+    node->set_input(i, AsControlDependency(node->input(i)));
+  }
+  graph_modified_ = true;
+  return Status::OK();
+}
+
 Status ConstantFolding::SimplifyGraph(GraphDef* output,
-                                      const GraphProperties& properties) {
-  for (auto& node : *output->mutable_node()) {
-    if (IsSimplifiableReduction(node)) {
+                                      const GraphProperties& properties,
+                                      bool use_shape_info) {
+  const bool is_aggressive = opt_level_ == RewriterConfig::AGGRESSIVE;
+  for (int i = 0; i < output->node_size(); ++i) {
+    NodeDef* node = output->mutable_node(i);
+    if (IsSimplifiableReduction(*node)) {
       // Replace the reduction node with an identity node, that can be further
       // optimized by the model pruner.
-      const NodeDef* reductions_indices = node_map_->GetNode(node.input(1));
       DataType output_type;
-      if (node.attr().count("T") > 0) {
-        output_type = node.attr().at("T").type();
+      if (node->attr().count("T") > 0) {
+        output_type = node->attr().at("T").type();
       } else {
         // This is an 'any' or 'all' reduction. The output is always boolean.
         output_type = DT_BOOL;
       }
-      node.set_op("Identity");
-      node.clear_attr();
-      (*node.mutable_attr())["T"].set_type(output_type);
-      if (node.input_size() > 2) {
-        node.mutable_input()->SwapElements(1, node.input_size() - 1);
+      node->set_op("Identity");
+      node->clear_attr();
+      (*node->mutable_attr())["T"].set_type(output_type);
+      *node->mutable_input(1) = AsControlDependency(node->input(1));
+      graph_modified_ = true;
+      continue;
+    }
+    const bool safe_to_use_shapes =
+        use_shape_info && (feed_nodes_.empty() || is_aggressive);
+    if (safe_to_use_shapes && IsSimplifiableReshape(*node, properties)) {
+      DataType output_type = node->attr().at("T").type();
+      node->set_op("Identity");
+      node->clear_attr();
+      (*node->mutable_attr())["T"].set_type(output_type);
+      *node->mutable_input(1) = AsControlDependency(node->input(1));
+      graph_modified_ = true;
+      continue;
+    }
+
+    const bool is_mul = IsMul(*node);
+    const bool is_matmul = IsMatMul(*node);
+    const bool is_add = IsAdd(*node) || IsBiasAdd(*node);
+    const bool is_sub = IsSub(*node);
+    const bool is_any_div = IsAnyDiv(*node);
+    // Simplify arithmetic operations with ones or zeros.
+    if (safe_to_use_shapes &&
+        (is_mul || is_matmul || is_add || is_sub || is_any_div) &&
+        properties.HasInputProperties(node->name()) &&
+        properties.HasOutputProperties(node->name())) {
+      const NodeDef* x = node_map_->GetNode(node->input(0));
+      const NodeDef* y = node_map_->GetNode(node->input(1));
+      if (x == nullptr || y == nullptr) {
+        return errors::InvalidArgument("Invalid inputs to node: ",
+                                       node->DebugString());
+      }
+      const TensorShapeProto& output_shape =
+          properties.GetOutputProperties(node->name())[0].shape();
+
+      // Simplify element-wise multiplication by ones or addition/subtraction
+      // of zeros.
+      const TensorShapeProto& y_shape =
+          properties.GetInputProperties(node->name())[1].shape();
+      const bool x_is_zero = IsZeros(*x);
+      const bool x_is_one = IsOnes(*x);
+      const bool y_matches_output_shape = ShapesEqual(output_shape, y_shape);
+      if (y_matches_output_shape &&
+          ((is_mul && x_is_one) || (is_add && x_is_zero))) {
+        // TODO(rmlarsen): Handle subtraction 0 - y.
+        // 1 * y = y or 0 + y = y.
+        ReplaceOperationWithIdentity(1, node);
+        continue;
       }
-      node.mutable_input()->RemoveLast();
-      for (const auto& input : reductions_indices->input()) {
-        DCHECK(IsControlInput(input));
-        *node.add_input() = input;
+
+      // Replace 1 / y with Reciprocal op.
+      if (y_matches_output_shape && is_any_div && x_is_one) {
+        DataType type = node->attr().at("T").type();
+        if (DataTypeIsFloating(type) || DataTypeIsComplex(type)) {
+          ReplaceDivisionOfOnesByReciprocal(node);
+          continue;
+        }
       }
+
+      const TensorShapeProto& x_shape =
+          properties.GetInputProperties(node->name())[0].shape();
+      const bool y_is_zero = IsZeros(*y);
+      const bool y_is_one = IsOnes(*y);
+      const bool x_matches_output_shape = ShapesEqual(output_shape, x_shape);
+      if (x_matches_output_shape &&
+          (((is_mul || is_any_div) && y_is_one) ||
+           ((is_add || is_sub) && y_is_zero && is_aggressive))) {
+        // x * 1 = x or x / 1 = x or x +/- 0 = x
+        ReplaceOperationWithIdentity(0, node);
+        continue;
+      }
+
+      // Simplify multiplication and matmul by zeros.
+      // Also optimize zeros divided by a tensor, but only if we are in
+      // aggressive mode, since we might get rid of divisions by zero.
+      bool optimize_zeros_divided_by_y =
+          is_any_div && x_is_zero && is_aggressive;
+      if ((x_is_zero || y_is_zero) &&
+          (is_mul || is_matmul || optimize_zeros_divided_by_y)) {
+        const PartialTensorShape shp(output_shape);
+        if (shp.IsFullyDefined()) {
+          TF_RETURN_IF_ERROR(
+              ReplaceOperationWithConstant(0, output_shape, node));
+          continue;
+        }
+        // Even if an input shape is only partially known, we may known that it
+        // matches the output shape and thus forward the corresponding zero
+        // input.
+        if ((is_mul || is_any_div) && x_is_zero && x_matches_output_shape) {
+          ReplaceOperationWithIdentity(0, node);
+          continue;
+        } else if (is_mul && y_is_zero && y_matches_output_shape) {
+          ReplaceOperationWithIdentity(1, node);
+          continue;
+        }
+      }
+    }
+
+    // Strength reduce floating point division by a constant Div(x, const) to
+    // multiplication by the reciprocal Mul(x, Reciprocal(const)). This in turn
+    // will be constant folded to Mul(x, 1.0/const).
+    if (node->input_size() >= 2 && (IsRealDiv(*node) || IsDiv(*node))) {
+      const string& const_input = node->input(1);
+      const NodeDef* denom = node_map_->GetNode(const_input);
+      CHECK(denom != nullptr);
+      if (!IsReallyConstant(*denom)) {
+        continue;
+      }
+      if (node->attr().count("T") == 0) {
+        continue;
+      }
+      DataType type = node->attr().at("T").type();
+      if (IsDiv(*node) &&
+          !(DataTypeIsFloating(type) || DataTypeIsComplex(type))) {
+        continue;
+      }
+      // Insert new reciprocal op and change node from Div to Mul.
+      NodeDef* reciprocal_node = output->add_node();
+      reciprocal_node->set_name(AddPrefixToNodeName(
+          strings::StrCat(node->name(), "_recip"), kConstantFoldingConst));
+      reciprocal_node->set_op("Reciprocal");
+      reciprocal_node->set_device(node->device());
+      node->set_op("Mul");
+      // Re-wire inputs and outputs.
+      reciprocal_node->add_input(const_input);
+      (*reciprocal_node->mutable_attr())["T"].set_type(type);
+      node->set_input(1, reciprocal_node->name());
+      node_map_->AddNode(reciprocal_node->name(), reciprocal_node);
+      node_map_->UpdateInput(node->name(), const_input,
+                             reciprocal_node->name());
+      node_map_->AddOutput(NodeName(const_input), reciprocal_node->name());
+      graph_modified_ = true;
     }
-    // It's possible to feed a placeholder with a tensor that doesn't have the
-    // proper shape, and reshape this tensor later on. Therefore only remove
-    // reshapes in graphs that don't have placeholders.
-    if (IsSimplifiableReshape(node, properties)) {
-      const NodeDef* new_shape = node_map_->GetNode(node.input(1));
-      DataType output_type = node.attr().at("T").type();
-      node.set_op("Identity");
-      node.clear_attr();
-      (*node.mutable_attr())["T"].set_type(output_type);
-      if (node.input_size() > 2) {
-        node.mutable_input()->SwapElements(1, node.input_size() - 1);
+
+    // Consider the transformation
+    //
+    //                      +                +       = parent
+    //                     / \              / \
+    //                  Const +    -- >    X   +     = children
+    //                       / \              / \
+    //                      X   Y          Const Y   = leaves
+    //
+    // where '+' denotes an associative and commutative operator like addition
+    // or multiplication. This optimization pushes constants down in the tree
+    // to canonicalize it. Moreoever, in cases where the child node has a
+    // constant input we will create a node that can be folded, e.g.
+    //
+    //    Add(C1, Add(C2, X)) -> Add(X, Add(C1, C2)) -> Add(X, C1 + C2)
+    //
+    // TODO(rmlarsen): Handle non-associative/non-commutative operators like
+    // subtraction and division, as well as mixed subtraction/addition,
+    // division/multiplication.
+    if ((is_add || is_mul) && NumNonControlInputs(*node) == 2) {
+      NodeDef* left_child = node_map_->GetNode(node->input(0));
+      NodeDef* right_child = node_map_->GetNode(node->input(1));
+      // One child must be constant, and the other the same op as the parent.
+      if (node->op() != left_child->op() && node->op() != right_child->op()) {
+        continue;
+      }
+      const bool left_child_is_constant = IsReallyConstant(*left_child);
+      const bool right_child_is_constant = IsReallyConstant(*right_child);
+      if (!left_child_is_constant && !right_child_is_constant) {
+        continue;
+      }
+      if (node->device() != left_child->device() ||
+          node->device() != right_child->device()) {
+        continue;
+      }
+      NodeDef* child_node = left_child_is_constant ? right_child : left_child;
+      // Make sure that it is safe to change the value of the child node->
+      if (child_node->input_size() < 2 ||
+          NumNonControlOutputs(*child_node, *node_map_) > 1 || !has_fetch_ ||
+          nodes_to_preserve_.find(child_node->name()) !=
+              nodes_to_preserve_.end()) {
+        continue;
       }
-      node.mutable_input()->RemoveLast();
-      for (const auto& input : new_shape->input()) {
-        DCHECK(IsControlInput(input));
-        *node.add_input() = input;
+
+      // Identify the nodes to swap.
+      const NodeDef* left_leaf = node_map_->GetNode(child_node->input(0));
+      const NodeDef* right_leaf = node_map_->GetNode(child_node->input(1));
+      const bool left_leaf_is_constant = IsReallyConstant(*left_leaf);
+      const bool right_leaf_is_constant = IsReallyConstant(*right_leaf);
+      if (left_leaf_is_constant && right_leaf_is_constant) {
+        // Child is already foldable, leave it alone.
+        continue;
       }
+      const int non_const_leaf_input = left_leaf_is_constant ? 1 : 0;
+      const int parent_const_input = left_child_is_constant ? 0 : 1;
+
+      // Swap the constant child with a non-constant leaf node.
+      node_map_->UpdateInput(node->name(), node->input(parent_const_input),
+                             child_node->input(non_const_leaf_input));
+      node_map_->UpdateInput(child_node->name(),
+                             child_node->input(non_const_leaf_input),
+                             node->input(parent_const_input));
+      std::swap(*node->mutable_input(parent_const_input),
+                *child_node->mutable_input(non_const_leaf_input));
+      graph_modified_ = true;
     }
   }
   return Status::OK();
@@ -1141,7 +1540,7 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
 Status ConstantFolding::RunOptimizationPass(Cluster* cluster,
                                             const GrapplerItem& item,
                                             GraphDef* output) {
-  node_map_.reset(new NodeMap(&graph_));
+  node_map_.reset(new NodeMap(graph_));
   nodes_whitelist_.clear();
   // Fold fetch nodes iff it has a single fanout. Note that if a fetch node
   // has a single fanout, it would be rewritten as a constant with the same
@@ -1158,31 +1557,30 @@ Status ConstantFolding::RunOptimizationPass(Cluster* cluster,
   }
 
   GraphProperties properties(item);
-  Status s = properties.InferStatically();
-  bool has_feed = !item.feed.empty();
+  // It's possible to feed a placeholder with a tensor of any shape: make sure
+  // that the shape inference deals with this conservatively unless we're in
+  // aggressive mode.
+  const bool assume_valid_feeds = opt_level_ == RewriterConfig::AGGRESSIVE;
+  Status s = properties.InferStatically(assume_valid_feeds);
+  const bool can_use_shape_info = s.ok();
 
-  if (!has_feed && s.ok()) {
-    // Only use static shape information when there is no feed in the
-    // graph. That's because it's possible to feed a placeholder with a tensor
-    // of any shape, which could make the static information inconsistent with
-    // the shapes actually fed.
-    TF_RETURN_IF_ERROR(MaterializeShapes(item, properties));
-  }
-  if (opt_level_ == RewriterConfig::AGGRESSIVE && s.ok()) {
-    TF_RETURN_IF_ERROR(MaterializeConstants(item, properties));
+  if (can_use_shape_info) {
+    TF_RETURN_IF_ERROR(MaterializeShapes(properties));
+    TF_RETURN_IF_ERROR(MaterializeConstants(properties));
   }
 
   TF_RETURN_IF_ERROR(FoldGraph(output));
-
-  if (!has_feed && s.ok()) {
-    TF_RETURN_IF_ERROR(SimplifyGraph(output, properties));
-  }
+  node_map_.reset(new NodeMap(output));
+  TF_RETURN_IF_ERROR(SimplifyGraph(output, properties, can_use_shape_info));
   return Status::OK();
 }
 
 Status ConstantFolding::Optimize(Cluster* cluster, const GrapplerItem& item,
                                  GraphDef* output) {
   nodes_to_preserve_ = item.NodesToPreserve();
+  for (const auto& feed : item.feed) {
+    feed_nodes_.insert(NodeName(feed.first));
+  }
 
   if (cpu_device_ == nullptr) {
     owned_device_.reset(new DeviceSimple());
@@ -1195,13 +1593,13 @@ Status ConstantFolding::Optimize(Cluster* cluster, const GrapplerItem& item,
   *output = item.graph;
   int64 node_count;
   do {
-    graph_.Swap(output);
-    item_to_optimize.graph = graph_;
+    graph_modified_ = false;
+    item_to_optimize.graph.Swap(output);
+    graph_ = &item_to_optimize.graph;
     *output = GraphDef();
-    node_count = graph_.node_size();
+    node_count = graph_->node_size();
     TF_RETURN_IF_ERROR(RunOptimizationPass(cluster, item_to_optimize, output));
-  } while (output->node_size() != node_count);
-
+  } while (graph_modified_ || output->node_size() != node_count);
   *output->mutable_library() = item.graph.library();
   *output->mutable_versions() = item.graph.versions();
 
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.h b/tensorflow/core/grappler/optimizers/constant_folding.h
index f04f413c10a7e8e19520cc462f88b2a9a2d0fecd..87f275c1c0037612deabcbcda968b0258d37d081 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.h
+++ b/tensorflow/core/grappler/optimizers/constant_folding.h
@@ -51,16 +51,20 @@ class ConstantFolding : public GraphOptimizer {
                 const GraphDef& optimize_output, double result) override;
 
  private:
-  Status MaterializeShapes(const GrapplerItem& item,
-                           const GraphProperties& properties);
+  string OptimizedNodeName(const NodeDef& node, StringPiece suffix) const;
+  string OptimizedNodeName(const NodeDef& node) const;
+  bool OptimizedNodeExists(const NodeDef& node, StringPiece suffix) const;
+
+  bool IsReallyConstant(const NodeDef& node) const;
+
+  Status MaterializeShapes(const GraphProperties& properties);
 
   Status MaterializeBroadcastGradientArgs(const NodeDef& node,
                                           const GraphProperties& properties);
   Status MaterializeReductionIndices(NodeDef* node,
                                      const GraphProperties& properties);
 
-  Status MaterializeConstants(const GrapplerItem& item,
-                              const GraphProperties& properties);
+  Status MaterializeConstants(const GraphProperties& properties);
   bool IsFoldable(const NodeDef& node) const;
 
   Status EvaluateNode(const NodeDef& node,
@@ -72,12 +76,20 @@ class ConstantFolding : public GraphOptimizer {
 
   Status FoldNode(NodeDef* node, GraphDef* output_graph);
 
+  bool IsOnes(const NodeDef& node) const;
+  bool IsZeros(const NodeDef& node) const;
+  void ReplaceOperationWithIdentity(int input_to_forward, NodeDef* node);
+  Status ReplaceOperationWithConstant(double value,
+                                      const TensorShapeProto& shape,
+                                      NodeDef* node);
+  void ReplaceDivisionOfOnesByReciprocal(NodeDef* node);
   Status FoldGraph(GraphDef* output);
 
   bool IsSimplifiableReduction(const NodeDef& node) const;
   bool IsSimplifiableReshape(const NodeDef& node,
                              const GraphProperties& properties) const;
-  Status SimplifyGraph(GraphDef* output, const GraphProperties& properties);
+  Status SimplifyGraph(GraphDef* output, const GraphProperties& properties,
+                       bool use_shape_info);
 
   Status RunOptimizationPass(Cluster* cluster, const GrapplerItem& item,
                              GraphDef* output);
@@ -88,11 +100,13 @@ class ConstantFolding : public GraphOptimizer {
   std::unique_ptr<DeviceBase> owned_device_;
 
   std::unique_ptr<ResourceMgr> resource_mgr_;
-  GraphDef graph_;
+  GraphDef* graph_;
   std::unique_ptr<NodeMap> node_map_;
   std::unordered_set<string> nodes_to_preserve_;
   std::unordered_set<string> nodes_whitelist_;
+  std::unordered_set<string> feed_nodes_;
   bool has_fetch_;
+  bool graph_modified_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index b2d9b02c68358fc3e22881bba60a34feb3d4211e..a3b3e522eb8c3b00bba0289bb4da9eca32e2b435 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -77,11 +77,483 @@ TEST_F(ConstantFoldingTest, SimpleFolding) {
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
 
+TEST_F(ConstantFoldingTest, AddTree) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  Output c1 = ops::Const(s.WithOpName("c1"), 2.0f, {1});
+  Output c2 = ops::Const(s.WithOpName("c2"), 2.0f, {2});
+  Output c4 = ops::Const(s.WithOpName("c4"), 4.0f, {2});
+  Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                              ops::Placeholder::Shape(TensorShape({2, 2})));
+  Output add_child = ops::Add(s.WithOpName("add_child"), c2, x);
+  Output add_parent = ops::Add(s.WithOpName("add_parent"), c1, add_child);
+  Output mul_child = ops::Mul(s.WithOpName("mul_child"), c2, x);
+  Output mul_parent = ops::Mul(s.WithOpName("mul_parent"), c1, mul_child);
+  Output addmul_child = ops::Add(s.WithOpName("addmul_child"), c2, x);
+  Output addmul_parent =
+      ops::Mul(s.WithOpName("addmul_parent"), c1, addmul_child);
+
+  GrapplerItem item;
+  item.fetch = {"add_parent", "mul_parent", "addmul_parent"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ConstantFolding fold(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = fold.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(9, output.node_size());
+
+  // We expect the following rewrite(s) to occur (for both Add and Mul):
+  //    +                +             +
+  //   / \              / \           / \
+  // 2.0   +     -->   x   +    -->  x  4.0
+  //      / \             / \
+  //    2.0  x          2.0 2.0
+
+  for (const auto& node : output.node()) {
+    if (node.name() == "add_child") {
+      EXPECT_EQ("Const", node.op());
+      TensorProto t = node.attr().at("value").tensor();
+      EXPECT_EQ(1, t.tensor_shape().dim_size());
+      EXPECT_EQ(2, t.tensor_shape().dim(0).size());
+    } else if (node.name() == "add_parent") {
+      EXPECT_EQ("Add", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("add_child", node.input(1));
+    } else if (node.name() == "mul_child") {
+      EXPECT_EQ("Const", node.op());
+      TensorProto t = node.attr().at("value").tensor();
+      EXPECT_EQ(1, t.tensor_shape().dim_size());
+      EXPECT_EQ(2, t.tensor_shape().dim(0).size());
+    } else if (node.name() == "mul_parent") {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("mul_child", node.input(1));
+    } else if (node.name() == "addmul_child") {
+      // Unchanged.
+      EXPECT_EQ("Add", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("c2", node.input(0));
+      EXPECT_EQ("x", node.input(1));
+    }
+  }
+
+  // Check that the reciprocals have the expected value.
+  std::vector<string> fetch = {"c4"};
+  auto tensor_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(fetch.size(), tensor_expected.size());
+  fetch = {"add_child", "mul_child"};
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(fetch.size(), tensors.size());
+  for (int i = 0; i < fetch.size(); i++) {
+    test::ExpectTensorEqual<float>(tensor_expected[0], tensors[i]);
+  }
+}
+
+TEST_F(ConstantFoldingTest, NeutralElement) {
+  for (bool use_const : {true, false}) {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                                ops::Placeholder::Shape(TensorShape({2, 2})));
+    Output y = ops::Placeholder(s.WithOpName("y"), DT_FLOAT,
+                                ops::Placeholder::Shape(TensorShape({2, 2})));
+    Output a = ops::Placeholder(s.WithOpName("a"), DT_FLOAT,
+                                ops::Placeholder::Shape(TensorShape({3, 2})));
+    Output b = ops::Placeholder(s.WithOpName("b"), DT_FLOAT,
+                                ops::Placeholder::Shape(TensorShape({2, 3})));
+    Output bias = ops::Placeholder(s.WithOpName("bias"), DT_FLOAT,
+                                   ops::Placeholder::Shape(TensorShape({2})));
+    Output zeros = !use_const ? ops::ZerosLike(s.WithOpName("zeros"), x)
+                              : ops::Const(s.WithOpName("zeros"), 0.0f, {2, 2});
+    Output zeros_1d = ops::Const(s.WithOpName("zeros_1d"), 0.0f, {2});
+    Output ones = !use_const ? ops::OnesLike(s.WithOpName("ones"), x)
+                             : ops::Const(s.WithOpName("ones"), 1.0f, {2, 2});
+    Output mul1 = ops::Mul(s.WithOpName("mul1"), x, zeros);
+    Output mul2 = ops::Mul(s.WithOpName("mul2"), zeros, y);
+    Output mul3 = ops::Mul(s.WithOpName("mul3"), x, ones);
+    Output mul4 = ops::Mul(s.WithOpName("mul4"), ones, y);
+    Output mul5 = ops::Mul(s.WithOpName("mul5"), x, zeros_1d);
+    Output mul6 = ops::Mul(s.WithOpName("mul6"), zeros_1d, y);
+    Output div1 = ops::Div(s.WithOpName("div1"), x, ones);
+    Output div2 = ops::Div(s.WithOpName("div2"), ones, y);
+    Output matmul1 = ops::MatMul(s.WithOpName("matmul1"), x, zeros);
+    Output matmul2 = ops::MatMul(s.WithOpName("matmul2"), zeros, y);
+    Output matmul3 = ops::MatMul(s.WithOpName("matmul3"), a, zeros);
+    Output matmul4 = ops::MatMul(s.WithOpName("matmul4"), zeros, b);
+    Output add1 = ops::Add(s.WithOpName("add1"), x, zeros);
+    Output add2 = ops::Add(s.WithOpName("add2"), zeros, y);
+    Output bias_add1 = ops::BiasAdd(s.WithOpName("bias_add1"), x, zeros_1d);
+    Output bias_add2 = ops::BiasAdd(s.WithOpName("bias_add2"), zeros, bias);
+    Output sub1 = ops::Sub(s.WithOpName("sub1"), x, zeros);
+    Output sub2 = ops::Sub(s.WithOpName("sub2"), zeros, y);
+    Output addn =
+        ops::AddN(s.WithOpName("addn"),
+                  {mul1, mul2, mul3, mul4, mul5, mul6, div1, div2, matmul1,
+                   matmul2, add1, add2, bias_add1, bias_add2, sub1, sub2});
+    GrapplerItem item;
+    TF_CHECK_OK(s.ToGraphDef(&item.graph));
+    item.fetch = {"addn", "matmul3", "matmul4"};
+
+    ConstantFolding optimizer(RewriterConfig::AGGRESSIVE,
+                              nullptr /* cpu_device */);
+    GraphDef output;
+    Status status = optimizer.Optimize(nullptr, item, &output);
+    TF_EXPECT_OK(status);
+
+    EXPECT_EQ(27, output.node_size());
+    for (int i = 0; i < output.node_size(); ++i) {
+      const NodeDef& node = output.node(i);
+      const string& name = node.name();
+      if (name == "mul1") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^x", node.input(0));
+        EXPECT_EQ("^zeros", node.input(1));
+      } else if (name == "mul2") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^zeros", node.input(0));
+        EXPECT_EQ("^y", node.input(1));
+      } else if (name == "mul3") {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ("x", node.input(0));
+        EXPECT_EQ("^ones", node.input(1));
+      } else if (name == "mul4") {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ("y", node.input(0));
+        EXPECT_EQ("^ones", node.input(1));
+      } else if (name == "mul5") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^x", node.input(0));
+        EXPECT_EQ("^zeros_1d", node.input(1));
+      } else if (name == "mul6") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^zeros_1d", node.input(0));
+        EXPECT_EQ("^y", node.input(1));
+      } else if (name == "div1") {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ("x", node.input(0));
+        EXPECT_EQ("^ones", node.input(1));
+      } else if (name == "div2") {
+        EXPECT_EQ("Reciprocal", node.op());
+        EXPECT_EQ("y", node.input(0));
+        EXPECT_EQ("^ones", node.input(1));
+      } else if (name == "matmul1") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^x", node.input(0));
+        EXPECT_EQ("^zeros", node.input(1));
+      } else if (name == "matmul2") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^zeros", node.input(0));
+        EXPECT_EQ("^y", node.input(1));
+      } else if (name == "matmul3") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^a", node.input(0));
+        EXPECT_EQ("^zeros", node.input(1));
+        TensorProto t = node.attr().at("value").tensor();
+        EXPECT_EQ(1, t.float_val_size());
+        EXPECT_EQ(0, t.float_val(0));
+        EXPECT_EQ(2, t.tensor_shape().dim_size());
+        EXPECT_EQ(3, t.tensor_shape().dim(0).size());
+        EXPECT_EQ(2, t.tensor_shape().dim(1).size());
+      } else if (name == "matmul4") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^zeros", node.input(0));
+        EXPECT_EQ("^b", node.input(1));
+        TensorProto t = node.attr().at("value").tensor();
+        EXPECT_EQ(1, t.float_val_size());
+        EXPECT_EQ(0, t.float_val(0));
+        EXPECT_EQ(2, t.tensor_shape().dim_size());
+        EXPECT_EQ(2, t.tensor_shape().dim(0).size());
+        EXPECT_EQ(3, t.tensor_shape().dim(1).size());
+      } else if (name == "add1") {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ("x", node.input(0));
+        EXPECT_EQ("^zeros", node.input(1));
+      } else if (name == "add2") {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ("y", node.input(0));
+        EXPECT_EQ("^zeros", node.input(1));
+      } else if (name == "bias_add1") {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ("x", node.input(0));
+        EXPECT_EQ("^zeros_1d", node.input(1));
+      } else if (name == "bias_add2") {
+        // We don't eliminate this one, because it requires broadcasting.
+        EXPECT_EQ("BiasAdd", node.op());
+        EXPECT_EQ("zeros", node.input(0));
+        EXPECT_EQ("bias", node.input(1));
+      } else if (name == "sub1") {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ("x", node.input(0));
+        EXPECT_EQ("^zeros", node.input(1));
+      } else if (name == "sub2") {
+        // We don't handle this case yet.
+        EXPECT_EQ("Sub", node.op());
+        EXPECT_EQ("zeros", node.input(0));
+        EXPECT_EQ("y", node.input(1));
+      }
+      const std::set<string> square_zero_const{"mul1", "mul2",    "mul5",
+                                               "mul6", "matmul1", "matmul2"};
+      if (square_zero_const.count(name) > 0) {
+        TensorProto t = node.attr().at("value").tensor();
+        EXPECT_EQ(1, t.float_val_size());
+        EXPECT_EQ(0, t.float_val(0));
+        EXPECT_EQ(2, t.tensor_shape().dim_size());
+        EXPECT_EQ(2, t.tensor_shape().dim(0).size());
+        EXPECT_EQ(2, t.tensor_shape().dim(1).size());
+      }
+    }
+  }
+}
+
+TEST_F(ConstantFoldingTest, StrengthReduce_Reciprocal) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output cf_half = ops::Const(s.WithOpName("cf_half"), 0.5f, {1});
+  Output xf = ops::Placeholder(s.WithOpName("xf"), DT_FLOAT,
+                               ops::Placeholder::Shape(TensorShape({2, 2})));
+  Output xi = ops::Placeholder(s.WithOpName("xi"), DT_INT32,
+                               ops::Placeholder::Shape(TensorShape({2, 2})));
+  Output ci = ops::Const(s.WithOpName("ci"), 2, {1});
+  Output cf = ops::Const(s.WithOpName("cf"), 2.0f, {1});
+  Output div_i = ops::Div(s.WithOpName("div_i"), xi, ci);
+  Output div_f = ops::Div(s.WithOpName("div_f"), xf, cf);
+  Output realdiv = ops::RealDiv(s.WithOpName("realdiv"), xf, cf);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"div_f", "div_i", "realdiv"};
+  ConstantFolding optimizer(RewriterConfig::AGGRESSIVE,
+                            nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(8, output.node_size());
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
+    const string& name = node.name();
+    if (name == "div_i") {
+      // Integer division is unchanged.
+      EXPECT_EQ("Div", node.op());
+      EXPECT_EQ("xi", node.input(0));
+      EXPECT_EQ("ci", node.input(1));
+    } else if (name == "div_f") {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ("xf", node.input(0));
+      EXPECT_EQ("ConstantFolding/div_f_recip", node.input(1));
+    } else if (name == "realdiv") {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ("xf", node.input(0));
+      EXPECT_EQ("ConstantFolding/realdiv_recip", node.input(1));
+    } else if (name == "ConstantFolding/div_f_recip") {
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(DT_FLOAT, node.attr().at("dtype").type());
+      TensorProto t = node.attr().at("value").tensor();
+      EXPECT_EQ(DT_FLOAT, t.dtype());
+      EXPECT_EQ(1, t.tensor_shape().dim_size());
+      EXPECT_EQ(1, t.tensor_shape().dim(0).size());
+    } else if (name == "ConstantFolding/realdiv_recip") {
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(DT_FLOAT, node.attr().at("dtype").type());
+      TensorProto t = node.attr().at("value").tensor();
+      EXPECT_EQ(DT_FLOAT, t.dtype());
+      EXPECT_EQ(1, t.tensor_shape().dim_size());
+      EXPECT_EQ(1, t.tensor_shape().dim(0).size());
+    }
+  }
+
+  // Check that the reciprocals have the expected value.
+  std::vector<string> fetch = {"cf_half"};
+  auto tensor_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(fetch.size(), tensor_expected.size());
+  fetch = {"ConstantFolding/div_f_recip", "ConstantFolding/realdiv_recip"};
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(fetch.size(), tensors.size());
+  for (int i = 0; i < fetch.size(); i++) {
+    test::ExpectTensorEqual<float>(tensor_expected[0], tensors[i]);
+  }
+}
+
+TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_UnknownOutputShape) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x_known =
+      ops::Placeholder(s.WithOpName("x_known"), DT_FLOAT,
+                       ops::Placeholder::Shape(TensorShape({2, 2})));
+  Output x_partially_known =
+      ops::Placeholder(s.WithOpName("x_partially_unknown"), DT_FLOAT,
+                       ops::Placeholder::Shape(PartialTensorShape({-1, -1})));
+  Output x_unknown = ops::Placeholder(s.WithOpName("x_unknown"), DT_FLOAT);
+  Output zeros_known = ops::ZerosLike(s.WithOpName("zeros_known"), x_known);
+  Output zeros_partially_known =
+      ops::ZerosLike(s.WithOpName("zeros_partially_known"), x_partially_known);
+  Output zeros_unknown =
+      ops::ZerosLike(s.WithOpName("zeros_unknown"), x_unknown);
+
+  // Multiplies without any additional ops to supply the output shape.
+  int count = 0;
+  std::vector<Output> muls;
+  std::unordered_set<string> not_converted;
+  std::unordered_set<string> to_const;
+  std::unordered_set<string> to_identity;
+  for (const auto* x : {&x_known, &x_partially_known, &x_unknown}) {
+    for (const auto* zeros :
+         {&zeros_known, &zeros_partially_known, &zeros_unknown}) {
+      const string name = strings::StrCat("mul_", count++);
+      muls.push_back(ops::Mul(s.WithOpName(name), *x, *zeros));
+      if (x == &x_partially_known && zeros == &zeros_partially_known) {
+        to_identity.insert(name);
+      } else if (x == &x_unknown || zeros == &zeros_unknown) {
+        not_converted.insert(name);
+      } else {
+        to_const.insert(name);
+      }
+    }
+  }
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(RewriterConfig::AGGRESSIVE,
+                            nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  LOG(INFO) << output.DebugString();
+
+  EXPECT_EQ(15, output.node_size());
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
+    const string& name = node.name();
+    if (to_const.count(name) > 0) {
+      EXPECT_EQ("Const", node.op()) << node.name();
+    } else if (to_identity.count(name) > 0) {
+      EXPECT_EQ("Identity", node.op()) << node.name();
+    } else if (not_converted.count(name) > 0) {
+      EXPECT_EQ("Mul", node.op()) << node.name();
+    }
+  }
+}
+
+TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output known_shape = ops::Const(s.WithOpName("known_shape"), 0.0f, {2, 2});
+  Output x_partially_known =
+      ops::Placeholder(s.WithOpName("x_partially_unknown"), DT_FLOAT,
+                       ops::Placeholder::Shape(PartialTensorShape({-1, -1})));
+  Output x_unknown = ops::Placeholder(s.WithOpName("x_unknown"), DT_FLOAT);
+  Output zeros_partially_known =
+      ops::ZerosLike(s.WithOpName("zeros_partially_known"), x_partially_known);
+  Output zeros_unknown =
+      ops::ZerosLike(s.WithOpName("zeros_unknown"), x_unknown);
+
+  // If at least one of the inputs to AddN has a known shape, shape inference
+  // will propagate the shape back to the inputs of AddN, making the
+  // output shapes of all its inputs known
+  std::vector<Output> muls_deduced_output_shape;
+  std::unordered_set<string> to_const;
+  int count = 0;
+  for (const auto& x : {x_partially_known, x_unknown}) {
+    for (const auto& zeros : {zeros_partially_known, zeros_unknown}) {
+      const string name = strings::StrCat("mul_", count++);
+      muls_deduced_output_shape.push_back(
+          ops::Mul(s.WithOpName(name), x, zeros));
+      to_const.insert(name);
+    }
+  }
+  // We add a known shape as input to AddN to propagate it back to the
+  // multiplies above, which means they can all be turned into Const nodes.
+  muls_deduced_output_shape.push_back(known_shape);
+  Output addn1 = ops::AddN(s.WithOpName("addn1"), muls_deduced_output_shape);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(RewriterConfig::AGGRESSIVE,
+                            nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  LOG(INFO) << output.DebugString();
+
+  EXPECT_EQ(10, output.node_size());
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
+    const string& name = node.name();
+    if (to_const.count(name) > 0) {
+      EXPECT_EQ("Const", node.op()) << node.name();
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_TRUE(IsControlInput(node.input(0)));
+      EXPECT_TRUE(IsControlInput(node.input(1)));
+    }
+  }
+}
+
+TEST_F(ConstantFoldingTest, CreateConstNodes) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+#define MAKE_TEST_GRAPH(TYPE)                                               \
+  Output TYPE##_const =                                                     \
+      ops::Const(s.WithOpName(#TYPE "_const"), static_cast<TYPE>(10), {5}); \
+  Output TYPE##_mul =                                                       \
+      ops::Mul(s.WithOpName(#TYPE "_mul"), TYPE##_const, TYPE##_const);     \
+  Output TYPE##_id = ops::Identity(s.WithOpName(#TYPE "_id"), TYPE##_mul)
+
+  MAKE_TEST_GRAPH(float);
+  MAKE_TEST_GRAPH(double);
+  MAKE_TEST_GRAPH(int64);
+  MAKE_TEST_GRAPH(int32);
+  MAKE_TEST_GRAPH(int16);
+  MAKE_TEST_GRAPH(int8);
+  MAKE_TEST_GRAPH(uint8);
+#undef MAKE_TEST_GRAPH
+
+  Output bool_const = ops::Const(s.WithOpName("bool_const"), true, {5});
+  Output bool_and =
+      ops::LogicalAnd(s.WithOpName("bool_and"), bool_const, bool_const);
+  Output bool_id = ops::Identity(s.WithOpName("bool_id"), bool_and);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  ConstantFolding fold(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = fold.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(24, output.node_size());
+  for (const NodeDef& node : output.node()) {
+#define CHECK_RESULT(TYPE, FIELD)                                             \
+  if (node.name() == #TYPE "_mul") {                                          \
+    EXPECT_EQ(5,                                                              \
+              node.attr().at("value").tensor().tensor_shape().dim(0).size()); \
+    EXPECT_EQ(1, node.attr().at("value").tensor().FIELD##_val_size());        \
+    EXPECT_EQ(10 * 10, node.attr().at("value").tensor().FIELD##_val(0));      \
+  }
+
+    CHECK_RESULT(float, float);
+    CHECK_RESULT(double, double);
+    CHECK_RESULT(int64, int64);
+    CHECK_RESULT(int32, int);
+    CHECK_RESULT(int16, int);
+    CHECK_RESULT(int8, int);
+    CHECK_RESULT(uint8, int);
+#undef CHECK_RESULT
+
+    if (node.name() == "bool_and") {
+      EXPECT_EQ(5,
+                node.attr().at("value").tensor().tensor_shape().dim(0).size());
+      EXPECT_EQ(1, node.attr().at("value").tensor().bool_val_size());
+      EXPECT_EQ(true && true, node.attr().at("value").tensor().bool_val(0));
+    }
+  }
+}
+
 TEST_F(ConstantFoldingTest, FoldingNodeWithTwoOutputs) {
   // Build a simple graph with a few trivially prunable ops.
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
 
-  Output a = ops::Const(s.WithOpName("a"), 10, {3});
+  Output a = ops::Const(s.WithOpName("a"), 10, {5});
   auto b = ops::Unique(s.WithOpName("b"), {a});
   Output c = ops::Identity(s.WithOpName("c"), {b.y});
   Output d = ops::Identity(s.WithOpName("d"), {b.idx});
@@ -735,7 +1207,7 @@ TEST_F(ConstantFoldingTest, NoOpReduction) {
       EXPECT_EQ("Identity", node.op());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("v", node.input(0));
-      EXPECT_EQ("^v", node.input(1));
+      EXPECT_EQ("^i", node.input(1));
     }
   }
   EXPECT_TRUE(found);
@@ -794,20 +1266,20 @@ TEST_F(ConstantFoldingTest, NoOpReshape) {
       EXPECT_EQ("Identity", node.op());
       ASSERT_EQ(3, node.input_size());
       EXPECT_EQ("v1", node.input(0));
-      EXPECT_EQ("^d1", node.input(1));
-      EXPECT_EQ("^v1", node.input(2));
+      EXPECT_EQ("^i1", node.input(1));
+      EXPECT_EQ("^d1", node.input(2));
     } else if (node.name() == "r3") {
       ++found;
       EXPECT_EQ("Identity", node.op());
       ASSERT_EQ(2, node.input_size());
       EXPECT_EQ("v3", node.input(0));
-      EXPECT_EQ("^v3", node.input(1));
+      EXPECT_EQ("^i3", node.input(1));
     } else if (node.name() == "r4") {
       ++found;
       EXPECT_EQ("Identity", node.op());
       ASSERT_EQ(2, node.input_size());
       EXPECT_EQ("v4", node.input(0));
-      EXPECT_EQ("^v4", node.input(1));
+      EXPECT_EQ("^i4", node.input(1));
     } else if (node.name() == "r2") {
       ++found;
       EXPECT_EQ("Reshape", node.op());
@@ -963,3 +1435,5 @@ TEST_F(ConstantFoldingTest, MaterializeReductionIndices) {
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
+
+//  LocalWords:  NewRootScope
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index 49eb29d0371c7f89a5b796d5bf3ad4d47436d5de..62cebaef7613a1c147e5721f091e4fe947cd88ae 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -22,11 +22,11 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
-#include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
-#include "tensorflow/core/grappler/utils/frame.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
@@ -34,117 +34,162 @@ namespace tensorflow {
 namespace grappler {
 
 namespace {
-// A vector with a set. The set stores the same elements as the vector, and
-// quickly answers whether a value is in the vector. Duplicated elements are not
-// allowed for now.
-template <class T>
-class SetVector {
- public:
-  // Returns false if value already existed in the set, true otherwise.
-  bool PushBack(const T& value) {
-    if (!set_.insert(value).second) {
-      return false;
-    }
-    vector_.push_back(value);
-    return true;
-  }
-
-  T PopBack() {
-    T back = vector_.back();
-    set_.erase(back);
-    vector_.pop_back();
-    return back;
-  }
-
-  bool Exists(const T& value) const { return set_.count(value); }
 
-  bool Empty() const { return vector_.empty(); }
-
-  void Reserve(int64 size) { vector_.reserve(size); }
-
- private:
-  std::unordered_set<T> set_;
-  std::vector<T> vector_;
-};
-
-bool HasRegularOutputs(const NodeDef& node, const NodeMap& node_map) {
-  for (const NodeDef* output : node_map.GetOutputs(node.name())) {
-    for (const string& input : output->input()) {
-      if (input == node.name()) {
-        return true;
-      }
+int RemoveInput(NodeDef* node, const string& input, NodeMap* node_map) {
+  int num_removed = 0;
+  int pos = 0;
+  while (pos < node->input_size()) {
+    if (node->input(pos) == input) {
+      node->mutable_input()->SwapElements(pos, node->input_size() - 1);
+      node->mutable_input()->RemoveLast();
+      node_map->RemoveOutput(NodeName(input), node->name());
+    } else {
+      ++pos;
     }
+    ++num_removed;
   }
-  return false;
+  return num_removed;
 }
 
-int FindInputSlot(const NodeDef& node, const string& input) {
-  for (int i = 0; i < node.input_size(); ++i) {
-    if (node.input(i) == input) {
-      return i;
+// Remove duplicate control inputs.
+void PruneControlInputs(NodeDef* node) {
+  std::unordered_set<string> inputs;
+  int pos = 0;
+  while (pos < node->input_size()) {
+    const string& input = node->input(pos);
+    // TODO(rmlarsen): Remove control inputs that also appears as a regular
+    // inputs. Currently, doing so breaks testControlFlowStrictness in
+    // python/framework/function_test.
+    //    if (!inputs.insert(NodeName(input)).second && IsControlInput(input)) {
+    if (IsControlInput(input) && !inputs.insert(input).second) {
+      VLOG(1) << "**** Removing duplicate control input: " << input
+              << " from node " << node->DebugString();
+      node->mutable_input()->SwapElements(pos, node->input_size() - 1);
+      node->mutable_input()->RemoveLast();
+    } else {
+      ++pos;
     }
   }
-  return -1;
 }
 
 }  // namespace
 
 bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) {
-  if (!has_fetch_ || HasRegularOutputs(node, *node_map_)) {
+  if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) {
     return false;
   }
-
-  if (IsMerge(node)) {
+  if (!fetch_nodes_known_ || NumNonControlOutputs(node, *node_map_) > 0) {
+    // The output values of this node may be needed.
     return false;
   }
-  if (!ArithmeticOptimizer::CanDedup(node, nodes_to_preserve_)) {
+  if (IsMerge(node) || IsSwitch(node)) {
+    return false;
+  }
+  if (ModifiesFrameInfo(node)) {
+    return false;
+  }
+  if (!IsFreeOfSideEffect(node)) {
+    return false;
+  }
+  if (node.op() == "ControlTrigger") {
+    return false;
+  }
+  if (node.op().rfind("Submodel", 0) == 0) {
     return false;
   }
-
   const OpDef* op_def = nullptr;
   Status status = OpRegistry::Global()->LookUpOpDef(node.op(), &op_def);
   if (!status.ok() || op_def->output_arg_size() == 0) {
     return false;
   }
 
-  // TODO(rmlarsen): We have to skip Const nodes to make
-  // core/debug/debug_gateway_test pass. See if we can fix that test.
   // TODO(rmlarsen): We have to skip Identity nodes to make an obsolete test in
   // python/training/session_manager_test.py pass. See if we can fix or get rid
   // of that test.
-  const std::unordered_set<string> do_not_rewrite_ops = {
-      "Assert", "CheckNumerics",         "Const",      "Identity", "_Retval",
-      "_Arg",   "_ParallelConcatUpdate", "_TPUExecute"};
+  const std::unordered_set<string> do_not_rewrite_ops{
+      "Assert", "CheckNumerics",         "Identity",    "_Retval",
+      "_Arg",   "_ParallelConcatUpdate", "_TPUExecute", "_TPUCompile"};
   return do_not_rewrite_ops.find(node.op()) == do_not_rewrite_ops.end();
 }
 
-string DependencyOptimizer::TryOptimizeDependencies(
-    NodeDef* node, GraphDef* graph, std::vector<NodeDef*>* new_nodes) {
+void DependencyOptimizer::OptimizeNode(int node_idx,
+                                       SetVector<int>* nodes_to_simplify,
+                                       std::set<int>* nodes_to_delete) {
+  NodeDef* node = optimized_graph_->mutable_node(node_idx);
+
+  // Constant nodes with no input control dependency are always executed early,
+  // so we can prune all their output control dependencies.
+  if (IsConstant(*node) && node->input_size() == 0) {
+    const std::set<NodeDef*> output_nodes = node_map_->GetOutputs(node->name());
+    for (NodeDef* fanout : output_nodes) {
+      bool optimize_fanout = false;
+      bool data_connection = false;
+      for (int i = fanout->input_size() - 1; i >= 0; --i) {
+        int pos;
+        string input_name = ParseNodeName(fanout->input(i), &pos);
+        if (input_name == node->name()) {
+          if (pos < 0) {
+            fanout->mutable_input()->SwapElements(i, fanout->input_size() - 1);
+            fanout->mutable_input()->RemoveLast();
+            optimize_fanout = true;
+          } else {
+            data_connection = true;
+          }
+        }
+      }
+      if (optimize_fanout) {
+        nodes_to_simplify->PushBack(node_to_idx_[fanout]);
+        if (!data_connection) {
+          node_map_->RemoveOutput(node->name(), fanout->name());
+        }
+      }
+    }
+    if (node_map_->GetOutputs(node->name()).empty() && fetch_nodes_known_ &&
+        nodes_to_preserve_.find(node->name()) == nodes_to_preserve_.end()) {
+      // Mark the node for deletion.
+      nodes_to_delete->insert(node_to_idx_[node]);
+    }
+
+    return;
+  }
+
   // Change ops that only have control dependencies as outputs to NoOps.
   if (node->op() != "NoOp" && SafeToConvertToNoOp(*node)) {
-    VLOG(2) << "***** Replacing  " << node->name() << " (" << node->op()
+    VLOG(1) << "***** Replacing  " << node->name() << " (" << node->op()
             << ") with NoOp.";
     // The outputs of this node are not consumed. Replace its inputs with
     // control dependencies and replace the op itself with the NoOp op.
-    for (int i = 0; i < node->input_size(); ++i) {
-      const string& old_input = node->input(i);
+    std::unordered_set<string> ctrl_inputs;
+    int pos = 0;
+    while (pos < node->input_size()) {
+      const string old_input = node->input(pos);
       if (IsControlInput(old_input)) {
+        if (!ctrl_inputs.insert(old_input).second) {
+          // We found a duplicate control input. Remove it.
+          node->mutable_input()->SwapElements(pos, node->input_size() - 1);
+          node->mutable_input()->RemoveLast();
+        } else {
+          ++pos;
+        }
         continue;
       }
       const string ctrl_input = ConstantFolding::AddControlDependency(
-          old_input, graph, node_map_.get());
-      node->set_input(i, ctrl_input);
-      node_map_->UpdateInput(node->name(), old_input, ctrl_input);
-      new_nodes->push_back(node_map_->GetNode(old_input));
+          old_input, optimized_graph_, node_map_.get());
+      if (ctrl_inputs.insert(ctrl_input).second) {
+        node->set_input(pos, ctrl_input);
+        node_map_->UpdateInput(node->name(), old_input, ctrl_input);
+        const NodeDef* old_input_node = node_map_->GetNode(old_input);
+        nodes_to_simplify->PushBack(node_to_idx_[old_input_node]);
+      }
+      ++pos;
     }
     node->set_op("NoOp");
     node->clear_attr();
-    new_nodes->push_back(node);
-    return "";
   }
 
-  // Remove NoOp nodes if their fan-in or fan-out is less than 2.
-  // The non-trivial rewrites take the following form:
+  // Remove NoOp nodes if the product of their fan-in and fan-out is less than
+  // or equal to the sum of the fan-in and fan-out. The non-trivial rewrites
+  // take the following form:
   //
   // Case a)
   //    x --^> +------+                x --^> +---+
@@ -157,112 +202,276 @@ string DependencyOptimizer::TryOptimizeDependencies(
   //    x --^> | NoOp | --^> b  ==>    | x | --^> b
   //           |      | ...            |   | ...
   //           +------+ --^> c         +---+ --^> c
-  if (node->op() == "NoOp" &&
-      nodes_to_preserve_.find(node->name()) == nodes_to_preserve_.end()) {
-    auto outputs = node_map_->GetOutputs(node->name());
-    const int num_outputs = outputs.size();
+  // Case c)
+  //           +------+                x ---^> a
+  //    x --^> | NoOp | --^> a  ==>      \/
+  //    y --^> |      | --^> b           /\
+  //           +------+                y ---^> b
+  //
+  // We only apply this optimization if we don't increase the number of control
+  // edges across device boundaries, e.g. in cases a) and b) if NoOp and
+  // a and x, respectively, are on the same device. Control edges across device
+  // boundaries require inter-device communication (Send/Recv pairs to be
+  // inserted in the graph), which is very costly.
+
+  if (node->op() == "NoOp") {
+    const auto& output_node_set = node_map_->GetOutputs(node->name());
+    const std::vector<NodeDef*> output_nodes(output_node_set.begin(),
+                                             output_node_set.end());
+    const int num_outputs = output_nodes.size();
     const int num_inputs = node->input_size();
-    if (num_inputs > 1 && num_outputs > 1) {
-      return "";
+
+    if (num_inputs * num_outputs > num_inputs + num_outputs) {
+      return;
+    }
+    VLOG(1) << "***** Rerouting input around " << node->name();
+    std::vector<NodeDef*> input_nodes;
+    for (int i = 0; i < num_inputs; ++i) {
+      NodeDef* tmp = node_map_->GetNode(node->input(i));
+      CHECK_NE(tmp, nullptr);
+      input_nodes.push_back(tmp);
     }
 
-    for (auto consumer : outputs) {
+    // Make sure that we don't increase the number of control edges that cross
+    // device boundaries.
+    if ((num_inputs == 1 && num_outputs > 1 &&
+         input_nodes[0]->device() != node->device()) ||
+        (num_inputs > 1 && num_outputs == 1 &&
+         output_nodes[0]->device() != node->device())) {
+      return;
+    }
+    if (num_inputs == 2 && num_outputs == 2) {
+      const string& noop_dev = node->device();
+      const string& in0_dev = input_nodes[0]->device();
+      const string& in1_dev = input_nodes[1]->device();
+      const string& out0_dev = output_nodes[0]->device();
+      const string& out1_dev = output_nodes[1]->device();
+      const int num_cross_before = static_cast<int>(in0_dev != noop_dev) +
+                                   static_cast<int>(in1_dev != noop_dev) +
+                                   static_cast<int>(out0_dev != noop_dev) +
+                                   static_cast<int>(out1_dev != noop_dev);
+      const int num_cross_after = static_cast<int>(in0_dev != out0_dev) +
+                                  static_cast<int>(in0_dev != out1_dev) +
+                                  static_cast<int>(in1_dev != out0_dev) +
+                                  static_cast<int>(in1_dev != out1_dev);
+      if (num_cross_after > num_cross_before) {
+        return;
+      }
+    }
+    for (auto consumer : output_nodes) {
+      bool updated_consumer = false;
+      VLOG(1) << "***** Considering consumer  " << consumer->name() << "\n"
+              << consumer->DebugString();
       for (int i = 0; i < num_inputs; ++i) {
-        const string& input = node->input(i);
-        // Forward dependencies from inputs to consumer if it doesn't already
+        const NodeDef* input = input_nodes[i];
+        // Forward dependency from input to consumer if it doesn't already
         // depend on it.
-        if (node_map_->GetOutputs(input).count(consumer) == 0) {
-          consumer->add_input(ConstantFolding::AddControlDependency(
-              input, graph, node_map_.get()));
-          node_map_->AddOutput(NodeName(input), consumer->name());
+        if (node_map_->GetOutputs(input->name()).count(consumer) == 0) {
+          consumer->add_input(AsControlDependency(input->name()));
+          updated_consumer = true;
+          node_map_->AddOutput(input->name(), consumer->name());
+          nodes_to_simplify->PushBack(node_to_idx_[input]);
         }
-        new_nodes->push_back(node_map_->GetNode(input));
       }
       // Remove dependency on node from consumer.
-      int pos = FindInputSlot(*consumer, AsControlDependency(node->name()));
-      if (pos >= 0) {
-        consumer->mutable_input()->SwapElements(pos,
-                                                consumer->input_size() - 1);
-        consumer->mutable_input()->RemoveLast();
-        node_map_->RemoveOutput(node->name(), consumer->name());
-        new_nodes->push_back(consumer);
+      updated_consumer |= RemoveInput(
+          consumer, AsControlDependency(node->name()), node_map_.get());
+      if (updated_consumer) {
+        VLOG(1) << "***** Updated consumer  " << consumer->name() << " ("
+                << consumer->op() << ")";
+        nodes_to_simplify->PushBack(node_to_idx_[consumer]);
       }
     }
 
-    // Clear all control inputs to node.
-    node_map_->RemoveInputs(node->name());
-    node->clear_input();
-    return "";
+    node_map_->RemoveOutputs(node->name());
+    if (fetch_nodes_known_ &&
+        nodes_to_preserve_.find(node->name()) == nodes_to_preserve_.end()) {
+      // Mark the node for deletion.
+      nodes_to_delete->insert(node_idx);
+
+      // Unconnect the node from its inputs to enable further optimizations.
+      node_map_->RemoveInputs(node->name());
+      node->clear_input();
+    }
+  }
+}
+
+void DependencyOptimizer::CleanControlInputs() {
+  for (int i = 0; i < optimized_graph_->node_size(); ++i) {
+    PruneControlInputs(optimized_graph_->mutable_node(i));
   }
+}
 
-  return "";
+void DependencyOptimizer::DeleteNodes(const std::set<int>& nodes_to_delete) {
+  int last = optimized_graph_->node_size() - 1;
+  for (auto it = nodes_to_delete.rbegin(); it != nodes_to_delete.rend(); ++it) {
+    const int index = *it;
+    optimized_graph_->mutable_node()->SwapElements(index, last);
+    last--;
+  }
+  optimized_graph_->mutable_node()->DeleteSubrange(last + 1,
+                                                   nodes_to_delete.size());
+  // Rebuild the NodeMap which was invalidated by the node swapping above.
+  node_map_.reset(new NodeMap(optimized_graph_));
+  BuildNodeToIdx();
 }
 
-Status DependencyOptimizer::OptimizeDependencies(GraphDef* optimized_graph) {
-  // TODO(rmlarsen,bsteiner): The folloing code is similar to the control loop
-  // in the ArithmeticOptimizer. Dedup this.
-  SetVector<NodeDef*> nodes_to_simplify;
-  for (int i = 0; i < optimized_graph->node_size(); ++i) {
-    const NodeDef& node = optimized_graph->node(i);
-    if (node.op() == "NoOp" || SafeToConvertToNoOp(node)) {
-      nodes_to_simplify.PushBack(optimized_graph->mutable_node()->Mutable(i));
+Status DependencyOptimizer::OptimizeDependencies() {
+  SetVector<int> nodes_to_simplify;
+  std::set<int> nodes_to_delete;
+  for (int i = 0; i < optimized_graph_->node_size(); ++i) {
+    const NodeDef& node = optimized_graph_->node(i);
+    if (node.op() == "NoOp" || IsConstant(node) || SafeToConvertToNoOp(node)) {
+      nodes_to_simplify.PushBack(i);
     }
   }
   while (!nodes_to_simplify.Empty()) {
-    NodeDef* node = nodes_to_simplify.PopBack();
-    std::vector<NodeDef*> new_nodes;
-    const string simplified_tensor =
-        TryOptimizeDependencies(node, optimized_graph, &new_nodes);
-    if (simplified_tensor.empty()) {
+    OptimizeNode(nodes_to_simplify.PopBack(), &nodes_to_simplify,
+                 &nodes_to_delete);
+  }
+
+  if (fetch_nodes_known_) {
+    VLOG(1) << "Deleted " << nodes_to_delete.size() << " out of "
+            << optimized_graph_->node_size() << " nodes.";
+    DeleteNodes(nodes_to_delete);
+  }
+  return Status::OK();
+}
+
+Status DependencyOptimizer::TransitiveReduction() {
+  // PRECONDITION: optimized_graph_ must be sorted topologically.
+  const int num_nodes = optimized_graph_->node_size();
+  // Set up a compressed version of the graph to save a constant factor in the
+  // expensive algorithm below. Also cache the set of control outputs and the
+  // highest index of a target of any control output from each node.
+  int num_controls = 0;
+  std::vector<gtl::InlinedVector<int, 4>> inputs(num_nodes);
+  std::vector<gtl::InlinedVector<std::pair<int, int>, 2>> control_outputs(
+      num_nodes);
+  for (int node_idx = 0; node_idx < num_nodes; ++node_idx) {
+    const NodeDef& node = optimized_graph_->node(node_idx);
+    if (ModifiesFrameInfo(node)) {
+      // Ignore nodes that modify frame info.
       continue;
     }
-    if (NodeName(simplified_tensor) != node->name()) {
-      // Always consider simplified_tensor for further optimizations.
-      NodeDef* simplified_node = node_map_->GetNode(simplified_tensor);
-      if (simplified_node != nullptr) {
-        nodes_to_simplify.PushBack(simplified_node);
+    for (int input_slot = 0; input_slot < node.input_size(); ++input_slot) {
+      const string& input = node.input(input_slot);
+      const NodeDef* input_node = node_map_->GetNode(input);
+      if (ModifiesFrameInfo(*input_node)) {
+        // Ignore edges from nodes that modify frame info.
+        continue;
+      }
+      const int input_node_idx = node_to_idx_[input_node];
+      inputs[node_idx].push_back(input_node_idx);
+      if (IsControlInput(input)) {
+        ++num_controls;
+        control_outputs[input_node_idx].emplace_back(node_idx, input_slot);
       }
-      // When `node` is simplifed to another node rather than in-place, the
-      // consumers of `node` are already redirected to `simplified_tensor`.
-      // Re-push the consumers into `nodes_to_simplify` for further
-      // optimizations.
-      std::set<NodeDef*> consumers = node_map_->GetOutputs(node->name());
-      for (NodeDef* consumer : consumers) {
-        // Update `consumer`'s use of `node` to `input`'s operand.
-        for (int i = 0; i < consumer->input_size(); ++i) {
-          int operand_pos;
-          string operand_node_name =
-              ParseNodeName(consumer->input(i), &operand_pos);
-          if (operand_node_name == node->name()) {
-            *consumer->mutable_input(i) =
-                (operand_pos < 0
-                     ? AsControlDependency(NodeName(simplified_tensor))
-                     : simplified_tensor);
+    }
+  }
+
+  // Run the longest path in DAG algorithm for each source node that has control
+  // outputs. If, for any target node of a control output, there exists a path
+  // of length > 1, we can drop that control dependency.
+  int num_controls_removed = 0;
+  std::vector<int> longest_distance(num_nodes);
+  for (int source = 0; source < num_nodes; ++source) {
+    int highest_control_target = -1;
+    for (const auto& control_output : control_outputs[source]) {
+      if (control_output.first > highest_control_target) {
+        highest_control_target = control_output.first;
+      }
+    }
+    if (highest_control_target < source) {
+      continue;
+    }
+    std::fill(longest_distance.begin() + source,
+              longest_distance.begin() + highest_control_target + 1, 0);
+    for (int target = source + 1; target <= highest_control_target; ++target) {
+      for (int input : inputs[target]) {
+        // If the input node is before source in the topo order, no path
+        // source -> input -> target can exits and we can skip it.
+        if (input >= source) {
+          // If source -> input -> target is longer than the longest
+          // path so far from source -> target, update the longest_distance.
+          int candidate_longest_distance = longest_distance[input] + 1;
+          if (candidate_longest_distance > longest_distance[target]) {
+            longest_distance[target] = candidate_longest_distance;
           }
-          VLOG(2) << "Update input " << consumer->input(i) << " of "
-                  << consumer->name() << " to " << simplified_tensor;
         }
-        node_map_->UpdateInput(consumer->name(), node->name(),
-                               simplified_tensor);
-        nodes_to_simplify.PushBack(consumer);
       }
     }
-    for (auto new_node : new_nodes) {
-      nodes_to_simplify.PushBack(new_node);
+
+    // If the longest path from the source to the target of a control dependency
+    // is longer than 1, there exists an alternate path, and we can eliminate
+    // the control dependency since it is redundant.
+    for (const auto& control_output : control_outputs[source]) {
+      const int target = control_output.first;
+      if (longest_distance[target] > 1) {
+        const int input_slot = control_output.second;
+        // We modify the node inplace here. This is safe because there can
+        // only be one control edge from a given source to a given target.
+        const NodeDef& source_node = optimized_graph_->node(source);
+        NodeDef* target_node = optimized_graph_->mutable_node(target);
+        target_node->mutable_input()->SwapElements(
+            input_slot, target_node->input_size() - 1);
+        node_map_->RemoveOutput(source_node.name(), target_node->name());
+        target_node->mutable_input()->RemoveLast();
+        ++num_controls_removed;
+      }
     }
   }
+  VLOG(1) << "Removed " << num_controls_removed << " out of " << num_controls
+          << " control dependencies";
   return Status::OK();
 }
 
+void DependencyOptimizer::BuildNodeToIdx() {
+  // Set up &node -> index map.
+  node_to_idx_.clear();
+  for (int i = 0; i < optimized_graph_->node_size(); ++i) {
+    const NodeDef& node = optimized_graph_->node(i);
+    node_to_idx_[&node] = i;
+  }
+}
+
 Status DependencyOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                      GraphDef* optimized_graph) {
-  *optimized_graph = item.graph;
+  optimized_graph_ = optimized_graph;
+  *optimized_graph_ = item.graph;
   nodes_to_preserve_ = item.NodesToPreserve();
-  node_map_.reset(new NodeMap(optimized_graph));
-  has_fetch_ = !item.fetch.empty();
-  VLOG(2) << "Graph before optimization:\n" << optimized_graph->DebugString();
-  TF_RETURN_IF_ERROR(OptimizeDependencies(optimized_graph));
-  VLOG(2) << "Graph after optimization:\n" << optimized_graph->DebugString();
+  fetch_nodes_known_ = !item.fetch.empty();
+
+  VLOG(1) << "Graph before optimization:\n" << optimized_graph_->DebugString();
+  CleanControlInputs();
+  const int num_iterations = opt_level_ == RewriterConfig::AGGRESSIVE ? 2 : 1;
+  for (int iteration = 0; iteration < num_iterations; ++iteration) {
+    Status topo_sort_status;
+    if (opt_level_ == RewriterConfig::AGGRESSIVE) {
+      // Prepare the graph for transitive reduction if enabled.
+      topo_sort_status = TopologicalSort(optimized_graph_);
+    }
+
+    node_map_.reset(new NodeMap(optimized_graph_));
+    BuildNodeToIdx();
+
+    // Remove redundant control dependencies, iteration 1.
+    if (opt_level_ == RewriterConfig::AGGRESSIVE) {
+      if (topo_sort_status.ok()) {
+        TF_RETURN_IF_ERROR(TransitiveReduction());
+      } else {
+        LOG(ERROR) << topo_sort_status.error_message();
+      }
+      VLOG(1) << "Graph after transitive reduction:\n"
+              << optimized_graph_->DebugString();
+    }
+
+    // Turn nodes without non-control outputs into NoOps, prune NoOps.
+    TF_RETURN_IF_ERROR(OptimizeDependencies());
+    VLOG(1) << "Graph after NoOp conversion & pruning:\n"
+            << optimized_graph_->DebugString();
+  }
+  VLOG(1) << "Graph after optimization:\n" << optimized_graph_->DebugString();
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.h b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
index 13ece87aff3cd006d097a9431fc51085871ddf4c..3f6f418bee69cc86d8865bccd266803ade2ef2c1 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
@@ -45,21 +45,29 @@ class DependencyOptimizer : public GraphOptimizer {
  private:
   // Returns true if it is safe to convert node to NoOp.
   bool SafeToConvertToNoOp(const NodeDef& node);
+  // Removes all duplicate control dependencies.
+  void CleanControlInputs();
+  // Builds a map from the &optimized_graph_->node(i) to i.
+  void BuildNodeToIdx();
+  // Removes the given set of nodes from the graph.
+  void DeleteNodes(const std::set<int>& nodes_to_delete);
+  // Tries to optimize the node with the given index, possibly additional
+  // optimizations by inserting nodes in nodes_to_simplify, and pruning nodes by
+  // inserting them in nodes_to_delete.
+  void OptimizeNode(int node_idx, SetVector<int>* nodes_to_simplify,
+                    std::set<int>* nodes_to_delete);
+  // Eliminates redundant control dependencies by computing the transitive
+  // reduction of the graph.
+  Status TransitiveReduction();
+  // Main driver of dependency optimizations.
+  Status OptimizeDependencies();
 
-  Status OptimizeDependencies(GraphDef* optimized_graph);
-  // Tries to simplify the expression that roots at `node` and replaces the uses
-  // of `node` to the simplified expression. Returns the name of the simplified
-  // tensor (e.g. "split:1") or an empty string if no simplification is
-  // performed.
-  string TryOptimizeDependencies(NodeDef* node, GraphDef* graph,
-                                 std::vector<NodeDef*>* new_nodes);
-
-  bool HasOnlyControlOutputs(const NodeDef* node);
-
-  bool has_fetch_;
   RewriterConfig::Toggle opt_level_;
+  bool fetch_nodes_known_;
   std::unordered_set<string> nodes_to_preserve_;
   std::unique_ptr<NodeMap> node_map_;
+  std::unordered_map<const NodeDef*, int> node_to_idx_;
+  GraphDef* optimized_graph_;  // Not owned.
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
index d54d7b2093eb2d717a231826502c46d0a874268a..837fbba2fc1fa8d5fd7241912802ab5009a34f79 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -59,10 +60,47 @@ TEST_F(DependencyOptimizerTest, NoOp) {
   VerifyGraphsEqual(item.graph, output, __FUNCTION__);
 }
 
-TEST_F(DependencyOptimizerTest, ChangeToNoop) {
+TEST_F(DependencyOptimizerTest, DependenciesDrivenByConstants) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
   Output y = ops::Const(s.WithOpName("y"), {1.0f, 2.0f}, {1, 2});
+  Output z = ops::Const(s.WithOpName("z"), {1.0f, 2.0f}, {1, 2});
+  Output add = ops::Add(s.WithOpName("add"), x, y);
+  Output id1 =
+      ops::Identity(s.WithOpName("id1").WithControlDependencies(x), add);
+  Output id2 = ops::Identity(
+      s.WithOpName("id2").WithControlDependencies(y).WithControlDependencies(z),
+      add);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch.push_back("id1");
+  item.fetch.push_back("id2");
+
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  // Run the optimizer twice to make sure the rewrite is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  // The 'z' node should have been optimized away leaving only 5 nodes.
+  EXPECT_EQ(5, output.node_size());
+
+  for (const NodeDef& node : item.graph.node()) {
+    if (node.name() == "id1" || node.name() == "id2") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("add", node.input(0));
+    }
+  }
+}
+
+TEST_F(DependencyOptimizerTest, ChangeToNoop) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::RandomUniform(s.WithOpName("x"), {1, 2}, DT_FLOAT);
+  Output y = ops::RandomUniform(s.WithOpName("y"), {1, 2}, DT_FLOAT);
   Output add = ops::Add(s.WithOpName("add"), x, y);
   Output id1 =
       ops::Identity(s.WithOpName("id1").WithControlDependencies(add), x);
@@ -85,29 +123,31 @@ TEST_F(DependencyOptimizerTest, ChangeToNoop) {
 
   EXPECT_EQ(item.graph.node_size(), output.node_size());
   for (int i = 0; i < item.graph.node_size(); ++i) {
-    const NodeDef& original = item.graph.node(i);
-    const NodeDef& optimized = output.node(i);
-    EXPECT_EQ(original.name(), optimized.name());
-    if (original.name() == "add") {
-      EXPECT_EQ("NoOp", optimized.op());
-    } else {
-      EXPECT_EQ(original.op(), optimized.op());
-    }
-    EXPECT_EQ(original.input_size(), optimized.input_size());
-    for (int j = 0; j < original.input_size(); ++j) {
-      if (original.name() == "add") {
-        EXPECT_EQ(AsControlDependency(original.input(j)), optimized.input(j));
-      } else {
-        EXPECT_EQ(original.input(j), optimized.input(j));
-      }
+    const NodeDef& node = item.graph.node(i);
+    if (node.name() == "add") {
+      EXPECT_EQ("NoOp", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("^x", node.input(0));
+      EXPECT_EQ("^y", node.input(1));
+    } else if (node.name() == "id1") {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("^y", node.input(1));
+    } else if (node.name() == "id2") {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("y", node.input(0));
+      EXPECT_EQ("^x", node.input(1));
     }
   }
 }
 
+// TODO(rmlarsen): Add test to make sure we skip Switch and Merge.
 TEST_F(DependencyOptimizerTest, ChangeToNoop_NoFetch) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
-  Output y = ops::Const(s.WithOpName("y"), {1.0f, 2.0f}, {1, 2});
+  Output x = ops::RandomUniform(s.WithOpName("x"), {1, 2}, DT_FLOAT);
+  Output y = ops::RandomUniform(s.WithOpName("y"), {1, 2}, DT_FLOAT);
   Output add = ops::Add(s.WithOpName("add"), x, y);
   Output id1 =
       ops::Identity(s.WithOpName("id1").WithControlDependencies(add), x);
@@ -117,17 +157,18 @@ TEST_F(DependencyOptimizerTest, ChangeToNoop_NoFetch) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  DependencyOptimizer optimizer;
+  DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
+  TF_CHECK_OK(TopologicalSort(&item.graph));
   VerifyGraphsEqual(item.graph, output, __FUNCTION__);
 }
 
 TEST_F(DependencyOptimizerTest, RemoveNoOps_EmptyInputOrOutput) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output x = ops::Const(s, {1.0f, 2.0f}, {1, 2});
+  Output x = ops::RandomUniform(s, {1, 2}, DT_FLOAT);
   auto noop1 = ops::NoOp(s);
   auto noop2 = ops::NoOp(s.WithControlDependencies(x));
   Output id = ops::Identity(s.WithControlDependencies({noop1.operation}), x);
@@ -151,15 +192,49 @@ TEST_F(DependencyOptimizerTest, RemoveNoOps_EmptyInputOrOutput) {
       EXPECT_EQ(0, node.input_size());
     } else if (node.name() == "Identity") {
       EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("Const", node.input(0));
+      EXPECT_EQ("RandomUniform", node.input(0));
     }
   }
 }
 
+TEST_F(DependencyOptimizerTest, RemoveNoOps_DeviceBoundaries) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::RandomUniform(s.WithOpName("x").WithDevice("/CPU:0"), {1, 2},
+                                DT_FLOAT);
+  Output y = ops::RandomUniform(s.WithOpName("y").WithDevice("/CPU:0"), {1, 2},
+                                DT_FLOAT);
+  // NoOp with a single input- and two output dependencies.
+  auto noop = ops::NoOp(s.WithControlDependencies(x).WithDevice("/CPU:1"));
+  // NoOp with a two input- and a single output dependency.
+  auto noop_1 = ops::NoOp(
+      s.WithControlDependencies(x).WithControlDependencies(y).WithDevice(
+          "/CPU:0"));
+  Output id = ops::Identity(
+      s.WithControlDependencies({noop.operation}).WithDevice("/CPU:1"), x);
+  Output id_1 = ops::Identity(
+      s.WithControlDependencies({noop.operation, noop_1.operation})
+          .WithDevice("/CPU:1"),
+      y);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch.push_back("Identity");
+  item.fetch.push_back("Identity_1");
+
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  // The optimization should be disabled to prevent increasing the number of
+  // nodes crossing device boundaries.
+  VerifyGraphsEqual(item.graph, output, __FUNCTION__);
+}
+
 TEST_F(DependencyOptimizerTest, RemoveNoOps_SingleInputOrOutput) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
-  Output y = ops::Const(s.WithOpName("y"), {1.0f, 2.0f}, {1, 2});
+  Output x = ops::RandomUniform(s.WithOpName("x"), {1, 2}, DT_FLOAT);
+  Output y = ops::RandomUniform(s.WithOpName("y"), {1, 2}, DT_FLOAT);
   // NoOp with a single input- and two output dependencies.
   auto noop = ops::NoOp(s.WithControlDependencies(x));
   // NoOp with a two input- and a single output dependency.
@@ -196,6 +271,27 @@ TEST_F(DependencyOptimizerTest, RemoveNoOps_SingleInputOrOutput) {
   }
 }
 
+TEST_F(DependencyOptimizerTest, Transitive_Reduction_Simple) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output c = ops::Const(s.WithOpName("c"), {1.0f, 2.0f}, {1, 2});
+  Output x = ops::Square(s.WithOpName("x"), c);
+  Output id1 = ops::Identity(s.WithOpName("id1"), x);
+  Output id2 =
+      ops::Identity(s.WithOpName("id2").WithControlDependencies({x}), id1);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch.push_back("id2");
+  DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  EXPECT_EQ(4, output.node_size());
+  EXPECT_EQ("id2", output.node(3).name());
+  EXPECT_EQ(1, output.node(3).input_size());
+  EXPECT_EQ("id1", output.node(3).input(0));
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer.h b/tensorflow/core/grappler/optimizers/graph_optimizer.h
index 55a90dce88f91bf88e6c6ad4ff5f9d2804d539f9..42d9837312d25f3504c85f12883c4ac818157cdd 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer.h
@@ -41,7 +41,7 @@ class GraphOptimizer {
                           GraphDef* optimized_graph) = 0;
 
   // Method invoked by the framework so that it can provide feedback
-  // on how well the "optimize_output" (produced as *output from a
+  // on how well the "optimized_graph" (produced as *optimized_graph from a
   // call to Optimize) performed.  Lower "result" scores are better.
   virtual void Feedback(Cluster* cluster, const GrapplerItem& item,
                         const GraphDef& optimized_graph, double result) = 0;
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index ba5d13eeaffab4151285b7b99ca4ac0ebe489d5f..bcf785f272207da1e12ada5f8fd92357893db1e8 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <deque>
 #include <unordered_set>
 
 #include "tensorflow/core/framework/attr_value.pb.h"
@@ -27,48 +28,148 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/frame.h"
 #include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/util/device_name_utils.h"
 
 namespace tensorflow {
 namespace grappler {
+namespace {
 
-const char kConcatConst[] = "LayoutOptimizerConcatConst";
+const char kPrefix[] = "LayoutOptimizer";
 const char kPermNHWCToNCHW[] = "LayoutOptimizerPermConstNHWCToNCHW";
 const char kPermNCHWToNHWC[] = "LayoutOptimizerPermConstNCHWToNHWC";
-const char kGatherAxisConst[] = "LayoutOptimizerGatherAxisConst";
 const char kTransposeNHWCToNCHW[] = "LayoutOptimizerTransposeNHWCToNCHW";
 const char kTransposeNCHWToNHWC[] = "LayoutOptimizerTransposeNCHWToNHWC";
-const char kPermVecNHWCToNCHW[] = "LayoutOptimizerPermVecNHWCToNCHW";
+const char kVecPermuteNHWCToNCHW[] = "LayoutOptimizerVecPermuteNHWCToNCHW";
+const char kVecPermuteNCHWToNHWC[] = "LayoutOptimizerVecPermuteNCHWToNHWC";
 const char kReshapeNHWCToNCHW[] = "LayoutOptimizerReshapeNHWCToNCHW";
 const char kReshapeConst[] = "LayoutOptimizerReshapeConst";
 const char kReductionConst[] = "LayoutOptimizerReductionConst";
 
 std::set<string> GetOpsFormatSupported() {
-  std::set<string> ops_format_supported = {"AvgPool",
-                                           "AvgPoolGrad",
-                                           "Conv2D",
-                                           "Conv2DBackpropFilter",
-                                           "Conv2DBackpropInput",
-                                           "BiasAdd",
-                                           "BiasAddGrad",
-                                           "FusedBatchNorm",
-                                           "FusedBatchNormGrad",
-                                           "FusedConv2DBiasActivation",
-                                           "MaxPool",
-                                           "MaxPoolGrad"};
+  std::set<string> ops_format_supported = {
+      "AvgPool",
+      "AvgPoolGrad",
+      "Conv2D",
+      "Conv2DBackpropFilter",
+      "Conv2DBackpropInput",
+      "BiasAdd",
+      "BiasAddGrad",
+      "DepthwiseConv2dNative",
+      "DepthwiseConv2dNativeBackpropInput",
+      "DepthwiseConv2dNativeBackpropFilter",
+      "FusedBatchNorm",
+      "FusedBatchNormGrad",
+      "FusedConv2DBiasActivation",
+      "MaxPool",
+      "MaxPoolGrad",
+      "SpaceToDepth",
+      "DepthToSpace"};
   return ops_format_supported;
 }
 
+// TODO(yaozhang): enable SumProcessor with auto-tuning. Currently disabled
+// because of the worse performance in some cases.
 std::set<string> GetOpsFormatAgnostic() {
-  std::set<string> ops_format_agnostic = {
-      "Add",      "AddN",     "Concat", "ConcatV2",
-      "Floor",    "Identity", "Mul",    "Neg",
-      "Pad",      "RealDiv",  "Relu",   "Relu6",
-      "ReluGrad", "Sigmoid",  "Slice",  "SquaredDifference",
-      "Squeeze",  "Sub"};
+  std::set<string> ops_format_agnostic = {"Abs",
+                                          "Add",
+                                          "AddN",
+                                          "Acos",
+                                          "Acosh",
+                                          "Angle",
+                                          "Asin",
+                                          "Asinh",
+                                          "Atan",
+                                          "Atanh",
+                                          "Bitcast",
+                                          "Cast",
+                                          "Ceil",
+                                          "CheckNumerics",
+                                          "Cos",
+                                          "Cosh",
+                                          "ComplexAbs",
+                                          "Concat",
+                                          "ConcatV2",
+                                          "Conj",
+                                          "Digamma",
+                                          "Elu",
+                                          "EluGrad",
+                                          "Erf",
+                                          "Erfc",
+                                          "Exp",
+                                          "Expm1",
+                                          "Floor",
+                                          "GuaranteeConst",
+                                          "Identity",
+                                          "Imag",
+                                          "Inv",
+                                          "InvGrad",
+                                          "IsFinite",
+                                          "IsInf",
+                                          "IsNan",
+                                          "Lgamma",
+                                          "Log",
+                                          "Log1p",
+                                          "Merge",
+                                          "Mul",
+                                          "Neg",
+                                          "OnesLike",
+                                          "Pad",
+                                          "PreventGradient",
+                                          "Real",
+                                          "RealDiv",
+                                          "Reciprocal",
+                                          "ReciprocalGrad",
+                                          "Relu",
+                                          "Relu6",
+                                          "Relu6Grad",
+                                          "ReluGrad",
+                                          "Rint",
+                                          "Selu",
+                                          "SeluGrad",
+                                          "Shape",
+                                          "ShapeN",
+                                          "Sigmoid",
+                                          "SigmoidGrad",
+                                          "Sign",
+                                          "Sin",
+                                          "Sinh",
+                                          "Slice",
+                                          "Snapshot",
+                                          "Softplus",
+                                          "SoftplusGrad",
+                                          "Split",
+                                          "Switch",
+                                          "RefIdentity",
+                                          "RefMerge",
+                                          "RefSwitch",
+                                          "Round",
+                                          "Rsqrt",
+                                          "RsqrtGrad",
+                                          "Sqrt",
+                                          "SqrtGrad",
+                                          "Square",
+                                          "SquaredDifference",
+                                          "Squeeze",
+                                          "StopGradient",
+                                          /*"Sum",*/ "Sub",
+                                          "Tan",
+                                          "Tanh",
+                                          "TanhGrad",
+                                          "ZerosLike"};
   return ops_format_agnostic;
 }
 
+bool IsNodeByLayoutOptimizer(const string& node_name) {
+  const string prefix_pattern = kPrefix;
+  string prefix = node_name.substr(0, prefix_pattern.length());
+  if (prefix.compare(prefix_pattern) == 0) {
+    return true;
+  }
+  return false;
+}
+
 bool IsNodeNHWCToNCHW(const string& node_name) {
   const string transpose_node_prefix = kTransposeNHWCToNCHW;
   string prefix = node_name.substr(0, transpose_node_prefix.length());
@@ -87,10 +188,39 @@ bool IsNodeNCHWToNHWC(const string& node_name) {
   return false;
 }
 
+bool IsConcat(const NodeDef& node) {
+  const auto op = node.op();
+  return op == "Concat" || op == "ConcatV2";
+}
+
+bool IsConcatV1(const NodeDef& node) {
+  const auto op = node.op();
+  return op == "Concat";
+}
+
+bool IsMaxPoolGradV1(const NodeDef& node) {
+  const auto& op = node.op();
+  return op == "MaxPoolGrad";
+}
+
+bool IsUnaryGrad(const NodeDef& node) {
+  bool is_unary_grad =
+      IsEluGrad(node) || IsInvGrad(node) || IsReciprocalGrad(node) ||
+      IsRelu6Grad(node) || IsReluGrad(node) || IsRsqrtGrad(node) ||
+      IsSeluGrad(node) || IsSigmoidGrad(node) || IsSoftplusGrad(node) ||
+      IsSoftsignGrad(node) || IsSqrtGrad(node) || IsTanhGrad(node);
+  return is_unary_grad;
+}
+
 class GraphProcessor {
  public:
-  GraphProcessor(GraphDef* graph, NodeMap* node_map)
-      : graph_(graph), node_map_(node_map) {}
+  GraphProcessor(const VirtualPlacer& virtual_placer,
+                 const std::unordered_set<string>& nodes_to_preserve,
+                 GraphDef* graph, NodeMap* node_map)
+      : virtual_placer_(virtual_placer),
+        nodes_to_preserve_(nodes_to_preserve),
+        graph_(graph),
+        node_map_(node_map) {}
 
  protected:
   NodeDef* AddNodePermConst(const string& name, const string& device,
@@ -99,7 +229,6 @@ class GraphProcessor {
     node_map_->AddNode(name, node);
     node->set_name(name);
     node->set_op("Const");
-    node->set_device(device);
     AttrValue attr_data_type;
     attr_data_type.set_type(DT_INT32);
     node->mutable_attr()->insert({"dtype", attr_data_type});
@@ -110,6 +239,13 @@ class GraphProcessor {
     }
     tensor.AsProtoTensorContent(attr_tensor.mutable_tensor());
     node->mutable_attr()->insert({"value", attr_tensor});
+    string device_name;
+    if (device.empty()) {
+      device_name = virtual_placer_.get_canonical_device_name(*node);
+    } else {
+      device_name = device;
+    }
+    node->set_device(device_name);
     return node;
   }
 
@@ -119,7 +255,6 @@ class GraphProcessor {
     node_map_->AddNode(name, node);
     node->set_name(name);
     node->set_op("Const");
-    node->set_device(device);
     AttrValue attr_data_type;
     attr_data_type.set_type(dtype);
     node->mutable_attr()->insert({"dtype", attr_data_type});
@@ -128,43 +263,48 @@ class GraphProcessor {
     tensor.scalar<int>()() = value;
     tensor.AsProtoTensorContent(attr_tensor.mutable_tensor());
     node->mutable_attr()->insert({"value", attr_tensor});
-    return node;
-  }
-
-  NodeDef* AddNodeReductionConst(const string& name, const string& device) {
-    NodeDef* node = graph_->add_node();
-    node_map_->AddNode(name, node);
-    node->set_name(name);
-    node->set_op("Const");
-    node->set_device(device);
-    AttrValue attr_data_type;
-    attr_data_type.set_type(DT_INT32);
-    node->mutable_attr()->insert({"dtype", attr_data_type});
-
-    AttrValue attr_tensor;
-    Tensor tensor(DT_INT32, TensorShape({3}));
-    std::vector<int> axis = {0, 2, 3};
-    for (int i = 0; static_cast<size_t>(i) < axis.size(); i++) {
-      tensor.flat<int>()(i) = axis[i];
+    string device_name;
+    if (device.empty()) {
+      device_name = virtual_placer_.get_canonical_device_name(*node);
+    } else {
+      device_name = device;
     }
-    tensor.AsProtoTensorContent(attr_tensor.mutable_tensor());
-    node->mutable_attr()->insert({"value", attr_tensor});
+    node->set_device(device_name);
     return node;
   }
 
+  const VirtualPlacer& virtual_placer_;
+  const std::unordered_set<string>& nodes_to_preserve_;
   GraphDef* graph_;
   NodeMap* node_map_;
+};
 
- private:
+struct OptimizeContext {
+  OptimizeContext(GraphDef* graph, NodeDef* node, NodeMap* node_map,
+                  const VirtualPlacer& virtual_placer,
+                  const std::unordered_set<string>& nodes_to_preserve,
+                  bool is_in_frame)
+      : graph(graph),
+        node(node),
+        node_map(node_map),
+        virtual_placer(virtual_placer),
+        nodes_to_preserve(nodes_to_preserve),
+        is_in_frame(is_in_frame) {}
+  GraphDef* graph;
+  NodeDef* node;
+  NodeMap* node_map;
+  const VirtualPlacer& virtual_placer;
+  const std::unordered_set<string>& nodes_to_preserve;
+  bool is_in_frame;
 };
 
 class NodeProcessor : public GraphProcessor {
  public:
-  NodeProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                bool is_in_frame)
-      : GraphProcessor(graph, node_map),
-        node_(node),
-        is_in_frame_(is_in_frame) {}
+  explicit NodeProcessor(const OptimizeContext& opt_cxt)
+      : GraphProcessor(opt_cxt.virtual_placer, opt_cxt.nodes_to_preserve,
+                       opt_cxt.graph, opt_cxt.node_map),
+        node_(opt_cxt.node),
+        is_in_frame_(opt_cxt.is_in_frame) {}
   virtual ~NodeProcessor() {}
   virtual Status ConvertNode() {
     if (ShouldProcess()) {
@@ -180,17 +320,34 @@ class NodeProcessor : public GraphProcessor {
   }
 
  protected:
-  bool IsDimsN(const NodeDef& node, int n) const {
+  bool IsPortDimsN(const NodeDef& node, int port, int n) const {
     if (node.attr().find("_output_shapes") != node.attr().end()) {
-      auto shape = node.attr().at("_output_shapes").list().shape(0);
-      if (shape.dim_size() == n) {
-        return true;
+      if (node.attr().at("_output_shapes").list().shape_size() > port) {
+        auto shape = node.attr().at("_output_shapes").list().shape(port);
+        if (shape.unknown_rank()) {
+          return false;
+        }
+        if (shape.dim_size() == n) {
+          return true;
+        }
       }
     }
     return false;
   }
 
-  bool IsDimsFour(const NodeDef& node) const { return IsDimsN(node, 4); }
+  bool IsPortZeroDimsN(const NodeDef& node, int n) const {
+    return IsPortDimsN(node, 0, n);
+  }
+
+  bool IsPortZeroDimsFour(const NodeDef& node) const {
+    return NodeProcessor::IsPortZeroDimsN(node, 4) ||
+           IsNodeNCHWToNHWC(node.name());
+  }
+
+  bool IsPortDimsFour(const NodeDef& node, int port) const {
+    return NodeProcessor::IsPortDimsN(node, port, 4) ||
+           IsNodeNCHWToNHWC(node.name());
+  }
 
   bool IsNHWC() const {
     if (node_->attr().find("data_format") != node_->attr().end()) {
@@ -214,8 +371,30 @@ class NodeProcessor : public GraphProcessor {
     return Status::OK();
   }
 
+  bool MustPreserve() const {
+    return nodes_to_preserve_.find(node_->name()) != nodes_to_preserve_.end();
+  }
+
   virtual bool ShouldProcess() const {
-    return IsNHWC() && IsDimsFour(*node_) && HasOutputs();
+    return !MustPreserve() && IsNHWC() && IsPortZeroDimsFour(*node_) &&
+           HasOutputs() && IsOnGPU();
+  }
+
+  virtual bool IsOnGPU() const {
+    string device_name;
+    if (node_->device().empty()) {
+      device_name = virtual_placer_.get_canonical_device_name(*node_);
+    } else {
+      device_name = node_->device();
+    }
+    string device;
+    string not_used;
+    if (DeviceNameUtils::SplitDeviceName(device_name, &not_used, &device) &&
+        (StringPiece(str_util::Lowercase(device)))
+            .contains(str_util::Lowercase(DEVICE_GPU))) {
+      return true;
+    }
+    return false;
   }
 
   void UpdateAttrDataFormat() {
@@ -267,11 +446,30 @@ class NodeProcessor : public GraphProcessor {
     if (!success) {
       LOG(ERROR) << "Failed to parse TensorProto.";
     }
-    if (tensor.dims() == 1) {
-      int c = tensor.flat<int>()(3);
-      tensor.flat<int>()(3) = tensor.flat<int>()(2);
-      tensor.flat<int>()(2) = tensor.flat<int>()(1);
-      tensor.flat<int>()(1) = c;
+    if (tensor.dims() == 0) {
+      int value = tensor.scalar<int>()();
+      value = (value >= 0) ? value : value + 4;
+      if (value == 1 || value == 2) {
+        value = value + 1;
+      } else if (value == 3) {
+        value = 1;
+      }
+      tensor.scalar<int>()() = value;
+    } else if (tensor.dims() == 1) {
+      if (tensor.flat<int>().size() == 4) {
+        int c = tensor.flat<int>()(3);
+        tensor.flat<int>()(3) = tensor.flat<int>()(2);
+        tensor.flat<int>()(2) = tensor.flat<int>()(1);
+        tensor.flat<int>()(1) = c;
+      } else if (tensor.flat<int>().size() == 3) {
+        tensor.flat<int>()(0) = 0;
+        tensor.flat<int>()(1) = 2;
+        tensor.flat<int>()(2) = 3;
+      } else {
+        return Status(error::INVALID_ARGUMENT,
+                      strings::StrCat("Unsupported tensor size: ",
+                                      tensor.flat<int>().size()));
+      }
     } else if (tensor.dims() == 2) {
       for (int i = 0; i < 2; i++) {
         int c = tensor.matrix<int>()(3, i);
@@ -284,15 +482,21 @@ class NodeProcessor : public GraphProcessor {
           error::INVALID_ARGUMENT,
           strings::StrCat("Unsupported dimension size: ", tensor.dims()));
     }
-    tensor.AsProtoTensorContent(
-        node->mutable_attr()->at({"value"}).mutable_tensor());
+    if (tensor.dims() == 0) {
+      tensor.AsProtoField(node->mutable_attr()->at({"value"}).mutable_tensor());
+    } else {
+      tensor.AsProtoTensorContent(
+          node->mutable_attr()->at({"value"}).mutable_tensor());
+    }
     return Status::OK();
   }
 
   Status UpdateAttrValueOfInput(int input_index) {
     auto input_node = node_map_->GetNode(node_->input(input_index));
     // We created a copy of the node, so that we don't modify the original node,
-    // which might be used elsewhere.
+    // which might be used elsewhere. Note that this copy also copies the
+    // control dependency input in the case this node is inside a loop,
+    // to ensure added_node is in the same frame with node_.
     NodeDef* added_node = graph_->add_node();
     *added_node = *input_node;
     string base_name = strings::StrCat(node_->name(), "-", input_node->name());
@@ -309,6 +513,14 @@ class NodeProcessor : public GraphProcessor {
     return input_pos;
   }
 
+  virtual std::set<int> GetOutputPos() const {
+    // For most nodes, no need to process control nodes or nodes that use an
+    // output other than the first output: only the first output is of
+    // 4D NCHW/NHWC format and thus relevant here.
+    std::set<int> output_pos = {0};
+    return output_pos;
+  }
+
   NodeDef* AddNodeTranspose(const string& node_name, const string& input_name,
                             const string& const_name, DataType data_type,
                             const TensorShapeProto& input_shape,
@@ -348,16 +560,14 @@ class NodeProcessor : public GraphProcessor {
   virtual Status AddLayoutTransposeToInputs() {
     std::vector<int> input_pos = GetInputPos();
     for (const auto& pos : input_pos) {
-      int output_pos;
-      string input_node_name = ParseNodeName(node_->input(pos), &output_pos);
-      string base_name =
-          strings::StrCat(node_->name(), "-", input_node_name, "-", output_pos);
       string node_name =
-          AddPrefixToNodeName(base_name, kTransposeNHWCToNCHW, "-");
-      auto input_node = node_map_->GetNode(node_->input(pos));
+          strings::StrCat(kTransposeNHWCToNCHW, "-", node_->name(), "-", pos);
       TF_RETURN_IF_ERROR(HasAttribute(*node_, "T"));
+      auto input_node = node_map_->GetNode(node_->input(pos));
       TF_RETURN_IF_ERROR(HasAttribute(*input_node, "_output_shapes"));
       string const_name = GetOrAddNodePermNHWCToNCHW(pos);
+      int output_pos;
+      ParseNodeName(node_->input(pos), &output_pos);
       AddNodeTranspose(
           node_name, node_->input(pos), const_name,
           node_->attr().at("T").type(),
@@ -370,52 +580,80 @@ class NodeProcessor : public GraphProcessor {
     return Status::OK();
   }
 
-  virtual Status AddLayoutTransposeToOutputs() {
+  Status AddTransformToOutputs(const string& op) {
     auto outputs = node_map_->GetOutputs(node_->name());
     string const_name = GetOrAddNodePermNCHWToNHWC();
+    int output_count = 0;
     for (const auto& output : outputs) {
-      string base_name = strings::StrCat(node_->name(), "-", output->name());
-      string node_name =
-          AddPrefixToNodeName(base_name, kTransposeNCHWToNHWC, "-");
-      // TODO(yaozhang): handle the rare case where node A is connected to more
-      // than one input of node B.
-      auto it = std::find_if(output->mutable_input()->begin(),
-                             output->mutable_input()->end(),
-                             [this](const string& input) {
-                               string node_name = NodeName(input);
-                               return node_name.compare(node_->name()) == 0;
-                             });
-      if (it == output->mutable_input()->end()) {
-        return Status(error::INVALID_ARGUMENT,
-                      strings::StrCat("Expect ", node_->name(),
-                                      " to be an input of ", output->name()));
+      int connections = 0;
+      int connections_removed = 0;
+      for (int i = 0; i < output->input_size(); i++) {
+        auto& input = *output->mutable_input(i);
+        int input_port;
+        string input_name = ParseNodeName(input, &input_port);
+        auto output_pos = GetOutputPos();
+        if (input_name == node_->name()) {
+          connections++;
+          if (output_pos.find(input_port) != output_pos.end()) {
+            connections_removed++;
+            string added_node_base_name =
+                strings::StrCat(node_->name(), "-", output_count, "-", i);
+            string added_node_name;
+            if (op == "Transpose") {
+              added_node_name = AddPrefixToNodeName(added_node_base_name,
+                                                    kTransposeNCHWToNHWC, "-");
+              DataType dtype;
+              if (op == "Imag" || op == "Real" || op == "Angle" ||
+                  op == "Conj" || op == "ComplexAbs") {
+                TF_RETURN_IF_ERROR(HasAttribute(*node_, "Tout"));
+                dtype = node_->attr().at("Tout").type();
+              } else if (op == "Bitcast") {
+                TF_RETURN_IF_ERROR(HasAttribute(*node_, "type"));
+                dtype = node_->attr().at("type").type();
+              } else {
+                TF_RETURN_IF_ERROR(HasAttribute(*node_, "T"));
+                dtype = node_->attr().at("T").type();
+              }
+              TF_RETURN_IF_ERROR(HasAttribute(*node_, "_output_shapes"));
+              AddNodeTranspose(
+                  added_node_name, input, const_name, dtype,
+                  node_->attr().at("_output_shapes").list().shape(0), false);
+            } else if (op == "DataFormatVecPermute") {
+              added_node_name = AddPrefixToNodeName(added_node_base_name,
+                                                    kVecPermuteNCHWToNHWC, "-");
+              TF_RETURN_IF_ERROR(HasAttribute(*node_, "out_type"));
+              DataType dtype = (IsSplit(*node_))
+                                   ? DT_INT32
+                                   : node_->attr().at("out_type").type();
+              AddNodeDataFormatOp(added_node_name, input, op, dtype, false);
+            } else {
+              return errors::InvalidArgument("Unsupported op type: ", op);
+            }
+            input = added_node_name;
+            node_map_->AddOutput(node_->name(), added_node_name);
+            node_map_->AddOutput(added_node_name, output->name());
+          }
+        }
       }
-      int output_pos = NodePosition(*it);
-      // No need to process control nodes or nodes that use an output
-      // other than the first output: only the first output is of 4D NCHW/NHWC
-      // format and thus relevant here.
-      if (output_pos != 0) {
-        continue;
+      if (connections == connections_removed) {
+        node_map_->RemoveOutput(node_->name(), output->name());
       }
-      TF_RETURN_IF_ERROR(HasAttribute(*node_, "T"));
-      TF_RETURN_IF_ERROR(HasAttribute(*node_, "_output_shapes"));
-      AddNodeTranspose(
-          node_name, node_->name(), const_name, node_->attr().at("T").type(),
-          node_->attr().at("_output_shapes").list().shape(0), false);
-      *it = node_name;
-      node_map_->UpdateOutput(node_->name(), output->name(), node_name);
-      node_map_->AddOutput(node_name, output->name());
+      output_count++;
     }
     return Status::OK();
   }
 
+  virtual Status AddLayoutTransposeToOutputs() {
+    return AddTransformToOutputs("Transpose");
+  }
+
   virtual Status CustomizedProcessing() { return Status::OK(); }
 
   NodeDef* AddNodePermNHWCToNCHW(const string& suffix,
                                  const string& depended_node,
                                  const string& device) {
-    auto const_node = AddNodePermConst(
-        strings::StrCat(kPermNHWCToNCHW, "-", suffix), device, {0, 3, 1, 2});
+    string name = strings::StrCat(kPermNHWCToNCHW, "-", suffix);
+    auto const_node = AddNodePermConst(name, device, {0, 3, 1, 2});
     // This is to ensure the transpose node and the const node are in the
     // same frame.
     *const_node->add_input() = AsControlDependency(depended_node);
@@ -433,6 +671,40 @@ class NodeProcessor : public GraphProcessor {
     return const_node;
   }
 
+  NodeDef* AddNodeDataFormatOp(const string& name, const string& input_name,
+                               const string& op, DataType dtype,
+                               bool nhwc_to_nchw) {
+    NodeDef* added_node = graph_->add_node();
+    added_node->set_name(name);
+    added_node->set_op(op);
+    node_map_->AddNode(added_node->name(), added_node);
+    added_node->set_device(node_->device());
+    AttrValue attr_data_type;
+    attr_data_type.set_type(dtype);
+    added_node->mutable_attr()->insert({"T", attr_data_type});
+    string src_format = (nhwc_to_nchw) ? "NHWC" : "NCHW";
+    string dst_format = (nhwc_to_nchw) ? "NCHW" : "NHWC";
+    AttrValue attr_format;
+    attr_format.set_s(src_format);
+    added_node->mutable_attr()->insert({"src_format", attr_format});
+    attr_format.set_s(dst_format);
+    added_node->mutable_attr()->insert({"dst_format", attr_format});
+    *added_node->add_input() = input_name;
+    return added_node;
+  }
+
+  void AddDataFormatTranformToInput(const string& op, int input_pos,
+                                    DataType dtype) {
+    string name = strings::StrCat(kVecPermuteNHWCToNCHW, "_", node_->name(),
+                                  "_", input_pos);
+    auto added_node =
+        AddNodeDataFormatOp(name, node_->input(input_pos), op, dtype, true);
+    *node_->mutable_input(input_pos) = added_node->name();
+    node_map_->UpdateOutput(added_node->input(0), node_->name(),
+                            added_node->name());
+    node_map_->AddOutput(added_node->name(), node_->name());
+  }
+
   NodeDef* node_;
   bool is_in_frame_;
 
@@ -440,8 +712,17 @@ class NodeProcessor : public GraphProcessor {
   string GetOrAddNodePermNHWCToNCHW(int pos) {
     string const_name;
     if (is_in_frame_) {
-      auto const_node = AddNodePermNHWCToNCHW(
-          node_->input(pos), NodeName(node_->input(pos)), node_->device());
+      string suffix = strings::StrCat(node_->name(), "_", pos);
+      string input = NodeName(node_->input(pos));
+      string depended_node;
+      if (!IsNodeNCHWToNHWC(input)) {
+        depended_node = input;
+      } else {
+        auto input_node = node_map_->GetNode(input);
+        depended_node = NodeName(input_node->input(0));
+      }
+      auto const_node =
+          AddNodePermNHWCToNCHW(suffix, depended_node, node_->device());
       const_name = const_node->name();
     } else {
       const_name = kPermNHWCToNCHW;
@@ -473,9 +754,8 @@ class NodeProcessor : public GraphProcessor {
 
 class AvgPoolGradProcessor : public NodeProcessor {
  public:
-  AvgPoolGradProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                       bool is_in_frame)
-      : NodeProcessor(graph, node, node_map, is_in_frame) {}
+  explicit AvgPoolGradProcessor(const OptimizeContext& opt_cxt)
+      : NodeProcessor(opt_cxt) {}
 
  protected:
   std::vector<int> GetInputPos() const override {
@@ -487,15 +767,22 @@ class AvgPoolGradProcessor : public NodeProcessor {
 
 class BiasAddGradProcessor : public NodeProcessor {
  public:
-  BiasAddGradProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                       bool is_in_frame)
-      : NodeProcessor(graph, node, node_map, is_in_frame) {}
+  explicit BiasAddGradProcessor(const OptimizeContext& opt_cxt)
+      : NodeProcessor(opt_cxt) {}
 
  protected:
   bool ShouldProcess() const override {
+    if (MustPreserve()) {
+      return false;
+    }
+    if (!IsOnGPU()) {
+      return false;
+    }
     auto input = node_map_->GetNode(node_->input(0));
     if (input) {
-      if ((IsNHWC() && IsDimsFour(*input)) || IsNodeNCHWToNHWC(input->name())) {
+      int port;
+      ParseNodeName(node_->input(0), &port);
+      if (IsNHWC() && IsPortDimsFour(*input, port)) {
         return true;
       }
     }
@@ -507,14 +794,13 @@ class BiasAddGradProcessor : public NodeProcessor {
 
 class Conv2DProcessor : public NodeProcessor {
  public:
-  Conv2DProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                  bool no_gemm, bool is_in_frame)
-      : NodeProcessor(graph, node, node_map, is_in_frame), no_gemm_(no_gemm) {}
+  Conv2DProcessor(const OptimizeContext& opt_cxt, bool no_gemm)
+      : NodeProcessor(opt_cxt), no_gemm_(no_gemm) {}
 
  protected:
   bool ShouldProcess() const override {
-    return IsNHWC() && IsDimsFour(*node_) && HasOutputs() &&
-           (!IsGemmUsed() || no_gemm_);
+    return !MustPreserve() && IsNHWC() && IsPortZeroDimsFour(*node_) &&
+           HasOutputs() && (!IsGemmUsed() || no_gemm_) && IsOnGPU();
   }
 
   TensorShapeProto GetShape(const string& input_name) const {
@@ -577,10 +863,8 @@ class Conv2DProcessor : public NodeProcessor {
 
 class Conv2DBackpropFilterProcessor : public Conv2DProcessor {
  public:
-  Conv2DBackpropFilterProcessor(GraphDef* graph, NodeDef* node,
-                                NodeMap* node_map, bool no_gemm,
-                                bool is_in_frame)
-      : Conv2DProcessor(graph, node, node_map, no_gemm, is_in_frame) {}
+  Conv2DBackpropFilterProcessor(const OptimizeContext& opt_cxt, bool no_gemm)
+      : Conv2DProcessor(opt_cxt, no_gemm) {}
 
  protected:
   bool IsGemmUsed() const override {
@@ -603,10 +887,8 @@ class Conv2DBackpropFilterProcessor : public Conv2DProcessor {
 
 class Conv2DBackpropInputProcessor : public Conv2DProcessor {
  public:
-  Conv2DBackpropInputProcessor(GraphDef* graph, NodeDef* node,
-                               NodeMap* node_map, bool no_gemm,
-                               bool is_in_frame)
-      : Conv2DProcessor(graph, node, node_map, no_gemm, is_in_frame) {}
+  Conv2DBackpropInputProcessor(const OptimizeContext& opt_cxt, bool no_gemm)
+      : Conv2DProcessor(opt_cxt, no_gemm) {}
 
  protected:
   bool IsGemmUsed() const override {
@@ -620,27 +902,47 @@ class Conv2DBackpropInputProcessor : public Conv2DProcessor {
     return input_pos;
   }
 
-  Status CustomizedProcessing() override { return UpdateAttrValueOfInput(0); }
+  Status CustomizedProcessing() override {
+    auto input_size_node = node_map_->GetNode(node_->input(0));
+    if (IsConstant(*input_size_node)) {
+      TF_RETURN_IF_ERROR(UpdateAttrValueOfInput(0));
+    } else {
+      AddDataFormatTranformToInput("DataFormatVecPermute", 0, DT_INT32);
+    }
+    return Status::OK();
+  }
 };
 
 class FusedBatchNormGradProcessor : public NodeProcessor {
  public:
-  FusedBatchNormGradProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                              bool is_in_frame)
-      : NodeProcessor(graph, node, node_map, is_in_frame) {}
+  explicit FusedBatchNormGradProcessor(const OptimizeContext& opt_cxt)
+      : NodeProcessor(opt_cxt) {}
 
  protected:
+  bool ShouldProcess() const override {
+    return NodeProcessor::ShouldProcess() && IsTraining();
+  }
+
   std::vector<int> GetInputPos() const override {
     std::vector<int> input_pos = {0, 1};
     return input_pos;
   }
+
+ private:
+  bool IsTraining() const {
+    if (node_->attr().find("is_training") != node_->attr().end()) {
+      if (node_->attr().at("is_training").b()) {
+        return true;
+      }
+    }
+    return false;
+  }
 };
 
 class MaxPoolGradProcessor : public NodeProcessor {
  public:
-  MaxPoolGradProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                       bool is_in_frame)
-      : NodeProcessor(graph, node, node_map, is_in_frame) {}
+  explicit MaxPoolGradProcessor(const OptimizeContext& opt_cxt)
+      : NodeProcessor(opt_cxt) {}
 
  protected:
   std::vector<int> GetInputPos() const override {
@@ -651,42 +953,77 @@ class MaxPoolGradProcessor : public NodeProcessor {
 
 class AgnosticNodeProcessor : public NodeProcessor {
  public:
-  AgnosticNodeProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                        bool is_in_frame)
-      : NodeProcessor(graph, node, node_map, is_in_frame) {}
+  explicit AgnosticNodeProcessor(const OptimizeContext& opt_cxt)
+      : NodeProcessor(opt_cxt) {}
 
  protected:
   bool ShouldProcess() const override {
-    return IsDimsFour(*node_) && HasOutputs() && IsNodeAfterNCHWToNHWC();
+    return !MustPreserve() && IsPortZeroDimsFour(*node_) && HasOutputs() &&
+           IsNodeAfterNCHWToNHWC() && IsOnGPU();
   }
 
-  bool IsNodeAfterNCHWToNHWC() const {
+  bool IsNodeAfterNCHWToNHWC(const NodeDef& node) const {
     std::set<string> ops_format_agnostic = GetOpsFormatAgnostic();
-    auto node = node_map_->GetNode(node_->name());
-    while (node->input_size() > 0) {
-      int data_input_pos = 0;
-      if (node->op().compare("Concat") == 0) {
-        data_input_pos = 1;
-      }
-      node = node_map_->GetNode(node->input(data_input_pos));
-      if (IsNodeNCHWToNHWC(node->name())) {
+    std::deque<NodeDef*> queue;
+    auto data_node_pos = DataInputPos(node);
+    for (const auto& pos : data_node_pos) {
+      auto input_node = node_map_->GetNode(node.input(pos));
+      queue.push_back(input_node);
+    }
+    // The code will exit this while loop in one iteration in most cases, as the
+    // graph is already topologically sorted.
+    while (!queue.empty()) {
+      NodeDef* current_node = queue.front();
+      queue.pop_front();
+      if (IsNodeNCHWToNHWC(current_node->name())) {
         return true;
       }
-      bool connected =
-          ops_format_agnostic.find(node->op()) != ops_format_agnostic.end();
-      if (!connected) {
-        return false;
+      // We only continue searching if the path is connected through
+      // format-agnostic nodes.
+      if (ops_format_agnostic.find(current_node->op()) !=
+          ops_format_agnostic.end()) {
+        auto current_node_pos = DataInputPos(*current_node);
+        for (const auto& pos : current_node_pos) {
+          auto input_node = node_map_->GetNode(current_node->input(pos));
+          queue.push_back(input_node);
+        }
       }
     }
     return false;
   }
+
+  bool IsNodeAfterNCHWToNHWC() const { return IsNodeAfterNCHWToNHWC(*node_); }
+
+ private:
+  std::vector<int> DataInputPos(const NodeDef& node) const {
+    if (IsSplit(node)) {
+      return {1};
+    }
+    if (IsConcatV1(node)) {
+      return {1};
+    }
+    if (IsAdd(node) || IsMul(node) || IsRealDiv(node) ||
+        IsSquaredDifference(node) || IsSub(node)) {
+      return {0, 1};
+    }
+    if (IsShapeN(node)) {
+      std::vector<int> pos;
+      for (int i = 0; i < node.input_size(); i++) {
+        pos.push_back(i);
+      }
+      return pos;
+    }
+    if (node.input_size() > 0 && !IsControlInput(node.input(0))) {
+      return {0};
+    }
+    return {};
+  }
 };
 
 class AddNProcessor : public AgnosticNodeProcessor {
  public:
-  AddNProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                bool is_in_frame)
-      : AgnosticNodeProcessor(graph, node, node_map, is_in_frame) {}
+  explicit AddNProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {}
 
  protected:
   std::vector<int> GetInputPos() const override {
@@ -701,44 +1038,56 @@ class AddNProcessor : public AgnosticNodeProcessor {
 
 class BinaryOpProcessor : public AgnosticNodeProcessor {
  public:
-  BinaryOpProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                    bool is_in_frame)
-      : AgnosticNodeProcessor(graph, node, node_map, is_in_frame) {
-    is_4d_with_vector_ = Is4DOperateWithVector();
-  }
+  explicit BinaryOpProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {}
 
  protected:
   bool ShouldProcess() const override {
-    return IsDimsFour(*node_) && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
-           (Is4DOperateWithND(4) || Is4DOperateWithScalar() ||
-            Is4DOperateWithVector());
+    return !MustPreserve() && IsPortZeroDimsFour(*node_) && HasOutputs() &&
+           IsNodeAfterNCHWToNHWC() &&
+           (IsNDOperateWithMD(4, 0) || IsNDOperateWithMD(4, 1) ||
+            IsNDOperateWithMD(4, 4) || IsNDOperateWithMD(0, 4) ||
+            IsNDOperateWithMD(1, 4)) &&
+           IsOnGPU();
   }
 
   std::vector<int> GetInputPos() const override {
-    std::vector<int> input_pos = {0};
-    if (Is4DOperateWithND(4)) {
+    std::vector<int> input_pos;
+    auto input0 = node_map_->GetNode(node_->input(0));
+    auto input1 = node_map_->GetNode(node_->input(1));
+    int input0_port;
+    ParseNodeName(node_->input(0), &input0_port);
+    int input1_port;
+    ParseNodeName(node_->input(1), &input1_port);
+    if (IsPortDimsFour(*input0, input0_port)) {
+      input_pos.push_back(0);
+    }
+    if (IsPortDimsFour(*input1, input1_port)) {
       input_pos.push_back(1);
     }
     return input_pos;
   }
 
-  bool Is4DOperateWithND(int n) const {
+  bool IsNDOperateWithMD(int n, int m) const {
     auto input0 = node_map_->GetNode(node_->input(0));
     auto input1 = node_map_->GetNode(node_->input(1));
+    int input0_port;
+    ParseNodeName(node_->input(0), &input0_port);
+    int input1_port;
+    ParseNodeName(node_->input(1), &input1_port);
+
     if (input0 && input1) {
-      return (IsDimsFour(*input0) || IsNodeNCHWToNHWC(input0->name())) &&
-             ((n == 4)
-                  ? (IsDimsFour(*input1) || IsNodeNCHWToNHWC(input1->name()))
-                  : IsDimsN(*input1, n));
+      bool input0_is_n = (n == 4) ? IsPortDimsFour(*input0, input0_port)
+                                  : IsPortDimsN(*input0, input0_port, n);
+      bool input1_is_m = (m == 4) ? IsPortDimsFour(*input1, input1_port)
+                                  : IsPortDimsN(*input1, input1_port, m);
+      return input0_is_n && input1_is_m;
     }
     return false;
   }
 
-  bool Is4DOperateWithScalar() const { return Is4DOperateWithND(0); }
-
-  bool Is4DOperateWithVector() const { return Is4DOperateWithND(1); }
-
-  NodeDef* AddNodeShapeConst(const string& name, int num_channels) {
+  NodeDef* AddNodeShapeConst(const string& name, int num_channels,
+                             const string& depended_node) {
     NodeDef* node = graph_->add_node();
     node_map_->AddNode(name, node);
     node->set_name(name);
@@ -756,6 +1105,11 @@ class BinaryOpProcessor : public AgnosticNodeProcessor {
     }
     tensor.AsProtoTensorContent(attr_tensor.mutable_tensor());
     node->mutable_attr()->insert({"value", attr_tensor});
+    if (is_in_frame_) {
+      // This is to ensure the transpose node and the const node are in the
+      // same frame.
+      *node->add_input() = AsControlDependency(depended_node);
+    }
     return node;
   }
 
@@ -781,55 +1135,58 @@ class BinaryOpProcessor : public AgnosticNodeProcessor {
   }
 
   Status CustomizedProcessing() override {
-    if (is_4d_with_vector_) {
-      string base_name = strings::StrCat(node_->name(), "-", node_->input(1));
+    int vector_index = -1;
+    if (IsNDOperateWithMD(4, 1)) {
+      vector_index = 1;
+    } else if (IsNDOperateWithMD(1, 4)) {
+      vector_index = 0;
+    }
+    if (vector_index != -1) {
+      string base_name = strings::StrCat(node_->name(), "-", vector_index);
       string reshape_node_name =
           AddPrefixToNodeName(base_name, kReshapeNHWCToNCHW, "-");
       string shape_const_node_name =
           AddPrefixToNodeName(base_name, kReshapeConst, "-");
-      auto input_node = node_map_->GetNode(node_->input(1));
+      auto input_node = node_map_->GetNode(node_->input(vector_index));
       TF_RETURN_IF_ERROR(HasAttribute(*input_node, "_output_shapes"));
-      int vector_size =
-          input_node->attr().at("_output_shapes").list().shape(0).dim(0).size();
-      AddNodeShapeConst(shape_const_node_name, vector_size);
+      int port;
+      ParseNodeName(node_->input(vector_index), &port);
+      int vector_size = input_node->attr()
+                            .at("_output_shapes")
+                            .list()
+                            .shape(port)
+                            .dim(0)
+                            .size();
+      AddNodeShapeConst(shape_const_node_name, vector_size,
+                        NodeName(node_->input(vector_index)));
       TF_RETURN_IF_ERROR(HasAttribute(*node_, "T"));
-      AddNodeReshape(reshape_node_name, node_->input(1), shape_const_node_name,
-                     node_->attr().at("T").type());
+      AddNodeReshape(reshape_node_name, node_->input(vector_index),
+                     shape_const_node_name, node_->attr().at("T").type());
       node_map_->AddOutput(shape_const_node_name, reshape_node_name);
-      node_map_->UpdateOutput(node_->input(1), node_->name(),
+      node_map_->UpdateOutput(node_->input(vector_index), node_->name(),
                               reshape_node_name);
       node_map_->AddOutput(reshape_node_name, node_->name());
-      *node_->mutable_input(1) = reshape_node_name;
+      *node_->mutable_input(vector_index) = reshape_node_name;
     }
     return Status::OK();
   }
-
- private:
-  bool is_4d_with_vector_;
 };
 
 class ConcatProcessor : public AgnosticNodeProcessor {
  public:
-  ConcatProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                  bool is_in_frame)
-      : AgnosticNodeProcessor(graph, node, node_map, is_in_frame) {
+  explicit ConcatProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {
     // For Concat,  the concat axis is the first input; for ConcatV2,
     // the last input.
-    axis_node_pos_ =
-        (node_->op().compare("Concat") == 0) ? 0 : (node_->input_size() - 1);
+    axis_node_pos_ = (IsConcatV1(*node_)) ? 0 : (node_->input_size() - 1);
   }
 
  protected:
-  bool ShouldProcess() const override {
-    return IsDimsFour(*node_) && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
-           IsAlongDimC();
-  }
-
   std::vector<int> GetInputPos() const override {
     std::vector<int> input_pos;
-    int start = (node_->op().compare("Concat") == 0) ? 1 : 0;
-    int end = (node_->op().compare("Concat") == 0) ? node_->input_size()
-                                                   : (node_->input_size() - 1);
+    int start = (IsConcatV1(*node_)) ? 1 : 0;
+    int end =
+        (IsConcatV1(*node_)) ? node_->input_size() : (node_->input_size() - 1);
     for (int i = start; i < end; i++) {
       input_pos.push_back(i);
     }
@@ -837,85 +1194,99 @@ class ConcatProcessor : public AgnosticNodeProcessor {
   }
 
   Status CustomizedProcessing() override {
-    string concat_const_name = GetOrAddNodeConcatConst();
-    node_map_->AddOutput(concat_const_name, node_->name());
-    *node_->mutable_input(axis_node_pos_) = concat_const_name;
+    auto dim_node = node_map_->GetNode(node_->input(axis_node_pos_));
+    if (IsConstant(*dim_node)) {
+      TF_RETURN_IF_ERROR(UpdateAttrValueOfInput(axis_node_pos_));
+    } else {
+      DataType dtype =
+          (IsSplit(*node_)) ? DT_INT32 : node_->attr().at("Tidx").type();
+      AddDataFormatTranformToInput("DataFormatDimMap", axis_node_pos_, dtype);
+    }
     return Status::OK();
   }
+  int axis_node_pos_;
+};
 
-  bool IsAlongDimC() const {
-    auto axis_node = node_map_->GetNode(node_->input(axis_node_pos_));
-    if (axis_node->attr().find("value") != axis_node->attr().end()) {
-      return axis_node->attr().at("value").tensor().int_val(0) == 3;
-    }
-    return false;
+class MergeProcessor : public AgnosticNodeProcessor {
+ public:
+  explicit MergeProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {}
+
+ protected:
+  bool ShouldProcess() const override {
+    return !MustPreserve() && IsPortZeroDimsFour(*node_) && HasOutputs() &&
+           IsEveryInputAfterNCHWToNHWC() && IsOnGPU();
   }
 
-  int axis_node_pos_;
+  std::vector<int> GetInputPos() const override {
+    std::vector<int> input_pos;
+    input_pos.reserve(node_->input_size());
+    for (int i = 0; i < node_->input_size(); i++) {
+      input_pos.push_back(i);
+    }
+    return input_pos;
+  }
 
  private:
-  NodeDef* AddNodeConcatConst(const string& suffix, const string& depended_node,
-                              const string& device) {
-    auto const_node = AddNodeConstScalar(
-        strings::StrCat(kConcatConst, "-", suffix), device, DT_INT32, 1);
-    // This is to ensure the concat node and the const node are
-    // in the same frame.
-    *const_node->add_input() = AsControlDependency(depended_node);
-    return const_node;
+  bool IsEveryInputAfterNCHWToNHWC() const {
+    for (const auto& input : node_->input()) {
+      auto input_node = node_map_->GetNode(input);
+      if (IsNodeAfterNCHWToNHWC(*input_node) ||
+          IsNodeNCHWToNHWC(input_node->name())) {
+        continue;
+      }
+      return false;
+    }
+    return true;
   }
+};
 
-  string GetOrAddNodeConcatConst() {
-    string const_name;
-    if (is_in_frame_) {
-      int value_node_pos = (axis_node_pos_ == 0) ? 1 : 0;
-      auto const_node = AddNodeConcatConst(
-          node_->name(), NodeName(node_->input(value_node_pos)),
-          node_->device());
-      const_name = const_node->name();
+class PadProcessor : public AgnosticNodeProcessor {
+ public:
+  explicit PadProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {}
+
+ protected:
+  Status CustomizedProcessing() override {
+    auto index_node = node_map_->GetNode(node_->input(1));
+    if (IsConstant(*index_node)) {
+      TF_RETURN_IF_ERROR(UpdateAttrValueOfInput(1));
     } else {
-      const_name = kConcatConst;
+      DataType dtype = node_->attr().at("Tpaddings").type();
+      AddDataFormatTranformToInput("DataFormatVecPermute", 1, dtype);
     }
-    return const_name;
+    return Status::OK();
   }
 };
 
-class PadProcessor : public AgnosticNodeProcessor {
+class SplitProcessor : public ConcatProcessor {
  public:
-  PadProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-               bool is_in_frame)
-      : AgnosticNodeProcessor(graph, node, node_map, is_in_frame) {}
+  explicit SplitProcessor(const OptimizeContext& opt_cxt)
+      : ConcatProcessor(opt_cxt) {
+    axis_node_pos_ = 0;
+  }
 
  protected:
-  bool ShouldProcess() const override {
-    return IsDimsFour(*node_) && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
-           PaddingSupported();
+  std::vector<int> GetInputPos() const override {
+    std::vector<int> input_pos = {1};
+    return input_pos;
   }
-  Status CustomizedProcessing() override { return UpdateAttrValueOfInput(1); }
 
- private:
-  bool PaddingSupported() const {
-    auto pad_const = node_map_->GetNode(node_->input(1));
-    bool is_const = IsConstant(*pad_const);
-    bool is_4D = false;
-    if (HasAttribute(*pad_const, "value").ok()) {
-      Tensor tensor;
-      if (tensor.FromProto(pad_const->mutable_attr()->at({"value"}).tensor())) {
-        if (tensor.dims() == 2) {
-          if (tensor.dim_size(0) == 4 && tensor.dim_size(1) == 2) {
-            is_4D = true;
-          }
-        }
+  std::set<int> GetOutputPos() const override {
+    std::set<int> output_pos{0};
+    if (HasAttribute(*node_, "num_split").ok()) {
+      for (int i = 1; i < node_->attr().at("num_split").i(); i++) {
+        output_pos.insert(i);
       }
     }
-    return is_const && is_4D;
+    return output_pos;
   }
 };
 
-class ReluGradProcessor : public AgnosticNodeProcessor {
+class UnaryGradProcessor : public AgnosticNodeProcessor {
  public:
-  ReluGradProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                    bool is_in_frame)
-      : AgnosticNodeProcessor(graph, node, node_map, is_in_frame) {}
+  explicit UnaryGradProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {}
 
  protected:
   std::vector<int> GetInputPos() const override {
@@ -924,172 +1295,61 @@ class ReluGradProcessor : public AgnosticNodeProcessor {
   }
 };
 
-class SliceProcessor : public AgnosticNodeProcessor {
+class ShapeProcessor : public AgnosticNodeProcessor {
  public:
-  SliceProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                 bool is_in_frame)
-      : AgnosticNodeProcessor(graph, node, node_map, is_in_frame) {}
+  explicit ShapeProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {}
 
  protected:
-  Status CustomizedProcessing() override {
-    // Skip the first input, which is the data to be sliced.
-    for (int i = 1; i < node_->input_size(); i++) {
-      string base_name = strings::StrCat(node_->name(), "-input", i);
-      string node_name =
-          AddPrefixToNodeName(base_name, kPermVecNHWCToNCHW, "-");
-      TF_RETURN_IF_ERROR(HasAttribute(*node_, "Index"));
-      AddNodePermVec(node_name, node_->input(i),
-                     node_->attr().at("Index").type(), true);
-      node_map_->UpdateOutput(node_->input(i), node_->name(), node_name);
-      node_map_->AddOutput(node_name, node_->name());
-      *node_->mutable_input(i) = node_name;
-    }
-    return Status::OK();
-  }
-
- private:
-  NodeDef* AddNodeGatherAxisConst(const string& suffix,
-                                  const string& depended_node,
-                                  const string& device) {
-    auto const_node = AddNodeConstScalar(
-        strings::StrCat(kGatherAxisConst, "-", suffix), device, DT_INT32, 0);
-    // This is to ensure the Slice node and the const node are
-    // in the same frame.
-    *const_node->add_input() = AsControlDependency(depended_node);
-    return const_node;
-  }
-
-  string GetOrAddNodeGatherAxisConst() {
-    string const_name;
-    if (is_in_frame_) {
-      auto const_node = AddNodeGatherAxisConst(
-          node_->name(), NodeName(node_->input(0)), node_->device());
-      const_name = const_node->name();
-    } else {
-      const_name = kGatherAxisConst;
-    }
-    return const_name;
+  bool ShouldProcess() const override {
+    return !MustPreserve() && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
+           IsOnGPU();
   }
 
-  string GetOrAddNodePermNHWCToNCHW() {
-    string const_name;
-    if (is_in_frame_) {
-      auto const_node = AddNodePermNHWCToNCHW(
-          node_->name(), NodeName(node_->input(0)), node_->device());
-      const_name = const_node->name();
-    } else {
-      const_name = kPermNHWCToNCHW;
+  std::vector<int> GetInputPos() const override {
+    std::vector<int> input_pos;
+    for (int i = 0; i < node_->input_size(); i++) {
+      auto input = node_map_->GetNode(node_->input(i));
+      int port;
+      ParseNodeName(node_->input(i), &port);
+      if (IsPortDimsFour(*input, port) &&
+          (IsNodeAfterNCHWToNHWC(*input) || IsNodeNCHWToNHWC(input->name()))) {
+        input_pos.push_back(i);
+      }
     }
-    return const_name;
+    return input_pos;
   }
 
-  string GetOrAddNodePermNCHWToNHWC() {
-    string const_name;
-    if (is_in_frame_) {
-      auto const_node = AddNodePermNCHWToNHWC(
-          node_->name(), NodeName(node_->input(0)), node_->device());
-      const_name = const_node->name();
-    } else {
-      const_name = kPermNCHWToNHWC;
+  std::set<int> GetOutputPos() const override {
+    std::set<int> output_pos{};
+    for (const auto& input_pos : GetInputPos()) {
+      output_pos.insert(input_pos);
     }
-    return const_name;
+    return output_pos;
   }
 
-  void AddNodePermVec(const string& node_name, const string& input_name,
-                      DataType data_type, bool NHWCToNCHW) {
-    NodeDef* node = graph_->add_node();
-    node_map_->AddNode(node_name, node);
-    node->set_name(node_name);
-    *node->add_input() = input_name;
-    *node->add_input() = NHWCToNCHW ? GetOrAddNodePermNHWCToNCHW()
-                                    : GetOrAddNodePermNCHWToNHWC();
-    *node->add_input() = GetOrAddNodeGatherAxisConst();
-    node->set_op("GatherV2");
-
-    AttrValue attr_type_indices;
-    attr_type_indices.set_type(DT_INT32);
-    node->mutable_attr()->insert({"Tindices", attr_type_indices});
-
-    AttrValue attr_type_axis;
-    attr_type_axis.set_type(DT_INT32);
-    node->mutable_attr()->insert({"Taxis", attr_type_axis});
-
-    AttrValue attr_type_params;
-    attr_type_params.set_type(data_type);
-    node->mutable_attr()->insert({"Tparams", attr_type_params});
+  Status AddLayoutTransposeToOutputs() override { return Status::OK(); }
 
-    AttrValue attr_validate;
-    attr_validate.set_b(true);
-    node->mutable_attr()->insert({"validate_indices", attr_validate});
+  Status CustomizedProcessing() override {
+    return AddTransformToOutputs("DataFormatVecPermute");
   }
 };
 
-// Specialized SliceProcessor, used if the second and third input are const
-// nodes, which could be the case if a constant folding pass is applied
-// before this optimization.
-class SliceProcessorConst : public AgnosticNodeProcessor {
+class SliceProcessor : public AgnosticNodeProcessor {
  public:
-  SliceProcessorConst(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                      bool is_in_frame)
-      : AgnosticNodeProcessor(graph, node, node_map, is_in_frame) {}
+  explicit SliceProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {}
 
  protected:
   Status CustomizedProcessing() override {
     // Skip the first input, which is the data to be sliced.
     for (int i = 1; i < node_->input_size(); i++) {
-      TF_RETURN_IF_ERROR(UpdateAttrValueOfInput(i));
-    }
-    return Status::OK();
-  }
-};
-
-// Specialized SliceProcessor, used if the second input is ConcatOffset. An
-// example use case is in the gradient computation of Concat for InceptionV3.
-class SliceProcessorConcatOffset : public AgnosticNodeProcessor {
- public:
-  SliceProcessorConcatOffset(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                             bool is_in_frame)
-      : AgnosticNodeProcessor(graph, node, node_map, is_in_frame) {}
-
- protected:
-  Status CustomizedProcessing() override {
-    auto maybe_concatoffset_node =
-        node_map_->GetNode(NodeName(node_->input(1)));
-    if (maybe_concatoffset_node->op() == "ConcatOffset") {
-      auto maybe_axis_node =
-          node_map_->GetNode(maybe_concatoffset_node->input(0));
-      NodeDef* axis_node;
-      if (maybe_axis_node->op() == "Const") {
-        axis_node = maybe_axis_node;
-        // A FloorMod node might be added between ConcatOffset and the concat
-        // dimension const node to handle a negative dimension index -1, meaning
-        // the last dimension, which is consistent with the python's notation
-        // for negative index.
-      } else if (maybe_axis_node->op() == "FloorMod") {
-        axis_node = node_map_->GetNode(maybe_axis_node->input(0));
+      auto index_node = node_map_->GetNode(node_->input(i));
+      if (IsConstant(*index_node)) {
+        TF_RETURN_IF_ERROR(UpdateAttrValueOfInput(i));
       } else {
-        return Status(error::INVALID_ARGUMENT,
-                      strings::StrCat("Expect either Const or FloorMod for the "
-                                      "input 1 of ConcatOffset"));
-      }
-      // Need to process if the channel is at dimension 3, which indicates the
-      // NHWC format is being used. As multiple Slice nodes may share the same
-      // ConcatOffset node, the NHWC to NCHW conversion may have already
-      // been performed when processing other Slice nodes.
-      TF_RETURN_IF_ERROR(HasAttribute(*axis_node, "value"));
-      int concat_dim = axis_node->attr().at("value").tensor().int_val(0);
-      if (concat_dim == -1 || concat_dim == 3) {
-        // Update the dimension order for shape input nodes. Note that the input
-        // 2 of Slice also shares one of the shape nodes.
-        for (int i = 1; i < maybe_concatoffset_node->input_size(); i++) {
-          auto shape_node =
-              node_map_->GetNode(maybe_concatoffset_node->input(i));
-          TF_RETURN_IF_ERROR(UpdateAttrValue(shape_node));
-        }
-        // Set the channel dimension to 1, as we have converted the vector
-        // element order from NHWC to NCHW.
-        axis_node->mutable_attr()->at("value").mutable_tensor()->set_int_val(0,
-                                                                             1);
+        AddDataFormatTranformToInput("DataFormatVecPermute", i,
+                                     node_->attr().at("Index").type());
       }
     }
     return Status::OK();
@@ -1098,14 +1358,14 @@ class SliceProcessorConcatOffset : public AgnosticNodeProcessor {
 
 class SqueezeProcessor : public AgnosticNodeProcessor {
  public:
-  SqueezeProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                   bool is_in_frame)
-      : AgnosticNodeProcessor(graph, node, node_map, is_in_frame) {}
+  explicit SqueezeProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {}
 
  protected:
   bool ShouldProcess() const override {
-    return IsDimsN(*node_, 2) && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
-           IsInputConvertible() && IsAlongDimHW();
+    return !MustPreserve() && IsPortZeroDimsN(*node_, 2) && HasOutputs() &&
+           IsNodeAfterNCHWToNHWC() && IsInputConvertible() && IsAlongDimHW() &&
+           IsOnGPU();
   }
 
   Status AddLayoutTransposeToOutputs() override { return Status::OK(); }
@@ -1148,34 +1408,35 @@ class SqueezeProcessor : public AgnosticNodeProcessor {
 
 class SumProcessor : public AgnosticNodeProcessor {
  public:
-  SumProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-               bool is_in_frame)
-      : AgnosticNodeProcessor(graph, node, node_map, is_in_frame) {}
+  explicit SumProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {}
 
  protected:
   bool ShouldProcess() const override {
     auto input0 = node_map_->GetNode(node_->input(0));
-    return HasOutputs() && IsNodeAfterNCHWToNHWC() &&
-           (IsDimsFour(*input0) || IsNodeNCHWToNHWC(input0->name())) &&
-           IsAlongDimNHW();
+    int port;
+    ParseNodeName(node_->input(0), &port);
+    return !MustPreserve() && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
+           IsPortDimsFour(*input0, port) && IsAlongDimNHW() && IsOnGPU();
   }
 
   Status AddLayoutTransposeToOutputs() override { return Status::OK(); }
 
-  Status CustomizedProcessing() override {
-    node_map_->AddOutput(kReductionConst, node_->name());
-    *node_->mutable_input(1) = GetOrAddNodeReductionConst();
-    return Status::OK();
-  }
+  Status CustomizedProcessing() override { return UpdateAttrValueOfInput(1); }
 
  private:
   bool IsAlongDimNHW() const {
-    NodeDef* node = node_map_->GetNode(node_->input(1));
+    NodeDef* reduction_indices = node_map_->GetNode(node_->input(1));
+    if (!IsConstant(*reduction_indices)) {
+      return false;
+    }
     Tensor tensor;
-    if (node->attr().find({"value"}) == node->attr().end()) {
+    if (reduction_indices->attr().find({"value"}) ==
+        reduction_indices->attr().end()) {
       return false;
     }
-    auto success = tensor.FromProto(node->attr().at({"value"}).tensor());
+    auto success =
+        tensor.FromProto(reduction_indices->attr().at({"value"}).tensor());
     if (!success) {
       LOG(ERROR) << "Failed to parse TensorProto.";
       return false;
@@ -1189,69 +1450,43 @@ class SumProcessor : public AgnosticNodeProcessor {
     }
     return false;
   }
+};
 
-  NodeDef* AddNodeReductionConst(const string& suffix,
-                                 const string& depended_node,
-                                 const string& device) {
-    auto const_node = GraphProcessor::AddNodeReductionConst(
-        strings::StrCat(kReductionConst, "-", suffix), device);
-    // This is to ensure the Sum node and the const node are in the
-    // same frame.
-    *const_node->add_input() = AsControlDependency(depended_node);
-    return const_node;
-  }
+class SwitchProcessor : public AgnosticNodeProcessor {
+ public:
+  explicit SwitchProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {}
 
-  string GetOrAddNodeReductionConst() {
-    string const_name;
-    if (is_in_frame_) {
-      auto const_node = AddNodeReductionConst(
-          node_->name(), NodeName(node_->input(0)), node_->device());
-      const_name = const_node->name();
-    } else {
-      const_name = kReductionConst;
-    }
-    return const_name;
-  }
+ protected:
+  std::set<int> GetOutputPos() const override { return {0, 1}; }
 };
 
 class DataLayoutOptimizer : GraphProcessor {
  public:
-  explicit DataLayoutOptimizer(const string& default_device, GraphDef* graph,
-                               NodeMap* node_map,
-                               LayoutOptimizer::TuningConfig config)
-      : GraphProcessor(graph, node_map),
-        default_device_(default_device),
+  explicit DataLayoutOptimizer(
+      const VirtualPlacer& virtual_placer,
+      const LayoutOptimizer::TuningConfig& config,
+      const std::unordered_set<string>& nodes_to_preserve, GraphDef* graph,
+      NodeMap* node_map)
+      : GraphProcessor(virtual_placer, nodes_to_preserve, graph, node_map),
         config_(config) {}
 
   Status Optimize() {
-    LOG(INFO) << "Number of nodes for original graph: " << graph_->node_size();
+    VLOG(1) << "Number of nodes for original graph: " << graph_->node_size();
     TF_RETURN_IF_ERROR(Expand());
-    LOG(INFO) << "Number of nodes after Expand: " << graph_->node_size();
+    VLOG(1) << "Number of nodes after Expand: " << graph_->node_size();
     TF_RETURN_IF_ERROR(Collapse());
-    LOG(INFO) << "Number of nodes after Collapse: " << graph_->node_size();
+    VLOG(1) << "Number of nodes after Collapse: " << graph_->node_size();
     return Status::OK();
   }
 
  private:
   NodeDef* AddNodePermNHWCToNCHW() {
-    return AddNodePermConst(kPermNHWCToNCHW, default_device_, {0, 3, 1, 2});
+    return AddNodePermConst(kPermNHWCToNCHW, "", {0, 3, 1, 2});
   }
 
   NodeDef* AddNodePermNCHWToNHWC() {
-    return AddNodePermConst(kPermNCHWToNHWC, default_device_, {0, 2, 3, 1});
-  }
-
-  NodeDef* AddNodeConcatConst() {
-    return AddNodeConstScalar(kConcatConst, default_device_, DT_INT32, 1);
-  }
-
-  NodeDef* AddNodeGatherAxisConst() {
-    return AddNodeConstScalar(kGatherAxisConst, default_device_, DT_INT32, 0);
-  }
-
-  NodeDef* AddNodeReductionConst() {
-    return GraphProcessor::AddNodeReductionConst(kReductionConst,
-                                                 default_device_);
+    return AddNodePermConst(kPermNCHWToNHWC, "", {0, 2, 3, 1});
   }
 
   // Expand all nodes which is in NHWC, but supports NCHW or is layout agnostic.
@@ -1264,35 +1499,42 @@ class DataLayoutOptimizer : GraphProcessor {
     // This is the first pass where we expand the nodes which support NCHW.
     std::set<string> ops_format_supported = GetOpsFormatSupported();
     for (int i = 0; i < node_size_original; i++) {
+      if (IsNodeByLayoutOptimizer(graph_->node(i).name())) {
+        return Status(error::INVALID_ARGUMENT,
+                      "The graph is already optimized by layout optimizer.");
+      }
       if (ops_format_supported.find(graph_->node(i).op()) !=
           ops_format_supported.end()) {
         auto node = graph_->mutable_node(i);
         bool is_in_frame = !frames[node].empty();
+        OptimizeContext opt_cxt(graph_, node, node_map_, virtual_placer_,
+                                nodes_to_preserve_, is_in_frame);
         std::unique_ptr<NodeProcessor> node_processor;
-        if (node->op().compare("AvgPoolGrad") == 0) {
+        if (IsAvgPoolGrad(*node)) {
+          node_processor.reset(new AvgPoolGradProcessor(opt_cxt));
+        } else if (IsBiasAddGrad(*node)) {
+          node_processor.reset(new BiasAddGradProcessor(opt_cxt));
+        } else if (IsConv2D(*node)) {
+          node_processor.reset(new Conv2DProcessor(opt_cxt, config_.no_gemm));
+        } else if (IsConv2DBackpropFilter(*node)) {
           node_processor.reset(
-              new AvgPoolGradProcessor(graph_, node, node_map_, is_in_frame));
-        } else if (node->op().compare("BiasAddGrad") == 0) {
+              new Conv2DBackpropFilterProcessor(opt_cxt, config_.no_gemm));
+        } else if (IsConv2DBackpropInput(*node)) {
           node_processor.reset(
-              new BiasAddGradProcessor(graph_, node, node_map_, is_in_frame));
-        } else if (node->op().compare("Conv2D") == 0) {
-          node_processor.reset(new Conv2DProcessor(
-              graph_, node, node_map_, config_.no_gemm, is_in_frame));
-        } else if (node->op().compare("Conv2DBackpropFilter") == 0) {
-          node_processor.reset(new Conv2DBackpropFilterProcessor(
-              graph_, node, node_map_, config_.no_gemm, is_in_frame));
-        } else if (node->op().compare("Conv2DBackpropInput") == 0) {
-          node_processor.reset(new Conv2DBackpropInputProcessor(
-              graph_, node, node_map_, config_.no_gemm, is_in_frame));
-        } else if (node->op().compare("FusedBatchNormGrad") == 0) {
-          node_processor.reset(new FusedBatchNormGradProcessor(
-              graph_, node, node_map_, is_in_frame));
-        } else if (node->op().compare("MaxPoolGrad") == 0) {
+              new Conv2DBackpropInputProcessor(opt_cxt, config_.no_gemm));
+        } else if (IsDepthwiseConv2dNative(*node)) {
+          node_processor.reset(new Conv2DProcessor(opt_cxt, true));
+        } else if (IsDepthwiseConv2dNativeBackpropFilter(*node)) {
           node_processor.reset(
-              new MaxPoolGradProcessor(graph_, node, node_map_, is_in_frame));
+              new Conv2DBackpropFilterProcessor(opt_cxt, true));
+        } else if (IsDepthwiseConv2dNativeBackpropInput(*node)) {
+          node_processor.reset(new Conv2DBackpropInputProcessor(opt_cxt, true));
+        } else if (IsFusedBatchNormGradV1(*node)) {
+          node_processor.reset(new FusedBatchNormGradProcessor(opt_cxt));
+        } else if (IsMaxPoolGradV1(*node)) {
+          node_processor.reset(new MaxPoolGradProcessor(opt_cxt));
         } else {
-          node_processor.reset(
-              new NodeProcessor(graph_, node, node_map_, is_in_frame));
+          node_processor.reset(new NodeProcessor(opt_cxt));
         }
         TF_RETURN_IF_ERROR(node_processor->ConvertNode());
       }
@@ -1304,58 +1546,42 @@ class DataLayoutOptimizer : GraphProcessor {
     if (graph_->node_size() > node_size_original) {
       NodeDef* n = AddNodePermNHWCToNCHW();
       n = AddNodePermNCHWToNHWC();
-      n = AddNodeConcatConst();
-      n = AddNodeGatherAxisConst();
-      n = AddNodeReductionConst();
       std::set<string> ops_format_agnostic = GetOpsFormatAgnostic();
       for (int i = 0; i < graph_->node_size(); i++) {
         if (ops_format_agnostic.find(graph_->node(i).op()) !=
             ops_format_agnostic.end()) {
           auto node = graph_->mutable_node(i);
           bool is_in_frame = !frames[node].empty();
+          OptimizeContext opt_cxt(graph_, node, node_map_, virtual_placer_,
+                                  nodes_to_preserve_, is_in_frame);
           std::unique_ptr<NodeProcessor> node_processor;
-          if (node->op().compare("AddN") == 0) {
-            node_processor.reset(
-                new AddNProcessor(graph_, node, node_map_, is_in_frame));
-          } else if (node->op().compare("Add") == 0 ||
-                     node->op().compare("Mul") == 0 ||
-                     node->op().compare("RealDiv") == 0 ||
-                     node->op().compare("SquaredDifference") == 0 ||
-                     node->op().compare("Sub") == 0) {
-            node_processor.reset(
-                new BinaryOpProcessor(graph_, node, node_map_, is_in_frame));
-          } else if (node->op().compare("Concat") == 0 ||
-                     node->op().compare("ConcatV2") == 0) {
-            node_processor.reset(
-                new ConcatProcessor(graph_, node, node_map_, is_in_frame));
-          } else if (node->op().compare("Pad") == 0) {
-            node_processor.reset(
-                new PadProcessor(graph_, node, node_map_, is_in_frame));
-          } else if (node->op().compare("ReluGrad") == 0) {
-            node_processor.reset(
-                new ReluGradProcessor(graph_, node, node_map_, is_in_frame));
-          } else if (node->op().compare("Slice") == 0) {
-            auto input1 = node_map_->GetNode(NodeName(node->input(1)));
-            auto input2 = node_map_->GetNode(NodeName(node->input(2)));
-            if (input1->op() == "ConcatOffset") {
-              node_processor.reset(new SliceProcessorConcatOffset(
-                  graph_, node, node_map_, is_in_frame));
-            } else if (input1->op() == "Const" && input2->op() == "Const") {
-              node_processor.reset(new SliceProcessorConst(
-                  graph_, node, node_map_, is_in_frame));
-            } else {
-              node_processor.reset(
-                  new SliceProcessor(graph_, node, node_map_, is_in_frame));
-            }
-          } else if (node->op().compare("Squeeze") == 0) {
-            node_processor.reset(
-                new SqueezeProcessor(graph_, node, node_map_, is_in_frame));
-          } else if (node->op().compare("Sum") == 0) {
-            node_processor.reset(
-                new SumProcessor(graph_, node, node_map_, is_in_frame));
+          if (IsAddN(*node)) {
+            node_processor.reset(new AddNProcessor(opt_cxt));
+          } else if (IsAdd(*node) || IsMul(*node) || IsRealDiv(*node) ||
+                     IsSquaredDifference(*node) || IsSub(*node)) {
+            node_processor.reset(new BinaryOpProcessor(opt_cxt));
+          } else if (IsConcat(*node)) {
+            node_processor.reset(new ConcatProcessor(opt_cxt));
+          } else if (IsMerge(*node)) {
+            node_processor.reset(new MergeProcessor(opt_cxt));
+          } else if (IsPad(*node)) {
+            node_processor.reset(new PadProcessor(opt_cxt));
+          } else if (IsSlice(*node)) {
+            node_processor.reset(new SliceProcessor(opt_cxt));
+          } else if (IsShape(*node) || IsShapeN(*node)) {
+            node_processor.reset(new ShapeProcessor(opt_cxt));
+          } else if (IsSplit(*node)) {
+            node_processor.reset(new SplitProcessor(opt_cxt));
+          } else if (IsSqueeze(*node)) {
+            node_processor.reset(new SqueezeProcessor(opt_cxt));
+          } else if (IsSum(*node)) {
+            node_processor.reset(new SumProcessor(opt_cxt));
+          } else if (IsSwitch(*node)) {
+            node_processor.reset(new SwitchProcessor(opt_cxt));
+          } else if (IsUnaryGrad(*node)) {
+            node_processor.reset(new UnaryGradProcessor(opt_cxt));
           } else {
-            node_processor.reset(new AgnosticNodeProcessor(
-                graph_, node, node_map_, is_in_frame));
+            node_processor.reset(new AgnosticNodeProcessor(opt_cxt));
           }
           TF_RETURN_IF_ERROR(node_processor->ConvertNode());
         }
@@ -1402,8 +1628,7 @@ class DataLayoutOptimizer : GraphProcessor {
     return Status::OK();
   }
 
-  string default_device_;
-  LayoutOptimizer::TuningConfig config_;
+  const LayoutOptimizer::TuningConfig& config_;
 };
 
 int GetNumTranspose(const GraphDef& graph) {
@@ -1413,13 +1638,31 @@ int GetNumTranspose(const GraphDef& graph) {
       number++;
     }
   }
-  LOG(INFO) << "Number of Transpose nodes: " << number;
+  VLOG(1) << "Number of Transpose nodes: " << number;
   return number;
 }
 
+int GetNumGPUs(const Cluster& cluster) {
+  auto devices = cluster.GetDevices();
+  int num_gpus = 0;
+  for (const auto& device : devices) {
+    if (device.second.type() == "GPU") {
+      if (device.second.environment().find("architecture") !=
+          device.second.environment().end()) {
+        const string arch = device.second.environment().at("architecture");
+        // TODO(yaozhang): Enable for Volta GPUs (compute capability version 7).
+        if (arch < "7") {
+          num_gpus++;
+        }
+      }
+    }
+  }
+  return num_gpus;
+}
+}  // namespace
+
 Status LayoutOptimizer::Tune(const GrapplerItem& item,
                              const GraphProperties& graph_properties,
-                             const string& default_device,
                              const TuningConfig& config, GraphDef* output) {
   auto status = graph_properties.AnnotateOutputShapes(output);
   if (!status.ok()) {
@@ -1427,52 +1670,37 @@ Status LayoutOptimizer::Tune(const GrapplerItem& item,
     return status;
   }
   NodeMap node_map(output);
-  DataLayoutOptimizer layout_optimizer(default_device, output, &node_map,
-                                       config);
+  DataLayoutOptimizer layout_optimizer(*virtual_placer_, config,
+                                       nodes_to_preserve_, output, &node_map);
   status = layout_optimizer.Optimize();
   return status;
 }
 
 Status LayoutOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                  GraphDef* output) {
-  if (num_gpus_ == 0) {
-    num_gpus_ = GetNumAvailableGPUs();
-  }
-  if (num_gpus_ < 1) {
+  if (GetNumGPUs(*cluster) < 1) {
     // LayoutOptimizer is currently only tuned for GPU.
     *output = item.graph;
     return Status::OK();
   }
 
+  virtual_placer_.reset(new VirtualPlacer(cluster));
+  nodes_to_preserve_ = item.NodesToPreserve();
   GraphProperties graph_properties(item);
-  auto status = graph_properties.InferStatically();
+  auto status = graph_properties.InferStatically(false);
   if (!status.ok()) {
     *output = item.graph;
     return status;
   }
 
   TuningConfig config;
-  config.no_gemm = false;
-  string default_device = "/job:localhost/replica:0/task:0/cpu:0";
-  if (cluster) {
-    if (!cluster->GetDevices().empty()) {
-      default_device = cluster->GetDevices().begin()->first;
-    }
-  }
-
-  status = Tune(item, graph_properties, default_device, config, output);
-  // This is based on an empirical observation that if the introduced Transpose
-  // nodes is more than 30, not using GEMM implementation would result in better
-  // performance.
-  if (status.ok() && GetNumTranspose(*output) > 30) {
-    config.no_gemm = true;
-    status = Tune(item, graph_properties, default_device, config, output);
-  }
-
+  config.no_gemm = true;
+  // TODO(yaozhang): Enable tuning with various TuningConfig choices wtih
+  // the measurement-based estimator.
+  status = Tune(item, graph_properties, config, output);
   if (!status.ok()) {
     *output = item.graph;
   }
-
   return status;
 }
 
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.h b/tensorflow/core/grappler/optimizers/layout_optimizer.h
index 621c286976a8112538cf2923103ee72ccee2752c..357205828ddea3f35a6dd202606a5b59d8baa5a5 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_GRAPPLER_OPTIMIZERS_LAYOUT_OPTIMIZER_H_
 
 #include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/grappler/costs/virtual_placer.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 
 namespace tensorflow {
@@ -29,9 +30,6 @@ class LayoutOptimizer : public GraphOptimizer {
 
   string name() const override { return "layout"; };
 
-  // This is for testing only.
-  void set_num_gpus(int num_gpus) { num_gpus_ = num_gpus; };
-
   struct TuningConfig {
     // If true, do not use the NHWC GEMM implementation. When filter size is
     // one or filter size is equal to input image size,
@@ -50,10 +48,10 @@ class LayoutOptimizer : public GraphOptimizer {
                 const GraphDef& optimize_output, double result) override;
 
  private:
-  int num_gpus_ = 0;
+  std::unique_ptr<VirtualPlacer> virtual_placer_;
+  std::unordered_set<string> nodes_to_preserve_;
   Status Tune(const GrapplerItem& item, const GraphProperties& graph_properties,
-              const string& default_device, const TuningConfig& config,
-              GraphDef* output);
+              const TuningConfig& config, GraphDef* output);
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
index b760cf2ff2b3fba88817659708e986323ee0b7ca..98109f724ee4fa7a02f9cdc751933c50e7afd5de 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
@@ -17,10 +17,12 @@ limitations under the License.
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/device_properties.pb.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -28,9 +30,21 @@ namespace {
 
 class LayoutOptimizerTest : public ::testing::Test {
  protected:
+  void SetUp() override {
+    DeviceProperties device_properties;
+    device_properties.set_type("GPU");
+    device_properties.mutable_environment()->insert({"architecture", "6"});
+    virtual_cluster_.reset(new VirtualCluster({{"/GPU:0", device_properties}}));
+  }
+
   Output SimpleConv2D(tensorflow::Scope* s, int input_size, int filter_size,
                       const string& padding) {
-    int batch_size = 128;
+    return SimpleConv2D(s, input_size, filter_size, padding, "");
+  }
+
+  Output SimpleConv2D(tensorflow::Scope* s, int input_size, int filter_size,
+                      const string& padding, const string& device) {
+    int batch_size = 8;
     int input_height = input_size;
     int input_width = input_size;
     int input_depth = 3;
@@ -50,13 +64,19 @@ class LayoutOptimizerTest : public ::testing::Test {
     Output filter =
         ops::Const(s->WithOpName("Filter"), Input::Initializer(filter_data));
 
-    Output conv = ops::Conv2D(s->WithOpName("Conv2D"), input, filter,
-                              {1, stride, stride, 1}, padding);
+    Output conv = ops::Conv2D(s->WithOpName("Conv2D").WithDevice(device), input,
+                              filter, {1, stride, stride, 1}, padding);
     return conv;
   }
 
   Output SimpleConv2DBackpropInput(tensorflow::Scope* s, int input_size,
                                    int filter_size, const string& padding) {
+    return SimpleConv2DBackpropInput(s, input_size, filter_size, padding, true);
+  }
+
+  Output SimpleConv2DBackpropInput(tensorflow::Scope* s, int input_size,
+                                   int filter_size, const string& padding,
+                                   bool const_input_size) {
     int batch_size = 128;
     int input_height = input_size;
     int input_width = input_size;
@@ -86,11 +106,18 @@ class LayoutOptimizerTest : public ::testing::Test {
     Output output =
         ops::Const(s->WithOpName("Output"), Input::Initializer(output_data));
 
-    Output conv_backprop_input = ops::Conv2DBackpropInput(
-        s->WithOpName("Conv2DBackpropInput"), input_sizes, filter, output,
-        {1, stride, stride, 1}, padding);
-    TensorShape input_shape(
-        {batch_size, input_height, input_width, input_depth});
+    Output conv_backprop_input;
+    Output input_sizes_i =
+        ops::Identity(s->WithOpName("InputSizesIdentity"), input_sizes);
+    if (const_input_size) {
+      conv_backprop_input = ops::Conv2DBackpropInput(
+          s->WithOpName("Conv2DBackpropInput"), input_sizes, filter, output,
+          {1, stride, stride, 1}, padding);
+    } else {
+      conv_backprop_input = ops::Conv2DBackpropInput(
+          s->WithOpName("Conv2DBackpropInput"), input_sizes_i, filter, output,
+          {1, stride, stride, 1}, padding);
+    }
     return conv_backprop_input;
   }
 
@@ -99,6 +126,38 @@ class LayoutOptimizerTest : public ::testing::Test {
     CHECK(tensor.FromProto(node.attr().at({"value"}).tensor()));
     return tensor;
   }
+
+  Output SimpleFusedBatchNormGrad(tensorflow::Scope* s, bool is_training) {
+    int batch_size = 16;
+    int input_height = 8;
+    int input_width = 8;
+    int input_channels = 3;
+    TensorShape shape({batch_size, input_height, input_width, input_channels});
+    Tensor data(DT_FLOAT, shape);
+    test::FillIota<float>(&data, 1.0f);
+    Output x = ops::Const(s->WithOpName("Input"), Input::Initializer(data));
+    Output y_backprop =
+        ops::Const(s->WithOpName("YBackprop"), Input::Initializer(data));
+
+    TensorShape shape_vector({input_channels});
+    Tensor data_vector(DT_FLOAT, shape_vector);
+    test::FillIota<float>(&data_vector, 2.0f);
+    Output scale =
+        ops::Const(s->WithOpName("Scale"), Input::Initializer(data_vector));
+    Output reserve1 =
+        ops::Const(s->WithOpName("Reserve1"), Input::Initializer(data_vector));
+    Output reserve2 =
+        ops::Const(s->WithOpName("Reserve2"), Input::Initializer(data_vector));
+
+    ops::FusedBatchNormGrad::Attrs attrs;
+    attrs.is_training_ = is_training;
+    auto output =
+        ops::FusedBatchNormGrad(s->WithOpName("FusedBatchNormGrad"), y_backprop,
+                                x, scale, reserve1, reserve2, attrs);
+    return output.x_backprop;
+  }
+
+  std::unique_ptr<VirtualCluster> virtual_cluster_;
 };
 
 TEST_F(LayoutOptimizerTest, Conv2DBackpropInput) {
@@ -108,9 +167,9 @@ TEST_F(LayoutOptimizerTest, Conv2DBackpropInput) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   LayoutOptimizer optimizer;
-  optimizer.set_num_gpus(1);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
   string input_name = AddPrefixToNodeName("Conv2DBackpropInput-InputSizes",
                                           "LayoutOptimizer", "-");
@@ -125,6 +184,28 @@ TEST_F(LayoutOptimizerTest, Conv2DBackpropInput) {
   test::ExpectTensorEqual<int>(input_sizes_expected, input_sizes);
 }
 
+TEST_F(LayoutOptimizerTest, Conv2DBackpropInputNonConstInputSizes) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2DBackpropInput(&s, 7, 2, "SAME", false);
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto conv2d_backprop_node = node_map.GetNode("Conv2DBackpropInput");
+  CHECK(conv2d_backprop_node);
+  EXPECT_EQ(conv2d_backprop_node->input(0),
+            "LayoutOptimizerVecPermuteNHWCToNCHW_Conv2DBackpropInput_0");
+  auto input_sizes_node = node_map.GetNode(
+      "LayoutOptimizerVecPermuteNHWCToNCHW_Conv2DBackpropInput_0");
+  CHECK(input_sizes_node);
+  EXPECT_EQ(input_sizes_node->input(0), "InputSizesIdentity");
+  EXPECT_EQ(input_sizes_node->op(), "DataFormatVecPermute");
+}
+
 TEST_F(LayoutOptimizerTest, FilterSizeIsOne) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   auto conv = SimpleConv2D(&s, 2, 1, "SAME");
@@ -132,9 +213,8 @@ TEST_F(LayoutOptimizerTest, FilterSizeIsOne) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   LayoutOptimizer optimizer;
-  optimizer.set_num_gpus(1);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
   EXPECT_FALSE(
       node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Input"));
@@ -147,9 +227,8 @@ TEST_F(LayoutOptimizerTest, FilterSizeNotOne) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   LayoutOptimizer optimizer;
-  optimizer.set_num_gpus(1);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
   EXPECT_FALSE(
       node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Input"));
@@ -162,9 +241,8 @@ TEST_F(LayoutOptimizerTest, EqualSizeWithValidPadding) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   LayoutOptimizer optimizer;
-  optimizer.set_num_gpus(1);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
   EXPECT_FALSE(
       node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Input"));
@@ -177,41 +255,36 @@ TEST_F(LayoutOptimizerTest, EqualSizeWithSamePadding) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   LayoutOptimizer optimizer;
-  optimizer.set_num_gpus(1);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
-  EXPECT_TRUE(
-      node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Input-0"));
+  EXPECT_TRUE(node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-0"));
 }
 
 TEST_F(LayoutOptimizerTest, NotEqualSizeWithValidPadding) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
   Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   LayoutOptimizer optimizer;
-  optimizer.set_num_gpus(1);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
-  EXPECT_TRUE(
-      node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Input-0"));
+  EXPECT_TRUE(node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-0"));
 }
 
 TEST_F(LayoutOptimizerTest, Pad) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
   auto c = ops::Const(s.WithOpName("c"), {1, 2, 3, 4, 5, 6, 7, 8}, {4, 2});
   auto p = ops::Pad(s.WithOpName("p"), conv, c);
   auto o = ops::Identity(s.WithOpName("o"), p);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   LayoutOptimizer optimizer;
-  optimizer.set_num_gpus(1);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
 
   auto pad = node_map.GetNode("p");
@@ -230,7 +303,7 @@ TEST_F(LayoutOptimizerTest, Pad) {
 
 TEST_F(LayoutOptimizerTest, Connectivity) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
   auto i1 = ops::Identity(s.WithOpName("i1"), conv);
   auto i2 = ops::Identity(s.WithOpName("i2"), i1);
   auto i3 = ops::Identity(s.WithOpName("i3"), i2);
@@ -246,9 +319,8 @@ TEST_F(LayoutOptimizerTest, Connectivity) {
   auto node_i2 = node_map_original.GetNode("i2");
   node_i2->Swap(node_i1);
   LayoutOptimizer optimizer;
-  optimizer.set_num_gpus(1);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map_output(&output);
   auto node_i2_output = node_map_output.GetNode("i2");
   // Layout optimizer should process i2, as it detects i2 is connected with the
@@ -259,6 +331,741 @@ TEST_F(LayoutOptimizerTest, Connectivity) {
   EXPECT_EQ(node_i2_output->input(0), "i1");
 }
 
+TEST_F(LayoutOptimizerTest, ConnectivityBinaryOpWithInputScalarAnd4D) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto i1 = ops::Identity(s.WithOpName("i1"), conv);
+  auto i2 = ops::Identity(s.WithOpName("i2"), i1);
+  auto scalar_sub = ops::Const(s.WithOpName("scalar_sub"), 3.0f, {});
+  auto sub = ops::Sub(s.WithOpName("sub"), scalar_sub, i2);
+  auto i3 = ops::Identity(s.WithOpName("i3"), sub);
+  auto i4 = ops::Identity(s.WithOpName("i4"), i3);
+  auto i5 = ops::Identity(s.WithOpName("i5"), i4);
+  auto scalar_mul = ops::Const(s.WithOpName("scalar_mul"), 3.0f, {});
+  auto mul = ops::Mul(s.WithOpName("mul"), scalar_mul, i5);
+  auto i6 = ops::Identity(s.WithOpName("i6"), mul);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  // Make the graph not in topological order to test the handling of multi-hop
+  // connectivity (here we say two nodes are connected if all nodes in the
+  // middle are layout agnostic). If the graph is already in topological order,
+  // the problem is easier, where layout optimizer only needs to check
+  // single-hop connectivity.
+  NodeMap node_map_original(&item.graph);
+  auto node_i1 = node_map_original.GetNode("i1");
+  auto node_mul = node_map_original.GetNode("mul");
+  node_mul->Swap(node_i1);
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map_output(&output);
+  auto mul_node = node_map_output.GetNode("mul");
+  EXPECT_EQ(mul_node->input(0), "scalar_mul");
+  EXPECT_EQ(mul_node->input(1), "i5");
+}
+
+TEST_F(LayoutOptimizerTest, PreserveFetch) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto i = ops::Identity(s.WithOpName("i"), conv);
+  GrapplerItem item;
+  item.fetch.push_back("Conv2D");
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto conv_node = node_map.GetNode("Conv2D");
+  EXPECT_EQ(conv_node->attr().at({"data_format"}).s(), "NHWC");
+}
+
+TEST_F(LayoutOptimizerTest, EmptyDevice) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto conv_node = node_map.GetNode("Conv2D");
+  EXPECT_EQ(conv_node->attr().at({"data_format"}).s(), "NCHW");
+}
+
+TEST_F(LayoutOptimizerTest, GPUDevice) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv =
+      SimpleConv2D(&s, 4, 2, "VALID", "/job:w/replica:0/task:0/device:gpu:0");
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto conv_node = node_map.GetNode("Conv2D");
+  EXPECT_EQ(conv_node->attr().at({"data_format"}).s(), "NCHW");
+}
+
+TEST_F(LayoutOptimizerTest, CPUDeviceLowercase) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv =
+      SimpleConv2D(&s, 4, 2, "VALID", "/job:w/replica:0/task:0/device:cpu:0");
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto conv_node = node_map.GetNode("Conv2D");
+  EXPECT_EQ(conv_node->attr().at({"data_format"}).s(), "NHWC");
+}
+
+TEST_F(LayoutOptimizerTest, CPUDeviceUppercase) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID", "/CPU:0");
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto conv_node = node_map.GetNode("Conv2D");
+  EXPECT_EQ(conv_node->attr().at({"data_format"}).s(), "NHWC");
+}
+
+TEST_F(LayoutOptimizerTest, FusedBatchNormGradTrainingTrue) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto x_backprop = SimpleFusedBatchNormGrad(&s, true);
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {x_backprop});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto conv_node = node_map.GetNode("FusedBatchNormGrad");
+  EXPECT_EQ(conv_node->attr().at({"data_format"}).s(), "NCHW");
+}
+
+TEST_F(LayoutOptimizerTest, FusedBatchNormGradTrainingFalse) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto x_backprop = SimpleFusedBatchNormGrad(&s, false);
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {x_backprop});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto conv_node = node_map.GetNode("FusedBatchNormGrad");
+  EXPECT_EQ(conv_node->attr().at({"data_format"}).s(), "NHWC");
+}
+
+TEST_F(LayoutOptimizerTest, SplitDimC) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 5, 2, "VALID");
+  auto c = ops::Const(s.WithOpName("c"), 3, {});
+  auto split = ops::Split(s.WithOpName("split"), c, conv, 2);
+  auto i = ops::Identity(s.WithOpName("i"), split[0]);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto split_node = node_map.GetNode("split");
+  EXPECT_EQ(split_node->input(0), "LayoutOptimizer-split-c");
+  EXPECT_EQ(split_node->input(1), "Conv2D");
+  auto split_const = node_map.GetNode("LayoutOptimizer-split-c");
+  EXPECT_EQ(split_const->op(), "Const");
+  EXPECT_EQ(split_const->attr().at({"value"}).tensor().int_val(0), 1);
+}
+
+TEST_F(LayoutOptimizerTest, SplitDimH) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 6, 2, "SAME");
+  auto c = ops::Const(s.WithOpName("c"), 1, {});
+  auto split = ops::Split(s.WithOpName("split"), c, conv, 2);
+  auto i = ops::Identity(s.WithOpName("i"), split[0]);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto split_node = node_map.GetNode("split");
+  EXPECT_EQ(split_node->input(0), "LayoutOptimizer-split-c");
+  EXPECT_EQ(split_node->input(1), "Conv2D");
+  auto split_const = node_map.GetNode("LayoutOptimizer-split-c");
+  EXPECT_EQ(split_const->op(), "Const");
+  EXPECT_EQ(split_const->attr().at({"value"}).tensor().int_val(0), 2);
+}
+
+TEST_F(LayoutOptimizerTest, SplitDimW) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 5, 2, "VALID");
+  auto c = ops::Const(s.WithOpName("c"), 2, {});
+  auto split = ops::Split(s.WithOpName("split"), c, conv, 2);
+  auto i = ops::Identity(s.WithOpName("i"), split[0]);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto split_node = node_map.GetNode("split");
+  EXPECT_EQ(split_node->input(0), "LayoutOptimizer-split-c");
+  EXPECT_EQ(split_node->input(1), "Conv2D");
+  auto split_const = node_map.GetNode("LayoutOptimizer-split-c");
+  EXPECT_EQ(split_const->op(), "Const");
+  EXPECT_EQ(split_const->attr().at({"value"}).tensor().int_val(0), 3);
+}
+
+TEST_F(LayoutOptimizerTest, SplitDimN) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 5, 2, "VALID");
+  auto c = ops::Const(s.WithOpName("c"), 0, {});
+  auto split = ops::Split(s.WithOpName("split"), c, conv, 2);
+  auto i = ops::Identity(s.WithOpName("i"), split[0]);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto split_node = node_map.GetNode("split");
+  EXPECT_EQ(split_node->input(0), "LayoutOptimizer-split-c");
+  EXPECT_EQ(split_node->input(1), "Conv2D");
+  auto split_const = node_map.GetNode("LayoutOptimizer-split-c");
+  EXPECT_EQ(split_const->op(), "Const");
+  EXPECT_EQ(split_const->attr().at({"value"}).tensor().int_val(0), 0);
+}
+
+TEST_F(LayoutOptimizerTest, SplitNonConstDim) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 5, 2, "VALID");
+  auto c = ops::Const(s.WithOpName("c"), 0, {});
+  auto i1 = ops::Identity(s.WithOpName("i1"), c);
+  auto split = ops::Split(s.WithOpName("split"), i1, conv, 2);
+  auto i2 = ops::Identity(s.WithOpName("i"), split[0]);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto split_node = node_map.GetNode("split");
+  EXPECT_EQ(split_node->input(0),
+            "LayoutOptimizerVecPermuteNHWCToNCHW_split_0");
+  EXPECT_EQ(split_node->input(1), "Conv2D");
+  auto map_node =
+      node_map.GetNode("LayoutOptimizerVecPermuteNHWCToNCHW_split_0");
+  EXPECT_EQ(map_node->op(), "DataFormatDimMap");
+  EXPECT_EQ(map_node->input(0), "i1");
+}
+
+TEST_F(LayoutOptimizerTest, SplitSamePortToMultipleInputsOfSameNode) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 5, 2, "VALID");
+  auto axis = ops::Const(s.WithOpName("axis"), 3);
+  auto split = ops::Split(s.WithOpName("split"), axis, conv, 2);
+  auto concat =
+      ops::Concat(s.WithOpName("concat"), {split[1], split[1], split[1]}, axis);
+  auto o = ops::Identity(s.WithOpName("o"), concat);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto concat_node = node_map.GetNode("concat");
+  EXPECT_EQ(concat_node->input(0), "split:1");
+  EXPECT_EQ(concat_node->input(1), "split:1");
+  EXPECT_EQ(concat_node->input(2), "split:1");
+  EXPECT_EQ(concat_node->input(3), "LayoutOptimizer-concat-axis");
+  auto concat_dim = node_map.GetNode("LayoutOptimizer-concat-axis");
+  EXPECT_EQ(concat_dim->attr().at({"value"}).tensor().int_val(0), 1);
+}
+
+TEST_F(LayoutOptimizerTest, ConcatDimH) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "SAME");
+  auto axis = ops::Const(s.WithOpName("axis"), 1);
+  auto split = ops::Split(s.WithOpName("split"), axis, conv, 2);
+  auto concat = ops::Concat(s.WithOpName("concat"), {split[0], split[1]}, axis);
+  auto o = ops::Identity(s.WithOpName("o"), concat);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto concat_node = node_map.GetNode("concat");
+  EXPECT_EQ(concat_node->input(0), "split");
+  EXPECT_EQ(concat_node->input(1), "split:1");
+  EXPECT_EQ(concat_node->input(2), "LayoutOptimizer-concat-axis");
+  auto concat_dim = node_map.GetNode("LayoutOptimizer-concat-axis");
+  EXPECT_EQ(concat_dim->attr().at({"value"}).tensor().int_val(0), 2);
+}
+
+TEST_F(LayoutOptimizerTest, ConcatNonConst) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "SAME");
+  auto axis = ops::Const(s.WithOpName("axis"), 1);
+  auto i = ops::Identity(s.WithOpName("i"), axis);
+  auto split = ops::Split(s.WithOpName("split"), axis, conv, 2);
+  auto concat = ops::Concat(s.WithOpName("concat"), {split[0], split[1]}, i);
+  auto o = ops::Identity(s.WithOpName("o"), concat);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto concat_node = node_map.GetNode("concat");
+  EXPECT_EQ(concat_node->input(0), "split");
+  EXPECT_EQ(concat_node->input(1), "split:1");
+  EXPECT_EQ(concat_node->input(2),
+            "LayoutOptimizerVecPermuteNHWCToNCHW_concat_2");
+  auto concat_dim =
+      node_map.GetNode("LayoutOptimizerVecPermuteNHWCToNCHW_concat_2");
+  EXPECT_EQ(concat_dim->op(), "DataFormatDimMap");
+  EXPECT_EQ(concat_dim->input(0), "i");
+}
+
+TEST_F(LayoutOptimizerTest, ConcatDimW) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "SAME");
+  auto axis = ops::Const(s.WithOpName("axis"), 2);
+  auto split = ops::Split(s.WithOpName("split"), axis, conv, 2);
+  auto concat = ops::Concat(s.WithOpName("concat"), {split[0], split[1]}, axis);
+  auto o = ops::Identity(s.WithOpName("o"), concat);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto concat_node = node_map.GetNode("concat");
+  EXPECT_EQ(concat_node->input(0), "split");
+  EXPECT_EQ(concat_node->input(1), "split:1");
+  EXPECT_EQ(concat_node->input(2), "LayoutOptimizer-concat-axis");
+  auto concat_dim = node_map.GetNode("LayoutOptimizer-concat-axis");
+  EXPECT_EQ(concat_dim->attr().at({"value"}).tensor().int_val(0), 3);
+}
+
+TEST_F(LayoutOptimizerTest, ConcatDimN) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto axis = ops::Const(s.WithOpName("axis"), 0);
+  auto split = ops::Split(s.WithOpName("split"), axis, conv, 2);
+  auto concat = ops::Concat(s.WithOpName("concat"), {split[0], split[1]}, axis);
+  auto o = ops::Identity(s.WithOpName("o"), concat);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto concat_node = node_map.GetNode("concat");
+  EXPECT_EQ(concat_node->input(0), "split");
+  EXPECT_EQ(concat_node->input(1), "split:1");
+  EXPECT_EQ(concat_node->input(2), "LayoutOptimizer-concat-axis");
+  auto concat_dim = node_map.GetNode("LayoutOptimizer-concat-axis");
+  EXPECT_EQ(concat_dim->attr().at({"value"}).tensor().int_val(0), 0);
+}
+
+TEST_F(LayoutOptimizerTest, ConcatDimC) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto axis = ops::Const(s.WithOpName("axis"), 3);
+  auto split = ops::Split(s.WithOpName("split"), axis, conv, 2);
+  auto concat = ops::Concat(s.WithOpName("concat"), {split[0], split[1]}, axis);
+  auto o = ops::Identity(s.WithOpName("o"), concat);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto concat_node = node_map.GetNode("concat");
+  EXPECT_EQ(concat_node->input(0), "split");
+  EXPECT_EQ(concat_node->input(1), "split:1");
+  EXPECT_EQ(concat_node->input(2), "LayoutOptimizer-concat-axis");
+  auto concat_dim = node_map.GetNode("LayoutOptimizer-concat-axis");
+  EXPECT_EQ(concat_dim->attr().at({"value"}).tensor().int_val(0), 1);
+}
+
+TEST_F(LayoutOptimizerTest, Sum) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto reduction_indices =
+      ops::Const(s.WithOpName("reduction_indices"), {0, 1, 2}, {3});
+  auto sum = ops::Sum(s.WithOpName("sum"), conv, reduction_indices);
+  auto o = ops::Identity(s.WithOpName("o"), sum);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  // TODO(yaozhang): enable SumProcessor with auto-tuning. Currently disabled
+  // because of the worse performance in some cases.
+  /*
+  NodeMap node_map(&output);
+  auto sum_node = node_map.GetNode("sum");
+  EXPECT_EQ(sum_node->input(0), "Conv2D");
+  EXPECT_EQ(sum_node->input(1), "LayoutOptimizer-sum-reduction_indices");
+  auto sum_const = node_map.GetNode("LayoutOptimizer-sum-reduction_indices");
+  Tensor tensor;
+  EXPECT_TRUE(
+      tensor.FromProto(sum_const->mutable_attr()->at({"value"}).tensor()));
+  Tensor tensor_expected(DT_INT32, {3});
+  test::FillValues<int>(&tensor_expected, {0, 2, 3});
+  test::ExpectTensorEqual<int>(tensor_expected, tensor);
+  */
+}
+
+TEST_F(LayoutOptimizerTest, MulScalarAnd4D) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto scalar = ops::Const(s.WithOpName("scalar"), 3.0f, {});
+  auto mul = ops::Mul(s.WithOpName("mul"), scalar, conv);
+  auto o = ops::Identity(s.WithOpName("o"), mul);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto mul_node = node_map.GetNode("mul");
+  EXPECT_EQ(mul_node->input(0), "scalar");
+  EXPECT_EQ(mul_node->input(1), "Conv2D");
+}
+
+TEST_F(LayoutOptimizerTest, Mul4DAndScalar) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto scalar = ops::Const(s.WithOpName("scalar"), 3.0f, {});
+  auto mul = ops::Mul(s.WithOpName("mul"), conv, scalar);
+  auto o = ops::Identity(s.WithOpName("o"), mul);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto mul_node = node_map.GetNode("mul");
+  EXPECT_EQ(mul_node->input(0), "Conv2D");
+  EXPECT_EQ(mul_node->input(1), "scalar");
+}
+
+TEST_F(LayoutOptimizerTest, Mul4DAndUnknownRank) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto unknown_rank =
+      ops::Placeholder(s.WithOpName("unknown"), DT_FLOAT,
+                       ops::Placeholder::Shape(PartialTensorShape()));
+  Output c = ops::Const(s.WithOpName("c"), 3.0f, {8, 2, 2, 2});
+  Output mul = ops::Mul(s.WithOpName("mul"), conv, unknown_rank);
+  auto o = ops::AddN(s.WithOpName("o"), {mul, c});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto mul_node = node_map.GetNode("mul");
+  // Node mul should not be processed by layout optimizer, because one of its
+  // inputs is of unknown rank.
+  EXPECT_EQ(mul_node->input(0),
+            "LayoutOptimizerTransposeNCHWToNHWC-Conv2D-0-0");
+  EXPECT_EQ(mul_node->input(1), "unknown");
+}
+
+TEST_F(LayoutOptimizerTest, Mul4DAnd4D) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto i = ops::Identity(s.WithOpName("i"), conv);
+  auto mul = ops::Mul(s.WithOpName("mul"), conv, i);
+  auto o = ops::Identity(s.WithOpName("o"), mul);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto mul_node = node_map.GetNode("mul");
+  EXPECT_EQ(mul_node->input(0), "Conv2D");
+  EXPECT_EQ(mul_node->input(1), "i");
+}
+
+TEST_F(LayoutOptimizerTest, Mul4DAndVector) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto vector = ops::Const(s.WithOpName("vector"), {3.0f, 7.0f}, {2});
+  auto mul = ops::Mul(s.WithOpName("mul"), conv, vector);
+  auto o = ops::Identity(s.WithOpName("o"), mul);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto mul_node = node_map.GetNode("mul");
+  EXPECT_EQ(mul_node->input(0), "Conv2D");
+  EXPECT_EQ(mul_node->input(1), "LayoutOptimizerReshapeNHWCToNCHW-mul-1");
+  auto mul_const = node_map.GetNode("LayoutOptimizerReshapeConst-mul-1");
+  Tensor tensor;
+  EXPECT_TRUE(
+      tensor.FromProto(mul_const->mutable_attr()->at({"value"}).tensor()));
+  Tensor tensor_expected(DT_INT32, {4});
+  test::FillValues<int>(&tensor_expected, {1, 2, 1, 1});
+  test::ExpectTensorEqual<int>(tensor_expected, tensor);
+}
+
+TEST_F(LayoutOptimizerTest, MulVectorAnd4D) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto vector = ops::Const(s.WithOpName("vector"), {3.0f, 7.0f}, {2});
+  auto mul = ops::Mul(s.WithOpName("mul"), vector, conv);
+  auto o = ops::Identity(s.WithOpName("o"), mul);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto mul_node = node_map.GetNode("mul");
+  EXPECT_EQ(mul_node->input(0), "LayoutOptimizerReshapeNHWCToNCHW-mul-0");
+  EXPECT_EQ(mul_node->input(1), "Conv2D");
+  auto mul_const = node_map.GetNode("LayoutOptimizerReshapeConst-mul-0");
+  Tensor tensor;
+  EXPECT_TRUE(
+      tensor.FromProto(mul_const->mutable_attr()->at({"value"}).tensor()));
+  Tensor tensor_expected(DT_INT32, {4});
+  test::FillValues<int>(&tensor_expected, {1, 2, 1, 1});
+  test::ExpectTensorEqual<int>(tensor_expected, tensor);
+}
+
+TEST_F(LayoutOptimizerTest, SliceConst) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 5, 2, "VALID");
+  auto begin = ops::Const(s.WithOpName("begin"), {0, 2, 3, 1}, {4});
+  auto size = ops::Const(s.WithOpName("size"), {4, 1, 2, 4}, {4});
+  auto slice = ops::Slice(s.WithOpName("slice"), conv, begin, size);
+  auto o = ops::Identity(s.WithOpName("o"), slice);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto slice_node = node_map.GetNode("slice");
+  EXPECT_EQ(slice_node->input(0), "Conv2D");
+  EXPECT_EQ(slice_node->input(1), "LayoutOptimizer-slice-begin");
+  EXPECT_EQ(slice_node->input(2), "LayoutOptimizer-slice-size");
+
+  auto begin_const = node_map.GetNode("LayoutOptimizer-slice-begin");
+  Tensor begin_tensor;
+  EXPECT_TRUE(begin_tensor.FromProto(
+      begin_const->mutable_attr()->at({"value"}).tensor()));
+  Tensor begin_tensor_expected(DT_INT32, {4});
+  test::FillValues<int>(&begin_tensor_expected, {0, 1, 2, 3});
+  test::ExpectTensorEqual<int>(begin_tensor_expected, begin_tensor);
+
+  auto size_const = node_map.GetNode("LayoutOptimizer-slice-size");
+  Tensor size_tensor;
+  EXPECT_TRUE(size_tensor.FromProto(
+      size_const->mutable_attr()->at({"value"}).tensor()));
+  Tensor size_tensor_expected(DT_INT32, {4});
+  test::FillValues<int>(&size_tensor_expected, {4, 4, 1, 2});
+  test::ExpectTensorEqual<int>(size_tensor_expected, size_tensor);
+}
+
+TEST_F(LayoutOptimizerTest, SliceNonConst) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 5, 2, "VALID");
+  auto begin = ops::Const(s.WithOpName("begin"), {0, 2, 3, 1}, {4});
+  auto ibegin = ops::Identity(s.WithOpName("ibegin"), begin);
+  auto size = ops::Const(s.WithOpName("size"), {4, 1, 2, 4}, {4});
+  auto isize = ops::Identity(s.WithOpName("isize"), size);
+  auto slice = ops::Slice(s.WithOpName("slice"), conv, ibegin, isize);
+  auto o = ops::Identity(s.WithOpName("o"), slice);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto slice_node = node_map.GetNode("slice");
+  EXPECT_EQ(slice_node->input(0), "Conv2D");
+  EXPECT_EQ(slice_node->input(1),
+            "LayoutOptimizerVecPermuteNHWCToNCHW_slice_1");
+  EXPECT_EQ(slice_node->input(2),
+            "LayoutOptimizerVecPermuteNHWCToNCHW_slice_2");
+  auto perm1 = node_map.GetNode("LayoutOptimizerVecPermuteNHWCToNCHW_slice_1");
+  EXPECT_EQ(perm1->op(), "DataFormatVecPermute");
+  EXPECT_EQ(perm1->input(0), "ibegin");
+  auto perm2 = node_map.GetNode("LayoutOptimizerVecPermuteNHWCToNCHW_slice_2");
+  EXPECT_EQ(perm1->op(), "DataFormatVecPermute");
+  EXPECT_EQ(perm2->input(0), "isize");
+}
+
+TEST_F(LayoutOptimizerTest, DoNotApplyOptimizerTwice) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto scalar =
+      ops::Const(s.WithOpName("LayoutOptimizerAlreadyApplied"), 3.0f, {});
+  auto mul = ops::Mul(s.WithOpName("mul"), scalar, scalar);
+  auto o = ops::Identity(s.WithOpName("o"), mul);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  EXPECT_TRUE(errors::IsInvalidArgument(status));
+}
+
+TEST_F(LayoutOptimizerTest, ShapeNWithInputs4DAnd4D) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto shapen = ops::ShapeN(s.WithOpName("shapen"), {conv, conv});
+  auto add = ops::Add(s.WithOpName("add"), shapen[0], shapen[1]);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto shapen_node = node_map.GetNode("shapen");
+  EXPECT_EQ(shapen_node->input(0), "Conv2D");
+  EXPECT_EQ(shapen_node->input(1), "Conv2D");
+  auto add_node = node_map.GetNode("add");
+  EXPECT_EQ(add_node->input(0),
+            "LayoutOptimizerVecPermuteNCHWToNHWC-shapen-0-0");
+  EXPECT_EQ(add_node->input(1),
+            "LayoutOptimizerVecPermuteNCHWToNHWC-shapen-0-1");
+  auto vec_permute1 =
+      node_map.GetNode("LayoutOptimizerVecPermuteNCHWToNHWC-shapen-0-0");
+  EXPECT_EQ(vec_permute1->input(0), "shapen");
+  EXPECT_EQ(vec_permute1->op(), "DataFormatVecPermute");
+  auto vec_permute2 =
+      node_map.GetNode("LayoutOptimizerVecPermuteNCHWToNHWC-shapen-0-1");
+  EXPECT_EQ(vec_permute2->input(0), "shapen:1");
+  EXPECT_EQ(vec_permute2->op(), "DataFormatVecPermute");
+}
+
+TEST_F(LayoutOptimizerTest, ShapeNWithInputsVectorAnd4D) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto vector = ops::Const(s.WithOpName("vector"), 3.0f, {7});
+  auto shapen = ops::ShapeN(s.WithOpName("shapen"), {vector, conv});
+  auto add = ops::Add(s.WithOpName("add"), shapen[0], shapen[1]);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto shapen_node = node_map.GetNode("shapen");
+  EXPECT_EQ(shapen_node->input(0), "vector");
+  EXPECT_EQ(shapen_node->input(1), "Conv2D");
+  auto add_node = node_map.GetNode("add");
+  EXPECT_EQ(add_node->input(0), "shapen");
+  EXPECT_EQ(add_node->input(1),
+            "LayoutOptimizerVecPermuteNCHWToNHWC-shapen-0-1");
+  auto vec_permute =
+      node_map.GetNode("LayoutOptimizerVecPermuteNCHWToNHWC-shapen-0-1");
+  EXPECT_EQ(vec_permute->input(0), "shapen:1");
+  EXPECT_EQ(vec_permute->op(), "DataFormatVecPermute");
+}
+
+TEST_F(LayoutOptimizerTest, ShapeNWithInputs4DAndNoNeedToTransform4D) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto tensor_4d = ops::Const(s.WithOpName("tensor_4d"), 3.0f, {1, 1, 1, 3});
+  auto i1 = ops::Identity(s.WithOpName("i1"), tensor_4d);
+  Output i2 = ops::Identity(s.WithOpName("i2"), i1);
+  auto shapen = ops::ShapeN(s.WithOpName("shapen"), {conv, i2});
+  auto add = ops::Add(s.WithOpName("add"), shapen[0], shapen[1]);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto shapen_node = node_map.GetNode("shapen");
+  EXPECT_EQ(shapen_node->input(0), "Conv2D");
+  EXPECT_EQ(shapen_node->input(1), "i2");
+}
+
+TEST_F(LayoutOptimizerTest, Switch) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  ops::Variable ctrl(s.WithOpName("ctrl"), {}, DT_BOOL);
+  auto sw = ops::Switch(s.WithOpName("switch"), conv, ctrl);
+  auto i1 = ops::Identity(s.WithOpName("i1"), sw.output_true);
+  auto i2 = ops::Identity(s.WithOpName("i2"), sw.output_false);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto switch_node = node_map.GetNode("switch");
+  EXPECT_EQ(switch_node->input(0), "Conv2D");
+  EXPECT_EQ(switch_node->input(1), "ctrl");
+  auto i1_node = node_map.GetNode("i1");
+  auto i2_node = node_map.GetNode("i2");
+  auto trans1 = node_map.GetNode(i1_node->input(0));
+  EXPECT_EQ(trans1->input(0), "switch:1");
+  auto trans2 = node_map.GetNode(i2_node->input(0));
+  EXPECT_EQ(trans2->input(0), "switch");
+}
+
+TEST_F(LayoutOptimizerTest, MergeBothInputsConvertible) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  Output i1 = ops::Identity(s.WithOpName("i1"), conv);
+  auto merge = ops::Merge(s.WithOpName("merge"), {conv, i1});
+  auto i2 = ops::Identity(s.WithOpName("i2"), merge.output);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto merge_node = node_map.GetNode("merge");
+  EXPECT_EQ(merge_node->input(0), "Conv2D");
+  EXPECT_EQ(merge_node->input(1), "i1");
+  auto i2_node = node_map.GetNode("i2");
+  EXPECT_EQ(i2_node->input(0), "LayoutOptimizerTransposeNCHWToNHWC-merge-0-0");
+  auto transpose =
+      node_map.GetNode("LayoutOptimizerTransposeNCHWToNHWC-merge-0-0");
+  EXPECT_EQ(transpose->input(0), "merge");
+}
+
+TEST_F(LayoutOptimizerTest, MergeOneInputNotConvertible) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto tensor_4d = ops::Const(s.WithOpName("tensor_4d"), 3.0f, {1, 1, 1, 3});
+  auto merge = ops::Merge(s.WithOpName("merge"), {tensor_4d, conv});
+  auto i2 = ops::Identity(s.WithOpName("i2"), merge.output);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto merge_node = node_map.GetNode("merge");
+  EXPECT_EQ(merge_node->input(0), "tensor_4d");
+  EXPECT_EQ(merge_node->input(1),
+            "LayoutOptimizerTransposeNCHWToNHWC-Conv2D-0-1");
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
index 7c44ce15c6efee1ca375665976db1dc15dc01096..1420fdb6feaab32a250f2837f829a695edbabefc 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
@@ -419,7 +419,7 @@ void RecomputationRewritingPass(RewriterConfig::MemOptType optimization_level,
   // We don't use the results of this topological sort until later, but this
   // call invalidates all NodeDef pointers, so it needs to be done before we
   // start collecting those.
-  TopologicalSort(graph);
+  TF_CHECK_OK(TopologicalSort(graph));
   NodeMap node_map(graph);
   std::vector<RecomputedSubGraph> recomputed_subgraphs;
   // Do not recompute nodes which are fed, since the recomputed node would not
@@ -716,7 +716,7 @@ Status MemoryOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   {
     // Estimate the size of the data to swap for each node.
     GraphProperties properties(item);
-    TF_RETURN_IF_ERROR(properties.InferStatically());
+    TF_RETURN_IF_ERROR(properties.InferStatically(true));
     for (auto& swap : nodes_to_swap) {
       const NodeDef* node = swap.first;
       std::vector<OpInfo::TensorProperties> props =
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 1fa639ad33d9e00ad5bfd7344204a6f0b464e37a..4228e7baba9741cf9160d4789d6bef04c50a7409 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -30,6 +30,23 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+namespace {
+int64 NumEdges(const GraphDef& graph) {
+  int64 num_edges = 0;
+  for (const auto& node : graph.node()) {
+    num_edges += node.input_size();
+  }
+  return num_edges;
+}
+
+string PrintSizesBeforeAfter(const GraphDef& before, const GraphDef& after) {
+  return strings::StrCat("Graph size before: ", before.node_size(), " nodes, ",
+                         NumEdges(before),
+                         " edges. Graph size after: ", after.node_size(),
+                         " nodes, ", NumEdges(after), " edges.");
+}
+}  // namespace
+
 std::unique_ptr<GraphOptimizer> MetaOptimizer::NewOptimizer(
     const string& optimizer) {
   VLOG(1) << "Adding graph optimization pass: " << optimizer;
@@ -76,7 +93,7 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       optimizers.push_back(std::unique_ptr<GraphOptimizer>(
           new ArithmeticOptimizer(cfg_.arithmetic_optimization())));
     }
-    if (cfg_.dependency_optimization() == RewriterConfig::ON) {
+    if (cfg_.dependency_optimization() != RewriterConfig::OFF) {
       optimizers.push_back(std::unique_ptr<GraphOptimizer>(
           new DependencyOptimizer(cfg_.dependency_optimization())));
     }
@@ -128,12 +145,11 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       } else {
         already_optimized = true;
         result = strings::StrCat(
-            "OK. "
-            "Graph size before: ",
-            item.graph.node_size(),
-            ". Graph size after: ", optimized_graph->node_size());
+            "OK. ", PrintSizesBeforeAfter(item.graph, *optimized_graph));
       }
       result_.push_back(std::make_pair(optimizer->name(), result));
+      VLOG(1) << "Optimizer " << optimizer->name()
+              << " return status: " << result;
     } else {
       GrapplerItem optimized_item(item, std::move(*optimized_graph));
       auto status =
@@ -146,17 +162,17 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
         result = status.ToString();
       } else {
         result = strings::StrCat(
-            "OK. "
-            "Graph size before: ",
-            optimized_item.graph.node_size(),
-            ". Graph size after: ", optimized_graph->node_size());
+            "OK. ",
+            PrintSizesBeforeAfter(optimized_item.graph, *optimized_graph));
       }
       result_.push_back(std::make_pair(optimizer->name(), result));
+      VLOG(1) << "Optimizer " << optimizer->name()
+              << " return status: " << result;
     }
   }
 
   if (already_optimized) {
-    TopologicalSort(optimized_graph);
+    TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph));
     // Make sure that the optimizers preserved the graph version and library.
     DCHECK_GE(optimized_graph->library().function_size(),
               item.graph.library().function_size());
@@ -187,7 +203,7 @@ bool MetaOptimizerEnabled(const RewriterConfig& cfg) {
   return !cfg.disable_model_pruning() ||
          cfg.layout_optimizer() == RewriterConfig::ON ||
          cfg.constant_folding() != RewriterConfig::OFF ||
-         cfg.dependency_optimization() == RewriterConfig::ON ||
+         cfg.dependency_optimization() != RewriterConfig::OFF ||
          cfg.arithmetic_optimization() != RewriterConfig::OFF ||
          cfg.auto_parallel().enable() || cfg.memory_optimization() > 1 ||
          !cfg.optimizers().empty();
diff --git a/tensorflow/core/grappler/optimizers/model_pruner.cc b/tensorflow/core/grappler/optimizers/model_pruner.cc
index b9df196f83b0e986a3eb4ed4c470c5520e7d611f..c9bec7890e6af008859d21555fb7ed74451c72c6 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner.cc
+++ b/tensorflow/core/grappler/optimizers/model_pruner.cc
@@ -26,16 +26,6 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-int NumNonControlInputs(const NodeDef& node) {
-  int num_inputs = node.input_size();
-  for (int i = 0; i < node.input_size(); ++i) {
-    if (!node.input(i).empty() && node.input(i)[0] == '^') {
-      num_inputs--;
-    }
-  }
-  return num_inputs;
-}
-
 bool IsTrivialOp(const NodeDef& node) {
   // Remove the stop gradient nodes since they serve no purpose once the graph
   // is built. Also remove Identity ops.
diff --git a/tensorflow/core/grappler/optimizers/static_schedule.cc b/tensorflow/core/grappler/optimizers/static_schedule.cc
index 6ce6deef2ceacdfe44b49659109e432b87739f97..450e85340796fdde9afdfebbd0eb9a724cb9440a 100644
--- a/tensorflow/core/grappler/optimizers/static_schedule.cc
+++ b/tensorflow/core/grappler/optimizers/static_schedule.cc
@@ -86,7 +86,7 @@ Status EstimateEarliestExecutionTimes(
   name_map.clear();
 
   GraphProperties properties(item);
-  TF_RETURN_IF_ERROR(properties.InferStatically());
+  TF_RETURN_IF_ERROR(properties.InferStatically(true));
   OpLevelCostEstimator estimator;
   VirtualPlacer placer(cluster);
 
@@ -154,7 +154,7 @@ Status EstimateRequiredTimes(
     }
   }
   GraphProperties properties(item);
-  TF_RETURN_IF_ERROR(properties.InferStatically());
+  TF_RETURN_IF_ERROR(properties.InferStatically(true));
   OpLevelCostEstimator estimator;
   VirtualPlacer placer(cluster);
 
diff --git a/tensorflow/core/grappler/optimizers/static_schedule_test.cc b/tensorflow/core/grappler/optimizers/static_schedule_test.cc
index 5de593358727bf8b1f247c0fb9ec8f52b2819e4c..08580d92842377c2dd999950b2e01bef01e2fee6 100644
--- a/tensorflow/core/grappler/optimizers/static_schedule_test.cc
+++ b/tensorflow/core/grappler/optimizers/static_schedule_test.cc
@@ -64,17 +64,17 @@ TEST_F(StaticScheduleTest, BasicGraph) {
     if (time.first->name() == "Const/Const") {
       EXPECT_EQ(Costs::NanoSeconds(1), time.second);
     } else if (time.first->name() == "x") {
-      EXPECT_EQ(Costs::NanoSeconds(250002), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(250001), time.second);
     } else if (time.first->name() == "Square") {
-      EXPECT_EQ(Costs::NanoSeconds(1500005), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(1500004), time.second);
     } else if (time.first->name() == "Square_1") {
-      EXPECT_EQ(Costs::NanoSeconds(2750008), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(2750007), time.second);
     } else if (time.first->name() == "Square_2") {
-      EXPECT_EQ(Costs::NanoSeconds(4000011), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(4000010), time.second);
     } else if (time.first->name() == "Square_3") {
-      EXPECT_EQ(Costs::NanoSeconds(5250014), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(5250013), time.second);
     } else if (time.first->name() == "y") {
-      EXPECT_EQ(Costs::NanoSeconds(6500017), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(6500013), time.second);
     }
   }
 }
@@ -110,13 +110,13 @@ TEST_F(StaticScheduleTest, BasicGraphWithCtrlDependencies) {
     if (time.first->name() == "a") {
       EXPECT_EQ(Costs::NanoSeconds(1), time.second);
     } else if (time.first->name() == "b") {
-      EXPECT_EQ(Costs::NanoSeconds(12500026), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(12500001), time.second);
     } else if (time.first->name() == "c") {
-      EXPECT_EQ(Costs::NanoSeconds(12500027), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(12500002), time.second);
     } else if (time.first->name() == "d") {
-      EXPECT_EQ(Costs::NanoSeconds(12500028), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(12500003), time.second);
     } else if (time.first->name() == "e") {
-      EXPECT_EQ(Costs::NanoSeconds(25000053), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(25000003), time.second);
     }
   }
 }
@@ -142,17 +142,17 @@ TEST_F(StaticScheduleTest, RequiredTimes) {
 
   for (auto time : required_times) {
     if (time.first->name() == "Const/Const") {
-      EXPECT_EQ(Costs::NanoSeconds(-6500016), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-6500012), time.second);
     } else if (time.first->name() == "x") {
-      EXPECT_EQ(Costs::NanoSeconds(-6250015), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-6250012), time.second);
     } else if (time.first->name() == "Square") {
-      EXPECT_EQ(Costs::NanoSeconds(-5000012), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-5000009), time.second);
     } else if (time.first->name() == "Square_1") {
-      EXPECT_EQ(Costs::NanoSeconds(-3750009), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-3750006), time.second);
     } else if (time.first->name() == "Square_2") {
-      EXPECT_EQ(Costs::NanoSeconds(-2500006), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-2500003), time.second);
     } else if (time.first->name() == "Square_3") {
-      EXPECT_EQ(Costs::NanoSeconds(-1250003), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-1250000), time.second);
     } else if (time.first->name() == "y") {
       EXPECT_EQ(Costs::NanoSeconds(0), time.second);
     }
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index 9452cfbf5575e612a2e88e62bd96d2eb588febbc..fc80772360a71e63c618ca4b2f697a92883196eb 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -14,10 +14,12 @@ limitations under the License.
 ==============================================================================*/
 
 #include <memory>
+#include <vector>
 
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/scanner.h"
@@ -27,22 +29,29 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-NodeMap::NodeMap(GraphDef* graph) : graph_(graph) {
-  for (int i = 0; i < graph_->node_size(); i++) {
-    auto node = graph_->mutable_node(i);
-    auto rslt = nodes_.insert(std::make_pair(node->name(), node));
+NodeMap::NodeMap(GraphDef* graph) {
+  CHECK(graph != nullptr);
+  for (int i = 0; i < graph->node_size(); i++) {
+    NodeDef* node = graph->mutable_node(i);
+    const string& node_name = node->name();
+    auto rslt = nodes_.emplace(node_name, node);
     // Check that the graph doesn't contain multiple nodes with the same name.
     if (!rslt.second) {
-      LOG(WARNING) << "Duplicated node in the graph: " << node->name();
+      LOG(WARNING) << "Duplicated node in the graph: " << node_name;
     }
     for (const auto& input : node->input()) {
-      outputs_[NodeName(input)].insert(nodes_[node->name()]);
+      outputs_[NodeName(input)].insert(nodes_[node_name]);
     }
   }
 }
 
+void NodeMap::RemoveNode(const string& name) {
+  nodes_.erase(NodeName(name));
+  outputs_.erase(NodeName(name));
+}
+
 NodeDef* NodeMap::GetNode(const string& name) const {
-  string node_name = NodeName(name);
+  const string node_name = NodeName(name);
   auto it = nodes_.find(node_name);
   if (it == nodes_.end()) {
     return nullptr;
@@ -50,6 +59,11 @@ NodeDef* NodeMap::GetNode(const string& name) const {
   return it->second;
 }
 
+bool NodeMap::NodeExists(const string& name) const {
+  const string node_name = NodeName(name);
+  return nodes_.find(node_name) != nodes_.end();
+}
+
 const std::set<NodeDef*>& NodeMap::GetOutputs(const string& node_name) const {
   auto it = outputs_.find(node_name);
   if (it == outputs_.end()) {
@@ -58,27 +72,27 @@ const std::set<NodeDef*>& NodeMap::GetOutputs(const string& node_name) const {
   return it->second;
 }
 
-void NodeMap::AddNode(const string& name, NodeDef* node) {
-  auto ret = nodes_.insert(std::make_pair(name, node));
-  CHECK(ret.second) << "Pair (" << name << "," << node
+void NodeMap::AddNode(const string& node_name, NodeDef* node) {
+  auto ret = nodes_.emplace(node_name, CHECK_NOTNULL(node));
+  CHECK(ret.second) << "Pair (" << node_name << "," << node
                     << ") is not inserted because the same key already exists.";
 }
 
 void NodeMap::AddOutput(const string& node_name, const string& output_name) {
-  auto output_node = nodes_[output_name];
+  auto output_node = nodes_[NodeName(output_name)];
   CHECK(output_node) << "Output node " << output_name
                      << " is missing in NodeMap.";
   outputs_[node_name].insert(output_node);
 }
 
 void NodeMap::RemoveOutput(const string& node_name, const string& output_name) {
-  outputs_[node_name].erase(nodes_[output_name]);
+  outputs_[node_name].erase(nodes_[NodeName(output_name)]);
 }
 
 void NodeMap::UpdateInput(const string& node_name, const string& old_input_name,
                           const string& new_input_name) {
-  RemoveOutput(old_input_name, node_name);
-  AddOutput(new_input_name, node_name);
+  RemoveOutput(NodeName(old_input_name), node_name);
+  AddOutput(NodeName(new_input_name), node_name);
 }
 
 void NodeMap::RemoveInputs(const string& node_name) {
@@ -96,14 +110,14 @@ void NodeMap::UpdateOutput(const string& node_name,
                            const string& old_output_name,
                            const string& new_output_name) {
   std::set<NodeDef*>& outputs = outputs_[node_name];
-  outputs.erase(nodes_[old_output_name]);
-  outputs.insert(nodes_[new_output_name]);
+  outputs.erase(nodes_[NodeName(old_output_name)]);
+  outputs.insert(nodes_[NodeName(new_output_name)]);
 }
 
 OutputMap::OutputMap(GraphDef* graph) : graph_(graph) {
   for (int i = 0; i < graph_->node_size(); i++) {
     auto node = graph_->mutable_node(i);
-    auto rslt = nodes_.insert(std::make_pair(node->name(), node));
+    auto rslt = nodes_.emplace(node->name(), node);
     // Check that the graph doesn't contain multiple nodes with the same name.
     CHECK(rslt.second);
     for (const auto& input : node->input()) {
@@ -247,5 +261,160 @@ int NumOutputs(const NodeDef& node) {
   return num_outputs;
 }
 
+int NumNonControlInputs(const NodeDef& node) {
+  int num_inputs = node.input_size();
+  for (const string& input : node.input()) {
+    if (IsControlInput(input)) {
+      --num_inputs;
+    }
+  }
+  return num_inputs;
+}
+
+int NumNonControlOutputs(const NodeDef& node, const NodeMap& node_map) {
+  int num_outputs = 0;
+  for (const NodeDef* output : node_map.GetOutputs(node.name())) {
+    for (const string& node_as_input : output->input()) {
+      if (IsControlInput(node_as_input)) {
+        break;
+      }
+      if (NodeName(node_as_input) == node.name()) {
+        ++num_outputs;
+      }
+    }
+  }
+  return num_outputs;
+}
+
+// Returns the data type in attribute `attr_name` of `node`. If that attribute
+// doesn't exist, returns DT_INVALID.
+DataType GetDataTypeFromAttr(const NodeDef& node, const string& attr_name) {
+  if (!node.attr().count(attr_name)) {
+    return DT_INVALID;
+  }
+  const auto& attr = node.attr().at(attr_name);
+  if (attr.value_case() != AttrValue::kType) {
+    return DT_INVALID;
+  }
+  return attr.type();
+}
+
+NodeDef* GetTailOfChain(const NodeDef& source, const NodeMap& node_map,
+                        bool follow_control_input,
+                        const std::function<bool(const NodeDef&)>& pred_fn) {
+  const NodeDef* current = &source;
+  const NodeDef* next = current;
+  while (next == &source || (next != nullptr && pred_fn(*next))) {
+    current = next;
+    if (current->input_size() == 0 ||
+        (!follow_control_input && IsControlInput(current->input(0)))) {
+      break;
+    }
+    next = node_map.GetNode(current->input(0));
+    if (next == nullptr) {
+      LOG(ERROR) << "Node not found: " << current->input(0);
+    }
+  }
+  return const_cast<NodeDef*>(current);
+}
+
+// Every permutation is a product of one or more cycles. Iterate over the cycles
+// in the permutation, and convert each of those into a product of
+// transpositions (swaps): https://en.wikipedia.org/wiki/Cyclic_permutation
+void PermuteNodesInPlace(GraphDef* graph, std::vector<int>* permutation,
+                         bool invert_permutation) {
+  CHECK_EQ(graph->node_size(), permutation->size());
+  std::vector<int> inv_perm(permutation->size(), 0);
+  if (invert_permutation) {
+    for (size_t n = 0; n < permutation->size(); ++n) {
+      inv_perm[(*permutation)[n]] = n;
+    }
+    permutation->swap(inv_perm);
+  }
+  for (std::size_t n = 0; n + 1 < permutation->size(); ++n) {
+    while (n != (*permutation)[n]) {
+      std::size_t r = (*permutation)[n];
+      graph->mutable_node()->SwapElements(n, r);
+      std::swap((*permutation)[n], (*permutation)[r]);
+    }
+  }
+}
+
+namespace {
+template <typename T>
+inline void STLSortAndRemoveDuplicates(T* v) {
+  std::sort(v->begin(), v->end());
+  v->erase(std::unique(v->begin(), v->end()), v->end());
+}
+}  // namespace
+
+Status SimpleGraphView::Initialize(const GraphDef& graph, bool dedup_inputs,
+                                   bool dedup_outputs) {
+  const int num_nodes = graph.node_size();
+  inputs_.clear();
+  inputs_.resize(num_nodes);
+  outputs_.clear();
+  outputs_.resize(num_nodes);
+  name_to_index_.clear();
+  name_to_index_.reserve(num_nodes);
+  index_to_name_.clear();
+  index_to_name_.reserve(num_nodes);
+
+  // Build map from name to index and vice versa.
+  for (int node_idx = 0; node_idx < num_nodes; ++node_idx) {
+    const NodeDef& node = graph.node(node_idx);
+    name_to_index_.emplace(node.name(), node_idx);
+    index_to_name_.push_back(node.name());
+  }
+
+  // Build forward and reverse adjacency lists.
+  for (int node_idx = 0; node_idx < num_nodes; ++node_idx) {
+    const NodeDef& node = graph.node(node_idx);
+    inputs_[node_idx].reserve(node.input_size());
+    for (const string& input : node.input()) {
+      auto it = name_to_index_.find(NodeName(input));
+      if (it == name_to_index_.end()) {
+        return errors::InvalidArgument("Non-existent input ", input,
+                                       " for node ", node.name());
+      }
+      const int input_idx = it->second;
+      inputs_[node_idx].push_back(input_idx);
+      outputs_[input_idx].push_back(node_idx);
+    }
+    if (dedup_inputs) {
+      // Dedup the input list while it's still hot in cache.
+      STLSortAndRemoveDuplicates(&inputs_[node_idx]);
+    }
+  }
+
+  // Dedup outputs.
+  if (dedup_outputs) {
+    for (int node_idx = 0; node_idx < num_nodes; ++node_idx) {
+      STLSortAndRemoveDuplicates(&outputs_[node_idx]);
+    }
+  }
+  return Status::OK();
+}
+
+string SimpleGraphView::PrintToString() const {
+  string str;
+  for (int i = 0; i < num_nodes(); ++i) {
+    strings::StrAppend(&str, "Node ", i, "'", node_name(i), "'\n", "Inputs: [");
+    for (int input : inputs(i)) {
+      strings::StrAppend(&str, input, " '", node_name(input), "', ");
+    }
+    strings::StrAppend(&str, "]\n", "Outputs: [");
+    for (int j = 0; j < outputs(i).size(); ++j) {
+      const int output = outputs(i)[j];
+      if (j > 0) {
+        strings::StrAppend(&str, ", ");
+      }
+      strings::StrAppend(&str, output, " '", node_name(output), "'");
+    }
+    strings::StrAppend(&str, "]\n");
+  }
+  return str;
+}
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h
index f9fb418140833af4b1805ff3b02b5666d886407b..476ab8b51afcee839d8f30378d2fa00ed8406cc7 100644
--- a/tensorflow/core/grappler/utils.h
+++ b/tensorflow/core/grappler/utils.h
@@ -17,12 +17,16 @@ limitations under the License.
 #define TENSORFLOW_GRAPPLER_UTILS_H_
 
 #include <functional>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
 
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -30,12 +34,16 @@ namespace grappler {
 // A utility class to lookup a node and its outputs by node name.
 class NodeMap {
  public:
+  // Note: The NodeMap will store pointers to nodes in graph, which may become
+  // invalid if graph is changed.
   explicit NodeMap(GraphDef* graph);
   NodeDef* GetNode(const string& name) const;
+  bool NodeExists(const string& name) const;
   const std::set<NodeDef*>& GetOutputs(const string& node_name) const;
   // This method doesn't record the outputs of the added node; the outputs need
   // to be explicitly added by the AddOutput method.
   void AddNode(const string& name, NodeDef* node);
+  void RemoveNode(const string& name);
   void UpdateInput(const string& node_name, const string& old_input_name,
                    const string& new_input_name);
   void AddOutput(const string& node_name, const string& output_name);
@@ -46,8 +54,7 @@ class NodeMap {
                     const string& new_output_name);
 
  private:
-  GraphDef* graph_;
-  std::set<NodeDef*> empty_set_;
+  const std::set<NodeDef*> empty_set_;
   std::unordered_map<string, NodeDef*> nodes_;
   std::unordered_map<string, std::set<NodeDef*>> outputs_;
 };
@@ -68,6 +75,39 @@ class OutputMap {
   std::unordered_map<string, std::unordered_map<NodeDef*, int>> outputs_;
 };
 
+// A vector with a set. The set stores the same elements as the vector, and
+// quickly answers whether a value is in the vector. Duplicated elements are not
+// allowed for now.
+template <class T>
+class SetVector {
+ public:
+  // Returns false if value already existed in the set, true otherwise.
+  bool PushBack(const T& value) {
+    if (!set_.insert(value).second) {
+      return false;
+    }
+    vector_.push_back(value);
+    return true;
+  }
+
+  T PopBack() {
+    T back = vector_.back();
+    set_.erase(back);
+    vector_.pop_back();
+    return back;
+  }
+
+  bool Exists(const T& value) const { return set_.find(value) != set_.end(); }
+
+  bool Empty() const { return vector_.empty(); }
+
+  void Reserve(int64 size) { vector_.reserve(size); }
+
+ private:
+  std::unordered_set<T> set_;
+  std::vector<T> vector_;
+};
+
 // True iff 'name' refers to a control inputs, i.e. a node name prefixed with
 // the ^ character.
 bool IsControlInput(const string& name);
@@ -109,10 +149,70 @@ string AsControlDependency(const NodeDef& node);
 // for control dependency, given a node name
 string AsControlDependency(const string& node);
 
-// Returns the number of outputs of a node. Note that some of the outputs may be
-// unconnected.
+// Returns the number of outputs of a node according to its OpDef. Note that
+// some of the outputs may be unconnected.
 int NumOutputs(const NodeDef& node);
 
+// Number of connected non-control inputs.
+int NumNonControlInputs(const NodeDef& node);
+
+// Number of connected non-control outputs.
+int NumNonControlOutputs(const NodeDef& node, const NodeMap& node_map);
+
+// Returns the data type in attribute `attr_name` of `node`. If that attribute
+// doesn't exist, returns DT_INVALID.
+DataType GetDataTypeFromAttr(const NodeDef& node, const string& attr_name);
+
+// Returns the last node in the simple chain starting at source and traversing
+// through the input(0) edge from each node as long as the next node satisfies
+// the predicate given in pred_fn. If no nodes satisfy the predicate, &source
+// will be returned. Example: For the chain
+//    source <- a <- b <- ... <- y <- z
+// where
+//    pred_fn(a) = pred_fn(b) = ... = pred_fn(y) = true,
+//    pred_fn(z) = false,
+// the return value will be a pointer to y.
+NodeDef* GetTailOfChain(const NodeDef& source, const NodeMap& node_map,
+                        bool follow_control_input,
+                        const std::function<bool(const NodeDef&)>& pred_fn);
+
+// Permute the nodes of graph in place according to the permutation.
+void PermuteNodesInPlace(GraphDef* graph, std::vector<int>* permutation,
+                         bool invert_permutation);
+
+class SimpleGraphView {
+ public:
+  Status Initialize(const GraphDef& graph) {
+    return Initialize(graph, true, true);
+  }
+  Status Initialize(const GraphDef& graph, bool dedup_inputs,
+                    bool dedup_outputs);
+
+  inline int num_nodes() const { return index_to_name_.size(); }
+  inline const int index(const string& node_name) const {
+    const auto& it = name_to_index_.find(node_name);
+    DCHECK(it != name_to_index_.end());
+    return it == name_to_index_.end() ? -1 : it->second;
+  }
+  inline const string& node_name(int node_idx) const {
+    return index_to_name_[node_idx];
+  }
+  inline const gtl::InlinedVector<int, 4>& inputs(int node_idx) const {
+    return inputs_[node_idx];
+  }
+  inline const gtl::InlinedVector<int, 2>& outputs(int node_idx) const {
+    return outputs_[node_idx];
+  }
+
+  string PrintToString() const;
+
+ private:
+  std::vector<string> index_to_name_;
+  std::unordered_map<string, int> name_to_index_;
+  std::vector<gtl::InlinedVector<int, 4>> inputs_;
+  std::vector<gtl::InlinedVector<int, 2>> outputs_;
+};
+
 }  // end namespace grappler
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index 21243833accff6ca3423c505091900564094557d..534f7a063fe90bf72f8a2afba7ae8f75b8472a36 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -53,6 +53,7 @@ cc_library(
     hdrs = ["topological_sort.h"],
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:op_types",
diff --git a/tensorflow/core/grappler/utils/topological_sort.cc b/tensorflow/core/grappler/utils/topological_sort.cc
index 77d4702d21e75b1689875eb17fbd2cda41aa1ba8..8d8ff4da3a8df5a2868f1a3a0ac6a5d0c2fd66ad 100644
--- a/tensorflow/core/grappler/utils/topological_sort.cc
+++ b/tensorflow/core/grappler/utils/topological_sort.cc
@@ -19,61 +19,56 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace grappler {
 
 // Kahn's algorithm is implemented.
 // For details, see https://en.wikipedia.org/wiki/Topological_sorting
-void TopologicalSort(GraphDef* graph) {
-  OutputMap output_map(graph);
-  std::vector<NodeDef*> ready_nodes;
-  ready_nodes.reserve(graph->node_size());
+Status TopologicalSort(GraphDef* graph) {
+  SimpleGraphView graph_view;
+  TF_RETURN_IF_ERROR(graph_view.Initialize(*graph));
+
+  std::vector<int> ready_nodes;
+  ready_nodes.reserve(graph_view.num_nodes());
+
   int front = 0;
   int back = 0;
-  std::unordered_map<const NodeDef*, int> ready_inputs;
-  for (int i = 0; i < graph->node_size(); i++) {
-    auto node = graph->mutable_node(i);
-    if (node->input_size() == 0) {
-      ready_nodes.push_back(node);
+  std::vector<int> num_ready_inputs(graph_view.num_nodes(), 0);
+  for (int i = 0; i < graph_view.num_nodes(); i++) {
+    if (graph_view.inputs(i).empty()) {
+      ready_nodes.push_back(i);
       back++;
     }
-    if (IsMerge(*node)) {
-      ready_inputs[node] = 0;
-      for (const auto& input : node->input()) {
-        if (IsNextIteration(*output_map.GetNode(input))) {
-          ready_inputs[node]++;
+    if (IsMerge(graph->node(i))) {
+      for (int input : graph_view.inputs(i)) {
+        if (IsNextIteration(graph->node(input))) {
+          num_ready_inputs[i]++;
         }
       }
-    } else {
-      ready_inputs[node] = 0;
     }
   }
 
   while (front != back) {
-    auto ready_node = ready_nodes[front];
-    for (const auto& fanout_pair : output_map.GetOutputs(ready_node->name())) {
-      auto fanout = fanout_pair.first;
-      ready_inputs[fanout] += fanout_pair.second;
-      if (ready_inputs[fanout] == fanout->input_size()) {
+    int ready_node = ready_nodes[front];
+    for (int fanout : graph_view.outputs(ready_node)) {
+      ++num_ready_inputs[fanout];
+      if (num_ready_inputs[fanout] == graph_view.inputs(fanout).size()) {
         ready_nodes.push_back(fanout);
-        back++;
+        ++back;
       }
     }
-    front++;
+    ++front;
   }
 
-  if (back == graph->node_size()) {
-    GraphDef new_graph;
-    new_graph.mutable_node()->Reserve(graph->node_size());
-    for (int i = 0; i < graph->node_size(); i++) {
-      auto new_node = new_graph.add_node();
-      new_node->Swap(ready_nodes[i]);
-    }
-    graph->mutable_node()->Swap(new_graph.mutable_node());
-  } else {
-    LOG(ERROR) << "The graph couldn't be sorted in topological order.";
+  if (back != graph_view.num_nodes()) {
+    return errors::InvalidArgument(
+        "The graph couldn't be sorted in topological order.");
   }
+
+  PermuteNodesInPlace(graph, &ready_nodes, /*invert_permutation=*/true);
+  return Status::OK();
 }
 
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/utils/topological_sort.h b/tensorflow/core/grappler/utils/topological_sort.h
index d4d8034ef577a0282dbce161aed8ba440bf248ab..f2c9bbfa4ebce373a4fa80f399ce3d2b59a576f4 100644
--- a/tensorflow/core/grappler/utils/topological_sort.h
+++ b/tensorflow/core/grappler/utils/topological_sort.h
@@ -17,12 +17,13 @@ limitations under the License.
 #define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
 
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace grappler {
 
 // Sort a graph in topological order.
-void TopologicalSort(GraphDef* graph);
+Status TopologicalSort(GraphDef* graph);
 
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/topological_sort_test.cc b/tensorflow/core/grappler/utils/topological_sort_test.cc
index dc99cb1052ce9db3035401a2cd75e838281fb748..c96f15b0e8424d70e8dd1393cf254b52f69200d2 100644
--- a/tensorflow/core/grappler/utils/topological_sort_test.cc
+++ b/tensorflow/core/grappler/utils/topological_sort_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -51,7 +52,7 @@ TEST_F(TopologicalSortTest, NoLoop) {
   *graph.add_node() = CreateNode("5", {});
   *graph.add_node() = CreateNode("4", {});
 
-  TopologicalSort(&graph);
+  TF_EXPECT_OK(TopologicalSort(&graph));
   std::vector<string> order = {"5", "4", "2", "0", "3", "1"};
   for (int i = 0; i < order.size(); i++) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
@@ -67,7 +68,7 @@ TEST_F(TopologicalSortTest, WithLoop) {
   *graph.add_node() = CreateNode("5", "NextIteration", {"4"});
   *graph.add_node() = CreateNode("1", {});
 
-  TopologicalSort(&graph);
+  TF_EXPECT_OK(TopologicalSort(&graph));
   std::vector<string> order = {"1", "2", "3", "4", "5"};
   for (int i = 0; i < order.size(); i++) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
@@ -82,7 +83,7 @@ TEST_F(TopologicalSortTest, WithIllegalLoop) {
   *graph.add_node() = CreateNode("3", {"2"});
   *graph.add_node() = CreateNode("1", {});
 
-  TopologicalSort(&graph);
+  EXPECT_FALSE(TopologicalSort(&graph).ok());
   std::vector<string> order = {"2", "3", "1"};
   for (int i = 0; i < order.size(); i++) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
@@ -94,13 +95,34 @@ TEST_F(TopologicalSortTest, DuplicatedInputs) {
   *graph.add_node() = CreateNode("2", {"1", "1"});
   *graph.add_node() = CreateNode("1", {});
 
-  TopologicalSort(&graph);
+  TF_EXPECT_OK(TopologicalSort(&graph));
   std::vector<string> order = {"1", "2"};
   for (int i = 0; i < order.size(); i++) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
   }
 }
 
+TEST_F(TopologicalSortTest, Idempotent) {
+  GraphDef graph;
+  *graph.add_node() = CreateNode("1", {});
+  *graph.add_node() = CreateNode("2", {});
+  *graph.add_node() = CreateNode("3", {"1", "2"});
+  *graph.add_node() = CreateNode("4", {"1", "3"});
+  *graph.add_node() = CreateNode("5", {"2", "3"});
+
+  TF_EXPECT_OK(TopologicalSort(&graph));
+  std::vector<string> order = {"1", "2", "3", "4", "5"};
+  for (int i = 0; i < order.size(); i++) {
+    EXPECT_EQ(graph.node(i).name(), order[i]);
+  }
+
+  // Run topo sort again to verify that it is idenpotent.
+  TF_EXPECT_OK(TopologicalSort(&graph));
+  for (int i = 0; i < order.size(); i++) {
+    EXPECT_EQ(graph.node(i).name(), order[i]);
+  }
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils_test.cc b/tensorflow/core/grappler/utils_test.cc
index 9d747fe7dc4e7bb739cb6f97a389df1de8417e20..77371c399e5fc7321f7c2b271aae32ce9655244b 100644
--- a/tensorflow/core/grappler/utils_test.cc
+++ b/tensorflow/core/grappler/utils_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/threadpool.h"
@@ -181,7 +182,7 @@ TEST_F(UtilsTest, NumOutputs) {
   EXPECT_EQ(1, NumOutputs(CreateDequeueNode()));
 }
 
-TEST(AsControlDependency, BasicTest) {
+TEST_F(UtilsTest, AsControlDependency) {
   NodeDef node;
   node.set_name("foo");
   EXPECT_EQ("^foo", AsControlDependency(node));
@@ -189,6 +190,65 @@ TEST(AsControlDependency, BasicTest) {
   EXPECT_EQ("^foo", AsControlDependency("^foo"));
 }
 
+TEST_F(UtilsTest, GetTailOfChain) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output c0 = ops::Const(s.WithOpName("c0"), {1.0f, 2.0f}, {1, 2});
+  Output c1 = ops::Const(s.WithOpName("c1"), {3.0f, 4.0f}, {1, 2});
+  // Add a node with only connected by control output.
+  Output neg0 = ops::Neg(s.WithOpName("neg0"), c1);
+  // Add a node with two outputs.
+  Output neg1 =
+      ops::Neg(s.WithControlDependencies(neg0).WithOpName("neg1"), c0);
+  Output neg2 = ops::Neg(s.WithOpName("neg2"), neg1);
+  Output id1 = ops::Identity(s.WithOpName("id1"), neg2);
+  Output id2 = ops::Identity(s.WithOpName("id2"), neg1);
+  auto noop = ops::NoOp(s.WithControlDependencies(neg0).WithOpName("noop"));
+  GraphDef graph;
+  TF_CHECK_OK(s.ToGraphDef(&graph));
+  LOG(INFO) << graph.DebugString();
+
+  ASSERT_EQ("c0", graph.node(0).name());
+  ASSERT_EQ("c1", graph.node(1).name());
+  ASSERT_EQ("neg0", graph.node(2).name());
+  ASSERT_EQ("neg1", graph.node(3).name());
+  ASSERT_EQ("neg2", graph.node(4).name());
+  ASSERT_EQ("id1", graph.node(5).name());
+  ASSERT_EQ("id2", graph.node(6).name());
+  ASSERT_EQ("noop", graph.node(7).name());
+
+  NodeMap node_map(&graph);
+  auto is_neg = [&](const NodeDef& node) { return node.op() == "Neg"; };
+  // We walk backwards, starting as "id1", so tail should be "neg1".
+  NodeDef* tail = GetTailOfChain(graph.node(5), node_map,
+                                 /*follow_control_input=*/false, is_neg);
+  EXPECT_NE(tail, nullptr);
+  EXPECT_EQ("neg1", tail->name());
+
+  // We stop at branching nodes, so tail should be "neg2".
+  auto is_neg_and_non_branching = [&](const NodeDef& node) {
+    return node.op() == "Neg" && NumNonControlOutputs(node, node_map) == 1;
+  };
+  tail =
+      GetTailOfChain(graph.node(5), node_map,
+                     /*follow_control_input=*/false, is_neg_and_non_branching);
+  EXPECT_NE(tail, nullptr);
+  EXPECT_EQ("neg2", tail->name());
+
+  // We walk backwards, starting from "noop", also following control inputs,
+  // so tail should be "neg0".
+  tail = GetTailOfChain(graph.node(7), node_map,
+                        /*follow_control_input=*/true, is_neg);
+  EXPECT_NE(tail, nullptr);
+  EXPECT_EQ("neg0", tail->name());
+
+  // We walk backwards, starting from "noop", not following control inputs,
+  // so tail should be "noop" itself.
+  tail = GetTailOfChain(graph.node(7), node_map,
+                        /*follow_control_input=*/false, is_neg);
+  EXPECT_NE(tail, nullptr);
+  EXPECT_EQ("noop", tail->name());
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index f49113277788c464ac9d6288996a3f437bbd939e..ae39c4522dbf304b5156478f2b5571180cba567d 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -269,13 +269,11 @@ cc_library(
 cc_library(
     name = "conv_ops_gpu_hdrs",
     hdrs = ["conv_ops_gpu.h"],
-    deps = ["//third_party/eigen3"],
 )
 
 cc_library(
     name = "gpu_util_hdrs",
     hdrs = ["gpu_utils.h"],
-    deps = ["//third_party/eigen3"],
 )
 
 tf_cc_test(
@@ -291,6 +289,17 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "reshape_util",
+    srcs = ["reshape_util.cc"],
+    hdrs = ["reshape_util.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
 tf_cc_test(
     name = "variable_ops_test",
     size = "small",
@@ -329,6 +338,7 @@ cc_library(
     srcs = ["queue_base.cc"],
     hdrs = ["queue_base.h"],
     deps = [
+        ":batch_util",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
@@ -350,6 +360,7 @@ cc_library(
     srcs = ["priority_queue.cc"],
     hdrs = ["priority_queue.h"],
     deps = [
+        ":batch_util",
         ":queue_base",
         ":typed_queue",
         "//tensorflow/core:framework",
@@ -576,6 +587,7 @@ cc_library(
         ":extract_image_patches_op",
         ":gather_nd_op",
         ":gather_op",
+        ":guarantee_const_op",
         ":identity_n_op",
         ":identity_op",
         ":inplace_ops",
@@ -593,6 +605,7 @@ cc_library(
         ":reverse_sequence_op",
         ":shape_ops",
         ":slice_op",
+        ":snapshot_op",
         ":split_op",
         ":split_v_op",
         ":strided_slice_op",
@@ -622,6 +635,12 @@ tf_kernel_library(
     deps = ARRAY_DEPS,
 )
 
+tf_kernel_library(
+    name = "guarantee_const_op",
+    prefix = "guarantee_const_op",
+    deps = ARRAY_DEPS,
+)
+
 tf_kernel_library(
     name = "constant_op",
     prefix = "constant_op",
@@ -783,6 +802,12 @@ tf_kernel_library(
     deps = ARRAY_DEPS + [":strided_slice_op"],
 )
 
+tf_kernel_library(
+    name = "snapshot_op",
+    prefix = "snapshot_op",
+    deps = ARRAY_DEPS,
+)
+
 tf_kernel_library(
     name = "split_op",
     gpu_srcs = ["cuda_device_array.h"],
@@ -1180,6 +1205,25 @@ tf_cuda_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "guarantee_const_op_test",
+    size = "small",
+    srcs = ["guarantee_const_op_test.cc"],
+    deps = [
+        ":guarantee_const_op",
+        ":ops_testutil",
+        ":ops_util",
+        ":variable_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_cc_test(
     name = "identity_op_test",
     size = "small",
@@ -1581,7 +1625,10 @@ tf_kernel_library(
 tf_kernel_library(
     name = "random_shuffle_queue_op",
     prefix = "random_shuffle_queue_op",
-    deps = DATA_FLOW_DEPS + ["//tensorflow/core:protos_all_cc"],
+    deps = DATA_FLOW_DEPS + [
+        ":batch_util",
+        "//tensorflow/core:protos_all_cc",
+    ],
 )
 
 tf_kernel_library(
@@ -1704,6 +1751,7 @@ tf_cuda_cc_tests(
         ":data_flow",
         ":ops_testutil",
         ":ops_util",
+        "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
@@ -1734,6 +1782,7 @@ cc_library(
     hdrs = ["fifo_queue.h"],
     visibility = ["//visibility:private"],
     deps = [
+        ":batch_util",
         ":queue_base",
         ":typed_queue",
         "//tensorflow/core:framework",
@@ -1748,6 +1797,7 @@ cc_library(
     hdrs = ["padding_fifo_queue.h"],
     visibility = ["//visibility:private"],
     deps = [
+        ":batch_util",
         ":fifo_queue",
         ":queue_base",
         ":typed_queue",
@@ -2564,8 +2614,13 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "batch_matmul_op",
+    srcs = [] + if_mkl([
+        "mkl_batch_matmul_op.cc",
+    ]),
     prefix = "batch_matmul_op",
-    deps = MATH_DEPS,
+    deps = MATH_DEPS + if_mkl([
+        "//third_party/mkl:intel_binary_blob",
+    ]),
 )
 
 tf_kernel_library(
@@ -3065,6 +3120,7 @@ cc_library(
         ":batch_norm_op",
         ":bias_op",
         ":conv_ops",
+        ":data_format_ops",
         ":depthwise_conv_grad_op",
         ":depthwise_conv_op",
         ":dilation_ops",
@@ -3102,6 +3158,12 @@ tf_kernel_library(
     deps = NN_DEPS,
 )
 
+tf_kernel_library(
+    name = "data_format_ops",
+    prefix = "data_format_ops",
+    deps = NN_DEPS,
+)
+
 tf_kernel_library(
     name = "bias_op",
     prefix = "bias_op",
@@ -3433,6 +3495,7 @@ tf_kernel_library(
 cc_library(
     name = "parsing",
     deps = [
+        ":decode_compressed_op",
         ":decode_csv_op",
         ":decode_raw_op",
         ":example_parsing_ops",
@@ -3461,6 +3524,14 @@ tf_kernel_library(
     deps = PARSING_DEPS,
 )
 
+tf_kernel_library(
+    name = "decode_compressed_op",
+    prefix = "decode_compressed_op",
+    deps = [
+        "//tensorflow/core:lib_internal",
+    ] + PARSING_DEPS,
+)
+
 tf_kernel_library(
     name = "example_parsing_ops",
     prefix = "example_parsing_ops",
@@ -3682,7 +3753,9 @@ tf_kernel_library(
 tf_kernel_library(
     name = "sparse_reshape_op",
     prefix = "sparse_reshape_op",
-    deps = SPARSE_DEPS,
+    deps = SPARSE_DEPS + [
+        ":reshape_util",
+    ],
 )
 
 tf_kernel_library(
@@ -3730,7 +3803,10 @@ tf_kernel_library(
 tf_kernel_library(
     name = "serialize_sparse_op",
     prefix = "serialize_sparse_op",
-    deps = SPARSE_DEPS + ["//tensorflow/core:protos_all_cc"],
+    deps = SPARSE_DEPS + [
+        ":reshape_util",
+        "//tensorflow/core:protos_all_cc",
+    ],
 )
 
 tf_kernel_library(
@@ -3885,6 +3961,8 @@ tf_kernel_library(
         "scatter_nd_op_cpu_impl_3.cc",
         "scatter_nd_op_cpu_impl_4.cc",
         "scatter_nd_op_cpu_impl_5.cc",
+        "scatter_nd_op_cpu_impl_6.cc",
+        "scatter_nd_op_cpu_impl_7.cc",
     ],
     hdrs = [
         "scatter_nd_op.h",
@@ -3894,7 +3972,11 @@ tf_kernel_library(
         "scatter_nd_op.h",
         "scatter_nd_op_gpu.cu.cc",
     ],
-    deps = STATE_DEPS + [":dense_update_functor"],
+    deps = STATE_DEPS + [
+        ":dense_update_functor",
+        ":training_op_helpers",
+        ":variable_ops",
+    ],
 )
 
 tf_kernel_library(
@@ -4366,6 +4448,7 @@ filegroup(
     name = "mobile_srcs",
     srcs = [
         "avgpooling_op.h",
+        "batch_util.h",
         "bounds_check.h",
         "cwise_ops.h",
         "cwise_ops_common.h",
@@ -4452,6 +4535,8 @@ filegroup(
         "gather_nd_op_cpu_impl_3.cc",
         "gather_nd_op_cpu_impl_4.cc",
         "gather_nd_op_cpu_impl_5.cc",
+        "gather_nd_op_cpu_impl_6.cc",
+        "gather_nd_op_cpu_impl_7.cc",
         "gather_op.cc",
         "identity_n_op.cc",
         "identity_n_op.h",
@@ -4539,6 +4624,7 @@ filegroup(
         "control_flow_ops.h",
         "conv_2d.h",
         "conv_ops.h",
+        "data_format_ops.h",
         "depthtospace_op.h",
         "depthwise_conv_op.h",
         "fake_quant_ops_functor.h",
@@ -4561,6 +4647,7 @@ filegroup(
         "reduction_ops_common.h",
         "relu_op.h",
         "relu_op_functor.h",
+        "reshape_util.h",
         "resize_bilinear_op.h",
         "resize_nearest_neighbor_op.h",
         "reverse_op.h",
@@ -4651,6 +4738,7 @@ filegroup(
         "cwise_op_squared_difference.cc",
         "cwise_op_sub.cc",
         "cwise_op_tanh.cc",
+        "data_format_ops.cc",
         "decode_wav_op.cc",
         "deep_conv2d.cc",
         "deep_conv2d.h",
@@ -4677,6 +4765,7 @@ filegroup(
 filegroup(
     name = "android_extended_ops_group2",
     srcs = [
+        "batch_util.cc",
         "batchtospace_op.cc",
         "ctc_decoder_ops.cc",
         "decode_bmp_op.cc",
@@ -4715,6 +4804,7 @@ filegroup(
         "reduction_ops_prod.cc",
         "reduction_ops_sum.cc",
         "relu_op.cc",
+        "reshape_util.cc",
         "resize_bilinear_op.cc",
         "resize_nearest_neighbor_op.cc",
         "restore_op.cc",
@@ -4841,7 +4931,6 @@ filegroup(
             "summary_interface.*",
             "summary_kernels.*",
             "spectrogram_convert_test_data.cc",
-            "sql_dataset_ops.cc",
             # Excluded due to experimental status:
             "debug_ops.*",
             "scatter_nd_op*",
@@ -5015,7 +5104,6 @@ tf_cc_test(
         "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "//third_party/eigen3",
     ],
@@ -5058,7 +5146,6 @@ tf_cc_binary(
             "//tensorflow/cc:client_session",
             "//tensorflow/core:framework",
             "//tensorflow/core:tensor_testutil",
-            "//tensorflow/core:test_main",
         ],
     }),
 )
@@ -5118,7 +5205,6 @@ cc_binary(
             "//tensorflow/core:tensor_testutil",
             "//tensorflow/core:tensorflow",
             "//tensorflow/core:test",
-            "//tensorflow/core:test_main",
         ],
     }),
 )
@@ -5142,7 +5228,6 @@ tf_cc_test(
         "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
     ],
 )
@@ -5163,7 +5248,6 @@ tf_cc_test(
         "//tensorflow/core:image_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
     ],
 )
@@ -5207,7 +5291,6 @@ cc_binary(
             "//tensorflow/core:image_ops_op_lib",
             "//tensorflow/core:protos_all_cc",
             "//tensorflow/core:test",
-            "//tensorflow/core:test_main",
             "//tensorflow/core:testlib",
         ],
     }),
@@ -5321,7 +5404,6 @@ cc_binary(
             ":quantized_ops",
             "//tensorflow/core:framework",
             "//tensorflow/core:tensor_testutil",
-            "//tensorflow/core:test_main",
             "//tensorflow/core:protos_all_cc",
             "//tensorflow/core:test",
         ],
@@ -5347,7 +5429,6 @@ tf_cc_test(
         "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
     ],
 )
@@ -5459,7 +5540,6 @@ cc_binary(
         "//conditions:default": [
             "//tensorflow/core:framework",
             "//tensorflow/core:tensor_testutil",
-            "//tensorflow/core:test_main",
         ],
     }),
 )
@@ -5480,7 +5560,6 @@ tf_cc_test(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
     ],
 )
@@ -5776,463 +5855,35 @@ tf_mkl_kernel_library(
 )
 
 cc_library(
-    name = "dataset",
-    srcs = ["dataset.cc"],
-    hdrs = ["dataset.h"],
-    deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core/util/tensor_bundle",
-    ],
-)
-
-cc_library(
-    name = "dataset_utils",
-    srcs = ["dataset_utils.cc"],
-    hdrs = ["dataset_utils.h"],
+    name = "batch_util",
+    srcs = ["batch_util.cc"],
+    hdrs = ["batch_util.h"],
     deps = [
-        ":captured_function",
-        ":dataset",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
     ],
 )
 
 cc_library(
     name = "captured_function",
-    srcs = ["captured_function.cc"],
     hdrs = ["captured_function.h"],
     deps = [
-        ":dataset",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:proto_text",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:session_options",
-        "//tensorflow/core/kernels:variable_ops",
+        "//tensorflow/core/kernels/data:captured_function",
     ],
 )
 
 cc_library(
-    name = "window_dataset",
-    srcs = ["window_dataset.cc"],
-    hdrs = ["window_dataset.h"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "batch_dataset_op",
-    srcs = ["batch_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "padded_batch_dataset_op",
-    srcs = ["padded_batch_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "dense_to_sparse_batch_dataset_op",
-    srcs = ["dense_to_sparse_batch_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "group_by_window_dataset_op",
-    srcs = ["group_by_window_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":dataset",
-        ":window_dataset",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "filter_dataset_op",
-    srcs = ["filter_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":dataset",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "map_dataset_op",
-    srcs = ["map_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":dataset",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "map_and_batch_dataset_op",
-    srcs = ["map_and_batch_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":dataset",
-        ":inplace_ops",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "parallel_map_dataset_op",
-    srcs = ["parallel_map_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":dataset",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "scan_dataset_op",
-    srcs = ["scan_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":dataset",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "flat_map_dataset_op",
-    srcs = ["flat_map_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":dataset",
-        ":dataset_utils",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "interleave_dataset_op",
-    srcs = ["interleave_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":dataset",
-        ":dataset_utils",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "parallel_interleave_dataset_op",
-    srcs = ["parallel_interleave_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":dataset",
-        ":dataset_utils",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "prefetch_dataset_op",
-    srcs = ["prefetch_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "repeat_dataset_op",
-    srcs = ["repeat_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "take_dataset_op",
-    srcs = ["take_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "skip_dataset_op",
-    srcs = ["skip_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "ignore_errors_dataset_op",
-    srcs = ["ignore_errors_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "range_dataset_op",
-    srcs = ["range_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "shuffle_dataset_op",
-    srcs = ["shuffle_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "sparse_tensor_slice_dataset_op",
-    srcs = ["sparse_tensor_slice_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "tensor_dataset_op",
-    srcs = ["tensor_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "tensor_slice_dataset_op",
-    srcs = ["tensor_slice_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "zip_dataset_op",
-    srcs = ["zip_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "concatenate_dataset_op",
-    srcs = ["concatenate_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "reader_dataset_ops",
-    srcs = ["reader_dataset_ops.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "sql_dataset_ops",
-    srcs = [
-        "sql/driver_manager.cc",
-        "sql/sqlite_query_connection.cc",
-        "sql_dataset_ops.cc",
-    ],
-    hdrs = [
-        "sql/driver_manager.h",
-        "sql/query_connection.h",
-        "sql/sqlite_query_connection.h",
-    ],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core/lib/db:sqlite",
-        "@sqlite_archive//:sqlite",
-    ],
-)
-
-tf_kernel_library(
-    name = "iterator_ops",
-    srcs = ["iterator_ops.cc"],
-    deps = [
-        ":dataset",
-        ":ops_util",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
-tf_kernel_library(
-    name = "cache_dataset_ops",
-    srcs = ["cache_dataset_ops.cc"],
+    name = "dataset",
+    hdrs = ["dataset.h"],
     deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core/util/tensor_bundle",
+        "//tensorflow/core/kernels/data:dataset",
     ],
 )
 
 tf_kernel_library(
     name = "dataset_ops",
     deps = [
-        ":batch_dataset_op",
-        ":cache_dataset_ops",
-        ":concatenate_dataset_op",
-        ":dense_to_sparse_batch_dataset_op",
-        ":filter_dataset_op",
-        ":flat_map_dataset_op",
-        ":group_by_window_dataset_op",
-        ":ignore_errors_dataset_op",
-        ":interleave_dataset_op",
-        ":iterator_ops",
-        ":map_and_batch_dataset_op",
-        ":map_dataset_op",
-        ":padded_batch_dataset_op",
-        ":parallel_interleave_dataset_op",
-        ":parallel_map_dataset_op",
-        ":prefetch_dataset_op",
-        ":range_dataset_op",
-        ":reader_dataset_ops",
-        ":repeat_dataset_op",
-        ":scan_dataset_op",
-        ":shuffle_dataset_op",
-        ":skip_dataset_op",
-        ":sparse_tensor_slice_dataset_op",
-        ":sql_dataset_ops",
-        ":take_dataset_op",
-        ":tensor_dataset_op",
-        ":tensor_slice_dataset_op",
-        ":zip_dataset_op",
+        "//tensorflow/core/kernels/data:dataset_ops",
     ],
 )
 
@@ -6290,3 +5941,31 @@ filegroup(
     ),
     visibility = ["//tensorflow:__subpackages__"],
 )
+
+# Library to link with when compiling the cwise_op kernels directly,
+# e.g. for selective registration.
+# should not be linked by projects that also link the cwise_op library.
+cc_library(
+    name = "cwise_lib",
+    srcs = [
+        "cwise_ops_common.cc",
+        "meta_support.cc",
+        "quantization_utils.cc",
+    ],
+    hdrs = [
+        "cwise_ops.h",
+        "cwise_ops_common.h",
+        "cwise_ops_gpu_common.cu.h",
+        "cwise_ops_gpu_gradients.cu.h",
+        "cwise_ops_gradients.h",
+        "meta_support.h",
+        "quantization_utils.h",
+    ],
+    deps = [
+        ":bounds_check",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//third_party/eigen3",
+        "@gemmlowp//:gemmlowp",
+    ],
+)
diff --git a/tensorflow/core/kernels/barrier_ops.cc b/tensorflow/core/kernels/barrier_ops.cc
index 3b880a963538b05789e73a9100ec5d5472d3c249..d0bbea9fe27856cc0dedb4570d285bd872741099 100644
--- a/tensorflow/core/kernels/barrier_ops.cc
+++ b/tensorflow/core/kernels/barrier_ops.cc
@@ -161,9 +161,11 @@ class Barrier : public ResourceBase {
         component_shape.InsertDim(0, insertion_size);
         Tensor component(ready_tuples[0][i].dtype(), component_shape);
         for (int b = 0; b < insertion_size; ++b) {
-          OP_REQUIRES_OK_ASYNC(ctx, QueueBase::CopyElementToSlice(
-                                        ready_tuples[b][i], &component, b),
-                               callback);
+          OP_REQUIRES_OK_ASYNC(
+              ctx,
+              batch_util::CopyElementToSlice(std::move(ready_tuples[b][i]),
+                                             &component, b),
+              callback);
         }
         insert_tuple.push_back(component);
       }
diff --git a/tensorflow/core/kernels/batch_matmul_op_complex.cc b/tensorflow/core/kernels/batch_matmul_op_complex.cc
index a58ec027262a0c2fab729d2c434098d2795d1d62..96216764fd46971db47b6a11be622cef63e5d103 100644
--- a/tensorflow/core/kernels/batch_matmul_op_complex.cc
+++ b/tensorflow/core/kernels/batch_matmul_op_complex.cc
@@ -17,8 +17,10 @@ limitations under the License.
 
 namespace tensorflow {
 
+#if !defined(INTEL_MKL)
 TF_CALL_complex64(REGISTER_BATCH_MATMUL_CPU);
 TF_CALL_complex128(REGISTER_BATCH_MATMUL_CPU);
+#endif
 
 #if GOOGLE_CUDA
 TF_CALL_complex64(REGISTER_BATCH_MATMUL_GPU);
diff --git a/tensorflow/core/kernels/batch_matmul_op_real.cc b/tensorflow/core/kernels/batch_matmul_op_real.cc
index 1900ed8e31483a84e216ea54bd08e6a4558bbfcb..8d155ca62b297a4bf59f62159d6b62b01f777721 100644
--- a/tensorflow/core/kernels/batch_matmul_op_real.cc
+++ b/tensorflow/core/kernels/batch_matmul_op_real.cc
@@ -17,8 +17,10 @@ limitations under the License.
 
 namespace tensorflow {
 
+#if !defined(INTEL_MKL)
 TF_CALL_float(REGISTER_BATCH_MATMUL_CPU);
 TF_CALL_double(REGISTER_BATCH_MATMUL_CPU);
+#endif
 TF_CALL_half(REGISTER_BATCH_MATMUL_CPU);
 TF_CALL_int32(REGISTER_BATCH_MATMUL_CPU);
 
diff --git a/tensorflow/core/kernels/batch_util.cc b/tensorflow/core/kernels/batch_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7f2df95e2d55ac93f8a934010244dcbd1dcd28c8
--- /dev/null
+++ b/tensorflow/core/kernels/batch_util.cc
@@ -0,0 +1,119 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/batch_util.h"
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+namespace batch_util {
+
+namespace {
+
+Status ValidateInput(const Tensor& parent, const Tensor& element, int64 index) {
+  DCHECK_NE(parent.dim_size(0), 0);
+  DCHECK_GE(index, 0);
+  if (element.NumElements() != (parent.NumElements() / parent.dim_size(0))) {
+    TensorShape chip_shape = parent.shape();
+    chip_shape.RemoveDim(0);
+    return errors::Internal(
+        "ValidateInput Cannot perform copy: number of elements does not match. "
+        " Shapes are: [element]: ",
+        element.shape().DebugString(),
+        ", [parent slice]: ", chip_shape.DebugString());
+  }
+  return Status::OK();
+}
+
+template <typename T>
+Status HandleElementToSlice(Tensor element, Tensor* parent, int64 index,
+                            bool /* can_move */) {
+  parent->flat_outer_dims<T>().chip(index, 0) = element.flat<T>();
+  return Status::OK();
+}
+
+template <>
+Status HandleElementToSlice<string>(Tensor element, Tensor* parent, int64 index,
+                                    bool can_move) {
+  auto parent_as_matrix = parent->flat_outer_dims<string>();
+  auto element_flat = element.flat<string>();
+  if (can_move) {
+    for (int64 i = 0; i < element.NumElements(); ++i) {
+      parent_as_matrix(index, i) = std::move(element_flat(i));
+    }
+  } else {
+    parent_as_matrix.chip(index, 0) = element_flat;
+  }
+  return Status::OK();
+}
+
+// TODO(jsimsa): Add HandleElementToSlice<variant> specialization that moves
+// the data when possible.
+
+template <typename T>
+static Status HandleSliceToElement(const Tensor& parent, Tensor* element,
+                                   int64 index) {
+  element->flat<T>() = parent.flat_outer_dims<T>().chip(index, 0);
+  return Status::OK();
+}
+
+}  // namespace
+
+// Copies element into the index^th slice of parent (in the 0th dimension).
+Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index) {
+  TF_RETURN_IF_ERROR(ValidateInput(*parent, element, index));
+
+  bool can_move = element.RefCountIsOne();
+#define HANDLE_TYPE(T)                                                \
+  case DataTypeToEnum<T>::value: {                                    \
+    return HandleElementToSlice<T>(std::move(element), parent, index, \
+                                   can_move);                         \
+  }
+
+  switch (element.dtype()) {
+    TF_CALL_ALL_TYPES(HANDLE_TYPE);
+    TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE);
+    TF_CALL_variant(HANDLE_TYPE);
+#undef HANDLE_TYPE
+    default:
+      return errors::Unimplemented("CopyElementToSlice Unhandled data type: ",
+                                   element.dtype());
+  }
+}
+
+// Copies the index^th slice of parent (in the 0th dimension) into element.
+Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index) {
+  TF_RETURN_IF_ERROR(ValidateInput(parent, *element, index));
+
+#define HANDLE_TYPE(T)                                      \
+  case DataTypeToEnum<T>::value: {                          \
+    return HandleSliceToElement<T>(parent, element, index); \
+  }
+
+  switch (parent.dtype()) {
+    TF_CALL_ALL_TYPES(HANDLE_TYPE);
+    TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE);
+    TF_CALL_variant(HANDLE_TYPE);
+#undef HANDLE_TYPE
+    default:
+      return errors::Unimplemented("CopySliceToElement Unhandled data type: ",
+                                   element->dtype());
+  }
+}
+
+}  // namespace batch_util
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/batch_util.h b/tensorflow/core/kernels/batch_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..b066e2a5748e6c2e0a63ef7e27a528be99067b83
--- /dev/null
+++ b/tensorflow/core/kernels/batch_util.h
@@ -0,0 +1,38 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCH_UTIL_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCH_UTIL_H_
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace batch_util {
+
+// Copies element into the index^th slice of parent (in the 0th dimension).
+//
+// NOTE(mrry): The `element` argument is taken by value. Use `std::move()`
+// to move the `element` argument into this function, and the implementation
+// may be able to optimize the copy to a move. This is particularly important
+// for DT_STRING tensors.
+Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index);
+
+// Copies the index^th slice of parent (in the 0th dimension) into element.
+Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index);
+
+}  // namespace batch_util
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCH_UTIL_H_
diff --git a/tensorflow/core/kernels/bincount_op.cc b/tensorflow/core/kernels/bincount_op.cc
index 766d63e3be4f5ffd5ad00d5b81c756e0b66747f8..890fa3121bbf719e7aa0d3e2d715ca6449af136b 100644
--- a/tensorflow/core/kernels/bincount_op.cc
+++ b/tensorflow/core/kernels/bincount_op.cc
@@ -97,8 +97,9 @@ class BincountOp : public OpKernel {
     const Tensor& weights_t = ctx->input(2);
 
     int32 size = size_tensor.scalar<int32>()();
-    OP_REQUIRES(ctx, size >= 0, errors::InvalidArgument(
-                                    "size (", size, ") must be non-negative"));
+    OP_REQUIRES(
+        ctx, size >= 0,
+        errors::InvalidArgument("size (", size, ") must be non-negative"));
 
     const auto arr = arr_t.flat<int32>();
     const auto weights = weights_t.flat<T>();
diff --git a/tensorflow/core/kernels/bincount_op.h b/tensorflow/core/kernels/bincount_op.h
index 0f8dd2b82aa7703327224fd30839ade2b0541463..cd3d560cd12a4afefa2c58f19fdfee44b8ed2684 100644
--- a/tensorflow/core/kernels/bincount_op.h
+++ b/tensorflow/core/kernels/bincount_op.h
@@ -16,11 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_BINCOUNT_OP_H_
 #define TENSORFLOW_BINCOUNT_OP_H_
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/bincount_op_gpu.cu.cc b/tensorflow/core/kernels/bincount_op_gpu.cu.cc
index ae9e26ffdf45ee7a4a5cff817e93b3aba8fae431..6074b3e1f6f29fbb05b3adff29518b35a2df3b4f 100644
--- a/tensorflow/core/kernels/bincount_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/bincount_op_gpu.cu.cc
@@ -17,12 +17,12 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/kernels/bincount_op.h"
 #include "external/cub_archive/cub/device/device_histogram.cuh"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/bincount_op.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
@@ -93,8 +93,8 @@ struct BincountFunctor<GPUDevice, T> {
         /* num_samples */ num_samples,
         /* stream */ stream);
     if (err != cudaSuccess) {
-      return errors::Internal("Could not launch HistogramEven: ",
-                              cudaGetErrorString(err), ".");
+      return errors::Internal(
+          "Could not launch HistogramEven: ", cudaGetErrorString(err), ".");
     }
     return Status::OK();
   }
diff --git a/tensorflow/core/kernels/bincount_op_test.cc b/tensorflow/core/kernels/bincount_op_test.cc
index 14becc87a7abc7d4509a179a992fa9f5367207bb..cb04b40637a67e5398514f4cdf62ea960a70bf7c 100644
--- a/tensorflow/core/kernels/bincount_op_test.cc
+++ b/tensorflow/core/kernels/bincount_op_test.cc
@@ -30,8 +30,8 @@ static Graph* Bincount(int arr_size, int nbins) {
   Tensor arr(DT_INT32, TensorShape({arr_size}));
   arr.flat<int32>() = arr.flat<int32>().setRandom().abs();
 
-  Tensor size(DT_INT32, TensorShape({(int32)1}));
-  size.flat<int32>()(0) = (int32)nbins;
+  Tensor size(DT_INT32, TensorShape({static_cast<int32>(1)}));
+  size.flat<int32>()(0) = static_cast<int32>(nbins);
 
   Tensor weights(DT_INT32, TensorShape({0}));
 
diff --git a/tensorflow/core/kernels/bucketize_op.cc b/tensorflow/core/kernels/bucketize_op.cc
index c1693de53894228865af675746f8da13073574f8..4e4b6d52154cd1bacc621535f7dd9c56045a3c57 100644
--- a/tensorflow/core/kernels/bucketize_op.cc
+++ b/tensorflow/core/kernels/bucketize_op.cc
@@ -25,10 +25,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-using thread::ThreadPool;
-
-typedef Eigen::ThreadPoolDevice CPUDevice;
-typedef Eigen::GpuDevice GPUDevice;
+using CPUDevice = Eigen::ThreadPoolDevice;
+using GPUDevice = Eigen::GpuDevice;
 
 namespace functor {
 
@@ -49,6 +47,7 @@ struct BucketizeFunctor<CPUDevice, T> {
     return Status::OK();
   }
 };
+
 }  // namespace functor
 
 template <typename Device, typename T>
diff --git a/tensorflow/core/kernels/bucketize_op_gpu.cu.cc b/tensorflow/core/kernels/bucketize_op_gpu.cu.cc
index aafbbe41b4f9ddb8cf107a64426f49387dd6d30f..b08ccdbdc08d239bae03e6a7d8b1e24b19476d4b 100644
--- a/tensorflow/core/kernels/bucketize_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/bucketize_op_gpu.cu.cc
@@ -33,11 +33,28 @@ namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-template <typename T>
+template <typename T, bool useSharedMem>
 __global__ void BucketizeCustomKernel(
     const int32 size_in, const T* in, const int32 size_boundaries,
     CudaDeviceArrayStruct<float> boundaries_array, int32* out) {
   const float* boundaries = GetCudaDeviceArrayOnDevice(&boundaries_array);
+
+  extern __shared__ __align__(sizeof(float)) unsigned char shared_mem[];
+  float* shared_mem_boundaries = reinterpret_cast<float*>(shared_mem);
+
+  if (useSharedMem) {
+    int32 lidx = threadIdx.y * blockDim.x + threadIdx.x;
+    int32 blockSize = blockDim.x * blockDim.y;
+
+    for (int32 i = lidx; i < size_boundaries; i += blockSize) {
+      shared_mem_boundaries[i] = boundaries[i];
+    }
+
+    __syncthreads();
+
+    boundaries = shared_mem_boundaries;
+  }
+
   CUDA_1D_KERNEL_LOOP(i, size_in) {
     T value = in[i];
     int32 bucket = 0;
@@ -77,11 +94,21 @@ struct BucketizeFunctor<GPUDevice, T> {
     TF_RETURN_IF_ERROR(boundaries_array.Finalize());
 
     CudaLaunchConfig config = GetCudaLaunchConfig(input.size(), d);
-    BucketizeCustomKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        input.size(), input.data(), boundaries_vector.size(),
-        boundaries_array.data(), output.data());
-
+    int32 shared_mem_size = sizeof(float) * boundaries_vector.size();
+    const int32 kMaxSharedMemBytes = 16384;
+    if (shared_mem_size < d.sharedMemPerBlock() &&
+        shared_mem_size < kMaxSharedMemBytes) {
+      BucketizeCustomKernel<T,
+                            true><<<config.block_count, config.thread_per_block,
+                                    shared_mem_size, d.stream()>>>(
+          input.size(), input.data(), boundaries_vector.size(),
+          boundaries_array.data(), output.data());
+    } else {
+      BucketizeCustomKernel<T, false><<<
+          config.block_count, config.thread_per_block, 0, d.stream()>>>(
+          input.size(), input.data(), boundaries_vector.size(),
+          boundaries_array.data(), output.data());
+    }
     return Status::OK();
   }
 };
diff --git a/tensorflow/core/kernels/captured_function.h b/tensorflow/core/kernels/captured_function.h
index 9430127600a26df6cafd14022aa271e9e18ed78a..cdf191f4c768c2ed3bd15b0ff45fdfa27800653c 100644
--- a/tensorflow/core/kernels/captured_function.h
+++ b/tensorflow/core/kernels/captured_function.h
@@ -12,99 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_KERNELS_CAPTURED_FUNCTION_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_KERNELS_CAPTURED_FUNCTION_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CAPTURED_FUNCTION_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CAPTURED_FUNCTION_H_
 
-#include <memory>
-#include <vector>
+#include "tensorflow/core/kernels/data/captured_function.h"
 
-#include "tensorflow/core/common_runtime/function.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/lib/random/random.h"
-#include "tensorflow/core/platform/macros.h"
-
-namespace tensorflow {
-
-class Device;
-class OpKernelContext;
-class ResourceMgr;
-
-// A `CapturedFunction` encapsulates a TensorFlow function and all of
-// the runtime support required to execute it.
-//
-// The `Dataset`-related classes use `CapturedFunction` to execute
-// TensorFlow functions outside a the normal `OpKernel::Compute()`
-// context.
-//
-// NOTE(mrry): Here we are taking a conservative approach to dealing with
-// ownership of the various framework and runtime objects that are needed
-// to execute functions. We copy the function library *definition* (i.e.
-// a set of FunctionDefs) out of this kernel's context's function library
-// *runtime*, then we use that together with a specially-created
-// ThreadPoolDevice to build a new FunctionLibraryRuntime for the Dataset.
-//
-// We need to do this (or refactor the ownership of framework components
-// in each of the session implementations) to make it possible to close
-// down a ParallelMapDataset::Iterator when its session is closed.
-//
-// TODO(mrry): Clean this up. Investigate whether it would be possible to
-// reuse the session's FunctionLibraryRuntime(s) or Device(s).
-class CapturedFunction {
- public:
-  // NOTE(mrry): The `captured_inputs` are passed by value. For
-  // efficiency, you are recommended to move this argument into the call.
-  static Status Create(OpKernelContext* ctx, const NameAttrList& func,
-                       int graph_def_version,
-                       std::vector<Tensor> captured_inputs,
-                       std::unique_ptr<CapturedFunction>* out_function);
-
-  Status Run(FunctionLibraryRuntime::Options f_opts,
-             gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets);
-
-  void RunAsync(FunctionLibraryRuntime::Options f_opts,
-                gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets,
-                FunctionLibraryRuntime::DoneCallback done);
-
-  const Device* device() const { return device_; }
-
-  ResourceMgr* resource_manager() const { return device_->resource_manager(); }
-
-  const std::vector<Tensor>& captured_inputs() { return captured_inputs_; }
-
-  static int64 generate_step_id() {
-    // Choose a step ID that is guaranteed not to clash with any
-    // Session-generated step ID. DirectSession only generates
-    // non-negative step IDs (contiguous, starting from 0), and
-    // MasterSession generates 56-bit random step IDs whose MSB is
-    // always 0, so a negative random step ID should suffice.
-    return -std::abs(static_cast<int64>(random::New64()));
-  }
-
- private:
-  CapturedFunction(Device* device, std::unique_ptr<DeviceMgr> device_mgr,
-                   std::unique_ptr<FunctionLibraryDefinition> flib_def,
-                   std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
-                   FunctionLibraryRuntime* lib,
-                   FunctionLibraryRuntime::Handle f_handle,
-                   std::vector<Tensor> captured_inputs);
-
-  void RunHelper(FunctionLibraryRuntime::Options f_opts,
-                 gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets,
-                 FunctionLibraryRuntime::DoneCallback done);
-
-  Device* const device_;  // owned by device_mgr_.
-  const std::unique_ptr<DeviceMgr> device_mgr_;
-  const std::unique_ptr<FunctionLibraryDefinition> flib_def_;
-  const std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
-  FunctionLibraryRuntime* const lib_;  // owned by pflr_.
-  const FunctionLibraryRuntime::Handle f_handle_;
-  const std::vector<Tensor> captured_inputs_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(CapturedFunction);
-};
-
-}  // namespace tensorflow
-
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_KERNELS_CAPTURED_FUNCTION_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CAPTURED_FUNCTION_H_
diff --git a/tensorflow/core/kernels/cast_op.h b/tensorflow/core/kernels/cast_op.h
index 7d3e0cbe3dc88477db8dbe048386f5f1a5971c74..8fedf2c271c2caf60a83fb1f4146dd94821c4643 100644
--- a/tensorflow/core/kernels/cast_op.h
+++ b/tensorflow/core/kernels/cast_op.h
@@ -128,10 +128,10 @@ struct scalar_cast_op<::tensorflow::bfloat16, float> {
     float ret;
     uint16_t* p = reinterpret_cast<uint16_t*>(&ret);
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-    p[0] = a.value;  
-    p[1] = 0;  
-#else  
-    static_assert(::tensorflow::port::kLittleEndian, "Not a little endian system!");  
+    p[0] = a.value;
+    p[1] = 0;
+#else
+    static_assert(::tensorflow::port::kLittleEndian, "Not a little endian system!");
     p[0] = 0;
     p[1] = a.value;
 #endif
diff --git a/tensorflow/core/kernels/concat_lib_cpu.cc b/tensorflow/core/kernels/concat_lib_cpu.cc
index b0bec0c5dcd30f4a630cd927e6ea922105249676..743e3acfd5c415a72eb70690f9692c961733c34f 100644
--- a/tensorflow/core/kernels/concat_lib_cpu.cc
+++ b/tensorflow/core/kernels/concat_lib_cpu.cc
@@ -73,12 +73,14 @@ REGISTER(quint16)
 REGISTER(qint16)
 REGISTER(qint32)
 REGISTER(bfloat16)
+TF_CALL_variant(REGISTER)
 
 #if defined(IS_MOBILE_PLATFORM) && !defined(SUPPORT_SELECTIVE_REGISTRATION) && \
     !defined(__ANDROID_TYPES_FULL__)
-// Primarily used for SavedModel support on mobile. Registering it here only if
-// __ANDROID_TYPES_FULL__ is not defined, as that already register strings
-REGISTER(string);
+    // Primarily used for SavedModel support on mobile. Registering it here only
+    // if __ANDROID_TYPES_FULL__ is not defined (which already registers string)
+    // to avoid duplicate registration.
+    REGISTER(string);
 #endif  // defined(IS_MOBILE_PLATFORM) &&
         // !defined(SUPPORT_SELECTIVE_REGISTRATION) &&
         // !defined(__ANDROID_TYPES_FULL__)
diff --git a/tensorflow/core/kernels/concat_lib_gpu.cc b/tensorflow/core/kernels/concat_lib_gpu.cc
index 319ead49efd709932bed20e1e76a73749b1c4f19..d8643c0b2fb2633f6b640b4f54dc2f8c92da654d 100644
--- a/tensorflow/core/kernels/concat_lib_gpu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu.cc
@@ -116,8 +116,8 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER);
 TF_CALL_complex64(REGISTER);
 TF_CALL_complex128(REGISTER);
 TF_CALL_int64(REGISTER);
-REGISTER(bfloat16);
-REGISTER(bool);
+TF_CALL_bfloat16(REGISTER);
+TF_CALL_bool(REGISTER);
 
 #undef REGISTER
 
diff --git a/tensorflow/core/kernels/conditional_accumulator_base.h b/tensorflow/core/kernels/conditional_accumulator_base.h
index 05ee855daee8a7ffe4730ec4a18c65a7bd91733a..27db6ee78533c59f26f538bc59956e50c6111ee7 100644
--- a/tensorflow/core/kernels/conditional_accumulator_base.h
+++ b/tensorflow/core/kernels/conditional_accumulator_base.h
@@ -162,10 +162,12 @@ class ConditionalAccumulatorBase : public ResourceBase {
  * function can get an indication that a failure has occurred.
 */
 #define OP_REQUIRES_BOOLEAN(CTX, EXP, STATUS) \
-  if (!TF_PREDICT_TRUE(EXP)) {                \
-    (CTX)->CtxFailure((STATUS));              \
-    return false;                             \
-  }
+  do {                                        \
+    if (!TF_PREDICT_TRUE(EXP)) {              \
+      (CTX)->CtxFailure((STATUS));            \
+      return false;                           \
+    }                                         \
+  } while (0)
 
 #define OP_REQUIRES_OK_BOOLEAN(CTX, STATUS) \
   do {                                      \
diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index 72132574a4ccee474734425233ff687e955022ef..103a0e225ed4b3049fad87745a37744ec405efc5 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -250,6 +250,7 @@ REGISTER_KERNEL_BUILDER(Name("Fill")
 
 #if GOOGLE_CUDA
 REGISTER_KERNEL(GPU, Eigen::half);
+REGISTER_KERNEL(GPU, bfloat16);
 REGISTER_KERNEL(GPU, float);
 REGISTER_KERNEL(GPU, double);
 REGISTER_KERNEL(GPU, uint8);
@@ -328,6 +329,7 @@ REGISTER_KERNEL_BUILDER(Name("ZerosLike")
 #if GOOGLE_CUDA
 REGISTER_KERNEL(bool, GPU);
 REGISTER_KERNEL(Eigen::half, GPU);
+REGISTER_KERNEL(bfloat16, GPU);
 REGISTER_KERNEL(float, GPU);
 REGISTER_KERNEL(double, GPU);
 REGISTER_KERNEL(complex64, GPU);
@@ -380,6 +382,7 @@ REGISTER_KERNEL_BUILDER(Name("OnesLike")
 #if GOOGLE_CUDA
 REGISTER_KERNEL(bool, GPU);
 REGISTER_KERNEL(Eigen::half, GPU);
+REGISTER_KERNEL(bfloat16, GPU);
 REGISTER_KERNEL(float, GPU);
 REGISTER_KERNEL(double, GPU);
 REGISTER_KERNEL(complex64, GPU);
diff --git a/tensorflow/core/kernels/constant_op_gpu.cu.cc b/tensorflow/core/kernels/constant_op_gpu.cu.cc
index d1a1e34ec365da444a8465b34dd67f8865d29f5e..49beb499af28cca89079c55538483051873cf7d3 100644
--- a/tensorflow/core/kernels/constant_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/constant_op_gpu.cu.cc
@@ -77,7 +77,8 @@ struct FillFunctor<GPUDevice, T> {
 
 #define DEFINE_FILL_GPU(T) template struct FillFunctor<GPUDevice, T>;
 TF_CALL_REAL_NUMBER_TYPES(DEFINE_FILL_GPU);
-DEFINE_FILL_GPU(bool);
+TF_CALL_bfloat16(DEFINE_FILL_GPU);
+TF_CALL_bool(DEFINE_FILL_GPU);
 #undef DEFINE_FILL_GPU
 
 // Partial specialization of FillFunctor<Device=GPUDevice, T>.
@@ -88,15 +89,10 @@ struct SetZeroFunctor<GPUDevice, T> {
   }
 };
 
-#define DEFINE_SETZERO_GPU(T) template struct SetZeroFunctor<GPUDevice, T>
-DEFINE_SETZERO_GPU(bool);
-DEFINE_SETZERO_GPU(Eigen::half);
-DEFINE_SETZERO_GPU(float);
-DEFINE_SETZERO_GPU(double);
-DEFINE_SETZERO_GPU(complex64);
-DEFINE_SETZERO_GPU(complex128);
-DEFINE_SETZERO_GPU(int32);
-DEFINE_SETZERO_GPU(int64);
+#define DEFINE_SETZERO_GPU(T) template struct SetZeroFunctor<GPUDevice, T>;
+TF_CALL_NUMBER_TYPES(DEFINE_SETZERO_GPU);
+TF_CALL_bfloat16(DEFINE_SETZERO_GPU);
+TF_CALL_bool(DEFINE_SETZERO_GPU);
 #undef DEFINE_SETZERO_GPU
 
 // Partial specialization of FillFunctor<Device=GPUDevice, T>.
@@ -107,15 +103,10 @@ struct SetOneFunctor<GPUDevice, T> {
   }
 };
 
-#define DEFINE_SETONE_GPU(T) template struct SetOneFunctor<GPUDevice, T>
-DEFINE_SETONE_GPU(bool);
-DEFINE_SETONE_GPU(Eigen::half);
-DEFINE_SETONE_GPU(float);
-DEFINE_SETONE_GPU(double);
-DEFINE_SETONE_GPU(complex64);
-DEFINE_SETONE_GPU(complex128);
-DEFINE_SETONE_GPU(int32);
-DEFINE_SETONE_GPU(int64);
+#define DEFINE_SETONE_GPU(T) template struct SetOneFunctor<GPUDevice, T>;
+TF_CALL_NUMBER_TYPES(DEFINE_SETONE_GPU);
+TF_CALL_bfloat16(DEFINE_SETONE_GPU);
+TF_CALL_bool(DEFINE_SETONE_GPU);
 #undef DEFINE_SETONE_GPU
 
 }  // end namespace functor
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index 3d2bb57aff6b7c4a1de2f9221aea4b384fea45c3..1791c510966771f89d029dbc36a231d97daf2eff 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -194,7 +194,23 @@ class Conv2DFastBackpropFilterOp : public OpKernel {
         context, (strides_[0] == 1 && strides_[3] == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(context, strides_[1] > 0 && strides_[2] > 0,
+                errors::InvalidArgument(
+                    "Row and column strides should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
+    OP_REQUIRES(context, dilations_.size() == 4,
+                errors::InvalidArgument("Sliding window dilations field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, (dilations_[0] == 1 && dilations_[3] == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
+    // TODO(yangzihao): Add a CPU implementation for dilated convolution.
+    OP_REQUIRES(context, (dilations_[1] == 1 && dilations_[2] == 1),
+                errors::InvalidArgument(
+                    "Current Eigen and libxsmm implementations do not "
+                    "yet support dilation rates larger than 1."));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -262,6 +278,7 @@ class Conv2DFastBackpropFilterOp : public OpKernel {
   }
 
  private:
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_;
@@ -290,7 +307,23 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
         context, (strides_[0] == 1 && strides_[3] == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(context, strides_[1] > 0 && strides_[2] > 0,
+                errors::InvalidArgument(
+                    "Row and column strides should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
+    OP_REQUIRES(context, dilations_.size() == 4,
+                errors::InvalidArgument("Sliding window dilations field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, (dilations_[0] == 1 && dilations_[3] == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
+    // TODO(yangzihao): Add a CPU implementation for dilated convolution.
+    OP_REQUIRES(context, (dilations_[1] == 1 && dilations_[2] == 1),
+                errors::InvalidArgument(
+                    "Current libxsmm and customized CPU implementations do "
+                    "not yet support dilation rates larger than 1."));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -459,6 +492,7 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
   }
 
  private:
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_;
@@ -510,10 +544,30 @@ class Conv2DSlowBackpropFilterOp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
     int stride_n = GetTensorDim(strides_, data_format_, 'N');
     int stride_c = GetTensorDim(strides_, data_format_, 'C');
+    int stride_h = GetTensorDim(strides_, data_format_, 'H');
+    int stride_w = GetTensorDim(strides_, data_format_, 'W');
     OP_REQUIRES(
         context, (stride_n == 1 && stride_c == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(context, stride_h > 0 && stride_w > 0,
+                errors::InvalidArgument(
+                    "Row and column strides should be larger than 0."));
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
+    OP_REQUIRES(context, dilations_.size() == 4,
+                errors::InvalidArgument("Sliding window dilations field must "
+                                        "specify 4 dimensions"));
+    int dilation_n = GetTensorDim(dilations_, data_format_, 'N');
+    int dilation_c = GetTensorDim(dilations_, data_format_, 'C');
+    int dilation_h = GetTensorDim(dilations_, data_format_, 'H');
+    int dilation_w = GetTensorDim(dilations_, data_format_, 'W');
+    OP_REQUIRES(context, dilation_n == 1 && dilation_c == 1,
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context, dilation_h > 0 && dilation_w > 0,
+        errors::InvalidArgument("Dilated rates should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
     use_cudnn_ &= CanUseCudnn();
     cudnn_use_autotune_ = CudnnUseAutotune();
@@ -546,13 +600,16 @@ class Conv2DSlowBackpropFilterOp : public OpKernel {
     // do not support striding on the batch or depth dimension).
     const int stride_rows = GetTensorDim(strides_, data_format_, 'H');
     const int stride_cols = GetTensorDim(strides_, data_format_, 'W');
+    const int dilation_rows = GetTensorDim(dilations_, data_format_, 'H');
+    const int dilation_cols = GetTensorDim(dilations_, data_format_, 'W');
 
     launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop, input,
-              stride_rows, stride_cols, padding_, filter_backprop,
-              data_format_);
+              dilation_rows, dilation_cols, stride_rows, stride_cols, padding_,
+              filter_backprop, data_format_);
   }
 
  private:
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   bool use_cudnn_;
@@ -566,38 +623,46 @@ class Conv2DSlowBackpropFilterOp : public OpKernel {
 template <typename T>
 void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
     OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
-    const Tensor& out_backprop, const Tensor& input, int row_stride,
-    int col_stride, const Padding& padding, Tensor* filter_backprop,
-    TensorFormat data_format) {
+    const Tensor& out_backprop, const Tensor& input, int row_dilation,
+    int col_dilation, int row_stride, int col_stride, const Padding& padding,
+    Tensor* filter_backprop, TensorFormat data_format) {
   using perftools::gputools::dnn::AlgorithmConfig;
   using perftools::gputools::dnn::AlgorithmDesc;
   using perftools::gputools::dnn::ProfileResult;
 
+  std::vector<int32> dilations(4, 1);
+  dilations[GetTensorDimIndex(data_format, 'H')] = row_dilation;
+  dilations[GetTensorDimIndex(data_format, 'W')] = col_dilation;
+
   std::vector<int32> strides(4, 1);
   strides[GetTensorDimIndex(data_format, 'H')] = row_stride;
   strides[GetTensorDimIndex(data_format, 'W')] = col_stride;
   TensorShape filter_shape = filter_backprop->shape();
 
   ConvBackpropDimensions dims;
-  OP_REQUIRES_OK(ctx, ConvBackpropComputeDimensions(
+  OP_REQUIRES_OK(ctx, ConvBackpropComputeDimensionsV2(
                           "Conv2DSlowBackpropFilter", /*num_spatial_dims=*/2,
                           input.shape(), filter_shape, out_backprop.shape(),
-                          strides, padding, data_format, &dims));
+                          dilations, strides, padding, data_format, &dims));
 
+  // TODO(yangzihao): The padding computations should be done in
+  // GetWindowedOutputSize() functions.
   const int padding_rows =
       (padding == VALID)
           ? 0
           : std::max<int>(0, (dims.spatial_dims[0].output_size - 1) *
                                      dims.spatial_dims[0].stride +
-                                 dims.spatial_dims[0].filter_size -
-                                 dims.spatial_dims[0].input_size);
+                                 (dims.spatial_dims[0].filter_size - 1) *
+                                     dims.spatial_dims[0].dilation +
+                                 1 - dims.spatial_dims[0].input_size);
   const int padding_cols =
       (padding == VALID)
           ? 0
           : std::max<int>(0, (dims.spatial_dims[1].output_size - 1) *
                                      dims.spatial_dims[1].stride +
-                                 dims.spatial_dims[1].filter_size -
-                                 dims.spatial_dims[1].input_size);
+                                 (dims.spatial_dims[1].filter_size - 1) *
+                                     dims.spatial_dims[1].dilation +
+                                 1 - dims.spatial_dims[1].input_size);
 
   // TODO(zhengxq): cuDNN only supports equal padding on both sides, so only
   // calling it when that is true. Remove this check when (if?) cuDNN starts
@@ -730,7 +795,9 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
       .set_input_feature_map_count(dims.in_depth)
       .set_output_feature_map_count(dims.out_depth);
   perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
-  conv_desc.set_vertical_filter_stride(dims.spatial_dims[0].stride)
+  conv_desc.set_vertical_dilation_rate(dims.spatial_dims[0].dilation)
+      .set_horizontal_dilation_rate(dims.spatial_dims[1].dilation)
+      .set_vertical_filter_stride(dims.spatial_dims[0].stride)
       .set_horizontal_filter_stride(dims.spatial_dims[1].stride)
       .set_zero_padding_height(padding_rows / 2)
       .set_zero_padding_width(padding_cols / 2);
@@ -821,6 +888,8 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
       dims.out_depth,                        // out_depths
       {{dims.spatial_dims[0].filter_size,    // filter_rows
         dims.spatial_dims[1].filter_size}},  // filter_cols
+      {{dims.spatial_dims[0].dilation,       // dilation_rows
+        dims.spatial_dims[1].dilation}},     // dilation_cols
       {{dims.spatial_dims[0].stride,         // stride_rows
         dims.spatial_dims[1].stride}},       // stride_cols
       {{padding_rows,                        // padding_rows
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index d28f6b4d107647f8e2dc232dc5477cd7ee37f696..736241a029353b5872e243ce9205ff6cde2285d9 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -198,7 +198,23 @@ class Conv2DFastBackpropInputOp : public OpKernel {
         context, (strides_[0] == 1 && strides_[3] == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(context, strides_[1] > 0 && strides_[2] > 0,
+                errors::InvalidArgument(
+                    "Row and column strides should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
+    OP_REQUIRES(context, dilations_.size() == 4,
+                errors::InvalidArgument("Sliding window dilations field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, (dilations_[0] && dilations_[3]),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
+    // TODO(yangzihao): Add a CPU implementation for dilated convolution.
+    OP_REQUIRES(context, (dilations_[1] == 1 && dilations_[2] == 1),
+                errors::InvalidArgument(
+                    "Current Eigen and libxsmm implementations do not "
+                    "yet support dilation rates larger than 1."));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -268,6 +284,7 @@ class Conv2DFastBackpropInputOp : public OpKernel {
   }
 
  private:
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_;
@@ -296,7 +313,23 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
         context, (strides_[0] == 1 && strides_[3] == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(context, strides_[1] > 0 && strides_[2] > 0,
+                errors::InvalidArgument(
+                    "Row and column strides should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
+    OP_REQUIRES(context, dilations_.size() == 4,
+                errors::InvalidArgument("Sliding window dilations field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, (dilations_[0] == 1 && dilations_[3] == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
+    // TODO(yangzihao): Add a CPU implementation for dilated convolution.
+    OP_REQUIRES(context, (dilations_[1] == 1 && dilations_[2] == 1),
+                errors::InvalidArgument(
+                    "Current libxsmm and customized CPU implementations do "
+                    "not yet support dilation rates larger than 1."));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -532,6 +565,7 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
   }
 
  private:
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_;
@@ -586,10 +620,30 @@ class Conv2DSlowBackpropInputOp : public OpKernel {
                                         "specify 4 dimensions"));
     int stride_n = GetTensorDim(strides_, data_format_, 'N');
     int stride_c = GetTensorDim(strides_, data_format_, 'C');
+    int stride_h = GetTensorDim(strides_, data_format_, 'H');
+    int stride_w = GetTensorDim(strides_, data_format_, 'W');
     OP_REQUIRES(
         context, (stride_n == 1 && stride_c == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(context, stride_h > 0 && stride_w > 0,
+                errors::InvalidArgument(
+                    "Row and column strides should be larger than 0."));
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
+    OP_REQUIRES(context, dilations_.size() == 4,
+                errors::InvalidArgument("Sliding window dilations field must "
+                                        "specify 4 dimensions"));
+    int dilation_n = GetTensorDim(dilations_, data_format_, 'N');
+    int dilation_c = GetTensorDim(dilations_, data_format_, 'C');
+    int dilation_h = GetTensorDim(dilations_, data_format_, 'H');
+    int dilation_w = GetTensorDim(dilations_, data_format_, 'W');
+    OP_REQUIRES(context, (dilation_n == 1 && dilation_c == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context, dilation_h > 0 && dilation_w > 0,
+        errors::InvalidArgument("Dilated rates should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
     use_cudnn_ &= CanUseCudnn();
     cudnn_use_autotune_ = CudnnUseAutotune();
@@ -622,12 +676,16 @@ class Conv2DSlowBackpropInputOp : public OpKernel {
     // do not support striding on the batch or depth dimension).
     const int stride_rows = GetTensorDim(strides_, data_format_, 'H');
     const int stride_cols = GetTensorDim(strides_, data_format_, 'W');
+    const int dilation_rows = GetTensorDim(dilations_, data_format_, 'H');
+    const int dilation_cols = GetTensorDim(dilations_, data_format_, 'W');
 
     launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop, filter,
-              stride_rows, stride_cols, padding_, in_backprop, data_format_);
+              dilation_rows, dilation_cols, stride_rows, stride_cols, padding_,
+              in_backprop, data_format_);
   }
 
  private:
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   bool use_cudnn_;
@@ -641,39 +699,48 @@ class Conv2DSlowBackpropInputOp : public OpKernel {
 template <typename T>
 void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
     OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
-    const Tensor& out_backprop, const Tensor& filter, int row_stride,
-    int col_stride, const Padding& padding, Tensor* in_backprop,
-    TensorFormat data_format) {
+    const Tensor& out_backprop, const Tensor& filter, int row_dilation,
+    int col_dilation, int row_stride, int col_stride, const Padding& padding,
+    Tensor* in_backprop, TensorFormat data_format) {
   using perftools::gputools::dnn::AlgorithmConfig;
   using perftools::gputools::dnn::AlgorithmDesc;
   using perftools::gputools::dnn::ProfileResult;
 
   std::vector<int32> strides(4, 1);
-  strides[GetTensorDimIndex(data_format, 'H')] = row_stride;
-  strides[GetTensorDimIndex(data_format, 'W')] = col_stride;
+  std::vector<int32> dilations(4, 1);
+  auto input_h = GetTensorDimIndex(data_format, 'H');
+  auto input_w = GetTensorDimIndex(data_format, 'W');
+  strides[input_h] = row_stride;
+  strides[input_w] = col_stride;
+  dilations[input_h] = row_dilation;
+  dilations[input_w] = col_dilation;
   TensorShape input_shape = in_backprop->shape();
 
   const TensorShape& filter_shape = filter.shape();
   ConvBackpropDimensions dims;
-  OP_REQUIRES_OK(ctx, ConvBackpropComputeDimensions(
+  OP_REQUIRES_OK(ctx, ConvBackpropComputeDimensionsV2(
                           "Conv2DSlowBackpropInput", /*num_spatial_dims=*/2,
                           input_shape, filter_shape, out_backprop.shape(),
-                          strides, padding, data_format, &dims));
+                          dilations, strides, padding, data_format, &dims));
 
+  // TODO(yangzihao): The padding computations should be done in
+  // GetWindowedOutputSize() functions.
   const int padding_rows =
       (padding == VALID)
           ? 0
           : std::max<int>(0, (dims.spatial_dims[0].output_size - 1) *
                                      dims.spatial_dims[0].stride +
-                                 dims.spatial_dims[0].filter_size -
-                                 dims.spatial_dims[0].input_size);
+                                 (dims.spatial_dims[0].filter_size - 1) *
+                                     dims.spatial_dims[0].dilation +
+                                 1 - dims.spatial_dims[0].input_size);
   const int padding_cols =
       (padding == VALID)
           ? 0
           : std::max<int>(0, (dims.spatial_dims[1].output_size - 1) *
                                      dims.spatial_dims[1].stride +
-                                 dims.spatial_dims[1].filter_size -
-                                 dims.spatial_dims[1].input_size);
+                                 (dims.spatial_dims[1].filter_size - 1) *
+                                     dims.spatial_dims[1].dilation +
+                                 1 - dims.spatial_dims[1].input_size);
 
   // TODO(keveman): cuDNN only supports equal padding on both sides, so only
   // calling it when that is true. Remove this check when (if?) cuDNN starts
@@ -789,7 +856,9 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
       .set_input_feature_map_count(dims.in_depth)
       .set_output_feature_map_count(dims.out_depth);
   perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
-  conv_desc.set_vertical_filter_stride(dims.spatial_dims[0].stride)
+  conv_desc.set_vertical_dilation_rate(dims.spatial_dims[0].dilation)
+      .set_horizontal_dilation_rate(dims.spatial_dims[1].dilation)
+      .set_vertical_filter_stride(dims.spatial_dims[0].stride)
       .set_horizontal_filter_stride(dims.spatial_dims[1].stride)
       .set_zero_padding_height(padding_rows / 2)
       .set_zero_padding_width(padding_cols / 2);
@@ -875,6 +944,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
       dims.out_depth,                        // out_depths
       {{dims.spatial_dims[0].filter_size,    // filter_rows
         dims.spatial_dims[1].filter_size}},  // filter_cols
+      {{dims.spatial_dims[0].dilation,       // dilation_rows
+        dims.spatial_dims[1].dilation}},     // dilation_cols
       {{dims.spatial_dims[0].stride,         // stride_rows
         dims.spatial_dims[1].stride}},       // stride_cols
       {{padding_rows,                        // padding_rows
diff --git a/tensorflow/core/kernels/conv_grad_ops.h b/tensorflow/core/kernels/conv_grad_ops.h
index e068fb86848f93a4c826e1b19fc85790ab2500a4..535586d53ac916808a22a6ea55577b3be43321f9 100644
--- a/tensorflow/core/kernels/conv_grad_ops.h
+++ b/tensorflow/core/kernels/conv_grad_ops.h
@@ -175,15 +175,17 @@ template <typename Device, typename T>
 struct LaunchConv2DBackpropInputOp {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& out_backprop, const Tensor& filter,
-                  int row_stride, int col_stride, const Padding& padding,
-                  Tensor* in_backprop, TensorFormat data_format);
+                  int row_dilation, int col_dilation, int row_stride,
+                  int col_stride, const Padding& padding, Tensor* in_backprop,
+                  TensorFormat data_format);
 };
 
 template <typename Device, typename T>
 struct LaunchConv2DBackpropFilterOp {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& out_backprop, const Tensor& input,
-                  int row_stride, int col_stride, const Padding& padding,
+                  int row_dilation, int col_dilation, int row_stride,
+                  int col_stride, const Padding& padding,
                   Tensor* filter_backprop, TensorFormat data_format);
 };
 
@@ -191,8 +193,9 @@ struct LaunchConv2DBackpropFilterOp {
 template <typename T>
 struct LaunchConv2DBackpropInputOp<Eigen::GpuDevice, T> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
-                  const Tensor& input, const Tensor& filter, int row_stride,
-                  int col_stride, const Padding& padding, Tensor* output,
+                  const Tensor& input, const Tensor& filter, int row_dilation,
+                  int col_dilation, int row_stride, int col_stride,
+                  const Padding& padding, Tensor* output,
                   TensorFormat data_format);
 };
 
@@ -200,7 +203,8 @@ template <typename T>
 struct LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& out_backprop, const Tensor& input,
-                  int row_stride, int col_stride, const Padding& padding,
+                  int row_dilation, int col_dilation, int row_stride,
+                  int col_stride, const Padding& padding,
                   Tensor* filter_backprop, TensorFormat data_format);
 };
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index f819fccbfb5530dc5e83c8e8256f9dffc18f70f7..3650ab53b2533e3c95a764ead2d1318c4006c9e7 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -645,6 +645,9 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
         {{input_size[0], input_size[1], input_size[2]}},
         out_depth,
         {{filter_size[0], filter_size[1], filter_size[2]}},
+        // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
+        // conv is supported.
+        /*dilation=*/{{1, 1, 1}},
         {{strides[0], strides[1], strides[2]}},
         {{padding_planes, padding_rows, padding_cols}},
         dtype,
@@ -1011,6 +1014,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
         {{input_size[0], input_size[1], input_size[2]}},
         out_depth,
         {{filter_size[0], filter_size[1], filter_size[2]}},
+        {{1, 1, 1}},
         {{strides[0], strides[1], strides[2]}},
         {{padding_planes, padding_rows, padding_cols}},
         dtype,
@@ -1101,29 +1105,27 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
   bool cudnn_use_autotune_;
 };
 
-
-
 #define REGISTER_GPU_KERNEL(T)                                                \
   REGISTER_KERNEL_BUILDER(                                                    \
       Name("Conv3DBackpropInput").Device(DEVICE_GPU).TypeConstraint<T>("T"),  \
       Conv3DBackpropInputOp<GPUDevice, T>);                                   \
   REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropInputV2")                       \
-                            .Device(DEVICE_GPU)                               \
-                            .TypeConstraint<T>("T")                           \
-                            .HostMemory("input_sizes"),                       \
-                        Conv3DBackpropInputOp<GPUDevice, T>);                 \
+                              .Device(DEVICE_GPU)                             \
+                              .TypeConstraint<T>("T")                         \
+                              .HostMemory("input_sizes"),                     \
+                          Conv3DBackpropInputOp<GPUDevice, T>);               \
   REGISTER_KERNEL_BUILDER(                                                    \
-    Name("Conv3DBackpropFilter").Device(DEVICE_GPU).TypeConstraint<T>("T"),   \
-    Conv3DBackpropFilterOp<GPUDevice, T>);                                    \
+      Name("Conv3DBackpropFilter").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      Conv3DBackpropFilterOp<GPUDevice, T>);                                  \
   REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropFilterV2")                      \
-                            .Device(DEVICE_GPU)                               \
-                            .TypeConstraint<T>("T")                           \
-                            .HostMemory("filter_sizes"),                      \
-                        Conv3DBackpropFilterOp<GPUDevice, T>);
+                              .Device(DEVICE_GPU)                             \
+                              .TypeConstraint<T>("T")                         \
+                              .HostMemory("filter_sizes"),                    \
+                          Conv3DBackpropFilterOp<GPUDevice, T>);
 TF_CALL_half(REGISTER_GPU_KERNEL);
 TF_CALL_float(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
-     
+
 #endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index bb67113fb003ea58e2fb12ae6d79f02251cd3c3d..985586d6262b18e89b5fc5246cc00b10ba4924a7 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -112,8 +112,9 @@ struct LaunchGeneric {
 template <typename T>
 struct LaunchConv2DOp<CPUDevice, T> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
-                  const Tensor& input, const Tensor& filter, int row_stride,
-                  int col_stride, const Padding& padding, Tensor* output,
+                  const Tensor& input, const Tensor& filter, int row_dilation,
+                  int col_dilation, int row_stride, int col_stride,
+                  const Padding& padding, Tensor* output,
                   TensorFormat data_format) {
     if (data_format != FORMAT_NHWC) {
       ctx->SetStatus(
@@ -121,6 +122,13 @@ struct LaunchConv2DOp<CPUDevice, T> {
                                 "NHWC tensor format for now."));
       return;
     }
+    // TODO(yangzihao): Add the CPU implementation of dilated conv 2D.
+    if (row_dilation > 1 || col_dilation > 1) {
+      ctx->SetStatus(
+          errors::Unimplemented("Generic conv implementation only supports "
+                                "dilated rate of 1 for now."));
+      return;
+    }
     LaunchGeneric<CPUDevice, T>()(ctx, input, filter, row_stride, col_stride,
                                   padding, output, data_format);
   }
@@ -133,8 +141,10 @@ class LaunchDeepConvOp {
                   const Tensor& filter, int batch, int input_rows,
                   int input_cols, int in_depth, int filter_rows,
                   int filter_cols, int pad_rows, int pad_cols, int out_rows,
-                  int out_cols, int out_depth, int stride_rows, int stride_cols,
-                  Tensor* output, TensorFormat data_format) {
+                  int /*out_cols*/, int /*out_depth*/, int /*dilation_rows*/,
+                  int /*dilation_cols*/, int /*stride_rows*/,
+                  int /*stride_cols*/, Tensor* /*output*/,
+                  TensorFormat /*data_format*/) {
     return false;
   }
 };
@@ -147,9 +157,11 @@ class LaunchDeepConvOp<CPUDevice, float> {
                   const Tensor& filter, int batch, int input_rows,
                   int input_cols, int in_depth, int filter_rows,
                   int filter_cols, int pad_rows, int pad_cols, int out_rows,
-                  int out_cols, int out_depth, int stride_rows, int stride_cols,
+                  int out_cols, int out_depth, int dilation_rows,
+                  int dilation_cols, int stride_rows, int stride_cols,
                   Tensor* output, TensorFormat data_format) {
-    if (data_format != FORMAT_NHWC ||
+    if (data_format != FORMAT_NHWC || dilation_rows != 1 ||
+        dilation_cols != 1 ||
         !CanUseDeepConv2D(stride_rows, stride_cols, filter_rows, filter_cols,
                           in_depth, out_depth, out_rows, out_cols)) {
       return false;
@@ -187,7 +199,8 @@ class LaunchXsmmConvOp {
                   int input_cols, int in_depth, int filter_rows,
                   int filter_cols, int pad_rows, int pad_cols, int out_rows,
                   int out_cols, int out_depth, int stride_rows, int stride_cols,
-                  Tensor* output, TensorFormat data_format) {
+                  int dilation_rows, int dilation_cols, Tensor* output,
+                  TensorFormat data_format) {
     return false;
   }
 };
@@ -199,7 +212,8 @@ class LaunchXsmmConvOp<CPUDevice, float> {
                   const Tensor& filter, int batch, int input_rows,
                   int input_cols, int in_depth, int filter_rows,
                   int filter_cols, int pad_rows, int pad_cols, int out_rows,
-                  int out_cols, int out_depth, int stride_rows, int stride_cols,
+                  int out_cols, int out_depth, int dilation_rows,
+                  int dilation_cols, int stride_rows, int stride_cols,
                   Tensor* output, TensorFormat data_format) {
     auto num_threads =
         ctx->device()->tensorflow_cpu_worker_threads()->num_threads;
@@ -228,11 +242,8 @@ class LaunchXsmmConvOp<CPUDevice, float> {
     desc.options = LIBXSMM_DNN_CONV_OPTION_WU_EXT_FILTER_REDUCE_OVERWRITE;
     desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
 
-    if (!CanUseXsmmConv2D(desc, data_format)) {
-      return false;
-    }
-
-    if (!CanUseXsmmConv2D(desc, data_format)) {
+    if (dilation_rows != 1 || dilation_cols != 1 ||
+        !CanUseXsmmConv2D(desc, data_format)) {
       return false;
     }
 
@@ -251,6 +262,7 @@ template <typename Device, typename T>
 class Conv2DOp : public BinaryOp<T> {
  public:
   explicit Conv2DOp(OpKernelConstruction* context) : BinaryOp<T>(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
     string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
@@ -259,15 +271,35 @@ class Conv2DOp : public BinaryOp<T> {
     OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
     use_cudnn_ &= CanUseCudnn();
     cudnn_use_autotune_ = CudnnUseAutotune();
+    OP_REQUIRES(context, dilations_.size() == 4,
+                errors::InvalidArgument("Sliding window dilations field must "
+                                        "specify 4 dimensions"));
     OP_REQUIRES(context, strides_.size() == 4,
                 errors::InvalidArgument("Sliding window strides field must "
                                         "specify 4 dimensions"));
     const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
     const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');
+    const int64 stride_h = GetTensorDim(strides_, data_format_, 'H');
+    const int64 stride_w = GetTensorDim(strides_, data_format_, 'W');
     OP_REQUIRES(
         context, stride_n == 1 && stride_c == 1,
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(context, stride_h > 0 && stride_w > 0,
+                errors::InvalidArgument(
+                    "Row and column strides should be larger than 0."));
+
+    const int64 dilation_n = GetTensorDim(dilations_, data_format_, 'N');
+    const int64 dilation_c = GetTensorDim(dilations_, data_format_, 'C');
+    const int64 dilation_h = GetTensorDim(dilations_, data_format_, 'H');
+    const int64 dilation_w = GetTensorDim(dilations_, data_format_, 'W');
+    OP_REQUIRES(context, dilation_n == 1 && dilation_c == 1,
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context, dilation_h > 0 && dilation_w > 0,
+        errors::InvalidArgument("Dilated rates should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
   }
 
@@ -334,18 +366,22 @@ class Conv2DOp : public BinaryOp<T> {
                 errors::InvalidArgument("batch is too large"));
     const int batch = static_cast<int>(batch_raw);
 
-    // For now we take the stride from the second and third dimensions only (we
-    // do not support striding on the batch or depth dimension).
+    // For now we take the stride and dilation from the second and third
+    // dimensions only (we do not support striding or dilation on the batch or
+    // depth dimension).
     const int stride_rows = GetTensorDim(strides_, data_format_, 'H');
     const int stride_cols = GetTensorDim(strides_, data_format_, 'W');
 
+    const int dilation_rows = GetTensorDim(dilations_, data_format_, 'H');
+    const int dilation_cols = GetTensorDim(dilations_, data_format_, 'W');
+
     int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
-    OP_REQUIRES_OK(context,
-                   GetWindowedOutputSize(input_rows, filter_rows, stride_rows,
-                                         padding_, &out_rows, &pad_rows));
-    OP_REQUIRES_OK(context,
-                   GetWindowedOutputSize(input_cols, filter_cols, stride_cols,
-                                         padding_, &out_cols, &pad_cols));
+    OP_REQUIRES_OK(context, GetWindowedOutputSizeV2(
+                                input_rows, filter_rows, dilation_rows,
+                                stride_rows, padding_, &out_rows, &pad_rows));
+    OP_REQUIRES_OK(context, GetWindowedOutputSizeV2(
+                                input_cols, filter_cols, dilation_cols,
+                                stride_cols, padding_, &out_cols, &pad_cols));
     TensorShape out_shape =
         ShapeFromFormat(data_format_, batch, out_rows, out_cols, out_depth);
 
@@ -361,6 +397,8 @@ class Conv2DOp : public BinaryOp<T> {
             << ", filter_rows = " << filter_rows
             << ", stride_rows = " << stride_rows
             << ", stride_cols = " << stride_cols
+            << ", dilation_rows = " << dilation_rows
+            << ", dilation_cols = " << dilation_cols
             << ", out_depth = " << out_depth;
 
     // If there is nothing to compute, return.
@@ -372,7 +410,8 @@ class Conv2DOp : public BinaryOp<T> {
     if (LaunchXsmmConvOp<Device, T>::Run(
             context, input, filter, batch, input_rows, input_cols, in_depth,
             filter_rows, filter_cols, pad_rows, pad_cols, out_rows, out_cols,
-            out_depth, stride_rows, stride_cols, output, data_format_)) {
+            out_depth, dilation_rows, dilation_cols, stride_rows, stride_cols,
+            output, data_format_)) {
       return;
     }
 #endif
@@ -380,15 +419,18 @@ class Conv2DOp : public BinaryOp<T> {
     if (LaunchDeepConvOp<Device, T>::Run(
             context, input, filter, batch, input_rows, input_cols, in_depth,
             filter_rows, filter_cols, pad_rows, pad_cols, out_rows, out_cols,
-            out_depth, stride_rows, stride_cols, output, data_format_)) {
+            out_depth, dilation_rows, dilation_cols, stride_rows, stride_cols,
+            output, data_format_)) {
       return;
     }
 
     launcher_(context, use_cudnn_, cudnn_use_autotune_, input, filter,
-              stride_rows, stride_cols, padding_, output, data_format_);
+              dilation_rows, dilation_cols, stride_rows, stride_cols, padding_,
+              output, data_format_);
   }
 
  private:
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   bool use_cudnn_;
   Padding padding_;
@@ -443,9 +485,9 @@ typedef AutoTuneSingleton<ConvAutoTuneGroup, ConvParameters,
 template <typename T>
 void LaunchConv2DOp<GPUDevice, T>::operator()(
     OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
-    const Tensor& input_param, const Tensor& filter, int row_stride,
-    int col_stride, const Padding& padding, Tensor* output,
-    TensorFormat data_format) {
+    const Tensor& input_param, const Tensor& filter, int row_dilation,
+    int col_dilation, int row_stride, int col_stride, const Padding& padding,
+    Tensor* output, TensorFormat data_format) {
   using perftools::gputools::dnn::AlgorithmConfig;
   using perftools::gputools::dnn::AlgorithmDesc;
   using perftools::gputools::dnn::ProfileResult;
@@ -461,8 +503,9 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
 
   Tensor input = input_param;
 
-  if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1 && row_stride == 1 &&
-      col_stride == 1 && data_format == FORMAT_NHWC) {
+  if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1 && row_dilation == 1 &&
+      col_dilation == 1 && row_stride == 1 && col_stride == 1 &&
+      data_format == FORMAT_NHWC) {
     // 1x1 filter, so call cublas directly.
     const uint64 m = input.dim_size(0) * input.dim_size(1) * input.dim_size(2);
     const uint64 k = filter.dim_size(2);
@@ -487,7 +530,8 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
     }
     return;
   } else if (filter.dim_size(0) == input.dim_size(1) &&
-             filter.dim_size(1) == input.dim_size(2) && padding == VALID &&
+             filter.dim_size(1) == input.dim_size(2) && row_dilation == 1 &&
+             col_dilation == 1 && padding == VALID &&
              data_format == FORMAT_NHWC) {
     // The input data and filter have the same height/width, so call cublas
     // directly.
@@ -530,17 +574,19 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
   const int64 patch_cols = filter.dim_size(1);
   if (padding == SAME) {
     // Total padding on rows and cols is
-    // Pr = (R' - 1) * S + Kr - R
-    // Pc = (C' - 1) * S + Kc - C
+    // Pr = (R' - 1) * S + (Kr - 1) * Dr + 1 - R
+    // Pc = (C' - 1) * S + (Kc - 1) * Dc + 1 - C
     // where (R', C') are output dimensions, (R, C) are input dimensions, S
-    // is stride, (Kr, Kc) are filter dimensions.
+    // is stride, (Dr, Dc) are dilations, (Kr, Kc) are filter dimensions.
     // We pad Pr/2 on the left and Pr - Pr/2 on the right, Pc/2 on the top
     // and Pc - Pc/2 on the bottom.  When Pr or Pc is odd, this means
     // we pad more on the right and bottom than on the top and left.
     padding_rows =
-        std::max<int>(0, (out_rows - 1) * row_stride + patch_rows - in_rows);
+        std::max<int>(0, (out_rows - 1) * row_stride +
+                             (patch_rows - 1) * row_dilation + 1 - in_rows);
     padding_cols =
-        std::max<int>(0, (out_cols - 1) * col_stride + patch_cols - in_cols);
+        std::max<int>(0, (out_cols - 1) * col_stride +
+                             (patch_cols - 1) * col_dilation + 1 - in_cols);
     const bool rows_odd = (padding_rows % 2 != 0);
     const bool cols_odd = (padding_cols % 2 != 0);
     if (rows_odd || cols_odd) {
@@ -605,7 +651,9 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
       .set_input_feature_map_count(filter.dim_size(2))
       .set_output_feature_map_count(filter.dim_size(3));
   perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
-  conv_desc.set_vertical_filter_stride(row_stride)
+  conv_desc.set_vertical_dilation_rate(row_dilation)
+      .set_horizontal_dilation_rate(col_dilation)
+      .set_vertical_filter_stride(row_stride)
       .set_horizontal_filter_stride(col_stride)
       .set_zero_padding_height(padding_rows / 2)
       .set_zero_padding_width(padding_cols / 2);
@@ -652,6 +700,8 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
       out_depths,        // out_depths
       {{patch_rows,      // filter_rows
         patch_cols}},    // filter_cols
+      {{row_dilation,    // dilation_rows
+        col_dilation}},  // dilation_cols
       {{row_stride,      // stride_rows
         col_stride}},    // stride_cols
       {{padding_rows,    // padding_rows
diff --git a/tensorflow/core/kernels/conv_ops.h b/tensorflow/core/kernels/conv_ops.h
index e29271dff278afbc1ff2c947c161824615640b66..09a3b78776c8bf114ccd42866bc7aded92c463b5 100644
--- a/tensorflow/core/kernels/conv_ops.h
+++ b/tensorflow/core/kernels/conv_ops.h
@@ -34,8 +34,9 @@ class OpKernelContext;
 template <typename Device, typename T>
 struct LaunchConv2DOp {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
-                  const Tensor& input, const Tensor& filter, int row_stride,
-                  int col_stride, const Padding& padding, Tensor* output,
+                  const Tensor& input, const Tensor& filter, int row_dilation,
+                  int col_dilation, int row_stride, int col_stride,
+                  const Padding& padding, Tensor* output,
                   TensorFormat data_format);
 };
 
@@ -43,8 +44,9 @@ struct LaunchConv2DOp {
 template <typename T>
 struct LaunchConv2DOp<Eigen::GpuDevice, T> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
-                  const Tensor& input, const Tensor& filter, int row_stride,
-                  int col_stride, const Padding& padding, Tensor* output,
+                  const Tensor& input, const Tensor& filter, int row_dilation,
+                  int col_dilation, int row_stride, int col_stride,
+                  const Padding& padding, Tensor* output,
                   TensorFormat data_format);
 };
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index 37cb67bc51112d42feaca25c37b3939775b66888..21c84b2a0ed15eaada88e308e1761dcb58cb07b3 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -377,6 +377,9 @@ struct LaunchConvOp<GPUDevice, T> {
         {{in_planes, in_rows, in_cols}},
         out_depth,
         {{filter_planes, filter_rows, filter_cols}},
+        // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
+        // conv is supported.
+        /*dilation=*/{{1, 1, 1}},
         {{strides[0], strides[1], strides[2]}},
         {{pad_planes, pad_rows, pad_cols}},
         dtype,
diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index c852dc9991c2e879c8fa6a64b2bd8b5141606409..6f82698596260d0fa9ce3198b5fc3eec18c86c98 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -91,13 +91,14 @@ class ConvParameters {
   using SpatialArray = gtl::InlinedVector<int64, 3>;
   ConvParameters(int64 batch, int64 in_depths, const SpatialArray& in,
                  int64 out_depths, const SpatialArray& filter,
-                 const SpatialArray& stride, const SpatialArray& padding,
-                 DataType dtype, int device_id)
+                 const SpatialArray& dilation, const SpatialArray& stride,
+                 const SpatialArray& padding, DataType dtype, int device_id)
       : batch_(batch),
         in_depths_(in_depths),
         out_depths_(out_depths),
         in_(in),
         filter_(filter),
+        dilation_(dilation),
         stride_(stride),
         padding_(padding),
         dtype_(dtype),
@@ -107,6 +108,7 @@ class ConvParameters {
     for (int64 val : in) hash_code_ = Hash64Combine(hash_code_, val);
     hash_code_ = Hash64Combine(hash_code_, out_depths);
     for (int64 val : filter) hash_code_ = Hash64Combine(hash_code_, val);
+    for (int64 val : dilation) hash_code_ = Hash64Combine(hash_code_, val);
     for (int64 val : stride) hash_code_ = Hash64Combine(hash_code_, val);
     for (int64 val : padding) hash_code_ = Hash64Combine(hash_code_, val);
     hash_code_ = Hash64Combine(hash_code_, dtype);
@@ -128,6 +130,7 @@ class ConvParameters {
         "(", str_util::Join(in_, ", "), "), ",
         out_depths_, ", ",
         "(", str_util::Join(filter_, ", "), "), ",
+        "(", str_util::Join(dilation_, ", "), "), ",
         "(", str_util::Join(stride_, ", "), "), ",
         "(", str_util::Join(padding_, ", "), "), ",
         dtype_, ", ",
@@ -154,11 +157,11 @@ class ConvParameters {
  protected:
   using ParameterDataType =
       std::tuple<int64, int64, SpatialArray, int64, SpatialArray, SpatialArray,
-                 SpatialArray, DataType, int>;
+                 SpatialArray, SpatialArray, DataType, int>;
 
   ParameterDataType get_data_as_tuple() const {
     return std::make_tuple(batch_, in_depths_, in_, out_depths_, filter_,
-                           stride_, padding_, dtype_, device_id_);
+                           dilation_, stride_, padding_, dtype_, device_id_);
   }
 
   uint64 hash_code_;
@@ -169,6 +172,7 @@ class ConvParameters {
   int64 out_depths_;
   SpatialArray in_;
   SpatialArray filter_;
+  SpatialArray dilation_;
   SpatialArray stride_;
   SpatialArray padding_;
   DataType dtype_;
diff --git a/tensorflow/core/kernels/conv_ops_test.cc b/tensorflow/core/kernels/conv_ops_test.cc
index ea54d6cf6cbfb6f2d38ae10644fed348980ab622..666bca265c95febf3753e71bf010a7caf95c0541 100644
--- a/tensorflow/core/kernels/conv_ops_test.cc
+++ b/tensorflow/core/kernels/conv_ops_test.cc
@@ -43,6 +43,8 @@ TEST(ConvParameters, WinogradNonfusedAlgoSize) {
       128,       // out_depths
       {{3,       // filter_rows
         3}},     // filter_cols
+      {{1,       // dilation_rows
+        1}},     // dilation_cols
       {{1,       // stride_rows
         1}},     // stride_cols
       {{0,       // padding_rows
@@ -60,6 +62,8 @@ TEST(ConvParameters, WinogradNonfusedAlgoSize) {
       768,       // out_depths
       {{3,       // filter_rows
         3}},     // filter_cols
+      {{1,       // dilation_rows
+        1}},     // dilation_cols
       {{1,       // stride_rows
         1}},     // stride_cols
       {{0,       // padding_rows
diff --git a/tensorflow/core/kernels/cwise_op_asinh.cc b/tensorflow/core/kernels/cwise_op_asinh.cc
index 8d44208aa7d99723670aca20c740cb483479af63..0aec6aac3442a98309e352cf1431b920a87f62fe 100644
--- a/tensorflow/core/kernels/cwise_op_asinh.cc
+++ b/tensorflow/core/kernels/cwise_op_asinh.cc
@@ -1,10 +1,10 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+  /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
-    http://www.apache.org/licenses/LICENSE-2.0
+  http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@@ -22,7 +22,7 @@ REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double,
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Asinh", functor::asinh, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 REGISTER2(UnaryOp, GPU, "Asinh", functor::asinh, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_atanh.cc b/tensorflow/core/kernels/cwise_op_atanh.cc
index bbc69e45aac5bdc8ae8e3d0f78d07065682b8cc5..7b688db4c585b0f8d92f289cae598a78df7e379c 100644
--- a/tensorflow/core/kernels/cwise_op_atanh.cc
+++ b/tensorflow/core/kernels/cwise_op_atanh.cc
@@ -22,7 +22,7 @@ REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double,
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Atanh", functor::atanh, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 REGISTER2(UnaryOp, GPU, "Atanh", functor::atanh, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_bitwise_and.cc b/tensorflow/core/kernels/cwise_op_bitwise_and.cc
index 017a2182dcff0f0121dd6343f1c012802cdf28d1..5a6cf4bad1609cebc0fded4d212e50fb19d22558 100644
--- a/tensorflow/core/kernels/cwise_op_bitwise_and.cc
+++ b/tensorflow/core/kernels/cwise_op_bitwise_and.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER6(BinaryOp, CPU, "BitwiseAnd", functor::bitwise_and, int8, int16, int32,
-          int64, uint8, uint16);
+REGISTER8(BinaryOp, CPU, "BitwiseAnd", functor::bitwise_and, int8, int16, int32,
+          int64, uint8, uint16, uint32, uint64);
 
 #if TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(TYPE)                                      \
@@ -30,13 +30,15 @@ REGISTER_SYCL_KERNEL(int32);
 REGISTER_SYCL_KERNEL(int64);
 REGISTER_SYCL_KERNEL(uint8);
 REGISTER_SYCL_KERNEL(uint16);
+REGISTER_SYCL_KERNEL(uint32);
+REGISTER_SYCL_KERNEL(uint64);
 #undef REGISTER_SYCL_KERNEL
 
 #endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
-REGISTER6(BinaryOp, GPU, "BitwiseAnd", functor::bitwise_and, int8, int16, int32,
-          int64, uint8, uint16);
+REGISTER8(BinaryOp, GPU, "BitwiseAnd", functor::bitwise_and, int8, int16, int32,
+          int64, uint8, uint16, uint32, uint64);
 #endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_bitwise_or.cc b/tensorflow/core/kernels/cwise_op_bitwise_or.cc
index 36f45fe92dfce44c68a778b6c719c45d24bcaa90..201a10198a629b26429393c5c04404175399df73 100644
--- a/tensorflow/core/kernels/cwise_op_bitwise_or.cc
+++ b/tensorflow/core/kernels/cwise_op_bitwise_or.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER6(BinaryOp, CPU, "BitwiseOr", functor::bitwise_or, int8, int16, int32,
-          int64, uint8, uint16);
+REGISTER8(BinaryOp, CPU, "BitwiseOr", functor::bitwise_or, int8, int16, int32,
+          int64, uint8, uint16, uint32, uint64);
 
 #if TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(TYPE)                                     \
@@ -30,13 +30,15 @@ REGISTER_SYCL_KERNEL(int32);
 REGISTER_SYCL_KERNEL(int64);
 REGISTER_SYCL_KERNEL(uint8);
 REGISTER_SYCL_KERNEL(uint16);
+REGISTER_SYCL_KERNEL(uint32);
+REGISTER_SYCL_KERNEL(uint64);
 #undef REGISTER_SYCL_KERNEL
 
 #endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
-REGISTER6(BinaryOp, GPU, "BitwiseOr", functor::bitwise_or, int8, int16, int32,
-          int64, uint8, uint16);
+REGISTER8(BinaryOp, GPU, "BitwiseOr", functor::bitwise_or, int8, int16, int32,
+          int64, uint8, uint16, uint32, uint64);
 #endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_bitwise_xor.cc b/tensorflow/core/kernels/cwise_op_bitwise_xor.cc
index 36432d851d99f20706b7e7f8535e6ac241b00937..2a7cd2699596a7ace6afd5ce688ff2e186650336 100644
--- a/tensorflow/core/kernels/cwise_op_bitwise_xor.cc
+++ b/tensorflow/core/kernels/cwise_op_bitwise_xor.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER6(BinaryOp, CPU, "BitwiseXor", functor::bitwise_xor, int8, int16, int32,
-          int64, uint8, uint16);
+REGISTER8(BinaryOp, CPU, "BitwiseXor", functor::bitwise_xor, int8, int16, int32,
+          int64, uint8, uint16, uint32, uint64);
 
 #if TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(TYPE)                                      \
@@ -30,13 +30,15 @@ REGISTER_SYCL_KERNEL(int32);
 REGISTER_SYCL_KERNEL(int64);
 REGISTER_SYCL_KERNEL(uint8);
 REGISTER_SYCL_KERNEL(uint16);
+REGISTER_SYCL_KERNEL(uint32);
+REGISTER_SYCL_KERNEL(uint64);
 #undef REGISTER_SYCL_KERNEL
 
 #endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
-REGISTER6(BinaryOp, GPU, "BitwiseXor", functor::bitwise_xor, int8, int16, int32,
-          int64, uint8, uint16);
+REGISTER8(BinaryOp, GPU, "BitwiseXor", functor::bitwise_xor, int8, int16, int32,
+          int64, uint8, uint16, uint32, uint64);
 #endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_exp.cc b/tensorflow/core/kernels/cwise_op_exp.cc
index 9d4d65442762b88bb418bc0266b41ae37259e43f..66d7b7d22ebe63bf42da848aa028fcbafc26864b 100644
--- a/tensorflow/core/kernels/cwise_op_exp.cc
+++ b/tensorflow/core/kernels/cwise_op_exp.cc
@@ -20,7 +20,8 @@ REGISTER5(UnaryOp, CPU, "Exp", functor::exp, float, Eigen::half, double,
           complex64, complex128);
 
 #if GOOGLE_CUDA
-REGISTER3(UnaryOp, GPU, "Exp", functor::exp, float, Eigen::half, double);
+REGISTER5(UnaryOp, GPU, "Exp", functor::exp, float, Eigen::half, double,
+          complex64, complex128);
 #endif
 
 #if TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/cwise_op_gpu_bitwise_and.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_bitwise_and.cu.cc
index 27f973c90d73a1d7828ce180254363a0b7b4be76..3fbf69c114d3c546eafb9f6c504568a649c52e59 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_bitwise_and.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_bitwise_and.cu.cc
@@ -19,7 +19,8 @@ limitations under the License.
 
 namespace tensorflow {
 namespace functor {
-DEFINE_BINARY6(bitwise_and, int8, int16, int32, int64, uint8, uint16);
+DEFINE_BINARY8(bitwise_and, int8, int16, int32, int64, uint8, uint16, uint32,
+               uint64);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_op_gpu_bitwise_or.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_bitwise_or.cu.cc
index a34c3a52cd6253527c67d2d1f8c1498756ff5be8..8bcb82266a2d3567c0f8d79b2fdccd5916b2ecbb 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_bitwise_or.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_bitwise_or.cu.cc
@@ -19,7 +19,8 @@ limitations under the License.
 
 namespace tensorflow {
 namespace functor {
-DEFINE_BINARY6(bitwise_or, int8, int16, int32, int64, uint8, uint16);
+DEFINE_BINARY8(bitwise_or, int8, int16, int32, int64, uint8, uint16, uint32,
+               uint64);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_op_gpu_bitwise_xor.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_bitwise_xor.cu.cc
index a4531ab7c6f283f8e732dbc87b3c64d93a8a5bef..e62a87aba44eea0fc5b1cf13a74ddfed2ef294b6 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_bitwise_xor.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_bitwise_xor.cu.cc
@@ -19,7 +19,8 @@ limitations under the License.
 
 namespace tensorflow {
 namespace functor {
-DEFINE_BINARY6(bitwise_xor, int8, int16, int32, int64, uint8, uint16);
+DEFINE_BINARY8(bitwise_xor, int8, int16, int32, int64, uint8, uint16, uint32,
+               uint64);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_op_gpu_exp.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_exp.cu.cc
index 0f492917bd54cc5b518e7fe76a8dd08b3934d1da..417e5da7588221b190d11092b6e03787a0dd15d4 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_exp.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_exp.cu.cc
@@ -19,7 +19,7 @@ limitations under the License.
 
 namespace tensorflow {
 namespace functor {
-DEFINE_UNARY3(exp, Eigen::half, float, double);
+DEFINE_UNARY5(exp, Eigen::half, float, double, complex64, complex128);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
index d32185b6bf48f7b6d49f355c0653004310bde533..24c6e6361d7032e60b6da3141caed84751615686 100644
--- a/tensorflow/core/kernels/cwise_ops.h
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -26,24 +26,26 @@ limitations under the License.
 #include "tensorflow/core/kernels/bounds_check.h"
 
 namespace Eigen {
-namespace internal {
+namespace numext {
+#if GOOGLE_CUDA
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+std::complex<float> exp(const std::complex<float> &x) {
+  auto com = ::expf(x.real());
+  auto res_real = com * ::cosf(x.imag());
+  auto res_imag = com * ::sinf(x.imag());
+  return std::complex<float>(res_real, res_imag);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+std::complex<double> exp(const std::complex<double> &x) {
+  auto com = ::exp(x.real());
+  auto res_real = com * ::cos(x.imag());
+  auto res_imag = com * ::sin(x.imag());
+  return std::complex<double>(res_real, res_imag);
+}
+#endif
+}
 
-// TODO(rmlarsen): Get rid of fmod2 once fmod is upstreamed to Eigen.
-template <typename T>
-struct scalar_fmod2_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod2_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a,
-                                                           const T& b) const {
-    return std::fmod(a, b);
-  }
-};
-template <typename T>
-struct functor_traits<scalar_fmod2_op<T>> {
-  enum {
-    Cost = 13,  // Reciprocal throughput of FPREM on Haswell.
-    PacketAccess = false,
-  };
-};
+namespace internal {
 
 template <typename T>
 struct scalar_asinh_op {
@@ -702,7 +704,7 @@ struct safe_div : base<T, Eigen::internal::safe_div_or_mod_op<
 };
 
 template <typename T>
-struct fmod : base<T, Eigen::internal::scalar_fmod2_op<T>> {};
+struct fmod : base<T, Eigen::internal::scalar_fmod_op<T>> {};
 
 template <typename T>
 struct mod : base<T, Eigen::internal::scalar_mod2_op<T>> {};
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..58cf36f454c7df93f1b19e7249aa1309854080e8
--- /dev/null
+++ b/tensorflow/core/kernels/data/BUILD
@@ -0,0 +1,531 @@
+# Description:
+#   OpKernels for tf.data
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_kernel_library",
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+cc_library(
+    name = "stats_aggregator",
+    hdrs = ["stats_aggregator.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_kernel_library(
+    name = "stats_aggregator_ops",
+    srcs = ["stats_aggregator_ops.cc"],
+    deps = [
+        ":stats_aggregator",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "dataset",
+    srcs = ["dataset.cc"],
+    hdrs = ["dataset.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "dataset_utils",
+    srcs = ["dataset_utils.cc"],
+    hdrs = ["dataset_utils.h"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+cc_library(
+    name = "captured_function",
+    srcs = ["captured_function.cc"],
+    hdrs = ["captured_function.h"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:proto_text",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:session_options",
+        "//tensorflow/core/kernels:variable_ops",
+    ],
+)
+
+cc_library(
+    name = "window_dataset",
+    srcs = ["window_dataset.cc"],
+    hdrs = ["window_dataset.h"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "batch_dataset_op",
+    srcs = ["batch_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:batch_util",
+    ],
+)
+
+tf_kernel_library(
+    name = "padded_batch_dataset_op",
+    srcs = ["padded_batch_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "dense_to_sparse_batch_dataset_op",
+    srcs = ["dense_to_sparse_batch_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "group_by_window_dataset_op",
+    srcs = ["group_by_window_dataset_op.cc"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        ":window_dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "filter_dataset_op",
+    srcs = ["filter_dataset_op.cc"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "map_dataset_op",
+    srcs = ["map_dataset_op.cc"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "map_and_batch_dataset_op",
+    srcs = ["map_and_batch_dataset_op.cc"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:inplace_ops",
+    ],
+)
+
+tf_kernel_library(
+    name = "parallel_map_dataset_op",
+    srcs = ["parallel_map_dataset_op.cc"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "scan_dataset_op",
+    srcs = ["scan_dataset_op.cc"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "flat_map_dataset_op",
+    srcs = ["flat_map_dataset_op.cc"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        ":dataset_utils",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "interleave_dataset_op",
+    srcs = ["interleave_dataset_op.cc"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        ":dataset_utils",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "parallel_interleave_dataset_op",
+    srcs = ["parallel_interleave_dataset_op.cc"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        ":dataset_utils",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "prefetch_dataset_op",
+    srcs = ["prefetch_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "repeat_dataset_op",
+    srcs = ["repeat_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "take_dataset_op",
+    srcs = ["take_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "skip_dataset_op",
+    srcs = ["skip_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "ignore_errors_dataset_op",
+    srcs = ["ignore_errors_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "stats_dataset_ops",
+    srcs = ["stats_dataset_ops.cc"],
+    deps = [
+        ":dataset",
+        ":stats_aggregator",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "random_dataset_op",
+    srcs = ["random_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "range_dataset_op",
+    srcs = ["range_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "shuffle_dataset_op",
+    srcs = ["shuffle_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "sparse_tensor_slice_dataset_op",
+    srcs = ["sparse_tensor_slice_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "tensor_dataset_op",
+    srcs = ["tensor_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "tensor_slice_dataset_op",
+    srcs = ["tensor_slice_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:batch_util",
+    ],
+)
+
+tf_kernel_library(
+    name = "zip_dataset_op",
+    srcs = ["zip_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "concatenate_dataset_op",
+    srcs = ["concatenate_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "reader_dataset_ops",
+    srcs = ["reader_dataset_ops.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "sql_dataset_ops",
+    srcs = [
+        "sql_dataset_ops.cc",
+    ],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels/data/sql",
+    ],
+)
+
+tf_kernel_library(
+    name = "iterator_ops",
+    srcs = ["iterator_ops.cc"],
+    deps = [
+        ":dataset",
+        ":stats_aggregator",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/kernels:ops_util",
+    ],
+)
+
+tf_kernel_library(
+    name = "cache_dataset_ops",
+    srcs = ["cache_dataset_ops.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/util/tensor_bundle",
+    ],
+)
+
+tf_kernel_library(
+    name = "dataset_ops",
+    deps = [
+        ":batch_dataset_op",
+        ":cache_dataset_ops",
+        ":concatenate_dataset_op",
+        ":dense_to_sparse_batch_dataset_op",
+        ":filter_dataset_op",
+        ":flat_map_dataset_op",
+        ":group_by_window_dataset_op",
+        ":ignore_errors_dataset_op",
+        ":interleave_dataset_op",
+        ":iterator_ops",
+        ":map_and_batch_dataset_op",
+        ":map_dataset_op",
+        ":padded_batch_dataset_op",
+        ":parallel_interleave_dataset_op",
+        ":parallel_map_dataset_op",
+        ":prefetch_dataset_op",
+        ":random_dataset_op",
+        ":range_dataset_op",
+        ":reader_dataset_ops",
+        ":repeat_dataset_op",
+        ":scan_dataset_op",
+        ":shuffle_dataset_op",
+        ":skip_dataset_op",
+        ":sparse_tensor_slice_dataset_op",
+        ":sql_dataset_ops",
+        ":stats_aggregator_ops",
+        ":stats_dataset_ops",
+        ":take_dataset_op",
+        ":tensor_dataset_op",
+        ":tensor_slice_dataset_op",
+        ":zip_dataset_op",
+    ],
+)
diff --git a/tensorflow/core/kernels/batch_dataset_op.cc b/tensorflow/core/kernels/data/batch_dataset_op.cc
similarity index 80%
rename from tensorflow/core/kernels/batch_dataset_op.cc
rename to tensorflow/core/kernels/data/batch_dataset_op.cc
index 46412a554b34d22a9e261aaec328d48b0f250c82..876f76fb43d1a8776378458b6794a0508a5df8f6 100644
--- a/tensorflow/core/kernels/batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/batch_dataset_op.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/batch_util.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
@@ -92,44 +92,6 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
     }
 
    private:
-    // Copies element into the index^th slice of parent (in the 0th dimension).
-    //
-    // TODO(mrry): Reconcile this method with the similar method in
-    // the queue implementation.
-    template <typename T>
-    static Status HandleElementToSlice(const Tensor& element, Tensor* parent,
-                                       int64 index) {
-      if (element.NumElements() !=
-          (parent->NumElements() / parent->dim_size(0))) {
-        TensorShape chip_shape = parent->shape();
-        chip_shape.RemoveDim(0);
-        return errors::InvalidArgument(
-            "HandleElementToSlice Cannot copy slice: number of elements does "
-            "not match. Shapes are: [element]: ",
-            element.shape().DebugString(),
-            ", [parent slice]: ", chip_shape.DebugString());
-      }
-      auto parent_as_matrix = parent->flat_outer_dims<T>();
-      parent_as_matrix.chip(index, 0) = element.flat<T>();
-      return Status::OK();
-    }
-
-    // Copies element into the index^th slice of parent (in the 0th dimension).
-    static Status CopyElementToSlice(const Tensor& element, Tensor* parent,
-                                     int64 index) {
-#define HANDLE_TYPE(T)                                      \
-  case DataTypeToEnum<T>::value: {                          \
-    return HandleElementToSlice<T>(element, parent, index); \
-  }
-
-      switch (element.dtype()) {
-        TF_CALL_DATASET_TYPES(HANDLE_TYPE);
-#undef HANDLE_TYPE
-        default:
-          return errors::Unimplemented(
-              "CopyElementToSlice Unhandled data type: ", element.dtype());
-      }
-    }
 
     class Iterator : public DatasetIterator<Dataset> {
      public:
@@ -188,8 +150,19 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
           // Build the output tuple component by copying one slice
           // from each input element in the batch.
           for (size_t i = 0; i < num_batch_elements; ++i) {
-            TF_RETURN_IF_ERROR(CopyElementToSlice(
-                batch_elements[i][component_index], &batch_component, i));
+            if (batch_elements[i][component_index].shape() !=
+                first_element.shape()) {
+              return errors::InvalidArgument(
+                  "Cannot batch tensors with different shapes in component ",
+                  component_index, ". First element had shape ",
+                  first_element.shape().DebugString(), " and element ", i,
+                  " had shape ",
+                  batch_elements[i][component_index].shape().DebugString(),
+                  ".");
+            }
+            TF_RETURN_IF_ERROR(batch_util::CopyElementToSlice(
+                std::move(batch_elements[i][component_index]), &batch_component,
+                i));
           }
           out_tensors->emplace_back(std::move(batch_component));
         }
diff --git a/tensorflow/core/kernels/cache_dataset_ops.cc b/tensorflow/core/kernels/data/cache_dataset_ops.cc
similarity index 99%
rename from tensorflow/core/kernels/cache_dataset_ops.cc
rename to tensorflow/core/kernels/data/cache_dataset_ops.cc
index 137002b9d77a18fbd5660eb06bcf69d0c4ad3f13..f0a2192826e051586e4999d729c24ed5495be0ea 100644
--- a/tensorflow/core/kernels/cache_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/cache_dataset_ops.cc
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/util/tensor_bundle/tensor_bundle.h"
diff --git a/tensorflow/core/kernels/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc
similarity index 50%
rename from tensorflow/core/kernels/captured_function.cc
rename to tensorflow/core/kernels/data/captured_function.cc
index 00cdc1eff2d3003cb55e868389033f8504e01588..17ee1db407dfc5bb37e371242b75dd195f955b3a 100644
--- a/tensorflow/core/kernels/captured_function.cc
+++ b/tensorflow/core/kernels/data/captured_function.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/captured_function.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
 
 #include <utility>
 
@@ -24,8 +24,9 @@ limitations under the License.
 #include "tensorflow/core/framework/queue_interface.h"
 #include "tensorflow/core/framework/reader_interface.h"
 #include "tensorflow/core/framework/resource_handle.pb_text.h"
-#include "tensorflow/core/kernels/dataset.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/kernels/variable_ops.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/notification.h"
 #include "tensorflow/core/public/session_options.h"
 
@@ -113,70 +114,220 @@ Status CapturedFunction::Create(
   FunctionLibraryRuntime::Handle f_handle;
   TF_RETURN_IF_ERROR(
       lib->Instantiate(func.name(), AttrSlice(&func.attr()), &f_handle));
+  const FunctionBody* fbody = lib->GetFunctionBody(f_handle);
+  if (fbody == nullptr) {
+    return errors::Internal("Failed to instantiate function body.");
+  }
 
   out_function->reset(new CapturedFunction(
       device, std::move(device_mgr), std::move(flib_def), std::move(pflr), lib,
-      f_handle, std::move(captured_inputs)));
+      f_handle, std::move(captured_inputs), fbody->ret_types));
   return Status::OK();
 }
 
+namespace {
+class CallFrameBase : public CallFrameInterface {
+ public:
+  explicit CallFrameBase(DataTypeSlice ret_types)
+      : ret_types_(ret_types), retvals_(ret_types.size()) {}
+
+  // Caller methods.
+  Status ConsumeRetvals(std::vector<Tensor>* retvals) {
+    retvals->reserve(retvals_.size());
+    int i = 0;
+    for (auto&& val : retvals_) {
+      if (!val) {
+        return errors::Internal("No return value for index ", i, ".");
+      }
+      retvals->emplace_back(std::move(val.value()));
+      ++i;
+    }
+    return Status::OK();
+  }
+
+  size_t num_retvals() const override { return retvals_.size(); }
+
+  // Callee methods.
+  Status SetRetval(int index, const Tensor& val) override {
+    if (index < retvals_.size() && val.dtype() == ret_types_[index] &&
+        !retvals_[index]) {
+      retvals_[index] = val;
+      return Status::OK();
+    } else if (index >= retvals_.size()) {
+      return errors::InvalidArgument("Return value ", index,
+                                     " is out of range.");
+    } else if (val.dtype() != ret_types_[index]) {
+      return errors::InvalidArgument("Expected type ",
+                                     DataTypeString(ret_types_[index]),
+                                     " for return value ", index, " but got ",
+                                     DataTypeString(val.dtype()), ".");
+    } else {
+      return errors::Internal("Attempted to set return value ", index,
+                              " more than once.");
+    }
+  }
+
+ private:
+  DataTypeSlice ret_types_;
+  std::vector<gtl::optional<Tensor>> retvals_;
+  TF_DISALLOW_COPY_AND_ASSIGN(CallFrameBase);
+};
+
+class OwnedArgsCallFrame : public CallFrameBase {
+ public:
+  OwnedArgsCallFrame(std::vector<Tensor>&& args,
+                     const std::vector<Tensor>* captured_inputs,
+                     DataTypeSlice ret_types)
+      : CallFrameBase(ret_types),
+        args_(std::move(args)),
+        captured_inputs_(captured_inputs) {}
+
+  size_t num_args() const override {
+    return args_.size() + captured_inputs_->size();
+  }
+
+  // Callee methods.
+  Status GetArg(int index, Tensor* val) const override {
+    if (index < args_.size() && args_[index].IsInitialized()) {
+      // TODO(mrry): Consider making `CallFrameInterface::GetArg` non-const in
+      // order to be able to `std::move(args_[index])` into `*val`.
+      *val = args_[index];
+      return Status::OK();
+    } else if (index < args_.size() + captured_inputs_->size()) {
+      *val = (*captured_inputs_)[index - args_.size()];
+      return Status::OK();
+    } else if (index >= args_.size() + captured_inputs_->size()) {
+      return errors::InvalidArgument("Argument ", index, " is out of range.");
+    } else {
+      return errors::Internal("Attempted to get argument ", index,
+                              " more than once.");
+    }
+  }
+
+ private:
+  std::vector<Tensor> args_;
+  const std::vector<Tensor>* const captured_inputs_;  // Not owned.
+};
+
+class BorrowedArgsCallFrame : public CallFrameBase {
+ public:
+  BorrowedArgsCallFrame(const std::vector<Tensor>& args,
+                        const std::vector<Tensor>* captured_inputs,
+                        DataTypeSlice ret_types)
+      : CallFrameBase(ret_types),
+        args_(args),
+        captured_inputs_(captured_inputs) {}
+
+  size_t num_args() const override {
+    return args_.size() + captured_inputs_->size();
+  }
+
+  // Callee methods.
+  Status GetArg(int index, Tensor* val) const override {
+    if (index < args_.size() && args_[index].IsInitialized()) {
+      *val = args_[index];
+      return Status::OK();
+    } else if (index < args_.size() + captured_inputs_->size()) {
+      *val = (*captured_inputs_)[index - args_.size()];
+      return Status::OK();
+    } else if (index >= args_.size() + captured_inputs_->size()) {
+      return errors::InvalidArgument("Argument ", index, " is out of range.");
+    } else {
+      return errors::Internal("Attempted to get argument ", index,
+                              " more than once.");
+    }
+  }
+
+ private:
+  const std::vector<Tensor>& args_;                   // Not owned.
+  const std::vector<Tensor>* const captured_inputs_;  // Not owned.
+};
+
+}  // namespace
+
 Status CapturedFunction::Run(FunctionLibraryRuntime::Options f_opts,
-                             gtl::ArraySlice<Tensor> args,
+                             std::vector<Tensor>&& args,
                              std::vector<Tensor>* rets) {
+  // TODO(mrry): Add cancellation manager support to IteratorContext
+  // so that we can cancel running map functions. The local
+  // cancellation manager here is created so that we can run kernels
+  // (such as queue kernels) that depend on the non-nullness of
+  // `OpKernelContext::cancellation_manager()`, but additional effort
+  // will be required to plumb it through the `IteratorContext`.
+  auto c_mgr = new CancellationManager;
+  auto frame =
+      new OwnedArgsCallFrame(std::move(args), &captured_inputs_, ret_types_);
+  f_opts.cancellation_manager = c_mgr;
   Notification n;
   Status s;
-  auto done_callback = [&n, &s](Status func_status) {
-    s.Update(func_status);
-    n.Notify();
-  };
+  lib_->Run(f_opts, f_handle_, frame,
+            [rets, c_mgr, frame, &n, &s](Status func_status) {
+              delete c_mgr;
+              s.Update(func_status);
+              if (s.ok()) {
+                s = frame->ConsumeRetvals(rets);
+              }
+              delete frame;
+              n.Notify();
+            });
+  n.WaitForNotification();
+  return s;
+}
+
+Status CapturedFunction::RunWithBorrowedArgs(
+    FunctionLibraryRuntime::Options f_opts, const std::vector<Tensor>& args,
+    std::vector<Tensor>* rets) {
   // TODO(mrry): Add cancellation manager support to IteratorContext
   // so that we can cancel running map functions. The local
   // cancellation manager here is created so that we can run kernels
-  // (such as queue kernels) that depend on the non-nullness
+  // (such as queue kernels) that depend on the non-nullness of
   // `OpKernelContext::cancellation_manager()`, but additional effort
   // will be required to plumb it through the `IteratorContext`.
-  CancellationManager c_mgr;
-  f_opts.cancellation_manager = &c_mgr;
-  RunHelper(std::move(f_opts), args, rets, std::move(done_callback));
+  auto c_mgr = new CancellationManager;
+  BorrowedArgsCallFrame frame(args, &captured_inputs_, ret_types_);
+  f_opts.cancellation_manager = c_mgr;
+  Notification n;
+  Status s;
+  lib_->Run(f_opts, f_handle_, &frame,
+            [rets, c_mgr, &frame, &n, &s](Status func_status) {
+              delete c_mgr;
+              s.Update(func_status);
+              if (s.ok()) {
+                s = frame.ConsumeRetvals(rets);
+              }
+              n.Notify();
+            });
   n.WaitForNotification();
   return s;
 }
 
 void CapturedFunction::RunAsync(FunctionLibraryRuntime::Options f_opts,
-                                gtl::ArraySlice<Tensor> args,
+                                std::vector<Tensor>&& args,
                                 std::vector<Tensor>* rets,
                                 FunctionLibraryRuntime::DoneCallback done) {
+  // TODO(mrry): Add cancellation manager support to IteratorContext
+  // so that we can cancel running map functions. The local
+  // cancellation manager here is created so that we can run kernels
+  // (such as queue kernels) that depend on the non-nullness of
+  // `OpKernelContext::cancellation_manager()`, but additional effort
+  // will be required to plumb it through the `IteratorContext`.
   auto c_mgr = new CancellationManager;
+  auto frame =
+      new OwnedArgsCallFrame(std::move(args), &captured_inputs_, ret_types_);
   f_opts.cancellation_manager = c_mgr;
-  FunctionLibraryRuntime::DoneCallback wrapped_done = std::bind(
-      [c_mgr](FunctionLibraryRuntime::DoneCallback done,
-              // Begin unbound arguments.
-              Status s) {
-        delete c_mgr;
-        done(s);
-      },
-      std::move(done), std::placeholders::_1);
-  RunHelper(std::move(f_opts), args, rets, std::move(wrapped_done));
-}
-
-void CapturedFunction::RunHelper(FunctionLibraryRuntime::Options f_opts,
-                                 gtl::ArraySlice<Tensor> args,
-                                 std::vector<Tensor>* rets,
-                                 FunctionLibraryRuntime::DoneCallback done) {
-  // TODO(mrry): Implement a synchronous version of
-  // FunctionLibraryRuntime::Run() that avoids a context switch for small
-  // functions.
-  if (captured_inputs_.empty()) {
-    lib_->Run(f_opts, f_handle_, args, rets, std::move(done));
-  } else {
-    std::vector<Tensor> args_with_captured;
-    args_with_captured.reserve(args.size() + captured_inputs_.size());
-    args_with_captured.insert(args_with_captured.end(), args.begin(),
-                              args.end());
-    args_with_captured.insert(args_with_captured.end(),
-                              captured_inputs_.begin(), captured_inputs_.end());
-    lib_->Run(f_opts, f_handle_, args_with_captured, rets, std::move(done));
-  }
+  lib_->Run(f_opts, f_handle_, frame,
+            std::bind(
+                [rets, c_mgr, frame](FunctionLibraryRuntime::DoneCallback done,
+                                     // Begin unbound arguments.
+                                     Status s) {
+                  delete c_mgr;
+                  if (s.ok()) {
+                    s = frame->ConsumeRetvals(rets);
+                  }
+                  delete frame;
+                  done(s);
+                },
+                std::move(done), std::placeholders::_1));
 }
 
 CapturedFunction::CapturedFunction(
@@ -184,13 +335,14 @@ CapturedFunction::CapturedFunction(
     std::unique_ptr<FunctionLibraryDefinition> flib_def,
     std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
     FunctionLibraryRuntime* lib, FunctionLibraryRuntime::Handle f_handle,
-    std::vector<Tensor> captured_inputs)
+    std::vector<Tensor> captured_inputs, DataTypeSlice ret_types)
     : device_(device),
       device_mgr_(std::move(device_mgr)),
       flib_def_(std::move(flib_def)),
       pflr_(std::move(pflr)),
       lib_(lib),
       f_handle_(f_handle),
-      captured_inputs_(std::move(captured_inputs)) {}
+      captured_inputs_(std::move(captured_inputs)),
+      ret_types_(ret_types) {}
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/captured_function.h b/tensorflow/core/kernels/data/captured_function.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f62b74470da407c46e4a8426c059f1125b3ab94
--- /dev/null
+++ b/tensorflow/core/kernels/data/captured_function.h
@@ -0,0 +1,127 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_CAPTURED_FUNCTION_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_CAPTURED_FUNCTION_H_
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+
+class Device;
+class OpKernelContext;
+class ResourceMgr;
+
+// A `CapturedFunction` encapsulates a TensorFlow function and all of
+// the runtime support required to execute it.
+//
+// The `Dataset`-related classes use `CapturedFunction` to execute
+// TensorFlow functions outside a the normal `OpKernel::Compute()`
+// context.
+//
+// NOTE(mrry): Here we are taking a conservative approach to dealing with
+// ownership of the various framework and runtime objects that are needed
+// to execute functions. We copy the function library *definition* (i.e.
+// a set of FunctionDefs) out of this kernel's context's function library
+// *runtime*, then we use that together with a specially-created
+// ThreadPoolDevice to build a new FunctionLibraryRuntime for the Dataset.
+//
+// We need to do this (or refactor the ownership of framework components
+// in each of the session implementations) to make it possible to close
+// down a ParallelMapDataset::Iterator when its session is closed.
+//
+// TODO(mrry): Clean this up. Investigate whether it would be possible to
+// reuse the session's FunctionLibraryRuntime(s) or Device(s).
+class CapturedFunction {
+ public:
+  // NOTE(mrry): The `captured_inputs` are passed by value. For
+  // efficiency, you are recommended to move this argument into the call.
+  static Status Create(OpKernelContext* ctx, const NameAttrList& func,
+                       int graph_def_version,
+                       std::vector<Tensor> captured_inputs,
+                       std::unique_ptr<CapturedFunction>* out_function);
+
+  // Synchronously runs the captured function on the given `args`, and stores
+  // the results in `*rets`. This method takes ownership of the tensors in
+  // `args`, in order to be able to deallocate them as early as possible.
+  // Use `RunWithBorrowedArgs()` if the caller needs to retain ownership of
+  // the `args`.
+  Status Run(FunctionLibraryRuntime::Options f_opts, std::vector<Tensor>&& args,
+             std::vector<Tensor>* rets);
+
+  // Synchronously runs the captured function on the given `args`, and stores
+  // the results in `*rets`. Prefer to use `Run()` or `RunAsync()` when
+  // possible.
+  Status RunWithBorrowedArgs(FunctionLibraryRuntime::Options f_opts,
+                             const std::vector<Tensor>& args,
+                             std::vector<Tensor>* rets);
+
+  // Asynchronously runs the captured function on the given `args`, stores
+  // the results in `*rets`, and calls the given `done` callback when the
+  // function returns. This method takes ownership of the tensors in `args`,
+  // in order to be able to deallocate them as early as possible.
+  void RunAsync(FunctionLibraryRuntime::Options f_opts,
+                std::vector<Tensor>&& args, std::vector<Tensor>* rets,
+                FunctionLibraryRuntime::DoneCallback done);
+
+  // Returns a borrowed pointer to the `ResourceManager` used when this
+  // function is run.
+  ResourceMgr* resource_manager() const { return device_->resource_manager(); }
+
+  // Returns that additional captured inputs that will be passed to the function
+  // when `Run*()` is called.
+  const std::vector<Tensor>& captured_inputs() { return captured_inputs_; }
+
+  // Returns a step ID for use when running a `CapturedFunction`.
+  static int64 generate_step_id() {
+    // Choose a step ID that is guaranteed not to clash with any
+    // Session-generated step ID. DirectSession only generates
+    // non-negative step IDs (contiguous, starting from 0), and
+    // MasterSession generates 56-bit random step IDs whose MSB is
+    // always 0, so a negative random step ID should suffice.
+    return -std::abs(static_cast<int64>(random::New64()));
+  }
+
+ private:
+  CapturedFunction(Device* device, std::unique_ptr<DeviceMgr> device_mgr,
+                   std::unique_ptr<FunctionLibraryDefinition> flib_def,
+                   std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
+                   FunctionLibraryRuntime* lib,
+                   FunctionLibraryRuntime::Handle f_handle,
+                   std::vector<Tensor> captured_inputs,
+                   DataTypeSlice ret_types);
+
+  Device* const device_;  // owned by device_mgr_.
+  const std::unique_ptr<DeviceMgr> device_mgr_;
+  const std::unique_ptr<FunctionLibraryDefinition> flib_def_;
+  const std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
+  FunctionLibraryRuntime* const lib_;  // owned by pflr_.
+  const FunctionLibraryRuntime::Handle f_handle_;
+  const std::vector<Tensor> captured_inputs_;
+  DataTypeSlice ret_types_;  // owned by pflr_.
+
+  TF_DISALLOW_COPY_AND_ASSIGN(CapturedFunction);
+};
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_CAPTURED_FUNCTION_H_
diff --git a/tensorflow/core/kernels/concatenate_dataset_op.cc b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
similarity index 99%
rename from tensorflow/core/kernels/concatenate_dataset_op.cc
rename to tensorflow/core/kernels/data/concatenate_dataset_op.cc
index ad78ba01869a862d496d66b8dcac1243cf09fe84..24efadfd477da39fd046d23a4a2a19f65f270d64 100644
--- a/tensorflow/core/kernels/concatenate_dataset_op.cc
+++ b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/data/dataset.cc b/tensorflow/core/kernels/data/dataset.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2ea6875567604e4e5bf7c990ad6a42ed8c5dafaa
--- /dev/null
+++ b/tensorflow/core/kernels/data/dataset.cc
@@ -0,0 +1,271 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/graph/node_builder.h"
+
+namespace tensorflow {
+
+namespace {
+
+// A wrapper class for storing a `DatasetBase` instance in a DT_VARIANT tensor.
+// Objects of the wrapper class own a reference on an instance of `DatasetBase`,
+// and the wrapper's copy constructor and destructor take care of managing the
+// reference count.
+//
+// NOTE(mrry): This is not a feature-complete implementation of the DT_VARIANT
+// specification. In particular, we cannot currently serialize an arbitrary
+// `DatasetBase` object, so the `Encode()` and `Decode()` methods are not
+// implemented.
+class DatasetVariantWrapper {
+ public:
+  DatasetVariantWrapper() : dataset_(nullptr) {}
+
+  // Transfers ownership of `dataset` to `*this`.
+  explicit DatasetVariantWrapper(DatasetBase* dataset) : dataset_(dataset) {}
+
+  DatasetVariantWrapper(const DatasetVariantWrapper& other)
+      : dataset_(other.dataset_) {
+    if (dataset_) dataset_->Ref();
+  }
+
+  ~DatasetVariantWrapper() {
+    if (dataset_) dataset_->Unref();
+  }
+
+  DatasetBase* get() const { return dataset_; }
+
+  string TypeName() const { return "tensorflow::DatasetVariantWrapper"; }
+  string DebugString() const {
+    if (dataset_) {
+      return dataset_->DebugString();
+    } else {
+      return "<Uninitialized DatasetVariantWrapper>";
+    }
+  }
+  void Encode(VariantTensorData* data) const {
+    LOG(ERROR) << "The Encode() method is not implemented for "
+                  "DatasetVariantWrapper objects.";
+  }
+  bool Decode(const VariantTensorData& data) {
+    LOG(ERROR) << "The Decode() method is not implemented for "
+                  "DatasetVariantWrapper objects.";
+    return false;
+  }
+
+ private:
+  DatasetBase* const dataset_;  // Owns one reference.
+};
+
+}  // namespace
+
+Status GraphDefBuilderWrapper::AddDataset(
+    const GraphDatasetBase* dataset,
+    const std::vector<std::pair<size_t, Node*>>& inputs,
+    const std::vector<std::pair<size_t, gtl::ArraySlice<Node*>>>& list_inputs,
+    const std::vector<std::pair<StringPiece, AttrValue>>& attrs,
+    Node** output) {
+  const string& op_type_name = dataset->op_name();
+  std::unique_ptr<const GraphDefBuilder::Options> opts(
+      new GraphDefBuilder::Options(b_->opts()));
+  // TODO(srbs|mrry): Not all datasets have output_types and output_shapes
+  // attributes defined. It will be nice to have a consistent pattern.
+  bool has_output_types_attr = HasAttr(op_type_name, "output_types");
+  bool has_output_shapes_attr = HasAttr(op_type_name, "output_shapes");
+  if (has_output_shapes_attr) {
+    opts.reset(new GraphDefBuilder::Options(
+        opts->WithAttr("output_shapes", dataset->output_shapes())));
+  }
+  if (has_output_types_attr) {
+    opts.reset(new GraphDefBuilder::Options(
+        opts->WithAttr("output_types", dataset->output_dtypes())));
+  }
+  for (auto attr : attrs) {
+    opts.reset(
+        new GraphDefBuilder::Options(opts->WithAttr(attr.first, attr.second)));
+  }
+  if (opts->HaveError()) {
+    return errors::Internal("AddDataset: Failed to build Options with error ",
+                            opts->StatusToString());
+  }
+  NodeBuilder node_builder(opts->GetNameForOp(op_type_name), op_type_name,
+                           opts->op_registry());
+  {
+    size_t total_size = inputs.size() + list_inputs.size();
+    auto inputs_iter = inputs.begin();
+    auto list_inputs_iter = list_inputs.begin();
+    for (int i = 0; i < total_size; i++) {
+      if (inputs_iter != inputs.end() && inputs_iter->first == i) {
+        node_builder.Input(NodeBuilder::NodeOut(inputs_iter->second));
+        inputs_iter++;
+      } else if (list_inputs_iter != list_inputs.end() &&
+                 list_inputs_iter->first == i) {
+        std::vector<NodeBuilder::NodeOut> nodeout_inputs;
+        nodeout_inputs.reserve(list_inputs_iter->second.size());
+        for (Node* n : list_inputs_iter->second) {
+          nodeout_inputs.emplace_back(n);
+        }
+        node_builder.Input(nodeout_inputs);
+        list_inputs_iter++;
+      } else {
+        return errors::InvalidArgument("No input found for index ", i);
+      }
+    }
+  }
+  *output = opts->FinalizeBuilder(&node_builder);
+  if (*output == nullptr) {
+    return errors::Internal("AddDataset: Failed to build ", op_type_name,
+                            " op with error ", opts->StatusToString());
+  }
+  return Status::OK();
+}
+
+Status GraphDefBuilderWrapper::AddFunction(OpKernelContext* ctx,
+                                           const string& function_name) {
+  if (b_->HasFunction(function_name)) {
+    LOG(INFO) << "Function with name " << function_name << "already exists in"
+              << " the graph. It will not be added again.";
+    return Status::OK();
+  }
+  TF_RETURN_IF_ERROR(EnsureFunctionIsStateless(ctx, function_name));
+  const FunctionLibraryDefinition* flib_def =
+      ctx->function_library()->GetFunctionLibraryDefinition();
+  const FunctionDef* f_def = flib_def->Find(function_name);
+  if (f_def == nullptr) {
+    return errors::InvalidArgument("Unable to find FunctionDef for ",
+                                   function_name, " in the registry.");
+  }
+  FunctionDefLibrary def;
+  *def.add_function() = *f_def;
+  const string gradient_func = flib_def->FindGradient(function_name);
+  if (!gradient_func.empty()) {
+    GradientDef* g_def = def.add_gradient();
+    g_def->set_function_name(function_name);
+    g_def->set_gradient_func(gradient_func);
+  }
+  TF_RETURN_IF_ERROR(b_->AddFunctionLibrary(def));
+
+  // Recursively add functions in inputs of function_name.
+  for (const NodeDef& node_def : f_def->node_def()) {
+    const OpRegistrationData* op_reg_data = nullptr;
+    TF_RETURN_IF_ERROR(flib_def->LookUp(node_def.op(), &op_reg_data));
+    if (op_reg_data->is_function_op) {
+      TF_RETURN_IF_ERROR(AddFunction(ctx, op_reg_data->op_def.name()));
+    }
+    // Recursively add functions in attrs of this NodeDef.
+    for (const auto& pair : node_def.attr()) {
+      TF_RETURN_IF_ERROR(AddAttrFunctions(pair.second, ctx));
+    }
+  }
+
+  // Recursively add functions in attrs of function_name.
+  for (auto iter = f_def->attr().begin(); iter != f_def->attr().end(); iter++) {
+    TF_RETURN_IF_ERROR(AddAttrFunctions(iter->second, ctx));
+  }
+  return Status::OK();
+}
+
+void GraphDefBuilderWrapper::AddTensorInternal(const Tensor& val,
+                                               Node** output) {
+  *output = ops::SourceOp(
+      "Const",
+      b_->opts().WithAttr("dtype", val.dtype()).WithAttr("value", val));
+}
+
+bool GraphDefBuilderWrapper::HasAttr(const string& op_type_name,
+                                     const string& attr_name) const {
+  const OpDef* op_def = nullptr;
+  Status s = b_->opts().op_registry()->LookUpOpDef(op_type_name, &op_def);
+  if (!s.ok() || op_def == nullptr) {
+    return false;
+  }
+  return HasAttr(op_def, attr_name);
+}
+
+Status GraphDatasetBase::Serialize(OpKernelContext* ctx,
+                                   string* serialized_graph_def,
+                                   string* output_node) const {
+  GraphDefBuilder b;
+  DatasetGraphDefBuilder db(&b);
+  Node* node = nullptr;
+  TF_RETURN_IF_ERROR(AsGraphDefInternal(ctx, &db, &node));
+  *output_node = node->name();
+  GraphDef graph_def;
+  TF_RETURN_IF_ERROR(b.ToGraphDef(&graph_def));
+  graph_def.SerializeToString(serialized_graph_def);
+  return Status::OK();
+}
+
+Status GetDatasetFromVariantTensor(const Tensor& tensor,
+                                   DatasetBase** out_dataset) {
+  if (!(tensor.dtype() == DT_VARIANT ||
+        TensorShapeUtils::IsScalar(tensor.shape()))) {
+    return errors::InvalidArgument(
+        "Dataset tensor must be a scalar of dtype DT_VARIANT.");
+  }
+  const Variant& variant = tensor.scalar<Variant>()();
+  const DatasetVariantWrapper* wrapper = variant.get<DatasetVariantWrapper>();
+  if (wrapper == nullptr) {
+    return errors::InvalidArgument("Tensor must be a Dataset object.");
+  }
+  *out_dataset = wrapper->get();
+  if (*out_dataset == nullptr) {
+    return errors::Internal("Read uninitialized Dataset variant.");
+  }
+  return Status::OK();
+}
+
+Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor) {
+  if (!(tensor->dtype() == DT_VARIANT ||
+        TensorShapeUtils::IsScalar(tensor->shape()))) {
+    return errors::InvalidArgument(
+        "Dataset tensor must be a scalar of dtype DT_VARIANT.");
+  }
+  tensor->scalar<Variant>()() = DatasetVariantWrapper(dataset);
+  return Status::OK();
+}
+
+void DatasetOpKernel::Compute(OpKernelContext* ctx) {
+  DatasetBase* dataset = nullptr;
+  MakeDataset(ctx, &dataset);
+  if (ctx->status().ok()) {
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    OP_REQUIRES_OK(ctx, StoreDatasetInVariantTensor(dataset, output));
+  }
+}
+
+void UnaryDatasetOpKernel::MakeDataset(OpKernelContext* ctx,
+                                       DatasetBase** output) {
+  DatasetBase* input;
+  OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(0), &input));
+  MakeDataset(ctx, input, output);
+}
+
+void BinaryDatasetOpKernel::MakeDataset(OpKernelContext* ctx,
+                                        DatasetBase** output) {
+  DatasetBase* input;
+  OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(0), &input));
+  DatasetBase* another_input;
+  OP_REQUIRES_OK(ctx,
+                 GetDatasetFromVariantTensor(ctx->input(1), &another_input));
+  MakeDataset(ctx, input, another_input, output);
+}
+
+const char GraphDatasetBase::kDatasetGraphKey[] = "_DATASET_GRAPH";
+const char GraphDatasetBase::kDatasetGraphOutputNodeKey[] =
+    "_DATASET_GRAPH_OUTPUT_NODE";
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/dataset.h b/tensorflow/core/kernels/data/dataset.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e01535bd8437add36951bba0a6fe401cbc7f47f
--- /dev/null
+++ b/tensorflow/core/kernels/data/dataset.h
@@ -0,0 +1,578 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_DATASET_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_DATASET_H_
+
+#include <memory>
+
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
+#include "tensorflow/core/framework/variant_tensor_data.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/tracing.h"
+
+// Polymorphic datasets should support all primitive TensorFlow
+// types. Use this macro to expand `m(T)` once for each primitive type
+// `T`, e.g. to build a `switch` statement.
+#define TF_CALL_DATASET_TYPES(m) TF_CALL_ALL_TYPES(m) TF_CALL_QUANTIZED_TYPES(m)
+
+namespace tensorflow {
+
+// Interface for reading values from a key-value store.
+// Used for restoring iterator state.
+class IteratorStateReader {
+ public:
+  virtual Status ReadScalar(StringPiece key, int64* val) = 0;
+  virtual Status ReadScalar(StringPiece key, string* val) = 0;
+  virtual Status ReadTensor(StringPiece key, Tensor* val) = 0;
+  virtual bool Contains(StringPiece key) = 0;
+
+  virtual ~IteratorStateReader() {}
+};
+
+// Interface for writing values to a key-value store.
+// Used for saving iterator state.
+class IteratorStateWriter {
+ public:
+  virtual Status WriteScalar(StringPiece key, const int64 val) = 0;
+  virtual Status WriteScalar(StringPiece key, const string& val) = 0;
+  virtual Status WriteTensor(StringPiece key, const Tensor& val) = 0;
+
+  virtual ~IteratorStateWriter() {}
+};
+
+// Forward declarations to avoid introducing a dependency on headers in
+// "tensorflow/core/graph/...".
+class GraphDefBuilder;
+class GraphDatasetBase;
+class Node;
+
+// Wrapper around GraphDefBuilder. Used to serialize Dataset graph.
+class GraphDefBuilderWrapper {
+ public:
+  explicit GraphDefBuilderWrapper(GraphDefBuilder* b) : b_(b) {}
+
+  // Adds a Const node with scalar value to the Graph.
+  // `*output` contains a pointer to the output `Node`. It is guaranteed to be
+  // non-null if the method returns with an OK status.
+  // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
+  template <typename T>
+  Status AddScalar(const T& val, Node** output) {
+    Tensor val_t = Tensor(DataTypeToEnum<T>::v(), TensorShape({}));
+    val_t.scalar<T>()() = val;
+    AddTensorInternal(val_t, output);
+    if (*output == nullptr) {
+      return errors::Internal("AddScalar: Failed to build Const op.");
+    }
+    return Status::OK();
+  }
+
+  // Adds a Const node with vector value to the Graph.
+  // `*output` contains a pointer to the output `Node`. It is guaranteed to be
+  // non-null if the method returns with an OK status.
+  // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
+  // TODO(shivaniagrawal): Consider changing to gtl::ArraySlice?
+  template <typename T>
+  Status AddVector(const std::vector<T>& val, Node** output) {
+    Tensor val_t = Tensor(DataTypeToEnum<T>::v(),
+                          TensorShape({static_cast<int64>(val.size())}));
+    for (int i = 0; i < val.size(); i++) {
+      val_t.flat<T>()(i) = val[i];
+    }
+    AddTensorInternal(val_t, output);
+    if (*output == nullptr) {
+      return errors::Internal("AddVector: Failed to build Const op.");
+    }
+    return Status::OK();
+  }
+
+  // Adds a Const node with Tensor value to the Graph.
+  // `*output` contains a pointer to the output `Node`. It is guaranteed to be
+  // non-null if the method returns with an OK status.
+  // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
+  Status AddTensor(const Tensor& val, Node** output) {
+    AddTensorInternal(val, output);
+    if (*output == nullptr) {
+      return errors::Internal("AddTesor: Failed to build Const op.");
+    }
+    return Status::OK();
+  }
+
+  Status AddDataset(const GraphDatasetBase* dataset,
+                    const std::vector<Node*>& inputs, Node** output) {
+    return AddDataset(dataset, inputs, {}, output);
+  }
+
+  // Adds a node corresponding to the `DatasetType` to the Graph.
+  // Return value of `DatasetType::op_name()` is used as the op type for the
+  // node.
+  // Values for the output_types and output_shapes node attributes are also
+  // written if those attributes are defined in the OpDef.
+  // `*output` contains a pointer to the output `Node`. It is guaranteed to be
+  // non-null if the method returns with an OK status.
+  // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
+  Status AddDataset(const GraphDatasetBase* dataset,
+                    const std::vector<Node*>& inputs,
+                    const std::vector<std::pair<StringPiece, AttrValue>>& attrs,
+                    Node** output) {
+    std::vector<std::pair<size_t, Node*>> enumerated_inputs(inputs.size());
+    for (int i = 0; i < inputs.size(); i++) {
+      enumerated_inputs[i] = std::make_pair(i, inputs[i]);
+    }
+    return AddDataset(dataset, enumerated_inputs, {}, attrs, output);
+  }
+
+  Status AddDataset(
+      const GraphDatasetBase* dataset,
+      const std::vector<std::pair<size_t, Node*>>& inputs,
+      const std::vector<std::pair<size_t, gtl::ArraySlice<Node*>>>& list_inputs,
+      const std::vector<std::pair<StringPiece, AttrValue>>& attrs,
+      Node** output);
+
+  // Adds a user-defined function with name `function_name` to the graph and
+  // recursively adds all functions it references. If a function with a matching
+  // name has already been added, returns with OK status. If a user-defined with
+  // name `function_name` is not found in the FunctionLibraryDefinition, returns
+  // an InvalidArgumentError. If the function with name `function_name` or any
+  // of its dependent functions are stateful, returns an InvalidArgument error.
+  Status AddFunction(OpKernelContext* ctx, const string& function_name);
+
+  template <typename T>
+  void BuildAttrValue(const T& value, AttrValue* attr) {
+    SetAttrValue(value, attr);
+  }
+
+ private:
+  void AddTensorInternal(const Tensor& val, Node** output);
+
+  Status EnsureFunctionIsStateless(OpKernelContext* ctx,
+                                   const string& function_name) const {
+    const FunctionLibraryDefinition* lib_def =
+        ctx->function_library()->GetFunctionLibraryDefinition();
+    const FunctionDef* function_def = lib_def->Find(function_name);
+    if (!function_def) {
+      return errors::InvalidArgument("Unable to find FunctionDef for ",
+                                     function_name, " in registry.");
+    }
+    for (const NodeDef& node_def : function_def->node_def()) {
+      const OpDef* op_def;
+      TF_RETURN_IF_ERROR(lib_def->LookUpOpDef(node_def.op(), &op_def));
+      // TODO(b/65524810): Hack to allow functions to capture Dataset op
+      // nodes needed for FlatMap. Currently, source datasets nodes have been
+      // marked stateful to avoid constant folding since we do not have a
+      // good way of serializing them.
+      if (IsOpWhitelisted(op_def)) {
+        continue;
+      }
+      if (op_def->is_stateful()) {
+        return errors::InvalidArgument(
+            "Op[name: ", node_def.name(), ", type: ", node_def.op(), "] ",
+            "in function ", function_name, " is stateful. ",
+            "Saving stateful functions is not supported yet.");
+      }
+    }
+    return Status::OK();
+  }
+
+  bool IsOpWhitelisted(const OpDef* op_def) const {
+    return StringPiece(op_def->name()).ends_with("Dataset") &&
+           HasAttr(op_def, "output_shapes");
+  }
+
+  bool HasAttr(const string& op_type_name, const string& attr_name) const;
+
+  bool HasAttr(const OpDef* op_def, const string& attr_name) const {
+    for (auto attr : op_def->attr()) {
+      if (attr.name() == attr_name) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  Status AddAttrFunctions(const AttrValue& attr_value, OpKernelContext* ctx) {
+    if (attr_value.has_func()) {
+      TF_RETURN_IF_ERROR(AddFunction(ctx, attr_value.func().name()));
+    } else if (attr_value.has_list()) {
+      for (const NameAttrList& name_attr_list : attr_value.list().func()) {
+        TF_RETURN_IF_ERROR(AddFunction(ctx, name_attr_list.name()));
+      }
+    }
+    return Status::OK();
+  }
+
+  GraphDefBuilder* b_;
+};
+
+class StatsAggregator;
+
+// A cut-down version of OpKernelContext for running computations in
+// iterators. Note that we cannot simply use OpKernelContext here
+// because we might run computation in an iterator whose lifetime is
+// not nested within the lifetime of a single OpKernelContext
+// (e.g. asynchronous prefetching).
+//
+// TODO(mrry): We will probably need to support more of
+// OpKernelContext here. For example, should allocation be handled by
+// the IteratorContext?
+// TODO(mrry): We're making some daring assumptions about the lifetime
+// of the runner passed in here. A runner will be deleted when the original
+// step ends, but all existing runners only close over session-lifetime (or
+// longer-lived) state, so we can make a copy of the function. There's nothing
+// in the definition of the API from which we took the runner to guarantee that
+// what we are doing is safe. We should formalize the properties here.
+class IteratorContext {
+ public:
+  struct Params {
+    // Interface to operating system functionality.
+    Env* env;
+
+    // Function call support.
+    std::function<void(std::function<void()>)> runner = nullptr;
+
+    // A function that returns the current `StatsAggregator` instance to be
+    // used when recording statistics about the iterator.
+    //
+    // NOTE(mrry): This is somewhat awkward, because (i) the `StatsAggregator`
+    // is a property of the `IteratorResource` (which this class does not know
+    // about), and (ii) it can change after the `IteratorContext` has been
+    // created. Better suggestions are welcome!
+    std::function<std::shared_ptr<StatsAggregator>()> stats_aggregator_getter =
+        nullptr;
+  };
+
+  explicit IteratorContext(Params params) : params_(std::move(params)) {}
+
+  Env* env() const { return params_.env; }
+
+  std::function<void(std::function<void()>)>* runner() {
+    return &params_.runner;
+  }
+
+  std::shared_ptr<StatsAggregator> stats_aggregator() {
+    if (params_.stats_aggregator_getter) {
+      return params_.stats_aggregator_getter();
+    } else {
+      return nullptr;
+    }
+  }
+
+ private:
+  Params params_;
+};
+
+// Represents the current position in a range of outputs, where the
+// range of outputs is typically represented by an `DatasetBase`,
+// defined below.
+class IteratorBase {
+ public:
+  virtual ~IteratorBase() {}
+
+  // Gets the next output from the range that this iterator is traversing.
+  //
+  // If at least one output remains in this iterator's range, that
+  // output will be stored in `*out_tensors` and `false` will be
+  // stored in `*end_of_sequence`.
+  //
+  // If no more outputs remain in this iterator's range, `true` will
+  // be stored in `*end_of_sequence`, and the content of
+  // `*out_tensors` will be undefined.
+  //
+  // This method is thread-safe.
+  //
+  // TODO(mrry): Define `GetNextAsync()` or `GetNextManyAsync()`, and
+  // potentially remove this method.
+  virtual Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                         bool* end_of_sequence) = 0;
+
+  // Returns a vector of DataType values, representing the respective
+  // element types of each tuple component in the outputs of this
+  // iterator.
+  virtual const DataTypeVector& output_dtypes() const = 0;
+
+  // Returns a vector of tensor shapes, representing the respective
+  // (and possibly partially defined) shapes of each tuple component
+  // in the outputs of this iterator.
+  virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
+
+  // Saves the state of this iterator.
+  virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) {
+    return SaveInternal(writer);
+  }
+
+  // Restores the state of this iterator.
+  virtual Status Restore(OpKernelContext* ctx, IteratorStateReader* reader) {
+    return RestoreInternal(ctx, reader);
+  }
+
+ protected:
+  // This is needed so that sub-classes of IteratorBase can call
+  // `SaveInternal` on their parent iterators, e.g., in
+  // `RepeatDataasetOp::Dataset`.
+  Status SaveParent(IteratorStateWriter* writer,
+                    const std::unique_ptr<IteratorBase>& parent) {
+    return parent->SaveInternal(writer);
+  }
+
+  // This is needed so that sub-classes of IteratorBase can call
+  // `RestoreInternal` on their parent iterators, e.g., in
+  // `RepeatDataasetOp::Dataset`.
+  Status RestoreParent(OpKernelContext* ctx, IteratorStateReader* reader,
+                       const std::unique_ptr<IteratorBase>& parent) {
+    return parent->RestoreInternal(ctx, reader);
+  }
+
+  // Saves the state of this iterator recursively.
+  virtual Status SaveInternal(IteratorStateWriter* writer) {
+    return errors::Unimplemented("SaveInternal");
+  }
+
+  // Restores the state of this iterator recursively.
+  virtual Status RestoreInternal(OpKernelContext* ctx,
+                                 IteratorStateReader* reader) {
+    return errors::Unimplemented("RestoreInternal");
+  }
+};
+
+// Represents a (potentially infinite) range of outputs, where each
+// output is a tuple of tensors.
+class DatasetBase : public core::RefCounted {
+ public:
+  // Returns a new iterator for iterating over the range of elements in
+  // this dataset.
+  //
+  // This method may be called multiple times on the same instance,
+  // and the resulting iterators will have distinct state. Each
+  // iterator will traverse all elements in this dataset from the
+  // start.
+  //
+  // Ownership of the created iterator will be transferred to the caller.
+  //
+  // The prefix identifies the sequence of iterators leading up to the newly
+  // created iterator.
+  virtual std::unique_ptr<IteratorBase> MakeIterator(
+      const string& prefix) const = 0;
+
+  // Returns a vector of DataType values, representing the respective
+  // element types of each tuple component in the outputs of this
+  // dataset.
+  virtual const DataTypeVector& output_dtypes() const = 0;
+
+  // Returns a vector of tensor shapes, representing the respective
+  // (and possibly partially defined) shapes of each tuple component
+  // in the outputs of this dataset.
+  virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
+
+  // A human-readable debug string for this dataset.
+  virtual string DebugString() = 0;
+
+  // Serializes the dataset and writes it to the `writer`.
+  virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) const {
+    return errors::Unimplemented("DatasetBase::Save");
+  }
+
+ protected:
+  // TODO(srbs): Ideally all graph related logic should reside in
+  // GraphDatasetBase. However, that would require Datasets defined in all ops
+  // to derive from GraphDatasetBase. Once that is done we can move
+  // DatasetGraphDefBuilder and AsGraphDefInternal to GraphDatasetBase.
+  class DatasetGraphDefBuilder : public GraphDefBuilderWrapper {
+   public:
+    DatasetGraphDefBuilder(GraphDefBuilder* b) : GraphDefBuilderWrapper(b) {}
+    Status AddParentDataset(OpKernelContext* ctx, const DatasetBase* dataset,
+                            Node** output) {
+      return dataset->AsGraphDefInternal(ctx, this, output);
+    }
+  };
+
+  virtual Status AsGraphDefInternal(OpKernelContext* ctx,
+                                    DatasetGraphDefBuilder* b,
+                                    Node** node) const {
+    return AsGraphDefInternal(b, node);
+  }
+
+  virtual Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+                                    Node** node) const {
+    return errors::Unimplemented("AsGraphDefInternal");
+  }
+};
+
+// Base-class for datasets that are built by ops.
+class GraphDatasetBase : public DatasetBase {
+ public:
+  GraphDatasetBase(OpKernelContext* ctx)
+      : op_name_(ctx->op_kernel().type_string()) {}
+
+  const string op_name() const { return op_name_; }
+
+  Status Save(OpKernelContext* ctx,
+              IteratorStateWriter* writer) const override {
+    string serialized_graph_def;
+    string output_node;
+    TF_RETURN_IF_ERROR(Serialize(ctx, &serialized_graph_def, &output_node));
+    TF_RETURN_IF_ERROR(
+        writer->WriteScalar(kDatasetGraphKey, serialized_graph_def));
+    TF_RETURN_IF_ERROR(
+        writer->WriteScalar(kDatasetGraphOutputNodeKey, output_node));
+    return Status::OK();
+  }
+
+  // Key for storing the Dataset graph in the serialized format.
+  static const char kDatasetGraphKey[];
+
+  // Key for storing the output node of the Dataset graph in the serialized
+  // format.
+  static const char kDatasetGraphOutputNodeKey[];
+
+ private:
+  Status Serialize(OpKernelContext* ctx, string* serialized_graph_def,
+                   string* output_node) const;
+
+  const string op_name_;
+};
+
+// Represents an iterator that is associated with a particular parent dataset.
+template <class DatasetType>
+class DatasetIterator : public IteratorBase {
+ public:
+  struct Params {
+    // Owns one reference on the shared dataset resource.
+    const DatasetType* dataset;
+
+    // Identifies the sequence of iterators leading up to this iterator.
+    const string prefix;
+  };
+
+  explicit DatasetIterator(const Params& params) : params_(params) {
+    params_.dataset->Ref();
+  }
+
+  ~DatasetIterator() override { params_.dataset->Unref(); }
+
+  // The dataset from which this iterator was created.
+  const DatasetType* dataset() const { return params_.dataset; }
+
+  // The sequence of iterators leading up to this iterator.
+  const string prefix() const { return params_.prefix; }
+
+  const DataTypeVector& output_dtypes() const override {
+    return params_.dataset->output_dtypes();
+  }
+
+  const std::vector<PartialTensorShape>& output_shapes() const override {
+    return params_.dataset->output_shapes();
+  }
+
+  Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                 bool* end_of_sequence) final {
+    port::Tracing::TraceMe activity(params_.prefix);
+    return GetNextInternal(ctx, out_tensors, end_of_sequence);
+  }
+
+  Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) final {
+    TF_RETURN_IF_ERROR(dataset()->Save(ctx, writer));
+    return IteratorBase::Save(ctx, writer);
+  }
+
+ protected:
+  // Internal implementation of GetNext that is wrapped in tracing logic.
+  virtual Status GetNextInternal(IteratorContext* ctx,
+                                 std::vector<Tensor>* out_tensors,
+                                 bool* end_of_sequence) = 0;
+
+  string full_name(const string& name) const {
+    return strings::StrCat(prefix(), ":", name);
+  }
+
+ private:
+  Params params_;
+};
+
+// Encapsulates the work required to plug a DatasetBase into the core TensorFlow
+// graph execution engine.
+class DatasetOpKernel : public OpKernel {
+ public:
+  DatasetOpKernel(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  void Compute(OpKernelContext* ctx) final;
+
+ protected:
+  // Subclasses should implement this method. It will be called during Compute
+  // execution.
+  virtual void MakeDataset(OpKernelContext* ctx, DatasetBase** output) = 0;
+
+  template <typename T>
+  Status ParseScalarArgument(OpKernelContext* ctx,
+                             const StringPiece& argument_name, T* output) {
+    const Tensor* argument_t;
+    TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t));
+    if (!TensorShapeUtils::IsScalar(argument_t->shape())) {
+      return errors::InvalidArgument(argument_name, " must be a scalar");
+    }
+    *output = argument_t->scalar<T>()();
+    return Status::OK();
+  }
+};
+
+// Encapsulates the work required to plug unary Datasets into the core
+// TensorFlow graph execution engine.
+class UnaryDatasetOpKernel : public DatasetOpKernel {
+ public:
+  UnaryDatasetOpKernel(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {}
+
+ protected:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) final;
+  virtual void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                           DatasetBase** output) = 0;
+};
+
+// Encapsulates the work required to plug binary Datasets into the core
+// TensorFlow graph execution engine.
+class BinaryDatasetOpKernel : public DatasetOpKernel {
+ public:
+  BinaryDatasetOpKernel(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {}
+
+ protected:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) final;
+  virtual void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                           DatasetBase* another_input,
+                           DatasetBase** output) = 0;
+};
+
+// Validates and extracts a `DatasetBase` object from `tensor`.
+//
+// `tensor` must have been written by a call to SetVariantTensorToDataset().
+//
+// The retrieved pointer is a borrowed reference to the dataset, which is owned
+// by the tensor. The consumer must either acquire its own reference to the
+// dataset by calling `(*out_dataset)->Ref()`, or ensure that `tensor` is not
+// destroyed or mutated while the retrieved pointer is in use.
+Status GetDatasetFromVariantTensor(const Tensor& tensor,
+                                   DatasetBase** out_dataset);
+
+// Stores a `DatasetBase` object in `tensor`.
+//
+// The ownership of `dataset` is transferred to `tensor`.
+Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor);
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_DATASET_H_
diff --git a/tensorflow/core/kernels/dataset_utils.cc b/tensorflow/core/kernels/data/dataset_utils.cc
similarity index 91%
rename from tensorflow/core/kernels/dataset_utils.cc
rename to tensorflow/core/kernels/data/dataset_utils.cc
index cd58c8091211ae75265f6cfecb65746965f98d2f..1afc823e056e44aeb9f4d8ba186997304f2e2d87 100644
--- a/tensorflow/core/kernels/dataset_utils.cc
+++ b/tensorflow/core/kernels/data/dataset_utils.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/dataset_utils.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 
 namespace tensorflow {
 
@@ -32,12 +32,13 @@ Status MakeIteratorFromInputElement(
   // is always 0, so a negative random step ID should suffice.
   opts.step_id = CapturedFunction::generate_step_id();
   ScopedStepContainer step_container(
-      opts.step_id, [captured_func, ctx](const string& name) {
+      opts.step_id, [captured_func](const string& name) {
         captured_func->resource_manager()->Cleanup(name).IgnoreError();
       });
   opts.step_container = &step_container;
   std::vector<Tensor> return_values;
-  TF_RETURN_IF_ERROR(captured_func->Run(opts, input_element, &return_values));
+  TF_RETURN_IF_ERROR(
+      captured_func->RunWithBorrowedArgs(opts, input_element, &return_values));
 
   if (!(return_values.size() == 1 && return_values[0].dtype() == DT_VARIANT &&
         TensorShapeUtils::IsScalar(return_values[0].shape()))) {
diff --git a/tensorflow/core/kernels/dataset_utils.h b/tensorflow/core/kernels/data/dataset_utils.h
similarity index 77%
rename from tensorflow/core/kernels/dataset_utils.h
rename to tensorflow/core/kernels/data/dataset_utils.h
index eea2b8802b813808f752659a469c3818a52162d2..40bc8735847f56157d81f6d5fb7a2d02291232fe 100644
--- a/tensorflow/core/kernels/dataset_utils.h
+++ b/tensorflow/core/kernels/data/dataset_utils.h
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATASET_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATASET_UTILS_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_DATASET_UTILS_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_DATASET_UTILS_H_
 
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/captured_function.h"
-#include "tensorflow/core/kernels/dataset.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
@@ -32,4 +32,4 @@ Status MakeIteratorFromInputElement(
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATASET_UTILS_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_DATASET_UTILS_H_
diff --git a/tensorflow/core/kernels/dense_to_sparse_batch_dataset_op.cc b/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
similarity index 99%
rename from tensorflow/core/kernels/dense_to_sparse_batch_dataset_op.cc
rename to tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
index e80d11eaea1640c54c21a7b94a2f043099c790f3..fe0e498a3b7402aea1b8eb342a04bec2f172a71b 100644
--- a/tensorflow/core/kernels/dense_to_sparse_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/filter_dataset_op.cc b/tensorflow/core/kernels/data/filter_dataset_op.cc
similarity index 62%
rename from tensorflow/core/kernels/filter_dataset_op.cc
rename to tensorflow/core/kernels/data/filter_dataset_op.cc
index a69040b3bba34f08aede66e1f97c3e7092978ae3..937222846559454cdfcf590808cb2c3be5617419 100644
--- a/tensorflow/core/kernels/filter_dataset_op.cc
+++ b/tensorflow/core/kernels/data/filter_dataset_op.cc
@@ -12,15 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/random/random.h"
 
-#include "tensorflow/core/kernels/captured_function.h"
-
 namespace tensorflow {
 
 namespace {
@@ -51,17 +49,21 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
                                                  std::move(other_arguments),
                                                  &captured_func));
 
-    *output = new Dataset(input, std::move(captured_func));
+    *output = new Dataset(ctx, input, func_, std::move(captured_func));
   }
 
  private:
   const int graph_def_version_;
 
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    Dataset(const DatasetBase* input,
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func)
-        : input_(input), captured_func_(std::move(captured_func)) {
+        : GraphDatasetBase(ctx),
+          input_(input),
+          func_(func),
+          captured_func_(std::move(captured_func)) {
       input_->Ref();
     }
 
@@ -82,6 +84,35 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() override { return "FilterDatasetOp::Dataset"; }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
+      Node* input_graph_node;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+
+      DataTypeVector other_arguments_types;
+      other_arguments_types.reserve(captured_func_->captured_inputs().size());
+      std::vector<Node*> other_arguments;
+      other_arguments.reserve(captured_func_->captured_inputs().size());
+      for (const Tensor& t : captured_func_->captured_inputs()) {
+        Node* node;
+        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        other_arguments.emplace_back(node);
+        other_arguments_types.emplace_back(t.dtype());
+      }
+      AttrValue f;
+      b->BuildAttrValue(func_, &f);
+      AttrValue other_arguments_types_attr;
+      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this, {{0, input_graph_node}}, {{1, other_arguments}},
+          {{"predicate", f}, {"Targuments", other_arguments_types_attr}},
+          output));
+      return Status::OK();
+    }
+
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
@@ -98,16 +129,25 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
         // non-deterministic order.
         bool matched;
         do {
-          TF_RETURN_IF_ERROR(
-              input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
+          {
+            tf_shared_lock l(mu_);
+            if (!input_impl_) {
+              *end_of_sequence = true;
+              return Status::OK();
+            }
+            TF_RETURN_IF_ERROR(
+                input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
+          }
           if (*end_of_sequence) {
+            mutex_lock l(mu_);
+            input_impl_.reset();
             return Status::OK();
           }
 
           FunctionLibraryRuntime::Options opts;
           opts.step_id = CapturedFunction::generate_step_id();
           ScopedStepContainer step_container(
-              opts.step_id, [this, ctx](const string& name) {
+              opts.step_id, [this](const string& name) {
                 dataset()
                     ->captured_func_->resource_manager()
                     ->Cleanup(name)
@@ -120,7 +160,8 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
           Notification n;
           Status ret;
           std::vector<Tensor> result;
-          ret = dataset()->captured_func_->Run(opts, *out_tensors, &result);
+          ret = dataset()->captured_func_->RunWithBorrowedArgs(
+              opts, *out_tensors, &result);
 
           if (!ret.ok()) {
             return ret;
@@ -139,11 +180,34 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (input_impl_)
+          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        else
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_impls_empty"), ""));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (reader->Contains(full_name("input_impls_empty")))
+          input_impl_.reset();
+        else
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        return Status::OK();
+      }
+
      private:
-      const std::unique_ptr<IteratorBase> input_impl_;
+      mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
     };
 
     const DatasetBase* const input_;
+    const NameAttrList func_;
     const std::unique_ptr<CapturedFunction> captured_func_;
   };
 
diff --git a/tensorflow/core/kernels/data/flat_map_dataset_op.cc b/tensorflow/core/kernels/data/flat_map_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a3c03c9916a56d82043ecde3823aeba8a24e87f4
--- /dev/null
+++ b/tensorflow/core/kernels/data/flat_map_dataset_op.cc
@@ -0,0 +1,282 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/lib/random/random.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class FlatMapDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit FlatMapDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx),
+        graph_def_version_(ctx->graph_def_version()) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    OpInputList inputs;
+    OP_REQUIRES_OK(ctx, ctx->input_list("other_arguments", &inputs));
+    std::vector<Tensor> other_arguments;
+    other_arguments.reserve(inputs.size());
+    for (const Tensor& t : inputs) {
+      other_arguments.push_back(t);
+    }
+
+    std::unique_ptr<CapturedFunction> captured_func;
+    OP_REQUIRES_OK(ctx, CapturedFunction::Create(ctx, func_, graph_def_version_,
+                                                 std::move(other_arguments),
+                                                 &captured_func));
+
+    *output = new Dataset(ctx, input, func_, std::move(captured_func),
+                          output_types_, output_shapes_);
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const NameAttrList& func,
+            std::unique_ptr<CapturedFunction> captured_func,
+            const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes)
+        : GraphDatasetBase(ctx),
+          input_(input),
+          func_(func),
+          captured_func_(std::move(captured_func)),
+          output_types_(output_types),
+          output_shapes_(output_shapes) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::FlatMap")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return output_types_;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() override { return "FlatMapDatasetOp::Dataset"; }
+
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+
+      DataTypeVector other_arguments_types;
+      other_arguments_types.reserve(captured_func_->captured_inputs().size());
+      std::vector<Node*> other_arguments;
+      other_arguments.reserve(captured_func_->captured_inputs().size());
+      for (const Tensor& t : captured_func_->captured_inputs()) {
+        Node* node;
+        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        other_arguments.emplace_back(node);
+        other_arguments_types.emplace_back(t.dtype());
+      }
+      AttrValue f;
+      b->BuildAttrValue(func_, &f);
+      AttrValue other_arguments_types_attr;
+      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this, {std::make_pair(0, input_graph_node)},  // Single tensor inputs.
+          {std::make_pair(1, other_arguments)},         // Tensor list inputs.
+          {std::make_pair("f", f),
+           std::make_pair("Targuments", other_arguments_types_attr)},  // Attrs
+          output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        do {
+          if (!input_impl_) {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+          if (current_element_iterator_) {
+            // We are currently precessing a mapped element, so try to get the
+            // next subelement.
+            bool end_of_element;
+            TF_RETURN_IF_ERROR(current_element_iterator_->GetNext(
+                ctx, out_tensors, &end_of_element));
+            if (!end_of_element) {
+              // Produce the subelement as output.
+              *end_of_sequence = false;
+              return Status::OK();
+            }
+
+            // We have reached the end of the current element, so maybe move on
+            // to the next element.
+            current_element_iterator_.reset();
+          }
+
+          // Get the next element from the input dataset.
+          captured_func_inputs_.clear();
+          TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &captured_func_inputs_,
+                                                  end_of_sequence));
+          if (*end_of_sequence) {
+            input_impl_.reset();
+            return Status::OK();
+          }
+
+          TF_RETURN_IF_ERROR(BuildCurrentElementIteratorLocked(ctx));
+        } while (true);
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (input_impl_) {
+          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("element_index"), element_index_));
+          if (current_element_iterator_) {
+            TF_RETURN_IF_ERROR(
+                writer->WriteScalar(full_name("captured_func_inputs_size"),
+                                    captured_func_inputs_.size()));
+            for (int i = 0; i < captured_func_inputs_.size(); i++) {
+              TF_RETURN_IF_ERROR(writer->WriteTensor(
+                  full_name(strings::StrCat("captured_func_inputs[", i, "]")),
+                  captured_func_inputs_[i]));
+            }
+            TF_RETURN_IF_ERROR(SaveParent(writer, current_element_iterator_));
+          } else {
+            TF_RETURN_IF_ERROR(writer->WriteScalar(
+                full_name("current_element_iterator_uninitialized"), ""));
+          }
+        } else {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("exhausted"), ""));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        input_impl_.reset();
+        element_index_ = 0;
+        current_element_iterator_.reset();
+        captured_func_inputs_.clear();
+        if (!reader->Contains(full_name("exhausted"))) {
+          input_impl_ = dataset()->input_->MakeIterator(prefix());
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+          {
+            int64 temp;
+            TF_RETURN_IF_ERROR(
+                reader->ReadScalar(full_name("element_index"), &temp));
+            element_index_ = temp;
+          }
+          if (!reader->Contains(
+                  full_name("current_element_iterator_uninitialized"))) {
+            size_t captured_func_inputs_size;
+            {
+              int64 temp;
+              TF_RETURN_IF_ERROR(reader->ReadScalar(
+                  full_name("captured_func_inputs_size"), &temp));
+              captured_func_inputs_size = static_cast<size_t>(temp);
+            }
+            captured_func_inputs_.reserve(captured_func_inputs_size);
+            for (int i = 0; i < captured_func_inputs_size; i++) {
+              captured_func_inputs_.emplace_back();
+              TF_RETURN_IF_ERROR(reader->ReadTensor(
+                  full_name(strings::StrCat("captured_func_inputs[", i, "]")),
+                  &captured_func_inputs_.back()));
+            }
+            element_index_--;
+            TF_RETURN_IF_ERROR(BuildCurrentElementIteratorLocked(ctx));
+            TF_RETURN_IF_ERROR(
+                RestoreParent(ctx, reader, current_element_iterator_));
+          }
+        }
+        return Status::OK();
+      }
+
+     private:
+      Status BuildCurrentElementIteratorLocked(IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        return dataset::MakeIteratorFromInputElement(
+            ctx, captured_func_inputs_, element_index_++,
+            dataset()->captured_func_.get(), prefix(),
+            &current_element_iterator_);
+      }
+
+      Status BuildCurrentElementIteratorLocked(OpKernelContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        IteratorContext::Params params;
+        params.env = ctx->env();
+        params.runner = *(ctx->runner());
+        IteratorContext iter_ctx(std::move(params));
+        return BuildCurrentElementIteratorLocked(&iter_ctx);
+      }
+
+      mutex mu_;
+      size_t element_index_ GUARDED_BY(mu_) = 0;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+      std::unique_ptr<IteratorBase> current_element_iterator_ GUARDED_BY(mu_);
+      std::vector<Tensor> captured_func_inputs_ GUARDED_BY(mu_);
+    };
+
+    const DatasetBase* const input_;
+    const NameAttrList func_;
+    const std::unique_ptr<CapturedFunction> captured_func_;
+    const DataTypeVector output_types_;
+    const std::vector<PartialTensorShape> output_shapes_;
+  };
+
+  const int graph_def_version_;
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+  NameAttrList func_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("FlatMapDataset").Device(DEVICE_CPU),
+                        FlatMapDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/group_by_window_dataset_op.cc b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
similarity index 94%
rename from tensorflow/core/kernels/group_by_window_dataset_op.cc
rename to tensorflow/core/kernels/data/group_by_window_dataset_op.cc
index 8644bcf9b509b7aaf335791b583ad8e82073f471..35ac67fce5c5fa107192570a70aa9e7e29fb1e6a 100644
--- a/tensorflow/core/kernels/group_by_window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
@@ -17,12 +17,11 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/window_dataset.h"
 #include "tensorflow/core/lib/random/random.h"
 
-#include "tensorflow/core/kernels/captured_function.h"
-#include "tensorflow/core/kernels/dataset.h"
-#include "tensorflow/core/kernels/window_dataset.h"
-
 namespace tensorflow {
 
 namespace {
@@ -169,7 +168,7 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
               opts.step_id = CapturedFunction::generate_step_id();
               opts.runner = ctx->runner();
               ScopedStepContainer step_container(
-                  opts.step_id, [this, ctx](const string& name) {
+                  opts.step_id, [this](const string& name) {
                     dataset()
                         ->captured_key_func_->resource_manager()
                         ->Cleanup(name)
@@ -180,8 +179,9 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
               // Run the key function on the input element to identify its
               // group.
               std::vector<Tensor> key_func_output;
-              TF_RETURN_IF_ERROR(dataset()->captured_key_func_->Run(
-                  opts, next_input_element, &key_func_output));
+              TF_RETURN_IF_ERROR(
+                  dataset()->captured_key_func_->RunWithBorrowedArgs(
+                      opts, next_input_element, &key_func_output));
 
               if (key_func_output.size() != 1 ||
                   key_func_output[0].dtype() != DT_INT64 ||
@@ -198,7 +198,7 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
                 opts2.step_id = CapturedFunction::generate_step_id();
                 opts2.runner = ctx->runner();
                 ScopedStepContainer step_container2(
-                    opts2.step_id, [this, ctx](const string& name) {
+                    opts2.step_id, [this](const string& name) {
                       dataset()
                           ->captured_window_size_func_->resource_manager()
                           ->Cleanup(name)
@@ -210,7 +210,8 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
                 // window size.
                 std::vector<Tensor> window_size_func_output;
                 TF_RETURN_IF_ERROR(dataset()->captured_window_size_func_->Run(
-                    opts2, key_func_output, &window_size_func_output));
+                    opts2, std::move(key_func_output),
+                    &window_size_func_output));
 
                 if (window_size_func_output.size() != 1 ||
                     window_size_func_output[0].dtype() != DT_INT64 ||
@@ -257,7 +258,7 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
         opts.step_id = CapturedFunction::generate_step_id();
         opts.runner = ctx->runner();
         ScopedStepContainer step_container(
-            opts.step_id, [this, ctx](const string& name) {
+            opts.step_id, [this](const string& name) {
               dataset()
                   ->captured_reduce_func_->resource_manager()
                   ->Cleanup(name)
@@ -282,8 +283,8 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
             {std::move(key_arg), std::move(group_dataset_arg)});
         std::vector<Tensor> return_values;
 
-        TF_RETURN_IF_ERROR(
-            dataset()->captured_reduce_func_->Run(opts, args, &return_values));
+        TF_RETURN_IF_ERROR(dataset()->captured_reduce_func_->Run(
+            opts, std::move(args), &return_values));
 
         if (!(return_values.size() == 1 &&
               return_values[0].dtype() == DT_VARIANT &&
diff --git a/tensorflow/core/kernels/ignore_errors_dataset_op.cc b/tensorflow/core/kernels/data/ignore_errors_dataset_op.cc
similarity index 57%
rename from tensorflow/core/kernels/ignore_errors_dataset_op.cc
rename to tensorflow/core/kernels/data/ignore_errors_dataset_op.cc
index 568e7ade0ef5e4bc9648ffcfdc7e40cdc01d11a0..beedc7c6777bfa5240bd380081359adfdc3e3aca 100644
--- a/tensorflow/core/kernels/ignore_errors_dataset_op.cc
+++ b/tensorflow/core/kernels/data/ignore_errors_dataset_op.cc
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
@@ -32,13 +31,14 @@ class IgnoreErrorsDatasetOp : public UnaryDatasetOpKernel {
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
                    DatasetBase** output) override {
-    *output = new Dataset(input);
+    *output = new Dataset(ctx, input);
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    explicit Dataset(const DatasetBase* input) : input_(input) {
+    explicit Dataset(OpKernelContext* ctx, const DatasetBase* input)
+        : GraphDatasetBase(ctx), input_(input) {
       input_->Ref();
     }
 
@@ -59,6 +59,15 @@ class IgnoreErrorsDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() override { return "IgnoreErrorsDatasetOp::Dataset"; }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddDataset(this, {input_graph_node}, output));
+      return Status::OK();
+    }
+
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
@@ -69,16 +78,49 @@ class IgnoreErrorsDatasetOp : public UnaryDatasetOpKernel {
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
-        Status s = input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
-        while (!s.ok()) {
-          out_tensors->clear();
-          s = input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
+        {
+          tf_shared_lock l(mu_);
+          if (!input_impl_) {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+          Status s = input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
+          while (!s.ok()) {
+            out_tensors->clear();
+            s = input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
+          }
         }
+        if (*end_of_sequence) {
+          mutex_lock l(mu_);
+          input_impl_.reset();
+        }
+        return Status::OK();
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (input_impl_)
+          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        else
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_impls_empty"), ""));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (reader->Contains(full_name("input_impls_empty")))
+          input_impl_.reset();
+        else
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
         return Status::OK();
       }
 
      private:
-      const std::unique_ptr<IteratorBase> input_impl_;
+      mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
     };
 
     const DatasetBase* const input_;
diff --git a/tensorflow/core/kernels/interleave_dataset_op.cc b/tensorflow/core/kernels/data/interleave_dataset_op.cc
similarity index 53%
rename from tensorflow/core/kernels/interleave_dataset_op.cc
rename to tensorflow/core/kernels/data/interleave_dataset_op.cc
index c01d1c7cbb0c460cd5facf7b2a1b3b8af9abe6bc..81d7b754982d834e0323e4803ecbce6ebff4d56d 100644
--- a/tensorflow/core/kernels/interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/interleave_dataset_op.cc
@@ -13,16 +13,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/lib/random/random.h"
 
-#include "tensorflow/core/kernels/captured_function.h"
-#include "tensorflow/core/kernels/dataset_utils.h"
-
 namespace tensorflow {
 
 namespace {
@@ -73,18 +71,22 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
                                                  std::move(other_arguments),
                                                  &captured_func));
 
-    *output = new Dataset(input, std::move(captured_func), cycle_length,
-                          block_length, output_types_, output_shapes_);
+    *output =
+        new Dataset(ctx, input, func_, std::move(captured_func), cycle_length,
+                    block_length, output_types_, output_shapes_);
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    Dataset(const DatasetBase* input,
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func, int64 cycle_length,
             int64 block_length, const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
-        : input_(input),
+        : GraphDatasetBase(ctx),
+          input_(input),
+          func_(func),
           captured_func_(std::move(captured_func)),
           cycle_length_(cycle_length),
           block_length_(block_length),
@@ -110,13 +112,47 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() override { return "InterleaveDatasetOp::Dataset"; }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
+      Node* input_node;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_node));
+      Node* cycle_length_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(cycle_length_, &cycle_length_node));
+      Node* block_length_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(block_length_, &block_length_node));
+      DataTypeVector other_arguments_types;
+      other_arguments_types.reserve(captured_func_->captured_inputs().size());
+      std::vector<Node*> other_arguments;
+      other_arguments.reserve(captured_func_->captured_inputs().size());
+      for (const Tensor& t : captured_func_->captured_inputs()) {
+        Node* node;
+        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        other_arguments.emplace_back(node);
+        other_arguments_types.emplace_back(t.dtype());
+      }
+      AttrValue f;
+      b->BuildAttrValue(func_, &f);
+      AttrValue other_arguments_types_attr;
+      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this,
+          {{0, input_node}, {2, cycle_length_node}, {3, block_length_node}},
+          {{1, other_arguments}},
+          {{"f", f}, {"Targuments", other_arguments_types_attr}}, output));
+      return Status::OK();
+    }
+
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
           : DatasetIterator<Dataset>(params),
             input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
-            current_elements_(params.dataset->cycle_length_) {}
+            current_elements_(params.dataset->cycle_length_),
+            args_list_(params.dataset->cycle_length_) {}
 
       void AdvanceToNextInCycle() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         block_index_ = 0;
@@ -150,18 +186,19 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
             // We have reached the end of the current element, so move
             // on to the next element in the cycle.
             current_elements_[cycle_index_].reset();
+            args_list_[cycle_index_].clear();
             --num_open_;
             AdvanceToNextInCycle();
           } else if (!end_of_input_) {
             // Get the next element from the input dataset, and create
             // an iterator from it.
-            std::vector<Tensor> args;
-            TF_RETURN_IF_ERROR(
-                input_impl_->GetNext(ctx, &args, &end_of_input_));
+            TF_RETURN_IF_ERROR(input_impl_->GetNext(
+                ctx, &args_list_[cycle_index_], &end_of_input_));
             if (!end_of_input_) {
               TF_RETURN_IF_ERROR(dataset::MakeIteratorFromInputElement(
-                  ctx, args, cycle_index_, dataset()->captured_func_.get(),
-                  prefix(), &current_elements_[cycle_index_]));
+                  ctx, args_list_[cycle_index_], cycle_index_,
+                  dataset()->captured_func_.get(), prefix(),
+                  &current_elements_[cycle_index_]));
               ++num_open_;
             }
           } else {
@@ -173,11 +210,100 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("cycle_index"), cycle_index_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("block_index"), block_index_));
+        if (end_of_input_) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("end_of_input"), ""));
+        }
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("num_open"), num_open_));
+        TF_RETURN_IF_ERROR(SaveCurrentElements(writer));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        int64 cycle_index;
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("cycle_index"), &cycle_index));
+        cycle_index_ = size_t(cycle_index);
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("block_index"), &block_index_));
+        if (reader->Contains(full_name("end_of_input"))) end_of_input_ = true;
+        int64 num_open;
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("num_open"), &num_open));
+        num_open_ = size_t(num_open);
+        TF_RETURN_IF_ERROR(RestoreCurrentElements(ctx, reader));
+        return Status::OK();
+      }
+
      private:
+      Status SaveCurrentElements(IteratorStateWriter* writer)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        for (int idx = 0; idx < current_elements_.size(); idx++) {
+          if (current_elements_[idx]) {
+            TF_RETURN_IF_ERROR(SaveParent(writer, current_elements_[idx]));
+            TF_RETURN_IF_ERROR(writer->WriteScalar(
+                full_name(strings::StrCat("args_size[", idx, "]")),
+                args_list_[idx].size()));
+            for (int i = 0; i < args_list_[idx].size(); i++) {
+              TF_RETURN_IF_ERROR(writer->WriteTensor(
+                  full_name(strings::StrCat("args_list_[", idx, "][", i, "]")),
+                  args_list_[idx][i]));
+            }
+          }
+        }
+        return Status::OK();
+      }
+
+      Status RestoreCurrentElements(OpKernelContext* ctx,
+                                    IteratorStateReader* reader)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        IteratorContext::Params params;
+        params.env = ctx->env();
+        params.runner = *(ctx->runner());
+        IteratorContext iter_ctx(std::move(params));
+        for (int idx = 0; idx < current_elements_.size(); idx++) {
+          if (reader->Contains(
+                  full_name(strings::StrCat("args_size[", idx, "]")))) {
+            int64 args_size;
+            TF_RETURN_IF_ERROR(reader->ReadScalar(
+                full_name(strings::StrCat("args_size[", idx, "]")),
+                &args_size));
+            args_list_[idx].resize(args_size);
+            for (int i = 0; i < args_size; i++) {
+              TF_RETURN_IF_ERROR(reader->ReadTensor(
+                  full_name(strings::StrCat("args_list_[", idx, "][", i, "]")),
+                  &args_list_[idx][i]));
+            }
+            TF_RETURN_IF_ERROR(dataset::MakeIteratorFromInputElement(
+                &iter_ctx, args_list_[idx], idx,
+                dataset()->captured_func_.get(), prefix(),
+                &current_elements_[idx]));
+            TF_RETURN_IF_ERROR(
+                RestoreParent(ctx, reader, current_elements_[idx]));
+          } else {
+            current_elements_[idx].reset();
+          }
+        }
+        return Status::OK();
+      }
+
       mutex mu_;
       const std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
       std::vector<std::unique_ptr<IteratorBase>> current_elements_
           GUARDED_BY(mu_);
+      std::vector<std::vector<Tensor>> args_list_ GUARDED_BY(mu_);
       size_t cycle_index_ GUARDED_BY(mu_) = 0;
       int64 block_index_ GUARDED_BY(mu_) = 0;
       bool end_of_input_ GUARDED_BY(mu_) = false;
@@ -185,6 +311,7 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
     };
 
     const DatasetBase* const input_;
+    const NameAttrList func_;
     const std::unique_ptr<CapturedFunction> captured_func_;
     const int64 cycle_length_;
     const int64 block_length_;
diff --git a/tensorflow/core/kernels/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
similarity index 89%
rename from tensorflow/core/kernels/iterator_ops.cc
rename to tensorflow/core/kernels/data/iterator_ops.cc
index b48da5b32639f8880579b29c7c45aef90f0892ff..b7fdfab5fa25c7d8e3f55068c523f4db7033f2c1 100644
--- a/tensorflow/core/kernels/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -20,8 +20,9 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/graph/graph_constructor.h"
-#include "tensorflow/core/kernels/dataset.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/stats_aggregator.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
@@ -168,6 +169,16 @@ class IteratorResource : public ResourceBase {
     return Status::OK();
   }
 
+  void set_stats_aggregator(std::shared_ptr<StatsAggregator> stats_aggregator) {
+    mutex_lock l(mu_);
+    stats_aggregator_ = std::move(stats_aggregator);
+  }
+
+  std::shared_ptr<StatsAggregator> stats_aggregator() {
+    tf_shared_lock l(mu_);
+    return stats_aggregator_;
+  }
+
   string DebugString() override { return "Iterator resource"; }
 
   const DataTypeVector& output_dtypes() const { return output_dtypes_; }
@@ -178,6 +189,8 @@ class IteratorResource : public ResourceBase {
 
  private:
   std::shared_ptr<IteratorBase> iterator_;
+  mutex mu_;
+  std::shared_ptr<StatsAggregator> stats_aggregator_ GUARDED_BY(mu_);
   const DataTypeVector output_dtypes_;
   const std::vector<PartialTensorShape> output_shapes_;
   const int graph_def_version_;
@@ -435,40 +448,60 @@ class MakeIteratorOp : public OpKernel {
   }
 };
 
-class ToSingleElementOp : public OpKernel {
+class ToSingleElementOp : public AsyncOpKernel {
  public:
-  explicit ToSingleElementOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  explicit ToSingleElementOp(OpKernelConstruction* ctx)
+      : AsyncOpKernel(ctx),
+        thread_pool_(new thread::ThreadPool(
+            ctx->env(), ThreadOptions(),
+            strings::StrCat("to_single_element_op_thread_",
+                            SanitizeThreadSuffix(name())),
+            1 /* num_threads */, false /* low_latency_hint */)) {}
 
-  void Compute(OpKernelContext* ctx) override {
-    DatasetBase* dataset;
-    OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset));
-    auto iterator = dataset->MakeIterator("SingleElementIterator");
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    // The call to `iterator->GetNext()` may block and depend on an
+    // inter-op thread pool thread, so we issue the call from the
+    // owned thread pool.
+    thread_pool_->Schedule([ctx, done]() {
+      DatasetBase* dataset;
+      OP_REQUIRES_OK_ASYNC(
+          ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done);
+      auto iterator = dataset->MakeIterator("SingleElementIterator");
 
-    IteratorContext::Params params;
-    params.env = ctx->env();
-    params.runner = *(ctx->runner());
-    IteratorContext iter_ctx(std::move(params));
+      IteratorContext::Params params;
+      params.env = ctx->env();
+      params.runner = *(ctx->runner());
+      IteratorContext iter_ctx(std::move(params));
 
-    std::vector<Tensor> components;
-    components.reserve(dataset->output_dtypes().size());
-    bool end_of_sequence;
+      std::vector<Tensor> components;
+      components.reserve(dataset->output_dtypes().size());
+      bool end_of_sequence;
 
-    OP_REQUIRES_OK(ctx,
-                   iterator->GetNext(&iter_ctx, &components, &end_of_sequence));
-    OP_REQUIRES(ctx, !end_of_sequence,
-                errors::InvalidArgument("Dataset was empty."));
+      OP_REQUIRES_OK_ASYNC(
+          ctx, iterator->GetNext(&iter_ctx, &components, &end_of_sequence),
+          done);
+      OP_REQUIRES_ASYNC(ctx, !end_of_sequence,
+                        errors::InvalidArgument("Dataset was empty."), done);
 
-    for (int i = 0; i < components.size(); ++i) {
-      // TODO(mrry): Check that the shapes match the shape attrs.
-      ctx->set_output(i, components[i]);
-    }
+      for (int i = 0; i < components.size(); ++i) {
+        // TODO(mrry): Check that the shapes match the shape attrs.
+        ctx->set_output(i, components[i]);
+      }
 
-    components.clear();
-    OP_REQUIRES_OK(ctx,
-                   iterator->GetNext(&iter_ctx, &components, &end_of_sequence));
-    OP_REQUIRES(ctx, end_of_sequence,
-                errors::InvalidArgument("Dataset had more than one element."));
+      components.clear();
+      OP_REQUIRES_OK_ASYNC(
+          ctx, iterator->GetNext(&iter_ctx, &components, &end_of_sequence),
+          done);
+      OP_REQUIRES_ASYNC(
+          ctx, end_of_sequence,
+          errors::InvalidArgument("Dataset had more than one element."), done);
+
+      done();
+    });
   }
+
+ private:
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
 };
 
 class OneShotIteratorOp : public AsyncOpKernel {
@@ -684,6 +717,9 @@ class IteratorGetNextOp : public AsyncOpKernel {
 
       IteratorContext::Params params;
       params.env = ctx->env();
+      params.stats_aggregator_getter = [iterator]() {
+        return iterator->stats_aggregator();
+      };
       params.runner = *(ctx->runner());
       IteratorContext iter_ctx(std::move(params));
 
@@ -835,6 +871,31 @@ class DeserializeIteratorOp : public OpKernel {
   }
 };
 
+class IteratorSetStatsAggregatorOp : public OpKernel {
+ public:
+  explicit IteratorSetStatsAggregatorOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    IteratorResource* iterator_resource;
+    OP_REQUIRES_OK(
+        ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator_resource));
+    core::ScopedUnref unref_iterator(iterator_resource);
+
+    StatsAggregatorResource* stats_aggregator_resource;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 1),
+                                       &stats_aggregator_resource));
+    core::ScopedUnref unref_stats_aggregator(stats_aggregator_resource);
+    // TODO(mrry): Consider allowing multiple StatsAggregator ops to
+    // subscribe to updates, and/or unsubscribing.
+    OP_REQUIRES(ctx, !iterator_resource->stats_aggregator(),
+                errors::FailedPrecondition(
+                    "Iterator already associated with a StatsAggregator"));
+    iterator_resource->set_stats_aggregator(
+        stats_aggregator_resource->stats_aggregator());
+  }
+};
+
 REGISTER_KERNEL_BUILDER(Name("Iterator").Device(DEVICE_CPU), IteratorHandleOp);
 REGISTER_KERNEL_BUILDER(Name("MakeIterator").Device(DEVICE_CPU),
                         MakeIteratorOp);
@@ -852,6 +913,8 @@ REGISTER_KERNEL_BUILDER(Name("SerializeIterator").Device(DEVICE_CPU),
                         SerializeIteratorOp);
 REGISTER_KERNEL_BUILDER(Name("DeserializeIterator").Device(DEVICE_CPU),
                         DeserializeIteratorOp);
+REGISTER_KERNEL_BUILDER(Name("IteratorSetStatsAggregator").Device(DEVICE_CPU),
+                        IteratorSetStatsAggregatorOp);
 
 }  // namespace
 
diff --git a/tensorflow/core/kernels/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
similarity index 66%
rename from tensorflow/core/kernels/map_and_batch_dataset_op.cc
rename to tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
index 620efdb7781e677c94af4946033e02955ee412f3..2f3959772c47febc93935093c49f13c413ca955d 100644
--- a/tensorflow/core/kernels/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
@@ -14,13 +14,13 @@ limitations under the License.
 ==============================================================================*/
 #define EIGEN_USE_THREADS
 
-#include "tensorflow/core/lib/core/blocking_counter.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/captured_function.h"
-#include "tensorflow/core/kernels/dataset.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/kernels/inplace_ops_functor.h"
+#include "tensorflow/core/lib/core/blocking_counter.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/tracing.h"
@@ -132,7 +132,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         if (current_batch_index_ != -1) {
           for (size_t batch_index = 0;
                batch_index < dataset()->num_parallel_batches_; ++batch_index) {
-            WaitForBatch(batch_index).IgnoreError();
+            int64 num_elements;
+            WaitForBatch(batch_index, &num_elements).IgnoreError();
             // Deallocate tensors allocated for the output.
             batch_results_[batch_index].output.clear();
           }
@@ -166,17 +167,35 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
           }
         }
 
-        if (end_of_input_) {
+        int64 num_elements = 0;
+        Status status = WaitForBatch(current_batch_index_, &num_elements);
+        if (num_elements == 0) {
           *end_of_sequence = true;
           return Status::OK();
         }
-
-        Status status = WaitForBatch(current_batch_index_);
         if (!status.ok()) {
           // Deallocate tensors allocated for the output.
           batch_results_[current_batch_index_].output.clear();
         } else {
-          *out_tensors = std::move(batch_results_[current_batch_index_].output);
+          if (num_elements < dataset()->batch_size_) {
+            const std::vector<Tensor>& output =
+                batch_results_[current_batch_index_].output;
+            for (size_t i = 0; i < output.size(); ++i) {
+              TensorShape component_shape(
+                  batch_results_[current_batch_index_].output[i].shape());
+              component_shape.set_dim(0, num_elements);
+              Tensor component(cpu_allocator(), output[i].dtype(),
+                               component_shape);
+              TF_RETURN_IF_ERROR(
+                  CopyPartialBatch(&component, output[i], num_elements));
+              out_tensors->emplace_back(std::move(component));
+            }
+            // Deallocate tensors allocated for the output.
+            batch_results_[current_batch_index_].output.clear();
+          } else {
+            *out_tensors =
+                std::move(batch_results_[current_batch_index_].output);
+          }
           *end_of_sequence = false;
         }
         StartInvocationBatch(ctx, current_batch_index_);
@@ -195,6 +214,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
       struct InvocationResult {
         Status status;
+        bool end_of_input;
         std::vector<Tensor> return_values;
       };
 
@@ -202,6 +222,29 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         return batch_index * dataset()->batch_size_ + offset;
       }
 
+      Status CopyPartialBatch(Tensor* output, const Tensor& value,
+                              int64 num_elements) {
+        switch (value.dtype()) {
+#define CASE(type)                                                \
+  case DataTypeToEnum<type>::value: {                             \
+    auto output_t = output->flat_outer_dims<type>();              \
+    auto value_t = value.flat_outer_dims<type>();                 \
+    for (size_t i = 0; i < num_elements; i++) {                   \
+      output_t.template chip<0>(i) = value_t.template chip<0>(i); \
+    }                                                             \
+    return Status::OK();                                          \
+  }
+          TF_CALL_NUMBER_TYPES(CASE);
+          TF_CALL_string(CASE);
+          TF_CALL_variant(CASE);
+#undef CASE
+          default:
+            return errors::InvalidArgument("Unsupported data type: ",
+                                           value.dtype());
+        }
+        return Status::OK();
+      }
+
       void EnsureOutputAllocated(BatchResult* batch_result,
                                  const std::vector<Tensor>& return_values) {
         mutex_lock l(batch_result->mu);
@@ -228,8 +271,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         // Get the next input element.
         std::vector<Tensor> input_element;
         result->status =
-            input_impl_->GetNext(ctx, &input_element, &end_of_input_);
-        if (end_of_input_ || !result->status.ok()) {
+            input_impl_->GetNext(ctx, &input_element, &result->end_of_input);
+        if (result->end_of_input || !result->status.ok()) {
           batch_result->counter->DecrementCount();
           return;
         }
@@ -239,48 +282,67 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         // to unblock a consumer.
         FunctionLibraryRuntime::Options opts;
         opts.step_id = CapturedFunction::generate_step_id();
-        ScopedStepContainer* step_container = new ScopedStepContainer(
-            opts.step_id, [this, ctx](const string& name) {
+        ScopedStepContainer* step_container =
+            new ScopedStepContainer(opts.step_id, [this](const string& name) {
               dataset()
                   ->captured_func_->resource_manager()
                   ->Cleanup(name)
                   .IgnoreError();
             });
         opts.step_container = step_container;
-        opts.runner = ctx->runner();
-        dataset()->captured_func_->RunAsync(
-            opts, input_element, &result->return_values,
-            [this, result, step_container, batch_result,
-             offset](Status ret_status) {
-              delete step_container;
-              result->status.Update(ret_status);
-              if (ret_status.ok()) {
-                EnsureOutputAllocated(batch_result, result->return_values);
-                const size_t num_components = result->return_values.size();
-                for (size_t i = 0; i < num_components; ++i) {
-                  Tensor tensor = result->return_values[i];
-                  Tensor* batch = &(batch_result->output)[i];
-                  if (tensor.NumElements() !=
-                      (batch->NumElements() / batch->dim_size(0))) {
-                    TensorShape batch_shape = batch->shape();
-                    batch_shape.RemoveDim(0);
-                    result->status.Update(errors::InvalidArgument(
-                        "Cannot add tensor to the batch: number of "
-                        "elements does not match. Shapes are: [tensor]: ",
-                        tensor.shape().DebugString(),
-                        ", [batch]: ", batch_shape.DebugString()));
-                    break;
-                  }
-                  Status copy_status = ::tensorflow::functor::DoParallelConcat(
-                      *dataset()->device_, tensor, offset, batch);
-                  if (!copy_status.ok()) {
-                    result->status.Update(copy_status);
-                    break;
-                  }
-                }
-              }
-              batch_result->counter->DecrementCount();
-            });
+        std::function<void(std::function<void()>)>* runner =
+            new std::function<void(std::function<void()>)>(*ctx->runner());
+        opts.runner = runner;
+        (*ctx->runner())(std::bind(
+            [=](std::vector<Tensor> input_element) {
+              dataset()->captured_func_->RunAsync(
+                  opts, std::move(input_element), &result->return_values,
+                  [this, step_container, runner, result, batch_result,
+                   offset](Status ret_status) {
+                    delete step_container;
+                    delete runner;
+                    result->status.Update(ret_status);
+                    if (ret_status.ok()) {
+                      EnsureOutputAllocated(batch_result,
+                                            result->return_values);
+                      const size_t num_components =
+                          result->return_values.size();
+                      for (size_t i = 0; i < num_components; ++i) {
+                        const Tensor& tensor = result->return_values[i];
+                        Tensor* batch = &(batch_result->output)[i];
+                        if (tensor.NumElements() !=
+                            (batch->NumElements() / batch->dim_size(0))) {
+                          TensorShape batch_shape = batch->shape();
+                          batch_shape.RemoveDim(0);
+                          result->status.Update(errors::InvalidArgument(
+                              "Cannot add tensor to the batch: number of "
+                              "elements does not match. Shapes are: [tensor]: ",
+                              tensor.shape().DebugString(),
+                              ", [batch]: ", batch_shape.DebugString()));
+                          break;
+                        }
+                        // TODO(mrry): Add a version of DoParallelConcat that
+                        // allows us to move `tensor` where possible, to speed
+                        // up string tensor batching.
+                        Status copy_status =
+                            ::tensorflow::functor::DoParallelConcat(
+                                *dataset()->device_, tensor, offset, batch);
+                        if (!copy_status.ok()) {
+                          result->status.Update(copy_status);
+                          break;
+                        }
+                      }
+                    }
+                    // NOTE(mrry): We clear the return values here to release
+                    // any memory associated with them and to paralellize the
+                    // destruction of the tensors (which can be surprisingly
+                    // expensive for map functions with large numbers of return
+                    // values).
+                    result->return_values.clear();
+                    batch_result->counter->DecrementCount();
+                  });
+            },
+            std::move(input_element)));
       }
 
       void StartInvocationBatch(IteratorContext* ctx, int64 batch_index)
@@ -297,7 +359,10 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         for (size_t i = 0; i < dataset()->batch_size_; ++i) {
           size_t index = ComputeInvocationIndex(batch_index, i);
           InvocationResult* result = &invocation_results_[index];
-          *result = InvocationResult();
+          // Reset the state of `result`; `result->return_values` was cleared
+          // when the previous invocation completed.
+          result->end_of_input = false;
+          result->status = Status::OK();
         }
         // Start individual invocations.
         for (size_t i = 0; i < dataset()->batch_size_; ++i) {
@@ -305,13 +370,18 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         }
       }
 
-      Status WaitForBatch(int64 batch_index) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      Status WaitForBatch(int64 batch_index, int64* num_elements)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         port::Tracing::TraceMe activity(strings::StrCat(prefix(), "::Wait"));
         batch_results_[batch_index].counter->Wait();
         Status status = Status::OK();
-        for (size_t i = 0; i < dataset()->batch_size_; ++i) {
+        for (size_t i = 0; i < dataset()->batch_size_; ++i, ++*num_elements) {
           size_t index = ComputeInvocationIndex(batch_index, i);
           InvocationResult* result = &invocation_results_[index];
+          if (result->end_of_input) {
+            VLOG(3) << "end of input encountered at element[" << i << "]: ";
+            return Status::OK();
+          }
           if (!result->status.ok()) {
             VLOG(3) << "failed to process element[" << i
                     << "]: " << result->status;
@@ -326,7 +396,6 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       const std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
       std::vector<InvocationResult> invocation_results_ GUARDED_BY(mu_);
       std::vector<BatchResult> batch_results_ GUARDED_BY(mu_);
-      bool end_of_input_ GUARDED_BY(mu_) = false;
     };
 
     const DatasetBase* const input_;
diff --git a/tensorflow/core/kernels/map_dataset_op.cc b/tensorflow/core/kernels/data/map_dataset_op.cc
similarity index 95%
rename from tensorflow/core/kernels/map_dataset_op.cc
rename to tensorflow/core/kernels/data/map_dataset_op.cc
index 4ba09bc335e9682eef2a0c2042aa98e9b428d562..8fb1472e52ef9f7cf087237d2d7ed0c6049c4ae7 100644
--- a/tensorflow/core/kernels/map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_dataset_op.cc
@@ -12,15 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/random/random.h"
 
-#include "tensorflow/core/kernels/captured_function.h"
-
 namespace tensorflow {
 
 namespace {
@@ -100,7 +98,7 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
 
       DataTypeVector other_arguments_types(
           captured_func_->captured_inputs().size());
-      std::vector<NodeBuilder::NodeOut> other_arguments(
+      std::vector<Node*> other_arguments(
           captured_func_->captured_inputs().size());
       for (const Tensor& t : captured_func_->captured_inputs()) {
         Node* node;
@@ -146,7 +144,7 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
         FunctionLibraryRuntime::Options opts;
         opts.step_id = CapturedFunction::generate_step_id();
         ScopedStepContainer step_container(
-            opts.step_id, [this, ctx](const string& name) {
+            opts.step_id, [this](const string& name) {
               dataset()
                   ->captured_func_->resource_manager()
                   ->Cleanup(name)
@@ -156,7 +154,8 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
         opts.runner = ctx->runner();
         // TODO(mrry): Avoid blocking a threadpool thread. We will need to
         // stack-rip the iterators and use async kernels.
-        Status s = dataset()->captured_func_->Run(opts, args, out_tensors);
+        Status s =
+            dataset()->captured_func_->Run(opts, std::move(args), out_tensors);
         if (errors::IsOutOfRange(s)) {
           // `f` may deliberately raise `errors::OutOfRange` to indicate
           // that we should terminate the iteration early.
diff --git a/tensorflow/core/kernels/padded_batch_dataset_op.cc b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
similarity index 81%
rename from tensorflow/core/kernels/padded_batch_dataset_op.cc
rename to tensorflow/core/kernels/data/padded_batch_dataset_op.cc
index cfc77690b568a3223ca33f359f47fe22de9b35ff..00743324a82b4258f435aa66e0abd30530a366d7 100644
--- a/tensorflow/core/kernels/padded_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
@@ -12,11 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_util.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
@@ -181,16 +180,18 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
       padding_values.push_back(tensor::DeepCopy(padding_value_t));
     }
 
-    *output = new Dataset(batch_size, std::move(padded_shapes),
+    *output = new Dataset(ctx, batch_size, std::move(padded_shapes),
                           std::move(padding_values), input);
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    Dataset(int64 batch_size, std::vector<PartialTensorShape> padded_shapes,
+    Dataset(OpKernelContext* ctx, int64 batch_size,
+            std::vector<PartialTensorShape> padded_shapes,
             std::vector<Tensor> padding_values, const DatasetBase* input)
-        : batch_size_(batch_size),
+        : GraphDatasetBase(ctx),
+          batch_size_(batch_size),
           padded_shapes_(std::move(padded_shapes)),
           padding_values_(std::move(padding_values)),
           input_(input) {
@@ -232,6 +233,47 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
                              ")::Dataset");
     }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      Node* batch_size = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size));
+
+      std::vector<Node*> padded_shapes;
+      padded_shapes.reserve(padded_shapes_.size());
+      for (int i = 0; i < padded_shapes_.size(); i++) {
+        Node* node;
+        Tensor t(DT_INT64, TensorShape({padded_shapes_[i].dims()}));
+        for (int j = 0; j < padded_shapes_[i].dims(); j++) {
+          t.vec<int64>()(j) = padded_shapes_[i].dim_size(j);
+        }
+        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        padded_shapes.emplace_back(node);
+      }
+
+      std::vector<Node*> padding_values;
+      padding_values.reserve(padding_values_.size());
+      for (const Tensor& t : padding_values_) {
+        Node* node;
+        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        padding_values.emplace_back(node);
+      }
+
+      AttrValue output_types;
+      b->BuildAttrValue(output_dtypes(), &output_types);
+
+      AttrValue N;
+      b->BuildAttrValue<int64>(padded_shapes_.size(), &N);
+
+      TF_RETURN_IF_ERROR(
+          b->AddDataset(this, {{0, input_graph_node}, {1, batch_size}},
+                        {{2, padded_shapes}, {3, padding_values}},
+                        {{"Toutput_types", output_types}, {"N", N}}, output));
+      return Status::OK();
+    }
+
    private:
     // Copies element into the index^th slice of parent (in the 0th dimension).
     //
@@ -248,17 +290,25 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
         // Each row of `batch_elements` is a tuple of tensors from the
         // input iterator.
         std::vector<std::vector<Tensor>> batch_elements;
-        batch_elements.reserve(dataset()->batch_size_);
         {
           mutex_lock l(mu_);
-          *end_of_sequence = false;
-          for (int i = 0; i < dataset()->batch_size_ && !*end_of_sequence;
-               ++i) {
-            std::vector<Tensor> batch_element_tuple;
-            TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &batch_element_tuple,
-                                                    end_of_sequence));
-            if (!*end_of_sequence) {
-              batch_elements.push_back(std::move(batch_element_tuple));
+          if (!input_impl_) {
+            *end_of_sequence = true;
+            return Status::OK();
+          } else {
+            *end_of_sequence = false;
+            batch_elements.reserve(dataset()->batch_size_);
+            for (int i = 0; i < dataset()->batch_size_ && !*end_of_sequence;
+                 ++i) {
+              std::vector<Tensor> batch_element_tuple;
+              TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &batch_element_tuple,
+                                                      end_of_sequence));
+              if (!*end_of_sequence) {
+                batch_elements.push_back(std::move(batch_element_tuple));
+              }
+            }
+            if (*end_of_sequence) {
+              input_impl_.reset();
             }
           }
         }
@@ -347,6 +397,28 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (input_impl_)
+          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        else
+          TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("exhausted"), ""));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (reader->Contains(full_name("exhausted"))) {
+          input_impl_.reset();
+        } else {
+          input_impl_ = dataset()->input_->MakeIterator(prefix());
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        }
+        return Status::OK();
+      }
+
      private:
       mutex mu_;
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/kernels/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
similarity index 99%
rename from tensorflow/core/kernels/parallel_interleave_dataset_op.cc
rename to tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
index 56942a5c01f3c2be5617aa1a9e1eadea12857911..cb6a83606e621ce67c828a2059afa7ead733e315 100644
--- a/tensorflow/core/kernels/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/captured_function.h"
-#include "tensorflow/core/kernels/dataset_utils.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
 
diff --git a/tensorflow/core/kernels/parallel_map_dataset_op.cc b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
similarity index 96%
rename from tensorflow/core/kernels/parallel_map_dataset_op.cc
rename to tensorflow/core/kernels/data/parallel_map_dataset_op.cc
index 2be87f4bde6f28596213433fe287d351ccf0c721..930ea3585951f2fa5dd4e41ddfde9cc25a598d4c 100644
--- a/tensorflow/core/kernels/parallel_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
@@ -14,15 +14,13 @@ limitations under the License.
 ==============================================================================*/
 #include <deque>
 
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/random/random.h"
 
-#include "tensorflow/core/kernels/captured_function.h"
-
 namespace tensorflow {
 
 namespace {
@@ -195,8 +193,8 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
 
           FunctionLibraryRuntime::Options opts;
           opts.step_id = CapturedFunction::generate_step_id();
-          ScopedStepContainer* step_container = new ScopedStepContainer(
-              opts.step_id, [this, ctx](const string& name) {
+          ScopedStepContainer* step_container =
+              new ScopedStepContainer(opts.step_id, [this](const string& name) {
                 dataset()
                     ->captured_func_->resource_manager()
                     ->Cleanup(name)
@@ -205,7 +203,7 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
           opts.step_container = step_container;
           opts.runner = ctx->runner();
           dataset()->captured_func_->RunAsync(
-              opts, input_element, &result->return_values,
+              opts, std::move(input_element), &result->return_values,
               [result, step_container, result_index](Status ret_status) {
                 delete step_container;
                 result->status.Update(ret_status);
diff --git a/tensorflow/core/kernels/prefetch_dataset_op.cc b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
similarity index 54%
rename from tensorflow/core/kernels/prefetch_dataset_op.cc
rename to tensorflow/core/kernels/data/prefetch_dataset_op.cc
index 93ff7cff57c492679c3a872364d74931ab83288a..6899767ce5eb1d69a02e4f6f07d2a844360183b9 100644
--- a/tensorflow/core/kernels/prefetch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
@@ -14,9 +14,10 @@ limitations under the License.
 ==============================================================================*/
 #include <deque>
 
-#include "tensorflow/core/kernels/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
 
 namespace tensorflow {
 
@@ -39,14 +40,14 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
     OP_REQUIRES(ctx, buffer_size > 0,
                 errors::InvalidArgument("buffer_size must be > 0"));
 
-    *output = new Dataset(input, buffer_size);
+    *output = new Dataset(ctx, input, buffer_size);
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    Dataset(const DatasetBase* input, int64 buffer_size)
-        : input_(input), buffer_size_(buffer_size) {
+    Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 buffer_size)
+        : GraphDatasetBase(ctx), input_(input), buffer_size_(buffer_size) {
       input_->Ref();
     }
 
@@ -67,6 +68,18 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() override { return "PrefetchDatasetOp::Dataset"; }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      Node* buffer_size = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(buffer_size_, &buffer_size));
+      TF_RETURN_IF_ERROR(
+          b->AddDataset(this, {input_graph_node, buffer_size}, output));
+      return Status::OK();
+    }
+
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
@@ -121,7 +134,10 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
 
             // Wake the prefetch thread, in case it has been waiting
             // for space in the buffer.
-            cond_var_.notify_one();
+            // Also wake up threads from other calls to GetNext.
+            // TODO(mrry): Consider using different condition variables
+            // for GetNext and Prefetch.
+            cond_var_.notify_all();
             return s;
           } else if (prefetch_thread_finished_) {
             *end_of_sequence = true;
@@ -130,6 +146,69 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
         }
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        // Acquire both locks to ensure that the prefetch thread and
+        // all GetNext threads are blocked.
+        mutex_lock parent_l(parent_mu_);
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("buffer_size"), buffer_.size()));
+        for (size_t i = 0; i < buffer_.size(); i++) {
+          auto& buffer_element = buffer_[i];
+          TF_RETURN_IF_ERROR(WriteStatus(writer, i, buffer_element.status));
+          if (buffer_element.status.ok()) {
+            TF_RETURN_IF_ERROR(writer->WriteScalar(
+                full_name(strings::StrCat("buffer[", i, "].size")),
+                buffer_element.value.size()));
+            for (size_t j = 0; j < buffer_element.value.size(); j++) {
+              TF_RETURN_IF_ERROR(writer->WriteTensor(
+                  strings::StrCat("buffer[", i, "][", j, "]"),
+                  buffer_element.value[j]));
+            }
+          }
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock parent_l(parent_mu_);
+        mutex_lock l(mu_);
+        buffer_.clear();
+        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        size_t buffer_size;
+        {
+          int64 temp;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("buffer_size"), &temp));
+          buffer_size = static_cast<size_t>(temp);
+        }
+        for (size_t i = 0; i < buffer_size; i++) {
+          buffer_.emplace_back();
+          auto& buffer_element = buffer_.back();
+          TF_RETURN_IF_ERROR(ReadStatus(reader, i, &buffer_element.status));
+          if (buffer_element.status.ok()) {
+            size_t value_size;
+            {
+              int64 temp;
+              TF_RETURN_IF_ERROR(reader->ReadScalar(
+                  full_name(strings::StrCat("buffer[", i, "].size")), &temp));
+              value_size = static_cast<size_t>(temp);
+            }
+            buffer_element.value.reserve(value_size);
+            for (size_t j = 0; j < value_size; j++) {
+              buffer_element.value.emplace_back();
+              TF_RETURN_IF_ERROR(reader->ReadTensor(
+                  strings::StrCat("buffer[", i, "][", j, "]"),
+                  &buffer_element.value.back()));
+            }
+          }
+        }
+        return Status::OK();
+      }
+
      private:
       // A buffer element comprises a status and (if that status is
       // OK) a vector of tensors, representing an element of the input dataset.
@@ -173,6 +252,12 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
           }
 
           // 2. Read the next element.
+          // Acquire the parent lock since we will be reading an element
+          // from the input iterator. Note that we do not wish to release
+          // this lock till we have added the fetched element to the
+          // `buffer_` else there will be local state that may be missed
+          // by SaveInternal.
+          mutex_lock parent_l(parent_mu_);
           bool end_of_sequence;
           BufferElement buffer_element;
           buffer_element.status = input_impl_->GetNext(
@@ -193,8 +278,50 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
         }
       }
 
+      Status WriteStatus(IteratorStateWriter* writer, size_t index,
+                         const Status& status) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            CodeKey(index), static_cast<int64>(status.code())));
+        if (!status.ok()) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(ErrorMessageKey(index),
+                                                 status.error_message()));
+        }
+        return Status::OK();
+      }
+
+      Status ReadStatus(IteratorStateReader* reader, size_t index,
+                        Status* status) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        int64 code_int;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(CodeKey(index), &code_int));
+        error::Code code = static_cast<error::Code>(code_int);
+
+        if (code != error::Code::OK) {
+          string error_message;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(ErrorMessageKey(index), &error_message));
+          *status = Status(code, error_message);
+        } else {
+          *status = Status::OK();
+        }
+        return Status::OK();
+      }
+
+      string CodeKey(size_t index) {
+        return full_name(strings::StrCat("status[", index, "].code"));
+      }
+
+      string ErrorMessageKey(size_t index) {
+        return full_name(strings::StrCat("status[", index, "].error_message"));
+      }
+
+      // This mutex is used to ensure exclusivity between multiple threads
+      // reading/writing this iterator's local state.
       mutex mu_;
-      const std::unique_ptr<IteratorBase> input_impl_;
+      // This mutex is used to ensure exclusivity between multiple threads
+      // accessing the parent iterator. We keep this separate from `mu_` to
+      // allow prefetching to run in parallel with GetNext calls.
+      mutex parent_mu_ ACQUIRED_BEFORE(mu_);
+      const std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(parent_mu_);
       condition_variable cond_var_;
       std::deque<BufferElement> buffer_ GUARDED_BY(mu_);
       std::unique_ptr<Thread> prefetch_thread_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/kernels/data/random_dataset_op.cc b/tensorflow/core/kernels/data/random_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..569df12df7597b021e24be5eed774c1354c4fe41
--- /dev/null
+++ b/tensorflow/core/kernels/data/random_dataset_op.cc
@@ -0,0 +1,154 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class RandomDatasetOp : public DatasetOpKernel {
+ public:
+  explicit RandomDatasetOp(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
+    int64 seed;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "seed", &seed));
+
+    int64 seed2;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "seed2", &seed2));
+
+    // By TensorFlow convention, passing 0 for both seeds indicates
+    // that the shuffling should be seeded non-deterministically.
+    if (seed == 0 && seed2 == 0) {
+      seed = random::New64();
+      seed2 = random::New64();
+    }
+
+    *output = new Dataset(ctx, seed, seed2);
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, int64 seed, int64 seed2)
+        : GraphDatasetBase(ctx), seed_(seed), seed2_(seed2) {}
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::Random")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      static DataTypeVector* dtypes = new DataTypeVector({DT_INT64});
+      return *dtypes;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      static std::vector<PartialTensorShape>* shapes =
+          new std::vector<PartialTensorShape>({{}});
+      return *shapes;
+    }
+
+    string DebugString() override {
+      return strings::StrCat("RandomDatasetOp(", seed_, ", ", seed2_,
+                             ")::Dataset");
+    }
+
+   protected:
+    Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* seed = nullptr;
+      Node* seed2 = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(seed_, &seed));
+      TF_RETURN_IF_ERROR(b->AddScalar(seed2_, &seed2));
+      TF_RETURN_IF_ERROR(b->AddDataset(this, {seed, seed2}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            parent_generator_(dataset()->seed_, dataset()->seed2_),
+            generator_(&parent_generator_) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        Tensor value_tensor(cpu_allocator(), DT_INT64, {});
+        value_tensor.scalar<int64>()() = Random();
+        out_tensors->emplace_back(std::move(value_tensor));
+        *end_of_sequence = false;
+        return Status::OK();
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("num_random_samples"),
+                                               num_random_samples_));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("num_random_samples"),
+                                              &num_random_samples_));
+        parent_generator_ =
+            random::PhiloxRandom(dataset()->seed_, dataset()->seed2_);
+        generator_ = random::SingleSampleAdapter<random::PhiloxRandom>(
+            &parent_generator_);
+        generator_.Skip(num_random_samples_);
+        return Status::OK();
+      }
+
+     private:
+      random::SingleSampleAdapter<random::PhiloxRandom>::ResultType Random()
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        num_random_samples_++;
+        auto out = generator_();
+        return out;
+      }
+      mutex mu_;
+      random::PhiloxRandom parent_generator_ GUARDED_BY(mu_);
+      random::SingleSampleAdapter<random::PhiloxRandom> generator_
+          GUARDED_BY(mu_);
+      int64 num_random_samples_ GUARDED_BY(mu_) = 0;
+    };
+
+    const int64 seed_;
+    const int64 seed2_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("RandomDataset").Device(DEVICE_CPU),
+                        RandomDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/range_dataset_op.cc b/tensorflow/core/kernels/data/range_dataset_op.cc
similarity index 98%
rename from tensorflow/core/kernels/range_dataset_op.cc
rename to tensorflow/core/kernels/data/range_dataset_op.cc
index e7ae840fc7d023cda8c11ecd1f7cde3842a9da00..e75a3f8d4d86d4384298f8b3cb764247b7483361 100644
--- a/tensorflow/core/kernels/range_dataset_op.cc
+++ b/tensorflow/core/kernels/data/range_dataset_op.cc
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/reader_dataset_ops.cc b/tensorflow/core/kernels/data/reader_dataset_ops.cc
similarity index 99%
rename from tensorflow/core/kernels/reader_dataset_ops.cc
rename to tensorflow/core/kernels/data/reader_dataset_ops.cc
index d942ddc4a7b9042038c6b7a2a52e46c1bf45b2a9..557e98c1e68d58e57951f3fbc30a7c77d6b6d0ee 100644
--- a/tensorflow/core/kernels/reader_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/reader_dataset_ops.cc
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/io/buffered_inputstream.h"
 #include "tensorflow/core/lib/io/inputbuffer.h"
 #include "tensorflow/core/lib/io/random_inputstream.h"
diff --git a/tensorflow/core/kernels/repeat_dataset_op.cc b/tensorflow/core/kernels/data/repeat_dataset_op.cc
similarity index 90%
rename from tensorflow/core/kernels/repeat_dataset_op.cc
rename to tensorflow/core/kernels/data/repeat_dataset_op.cc
index 3d977a0fa38be77ac812cb12aade2af20b871fb8..0e4f92a8fd1251173c976e864ebcce6032f27c28 100644
--- a/tensorflow/core/kernels/repeat_dataset_op.cc
+++ b/tensorflow/core/kernels/data/repeat_dataset_op.cc
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
@@ -176,30 +175,25 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
                              bool* end_of_sequence) override {
         mutex_lock l(mu_);  // TODO(mrry): Make locking less conservative.
         do {
+          bool first_call = false;
           if (!input_impl_) {
+            first_call = true;
             input_impl_ = dataset()->input_->MakeIterator(prefix());
-            TF_RETURN_IF_ERROR(
-                input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
-            // If the first call to GetNext() fails because the end of
-            // sequence has been reached, we return an OutOfRange
-            // error to terminate the iteration. (Otherwise, this
-            // iterator would loop infinitely and never produce a
-            // value.)
-            if (!*end_of_sequence) {
-              return Status::OK();
-            } else {
-              input_impl_.reset();
+          }
+          TF_RETURN_IF_ERROR(
+              input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
+          if (!*end_of_sequence) {
+            return Status::OK();
+          } else {
+            input_impl_.reset();
+            if (first_call) {
+              // If the first call to GetNext() fails because the end of
+              // sequence has been reached, we return an OutOfRange error to
+              // terminate the iteration. (Otherwise, this iterator would loop
+              // infinitely and never produce a value.)
               return errors::OutOfRange(
                   "Attempted to repeat an empty dataset infinitely.");
             }
-          } else {
-            TF_RETURN_IF_ERROR(
-                input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
-            if (!*end_of_sequence) {
-              return Status::OK();
-            } else {
-              input_impl_.reset();
-            }
           }
         } while (true);
       }
diff --git a/tensorflow/core/kernels/scan_dataset_op.cc b/tensorflow/core/kernels/data/scan_dataset_op.cc
similarity index 96%
rename from tensorflow/core/kernels/scan_dataset_op.cc
rename to tensorflow/core/kernels/data/scan_dataset_op.cc
index 76c219f1ae6352f047035b1bfd3231689d0d3771..84ba0514687f2c3b289fad5f520de14abd859570 100644
--- a/tensorflow/core/kernels/scan_dataset_op.cc
+++ b/tensorflow/core/kernels/data/scan_dataset_op.cc
@@ -18,8 +18,8 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/captured_function.h"
-#include "tensorflow/core/kernels/dataset.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
@@ -132,7 +132,7 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
         FunctionLibraryRuntime::Options opts;
         opts.step_id = CapturedFunction::generate_step_id();
         ScopedStepContainer step_container(
-            opts.step_id, [this, ctx](const string& name) {
+            opts.step_id, [this](const string& name) {
               dataset()
                   ->captured_func_->resource_manager()
                   ->Cleanup(name)
@@ -143,8 +143,8 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
         std::vector<Tensor> state_and_output;
         state_and_output.reserve(dataset()->state_types_.size() +
                                  output_dtypes().size());
-        Status s =
-            dataset()->captured_func_->Run(opts, args, &state_and_output);
+        Status s = dataset()->captured_func_->Run(opts, std::move(args),
+                                                  &state_and_output);
         if (s.ok()) {
           state_.clear();
           size_t i = 0;
diff --git a/tensorflow/core/kernels/shuffle_dataset_op.cc b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
similarity index 51%
rename from tensorflow/core/kernels/shuffle_dataset_op.cc
rename to tensorflow/core/kernels/data/shuffle_dataset_op.cc
index 72facb3a0d0cc13a559b3d8005592e19b97fed6f..caef449b8e829ac9626c9ed29382f68829d7c0b9 100644
--- a/tensorflow/core/kernels/shuffle_dataset_op.cc
+++ b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
@@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
+
+#include <deque>
+#include <vector>
 
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/random/philox_random.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/random/random_distributions.h"
@@ -29,50 +32,21 @@ const int64 kLogIntervalMicros = 10 * 1000000;  // 10 seconds.
 // See documentation in ../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
-class ShuffleDatasetOp : public UnaryDatasetOpKernel {
+class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
  public:
-  explicit ShuffleDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("reshuffle_each_iteration",
-                                     &reshuffle_each_iteration_));
-  }
-
-  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
-                   DatasetBase** output) override {
-    int64 buffer_size;
-    OP_REQUIRES_OK(
-        ctx, ParseScalarArgument<int64>(ctx, "buffer_size", &buffer_size));
-    OP_REQUIRES(
-        ctx, buffer_size > 0,
-        errors::InvalidArgument("buffer_size must be greater than zero."));
-
-    int64 seed;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "seed", &seed));
-
-    int64 seed2;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "seed2", &seed2));
-
-    // By TensorFlow convention, passing 0 for both seeds indicates
-    // that the shuffling should be seeded non-deterministically.
-    if (seed == 0 && seed2 == 0) {
-      seed = random::New64();
-      seed2 = random::New64();
-    }
-
-    if (reshuffle_each_iteration_) {
-      *output = new ReshufflingDataset(ctx, input, buffer_size, seed, seed2);
-    } else {
-      *output = new FixedSeedDataset(ctx, input, buffer_size, seed, seed2);
-    }
-  }
+  explicit ShuffleDatasetOpBase(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
 
- private:
+ protected:
   // Abstract base dataset that implements a shuffling iterator.
   class ShuffleDatasetBase : public GraphDatasetBase {
    public:
     ShuffleDatasetBase(OpKernelContext* ctx, const DatasetBase* input,
-                       int64 buffer_size)
-        : GraphDatasetBase(ctx), input_(input), buffer_size_(buffer_size) {
+                       int64 buffer_size, int64 count)
+        : GraphDatasetBase(ctx),
+          input_(input),
+          buffer_size_(buffer_size),
+          count_(count) {
       input_->Ref();
     }
 
@@ -91,12 +65,15 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
      public:
       explicit Iterator(const Params& params, int64 seed, int64 seed2)
           : DatasetIterator<ShuffleDatasetBase>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
+            input_impl_(nullptr),
             seed_(seed),
             seed2_(seed2),
+            epoch_(0),
+            num_elements_(0),
             parent_generator_(seed, seed2),
             generator_(&parent_generator_) {
-        buffer_.reserve(params.dataset->buffer_size_);
+        buffer_.reset(new std::vector<Tensor>[params.dataset->buffer_size_]);
+        slices_.emplace_back(new Slice{0, 0});
       }
 
       Status GetNextInternal(IteratorContext* ctx,
@@ -105,19 +82,44 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
         mutex_lock l(mu_);
         int64 start_micros = ctx->env()->NowMicros();
         int64 num_log_entries = 0;
-        while (input_impl_ && buffer_.size() < dataset()->buffer_size_) {
+        bool first_call = false;
+        if (!input_impl_ && epoch_ == 0) {
+          first_call = true;
+          input_impl_ = dataset()->input_->MakeIterator(prefix());
+        }
+        while (input_impl_ && num_elements_ < dataset()->buffer_size_) {
           if (ctx->env()->NowMicros() >
               ((num_log_entries + 1) * kLogIntervalMicros) + start_micros) {
             num_log_entries++;
             LOG(INFO) << "Filling up shuffle buffer (this may take a while): "
-                      << buffer_.size() << " of " << dataset()->buffer_size_;
+                      << num_elements_ << " of " << dataset()->buffer_size_;
           }
           std::vector<Tensor> input_element;
-          bool end_of_input_sequence;
-          TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &input_element,
-                                                  &end_of_input_sequence));
+          bool end_of_input_sequence = false;
+          while (dataset()->count_ == -1 || epoch_ < dataset()->count_) {
+            TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &input_element,
+                                                    &end_of_input_sequence));
+            if (!end_of_input_sequence) {
+              break;
+            }
+            if (first_call && dataset()->count_ == -1) {
+              // If the first call to GetNext() fails because the end of
+              // sequence has been reached, we return an OutOfRange error to
+              // terminate the iteration. (Otherwise, this iterator may loop
+              // infinitely and never produce a value.)
+              return errors::OutOfRange(
+                  "Attempted to repeat an empty dataset infinitely.");
+            }
+            epoch_++;
+            int64 n = slices_.back()->end;
+            slices_.emplace_back(new Slice{n, n});
+            input_impl_ = dataset()->input_->MakeIterator(prefix());
+          }
           if (!end_of_input_sequence) {
-            buffer_.emplace_back(std::move(input_element));
+            buffer_[slices_.back()->end % dataset()->buffer_size_] =
+                std::move(input_element);
+            num_elements_++;
+            slices_.back()->end++;
           } else {
             input_impl_.reset();
           }
@@ -126,14 +128,25 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
           LOG(INFO) << "Shuffle buffer filled.";
         }
 
-        if (!buffer_.empty()) {
+        if (num_elements_ > 0) {
           *end_of_sequence = false;
-          // Choose an element to produce uniformly at random, and
-          // swap the last element into its place in the buffer.
-          int64 index = Random() % buffer_.size();
+          // Garbage collect all empty slices.
+          while (!slices_.empty() &&
+                 slices_.front()->start == slices_.front()->end) {
+            slices_.pop_front();
+          }
+          DCHECK(!slices_.empty());
+          // Choose an element to produce uniformly at random from the first
+          // slice, and then remove the element from the slice.
+          int64 offset =
+              Random() % (slices_.front()->end - slices_.front()->start);
+          int64 index =
+              (slices_.front()->start + offset) % dataset()->buffer_size_;
           *out_tensors = std::move(buffer_[index]);
-          std::swap(buffer_[index], buffer_.back());
-          buffer_.pop_back();
+          std::swap(buffer_[index],
+                    buffer_[slices_.front()->start % dataset()->buffer_size_]);
+          slices_.front()->start++;
+          num_elements_--;
         } else {
           DCHECK(input_impl_ == nullptr);
           *end_of_sequence = true;
@@ -145,20 +158,6 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
 
-        // Save the tensors in the buffer.
-        TF_RETURN_IF_ERROR(
-            writer->WriteScalar(full_name("buffer_size"), buffer_.size()));
-        for (size_t i = 0; i < buffer_.size(); i++) {
-          TF_RETURN_IF_ERROR(writer->WriteScalar(
-              full_name(strings::StrCat("buffer_", i, "_size")),
-              buffer_[i].size()));
-          for (size_t j = 0; j < buffer_[i].size(); j++) {
-            TF_RETURN_IF_ERROR(writer->WriteTensor(
-                full_name(strings::StrCat("buffer_", i, "_", j)),
-                buffer_[i][j]));
-          }
-        }
-
         // Save state needed to restore the random number generators.
         TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("num_random_samples"),
                                                num_random_samples_));
@@ -171,34 +170,38 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
         } else {
           TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
         }
+
+        // Save the epoch counter, buffer, and buffer slices.
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("epoch"), epoch_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("num_elements"), num_elements_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("slices_size"), slices_.size()));
+        for (size_t i = 0; i < slices_.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat("slices_start_", i)),
+              slices_[i]->start));
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat("slices_end_", i)), slices_[i]->end));
+          for (size_t j = slices_[i]->start; j < slices_[i]->end; ++j) {
+            size_t index = j % dataset()->buffer_size_;
+            TF_RETURN_IF_ERROR(writer->WriteScalar(
+                full_name(strings::StrCat("buffer_", index, "_size")),
+                buffer_[index].size()));
+            for (size_t k = 0; k < buffer_[index].size(); ++k) {
+              TF_RETURN_IF_ERROR(writer->WriteTensor(
+                  full_name(strings::StrCat("buffer_", index, "_", k)),
+                  buffer_[index][k]));
+            }
+          }
+        }
+
         return Status::OK();
       }
 
       Status RestoreInternal(OpKernelContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
-        buffer_.clear();
-
-        // Restore the buffer.
-        size_t buffer_size;
-        {
-          int64 temp;
-          TF_RETURN_IF_ERROR(
-              reader->ReadScalar(full_name("buffer_size"), &temp));
-          buffer_size = static_cast<size_t>(temp);
-        }
-        buffer_.reserve(buffer_size);
-        for (size_t i = 0; i < buffer_size; i++) {
-          int64 list_size;
-          TF_RETURN_IF_ERROR(reader->ReadScalar(
-              full_name(strings::StrCat("buffer_", i, "_size")), &list_size));
-          buffer_.emplace_back(std::vector<Tensor>(list_size));
-          for (int j = 0; j < list_size; j++) {
-            TF_RETURN_IF_ERROR(reader->ReadTensor(
-                full_name(strings::StrCat("buffer_", i, "_", j)),
-                &buffer_[i][j]));
-          }
-        }
 
         // Restore the random number generators.
         TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("num_random_samples"),
@@ -212,10 +215,58 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
         } else {
           input_impl_.reset();
         }
+
+        // Restore the epoch counter, buffer, and buffer slices.
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("epoch"), &epoch_));
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("num_elements"), &num_elements_));
+        size_t slices_size;
+        {
+          int64 temp;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("slices_size"), &temp));
+          slices_size = static_cast<size_t>(temp);
+        }
+        buffer_.reset(new std::vector<Tensor>[dataset()->buffer_size_]);
+        for (size_t i = 0; i < slices_size; ++i) {
+          int64 start;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              full_name(strings::StrCat("slices_start_", i)), &start));
+          int64 end;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              full_name(strings::StrCat("slices_end_", i)), &end));
+          slices_.emplace_back(new Slice{start, end});
+          for (size_t j = start; j < end; ++j) {
+            size_t index = j % dataset()->buffer_size_;
+            int64 list_size;
+            TF_RETURN_IF_ERROR(reader->ReadScalar(
+                full_name(strings::StrCat("buffer_", index, "_size")),
+                &list_size));
+            buffer_[index] = std::vector<Tensor>(list_size);
+            for (int k = 0; k < list_size; ++k) {
+              TF_RETURN_IF_ERROR(reader->ReadTensor(
+                  full_name(strings::StrCat("buffer_", index, "_", k)),
+                  &buffer_[index][k]));
+            }
+          }
+        }
+
         return Status::OK();
       }
 
      private:
+      // Used to represent slices of `buffer_` that belong to different epochs.
+      // The invariant maintained by the implementation is: `start` <= `end`.
+      // When using `start` and `end` to index into `buffer_`, their values
+      // should be taken modulo the size of `buffer_` as their absolute value
+      // can be greater than the range of `buffer_`.
+      struct Slice {
+        Slice(int64 start, int64 end) : start(start), end(end) {}
+
+        int64 start;
+        int64 end;
+      };
+
       random::SingleSampleAdapter<random::PhiloxRandom>::ResultType Random()
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         num_random_samples_++;
@@ -232,10 +283,13 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
       }
 
       mutex mu_;
-      std::vector<std::vector<Tensor>> buffer_ GUARDED_BY(mu_);
+      std::unique_ptr<std::vector<Tensor>[]> buffer_ GUARDED_BY(mu_);
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
       const int64 seed_ GUARDED_BY(mu_);
       const int64 seed2_ GUARDED_BY(mu_);
+      int64 epoch_ GUARDED_BY(mu_);
+      int64 num_elements_ GUARDED_BY(mu_);
+      std::deque<std::unique_ptr<Slice>> slices_ GUARDED_BY(mu_);
       random::PhiloxRandom parent_generator_ GUARDED_BY(mu_);
       random::SingleSampleAdapter<random::PhiloxRandom> generator_
           GUARDED_BY(mu_);
@@ -244,15 +298,58 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
 
     const DatasetBase* const input_;
     const int64 buffer_size_;
+    const int64 count_;
   };
+};
+
+class ShuffleDatasetOp : public ShuffleDatasetOpBase {
+ public:
+  explicit ShuffleDatasetOp(OpKernelConstruction* ctx)
+      : ShuffleDatasetOpBase(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("reshuffle_each_iteration",
+                                     &reshuffle_each_iteration_));
+  }
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 buffer_size;
+    OP_REQUIRES_OK(
+        ctx, ParseScalarArgument<int64>(ctx, "buffer_size", &buffer_size));
+    OP_REQUIRES(
+        ctx, buffer_size > 0,
+        errors::InvalidArgument("buffer_size must be greater than zero."));
+
+    int64 seed;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "seed", &seed));
+
+    int64 seed2;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "seed2", &seed2));
+
+    // By TensorFlow convention, passing 0 for both seeds indicates
+    // that the shuffling should be seeded non-deterministically.
+    if (seed == 0 && seed2 == 0) {
+      seed = random::New64();
+      seed2 = random::New64();
+    }
+
+    int64 count = 1;
+    if (reshuffle_each_iteration_) {
+      *output =
+          new ReshufflingDataset(ctx, input, buffer_size, seed, seed2, count);
+    } else {
+      *output =
+          new FixedSeedDataset(ctx, input, buffer_size, seed, seed2, count);
+    }
+  }
 
+ private:
   // A dataset that uses a pseduorandom sequence of seeds for the iterators
   // created from it. Used when `reshuffle_each_iteration` is true.
   class ReshufflingDataset : public ShuffleDatasetBase {
    public:
     ReshufflingDataset(OpKernelContext* ctx, const DatasetBase* input,
-                       int64 buffer_size, int64 seed, int64 seed2)
-        : ShuffleDatasetBase(ctx, input, buffer_size),
+                       int64 buffer_size, int64 seed, int64 seed2, int64 count)
+        : ShuffleDatasetBase(ctx, input, buffer_size, count),
           seed_(seed),
           seed2_(seed2),
           parent_generator_(seed, seed2),
@@ -291,8 +388,8 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
   class FixedSeedDataset : public ShuffleDatasetBase {
    public:
     FixedSeedDataset(OpKernelContext* ctx, const DatasetBase* input,
-                     int64 buffer_size, int64 seed, int64 seed2)
-        : ShuffleDatasetBase(ctx, input, buffer_size),
+                     int64 buffer_size, int64 seed, int64 seed2, int64 count)
+        : ShuffleDatasetBase(ctx, input, buffer_size, count),
           seed_(seed),
           seed2_(seed) {}
 
@@ -337,9 +434,93 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
   bool reshuffle_each_iteration_;
 };
 
+class ShuffleAndRepeatDatasetOp : public ShuffleDatasetOpBase {
+ public:
+  explicit ShuffleAndRepeatDatasetOp(OpKernelConstruction* ctx)
+      : ShuffleDatasetOpBase(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 buffer_size;
+    OP_REQUIRES_OK(
+        ctx, ParseScalarArgument<int64>(ctx, "buffer_size", &buffer_size));
+    OP_REQUIRES(
+        ctx, buffer_size > 0,
+        errors::InvalidArgument("buffer_size must be greater than zero."));
+
+    int64 seed;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "seed", &seed));
+
+    int64 seed2;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "seed2", &seed2));
+
+    int64 count;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "count", &count));
+
+    // By TensorFlow convention, if both seeds are 0, then shuffling should be
+    // seeded non-deterministically.
+    if (seed == 0 && seed2 == 0) {
+      seed = random::New64();
+      seed2 = random::New64();
+    }
+
+    *output = new Dataset(ctx, input, buffer_size, seed, seed2, count);
+  }
+
+ private:
+  class Dataset : public ShuffleDatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 buffer_size,
+            int64 seed, int64 seed2, int64 count)
+        : ShuffleDatasetBase(ctx, input, buffer_size, count),
+          seed_(seed),
+          seed2_(seed2) {}
+
+    string DebugString() override {
+      return strings::StrCat("ShuffleAndRepeatDatasetOp(", buffer_size_, ", ",
+                             seed_, ", ", seed2_, ", ", count_, ")::Dataset");
+    }
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(new ShuffleDatasetBase::Iterator(
+          {this, strings::StrCat(prefix, "::ShuffleAndRepeat")}, seed_,
+          seed2_));
+    }
+
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      Node* buffer_size = nullptr;
+      Node* seed = nullptr;
+      Node* seed2 = nullptr;
+      Node* count = nullptr;
+
+      TF_RETURN_IF_ERROR(b->AddScalar(buffer_size_, &buffer_size));
+      TF_RETURN_IF_ERROR(b->AddScalar(seed_, &seed));
+      TF_RETURN_IF_ERROR(b->AddScalar(seed2_, &seed2));
+      TF_RETURN_IF_ERROR(b->AddScalar(count_, &count));
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this, {input_graph_node, buffer_size, seed, seed2, count},  // Inputs
+          {},                                                         // Attrs
+          output));
+      return Status::OK();
+    }
+
+   private:
+    const int64 seed_;
+    const int64 seed2_;
+  };
+};
+
 REGISTER_KERNEL_BUILDER(Name("ShuffleDataset").Device(DEVICE_CPU),
                         ShuffleDatasetOp);
 
+REGISTER_KERNEL_BUILDER(Name("ShuffleAndRepeatDataset").Device(DEVICE_CPU),
+                        ShuffleAndRepeatDatasetOp);
+
 }  // namespace
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/skip_dataset_op.cc b/tensorflow/core/kernels/data/skip_dataset_op.cc
similarity index 99%
rename from tensorflow/core/kernels/skip_dataset_op.cc
rename to tensorflow/core/kernels/data/skip_dataset_op.cc
index 1fe49271e299f042b9dc88a30d88d3d26a9e65f2..58a149c7cf2098afa9b55b6d13495cd439f05758 100644
--- a/tensorflow/core/kernels/skip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/skip_dataset_op.cc
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/sparse_tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
similarity index 99%
rename from tensorflow/core/kernels/sparse_tensor_slice_dataset_op.cc
rename to tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
index de5ab1a3678b981a95de533dc2f59cc16dd7705c..fdfb2b70e061f73c878f92a57d6ad97a92842199 100644
--- a/tensorflow/core/kernels/sparse_tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
@@ -14,11 +14,10 @@ limitations under the License.
 ==============================================================================*/
 #include <numeric>
 
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/util/sparse/sparse_tensor.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/data/sql/BUILD b/tensorflow/core/kernels/data/sql/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..bec5b2e7d6cd045fb2fac4593defa2ff241324bf
--- /dev/null
+++ b/tensorflow/core/kernels/data/sql/BUILD
@@ -0,0 +1,39 @@
+# Description:
+# SQL library.
+#
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+cc_library(
+    name = "sql",
+    srcs = [
+        "driver_manager.cc",
+        "sqlite_query_connection.cc",
+    ],
+    hdrs = [
+        "driver_manager.h",
+        "query_connection.h",
+        "sqlite_query_connection.h",
+    ],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/lib/db:sqlite",
+        "@sqlite_archive//:sqlite",
+    ],
+)
diff --git a/tensorflow/core/kernels/sql/driver_manager.cc b/tensorflow/core/kernels/data/sql/driver_manager.cc
similarity index 89%
rename from tensorflow/core/kernels/sql/driver_manager.cc
rename to tensorflow/core/kernels/data/sql/driver_manager.cc
index 9a5d5aa853c438ef4e893fac2322af17ae863fa8..ffabda1a8a1fe8bce629ed34590c058a231f3cfc 100644
--- a/tensorflow/core/kernels/sql/driver_manager.cc
+++ b/tensorflow/core/kernels/data/sql/driver_manager.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/sql/driver_manager.h"
-#include "tensorflow/core/kernels/sql/sqlite_query_connection.h"
+#include "tensorflow/core/kernels/data/sql/driver_manager.h"
+#include "tensorflow/core/kernels/data/sql/sqlite_query_connection.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/sql/driver_manager.h b/tensorflow/core/kernels/data/sql/driver_manager.h
similarity index 82%
rename from tensorflow/core/kernels/sql/driver_manager.h
rename to tensorflow/core/kernels/data/sql/driver_manager.h
index 53350268d30f4f7215eb543a28ae3fedf837ac0d..0d0c38eb58314962554b929d1a5c4a387ab68e55 100644
--- a/tensorflow/core/kernels/sql/driver_manager.h
+++ b/tensorflow/core/kernels/data/sql/driver_manager.h
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SQL_DRIVER_MANAGER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SQL_DRIVER_MANAGER_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_SQL_DRIVER_MANAGER_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_SQL_DRIVER_MANAGER_H_
 
-#include "tensorflow/core/kernels/sql/query_connection.h"
+#include "tensorflow/core/kernels/data/sql/query_connection.h"
 
 namespace tensorflow {
 
@@ -38,4 +38,4 @@ class DriverManager {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SQL_DRIVER_MANAGER_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_SQL_DRIVER_MANAGER_H_
diff --git a/tensorflow/core/kernels/sql/query_connection.h b/tensorflow/core/kernels/data/sql/query_connection.h
similarity index 92%
rename from tensorflow/core/kernels/sql/query_connection.h
rename to tensorflow/core/kernels/data/sql/query_connection.h
index f9945aee7dc6ac59df8cc9063ab5c4d9aedf4018..194714897221f73ffec51c50c5202860b1bd0b46 100644
--- a/tensorflow/core/kernels/sql/query_connection.h
+++ b/tensorflow/core/kernels/data/sql/query_connection.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SQL_QUERY_CONNECTION_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SQL_QUERY_CONNECTION_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_SQL_QUERY_CONNECTION_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_SQL_QUERY_CONNECTION_H_
 
 #include "tensorflow/core/framework/tensor.h"
 
@@ -64,4 +64,4 @@ class QueryConnection {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SQL_QUERY_CONNECTION_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_SQL_QUERY_CONNECTION_H_
diff --git a/tensorflow/core/kernels/sql/sqlite_query_connection.cc b/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc
similarity index 98%
rename from tensorflow/core/kernels/sql/sqlite_query_connection.cc
rename to tensorflow/core/kernels/data/sql/sqlite_query_connection.cc
index 1330506d28ca96b4a9e668219dc67cbb1c3b796d..abe31261a3ecfc3bd990448e6978ccf3aafdc323 100644
--- a/tensorflow/core/kernels/sql/sqlite_query_connection.cc
+++ b/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/sql/sqlite_query_connection.h"
+#include "tensorflow/core/kernels/data/sql/sqlite_query_connection.h"
 
 #include "tensorflow/core/lib/strings/stringprintf.h"
 
diff --git a/tensorflow/core/kernels/sql/sqlite_query_connection.h b/tensorflow/core/kernels/data/sql/sqlite_query_connection.h
similarity index 84%
rename from tensorflow/core/kernels/sql/sqlite_query_connection.h
rename to tensorflow/core/kernels/data/sql/sqlite_query_connection.h
index 435dd8e234ca7a8fb9a3ef6ffeef0ca4dda7a221..00b7cb3213ecf1b767938c8d7c3f5be1af1eeff1 100644
--- a/tensorflow/core/kernels/sql/sqlite_query_connection.h
+++ b/tensorflow/core/kernels/data/sql/sqlite_query_connection.h
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SQL_SQLITE_QUERY_CONNECTION_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SQL_SQLITE_QUERY_CONNECTION_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_SQL_SQLITE_QUERY_CONNECTION_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_SQL_SQLITE_QUERY_CONNECTION_H_
 
 #include <memory>
 
-#include "tensorflow/core/kernels/sql/query_connection.h"
+#include "tensorflow/core/kernels/data/sql/query_connection.h"
 #include "tensorflow/core/lib/db/sqlite.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -53,4 +53,4 @@ class SqliteQueryConnection : public QueryConnection {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SQL_SQLITE_QUERY_CONNECTION_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_SQL_SQLITE_QUERY_CONNECTION_H_
diff --git a/tensorflow/core/kernels/sql_dataset_ops.cc b/tensorflow/core/kernels/data/sql_dataset_ops.cc
similarity index 97%
rename from tensorflow/core/kernels/sql_dataset_ops.cc
rename to tensorflow/core/kernels/data/sql_dataset_ops.cc
index 23846d65bb8426ad8e5c3343047f72d24653c101..72302190802d17f2cb1ed5471017180238aedff3 100644
--- a/tensorflow/core/kernels/sql_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/sql_dataset_ops.cc
@@ -16,9 +16,9 @@ limitations under the License.
 
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/dataset.h"
-#include "tensorflow/core/kernels/sql/driver_manager.h"
-#include "tensorflow/core/kernels/sql/query_connection.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/sql/driver_manager.h"
+#include "tensorflow/core/kernels/data/sql/query_connection.h"
 #include "tensorflow/core/lib/io/inputbuffer.h"
 #include "tensorflow/core/lib/io/record_reader.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
diff --git a/tensorflow/core/kernels/data/stats_aggregator.h b/tensorflow/core/kernels/data/stats_aggregator.h
new file mode 100644
index 0000000000000000000000000000000000000000..4cb8dba5cbb4a3866b94101df0f1e9a8e52d9cf2
--- /dev/null
+++ b/tensorflow/core/kernels/data/stats_aggregator.h
@@ -0,0 +1,84 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_STATS_AGGREGATOR_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_STATS_AGGREGATOR_H_
+
+#include <memory>
+#include <string>
+
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+
+namespace tensorflow {
+
+class Summary;
+
+// A `StatsAggregator` accumulates statistics incrementally. A
+// `StatsAggregator` can accumulate multiple different statistics, distinguished
+// by a string name.
+//
+// The class currently supports accumulating `Histogram` objects, and we expect
+// to add other methods in future.
+//
+// NOTE(mrry): `StatsAggregator` is a virtual interface because we anticipate
+// that many different implementations will the same interface. For example, the
+// current implementation in "stats_aggregator_ops.cc" is a simple in-memory
+// implementation that integrates with the pull-based summary API, and we may
+// add implementations that work with the push-based `SummaryWriterInterface`,
+// as well as custom monitoring services.
+class StatsAggregator {
+ public:
+  virtual ~StatsAggregator() {}
+
+  // Add the given `values` to the histogram with the given `name`. Each
+  // element of `values` will be treated as a separate sample in the histogram.
+  virtual void AddToHistogram(const string& name,
+                              gtl::ArraySlice<double> values) = 0;
+
+  // Stores a protocol buffer representation of the aggregator state in the
+  // given `out_summary`.
+  // TODO(mrry): Consider separating this method from the `StatsAggregator`
+  // interface. It is possible that not all implementations will support
+  // encoding their state as a protocol buffer.
+  virtual void EncodeToProto(Summary* out_summary) = 0;
+};
+
+// A `StatsAggregatorResource` wraps a shareable `StatsAggregator` as a resource
+// in the TensorFlow resource manager.
+//
+// NOTE(mrry): This class is separate from `StatsAggregator` in order to
+// simplify the memory management of the shared object. Most users of
+// `StatsAggregator` interact with a `std::shared_ptr<StatsAggregator>` whereas
+// the `ResourceBase` API requires explicit reference counting.
+class StatsAggregatorResource : public ResourceBase {
+ public:
+  // Creates a new resource from the given `stats_aggregator`.
+  StatsAggregatorResource(std::unique_ptr<StatsAggregator> stats_aggregator)
+      : stats_aggregator_(stats_aggregator.release()) {}
+
+  // Returns the wrapped `StatsAggregator`.
+  std::shared_ptr<StatsAggregator> stats_aggregator() const {
+    return stats_aggregator_;
+  }
+
+  string DebugString() { return "StatsAggregatorResource"; }
+
+ private:
+  const std::shared_ptr<StatsAggregator> stats_aggregator_;
+};
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_STATS_AGGREGATOR_H_
diff --git a/tensorflow/core/kernels/data/stats_aggregator_ops.cc b/tensorflow/core/kernels/data/stats_aggregator_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5a2dd9c43dbcbf5250d4dcd4bd803ed4979999e0
--- /dev/null
+++ b/tensorflow/core/kernels/data/stats_aggregator_ops.cc
@@ -0,0 +1,108 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/data/stats_aggregator.h"
+
+#include <memory>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_op_kernel.h"
+#include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/lib/histogram/histogram.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+namespace {
+
+class StatsAggregatorImpl : public StatsAggregator {
+ public:
+  StatsAggregatorImpl() {}
+
+  void AddToHistogram(const string& name,
+                      gtl::ArraySlice<double> values) override {
+    mutex_lock l(mu_);
+    histogram::Histogram& histogram = histograms_[name];
+    for (double value : values) {
+      histogram.Add(value);
+    }
+  }
+
+  void EncodeToProto(Summary* out_summary) override {
+    mutex_lock l(mu_);
+    for (const auto& pair : histograms_) {
+      const string& name = pair.first;
+      const histogram::Histogram& histogram = pair.second;
+
+      Summary::Value* value = out_summary->add_value();
+      value->set_tag(name);
+      histogram.EncodeToProto(value->mutable_histo(),
+                              true /* preserve_zero_buckets */);
+    }
+  }
+
+ private:
+  mutex mu_;
+  std::unordered_map<string, histogram::Histogram> histograms_ GUARDED_BY(mu_);
+  TF_DISALLOW_COPY_AND_ASSIGN(StatsAggregatorImpl);
+};
+
+class StatsAggregatorHandleOp
+    : public ResourceOpKernel<StatsAggregatorResource> {
+ public:
+  explicit StatsAggregatorHandleOp(OpKernelConstruction* ctx)
+      : ResourceOpKernel<StatsAggregatorResource>(ctx) {}
+
+ private:
+  Status CreateResource(StatsAggregatorResource** ret) override
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    *ret = new StatsAggregatorResource(
+        std::unique_ptr<StatsAggregator>(new StatsAggregatorImpl));
+    return Status::OK();
+  }
+
+  Status VerifyResource(StatsAggregatorResource* resource) override {
+    return Status::OK();
+  }
+};
+
+class StatsAggregatorSummaryOp : public OpKernel {
+ public:
+  explicit StatsAggregatorSummaryOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& resource_handle_t = ctx->input(0);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(resource_handle_t.shape()),
+                errors::InvalidArgument("resource_handle must be a scalar"));
+
+    StatsAggregatorResource* resource;
+    OP_REQUIRES_OK(ctx,
+                   LookupResource(ctx, HandleFromInput(ctx, 0), &resource));
+    core::ScopedUnref unref_iterator(resource);
+
+    Tensor* summary_t;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &summary_t));
+    Summary summary;
+    resource->stats_aggregator()->EncodeToProto(&summary);
+    summary_t->scalar<string>()() = summary.SerializeAsString();
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("StatsAggregatorHandle").Device(DEVICE_CPU),
+                        StatsAggregatorHandleOp);
+REGISTER_KERNEL_BUILDER(Name("StatsAggregatorSummary").Device(DEVICE_CPU),
+                        StatsAggregatorSummaryOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/stats_dataset_ops.cc b/tensorflow/core/kernels/data/stats_dataset_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8742e6c55f059f303a47c3c7fae6576b2b897e37
--- /dev/null
+++ b/tensorflow/core/kernels/data/stats_dataset_ops.cc
@@ -0,0 +1,235 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/stats_aggregator.h"
+#include "tensorflow/core/lib/random/random.h"
+
+namespace tensorflow {
+namespace {
+
+// This op defines a `Dataset` that passes through its input elements and
+// records the latency of producing each element in the context's
+// `StatsAggregator`.
+//
+// TODO(mrry): It is likely that many *StatsDatasetOp kernels will have the
+// same or similar structure. We should abstract the common boilerplate into
+// a base case and/or investigate how to make general-purpose *StatsDatasetOp
+// kernels that use TensorFlow functions to represent their logic. For example,
+// if the performance were adequate, we might replace this kernel with an
+// implementation that executes functions before and after the `GetNext()` call
+// on the input, each executing an op that gets the current time and performing
+// the subtraction.
+class LatencyStatsDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit LatencyStatsDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    string tag;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "tag", &tag));
+    *output = new Dataset(ctx, input, std::move(tag));
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    explicit Dataset(OpKernelContext* ctx, const DatasetBase* input, string tag)
+        : GraphDatasetBase(ctx), input_(input), tag_(std::move(tag)) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::LatencyStats")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() override { return "LatencyStatsDatasetOp::Dataset"; }
+
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_node;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_node));
+      Node* tag_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(tag_, &tag_node));
+      TF_RETURN_IF_ERROR(b->AddDataset(this, {input_node, tag_node}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        tf_shared_lock l(mu_);
+        uint64 start = ctx->env()->NowMicros();
+        Status s = input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
+        uint64 end = ctx->env()->NowMicros();
+        auto stats_aggregator = ctx->stats_aggregator();
+        if (stats_aggregator && !*end_of_sequence) {
+          ctx->stats_aggregator()->AddToHistogram(
+              dataset()->tag_, {static_cast<double>(end - start)});
+        }
+        return s;
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+    };
+
+    const DatasetBase* const input_;
+    const string tag_;
+  };
+};
+
+class BytesProducedStatsDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit BytesProducedStatsDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    string tag;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "tag", &tag));
+    *output = new Dataset(ctx, input, std::move(tag));
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    explicit Dataset(OpKernelContext* ctx, const DatasetBase* input, string tag)
+        : GraphDatasetBase(ctx), input_(input), tag_(std::move(tag)) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(
+          {this, strings::StrCat(prefix, "::BytesProducedStats")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() override {
+      return "BytesProducedStatsDatasetOp::Dataset";
+    }
+
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_node;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_node));
+      Node* tag_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(tag_, &tag_node));
+      TF_RETURN_IF_ERROR(b->AddDataset(this, {input_node, tag_node}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        tf_shared_lock l(mu_);
+        Status s = input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
+        auto stats_aggregator = ctx->stats_aggregator();
+        if (stats_aggregator && s.ok() && !*end_of_sequence) {
+          size_t total_bytes = 0;
+          for (const Tensor& t : *out_tensors) {
+            total_bytes += t.TotalBytes();
+          }
+          ctx->stats_aggregator()->AddToHistogram(
+              dataset()->tag_, {static_cast<double>(total_bytes)});
+        }
+        return s;
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+    };
+
+    const DatasetBase* const input_;
+    const string tag_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("LatencyStatsDataset").Device(DEVICE_CPU),
+                        LatencyStatsDatasetOp);
+REGISTER_KERNEL_BUILDER(Name("BytesProducedStatsDataset").Device(DEVICE_CPU),
+                        BytesProducedStatsDatasetOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/take_dataset_op.cc b/tensorflow/core/kernels/data/take_dataset_op.cc
similarity index 99%
rename from tensorflow/core/kernels/take_dataset_op.cc
rename to tensorflow/core/kernels/data/take_dataset_op.cc
index 7a6d20d6c7cb5a9bc5142e877c5c0c5285c1fd90..22824a957e28d89ec33a55bcd96b5c51161dac30 100644
--- a/tensorflow/core/kernels/take_dataset_op.cc
+++ b/tensorflow/core/kernels/data/take_dataset_op.cc
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/tensor_dataset_op.cc b/tensorflow/core/kernels/data/tensor_dataset_op.cc
similarity index 97%
rename from tensorflow/core/kernels/tensor_dataset_op.cc
rename to tensorflow/core/kernels/data/tensor_dataset_op.cc
index fe53434d176d77c0064574a044a18db05146e62d..5f53fe026e54471e014b4fb7ab3cfad6a0f79c02 100644
--- a/tensorflow/core/kernels/tensor_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_dataset_op.cc
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
@@ -70,7 +69,7 @@ class TensorDatasetOp : public DatasetOpKernel {
    protected:
     Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      std::vector<NodeBuilder::NodeOut> components;
+      std::vector<Node*> components;
       components.reserve(tensors_.size());
       for (const Tensor& t : tensors_) {
         Node* node;
diff --git a/tensorflow/core/kernels/tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
similarity index 78%
rename from tensorflow/core/kernels/tensor_slice_dataset_op.cc
rename to tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
index e85f59b584720cae0f00cf45a265862e688b157c..c7f9efeea148fc6671a183bdda5055d4de6ab9df 100644
--- a/tensorflow/core/kernels/tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/batch_util.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
@@ -86,7 +86,7 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
    protected:
     Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      std::vector<NodeBuilder::NodeOut> components;
+      std::vector<Node*> components;
       components.reserve(tensors_.size());
       for (const Tensor& t : tensors_) {
         Node* node;
@@ -101,41 +101,6 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
     }
 
    private:
-    template <typename T>
-    static Status HandleSliceToElement(const Tensor& parent, Tensor* element,
-                                       int64 index) {
-      DCHECK_NE(parent.dim_size(0), 0);
-      DCHECK_GE(index, 0);
-      if (element->NumElements() !=
-          (parent.NumElements() / parent.dim_size(0))) {
-        TensorShape chip_shape = parent.shape();
-        chip_shape.RemoveDim(0);
-        return errors::Internal(
-            "HandleSliceToElement Cannot copy slice: number of elements does "
-            "not match.  Shapes are: [element]: ",
-            element->shape().DebugString(), ", [parent slice]: ",
-            chip_shape.DebugString());
-      }
-      auto parent_as_matrix = parent.flat_outer_dims<T>();
-      element->flat<T>() = parent_as_matrix.chip(index, 0);
-      return Status::OK();
-    }
-
-    static Status CopySliceToElement(const Tensor& parent, Tensor* element,
-                                     int64 index) {
-#define HANDLE_TYPE(T)                                      \
-  case DataTypeToEnum<T>::value: {                          \
-    return HandleSliceToElement<T>(parent, element, index); \
-  }
-
-      switch (parent.dtype()) {
-        TF_CALL_DATASET_TYPES(HANDLE_TYPE);
-        default:
-          return errors::Unimplemented(
-              "CopySliceToElement Unhandled data type: ", element->dtype());
-      }
-    }
-
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
@@ -154,7 +119,7 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
             const Tensor& t = dataset()->tensors_[i];
             Tensor t_slice(cpu_allocator(), t.dtype(),
                            TensorShape(dataset()->shapes_[i].dim_sizes()));
-            TF_RETURN_IF_ERROR(CopySliceToElement(t, &t_slice, i_));
+            TF_RETURN_IF_ERROR(batch_util::CopySliceToElement(t, &t_slice, i_));
             out_tensors->emplace_back(std::move(t_slice));
           }
           ++i_;
diff --git a/tensorflow/core/kernels/window_dataset.cc b/tensorflow/core/kernels/data/window_dataset.cc
similarity index 98%
rename from tensorflow/core/kernels/window_dataset.cc
rename to tensorflow/core/kernels/data/window_dataset.cc
index 77345fd3dfb7e39184605ed1bb4cab3251a62ea1..815d420c68ab62f3b3fd4d7f0d60dcd9da949188 100644
--- a/tensorflow/core/kernels/window_dataset.cc
+++ b/tensorflow/core/kernels/data/window_dataset.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/window_dataset.h"
+#include "tensorflow/core/kernels/data/window_dataset.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/core/kernels/window_dataset.h b/tensorflow/core/kernels/data/window_dataset.h
similarity index 87%
rename from tensorflow/core/kernels/window_dataset.h
rename to tensorflow/core/kernels/data/window_dataset.h
index a4fccf17b4c7cc064c1aec57554bb88bb7b59578..25396bd3e72f01eb40922a83e6dd18d1fc81e077 100644
--- a/tensorflow/core/kernels/window_dataset.h
+++ b/tensorflow/core/kernels/data/window_dataset.h
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_WINDOW_DATASET_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_WINDOW_DATASET_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_WINDOW_DATASET_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_WINDOW_DATASET_H_
 
 #include <vector>
 
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/dataset.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
@@ -45,4 +45,4 @@ Status NewWindowDataset(std::vector<std::vector<Tensor>> elements,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_WINDOW_DATASET_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_WINDOW_DATASET_H_
diff --git a/tensorflow/core/kernels/zip_dataset_op.cc b/tensorflow/core/kernels/data/zip_dataset_op.cc
similarity index 97%
rename from tensorflow/core/kernels/zip_dataset_op.cc
rename to tensorflow/core/kernels/data/zip_dataset_op.cc
index 96080863ea14eaffab703112a90ee69f54554211..dbc4331c9e82f3bd496d0b06aab7264fe581f647 100644
--- a/tensorflow/core/kernels/zip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/zip_dataset_op.cc
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
@@ -80,7 +79,7 @@ class ZipDatasetOp : public DatasetOpKernel {
    protected:
     Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      std::vector<NodeBuilder::NodeOut> input_graph_nodes;
+      std::vector<Node*> input_graph_nodes;
       input_graph_nodes.reserve(inputs_.size());
       for (const auto& input : inputs_) {
         Node* input_node;
@@ -128,8 +127,6 @@ class ZipDatasetOp : public DatasetOpKernel {
         if (*end_of_sequence) {
           out_tensors->clear();
           input_impls_.clear();
-        } else {
-          *end_of_sequence = false;
         }
         return Status::OK();
       }
diff --git a/tensorflow/core/kernels/data_format_ops.cc b/tensorflow/core/kernels/data_format_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e32d6545b87f900e83459826a565758b52e9f103
--- /dev/null
+++ b/tensorflow/core/kernels/data_format_ops.cc
@@ -0,0 +1,181 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/data_format_ops.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class DataFormatDimMapOp : public OpKernel {
+ public:
+  explicit DataFormatDimMapOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    string src_format;
+    OP_REQUIRES_OK(context, context->GetAttr("src_format", &src_format));
+    string dst_format;
+    OP_REQUIRES_OK(context, context->GetAttr("dst_format", &dst_format));
+    OP_REQUIRES(
+        context, src_format == "NHWC",
+        errors::InvalidArgument(strings::StrCat(
+            "Current implementation doesn't support source data format ",
+            src_format)));
+    OP_REQUIRES(context, dst_format == "NCHW",
+                errors::InvalidArgument(strings::StrCat(
+                    "Current implementation doesn't support dst data format ",
+                    dst_format)));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = context->input(0);
+    OP_REQUIRES(
+        context, input.dims() == 0,
+        errors::InvalidArgument("input must be a scalar, but got shape ",
+                                input.shape().DebugString()));
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, input.shape(), &output));
+    functor::DataFormatDimMap<Device, T>()(context->eigen_device<Device>(),
+                                           input.scalar<T>(),
+                                           output->scalar<T>());
+  }
+};
+
+template <typename Device, typename T>
+class DataFormatVecPermuteOp : public OpKernel {
+ public:
+  explicit DataFormatVecPermuteOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    string src_format;
+    OP_REQUIRES_OK(context, context->GetAttr("src_format", &src_format));
+    string dst_format;
+    OP_REQUIRES_OK(context, context->GetAttr("dst_format", &dst_format));
+    OP_REQUIRES(context,
+                (src_format == "NHWC" && dst_format == "NCHW") ||
+                    (src_format == "NCHW" && dst_format == "NHWC"),
+                errors::InvalidArgument(strings::StrCat(
+                    "Current implementation only supports NCHW-to-NHWC and "
+                    "NHWC-to-NCHW format conversion; got source format ",
+                    src_format, " and destination format ", dst_format)));
+    nhwc_to_nchw_ = (src_format == "NHWC") ? true : false;
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = context->input(0);
+    OP_REQUIRES(context, input.dims() == 1 || input.dims() == 2,
+                errors::InvalidArgument(
+                    "input must be a vector or 2D tensor, but got shape ",
+                    input.shape().DebugString()));
+    if (input.dims() == 1) {
+      OP_REQUIRES(
+          context, input.NumElements() == 4,
+          errors::InvalidArgument("1D input must be of size 4, but got shape ",
+                                  input.shape().DebugString()));
+    } else if (input.dims() == 2) {
+      OP_REQUIRES(
+          context, input.dim_size(0) == 4,
+          errors::InvalidArgument(
+              "First dimension of 2D input must be of size 4, but got shape ",
+              input.shape().DebugString()));
+      OP_REQUIRES(
+          context, input.dim_size(1) == 2,
+          errors::InvalidArgument(
+              "Second dimension of 2D input must be of size 2, but got shape ",
+              input.shape().DebugString()));
+    }
+
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, input.shape(), &output));
+    functor::DataFormatVecPermute<Device, T>()(
+        context->eigen_device<Device>(), input.flat<T>(), output->flat<T>(),
+        nhwc_to_nchw_);
+  }
+
+ private:
+  bool nhwc_to_nchw_;
+};
+
+#define REGISTER_KERNEL(T)                                                \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("DataFormatDimMap").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      DataFormatDimMapOp<CPUDevice, T>);
+TF_CALL_int32(REGISTER_KERNEL);
+TF_CALL_int64(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+
+#define REGISTER_KERNEL(T)                                                    \
+  REGISTER_KERNEL_BUILDER(                                                    \
+      Name("DataFormatVecPermute").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      DataFormatVecPermuteOp<CPUDevice, T>);
+TF_CALL_int32(REGISTER_KERNEL);
+TF_CALL_int64(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                                  \
+  template <>                                                \
+  void DataFormatDimMap<GPUDevice, T>::operator()(           \
+      const GPUDevice& d, typename TTypes<T>::ConstScalar x, \
+      typename TTypes<T>::Scalar y);                         \
+  extern template struct DataFormatDimMap<GPUDevice, T>;
+#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T);
+TF_CALL_int32(DECLARE_GPU_SPECS);
+TF_CALL_int64(DECLARE_GPU_SPECS);
+#undef DECLARE_GPU_SPEC
+
+#define DECLARE_GPU_SPEC(T)                                \
+  template <>                                              \
+  void DataFormatVecPermute<GPUDevice, T>::operator()(     \
+      const GPUDevice& d, typename TTypes<T>::ConstFlat x, \
+      typename TTypes<T>::Vec y, bool nhwc_to_nchw);       \
+  extern template struct DataFormatVecPermute<GPUDevice, T>;
+#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T);
+TF_CALL_int32(DECLARE_GPU_SPECS);
+TF_CALL_int64(DECLARE_GPU_SPECS);
+#undef DECLARE_GPU_SPEC
+}  // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_GPU_KERNEL(T)                                            \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("DataFormatDimMap").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      DataFormatDimMapOp<GPUDevice, T>);
+TF_CALL_int32(REGISTER_GPU_KERNEL);
+TF_CALL_int64(REGISTER_GPU_KERNEL);
+#undef REGISTER_GPU_KERNEL
+
+#define REGISTER_GPU_KERNEL(T)                                                \
+  REGISTER_KERNEL_BUILDER(                                                    \
+      Name("DataFormatVecPermute").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      DataFormatVecPermuteOp<GPUDevice, T>);
+TF_CALL_int32(REGISTER_GPU_KERNEL);
+TF_CALL_int64(REGISTER_GPU_KERNEL);
+#undef REGISTER_GPU_KERNEL
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data_format_ops.h b/tensorflow/core/kernels/data_format_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..01b7bff1eb9d439b53da69a8998535210a2c2a19
--- /dev/null
+++ b/tensorflow/core/kernels/data_format_ops.h
@@ -0,0 +1,116 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_DATA_FORMAT_OPS_H_
+#define TENSORFLOW_KERNELS_DATA_FORMAT_OPS_H_
+// Functor definition for data format dim mapping ops, must be compilable
+// by nvcc.
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+namespace functor {
+
+// Functor used by DataFormatDimMapOP to do the computations.
+template <typename Device, typename T>
+struct DataFormatDimMap {
+  void operator()(const Device& d, typename TTypes<T>::ConstScalar x,
+                  typename TTypes<T>::Scalar y) {
+    auto zero = x.constant(0);
+    auto one = x.constant(1);
+    auto three = x.constant(3);
+    auto four = x.constant(4);
+    auto x_mod = (x + four) % 4;
+    auto is_zero = (x_mod == zero);
+    auto is_three = (x_mod == three);
+    y.device(d) = is_zero.select(zero, is_three.select(one, x_mod + one));
+  }
+};
+
+template <typename T>
+struct VecPermuteNHWCToNCHW {
+  Eigen::DSizes<Eigen::DenseIndex, 1> dimensions(
+      typename TTypes<T>::ConstFlat input) const {
+    Eigen::DSizes<Eigen::DenseIndex, 1> result;
+    result[0] = input.dimension(0);
+    return result;
+  }
+  template <typename Output, typename Device>
+  void eval(typename TTypes<T>::ConstFlat input, Output& output,
+            const Device& d) const {
+    if (input.size() == 8) {
+      output.template chip<0>(0).device(d) = input.template chip<0>(0);
+      output.template chip<0>(1).device(d) = input.template chip<0>(1);
+      output.template chip<0>(2).device(d) = input.template chip<0>(6);
+      output.template chip<0>(3).device(d) = input.template chip<0>(7);
+      output.template chip<0>(4).device(d) = input.template chip<0>(2);
+      output.template chip<0>(5).device(d) = input.template chip<0>(3);
+      output.template chip<0>(6).device(d) = input.template chip<0>(4);
+      output.template chip<0>(7).device(d) = input.template chip<0>(5);
+    } else {
+      output.template chip<0>(0).device(d) = input.template chip<0>(0);
+      output.template chip<0>(1).device(d) = input.template chip<0>(3);
+      output.template chip<0>(2).device(d) = input.template chip<0>(1);
+      output.template chip<0>(3).device(d) = input.template chip<0>(2);
+    }
+  }
+};
+
+template <typename T>
+struct VecPermuteNCHWToNHWC {
+  Eigen::DSizes<Eigen::DenseIndex, 1> dimensions(
+      typename TTypes<T>::ConstFlat input) const {
+    Eigen::DSizes<Eigen::DenseIndex, 1> result;
+    result[0] = input.dimension(0);
+    return result;
+  }
+  template <typename Output, typename Device>
+  void eval(typename TTypes<T>::ConstFlat input, Output& output,
+            const Device& d) const {
+    if (input.size() == 8) {
+      output.template chip<0>(0).device(d) = input.template chip<0>(0);
+      output.template chip<0>(1).device(d) = input.template chip<0>(1);
+      output.template chip<0>(2).device(d) = input.template chip<0>(4);
+      output.template chip<0>(3).device(d) = input.template chip<0>(5);
+      output.template chip<0>(4).device(d) = input.template chip<0>(6);
+      output.template chip<0>(5).device(d) = input.template chip<0>(7);
+      output.template chip<0>(6).device(d) = input.template chip<0>(2);
+      output.template chip<0>(7).device(d) = input.template chip<0>(3);
+    } else {
+      output.template chip<0>(0).device(d) = input.template chip<0>(0);
+      output.template chip<0>(1).device(d) = input.template chip<0>(2);
+      output.template chip<0>(2).device(d) = input.template chip<0>(3);
+      output.template chip<0>(3).device(d) = input.template chip<0>(1);
+    }
+  }
+};
+
+// Functor used by DataFormatVecPermuteOp to do the computations.
+template <typename Device, typename T>
+struct DataFormatVecPermute {
+  void operator()(const Device& d, typename TTypes<T>::ConstFlat x,
+                  typename TTypes<T>::Flat y, bool nhwc_to_nchw) {
+    if (nhwc_to_nchw) {
+      y.device(d) = x.customOp(VecPermuteNHWCToNCHW<T>());
+    } else {
+      y.device(d) = x.customOp(VecPermuteNCHWToNHWC<T>());
+    }
+  }
+};
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_DATA_FORMAT_OPS_H_
diff --git a/tensorflow/core/kernels/data_format_ops_gpu.cu.cc b/tensorflow/core/kernels/data_format_ops_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..38ce7c28fea662cea7004c47a46c0031875e3c36
--- /dev/null
+++ b/tensorflow/core/kernels/data_format_ops_gpu.cu.cc
@@ -0,0 +1,33 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/data_format_ops.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+template struct functor::DataFormatDimMap<GPUDevice, int32>;
+template struct functor::DataFormatDimMap<GPUDevice, int64>;
+template struct functor::DataFormatVecPermute<GPUDevice, int32>;
+template struct functor::DataFormatVecPermute<GPUDevice, int64>;
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/dataset.cc b/tensorflow/core/kernels/dataset.cc
deleted file mode 100644
index fcfa2956f782fc9617448ad75e53b7c36963d222..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/dataset.cc
+++ /dev/null
@@ -1,133 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/kernels/dataset.h"
-
-namespace tensorflow {
-
-namespace {
-
-// A wrapper class for storing a `DatasetBase` instance in a DT_VARIANT tensor.
-// Objects of the wrapper class own a reference on an instance of `DatasetBase`,
-// and the wrapper's copy constructor and destructor take care of managing the
-// reference count.
-//
-// NOTE(mrry): This is not a feature-complete implementation of the DT_VARIANT
-// specification. In particular, we cannot currently serialize an arbitrary
-// `DatasetBase` object, so the `Encode()` and `Decode()` methods are not
-// implemented.
-class DatasetVariantWrapper {
- public:
-  DatasetVariantWrapper() : dataset_(nullptr) {}
-
-  // Transfers ownership of `dataset` to `*this`.
-  explicit DatasetVariantWrapper(DatasetBase* dataset) : dataset_(dataset) {}
-
-  DatasetVariantWrapper(const DatasetVariantWrapper& other)
-      : dataset_(other.dataset_) {
-    if (dataset_) dataset_->Ref();
-  }
-
-  ~DatasetVariantWrapper() {
-    if (dataset_) dataset_->Unref();
-  }
-
-  DatasetBase* get() const { return dataset_; }
-
-  string TypeName() const { return "tensorflow::DatasetVariantWrapper"; }
-  string DebugString() const {
-    if (dataset_) {
-      return dataset_->DebugString();
-    } else {
-      return "<Uninitialized DatasetVariantWrapper>";
-    }
-  }
-  void Encode(VariantTensorData* data) const {
-    LOG(ERROR) << "The Encode() method is not implemented for "
-                  "DatasetVariantWrapper objects.";
-  }
-  bool Decode(const VariantTensorData& data) {
-    LOG(ERROR) << "The Decode() method is not implemented for "
-                  "DatasetVariantWrapper objects.";
-    return false;
-  }
-
- private:
-  DatasetBase* const dataset_;  // Owns one reference.
-};
-
-}  // namespace
-
-Status GetDatasetFromVariantTensor(const Tensor& tensor,
-                                   DatasetBase** out_dataset) {
-  if (!(tensor.dtype() == DT_VARIANT ||
-        TensorShapeUtils::IsScalar(tensor.shape()))) {
-    return errors::InvalidArgument(
-        "Dataset tensor must be a scalar of dtype DT_VARIANT.");
-  }
-  const Variant& variant = tensor.scalar<Variant>()();
-  const DatasetVariantWrapper* wrapper = variant.get<DatasetVariantWrapper>();
-  if (wrapper == nullptr) {
-    return errors::InvalidArgument("Tensor must be a Dataset object.");
-  }
-  *out_dataset = wrapper->get();
-  if (*out_dataset == nullptr) {
-    return errors::Internal("Read uninitialized Dataset variant.");
-  }
-  return Status::OK();
-}
-
-Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor) {
-  if (!(tensor->dtype() == DT_VARIANT ||
-        TensorShapeUtils::IsScalar(tensor->shape()))) {
-    return errors::InvalidArgument(
-        "Dataset tensor must be a scalar of dtype DT_VARIANT.");
-  }
-  tensor->scalar<Variant>()() = DatasetVariantWrapper(dataset);
-  return Status::OK();
-}
-
-void DatasetOpKernel::Compute(OpKernelContext* ctx) {
-  DatasetBase* dataset = nullptr;
-  MakeDataset(ctx, &dataset);
-  if (ctx->status().ok()) {
-    Tensor* output = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
-    OP_REQUIRES_OK(ctx, StoreDatasetInVariantTensor(dataset, output));
-  }
-}
-
-void UnaryDatasetOpKernel::MakeDataset(OpKernelContext* ctx,
-                                       DatasetBase** output) {
-  DatasetBase* input;
-  OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(0), &input));
-  MakeDataset(ctx, input, output);
-}
-
-void BinaryDatasetOpKernel::MakeDataset(OpKernelContext* ctx,
-                                        DatasetBase** output) {
-  DatasetBase* input;
-  OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(0), &input));
-  DatasetBase* another_input;
-  OP_REQUIRES_OK(ctx,
-                 GetDatasetFromVariantTensor(ctx->input(1), &another_input));
-  MakeDataset(ctx, input, another_input, output);
-}
-
-const char GraphDatasetBase::kDatasetGraphKey[] = "_DATASET_GRAPH";
-const char GraphDatasetBase::kDatasetGraphOutputNodeKey[] =
-    "_DATASET_GRAPH_OUTPUT_NODE";
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/dataset.h b/tensorflow/core/kernels/dataset.h
index df75deacbe3cfec3ee9221d233e07cc61758dcf3..2aa6dbe6f3e1602e0fb94b8b196d41e29d644fd8 100644
--- a/tensorflow/core/kernels/dataset.h
+++ b/tensorflow/core/kernels/dataset.h
@@ -15,639 +15,6 @@ limitations under the License.
 #ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATASET_H_
 #define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATASET_H_
 
-#include <memory>
-
-#include "tensorflow/core/common_runtime/graph_runner.h"
-#include "tensorflow/core/framework/attr_value.pb.h"
-#include "tensorflow/core/framework/attr_value_util.h"
-#include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/resource_mgr.h"
-#include "tensorflow/core/framework/variant_encode_decode.h"
-#include "tensorflow/core/framework/variant_tensor_data.h"
-#include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/graph/graph_constructor.h"
-#include "tensorflow/core/graph/graph_def_builder.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/tracing.h"
-#include "tensorflow/core/util/tensor_bundle/naming.h"
-#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h"
-
-// Polymorphic datasets should support all primitive TensorFlow
-// types. Use this macro to expand `m(T)` once for each primitive type
-// `T`, e.g. to build a `switch` statement.
-#define TF_CALL_DATASET_TYPES(m) TF_CALL_ALL_TYPES(m) TF_CALL_QUANTIZED_TYPES(m)
-
-namespace tensorflow {
-
-class ResourceMgr;
-
-// Interface for reading values from a key-value store.
-// Used for restoring iterator state.
-class IteratorStateReader {
- public:
-  virtual Status ReadScalar(StringPiece key, int64* val) = 0;
-  virtual Status ReadScalar(StringPiece key, string* val) = 0;
-  virtual Status ReadTensor(StringPiece key, Tensor* val) = 0;
-  virtual bool Contains(StringPiece key) = 0;
-
-  virtual ~IteratorStateReader() {}
-};
-
-// Interface for writing values to a key-value store.
-// Used for saving iterator state.
-class IteratorStateWriter {
- public:
-  virtual Status WriteScalar(StringPiece key, const int64 val) = 0;
-  virtual Status WriteScalar(StringPiece key, const string& val) = 0;
-  virtual Status WriteTensor(StringPiece key, const Tensor& val) = 0;
-
-  virtual ~IteratorStateWriter() {}
-};
-
-// Wrapper around GraphDefBuilder. Used to serialize Dataset graph.
-class GraphDefBuilderWrapper {
- public:
-  explicit GraphDefBuilderWrapper(GraphDefBuilder* b) : b_(b) {}
-
-  // Adds a Const node with scalar value to the Graph.
-  // `*output` contains a pointer to the output `Node`. It is guaranteed to be
-  // non-null if the method returns with an OK status.
-  // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
-  template <typename T>
-  Status AddScalar(const T& val, Node** output) {
-    Tensor val_t = Tensor(DataTypeToEnum<T>::v(), TensorShape({}));
-    val_t.scalar<T>()() = val;
-    AddTensorInternal(val_t, output);
-    if (*output == nullptr) {
-      return errors::Internal("AddScalar: Failed to build Const op.");
-    }
-    return Status::OK();
-  }
-
-  // Adds a Const node with vector value to the Graph.
-  // `*output` contains a pointer to the output `Node`. It is guaranteed to be
-  // non-null if the method returns with an OK status.
-  // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
-  // TODO(shivaniagrawal): Consider changing to gtl::ArraySlice?
-  template <typename T>
-  Status AddVector(const std::vector<T>& val, Node** output) {
-    Tensor val_t = Tensor(DataTypeToEnum<T>::v(),
-                          TensorShape({static_cast<int64>(val.size())}));
-    for (int i = 0; i < val.size(); i++) {
-      val_t.flat<T>()(i) = val[i];
-    }
-    AddTensorInternal(val_t, output);
-    if (*output == nullptr) {
-      return errors::Internal("AddVector: Failed to build Const op.");
-    }
-    return Status::OK();
-  }
-
-  // Adds a Const node with Tensor value to the Graph.
-  // `*output` contains a pointer to the output `Node`. It is guaranteed to be
-  // non-null if the method returns with an OK status.
-  // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
-  Status AddTensor(const Tensor& val, Node** output) {
-    AddTensorInternal(val, output);
-    if (*output == nullptr) {
-      return errors::Internal("AddTesor: Failed to build Const op.");
-    }
-    return Status::OK();
-  }
-
-  template <class DatasetType>
-  Status AddDataset(const DatasetType* dataset,
-                    const std::vector<NodeBuilder::NodeOut>& inputs,
-                    Node** output) {
-    return AddDataset(dataset, inputs, {}, output);
-  }
-
-  // Adds a node corresponding to the `DatasetType` to the Graph.
-  // Return value of `DatasetType::op_name()` is used as the op type for the
-  // node.
-  // Values for the output_types and output_shapes node attributes are also
-  // written if those attributes are defined in the OpDef.
-  // `*output` contains a pointer to the output `Node`. It is guaranteed to be
-  // non-null if the method returns with an OK status.
-  // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
-  template <class DatasetType>
-  Status AddDataset(const DatasetType* dataset,
-                    const std::vector<NodeBuilder::NodeOut>& inputs,
-                    const std::vector<std::pair<StringPiece, AttrValue>>& attrs,
-                    Node** output) {
-    std::vector<std::pair<size_t, NodeBuilder::NodeOut>> enumerated_inputs(
-        inputs.size());
-    for (int i = 0; i < inputs.size(); i++) {
-      enumerated_inputs[i] = std::make_pair(i, inputs[i]);
-    }
-    return AddDataset(dataset, enumerated_inputs, {}, attrs, output);
-  }
-
-  template <class DatasetType>
-  Status AddDataset(
-      const DatasetType* dataset,
-      const std::vector<std::pair<size_t, NodeBuilder::NodeOut>>& inputs,
-      const std::vector<
-          std::pair<size_t, gtl::ArraySlice<NodeBuilder::NodeOut>>>&
-          list_inputs,
-      const std::vector<std::pair<StringPiece, AttrValue>>& attrs,
-      Node** output) {
-    const string& op_type_name = dataset->op_name();
-    std::unique_ptr<const GraphDefBuilder::Options> opts(
-        new GraphDefBuilder::Options(b_->opts()));
-    // TODO(srbs|mrry): Not all datasets have output_types and output_shapes
-    // attributes defined. It will be nice to have a consistent pattern.
-    bool has_output_types_attr = HasAttr(op_type_name, "output_types");
-    bool has_output_shapes_attr = HasAttr(op_type_name, "output_shapes");
-    if (has_output_shapes_attr) {
-      opts.reset(new GraphDefBuilder::Options(
-          opts->WithAttr("output_shapes", dataset->output_shapes())));
-    }
-    if (has_output_types_attr) {
-      opts.reset(new GraphDefBuilder::Options(
-          opts->WithAttr("output_types", dataset->output_dtypes())));
-    }
-    for (auto attr : attrs) {
-      opts.reset(new GraphDefBuilder::Options(
-          opts->WithAttr(attr.first, attr.second)));
-    }
-    if (opts->HaveError()) {
-      return errors::Internal("AddDataset: Error building Options.");
-    }
-    NodeBuilder node_builder(opts->GetNameForOp(op_type_name), op_type_name,
-                             opts->op_registry());
-    {
-      size_t total_size = inputs.size() + list_inputs.size();
-      auto inputs_iter = inputs.begin();
-      auto list_inputs_iter = list_inputs.begin();
-      for (int i = 0; i < total_size; i++) {
-        if (inputs_iter != inputs.end() && inputs_iter->first == i) {
-          node_builder.Input(inputs_iter->second);
-          inputs_iter++;
-        } else if (list_inputs_iter != list_inputs.end() &&
-                   list_inputs_iter->first == i) {
-          node_builder.Input(list_inputs_iter->second);
-          list_inputs_iter++;
-        } else {
-          return errors::InvalidArgument("No input found for index ", i);
-        }
-      }
-    }
-    *output = opts->FinalizeBuilder(&node_builder);
-    if (*output == nullptr) {
-      return errors::Internal("AddDataset: Failed to build ", op_type_name,
-                              " op.");
-    }
-    return Status::OK();
-  }
-
-  // Adds a user-defined function with name `function_name` to the graph and
-  // recursively adds all functions it references. If a function with a matching
-  // name has already been added, returns with OK status. If a user-defined with
-  // name `function_name` is not found in the FunctionLibraryDefinition, returns
-  // an InvalidArgumentError. If the function with name `function_name` or any
-  // of its dependent functions are stateful, returns an InvalidArgument error.
-  Status AddFunction(OpKernelContext* ctx, const string& function_name) {
-    if (b_->HasFunction(function_name)) {
-      LOG(INFO) << "Function with name " << function_name << "already exists in"
-                << " the graph. It will not be added again.";
-      return Status::OK();
-    }
-    TF_RETURN_IF_ERROR(EnsureFunctionIsStateless(ctx, function_name));
-    const FunctionLibraryDefinition* flib_def =
-        ctx->function_library()->GetFunctionLibraryDefinition();
-    const FunctionDef* f_def = flib_def->Find(function_name);
-    if (f_def == nullptr) {
-      return errors::InvalidArgument("Unable to find FunctionDef for ",
-                                     function_name, " in the registry.");
-    }
-    FunctionDefLibrary def;
-    *def.add_function() = *f_def;
-    const string gradient_func = flib_def->FindGradient(function_name);
-    if (!gradient_func.empty()) {
-      GradientDef* g_def = def.add_gradient();
-      g_def->set_function_name(function_name);
-      g_def->set_gradient_func(gradient_func);
-    }
-    TF_RETURN_IF_ERROR(b_->AddFunctionLibrary(def));
-
-    // Recursively add functions in inputs of function_name.
-    for (const NodeDef& node_def : f_def->node_def()) {
-      const OpRegistrationData* op_reg_data = nullptr;
-      TF_RETURN_IF_ERROR(flib_def->LookUp(node_def.op(), &op_reg_data));
-      if (op_reg_data->is_function_op) {
-        TF_RETURN_IF_ERROR(AddFunction(ctx, op_reg_data->op_def.name()));
-      }
-    }
-
-    // Recursively add functions in attrs of function_name.
-    for (auto iter = f_def->attr().begin(); iter != f_def->attr().end();
-         iter++) {
-      const AttrValue& attr_value = iter->second;
-      if (attr_value.has_func()) {
-        TF_RETURN_IF_ERROR(AddFunction(ctx, attr_value.func().name()));
-      } else if (attr_value.has_list()) {
-        for (const NameAttrList& name_attr_list : attr_value.list().func()) {
-          TF_RETURN_IF_ERROR(AddFunction(ctx, name_attr_list.name()));
-        }
-      }
-    }
-    return Status::OK();
-  }
-
-  template <typename T>
-  void BuildAttrValue(const T& value, AttrValue* attr) {
-    SetAttrValue(value, attr);
-  }
-
- private:
-  void AddTensorInternal(const Tensor& val, Node** output) {
-    *output = ops::SourceOp(
-        "Const",
-        b_->opts().WithAttr("dtype", val.dtype()).WithAttr("value", val));
-  }
-
-  Status EnsureFunctionIsStateless(OpKernelContext* ctx,
-                                   const string& function_name) const {
-    const FunctionLibraryDefinition* lib_def =
-        ctx->function_library()->GetFunctionLibraryDefinition();
-    const FunctionDef* function_def = lib_def->Find(function_name);
-    if (!function_def) {
-      return errors::InvalidArgument("Unable to find FunctionDef for ",
-                                     function_name, " in registry.");
-    }
-    for (const NodeDef& node_def : function_def->node_def()) {
-      const OpDef* op_def;
-      TF_RETURN_IF_ERROR(lib_def->LookUpOpDef(node_def.op(), &op_def));
-      if (op_def->is_stateful()) {
-        return errors::InvalidArgument(
-            "Op[name: ", node_def.name(), ", type: ", node_def.op(), "] ",
-            "in function ", function_name, " is stateful. ",
-            "Saving stateful functions is not supported yet.");
-      }
-    }
-    return Status::OK();
-  }
-
-  bool HasAttr(const string& op_type_name, const string& attr_name) {
-    const OpDef* op_def = nullptr;
-    Status s = b_->opts().op_registry()->LookUpOpDef(op_type_name, &op_def);
-    if (!s.ok() || op_def == nullptr) {
-      return false;
-    }
-    for (auto attr : op_def->attr()) {
-      if (attr.name() == attr_name) {
-        return true;
-      }
-    }
-    return false;
-  }
-
-  GraphDefBuilder* b_;
-};
-
-// A cut-down version of OpKernelContext for running computations in
-// iterators. Note that we cannot simply use OpKernelContext here
-// because we might run computation in an iterator whose lifetime is
-// not nested within the lifetime of a single OpKernelContext
-// (e.g. asynchronous prefetching).
-//
-// TODO(mrry): We will probably need to support more of
-// OpKernelContext here. For example, should allocation be handled by
-// the IteratorContext?
-// TODO(mrry): We're making some daring assumptions about the lifetime
-// of the runner passed in here. A runner will be deleted when the original
-// step ends, but all existing runners only close over session-lifetime (or
-// longer-lived) state, so we can make a copy of the function. There's nothing
-// in the definition of the API from which we took the runner to guarantee that
-// what we are doing is safe. We should formalize the properties here.
-class IteratorContext {
- public:
-  struct Params {
-    // Interface to operating system functionality.
-    Env* env;
-
-    // Function call support.
-    std::function<void(std::function<void()>)> runner = nullptr;
-  };
-
-  explicit IteratorContext(Params params) : params_(std::move(params)) {}
-
-  Env* env() const { return params_.env; }
-
-  std::function<void(std::function<void()>)>* runner() {
-    return &params_.runner;
-  }
-
- private:
-  Params params_;
-};
-
-// Represents the current position in a range of outputs, where the
-// range of outputs is typically represented by an `DatasetBase`,
-// defined below.
-class IteratorBase {
- public:
-  virtual ~IteratorBase() {}
-
-  // Gets the next output from the range that this iterator is traversing.
-  //
-  // If at least one output remains in this iterator's range, that
-  // output will be stored in `*out_tensors` and `false` will be
-  // stored in `*end_of_sequence`.
-  //
-  // If no more outputs remain in this iterator's range, `true` will
-  // be stored in `*end_of_sequence`, and the content of
-  // `*out_tensors` will be undefined.
-  //
-  // This method is thread-safe.
-  //
-  // TODO(mrry): Define `GetNextAsync()` or `GetNextManyAsync()`, and
-  // potentially remove this method.
-  virtual Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
-                         bool* end_of_sequence) = 0;
-
-  // Returns a vector of DataType values, representing the respective
-  // element types of each tuple component in the outputs of this
-  // iterator.
-  virtual const DataTypeVector& output_dtypes() const = 0;
-
-  // Returns a vector of tensor shapes, representing the respective
-  // (and possibly partially defined) shapes of each tuple component
-  // in the outputs of this iterator.
-  virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
-
-  // Saves the state of this iterator.
-  virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) {
-    return SaveInternal(writer);
-  }
-
-  // Restores the state of this iterator.
-  virtual Status Restore(OpKernelContext* ctx, IteratorStateReader* reader) {
-    return RestoreInternal(ctx, reader);
-  }
-
- protected:
-  // This is needed so that sub-classes of IteratorBase can call
-  // `SaveInternal` on their parent iterators, e.g., in
-  // `RepeatDataasetOp::Dataset`.
-  Status SaveParent(IteratorStateWriter* writer,
-                    const std::unique_ptr<IteratorBase>& parent) {
-    return parent->SaveInternal(writer);
-  }
-
-  // This is needed so that sub-classes of IteratorBase can call
-  // `RestoreInternal` on their parent iterators, e.g., in
-  // `RepeatDataasetOp::Dataset`.
-  Status RestoreParent(OpKernelContext* ctx, IteratorStateReader* reader,
-                       const std::unique_ptr<IteratorBase>& parent) {
-    return parent->RestoreInternal(ctx, reader);
-  }
-
-  // Saves the state of this iterator recursively.
-  virtual Status SaveInternal(IteratorStateWriter* writer) {
-    return errors::Unimplemented("SaveInternal");
-  }
-
-  // Restores the state of this iterator recursively.
-  virtual Status RestoreInternal(OpKernelContext* ctx,
-                                 IteratorStateReader* reader) {
-    return errors::Unimplemented("RestoreInternal");
-  }
-};
-
-// Represents a (potentially infinite) range of outputs, where each
-// output is a tuple of tensors.
-class DatasetBase : public core::RefCounted {
- public:
-  // Returns a new iterator for iterating over the range of elements in
-  // this dataset.
-  //
-  // This method may be called multiple times on the same instance,
-  // and the resulting iterators will have distinct state. Each
-  // iterator will traverse all elements in this dataset from the
-  // start.
-  //
-  // Ownership of the created iterator will be transferred to the caller.
-  //
-  // The prefix identifies the sequence of iterators leading up to the newly
-  // created iterator.
-  virtual std::unique_ptr<IteratorBase> MakeIterator(
-      const string& prefix) const = 0;
-
-  // Returns a vector of DataType values, representing the respective
-  // element types of each tuple component in the outputs of this
-  // dataset.
-  virtual const DataTypeVector& output_dtypes() const = 0;
-
-  // Returns a vector of tensor shapes, representing the respective
-  // (and possibly partially defined) shapes of each tuple component
-  // in the outputs of this dataset.
-  virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
-
-  // A human-readable debug string for this dataset.
-  virtual string DebugString() = 0;
-
-  // Serializes the dataset and writes it to the `writer`.
-  virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) const {
-    return errors::Unimplemented("DatasetBase::Save");
-  }
-
- protected:
-  // TODO(srbs): Ideally all graph related logic should reside in
-  // GraphDatasetBase. However, that would require Datasets defined in all ops
-  // to derive from GraphDatasetBase. Once that is done we can move
-  // DatasetGraphDefBuilder and AsGraphDefInternal to GraphDatasetBase.
-  class DatasetGraphDefBuilder : public GraphDefBuilderWrapper {
-   public:
-    DatasetGraphDefBuilder(GraphDefBuilder* b) : GraphDefBuilderWrapper(b) {}
-    Status AddParentDataset(OpKernelContext* ctx, const DatasetBase* dataset,
-                            Node** output) {
-      return dataset->AsGraphDefInternal(ctx, this, output);
-    }
-  };
-
-  virtual Status AsGraphDefInternal(OpKernelContext* ctx,
-                                    DatasetGraphDefBuilder* b,
-                                    Node** node) const {
-    return AsGraphDefInternal(b, node);
-  }
-
-  virtual Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
-                                    Node** node) const {
-    return errors::Unimplemented("AsGraphDefInternal");
-  }
-};
-
-// Base-class for datasets that are built by ops.
-class GraphDatasetBase : public DatasetBase {
- public:
-  GraphDatasetBase(OpKernelContext* ctx)
-      : op_name_(ctx->op_kernel().type_string()) {}
-
-  const string op_name() const { return op_name_; }
-
-  Status Save(OpKernelContext* ctx,
-              IteratorStateWriter* writer) const override {
-    string serialized_graph_def;
-    string output_node;
-    TF_RETURN_IF_ERROR(Serialize(ctx, &serialized_graph_def, &output_node));
-    TF_RETURN_IF_ERROR(
-        writer->WriteScalar(kDatasetGraphKey, serialized_graph_def));
-    TF_RETURN_IF_ERROR(
-        writer->WriteScalar(kDatasetGraphOutputNodeKey, output_node));
-    return Status::OK();
-  }
-
-  // Key for storing the Dataset graph in the serialized format.
-  static const char kDatasetGraphKey[];
-
-  // Key for storing the output node of the Dataset graph in the serialized
-  // format.
-  static const char kDatasetGraphOutputNodeKey[];
-
- private:
-  Status Serialize(OpKernelContext* ctx, string* serialized_graph_def,
-                   string* output_node) const {
-    GraphDefBuilder b;
-    DatasetGraphDefBuilder db(&b);
-    Node* node = nullptr;
-    TF_RETURN_IF_ERROR(AsGraphDefInternal(ctx, &db, &node));
-    *output_node = node->name();
-    GraphDef graph_def;
-    TF_RETURN_IF_ERROR(b.ToGraphDef(&graph_def));
-    graph_def.SerializeToString(serialized_graph_def);
-    return Status::OK();
-  }
-
-  const string op_name_;
-};
-
-// Represents an iterator that is associated with a particular parent dataset.
-template <class DatasetType>
-class DatasetIterator : public IteratorBase {
- public:
-  struct Params {
-    // Owns one reference on the shared dataset resource.
-    const DatasetType* dataset;
-
-    // Identifies the sequence of iterators leading up to this iterator.
-    const string prefix;
-  };
-
-  explicit DatasetIterator(const Params& params) : params_(params) {
-    params_.dataset->Ref();
-  }
-
-  ~DatasetIterator() override { params_.dataset->Unref(); }
-
-  // The dataset from which this iterator was created.
-  const DatasetType* dataset() const { return params_.dataset; }
-
-  // The sequence of iterators leading up to this iterator.
-  const string prefix() const { return params_.prefix; }
-
-  const DataTypeVector& output_dtypes() const override {
-    return params_.dataset->output_dtypes();
-  }
-
-  const std::vector<PartialTensorShape>& output_shapes() const override {
-    return params_.dataset->output_shapes();
-  }
-
-  Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
-                 bool* end_of_sequence) final {
-    port::Tracing::TraceMe activity(params_.prefix);
-    return GetNextInternal(ctx, out_tensors, end_of_sequence);
-  }
-
-  Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) final {
-    TF_RETURN_IF_ERROR(dataset()->Save(ctx, writer));
-    return IteratorBase::Save(ctx, writer);
-  }
-
- protected:
-  // Internal implementation of GetNext that is wrapped in tracing logic.
-  virtual Status GetNextInternal(IteratorContext* ctx,
-                                 std::vector<Tensor>* out_tensors,
-                                 bool* end_of_sequence) = 0;
-
-  string full_name(const string& name) const {
-    return strings::StrCat(prefix(), ":", name);
-  }
-
- private:
-  Params params_;
-};
-
-// Encapsulates the work required to plug a DatasetBase into the core TensorFlow
-// graph execution engine.
-class DatasetOpKernel : public OpKernel {
- public:
-  DatasetOpKernel(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-  void Compute(OpKernelContext* ctx) final;
-
- protected:
-  // Subclasses should implement this method. It will be called during Compute
-  // execution.
-  virtual void MakeDataset(OpKernelContext* ctx, DatasetBase** output) = 0;
-
-  template <typename T>
-  Status ParseScalarArgument(OpKernelContext* ctx,
-                             const StringPiece& argument_name, T* output) {
-    const Tensor* argument_t;
-    TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t));
-    if (!TensorShapeUtils::IsScalar(argument_t->shape())) {
-      return errors::InvalidArgument(argument_name, " must be a scalar");
-    }
-    *output = argument_t->scalar<T>()();
-    return Status::OK();
-  }
-};
-
-// Encapsulates the work required to plug unary Datasets into the core
-// TensorFlow graph execution engine.
-class UnaryDatasetOpKernel : public DatasetOpKernel {
- public:
-  UnaryDatasetOpKernel(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {}
-
- protected:
-  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) final;
-  virtual void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
-                           DatasetBase** output) = 0;
-};
-
-// Encapsulates the work required to plug binary Datasets into the core
-// TensorFlow graph execution engine.
-class BinaryDatasetOpKernel : public DatasetOpKernel {
- public:
-  BinaryDatasetOpKernel(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {}
-
- protected:
-  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) final;
-  virtual void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
-                           DatasetBase* another_input,
-                           DatasetBase** output) = 0;
-};
-
-// Validates and extracts a `DatasetBase` object from `tensor`.
-//
-// `tensor` must have been written by a call to SetVariantTensorToDataset().
-//
-// The retrieved pointer is a borrowed reference to the dataset, which is owned
-// by the tensor. The consumer must either acquire its own reference to the
-// dataset by calling `(*out_dataset)->Ref()`, or ensure that `tensor` is not
-// destroyed or mutated while the retrieved pointer is in use.
-Status GetDatasetFromVariantTensor(const Tensor& tensor,
-                                   DatasetBase** out_dataset);
-
-// Stores a `DatasetBase` object in `tensor`.
-//
-// The ownership of `dataset` is transferred to `tensor`.
-Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor);
-
-}  // namespace tensorflow
+#include "tensorflow/core/kernels/data/dataset.h"
 
 #endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATASET_H_
diff --git a/tensorflow/core/kernels/debug_ops.h b/tensorflow/core/kernels/debug_ops.h
index 2c2105312119fccd0f2be95d989f56388fc18ab4..381add3fb3bd57ebf068212cdd32a640bf60dd9b 100644
--- a/tensorflow/core/kernels/debug_ops.h
+++ b/tensorflow/core/kernels/debug_ops.h
@@ -185,7 +185,7 @@ class BaseDebugOp : public OpKernel {
       if (!status.ok()) {
         LOG(ERROR) << "Debug node of watch key "
                    << debug_watch_key_->debug_node_name
-                   << "failed to publish debug tensor data to all URLs "
+                   << " failed to publish debug tensor data to all URLs "
                    << str_util::Join(debug_urls_, ", ")
                    << ", due to: " << status.error_message();
       }
diff --git a/tensorflow/core/kernels/decode_bmp_op.cc b/tensorflow/core/kernels/decode_bmp_op.cc
index cd7956e1cb2d3394883694832b602bc485e6797d..c778278e8fbbec67a0255ea7d257c19da4f3612f 100644
--- a/tensorflow/core/kernels/decode_bmp_op.cc
+++ b/tensorflow/core/kernels/decode_bmp_op.cc
@@ -33,10 +33,11 @@ class DecodeBmpOp : public OpKernel {
  public:
   explicit DecodeBmpOp(OpKernelConstruction* context) : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("channels", &channels_));
-    OP_REQUIRES(context, channels_ == 0 || channels_ == 1 || channels_ == 3 ||
-                             channels_ == 4,
-                errors::InvalidArgument("channels must be 0, 1, 3 or 4, got ",
-                                        channels_));
+    OP_REQUIRES(
+        context,
+        channels_ == 0 || channels_ == 1 || channels_ == 3 || channels_ == 4,
+        errors::InvalidArgument("channels must be 0, 1, 3 or 4, got ",
+                                channels_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -48,6 +49,12 @@ class DecodeBmpOp : public OpKernel {
     // Start decoding image to get shape details
     const StringPiece input = contents.scalar<string>()();
 
+    OP_REQUIRES(context, (32 <= input.size()),
+                errors::InvalidArgument("Incomplete bmp content, requires at "
+                                        "least 32 bytes to find the header "
+                                        "size, width, height, and bpp, got ",
+                                        input.size(), " bytes"));
+
     const uint8* img_bytes = reinterpret_cast<const uint8*>(input.data());
     const int32 header_size = internal::SubtleMustCopy(
         *(reinterpret_cast<const int32*>(img_bytes + 10)));
@@ -73,6 +80,22 @@ class DecodeBmpOp : public OpKernel {
                 errors::InvalidArgument(
                     "Number of channels must be 1, 3 or 4, was ", channels_));
 
+    // there may be padding bytes when the width is not a multiple of 4 bytes
+    // 8 * channels == bits per pixel
+    const int row_size = (8 * channels_ * width + 31) / 32 * 4;
+
+    const int last_pixel_offset =
+        header_size + (abs(height) - 1) * row_size + (width - 1) * channels_;
+
+    // [expected file size] = [last pixel offset] + [last pixel size=channels]
+    const int expected_file_size = last_pixel_offset + channels_;
+
+    OP_REQUIRES(
+        context, (expected_file_size <= input.size()),
+        errors::InvalidArgument("Incomplete bmp content, requires at least ",
+                                expected_file_size, " bytes, got ",
+                                input.size(), " bytes"));
+
     // if height is negative, data layout is top down
     // otherwise, it's bottom up
     bool top_down = (height < 0);
@@ -85,25 +108,23 @@ class DecodeBmpOp : public OpKernel {
 
     const uint8* bmp_pixels = &img_bytes[header_size];
 
-    Decode(bmp_pixels, output->flat<uint8>().data(), width, abs(height),
-           channels_, top_down);
+    Decode(bmp_pixels, row_size, output->flat<uint8>().data(), width,
+           abs(height), channels_, top_down);
   }
 
-  uint8* Decode(const uint8* input, uint8* const output, const int width,
-                const int height, const int channles, bool top_down);
+  uint8* Decode(const uint8* input, const int row_size, uint8* const output,
+                const int width, const int height, const int channles,
+                bool top_down);
 
  private:
   int channels_;
 };
 REGISTER_KERNEL_BUILDER(Name("DecodeBmp").Device(DEVICE_CPU), DecodeBmpOp);
 
-uint8* DecodeBmpOp::Decode(const uint8* input, uint8* const output,
-                           const int width, const int height,
-                           const int channels, bool top_down) {
-  // there may be padding bytes when the width is not a multiple of 4 bytes
-  // 8 * channels == bits per pixel
-  int row_size = (8 * channels * width + 31) / 32 * 4;
-
+uint8* DecodeBmpOp::Decode(const uint8* input, const int row_size,
+                           uint8* const output, const int width,
+                           const int height, const int channels,
+                           bool top_down) {
   for (int i = 0; i < height; i++) {
     int src_pos;
     int dst_pos;
diff --git a/tensorflow/core/kernels/decode_compressed_op.cc b/tensorflow/core/kernels/decode_compressed_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3f0c0aef55b7a7436427dbb4311e25f59e038853
--- /dev/null
+++ b/tensorflow/core/kernels/decode_compressed_op.cc
@@ -0,0 +1,125 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/parse_ops.cc.
+
+#include <algorithm>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/io/zlib_compression_options.h"
+#include "tensorflow/core/lib/io/zlib_inputstream.h"
+
+namespace tensorflow {
+namespace {
+// Wrap memory buffer into InputStreamInterface
+class MemoryInputStream : public io::InputStreamInterface {
+ public:
+  explicit MemoryInputStream(const char* buffer, size_t length)
+      : buf_(buffer), len_(length), pos_(0) {}
+
+  ~MemoryInputStream(){};
+
+  Status ReadNBytes(int64 bytes_to_read, string* result) override {
+    result->clear();
+    if (bytes_to_read < 0) {
+      return errors::InvalidArgument("Can't read a negative number of bytes: ",
+                                     bytes_to_read);
+    }
+    int64 bytes = bytes_to_read;
+    Status s = Status::OK();
+    if (pos_ + bytes_to_read > len_) {
+      bytes = len_ - pos_;
+      s = errors::OutOfRange("reached end of file");
+    }
+    if (bytes > 0) {
+      result->resize(bytes);
+      memcpy(&(*result)[0], &buf_[pos_], bytes);
+      pos_ += bytes;
+    }
+    return s;
+  }
+
+  int64 Tell() const override { return pos_; }
+
+  Status Reset() override {
+    pos_ = 0;
+    return Status::OK();
+  }
+
+ private:
+  const char* buf_;  // Not owned.
+  int64 len_;
+  int64 pos_ = 0;  // Tracks where we are in the file.
+};
+}  // namespace
+
+class DecodeCompressedOp : public OpKernel {
+ public:
+  explicit DecodeCompressedOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("compression_type", &compression_type_));
+    OP_REQUIRES(context,
+                (compression_type_ == "" || compression_type_ == "ZLIB" ||
+                 compression_type_ == "GZIP"),
+                errors::InvalidArgument(
+                    "Only ZLIB, GZIP or NONE are supported compressions"));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor* bytes_tensor;
+    OP_REQUIRES_OK(context, context->input("bytes", &bytes_tensor));
+    const auto& bytes_flat = bytes_tensor->flat<string>();
+
+    Tensor* output_tensor = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output("output", bytes_tensor->shape(),
+                                            &output_tensor));
+    auto output_flat = output_tensor->flat<string>();
+    if (compression_type_ == "") {
+      for (int64 i = 0; i < bytes_flat.size(); i++) {
+        output_flat(i) = bytes_flat(i);
+      }
+    } else {
+      const io::ZlibCompressionOptions zlib_options =
+          compression_type_ == "ZLIB" ? io::ZlibCompressionOptions::DEFAULT()
+                                      : io::ZlibCompressionOptions::GZIP();
+      for (int64 i = 0; i < bytes_flat.size(); i++) {
+        std::unique_ptr<MemoryInputStream> input_stream(
+            new MemoryInputStream(bytes_flat(i).data(), bytes_flat(i).size()));
+        std::unique_ptr<io::ZlibInputStream> zlib_stream(
+            new io::ZlibInputStream(
+                input_stream.get(), static_cast<size_t>(kBufferSize),
+                static_cast<size_t>(kBufferSize), zlib_options));
+        std::string output_string;
+        Status s = zlib_stream->ReadNBytes(INT_MAX, &output_string);
+        OP_REQUIRES(context, (s.ok() || errors::IsOutOfRange(s)), s);
+        output_flat(i) = output_string;
+      }
+    }
+  }
+
+ private:
+  enum { kBufferSize = 256 << 10 /* 256 kB */ };
+  std::string compression_type_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("DecodeCompressed").Device(DEVICE_CPU),
+                        DecodeCompressedOp)
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc
index 208401cb24e9c7ebf28e42ccb2762764474a5377..c9c97dc072c93e3ab840a8a9c9d81eadd2adaa3c 100644
--- a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc
@@ -62,6 +62,8 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
 #define DEFINE_GPU_KERNELS(T) \
   template struct functor::DenseUpdate<GPUDevice, T, ASSIGN>;
 TF_CALL_GPU_ALL_TYPES(DEFINE_GPU_KERNELS);
+TF_CALL_int32(DEFINE_GPU_KERNELS);
+TF_CALL_int64(DEFINE_GPU_KERNELS);
 #undef DEFINE_GPU_KERNELS
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
index 53d65a22d1a6cbd5b15aeb9019a204e40c02f1d0..9347978d515b9244dde2b50b2fcfaa3c91ab9c94 100644
--- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
@@ -231,7 +231,8 @@ static void CopyOutputBackpropRegion(const DepthwiseArgs& args,
       }
       // Pad to vector-register width (if needed).
       for (int64 d = 0; d < pad_size; ++d) {
-        buffer[buf_base + vectorized_size + scalar_size + d] = static_cast<T>(0);
+        buffer[buf_base + vectorized_size + scalar_size + d] =
+            static_cast<T>(0);
       }
     }
   }
@@ -510,7 +511,8 @@ static void DepthwiseConvBackpropInputReference(const DepthwiseArgs& args,
 
 #if GOOGLE_CUDA
 
-extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, Eigen::half>;
+extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice,
+                                                          Eigen::half>;
 extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, float>;
 extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, double>;
 
@@ -885,7 +887,8 @@ static void DepthwiseConvBackpropFilterReference(const DepthwiseArgs& args,
 
 #if GOOGLE_CUDA
 
-extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, Eigen::half>;
+extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice,
+                                                           Eigen::half>;
 extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, float>;
 extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, double>;
 
diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc
index 2759ecb2f1157b037b700cc5b4662a35b175c08c..a5fd07fbe177f2206ef9b6b3252556211b9e3905 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op.cc
@@ -373,8 +373,11 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
     // If in_depth==1, this operation is just a standard convolution, so
     // invoke that op.
     if (std::is_same<T, float>::value && in_depth == 1) {
+      // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
+      // conv is supported.
       launcher_(context, use_cudnn_, cudnn_use_autotune_, input, filter,
-                stride_, stride_, padding_, output, data_format_);
+                /*row_dilation=*/1, /*col_dilation=*/1, stride_, stride_,
+                padding_, output, data_format_);
       return;
     }
 
diff --git a/tensorflow/core/kernels/depthwise_conv_op.h b/tensorflow/core/kernels/depthwise_conv_op.h
index 11aed5b415a4bed8286a23796667266ce73beea8..097a9f5bfad4f1cf0232b0bb31cf6f88fdb5696b 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.h
+++ b/tensorflow/core/kernels/depthwise_conv_op.h
@@ -158,7 +158,8 @@ struct DepthwiseFilterPadOp {
       }
       // Pad the remainder of output to vector-register boundary.
       for (int64 j = 0; j < pad_size; ++j) {
-        padded_filter[output_base + vectorized_size + scalar_size + j] = static_cast<T>(0);
+        padded_filter[output_base + vectorized_size + scalar_size + j] =
+            static_cast<T>(0);
       }
     }
   }
diff --git a/tensorflow/core/kernels/diag_op.cc b/tensorflow/core/kernels/diag_op.cc
index be862b82f1b311e3e46bbe27de9921bb548fa0b6..86fa7dce36afff121dc6ff0642f45c809bc63a3d 100644
--- a/tensorflow/core/kernels/diag_op.cc
+++ b/tensorflow/core/kernels/diag_op.cc
@@ -108,7 +108,7 @@ class DiagPartOp : public OpKernel {
 };
 
 // Implementation of the functor specialization for CPU.
-// 
+//
 // According to the diagonal definition,
 // `output[i1,..., ik, i1,..., ik] = input[i1,..., ik]`,
 //
@@ -116,7 +116,7 @@ class DiagPartOp : public OpKernel {
 // pointer can be represent by coordinate [i1,..., ik],
 // where `index = i1*(s2*...*sk) + i2*(s3*...*sk) +... + ik`
 //
-// Let new_index is the offset of output's pointer with coordinate 
+// Let new_index is the offset of output's pointer with coordinate
 // [i1,..., ik, i1,..., ik], then we have
 // `new_index = i1*(s2*...sk*s1*...*sk) + i2*(s3*...*sk*s1*...*sk) +... + \
 //              ik*(s1*...*sk) + i1*(s2*...*sk) + i2*(s3*...*sk) +... + ik
diff --git a/tensorflow/core/kernels/diag_op_gpu.cu.cc b/tensorflow/core/kernels/diag_op_gpu.cu.cc
index 684f00ea61d136a3ed75d6a6b19f7eff02c30d1e..d3c529d784e3a9ba4a793cd98cff9eb5e74d6090 100644
--- a/tensorflow/core/kernels/diag_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/diag_op_gpu.cu.cc
@@ -33,7 +33,7 @@ __global__ void DiagCudaKernel(const int num_threads,
                                const T* in,
                                T* out) {
   CUDA_1D_KERNEL_LOOP(index, num_threads) {
-    // Fill the diagonal elements or set to zero in other place. 
+    // Fill the diagonal elements or set to zero in other place.
     if (index % (1 + size) == 0) {
       out[index] = in[index / (1 + size)];
     } else {
diff --git a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
index fc98556440b949c89d8e41901dd57dec552b71df..9bb58b13f382970c60b551f448243a2b75e30df3 100644
--- a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
@@ -19,11 +19,12 @@ limitations under the License.
 // 2. We apply cub::DeviceRadixSort::SortPairs to the key - value pairs given
 //    by partitions and indices_in. This will result in two new vectors
 //    partitions_out and indices_out, with partitions_out sorted.
-// 3. The first dimension of outputs[i] is equal to the length of the interval
-//    of i-values in partitions_out. We determine it in two steps:
-//    - compute the starting and ending point of each interval,
-//    - subtract the starting and ending points to find the length.
-//    The result is placed in partition_count.
+// 3. The first dimension of outputs[i] is equal to the number of i-values in
+//    partitions_out. We determine it in two steps:
+//    - apply cub::DeviceReduce::ReduceByKey to count how many times each value
+//      appears in partitions_out,
+//    - move the results to partition_count. This handles missing values
+//      (corresponding to empty parts).
 // 4. Because partition_count is on the GPU, we bring it asynchronously to
 //    the CPU. Then we can allocate the output tensors.
 // 5. Finally, we use indices_out and the gather functor to collect the output.
@@ -35,6 +36,9 @@ limitations under the License.
 #define EIGEN_USE_GPU
 
 #include "external/cub_archive/cub/device/device_radix_sort.cuh"
+#include "external/cub_archive/cub/device/device_reduce.cuh"
+#include "external/cub_archive/cub/iterator/constant_input_iterator.cuh"
+#include "external/cub_archive/cub/thread/thread_operators.cuh"
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -44,6 +48,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/kernels/gather_functor_gpu.cu.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
+#include "tensorflow/core/util/transform_output_iterator.h"
 
 namespace tensorflow {
 
@@ -57,34 +62,14 @@ __global__ void RangeInitKernel(const T start, const T delta, const int32 size,
   CUDA_1D_KERNEL_LOOP(i, size) { out[i] = start + i * delta; }
 }
 
-__global__ void FindEndpointsKernel(const int32* partitions, int32 size,
-                                    int32 nump, int32* start, int32* end) {
-  CUDA_1D_KERNEL_LOOP(i, size) {
-    int32 current = ldg(partitions + i);
-    if (FastBoundsCheck(current, nump)) {
-      if (i == 0)
-        start[current] = i;
-      else {
-        int32 before = ldg(partitions + i - 1);
-        if (before != current) start[current] = i;
-      }
-      if (i == size - 1)
-        end[current] = i + 1;
-      else {
-        int32 after = ldg(partitions + i + 1);
-        if (after != current) end[current] = i + 1;
-      }
-    }
-  }
-}
-
-// We create a local version of subtract, because the tf.subtract kernel
-// is not defined for int32. We use it to compute the length of an interval
-// by subtracting the endpoints.
-__global__ void IntervalLengthKernel(int32* start, int32 size, int32* end) {
-  CUDA_1D_KERNEL_LOOP(i, size) {
-    int32 start_point = ldg(start + i);
-    end[i] = end[i] - start_point;
+__global__ void MoveValuesKernel(const int32* keys, const int32* values,
+                                 const int32* size, int32 out_size,
+                                 int32* out) {
+  int32 N = min(ldg(size), out_size);
+  CUDA_1D_KERNEL_LOOP(i, N) {
+    int32 key = ldg(keys + i);
+    int32 value = ldg(values + i);
+    if (FastBoundsCheck(key, out_size)) out[key] = value;
   }
 }
 
@@ -99,23 +84,18 @@ void RangeInit(const GPUDevice& d, const T start, const T delta,
       start, delta, size, out.data());
 }
 
-// Partitions is a sorted vector of N non-negative integer numbers.
-// This function computes the starting and ending points of each interval
-// of values.
-void ComputeIntervals(const GPUDevice& d, Tensor* partitions, int32 N,
-                      int32 nump, int32* start_ptr, int32* end_ptr) {
-  CudaLaunchConfig config = GetCudaLaunchConfig(N, d);
-  FindEndpointsKernel<<<config.block_count, config.thread_per_block, 0,
-                        d.stream()>>>(partitions->flat<int32>().data(), N, nump,
-                                      start_ptr, end_ptr);
-}
-
-// Subtract the ending points of each interval to obtain the interval length.
-void ComputeItvLength(const GPUDevice& d, int32 num, int32* start_ptr,
-                      int32* end_ptr) {
-  CudaLaunchConfig config = GetCudaLaunchConfig(num, d);
-  IntervalLengthKernel<<<config.block_count, config.thread_per_block, 0,
-                         d.stream()>>>(start_ptr, num, end_ptr);
+// Given *num_runs pairs (key, value), this function moves the value
+// corresponding to key i at position i in the array out.
+void MoveValues(const GPUDevice& d, int32* keys, int32* values, int32* num_runs,
+                int32 out_size, int32* out) {
+  // Because num_runs is located on the GPU, we can not access it directly.
+  // So we launch the kernel with size = out_size.
+  // This is valid for correct inputs, because then out_size >= *num_runs.
+  // For wrong inputs, we may have out_size < *num_runs. In this case we will
+  // only handle the first out_size values.
+  CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d);
+  MoveValuesKernel<<<config.block_count, config.thread_per_block, 0,
+                     d.stream()>>>(keys, values, num_runs, out_size, out);
 }
 
 template <typename T>
@@ -130,10 +110,75 @@ void CallGatherKernel(const GPUDevice& d, const T* params, const int32* indices,
       out_size);
 }
 
+struct IdentityOp {
+  __device__ int32 __forceinline__ operator()(const int32& a) const {
+    return a;
+  }
+};
+
+// Define an output iterator that only allows assignment to
+// positions between [base, base + limit).
+class BoundedOutputIterator
+    : public TransformOutputIterator<int32, int32, IdentityOp> {
+ private:
+  int32 limit;
+  int32* base;
+
+  struct BoundedReference : Reference {
+    int32 limit;
+    int32* base;
+    // Constructor
+    __host__ __device__ __forceinline__
+    BoundedReference(int32* ptr, int32* base, IdentityOp op, int32 limit)
+        : Reference(ptr, op), limit(limit), base(base) {}
+
+    // Assignment
+    __host__ __device__ __forceinline__ int32 operator=(int32 val) {
+      if (ptr - base < limit && ptr - base >= 0) *ptr = val;
+      return val;
+    }
+  };
+
+ public:
+  typedef BoundedOutputIterator self_type;
+  typedef BoundedReference reference;
+
+  __host__ __device__ __forceinline__ BoundedOutputIterator(int32* ptr,
+                                                            IdentityOp op,
+                                                            int32 size)
+      : TransformOutputIterator(ptr, op), limit(size), base(ptr) {}
+
+  __host__ __device__ __forceinline__
+  BoundedOutputIterator(int32* ptr, int32* base, IdentityOp op, int32 size)
+      : TransformOutputIterator(ptr, op), limit(size), base(base) {}
+
+  // Indirection
+  __host__ __device__ __forceinline__ reference operator*() const {
+    return BoundedReference(ptr, base, conversion_op, limit);
+  }
+
+  // Array subscript
+  __host__ __device__ __forceinline__ reference operator[](int32 n) const {
+    return BoundedReference(ptr + n, base, conversion_op, limit);
+  }
+
+  // Addition
+  __host__ __device__ __forceinline__ self_type operator+(int32 n) const {
+    self_type retval(ptr + n, base, conversion_op, limit);
+    return retval;
+  }
+
+  // Subtraction
+  __host__ __device__ __forceinline__ self_type operator-(int32 n) const {
+    self_type retval(ptr - n, base, conversion_op, limit);
+    return retval;
+  }
+};
+
 }  // namespace
 
 // The current implementation has memory cost on GPU
-// I + P + max(3N + R, O + N), where:
+// I + P + max(3N + R + P, O + N), where:
 // I - the size of the input
 // N - the size of the partitions tensor
 // R - the temporary storage used by cub::RadixSort, about 2N
@@ -310,9 +355,11 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
                          Tensor* partition_count, Tensor* indices_out,
                          DoneCallback done) {
     const GPUDevice& device = c->eigen_device<GPUDevice>();
+    const cudaStream_t& cu_stream = GetCudaStream(c);
     int32 N = partitions->NumElements();
     Tensor indices_in;
     Tensor partitions_out;
+    Tensor aggregates_out;
 
     // Allocate memory for Radix-Sort.
     this->AllocateTempSpace(c, N, &indices_in, &partitions_out, indices_out,
@@ -321,24 +368,66 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
     this->RadixSort(c, partitions, &indices_in, &partitions_out, indices_out,
                     done);
     if (!c->status().ok()) return;
-    // We still need a little bit of additional memory. However,
-    // we can reuse the indices_in tensor. We could also use atomic
-    // operations and no additional memory, but this approach seems faster.
+    // We will now apply a reduce operation to count how many times
+    // each index appears in partitions.
 
-    // Zero-out the allocated memory.
+    // Zero-out the partition_count tensor.
     functor::SetZeroFunctor<GPUDevice, int32> zero_functor;
     zero_functor(device, partition_count->flat<int32>());
-    zero_functor(device, indices_in.flat<int32>());
+    // Allocate memory for aggregates_out.
+    OP_REQUIRES_OK_ASYNC(
+        c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
+                            &aggregates_out),
+        done);
     // Obtain the pointers to inner buffers.
-    int32* start_ptr = indices_in.flat<int32>().data();
-    int32* end_ptr = partition_count->flat<int32>().data();
-    // Obtain the starting and ending points of each interval.
-    ComputeIntervals(device, &partitions_out, N, num_partitions_, start_ptr,
-                     end_ptr);
-    // Subtract to compute the number of appearances of each id.
-    ComputeItvLength(device, num_partitions_, start_ptr, end_ptr);
-  }  // At this point indices_in and partitions_out will be marked
-     // for deallocation.
+    int32* keys_in_ptr = partitions_out.flat<int32>().data();
+    // Here we reuse the indices_in tensor for the unique keys output.
+    int32* unique_out_ptr = indices_in.flat<int32>().data();
+    int32* aggregates_out_ptr = aggregates_out.flat<int32>().data();
+    // We wrap the pointers in bounded output iterators to guard against
+    // wrong inputs (more than num_partitions distinct indices).
+    IdentityOp id_op;
+    BoundedOutputIterator unique_out_it(unique_out_ptr, id_op, num_partitions_);
+    BoundedOutputIterator aggregates_out_it(aggregates_out_ptr, id_op,
+                                            num_partitions_);
+
+    cub::ConstantInputIterator<int32> values_in(1);
+    cub::Sum reduction_op;
+
+    // Allocate space on GPU for the number of runs. This is required by CUB.
+    Tensor num_runs;
+    OP_REQUIRES_OK_ASYNC(
+        c, c->allocate_temp(DT_INT32, TensorShape({1}), &num_runs), done);
+    int32* num_runs_ptr = num_runs.flat<int32>().data();
+
+    // Determine temporary device storage requirements
+    Tensor cub_temp_storage;
+    size_t temp_storage_bytes = 0;
+    cub::DeviceReduce::ReduceByKey(NULL, temp_storage_bytes, keys_in_ptr,
+                                   unique_out_it, values_in, aggregates_out_it,
+                                   num_runs_ptr, reduction_op, N, cu_stream);
+    // Allocate temporary storage.
+    OP_REQUIRES_OK_ASYNC(
+        c, c->allocate_temp(
+               DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
+               &cub_temp_storage),
+        done);
+    // Run reduce-by-key. The effect is that we count how many times
+    // each index appears in partitions. The distinct indices are stored
+    // in unique_out, while the count is stored in aggregates_out.
+    // The total number of distinct indices is stored in num_runs.
+    cub::DeviceReduce::ReduceByKey(cub_temp_storage.flat<int8>().data(),
+                                   temp_storage_bytes, keys_in_ptr,
+                                   unique_out_it, values_in, aggregates_out_it,
+                                   num_runs_ptr, reduction_op, N, cu_stream);
+    // We are not done yet. unique_out only contains the indices that appeared
+    // at least once in partitions. We move each value from aggregates_out
+    // to the corresponding position in partition_count. This will handle
+    // possibly empty parts.
+    MoveValues(device, unique_out_ptr, aggregates_out_ptr, num_runs_ptr,
+               num_partitions_, partition_count->flat<int32>().data());
+  }  // At this point indices_in, partitions_out, aggregates_out
+     // and cub_temp_storage will be marked for deallocation.
 
   void GatherSlices(OpKernelContext* c, const Tensor* data,
                     const Tensor* indices, int32 N, int64 slice_size,
@@ -358,7 +447,7 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
     }
   }
 
-  int num_partitions_;
+  int32 num_partitions_;
 };
 
 #define REGISTER_DYNAMIC_PARTITION_GPU(T)                                 \
diff --git a/tensorflow/core/kernels/fifo_queue.cc b/tensorflow/core/kernels/fifo_queue.cc
index ea86b04762d52bd1debe80c2d404cff7bd276406..82ec87911985abe714490ad74fa19105f850b536 100644
--- a/tensorflow/core/kernels/fifo_queue.cc
+++ b/tensorflow/core/kernels/fifo_queue.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/batch_util.h"
 #include "tensorflow/core/kernels/fifo_queue.h"
 #include "tensorflow/core/kernels/queue_base.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -94,7 +95,7 @@ Status FIFOQueue::GetElementComponentFromBatch(const FIFOQueue::Tuple& tuple,
   TF_RETURN_IF_ERROR(ctx->allocate_persistent(
       tuple[component].dtype(), element_shape, out_tensor, &element_access));
   TF_RETURN_IF_ERROR(
-      CopySliceToElement(tuple[component], element_access, index));
+      batch_util::CopySliceToElement(tuple[component], element_access, index));
   return Status::OK();
 }
 
@@ -329,8 +330,8 @@ void FIFOQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
                   const int64 index = attempt->tuple[0].dim_size(0) -
                                       attempt->elements_requested;
                   for (int i = 0; i < num_components(); ++i) {
-                    attempt->context->SetStatus(CopyElementToSlice(
-                        tuple[i], &attempt->tuple[i], index));
+                    attempt->context->SetStatus(batch_util::CopyElementToSlice(
+                        std::move(tuple[i]), &attempt->tuple[i], index));
                     if (!attempt->context->status().ok()) return kComplete;
                   }
                   tuple.clear();
diff --git a/tensorflow/core/kernels/flat_map_dataset_op.cc b/tensorflow/core/kernels/flat_map_dataset_op.cc
deleted file mode 100644
index e62a43e94cc277dd8880d13ed22a25909e705b30..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/flat_map_dataset_op.cc
+++ /dev/null
@@ -1,159 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
-#include "tensorflow/core/common_runtime/function.h"
-#include "tensorflow/core/framework/partial_tensor_shape.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/lib/random/random.h"
-
-#include "tensorflow/core/kernels/captured_function.h"
-#include "tensorflow/core/kernels/dataset_utils.h"
-
-namespace tensorflow {
-
-namespace {
-
-// See documentation in ../ops/dataset_ops.cc for a high-level
-// description of the following op.
-
-class FlatMapDatasetOp : public UnaryDatasetOpKernel {
- public:
-  explicit FlatMapDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx),
-        graph_def_version_(ctx->graph_def_version()) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-  }
-
-  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
-                   DatasetBase** output) override {
-    OpInputList inputs;
-    OP_REQUIRES_OK(ctx, ctx->input_list("other_arguments", &inputs));
-    std::vector<Tensor> other_arguments;
-    other_arguments.reserve(inputs.size());
-    for (const Tensor& t : inputs) {
-      other_arguments.push_back(t);
-    }
-
-    std::unique_ptr<CapturedFunction> captured_func;
-    OP_REQUIRES_OK(ctx, CapturedFunction::Create(ctx, func_, graph_def_version_,
-                                                 std::move(other_arguments),
-                                                 &captured_func));
-
-    *output = new Dataset(input, std::move(captured_func), output_types_,
-                          output_shapes_);
-  }
-
- private:
-  class Dataset : public DatasetBase {
-   public:
-    Dataset(const DatasetBase* input,
-            std::unique_ptr<CapturedFunction> captured_func,
-            const DataTypeVector& output_types,
-            const std::vector<PartialTensorShape>& output_shapes)
-        : input_(input),
-          captured_func_(std::move(captured_func)),
-          output_types_(output_types),
-          output_shapes_(output_shapes) {
-      input_->Ref();
-    }
-
-    ~Dataset() override { input_->Unref(); }
-
-    std::unique_ptr<IteratorBase> MakeIterator(
-        const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::FlatMap")}));
-    }
-
-    const DataTypeVector& output_dtypes() const override {
-      return output_types_;
-    }
-
-    const std::vector<PartialTensorShape>& output_shapes() const override {
-      return output_shapes_;
-    }
-
-    string DebugString() override { return "FlatMapDatasetOp::Dataset"; }
-
-   private:
-    class Iterator : public DatasetIterator<Dataset> {
-     public:
-      explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
-
-      Status GetNextInternal(IteratorContext* ctx,
-                             std::vector<Tensor>* out_tensors,
-                             bool* end_of_sequence) override {
-        mutex_lock l(mu_);
-        do {
-          if (current_element_iterator_) {
-            // We are currently precessing a mapped element, so try to get the
-            // next subelement.
-            bool end_of_element;
-            TF_RETURN_IF_ERROR(current_element_iterator_->GetNext(
-                ctx, out_tensors, &end_of_element));
-            if (!end_of_element) {
-              // Produce the subelement as output.
-              *end_of_sequence = false;
-              return Status::OK();
-            }
-
-            // We have reached the end of the current element, so maybe move on
-            // to the next element.
-            current_element_iterator_.reset();
-          }
-
-          // Get the next element from the input dataset.
-          std::vector<Tensor> args;
-          TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &args, end_of_sequence));
-          if (*end_of_sequence) {
-            return Status::OK();
-          }
-
-          TF_RETURN_IF_ERROR(dataset::MakeIteratorFromInputElement(
-              ctx, args, element_index_++, dataset()->captured_func_.get(),
-              prefix(), &current_element_iterator_));
-        } while (true);
-      }
-
-     private:
-      mutex mu_;
-      size_t element_index_ GUARDED_BY(mu_) = 0;
-      const std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
-      std::unique_ptr<IteratorBase> current_element_iterator_ GUARDED_BY(mu_);
-    };
-
-    const DatasetBase* const input_;
-    const std::unique_ptr<CapturedFunction> captured_func_;
-    const DataTypeVector output_types_;
-    const std::vector<PartialTensorShape> output_shapes_;
-  };
-
-  const int graph_def_version_;
-  DataTypeVector output_types_;
-  std::vector<PartialTensorShape> output_shapes_;
-  NameAttrList func_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("FlatMapDataset").Device(DEVICE_CPU),
-                        FlatMapDatasetOp);
-
-}  // namespace
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc
index f2290e87a5fdac44629ed6b81c8661cf74c2054e..9382ff7847fcbe8a7e9de4af56eac7774036042f 100644
--- a/tensorflow/core/kernels/function_ops.cc
+++ b/tensorflow/core/kernels/function_ops.cc
@@ -318,7 +318,7 @@ class RemoteCallOp : public AsyncOpKernel {
     if (opts.source_device != target_device) {
       opts.remote_execution = true;
     }
-    opts.rendezvous = ctx->rendezvous();
+    opts.create_rendezvous = true;
     std::vector<Tensor> args;
     args.reserve(arguments.size());
     for (const Tensor& argument : arguments) {
diff --git a/tensorflow/core/kernels/gather_nd_op.cc b/tensorflow/core/kernels/gather_nd_op.cc
index 5dc74d720ab22e2f2f10baf8309b59661740184f..7e5a9e1ec5aac26706d95646a29539dd0f4be2ed 100644
--- a/tensorflow/core/kernels/gather_nd_op.cc
+++ b/tensorflow/core/kernels/gather_nd_op.cc
@@ -176,10 +176,12 @@ Status DoGatherNd(OpKernelContext* c, const Tensor& params,
       PARAMS_CASE(3);
       PARAMS_CASE(4);
       PARAMS_CASE(5);
+      PARAMS_CASE(6);
+      PARAMS_CASE(7);
 #undef PARAMS_CASE
       default:
         return errors::InvalidArgument(
-            "Only indices.shape[-1] values between 1 and 5 "
+            "Only indices.shape[-1] values between 1 and 7 "
             "are currently supported.  Requested rank: ",
             indices_nd);
     }
@@ -218,7 +220,9 @@ namespace functor {
   DECLARE_GPU_SPECS_INDEX_NDIM(T, Index, 2); \
   DECLARE_GPU_SPECS_INDEX_NDIM(T, Index, 3); \
   DECLARE_GPU_SPECS_INDEX_NDIM(T, Index, 4); \
-  DECLARE_GPU_SPECS_INDEX_NDIM(T, Index, 5);
+  DECLARE_GPU_SPECS_INDEX_NDIM(T, Index, 5); \
+  DECLARE_GPU_SPECS_INDEX_NDIM(T, Index, 6); \
+  DECLARE_GPU_SPECS_INDEX_NDIM(T, Index, 7);
 
 #define DECLARE_GPU_SPECS(T)         \
   DECLARE_GPU_SPECS_INDEX(T, int32); \
diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl_6.cc b/tensorflow/core/kernels/gather_nd_op_cpu_impl_6.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2aec872448ec02581faf95e30844e5e1e80cd277
--- /dev/null
+++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl_6.cc
@@ -0,0 +1,18 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define CPU_PROVIDED_IXDIM 6
+#include "tensorflow/core/kernels/gather_nd_op_cpu_impl.h"
+#undef CPU_PROVIDED_IXDIM
diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl_7.cc b/tensorflow/core/kernels/gather_nd_op_cpu_impl_7.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9222cb07695cb1c05b12da59b0c0bbc96bebb388
--- /dev/null
+++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl_7.cc
@@ -0,0 +1,18 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define CPU_PROVIDED_IXDIM 7
+#include "tensorflow/core/kernels/gather_nd_op_cpu_impl.h"
+#undef CPU_PROVIDED_IXDIM
diff --git a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc
index ed5240c20abd247404cb926dd9a455af901c0d7c..b03efc684ffca4abde99b31952983aad5f805ee3 100644
--- a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc
@@ -111,7 +111,9 @@ struct GatherNdSlice<GPUDevice, T, Index, IXDIM> {
   DEFINE_GPU_SPECS_INDEX_NDIM(T, Index, 2); \
   DEFINE_GPU_SPECS_INDEX_NDIM(T, Index, 3); \
   DEFINE_GPU_SPECS_INDEX_NDIM(T, Index, 4); \
-  DEFINE_GPU_SPECS_INDEX_NDIM(T, Index, 5);
+  DEFINE_GPU_SPECS_INDEX_NDIM(T, Index, 5); \
+  DEFINE_GPU_SPECS_INDEX_NDIM(T, Index, 6); \
+  DEFINE_GPU_SPECS_INDEX_NDIM(T, Index, 7);
 
 #define DEFINE_GPU_SPECS(T)         \
   DEFINE_GPU_SPECS_INDEX(T, int32); \
diff --git a/tensorflow/core/kernels/guarantee_const_op.cc b/tensorflow/core/kernels/guarantee_const_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..de3a2a1148b7e7922a08cfce159fb05ccdb9fe30
--- /dev/null
+++ b/tensorflow/core/kernels/guarantee_const_op.cc
@@ -0,0 +1,47 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+namespace {
+
+// Refer to the Op description for detailed comments.
+class GuaranteeConstOp : public OpKernel {
+ public:
+  explicit GuaranteeConstOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const DataType input_dtype = ctx->input_dtype(0);
+    OP_REQUIRES(ctx, input_dtype != DT_RESOURCE,
+                errors::InvalidArgument(
+                    "Input tensor cannot be a resource variable handle."));
+    const Tensor& input_tensor = ctx->input(0);
+    Tensor* output = nullptr;
+    if (!ctx->forward_input_to_output_with_shape(0, 0, input_tensor.shape(),
+                                                 &output)) {
+      ctx->set_output(0, input_tensor);
+    }
+  }
+
+  bool IsExpensive() override { return false; }
+};
+
+REGISTER_KERNEL_BUILDER(Name("GuaranteeConst").Device(DEVICE_CPU),
+                        GuaranteeConstOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/guarantee_const_op_test.cc b/tensorflow/core/kernels/guarantee_const_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..01461fbb8c22a2bfb9669bef680759ecab324a61
--- /dev/null
+++ b/tensorflow/core/kernels/guarantee_const_op_test.cc
@@ -0,0 +1,75 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/variable_ops.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+class GuaranteeConstOpTest : public OpsTestBase {
+ protected:
+  Status Init(DataType input_type) {
+    TF_CHECK_OK(NodeDefBuilder("op", "GuaranteeConst")
+                    .Input(FakeInput(input_type))
+                    .Finalize(node_def()));
+    return InitOp();
+  }
+};
+
+TEST_F(GuaranteeConstOpTest, Int32Success_6) {
+  TF_ASSERT_OK(Init(DT_INT32));
+  AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor expected(allocator(), DT_INT32, TensorShape({6}));
+  test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+  test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+}
+
+TEST_F(GuaranteeConstOpTest, Int32Success_2_3) {
+  TF_ASSERT_OK(Init(DT_INT32));
+  AddInputFromArray<int32>(TensorShape({2, 3}), {1, 2, 3, 4, 5, 6});
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor expected(allocator(), DT_INT32, TensorShape({2, 3}));
+  test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+  test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+}
+
+TEST_F(GuaranteeConstOpTest, StringSuccess) {
+  TF_ASSERT_OK(Init(DT_STRING));
+  AddInputFromArray<string>(TensorShape({6}), {"A", "b", "C", "d", "E", "f"});
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor expected(allocator(), DT_STRING, TensorShape({6}));
+  test::FillValues<string>(&expected, {"A", "b", "C", "d", "E", "f"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(GuaranteeConstOpTest, ResourceInputError) {
+  TF_ASSERT_OK(Init(DT_RESOURCE));
+  AddResourceInput("", "resource", new Var(DT_INT32));
+  const auto status = RunOpKernel();
+  ASSERT_EQ(error::INVALID_ARGUMENT, status.code());
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/inplace_ops.cc b/tensorflow/core/kernels/inplace_ops.cc
index 01ae5a83c1eec9eb4ccb74841555b5bb1b6cd60f..7728ba850c94aa79feb31d137712692df0f89176 100644
--- a/tensorflow/core/kernels/inplace_ops.cc
+++ b/tensorflow/core/kernels/inplace_ops.cc
@@ -52,6 +52,7 @@ Status DoParallelConcat(const CPUDevice& d, const Tensor& value, int32 loc,
     return DoParallelConcatUpdate<CPUDevice, type>(d, value, loc, output);
     TF_CALL_NUMBER_TYPES(CASE);
     TF_CALL_string(CASE);
+    TF_CALL_variant(CASE);
 #undef CASE
     default:
       return errors::InvalidArgument("Unsupported data type: ", value.dtype());
diff --git a/tensorflow/core/kernels/logging_ops.cc b/tensorflow/core/kernels/logging_ops.cc
index 67d603dd0ae9851d1135e0d031efc16ca612f680..bacf3e77408a12a8a95bf7e7ab8f3a580e675675 100644
--- a/tensorflow/core/kernels/logging_ops.cc
+++ b/tensorflow/core/kernels/logging_ops.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <iostream>
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -76,7 +77,7 @@ class PrintOp : public OpKernel {
       strings::StrAppend(&msg, "[", ctx->input(i).SummarizeValue(summarize_),
                          "]");
     }
-    LOG(INFO) << msg;
+    std::cerr << msg << std::endl;
   }
 
  private:
diff --git a/tensorflow/core/kernels/lookup_table_op.cc b/tensorflow/core/kernels/lookup_table_op.cc
index e774c771b8e28c1a3c19cfafb6e7597c81e4eb5c..418d9dcc610c98bb1e7135b29d929fd17478fcd1 100644
--- a/tensorflow/core/kernels/lookup_table_op.cc
+++ b/tensorflow/core/kernels/lookup_table_op.cc
@@ -823,6 +823,7 @@ REGISTER_KERNEL(int64, int64);
 REGISTER_KERNEL(int64, float);
 REGISTER_KERNEL(string, string);
 REGISTER_KERNEL(string, bool);
+REGISTER_KERNEL(int32, int32);
 
 #undef REGISTER_KERNEL
 
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index 157ce106ce685ab9f1f77a5d03df51eb8732270a..2eefadad4949fd8d78f6a27533ce0385c38d9c69 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -359,7 +359,8 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
 
     use_dnn_ = CanUseCudnn();
-    ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false, &propagate_nans_);
+    TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
+                                   &propagate_nans_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -888,7 +889,8 @@ class MaxPoolingWithArgmaxOp : public OpKernel {
                 errors::Unimplemented(
                     "Pooling is not yet supported on the batch dimension."));
 
-    ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false, &propagate_nans_);
+    TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
+                                   &propagate_nans_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -1052,7 +1054,8 @@ class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
                     "Pooling is not yet supported on the batch dimension."));
     use_dnn_ = CanUseCudnn();
 
-    ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false, &propagate_nans_);
+    TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
+                                   &propagate_nans_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -1137,7 +1140,8 @@ class MaxPoolingNoMaskV2Op<GPUDevice, T> : public OpKernel {
     }
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
     use_dnn_ = CanUseCudnn();
-    ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false, &propagate_nans_);
+    TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
+                                   &propagate_nans_));
   }
 
   void Compute(OpKernelContext* context) override {
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
index d96b844383edec2abb04c75226bd37e574a3dba5..f8daaca4c94aada5dbae5e5582f0da075b7222d5 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
@@ -405,17 +405,17 @@ bool MaxPoolForwardWithOptionalArgmax<T>::operator()(
   if (propagate_nans) {
     MaxPoolForwardNHWC<true>
         <<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
-           kThreadsPerBlock, 0, d.stream()>>>
-        (output_size, bottom_data, height, width, channels, pooled_height,
-         pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
-         top_data, mask);
+           kThreadsPerBlock, 0, d.stream()>>>(
+            output_size, bottom_data, height, width, channels, pooled_height,
+            pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
+            top_data, mask);
   } else {
     MaxPoolForwardNHWC<false>
         <<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
-           kThreadsPerBlock, 0, d.stream()>>>
-        (output_size, bottom_data, height, width, channels, pooled_height,
-         pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
-         top_data, mask);
+           kThreadsPerBlock, 0, d.stream()>>>(
+            output_size, bottom_data, height, width, channels, pooled_height,
+            pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
+            top_data, mask);
   }
   return d.ok();
 }
diff --git a/tensorflow/core/kernels/meta_support.cc b/tensorflow/core/kernels/meta_support.cc
index b29feb003242500548d1a4bf83a31c8c2d1c57d0..9fed01189fc3bfde4ad1e23ea8fda0c76311b3bc 100644
--- a/tensorflow/core/kernels/meta_support.cc
+++ b/tensorflow/core/kernels/meta_support.cc
@@ -82,7 +82,7 @@ gemmlowp::WorkersPool* GetWorkersPool() {
 }
 
 mutex& GetMutex() {
-  static mutex mu;
+  static mutex mu(LINKER_INITIALIZED);
   return mu;
 }
 
diff --git a/tensorflow/core/kernels/mfcc.h b/tensorflow/core/kernels/mfcc.h
index c39f10499091f0b5c6c74a3e70a812169b84c807..0d5d9fb90f8bd137aea5d7f3b8c08dfcd1495c18 100644
--- a/tensorflow/core/kernels/mfcc.h
+++ b/tensorflow/core/kernels/mfcc.h
@@ -33,10 +33,11 @@ class Mfcc {
   bool Initialize(int input_length,
                   double input_sample_rate);
 
-  // Input is a single magnitude spectrogram frame. The input spectrum
-  // is filtered into bands using a triangular mel filterbank and a
-  // discrete cosine transform (DCT) of the values is taken. Output is
-  // populated with the lowest dct_coefficient_count of these values.
+  // Input is a single squared-magnitude spectrogram frame. The input spectrum
+  // is converted to linear magnitude and weighted into bands using a
+  // triangular mel filterbank, and a discrete cosine transform (DCT) of the
+  // values is taken. Output is populated with the lowest dct_coefficient_count
+  // of these values.
   void Compute(const std::vector<double>& spectrogram_frame,
                std::vector<double>* output) const;
 
diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank.h b/tensorflow/core/kernels/mfcc_mel_filterbank.h
index 33ea1bdb5bc3e2a2326913c99f2f6713bd82f096..a766a20cbca4a7772a62a2701334c87a5ed57531 100644
--- a/tensorflow/core/kernels/mfcc_mel_filterbank.h
+++ b/tensorflow/core/kernels/mfcc_mel_filterbank.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// Basic class for applying a mel-scale filterbank to an input.
+// Basic class for applying a mel-scale mapping to a power spectrum.
 
 #ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_MEL_FILTERBANK_H_
 #define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_MEL_FILTERBANK_H_
@@ -32,8 +32,9 @@ class MfccMelFilterbank {
                   double lower_frequency_limit,
                   double upper_frequency_limit);
 
-  // Takes a magnitude spectrogram slice as input, computes a
-  // traingular mel filterbank and places the result in output.
+  // Takes a squared-magnitude spectrogram slice as input, computes a
+  // triangular-mel-weighted linear-magnitude filterbank, and places the result
+  // in output.
   void Compute(const std::vector<double>& input,
                std::vector<double>* output) const;
 
diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc
index 935eb81dd05897b49446cc285222a946be3d2931..9aabbbdb6b4d9041ec2d8dffc0cb69199306dba1 100644
--- a/tensorflow/core/kernels/mkl_aggregate_ops.cc
+++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include <numeric>
-
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -29,10 +28,17 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-namespace tensorflow {
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+using mkldnn::stream;
+using mkldnn::sum;
+#endif
 
+namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
+#ifndef INTEL_MKL_DNN
+
 template <typename Device, typename T>
 class MklAddNOp : public OpKernel {
  public:
@@ -41,17 +47,18 @@ class MklAddNOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override {
     const int num = ctx->num_inputs();
     OP_REQUIRES(ctx, num / 2 == 2,
-                errors::InvalidArgument("Only additions of two arguments "
+                errors::InvalidArgument("Only additions of two tensors "
                                         "supported by MKL. Num inputs: ",
                                         num));
 
     MklAddNOpContext mkl_context;
-    const Tensor& input0 = MklGetInput(ctx, 0);
-    GetMklShape(ctx, 0, &(mkl_context.input1_shape));
+    size_t src1_idx = 0, src2_idx = 1;
+    const Tensor& input0 = MklGetInput(ctx, src1_idx);
+    GetMklShape(ctx, src1_idx, &(mkl_context.input1_shape));
     bool input1_in_mkl_format = mkl_context.input1_shape.IsMklTensor();
 
-    const Tensor& input1 = MklGetInput(ctx, 1);
-    GetMklShape(ctx, 1, &(mkl_context.input2_shape));
+    const Tensor& input1 = MklGetInput(ctx, src2_idx);
+    GetMklShape(ctx, src2_idx, &(mkl_context.input2_shape));
     bool input2_in_mkl_format = mkl_context.input2_shape.IsMklTensor();
 
     // handle the case of a scalar
@@ -59,13 +66,12 @@ class MklAddNOp : public OpKernel {
       const TensorShape& o_shape = input0.shape();
       Tensor* out_tensor = nullptr;
       mkl_context.output_shape.SetMklTensor(false);
-      AllocateOutputSetMklShape(ctx, 0, &out_tensor, o_shape,
+      AllocateOutputSetMklShape(ctx, src1_idx, &out_tensor, o_shape,
                                 mkl_context.output_shape);
       float user_i1 = (input0.scalar<T>()());
-      ;
       float user_i2 = (input1.scalar<T>()());
-      ;
-      out_tensor->scalar<T>()() = std::plus<float>{}(user_i1, user_i2);
+      out_tensor->scalar<T>()() =
+          std::plus<float>{}(user_i1, user_i2);
       return;
     }
 
@@ -82,8 +88,8 @@ class MklAddNOp : public OpKernel {
       if (o_shape.num_elements() == 0) {
         Tensor* out_tensor = nullptr;
         mkl_context.output_shape.SetMklTensor(false);
-        AllocateOutputSetMklShape(ctx, 0, &out_tensor, o_shape,
-                                  mkl_context.output_shape);
+        AllocateOutputSetMklShape(ctx, src1_idx, &out_tensor, o_shape,
+                                 mkl_context.output_shape);
         return;
       }
     }
@@ -92,9 +98,9 @@ class MklAddNOp : public OpKernel {
     mkl_context.in_strides = new size_t[mkl_context.in_dims];
     // Generate size, stride for input if input is in MKL format.
     if (input1_in_mkl_format || input2_in_mkl_format) {
-      const MklShape* tmp_mkl_shape = (input1_in_mkl_format)
-                                          ? &mkl_context.input1_shape
-                                          : &mkl_context.input2_shape;
+      const MklShape* tmp_mkl_shape =
+        (input1_in_mkl_format) ? &mkl_context.input1_shape :
+        &mkl_context.input2_shape;
       for (int i = 0; i < mkl_context.in_dims; i++) {
         mkl_context.in_sizes[i] = tmp_mkl_shape->GetSizes()[i];
         mkl_context.in_strides[i] = tmp_mkl_shape->GetStrides()[i];
@@ -110,7 +116,6 @@ class MklAddNOp : public OpKernel {
             mkl_context.in_strides[i - 1] * mkl_context.in_sizes[i - 1];
       }
     }
-
     std::vector<float> coeff(2, 1.0);
     mkl_context.MklCreateInputLayouts(ctx);
     CHECK_EQ(dnnSumCreate_F32(&mkl_context.Eltwise, mkl_context.attributes, 2,
@@ -127,7 +132,7 @@ class MklAddNOp : public OpKernel {
      mkl_context.output_shape.SetMklLayout(mkl_context.Eltwise, dnnResourceDst);
 
      mkl_context.output_shape.SetTfLayout(
-         mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
+        mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
      if (input1_in_mkl_format == true) {
       mkl_context.output_shape.SetTfDimOrder(mkl_context.in_dims,
       mkl_context.input1_shape.GetTfToMklDimMap());
@@ -139,12 +144,12 @@ class MklAddNOp : public OpKernel {
                         mkl_context.output_shape.GetMklLayout())) /
                     sizeof(T));
 
-     AllocateOutputSetMklShape(ctx, 0, &output, tf_shape,
+     AllocateOutputSetMklShape(ctx, src1_idx, &output, tf_shape,
                               mkl_context.output_shape);
     } else {
      const TensorShape& o_shape = input1.shape();
      mkl_context.output_shape.SetMklTensor(false);
-     AllocateOutputSetMklShape(ctx, 0, &output, o_shape,
+     AllocateOutputSetMklShape(ctx, src1_idx, &output, o_shape,
                                 mkl_context.output_shape);
     }
 
@@ -172,16 +177,18 @@ class MklAddNOp : public OpKernel {
     void MklCreateInputLayouts(OpKernelContext* context) {
       bool input1_in_mkl_format = input1_shape.IsMklTensor();
       if (!input1_in_mkl_format) {
-        CHECK_EQ(dnnLayoutCreate_F32(&lt_input1, in_dims, in_sizes, in_strides),
-                 E_SUCCESS);
+        CHECK_EQ(
+            dnnLayoutCreate_F32(&lt_input1, in_dims, in_sizes, in_strides),
+            E_SUCCESS);
       } else {
         lt_input1 = static_cast<dnnLayout_t>(input1_shape.GetCurLayout());
       }
 
       bool input2_in_mkl_format = input2_shape.IsMklTensor();
       if (!input2_in_mkl_format) {
-        CHECK_EQ(dnnLayoutCreate_F32(&lt_input2, in_dims, in_sizes, in_strides),
-                 E_SUCCESS);
+        CHECK_EQ(
+            dnnLayoutCreate_F32(&lt_input2, in_dims, in_sizes, in_strides),
+            E_SUCCESS);
       } else {
         lt_input2 = static_cast<dnnLayout_t>(input2_shape.GetCurLayout());
       }
@@ -257,8 +264,8 @@ class MklAddNOp : public OpKernel {
       bool input2_in_mkl_format = input2_shape.IsMklTensor();
       dnnDelete_F32(Eltwise);
       if (!input1_in_mkl_format || !input2_in_mkl_format) {
-        delete[] in_sizes;
-        delete[] in_strides;
+         delete [] in_sizes;
+         delete [] in_strides;
       }
       if (!input1_in_mkl_format) {
          dnnLayoutDelete_F32(lt_input1);
@@ -270,6 +277,151 @@ class MklAddNOp : public OpKernel {
   } MklAddNOpContext;
 };
 
+#else  // INTEL_MKL_DNN
+template <typename Device, typename T>
+class MklAddNOp : public OpKernel {
+ public:
+  ~MklAddNOp() {}
+  explicit MklAddNOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const int num = ctx->num_inputs();
+    // Only additions of 2 input tensors is supported now
+    OP_REQUIRES(ctx, num / 2 == 2,
+                errors::InvalidArgument("Only additions of two tensors "
+                                        "supported by MKL. Num inputs: ",
+                                        num));
+
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      size_t src1_idx = 0, src2_idx = 1;
+      const Tensor& src1_tensor = MklGetInput(ctx, src1_idx);
+      const Tensor& src2_tensor = MklGetInput(ctx, src2_idx);
+
+      MklDnnShape src1_mkl_shape, src2_mkl_shape;
+      GetMklShape(ctx, src1_idx, &src1_mkl_shape);
+      GetMklShape(ctx, src2_idx, &src2_mkl_shape);
+      bool input1_in_mkl_format = src1_mkl_shape.IsMklTensor();
+      bool input2_in_mkl_format = src2_mkl_shape.IsMklTensor();
+      int src1_dims_size = input1_in_mkl_format?
+       src1_mkl_shape.GetDimension(): src1_tensor.dims();
+      int src2_dims_size = input2_in_mkl_format?
+       src2_mkl_shape.GetDimension(): src2_tensor.dims();
+
+      if (!input1_in_mkl_format && src1_dims_size == 0) {
+         Tensor* dst_tensor = nullptr;
+         MklShape mkl_shape_dst;
+         mkl_shape_dst.SetMklTensor(false);
+         AllocateOutputSetMklShape(ctx, src1_idx, &dst_tensor,
+         src1_tensor.shape(), mkl_shape_dst);
+         float user_i1 = (src1_tensor.scalar<T>()());
+         float user_i2 = (src2_tensor.scalar<T>()());
+         dst_tensor->scalar<T>()() =
+           std::plus<float>{}(user_i1, user_i2);
+         return;
+       }
+
+      // If there is nothing to compute, return.
+      if (!input1_in_mkl_format && !input2_in_mkl_format) {
+        if (src1_tensor.shape().num_elements() == 0) {
+           Tensor* dst_tensor = nullptr;
+           MklShape mkl_shape_dst;
+           mkl_shape_dst.SetMklTensor(false);
+           AllocateOutputSetMklShape(ctx, src1_idx, &dst_tensor,
+           src1_tensor.shape(), mkl_shape_dst);
+           return;
+        }
+      }
+
+      // element-wise add operator for tensor input1 and tensor input2
+      std::vector<double> coeff(2, 1.0);
+      MklDnnData<T> src1(&cpu_engine);
+      MklDnnData<T> src2(&cpu_engine);
+      MklDnnData<T> dst(&cpu_engine);
+
+      int tmp_size = input1_in_mkl_format ? src2_dims_size: src1_dims_size;
+      memory::dims dims(tmp_size);
+      memory::dims strides(tmp_size);
+      memory::desc md1({}, memory::data_undef, memory::format_undef);
+      memory::desc md2({}, memory::data_undef, memory::format_undef);
+
+      if ( input1_in_mkl_format || input2_in_mkl_format ) {
+        if ( input1_in_mkl_format ) {
+          md1 = src1_mkl_shape.GetMklLayout();
+          md2 = md1;
+          dst.SetUsrMem(md1);
+        } else {
+          md2 = src2_mkl_shape.GetMklLayout();
+          md1 = md2;
+          dst.SetUsrMem(md2);
+        }
+      } else {
+         dims = TFShapeToMklDnnDims(src1_tensor.shape());
+         strides = CalculateTFStrides(dims);
+         md1 = MklDnnData<T>::CreateBlockedMemDesc(dims, strides);
+         md2 = md1;
+         dst.SetUsrMem(dims, strides);
+      }
+
+      std::vector<memory::primitive_desc> srcs_pd;
+
+      src1.SetUsrMem(md1, &src1_tensor);
+      auto mpd1 = src1.GetUsrMemPrimDesc();
+      srcs_pd.push_back(mpd1);
+
+      src2.SetUsrMem(md2, &src2_tensor);
+      auto mpd2 = src2.GetUsrMemPrimDesc();
+      srcs_pd.push_back(mpd2);
+
+      std::vector<primitive::at> inputs;
+      inputs.push_back(src1.GetOpMem());
+      inputs.push_back(src2.GetOpMem());
+      auto output_pd = dst.GetUsrMemPrimDesc();
+      Tensor* dst_tensor = nullptr;
+      auto sum_pd = sum::primitive_desc(dst.GetUsrMemDesc(), coeff, srcs_pd);
+      auto sum_op = sum(sum_pd, inputs, dst.GetOpMem());
+      if ( input2_in_mkl_format || input1_in_mkl_format ) {
+         MklDnnShape output_mkl_shape;
+         output_mkl_shape.SetMklTensor(true);
+         output_mkl_shape.SetMklLayout(&output_pd);
+         output_mkl_shape.SetElemType(MklDnnType<T>());
+         if ( input1_in_mkl_format ) {
+          output_mkl_shape.SetTfLayout(src1_dims_size,
+          src1_mkl_shape.GetSizesAsMklDnnDims(),
+          src1_mkl_shape.GetTfDataFormat());
+         } else {
+          output_mkl_shape.SetTfLayout(src2_dims_size,
+          src2_mkl_shape.GetSizesAsMklDnnDims(),
+          src2_mkl_shape.GetTfDataFormat());
+         }
+         TensorShape output_tf_shape;
+         output_tf_shape.AddDim((output_pd.get_size() / sizeof(T))
+         + (output_pd.get_size()%sizeof(T) == 0 ? 0 : 1));
+         AllocateOutputSetMklShape(ctx, src1_idx, &dst_tensor, output_tf_shape,
+                                output_mkl_shape);
+      } else {
+         MklShape mkl_shape_dst;
+         mkl_shape_dst.SetMklTensor(false);
+         AllocateOutputSetMklShape(ctx, src1_idx,
+         &dst_tensor, src1_tensor.shape(), mkl_shape_dst);
+      }
+
+      dst.SetUsrMemDataHandle(dst_tensor);
+      std::vector<primitive> net;
+      net.push_back(sum_op);
+      stream(stream::kind::eager).submit(net).wait();
+    } catch (mkldnn::error &e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                       ", message: " + string(e.message) +
+                       ", in file " + string(__FILE__) + ":" +
+                       std::to_string(__LINE__);
+      OP_REQUIRES_OK(ctx, errors::Aborted("Operation received an exception:",
+                                            error_msg));
+    }
+  }
+};
+
+#endif
 #define REGISTER_MKL_CPU(T)                                         \
   REGISTER_KERNEL_BUILDER(Name("_MklAddN")                          \
                               .Device(DEVICE_CPU)                   \
diff --git a/tensorflow/core/kernels/mkl_avgpooling_op.cc b/tensorflow/core/kernels/mkl_avgpooling_op.cc
index d90baee069c17e9b25169dcb2650681f6103f9b1..d751a70fc86b40d8ca656322484848cf906359fd 100644
--- a/tensorflow/core/kernels/mkl_avgpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_avgpooling_op.cc
@@ -24,10 +24,25 @@
 
 #include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
 
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+using mkldnn::memory;
+using mkldnn::error;
+using mkldnn::pooling_forward;
+using mkldnn::pooling_backward;
+using mkldnn::padding_kind;
+using mkldnn::engine;
+using mkldnn::prop_kind;
+using mkldnn::algorithm;
+#endif
+
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
+// For now, MKL-ML is default. So making MKL-DNN not a default choice.
+#ifndef INTEL_MKL_DNN
+
 template <typename Device, typename T>
 class MklAvgPoolingOp : public OpKernel {
  public:
@@ -132,7 +147,7 @@ class MklAvgPoolingOp : public OpKernel {
         E_SUCCESS);
 
     mkl_context.MklCleanup();
-  }
+  }  // Compute
 
  private:
   typedef struct {
@@ -411,7 +426,293 @@ class MklAvgPoolingGradOp : public OpKernel {
   std::vector<int32> stride_;
   Padding padding_;
   TensorFormat data_format_;
-};
+};  // MklAvgPoolingGradOp
+
+
+#else  // INTEL_MKL_DNN is defined
+
+template <typename Device, typename T>
+class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
+ public:
+  explicit MklAvgPoolingOp(OpKernelConstruction* context)
+  : MklPoolingForwardOpBase<T>(context) {
+    // Workspace is an MKLDNN construct that is only used in Max Pooling.
+    // So set workspace_enabled_ to false.
+    this->workspace_enabled_ = false;
+  }
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      const Tensor& input_tensor = MklGetInput(context,
+              this->kInputTensorIndexInput);
+      MklDnnShape dnn_shape_input;
+      GetMklShape(context, this->kInputTensorIndexInput, &dnn_shape_input);
+      this->SanityCheckInput(context, input_tensor, dnn_shape_input);
+      if (!context->status().ok()) return;
+
+      MklDnnData<T> dnn_data_input(&cpu_engine);
+      MklDnnData<T> dnn_data_output(&cpu_engine);
+
+      // initialize variables for the pooling op
+      MklPoolParameters pool_params;
+      // Get the input tensor and initialize the pooling parameters
+      this->ConfigureInput(context, dnn_shape_input,
+                          input_tensor, &pool_params,
+                          &dnn_data_input);
+      OP_REQUIRES_OK(context, context->status());
+
+      // Declare output tensor
+      Tensor* output_tensor = nullptr;
+      memory::dims output_dims_mkl_order;
+      this->GetOutputDims(pool_params, &output_dims_mkl_order);
+
+      // If input is in Mkl layout, then just get the memory format from it
+      // directly, instead of using input data_format to AvgPool.
+      if (dnn_shape_input.IsMklTensor()) {
+        dnn_data_output.SetUsrMem(output_dims_mkl_order,
+                static_cast<memory::format>(dnn_data_input.GetUsrMemDesc()
+                    .data.format));
+
+      } else {
+          dnn_data_output.SetUsrMem(output_dims_mkl_order,
+              this->data_format_mkldnn_);
+      }
+
+        // describe the memory layout
+      dnn_data_output.SetOpMemDesc(output_dims_mkl_order, memory::format::any);
+
+      // 3. create a pooling primitive descriptor
+      auto pool_desc = pooling_forward::desc(prop_kind::forward,
+              algorithm::pooling_avg_exclude_padding,
+              dnn_data_input.GetUsrMemDesc(),
+              dnn_data_output.GetUsrMemDesc(),
+              memory::dims({  pool_params.row_stride,
+                              pool_params.col_stride}),
+              memory::dims({  pool_params.window_rows,
+                              pool_params.window_cols}),
+              memory::dims({  static_cast<int>(pool_params.pad_top),
+                              static_cast<int>(pool_params.pad_left)}),
+              memory::dims({  static_cast<int>(pool_params.pad_bottom),
+                              static_cast<int>(pool_params.pad_right)}),
+              TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_prim_desc = pooling_forward::primitive_desc(pool_desc,
+                                                 cpu_engine);
+
+      this->AllocateOutputTensor(context, pool_prim_desc, output_dims_mkl_order,
+                            this->data_format_mkldnn_, &output_tensor);
+      CHECK_NOTNULL(output_tensor);
+
+      OP_REQUIRES_OK(context, context->status());
+      dnn_data_output.SetUsrMemDataHandle(output_tensor);
+
+      this->PrepareAndExecuteNet(pool_prim_desc,
+                                &dnn_data_input,
+                                &dnn_data_output);
+    } catch (mkldnn::error &e) {
+        string error_msg = "Status: " + std::to_string(e.status) +
+                        ", message: " + string(e.message) +
+                        ", in file " + string(__FILE__) + ":" +
+                        std::to_string(__LINE__);
+        OP_REQUIRES_OK(context,
+                        errors::Aborted("Operation received an exception:",
+                                         error_msg));
+    }
+  }  // Compute
+};  // MklAvgPoolingOp
+
+//-----------------------------------------------------------------------------
+
+template <class Device, class T>
+class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> {
+ public:
+  explicit MklAvgPoolingGradOp(OpKernelConstruction* context)
+      : MklPoolingBackwardOpBase<T>(context) {
+  }
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      MklDnnShape original_input_mkl_shape, input_gradient_mkl_shape;
+      const Tensor& tensor_in_shape = MklGetInput(context,
+          kInputTensorIndexInputShape);
+      const Tensor& input_gradient_tensor = MklGetInput(context,
+          kInputTensorIndexInputGradient);
+      GetMklShape(context, kInputTensorIndexInputShape,
+            &original_input_mkl_shape);
+      GetMklShape(context, kInputTensorIndexInputGradient,
+            &input_gradient_mkl_shape);
+
+
+      SanityCheckInputs(context, tensor_in_shape,
+                        input_gradient_tensor,
+                        original_input_mkl_shape,
+                        input_gradient_mkl_shape);
+      if (!context->status().ok()) return;
+
+      // Used to allocate output_diff_src/diff_src
+      // and create pool_fwd mdm desc
+      // 0. Input("orig_input_shape: int32") //NOT a T Tensor!
+      // 1. Input("grad: T")
+
+      MklDnnData<T> input_gradient_diff_dst(&cpu_engine);
+      MklDnnData<T> output_diff_src(&cpu_engine);
+      Tensor* output_tensor_diff_src = nullptr;
+      TensorShape original_input_shape;
+      MklPoolParameters pool_params;
+      memory::dims output_dims_mkl_order, original_input_dims_nchw;
+      // Configure the original input memory descriptor
+      memory::desc original_input_md = ConfigureOriginalInput(context,
+                                      tensor_in_shape,
+                                      original_input_mkl_shape,
+                                      &original_input_dims_nchw,
+                                      &pool_params,
+                                      &original_input_shape);
+
+      // configure the original output memory descriptor
+      // by definition, the shape of the original output is the same
+      // as the shape of the gradient diff_dst
+      memory::desc original_output_md = this->ConfigureOriginalOutput(
+                pool_params, input_gradient_mkl_shape, output_dims_mkl_order);
+
+      memory::desc target_diff_dst_md = this->ConfigureInputGradient(
+                                    input_gradient_mkl_shape,
+                                    input_gradient_tensor,
+                                    &input_gradient_diff_dst,
+                                    original_output_md);
+      // The shape of the output diff src needs to be the same shape as the
+      // original input. But we will set its format to be same as the format of
+      // input gradient. We won't use format of original input since it will
+      // always be in Tensorflow layout (given that AvgPoolGrad gets shape of
+      // the input rather than actual input).
+      output_diff_src.SetUsrMem(original_input_dims_nchw,
+                                static_cast<memory::format>(
+                                  target_diff_dst_md.data.format));
+
+      // Create the forward pooling primitive descriptor so we can reference it
+      // in the backward pooling primitive descriptor
+      auto pool_fwd_desc = pooling_forward::desc(prop_kind::forward,
+              algorithm::pooling_avg_exclude_padding,
+              original_input_md,
+              original_output_md,
+              memory::dims({  pool_params.row_stride,
+                              pool_params.col_stride}),
+              memory::dims({  pool_params.window_rows,
+                              pool_params.window_cols}),
+              memory::dims({  static_cast<int>(pool_params.pad_top),
+                              static_cast<int>(pool_params.pad_left)}),
+              memory::dims({  static_cast<int>(pool_params.pad_bottom),
+                              static_cast<int>(pool_params.pad_right)}),
+              TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_fwd_prim_desc
+              = pooling_forward::primitive_desc(pool_fwd_desc,
+                                                  cpu_engine);
+
+      auto pool_bkwd_desc = pooling_backward::desc(
+              algorithm::pooling_avg_exclude_padding,
+              output_diff_src.GetUsrMemDesc(),
+              target_diff_dst_md,
+              memory::dims({  pool_params.row_stride,
+                              pool_params.col_stride}),
+              memory::dims({  pool_params.window_rows,
+                              pool_params.window_cols}),
+              memory::dims({  static_cast<int>(pool_params.pad_top),
+                              static_cast<int>(pool_params.pad_left)}),
+              memory::dims({  static_cast<int>(pool_params.pad_bottom),
+                              static_cast<int>(pool_params.pad_right)}),
+              TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_bkwd_prim_desc
+                = pooling_backward::primitive_desc(pool_bkwd_desc,
+                                              cpu_engine,
+                                              pool_fwd_prim_desc);
+      this->AllocateOutputTensor(context, pool_bkwd_prim_desc,
+                      original_input_dims_nchw,
+                      this->data_format_mkldnn_,
+                      &output_tensor_diff_src);
+
+      output_diff_src.SetUsrMemDataHandle(output_tensor_diff_src);
+
+      this->PrepareAndExecuteNet(pool_bkwd_prim_desc,
+                          &input_gradient_diff_dst,
+                          &output_diff_src,
+                          memory::primitive_desc(
+                              target_diff_dst_md,
+                              cpu_engine));
+    } catch (mkldnn::error &e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                      ", message: " + string(e.message) +
+                      ", in file " + string(__FILE__) + ":" +
+                      std::to_string(__LINE__);
+      OP_REQUIRES_OK(context,
+                      errors::Aborted("Compute received an exception:",
+                                      error_msg));
+    }
+  }  // Compute
+
+ private:
+  // 0. Input("orig_input_shape: int32")
+  // 1. Input("grad: T")
+  const int kInputTensorIndexInputShape = 0;
+  const int kInputTensorIndexInputGradient = 1;
+
+  memory::desc ConfigureOriginalInput(OpKernelContext* context,
+        const Tensor& tensor_original_input_shape,
+        const MklDnnShape& original_input_mkl_shape,
+        memory::dims* original_input_dims_mkl_order,
+        MklPoolParameters* pool_params,
+        TensorShape* input_tensor_shape) {
+    CHECK_NOTNULL(original_input_dims_mkl_order);
+    CHECK_NOTNULL(pool_params);
+    CHECK_NOTNULL(input_tensor_shape);
+    // For AvgPoolGrad, we only get the size of the original input because
+    // The original data is irrelvant.
+    auto shape_vec = tensor_original_input_shape.vec<int32>();
+    for (int64 i = 0; i < tensor_original_input_shape.NumElements(); ++i) {
+      input_tensor_shape->AddDim(shape_vec(i));
+    }
+
+    return MklPoolingBackwardOpBase<T>::ConfigureOriginalInput(
+                                              context,
+                                              tensor_original_input_shape,
+                                              original_input_mkl_shape,
+                                              original_input_dims_mkl_order,
+                                              pool_params,
+                                              *input_tensor_shape);
+}
+
+  void SanityCheckInputs(OpKernelContext* context,
+                        const Tensor& tensor_in_shape,
+                        const Tensor& input_gradient_tensor,
+                        const MklDnnShape& original_input_mkl_shape,
+                        const MklDnnShape& input_gradient_mkl_shape) {
+    if (!original_input_mkl_shape.IsMklTensor()) {
+      OP_REQUIRES(context, tensor_in_shape.dims() == 1 &&
+          tensor_in_shape.NumElements() == 4,
+          errors::InvalidArgument("original input shape must be "
+                "1-dimensional and 4 elements"));
+    } else {
+      OP_REQUIRES(context, original_input_mkl_shape.GetDimension() == 1 &&
+          original_input_mkl_shape.DimSize(0) == 4,
+          errors::InvalidArgument("original input shape must be "
+                "1-dimensional and 4 elements"));
+    }
+
+    if (!input_gradient_mkl_shape.IsMklTensor()) {
+      // For avgpooling, input_gradient_diff_dst should have 4 dimensions.
+      OP_REQUIRES(context, input_gradient_tensor.dims() == 4,
+          errors::InvalidArgument("Gradient shape must be "
+                              "4-dimensional"));
+    } else {
+      OP_REQUIRES(context, input_gradient_mkl_shape.GetDimension() == 4,
+          errors::InvalidArgument("Gradient shape must be "
+                              "4-dimensional"));
+    }
+  }
+};  // MklAvgPoolingGradOp
+
+
+
+#endif  // INTEL_MKL_DNN
 
 REGISTER_KERNEL_BUILDER(Name("_MklAvgPool")
                             .Device(DEVICE_CPU)
@@ -427,3 +728,4 @@ REGISTER_KERNEL_BUILDER(Name("_MklAvgPoolGrad")
 
 }  // namespace tensorflow
 #endif  // INTEL_MKL
+
diff --git a/tensorflow/core/kernels/mkl_batch_matmul_op.cc b/tensorflow/core/kernels/mkl_batch_matmul_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9fee94f946555480fce8acf904a7909622404524
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_batch_matmul_op.cc
@@ -0,0 +1,239 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/math_ops.cc.
+
+// This file uses MKL CBLAS batched xGEMM for acceleration of TF Batch
+// Matrix-Matrix Multiplication (MatMul) operations.
+// We currently register this kernel only for MKL supported data
+// types (float, double, complex64, complex128). The macro INTEL_MKL is defined
+// by the build system only when MKL is chosen as an option at configure stage
+// and when it is undefined at build time, this file becomes an empty
+// compilation unit
+
+#define EIGEN_USE_THREADS
+
+#if defined(INTEL_MKL)
+#include <vector>
+#include "mkl_cblas.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/numeric_types.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/type_traits.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/fill_functor.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#define MKL_Complex8 tensorflow::complex64
+#define MKL_Complex16 tensorflow::complex128
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Device, typename Scalar>
+class BatchMatMulMkl : public OpKernel {
+ public:
+  explicit BatchMatMulMkl(OpKernelConstruction *context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("adj_x", &adj_x_));
+    OP_REQUIRES_OK(context, context->GetAttr("adj_y", &adj_y_));
+  }
+
+  virtual ~BatchMatMulMkl() {}
+
+  void Compute(OpKernelContext *ctx) override {
+    const Tensor &lhs = ctx->input(0);
+    const Tensor &rhs = ctx->input(1);
+    OP_REQUIRES(ctx, lhs.dims() == rhs.dims(),
+                errors::InvalidArgument("lhs and rhs has different ndims: ",
+                                        lhs.shape().DebugString(), " vs. ",
+                                        rhs.shape().DebugString()));
+    const int ndims = lhs.dims();
+    OP_REQUIRES(
+        ctx, ndims >= 2,
+        errors::InvalidArgument("lhs and rhs ndims must be >= 2: ", ndims));
+    TensorShape out_shape;
+    for (int i = 0; i < ndims - 2; ++i) {
+      OP_REQUIRES(ctx, lhs.dim_size(i) == rhs.dim_size(i),
+                  errors::InvalidArgument(
+                      "lhs.dim(", i, ") and rhs.dim(", i,
+                      ") must be the same: ", lhs.shape().DebugString(), " vs ",
+                      rhs.shape().DebugString()));
+      out_shape.AddDim(lhs.dim_size(i));
+    }
+    auto batch_size = (ndims == 2) ? 1 : out_shape.num_elements();
+    auto lhs_rows = lhs.dim_size(ndims - 2);
+    auto lhs_cols = lhs.dim_size(ndims - 1);
+    auto rhs_rows = rhs.dim_size(ndims - 2);
+    auto rhs_cols = rhs.dim_size(ndims - 1);
+    if (adj_x_) std::swap(lhs_rows, lhs_cols);
+    if (adj_y_) std::swap(rhs_rows, rhs_cols);
+    OP_REQUIRES(ctx, lhs_cols == rhs_rows,
+                errors::InvalidArgument(
+                    "lhs mismatch rhs shape: ", lhs_cols, " vs. ", rhs_rows,
+                    ": ", lhs.shape().DebugString(), " ",
+                    rhs.shape().DebugString(), " ", adj_x_, " ", adj_y_));
+    out_shape.AddDim(lhs_rows);
+    out_shape.AddDim(rhs_cols);
+    Tensor *out = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out));
+    if (out->NumElements() == 0) {
+      return;
+    }
+    if (lhs.NumElements() == 0 || rhs.NumElements() == 0) {
+      functor::SetZeroFunctor<Device, Scalar> f;
+      f(ctx->eigen_device<Device>(), out->flat<Scalar>());
+      return;
+    }
+
+    auto rhs_reshaped = rhs.template flat_inner_dims<Scalar, 3>();
+    auto lhs_reshaped = lhs.template flat_inner_dims<Scalar, 3>();
+    auto out_reshaped = out->template flat_inner_dims<Scalar, 3>();
+    const uint64 M = lhs_reshaped.dimension(adj_x_ ? 2 : 1);
+    const uint64 K = lhs_reshaped.dimension(adj_x_ ? 1 : 2);
+    const uint64 N = rhs_reshaped.dimension(adj_y_ ? 1 : 2);
+
+    std::vector<MKL_INT> m_array(batch_size, M);
+    std::vector<MKL_INT> n_array(batch_size, N);
+    std::vector<MKL_INT> k_array(batch_size, K);
+    std::vector<MKL_INT> lda_array(batch_size, adj_x_ ? M : K);
+    std::vector<MKL_INT> ldb_array(batch_size, adj_y_ ? K : N);
+    std::vector<MKL_INT> ldc_array(batch_size, N);
+    std::vector<MKL_INT> group_size(1, batch_size);
+    std::vector<const Scalar *> a_array;
+    std::vector<const Scalar *> b_array;
+    std::vector<Scalar *> c_array;
+    a_array.reserve(batch_size);
+    b_array.reserve(batch_size);
+    c_array.reserve(batch_size);
+    for (int64 i = 0; i < batch_size; i++) {
+      a_array.push_back(&lhs_reshaped(i, 0, 0));
+      b_array.push_back(&rhs_reshaped(i, 0, 0));
+      c_array.push_back(&out_reshaped(i, 0, 0));
+    }
+
+    MklCblasGemmBatch(CblasRowMajor, adj_x_, adj_y_, &m_array[0], &n_array[0],
+                      &k_array[0], &a_array[0], &lda_array[0], &b_array[0],
+                      &ldb_array[0], &c_array[0], &ldc_array[0], 1,
+                      &group_size[0]);
+  }
+
+ private:
+  bool adj_x_;
+  bool adj_y_;
+
+  void MklCblasGemmBatch(const CBLAS_LAYOUT Layout, const bool TransA,
+                         const bool TransB, const MKL_INT *M_Array,
+                         const MKL_INT *N_Array, const MKL_INT *K_Array,
+                         const float **A_Array, const MKL_INT *lda_Array,
+                         const float **B_Array, const MKL_INT *ldb_Array,
+                         float **C_Array, const MKL_INT *ldc_Array,
+                         const MKL_INT group_count, const MKL_INT *group_size) {
+    std::vector<CBLAS_TRANSPOSE> TransA_Array(
+        group_size[0], TransA ? CblasTrans : CblasNoTrans);
+    std::vector<CBLAS_TRANSPOSE> TransB_Array(
+        group_size[0], TransB ? CblasTrans : CblasNoTrans);
+    std::vector<float> alpha_Array(group_size[0], 1.0);
+    std::vector<float> beta_Array(group_size[0], 0.0);
+    cblas_sgemm_batch(Layout, &TransA_Array[0], &TransB_Array[0], M_Array,
+                      N_Array, K_Array, &alpha_Array[0], A_Array, lda_Array,
+                      B_Array, ldb_Array, &beta_Array[0], C_Array, ldc_Array,
+                      group_count, group_size);
+  }
+
+  void MklCblasGemmBatch(const CBLAS_LAYOUT Layout, const bool TransA,
+                         const bool TransB, const MKL_INT *M_Array,
+                         const MKL_INT *N_Array, const MKL_INT *K_Array,
+                         const double **A_Array, const MKL_INT *lda_Array,
+                         const double **B_Array, const MKL_INT *ldb_Array,
+                         double **C_Array, const MKL_INT *ldc_Array,
+                         const MKL_INT group_count, const MKL_INT *group_size) {
+    std::vector<CBLAS_TRANSPOSE> TransA_array(
+        group_size[0], TransA ? CblasTrans : CblasNoTrans);
+    std::vector<CBLAS_TRANSPOSE> TransB_array(
+        group_size[0], TransB ? CblasTrans : CblasNoTrans);
+    std::vector<double> alpha_Array(group_size[0], 1.0);
+    std::vector<double> beta_Array(group_size[0], 0.0);
+    cblas_dgemm_batch(Layout, &TransA_array[0], &TransB_array[0], M_Array,
+                      N_Array, K_Array, &alpha_Array[0], A_Array, lda_Array,
+                      B_Array, ldb_Array, &beta_Array[0], C_Array, ldc_Array,
+                      group_count, group_size);
+  }
+
+  void MklCblasGemmBatch(const CBLAS_LAYOUT Layout, const bool TransA,
+                         const bool TransB, const MKL_INT *M_Array,
+                         const MKL_INT *N_Array, const MKL_INT *K_Array,
+                         const MKL_Complex8 **A_Array, const MKL_INT *lda_Array,
+                         const MKL_Complex8 **B_Array, const MKL_INT *ldb_Array,
+                         MKL_Complex8 **C_Array, const MKL_INT *ldc_Array,
+                         const MKL_INT group_count, const MKL_INT *group_size) {
+    std::vector<CBLAS_TRANSPOSE> TransA_array(
+        group_size[0], TransA ? CblasConjTrans : CblasNoTrans);
+    std::vector<CBLAS_TRANSPOSE> TransB_array(
+        group_size[0], TransB ? CblasConjTrans : CblasNoTrans);
+    std::vector<MKL_Complex8> alpha_Array(group_size[0], {1.0f, 0.0f});
+    std::vector<MKL_Complex8> beta_Array(group_size[0], {0.0f, 0.0f});
+    cblas_cgemm_batch(
+        Layout, &TransA_array[0], &TransB_array[0], M_Array, N_Array, K_Array,
+        static_cast<const void *>(&alpha_Array[0]),
+        reinterpret_cast<const void **>(A_Array), lda_Array,
+        reinterpret_cast<const void **>(B_Array), ldb_Array,
+        static_cast<const void *>(&beta_Array[0]),
+        reinterpret_cast<void **>(C_Array), ldc_Array, group_count, group_size);
+  }
+
+  void MklCblasGemmBatch(const CBLAS_LAYOUT Layout, const bool TransA,
+                         const bool TransB, const MKL_INT *M_Array,
+                         const MKL_INT *N_Array, const MKL_INT *K_Array,
+                         const MKL_Complex16 **A_Array,
+                         const MKL_INT *lda_Array,
+                         const MKL_Complex16 **B_Array,
+                         const MKL_INT *ldb_Array, MKL_Complex16 **C_Array,
+                         const MKL_INT *ldc_Array, const MKL_INT group_count,
+                         const MKL_INT *group_size) {
+    std::vector<CBLAS_TRANSPOSE> TransA_array(
+        group_size[0], TransA ? CblasConjTrans : CblasNoTrans);
+    std::vector<CBLAS_TRANSPOSE> TransB_array(
+        group_size[0], TransB ? CblasConjTrans : CblasNoTrans);
+    std::vector<MKL_Complex16> alpha_Array(group_size[0], {1.0f, 0.0f});
+    std::vector<MKL_Complex16> beta_Array(group_size[0], {0.0f, 0.0f});
+    cblas_zgemm_batch(
+        Layout, &TransA_array[0], &TransB_array[0], M_Array, N_Array, K_Array,
+        static_cast<const void *>(&alpha_Array[0]),
+        reinterpret_cast<const void **>(A_Array), lda_Array,
+        reinterpret_cast<const void **>(B_Array), ldb_Array,
+        static_cast<const void *>(&beta_Array[0]),
+        reinterpret_cast<void **>(C_Array), ldc_Array, group_count, group_size);
+  }
+};
+
+#define REGISTER_BATCH_MATMUL_MKL(TYPE)                                 \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("BatchMatMul").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
+      BatchMatMulMkl<CPUDevice, TYPE>)
+
+TF_CALL_float(REGISTER_BATCH_MATMUL_MKL);
+TF_CALL_double(REGISTER_BATCH_MATMUL_MKL);
+TF_CALL_complex64(REGISTER_BATCH_MATMUL_MKL);
+TF_CALL_complex128(REGISTER_BATCH_MATMUL_MKL);
+
+}  // end namespace tensorflow
+#endif
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index e6673b2ffb7dc4a2e0127c363b4402c98a023b17..d0175dfd715bcdd2cc89fe8ca5eb7d60410f6562 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -1,11 +1,8 @@
 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
     http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -33,11 +30,22 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+
+using mkldnn::stream;
+using mkldnn::concat;
+#endif
+
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
+// List of TensorShape objects. Used in Concat/Split layers.
+typedef std::vector<TensorShape> TensorShapeList;
+
 enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM };
 
+
 // TODO(intelft) Check if we can reuse existing EigenConcatOp using Mutable
 // reference inputs.
 // --------------------------------------------------------------------------
@@ -55,6 +63,8 @@ class EigenConcatBaseOp : public OpKernel {
   // we need to have empty Compute because Compute is pure virtual function.
   void Compute(OpKernelContext* c) {}
 
+#ifndef INTEL_MKL_DNN
+
   void Compute(OpKernelContext* c, const std::vector<Tensor>& values) {
     const Tensor* concat_dim_tensor;
     const char* axis_attribute_name =
@@ -139,8 +149,89 @@ class EigenConcatBaseOp : public OpKernel {
       ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
     }
   }
+
+#else  // MKL_DNN
+
+void Compute(OpKernelContext* c, const std::vector<Tensor>& values,
+                        const TensorShapeList& input_shapes) {
+    const Tensor* concat_dim_tensor;
+    const char* axis_attribute_name =
+        AxisArgName == NAME_IS_AXIS
+            ? "axis"
+            : AxisArgName == NAME_IS_CONCAT_DIM ? "concat_dim" : "<invalid>";
+    OP_REQUIRES_OK(c, c->input(axis_attribute_name, &concat_dim_tensor));
+    OP_REQUIRES(c, IsLegacyScalar(concat_dim_tensor->shape()),
+                errors::InvalidArgument(
+                    axis_attribute_name,
+                    " tensor should be a scalar integer, but got shape ",
+                    concat_dim_tensor->shape().DebugString()));
+    const int32 concat_dim =
+        internal::SubtleMustCopy(concat_dim_tensor->scalar<int32>()());
+    // Instead of accessing values from context, we use input to Compute.
+    const int N = values.size();
+    const int input_dims = input_shapes[0].dims();
+    const TensorShape& input_shape = input_shapes[0];
+
+    int32 axis = concat_dim < 0 ? concat_dim + input_dims : concat_dim;
+    OP_REQUIRES(c,
+                (0 <= axis && axis < input_dims) ||
+                    (allow_legacy_scalars() && concat_dim == 0),
+                errors::InvalidArgument(
+                    "ConcatOp : Expected concatenating dimensions in the range "
+                    "[",
+                    -input_dims, ", ", input_dims, "), but got ", concat_dim));
+    // Note that we reduce the concat of n-dimensional tensors into a two
+    // dimensional concat. Assuming the dimensions of any input/output
+    // tensor are {x0, x1,...,xn-1, y0, y1,...,ym-1}, where the concat is along
+    // the dimension indicated with size y0, we flatten it to {x, y}, where y =
+    // Prod_i(yi) and x = ((n > 0) ? Prod_i(xi) : 1).
+    ConstMatrixVector inputs_flat;
+    inputs_flat.reserve(N);
+    int64 inputs_flat_dim0 = 1;
+    for (int d = 0; d < axis; ++d) {
+      inputs_flat_dim0 *= input_shape.dim_size(d);
+    }
+    int64 output_concat_dim = 0;
+    const bool input_is_scalar = IsLegacyScalar(input_shape);
+    for (int i = 0; i < N; ++i) {
+      const auto in = values[i];
+      const bool in_is_scalar = IsLegacyScalar(input_shapes[i]);
+      OP_REQUIRES(
+          c, (input_shapes[i].dims() == input_dims) ||
+              (input_is_scalar && in_is_scalar),
+          errors::InvalidArgument(
+              "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
+              input_shape.DebugString(), " vs. shape[", i,
+              "] = ", input_shapes[i].DebugString()));
+      if (in.NumElements() > 0) {
+        int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0;
+        inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+            in.shaped<T, 2>({inputs_flat_dim0, inputs_flat_dim1})));
+      }
+      output_concat_dim += input_shapes[i].dims() > 0 ?
+                           input_shapes[i].dim_size(axis) : 1;
+    }
+
+    TensorShape output_shape(input_shape);
+    if (output_shape.dims() == 0) {
+      output_shape.AddDim(output_concat_dim);
+    } else {
+      output_shape.set_dim(axis, output_concat_dim);
+    }
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output));
+    if (output->NumElements() > 0) {
+      int64 output_dim1 = output->NumElements() / inputs_flat_dim0;
+      auto output_flat = output->shaped<T, 2>({inputs_flat_dim0, output_dim1});
+      ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
+    }
+  }
+
+#endif
 };
 
+#ifndef INTEL_MKL_DNN
+
 // --------------------------------------------------------------------------
 //                      Mkl Concat Op
 // --------------------------------------------------------------------------
@@ -327,6 +418,7 @@ class MklConcatOp : public OpKernel {
     OP_REQUIRES_OK(context, context->status());
   }
 
+
  private:
   typedef struct {
     TensorFormat data_format;
@@ -435,8 +527,284 @@ class MklConcatOp : public OpKernel {
         mkl_tensor->flat<uint8>().data(),
         mkl_tensor->flat<uint8>().size() * sizeof(uint8));
   }
+
+  // overloading methods with input shapes as a list of TensorShape's
+  void CallEigenVersion(OpKernelContext* context, const OpInputList& values,
+                        const TensorShapeList& input_shapes) {
+    CHECK_EQ(values.size(), input_shapes.size());
+
+    std::vector<Tensor> converted_values;
+    for (int i = 0; i < input_shapes.size(); i++) {
+      converted_values.push_back(values[i]);
+    }
+
+    // Call Eigen concat.
+    eigen_concat_op_.Compute(context, converted_values);
+
+    // Set dummy Mkl tensor as output Mkl tensor for this op.
+    MklShape mkl_tensor_mkl_shape;
+    mkl_tensor_mkl_shape.SetMklTensor(false);
+    mkl_tensor_mkl_shape.SetDimensions(4);
+    Tensor* mkl_tensor = nullptr;
+    TensorShape mkl_tensor_tf_shape;
+    mkl_tensor_tf_shape.AddDim(
+        SIZE_OF_MKL_SERIAL_DATA(mkl_tensor_mkl_shape.GetDimension()));
+    int tf_output_index = 0;
+    context->allocate_output(
+        GetTensorMetaDataIndex(tf_output_index, context->num_outputs()),
+        mkl_tensor_tf_shape, &mkl_tensor);
+    mkl_tensor_mkl_shape.SerializeMklShape(
+        mkl_tensor->flat<uint8>().data(),
+        mkl_tensor->flat<uint8>().size() * sizeof(uint8));
+  }
 };
 
+#else
+
+// --------------------------------------------------------------------------
+//                      Mkl Concat Op
+// --------------------------------------------------------------------------
+
+template <typename Device, typename T, AxisArgumentName AxisArgName>
+class MklConcatOp : public OpKernel {
+ private:
+  TensorFormat data_format_;
+  EigenConcatBaseOp<Device, T, AxisArgName> eigen_concat_op_;
+
+ public:
+  typedef std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>
+      ConstMatrixVector;
+
+  explicit MklConcatOp(OpKernelConstruction* c)
+      : OpKernel(c), eigen_concat_op_(c) {}
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      OpInputList input_tensors;
+      GetMklInputList(context, "values", &input_tensors);
+      const int N = input_tensors.size();
+
+      // Get Tensor shapes.
+      std::vector<MklDnnShape> input_shapes(N);
+      GetMklShapeList(context, "values", &input_shapes);
+
+      const Tensor& concat_dim_tensor = (AxisArgName == NAME_IS_CONCAT_DIM)
+                    ? MklGetInput(context, 0) : MklGetInput(context, N);
+      // Sanity checks
+      OP_REQUIRES(context, IsLegacyScalar(concat_dim_tensor.shape()),
+        errors::InvalidArgument(
+            "Concat dim tensor should be a scalar integer, but got shape ",
+            concat_dim_tensor.shape().DebugString()));
+      int32 concat_dim = internal::SubtleMustCopy(
+                           concat_dim_tensor.scalar<int32>()());
+      if (concat_dim < 0) concat_dim = N + concat_dim;
+
+      // check that ranks of all tensors match
+      // and that their shapes match except for concat_dim.
+      int i = 0;
+      bool invoke_eigen = false;
+      bool are_all_mkl_inputs = true, are_all_tf_inputs = true;
+      const TensorShape expected_shape = input_shapes[0].IsMklTensor() ?
+                                         input_shapes[0].GetTfShape() :
+                                         input_tensors[0].shape();
+      size_t expected_dims = expected_shape.dims();
+      for (auto& s : input_shapes) {
+        if (s == expected_shape) {++i; continue;}
+
+        TensorShape s_shape = s.IsMklTensor() ? s.GetTfShape() :
+                      input_tensors[i].shape();
+        size_t s_dims = s_shape.dims();
+
+        OP_REQUIRES(context, s_dims == expected_dims,
+                  errors::InvalidArgument(
+                      "_MklConcatOp : Ranks of all input tensors should match:"
+                      " input dimensions = ",
+                      s_dims, " vs. expected rank = ", expected_dims));
+
+        for (int d = 0; d < expected_dims; ++d) {
+          if (d == concat_dim) continue;
+
+          size_t expected_size = expected_shape.dim_size(d);
+          size_t s_size = s_shape.dim_size(d);
+          OP_REQUIRES(
+            context, expected_size == s_size,
+            errors::InvalidArgument("_MklConcatOp : Dimensions of inputs "
+                    "should match: shape[0][", d, "]= ", expected_size,
+                    " vs. shape[", i, "][", d, "] = ", s_size));
+        }
+
+        if (s.IsMklTensor())
+          are_all_tf_inputs = false;
+        else
+          are_all_mkl_inputs = false;
+
+        if (s_dims != 4) invoke_eigen = true;
+        ++i;
+      }
+
+      // All inputs are not in one format (TF or MKL). This is mixed input case.
+      // We can potentially optimize this case by converting all TF inputs
+      // to Mkl format. But currently, we fall to Eigen for this case.
+      // It may be possible to convert inputs that in TF format to Mkl
+      // format and avoid calling eigen version.
+      if (!are_all_tf_inputs && !are_all_mkl_inputs) invoke_eigen = true;
+
+      // Temporary fallback to Eigen until MKLDNN Concat performance
+      // is improved. To be removed.
+      invoke_eigen = true;
+
+      // Call Eigen library
+      if (invoke_eigen) {
+        TensorShapeList tf_input_shapes;
+        i = 0;
+        for (auto& s : input_shapes) {
+          TensorShape s_shape = s.IsMklTensor() ? s.GetTfShape() :
+                                input_tensors[i].shape();
+          tf_input_shapes.push_back(s_shape);
+          ++i;
+        }
+        CallEigenVersion(context, input_tensors, tf_input_shapes);
+        return;
+      }
+
+      memory::dims dst_dims;
+      if (are_all_mkl_inputs)
+        dst_dims = TFShapeToMklDnnDims(input_shapes[0].GetTfShape());
+      else
+        // When all the inputs are in Tensorflow format, we don't know
+        // what is the input data format. In that case, we just use
+        // output format that is same as input formats.
+        dst_dims = TFShapeToMklDnnDims(input_tensors[0].shape());
+
+      std::vector<memory::primitive_desc> srcs_pd;
+      std::vector<MklDnnData<T>> srcs(N, MklDnnData<T>(&cpu_engine));
+      int64 dst_concat_dim_size = 0;
+      for (int k =0; k < N; k++) {
+        bool is_mkl_tensor = input_shapes[k].IsMklTensor();
+        memory::dims src_dims;
+
+        // Same comment as dst_dims for src_dims.
+        src_dims = (is_mkl_tensor) ?
+                   TFShapeToMklDnnDims(input_shapes[k].GetTfShape()) :
+                   TFShapeToMklDnnDims(input_tensors[k].shape());
+
+        dst_concat_dim_size += src_dims[concat_dim];
+        auto src_md = is_mkl_tensor ? input_shapes[k].GetMklLayout() :
+          // It does not matter what data format we use here (NHWC or NCHW).
+          // We just need to ensure that output of Concat uses same data format
+          // as input.
+                  memory::desc(src_dims, MklDnnType<T>(), memory::format::nhwc);
+
+        srcs[k].SetUsrMem(src_md, &input_tensors[k]);
+        auto src_mpd = srcs[k].GetUsrMemPrimDesc();
+        srcs_pd.push_back(src_mpd);
+      }
+      dst_dims[concat_dim] = dst_concat_dim_size;
+
+      MklDnnData<T> dst(&cpu_engine);
+      memory::desc dst_md({}, memory::data_undef, memory::format_undef);
+      memory::dims dst_dims_in_nchw;
+      if (are_all_mkl_inputs) {
+        // Since we are passing a specific format for destination,
+        // we need to have dst_dims in MklDnn order (NCHW).
+        auto orig_tf_format = input_shapes[0].GetTfDataFormat();
+        dst_dims_in_nchw = MklDnnDimsInNCHW(dst_dims,
+                               MklDnnDataFormatToTFDataFormat(orig_tf_format));
+        // We will set the output in the same format as input to avoid layout
+        // conversions.
+        // Currently we are setting dst format same as input format.
+        // See if we can make this choice in a better way.
+        dst_md = memory::desc(dst_dims_in_nchw, MklDnnType<T>(),
+                 (memory::format) input_shapes[0].GetMklLayout().data.format);
+      } else {
+        // Again, format does not matter here. We just need to make it same as
+        // input format.
+        dst_md = memory::desc(dst_dims, MklDnnType<T>(), memory::format::nhwc);
+      }
+
+      std::vector<primitive::at> inputs;
+      for (int k=0; k < input_tensors.size(); k++)
+        inputs.push_back(srcs[k].GetOpMem());
+
+      // If all inputs are in MKL format, then meaning of concat_dim needs to
+      // change. Value of concat_dim is tied to input Tensorflow data format
+      // (NHWC or NCHW). MklDnn dimensions are in NCHW order. So if Tensorflow
+      // tensors are in NCHW order, then concat_dim semantics is preserved.
+      // But ifinput tensors are in NHWC order, then semantics need to change.
+      // E.g., if we are concatinating over Channel (dimension 3 for NHWC),
+      // then since MklDnn order is NCHW, concat_dim needs to be 1.
+      if (are_all_mkl_inputs)
+        concat_dim = input_shapes[0].TfDimIdx(concat_dim);
+
+      auto concat_pd = concat::primitive_desc(dst_md, concat_dim, srcs_pd);
+
+      MklDnnShape dnn_shape_dst;
+      TensorShape tf_shape_dst;
+      Tensor* dst_tensor = nullptr;
+      if (are_all_mkl_inputs) {
+        dnn_shape_dst.SetMklTensor(true);
+        auto dst_pd = concat_pd.dst_primitive_desc();
+        dnn_shape_dst.SetMklLayout(&dst_pd);
+        dnn_shape_dst.SetElemType(MklDnnType<T>());
+        dnn_shape_dst.SetTfLayout(dst_dims.size(), dst_dims_in_nchw,
+                                  input_shapes[0].GetTfDataFormat());
+        tf_shape_dst.AddDim((dst_pd.get_size() / sizeof(T)));
+      } else {
+        dnn_shape_dst.SetMklTensor(false);
+        tf_shape_dst = MklDnnDimsToTFShape(dst_dims);
+      }
+      AllocateOutputSetMklShape(context, 0, &dst_tensor,
+                                tf_shape_dst, dnn_shape_dst);
+      CHECK_NOTNULL(dst_tensor);
+
+      dst_md = dnn_shape_dst.IsMklTensor() ?
+               dnn_shape_dst.GetMklLayout() : dst_md;
+      dst.SetUsrMem(dst_md, dst_tensor);
+
+      auto concat_op = concat(concat_pd, inputs, dst.GetOpMem());
+      std::vector<primitive> net;
+      net.push_back(concat_op);
+      stream(stream::kind::eager).submit(net).wait();
+    } catch (mkldnn::error &e) {
+        string error_msg = "Status: " + std::to_string(e.status) +
+               ", message: " + string(e.message) + ", in file " +
+               string(__FILE__) + ":" + std::to_string(__LINE__);
+        OP_REQUIRES_OK(context, errors::Aborted(
+                "Operation received an exception:", error_msg));
+    }
+  }
+
+  void CallEigenVersion(OpKernelContext* context, const OpInputList& values,
+                        const TensorShapeList& input_shapes) {
+    CHECK_EQ(values.size(), input_shapes.size());
+
+    std::vector<Tensor> converted_values;
+    for (int i = 0; i < input_shapes.size(); i++)
+      converted_values.push_back(values[i]);
+
+    // Call Eigen concat.
+    eigen_concat_op_.Compute(context, converted_values, input_shapes);
+
+    // Set output Mkl tensor for this op.
+    MklDnnShape dnn_shape_output;
+    dnn_shape_output.SetMklTensor(false);
+    dnn_shape_output.SetDimensions(4);
+    Tensor* output_tensor = nullptr;
+    TensorShape tf_shape_output;
+    tf_shape_output.AddDim(
+        dnn_shape_output.GetSerializeBufferSize());
+    context->allocate_output(
+        GetTensorMetaDataIndex(0, context->num_outputs()),
+        tf_shape_output, &output_tensor);
+    dnn_shape_output.SerializeMklDnnShape(
+        output_tensor->flat<uint8>().data(),
+        output_tensor->flat<uint8>().size() * sizeof(uint8));
+  }
+};
+
+#endif
+
 /* Use optimized concat for float type only */
 #define REGISTER_MKL_CPU(type)                                              \
   REGISTER_KERNEL_BUILDER(Name("_MklConcat")                                \
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index f291281108d36465ef670cb990714dbb8a0a5715..793fa24d992723c10317b01a70134dcd4d5066db 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -47,11 +47,8 @@ limitations under the License.
 
 using mkldnn::stream;
 using mkldnn::prop_kind;
-
-using mkldnn::convolution_forward;
 using mkldnn::convolution_backward_weights;
-using mkldnn::convolution_direct;
-
+using mkldnn::memory;
 #endif
 
 namespace tensorflow {
@@ -426,183 +423,229 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
   TensorFormat data_format_;
 };
 
+#define REGISTER_MKL_FILTER_KERNELS(T)                              \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilter")          \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+              MklConv2DCustomBackpropFilterOp<CPUDevice, T>);
+TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
+#undef REGISTER_MKL_FILTER_KERNELS
+
 #else
 
-template <typename Device, class T>
-class MklConv2DCustomBackpropFilterOp : public OpKernel {
+template <typename Device, class T, bool biasEnabled>
+class MklConv2DCustomBackpropFilterOp :
+  public MklConv2DBackpropCommonOp<Device, T> {
  public:
   explicit MklConv2DCustomBackpropFilterOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    string data_format;
-    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
-    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
-                errors::InvalidArgument("Invalid data format"));
+      : MklConv2DBackpropCommonOp<Device, T>(context) { }
+  ~MklConv2DCustomBackpropFilterOp() {}
 
-    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
-    int stride_n = GetTensorDim(strides_, data_format_, 'N');
-    int stride_c = GetTensorDim(strides_, data_format_, 'C');
-    OP_REQUIRES(
-        context, (stride_n == 1 && stride_c == 1),
-        errors::InvalidArgument("Current implementation does not yet support "
-                                "strides in the batch and depth dimensions."));
-    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ private:
+  void ValidateMklShapes(const MklDnnShape& input_mkl_shape,
+                         const MklDnnShape& filter_mkl_shape,
+                         const MklDnnShape& obp_mkl_shape) {
+    CHECK(!filter_mkl_shape.IsMklTensor())
+      << "Conv2DBackpropFilter: filter should not be in MKL Layout";
   }
 
-  void Compute(OpKernelContext* context) override {
-    try {
-      auto cpu_engine = engine(engine::cpu, 0);
+  size_t GetInputTensorIndexWithSizes() { return 1; /* filter index */ }
 
-      MklDnnData<T> input(&cpu_engine);
-      MklDnnData<T> outbackprop(&cpu_engine);
-      MklDnnData<T> output(&cpu_engine);
+  TensorShape MakeInputTfShape(OpKernelContext* context,
+                               const Tensor& input_tensor) {
+    size_t input_idx = 0;
+    return GetTfShape(context, input_idx);
+  }
 
-      // Input tensors
-      const Tensor& input_tensor = MklGetInput(context, 0);
-      const Tensor& filter_tensor = MklGetInput(context, 1);
-      const Tensor& obp_tensor = MklGetInput(context, 2);  // Outbackprop
+  TensorShape MakeFilterTfShape(OpKernelContext* context,
+                                const Tensor& filter_tensor) {
+    TensorShape filter_tf_shape;
+    CHECK_EQ(TensorShapeUtils::IsVector(filter_tensor.shape()), true);
+    CHECK_EQ(TensorShapeUtils::MakeShape(
+             filter_tensor.vec<int32>(), &filter_tf_shape).ok(), true);
+    return filter_tf_shape;
+  }
 
-      // Generate input shapes.
-      TensorShape filter_shape;
-      OP_REQUIRES(context, TensorShapeUtils::IsVector(filter_tensor.shape()),
-        errors::InvalidArgument(
-              "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ",
-              filter_tensor.dims()));
-      OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
-                        filter_tensor.vec<int32>(), &filter_shape));
-      TensorShape input_shape = input_tensor.shape();
-      TensorShape obp_shape = obp_tensor.shape();
-
-      // By default, all dims are in MKL order. Only dims in TF order
-      // are those with prefix tf_order.
-      memory::dims obp_dims, fwd_input_dims, fwd_filter_dims;
-      memory::dims padding_l, padding_r, strides, fwd_output_dims;
-      memory::dims fwd_output_dims_tf_order;
-
-      // Get forward convolution parameters.
-      MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
-      conv_utl.GetConvFwdSizesInMklOrder(input_shape, filter_shape,
-                                         &fwd_input_dims, &fwd_filter_dims,
-                                         &strides,
-                                         &fwd_output_dims_tf_order,
-                                         &fwd_output_dims,
-                                         &padding_l, &padding_r);
-      if (!context->status().ok()) return;
-
-      // Create Convolution forward descriptor since Convolution backward
-      // API needs it. For that, we first need to create input, filter
-      // and output memory descriptors.
-      auto mkl_data_format = TFDataFormatToMklDnnDataFormat(data_format_);
-      auto fwd_src_md = memory::desc(fwd_input_dims, MklDnnType<T>(),
-                                     mkl_data_format);
-      auto fwd_filter_md = memory::desc(fwd_filter_dims, MklDnnType<T>(),
-                                        memory::format::hwio);
-      auto fwd_out_md = memory::desc(fwd_output_dims, MklDnnType<T>(),
-                                     mkl_data_format);
-      auto fwd_desc = convolution_forward::desc(prop_kind::forward,
-            convolution_direct, fwd_src_md, fwd_filter_md, fwd_out_md,
-            strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
-      auto fwd_pd = convolution_forward::primitive_desc(fwd_desc, cpu_engine);
-
-      // Allocate output tensor and shape
-      // TODO(nhasabni): Update this when support for MKL layout is added.
-      // Shape of output of Conv2DBackpropInput is same as 'input' of Conv2D.
-      TensorShape tf_output_shape(filter_shape);
-      MklShape mkl_output_mkl_shape;
-      mkl_output_mkl_shape.SetMklTensor(false);
-      Tensor* output_tensor = nullptr;
-      AllocateOutputSetMklShape(context, 0, &output_tensor, tf_output_shape,
-                                mkl_output_mkl_shape);
-
-      // Create memory for user data.
-      // Describe how the inputs and outputs of Convolution look like. Also
-      // specify buffers containing actual input and output data.
-      // Although input shape required is in MKL-DNN order, the layout is
-      // Tensorflow's layout (NHWC or NCHW depending on data format).
-      input.SetUsrMem(fwd_input_dims, mkl_data_format, &input_tensor);
-      // Outbackprop shape is NHWC or NCHW depending on data format. Since
-      // GetInputSizeInMklOrder function returns size in that order we just use
-      // use that function directly.
-      conv_utl.GetInputSizeInMklOrder(obp_shape, &obp_dims);
-      if (!context->status().ok()) return;
-      outbackprop.SetUsrMem(obp_dims, mkl_data_format, &obp_tensor);
-      // Although output shape required is in MKL-DNN order,
-      // layout is Tensorflow's filter layout (HWIO)
-      // Shape of output of Conv2DBackpropInput is same as shape of filter.
-      memory::dims bwd_output_dims = fwd_filter_dims;
-      output.SetUsrMem(bwd_output_dims, memory::format::hwio, output_tensor);
-
-      // Create memory descriptors for convolution data w/ no specified format.
-      input.SetOpMemDesc(fwd_input_dims, memory::format::any);
-      outbackprop.SetOpMemDesc(obp_dims, memory::format::any);
-      output.SetOpMemDesc(bwd_output_dims, memory::format::any);
-
-      // Create convolution backward weights primitive.
-      auto bwd_desc = convolution_backward_weights::desc(convolution_direct,
-                          input.GetOpMemDesc(), output.GetOpMemDesc(),
-                          outbackprop.GetOpMemDesc(), strides, padding_l,
-                          padding_r, TFPaddingToMklDnnPadding(padding_));
-
-      auto bwd_pd = convolution_backward_weights::primitive_desc(bwd_desc,
-                                                              cpu_engine,
-                                                              fwd_pd);
-
-      PrepareAndExecutePrimitive(bwd_pd, &input, &outbackprop, &output);
-    } catch (mkldnn::error &e) {
-     string error_msg = "Status: " + std::to_string(e.status) +
-                       ", message: " + string(e.message) +
-                       ", in file " + string(__FILE__) + ":" +
-                       std::to_string(__LINE__);
-     OP_REQUIRES_OK(context, errors::Aborted("Operation received an exception:",
-                                            error_msg));
+  const memory::dims& GetOutputDims(const memory::dims& fwd_input_dims,
+                                    const memory::dims& fwd_filter_dims) {
+    // Shape of output of Conv2DBackpropFilter is same as shape of filter.
+    return fwd_filter_dims;
+  }
+
+  memory::format GetOutputFormat(const memory::format data_format) {
+    // Output layout is Tensorflow's filter layout (HWIO).
+    return memory::format::hwio;
+  }
+
+  void CreatePrimitive(OpKernelContext* context,
+                       const engine& cpu_engine,
+                       const convolution_forward::primitive_desc& conv_fwd_pd,
+                       MklDnnData<T>* input, MklDnnData<T>* filter,
+                       MklDnnData<T>* outbackprop, MklDnnData<T>* output,
+                       Tensor** output_tensor,
+                       const memory::dims& strides,
+                       const memory::dims& padding_l,
+                       const memory::dims& padding_r,
+                       padding_kind padding,
+                       const memory::dims& bwd_output_dims,
+                       memory::format bwd_output_format) {
+    CHECK_NOTNULL(context);
+    CHECK_NOTNULL(input);
+    CHECK_NOTNULL(filter);
+    CHECK_NOTNULL(outbackprop);
+    CHECK_NOTNULL(output);
+    CHECK_NOTNULL(output_tensor);
+
+    MklDnnData<T>* bias_grad = nullptr;
+    int depth = 0;
+    if (biasEnabled) {
+      // Data structure for bias_grad
+      bias_grad = new MklDnnData<T> (&cpu_engine);
+      TensorShape obp_tf_shape = GetTfShape(context, 2);
+      depth = (MklConv2DBackpropCommonOp<Device, T>::GetTFDataFormat()
+                == FORMAT_NCHW) ?
+          obp_tf_shape.dim_size(1) : obp_tf_shape.dim_size(3);
+      memory::dims bias_grad_dims = {depth};
+      bias_grad->SetOpMemDesc(bias_grad_dims, memory::format::x);
+    }
+
+    // Create convolution backward weights primitive.
+    auto bwd_desc = (biasEnabled && (bias_grad != nullptr))?
+        convolution_backward_weights::desc(convolution_direct,
+                                input->GetOpMemDesc(), output->GetOpMemDesc(),
+                                bias_grad->GetOpMemDesc(),
+                                outbackprop->GetOpMemDesc(), strides, padding_l,
+                                padding_r, padding) :
+        convolution_backward_weights::desc(convolution_direct,
+                          input->GetOpMemDesc(), output->GetOpMemDesc(),
+                          outbackprop->GetOpMemDesc(), strides, padding_l,
+                          padding_r, padding);
+
+    auto bwd_pd = convolution_backward_weights::primitive_desc(bwd_desc,
+                                                            cpu_engine,
+                                                            conv_fwd_pd);
+
+    // Allocate output tensor.
+    AllocateOutputTensor(context, bwd_pd, bwd_output_dims,
+                         bwd_output_format, output_tensor);
+
+    CHECK_NOTNULL(*output_tensor);
+    // Set buffer handle using allocated output tensor.
+    output->SetUsrMemDataHandle(*output_tensor);
+
+    if (biasEnabled && (bias_grad != nullptr)) {
+      // Allocate bias_grad tensor
+      TensorShape bias_grad_shape({depth});
+      Tensor* bias_grad_tensor = nullptr;
+      AllocateBiasGradTensor(context, bias_grad_shape, &bias_grad_tensor);
+      memory::dims bias_grad_dims = {depth};
+      // Since Bias is 1D, we use format::x from MKLDNN to represent it.
+      auto bias_grad_md = memory::desc({bias_grad_dims}, MklDnnType<T>(),
+                                       memory::format::x);
+      bias_grad->SetUsrMem(bias_grad_md, bias_grad_tensor);
+      bias_grad->SetUsrMemDataHandle(bias_grad_tensor);
+    }
+
+    if (biasEnabled && (bias_grad != nullptr)) {
+      PrepareAndExecutePrimitive(bwd_pd, input, outbackprop, output, bias_grad);
+    } else {
+      PrepareAndExecutePrimitive(bwd_pd, input, outbackprop, output);
     }
   }
 
- private:
-  std::vector<int32> strides_;
-  Padding padding_;
-  TensorFormat data_format_;
+  // Allocate output tensor.
+  void AllocateOutputTensor(OpKernelContext* context,
+                  const convolution_backward_weights::primitive_desc& conv_pd,
+                  const memory::dims& output_dims_mkl_order,
+                  memory::format output_tf_format, Tensor** output_tensor) {
+      CHECK_NOTNULL(output_tensor);
+
+      // For BackpropFilter, we convert the output tensor back in Tensorflow
+      // layout. Because typically, BackpropFilter is the last operator in the
+      // graph that emit filter gradient that is provided to ApplyGradient
+      // method to update the filter. But it may be possible to eliminate this
+      // by forwarding filter in MKL layout if we support ApplyGradient method
+      // for MKL layout propagation.
+      MklDnnShape output_mkl_shape;
+      output_mkl_shape.SetMklTensor(false);
+      // output_dims_mkl_order is in OIHW format.
+      // Allocate shape of TF tensor in HWIO format.
+      TensorShape output_tf_shape({output_dims_mkl_order[MklDnnDims::Dim_H],
+                                   output_dims_mkl_order[MklDnnDims::Dim_W],
+                                   output_dims_mkl_order[MklDnnDims::Dim_I],
+                                   output_dims_mkl_order[MklDnnDims::Dim_O]});
+      AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape,
+                                output_mkl_shape);
+  }
+
+  // Allocate tensor for bias grad
+  void AllocateBiasGradTensor(OpKernelContext* context,
+                              const TensorShape& bias_grad_shape,
+                              Tensor** bias_grad_tensor) {
+    CHECK_NOTNULL(bias_grad_tensor);
+
+    MklDnnShape bias_grad_mkl_shape;
+    bias_grad_mkl_shape.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, 1, bias_grad_tensor, bias_grad_shape,
+                              bias_grad_mkl_shape);
+  }
 
   // Prepare and execute net - checks for input and output reorders.
   void PrepareAndExecutePrimitive(
                   const convolution_backward_weights::primitive_desc& conv_pd,
                   MklDnnData<T>* input, MklDnnData<T>* obp,
-                  MklDnnData<T>* output) {
+                  MklDnnData<T>* output, MklDnnData<T>* bias_grad = nullptr) {
     // Create reorders between user layout and MKL layout if it is needed and
     // add it to the net before convolution.
     std::vector<primitive> net;
     input->CheckReorderToOpMem(conv_pd.src_primitive_desc(), &net);
     obp->CheckReorderToOpMem(conv_pd.diff_dst_primitive_desc(), &net);
 
-    // Memory for output of convolution. Since we may need reorder on the
-    // output side, we will prepare reorder primitive in case output
-    // reorder to user memory is required.
+    // For BackpropFilter, we convert the output tensor back in Tensorflow
+    // layout.
     bool output_reorder_required = output->PrepareReorderToUserMemIfReq(
                                       conv_pd.diff_weights_primitive_desc());
 
-    net.push_back(convolution_backward_weights(conv_pd, input->GetOpMem(),
-                                    obp->GetOpMem(), output->GetOpMem()));
+    if (biasEnabled && (bias_grad != nullptr)) {
+      net.push_back(convolution_backward_weights(conv_pd, input->GetOpMem(),
+                                      obp->GetOpMem(), output->GetOpMem(),
+                                      bias_grad->GetOpMem()));
+    } else {
+      net.push_back(convolution_backward_weights(conv_pd, input->GetOpMem(),
+                                      obp->GetOpMem(), output->GetOpMem()));
+    }
 
-    // Insert reorder primitive in the net for output reorder if reorder is
-    // required.
     if (output_reorder_required) {
       output->InsertReorderToUserMem(&net);
     }
 
-    // Handle output reorder
     stream(stream::kind::eager).submit(net).wait();
   }
 };
-#endif
 
 #define REGISTER_MKL_FILTER_KERNELS(T)                              \
   REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilter")          \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
                               .Label(mkl_op_registry::kMklOpLabel), \
-                          MklConv2DCustomBackpropFilterOp<CPUDevice, T>);
+              MklConv2DCustomBackpropFilterOp<CPUDevice, T, false>);\
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilterWithBias")  \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+              MklConv2DCustomBackpropFilterOp<CPUDevice, T, true>); \
+  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias")  \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+              MklDummyOp<CPUDevice, T>);
 
 TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
 #undef REGISTER_MKL_FILTER_KERNELS
+
+#endif  // INTEL_MKL_DNN
+
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index 4a47d0463ef778430d59fed32202bff02233a9e9..df51df963881b33c08fbd6486574e5e5f8c3d2ff 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -49,9 +49,6 @@ limitations under the License.
 
 using mkldnn::stream;
 using mkldnn::prop_kind;
-
-using mkldnn::convolution_forward;
-using mkldnn::convolution_direct;
 using mkldnn::convolution_backward_data;
 #endif
 
@@ -362,143 +359,117 @@ class MklConv2DCustomBackpropInputOp : public OpKernel {
 #else
 
 template <typename Device, class T>
-class MklConv2DCustomBackpropInputOp : public OpKernel {
+class MklConv2DCustomBackpropInputOp :
+  public MklConv2DBackpropCommonOp<Device, T> {
  public:
-  ~MklConv2DCustomBackpropInputOp() {}
   explicit MklConv2DCustomBackpropInputOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    string data_format_str;
-    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
-    OP_REQUIRES(context, FormatFromString(data_format_str, &data_format_),
-                errors::InvalidArgument("Invalid data format"));
-    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
-    int stride_n = GetTensorDim(strides_, data_format_, 'N');
-    int stride_c = GetTensorDim(strides_, data_format_, 'C');
-    OP_REQUIRES(
-        context, (stride_n == 1 && stride_c == 1),
-        errors::InvalidArgument("Current implementation does not yet support "
-                                "strides in the batch and depth dimensions."));
+      : MklConv2DBackpropCommonOp<Device, T>(context) { }
+  ~MklConv2DCustomBackpropInputOp() {}
 
-    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ private:
+  void ValidateMklShapes(const MklDnnShape& input_mkl_shape,
+                         const MklDnnShape& filter_mkl_shape,
+                         const MklDnnShape& obp_mkl_shape) {
+    // Tensor that feeds to 'Input' slot of BackpropInput is always just a shape
+    // of the Tensor and never an actual tensor. So it will never be in MKL
+    // layout.
+    CHECK(!input_mkl_shape.IsMklTensor())
+      << "Conv2DBackpropInput: input should not be in MKL Layout";
   }
 
-  void Compute(OpKernelContext* context) override {
-    try {
-      auto cpu_engine = engine(engine::cpu, 0);
+  size_t GetInputTensorIndexWithSizes() { return 0; /* input index */ }
 
-      MklDnnData<T> filter(&cpu_engine);
-      MklDnnData<T> outbackprop(&cpu_engine);
-      MklDnnData<T> output(&cpu_engine);
+  TensorShape MakeInputTfShape(OpKernelContext* context,
+                               const Tensor& input_tensor) {
+    TensorShape input_tf_shape;
+    CHECK_EQ(TensorShapeUtils::IsVector(input_tensor.shape()), true);
+    CHECK_EQ(TensorShapeUtils::MakeShape(input_tensor.vec<int32>(),
+                                         &input_tf_shape).ok(), true);
+    return input_tf_shape;
+  }
 
-      // Input tensors
-      const Tensor& input_tensor = MklGetInput(context, 0);
-      const Tensor& filter_tensor = MklGetInput(context, 1);
-      const Tensor& obp_tensor = MklGetInput(context, 2);  // Outbackprop
+  TensorShape MakeFilterTfShape(OpKernelContext* context,
+                                const Tensor& filter_tensor) {
+    size_t filter_idx = 1;
+    return GetTfShape(context, filter_idx);
+  }
 
-      // Generate input shape.
-      TensorShape input_shape;
-      OP_REQUIRES(context, TensorShapeUtils::IsVector(input_tensor.shape()),
-        errors::InvalidArgument(
-              "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
-              input_tensor.dims()));
-      OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
-                        input_tensor.vec<int32>(), &input_shape));
-      TensorShape filter_shape = filter_tensor.shape();
-      TensorShape obp_shape = obp_tensor.shape();
-
-      // By default, all dims are in MKL order. Only dims in TF order
-      // are those with prefix tf_order.
-      memory::dims obp_dims, fwd_input_dims, fwd_filter_dims;
-      memory::dims padding_l, padding_r, strides, fwd_output_dims;
-      memory::dims fwd_output_dims_tf_order;
-
-      // Get forward convolution parameters.
-      MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
-      conv_utl.GetConvFwdSizesInMklOrder(input_shape, filter_shape,
-                                         &fwd_input_dims, &fwd_filter_dims,
-                                         &strides,
-                                         &fwd_output_dims_tf_order,
-                                         &fwd_output_dims,
-                                         &padding_l, &padding_r);
-      if (!context->status().ok()) return;
-
-      // Create Convolution forward descriptor since Convolution backward
-      // API needs it. For that, we first need to create input, filter
-      // and output memory descriptors.
-      auto mkl_data_format = TFDataFormatToMklDnnDataFormat(data_format_);
-      auto fwd_src_md = memory::desc(fwd_input_dims, MklDnnType<T>(),
-                                     mkl_data_format);
-      auto fwd_filter_md = memory::desc(fwd_filter_dims, MklDnnType<T>(),
-                                        memory::format::hwio);
-      auto fwd_out_md = memory::desc(fwd_output_dims, MklDnnType<T>(),
-                                     mkl_data_format);
-      auto fwd_desc = convolution_forward::desc(prop_kind::forward,
-            convolution_direct, fwd_src_md, fwd_filter_md, fwd_out_md,
-            strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
-      auto fwd_pd = convolution_forward::primitive_desc(fwd_desc, cpu_engine);
-
-      // Allocate output tensor and shape
-      // TODO(nhasabni): Update this when support for MKL layout is added.
-      // Shape of output of Conv2DBackpropInput is same as 'input' of Conv2D.
-      TensorShape tf_output_shape(input_shape);
-      MklShape mkl_output_mkl_shape;
-      mkl_output_mkl_shape.SetMklTensor(false);
-      Tensor* output_tensor = nullptr;
-      AllocateOutputSetMklShape(context, 0, &output_tensor, tf_output_shape,
-                                mkl_output_mkl_shape);
-
-      // Create memory for user data.
-      // Describe how the inputs and outputs of Convolution look like. Also
-      // specify buffers containing actual input and output data.
-      // Although input shape required is in MKL-DNN order, the layout is
-      // Tensorflow's layout (NHWC or NCHW depending on data format).
-      // Although filter shape (filter_dims) required is in MKL-DNN order,
-      // the layout is Tensorflow's layout (HWIO).
-      // Shape of Conv2DBackpropInput's filter is same as that of Conv2D filter.
-      filter.SetUsrMem(fwd_filter_dims, memory::format::hwio, &filter_tensor);
-      // Outbackprop shape is NHWC or NCHW depending on data format. Since
-      // GetInputSizeInMklOrder function returns size in that order we just use
-      // use that function directly.
-      conv_utl.GetInputSizeInMklOrder(obp_shape, &obp_dims);
-      if (!context->status().ok()) return;
-      outbackprop.SetUsrMem(obp_dims, mkl_data_format, &obp_tensor);
-      // Although output shape required is in MKL-DNN order,
-      // layout is Tensorflow's layout (NHWC or NCHW depending on data format).
-      // Shape of output of Conv2DBackpropInput is same as shape of 'input'
-      // of Conv2D.
-      memory::dims bwd_output_dims = fwd_input_dims;
-      output.SetUsrMem(bwd_output_dims, mkl_data_format, output_tensor);
-
-      // Create memory descriptors for convolution data w/ no specified format.
-      filter.SetOpMemDesc(fwd_filter_dims, memory::format::any);
-      outbackprop.SetOpMemDesc(obp_dims, memory::format::any);
-      output.SetOpMemDesc(bwd_output_dims, memory::format::any);
-
-      // Create convolution backward data primitive.
-      auto bwd_desc = convolution_backward_data::desc(convolution_direct,
-                          output.GetOpMemDesc(), filter.GetOpMemDesc(),
-                          outbackprop.GetOpMemDesc(), strides, padding_l,
-                          padding_r, TFPaddingToMklDnnPadding(padding_));
-
-      auto bwd_pd = convolution_backward_data::primitive_desc(bwd_desc,
-                                                              cpu_engine,
-                                                              fwd_pd);
-
-      PrepareAndExecutePrimitive(bwd_pd, &filter, &outbackprop, &output);
-    } catch (mkldnn::error &e) {
-     string error_msg = "Status: " + std::to_string(e.status) +
-                       ", message: " + string(e.message) +
-                       ", in file " + string(__FILE__) + ":" +
-                       std::to_string(__LINE__);
-     OP_REQUIRES_OK(context, errors::Aborted("Operation received an exception:",
-                                            error_msg));
-    }
+  const memory::dims& GetOutputDims(const memory::dims& fwd_input_dims,
+                                    const memory::dims& fwd_filter_dims) {
+    // Output Shape of Conv2DBackpropInput is same as shape of Conv2D 'input'.
+    return fwd_input_dims;
   }
 
- private:
-  std::vector<int32> strides_;
-  Padding padding_;
-  TensorFormat data_format_;
+  memory::format GetOutputFormat(const memory::format data_format) {
+    // Output layout is Tensorflow's layout in data format order.
+    return data_format;
+  }
+
+  void CreatePrimitive(OpKernelContext* context,
+                       const engine& cpu_engine,
+                       const convolution_forward::primitive_desc& conv_fwd_pd,
+                       MklDnnData<T>* input, MklDnnData<T>* filter,
+                       MklDnnData<T>* outbackprop, MklDnnData<T>* output,
+                       Tensor** output_tensor,
+                       const memory::dims& strides,
+                       const memory::dims& padding_l,
+                       const memory::dims& padding_r,
+                       padding_kind padding,
+                       const memory::dims& bwd_output_dims,
+                       memory::format bwd_output_format) {
+    CHECK_NOTNULL(context);
+    CHECK_NOTNULL(input);
+    CHECK_NOTNULL(filter);
+    CHECK_NOTNULL(outbackprop);
+    CHECK_NOTNULL(output);
+    CHECK_NOTNULL(output_tensor);
+
+    // Create convolution backward data primitive.
+    auto bwd_desc = convolution_backward_data::desc(convolution_direct,
+                      output->GetOpMemDesc(), filter->GetOpMemDesc(),
+                      outbackprop->GetOpMemDesc(), strides, padding_l,
+                      padding_r, padding);
+
+    auto bwd_pd = convolution_backward_data::primitive_desc(bwd_desc,
+                                                          cpu_engine,
+                                                          conv_fwd_pd);
+
+
+    // Allocate output tensor in TensorFlow and MKL layout.
+    AllocateOutputTensor(context, bwd_pd, bwd_output_dims,
+                         bwd_output_format, output_tensor);
+    CHECK_NOTNULL(*output_tensor);
+    // Set buffer handle using allocated output tensor.
+    output->SetUsrMemDataHandle(*output_tensor);
+
+    PrepareAndExecutePrimitive(bwd_pd, filter, outbackprop, output);
+  }
+
+  // Allocate output tensor.
+  void AllocateOutputTensor(OpKernelContext* context,
+                  const convolution_backward_data::primitive_desc& conv_pd,
+                  const memory::dims& output_dims_mkl_order,
+                  memory::format output_tf_format, Tensor** output_tensor) {
+      CHECK_NOTNULL(output_tensor);
+
+      // Output primitive descriptor for backward data is diff_src.
+      auto dst_pd = conv_pd.diff_src_primitive_desc();
+
+      // Allocate shape of Mkl tensor.
+      MklDnnShape output_mkl_shape;
+      output_mkl_shape.SetMklTensor(true);
+      output_mkl_shape.SetMklLayout(&dst_pd);
+      output_mkl_shape.SetElemType(MklDnnType<T>());
+      output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
+                                   output_dims_mkl_order, output_tf_format);
+
+      // Allocate shape of TF tensor.
+      TensorShape output_tf_shape;
+      output_tf_shape.AddDim(dst_pd.get_size() / sizeof(T));
+
+      AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape,
+                                output_mkl_shape);
+  }
 
   // Prepare and execute net - checks for input and output reorders.
   void PrepareAndExecutePrimitive(
@@ -511,22 +482,9 @@ class MklConv2DCustomBackpropInputOp : public OpKernel {
     filter->CheckReorderToOpMem(conv_pd.weights_primitive_desc(), &net);
     obp->CheckReorderToOpMem(conv_pd.diff_dst_primitive_desc(), &net);
 
-    // Memory for output of convolution. Since we may need reorder on the
-    // output side, we will prepare reorder primitive in case output
-    // reorder to user memory is required.
-    bool output_reorder_required = output->PrepareReorderToUserMemIfReq(
-                                      conv_pd.diff_src_primitive_desc());
-
     net.push_back(convolution_backward_data(conv_pd, obp->GetOpMem(),
                                     filter->GetOpMem(), output->GetOpMem()));
 
-    // Insert reorder primitive in the net for output reorder if reorder is
-    // required.
-    if (output_reorder_required) {
-      output->InsertReorderToUserMem(&net);
-    }
-
-    // Handle output reorder
     stream(stream::kind::eager).submit(net).wait();
   }
 };
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index a9872b8d6d3ea89da0a73017af19cabbc25f78ce..04268f23bb3e07f8eb9ba66957ca00a09b1e6d5d 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -40,8 +40,7 @@ limitations under the License.
 #include "tensorflow/core/util/tensor_format.h"
 
 #include "tensorflow/core/util/mkl_util.h"
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
+
 
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
@@ -51,6 +50,9 @@ using mkldnn::prop_kind;
 
 using mkldnn::convolution_forward;
 using mkldnn::convolution_direct;
+#else
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 #endif
 
 namespace tensorflow {
@@ -288,10 +290,8 @@ class MklConv2DOp : public OpKernel {
     mkl_filter_output_mkl_shape.SetMklLayout(mkl_context.prim_fwd,
                                              dnnResourceFilter);
 
-    size_t filter_sizes[4] = {static_cast<size_t>(filter.dim_size(0)),
-                              static_cast<size_t>(filter.dim_size(1)),
-                              static_cast<size_t>(filter.dim_size(2)),
-                              static_cast<size_t>(filter.dim_size(3))};
+    size_t filter_sizes[4] = {filter.dim_size(0), filter.dim_size(1),
+                              filter.dim_size(2), filter.dim_size(3)};
     mkl_filter_output_mkl_shape.SetTfLayout(filter.dims(), filter_sizes,
                                             mkl_context.filter_strides);
 
@@ -514,6 +514,12 @@ class MklConv2DOp : public OpKernel {
       const Tensor& src_tensor = MklGetInput(context, src_idx);
       const Tensor& filter_tensor = MklGetInput(context, filter_idx);
 
+      MklDnnShape src_mkl_shape, filter_mkl_shape;
+      GetMklShape(context, src_idx, &src_mkl_shape);
+      GetMklShape(context, filter_idx, &filter_mkl_shape);
+      CHECK(!filter_mkl_shape.IsMklTensor())
+        << "Conv2D filter should not be in MKL Layout";
+
       MklDnnData<T> src(&cpu_engine);
       MklDnnData<T> filter(&cpu_engine);
       MklDnnData<T> output(&cpu_engine);
@@ -523,8 +529,9 @@ class MklConv2DOp : public OpKernel {
 
       // Get shapes of input tensors in MKL-DNN order
       MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
-      conv_utl.GetConvFwdSizesInMklOrder(src_tensor.shape(),
-                                         filter_tensor.shape(),
+      auto src_tf_shape = GetTfShape(context, src_idx);
+      auto filter_tf_shape = GetTfShape(context, filter_idx);
+      conv_utl.GetConvFwdSizesInMklOrder(src_tf_shape, filter_tf_shape,
                                          &src_dims, &filter_dims, &strides,
                                          &output_dims_tf_order,
                                          &output_dims_mkl_order, &padding_l,
@@ -532,58 +539,47 @@ class MklConv2DOp : public OpKernel {
       if (!context->status().ok()) return;
 
       // Check for corner case - if there is nothing to compute, return.
-      TensorShape tf_output_shape({output_dims_tf_order[0],
-                                output_dims_tf_order[1],
-                                output_dims_tf_order[2],
-                                output_dims_tf_order[3]});
-      Tensor* output_tensor = nullptr;
-      MklShape mkl_output_mkl_shape;
-      mkl_output_mkl_shape.SetMklTensor(false);
-      AllocateOutputSetMklShape(context, 0, &output_tensor, tf_output_shape,
-                                mkl_output_mkl_shape);
+      TensorShape output_tf_shape = MklDnnDimsToTFShape(output_dims_tf_order);
 
       // Forward filter in TF format from input at index 1 to output at index 1.
       ForwardTfTensorInToOut(context, 1, 1);
 
-      if (tf_output_shape.num_elements() == 0) {
+      // Corner cases: output with 0 elements and 0 batch size.
+      Tensor* output_tensor = nullptr;
+      if (output_tf_shape.num_elements() == 0 ||
+          output_dims_tf_order[0] == 0) {
         // TODO(jbobba): Verify correctness here
         //               Need semantics for Null MKL tensor
+        MklDnnShape output_mkl_shape;
+        output_mkl_shape.SetMklTensor(false);
+        AllocateOutputSetMklShape(context, 0, &output_tensor, src_tf_shape,
+                                output_mkl_shape);
         return;
       }
 
-      // Corner case to handle 0 batch size.
-      if (output_dims_tf_order[0] == 0) {
-        // Nothing to do, allocate output tensor and return
-        // TODO(nhasabni): remove this code later once serialization
-        // in MKL-DNN is supported.
-        AllocateOutputSetMklShape(context, 0, &output_tensor,
-                                  src_tensor.shape(), mkl_output_mkl_shape);
-        return;
-      } else {
-        // Otherwise regular output tensor allocation
-        // Allocate output tensor.
-      }
-      CHECK_NOTNULL(output_tensor);
-
       // Create memory for user data.
       // Describe how the inputs and outputs of Convolution look like. Also
       // specify buffers containing actual input and output data.
-      // Although input shape (src_dims) required is in MKL-DNN order,
-      // the layout is Tensorflow's layout (NHWC or NCHW depending on data
-      // format).
-      src.SetUsrMem(src_dims, TFDataFormatToMklDnnDataFormat(data_format_),
-                    const_cast<void*>(static_cast<const void*>(
-                    src_tensor.flat<T>().data())));
+      auto tf_fmt = TFDataFormatToMklDnnDataFormat(data_format_);
+      // If input is in MKL layout, then simply grab input layout; otherwise,
+      // construct input Tf layout. For TF layout, although input shape
+      // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's
+      // layout (NHWC or NCHW depending on data format).
+      auto src_md = src_mkl_shape.IsMklTensor()
+                    ? src_mkl_shape.GetMklLayout()
+                    : memory::desc(src_dims, MklDnnType<T>(), tf_fmt);
+      src.SetUsrMem(src_md, &src_tensor);
       // Although filter shape (filter_dims) required is in MKL-DNN order,
       // the layout is Tensorflow's layout (HWIO).
-      filter.SetUsrMem(filter_dims, memory::format::hwio,
-                       const_cast<void*>(static_cast<const void*>(
-                       filter_tensor.flat<T>().data())));
-      // Although output shape (output_dims) required is in MKL-DNN order,
-      // layout is Tensorflow's layout (NHWC or NCHW depending on data format).
-      output.SetUsrMem(output_dims_mkl_order,
-                       TFDataFormatToMklDnnDataFormat(data_format_),
-                       output_tensor->flat<T>().data());
+      auto filter_md = filter_mkl_shape.IsMklTensor()
+                    ? filter_mkl_shape.GetMklLayout()
+          : memory::desc(filter_dims, MklDnnType<T>(), memory::format::hwio);
+      filter.SetUsrMem(filter_md, &filter_tensor);
+      // Set output shape (output_dims) required in MKL-DNN order.
+      // Currently, we set output layout as Tensorflow's layout (NHWC or NCHW
+      // depending on data format). But later we propagate Mkl layout of the
+      // output to the next op directly.
+      output.SetUsrMem(output_dims_mkl_order, tf_fmt);
 
       // Create memory descriptors for convolution data w/ no specified format.
       src.SetOpMemDesc(src_dims, memory::format::any);
@@ -596,9 +592,7 @@ class MklConv2DOp : public OpKernel {
         memory::dims bias_size;
         conv_utl.GetBiasSizeInMklOrder(2 /* bias idx */, &bias_size);
         const Tensor& bias_tensor = MklGetInput(context, 2);
-        bias.SetUsrMem(bias_size, memory::format::x,
-                       const_cast<void*>(static_cast<const void*>(
-                       bias_tensor.flat<T>().data())));
+        bias.SetUsrMem(bias_size, memory::format::x, &bias_tensor);
         bias.SetOpMemDesc(bias_size, memory::format::any);
 
         // Create convolution primitive with Bias.
@@ -609,6 +603,10 @@ class MklConv2DOp : public OpKernel {
 
         auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc,
                                                                 cpu_engine);
+        AllocateOutputTensor(context, conv_prim_desc,
+                             output_dims_mkl_order, tf_fmt, &output_tensor);
+        // Set data handle for output.
+        output.SetUsrMemDataHandle(output_tensor);
         PrepareAndExecuteNet(conv_prim_desc, &src, &filter, &bias, &output);
       } else {
         // Create convolution primitive without Bias.
@@ -619,6 +617,10 @@ class MklConv2DOp : public OpKernel {
 
         auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc,
                                                                 cpu_engine);
+        AllocateOutputTensor(context, conv_prim_desc, output_dims_mkl_order,
+                             tf_fmt, &output_tensor);
+        // Set data handle for output.
+        output.SetUsrMemDataHandle(output_tensor);
         PrepareAndExecuteNet(conv_prim_desc, &src, &filter, nullptr, &output);
       }
     } catch (mkldnn::error &e) {
@@ -636,23 +638,44 @@ class MklConv2DOp : public OpKernel {
   Padding padding_;
   TensorFormat data_format_;
 
+  // Allocate output tensor.
+  void AllocateOutputTensor(
+                  OpKernelContext* context,
+                  const convolution_forward::primitive_desc& conv_prim_desc,
+                  const memory::dims& output_dims_mkl_order,
+                  memory::format output_tf_format, Tensor** output_tensor) {
+      CHECK_NOTNULL(output_tensor);
+      auto dst_pd = conv_prim_desc.dst_primitive_desc();
+
+      // Allocate shape of Mkl tensor.
+      MklDnnShape output_mkl_shape;
+      output_mkl_shape.SetMklTensor(true);
+      output_mkl_shape.SetMklLayout(&dst_pd);
+      output_mkl_shape.SetElemType(MklDnnType<T>());
+      output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
+                                   output_dims_mkl_order, output_tf_format);
+
+      // Allocate shape of TF tensor.
+      TensorShape output_tf_shape;
+      output_tf_shape.AddDim((dst_pd.get_size() / sizeof(T)));
+
+      const int kOutputSlotIdx = 0;
+      AllocateOutputSetMklShape(context, kOutputSlotIdx, output_tensor,
+                                output_tf_shape, output_mkl_shape);
+  }
+
   // Prepare and execute net - checks for input and output reorders.
   void PrepareAndExecuteNet(
                   const convolution_forward::primitive_desc& conv_prim_desc,
                   MklDnnData<T>* src, MklDnnData<T>* filter,
                   MklDnnData<T>* bias, MklDnnData<T>* output) {
     // Create reorders between user layout and MKL layout if it is needed and
-    // add it to the net before convolution.
+    // add it to the net before convolution. No need to check for output
+    // reorder as we propagate output layout to the next layer.
     std::vector<primitive> net;
     src->CheckReorderToOpMem(conv_prim_desc.src_primitive_desc(), &net);
     filter->CheckReorderToOpMem(conv_prim_desc.weights_primitive_desc(), &net);
 
-    // Memory for output of convolution. Since we may need reorder on the
-    // output side, we will prepare reorder primitive in case output
-    // reorder to user memory is required.
-    bool output_reorder_required = output->PrepareReorderToUserMemIfReq(
-                                      conv_prim_desc.dst_primitive_desc());
-
     // Create convolution primitive and add it to net.
     if (bias) {
       CHECK_EQ(biasEnabled, true);
@@ -665,13 +688,6 @@ class MklConv2DOp : public OpKernel {
                                     filter->GetOpMem(), output->GetOpMem()));
     }
 
-    // Insert reorder primitive in the net for output reorder if reorder is
-    // required.
-    if (output_reorder_required) {
-      output->InsertReorderToUserMem(&net);
-    }
-
-    // Handle output reorder
     stream(stream::kind::eager).submit(net).wait();
   }
 };
@@ -688,7 +704,12 @@ class MklConv2DOp : public OpKernel {
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
                               .Label(mkl_op_registry::kMklOpLabel), \
-                          MklConv2DOp<CPUDevice, T, true>);
+                          MklConv2DOp<CPUDevice, T, true>);         \
+  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DWithBias")          \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklDummyOp<CPUDevice, T>);
 
 TF_CALL_float(REGISTER_MKL_CPU);
 
diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h
index f0cb37f8a42c19cad183af2e0de7db2931cf299a..47a9b4bfc734dab5786b42f6e7118a798f790345 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_conv_ops.h
@@ -41,6 +41,12 @@ limitations under the License.
 
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
+
+using mkldnn::stream;
+using mkldnn::prop_kind;
+
+using mkldnn::convolution_forward;
+using mkldnn::convolution_direct;
 #endif
 
 namespace tensorflow {
@@ -108,7 +114,13 @@ class MklDnnConvUtil {
   #undef CHECK_BOUNDS
 
     // MKL-DNN always requires input in NCHW format.
-    *input_dims = {input_batch, input_depth, input_rows, input_cols};
+    std::vector<int> mkldnn_sizes(4, -1);
+    mkldnn_sizes[MklDnnDims::Dim_N] = input_batch;
+    mkldnn_sizes[MklDnnDims::Dim_C] = input_depth;
+    mkldnn_sizes[MklDnnDims::Dim_H] = input_rows;
+    mkldnn_sizes[MklDnnDims::Dim_W] = input_cols;
+
+    *input_dims = mkldnn_sizes;
   }
 
   // Calculate Convolution filter size in MKL-DNN order. MKL-DNN
@@ -156,7 +168,13 @@ class MklDnnConvUtil {
 
     // MKL-DNN always needs filter in OIHW format.
     // OIHW = (out_depth, in_depth, rows, cols)
-    *filter_dims = {out_depth, in_depth, filter_rows, filter_cols};
+    std::vector<int> mkldnn_sizes(4, -1);
+    mkldnn_sizes[MklDnnDims::Dim_O] = out_depth;
+    mkldnn_sizes[MklDnnDims::Dim_I] = in_depth;
+    mkldnn_sizes[MklDnnDims::Dim_H] = filter_rows;
+    mkldnn_sizes[MklDnnDims::Dim_W] = filter_cols;
+
+    *filter_dims = mkldnn_sizes;
   }
 
   // Calculate Convolution filter size in MKL-DNN order. MKL-DNN
@@ -167,9 +185,9 @@ class MklDnnConvUtil {
   GetFilterSizeInMklOrder(size_t src_index, size_t filter_index,
                           memory::dims *filter_dims) {
     CHECK_NOTNULL(filter_dims);
-    const Tensor& input = MklGetInput(context_, src_index);
-    const Tensor& filter = MklGetInput(context_, filter_index);
-    GetFilterSizeInMklOrder(input.shape(), filter.shape(), filter_dims);
+    GetFilterSizeInMklOrder(GetTfShape(context_, src_index),
+                            GetTfShape(context_, filter_index),
+                            filter_dims);
   }
 
   // Calculate Bias size for 2D Convolution. Function does not return
@@ -238,8 +256,12 @@ class MklDnnConvUtil {
     *output_dims_tf_order = TFShapeToMklDnnDims(out_shape);
 
     // MKL-DNN always needs output in NCHW format.
-    *output_dims_mkl_order = {out_batch, out_depth, static_cast<int>(out_rows),
-                   static_cast<int>(out_cols)};
+    std::vector<int> mkldnn_sizes(4, -1);
+    mkldnn_sizes[MklDnnDims::Dim_N] = out_batch;
+    mkldnn_sizes[MklDnnDims::Dim_C] = out_depth;
+    mkldnn_sizes[MklDnnDims::Dim_H] = static_cast<int>(out_rows);
+    mkldnn_sizes[MklDnnDims::Dim_W] = static_cast<int>(out_cols);
+    *output_dims_mkl_order = mkldnn_sizes;
 
     // Now handle padding. MKL-DNN uses asymetric padding.
     *pad_l = {static_cast<int>(pad_top), static_cast<int>(pad_left)};
@@ -261,14 +283,14 @@ class MklDnnConvUtil {
     CHECK_NOTNULL(pad_l);
     CHECK_NOTNULL(pad_r);
 
-    const Tensor& input = MklGetInput(context_, src_index);
-    const Tensor& filter = MklGetInput(context_, filter_index);
+    auto input_tf_shape = GetTfShape(context_, src_index);
+    auto filter_tf_shape = GetTfShape(context_, filter_index);
 
-    OP_REQUIRES(context_, input.dims() == 4,
+    OP_REQUIRES(context_, input_tf_shape.dims() == 4,
                 errors::InvalidArgument("input must be 4-dimensional",
-                                          input.shape().DebugString()));
+                                        input_tf_shape.DebugString()));
 
-    GetOutputAndPadSizeInMklOrder(input.shape(), filter.shape(),
+    GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape,
                                   strides, output_dims_tf_order,
                                   output_dims_mkl_order, pad_l, pad_r);
   }
@@ -309,8 +331,231 @@ class MklDnnConvUtil {
   }
 };
 
+/////////////////////////////////////////////////////////////////////
+///  Common class that implements Conv2DBackpropFilter and Input
+/////////////////////////////////////////////////////////////////////
+
+template <typename Device, class T>
+class MklConv2DBackpropCommonOp :  public OpKernel {
+ public:
+  ~MklConv2DBackpropCommonOp() {}
+  explicit MklConv2DBackpropCommonOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    string data_format_str;
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES(context, FormatFromString(data_format_str, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+    int stride_n = GetTensorDim(strides_, data_format_, 'N');
+    int stride_c = GetTensorDim(strides_, data_format_, 'C');
+    OP_REQUIRES(
+        context, (stride_n == 1 && stride_c == 1),
+        errors::InvalidArgument("Current implementation does not yet support "
+                                "strides in the batch and depth dimensions."));
+
+    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+
+      // Prepare common tensors for Conv2DBackpropInput and
+      // Conv2DBackpropFilter.
+      MklDnnData<T> input(&cpu_engine);
+      MklDnnData<T> filter(&cpu_engine);
+      MklDnnData<T> outbackprop(&cpu_engine);
+      MklDnnData<T> output(&cpu_engine);
+
+      // Input tensors
+      const int kInputIdx = 0, kFilterIdx = 1, kOutbpropIdx = 2;
+      const Tensor& input_tensor = MklGetInput(context, kInputIdx);
+      const Tensor& filter_tensor = MklGetInput(context, kFilterIdx);
+      const Tensor& outbprop_tensor = MklGetInput(context, kOutbpropIdx);
+
+      MklDnnShape input_mkl_shape, filter_mkl_shape, outbprop_mkl_shape;
+      GetMklShape(context, kInputIdx, &input_mkl_shape);
+      GetMklShape(context, kFilterIdx, &filter_mkl_shape);
+      GetMklShape(context, kOutbpropIdx, &outbprop_mkl_shape);
+      // Allow operator-specific sanity checking of shapes.
+      ValidateMklShapes(input_mkl_shape, filter_mkl_shape, outbprop_mkl_shape);
+
+      // Allow operator-specific generation of shapes.
+      // E.g., Conv2DBackpropFilter gets filter as filter_sizes. It is a
+      // tensor containing shape of filter. So filter.shape() is not
+      // a correct way to get filter shape. These operator-specific calls
+      // allow this class to handle this case.
+      TensorShape input_tf_shape = MakeInputTfShape(context, input_tensor);
+      TensorShape filter_tf_shape = MakeFilterTfShape(context, filter_tensor);
+      TensorShape outbprop_tf_shape = GetTfShape(context, kOutbpropIdx);
+
+      // By default, all dims are in MKL order. Only dims in TF order
+      // are those with prefix tf_order.
+      memory::dims outbprop_dims, fwd_input_dims, fwd_filter_dims;
+      memory::dims padding_l, padding_r, strides, fwd_output_dims;
+      memory::dims fwd_output_dims_tf_order;
+
+      // Get forward convolution parameters.
+      MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
+      conv_utl.GetConvFwdSizesInMklOrder(input_tf_shape, filter_tf_shape,
+                                         &fwd_input_dims, &fwd_filter_dims,
+                                         &strides,
+                                         &fwd_output_dims_tf_order,
+                                         &fwd_output_dims,
+                                         &padding_l, &padding_r);
+      if (!context->status().ok()) return;
+
+      // Create Convolution forward descriptor since Convolution backward
+      // API needs it. For that, we first need to create input, filter
+      // and output memory descriptors.
+      auto tf_fmt = TFDataFormatToMklDnnDataFormat(data_format_);
+      // If input is in MKL layout, then simply grab input layout; otherwise,
+      // construct input TF layout. For TF layout, although input shape
+      // required is in MKL-DNN order, the layout is Tensorflow's layout
+      // (NHWC or NCHW depending on data format).
+      auto fwd_input_md = input_mkl_shape.IsMklTensor() ?
+                          input_mkl_shape.GetMklLayout() :
+                       memory::desc(fwd_input_dims, MklDnnType<T>(), tf_fmt);
+      // If filter is in MKL layout, then simply grab filter layout; otherwise
+      // construct filter in TF layout. For TF layout, filter is in HWIO format.
+      auto fwd_filter_md = filter_mkl_shape.IsMklTensor() ?
+                          filter_mkl_shape.GetMklLayout() :
+                          memory::desc(fwd_filter_dims, MklDnnType<T>(),
+                                       memory::format::hwio);
+      // Tensorflow Output of Conv2D is in data_format order.
+      auto fwd_out_md = memory::desc(fwd_output_dims, MklDnnType<T>(), tf_fmt);
+      auto fwd_desc = convolution_forward::desc(prop_kind::forward,
+            convolution_direct, fwd_input_md, fwd_filter_md, fwd_out_md,
+            strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
+      auto fwd_pd = convolution_forward::primitive_desc(fwd_desc, cpu_engine);
+
+      // Create memory for user data. Describe how the inputs and outputs of
+      // Convolution look like. Also specify buffers containing actual input
+      // and output data.
+
+      // Since this is a common class for both Conv2DBackpropFilter and
+      // Conv2DBackpropInput, we skip SetUsrMem call for input tensor (for
+      // Conv2DBackpropInput) and for filter tensor (for
+      // conv2DBackpropFilter) depending on which tensor is int32 type.
+      size_t input_with_sizes = GetInputTensorIndexWithSizes();
+      if (input_with_sizes != kInputIdx) {
+        // Shape of Conv2DBackpropFilter's input is same as Conv2D input.
+        input.SetUsrMem(fwd_input_md, &input_tensor);
+      } else if (input_with_sizes != kFilterIdx) {
+        // Shape of Conv2DBackpropInput's filter is same as Conv2D filter.
+        filter.SetUsrMem(fwd_filter_md, &filter_tensor);
+      }
+
+      conv_utl.GetInputSizeInMklOrder(outbprop_tf_shape, &outbprop_dims);
+      if (!context->status().ok()) return;
+      if (outbprop_mkl_shape.IsMklTensor()) {
+        // If outbackprop is in Mkl layout, then simply grab it.
+        auto outbprop_md = outbprop_mkl_shape.GetMklLayout();
+        outbackprop.SetUsrMem(outbprop_md, &outbprop_tensor);
+      } else {
+        // If outbackprop is in TensorFlow layout, then we need to create memory
+        // descriptor for it. Outbackprop shape is data format order.
+        outbackprop.SetUsrMem(outbprop_dims, tf_fmt, &outbprop_tensor);
+      }
+
+      // Operator specific call to get output shape and data_format.
+      auto bwd_output_dims = GetOutputDims(fwd_input_dims, fwd_filter_dims);
+      auto bwd_output_format = GetOutputFormat(tf_fmt);
+      output.SetUsrMem(bwd_output_dims, bwd_output_format);
+
+      // Create memory descriptors for convolution data w/ no specified format.
+      input.SetOpMemDesc(fwd_input_dims, memory::format::any);
+      filter.SetOpMemDesc(fwd_filter_dims, memory::format::any);
+      outbackprop.SetOpMemDesc(outbprop_dims, memory::format::any);
+      output.SetOpMemDesc(bwd_output_dims, memory::format::any);
+
+      // Operator-specific call to create and execute primitive.
+      Tensor* output_tensor = nullptr;
+      CreatePrimitive(context, cpu_engine, fwd_pd, &input, &filter,
+                      &outbackprop, &output, &output_tensor,
+                      strides, padding_l, padding_r,
+                      TFPaddingToMklDnnPadding(padding_),
+                      bwd_output_dims, bwd_output_format);
+    } catch (mkldnn::error &e) {
+     string error_msg = "Status: " + std::to_string(e.status) +
+                       ", message: " + string(e.message) +
+                       ", in file " + string(__FILE__) + ":" +
+                       std::to_string(__LINE__);
+     OP_REQUIRES_OK(context, errors::Aborted("Operation received an exception:",
+                                            error_msg));
+    }
+  }
+
+  /// Pure virtual function to allow operator to check for validity of input
+  /// shapes. Function asserts that input shapes are valid.
+  virtual void ValidateMklShapes(const MklDnnShape& input_mkl_shape,
+                                 const MklDnnShape& filter_mkl_shape,
+                                 const MklDnnShape& outbprop_mkl_shape) = 0;
+
+  /// Operator-specific function that returns index of input that is
+  /// representing input sizes. For Conv2DBackpropFilter it returns 1 since
+  /// filter for this operator is filter shape. For Conv2DBackpropInput it
+  /// returns 0 (for input).
+  virtual size_t GetInputTensorIndexWithSizes() = 0;
+
+  /// Get TensorFlow shape of input tensor.
+  virtual TensorShape MakeInputTfShape(OpKernelContext* context,
+                                      const Tensor& input_tensor) = 0;
+
+  /// Get TensorFlow shape of filter tensor.
+  virtual TensorShape MakeFilterTfShape(OpKernelContext* context,
+                                       const Tensor& filter_tensor) = 0;
+
+  /// Get shape of output in MKL-DNN order. Computes shape of output from
+  /// input shape (fwd_input_dims) and filter shape (fwd_filter_dims).
+  virtual
+  const memory::dims& GetOutputDims(const memory::dims& fwd_input_dims,
+                                    const memory::dims& fwd_filter_dims) = 0;
+
+  /// Get data_format of output in MKL-DNN order. If output data format is
+  /// same as input data format, then it simply returns value of data_format
+  /// parameter as it is.
+  virtual memory::format GetOutputFormat(const memory::format data_format) = 0;
+
+  /// Create and execute the primitive storing output in the output_tensor.
+  virtual void CreatePrimitive(OpKernelContext* context,
+    const engine& cpu_engine,
+    const convolution_forward::primitive_desc& conv_fwd_pd,
+    MklDnnData<T>* input, MklDnnData<T>* filter, MklDnnData<T>* outbackprop,
+    MklDnnData<T>* output, Tensor** output_tensor, const memory::dims& strides,
+    const memory::dims& padding_l, const memory::dims& padding_r,
+    padding_kind padding, const memory::dims& bwd_output_dims,
+    memory::format bwd_output_format) = 0;
+
+  // Get the data_format {NCHW, NHWC}
+  TensorFormat GetTFDataFormat () { return data_format_; }
+
+ private:
+  std::vector<int32> strides_;
+  Padding padding_;
+  TensorFormat data_format_;
+};
 #endif  // INTEL_MKL_DNN
 
+/////////////////////////////////////////////////////////////////////
+///  Dummy Mkl op that is just used for operators that are intermediate
+///  output of node fusion in the graph
+/////////////////////////////////////////////////////////////////////
+
+template <typename Device, typename T>
+class MklDummyOp : public OpKernel {
+ public:
+  ~MklDummyOp() {}
+
+  explicit MklDummyOp(OpKernelConstruction* context) :
+    OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    TF_CHECK_OK(errors::Unimplemented("This is a dummy op."
+                                      "It should not have been invoked."));
+  }
+};
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_
diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
index bc9e906c39a9a7f5f4b2ae83afc6774aecb38c48..a761562a4b9966d3dbd8bede2f64e6eb0546b42e 100644
--- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
@@ -25,10 +25,24 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+
+using mkldnn::stream;
+using mkldnn::prop_kind;
+using mkldnn::use_scale_shift;
+using mkldnn::use_global_stats;
+using mkldnn::batch_normalization_forward;
+using mkldnn::batch_normalization_backward;
+#endif
+
 // TODO(inteltf) Address comments from PR 8968.
 
 namespace tensorflow {
 using CPUDevice = Eigen::ThreadPoolDevice;
+
+#ifndef INTEL_MKL_DNN
+
 template <typename Device, typename T>
 class MklFusedBatchNormOp : public OpKernel {
  public:
@@ -46,7 +60,6 @@ class MklFusedBatchNormOp : public OpKernel {
 
   void Compute(OpKernelContext* context) override {
     MklFusedBatchNormOpContext mkl_context;
-
     const Tensor& input = MklGetInput(context, 0);
     const Tensor& scale = MklGetInput(context, 1);
     const Tensor& shift = MklGetInput(context, 2);
@@ -55,6 +68,7 @@ class MklFusedBatchNormOp : public OpKernel {
 
     GetMklShape(context, 0, &(mkl_context.mkl_shape_input_shape));
     bool input_in_mkl_format = mkl_context.mkl_shape_input_shape.IsMklTensor();
+
     if (!input_in_mkl_format) {
       OP_REQUIRES(context, input.dims() == 4,
                   errors::InvalidArgument("input must be 4-dimensional",
@@ -69,10 +83,12 @@ class MklFusedBatchNormOp : public OpKernel {
     OP_REQUIRES(context, est_mean.dims() == 1,
                 errors::InvalidArgument("estimated_mean must be 1-dimensional",
                                         est_mean.shape().DebugString()));
+
     OP_REQUIRES(
         context, est_variance.dims() == 1,
         errors::InvalidArgument("estimated_variance must be 1-dimensional",
                                 est_variance.shape().DebugString()));
+
     if (is_training_) {
       OP_REQUIRES(context, est_mean.dim_size(0) == 0,
                   errors::InvalidArgument("estimated_mean empty for training",
@@ -258,7 +274,6 @@ class MklFusedBatchNormOp : public OpKernel {
             E_SUCCESS);
       }
     }
-
     void MklPrepareContextInputs(OpKernelContext* context,
                                  Tensor* mkl_tmp_input_buf_tensor,
                                  Tensor* mkl_tmp_scale_shift_buf_tensor) {
@@ -325,15 +340,6 @@ class MklFusedBatchNormOp : public OpKernel {
   } MklFusedBatchNormOpContext;
 };
 
-#define REGISTER_MKL_CPU(T)                                         \
-  REGISTER_KERNEL_BUILDER(Name("_MklFusedBatchNorm")                \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
-                          MklFusedBatchNormOp<CPUDevice, T>);
-TF_CALL_float(REGISTER_MKL_CPU);
-#undef REGISTER_MKL_CPU
-
 template <typename Device, typename T>
 class MklFusedBatchNormGradOp : public OpKernel {
  public:
@@ -595,7 +601,7 @@ class MklFusedBatchNormGradOp : public OpKernel {
       mkl_res_batchnorm_bwd[dnnResourceSrc] =
           (mkl_convert_input) ? mkl_buf_converted_input : mkl_buf_input;
 
-      bool mkl_convert_out_backprop;
+     bool mkl_convert_out_backprop;
       dnnPrimitive_t mkl_prim_convert_out_backprop = nullptr;
       dnnLayout_t mkl_lt_internal_out_backprop = nullptr;
       void* mkl_buf_converted_out_backprop = nullptr;
@@ -675,6 +681,628 @@ class MklFusedBatchNormGradOp : public OpKernel {
     }
   } MklFusedBatchNormGradOpContext;
 };
+#endif
+
+#ifdef INTEL_MKL_DNN
+
+template <typename Device, typename T>
+class MklFusedBatchNormOp : public OpKernel {
+ public:
+  explicit MklFusedBatchNormOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    float epsilon;
+    OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon));
+    epsilon_ = T(epsilon);
+    string tensor_format;
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &tensor_format));
+    OP_REQUIRES(context, FormatFromString(tensor_format, &tensor_format_),
+                errors::InvalidArgument("Invalid data format"));
+    OP_REQUIRES_OK(context, context->GetAttr("is_training", &is_training_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      const size_t src_index = 0;    // index of src input tensor
+      const size_t scale_index = 1;  // index of scale tensor
+      const size_t shift_index = 2;  // index of shift tensor
+      const size_t mean_index = 3;   // index of est_mean tensor
+      const size_t var_index = 4;    // index of est_variance tensor
+
+      const Tensor& src_tensor          = MklGetInput(context, src_index);
+      const Tensor& scale_tensor        = MklGetInput(context, scale_index);
+      const Tensor& shift_tensor        = MklGetInput(context, shift_index);
+      const Tensor& est_mean_tensor     = MklGetInput(context, mean_index);
+      const Tensor& est_variance_tensor = MklGetInput(context, var_index);
+
+      MklDnnShape dnn_shape_src;
+      GetMklShape(context, src_index, &dnn_shape_src);
+
+      if (dnn_shape_src.IsMklTensor()) {
+        OP_REQUIRES(context, dnn_shape_src.GetDimension() == 4,
+                    errors::InvalidArgument(
+                        "input must be 4-dimensional",
+                        src_tensor.shape().DebugString()));
+      } else {
+        OP_REQUIRES(context, src_tensor.dims() == 4,
+                    errors::InvalidArgument(
+                        "input must be 4-dimensional",
+                        src_tensor.shape().DebugString()));
+      }
+      OP_REQUIRES(context, scale_tensor.dims() == 1,
+                  errors::InvalidArgument(
+                      "scale must be 1-dimensional",
+                      scale_tensor.shape().DebugString()));
+      OP_REQUIRES(context, shift_tensor.dims() == 1,
+                  errors::InvalidArgument("offset must be 1-dimensional",
+                                        shift_tensor.shape().DebugString()));
+      OP_REQUIRES(context, est_mean_tensor.dims() == 1,
+                  errors::InvalidArgument(
+                      "estimated_mean must be 1-dimensional",
+                      est_mean_tensor.shape().DebugString()));
+      OP_REQUIRES(context, est_variance_tensor.dims() == 1,
+                  errors::InvalidArgument(
+                      "estimated_variance must be 1-dimensional",
+                      est_variance_tensor.shape().DebugString()));
+
+      if (is_training_) {
+        OP_REQUIRES(context, est_mean_tensor.dim_size(0) == 0,
+                    errors::InvalidArgument(
+                        "estimated_mean must be empty for training",
+                        est_mean_tensor.shape().DebugString()));
+        OP_REQUIRES(context, est_variance_tensor.dim_size(0) == 0,
+                    errors::InvalidArgument(
+                        "estimated_variance must be empty for training",
+                        est_variance_tensor.shape().DebugString()));
+      }
+
+      if (dnn_shape_src.IsMklTensor())
+        depth_ = dnn_shape_src.DimSize(MklDnnDims::Dim_C);
+      else
+        ExtractParams(context);
+
+      // Indices of output tensors
+      const size_t dst_index = 0;
+      const size_t batch_mean_index = 1;
+      const size_t batch_variance_index = 2;
+      const size_t saved_mean_index = 3;
+      const size_t saved_variance_index = 4;
+
+      // allocate batch mean output tensor
+      Tensor* batch_mean_tensor = nullptr;
+      MklDnnShape mkl_shape_batch_mean;
+      mkl_shape_batch_mean.SetMklTensor(false);
+      AllocateOutputSetMklShape(context,
+                                batch_mean_index,
+                                &batch_mean_tensor,
+                                scale_tensor.shape(),
+                                mkl_shape_batch_mean);
+      CHECK_NOTNULL(batch_mean_tensor);
+
+      // Batch variance
+      Tensor* batch_variance_tensor = nullptr;
+      MklDnnShape mkl_shape_batch_variance;
+      mkl_shape_batch_variance.SetMklTensor(false);
+      AllocateOutputSetMklShape(context,
+                                batch_variance_index,
+                                &batch_variance_tensor,
+                                scale_tensor.shape(),
+                                mkl_shape_batch_variance);
+      CHECK_NOTNULL(batch_variance_tensor);
+
+      if (is_training_)
+        SetMeanVariance(*batch_mean_tensor, *batch_variance_tensor);
+      else
+        SetMeanVariance(est_mean_tensor, est_variance_tensor);
+
+      MklDnnData<T> src(&cpu_engine);
+      MklDnnData<T> dst(&cpu_engine);
+
+      memory::format format_m;
+      if (dnn_shape_src.IsMklTensor()) {
+        if (dnn_shape_src.IsTensorInNCHWFormat()) {
+          format_m = memory::format::nchw;
+        } else {
+          format_m = memory::format::nhwc;
+        }
+      } else {
+        format_m = TFDataFormatToMklDnnDataFormat(tensor_format_);
+      }
+
+      // set src primitive
+      memory::dims src_dims;
+      if (dnn_shape_src.IsMklTensor()) {
+        src_dims = TFShapeToMklDnnDimsInNCHW(dnn_shape_src.GetTfShape(),
+                                             tensor_format_);
+      } else {
+        src_dims = TFShapeToMklDnnDimsInNCHW(src_tensor.shape(),
+                                             tensor_format_);
+      }
+
+      auto src_md = dnn_shape_src.IsMklTensor()
+                    ? dnn_shape_src.GetMklLayout()
+                    : memory::desc(src_dims, MklDnnType<T>(), format_m);
+      src.SetUsrMem(src_md, &src_tensor);
+
+      // set weights primitive
+      // MKL-DNN packs scale & shift as "weights":
+      // <scale>...<scale><shift>...<shift>
+      auto weights_desc = memory::desc({2, depth_},
+                                       MklDnnType<T>(),
+                                       memory::format::nc);
+      auto weights_pd = memory::primitive_desc(weights_desc, cpu_engine);
+      auto weights_m = memory(weights_pd);
+      T* weights_data = reinterpret_cast<T*>(
+                        weights_m.get_data_handle());
+      T* scale_tf = reinterpret_cast<T*>(
+                    const_cast<T*>(scale_tensor.flat<T>().data()));
+      T* shift_tf = reinterpret_cast<T*>(
+                    const_cast<T*>(shift_tensor.flat<T>().data()));
+
+      for (int k=0; k < depth_; k++) {
+        weights_data[k] = scale_tf[k];
+        weights_data[k + depth_] = shift_tf[k];
+      }
+
+      // Mean and variance (without Bessel's correction) saved for backward
+      // computation to serve as pre-computed mean and variance.
+      Tensor* saved_mean_tensor = nullptr;
+      MklDnnShape mkl_shape_saved_mean;
+      mkl_shape_saved_mean.SetMklTensor(false);
+      AllocateOutputSetMklShape(context, saved_mean_index,
+                                &saved_mean_tensor,
+                                scale_tensor.shape(),
+                                mkl_shape_saved_mean);
+      CHECK_NOTNULL(saved_mean_tensor);
+
+      Tensor* saved_variance_tensor = nullptr;
+      MklDnnShape mkl_shape_saved_variance;
+      mkl_shape_saved_variance.SetMklTensor(false);
+      AllocateOutputSetMklShape(context, saved_variance_index,
+                                &saved_variance_tensor,
+                                scale_tensor.shape(),
+                                mkl_shape_saved_variance);
+      CHECK_NOTNULL(saved_variance_tensor);
+
+      // set mean primitive
+      auto mean_desc = memory::desc({1, depth_},
+                                    MklDnnType<T>(),
+                                    memory::format::nc);
+      auto mean_pd = memory::primitive_desc(mean_desc, cpu_engine);
+      char* saved_mean_data_tf = reinterpret_cast<char*>
+                                 (saved_mean_tensor->flat<T>().data());
+      std::memcpy(saved_mean_data_tf,
+                  reinterpret_cast<char*>(mean_values_),
+                  depth_*sizeof(T));
+      auto mean_m = memory(mean_pd,
+                           reinterpret_cast<void*>(saved_mean_data_tf));
+
+      // set variance primitive
+      auto variance_desc = memory::desc({1, depth_},
+                                    MklDnnType<T>(),
+                                    memory::format::nc);
+      auto variance_pd = memory::primitive_desc(variance_desc, cpu_engine);
+      char* saved_variance_data_tf = reinterpret_cast<char*>
+                  (saved_variance_tensor->flat<T>().data());
+      std::memcpy(saved_variance_data_tf,
+                  reinterpret_cast<char*>(variance_values_),
+                  depth_*sizeof(T));
+      auto variance_m = memory(variance_pd, saved_variance_data_tf);
+
+      prop_kind pk = (is_training_) ?
+                     prop_kind::forward_training :
+                     prop_kind::forward_scoring;
+      auto bnrm_fwd_desc = batch_normalization_forward::desc(
+                               pk, src.GetUsrMemDesc(), epsilon_,
+                               is_training_ ? use_scale_shift :
+                               (use_scale_shift | use_global_stats));
+      auto bnrm_fwd_pd = batch_normalization_forward::primitive_desc(
+                             bnrm_fwd_desc, cpu_engine);
+
+      // allocate dst tensor
+      MklDnnShape dnn_shape_dst;
+      TensorShape tf_shape_dst;
+      Tensor* dst_tensor = nullptr;
+      if (dnn_shape_src.IsMklTensor()) {
+        dnn_shape_dst.SetMklTensor(true);
+        auto dst_pd = bnrm_fwd_pd.dst_primitive_desc();
+        dnn_shape_dst.SetMklLayout(&dst_pd);
+        dnn_shape_dst.SetElemType(MklDnnType<T>());
+        dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(),
+                                  src_dims, format_m);
+        tf_shape_dst.AddDim(dst_pd.get_size()/sizeof(T));
+      } else {
+        dnn_shape_dst.SetMklTensor(false);
+        tf_shape_dst = src_tensor.shape();
+      }
+      AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
+                                tf_shape_dst, dnn_shape_dst);
+
+      // Output of batchnorm has same shape as input.
+      dst.SetUsrMem(src_md, dst_tensor);
+
+      primitive bnrm_fwd_op;
+      if (is_training_) {
+        bnrm_fwd_op = batch_normalization_forward(
+                          bnrm_fwd_pd,
+                          src.GetOpMem(),
+                          weights_m,
+                          dst.GetOpMem(),
+                          mean_m,
+                          variance_m);
+      } else {
+        bnrm_fwd_op = batch_normalization_forward(
+                          bnrm_fwd_pd,
+                          src.GetOpMem(),
+                          mean_m,
+                          variance_m,
+                          (const primitive::at) weights_m,
+                          dst.GetOpMem());
+      }
+      std::vector<primitive> net;
+      net.push_back(bnrm_fwd_op);
+      stream(stream::kind::eager).submit(net).wait();
+
+      // copy batch_mean data
+      T* batch_mean_data_tf = reinterpret_cast<T*>(
+                                batch_mean_tensor->flat<T>().data());
+      std::memcpy(reinterpret_cast<char*>(batch_mean_data_tf),
+                  reinterpret_cast<char*>(mean_m.get_data_handle()),
+                  depth_*sizeof(T));
+
+      // copy batch_variance data with Bessel's correction
+      // if training mode is on
+      float adjust_factor = 1.0;
+      if (is_training_) {
+        size_t orig_size = src_dims[0] * src_dims[2] * src_dims[3];
+        size_t adjust_size = orig_size - 1;
+        adjust_factor = (static_cast<float>(orig_size)) / adjust_size;
+      }
+      T* batch_variance_data_tf = reinterpret_cast<T*>(
+                                  batch_variance_tensor->flat<T>().data());
+      for (int k=0; k < depth_; k++)
+        batch_variance_data_tf[k] =
+            (reinterpret_cast<T*>(variance_m.get_data_handle()))[k]
+            * adjust_factor;
+    } catch (mkldnn::error &e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) +
+                         ", in file " + string(__FILE__) + ":" +
+                         std::to_string(__LINE__);
+      OP_REQUIRES_OK(context,
+                     errors::Aborted("Operation received an exception:",
+                     error_msg));
+    }
+  }
+
+ private:
+  T epsilon_;
+  TensorFormat tensor_format_;
+  bool is_training_;
+  T* mean_values_;
+  T* variance_values_;
+  size_t depth_;          // batch normalization is done for per channel.
+
+  void ExtractParams(OpKernelContext* context) {
+    const Tensor& input = MklGetInput(context, 0);
+    depth_ = static_cast<int>(GetTensorDim(input, tensor_format_, 'C'));
+  }
+
+  void SetMeanVariance(const Tensor& mean, const Tensor& variance) {
+    mean_values_ = reinterpret_cast<T*>(
+                       const_cast<T*>(mean.flat<T>().data()));
+    variance_values_ = reinterpret_cast<T*>(
+                       const_cast<T*>(variance.flat<T>().data()));
+  }
+};
+
+
+template <typename Device, typename T>
+class MklFusedBatchNormGradOp : public OpKernel {
+ public:
+  explicit MklFusedBatchNormGradOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    float epsilon;
+    OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon));
+    epsilon_ = T(epsilon);
+    string tensor_format;
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &tensor_format));
+    OP_REQUIRES(context, FormatFromString(tensor_format, &tensor_format_),
+                errors::InvalidArgument("Invalid data format"));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+
+      const size_t diff_dst_index = 0;  // index of diff_dst tensor
+      const size_t src_index = 1;       // index of src input tensor
+      const size_t scale_index = 2;     // index of scale tensor
+      const size_t mean_index = 3;      // index of saved_mean tensor
+      const size_t variance_index = 4;  // index of saved_variance tensor
+      const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
+      const Tensor& src_tensor = MklGetInput(context, src_index);
+      const Tensor& scale_tensor = MklGetInput(context, scale_index);
+      const Tensor& saved_mean_tensor = MklGetInput(context, mean_index);
+      const Tensor& saved_variance_tensor = MklGetInput(context,
+                                            variance_index);
+
+      MklDnnShape dnn_shape_src, dnn_shape_diff_dst;
+      GetMklShape(context, src_index, &dnn_shape_src);
+      GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
+
+      if (dnn_shape_diff_dst.IsMklTensor()) {
+        OP_REQUIRES(context, dnn_shape_diff_dst.GetDimension() == 4,
+                    errors::InvalidArgument(
+                        "input must be 4-dimensional",
+                        diff_dst_tensor.shape().DebugString()));
+      } else {
+        OP_REQUIRES(context, diff_dst_tensor.dims() == 4,
+                    errors::InvalidArgument(
+                        "input must be 4-dimensional",
+                        diff_dst_tensor.shape().DebugString()));
+      }
+
+      if (dnn_shape_src.IsMklTensor()) {
+        OP_REQUIRES(context, dnn_shape_src.GetDimension() == 4,
+                    errors::InvalidArgument(
+                        "input must be 4-dimensional",
+                         src_tensor.shape().DebugString()));
+      } else {
+        OP_REQUIRES(context, src_tensor.dims() == 4,
+                    errors::InvalidArgument(
+                        "input must be 4-dimensional",
+                        src_tensor.shape().DebugString()));
+      }
+
+      OP_REQUIRES(context, scale_tensor.dims() == 1,
+                  errors::InvalidArgument(
+                      "scale must be 1-dimensional",
+                      scale_tensor.shape().DebugString()));
+      OP_REQUIRES(context, saved_mean_tensor.dims() == 1,
+                  errors::InvalidArgument(
+                      "saved mean must be 1-dimensional",
+                       saved_mean_tensor.shape().DebugString()));
+
+      OP_REQUIRES(context, saved_variance_tensor.dims() == 1,
+                  errors::InvalidArgument(
+                      "saved variance must be 1-dimensional",
+                      saved_variance_tensor.shape().DebugString()));
+
+      if (dnn_shape_src.IsMklTensor())
+        depth_ = dnn_shape_src.DimSize(MklDnnDims::Dim_C);
+      else
+        ExtractParams(context);
+
+      memory::format format_m;
+      if (dnn_shape_src.IsMklTensor()) {
+        if (dnn_shape_src.IsTensorInNCHWFormat())
+          format_m = memory::format::nchw;
+        else
+          format_m = memory::format::nhwc;
+      } else {
+        format_m = TFDataFormatToMklDnnDataFormat(tensor_format_);
+      }
+
+      MklDnnData<T> src(&cpu_engine);
+      MklDnnData<T> mean(&cpu_engine);
+      MklDnnData<T> variance(&cpu_engine);
+      MklDnnData<T> diff_dst(&cpu_engine);
+      MklDnnData<T> diff_src(&cpu_engine);
+
+      memory::dims src_dims, diff_dst_dims;
+      if (dnn_shape_src.IsMklTensor())
+        src_dims = TFShapeToMklDnnDimsInNCHW(
+                       dnn_shape_src.GetTfShape(), tensor_format_);
+      else
+        src_dims = TFShapeToMklDnnDimsInNCHW(
+                       src_tensor.shape(), tensor_format_);
+
+      if (dnn_shape_diff_dst.IsMklTensor())
+        diff_dst_dims = TFShapeToMklDnnDimsInNCHW(
+                            dnn_shape_diff_dst.GetTfShape(),
+                            tensor_format_);
+      else
+        diff_dst_dims = TFShapeToMklDnnDimsInNCHW(
+                            diff_dst_tensor.shape(),
+                            tensor_format_);
+
+      // set src and diff_dst primitives
+      memory::desc src_md({}, memory::data_undef, memory::format_undef);
+      memory::desc diff_dst_md({}, memory::data_undef, memory::format_undef);
+      if (dnn_shape_src.IsMklTensor() || dnn_shape_diff_dst.IsMklTensor()) {
+        if (dnn_shape_src.IsMklTensor()) {
+          src_md = dnn_shape_src.GetMklLayout();
+          diff_dst_md = src_md;
+        } else {
+          diff_dst_md = dnn_shape_diff_dst.GetMklLayout();
+          src_md = diff_dst_md;
+        }
+      } else {
+        src_md =  memory::desc(src_dims, MklDnnType<T>(), format_m);
+        diff_dst_md = src_md;
+      }
+      src.SetUsrMem(src_md, &src_tensor);
+      diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor);
+
+      // weights -- DNN packs scales/shifts as weights in order of
+      // scale, ..., scale, shift, ..., shift
+      auto weights_desc = memory::desc({2, depth_},
+                                       MklDnnType<T>(),
+                                       memory::format::nc);
+      auto weights_pd = memory::primitive_desc(weights_desc, cpu_engine);
+      auto weights_m = memory(weights_pd);
+      T* weights_data = reinterpret_cast<T*>(weights_m.get_data_handle());
+      T* scale_tf = reinterpret_cast<T*>(const_cast<T*>
+                                        (scale_tensor.flat<T>().data()));
+      for (int k=0; k < depth_; k++) {
+        weights_data[k] = scale_tf[k];
+        weights_data[k + depth_] = 0;
+      }
+
+      // set mean primitive
+      memory::dims mv_dims = GetMeanVarianceDims();
+      mean.SetUsrMem(mv_dims,
+                     memory::format::nc,
+                     const_cast<void*>(static_cast<const void*>
+                     (saved_mean_tensor.flat<T>().data())));
+      mean.SetOpMemDesc(mv_dims, memory::format::nc);
+
+      // set variance primitive
+      variance.SetUsrMem(mv_dims,  memory::format::nc,
+                         const_cast<void*>(static_cast<const void*>
+                         (saved_variance_tensor.flat<T>().data())));
+      variance.SetOpMemDesc(mv_dims, memory::format::nc);
+
+      // set diff_weight primitive
+      auto diff_weights_desc = memory::desc(
+                                 {2, depth_},
+                                 MklDnnType<T>(),
+                                 memory::format::nc);
+      auto diff_weights_pd = memory::primitive_desc(
+                                diff_weights_desc,
+                                cpu_engine);
+      auto diff_weights_m = memory(diff_weights_pd);
+
+      auto bnrm_fwd_desc = batch_normalization_forward::desc(
+                                prop_kind::forward_training,
+                                src.GetUsrMemDesc(),
+                                epsilon_,
+                                use_scale_shift);
+      auto bnrm_fwd_pd = batch_normalization_forward::primitive_desc(
+                                bnrm_fwd_desc,
+                                cpu_engine);
+
+      // Indices of output tensors
+      const size_t diff_src_index = 0;    // index of diff_src tensor
+      const size_t diff_scale_index = 1;  // index of diff_scale tensor
+      const size_t diff_shift_index = 2;  // index of diff_shift tensor
+      const size_t p1_index = 3;  // index of 1st placeholder tensor
+      const size_t p2_index = 4;  // index of 2nd placeholder tensor
+
+      // allocate diff_src tensor
+      MklDnnShape dnn_shape_diff_src;
+      TensorShape tf_shape_diff_src;
+      Tensor* diff_src_tensor = nullptr;
+      if (dnn_shape_src.IsMklTensor()) {
+        dnn_shape_diff_src.SetMklTensor(true);
+        auto diff_src_pd = bnrm_fwd_pd.dst_primitive_desc();
+        dnn_shape_diff_src.SetMklLayout(&diff_src_pd);
+        dnn_shape_diff_src.SetElemType(MklDnnType<T>());
+        dnn_shape_diff_src.SetTfLayout(
+                              dnn_shape_src.GetDimension(),
+                              src_dims,
+                              format_m);
+        dnn_shape_diff_src.SetTfDimOrder(
+                              dnn_shape_src.GetDimension(),
+                              tensor_format_);
+        tf_shape_diff_src.AddDim(diff_src_pd.get_size()/sizeof(T));
+      } else {
+        dnn_shape_diff_src.SetMklTensor(false);
+        tf_shape_diff_src = src_tensor.shape();
+      }
+      AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
+                                tf_shape_diff_src, dnn_shape_diff_src);
+
+      diff_src.SetUsrMem(src_md, diff_src_tensor);
+
+      prop_kind pk = prop_kind::backward;
+      auto bnrm_bwd_desc = batch_normalization_backward::desc(
+                               pk,
+                               diff_src.GetUsrMemDesc(),
+                               src.GetUsrMemDesc(),
+                               epsilon_,
+                               use_scale_shift);
+      auto bnrm_bwd_pd = batch_normalization_backward::primitive_desc(
+                               bnrm_bwd_desc,
+                               cpu_engine,
+                               bnrm_fwd_pd);
+
+      auto bnrm_bwd_op = batch_normalization_backward(
+                               bnrm_bwd_pd,
+                               src.GetOpMem(),
+                               mean.GetOpMem(),
+                               variance.GetOpMem(),
+                               diff_dst.GetOpMem(),
+                               weights_m,
+                               diff_src.GetOpMem(),
+                               diff_weights_m);
+
+      std::vector<primitive> net;
+      net.push_back(bnrm_bwd_op);
+      stream(stream::kind::eager).submit(net).wait();
+
+      // separate out scale and shift grad and copy to individual tensors
+      const TensorShape& tf_shape_scale_shift = scale_tensor.shape();
+      Tensor* diff_scale_tensor = nullptr;
+      MklDnnShape mkl_shape_diff_scale;
+      mkl_shape_diff_scale.SetMklTensor(false);
+      AllocateOutputSetMklShape(context, diff_scale_index, &diff_scale_tensor,
+                                tf_shape_scale_shift, mkl_shape_diff_scale);
+
+      Tensor* diff_shift_tensor = nullptr;
+      MklDnnShape mkl_shape_diff_shift;
+      mkl_shape_diff_shift.SetMklTensor(false);
+      AllocateOutputSetMklShape(context, diff_shift_index, &diff_shift_tensor,
+                                tf_shape_scale_shift, mkl_shape_diff_shift);
+
+      // copy data: diff_scale and diff_shift
+      T* diff_weights_data_dnn = reinterpret_cast<T*>
+                                 (diff_weights_m.get_data_handle());
+      float* diff_scale_data_tf = const_cast<float*>(
+             static_cast<const float*>(diff_scale_tensor->flat<T>().data()));
+      float* diff_shift_data_tf = const_cast<float*>(
+             static_cast<const float*>(diff_shift_tensor->flat<T>().data()));
+      for (int i = 0; i < depth_; i++) {
+        diff_scale_data_tf[i] = diff_weights_data_dnn[i];
+        diff_shift_data_tf[i] = diff_weights_data_dnn[i + depth_];
+      }
+
+      // Placeholders for estimated_mean and estimated_variance, which are
+      // used for inference and thus not needed here for gradient computation.
+      Tensor* p1_tensor = nullptr, *p2_tensor = nullptr;
+      MklDnnShape mkl_shape_p;
+      mkl_shape_p.SetMklTensor(false);
+      AllocateOutputSetMklShape(context, p1_index, &p1_tensor,
+                                TensorShape({}), mkl_shape_p);
+      AllocateOutputSetMklShape(context, p2_index, &p2_tensor,
+                                TensorShape({}), mkl_shape_p);
+    } catch (mkldnn::error &e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                          ", message: " + string(e.message) +
+                          ", in file " + string(__FILE__) + ":" +
+                          std::to_string(__LINE__);
+      OP_REQUIRES_OK(context,
+                     errors::Aborted("Operation received an exception:",
+                     error_msg));
+    }
+  }
+
+ private:
+  T epsilon_;
+  TensorFormat tensor_format_;
+  int depth_;             // batch normalization is done for per channel.
+
+  void ExtractParams(OpKernelContext* context) {
+      const Tensor& input = MklGetInput(context, 0);
+      depth_ = static_cast<int>(GetTensorDim(input, tensor_format_, 'C'));
+  }
+
+  memory::dims GetMeanVarianceDims() {
+    return memory::dims({1, depth_});
+  }
+};
+
+#endif
+
+#define REGISTER_MKL_CPU(T)                                         \
+  REGISTER_KERNEL_BUILDER(Name("_MklFusedBatchNorm")                \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklFusedBatchNormOp<CPUDevice, T>);
+TF_CALL_float(REGISTER_MKL_CPU);
+#undef REGISTER_MKL_CPU
 
 #define REGISTER_MKL_CPU(T)                                         \
   REGISTER_KERNEL_BUILDER(Name("_MklFusedBatchNormGrad")            \
diff --git a/tensorflow/core/kernels/mkl_identity_op.cc b/tensorflow/core/kernels/mkl_identity_op.cc
index f31e7afd46873a02c10277283862a7e5e2384803..9ee27ee21c8d23c8ce314a7687ac9b79a1d9ea30 100644
--- a/tensorflow/core/kernels/mkl_identity_op.cc
+++ b/tensorflow/core/kernels/mkl_identity_op.cc
@@ -28,8 +28,15 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+#endif
+
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
+
+#ifndef INTEL_MKL_DNN
+
 template <typename Device, typename T>
 class MklIdentityOp : public OpKernel {
  public:
@@ -50,6 +57,32 @@ class MklIdentityOp : public OpKernel {
   bool IsExpensive() override { return false; }
 };
 
+#else
+
+template <typename Device, typename T>
+class MklIdentityOp : public OpKernel {
+ public:
+  explicit MklIdentityOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    MklDnnShape dnn_shape_input;
+    const int kInputIdx = 0, kOutputIdx = 0;
+    GetMklShape(context, kInputIdx, &dnn_shape_input);
+
+    if (dnn_shape_input.IsMklTensor()) {
+      ForwardMklTensorInToOut(context, kInputIdx, kOutputIdx);
+    } else {
+      ForwardTfTensorInToOut(context, kInputIdx, kOutputIdx);
+    }
+  }
+
+  // TensorFlow's IdentityOp has the following member function, so kept it
+  // as it is.
+  bool IsExpensive() override { return false; }
+};
+
+#endif
+
 #define REGISTER_MKL_CPU(T)                                         \
   REGISTER_KERNEL_BUILDER(Name("_MklIdentity")                      \
                               .Device(DEVICE_CPU)                   \
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index b58e44e39800c8c047d5557ab3c84113bb78d3ca..001834b13bdd64ffd0d536897fbc4a170c4c4117 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -31,6 +31,12 @@ limitations under the License.
 #include "tensorflow/core/kernels/mkl_tfconv_op.h"
 #include "tensorflow/core/util/mkl_util.h"
 
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+
+using mkldnn::stream;
+#endif
+
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
@@ -44,15 +50,16 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 // else if both inputs are in mkl format:
 //   if both have the same shape:
 //     pass the inputs through to the output
-// 	else:
-// 		convert both to TF
+//   else:
+//     convert both to TF
 // else if one is TF and one is MKL:
-// 	if broadcast is needed:
-// 		convert the MKL format input to TF format
-// 	else:
-// 		convert the TF format input to MKL format
+//   if broadcast is needed:
+//     convert the MKL format input to TF format
+//   else:
+//     convert the TF format input to MKL format
 ///////////////////////////////////////////////////////////
 
+#ifndef INTEL_MKL_DNN
 template <typename Device, typename T>
 class MklInputConversionOp : public OpKernel {
  public:
@@ -242,6 +249,199 @@ class MklInputConversionOp : public OpKernel {
   bool has_avx512f_ = false;
 };
 
+#else
+
+template <typename Device, typename T>
+class MklInputConversionOp : public OpKernel {
+ public:
+  explicit MklInputConversionOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES_OK(context, context->GetAttr("T", &op_data_type));
+    has_avx512f_ = port::TestCPUFeature(port::CPUFeature::AVX512F);
+  }
+
+ private:
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input_tensor_0 = MklGetInput(context, 0);
+    MklDnnShape input_shape_0;
+    GetMklShape(context, 0, &input_shape_0);
+
+    const Tensor& input_tensor_1 = MklGetInput(context, 1);
+    MklDnnShape input_shape_1;
+    GetMklShape(context, 1, &input_shape_1);
+
+    bool tf_shapes_are_same = context->input(0).shape() ==
+                              context->input(1).shape();
+
+    VLOG(1) << "MklInputConversionOp: Input shapes are "
+            << (tf_shapes_are_same ? "*same*" : "*different*") << ": "
+            << context->input(0).shape().DebugString() << " and "
+            << context->input(1).shape().DebugString();
+
+    // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    // if both inputs are in TF format, just copy input tensors to output.
+    if (!input_shape_0.IsMklTensor() && !input_shape_1.IsMklTensor()) {
+      VLOG(1) << "MklInputConversionOp: No conversion needed, "
+              << "copying TF inputs to output";
+
+      ForwardTfTensorInToOut(context, 0, 0);
+      ForwardTfTensorInToOut(context, 1, 1);
+      return;
+    }
+
+    // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    // If both inputs are in MKL format
+    if (input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
+      // If both have the same shape, pass them through
+      if (tf_shapes_are_same) {
+        VLOG(1) << "MklInputConversionOp: No conversion needed, "
+                << "copying MKL inputs with identical shapes to output";
+
+        ForwardMklTensorInToOut(context, 0, 0);
+        ForwardMklTensorInToOut(context, 1, 1);
+        return;
+      }
+
+      // Sanity check
+      bool mkl_shapes_are_same = input_shape_0 == input_shape_1;
+      if (mkl_shapes_are_same) {
+        CHECK(false) << "MklInputConversionOp: Unexpected: TF shapes are "
+                        "different but MKL shapes are same";
+      }
+
+      // Both have different shapes, so broadcast will be necessary.
+      // Convert to TF and pass both tensors through (we can't do broadcast
+      // with MKL tensors)
+      VLOG(1) << "MklInputConversionOp: Broadcast needed, "
+              << "converted MKL inputs to TF format";
+
+      MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
+                                           op_data_type, has_avx512f_, 0);
+      MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
+                                           op_data_type, has_avx512f_, 1);
+      SetDummyMklShapeOutput(context, 0);
+      SetDummyMklShapeOutput(context, 1);
+      return;
+    }
+
+    // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    // One input is MKL and one is TF. If no broadcast is needed, convert
+    // the TF tensor to MKL, otherwise convert the MKL tensor to TF format
+    VLOG(1) << "MklInputConversionOp: Inputs in different formats (MKL/TF)";
+
+    const Tensor* mkl_tensor;
+    const MklDnnShape* mkl_shape;
+    const Tensor* tf_tensor;
+    MklDnnShape* tf_mkl_shape;
+    uint mkl_tensor_index;
+    uint tf_tensor_index;
+    if (input_shape_0.IsMklTensor() && !input_shape_1.IsMklTensor()) {
+      mkl_tensor = &input_tensor_0;
+      mkl_shape = &input_shape_0;
+      mkl_tensor_index = 0;
+      tf_tensor = &input_tensor_1;
+      tf_mkl_shape = &input_shape_1;
+      tf_tensor_index = 1;
+    } else if (!input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
+      mkl_tensor = &input_tensor_1;
+      mkl_shape = &input_shape_1;
+      mkl_tensor_index = 1;
+      tf_tensor = &input_tensor_0;
+      tf_mkl_shape = &input_shape_0;
+      tf_tensor_index = 0;
+    } else {
+      CHECK(false) << "MklInputConversionOp: Unexpected combination of input "
+                      "shapes for MKL "
+                   << "element-wise op";
+    }
+
+    // Broadcast is needed if the shapes are not the same
+    bool broadcast_needed;
+
+    size_t in0_size = 1;
+    for (size_t i = 0; i < mkl_shape->GetDimension(); ++i)
+      in0_size *= mkl_shape->TfDimSize(i);
+
+    size_t in1_size = 1;
+    for (size_t i = 0; i < tf_tensor->shape().dims(); ++i)
+      in1_size *= tf_tensor->shape().dim_size(i);
+
+    broadcast_needed = (in0_size != in1_size);
+
+    if (!broadcast_needed) {
+      // Both shapes are same, convert the TF input to MKL
+      VLOG(1) << "MklInputConversionOp: No broadcast needed.";
+      VLOG(1) << "MklInputConversionOp: Converting input " << tf_tensor_index
+              << " to MKL format";
+
+      // Create MklDnnShape for output Mkl tensor.
+      Tensor* tensor_out;
+      MklDnnShape mkl_output_mkl_shape;
+      mkl_output_mkl_shape.SetMklTensor(true);
+      mkl_output_mkl_shape.SetElemType(MklDnnType<T>());
+      mkl_output_mkl_shape.SetTfLayout(mkl_shape->GetDimension(),
+                                       mkl_shape->GetSizesAsMklDnnDims(),
+                                       mkl_shape->GetTfDataFormat());
+      // ** Temporarily borrow the layout from the MKL input **
+      auto output_mkl_md = mkl_shape->GetMklLayout();
+      mkl_output_mkl_shape.SetMklLayout(&output_mkl_md);
+
+      // Create output Mkl tensor
+      AllocateOutputSetMklShape(context, tf_tensor_index, &tensor_out,
+                                mkl_tensor->shape(), mkl_output_mkl_shape);
+
+      // Create MklDnnData object for input tensor. Input tensor is in
+      // Tensorflow layout.
+      auto cpu_engine = engine(engine::cpu, 0);
+      MklDnnData<T> tf_input(&cpu_engine);
+      auto input_tf_md = mkl_output_mkl_shape.GetTfLayout();
+      tf_input.SetUsrMem(input_tf_md, &tf_tensor);
+
+      // Create reorder between tensorflow layout and Mkl layout.
+      std::vector<primitive> net;
+      CHECK_EQ(tf_input.CheckReorderToOpMem(memory::primitive_desc(
+                                            output_mkl_md, cpu_engine),
+                                            tensor_out, &net),
+               true);
+      stream(stream::kind::eager).submit(net).wait();
+
+      // -- The tensor in MKL format passes through --
+      ForwardMklTensorInToOut(context, mkl_tensor_index, mkl_tensor_index);
+    } else {
+      // Broadcast is needed, so convert the MKL input to TF
+      VLOG(1) << "MklInputConversionOp: Broadcast needed.";
+      VLOG(1) << "MklInputConversionOp: Converting input " << mkl_tensor_index
+              << " to TF format";
+      MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
+                                           op_data_type, has_avx512f_,
+                                           mkl_tensor_index);
+      SetDummyMklShapeOutput(context, mkl_tensor_index);
+
+      // The tensor in TF format passes through
+      ForwardTfTensorInToOut(context, tf_tensor_index, tf_tensor_index);
+    }
+
+    VLOG(1) << "MklInputConversionOp: Shapes (output): "
+            << context->mutable_output(0)->shape().DebugString() << " and "
+            << context->mutable_output(1)->shape().DebugString();
+
+    VLOG(1) << "MklInputConversion completed successfully.";
+  }
+
+ private:
+  /// Data format of the operation
+  string data_format_str;
+
+  /// Data type of the operation
+  DataType op_data_type;
+
+  /// CPUIDInfo
+  bool has_avx512f_ = false;
+};
+
+#endif
+
 ///////////////////////////////////////////////////////////
 //               Register kernel
 ///////////////////////////////////////////////////////////
@@ -253,7 +453,10 @@ class MklInputConversionOp : public OpKernel {
                               .Label(mkl_op_registry::kMklOpLabel), \
                           MklInputConversionOp<CPUDevice, T>);
 
-TF_CALL_NUMBER_TYPES(REGISTER_CPU);
+// TODO(nhasabni): We cannot support all number types since MklDnn does
+// not support types.
+// TF_CALL_NUMBER_TYPES(REGISTER_CPU);
+TF_CALL_float(REGISTER_CPU);
 #undef REGISTER_CPU
 }  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
index aa08e93924c588cfb5b4a22a20055e5c74a43b3a..227765e46d649eb0637f8e31a2ea4a0bf90f0c1a 100644
--- a/tensorflow/core/kernels/mkl_lrn_op.cc
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -17,7 +17,7 @@ limitations under the License.
 // See docs in ../ops/nn_ops.cc. This opkernel uses MKL library, create MKL
 // layout and primitives, use MKL dnn primitives to compute local
 // response normalization
-
+#undef INTEL_MKL
 #ifdef INTEL_MKL
 
 #define EIGEN_USE_THREADS
diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc
index 846bb5710ded92c303567e4078c49a56b3746706..de4d7d2e729e0b1dec876ec6f7915acd88bf9167 100644
--- a/tensorflow/core/kernels/mkl_maxpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc
@@ -16,17 +16,32 @@ limitations under the License.
 // See docs in ../ops/nn_ops.cc.
 #ifdef INTEL_MKL
 #define EIGEN_USE_THREADS
-
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/padding.h"
 
+#ifdef INTEL_MKL_DNN
+#include <algorithm>
+#include "mkldnn.hpp"
+using mkldnn::memory;
+using mkldnn::error;
+using mkldnn::pooling_forward;
+using mkldnn::pooling_backward;
+using mkldnn::padding_kind;
+using mkldnn::engine;
+using mkldnn::prop_kind;
+using mkldnn::algorithm;
+#endif
+
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
+// For now, MKL-ML is default. So making MKL-DNN not a default choice.
+#ifndef INTEL_MKL_DNN
+
 // An implementation of MaxPooling (forward).
 template <typename Device, typename T>
 class MklMaxPoolingOp : public OpKernel {
@@ -475,8 +490,348 @@ class MklMaxPoolingGradOp : public OpKernel {
   TensorFormat data_format_;
 
   bool workspace_enabled_;
+};  // MklMaxPoolingGradOp
+
+#else  // INTEL_MKL_DNN is defined
+
+// An implementation of MaxPooling (forward).
+template <typename Device, typename T>
+class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
+ public:
+  explicit MklMaxPoolingOp(OpKernelConstruction* context)
+            : MklPoolingForwardOpBase<T>(context) {
+    // In Max Pooling, MKLDNN does not allow passing workspace as NULL.
+    // So we set workspace_enabled_ to true.
+    this->workspace_enabled_ = true;
+  }
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      const Tensor& input_tensor = MklGetInput(context,
+                this->kInputTensorIndexInput);
+      MklDnnShape dnn_shape_input;
+      GetMklShape(context, this->kInputTensorIndexInput, &dnn_shape_input);
+      this->SanityCheckInput(context, input_tensor, dnn_shape_input);
+      if (!context->status().ok()) return;
+
+      MklDnnData<T> dnn_data_input(&cpu_engine);
+      MklDnnData<T> dnn_data_output(&cpu_engine);
+      MklDnnData<T> dnn_data_wksp(&cpu_engine);
+
+      // initialize variables for the pooling op
+      MklPoolParameters pool_params;
+      // Get the input tensor and initialize the pooling parameters
+      this->ConfigureInput(context, dnn_shape_input,
+                        input_tensor, &pool_params,
+                        &dnn_data_input);
+      OP_REQUIRES_OK(context, context->status());
+
+      // Declare output tensor
+      Tensor* output_tensor = nullptr;
+      memory::dims output_dims_mkl_order;
+      this->GetOutputDims(pool_params, &output_dims_mkl_order);
+
+      // If input is in Mkl layout, then just get the memory format from it
+      // directly, instead of using input data_format to MaxPool.
+      if (dnn_shape_input.IsMklTensor()) {
+        dnn_data_output.SetUsrMem(output_dims_mkl_order,
+                                  static_cast<memory::format>(
+              dnn_data_input.GetUsrMemDesc().data.format));
+      } else {
+        dnn_data_output.SetUsrMem(output_dims_mkl_order,
+                                  this->data_format_mkldnn_);
+      }
+
+      // describe the memory layout; let mkl-dnn choose the best for the op
+      dnn_data_output.SetOpMemDesc(output_dims_mkl_order, memory::format::any);
+
+      auto pool_desc = pooling_forward::desc(prop_kind::forward,
+            algorithm::pooling_max,
+            dnn_data_input.GetUsrMemDesc(),
+            dnn_data_output.GetUsrMemDesc(),
+            memory::dims({  pool_params.row_stride,
+                            pool_params.col_stride}),
+            memory::dims({  pool_params.window_rows,
+                            pool_params.window_cols}),
+            memory::dims({  static_cast<int>(pool_params.pad_top),
+                            static_cast<int>(pool_params.pad_left)}),
+            memory::dims({  static_cast<int>(pool_params.pad_bottom),
+                            static_cast<int>(pool_params.pad_right)}),
+            TFPaddingToMklDnnPadding(this->padding_));
+        auto pool_fwd_desc = pooling_forward::primitive_desc(pool_desc,
+            cpu_engine);
+
+      this->AllocateOutputTensor(context, pool_fwd_desc, output_dims_mkl_order,
+                            this->data_format_mkldnn_, &output_tensor);
+      OP_REQUIRES_OK(context, context->status());
+      dnn_data_output.SetUsrMemDataHandle(output_tensor);
+
+      AllocateWorkspaceTensor(context, pool_fwd_desc, &dnn_data_wksp);
+      OP_REQUIRES_OK(context, context->status());
+
+      this->PrepareAndExecuteNet(pool_fwd_desc, &dnn_data_input,
+                        &dnn_data_output, &dnn_data_wksp);
+    } catch (mkldnn::error &e) {
+        string error_msg = "Status: " + std::to_string(e.status) +
+                        ", message: " + string(e.message) +
+                        ", in file " + string(__FILE__) + ":" +
+                        std::to_string(__LINE__);
+        OP_REQUIRES_OK(context,
+                        errors::Aborted("Compute received an exception:",
+                                         error_msg));
+    }
+  }  // Compute
+
+ private:
+    const int kOutputTensorIndexWorkspace = 1;
+
+    void AllocateWorkspaceTensor(OpKernelContext* context,
+                const pooling_forward::primitive_desc& pool_fwd_prim_desc,
+                MklDnnData<T>* dnn_data_wksp) {
+        CHECK_NOTNULL(dnn_data_wksp);
+        Tensor* workspace_tensor = nullptr;
+        memory::primitive_desc workspace_pd
+                    = pool_fwd_prim_desc.workspace_primitive_desc();
+        size_t workspace_t_elems = this->GetNumTElements(workspace_pd);
+        MklDnnShape workspace_mkl_shape;
+        workspace_mkl_shape.SetMklTensor(false);
+        TensorShape workspace_tf_shape;
+        workspace_tf_shape.AddDim(workspace_t_elems);
+        AllocateOutputSetMklShape(context, kOutputTensorIndexWorkspace,
+                                &workspace_tensor,
+                                workspace_tf_shape, workspace_mkl_shape);
+        CHECK_NOTNULL(workspace_tensor);
+        dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor);
+    }
 };
 
+// The operation to compute MaxPool gradients.
+// It takes three inputs:
+//   - The original input tensor
+//   - The original output tensor
+//   - Backprop tensor for output
+// It produces one output: backprop tensor for input.
+template <class Device, class T>
+class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase<T> {
+ public:
+  explicit MklMaxPoolingGradOp(OpKernelConstruction* context)
+      : MklPoolingBackwardOpBase<T>(context) {
+  }
+
+  void Compute(OpKernelContext* context) override {
+    try {
+        auto cpu_engine = engine(engine::cpu, 0);
+        const Tensor& orig_input_tensor = MklGetInput(context,
+            kInputTensorIndexOrigInput);
+        const Tensor& orig_output_tensor = MklGetInput(context,
+            kInputTensorIndexOrigOutput);
+        const Tensor& grad_tensor = MklGetInput(context,
+            kInputTensorIndexGradient);
+        const Tensor& workspace_tensor = MklGetInput(context,
+            kInputTensorIndexWorkspace);
+        MklDnnShape orig_input_mkl_shape,
+                    orig_output_mkl_shape,
+                    grad_mkl_shape,
+                    workspace_mkl_shape;
+        GetMklShape(context, kInputTensorIndexOrigInput,
+            &orig_input_mkl_shape);
+        GetMklShape(context, kInputTensorIndexOrigOutput,
+            &orig_output_mkl_shape);
+        GetMklShape(context, kInputTensorIndexGradient,
+            &grad_mkl_shape);
+        GetMklShape(context, kInputTensorIndexWorkspace,
+            &workspace_mkl_shape);
+
+        SanityCheckInputs(context,
+                            orig_input_tensor, orig_output_tensor,
+                            grad_tensor, workspace_tensor,
+                            orig_input_mkl_shape, orig_output_mkl_shape,
+                            grad_mkl_shape, workspace_mkl_shape);
+        if (!context->status().ok()) return;
+
+        MklDnnData<T> grad_dnn_data(&cpu_engine);
+        MklDnnData<T> workspace_dnn_data(&cpu_engine);
+        MklDnnData<T> output_dnn_data(&cpu_engine);
+        Tensor* output_tensor = nullptr;
+        MklPoolParameters pool_params;
+        TensorShape orig_input_shape;
+        memory::dims output_dims_mkl_order, orig_input_dims_mkl_order;
+        memory::desc original_input_md = ConfigureOriginalInput(context,
+                                orig_input_tensor,
+                                orig_input_mkl_shape,
+                                &orig_input_dims_mkl_order,
+                                &pool_params,
+                                &orig_input_shape);
+
+        memory::desc original_output_md = this->ConfigureOriginalOutput(
+                                pool_params,
+                                orig_output_mkl_shape,
+                                output_dims_mkl_order);
+
+        memory::desc target_diff_dst_md =  this->ConfigureInputGradient(
+                                        grad_mkl_shape,
+                                        grad_tensor,
+                                        &grad_dnn_data,
+                                        original_output_md);
+
+        output_dnn_data.SetUsrMem(original_input_md);
+
+        // Create the forward pooling primitive descriptor so we can
+        // pass it as a hint to the backward pooling primitive descriptor
+        auto pool_fwd_desc = pooling_forward::desc(prop_kind::forward,
+                algorithm::pooling_max,
+                original_input_md,
+                original_output_md,
+                memory::dims({  pool_params.row_stride,
+                                pool_params.col_stride}),
+                memory::dims({  pool_params.window_rows,
+                                pool_params.window_cols}),
+                memory::dims({  static_cast<int>(pool_params.pad_top),
+                                static_cast<int>(pool_params.pad_left)}),
+                memory::dims({  static_cast<int>(pool_params.pad_bottom),
+                                static_cast<int>(pool_params.pad_right)}),
+                TFPaddingToMklDnnPadding(this->padding_));
+        auto pool_fwd_prim_desc
+                = pooling_forward::primitive_desc(pool_fwd_desc,
+                                                    cpu_engine);
+
+        auto pool_bkwd_desc = pooling_backward::desc(
+                algorithm::pooling_max,
+                output_dnn_data.GetUsrMemDesc(),
+                target_diff_dst_md,
+                memory::dims({  pool_params.row_stride,
+                                pool_params.col_stride}),
+                memory::dims({  pool_params.window_rows,
+                                pool_params.window_cols}),
+                memory::dims({  static_cast<int>(pool_params.pad_top),
+                                static_cast<int>(pool_params.pad_left)}),
+                memory::dims({  static_cast<int>(pool_params.pad_bottom),
+                                static_cast<int>(pool_params.pad_right)}),
+                TFPaddingToMklDnnPadding(this->padding_));
+        auto pool_bkwd_prim_desc
+            = pooling_backward::primitive_desc(pool_bkwd_desc,
+                                                cpu_engine,
+                                                pool_fwd_prim_desc);
+
+        this->AllocateOutputTensor(context, pool_bkwd_prim_desc,
+            orig_input_dims_mkl_order,
+            this->data_format_mkldnn_,
+            &output_tensor);
+        output_dnn_data.SetUsrMemDataHandle(output_tensor);
+
+        ConfigureWorkspace(workspace_tensor,
+                pool_fwd_prim_desc.workspace_primitive_desc(),
+                &workspace_dnn_data);
+        this->PrepareAndExecuteNet(pool_bkwd_prim_desc,
+                            &grad_dnn_data,
+                            &output_dnn_data,
+                            memory::primitive_desc(
+                                target_diff_dst_md,
+                                cpu_engine),
+                            &workspace_dnn_data);
+    } catch (mkldnn::error &e) {
+        string error_msg = "Status: " + std::to_string(e.status) +
+                        ", message: " + string(e.message) +
+                        ", in file " + string(__FILE__) + ":" +
+                        std::to_string(__LINE__);
+        OP_REQUIRES_OK(context,
+                        errors::Aborted("Compute received an exception:",
+                                         error_msg));
+    }
+  }  // Compute
+
+ private:
+    // .Input("orig_input: T")
+    // .Input("orig_output: T")
+    // .Input("grad: T")
+    // .Input("workspace: T")
+    const int kInputTensorIndexOrigInput = 0;
+    const int kInputTensorIndexOrigOutput = 1;
+    const int kInputTensorIndexGradient = 2;
+    const int kInputTensorIndexWorkspace = 3;
+    //  Output("output: T") in Base Class
+
+    memory::desc ConfigureOriginalInput(OpKernelContext* context,
+                                const Tensor& tensor_original_input,
+                                const MklDnnShape& original_input_mkl_shape,
+                                memory::dims* original_input_dims_mkl_order,
+                                MklPoolParameters* pool_params,
+                                TensorShape* input_tensor_shape) {
+        *input_tensor_shape = tensor_original_input.shape();
+        return MklPoolingBackwardOpBase<T>::ConfigureOriginalInput(
+                                        context,
+                                        tensor_original_input,
+                                        original_input_mkl_shape,
+                                        original_input_dims_mkl_order,
+                                        pool_params,
+                                        *input_tensor_shape);
+    }
+
+    void ConfigureWorkspace(const Tensor& workspace_tensor,
+                        memory::primitive_desc workspace_pd,
+                        MklDnnData<T> *workspace_dnn_data) {
+        CHECK_NOTNULL(workspace_dnn_data);
+
+        workspace_dnn_data->SetUsrMem(workspace_pd, &workspace_tensor);
+    }
+
+    void SanityCheckInputs(OpKernelContext* context,
+                            const Tensor& orig_input_tensor,
+                            const Tensor& orig_output_tensor,
+                            const Tensor& grad_tensor,
+                            const Tensor& workspace_tensor,
+                            const MklDnnShape& orig_input_mkl_shape,
+                            const MklDnnShape& orig_output_mkl_shape,
+                            const MklDnnShape& grad_mkl_shape,
+                            const MklDnnShape& workspace_mkl_shape) {
+        if (!orig_input_mkl_shape.IsMklTensor()) {
+            OP_REQUIRES(context, orig_input_tensor.dims() == 4,
+                errors::InvalidArgument("Original input shape must be "
+                "4-dimensional"));
+        } else {
+            OP_REQUIRES(context, orig_input_mkl_shape.GetDimension() == 4,
+                    errors::InvalidArgument("Original input shape must be "
+                    "4-dimensional"));
+        }
+        if (!orig_output_mkl_shape.IsMklTensor()) {
+            OP_REQUIRES(context, orig_output_tensor.dims() == 4,
+                errors::InvalidArgument("Original output must be "
+                        "4-dimensional"));
+        } else {
+            OP_REQUIRES(context, orig_output_mkl_shape.GetDimension() == 4,
+                    errors::InvalidArgument("Original output must be "
+                    "4-dimensional"));
+        }
+        if (!grad_mkl_shape.IsMklTensor()) {
+            OP_REQUIRES(context, grad_tensor.dims() == 4,
+                errors::InvalidArgument("Gradient must be 4-dimensional"));
+        } else {
+            OP_REQUIRES(context, grad_mkl_shape.GetDimension() == 4,
+                    errors::InvalidArgument("Gradient must be "
+                    "4-dimensional"));
+        }
+        if (this->workspace_enabled_){
+            // The workspace should not be an MKL tensor
+            OP_REQUIRES(context, workspace_mkl_shape.IsMklTensor() == false,
+                    errors::InvalidArgument("Workspace tensor should not"
+                                            " be an MKL Tensor."));
+            // It should only have one dimension
+            OP_REQUIRES(context, workspace_tensor.dims() == 1,
+                    errors::InvalidArgument("Workspace tensor must be "
+                                "1-dimensional"));
+        } else {
+            OP_REQUIRES(context, this->workspace_enabled_,
+                    errors::Unimplemented("MKL-DNN Max Pooling does not "
+                                "yet support the use case "
+                                "where MaxPoolGrad is called without first"
+                                " calling MaxPool."));
+        }
+    }
+};  // MklMaxPoolingGradOp
+
+#endif  // INTEL_MKL_DNN
+
 REGISTER_KERNEL_BUILDER(Name("_MklMaxPool")
                             .Device(DEVICE_CPU)
                             .TypeConstraint<float>("T")
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
index 65e8852cfb11a2dd78395860a7ca7b2cc550be34..f7cadffd39c11bdedaca6a07e48f222e7ac5e0cb 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
@@ -14,10 +14,13 @@ limitations under the License.
 ==============================================================================*/
 
 #ifdef INTEL_MKL
+
 #include <vector>
+#include <limits>
 #include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 
@@ -39,6 +42,7 @@ void MklPoolParameters::Init(OpKernelContext* context,
   Init(context, ksize, stride, padding, data_format);
 }
 
+#ifndef INTEL_MKL_DNN
 // Initialization for MKL format
 void MklPoolParameters::Init(OpKernelContext* context,
                              const std::vector<int32>& ksize,
@@ -53,7 +57,22 @@ void MklPoolParameters::Init(OpKernelContext* context,
 
   Init(context, ksize, stride, padding, data_format);
 }
+#else
+// Initialization for MKL format
+void MklPoolParameters::Init(OpKernelContext* context,
+                             const std::vector<int32>& ksize,
+                             const std::vector<int32>& stride, Padding padding,
+                             TensorFormat data_format,
+                             const MklDnnShape* mklInputShape) {
+  // Get the input sizes
+  depth = mklInputShape->GetDimension('C');
+  tensor_in_cols = mklInputShape->GetDimension('W');
+  tensor_in_rows = mklInputShape->GetDimension('H');
+  tensor_in_batch = mklInputShape->GetDimension('N');
 
+  Init(context, ksize, stride, padding, data_format);
+}
+#endif  // INTEL_MKL_DNN
 // Common Initialization for TensorFlow and MKL formats
 void MklPoolParameters::Init(OpKernelContext* context,
                              const std::vector<int32>& ksize,
@@ -80,7 +99,7 @@ void MklPoolParameters::Init(OpKernelContext* context,
                   "MaxPooling supports exactly one of pooling across depth "
                   "or pooling across width/height."));
 
-  if (depth_window == 1) {
+  if (depth_window == 1) {  // we are pooling in the H and W
     OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
                                 tensor_in_rows, window_rows, row_stride,
                                 padding, &out_height, &pad_top, &pad_bottom));
@@ -88,7 +107,21 @@ void MklPoolParameters::Init(OpKernelContext* context,
     OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
                                 tensor_in_cols, window_cols, col_stride,
                                 padding, &out_width, &pad_left, &pad_right));
-  } else {
+#ifdef INTEL_MKL_DNN
+    // TF can work with int64, but mkldnn only supports int32
+    // Fail if the height or width are greater than MAX_INT
+
+    OP_REQUIRES(context, FastBoundsCheck(out_height,
+                                         std::numeric_limits<int>::max()),
+                errors::InvalidArgument("output height is too large"));
+
+    OP_REQUIRES(context, FastBoundsCheck(out_width,
+                                         std::numeric_limits<int>::max()),
+                errors::InvalidArgument("output width is too large"));
+
+#endif
+    out_depth = depth;  // output will have the same depth as the input
+  } else {  // we are pooling in the depth dimension
     // Our current version of depthwise max pooling does not support
     // any padding, and expects the depth_window to equal the depth
     // stride (no overlapping).
@@ -109,7 +142,6 @@ void MklPoolParameters::Init(OpKernelContext* context,
                 errors::Unimplemented("Depthwise max pooling is currently "
                                       "only implemented for CPU devices."));
 
-    pad_depth = 0;
     out_depth = depth / depth_window;
   }
 }
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h
index 92ea2beb25aa1fd4cab7fd787b04c4d086ca1b05..d33e91a15dcba948ad5279ea848b5d1a7cd9b119 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.h
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h
@@ -18,9 +18,18 @@ limitations under the License.
 
 #ifdef INTEL_MKL
 #include <vector>
+#include <string>
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/padding.h"
 
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+using mkldnn::memory;
+using mkldnn::pooling_forward;
+using mkldnn::pooling_backward;
+using mkldnn::stream;
+#endif
+
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -51,14 +60,28 @@ struct MklPoolParameters {
   int pad_depth;
 
   TensorFormat data_format;
+  MklPoolParameters()
+    : depth(0)
+    , tensor_in_cols(0), tensor_in_rows(0), tensor_in_batch(0)
+    , window_rows(0), window_cols(0), depth_window(0)
+    , row_stride(0), col_stride(0), depth_stride(0)
+    , out_height(0), out_width(0), out_depth(0)
+    , pad_left(0), pad_right(0), pad_top(0), pad_bottom(0), pad_depth(0)
+    , data_format(TensorFormat::FORMAT_NCHW) {}
 
   // Updates context->status if there is an invalid input.
   void Init(OpKernelContext* context, const std::vector<int32>& ksize,
             const std::vector<int32>& stride, Padding padding,
             TensorFormat data_format, const TensorShape& tensor_in_shape);
+#ifndef INTEL_MKL_DNN
   void Init(OpKernelContext* context, const std::vector<int32>& ksize,
             const std::vector<int32>& stride, Padding padding,
             TensorFormat data_format, const MklShape* mkl_in_shape);
+#else
+  void Init(OpKernelContext* context, const std::vector<int32>& ksize,
+            const std::vector<int32>& stride, Padding padding,
+            TensorFormat data_format, const MklDnnShape* mkl_in_shape);
+#endif
 
  private:
   // Common initialization for TensorFlow and MKL formats
@@ -67,6 +90,325 @@ struct MklPoolParameters {
             TensorFormat data_format);
 };
 
+#ifdef INTEL_MKL_DNN
+
+template <class T>
+class MklPoolingOpBase : public OpKernel {
+ public:
+  explicit MklPoolingOpBase(OpKernelConstruction* context)
+            : OpKernel(context)
+            , workspace_enabled_(false) {
+      string data_format;
+      OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+      OP_REQUIRES(context,
+            FormatFromString(data_format, &this->data_format_tf_),
+            errors::InvalidArgument("Invalid data format"));
+      this->data_format_mkldnn_
+                = TFDataFormatToMklDnnDataFormat(this->data_format_tf_);
+      OP_REQUIRES_OK(context, context->GetAttr("ksize", &this->ksize_));
+      OP_REQUIRES(context, this->ksize_.size() == 4,
+                  errors::InvalidArgument("Sliding window ksize field must "
+                                          "specify 4 dimensions"));
+      OP_REQUIRES_OK(context, context->GetAttr("strides", &this->stride_));
+      OP_REQUIRES(context, this->stride_.size() == 4,
+                  errors::InvalidArgument("Sliding window strides field must "
+                                          "specify 4 dimensions"));
+      OP_REQUIRES_OK(context, context->GetAttr("padding", &this->padding_));
+      OP_REQUIRES(context, this->ksize_[0] == 1 && this->stride_[0] == 1,
+                  errors::Unimplemented("Pooling is not yet supported on the "
+                                        "batch dimension."));
+
+      // We may not get this attribute for this node if it does not go through
+      // graph rewrite pass. So we do not check for error while retrieving this
+      // attribute value.
+      context->GetAttr("workspace_enabled", &this->workspace_enabled_);
+    }
+  void Compute(OpKernelContext* context) override = 0;
+
+ protected:
+  // Calculate output shape of pooling op in MKL-DNN and TensorFlow order.
+  // MKL-DNN uses NCHW for output order. But TensorFlow output will be in
+  // NHWC or NCHW format depending on data format. Function expects
+  // output height and output width to have already been int32
+  // bounds-checked
+  void GetOutputDims(const MklPoolParameters& mkl_pool_params,
+                    memory::dims* output_dims_mkl_order) {
+    // MKL-DNN always needs output in NCHW format.
+    *output_dims_mkl_order = { mkl_pool_params.tensor_in_batch,
+                              mkl_pool_params.out_depth,
+                              static_cast<int>(mkl_pool_params.out_height),
+                              static_cast<int>(mkl_pool_params.out_width)};
+  }
+
+  void InitMklPoolParameters(OpKernelContext* context,
+                      MklPoolParameters* pool_params,
+                      const MklDnnShape& original_input_mkl_shape,
+                      const TensorShape& input_tensor_shape) {
+    if (!original_input_mkl_shape.IsMklTensor()) {
+      pool_params->Init(context, this->ksize_, this->stride_, this->padding_,
+          this->data_format_tf_, input_tensor_shape);
+    } else {
+      pool_params->Init(context, this->ksize_, this->stride_, this->padding_,
+          this->data_format_tf_, &original_input_mkl_shape);
+    }
+  }
+
+  // Checks to make sure that the memory we need to allocate
+  // is a multiple of sizeof(T)
+  // returns the number of elements
+  size_t GetNumTElements(const memory::primitive_desc& pd) {
+    size_t num_bytes = pd.get_size();
+    size_t ret_val = num_bytes / sizeof(T);
+    if ( num_bytes % sizeof(T) != 0 ) {
+        ret_val++;
+    }
+    return ret_val;
+  }
+
+
+  std::vector<int32> ksize_;
+  std::vector<int32> stride_;
+  Padding padding_;
+  TensorFormat data_format_tf_;
+  memory::format data_format_mkldnn_;
+  bool workspace_enabled_;
+};
+
+template <class T>
+class MklPoolingForwardOpBase : public MklPoolingOpBase<T> {
+ public:
+  explicit MklPoolingForwardOpBase<T>(OpKernelConstruction* context)
+      : MklPoolingOpBase<T>(context) {}
+  void Compute(OpKernelContext* context) override = 0;
+
+ protected:
+  void ConfigureInput(OpKernelContext* context,
+                    const MklDnnShape& input_mkl_shape,
+                    const Tensor& input_tensor,
+                    MklPoolParameters* pool_params,
+                    MklDnnData<T>* dnn_data_input) {
+    CHECK_NOTNULL(pool_params);
+    CHECK_NOTNULL(dnn_data_input);
+    TensorShape input_tensor_shape = input_tensor.shape();
+    memory::desc input_md = input_mkl_shape.IsMklTensor()
+                        ? input_mkl_shape.GetMklLayout()
+                        : memory::desc(
+                              TFShapeToMklDnnDimsInNCHW(
+                                  input_tensor_shape, this->data_format_tf_),
+                              MklDnnType<T>(),
+                              this->data_format_mkldnn_);
+    dnn_data_input->SetUsrMem(input_md, &input_tensor);
+    this->InitMklPoolParameters(context, pool_params,
+                      input_mkl_shape, input_tensor_shape);
+  }
+
+  void AllocateOutputTensor(OpKernelContext* context,
+            const pooling_forward::primitive_desc& pool_fwd_prim_desc,
+            const memory::dims output_dims_mkl_order,
+            const memory::format& output_tf_format,
+            Tensor** output_tensor) {
+    CHECK_NOTNULL(output_tensor);
+    memory::primitive_desc dst_pd = pool_fwd_prim_desc.dst_primitive_desc();
+
+    MklDnnShape output_mkl_shape;
+    output_mkl_shape.SetMklTensor(true);
+    output_mkl_shape.SetMklLayout(&dst_pd);
+    output_mkl_shape.SetElemType(MklDnnType<T>());
+    output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
+                              output_dims_mkl_order,
+                              output_tf_format);
+    TensorShape output_tf_shape;
+
+    // only allocate enough space for the elements we need.
+    output_tf_shape.AddDim(this->GetNumTElements(dst_pd));
+    AllocateOutputSetMklShape(context, kOutputTensorIndexOutput,
+                            output_tensor,
+                            output_tf_shape, output_mkl_shape);
+    CHECK_NOTNULL(*output_tensor);
+  }
+
+  void PrepareAndExecuteNet(
+                  const pooling_forward::primitive_desc& pool_fwd_desc,
+                  const MklDnnData<T>* src,
+                  MklDnnData<T>* dst,
+                  MklDnnData<T>* wksp = nullptr) {
+    std::vector<primitive> net;
+
+    // Create pooling primitive and add it to net
+    if (wksp != nullptr) {
+        net.push_back(pooling_forward(pool_fwd_desc,
+                        src->GetOpMem(),
+                        dst->GetOpMem(),
+                        wksp->GetOpMem()));
+    } else {
+        net.push_back(pooling_forward(pool_fwd_desc,
+            src->GetOpMem(),
+            dst->GetOpMem()));
+    }
+    stream(stream::kind::eager).submit(net).wait();
+  }
+
+
+  void SanityCheckInput(OpKernelContext* context,
+                  const Tensor& input_tensor,
+                  const MklDnnShape& input_mkl_shape) {
+    if (!input_mkl_shape.IsMklTensor()) {
+      OP_REQUIRES(context, input_tensor.dims() == 4,
+          errors::InvalidArgument("Input must be 4-dimensional"));
+    } else {
+        OP_REQUIRES(context, input_mkl_shape.GetDimension() == 4,
+                errors::InvalidArgument("Input shape must be "
+                "4-dimensional"));
+    }
+  }
+  // .Input("value: T")
+  // .Output("output: T")
+  const int kInputTensorIndexInput = 0;
+  const int kOutputTensorIndexOutput = 0;
+};  // MklPoolingForwardBaseOp
+
+
+template <class T>
+class MklPoolingBackwardOpBase : public MklPoolingOpBase<T> {
+ public:
+  explicit MklPoolingBackwardOpBase<T>(OpKernelConstruction* context)
+          : MklPoolingOpBase<T>(context) { }
+  void Compute(OpKernelContext* context) override = 0;
+
+ protected:
+  const int kOutputTensorIndexOutput = 0;
+
+  void AllocateOutputTensor(OpKernelContext* context,
+            const pooling_backward::primitive_desc& pool_bkwd_prim_desc,
+            const memory::dims output_dims_mkl_order,
+            const memory::format& output_tf_format,
+            Tensor** output_tensor) {
+    CHECK_NOTNULL(output_tensor);
+    memory::primitive_desc dst_pd
+                = pool_bkwd_prim_desc.diff_src_primitive_desc();
+    MklDnnShape output_mkl_shape;
+    output_mkl_shape.SetMklTensor(true);
+    output_mkl_shape.SetMklLayout(&dst_pd);
+    output_mkl_shape.SetElemType(MklDnnType<T>());
+    output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
+                              output_dims_mkl_order,
+                              output_tf_format);
+
+    TensorShape output_tf_shape;
+    output_tf_shape.AddDim(this->GetNumTElements(dst_pd));
+    AllocateOutputSetMklShape(context, kOutputTensorIndexOutput,
+                            output_tensor,
+                            output_tf_shape, output_mkl_shape);
+    CHECK_NOTNULL(*output_tensor);
+  }
+
+  void PrepareAndExecuteNet(
+    const pooling_backward::primitive_desc& pool_bkwd_desc,
+    MklDnnData<T>* input_gradient_diff_dst,
+    MklDnnData<T>* output_diff_src,
+    const memory::primitive_desc& target_diff_dst_pd,
+    const MklDnnData<T>* workspace = nullptr) {
+
+    std::vector<primitive> net;
+
+    // If the input gradient isn't in the same format as the output
+    // reorder it to the same format as the output
+    input_gradient_diff_dst->CheckReorderToOpMem(
+            target_diff_dst_pd,
+            &net);
+
+    // Create pooling primitive and add it to net
+    if (nullptr == workspace) {
+      net.push_back(pooling_backward(pool_bkwd_desc,
+                              input_gradient_diff_dst->GetOpMem(),
+                              output_diff_src->GetOpMem()));
+    } else {
+      net.push_back(pooling_backward(pool_bkwd_desc,
+                                  input_gradient_diff_dst->GetOpMem(),
+                                  workspace->GetOpMem(),
+                                  output_diff_src->GetOpMem()));
+    }
+    stream(stream::kind::eager).submit(net).wait();
+  }
+
+  // Max Pooling and Avg Pooling have slightly different implementations
+  // Takes the Tensor containing original input data and the original
+  // mkl Dnn Shape and populates other data
+  memory::desc ConfigureOriginalInput(OpKernelContext* context,
+                              const Tensor& tensor_original_input_shape,
+                              const MklDnnShape& original_input_mkl_shape,
+                              memory::dims* original_input_dims_nchw,
+                              MklPoolParameters* pool_params,
+                              const TensorShape& input_tensor_shape) {
+    CHECK_NOTNULL(original_input_dims_nchw);
+    CHECK_NOTNULL(pool_params);
+    this->InitMklPoolParameters(context, pool_params,
+                          original_input_mkl_shape,
+                          input_tensor_shape);
+
+    *original_input_dims_nchw
+          = original_input_mkl_shape.IsMklTensor()
+          ? original_input_mkl_shape.GetSizesAsMklDnnDims()
+          : TFShapeToMklDnnDimsInNCHW(input_tensor_shape,
+        this->data_format_tf_);
+
+    return  original_input_mkl_shape.IsMklTensor()
+      ? original_input_mkl_shape.GetMklLayout()
+      : memory::desc(*original_input_dims_nchw,
+                      MklDnnType<T>(),
+                      this->data_format_mkldnn_);
+  }
+
+  memory::desc ConfigureOriginalOutput(const MklPoolParameters& pool_params,
+                                const MklDnnShape& original_output_mkl_shape,
+                                      memory::dims output_dims_mkl_order) {
+    this->GetOutputDims(pool_params, &output_dims_mkl_order);
+
+    return original_output_mkl_shape.IsMklTensor()
+            ? original_output_mkl_shape.GetMklLayout()
+            : memory::desc(output_dims_mkl_order,
+                         MklDnnType<T>(),
+                         this->data_format_mkldnn_);
+  }
+
+  memory::desc ConfigureInputGradient(
+        const MklDnnShape& input_gradient_mkl_shape,
+        const Tensor& input_gradient_tensor,
+        MklDnnData<T>* input_gradient_dnn_data,
+        const memory::desc& original_output_md) {
+    // Configure the gradient as is
+    memory::desc original_input_grad_md
+          = input_gradient_mkl_shape.IsMklTensor()
+          ? input_gradient_mkl_shape.GetMklLayout()
+          : memory::desc(TFShapeToMklDnnDimsInNCHW(
+                    input_gradient_tensor.shape(),
+                    this->data_format_tf_),
+                    MklDnnType<T>(), this->data_format_mkldnn_);
+
+    input_gradient_dnn_data->SetUsrMem(original_input_grad_md,
+                &input_gradient_tensor);
+
+    // Check to see if input grad diff dst is in the right format
+    // Create a new memory descriptor with the same shape as the
+    // original, but the format of the other tensors.
+    memory::format original_output_format =
+            static_cast<memory::format>(original_output_md.data.format);
+    bool grad_reorder_needed = input_gradient_dnn_data->IsReorderNeeded(
+                                    original_output_format);
+    memory::dims diff_dst_dims = input_gradient_mkl_shape.IsMklTensor()
+        ? input_gradient_mkl_shape.GetSizesAsMklDnnDims()
+        : TFShapeToMklDnnDimsInNCHW(input_gradient_tensor.shape(),
+                    this->data_format_tf_);
+    memory::desc target_diff_dst_md = memory::desc(diff_dst_dims,
+        MklDnnType<T>(), original_output_format);
+
+    return grad_reorder_needed
+            ? target_diff_dst_md
+            : original_input_grad_md;
+  }
+};
+#endif  // INTEL_MKL_DNN
+
 //-------------------------------------------------------------------
 // Utility functions
 
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index 86a77d769a52d7592d15627b504ae60278b45058..45bdd0ad5cbab6c806f6c008f0d2642c4845cbc2 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -28,6 +28,19 @@ limitations under the License.
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
 
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+
+using mkldnn::stream;
+using mkldnn::prop_kind;
+using mkldnn::algorithm;
+using mkldnn::relu_forward;
+using mkldnn::relu_backward;
+using mkldnn::eltwise_relu;
+using mkldnn::eltwise_elu;
+using mkldnn::eltwise_tanh;
+#endif
+
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -45,6 +58,8 @@ struct MklReluHelpers {
   }
 };
 
+#ifndef INTEL_MKL_DNN
+
 template <typename Device, typename T>
 class MklReluOp : public OpKernel {
  public:
@@ -59,6 +74,7 @@ class MklReluOp : public OpKernel {
     GetMklShape(context, 0, &mkl_context.input_shape);
     void* user_i = static_cast<void*>(const_cast<T*>(input.flat<T>().data()));
     bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
+
     if (!input_in_mkl_format && !input.dims()) {  // handle the case of a scalar
       const TensorShape& o_shape = input.shape();
       Tensor* out_tensor = nullptr;
@@ -164,6 +180,7 @@ class MklReluOp : public OpKernel {
   } MklReluOpContext;
 };
 
+
 template <typename Device, typename T>
 class MklReluGradOp : public OpKernel {
  public:
@@ -189,18 +206,18 @@ class MklReluGradOp : public OpKernel {
       const Tensor& a = MklGetInput(context, 1);
       void* buf_input = static_cast<void*>(const_cast<T*>(a.flat<T>().data()));
       void* mkl_buffer_convert = nullptr;
+
       dnnPrimitive_t cv_input_to_grad = nullptr;
 
-      // if input and grad are not in the same layout, do a conversion between
-      // them.
+      // if input and grad are not in the same layout,
+      // do a conversion between them.
       if (!dnnLayoutCompare_F32(lt_input, lt_grad)) {
         AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_grad,
                        &mkl_buffer_convert);
         CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input,
                    lt_grad), E_SUCCESS);
         CHECK_EQ(dnnConversionExecute_F32(cv_input_to_grad, buf_input,
-                                          mkl_buffer_convert),
-                 E_SUCCESS);
+                                          mkl_buffer_convert), E_SUCCESS);
         relu_res[dnnResourceSrc] = mkl_buffer_convert;
         dnnDelete_F32(cv_input_to_grad);
       } else {
@@ -246,7 +263,6 @@ class MklReluGradOp : public OpKernel {
 };
 
 template <typename Device, typename T>
-
 void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
   MklReluGradOpContext mkl_context;
   const Tensor& g = MklGetInput(context, 0);
@@ -264,20 +280,21 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
       !MklReluHelpers::ValidateSameSize(context, g, a))
     return;
   Tensor* output = nullptr;
-  if (!input_is_mkl && !grad_is_mkl &&
-      !a.dims()) {  // handle the case of a scalar
-    // Allocate space for g and
+
+  if (!input_is_mkl && !grad_is_mkl && !a.dims()) {
+    // handle the scalar case
     const TensorShape& g_shape = g.shape();
     mkl_context.output_shape.SetMklTensor(false);
     AllocateOutputSetMklShape(context, 0, &output, g_shape,
                               mkl_context.output_shape);
+
     void* out_o = static_cast<void*>(output->flat<T>().data());
     (static_cast<T*>(out_o))[0] =
         (static_cast<T*>(user_g))[0] * ((static_cast<T*>(user_i))[0] > 0);
     return;
   }
 
-  // Generate size, stride for input if input/grad is in MKL format.
+  // generate size, stride for input if input/grad is in mkl format.
   if (grad_is_mkl || input_is_mkl) {
     const MklShape* tmp_mkl_shape =
         (grad_is_mkl) ? &mkl_context.grad_shape : &mkl_context.input_shape;
@@ -308,21 +325,20 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
   float negative_slope = 0.0;
   CHECK_EQ(dnnReLUCreateBackward_F32(&mkl_context.prim_relu_bwd, NULL,
                                      mkl_context.lt_grad, mkl_context.lt_grad,
-                                     negative_slope),
-           E_SUCCESS);
+                                     negative_slope), E_SUCCESS);
   Tensor mkl_tmp_input_buf_tensor;
   mkl_context.MklPrepareReluGradInputs(context, &mkl_tmp_input_buf_tensor);
 
   if (input_is_mkl ||
-      grad_is_mkl) { /*if  grad or input are MKL leave it in MKL*/
+      grad_is_mkl) { /*if  grad or input are mkl leave it in mkl*/
     TensorShape tf_shape;
     mkl_context.output_shape.SetMklTensor(true);
     mkl_context.output_shape.SetMklLayout(mkl_context.prim_relu_bwd,
                                           dnnResourceDiffSrc);
     mkl_context.output_shape.SetTfLayout(
         mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
-    // If input_is_mkl or grad_is_mkl, then we copy strides and sizes from Mkl
-    // shape of one that is in MKL layout.
+    // if input_is_mkl or grad_is_mkl, then we copy strides and sizes from mkl
+    // shape of one that is in mkl layout.
     if (grad_is_mkl == true) {
       mkl_context.output_shape.SetTfDimOrder(
           mkl_context.in_dims, mkl_context.grad_shape.GetTfToMklDimMap());
@@ -332,11 +348,9 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
     }
 
     tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
-                        mkl_context.output_shape.GetMklLayout())) /
-                    sizeof(T));
+                    mkl_context.output_shape.GetMklLayout())) / sizeof(T));
     AllocateOutputSetMklShape(context, 0, &output, tf_shape,
                               mkl_context.output_shape);
-
   } else {
     const TensorShape& o_shape = g.shape();
     mkl_context.output_shape.SetMklTensor(false);
@@ -347,13 +361,430 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
   mkl_context.relu_res[dnnResourceDiffSrc] =
       static_cast<void*>(output->flat<T>().data());
 
-  CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_bwd, mkl_context.relu_res),
-           E_SUCCESS);
+  CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_bwd,
+                          mkl_context.relu_res),
+                          E_SUCCESS);
   mkl_context.MklCleanup();
 }
 
-/* Register DNN kernels for supported operations and supported types - right now
- * it is only Relu and f32*/
+
+#else  // INTEL_MKL_DNN
+
+template <typename Device, typename T, algorithm alg_kind>
+class MklReluOpBase : public OpKernel {
+ public:
+  ~MklReluOpBase() {}
+
+  explicit MklReluOpBase(OpKernelConstruction* context) : OpKernel(context) {
+  }
+
+  virtual void Compute_Scalar(OpKernelContext* context) = 0;
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      const size_t src_index = 0;  // index of src input tensor
+      const size_t dst_index = 0;  // index of dst output tensor
+      const Tensor& src_tensor = MklGetInput(context, src_index);
+      MklDnnShape dnn_shape_src;
+      GetMklShape(context, src_index, &dnn_shape_src);
+
+      Tensor* dst_tensor = nullptr;
+      if (src_tensor.dims() == 0) {
+        Compute_Scalar(context);
+        return;
+      }
+
+      // Create relu primitive.
+      MklDnnData<T> src(&cpu_engine);
+      MklDnnData<T> dst(&cpu_engine);
+
+      // Set DNN primitive - src
+      memory::desc src_md({}, memory::data_undef, memory::format_undef);
+      if (dnn_shape_src.IsMklTensor()) {
+        src_md = dnn_shape_src.GetMklLayout();
+      } else {
+        auto src_dims = TFShapeToMklDnnDims(src_tensor.shape());
+        auto src_strides = CalculateTFStrides(src_dims);
+        // Create blocked memory descriptor
+        src_md = MklDnnData<T>::CreateBlockedMemDesc(src_dims, src_strides);
+      }
+      src.SetUsrMem(src_md, &src_tensor);
+
+      T alpha = 0, beta = 0;
+      std::shared_ptr<relu_forward::primitive_desc> relu_fwd_pd;
+      auto relu_fwd_desc = relu_forward::desc(prop_kind::forward_training,
+          // Operator memory descriptor is same as user memory descriptor.
+                                              alg_kind, src.GetUsrMemDesc(),
+                                              alpha, beta);
+      relu_fwd_pd.reset(new relu_forward::primitive_desc(relu_fwd_desc,
+                                                         cpu_engine));
+
+      // allocate dst tensor
+      MklDnnShape dnn_shape_dst;
+      TensorShape tf_shape_dst;
+      if (dnn_shape_src.IsMklTensor()) {
+        dnn_shape_dst.SetMklTensor(true);
+        auto dst_pd = relu_fwd_pd->dst_primitive_desc();
+        dnn_shape_dst.SetMklLayout(&dst_pd);
+        dnn_shape_dst.SetElemType(MklDnnType<T>());
+        dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(),
+                                  dnn_shape_src.GetSizesAsMklDnnDims(),
+                                  dnn_shape_src.GetTfDataFormat());
+        tf_shape_dst.AddDim(dst_pd.get_size()/sizeof(T));
+      } else {
+        dnn_shape_dst.SetMklTensor(false);
+        tf_shape_dst = src_tensor.shape();
+      }
+      AllocateOutputSetMklShape(context, dst_index, &dst_tensor, tf_shape_dst,
+                                dnn_shape_dst);
+
+      // Destination memory descriptor is same as source memory descriptor.
+      auto dst_md = src_md;
+      dst.SetUsrMem(dst_md, dst_tensor);
+
+      // execute net
+      std::vector<primitive> net;
+      auto relu_fwd = relu_forward(*relu_fwd_pd, src.GetOpMem(),
+                                   dst.GetOpMem());
+      net.push_back(relu_fwd);
+      stream(stream::kind::eager).submit(net).wait();
+    } catch (mkldnn::error &e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) +
+                         ", in file " + string(__FILE__) + ":" +
+                         std::to_string(__LINE__);
+      OP_REQUIRES_OK(context,
+                     errors::Aborted("Operation received an exception:",
+                        error_msg));
+    }
+  }
+};
+
+
+template <typename Device, typename T, algorithm alg_kind>
+class MklReluGradOpBase : public OpKernel {
+ public:
+  ~MklReluGradOpBase() {}
+
+  explicit MklReluGradOpBase(OpKernelConstruction* context) :
+    OpKernel(context) {}
+
+  virtual void Compute_Scalar(OpKernelContext* context) = 0;
+
+  void Compute(OpKernelContext* context)  {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      MklDnnData<T> src(&cpu_engine);
+      MklDnnData<T> diff_dst(&cpu_engine);
+      MklDnnData<T> diff_src(&cpu_engine);
+
+      const size_t diff_dst_index = 0;  // index of diff_dst input tensor
+      const size_t src_index = 1;       // index of src input tensor
+      const size_t diff_src_index = 0;  // index of diff_src output tensor
+
+      const Tensor& src_tensor      = MklGetInput(context, src_index);
+      const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
+      Tensor* diff_src_tensor       = nullptr;
+
+      MklDnnShape dnn_shape_src, dnn_shape_diff_dst;
+      GetMklShape(context, src_index, &dnn_shape_src);
+      GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
+
+      int src_dims_size = src_tensor.dims();
+      if (src_dims_size == 0) {
+        Compute_Scalar(context);
+        return;
+      }
+
+      // Set DNN primitives for src & diff_dst
+      memory::desc src_md({}, memory::data_undef, memory::format_undef);
+      memory::desc diff_dst_md({}, memory::data_undef, memory::format_undef);
+      if (dnn_shape_src.IsMklTensor() || dnn_shape_diff_dst.IsMklTensor()) {
+        if (dnn_shape_diff_dst.IsMklTensor()) {
+          diff_dst_md = dnn_shape_diff_dst.GetMklLayout();
+          src_md = diff_dst_md;
+        } else {
+          src_md = dnn_shape_src.GetMklLayout();
+          diff_dst_md = src_md;
+        }
+      } else {
+        auto src_dims = TFShapeToMklDnnDims(src_tensor.shape());
+        auto src_strides = CalculateTFStrides(src_dims);
+        src_md = MklDnnData<T>::CreateBlockedMemDesc(src_dims, src_strides);
+        diff_dst_md = src_md;
+      }
+      src.SetUsrMem(src_md, &src_tensor);
+      diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor);
+
+      T alpha = 0, beta = 0;
+      std::shared_ptr<relu_forward::primitive_desc> relu_fwd_pd;
+      auto relu_fwd_desc = relu_forward::desc(prop_kind::forward_training,
+                                              alg_kind, src_md, alpha, beta);
+      relu_fwd_pd.reset(new relu_forward::primitive_desc(relu_fwd_desc,
+                                                         cpu_engine));
+      auto relu_bwd_desc = relu_backward::desc(alg_kind, diff_dst_md, src_md,
+                                                alpha, beta);
+      auto relu_bwd_pd  = relu_backward::primitive_desc(relu_bwd_desc,
+                                                cpu_engine, *relu_fwd_pd);
+
+      // allocate diff_src tensor
+      MklDnnShape dnn_shape_diff_src;
+      TensorShape tf_shape_diff_src;
+      if (dnn_shape_src.IsMklTensor()) {
+        dnn_shape_diff_src.SetMklTensor(true);
+        auto diff_src_pd = relu_bwd_pd.diff_src_primitive_desc();
+        dnn_shape_diff_src.SetMklLayout(&diff_src_pd);
+        dnn_shape_diff_src.SetElemType(MklDnnType<T>());
+        dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(),
+                                       dnn_shape_src.GetSizesAsMklDnnDims(),
+                                       dnn_shape_src.GetTfDataFormat());
+        tf_shape_diff_src.AddDim(diff_src_pd.get_size()/sizeof(T));
+      } else {
+        dnn_shape_diff_src.SetMklTensor(false);
+        tf_shape_diff_src = src_tensor.shape();
+      }
+      AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
+                                 tf_shape_diff_src, dnn_shape_diff_src);
+
+      // diff_src memory descriptor is same as diff_dst memory descriptor.
+      auto diff_src_md = diff_dst_md;
+      diff_src.SetUsrMem(diff_src_md, diff_src_tensor);
+
+      PrepareAndExecuteNet(relu_bwd_pd, &src, &diff_src, &diff_dst);
+     } catch (mkldnn::error &e) {
+       string error_msg = "Status: " + std::to_string(e.status) +
+                          ", message: " + string(e.message) +
+                          ", in file " + string(__FILE__) + ":" +
+                          std::to_string(__LINE__);
+       OP_REQUIRES_OK(context,
+                      errors::Aborted("Operation received an exception:",
+                                      error_msg));
+    }
+  }
+
+  void PrepareAndExecuteNet(const relu_backward::primitive_desc& relu_prim_desc,
+                  MklDnnData<T>* src, MklDnnData<T>* diff_src, MklDnnData<T>*
+                  diff_dst) {
+    std::vector<primitive> net;
+    net.push_back(relu_backward(relu_prim_desc, src->GetOpMem(),
+                                diff_dst->GetOpMem(), diff_src->GetOpMem()));
+    stream(stream::kind::eager).submit(net).wait();
+  }
+};
+
+
+template <typename Device, typename T>
+class MklReluOp : public MklReluOpBase<Device, T, eltwise_relu> {
+ public:
+  ~MklReluOp() {}
+
+  explicit MklReluOp(OpKernelConstruction* context) :
+  MklReluOpBase<Device, T, eltwise_relu>(context) {}
+
+  virtual void Compute_Scalar(OpKernelContext* context) {
+    const size_t src_index = 0;  // index of src input tensor
+    const size_t dst_index = 0;  // index of dst output tensor
+    const Tensor& src_tensor = MklGetInput(context, src_index);
+    MklDnnShape dnn_shape_src;
+    GetMklShape(context, src_index, &dnn_shape_src);
+
+    Tensor* dst_tensor = nullptr;
+    void* user_i = static_cast<void*>(const_cast<T*>(
+                         src_tensor.flat<T>().data()));
+    MklDnnShape dnn_shape_dst;
+    dnn_shape_dst.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
+                              src_tensor.shape(), dnn_shape_dst);
+    void* out_o = static_cast<void*>(dst_tensor->flat<T>().data());
+    (static_cast<T*>(out_o))[0] =
+              std::max((static_cast<T*>(user_i))[0], static_cast<T>(0));
+    return;
+  }
+};
+
+template <typename Device, typename T>
+class MklReluGradOp : public MklReluGradOpBase<Device, T, eltwise_relu> {
+ public:
+  ~MklReluGradOp() {}
+
+  explicit MklReluGradOp(OpKernelConstruction* context) :
+  MklReluGradOpBase<Device, T, eltwise_relu>(context) {}
+
+  virtual void Compute_Scalar(OpKernelContext* context) {
+    const size_t diff_dst_index = 0;  // index of diff_dst input tensor
+    const size_t src_index = 1;       // index of src input tensor
+    const size_t diff_src_index = 0;  // index of diff_src output tensor
+    const Tensor& src_tensor    = MklGetInput(context, src_index);
+    const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
+    Tensor* diff_src_tensor = nullptr;
+
+    MklDnnShape dnn_shape_diff_dst;
+    GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
+
+    int src_dims_size = src_tensor.dims();
+    MklDnnShape dnn_shape_diff_src;
+    dnn_shape_diff_src.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
+                              diff_dst_tensor.shape(), dnn_shape_diff_src);
+    void* out_o = static_cast<void*>(diff_src_tensor->flat<T>().data());
+    void* user_i =
+          static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
+    void* user_g =
+          static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
+    (static_cast<T*>(out_o))[0] = (static_cast<T*>(user_g))[0] *
+                                  ((static_cast<T*>(user_i))[0] > 0);
+    return;
+  }
+};
+
+template <typename Device, typename T>
+class MklEluOp : public MklReluOpBase<Device, T, eltwise_elu> {
+ public:
+  ~MklEluOp() {}
+
+  explicit MklEluOp(OpKernelConstruction* context) :
+  MklReluOpBase<Device, T, eltwise_elu>(context) {}
+
+  virtual void Compute_Scalar(OpKernelContext* context) {
+    const size_t src_index = 0;  // index of src input tensor
+    const size_t dst_index = 0;  // index of dst output tensor
+    const Tensor& src_tensor = MklGetInput(context, src_index);
+    MklDnnShape dnn_shape_src;
+    GetMklShape(context, src_index, &dnn_shape_src);
+
+    Tensor* dst_tensor = nullptr;
+    void* user_i = static_cast<void*>(const_cast<T*>(
+                         src_tensor.flat<T>().data()));
+    MklDnnShape dnn_shape_dst;
+    dnn_shape_dst.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
+                              src_tensor.shape(), dnn_shape_dst);
+    void* out_o = static_cast<void*>(dst_tensor->flat<T>().data());
+    // return exp(feature) - 1 if feature > 0; feature otherwise
+    T feature = (static_cast<T*>(user_i))[0];
+    if (feature < 0)
+      (static_cast<T*>(out_o))[0] = std::exp(feature);
+    else
+      (static_cast<T*>(out_o))[0] = feature;
+    return;
+  }
+};
+
+template <typename Device, typename T>
+class MklEluGradOp : public MklReluGradOpBase<Device, T, eltwise_elu> {
+ public:
+  ~MklEluGradOp() {}
+
+  explicit MklEluGradOp(OpKernelConstruction* context) :
+  MklReluGradOpBase<Device, T, eltwise_elu>(context) {}
+
+  virtual void Compute_Scalar(OpKernelContext* context) {
+    const size_t diff_dst_index = 0;  // index of diff_dst input tensor
+    const size_t src_index = 1;       // index of src input tensor
+    const size_t diff_src_index = 0;  // index of diff_src output tensor
+    const Tensor& src_tensor    = MklGetInput(context, src_index);
+    const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
+    Tensor* diff_src_tensor = nullptr;
+
+    MklDnnShape dnn_shape_diff_dst;
+    GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
+
+    int src_dims_size = src_tensor.dims();
+    MklDnnShape dnn_shape_diff_src;
+    dnn_shape_diff_src.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
+                              diff_dst_tensor.shape(), dnn_shape_diff_src);
+    void* out_o = static_cast<void*>(diff_src_tensor->flat<T>().data());
+    void* user_i =
+          static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
+    void* user_g =
+          static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
+    // gradient of elu(x) = 1 if x > 0; elu(x) + 1 otherwise
+    T feature = (static_cast<T*>(user_i))[0];
+    if (feature > 0) {
+      (static_cast<T*>(out_o))[0] = (static_cast<T*>(user_g))[0];
+    } else {
+      T elu = std::exp(feature) - 1;
+      (static_cast<T*>(out_o))[0] = (static_cast<T*>(user_g))[0] * (elu + 1);
+    }
+  }
+};
+
+template <typename Device, typename T>
+class MklTanhOp : public MklReluOpBase<Device, T, eltwise_tanh> {
+ public:
+  ~MklTanhOp() {}
+
+  explicit MklTanhOp(OpKernelConstruction* context) :
+  MklReluOpBase<Device, T, eltwise_tanh>(context) {}
+
+  virtual void Compute_Scalar(OpKernelContext* context) {
+    const size_t src_index = 0;  // index of src input tensor
+    const size_t dst_index = 0;  // index of dst output tensor
+    const Tensor& src_tensor = MklGetInput(context, src_index);
+    MklDnnShape dnn_shape_src;
+    GetMklShape(context, src_index, &dnn_shape_src);
+
+    Tensor* dst_tensor = nullptr;
+    void* user_i = static_cast<void*>(const_cast<T*>(
+                         src_tensor.flat<T>().data()));
+    MklDnnShape dnn_shape_dst;
+    dnn_shape_dst.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
+                              src_tensor.shape(), dnn_shape_dst);
+    void* out_o = static_cast<void*>(dst_tensor->flat<T>().data());
+    // tanh(x) = (e^x - e^(-x))/ (e^x + e^(-x))
+    T feature = (static_cast<T*>(user_i))[0];
+    T e1 = std::exp(feature);
+    T e2 = std::exp(-feature);
+    (static_cast<T*>(out_o))[0] = (e1 - e2)/(e1 + e2);
+    return;
+  }
+};
+
+template <typename Device, typename T>
+class MklTanhGradOp : public MklReluGradOpBase<Device, T, eltwise_tanh> {
+ public:
+  ~MklTanhGradOp() {}
+
+  explicit MklTanhGradOp(OpKernelConstruction* context) :
+  MklReluGradOpBase<Device, T, eltwise_tanh>(context) {}
+
+  virtual void Compute_Scalar(OpKernelContext* context) {
+    const size_t diff_dst_index = 0;  // index of diff_dst input tensor
+    const size_t src_index = 1;       // index of src input tensor
+    const size_t diff_src_index = 0;  // index of diff_src output tensor
+    const Tensor& src_tensor    = MklGetInput(context, src_index);
+    const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
+    Tensor* diff_src_tensor = nullptr;
+
+    MklDnnShape dnn_shape_diff_dst;
+    GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
+
+    int src_dims_size = src_tensor.dims();
+    MklDnnShape dnn_shape_diff_src;
+    dnn_shape_diff_src.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
+                              diff_dst_tensor.shape(), dnn_shape_diff_src);
+    void* out_o = static_cast<void*>(diff_src_tensor->flat<T>().data());
+    void* user_i =
+          static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
+    // gradient of tanh(x) = 1 - tanh(x)^2
+    T feature = (static_cast<T*>(user_i))[0];
+    T e1 = std::exp(feature);
+    T e2 = std::exp(-feature);
+    T tanh = (e1 - e2)/(e1 + e2);
+    void* user_g =
+          static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
+    (static_cast<T*>(out_o))[0] = (static_cast<T*>(user_g))[0] *
+                                  (1 - tanh * tanh);
+  }
+};
+
+#endif
+
+// register dnn kernels for supported operations and supported types
 #define REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES(type)             \
   REGISTER_KERNEL_BUILDER(Name("_MklRelu")                          \
                               .Device(DEVICE_CPU)                   \
@@ -367,6 +798,38 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
                           MklReluGradOp<CPUDevice, type>);
 TF_CALL_float(REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES);
 
+#ifdef INTEL_MKL_DNN
+
+// register dnn kernels for supported operations and supported types
+#define REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES(type)             \
+  REGISTER_KERNEL_BUILDER(Name("_MklElu")                          \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklEluOp<CPUDevice, type>);              \
+  REGISTER_KERNEL_BUILDER(Name("_MklEluGrad")                      \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklEluGradOp<CPUDevice, type>);
+TF_CALL_float(REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES);
+
+#define REGISTER_TANH_MKL_SUPPORTED_KERNELS_TYPES(type)             \
+  REGISTER_KERNEL_BUILDER(Name("_MklTanh")                          \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklTanhOp<CPUDevice, type>);              \
+  REGISTER_KERNEL_BUILDER(Name("_MklTanhGrad")                      \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklTanhGradOp<CPUDevice, type>);
+TF_CALL_float(REGISTER_TANH_MKL_SUPPORTED_KERNELS_TYPES);
+
+#endif
+
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL
+
diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc
index 5e985824750befb702f8fa7a59d699f853f40267..11c92ebdb41c559f10fb851c9684c0dc3d93d21e 100644
--- a/tensorflow/core/kernels/mkl_reshape_op.cc
+++ b/tensorflow/core/kernels/mkl_reshape_op.cc
@@ -28,6 +28,11 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+using mkldnn::stream;
+#endif
+
 namespace tensorflow {
 using CPUDevice = Eigen::ThreadPoolDevice;
 template <typename Device, typename T>
@@ -35,6 +40,7 @@ class MklReshapeOp : public OpKernel {
  public:
   explicit MklReshapeOp(OpKernelConstruction* context) : OpKernel(context) {}
 
+#ifndef INTEL_MKL_DNN
   void Compute(OpKernelContext* context) override {
     const Tensor& input = MklGetInput(context, 0);
     const Tensor& sizes = MklGetInput(context, 1);
@@ -129,7 +135,183 @@ class MklReshapeOp : public OpKernel {
     }
   }
 
+#else
+
  private:
+  // When the input tensor is in MKL layout and we are reshaping the tensor to a
+  // different shape than its actual shape, then we use MKLDNN reorder primitive
+  // to put tensor back in Tensorflow layout. But we can skip this reordering
+  // some times. This function checks for all such cases.
+  bool SkipReorder(const MklDnnShape& mkl_shape_input,
+                   const TensorShape& reshape_to) {
+    CHECK_EQ(mkl_shape_input.IsMklTensor(), true);
+    bool ret = false;
+
+    // If Tensorflow's data format and the underlying format maintained by
+    // MKLDNN are equivalent (both are NHWC or both are NCHW), then we can
+    // safely return true.
+    auto input_mkl_md = mkl_shape_input.GetMklLayout();
+    if (mkl_shape_input.GetTfDataFormat() == input_mkl_md.data.format) {
+      ret = true;
+    }
+
+    return ret;
+  }
+
+ public:
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input_tensor = MklGetInput(context, 0);
+    const Tensor& sizes = MklGetInput(context, 1);
+
+    MklDnnShape mkl_shape_input;
+    GetMklShape(context, kInputSlotIdx, &mkl_shape_input);
+    bool input_in_mkl_format = mkl_shape_input.IsMklTensor();
+    const int64 nelems = input_in_mkl_format ?
+                         mkl_shape_input.GetTfShape().num_elements()
+                         : input_tensor.NumElements();
+
+    // Preliminary validation of sizes.
+    OP_REQUIRES(context, IsLegacyVector(sizes.shape()),
+                errors::InvalidArgument("sizes input must be 1-D, not shape ",
+                                        sizes.shape().DebugString()));
+
+    // Compute the output shape.  Determine product of specified
+    // dimensions, and find the index of the unspecified one.
+    TensorShape shape;
+    int64 product = 1;
+    int unknown_index = -1;
+    switch (sizes.dtype()) {
+      case DT_INT32:
+        OP_REQUIRES_OK(context, ValidateSizes<int32>(sizes, &product,
+                                                     &unknown_index, &shape));
+        break;
+      case DT_INT64:
+        OP_REQUIRES_OK(context, ValidateSizes<int64>(sizes, &product,
+                                                     &unknown_index, &shape));
+        break;
+      default:
+        context->CtxFailure(errors::InvalidArgument(
+            "desired shape must be a DT_INT32 or DT_INT64 vector, not a ",
+            DataTypeString(sizes.dtype())));
+        return;
+    }
+    if (unknown_index != -1) {
+      OP_REQUIRES(
+          context, product > 0,
+          errors::InvalidArgument("Reshape cannot infer the missing input size "
+                                  "for an empty tensor unless all specified "
+                                  "input sizes are non-zero"));
+      const int64 missing = nelems / product;
+      OP_REQUIRES(
+          context, product * missing == nelems,
+          errors::InvalidArgument(
+              "Input to reshape is a tensor with ", nelems,
+              " values, but the requested shape requires a multiple of ",
+              product));
+      shape.set_dim(unknown_index, missing);
+    }
+    OP_REQUIRES(context, shape.num_elements() == nelems,
+                errors::InvalidArgument("Input to reshape is a tensor with ",
+                                        nelems,
+                                        " values, but the requested shape has ",
+                                        shape.num_elements()));
+
+    if (input_in_mkl_format) {
+      TensorShape& shape_to = shape;
+      TensorShape shape_from = mkl_shape_input.GetTfShape();
+      if (shape_from == shape_to) {
+        CopyMklTensorInToOut(context, kInputSlotIdx, kOutputSlotIdx);
+        return;
+      } else {
+        try {
+          auto cpu_engine = engine(engine::cpu, 0);
+          MklDnnData<T> dnn_data_input(&cpu_engine);
+          // Reshape is just a logical view change operation for a tensor.
+          // It does not change underlying layout. But MKLDNN may maintain
+          // tensor data in different layout than that specified by Tensorflow.
+          // If MKLDNN maintains input tensor in different layout than that
+          // specified by Tensorflow, we will need to reorder tensor and then
+          // put it in the shape expected by Tensorflow. But if MKLDNN has
+          // maintained input tensor in the same layout as it is expected by
+          // Tensorflow, we don't need to reorder tensor contents, we just
+          // need to update MklDnnShape object associated with the input
+          // tensor to reflect the shape change expected by reshape.
+          if (!SkipReorder(mkl_shape_input, shape_to)) {
+              // If dimensions that are being expanded or collapsed are not
+              // maintained contiguously by MKLDNN, then we use reorder.
+
+              // Get Mkl layout of input tensor.
+              auto input_mkl_md = mkl_shape_input.GetMklLayout();
+              // Set input Mkl layout as the user layout.
+              dnn_data_input.SetUsrMem(input_mkl_md, &input_tensor);
+              // Get expected Tensorflow layout of input tensor.
+              auto output_tf_md = mkl_shape_input.GetTfLayout();
+              auto output_tf_pd = memory::primitive_desc(output_tf_md,
+                                                         cpu_engine);
+
+              Tensor* output_tensor = nullptr;
+              MklShape mkl_shape_output;
+              mkl_shape_output.SetMklTensor(false);
+              // We allocate output tensor in the shape expected by Reshape.
+              AllocateOutputSetMklShape(context, kOutputSlotIdx, &output_tensor,
+                                        shape_to, mkl_shape_output);
+
+              // Insert reorder between Mkl layout and TensorFlow layout.
+              std::vector<primitive> net;
+              CHECK_EQ(dnn_data_input.CheckReorderToOpMem(output_tf_pd,
+                       output_tensor, &net), true);
+              stream(stream::kind::eager).submit(net).wait();
+              return;
+          } else {
+            // If dimensions that are being expanded or collapsed are
+            // maintained contiguously by MKLDNN, then we skip reorder, just
+            // update MklDnnShape object for the tensorflow tensor, and forward
+            // Tensorflow tensor as it is to the output.
+            auto output_dims = TFShapeToMklDnnDims(shape_to);
+            auto output_strides = CalculateTFStrides(output_dims);
+            auto output_tf_md = MklDnnData<T>::CreateBlockedMemDesc(output_dims,
+                                                               output_strides);
+            auto output_tf_pd = memory::primitive_desc(output_tf_md,
+                                                       cpu_engine);
+
+            // Set MklDnnShape
+            MklDnnShape mkl_shape_output;
+            mkl_shape_output.SetMklTensor(true);
+            mkl_shape_output.SetMklLayout(&output_tf_pd);
+            mkl_shape_output.SetElemType(MklDnnType<T>());
+            mkl_shape_output.SetTfLayout(output_dims.size(), output_dims,
+                                         memory::format::blocked);
+
+            // We now simply forward input Mkl tensor to output and change its
+            // output MklDnnShape object.
+            ForwardMklTensorInToOutWithMklShape(context, kInputSlotIdx,
+                                              kOutputSlotIdx, mkl_shape_output);
+            return;
+          }
+        } catch (mkldnn::error &e) {
+          string error_msg = "Status: " + std::to_string(e.status) +
+                       ", message: " + string(e.message) +
+                       ", in file " + string(__FILE__) + ":" +
+                       std::to_string(__LINE__);
+          OP_REQUIRES_OK(context,
+                   errors::Aborted("Operation received an exception:",
+                      error_msg));
+        }
+      }
+    } else {
+      // If input tensor is not in Mkl format, then just copy Tensorflow tensor
+      // to output with specified shape.
+      CopyTfTensorInToOutWithShape(context, kInputSlotIdx, kOutputSlotIdx,
+                                   shape);
+    }
+  }
+
+#endif  // INTEL_MKL_DNN
+
+ private:
+  const int kInputSlotIdx = 0;
+  const int kOutputSlotIdx = 0;
+
   template <typename Tshape>
   Status ValidateSizes(const Tensor& sizes, int64* product, int* unknown_index,
                        TensorShape* shape) {
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.h b/tensorflow/core/kernels/mkl_tfconv_op.h
index 0a5be4fec97797a0250543c96f1c87884806ce0f..c4d5a45d3caff0f59b1ecc61f95dd26fe16fd06b 100644
--- a/tensorflow/core/kernels/mkl_tfconv_op.h
+++ b/tensorflow/core/kernels/mkl_tfconv_op.h
@@ -101,8 +101,8 @@ class MklToTfOp : public OpKernel {
       // Allocate output tensor.
       TensorShape output_shape = input_shape.GetTfShape();
       Tensor* output_tensor = NULL;
-      OP_REQUIRES_OK(context, context->allocate_output(input_number,
-                                  output_shape, &output_tensor));
+      OP_REQUIRES_OK(context, context->allocate_output(
+                                  input_number, output_shape, &output_tensor));
       CHECK_NOTNULL(output_tensor);
 
       // Do we need to reorder Mkl layout into TensorFlow layout?
@@ -116,13 +116,13 @@ class MklToTfOp : public OpKernel {
         // If not, just forward input tensor to output tensor.
         CHECK(output_tensor->CopyFrom(input_tensor, output_shape));
       }
-    } catch (mkldnn::error &e) {
+    } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                       ", message: " + std::string(e.message) +
-                       ", in file " + std::string(__FILE__) + ":" +
-                       std::to_string(__LINE__);
-      OP_REQUIRES_OK(context,
-        errors::Aborted("Operation received an exception:", error_msg));
+                         ", message: " + std::string(e.message) + ", in file " +
+                         std::string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 #else
@@ -160,8 +160,8 @@ class MklToTfOp : public OpKernel {
 
     // Allocate output tensor.
     Tensor* output_tensor = NULL;
-    OP_REQUIRES_OK(context, context->allocate_output(input_number,
-                              output_shape, &output_tensor));
+    OP_REQUIRES_OK(context, context->allocate_output(input_number, output_shape,
+                                                     &output_tensor));
 
     dnnLayout_t output_layout =
         static_cast<dnnLayout_t>(input_shape.GetTfLayout());
diff --git a/tensorflow/core/kernels/multinomial_op.cc b/tensorflow/core/kernels/multinomial_op.cc
index 8c0109f5c87ce5f73621a1683471bbcb8a936ea4..d086abb24760f1ab946605fd422a4fd0d5fc866d 100644
--- a/tensorflow/core/kernels/multinomial_op.cc
+++ b/tensorflow/core/kernels/multinomial_op.cc
@@ -40,7 +40,7 @@ typedef Eigen::GpuDevice GPUDevice;
 
 namespace functor {
 
-template <typename Device, typename T>
+template <typename Device, typename T, typename OutputType>
 struct MultinomialFunctor {
   void operator()(OpKernelContext* ctx, const Device& d,
                   typename TTypes<T>::ConstMatrix logits,
@@ -49,11 +49,11 @@ struct MultinomialFunctor {
                   typename TTypes<float>::Flat scratch, int batch_size,
                   int num_classes, int num_samples,
                   const random::PhiloxRandom& gen,
-                  typename TTypes<int64>::Matrix output);
+                  typename TTypes<OutputType>::Matrix output);
 };
 
-template <typename T>
-struct MultinomialFunctor<CPUDevice, T> {
+template <typename T, typename OutputType>
+struct MultinomialFunctor<CPUDevice, T, OutputType> {
   void operator()(OpKernelContext* ctx, const CPUDevice& d,
                   typename TTypes<T>::ConstMatrix logits,
                   typename TTypes<float>::Flat /* noises */,
@@ -61,7 +61,7 @@ struct MultinomialFunctor<CPUDevice, T> {
                   typename TTypes<float>::Flat /* scratch */, int batch_size,
                   int num_classes, int num_samples,
                   const random::PhiloxRandom& gen,
-                  typename TTypes<int64>::Matrix output) {
+                  typename TTypes<OutputType>::Matrix output) {
     auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
 
     // The implementation only parallelizes by batch.
@@ -128,7 +128,7 @@ struct MultinomialFunctor<CPUDevice, T> {
 }  // namespace functor
 
 // Samples from a multinomial distribution.
-template <typename Device, typename T>
+template <typename Device, typename T, typename OutputType>
 class MultinomialOp : public OpKernel {
  public:
   explicit MultinomialOp(OpKernelConstruction* context) : OpKernel(context) {
@@ -195,11 +195,11 @@ class MultinomialOp : public OpKernel {
       if (std::is_same<Device, CPUDevice>::value) num_samples_ceil_4 *= 2;
       auto rng =
           generator_.ReserveRandomOutputs(batch_size * num_samples_ceil_4, 256);
-      functor::MultinomialFunctor<Device, T>()(
+      functor::MultinomialFunctor<Device, T, OutputType>()(
           ctx, ctx->eigen_device<Device>(), logits_t.matrix<T>(),
           noises.flat<float>(), scores.flat<float>(), scratch.flat<float>(),
           batch_size, num_classes, num_samples, rng,
-          samples_t->matrix<int64>());
+          samples_t->matrix<OutputType>());
     }
   }
 
@@ -209,10 +209,17 @@ class MultinomialOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(MultinomialOp);
 };
 
-#define REGISTER(TYPE)                                                  \
-  REGISTER_KERNEL_BUILDER(                                              \
-      Name("Multinomial").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
-      MultinomialOp<CPUDevice, TYPE>);
+#define REGISTER(TYPE)                                                   \
+  REGISTER_KERNEL_BUILDER(Name("Multinomial")                            \
+                              .Device(DEVICE_CPU)                        \
+                              .TypeConstraint<TYPE>("T")                 \
+                              .TypeConstraint("output_dtype", DT_INT32), \
+                          MultinomialOp<CPUDevice, TYPE, int32>);        \
+  REGISTER_KERNEL_BUILDER(Name("Multinomial")                            \
+                              .Device(DEVICE_CPU)                        \
+                              .TypeConstraint<TYPE>("T")                 \
+                              .TypeConstraint("output_dtype", DT_INT64), \
+                          MultinomialOp<CPUDevice, TYPE, int64>);
 
 TF_CALL_half(REGISTER);
 TF_CALL_float(REGISTER);
@@ -220,12 +227,20 @@ TF_CALL_double(REGISTER);
 #undef REGISTER
 
 #if GOOGLE_CUDA
-#define REGISTER(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(Name("Multinomial")             \
-                              .Device(DEVICE_GPU)         \
-                              .HostMemory("num_samples")  \
-                              .TypeConstraint<TYPE>("T"), \
-                          MultinomialOp<GPUDevice, TYPE>)
+#define REGISTER(TYPE)                                                   \
+  REGISTER_KERNEL_BUILDER(Name("Multinomial")                            \
+                              .Device(DEVICE_GPU)                        \
+                              .HostMemory("num_samples")                 \
+                              .TypeConstraint<TYPE>("T")                 \
+                              .TypeConstraint("output_dtype", DT_INT32), \
+                          MultinomialOp<GPUDevice, TYPE, int32>)         \
+  REGISTER_KERNEL_BUILDER(Name("Multinomial")                            \
+                              .Device(DEVICE_GPU)                        \
+                              .HostMemory("num_samples")                 \
+                              .TypeConstraint<TYPE>("T")                 \
+                              .TypeConstraint("output_dtype", DT_INT64), \
+                          MultinomialOp<GPUDevice, TYPE, int64>)
+
 TF_CALL_half(REGISTER);
 TF_CALL_float(REGISTER);
 TF_CALL_double(REGISTER);
diff --git a/tensorflow/core/kernels/multinomial_op.h b/tensorflow/core/kernels/multinomial_op.h
index af5e81f219c802857fd6d5eb27e4962cc890a058..6e41060aa414b0611dd7dca31374444f8dd364ec 100644
--- a/tensorflow/core/kernels/multinomial_op.h
+++ b/tensorflow/core/kernels/multinomial_op.h
@@ -21,7 +21,7 @@ namespace tensorflow {
 namespace functor {
 
 // Generic helper functor for the Multinomial Op.
-template <typename Device, typename T>
+template <typename Device, typename T, typename OutputType>
 struct MultinomialFunctor;
 
 }  // namespace functor
diff --git a/tensorflow/core/kernels/multinomial_op_gpu.cu.cc b/tensorflow/core/kernels/multinomial_op_gpu.cu.cc
index 19b4f3ca559f56d93fae203df77f0ef35718db1b..5cc5877cceb19320023423d35a352c5ba3db13e2 100644
--- a/tensorflow/core/kernels/multinomial_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/multinomial_op_gpu.cu.cc
@@ -37,20 +37,22 @@ using GPUDevice = Eigen::GpuDevice;
 
 // Kernel for Multinomial op.  Data is interpreted to have the following shapes:
 //   scores: [B, S, C];  maxima: [B, S];  output: [B, S].
+template <typename OutputType>
 __global__ void MultinomialKernel(int32 nthreads, const int32 num_classes,
                                   const int32 num_samples, const float* scores,
-                                  const float* maxima, int64* output) {
+                                  const float* maxima, OutputType* output) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     const int maxima_idx = index / num_classes;
     if (ldg(maxima + maxima_idx) == ldg(scores + index)) {
-      CudaAtomicMax(reinterpret_cast<uint64*>(output + maxima_idx),
-                    static_cast<uint64>(index % num_classes));
+      using UnsignedOutputType = typename std::make_unsigned<OutputType>::type;
+      CudaAtomicMax(reinterpret_cast<UnsignedOutputType*>(output + maxima_idx),
+                    static_cast<UnsignedOutputType>(index % num_classes));
     }
   }
 }
 
-template <typename T>
-struct MultinomialFunctor<GPUDevice, T> {
+template <typename T, typename OutputType>
+struct MultinomialFunctor<GPUDevice, T, OutputType> {
   void operator()(OpKernelContext* ctx, const GPUDevice& d,
                   typename TTypes<T>::ConstMatrix logits,
                   typename TTypes<float>::Flat noises,
@@ -58,7 +60,7 @@ struct MultinomialFunctor<GPUDevice, T> {
                   typename TTypes<float>::Flat maxima, int batch_size,
                   int num_classes, int num_samples,
                   const random::PhiloxRandom& gen,
-                  typename TTypes<int64>::Matrix output) {
+                  typename TTypes<OutputType>::Matrix output) {
     // Uniform, [0, 1).
     typedef random::UniformDistribution<random::PhiloxRandom, float> Dist;
     functor::FillPhiloxRandom<GPUDevice, Dist>()(ctx, d, gen, noises.data(),
@@ -111,11 +113,17 @@ struct MultinomialFunctor<GPUDevice, T> {
 };
 
 // Explicit instantiation of the GPU functors.
-template struct MultinomialFunctor<GPUDevice, Eigen::half>;
-template struct MultinomialFunctor<GPUDevice, float>;
-template struct MultinomialFunctor<GPUDevice, double>;
-template struct MultinomialFunctor<GPUDevice, int32>;
-template struct MultinomialFunctor<GPUDevice, int64>;
+template struct MultinomialFunctor<GPUDevice, Eigen::half, int32>;
+template struct MultinomialFunctor<GPUDevice, float, int32>;
+template struct MultinomialFunctor<GPUDevice, double, int32>;
+template struct MultinomialFunctor<GPUDevice, int32, int32>;
+template struct MultinomialFunctor<GPUDevice, int64, int32>;
+
+template struct MultinomialFunctor<GPUDevice, Eigen::half, int64>;
+template struct MultinomialFunctor<GPUDevice, float, int64>;
+template struct MultinomialFunctor<GPUDevice, double, int64>;
+template struct MultinomialFunctor<GPUDevice, int32, int64>;
+template struct MultinomialFunctor<GPUDevice, int64, int64>;
 
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/nn_ops_test.cc b/tensorflow/core/kernels/nn_ops_test.cc
index 0db7c63b8b6a25f1d495dd937d49ec9d0615ab0a..a841291ddd7d4f64b0ab2b611c59307f4d11150f 100644
--- a/tensorflow/core/kernels/nn_ops_test.cc
+++ b/tensorflow/core/kernels/nn_ops_test.cc
@@ -653,6 +653,8 @@ BM_ConvFloatDepthwiseFwd(32, 7, 7, 1024, 1, 1024, 3, 3, 1, SAME, conv6);
 // Benchmarks with different stride and padding options.
 BM_ConvFloatDepthwiseFwd(32, 112, 112, 3, 8, 24, 3, 3, 2, SAME, conv7);
 BM_ConvFloatDepthwiseFwd(32, 112, 112, 3, 8, 24, 3, 3, 2, VALID, conv8);
+BM_ConvFloatDepthwiseFwd(1, 100, 100, 72, 1, 72, 3, 3, 1, SAME, conv9);
+BM_ConvFloatDepthwiseFwd(1, 100, 100, 72, 1, 72, 5, 5, 1, SAME, conv10);
 
 #define BM_ConvFloatDepthwiseBk(BS, R, C, ID, DM, OD, KR, KC, STR, PAD, LABEL) \
   static void BM_ConvFloatDepthwiseBkInCPU1_##LABEL(int iters) {               \
diff --git a/tensorflow/core/kernels/ops_util.h b/tensorflow/core/kernels/ops_util.h
index d3d1b56c9d568487c768f1b1620d2880a3afc531..93ef5127789048b85740e276f76f97e7b46e8368 100644
--- a/tensorflow/core/kernels/ops_util.h
+++ b/tensorflow/core/kernels/ops_util.h
@@ -98,6 +98,19 @@ gtl::InlinedVector<T, 8> ComputeStride(const TensorShape& shape) {
   return strides;
 }
 
+// Helper to compute 'strides' given an Eigen TensorDimensions
+template <typename T, typename EigenDimensions>
+gtl::InlinedVector<T, 8> ComputeEigenStrides(const EigenDimensions& shape) {
+  const int ndims = shape.rank();
+  gtl::InlinedVector<T, 8> strides(ndims);
+  T stride = 1;
+  for (int i = ndims - 1; i >= 0; --i) {
+    strides[i] = stride;
+    stride *= static_cast<T>(shape[i]);
+  }
+  return strides;
+}
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_KERNELS_OPS_UTIL_H_
diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc
index 814128d99ac2acb4a10cfcb2907edb735eaca382..2923c38662e3c2b74df5c72c513b5e3ecab9f5e5 100644
--- a/tensorflow/core/kernels/pack_op.cc
+++ b/tensorflow/core/kernels/pack_op.cc
@@ -140,6 +140,7 @@ class PackOp : public OpKernel {
 TF_CALL_ALL_TYPES(REGISTER_PACK);
 TF_CALL_QUANTIZED_TYPES(REGISTER_PACK);
 TF_CALL_bfloat16(REGISTER_PACK);
+TF_CALL_variant(REGISTER_PACK);
 
 #if defined(IS_MOBILE_PLATFORM) && !defined(SUPPORT_SELECTIVE_REGISTRATION)
 // Primarily used for SavedModel support on mobile.
@@ -157,6 +158,7 @@ REGISTER_PACK(string);
       PackOp<GPUDevice, type>)
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+TF_CALL_bfloat16(REGISTER_GPU);
 TF_CALL_int64(REGISTER_GPU);
 REGISTER_GPU(bool);
 #undef REGISTER_GPU
diff --git a/tensorflow/core/kernels/padding_fifo_queue.cc b/tensorflow/core/kernels/padding_fifo_queue.cc
index d0f7683f3dd8d520339dfd132af8a101da3abd5a..9d35ecb66c00e0cf7a2298a9d324c910ed33c7cc 100644
--- a/tensorflow/core/kernels/padding_fifo_queue.cc
+++ b/tensorflow/core/kernels/padding_fifo_queue.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/batch_util.h"
 #include "tensorflow/core/kernels/padding_fifo_queue.h"
 #include "tensorflow/core/kernels/queue_base.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -156,7 +157,7 @@ void PaddingFIFOQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
                 // Finished.  Allocate attempt->tuple and
                 // copy from attempt->tuples to attempt->tuple.
                 attempt->tuple.reserve(num_components());
-                const std::vector<Tuple>& tuples = attempt->tuples;
+                std::vector<Tuple>& tuples = attempt->tuples;
 
                 std::vector<bool> dynamic_shape;
                 const int64 batch_size = tuples.size();
@@ -206,8 +207,10 @@ void PaddingFIFOQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
                       attempt->context->SetStatus(CopyElementToLargerSlice(
                           tuples[index][i], &attempt->tuple[i], index));
                     } else {
-                      attempt->context->SetStatus(CopyElementToSlice(
-                          tuples[index][i], &attempt->tuple[i], index));
+                      attempt->context->SetStatus(
+                          batch_util::CopyElementToSlice(
+                              std::move(tuples[index][i]), &attempt->tuple[i],
+                              index));
                     }
                     if (!attempt->context->status().ok()) return kComplete;
                   }
diff --git a/tensorflow/core/kernels/priority_queue.cc b/tensorflow/core/kernels/priority_queue.cc
index 4c406fc1ed9f86477a7c0eb7c88f7dd7833f796c..bab94f7f0ad1fd7609761aaabc4f76ae6eafeb7b 100644
--- a/tensorflow/core/kernels/priority_queue.cc
+++ b/tensorflow/core/kernels/priority_queue.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/batch_util.h"
 #include "tensorflow/core/kernels/priority_queue.h"
 #include "tensorflow/core/kernels/queue_base.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -122,7 +123,7 @@ Status PriorityQueue::GetElementComponentFromBatch(
   TF_RETURN_IF_ERROR(ctx->allocate_persistent(
       tuple[component].dtype(), element_shape, out_tensor, &element_access));
   TF_RETURN_IF_ERROR(
-      CopySliceToElement(tuple[component], element_access, index));
+      batch_util::CopySliceToElement(tuple[component], element_access, index));
   return Status::OK();
 }
 
@@ -358,8 +359,8 @@ void PriorityQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
               const int index =
                   attempt->tuple[0].dim_size(0) - attempt->elements_requested;
               for (int i = 0; i < num_components(); ++i) {
-                attempt->context->SetStatus(
-                    CopyElementToSlice(tuple[i], &attempt->tuple[i], index));
+                attempt->context->SetStatus(batch_util::CopyElementToSlice(
+                    std::move(tuple[i]), &attempt->tuple[i], index));
                 if (!attempt->context->status().ok()) return kComplete;
               }
               tuple.clear();
diff --git a/tensorflow/core/kernels/quantization_utils_test.cc b/tensorflow/core/kernels/quantization_utils_test.cc
index eae303b85e44f87fb7e895902d02ff225f13def5..a73581fbbc1e9db4af621b109496088ba2c7c7de 100644
--- a/tensorflow/core/kernels/quantization_utils_test.cc
+++ b/tensorflow/core/kernels/quantization_utils_test.cc
@@ -910,42 +910,41 @@ void TestComputeLerp4xAll() {
 
 }  // namespace tensorflow
 
-#if defined(__ANDROID__)
-int main(int argc, char** argv) {
-#define RUN_TEST(t)            \
-  LOG(INFO) << "Test: " << #t; \
-  tensorflow::t();
-#else
 #define RUN_TEST(t) \
   TEST(QuantizationUtilsTest, t) { tensorflow::t(); }
-#endif
 
-  RUN_TEST(TestFloatToQuantized);
-  RUN_TEST(TestQuantizedToFloat);
-  RUN_TEST(TestAvoidBias);
-  RUN_TEST(TestRequantizeInNewRange);
-  RUN_TEST(TestRequantizeInNewRangeRealData);
-  RUN_TEST(TestRequantizeInNewRange32To8Bit);
-  RUN_TEST(TestRequantizeManyInNewRange32To8Bit);
-  RUN_TEST(TestRequantizeManyInNewRange32To8BitUsingEigen);
-  RUN_TEST(TestRequantizeManyInNewRange32To8BitEigenVsNonEigen);
-  RUN_TEST(TestRequantizeManyInNewRange32To8BitSignedEigenVsNonEigen);
-  RUN_TEST(TestFloatTensorToQuantized);
-  RUN_TEST(TestRequantizeManyInNewRange8To32Bit);
-  RUN_TEST(TestFloatToQuantizedInPlaceUsingEigen);
-  RUN_TEST(TestOverflowWithEigen);
-  RUN_TEST(TestQuantizedTensorToFloat);
-  RUN_TEST(TestQuantizedToFloatInPlaceUsingEigen);
+RUN_TEST(TestFloatToQuantized);
+RUN_TEST(TestQuantizedToFloat);
+RUN_TEST(TestAvoidBias);
+RUN_TEST(TestRequantizeInNewRange);
+RUN_TEST(TestRequantizeInNewRangeRealData);
+RUN_TEST(TestRequantizeInNewRange32To8Bit);
+RUN_TEST(TestRequantizeManyInNewRange32To8Bit);
+RUN_TEST(TestRequantizeManyInNewRange32To8BitUsingEigen);
+RUN_TEST(TestRequantizeManyInNewRange32To8BitEigenVsNonEigen);
+RUN_TEST(TestRequantizeManyInNewRange32To8BitSignedEigenVsNonEigen);
+RUN_TEST(TestFloatTensorToQuantized);
+RUN_TEST(TestRequantizeManyInNewRange8To32Bit);
+RUN_TEST(TestFloatToQuantizedInPlaceUsingEigen);
+RUN_TEST(TestOverflowWithEigen);
+RUN_TEST(TestQuantizedTensorToFloat);
+RUN_TEST(TestQuantizedToFloatInPlaceUsingEigen);
 
 #if defined(__ANDROID__)
+
+RUN_TEST(BenchmarkRequantizeManyInNewRange);
+
 #ifdef QUANTIZATION_UTILS_USE_NEON
-  RUN_TEST(TestDivide64x2PowAll);
-  RUN_TEST(TestComputeLerp4xAll);
-#endif
 
-  tensorflow::BenchmarkRequantizeManyInNewRange();
+RUN_TEST(TestDivide64x2PowAll);
+RUN_TEST(TestComputeLerp4xAll);
+
+#endif  // QUANTIZATION_UTILS_USE_NEON
+
+#endif  // __ANDROID__
 
-  LOG(INFO) << "All tests complete.";
-  return 0;
+int main(int argc, char** argv) {
+  // On Linux, add: FLAGS_logtostderr = true;
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
-#endif
diff --git a/tensorflow/core/kernels/quantized_add_op_test.cc b/tensorflow/core/kernels/quantized_add_op_test.cc
index 74d16b282dff492f5493027390c39ee514f6f4c7..90bd145ad0c9b1da8805ecac7c49bd94c1db22ed 100644
--- a/tensorflow/core/kernels/quantized_add_op_test.cc
+++ b/tensorflow/core/kernels/quantized_add_op_test.cc
@@ -276,10 +276,10 @@ void BenchmarkVectorPlusTensor() {
   TimeAdd({100000, 100}, {100}, 1);
 }
 
-#if !defined(__ANDROID__)
+}  // end namespace tensorflow
 
 #define RUN_TEST(t) \
-  TEST(QuantizedAddOpTest, t) { t(); }
+  TEST(QuantizedAddOpTest, t) { tensorflow::t(); }
 
 RUN_TEST(TestManualScalar);
 RUN_TEST(TestManualVector);
@@ -288,24 +288,16 @@ RUN_TEST(TestScalar);
 RUN_TEST(TestVector);
 RUN_TEST(TestVectorPlusTensor);
 
-#undef RUN_TEST
+#if defined(__ANDROID__)
 
-#endif  // __ANDROID__
+RUN_TEST(BenchmarkTensorScalar);
+RUN_TEST(BenchmarkVector);
+RUN_TEST(BenchmarkVectorPlusTensor);
 
-}  // end namespace tensorflow
+#endif  // __ANDROID__
 
-#if defined(__ANDROID__)
 int main(int argc, char** argv) {
-  LOG(INFO) << "TestManualScalar:";
-  tensorflow::TestManualScalar();
-  LOG(INFO) << "TestManualVector:";
-  tensorflow::TestManualVector();
-  LOG(INFO) << "TestManualVectorPlusTensor:";
-  tensorflow::TestManualVectorPlusTensor();
-  tensorflow::BenchmarkTensorScalar();
-  tensorflow::BenchmarkVector();
-  tensorflow::BenchmarkVectorPlusTensor();
-  LOG(INFO) << "All tests complete";
-  return 0;
+  // On Linux, add: FLAGS_logtostderr = true;
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
-#endif  // __ANDROID__
diff --git a/tensorflow/core/kernels/quantized_concat_op.cc b/tensorflow/core/kernels/quantized_concat_op.cc
index ee573f1bb805107299fed89df211275a1e81c35d..d67f1ab3ec28934bc08c11997a8b2f448c30ad91 100644
--- a/tensorflow/core/kernels/quantized_concat_op.cc
+++ b/tensorflow/core/kernels/quantized_concat_op.cc
@@ -174,13 +174,13 @@ class QuantizedConcatOp : public OpKernel {
     OP_REQUIRES(context, (input_mins.size() == N),
                 errors::InvalidArgument(
                     "QuantizedConcatOp : Expected mins input list length ",
-                    input_mins.size(), " to equal values length ", N))
+                    input_mins.size(), " to equal values length ", N));
     OpInputList input_maxes;
     OP_REQUIRES_OK(context, context->input_list("input_maxes", &input_maxes));
     OP_REQUIRES(context, (input_maxes.size() == N),
                 errors::InvalidArgument(
                     "QuantizedConcatOp : Expected maxes input list length ",
-                    input_maxes.size(), " to equal values length ", N))
+                    input_maxes.size(), " to equal values length ", N));
     const int input_dims = values[0].dims();
     const TensorShape& input_shape = values[0].shape();
     OP_REQUIRES(
diff --git a/tensorflow/core/kernels/quantized_conv_ops.cc b/tensorflow/core/kernels/quantized_conv_ops.cc
index 3b0764bb9bf9ff00c71173c53cdb78b6ab3ac6ca..1921b83d12c0688a96bad0c561080a0189e49bbe 100644
--- a/tensorflow/core/kernels/quantized_conv_ops.cc
+++ b/tensorflow/core/kernels/quantized_conv_ops.cc
@@ -268,6 +268,13 @@ class Im2ColConvFunctor {
     Im2ColBufferResource<T1, chunk_value_count>* im2col_buffer_resource;
     std::function<Status(Im2ColBufferResource<T1, chunk_value_count>**)>
         creator = [](Im2ColBufferResource<T1, chunk_value_count>** resource) {
+#ifdef _MSC_VER
+          // MSVC complains about the capture of chunk_value_count which oddly
+          // works fine in conv_ops_using_gemm.cc for example.
+          // Define chunk_value_count inside the lambda for now.
+          const int64 chunk_value_count =
+              (kMaxChunkSize + (sizeof(T1) - 1)) / sizeof(T1);
+#endif
           *resource = new Im2ColBufferResource<T1, chunk_value_count>();
           return Status::OK();
         };
@@ -457,6 +464,19 @@ class QuantizedConv2DOp : public OpKernel {
         context, (strides_[0] == 1 && strides_[3] == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    std::vector<int32> dilations;
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations));
+    OP_REQUIRES(context, dilations.size() == 4,
+                errors::InvalidArgument("Dilations field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, dilations[1] == 1 && dilations[2] == 1,
+                errors::InvalidArgument(
+                    "Current implementation only supports dilated rate as 1 "
+                    "in the row and column dimensions."));
+    OP_REQUIRES(context, (dilations[0] == 1 && dilations[3] == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
   }
 
diff --git a/tensorflow/core/kernels/quantized_instance_norm_test.cc b/tensorflow/core/kernels/quantized_instance_norm_test.cc
index 29d8dbc0dfcb48b83a1ec0da9085208ffb35c656..d2b15ee20bb89a28c9d7f8398435352107eb4d79 100644
--- a/tensorflow/core/kernels/quantized_instance_norm_test.cc
+++ b/tensorflow/core/kernels/quantized_instance_norm_test.cc
@@ -173,10 +173,10 @@ void TestClamp() {
   Expect(input_tensor, -10.0f, 10.0f, true, 0.0f, 1.0f);
 }
 
-#if !defined(__ANDROID__)
+}  // end namespace tensorflow
 
 #define RUN_TEST(t) \
-  TEST(QuantizedInstanceNormTest, t) { t(); }
+  TEST(QuantizedAddOpTest, t) { tensorflow::t(); }
 
 RUN_TEST(TestBasic);
 RUN_TEST(TestZeroInput);
@@ -184,19 +184,8 @@ RUN_TEST(TestMaxInput);
 RUN_TEST(TestOutputRangeGiven);
 RUN_TEST(TestClamp);
 
-#undef RUN_TEST
-
-#endif  // __ANDROID__
-
-}  // end namespace tensorflow
-
-#if defined(__ANDROID__)
 int main(int argc, char** argv) {
-  tensorflow::TestBasic();
-  tensorflow::TestZeroInput();
-  tensorflow::TestMaxInput();
-  tensorflow::TestOutputRangeGiven();
-  tensorflow::TestClamp();
-  return 0;
+  // On Linux, add: FLAGS_logtostderr = true;
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
-#endif  // __ANDROID__
diff --git a/tensorflow/core/kernels/quantized_mul_op_test.cc b/tensorflow/core/kernels/quantized_mul_op_test.cc
index 45d6c51444a9981bd7567a6de8aaeb3f2c1720af..5f858eb8ce03be7d130649f814db5f1f9c68f18c 100644
--- a/tensorflow/core/kernels/quantized_mul_op_test.cc
+++ b/tensorflow/core/kernels/quantized_mul_op_test.cc
@@ -276,10 +276,10 @@ void BenchmarkVectorTimesTensor() {
   TimeMul({100000, 100}, {100}, 100);
 }
 
-#if !defined(__ANDROID__)
+}  // end namespace tensorflow
 
 #define RUN_TEST(t) \
-  TEST(QuantizedMulOpTest, t) { t(); }
+  TEST(QuantizedAddOpTest, t) { tensorflow::t(); }
 
 RUN_TEST(TestManualScalar);
 RUN_TEST(TestManualVector);
@@ -288,24 +288,16 @@ RUN_TEST(TestScalar);
 RUN_TEST(TestVector);
 RUN_TEST(TestVectorTimesTensor);
 
-#undef RUN_TEST
+#if defined(__ANDROID__)
 
-#endif  // __ANDROID__
+RUN_TEST(BenchmarkTensorScalar);
+RUN_TEST(BenchmarkVector);
+RUN_TEST(BenchmarkVectorTimesTensor);
 
-}  // end namespace tensorflow
+#endif  // __ANDROID__
 
-#if defined(__ANDROID__)
 int main(int argc, char** argv) {
-  LOG(INFO) << "TestManualScalar:";
-  tensorflow::TestManualScalar();
-  LOG(INFO) << "TestManualVector:";
-  tensorflow::TestManualVector();
-  LOG(INFO) << "TestManualVectorTimesTensor:";
-  tensorflow::TestManualVectorTimesTensor();
-  tensorflow::BenchmarkTensorScalar();
-  tensorflow::BenchmarkVector();
-  tensorflow::BenchmarkVectorTimesTensor();
-  LOG(INFO) << "All tests complete";
-  return 0;
+  // On Linux, add: FLAGS_logtostderr = true;
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
-#endif  // __ANDROID__
diff --git a/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc b/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc
index 8d3d7105a4a67637cffde4d6a66157789c9e2bdb..e6133415d0f5c143acad25ee6e681820e956cca8 100644
--- a/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc
+++ b/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc
@@ -373,22 +373,20 @@ void RunBenchmarkResizeBilinearTwoDims() {
 
 }  // namespace tensorflow
 
-#if defined(__ANDROID__)
-int main(int argc, char** argv) {
-#define RUN_TEST(t)            \
-  LOG(INFO) << "Test: " << #t; \
-  tensorflow::t();
-#else
 #define RUN_TEST(t) \
   TEST(QuantizationResizeBilenarTest, t) { tensorflow::t(); }
-#endif
 
-  RUN_TEST(TestResizeBilinearOneDim);
-  RUN_TEST(TestResizeBilinearTwoDims);
+RUN_TEST(TestResizeBilinearOneDim);
+RUN_TEST(TestResizeBilinearTwoDims);
 
 #if defined(__ANDROID__)
-  RUN_TEST(RunBenchmarkResizeBilinearTwoDims);
-  LOG(INFO) << "All tests complete.";
-  return 0;
+
+RUN_TEST(RunBenchmarkResizeBilinearTwoDims);
+
+#endif  // __ANDROID__
+
+int main(int argc, char** argv) {
+  // On Linux, add: FLAGS_logtostderr = true;
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
-#endif
diff --git a/tensorflow/core/kernels/queue_base.cc b/tensorflow/core/kernels/queue_base.cc
index 8a9af39e1f7af5483bc72023915dfd408907a99a..330d161c32bc1a48b671765cacc21618545fa71a 100644
--- a/tensorflow/core/kernels/queue_base.cc
+++ b/tensorflow/core/kernels/queue_base.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <vector>
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/batch_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
@@ -46,25 +47,6 @@ Status HandleSliceToElement(const Tensor& parent, Tensor* element,
   return Status::OK();
 }
 
-template <DataType DT>
-Status HandleElementToSlice(const Tensor& element, Tensor* parent, int index) {
-  typedef typename EnumToDataType<DT>::Type T;
-  DCHECK_NE(parent->dim_size(0), 0);
-  DCHECK_GE(index, 0);
-  if (element.NumElements() != (parent->NumElements() / parent->dim_size(0))) {
-    TensorShape chip_shape = parent->shape();
-    chip_shape.RemoveDim(0);
-    return errors::Internal(
-        "HandleElementToSlice Cannot copy slice: number of elements does not "
-        "match.  Shapes are: [element]: ",
-        element.shape().DebugString(), ", [parent slice]: ",
-        chip_shape.DebugString());
-  }
-  auto parent_as_matrix = parent->flat_outer_dims<T>();
-  parent_as_matrix.chip(index, 0) = element.flat<T>();
-  return Status::OK();
-}
-
 }  // namespace
 
 QueueBase::QueueBase(int32 capacity, const DataTypeVector& component_dtypes,
@@ -354,63 +336,13 @@ void QueueBase::FlushUnlocked() {
 
 Status QueueBase::CopySliceToElement(const Tensor& parent, Tensor* element,
                                      int64 index) {
-#define HANDLE_TYPE(DT)                                                   \
-  if (parent.dtype() == DT) {                                             \
-    TF_RETURN_IF_ERROR(HandleSliceToElement<DT>(parent, element, index)); \
-    return Status::OK();                                                  \
-  }
-  HANDLE_TYPE(DT_FLOAT);
-  HANDLE_TYPE(DT_HALF);
-  HANDLE_TYPE(DT_DOUBLE);
-  HANDLE_TYPE(DT_INT32);
-  HANDLE_TYPE(DT_UINT8);
-  HANDLE_TYPE(DT_INT16);
-  HANDLE_TYPE(DT_INT8);
-  HANDLE_TYPE(DT_STRING);
-  HANDLE_TYPE(DT_COMPLEX64);
-  HANDLE_TYPE(DT_COMPLEX128);
-  HANDLE_TYPE(DT_INT64);
-  HANDLE_TYPE(DT_BOOL);
-  HANDLE_TYPE(DT_QINT8);
-  HANDLE_TYPE(DT_QUINT8);
-  HANDLE_TYPE(DT_QINT32);
-  HANDLE_TYPE(DT_QINT16);
-  HANDLE_TYPE(DT_QUINT16);
-  HANDLE_TYPE(DT_UINT16);
-#undef HANDLE_TYPE
-  return errors::Unimplemented("CopySliceToElement Unhandled data type: ",
-                               parent.dtype());
+  return batch_util::CopySliceToElement(parent, element, index);
 }
 
-// Static method
+/* static */
 Status QueueBase::CopyElementToSlice(const Tensor& element, Tensor* parent,
                                      int64 index) {
-#define HANDLE_TYPE(DT)                                                   \
-  if (element.dtype() == DT) {                                            \
-    TF_RETURN_IF_ERROR(HandleElementToSlice<DT>(element, parent, index)); \
-    return Status::OK();                                                  \
-  }
-  HANDLE_TYPE(DT_FLOAT);
-  HANDLE_TYPE(DT_HALF);
-  HANDLE_TYPE(DT_DOUBLE);
-  HANDLE_TYPE(DT_INT32);
-  HANDLE_TYPE(DT_UINT8);
-  HANDLE_TYPE(DT_INT16);
-  HANDLE_TYPE(DT_INT8);
-  HANDLE_TYPE(DT_STRING);
-  HANDLE_TYPE(DT_COMPLEX64);
-  HANDLE_TYPE(DT_COMPLEX128);
-  HANDLE_TYPE(DT_INT64);
-  HANDLE_TYPE(DT_BOOL);
-  HANDLE_TYPE(DT_QINT8);
-  HANDLE_TYPE(DT_QUINT8);
-  HANDLE_TYPE(DT_QINT32);
-  HANDLE_TYPE(DT_QINT16);
-  HANDLE_TYPE(DT_QUINT16);
-  HANDLE_TYPE(DT_UINT16);
-#undef HANDLE_TYPE
-  return errors::Unimplemented("CopyElementToSlice Unhandled data type: ",
-                               element.dtype());
+  return batch_util::CopyElementToSlice(element, parent, index);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/queue_base.h b/tensorflow/core/kernels/queue_base.h
index c101fb35791eafa109f1a360fe63051398d48de5..5fb1c92f9422cb6cc1e6adb6e8e0a03a80acc767 100644
--- a/tensorflow/core/kernels/queue_base.h
+++ b/tensorflow/core/kernels/queue_base.h
@@ -79,6 +79,9 @@ class QueueBase : public QueueInterface {
                                    int64 index);
 
   // Copies element into the index^th slice (in the first dimension) of parent.
+  // NOTE(mrry): This method is deprecated. Use
+  // `tensorflow::batch_util::CopySliceToElement()` defined in
+  // "./batch_util.h" instead.
   static Status CopyElementToSlice(const Tensor& element, Tensor* parent,
                                    int64 index);
 
diff --git a/tensorflow/core/kernels/queue_ops.cc b/tensorflow/core/kernels/queue_ops.cc
index d51dc4ecb00f9501d544dbbbfbd4e92ebf515682..17831b74370bcd21cf7772f0ea6809ee840511c3 100644
--- a/tensorflow/core/kernels/queue_ops.cc
+++ b/tensorflow/core/kernels/queue_ops.cc
@@ -429,7 +429,7 @@ class QueueIsClosedOp : public QueueOpKernel {
  public:
   explicit QueueIsClosedOp(OpKernelConstruction* context)
      : QueueOpKernel(context) {}
- 
+
  protected:
   void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue,
                     DoneCallback callback) override {
diff --git a/tensorflow/core/kernels/random_shuffle_queue_op.cc b/tensorflow/core/kernels/random_shuffle_queue_op.cc
index 30bbbd4aed6924972f914c42eb8b0a7b9239f7ae..e9695cfde30945c9c99db85f33e44030e5d45054 100644
--- a/tensorflow/core/kernels/random_shuffle_queue_op.cc
+++ b/tensorflow/core/kernels/random_shuffle_queue_op.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/batch_util.h"
 #include "tensorflow/core/kernels/queue_op.h"
 #include "tensorflow/core/kernels/typed_queue.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -170,7 +171,7 @@ Status RandomShuffleQueue::GetElementComponentFromBatch(
   TF_RETURN_IF_ERROR(ctx->allocate_persistent(
       tuple[component].dtype(), element_shape, out_tensor, &element_access));
   TF_RETURN_IF_ERROR(
-      CopySliceToElement(tuple[component], element_access, index));
+      batch_util::CopySliceToElement(tuple[component], element_access, index));
   return Status::OK();
 }
 
@@ -407,8 +408,8 @@ void RandomShuffleQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
                   const int index = attempt->tuple[0].dim_size(0) -
                                     attempt->elements_requested;
                   for (int i = 0; i < num_components(); ++i) {
-                    attempt->context->SetStatus(CopyElementToSlice(
-                        tuple[i], &attempt->tuple[i], index));
+                    attempt->context->SetStatus(batch_util::CopyElementToSlice(
+                        std::move(tuple[i]), &attempt->tuple[i], index));
                     if (!attempt->context->status().ok()) return kComplete;
                   }
                   tuple.clear();
diff --git a/tensorflow/core/kernels/record_input_op.cc b/tensorflow/core/kernels/record_input_op.cc
index 878996c9d6a9923404791d4e8995b817ecdf9799..8a4bf4745c929d0f3d49a987b805925af34ebd61 100644
--- a/tensorflow/core/kernels/record_input_op.cc
+++ b/tensorflow/core/kernels/record_input_op.cc
@@ -37,6 +37,8 @@ class RecordInputOp : public OpKernel {
     GETATTR(int64, file_parallelism);
     GETATTR(int64, batch_size);
 #undef GETATTR
+    string compression_type;
+    ctx->GetAttr("compression_type", &compression_type);
 
     RecordYielder::Options yopts;
     yopts.file_pattern = file_pattern;
@@ -44,6 +46,7 @@ class RecordInputOp : public OpKernel {
     yopts.bufsize = file_buffer_size;
     yopts.file_shuffle_shift_ratio = file_shuffle_shift_ratio;
     yopts.parallelism = file_parallelism;
+    yopts.compression_type = compression_type;
     yielder_ = std::unique_ptr<RecordYielder>(new RecordYielder(ctx, yopts));
 
     batch_size_ = batch_size;
diff --git a/tensorflow/core/kernels/record_yielder.cc b/tensorflow/core/kernels/record_yielder.cc
index e4fa0ed322df57789f95efe584fe91a3efe561ec..3fd9bf9defe4aeedde1f0456638e60ea1e5e2cdb 100644
--- a/tensorflow/core/kernels/record_yielder.cc
+++ b/tensorflow/core/kernels/record_yielder.cc
@@ -206,7 +206,10 @@ void RecordYielder::ShardLoop(Shard* shard) {
       shard->status = errors::InvalidArgument("Can't open ", filename);
       break;
     }
-    io::RecordReader rdr(file.get());
+    io::RecordReaderOptions options =
+        io::RecordReaderOptions::CreateRecordReaderOptions(
+            opts_.compression_type);
+    io::RecordReader rdr(file.get(), options);
     uint64 offset = 0;
     string record;
     while (true) {
diff --git a/tensorflow/core/kernels/record_yielder.h b/tensorflow/core/kernels/record_yielder.h
index c6301812213bf569d47c1fd3b7deba3c57a31ae5..34817ad51b6e4f21e6b6b0f516c438a845b30e3b 100644
--- a/tensorflow/core/kernels/record_yielder.h
+++ b/tensorflow/core/kernels/record_yielder.h
@@ -78,6 +78,8 @@ class RecordYielder {
     // Uses these many concurrent tfrecord iterators to iterate through
     // tfrecords.
     int32 parallelism = 1;
+
+    string compression_type;
   };
 
   explicit RecordYielder(OpKernelConstruction* context,
diff --git a/tensorflow/core/kernels/reduction_ops_min.cc b/tensorflow/core/kernels/reduction_ops_min.cc
index 807ac0a4567790ef3fb95b4c12a91a1562f83fa7..5c537c5b9c75afef2b8f4ea5446f3d4012ed0cbb 100644
--- a/tensorflow/core/kernels/reduction_ops_min.cc
+++ b/tensorflow/core/kernels/reduction_ops_min.cc
@@ -50,6 +50,7 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
           .TypeConstraint<int64>("Tidx")                                       \
           .HostMemory("reduction_indices"),                                    \
       ReductionOp<GPUDevice, type, int64, Eigen::internal::MinReducer<type>>);
+REGISTER_GPU_KERNELS(Eigen::half);
 REGISTER_GPU_KERNELS(float);
 REGISTER_GPU_KERNELS(double);
 
diff --git a/tensorflow/core/kernels/reduction_ops_test.cc b/tensorflow/core/kernels/reduction_ops_test.cc
index 9bbe993a2f93e522688738abaf41a518e95ef871..fe8ea59f1be521166d0e42295e79d1bb5a242750 100644
--- a/tensorflow/core/kernels/reduction_ops_test.cc
+++ b/tensorflow/core/kernels/reduction_ops_test.cc
@@ -174,6 +174,11 @@ static void BM_Min2DToScalarGPU(int iters, int num_x, int num_y) {
 }
 BENCHMARK(BM_Min2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);
 
+static void BM_Min2DToScalarGPUHalf(int iters, int num_x, int num_y) {
+  ReduceToScalar<Eigen::half>(iters, "gpu", "Min", num_x, num_y);
+}
+BENCHMARK(BM_Min2DToScalarGPUHalf)->RangePair(2048, 8192, 2048, 8192);
+
 static void BM_Bool2DToScalarGPU(int iters, int num_x, int num_y) {
   ReduceToScalar<bool>(iters, "gpu", "All", num_x, num_y);
 }
diff --git a/tensorflow/core/kernels/reshape_op.cc b/tensorflow/core/kernels/reshape_op.cc
index 18ebf70c1738747ab64545f7770309a3e0865f1a..8b86596721aa41c124b35b19cac7aac264b6f574 100644
--- a/tensorflow/core/kernels/reshape_op.cc
+++ b/tensorflow/core/kernels/reshape_op.cc
@@ -43,7 +43,8 @@ REGISTER_KERNEL_BUILDER(Name("Reshape")
                               .TypeConstraint<int64>("Tshape"), \
                           ReshapeOp);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
-REGISTER_GPU_KERNEL(bool);
+TF_CALL_bfloat16(REGISTER_GPU_KERNEL);
+TF_CALL_bool(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 
 #ifdef TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/reshape_util.cc b/tensorflow/core/kernels/reshape_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4188ad233ea8f826fda28ee891a54ee9bd1156e3
--- /dev/null
+++ b/tensorflow/core/kernels/reshape_util.cc
@@ -0,0 +1,149 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include <algorithm>
+#include <numeric>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_util.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/util/sparse/sparse_tensor.h"
+
+namespace tensorflow {
+
+void Reshape(OpKernelContext *context, const Tensor &input_indices_in,
+             const Tensor &input_shape_in, const Tensor &target_shape_in,
+             int output_indices_idx, int output_shape_idx) {
+  OP_REQUIRES(context, TensorShapeUtils::IsMatrix(input_indices_in.shape()),
+              errors::InvalidArgument(
+                  "Input indices should be a matrix but received shape ",
+                  input_indices_in.shape().DebugString()));
+  OP_REQUIRES(context, TensorShapeUtils::IsVector(input_shape_in.shape()),
+              errors::InvalidArgument(
+                  "Input shape should be a vector but received shape ",
+                  input_shape_in.shape().DebugString()));
+  OP_REQUIRES(context, TensorShapeUtils::IsVector(target_shape_in.shape()),
+              errors::InvalidArgument(
+                  "Target shape should be a vector but received shape ",
+                  target_shape_in.shape().DebugString()));
+
+  const int64 input_rank = input_shape_in.NumElements();
+  const int64 output_rank = target_shape_in.NumElements();
+  const TensorShape input_shape(input_shape_in.vec<int64>());
+  const int64 dense_size = input_shape.num_elements();
+  const int64 nnz = input_indices_in.shape().dim_size(0);
+
+  // Compute the output shape. Determine product of specified dimensions, and
+  // find the index of the unspecified one.
+  TensorShape output_shape;
+  int64 product = 1;
+  int unknown_index = -1;
+  auto target_shape = target_shape_in.vec<int64>();
+  for (int d = 0; d < output_rank; ++d) {
+    const int64 size = target_shape(d);
+    if (size == -1) {
+      OP_REQUIRES(
+          context, unknown_index == -1,
+          errors::InvalidArgument("only one output dimension may be -1, "
+                                  "not both ",
+                                  unknown_index, " and ", d));
+      unknown_index = d;
+      output_shape.AddDim(1);
+    } else {
+      OP_REQUIRES(context, size >= 0,
+                  errors::InvalidArgument("size ", d,
+                                          " must be non-negative, not ", size));
+      product *= size;
+      output_shape.AddDim(size);
+    }
+  }
+  if (unknown_index != -1) {
+    OP_REQUIRES(
+        context, product > 0,
+        errors::InvalidArgument("reshape cannot infer the missing "
+                                "input size for an empty tensor unless all "
+                                "specified input sizes are non-zero"));
+    const int64 missing = dense_size / product;
+    OP_REQUIRES(
+        context, product * missing == dense_size,
+        errors::InvalidArgument(
+            "Input to reshape is a SparseTensor with ", dense_size,
+            " dense values, but the requested shape requires a multiple of ",
+            product));
+    output_shape.set_dim(unknown_index, missing);
+  }
+
+  OP_REQUIRES(
+      context, output_shape.num_elements() == dense_size,
+      errors::InvalidArgument("Input to reshape is a tensor with ", dense_size,
+                              " dense values, but the requested shape has ",
+                              output_shape.num_elements()));
+
+  // Optimize for reshaping to the same shape.
+  if (input_shape == output_shape) {
+    context->set_output(output_indices_idx, input_indices_in);
+    context->set_output(output_shape_idx, input_shape_in);
+    return;
+  }
+
+  gtl::InlinedVector<int64, 8> input_strides(input_rank);
+  input_strides[input_rank - 1] = 1;
+  for (int d = input_rank - 2; d >= 0; --d) {
+    input_strides[d] = input_strides[d + 1] * input_shape.dim_size(d + 1);
+  }
+
+  gtl::InlinedVector<int64, 8> output_strides(output_rank);
+  output_strides[output_rank - 1] = 1;
+  for (int d = output_rank - 2; d >= 0; --d) {
+    output_strides[d] = output_strides[d + 1] * output_shape.dim_size(d + 1);
+  }
+
+  Tensor *result_indices = nullptr;
+  OP_REQUIRES_OK(context,
+                 context->allocate_output(output_indices_idx,
+                                          TensorShape({nnz, output_rank}),
+                                          &result_indices));
+  auto input_ind = input_indices_in.matrix<int64>();
+  auto output_ind = result_indices->matrix<int64>();
+  for (int i = 0; i < nnz; ++i) {
+    int64 id = 0;
+    for (int j = 0; j < input_rank; ++j) {
+      id += input_ind(i, j) * input_strides[j];
+    }
+    for (int j = 0; j < output_rank; ++j) {
+      output_ind(i, j) = id / output_strides[j];
+      id %= output_strides[j];
+    }
+  }
+
+  Tensor *result_shape = nullptr;
+  OP_REQUIRES_OK(context, context->allocate_output(output_shape_idx,
+                                                   TensorShape({output_rank}),
+                                                   &result_shape));
+  auto output_shape_vec = result_shape->vec<int64>();
+  for (int j = 0; j < output_shape.dims(); ++j) {
+    output_shape_vec(j) = output_shape.dim_size(j);
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/reshape_util.h b/tensorflow/core/kernels/reshape_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed583afd13824eff789ea556045507fb4cff44e6
--- /dev/null
+++ b/tensorflow/core/kernels/reshape_util.h
@@ -0,0 +1,31 @@
+
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_RESHAPE_UTIL_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_RESHAPE_UTIL_H_
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+// Reshapes the input indices and input shape to the target shape.
+void Reshape(OpKernelContext *context, const Tensor &input_indices_in,
+             const Tensor &input_shape_in, const Tensor &target_shape_in,
+             int output_indices_idx, int output_shape_idx);
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_RESHAPE_UTIL_H_
diff --git a/tensorflow/core/kernels/resize_area_op_test.cc b/tensorflow/core/kernels/resize_area_op_test.cc
index cc5244d3a07031a843f3bb77e0d409cf9d64b4f2..a7e06ef15a1dd15c4c1428f44dbcd5e560b5e993 100644
--- a/tensorflow/core/kernels/resize_area_op_test.cc
+++ b/tensorflow/core/kernels/resize_area_op_test.cc
@@ -41,7 +41,7 @@ class ResizeAreaOpTest : public OpsTestBase {
     bool is_ref = IsRefType(input_types_[inputs_.size()]);
     Tensor* input = new Tensor(device_->GetAllocator(AllocatorAttributes()),
                                DataTypeToEnum<float>::v(), shape);
-    input->flat<float>().setZero();
+    input->flat<float>().setRandom();
     tensors_.push_back(input);
     if (is_ref) {
       CHECK_EQ(RemoveRefType(input_types_[inputs_.size()]),
diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index 0ae8a8fdbc14af81650fb756fdd20bb0d983e71e..e632baa2b4f26c2a3cd2505a19b7a623f7ba6255 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -55,6 +55,7 @@ limitations under the License.
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/gather_functor.h"
@@ -110,7 +111,6 @@ REGISTER_KERNEL_BUILDER(Name("ReadVariableOp").Device(DEVICE_CPU),
                         ReadVariableOp);
 
 #if GOOGLE_CUDA
-
 REGISTER_KERNEL_BUILDER(
     Name("ReadVariableOp").Device(DEVICE_GPU).HostMemory("resource"),
     ReadVariableOp);
@@ -130,6 +130,7 @@ REGISTER_KERNEL_BUILDER(
                           ResourceHandleOp<Var>)
 
 TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS);
+TF_CALL_variant(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
 #endif  // GOOGLE_CUDA
 
@@ -275,6 +276,64 @@ class AssignVariableOp : public OpKernel {
   DataType dtype_;
 };
 
+template <typename Device>
+Status VariantCopyFn(OpKernelContext* context, const Tensor& from, Tensor* to);
+
+#define CPU_DENSE_COPY(T)                                                \
+  case DataTypeToEnum<T>::value: {                                       \
+    functor::DenseUpdate<CPUDevice, T, ASSIGN> copy_functor_;            \
+    copy_functor_(context->eigen_device<CPUDevice>(), tensor->flat<T>(), \
+                  from.flat<T>());                                       \
+    break;                                                               \
+  }
+
+#define INSTANTIATE_GET_VARIANT_COPY_FN(Device, TYPE_CALLER, TYPE_DENSE_COPY) \
+  template <>                                                                 \
+  Status VariantCopyFn<Device>(OpKernelContext * context, const Tensor& from, \
+                               Tensor* to) {                                  \
+    PersistentTensor tmp;                                                     \
+    Tensor* tensor;                                                           \
+    AllocatorAttributes attr;                                                 \
+    attr.set_gpu_compatible(true);                                            \
+    attr.set_nic_compatible(true);                                            \
+    TF_RETURN_IF_ERROR(context->allocate_persistent(                          \
+        from.dtype(), from.shape(), &tmp, &tensor, attr));                    \
+    switch (from.dtype()) {                                                   \
+      TYPE_CALLER(TYPE_DENSE_COPY);                                           \
+      default:                                                                \
+        return errors::InvalidArgument(                                       \
+            "VariantCopyFn: Could not perform a deep copy of variant "        \
+            "element of type: ",                                              \
+            DataTypeString(from.dtype()),                                     \
+            " using device: ", context->device()->name());                    \
+    }                                                                         \
+    *to = *tensor;                                                            \
+    return Status::OK();                                                      \
+  }
+
+INSTANTIATE_GET_VARIANT_COPY_FN(CPUDevice, TF_CALL_ALL_TYPES, CPU_DENSE_COPY);
+
+#if GOOGLE_CUDA
+#define GPU_DENSE_COPY(T)                                                \
+  case DataTypeToEnum<T>::value: {                                       \
+    functor::DenseUpdate<GPUDevice, T, ASSIGN> copy_functor_;            \
+    copy_functor_(context->eigen_device<GPUDevice>(), tensor->flat<T>(), \
+                  from.flat<T>());                                       \
+    break;                                                               \
+  }
+#define TF_CALL_GPU_AND_ADDITIONAL_TYPES(T) \
+  TF_CALL_GPU_ALL_TYPES(T);                 \
+  TF_CALL_int32(T);                         \
+  TF_CALL_int64(T);
+INSTANTIATE_GET_VARIANT_COPY_FN(GPUDevice, TF_CALL_GPU_AND_ADDITIONAL_TYPES,
+                                GPU_DENSE_COPY);
+#undef TF_CALL_GPU_AND_ADDITIONAL_TYPES
+#undef GPU_DENSE_COPY
+#endif  // GOOGLE_CUDA
+
+#undef CPU_DENSE_COPY
+#undef INSTANTIATE_GET_VARIANT_COPY_FN
+
 template <typename Device>
 class AssignVariableOp<Device, Variant> : public OpKernel {
  public:
@@ -287,21 +346,15 @@ class AssignVariableOp<Device, Variant> : public OpKernel {
 
   void Compute(OpKernelContext* context) override {
     const Tensor& value = context->input(1);
-    OP_REQUIRES(context, dtype_ == value.dtype(),
-                errors::InvalidArgument(
-                    "Variable and value dtypes don't match; respectively, ",
-                    dtype_, " and ", context->input(1).dtype()));
-
     Var* variable = nullptr;
     OP_REQUIRES_OK(context, LookupOrCreateResource<Var>(
                                 context, HandleFromInput(context, 0), &variable,
                                 [this, context](Var** ptr) {
-                                  *ptr = new Var(dtype_);
-                                  // Create an empty new Variant tensor.
+                                  // Created on host.
+                                  *ptr = new Var(DT_VARIANT);
                                   return Status::OK();
                                 }));
     core::ScopedUnref s(variable);
-
     OP_REQUIRES(context, variable->tensor()->dtype() == DT_VARIANT,
                 errors::InvalidArgument(
                     "Trying to assign variable with wrong dtype. Expected ",
@@ -309,16 +362,17 @@ class AssignVariableOp<Device, Variant> : public OpKernel {
                     DataTypeString(DT_VARIANT)));
 
     mutex_lock ml(*variable->mu());
-    // TODO(ebrevdo): Add a proper Variant deep copy / assign registry
-    // entry and use that here.  For now, use a serialization
-    // roundtrip to perform the copy on CPU.  This is OK because this
-    // op is not registered for GPU.
-    *variable->tensor() = Tensor();
-    TensorProto tmp;
-    value.AsProtoTensorContent(&tmp);
-    OP_REQUIRES(context, variable->tensor()->FromProto(tmp),
-                errors::Internal("Could not properly reserialize values "
-                                 "Variant.  Check logs for more details."));
+
+    *variable->tensor() = Tensor(DT_VARIANT, value.shape());
+    const auto elements_in = value.flat<Variant>();
+    auto elements_out = variable->tensor()->flat<Variant>();
+    auto copy_fn = std::bind(&VariantCopyFn<Device>, context,
+                             std::placeholders::_1, std::placeholders::_2);
+    for (int64 i = 0; i < elements_in.size(); ++i) {
+      OP_REQUIRES_OK(context, VariantDeviceCopy(
+                                  VariantDeviceCopyDirection::DEVICE_TO_DEVICE,
+                                  elements_in(i), &elements_out(i), copy_fn));
+    };
   }
 
  private:
@@ -345,6 +399,7 @@ TF_CALL_variant(REGISTER_KERNELS);
                           AssignVariableOp<GPUDevice, type>);
 
 TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS);
+TF_CALL_variant(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
 #endif  // GOOGLE_CUDA
 
@@ -464,8 +519,7 @@ class ResourceGatherOp : public OpKernel {
       auto out_flat = out->shaped<T, 3>({1, N, out->NumElements() / N});
 
       functor::GatherFunctor<Device, T, Index> functor;
-      int64 bad_i = functor(c, params_flat,
-                            indices_flat, out_flat);
+      int64 bad_i = functor(c, params_flat, indices_flat, out_flat);
 
       OP_REQUIRES(
           c, bad_i < 0,
diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc
index 484932ab0157dee1685b2b90a6c013c11dac061d..3a95dd1773398509e81a514f07fd79f5cb9a0928 100644
--- a/tensorflow/core/kernels/scatter_nd_op.cc
+++ b/tensorflow/core/kernels/scatter_nd_op.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #endif  // GOOGLE_CUDA
 
 #include "tensorflow/core/kernels/scatter_nd_op.h"
+
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -28,6 +29,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/fill_functor.h"
+#include "tensorflow/core/kernels/training_op_helpers.h"
+#include "tensorflow/core/kernels/variable_ops.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
@@ -83,7 +86,10 @@ class ScatterNdUpdateOp : public OpKernel {
     const DataType dt = DataTypeToEnum<T>::v();
     const DataType dt_ref = DataTypeToEnum<T>::ref();
     const DataType index_t = DataTypeToEnum<Index>::v();
-    if (IsRefType(c->input_type(0))) {
+    dtype_ = c->input_type(0);
+    if (c->input_type(0) == DT_RESOURCE) {
+      // TODO(apassos): what to validate here?
+    } else if (IsRefType(c->input_type(0))) {
       OP_REQUIRES_OK(c, c->MatchSignature({dt_ref, index_t, dt}, {dt_ref}));
       OP_REQUIRES_OK(c, c->GetAttr("use_locking", &use_exclusive_lock_));
     } else {
@@ -93,7 +99,16 @@ class ScatterNdUpdateOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* c) override {
-    if (use_exclusive_lock_) {
+    if (dtype_ == DT_RESOURCE) {
+      if (use_exclusive_lock_) {
+        Var* v;
+        OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &v));
+        mutex_lock m(*v->mu());
+        DoCompute(c);
+      } else {
+        DoCompute(c);
+      }
+    } else if (use_exclusive_lock_) {
       // If we're here, it means the input type is a ref.
       DCHECK(IsRefType(c->input_dtype(0)));
       // Hold mutex while we apply updates
@@ -105,6 +120,7 @@ class ScatterNdUpdateOp : public OpKernel {
   }
 
  private:
+  DataType dtype_;
   bool use_exclusive_lock_;
 
   void DoCompute(OpKernelContext* c) {
@@ -113,7 +129,20 @@ class ScatterNdUpdateOp : public OpKernel {
     Tensor params;
     TensorShape params_shape;
 
-    if (IsRefType(c->input_dtype(0))) {
+    if (dtype_ == DT_RESOURCE) {
+      Var* v;
+      OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &v));
+      Tensor* t = v->tensor();
+      if (!use_exclusive_lock_) {
+        // We're not holding the lock in the outer scope so need it here.
+        mutex_lock m(*v->mu());
+        OP_REQUIRES_OK(c, PrepareToUpdateVariable<Device, T>(c, t));
+      } else {
+        OP_REQUIRES_OK(c, PrepareToUpdateVariable<Device, T>(c, t));
+      }
+      params = *t;
+      params_shape = params.shape();
+    } else if (IsRefType(c->input_dtype(0))) {
       params = c->mutable_input(0, use_exclusive_lock_);
       params_shape = params.shape();
       c->forward_ref_input_to_ref_output(0, 0);
@@ -159,6 +188,16 @@ class ScatterNdUpdateOp : public OpKernel {
           .TypeConstraint<index_type>("Tindices"),                           \
       ScatterNdUpdateOp<dev##Device, type, index_type, op>)
 
+#define REGISTER_RESOURCE_SCATTER_ND_UPDATE_KERNEL_INDEX(type, index_type, \
+                                                         dev, name, op)    \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name(name)                                                           \
+          .Device(DEVICE_##dev)                                            \
+          .TypeConstraint<type>("T")                                       \
+          .TypeConstraint<index_type>("Tindices")                          \
+          .HostMemory("ref"),                                              \
+      ScatterNdUpdateOp<dev##Device, type, index_type, op>)
+
 #define REGISTER_SCATTER_ND_KERNEL(type, dev, name)         \
   REGISTER_SCATTER_ND_KERNEL_INDEX(type, int32, dev, name); \
   REGISTER_SCATTER_ND_KERNEL_INDEX(type, int64, dev, name)
@@ -167,6 +206,11 @@ class ScatterNdUpdateOp : public OpKernel {
   REGISTER_SCATTER_ND_UPDATE_KERNEL_INDEX(type, int32, dev, name, op); \
   REGISTER_SCATTER_ND_UPDATE_KERNEL_INDEX(type, int64, dev, name, op)
 
+#define REGISTER_RESOURCE_SCATTER_ND_UPDATE_KERNEL(type, dev, name, op)    \
+  REGISTER_RESOURCE_SCATTER_ND_UPDATE_KERNEL_INDEX(type, int32, dev, name, \
+                                                   op);                    \
+  REGISTER_RESOURCE_SCATTER_ND_UPDATE_KERNEL_INDEX(type, int64, dev, name, op)
+
 #define REGISTER_SCATTER_ND_ADD_SUB(type, dev)                            \
   REGISTER_SCATTER_ND_UPDATE_KERNEL(type, dev, "ScatterNdAdd",            \
                                     scatter_nd_op::UpdateOp::ADD);        \
@@ -178,9 +222,11 @@ class ScatterNdUpdateOp : public OpKernel {
 #define REGISTER_SCATTER_ND(type, dev) \
   REGISTER_SCATTER_ND_KERNEL(type, dev, "ScatterNd");
 
-#define REGISTER_SCATTER_ND_UPDATE(type, dev)                     \
-  REGISTER_SCATTER_ND_UPDATE_KERNEL(type, dev, "ScatterNdUpdate", \
-                                    scatter_nd_op::UpdateOp::ASSIGN);
+#define REGISTER_SCATTER_ND_UPDATE(type, dev)                         \
+  REGISTER_SCATTER_ND_UPDATE_KERNEL(type, dev, "ScatterNdUpdate",     \
+                                    scatter_nd_op::UpdateOp::ASSIGN); \
+  REGISTER_RESOURCE_SCATTER_ND_UPDATE_KERNEL(                         \
+      type, dev, "ResourceScatterNdUpdate", scatter_nd_op::UpdateOp::ASSIGN);
 
 // Registers CPU kernels.
 #define REGISTER_SCATTER_ND_ADD_SUB_CPU(type) \
@@ -281,8 +327,7 @@ Status ValidateUpdateShape(const TensorShape& params_shape,
 }
 
 template <typename Index>
-Status PrepareAndValidateInputs(OpKernelContext* c,
-                                const TensorShape& params_shape,
+Status PrepareAndValidateInputs(const TensorShape& params_shape,
                                 const Tensor& indices, const Tensor& updates,
                                 int64* slice_dim, Index* num_updates,
                                 Index* slice_size) {
@@ -396,7 +441,7 @@ Status DoScatterNd(OpKernelContext* c, const Tensor& indices,
   Index num_updates;
   Index slice_size;
   TF_RETURN_IF_ERROR(PrepareAndValidateInputs<Index>(
-      c, shape, indices, updates, &slice_dim, &num_updates, &slice_size));
+      shape, indices, updates, &slice_dim, &num_updates, &slice_size));
 
   IndexFlattener<Device, Index> index_flattener;
   auto indices_flat = index_flattener(c, indices);
@@ -442,6 +487,8 @@ Status DoScatterNd(OpKernelContext* c, const Tensor& indices,
       PARAMS_CASE(3);
       PARAMS_CASE(4);
       PARAMS_CASE(5);
+      PARAMS_CASE(6);
+      PARAMS_CASE(7);
 #undef PARAMS_CASE
       default:
         return errors::InvalidArgument(
@@ -480,7 +527,9 @@ namespace functor {
   DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 2); \
   DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 3); \
   DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 4); \
-  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 5);
+  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 5); \
+  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 6); \
+  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 7);
 
 #define DECLARE_GPU_SPECS_INDEX(T, Index)                                \
   DECLARE_GPU_SPECS_INDEX_OP(T, Index, scatter_nd_op::UpdateOp::ASSIGN); \
diff --git a/tensorflow/core/kernels/scatter_nd_op_cpu_impl_6.cc b/tensorflow/core/kernels/scatter_nd_op_cpu_impl_6.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d98412e2551b5eacb9190838b922cadd26d7aaf2
--- /dev/null
+++ b/tensorflow/core/kernels/scatter_nd_op_cpu_impl_6.cc
@@ -0,0 +1,18 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define CPU_PROVIDED_IXDIM 6
+#include "tensorflow/core/kernels/scatter_nd_op_cpu_impl.h"
+#undef CPU_PROVIDED_IXDIM
diff --git a/tensorflow/core/kernels/scatter_nd_op_cpu_impl_7.cc b/tensorflow/core/kernels/scatter_nd_op_cpu_impl_7.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a008b55603c060953015a463cf49f5768bde637a
--- /dev/null
+++ b/tensorflow/core/kernels/scatter_nd_op_cpu_impl_7.cc
@@ -0,0 +1,19 @@
+
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define CPU_PROVIDED_IXDIM 7
+#include "tensorflow/core/kernels/scatter_nd_op_cpu_impl.h"
+#undef CPU_PROVIDED_IXDIM
diff --git a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
index 0eb3cf32dd33705cffe4c37dbe91eb0ffc31563a..31f74671cabdabce2884fcae61a6e56dbfdefe8b 100644
--- a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
@@ -136,7 +136,9 @@ struct ScatterNdFunctor<GPUDevice, T, Index, op, IXDIM> {
   DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 2); \
   DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 3); \
   DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 4); \
-  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 5);
+  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 5); \
+  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 6); \
+  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 7);
 
 #define DECLARE_GPU_SPECS_INDEX(T, Index)                                \
   DECLARE_GPU_SPECS_INDEX_OP(T, Index, scatter_nd_op::UpdateOp::ASSIGN); \
diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc
index 2334e50f1dcb08e9662615e83d721f8b08568102..3ef1cd1e062b5f5abecca2f4f788e3fed20e33e9 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops.cc
@@ -553,10 +553,11 @@ class SparseSegmentReductionOpBase : public OpKernel {
  public:
   explicit SparseSegmentReductionOpBase(OpKernelConstruction* context,
                                         bool is_mean, bool is_sqrtn,
-                                        T default_value)
+                                        bool has_num_segments, T default_value)
       : OpKernel(context),
         is_mean_(is_mean),
         is_sqrtn_(is_sqrtn),
+        has_num_segments_(has_num_segments),
         default_value_(default_value) {}
 
   void Compute(OpKernelContext* context) override {
@@ -564,6 +565,19 @@ class SparseSegmentReductionOpBase : public OpKernel {
     const Tensor& indices = context->input(1);
     const Tensor& segment_ids = context->input(2);
 
+    Index output_rows = -1;
+    if (has_num_segments_) {
+      const Tensor& num_segments = context->input(3);
+
+      OP_REQUIRES(
+          context, num_segments.shape().dims() == 0,
+          errors::InvalidArgument("num_segments should be a scalar, not shape ",
+                                  num_segments.shape().DebugString()));
+      output_rows = internal::SubtleMustCopy(num_segments.scalar<int32>()());
+      OP_REQUIRES(context, output_rows >= 0,
+                  errors::InvalidArgument("segment ids must be >= 0"));
+    }
+
     OP_REQUIRES(context, TensorShapeUtils::IsVector(indices.shape()),
                 errors::InvalidArgument("indices should be a vector."));
     OP_REQUIRES(context, TensorShapeUtils::IsVector(segment_ids.shape()),
@@ -581,10 +595,17 @@ class SparseSegmentReductionOpBase : public OpKernel {
     const auto segment_vec = segment_ids.vec<OutputRow>();
     // Note that the current implementation assumes that segment_vec values are
     // sorted.
-    const OutputRow output_rows =
+    const OutputRow last_segment_id_plus_one =
         num_indices > 0
             ? internal::SubtleMustCopy(segment_vec(num_indices - 1)) + 1
             : 0;
+    if (has_num_segments_) {
+      OP_REQUIRES(
+          context, output_rows >= last_segment_id_plus_one,
+          errors::InvalidArgument("segment ids must be < num_segments"));
+    } else {
+      output_rows = last_segment_id_plus_one;
+    }
     OP_REQUIRES(context, output_rows >= 0,
                 errors::InvalidArgument("segment ids must be >= 0"));
 
@@ -646,11 +667,20 @@ class SparseSegmentReductionOpBase : public OpKernel {
                       indices_vec(start + bad_offset), " out of range [0, ",
                       input_flat.dimension(0), ")"));
 
-      if (end >= num_indices) break;
       start = end;
       ++end;
       uninitialized_index = out_index + 1;
       out_index = next_index;
+      if (end > num_indices) break;
+    }
+
+    // Fill the gap at the end with the default value.
+    if (uninitialized_index < output_rows) {
+      Eigen::DSizes<Eigen::DenseIndex, 2> gap_slice_shape(
+          output_rows - uninitialized_index, num_col);
+      Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>, Eigen::Unaligned>
+          gap_slice(&output_flat(uninitialized_index, 0), gap_slice_shape);
+      gap_slice.setConstant(default_value_);
     }
   }
 
@@ -786,6 +816,7 @@ class SparseSegmentReductionOpBase : public OpKernel {
 
   const bool is_mean_;
   const bool is_sqrtn_;
+  const bool has_num_segments_;
   const T default_value_;
 };
 
@@ -794,9 +825,20 @@ class SparseSegmentReductionMeanOp
     : public SparseSegmentReductionOpBase<Device, T> {
  public:
   explicit SparseSegmentReductionMeanOp(OpKernelConstruction* context)
-      : SparseSegmentReductionOpBase<Device, T>(context, true /*is_mean*/,
-                                                false /*is_sqrtn*/,
-                                                T(0) /* default_value */) {}
+      : SparseSegmentReductionOpBase<Device, T>(
+            context, true /*is_mean*/, false /*is_sqrtn*/,
+            false /* has_num_segments */, T(0) /* default_value */) {}
+};
+
+template <typename Device, class T>
+class SparseSegmentReductionMeanWithNumSegmentsOp
+    : public SparseSegmentReductionOpBase<Device, T> {
+ public:
+  explicit SparseSegmentReductionMeanWithNumSegmentsOp(
+      OpKernelConstruction* context)
+      : SparseSegmentReductionOpBase<Device, T>(
+            context, true /*is_mean*/, false /*is_sqrtn*/,
+            true /* has_num_segments */, T(0) /* default_value */) {}
 };
 
 template <typename Device, class T>
@@ -804,9 +846,20 @@ class SparseSegmentReductionSqrtNOp
     : public SparseSegmentReductionOpBase<Device, T> {
  public:
   explicit SparseSegmentReductionSqrtNOp(OpKernelConstruction* context)
-      : SparseSegmentReductionOpBase<Device, T>(context, false /*is_mean*/,
-                                                true /*is_sqrtn*/,
-                                                T(0) /* default_value */) {}
+      : SparseSegmentReductionOpBase<Device, T>(
+            context, false /*is_mean*/, true /*is_sqrtn*/,
+            false /* has_num_segments */, T(0) /* default_value */) {}
+};
+
+template <typename Device, class T>
+class SparseSegmentReductionSqrtNWithNumSegmentsOp
+    : public SparseSegmentReductionOpBase<Device, T> {
+ public:
+  explicit SparseSegmentReductionSqrtNWithNumSegmentsOp(
+      OpKernelConstruction* context)
+      : SparseSegmentReductionOpBase<Device, T>(
+            context, false /*is_mean*/, true /*is_sqrtn*/,
+            true /* has_num_segments */, T(0) /* default_value */) {}
 };
 
 template <typename Device, class T>
@@ -814,37 +867,65 @@ class SparseSegmentReductionSumOp
     : public SparseSegmentReductionOpBase<Device, T> {
  public:
   explicit SparseSegmentReductionSumOp(OpKernelConstruction* context)
-      : SparseSegmentReductionOpBase<Device, T>(context, false /*is_mean*/,
-                                                false /*is_sqrtn*/,
-                                                T(0) /* default_value */) {}
+      : SparseSegmentReductionOpBase<Device, T>(
+            context, false /*is_mean*/, false /*is_sqrtn*/,
+            false /* has_num_segments */, T(0) /* default_value */) {}
 };
 
-#define REGISTER_CPU_SPARSE_KERNELS(type)                     \
-  REGISTER_KERNEL_BUILDER(Name("SparseSegmentSum")            \
-                              .Device(DEVICE_CPU)             \
-                              .TypeConstraint<type>("T")      \
-                              .TypeConstraint<int32>("Tidx"), \
-                          SparseSegmentReductionSumOp<CPUDevice, type>);
+template <typename Device, class T>
+class SparseSegmentReductionSumWithNumSegmentsOp
+    : public SparseSegmentReductionOpBase<Device, T> {
+ public:
+  explicit SparseSegmentReductionSumWithNumSegmentsOp(
+      OpKernelConstruction* context)
+      : SparseSegmentReductionOpBase<Device, T>(
+            context, false /*is_mean*/, false /*is_sqrtn*/,
+            true /* has_num_segments */, T(0) /* default_value */) {}
+};
 
+#define REGISTER_CPU_SPARSE_KERNELS(type)                                \
+  REGISTER_KERNEL_BUILDER(Name("SparseSegmentSum")                       \
+                              .Device(DEVICE_CPU)                        \
+                              .TypeConstraint<type>("T")                 \
+                              .TypeConstraint<int32>("Tidx"),            \
+                          SparseSegmentReductionSumOp<CPUDevice, type>); \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("SparseSegmentSumWithNumSegments")                            \
+          .Device(DEVICE_CPU)                                            \
+          .TypeConstraint<type>("T")                                     \
+          .TypeConstraint<int32>("Tidx"),                                \
+      SparseSegmentReductionSumWithNumSegmentsOp<CPUDevice, type>);
 TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_SPARSE_KERNELS);
 #undef REGISTER_CPU_SPARSE_KERNELS
 
-#define REGISTER_CPU_SPARSE_KERNELS(type)                     \
-  REGISTER_KERNEL_BUILDER(Name("SparseSegmentMean")           \
-                              .Device(DEVICE_CPU)             \
-                              .TypeConstraint<type>("T")      \
-                              .TypeConstraint<int32>("Tidx"), \
-                          SparseSegmentReductionMeanOp<CPUDevice, type>);
+#define REGISTER_CPU_SPARSE_KERNELS(type)                                 \
+  REGISTER_KERNEL_BUILDER(Name("SparseSegmentMean")                       \
+                              .Device(DEVICE_CPU)                         \
+                              .TypeConstraint<type>("T")                  \
+                              .TypeConstraint<int32>("Tidx"),             \
+                          SparseSegmentReductionMeanOp<CPUDevice, type>); \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("SparseSegmentMeanWithNumSegments")                            \
+          .Device(DEVICE_CPU)                                             \
+          .TypeConstraint<type>("T")                                      \
+          .TypeConstraint<int32>("Tidx"),                                 \
+      SparseSegmentReductionMeanWithNumSegmentsOp<CPUDevice, type>);
 REGISTER_CPU_SPARSE_KERNELS(float);
 REGISTER_CPU_SPARSE_KERNELS(double);
 #undef REGISTER_CPU_SPARSE_KERNELS
 
-#define REGISTER_CPU_SPARSE_KERNELS(type)                     \
-  REGISTER_KERNEL_BUILDER(Name("SparseSegmentSqrtN")          \
-                              .Device(DEVICE_CPU)             \
-                              .TypeConstraint<type>("T")      \
-                              .TypeConstraint<int32>("Tidx"), \
-                          SparseSegmentReductionSqrtNOp<CPUDevice, type>);
+#define REGISTER_CPU_SPARSE_KERNELS(type)                                  \
+  REGISTER_KERNEL_BUILDER(Name("SparseSegmentSqrtN")                       \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<type>("T")                   \
+                              .TypeConstraint<int32>("Tidx"),              \
+                          SparseSegmentReductionSqrtNOp<CPUDevice, type>); \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("SparseSegmentSqrtNWithNumSegments")                            \
+          .Device(DEVICE_CPU)                                              \
+          .TypeConstraint<type>("T")                                       \
+          .TypeConstraint<int32>("Tidx"),                                  \
+      SparseSegmentReductionSqrtNWithNumSegmentsOp<CPUDevice, type>);
 REGISTER_CPU_SPARSE_KERNELS(float);
 REGISTER_CPU_SPARSE_KERNELS(double);
 #undef REGISTER_CPU_SPARSE_KERNELS
@@ -889,9 +970,10 @@ class SparseSegmentGradOpBase : public OpKernel {
 
     // Note that similar to SparseSegmentMean, we assume that segment_vec is
     // already sorted and has non-negative values.
-    const SegmentId num_segments =
+    const SegmentId num_segments = input.dim_size(0);
+    const SegmentId last_segment_id_plus_one =
         internal::SubtleMustCopy(segment_vec(N - 1)) + 1;
-    OP_REQUIRES(context, input.dim_size(0) == num_segments,
+    OP_REQUIRES(context, last_segment_id_plus_one <= num_segments,
                 errors::InvalidArgument("Invalid number of segments"));
 
     // Compute scaling factors for input.
diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc
index 542382872cc706eb868639d0b26ceece98eb41b1..206fd40fa68c3158fa60b7651d40121ab1344bbd 100644
--- a/tensorflow/core/kernels/sendrecv_ops.cc
+++ b/tensorflow/core/kernels/sendrecv_ops.cc
@@ -142,17 +142,12 @@ RecvOp::RecvOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
   }
 }
 
-void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
-  OP_REQUIRES(
-      ctx, ctx->rendezvous() != nullptr,
-      errors::Internal("Op kernel context needs to provide a rendezvous."));
-
-  Rendezvous::Args args;
-  args.device_context = ctx->op_device_context();
-  args.alloc_attrs = ctx->output_alloc_attr(0);
+namespace {
+Rendezvous::DoneCallback make_recv_callback(OpKernelContext* ctx,
+                                            AsyncOpKernel::DoneCallback done) {
   using namespace std::placeholders;
-  Rendezvous::DoneCallback done_cb = std::bind(
-      [ctx](DoneCallback done,
+  return std::bind(
+      [ctx](AsyncOpKernel::DoneCallback done,
             // Begin unbound arguments.
             const Status& s, const Rendezvous::Args& send_args,
             const Rendezvous::Args& recv_args, const Tensor& val,
@@ -170,19 +165,31 @@ void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
         done();
       },
       std::move(done), _1, _2, _3, _4, _5);
+}
+}  // namespace
+
+void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
+  OP_REQUIRES(
+      ctx, ctx->rendezvous() != nullptr,
+      errors::Internal("Op kernel context needs to provide a rendezvous."));
+
+  Rendezvous::Args args;
+  args.device_context = ctx->op_device_context();
+  args.alloc_attrs = ctx->output_alloc_attr(0);
 
   FrameAndIter frame_iter = GetFrameAndIter(ctx, hostmem_sendrecv_);
   if (frame_iter == FrameAndIter(0, 0)) {
     VLOG(2) << "Recv " << parsed_key_.buf_;
-    ctx->rendezvous()->RecvAsync(parsed_key_, args, std::move(done_cb));
+    ctx->rendezvous()->RecvAsync(parsed_key_, args,
+                                 make_recv_callback(ctx, std::move(done)));
   } else {
     Rendezvous::ParsedKey in_loop_parsed;
     GetRendezvousKey(key_prefix_, frame_iter, &in_loop_parsed.buf_);
     VLOG(2) << "Recv " << in_loop_parsed.buf_;
     OP_REQUIRES_OK_ASYNC(
         ctx, Rendezvous::ParseKey(in_loop_parsed.buf_, &in_loop_parsed), done);
-
-    ctx->rendezvous()->RecvAsync(in_loop_parsed, args, std::move(done_cb));
+    ctx->rendezvous()->RecvAsync(in_loop_parsed, args,
+                                 make_recv_callback(ctx, std::move(done)));
   }
 }
 
diff --git a/tensorflow/core/kernels/serialize_sparse_op.cc b/tensorflow/core/kernels/serialize_sparse_op.cc
index ac58c3d1ea9649f936472e995e1c72ad0c509b0c..61e40caef99c019914fc331bee5d8beab0883f41 100644
--- a/tensorflow/core/kernels/serialize_sparse_op.cc
+++ b/tensorflow/core/kernels/serialize_sparse_op.cc
@@ -27,22 +27,31 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_util.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
+#include "tensorflow/core/kernels/reshape_util.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/util/sparse/sparse_tensor.h"
 
 namespace tensorflow {
 
 using sparse::SparseTensor;
 
+template <typename T>
 class SerializeSparseOp : public OpKernel {
  public:
   explicit SerializeSparseOp(OpKernelConstruction* context)
       : OpKernel(context) {}
 
+  Status Initialize(Tensor* result);
+  Status Serialize(const Tensor& input, T* result);
+
   void Compute(OpKernelContext* context) override {
     const Tensor* input_indices;
     const Tensor* input_values;
     const Tensor* input_shape;
+
     OP_REQUIRES_OK(context, context->input("sparse_indices", &input_indices));
     OP_REQUIRES_OK(context, context->input("sparse_values", &input_values));
     OP_REQUIRES_OK(context, context->input("sparse_shape", &input_shape));
@@ -61,34 +70,75 @@ class SerializeSparseOp : public OpKernel {
                     "Input shape should be a vector but received shape ",
                     input_shape->shape().DebugString()));
 
-    TensorProto proto_indices;
-    TensorProto proto_values;
-    TensorProto proto_shape;
-
-    input_indices->AsProtoTensorContent(&proto_indices);
-    input_values->AsProtoTensorContent(&proto_values);
-    input_shape->AsProtoTensorContent(&proto_shape);
+    Tensor serialized_sparse;
+    OP_REQUIRES_OK(context, Initialize(&serialized_sparse));
 
-    Tensor serialized_sparse(DT_STRING, TensorShape({3}));
-    auto serialized_sparse_t = serialized_sparse.vec<string>();
-
-    serialized_sparse_t(0) = proto_indices.SerializeAsString();
-    serialized_sparse_t(1) = proto_values.SerializeAsString();
-    serialized_sparse_t(2) = proto_shape.SerializeAsString();
+    auto serialized_sparse_t = serialized_sparse.vec<T>();
+    OP_REQUIRES_OK(context, Serialize(*input_indices, &serialized_sparse_t(0)));
+    OP_REQUIRES_OK(context, Serialize(*input_values, &serialized_sparse_t(1)));
+    OP_REQUIRES_OK(context, Serialize(*input_shape, &serialized_sparse_t(2)));
 
     context->set_output(0, serialized_sparse);
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("SerializeSparse").Device(DEVICE_CPU),
-                        SerializeSparseOp);
+template <>
+Status SerializeSparseOp<string>::Initialize(Tensor* result) {
+  *result = Tensor(DT_STRING, TensorShape({3}));
+  return Status::OK();
+}
+
+template <>
+Status SerializeSparseOp<string>::Serialize(const Tensor& input,
+                                            string* result) {
+  TensorProto proto;
+  input.AsProtoTensorContent(&proto);
+  *result = proto.SerializeAsString();
+  return Status::OK();
+}
+
+REGISTER_KERNEL_BUILDER(Name("SerializeSparse")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<string>("out_type"),
+                        SerializeSparseOp<string>);
+
+template <>
+Status SerializeSparseOp<Variant>::Initialize(Tensor* result) {
+  *result = Tensor(DT_VARIANT, TensorShape({3}));
+  return Status::OK();
+}
+
+template <>
+Status SerializeSparseOp<Variant>::Serialize(const Tensor& input,
+                                             Variant* result) {
+  *result = input;
+  return Status::OK();
+}
+
+REGISTER_KERNEL_BUILDER(Name("SerializeSparse")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<Variant>("out_type"),
+                        SerializeSparseOp<Variant>);
 
 template <typename T>
-class SerializeManySparseOp : public OpKernel {
+class SerializeManySparseOpBase : public OpKernel {
  public:
-  explicit SerializeManySparseOp(OpKernelConstruction* context)
+  explicit SerializeManySparseOpBase(OpKernelConstruction* context)
       : OpKernel(context) {}
 
+  void Compute(OpKernelContext* context) override {}
+
+ protected:
+  Status Initialize(const int64 n, Tensor* result);
+  Status Serialize(const Tensor& input, T* result);
+};
+
+template <typename T, typename U>
+class SerializeManySparseOp : public SerializeManySparseOpBase<U> {
+ public:
+  explicit SerializeManySparseOp(OpKernelConstruction* context)
+      : SerializeManySparseOpBase<U>(context) {}
+
   void Compute(OpKernelContext* context) override {
     const Tensor* input_indices;
     const Tensor* input_values;
@@ -126,37 +176,31 @@ class SerializeManySparseOp : public OpKernel {
 
     auto input_shape_t = input_shape->vec<int64>();
     const int64 N = input_shape_t(0);
-
-    Tensor serialized_sparse(DT_STRING, TensorShape({N, 3}));
-    auto serialized_sparse_t = serialized_sparse.matrix<string>();
+    Tensor serialized_sparse;
+    OP_REQUIRES_OK(context, this->Initialize(N, &serialized_sparse));
+    auto serialized_sparse_t = serialized_sparse.matrix<U>();
 
     OP_REQUIRES_OK(context, input_st.IndicesValid());
 
-    // We can generate the output shape proto string now, for all
-    // minibatch entries.
-    Tensor output_shape(DT_INT64, {rank - 1});
-    auto output_shape_t = output_shape.vec<int64>();
-    for (int d = 1; d < rank; d++) output_shape_t(d - 1) = input_shape_t(d);
-    TensorProto proto_shape;
-    output_shape.AsProtoTensorContent(&proto_shape);
-    const string proto_shape_string = proto_shape.SerializeAsString();
-
+    // Initialize output with empty values and the proper shapes.
     Tensor output_blank_indices(DT_INT64, {0, rank - 1});
-    Tensor output_blank_values(DataTypeToEnum<T>::value, {0});
-    TensorProto proto_blank_indices;
-    TensorProto proto_blank_values;
-    output_blank_indices.AsProtoTensorContent(&proto_blank_indices);
-    output_blank_values.AsProtoTensorContent(&proto_blank_values);
+    U serialized_indices;
+    OP_REQUIRES_OK(context,
+                   this->Serialize(output_blank_indices, &serialized_indices));
+    serialized_sparse_t.template chip<1>(0).setConstant(serialized_indices);
 
-    const string proto_blank_indices_string =
-        proto_blank_indices.SerializeAsString();
-    const string proto_blank_values_string =
-        proto_blank_values.SerializeAsString();
+    Tensor output_blank_values(DataTypeToEnum<T>::value, {0});
+    U serialized_values;
+    OP_REQUIRES_OK(context,
+                   this->Serialize(output_blank_values, &serialized_values));
+    serialized_sparse_t.template chip<1>(1).setConstant(serialized_values);
 
-    // Initialize output with empty values and the proper shapes.
-    serialized_sparse_t.chip<1>(0).setConstant(proto_blank_indices_string);
-    serialized_sparse_t.chip<1>(1).setConstant(proto_blank_values_string);
-    serialized_sparse_t.chip<1>(2).setConstant(proto_shape_string);
+    Tensor output_shape(DT_INT64, {rank - 1});
+    auto output_shape_t = output_shape.vec<int64>();
+    for (int d = 1; d < rank; d++) output_shape_t(d - 1) = input_shape_t(d);
+    U serialized_shape;
+    OP_REQUIRES_OK(context, this->Serialize(output_shape, &serialized_shape));
+    serialized_sparse_t.template chip<1>(2).setConstant(serialized_shape);
 
     // Get groups by minibatch dimension
     sparse::GroupIterable minibatch = input_st.group({0});
@@ -185,24 +229,62 @@ class SerializeManySparseOp : public OpKernel {
         output_values_t(i) = values(i);
       }
 
-      TensorProto proto_indices;
-      TensorProto proto_values;
-      output_indices.AsProtoTensorContent(&proto_indices);
-      output_values.AsProtoTensorContent(&proto_values);
-
-      serialized_sparse_t(b, 0) = proto_indices.SerializeAsString();
-      serialized_sparse_t(b, 1) = proto_values.SerializeAsString();
+      OP_REQUIRES_OK(
+          context, this->Serialize(output_indices, &serialized_sparse_t(b, 0)));
+      OP_REQUIRES_OK(
+          context, this->Serialize(output_values, &serialized_sparse_t(b, 1)));
     }
 
     context->set_output(0, serialized_sparse);
   }
 };
 
-#define REGISTER_KERNELS(type)                            \
-  REGISTER_KERNEL_BUILDER(Name("SerializeManySparse")     \
-                              .Device(DEVICE_CPU)         \
-                              .TypeConstraint<type>("T"), \
-                          SerializeManySparseOp<type>)
+template <>
+Status SerializeManySparseOpBase<string>::Initialize(const int64 n,
+                                                     Tensor* result) {
+  *result = Tensor(DT_STRING, TensorShape({n, 3}));
+  return Status::OK();
+}
+
+template <>
+Status SerializeManySparseOpBase<string>::Serialize(const Tensor& input,
+                                                    string* result) {
+  TensorProto proto;
+  input.AsProtoTensorContent(&proto);
+  *result = proto.SerializeAsString();
+  return Status::OK();
+}
+
+#define REGISTER_KERNELS(type)                                     \
+  REGISTER_KERNEL_BUILDER(Name("SerializeManySparse")              \
+                              .Device(DEVICE_CPU)                  \
+                              .TypeConstraint<type>("T")           \
+                              .TypeConstraint<string>("out_type"), \
+                          SerializeManySparseOp<type, string>)
+
+TF_CALL_ALL_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+
+template <>
+Status SerializeManySparseOpBase<Variant>::Initialize(const int64 n,
+                                                      Tensor* result) {
+  *result = Tensor(DT_VARIANT, TensorShape({n, 3}));
+  return Status::OK();
+}
+
+template <>
+Status SerializeManySparseOpBase<Variant>::Serialize(const Tensor& input,
+                                                     Variant* result) {
+  *result = input;
+  return Status::OK();
+}
+
+#define REGISTER_KERNELS(type)                                      \
+  REGISTER_KERNEL_BUILDER(Name("SerializeManySparse")               \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .TypeConstraint<Variant>("out_type"), \
+                          SerializeManySparseOp<type, Variant>)
 
 TF_CALL_ALL_TYPES(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
@@ -211,281 +293,264 @@ template <typename T>
 class DeserializeSparseOp : public OpKernel {
  public:
   explicit DeserializeSparseOp(OpKernelConstruction* context)
-      : OpKernel(context) {}
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
+  }
 
   void Compute(OpKernelContext* context) override {
     const Tensor& serialized_sparse = context->input(0);
-    OP_REQUIRES(context, TensorShapeUtils::IsVector(serialized_sparse.shape()),
-                errors::InvalidArgument(
-                    "Serialized sparse should be a vector but received shape ",
-                    serialized_sparse.shape().DebugString()));
-    OP_REQUIRES(
-        context, serialized_sparse.shape().dim_size(0) == 3,
-        errors::InvalidArgument(
-            "Serialize sparse should have 3 columns but received shape ",
-            serialized_sparse.shape().DebugString()));
-
-    Tensor output_indices(DT_INT64);
-    Tensor output_values(DataTypeToEnum<T>::value);
-    Tensor output_shape(DT_INT64);
-    TensorProto proto_indices;
-    TensorProto proto_values;
-    TensorProto proto_shape;
-
-    const auto& serialized_sparse_t = serialized_sparse.vec<string>();
-
-    OP_REQUIRES(
-        context, ParseProtoUnlimited(&proto_indices, serialized_sparse_t(0)),
-        errors::InvalidArgument("Could not parse serialized_sparse[0]"));
-    OP_REQUIRES(
-        context, ParseProtoUnlimited(&proto_values, serialized_sparse_t(1)),
-        errors::InvalidArgument("Could not parse serialized_sparse[1]"));
-    OP_REQUIRES(
-        context, ParseProtoUnlimited(&proto_shape, serialized_sparse_t(2)),
-        errors::InvalidArgument("Could not parse serialized_sparse[2]"));
+    const int ndims = serialized_sparse.shape().dims();
 
     OP_REQUIRES(
-        context, output_indices.FromProto(proto_indices),
-        errors::InvalidArgument(
-            "Could not construct Tensor serialized_sparse[0] (indices)"));
-    OP_REQUIRES(
-        context, TensorShapeUtils::IsMatrix(output_indices.shape()),
-        errors::InvalidArgument("Expected serialized_sparse[0] to represent an "
-                                "index matrix but received shape ",
-                                output_indices.shape().DebugString()));
-    OP_REQUIRES(
-        context, output_values.FromProto(proto_values),
-        errors::InvalidArgument(
-            "Could not construct Tensor serialized_sparse[1] (values)"));
-    OP_REQUIRES(
-        context, TensorShapeUtils::IsVector(output_values.shape()),
-        errors::InvalidArgument("Expected serialized_sparse[1] to represent a "
-                                "values vector but received shape ",
-                                output_values.shape().DebugString()));
-    OP_REQUIRES(context, output_shape.FromProto(proto_shape),
-                errors::InvalidArgument(
-                    "Could not construct Tensor serialized_sparse[2] (shape)"));
-    OP_REQUIRES(context, TensorShapeUtils::IsVector(output_shape.shape()),
-                errors::InvalidArgument("Expected serialized_sparse[2] to be a "
-                                        "shape vector but its shape is ",
-                                        output_shape.shape().DebugString()));
+        context, ndims > 0,
+        errors::InvalidArgument("Serialized sparse should have non-zero rank ",
+                                serialized_sparse.shape().DebugString()));
 
-    OP_REQUIRES(
-        context, DataTypeToEnum<T>::value == output_values.dtype(),
-        errors::InvalidArgument("Requested SparseTensor of type ",
-                                DataTypeString(DataTypeToEnum<T>::value),
-                                " but SparseTensor.values.dtype() == ",
-                                DataTypeString(output_values.dtype())));
-
-    int64 num_entries = output_indices.dim_size(0);
-    OP_REQUIRES(context, num_entries == output_values.dim_size(0),
+    OP_REQUIRES(context, serialized_sparse.shape().dim_size(ndims - 1) == 3,
                 errors::InvalidArgument(
-                    "Expected row counts of SparseTensor.indices and "
-                    "SparseTensor.values to match but they do not: ",
-                    num_entries, " vs. ", output_values.dim_size(0)));
-    int rank = output_indices.dim_size(1);
-    OP_REQUIRES(context, rank == output_shape.dim_size(0),
-                errors::InvalidArgument(
-                    "Expected column counts of SparseTensor.indices to match "
-                    "size of SparseTensor.shape but they do not: ",
-                    rank, " vs. ", output_shape.dim_size(0)));
-
-    context->set_output(0, output_indices);
-    context->set_output(1, output_values);
-    context->set_output(2, output_shape);
-  }
-};
-
-#define REGISTER_KERNELS(type)                                \
-  REGISTER_KERNEL_BUILDER(Name("DeserializeSparse")           \
-                              .Device(DEVICE_CPU)             \
-                              .TypeConstraint<type>("dtype"), \
-                          DeserializeSparseOp<type>)
-
-TF_CALL_ALL_TYPES(REGISTER_KERNELS);
-#undef REGISTER_KERNELS
-
-template <typename T>
-class DeserializeManySparseOp : public OpKernel {
- public:
-  explicit DeserializeManySparseOp(OpKernelConstruction* context)
-      : OpKernel(context) {}
-
-  void Compute(OpKernelContext* context) override {
-    const Tensor& serialized_sparse = context->input(0);
-    OP_REQUIRES(context, TensorShapeUtils::IsMatrix(serialized_sparse.shape()),
-                errors::InvalidArgument(
-                    "Serialized sparse should be a matrix but received shape ",
+                    "Serialized sparse should have 3 as the last dimension ",
                     serialized_sparse.shape().DebugString()));
-    OP_REQUIRES(
-        context, serialized_sparse.shape().dim_size(1) == 3,
-        errors::InvalidArgument(
-            "Serialize sparse should have 3 columns but received shape ",
-            serialized_sparse.shape().DebugString()));
 
-    int num_sparse_tensors = serialized_sparse.shape().dim_size(0);
+    int num_sparse_tensors = 1;
+    for (int i = 0; i < ndims - 1; ++i) {
+      num_sparse_tensors *= serialized_sparse.shape().dim_size(i);
+    }
 
     OP_REQUIRES(
         context, num_sparse_tensors > 0,
-        errors::InvalidArgument("Must have at least 1 serialized SparseTensor, "
-                                "but input matrix has 0 rows"));
+        errors::InvalidArgument(
+            "Serialized sparse should have at least 1 serialized tensor, "
+            "but has a zero dimension ",
+            serialized_sparse.shape().DebugString()));
 
-    std::vector<Tensor> indices_to_concat;
-    std::vector<Tensor> values_to_concat;
-    std::vector<TensorShape> shapes_to_concat;
+    if (num_sparse_tensors == 0 && serialized_sparse.shape().dims() == 1) {
+      // Special case with a single sparse tensor. We can avoid data
+      // motion in the Concat and Reshape.
+      const auto& serialized_sparse_t = serialized_sparse.vec<T>();
+
+      Tensor output_indices;
+      Tensor output_values;
+      Tensor output_shape;
+      OP_REQUIRES_OK(context,
+                     this->GetAndValidateSparseTensor(
+                         serialized_sparse_t(0), serialized_sparse_t(1),
+                         serialized_sparse_t(2), dtype_, 0 /* index */,
+                         &output_indices, &output_values, &output_shape));
+      context->set_output(0, output_indices);
+      context->set_output(1, output_values);
+      context->set_output(2, output_shape);
+      return;
+    }
 
-    const auto& serialized_sparse_t = serialized_sparse.matrix<string>();
+    std::vector<Tensor> indices;
+    std::vector<Tensor> values;
+    TensorShape shape;
+    indices.reserve(num_sparse_tensors);
+    values.reserve(num_sparse_tensors);
 
+    const auto& serialized_sparse_t = serialized_sparse.flat_inner_dims<T, 2>();
     for (int i = 0; i < num_sparse_tensors; ++i) {
-      Tensor output_indices(DT_INT64);
-      Tensor output_values(DataTypeToEnum<T>::value);
-      Tensor output_shape(DT_INT64);
-      TensorProto proto_indices;
-      TensorProto proto_values;
-      TensorProto proto_shape;
-
-      OP_REQUIRES(
-          context,
-          ParseProtoUnlimited(&proto_indices, serialized_sparse_t(i, 0)),
-          errors::InvalidArgument("Could not parse serialized_sparse[", i,
-                                  ", 0]"));
-      OP_REQUIRES(context,
-                  ParseProtoUnlimited(&proto_values, serialized_sparse_t(i, 1)),
-                  errors::InvalidArgument("Could not parse serialized_sparse[",
-                                          i, ", 1]"));
-      OP_REQUIRES(context,
-                  ParseProtoUnlimited(&proto_shape, serialized_sparse_t(i, 2)),
-                  errors::InvalidArgument("Could not parse serialized_sparse[",
-                                          i, ", 2]"));
-
-      OP_REQUIRES(context, output_indices.FromProto(proto_indices),
-                  errors::InvalidArgument(
-                      "Could not construct Tensor serialized_sparse[", i,
-                      ", 0] (indices)"));
-      OP_REQUIRES(context, TensorShapeUtils::IsMatrix(output_indices.shape()),
-                  errors::InvalidArgument(
-                      "Expected serialized_sparse[", i,
-                      ", 0] to represent an index matrix but received shape ",
-                      output_indices.shape().DebugString()));
-      OP_REQUIRES(context, output_values.FromProto(proto_values),
-                  errors::InvalidArgument(
-                      "Could not construct Tensor serialized_sparse[", i,
-                      ", 1] (values)"));
-      OP_REQUIRES(context, TensorShapeUtils::IsVector(output_values.shape()),
-                  errors::InvalidArgument(
-                      "Expected serialized_sparse[", i,
-                      ", 1] to represent a values vector but received shape ",
-                      output_values.shape().DebugString()));
-      OP_REQUIRES(context, output_shape.FromProto(proto_shape),
-                  errors::InvalidArgument(
-                      "Could not construct Tensor serialized_sparse[", i,
-                      ", 2] (shape)"));
-      OP_REQUIRES(
-          context, TensorShapeUtils::IsVector(output_shape.shape()),
-          errors::InvalidArgument("Expected serialized_sparse[", i,
-                                  ", 1] to be a shape vector but its shape is ",
-                                  output_shape.shape().DebugString()));
-
-      OP_REQUIRES(
-          context, DataTypeToEnum<T>::value == output_values.dtype(),
-          errors::InvalidArgument(
-              "Requested SparseTensor of type ",
-              DataTypeString(DataTypeToEnum<T>::value), " but SparseTensor[", i,
-              "].values.dtype() == ", DataTypeString(output_values.dtype())));
-
+      Tensor output_indices;
+      Tensor output_values;
+      Tensor output_shape;
+      OP_REQUIRES_OK(context,
+                     this->GetAndValidateSparseTensor(
+                         serialized_sparse_t(i, 0), serialized_sparse_t(i, 1),
+                         serialized_sparse_t(i, 2), dtype_, i, &output_indices,
+                         &output_values, &output_shape));
       int64 num_entries = output_indices.dim_size(0);
-      OP_REQUIRES(context, num_entries == output_values.dim_size(0),
-                  errors::InvalidArgument(
-                      "Expected row counts of SparseTensor[", i,
-                      "].indices and SparseTensor[", i,
-                      "].values to match but they do not: ", num_entries,
-                      " vs. ", output_values.dim_size(0)));
       int rank = output_indices.dim_size(1);
-      OP_REQUIRES(
-          context, rank == output_shape.dim_size(0),
-          errors::InvalidArgument("Expected column counts of SparseTensor[", i,
-                                  "].indices to match size of SparseTensor[", i,
-                                  "].shape "
-                                  "but they do not: ",
-                                  rank, " vs. ", output_shape.dim_size(0)));
 
       // Now we expand each SparseTensors' indices and shape by
       // prefixing a dimension
-      Tensor expanded_indices(
-          DT_INT64, TensorShape({num_entries, 1 + output_indices.dim_size(1)}));
-      Tensor expanded_shape(DT_INT64,
-                            TensorShape({1 + output_shape.dim_size(0)}));
+      Tensor expanded_indices(DT_INT64, TensorShape({num_entries, 1 + rank}));
       const auto& output_indices_t = output_indices.matrix<int64>();
-      const auto& output_shape_t = output_shape.vec<int64>();
       auto expanded_indices_t = expanded_indices.matrix<int64>();
-      auto expanded_shape_t = expanded_shape.vec<int64>();
       expanded_indices_t.chip<1>(0).setZero();
       Eigen::DSizes<Eigen::DenseIndex, 2> indices_start(0, 1);
       Eigen::DSizes<Eigen::DenseIndex, 2> indices_sizes(num_entries, rank);
       expanded_indices_t.slice(indices_start, indices_sizes) = output_indices_t;
+
+      Tensor expanded_shape(DT_INT64, TensorShape({1 + rank}));
+      const auto& output_shape_t = output_shape.vec<int64>();
+      auto expanded_shape_t = expanded_shape.vec<int64>();
       expanded_shape_t(0) = 1;
       std::copy_n(&output_shape_t(0), rank, &expanded_shape_t(1));
 
       TensorShape expanded_tensor_shape(expanded_shape.vec<int64>());
 
-      indices_to_concat.push_back(expanded_indices);
-      values_to_concat.push_back(output_values);
-      shapes_to_concat.push_back(expanded_tensor_shape);
-    }
-
-    int rank = -1;
-    for (int i = 0; i < num_sparse_tensors; ++i) {
-      if (rank < 0) rank = shapes_to_concat[i].dims();
-      OP_REQUIRES(context, rank == shapes_to_concat[i].dims(),
-                  errors::InvalidArgument(
-                      "Inconsistent rank across SparseTensors: rank prior to "
-                      "SparseTensor[",
-                      i, "] was: ", rank, " but rank of SparseTensor[", i,
-                      "] is: ", shapes_to_concat[i].dims()));
-    }
-
-    // SparseTensor::Concat requires consistent shape for all but the
-    // primary order dimension (dimension 0 in this case).  So we get
-    // the maximum value across all the input SparseTensors for each
-    // dimension and use that.
-    TensorShape preconcat_shape(shapes_to_concat[0]);
-    for (int i = 0; i < num_sparse_tensors; ++i) {
-      for (int d = 0; d < rank; ++d) {
-        preconcat_shape.set_dim(d, std::max(preconcat_shape.dim_size(d),
-                                            shapes_to_concat[i].dim_size(d)));
+      indices.push_back(expanded_indices);
+      values.push_back(output_values);
+      if (i == 0) {
+        shape = expanded_tensor_shape;
+      } else {
+        OP_REQUIRES(
+            context, shape.dims() == expanded_tensor_shape.dims(),
+            errors::InvalidArgument(
+                "Inconsistent shape across SparseTensors: rank prior to "
+                "SparseTensor[",
+                i, "] was: ", shape.dims() - 1, " but rank of SparseTensor[", i,
+                "] is: ", expanded_tensor_shape.dims() - 1));
+        for (int j = 1; j < shape.dims(); ++j) {
+          // NOTE(mrry): For compatibility with the implementations of
+          // DeserializeManySparse, and many ops that generate
+          // SparseTensors to batch that do not have a fixed
+          // dense_shape (e.g. `tf.parse_single_example()`), we
+          // compute the maximum in each dimension to find the
+          // smallest dense_shape that bounds all of the input
+          // SparseTensors.
+          shape.set_dim(j, std::max(shape.dim_size(j),
+                                    expanded_tensor_shape.dim_size(j)));
+        }
       }
     }
 
     // Dimension 0 is the primary dimension.
+    int rank = shape.dims();
     gtl::InlinedVector<int64, 8> std_order(rank);
     std::iota(std_order.begin(), std_order.end(), 0);
 
-    std::vector<SparseTensor> tensors_to_concat;
-    tensors_to_concat.reserve(num_sparse_tensors);
+    std::vector<SparseTensor> tensors;
+    tensors.reserve(num_sparse_tensors);
     for (int i = 0; i < num_sparse_tensors; ++i) {
-      tensors_to_concat.emplace_back(indices_to_concat[i], values_to_concat[i],
-                                     preconcat_shape, std_order);
+      tensors.emplace_back(indices[i], values[i], shape, std_order);
     }
 
-    SparseTensor output = SparseTensor::Concat<T>(tensors_to_concat);
+    gtl::optional<SparseTensor> maybe_output;
+#define HANDLE_TYPE(T)                               \
+  case DataTypeToEnum<T>::value: {                   \
+    maybe_output = SparseTensor::Concat<T>(tensors); \
+    break;                                           \
+  }
 
-    Tensor final_output_shape(DT_INT64, TensorShape({output.dims()}));
+    switch (dtype_) {
+      TF_CALL_ALL_TYPES(HANDLE_TYPE);
+      TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE);
+      TF_CALL_variant(HANDLE_TYPE);
+#undef HANDLE_TYPE
+      default:
+        OP_REQUIRES(context, false,
+                    errors::Unimplemented(
+                        "DeserializeSparse Unhandled data type: ", dtype_));
+    }
+    DCHECK(maybe_output);
+    SparseTensor& output = maybe_output.value();
 
+    // Compute the input shape for the reshape operation.
+    Tensor input_shape(DT_INT64, TensorShape({output.dims()}));
     std::copy_n(output.shape().data(), output.dims(),
-                final_output_shape.vec<int64>().data());
+                input_shape.vec<int64>().data());
 
-    context->set_output(0, output.indices());
+    // Compute the target shape for the reshape operation.
+    Tensor target_shape(DT_INT64, TensorShape({ndims + output.dims() - 2}));
+    for (int i = 0; i < ndims - 1; ++i) {
+      target_shape.vec<int64>()(i) = serialized_sparse.shape().dim_size(i);
+    }
+    for (int i = 0; i < output.dims() - 1; ++i) {
+      target_shape.vec<int64>()(i + ndims - 1) = output.shape().data()[i + 1];
+    }
+
+    Tensor output_indices;
+    Tensor output_shape;
+    Reshape(context, output.indices(), input_shape, target_shape,
+            0 /* output indices index */, 2 /* output shape index */);
     context->set_output(1, output.values());
-    context->set_output(2, final_output_shape);
   }
+
+ protected:
+  Status Deserialize(const T& serialized, Tensor* result);
+
+  Status GetAndValidateSparseTensor(
+      const T& serialized_indices, const T& serialized_values,
+      const T& serialized_shape, DataType values_dtype, int index,
+      Tensor* output_indices, Tensor* output_values, Tensor* output_shape) {
+    // Deserialize and validate the indices.
+    TF_RETURN_IF_ERROR(this->Deserialize(serialized_indices, output_indices));
+    if (!TensorShapeUtils::IsMatrix(output_indices->shape())) {
+      return errors::InvalidArgument(
+          "Expected serialized_sparse[", index,
+          ", 0] to represent an index matrix but received shape ",
+          output_indices->shape().DebugString());
+    }
+    int64 num_entries = output_indices->dim_size(0);
+    int rank = output_indices->dim_size(1);
+
+    // Deserialize and validate the values.
+    TF_RETURN_IF_ERROR(this->Deserialize(serialized_values, output_values));
+    if (!TensorShapeUtils::IsVector(output_values->shape())) {
+      return errors::InvalidArgument(
+          "Expected serialized_sparse[", index,
+          ", 1] to represent a values vector but received shape ",
+          output_values->shape().DebugString());
+    }
+    if (values_dtype != output_values->dtype()) {
+      return errors::InvalidArgument(
+          "Requested SparseTensor of type ", DataTypeString(values_dtype),
+          " but SparseTensor[", index,
+          "].values.dtype() == ", DataTypeString(output_values->dtype()));
+    }
+    if (num_entries != output_values->dim_size(0)) {
+      return errors::InvalidArgument(
+          "Expected row counts of SparseTensor[", index,
+          "].indices and SparseTensor[", index,
+          "].values to match but they do not: ", num_entries, " vs. ",
+          output_values->dim_size(0));
+    }
+
+    // Deserialize and validate the shape.
+    TF_RETURN_IF_ERROR(this->Deserialize(serialized_shape, output_shape));
+    if (!TensorShapeUtils::IsVector(output_shape->shape())) {
+      return errors::InvalidArgument(
+          "Expected serialized_sparse[", index,
+          ", 1] to be a shape vector but its shape is ",
+          output_shape->shape().DebugString());
+    }
+    if (rank != output_shape->dim_size(0)) {
+      return errors::InvalidArgument("Expected column counts of SparseTensor[",
+                                     index,
+                                     "].indices to match size of SparseTensor[",
+                                     index, "].shape but they do not: ", rank,
+                                     " vs. ", output_shape->dim_size(0));
+    }
+    return Status::OK();
+  }
+
+  DataType dtype_;
 };
 
-#define REGISTER_KERNELS(type)                                \
-  REGISTER_KERNEL_BUILDER(Name("DeserializeManySparse")       \
-                              .Device(DEVICE_CPU)             \
-                              .TypeConstraint<type>("dtype"), \
-                          DeserializeManySparseOp<type>)
+template <>
+Status DeserializeSparseOp<string>::Deserialize(const string& serialized,
+                                                Tensor* result) {
+  TensorProto proto;
+  if (!ParseProtoUnlimited(&proto, serialized)) {
+    return errors::InvalidArgument("Could not parse serialized proto");
+  }
+  Tensor tensor;
+  if (!tensor.FromProto(proto)) {
+    return errors::InvalidArgument("Could not construct tensor from proto");
+  }
+  *result = tensor;
+  return Status::OK();
+}
+
+REGISTER_KERNEL_BUILDER(Name("DeserializeSparse")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<string>("Tserialized"),
+                        DeserializeSparseOp<string>)
+
+REGISTER_KERNEL_BUILDER(Name("DeserializeManySparse").Device(DEVICE_CPU),
+                        DeserializeSparseOp<string>)
+
+template <>
+Status DeserializeSparseOp<Variant>::Deserialize(const Variant& serialized,
+                                                 Tensor* result) {
+  *result = *serialized.get<Tensor>();
+  return Status::OK();
+}
+
+REGISTER_KERNEL_BUILDER(Name("DeserializeSparse")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<Variant>("Tserialized"),
+                        DeserializeSparseOp<Variant>)
 
-TF_CALL_ALL_TYPES(REGISTER_KERNELS);
-#undef REGISTER_KERNELS
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/slice_op.cc b/tensorflow/core/kernels/slice_op.cc
index 28a379774be5222bb15865c3642d9467659c3d1e..a9e31cc3363a7a4dc1b6178f4674db010528f93a 100644
--- a/tensorflow/core/kernels/slice_op.cc
+++ b/tensorflow/core/kernels/slice_op.cc
@@ -190,25 +190,41 @@ class SliceOp : public OpKernel {
         }
         return;
       }
-#define HANDLE_DIM(NDIM)                                              \
-  if (input_dims == NDIM) {                                           \
-    functor::Slice<Device, T, NDIM>()(                                \
-        context->eigen_device<Device>(), result, input, begin, size); \
-    return;                                                           \
+#define HANDLE_DIM(NDIM)                            \
+  if (input_dims == NDIM) {                         \
+    HandleCase<NDIM>(context, begin, size, result); \
+    return;                                         \
   }
+
       HANDLE_DIM(1);
       HANDLE_DIM(2);
       HANDLE_DIM(3);
       HANDLE_DIM(4);
       HANDLE_DIM(5);
       HANDLE_DIM(6);
+      HANDLE_DIM(7);
 
 #undef HANDLE_DIM
 
-      // handle cases which dim >= 7
-      functor::Slice<Device, T, 7>()(
-          context->eigen_device<Device>(), result, input, begin, size);
+      OP_REQUIRES(context, false, errors::Unimplemented(
+                                      "SliceOp : Unhandled input dimensions"));
+    }
+  }
+
+ private:
+  template <int NDIM>
+  void HandleCase(OpKernelContext* context, const gtl::ArraySlice<int64>& begin,
+                  const gtl::ArraySlice<int64>& size, Tensor* result) {
+    Eigen::DSizes<Eigen::DenseIndex, NDIM> indices;
+    Eigen::DSizes<Eigen::DenseIndex, NDIM> sizes;
+    for (int i = 0; i < NDIM; ++i) {
+      indices[i] = begin[i];
+      sizes[i] = size[i];
     }
+
+    functor::Slice<Device, T, NDIM>()(
+        context->eigen_device<Device>(), result->tensor<T, NDIM>(),
+        context->input(0).tensor<T, NDIM>(), indices, sizes);
   }
 };
 
@@ -248,16 +264,11 @@ class MklSliceOp : public OpKernel {
         }
         return;
       }
-      // Special case for handling 4-D tensor slice.
-      if (input_dims == 4) {
-        HandleCase4D(context, begin, size, result);
-      } else {
-#define HANDLE_DIM(NDIM)                                                  \
-      if (input_dims == NDIM) {                                           \
-        functor::Slice<Device, T, NDIM>()(                                \
-            context->eigen_device<Device>(), result, input, begin, size); \
-            return;                                                       \
-      }
+#define HANDLE_DIM(NDIM)                            \
+  if (input_dims == NDIM) {                         \
+    HandleCase<NDIM>(context, begin, size, result); \
+    return;                                         \
+  }
 
       HANDLE_DIM(1);
       HANDLE_DIM(2);
@@ -265,13 +276,12 @@ class MklSliceOp : public OpKernel {
       HANDLE_DIM(4);
       HANDLE_DIM(5);
       HANDLE_DIM(6);
+      HANDLE_DIM(7);
 
 #undef HANDLE_DIM
 
-        // handle cases which dim >= 7
-        functor::Slice<Device, T, 7>()(
-          context->eigen_device<Device>(), result, input, begin, size);
-      }
+      OP_REQUIRES(context, false, errors::Unimplemented(
+                                      "SliceOp : Unhandled input dimensions"));
     }
   }
 
@@ -318,7 +328,8 @@ class MklSliceOp : public OpKernel {
     return false;
   }
 
-  void HandleCase4D(OpKernelContext* context,
+  template <int NDIM>
+  void HandleCase(OpKernelContext* context,
                   const gtl::ArraySlice<int64>& begin,
                   const gtl::ArraySlice<int64>& size, Tensor* result) {
     int slice_dim = -1;
@@ -327,7 +338,8 @@ class MklSliceOp : public OpKernel {
     // differs from the input tensor in only 1 out of 4 dimensions.
     // This case arises in the context of Slice of 4-D tensor in NHWC or NCHW
     // format over channel dimension.
-    if (DoesSliceShapeDifferInOnly1D(in_shape, begin, size, &slice_dim)) {
+    if (NDIM == 4 &&
+        DoesSliceShapeDifferInOnly1D(in_shape, begin, size, &slice_dim)) {
         size_t in_strides[4] = { (size_t) in_shape.dim_size(1) *
                                           in_shape.dim_size(2) *
                                           in_shape.dim_size(3),
@@ -391,8 +403,16 @@ class MklSliceOp : public OpKernel {
         // slice_dim is not 1 or 3, then we fallback to Eigen implementation.
     }
 
-    functor::Slice<Device, T, 4>()(
-        context->eigen_device<Device>(), result, context->input(0), begin, size);
+    Eigen::DSizes<Eigen::DenseIndex, NDIM> indices;
+    Eigen::DSizes<Eigen::DenseIndex, NDIM> sizes;
+    for (int i = 0; i < NDIM; ++i) {
+      indices[i] = begin[i];
+      sizes[i] = size[i];
+    }
+
+    functor::Slice<Device, T, NDIM>()(
+        context->eigen_device<Device>(), result->tensor<T, NDIM>(),
+        context->input(0).tensor<T, NDIM>(), indices, sizes);
   }
 };
 #endif
@@ -400,13 +420,13 @@ class MklSliceOp : public OpKernel {
 // Forward declarations of the functor specializations for declared in the
 // sharded source files.
 namespace functor {
-#define DECLARE_CPU_SPEC(T, NDIM)                        \
-  template <>                                            \
-  void Slice<CPUDevice, T, NDIM>::operator()(            \
-      const CPUDevice& d, Tensor* output,                \
-      const Tensor& input,                               \
-      const gtl::ArraySlice<int64>& slice_indices,       \
-      const gtl::ArraySlice<int64>& slice_sizes);        \
+#define DECLARE_CPU_SPEC(T, NDIM)                                  \
+  template <>                                                      \
+  void Slice<CPUDevice, T, NDIM>::operator()(                      \
+      const CPUDevice& d, typename TTypes<T, NDIM>::Tensor output, \
+      typename TTypes<T, NDIM>::ConstTensor input,                 \
+      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& indices,       \
+      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& sizes);        \
   extern template struct Slice<CPUDevice, T, NDIM>;
 
 #define DECLARE_FOR_N(T)  \
@@ -419,7 +439,7 @@ namespace functor {
   DECLARE_CPU_SPEC(T, 7);
 
 TF_CALL_ALL_TYPES(DECLARE_FOR_N);
-DECLARE_FOR_N(bfloat16);
+TF_CALL_bfloat16(DECLARE_FOR_N);
 
 #undef DECLARE_FOR_N
 #undef DECLARE_CPU_SPEC
@@ -436,7 +456,7 @@ DECLARE_FOR_N(bfloat16);
 
 TF_CALL_POD_STRING_TYPES(REGISTER_SLICE);
 TF_CALL_QUANTIZED_TYPES(REGISTER_SLICE);
-REGISTER_SLICE(bfloat16);
+TF_CALL_bfloat16(REGISTER_SLICE);
 #undef REGISTER_SLICE
 #else
 #define REGISTER_SLICE(type)                             \
@@ -449,21 +469,20 @@ REGISTER_SLICE(bfloat16);
 
 TF_CALL_POD_STRING_TYPES(REGISTER_SLICE);
 TF_CALL_QUANTIZED_TYPES(REGISTER_SLICE);
-REGISTER_SLICE(bfloat16);
+TF_CALL_bfloat16(REGISTER_SLICE);
 #undef REGISTER_SLICE
 #endif  // INTEL_MKL
 
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
-#define DECLARE_GPU_SPEC(T, NDIM)                        \
-  template <>                                            \
-  void Slice<GPUDevice, T, NDIM>::operator()(            \
-      const GPUDevice& d,                                \
-      Tensor* output,                                    \
-      const Tensor& input,                               \
-      const gtl::ArraySlice<int64>& slice_indices,       \
-      const gtl::ArraySlice<int64>& slice_sizes);        \
+#define DECLARE_GPU_SPEC(T, NDIM)                                  \
+  template <>                                                      \
+  void Slice<GPUDevice, T, NDIM>::operator()(                      \
+      const GPUDevice& d, typename TTypes<T, NDIM>::Tensor output, \
+      typename TTypes<T, NDIM>::ConstTensor input,                 \
+      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& indices,       \
+      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& sizes);        \
   extern template struct Slice<GPUDevice, T, NDIM>;
 
 #define DECLARE_FOR_N(T)  \
@@ -478,6 +497,7 @@ namespace functor {
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_N);
 TF_CALL_complex64(DECLARE_FOR_N);
 TF_CALL_complex128(DECLARE_FOR_N);
+TF_CALL_bfloat16(DECLARE_FOR_N);
 DECLARE_FOR_N(int32);
 
 #undef DECLARE_FOR_N
@@ -496,6 +516,7 @@ DECLARE_FOR_N(int32);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
 TF_CALL_complex64(REGISTER_GPU);
 TF_CALL_complex128(REGISTER_GPU);
+TF_CALL_bfloat16(REGISTER_GPU);
 
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
@@ -517,14 +538,13 @@ REGISTER_KERNEL_BUILDER(Name("Slice")
 #ifdef TENSORFLOW_USE_SYCL
 // Forward declarations of the functor specializations for SYCL.
 namespace functor {
-#define DECLARE_SYCL_SPEC(T, NDIM)                       \
-  template <>                                            \
-  void Slice<SYCLDevice, T, NDIM>::operator()(           \
-      const SYCLDevice& d,                               \
-      Tensor* output,                                    \
-      const Tensor& input,                               \
-      const gtl::ArraySlice<int64>& slice_indices,       \
-      const gtl::ArraySlice<int64>& slice_sizes);        \
+#define DECLARE_SYCL_SPEC(T, NDIM)                                 \
+  template <>                                                      \
+  void Slice<SYCLDevice, T, NDIM>::operator()(                     \
+      const SYCLDevice& d, typename TTypes<T, NDIM>::Tensor output,\
+      typename TTypes<T, NDIM>::ConstTensor input,                 \
+      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& indices,       \
+      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& sizes);        \
   extern template struct Slice<SYCLDevice, T, NDIM>;
 
 #define DECLARE_FOR_N(T)   \
diff --git a/tensorflow/core/kernels/slice_op.h b/tensorflow/core/kernels/slice_op.h
index 5fd6ce4067a60c4a3446abc98bf58d6c12a75124..0362a021336f633b88a666c68f42fa5082f4f66d 100644
--- a/tensorflow/core/kernels/slice_op.h
+++ b/tensorflow/core/kernels/slice_op.h
@@ -19,104 +19,32 @@ limitations under the License.
 // Functor definition for SliceOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/ops_util.h"
 
 namespace tensorflow {
+namespace functor {
 
-namespace internal {
-
-template <typename Device, typename T>
-void SliceSimple(const Device& d, Tensor* out, const Tensor& in,
-                 const gtl::ArraySlice<int64>& slice_indices);
-template <typename Device, typename T>
-void SliceSimpleGpu(const Device& d, Tensor* out, const Tensor& in,
-                 const gtl::ArraySlice<int64>& slice_indices);
-
-template <typename Device, typename T>
-void SliceSimple(const Device& d, Tensor* out, const Tensor& in,
-                 const gtl::ArraySlice<int64>& slice_indices) {
-  const int ndims = in.dims();
-  const int64 nelem = out->NumElements();
-  const gtl::InlinedVector<int64, 8> in_strides = ComputeStride<int64>(in.shape());
-  const gtl::InlinedVector<int64, 8> out_strides = ComputeStride<int64>(out->shape());
-  const T* p = in.flat<T>().data();
-  T* q = out->flat<T>().data();
-
-  std::vector<int64> i_idx(nelem, 0);
-  std::vector<int64> t(nelem, 0);
-
-  for (int64 o_idx = 0; o_idx < nelem; ++o_idx) {
-    t[o_idx] = o_idx;
-  }
-  for (int i = 0; i < ndims; ++i) {
-    int64 n = (nelem + 7) / 8;
-    int64 o_idx = 0;
-    switch (nelem % 8) {
-#define CALC_INPUT_IDX                                                            \
-  i_idx[o_idx] += (t[o_idx] / out_strides[i] + slice_indices[i]) * in_strides[i]; \
-  t[o_idx] %= out_strides[i];                                                     \
-  ++o_idx;
-      case 0: do { CALC_INPUT_IDX;
-      case 7:      CALC_INPUT_IDX;
-      case 6:      CALC_INPUT_IDX;
-      case 5:      CALC_INPUT_IDX;
-      case 4:      CALC_INPUT_IDX;
-      case 3:      CALC_INPUT_IDX;
-      case 2:      CALC_INPUT_IDX;
-      case 1:      CALC_INPUT_IDX;
-#undef CALC_INPUT_IDX
-              } while (--n > 0);
-    }
-  }
-  for (int64 o_idx = 0; o_idx < nelem; ++o_idx) {
-    q[o_idx] = p[i_idx[o_idx]];
-  }
-}
 
 template <typename Device, typename T, int NDIMS>
-void SliceUsingEigen(const Device& d, Tensor* out, const Tensor& in,
-                 const gtl::ArraySlice<int64>& slice_indices,
-                 const gtl::ArraySlice<int64>& slice_sizes) {
-  auto input = in.tensor<T, NDIMS>();
-  auto output = out->tensor<T, NDIMS>();
-  Eigen::DSizes<int, NDIMS> indices;
-  for (int i = 0; i < NDIMS; ++i) {
-    indices[i] = slice_indices[i];
-  }
-  Eigen::DSizes<int, NDIMS> sizes;
-  for (int i = 0; i < NDIMS; ++i) {
-    sizes[i] = slice_sizes[i];
-  }
-  const bool use_64bit = input.size() > Eigen::NumTraits<int>::highest();
-  if (!use_64bit &&
-      Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
-    To32Bit(output).device(d) = To32Bit(input).slice(indices, sizes);
-  } else {
-    output.device(d) = input.slice(indices, sizes);
-  }
-}
-
-} // namespace internal
-
-namespace functor {
-
-// Template parameter NDIM is not neccesary here. The aim of keeping it
-// is to compile struct slice separately which minimizes the compiling time.
-template <typename Device, typename T, int NDIM>
 struct Slice {
-  void operator()(const Device& d, Tensor* out, const Tensor& in,
-                  const gtl::ArraySlice<int64>& slice_indices,
-                  const gtl::ArraySlice<int64>& slice_sizes) {
-    if (in.dims() == NDIM) {
-        internal::SliceUsingEigen<Device, T, NDIM>(d, out, in, slice_indices, slice_sizes);
+  void operator()(const Device& d, typename TTypes<T, NDIMS>::Tensor output,
+                  typename TTypes<T, NDIMS>::ConstTensor input,
+                  const Eigen::DSizes<Eigen::DenseIndex, NDIMS>& slice_indices,
+                  const Eigen::DSizes<Eigen::DenseIndex, NDIMS>& slice_sizes) {
+    bool use_64bit = (input.size() > Eigen::NumTraits<int>::highest());
+    if (!use_64bit &&
+        Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
+      Eigen::DSizes<int, NDIMS> indices;
+      for (int i = 0; i < NDIMS; ++i) {
+        indices[i] = slice_indices[i];
+      }
+      Eigen::DSizes<int, NDIMS> sizes;
+      for (int i = 0; i < NDIMS; ++i) {
+        sizes[i] = slice_sizes[i];
+      }
+      To32Bit(output).device(d) = To32Bit(input).slice(indices, sizes);
     } else {
-        if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
-          internal::SliceSimpleGpu<Device, T>(d, out, in, slice_indices);
-        } else {
-          internal::SliceSimple<Device, T>(d, out, in, slice_indices);
-        }
+      output.device(d) = input.slice(slice_indices, slice_sizes);
     }
   }
 };
diff --git a/tensorflow/core/kernels/slice_op_gpu.cu.cc b/tensorflow/core/kernels/slice_op_gpu.cu.cc
index 3039b3d777f543e2f24c8ce9e138aa8ebd843090..9d51f8978c0a24afb2f98845a4de4e8b51a29aeb 100644
--- a/tensorflow/core/kernels/slice_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/slice_op_gpu.cu.cc
@@ -21,65 +21,9 @@ limitations under the License.
 
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/cuda_kernel_helper.h"
 
 namespace tensorflow {
-namespace internal {
-
-template <typename T>
-__global__ void SliceKernel(int nthreads, const T* src, const int32* buf,
-                            const int32 ndims, T* dst) {
-  const int32* in_strides = buf;
-  const int32* out_strides = buf + ndims;
-  const int32* slice_indices = buf + ndims * 2;
-  CUDA_1D_KERNEL_LOOP(o_idx, nthreads) {
-    int32 i_idx = 0;
-    int32 t = o_idx;
-    for (int i = 0; i < ndims; ++i) {
-      i_idx += (t / out_strides[i] + slice_indices[i]) * in_strides[i];
-      t %= out_strides[i];
-    }
-    dst[o_idx] = ldg(src + i_idx);
-  }
-}
-
-template <typename Device, typename T>
-void SliceSimpleGpu(const Device& d, Tensor* out, const Tensor& in,
-                 const gtl::ArraySlice<int64>& slice_indices) {
-  // Ensures we can use 32-bit index.
-  const int64 in_nelem = in.NumElements();
-  CHECK_LT(in_nelem, kint32max) << "Tensor too large to transpose on GPU";
-  const int64 out_nelem = out->NumElements();
-  CHECK_LT(out_nelem, kint32max) << "Tensor too large to transpose on GPU";
-  // Pack strides and slice indices sizes into one buffer.
-  const int32 ndims = in.dims();
-  gtl::InlinedVector<int32, 24> host_buf(ndims * 3);
-  gtl::InlinedVector<int32, 8> in_strides = ComputeStride<int32>(in.shape());
-  gtl::InlinedVector<int32, 8> out_strides = ComputeStride<int32>(out->shape());
-  for (int i = 0; i < ndims; ++i) {
-    host_buf[i] = in_strides[i];
-    host_buf[ndims + i] = out_strides[i];
-    host_buf[ndims * 2 + i] = slice_indices[i];
-  }
-  auto num_bytes = sizeof(int64) * host_buf.size();
-  auto dev_buf = d.allocate(num_bytes);
-  // NOTE: host_buf is not allocated by CudaHostAllocator, and
-  // therefore we are doing a sync copy effectively.
-  d.memcpyHostToDevice(dev_buf, host_buf.data(), num_bytes);
-  // Launch kernel to q[...] = p[...].
-  const T* p = in.flat<T>().data();
-  T* q = out->flat<T>().data();
-  CudaLaunchConfig cfg = GetCudaLaunchConfig(out_nelem, d);
-  SliceKernel<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
-      cfg.virtual_thread_count, p, reinterpret_cast<const int32*>(dev_buf),
-      ndims, q);
-  // Safe to deallocate immediately after the kernel launch.
-  d.deallocate(dev_buf);
-}
-
-} // namespace internal
 
 typedef Eigen::GpuDevice GPUDevice;
 
@@ -95,6 +39,7 @@ typedef Eigen::GpuDevice GPUDevice;
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
 TF_CALL_complex64(DEFINE_GPU_KERNELS);
 TF_CALL_complex128(DEFINE_GPU_KERNELS);
+TF_CALL_bfloat16(DEFINE_GPU_KERNELS);
 DEFINE_GPU_KERNELS(int32);
 
 #undef DEFINE_GPU_KERNELS
diff --git a/tensorflow/core/kernels/snapshot_op.cc b/tensorflow/core/kernels/snapshot_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..50157d5d48f93bfe61cbac95246123ef0a7d446e
--- /dev/null
+++ b/tensorflow/core/kernels/snapshot_op.cc
@@ -0,0 +1,46 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/array_ops.cc.
+#include "tensorflow/core/kernels/snapshot_op.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+
+namespace tensorflow {
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+#define REGISTER_KERNEL(TYPE)                                        \
+  REGISTER_KERNEL_BUILDER(                                           \
+      Name("Snapshot").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
+      SnapshotOp<CPUDevice, TYPE>);
+
+TF_CALL_POD_TYPES(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+
+#if TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SyclDevice;
+#define REGISTER_SYCL_KERNEL(TYPE)                                    \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("Snapshot").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
+      SnapshotOp<SyclDevice, TYPE>);
+
+TF_CALL_POD_TYPES(REGISTER_SYCL_KERNEL);
+
+#undef REGISTER_SYCL_KERNEL
+#endif  // TENSORFLOW_USE_SYCL
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/snapshot_op.h b/tensorflow/core/kernels/snapshot_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..2c79893b49661519515a7b4a537ff3caeceba2be
--- /dev/null
+++ b/tensorflow/core/kernels/snapshot_op.h
@@ -0,0 +1,49 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_SNAPSHOT_OP_H_
+#define TENSORFLOW_KERNELS_SNAPSHOT_OP_H_
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+template <typename Device, typename Scalar>
+class SnapshotOp : public OpKernel {
+ public:
+  explicit SnapshotOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = context->input(0);
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, input.shape(), &output));
+    const Device& device = context->eigen_device<Device>();
+    device.memcpy(output->template flat<Scalar>().data(),
+                  input.template flat<Scalar>().data(),
+                  input.NumElements() * sizeof(Scalar));
+  }
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_SNAPSHOT_OP_H_
diff --git a/tensorflow/core/kernels/snapshot_op_gpu.cu.cc b/tensorflow/core/kernels/snapshot_op_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..52070be838d65d21813dfe097db9c395ef5a8448
--- /dev/null
+++ b/tensorflow/core/kernels/snapshot_op_gpu.cu.cc
@@ -0,0 +1,37 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#if GOOGLE_CUDA
+
+// See docs in ../ops/array_ops.cc.
+#include "tensorflow/core/kernels/snapshot_op.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+
+namespace tensorflow {
+typedef Eigen::GpuDevice GPUDevice;
+
+#define REGISTER_KERNEL(TYPE)                                        \
+  REGISTER_KERNEL_BUILDER(                                           \
+      Name("Snapshot").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
+      SnapshotOp<GPUDevice, TYPE>);
+
+TF_CALL_POD_TYPES(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/softmax_op_functor.h b/tensorflow/core/kernels/softmax_op_functor.h
index 1f38bdce8c3a8f70e89efe62ad6c6f385bb5dfc0..d3a267ed877eedf8ed3845ebd11255f0690b3106 100644
--- a/tensorflow/core/kernels/softmax_op_functor.h
+++ b/tensorflow/core/kernels/softmax_op_functor.h
@@ -64,23 +64,21 @@ struct SoftmaxEigenImpl {
     one_by_class.set(1, num_classes);
 #endif
     // shifted_logits = logits - max(logits along classes);
-    auto shifted_logits = (logits -
-                           logits.maximum(along_class)
-                               .eval()
-                               .reshape(batch_by_one)
-                               .broadcast(one_by_class));
+    auto shifted_logits = (logits - logits.maximum(along_class)
+                                        .eval()
+                                        .reshape(batch_by_one)
+                                        .broadcast(one_by_class));
     if (log) {
       // Calculate the log of the softmax
       // softmax = logits - max(logits along classes);
       softmax.device(d) = shifted_logits;
       // softmax = softmax - log(sum(exp(softmax along classes)));
-      softmax.device(d) = (softmax -
-                           softmax.exp()
-                               .sum(along_class)
-                               .eval()
-                               .reshape(batch_by_one)
-                               .log()
-                               .broadcast(one_by_class));
+      softmax.device(d) = (softmax - softmax.exp()
+                                         .sum(along_class)
+                                         .log()
+                                         .eval()
+                                         .reshape(batch_by_one)
+                                         .broadcast(one_by_class));
     } else {
       // NOTE(touts): If you modify this implementation please run
       // the BM_ImageNetSoftmaxFwd benchmark in nn_ops_test.cc.
@@ -88,12 +86,11 @@ struct SoftmaxEigenImpl {
       // softmax = exp(logits - max(logits along classes));
       softmax.device(d) = shifted_logits.exp();
       // softmax = softmax * (1 / sum(softmax along classes));
-      softmax.device(d) = (softmax *
-                           softmax.sum(along_class)
-                               .inverse()
-                               .eval()
-                               .reshape(batch_by_one)
-                               .broadcast(one_by_class));
+      softmax.device(d) = (softmax * softmax.sum(along_class)
+                                         .inverse()
+                                         .eval()
+                                         .reshape(batch_by_one)
+                                         .broadcast(one_by_class));
     }
   }
 };
diff --git a/tensorflow/core/kernels/sparse_matmul_op_test.cc b/tensorflow/core/kernels/sparse_matmul_op_test.cc
index a0c54805e2f348362f496cd77c16508d66671ada..f815ca9e344664c4c95befccb88e750eb99d0eaf 100644
--- a/tensorflow/core/kernels/sparse_matmul_op_test.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op_test.cc
@@ -284,12 +284,12 @@ class SparseMatmulOpTest : public ::testing::Test {
       uint16_t* data3_bfloat16_p =
           reinterpret_cast<uint16_t*>(data3_bfloat16) + i;
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-            data3_p[1] = 0;  
-            data3_bfloat16_p[0] = data3_p[0];  
+            data3_p[1] = 0;
+            data3_bfloat16_p[0] = data3_p[0];
 #else
-            data3_p[0] = 0;  
-            data3_bfloat16_p[0] = data3_p[1];  
-#endif  
+            data3_p[0] = 0;
+            data3_bfloat16_p[0] = data3_p[1];
+#endif
     }
   }
 
diff --git a/tensorflow/core/kernels/sparse_reshape_op.cc b/tensorflow/core/kernels/sparse_reshape_op.cc
index f0f353871d0449c08492ddb0a2fc3db27b245a9d..939d404aa442e6d3384d46f19cc54771cb53a27b 100644
--- a/tensorflow/core/kernels/sparse_reshape_op.cc
+++ b/tensorflow/core/kernels/sparse_reshape_op.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/reshape_util.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 
 namespace tensorflow {
@@ -33,124 +34,10 @@ class SparseReshapeOp : public OpKernel {
   explicit SparseReshapeOp(OpKernelConstruction* context) : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
-    const Tensor& input_ind_in = context->input(0);
-    OP_REQUIRES(context, TensorShapeUtils::IsMatrix(input_ind_in.shape()),
-                errors::InvalidArgument(
-                    "Input indices should be a matrix but received shape ",
-                    input_ind_in.shape().DebugString()));
-
-    const Tensor& input_shape_in = context->input(1);
-    OP_REQUIRES(context, TensorShapeUtils::IsVector(input_shape_in.shape()),
-                errors::InvalidArgument(
-                    "Input shape should be a vector but received shape ",
-                    input_shape_in.shape().DebugString()));
-
-    const Tensor& new_shape_in = context->input(2);
-    OP_REQUIRES(context, TensorShapeUtils::IsVector(new_shape_in.shape()),
-                errors::InvalidArgument(
-                    "New shape should be a vector but received shape ",
-                    new_shape_in.shape().DebugString()));
-
-    const int64 input_rank = input_shape_in.NumElements();
-    const int64 output_rank = new_shape_in.NumElements();
-
-    const TensorShape input_shape(input_shape_in.vec<int64>());
-    const int64 dense_size = input_shape.num_elements();
-
-    const int64 nnz = input_ind_in.shape().dim_size(0);
-
-    // Compute the output shape.  Determine product of specified
-    // dimensions, and find the index of the unspecified one. Largely the
-    // same calculation as reshape_op
-    TensorShape output_shape;
-    int64 product = 1;
-    int unknown_index = -1;
-    auto new_shape = new_shape_in.vec<int64>();
-    for (int d = 0; d < output_rank; ++d) {
-      const int64 size = new_shape(d);
-      if (size == -1) {
-        OP_REQUIRES(
-            context, unknown_index == -1,
-            errors::InvalidArgument("only one output shape size may be -1, "
-                                    "not both ",
-                                    unknown_index, " and ", d));
-        unknown_index = d;
-        output_shape.AddDim(1);
-      } else {
-        OP_REQUIRES(context, size >= 0,
-                    errors::InvalidArgument(
-                        "size ", d, " must be non-negative, not ", size));
-        output_shape.AddDim(size);
-        product *= size;
-      }
-    }
-    if (unknown_index != -1) {
-      OP_REQUIRES(
-          context, product > 0,
-          errors::InvalidArgument("SparseReshape cannot infer the missing "
-                                  "input size for an empty tensor unless all "
-                                  "specified input sizes are non-zero"));
-      const int64 missing = dense_size / product;
-      OP_REQUIRES(
-          context, product * missing == dense_size,
-          errors::InvalidArgument(
-              "Input to reshape is a SparseTensor with ", dense_size,
-              " dense values, but the requested shape requires a multiple of ",
-              product));
-      output_shape.set_dim(unknown_index, missing);
-    }
-
-    OP_REQUIRES(context, output_shape.num_elements() == dense_size,
-                errors::InvalidArgument("Input to reshape is a tensor with ",
-                                        dense_size,
-                                        " dense values, but the "
-                                        "requested shape has ",
-                                        output_shape.num_elements()));
-
-    // Optimize for reshaping to the same shape.
-    if (input_shape == output_shape) {
-      context->set_output(0, input_ind_in);
-      context->set_output(1, input_shape_in);
-      return;
-    }
-
-    gtl::InlinedVector<int64, 8> input_strides(input_rank);
-    input_strides[input_rank - 1] = 1;
-    for (int d = input_rank - 2; d >= 0; --d) {
-      input_strides[d] = input_strides[d + 1] * input_shape.dim_size(d + 1);
-    }
-
-    gtl::InlinedVector<int64, 8> output_strides(output_rank);
-    output_strides[output_rank - 1] = 1;
-    for (int d = output_rank - 2; d >= 0; --d) {
-      output_strides[d] = output_strides[d + 1] * output_shape.dim_size(d + 1);
-    }
-
-    Tensor* output_ind_out = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, TensorShape({nnz, output_rank}),
-                                            &output_ind_out));
-    auto input_ind = input_ind_in.matrix<int64>();
-    auto output_ind = output_ind_out->matrix<int64>();
-    for (int i = 0; i < nnz; ++i) {
-      int64 id = 0;
-      for (int j = 0; j < input_rank; ++j) {
-        id += input_ind(i, j) * input_strides[j];
-      }
-      for (int j = 0; j < output_rank; ++j) {
-        output_ind(i, j) = id / output_strides[j];
-        id %= output_strides[j];
-      }
-    }
-
-    Tensor* output_shape_out = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(1, TensorShape({output_rank}),
-                                            &output_shape_out));
-    auto output_shape_vec = output_shape_out->vec<int64>();
-    for (int j = 0; j < output_shape.dims(); ++j) {
-      output_shape_vec(j) = output_shape.dim_size(j);
-    }
+    Tensor output_indices;
+    Tensor output_shape;
+    Reshape(context, context->input(0), context->input(1), context->input(2),
+            0 /* output indices index */, 1 /* output shape index */);
   }
 };
 
diff --git a/tensorflow/core/kernels/split_lib_gpu.cu.cc b/tensorflow/core/kernels/split_lib_gpu.cu.cc
index dd6fc6115f7b5bce60f5373c8556e7b1642afd6a..9f234fc0935be0662b0d8df1a6bd1c109ab24fd9 100644
--- a/tensorflow/core/kernels/split_lib_gpu.cu.cc
+++ b/tensorflow/core/kernels/split_lib_gpu.cu.cc
@@ -52,7 +52,7 @@ void SplitCustom<Device, T>::operator()(
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
 TF_CALL_complex64(DEFINE_GPU_KERNELS);
 TF_CALL_complex128(DEFINE_GPU_KERNELS);
-DEFINE_GPU_KERNELS(bfloat16);
+TF_CALL_bfloat16(DEFINE_GPU_KERNELS);
 
 #undef DEFINE_GPU_KERNELS
 #define DEFINE_GPU_KERNELS(T) template struct SplitCustom<Eigen::GpuDevice, T>;
@@ -60,7 +60,7 @@ DEFINE_GPU_KERNELS(bfloat16);
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
 TF_CALL_complex64(DEFINE_GPU_KERNELS);
 TF_CALL_complex128(DEFINE_GPU_KERNELS);
-DEFINE_GPU_KERNELS(bfloat16);
+TF_CALL_bfloat16(DEFINE_GPU_KERNELS);
 
 #undef DEFINE_GPU_KERNELS
 
@@ -243,6 +243,7 @@ struct SplitVOpGPULaunch {
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL);
 TF_CALL_complex64(REGISTER_GPU_KERNEL);
 TF_CALL_complex128(REGISTER_GPU_KERNEL);
+TF_CALL_bfloat16(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 #define REGISTER_GPU_KERNEL(T)                 \
   template struct SplitVOpGPULaunch<T, int32>; \
@@ -251,7 +252,7 @@ TF_CALL_complex128(REGISTER_GPU_KERNEL);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL);
 TF_CALL_complex64(REGISTER_GPU_KERNEL);
 TF_CALL_complex128(REGISTER_GPU_KERNEL);
-REGISTER_GPU_KERNEL(bfloat16);
+TF_CALL_bfloat16(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/split_op.cc b/tensorflow/core/kernels/split_op.cc
index 58e1a73be61cf04aba05ebadb8d8e49f6aacef6b..90d7e225ed0e365c51c15473df1b283b48678d05 100644
--- a/tensorflow/core/kernels/split_op.cc
+++ b/tensorflow/core/kernels/split_op.cc
@@ -360,6 +360,8 @@ class SplitOpSYCL : public SplitOpBase<SYCLDevice, T> {
 
 TF_CALL_ALL_TYPES(REGISTER_SPLIT);
 REGISTER_SPLIT(quint8);
+// TODO(xpan): Merge bfloat16 into TF_CALL_ALL_TYPES
+REGISTER_SPLIT(bfloat16);
 
 #undef REGISTER_SPLIT
 
@@ -375,6 +377,7 @@ REGISTER_SPLIT(quint8);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
 TF_CALL_complex64(REGISTER_GPU);
 TF_CALL_complex128(REGISTER_GPU);
+REGISTER_GPU(bfloat16);
 #undef REGISTER_GPU
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/stateless_random_ops.cc b/tensorflow/core/kernels/stateless_random_ops.cc
index f6fb0a121d8336a1abd624103e33e3ed8869f0d2..88fcf542fb0cc726b228be34d0fe7b92663ce95d 100644
--- a/tensorflow/core/kernels/stateless_random_ops.cc
+++ b/tensorflow/core/kernels/stateless_random_ops.cc
@@ -50,9 +50,18 @@ class StatelessRandomOpBase : public OpKernel {
     if (shape.num_elements() == 0) return;
 
     // Grab the two seeds
-    const auto seed = seed_t.flat<int64>();
-    const uint64 seed0 = internal::SubtleMustCopy(seed(0));
-    const uint64 seed1 = internal::SubtleMustCopy(seed(1));
+    uint64 seed0;
+    uint64 seed1;
+    if (context->input_dtype(1) == DT_INT32) {
+      const auto seed = seed_t.flat<int32>();
+      seed0 = internal::SubtleMustCopy(seed(0));
+      seed1 = internal::SubtleMustCopy(seed(1));
+    } else {
+      CHECK_EQ(DT_INT64, context->input_dtype(1));
+      const auto seed = seed_t.flat<int64>();
+      seed0 = internal::SubtleMustCopy(seed(0));
+      seed1 = internal::SubtleMustCopy(seed(1));
+    }
 
     // Scramble the seeds so that the user doesn't need to worry about which
     // part of the seed needs to be strong.
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index 8fc40db3cc22060eb18b64c2246188925626b8bf..73b6d4cf6a212d3f09a6955cb8a138d2aec58b75 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -427,6 +427,7 @@ REGISTER_STRIDED_SLICE(bfloat16);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
 TF_CALL_complex64(REGISTER_GPU);
 TF_CALL_complex128(REGISTER_GPU);
+TF_CALL_int64(REGISTER_GPU);
 
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
diff --git a/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc b/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc
index a8487f49f4488269e058c6b7ee94d0f82aeb5270..8ca27e3b920e7c0cd36343d0c9db5a6098b6bede 100644
--- a/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc
@@ -53,6 +53,7 @@ typedef Eigen::GpuDevice GPUDevice;
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
 TF_CALL_complex64(DEFINE_GPU_KERNELS);
 TF_CALL_complex128(DEFINE_GPU_KERNELS);
+TF_CALL_int64(DEFINE_GPU_KERNELS);
 DEFINE_GPU_KERNELS(int32);
 
 #undef DEFINE_GPU_KERNELS
diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h
index 7d4288742644be26d7e91e730b611a165989063c..afe3a051e64cbff2040d32e95c5a4aacb2decbd1 100644
--- a/tensorflow/core/kernels/strided_slice_op_impl.h
+++ b/tensorflow/core/kernels/strided_slice_op_impl.h
@@ -84,16 +84,16 @@ void HandleStridedSliceCase(OpKernelContext* context,
 
   gtl::InlinedVector<int64, 4> processing_dims = processing_shape.dim_sizes();
   if (is_simple_slice) {
-    gtl::InlinedVector<int64, 4> sizes(begin.size());
+    Eigen::DSizes<Eigen::DenseIndex, NDIM> begin_di;
+    Eigen::DSizes<Eigen::DenseIndex, NDIM> sizes_di;
     for (int i = 0; i < NDIM; ++i) {
-      sizes[i] = end[i] - begin[i];
+      begin_di[i] = begin[i];
+      sizes_di[i] = end[i] - begin[i];
     }
-    const TensorShape final_shape = result->shape();
-    CHECK(result->CopyFrom(*result, processing_shape));
-    const Tensor input = context->input(0);
-    functor::Slice<Device, T, NDIM>()(
-        context->eigen_device<Device>(), result, input, begin, sizes);
-    CHECK(result->CopyFrom(*result, final_shape));
+    functor::Slice<Device, Proxy, NDIM>()(
+        context->eigen_device<Device>(),
+        result->bit_casted_shaped<Proxy, NDIM>(processing_dims),
+        context->input(0).bit_casted_tensor<Proxy, NDIM>(), begin_di, sizes_di);
   } else {
     Eigen::DSizes<Eigen::DenseIndex, NDIM> begin_di;
     Eigen::DSizes<Eigen::DenseIndex, NDIM> end_di;
@@ -196,9 +196,10 @@ class HandleStridedSliceAssignCase<Device, T, 0> {
   extern template struct StridedSlice<GPUDevice, T, NDIM>;         \
   template <>                                                      \
   void Slice<GPUDevice, T, NDIM>::operator()(                      \
-      const GPUDevice& d, Tensor* output, const Tensor& input,     \
-      const gtl::ArraySlice<int64>& slice_indices,                 \
-      const gtl::ArraySlice<int64>& slice_sizes);                  \
+      const GPUDevice& d, typename TTypes<T, NDIM>::Tensor output, \
+      typename TTypes<T, NDIM>::ConstTensor input,                 \
+      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& indices,       \
+      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& sizes);        \
   extern template struct Slice<GPUDevice, T, NDIM>;                \
   template <>                                                      \
   void StridedSliceGrad<GPUDevice, T, NDIM>::operator()(           \
@@ -283,6 +284,7 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_N_GPU);
 TF_CALL_complex64(DECLARE_FOR_N_GPU);
 TF_CALL_complex128(DECLARE_FOR_N_GPU);
 DECLARE_FOR_N_GPU(int32);
+DECLARE_FOR_N_GPU(int64);
 #endif  // END GOOGLE_CUDA
 
 TF_CALL_ALL_TYPES(DECLARE_FOR_N_CPU);
@@ -298,6 +300,7 @@ DECLARE_FOR_N_CPU(bfloat16);
 TF_CALL_SYCL_PROXY_TYPES(PREVENT_FOR_N_SYCL);
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DECLARE_FOR_N_SYCL);
 DECLARE_FOR_N_SYCL(int32);
+DECLARE_FOR_N_SYCL(int64);
 
 #undef DECLARE_FOR_N_SYCL
 #endif // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/strided_slice_op_test.cc b/tensorflow/core/kernels/strided_slice_op_test.cc
index 78bb15463c2ae4bb1b2b00a810223ab00b3aee70..281ca0f58fe8148d8ad5ba959b88fbe16950c31d 100644
--- a/tensorflow/core/kernels/strided_slice_op_test.cc
+++ b/tensorflow/core/kernels/strided_slice_op_test.cc
@@ -76,69 +76,20 @@ static void SliceHelper(int iters, int size) {
   testing::UseRealTime();
 }
 
-template <typename T>
-static void Dim8SliceHelper(int iters, int size) {
-  testing::StopTiming();
-  Graph* g = new Graph(OpRegistry::Global());
-  DataType dt = DataTypeToEnum<T>::v();
-  int kDim = 100;
-  int kMaxSize = 15000;
-  CHECK_LT(size, kMaxSize);
-
-  Tensor begin(DT_INT32, TensorShape({8}));
-  begin.flat<int32>()(10) = 10;
-  for (int i = 1; i < 7; ++i) {
-    begin.flat<int32>()(i) = 0;
-  }
-  begin.flat<int32>()(7) = 10;
-
-  Tensor end(DT_INT32, TensorShape({8}));
-  end.flat<int32>()(0) = 10 + kDim;
-  for (int i = 1; i < 7; ++i) {
-    end.flat<int32>()(i) = 1;
-  }
-  end.flat<int32>()(7) = 10 + size;
-
-  Tensor strides(DT_INT32, TensorShape({8}));
-  for (int i = 0; i < 8; ++i) {
-    strides.flat<int32>()(i) = 1;
-  }
-
-  Tensor input(dt, TensorShape({2*kDim, 1, 1, 1, 1, 1, 1, kMaxSize}));
-  input.flat<T>().setRandom();
-
-  Node* node;
-  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "StridedSlice")
-                  .Input(test::graph::Constant(g, input))
-                  .Input(test::graph::Constant(g, begin))
-                  .Input(test::graph::Constant(g, end))
-                  .Input(test::graph::Constant(g, strides))
-                  .Attr("T", dt)
-                  .Finalize(g, &node));
-
-  testing::BytesProcessed(static_cast<int64>(iters) * kDim * size * sizeof(T));
-  testing::StartTiming();
-  test::Benchmark("cpu", g).Run(iters);
-  testing::UseRealTime();
-}
-
 static void BM_SliceFloat(int iters, int dim2) {
   SliceHelper<float>(iters, dim2);
-  Dim8SliceHelper<float>(iters, dim2);
 }
 
 BENCHMARK(BM_SliceFloat)->Arg(100)->Arg(1000)->Arg(10000);
 
 static void BM_SliceComplex64(int iters, int dim2) {
   SliceHelper<std::complex<float>>(iters, dim2);
-  Dim8SliceHelper<std::complex<float>>(iters, dim2);
 }
 
 BENCHMARK(BM_SliceComplex64)->Arg(100)->Arg(1000)->Arg(10000);
 
 static void BM_SliceBFloat16(int iters, int dim2) {
   SliceHelper<bfloat16>(iters, dim2);
-  Dim8SliceHelper<bfloat16>(iters, dim2);
 }
 
 BENCHMARK(BM_SliceBFloat16)->Arg(100)->Arg(1000)->Arg(10000);
diff --git a/tensorflow/core/kernels/summary_kernels.cc b/tensorflow/core/kernels/summary_kernels.cc
index 3706f51cf40d88f1b0786857536f2ed6a9da1b22..f092afe66ca1a9130410904a2c1158cfc3a8ac70 100644
--- a/tensorflow/core/kernels/summary_kernels.cc
+++ b/tensorflow/core/kernels/summary_kernels.cc
@@ -67,6 +67,7 @@ class CreateSummaryDbWriterOp : public OpKernel {
     SummaryWriterInterface* s;
     auto db = Sqlite::Open(db_uri);
     OP_REQUIRES_OK(ctx, db.status());
+    db.ValueOrDie()->UseWriteAheadLogWithReducedDurabilityIfPossible();
     OP_REQUIRES_OK(
         ctx, CreateSummaryDbWriter(std::move(db.ValueOrDie()), experiment_name,
                                    run_name, user_name, ctx->env(), &s));
@@ -111,8 +112,8 @@ class WriteSummaryOp : public OpKernel {
     OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s));
     core::ScopedUnref unref(s);
     const Tensor* tmp;
-    OP_REQUIRES_OK(ctx, ctx->input("global_step", &tmp));
-    const int64 global_step = tmp->scalar<int64>()();
+    OP_REQUIRES_OK(ctx, ctx->input("step", &tmp));
+    const int64 step = tmp->scalar<int64>()();
     OP_REQUIRES_OK(ctx, ctx->input("tag", &tmp));
     const string& tag = tmp->scalar<string>()();
     OP_REQUIRES_OK(ctx, ctx->input("summary_metadata", &tmp));
@@ -121,8 +122,7 @@ class WriteSummaryOp : public OpKernel {
     const Tensor* t;
     OP_REQUIRES_OK(ctx, ctx->input("tensor", &t));
 
-    OP_REQUIRES_OK(ctx,
-                   s->WriteTensor(global_step, *t, tag, serialized_metadata));
+    OP_REQUIRES_OK(ctx, s->WriteTensor(step, *t, tag, serialized_metadata));
   }
 };
 REGISTER_KERNEL_BUILDER(Name("WriteSummary").Device(DEVICE_CPU),
@@ -158,15 +158,15 @@ class WriteScalarSummaryOp : public OpKernel {
     OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s));
     core::ScopedUnref unref(s);
     const Tensor* tmp;
-    OP_REQUIRES_OK(ctx, ctx->input("global_step", &tmp));
-    const int64 global_step = tmp->scalar<int64>()();
+    OP_REQUIRES_OK(ctx, ctx->input("step", &tmp));
+    const int64 step = tmp->scalar<int64>()();
     OP_REQUIRES_OK(ctx, ctx->input("tag", &tmp));
     const string& tag = tmp->scalar<string>()();
 
     const Tensor* t;
     OP_REQUIRES_OK(ctx, ctx->input("value", &t));
 
-    OP_REQUIRES_OK(ctx, s->WriteScalar(global_step, *t, tag));
+    OP_REQUIRES_OK(ctx, s->WriteScalar(step, *t, tag));
   }
 };
 REGISTER_KERNEL_BUILDER(Name("WriteScalarSummary").Device(DEVICE_CPU),
@@ -181,15 +181,15 @@ class WriteHistogramSummaryOp : public OpKernel {
     OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s));
     core::ScopedUnref unref(s);
     const Tensor* tmp;
-    OP_REQUIRES_OK(ctx, ctx->input("global_step", &tmp));
-    const int64 global_step = tmp->scalar<int64>()();
+    OP_REQUIRES_OK(ctx, ctx->input("step", &tmp));
+    const int64 step = tmp->scalar<int64>()();
     OP_REQUIRES_OK(ctx, ctx->input("tag", &tmp));
     const string& tag = tmp->scalar<string>()();
 
     const Tensor* t;
     OP_REQUIRES_OK(ctx, ctx->input("values", &t));
 
-    OP_REQUIRES_OK(ctx, s->WriteHistogram(global_step, *t, tag));
+    OP_REQUIRES_OK(ctx, s->WriteHistogram(step, *t, tag));
   }
 };
 REGISTER_KERNEL_BUILDER(Name("WriteHistogramSummary").Device(DEVICE_CPU),
@@ -210,8 +210,8 @@ class WriteImageSummaryOp : public OpKernel {
     OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s));
     core::ScopedUnref unref(s);
     const Tensor* tmp;
-    OP_REQUIRES_OK(ctx, ctx->input("global_step", &tmp));
-    const int64 global_step = tmp->scalar<int64>()();
+    OP_REQUIRES_OK(ctx, ctx->input("step", &tmp));
+    const int64 step = tmp->scalar<int64>()();
     OP_REQUIRES_OK(ctx, ctx->input("tag", &tmp));
     const string& tag = tmp->scalar<string>()();
     const Tensor* bad_color;
@@ -224,8 +224,7 @@ class WriteImageSummaryOp : public OpKernel {
     const Tensor* t;
     OP_REQUIRES_OK(ctx, ctx->input("tensor", &t));
 
-    OP_REQUIRES_OK(
-        ctx, s->WriteImage(global_step, *t, tag, max_images_, *bad_color));
+    OP_REQUIRES_OK(ctx, s->WriteImage(step, *t, tag, max_images_, *bad_color));
   }
 
  private:
@@ -247,8 +246,8 @@ class WriteAudioSummaryOp : public OpKernel {
     OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s));
     core::ScopedUnref unref(s);
     const Tensor* tmp;
-    OP_REQUIRES_OK(ctx, ctx->input("global_step", &tmp));
-    const int64 global_step = tmp->scalar<int64>()();
+    OP_REQUIRES_OK(ctx, ctx->input("step", &tmp));
+    const int64 step = tmp->scalar<int64>()();
     OP_REQUIRES_OK(ctx, ctx->input("tag", &tmp));
     const string& tag = tmp->scalar<string>()();
     OP_REQUIRES_OK(ctx, ctx->input("sample_rate", &tmp));
@@ -257,8 +256,8 @@ class WriteAudioSummaryOp : public OpKernel {
     const Tensor* t;
     OP_REQUIRES_OK(ctx, ctx->input("tensor", &t));
 
-    OP_REQUIRES_OK(
-        ctx, s->WriteAudio(global_step, *t, tag, max_outputs_, sample_rate));
+    OP_REQUIRES_OK(ctx,
+                   s->WriteAudio(step, *t, tag, max_outputs_, sample_rate));
   }
 
  private:
@@ -278,8 +277,8 @@ class WriteGraphSummaryOp : public OpKernel {
     OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s));
     core::ScopedUnref unref(s);
     const Tensor* t;
-    OP_REQUIRES_OK(ctx, ctx->input("global_step", &t));
-    const int64 global_step = t->scalar<int64>()();
+    OP_REQUIRES_OK(ctx, ctx->input("step", &t));
+    const int64 step = t->scalar<int64>()();
     OP_REQUIRES_OK(ctx, ctx->input("tensor", &t));
     std::unique_ptr<GraphDef> graph{new GraphDef};
     if (!ParseProtoUnlimited(graph.get(), t->scalar<string>()())) {
@@ -287,7 +286,7 @@ class WriteGraphSummaryOp : public OpKernel {
           errors::DataLoss("Bad tf.GraphDef binary proto tensor string"));
       return;
     }
-    OP_REQUIRES_OK(ctx, s->WriteGraph(global_step, std::move(graph)));
+    OP_REQUIRES_OK(ctx, s->WriteGraph(step, std::move(graph)));
   }
 };
 REGISTER_KERNEL_BUILDER(Name("WriteGraphSummary").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/tile_functor_gpu.cu.cc b/tensorflow/core/kernels/tile_functor_gpu.cu.cc
index 5a36e7567beb16e447de28d3cf930fbd29f6c078..84a5060fc3cd17c09b905d606dba62bbaa7f1373 100644
--- a/tensorflow/core/kernels/tile_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/tile_functor_gpu.cu.cc
@@ -90,6 +90,7 @@ typedef Eigen::GpuDevice GPUDevice;
   template struct Tile<GPUDevice, T, int32>; \
   template struct Tile<GPUDevice, T, int64>;
 
+TF_CALL_bool(DEFINE_TYPE);
 TF_CALL_int16(DEFINE_TYPE);
 TF_CALL_int32(DEFINE_TYPE);
 TF_CALL_int64(DEFINE_TYPE);
diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc
index fa5afe6a31b0c660151070f5cd2e1d5be280adc5..68cdae3249a070caeb77ce944be2c32791e4245c 100644
--- a/tensorflow/core/kernels/tile_ops.cc
+++ b/tensorflow/core/kernels/tile_ops.cc
@@ -222,6 +222,7 @@ TF_CALL_complex128(HANDLE_TYPE_NAME_CPU);
 TF_CALL_string(HANDLE_TYPE_NAME_CPU);
 
 #if GOOGLE_CUDA
+TF_CALL_bool(HANDLE_TYPE_NAME_GPU);
 TF_CALL_float(HANDLE_TYPE_NAME_GPU);
 TF_CALL_double(HANDLE_TYPE_NAME_GPU);
 TF_CALL_int16(HANDLE_TYPE_NAME_GPU);
@@ -534,7 +535,7 @@ REGISTER_KERNEL_BUILDER(Name("TileGrad")
                         TileGradientOp<CPUDevice, int64>);
 
 #if GOOGLE_CUDA
-#define REGISTER_GPU(type)                                         \
+#define REGISTER_GPU_TILE(type)                                    \
   REGISTER_KERNEL_BUILDER(Name("Tile")                             \
                               .Device(DEVICE_GPU)                  \
                               .TypeConstraint<type>("T")           \
@@ -546,7 +547,9 @@ REGISTER_KERNEL_BUILDER(Name("TileGrad")
                               .TypeConstraint<type>("T")           \
                               .TypeConstraint<int64>("Tmultiples") \
                               .HostMemory("multiples"),            \
-                          TileOp<GPUDevice, int64>);               \
+                          TileOp<GPUDevice, int64>);
+
+#define REGISTER_GPU_TILE_GRAD(type)                               \
   REGISTER_KERNEL_BUILDER(Name("TileGrad")                         \
                               .Device(DEVICE_GPU)                  \
                               .TypeConstraint<type>("T")           \
@@ -560,6 +563,11 @@ REGISTER_KERNEL_BUILDER(Name("TileGrad")
                               .HostMemory("multiples"),            \
                           TileGradientOp<GPUDevice, int64>);
 
+#define REGISTER_GPU(type) \
+  REGISTER_GPU_TILE(type); \
+  REGISTER_GPU_TILE_GRAD(type);
+
+TF_CALL_bool(REGISTER_GPU_TILE);
 TF_CALL_float(REGISTER_GPU);
 TF_CALL_double(REGISTER_GPU);
 TF_CALL_half(REGISTER_GPU);
@@ -568,6 +576,8 @@ TF_CALL_int32(REGISTER_GPU);
 TF_CALL_complex64(REGISTER_GPU);
 TF_CALL_complex128(REGISTER_GPU)
 
+#undef REGISTER_GPU_TILE
+#undef REGISTER_GPU_TILE_GRAD
 #undef REGISTER_GPU
 #endif  // GOOGLE_CUDA
 
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index 98dfa5a3dd8ee02c077d6924ca19e90838c42074..38e77ab60fb7126bcdedc09bfe9e2ec7de88c0ad 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -15,12 +15,13 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "tensorflow/core/kernels/training_ops.h"
 #include <algorithm>
+
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/training_op_helpers.h"
+#include "tensorflow/core/kernels/training_ops.h"
 #include "tensorflow/core/kernels/variable_ops.h"
 
 #ifdef TENSORFLOW_USE_SYCL
@@ -75,9 +76,9 @@ struct ApplyAdadelta<CPUDevice, T> {
         accum * rho() + grad.square() * (static_cast<T>(1) - rho());
     const auto update =
         (accum_update + epsilon()).sqrt() * (accum + epsilon()).rsqrt() * grad;
+    var.device(d) -= update * lr();
     accum_update.device(d) =
         accum_update * rho() + update.square() * (static_cast<T>(1) - rho());
-    var.device(d) -= update * lr();
   }
 };
 
@@ -361,6 +362,37 @@ struct ApplyCenteredRMSProp<CPUDevice, T> {
   }
 };
 
+template <typename T>
+struct ApplyAddSign<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar alpha,
+                  typename TTypes<T>::ConstScalar sign_decay,
+                  typename TTypes<T>::ConstScalar beta,
+                  typename TTypes<T>::ConstFlat grad) {
+    m.device(d) = m * beta() + grad * (static_cast<T>(1) - beta());
+    auto sign_gm = grad.sign() * m.sign();
+    var.device(d) -= lr() * (alpha() + sign_decay() * sign_gm) * grad;
+  }
+};
+
+template <typename T>
+struct ApplyPowerSign<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar logbase,
+                  typename TTypes<T>::ConstScalar sign_decay,
+                  typename TTypes<T>::ConstScalar beta,
+                  typename TTypes<T>::ConstFlat grad) {
+    m.device(d) = m * beta() + grad * (static_cast<T>(1) - beta());
+    auto sign_gm = grad.sign() * m.sign();
+    auto grad_scale = (logbase() * sign_decay() * sign_gm).exp();
+    var.device(d) -= lr() * grad_scale * grad;
+  }
+};
+
 }  // namespace functor
 
 template <typename Device, typename T>
@@ -504,8 +536,9 @@ class ApplyAdadeltaOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    if (use_exclusive_lock_) {
-      mutex_lock l1(*GetTrainingVariableMutex(ctx, 0));
+    mutex* mu = GetTrainingVariableMutex(ctx, 0);
+    if (use_exclusive_lock_ && mu != nullptr) {
+      mutex_lock l1(*mu);
       // Don't try to acquire a lock on the second ref as they share the same
       // mutex.
       //
@@ -650,15 +683,21 @@ class SparseApplyAdadeltaOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
   }
 
-  void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    mutex* mu_var = GetTrainingVariableMutex(ctx, 0);
+  void Compute(OpKernelContext* ctx) override {
+    mutex* mu = GetTrainingVariableMutex(ctx, 0);
     // mu_accum is actually the same mutex as mu_var since currently we use a
     // global mutex.
     //
     // mutex* mu_accum = ctx->input_ref_mutex(1);
-    if (use_exclusive_lock_) {
-      mu_var->lock();
+    if (use_exclusive_lock_ && mu != nullptr) {
+      mutex_lock ml(*mu);
+      DoCompute(ctx);
+    } else {
+      DoCompute(ctx);
     }
+  }
+
+  void DoCompute(OpKernelContext* ctx) {
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
                             ctx, 0, use_exclusive_lock_, true, &var));
@@ -752,16 +791,13 @@ class SparseApplyAdadeltaOp : public OpKernel {
         const auto update =
             (accum_update_ + accum_update_.constant(epsilon_scalar)).sqrt() *
             (accum_ + accum_.constant(epsilon_scalar)).rsqrt() * grad_;
+        auto v = var_flat.template chip<0>(index);
+        v -= update * update.constant(lr_scalar);
         accum_update_ =
             accum_update_ * accum_update_.constant(rho_scalar) +
             update.square() * update.constant(static_cast<T>(1) - rho_scalar);
-        auto v = var_flat.template chip<0>(index);
-        v -= update * update.constant(lr_scalar);
       }
     }
-    if (use_exclusive_lock_) {
-      mu_var->unlock();
-    }
 
     MaybeForwardRefInputToRefOutput(ctx, 0, 0);
   }
@@ -3243,4 +3279,220 @@ REGISTER_KERNELS(double, int64);
 
 #undef REGISTER_KERNELS
 
+
+template <typename Device, typename T>
+class ApplyAddSignOp : public OpKernel {
+ public:
+  explicit ApplyAddSignOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+
+    Tensor var;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 0, use_exclusive_lock_, false, &var));
+    Tensor m;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 1, use_exclusive_lock_, false, &m));
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(0)));
+    OP_REQUIRES(
+        ctx, m.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(1)));
+    const Tensor& lr = ctx->input(2);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
+                errors::InvalidArgument("lr is not a scalar: ",
+                                        lr.shape().DebugString()));
+    const Tensor& alpha = ctx->input(3);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(alpha.shape()),
+                errors::InvalidArgument("alpha is not a scalar: ",
+                                        alpha.shape().DebugString()));
+    const Tensor& sign_decay = ctx->input(4);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(alpha.shape()),
+                errors::InvalidArgument("sign_decay is not a scalar: ",
+                                        sign_decay.shape().DebugString()));
+    const Tensor& beta = ctx->input(5);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta.shape()),
+                errors::InvalidArgument("beta is not a scalar: ",
+                                        beta.shape().DebugString()));
+    const Tensor& grad = ctx->input(6);
+    OP_REQUIRES(ctx, var.shape().IsSameSize(m.shape()),
+                errors::InvalidArgument("var and m do not have the same shape",
+                                        var.shape().DebugString(), " ",
+                                        m.shape().DebugString()));
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(grad.shape()),
+        errors::InvalidArgument("var and grad do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                grad.shape().DebugString()));
+
+    const Device& device = ctx->template eigen_device<Device>();
+    functor::ApplyAddSign<Device, T>()(
+        device, var.flat<T>(), m.flat<T>(), lr.scalar<T>(), alpha.scalar<T>(),
+        sign_decay.scalar<T>(), beta.scalar<T>(), grad.flat<T>());
+    MaybeForwardRefInputToRefOutput(ctx, 0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;
+};
+
+#define REGISTER_KERNELS(D, T)                                        \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("ApplyAddSign").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+      ApplyAddSignOp<D##Device, T>);                                  \
+  REGISTER_KERNEL_BUILDER(Name("ResourceApplyAddSign")                \
+                              .Device(DEVICE_##D)                     \
+                              .HostMemory("var")                      \
+                              .HostMemory("m")                        \
+                              .TypeConstraint<T>("T"),                \
+                          ApplyAddSignOp<D##Device, T>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
+
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                                               \
+  template <>                                                             \
+  void ApplyAddSign<GPUDevice, T>::operator()(                            \
+      const GPUDevice& d,                                                 \
+      typename TTypes<T>::Flat var,                                       \
+      typename TTypes<T>::Flat m,                                         \
+      typename TTypes<T>::ConstScalar lr,                                 \
+      typename TTypes<T>::ConstScalar alpha,                              \
+      typename TTypes<T>::ConstScalar sign_decay,                         \
+      typename TTypes<T>::ConstScalar beta,                               \
+      typename TTypes<T>::ConstFlat grad);                                \
+  extern template struct ApplyAddSign<GPUDevice, T>;
+DECLARE_GPU_SPEC(Eigen::half);
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+}  // namespace functor
+
+REGISTER_KERNELS(GPU, Eigen::half);
+REGISTER_KERNELS(GPU, float);
+REGISTER_KERNELS(GPU, double);
+#endif
+#undef REGISTER_CPU_KERNELS
+#undef REGISTER_KERNELS
+
+
+template <typename Device, typename T>
+class ApplyPowerSignOp : public OpKernel {
+ public:
+  explicit ApplyPowerSignOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+
+    Tensor var;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 0, use_exclusive_lock_, false, &var));
+    Tensor m;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 1, use_exclusive_lock_, false, &m));
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(0)));
+    OP_REQUIRES(
+        ctx, m.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(1)));
+    const Tensor& lr = ctx->input(2);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
+                errors::InvalidArgument("lr is not a scalar: ",
+                                        lr.shape().DebugString()));
+    const Tensor& logbase = ctx->input(3);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(logbase.shape()),
+                errors::InvalidArgument("logbase is not a scalar: ",
+                                        logbase.shape().DebugString()));
+    const Tensor& sign_decay = ctx->input(4);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(logbase.shape()),
+                errors::InvalidArgument("sign_decay is not a scalar: ",
+                                        sign_decay.shape().DebugString()));
+    const Tensor& beta = ctx->input(5);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta.shape()),
+                errors::InvalidArgument("beta is not a scalar: ",
+                                        beta.shape().DebugString()));
+    const Tensor& grad = ctx->input(6);
+    OP_REQUIRES(ctx, var.shape().IsSameSize(m.shape()),
+                errors::InvalidArgument("var and m do not have the same shape",
+                                        var.shape().DebugString(), " ",
+                                        m.shape().DebugString()));
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(grad.shape()),
+        errors::InvalidArgument("var and grad do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                grad.shape().DebugString()));
+
+    const Device& device = ctx->template eigen_device<Device>();
+    functor::ApplyPowerSign<Device, T>()(
+        device, var.flat<T>(), m.flat<T>(), lr.scalar<T>(), logbase.scalar<T>(),
+        sign_decay.scalar<T>(), beta.scalar<T>(), grad.flat<T>());
+    MaybeForwardRefInputToRefOutput(ctx, 0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;
+};
+
+#define REGISTER_KERNELS(D, T)                                          \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("ApplyPowerSign").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+      ApplyPowerSignOp<D##Device, T>);                                  \
+  REGISTER_KERNEL_BUILDER(Name("ResourceApplyPowerSign")                \
+                              .Device(DEVICE_##D)                       \
+                              .HostMemory("var")                        \
+                              .HostMemory("m")                          \
+                              .TypeConstraint<T>("T"),                  \
+                          ApplyPowerSignOp<D##Device, T>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
+
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                                               \
+  template <>                                                             \
+  void ApplyPowerSign<GPUDevice, T>::operator()(                          \
+      const GPUDevice& d,                                                 \
+      typename TTypes<T>::Flat var,                                       \
+      typename TTypes<T>::Flat m,                                         \
+      typename TTypes<T>::ConstScalar lr,                                 \
+      typename TTypes<T>::ConstScalar logbase,                            \
+      typename TTypes<T>::ConstScalar sign_decay,                         \
+      typename TTypes<T>::ConstScalar beta,                               \
+      typename TTypes<T>::ConstFlat grad);                                \
+  extern template struct ApplyPowerSign<GPUDevice, T>;
+DECLARE_GPU_SPEC(Eigen::half);
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+}  // namespace functor
+
+REGISTER_KERNELS(GPU, Eigen::half);
+REGISTER_KERNELS(GPU, float);
+REGISTER_KERNELS(GPU, double);
+#endif
+#undef REGISTER_CPU_KERNELS
+#undef REGISTER_KERNELS
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h
index 99a714e0a27cd66b3e53ab732fd1c8929b91e106..7ee956053abd320058963a8cc0bffa1fdc2e085c 100644
--- a/tensorflow/core/kernels/training_ops.h
+++ b/tensorflow/core/kernels/training_ops.h
@@ -161,6 +161,29 @@ struct ApplyCenteredRMSProp {
                   typename TTypes<T>::ConstScalar epsilon,
                   typename TTypes<T>::ConstFlat grad);
 };
+
+template <typename Device, typename T>
+struct ApplyAddSign {
+  void operator()(const Device& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar alpha,
+                  typename TTypes<T>::ConstScalar sign_decay,
+                  typename TTypes<T>::ConstScalar beta,
+                  typename TTypes<T>::ConstFlat grad);
+};
+
+template <typename Device, typename T>
+struct ApplyPowerSign {
+  void operator()(const Device& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar logbase,
+                  typename TTypes<T>::ConstScalar sign_decay,
+                  typename TTypes<T>::ConstScalar beta,
+                  typename TTypes<T>::ConstFlat grad);
+};
+
 }  // end namespace functor
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc
index 3678b96e98f49994089487a833c9a0b4d662041e..d443a6b3c1d0b548e915216adbc05549a66eaeda 100644
--- a/tensorflow/core/kernels/training_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@@ -70,11 +70,11 @@ struct ApplyAdadelta<GPUDevice, T> {
     const auto update =
         (accum_update + epsilon.reshape(single).broadcast(bcast)).sqrt() *
         (accum + epsilon.reshape(single).broadcast(bcast)).rsqrt() * grad;
+    var.device(d) -= update * lr.reshape(single).broadcast(bcast);
     accum_update.device(d) =
         accum_update * rho.reshape(single).broadcast(bcast) +
         update.square() *
             (grad.constant(T(1)) - rho.reshape(single).broadcast(bcast));
-    var.device(d) -= update * lr.reshape(single).broadcast(bcast);
   }
 };
 
@@ -193,6 +193,71 @@ struct ApplyCenteredRMSProp<GPUDevice, T> {
   }
 };
 
+template <typename T>
+struct ApplyAddSign<GPUDevice, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar alpha,
+                  typename TTypes<T>::ConstScalar sign_decay,
+                  typename TTypes<T>::ConstScalar beta,
+                  typename TTypes<T>::ConstFlat grad) {
+    Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
+    bcast[0] = grad.dimension(0);
+    Eigen::Sizes<1> single;
+
+    // The following is the GPU equivalent of the CPU version:
+    // m.device(d) = m * beta() + grad * (static_cast<T>(1) - beta());
+    const auto one = static_cast<T>(1.0);
+    auto beta_bcast = beta.reshape(single).broadcast(bcast);
+    auto one_minus_beta =
+        (beta.constant(one) - beta).reshape(single).broadcast(bcast);
+    m.device(d) =  m * beta_bcast + grad * one_minus_beta;
+
+    // The following is the GPU equivalent of the CPU version:
+    // var.device(d) -= lr() * (alpha() + sign_decay() * sign_gm) * grad;
+    auto sign_gm = grad.sign() * m.sign();
+    auto lr_bcast = lr.reshape(single).broadcast(bcast);
+    auto alpha_bcast = alpha.reshape(single).broadcast(bcast);
+    auto sign_decay_bcast = sign_decay.reshape(single).broadcast(bcast);
+    var.device(d) -=
+        lr_bcast * (alpha_bcast + sign_decay_bcast * sign_gm) * grad;
+  }
+};
+
+template <typename T>
+struct ApplyPowerSign<GPUDevice, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar logbase,
+                  typename TTypes<T>::ConstScalar sign_decay,
+                  typename TTypes<T>::ConstScalar beta,
+                  typename TTypes<T>::ConstFlat grad) {
+    Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
+    bcast[0] = grad.dimension(0);
+    Eigen::Sizes<1> single;
+
+    // The following is the GPU equivalent of the CPU version:
+    // m.device(d) = m * beta() + grad * (static_cast<T>(1) - beta());
+    const auto one = static_cast<T>(1.0);
+    auto beta_bcast = beta.reshape(single).broadcast(bcast);
+    auto one_minus_beta =
+        (beta.constant(one) - beta).reshape(single).broadcast(bcast);
+    m.device(d) =  m * beta_bcast + grad * one_minus_beta;
+
+    // The following is the GPU equivalent of the CPU version:
+    // auto grad_scale = (logbase() * sign_decay() * sign_gm).exp();
+    // var.device(d) -= lr() * grad_scale * grad;
+    auto sign_gm = grad.sign() * m.sign();
+    auto lr_bcast = lr.reshape(single).broadcast(bcast);
+    auto logbase_bcast = logbase.reshape(single).broadcast(bcast);
+    auto sign_decay_bcast = sign_decay.reshape(single).broadcast(bcast);
+    auto grad_scale =  (logbase_bcast * sign_decay_bcast * sign_gm).exp();
+    var.device(d) -= lr_bcast * grad_scale * grad;
+  }
+};
+
 }  // namespace functor
 
 template struct functor::ApplyGradientDescent<GPUDevice, Eigen::half>;
@@ -222,6 +287,15 @@ template struct functor::ApplyRMSProp<GPUDevice, double>;
 template struct functor::ApplyCenteredRMSProp<GPUDevice, Eigen::half>;
 template struct functor::ApplyCenteredRMSProp<GPUDevice, float>;
 template struct functor::ApplyCenteredRMSProp<GPUDevice, double>;
+
+template struct functor::ApplyAddSign<GPUDevice, Eigen::half>;
+template struct functor::ApplyAddSign<GPUDevice, float>;
+template struct functor::ApplyAddSign<GPUDevice, double>;
+
+template struct functor::ApplyPowerSign<GPUDevice, Eigen::half>;
+template struct functor::ApplyPowerSign<GPUDevice, float>;
+template struct functor::ApplyPowerSign<GPUDevice, double>;
+
 }  // end namespace tensorflow
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/training_ops_test.cc b/tensorflow/core/kernels/training_ops_test.cc
index 4b1c9eb8bb6f06b5827dafa423f83a8e0184dcd6..ffa7f87c9efda0e3288b9fb06d0c9d1a3dcba277 100644
--- a/tensorflow/core/kernels/training_ops_test.cc
+++ b/tensorflow/core/kernels/training_ops_test.cc
@@ -233,4 +233,78 @@ static void BM_RMSProp(int iters, int params) {
 }
 BENCHMARK(BM_RMSProp)->Arg(128 << 10)->Arg(256 << 10);
 
+static void AddSign(int32 n, Graph** init_g, Graph** train_g) {
+  TensorShape shape({n});
+  {
+    Graph* g = new Graph(OpRegistry::Global());
+    auto var = Var(g, n);
+    auto m = Var(g, n);
+    auto zero = Zeros(g, n);
+    test::graph::Assign(g, var, zero);
+    test::graph::Assign(g, m, zero);
+    *init_g = g;
+  }
+  {
+    Graph* g = new Graph(OpRegistry::Global());
+    auto var = Var(g, n);
+    auto m = Var(g, n);
+    auto lr = Scalar(g, 0.01);
+    auto alpha = Scalar(g, 0.1);
+    auto sign_decay = Scalar(g, 0.9);
+    auto beta = Scalar(g, 0.8);
+    auto grad = Random(g, n);
+    test::graph::Multi(g, "ApplyAddSign",
+                       {var, m, lr, alpha, sign_decay, beta, grad});
+    *train_g = g;
+  }
+}
+
+static void BM_AddSign(int iters, int params) {
+  const int64 tot = static_cast<int64>(iters) * params;
+  testing::ItemsProcessed(tot);
+  testing::BytesProcessed(tot * sizeof(float));
+  Graph* init;
+  Graph* train;
+  AddSign(params, &init, &train);
+  test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
+}
+BENCHMARK(BM_AddSign)->Arg(128 << 10)->Arg(256 << 10);
+
+static void PowerSign(int32 n, Graph** init_g, Graph** train_g) {
+  TensorShape shape({n});
+  {
+    Graph* g = new Graph(OpRegistry::Global());
+    auto var = Var(g, n);
+    auto m = Var(g, n);
+    auto zero = Zeros(g, n);
+    test::graph::Assign(g, var, zero);
+    test::graph::Assign(g, m, zero);
+    *init_g = g;
+  }
+  {
+    Graph* g = new Graph(OpRegistry::Global());
+    auto var = Var(g, n);
+    auto m = Var(g, n);
+    auto lr = Scalar(g, 0.01);
+    auto logbase = Scalar(g, 2);
+    auto sign_decay = Scalar(g, 0.9);
+    auto beta = Scalar(g, 0.8);
+    auto grad = Random(g, n);
+    test::graph::Multi(g, "ApplyPowerSign",
+                       {var, m, lr, logbase, sign_decay, beta, grad});
+    *train_g = g;
+  }
+}
+
+static void BM_PowerSign(int iters, int params) {
+  const int64 tot = static_cast<int64>(iters) * params;
+  testing::ItemsProcessed(tot);
+  testing::BytesProcessed(tot * sizeof(float));
+  Graph* init;
+  Graph* train;
+  PowerSign(params, &init, &train);
+  test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
+}
+BENCHMARK(BM_PowerSign)->Arg(128 << 10)->Arg(256 << 10);
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc
index d087784c8a0bd2a53438af4582754b2d47620545..782470210f7869cc1291adb663dbdc61e9f32da0 100644
--- a/tensorflow/core/kernels/unique_op.cc
+++ b/tensorflow/core/kernels/unique_op.cc
@@ -133,11 +133,7 @@ class UniqueOp : public OpKernel {
     auto Tout = output->shaped<T, 3>(new_sizes);
 
     for (auto it : uniq) {
-      for (int64 i = 0; i < Tin.dimension(0); i++) {
-        for (int64 j = 0; j < Tin.dimension(2); j++) {
-          Tout(i, it.second, j) = Tin(i, it.first, j);
-        }
-      }
+      Tout.chip(it.second, 1) = Tin.chip(it.first, 1);
     }
 
     if (num_outputs() > 2) {
diff --git a/tensorflow/core/kernels/unpack_op.cc b/tensorflow/core/kernels/unpack_op.cc
index 7fd1def1fe02e8418882bc4cb19c4318779c5282..397bdd56708d766d06e5a68f3b049a5b928195e1 100644
--- a/tensorflow/core/kernels/unpack_op.cc
+++ b/tensorflow/core/kernels/unpack_op.cc
@@ -142,6 +142,7 @@ TF_CALL_ALL_TYPES(REGISTER_UNPACK);
       UnpackOp<GPUDevice, type>)
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+TF_CALL_bfloat16(REGISTER_GPU);
 #undef REGISTER_GPU
 
 // A special GPU kernel for int32.
@@ -153,6 +154,12 @@ REGISTER_KERNEL_BUILDER(Name("Unpack")
                             .HostMemory("output")
                             .TypeConstraint<int32>("T"),
                         UnpackOp<CPUDevice, int32>);
+REGISTER_KERNEL_BUILDER(Name("Unpack")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("value")
+                            .HostMemory("output")
+                            .TypeConstraint<int64>("T"),
+                        UnpackOp<CPUDevice, int64>);
 
 #endif  // GOOGLE_CUDA
 
@@ -170,6 +177,13 @@ REGISTER_KERNEL_BUILDER(Name("Unpack")
                             .HostMemory("output")
                             .TypeConstraint<int32>("T"),
                         UnpackOp<CPUDevice, int32>);
+
+REGISTER_KERNEL_BUILDER(Name("Unpack")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("value")
+                            .HostMemory("output")
+                            .TypeConstraint<int64>("T"),
+                        UnpackOp<CPUDevice, int64>);
 #undef REGISTER_SYCL
 #endif  // TENSORFLOW_USE_SYCL
 
diff --git a/tensorflow/core/kernels/variable_ops.cc b/tensorflow/core/kernels/variable_ops.cc
index 36b8ff09d7381a0b8bbb8b6f8d71b14e47fa4663..1b7079dcbae34de683951979cbf692d954a966ee 100644
--- a/tensorflow/core/kernels/variable_ops.cc
+++ b/tensorflow/core/kernels/variable_ops.cc
@@ -23,6 +23,177 @@ limitations under the License.
 
 namespace tensorflow {
 
+// Resource stored by variables in the resource manager
+// (legacy, ref-style version).
+class LegacyVar : public ResourceBase {
+ public:
+  explicit LegacyVar(DataType dtype) : tensor_(dtype) {}
+  // Not copyable or movable.
+  LegacyVar(const LegacyVar&) = delete;
+  LegacyVar& operator=(const LegacyVar&) = delete;
+
+  mutex* mu() { return &mu_; }
+  Tensor* tensor() { return &tensor_; }
+
+  string DebugString() override {
+    return strings::StrCat(DataTypeString(tensor_.dtype()), "/",
+                           tensor_.shape().DebugString());
+  }
+
+ private:
+  mutex mu_;
+  Tensor tensor_;
+
+  ~LegacyVar() override {}
+};
+
+VariableOp::VariableOp(OpKernelConstruction* context) : OpKernel(context) {
+  OP_REQUIRES_OK(context, context->GetAttr("shape", &shape_));
+  dtype_ = RemoveRefType(context->output_type(0));
+}
+
+void VariableOp::Compute(OpKernelContext* ctx) {
+  mutex_lock l(init_mu_);
+  if (!initialized_) {
+    OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def(),
+                                    true /* use name() */));
+    initialized_ = true;
+  }
+  auto creator = [this](LegacyVar** var) {
+    *var = new LegacyVar(dtype_);
+    (*var)->tensor()->set_shape(shape_);
+    return Status::OK();
+  };
+  LegacyVar* var;
+  OP_REQUIRES_OK(ctx, cinfo_.resource_manager()->LookupOrCreate<LegacyVar>(
+                          cinfo_.container(), cinfo_.name(), &var, creator));
+  // Output a reference to our tensor, so it may be updated.
+  //
+  // As long as the resource manager hasn't been cleared the ref we return
+  // here is valid because it owns a ref on var.
+  ctx->set_output_ref(0, var->mu(), var->tensor());
+  if (ctx->track_allocations() && var->tensor()->IsInitialized()) {
+    AllocatorAttributes attr;
+    attr.set_gpu_compatible(true);
+    attr.set_nic_compatible(true);
+    if (ctx->allocate_on_host(attr)) {
+      ctx->record_host_persistent_memory_allocation(
+          var->tensor()->AllocatedBytes());
+    } else {
+      ctx->record_device_persistent_memory_allocation(
+          var->tensor()->AllocatedBytes());
+    }
+  }
+  var->Unref();
+}
+
+class TemporaryVariableOp : public OpKernel {
+ public:
+  explicit TemporaryVariableOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("shape", &shape_));
+    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
+    OP_REQUIRES_OK(context, context->GetAttr("var_name", &var_name_));
+    // Variable name defaults to op name if not specified explicitly.
+    if (var_name_.empty()) var_name_ = name();
+  }
+
+  void Compute(OpKernelContext* context) override {
+    Status s;
+    ResourceMgr* rm = context->resource_manager();
+    OP_REQUIRES(context, rm, errors::Internal("No per-step resource manager."));
+    auto* tmp_var = new TmpVar;
+    OP_REQUIRES(context, tmp_var,
+                errors::ResourceExhausted("Could not allocate TmpVar."));
+    tmp_var->name = var_name_;
+    s = context->allocate_temp(dtype_, shape_, &tmp_var->val);
+    if (!s.ok()) tmp_var->Unref();
+    OP_REQUIRES_OK(context, s);
+    OP_REQUIRES_OK(context, rm->Create(context->step_container()->name(),
+                                       var_name_, tmp_var));
+    context->set_output_ref(0, &tmp_var->mu, &tmp_var->val);
+    if (context->track_allocations()) {
+      AllocatorAttributes attr;
+      if (context->allocate_on_host(attr)) {
+        context->record_host_persistent_memory_allocation(
+            tmp_var->val.AllocatedBytes());
+      } else {
+        context->record_device_persistent_memory_allocation(
+            tmp_var->val.AllocatedBytes());
+      }
+    }
+  }
+
+ private:
+  // Refcounted temporary variable resource.
+  friend class DestroyTemporaryVariableOp;
+  struct TmpVar : public ResourceBase {
+    mutex mu;
+    Tensor val;
+    string name;
+    string DebugString() override { return name; }
+    ~TmpVar() override { VLOG(3) << "TmpVar " << name << " deleted"; }
+  };
+
+  TensorShape shape_;
+  DataType dtype_;
+  string var_name_;
+};
+
+class DestroyTemporaryVariableOp : public OpKernel {
+ public:
+  explicit DestroyTemporaryVariableOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES(context, IsRefType(context->input_type(0)),
+                errors::InvalidArgument("lhs input needs to be a ref type"));
+    OP_REQUIRES_OK(context, context->GetAttr("var_name", &var_name_));
+    OP_REQUIRES(context, !var_name_.empty(),
+                errors::InvalidArgument("Missing var_name attribute"));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    // NOTE(pbar): All other mutators of the Tensor Ref *must* have completed
+    // their execution before this DestroyTemporaryVariable op executes.
+    // This is typically achieved using control dependencies.
+    CHECK(IsRefType(context->input_dtype(0)));
+    Tensor tmpvar = context->mutable_input(0, false);
+    context->set_output(0, tmpvar);
+    ResourceMgr* rm = context->resource_manager();
+    OP_REQUIRES(context, rm, errors::Internal("No per-step resource manager."));
+    OP_REQUIRES_OK(context, rm->Delete<TemporaryVariableOp::TmpVar>(
+                                context->step_container()->name(), var_name_));
+    if (context->track_allocations()) {
+      if (context->allocate_on_host(AllocatorAttributes())) {
+        context->record_host_persistent_memory_allocation(
+            -static_cast<int64>(tmpvar.AllocatedBytes()));
+      } else {
+        context->record_device_persistent_memory_allocation(
+            -static_cast<int64>(tmpvar.AllocatedBytes()));
+      }
+    }
+  }
+
+ private:
+  string var_name_;
+};
+
+class IsVariableInitializedOp : public OpKernel {
+ public:
+  explicit IsVariableInitializedOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    // Get a mutable input tensor of the Ref input.
+    const Tensor& input_tensor = context->mutable_input(0, false);
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, TensorShape({}), &output));
+    auto output_tensor = output->tensor<bool, 0>();
+    bool result = input_tensor.IsInitialized();
+    output_tensor() = result;
+  }
+};
+
 REGISTER_KERNEL_BUILDER(Name("Variable").Device(DEVICE_CPU), VariableOp);
 REGISTER_KERNEL_BUILDER(Name("VariableV2").Device(DEVICE_CPU), VariableOp);
 REGISTER_KERNEL_BUILDER(Name("TemporaryVariable").Device(DEVICE_CPU),
@@ -33,30 +204,30 @@ REGISTER_KERNEL_BUILDER(Name("IsVariableInitialized").Device(DEVICE_CPU),
                         IsVariableInitializedOp);
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type)                                         \
-  REGISTER_KERNEL_BUILDER(                                                 \
-      Name("Variable").Device(DEVICE_SYCL).TypeConstraint<type>("dtype"),  \
-      VariableOp);                                                         \
-  REGISTER_KERNEL_BUILDER(                                                 \
-      Name("VariableV2").Device(DEVICE_SYCL).TypeConstraint<type>("dtype"),\
-      VariableOp);                                                         \
-  REGISTER_KERNEL_BUILDER(Name("TemporaryVariable")                        \
-                              .Device(DEVICE_SYCL)                         \
-                              .TypeConstraint<type>("dtype"),              \
-                          TemporaryVariableOp);                            \
-  REGISTER_KERNEL_BUILDER(Name("DestroyTemporaryVariable")                 \
-                              .Device(DEVICE_SYCL)                         \
-                              .TypeConstraint<type>("T"),                  \
-                          DestroyTemporaryVariableOp);                     \
-  REGISTER_KERNEL_BUILDER(Name("IsVariableInitialized")                    \
-                              .Device(DEVICE_SYCL)                         \
-                              .TypeConstraint<type>("dtype")               \
-                              .HostMemory("is_initialized"),               \
+#define REGISTER_SYCL_KERNEL(type)                                          \
+  REGISTER_KERNEL_BUILDER(                                                  \
+      Name("Variable").Device(DEVICE_SYCL).TypeConstraint<type>("dtype"),   \
+      VariableOp);                                                          \
+  REGISTER_KERNEL_BUILDER(                                                  \
+      Name("VariableV2").Device(DEVICE_SYCL).TypeConstraint<type>("dtype"), \
+      VariableOp);                                                          \
+  REGISTER_KERNEL_BUILDER(Name("TemporaryVariable")                         \
+                              .Device(DEVICE_SYCL)                          \
+                              .TypeConstraint<type>("dtype"),               \
+                          TemporaryVariableOp);                             \
+  REGISTER_KERNEL_BUILDER(Name("DestroyTemporaryVariable")                  \
+                              .Device(DEVICE_SYCL)                          \
+                              .TypeConstraint<type>("T"),                   \
+                          DestroyTemporaryVariableOp);                      \
+  REGISTER_KERNEL_BUILDER(Name("IsVariableInitialized")                     \
+                              .Device(DEVICE_SYCL)                          \
+                              .TypeConstraint<type>("dtype")                \
+                              .HostMemory("is_initialized"),                \
                           IsVariableInitializedOp);
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNEL);
 #undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 // Only register 'Variable' on GPU for the subset of types also supported by
diff --git a/tensorflow/core/kernels/variable_ops.h b/tensorflow/core/kernels/variable_ops.h
index 355140d44c5c53c8496d5bd2b3028e9ae9b3940b..83134bad378bfef18c3e93be5cc3c6b70ab4f523 100644
--- a/tensorflow/core/kernels/variable_ops.h
+++ b/tensorflow/core/kernels/variable_ops.h
@@ -27,10 +27,16 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Resource stored by variables in the resource manager.
+// Resource stored by variables in the resource manager
+// (new, resource-style version).
 class Var : public ResourceBase {
  public:
   explicit Var(DataType dtype) : tensor_(dtype) {}
+  // Not copyable or movable.
+  Var(const Var&) = delete;
+  Var& operator=(const Var&) = delete;
+
+  // TODO(ebrevdo): Use LockSet instead of exposing mu.
   mutex* mu() { return &mu_; }
   Tensor* tensor() { return &tensor_; }
 
@@ -44,52 +50,12 @@ class Var : public ResourceBase {
   Tensor tensor_;
 
   ~Var() override {}
-  TF_DISALLOW_COPY_AND_ASSIGN(Var);
 };
 
 class VariableOp : public OpKernel {
  public:
-  explicit VariableOp(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("shape", &shape_));
-    dtype_ = RemoveRefType(context->output_type(0));
-  }
-
-  void Compute(OpKernelContext* ctx) override {
-    mutex_lock l(init_mu_);
-    if (!initialized_) {
-      OP_REQUIRES_OK(
-          ctx,
-          cinfo_.Init(ctx->resource_manager(), def(), true /* use name() */));
-      initialized_ = true;
-    }
-    auto creator = [this](Var** var) {
-      *var = new Var(dtype_);
-      (*var)->tensor()->set_shape(shape_);
-      return Status::OK();
-    };
-    Var* var;
-    OP_REQUIRES_OK(ctx,
-                   cinfo_.resource_manager()->LookupOrCreate<Var>(
-                       cinfo_.container(), cinfo_.name(), &var, creator));
-    // Output a reference to our tensor, so it may be updated.
-    //
-    // As long as the resource manager hasn't been cleared the ref we return
-    // here is valid because it owns a ref on var.
-    ctx->set_output_ref(0, var->mu(), var->tensor());
-    if (ctx->track_allocations() && var->tensor()->IsInitialized()) {
-      AllocatorAttributes attr;
-      attr.set_gpu_compatible(true);
-      attr.set_nic_compatible(true);
-      if (ctx->allocate_on_host(attr)) {
-        ctx->record_host_persistent_memory_allocation(
-            var->tensor()->AllocatedBytes());
-      } else {
-        ctx->record_device_persistent_memory_allocation(
-            var->tensor()->AllocatedBytes());
-      }
-    }
-    var->Unref();
-  }
+  explicit VariableOp(OpKernelConstruction* context);
+  void Compute(OpKernelContext* ctx) override;
 
  private:
   DataType dtype_;
@@ -102,112 +68,6 @@ class VariableOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(VariableOp);
 };
 
-class TemporaryVariableOp : public OpKernel {
- public:
-  explicit TemporaryVariableOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("shape", &shape_));
-    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
-    OP_REQUIRES_OK(context, context->GetAttr("var_name", &var_name_));
-    // Variable name defaults to op name if not specified explicitly.
-    if (var_name_ == "") var_name_ = name();
-  }
-
-  void Compute(OpKernelContext* context) override {
-    Status s;
-    ResourceMgr* rm = context->resource_manager();
-    OP_REQUIRES(context, rm, errors::Internal("No per-step resource manager."));
-    auto* tmp_var = new TmpVar;
-    OP_REQUIRES(context, tmp_var,
-                errors::ResourceExhausted("Could not allocate TmpVar."));
-    tmp_var->name = var_name_;
-    s = context->allocate_temp(dtype_, shape_, &tmp_var->val);
-    if (!s.ok()) tmp_var->Unref();
-    OP_REQUIRES_OK(context, s);
-    OP_REQUIRES_OK(context, rm->Create(context->step_container()->name(),
-                                       var_name_, tmp_var));
-    context->set_output_ref(0, &tmp_var->mu, &tmp_var->val);
-    if (context->track_allocations()) {
-      AllocatorAttributes attr;
-      if (context->allocate_on_host(attr)) {
-        context->record_host_persistent_memory_allocation(
-            tmp_var->val.AllocatedBytes());
-      } else {
-        context->record_device_persistent_memory_allocation(
-            tmp_var->val.AllocatedBytes());
-      }
-    }
-  }
-
- private:
-  // Refcounted temporary variable resource.
-  friend class DestroyTemporaryVariableOp;
-  struct TmpVar : public ResourceBase {
-    mutex mu;
-    Tensor val;
-    string name;
-    string DebugString() override { return name; }
-    ~TmpVar() override { VLOG(3) << "TmpVar " << name << " deleted"; }
-  };
-
-  TensorShape shape_;
-  DataType dtype_;
-  string var_name_;
-};
-
-class DestroyTemporaryVariableOp : public OpKernel {
- public:
-  explicit DestroyTemporaryVariableOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    OP_REQUIRES(context, IsRefType(context->input_type(0)),
-                errors::InvalidArgument("lhs input needs to be a ref type"))
-    OP_REQUIRES_OK(context, context->GetAttr("var_name", &var_name_));
-    OP_REQUIRES(context, var_name_ != "",
-                errors::InvalidArgument("Missing var_name attribute"));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    // NOTE(pbar): All other mutators of the Tensor Ref *must* have completed
-    // their execution before this DestroyTemporaryVariable op executes.
-    // This is typically achieved using control dependencies.
-    CHECK(IsRefType(context->input_dtype(0)));
-    Tensor tmpvar = context->mutable_input(0, false);
-    context->set_output(0, tmpvar);
-    ResourceMgr* rm = context->resource_manager();
-    OP_REQUIRES(context, rm, errors::Internal("No per-step resource manager."));
-    OP_REQUIRES_OK(context, rm->Delete<TemporaryVariableOp::TmpVar>(
-                                context->step_container()->name(), var_name_));
-    if (context->track_allocations()) {
-      if (context->allocate_on_host(AllocatorAttributes())) {
-        context->record_host_persistent_memory_allocation(
-            -static_cast<int64>(tmpvar.AllocatedBytes()));
-      } else {
-        context->record_device_persistent_memory_allocation(
-            -static_cast<int64>(tmpvar.AllocatedBytes()));
-      }
-    }
-  }
-
- private:
-  string var_name_;
-};
-
-class IsVariableInitializedOp : public OpKernel {
- public:
-  IsVariableInitializedOp(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* context) override {
-    // Get a mutable input tensor of the Ref input.
-    const Tensor& input_tensor = context->mutable_input(0, false);
-    Tensor* output = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, TensorShape({}), &output));
-    auto output_tensor = output->tensor<bool, 0>();
-    bool result = input_tensor.IsInitialized();
-    output_tensor() = result;
-  }
-};
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_KERNELS_VARIABLE_OPS_H_
diff --git a/tensorflow/core/kernels/xsmm_conv2d_test.cc b/tensorflow/core/kernels/xsmm_conv2d_test.cc
index 381ea39b77c26e16ca8727d23dfd90c46b9e4b9a..e29470124674636a0e125a5cd1b856a467f4c6f0 100644
--- a/tensorflow/core/kernels/xsmm_conv2d_test.cc
+++ b/tensorflow/core/kernels/xsmm_conv2d_test.cc
@@ -73,7 +73,7 @@ LIBXSMM_INLINE void naive_copy_KCRS_to_RSCK(const float* kcrs, Tensor  &rsck, in
   LIBXSMM_VLA_DECL(4, const float,  input, kcrs, C, R, S);
   int r, s, c, k;
   auto output =  rsck.flat<float>();
- 
+
   for ( r = 0; r < R; r++ ) {
     for ( s = 0; s < S; s++ ) {
       for ( c = 0; c < C; c++ ) {
@@ -94,14 +94,14 @@ LIBXSMM_INLINE void zero_buf(float* buf, long size) {
     buf[i] = 0.0f;
   }
 }
- 
+
 LIBXSMM_INLINE void copy_buf(Tensor &dst,float *src,long size) {
   long  i;
   auto output =  dst.flat<float>();
-  for (i = 0; i < size; ++i) 
+  for (i = 0; i < size; ++i)
           output(i) = src[i];
 }
- 
+
 LIBXSMM_INLINE void init_buf(float* buf, long size, int initPos, int initOne)
 {
   int i;
@@ -110,7 +110,7 @@ LIBXSMM_INLINE void init_buf(float* buf, long size, int initPos, int initOne)
     buf[i] = (float)((initOne != 0) ? 1.0 : ((initPos != 0) ? drand48() : (0.05 - drand48()/10.0)));
   }
 }
- 
+
 
 
 LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float* output, const float* filter)
@@ -138,11 +138,11 @@ LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float
   int stride_w  = param->stride_w;
   /* loop counters */
   int img, ofm, ifm, oj, oi, ij, ii, kj, ki;
- 
+
   LIBXSMM_VLA_DECL(4,       float, output_t, output + (pad_w_out * ofwp + pad_h_out), nOfm, ofhp, ofwp);
   LIBXSMM_VLA_DECL(4, const float,  input_t,  input + (pad_w_in * ifwp + pad_h_in), nIfm, ifhp, ifwp);
   LIBXSMM_VLA_DECL(4, const float, filter_t, filter, nIfm, kh, kw);
- 
+
   for (img = 0; img < nImg; ++img) {
     for (ofm = 0; ofm < nOfm; ++ofm) {
       for (ifm = 0; ifm < nIfm; ++ifm) {
@@ -172,7 +172,7 @@ void RunXsmmVsGeneric() {}
 class XsmmConv2DTest : public OpsTestBase {
  protected:
   void MakeOp(int stride) {
-  
+
     TF_CHECK_OK(NodeDefBuilder("xsmm", "Conv2D")
                       .Input(FakeInput(DT_FLOAT))
                       .Input(FakeInput(DT_FLOAT))
@@ -184,7 +184,7 @@ class XsmmConv2DTest : public OpsTestBase {
     TF_ASSERT_OK(InitOp());
   }
 };
- 
+
 TEST_F(XsmmConv2DTest, Basic) {
      MakeOp(1);
 
@@ -206,13 +206,13 @@ TEST_F(XsmmConv2DTest, Basic) {
      int stride_h = stride;
      int pad_h = pad;
      int pad_w = pad;
- 
+
      int pad_h_in = pad_h;
      int pad_w_in = pad_w;
- 
+
      int pad_h_out = 0;
      int pad_w_out = 0;
- 
+
   /* deriving some values for naive code */
      int ofh = (ifh + 2 * pad_h - kh) / stride_h + 1;
      int ofw = (ifw + 2 * pad_w - kw) / stride_w + 1;
@@ -223,7 +223,7 @@ TEST_F(XsmmConv2DTest, Basic) {
 
 
     //Initialization of Filter and Image
-    
+
     /* allocate data */
      float *naive_input           = (float*)libxsmm_aligned_scratch( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152);
      float *naive_output          = (float*)libxsmm_aligned_scratch( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152);
@@ -232,21 +232,21 @@ TEST_F(XsmmConv2DTest, Basic) {
      init_buf(naive_input,          nImg*nIfm*ifhp*ifwp, 0, 0);
      zero_buf(naive_output,         nImg*nOfm*ofhp*ofwp);
      init_buf(naive_filter,         nOfm*nIfm*kh*kw, 0, 0);
-        
+
 
      Tensor image(DT_FLOAT,
                  {nImg, ifhp, ifwp, nIfm});
- 
- 
+
+
      Tensor filter(DT_FLOAT, {kh,kw,nIfm,nOfm});
- 
+
 
      naive_copy_NCHW_to_NHWC(naive_input, image, nImg, ifhp, ifwp, nIfm);
-     naive_copy_KCRS_to_RSCK(naive_filter, filter, kh, kw, nIfm, nOfm); 
+     naive_copy_KCRS_to_RSCK(naive_filter, filter, kh, kw, nIfm, nOfm);
 
 
     //Run naive convolution
-    
+
      naive_conv_t naive_param;
 
      naive_param.nImg = nImg;
@@ -274,8 +274,8 @@ TEST_F(XsmmConv2DTest, Basic) {
 
      naive_conv_fp(&naive_param, naive_input, naive_output, naive_filter);
 
- 
- 
+
+
      AddInputFromArray<float>(image.shape(), image.flat<float>());
      AddInputFromArray<float>(filter.shape(), filter.flat<float>());
 
@@ -283,7 +283,7 @@ TEST_F(XsmmConv2DTest, Basic) {
 
      //Run Op (TF)
      TF_ASSERT_OK(RunOpKernel());
- 
+
      // Check the output.
      Tensor expected(DT_FLOAT, {nImg,ofhp,ofwp, nOfm});
      naive_copy_NCHW_to_NHWC(naive_output, expected, nImg, ofhp, ofwp, nOfm);
@@ -329,15 +329,15 @@ TEST(XsmmConv2DTest, Basic) {
     desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
     desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
     desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
- 
+
     if (!CanUseXsmmConv2D(desc, data_format)) {
       return false;
     }
- 
+
     auto input_ptr = input.template flat<float>().data();
     auto filter_ptr = filter.template flat<float>().data();
     auto output_ptr = output->template flat<float>().data();
- 
+
     bool success = functor::XsmmFwdConv2D<CPUDevice, float>()(
         ctx, desc, input_ptr, filter_ptr, output_ptr);
     return success;
diff --git a/tensorflow/core/lib/core/arena.cc b/tensorflow/core/lib/core/arena.cc
index 2a04f7bd39df98a97ec7ed0f82dfdfbd8222a2da..55e481d0e60a004f2baebdcac444dd7e7cf93e66 100644
--- a/tensorflow/core/lib/core/arena.cc
+++ b/tensorflow/core/lib/core/arena.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include <algorithm>
 #include <vector>
 
+#include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mem.h"
@@ -113,24 +114,11 @@ void Arena::MakeNewBlock(const uint32 alignment) {
   CHECK(SatisfyAlignment(alignment));
 }
 
-// The following simple numeric routines also exist in util/math/mathutil.h
-// but we don't want to depend on that library.
-
-// Euclid's algorithm for Greatest Common Denominator.
-static uint32 GCD(uint32 x, uint32 y) {
-  while (y != 0) {
-    uint32 r = x % y;
-    x = y;
-    y = r;
-  }
-  return x;
-}
-
 static uint32 LeastCommonMultiple(uint32 a, uint32 b) {
   if (a > b) {
-    return (a / GCD(a, b)) * b;
+    return (a / MathUtil::GCD<uint32>(a, b)) * b;
   } else if (a < b) {
-    return (b / GCD(b, a)) * a;
+    return (b / MathUtil::GCD<uint32>(b, a)) * a;
   } else {
     return a;
   }
diff --git a/tensorflow/core/lib/core/status.h b/tensorflow/core/lib/core/status.h
index 3b8a322854f562c0b066e6175e23697ca6445633..58a50a70c26a63a9edd55349e2253a9ace16f1f2 100644
--- a/tensorflow/core/lib/core/status.h
+++ b/tensorflow/core/lib/core/status.h
@@ -127,9 +127,9 @@ inline tensorflow::string* TfCheckOpHelper(::tensorflow::Status v,
   return TfCheckOpHelperOutOfLine(v, msg);
 }
 
-#define TF_DO_CHECK_OK(val, level)                  \
-  while (auto _result = TfCheckOpHelper(val, #val)) \
-    LOG(level) << *(_result)
+#define TF_DO_CHECK_OK(val, level)                                \
+  while (auto _result = ::tensorflow::TfCheckOpHelper(val, #val)) \
+  LOG(level) << *(_result)
 
 #define TF_CHECK_OK(val)  TF_DO_CHECK_OK(val, FATAL)
 #define TF_QCHECK_OK(val) TF_DO_CHECK_OK(val, QFATAL)
diff --git a/tensorflow/core/lib/core/stringpiece.cc b/tensorflow/core/lib/core/stringpiece.cc
index 984f4404ce2c6b35611c0db470d127fb92d1e5be..29b727fc4463d933ceeb402c5dd92f3ea5b8a62a 100644
--- a/tensorflow/core/lib/core/stringpiece.cc
+++ b/tensorflow/core/lib/core/stringpiece.cc
@@ -21,7 +21,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-size_t StringPiece::Hasher::operator()(StringPiece s) const {
+size_t StringPieceHasher::operator()(StringPiece s) const {
   return Hash64(s.data(), s.size());
 }
 
diff --git a/tensorflow/core/lib/core/stringpiece.h b/tensorflow/core/lib/core/stringpiece.h
index 94f4a377f1dc26d9d66712a0980ff278a543b70a..caa9642774bebec05a28b7a0c2ea71d18d6ebd1a 100644
--- a/tensorflow/core/lib/core/stringpiece.h
+++ b/tensorflow/core/lib/core/stringpiece.h
@@ -35,12 +35,14 @@ limitations under the License.
 
 namespace tensorflow {
 
+struct StringPieceHasher;
+
 class StringPiece {
  public:
   typedef size_t size_type;
 
   // Create an empty slice.
-  StringPiece() : data_(""), size_(0) {}
+  StringPiece() : data_(nullptr), size_(0) {}
 
   // Create a slice that refers to d[0,n-1].
   StringPiece(const char* d, size_t n) : data_(d), size_(n) {}
@@ -103,10 +105,6 @@ class StringPiece {
 
   StringPiece substr(size_t pos, size_t n = npos) const;
 
-  struct Hasher {
-    size_t operator()(StringPiece arg) const;
-  };
-
   // Return a string that contains the copy of the referenced data.
   std::string ToString() const { return std::string(data_, size_); }
 
@@ -133,6 +131,10 @@ class StringPiece {
   // Intentionally copyable
 };
 
+struct StringPieceHasher {
+  size_t operator()(StringPiece s) const;
+};
+
 inline bool operator==(StringPiece x, StringPiece y) {
   return ((x.size() == y.size()) &&
           (memcmp(x.data(), y.data(), x.size()) == 0));
diff --git a/tensorflow/core/lib/core/stringpiece_test.cc b/tensorflow/core/lib/core/stringpiece_test.cc
index ad70d418732af94ecc162f8ef096796138ebbcb7..8f17b85b6d7941d7084ce4e142de4ad33f1e8202 100644
--- a/tensorflow/core/lib/core/stringpiece_test.cc
+++ b/tensorflow/core/lib/core/stringpiece_test.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/lib/core/stringpiece.h"
+
+#include <unordered_map>
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -63,4 +65,74 @@ TEST(StringPiece, Contains) {
   EXPECT_TRUE(!a.contains(d));
 }
 
+TEST(StringPieceHasher, Equality) {
+  StringPieceHasher hasher;
+
+  StringPiece s1("foo");
+  StringPiece s2("bar");
+  StringPiece s3("baz");
+  StringPiece s4("zot");
+
+  EXPECT_TRUE(hasher(s1) != hasher(s2));
+  EXPECT_TRUE(hasher(s1) != hasher(s3));
+  EXPECT_TRUE(hasher(s1) != hasher(s4));
+  EXPECT_TRUE(hasher(s2) != hasher(s3));
+  EXPECT_TRUE(hasher(s2) != hasher(s4));
+  EXPECT_TRUE(hasher(s3) != hasher(s4));
+
+  EXPECT_TRUE(hasher(s1) == hasher(s1));
+  EXPECT_TRUE(hasher(s2) == hasher(s2));
+  EXPECT_TRUE(hasher(s3) == hasher(s3));
+  EXPECT_TRUE(hasher(s4) == hasher(s4));
+}
+
+TEST(StringPieceHasher, HashMap) {
+  string s1("foo");
+  string s2("bar");
+  string s3("baz");
+
+  StringPiece p1(s1);
+  StringPiece p2(s2);
+  StringPiece p3(s3);
+
+  std::unordered_map<StringPiece, int, StringPieceHasher> map;
+
+  map.insert(std::make_pair(p1, 0));
+  map.insert(std::make_pair(p2, 1));
+  map.insert(std::make_pair(p3, 2));
+  EXPECT_EQ(map.size(), 3);
+
+  bool found[3] = {false, false, false};
+  for (auto const& val : map) {
+    int x = val.second;
+    EXPECT_TRUE(x >= 0 && x < 3);
+    EXPECT_TRUE(!found[x]);
+    found[x] = true;
+  }
+  EXPECT_EQ(found[0], true);
+  EXPECT_EQ(found[1], true);
+  EXPECT_EQ(found[2], true);
+
+  auto new_iter = map.find("zot");
+  EXPECT_TRUE(new_iter == map.end());
+
+  new_iter = map.find("bar");
+  EXPECT_TRUE(new_iter != map.end());
+
+  map.erase(new_iter);
+  EXPECT_EQ(map.size(), 2);
+
+  found[0] = false;
+  found[1] = false;
+  found[2] = false;
+  for (const auto& iter : map) {
+    int x = iter.second;
+    EXPECT_TRUE(x >= 0 && x < 3);
+    EXPECT_TRUE(!found[x]);
+    found[x] = true;
+  }
+  EXPECT_EQ(found[0], true);
+  EXPECT_EQ(found[1], false);
+  EXPECT_EQ(found[2], true);
+}
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/db/sqlite.cc b/tensorflow/core/lib/db/sqlite.cc
index 701655f622a7ec0288f1cb53818877e65839643e..23361e64312a00658077d197650b0f9561bec40b 100644
--- a/tensorflow/core/lib/db/sqlite.cc
+++ b/tensorflow/core/lib/db/sqlite.cc
@@ -18,15 +18,36 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
+namespace {
+
+void ExecuteOrLog(Sqlite* db, const char* sql) {
+  Status s = db->Prepare(sql).StepAndReset();
+  if (!s.ok()) {
+    LOG(WARNING) << s.ToString();
+  }
+}
+
+string ExecuteOrEmpty(Sqlite* db, const char* sql) {
+  auto stmt = db->Prepare(sql);
+  bool is_done = false;
+  if (stmt.Step(&is_done).ok() && !is_done) {
+    return stmt.ColumnString(0);
+  }
+  return "";
+}
+
+}  // namespace
 
 /* static */
 xla::StatusOr<std::shared_ptr<Sqlite>> Sqlite::Open(const string& uri) {
   sqlite3* sqlite = nullptr;
-  Status s = MakeStatus(sqlite3_open(uri.c_str(), &sqlite));
-  if (s.ok()) {
-    return std::shared_ptr<Sqlite>(new Sqlite(sqlite));
-  }
-  return s;
+  TF_RETURN_IF_ERROR(MakeStatus(sqlite3_open(uri.c_str(), &sqlite)));
+  Sqlite* db = new Sqlite(sqlite, uri);
+  // This is the SQLite default since 2016. However it's good to set
+  // this anyway, since we might get linked against an older version of
+  // the library, and it's pretty much impossible to change later.
+  ExecuteOrLog(db, "PRAGMA page_size=4096");
+  return std::shared_ptr<Sqlite>(db);
 }
 
 /* static */ Status Sqlite::MakeStatus(int resultCode) {
@@ -75,7 +96,7 @@ xla::StatusOr<std::shared_ptr<Sqlite>> Sqlite::Open(const string& uri) {
   }
 }
 
-Sqlite::Sqlite(sqlite3* db) : db_(db) {}
+Sqlite::Sqlite(sqlite3* db, const string& uri) : db_(db), uri_(uri) {}
 
 Sqlite::~Sqlite() {
   // close_v2 doesn't care if a stmt hasn't been GC'd yet
@@ -97,6 +118,30 @@ Status Sqlite::Close() {
   return s;
 }
 
+void Sqlite::UseWriteAheadLogWithReducedDurabilityIfPossible() {
+  // TensorFlow summaries are intensively write-heavy, cf. most apps.
+  // This pragma loves writes and means that TensorBoard can read the
+  // database even as the training job inserts stuff. In other words,
+  // this makes SQLite almost as powerful as MySQL or PostgreSQL.
+  // https://www.sqlite.org/wal.html
+  string journal = ExecuteOrEmpty(this, "PRAGMA journal_mode=wal");
+  if (journal != "wal") {
+    LOG(WARNING) << "Failed to set journal_mode=wal because SQLite wants "
+                 << uri_ << " to be in '" << journal << "' mode, which might "
+                 << "be bad since WAL is important for the performance of "
+                 << "write-intensive apps. This might only happen for memory "
+                 << "databases or old versions of SQLite, but is definitely "
+                 << "worth fixing if that's not the case";
+  } else {
+    // This setting means we might lose transactions due to power loss,
+    // but the database can't become corrupted. In exchange, we get the
+    // the performance of a NoSQL database. This is a trade-off most data
+    // scientists would consider acceptable.
+    // https://www.sqlite.org/pragma.html#pragma_synchronous
+    ExecuteOrLog(this, "PRAGMA synchronous=NORMAL");
+  }
+}
+
 SqliteStatement Sqlite::Prepare(const string& sql) {
   sqlite3_stmt* stmt = nullptr;
   int rc = sqlite3_prepare_v2(db_, sql.c_str(), sql.size() + 1, &stmt, nullptr);
diff --git a/tensorflow/core/lib/db/sqlite.h b/tensorflow/core/lib/db/sqlite.h
index 774852efea7b494406c89960654b1acdca1f4ac9..12840bd42bab80c10210c8c87968835136b2d5ea 100644
--- a/tensorflow/core/lib/db/sqlite.h
+++ b/tensorflow/core/lib/db/sqlite.h
@@ -15,7 +15,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_LIB_DB_SQLITE_H_
 #define TENSORFLOW_CORE_LIB_DB_SQLITE_H_
 
-#include <stddef.h>
+#include <cstddef>
 #include <memory>
 #include <utility>
 
@@ -69,6 +69,13 @@ class Sqlite {
   /// beforehand. This is a no-op if already closed
   Status Close();
 
+  /// \brief Enables WAL mode with less fsync or log a warning.
+  ///
+  /// The synchronous pragma is only set to NORMAL if WAL mode was
+  /// successfully enabled. This must be called immediately after
+  /// creating the object.
+  void UseWriteAheadLogWithReducedDurabilityIfPossible();
+
   /// \brief Creates SQLite statement.
   ///
   /// Call result.status() to determine whether or not this operation
@@ -78,8 +85,9 @@ class Sqlite {
   SqliteStatement Prepare(const string& sql);
 
  private:
-  explicit Sqlite(sqlite3* db);
+  explicit Sqlite(sqlite3* db, const string& uri);
   sqlite3* db_;
+  string uri_;
   TF_DISALLOW_COPY_AND_ASSIGN(Sqlite);
 };
 
@@ -103,7 +111,7 @@ class SqliteStatement {
   SqliteStatement& operator=(SqliteStatement&& other);
 
   /// \brief Returns true if statement is not empty.
-  operator bool() const { return stmt_ != nullptr; }
+  explicit operator bool() const { return stmt_ != nullptr; }
 
   /// \brief Returns SQLite result code state.
   ///
diff --git a/tensorflow/core/lib/gtl/iterator_range.h b/tensorflow/core/lib/gtl/iterator_range.h
index e7fea7579db6e3bd8f6f2ce6f5f8c53a40dd3d20..0ba4587fde65f9d396716acb6a7e4f491ff51e32 100644
--- a/tensorflow/core/lib/gtl/iterator_range.h
+++ b/tensorflow/core/lib/gtl/iterator_range.h
@@ -37,6 +37,10 @@ namespace gtl {
 template <typename IteratorT>
 class iterator_range {
  public:
+  using value_type = decltype(*std::declval<IteratorT>());
+  using iterator = IteratorT;
+  using const_iterator = IteratorT;
+
   iterator_range() : begin_iterator_(), end_iterator_() {}
   iterator_range(IteratorT begin_iterator, IteratorT end_iterator)
       : begin_iterator_(std::move(begin_iterator)),
diff --git a/tensorflow/core/lib/gtl/stl_util.h b/tensorflow/core/lib/gtl/stl_util.h
index cda72a579da0f76abe6c37c724f76c307890f224..ffeca4e88a93936ee6a1711afec735d97d04172e 100644
--- a/tensorflow/core/lib/gtl/stl_util.h
+++ b/tensorflow/core/lib/gtl/stl_util.h
@@ -29,48 +29,23 @@ limitations under the License.
 namespace tensorflow {
 namespace gtl {
 
-// Returns a mutable char* pointing to a string's internal buffer, which may not
-// be null-terminated. Returns NULL for an empty string. If not non-null,
-// writing through this pointer will modify the string.
-//
-// string_as_array(&str)[i] is valid for 0 <= i < str.size() until the
-// next call to a string method that invalidates iterators.
-//
-// In C++11 you may simply use &str[0] to get a mutable char*.
-//
-// Prior to C++11, there was no standard-blessed way of getting a mutable
-// reference to a string's internal buffer. The requirement that string be
-// contiguous is officially part of the C++11 standard [string.require]/5.
-// According to Matt Austern, this should already work on all current C++98
-// implementations.
-inline char* string_as_array(string* str) {
-  return str->empty() ? NULL : &*str->begin();
-}
-
-// Returns the T* array for the given vector, or NULL if the vector was empty.
-//
-// Note: If you know the array will never be empty, you can use &*v.begin()
-// directly, but that is may dump core if v is empty. This function is the most
-// efficient code that will work, taking into account how our STL is actually
-// implemented. THIS IS NON-PORTABLE CODE, so use this function instead of
-// repeating the nonportable code everywhere. If our STL implementation changes,
-// we will need to change this as well.
+// Returns a char* pointing to the beginning of a string's internal buffer.
+// The result is a valid "null-terminated byte string", even if *str is empty.
+// Up to C++14 it is not valid to *write* to the null terminator; as of C++17,
+// it is valid to write zero to the null terminator (but not any other value).
+inline char* string_as_array(string* str) { return &*str->begin(); }
+
+// The following vector_as_array functions return raw pointers to the underlying
+// data buffer. The return value is unspecified (but valid) if the input range
+// is empty.
 template <typename T, typename Allocator>
 inline T* vector_as_array(std::vector<T, Allocator>* v) {
-#if defined NDEBUG && !defined _GLIBCXX_DEBUG
-  return &*v->begin();
-#else
-  return v->empty() ? NULL : &*v->begin();
-#endif
+  return v->data();
 }
-// vector_as_array overload for const std::vector<>.
+
 template <typename T, typename Allocator>
 inline const T* vector_as_array(const std::vector<T, Allocator>* v) {
-#if defined NDEBUG && !defined _GLIBCXX_DEBUG
-  return &*v->begin();
-#else
-  return v->empty() ? NULL : &*v->begin();
-#endif
+  return v->data();
 }
 
 // Like str->resize(new_size), except any new characters added to "*str" as a
diff --git a/tensorflow/core/lib/io/path.h b/tensorflow/core/lib/io/path.h
index 93151efcbe2abe55a8d8ec2e9aa39a3454f92e2e..47bb2b998d637099b3ab788f7ce274f83e4fc646 100644
--- a/tensorflow/core/lib/io/path.h
+++ b/tensorflow/core/lib/io/path.h
@@ -20,10 +20,9 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 
 namespace tensorflow {
-class StringPiece;
 namespace io {
 namespace internal {
-string JoinPathImpl(std::initializer_list<StringPiece> paths);
+string JoinPathImpl(std::initializer_list<tensorflow::StringPiece> paths);
 }
 
 // Utility routines for processing filenames
@@ -50,20 +49,20 @@ string JoinPath(const T&... args) {
 #endif /* SWIG */
 
 // Return true if path is absolute.
-bool IsAbsolutePath(StringPiece path);
+bool IsAbsolutePath(tensorflow::StringPiece path);
 
 // Returns the part of the path before the final "/".  If there is a single
 // leading "/" in the path, the result will be the leading "/".  If there is
 // no "/" in the path, the result is the empty prefix of the input.
-StringPiece Dirname(StringPiece path);
+tensorflow::StringPiece Dirname(tensorflow::StringPiece path);
 
 // Returns the part of the path after the final "/".  If there is no
 // "/" in the path, the result is the same as the input.
-StringPiece Basename(StringPiece path);
+tensorflow::StringPiece Basename(tensorflow::StringPiece path);
 
 // Returns the part of the basename of path after the final ".".  If
 // there is no "." in the basename, the result is empty.
-StringPiece Extension(StringPiece path);
+tensorflow::StringPiece Extension(tensorflow::StringPiece path);
 
 // Collapse duplicate "/"s, resolve ".." and "." path elements, remove
 // trailing "/".
@@ -72,7 +71,7 @@ StringPiece Extension(StringPiece path);
 // invoke any system calls (getcwd(2)) in order to resolve relative
 // paths with respect to the actual working directory.  That is, this is purely
 // string manipulation, completely independent of process state.
-string CleanPath(StringPiece path);
+string CleanPath(tensorflow::StringPiece path);
 
 // Populates the scheme, host, and path from a URI. scheme, host, and path are
 // guaranteed by this function to point into the contents of uri, even if
@@ -82,12 +81,13 @@ string CleanPath(StringPiece path);
 // - If the URI is invalid, scheme and host are set to empty strings and the
 //   passed string is assumed to be a path
 // - If the URI omits the path (e.g. file://host), then the path is left empty.
-void ParseURI(StringPiece uri, StringPiece* scheme, StringPiece* host,
-              StringPiece* path);
+void ParseURI(tensorflow::StringPiece uri, tensorflow::StringPiece* scheme,
+              tensorflow::StringPiece* host, tensorflow::StringPiece* path);
 
 // Creates a URI from a scheme, host, and path. If the scheme is empty, we just
 // return the path.
-string CreateURI(StringPiece scheme, StringPiece host, StringPiece path);
+string CreateURI(tensorflow::StringPiece scheme, tensorflow::StringPiece host,
+                 tensorflow::StringPiece path);
 
 // Creates a temporary file name with an extension.
 string GetTempFilename(const string& extension);
diff --git a/tensorflow/core/lib/io/proto_encode_helper.h b/tensorflow/core/lib/io/proto_encode_helper.h
index 5d30dda90172e0f69ea1512b228d9fb95e9a6d39..f70e1cbaabf8383d255f5d339d65a7958bf67596 100644
--- a/tensorflow/core/lib/io/proto_encode_helper.h
+++ b/tensorflow/core/lib/io/proto_encode_helper.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_LIB_IO_PROTO_ENCODE_HELPER_H_
 
 #include "tensorflow/core/lib/core/coding.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/protobuf.h"
 
 // A helper class for appending various kinds of values in protocol
@@ -24,7 +25,6 @@ limitations under the License.
 // a buffer and a maximum size guarantee for the number of bytes they
 // will add to this buffer.
 namespace tensorflow {
-class StringPiece;
 namespace io {
 
 class ProtoEncodeHelper {
diff --git a/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc b/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc
index be1fa22c69c27a5c57e3c397076a66dfe05eb035..3c310167326721e8f569ab6148622517aaf82ce5 100644
--- a/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc
+++ b/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc
@@ -161,7 +161,7 @@ Status SnappyOutputBuffer::Deflate() {
   }
 
   // Write length of compressed block to output buffer.
-  char* compressed_length_array = new char[4];
+  char compressed_length_array[4];
   std::fill(compressed_length_array, compressed_length_array + 4, 0);
   for (int i = 0; i < 4; i++) {
     // Little endian.
@@ -173,7 +173,6 @@ Status SnappyOutputBuffer::Deflate() {
   TF_RETURN_IF_ERROR(AddToOutputBuffer(output.data(), output.size()));
   next_in_ += avail_in_;
   avail_in_ = 0;
-  delete[] compressed_length_array;
 
   return Status::OK();
 }
diff --git a/tensorflow/core/lib/math/math_util.h b/tensorflow/core/lib/math/math_util.h
index 6f279865e7b361d7b0d2c402747c7b3476e63448..41d486f2bd142954d288f1ccdcf30d960fa2c6a7 100644
--- a/tensorflow/core/lib/math/math_util.h
+++ b/tensorflow/core/lib/math/math_util.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_LIB_MATH_MATH_UTIL_H_
 #define TENSORFLOW_LIB_MATH_MATH_UTIL_H_
 
+#include <type_traits>
+
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -59,6 +61,29 @@ class MathUtil {
   template <typename IntegralType, bool ceil>
   static IntegralType CeilOrFloorOfRatio(IntegralType numerator,
                                          IntegralType denominator);
+
+  template <typename IntegralType>
+  static IntegralType GCD(IntegralType x, IntegralType y);
+
+  // ----------------------------------------------------------------------
+  // IPow<T>
+  //   Computes the result of raising a number to a non-negative integral power.
+  //
+  //  * T: An integral type, floating-point type, or user-defined type for which
+  //    operator*= is defined.
+  //  * base: the base "v" of the operation
+  //  * exp: the exponent "i" of the operation; must be non-negative.
+  //
+  // Computes v^i, in a way that is faster than std::pow (which supports
+  // arbitrary real exponents).
+  //
+  // When T is a floating point type, this has the same semantics as std::pow,
+  // but it is much faster. When T is an integral type, computations are
+  // performed in the value domain of T, and overflow semantics are those of T.
+  //
+  // Input validity is DCHECKed.
+  template <typename T>
+  static T IPow(T base, int exp);
 };
 
 // ---- CeilOrFloorOfRatio ----
@@ -107,6 +132,32 @@ IntegralType MathUtil::CeilOrFloorOfRatio(IntegralType numerator,
   }
 }
 
+template <typename IntegralType>
+IntegralType MathUtil::GCD(IntegralType a, IntegralType b) {
+  static_assert(std::is_unsigned<IntegralType>::value,
+                "signed GCD not supported!");
+  while (b != 0) {
+    IntegralType r = a % b;
+    a = b;
+    b = r;
+  }
+  return a;
+}
+
+// ---- IPow ----
+// Implemented with the squared exponentiation method (a.k.a. double-and-add).
+//
+// Note that "exp >>= 1" is faster than "exp /= 2" on at least one platform.
+template <typename T>
+T MathUtil::IPow(T base, int exp) {
+  DCHECK_GE(exp, 0);
+  for (T result(1);; base *= base) {
+    if ((exp & 1) != 0) result *= base;
+    exp >>= 1;
+    if (exp == 0) return result;
+  }
+}
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_LIB_MATH_MATH_UTIL_H_
diff --git a/tensorflow/core/lib/math/math_util_test.cc b/tensorflow/core/lib/math/math_util_test.cc
index eaf8c31a431728d6f728abeb50e14c443bce6d85..cad5d0d8993b5c61e82489ca942744608f7fd37a 100644
--- a/tensorflow/core/lib/math/math_util_test.cc
+++ b/tensorflow/core/lib/math/math_util_test.cc
@@ -15,12 +15,17 @@ limitations under the License.
 
 #include "tensorflow/core/lib/math/math_util.h"
 
+#include <cmath>
+#include <limits>
 #include <vector>
+
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
+namespace {
 
 // Number of arguments for each test of the CeilOrRatio method
 const int kNumTestArguments = 4;
@@ -195,4 +200,141 @@ TEST(MathUtil, CeilOfRatio) {
 #endif
 }
 
+struct GCDTestCase {
+  unsigned int x;
+  unsigned int y;
+  unsigned int gcd;
+};
+
+TEST(MathUtil, GCD) {
+  std::vector<GCDTestCase> testcases({
+      {10, 20, 10},  //
+      {27, 8, 1},    //
+      {4, 3, 1},     //
+      {6, 8, 2},     //
+      {5, 0, 5},     //
+      {5, 5, 5},     //
+      {0, 0, 0}      //
+  });
+
+  for (const auto& tc : testcases) {
+    EXPECT_EQ(tc.gcd, MathUtil::GCD<uint32>(tc.x, tc.y));
+    EXPECT_EQ(tc.gcd, MathUtil::GCD<uint32>(tc.y, tc.x));
+    EXPECT_EQ(tc.gcd, MathUtil::GCD<uint64>(tc.x, tc.y));
+    EXPECT_EQ(tc.gcd, MathUtil::GCD<uint64>(tc.y, tc.x));
+  }
+
+  const uint64 biggish_prime = 1666666667;
+  EXPECT_EQ(biggish_prime,
+            MathUtil::GCD<uint64>(biggish_prime * 3, biggish_prime * 4));
+}
+
+template <typename T>
+void TestOneIPowN() {
+  const T one{1};
+  for (int i = 0; i < 1024; ++i) {
+    // Computations are exact.
+    EXPECT_EQ(MathUtil::IPow(one, i), one);
+  }
+}
+
+template <typename T>
+void TestTwoIPowN() {
+  int limit = std::is_integral<T>::value ? std::numeric_limits<T>::digits : 63;
+  for (int i = 0; i < limit; ++i) {
+    // Computations are exact.
+    EXPECT_EQ(MathUtil::IPow(T{2}, i), static_cast<T>(1ull << i));
+  }
+}
+
+template <typename T>
+void TestFloatIPow(const int max_exponent, const T start, const T end,
+                   const T step) {
+  for (T f = start; f < end; f += step) {
+    for (int i = 0; i < max_exponent; ++i) {
+      EXPECT_FLOAT_EQ(MathUtil::IPow(f, i), pow(f, i));
+    }
+  }
+}
+
+TEST(MathUtil, IPow) {
+  TestOneIPowN<double>();
+  TestOneIPowN<float>();
+  TestOneIPowN<int>();
+  TestOneIPowN<int64>();
+  TestTwoIPowN<double>();
+  TestTwoIPowN<float>();
+  TestTwoIPowN<int>();
+  TestTwoIPowN<int64>();
+
+  EXPECT_EQ(MathUtil::IPow(3, 0), 1);
+  EXPECT_EQ(MathUtil::IPow(3, 1), 3);
+  EXPECT_EQ(MathUtil::IPow(3, 2), 9);
+  EXPECT_EQ(MathUtil::IPow(3, 3), 27);
+  EXPECT_EQ(MathUtil::IPow(3, 4), 81);
+  EXPECT_EQ(MathUtil::IPow(3, 5), 243);
+
+  TestFloatIPow<float>(13, -16.0f, 16.0f, 1.0f / 8);
+  TestFloatIPow<double>(13, -16.0, 16.0, 1.0 / 8);
+
+  TestFloatIPow<float>(13, -1.0f / (1 << 12), -1.0f / (1 << 12),
+                       1.0f / (1 << 16));
+  TestFloatIPow<double>(13, -1.0 / (1 << 12), -1.0 / (1 << 12),
+                        1.0 / (1 << 16));
+}
+
+TEST(MathUtil, IPowEdgeCases) {
+  constexpr const double kInf = std::numeric_limits<double>::infinity();
+
+  EXPECT_EQ(MathUtil::IPow(-12345.0, 79), -kInf);
+  EXPECT_EQ(MathUtil::IPow(-12345.0, 80), +kInf);
+
+  // The semantics of the edge cases that follow  are defined in the standard:
+  // http://en.cppreference.com/w/cpp/numeric/math/pow for a summary.
+
+  // 1 - These edge cases apply.
+  // pow(+0, exp), where exp is a positive odd integer, returns +0
+  EXPECT_EQ(MathUtil::IPow(+0.0, 3), +0.0);
+  // pow(-0, exp), where exp is a positive odd integer, returns -0
+  EXPECT_EQ(MathUtil::IPow(-0.0, 3), -0.0);
+  // pow(±0, exp), where exp is positive non-integer or a positive even integer,
+  // returns +0
+  EXPECT_EQ(MathUtil::IPow(+0.0, 42), +0.0);
+  EXPECT_EQ(MathUtil::IPow(-0.0, 42), +0.0);
+  // pow(base, ±0) returns 1 for any base, even when base is NaN
+  EXPECT_EQ(MathUtil::IPow(-kInf, 0.0), 1.0);
+  EXPECT_EQ(MathUtil::IPow(-2.0, 0.0), 1.0);
+  EXPECT_EQ(MathUtil::IPow(-1.0, 0.0), 1.0);
+  EXPECT_EQ(MathUtil::IPow(-0.0, 0.0), 1.0);
+  EXPECT_EQ(MathUtil::IPow(+0.0, 0.0), 1.0);
+  EXPECT_EQ(MathUtil::IPow(+1.0, 0.0), 1.0);
+  EXPECT_EQ(MathUtil::IPow(+2.0, 0.0), 1.0);
+  EXPECT_EQ(MathUtil::IPow(+kInf, 0.0), 1.0);
+  EXPECT_EQ(MathUtil::IPow(std::numeric_limits<double>::quiet_NaN(), 0.0), 1.0);
+  // pow(-∞, exp) returns -∞ if exp is a positive odd integer
+  EXPECT_EQ(MathUtil::IPow(-kInf, 43), -kInf);
+  // pow(-∞, exp) returns +∞ if exp is a positive non-integer or even integer
+  EXPECT_EQ(MathUtil::IPow(-kInf, 42), +kInf);
+  // pow(+∞, exp) returns +∞ for any positive exp
+  EXPECT_EQ(MathUtil::IPow(+kInf, 42), +kInf);
+  EXPECT_EQ(MathUtil::IPow(+kInf, 43), +kInf);
+
+  // 2 - These do not apply due to the restricted exp range.
+  // pow(+0, exp), where exp is a negative odd integer, returns +∞ and raises
+  // FE_DIVBYZERO pow(-0, exp), where exp is a negative odd integer, returns -∞
+  // and raises FE_DIVBYZERO pow(±0, exp), where exp is negative, finite, and is
+  // an even integer or a non-integer, returns +∞ and raises FE_DIVBYZERO
+  // pow(-1, ±∞) returns 1
+  // pow(+1, exp) returns 1 for any exp, even when exp is NaN
+  // pow(±0, -∞) returns +∞ and may raise FE_DIVBYZERO
+  // pow(base, exp) returns NaN and raises FE_INVALID if base is finite and
+  // negative and exp is finite and non-integer. pow(base, -∞) returns +∞ for
+  // any |base|<1 pow(base, -∞) returns +0 for any |base|>1 pow(base, +∞)
+  // returns +0 for any |base|<1 pow(base, +∞) returns +∞ for any |base|>1
+  // pow(-∞, exp) returns -0 if exp is a negative odd integer
+  // pow(-∞, exp) returns +0 if exp is a negative non-integer or even integer
+  // pow(+∞, exp) returns +0 for any negative exp
+}
+
+}  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/monitoring/collected_metrics.h b/tensorflow/core/lib/monitoring/collected_metrics.h
index fbef25619fd4f9ad6dc6927c43d2b8750ac51804..acdb0d86edb1a15631c324afe9d535e0660c4b98 100644
--- a/tensorflow/core/lib/monitoring/collected_metrics.h
+++ b/tensorflow/core/lib/monitoring/collected_metrics.h
@@ -88,6 +88,7 @@ struct Point {
   ValueType value_type;
   int64 int64_value;
   string string_value;
+  bool bool_value;
   HistogramProto histogram_value;
 
   // start_timestamp and end_timestamp indicate the time period over which this
diff --git a/tensorflow/core/lib/monitoring/collection_registry.h b/tensorflow/core/lib/monitoring/collection_registry.h
index 030f8e360a7237c2727cc4c8d4d8134b67c7cee7..2c8e250c5631ee8a56d6871c1a61ef17efc97c82 100644
--- a/tensorflow/core/lib/monitoring/collection_registry.h
+++ b/tensorflow/core/lib/monitoring/collection_registry.h
@@ -224,6 +224,12 @@ inline void CollectValue(const string& value, Point* const point) {
   point->string_value = value;
 }
 
+template <>
+inline void CollectValue(const bool& value, Point* const point) {
+  point->value_type = ValueType::kBool;
+  point->bool_value = value;
+}
+
 template <>
 inline void CollectValue(const HistogramProto& value, Point* const point) {
   point->value_type = ValueType::kHistogram;
@@ -321,13 +327,13 @@ void MetricCollector<metric_kind, Value, NumLabels>::CollectValue(
     const std::array<string, NumLabels>& labels, const Value& value) {
   point_set_->points.emplace_back(new Point());
   auto* const point = point_set_->points.back().get();
-  const std::vector<StringPiece> label_descriptions =
+  const std::vector<string> label_descriptions =
       metric_def_->label_descriptions();
   point->labels.reserve(NumLabels);
   for (int i = 0; i < NumLabels; ++i) {
     point->labels.push_back({});
     auto* const label = &point->labels.back();
-    label->name = label_descriptions[i].ToString();
+    label->name = label_descriptions[i];
     label->value = labels[i];
   }
   internal::CollectValue(value, point);
diff --git a/tensorflow/core/lib/monitoring/gauge.h b/tensorflow/core/lib/monitoring/gauge.h
index 75471cfb22956deac0b0a5841fdde8ee538da30e..ec978a91935890cb0563f39ba0e6554a03d7c86e 100644
--- a/tensorflow/core/lib/monitoring/gauge.h
+++ b/tensorflow/core/lib/monitoring/gauge.h
@@ -86,8 +86,29 @@ class GaugeCell<int64> {
   TF_DISALLOW_COPY_AND_ASSIGN(GaugeCell);
 };
 
+// Explicit specialization of GaugeCell<bool>. Compared to the primary
+// template, it uses atomic values as opposed to mutex. This class is
+// thread-safe.
+template <>
+class GaugeCell<bool> {
+ public:
+  explicit GaugeCell(bool value) : value_(value) {}
+  ~GaugeCell() {}
+
+  // Atomically sets the value.
+  void Set(bool value);
+
+  // Retrieves the current value.
+  bool value() const;
+
+ private:
+  std::atomic<bool> value_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(GaugeCell);
+};
+
 // A stateful class for updating a gauge-like metric. Allowed ValueType are
-// int64 and string.
+// int64, string and bool.
 //
 // This class encapsulates a set of values (or a single value for a label-less
 // metric). Each value is identified by a tuple of labels. The class allows the
@@ -117,6 +138,9 @@ class Gauge {
   //
   // auto* integer_gauge = Gauge<int64, 0>::New("/tensorflow/integer_gauge",
   //   "Integer gauge")
+  //
+  // auto* bool_gauge = Gauge<bool, 0>::New("/tensorflow/bool_gauge",
+  //   "Bool gauge")
   template <typename... MetricDefArgs>
   static Gauge* New(MetricDefArgs&&... metric_def_args);
 
@@ -172,12 +196,17 @@ inline void GaugeCell<int64>::Set(int64 value) { value_ = value; }
 
 inline int64 GaugeCell<int64>::value() const { return value_; }
 
+inline void GaugeCell<bool>::Set(bool value) { value_ = value; }
+
+inline bool GaugeCell<bool>::value() const { return value_; }
+
 template <typename ValueType, int NumLabels>
 template <typename... MetricDefArgs>
 Gauge<ValueType, NumLabels>* Gauge<ValueType, NumLabels>::New(
     MetricDefArgs&&... metric_def_args) {
   static_assert(std::is_same<ValueType, int64>::value ||
-                    std::is_same<ValueType, string>::value,
+                    std::is_same<ValueType, string>::value ||
+                    std::is_same<ValueType, bool>::value,
                 "Gauge only allows int64 and string types.");
   return new Gauge<ValueType, NumLabels>(
       MetricDef<MetricKind::kGauge, ValueType, NumLabels>(
diff --git a/tensorflow/core/lib/monitoring/gauge_test.cc b/tensorflow/core/lib/monitoring/gauge_test.cc
index f98cfe2a3b34cfb0630865e2fd0eeef6ea4f734d..c8f673db38928b96bd4f97cbb72c1007fdc9e9bb 100644
--- a/tensorflow/core/lib/monitoring/gauge_test.cc
+++ b/tensorflow/core/lib/monitoring/gauge_test.cc
@@ -87,6 +87,28 @@ TEST(GaugeOfStringValue, GetCell) {
   EXPECT_EQ("bar", same_cell->value());
 }
 
+auto* bool_gauge =
+    Gauge<bool, 0>::New("/tensorflow/test/bool_gauge", "Gauge of bool value.");
+
+TEST(GaugeOfBoolValue, InitializedWithFalseValue) {
+  EXPECT_EQ(false, bool_gauge->GetCell()->value());
+}
+
+TEST(GaugeOfBoolValue, GetCell) {
+  auto* cell = bool_gauge->GetCell();
+  EXPECT_EQ(false, cell->value());
+
+  cell->Set(true);
+  EXPECT_EQ(true, cell->value());
+
+  auto* same_cell = bool_gauge->GetCell();
+  EXPECT_EQ(true, cell->value());
+
+  same_cell->Set(false);
+  EXPECT_EQ(false, cell->value());
+  EXPECT_EQ(false, same_cell->value());
+}
+
 }  // namespace
 }  // namespace monitoring
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/monitoring/metric_def.h b/tensorflow/core/lib/monitoring/metric_def.h
index 3459c2ab82e45d0db9857345da2e96c3e12d41a3..f046842618a03f7a161a11d3b493b71be50ad988 100644
--- a/tensorflow/core/lib/monitoring/metric_def.h
+++ b/tensorflow/core/lib/monitoring/metric_def.h
@@ -28,39 +28,21 @@ namespace monitoring {
 // The different metric kinds available.
 //
 // Gauge indicates that the metric's values are instantaneous measurements of a
-// (typically) continuously varying quantity or a string value. Examples: a
-// process's current heap size, a queue's current length, the name of the binary
-// used by a process.
+// (typically) continuously varying value. Examples: a process's current heap
+// size, a queue's current length, the name of the binary used by a process,
+// whether a task is complete.
 //
 // Cumulative indicates that the metric's values represent non-negative changes
 // over specified time periods. Example: the number of rpc calls to a service.
 enum class MetricKind : int { kGauge = 0, kCumulative };
 
 // The type of the metric values.
-enum class ValueType : int { kInt64 = 0, kHistogram, kString };
+enum class ValueType : int { kInt64 = 0, kHistogram, kString, kBool };
 
 // Everything in the internal namespace is implementation details. Do not depend
 // on this.
 namespace internal {
 
-// Ensures that the string is a compile-time string literal.
-class StringLiteral {
- public:
-  // We allow implicit conversions here on purpose.
-  template <int N>
-  StringLiteral(const char (&data)[N]) : literal_(data, N - 1) {}
-
-  // This ctor will be called for non-literals, causing compile-time failure.
-  template <typename NotStringLiteral>
-  StringLiteral(const NotStringLiteral& not_string_literal) = delete;
-
-  // Implicit conversion to StringPiece.
-  operator StringPiece() const { return literal_; }
-
- private:
-  const StringPiece literal_;
-};
-
 template <typename Value>
 ValueType GetValueType();
 
@@ -79,6 +61,11 @@ inline ValueType GetValueType<string>() {
   return ValueType::kString;
 }
 
+template <>
+inline ValueType GetValueType<bool>() {
+  return ValueType::kBool;
+}
+
 }  // namespace internal
 
 // Abstract base class for a metric definition.
@@ -98,7 +85,7 @@ class AbstractMetricDef {
 
   StringPiece description() const { return description_; }
 
-  const std::vector<StringPiece> label_descriptions() const {
+  const std::vector<string>& label_descriptions() const {
     return label_descriptions_;
   }
 
@@ -106,23 +93,21 @@ class AbstractMetricDef {
   template <MetricKind kind, typename Value, int NumLabels>
   friend class MetricDef;
 
-  AbstractMetricDef(
-      const MetricKind kind, const ValueType value_type,
-      const internal::StringLiteral name,
-      const internal::StringLiteral description,
-      const std::vector<internal::StringLiteral>& label_descriptions)
+  AbstractMetricDef(const MetricKind kind, const ValueType value_type,
+                    const StringPiece name, const StringPiece description,
+                    const std::vector<string>& label_descriptions)
       : kind_(kind),
         value_type_(value_type),
-        name_(name),
-        description_(description),
-        label_descriptions_(std::vector<StringPiece>(
-            label_descriptions.begin(), label_descriptions.end())) {}
+        name_(name.ToString()),
+        description_(description.ToString()),
+        label_descriptions_(std::vector<string>(label_descriptions.begin(),
+                                                label_descriptions.end())) {}
 
   const MetricKind kind_;
   const ValueType value_type_;
-  const StringPiece name_;
-  const StringPiece description_;
-  const std::vector<StringPiece> label_descriptions_;
+  const string name_;
+  const string description_;
+  const std::vector<string> label_descriptions_;
 };
 
 // Metric definition.
@@ -130,15 +115,18 @@ class AbstractMetricDef {
 // A metric is defined by its kind, value-type, name, description and the
 // description of its labels.
 //
-// NOTE: We allow only string literals for the name, description and label
-// descriptions because these should be fixed at compile-time and shouldn't be
-// dynamic.
+// NOTE: Name, description, and label descriptions should be logically static,
+// but do not have to live for the lifetime of the MetricDef.
+//
+// By "logically static", we mean that they should never contain dynamic
+// information, but is static for the lifetime of the MetricDef, and
+// in-turn the metric; they do not need to be compile-time constants.
+// This allows for e.g. prefixed metrics in a CLIF wrapped environment.
 template <MetricKind metric_kind, typename Value, int NumLabels>
 class MetricDef : public AbstractMetricDef {
  public:
   template <typename... LabelDesc>
-  MetricDef(const internal::StringLiteral name,
-            const internal::StringLiteral description,
+  MetricDef(const StringPiece name, const StringPiece description,
             const LabelDesc&... label_descriptions)
       : AbstractMetricDef(metric_kind, internal::GetValueType<Value>(), name,
                           description, {label_descriptions...}) {
diff --git a/tensorflow/core/lib/monitoring/metric_def_test.cc b/tensorflow/core/lib/monitoring/metric_def_test.cc
index dc07a08e4feaed1045b379e2795733cb0d4f2024..66973b6b5f646218269ac5da286ceb6667d170fc 100644
--- a/tensorflow/core/lib/monitoring/metric_def_test.cc
+++ b/tensorflow/core/lib/monitoring/metric_def_test.cc
@@ -41,6 +41,24 @@ TEST(MetricDefTest, Simple) {
   EXPECT_EQ("LabelName", metric_def1.label_descriptions()[0]);
 }
 
+TEST(MetricDefTest, StringsPersist) {
+  // Ensure string attributes of the metric are copied into the metric
+  string name = "/tensorflow/metric0";
+  string description = "test description";
+  string label_description = "test label description";
+  const MetricDef<MetricKind::kCumulative, int64, 1> metric_def(
+      name, description, label_description);
+
+  // Mutate the strings
+  name[4] = 'A';
+  description[4] = 'B';
+  label_description[4] = 'C';
+
+  EXPECT_NE(name, metric_def.name());
+  EXPECT_NE(description, metric_def.description());
+  EXPECT_NE(label_description, metric_def.label_descriptions()[0]);
+}
+
 }  // namespace
 }  // namespace monitoring
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/random/random.cc b/tensorflow/core/lib/random/random.cc
index 723c1100f8e49f31e1e656649472eb72cec790a9..82dc82950730aa58abc52528bfdf496284634336 100644
--- a/tensorflow/core/lib/random/random.cc
+++ b/tensorflow/core/lib/random/random.cc
@@ -33,14 +33,14 @@ std::mt19937_64 InitRngWithDefaultSeed() { return std::mt19937_64(); }
 
 uint64 New64() {
   static std::mt19937_64* rng = InitRngWithRandomSeed();
-  static mutex mu;
+  static mutex mu(LINKER_INITIALIZED);
   mutex_lock l(mu);
   return (*rng)();
 }
 
 uint64 New64DefaultSeed() {
   static std::mt19937_64 rng = InitRngWithDefaultSeed();
-  static mutex mu;
+  static mutex mu(LINKER_INITIALIZED);
   mutex_lock l(mu);
   return rng();
 }
diff --git a/tensorflow/core/lib/strings/numbers.cc b/tensorflow/core/lib/strings/numbers.cc
index 302a6967e3a4eb355d5a5f10548f0d946b1db354..f5822fad8e3d3b8559d19c79ee2885e580ea3e11 100644
--- a/tensorflow/core/lib/strings/numbers.cc
+++ b/tensorflow/core/lib/strings/numbers.cc
@@ -81,10 +81,12 @@ T locale_independent_strtonum(const char* str, const char** endptr) {
   // number was outside the range, the stringstream sets the fail flag, but
   // returns the +/-max() value, whereas strto{f,d} functions return +/-INF.
   if (s.fail()) {
-    if (result == std::numeric_limits<T>::max()) {
+    if (result == std::numeric_limits<T>::max() ||
+        result == std::numeric_limits<T>::infinity()) {
       result = std::numeric_limits<T>::infinity();
       s.clear(s.rdstate() & ~std::ios::failbit);
-    } else if (result == -std::numeric_limits<T>::max()) {
+    } else if (result == -std::numeric_limits<T>::max() ||
+               result == -std::numeric_limits<T>::infinity()) {
       result = -std::numeric_limits<T>::infinity();
       s.clear(s.rdstate() & ~std::ios::failbit);
     }
diff --git a/tensorflow/core/lib/strings/ordered_code.h b/tensorflow/core/lib/strings/ordered_code.h
index ce823c3f872a73702c00460248b483e24f09364c..91870cfec6322a56c8917261d336e56dbca7aea7 100644
--- a/tensorflow/core/lib/strings/ordered_code.h
+++ b/tensorflow/core/lib/strings/ordered_code.h
@@ -39,11 +39,11 @@ limitations under the License.
 #define TENSORFLOW_LIB_STRINGS_ORDERED_CODE_H__
 
 #include <string>
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
-class StringPiece;
 
 namespace strings {
 
diff --git a/tensorflow/core/lib/strings/str_util_test.cc b/tensorflow/core/lib/strings/str_util_test.cc
index d5909d17aaa7e401cf8028346783e638af47a168..6d461241f7e9c5a29064c015991039d5bf95a80f 100644
--- a/tensorflow/core/lib/strings/str_util_test.cc
+++ b/tensorflow/core/lib/strings/str_util_test.cc
@@ -305,7 +305,7 @@ TEST(SplitAndParseAsInts, Int64) {
   EXPECT_EQ(nums[0], 134);
   EXPECT_EQ(nums[1], 2);
   EXPECT_EQ(nums[2], 13);
-  EXPECT_EQ(nums[3], -4000000000);
+  EXPECT_EQ(nums[3], static_cast<int64>(-4000000000ull));
 
   EXPECT_FALSE(str_util::SplitAndParseAsInts("abc", ',', &nums));
 
diff --git a/tensorflow/core/ops/array_grad.cc b/tensorflow/core/ops/array_grad.cc
index 325dbc48835d2f975ecd2530486be239fdcf96c6..38bd851da89357238360dcb3dd465b5e4f6a5fdd 100644
--- a/tensorflow/core/ops/array_grad.cc
+++ b/tensorflow/core/ops/array_grad.cc
@@ -333,6 +333,25 @@ Status TransposeGrad(const AttrSlice& attrs, FunctionDef* g) {
 }
 REGISTER_OP_GRADIENT("Transpose", TransposeGrad);
 
+Status ConjugateTransposeGrad(const AttrSlice& attrs, FunctionDef* g) {
+  *g = FDH::Define(
+      // Arg defs
+      {"x: T", "p: int32", "dy: T"},
+      // Ret val defs
+      {"dx: T", "dp: int32"},
+      // Attr defs
+      {"T: type"},
+      // Nodes
+      {
+          {{"q"}, "InvertPermutation", {"p"}, {}},
+          {{"dx"}, "ConjugateTranspose", {"dy", "q"}, {{"T", "$T"}}},
+          {{"dp"}, "ZerosLike", {"p"}, {{"T", DT_INT32}}},
+      });
+  VLOG(1) << "ConjugateTransposeGrad " << DebugString(*g);
+  return Status::OK();
+}
+REGISTER_OP_GRADIENT("ConjugateTranspose", ConjugateTransposeGrad);
+
 Status ReverseGrad(const AttrSlice& attrs, FunctionDef* g) {
   *g = FDH::Define(
       // Arg defs
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index c8cc1473607f4eecf94bfa6c6b30311b8c9e486a..5a31f433cee88e8ef6ecf6dcc85d735997a9805a 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -706,6 +706,26 @@ memory_region_name: Name of readonly memory region used by the tensor, see
   NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
 )doc");
 
+REGISTER_OP("GuaranteeConst")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      return UnchangedShape(c);
+    })
+    // We don't want this to be optimized away.
+    .SetIsStateful()
+    .Doc(R"(
+Gives a guarantee to the TF runtime that the input tensor is a constant.
+
+The runtime is then free to make optimizations based on this.
+
+Only accepts value typed tensors as inputs and rejects resource variable handles
+as input.
+
+Returns the input tensor without modification.
+)");
+
 // --------------------------------------------------------------------------
 REGISTER_OP("ZerosLike")
     .Input("x: T")
@@ -724,8 +744,8 @@ REGISTER_OP("OnesLike")
     .Input("x: T")
     .Output("y: T")
     .Attr(
-        "T: {float, double, int8, uint8, int16, uint16, int32, int64, "
-        "complex64, complex128, bool}")
+        "T: {bfloat16, float, double, int8, uint8, int16, uint16, int32, "
+        "int64, complex64, complex128, bool}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Returns a tensor of ones with the same shape and type as x.
@@ -738,7 +758,7 @@ y: a tensor of the same shape and type as x but filled with ones.
 REGISTER_OP("Diag")
     .Input("diagonal: T")
     .Output("output: T")
-    .Attr("T: {float, double, int32, int64, complex64, complex128}")
+    .Attr("T: {bfloat16, float, double, int32, int64, complex64, complex128}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle in = c->input(0);
       TF_RETURN_IF_ERROR(c->WithRankAtLeast(in, 1, &in));
@@ -776,7 +796,7 @@ diagonal: Rank k tensor where k is at most 1.
 REGISTER_OP("DiagPart")
     .Input("input: T")
     .Output("diagonal: T")
-    .Attr("T: {float, double, int32, int64, complex64, complex128}")
+    .Attr("T: {bfloat16, float, double, int32, int64, complex64, complex128}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle in = c->input(0);
       if (!c->RankKnown(in)) {
@@ -1059,9 +1079,8 @@ REGISTER_OP("Reverse")
     .Input("dims: bool")
     .Output("output: T")
     .Attr(
-        "T: {uint8, int8, uint16, int16, int32, int64, bool, half, float, "
-        "double, complex64, "
-        "complex128, string}")
+        "T: {uint8, int8, uint16, int16, int32, int64, bool, half, "
+        "float, double, complex64, complex128, string}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input = c->input(0);
       ShapeHandle dims;
@@ -1137,9 +1156,8 @@ REGISTER_OP("ReverseV2")
     .Output("output: T")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .Attr(
-        "T: {uint8, int8, uint16, int16, int32, int64, bool, half, float, "
-        "double, complex64, "
-        "complex128, string}")
+        "T: {uint8, int8, uint16, int16, int32, int64, bool, half, bfloat16, "
+        "float, double, complex64, complex128, string}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input = c->input(0);
       ShapeHandle axis;
@@ -1704,6 +1722,20 @@ REGISTER_OP("Identity")
 Return a tensor with the same shape and contents as the input tensor or value.
 )Doc");
 
+REGISTER_OP("Snapshot")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->input(0));
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      if (handle_data != nullptr) {
+        c->set_output_handle_shapes_and_types(0, *handle_data);
+      }
+      return Status::OK();
+    })
+    .Doc(R"Doc(Returns a copy of the input tensor.)Doc");
+
 #ifdef INTEL_MKL
 REGISTER_OP("_MklIdentity")
     .Input("input: T")
@@ -1834,7 +1866,7 @@ this operation.
 REGISTER_OP("CheckNumerics")
     .Input("tensor: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("message: string")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
@@ -2420,6 +2452,7 @@ REGISTER_OP("Slice")
           TF_RETURN_IF_ERROR(
               c->WithRank(begin_value, c->Rank(sizes_value), &begin_value));
           std::vector<DimensionHandle> dims;
+          dims.reserve(c->Rank(sizes_value));
           for (int i = 0; i < c->Rank(sizes_value); ++i) {
             dims.emplace_back(c->Dim(sizes_value, i));
           }
@@ -4226,7 +4259,7 @@ with the following options:
   "NHWC": `[ batch, height, width, channels ]`
   "NCHW": `[ batch, channels, height, width ]`
   "NCHW_VECT_C":
-      `qint8 [ batch, channels / 4, height, width, channels % 4 ]`
+      `qint8 [ batch, channels / 4, height, width, 4 ]`
 
 It is useful to consider the operation as transforming a 6-D Tensor.
 e.g. for data_format = NHWC,
@@ -4370,7 +4403,7 @@ with the following options:
   "NHWC": `[ batch, height, width, channels ]`
   "NCHW": `[ batch, channels, height, width ]`
   "NCHW_VECT_C":
-      `qint8 [ batch, channels / 4, height, width, channels % 4 ]`
+      `qint8 [ batch, channels / 4, height, width, 4 ]`
 
 It is useful to consider the operation as transforming a 6-D Tensor.
 e.g. for data_format = NHWC,
@@ -4564,12 +4597,12 @@ REGISTER_OP("Bitcast")
     .Output("output: type")
     // All supported dtypes are listed here to include qint16 and quint16.
     .Attr(
-        "T: {float, double, int64, int32, uint8, uint16, int8, int16,"
+        "T: {bfloat16, float, double, int64, int32, uint8, uint16, int8, int16,"
         " complex64, complex128, qint8, quint8, qint16, quint16, qint32,"
         " half}")
     .Attr(
-        "type: {float, double, int64, int32, uint8, uint16, int8, int16,"
-        " complex64, complex128, qint8, quint8, qint16, quint16, qint32,"
+        "type: {bfloat16, float, double, int64, int32, uint8, uint16, int8, "
+        "int16, complex64, complex128, qint8, quint8, qint16, quint16, qint32,"
         " half}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input = c->input(0);
@@ -4781,7 +4814,7 @@ REGISTER_OP("QuantizeAndDequantize")
     .Attr("input_min: float = 0")
     .Attr("input_max: float = 0")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Deprecated(22, "Replaced by QuantizeAndDequantizeV2")
     .Doc(R"doc(
@@ -4797,7 +4830,7 @@ REGISTER_OP("QuantizeAndDequantizeV2")
     .Attr("num_bits: int = 8")
     .Attr("range_given: bool = false")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
@@ -4876,7 +4909,7 @@ REGISTER_OP("QuantizeAndDequantizeV3")
     .Attr("signed_input: bool = true")
     .Attr("range_given: bool = true")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc
index 94eb120175555d8d51b9be1ff98676a9dc4fff07..c8ea443613656b418dd88fc4a1b9343101d754eb 100644
--- a/tensorflow/core/ops/array_ops_test.cc
+++ b/tensorflow/core/ops/array_ops_test.cc
@@ -158,6 +158,13 @@ TEST(ArrayOpsTest, UnchangedShapes_ShapeFn) {
   INFER_OK(op, "[1,2,?,4,5];?;?", "in0");
 }
 
+TEST(ArrayOpsTest, GuaranteeConst_ShapeFn) {
+  ShapeInferenceTestOp op("GuaranteeConst");
+  INFER_OK(op, "?", "in0");
+  INFER_OK(op, "[]", "in0");
+  INFER_OK(op, "[1,2,?,4,5]", "in0");
+}
+
 TEST(ArrayOpsTest, Identity_ShapeFnHandles) {
   const char* op_name = "Identity";
   ShapeInferenceTestOp op(op_name);
@@ -514,7 +521,7 @@ TEST(ArrayOpsTest, MatrixSetDiag_ShapeFn) {
   INFER_ERROR("Dimensions must be equal, but are 2 and 3", op, "[2,3];[3]");
 
   // Output matches input.
-  INFER_OK(op, "?;?", "?");
+  INFER_OK(op, "?;?", "in0");
   INFER_OK(op, "[1,2,2];[1,2]", "in0");
   INFER_OK(op, "[1,2,3];?", "in0");
   INFER_OK(op, "[1,3,2];?", "in0");
@@ -1612,7 +1619,7 @@ TEST(ArrayOpsTest, UnchangedWithQuantizationScalars_ShapeFn) {
 TEST(ArrayOpsTest, FakeQuantWithMinMaxVarsPerChannel) {
   ShapeInferenceTestOp op("FakeQuantWithMinMaxVarsPerChannel");
 
-  INFER_OK(op, "?;?;?", "?");
+  INFER_OK(op, "?;?;?", "in0");
   INFER_OK(op, "[?];?;?", "in0");
   INFER_OK(op, "[1,?,3];[3];[3]", "in0");
   INFER_OK(op, "[3];[3];[3]", "in0");
@@ -1631,7 +1638,7 @@ TEST(ArrayOpsTest, FakeQuantWithMinMaxVarsPerChannel) {
 TEST(ArrayOpsTest, FakeQuantWithMinMaxVarsPerChannelGradient) {
   ShapeInferenceTestOp op("FakeQuantWithMinMaxVarsPerChannelGradient");
 
-  INFER_OK(op, "?;?;?;?", "?;[?];[?]");
+  INFER_OK(op, "?;?;?;?", "in0;[?];[?]");
   INFER_OK(op, "[3];[3];[3];[3]", "in0;in3;in3");
   INFER_OK(op, "[1,3];[1,3];[3];[3]", "in0;in3;in3");
   INFER_OK(op, "[1,2,3,4];[1,2,3,4];[4];[4]", "in0;in3;in3");
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index ffb608d600744667675fd2494338111335c7ca99..e57f1d72381d5ed2546471e07b9c021991a653b7 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -39,6 +39,79 @@ op {
     }
   }
 }
+op {
+  name: "Abs"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "AccumulateNV2"
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "sum"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_aggregate: true
+  is_commutative: true
+}
 op {
   name: "AccumulateNV2"
   input_arg {
@@ -77,6 +150,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -165,6 +239,47 @@ op {
     }
   }
 }
+op {
+  name: "AccumulatorApplyGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "local_step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "gradient"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
 op {
   name: "AccumulatorNumAccumulated"
   input_arg {
@@ -267,6 +382,47 @@ op {
     }
   }
 }
+op {
+  name: "AccumulatorTakeGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "num_required"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "average"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
 op {
   name: "Acos"
   input_arg {
@@ -293,6 +449,33 @@ op {
     }
   }
 }
+op {
+  name: "Acos"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Acosh"
   input_arg {
@@ -317,6 +500,65 @@ op {
     }
   }
 }
+op {
+  name: "Acosh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Add"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
+      }
+    }
+  }
+}
 op {
   name: "Add"
   input_arg {
@@ -337,6 +579,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -519,6 +762,52 @@ op {
   is_aggregate: true
   is_commutative: true
 }
+op {
+  name: "AddN"
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "sum"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+        type: DT_VARIANT
+      }
+    }
+  }
+  is_aggregate: true
+  is_commutative: true
+}
 op {
   name: "AddSparseToTensorsMap"
   input_arg {
@@ -592,6 +881,42 @@ op {
   is_aggregate: true
   is_commutative: true
 }
+op {
+  name: "AddV2"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_aggregate: true
+  is_commutative: true
+}
 op {
   name: "AdjustContrast"
   input_arg {
@@ -1023,7 +1348,7 @@ op {
   }
 }
 op {
-  name: "ApplyAdagrad"
+  name: "ApplyAdadelta"
   input_arg {
     name: "var"
     type_attr: "T"
@@ -1035,62 +1360,20 @@ op {
     is_ref: true
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "out"
+    name: "accum_update"
     type_attr: "T"
     is_ref: true
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-}
-op {
-  name: "ApplyAdagrad"
   input_arg {
-    name: "var"
+    name: "lr"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "accum"
+    name: "rho"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "lr"
+    name: "epsilon"
     type_attr: "T"
   }
   input_arg {
@@ -1123,6 +1406,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -1135,42 +1419,25 @@ op {
   }
 }
 op {
-  name: "ApplyAdagradDA"
+  name: "ApplyAdagrad"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "gradient_accumulator"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "gradient_squared_accumulator"
+    name: "accum"
     type_attr: "T"
     is_ref: true
   }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
+    name: "grad"
     type_attr: "T"
   }
-  input_arg {
-    name: "global_step"
-    type: DT_INT64
-  }
   output_arg {
     name: "out"
     type_attr: "T"
@@ -1207,42 +1474,25 @@ op {
   }
 }
 op {
-  name: "ApplyAdagradDA"
+  name: "ApplyAdagrad"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "gradient_accumulator"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "gradient_squared_accumulator"
+    name: "accum"
     type_attr: "T"
     is_ref: true
   }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
+    name: "grad"
     type_attr: "T"
   }
-  input_arg {
-    name: "global_step"
-    type: DT_INT64
-  }
   output_arg {
     name: "out"
     type_attr: "T"
@@ -1281,46 +1531,21 @@ op {
   }
 }
 op {
-  name: "ApplyAdam"
+  name: "ApplyAdagrad"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "m"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "v"
+    name: "accum"
     type_attr: "T"
     is_ref: true
   }
-  input_arg {
-    name: "beta1_power"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "beta2_power"
-    type_attr: "T"
-  }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
-  input_arg {
-    name: "beta1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "beta2"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "epsilon"
-    type_attr: "T"
-  }
   input_arg {
     name: "grad"
     type_attr: "T"
@@ -1349,6 +1574,9 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -1361,28 +1589,24 @@ op {
   }
 }
 op {
-  name: "ApplyAdam"
+  name: "ApplyAdagradDA"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "m"
+    name: "gradient_accumulator"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "v"
+    name: "gradient_squared_accumulator"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "beta1_power"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "beta2_power"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
@@ -1390,20 +1614,16 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "beta1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "beta2"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "global_step"
+    type: DT_INT64
   }
   output_arg {
     name: "out"
@@ -1439,37 +1659,26 @@ op {
       b: false
     }
   }
-  attr {
-    name: "use_nesterov"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ApplyAdam"
+  name: "ApplyAdagradDA"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "m"
+    name: "gradient_accumulator"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "v"
+    name: "gradient_squared_accumulator"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "beta1_power"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "beta2_power"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
@@ -1477,20 +1686,16 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "beta1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "beta2"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "global_step"
+    type: DT_INT64
   }
   output_arg {
     name: "out"
@@ -1528,55 +1733,43 @@ op {
       b: false
     }
   }
-  attr {
-    name: "use_nesterov"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ApplyCenteredRMSProp"
+  name: "ApplyAdagradDA"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "mg"
+    name: "gradient_accumulator"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "ms"
+    name: "gradient_squared_accumulator"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "mom"
+    name: "grad"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "momentum"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "global_step"
+    type: DT_INT64
   }
   output_arg {
     name: "out"
@@ -1602,6 +1795,9 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -1614,37 +1810,40 @@ op {
   }
 }
 op {
-  name: "ApplyCenteredRMSProp"
+  name: "ApplyAdam"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "mg"
+    name: "m"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "ms"
+    name: "v"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "mom"
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
+    name: "beta1"
     type_attr: "T"
   }
   input_arg {
-    name: "momentum"
+    name: "beta2"
     type_attr: "T"
   }
   input_arg {
@@ -1679,8 +1878,6 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -1693,24 +1890,28 @@ op {
   }
 }
 op {
-  name: "ApplyFtrl"
+  name: "ApplyAdam"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum"
+    name: "m"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "linear"
+    name: "v"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "grad"
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
     type_attr: "T"
   }
   input_arg {
@@ -1718,15 +1919,19 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "beta1"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "beta2"
     type_attr: "T"
   }
   input_arg {
-    name: "lr_power"
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -1763,26 +1968,37 @@ op {
       b: false
     }
   }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "ApplyFtrl"
+  name: "ApplyAdam"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum"
+    name: "m"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "linear"
+    name: "v"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "grad"
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
     type_attr: "T"
   }
   input_arg {
@@ -1790,15 +2006,19 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "beta1"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "beta2"
     type_attr: "T"
   }
   input_arg {
-    name: "lr_power"
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -1837,26 +2057,37 @@ op {
       b: false
     }
   }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "ApplyFtrlV2"
+  name: "ApplyAdam"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum"
+    name: "m"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "linear"
+    name: "v"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "grad"
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
     type_attr: "T"
   }
   input_arg {
@@ -1864,19 +2095,19 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "beta1"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "beta2"
     type_attr: "T"
   }
   input_arg {
-    name: "l2_shrinkage"
+    name: "epsilon"
     type_attr: "T"
   }
   input_arg {
-    name: "lr_power"
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -1903,6 +2134,9 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -1913,46 +2147,44 @@ op {
       b: false
     }
   }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "ApplyFtrlV2"
+  name: "ApplyAddSign"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "linear"
+    name: "m"
     type_attr: "T"
     is_ref: true
   }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "alpha"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "sign_decay"
     type_attr: "T"
   }
   input_arg {
-    name: "l2_shrinkage"
+    name: "beta"
     type_attr: "T"
   }
   input_arg {
-    name: "lr_power"
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -1993,68 +2225,35 @@ op {
   }
 }
 op {
-  name: "ApplyGradientDescent"
+  name: "ApplyAddSign"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "alpha"
+    name: "m"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "delta"
+    name: "lr"
     type_attr: "T"
   }
-  output_arg {
-    name: "out"
+  input_arg {
+    name: "alpha"
     type_attr: "T"
-    is_ref: true
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
   }
-}
-op {
-  name: "ApplyGradientDescent"
   input_arg {
-    name: "var"
+    name: "sign_decay"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "alpha"
+    name: "beta"
     type_attr: "T"
   }
   input_arg {
-    name: "delta"
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -2083,6 +2282,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -2095,14 +2295,24 @@ op {
   }
 }
 op {
-  name: "ApplyMomentum"
+  name: "ApplyCenteredRMSProp"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum"
+    name: "mg"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
     type_attr: "T"
     is_ref: true
   }
@@ -2111,13 +2321,21 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "rho"
     type_attr: "T"
   }
   input_arg {
     name: "momentum"
     type_attr: "T"
   }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
   output_arg {
     name: "out"
     type_attr: "T"
@@ -2152,23 +2370,26 @@ op {
       b: false
     }
   }
-  attr {
-    name: "use_nesterov"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ApplyMomentum"
+  name: "ApplyCenteredRMSProp"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum"
+    name: "mg"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
     type_attr: "T"
     is_ref: true
   }
@@ -2177,13 +2398,21 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "rho"
     type_attr: "T"
   }
   input_arg {
     name: "momentum"
     type_attr: "T"
   }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
   output_arg {
     name: "out"
     type_attr: "T"
@@ -2220,23 +2449,26 @@ op {
       b: false
     }
   }
-  attr {
-    name: "use_nesterov"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ApplyProximalAdagrad"
+  name: "ApplyCenteredRMSProp"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum"
+    name: "mg"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
     type_attr: "T"
     is_ref: true
   }
@@ -2245,11 +2477,15 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "rho"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
     type_attr: "T"
   }
   input_arg {
@@ -2280,6 +2516,9 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -2292,7 +2531,7 @@ op {
   }
 }
 op {
-  name: "ApplyProximalAdagrad"
+  name: "ApplyFtrl"
   input_arg {
     name: "var"
     type_attr: "T"
@@ -2303,6 +2542,15 @@ op {
     type_attr: "T"
     is_ref: true
   }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
   input_arg {
     name: "lr"
     type_attr: "T"
@@ -2316,7 +2564,7 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "lr_power"
     type_attr: "T"
   }
   output_arg {
@@ -2343,8 +2591,6 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -2357,14 +2603,28 @@ op {
   }
 }
 op {
-  name: "ApplyProximalGradientDescent"
+  name: "ApplyFtrl"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "alpha"
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
@@ -2376,7 +2636,7 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "delta"
+    name: "lr_power"
     type_attr: "T"
   }
   output_arg {
@@ -2403,6 +2663,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -2415,14 +2677,28 @@ op {
   }
 }
 op {
-  name: "ApplyProximalGradientDescent"
+  name: "ApplyFtrl"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "alpha"
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
@@ -2434,7 +2710,7 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "delta"
+    name: "lr_power"
     type_attr: "T"
   }
   output_arg {
@@ -2463,6 +2739,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -2475,40 +2752,44 @@ op {
   }
 }
 op {
-  name: "ApplyRMSProp"
+  name: "ApplyFtrlV2"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "ms"
+    name: "accum"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "mom"
+    name: "linear"
     type_attr: "T"
     is_ref: true
   }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "momentum"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l2_shrinkage"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "lr_power"
     type_attr: "T"
   }
   output_arg {
@@ -2547,40 +2828,44 @@ op {
   }
 }
 op {
-  name: "ApplyRMSProp"
+  name: "ApplyFtrlV2"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "ms"
+    name: "accum"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "mom"
+    name: "linear"
     type_attr: "T"
     is_ref: true
   }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "momentum"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l2_shrinkage"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "lr_power"
     type_attr: "T"
   }
   output_arg {
@@ -2621,18 +2906,50 @@ op {
   }
 }
 op {
-  name: "ApproximateEqual"
+  name: "ApplyFtrlV2"
   input_arg {
-    name: "x"
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "y"
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
-    type: DT_BOOL
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -2653,31 +2970,39 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "tolerance"
-    type: "float"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      f: 1e-05
+      b: false
     }
   }
-  is_commutative: true
 }
 op {
-  name: "ApproximateEqual"
+  name: "ApplyGradientDescent"
   input_arg {
-    name: "x"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "y"
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
-    type: DT_BOOL
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -2698,33 +3023,36 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "tolerance"
-    type: "float"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      f: 1e-05
+      b: false
     }
   }
-  is_commutative: true
 }
 op {
-  name: "ArgMax"
+  name: "ApplyGradientDescent"
   input_arg {
-    name: "input"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "dimension"
-    type_attr: "Tidx"
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type: DT_INT64
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -2745,36 +3073,38 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "Tidx"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      b: false
     }
   }
 }
 op {
-  name: "ArgMax"
+  name: "ApplyGradientDescent"
   input_arg {
-    name: "input"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "dimension"
-    type_attr: "Tidx"
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "output_type"
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -2795,49 +3125,48 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "output_type"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      type: DT_INT64
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      b: false
     }
   }
 }
 op {
-  name: "ArgMax"
+  name: "ApplyMomentum"
   input_arg {
-    name: "input"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "dimension"
-    type_attr: "Tidx"
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "output_type"
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -2858,51 +3187,52 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "Tidx"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      b: false
     }
   }
   attr {
-    name: "output_type"
-    type: "type"
+    name: "use_nesterov"
+    type: "bool"
     default_value {
-      type: DT_INT64
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      b: false
     }
   }
 }
 op {
-  name: "ArgMin"
+  name: "ApplyMomentum"
   input_arg {
-    name: "input"
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "dimension"
-    type_attr: "Tidx"
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type: DT_INT64
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -2923,36 +3253,54 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "Tidx"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      type: DT_INT32
+      b: false
     }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
 }
 op {
-  name: "ArgMin"
+  name: "ApplyMomentum"
   input_arg {
-    name: "input"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "dimension"
-    type_attr: "Tidx"
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "output_type"
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -2973,49 +3321,63 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "Tidx"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      b: false
     }
   }
   attr {
-    name: "output_type"
-    type: "type"
+    name: "use_nesterov"
+    type: "bool"
     default_value {
-      type: DT_INT64
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      b: false
     }
   }
 }
 op {
-  name: "ArgMin"
+  name: "ApplyPowerSign"
   input_arg {
-    name: "input"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "dimension"
-    type_attr: "Tidx"
+    name: "m"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "logbase"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "output_type"
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -3042,217 +3404,360 @@ op {
     }
   }
   attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "output_type"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      type: DT_INT64
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      b: false
     }
   }
 }
 op {
-  name: "AsString"
+  name: "ApplyPowerSign"
   input_arg {
-    name: "input"
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "logbase"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type: DT_STRING
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_BOOL
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
         type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "precision"
-    type: "int"
-    default_value {
-      i: -1
-    }
-  }
-  attr {
-    name: "scientific"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "shortest"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
     }
   }
-  attr {
-    name: "width"
-    type: "int"
-    default_value {
-      i: -1
-    }
-  }
-  attr {
-    name: "fill"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
 }
 op {
-  name: "Asin"
+  name: "ApplyProximalAdagrad"
   input_arg {
-    name: "x"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "Asinh"
+  name: "ApplyProximalAdagrad"
   input_arg {
-    name: "x"
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "Assert"
+  name: "ApplyProximalAdagrad"
   input_arg {
-    name: "condition"
-    type: DT_BOOL
+    name: "var"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "data"
-    type_list_attr: "T"
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
   attr {
-    name: "summarize"
-    type: "int"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      i: 3
+      b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "Assign"
+  name: "ApplyProximalGradientDescent"
   input_arg {
-    name: "ref"
+    name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "value"
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
     type_attr: "T"
   }
   output_arg {
-    name: "output_ref"
+    name: "out"
     type_attr: "T"
     is_ref: true
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "validate_shape"
-    type: "bool"
-    default_value {
-      b: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
     }
   }
   attr {
     name: "use_locking"
     type: "bool"
     default_value {
-      b: true
+      b: false
     }
   }
-  allows_uninitialized_input: true
 }
 op {
-  name: "AssignAdd"
+  name: "ApplyProximalGradientDescent"
   input_arg {
-    name: "ref"
+    name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "value"
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
     type_attr: "T"
   }
   output_arg {
-    name: "output_ref"
+    name: "out"
     type_attr: "T"
     is_ref: true
   }
@@ -3275,6 +3780,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -3287,18 +3794,30 @@ op {
   }
 }
 op {
-  name: "AssignAdd"
+  name: "ApplyProximalGradientDescent"
   input_arg {
-    name: "ref"
+    name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "value"
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
     type_attr: "T"
   }
   output_arg {
-    name: "output_ref"
+    name: "out"
     type_attr: "T"
     is_ref: true
   }
@@ -3323,6 +3842,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -3335,34 +3855,44 @@ op {
   }
 }
 op {
-  name: "AssignAddVariableOp"
+  name: "ApplyRMSProp"
   input_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "var"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "value"
-    type_attr: "dtype"
-  }
-  attr {
-    name: "dtype"
-    type: "type"
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
   }
-  is_stateful: true
-}
-op {
-  name: "AssignSub"
   input_arg {
-    name: "ref"
+    name: "mom"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "value"
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
-    name: "output_ref"
+    name: "out"
     type_attr: "T"
     is_ref: true
   }
@@ -3397,18 +3927,44 @@ op {
   }
 }
 op {
-  name: "AssignSub"
+  name: "ApplyRMSProp"
   input_arg {
-    name: "ref"
+    name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "value"
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
-    name: "output_ref"
+    name: "out"
     type_attr: "T"
     is_ref: true
   }
@@ -3445,76 +4001,93 @@ op {
   }
 }
 op {
-  name: "AssignSubVariableOp"
+  name: "ApplyRMSProp"
   input_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "var"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "value"
-    type_attr: "dtype"
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
   }
-  attr {
-    name: "dtype"
-    type: "type"
+  input_arg {
+    name: "mom"
+    type_attr: "T"
+    is_ref: true
   }
-  is_stateful: true
-}
-op {
-  name: "AssignVariableOp"
   input_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "value"
-    type_attr: "dtype"
+    name: "rho"
+    type_attr: "T"
   }
-  attr {
-    name: "dtype"
-    type: "type"
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "Atan"
   input_arg {
-    name: "x"
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "Atan2"
+  name: "ApproximateEqual"
   input_arg {
-    name: "y"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "x"
+    name: "y"
     type_attr: "T"
   }
   output_arg {
     name: "z"
-    type_attr: "T"
+    type: DT_BOOL
   }
   attr {
     name: "T"
@@ -3523,666 +4096,760 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
+  attr {
+    name: "tolerance"
+    type: "float"
+    default_value {
+      f: 1e-05
+    }
+  }
+  is_commutative: true
 }
 op {
-  name: "Atanh"
+  name: "ApproximateEqual"
   input_arg {
     name: "x"
     type_attr: "T"
   }
-  output_arg {
+  input_arg {
     name: "y"
     type_attr: "T"
   }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "tolerance"
+    type: "float"
+    default_value {
+      f: 1e-05
+    }
+  }
+  is_commutative: true
 }
 op {
-  name: "AudioSpectrogram"
+  name: "ApproximateEqual"
   input_arg {
-    name: "input"
-    type: DT_FLOAT
+    name: "x"
+    type_attr: "T"
   }
-  output_arg {
-    name: "spectrogram"
-    type: DT_FLOAT
+  input_arg {
+    name: "y"
+    type_attr: "T"
   }
-  attr {
-    name: "window_size"
-    type: "int"
+  output_arg {
+    name: "z"
+    type: DT_BOOL
   }
   attr {
-    name: "stride"
-    type: "int"
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
   attr {
-    name: "magnitude_squared"
-    type: "bool"
+    name: "tolerance"
+    type: "float"
     default_value {
-      b: false
+      f: 1e-05
     }
   }
+  is_commutative: true
 }
 op {
-  name: "AudioSummary"
+  name: "ArgMax"
   input_arg {
-    name: "tag"
-    type: DT_STRING
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "tensor"
-    type: DT_FLOAT
+    name: "dimension"
+    type_attr: "Tidx"
   }
   output_arg {
-    name: "summary"
-    type: DT_STRING
-  }
-  attr {
-    name: "sample_rate"
-    type: "float"
+    name: "output"
+    type: DT_INT64
   }
   attr {
-    name: "max_outputs"
-    type: "int"
-    default_value {
-      i: 3
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
     }
-    has_minimum: true
-    minimum: 1
-  }
-  deprecation {
-    version: 15
-  }
-}
-op {
-  name: "AudioSummaryV2"
-  input_arg {
-    name: "tag"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "tensor"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "sample_rate"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "summary"
-    type: DT_STRING
   }
   attr {
-    name: "max_outputs"
-    type: "int"
+    name: "Tidx"
+    type: "type"
     default_value {
-      i: 3
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
-    has_minimum: true
-    minimum: 1
   }
 }
 op {
-  name: "AvgPool"
+  name: "ArgMax"
   input_arg {
-    name: "value"
+    name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "dimension"
+    type_attr: "Tidx"
+  }
   output_arg {
     name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    type_attr: "output_type"
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "Tidx"
+    type: "type"
     default_value {
-      s: "NHWC"
+      type: DT_INT32
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "T"
+    name: "output_type"
     type: "type"
+    default_value {
+      type: DT_INT64
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_HALF
-        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "AvgPool"
+  name: "ArgMax"
   input_arg {
-    name: "value"
+    name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "dimension"
+    type_attr: "Tidx"
+  }
   output_arg {
     name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    type_attr: "output_type"
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "Tidx"
+    type: "type"
     default_value {
-      s: "NHWC"
+      type: DT_INT32
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "T"
+    name: "output_type"
     type: "type"
+    default_value {
+      type: DT_INT64
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_HALF
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "AvgPool"
+  name: "ArgMax"
   input_arg {
-    name: "value"
+    name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "dimension"
+    type_attr: "Tidx"
+  }
   output_arg {
     name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    type_attr: "output_type"
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "Tidx"
+    type: "type"
     default_value {
-      s: "NHWC"
+      type: DT_INT32
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "T"
+    name: "output_type"
     type: "type"
+    default_value {
+      type: DT_INT64
+    }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "AvgPool3D"
+  name: "ArgMin"
   input_arg {
     name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "dimension"
+    type_attr: "Tidx"
+  }
   output_arg {
     name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    type: DT_INT64
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "T"
+    name: "Tidx"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "AvgPool3D"
+  name: "ArgMin"
   input_arg {
     name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "dimension"
+    type_attr: "Tidx"
+  }
   output_arg {
     name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    type_attr: "output_type"
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "Tidx"
+    type: "type"
     default_value {
-      s: "NDHWC"
+      type: DT_INT32
     }
     allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "T"
+    name: "output_type"
     type: "type"
+    default_value {
+      type: DT_INT64
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "AvgPool3DGrad"
+  name: "ArgMin"
   input_arg {
-    name: "orig_input_shape"
-    type: DT_INT32
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "dimension"
+    type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    type_attr: "output_type"
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "T"
+    name: "output_type"
     type: "type"
+    default_value {
+      type: DT_INT64
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "AvgPool3DGrad"
+  name: "ArgMin"
   input_arg {
-    name: "orig_input_shape"
-    type: DT_INT32
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "dimension"
+    type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    type_attr: "output_type"
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "Tidx"
+    type: "type"
     default_value {
-      s: "NDHWC"
+      type: DT_INT32
     }
     allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "T"
+    name: "output_type"
     type: "type"
+    default_value {
+      type: DT_INT64
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "AvgPoolGrad"
-  input_arg {
-    name: "orig_input_shape"
-    type: DT_INT32
-  }
+  name: "AsString"
   input_arg {
-    name: "grad"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    type: DT_STRING
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_BOOL
+        type: DT_INT8
       }
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "precision"
+    type: "int"
     default_value {
-      s: "NHWC"
+      i: -1
     }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
+  }
+  attr {
+    name: "scientific"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_HALF
-        type: DT_DOUBLE
-      }
+    name: "shortest"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "width"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "fill"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
 }
 op {
-  name: "AvgPoolGrad"
-  input_arg {
-    name: "orig_input_shape"
-    type: DT_INT32
-  }
+  name: "Asin"
   input_arg {
-    name: "grad"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
+}
+op {
+  name: "Asin"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_HALF
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "AvgPoolGrad"
-  input_arg {
-    name: "orig_input_shape"
-    type: DT_INT32
-  }
+  name: "Asinh"
   input_arg {
-    name: "grad"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
+}
+op {
+  name: "Asinh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -4190,66 +4857,116 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "Barrier"
-  output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+  name: "Assert"
+  input_arg {
+    name: "condition"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "data"
+    type_list_attr: "T"
   }
   attr {
-    name: "component_types"
+    name: "T"
     type: "list(type)"
     has_minimum: true
     minimum: 1
   }
   attr {
-    name: "shapes"
-    type: "list(shape)"
+    name: "summarize"
+    type: "int"
     default_value {
-      list {
-      }
+      i: 3
     }
-    has_minimum: true
+  }
+  is_stateful: true
+}
+op {
+  name: "Assign"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: -1
-    }
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "validate_shape"
+    type: "bool"
     default_value {
-      s: ""
+      b: true
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      s: ""
+      b: true
     }
   }
-  is_stateful: true
+  allows_uninitialized_input: true
 }
 op {
-  name: "BarrierClose"
+  name: "AssignAdd"
   input_arg {
-    name: "handle"
-    type: DT_STRING
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
     is_ref: true
   }
   attr {
-    name: "cancel_pending_enqueues"
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
@@ -4257,141 +4974,179 @@ op {
   }
 }
 op {
-  name: "BarrierIncompleteSize"
-  input_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  output_arg {
-    name: "size"
-    type: DT_INT32
-  }
-}
-op {
-  name: "BarrierInsertMany"
+  name: "AssignAdd"
   input_arg {
-    name: "handle"
-    type: DT_STRING
+    name: "ref"
+    type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "keys"
-    type: DT_STRING
+    name: "value"
+    type_attr: "T"
   }
-  input_arg {
-    name: "values"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
-    name: "component_index"
-    type: "int"
-  }
-}
-op {
-  name: "BarrierReadySize"
-  input_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  output_arg {
-    name: "size"
-    type: DT_INT32
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
 }
 op {
-  name: "BarrierTakeMany"
+  name: "AssignAdd"
   input_arg {
-    name: "handle"
-    type: DT_STRING
+    name: "ref"
+    type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "num_elements"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "indices"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "keys"
-    type: DT_STRING
+    name: "value"
+    type_attr: "T"
   }
   output_arg {
-    name: "values"
-    type_list_attr: "component_types"
-  }
-  attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
-    name: "allow_small_batch"
-    type: "bool"
-    default_value {
-      b: false
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
     }
   }
   attr {
-    name: "wait_for_incomplete"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
     }
   }
+}
+op {
+  name: "AssignAddVariableOp"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
   attr {
-    name: "timeout_ms"
-    type: "int"
-    default_value {
-      i: -1
-    }
+    name: "dtype"
+    type: "type"
   }
+  is_stateful: true
 }
 op {
-  name: "BatchCholesky"
+  name: "AssignSub"
   input_arg {
-    name: "input"
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "value"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
-  deprecation {
-    version: 13
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
 }
 op {
-  name: "BatchCholeskyGrad"
+  name: "AssignSub"
   input_arg {
-    name: "l"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "grad"
+    name: "value"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -4400,164 +5155,146 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  deprecation {
-    version: 13
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
 }
 op {
-  name: "BatchDataset"
+  name: "AssignSub"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "batch_size"
-    type: DT_INT64
+    name: "value"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "BatchDataset"
+  name: "AssignSubVariableOp"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "resource"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "value"
+    type_attr: "dtype"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
-op {
-  name: "BatchFFT"
-  input_arg {
-    name: "input"
-    type: DT_COMPLEX64
-  }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
-  }
-  deprecation {
-    version: 15
-  }
-}
-op {
-  name: "BatchFFT2D"
-  input_arg {
-    name: "input"
-    type: DT_COMPLEX64
-  }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
-  }
-  deprecation {
-    version: 15
-  }
-}
-op {
-  name: "BatchFFT3D"
-  input_arg {
-    name: "input"
-    type: DT_COMPLEX64
-  }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
-  }
-  deprecation {
-    version: 15
+    name: "dtype"
+    type: "type"
   }
+  is_stateful: true
 }
 op {
-  name: "BatchIFFT"
+  name: "AssignVariableOp"
   input_arg {
-    name: "input"
-    type: DT_COMPLEX64
-  }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
-  }
-  deprecation {
-    version: 15
+    name: "resource"
+    type: DT_RESOURCE
   }
-}
-op {
-  name: "BatchIFFT2D"
   input_arg {
-    name: "input"
-    type: DT_COMPLEX64
-  }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
+    name: "value"
+    type_attr: "dtype"
   }
-  deprecation {
-    version: 15
+  attr {
+    name: "dtype"
+    type: "type"
   }
+  is_stateful: true
 }
 op {
-  name: "BatchIFFT3D"
+  name: "Atan"
   input_arg {
-    name: "input"
-    type: DT_COMPLEX64
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type: DT_COMPLEX64
+    name: "y"
+    type_attr: "T"
   }
-  deprecation {
-    version: 15
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
 }
 op {
-  name: "BatchMatMul"
+  name: "Atan"
   input_arg {
     name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
   attr {
@@ -4566,63 +5303,54 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
+        type: DT_INT64
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
     }
   }
-  attr {
-    name: "adj_x"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "adj_y"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "BatchMatrixBandPart"
+  name: "Atan2"
   input_arg {
-    name: "input"
+    name: "y"
     type_attr: "T"
   }
   input_arg {
-    name: "num_lower"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "num_upper"
-    type: DT_INT64
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "band"
+    name: "z"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  deprecation {
-    version: 14
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
 }
 op {
-  name: "BatchMatrixDeterminant"
+  name: "Atan2"
   input_arg {
-    name: "input"
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
   attr {
@@ -4630,23 +5358,21 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  deprecation {
-    version: 13
-  }
 }
 op {
-  name: "BatchMatrixDeterminant"
+  name: "Atanh"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
   attr {
@@ -4654,6 +5380,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -4661,107 +5388,121 @@ op {
       }
     }
   }
-  deprecation {
-    version: 13
-  }
 }
 op {
-  name: "BatchMatrixDiag"
+  name: "Atanh"
   input_arg {
-    name: "diagonal"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  deprecation {
-    version: 14
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
 }
 op {
-  name: "BatchMatrixDiagPart"
+  name: "AudioSpectrogram"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "diagonal"
-    type_attr: "T"
+    name: "spectrogram"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "window_size"
+    type: "int"
   }
-  deprecation {
-    version: 14
+  attr {
+    name: "stride"
+    type: "int"
+  }
+  attr {
+    name: "magnitude_squared"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
 }
 op {
-  name: "BatchMatrixInverse"
+  name: "AudioSummary"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "summary"
+    type: DT_STRING
   }
   attr {
-    name: "adjoint"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "sample_rate"
+    type: "float"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-      }
+    name: "max_outputs"
+    type: "int"
+    default_value {
+      i: 3
     }
+    has_minimum: true
+    minimum: 1
   }
   deprecation {
-    version: 13
+    version: 15
   }
 }
 op {
-  name: "BatchMatrixSetDiag"
+  name: "AudioSummaryV2"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "tag"
+    type: DT_STRING
   }
   input_arg {
-    name: "diagonal"
-    type_attr: "T"
+    name: "tensor"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sample_rate"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "summary"
+    type: DT_STRING
   }
   attr {
-    name: "T"
-    type: "type"
-  }
-  deprecation {
-    version: 14
+    name: "max_outputs"
+    type: "int"
+    default_value {
+      i: 3
+    }
+    has_minimum: true
+    minimum: 1
   }
 }
 op {
-  name: "BatchMatrixSolve"
-  input_arg {
-    name: "matrix"
-    type_attr: "T"
-  }
+  name: "AvgPool"
   input_arg {
-    name: "rhs"
+    name: "value"
     type_attr: "T"
   }
   output_arg {
@@ -4769,10 +5510,38 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "adjoint"
-    type: "bool"
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
     default_value {
-      b: false
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
   attr {
@@ -4780,62 +5549,74 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
       }
     }
   }
-  deprecation {
-    version: 13
-  }
 }
 op {
-  name: "BatchMatrixSolveLs"
-  input_arg {
-    name: "matrix"
-    type_attr: "T"
-  }
+  name: "AvgPool"
   input_arg {
-    name: "rhs"
+    name: "value"
     type_attr: "T"
   }
-  input_arg {
-    name: "l2_regularizer"
-    type: DT_DOUBLE
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "fast"
-    type: "bool"
+    name: "data_format"
+    type: "string"
     default_value {
-      b: true
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
-  deprecation {
-    version: 13
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_HALF
+      }
+    }
   }
 }
 op {
-  name: "BatchMatrixTriangularSolve"
-  input_arg {
-    name: "matrix"
-    type_attr: "T"
-  }
+  name: "AvgPool"
   input_arg {
-    name: "rhs"
+    name: "value"
     type_attr: "T"
   }
   output_arg {
@@ -4843,17 +5624,38 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "lower"
-    type: "bool"
-    default_value {
-      b: true
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "adjoint"
-    type: "bool"
+    name: "data_format"
+    type: "string"
     default_value {
-      b: false
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
   attr {
@@ -4861,255 +5663,103 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
+        type: DT_HALF
         type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
-  deprecation {
-    version: 13
-  }
 }
 op {
-  name: "BatchNormWithGlobalNormalization"
-  input_arg {
-    name: "t"
-    type_attr: "T"
-  }
+  name: "AvgPool"
   input_arg {
-    name: "m"
+    name: "value"
     type_attr: "T"
   }
-  input_arg {
-    name: "v"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
-  input_arg {
-    name: "beta"
-    type_attr: "T"
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
-  input_arg {
-    name: "gamma"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "result"
-    type_attr: "T"
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "variance_epsilon"
-    type: "float"
-  }
-  attr {
-    name: "scale_after_normalization"
-    type: "bool"
-  }
-  deprecation {
-    version: 9
-  }
-}
-op {
-  name: "BatchNormWithGlobalNormalization"
-  input_arg {
-    name: "t"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "m"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "v"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "beta"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "gamma"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "result"
-    type_attr: "T"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "variance_epsilon"
-    type: "float"
-  }
-  attr {
-    name: "scale_after_normalization"
-    type: "bool"
-  }
-  deprecation {
-    version: 9
-  }
 }
 op {
-  name: "BatchNormWithGlobalNormalizationGrad"
-  input_arg {
-    name: "t"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "m"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "v"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "gamma"
-    type_attr: "T"
-  }
+  name: "AvgPool3D"
   input_arg {
-    name: "backprop"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "dx"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "dm"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "dv"
+    name: "output"
     type_attr: "T"
   }
-  output_arg {
-    name: "db"
-    type_attr: "T"
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
-  output_arg {
-    name: "dg"
-    type_attr: "T"
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
-  attr {
-    name: "variance_epsilon"
-    type: "float"
-  }
-  attr {
-    name: "scale_after_normalization"
-    type: "bool"
-  }
-  deprecation {
-    version: 9
-  }
-}
-op {
-  name: "BatchNormWithGlobalNormalizationGrad"
-  input_arg {
-    name: "t"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "m"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "v"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "gamma"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "backprop"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "dx"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "dm"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "dv"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "db"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "dg"
-    type_attr: "T"
-  }
   attr {
     name: "T"
     type: "type"
@@ -5117,37 +5767,12 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "variance_epsilon"
-    type: "float"
-  }
-  attr {
-    name: "scale_after_normalization"
-    type: "bool"
-  }
-  deprecation {
-    version: 9
-  }
 }
 op {
-  name: "BatchSelfAdjointEig"
+  name: "AvgPool3D"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -5157,38 +5782,38 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
-  deprecation {
-    version: 11
-  }
-}
-op {
-  name: "BatchSelfAdjointEigV2"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "e"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "v"
-    type_attr: "T"
-  }
   attr {
-    name: "compute_v"
-    type: "bool"
+    name: "data_format"
+    type: "string"
     default_value {
-      b: true
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
     }
   }
   attr {
@@ -5196,45 +5821,55 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
-  deprecation {
-    version: 13
-  }
 }
 op {
-  name: "BatchSvd"
+  name: "AvgPool3D"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "s"
+    name: "output"
     type_attr: "T"
   }
-  output_arg {
-    name: "u"
-    type_attr: "T"
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
-  output_arg {
-    name: "v"
-    type_attr: "T"
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "compute_uv"
-    type: "bool"
-    default_value {
-      b: true
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "full_matrices"
-    type: "bool"
+    name: "data_format"
+    type: "string"
     default_value {
-      b: false
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
     }
   }
   attr {
@@ -5242,122 +5877,109 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
+        type: DT_BFLOAT16
         type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_DOUBLE
       }
     }
   }
-  deprecation {
-    version: 13
-  }
 }
 op {
-  name: "BatchToSpace"
+  name: "AvgPool3DGrad"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "orig_input_shape"
+    type: DT_INT32
   }
   input_arg {
-    name: "crops"
-    type_attr: "Tidx"
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "block_size"
-    type: "int"
+    name: "strides"
+    type: "list(int)"
     has_minimum: true
-    minimum: 2
+    minimum: 5
   }
   attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "BatchToSpaceND"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
+  name: "AvgPool3DGrad"
   input_arg {
-    name: "block_shape"
-    type_attr: "Tblock_shape"
+    name: "orig_input_shape"
+    type: DT_INT32
   }
   input_arg {
-    name: "crops"
-    type_attr: "Tcrops"
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "Tblock_shape"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "Tcrops"
-    type: "type"
+    name: "data_format"
+    type: "string"
     default_value {
-      type: DT_INT32
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
-}
-op {
-  name: "Betainc"
-  input_arg {
-    name: "a"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "b"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type_attr: "T"
-  }
   attr {
     name: "T"
     type: "type"
@@ -5370,13 +5992,13 @@ op {
   }
 }
 op {
-  name: "BiasAdd"
+  name: "AvgPool3DGrad"
   input_arg {
-    name: "value"
-    type_attr: "T"
+    name: "orig_input_shape"
+    type: DT_INT32
   }
   input_arg {
-    name: "bias"
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -5384,24 +6006,24 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
@@ -5409,24 +6031,35 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NHWC"
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "BiasAdd"
+  name: "AvgPoolGrad"
   input_arg {
-    name: "value"
-    type_attr: "T"
+    name: "orig_input_shape"
+    type: DT_INT32
   }
   input_arg {
-    name: "bias"
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -5434,26 +6067,24 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
@@ -5470,11 +6101,26 @@ op {
       }
     }
   }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+      }
+    }
+  }
 }
 op {
-  name: "BiasAddGrad"
+  name: "AvgPoolGrad"
   input_arg {
-    name: "out_backprop"
+    name: "orig_input_shape"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -5482,24 +6128,24 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
@@ -5516,11 +6162,26 @@ op {
       }
     }
   }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_HALF
+      }
+    }
+  }
 }
 op {
-  name: "BiasAddGrad"
+  name: "AvgPoolGrad"
   input_arg {
-    name: "out_backprop"
+    name: "orig_input_shape"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -5528,26 +6189,24 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
@@ -5564,240 +6223,275 @@ op {
       }
     }
   }
-}
-op {
-  name: "BiasAddV1"
-  input_arg {
-    name: "value"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "bias"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "BiasAddV1"
+  name: "AvgPoolGrad"
   input_arg {
-    name: "value"
-    type_attr: "T"
+    name: "orig_input_shape"
+    type: DT_INT32
   }
   input_arg {
-    name: "bias"
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "Bincount"
+  name: "Barrier"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "BarrierClose"
   input_arg {
-    name: "arr"
-    type: DT_INT32
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
   }
+  attr {
+    name: "cancel_pending_enqueues"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "BarrierIncompleteSize"
   input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
     name: "size"
     type: DT_INT32
   }
+}
+op {
+  name: "BarrierInsertMany"
   input_arg {
-    name: "weights"
-    type_attr: "T"
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
   }
-  output_arg {
-    name: "bins"
+  input_arg {
+    name: "keys"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "values"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+  }
+  attr {
+    name: "component_index"
+    type: "int"
   }
 }
 op {
-  name: "Bitcast"
+  name: "BarrierReadySize"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
   }
   output_arg {
-    name: "output"
-    type_attr: "type"
+    name: "size"
+    type: DT_INT32
+  }
+}
+op {
+  name: "BarrierTakeMany"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "num_elements"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "keys"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "component_types"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-      }
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "allow_small_batch"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   attr {
-    name: "type"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-      }
+    name: "wait_for_incomplete"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
     }
   }
 }
 op {
-  name: "Bitcast"
+  name: "BatchCholesky"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    type_attr: "type"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
-        type: DT_HALF
-      }
-    }
-  }
-  attr {
-    name: "type"
-    type: "type"
-    allowed_values {
-      list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
+  deprecation {
+    version: 13
+  }
 }
 op {
-  name: "BitwiseAnd"
+  name: "BatchCholeskyGrad"
   input_arg {
-    name: "x"
+    name: "l"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -5805,51 +6499,156 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
-  is_commutative: true
+  deprecation {
+    version: 13
+  }
 }
 op {
-  name: "BitwiseAnd"
+  name: "BatchDataset"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "batch_size"
+    type: DT_INT64
   }
   output_arg {
-    name: "z"
-    type_attr: "T"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
-  is_commutative: true
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
 }
 op {
-  name: "BitwiseOr"
+  name: "BatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "BatchFFT"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+  deprecation {
+    version: 15
+  }
+}
+op {
+  name: "BatchFFT2D"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+  deprecation {
+    version: 15
+  }
+}
+op {
+  name: "BatchFFT3D"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+  deprecation {
+    version: 15
+  }
+}
+op {
+  name: "BatchIFFT"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+  deprecation {
+    version: 15
+  }
+}
+op {
+  name: "BatchIFFT2D"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+  deprecation {
+    version: 15
+  }
+}
+op {
+  name: "BatchIFFT3D"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+  deprecation {
+    version: 15
+  }
+}
+op {
+  name: "BatchMatMul"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -5859,7 +6658,7 @@ op {
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -5867,19 +6666,32 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  is_commutative: true
+  attr {
+    name: "adj_x"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "adj_y"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "BitwiseOr"
+  name: "BatchMatMul"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -5889,7 +6701,7 @@ op {
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -5897,61 +6709,65 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  is_commutative: true
+  attr {
+    name: "adj_x"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "adj_y"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "BitwiseXor"
+  name: "BatchMatrixBandPart"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "num_lower"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_upper"
+    type: DT_INT64
   }
   output_arg {
-    name: "z"
+    name: "band"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
-      }
-    }
   }
-  is_commutative: true
+  deprecation {
+    version: 14
+  }
 }
 op {
-  name: "BitwiseXor"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
+  name: "BatchMatrixDeterminant"
   input_arg {
-    name: "y"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -5959,360 +6775,330 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
-  is_commutative: true
+  deprecation {
+    version: 13
+  }
 }
 op {
-  name: "BroadcastArgs"
-  input_arg {
-    name: "s0"
-    type_attr: "T"
-  }
+  name: "BatchMatrixDeterminant"
   input_arg {
-    name: "s1"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "r0"
+    name: "output"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
+  deprecation {
+    version: 13
+  }
 }
 op {
-  name: "BroadcastGradientArgs"
+  name: "BatchMatrixDiag"
   input_arg {
-    name: "s0"
+    name: "diagonal"
     type_attr: "T"
   }
-  input_arg {
-    name: "s1"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
-  output_arg {
-    name: "r0"
+  attr {
+    name: "T"
+    type: "type"
+  }
+  deprecation {
+    version: 14
+  }
+}
+op {
+  name: "BatchMatrixDiagPart"
+  input_arg {
+    name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "r1"
+    name: "diagonal"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  }
+  deprecation {
+    version: 14
   }
 }
 op {
-  name: "Bucketize"
+  name: "BatchMatrixInverse"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    type: DT_INT32
+    type_attr: "T"
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_FLOAT
       }
     }
   }
-  attr {
-    name: "boundaries"
-    type: "list(float)"
+  deprecation {
+    version: 13
   }
 }
 op {
-  name: "CTCBeamSearchDecoder"
+  name: "BatchMatrixSetDiag"
   input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "sequence_length"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "decoded_indices"
-    type: DT_INT64
-    number_attr: "top_paths"
-  }
-  output_arg {
-    name: "decoded_values"
-    type: DT_INT64
-    number_attr: "top_paths"
-  }
-  output_arg {
-    name: "decoded_shape"
-    type: DT_INT64
-    number_attr: "top_paths"
+    name: "diagonal"
+    type_attr: "T"
   }
   output_arg {
-    name: "log_probability"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "beam_width"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "top_paths"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
   }
-  attr {
-    name: "merge_repeated"
-    type: "bool"
-    default_value {
-      b: true
-    }
+  deprecation {
+    version: 14
   }
 }
 op {
-  name: "CTCGreedyDecoder"
+  name: "BatchMatrixSolve"
   input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+    name: "matrix"
+    type_attr: "T"
   }
   input_arg {
-    name: "sequence_length"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "decoded_indices"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "decoded_values"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "decoded_shape"
-    type: DT_INT64
+    name: "rhs"
+    type_attr: "T"
   }
   output_arg {
-    name: "log_probability"
-    type: DT_FLOAT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "merge_repeated"
+    name: "adjoint"
     type: "bool"
     default_value {
       b: false
     }
   }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+  deprecation {
+    version: 13
+  }
 }
 op {
-  name: "CTCLoss"
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
-  }
+  name: "BatchMatrixSolveLs"
   input_arg {
-    name: "labels_indices"
-    type: DT_INT64
+    name: "matrix"
+    type_attr: "T"
   }
   input_arg {
-    name: "labels_values"
-    type: DT_INT32
+    name: "rhs"
+    type_attr: "T"
   }
   input_arg {
-    name: "sequence_length"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "loss"
-    type: DT_FLOAT
+    name: "l2_regularizer"
+    type: DT_DOUBLE
   }
   output_arg {
-    name: "gradient"
-    type: DT_FLOAT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "preprocess_collapse_repeated"
-    type: "bool"
-    default_value {
-      b: false
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
     }
   }
   attr {
-    name: "ctc_merge_repeated"
+    name: "fast"
     type: "bool"
     default_value {
       b: true
     }
   }
+  deprecation {
+    version: 13
+  }
 }
 op {
-  name: "CTCLoss"
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "labels_indices"
-    type: DT_INT64
-  }
+  name: "BatchMatrixTriangularSolve"
   input_arg {
-    name: "labels_values"
-    type: DT_INT32
+    name: "matrix"
+    type_attr: "T"
   }
   input_arg {
-    name: "sequence_length"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "loss"
-    type: DT_FLOAT
+    name: "rhs"
+    type_attr: "T"
   }
   output_arg {
-    name: "gradient"
-    type: DT_FLOAT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "preprocess_collapse_repeated"
+    name: "lower"
     type: "bool"
     default_value {
-      b: false
+      b: true
     }
   }
   attr {
-    name: "ctc_merge_repeated"
+    name: "adjoint"
     type: "bool"
     default_value {
-      b: true
+      b: false
     }
   }
   attr {
-    name: "ignore_longer_outputs_than_inputs"
-    type: "bool"
-    default_value {
-      b: false
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
     }
   }
+  deprecation {
+    version: 13
+  }
 }
 op {
-  name: "CacheDataset"
+  name: "BatchNormWithGlobalNormalization"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "t"
+    type_attr: "T"
   }
   input_arg {
-    name: "filename"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "m"
+    type_attr: "T"
   }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "v"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "CacheDataset"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "beta"
+    type_attr: "T"
   }
   input_arg {
-    name: "filename"
-    type: DT_STRING
+    name: "gamma"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "result"
+    type_attr: "T"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
+    type: "bool"
+  }
+  deprecation {
+    version: 9
   }
 }
 op {
-  name: "Cast"
+  name: "BatchNormWithGlobalNormalization"
   input_arg {
-    name: "x"
-    type_attr: "SrcT"
+    name: "t"
+    type_attr: "T"
   }
-  output_arg {
-    name: "y"
-    type_attr: "DstT"
+  input_arg {
+    name: "m"
+    type_attr: "T"
   }
-  attr {
-    name: "SrcT"
-    type: "type"
+  input_arg {
+    name: "v"
+    type_attr: "T"
   }
-  attr {
-    name: "DstT"
-    type: "type"
+  input_arg {
+    name: "beta"
+    type_attr: "T"
   }
-}
-op {
-  name: "Ceil"
   input_arg {
-    name: "x"
+    name: "gamma"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "result"
     type_attr: "T"
   }
   attr {
@@ -6320,21 +7106,61 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
+    type: "bool"
+  }
+  deprecation {
+    version: 9
+  }
 }
 op {
-  name: "CheckNumerics"
+  name: "BatchNormWithGlobalNormalization"
   input_arg {
-    name: "tensor"
+    name: "t"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "gamma"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "result"
     type_attr: "T"
   }
   attr {
@@ -6342,46 +7168,78 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "message"
-    type: "string"
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
+    type: "bool"
+  }
+  deprecation {
+    version: 9
   }
 }
 op {
-  name: "Cholesky"
+  name: "BatchNormWithGlobalNormalizationGrad"
   input_arg {
-    name: "input"
+    name: "t"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "m"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-      }
-    }
+  input_arg {
+    name: "v"
+    type_attr: "T"
   }
-}
-op {
-  name: "Cholesky"
   input_arg {
-    name: "input"
+    name: "gamma"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "backprop"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "dx"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dm"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dv"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "db"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dg"
     type_attr: "T"
   }
   attr {
@@ -6389,26 +7247,75 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
+  attr {
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
+    type: "bool"
+  }
+  deprecation {
+    version: 9
+  }
 }
 op {
-  name: "CholeskyGrad"
+  name: "BatchNormWithGlobalNormalizationGrad"
   input_arg {
-    name: "l"
+    name: "t"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "m"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "gamma"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "backprop"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "dx"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dm"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dv"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "db"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dg"
     type_attr: "T"
   }
   attr {
@@ -6418,236 +7325,296 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
+    type: "bool"
+  }
+  deprecation {
+    version: 9
+  }
 }
 op {
-  name: "CompareAndBitpack"
+  name: "BatchNormWithGlobalNormalizationGrad"
   input_arg {
-    name: "input"
+    name: "t"
     type_attr: "T"
   }
   input_arg {
-    name: "threshold"
+    name: "m"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "gamma"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "backprop"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type: DT_UINT8
+    name: "dx"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dm"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dv"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "db"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dg"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_BOOL
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
+  attr {
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
+    type: "bool"
+  }
+  deprecation {
+    version: 9
+  }
 }
 op {
-  name: "Complex"
-  input_arg {
-    name: "real"
-    type_attr: "T"
-  }
+  name: "BatchSelfAdjointEig"
   input_arg {
-    name: "imag"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "out"
-    type_attr: "Tout"
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
-        type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_FLOAT
       }
     }
   }
-  attr {
-    name: "Tout"
-    type: "type"
-    default_value {
-      type: DT_COMPLEX64
-    }
-    allowed_values {
-      list {
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+  deprecation {
+    version: 11
   }
 }
 op {
-  name: "ComplexAbs"
+  name: "BatchSelfAdjointEigV2"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
-    type_attr: "Tout"
+    name: "e"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "compute_v"
+    type: "bool"
     default_value {
-      type: DT_COMPLEX64
-    }
-    allowed_values {
-      list {
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+      b: true
     }
   }
   attr {
-    name: "Tout"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
-        type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_FLOAT
       }
     }
   }
+  deprecation {
+    version: 13
+  }
 }
 op {
-  name: "ComputeAccidentalHits"
-  input_arg {
-    name: "true_classes"
-    type: DT_INT64
-  }
+  name: "BatchSvd"
   input_arg {
-    name: "sampled_candidates"
-    type: DT_INT64
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "s"
+    type_attr: "T"
   }
   output_arg {
-    name: "ids"
-    type: DT_INT64
+    name: "u"
+    type_attr: "T"
   }
   output_arg {
-    name: "weights"
-    type: DT_FLOAT
+    name: "v"
+    type_attr: "T"
   }
   attr {
-    name: "num_true"
-    type: "int"
+    name: "compute_uv"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
-    name: "seed"
-    type: "int"
+    name: "full_matrices"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
   }
   attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
   }
+  deprecation {
+    version: 13
+  }
 }
 op {
-  name: "Concat"
+  name: "BatchToSpace"
   input_arg {
-    name: "concat_dim"
-    type: DT_INT32
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "values"
-    type_attr: "T"
-    number_attr: "N"
+    name: "crops"
+    type_attr: "Tidx"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 2
-  }
   attr {
     name: "T"
     type: "type"
   }
-}
-op {
-  name: "ConcatOffset"
-  input_arg {
-    name: "concat_dim"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "shape"
-    type: DT_INT32
-    number_attr: "N"
-  }
-  output_arg {
-    name: "offset"
-    type: DT_INT32
-    number_attr: "N"
-  }
   attr {
-    name: "N"
+    name: "block_size"
     type: "int"
     has_minimum: true
     minimum: 2
   }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
-  name: "ConcatV2"
+  name: "BatchToSpaceND"
   input_arg {
-    name: "values"
+    name: "input"
     type_attr: "T"
-    number_attr: "N"
   }
   input_arg {
-    name: "axis"
-    type_attr: "Tidx"
+    name: "block_shape"
+    type_attr: "Tblock_shape"
+  }
+  input_arg {
+    name: "crops"
+    type_attr: "Tcrops"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 2
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "T"
+    name: "Tblock_shape"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
   attr {
-    name: "Tidx"
+    name: "Tcrops"
     type: "type"
     default_value {
       type: DT_INT32
@@ -6661,69 +7628,50 @@ op {
   }
 }
 op {
-  name: "ConcatenateDataset"
+  name: "Betainc"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "a"
+    type_attr: "T"
   }
   input_arg {
-    name: "another_dataset"
-    type: DT_VARIANT
+    name: "b"
+    type_attr: "T"
   }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
+  input_arg {
+    name: "x"
+    type_attr: "T"
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "z"
+    type_attr: "T"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "ConcatenateDataset"
+  name: "BiasAdd"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "value"
+    type_attr: "T"
   }
   input_arg {
-    name: "another_dataset"
-    type: DT_VARIANT
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "bias"
+    type_attr: "T"
   }
-}
-op {
-  name: "ConditionalAccumulator"
   output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
@@ -6745,34 +7693,35 @@ op {
     }
   }
   attr {
-    name: "shape"
-    type: "shape"
-  }
-  attr {
-    name: "container"
+    name: "data_format"
     type: "string"
     default_value {
-      s: ""
+      s: "NHWC"
     }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ConditionalAccumulator"
+  name: "BiasAdd"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "T"
+  }
   output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
@@ -6796,53 +7745,27 @@ op {
     }
   }
   attr {
-    name: "shape"
-    type: "shape"
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
+    name: "data_format"
     type: "string"
     default_value {
-      s: ""
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "Conj"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_COMPLEX64
+      s: "NHWC"
     }
     allowed_values {
       list {
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
 }
 op {
-  name: "Conj"
+  name: "BiasAdd"
   input_arg {
-    name: "input"
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "bias"
     type_attr: "T"
   }
   output_arg {
@@ -6852,76 +7775,46 @@ op {
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_COMPLEX64
-    }
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_VARIANT
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
-}
-op {
-  name: "ConjugateTranspose"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "perm"
-    type_attr: "Tperm"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
   attr {
-    name: "Tperm"
-    type: "type"
+    name: "data_format"
+    type: "string"
     default_value {
-      type: DT_INT32
+      s: "NHWC"
     }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
 }
 op {
-  name: "Const"
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
-  }
-  attr {
-    name: "value"
-    type: "tensor"
-  }
-  attr {
-    name: "dtype"
-    type: "type"
-  }
-}
-op {
-  name: "ControlTrigger"
-}
-op {
-  name: "Conv2D"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
+  name: "BiasAddGrad"
   input_arg {
-    name: "filter"
+    name: "out_backprop"
     type_attr: "T"
   }
   output_arg {
@@ -6933,29 +7826,20 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
-      }
-    }
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-  }
-  attr {
-    name: "use_cudnn_on_gpu"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
@@ -6974,15 +7858,7 @@ op {
   }
 }
 op {
-  name: "Conv2DBackpropFilter"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "filter_sizes"
-    type: DT_INT32
-  }
+  name: "BiasAddGrad"
   input_arg {
     name: "out_backprop"
     type_attr: "T"
@@ -6996,29 +7872,22 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
-      }
-    }
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-  }
-  attr {
-    name: "use_cudnn_on_gpu"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -7037,15 +7906,7 @@ op {
   }
 }
 op {
-  name: "Conv2DBackpropInput"
-  input_arg {
-    name: "input_sizes"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "filter"
-    type_attr: "T"
-  }
+  name: "BiasAddGrad"
   input_arg {
     name: "out_backprop"
     type_attr: "T"
@@ -7059,29 +7920,23 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
-      }
-    }
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-  }
-  attr {
-    name: "use_cudnn_on_gpu"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -7100,13 +7955,13 @@ op {
   }
 }
 op {
-  name: "Conv3D"
+  name: "BiasAddV1"
   input_arg {
-    name: "input"
+    name: "value"
     type_attr: "T"
   }
   input_arg {
-    name: "filter"
+    name: "bias"
     type_attr: "T"
   }
   output_arg {
@@ -7120,34 +7975,69 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+}
+op {
+  name: "BiasAddV1"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "Conv3D"
+  name: "BiasAddV1"
   input_arg {
-    name: "input"
+    name: "value"
     type_attr: "T"
   }
   input_arg {
-    name: "filter"
+    name: "bias"
     type_attr: "T"
   }
   output_arg {
@@ -7161,55 +8051,41 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-      }
-    }
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "Conv3DBackpropFilter"
+  name: "Bincount"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "arr"
+    type: DT_INT32
   }
   input_arg {
-    name: "filter"
-    type_attr: "T"
+    name: "size"
+    type: DT_INT32
   }
   input_arg {
-    name: "out_backprop"
+    name: "weights"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "bins"
     type_attr: "T"
   }
   attr {
@@ -7217,48 +8093,23 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_INT32
+        type: DT_INT64
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  deprecation {
-    version: 10
-  }
 }
 op {
-  name: "Conv3DBackpropFilterV2"
+  name: "Bitcast"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "filter_sizes"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "out_backprop"
-    type_attr: "T"
-  }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "type"
   }
   attr {
     name: "T"
@@ -7267,43 +8118,53 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
+    name: "type"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "Conv3DBackpropFilterV2"
+  name: "Bitcast"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "filter_sizes"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "out_backprop"
-    type_attr: "T"
-  }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "type"
   }
   attr {
     name: "T"
@@ -7312,103 +8173,121 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NDHWC"
-    }
+    name: "type"
+    type: "type"
     allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "Conv3DBackpropInput"
+  name: "Bitcast"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "filter"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "out_backprop"
-    type_attr: "T"
-  }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "type"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
+    name: "type"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
-  deprecation {
-    version: 10
-  }
 }
 op {
-  name: "Conv3DBackpropInputV2"
-  input_arg {
-    name: "input_sizes"
-    type: DT_INT32
-  }
+  name: "BitwiseAnd"
   input_arg {
-    name: "filter"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "out_backprop"
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
   attr {
@@ -7416,44 +8295,29 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
       }
     }
   }
+  is_commutative: true
 }
 op {
-  name: "Conv3DBackpropInputV2"
-  input_arg {
-    name: "input_sizes"
-    type: DT_INT32
-  }
+  name: "BitwiseAnd"
   input_arg {
-    name: "filter"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "out_backprop"
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
   attr {
@@ -7461,211 +8325,212 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
+  is_commutative: true
 }
 op {
-  name: "Copy"
+  name: "BitwiseOr"
   input_arg {
-    name: "input"
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+      }
     }
   }
-  allows_uninitialized_input: true
+  is_commutative: true
 }
 op {
-  name: "Copy"
+  name: "BitwiseOr"
   input_arg {
-    name: "input"
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "debug_ops_spec"
-    type: "list(string)"
-    default_value {
+    allowed_values {
       list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  allows_uninitialized_input: true
+  is_commutative: true
 }
 op {
-  name: "CopyHost"
+  name: "BitwiseXor"
   input_arg {
-    name: "input"
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+      }
     }
   }
-  allows_uninitialized_input: true
+  is_commutative: true
 }
 op {
-  name: "CopyHost"
+  name: "BitwiseXor"
   input_arg {
-    name: "input"
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "debug_ops_spec"
-    type: "list(string)"
-    default_value {
+    allowed_values {
       list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  allows_uninitialized_input: true
+  is_commutative: true
 }
 op {
-  name: "Cos"
+  name: "BroadcastArgs"
   input_arg {
-    name: "x"
+    name: "s0"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "s1"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "r0"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "Cosh"
+  name: "BroadcastGradientArgs"
   input_arg {
-    name: "x"
+    name: "s0"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "s1"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "r0"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "r1"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "CountUpTo"
+  name: "Bucketize"
   input_arg {
-    name: "ref"
+    name: "input"
     type_attr: "T"
-    is_ref: true
   }
   output_arg {
     name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "limit"
-    type: "int"
+    type: DT_INT32
   }
   attr {
     name: "T"
@@ -7674,247 +8539,365 @@ op {
       list {
         type: DT_INT32
         type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
+  attr {
+    name: "boundaries"
+    type: "list(float)"
+  }
 }
 op {
-  name: "CropAndResize"
+  name: "BytesProducedStatsDataset"
   input_arg {
-    name: "image"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "boxes"
-    type: DT_FLOAT
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
+}
+op {
+  name: "CTCBeamSearchDecoder"
   input_arg {
-    name: "box_ind"
-    type: DT_INT32
+    name: "inputs"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "crop_size"
+    name: "sequence_length"
     type: DT_INT32
   }
   output_arg {
-    name: "crops"
+    name: "decoded_indices"
+    type: DT_INT64
+    number_attr: "top_paths"
+  }
+  output_arg {
+    name: "decoded_values"
+    type: DT_INT64
+    number_attr: "top_paths"
+  }
+  output_arg {
+    name: "decoded_shape"
+    type: DT_INT64
+    number_attr: "top_paths"
+  }
+  output_arg {
+    name: "log_probability"
     type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+    name: "beam_width"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "method"
-    type: "string"
+    name: "top_paths"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "merge_repeated"
+    type: "bool"
     default_value {
-      s: "bilinear"
-    }
-    allowed_values {
-      list {
-        s: "bilinear"
-      }
+      b: true
     }
   }
+}
+op {
+  name: "CTCGreedyDecoder"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sequence_length"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "decoded_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "decoded_values"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "decoded_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "log_probability"
+    type: DT_FLOAT
+  }
   attr {
-    name: "extrapolation_value"
-    type: "float"
+    name: "merge_repeated"
+    type: "bool"
     default_value {
-      f: 0
+      b: false
     }
   }
 }
 op {
-  name: "CropAndResize"
+  name: "CTCLoss"
   input_arg {
-    name: "image"
-    type_attr: "T"
+    name: "inputs"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "boxes"
-    type: DT_FLOAT
+    name: "labels_indices"
+    type: DT_INT64
   }
   input_arg {
-    name: "box_ind"
+    name: "labels_values"
     type: DT_INT32
   }
   input_arg {
-    name: "crop_size"
+    name: "sequence_length"
     type: DT_INT32
   }
   output_arg {
-    name: "crops"
+    name: "loss"
     type: DT_FLOAT
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+  output_arg {
+    name: "gradient"
+    type: DT_FLOAT
   }
   attr {
-    name: "method"
-    type: "string"
+    name: "preprocess_collapse_repeated"
+    type: "bool"
     default_value {
-      s: "bilinear"
-    }
-    allowed_values {
-      list {
-        s: "bilinear"
-      }
+      b: false
     }
   }
   attr {
-    name: "extrapolation_value"
-    type: "float"
+    name: "ctc_merge_repeated"
+    type: "bool"
     default_value {
-      f: 0
+      b: true
     }
   }
 }
 op {
-  name: "CropAndResizeGradBoxes"
+  name: "CTCLoss"
   input_arg {
-    name: "grads"
+    name: "inputs"
     type: DT_FLOAT
   }
   input_arg {
-    name: "image"
-    type_attr: "T"
+    name: "labels_indices"
+    type: DT_INT64
   }
   input_arg {
-    name: "boxes"
-    type: DT_FLOAT
+    name: "labels_values"
+    type: DT_INT32
   }
   input_arg {
-    name: "box_ind"
+    name: "sequence_length"
     type: DT_INT32
   }
   output_arg {
-    name: "output"
+    name: "loss"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient"
     type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
+    name: "preprocess_collapse_repeated"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   attr {
-    name: "method"
-    type: "string"
+    name: "ctc_merge_repeated"
+    type: "bool"
     default_value {
-      s: "bilinear"
+      b: true
     }
-    allowed_values {
-      list {
-        s: "bilinear"
-      }
+  }
+  attr {
+    name: "ignore_longer_outputs_than_inputs"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
 }
 op {
-  name: "CropAndResizeGradBoxes"
+  name: "CacheDataset"
   input_arg {
-    name: "grads"
-    type: DT_FLOAT
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "image"
-    type_attr: "T"
+    name: "filename"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
+  is_stateful: true
+}
+op {
+  name: "CacheDataset"
   input_arg {
-    name: "boxes"
-    type: DT_FLOAT
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "box_ind"
-    type: DT_INT32
+    name: "filename"
+    type: DT_STRING
   }
   output_arg {
-    name: "output"
-    type: DT_FLOAT
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "Cast"
+  input_arg {
+    name: "x"
+    type_attr: "SrcT"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "DstT"
+  }
+  attr {
+    name: "SrcT"
+    type: "type"
+  }
+  attr {
+    name: "DstT"
+    type: "type"
+  }
+}
+op {
+  name: "Ceil"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
+}
+op {
+  name: "Ceil"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
   attr {
-    name: "method"
-    type: "string"
-    default_value {
-      s: "bilinear"
-    }
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "bilinear"
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "CropAndResizeGradImage"
+  name: "CheckNumerics"
   input_arg {
-    name: "grads"
-    type: DT_FLOAT
+    name: "tensor"
+    type_attr: "T"
   }
-  input_arg {
-    name: "boxes"
-    type: DT_FLOAT
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
-  input_arg {
-    name: "box_ind"
-    type: DT_INT32
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "message"
+    type: "string"
   }
+}
+op {
+  name: "CheckNumerics"
   input_arg {
-    name: "image_size"
-    type: DT_INT32
+    name: "tensor"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
@@ -7925,37 +8908,74 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
         type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "method"
+    name: "message"
     type: "string"
-    default_value {
-      s: "bilinear"
+  }
+}
+op {
+  name: "Cholesky"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
     }
+  }
+}
+op {
+  name: "Cholesky"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "bilinear"
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "Cross"
+  name: "CholeskyGrad"
   input_arg {
-    name: "a"
+    name: "l"
     type_attr: "T"
   }
   input_arg {
-    name: "b"
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
-    name: "product"
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -7965,296 +8985,233 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "Cross"
+  name: "CompareAndBitpack"
   input_arg {
-    name: "a"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "b"
+    name: "threshold"
     type_attr: "T"
   }
   output_arg {
-    name: "product"
-    type_attr: "T"
+    name: "output"
+    type: DT_UINT8
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_BOOL
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "Cumprod"
+  name: "Complex"
   input_arg {
-    name: "x"
+    name: "real"
     type_attr: "T"
   }
   input_arg {
-    name: "axis"
-    type_attr: "Tidx"
+    name: "imag"
+    type_attr: "T"
   }
   output_arg {
     name: "out"
-    type_attr: "T"
-  }
-  attr {
-    name: "exclusive"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "reverse"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    type_attr: "Tout"
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
   attr {
-    name: "Tidx"
+    name: "Tout"
     type: "type"
     default_value {
-      type: DT_INT32
+      type: DT_COMPLEX64
     }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "Cumprod"
+  name: "ComplexAbs"
   input_arg {
     name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "axis"
-    type_attr: "Tidx"
-  }
   output_arg {
-    name: "out"
-    type_attr: "T"
+    name: "y"
+    type_attr: "Tout"
   }
   attr {
-    name: "exclusive"
-    type: "bool"
+    name: "T"
+    type: "type"
     default_value {
-      b: false
+      type: DT_COMPLEX64
     }
-  }
-  attr {
-    name: "reverse"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "T"
-    type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "Tidx"
+    name: "Tout"
     type: "type"
     default_value {
-      type: DT_INT32
+      type: DT_FLOAT
     }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "Cumsum"
+  name: "ComputeAccidentalHits"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "true_classes"
+    type: DT_INT64
   }
   input_arg {
-    name: "axis"
-    type_attr: "Tidx"
+    name: "sampled_candidates"
+    type: DT_INT64
   }
   output_arg {
-    name: "out"
-    type_attr: "T"
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "ids"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "weights"
+    type: DT_FLOAT
   }
   attr {
-    name: "exclusive"
-    type: "bool"
+    name: "num_true"
+    type: "int"
+  }
+  attr {
+    name: "seed"
+    type: "int"
     default_value {
-      b: false
+      i: 0
     }
   }
   attr {
-    name: "reverse"
-    type: "bool"
+    name: "seed2"
+    type: "int"
     default_value {
-      b: false
+      i: 0
     }
   }
+}
+op {
+  name: "Concat"
+  input_arg {
+    name: "concat_dim"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-      }
-    }
+  }
+}
+op {
+  name: "ConcatOffset"
+  input_arg {
+    name: "concat_dim"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "shape"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  output_arg {
+    name: "offset"
+    type: DT_INT32
+    number_attr: "N"
   }
   attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
   }
 }
 op {
-  name: "Cumsum"
+  name: "ConcatV2"
   input_arg {
-    name: "x"
+    name: "values"
     type_attr: "T"
+    number_attr: "N"
   }
   input_arg {
     name: "axis"
     type_attr: "Tidx"
   }
   output_arg {
-    name: "out"
+    name: "output"
     type_attr: "T"
   }
   attr {
-    name: "exclusive"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "reverse"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
   }
   attr {
     name: "Tidx"
@@ -8271,14 +9228,18 @@ op {
   }
 }
 op {
-  name: "DatasetToSingleElement"
+  name: "ConcatenateDataset"
   input_arg {
-    name: "dataset"
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "another_dataset"
     type: DT_VARIANT
   }
   output_arg {
-    name: "components"
-    type_list_attr: "output_types"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
     name: "output_types"
@@ -8292,1130 +9253,968 @@ op {
     has_minimum: true
     minimum: 1
   }
+  is_stateful: true
 }
 op {
-  name: "DebugGradientIdentity"
+  name: "ConcatenateDataset"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "another_dataset"
+    type: DT_VARIANT
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
-  allows_uninitialized_input: true
 }
 op {
-  name: "DebugIdentity"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
+  name: "ConditionalAccumulator"
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
   }
   attr {
-    name: "tensor_name"
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "container"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "debug_urls"
-    type: "list(string)"
+    name: "shared_name"
+    type: "string"
     default_value {
-      list {
-      }
+      s: ""
     }
   }
-  allows_uninitialized_input: true
+  is_stateful: true
 }
 op {
-  name: "DebugIdentity"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
+  name: "ConditionalAccumulator"
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "shape"
+    type: "shape"
   }
   attr {
-    name: "debug_urls"
-    type: "list(string)"
+    name: "container"
+    type: "string"
     default_value {
-      list {
-      }
+      s: ""
     }
   }
   attr {
-    name: "gated_grpc"
-    type: "bool"
+    name: "shared_name"
+    type: "string"
     default_value {
-      b: false
+      s: ""
     }
   }
-  allows_uninitialized_input: true
+  is_stateful: true
 }
 op {
-  name: "DebugIdentity"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
+  name: "ConditionalAccumulator"
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
   attr {
-    name: "device_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "shape"
+    type: "shape"
   }
   attr {
-    name: "tensor_name"
+    name: "container"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "debug_urls"
-    type: "list(string)"
-    default_value {
-      list {
-      }
-    }
-  }
-  attr {
-    name: "gated_grpc"
-    type: "bool"
+    name: "shared_name"
+    type: "string"
     default_value {
-      b: false
+      s: ""
     }
   }
-  allows_uninitialized_input: true
+  is_stateful: true
 }
 op {
-  name: "DebugNanCount"
+  name: "Conj"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    type: DT_INT64
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "tensor_name"
-    type: "string"
     default_value {
-      s: ""
+      type: DT_COMPLEX64
     }
-  }
-  attr {
-    name: "debug_urls"
-    type: "list(string)"
-    default_value {
+    allowed_values {
       list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  allows_uninitialized_input: true
 }
 op {
-  name: "DebugNanCount"
+  name: "Conj"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    type: DT_INT64
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "tensor_name"
-    type: "string"
     default_value {
-      s: ""
+      type: DT_COMPLEX64
     }
-  }
-  attr {
-    name: "debug_urls"
-    type: "list(string)"
-    default_value {
+    allowed_values {
       list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_VARIANT
       }
     }
   }
-  attr {
-    name: "gated_grpc"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  allows_uninitialized_input: true
 }
 op {
-  name: "DebugNanCount"
+  name: "ConjugateTranspose"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
+  input_arg {
+    name: "perm"
+    type_attr: "Tperm"
+  }
   output_arg {
-    name: "output"
-    type: DT_INT64
+    name: "y"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
   }
   attr {
-    name: "device_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "tensor_name"
-    type: "string"
+    name: "Tperm"
+    type: "type"
     default_value {
-      s: ""
+      type: DT_INT32
     }
-  }
-  attr {
-    name: "debug_urls"
-    type: "list(string)"
-    default_value {
+    allowed_values {
       list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
-  attr {
-    name: "gated_grpc"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  allows_uninitialized_input: true
 }
 op {
-  name: "DebugNumericSummary"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
+  name: "Const"
   output_arg {
     name: "output"
-    type: DT_DOUBLE
-  }
-  attr {
-    name: "T"
-    type: "type"
+    type_attr: "dtype"
   }
   attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "value"
+    type: "tensor"
   }
   attr {
-    name: "debug_urls"
-    type: "list(string)"
-    default_value {
-      list {
-      }
-    }
+    name: "dtype"
+    type: "type"
   }
-  allows_uninitialized_input: true
 }
 op {
-  name: "DebugNumericSummary"
+  name: "ControlTrigger"
+}
+op {
+  name: "Conv2D"
   input_arg {
     name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
   output_arg {
     name: "output"
-    type: DT_DOUBLE
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "debug_urls"
-    type: "list(string)"
-    default_value {
+    allowed_values {
       list {
+        type: DT_HALF
+        type: DT_FLOAT
       }
     }
   }
   attr {
-    name: "lower_bound"
-    type: "float"
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
     default_value {
-      f: -inf
+      b: true
     }
   }
   attr {
-    name: "upper_bound"
-    type: "float"
-    default_value {
-      f: inf
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "mute_if_healthy"
-    type: "bool"
+    name: "data_format"
+    type: "string"
     default_value {
-      b: false
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
-  allows_uninitialized_input: true
 }
 op {
-  name: "DebugNumericSummary"
+  name: "Conv2D"
   input_arg {
     name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
   output_arg {
     name: "output"
-    type: DT_DOUBLE
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "debug_urls"
-    type: "list(string)"
-    default_value {
+    allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
       }
     }
   }
   attr {
-    name: "lower_bound"
-    type: "float"
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
     default_value {
-      f: -inf
+      b: true
     }
   }
   attr {
-    name: "upper_bound"
-    type: "float"
-    default_value {
-      f: inf
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "mute_if_healthy"
-    type: "bool"
+    name: "data_format"
+    type: "string"
     default_value {
-      b: false
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
   attr {
-    name: "gated_grpc"
-    type: "bool"
+    name: "dilations"
+    type: "list(int)"
     default_value {
-      b: false
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
     }
   }
-  allows_uninitialized_input: true
 }
 op {
-  name: "DebugNumericSummary"
+  name: "Conv2DBackpropFilter"
   input_arg {
     name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
   output_arg {
     name: "output"
-    type: DT_DOUBLE
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "device_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "debug_urls"
-    type: "list(string)"
-    default_value {
+    allowed_values {
       list {
+        type: DT_HALF
+        type: DT_FLOAT
       }
     }
   }
   attr {
-    name: "lower_bound"
-    type: "float"
-    default_value {
-      f: -inf
-    }
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "upper_bound"
-    type: "float"
+    name: "use_cudnn_on_gpu"
+    type: "bool"
     default_value {
-      f: inf
+      b: true
     }
   }
   attr {
-    name: "mute_if_healthy"
-    type: "bool"
-    default_value {
-      b: false
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "gated_grpc"
-    type: "bool"
+    name: "data_format"
+    type: "string"
     default_value {
-      b: false
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
-  allows_uninitialized_input: true
 }
 op {
-  name: "DecodeAndCropJpeg"
+  name: "Conv2DBackpropFilter"
   input_arg {
-    name: "contents"
-    type: DT_STRING
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "crop_window"
+    name: "filter_sizes"
     type: DT_INT32
   }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
   output_arg {
-    name: "image"
-    type: DT_UINT8
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "channels"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
     }
   }
   attr {
-    name: "ratio"
-    type: "int"
-    default_value {
-      i: 1
-    }
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "fancy_upscaling"
+    name: "use_cudnn_on_gpu"
     type: "bool"
     default_value {
       b: true
     }
   }
   attr {
-    name: "try_recover_truncated"
-    type: "bool"
-    default_value {
-      b: false
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "acceptable_fraction"
-    type: "float"
+    name: "data_format"
+    type: "string"
     default_value {
-      f: 1
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
   attr {
-    name: "dct_method"
-    type: "string"
+    name: "dilations"
+    type: "list(int)"
     default_value {
-      s: ""
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
     }
   }
 }
 op {
-  name: "DecodeBase64"
+  name: "Conv2DBackpropInput"
   input_arg {
-    name: "input"
-    type: DT_STRING
+    name: "input_sizes"
+    type: DT_INT32
   }
-  output_arg {
-    name: "output"
-    type: DT_STRING
+  input_arg {
+    name: "filter"
+    type_attr: "T"
   }
-}
-op {
-  name: "DecodeBmp"
   input_arg {
-    name: "contents"
-    type: DT_STRING
+    name: "out_backprop"
+    type_attr: "T"
   }
   output_arg {
-    name: "image"
-    type: DT_UINT8
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "channels"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
     }
   }
-}
-op {
-  name: "DecodeCSV"
-  input_arg {
-    name: "records"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "record_defaults"
-    type_list_attr: "OUT_TYPE"
+  attr {
+    name: "strides"
+    type: "list(int)"
   }
-  output_arg {
-    name: "output"
-    type_list_attr: "OUT_TYPE"
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
-    name: "OUT_TYPE"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_STRING
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "field_delim"
+    name: "data_format"
     type: "string"
     default_value {
-      s: ","
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
 }
 op {
-  name: "DecodeCSV"
+  name: "Conv2DBackpropInput"
   input_arg {
-    name: "records"
-    type: DT_STRING
+    name: "input_sizes"
+    type: DT_INT32
   }
   input_arg {
-    name: "record_defaults"
-    type_list_attr: "OUT_TYPE"
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
-    type_list_attr: "OUT_TYPE"
+    type_attr: "T"
   }
   attr {
-    name: "OUT_TYPE"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_STRING
       }
     }
   }
   attr {
-    name: "field_delim"
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
     type: "string"
     default_value {
-      s: ","
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
   attr {
-    name: "use_quote_delim"
-    type: "bool"
+    name: "dilations"
+    type: "list(int)"
     default_value {
-      b: true
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
     }
   }
 }
 op {
-  name: "DecodeCSV"
+  name: "Conv3D"
   input_arg {
-    name: "records"
-    type: DT_STRING
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "record_defaults"
-    type_list_attr: "OUT_TYPE"
+    name: "filter"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
-    type_list_attr: "OUT_TYPE"
+    type_attr: "T"
   }
   attr {
-    name: "OUT_TYPE"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_STRING
+        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "field_delim"
-    type: "string"
-    default_value {
-      s: ","
-    }
-  }
-  attr {
-    name: "use_quote_delim"
-    type: "bool"
-    default_value {
-      b: true
-    }
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "na_value"
+    name: "padding"
     type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
 }
 op {
-  name: "DecodeCSV"
+  name: "Conv3D"
   input_arg {
-    name: "records"
-    type: DT_STRING
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "record_defaults"
-    type_list_attr: "OUT_TYPE"
+    name: "filter"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
-    type_list_attr: "OUT_TYPE"
+    type_attr: "T"
   }
   attr {
-    name: "OUT_TYPE"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_STRING
       }
     }
   }
   attr {
-    name: "field_delim"
-    type: "string"
-    default_value {
-      s: ","
-    }
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "use_quote_delim"
-    type: "bool"
-    default_value {
-      b: true
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "na_value"
+    name: "data_format"
     type: "string"
     default_value {
-      s: ""
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
     }
   }
 }
 op {
-  name: "DecodeGif"
+  name: "Conv3D"
   input_arg {
-    name: "contents"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "image"
-    type: DT_UINT8
+    name: "input"
+    type_attr: "T"
   }
-}
-op {
-  name: "DecodeJSONExample"
   input_arg {
-    name: "json_examples"
-    type: DT_STRING
+    name: "filter"
+    type_attr: "T"
   }
   output_arg {
-    name: "binary_examples"
-    type: DT_STRING
-  }
-}
-op {
-  name: "DecodeJpeg"
-  input_arg {
-    name: "contents"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "image"
-    type: DT_UINT8
-  }
-  attr {
-    name: "channels"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "ratio"
-    type: "int"
-    default_value {
-      i: 1
-    }
-  }
-  attr {
-    name: "fancy_upscaling"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-  attr {
-    name: "try_recover_truncated"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "acceptable_fraction"
-    type: "float"
-    default_value {
-      f: 1
-    }
-  }
-  attr {
-    name: "dct_method"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-}
-op {
-  name: "DecodePng"
-  input_arg {
-    name: "contents"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "image"
-    type_attr: "dtype"
-  }
-  attr {
-    name: "channels"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "dtype"
-    type: "type"
-    default_value {
-      type: DT_UINT8
-    }
-    allowed_values {
-      list {
-        type: DT_UINT8
-        type: DT_UINT16
-      }
-    }
-  }
-}
-op {
-  name: "DecodeRaw"
-  input_arg {
-    name: "bytes"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "output"
-    type_attr: "out_type"
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "out_type"
+    name: "T"
     type: "type"
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_INT64
       }
     }
   }
   attr {
-    name: "little_endian"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-}
-op {
-  name: "DecodeRaw"
-  input_arg {
-    name: "bytes"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "output"
-    type_attr: "out_type"
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "out_type"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT16
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_INT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "little_endian"
-    type: "bool"
+    name: "data_format"
+    type: "string"
     default_value {
-      b: true
+      s: "NDHWC"
     }
-  }
-}
-op {
-  name: "DecodeWav"
-  input_arg {
-    name: "contents"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "audio"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "sample_rate"
-    type: DT_INT32
-  }
-  attr {
-    name: "desired_channels"
-    type: "int"
-    default_value {
-      i: -1
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
     }
   }
   attr {
-    name: "desired_samples"
-    type: "int"
+    name: "dilations"
+    type: "list(int)"
     default_value {
-      i: -1
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
     }
   }
 }
 op {
-  name: "DeleteSessionTensor"
+  name: "Conv3DBackpropFilter"
   input_arg {
-    name: "handle"
-    type: DT_STRING
+    name: "input"
+    type_attr: "T"
   }
-}
-op {
-  name: "DenseToDenseSetOperation"
   input_arg {
-    name: "set1"
+    name: "filter"
     type_attr: "T"
   }
   input_arg {
-    name: "set2"
+    name: "out_backprop"
     type_attr: "T"
   }
   output_arg {
-    name: "result_indices"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "result_values"
+    name: "output"
     type_attr: "T"
   }
-  output_arg {
-    name: "result_shape"
-    type: DT_INT64
-  }
-  attr {
-    name: "set_operation"
-    type: "string"
-  }
-  attr {
-    name: "validate_indices"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_STRING
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
-}
-op {
-  name: "DenseToSparseBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "row_shape"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
+    name: "strides"
+    type: "list(int)"
     has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "DenseToSparseBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "row_shape"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    minimum: 5
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
   }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+  deprecation {
+    version: 10
   }
 }
 op {
-  name: "DenseToSparseSetOperation"
+  name: "Conv3DBackpropFilter"
   input_arg {
-    name: "set1"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "set2_indices"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "set2_values"
+    name: "filter"
     type_attr: "T"
   }
   input_arg {
-    name: "set2_shape"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "result_indices"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "result_values"
+    name: "out_backprop"
     type_attr: "T"
   }
   output_arg {
-    name: "result_shape"
-    type: DT_INT64
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "set_operation"
-    type: "string"
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
   attr {
-    name: "validate_indices"
-    type: "bool"
-    default_value {
-      b: true
-    }
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_STRING
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
+  deprecation {
+    version: 10
+  }
 }
 op {
-  name: "DepthToSpace"
+  name: "Conv3DBackpropFilterV2"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-  attr {
-    name: "block_size"
-    type: "int"
-    has_minimum: true
-    minimum: 2
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
   }
-}
-op {
-  name: "DepthToSpace"
   input_arg {
-    name: "input"
+    name: "out_backprop"
     type_attr: "T"
   }
   output_arg {
@@ -9425,36 +10224,42 @@ op {
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
   attr {
-    name: "block_size"
-    type: "int"
+    name: "strides"
+    type: "list(int)"
     has_minimum: true
-    minimum: 2
+    minimum: 5
   }
   attr {
-    name: "data_format"
+    name: "padding"
     type: "string"
-    default_value {
-      s: "NHWC"
-    }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
-        s: "NCHW_VECT_C"
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
 }
 op {
-  name: "DepthwiseConv2dNative"
+  name: "Conv3DBackpropFilterV2"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "filter"
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
     type_attr: "T"
   }
   output_arg {
@@ -9474,6 +10279,8 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
     name: "padding"
@@ -9485,15 +10292,32 @@ op {
       }
     }
   }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
 }
 op {
-  name: "DepthwiseConv2dNative"
+  name: "Conv3DBackpropFilterV2"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "filter"
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
     type_attr: "T"
   }
   output_arg {
@@ -9505,6 +10329,8 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -9513,6 +10339,8 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
     name: "padding"
@@ -9528,25 +10356,38 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NHWC"
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
       }
     }
   }
 }
 op {
-  name: "DepthwiseConv2dNativeBackpropFilter"
+  name: "Conv3DBackpropInput"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "filter_sizes"
-    type: DT_INT32
+    name: "filter"
+    type_attr: "T"
   }
   input_arg {
     name: "out_backprop"
@@ -9569,6 +10410,8 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
     name: "padding"
@@ -9580,16 +10423,19 @@ op {
       }
     }
   }
+  deprecation {
+    version: 10
+  }
 }
 op {
-  name: "DepthwiseConv2dNativeBackpropFilter"
+  name: "Conv3DBackpropInput"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "filter_sizes"
-    type: DT_INT32
+    name: "filter"
+    type_attr: "T"
   }
   input_arg {
     name: "out_backprop"
@@ -9604,6 +10450,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -9612,6 +10459,8 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
     name: "padding"
@@ -9623,22 +10472,12 @@ op {
       }
     }
   }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
+  deprecation {
+    version: 10
   }
 }
 op {
-  name: "DepthwiseConv2dNativeBackpropInput"
+  name: "Conv3DBackpropInputV2"
   input_arg {
     name: "input_sizes"
     type: DT_INT32
@@ -9668,6 +10507,8 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
     name: "padding"
@@ -9681,7 +10522,7 @@ op {
   }
 }
 op {
-  name: "DepthwiseConv2dNativeBackpropInput"
+  name: "Conv3DBackpropInputV2"
   input_arg {
     name: "input_sizes"
     type: DT_INT32
@@ -9711,6 +10552,8 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
     name: "padding"
@@ -9726,189 +10569,174 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NHWC"
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
 }
 op {
-  name: "Dequantize"
+  name: "Conv3DBackpropInputV2"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "input_sizes"
+    type: DT_INT32
   }
   input_arg {
-    name: "min_range"
-    type: DT_FLOAT
+    name: "filter"
+    type_attr: "T"
   }
   input_arg {
-    name: "max_range"
-    type: DT_FLOAT
+    name: "out_backprop"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
-    type: DT_FLOAT
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
-      }
-    }
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
   attr {
-    name: "mode"
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
     type: "string"
     default_value {
-      s: "MIN_COMBINED"
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        s: "MIN_COMBINED"
-        s: "MIN_FIRST"
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
       }
     }
   }
 }
 op {
-  name: "Dequantize"
+  name: "Copy"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "min_range"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "max_range"
-    type: DT_FLOAT
-  }
   output_arg {
     name: "output"
-    type: DT_FLOAT
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
-      }
-    }
   }
   attr {
-    name: "mode"
+    name: "tensor_name"
     type: "string"
     default_value {
-      s: "MIN_COMBINED"
-    }
-    allowed_values {
-      list {
-        s: "MIN_COMBINED"
-        s: "MIN_FIRST"
-        s: "SCALED"
-      }
+      s: ""
     }
   }
+  allows_uninitialized_input: true
 }
 op {
-  name: "DeserializeIterator"
-  input_arg {
-    name: "resource_handle"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "serialized"
-    type: DT_VARIANT
-  }
-  is_stateful: true
-}
-op {
-  name: "DeserializeManySparse"
+  name: "Copy"
   input_arg {
-    name: "serialized_sparse"
-    type: DT_STRING
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
-    name: "sparse_indices"
-    type: DT_INT64
+    name: "output"
+    type_attr: "T"
   }
-  output_arg {
-    name: "sparse_values"
-    type_attr: "dtype"
+  attr {
+    name: "T"
+    type: "type"
   }
-  output_arg {
-    name: "sparse_shape"
-    type: DT_INT64
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "dtype"
-    type: "type"
+    name: "debug_ops_spec"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
   }
+  allows_uninitialized_input: true
 }
 op {
-  name: "DeserializeSparse"
+  name: "CopyHost"
   input_arg {
-    name: "serialized_sparse"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "sparse_indices"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "sparse_values"
-    type_attr: "dtype"
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
-    name: "sparse_shape"
-    type: DT_INT64
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
   }
-}
-op {
-  name: "DestroyResourceOp"
-  input_arg {
-    name: "resource"
-    type: DT_RESOURCE
-  }
   attr {
-    name: "ignore_lookup_error"
-    type: "bool"
+    name: "tensor_name"
+    type: "string"
     default_value {
-      b: true
+      s: ""
     }
   }
-  is_stateful: true
+  allows_uninitialized_input: true
 }
 op {
-  name: "DestroyTemporaryVariable"
+  name: "CopyHost"
   input_arg {
-    name: "ref"
+    name: "input"
     type_attr: "T"
-    is_ref: true
   }
   output_arg {
-    name: "value"
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -9916,18 +10744,30 @@ op {
     type: "type"
   }
   attr {
-    name: "var_name"
+    name: "tensor_name"
     type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_ops_spec"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
   }
+  allows_uninitialized_input: true
 }
 op {
-  name: "Diag"
+  name: "Cos"
   input_arg {
-    name: "diagonal"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
   attr {
@@ -9935,10 +10775,9 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
@@ -9946,13 +10785,13 @@ op {
   }
 }
 op {
-  name: "DiagPart"
+  name: "Cos"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "diagonal"
+    name: "y"
     type_attr: "T"
   }
   attr {
@@ -9960,10 +10799,10 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
@@ -9971,7 +10810,7 @@ op {
   }
 }
 op {
-  name: "Digamma"
+  name: "Cosh"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -9988,22 +10827,20 @@ op {
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "Dilation2D"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
+  name: "Cosh"
   input_arg {
-    name: "filter"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
   attr {
@@ -10011,289 +10848,282 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+}
+op {
+  name: "CountUpTo"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "rates"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    name: "limit"
+    type: "int"
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "Dilation2D"
+  name: "CropAndResize"
   input_arg {
-    name: "input"
+    name: "image"
     type_attr: "T"
   }
   input_arg {
-    name: "filter"
-    type_attr: "T"
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "crop_size"
+    type: DT_INT32
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "crops"
+    type: DT_FLOAT
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
-        type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "rates"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
+    name: "method"
     type: "string"
+    default_value {
+      s: "bilinear"
+    }
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        s: "bilinear"
       }
     }
   }
+  attr {
+    name: "extrapolation_value"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
 }
 op {
-  name: "Dilation2DBackpropFilter"
+  name: "CropAndResize"
   input_arg {
-    name: "input"
+    name: "image"
     type_attr: "T"
   }
   input_arg {
-    name: "filter"
-    type_attr: "T"
+    name: "boxes"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "out_backprop"
-    type_attr: "T"
+    name: "box_ind"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "crop_size"
+    type: DT_INT32
   }
   output_arg {
-    name: "filter_backprop"
-    type_attr: "T"
+    name: "crops"
+    type: DT_FLOAT
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
         type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
         type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "rates"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
+    name: "method"
     type: "string"
+    default_value {
+      s: "bilinear"
+    }
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        s: "bilinear"
       }
     }
   }
+  attr {
+    name: "extrapolation_value"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
 }
 op {
-  name: "Dilation2DBackpropFilter"
+  name: "CropAndResizeGradBoxes"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "grads"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "filter"
+    name: "image"
     type_attr: "T"
   }
   input_arg {
-    name: "out_backprop"
-    type_attr: "T"
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
   }
   output_arg {
-    name: "filter_backprop"
-    type_attr: "T"
+    name: "output"
+    type: DT_FLOAT
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
-        type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "rates"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
+    name: "method"
     type: "string"
+    default_value {
+      s: "bilinear"
+    }
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        s: "bilinear"
       }
     }
   }
 }
 op {
-  name: "Dilation2DBackpropInput"
+  name: "CropAndResizeGradBoxes"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "grads"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "filter"
+    name: "image"
     type_attr: "T"
   }
   input_arg {
-    name: "out_backprop"
-    type_attr: "T"
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
   }
   output_arg {
-    name: "in_backprop"
-    type_attr: "T"
+    name: "output"
+    type: DT_FLOAT
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
         type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
         type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "rates"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
+    name: "method"
     type: "string"
+    default_value {
+      s: "bilinear"
+    }
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        s: "bilinear"
       }
     }
   }
 }
 op {
-  name: "Dilation2DBackpropInput"
+  name: "CropAndResizeGradImage"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "grads"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "filter"
-    type_attr: "T"
+    name: "boxes"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "out_backprop"
-    type_attr: "T"
+    name: "box_ind"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "image_size"
+    type: DT_INT32
   }
   output_arg {
-    name: "in_backprop"
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -10302,54 +11132,36 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "rates"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
+    name: "method"
     type: "string"
+    default_value {
+      s: "bilinear"
+    }
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        s: "bilinear"
       }
     }
   }
 }
 op {
-  name: "Div"
+  name: "Cross"
   input_arg {
-    name: "x"
+    name: "a"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
+    name: "b"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "product"
     type_attr: "T"
   }
   attr {
@@ -10357,153 +11169,179 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
+        type: DT_INT16
         type: DT_INT8
         type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "DrawBoundingBoxes"
+  name: "Cross"
   input_arg {
-    name: "images"
+    name: "a"
     type_attr: "T"
   }
   input_arg {
-    name: "boxes"
-    type: DT_FLOAT
+    name: "b"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "product"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "DynamicPartition"
+  name: "Cross"
   input_arg {
-    name: "data"
+    name: "a"
     type_attr: "T"
   }
   input_arg {
-    name: "partitions"
-    type: DT_INT32
+    name: "b"
+    type_attr: "T"
   }
   output_arg {
-    name: "outputs"
+    name: "product"
     type_attr: "T"
-    number_attr: "num_partitions"
-  }
-  attr {
-    name: "num_partitions"
-    type: "int"
-    has_minimum: true
-    minimum: 1
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
 }
 op {
-  name: "DynamicStitch"
+  name: "Cumprod"
   input_arg {
-    name: "indices"
-    type: DT_INT32
-    number_attr: "N"
+    name: "x"
+    type_attr: "T"
   }
   input_arg {
-    name: "data"
-    type_attr: "T"
-    number_attr: "N"
+    name: "axis"
+    type_attr: "Tidx"
   }
   output_arg {
-    name: "merged"
+    name: "out"
     type_attr: "T"
   }
   attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "exclusive"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
 }
 op {
-  name: "EditDistance"
-  input_arg {
-    name: "hypothesis_indices"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "hypothesis_values"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "hypothesis_shape"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "truth_indices"
-    type: DT_INT64
-  }
+  name: "Cumprod"
   input_arg {
-    name: "truth_values"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "truth_shape"
-    type: DT_INT64
+    name: "axis"
+    type_attr: "Tidx"
   }
   output_arg {
-    name: "output"
-    type: DT_FLOAT
+    name: "out"
+    type_attr: "T"
   }
   attr {
-    name: "normalize"
+    name: "exclusive"
     type: "bool"
     default_value {
-      b: true
+      b: false
     }
   }
   attr {
-    name: "T"
-    type: "type"
-  }
-}
-op {
-  name: "Elu"
-  input_arg {
-    name: "features"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "activations"
-    type_attr: "T"
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
     name: "T"
@@ -10512,47 +11350,65 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-}
-op {
-  name: "Elu"
-  input_arg {
-    name: "features"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "activations"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "Tidx"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "EluGrad"
+  name: "Cumprod"
   input_arg {
-    name: "gradients"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "outputs"
-    type_attr: "T"
+    name: "axis"
+    type_attr: "Tidx"
   }
   output_arg {
-    name: "backprops"
+    name: "out"
     type_attr: "T"
   }
+  attr {
+    name: "exclusive"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
@@ -10560,308 +11416,423 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "EluGrad"
+  name: "Cumsum"
   input_arg {
-    name: "gradients"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "outputs"
-    type_attr: "T"
+    name: "axis"
+    type_attr: "Tidx"
   }
   output_arg {
-    name: "backprops"
+    name: "out"
     type_attr: "T"
   }
+  attr {
+    name: "exclusive"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "EncodeBase64"
+  name: "Cumsum"
   input_arg {
-    name: "input"
-    type: DT_STRING
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
   }
   output_arg {
-    name: "output"
-    type: DT_STRING
+    name: "out"
+    type_attr: "T"
   }
   attr {
-    name: "pad"
+    name: "exclusive"
     type: "bool"
     default_value {
       b: false
     }
   }
-}
-op {
-  name: "EncodeJpeg"
-  input_arg {
-    name: "image"
-    type: DT_UINT8
-  }
-  output_arg {
-    name: "contents"
-    type: DT_STRING
-  }
   attr {
-    name: "format"
-    type: "string"
+    name: "reverse"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: ""
-        s: "grayscale"
-        s: "rgb"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "quality"
-    type: "int"
+    name: "Tidx"
+    type: "type"
     default_value {
-      i: 95
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
+}
+op {
+  name: "Cumsum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+  }
   attr {
-    name: "progressive"
+    name: "exclusive"
     type: "bool"
     default_value {
       b: false
     }
   }
   attr {
-    name: "optimize_size"
+    name: "reverse"
     type: "bool"
     default_value {
       b: false
     }
   }
   attr {
-    name: "chroma_downsampling"
-    type: "bool"
-    default_value {
-      b: true
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
     }
   }
   attr {
-    name: "density_unit"
-    type: "string"
+    name: "Tidx"
+    type: "type"
     default_value {
-      s: "in"
+      type: DT_INT32
     }
     allowed_values {
       list {
-        s: "in"
-        s: "cm"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+}
+op {
+  name: "DataFormatDimMap"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
   attr {
-    name: "x_density"
-    type: "int"
+    name: "T"
+    type: "type"
     default_value {
-      i: 300
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "y_density"
-    type: "int"
+    name: "src_format"
+    type: "string"
     default_value {
-      i: 300
+      s: "NHWC"
     }
   }
   attr {
-    name: "xmp_metadata"
+    name: "dst_format"
     type: "string"
     default_value {
-      s: ""
+      s: "NCHW"
     }
   }
 }
 op {
-  name: "EncodePng"
+  name: "DataFormatVecPermute"
   input_arg {
-    name: "image"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "contents"
-    type: DT_STRING
-  }
-  attr {
-    name: "compression"
-    type: "int"
-    default_value {
-      i: -1
-    }
+    name: "y"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     default_value {
-      type: DT_UINT8
+      type: DT_INT32
     }
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  attr {
+    name: "src_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    name: "dst_format"
+    type: "string"
+    default_value {
+      s: "NCHW"
+    }
+  }
 }
 op {
-  name: "EncodeWav"
+  name: "DatasetToSingleElement"
   input_arg {
-    name: "audio"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "sample_rate"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "contents"
-    type: DT_STRING
-  }
-}
-op {
-  name: "Enter"
-  input_arg {
-    name: "data"
-    type_attr: "T"
+    name: "dataset"
+    type: DT_VARIANT
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-  attr {
-    name: "frame_name"
-    type: "string"
+    name: "components"
+    type_list_attr: "output_types"
   }
   attr {
-    name: "is_constant"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "parallel_iterations"
-    type: "int"
-    default_value {
-      i: 10
-    }
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
 }
 op {
-  name: "Equal"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
+  name: "DebugGradientIdentity"
   input_arg {
-    name: "y"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
-    type: DT_BOOL
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_QUINT8
-        type: DT_QINT8
-        type: DT_QINT32
-        type: DT_STRING
-        type: DT_BOOL
-        type: DT_COMPLEX128
-      }
-    }
   }
-  is_commutative: true
+  allows_uninitialized_input: true
 }
 op {
-  name: "Erf"
+  name: "DebugIdentity"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "output"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
+  allows_uninitialized_input: true
 }
 op {
-  name: "Erfc"
+  name: "DebugIdentity"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "output"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
+  attr {
+    name: "gated_grpc"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  allows_uninitialized_input: true
 }
 op {
-  name: "Exit"
+  name: "DebugIdentity"
   input_arg {
-    name: "data"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
@@ -10872,1037 +11843,903 @@ op {
     name: "T"
     type: "type"
   }
-}
-op {
-  name: "Exp"
-  input_arg {
-    name: "x"
-    type_attr: "T"
+  attr {
+    name: "device_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
+  attr {
+    name: "gated_grpc"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  allows_uninitialized_input: true
 }
 op {
-  name: "ExpandDims"
+  name: "DebugNanCount"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "dim"
-    type_attr: "Tdim"
-  }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type: DT_INT64
   }
   attr {
     name: "T"
     type: "type"
   }
   attr {
-    name: "Tdim"
-    type: "type"
+    name: "tensor_name"
+    type: "string"
     default_value {
-      type: DT_INT32
+      s: ""
     }
-    allowed_values {
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
       list {
-        type: DT_INT32
-        type: DT_INT64
       }
     }
   }
+  allows_uninitialized_input: true
 }
 op {
-  name: "Expm1"
+  name: "DebugNanCount"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
-    type_attr: "T"
+    name: "output"
+    type: DT_INT64
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
-  }
-}
-op {
-  name: "ExtractGlimpse"
-  input_arg {
-    name: "input"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "size"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "offsets"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "glimpse"
-    type: DT_FLOAT
   }
   attr {
-    name: "centered"
-    type: "bool"
+    name: "tensor_name"
+    type: "string"
     default_value {
-      b: true
+      s: ""
     }
   }
   attr {
-    name: "normalized"
-    type: "bool"
+    name: "debug_urls"
+    type: "list(string)"
     default_value {
-      b: true
+      list {
+      }
     }
   }
   attr {
-    name: "uniform_noise"
+    name: "gated_grpc"
     type: "bool"
     default_value {
-      b: true
+      b: false
     }
   }
+  allows_uninitialized_input: true
 }
 op {
-  name: "ExtractImagePatches"
+  name: "DebugNanCount"
   input_arg {
-    name: "images"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "patches"
-    type_attr: "T"
+    name: "output"
+    type: DT_INT64
   }
   attr {
-    name: "ksizes"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    name: "device_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "rates"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    name: "gated_grpc"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
+  allows_uninitialized_input: true
 }
 op {
-  name: "ExtractImagePatches"
+  name: "DebugNumericSummary"
   input_arg {
-    name: "images"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "patches"
-    type_attr: "T"
-  }
-  attr {
-    name: "ksizes"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "rates"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    name: "output"
+    type: DT_DOUBLE
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
   }
   attr {
-    name: "padding"
+    name: "tensor_name"
     type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    default_value {
+      s: ""
     }
   }
-}
-op {
-  name: "ExtractJpegShape"
-  input_arg {
-    name: "contents"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "image_shape"
-    type_attr: "output_type"
-  }
   attr {
-    name: "output_type"
-    type: "type"
+    name: "debug_urls"
+    type: "list(string)"
     default_value {
-      type: DT_INT32
-    }
-    allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
       }
     }
   }
+  allows_uninitialized_input: true
 }
 op {
-  name: "FFT"
-  input_arg {
-    name: "input"
-    type: DT_COMPLEX64
-  }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
-  }
-}
-op {
-  name: "FFT2D"
-  input_arg {
-    name: "input"
-    type: DT_COMPLEX64
-  }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
-  }
-}
-op {
-  name: "FFT3D"
+  name: "DebugNumericSummary"
   input_arg {
     name: "input"
-    type: DT_COMPLEX64
+    type_attr: "T"
   }
   output_arg {
     name: "output"
-    type: DT_COMPLEX64
+    type: DT_DOUBLE
   }
-}
-op {
-  name: "FIFOQueue"
-  output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+  attr {
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "shapes"
-    type: "list(shape)"
+    name: "debug_urls"
+    type: "list(string)"
     default_value {
       list {
       }
     }
-    has_minimum: true
   }
   attr {
-    name: "capacity"
-    type: "int"
+    name: "lower_bound"
+    type: "float"
     default_value {
-      i: -1
+      f: -inf
     }
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "upper_bound"
+    type: "float"
     default_value {
-      s: ""
+      f: inf
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "mute_if_healthy"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
-  is_stateful: true
+  allows_uninitialized_input: true
 }
 op {
-  name: "FIFOQueueV2"
+  name: "DebugNumericSummary"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
   output_arg {
-    name: "handle"
-    type: DT_RESOURCE
+    name: "output"
+    type: DT_DOUBLE
   }
   attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "shapes"
-    type: "list(shape)"
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
     default_value {
       list {
       }
     }
-    has_minimum: true
   }
   attr {
-    name: "capacity"
-    type: "int"
+    name: "lower_bound"
+    type: "float"
     default_value {
-      i: -1
+      f: -inf
     }
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "upper_bound"
+    type: "float"
     default_value {
-      s: ""
+      f: inf
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "mute_if_healthy"
+    type: "bool"
     default_value {
-      s: ""
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "Fact"
-  output_arg {
-    name: "fact"
-    type: DT_STRING
-  }
-}
-op {
-  name: "FakeQuantWithMinMaxArgs"
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "outputs"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "min"
-    type: "float"
-    default_value {
-      f: -6
+      b: false
     }
   }
   attr {
-    name: "max"
-    type: "float"
+    name: "gated_grpc"
+    type: "bool"
     default_value {
-      f: 6
+      b: false
     }
   }
+  allows_uninitialized_input: true
 }
 op {
-  name: "FakeQuantWithMinMaxArgs"
+  name: "DebugNumericSummary"
   input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
-    name: "outputs"
-    type: DT_FLOAT
+    name: "output"
+    type: DT_DOUBLE
   }
   attr {
-    name: "min"
-    type: "float"
-    default_value {
-      f: -6
-    }
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "max"
-    type: "float"
+    name: "device_name"
+    type: "string"
     default_value {
-      f: 6
+      s: ""
     }
   }
   attr {
-    name: "num_bits"
-    type: "int"
+    name: "tensor_name"
+    type: "string"
     default_value {
-      i: 8
+      s: ""
     }
   }
-}
-op {
-  name: "FakeQuantWithMinMaxArgs"
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "outputs"
-    type: DT_FLOAT
-  }
   attr {
-    name: "min"
-    type: "float"
+    name: "debug_urls"
+    type: "list(string)"
     default_value {
-      f: -6
+      list {
+      }
     }
   }
   attr {
-    name: "max"
+    name: "lower_bound"
     type: "float"
     default_value {
-      f: 6
+      f: -inf
     }
   }
   attr {
-    name: "num_bits"
-    type: "int"
+    name: "upper_bound"
+    type: "float"
     default_value {
-      i: 8
+      f: inf
     }
   }
   attr {
-    name: "narrow_range"
+    name: "mute_if_healthy"
     type: "bool"
     default_value {
       b: false
     }
   }
-}
-op {
-  name: "FakeQuantWithMinMaxArgsGradient"
-  input_arg {
-    name: "gradients"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "backprops"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "min"
-    type: "float"
-    default_value {
-      f: -6
-    }
-  }
   attr {
-    name: "max"
-    type: "float"
+    name: "gated_grpc"
+    type: "bool"
     default_value {
-      f: 6
+      b: false
     }
   }
+  allows_uninitialized_input: true
 }
 op {
-  name: "FakeQuantWithMinMaxArgsGradient"
+  name: "DecodeAndCropJpeg"
   input_arg {
-    name: "gradients"
-    type: DT_FLOAT
+    name: "contents"
+    type: DT_STRING
   }
   input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+    name: "crop_window"
+    type: DT_INT32
   }
   output_arg {
-    name: "backprops"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "min"
-    type: "float"
-    default_value {
-      f: -6
-    }
+    name: "image"
+    type: DT_UINT8
   }
   attr {
-    name: "max"
-    type: "float"
+    name: "channels"
+    type: "int"
     default_value {
-      f: 6
+      i: 0
     }
   }
   attr {
-    name: "num_bits"
+    name: "ratio"
     type: "int"
     default_value {
-      i: 8
+      i: 1
     }
   }
-}
-op {
-  name: "FakeQuantWithMinMaxArgsGradient"
-  input_arg {
-    name: "gradients"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "backprops"
-    type: DT_FLOAT
-  }
   attr {
-    name: "min"
-    type: "float"
+    name: "fancy_upscaling"
+    type: "bool"
     default_value {
-      f: -6
+      b: true
     }
   }
   attr {
-    name: "max"
-    type: "float"
+    name: "try_recover_truncated"
+    type: "bool"
     default_value {
-      f: 6
+      b: false
     }
   }
   attr {
-    name: "num_bits"
-    type: "int"
+    name: "acceptable_fraction"
+    type: "float"
     default_value {
-      i: 8
+      f: 1
     }
   }
   attr {
-    name: "narrow_range"
-    type: "bool"
+    name: "dct_method"
+    type: "string"
     default_value {
-      b: false
+      s: ""
     }
   }
 }
 op {
-  name: "FakeQuantWithMinMaxVars"
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "min"
-    type: DT_FLOAT
-  }
+  name: "DecodeBase64"
   input_arg {
-    name: "max"
-    type: DT_FLOAT
+    name: "input"
+    type: DT_STRING
   }
   output_arg {
-    name: "outputs"
-    type: DT_FLOAT
+    name: "output"
+    type: DT_STRING
   }
 }
 op {
-  name: "FakeQuantWithMinMaxVars"
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "min"
-    type: DT_FLOAT
-  }
+  name: "DecodeBmp"
   input_arg {
-    name: "max"
-    type: DT_FLOAT
+    name: "contents"
+    type: DT_STRING
   }
   output_arg {
-    name: "outputs"
-    type: DT_FLOAT
+    name: "image"
+    type: DT_UINT8
   }
   attr {
-    name: "num_bits"
+    name: "channels"
     type: "int"
     default_value {
-      i: 8
+      i: 0
     }
   }
 }
 op {
-  name: "FakeQuantWithMinMaxVars"
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
-  }
+  name: "DecodeCSV"
   input_arg {
-    name: "min"
-    type: DT_FLOAT
+    name: "records"
+    type: DT_STRING
   }
   input_arg {
-    name: "max"
-    type: DT_FLOAT
+    name: "record_defaults"
+    type_list_attr: "OUT_TYPE"
   }
   output_arg {
-    name: "outputs"
-    type: DT_FLOAT
+    name: "output"
+    type_list_attr: "OUT_TYPE"
   }
   attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
+    name: "OUT_TYPE"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
     }
   }
   attr {
-    name: "narrow_range"
-    type: "bool"
+    name: "field_delim"
+    type: "string"
     default_value {
-      b: false
+      s: ","
     }
   }
 }
 op {
-  name: "FakeQuantWithMinMaxVarsGradient"
-  input_arg {
-    name: "gradients"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
-  }
+  name: "DecodeCSV"
   input_arg {
-    name: "min"
-    type: DT_FLOAT
+    name: "records"
+    type: DT_STRING
   }
   input_arg {
-    name: "max"
-    type: DT_FLOAT
+    name: "record_defaults"
+    type_list_attr: "OUT_TYPE"
   }
   output_arg {
-    name: "backprops_wrt_input"
-    type: DT_FLOAT
+    name: "output"
+    type_list_attr: "OUT_TYPE"
   }
-  output_arg {
-    name: "backprop_wrt_min"
-    type: DT_FLOAT
+  attr {
+    name: "OUT_TYPE"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
   }
-  output_arg {
-    name: "backprop_wrt_max"
-    type: DT_FLOAT
+  attr {
+    name: "field_delim"
+    type: "string"
+    default_value {
+      s: ","
+    }
+  }
+  attr {
+    name: "use_quote_delim"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
 }
 op {
-  name: "FakeQuantWithMinMaxVarsGradient"
-  input_arg {
-    name: "gradients"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
-  }
+  name: "DecodeCSV"
   input_arg {
-    name: "min"
-    type: DT_FLOAT
+    name: "records"
+    type: DT_STRING
   }
   input_arg {
-    name: "max"
-    type: DT_FLOAT
+    name: "record_defaults"
+    type_list_attr: "OUT_TYPE"
   }
   output_arg {
-    name: "backprops_wrt_input"
-    type: DT_FLOAT
+    name: "output"
+    type_list_attr: "OUT_TYPE"
   }
-  output_arg {
-    name: "backprop_wrt_min"
-    type: DT_FLOAT
+  attr {
+    name: "OUT_TYPE"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
   }
-  output_arg {
-    name: "backprop_wrt_max"
-    type: DT_FLOAT
+  attr {
+    name: "field_delim"
+    type: "string"
+    default_value {
+      s: ","
+    }
   }
   attr {
-    name: "num_bits"
-    type: "int"
+    name: "use_quote_delim"
+    type: "bool"
     default_value {
-      i: 8
+      b: true
+    }
+  }
+  attr {
+    name: "na_value"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
 }
 op {
-  name: "FakeQuantWithMinMaxVarsGradient"
-  input_arg {
-    name: "gradients"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
-  }
+  name: "DecodeCSV"
   input_arg {
-    name: "min"
-    type: DT_FLOAT
+    name: "records"
+    type: DT_STRING
   }
   input_arg {
-    name: "max"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "backprops_wrt_input"
-    type: DT_FLOAT
+    name: "record_defaults"
+    type_list_attr: "OUT_TYPE"
   }
   output_arg {
-    name: "backprop_wrt_min"
-    type: DT_FLOAT
+    name: "output"
+    type_list_attr: "OUT_TYPE"
   }
-  output_arg {
-    name: "backprop_wrt_max"
-    type: DT_FLOAT
+  attr {
+    name: "OUT_TYPE"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
   }
   attr {
-    name: "num_bits"
-    type: "int"
+    name: "field_delim"
+    type: "string"
     default_value {
-      i: 8
+      s: ","
     }
   }
   attr {
-    name: "narrow_range"
+    name: "use_quote_delim"
     type: "bool"
     default_value {
-      b: false
+      b: true
+    }
+  }
+  attr {
+    name: "na_value"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
 }
 op {
-  name: "FakeQuantWithMinMaxVarsPerChannel"
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "min"
-    type: DT_FLOAT
-  }
+  name: "DecodeGif"
   input_arg {
-    name: "max"
-    type: DT_FLOAT
+    name: "contents"
+    type: DT_STRING
   }
   output_arg {
-    name: "outputs"
-    type: DT_FLOAT
+    name: "image"
+    type: DT_UINT8
   }
 }
 op {
-  name: "FakeQuantWithMinMaxVarsPerChannel"
+  name: "DecodeJSONExample"
   input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+    name: "json_examples"
+    type: DT_STRING
   }
-  input_arg {
-    name: "min"
-    type: DT_FLOAT
+  output_arg {
+    name: "binary_examples"
+    type: DT_STRING
   }
+}
+op {
+  name: "DecodeJpeg"
   input_arg {
-    name: "max"
-    type: DT_FLOAT
+    name: "contents"
+    type: DT_STRING
   }
   output_arg {
-    name: "outputs"
-    type: DT_FLOAT
+    name: "image"
+    type: DT_UINT8
   }
   attr {
-    name: "num_bits"
+    name: "channels"
     type: "int"
     default_value {
-      i: 8
+      i: 0
     }
   }
-}
-op {
-  name: "FakeQuantWithMinMaxVarsPerChannel"
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "min"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "max"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "outputs"
-    type: DT_FLOAT
-  }
   attr {
-    name: "num_bits"
+    name: "ratio"
     type: "int"
     default_value {
-      i: 8
+      i: 1
     }
   }
   attr {
-    name: "narrow_range"
+    name: "fancy_upscaling"
     type: "bool"
     default_value {
-      b: false
+      b: true
     }
   }
-}
-op {
-  name: "FakeQuantWithMinMaxVarsPerChannelGradient"
-  input_arg {
-    name: "gradients"
-    type: DT_FLOAT
+  attr {
+    name: "try_recover_truncated"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+  attr {
+    name: "acceptable_fraction"
+    type: "float"
+    default_value {
+      f: 1
+    }
   }
-  input_arg {
-    name: "min"
-    type: DT_FLOAT
+  attr {
+    name: "dct_method"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
+}
+op {
+  name: "DecodePng"
   input_arg {
-    name: "max"
-    type: DT_FLOAT
+    name: "contents"
+    type: DT_STRING
   }
   output_arg {
-    name: "backprops_wrt_input"
-    type: DT_FLOAT
+    name: "image"
+    type_attr: "dtype"
   }
-  output_arg {
-    name: "backprop_wrt_min"
-    type: DT_FLOAT
+  attr {
+    name: "channels"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
-  output_arg {
-    name: "backprop_wrt_max"
-    type: DT_FLOAT
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_UINT8
+    }
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_UINT16
+      }
+    }
   }
 }
 op {
-  name: "FakeQuantWithMinMaxVarsPerChannelGradient"
-  input_arg {
-    name: "gradients"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "min"
-    type: DT_FLOAT
-  }
+  name: "DecodeRaw"
   input_arg {
-    name: "max"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "backprops_wrt_input"
-    type: DT_FLOAT
+    name: "bytes"
+    type: DT_STRING
   }
   output_arg {
-    name: "backprop_wrt_min"
-    type: DT_FLOAT
+    name: "output"
+    type_attr: "out_type"
   }
-  output_arg {
-    name: "backprop_wrt_max"
-    type: DT_FLOAT
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+      }
+    }
   }
   attr {
-    name: "num_bits"
-    type: "int"
+    name: "little_endian"
+    type: "bool"
     default_value {
-      i: 8
+      b: true
     }
   }
 }
 op {
-  name: "FakeQuantWithMinMaxVarsPerChannelGradient"
+  name: "DecodeRaw"
   input_arg {
-    name: "gradients"
-    type: DT_FLOAT
+    name: "bytes"
+    type: DT_STRING
   }
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
   }
-  input_arg {
-    name: "min"
-    type: DT_FLOAT
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT16
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+      }
+    }
   }
-  input_arg {
-    name: "max"
-    type: DT_FLOAT
+  attr {
+    name: "little_endian"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
-  output_arg {
-    name: "backprops_wrt_input"
-    type: DT_FLOAT
+}
+op {
+  name: "DecodeWav"
+  input_arg {
+    name: "contents"
+    type: DT_STRING
   }
   output_arg {
-    name: "backprop_wrt_min"
+    name: "audio"
     type: DT_FLOAT
   }
   output_arg {
-    name: "backprop_wrt_max"
-    type: DT_FLOAT
+    name: "sample_rate"
+    type: DT_INT32
   }
   attr {
-    name: "num_bits"
+    name: "desired_channels"
     type: "int"
     default_value {
-      i: 8
+      i: -1
     }
   }
   attr {
-    name: "narrow_range"
-    type: "bool"
+    name: "desired_samples"
+    type: "int"
     default_value {
-      b: false
+      i: -1
     }
   }
 }
 op {
-  name: "FakeQueue"
+  name: "DeleteSessionTensor"
   input_arg {
-    name: "resource"
-    type: DT_RESOURCE
-  }
-  output_arg {
     name: "handle"
     type: DT_STRING
-    is_ref: true
   }
-  is_stateful: true
 }
 op {
-  name: "Fill"
+  name: "DenseToDenseSetOperation"
   input_arg {
-    name: "dims"
-    type: DT_INT32
+    name: "set1"
+    type_attr: "T"
   }
   input_arg {
-    name: "value"
+    name: "set2"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "result_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "result_values"
     type_attr: "T"
   }
+  output_arg {
+    name: "result_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "set_operation"
+    type: "string"
+  }
+  attr {
+    name: "validate_indices"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_STRING
+      }
+    }
   }
 }
 op {
-  name: "FilterDataset"
+  name: "DenseToSparseBatchDataset"
   input_arg {
     name: "input_dataset"
     type: DT_VARIANT
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "row_shape"
+    type: DT_INT64
   }
   output_arg {
     name: "handle"
     type: DT_VARIANT
   }
-  attr {
-    name: "predicate"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
   attr {
     name: "output_types"
     type: "list(type)"
@@ -11918,28 +12755,23 @@ op {
   is_stateful: true
 }
 op {
-  name: "FilterDataset"
+  name: "DenseToSparseBatchDataset"
   input_arg {
     name: "input_dataset"
     type: DT_VARIANT
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "row_shape"
+    type: DT_INT64
   }
   output_arg {
     name: "handle"
     type: DT_VARIANT
   }
-  attr {
-    name: "predicate"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
   attr {
     name: "output_types"
     type: "list(type)"
@@ -11954,547 +12786,502 @@ op {
   }
 }
 op {
-  name: "FixedLengthRecordDataset"
+  name: "DenseToSparseSetOperation"
   input_arg {
-    name: "filenames"
-    type: DT_STRING
+    name: "set1"
+    type_attr: "T"
   }
   input_arg {
-    name: "header_bytes"
+    name: "set2_indices"
     type: DT_INT64
   }
   input_arg {
-    name: "record_bytes"
-    type: DT_INT64
+    name: "set2_values"
+    type_attr: "T"
   }
   input_arg {
-    name: "footer_bytes"
+    name: "set2_shape"
     type: DT_INT64
   }
-  input_arg {
-    name: "buffer_size"
+  output_arg {
+    name: "result_indices"
     type: DT_INT64
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "result_values"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "FixedLengthRecordReader"
   output_arg {
-    name: "reader_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  attr {
-    name: "header_bytes"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "record_bytes"
-    type: "int"
+    name: "result_shape"
+    type: DT_INT64
   }
   attr {
-    name: "footer_bytes"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "set_operation"
+    type: "string"
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "validate_indices"
+    type: "bool"
     default_value {
-      s: ""
+      b: true
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_STRING
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "FixedLengthRecordReader"
+  name: "DepthToSpace"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
   output_arg {
-    name: "reader_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "header_bytes"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "record_bytes"
+    name: "block_size"
     type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+}
+op {
+  name: "DepthToSpace"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "footer_bytes"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "hop_bytes"
+    name: "block_size"
     type: "int"
-    default_value {
-      i: 0
-    }
+    has_minimum: true
+    minimum: 2
   }
   attr {
-    name: "container"
+    name: "data_format"
     type: "string"
     default_value {
-      s: ""
+      s: "NHWC"
     }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+        s: "NCHW_VECT_C"
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "FixedLengthRecordReaderV2"
-  output_arg {
-    name: "reader_handle"
-    type: DT_RESOURCE
+  name: "DepthwiseConv2dNative"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "header_bytes"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
   }
-  attr {
-    name: "record_bytes"
-    type: "int"
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "footer_bytes"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "shared_name"
+    name: "padding"
     type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "FixedLengthRecordReaderV2"
-  output_arg {
-    name: "reader_handle"
-    type: DT_RESOURCE
+  name: "DepthwiseConv2dNative"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "header_bytes"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
   }
-  attr {
-    name: "record_bytes"
-    type: "int"
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "footer_bytes"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
   }
   attr {
-    name: "hop_bytes"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "container"
+    name: "padding"
     type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "shared_name"
+    name: "data_format"
     type: "string"
     default_value {
-      s: ""
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "FixedLengthRecordReaderV2"
-  output_arg {
-    name: "reader_handle"
-    type: DT_RESOURCE
+  name: "DepthwiseConv2dNative"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "header_bytes"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
   }
-  attr {
-    name: "record_bytes"
-    type: "int"
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "footer_bytes"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
   }
   attr {
-    name: "hop_bytes"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "container"
+    name: "padding"
     type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "shared_name"
+    name: "data_format"
     type: "string"
     default_value {
-      s: ""
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
   attr {
-    name: "encoding"
-    type: "string"
+    name: "dilations"
+    type: "list(int)"
     default_value {
-      s: ""
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "FixedUnigramCandidateSampler"
+  name: "DepthwiseConv2dNativeBackpropFilter"
   input_arg {
-    name: "true_classes"
-    type: DT_INT64
+    name: "input"
+    type_attr: "T"
   }
-  output_arg {
-    name: "sampled_candidates"
-    type: DT_INT64
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
   }
-  output_arg {
-    name: "true_expected_count"
-    type: DT_FLOAT
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
   }
   output_arg {
-    name: "sampled_expected_count"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "num_true"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "num_sampled"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "unique"
-    type: "bool"
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
   attr {
-    name: "range_max"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "vocab_file"
+    name: "padding"
     type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
-  attr {
-    name: "distortion"
-    type: "float"
-    default_value {
-      f: 1
-    }
+}
+op {
+  name: "DepthwiseConv2dNativeBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "num_reserved_ids"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "num_shards"
-    type: "int"
-    default_value {
-      i: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
-    has_minimum: true
-    minimum: 1
   }
   attr {
-    name: "shard"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "unigrams"
-    type: "list(float)"
-    default_value {
+    name: "padding"
+    type: "string"
+    allowed_values {
       list {
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "seed"
-    type: "int"
+    name: "data_format"
+    type: "string"
     default_value {
-      i: 0
+      s: "NHWC"
     }
-  }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
 }
 op {
-  name: "FixedUnigramCandidateSampler"
+  name: "DepthwiseConv2dNativeBackpropFilter"
   input_arg {
-    name: "true_classes"
-    type: DT_INT64
+    name: "input"
+    type_attr: "T"
   }
-  output_arg {
-    name: "sampled_candidates"
-    type: DT_INT64
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
   }
-  output_arg {
-    name: "true_expected_count"
-    type: DT_FLOAT
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
   }
   output_arg {
-    name: "sampled_expected_count"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "num_true"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "num_sampled"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "unique"
-    type: "bool"
-  }
-  attr {
-    name: "range_max"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "vocab_file"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "distortion"
-    type: "float"
-    default_value {
-      f: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
   }
   attr {
-    name: "num_reserved_ids"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "num_shards"
-    type: "int"
-    default_value {
-      i: 1
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
-    has_minimum: true
-    minimum: 1
   }
   attr {
-    name: "shard"
-    type: "int"
+    name: "data_format"
+    type: "string"
     default_value {
-      i: 0
+      s: "NHWC"
     }
-    has_minimum: true
-  }
-  attr {
-    name: "unigrams"
-    type: "list(float)"
-    default_value {
+    allowed_values {
       list {
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
   attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "seed2"
-    type: "int"
+    name: "dilations"
+    type: "list(int)"
     default_value {
-      i: 0
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "FlatMapDataset"
+  name: "DepthwiseConv2dNativeBackpropInput"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "input_sizes"
+    type: DT_INT32
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "filter"
+    type_attr: "T"
   }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
   }
-  attr {
-    name: "f"
-    type: "func"
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "FlatMapDataset"
+  name: "DepthwiseConv2dNativeBackpropInput"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "input_sizes"
+    type: DT_INT32
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "filter"
+    type_attr: "T"
   }
-}
-op {
-  name: "Floor"
   input_arg {
-    name: "x"
+    name: "out_backprop"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -12502,59 +13289,55 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-}
-op {
-  name: "FloorDiv"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
+  attr {
+    name: "strides"
+    type: "list(int)"
   }
-  output_arg {
-    name: "z"
-    type_attr: "T"
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
 }
 op {
-  name: "FloorMod"
+  name: "DepthwiseConv2dNativeBackpropInput"
   input_arg {
-    name: "x"
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
+    name: "out_backprop"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -12562,186 +13345,281 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-}
-op {
-  name: "FractionalAvgPool"
-  input_arg {
-    name: "value"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "row_pooling_sequence"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "col_pooling_sequence"
-    type: DT_INT64
-  }
   attr {
-    name: "pooling_ratio"
-    type: "list(float)"
-    has_minimum: true
-    minimum: 4
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "pseudo_random"
-    type: "bool"
-    default_value {
-      b: false
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "overlapping"
-    type: "bool"
+    name: "data_format"
+    type: "string"
     default_value {
-      b: false
+      s: "NHWC"
     }
-  }
-  attr {
-    name: "deterministic"
-    type: "bool"
-    default_value {
-      b: false
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
   attr {
-    name: "seed"
-    type: "int"
+    name: "dilations"
+    type: "list(int)"
     default_value {
-      i: 0
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
     }
   }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
+}
+op {
+  name: "Dequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    default_value {
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
       }
     }
   }
 }
 op {
-  name: "FractionalAvgPoolGrad"
-  input_arg {
-    name: "orig_input_tensor_shape"
-    type: DT_INT64
-  }
+  name: "Dequantize"
   input_arg {
-    name: "out_backprop"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "row_pooling_sequence"
-    type: DT_INT64
+    name: "min_range"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "col_pooling_sequence"
-    type: DT_INT64
+    name: "max_range"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type: DT_FLOAT
   }
   attr {
-    name: "overlapping"
-    type: "bool"
-    default_value {
-      b: false
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "mode"
+    type: "string"
+    default_value {
+      s: "MIN_COMBINED"
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+        s: "SCALED"
       }
     }
   }
 }
 op {
-  name: "FractionalMaxPool"
+  name: "DeserializeIterator"
   input_arg {
-    name: "value"
-    type_attr: "T"
+    name: "resource_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "serialized"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "DeserializeManySparse"
+  input_arg {
+    name: "serialized_sparse"
+    type: DT_STRING
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "sparse_indices"
+    type: DT_INT64
   }
   output_arg {
-    name: "row_pooling_sequence"
+    name: "sparse_values"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "sparse_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+}
+op {
+  name: "DeserializeSparse"
+  input_arg {
+    name: "serialized_sparse"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "sparse_indices"
     type: DT_INT64
   }
   output_arg {
-    name: "col_pooling_sequence"
+    name: "sparse_values"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "sparse_shape"
     type: DT_INT64
   }
   attr {
-    name: "pooling_ratio"
-    type: "list(float)"
-    has_minimum: true
-    minimum: 4
+    name: "dtype"
+    type: "type"
+  }
+}
+op {
+  name: "DeserializeSparse"
+  input_arg {
+    name: "serialized_sparse"
+    type_attr: "Tserialized"
+  }
+  output_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sparse_values"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "sparse_shape"
+    type: DT_INT64
   }
   attr {
-    name: "pseudo_random"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "dtype"
+    type: "type"
   }
   attr {
-    name: "overlapping"
-    type: "bool"
+    name: "Tserialized"
+    type: "type"
     default_value {
-      b: false
+      type: DT_STRING
+    }
+    allowed_values {
+      list {
+        type: DT_STRING
+        type: DT_VARIANT
+      }
     }
   }
+}
+op {
+  name: "DestroyResourceOp"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
   attr {
-    name: "deterministic"
+    name: "ignore_lookup_error"
     type: "bool"
     default_value {
-      b: false
+      b: true
     }
   }
+  is_stateful: true
+}
+op {
+  name: "DestroyTemporaryVariable"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  output_arg {
+    name: "value"
+    type_attr: "T"
+  }
   attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "var_name"
+    type: "string"
+  }
+}
+op {
+  name: "Diag"
+  input_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -12752,96 +13630,71 @@ op {
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "FractionalMaxPoolGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "orig_output"
-    type_attr: "T"
-  }
+  name: "Diag"
   input_arg {
-    name: "out_backprop"
+    name: "diagonal"
     type_attr: "T"
   }
-  input_arg {
-    name: "row_pooling_sequence"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "col_pooling_sequence"
-    type: DT_INT64
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
-  attr {
-    name: "overlapping"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "FusedBatchNorm"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "scale"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "offset"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "mean"
-    type_attr: "T"
-  }
+  name: "DiagPart"
   input_arg {
-    name: "variance"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "batch_mean"
+    name: "diagonal"
     type_attr: "T"
   }
-  output_arg {
-    name: "batch_variance"
-    type_attr: "T"
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
-  output_arg {
-    name: "reserve_space_1"
+}
+op {
+  name: "DiagPart"
+  input_arg {
+    name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "reserve_space_2"
+    name: "diagonal"
     type_attr: "T"
   }
   attr {
@@ -12849,72 +13702,74 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  attr {
-    name: "epsilon"
-    type: "float"
-    default_value {
-      f: 0.0001
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-  }
-  attr {
-    name: "is_training"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
 }
 op {
-  name: "FusedBatchNormGrad"
-  input_arg {
-    name: "y_backprop"
-    type_attr: "T"
-  }
+  name: "Digamma"
   input_arg {
     name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "scale"
+  output_arg {
+    name: "y"
     type_attr: "T"
   }
-  input_arg {
-    name: "reserve_space_1"
-    type_attr: "T"
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
+}
+op {
+  name: "Digamma"
   input_arg {
-    name: "reserve_space_2"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "x_backprop"
+    name: "y"
     type_attr: "T"
   }
-  output_arg {
-    name: "scale_backprop"
-    type_attr: "T"
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
-  output_arg {
-    name: "offset_backprop"
+}
+op {
+  name: "Dilation2D"
+  input_arg {
+    name: "input"
     type_attr: "T"
   }
-  output_arg {
-    name: "reserve_space_3"
+  input_arg {
+    name: "filter"
     type_attr: "T"
   }
   output_arg {
-    name: "reserve_space_4"
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -12923,213 +13778,169 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "epsilon"
-    type: "float"
-    default_value {
-      f: 0.0001
-    }
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "is_training"
-    type: "bool"
-    default_value {
-      b: true
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
 }
 op {
-  name: "FusedBatchNormGradV2"
+  name: "Dilation2D"
   input_arg {
-    name: "y_backprop"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "x"
+    name: "filter"
     type_attr: "T"
   }
-  input_arg {
-    name: "scale"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "reserve_space_1"
-    type_attr: "U"
-  }
-  input_arg {
-    name: "reserve_space_2"
-    type_attr: "U"
-  }
   output_arg {
-    name: "x_backprop"
+    name: "output"
     type_attr: "T"
   }
-  output_arg {
-    name: "scale_backprop"
-    type_attr: "U"
-  }
-  output_arg {
-    name: "offset_backprop"
-    type_attr: "U"
-  }
-  output_arg {
-    name: "reserve_space_3"
-    type_attr: "U"
-  }
-  output_arg {
-    name: "reserve_space_4"
-    type_attr: "U"
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "U"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-      }
-    }
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "epsilon"
-    type: "float"
-    default_value {
-      f: 0.0001
-    }
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "data_format"
+    name: "padding"
     type: "string"
-    default_value {
-      s: "NHWC"
-    }
-  }
-  attr {
-    name: "is_training"
-    type: "bool"
-    default_value {
-      b: true
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
 }
 op {
-  name: "FusedBatchNormV2"
+  name: "Dilation2D"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "scale"
-    type_attr: "U"
-  }
-  input_arg {
-    name: "offset"
-    type_attr: "U"
-  }
-  input_arg {
-    name: "mean"
-    type_attr: "U"
-  }
-  input_arg {
-    name: "variance"
-    type_attr: "U"
-  }
-  output_arg {
-    name: "y"
+    name: "filter"
     type_attr: "T"
   }
   output_arg {
-    name: "batch_mean"
-    type_attr: "U"
-  }
-  output_arg {
-    name: "batch_variance"
-    type_attr: "U"
-  }
-  output_arg {
-    name: "reserve_space_1"
-    type_attr: "U"
-  }
-  output_arg {
-    name: "reserve_space_2"
-    type_attr: "U"
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "U"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-      }
-    }
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "epsilon"
-    type: "float"
-    default_value {
-      f: 0.0001
-    }
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "data_format"
+    name: "padding"
     type: "string"
-    default_value {
-      s: "NHWC"
-    }
-  }
-  attr {
-    name: "is_training"
-    type: "bool"
-    default_value {
-      b: true
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
 }
 op {
-  name: "FusedPadConv2D"
+  name: "Dilation2DBackpropFilter"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "paddings"
-    type: DT_INT32
+    name: "filter"
+    type_attr: "T"
   }
   input_arg {
-    name: "filter"
+    name: "out_backprop"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "filter_backprop"
     type_attr: "T"
   }
   attr {
@@ -13138,22 +13949,28 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "mode"
-    type: "string"
-    allowed_values {
-      list {
-        s: "REFLECT"
-        s: "SYMMETRIC"
-      }
-    }
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "strides"
+    name: "rates"
     type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
     name: "padding"
@@ -13167,25 +13984,21 @@ op {
   }
 }
 op {
-  name: "FusedResizeAndPadConv2D"
+  name: "Dilation2DBackpropFilter"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "size"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "paddings"
-    type: DT_INT32
+    name: "filter"
+    type_attr: "T"
   }
   input_arg {
-    name: "filter"
+    name: "out_backprop"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "filter_backprop"
     type_attr: "T"
   }
   attr {
@@ -13194,29 +14007,30 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "resize_align_corners"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "mode"
-    type: "string"
-    allowed_values {
-      list {
-        s: "REFLECT"
-        s: "SYMMETRIC"
-      }
-    }
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "strides"
+    name: "rates"
     type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
     name: "padding"
@@ -13230,261 +14044,247 @@ op {
   }
 }
 op {
-  name: "Gather"
+  name: "Dilation2DBackpropFilter"
   input_arg {
-    name: "params"
-    type_attr: "Tparams"
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "Tparams"
+    name: "filter"
+    type_attr: "T"
   }
-  attr {
-    name: "validate_indices"
-    type: "bool"
-    default_value {
-      b: true
-    }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
   }
-  attr {
-    name: "Tparams"
-    type: "type"
+  output_arg {
+    name: "filter_backprop"
+    type_attr: "T"
   }
   attr {
-    name: "Tindices"
+    name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
-}
-op {
-  name: "GatherNd"
-  input_arg {
-    name: "params"
-    type_attr: "Tparams"
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "Tparams"
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "Tparams"
-    type: "type"
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "Tindices"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
 }
 op {
-  name: "GatherV2"
+  name: "Dilation2DBackpropInput"
   input_arg {
-    name: "params"
-    type_attr: "Tparams"
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "filter"
+    type_attr: "T"
   }
   input_arg {
-    name: "axis"
-    type_attr: "Taxis"
+    name: "out_backprop"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "Tparams"
-  }
-  attr {
-    name: "Tparams"
-    type: "type"
+    name: "in_backprop"
+    type_attr: "T"
   }
   attr {
-    name: "Tindices"
+    name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "Taxis"
-    type: "type"
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
 }
 op {
-  name: "GenerateVocabRemapping"
-  input_arg {
-    name: "new_vocab_file"
-    type: DT_STRING
-  }
+  name: "Dilation2DBackpropInput"
   input_arg {
-    name: "old_vocab_file"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "remapping"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "num_present"
-    type: DT_INT32
-  }
-  attr {
-    name: "new_vocab_offset"
-    type: "int"
-    has_minimum: true
-  }
-  attr {
-    name: "num_new_vocab"
-    type: "int"
-    has_minimum: true
+    name: "input"
+    type_attr: "T"
   }
-}
-op {
-  name: "GenerateVocabRemapping"
   input_arg {
-    name: "new_vocab_file"
-    type: DT_STRING
+    name: "filter"
+    type_attr: "T"
   }
   input_arg {
-    name: "old_vocab_file"
-    type: DT_STRING
+    name: "out_backprop"
+    type_attr: "T"
   }
   output_arg {
-    name: "remapping"
-    type: DT_INT64
+    name: "in_backprop"
+    type_attr: "T"
   }
-  output_arg {
-    name: "num_present"
-    type: DT_INT32
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
-    name: "new_vocab_offset"
-    type: "int"
+    name: "strides"
+    type: "list(int)"
     has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "num_new_vocab"
-    type: "int"
+    name: "rates"
+    type: "list(int)"
     has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "old_vocab_size"
-    type: "int"
-    default_value {
-      i: -1
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
-    has_minimum: true
-    minimum: -1
   }
 }
 op {
-  name: "GetSessionHandle"
+  name: "Dilation2DBackpropInput"
   input_arg {
-    name: "value"
+    name: "input"
     type_attr: "T"
   }
-  output_arg {
-    name: "handle"
-    type: DT_STRING
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-}
-op {
-  name: "GetSessionHandle"
   input_arg {
-    name: "value"
+    name: "filter"
     type_attr: "T"
   }
-  output_arg {
-    name: "handle"
-    type: DT_STRING
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-  deprecation {
-    version: 23
-  }
-}
-op {
-  name: "GetSessionHandle"
   input_arg {
-    name: "value"
+    name: "out_backprop"
     type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_STRING
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-}
-op {
-  name: "GetSessionHandleV2"
-  input_arg {
-    name: "value"
+    name: "in_backprop"
     type_attr: "T"
   }
-  output_arg {
-    name: "handle"
-    type: DT_RESOURCE
-  }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
-  is_stateful: true
-}
-op {
-  name: "GetSessionTensor"
-  input_arg {
-    name: "handle"
-    type: DT_STRING
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
-  output_arg {
-    name: "value"
-    type_attr: "dtype"
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "dtype"
-    type: "type"
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
   }
 }
 op {
-  name: "Greater"
+  name: "Div"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -13495,28 +14295,30 @@ op {
   }
   output_arg {
     name: "z"
-    type: DT_BOOL
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
-        type: DT_INT16
         type: DT_INT8
         type: DT_UINT16
-        type: DT_HALF
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "Greater"
+  name: "Div"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -13527,717 +14329,648 @@ op {
   }
   output_arg {
     name: "z"
-    type: DT_BOOL
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
-        type: DT_INT16
         type: DT_INT8
         type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "GreaterEqual"
+  name: "DrawBoundingBoxes"
   input_arg {
-    name: "x"
+    name: "images"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "boxes"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "z"
-    type: DT_BOOL
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
         type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "GreaterEqual"
+  name: "DynamicPartition"
   input_arg {
-    name: "x"
+    name: "data"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "partitions"
+    type: DT_INT32
   }
   output_arg {
-    name: "z"
-    type: DT_BOOL
+    name: "outputs"
+    type_attr: "T"
+    number_attr: "num_partitions"
+  }
+  attr {
+    name: "num_partitions"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
   }
 }
 op {
-  name: "GroupByWindowDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "key_func_other_arguments"
-    type_list_attr: "Tkey_func_other_arguments"
-  }
+  name: "DynamicStitch"
   input_arg {
-    name: "reduce_func_other_arguments"
-    type_list_attr: "Treduce_func_other_arguments"
+    name: "indices"
+    type: DT_INT32
+    number_attr: "N"
   }
   input_arg {
-    name: "window_size_func_other_arguments"
-    type_list_attr: "Twindow_size_func_other_arguments"
+    name: "data"
+    type_attr: "T"
+    number_attr: "N"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "merged"
+    type_attr: "T"
   }
   attr {
-    name: "key_func"
-    type: "func"
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "reduce_func"
-    type: "func"
+    name: "T"
+    type: "type"
   }
-  attr {
-    name: "window_size_func"
-    type: "func"
+}
+op {
+  name: "EagerPyFunc"
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
   }
-  attr {
-    name: "Tkey_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
   }
   attr {
-    name: "Treduce_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
+    name: "token"
+    type: "string"
   }
   attr {
-    name: "Twindow_size_func_other_arguments"
+    name: "Tin"
     type: "list(type)"
     has_minimum: true
   }
   attr {
-    name: "output_types"
+    name: "Tout"
     type: "list(type)"
     has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
   }
   is_stateful: true
 }
 op {
-  name: "GroupByWindowDataset"
+  name: "EditDistance"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "hypothesis_indices"
+    type: DT_INT64
   }
   input_arg {
-    name: "key_func_other_arguments"
-    type_list_attr: "Tkey_func_other_arguments"
+    name: "hypothesis_values"
+    type_attr: "T"
   }
   input_arg {
-    name: "reduce_func_other_arguments"
-    type_list_attr: "Treduce_func_other_arguments"
+    name: "hypothesis_shape"
+    type: DT_INT64
   }
   input_arg {
-    name: "window_size_func_other_arguments"
-    type_list_attr: "Twindow_size_func_other_arguments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "key_func"
-    type: "func"
-  }
-  attr {
-    name: "reduce_func"
-    type: "func"
-  }
-  attr {
-    name: "window_size_func"
-    type: "func"
+    name: "truth_indices"
+    type: DT_INT64
   }
-  attr {
-    name: "Tkey_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "truth_values"
+    type_attr: "T"
   }
-  attr {
-    name: "Treduce_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "truth_shape"
+    type: DT_INT64
   }
-  attr {
-    name: "Twindow_size_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "normalize"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
   }
 }
 op {
-  name: "HSVToRGB"
+  name: "Elu"
   input_arg {
-    name: "images"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "activations"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "HashTable"
-  output_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "use_node_name_sharing"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "key_dtype"
-    type: "type"
-  }
-  attr {
-    name: "value_dtype"
-    type: "type"
+  name: "Elu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "HashTableV2"
   output_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "use_node_name_sharing"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "key_dtype"
-    type: "type"
+    name: "activations"
+    type_attr: "T"
   }
   attr {
-    name: "value_dtype"
+    name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "HistogramFixedWidth"
-  input_arg {
-    name: "values"
-    type_attr: "T"
-  }
+  name: "Elu"
   input_arg {
-    name: "value_range"
+    name: "features"
     type_attr: "T"
   }
-  input_arg {
-    name: "nbins"
-    type: DT_INT32
-  }
   output_arg {
-    name: "out"
-    type_attr: "dtype"
+    name: "activations"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  attr {
-    name: "dtype"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
 }
 op {
-  name: "HistogramSummary"
+  name: "EluGrad"
   input_arg {
-    name: "tag"
-    type: DT_STRING
+    name: "gradients"
+    type_attr: "T"
   }
   input_arg {
-    name: "values"
+    name: "outputs"
     type_attr: "T"
   }
   output_arg {
-    name: "summary"
-    type: DT_STRING
+    name: "backprops"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
         type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "HistogramSummary"
+  name: "EluGrad"
   input_arg {
-    name: "tag"
-    type: DT_STRING
+    name: "gradients"
+    type_attr: "T"
   }
   input_arg {
-    name: "values"
+    name: "outputs"
     type_attr: "T"
   }
   output_arg {
-    name: "summary"
-    type: DT_STRING
+    name: "backprops"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "IFFT"
-  input_arg {
-    name: "input"
-    type: DT_COMPLEX64
-  }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
-  }
-}
-op {
-  name: "IFFT2D"
-  input_arg {
-    name: "input"
-    type: DT_COMPLEX64
-  }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
-  }
-}
-op {
-  name: "IFFT3D"
-  input_arg {
-    name: "input"
-    type: DT_COMPLEX64
-  }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
-  }
-}
-op {
-  name: "IRFFT"
-  input_arg {
-    name: "input"
-    type: DT_COMPLEX64
-  }
-  input_arg {
-    name: "fft_length"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
-    type: DT_FLOAT
-  }
-}
-op {
-  name: "IRFFT2D"
-  input_arg {
-    name: "input"
-    type: DT_COMPLEX64
-  }
-  input_arg {
-    name: "fft_length"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
-    type: DT_FLOAT
-  }
-}
-op {
-  name: "IRFFT3D"
-  input_arg {
-    name: "input"
-    type: DT_COMPLEX64
-  }
+  name: "EluGrad"
   input_arg {
-    name: "fft_length"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
-    type: DT_FLOAT
+    name: "gradients"
+    type_attr: "T"
   }
-}
-op {
-  name: "Identity"
   input_arg {
-    name: "input"
+    name: "outputs"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "backprops"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
 }
 op {
-  name: "IdentityN"
+  name: "EncodeBase64"
   input_arg {
     name: "input"
-    type_list_attr: "T"
+    type: DT_STRING
   }
   output_arg {
     name: "output"
-    type_list_attr: "T"
+    type: DT_STRING
   }
   attr {
-    name: "T"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "pad"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
 }
 op {
-  name: "IdentityReader"
+  name: "EncodeJpeg"
+  input_arg {
+    name: "image"
+    type: DT_UINT8
+  }
   output_arg {
-    name: "reader_handle"
+    name: "contents"
     type: DT_STRING
-    is_ref: true
   }
   attr {
-    name: "container"
+    name: "format"
     type: "string"
     default_value {
       s: ""
     }
+    allowed_values {
+      list {
+        s: ""
+        s: "grayscale"
+        s: "rgb"
+      }
+    }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "quality"
+    type: "int"
     default_value {
-      s: ""
+      i: 95
     }
   }
-  is_stateful: true
-}
-op {
-  name: "IdentityReaderV2"
-  output_arg {
-    name: "reader_handle"
-    type: DT_RESOURCE
+  attr {
+    name: "progressive"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "container"
+    name: "optimize_size"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "chroma_downsampling"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "density_unit"
     type: "string"
     default_value {
-      s: ""
+      s: "in"
+    }
+    allowed_values {
+      list {
+        s: "in"
+        s: "cm"
+      }
     }
   }
   attr {
-    name: "shared_name"
+    name: "x_density"
+    type: "int"
+    default_value {
+      i: 300
+    }
+  }
+  attr {
+    name: "y_density"
+    type: "int"
+    default_value {
+      i: 300
+    }
+  }
+  attr {
+    name: "xmp_metadata"
     type: "string"
     default_value {
       s: ""
     }
   }
-  is_stateful: true
 }
 op {
-  name: "Igamma"
-  input_arg {
-    name: "a"
-    type_attr: "T"
-  }
+  name: "EncodePng"
   input_arg {
-    name: "x"
+    name: "image"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
-    type_attr: "T"
+    name: "contents"
+    type: DT_STRING
+  }
+  attr {
+    name: "compression"
+    type: "int"
+    default_value {
+      i: -1
+    }
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_UINT8
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_UINT16
       }
     }
   }
 }
 op {
-  name: "Igammac"
+  name: "EncodeWav"
   input_arg {
-    name: "a"
+    name: "audio"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sample_rate"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+}
+op {
+  name: "Enter"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "frame_name"
+    type: "string"
+  }
+  attr {
+    name: "is_constant"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "parallel_iterations"
+    type: "int"
+    default_value {
+      i: 10
+    }
+  }
+}
+op {
+  name: "Equal"
   input_arg {
     name: "x"
     type_attr: "T"
   }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
   output_arg {
     name: "z"
-    type_attr: "T"
+    type: DT_BOOL
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
       }
     }
   }
+  is_commutative: true
 }
 op {
-  name: "IgnoreErrorsDataset"
+  name: "Equal"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "x"
+    type_attr: "T"
   }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
+  input_arg {
+    name: "y"
+    type_attr: "T"
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "z"
+    type: DT_BOOL
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
+      }
+    }
   }
-  is_stateful: true
+  is_commutative: true
 }
 op {
-  name: "IgnoreErrorsDataset"
+  name: "Erf"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
 }
 op {
-  name: "Imag"
+  name: "Erf"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "Tout"
+    name: "y"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_COMPLEX64
-    }
     allowed_values {
       list {
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
+}
+op {
+  name: "Erfc"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
   attr {
-    name: "Tout"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -14245,157 +14978,112 @@ op {
   }
 }
 op {
-  name: "ImageSummary"
-  input_arg {
-    name: "tag"
-    type: DT_STRING
-  }
+  name: "Erfc"
   input_arg {
-    name: "tensor"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "summary"
-    type: DT_STRING
-  }
-  attr {
-    name: "max_images"
-    type: "int"
-    default_value {
-      i: 3
-    }
-    has_minimum: true
-    minimum: 1
+    name: "y"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_FLOAT
         type: DT_HALF
-      }
-    }
-  }
-  attr {
-    name: "bad_color"
-    type: "tensor"
-    default_value {
-      tensor {
-        dtype: DT_UINT8
-        tensor_shape {
-          dim {
-            size: 4
-          }
-        }
-        int_val: 255
-        int_val: 0
-        int_val: 0
-        int_val: 255
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "ImageSummary"
-  input_arg {
-    name: "tag"
-    type: DT_STRING
-  }
+  name: "Exit"
   input_arg {
-    name: "tensor"
+    name: "data"
     type_attr: "T"
   }
   output_arg {
-    name: "summary"
-    type: DT_STRING
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "max_images"
-    type: "int"
-    default_value {
-      i: 3
-    }
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "Exp"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_FLOAT
         type: DT_HALF
+        type: DT_FLOAT
         type: DT_DOUBLE
-      }
-    }
-  }
-  attr {
-    name: "bad_color"
-    type: "tensor"
-    default_value {
-      tensor {
-        dtype: DT_UINT8
-        tensor_shape {
-          dim {
-            size: 4
-          }
-        }
-        int_val: 255
-        int_val: 0
-        int_val: 0
-        int_val: 255
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "ImmutableConst"
+  name: "Exp"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
   output_arg {
-    name: "tensor"
-    type_attr: "dtype"
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
-  }
-  attr {
-    name: "shape"
-    type: "shape"
-  }
-  attr {
-    name: "memory_region_name"
-    type: "string"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
 }
 op {
-  name: "InTopK"
+  name: "ExpandDims"
   input_arg {
-    name: "predictions"
-    type: DT_FLOAT
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "targets"
-    type_attr: "T"
+    name: "dim"
+    type_attr: "Tdim"
   }
   output_arg {
-    name: "precision"
-    type: DT_BOOL
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "k"
-    type: "int"
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "T"
+    name: "Tdim"
     type: "type"
     default_value {
       type: DT_INT32
@@ -14409,409 +15097,279 @@ op {
   }
 }
 op {
-  name: "InTopKV2"
+  name: "Expm1"
   input_arg {
-    name: "predictions"
-    type: DT_FLOAT
+    name: "x"
+    type_attr: "T"
   }
-  input_arg {
-    name: "targets"
+  output_arg {
+    name: "y"
     type_attr: "T"
   }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Expm1"
   input_arg {
-    name: "k"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "precision"
-    type: DT_BOOL
+    name: "y"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "InitializeTable"
+  name: "ExtractGlimpse"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "input"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tkey"
+    name: "size"
+    type: DT_INT32
   }
   input_arg {
-    name: "values"
-    type_attr: "Tval"
+    name: "offsets"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "glimpse"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tkey"
-    type: "type"
+    name: "centered"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
-    name: "Tval"
-    type: "type"
+    name: "normalized"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "uniform_noise"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
 }
 op {
-  name: "InitializeTableFromTextFile"
+  name: "ExtractImagePatches"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "images"
+    type_attr: "T"
   }
-  input_arg {
-    name: "filename"
-    type: DT_STRING
+  output_arg {
+    name: "patches"
+    type_attr: "T"
   }
   attr {
-    name: "key_index"
-    type: "int"
+    name: "ksizes"
+    type: "list(int)"
     has_minimum: true
-    minimum: -2
+    minimum: 4
   }
   attr {
-    name: "value_index"
-    type: "int"
-    has_minimum: true
-    minimum: -2
-  }
-  attr {
-    name: "vocab_size"
-    type: "int"
-    default_value {
-      i: -1
-    }
-    has_minimum: true
-    minimum: -1
-  }
-  attr {
-    name: "delimiter"
-    type: "string"
-    default_value {
-      s: "\t"
-    }
-  }
-}
-op {
-  name: "InitializeTableFromTextFileV2"
-  input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "filename"
-    type: DT_STRING
-  }
-  attr {
-    name: "key_index"
-    type: "int"
+    name: "strides"
+    type: "list(int)"
     has_minimum: true
-    minimum: -2
+    minimum: 4
   }
   attr {
-    name: "value_index"
-    type: "int"
+    name: "rates"
+    type: "list(int)"
     has_minimum: true
-    minimum: -2
+    minimum: 4
   }
   attr {
-    name: "vocab_size"
-    type: "int"
-    default_value {
-      i: -1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
     }
-    has_minimum: true
-    minimum: -1
   }
   attr {
-    name: "delimiter"
+    name: "padding"
     type: "string"
-    default_value {
-      s: "\t"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
-  is_stateful: true
-}
-op {
-  name: "InitializeTableV2"
-  input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "keys"
-    type_attr: "Tkey"
-  }
-  input_arg {
-    name: "values"
-    type_attr: "Tval"
-  }
-  attr {
-    name: "Tkey"
-    type: "type"
-  }
-  attr {
-    name: "Tval"
-    type: "type"
-  }
-  is_stateful: true
-}
-op {
-  name: "InterleaveDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  input_arg {
-    name: "cycle_length"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "block_length"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
 }
 op {
-  name: "InterleaveDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  input_arg {
-    name: "cycle_length"
-    type: DT_INT64
-  }
+  name: "ExtractImagePatches"
   input_arg {
-    name: "block_length"
-    type: DT_INT64
+    name: "images"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
+    name: "patches"
+    type_attr: "T"
   }
   attr {
-    name: "Targuments"
-    type: "list(type)"
+    name: "ksizes"
+    type: "list(int)"
     has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
+    name: "strides"
+    type: "list(int)"
     has_minimum: true
-    minimum: 1
+    minimum: 4
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
+    name: "rates"
+    type: "list(int)"
     has_minimum: true
-    minimum: 1
-  }
-}
-op {
-  name: "Inv"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+    minimum: 4
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  deprecation {
-    version: 17
-  }
-}
-op {
-  name: "InvGrad"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
-  deprecation {
-    version: 17
-  }
 }
 op {
-  name: "InvGrad"
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
+  name: "ExtractImagePatches"
   input_arg {
-    name: "dy"
+    name: "images"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "patches"
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
-  }
-  deprecation {
-    version: 17
+    name: "ksizes"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
-}
-op {
-  name: "Invert"
-  input_arg {
-    name: "x"
-    type_attr: "T"
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
         type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
         type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
-}
-op {
-  name: "Invert"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_UINT32
-        type: DT_UINT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
 }
 op {
-  name: "InvertPermutation"
+  name: "ExtractJpegShape"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "contents"
+    type: DT_STRING
   }
   output_arg {
-    name: "y"
-    type_attr: "T"
+    name: "image_shape"
+    type_attr: "output_type"
   }
   attr {
-    name: "T"
+    name: "output_type"
     type: "type"
     default_value {
       type: DT_INT32
@@ -14825,1348 +15383,1258 @@ op {
   }
 }
 op {
-  name: "IsFinite"
+  name: "FFT"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "input"
+    type: DT_COMPLEX64
   }
   output_arg {
-    name: "y"
-    type: DT_BOOL
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+    name: "output"
+    type: DT_COMPLEX64
   }
 }
 op {
-  name: "IsInf"
+  name: "FFT2D"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "input"
+    type: DT_COMPLEX64
   }
   output_arg {
-    name: "y"
-    type: DT_BOOL
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+    name: "output"
+    type: DT_COMPLEX64
   }
 }
 op {
-  name: "IsNan"
+  name: "FFT3D"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "input"
+    type: DT_COMPLEX64
   }
   output_arg {
-    name: "y"
-    type: DT_BOOL
+    name: "output"
+    type: DT_COMPLEX64
+  }
+}
+op {
+  name: "FIFOQueue"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    default_value {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
+    has_minimum: true
   }
-}
-op {
-  name: "IsVariableInitialized"
-  input_arg {
-    name: "ref"
-    type_attr: "dtype"
-    is_ref: true
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
   }
-  output_arg {
-    name: "is_initialized"
-    type: DT_BOOL
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "dtype"
-    type: "type"
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  allows_uninitialized_input: true
+  is_stateful: true
 }
 op {
-  name: "Iterator"
+  name: "FIFOQueueV2"
   output_arg {
     name: "handle"
     type: DT_RESOURCE
   }
   attr {
-    name: "shared_name"
-    type: "string"
-  }
-  attr {
-    name: "container"
-    type: "string"
-  }
-  attr {
-    name: "output_types"
+    name: "component_types"
     type: "list(type)"
     has_minimum: true
     minimum: 1
   }
   attr {
-    name: "output_shapes"
+    name: "shapes"
     type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
     has_minimum: true
-    minimum: 1
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   is_stateful: true
 }
 op {
-  name: "IteratorFromStringHandle"
-  input_arg {
-    name: "string_handle"
-    type: DT_STRING
-  }
+  name: "Fact"
   output_arg {
-    name: "resource_handle"
-    type: DT_RESOURCE
+    name: "fact"
+    type: DT_STRING
   }
-  is_stateful: true
 }
 op {
-  name: "IteratorFromStringHandle"
+  name: "FakeQuantWithMinMaxArgs"
   input_arg {
-    name: "string_handle"
-    type: DT_STRING
+    name: "inputs"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "resource_handle"
-    type: DT_RESOURCE
+    name: "outputs"
+    type: DT_FLOAT
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
+    name: "min"
+    type: "float"
     default_value {
-      list {
-      }
+      f: -6
     }
-    has_minimum: true
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    default_value {
-      list {
-      }
+    name: "max"
+    type: "float"
+    default_value {
+      f: 6
     }
-    has_minimum: true
   }
-  is_stateful: true
 }
 op {
-  name: "IteratorGetNext"
+  name: "FakeQuantWithMinMaxArgs"
   input_arg {
-    name: "iterator"
-    type: DT_RESOURCE
+    name: "inputs"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "components"
-    type_list_attr: "output_types"
+    name: "outputs"
+    type: DT_FLOAT
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "min"
+    type: "float"
+    default_value {
+      f: -6
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "IteratorToStringHandle"
-  input_arg {
-    name: "resource_handle"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "string_handle"
-    type: DT_STRING
-  }
-  is_stateful: true
-}
-op {
-  name: "L2Loss"
-  input_arg {
-    name: "t"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "max"
+    type: "float"
+    default_value {
+      f: 6
+    }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_HALF
-      }
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
     }
   }
 }
 op {
-  name: "L2Loss"
+  name: "FakeQuantWithMinMaxArgs"
   input_arg {
-    name: "t"
-    type_attr: "T"
+    name: "inputs"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "outputs"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
+    name: "min"
+    type: "float"
+    default_value {
+      f: -6
     }
   }
-}
-op {
-  name: "LMDBReader"
-  output_arg {
-    name: "reader_handle"
-    type: DT_STRING
-    is_ref: true
+  attr {
+    name: "max"
+    type: "float"
+    default_value {
+      f: 6
+    }
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "num_bits"
+    type: "int"
     default_value {
-      s: ""
+      i: 8
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "narrow_range"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "LRN"
+  name: "FakeQuantWithMinMaxArgsGradient"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "backprops"
+    type: DT_FLOAT
   }
   attr {
-    name: "depth_radius"
-    type: "int"
+    name: "min"
+    type: "float"
     default_value {
-      i: 5
+      f: -6
     }
   }
   attr {
-    name: "bias"
+    name: "max"
     type: "float"
     default_value {
-      f: 1
+      f: 6
     }
   }
+}
+op {
+  name: "FakeQuantWithMinMaxArgsGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops"
+    type: DT_FLOAT
+  }
   attr {
-    name: "alpha"
+    name: "min"
     type: "float"
     default_value {
-      f: 1
+      f: -6
     }
   }
   attr {
-    name: "beta"
+    name: "max"
     type: "float"
     default_value {
-      f: 0.5
+      f: 6
     }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "num_bits"
+    type: "int"
     default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_HALF
-      }
+      i: 8
     }
   }
 }
 op {
-  name: "LRNGrad"
-  input_arg {
-    name: "input_grads"
-    type_attr: "T"
-  }
+  name: "FakeQuantWithMinMaxArgsGradient"
   input_arg {
-    name: "input_image"
-    type_attr: "T"
+    name: "gradients"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "output_image"
-    type_attr: "T"
+    name: "inputs"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "depth_radius"
-    type: "int"
-    default_value {
-      i: 5
-    }
+    name: "backprops"
+    type: DT_FLOAT
   }
   attr {
-    name: "bias"
+    name: "min"
     type: "float"
     default_value {
-      f: 1
+      f: -6
     }
   }
   attr {
-    name: "alpha"
+    name: "max"
     type: "float"
     default_value {
-      f: 1
+      f: 6
     }
   }
   attr {
-    name: "beta"
-    type: "float"
+    name: "num_bits"
+    type: "int"
     default_value {
-      f: 0.5
+      i: 8
     }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "narrow_range"
+    type: "bool"
     default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_HALF
-      }
+      b: false
     }
   }
 }
 op {
-  name: "LearnedUnigramCandidateSampler"
+  name: "FakeQuantWithMinMaxVars"
   input_arg {
-    name: "true_classes"
-    type: DT_INT64
+    name: "inputs"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "sampled_candidates"
-    type: DT_INT64
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "true_expected_count"
+    name: "outputs"
+    type: DT_FLOAT
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVars"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
     type: DT_FLOAT
   }
   output_arg {
-    name: "sampled_expected_count"
+    name: "outputs"
     type: DT_FLOAT
   }
   attr {
-    name: "num_true"
+    name: "num_bits"
     type: "int"
-    has_minimum: true
-    minimum: 1
+    default_value {
+      i: 8
+    }
   }
-  attr {
-    name: "num_sampled"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+}
+op {
+  name: "FakeQuantWithMinMaxVars"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
   }
-  attr {
-    name: "unique"
-    type: "bool"
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
   }
-  attr {
-    name: "range_max"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
   }
   attr {
-    name: "seed"
+    name: "num_bits"
     type: "int"
     default_value {
-      i: 0
+      i: 8
     }
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "narrow_range"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
   }
 }
 op {
-  name: "LearnedUnigramCandidateSampler"
+  name: "FakeQuantWithMinMaxVarsGradient"
   input_arg {
-    name: "true_classes"
-    type: DT_INT64
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "sampled_candidates"
-    type: DT_INT64
+    name: "backprops_wrt_input"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "true_expected_count"
+    name: "backprop_wrt_min"
     type: DT_FLOAT
   }
   output_arg {
-    name: "sampled_expected_count"
+    name: "backprop_wrt_max"
     type: DT_FLOAT
   }
-  attr {
-    name: "num_true"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+}
+op {
+  name: "FakeQuantWithMinMaxVarsGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
   }
-  attr {
-    name: "num_sampled"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
   }
-  attr {
-    name: "unique"
-    type: "bool"
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops_wrt_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "range_max"
+    name: "num_bits"
     type: "int"
-    has_minimum: true
-    minimum: 1
+    default_value {
+      i: 8
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVarsGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops_wrt_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "seed"
+    name: "num_bits"
     type: "int"
     default_value {
-      i: 0
+      i: 8
     }
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "narrow_range"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "LeftShift"
+  name: "FakeQuantWithMinMaxVarsPerChannel"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "inputs"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "min"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "z"
-    type_attr: "T"
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
   }
-  is_commutative: true
 }
 op {
-  name: "Less"
+  name: "FakeQuantWithMinMaxVarsPerChannel"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "inputs"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "z"
-    type: DT_BOOL
+    name: "outputs"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-      }
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
     }
   }
 }
 op {
-  name: "Less"
+  name: "FakeQuantWithMinMaxVarsPerChannel"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "inputs"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "z"
-    type: DT_BOOL
+    name: "outputs"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
     }
   }
-}
-op {
-  name: "LessEqual"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
-  }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-      }
+    name: "narrow_range"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
 }
 op {
-  name: "LessEqual"
+  name: "FakeQuantWithMinMaxVarsPerChannelGradient"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "gradients"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
+    name: "inputs"
+    type: DT_FLOAT
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "Lgamma"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "max"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "y"
-    type_attr: "T"
+    name: "backprops_wrt_input"
+    type: DT_FLOAT
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+  output_arg {
+    name: "backprop_wrt_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_max"
+    type: DT_FLOAT
   }
 }
 op {
-  name: "LinSpace"
-  input_arg {
-    name: "start"
-    type_attr: "T"
-  }
+  name: "FakeQuantWithMinMaxVarsPerChannelGradient"
   input_arg {
-    name: "stop"
-    type_attr: "T"
+    name: "gradients"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "num"
-    type_attr: "Tidx"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+    name: "inputs"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "ListDiff"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "min"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "max"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "out"
-    type_attr: "T"
+    name: "backprops_wrt_input"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "idx"
-    type_attr: "out_idx"
+    name: "backprop_wrt_min"
+    type: DT_FLOAT
   }
-  attr {
-    name: "T"
-    type: "type"
+  output_arg {
+    name: "backprop_wrt_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "out_idx"
-    type: "type"
+    name: "num_bits"
+    type: "int"
     default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      i: 8
     }
   }
 }
 op {
-  name: "LoadAndRemapMatrix"
+  name: "FakeQuantWithMinMaxVarsPerChannelGradient"
   input_arg {
-    name: "ckpt_path"
-    type: DT_STRING
+    name: "gradients"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "old_tensor_name"
-    type: DT_STRING
+    name: "inputs"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "row_remapping"
-    type: DT_INT64
+    name: "min"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "col_remapping"
-    type: DT_INT64
+    name: "max"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "initializing_values"
+  output_arg {
+    name: "backprops_wrt_input"
     type: DT_FLOAT
   }
   output_arg {
-    name: "output_matrix"
+    name: "backprop_wrt_min"
     type: DT_FLOAT
   }
-  attr {
-    name: "num_rows"
-    type: "int"
-    has_minimum: true
+  output_arg {
+    name: "backprop_wrt_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "num_cols"
+    name: "num_bits"
     type: "int"
-    has_minimum: true
-    minimum: 1
+    default_value {
+      i: 8
+    }
   }
   attr {
-    name: "max_rows_in_memory"
-    type: "int"
+    name: "narrow_range"
+    type: "bool"
     default_value {
-      i: -1
+      b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "Log"
+  name: "FakeQueue"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "resource"
+    type: DT_RESOURCE
   }
   output_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
   }
+  is_stateful: true
 }
 op {
-  name: "Log1p"
+  name: "Fill"
   input_arg {
-    name: "x"
+    name: "dims"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "value"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "output"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
   }
 }
 op {
-  name: "LogMatrixDeterminant"
+  name: "FilterDataset"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_VARIANT
   }
-  output_arg {
-    name: "sign"
-    type_attr: "T"
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
   }
   output_arg {
-    name: "log_abs_determinant"
-    type_attr: "T"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "predicate"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
   }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
 }
 op {
-  name: "LogSoftmax"
+  name: "FilterDataset"
   input_arg {
-    name: "logits"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
   }
   output_arg {
-    name: "logsoftmax"
-    type_attr: "T"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+    name: "predicate"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
 }
 op {
-  name: "LogUniformCandidateSampler"
+  name: "FixedLengthRecordDataset"
   input_arg {
-    name: "true_classes"
+    name: "filenames"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "header_bytes"
     type: DT_INT64
   }
-  output_arg {
-    name: "sampled_candidates"
+  input_arg {
+    name: "record_bytes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "footer_bytes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "buffer_size"
     type: DT_INT64
   }
   output_arg {
-    name: "true_expected_count"
-    type: DT_FLOAT
+    name: "handle"
+    type: DT_VARIANT
   }
+  is_stateful: true
+}
+op {
+  name: "FixedLengthRecordReader"
   output_arg {
-    name: "sampled_expected_count"
-    type: DT_FLOAT
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
   }
   attr {
-    name: "num_true"
+    name: "header_bytes"
     type: "int"
-    has_minimum: true
-    minimum: 1
+    default_value {
+      i: 0
+    }
   }
   attr {
-    name: "num_sampled"
+    name: "record_bytes"
     type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "unique"
-    type: "bool"
   }
   attr {
-    name: "range_max"
+    name: "footer_bytes"
     type: "int"
-    has_minimum: true
-    minimum: 1
+    default_value {
+      i: 0
+    }
   }
   attr {
-    name: "seed"
-    type: "int"
+    name: "container"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "shared_name"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
   }
+  is_stateful: true
 }
 op {
-  name: "LogUniformCandidateSampler"
-  input_arg {
-    name: "true_classes"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "sampled_candidates"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "true_expected_count"
-    type: DT_FLOAT
-  }
+  name: "FixedLengthRecordReader"
   output_arg {
-    name: "sampled_expected_count"
-    type: DT_FLOAT
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
   }
   attr {
-    name: "num_true"
+    name: "header_bytes"
     type: "int"
-    has_minimum: true
-    minimum: 1
+    default_value {
+      i: 0
+    }
   }
   attr {
-    name: "num_sampled"
+    name: "record_bytes"
     type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "unique"
-    type: "bool"
   }
   attr {
-    name: "range_max"
+    name: "footer_bytes"
     type: "int"
-    has_minimum: true
-    minimum: 1
+    default_value {
+      i: 0
+    }
   }
   attr {
-    name: "seed"
+    name: "hop_bytes"
     type: "int"
     default_value {
       i: 0
     }
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "container"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
   is_stateful: true
 }
 op {
-  name: "LogicalAnd"
-  input_arg {
-    name: "x"
-    type: DT_BOOL
-  }
-  input_arg {
-    name: "y"
-    type: DT_BOOL
-  }
+  name: "FixedLengthRecordReaderV2"
   output_arg {
-    name: "z"
-    type: DT_BOOL
-  }
-  is_commutative: true
-}
-op {
-  name: "LogicalNot"
-  input_arg {
-    name: "x"
-    type: DT_BOOL
+    name: "reader_handle"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "y"
-    type: DT_BOOL
+  attr {
+    name: "header_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
-}
-op {
-  name: "LogicalOr"
-  input_arg {
-    name: "x"
-    type: DT_BOOL
+  attr {
+    name: "record_bytes"
+    type: "int"
   }
-  input_arg {
-    name: "y"
-    type: DT_BOOL
+  attr {
+    name: "footer_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  is_commutative: true
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "LookupTableExport"
-  input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  output_arg {
-    name: "keys"
-    type_attr: "Tkeys"
-  }
+  name: "FixedLengthRecordReaderV2"
   output_arg {
-    name: "values"
-    type_attr: "Tvalues"
+    name: "reader_handle"
+    type: DT_RESOURCE
   }
   attr {
-    name: "Tkeys"
-    type: "type"
+    name: "header_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
   attr {
-    name: "Tvalues"
-    type: "type"
-  }
-}
-op {
-  name: "LookupTableExportV2"
-  input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "record_bytes"
+    type: "int"
   }
-  output_arg {
-    name: "keys"
-    type_attr: "Tkeys"
+  attr {
+    name: "footer_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
-  output_arg {
-    name: "values"
-    type_attr: "Tvalues"
+  attr {
+    name: "hop_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
   attr {
-    name: "Tkeys"
-    type: "type"
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tvalues"
-    type: "type"
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableFind"
-  input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  input_arg {
-    name: "keys"
-    type_attr: "Tin"
-  }
-  input_arg {
-    name: "default_value"
-    type_attr: "Tout"
-  }
+  name: "FixedLengthRecordReaderV2"
   output_arg {
-    name: "values"
-    type_attr: "Tout"
+    name: "reader_handle"
+    type: DT_RESOURCE
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "header_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
   attr {
-    name: "Tout"
-    type: "type"
-  }
-}
-op {
-  name: "LookupTableFindV2"
-  input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "record_bytes"
+    type: "int"
   }
-  input_arg {
-    name: "keys"
-    type_attr: "Tin"
+  attr {
+    name: "footer_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
-  input_arg {
-    name: "default_value"
-    type_attr: "Tout"
+  attr {
+    name: "hop_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
-  output_arg {
-    name: "values"
-    type_attr: "Tout"
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "encoding"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableImport"
+  name: "FixedUnigramCandidateSampler"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "true_classes"
+    type: DT_INT64
   }
-  input_arg {
-    name: "keys"
-    type_attr: "Tin"
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
   }
-  input_arg {
-    name: "values"
-    type_attr: "Tout"
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
   }
-  attr {
-    name: "Tin"
-    type: "type"
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tout"
-    type: "type"
-  }
-}
-op {
-  name: "LookupTableImportV2"
-  input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "keys"
-    type_attr: "Tin"
-  }
-  input_arg {
-    name: "values"
-    type_attr: "Tout"
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "Tout"
-    type: "type"
-  }
-  is_stateful: true
-}
-op {
-  name: "LookupTableInsert"
-  input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "unique"
+    type: "bool"
   }
-  input_arg {
-    name: "keys"
-    type_attr: "Tin"
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
-  input_arg {
-    name: "values"
-    type_attr: "Tout"
+  attr {
+    name: "vocab_file"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "distortion"
+    type: "float"
+    default_value {
+      f: 1
+    }
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "num_reserved_ids"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
-}
-op {
-  name: "LookupTableInsertV2"
-  input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+  attr {
+    name: "num_shards"
+    type: "int"
+    default_value {
+      i: 1
+    }
+    has_minimum: true
+    minimum: 1
   }
-  input_arg {
-    name: "keys"
-    type_attr: "Tin"
+  attr {
+    name: "shard"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
   }
-  input_arg {
-    name: "values"
-    type_attr: "Tout"
+  attr {
+    name: "unigrams"
+    type: "list(float)"
+    default_value {
+      list {
+      }
+    }
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "LookupTableSize"
+  name: "FixedUnigramCandidateSampler"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  output_arg {
-    name: "size"
+    name: "true_classes"
     type: DT_INT64
   }
-}
-op {
-  name: "LookupTableSizeV2"
-  input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
-  }
   output_arg {
-    name: "size"
+    name: "sampled_candidates"
     type: DT_INT64
   }
-  is_stateful: true
-}
-op {
-  name: "LoopCond"
-  input_arg {
-    name: "input"
-    type: DT_BOOL
-  }
   output_arg {
-    name: "output"
-    type: DT_BOOL
-  }
-}
-op {
-  name: "MakeIterator"
-  input_arg {
-    name: "dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "iterator"
-    type: DT_RESOURCE
-  }
-  is_stateful: true
-}
-op {
-  name: "MapAndBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "num_parallel_batches"
-    type: DT_INT64
+    name: "true_expected_count"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
+    name: "sampled_expected_count"
+    type: DT_FLOAT
   }
   attr {
-    name: "Targuments"
-    type: "list(type)"
+    name: "num_true"
+    type: "int"
     has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
+    name: "num_sampled"
+    type: "int"
     has_minimum: true
     minimum: 1
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
     has_minimum: true
     minimum: 1
   }
-}
-op {
-  name: "MapClear"
   attr {
-    name: "capacity"
+    name: "vocab_file"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "distortion"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "num_reserved_ids"
     type: "int"
     default_value {
       i: 0
     }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+    default_value {
+      i: 1
+    }
     has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "memory_limit"
+    name: "shard"
     type: "int"
     default_value {
       i: 0
@@ -16174,27 +16642,31 @@ op {
     has_minimum: true
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
+    name: "unigrams"
+    type: "list(float)"
+    default_value {
+      list {
+      }
+    }
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "seed"
+    type: "int"
     default_value {
-      s: ""
+      i: 0
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "seed2"
+    type: "int"
     default_value {
-      s: ""
+      i: 0
     }
   }
   is_stateful: true
 }
 op {
-  name: "MapDataset"
+  name: "FlatMapDataset"
   input_arg {
     name: "input_dataset"
     type: DT_VARIANT
@@ -16231,7 +16703,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "MapDataset"
+  name: "FlatMapDataset"
   input_arg {
     name: "input_dataset"
     type: DT_VARIANT
@@ -16267,327 +16739,345 @@ op {
   }
 }
 op {
-  name: "MapIncompleteSize"
-  output_arg {
-    name: "size"
-    type: DT_INT32
+  name: "Floor"
+  input_arg {
+    name: "x"
+    type_attr: "T"
   }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
-    has_minimum: true
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
+}
+op {
+  name: "Floor"
+  input_arg {
+    name: "x"
+    type_attr: "T"
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "MapPeek"
+  name: "FloorDiv"
   input_arg {
-    name: "key"
-    type: DT_INT64
+    name: "x"
+    type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "y"
+    type_attr: "T"
   }
   output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
+    name: "z"
+    type_attr: "T"
   }
   attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
-    has_minimum: true
   }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+}
+op {
+  name: "FloorDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "y"
+    type_attr: "T"
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "z"
+    type_attr: "T"
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "MapSize"
+  name: "FloorMod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
   output_arg {
-    name: "size"
-    type: DT_INT32
+    name: "z"
+    type_attr: "T"
   }
   attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
-    has_minimum: true
   }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+}
+op {
+  name: "FloorMod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
+  input_arg {
+    name: "y"
+    type_attr: "T"
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "z"
+    type_attr: "T"
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "MapStage"
+  name: "FractionalAvgPool"
   input_arg {
-    name: "key"
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "row_pooling_sequence"
     type: DT_INT64
   }
-  input_arg {
-    name: "indices"
-    type: DT_INT32
+  output_arg {
+    name: "col_pooling_sequence"
+    type: DT_INT64
   }
-  input_arg {
-    name: "values"
-    type_list_attr: "fake_dtypes"
+  attr {
+    name: "pooling_ratio"
+    type: "list(float)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "capacity"
-    type: "int"
+    name: "pseudo_random"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
-    has_minimum: true
   }
   attr {
-    name: "memory_limit"
-    type: "int"
+    name: "overlapping"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
-    has_minimum: true
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
+    name: "deterministic"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "fake_dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "seed2"
+    type: "int"
     default_value {
-      s: ""
+      i: 0
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "MapUnstage"
+  name: "FractionalAvgPoolGrad"
   input_arg {
-    name: "key"
+    name: "orig_input_tensor_shape"
     type: DT_INT64
   }
   input_arg {
-    name: "indices"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
+    name: "out_backprop"
+    type_attr: "T"
   }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "row_pooling_sequence"
+    type: DT_INT64
   }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "col_pooling_sequence"
+    type: DT_INT64
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "overlapping"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "MapUnstageNoKey"
+  name: "FractionalMaxPool"
   input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "value"
+    type_attr: "T"
   }
   output_arg {
-    name: "key"
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "row_pooling_sequence"
     type: DT_INT64
   }
   output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
+    name: "col_pooling_sequence"
+    type: DT_INT64
   }
   attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "pooling_ratio"
+    type: "list(float)"
     has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "memory_limit"
-    type: "int"
+    name: "pseudo_random"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "overlapping"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "deterministic"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
-  is_stateful: true
-}
-op {
-  name: "MatMul"
-  input_arg {
-    name: "a"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "b"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "product"
-    type_attr: "T"
-  }
   attr {
-    name: "transpose_a"
-    type: "bool"
+    name: "seed"
+    type: "int"
     default_value {
-      b: false
+      i: 0
     }
   }
   attr {
-    name: "transpose_b"
-    type: "bool"
+    name: "seed2"
+    type: "int"
     default_value {
-      b: false
+      i: 0
     }
   }
   attr {
@@ -16595,59 +17085,46 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "MatchingFiles"
+  name: "FractionalMaxPoolGrad"
   input_arg {
-    name: "pattern"
-    type: DT_STRING
+    name: "orig_input"
+    type_attr: "T"
   }
-  output_arg {
-    name: "filenames"
-    type: DT_STRING
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
   }
-}
-op {
-  name: "MatrixBandPart"
   input_arg {
-    name: "input"
+    name: "out_backprop"
     type_attr: "T"
   }
   input_arg {
-    name: "num_lower"
+    name: "row_pooling_sequence"
     type: DT_INT64
   }
   input_arg {
-    name: "num_upper"
+    name: "col_pooling_sequence"
     type: DT_INT64
   }
   output_arg {
-    name: "band"
+    name: "output"
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
-  }
-}
-op {
-  name: "MatrixDeterminant"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "overlapping"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
     name: "T"
@@ -16656,71 +17133,52 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "MatrixDeterminant"
+  name: "FusedBatchNorm"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "scale"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
-  }
-}
-op {
-  name: "MatrixDiag"
   input_arg {
-    name: "diagonal"
+    name: "offset"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "mean"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-  }
-}
-op {
-  name: "MatrixDiagPart"
   input_arg {
-    name: "input"
+    name: "variance"
     type_attr: "T"
   }
   output_arg {
-    name: "diagonal"
+    name: "y"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
+  output_arg {
+    name: "batch_mean"
+    type_attr: "T"
   }
-}
-op {
-  name: "MatrixExponential"
-  input_arg {
-    name: "input"
+  output_arg {
+    name: "batch_variance"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "reserve_space_1"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space_2"
     type_attr: "T"
   }
   attr {
@@ -16728,141 +17186,72 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
         type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
-}
-op {
-  name: "MatrixInverse"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
-    name: "adjoint"
-    type: "bool"
+    name: "epsilon"
+    type: "float"
     default_value {
-      b: false
+      f: 0.0001
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-      }
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
     }
   }
-}
-op {
-  name: "MatrixInverse"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
-    name: "adjoint"
+    name: "is_training"
     type: "bool"
     default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+      b: true
     }
   }
 }
 op {
-  name: "MatrixSetDiag"
+  name: "FusedBatchNormGrad"
   input_arg {
-    name: "input"
+    name: "y_backprop"
     type_attr: "T"
   }
   input_arg {
-    name: "diagonal"
+    name: "x"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "scale"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-  }
-}
-op {
-  name: "MatrixSolve"
   input_arg {
-    name: "matrix"
+    name: "reserve_space_1"
     type_attr: "T"
   }
   input_arg {
-    name: "rhs"
+    name: "reserve_space_2"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "x_backprop"
     type_attr: "T"
   }
-  attr {
-    name: "adjoint"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
-  }
-}
-op {
-  name: "MatrixSolveLs"
-  input_arg {
-    name: "matrix"
+  output_arg {
+    name: "scale_backprop"
     type_attr: "T"
   }
-  input_arg {
-    name: "rhs"
+  output_arg {
+    name: "offset_backprop"
     type_attr: "T"
   }
-  input_arg {
-    name: "l2_regularizer"
-    type: DT_DOUBLE
+  output_arg {
+    name: "reserve_space_3"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "reserve_space_4"
     type_attr: "T"
   }
   attr {
@@ -16870,13 +17259,26 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
         type: DT_FLOAT
       }
     }
   }
   attr {
-    name: "fast"
+    name: "epsilon"
+    type: "float"
+    default_value {
+      f: 0.0001
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    name: "is_training"
     type: "bool"
     default_value {
       b: true
@@ -16884,37 +17286,82 @@ op {
   }
 }
 op {
-  name: "MatrixSolveLs"
+  name: "FusedBatchNormGradV2"
   input_arg {
-    name: "matrix"
+    name: "y_backprop"
     type_attr: "T"
   }
   input_arg {
-    name: "rhs"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "l2_regularizer"
-    type: DT_DOUBLE
+    name: "scale"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "reserve_space_1"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "reserve_space_2"
+    type_attr: "U"
   }
   output_arg {
-    name: "output"
+    name: "x_backprop"
     type_attr: "T"
   }
+  output_arg {
+    name: "scale_backprop"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "offset_backprop"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_3"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_4"
+    type_attr: "U"
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
+        type: DT_HALF
         type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
   attr {
-    name: "fast"
+    name: "U"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "epsilon"
+    type: "float"
+    default_value {
+      f: 0.0001
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    name: "is_training"
     type: "bool"
     default_value {
       b: true
@@ -16922,207 +17369,270 @@ op {
   }
 }
 op {
-  name: "MatrixTriangularSolve"
+  name: "FusedBatchNormGradV2"
   input_arg {
-    name: "matrix"
+    name: "y_backprop"
     type_attr: "T"
   }
   input_arg {
-    name: "rhs"
+    name: "x"
     type_attr: "T"
   }
+  input_arg {
+    name: "scale"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "reserve_space_1"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "reserve_space_2"
+    type_attr: "U"
+  }
   output_arg {
-    name: "output"
+    name: "x_backprop"
     type_attr: "T"
   }
-  attr {
-    name: "lower"
-    type: "bool"
-    default_value {
-      b: true
-    }
+  output_arg {
+    name: "scale_backprop"
+    type_attr: "U"
   }
-  attr {
-    name: "adjoint"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  output_arg {
+    name: "offset_backprop"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_3"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_4"
+    type_attr: "U"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
   }
-}
-op {
-  name: "MatrixTriangularSolve"
-  input_arg {
-    name: "matrix"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "rhs"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  attr {
+    name: "U"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
   }
   attr {
-    name: "lower"
-    type: "bool"
+    name: "epsilon"
+    type: "float"
     default_value {
-      b: true
+      f: 0.0001
     }
   }
   attr {
-    name: "adjoint"
-    type: "bool"
+    name: "data_format"
+    type: "string"
     default_value {
-      b: false
+      s: "NHWC"
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
     }
   }
 }
 op {
-  name: "Max"
+  name: "FusedBatchNormV2"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "scale"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "offset"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "mean"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "variance"
+    type_attr: "U"
   }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  output_arg {
+    name: "batch_mean"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "batch_variance"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_1"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_2"
+    type_attr: "U"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
         type: DT_HALF
+        type: DT_FLOAT
       }
     }
   }
   attr {
-    name: "Tidx"
+    name: "U"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_FLOAT
       }
     }
   }
+  attr {
+    name: "epsilon"
+    type: "float"
+    default_value {
+      f: 0.0001
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
 }
 op {
-  name: "Max"
+  name: "FusedBatchNormV2"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "scale"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "offset"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "mean"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "variance"
+    type_attr: "U"
   }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  output_arg {
+    name: "batch_mean"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "batch_variance"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_1"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_2"
+    type_attr: "U"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_BFLOAT16
+        type: DT_FLOAT
       }
     }
   }
   attr {
-    name: "Tidx"
+    name: "U"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_FLOAT
       }
     }
   }
+  attr {
+    name: "epsilon"
+    type: "float"
+    default_value {
+      f: 0.0001
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
 }
 op {
-  name: "MaxPool"
+  name: "FusedPadConv2D"
   input_arg {
     name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "paddings"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -17130,27 +17640,25 @@ op {
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_HALF
       }
     }
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    name: "mode"
+    type: "string"
+    allowed_values {
+      list {
+        s: "REFLECT"
+        s: "SYMMETRIC"
+      }
+    }
   }
   attr {
     name: "strides"
     type: "list(int)"
-    has_minimum: true
-    minimum: 4
   }
   attr {
     name: "padding"
@@ -17162,26 +17670,25 @@ op {
       }
     }
   }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
-  }
 }
 op {
-  name: "MaxPool"
+  name: "FusedResizeAndPadConv2D"
   input_arg {
     name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "paddings"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -17189,597 +17696,650 @@ op {
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    name: "resize_align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "padding"
+    name: "mode"
     type: "string"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        s: "REFLECT"
+        s: "SYMMETRIC"
       }
     }
   }
   attr {
-    name: "data_format"
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
     type: "string"
-    default_value {
-      s: "NHWC"
-    }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
 }
 op {
-  name: "MaxPool"
+  name: "Gather"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "params"
+    type_attr: "Tparams"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "Tparams"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "validate_indices"
+    type: "bool"
     default_value {
-      type: DT_FLOAT
+      b: true
     }
+  }
+  attr {
+    name: "Tparams"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_QINT8
       }
     }
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+}
+op {
+  name: "GatherNd"
+  input_arg {
+    name: "params"
+    type_attr: "Tparams"
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tparams"
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+    name: "Tparams"
+    type: "type"
   }
   attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
+    name: "Tindices"
+    type: "type"
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
-        s: "NCHW_VECT_C"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "MaxPool3D"
+  name: "GatherV2"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "params"
+    type_attr: "Tparams"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Taxis"
   }
   output_arg {
     name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    type_attr: "Tparams"
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "Tparams"
+    type: "type"
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "Tindices"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "T"
+    name: "Taxis"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "MaxPool3D"
+  name: "GenerateVocabRemapping"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "new_vocab_file"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "old_vocab_file"
+    type: DT_STRING
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "remapping"
+    type: DT_INT64
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+  output_arg {
+    name: "num_present"
+    type: DT_INT32
   }
   attr {
-    name: "strides"
-    type: "list(int)"
+    name: "new_vocab_offset"
+    type: "int"
     has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
-    }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-      }
-    }
+    name: "num_new_vocab"
+    type: "int"
+    has_minimum: true
   }
 }
 op {
-  name: "MaxPool3DGrad"
+  name: "GenerateVocabRemapping"
   input_arg {
-    name: "orig_input"
-    type: DT_FLOAT
+    name: "new_vocab_file"
+    type: DT_STRING
   }
   input_arg {
-    name: "orig_output"
-    type: DT_FLOAT
+    name: "old_vocab_file"
+    type: DT_STRING
   }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
+  output_arg {
+    name: "remapping"
+    type: DT_INT64
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "num_present"
+    type: DT_INT32
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
+    name: "new_vocab_offset"
+    type: "int"
     has_minimum: true
-    minimum: 5
   }
   attr {
-    name: "strides"
-    type: "list(int)"
+    name: "num_new_vocab"
+    type: "int"
     has_minimum: true
-    minimum: 5
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    name: "old_vocab_size"
+    type: "int"
+    default_value {
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
+  }
+}
+op {
+  name: "GetSessionHandle"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_STRING
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-      }
-    }
   }
 }
 op {
-  name: "MaxPool3DGrad"
+  name: "GetSessionHandle"
   input_arg {
-    name: "orig_input"
-    type: DT_FLOAT
+    name: "value"
+    type_attr: "T"
   }
-  input_arg {
-    name: "orig_output"
-    type: DT_FLOAT
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
   }
+  deprecation {
+    version: 23
+  }
+}
+op {
+  name: "GetSessionHandle"
   input_arg {
-    name: "grad"
+    name: "value"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "handle"
+    type: DT_STRING
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "T"
+    type: "type"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+}
+op {
+  name: "GetSessionHandleV2"
+  input_arg {
+    name: "value"
+    type_attr: "T"
   }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
-    }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-      }
-    }
   }
+  is_stateful: true
 }
 op {
-  name: "MaxPool3DGrad"
+  name: "GetSessionTensor"
   input_arg {
-    name: "orig_input"
-    type_attr: "TInput"
+    name: "handle"
+    type: DT_STRING
   }
-  input_arg {
-    name: "orig_output"
-    type_attr: "TInput"
+  output_arg {
+    name: "value"
+    type_attr: "dtype"
   }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+}
+op {
+  name: "Greater"
   input_arg {
-    name: "grad"
+    name: "x"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "y"
     type_attr: "T"
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+  output_arg {
+    name: "z"
+    type: DT_BOOL
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
       }
     }
   }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
-    }
+}
+op {
+  name: "Greater"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
+}
+op {
+  name: "Greater"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
   attr {
-    name: "TInput"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "MaxPool3DGradGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
+  name: "GreaterEqual"
   input_arg {
-    name: "orig_output"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "z"
+    type: DT_BOOL
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
       }
     }
   }
+}
+op {
+  name: "GreaterEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
   attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NDHWC"
-    }
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
+}
+op {
+  name: "GreaterEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "MaxPoolGrad"
+  name: "GroupByWindowDataset"
   input_arg {
-    name: "orig_input"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "orig_output"
-    type_attr: "T"
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "window_size_func_other_arguments"
+    type_list_attr: "Twindow_size_func_other_arguments"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
+    name: "key_func"
+    type: "func"
+  }
+  attr {
+    name: "reduce_func"
+    type: "func"
+  }
+  attr {
+    name: "window_size_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
     has_minimum: true
-    minimum: 4
   }
   attr {
-    name: "strides"
-    type: "list(int)"
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
     has_minimum: true
-    minimum: 4
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+    name: "Twindow_size_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
   }
   attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_HALF
-      }
-    }
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
+  is_stateful: true
 }
 op {
-  name: "MaxPoolGrad"
+  name: "GroupByWindowDataset"
   input_arg {
-    name: "orig_input"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "orig_output"
-    type_attr: "T"
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "window_size_func_other_arguments"
+    type_list_attr: "Twindow_size_func_other_arguments"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
+    name: "key_func"
+    type: "func"
+  }
+  attr {
+    name: "reduce_func"
+    type: "func"
+  }
+  attr {
+    name: "window_size_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
     has_minimum: true
-    minimum: 4
   }
   attr {
-    name: "strides"
-    type: "list(int)"
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
     has_minimum: true
-    minimum: 4
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+    name: "Twindow_size_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
   }
   attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "GuaranteeConst"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "HSVToRGB"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -17791,149 +18351,149 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "MaxPoolGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
+  name: "HashTable"
+  output_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
-  input_arg {
-    name: "orig_output"
-    type_attr: "T"
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    name: "key_dtype"
+    type: "type"
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    name: "value_dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "HashTableV2"
+  output_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
   }
   attr {
-    name: "padding"
+    name: "container"
     type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    default_value {
+      s: ""
     }
   }
   attr {
-    name: "data_format"
+    name: "shared_name"
     type: "string"
     default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
+      s: ""
     }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "use_node_name_sharing"
+    type: "bool"
     default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
+      b: false
     }
   }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  is_stateful: true
 }
 op {
-  name: "MaxPoolGradGrad"
+  name: "HistogramFixedWidth"
   input_arg {
-    name: "orig_input"
+    name: "values"
     type_attr: "T"
   }
   input_arg {
-    name: "orig_output"
+    name: "value_range"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "nbins"
+    type: DT_INT32
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    name: "out"
+    type_attr: "dtype"
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "dtype"
+    type: "type"
     default_value {
-      s: "NHWC"
+      type: DT_INT32
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+}
+op {
+  name: "HistogramSummary"
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
         type: DT_FLOAT
@@ -17950,61 +18510,25 @@ op {
   }
 }
 op {
-  name: "MaxPoolGradGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
+  name: "HistogramSummary"
   input_arg {
-    name: "orig_output"
-    type_attr: "T"
+    name: "tag"
+    type: DT_STRING
   }
   input_arg {
-    name: "grad"
+    name: "values"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
+    name: "summary"
+    type: DT_STRING
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
         type: DT_FLOAT
@@ -18023,57 +18547,25 @@ op {
   }
 }
 op {
-  name: "MaxPoolGradGradV2"
+  name: "HistogramSummary"
   input_arg {
-    name: "orig_input"
-    type_attr: "T"
+    name: "tag"
+    type: DT_STRING
   }
   input_arg {
-    name: "orig_output"
+    name: "values"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "ksize"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
   output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
+    name: "summary"
+    type: DT_STRING
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
         type: DT_FLOAT
@@ -18085,197 +18577,182 @@ op {
         type: DT_INT8
         type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradGradV2"
+  name: "IFFT"
   input_arg {
-    name: "orig_input"
-    type_attr: "T"
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
   }
+}
+op {
+  name: "IFFT2D"
   input_arg {
-    name: "orig_output"
-    type_attr: "T"
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
   }
+}
+op {
+  name: "IFFT3D"
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+}
+op {
+  name: "IRFFT"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
   }
   input_arg {
-    name: "ksize"
+    name: "fft_length"
     type: DT_INT32
   }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+}
+op {
+  name: "IRFFT2D"
   input_arg {
-    name: "strides"
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  input_arg {
+    name: "fft_length"
     type: DT_INT32
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type: DT_FLOAT
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+}
+op {
+  name: "IRFFT3D"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
   }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
+  input_arg {
+    name: "fft_length"
+    type: DT_INT32
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
   }
 }
 op {
-  name: "MaxPoolGradGradWithArgmax"
+  name: "Identity"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "IdentityN"
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
+    name: "input"
+    type_list_attr: "T"
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_list_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
+    name: "T"
+    type: "list(type)"
     has_minimum: true
-    minimum: 4
+    minimum: 1
+  }
+}
+op {
+  name: "IdentityReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "padding"
+    name: "shared_name"
     type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    default_value {
+      s: ""
     }
   }
+  is_stateful: true
+}
+op {
+  name: "IdentityReaderV2"
+  output_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
   attr {
-    name: "Targmax"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-      }
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
+  is_stateful: true
 }
 op {
-  name: "MaxPoolGradGradWithArgmax"
+  name: "Igamma"
   input_arg {
-    name: "input"
+    name: "a"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
-  }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "Targmax"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
   attr {
     name: "T"
     type: "type"
@@ -18283,140 +18760,107 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradV2"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
+  name: "Igammac"
   input_arg {
-    name: "orig_output"
+    name: "a"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "ksize"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
-  }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradV2"
+  name: "IgnoreErrorsDataset"
   input_arg {
-    name: "orig_input"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_VARIANT
   }
-  input_arg {
-    name: "orig_output"
-    type_attr: "T"
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
   }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
-  input_arg {
-    name: "ksize"
-    type: DT_INT32
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
+  is_stateful: true
+}
+op {
+  name: "IgnoreErrorsDataset"
   input_arg {
-    name: "strides"
-    type: DT_INT32
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "Imag"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "T"
+    type: "type"
     default_value {
-      s: "NHWC"
+      type: DT_COMPLEX64
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
   attr {
-    name: "T"
+    name: "Tout"
     type: "type"
     default_value {
       type: DT_FLOAT
@@ -18425,68 +18869,32 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradWithArgmax"
+  name: "ImageSummary"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "tag"
+    type: DT_STRING
   }
   input_arg {
-    name: "grad"
+    name: "tensor"
     type_attr: "T"
   }
-  input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
-  }
   output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+    name: "summary"
+    type: DT_STRING
   }
   attr {
-    name: "Targmax"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+    name: "max_images"
+    type: "int"
+    default_value {
+      i: 3
     }
+    has_minimum: true
+    minimum: 1
   }
   attr {
     name: "T"
@@ -18496,477 +18904,449 @@ op {
     }
     allowed_values {
       list {
+        type: DT_UINT8
         type: DT_FLOAT
         type: DT_HALF
       }
     }
   }
+  attr {
+    name: "bad_color"
+    type: "tensor"
+    default_value {
+      tensor {
+        dtype: DT_UINT8
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        int_val: 255
+        int_val: 0
+        int_val: 0
+        int_val: 255
+      }
+    }
+  }
 }
 op {
-  name: "MaxPoolGradWithArgmax"
+  name: "ImageSummary"
   input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "tag"
+    type: DT_STRING
   }
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
-  }
-  output_arg {
-    name: "output"
+    name: "tensor"
     type_attr: "T"
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  output_arg {
+    name: "summary"
+    type: DT_STRING
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    name: "max_images"
+    type: "int"
+    default_value {
+      i: 3
     }
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "Targmax"
+    name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_UINT8
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
+    name: "bad_color"
+    type: "tensor"
+    default_value {
+      tensor {
+        dtype: DT_UINT8
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        int_val: 255
+        int_val: 0
+        int_val: 0
+        int_val: 255
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradWithArgmax"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
-  }
+  name: "ImmutableConst"
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "tensor"
+    type_attr: "dtype"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    name: "dtype"
+    type: "type"
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    name: "shape"
+    type: "shape"
   }
   attr {
-    name: "padding"
+    name: "memory_region_name"
     type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+  }
+}
+op {
+  name: "InTopK"
+  input_arg {
+    name: "predictions"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "targets"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "precision"
+    type: DT_BOOL
   }
   attr {
-    name: "Targmax"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+    name: "k"
+    type: "int"
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "MaxPoolV2"
+  name: "InTopKV2"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "predictions"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "ksize"
-    type: DT_INT32
+    name: "targets"
+    type_attr: "T"
   }
   input_arg {
-    name: "strides"
-    type: DT_INT32
+    name: "k"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "precision"
+    type: DT_BOOL
   }
   attr {
     name: "T"
     type: "type"
     default_value {
-      type: DT_FLOAT
+      type: DT_INT32
     }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
+}
+op {
+  name: "InitializeTable"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tkey"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tval"
+  }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+    name: "Tkey"
+    type: "type"
   }
   attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
+    name: "Tval"
+    type: "type"
   }
 }
 op {
-  name: "MaxPoolV2"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
+  name: "InitializeTableFromTextFile"
   input_arg {
-    name: "ksize"
-    type: DT_INT32
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
   input_arg {
-    name: "strides"
-    type: DT_INT32
+    name: "filename"
+    type: DT_STRING
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  attr {
+    name: "key_index"
+    type: "int"
+    has_minimum: true
+    minimum: -2
   }
   attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_QINT8
-      }
-    }
+    name: "value_index"
+    type: "int"
+    has_minimum: true
+    minimum: -2
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    name: "vocab_size"
+    type: "int"
+    default_value {
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "data_format"
+    name: "delimiter"
     type: "string"
     default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-        s: "NCHW_VECT_C"
-      }
+      s: "\t"
     }
   }
 }
 op {
-  name: "MaxPoolWithArgmax"
+  name: "InitializeTableFromTextFileV2"
   input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "table_handle"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "argmax"
-    type_attr: "Targmax"
+  input_arg {
+    name: "filename"
+    type: DT_STRING
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
+    name: "key_index"
+    type: "int"
     has_minimum: true
-    minimum: 4
+    minimum: -2
   }
   attr {
-    name: "strides"
-    type: "list(int)"
+    name: "value_index"
+    type: "int"
     has_minimum: true
-    minimum: 4
+    minimum: -2
   }
   attr {
-    name: "Targmax"
-    type: "type"
+    name: "vocab_size"
+    type: "int"
     default_value {
-      type: DT_INT64
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "padding"
+    name: "delimiter"
     type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    default_value {
+      s: "\t"
     }
   }
+  is_stateful: true
+}
+op {
+  name: "InitializeTableV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tkey"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tval"
+  }
   attr {
-    name: "T"
+    name: "Tkey"
+    type: "type"
+  }
+  attr {
+    name: "Tval"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_HALF
-      }
-    }
   }
+  is_stateful: true
 }
 op {
-  name: "MaxPoolWithArgmax"
+  name: "InterleaveDataset"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_VARIANT
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
   }
   output_arg {
-    name: "argmax"
-    type_attr: "Targmax"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
     has_minimum: true
-    minimum: 4
   }
   attr {
-    name: "strides"
-    type: "list(int)"
+    name: "output_types"
+    type: "list(type)"
     has_minimum: true
-    minimum: 4
+    minimum: 1
   }
   attr {
-    name: "Targmax"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "InterleaveDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "Inv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
+  deprecation {
+    version: 17
+  }
 }
 op {
-  name: "MaxPoolWithArgmax"
+  name: "Inv"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
-  output_arg {
-    name: "argmax"
-    type_attr: "Targmax"
-  }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "Targmax"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
+  deprecation {
+    version: 17
+  }
 }
 op {
-  name: "Maximum"
+  name: "InvGrad"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -18987,358 +19367,143 @@ op {
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  is_commutative: true
+  deprecation {
+    version: 17
+  }
 }
 op {
-  name: "Mean"
+  name: "InvGrad"
   input_arg {
-    name: "input"
+    name: "y"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "dy"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  deprecation {
+    version: 17
   }
 }
 op {
-  name: "Mean"
+  name: "InvGrad"
   input_arg {
-    name: "input"
+    name: "y"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "dy"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  deprecation {
+    version: 17
   }
 }
 op {
-  name: "Merge"
+  name: "Invert"
   input_arg {
-    name: "inputs"
+    name: "x"
     type_attr: "T"
-    number_attr: "N"
   }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
-  output_arg {
-    name: "value_index"
-    type: DT_INT32
-  }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-}
-op {
-  name: "MergeSummary"
-  input_arg {
-    name: "inputs"
-    type: DT_STRING
-    number_attr: "N"
-  }
-  output_arg {
-    name: "summary"
-    type: DT_STRING
-  }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-}
-op {
-  name: "MergeV2Checkpoints"
-  input_arg {
-    name: "checkpoint_prefixes"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "destination_prefix"
-    type: DT_STRING
-  }
-  attr {
-    name: "delete_old_dirs"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-}
-op {
-  name: "MergeV2Checkpoints"
-  input_arg {
-    name: "checkpoint_prefixes"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "destination_prefix"
-    type: DT_STRING
-  }
-  attr {
-    name: "delete_old_dirs"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "Mfcc"
-  input_arg {
-    name: "spectrogram"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "sample_rate"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "upper_frequency_limit"
-    type: "float"
-    default_value {
-      f: 4000
-    }
-  }
-  attr {
-    name: "lower_frequency_limit"
-    type: "float"
-    default_value {
-      f: 20
-    }
-  }
-  attr {
-    name: "filterbank_channel_count"
-    type: "int"
-    default_value {
-      i: 40
-    }
-  }
-  attr {
-    name: "dct_coefficient_count"
-    type: "int"
-    default_value {
-      i: 13
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+      }
     }
   }
 }
 op {
-  name: "Min"
+  name: "Invert"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
-  }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-      }
-    }
-  }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
+        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "Min"
+  name: "InvertPermutation"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
-  }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-  attr {
-    name: "Tidx"
-    type: "type"
     default_value {
       type: DT_INT32
     }
@@ -19351,18 +19516,14 @@ op {
   }
 }
 op {
-  name: "Minimum"
+  name: "IsFinite"
   input_arg {
     name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
   output_arg {
-    name: "z"
-    type_attr: "T"
+    name: "y"
+    type: DT_BOOL
   }
   attr {
     name: "T"
@@ -19372,118 +19533,72 @@ op {
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
       }
     }
   }
-  is_commutative: true
 }
 op {
-  name: "MirrorPad"
+  name: "IsFinite"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
-  }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "y"
+    type: DT_BOOL
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "Tpaddings"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "mode"
-    type: "string"
     allowed_values {
       list {
-        s: "REFLECT"
-        s: "SYMMETRIC"
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "MirrorPadGrad"
+  name: "IsInf"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
-  }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "y"
+    type: DT_BOOL
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "Tpaddings"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "mode"
-    type: "string"
     allowed_values {
       list {
-        s: "REFLECT"
-        s: "SYMMETRIC"
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "Mod"
+  name: "IsInf"
   input_arg {
     name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
   output_arg {
-    name: "z"
-    type_attr: "T"
+    name: "y"
+    type: DT_BOOL
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -19491,18 +19606,14 @@ op {
   }
 }
 op {
-  name: "Mul"
+  name: "IsNan"
   input_arg {
     name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
   output_arg {
-    name: "z"
-    type_attr: "T"
+    name: "y"
+    type: DT_BOOL
   }
   attr {
     name: "T"
@@ -19512,246 +19623,239 @@ op {
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
-  is_commutative: true
 }
 op {
-  name: "Multinomial"
+  name: "IsNan"
   input_arg {
-    name: "logits"
+    name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "num_samples"
-    type: DT_INT32
-  }
   output_arg {
-    name: "output"
-    type: DT_INT64
-  }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "y"
+    type: DT_BOOL
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "Multinomial"
+  name: "IsVariableInitialized"
   input_arg {
-    name: "logits"
-    type_attr: "T"
+    name: "ref"
+    type_attr: "dtype"
+    is_ref: true
   }
-  input_arg {
-    name: "num_samples"
-    type: DT_INT32
+  output_arg {
+    name: "is_initialized"
+    type: DT_BOOL
+  }
+  attr {
+    name: "dtype"
+    type: "type"
   }
+  allows_uninitialized_input: true
+}
+op {
+  name: "Iterator"
   output_arg {
-    name: "output"
-    type: DT_INT64
+    name: "handle"
+    type: DT_RESOURCE
   }
   attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "shared_name"
+    type: "string"
   }
   attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "container"
+    type: "string"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
   is_stateful: true
 }
 op {
-  name: "MutableDenseHashTable"
+  name: "IteratorFromStringHandle"
   input_arg {
-    name: "empty_key"
-    type_attr: "key_dtype"
+    name: "string_handle"
+    type: DT_STRING
   }
   output_arg {
-    name: "table_handle"
+    name: "resource_handle"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "IteratorFromStringHandle"
+  input_arg {
+    name: "string_handle"
     type: DT_STRING
-    is_ref: true
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "resource_handle"
+    type: DT_RESOURCE
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "output_types"
+    type: "list(type)"
     default_value {
-      s: ""
+      list {
+      }
     }
+    has_minimum: true
   }
   attr {
-    name: "use_node_name_sharing"
-    type: "bool"
+    name: "output_shapes"
+    type: "list(shape)"
     default_value {
-      b: false
+      list {
+      }
     }
+    has_minimum: true
   }
-  attr {
-    name: "key_dtype"
-    type: "type"
+  is_stateful: true
+}
+op {
+  name: "IteratorGetNext"
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "value_dtype"
-    type: "type"
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
   }
   attr {
-    name: "value_shape"
-    type: "shape"
-    default_value {
-      shape {
-      }
-    }
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "initial_num_buckets"
-    type: "int"
-    default_value {
-      i: 131072
-    }
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
-  attr {
-    name: "max_load_factor"
-    type: "float"
-    default_value {
-      f: 0.8
-    }
+  is_stateful: true
+}
+op {
+  name: "IteratorSetStatsAggregator"
+  input_arg {
+    name: "iterator_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "stats_aggregator_handle"
+    type: DT_RESOURCE
   }
   is_stateful: true
 }
 op {
-  name: "MutableDenseHashTableV2"
+  name: "IteratorToStringHandle"
   input_arg {
-    name: "empty_key"
-    type_attr: "key_dtype"
+    name: "resource_handle"
+    type: DT_RESOURCE
   }
   output_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "string_handle"
+    type: DT_STRING
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  is_stateful: true
+}
+op {
+  name: "L2Loss"
+  input_arg {
+    name: "t"
+    type_attr: "T"
   }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "use_node_name_sharing"
-    type: "bool"
-    default_value {
-      b: false
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_HALF
+      }
     }
   }
-  attr {
-    name: "key_dtype"
-    type: "type"
+}
+op {
+  name: "L2Loss"
+  input_arg {
+    name: "t"
+    type_attr: "T"
   }
-  attr {
-    name: "value_dtype"
-    type: "type"
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "value_shape"
-    type: "shape"
-    default_value {
-      shape {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
-  attr {
-    name: "initial_num_buckets"
-    type: "int"
-    default_value {
-      i: 131072
-    }
+}
+op {
+  name: "L2Loss"
+  input_arg {
+    name: "t"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "max_load_factor"
-    type: "float"
-    default_value {
-      f: 0.8
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "MutableHashTable"
+  name: "LMDBReader"
   output_arg {
-    name: "table_handle"
+    name: "reader_handle"
     type: DT_STRING
     is_ref: true
   }
@@ -19769,283 +19873,438 @@ op {
       s: ""
     }
   }
-  attr {
-    name: "use_node_name_sharing"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "key_dtype"
-    type: "type"
-  }
-  attr {
-    name: "value_dtype"
-    type: "type"
-  }
   is_stateful: true
 }
 op {
-  name: "MutableHashTableOfTensors"
+  name: "LRN"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
   output_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "depth_radius"
+    type: "int"
     default_value {
-      s: ""
+      i: 5
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "bias"
+    type: "float"
     default_value {
-      s: ""
+      f: 1
     }
   }
   attr {
-    name: "use_node_name_sharing"
-    type: "bool"
+    name: "alpha"
+    type: "float"
     default_value {
-      b: false
+      f: 1
     }
   }
   attr {
-    name: "key_dtype"
-    type: "type"
+    name: "beta"
+    type: "float"
+    default_value {
+      f: 0.5
+    }
   }
   attr {
-    name: "value_dtype"
+    name: "T"
     type: "type"
-  }
-  attr {
-    name: "value_shape"
-    type: "shape"
     default_value {
-      shape {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "MutableHashTableOfTensorsV2"
+  name: "LRN"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
   output_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "depth_radius"
+    type: "int"
     default_value {
-      s: ""
+      i: 5
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "bias"
+    type: "float"
     default_value {
-      s: ""
+      f: 1
     }
   }
   attr {
-    name: "use_node_name_sharing"
-    type: "bool"
+    name: "alpha"
+    type: "float"
     default_value {
-      b: false
+      f: 1
     }
   }
   attr {
-    name: "key_dtype"
-    type: "type"
+    name: "beta"
+    type: "float"
+    default_value {
+      f: 0.5
+    }
   }
   attr {
-    name: "value_dtype"
+    name: "T"
     type: "type"
-  }
-  attr {
-    name: "value_shape"
-    type: "shape"
     default_value {
-      shape {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "MutableHashTableV2"
+  name: "LRNGrad"
+  input_arg {
+    name: "input_grads"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_image"
+    type_attr: "T"
+  }
   output_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "depth_radius"
+    type: "int"
     default_value {
-      s: ""
+      i: 5
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "bias"
+    type: "float"
     default_value {
-      s: ""
+      f: 1
     }
   }
   attr {
-    name: "use_node_name_sharing"
-    type: "bool"
+    name: "alpha"
+    type: "float"
     default_value {
-      b: false
+      f: 1
     }
   }
   attr {
-    name: "key_dtype"
-    type: "type"
+    name: "beta"
+    type: "float"
+    default_value {
+      f: 0.5
+    }
   }
   attr {
-    name: "value_dtype"
+    name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "Neg"
+  name: "LRNGrad"
   input_arg {
-    name: "x"
+    name: "input_grads"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_image"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "depth_radius"
+    type: "int"
+    default_value {
+      i: 5
+    }
+  }
+  attr {
+    name: "bias"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "beta"
+    type: "float"
+    default_value {
+      f: 0.5
+    }
+  }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "NegTrain"
-  input_arg {
-    name: "w_in"
-    type: DT_FLOAT
-    is_ref: true
-  }
-  input_arg {
-    name: "w_out"
-    type: DT_FLOAT
-    is_ref: true
-  }
+  name: "LatencyStatsDataset"
   input_arg {
-    name: "examples"
-    type: DT_INT32
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "labels"
-    type: DT_INT32
+    name: "tag"
+    type: DT_STRING
   }
-  input_arg {
-    name: "lr"
-    type: DT_FLOAT
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "vocab_count"
-    type: "list(int)"
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "num_negative_samples"
-    type: "int"
-  }
-  deprecation {
-    version: 19
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
-  is_stateful: true
 }
 op {
-  name: "NextIteration"
+  name: "LearnedUnigramCandidateSampler"
   input_arg {
-    name: "data"
-    type_attr: "T"
+    name: "true_classes"
+    type: DT_INT64
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
 }
 op {
-  name: "NoOp"
-}
-op {
-  name: "NonMaxSuppression"
+  name: "LearnedUnigramCandidateSampler"
   input_arg {
-    name: "boxes"
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
     type: DT_FLOAT
   }
-  input_arg {
-    name: "scores"
+  output_arg {
+    name: "sampled_expected_count"
     type: DT_FLOAT
   }
-  input_arg {
-    name: "max_output_size"
-    type: DT_INT32
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
-  output_arg {
-    name: "selected_indices"
-    type: DT_INT32
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "iou_threshold"
-    type: "float"
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "seed"
+    type: "int"
     default_value {
-      f: 0.5
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
     }
   }
+  is_stateful: true
 }
 op {
-  name: "NonMaxSuppressionV2"
+  name: "LeftShift"
   input_arg {
-    name: "boxes"
-    type: DT_FLOAT
+    name: "x"
+    type_attr: "T"
   }
   input_arg {
-    name: "scores"
-    type: DT_FLOAT
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
   }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "Less"
   input_arg {
-    name: "max_output_size"
-    type: DT_INT32
+    name: "x"
+    type_attr: "T"
   }
   input_arg {
-    name: "iou_threshold"
-    type: DT_FLOAT
+    name: "y"
+    type_attr: "T"
   }
   output_arg {
-    name: "selected_indices"
-    type: DT_INT32
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
   }
 }
 op {
-  name: "NotEqual"
+  name: "Less"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -20063,46 +20322,34 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_QUINT8
-        type: DT_QINT8
-        type: DT_QINT32
-        type: DT_STRING
-        type: DT_BOOL
-        type: DT_COMPLEX128
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  is_commutative: true
 }
 op {
-  name: "NthElement"
+  name: "Less"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "n"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "values"
+    name: "y"
     type_attr: "T"
   }
-  attr {
-    name: "reverse"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
   }
   attr {
     name: "T"
@@ -20120,98 +20367,136 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "OneHot"
+  name: "LessEqual"
   input_arg {
-    name: "indices"
-    type_attr: "TI"
+    name: "x"
+    type_attr: "T"
   }
   input_arg {
-    name: "depth"
-    type: DT_INT32
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
   }
+}
+op {
+  name: "LessEqual"
   input_arg {
-    name: "on_value"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "off_value"
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "axis"
-    type: "int"
-    default_value {
-      i: -1
-    }
+    name: "z"
+    type: DT_BOOL
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "TI"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
     allowed_values {
       list {
-        type: DT_UINT8
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "OneShotIterator"
-  output_arg {
-    name: "handle"
-    type: DT_RESOURCE
-  }
-  attr {
-    name: "dataset_factory"
-    type: "func"
+  name: "LessEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "y"
+    type_attr: "T"
   }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "z"
+    type: DT_BOOL
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
     }
   }
+}
+op {
+  name: "Lgamma"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "OnesLike"
+  name: "Lgamma"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -20225,676 +20510,754 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "OrderedMapClear"
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  name: "LinSpace"
+  input_arg {
+    name: "start"
+    type_attr: "T"
   }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "stop"
+    type_attr: "T"
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
+  input_arg {
+    name: "num"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "Tidx"
+    type: "type"
     default_value {
-      s: ""
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "OrderedMapIncompleteSize"
-  output_arg {
-    name: "size"
-    type: DT_INT32
+  name: "LinSpace"
+  input_arg {
+    name: "start"
+    type_attr: "T"
   }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "stop"
+    type_attr: "T"
   }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "num"
+    type_attr: "Tidx"
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "Tidx"
+    type: "type"
     default_value {
-      s: ""
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "OrderedMapPeek"
+  name: "ListDiff"
   input_arg {
-    name: "key"
-    type: DT_INT64
+    name: "x"
+    type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "y"
+    type_attr: "T"
   }
   output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
+    name: "out"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "idx"
+    type_attr: "out_idx"
   }
   attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "memory_limit"
-    type: "int"
+    name: "out_idx"
+    type: "type"
     default_value {
-      i: 0
+      type: DT_INT32
     }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "OrderedMapSize"
-  output_arg {
-    name: "size"
-    type: DT_INT32
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "OrderedMapStage"
+  name: "LoadAndRemapMatrix"
   input_arg {
-    name: "key"
-    type: DT_INT64
+    name: "ckpt_path"
+    type: DT_STRING
   }
   input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "old_tensor_name"
+    type: DT_STRING
   }
   input_arg {
-    name: "values"
-    type_list_attr: "fake_dtypes"
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-  }
-  attr {
-    name: "fake_dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "row_remapping"
+    type: DT_INT64
   }
-  is_stateful: true
-}
-op {
-  name: "OrderedMapUnstage"
   input_arg {
-    name: "key"
+    name: "col_remapping"
     type: DT_INT64
   }
   input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "initializing_values"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
+    name: "output_matrix"
+    type: DT_FLOAT
   }
   attr {
-    name: "capacity"
+    name: "num_rows"
     type: "int"
-    default_value {
-      i: 0
-    }
     has_minimum: true
   }
   attr {
-    name: "memory_limit"
+    name: "num_cols"
     type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
     has_minimum: true
     minimum: 1
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
+    name: "max_rows_in_memory"
+    type: "int"
     default_value {
-      s: ""
+      i: -1
     }
   }
   is_stateful: true
 }
 op {
-  name: "OrderedMapUnstageNoKey"
+  name: "Log"
   input_arg {
-    name: "indices"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "key"
-    type: DT_INT64
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
-    has_minimum: true
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+}
+op {
+  name: "Log"
+  input_arg {
+    name: "x"
+    type_attr: "T"
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "Pack"
+  name: "Log1p"
   input_arg {
-    name: "values"
+    name: "x"
     type_attr: "T"
-    number_attr: "N"
   }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "axis"
-    type: "int"
-    default_value {
-      i: 0
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
   }
 }
 op {
-  name: "Pad"
+  name: "Log1p"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
-  }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "Tpaddings"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "PadV2"
+  name: "LogMatrixDeterminant"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
-  }
-  input_arg {
-    name: "constant_values"
+  output_arg {
+    name: "sign"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "log_abs_determinant"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "Tpaddings"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "PaddedBatchDataset"
+  name: "LogSoftmax"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "logits"
+    type_attr: "T"
   }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
+  output_arg {
+    name: "logsoftmax"
+    type_attr: "T"
   }
-  input_arg {
-    name: "padded_shapes"
-    type: DT_INT64
-    number_attr: "N"
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
+}
+op {
+  name: "LogSoftmax"
   input_arg {
-    name: "padding_values"
-    type_list_attr: "Toutput_types"
+    name: "logits"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "Toutput_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "logsoftmax"
+    type_attr: "T"
   }
   attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "PaddedBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
+  name: "LogUniformCandidateSampler"
   input_arg {
-    name: "batch_size"
+    name: "true_classes"
     type: DT_INT64
   }
-  input_arg {
-    name: "padded_shapes"
+  output_arg {
+    name: "sampled_candidates"
     type: DT_INT64
-    number_attr: "N"
-  }
-  input_arg {
-    name: "padding_values"
-    type_list_attr: "Toutput_types"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "true_expected_count"
+    type: DT_FLOAT
   }
-  attr {
-    name: "Toutput_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
+    name: "num_true"
+    type: "int"
     has_minimum: true
     minimum: 1
   }
   attr {
-    name: "N"
+    name: "num_sampled"
     type: "int"
     has_minimum: true
     minimum: 1
   }
-}
-op {
-  name: "PaddingFIFOQueue"
-  output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
-  }
   attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "unique"
+    type: "bool"
   }
   attr {
-    name: "shapes"
-    type: "list(shape)"
-    default_value {
-      list {
-      }
-    }
+    name: "range_max"
+    type: "int"
     has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "capacity"
+    name: "seed"
     type: "int"
     default_value {
-      i: -1
-    }
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
+      i: 0
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "seed2"
+    type: "int"
     default_value {
-      s: ""
+      i: 0
     }
   }
-  is_stateful: true
 }
 op {
-  name: "PaddingFIFOQueueV2"
+  name: "LogUniformCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
   output_arg {
-    name: "handle"
-    type: DT_RESOURCE
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
   }
   attr {
-    name: "component_types"
-    type: "list(type)"
+    name: "num_true"
+    type: "int"
     has_minimum: true
     minimum: 1
   }
   attr {
-    name: "shapes"
-    type: "list(shape)"
-    default_value {
-      list {
-      }
-    }
+    name: "num_sampled"
+    type: "int"
     has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "capacity"
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
     type: "int"
-    default_value {
-      i: -1
-    }
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "seed"
+    type: "int"
     default_value {
-      s: ""
+      i: 0
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "seed2"
+    type: "int"
     default_value {
-      s: ""
+      i: 0
     }
   }
   is_stateful: true
 }
 op {
-  name: "ParallelConcat"
+  name: "LogicalAnd"
   input_arg {
-    name: "values"
-    type_attr: "T"
-    number_attr: "N"
+    name: "x"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "y"
+    type: DT_BOOL
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "z"
+    type: DT_BOOL
+  }
+  is_commutative: true
+}
+op {
+  name: "LogicalNot"
+  input_arg {
+    name: "x"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+}
+op {
+  name: "LogicalOr"
+  input_arg {
+    name: "x"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  is_commutative: true
+}
+op {
+  name: "LookupTableExport"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "keys"
+    type_attr: "Tkeys"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "Tvalues"
   }
   attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "Tkeys"
+    type: "type"
   }
   attr {
-    name: "T"
+    name: "Tvalues"
+    type: "type"
+  }
+}
+op {
+  name: "LookupTableExportV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "keys"
+    type_attr: "Tkeys"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "Tvalues"
+  }
+  attr {
+    name: "Tkeys"
     type: "type"
   }
   attr {
-    name: "shape"
-    type: "shape"
+    name: "Tvalues"
+    type: "type"
   }
+  is_stateful: true
 }
 op {
-  name: "ParallelDynamicStitch"
+  name: "LookupTableFind"
   input_arg {
-    name: "indices"
-    type: DT_INT32
-    number_attr: "N"
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
   input_arg {
-    name: "data"
-    type_attr: "T"
-    number_attr: "N"
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "default_value"
+    type_attr: "Tout"
   }
   output_arg {
-    name: "merged"
-    type_attr: "T"
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "Tin"
+    type: "type"
   }
   attr {
-    name: "T"
+    name: "Tout"
     type: "type"
   }
 }
 op {
-  name: "ParallelInterleaveDataset"
+  name: "LookupTableFindV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "default_value"
+    type_attr: "Tout"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "LookupTableImport"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+}
+op {
+  name: "LookupTableImportV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "LookupTableInsert"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+}
+op {
+  name: "LookupTableInsertV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "LookupTableSize"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT64
+  }
+}
+op {
+  name: "LookupTableSizeV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT64
+  }
+  is_stateful: true
+}
+op {
+  name: "LoopCond"
+  input_arg {
+    name: "input"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "output"
+    type: DT_BOOL
+  }
+}
+op {
+  name: "MakeIterator"
+  input_arg {
+    name: "dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "MapAndBatchDataset"
   input_arg {
     name: "input_dataset"
     type: DT_VARIANT
@@ -20904,17 +21267,13 @@ op {
     type_list_attr: "Targuments"
   }
   input_arg {
-    name: "cycle_length"
+    name: "batch_size"
     type: DT_INT64
   }
   input_arg {
-    name: "block_length"
+    name: "num_parallel_batches"
     type: DT_INT64
   }
-  input_arg {
-    name: "sloppy"
-    type: DT_BOOL
-  }
   output_arg {
     name: "handle"
     type: DT_VARIANT
@@ -20942,7 +21301,45 @@ op {
   }
 }
 op {
-  name: "ParallelMapDataset"
+  name: "MapClear"
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapDataset"
   input_arg {
     name: "input_dataset"
     type: DT_VARIANT
@@ -20951,10 +21348,6 @@ op {
     name: "other_arguments"
     type_list_attr: "Targuments"
   }
-  input_arg {
-    name: "num_parallel_calls"
-    type: DT_INT32
-  }
   output_arg {
     name: "handle"
     type: DT_VARIANT
@@ -20983,7 +21376,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ParallelMapDataset"
+  name: "MapDataset"
   input_arg {
     name: "input_dataset"
     type: DT_VARIANT
@@ -20992,10 +21385,6 @@ op {
     name: "other_arguments"
     type_list_attr: "Targuments"
   }
-  input_arg {
-    name: "num_parallel_calls"
-    type: DT_INT32
-  }
   output_arg {
     name: "handle"
     type: DT_VARIANT
@@ -21023,226 +21412,213 @@ op {
   }
 }
 op {
-  name: "ParameterizedTruncatedNormal"
-  input_arg {
-    name: "shape"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "means"
-    type_attr: "dtype"
-  }
-  input_arg {
-    name: "stdevs"
-    type_attr: "dtype"
-  }
-  input_arg {
-    name: "minvals"
-    type_attr: "dtype"
-  }
-  input_arg {
-    name: "maxvals"
-    type_attr: "dtype"
-  }
+  name: "MapIncompleteSize"
   output_arg {
-    name: "output"
-    type_attr: "dtype"
+    name: "size"
+    type: DT_INT32
   }
   attr {
-    name: "seed"
+    name: "capacity"
     type: "int"
     default_value {
       i: 0
     }
+    has_minimum: true
   }
   attr {
-    name: "seed2"
+    name: "memory_limit"
     type: "int"
     default_value {
       i: 0
     }
+    has_minimum: true
   }
   attr {
-    name: "dtype"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
   is_stateful: true
 }
 op {
-  name: "ParseExample"
-  input_arg {
-    name: "serialized"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "names"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "sparse_keys"
-    type: DT_STRING
-    number_attr: "Nsparse"
-  }
-  input_arg {
-    name: "dense_keys"
-    type: DT_STRING
-    number_attr: "Ndense"
-  }
+  name: "MapPeek"
   input_arg {
-    name: "dense_defaults"
-    type_list_attr: "Tdense"
-  }
-  output_arg {
-    name: "sparse_indices"
+    name: "key"
     type: DT_INT64
-    number_attr: "Nsparse"
-  }
-  output_arg {
-    name: "sparse_values"
-    type_list_attr: "sparse_types"
   }
-  output_arg {
-    name: "sparse_shapes"
-    type: DT_INT64
-    number_attr: "Nsparse"
+  input_arg {
+    name: "indices"
+    type: DT_INT32
   }
   output_arg {
-    name: "dense_values"
-    type_list_attr: "Tdense"
+    name: "values"
+    type_list_attr: "dtypes"
   }
   attr {
-    name: "Nsparse"
+    name: "capacity"
     type: "int"
+    default_value {
+      i: 0
+    }
     has_minimum: true
   }
   attr {
-    name: "Ndense"
+    name: "memory_limit"
     type: "int"
+    default_value {
+      i: 0
+    }
     has_minimum: true
   }
   attr {
-    name: "sparse_types"
+    name: "dtypes"
     type: "list(type)"
     has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
+    minimum: 1
   }
   attr {
-    name: "Tdense"
-    type: "list(type)"
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
   attr {
-    name: "dense_shapes"
-    type: "list(shape)"
-    has_minimum: true
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
+  is_stateful: true
 }
 op {
-  name: "ParseSingleSequenceExample"
-  input_arg {
-    name: "serialized"
-    type: DT_STRING
+  name: "MapSize"
+  output_arg {
+    name: "size"
+    type: DT_INT32
   }
-  input_arg {
-    name: "feature_list_dense_missing_assumed_empty"
-    type: DT_STRING
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
   }
-  input_arg {
-    name: "context_sparse_keys"
-    type: DT_STRING
-    number_attr: "Ncontext_sparse"
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
   }
-  input_arg {
-    name: "context_dense_keys"
-    type: DT_STRING
-    number_attr: "Ncontext_dense"
+  attr {
+    name: "dtypes"
+    type: "list(type)"
   }
-  input_arg {
-    name: "feature_list_sparse_keys"
-    type: DT_STRING
-    number_attr: "Nfeature_list_sparse"
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapStage"
   input_arg {
-    name: "feature_list_dense_keys"
-    type: DT_STRING
-    number_attr: "Nfeature_list_dense"
+    name: "key"
+    type: DT_INT64
   }
   input_arg {
-    name: "context_dense_defaults"
-    type_list_attr: "Tcontext_dense"
+    name: "indices"
+    type: DT_INT32
   }
   input_arg {
-    name: "debug_name"
-    type: DT_STRING
+    name: "values"
+    type_list_attr: "fake_dtypes"
   }
-  output_arg {
-    name: "context_sparse_indices"
-    type: DT_INT64
-    number_attr: "Ncontext_sparse"
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
   }
-  output_arg {
-    name: "context_sparse_values"
-    type_list_attr: "context_sparse_types"
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
   }
-  output_arg {
-    name: "context_sparse_shapes"
-    type: DT_INT64
-    number_attr: "Ncontext_sparse"
+  attr {
+    name: "dtypes"
+    type: "list(type)"
   }
-  output_arg {
-    name: "context_dense_values"
-    type_list_attr: "Tcontext_dense"
+  attr {
+    name: "fake_dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
-  output_arg {
-    name: "feature_list_sparse_indices"
-    type: DT_INT64
-    number_attr: "Nfeature_list_sparse"
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  output_arg {
-    name: "feature_list_sparse_values"
-    type_list_attr: "feature_list_sparse_types"
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  output_arg {
-    name: "feature_list_sparse_shapes"
+  is_stateful: true
+}
+op {
+  name: "MapUnstage"
+  input_arg {
+    name: "key"
     type: DT_INT64
-    number_attr: "Nfeature_list_sparse"
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
   }
   output_arg {
-    name: "feature_list_dense_values"
-    type_list_attr: "feature_list_dense_types"
+    name: "values"
+    type_list_attr: "dtypes"
   }
   attr {
-    name: "Ncontext_sparse"
+    name: "capacity"
     type: "int"
     default_value {
       i: 0
@@ -21250,7 +21626,7 @@ op {
     has_minimum: true
   }
   attr {
-    name: "Ncontext_dense"
+    name: "memory_limit"
     type: "int"
     default_value {
       i: 0
@@ -21258,7 +21634,43 @@ op {
     has_minimum: true
   }
   attr {
-    name: "Nfeature_list_sparse"
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapUnstageNoKey"
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
     type: "int"
     default_value {
       i: 0
@@ -21266,7 +21678,7 @@ op {
     has_minimum: true
   }
   attr {
-    name: "Nfeature_list_dense"
+    name: "memory_limit"
     type: "int"
     default_value {
       i: 0
@@ -21274,295 +21686,350 @@ op {
     has_minimum: true
   }
   attr {
-    name: "context_sparse_types"
+    name: "dtypes"
     type: "list(type)"
-    default_value {
-      list {
-      }
-    }
     has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
   attr {
-    name: "Tcontext_dense"
-    type: "list(type)"
+    name: "shared_name"
+    type: "string"
     default_value {
-      list {
-      }
+      s: ""
     }
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
+  }
+  is_stateful: true
+}
+op {
+  name: "MatMul"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   attr {
-    name: "feature_list_dense_types"
-    type: "list(type)"
+    name: "transpose_b"
+    type: "bool"
     default_value {
-      list {
-      }
+      b: false
     }
-    has_minimum: true
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
+}
+op {
+  name: "MatMul"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
   attr {
-    name: "context_dense_shapes"
-    type: "list(shape)"
+    name: "transpose_a"
+    type: "bool"
     default_value {
-      list {
-      }
+      b: false
     }
-    has_minimum: true
   }
   attr {
-    name: "feature_list_sparse_types"
-    type: "list(type)"
+    name: "transpose_b"
+    type: "bool"
     default_value {
-      list {
-      }
-    }
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
+      b: false
     }
   }
   attr {
-    name: "feature_list_dense_shapes"
-    type: "list(shape)"
-    default_value {
+    name: "T"
+    type: "type"
+    allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
-    has_minimum: true
   }
 }
 op {
-  name: "ParseTensor"
+  name: "MatchingFiles"
   input_arg {
-    name: "serialized"
+    name: "pattern"
     type: DT_STRING
   }
   output_arg {
-    name: "output"
-    type_attr: "out_type"
+    name: "filenames"
+    type: DT_STRING
+  }
+}
+op {
+  name: "MatrixBandPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_lower"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_upper"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "band"
+    type_attr: "T"
   }
   attr {
-    name: "out_type"
+    name: "T"
     type: "type"
   }
 }
 op {
-  name: "Placeholder"
+  name: "MatrixDeterminant"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
   output_arg {
     name: "output"
-    type_attr: "dtype"
+    type_attr: "T"
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
-  }
-  attr {
-    name: "shape"
-    type: "shape"
-    default_value {
-      shape {
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "Placeholder"
+  name: "MatrixDeterminant"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
   output_arg {
     name: "output"
-    type_attr: "dtype"
+    type_attr: "T"
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
-  }
-  attr {
-    name: "shape"
-    type: "shape"
-    default_value {
-      shape {
-        unknown_rank: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "PlaceholderV2"
+  name: "MatrixDiag"
+  input_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
   output_arg {
     name: "output"
-    type_attr: "dtype"
+    type_attr: "T"
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
   }
-  attr {
-    name: "shape"
-    type: "shape"
-  }
 }
 op {
-  name: "PlaceholderV2"
+  name: "MatrixDiagPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
   output_arg {
-    name: "output"
-    type_attr: "dtype"
+    name: "diagonal"
+    type_attr: "T"
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
   }
-  attr {
-    name: "shape"
-    type: "shape"
-  }
-  deprecation {
-    version: 23
-  }
 }
 op {
-  name: "PlaceholderWithDefault"
+  name: "MatrixExponential"
   input_arg {
     name: "input"
-    type_attr: "dtype"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
-    type_attr: "dtype"
+    type_attr: "T"
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
-  }
-  attr {
-    name: "shape"
-    type: "shape"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
 }
 op {
-  name: "Polygamma"
-  input_arg {
-    name: "a"
-    type_attr: "T"
-  }
+  name: "MatrixInverse"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_FLOAT
       }
     }
   }
 }
 op {
-  name: "PopulationCount"
+  name: "MatrixInverse"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
-    type: DT_UINT8
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "PopulationCount"
+  name: "MatrixSetDiag"
   input_arg {
-    name: "x"
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "diagonal"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
-    type: DT_UINT8
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
   }
 }
 op {
-  name: "Pow"
+  name: "MatrixSolve"
   input_arg {
-    name: "x"
+    name: "matrix"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
+    name: "rhs"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_FLOAT
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
@@ -21570,65 +22037,18 @@ op {
   }
 }
 op {
-  name: "PrefetchDataset"
+  name: "MatrixSolveLs"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "matrix"
+    type_attr: "T"
   }
   input_arg {
-    name: "buffer_size"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "PrefetchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "buffer_size"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "rhs"
+    type_attr: "T"
   }
-}
-op {
-  name: "PreventGradient"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "l2_regularizer"
+    type: DT_DOUBLE
   }
   output_arg {
     name: "output"
@@ -21637,24 +22057,34 @@ op {
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
   }
   attr {
-    name: "message"
-    type: "string"
+    name: "fast"
+    type: "bool"
     default_value {
-      s: ""
+      b: true
     }
   }
 }
 op {
-  name: "Print"
+  name: "MatrixSolveLs"
   input_arg {
-    name: "input"
+    name: "matrix"
     type_attr: "T"
   }
   input_arg {
-    name: "data"
-    type_list_attr: "U"
+    name: "rhs"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_regularizer"
+    type: DT_DOUBLE
   }
   output_arg {
     name: "output"
@@ -21663,171 +22093,162 @@ op {
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "U"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "message"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "first_n"
-    type: "int"
-    default_value {
-      i: -1
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
   }
   attr {
-    name: "summarize"
-    type: "int"
+    name: "fast"
+    type: "bool"
     default_value {
-      i: 3
+      b: true
     }
   }
-  is_stateful: true
 }
 op {
-  name: "Print"
+  name: "MatrixTriangularSolve"
   input_arg {
-    name: "input"
+    name: "matrix"
     type_attr: "T"
   }
   input_arg {
-    name: "data"
-    type_list_attr: "U"
+    name: "rhs"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
-  }
-  attr {
-    name: "U"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "message"
-    type: "string"
+    name: "lower"
+    type: "bool"
     default_value {
-      s: ""
+      b: true
     }
   }
   attr {
-    name: "first_n"
-    type: "int"
+    name: "adjoint"
+    type: "bool"
     default_value {
-      i: -1
+      b: false
     }
   }
   attr {
-    name: "summarize"
-    type: "int"
-    default_value {
-      i: 3
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "PriorityQueue"
-  output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+  name: "MatrixTriangularSolve"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
   }
-  attr {
-    name: "component_types"
-    type: "list(type)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
   }
-  attr {
-    name: "shapes"
-    type: "list(shape)"
-    has_minimum: true
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "capacity"
-    type: "int"
+    name: "lower"
+    type: "bool"
     default_value {
-      i: -1
+      b: true
     }
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "adjoint"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "PriorityQueueV2"
-  output_arg {
-    name: "handle"
-    type: DT_RESOURCE
+  name: "Max"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "component_types"
-    type: "list(type)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
   }
-  attr {
-    name: "shapes"
-    type: "list(shape)"
-    has_minimum: true
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "capacity"
-    type: "int"
+    name: "keep_dims"
+    type: "bool"
     default_value {
-      i: -1
+      b: false
     }
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "Tidx"
+    type: "type"
     default_value {
-      s: ""
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "Prod"
+  name: "Max"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -21866,6 +22287,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -21884,7 +22307,7 @@ op {
   }
 }
 op {
-  name: "Prod"
+  name: "Max"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -21925,6 +22348,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -21943,92 +22367,66 @@ op {
   }
 }
 op {
-  name: "PyFunc"
+  name: "MaxPool"
   input_arg {
     name: "input"
-    type_list_attr: "Tin"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
-    type_list_attr: "Tout"
+    type_attr: "T"
   }
   attr {
-    name: "token"
-    type: "string"
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
   }
   attr {
-    name: "Tin"
-    type: "list(type)"
+    name: "ksize"
+    type: "list(int)"
     has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "Tout"
-    type: "list(type)"
+    name: "strides"
+    type: "list(int)"
     has_minimum: true
-  }
-  is_stateful: true
-}
-op {
-  name: "PyFuncStateless"
-  input_arg {
-    name: "input"
-    type_list_attr: "Tin"
-  }
-  output_arg {
-    name: "output"
-    type_list_attr: "Tout"
+    minimum: 4
   }
   attr {
-    name: "token"
+    name: "padding"
     type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
   }
   attr {
-    name: "Tin"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Tout"
-    type: "list(type)"
-    has_minimum: true
-  }
-}
-op {
-  name: "Qr"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "q"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "r"
-    type_attr: "T"
-  }
-  attr {
-    name: "full_matrices"
-    type: "bool"
+    name: "data_format"
+    type: "string"
     default_value {
-      b: false
+      s: "NHWC"
     }
-  }
-  attr {
-    name: "T"
-    type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
 }
 op {
-  name: "QuantizeAndDequantize"
+  name: "MaxPool"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -22038,53 +22436,63 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "signed_input"
-    type: "bool"
+    name: "T"
+    type: "type"
     default_value {
-      b: true
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
     }
   }
   attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
-    }
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "input_min"
-    type: "float"
-    default_value {
-      f: 0
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "input_max"
-    type: "float"
+    name: "data_format"
+    type: "string"
     default_value {
-      f: 0
+      s: "NHWC"
     }
-  }
-  attr {
-    name: "T"
-    type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
 }
 op {
-  name: "QuantizeAndDequantize"
+  name: "MaxPool"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -22094,56 +22502,65 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "signed_input"
-    type: "bool"
+    name: "T"
+    type: "type"
     default_value {
-      b: true
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_QINT8
+      }
     }
   }
   attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
-    }
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "input_min"
-    type: "float"
-    default_value {
-      f: 0
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "input_max"
-    type: "float"
+    name: "data_format"
+    type: "string"
     default_value {
-      f: 0
+      s: "NHWC"
     }
-  }
-  attr {
-    name: "T"
-    type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        s: "NHWC"
+        s: "NCHW"
+        s: "NCHW_VECT_C"
       }
     }
   }
-  deprecation {
-    version: 21
-  }
 }
 op {
-  name: "QuantizeAndDequantize"
+  name: "MaxPool"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -22153,91 +22570,94 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "signed_input"
-    type: "bool"
+    name: "T"
+    type: "type"
     default_value {
-      b: true
+      type: DT_FLOAT
     }
-  }
-  attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_QINT8
+      }
     }
   }
   attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "input_min"
-    type: "float"
-    default_value {
-      f: 0
-    }
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "input_max"
-    type: "float"
-    default_value {
-      f: 0
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        s: "NHWC"
+        s: "NCHW"
+        s: "NCHW_VECT_C"
       }
     }
   }
-  deprecation {
-    version: 22
-  }
 }
 op {
-  name: "QuantizeAndDequantizeV2"
+  name: "MaxPool3D"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "input_min"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "input_max"
-    type_attr: "T"
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
-    }
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
-    }
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: false
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
@@ -22246,45 +22666,53 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "QuantizeAndDequantizeV3"
+  name: "MaxPool3D"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "input_min"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "input_max"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "num_bits"
-    type: DT_INT32
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "range_given"
-    type: "bool"
+    name: "data_format"
+    type: "string"
     default_value {
-      b: true
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
     }
   }
   attr {
@@ -22293,692 +22721,723 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "QuantizeDownAndShrinkRange"
+  name: "MaxPool3D"
   input_arg {
     name: "input"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "input_min"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "input_max"
-    type: DT_FLOAT
+    type_attr: "T"
   }
   output_arg {
     name: "output"
-    type_attr: "out_type"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output_min"
-    type: DT_FLOAT
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
-  output_arg {
-    name: "output_max"
-    type: DT_FLOAT
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "Tinput"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "out_type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_FLOAT
       }
     }
   }
 }
 op {
-  name: "QuantizeV2"
+  name: "MaxPool3DGrad"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "min_range"
+    name: "orig_output"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_range"
-    type: DT_FLOAT
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
-  output_arg {
-    name: "output_min"
-    type: DT_FLOAT
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
-  output_arg {
-    name: "output_max"
-    type: DT_FLOAT
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "mode"
-    type: "string"
-    default_value {
-      s: "MIN_COMBINED"
-    }
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "MIN_COMBINED"
-        s: "MIN_FIRST"
+        type: DT_FLOAT
       }
     }
   }
 }
 op {
-  name: "QuantizeV2"
+  name: "MaxPool3DGrad"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "min_range"
+    name: "orig_output"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_range"
-    type: DT_FLOAT
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
-  output_arg {
-    name: "output_min"
-    type: DT_FLOAT
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
-  output_arg {
-    name: "output_max"
-    type: DT_FLOAT
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "mode"
+    name: "data_format"
     type: "string"
     default_value {
-      s: "MIN_COMBINED"
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        s: "MIN_COMBINED"
-        s: "MIN_FIRST"
-        s: "SCALED"
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
       }
     }
   }
 }
 op {
-  name: "QuantizeV2"
+  name: "MaxPool3DGrad"
   input_arg {
-    name: "input"
-    type: DT_FLOAT
+    name: "orig_input"
+    type_attr: "TInput"
   }
   input_arg {
-    name: "min_range"
-    type: DT_FLOAT
+    name: "orig_output"
+    type_attr: "TInput"
   }
   input_arg {
-    name: "max_range"
-    type: DT_FLOAT
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
-  output_arg {
-    name: "output_min"
-    type: DT_FLOAT
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
-  output_arg {
-    name: "output_max"
-    type: DT_FLOAT
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "mode"
+    name: "data_format"
     type: "string"
     default_value {
-      s: "MIN_COMBINED"
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        s: "MIN_COMBINED"
-        s: "MIN_FIRST"
-        s: "SCALED"
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
   attr {
-    name: "round_mode"
-    type: "string"
+    name: "T"
+    type: "type"
     default_value {
-      s: "HALF_AWAY_FROM_ZERO"
+      type: DT_FLOAT
     }
     allowed_values {
       list {
-        s: "HALF_AWAY_FROM_ZERO"
-        s: "HALF_TO_EVEN"
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "TInput"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
       }
     }
   }
 }
 op {
-  name: "QuantizedAdd"
-  input_arg {
-    name: "x"
-    type_attr: "T1"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T2"
-  }
-  input_arg {
-    name: "min_x"
-    type: DT_FLOAT
-  }
+  name: "MaxPool3DGrad"
   input_arg {
-    name: "max_x"
-    type: DT_FLOAT
+    name: "orig_input"
+    type_attr: "TInput"
   }
   input_arg {
-    name: "min_y"
-    type: DT_FLOAT
+    name: "orig_output"
+    type_attr: "TInput"
   }
   input_arg {
-    name: "max_y"
-    type: DT_FLOAT
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
-    name: "z"
-    type_attr: "Toutput"
+    name: "output"
+    type_attr: "T"
   }
-  output_arg {
-    name: "min_z"
-    type: DT_FLOAT
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
-  output_arg {
-    name: "max_z"
-    type: DT_FLOAT
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "T1"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "T2"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_FLOAT
       }
     }
   }
   attr {
-    name: "Toutput"
+    name: "TInput"
     type: "type"
     default_value {
-      type: DT_QINT32
+      type: DT_FLOAT
     }
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_FLOAT
       }
     }
   }
-  is_commutative: true
 }
 op {
-  name: "QuantizedAvgPool"
+  name: "MaxPool3DGradGrad"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "min_input"
-    type: DT_FLOAT
+    name: "orig_output"
+    type_attr: "T"
   }
   input_arg {
-    name: "max_input"
-    type: DT_FLOAT
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
-  output_arg {
-    name: "min_output"
-    type: DT_FLOAT
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
-  output_arg {
-    name: "max_output"
-    type: DT_FLOAT
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
+    name: "padding"
+    type: "string"
+    allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_FLOAT
       }
     }
   }
 }
 op {
-  name: "QuantizedBatchNormWithGlobalNormalization"
-  input_arg {
-    name: "t"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "t_min"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "t_max"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "m"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "m_min"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "m_max"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "v"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "v_min"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "v_max"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "beta"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "beta_min"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "beta_max"
-    type: DT_FLOAT
-  }
+  name: "MaxPoolGrad"
   input_arg {
-    name: "gamma"
-    type_attr: "Tinput"
+    name: "orig_input"
+    type_attr: "T"
   }
   input_arg {
-    name: "gamma_min"
-    type: DT_FLOAT
+    name: "orig_output"
+    type_attr: "T"
   }
   input_arg {
-    name: "gamma_max"
-    type: DT_FLOAT
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
-    name: "result"
-    type_attr: "out_type"
+    name: "output"
+    type_attr: "T"
   }
-  output_arg {
-    name: "result_min"
-    type: DT_FLOAT
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
-  output_arg {
-    name: "result_max"
-    type: DT_FLOAT
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "Tinput"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "out_type"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
   attr {
-    name: "variance_epsilon"
-    type: "float"
-  }
-  attr {
-    name: "scale_after_normalization"
-    type: "bool"
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
   }
 }
 op {
-  name: "QuantizedBiasAdd"
-  input_arg {
-    name: "input"
-    type_attr: "T1"
-  }
-  input_arg {
-    name: "bias"
-    type_attr: "T2"
-  }
-  input_arg {
-    name: "min_input"
-    type: DT_FLOAT
-  }
+  name: "MaxPoolGrad"
   input_arg {
-    name: "max_input"
-    type: DT_FLOAT
+    name: "orig_input"
+    type_attr: "T"
   }
   input_arg {
-    name: "min_bias"
-    type: DT_FLOAT
+    name: "orig_output"
+    type_attr: "T"
   }
   input_arg {
-    name: "max_bias"
-    type: DT_FLOAT
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
-    type_attr: "out_type"
+    type_attr: "T"
   }
-  output_arg {
-    name: "min_out"
-    type: DT_FLOAT
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
-  output_arg {
-    name: "max_out"
-    type: DT_FLOAT
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "T1"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "T2"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
   attr {
-    name: "out_type"
+    name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "QuantizedConcat"
-  input_arg {
-    name: "concat_dim"
-    type: DT_INT32
-  }
+  name: "MaxPoolGrad"
   input_arg {
-    name: "values"
+    name: "orig_input"
     type_attr: "T"
-    number_attr: "N"
   }
   input_arg {
-    name: "input_mins"
-    type: DT_FLOAT
-    number_attr: "N"
+    name: "orig_output"
+    type_attr: "T"
   }
   input_arg {
-    name: "input_maxes"
-    type: DT_FLOAT
-    number_attr: "N"
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
-  output_arg {
-    name: "output_min"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "output_max"
-    type: DT_FLOAT
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "N"
-    type: "int"
+    name: "strides"
+    type: "list(int)"
     has_minimum: true
-    minimum: 2
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
 }
 op {
-  name: "QuantizedConv2D"
-  input_arg {
-    name: "input"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "filter"
-    type_attr: "Tfilter"
-  }
-  input_arg {
-    name: "min_input"
-    type: DT_FLOAT
-  }
+  name: "MaxPoolGrad"
   input_arg {
-    name: "max_input"
-    type: DT_FLOAT
+    name: "orig_input"
+    type_attr: "T"
   }
   input_arg {
-    name: "min_filter"
-    type: DT_FLOAT
+    name: "orig_output"
+    type_attr: "T"
   }
   input_arg {
-    name: "max_filter"
-    type: DT_FLOAT
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
-    type_attr: "out_type"
+    type_attr: "T"
   }
-  output_arg {
-    name: "min_output"
-    type: DT_FLOAT
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
-  output_arg {
-    name: "max_output"
-    type: DT_FLOAT
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "Tinput"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "Tfilter"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
   attr {
-    name: "out_type"
+    name: "T"
     type: "type"
     default_value {
-      type: DT_QINT32
+      type: DT_FLOAT
     }
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
+}
+op {
+  name: "MaxPoolGradGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
   attr {
     name: "strides"
     type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
     name: "padding"
@@ -22990,1093 +23449,988 @@ op {
       }
     }
   }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
 }
 op {
-  name: "QuantizedInstanceNorm"
+  name: "MaxPoolGradGrad"
   input_arg {
-    name: "x"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "x_min"
-    type: DT_FLOAT
+    name: "orig_output"
+    type_attr: "T"
   }
   input_arg {
-    name: "x_max"
-    type: DT_FLOAT
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "output"
     type_attr: "T"
   }
-  output_arg {
-    name: "y_min"
-    type: DT_FLOAT
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
-  output_arg {
-    name: "y_max"
-    type: DT_FLOAT
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "output_range_given"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "given_y_min"
-    type: "float"
-    default_value {
-      f: 0
-    }
-  }
-  attr {
-    name: "given_y_max"
-    type: "float"
+    name: "data_format"
+    type: "string"
     default_value {
-      f: 0
+      s: "NHWC"
     }
-  }
-  attr {
-    name: "variance_epsilon"
-    type: "float"
-    default_value {
-      f: 1e-05
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
   attr {
-    name: "min_separation"
-    type: "float"
-    default_value {
-      f: 0.001
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
     }
   }
 }
 op {
-  name: "QuantizedMatMul"
-  input_arg {
-    name: "a"
-    type_attr: "T1"
-  }
-  input_arg {
-    name: "b"
-    type_attr: "T2"
-  }
-  input_arg {
-    name: "min_a"
-    type: DT_FLOAT
-  }
+  name: "MaxPoolGradGrad"
   input_arg {
-    name: "max_a"
-    type: DT_FLOAT
+    name: "orig_input"
+    type_attr: "T"
   }
   input_arg {
-    name: "min_b"
-    type: DT_FLOAT
+    name: "orig_output"
+    type_attr: "T"
   }
   input_arg {
-    name: "max_b"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "out"
-    type_attr: "Toutput"
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
-    name: "min_out"
-    type: DT_FLOAT
+    name: "output"
+    type_attr: "T"
   }
-  output_arg {
-    name: "max_out"
-    type: DT_FLOAT
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "T1"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
-      }
-    }
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "T2"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "Toutput"
-    type: "type"
+    name: "data_format"
+    type: "string"
     default_value {
-      type: DT_QINT32
+      s: "NHWC"
     }
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
   attr {
-    name: "transpose_a"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "transpose_b"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "Tactivation"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_QUINT8
-    }
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "QuantizedMaxPool"
+  name: "MaxPoolGradGradV2"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "min_input"
-    type: DT_FLOAT
+    name: "orig_output"
+    type_attr: "T"
   }
   input_arg {
-    name: "max_input"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "output"
+    name: "grad"
     type_attr: "T"
   }
-  output_arg {
-    name: "min_output"
-    type: DT_FLOAT
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
   }
   output_arg {
-    name: "max_output"
-    type: DT_FLOAT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "QuantizedMul"
-  input_arg {
-    name: "x"
-    type_attr: "T1"
-  }
+  name: "MaxPoolGradGradV2"
   input_arg {
-    name: "y"
-    type_attr: "T2"
+    name: "orig_input"
+    type_attr: "T"
   }
   input_arg {
-    name: "min_x"
-    type: DT_FLOAT
+    name: "orig_output"
+    type_attr: "T"
   }
   input_arg {
-    name: "max_x"
-    type: DT_FLOAT
+    name: "grad"
+    type_attr: "T"
   }
   input_arg {
-    name: "min_y"
-    type: DT_FLOAT
+    name: "ksize"
+    type: DT_INT32
   }
   input_arg {
-    name: "max_y"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "z"
-    type_attr: "Toutput"
-  }
-  output_arg {
-    name: "min_z"
-    type: DT_FLOAT
+    name: "strides"
+    type: DT_INT32
   }
   output_arg {
-    name: "max_z"
-    type: DT_FLOAT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "T1"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "T2"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
   attr {
-    name: "Toutput"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_QINT32
-    }
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  is_commutative: true
 }
 op {
-  name: "QuantizedRelu"
+  name: "MaxPoolGradGradV2"
   input_arg {
-    name: "features"
-    type_attr: "Tinput"
+    name: "orig_input"
+    type_attr: "T"
   }
   input_arg {
-    name: "min_features"
-    type: DT_FLOAT
+    name: "orig_output"
+    type_attr: "T"
   }
   input_arg {
-    name: "max_features"
-    type: DT_FLOAT
+    name: "grad"
+    type_attr: "T"
   }
-  output_arg {
-    name: "activations"
-    type_attr: "out_type"
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
   }
-  output_arg {
-    name: "min_activations"
-    type: DT_FLOAT
+  input_arg {
+    name: "strides"
+    type: DT_INT32
   }
   output_arg {
-    name: "max_activations"
-    type: DT_FLOAT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "Tinput"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "out_type"
-    type: "type"
+    name: "data_format"
+    type: "string"
     default_value {
-      type: DT_QUINT8
+      s: "NHWC"
     }
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "QuantizedRelu6"
+  name: "MaxPoolGradGradWithArgmax"
   input_arg {
-    name: "features"
-    type_attr: "Tinput"
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "min_features"
-    type: DT_FLOAT
+    name: "grad"
+    type_attr: "T"
   }
   input_arg {
-    name: "max_features"
-    type: DT_FLOAT
+    name: "argmax"
+    type_attr: "Targmax"
   }
   output_arg {
-    name: "activations"
-    type_attr: "out_type"
+    name: "output"
+    type_attr: "T"
   }
-  output_arg {
-    name: "min_activations"
-    type: DT_FLOAT
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
-  output_arg {
-    name: "max_activations"
-    type: DT_FLOAT
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "Tinput"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "out_type"
+    name: "Targmax"
     type: "type"
-    default_value {
-      type: DT_QUINT8
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "QuantizedReluX"
-  input_arg {
-    name: "features"
-    type_attr: "Tinput"
-  }
+  name: "MaxPoolGradGradWithArgmax"
   input_arg {
-    name: "max_value"
-    type: DT_FLOAT
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "min_features"
-    type: DT_FLOAT
+    name: "grad"
+    type_attr: "T"
   }
   input_arg {
-    name: "max_features"
-    type: DT_FLOAT
+    name: "argmax"
+    type_attr: "Targmax"
   }
   output_arg {
-    name: "activations"
-    type_attr: "out_type"
+    name: "output"
+    type_attr: "T"
   }
-  output_arg {
-    name: "min_activations"
-    type: DT_FLOAT
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
-  output_arg {
-    name: "max_activations"
-    type: DT_FLOAT
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "Tinput"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "out_type"
+    name: "Targmax"
     type: "type"
-    default_value {
-      type: DT_QUINT8
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "QuantizedReshape"
+  name: "MaxPoolGradGradWithArgmax"
   input_arg {
-    name: "tensor"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "shape"
-    type_attr: "Tshape"
-  }
-  input_arg {
-    name: "input_min"
-    type: DT_FLOAT
+    name: "grad"
+    type_attr: "T"
   }
   input_arg {
-    name: "input_max"
-    type: DT_FLOAT
+    name: "argmax"
+    type_attr: "Targmax"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
-  output_arg {
-    name: "output_min"
-    type: DT_FLOAT
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
-  output_arg {
-    name: "output_max"
-    type: DT_FLOAT
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
   }
   attr {
-    name: "Tshape"
+    name: "Targmax"
     type: "type"
-    default_value {
-      type: DT_INT32
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "QuantizedResizeBilinear"
+  name: "MaxPoolGradV2"
   input_arg {
-    name: "images"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "size"
-    type: DT_INT32
+    name: "orig_output"
+    type_attr: "T"
   }
   input_arg {
-    name: "min"
-    type: DT_FLOAT
+    name: "grad"
+    type_attr: "T"
   }
   input_arg {
-    name: "max"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "resized_images"
-    type_attr: "T"
+    name: "ksize"
+    type: DT_INT32
   }
-  output_arg {
-    name: "out_min"
-    type: DT_FLOAT
+  input_arg {
+    name: "strides"
+    type: DT_INT32
   }
   output_arg {
-    name: "out_max"
-    type: DT_FLOAT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_FLOAT
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "align_corners"
-    type: "bool"
+    name: "data_format"
+    type: "string"
     default_value {
-      b: false
+      s: "NHWC"
     }
-  }
-}
-op {
-  name: "QueueClose"
-  input_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  attr {
-    name: "cancel_pending_enqueues"
-    type: "bool"
-    default_value {
-      b: false
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
-}
-op {
-  name: "QueueCloseV2"
-  input_arg {
-    name: "handle"
-    type: DT_RESOURCE
-  }
   attr {
-    name: "cancel_pending_enqueues"
-    type: "bool"
+    name: "T"
+    type: "type"
     default_value {
-      b: false
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "QueueDequeue"
+  name: "MaxPoolGradV2"
   input_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  output_arg {
-    name: "components"
-    type_list_attr: "component_types"
+    name: "orig_input"
+    type_attr: "T"
   }
-  attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
   }
-  attr {
-    name: "timeout_ms"
-    type: "int"
-    default_value {
-      i: -1
-    }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
-}
-op {
-  name: "QueueDequeueMany"
   input_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+    name: "ksize"
+    type: DT_INT32
   }
   input_arg {
-    name: "n"
+    name: "strides"
     type: DT_INT32
   }
   output_arg {
-    name: "components"
-    type_list_attr: "component_types"
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
   }
   attr {
-    name: "timeout_ms"
-    type: "int"
+    name: "data_format"
+    type: "string"
     default_value {
-      i: -1
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
-  }
-}
-op {
-  name: "QueueDequeueManyV2"
-  input_arg {
-    name: "handle"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "n"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "components"
-    type_list_attr: "component_types"
-  }
-  attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
   }
   attr {
-    name: "timeout_ms"
-    type: "int"
+    name: "T"
+    type: "type"
     default_value {
-      i: -1
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "QueueDequeueUpTo"
+  name: "MaxPoolGradV2"
   input_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+    name: "orig_input"
+    type_attr: "T"
   }
   input_arg {
-    name: "n"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "components"
-    type_list_attr: "component_types"
-  }
-  attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "orig_output"
+    type_attr: "T"
   }
-  attr {
-    name: "timeout_ms"
-    type: "int"
-    default_value {
-      i: -1
-    }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
-}
-op {
-  name: "QueueDequeueUpToV2"
   input_arg {
-    name: "handle"
-    type: DT_RESOURCE
+    name: "ksize"
+    type: DT_INT32
   }
   input_arg {
-    name: "n"
+    name: "strides"
     type: DT_INT32
   }
   output_arg {
-    name: "components"
-    type_list_attr: "component_types"
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
   }
   attr {
-    name: "timeout_ms"
-    type: "int"
+    name: "data_format"
+    type: "string"
     default_value {
-      i: -1
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
-  }
-  is_stateful: true
-}
-op {
-  name: "QueueDequeueV2"
-  input_arg {
-    name: "handle"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "components"
-    type_list_attr: "component_types"
-  }
-  attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
   }
   attr {
-    name: "timeout_ms"
-    type: "int"
+    name: "T"
+    type: "type"
     default_value {
-      i: -1
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "QueueEnqueue"
+  name: "MaxPoolGradWithArgmax"
   input_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "components"
-    type_list_attr: "Tcomponents"
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "Tcomponents"
-    type: "list(type)"
+    name: "ksize"
+    type: "list(int)"
     has_minimum: true
-    minimum: 1
+    minimum: 4
   }
   attr {
-    name: "timeout_ms"
-    type: "int"
-    default_value {
-      i: -1
-    }
-  }
-}
-op {
-  name: "QueueEnqueueMany"
-  input_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
-  input_arg {
-    name: "components"
-    type_list_attr: "Tcomponents"
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
   }
   attr {
-    name: "Tcomponents"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
   attr {
-    name: "timeout_ms"
-    type: "int"
+    name: "T"
+    type: "type"
     default_value {
-      i: -1
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
     }
   }
 }
 op {
-  name: "QueueEnqueueManyV2"
+  name: "MaxPoolGradWithArgmax"
   input_arg {
-    name: "handle"
-    type: DT_RESOURCE
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "components"
-    type_list_attr: "Tcomponents"
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "Tcomponents"
-    type: "list(type)"
+    name: "ksize"
+    type: "list(int)"
     has_minimum: true
-    minimum: 1
+    minimum: 4
   }
   attr {
-    name: "timeout_ms"
-    type: "int"
-    default_value {
-      i: -1
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "QueueEnqueueV2"
-  input_arg {
-    name: "handle"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "components"
-    type_list_attr: "Tcomponents"
-  }
-  attr {
-    name: "Tcomponents"
-    type: "list(type)"
+    name: "strides"
+    type: "list(int)"
     has_minimum: true
-    minimum: 1
+    minimum: 4
   }
   attr {
-    name: "timeout_ms"
-    type: "int"
-    default_value {
-      i: -1
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
-  is_stateful: true
-}
-op {
-  name: "QueueIsClosed"
-  input_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  output_arg {
-    name: "is_closed"
-    type: DT_BOOL
-  }
-}
-op {
-  name: "QueueIsClosedV2"
-  input_arg {
-    name: "handle"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "is_closed"
-    type: DT_BOOL
-  }
-  is_stateful: true
-}
-op {
-  name: "QueueSize"
-  input_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  output_arg {
-    name: "size"
-    type: DT_INT32
-  }
-}
-op {
-  name: "QueueSizeV2"
-  input_arg {
-    name: "handle"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "size"
-    type: DT_INT32
-  }
-  is_stateful: true
-}
-op {
-  name: "RFFT"
-  input_arg {
-    name: "input"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "fft_length"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
-  }
-}
-op {
-  name: "RFFT2D"
-  input_arg {
-    name: "input"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "fft_length"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
-  }
-}
-op {
-  name: "RFFT3D"
-  input_arg {
-    name: "input"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "fft_length"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
-  }
-}
-op {
-  name: "RGBToHSV"
-  input_arg {
-    name: "images"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "Targmax"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
-}
-op {
-  name: "RandomCrop"
-  input_arg {
-    name: "image"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "size"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
       }
     }
   }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  deprecation {
-    version: 8
-  }
-  is_stateful: true
 }
 op {
-  name: "RandomGamma"
+  name: "MaxPoolGradWithArgmax"
   input_arg {
-    name: "shape"
-    type_attr: "S"
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "alpha"
+    name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "S"
+    name: "Targmax"
     type: "type"
     allowed_values {
       list {
@@ -24090,44 +24444,63 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "RandomPoisson"
+  name: "MaxPoolGradWithArgmax"
   input_arg {
-    name: "shape"
-    type_attr: "S"
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "rate"
-    type_attr: "dtype"
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
   }
   output_arg {
     name: "output"
-    type_attr: "dtype"
+    type_attr: "T"
   }
   attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "S"
+    name: "Targmax"
     type: "type"
     allowed_values {
       list {
@@ -24137,343 +24510,653 @@ op {
     }
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "RandomPoisson"
+  name: "MaxPoolV2"
   input_arg {
-    name: "shape"
-    type_attr: "S"
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "rate"
-    type_attr: "dtype"
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
   }
   output_arg {
     name: "output"
-    type_attr: "dtype"
-  }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    type_attr: "T"
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "T"
+    type: "type"
     default_value {
-      i: 0
+      type: DT_FLOAT
     }
-  }
-  attr {
-    name: "S"
-    type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "dtype"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
-  deprecation {
-    version: 25
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "RandomPoissonV2"
+  name: "MaxPoolV2"
   input_arg {
-    name: "shape"
-    type_attr: "S"
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "rate"
-    type_attr: "R"
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
   }
   output_arg {
     name: "output"
-    type_attr: "dtype"
+    type_attr: "T"
   }
   attr {
-    name: "seed"
-    type: "int"
+    name: "T"
+    type: "type"
     default_value {
-      i: 0
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_QINT8
+      }
     }
   }
   attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "S"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "NHWC"
+        s: "NCHW"
+        s: "NCHW_VECT_C"
       }
     }
   }
+}
+op {
+  name: "MaxPoolV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
-    name: "R"
+    name: "T"
     type: "type"
     default_value {
-      type: DT_DOUBLE
+      type: DT_FLOAT
     }
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_QINT8
       }
     }
   }
   attr {
-    name: "dtype"
-    type: "type"
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
     default_value {
-      type: DT_INT64
+      s: "NHWC"
     }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
+        s: "NHWC"
+        s: "NCHW"
+        s: "NCHW_VECT_C"
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "RandomShuffle"
+  name: "MaxPoolWithArgmax"
   input_arg {
-    name: "value"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  output_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
   attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "T"
+    name: "Targmax"
     type: "type"
-  }
-  is_stateful: true
-}
-op {
-  name: "RandomShuffleQueue"
-  output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
   attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
   }
   attr {
-    name: "shapes"
-    type: "list(shape)"
+    name: "T"
+    type: "type"
     default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_HALF
       }
     }
+  }
+}
+op {
+  name: "MaxPoolWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
     has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "capacity"
-    type: "int"
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
     default_value {
-      i: -1
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "min_after_dequeue"
-    type: "int"
-    default_value {
-      i: 0
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
     }
   }
+}
+op {
+  name: "MaxPoolWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
     default_value {
-      i: 0
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "container"
+    name: "padding"
     type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "RandomShuffleQueueV2"
+  name: "MaxPoolWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
   output_arg {
-    name: "handle"
-    type: DT_RESOURCE
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "argmax"
+    type_attr: "Targmax"
   }
   attr {
-    name: "component_types"
-    type: "list(type)"
+    name: "ksize"
+    type: "list(int)"
     has_minimum: true
-    minimum: 1
+    minimum: 4
   }
   attr {
-    name: "shapes"
-    type: "list(shape)"
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
     default_value {
+      type: DT_INT64
+    }
+    allowed_values {
       list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
-    has_minimum: true
   }
   attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: -1
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "min_after_dequeue"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
     }
   }
+}
+op {
+  name: "Maximum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
   attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
+  is_commutative: true
+}
+op {
+  name: "Maximum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
   attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
+  is_commutative: true
+}
+op {
+  name: "Mean"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
-    name: "container"
-    type: "string"
+    name: "keep_dims"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
     default_value {
-      s: ""
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "RandomStandardNormal"
+  name: "Mean"
   input_arg {
-    name: "shape"
+    name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
   output_arg {
     name: "output"
-    type_attr: "dtype"
-  }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    type_attr: "T"
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "keep_dims"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "T"
+    name: "Tidx"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
         type: DT_INT32
@@ -24481,46 +25164,59 @@ op {
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "RandomUniform"
+  name: "Mean"
   input_arg {
-    name: "shape"
+    name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
   output_arg {
     name: "output"
-    type_attr: "dtype"
-  }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    type_attr: "T"
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "keep_dims"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "T"
+    name: "Tidx"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
         type: DT_INT32
@@ -24528,421 +25224,10504 @@ op {
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "RandomUniformInt"
+  name: "Merge"
   input_arg {
-    name: "shape"
+    name: "inputs"
     type_attr: "T"
+    number_attr: "N"
   }
-  input_arg {
-    name: "minval"
-    type_attr: "Tout"
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "value_index"
+    type: DT_INT32
   }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "MergeSummary"
   input_arg {
-    name: "maxval"
-    type_attr: "Tout"
+    name: "inputs"
+    type: DT_STRING
+    number_attr: "N"
   }
   output_arg {
-    name: "output"
-    type_attr: "Tout"
+    name: "summary"
+    type: DT_STRING
   }
   attr {
-    name: "seed"
+    name: "N"
     type: "int"
-    default_value {
-      i: 0
-    }
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "MergeV2Checkpoints"
+  input_arg {
+    name: "checkpoint_prefixes"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "destination_prefix"
+    type: DT_STRING
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "delete_old_dirs"
+    type: "bool"
     default_value {
-      i: 0
+      b: true
+    }
+  }
+}
+op {
+  name: "MergeV2Checkpoints"
+  input_arg {
+    name: "checkpoint_prefixes"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "destination_prefix"
+    type: DT_STRING
+  }
+  attr {
+    name: "delete_old_dirs"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Mfcc"
+  input_arg {
+    name: "spectrogram"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sample_rate"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "upper_frequency_limit"
+    type: "float"
+    default_value {
+      f: 4000
+    }
+  }
+  attr {
+    name: "lower_frequency_limit"
+    type: "float"
+    default_value {
+      f: 20
+    }
+  }
+  attr {
+    name: "filterbank_channel_count"
+    type: "int"
+    default_value {
+      i: 40
+    }
+  }
+  attr {
+    name: "dct_coefficient_count"
+    type: "int"
+    default_value {
+      i: 13
+    }
+  }
+}
+op {
+  name: "Min"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Min"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Min"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Minimum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "Minimum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "MirrorPad"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    allowed_values {
+      list {
+        s: "REFLECT"
+        s: "SYMMETRIC"
+      }
+    }
+  }
+}
+op {
+  name: "MirrorPadGrad"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    allowed_values {
+      list {
+        s: "REFLECT"
+        s: "SYMMETRIC"
+      }
+    }
+  }
+}
+op {
+  name: "Mod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Mod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Mul"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "Mul"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "Multinomial"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_samples"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Multinomial"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_samples"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Multinomial"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_samples"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "output_dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "output_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableDenseHashTable"
+  input_arg {
+    name: "empty_key"
+    type_attr: "key_dtype"
+  }
+  output_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  attr {
+    name: "initial_num_buckets"
+    type: "int"
+    default_value {
+      i: 131072
+    }
+  }
+  attr {
+    name: "max_load_factor"
+    type: "float"
+    default_value {
+      f: 0.8
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableDenseHashTableV2"
+  input_arg {
+    name: "empty_key"
+    type_attr: "key_dtype"
+  }
+  output_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  attr {
+    name: "initial_num_buckets"
+    type: "int"
+    default_value {
+      i: 131072
+    }
+  }
+  attr {
+    name: "max_load_factor"
+    type: "float"
+    default_value {
+      f: 0.8
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableHashTable"
+  output_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableHashTableOfTensors"
+  output_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableHashTableOfTensorsV2"
+  output_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableHashTableV2"
+  output_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "Neg"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Neg"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "NegTrain"
+  input_arg {
+    name: "w_in"
+    type: DT_FLOAT
+    is_ref: true
+  }
+  input_arg {
+    name: "w_out"
+    type: DT_FLOAT
+    is_ref: true
+  }
+  input_arg {
+    name: "examples"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "labels"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "lr"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "vocab_count"
+    type: "list(int)"
+  }
+  attr {
+    name: "num_negative_samples"
+    type: "int"
+  }
+  deprecation {
+    version: 19
+  }
+  is_stateful: true
+}
+op {
+  name: "NextIteration"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "NoOp"
+}
+op {
+  name: "NonMaxSuppression"
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "scores"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_output_size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
+  }
+  attr {
+    name: "iou_threshold"
+    type: "float"
+    default_value {
+      f: 0.5
+    }
+  }
+}
+op {
+  name: "NonMaxSuppressionV2"
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "scores"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_output_size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
+  }
+}
+op {
+  name: "NotEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "NotEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "NthElement"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "n"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "NthElement"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "n"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "OneHot"
+  input_arg {
+    name: "indices"
+    type_attr: "TI"
+  }
+  input_arg {
+    name: "depth"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "on_value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "off_value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "axis"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "TI"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "OneShotIterator"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "dataset_factory"
+    type: "func"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OnesLike"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "OnesLike"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_BOOL
+      }
+    }
+  }
+}
+op {
+  name: "OrderedMapClear"
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapIncompleteSize"
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapPeek"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapSize"
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapStage"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_list_attr: "fake_dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "fake_dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapUnstage"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapUnstageNoKey"
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Pack"
+  input_arg {
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "axis"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "Pad"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "PadV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  input_arg {
+    name: "constant_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "PaddedBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "PaddedBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "PaddingFIFOQueue"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "PaddingFIFOQueueV2"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ParallelConcat"
+  input_arg {
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+}
+op {
+  name: "ParallelDynamicStitch"
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "data"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "merged"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "ParallelInterleaveDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sloppy"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ParallelMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ParallelMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ParameterizedTruncatedNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "means"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "stdevs"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "minvals"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "maxvals"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ParameterizedTruncatedNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "means"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "stdevs"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "minvals"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "maxvals"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ParseExample"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "sparse_keys"
+    type: DT_STRING
+    number_attr: "Nsparse"
+  }
+  input_arg {
+    name: "dense_keys"
+    type: DT_STRING
+    number_attr: "Ndense"
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
+  }
+  output_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+    number_attr: "Nsparse"
+  }
+  output_arg {
+    name: "sparse_values"
+    type_list_attr: "sparse_types"
+  }
+  output_arg {
+    name: "sparse_shapes"
+    type: DT_INT64
+    number_attr: "Nsparse"
+  }
+  output_arg {
+    name: "dense_values"
+    type_list_attr: "Tdense"
+  }
+  attr {
+    name: "Nsparse"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "Ndense"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+}
+op {
+  name: "ParseSingleSequenceExample"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "feature_list_dense_missing_assumed_empty"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "context_sparse_keys"
+    type: DT_STRING
+    number_attr: "Ncontext_sparse"
+  }
+  input_arg {
+    name: "context_dense_keys"
+    type: DT_STRING
+    number_attr: "Ncontext_dense"
+  }
+  input_arg {
+    name: "feature_list_sparse_keys"
+    type: DT_STRING
+    number_attr: "Nfeature_list_sparse"
+  }
+  input_arg {
+    name: "feature_list_dense_keys"
+    type: DT_STRING
+    number_attr: "Nfeature_list_dense"
+  }
+  input_arg {
+    name: "context_dense_defaults"
+    type_list_attr: "Tcontext_dense"
+  }
+  input_arg {
+    name: "debug_name"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "context_sparse_indices"
+    type: DT_INT64
+    number_attr: "Ncontext_sparse"
+  }
+  output_arg {
+    name: "context_sparse_values"
+    type_list_attr: "context_sparse_types"
+  }
+  output_arg {
+    name: "context_sparse_shapes"
+    type: DT_INT64
+    number_attr: "Ncontext_sparse"
+  }
+  output_arg {
+    name: "context_dense_values"
+    type_list_attr: "Tcontext_dense"
+  }
+  output_arg {
+    name: "feature_list_sparse_indices"
+    type: DT_INT64
+    number_attr: "Nfeature_list_sparse"
+  }
+  output_arg {
+    name: "feature_list_sparse_values"
+    type_list_attr: "feature_list_sparse_types"
+  }
+  output_arg {
+    name: "feature_list_sparse_shapes"
+    type: DT_INT64
+    number_attr: "Nfeature_list_sparse"
+  }
+  output_arg {
+    name: "feature_list_dense_values"
+    type_list_attr: "feature_list_dense_types"
+  }
+  attr {
+    name: "Ncontext_sparse"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Ncontext_dense"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Nfeature_list_sparse"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Nfeature_list_dense"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "context_sparse_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tcontext_dense"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "feature_list_dense_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "context_dense_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "feature_list_sparse_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "feature_list_dense_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+}
+op {
+  name: "ParseTensor"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+  }
+}
+op {
+  name: "Placeholder"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+}
+op {
+  name: "Placeholder"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+}
+op {
+  name: "PlaceholderV2"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+}
+op {
+  name: "PlaceholderV2"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  deprecation {
+    version: 23
+  }
+}
+op {
+  name: "PlaceholderWithDefault"
+  input_arg {
+    name: "input"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+}
+op {
+  name: "Polygamma"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "PopulationCount"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_UINT8
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+      }
+    }
+  }
+}
+op {
+  name: "PopulationCount"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_UINT8
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Pow"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Pow"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "PrefetchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "PrefetchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "PreventGradient"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "message"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "Print"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "data"
+    type_list_attr: "U"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "U"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "message"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "first_n"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "summarize"
+    type: "int"
+    default_value {
+      i: 3
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Print"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "data"
+    type_list_attr: "U"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "U"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "message"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "first_n"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "summarize"
+    type: "int"
+    default_value {
+      i: 3
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "PriorityQueue"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "PriorityQueueV2"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Prod"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Prod"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Prod"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "PyFunc"
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "token"
+    type: "string"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  is_stateful: true
+}
+op {
+  name: "PyFuncStateless"
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "token"
+    type: "string"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+}
+op {
+  name: "Qr"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "q"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "r"
+    type_attr: "T"
+  }
+  attr {
+    name: "full_matrices"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeAndDequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeAndDequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 21
+  }
+}
+op {
+  name: "QuantizeAndDequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 22
+  }
+}
+op {
+  name: "QuantizeAndDequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 22
+  }
+}
+op {
+  name: "QuantizeAndDequantizeV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeAndDequantizeV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeAndDequantizeV3"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_bits"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeAndDequantizeV3"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_bits"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeDownAndShrinkRange"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeV2"
+  input_arg {
+    name: "input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    default_value {
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeV2"
+  input_arg {
+    name: "input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    default_value {
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+        s: "SCALED"
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeV2"
+  input_arg {
+    name: "input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    default_value {
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+        s: "SCALED"
+      }
+    }
+  }
+  attr {
+    name: "round_mode"
+    type: "string"
+    default_value {
+      s: "HALF_AWAY_FROM_ZERO"
+    }
+    allowed_values {
+      list {
+        s: "HALF_AWAY_FROM_ZERO"
+        s: "HALF_TO_EVEN"
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedAdd"
+  input_arg {
+    name: "x"
+    type_attr: "T1"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T2"
+  }
+  input_arg {
+    name: "min_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_y"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_y"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "z"
+    type_attr: "Toutput"
+  }
+  output_arg {
+    name: "min_z"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_z"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "Toutput"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "QuantizedAvgPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedBatchNormWithGlobalNormalization"
+  input_arg {
+    name: "t"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "t_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "t_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "m"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "m_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "m_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "v"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "v_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "v_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "beta_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "beta_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gamma"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "gamma_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gamma_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "result"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "result_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "result_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
+    type: "bool"
+  }
+}
+op {
+  name: "QuantizedBiasAdd"
+  input_arg {
+    name: "input"
+    type_attr: "T1"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "T2"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_bias"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_bias"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_out"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConcat"
+  input_arg {
+    name: "concat_dim"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "input_mins"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  input_arg {
+    name: "input_maxes"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "QuantizedConv2D"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedConv2D"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedInstanceNorm"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "x_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "y_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "output_range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "given_y_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "given_y_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "variance_epsilon"
+    type: "float"
+    default_value {
+      f: 1e-05
+    }
+  }
+  attr {
+    name: "min_separation"
+    type: "float"
+    default_value {
+      f: 0.001
+    }
+  }
+}
+op {
+  name: "QuantizedMatMul"
+  input_arg {
+    name: "a"
+    type_attr: "T1"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T2"
+  }
+  input_arg {
+    name: "min_a"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_a"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_b"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_b"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "out"
+    type_attr: "Toutput"
+  }
+  output_arg {
+    name: "min_out"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "Toutput"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "Tactivation"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedMaxPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedMul"
+  input_arg {
+    name: "x"
+    type_attr: "T1"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T2"
+  }
+  input_arg {
+    name: "min_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_y"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_y"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "z"
+    type_attr: "Toutput"
+  }
+  output_arg {
+    name: "min_z"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_z"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "Toutput"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "QuantizedRelu"
+  input_arg {
+    name: "features"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "min_features"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_features"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_activations"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_activations"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedRelu6"
+  input_arg {
+    name: "features"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "min_features"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_features"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_activations"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_activations"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedReluX"
+  input_arg {
+    name: "features"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "max_value"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_features"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_features"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_activations"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_activations"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedReshape"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "Tshape"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tshape"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "QuantizedResizeBilinear"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "resized_images"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "out_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "QueueClose"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "cancel_pending_enqueues"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "QueueCloseV2"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "cancel_pending_enqueues"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "QueueDequeue"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "component_types"
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "QueueDequeueMany"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "n"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "component_types"
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "QueueDequeueManyV2"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "n"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "component_types"
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "QueueDequeueUpTo"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "n"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "component_types"
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "QueueDequeueUpToV2"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "n"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "component_types"
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "QueueDequeueV2"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "component_types"
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "QueueEnqueue"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "components"
+    type_list_attr: "Tcomponents"
+  }
+  attr {
+    name: "Tcomponents"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "QueueEnqueueMany"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "components"
+    type_list_attr: "Tcomponents"
+  }
+  attr {
+    name: "Tcomponents"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "QueueEnqueueManyV2"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "components"
+    type_list_attr: "Tcomponents"
+  }
+  attr {
+    name: "Tcomponents"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "QueueEnqueueV2"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "components"
+    type_list_attr: "Tcomponents"
+  }
+  attr {
+    name: "Tcomponents"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "QueueIsClosed"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "is_closed"
+    type: DT_BOOL
+  }
+}
+op {
+  name: "QueueIsClosedV2"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "is_closed"
+    type: DT_BOOL
+  }
+  is_stateful: true
+}
+op {
+  name: "QueueSize"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+}
+op {
+  name: "QueueSizeV2"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  is_stateful: true
+}
+op {
+  name: "RFFT"
+  input_arg {
+    name: "input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "fft_length"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+}
+op {
+  name: "RFFT2D"
+  input_arg {
+    name: "input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "fft_length"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+}
+op {
+  name: "RFFT3D"
+  input_arg {
+    name: "input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "fft_length"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+}
+op {
+  name: "RGBToHSV"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "RandomCrop"
+  input_arg {
+    name: "image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  deprecation {
+    version: 8
+  }
+  is_stateful: true
+}
+op {
+  name: "RandomDataset"
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "RandomGamma"
+  input_arg {
+    name: "shape"
+    type_attr: "S"
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "S"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RandomPoisson"
+  input_arg {
+    name: "shape"
+    type_attr: "S"
+  }
+  input_arg {
+    name: "rate"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "S"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RandomPoisson"
+  input_arg {
+    name: "shape"
+    type_attr: "S"
+  }
+  input_arg {
+    name: "rate"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "S"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 25
+  }
+  is_stateful: true
+}
+op {
+  name: "RandomPoissonV2"
+  input_arg {
+    name: "shape"
+    type_attr: "S"
+  }
+  input_arg {
+    name: "rate"
+    type_attr: "R"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "S"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "R"
+    type: "type"
+    default_value {
+      type: DT_DOUBLE
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RandomShuffle"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "RandomShuffleQueue"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "min_after_dequeue"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RandomShuffleQueueV2"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "min_after_dequeue"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RandomStandardNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RandomStandardNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RandomUniform"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RandomUniform"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RandomUniformInt"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "minval"
+    type_attr: "Tout"
+  }
+  input_arg {
+    name: "maxval"
+    type_attr: "Tout"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Range"
+  input_arg {
+    name: "start"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "limit"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tidx"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Range"
+  input_arg {
+    name: "start"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "limit"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tidx"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "RangeDataset"
+  input_arg {
+    name: "start"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "stop"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "Rank"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT32
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "ReadFile"
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+}
+op {
+  name: "ReadVariableOp"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "ReaderNumRecordsProduced"
+  input_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "records_produced"
+    type: DT_INT64
+  }
+}
+op {
+  name: "ReaderNumRecordsProducedV2"
+  input_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "records_produced"
+    type: DT_INT64
+  }
+  is_stateful: true
+}
+op {
+  name: "ReaderNumWorkUnitsCompleted"
+  input_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "units_completed"
+    type: DT_INT64
+  }
+}
+op {
+  name: "ReaderNumWorkUnitsCompletedV2"
+  input_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "units_completed"
+    type: DT_INT64
+  }
+  is_stateful: true
+}
+op {
+  name: "ReaderRead"
+  input_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "queue_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "key"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "value"
+    type: DT_STRING
+  }
+}
+op {
+  name: "ReaderReadUpTo"
+  input_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "queue_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "num_records"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "keys"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "values"
+    type: DT_STRING
+  }
+}
+op {
+  name: "ReaderReadUpToV2"
+  input_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "queue_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "num_records"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "keys"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "values"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "ReaderReadV2"
+  input_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "queue_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "key"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "value"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "ReaderReset"
+  input_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+}
+op {
+  name: "ReaderResetV2"
+  input_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "ReaderRestoreState"
+  input_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "state"
+    type: DT_STRING
+  }
+}
+op {
+  name: "ReaderRestoreStateV2"
+  input_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "state"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "ReaderSerializeState"
+  input_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "state"
+    type: DT_STRING
+  }
+}
+op {
+  name: "ReaderSerializeStateV2"
+  input_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "state"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "Real"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "RealDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "RealDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Reciprocal"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Reciprocal"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "ReciprocalGrad"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "ReciprocalGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "ReciprocalGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "RecordInput"
+  output_arg {
+    name: "records"
+    type: DT_STRING
+  }
+  attr {
+    name: "file_pattern"
+    type: "string"
+  }
+  attr {
+    name: "file_random_seed"
+    type: "int"
+    default_value {
+      i: 301
+    }
+  }
+  attr {
+    name: "file_shuffle_shift_ratio"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "file_buffer_size"
+    type: "int"
+    default_value {
+      i: 10000
+    }
+  }
+  attr {
+    name: "file_parallelism"
+    type: "int"
+    default_value {
+      i: 16
+    }
+  }
+  attr {
+    name: "batch_size"
+    type: "int"
+    default_value {
+      i: 32
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ReduceJoin"
+  input_arg {
+    name: "inputs"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "reduction_indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "separator"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "RefEnter"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+    is_ref: true
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "frame_name"
+    type: "string"
+  }
+  attr {
+    name: "is_constant"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "parallel_iterations"
+    type: "int"
+    default_value {
+      i: 10
+    }
+  }
+}
+op {
+  name: "RefExit"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+    is_ref: true
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "RefIdentity"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+    is_ref: true
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "RefMerge"
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+    is_ref: true
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+    is_ref: true
+  }
+  output_arg {
+    name: "value_index"
+    type: DT_INT32
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "RefNextIteration"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+    is_ref: true
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "RefSelect"
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+    is_ref: true
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "RefSwitch"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "pred"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "output_false"
+    type_attr: "T"
+    is_ref: true
+  }
+  output_arg {
+    name: "output_true"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "Relu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "Relu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Relu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "Relu6"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "Relu6"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Relu6"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "Relu6Grad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "Relu6Grad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Relu6Grad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "ReluGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "ReluGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "ReluGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "RemoteCall"
+  input_arg {
+    name: "target"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
+op {
+  name: "RemoteFusedGraphExecute"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "Tinputs"
+  }
+  output_arg {
+    name: "outputs"
+    type_list_attr: "Toutputs"
+  }
+  attr {
+    name: "Tinputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Toutputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "serialized_remote_fused_graph_execute_info"
+    type: "string"
+  }
+}
+op {
+  name: "RepeatDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "count"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "RepeatDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "count"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "RequantizationRange"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+}
+op {
+  name: "Requantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "requested_output_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "requested_output_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+}
+op {
+  name: "Reshape"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "Tshape"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tshape"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ResizeArea"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeArea"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeBicubic"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeBicubic"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeBicubicGrad"
+  input_arg {
+    name: "grads"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "original_image"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeBilinear"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeBilinear"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeBilinearGrad"
+  input_arg {
+    name: "grads"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "original_image"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeNearestNeighbor"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeNearestNeighbor"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResizeNearestNeighborGrad"
+  input_arg {
+    name: "grads"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT32
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ResourceApplyAdadelta"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum_update"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdadelta"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum_update"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdadelta"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum_update"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdagradDA"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdagradDA"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdagradDA"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdam"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "v"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdam"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "v"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdam"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "v"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdam"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "v"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAddSign"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAddSign"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyCenteredRMSProp"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mg"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyCenteredRMSProp"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mg"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyCenteredRMSProp"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mg"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyFtrl"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyFtrl"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyFtrl"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyFtrlV2"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyFtrlV2"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyFtrlV2"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyGradientDescent"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyGradientDescent"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyGradientDescent"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyMomentum"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyMomentum"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyMomentum"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
     }
   }
   attr {
-    name: "Tout"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   is_stateful: true
 }
 op {
-  name: "Range"
+  name: "ResourceApplyPowerSign"
   input_arg {
-    name: "start"
-    type_attr: "Tidx"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "limit"
-    type_attr: "Tidx"
+    name: "m"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "delta"
-    type_attr: "Tidx"
+    name: "lr"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output"
-    type_attr: "Tidx"
+  input_arg {
+    name: "logbase"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   attr {
-    name: "Tidx"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "RangeDataset"
+  name: "ResourceApplyPowerSign"
   input_arg {
-    name: "start"
-    type: DT_INT64
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "stop"
-    type: DT_INT64
+    name: "m"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "step"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "logbase"
+    type_attr: "T"
   }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "sign_decay"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "Rank"
   input_arg {
-    name: "input"
+    name: "beta"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
-    type: DT_INT32
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-}
-op {
-  name: "ReadFile"
-  input_arg {
-    name: "filename"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "contents"
-    type: DT_STRING
-  }
-}
-op {
-  name: "ReadVariableOp"
-  input_arg {
-    name: "resource"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "value"
-    type_attr: "dtype"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
   attr {
-    name: "dtype"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   is_stateful: true
 }
 op {
-  name: "ReaderNumRecordsProduced"
-  input_arg {
-    name: "reader_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  output_arg {
-    name: "records_produced"
-    type: DT_INT64
-  }
-}
-op {
-  name: "ReaderNumRecordsProducedV2"
+  name: "ResourceApplyProximalAdagrad"
   input_arg {
-    name: "reader_handle"
+    name: "var"
     type: DT_RESOURCE
   }
-  output_arg {
-    name: "records_produced"
-    type: DT_INT64
-  }
-  is_stateful: true
-}
-op {
-  name: "ReaderNumWorkUnitsCompleted"
-  input_arg {
-    name: "reader_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  output_arg {
-    name: "units_completed"
-    type: DT_INT64
-  }
-}
-op {
-  name: "ReaderNumWorkUnitsCompletedV2"
   input_arg {
-    name: "reader_handle"
+    name: "accum"
     type: DT_RESOURCE
   }
-  output_arg {
-    name: "units_completed"
-    type: DT_INT64
-  }
-  is_stateful: true
-}
-op {
-  name: "ReaderRead"
-  input_arg {
-    name: "reader_handle"
-    type: DT_STRING
-    is_ref: true
-  }
   input_arg {
-    name: "queue_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  output_arg {
-    name: "key"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "value"
-    type: DT_STRING
+    name: "lr"
+    type_attr: "T"
   }
-}
-op {
-  name: "ReaderReadUpTo"
   input_arg {
-    name: "reader_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "l1"
+    type_attr: "T"
   }
   input_arg {
-    name: "queue_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "l2"
+    type_attr: "T"
   }
   input_arg {
-    name: "num_records"
-    type: DT_INT64
+    name: "grad"
+    type_attr: "T"
   }
-  output_arg {
-    name: "keys"
-    type: DT_STRING
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
   }
-  output_arg {
-    name: "values"
-    type: DT_STRING
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
+  is_stateful: true
 }
 op {
-  name: "ReaderReadUpToV2"
+  name: "ResourceApplyProximalAdagrad"
   input_arg {
-    name: "reader_handle"
+    name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "queue_handle"
+    name: "accum"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "num_records"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "keys"
-    type: DT_STRING
+    name: "lr"
+    type_attr: "T"
   }
-  output_arg {
-    name: "values"
-    type: DT_STRING
+  input_arg {
+    name: "l1"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "ReaderReadV2"
   input_arg {
-    name: "reader_handle"
-    type: DT_RESOURCE
+    name: "l2"
+    type_attr: "T"
   }
   input_arg {
-    name: "queue_handle"
-    type: DT_RESOURCE
+    name: "grad"
+    type_attr: "T"
   }
-  output_arg {
-    name: "key"
-    type: DT_STRING
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
-  output_arg {
-    name: "value"
-    type: DT_STRING
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   is_stateful: true
 }
 op {
-  name: "ReaderReset"
-  input_arg {
-    name: "reader_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-}
-op {
-  name: "ReaderResetV2"
+  name: "ResourceApplyProximalAdagrad"
   input_arg {
-    name: "reader_handle"
+    name: "var"
     type: DT_RESOURCE
   }
-  is_stateful: true
-}
-op {
-  name: "ReaderRestoreState"
-  input_arg {
-    name: "reader_handle"
-    type: DT_STRING
-    is_ref: true
-  }
   input_arg {
-    name: "state"
-    type: DT_STRING
-  }
-}
-op {
-  name: "ReaderRestoreStateV2"
-  input_arg {
-    name: "reader_handle"
+    name: "accum"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "state"
-    type: DT_STRING
+    name: "lr"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "ReaderSerializeState"
   input_arg {
-    name: "reader_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  output_arg {
-    name: "state"
-    type: DT_STRING
+    name: "l1"
+    type_attr: "T"
   }
-}
-op {
-  name: "ReaderSerializeStateV2"
   input_arg {
-    name: "reader_handle"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "state"
-    type: DT_STRING
+    name: "l2"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "Real"
   input_arg {
-    name: "input"
+    name: "grad"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
-    type_attr: "Tout"
-  }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_COMPLEX64
-    }
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
+      b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "RealDiv"
+  name: "ResourceApplyProximalGradientDescent"
   input_arg {
-    name: "x"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "alpha"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
+    name: "l1"
     type_attr: "T"
   }
-  output_arg {
-    name: "z"
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
     type_attr: "T"
   }
   attr {
@@ -24950,29 +35729,52 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
-        type: DT_INT8
         type: DT_UINT16
         type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "Reciprocal"
+  name: "ResourceApplyProximalGradientDescent"
   input_arg {
-    name: "x"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "alpha"
     type_attr: "T"
   }
-  output_arg {
-    name: "y"
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
     type_attr: "T"
   }
   attr {
@@ -24980,29 +35782,54 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "ReciprocalGrad"
+  name: "ResourceApplyProximalGradientDescent"
   input_arg {
-    name: "x"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "alpha"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
+    name: "l1"
     type_attr: "T"
   }
-  output_arg {
-    name: "z"
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
     type_attr: "T"
   }
   attr {
@@ -25010,27 +35837,67 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "ReciprocalGrad"
+  name: "ResourceApplyRMSProp"
   input_arg {
-    name: "y"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "dy"
+    name: "rho"
     type_attr: "T"
   }
-  output_arg {
-    name: "z"
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   attr {
@@ -25038,917 +35905,769 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
-}
-op {
-  name: "RecordInput"
-  output_arg {
-    name: "records"
-    type: DT_STRING
-  }
-  attr {
-    name: "file_pattern"
-    type: "string"
-  }
-  attr {
-    name: "file_random_seed"
-    type: "int"
-    default_value {
-      i: 301
-    }
-  }
-  attr {
-    name: "file_shuffle_shift_ratio"
-    type: "float"
-    default_value {
-      f: 0
-    }
-  }
-  attr {
-    name: "file_buffer_size"
-    type: "int"
-    default_value {
-      i: 10000
-    }
-  }
-  attr {
-    name: "file_parallelism"
-    type: "int"
-    default_value {
-      i: 16
-    }
-  }
   attr {
-    name: "batch_size"
-    type: "int"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      i: 32
+      b: false
     }
   }
   is_stateful: true
 }
 op {
-  name: "ReduceJoin"
+  name: "ResourceApplyRMSProp"
   input_arg {
-    name: "inputs"
-    type: DT_STRING
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "reduction_indices"
-    type: DT_INT32
+    name: "ms"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "output"
-    type: DT_STRING
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "separator"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
   }
-}
-op {
-  name: "RefEnter"
   input_arg {
-    name: "data"
+    name: "momentum"
     type_attr: "T"
-    is_ref: true
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
-    name: "frame_name"
-    type: "string"
-  }
-  attr {
-    name: "is_constant"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
     }
   }
-  attr {
-    name: "parallel_iterations"
-    type: "int"
-    default_value {
-      i: 10
-    }
-  }
+  is_stateful: true
 }
 op {
-  name: "RefExit"
+  name: "ResourceApplyRMSProp"
   input_arg {
-    name: "data"
-    type_attr: "T"
-    is_ref: true
+    name: "var"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-    is_ref: true
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "T"
-    type: "type"
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
   }
-}
-op {
-  name: "RefIdentity"
   input_arg {
-    name: "input"
+    name: "lr"
     type_attr: "T"
-    is_ref: true
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "rho"
     type_attr: "T"
-    is_ref: true
-  }
-  attr {
-    name: "T"
-    type: "type"
   }
-  allows_uninitialized_input: true
-}
-op {
-  name: "RefMerge"
   input_arg {
-    name: "inputs"
+    name: "momentum"
     type_attr: "T"
-    number_attr: "N"
-    is_ref: true
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "epsilon"
     type_attr: "T"
-    is_ref: true
   }
-  output_arg {
-    name: "value_index"
-    type: DT_INT32
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
   attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
+  is_stateful: true
 }
 op {
-  name: "RefNextIteration"
+  name: "ResourceCountUpTo"
   input_arg {
-    name: "data"
-    type_attr: "T"
-    is_ref: true
+    name: "resource"
+    type: DT_RESOURCE
   }
   output_arg {
     name: "output"
     type_attr: "T"
-    is_ref: true
+  }
+  attr {
+    name: "limit"
+    type: "int"
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
+  is_stateful: true
 }
 op {
-  name: "RefSelect"
+  name: "ResourceGather"
   input_arg {
-    name: "index"
-    type: DT_INT32
+    name: "resource"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "inputs"
-    type_attr: "T"
-    number_attr: "N"
-    is_ref: true
+    name: "indices"
+    type_attr: "Tindices"
   }
   output_arg {
     name: "output"
-    type_attr: "T"
-    is_ref: true
+    type_attr: "dtype"
   }
   attr {
-    name: "T"
+    name: "validate_indices"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "dtype"
     type: "type"
   }
   attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
+  is_stateful: true
 }
 op {
-  name: "RefSwitch"
+  name: "ResourceScatterAdd"
   input_arg {
-    name: "data"
-    type_attr: "T"
-    is_ref: true
+    name: "resource"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "pred"
-    type: DT_BOOL
-  }
-  output_arg {
-    name: "output_false"
-    type_attr: "T"
-    is_ref: true
-  }
-  output_arg {
-    name: "output_true"
-    type_attr: "T"
-    is_ref: true
-  }
-  attr {
-    name: "T"
-    type: "type"
+    name: "indices"
+    type_attr: "Tindices"
   }
-  allows_uninitialized_input: true
-}
-op {
-  name: "Relu"
   input_arg {
-    name: "features"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "activations"
-    type_attr: "T"
+    name: "updates"
+    type_attr: "dtype"
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
       }
     }
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "Relu"
+  name: "ResourceScatterAdd"
   input_arg {
-    name: "features"
-    type_attr: "T"
+    name: "resource"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "activations"
-    type_attr: "T"
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "Relu6"
+  name: "ResourceScatterAdd"
   input_arg {
-    name: "features"
-    type_attr: "T"
+    name: "resource"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "activations"
-    type_attr: "T"
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
-}
-op {
-  name: "Relu6"
-  input_arg {
-    name: "features"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "activations"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
+  is_stateful: true
 }
 op {
-  name: "Relu6Grad"
+  name: "ResourceScatterNdUpdate"
   input_arg {
-    name: "gradients"
-    type_attr: "T"
+    name: "ref"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "features"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
-  output_arg {
-    name: "backprops"
+  input_arg {
+    name: "updates"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "Relu6Grad"
+  name: "ResourceScatterUpdate"
   input_arg {
-    name: "gradients"
-    type_attr: "T"
+    name: "resource"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "features"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
-  output_arg {
-    name: "backprops"
-    type_attr: "T"
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-}
-op {
-  name: "ReluGrad"
-  input_arg {
-    name: "gradients"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "features"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "backprops"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
+  is_stateful: true
 }
 op {
-  name: "ReluGrad"
+  name: "ResourceScatterUpdate"
   input_arg {
-    name: "gradients"
-    type_attr: "T"
+    name: "resource"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "features"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
-  output_arg {
-    name: "backprops"
-    type_attr: "T"
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
-}
-op {
-  name: "RemoteCall"
-  input_arg {
-    name: "target"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "args"
-    type_list_attr: "Tin"
-  }
-  output_arg {
-    name: "output"
-    type_list_attr: "Tout"
-  }
-  attr {
-    name: "Tin"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "Tout"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
   attr {
-    name: "f"
-    type: "func"
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
+  is_stateful: true
 }
 op {
-  name: "RemoteFusedGraphExecute"
+  name: "ResourceSparseApplyAdadelta"
   input_arg {
-    name: "inputs"
-    type_list_attr: "Tinputs"
-  }
-  output_arg {
-    name: "outputs"
-    type_list_attr: "Toutputs"
-  }
-  attr {
-    name: "Tinputs"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Toutputs"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "serialized_remote_fused_graph_execute_info"
-    type: "string"
+    name: "var"
+    type: DT_RESOURCE
   }
-}
-op {
-  name: "RepeatDataset"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "accum"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "count"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "accum_update"
+    type: DT_RESOURCE
   }
-  is_stateful: true
-}
-op {
-  name: "RepeatDataset"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "count"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "rho"
+    type_attr: "T"
   }
-}
-op {
-  name: "RequantizationRange"
   input_arg {
-    name: "input"
-    type_attr: "Tinput"
+    name: "epsilon"
+    type_attr: "T"
   }
   input_arg {
-    name: "input_min"
-    type: DT_FLOAT
+    name: "grad"
+    type_attr: "T"
   }
   input_arg {
-    name: "input_max"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "output_min"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "output_max"
-    type: DT_FLOAT
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
-    name: "Tinput"
+    name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
         type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
-}
-op {
-  name: "Requantize"
-  input_arg {
-    name: "input"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "input_min"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "input_max"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "requested_output_min"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "requested_output_max"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "output"
-    type_attr: "out_type"
-  }
-  output_arg {
-    name: "output_min"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "output_max"
-    type: DT_FLOAT
-  }
   attr {
-    name: "Tinput"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "out_type"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
-      }
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "Reshape"
+  name: "ResourceSparseApplyAdadelta"
   input_arg {
-    name: "tensor"
-    type_attr: "T"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "shape"
-    type_attr: "Tshape"
+    name: "accum"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  input_arg {
+    name: "accum_update"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "T"
-    type: "type"
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "Tshape"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
   }
-}
-op {
-  name: "ResizeArea"
   input_arg {
-    name: "images"
+    name: "epsilon"
     type_attr: "T"
   }
   input_arg {
-    name: "size"
-    type: DT_INT32
+    name: "grad"
+    type_attr: "T"
   }
-  output_arg {
-    name: "resized_images"
-    type: DT_FLOAT
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
-        type: DT_INT8
+        type: DT_UINT16
         type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "align_corners"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-}
-op {
-  name: "ResizeArea"
-  input_arg {
-    name: "images"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "size"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "resized_images"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "T"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_UINT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "align_corners"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "ResizeBicubic"
+  name: "ResourceSparseApplyAdadelta"
   input_arg {
-    name: "images"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum_update"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "size"
-    type: DT_INT32
+    name: "rho"
+    type_attr: "T"
   }
-  output_arg {
-    name: "resized_images"
-    type: DT_FLOAT
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-      }
-    }
-  }
-  attr {
-    name: "align_corners"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-}
-op {
-  name: "ResizeBicubic"
-  input_arg {
-    name: "images"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "size"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "resized_images"
-    type: DT_FLOAT
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
   attr {
-    name: "T"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_UINT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "align_corners"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "ResizeBicubicGrad"
+  name: "ResourceSparseApplyAdagrad"
   input_arg {
-    name: "grads"
-    type: DT_FLOAT
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "original_image"
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
   attr {
     name: "T"
     type: "type"
@@ -25956,252 +36675,512 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "align_corners"
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "ResizeBilinear"
+  name: "ResourceSparseApplyAdagrad"
   input_arg {
-    name: "images"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "size"
-    type: DT_INT32
+    name: "grad"
+    type_attr: "T"
   }
-  output_arg {
-    name: "resized_images"
-    type: DT_FLOAT
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
-        type: DT_INT8
+        type: DT_UINT16
         type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
         type: DT_INT32
         type: DT_INT64
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "align_corners"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "ResizeBilinear"
+  name: "ResourceSparseApplyAdagrad"
   input_arg {
-    name: "images"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "size"
-    type: DT_INT32
+    name: "grad"
+    type_attr: "T"
   }
-  output_arg {
-    name: "resized_images"
-    type: DT_FLOAT
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
-        type: DT_INT16
         type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
         type: DT_INT32
         type: DT_INT64
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "align_corners"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "ResizeBilinearGrad"
+  name: "ResourceSparseApplyAdagradDA"
   input_arg {
-    name: "grads"
-    type: DT_FLOAT
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "original_image"
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_HALF
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "align_corners"
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "ResizeNearestNeighbor"
+  name: "ResourceSparseApplyAdagradDA"
   input_arg {
-    name: "images"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "size"
-    type: DT_INT32
+    name: "indices"
+    type_attr: "Tindices"
   }
-  output_arg {
-    name: "resized_images"
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
     type_attr: "T"
   }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
-        type: DT_INT8
+        type: DT_UINT16
         type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
         type: DT_INT32
         type: DT_INT64
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "align_corners"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "ResizeNearestNeighbor"
+  name: "ResourceSparseApplyAdagradDA"
   input_arg {
-    name: "images"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "size"
-    type: DT_INT32
+    name: "indices"
+    type_attr: "Tindices"
   }
-  output_arg {
-    name: "resized_images"
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
     type_attr: "T"
   }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
-        type: DT_INT16
         type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
         type: DT_INT32
         type: DT_INT64
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "align_corners"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "ResizeNearestNeighborGrad"
+  name: "ResourceSparseApplyCenteredRMSProp"
   input_arg {
-    name: "grads"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mg"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "size"
-    type: DT_INT32
+    name: "rho"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
         type: DT_INT8
-        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "align_corners"
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "ResourceApplyAdadelta"
+  name: "ResourceSparseApplyCenteredRMSProp"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
+    name: "mg"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum_update"
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
     type: DT_RESOURCE
   }
   input_arg {
@@ -26212,6 +37191,10 @@ op {
     name: "rho"
     type_attr: "T"
   }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
   input_arg {
     name: "epsilon"
     type_attr: "T"
@@ -26220,6 +37203,10 @@ op {
     name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
   attr {
     name: "T"
     type: "type"
@@ -26239,6 +37226,18 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -26252,17 +37251,21 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyAdadelta"
+  name: "ResourceSparseApplyCenteredRMSProp"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
+    name: "mg"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum_update"
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
     type: DT_RESOURCE
   }
   input_arg {
@@ -26273,6 +37276,10 @@ op {
     name: "rho"
     type_attr: "T"
   }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
   input_arg {
     name: "epsilon"
     type_attr: "T"
@@ -26281,6 +37288,10 @@ op {
     name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
   attr {
     name: "T"
     type: "type"
@@ -26302,6 +37313,17 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -26315,7 +37337,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyAdagrad"
+  name: "ResourceSparseApplyFtrl"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -26324,12 +37346,32 @@ op {
     name: "accum"
     type: DT_RESOURCE
   }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
     type_attr: "T"
   }
   attr {
@@ -26355,53 +37397,12 @@ op {
     }
   }
   attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "ResourceApplyAdagrad"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_INT64
       }
     }
   }
@@ -26415,23 +37416,27 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyAdagradDA"
+  name: "ResourceSparseApplyFtrl"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "gradient_accumulator"
+    name: "accum"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "gradient_squared_accumulator"
+    name: "linear"
     type: DT_RESOURCE
   }
   input_arg {
     name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
   input_arg {
     name: "lr"
     type_attr: "T"
@@ -26445,8 +37450,8 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "global_step"
-    type: DT_INT64
+    name: "lr_power"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -26467,6 +37472,18 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -26480,23 +37497,27 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyAdagradDA"
+  name: "ResourceSparseApplyFtrl"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "gradient_accumulator"
+    name: "accum"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "gradient_squared_accumulator"
+    name: "linear"
     type: DT_RESOURCE
   }
   input_arg {
     name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
   input_arg {
     name: "lr"
     type_attr: "T"
@@ -26510,8 +37531,8 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "global_step"
-    type: DT_INT64
+    name: "lr_power"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -26534,6 +37555,17 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -26547,45 +37579,45 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyAdam"
+  name: "ResourceSparseApplyFtrlV2"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "m"
+    name: "accum"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "v"
+    name: "linear"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "beta1_power"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "beta2_power"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "beta1"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "beta2"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l2_shrinkage"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "lr_power"
     type_attr: "T"
   }
   attr {
@@ -26610,6 +37642,16 @@ op {
       }
     }
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   attr {
     name: "use_locking"
     type: "bool"
@@ -26620,45 +37662,45 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyAdam"
+  name: "ResourceSparseApplyFtrlV2"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "m"
+    name: "accum"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "v"
+    name: "linear"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "beta1_power"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "beta2_power"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "beta1"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "beta2"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l2_shrinkage"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "lr_power"
     type_attr: "T"
   }
   attr {
@@ -26680,18 +37722,23 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "use_nesterov"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
@@ -26700,45 +37747,45 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyAdam"
+  name: "ResourceSparseApplyFtrlV2"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "m"
+    name: "accum"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "v"
+    name: "linear"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "beta1_power"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "beta2_power"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "beta1"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "beta2"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l2_shrinkage"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "lr_power"
     type_attr: "T"
   }
   attr {
@@ -26762,18 +37809,22 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "use_nesterov"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
@@ -26782,21 +37833,13 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyCenteredRMSProp"
+  name: "ResourceSparseApplyMomentum"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "mg"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "ms"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "mom"
+    name: "accum"
     type: DT_RESOURCE
   }
   input_arg {
@@ -26804,19 +37847,15 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "momentum"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "grad"
+    name: "momentum"
     type_attr: "T"
   }
   attr {
@@ -26841,6 +37880,16 @@ op {
       }
     }
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   attr {
     name: "use_locking"
     type: "bool"
@@ -26848,24 +37897,23 @@ op {
       b: false
     }
   }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   is_stateful: true
 }
 op {
-  name: "ResourceApplyCenteredRMSProp"
+  name: "ResourceSparseApplyMomentum"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "mg"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "ms"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "mom"
+    name: "accum"
     type: DT_RESOURCE
   }
   input_arg {
@@ -26873,19 +37921,15 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "momentum"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "grad"
+    name: "momentum"
     type_attr: "T"
   }
   attr {
@@ -26912,6 +37956,16 @@ op {
       }
     }
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   attr {
     name: "use_locking"
     type: "bool"
@@ -26919,10 +37973,17 @@ op {
       b: false
     }
   }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   is_stateful: true
 }
 op {
-  name: "ResourceApplyFtrl"
+  name: "ResourceSparseApplyMomentum"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -26931,28 +37992,20 @@ op {
     name: "accum"
     type: DT_RESOURCE
   }
-  input_arg {
-    name: "linear"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "lr_power"
+    name: "momentum"
     type_attr: "T"
   }
   attr {
@@ -26974,6 +38027,19 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -26984,10 +38050,17 @@ op {
       b: false
     }
   }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   is_stateful: true
 }
 op {
-  name: "ResourceApplyFtrl"
+  name: "ResourceSparseApplyProximalAdagrad"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -26996,14 +38069,6 @@ op {
     name: "accum"
     type: DT_RESOURCE
   }
-  input_arg {
-    name: "linear"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
   input_arg {
     name: "lr"
     type_attr: "T"
@@ -27017,9 +38082,13 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "lr_power"
+    name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
   attr {
     name: "T"
     type: "type"
@@ -27039,8 +38108,16 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -27054,7 +38131,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyFtrlV2"
+  name: "ResourceSparseApplyProximalAdagrad"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -27063,14 +38140,6 @@ op {
     name: "accum"
     type: DT_RESOURCE
   }
-  input_arg {
-    name: "linear"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
   input_arg {
     name: "lr"
     type_attr: "T"
@@ -27084,12 +38153,12 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "l2_shrinkage"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "lr_power"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
     name: "T"
@@ -27110,6 +38179,18 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -27123,7 +38204,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyFtrlV2"
+  name: "ResourceSparseApplyProximalAdagrad"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -27132,14 +38213,6 @@ op {
     name: "accum"
     type: DT_RESOURCE
   }
-  input_arg {
-    name: "linear"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
   input_arg {
     name: "lr"
     type_attr: "T"
@@ -27153,12 +38226,12 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "l2_shrinkage"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "lr_power"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
     name: "T"
@@ -27181,6 +38254,17 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -27194,7 +38278,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyGradientDescent"
+  name: "ResourceSparseApplyProximalGradientDescent"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -27204,9 +38288,21 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "delta"
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
   attr {
     name: "T"
     type: "type"
@@ -27229,6 +38325,16 @@ op {
       }
     }
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   attr {
     name: "use_locking"
     type: "bool"
@@ -27239,7 +38345,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyGradientDescent"
+  name: "ResourceSparseApplyProximalGradientDescent"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -27249,9 +38355,21 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "delta"
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
   attr {
     name: "T"
     type: "type"
@@ -27276,6 +38394,16 @@ op {
       }
     }
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   attr {
     name: "use_locking"
     type: "bool"
@@ -27286,27 +38414,31 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyMomentum"
+  name: "ResourceSparseApplyProximalGradientDescent"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
-    type: DT_RESOURCE
+    name: "alpha"
+    type_attr: "T"
   }
   input_arg {
-    name: "lr"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "momentum"
+    name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
   attr {
     name: "T"
     type: "type"
@@ -27326,18 +38458,24 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "use_nesterov"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
@@ -27346,13 +38484,17 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyMomentum"
+  name: "ResourceSparseApplyRMSProp"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
     type: DT_RESOURCE
   }
   input_arg {
@@ -27360,13 +38502,25 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "rho"
     type_attr: "T"
   }
   input_arg {
     name: "momentum"
     type_attr: "T"
   }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
   attr {
     name: "T"
     type: "type"
@@ -27386,20 +38540,21 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "use_nesterov"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
@@ -27408,13 +38563,17 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyProximalAdagrad"
+  name: "ResourceSparseApplyRMSProp"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
     type: DT_RESOURCE
   }
   input_arg {
@@ -27422,17 +38581,25 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "rho"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
     type_attr: "T"
   }
   input_arg {
     name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
   attr {
     name: "T"
     type: "type"
@@ -27452,6 +38619,18 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -27465,13 +38644,17 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyProximalAdagrad"
+  name: "ResourceSparseApplyRMSProp"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
     type: DT_RESOURCE
   }
   input_arg {
@@ -27479,17 +38662,25 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "rho"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
     type_attr: "T"
   }
   input_arg {
     name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
   attr {
     name: "T"
     type: "type"
@@ -27511,6 +38702,17 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -27524,78 +38726,255 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyProximalGradientDescent"
+  name: "ResourceStridedSliceAssign"
   input_arg {
-    name: "var"
+    name: "ref"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "alpha"
-    type_attr: "T"
+    name: "begin"
+    type_attr: "Index"
   }
   input_arg {
-    name: "l1"
-    type_attr: "T"
+    name: "end"
+    type_attr: "Index"
   }
   input_arg {
-    name: "l2"
-    type_attr: "T"
+    name: "strides"
+    type_attr: "Index"
   }
   input_arg {
-    name: "delta"
+    name: "value"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+  }
+  attr {
+    name: "Index"
+    type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "begin_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "end_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "ellipsis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "new_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "shrink_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Restore"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "Restore"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RestoreSlice"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slice"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "RestoreSlice"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slice"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RestoreV2"
+  input_arg {
+    name: "prefix"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slices"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensors"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyProximalGradientDescent"
+  name: "RestoreV2"
   input_arg {
-    name: "var"
-    type: DT_RESOURCE
+    name: "prefix"
+    type: DT_STRING
   }
   input_arg {
-    name: "alpha"
-    type_attr: "T"
+    name: "tensor_names"
+    type: DT_STRING
   }
   input_arg {
-    name: "l1"
-    type_attr: "T"
+    name: "shape_and_slices"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensors"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
+  is_stateful: true
+}
+op {
+  name: "Reverse"
   input_arg {
-    name: "l2"
+    name: "tensor"
     type_attr: "T"
   }
   input_arg {
-    name: "delta"
+    name: "dims"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -27603,66 +38982,32 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
         type: DT_INT8
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyRMSProp"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "ms"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "mom"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "rho"
-    type_attr: "T"
-  }
+  name: "Reverse"
   input_arg {
-    name: "momentum"
+    name: "tensor"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
-    type_attr: "T"
+    name: "dims"
+    type: DT_BOOL
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -27670,64 +39015,33 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
         type: DT_INT8
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
+        type: DT_STRING
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyRMSProp"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "ms"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "mom"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "rho"
-    type_attr: "T"
-  }
+  name: "Reverse"
   input_arg {
-    name: "momentum"
+    name: "tensor"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
-    type_attr: "T"
+    name: "dims"
+    type: DT_BOOL
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -27735,51 +39049,58 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
         type: DT_UINT8
+        type: DT_INT8
         type: DT_UINT16
         type: DT_INT16
-        type: DT_INT8
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_STRING
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceCountUpTo"
+  name: "ReverseSequence"
   input_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seq_lengths"
+    type_attr: "Tlen"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "limit"
+    name: "seq_dim"
+    type: "int"
+  }
+  attr {
+    name: "batch_dim"
     type: "int"
+    default_value {
+      i: 0
+    }
   }
   attr {
     name: "T"
     type: "type"
+  }
+  attr {
+    name: "Tlen"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
     allowed_values {
       list {
         type: DT_INT32
@@ -27787,626 +39108,546 @@ op {
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceGather"
+  name: "ReverseV2"
   input_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "tensor"
+    type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "axis"
+    type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    type_attr: "dtype"
+    type_attr: "T"
   }
   attr {
-    name: "validate_indices"
-    type: "bool"
+    name: "Tidx"
+    type: "type"
     default_value {
-      b: true
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "dtype"
-    type: "type"
-  }
-  attr {
-    name: "Tindices"
+    name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_UINT8
+        type: DT_INT8
         type: DT_INT32
         type: DT_INT64
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceScatterAdd"
+  name: "ReverseV2"
   input_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "tensor"
+    type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "axis"
+    type_attr: "Tidx"
   }
-  input_arg {
-    name: "updates"
-    type_attr: "dtype"
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "dtype"
+    name: "Tidx"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "Tindices"
+    name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_UINT8
+        type: DT_INT8
         type: DT_INT32
         type: DT_INT64
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceScatterAdd"
+  name: "ReverseV2"
   input_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "tensor"
+    type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "axis"
+    type_attr: "Tidx"
   }
-  input_arg {
-    name: "updates"
-    type_attr: "dtype"
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "dtype"
+    name: "Tidx"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "Tindices"
+    name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceScatterUpdate"
+  name: "ReverseV2"
   input_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "tensor"
+    type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "axis"
+    type_attr: "Tidx"
   }
-  input_arg {
-    name: "updates"
-    type_attr: "dtype"
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "dtype"
+    name: "Tidx"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "Tindices"
+    name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdadelta"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum_update"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
+  name: "RightShift"
   input_arg {
-    name: "rho"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "y"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "z"
     type_attr: "T"
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
+        type: DT_INT8
+        type: DT_INT16
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
         type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
+  is_commutative: true
+}
+op {
+  name: "Rint"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
   attr {
-    name: "Tindices"
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdadelta"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum_update"
-    type: DT_RESOURCE
-  }
+  name: "Rint"
   input_arg {
-    name: "lr"
+    name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "rho"
+  output_arg {
+    name: "y"
     type_attr: "T"
   }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Round"
   input_arg {
-    name: "epsilon"
+    name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "y"
     type_attr: "T"
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
+        type: DT_INT64
         type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
+}
+op {
+  name: "Round"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
   attr {
-    name: "Tindices"
+    name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdagrad"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
+  name: "Rsqrt"
   input_arg {
-    name: "lr"
+    name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "y"
     type_attr: "T"
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
+}
+op {
+  name: "Rsqrt"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
   attr {
-    name: "Tindices"
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdagrad"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
+  name: "RsqrtGrad"
   input_arg {
-    name: "lr"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "y"
     type_attr: "T"
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+  output_arg {
+    name: "z"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdagradDA"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "gradient_accumulator"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "gradient_squared_accumulator"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
+  name: "RsqrtGrad"
   input_arg {
-    name: "lr"
+    name: "y"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "dy"
     type_attr: "T"
   }
-  input_arg {
-    name: "l2"
+  output_arg {
+    name: "z"
     type_attr: "T"
   }
-  input_arg {
-    name: "global_step"
-    type: DT_INT64
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
+}
+op {
+  name: "RsqrtGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
   attr {
-    name: "Tindices"
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdagradDA"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "gradient_accumulator"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "gradient_squared_accumulator"
-    type: DT_RESOURCE
-  }
+  name: "SampleDistortedBoundingBox"
   input_arg {
-    name: "grad"
+    name: "image_size"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "bounding_boxes"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "l1"
+  output_arg {
+    name: "begin"
     type_attr: "T"
   }
-  input_arg {
-    name: "l2"
+  output_arg {
+    name: "size"
     type_attr: "T"
   }
-  input_arg {
-    name: "global_step"
-    type: DT_INT64
+  output_arg {
+    name: "bboxes"
+    type: DT_FLOAT
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "min_object_covered"
+    type: "float"
+    default_value {
+      f: 0.1
+    }
+  }
+  attr {
+    name: "aspect_ratio_range"
+    type: "list(float)"
+    default_value {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        f: 0.75
+        f: 1.33
       }
     }
   }
   attr {
-    name: "use_locking"
+    name: "area_range"
+    type: "list(float)"
+    default_value {
+      list {
+        f: 0.05
+        f: 1
+      }
+    }
+  }
+  attr {
+    name: "max_attempts"
+    type: "int"
+    default_value {
+      i: 100
+    }
+  }
+  attr {
+    name: "use_image_if_no_bounding_boxes"
     type: "bool"
     default_value {
       b: false
@@ -28415,81 +39656,87 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyCenteredRMSProp"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "mg"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "ms"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "mom"
-    type: DT_RESOURCE
-  }
+  name: "SampleDistortedBoundingBoxV2"
   input_arg {
-    name: "lr"
+    name: "image_size"
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
-    type_attr: "T"
+    name: "bounding_boxes"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "momentum"
-    type_attr: "T"
+    name: "min_object_covered"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "epsilon"
+  output_arg {
+    name: "begin"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "size"
     type_attr: "T"
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+  output_arg {
+    name: "bboxes"
+    type: DT_FLOAT
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "aspect_ratio_range"
+    type: "list(float)"
+    default_value {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        f: 0.75
+        f: 1.33
       }
     }
   }
   attr {
-    name: "use_locking"
+    name: "area_range"
+    type: "list(float)"
+    default_value {
+      list {
+        f: 0.05
+        f: 1
+      }
+    }
+  }
+  attr {
+    name: "max_attempts"
+    type: "int"
+    default_value {
+      i: 100
+    }
+  }
+  attr {
+    name: "use_image_if_no_bounding_boxes"
     type: "bool"
     default_value {
       b: false
@@ -28498,128 +39745,164 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyCenteredRMSProp"
+  name: "Save"
   input_arg {
-    name: "var"
-    type: DT_RESOURCE
+    name: "filename"
+    type: DT_STRING
   }
   input_arg {
-    name: "mg"
-    type: DT_RESOURCE
+    name: "tensor_names"
+    type: DT_STRING
   }
   input_arg {
-    name: "ms"
-    type: DT_RESOURCE
+    name: "data"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
+}
+op {
+  name: "Save"
   input_arg {
-    name: "mom"
-    type: DT_RESOURCE
+    name: "filename"
+    type: DT_STRING
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "tensor_names"
+    type: DT_STRING
   }
   input_arg {
-    name: "rho"
-    type_attr: "T"
+    name: "data"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "SaveSlices"
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shapes_and_slices"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "data"
+    type_list_attr: "T"
   }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "SaveSlices"
   input_arg {
-    name: "momentum"
-    type_attr: "T"
+    name: "filename"
+    type: DT_STRING
   }
   input_arg {
-    name: "epsilon"
-    type_attr: "T"
+    name: "tensor_names"
+    type: DT_STRING
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "shapes_and_slices"
+    type: DT_STRING
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "data"
+    type_list_attr: "T"
   }
   attr {
     name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyFtrl"
+  name: "SaveV2"
   input_arg {
-    name: "var"
-    type: DT_RESOURCE
+    name: "prefix"
+    type: DT_STRING
   }
   input_arg {
-    name: "accum"
-    type: DT_RESOURCE
+    name: "tensor_names"
+    type: DT_STRING
   }
   input_arg {
-    name: "linear"
-    type: DT_RESOURCE
+    name: "shape_and_slices"
+    type: DT_STRING
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "tensors"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
+}
+op {
+  name: "SaveV2"
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "prefix"
+    type: DT_STRING
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "tensor_names"
+    type: DT_STRING
   }
   input_arg {
-    name: "l1"
-    type_attr: "T"
+    name: "shape_and_slices"
+    type: DT_STRING
   }
   input_arg {
-    name: "l2"
-    type_attr: "T"
+    name: "tensors"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
+  is_stateful: true
+}
+op {
+  name: "ScalarSummary"
   input_arg {
-    name: "lr_power"
+    name: "tags"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "values"
     type_attr: "T"
   }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
   attr {
     name: "T"
     type: "type"
@@ -28627,77 +39910,30 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
       }
     }
   }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyFtrl"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "linear"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
+  name: "ScalarSummary"
   input_arg {
-    name: "l1"
-    type_attr: "T"
+    name: "tags"
+    type: DT_STRING
   }
   input_arg {
-    name: "l2"
+    name: "values"
     type_attr: "T"
   }
-  input_arg {
-    name: "lr_power"
-    type_attr: "T"
+  output_arg {
+    name: "summary"
+    type: DT_STRING
   }
   attr {
     name: "T"
@@ -28706,83 +39942,32 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyFtrlV2"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "linear"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
+  name: "ScalarSummary"
   input_arg {
-    name: "l2"
-    type_attr: "T"
+    name: "tags"
+    type: DT_STRING
   }
   input_arg {
-    name: "l2_shrinkage"
+    name: "values"
     type_attr: "T"
   }
-  input_arg {
-    name: "lr_power"
-    type_attr: "T"
+  output_arg {
+    name: "summary"
+    type: DT_STRING
   }
   attr {
     name: "T"
@@ -28791,81 +39976,85 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyFtrlV2"
+  name: "ScanDataset"
   input_arg {
-    name: "var"
-    type: DT_RESOURCE
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "accum"
-    type: DT_RESOURCE
+    name: "initial_state"
+    type_list_attr: "Tstate"
   }
   input_arg {
-    name: "linear"
-    type: DT_RESOURCE
+    name: "other_arguments"
+    type_list_attr: "Targuments"
   }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+  attr {
+    name: "f"
+    type: "func"
   }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
+  attr {
+    name: "Tstate"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
+}
+op {
+  name: "ScatterAdd"
   input_arg {
-    name: "l1"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "l2"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "l2_shrinkage"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "lr_power"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -28886,8 +40075,6 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -28908,33 +40095,26 @@ op {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyMomentum"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
+  name: "ScatterAdd"
   input_arg {
-    name: "grad"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
     name: "indices"
     type_attr: "Tindices"
   }
   input_arg {
-    name: "momentum"
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -28955,6 +40135,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -28975,40 +40157,26 @@ op {
       b: false
     }
   }
-  attr {
-    name: "use_nesterov"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyMomentum"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
+  name: "ScatterAdd"
   input_arg {
-    name: "grad"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
     name: "indices"
     type_attr: "Tindices"
   }
   input_arg {
-    name: "momentum"
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -29031,6 +40199,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -29051,44 +40220,26 @@ op {
       b: false
     }
   }
-  attr {
-    name: "use_nesterov"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyProximalAdagrad"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
+  name: "ScatterDiv"
   input_arg {
-    name: "lr"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "l1"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "l2"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -29129,37 +40280,26 @@ op {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyProximalAdagrad"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
+  name: "ScatterDiv"
   input_arg {
-    name: "lr"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "l1"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "l2"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -29202,33 +40342,26 @@ op {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyProximalGradientDescent"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "alpha"
-    type_attr: "T"
-  }
+  name: "ScatterDiv"
   input_arg {
-    name: "l1"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "l2"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "grad"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -29249,6 +40382,9 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -29269,33 +40405,26 @@ op {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyProximalGradientDescent"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
+  name: "ScatterMul"
   input_arg {
-    name: "alpha"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "l1"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "l2"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -29316,8 +40445,6 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -29338,45 +40465,26 @@ op {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyRMSProp"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "ms"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "mom"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
+  name: "ScatterMul"
   input_arg {
-    name: "rho"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "momentum"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "epsilon"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -29397,6 +40505,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -29417,45 +40527,26 @@ op {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyRMSProp"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "ms"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "mom"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
+  name: "ScatterMul"
   input_arg {
-    name: "rho"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "momentum"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "epsilon"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -29478,6 +40569,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -29498,28 +40590,23 @@ op {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceStridedSliceAssign"
-  input_arg {
-    name: "ref"
-    type: DT_RESOURCE
-  }
+  name: "ScatterNd"
   input_arg {
-    name: "begin"
-    type_attr: "Index"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "end"
-    type_attr: "Index"
+    name: "updates"
+    type_attr: "T"
   }
   input_arg {
-    name: "strides"
-    type_attr: "Index"
+    name: "shape"
+    type_attr: "Tindices"
   }
-  input_arg {
-    name: "value"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -29527,7 +40614,7 @@ op {
     type: "type"
   }
   attr {
-    name: "Index"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
@@ -29536,284 +40623,205 @@ op {
       }
     }
   }
-  attr {
-    name: "begin_mask"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "end_mask"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "ellipsis_mask"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "new_axis_mask"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "shrink_axis_mask"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "Restore"
+  name: "ScatterNdAdd"
   input_arg {
-    name: "file_pattern"
-    type: DT_STRING
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "tensor_name"
-    type: DT_STRING
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
   }
   output_arg {
-    name: "tensor"
-    type_attr: "dt"
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
-    name: "dt"
+    name: "T"
     type: "type"
-  }
-  attr {
-    name: "preferred_shard"
-    type: "int"
-    default_value {
-      i: -1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
     }
   }
-}
-op {
-  name: "Restore"
-  input_arg {
-    name: "file_pattern"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "tensor_name"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "tensor"
-    type_attr: "dt"
-  }
   attr {
-    name: "dt"
+    name: "Tindices"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
   attr {
-    name: "preferred_shard"
-    type: "int"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      i: -1
+      b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "RestoreSlice"
+  name: "ScatterNdAdd"
   input_arg {
-    name: "file_pattern"
-    type: DT_STRING
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "tensor_name"
-    type: DT_STRING
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "shape_and_slice"
-    type: DT_STRING
+    name: "updates"
+    type_attr: "T"
   }
   output_arg {
-    name: "tensor"
-    type_attr: "dt"
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
-    name: "dt"
+    name: "T"
     type: "type"
-  }
-  attr {
-    name: "preferred_shard"
-    type: "int"
-    default_value {
-      i: -1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
     }
   }
-}
-op {
-  name: "RestoreSlice"
-  input_arg {
-    name: "file_pattern"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "tensor_name"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "shape_and_slice"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "tensor"
-    type_attr: "dt"
-  }
   attr {
-    name: "dt"
+    name: "Tindices"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
   attr {
-    name: "preferred_shard"
-    type: "int"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      i: -1
+      b: false
     }
   }
-  is_stateful: true
-}
-op {
-  name: "RestoreV2"
-  input_arg {
-    name: "prefix"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "tensor_names"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "shape_and_slices"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "tensors"
-    type_list_attr: "dtypes"
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
 }
 op {
-  name: "RestoreV2"
-  input_arg {
-    name: "prefix"
-    type: DT_STRING
-  }
+  name: "ScatterNdAdd"
   input_arg {
-    name: "tensor_names"
-    type: DT_STRING
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "shape_and_slices"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "tensors"
-    type_list_attr: "dtypes"
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "indices"
+    type_attr: "Tindices"
   }
-  is_stateful: true
-}
-op {
-  name: "Reverse"
   input_arg {
-    name: "tensor"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "dims"
-    type: DT_BOOL
-  }
   output_arg {
-    name: "output"
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_BOOL
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
-}
-op {
-  name: "Reverse"
-  input_arg {
-    name: "tensor"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "dims"
-    type: DT_BOOL
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_INT8
         type: DT_INT32
         type: DT_INT64
-        type: DT_BOOL
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_STRING
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "Reverse"
+  name: "ScatterNdNonAliasingAdd"
   input_arg {
-    name: "tensor"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "dims"
-    type: DT_BOOL
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
@@ -29824,58 +40832,26 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
-        type: DT_INT8
         type: DT_UINT16
         type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_BOOL
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_STRING
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
-}
-op {
-  name: "ReverseSequence"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "seq_lengths"
-    type_attr: "Tlen"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "seq_dim"
-    type: "int"
-  }
-  attr {
-    name: "batch_dim"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
   attr {
-    name: "Tlen"
+    name: "Tindices"
     type: "type"
-    default_value {
-      type: DT_INT64
-    }
     allowed_values {
       list {
         type: DT_INT32
@@ -29885,645 +40861,730 @@ op {
   }
 }
 op {
-  name: "ReverseV2"
+  name: "ScatterNdNonAliasingAdd"
   input_arg {
-    name: "tensor"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "axis"
-    type_attr: "Tidx"
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "Tidx"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "T"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_INT8
         type: DT_INT32
         type: DT_INT64
-        type: DT_BOOL
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "ReverseV2"
+  name: "ScatterNdNonAliasingAdd"
   input_arg {
-    name: "tensor"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "axis"
-    type_attr: "Tidx"
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "Tidx"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "T"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_INT8
         type: DT_INT32
         type: DT_INT64
-        type: DT_BOOL
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_STRING
       }
     }
   }
 }
 op {
-  name: "ReverseV2"
+  name: "ScatterNdSub"
   input_arg {
-    name: "tensor"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "axis"
-    type_attr: "Tidx"
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
-    name: "Tidx"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "T"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_BOOL
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_STRING
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "RightShift"
+  name: "ScatterNdSub"
   input_arg {
-    name: "x"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "y"
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
         type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-  is_commutative: true
-}
-op {
-  name: "Rint"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "Round"
+  name: "ScatterNdSub"
   input_arg {
-    name: "x"
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
-}
-op {
-  name: "Rsqrt"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "RsqrtGrad"
+  name: "ScatterNdUpdate"
   input_arg {
-    name: "x"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "y"
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
 }
 op {
-  name: "RsqrtGrad"
+  name: "ScatterSub"
   input_arg {
-    name: "y"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "dy"
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "SampleDistortedBoundingBox"
+  name: "ScatterSub"
   input_arg {
-    name: "image_size"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "bounding_boxes"
-    type: DT_FLOAT
+    name: "indices"
+    type_attr: "Tindices"
   }
-  output_arg {
-    name: "begin"
+  input_arg {
+    name: "updates"
     type_attr: "T"
   }
   output_arg {
-    name: "size"
+    name: "output_ref"
     type_attr: "T"
-  }
-  output_arg {
-    name: "bboxes"
-    type: DT_FLOAT
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
-        type: DT_INT8
+        type: DT_UINT16
         type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "min_object_covered"
-    type: "float"
-    default_value {
-      f: 0.1
-    }
-  }
-  attr {
-    name: "aspect_ratio_range"
-    type: "list(float)"
-    default_value {
-      list {
-        f: 0.75
-        f: 1.33
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "area_range"
-    type: "list(float)"
-    default_value {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
       list {
-        f: 0.05
-        f: 1
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "max_attempts"
-    type: "int"
-    default_value {
-      i: 100
-    }
-  }
-  attr {
-    name: "use_image_if_no_bounding_boxes"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "SampleDistortedBoundingBoxV2"
+  name: "ScatterSub"
   input_arg {
-    name: "image_size"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "bounding_boxes"
-    type: DT_FLOAT
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "min_object_covered"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "begin"
+    name: "updates"
     type_attr: "T"
   }
   output_arg {
-    name: "size"
+    name: "output_ref"
     type_attr: "T"
-  }
-  output_arg {
-    name: "bboxes"
-    type: DT_FLOAT
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
-        type: DT_INT8
+        type: DT_UINT16
         type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
   }
-  attr {
-    name: "aspect_ratio_range"
-    type: "list(float)"
-    default_value {
-      list {
-        f: 0.75
-        f: 1.33
-      }
-    }
+}
+op {
+  name: "ScatterUpdate"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
-    name: "area_range"
-    type: "list(float)"
-    default_value {
-      list {
-        f: 0.05
-        f: 1
-      }
-    }
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "max_attempts"
-    type: "int"
-    default_value {
-      i: 100
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "use_image_if_no_bounding_boxes"
+    name: "use_locking"
     type: "bool"
     default_value {
-      b: false
+      b: true
     }
   }
-  is_stateful: true
 }
 op {
-  name: "Save"
-  input_arg {
-    name: "filename"
-    type: DT_STRING
-  }
+  name: "SdcaFprint"
   input_arg {
-    name: "tensor_names"
+    name: "input"
     type: DT_STRING
   }
-  input_arg {
-    name: "data"
-    type_list_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "output"
+    type: DT_INT64
   }
 }
 op {
-  name: "Save"
+  name: "SdcaOptimizer"
   input_arg {
-    name: "filename"
-    type: DT_STRING
+    name: "sparse_example_indices"
+    type: DT_INT64
+    number_attr: "num_sparse_features"
   }
   input_arg {
-    name: "tensor_names"
-    type: DT_STRING
+    name: "sparse_feature_indices"
+    type: DT_INT64
+    number_attr: "num_sparse_features"
   }
   input_arg {
-    name: "data"
-    type_list_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "sparse_feature_values"
+    type: DT_FLOAT
+    number_attr: "num_sparse_features_with_values"
   }
-  is_stateful: true
-}
-op {
-  name: "SaveSlices"
   input_arg {
-    name: "filename"
-    type: DT_STRING
+    name: "dense_features"
+    type: DT_FLOAT
+    number_attr: "num_dense_features"
   }
   input_arg {
-    name: "tensor_names"
-    type: DT_STRING
+    name: "example_weights"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "shapes_and_slices"
-    type: DT_STRING
+    name: "example_labels"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "data"
-    type_list_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "sparse_indices"
+    type: DT_INT64
+    number_attr: "num_sparse_features"
   }
-}
-op {
-  name: "SaveSlices"
   input_arg {
-    name: "filename"
-    type: DT_STRING
+    name: "sparse_weights"
+    type: DT_FLOAT
+    number_attr: "num_sparse_features"
   }
   input_arg {
-    name: "tensor_names"
-    type: DT_STRING
+    name: "dense_weights"
+    type: DT_FLOAT
+    number_attr: "num_dense_features"
   }
   input_arg {
-    name: "shapes_and_slices"
-    type: DT_STRING
+    name: "example_state_data"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "data"
-    type_list_attr: "T"
+  output_arg {
+    name: "out_example_state_data"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "out_delta_sparse_weights"
+    type: DT_FLOAT
+    number_attr: "num_sparse_features"
+  }
+  output_arg {
+    name: "out_delta_dense_weights"
+    type: DT_FLOAT
+    number_attr: "num_dense_features"
   }
   attr {
-    name: "T"
-    type: "list(type)"
+    name: "loss_type"
+    type: "string"
+    allowed_values {
+      list {
+        s: "logistic_loss"
+        s: "squared_loss"
+        s: "hinge_loss"
+        s: "smooth_hinge_loss"
+      }
+    }
+  }
+  attr {
+    name: "adaptative"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "num_sparse_features"
+    type: "int"
     has_minimum: true
-    minimum: 1
   }
-  is_stateful: true
-}
-op {
-  name: "SaveV2"
-  input_arg {
-    name: "prefix"
-    type: DT_STRING
+  attr {
+    name: "num_sparse_features_with_values"
+    type: "int"
+    has_minimum: true
   }
-  input_arg {
-    name: "tensor_names"
-    type: DT_STRING
+  attr {
+    name: "num_dense_features"
+    type: "int"
+    has_minimum: true
   }
-  input_arg {
-    name: "shape_and_slices"
-    type: DT_STRING
+  attr {
+    name: "l1"
+    type: "float"
   }
-  input_arg {
-    name: "tensors"
-    type_list_attr: "dtypes"
+  attr {
+    name: "l2"
+    type: "float"
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
+    name: "num_loss_partitions"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_inner_iterations"
+    type: "int"
     has_minimum: true
     minimum: 1
   }
 }
 op {
-  name: "SaveV2"
-  input_arg {
-    name: "prefix"
-    type: DT_STRING
-  }
+  name: "SdcaShrinkL1"
   input_arg {
-    name: "tensor_names"
-    type: DT_STRING
+    name: "weights"
+    type: DT_FLOAT
+    number_attr: "num_features"
+    is_ref: true
   }
-  input_arg {
-    name: "shape_and_slices"
-    type: DT_STRING
+  attr {
+    name: "num_features"
+    type: "int"
+    has_minimum: true
   }
-  input_arg {
-    name: "tensors"
-    type_list_attr: "dtypes"
+  attr {
+    name: "l1"
+    type: "float"
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "l2"
+    type: "float"
   }
-  is_stateful: true
 }
 op {
-  name: "ScalarSummary"
+  name: "SegmentMax"
   input_arg {
-    name: "tags"
-    type: DT_STRING
+    name: "data"
+    type_attr: "T"
   }
   input_arg {
-    name: "values"
-    type_attr: "T"
+    name: "segment_ids"
+    type_attr: "Tindices"
   }
   output_arg {
-    name: "summary"
-    type: DT_STRING
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -30542,20 +41603,30 @@ op {
       }
     }
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
-  name: "ScalarSummary"
+  name: "SegmentMax"
   input_arg {
-    name: "tags"
-    type: DT_STRING
+    name: "data"
+    type_attr: "T"
   }
   input_arg {
-    name: "values"
-    type_attr: "T"
+    name: "segment_ids"
+    type_attr: "Tindices"
   }
   output_arg {
-    name: "summary"
-    type: DT_STRING
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -30576,72 +41647,30 @@ op {
       }
     }
   }
-}
-op {
-  name: "ScanDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "initial_state"
-    type_list_attr: "Tstate"
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Tstate"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
 }
 op {
-  name: "ScatterAdd"
+  name: "SegmentMax"
   input_arg {
-    name: "ref"
+    name: "data"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "updates"
-    type_attr: "T"
-  }
   output_arg {
-    name: "output_ref"
+    name: "output"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
@@ -30650,18 +41679,16 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -30675,33 +41702,20 @@ op {
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ScatterAdd"
+  name: "SegmentMean"
   input_arg {
-    name: "ref"
+    name: "data"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "updates"
-    type_attr: "T"
-  }
   output_arg {
-    name: "output_ref"
+    name: "output"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
@@ -30710,20 +41724,13 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -30737,33 +41744,20 @@ op {
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ScatterDiv"
+  name: "SegmentMean"
   input_arg {
-    name: "ref"
+    name: "data"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "updates"
-    type_attr: "T"
-  }
   output_arg {
-    name: "output_ref"
+    name: "output"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
@@ -30772,18 +41766,15 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -30797,33 +41788,20 @@ op {
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ScatterDiv"
+  name: "SegmentMean"
   input_arg {
-    name: "ref"
+    name: "data"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "updates"
-    type_attr: "T"
-  }
   output_arg {
-    name: "output_ref"
+    name: "output"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
@@ -30832,20 +41810,16 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -30859,33 +41833,20 @@ op {
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ScatterMul"
+  name: "SegmentMin"
   input_arg {
-    name: "ref"
+    name: "data"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "updates"
-    type_attr: "T"
-  }
   output_arg {
-    name: "output_ref"
+    name: "output"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
@@ -30894,17 +41855,12 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
       }
     }
@@ -30919,33 +41875,20 @@ op {
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ScatterMul"
+  name: "SegmentMin"
   input_arg {
-    name: "ref"
+    name: "data"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "updates"
-    type_attr: "T"
-  }
   output_arg {
-    name: "output_ref"
+    name: "output"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
@@ -30954,17 +41897,12 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -30981,26 +41919,15 @@ op {
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ScatterNd"
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
+  name: "SegmentMin"
   input_arg {
-    name: "updates"
+    name: "data"
     type_attr: "T"
   }
   input_arg {
-    name: "shape"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
   output_arg {
@@ -31010,6 +41937,22 @@ op {
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
   attr {
     name: "Tindices"
@@ -31023,24 +41966,18 @@ op {
   }
 }
 op {
-  name: "ScatterNdAdd"
+  name: "SegmentProd"
   input_arg {
-    name: "ref"
+    name: "data"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "updates"
-    type_attr: "T"
-  }
   output_arg {
-    name: "output_ref"
+    name: "output"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
@@ -31074,33 +42011,20 @@ op {
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ScatterNdAdd"
+  name: "SegmentProd"
   input_arg {
-    name: "ref"
+    name: "data"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "updates"
-    type_attr: "T"
-  }
   output_arg {
-    name: "output_ref"
+    name: "output"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
@@ -31136,28 +42060,17 @@ op {
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ScatterNdNonAliasingAdd"
+  name: "SegmentProd"
   input_arg {
-    name: "input"
+    name: "data"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "updates"
-    type_attr: "T"
-  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -31181,6 +42094,9 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -31196,19 +42112,15 @@ op {
   }
 }
 op {
-  name: "ScatterNdNonAliasingAdd"
+  name: "SegmentSum"
   input_arg {
-    name: "input"
+    name: "data"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "updates"
-    type_attr: "T"
-  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -31232,8 +42144,6 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -31249,24 +42159,18 @@ op {
   }
 }
 op {
-  name: "ScatterNdSub"
+  name: "SegmentSum"
   input_arg {
-    name: "ref"
+    name: "data"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "updates"
-    type_attr: "T"
-  }
   output_arg {
-    name: "output_ref"
+    name: "output"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
@@ -31287,6 +42191,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -31300,33 +42206,20 @@ op {
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ScatterNdSub"
+  name: "SegmentSum"
   input_arg {
-    name: "ref"
+    name: "data"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "updates"
-    type_attr: "T"
-  }
   output_arg {
-    name: "output_ref"
+    name: "output"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
@@ -31349,6 +42242,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -31362,205 +42256,460 @@ op {
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ScatterNdUpdate"
+  name: "Select"
   input_arg {
-    name: "ref"
-    type_attr: "T"
-    is_ref: true
+    name: "condition"
+    type: DT_BOOL
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "t"
+    type_attr: "T"
   }
   input_arg {
-    name: "updates"
+    name: "e"
     type_attr: "T"
   }
   output_arg {
-    name: "output_ref"
+    name: "output"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
   }
+}
+op {
+  name: "SelfAdjointEig"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
-    name: "Tindices"
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_DOUBLE
+        type: DT_FLOAT
       }
     }
   }
+  deprecation {
+    version: 11
+  }
+}
+op {
+  name: "SelfAdjointEigV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "e"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    type_attr: "T"
+  }
   attr {
-    name: "use_locking"
+    name: "compute_v"
     type: "bool"
     default_value {
       b: true
     }
   }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
 }
 op {
-  name: "ScatterSub"
+  name: "SelfAdjointEigV2"
   input_arg {
-    name: "ref"
+    name: "input"
     type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
   }
-  input_arg {
-    name: "updates"
+  output_arg {
+    name: "e"
     type_attr: "T"
   }
   output_arg {
-    name: "output_ref"
+    name: "v"
     type_attr: "T"
-    is_ref: true
+  }
+  attr {
+    name: "compute_v"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
+        type: DT_FLOAT
         type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
+}
+op {
+  name: "Selu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
   attr {
-    name: "Tindices"
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
+}
+op {
+  name: "Selu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
   attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
   }
 }
 op {
-  name: "ScatterSub"
+  name: "SeluGrad"
   input_arg {
-    name: "ref"
+    name: "gradients"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "outputs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "SeluGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
   }
   input_arg {
-    name: "updates"
+    name: "outputs"
     type_attr: "T"
   }
   output_arg {
-    name: "output_ref"
+    name: "backprops"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
+}
+op {
+  name: "SerializeIterator"
+  input_arg {
+    name: "resource_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "serialized"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "SerializeManySparse"
+  input_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sparse_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sparse_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "serialized_sparse"
+    type: DT_STRING
+  }
   attr {
-    name: "Tindices"
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "SerializeManySparse"
+  input_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sparse_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sparse_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "serialized_sparse"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "out_type"
     type: "type"
+    default_value {
+      type: DT_STRING
+    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_STRING
+        type: DT_VARIANT
+      }
+    }
+  }
+}
+op {
+  name: "SerializeSparse"
+  input_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sparse_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sparse_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "serialized_sparse"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "SerializeSparse"
+  input_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sparse_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sparse_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "serialized_sparse"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_STRING
+    }
+    allowed_values {
+      list {
+        type: DT_STRING
+        type: DT_VARIANT
       }
     }
   }
+}
+op {
+  name: "SerializeTensor"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
   attr {
-    name: "use_locking"
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "SetSize"
+  input_arg {
+    name: "set_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "set_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "set_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  attr {
+    name: "validate_indices"
     type: "bool"
     default_value {
-      b: false
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_STRING
+      }
     }
   }
 }
 op {
-  name: "ScatterUpdate"
+  name: "Shape"
   input_arg {
-    name: "ref"
+    name: "input"
     type_attr: "T"
-    is_ref: true
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
+}
+op {
+  name: "ShapeN"
   input_arg {
-    name: "updates"
+    name: "input"
     type_attr: "T"
+    number_attr: "N"
   }
   output_arg {
-    name: "output_ref"
-    type_attr: "T"
-    is_ref: true
+    name: "output"
+    type_attr: "out_type"
+    number_attr: "N"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
   attr {
     name: "T"
     type: "type"
   }
   attr {
-    name: "Tindices"
+    name: "out_type"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
         type: DT_INT32
@@ -31568,177 +42717,201 @@ op {
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
 }
 op {
-  name: "SdcaFprint"
+  name: "ShardedFilename"
   input_arg {
-    name: "input"
+    name: "basename"
     type: DT_STRING
   }
+  input_arg {
+    name: "shard"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "num_shards"
+    type: DT_INT32
+  }
   output_arg {
-    name: "output"
-    type: DT_INT64
+    name: "filename"
+    type: DT_STRING
   }
 }
 op {
-  name: "SdcaOptimizer"
+  name: "ShardedFilespec"
   input_arg {
-    name: "sparse_example_indices"
-    type: DT_INT64
-    number_attr: "num_sparse_features"
+    name: "basename"
+    type: DT_STRING
   }
   input_arg {
-    name: "sparse_feature_indices"
-    type: DT_INT64
-    number_attr: "num_sparse_features"
+    name: "num_shards"
+    type: DT_INT32
   }
-  input_arg {
-    name: "sparse_feature_values"
-    type: DT_FLOAT
-    number_attr: "num_sparse_features_with_values"
+  output_arg {
+    name: "filename"
+    type: DT_STRING
   }
+}
+op {
+  name: "ShuffleAndRepeatDataset"
   input_arg {
-    name: "dense_features"
-    type: DT_FLOAT
-    number_attr: "num_dense_features"
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "example_weights"
-    type: DT_FLOAT
+    name: "buffer_size"
+    type: DT_INT64
   }
   input_arg {
-    name: "example_labels"
-    type: DT_FLOAT
+    name: "seed"
+    type: DT_INT64
   }
   input_arg {
-    name: "sparse_indices"
+    name: "seed2"
     type: DT_INT64
-    number_attr: "num_sparse_features"
   }
   input_arg {
-    name: "sparse_weights"
-    type: DT_FLOAT
-    number_attr: "num_sparse_features"
+    name: "count"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ShuffleDataset"
   input_arg {
-    name: "dense_weights"
-    type: DT_FLOAT
-    number_attr: "num_dense_features"
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "example_state_data"
-    type: DT_FLOAT
+    name: "buffer_size"
+    type: DT_INT64
   }
-  output_arg {
-    name: "out_example_state_data"
-    type: DT_FLOAT
+  input_arg {
+    name: "seed"
+    type: DT_INT64
   }
-  output_arg {
-    name: "out_delta_sparse_weights"
-    type: DT_FLOAT
-    number_attr: "num_sparse_features"
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
   }
   output_arg {
-    name: "out_delta_dense_weights"
-    type: DT_FLOAT
-    number_attr: "num_dense_features"
-  }
-  attr {
-    name: "loss_type"
-    type: "string"
-    allowed_values {
-      list {
-        s: "logistic_loss"
-        s: "squared_loss"
-        s: "hinge_loss"
-        s: "smooth_hinge_loss"
-      }
-    }
-  }
-  attr {
-    name: "adaptative"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "num_sparse_features"
-    type: "int"
+    name: "output_types"
+    type: "list(type)"
     has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "num_sparse_features_with_values"
-    type: "int"
+    name: "output_shapes"
+    type: "list(shape)"
     has_minimum: true
+    minimum: 1
   }
-  attr {
-    name: "num_dense_features"
-    type: "int"
-    has_minimum: true
+  is_stateful: true
+}
+op {
+  name: "ShuffleDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
   }
-  attr {
-    name: "l1"
-    type: "float"
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
   }
-  attr {
-    name: "l2"
-    type: "float"
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "num_loss_partitions"
-    type: "int"
+    name: "output_types"
+    type: "list(type)"
     has_minimum: true
     minimum: 1
   }
   attr {
-    name: "num_inner_iterations"
-    type: "int"
+    name: "output_shapes"
+    type: "list(shape)"
     has_minimum: true
     minimum: 1
   }
 }
 op {
-  name: "SdcaShrinkL1"
+  name: "ShuffleDataset"
   input_arg {
-    name: "weights"
-    type: DT_FLOAT
-    number_attr: "num_features"
-    is_ref: true
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "num_features"
-    type: "int"
-    has_minimum: true
+    name: "reshuffle_each_iteration"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
-    name: "l1"
-    type: "float"
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "l2"
-    type: "float"
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
 }
 op {
-  name: "SegmentMax"
+  name: "Sigmoid"
   input_arg {
-    name: "data"
+    name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "segment_ids"
-    type_attr: "Tindices"
-  }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
   attr {
@@ -31746,41 +42919,23 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "SegmentMax"
+  name: "Sigmoid"
   input_arg {
-    name: "data"
+    name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "segment_ids"
-    type_attr: "Tindices"
-  }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
   attr {
@@ -31788,43 +42943,28 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "SegmentMean"
+  name: "SigmoidGrad"
   input_arg {
-    name: "data"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "segment_ids"
-    type_attr: "Tindices"
+    name: "y"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
   attr {
@@ -31832,41 +42972,27 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "SegmentMean"
+  name: "SigmoidGrad"
   input_arg {
-    name: "data"
+    name: "y"
     type_attr: "T"
   }
   input_arg {
-    name: "segment_ids"
-    type_attr: "Tindices"
+    name: "dy"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
   attr {
@@ -31874,43 +43000,27 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "SegmentMin"
+  name: "SigmoidGrad"
   input_arg {
-    name: "data"
+    name: "y"
     type_attr: "T"
   }
   input_arg {
-    name: "segment_ids"
-    type_attr: "Tindices"
+    name: "dy"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
   attr {
@@ -31918,41 +43028,24 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "SegmentMin"
+  name: "Sign"
   input_arg {
-    name: "data"
+    name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "segment_ids"
-    type_attr: "Tindices"
-  }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
   attr {
@@ -31960,43 +43053,52 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
+}
+op {
+  name: "Sign"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
   attr {
-    name: "Tindices"
+    name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "SegmentProd"
+  name: "Sin"
   input_arg {
-    name: "data"
+    name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "segment_ids"
-    type_attr: "Tindices"
-  }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
   attr {
@@ -32004,46 +43106,23 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "SegmentProd"
+  name: "Sin"
   input_arg {
-    name: "data"
+    name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "segment_ids"
-    type_attr: "Tindices"
-  }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
   attr {
@@ -32051,48 +43130,24 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "SegmentSum"
-  input_arg {
-    name: "data"
-    type_attr: "T"
-  }
+  name: "Sinh"
   input_arg {
-    name: "segment_ids"
-    type_attr: "Tindices"
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
   attr {
@@ -32100,46 +43155,23 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "SegmentSum"
+  name: "Sinh"
   input_arg {
-    name: "data"
+    name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "segment_ids"
-    type_attr: "Tindices"
-  }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
   attr {
@@ -32147,28 +43179,36 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
+}
+op {
+  name: "Size"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
   attr {
-    name: "Tindices"
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "out_type"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
         type: DT_INT32
@@ -32178,126 +43218,180 @@ op {
   }
 }
 op {
-  name: "Select"
+  name: "SkipDataset"
   input_arg {
-    name: "condition"
-    type: DT_BOOL
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "t"
-    type_attr: "T"
+    name: "count"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "SkipDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "e"
-    type_attr: "T"
+    name: "count"
+    type: DT_INT64
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
 }
 op {
-  name: "SelfAdjointEig"
-  input_arg {
-    name: "input"
-    type_attr: "T"
+  name: "Skipgram"
+  output_arg {
+    name: "vocab_word"
+    type: DT_STRING
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "vocab_freq"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "words_per_epoch"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "current_epoch"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "total_words_processed"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "examples"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "labels"
+    type: DT_INT32
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-      }
+    name: "filename"
+    type: "string"
+  }
+  attr {
+    name: "batch_size"
+    type: "int"
+  }
+  attr {
+    name: "window_size"
+    type: "int"
+    default_value {
+      i: 5
+    }
+  }
+  attr {
+    name: "min_count"
+    type: "int"
+    default_value {
+      i: 5
+    }
+  }
+  attr {
+    name: "subsample"
+    type: "float"
+    default_value {
+      f: 0.001
     }
   }
   deprecation {
-    version: 11
+    version: 19
   }
+  is_stateful: true
 }
 op {
-  name: "SelfAdjointEigV2"
+  name: "Slice"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  output_arg {
-    name: "e"
-    type_attr: "T"
+  input_arg {
+    name: "begin"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "size"
+    type_attr: "Index"
   }
   output_arg {
-    name: "v"
+    name: "output"
     type_attr: "T"
   }
   attr {
-    name: "compute_v"
-    type: "bool"
-    default_value {
-      b: true
-    }
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "T"
+    name: "Index"
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "SelfAdjointEigV2"
+  name: "Snapshot"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "e"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "v"
+    name: "output"
     type_attr: "T"
   }
-  attr {
-    name: "compute_v"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
   }
 }
 op {
-  name: "Selu"
+  name: "Softmax"
   input_arg {
-    name: "features"
+    name: "logits"
     type_attr: "T"
   }
   output_arg {
-    name: "activations"
+    name: "softmax"
     type_attr: "T"
   }
   attr {
@@ -32313,17 +43407,13 @@ op {
   }
 }
 op {
-  name: "SeluGrad"
-  input_arg {
-    name: "gradients"
-    type_attr: "T"
-  }
+  name: "Softmax"
   input_arg {
-    name: "outputs"
+    name: "logits"
     type_attr: "T"
   }
   output_arg {
-    name: "backprops"
+    name: "softmax"
     type_attr: "T"
   }
   attr {
@@ -32332,6 +43422,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -32339,338 +43430,292 @@ op {
   }
 }
 op {
-  name: "SerializeIterator"
-  input_arg {
-    name: "resource_handle"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "serialized"
-    type: DT_VARIANT
-  }
-  is_stateful: true
-}
-op {
-  name: "SerializeManySparse"
+  name: "SoftmaxCrossEntropyWithLogits"
   input_arg {
-    name: "sparse_indices"
-    type: DT_INT64
+    name: "features"
+    type_attr: "T"
   }
   input_arg {
-    name: "sparse_values"
+    name: "labels"
     type_attr: "T"
   }
-  input_arg {
-    name: "sparse_shape"
-    type: DT_INT64
+  output_arg {
+    name: "loss"
+    type_attr: "T"
   }
   output_arg {
-    name: "serialized_sparse"
-    type: DT_STRING
+    name: "backprop"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
 }
 op {
-  name: "SerializeSparse"
-  input_arg {
-    name: "sparse_indices"
-    type: DT_INT64
-  }
+  name: "SoftmaxCrossEntropyWithLogits"
   input_arg {
-    name: "sparse_values"
+    name: "features"
     type_attr: "T"
   }
   input_arg {
-    name: "sparse_shape"
-    type: DT_INT64
+    name: "labels"
+    type_attr: "T"
   }
   output_arg {
-    name: "serialized_sparse"
-    type: DT_STRING
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-}
-op {
-  name: "SerializeTensor"
-  input_arg {
-    name: "tensor"
+    name: "loss"
     type_attr: "T"
   }
   output_arg {
-    name: "serialized"
-    type: DT_STRING
+    name: "backprop"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
 }
 op {
-  name: "SetSize"
-  input_arg {
-    name: "set_indices"
-    type: DT_INT64
-  }
+  name: "Softplus"
   input_arg {
-    name: "set_values"
+    name: "features"
     type_attr: "T"
   }
-  input_arg {
-    name: "set_shape"
-    type: DT_INT64
-  }
   output_arg {
-    name: "size"
-    type: DT_INT32
-  }
-  attr {
-    name: "validate_indices"
-    type: "bool"
-    default_value {
-      b: true
-    }
+    name: "activations"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
         type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
         type: DT_UINT16
-        type: DT_STRING
+        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "Shape"
+  name: "Softplus"
   input_arg {
-    name: "input"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "out_type"
+    name: "activations"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "out_type"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "ShapeN"
+  name: "Softplus"
   input_arg {
-    name: "input"
+    name: "features"
     type_attr: "T"
-    number_attr: "N"
   }
   output_arg {
-    name: "output"
-    type_attr: "out_type"
-    number_attr: "N"
-  }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "activations"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "out_type"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "ShardedFilename"
-  input_arg {
-    name: "basename"
-    type: DT_STRING
-  }
+  name: "SoftplusGrad"
   input_arg {
-    name: "shard"
-    type: DT_INT32
+    name: "gradients"
+    type_attr: "T"
   }
   input_arg {
-    name: "num_shards"
-    type: DT_INT32
+    name: "features"
+    type_attr: "T"
   }
   output_arg {
-    name: "filename"
-    type: DT_STRING
-  }
-}
-op {
-  name: "ShardedFilespec"
-  input_arg {
-    name: "basename"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "num_shards"
-    type: DT_INT32
+    name: "backprops"
+    type_attr: "T"
   }
-  output_arg {
-    name: "filename"
-    type: DT_STRING
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
   }
 }
 op {
-  name: "ShuffleDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "buffer_size"
-    type: DT_INT64
-  }
+  name: "SoftplusGrad"
   input_arg {
-    name: "seed"
-    type: DT_INT64
+    name: "gradients"
+    type_attr: "T"
   }
   input_arg {
-    name: "seed2"
-    type: DT_INT64
+    name: "features"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "backprops"
+    type_attr: "T"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "ShuffleDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "buffer_size"
-    type: DT_INT64
-  }
+  name: "SoftplusGrad"
   input_arg {
-    name: "seed"
-    type: DT_INT64
+    name: "gradients"
+    type_attr: "T"
   }
   input_arg {
-    name: "seed2"
-    type: DT_INT64
+    name: "features"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "backprops"
+    type_attr: "T"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
 }
 op {
-  name: "ShuffleDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "buffer_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "seed"
-    type: DT_INT64
-  }
+  name: "Softsign"
   input_arg {
-    name: "seed2"
-    type: DT_INT64
+    name: "features"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "activations"
+    type_attr: "T"
   }
   attr {
-    name: "reshuffle_each_iteration"
-    type: "bool"
-    default_value {
-      b: true
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
     }
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
 }
 op {
-  name: "Sigmoid"
+  name: "Softsign"
   input_arg {
-    name: "x"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "activations"
     type_attr: "T"
   }
   attr {
@@ -32678,27 +43723,29 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "SigmoidGrad"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
+  name: "Softsign"
   input_arg {
-    name: "y"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "activations"
     type_attr: "T"
   }
   attr {
@@ -32706,27 +43753,34 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "SigmoidGrad"
+  name: "SoftsignGrad"
   input_arg {
-    name: "y"
+    name: "gradients"
     type_attr: "T"
   }
   input_arg {
-    name: "dy"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "backprops"
     type_attr: "T"
   }
   attr {
@@ -32734,23 +43788,31 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "Sign"
+  name: "SoftsignGrad"
   input_arg {
-    name: "x"
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "backprops"
     type_attr: "T"
   }
   attr {
@@ -32758,25 +43820,33 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "Sin"
+  name: "SoftsignGrad"
   input_arg {
-    name: "x"
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "backprops"
     type_attr: "T"
   }
   attr {
@@ -32784,55 +43854,97 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "Sinh"
+  name: "SpaceToBatch"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
   output_arg {
-    name: "y"
+    name: "output"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  attr {
+    name: "block_size"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
 }
 op {
-  name: "Size"
+  name: "SpaceToBatchND"
   input_arg {
     name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "block_shape"
+    type_attr: "Tblock_shape"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
   output_arg {
     name: "output"
-    type_attr: "out_type"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
   }
   attr {
-    name: "out_type"
+    name: "Tblock_shape"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tpaddings"
     type: "type"
     default_value {
       type: DT_INT32
@@ -32846,219 +43958,401 @@ op {
   }
 }
 op {
-  name: "SkipDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
+  name: "SpaceToDepth"
   input_arg {
-    name: "count"
-    type: DT_INT64
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
+    name: "block_size"
+    type: "int"
     has_minimum: true
-    minimum: 1
+    minimum: 2
   }
-  is_stateful: true
 }
 op {
-  name: "SkipDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
+  name: "SpaceToDepth"
   input_arg {
-    name: "count"
-    type: DT_INT64
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
+    name: "block_size"
+    type: "int"
     has_minimum: true
-    minimum: 1
+    minimum: 2
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+        s: "NCHW_VECT_C"
+      }
+    }
   }
 }
 op {
-  name: "Skipgram"
-  output_arg {
-    name: "vocab_word"
+  name: "SparseAccumulatorApplyGradient"
+  input_arg {
+    name: "handle"
     type: DT_STRING
+    is_ref: true
   }
-  output_arg {
-    name: "vocab_freq"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "words_per_epoch"
+  input_arg {
+    name: "local_step"
     type: DT_INT64
   }
-  output_arg {
-    name: "current_epoch"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "total_words_processed"
+  input_arg {
+    name: "gradient_indices"
     type: DT_INT64
   }
-  output_arg {
-    name: "examples"
-    type: DT_INT32
+  input_arg {
+    name: "gradient_values"
+    type_attr: "dtype"
   }
-  output_arg {
-    name: "labels"
-    type: DT_INT32
+  input_arg {
+    name: "gradient_shape"
+    type: DT_INT64
   }
   attr {
-    name: "filename"
-    type: "string"
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
   }
   attr {
-    name: "batch_size"
-    type: "int"
+    name: "has_known_shape"
+    type: "bool"
+  }
+}
+op {
+  name: "SparseAccumulatorApplyGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "local_step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "gradient_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "gradient_values"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "gradient_shape"
+    type: DT_INT64
   }
   attr {
-    name: "window_size"
-    type: "int"
-    default_value {
-      i: 5
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
     }
   }
   attr {
-    name: "min_count"
-    type: "int"
-    default_value {
-      i: 5
-    }
+    name: "has_known_shape"
+    type: "bool"
+  }
+}
+op {
+  name: "SparseAccumulatorApplyGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "local_step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "gradient_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "gradient_values"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "gradient_shape"
+    type: DT_INT64
   }
   attr {
-    name: "subsample"
-    type: "float"
-    default_value {
-      f: 0.001
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
     }
   }
-  deprecation {
-    version: 19
+  attr {
+    name: "has_known_shape"
+    type: "bool"
   }
-  is_stateful: true
 }
 op {
-  name: "Slice"
+  name: "SparseAccumulatorTakeGradient"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
   }
   input_arg {
-    name: "begin"
-    type_attr: "Index"
+    name: "num_required"
+    type: DT_INT32
   }
-  input_arg {
-    name: "size"
-    type_attr: "Index"
+  output_arg {
+    name: "indices"
+    type: DT_INT64
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "values"
+    type_attr: "dtype"
   }
-  attr {
-    name: "T"
-    type: "type"
+  output_arg {
+    name: "shape"
+    type: DT_INT64
   }
   attr {
-    name: "Index"
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "Softmax"
+  name: "SparseAccumulatorTakeGradient"
   input_arg {
-    name: "logits"
-    type_attr: "T"
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "num_required"
+    type: DT_INT32
   }
   output_arg {
-    name: "softmax"
-    type_attr: "T"
+    name: "indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "values"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "shape"
+    type: DT_INT64
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "SoftmaxCrossEntropyWithLogits"
+  name: "SparseAccumulatorTakeGradient"
   input_arg {
-    name: "features"
-    type_attr: "T"
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
   }
   input_arg {
-    name: "labels"
-    type_attr: "T"
+    name: "num_required"
+    type: DT_INT32
   }
   output_arg {
-    name: "loss"
-    type_attr: "T"
+    name: "indices"
+    type: DT_INT64
   }
   output_arg {
-    name: "backprop"
-    type_attr: "T"
+    name: "values"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "shape"
+    type: DT_INT64
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "Softplus"
+  name: "SparseAdd"
   input_arg {
-    name: "features"
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_values"
     type_attr: "T"
   }
+  input_arg {
+    name: "b_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "thresh"
+    type_attr: "Treal"
+  }
   output_arg {
-    name: "activations"
+    name: "sum_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sum_values"
     type_attr: "T"
   }
+  output_arg {
+    name: "sum_shape"
+    type: DT_INT64
+  }
   attr {
     name: "T"
     type: "type"
@@ -33066,29 +44360,23 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
       }
     }
   }
-}
-op {
-  name: "Softplus"
-  input_arg {
-    name: "features"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "activations"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "Treal"
     type: "type"
     allowed_values {
       list {
@@ -33101,26 +44389,52 @@ op {
         type: DT_INT8
         type: DT_UINT16
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "SoftplusGrad"
+  name: "SparseAdd"
   input_arg {
-    name: "gradients"
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
     type_attr: "T"
   }
   input_arg {
-    name: "features"
+    name: "a_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_values"
     type_attr: "T"
   }
+  input_arg {
+    name: "b_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "thresh"
+    type_attr: "Treal"
+  }
   output_arg {
-    name: "backprops"
+    name: "sum_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sum_values"
     type_attr: "T"
   }
+  output_arg {
+    name: "sum_shape"
+    type: DT_INT64
+  }
   attr {
     name: "T"
     type: "type"
@@ -33128,33 +44442,25 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-}
-op {
-  name: "SoftplusGrad"
-  input_arg {
-    name: "gradients"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "features"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "backprops"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "Treal"
     type: "type"
     allowed_values {
       list {
@@ -33174,15 +44480,47 @@ op {
   }
 }
 op {
-  name: "Softsign"
+  name: "SparseAdd"
   input_arg {
-    name: "features"
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_values"
     type_attr: "T"
   }
+  input_arg {
+    name: "b_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "thresh"
+    type_attr: "Treal"
+  }
   output_arg {
-    name: "activations"
+    name: "sum_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sum_values"
     type_attr: "T"
   }
+  output_arg {
+    name: "sum_shape"
+    type: DT_INT64
+  }
   attr {
     name: "T"
     type: "type"
@@ -33190,29 +44528,26 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
-}
-op {
-  name: "Softsign"
-  input_arg {
-    name: "features"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "activations"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "Treal"
     type: "type"
     allowed_values {
       list {
@@ -33227,22 +44562,35 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "SoftsignGrad"
+  name: "SparseAddGrad"
   input_arg {
-    name: "gradients"
+    name: "backprop_val_grad"
     type_attr: "T"
   }
   input_arg {
-    name: "features"
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sum_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "a_val_grad"
     type_attr: "T"
   }
   output_arg {
-    name: "backprops"
+    name: "b_val_grad"
     type_attr: "T"
   }
   attr {
@@ -33252,29 +44600,46 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "SoftsignGrad"
+  name: "SparseAddGrad"
   input_arg {
-    name: "gradients"
+    name: "backprop_val_grad"
     type_attr: "T"
   }
   input_arg {
-    name: "features"
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sum_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "a_val_grad"
     type_attr: "T"
   }
   output_arg {
-    name: "backprops"
+    name: "b_val_grad"
     type_attr: "T"
   }
   attr {
@@ -33284,12 +44649,17 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -33298,173 +44668,183 @@ op {
   }
 }
 op {
-  name: "SpaceToBatch"
+  name: "SparseAddGrad"
   input_arg {
-    name: "input"
+    name: "backprop_val_grad"
     type_attr: "T"
   }
   input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sum_indices"
+    type: DT_INT64
   }
   output_arg {
-    name: "output"
+    name: "a_val_grad"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
+  output_arg {
+    name: "b_val_grad"
+    type_attr: "T"
   }
   attr {
-    name: "Tpaddings"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
-  attr {
-    name: "block_size"
-    type: "int"
-    has_minimum: true
-    minimum: 2
-  }
 }
 op {
-  name: "SpaceToBatchND"
+  name: "SparseApplyAdadelta"
   input_arg {
-    name: "input"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "block_shape"
-    type_attr: "Tblock_shape"
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
-  }
-  output_arg {
-    name: "output"
+    name: "accum_update"
     type_attr: "T"
+    is_ref: true
   }
-  attr {
-    name: "T"
-    type: "type"
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "Tblock_shape"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
   }
-  attr {
-    name: "Tpaddings"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
   }
-}
-op {
-  name: "SpaceToDepth"
   input_arg {
-    name: "input"
+    name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
   output_arg {
-    name: "output"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
   }
   attr {
-    name: "block_size"
-    type: "int"
-    has_minimum: true
-    minimum: 2
-  }
-}
-op {
-  name: "SpaceToDepth"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
+    name: "Tindices"
     type: "type"
-  }
-  attr {
-    name: "block_size"
-    type: "int"
-    has_minimum: true
-    minimum: 2
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
-        s: "NCHW_VECT_C"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "SparseAccumulatorApplyGradient"
+  name: "SparseApplyAdadelta"
   input_arg {
-    name: "handle"
-    type: DT_STRING
+    name: "var"
+    type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "local_step"
-    type: DT_INT64
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "gradient_indices"
-    type: DT_INT64
+    name: "accum_update"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "gradient_values"
-    type_attr: "dtype"
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "gradient_shape"
-    type: DT_INT64
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
@@ -33482,39 +44862,73 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "has_known_shape"
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
     type: "bool"
+    default_value {
+      b: false
+    }
   }
 }
 op {
-  name: "SparseAccumulatorApplyGradient"
+  name: "SparseApplyAdadelta"
   input_arg {
-    name: "handle"
-    type: DT_STRING
+    name: "var"
+    type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "local_step"
-    type: DT_INT64
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "gradient_indices"
-    type: DT_INT64
+    name: "accum_update"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "gradient_values"
-    type_attr: "dtype"
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "gradient_shape"
-    type: DT_INT64
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
@@ -33534,39 +44948,59 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "has_known_shape"
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
     type: "bool"
+    default_value {
+      b: false
+    }
   }
 }
 op {
-  name: "SparseAccumulatorTakeGradient"
+  name: "SparseApplyAdagrad"
   input_arg {
-    name: "handle"
-    type: DT_STRING
+    name: "var"
+    type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "num_required"
-    type: DT_INT32
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
   }
-  output_arg {
-    name: "indices"
-    type: DT_INT64
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  output_arg {
-    name: "values"
-    type_attr: "dtype"
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   output_arg {
-    name: "shape"
-    type: DT_INT64
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
@@ -33587,32 +45021,55 @@ op {
       }
     }
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "SparseAccumulatorTakeGradient"
+  name: "SparseApplyAdagrad"
   input_arg {
-    name: "handle"
-    type: DT_STRING
+    name: "var"
+    type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "num_required"
-    type: DT_INT32
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
   }
-  output_arg {
-    name: "indices"
-    type: DT_INT64
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  output_arg {
-    name: "values"
-    type_attr: "dtype"
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   output_arg {
-    name: "shape"
-    type: DT_INT64
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
@@ -33635,48 +45092,52 @@ op {
       }
     }
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "SparseAdd"
-  input_arg {
-    name: "a_indices"
-    type: DT_INT64
-  }
+  name: "SparseApplyAdagrad"
   input_arg {
-    name: "a_values"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "a_shape"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "b_indices"
-    type: DT_INT64
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "b_values"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "b_shape"
-    type: DT_INT64
+    name: "grad"
+    type_attr: "T"
   }
   input_arg {
-    name: "thresh"
-    type_attr: "Treal"
-  }
-  output_arg {
-    name: "sum_indices"
-    type: DT_INT64
+    name: "indices"
+    type_attr: "Tindices"
   }
   output_arg {
-    name: "sum_values"
+    name: "out"
     type_attr: "T"
-  }
-  output_arg {
-    name: "sum_shape"
-    type: DT_INT64
+    is_ref: true
   }
   attr {
     name: "T"
@@ -33697,68 +45158,75 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "Treal"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "SparseAdd"
+  name: "SparseApplyAdagradDA"
   input_arg {
-    name: "a_indices"
-    type: DT_INT64
+    name: "var"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "a_values"
+    name: "gradient_accumulator"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "a_shape"
-    type: DT_INT64
+    name: "gradient_squared_accumulator"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "b_indices"
-    type: DT_INT64
+    name: "grad"
+    type_attr: "T"
   }
   input_arg {
-    name: "b_values"
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "b_shape"
-    type: DT_INT64
+    name: "l1"
+    type_attr: "T"
   }
   input_arg {
-    name: "thresh"
-    type_attr: "Treal"
+    name: "l2"
+    type_attr: "T"
   }
-  output_arg {
-    name: "sum_indices"
+  input_arg {
+    name: "global_step"
     type: DT_INT64
   }
   output_arg {
-    name: "sum_values"
+    name: "out"
     type_attr: "T"
-  }
-  output_arg {
-    name: "sum_shape"
-    type: DT_INT64
+    is_ref: true
   }
   attr {
     name: "T"
@@ -33779,56 +45247,72 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "Treal"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "SparseAddGrad"
+  name: "SparseApplyAdagradDA"
   input_arg {
-    name: "backprop_val_grad"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "a_indices"
-    type: DT_INT64
+    name: "gradient_accumulator"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "b_indices"
-    type: DT_INT64
+    name: "gradient_squared_accumulator"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "sum_indices"
-    type: DT_INT64
+    name: "grad"
+    type_attr: "T"
   }
-  output_arg {
-    name: "a_val_grad"
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
     type_attr: "T"
   }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
   output_arg {
-    name: "b_val_grad"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -33849,35 +45333,74 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "SparseAddGrad"
+  name: "SparseApplyAdagradDA"
   input_arg {
-    name: "backprop_val_grad"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "a_indices"
-    type: DT_INT64
+    name: "gradient_accumulator"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "b_indices"
-    type: DT_INT64
+    name: "gradient_squared_accumulator"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "sum_indices"
-    type: DT_INT64
+    name: "grad"
+    type_attr: "T"
   }
-  output_arg {
-    name: "a_val_grad"
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
   output_arg {
-    name: "b_val_grad"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -33900,24 +45423,47 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "SparseApplyAdadelta"
+  name: "SparseApplyCenteredRMSProp"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum"
+    name: "mg"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum_update"
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
     type_attr: "T"
     is_ref: true
   }
@@ -33929,6 +45475,10 @@ op {
     name: "rho"
     type_attr: "T"
   }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
   input_arg {
     name: "epsilon"
     type_attr: "T"
@@ -33987,19 +45537,24 @@ op {
   }
 }
 op {
-  name: "SparseApplyAdadelta"
+  name: "SparseApplyCenteredRMSProp"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum"
+    name: "mg"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum_update"
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
     type_attr: "T"
     is_ref: true
   }
@@ -34011,6 +45566,10 @@ op {
     name: "rho"
     type_attr: "T"
   }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
   input_arg {
     name: "epsilon"
     type_attr: "T"
@@ -34071,14 +45630,24 @@ op {
   }
 }
 op {
-  name: "SparseApplyAdagrad"
+  name: "SparseApplyCenteredRMSProp"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum"
+    name: "mg"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
     type_attr: "T"
     is_ref: true
   }
@@ -34086,6 +45655,18 @@ op {
     name: "lr"
     type_attr: "T"
   }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
   input_arg {
     name: "grad"
     type_attr: "T"
@@ -34118,6 +45699,9 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -34140,7 +45724,7 @@ op {
   }
 }
 op {
-  name: "SparseApplyAdagrad"
+  name: "SparseApplyFtrl"
   input_arg {
     name: "var"
     type_attr: "T"
@@ -34152,8 +45736,9 @@ op {
     is_ref: true
   }
   input_arg {
-    name: "lr"
+    name: "linear"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
     name: "grad"
@@ -34163,6 +45748,22 @@ op {
     name: "indices"
     type_attr: "Tindices"
   }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
   output_arg {
     name: "out"
     type_attr: "T"
@@ -34187,8 +45788,6 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -34211,19 +45810,19 @@ op {
   }
 }
 op {
-  name: "SparseApplyAdagradDA"
+  name: "SparseApplyFtrl"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "gradient_accumulator"
+    name: "accum"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "gradient_squared_accumulator"
+    name: "linear"
     type_attr: "T"
     is_ref: true
   }
@@ -34248,8 +45847,8 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "global_step"
-    type: DT_INT64
+    name: "lr_power"
+    type_attr: "T"
   }
   output_arg {
     name: "out"
@@ -34275,6 +45874,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -34297,19 +45898,19 @@ op {
   }
 }
 op {
-  name: "SparseApplyAdagradDA"
+  name: "SparseApplyFtrl"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "gradient_accumulator"
+    name: "accum"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "gradient_squared_accumulator"
+    name: "linear"
     type_attr: "T"
     is_ref: true
   }
@@ -34334,8 +45935,8 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "global_step"
-    type: DT_INT64
+    name: "lr_power"
+    type_attr: "T"
   }
   output_arg {
     name: "out"
@@ -34363,6 +45964,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -34385,50 +45987,49 @@ op {
   }
 }
 op {
-  name: "SparseApplyCenteredRMSProp"
+  name: "SparseApplyFtrlV2"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "mg"
+    name: "accum"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "ms"
+    name: "linear"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "mom"
+    name: "grad"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "rho"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "momentum"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "l2_shrinkage"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "lr_power"
+    type_attr: "T"
   }
   output_arg {
     name: "out"
@@ -34476,50 +46077,49 @@ op {
   }
 }
 op {
-  name: "SparseApplyCenteredRMSProp"
+  name: "SparseApplyFtrlV2"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "mg"
+    name: "accum"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "ms"
+    name: "linear"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "mom"
+    name: "grad"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "rho"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "momentum"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "l2_shrinkage"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "lr_power"
+    type_attr: "T"
   }
   output_arg {
     name: "out"
@@ -34569,7 +46169,7 @@ op {
   }
 }
 op {
-  name: "SparseApplyFtrl"
+  name: "SparseApplyFtrlV2"
   input_arg {
     name: "var"
     type_attr: "T"
@@ -34605,6 +46205,10 @@ op {
     name: "l2"
     type_attr: "T"
   }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
   input_arg {
     name: "lr_power"
     type_attr: "T"
@@ -34633,6 +46237,9 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -34655,7 +46262,7 @@ op {
   }
 }
 op {
-  name: "SparseApplyFtrl"
+  name: "SparseApplyMomentum"
   input_arg {
     name: "var"
     type_attr: "T"
@@ -34667,9 +46274,8 @@ op {
     is_ref: true
   }
   input_arg {
-    name: "linear"
+    name: "lr"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
     name: "grad"
@@ -34680,19 +46286,7 @@ op {
     type_attr: "Tindices"
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "lr_power"
+    name: "momentum"
     type_attr: "T"
   }
   output_arg {
@@ -34719,8 +46313,6 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -34741,9 +46333,16 @@ op {
       b: false
     }
   }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "SparseApplyFtrlV2"
+  name: "SparseApplyMomentum"
   input_arg {
     name: "var"
     type_attr: "T"
@@ -34755,9 +46354,8 @@ op {
     is_ref: true
   }
   input_arg {
-    name: "linear"
+    name: "lr"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
     name: "grad"
@@ -34768,23 +46366,89 @@ op {
     type_attr: "Tindices"
   }
   input_arg {
-    name: "lr"
+    name: "momentum"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyMomentum"
   input_arg {
-    name: "l1"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "l2"
+    name: "accum"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "l2_shrinkage"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "lr_power"
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "momentum"
     type_attr: "T"
   }
   output_arg {
@@ -34811,6 +46475,9 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -34831,9 +46498,16 @@ op {
       b: false
     }
   }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "SparseApplyFtrlV2"
+  name: "SparseApplyProximalAdagrad"
   input_arg {
     name: "var"
     type_attr: "T"
@@ -34844,19 +46518,6 @@ op {
     type_attr: "T"
     is_ref: true
   }
-  input_arg {
-    name: "linear"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
   input_arg {
     name: "lr"
     type_attr: "T"
@@ -34870,12 +46531,12 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "l2_shrinkage"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "lr_power"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   output_arg {
     name: "out"
@@ -34901,8 +46562,6 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -34925,7 +46584,7 @@ op {
   }
 }
 op {
-  name: "SparseApplyMomentum"
+  name: "SparseApplyProximalAdagrad"
   input_arg {
     name: "var"
     type_attr: "T"
@@ -34941,17 +46600,21 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "l2"
+    type_attr: "T"
   }
   input_arg {
-    name: "momentum"
+    name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
   output_arg {
     name: "out"
     type_attr: "T"
@@ -34976,6 +46639,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -34996,16 +46661,9 @@ op {
       b: false
     }
   }
-  attr {
-    name: "use_nesterov"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "SparseApplyMomentum"
+  name: "SparseApplyProximalAdagrad"
   input_arg {
     name: "var"
     type_attr: "T"
@@ -35021,17 +46679,21 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "l2"
+    type_attr: "T"
   }
   input_arg {
-    name: "momentum"
+    name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
   output_arg {
     name: "out"
     type_attr: "T"
@@ -35058,6 +46720,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -35078,28 +46741,16 @@ op {
       b: false
     }
   }
-  attr {
-    name: "use_nesterov"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "SparseApplyProximalAdagrad"
+  name: "SparseApplyProximalGradientDescent"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "lr"
+    name: "alpha"
     type_attr: "T"
   }
   input_arg {
@@ -35164,19 +46815,14 @@ op {
   }
 }
 op {
-  name: "SparseApplyProximalAdagrad"
+  name: "SparseApplyProximalGradientDescent"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "lr"
+    name: "alpha"
     type_attr: "T"
   }
   input_arg {
@@ -35293,6 +46939,9 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -35315,22 +46964,36 @@ op {
   }
 }
 op {
-  name: "SparseApplyProximalGradientDescent"
+  name: "SparseApplyRMSProp"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "alpha"
+    name: "ms"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "l1"
+    name: "mom"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "l2"
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
     type_attr: "T"
   }
   input_arg {
@@ -35365,8 +47028,6 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -35453,6 +47114,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -35541,6 +47204,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -35706,6 +47370,58 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "SparseConditionalAccumulator"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "SparseCross"
   input_arg {
@@ -35767,40 +47483,180 @@ op {
       }
     }
   }
-  attr {
-    name: "dense_types"
-    type: "list(type)"
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
+  attr {
+    name: "dense_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "internal_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+}
+op {
+  name: "SparseDenseCwiseAdd"
+  input_arg {
+    name: "sp_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sp_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sp_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "SparseDenseCwiseAdd"
+  input_arg {
+    name: "sp_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sp_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sp_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "out_type"
+    name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
-        type: DT_STRING
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
+}
+op {
+  name: "SparseDenseCwiseAdd"
+  input_arg {
+    name: "sp_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sp_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sp_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
-    name: "internal_type"
+    name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
-        type: DT_STRING
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "SparseDenseCwiseAdd"
+  name: "SparseDenseCwiseDiv"
   input_arg {
     name: "sp_indices"
     type: DT_INT64
@@ -35845,7 +47701,7 @@ op {
   }
 }
 op {
-  name: "SparseDenseCwiseAdd"
+  name: "SparseDenseCwiseDiv"
   input_arg {
     name: "sp_indices"
     type: DT_INT64
@@ -35932,12 +47788,15 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "SparseDenseCwiseDiv"
+  name: "SparseDenseCwiseMul"
   input_arg {
     name: "sp_indices"
     type: DT_INT64
@@ -35977,8 +47836,6 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -36024,6 +47881,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -36071,6 +47930,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -36302,6 +48162,111 @@ op {
     }
   }
 }
+op {
+  name: "SparseReduceMax"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "reduction_axes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "SparseReduceMaxSparse"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "reduction_axes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
 op {
   name: "SparseReduceMaxSparse"
   input_arg {
@@ -36353,6 +48318,8 @@ op {
         type: DT_INT8
         type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -36410,6 +48377,59 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "SparseReduceSum"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "reduction_axes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
@@ -36462,6 +48482,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -36495,6 +48517,129 @@ op {
       b: false
     }
   }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "SparseReduceSumSparse"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "reduction_axes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "SparseReduceSumSparse"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "reduction_axes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
@@ -36576,126 +48721,256 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "SparseReorder"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "SparseReshape"
+  input_arg {
+    name: "input_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "input_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "new_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
+}
+op {
+  name: "SparseSegmentMean"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSegmentMeanGrad"
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "output_dim0"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "SparseReduceSumSparse"
-  input_arg {
-    name: "input_indices"
-    type: DT_INT64
-  }
+  name: "SparseSegmentMeanWithNumSegments"
   input_arg {
-    name: "input_values"
+    name: "data"
     type_attr: "T"
   }
   input_arg {
-    name: "input_shape"
-    type: DT_INT64
+    name: "indices"
+    type_attr: "Tidx"
   }
   input_arg {
-    name: "reduction_axes"
+    name: "segment_ids"
     type: DT_INT32
   }
-  output_arg {
-    name: "output_indices"
-    type: DT_INT64
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
   }
   output_arg {
-    name: "output_values"
+    name: "output"
     type_attr: "T"
   }
-  output_arg {
-    name: "output_shape"
-    type: DT_INT64
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
+    name: "Tidx"
+    type: "type"
     default_value {
-      b: false
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "T"
+    name: "Tnumsegments"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "SparseReorder"
-  input_arg {
-    name: "input_indices"
-    type: DT_INT64
-  }
+  name: "SparseSegmentSqrtN"
   input_arg {
-    name: "input_values"
+    name: "data"
     type_attr: "T"
   }
   input_arg {
-    name: "input_shape"
-    type: DT_INT64
+    name: "indices"
+    type_attr: "Tidx"
   }
-  output_arg {
-    name: "output_indices"
-    type: DT_INT64
+  input_arg {
+    name: "segment_ids"
+    type: DT_INT32
   }
   output_arg {
-    name: "output_values"
+    name: "output"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
-}
-op {
-  name: "SparseReshape"
-  input_arg {
-    name: "input_indices"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "input_shape"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "new_shape"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "output_indices"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "output_shape"
-    type: DT_INT64
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
 }
 op {
-  name: "SparseSegmentMean"
+  name: "SparseSegmentSqrtNGrad"
   input_arg {
-    name: "data"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
@@ -36706,6 +48981,10 @@ op {
     name: "segment_ids"
     type: DT_INT32
   }
+  input_arg {
+    name: "output_dim0"
+    type: DT_INT32
+  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -36735,9 +49014,9 @@ op {
   }
 }
 op {
-  name: "SparseSegmentMeanGrad"
+  name: "SparseSegmentSqrtNWithNumSegments"
   input_arg {
-    name: "grad"
+    name: "data"
     type_attr: "T"
   }
   input_arg {
@@ -36749,8 +49028,8 @@ op {
     type: DT_INT32
   }
   input_arg {
-    name: "output_dim0"
-    type: DT_INT32
+    name: "num_segments"
+    type_attr: "Tnumsegments"
   }
   output_arg {
     name: "output"
@@ -36779,9 +49058,22 @@ op {
       }
     }
   }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
-  name: "SparseSegmentSqrtN"
+  name: "SparseSegmentSum"
   input_arg {
     name: "data"
     type_attr: "T"
@@ -36805,6 +49097,13 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
       }
     }
   }
@@ -36823,9 +49122,9 @@ op {
   }
 }
 op {
-  name: "SparseSegmentSqrtNGrad"
+  name: "SparseSegmentSum"
   input_arg {
-    name: "grad"
+    name: "data"
     type_attr: "T"
   }
   input_arg {
@@ -36836,10 +49135,6 @@ op {
     name: "segment_ids"
     type: DT_INT32
   }
-  input_arg {
-    name: "output_dim0"
-    type: DT_INT32
-  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -36851,6 +49146,15 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -36900,6 +49204,9 @@ op {
         type: DT_INT8
         type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -36918,7 +49225,7 @@ op {
   }
 }
 op {
-  name: "SparseSegmentSum"
+  name: "SparseSegmentSumWithNumSegments"
   input_arg {
     name: "data"
     type_attr: "T"
@@ -36931,6 +49238,10 @@ op {
     name: "segment_ids"
     type: DT_INT32
   }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -36951,6 +49262,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -36967,6 +49279,19 @@ op {
       }
     }
   }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
   name: "SparseSlice"
@@ -37079,6 +49404,102 @@ op {
     }
   }
 }
+op {
+  name: "SparseSoftmaxCrossEntropyWithLogits"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "labels"
+    type_attr: "Tlabels"
+  }
+  output_arg {
+    name: "loss"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tlabels"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSparseMaximum"
+  input_arg {
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
 op {
   name: "SparseSparseMaximum"
   input_arg {
@@ -37127,6 +49548,8 @@ op {
         type: DT_INT8
         type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -37181,6 +49604,123 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "SparseSparseMinimum"
+  input_arg {
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "SparseSparseMinimum"
+  input_arg {
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -37219,6 +49759,98 @@ op {
     name: "output_values"
     type_attr: "T"
   }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "SparseSplit"
+  input_arg {
+    name: "split_dim"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+    number_attr: "num_split"
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+    number_attr: "num_split"
+  }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+    number_attr: "num_split"
+  }
+  attr {
+    name: "num_split"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "SparseTensorDenseAdd"
+  input_arg {
+    name: "a_indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
@@ -37241,110 +49873,17 @@ op {
       }
     }
   }
-}
-op {
-  name: "SparseSparseMinimum"
-  input_arg {
-    name: "a_indices"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "a_values"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "a_shape"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "b_indices"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "b_values"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "b_shape"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "output_indices"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "output_values"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_INT64
       }
     }
   }
 }
-op {
-  name: "SparseSplit"
-  input_arg {
-    name: "split_dim"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "indices"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "values"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "shape"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "output_indices"
-    type: DT_INT64
-    number_attr: "num_split"
-  }
-  output_arg {
-    name: "output_values"
-    type_attr: "T"
-    number_attr: "num_split"
-  }
-  output_arg {
-    name: "output_shape"
-    type: DT_INT64
-    number_attr: "num_split"
-  }
-  attr {
-    name: "num_split"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-}
 op {
   name: "SparseTensorDenseAdd"
   input_arg {
@@ -37386,6 +49925,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -37443,6 +49984,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -37810,6 +50352,31 @@ op {
     }
   }
 }
+op {
+  name: "Sqrt"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "SqrtGrad"
   input_arg {
@@ -37866,6 +50433,61 @@ op {
     }
   }
 }
+op {
+  name: "SqrtGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Square"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Square"
   input_arg {
@@ -37876,6 +50498,37 @@ op {
     name: "y"
     type_attr: "T"
   }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "SquaredDifference"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
@@ -37891,6 +50544,7 @@ op {
       }
     }
   }
+  is_commutative: true
 }
 op {
   name: "SquaredDifference"
@@ -37912,6 +50566,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -38121,11 +50776,51 @@ op {
   is_stateful: true
 }
 op {
-  name: "Stage"
-  input_arg {
-    name: "values"
-    type_list_attr: "dtypes"
-  }
+  name: "Stage"
+  input_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StageClear"
   attr {
     name: "capacity"
     type: "int"
@@ -38145,8 +50840,6 @@ op {
   attr {
     name: "dtypes"
     type: "list(type)"
-    has_minimum: true
-    minimum: 1
   }
   attr {
     name: "container"
@@ -38165,7 +50858,15 @@ op {
   is_stateful: true
 }
 op {
-  name: "StageClear"
+  name: "StagePeek"
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
   attr {
     name: "capacity"
     type: "int"
@@ -38185,6 +50886,8 @@ op {
   attr {
     name: "dtypes"
     type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
     name: "container"
@@ -38203,14 +50906,10 @@ op {
   is_stateful: true
 }
 op {
-  name: "StagePeek"
-  input_arg {
-    name: "index"
-    type: DT_INT32
-  }
+  name: "StageSize"
   output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
+    name: "size"
+    type: DT_INT32
   }
   attr {
     name: "capacity"
@@ -38231,8 +50930,6 @@ op {
   attr {
     name: "dtypes"
     type: "list(type)"
-    has_minimum: true
-    minimum: 1
   }
   attr {
     name: "container"
@@ -38251,49 +50948,104 @@ op {
   is_stateful: true
 }
 op {
-  name: "StageSize"
+  name: "StatelessRandomNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
   output_arg {
-    name: "size"
-    type: DT_INT32
+    name: "output"
+    type_attr: "dtype"
   }
   attr {
-    name: "capacity"
-    type: "int"
+    name: "dtype"
+    type: "type"
     default_value {
-      i: 0
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
-    has_minimum: true
   }
   attr {
-    name: "memory_limit"
-    type: "int"
+    name: "T"
+    type: "type"
     default_value {
-      i: 0
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
-    has_minimum: true
+  }
+}
+op {
+  name: "StatelessRandomNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "T"
+    type: "type"
     default_value {
-      s: ""
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "Tseed"
+    type: "type"
     default_value {
-      s: ""
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "StatelessRandomNormal"
+  name: "StatelessRandomUniform"
   input_arg {
     name: "shape"
     type_attr: "T"
@@ -38342,7 +51094,7 @@ op {
   }
   input_arg {
     name: "seed"
-    type: DT_INT64
+    type_attr: "Tseed"
   }
   output_arg {
     name: "output"
@@ -38375,6 +51127,19 @@ op {
       }
     }
   }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
   name: "StatelessTruncatedNormal"
@@ -38418,6 +51183,95 @@ op {
     }
   }
 }
+op {
+  name: "StatelessTruncatedNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "StatsAggregatorHandle"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StatsAggregatorSummary"
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
 op {
   name: "StopGradient"
   input_arg {
@@ -38906,6 +51760,41 @@ op {
     }
   }
 }
+op {
+  name: "Sub"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Substr"
   input_arg {
@@ -39051,6 +51940,66 @@ op {
     }
   }
 }
+op {
+  name: "Sum"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "Svd"
   input_arg {
@@ -39344,6 +52293,57 @@ op {
     }
   }
 }
+op {
+  name: "Tan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Tanh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Tanh"
   input_arg {
@@ -39360,6 +52360,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -39424,6 +52425,35 @@ op {
     }
   }
 }
+op {
+  name: "TanhGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "TemporaryVariable"
   output_arg {
@@ -40836,6 +53866,56 @@ op {
     version: 7
   }
 }
+op {
+  name: "TopK"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  attr {
+    name: "k"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "sorted"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  deprecation {
+    version: 7
+  }
+}
 op {
   name: "TopKV2"
   input_arg {
@@ -40924,6 +54004,52 @@ op {
     }
   }
 }
+op {
+  name: "TopKV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "k"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  attr {
+    name: "sorted"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
 op {
   name: "Transpose"
   input_arg {
@@ -40990,6 +54116,41 @@ op {
     }
   }
 }
+op {
+  name: "TruncateDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "TruncateMod"
   input_arg {
@@ -41017,6 +54178,34 @@ op {
     }
   }
 }
+op {
+  name: "TruncateMod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "TruncatedNormal"
   input_arg {
@@ -41064,6 +54253,54 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TruncatedNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "UniformCandidateSampler"
   input_arg {
@@ -41207,6 +54444,42 @@ op {
     }
   }
 }
+op {
+  name: "UniqueV2"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "idx"
+    type_attr: "out_idx"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "out_idx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "UniqueWithCounts"
   input_arg {
@@ -41365,6 +54638,68 @@ op {
     }
   }
 }
+op {
+  name: "UnsortedSegmentMax"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "UnsortedSegmentSum"
   input_arg {
@@ -41469,6 +54804,73 @@ op {
     }
   }
 }
+op {
+  name: "UnsortedSegmentSum"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "Unstage"
   output_arg {
@@ -41720,6 +55122,46 @@ op {
     }
   }
 }
+op {
+  name: "Where"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "index"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_BOOL
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+        type: DT_BOOL
+      }
+    }
+  }
+}
 op {
   name: "WholeFileReader"
   output_arg {
diff --git a/tensorflow/core/ops/data_flow_ops.cc b/tensorflow/core/ops/data_flow_ops.cc
index ac2dc601f1f6b48905f1269b8726ac30ba5dda67..b3d7653359330155d4c0f6fcb55736d8f354f022 100644
--- a/tensorflow/core/ops/data_flow_ops.cc
+++ b/tensorflow/core/ops/data_flow_ops.cc
@@ -2497,6 +2497,7 @@ REGISTER_OP("RecordInput")
     .Attr("file_buffer_size: int = 10000")
     .Attr("file_parallelism: int = 16")
     .Attr("batch_size: int = 32")
+    .Attr("compression_type: string = ''")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
@@ -2510,6 +2511,8 @@ file_shuffle_shift_ratio: Shifts the list of files after the list is randomly
 file_buffer_size: The randomization shuffling buffer.
 file_parallelism: How many sstables are opened and concurrently iterated over.
 batch_size: The batch size.
+compression_type: The type of compression for the file. Currently ZLIB and
+    GZIP are supported. Defaults to none.
 )doc");
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index f5122139645e2d3360bdcdbde29335ccaca79fbb..2072e0df57f2c00bc34108928920914655429f74 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -151,6 +151,28 @@ REGISTER_OP("IgnoreErrorsDataset")
 Creates a dataset that contains the elements of `input_dataset` ignoring errors.
 )doc");
 
+REGISTER_OP("BytesProducedStatsDataset")
+    .Input("input_dataset: variant")
+    .Input("tag: string")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Records the bytes size of each element of `input_dataset` in a StatsAggregator.
+)doc");
+
+REGISTER_OP("LatencyStatsDataset")
+    .Input("input_dataset: variant")
+    .Input("tag: string")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Records the latency of producing `input_dataset` elements in a StatsAggregator.
+)doc");
+
 REGISTER_OP("MapDataset")
     .Input("input_dataset: variant")
     .Input("other_arguments: Targuments")
@@ -447,6 +469,24 @@ stop: corresponds to stop in python's xrange().
 step: corresponds to step in python's xrange().
 )doc");
 
+REGISTER_OP("RandomDataset")
+    .Input("seed: int64")
+    .Input("seed2: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+                      // stateful to inhibit constant folding.
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a Dataset that returns pseudorandom numbers.
+
+seed: A scalar seed for the random number generator. If either seed or
+  seed2 is set to be non-zero, the random number generator is seeded
+  by the given seed.  Otherwise, a random seed is used.
+seed2: A second scalar seed to avoid seed collision.
+)doc");
+
 REGISTER_OP("ShuffleDataset")
     .Input("input_dataset: variant")
     .Input("buffer_size: int64")
@@ -468,8 +508,33 @@ reshuffle_each_iteration: If true, each iterator over this dataset will be given
   `seed` and `seed2` inputs. If false, each iterator will be given the same
   seed, and repeated iteration over this dataset will yield the exact same
   sequence of results.
-seed: A scalar seed for the random number generator. If either seed or
-  seed2 is set to be non-zero, the random number generator is seeded
+seed: A scalar seed for the random number generator. If either `seed` or
+  `seed2` is set to be non-zero, the random number generator is seeded
+  by the given seed.  Otherwise, a random seed is used.
+seed2: A second scalar seed to avoid seed collision.
+)doc");
+
+REGISTER_OP("ShuffleAndRepeatDataset")
+    .Input("input_dataset: variant")
+    .Input("buffer_size: int64")
+    .Input("seed: int64")
+    .Input("seed2: int64")
+    .Input("count: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset that shuffles and repeats elements from `input_dataset`
+pseudorandomly.
+
+buffer_size: The number of output elements to buffer in an iterator over
+  this dataset. Compare with the `min_after_dequeue` attr when creating a
+  `RandomShuffleQueue`.
+count: A scalar representing the number of times the underlying dataset
+  should be repeated. The default is `-1`, which results in infinite repetition.
+seed: A scalar seed for the random number generator. If either `seed` or
+  `seed2` is set to be non-zero, the random number generator is seeded
   by the given seed.  Otherwise, a random seed is used.
 seed2: A second scalar seed to avoid seed collision.
 )doc");
@@ -744,4 +809,29 @@ serialized: A variant tensor storing the state of the iterator contained in the
   resource.
 )doc");
 
+REGISTER_OP("StatsAggregatorHandle")
+    .Output("handle: resource")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Doc(R"doc(
+Creates a statistics manager resource.
+)doc");
+
+REGISTER_OP("IteratorSetStatsAggregator")
+    .Input("iterator_handle: resource")
+    .Input("stats_aggregator_handle: resource")
+    .SetShapeFn(shape_inference::NoOutputs)
+    .Doc(R"doc(
+Associates the given iterator with the given statistics aggregator.
+)doc");
+
+REGISTER_OP("StatsAggregatorSummary")
+    .Input("iterator: resource")
+    .Output("summary: string")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Produces a summary of any statistics recorded by the given statistics manager.
+)doc");
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index c3f80064150ba0dcce1173de1d02142cf3dc6621..13fbd2fa515c5a7e0ec06cdc4c585f4dc691a928 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -818,8 +818,8 @@ bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
 bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
 height of the underlying image.
 
-For example, if an image is 100 x 200 pixels (height x width) and the bounding 
-box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of 
+For example, if an image is 100 x 200 pixels (height x width) and the bounding
+box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
 the bounding box will be `(40, 10)` to `(100, 50)` (in (x,y) coordinates).
 
 Parts of the bounding box may fall outside the image.
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index df75caca37a616f75263e35a0d5e725f36e1307b..8ea170ba14355d06cc6cd19f306674000fe3bda3 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -85,7 +85,7 @@ REGISTER_OP("BatchMatMul")
     .Input("x: T")
     .Input("y: T")
     .Output("output: T")
-    .Attr("T: {half, float, double, int32, complex64, complex128}")
+    .Attr("T: {half, bfloat16, float, double, int32, complex64, complex128}")
     .Attr("adj_x: bool = false")
     .Attr("adj_y: bool = false")
     .SetShapeFn([](InferenceContext* c) {
@@ -184,7 +184,7 @@ _HostCast requires its input and produces its output in host memory.
 REGISTER_OP("Abs")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {half, float, double, int32, int64}")
+    .Attr("T: {half, bfloat16, float, double, int32, int64}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Computes the absolute value of a tensor.
@@ -210,29 +210,31 @@ value is computed as \\( \sqrt{a^2 + b^2}\\).
 )doc");
 
 // Declares cwise unary operations signature: 't -> 't
-#define UNARY()                                                              \
-  Input("x: T")                                                              \
-      .Output("y: T")                                                        \
-      .Attr("T: {half, float, double, int32, int64, complex64, complex128}") \
+#define UNARY()                                                          \
+  Input("x: T")                                                          \
+      .Output("y: T")                                                    \
+      .Attr(                                                             \
+          "T: {half, bfloat16, float, double, int32, int64, complex64, " \
+          "complex128}")                                                 \
       .SetShapeFn(shape_inference::UnchangedShape)
 
-#define UNARY_REAL()                    \
-  Input("x: T")                         \
-      .Output("y: T")                   \
-      .Attr("T: {half, float, double}") \
+#define UNARY_REAL()                              \
+  Input("x: T")                                   \
+      .Output("y: T")                             \
+      .Attr("T: {half, bfloat16, float, double}") \
       .SetShapeFn(shape_inference::UnchangedShape)
 
-#define UNARY_COMPLEX()                                        \
-  Input("x: T")                                                \
-      .Output("y: T")                                          \
-      .Attr("T: {half, float, double, complex64, complex128}") \
+#define UNARY_COMPLEX()                                                  \
+  Input("x: T")                                                          \
+      .Output("y: T")                                                    \
+      .Attr("T: {half, bfloat16, float, double, complex64, complex128}") \
       .SetShapeFn(shape_inference::UnchangedShape)
 
-#define UNARY_GRADIENT_COMPLEX()                               \
-  Input("y: T")                                                \
-      .Input("dy: T")                                          \
-      .Output("z: T")                                          \
-      .Attr("T: {half, float, double, complex64, complex128}") \
+#define UNARY_GRADIENT_COMPLEX()                                         \
+  Input("y: T")                                                          \
+      .Input("dy: T")                                                    \
+      .Output("z: T")                                                    \
+      .Attr("T: {half, bfloat16, float, double, complex64, complex128}") \
       .SetShapeFn(shape_inference::UnchangedShape)
 
 REGISTER_OP("Neg")
@@ -481,7 +483,7 @@ Computes atan of x element-wise.
 REGISTER_OP("IsNan")
     .Input("x: T")
     .Output("y: bool")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Returns which elements of x are NaN.
@@ -494,7 +496,7 @@ Equivalent to np.isnan
 REGISTER_OP("IsInf")
     .Input("x: T")
     .Output("y: bool")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Returns which elements of x are Inf.
@@ -507,7 +509,7 @@ Equivalent to np.isinf
 REGISTER_OP("IsFinite")
     .Input("x: T")
     .Output("y: bool")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Returns which elements of x are finite.
@@ -520,7 +522,9 @@ Equivalent to np.isfinite
 REGISTER_OP("Sign")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {half, float, double, int32, int64, complex64, complex128}")
+    .Attr(
+        "T: {half, bfloat16, float, double, int32, int64, complex64, "
+        "complex128}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Returns an element-wise indication of the sign of a number.
@@ -533,7 +537,7 @@ For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
 REGISTER_OP("Floor")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Returns element-wise largest integer not greater than x.
@@ -542,7 +546,7 @@ Returns element-wise largest integer not greater than x.
 REGISTER_OP("Ceil")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Returns element-wise smallest integer in not less than x.
@@ -551,7 +555,7 @@ Returns element-wise smallest integer in not less than x.
 REGISTER_OP("Rint")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Returns element-wise integer closest to x.
@@ -569,22 +573,23 @@ rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
 
 // Declares cwise binary operations signature: 't, 't -> 't.
 
-#define BINARY_MORE()                                                       \
-  Input("x: T").Input("y: T").Output("z: T").Attr(                          \
-      "T: {half, float, double, uint8, int8, uint16, int16, int32, int64, " \
-      "complex64, complex128}")
+#define BINARY_MORE()                                                          \
+  Input("x: T").Input("y: T").Output("z: T").Attr(                             \
+      "T: {half, bfloat16, float, double, uint8, int8, uint16, int16, int32, " \
+      "int64, complex64, complex128}")
 
-#define BINARY_FEWER()                             \
-  Input("x: T").Input("y: T").Output("z: T").Attr( \
-      "T: {half, float, double, int32, int64, complex64, complex128}")
+#define BINARY_FEWER()                                               \
+  Input("x: T").Input("y: T").Output("z: T").Attr(                   \
+      "T: {half, bfloat16, float, double, int32, int64, complex64, " \
+      "complex128}")
 
 REGISTER_OP("Add")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
     .Attr(
-        "T: {half, float, double, uint8, int8, int16, int32, int64, complex64, "
-        "complex128, string}")
+        "T: {half, bfloat16, float, double, uint8, int8, int16, int32, int64, "
+        "complex64, complex128, string}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .Doc(R"doc(
 Returns x + y element-wise.
@@ -600,8 +605,8 @@ REGISTER_OP("AddV2")
     .Input("y: T")
     .Output("z: T")
     .Attr(
-        "T: {half, float, double, uint8, int8, int16, int32, int64, complex64, "
-        "complex128}")
+        "T: {half, bfloat16, float, double, uint8, int8, int16, int32, int64, "
+        "complex64, complex128}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .SetIsAggregate()
     .SetIsCommutative()
@@ -757,7 +762,7 @@ REGISTER_OP("Maximum")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {half, float, double, int32, int64}")
+    .Attr("T: {half, bfloat16, float, double, int32, int64}")
     .SetIsCommutative()
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .Doc(R"doc(
@@ -788,7 +793,7 @@ REGISTER_OP("Minimum")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {half, float, double, int32, int64}")
+    .Attr("T: {half, bfloat16, float, double, int32, int64}")
     .SetIsCommutative()
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .Doc(R"doc(
@@ -802,7 +807,7 @@ REGISTER_OP("Mod")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {int32, int64, float, double}")
+    .Attr("T: {int32, int64, bfloat16, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .Doc(R"doc(
 Returns element-wise remainder of division. This emulates C semantics in that
@@ -817,7 +822,7 @@ REGISTER_OP("FloorMod")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {int32, int64, float, double}")
+    .Attr("T: {int32, int64, bfloat16, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .Doc(R"doc(
 Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
@@ -832,7 +837,7 @@ REGISTER_OP("TruncateMod")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {int32, int64, float, double}")
+    .Attr("T: {int32, int64, bfloat16, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .Doc(R"doc(
 Returns element-wise remainder of division. This emulates C semantics in that
@@ -847,7 +852,9 @@ REGISTER_OP("Pow")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {half, float, double, int32, int64, complex64, complex128}")
+    .Attr(
+        "T: {half, bfloat16, float, double, int32, int64, complex64, "
+        "complex128}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .Doc(R"doc(
 Computes the power of one value to another.
@@ -946,7 +953,7 @@ REGISTER_OP("Atan2")
     .Input("y: T")
     .Input("x: T")
     .Output("z: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .Doc(R"doc(
 Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
@@ -1064,15 +1071,15 @@ Returns the truth value of (x >= y) element-wise.
 
 // --------------------------------------------------------------------------
 
-#define EQUALITY_COMPARISON()                                           \
-  Input("x: T")                                                         \
-      .Input("y: T")                                                    \
-      .Output("z: bool")                                                \
-      .SetIsCommutative()                                               \
-      .Attr(                                                            \
-          "T: {half, float, double, uint8, int8, int16, int32, int64, " \
-          "complex64, "                                                 \
-          "quint8, qint8, qint32, string, bool, complex128}")           \
+#define EQUALITY_COMPARISON()                                              \
+  Input("x: T")                                                            \
+      .Input("y: T")                                                       \
+      .Output("z: bool")                                                   \
+      .SetIsCommutative()                                                  \
+      .Attr(                                                               \
+          "T: {half, bfloat16, float, double, uint8, int8, int16, int32, " \
+          "int64, complex64, quint8, qint8, qint32, string, bool, "        \
+          "complex128}")                                                   \
       .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
 
 REGISTER_OP("Equal")
@@ -1291,7 +1298,7 @@ REGISTER_OP("MatMul")
     .Output("product: T")
     .Attr("transpose_a: bool = false")
     .Attr("transpose_b: bool = false")
-    .Attr("T: {half, float, double, int32, complex64, complex128}")
+    .Attr("T: {half, bfloat16, float, double, int32, complex64, complex128}")
     .SetShapeFn(shape_inference::MatMulShape)
     .Doc(R"doc(
 Multiply the matrix "a" by the matrix "b".
@@ -1625,6 +1632,45 @@ Status SparseSegmentReductionGradShapeFn(InferenceContext* c) {
   return Status::OK();
 }
 
+Status SparseSegmentReductionWithNumSegmentsShapeFn(InferenceContext* c) {
+  ShapeHandle data_shape;
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &data_shape));
+
+  ShapeHandle indices_shape;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &indices_shape));
+
+  ShapeHandle segment_ids_shape;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &segment_ids_shape));
+
+  ShapeHandle num_segments_shape;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &num_segments_shape));
+
+  // indices and segment_ids should merge cleanly.
+  ShapeHandle unused;
+  TF_RETURN_IF_ERROR(c->Merge(indices_shape, segment_ids_shape, &unused));
+
+  ShapeHandle subshape;
+  TF_RETURN_IF_ERROR(c->Subshape(data_shape, 1, &subshape));
+
+  ShapeHandle out;
+  const Tensor* dim0 = c->input_tensor(3);
+  if (dim0 == nullptr) {
+    // We don't have the value at inference time, so the output
+    // shape is unknown.
+    TF_RETURN_IF_ERROR(c->Concatenate(c->Vector(InferenceContext::kUnknownDim),
+                                      subshape, &out));
+  } else {
+    auto dim0_value = dim0->scalar<int32>()();
+    if (dim0_value < 0) {
+      return errors::InvalidArgument(
+          "Cannot specify a negative value for num_segments");
+    }
+    TF_RETURN_IF_ERROR(c->Concatenate(c->Vector(dim0_value), subshape, &out));
+  }
+  c->set_output(0, out);
+  return Status::OK();
+}
+
 Status UnsortedSegmentReductionShapeFn(InferenceContext* c) {
   ShapeHandle s_data = c->input(0);
   ShapeHandle s_segment_ids = c->input(1);
@@ -1811,10 +1857,11 @@ output: Has same shape as data, except for dimension 0 which
 REGISTER_OP("UnsortedSegmentSum")
     .Input("data: T")
     .Input("segment_ids: Tindices")
-    .Input("num_segments: int32")
+    .Input("num_segments: Tnumsegments")
     .Output("output: T")
     .Attr("T: numbertype")
     .Attr("Tindices: {int32,int64}")
+    .Attr("Tnumsegments: {int32,int64} = DT_INT32")
     .SetShapeFn(UnsortedSegmentReductionShapeFn)
     .Doc(R"doc(
 Computes the sum along segments of a tensor.
@@ -1849,10 +1896,11 @@ output: Has same shape as data, except for the first `segment_ids.rank`
 REGISTER_OP("UnsortedSegmentMax")
     .Input("data: T")
     .Input("segment_ids: Tindices")
-    .Input("num_segments: int32")
+    .Input("num_segments: Tnumsegments")
     .Output("output: T")
     .Attr("T: realnumbertype")
     .Attr("Tindices: {int32,int64}")
+    .Attr("Tnumsegments: {int32,int64} = DT_INT32")
     .SetShapeFn(UnsortedSegmentReductionShapeFn)
     .Doc(R"doc(
 Computes the Max along segments of a tensor.
@@ -1881,6 +1929,7 @@ output: Has same shape as data, except for dimension 0 which
 has size `num_segments`.
 
 )doc");
+
 REGISTER_OP("SparseSegmentSum")
     .Input("data: T")
     .Input("indices: Tidx")
@@ -1929,6 +1978,56 @@ output: Has same shape as data, except for dimension 0 which
   has size `k`, the number of segments.
 )doc");
 
+REGISTER_OP("SparseSegmentSumWithNumSegments")
+    .Input("data: T")
+    .Input("indices: Tidx")
+    .Input("segment_ids: int32")
+    .Input("num_segments: Tnumsegments")
+    .Output("output: T")
+    .Attr("T: realnumbertype")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .Attr("Tnumsegments: {int32,int64} = DT_INT32")
+    .SetShapeFn(SparseSegmentReductionWithNumSegmentsShapeFn)
+    .Doc(R"doc(
+Computes the sum along sparse segments of a tensor.
+
+Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
+misisng, the `output` tensor at that position will be zeroed.
+
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+
+For example:
+
+```python
+c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+
+tf.sparse_segment_sum_with_num_segments(
+    c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
+# => [[0 0 0 0]
+#     [0 0 0 0]
+#     [0 0 0 0]]
+
+tf.sparse_segment_sum_with_num_segments(c,
+                                        tf.constant([0, 1]),
+                                        tf.constant([0, 2],
+                                        num_segments=4))
+# => [[ 1  2  3  4]
+#     [ 0  0  0  0]
+#     [-1 -2 -3 -4]
+#     [ 0  0  0  0]]
+```
+
+indices: A 1-D tensor. Has same rank as `segment_ids`.
+
+segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+
+num_segments: Should equal the number of distinct segment IDs.
+
+output: Has same shape as data, except for dimension 0 which
+  has size `num_segments`.
+)doc");
+
 REGISTER_OP("SparseSegmentMean")
     .Input("data: T")
     .Input("indices: Tidx")
@@ -1955,6 +2054,35 @@ output: Has same shape as data, except for dimension 0 which
 
 )doc");
 
+REGISTER_OP("SparseSegmentMeanWithNumSegments")
+    .Input("data: T")
+    .Input("indices: Tidx")
+    .Input("segment_ids: int32")
+    .Input("num_segments: Tnumsegments")
+    .Output("output: T")
+    .Attr("T: {float, double}")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .Attr("Tnumsegments: {int32,int64} = DT_INT32")
+    .SetShapeFn(SparseSegmentReductionWithNumSegmentsShapeFn)
+    .Doc(R"doc(
+Computes the mean along sparse segments of a tensor.
+
+Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
+misisng, the `output` tensor at that position will be zeroed.
+
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+
+indices: A 1-D tensor. Has same rank as `segment_ids`.
+
+segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+
+num_segments: Should equal the number of distinct segment IDs.
+
+output: Has same shape as data, except for dimension 0 which has size
+    `num_segments`.
+)doc");
+
 REGISTER_OP("SparseSegmentMeanGrad")
     .Input("grad: T")
     .Input("indices: Tidx")
@@ -2001,6 +2129,38 @@ output: Has same shape as data, except for dimension 0 which
 
 )doc");
 
+REGISTER_OP("SparseSegmentSqrtNWithNumSegments")
+    .Input("data: T")
+    .Input("indices: Tidx")
+    .Input("segment_ids: int32")
+    .Input("num_segments: Tnumsegments")
+    .Output("output: T")
+    .Attr("T: {float, double}")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .Attr("Tnumsegments: {int32,int64} = DT_INT32")
+    .SetShapeFn(SparseSegmentReductionWithNumSegmentsShapeFn)
+    .Doc(R"doc(
+Computes the sum along sparse segments of a tensor divided by the sqrt of N.
+
+N is the size of the segment being reduced.
+
+Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
+misisng, the `output` tensor at that position will be zeroed.
+
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+
+indices: A 1-D tensor. Has same rank as `segment_ids`.
+
+segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+
+num_segments: Should equal the number of distinct segment IDs.
+
+output: Has same shape as data, except for dimension 0 which
+  has size `k`, the number of segments.
+
+)doc");
+
 REGISTER_OP("SparseSegmentSqrtNGrad")
     .Input("grad: T")
     .Input("indices: Tidx")
@@ -2103,7 +2263,7 @@ REGISTER_OP("Range")
     .Input("limit: Tidx")
     .Input("delta: Tidx")
     .Output("output: Tidx")
-    .Attr("Tidx: {float, double, int32, int64} = DT_INT32")
+    .Attr("Tidx: {bfloat16, float, double, int32, int64} = DT_INT32")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_WITH_CONTEXT_IF_ERROR(c->WithRank(c->input(0), 0, &unused),
@@ -2158,7 +2318,7 @@ REGISTER_OP("LinSpace")
     .Input("stop: T")
     .Input("num: Tidx")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
diff --git a/tensorflow/core/ops/math_ops_test.cc b/tensorflow/core/ops/math_ops_test.cc
index 3dfa776d26f53c5f341332b3a2bdf5fd95067049..ca3772e6f89805b70f05f1c9fd5e36ee99f2d510 100644
--- a/tensorflow/core/ops/math_ops_test.cc
+++ b/tensorflow/core/ops/math_ops_test.cc
@@ -522,7 +522,7 @@ TEST(MathOpsTest, Cross_ShapeFn) {
   INFER_ERROR("Dimension 0 in both shapes must be equal, but", op, "[3];[5]");
   INFER_ERROR("Dimension must be 3 but", op, "[3,5];[3,5]");
 
-  INFER_OK(op, "?;?", "?");
+  INFER_OK(op, "?;?", "in0");
   INFER_OK(op, "[?];[?]", "in0");
   INFER_OK(op, "[1,?,3];[?,?,?]", "in0");
 }
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index a242a13878bba3408387f7565397218b4be5ffe4..df2d4a7123a5bfb81366cb18e092a121de84f916 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -73,7 +73,7 @@ REGISTER_OP("AvgPool")
     .Attr("strides: list(int) >= 4")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::AvgPoolShape)
     .Doc(R"doc(
 Performs average pooling on the input.
@@ -101,7 +101,7 @@ REGISTER_OP("AvgPoolGrad")
     .Attr("strides: list(int) >= 4")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
@@ -300,7 +300,7 @@ REGISTER_OP("FusedBatchNormV2")
     .Output("batch_variance: U")
     .Output("reserve_space_1: U")
     .Output("reserve_space_2: U")
-    .Attr("T: {half, float}")
+    .Attr("T: {half, bfloat16, float}")
     .Attr("U: {float}")
     .Attr("epsilon: float = 0.0001")
     .Attr("data_format: string = 'NHWC'")
@@ -359,7 +359,7 @@ The size of 1D Tensors matches the dimension C of the 4D Tensors.
 y_backprop: A 4D Tensor for the gradient with respect to y.
 x: A 4D Tensor for input data.
 scale: A 1D Tensor for scaling factor, to scale the normalized x.
-reserve_space_1: When is_training is True, a 1D Tensor for the computed batch 
+reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
                  mean to be reused in gradient computation. When is_training is
                  False, a 1D Tensor for the population mean to be reused in both
                  1st and 2nd order gradient computation.
@@ -393,7 +393,7 @@ REGISTER_OP("FusedBatchNormGradV2")
     .Output("offset_backprop: U")
     .Output("reserve_space_3: U")
     .Output("reserve_space_4: U")
-    .Attr("T: {half, float}")
+    .Attr("T: {half, bfloat16, float}")
     .Attr("U: {float}")
     .Attr("epsilon: float = 0.0001")
     .Attr("data_format: string = 'NHWC'")
@@ -407,7 +407,7 @@ The size of 1D Tensors matches the dimension C of the 4D Tensors.
 y_backprop: A 4D Tensor for the gradient with respect to y.
 x: A 4D Tensor for input data.
 scale: A 1D Tensor for scaling factor, to scale the normalized x.
-reserve_space_1: When is_training is True, a 1D Tensor for the computed batch 
+reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
                  mean to be reused in gradient computation. When is_training is
                  False, a 1D Tensor for the population mean to be reused in both
                  1st and 2nd order gradient computation.
@@ -508,11 +508,12 @@ REGISTER_OP("Conv2D")
     .Input("input: T")
     .Input("filter: T")
     .Output("output: T")
-    .Attr("T: {half, float}")
+    .Attr("T: {half, bfloat16, float}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn(shape_inference::Conv2DShape)
     .Doc(R"doc(
 Computes a 2-D convolution given 4-D `input` and `filter` tensors.
@@ -546,7 +547,7 @@ filter: A 4-D tensor of shape
 output: A 4-D tensor. The dimension order is determined by the value of
     `data_format`, see below for details.
 strides: 1-D tensor of length 4.  The stride of the sliding window for each
-  dimension of `input`. The dimension order is determined by the value of
+    dimension of `input`. The dimension order is determined by the value of
     `data_format`, see below for details.
 padding: The type of padding algorithm to use.
 data_format: Specify the data format of the input and output data. With the
@@ -554,6 +555,11 @@ data_format: Specify the data format of the input and output data. With the
         [batch, height, width, channels].
     Alternatively, the format could be "NCHW", the data storage order of:
         [batch, channels, height, width].
+dilations: 1-D tensor of length 4.  The dilation factor for each dimension of
+    `input`. If set to k > 1, there will be k-1 skipped cells between each
+    filter element on that dimension. The dimension order is determined by the
+    value of `data_format`, see above for details. Dilations in the batch and
+    depth dimensions must be 1.
 )doc");
 
 REGISTER_OP("Conv2DBackpropInput")
@@ -561,11 +567,12 @@ REGISTER_OP("Conv2DBackpropInput")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {half, float}")
+    .Attr("T: {half, bfloat16, float}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
@@ -589,10 +596,15 @@ padding: The type of padding algorithm to use.
 output: 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
   w.r.t. the input of the convolution.
 data_format: Specify the data format of the input and output data. With the
-    default format "NHWC", the data is stored in the order of:
-        [batch, in_height, in_width, in_channels].
-    Alternatively, the format could be "NCHW", the data storage order of:
-        [batch, in_channels, in_height, in_width].
+  default format "NHWC", the data is stored in the order of:
+      [batch, in_height, in_width, in_channels].
+  Alternatively, the format could be "NCHW", the data storage order of:
+      [batch, in_channels, in_height, in_width].
+dilations: 1-D tensor of length 4.  The dilation factor for each dimension of
+  `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+  element on that dimension. The dimension order is determined by the value of
+  `data_format`, see above for details. Dilations in the batch and depth
+  dimensions must be 1.
 )doc");
 
 // TODO(jeff): Instead of 'use_cudnn_for_gpu', maybe we should have a
@@ -603,11 +615,12 @@ REGISTER_OP("Conv2DBackpropFilter")
     .Input("filter_sizes: int32")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {half, float}")
+    .Attr("T: {half, bfloat16, float}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &s));
@@ -632,10 +645,15 @@ output: 4-D with shape
   `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
   the `filter` input of the convolution.
 data_format: Specify the data format of the input and output data. With the
-    default format "NHWC", the data is stored in the order of:
-        [batch, in_height, in_width, in_channels].
-    Alternatively, the format could be "NCHW", the data storage order of:
-        [batch, in_channels, in_height, in_width].
+  default format "NHWC", the data is stored in the order of:
+      [batch, in_height, in_width, in_channels].
+  Alternatively, the format could be "NCHW", the data storage order of:
+      [batch, in_channels, in_height, in_width].
+dilations: 1-D tensor of length 4.  The dilation factor for each dimension of
+  `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+  element on that dimension. The dimension order is determined by the value of
+  `data_format`, see above for details. Dilations in the batch and depth
+  dimensions must be 1.
 )doc");
 
 namespace {
@@ -733,6 +751,40 @@ Status CommonFusedConvCalculations(InferenceContext* c, bool has_resize) {
 
 }  // namespace
 
+REGISTER_OP("DataFormatDimMap")
+    .Input("x: T")
+    .Output("y: T")
+    .Attr("T: {int32, int64} = DT_INT32")
+    .Attr("src_format: string = 'NHWC'")
+    .Attr("dst_format: string = 'NCHW'")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+Returns the dimension index in the destination data format given the one in
+the source data format.
+
+x: Scalar. Dimension index in source data format. Must be in the range [-4, 4).
+y: Scalar. Dimension index in destination data format.
+src_format: source data format.
+dst_format: destination data format.
+)doc");
+
+REGISTER_OP("DataFormatVecPermute")
+    .Input("x: T")
+    .Output("y: T")
+    .Attr("T: {int32, int64} = DT_INT32")
+    .Attr("src_format: string = 'NHWC'")
+    .Attr("dst_format: string = 'NCHW'")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+Returns the permuted vector/tensor in the destination data format given the
+one in the source data format.
+
+x: Vector of size 4 or Tensor of shape (4, 2) in source data format.
+y: Vector of size 4 or Tensor of shape (4, 2) in destination data format.
+src_format: source data format.
+dst_format: destination data format.
+)doc");
+
 REGISTER_OP("FusedResizeAndPadConv2D")
     .Input("input: T")
     .Input("size: int32")
@@ -819,10 +871,11 @@ REGISTER_OP("DepthwiseConv2dNative")
     .Input("input: T")
     .Input("filter: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn(shape_inference::DepthwiseConv2DNativeShape)
     .Doc(R"doc(
 Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors.
@@ -845,7 +898,6 @@ for k in 0..in_channels-1
 
 Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
 horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
-
 strides: 1-D of length 4.  The stride of the sliding window for each dimension
   of `input`.
 padding: The type of padding algorithm to use.
@@ -854,6 +906,11 @@ data_format: Specify the data format of the input and output data. With the
         [batch, height, width, channels].
     Alternatively, the format could be "NCHW", the data storage order of:
         [batch, channels, height, width].
+dilations: 1-D tensor of length 4.  The dilation factor for each dimension of
+  `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+  element on that dimension. The dimension order is determined by the value of
+  `data_format`, see above for details. Dilations in the batch and depth
+  dimensions must be 1.
 )doc");
 
 REGISTER_OP("DepthwiseConv2dNativeBackpropInput")
@@ -861,10 +918,11 @@ REGISTER_OP("DepthwiseConv2dNativeBackpropInput")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
@@ -892,6 +950,11 @@ data_format: Specify the data format of the input and output data. With the
         [batch, height, width, channels].
     Alternatively, the format could be "NCHW", the data storage order of:
         [batch, channels, height, width].
+dilations: 1-D tensor of length 4.  The dilation factor for each dimension of
+  `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+  element on that dimension. The dimension order is determined by the value of
+  `data_format`, see above for details. Dilations in the batch and depth
+  dimensions must be 1.
 output: 4-D with shape according to `data_format`.  For example, if
   `data_format` is 'NHWC', output shape is `[batch, in_height,
   in_width, in_channels]`.  Gradient w.r.t. the input of the
@@ -903,10 +966,11 @@ REGISTER_OP("DepthwiseConv2dNativeBackpropFilter")
     .Input("filter_sizes: int32")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &s));
@@ -935,6 +999,11 @@ data_format: Specify the data format of the input and output data. With the
         [batch, height, width, channels].
     Alternatively, the format could be "NCHW", the data storage order of:
         [batch, channels, height, width].
+dilations: 1-D tensor of length 4.  The dilation factor for each dimension of
+  `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+  element on that dimension. The dimension order is determined by the value of
+  `data_format`, see above for details. Dilations in the batch and depth
+  dimensions must be 1.
 output: 4-D with shape
   `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
   the `filter` input of the convolution.
@@ -945,10 +1014,11 @@ REGISTER_OP("Conv3D")
     .Input("input: T")
     .Input("filter: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
     .SetShapeFn(shape_inference::Conv3DShape)
     .Doc(R"doc(
 Computes a 3-D convolution given 5-D `input` and `filter` tensors.
@@ -970,6 +1040,11 @@ data_format: The data format of the input and output data. With the
         [batch, in_depth, in_height, in_width, in_channels].
     Alternatively, the format could be "NCDHW", the data storage order is:
         [batch, in_channels, in_depth, in_height, in_width].
+dilations: 1-D tensor of length 5.  The dilation factor for each dimension of
+    `input`. If set to k > 1, there will be k-1 skipped cells between each
+    filter element on that dimension. The dimension order is determined by the
+    value of `data_format`, see above for details. Dilations in the batch and
+    depth dimensions must be 1.
 )doc");
 
 REGISTER_OP("Conv3DBackpropInput")
@@ -1032,10 +1107,11 @@ REGISTER_OP("Conv3DBackpropInputV2")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
@@ -1061,6 +1137,11 @@ data_format: The data format of the input and output data. With the
         [batch, in_depth, in_height, in_width, in_channels].
     Alternatively, the format could be "NCDHW", the data storage order is:
         [batch, in_channels, in_depth, in_height, in_width].
+dilations: 1-D tensor of length 5.  The dilation factor for each dimension of
+    `input`. If set to k > 1, there will be k-1 skipped cells between each
+    filter element on that dimension. The dimension order is determined by the
+    value of `data_format`, see above for details. Dilations in the batch and
+    depth dimensions must be 1.
 
 )doc");
 
@@ -1069,10 +1150,11 @@ REGISTER_OP("Conv3DBackpropFilterV2")
     .Input("filter_sizes: int32")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &s));
@@ -1098,6 +1180,11 @@ data_format: The data format of the input and output data. With the
         [batch, in_depth, in_height, in_width, in_channels].
     Alternatively, the format could be "NCDHW", the data storage order is:
         [batch, in_channels, in_depth, in_height, in_width].
+dilations: 1-D tensor of length 5.  The dilation factor for each dimension of
+    `input`. If set to k > 1, there will be k-1 skipped cells between each
+    filter element on that dimension. The dimension order is determined by the
+    value of `data_format`, see above for details. Dilations in the batch and
+    depth dimensions must be 1.
 
 )doc");
 
@@ -1110,7 +1197,7 @@ REGISTER_OP("AvgPool3D")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .SetShapeFn(shape_inference::Pool3DShape)
     .Doc(R"doc(
 Performs 3D average pooling on the input.
@@ -1137,7 +1224,7 @@ REGISTER_OP("AvgPool3DGrad")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
@@ -1172,7 +1259,7 @@ REGISTER_OP("MaxPool3D")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {float}")
+    .Attr("T: {bfloat16, float}")
     .SetShapeFn(shape_inference::Pool3DShape)
     .Doc(R"doc(
 Performs 3D max pooling on the input.
@@ -1200,8 +1287,8 @@ REGISTER_OP("MaxPool3DGrad")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {float} = DT_FLOAT")
-    .Attr("TInput: {float} = DT_FLOAT")
+    .Attr("T: {bfloat16, float} = DT_FLOAT")
+    .Attr("TInput: {bfloat16, float} = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
       return UnchangedShapeWithRank(c, 5);
     })
@@ -1266,7 +1353,7 @@ data_format: The data format of the input and output data. With the
 REGISTER_OP("L2Loss")
     .Input("t: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::ScalarShape)
     .Doc(R"doc(
 L2 Loss.
@@ -1288,7 +1375,7 @@ REGISTER_OP("LRN")
     .Attr("bias: float = 1.0")
     .Attr("alpha: float = 1.0")
     .Attr("beta: float = 0.5")
-    .Attr("T: {float, half} = DT_FLOAT")
+    .Attr("T: {half, bfloat16, float} = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
       return UnchangedShapeWithRank(c, 4);
     })
@@ -1323,7 +1410,7 @@ REGISTER_OP("LRNGrad")
     .Attr("bias: float = 1.0")
     .Attr("alpha: float = 1.0")
     .Attr("beta: float = 0.5")
-    .Attr("T: {float, half} = DT_FLOAT")
+    .Attr("T: {half, bfloat16, float} = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &s));  // input_grads
@@ -1349,8 +1436,8 @@ output: The gradients for LRN.
 
 REGISTER_OP("MaxPool")
     .Attr(
-        "T: {float, double, int32, int64, uint8, int16, int8, uint16, "
-        "half, qint8} = DT_FLOAT")
+        "T: {half, bfloat16, float, double, int32, int64, uint8, int16, int8, "
+        "uint16, qint8} = DT_FLOAT")
     .Attr("ksize: list(int) >= 4")
     .Attr("strides: list(int) >= 4")
     .Attr(GetPaddingAttrString())
@@ -1376,8 +1463,8 @@ output: The max pooled output tensor.
 
 REGISTER_OP("MaxPoolV2")
     .Attr(
-        "T: {float, double, int32, int64, uint8, int16, int8, uint16, "
-        "half, qint8} = DT_FLOAT")
+        "T: {half, bfloat16, float, double, int32, int64, uint8, int16, int8, "
+        "uint16, qint8} = DT_FLOAT")
     .Attr(GetPaddingAttrString())
     .Attr("data_format: {'NHWC', 'NCHW', 'NCHW_VECT_C'} = 'NHWC'")
     .Input("input: T")
@@ -1860,7 +1947,7 @@ backprops: The gradients:
 REGISTER_OP("Elu")
     .Input("features: T")
     .Output("activations: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
@@ -1873,7 +1960,7 @@ REGISTER_OP("EluGrad")
     .Input("gradients: T")
     .Input("outputs: T")
     .Output("backprops: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
     .Doc(R"doc(
 Computes gradients for the exponential linear (Elu) operation.
@@ -1887,7 +1974,7 @@ backprops: The gradients: `gradients * (outputs + 1)` if outputs < 0,
 REGISTER_OP("Selu")
     .Input("features: T")
     .Output("activations: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`
@@ -1900,7 +1987,7 @@ REGISTER_OP("SeluGrad")
     .Input("gradients: T")
     .Input("outputs: T")
     .Output("backprops: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
     .Doc(R"doc(
 Computes gradients for the scaled exponential linear (Selu) operation.
@@ -1962,7 +2049,7 @@ backprops: The gradients: `gradients / (1 + abs(features)) ** 2`.
 REGISTER_OP("Softmax")
     .Input("logits: T")
     .Output("softmax: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 1);
     })
@@ -1982,7 +2069,7 @@ softmax: Same shape as `logits`.
 REGISTER_OP("LogSoftmax")
     .Input("logits: T")
     .Output("logsoftmax: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 1);
     })
@@ -2004,7 +2091,7 @@ REGISTER_OP("SoftmaxCrossEntropyWithLogits")
     .Input("labels: T")
     .Output("loss: T")
     .Output("backprop: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &input));
@@ -2033,7 +2120,7 @@ REGISTER_OP("SparseSoftmaxCrossEntropyWithLogits")
     .Input("labels: Tlabels")
     .Output("loss: T")
     .Output("backprop: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("Tlabels: {int32, int64} = DT_INT64")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle features;
@@ -2613,6 +2700,7 @@ REGISTER_OP("QuantizedConv2D")
     .Attr("out_type: quantizedtype = DT_QINT32")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
       ShapeHandle unused;
@@ -2641,7 +2729,11 @@ min_filter: The float value that the lowest quantized filter value represents.
 max_filter: The float value that the highest quantized filter value represents.
 min_output: The float value that the lowest quantized output value represents.
 max_output: The float value that the highest quantized output value represents.
-
+dilations: 1-D tensor of length 4.  The dilation factor for each dimension of
+    `input`. If set to k > 1, there will be k-1 skipped cells between each
+    filter element on that dimension. The dimension order is determined by the
+    value of `data_format`, see above for details. Dilations in the batch and
+    depth dimensions must be 1.
 )doc");
 
 REGISTER_OP("QuantizedMaxPool")
@@ -2866,6 +2958,25 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
+REGISTER_OP("__MklDummyConv2DWithBias")
+    .Input("input: T")
+    .Input("filter: T")
+    .Input("bias: T")
+    .Output("output: T")
+    .Attr("T: {half, float, double}")
+    .Attr("strides: list(int)")
+    .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Doc(R"doc(
+Dummy node that enables fusing Conv2D and BiasAdd operator for MKL. This node
+does not perform anything. It is just created as an intermediate output of
+merging Conv2D and BiasAdd.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
 REGISTER_OP("_MklConv2DWithBias")
     .Input("input: T")
     .Input("filter: T")
@@ -2919,6 +3030,88 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
+REGISTER_OP("__MklDummyConv2DBackpropFilterWithBias")
+    .Input("input: T")
+    .Input("filter_sizes: int32")
+    .Input("out_backprop: T")
+    .Output("output: T")
+    .Output("bias_grad: T")
+    .Attr("T: {half, float, double}")
+    .Attr("strides: list(int)")
+    .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle input_shape;
+      // Fetch the data_format attribute, which may not exist.
+      string data_format;
+      Status s = c->GetAttr("data_format", &data_format);
+
+      if (s.ok() && data_format == "NCHW") {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape));
+        c->set_output(1, c->Vector(c->Dim(input_shape, -3)));
+      } else {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape));
+        c->set_output(1, c->Vector(c->Dim(input_shape, -1)));
+      }
+      ShapeHandle sh;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &sh));
+      TF_RETURN_IF_ERROR(c->WithRank(sh, 4, &sh));
+      c->set_output(0, sh);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Dummy node that enables fusing Conv2DBackpropFilter and BiasAddGrad operator
+for MKL. This node does not perform anything. It is just created as an
+intermediate output of merging Conv2DBackpropFilter and BiasAddGrad.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklConv2DBackpropFilterWithBias")
+    .Input("input: T")
+    .Input("filter_sizes: int32")
+    .Input("out_backprop: T")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter_size: uint8")
+    .Input("mkl_out_backprop: uint8")
+    .Output("output: T")
+    .Output("bias_grad: T")
+    .Output("mkl_output: uint8")
+    .Output("mkl_bias_grad: uint8")
+    .Attr("T: {half, float, double}")
+    .Attr("strides: list(int)")
+    .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle input_shape;
+      // Fetch the data_format attribute, which may not exist.
+      string data_format;
+      Status s = c->GetAttr("data_format", &data_format);
+
+      if (s.ok() && data_format == "NCHW") {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape));
+        c->set_output(1, c->Vector(c->Dim(input_shape, -3)));
+      } else {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape));
+        c->set_output(1, c->Vector(c->Dim(input_shape, -1)));
+      }
+      ShapeHandle sh;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &sh));
+      TF_RETURN_IF_ERROR(c->WithRank(sh, 4, &sh));
+      c->set_output(0, sh);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+MKL version of Conv2DBackpropFilterWithBias. Uses MKL DNN APIs to compute the
+gradients of convolution with respect to the filter.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
 REGISTER_OP("_MklConv2DWithBiasBackpropBias")
     .Input("out_backprop: T")
     .Input("mkl_out_backprop: uint8")
@@ -2995,6 +3188,78 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
+REGISTER_OP("_MklElu")
+    .Input("features: T")
+    .Input("mkl_features: uint8")
+    .Output("activations: T")
+    .Output("mkl_activations: uint8")
+    .Attr("T: realnumbertype")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+MKL version of Elu operator. Uses MKL DNN APIs to implement Elu operator.
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklEluGrad")
+    .Input("gradients: T")
+    .Input("features: T")
+    .Input("mkl_gradients: uint8")
+    .Input("mkl_features: uint8")
+    .Output("backprops: T")
+    .Output("mkl_backprops: uint8")
+    .Attr("T: realnumbertype")
+    .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
+    .Doc(R"doc(
+MKL version of EluGrad operator. Uses MKL DNN APIs to compute Elu
+gradients for Elu operation.
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklSoftmax")
+    .Input("logits: T")
+    .Input("mkl_logits: uint8")
+    .Output("softmax: T")
+    .Output("mkl_softmax: uint8")
+    .Attr("T: {half, float, double}")
+    .SetShapeFn([](InferenceContext* c) {
+      return shape_inference::UnchangedShapeWithRankAtLeast(c, 1);
+    })
+    .Doc(R"doc(
+MKL version of ReluGrad operator. Uses MKL DNN APIs to compute rectified
+linear gradients for Relu operation.
+)doc");
+
+REGISTER_OP("_MklTanh")
+    .Input("features: T")
+    .Input("mkl_features: uint8")
+    .Output("activations: T")
+    .Output("mkl_activations: uint8")
+    .Attr("T: realnumbertype")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+MKL version of Tanh operator. Uses MKL DNN APIs to implement Tanh operator.
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklTanhGrad")
+    .Input("gradients: T")
+    .Input("features: T")
+    .Input("mkl_gradients: uint8")
+    .Input("mkl_features: uint8")
+    .Output("backprops: T")
+    .Output("mkl_backprops: uint8")
+    .Attr("T: realnumbertype")
+    .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
+    .Doc(R"doc(
+MKL version of TanhGrad operator. Uses MKL DNN APIs to compute tanh
+gradients for Tanh operation.
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
 REGISTER_OP("_MklMaxPool")
     .Attr("T: {float, half} = DT_FLOAT")
     .Attr("ksize: list(int) >= 4")
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index a2a2e8ddd063f07fdfa9539afe11d8e5ea101cc7..6382a2fb79d736e335ce019c70c2e0591814fcfe 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -34,6 +34,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -83,6 +84,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -136,6 +138,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -213,6 +216,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -235,6 +239,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -262,6 +267,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -291,6 +297,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -392,6 +399,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
         type: DT_VARIANT
       }
     }
@@ -466,6 +474,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -835,6 +844,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -900,6 +910,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -986,6 +997,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -1081,6 +1093,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -1103,6 +1116,87 @@ op {
   summary: "Update \'*var\' according to the Adam algorithm."
   description: "lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)\nm_t <- beta1 * m_{t-1} + (1 - beta1) * g_t\nv_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t\nvariable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)"
 }
+op {
+  name: "ApplyAddSign"
+  input_arg {
+    name: "var"
+    description: "Should be from a Variable()."
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "m"
+    description: "Should be from a Variable()."
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    description: "Scaling factor. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "alpha"
+    description: "Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    description: "Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    description: "Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    description: "The gradient."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    description: "Same as \"var\"."
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If `True`, updating of the var and m tensors is\nprotected by a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
+  }
+  summary: "Update \'*var\' according to the AddSign update."
+  description: "m_t <- beta1 * m_{t-1} + (1 - beta1) * g\nupdate <- (alpha + sign_decay * sign(g) *sign(m)) * g\nvariable <- variable - lr_t * update"
+}
 op {
   name: "ApplyCenteredRMSProp"
   input_arg {
@@ -1180,6 +1274,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -1266,6 +1361,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -1356,6 +1452,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -1415,6 +1512,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -1484,6 +1582,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -1506,6 +1605,87 @@ op {
   summary: "Update \'*var\' according to the momentum scheme. Set use_nesterov = True if you"
   description: "want to use Nesterov momentum.\n\naccum = accum * momentum + grad\nvar -= lr * accum"
 }
+op {
+  name: "ApplyPowerSign"
+  input_arg {
+    name: "var"
+    description: "Should be from a Variable()."
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "m"
+    description: "Should be from a Variable()."
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    description: "Scaling factor. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "logbase"
+    description: "Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    description: "Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    description: "Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    description: "The gradient."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    description: "Same as \"var\"."
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If `True`, updating of the var and m tensors is\nprotected by a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
+  }
+  summary: "Update \'*var\' according to the AddSign update."
+  description: "m_t <- beta1 * m_{t-1} + (1 - beta1) * g\nupdate <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g\nvariable <- variable - lr_t * update"
+}
 op {
   name: "ApplyProximalAdagrad"
   input_arg {
@@ -1567,6 +1747,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -1636,6 +1817,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -1721,6 +1903,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -1770,6 +1953,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -1819,6 +2003,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -1887,6 +2072,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -2003,6 +2189,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -2030,6 +2217,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -2152,6 +2340,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -2227,6 +2416,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -2299,6 +2489,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -2329,6 +2520,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -2353,6 +2545,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -2526,6 +2719,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -2590,6 +2784,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -2658,6 +2853,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -2727,6 +2923,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -3123,6 +3320,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -3468,6 +3666,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -3561,6 +3760,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -3857,6 +4057,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -3910,6 +4111,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -3968,6 +4170,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -4026,6 +4229,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT64
@@ -4050,6 +4254,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT64
@@ -4270,6 +4475,34 @@ op {
   summary: "Bucketizes \'input\' based on \'boundaries\'."
   description: "For example, if the inputs are\n    boundaries = [0, 10, 100]\n    input = [[-5, 10000]\n             [150,   10]\n             [5,    100]]\n\nthen the output will be\n    output = [[0, 3]\n              [3, 2]\n              [1, 3]]"
 }
+op {
+  name: "BytesProducedStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Records the bytes size of each element of `input_dataset` in a StatsAggregator."
+}
 op {
   name: "CTCBeamSearchDecoder"
   input_arg {
@@ -4498,6 +4731,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -4521,6 +4755,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -4920,6 +5155,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -5054,6 +5290,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -5061,7 +5298,7 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
-    description: "1-D tensor of length 4.  The stride of the sliding window for each\ndimension of `input`. The dimension order is determined by the value of\n  `data_format`, see below for details."
+    description: "1-D tensor of length 4.  The stride of the sliding window for each\ndimension of `input`. The dimension order is determined by the value of\n`data_format`, see below for details."
   }
   attr {
     name: "use_cudnn_on_gpu"
@@ -5095,6 +5332,19 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+    description: "1-D tensor of length 4.  The dilation factor for each dimension of\n`input`. If set to k > 1, there will be k-1 skipped cells between each\nfilter element on that dimension. The dimension order is determined by the\nvalue of `data_format`, see above for details. Dilations in the batch and\ndepth dimensions must be 1."
+  }
   summary: "Computes a 2-D convolution given 4-D `input` and `filter` tensors."
   description: "Given an input tensor of shape `[batch, in_height, in_width, in_channels]`\nand a filter / kernel tensor of shape\n`[filter_height, filter_width, in_channels, out_channels]`, this op\nperforms the following:\n\n1. Flattens the filter to a 2-D matrix with shape\n   `[filter_height * filter_width * in_channels, output_channels]`.\n2. Extracts image patches from the input tensor to form a *virtual*\n   tensor of shape `[batch, out_height, out_width,\n   filter_height * filter_width * in_channels]`.\n3. For each patch, right-multiplies the filter matrix and the image patch\n   vector.\n\nIn detail, with the default NHWC format,\n\n    output[b, i, j, k] =\n        sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *\n                        filter[di, dj, q, k]\n\nMust have `strides[0] = strides[3] = 1`.  For the most common case of the same\nhorizontal and vertices strides, `strides = [1, stride, stride, 1]`."
 }
@@ -5126,6 +5376,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -5167,6 +5418,19 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+    description: "1-D tensor of length 4.  The dilation factor for each dimension of\n`input`. If set to k > 1, there will be k-1 skipped cells between each filter\nelement on that dimension. The dimension order is determined by the value of\n`data_format`, see above for details. Dilations in the batch and depth\ndimensions must be 1."
+  }
   summary: "Computes the gradients of convolution with respect to the filter."
 }
 op {
@@ -5197,6 +5461,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -5238,6 +5503,19 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+    description: "1-D tensor of length 4.  The dilation factor for each dimension of\n`input`. If set to k > 1, there will be k-1 skipped cells between each filter\nelement on that dimension. The dimension order is determined by the value of\n`data_format`, see above for details. Dilations in the batch and depth\ndimensions must be 1."
+  }
   summary: "Computes the gradients of convolution with respect to the input."
 }
 op {
@@ -5262,6 +5540,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -5299,6 +5578,20 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+    description: "1-D tensor of length 5.  The dilation factor for each dimension of\n`input`. If set to k > 1, there will be k-1 skipped cells between each\nfilter element on that dimension. The dimension order is determined by the\nvalue of `data_format`, see above for details. Dilations in the batch and\ndepth dimensions must be 1."
+  }
   summary: "Computes a 3-D convolution given 5-D `input` and `filter` tensors."
   description: "In signal processing, cross-correlation is a measure of similarity of\ntwo waveforms as a function of a time-lag applied to one of them. This\nis also known as a sliding dot product or sliding inner-product.\n\nOur Conv3D implements a form of cross-correlation."
 }
@@ -5385,6 +5678,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -5422,6 +5716,20 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+    description: "1-D tensor of length 5.  The dilation factor for each dimension of\n`input`. If set to k > 1, there will be k-1 skipped cells between each\nfilter element on that dimension. The dimension order is determined by the\nvalue of `data_format`, see above for details. Dilations in the batch and\ndepth dimensions must be 1."
+  }
   summary: "Computes the gradients of 3-D convolution with respect to the filter."
 }
 op {
@@ -5507,6 +5815,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -5544,6 +5853,20 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+    description: "1-D tensor of length 5.  The dilation factor for each dimension of\n`input`. If set to k > 1, there will be k-1 skipped cells between each\nfilter element on that dimension. The dimension order is determined by the\nvalue of `data_format`, see above for details. Dilations in the batch and\ndepth dimensions must be 1."
+  }
   summary: "Computes the gradients of 3-D convolution with respect to the input."
 }
 op {
@@ -5636,6 +5959,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -5661,6 +5985,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -5913,6 +6238,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -5972,6 +6298,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -6044,6 +6371,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -6063,6 +6391,94 @@ op {
   summary: "Compute the cumulative sum of the tensor `x` along `axis`."
   description: "By default, this op performs an inclusive cumsum, which means that the first\nelement of the input is identical to the first element of the output:\n\n```python\ntf.cumsum([a, b, c])  # => [a, a + b, a + b + c]\n```\n\nBy setting the `exclusive` kwarg to `True`, an exclusive cumsum is\nperformed instead:\n\n```python\ntf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]\n```\n\nBy setting the `reverse` kwarg to `True`, the cumsum is performed in the\nopposite direction:\n\n```python\ntf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]\n```\n\nThis is more efficient than using separate `tf.reverse` ops.\n\nThe `reverse` and `exclusive` kwargs can also be combined:\n\n```python\ntf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]\n```"
 }
+op {
+  name: "DataFormatDimMap"
+  input_arg {
+    name: "x"
+    description: "Scalar. Dimension index in source data format. Must be in the range [-4, 4)."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    description: "Scalar. Dimension index in destination data format."
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "src_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    description: "source data format."
+  }
+  attr {
+    name: "dst_format"
+    type: "string"
+    default_value {
+      s: "NCHW"
+    }
+    description: "destination data format."
+  }
+  summary: "Returns the dimension index in the destination data format given the one in"
+  description: "the source data format."
+}
+op {
+  name: "DataFormatVecPermute"
+  input_arg {
+    name: "x"
+    description: "Vector of size 4 or Tensor of shape (4, 2) in source data format."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    description: "Vector of size 4 or Tensor of shape (4, 2) in destination data format."
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "src_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    description: "source data format."
+  }
+  attr {
+    name: "dst_format"
+    type: "string"
+    default_value {
+      s: "NCHW"
+    }
+    description: "destination data format."
+  }
+  summary: "Returns the permuted vector/tensor in the destination data format given the"
+  description: "one in the source data format."
+}
 op {
   name: "DatasetToSingleElement"
   input_arg {
@@ -6857,7 +7273,7 @@ op {
     }
   }
   summary: "DepthToSpace for tensors of type T."
-  description: "Rearranges data from depth into blocks of spatial data.\nThis is the reverse transformation of SpaceToDepth. More specifically,\nthis op outputs a copy of the input tensor where values from the `depth`\ndimension are moved in spatial blocks to the `height` and `width` dimensions.\nThe attr `block_size` indicates the input block size and how the data is moved.\n\n  * Chunks of data of size `block_size * block_size` from depth are rearranged\n    into non-overlapping blocks of size `block_size x block_size`\n  * The width the output tensor is `input_depth * block_size`, whereas the\n    height is `input_height * block_size`.\n  * The Y, X coordinates within each block of the output image are determined\n    by the high order component of the input channel index.\n  * The depth of the input tensor must be divisible by\n    `block_size * block_size`.\n\nThe `data_format` attr specifies the layout of the input and output tensors\nwith the following options:\n  \"NHWC\": `[ batch, height, width, channels ]`\n  \"NCHW\": `[ batch, channels, height, width ]`\n  \"NCHW_VECT_C\":\n      `qint8 [ batch, channels / 4, height, width, channels % 4 ]`\n\nIt is useful to consider the operation as transforming a 6-D Tensor.\ne.g. for data_format = NHWC,\n     Each element in the input tensor can be specified via 6 coordinates,\n     ordered by decreasing memory layout significance as:\n     n,iY,iX,bY,bX,oC  (where n=batch index, iX, iY means X or Y coordinates\n                        within the input image, bX, bY means coordinates\n                        within the output block, oC means output channels).\n     The output would be the input transposed to the following layout:\n     n,iY,bY,iX,bX,oC\n\nThis operation is useful for resizing the activations between convolutions\n(but keeping all data), e.g. instead of pooling. It is also useful for training\npurely convolutional models.\n\nFor example, given an input of shape `[1, 1, 1, 4]`, data_format = \"NHWC\" and\nblock_size = 2:\n\n```\nx = [[[[1, 2, 3, 4]]]]\n\n```\n\nThis operation will output a tensor of shape `[1, 2, 2, 1]`:\n\n```\n   [[[[1], [2]],\n     [[3], [4]]]]\n```\n\nHere, the input has a batch of 1 and each batch element has shape `[1, 1, 4]`,\nthe corresponding output will have 2x2 elements and will have a depth of\n1 channel (1 = `4 / (block_size * block_size)`).\nThe output element shape is `[2, 2, 1]`.\n\nFor an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.\n\n```\nx = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]\n```\n\nThis operation, for block size of 2, will return the following tensor of shape\n`[1, 2, 2, 3]`\n\n```\n   [[[[1, 2, 3], [4, 5, 6]],\n     [[7, 8, 9], [10, 11, 12]]]]\n\n```\n\nSimilarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:\n\n```\nx =  [[[[1, 2, 3, 4],\n       [5, 6, 7, 8]],\n      [[9, 10, 11, 12],\n       [13, 14, 15, 16]]]]\n```\n\nthe operator will return the following tensor of shape `[1 4 4 1]`:\n\n```\nx = [[[ [1],   [2],  [5],  [6]],\n      [ [3],   [4],  [7],  [8]],\n      [ [9],  [10], [13],  [14]],\n      [ [11], [12], [15],  [16]]]]\n\n```"
+  description: "Rearranges data from depth into blocks of spatial data.\nThis is the reverse transformation of SpaceToDepth. More specifically,\nthis op outputs a copy of the input tensor where values from the `depth`\ndimension are moved in spatial blocks to the `height` and `width` dimensions.\nThe attr `block_size` indicates the input block size and how the data is moved.\n\n  * Chunks of data of size `block_size * block_size` from depth are rearranged\n    into non-overlapping blocks of size `block_size x block_size`\n  * The width the output tensor is `input_depth * block_size`, whereas the\n    height is `input_height * block_size`.\n  * The Y, X coordinates within each block of the output image are determined\n    by the high order component of the input channel index.\n  * The depth of the input tensor must be divisible by\n    `block_size * block_size`.\n\nThe `data_format` attr specifies the layout of the input and output tensors\nwith the following options:\n  \"NHWC\": `[ batch, height, width, channels ]`\n  \"NCHW\": `[ batch, channels, height, width ]`\n  \"NCHW_VECT_C\":\n      `qint8 [ batch, channels / 4, height, width, 4 ]`\n\nIt is useful to consider the operation as transforming a 6-D Tensor.\ne.g. for data_format = NHWC,\n     Each element in the input tensor can be specified via 6 coordinates,\n     ordered by decreasing memory layout significance as:\n     n,iY,iX,bY,bX,oC  (where n=batch index, iX, iY means X or Y coordinates\n                        within the input image, bX, bY means coordinates\n                        within the output block, oC means output channels).\n     The output would be the input transposed to the following layout:\n     n,iY,bY,iX,bX,oC\n\nThis operation is useful for resizing the activations between convolutions\n(but keeping all data), e.g. instead of pooling. It is also useful for training\npurely convolutional models.\n\nFor example, given an input of shape `[1, 1, 1, 4]`, data_format = \"NHWC\" and\nblock_size = 2:\n\n```\nx = [[[[1, 2, 3, 4]]]]\n\n```\n\nThis operation will output a tensor of shape `[1, 2, 2, 1]`:\n\n```\n   [[[[1], [2]],\n     [[3], [4]]]]\n```\n\nHere, the input has a batch of 1 and each batch element has shape `[1, 1, 4]`,\nthe corresponding output will have 2x2 elements and will have a depth of\n1 channel (1 = `4 / (block_size * block_size)`).\nThe output element shape is `[2, 2, 1]`.\n\nFor an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.\n\n```\nx = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]\n```\n\nThis operation, for block size of 2, will return the following tensor of shape\n`[1, 2, 2, 3]`\n\n```\n   [[[[1, 2, 3], [4, 5, 6]],\n     [[7, 8, 9], [10, 11, 12]]]]\n\n```\n\nSimilarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:\n\n```\nx =  [[[[1, 2, 3, 4],\n       [5, 6, 7, 8]],\n      [[9, 10, 11, 12],\n       [13, 14, 15, 16]]]]\n```\n\nthe operator will return the following tensor of shape `[1 4 4 1]`:\n\n```\nx = [[[ [1],   [2],  [5],  [6]],\n      [ [3],   [4],  [7],  [8]],\n      [ [9],  [10], [13],  [14]],\n      [ [11], [12], [15],  [16]]]]\n\n```"
 }
 op {
   name: "DepthwiseConv2dNative"
@@ -6878,6 +7294,8 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -6913,6 +7331,19 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+    description: "1-D tensor of length 4.  The dilation factor for each dimension of\n`input`. If set to k > 1, there will be k-1 skipped cells between each filter\nelement on that dimension. The dimension order is determined by the value of\n`data_format`, see above for details. Dilations in the batch and depth\ndimensions must be 1."
+  }
   summary: "Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors."
   description: "Given an input tensor of shape `[batch, in_height, in_width, in_channels]`\nand a filter / kernel tensor of shape\n`[filter_height, filter_width, in_channels, channel_multiplier]`, containing\n`in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies\na different filter to each input channel (expanding from 1 channel to\n`channel_multiplier` channels for each), then concatenates the results\ntogether. Thus, the output has `in_channels * channel_multiplier` channels.\n\n```\nfor k in 0..in_channels-1\n  for q in 0..channel_multiplier-1\n    output[b, i, j, k * channel_multiplier + q] =\n      sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *\n                        filter[di, dj, k, q]\n```\n\nMust have `strides[0] = strides[3] = 1`.  For the most common case of the same\nhorizontal and vertices strides, `strides = [1, stride, stride, 1]`."
 }
@@ -6943,6 +7374,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -6978,6 +7410,19 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+    description: "1-D tensor of length 4.  The dilation factor for each dimension of\n`input`. If set to k > 1, there will be k-1 skipped cells between each filter\nelement on that dimension. The dimension order is determined by the value of\n`data_format`, see above for details. Dilations in the batch and depth\ndimensions must be 1."
+  }
   summary: "Computes the gradients of depthwise convolution with respect to the filter."
 }
 op {
@@ -7007,6 +7452,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -7042,6 +7488,19 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+    description: "1-D tensor of length 4.  The dilation factor for each dimension of\n`input`. If set to k > 1, there will be k-1 skipped cells between each filter\nelement on that dimension. The dimension order is determined by the value of\n`data_format`, see above for details. Dilations in the batch and depth\ndimensions must be 1."
+  }
   summary: "Computes the gradients of depthwise convolution with respect to the input."
 }
 op {
@@ -7140,8 +7599,8 @@ op {
   name: "DeserializeSparse"
   input_arg {
     name: "serialized_sparse"
-    description: "1-D, The serialized `SparseTensor` object. Must have 3 columns."
-    type: DT_STRING
+    description: "The serialized `SparseTensor` objects. The last dimension\nmust have 3 columns."
+    type_attr: "Tserialized"
   }
   output_arg {
     name: "sparse_indices"
@@ -7158,10 +7617,23 @@ op {
   attr {
     name: "dtype"
     type: "type"
-    description: "The `dtype` of the serialized `SparseTensor` object."
+    description: "The `dtype` of the serialized `SparseTensor` objects."
   }
-  summary: "Deserialize `SparseTensor` from a (serialized) string 3-vector (1-D `Tensor`)"
-  description: "object."
+  attr {
+    name: "Tserialized"
+    type: "type"
+    default_value {
+      type: DT_STRING
+    }
+    allowed_values {
+      list {
+        type: DT_STRING
+        type: DT_VARIANT
+      }
+    }
+  }
+  summary: "Deserialize `SparseTensor` objects."
+  description: "The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where\nthe last dimension stores serialized `SparseTensor` objects and the other N\ndimensions (N >= 0) correspond to a batch. The ranks of the original\n`SparseTensor` objects must all match. When the final `SparseTensor` is\ncreated, its rank is the rank of the incoming `SparseTensor` objects plus N;\nthe sparse tensors have been concatenated along new dimensions, one for each\nbatch.\n\nThe output `SparseTensor` object\'s shape values for the original dimensions\nare the max across the input `SparseTensor` objects\' shape values for the\ncorresponding dimensions. The new dimensions match the size of the batch.\n\nThe input `SparseTensor` objects\' indices are assumed ordered in\nstandard lexicographic order.  If this is not the case, after this\nstep run `SparseReorder` to restore index ordering.\n\nFor example, if the serialized input is a `[2 x 3]` matrix representing two\noriginal `SparseTensor` objects:\n\n    index = [ 0]\n            [10]\n            [20]\n    values = [1, 2, 3]\n    shape = [50]\n\nand\n\n    index = [ 2]\n            [10]\n    values = [4, 5]\n    shape = [30]\n\nthen the final deserialized `SparseTensor` will be:\n\n    index = [0  0]\n            [0 10]\n            [0 20]\n            [1  2]\n            [1 10]\n    values = [1, 2, 3, 4, 5]\n    shape = [2 50]"
 }
 op {
   name: "DestroyResourceOp"
@@ -7222,6 +7694,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -7251,6 +7724,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -7279,6 +7753,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -7320,6 +7795,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -7389,6 +7865,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -7457,6 +7934,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -7507,6 +7985,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -7615,6 +8094,34 @@ op {
   summary: "Interleave the values from the `data` tensors into a single tensor."
   description: "Builds a merged tensor such that\n\n```python\n    merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]\n```\n\nFor example, if each `indices[m]` is scalar or vector, we have\n\n```python\n    # Scalar indices:\n    merged[indices[m], ...] = data[m][...]\n\n    # Vector indices:\n    merged[indices[m][i], ...] = data[m][i, ...]\n```\n\nEach `data[i].shape` must start with the corresponding `indices[i].shape`,\nand the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we\nmust have `data[i].shape = indices[i].shape + constant`.  In terms of this\n`constant`, the output shape is\n\n    merged.shape = [max(indices)] + constant\n\nValues are merged in order, so if an index appears in both `indices[m][i]` and\n`indices[n][j]` for `(m,i) < (n,j)` the slice `data[n][j]` will appear in the\nmerged result. If you do not need this guarantee, ParallelDynamicStitch might\nperform better on some devices.\n\nFor example:\n\n```python\n    indices[0] = 6\n    indices[1] = [4, 1]\n    indices[2] = [[5, 2], [0, 3]]\n    data[0] = [61, 62]\n    data[1] = [[41, 42], [11, 12]]\n    data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]\n    merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],\n              [51, 52], [61, 62]]\n```\n\nThis method can be used to merge partitions created by `dynamic_partition`\nas illustrated on the following example:\n\n```python\n    # Apply function (increments x_i) on elements for which a certain condition\n    # apply (x_i != -1 in this example).\n    x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])\n    condition_mask=tf.not_equal(x,tf.constant(-1.))\n    partitioned_data = tf.dynamic_partition(\n        x, tf.cast(condition_mask, tf.int32) , 2)\n    partitioned_data[1] = partitioned_data[1] + 1.0\n    condition_indices = tf.dynamic_partition(\n        tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)\n    x = tf.dynamic_stitch(condition_indices, partitioned_data)\n    # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain\n    # unchanged.\n```\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/DynamicStitch.png\" alt>\n</div>"
 }
+op {
+  name: "EagerPyFunc"
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "token"
+    type: "string"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  summary: "Eagerly executes a python function to compute func(input)->output. The"
+  description: "semantics of the input, output, and attributes are the same as those for\nPyFunc."
+  is_stateful: true
+}
 op {
   name: "EditDistance"
   input_arg {
@@ -7683,6 +8190,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -7714,6 +8222,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -7960,6 +8469,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -7997,6 +8507,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -8020,6 +8531,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -8062,6 +8574,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -8123,6 +8636,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -8231,6 +8745,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -9101,6 +9616,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -9128,6 +9644,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -9165,6 +9682,7 @@ op {
       list {
         type: DT_INT32
         type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -9682,6 +10200,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -9782,6 +10301,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -10208,6 +10728,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -10244,6 +10765,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -10315,6 +10837,24 @@ op {
   summary: "Creates a dataset that computes a windowed group-by on `input_dataset`."
   description: "// TODO(mrry): Support non-int64 keys."
 }
+op {
+  name: "GuaranteeConst"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  summary: "Gives a guarantee to the TF runtime that the input tensor is a constant."
+  description: "The runtime is then free to make optimizations based on this.\n\nOnly accepts value typed tensors as inputs and rejects resource variable handles\nas input.\n\nReturns the input tensor without modification."
+  is_stateful: true
+}
 op {
   name: "HSVToRGB"
   input_arg {
@@ -10520,6 +11060,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -11217,6 +11758,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -11253,6 +11795,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -11340,6 +11883,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -11364,6 +11908,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -11388,6 +11933,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -11507,6 +12053,19 @@ op {
   summary: "Gets the next output from the given iterator."
   is_stateful: true
 }
+op {
+  name: "IteratorSetStatsAggregator"
+  input_arg {
+    name: "iterator_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "stats_aggregator_handle"
+    type: DT_RESOURCE
+  }
+  summary: "Associates the given iterator with the given statistics aggregator."
+  is_stateful: true
+}
 op {
   name: "IteratorToStringHandle"
   input_arg {
@@ -11540,6 +12099,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -11626,8 +12186,9 @@ op {
     }
     allowed_values {
       list {
-        type: DT_FLOAT
         type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
       }
     }
   }
@@ -11696,13 +12257,42 @@ op {
     }
     allowed_values {
       list {
-        type: DT_FLOAT
         type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
       }
     }
   }
   summary: "Gradients for Local Response Normalization."
 }
+op {
+  name: "LatencyStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Records the latency of producing `input_dataset` elements in a StatsAggregator."
+}
 op {
   name: "LearnedUnigramCandidateSampler"
   input_arg {
@@ -11835,6 +12425,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -11871,6 +12462,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -11893,6 +12485,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -11927,6 +12520,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -12063,6 +12657,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -12089,6 +12684,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -12149,6 +12745,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -13040,6 +13637,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -13423,6 +14021,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -13462,6 +14061,8 @@ op {
     }
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -13470,7 +14071,6 @@ op {
         type: DT_INT16
         type: DT_INT8
         type: DT_UINT16
-        type: DT_HALF
         type: DT_QINT8
       }
     }
@@ -13573,6 +14173,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -13647,6 +14248,7 @@ op {
     }
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -13659,6 +14261,7 @@ op {
     }
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -13817,6 +14420,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -13899,6 +14503,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -13977,6 +14582,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -14055,6 +14661,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -14136,6 +14743,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -14214,6 +14822,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -14249,6 +14858,8 @@ op {
     }
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -14257,7 +14868,6 @@ op {
         type: DT_INT16
         type: DT_INT8
         type: DT_UINT16
-        type: DT_HALF
         type: DT_QINT8
       }
     }
@@ -14361,6 +14971,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -14387,6 +14998,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -14444,6 +15056,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -14637,6 +15250,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -14676,6 +15290,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -14804,6 +15419,7 @@ op {
       list {
         type: DT_INT32
         type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -14832,6 +15448,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -14864,7 +15481,7 @@ op {
   output_arg {
     name: "output"
     description: "2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`\ncontains the drawn class labels with range `[0, num_classes)`."
-    type: DT_INT64
+    type_attr: "output_dtype"
   }
   attr {
     name: "seed"
@@ -14898,6 +15515,20 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "output_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -15265,6 +15896,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -15427,6 +16059,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -15489,6 +16122,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -15610,12 +16244,18 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
         type: DT_INT32
         type: DT_INT64
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_BOOL
       }
     }
   }
@@ -16437,6 +17077,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -16916,6 +17557,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -17187,6 +17829,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -17355,6 +17998,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -17416,6 +18060,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -17465,6 +18110,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -18134,6 +18780,19 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+    description: "1-D tensor of length 4.  The dilation factor for each dimension of\n`input`. If set to k > 1, there will be k-1 skipped cells between each\nfilter element on that dimension. The dimension order is determined by the\nvalue of `data_format`, see above for details. Dilations in the batch and\ndepth dimensions must be 1."
+  }
   summary: "Computes a 2D convolution given quantized 4D input and filter tensors."
   description: "The inputs are quantized tensors where the lowest value represents the real\nnumber of the associated minimum, and the highest represents the maximum.\nThis means that you can only interpret the quantized output in the same way, by\ntaking the returned minimum and maximum values into account."
 }
@@ -19372,6 +20031,37 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "RandomDataset"
+  input_arg {
+    name: "seed"
+    description: "A scalar seed for the random number generator. If either seed or\nseed2 is set to be non-zero, the random number generator is seeded\nby the given seed.  Otherwise, a random seed is used."
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    description: "A second scalar seed to avoid seed collision."
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Creates a Dataset that returns pseudorandom numbers."
+  is_stateful: true
+}
 op {
   name: "RandomGamma"
   input_arg {
@@ -19787,6 +20477,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -19841,6 +20532,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -19952,6 +20644,7 @@ op {
     }
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -20362,6 +21055,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -20394,6 +21088,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -20426,6 +21121,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -20753,6 +21449,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -20784,6 +21481,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -20822,6 +21520,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -20860,6 +21559,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -21464,6 +22164,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -21522,6 +22223,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -21600,6 +22302,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -21687,6 +22390,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -21710,6 +22414,80 @@ op {
   description: "lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)\nm_t <- beta1 * m_{t-1} + (1 - beta1) * g_t\nv_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t\nvariable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)"
   is_stateful: true
 }
+op {
+  name: "ResourceApplyAddSign"
+  input_arg {
+    name: "var"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    description: "Scaling factor. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "alpha"
+    description: "Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    description: "Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    description: "Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    description: "The gradient."
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If `True`, updating of the var and m tensors is\nprotected by a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
+  }
+  summary: "Update \'*var\' according to the AddSign update."
+  description: "m_t <- beta1 * m_{t-1} + (1 - beta1) * g\nupdate <- (alpha + sign_decay * sign(g) *sign(m)) * g\nvariable <- variable - lr_t * update"
+  is_stateful: true
+}
 op {
   name: "ResourceApplyCenteredRMSProp"
   input_arg {
@@ -21777,6 +22555,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -21826,7 +22605,90 @@ op {
   }
   input_arg {
     name: "l2"
-    description: "L2 regulariation. Must be a scalar."
+    description: "L2 regulariation. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    description: "Scaling factor. Must be a scalar."
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
+  }
+  summary: "Update \'*var\' according to the Ftrl-proximal scheme."
+  description: "accum_new = accum + grad * grad\nlinear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var\nquadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2\nvar = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0\naccum = accum_new"
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyFtrlV2"
+  input_arg {
+    name: "var"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    description: "The gradient."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    description: "Scaling factor. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    description: "L1 regulariation. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    description: "L2 shrinkage regulariation. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
     type_attr: "T"
   }
   input_arg {
@@ -21855,6 +22717,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -21867,53 +22730,24 @@ op {
     description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
   summary: "Update \'*var\' according to the Ftrl-proximal scheme."
-  description: "accum_new = accum + grad * grad\nlinear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var\nquadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2\nvar = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0\naccum = accum_new"
+  description: "grad_with_shrinkage = grad + 2 * l2_shrinkage * var\naccum_new = accum + grad_with_shrinkage * grad_with_shrinkage\nlinear += grad_with_shrinkage +\n    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var\nquadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2\nvar = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0\naccum = accum_new"
   is_stateful: true
 }
 op {
-  name: "ResourceApplyFtrlV2"
+  name: "ResourceApplyGradientDescent"
   input_arg {
     name: "var"
     description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
-    description: "Should be from a Variable()."
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "linear"
-    description: "Should be from a Variable()."
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "grad"
-    description: "The gradient."
-    type_attr: "T"
-  }
-  input_arg {
-    name: "lr"
+    name: "alpha"
     description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
-    description: "L1 regulariation. Must be a scalar."
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
-    description: "L2 shrinkage regulariation. Must be a scalar."
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2_shrinkage"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "lr_power"
-    description: "Scaling factor. Must be a scalar."
+    name: "delta"
+    description: "The change."
     type_attr: "T"
   }
   attr {
@@ -21937,6 +22771,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -21946,27 +22781,36 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
+    description: "If `True`, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "Update \'*var\' according to the Ftrl-proximal scheme."
-  description: "grad_with_shrinkage = grad + 2 * l2_shrinkage * var\naccum_new = accum + grad_with_shrinkage * grad_with_shrinkage\nlinear += grad_with_shrinkage +\n    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var\nquadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2\nvar = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0\naccum = accum_new"
+  summary: "Update \'*var\' by subtracting \'alpha\' * \'delta\' from it."
   is_stateful: true
 }
 op {
-  name: "ResourceApplyGradientDescent"
+  name: "ResourceApplyMomentum"
   input_arg {
     name: "var"
     description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
-    name: "alpha"
+    name: "accum"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
-    name: "delta"
-    description: "The change."
+    name: "grad"
+    description: "The gradient."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    description: "Momentum. Must be a scalar."
     type_attr: "T"
   }
   attr {
@@ -21990,6 +22834,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -21999,20 +22844,29 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
+    description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update \'*var\' by subtracting \'alpha\' * \'delta\' from it."
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If `True`, the tensor passed to compute grad will be\nvar - lr * momentum * accum, so in the end, the var you get is actually\nvar - lr * momentum * accum."
+  }
+  summary: "Update \'*var\' according to the momentum scheme. Set use_nesterov = True if you"
+  description: "want to use Nesterov momentum.\n\naccum = accum * momentum + grad\nvar -= lr * accum"
   is_stateful: true
 }
 op {
-  name: "ResourceApplyMomentum"
+  name: "ResourceApplyPowerSign"
   input_arg {
     name: "var"
     description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
+    name: "m"
     description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
@@ -22022,13 +22876,23 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    description: "The gradient."
+    name: "logbase"
+    description: "Must be a scalar."
     type_attr: "T"
   }
   input_arg {
-    name: "momentum"
-    description: "Momentum. Must be a scalar."
+    name: "sign_decay"
+    description: "Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    description: "Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    description: "The gradient."
     type_attr: "T"
   }
   attr {
@@ -22052,6 +22916,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -22061,18 +22926,10 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
-  }
-  attr {
-    name: "use_nesterov"
-    type: "bool"
-    default_value {
-      b: false
-    }
-    description: "If `True`, the tensor passed to compute grad will be\nvar - lr * momentum * accum, so in the end, the var you get is actually\nvar - lr * momentum * accum."
+    description: "If `True`, updating of the var and m tensors is\nprotected by a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update \'*var\' according to the momentum scheme. Set use_nesterov = True if you"
-  description: "want to use Nesterov momentum.\n\naccum = accum * momentum + grad\nvar -= lr * accum"
+  summary: "Update \'*var\' according to the AddSign update."
+  description: "m_t <- beta1 * m_{t-1} + (1 - beta1) * g\nupdate <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g\nvariable <- variable - lr_t * update"
   is_stateful: true
 }
 op {
@@ -22128,6 +22985,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -22191,6 +23049,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -22268,6 +23127,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -22390,6 +23250,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -22407,6 +23268,49 @@ op {
   description: "This operation computes\n\n    # Scalar indices\n    ref[indices, ...] += updates[...]\n\n    # Vector indices (for each i)\n    ref[indices[i], ...] += updates[i, ...]\n\n    # High rank indices (for each i, ..., j)\n    ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]\n\nDuplicate entries are handled correctly: if multiple `indices` reference\nthe same location, their contributions add.\n\nRequires `updates.shape = indices.shape + ref.shape[1:]`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\'https://www.tensorflow.org/images/ScatterAdd.png\' alt>\n</div>"
   is_stateful: true
 }
+op {
+  name: "ResourceScatterNdUpdate"
+  input_arg {
+    name: "ref"
+    description: "A resource handle. Must be from a VarHandleOp."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    description: "A Tensor. Must be one of the following types: int32, int64.\nA tensor of indices into ref."
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    description: "A Tensor. Must have the same type as ref. A tensor of updated\nvalues to add to ref."
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: true
+    }
+    description: "An optional bool. Defaults to True. If True, the assignment will\nbe protected by a lock; otherwise the behavior is undefined,\nbut may exhibit less contention."
+  }
+  summary: "Applies sparse `updates` to individual values or slices within a given"
+  description: "variable according to `indices`.\n\n`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.\n\n`indices` must be integer tensor, containing indices into `ref`.\nIt must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.\n\nThe innermost dimension of `indices` (with length `K`) corresponds to\nindices into elements (if `K = P`) or slices (if `K < P`) along the `K`th\ndimension of `ref`.\n\n`updates` is `Tensor` of rank `Q-1+P-K` with shape:\n\n```\n[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].\n```\n\nFor example, say we want to update 4 scattered elements to a rank-1 tensor to\n8 elements. In Python, that update would look like this:\n\n```python\n    ref = tfe.Variable([1, 2, 3, 4, 5, 6, 7, 8])\n    indices = tf.constant([[4], [3], [1] ,[7]])\n    updates = tf.constant([9, 10, 11, 12])\n    update = tf.scatter_nd_update(ref, indices, updates)\n    with tf.Session() as sess:\n      print sess.run(update)\n```\n\nThe resulting update to ref would look like this:\n\n    [1, 11, 3, 10, 9, 6, 7, 12]\n\nSee @{tf.scatter_nd} for more details about how to make updates to\nslices."
+  is_stateful: true
+}
 op {
   name: "ResourceScatterUpdate"
   input_arg {
@@ -22445,6 +23349,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -22524,6 +23429,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -22596,6 +23502,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -22689,6 +23596,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -22785,6 +23693,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -22878,6 +23787,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -22975,6 +23885,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -23053,6 +23964,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -23144,6 +24056,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -23222,6 +24135,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -23314,6 +24228,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -23654,6 +24569,7 @@ op {
         type: DT_INT64
         type: DT_BOOL
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -23714,6 +24630,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -23738,6 +24655,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -23766,6 +24684,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -23796,6 +24715,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -24139,6 +25059,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -24237,6 +25158,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -24306,6 +25228,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -24375,6 +25298,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -24483,6 +25407,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -24550,6 +25475,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -24611,6 +25537,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -24729,6 +25656,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -25017,6 +25945,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -25065,6 +25994,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -25113,6 +26043,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -25166,6 +26097,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -25219,6 +26151,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -25348,6 +26281,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -25379,6 +26313,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -25420,13 +26355,27 @@ op {
   }
   output_arg {
     name: "serialized_sparse"
-    type: DT_STRING
+    type_attr: "out_type"
   }
   attr {
     name: "T"
     type: "type"
   }
-  summary: "Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` string `Tensor`."
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_STRING
+    }
+    description: "The `dtype` to use for serialization; the supported types are `string`\n(default) and `variant`."
+    allowed_values {
+      list {
+        type: DT_STRING
+        type: DT_VARIANT
+      }
+    }
+  }
+  summary: "Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object."
   description: "The `SparseTensor` must have rank `R` greater than 1, and the first dimension\nis treated as the minibatch dimension.  Elements of the `SparseTensor`\nmust be sorted in increasing order of this first dimension.  The serialized\n`SparseTensor` objects going into each row of `serialized_sparse` will have\nrank `R-1`.\n\nThe minibatch size `N` is extracted from `sparse_shape[0]`."
 }
 op {
@@ -25448,13 +26397,27 @@ op {
   }
   output_arg {
     name: "serialized_sparse"
-    type: DT_STRING
+    type_attr: "out_type"
   }
   attr {
     name: "T"
     type: "type"
   }
-  summary: "Serialize a `SparseTensor` into a string 3-vector (1-D `Tensor`) object."
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_STRING
+    }
+    description: "The `dtype` to use for serialization; the supported types are `string`\n(default) and `variant`."
+    allowed_values {
+      list {
+        type: DT_STRING
+        type: DT_VARIANT
+      }
+    }
+  }
+  summary: "Serialize a `SparseTensor` into a `[3]` `Tensor` object."
 }
 op {
   name: "SerializeTensor"
@@ -25627,6 +26590,51 @@ op {
   }
   summary: "Generate a glob pattern matching all sharded file names."
 }
+op {
+  name: "ShuffleAndRepeatDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "buffer_size"
+    description: "The number of output elements to buffer in an iterator over\nthis dataset. Compare with the `min_after_dequeue` attr when creating a\n`RandomShuffleQueue`."
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed"
+    description: "A scalar seed for the random number generator. If either `seed` or\n`seed2` is set to be non-zero, the random number generator is seeded\nby the given seed.  Otherwise, a random seed is used."
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    description: "A second scalar seed to avoid seed collision."
+    type: DT_INT64
+  }
+  input_arg {
+    name: "count"
+    description: "A scalar representing the number of times the underlying dataset\nshould be repeated. The default is `-1`, which results in infinite repetition."
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Creates a dataset that shuffles and repeats elements from `input_dataset`"
+  description: "pseudorandomly."
+}
 op {
   name: "ShuffleDataset"
   input_arg {
@@ -25640,7 +26648,7 @@ op {
   }
   input_arg {
     name: "seed"
-    description: "A scalar seed for the random number generator. If either seed or\nseed2 is set to be non-zero, the random number generator is seeded\nby the given seed.  Otherwise, a random seed is used."
+    description: "A scalar seed for the random number generator. If either `seed` or\n`seed2` is set to be non-zero, the random number generator is seeded\nby the given seed.  Otherwise, a random seed is used."
     type: DT_INT64
   }
   input_arg {
@@ -25690,6 +26698,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -25720,6 +26729,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -25746,6 +26756,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -25774,6 +26785,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -25799,6 +26811,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -25982,6 +26995,22 @@ op {
   summary: "Return a slice from \'input\'."
   description: "The output tensor is a tensor with dimensions described by \'size\'\nwhose values are extracted from \'input\' starting at the offsets in\n\'begin\'.\n\n*Requirements*:\n  0 <= begin[i] <= begin[i] + size[i] <= Di  for i in [0, n)"
 }
+op {
+  name: "Snapshot"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  summary: "Returns a copy of the input tensor."
+}
 op {
   name: "Softmax"
   input_arg {
@@ -26000,6 +27029,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -26036,6 +27066,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -26070,6 +27101,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -26108,6 +27140,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -26139,6 +27172,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -26177,6 +27211,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -26314,7 +27349,7 @@ op {
     }
   }
   summary: "SpaceToDepth for tensors of type T."
-  description: "Rearranges blocks of spatial data, into depth. More specifically,\nthis op outputs a copy of the input tensor where values from the `height`\nand `width` dimensions are moved to the `depth` dimension.\nThe attr `block_size` indicates the input block size.\n\n  * Non-overlapping blocks of size `block_size x block size` are rearranged\n    into depth at each location.\n  * The depth of the output tensor is `block_size * block_size * input_depth`.\n  * The Y, X coordinates within each block of the input become the high order\n    component of the output channel index.\n  * The input tensor\'s height and width must be divisible by block_size.\n\nThe `data_format` attr specifies the layout of the input and output tensors\nwith the following options:\n  \"NHWC\": `[ batch, height, width, channels ]`\n  \"NCHW\": `[ batch, channels, height, width ]`\n  \"NCHW_VECT_C\":\n      `qint8 [ batch, channels / 4, height, width, channels % 4 ]`\n\nIt is useful to consider the operation as transforming a 6-D Tensor.\ne.g. for data_format = NHWC,\n     Each element in the input tensor can be specified via 6 coordinates,\n     ordered by decreasing memory layout significance as:\n     n,oY,bY,oX,bX,iC  (where n=batch index, oX, oY means X or Y coordinates\n                        within the output image, bX, bY means coordinates\n                        within the input block, iC means input channels).\n     The output would be a transpose to the following layout:\n     n,oY,oX,bY,bX,iC\n\nThis operation is useful for resizing the activations between convolutions\n(but keeping all data), e.g. instead of pooling. It is also useful for training\npurely convolutional models.\n\nFor example, given an input of shape `[1, 2, 2, 1]`, data_format = \"NHWC\" and\nblock_size = 2:\n\n```\nx = [[[[1], [2]],\n      [[3], [4]]]]\n```\n\nThis operation will output a tensor of shape `[1, 1, 1, 4]`:\n\n```\n[[[[1, 2, 3, 4]]]]\n```\n\nHere, the input has a batch of 1 and each batch element has shape `[2, 2, 1]`,\nthe corresponding output will have a single element (i.e. width and height are\nboth 1) and will have a depth of 4 channels (1 * block_size * block_size).\nThe output element shape is `[1, 1, 4]`.\n\nFor an input tensor with larger depth, here of shape `[1, 2, 2, 3]`, e.g.\n\n```\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\nThis operation, for block_size of 2, will return the following tensor of shape\n`[1, 1, 1, 12]`\n\n```\n[[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]\n```\n\nSimilarly, for the following input of shape `[1 4 4 1]`, and a block size of 2:\n\n```\nx = [[[[1],   [2],  [5],  [6]],\n      [[3],   [4],  [7],  [8]],\n      [[9],  [10], [13],  [14]],\n      [[11], [12], [15],  [16]]]]\n```\n\nthe operator will return the following tensor of shape `[1 2 2 4]`:\n\n```\nx = [[[[1, 2, 3, 4],\n       [5, 6, 7, 8]],\n      [[9, 10, 11, 12],\n       [13, 14, 15, 16]]]]\n```"
+  description: "Rearranges blocks of spatial data, into depth. More specifically,\nthis op outputs a copy of the input tensor where values from the `height`\nand `width` dimensions are moved to the `depth` dimension.\nThe attr `block_size` indicates the input block size.\n\n  * Non-overlapping blocks of size `block_size x block size` are rearranged\n    into depth at each location.\n  * The depth of the output tensor is `block_size * block_size * input_depth`.\n  * The Y, X coordinates within each block of the input become the high order\n    component of the output channel index.\n  * The input tensor\'s height and width must be divisible by block_size.\n\nThe `data_format` attr specifies the layout of the input and output tensors\nwith the following options:\n  \"NHWC\": `[ batch, height, width, channels ]`\n  \"NCHW\": `[ batch, channels, height, width ]`\n  \"NCHW_VECT_C\":\n      `qint8 [ batch, channels / 4, height, width, 4 ]`\n\nIt is useful to consider the operation as transforming a 6-D Tensor.\ne.g. for data_format = NHWC,\n     Each element in the input tensor can be specified via 6 coordinates,\n     ordered by decreasing memory layout significance as:\n     n,oY,bY,oX,bX,iC  (where n=batch index, oX, oY means X or Y coordinates\n                        within the output image, bX, bY means coordinates\n                        within the input block, iC means input channels).\n     The output would be a transpose to the following layout:\n     n,oY,oX,bY,bX,iC\n\nThis operation is useful for resizing the activations between convolutions\n(but keeping all data), e.g. instead of pooling. It is also useful for training\npurely convolutional models.\n\nFor example, given an input of shape `[1, 2, 2, 1]`, data_format = \"NHWC\" and\nblock_size = 2:\n\n```\nx = [[[[1], [2]],\n      [[3], [4]]]]\n```\n\nThis operation will output a tensor of shape `[1, 1, 1, 4]`:\n\n```\n[[[[1, 2, 3, 4]]]]\n```\n\nHere, the input has a batch of 1 and each batch element has shape `[2, 2, 1]`,\nthe corresponding output will have a single element (i.e. width and height are\nboth 1) and will have a depth of 4 channels (1 * block_size * block_size).\nThe output element shape is `[1, 1, 4]`.\n\nFor an input tensor with larger depth, here of shape `[1, 2, 2, 3]`, e.g.\n\n```\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\nThis operation, for block_size of 2, will return the following tensor of shape\n`[1, 1, 1, 12]`\n\n```\n[[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]\n```\n\nSimilarly, for the following input of shape `[1 4 4 1]`, and a block size of 2:\n\n```\nx = [[[[1],   [2],  [5],  [6]],\n      [[3],   [4],  [7],  [8]],\n      [[9],  [10], [13],  [14]],\n      [[11], [12], [15],  [16]]]]\n```\n\nthe operator will return the following tensor of shape `[1 2 2 4]`:\n\n```\nx = [[[[1, 2, 3, 4],\n       [5, 6, 7, 8]],\n      [[9, 10, 11, 12],\n       [13, 14, 15, 16]]]]\n```"
 }
 op {
   name: "SparseAccumulatorApplyGradient"
@@ -26366,6 +27401,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -26427,6 +27463,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -26503,6 +27540,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -26522,6 +27560,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -26581,6 +27620,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -26658,6 +27698,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -26737,6 +27778,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -26838,6 +27880,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -26943,6 +27986,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -27044,6 +28088,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -27149,6 +28194,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -27234,6 +28280,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -27332,6 +28379,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -27416,6 +28464,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -27516,6 +28565,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -27623,6 +28673,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -27804,6 +28855,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -27858,6 +28910,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -27912,6 +28965,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -28117,6 +29171,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -28181,6 +29236,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -28243,6 +29299,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -28312,6 +29369,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -28426,33 +29484,249 @@ op {
       }
     }
   }
-  summary: "Computes the mean along sparse segments of a tensor."
-  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nLike `SegmentMean`, but `segment_ids` can have rank less than `data`\'s first\ndimension, selecting a subset of dimension 0, specified by `indices`."
+  summary: "Computes the mean along sparse segments of a tensor."
+  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nLike `SegmentMean`, but `segment_ids` can have rank less than `data`\'s first\ndimension, selecting a subset of dimension 0, specified by `indices`."
+}
+op {
+  name: "SparseSegmentMeanGrad"
+  input_arg {
+    name: "grad"
+    description: "gradient propagated to the SparseSegmentMean op."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    description: "indices passed to the corresponding SparseSegmentMean op."
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    description: "segment_ids passed to the corresponding SparseSegmentMean op."
+    type: DT_INT32
+  }
+  input_arg {
+    name: "output_dim0"
+    description: "dimension 0 of \"data\" passed to SparseSegmentMean op."
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  summary: "Computes gradients for SparseSegmentMean."
+  description: "Returns tensor \"output\" with same shape as grad, except for dimension 0 whose\nvalue is output_dim0."
+}
+op {
+  name: "SparseSegmentMeanWithNumSegments"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    description: "A 1-D tensor. Has same rank as `segment_ids`."
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    description: "A 1-D tensor. Values should be sorted and can be repeated."
+    type: DT_INT32
+  }
+  input_arg {
+    name: "num_segments"
+    description: "Should equal the number of distinct segment IDs."
+    type_attr: "Tnumsegments"
+  }
+  output_arg {
+    name: "output"
+    description: "Has same shape as data, except for dimension 0 which has size\n`num_segments`."
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  summary: "Computes the mean along sparse segments of a tensor."
+  description: "Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is\nmisisng, the `output` tensor at that position will be zeroed.\n\nRead @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments."
+}
+op {
+  name: "SparseSegmentSqrtN"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    description: "A 1-D tensor. Has same rank as `segment_ids`."
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    description: "A 1-D tensor. Values should be sorted and can be repeated."
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    description: "Has same shape as data, except for dimension 0 which\nhas size `k`, the number of segments."
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  summary: "Computes the sum along sparse segments of a tensor divided by the sqrt of N."
+  description: "N is the size of the segment being reduced.\n\nRead @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments."
+}
+op {
+  name: "SparseSegmentSqrtNGrad"
+  input_arg {
+    name: "grad"
+    description: "gradient propagated to the SparseSegmentSqrtN op."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    description: "indices passed to the corresponding SparseSegmentSqrtN op."
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    description: "segment_ids passed to the corresponding SparseSegmentSqrtN op."
+    type: DT_INT32
+  }
+  input_arg {
+    name: "output_dim0"
+    description: "dimension 0 of \"data\" passed to SparseSegmentSqrtN op."
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  summary: "Computes gradients for SparseSegmentSqrtN."
+  description: "Returns tensor \"output\" with same shape as grad, except for dimension 0 whose\nvalue is output_dim0."
 }
 op {
-  name: "SparseSegmentMeanGrad"
+  name: "SparseSegmentSqrtNWithNumSegments"
   input_arg {
-    name: "grad"
-    description: "gradient propagated to the SparseSegmentMean op."
+    name: "data"
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "indices passed to the corresponding SparseSegmentMean op."
+    description: "A 1-D tensor. Has same rank as `segment_ids`."
     type_attr: "Tidx"
   }
   input_arg {
     name: "segment_ids"
-    description: "segment_ids passed to the corresponding SparseSegmentMean op."
+    description: "A 1-D tensor. Values should be sorted and can be repeated."
     type: DT_INT32
   }
   input_arg {
-    name: "output_dim0"
-    description: "dimension 0 of \"data\" passed to SparseSegmentMean op."
-    type: DT_INT32
+    name: "num_segments"
+    description: "Should equal the number of distinct segment IDs."
+    type_attr: "Tnumsegments"
   }
   output_arg {
     name: "output"
+    description: "Has same shape as data, except for dimension 0 which\nhas size `k`, the number of segments."
     type_attr: "T"
   }
   attr {
@@ -28478,42 +29752,8 @@ op {
       }
     }
   }
-  summary: "Computes gradients for SparseSegmentMean."
-  description: "Returns tensor \"output\" with same shape as grad, except for dimension 0 whose\nvalue is output_dim0."
-}
-op {
-  name: "SparseSegmentSqrtN"
-  input_arg {
-    name: "data"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "indices"
-    description: "A 1-D tensor. Has same rank as `segment_ids`."
-    type_attr: "Tidx"
-  }
-  input_arg {
-    name: "segment_ids"
-    description: "A 1-D tensor. Values should be sorted and can be repeated."
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
-    description: "Has same shape as data, except for dimension 0 which\nhas size `k`, the number of segments."
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
   attr {
-    name: "Tidx"
+    name: "Tnumsegments"
     type: "type"
     default_value {
       type: DT_INT32
@@ -28526,32 +29766,27 @@ op {
     }
   }
   summary: "Computes the sum along sparse segments of a tensor divided by the sqrt of N."
-  description: "N is the size of the segment being reduced.\n\nRead @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments."
+  description: "N is the size of the segment being reduced.\n\nLike `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is\nmisisng, the `output` tensor at that position will be zeroed.\n\nRead @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments."
 }
 op {
-  name: "SparseSegmentSqrtNGrad"
+  name: "SparseSegmentSum"
   input_arg {
-    name: "grad"
-    description: "gradient propagated to the SparseSegmentSqrtN op."
+    name: "data"
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "indices passed to the corresponding SparseSegmentSqrtN op."
+    description: "A 1-D tensor. Has same rank as `segment_ids`."
     type_attr: "Tidx"
   }
   input_arg {
     name: "segment_ids"
-    description: "segment_ids passed to the corresponding SparseSegmentSqrtN op."
-    type: DT_INT32
-  }
-  input_arg {
-    name: "output_dim0"
-    description: "dimension 0 of \"data\" passed to SparseSegmentSqrtN op."
+    description: "A 1-D tensor. Values should be sorted and can be repeated."
     type: DT_INT32
   }
   output_arg {
     name: "output"
+    description: "Has same shape as data, except for dimension 0 which\nhas size `k`, the number of segments."
     type_attr: "T"
   }
   attr {
@@ -28561,6 +29796,16 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -28577,11 +29822,11 @@ op {
       }
     }
   }
-  summary: "Computes gradients for SparseSegmentSqrtN."
-  description: "Returns tensor \"output\" with same shape as grad, except for dimension 0 whose\nvalue is output_dim0."
+  summary: "Computes the sum along sparse segments of a tensor."
+  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nLike `SegmentSum`, but `segment_ids` can have rank less than `data`\'s first\ndimension, selecting a subset of dimension 0, specified by `indices`.\n\nFor example:\n\n```python\nc = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])\n\n# Select two rows, one segment.\ntf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))\n# => [[0 0 0 0]]\n\n# Select two rows, two segment.\ntf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))\n# => [[ 1  2  3  4]\n#     [-1 -2 -3 -4]]\n\n# Select all rows, two segments.\ntf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))\n# => [[0 0 0 0]\n#     [5 6 7 8]]\n\n# Which is equivalent to:\ntf.segment_sum(c, tf.constant([0, 0, 1]))\n```"
 }
 op {
-  name: "SparseSegmentSum"
+  name: "SparseSegmentSumWithNumSegments"
   input_arg {
     name: "data"
     type_attr: "T"
@@ -28596,9 +29841,14 @@ op {
     description: "A 1-D tensor. Values should be sorted and can be repeated."
     type: DT_INT32
   }
+  input_arg {
+    name: "num_segments"
+    description: "Should equal the number of distinct segment IDs."
+    type_attr: "Tnumsegments"
+  }
   output_arg {
     name: "output"
-    description: "Has same shape as data, except for dimension 0 which\nhas size `k`, the number of segments."
+    description: "Has same shape as data, except for dimension 0 which\nhas size `num_segments`."
     type_attr: "T"
   }
   attr {
@@ -28617,6 +29867,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -28633,8 +29884,21 @@ op {
       }
     }
   }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   summary: "Computes the sum along sparse segments of a tensor."
-  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nLike `SegmentSum`, but `segment_ids` can have rank less than `data`\'s first\ndimension, selecting a subset of dimension 0, specified by `indices`.\n\nFor example:\n\n```python\nc = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])\n\n# Select two rows, one segment.\ntf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))\n# => [[0 0 0 0]]\n\n# Select two rows, two segment.\ntf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))\n# => [[ 1  2  3  4]\n#     [-1 -2 -3 -4]]\n\n# Select all rows, two segments.\ntf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))\n# => [[0 0 0 0]\n#     [5 6 7 8]]\n\n# Which is equivalent to:\ntf.segment_sum(c, tf.constant([0, 0, 1]))\n```"
+  description: "Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is\nmisisng, the `output` tensor at that position will be zeroed.\n\nRead @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nFor example:\n\n```python\nc = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])\n\ntf.sparse_segment_sum_with_num_segments(\n    c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)\n# => [[0 0 0 0]\n#     [0 0 0 0]\n#     [0 0 0 0]]\n\ntf.sparse_segment_sum_with_num_segments(c,\n                                        tf.constant([0, 1]),\n                                        tf.constant([0, 2],\n                                        num_segments=4))\n# => [[ 1  2  3  4]\n#     [ 0  0  0  0]\n#     [-1 -2 -3 -4]\n#     [ 0  0  0  0]]\n```"
 }
 op {
   name: "SparseSlice"
@@ -28747,6 +30011,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -28826,6 +30091,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -28895,6 +30161,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -29001,6 +30268,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -29363,6 +30631,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -29393,6 +30662,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -29419,6 +30689,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -29451,6 +30722,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -29851,7 +31123,7 @@ op {
   input_arg {
     name: "seed"
     description: "2 seeds (shape [2])."
-    type: DT_INT64
+    type_attr: "Tseed"
   }
   output_arg {
     name: "output"
@@ -29886,6 +31158,19 @@ op {
       }
     }
   }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   summary: "Outputs deterministic pseudorandom values from a normal distribution."
   description: "The generated values will have mean 0 and standard deviation 1.\n\nThe outputs are a deterministic function of `shape` and `seed`."
 }
@@ -29899,7 +31184,7 @@ op {
   input_arg {
     name: "seed"
     description: "2 seeds (shape [2])."
-    type: DT_INT64
+    type_attr: "Tseed"
   }
   output_arg {
     name: "output"
@@ -29934,6 +31219,19 @@ op {
       }
     }
   }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   summary: "Outputs deterministic pseudorandom random values from a uniform distribution."
   description: "The generated values follow a uniform distribution in the range `[0, 1)`. The\nlower bound 0 is included in the range, while the upper bound 1 is excluded.\n\nThe outputs are a deterministic function of `shape` and `seed`."
 }
@@ -29947,7 +31245,7 @@ op {
   input_arg {
     name: "seed"
     description: "2 seeds (shape [2])."
-    type: DT_INT64
+    type_attr: "Tseed"
   }
   output_arg {
     name: "output"
@@ -29982,9 +31280,58 @@ op {
       }
     }
   }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   summary: "Outputs deterministic pseudorandom values from a truncated normal distribution."
   description: "The generated values follow a normal distribution with mean 0 and standard\ndeviation 1, except that values whose magnitude is more than 2 standard\ndeviations from the mean are dropped and re-picked.\n\nThe outputs are a deterministic function of `shape` and `seed`."
 }
+op {
+  name: "StatsAggregatorHandle"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  summary: "Creates a statistics manager resource."
+  is_stateful: true
+}
+op {
+  name: "StatsAggregatorSummary"
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  summary: "Produces a summary of any statistics recorded by the given statistics manager."
+  is_stateful: true
+}
 op {
   name: "StopGradient"
   input_arg {
@@ -30429,6 +31776,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -30526,6 +31874,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -30843,6 +32192,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -30870,6 +32220,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -30899,6 +32250,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -32304,6 +33656,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -32360,6 +33713,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -32420,6 +33774,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -32457,6 +33812,7 @@ op {
       list {
         type: DT_INT32
         type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -32500,6 +33856,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -32624,6 +33981,48 @@ op {
   summary: "Finds unique elements in a 1-D tensor."
   description: "This operation returns a tensor `y` containing all of the unique elements of `x`\nsorted in the same order that they occur in `x`. This operation also returns a\ntensor `idx` the same size as `x` that contains the index of each value of `x`\nin the unique output `y`. In other words:\n\n`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`\n\nFor example:\n\n```\n# tensor \'x\' is [1, 1, 2, 4, 4, 4, 7, 8, 8]\ny, idx = unique(x)\ny ==> [1, 2, 4, 7, 8]\nidx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]\n```"
 }
+op {
+  name: "UniqueV2"
+  input_arg {
+    name: "x"
+    description: "A `Tensor`."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    description: "A `Tensor` of type `int64` (default: 0). The axis of the Tensor to\nfind the unique elements."
+    type: DT_INT64
+  }
+  output_arg {
+    name: "y"
+    description: "A `Tensor`. Unique elements along the `axis` of `Tensor` x."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "idx"
+    description: "A 1-D Tensor. Has the same type as x that contains the index of each\nvalue of x in the output y."
+    type_attr: "out_idx"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "out_idx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  summary: "Finds unique elements in a 1-D tensor."
+  description: "This operation returns a tensor `y` containing all of the unique elements of `x`\nsorted in the same order that they occur in `x`. This operation also returns a\ntensor `idx` the same size as `x` that contains the index of each value of `x`\nin the unique output `y`. In other words:\n\n`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`\n\nFor example:\n\n```\n# tensor \'x\' is [1, 1, 2, 4, 4, 4, 7, 8, 8]\ny, idx = unique(x)\ny ==> [1, 2, 4, 7, 8]\nidx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]\n```"
+}
 op {
   name: "UniqueWithCounts"
   input_arg {
@@ -32712,7 +34111,7 @@ op {
   }
   input_arg {
     name: "num_segments"
-    type: DT_INT32
+    type_attr: "Tnumsegments"
   }
   output_arg {
     name: "output"
@@ -32735,6 +34134,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -32748,6 +34148,19 @@ op {
       }
     }
   }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   summary: "Computes the Max along segments of a tensor."
   description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nThis operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).\nInstead of computing the sum over segments, it computes the maximum\nsuch that:\n\n\\\\(output_i = \\max_j data_j\\\\) where max is over `j` such\nthat `segment_ids[j] == i`.\n\nIf the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,\n `output[i] = numeric_limits<T>::min()`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/UnsortedSegmentMax.png\" alt>\n</div>"
 }
@@ -32764,7 +34177,7 @@ op {
   }
   input_arg {
     name: "num_segments"
-    type: DT_INT32
+    type_attr: "Tnumsegments"
   }
   output_arg {
     name: "output"
@@ -32792,6 +34205,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -32805,8 +34219,21 @@ op {
       }
     }
   }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   summary: "Computes the sum along segments of a tensor."
-  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nComputes a tensor such that\n`(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such\nthat `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\nrange of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/UnsortedSegmentSum.png\" alt>\n</div>"
+  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nComputes a tensor such that\n`(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such\nthat `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\nrange of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\nIf the given segment ID `i` is negative, the value is dropped and will not be\nadded to the sum of the segment.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/UnsortedSegmentSum.png\" alt>\n</div>"
 }
 op {
   name: "Unstage"
@@ -33035,6 +34462,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
         type: DT_BOOL
       }
     }
diff --git a/tensorflow/core/ops/parsing_ops.cc b/tensorflow/core/ops/parsing_ops.cc
index 40ec792ef82ff5e0bdf6d0c4e35bf18f5560c5a7..ee12c7dfaa593ed592619cb50b38083aa43bb06d 100644
--- a/tensorflow/core/ops/parsing_ops.cc
+++ b/tensorflow/core/ops/parsing_ops.cc
@@ -48,6 +48,28 @@ output: A Tensor with one more dimension than the input `bytes`.  The
   of `bytes` divided by the number of bytes to represent `out_type`.
 )doc");
 
+REGISTER_OP("DecodeCompressed")
+    .Input("bytes: string")
+    .Output("output: string")
+    .Attr("compression_type: string = ''")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+Decompress strings. 
+
+This op decompresses each element of the `bytes` input `Tensor`, which
+is assumed to be compressed using the given `compression_type`. 
+
+The `output` is a string `Tensor` of the same shape as `bytes`, 
+each element containing the decompressed data from the corresponding
+element in `bytes`.
+
+bytes: A Tensor of string which is compressed.
+output: A Tensor with the same shape as input `bytes`, uncompressed
+  from bytes.
+compression_type: A scalar containing either (i) the empty string (no
+  compression), (ii) "ZLIB", or (iii) "GZIP".
+)doc");
+
 REGISTER_OP("ParseExample")
     .Input("serialized: string")
     .Input("names: string")
diff --git a/tensorflow/core/ops/random_ops.cc b/tensorflow/core/ops/random_ops.cc
index 2429171fa93093362510601c5167d63a62caec54..31d9c82e537d170bb13aa381c4a0a47feb98172b 100644
--- a/tensorflow/core/ops/random_ops.cc
+++ b/tensorflow/core/ops/random_ops.cc
@@ -29,7 +29,7 @@ REGISTER_OP("RandomUniform")
     .Output("output: dtype")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
-    .Attr("dtype: {half,float,double}")
+    .Attr("dtype: {half,bfloat16,float,double}")
     .Attr("T: {int32, int64}")
     .SetShapeFn(shape_inference::RandomShape)
     .Doc(R"doc(
@@ -87,7 +87,7 @@ REGISTER_OP("RandomStandardNormal")
     .Output("output: dtype")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
-    .Attr("dtype: {half,float,double}")
+    .Attr("dtype: {half,bfloat16,float,double}")
     .Attr("T: {int32, int64}")
     .SetShapeFn(shape_inference::RandomShape)
     .Doc(R"doc(
@@ -115,7 +115,7 @@ REGISTER_OP("ParameterizedTruncatedNormal")
     .Output("output: dtype")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
-    .Attr("dtype: {half,float,double}")
+    .Attr("dtype: {half,bfloat16,float,double}")
     .Attr("T: {int32, int64}")
     .SetShapeFn(shape_inference::RandomShape)
     .Doc(R"doc(
@@ -145,7 +145,7 @@ REGISTER_OP("TruncatedNormal")
     .Output("output: dtype")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
-    .Attr("dtype: {half,float,double}")
+    .Attr("dtype: {half,bfloat16,float,double}")
     .Attr("T: {int32, int64}")
     .SetShapeFn(shape_inference::RandomShape)
     .Doc(R"doc(
@@ -201,10 +201,11 @@ REGISTER_OP("Multinomial")
     .SetIsStateful()
     .Input("logits: T")
     .Input("num_samples: int32")
-    .Output("output: int64")
+    .Output("output: output_dtype")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .Attr("T: realnumbertype")
+    .Attr("output_dtype: {int32, int64} = DT_INT64")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle logits_shape;
       ShapeHandle unused;
diff --git a/tensorflow/core/ops/resource_variable_ops.cc b/tensorflow/core/ops/resource_variable_ops.cc
index cdfbec85cf1194d02c81cb4a3d66563dc85dfa57..bf9e673e8e46381fa655f37eff4a08b3f3dca38b 100644
--- a/tensorflow/core/ops/resource_variable_ops.cc
+++ b/tensorflow/core/ops/resource_variable_ops.cc
@@ -204,7 +204,10 @@ Status VariableShapeShapeFn(InferenceContext* c) {
   if (handle_data == nullptr || handle_data->empty()) {
     return errors::InvalidArgument("Handle doesn't have shape information.");
   }
-  c->set_output(0, (*handle_data)[0].shape);
+  ShapeHandle var_shape = (*handle_data)[0].shape;
+  int64 rank = c->RankKnown(var_shape) ? c->Rank(var_shape)
+                                       : InferenceContext::kUnknownDim;
+  c->set_output(0, c->Vector(rank));
   return Status::OK();
 }
 
diff --git a/tensorflow/core/ops/script_ops.cc b/tensorflow/core/ops/script_ops.cc
index 8197327b562c5296e4bcbe43ce9ca81696dedf8b..c7c594a999a87682e36de6af54e7d7ede4486ca9 100644
--- a/tensorflow/core/ops/script_ops.cc
+++ b/tensorflow/core/ops/script_ops.cc
@@ -51,4 +51,18 @@ REGISTER_OP("PyFuncStateless")
 A stateless version of PyFunc.
 )doc");
 
+REGISTER_OP("EagerPyFunc")
+    .Input("input: Tin")
+    .Output("output: Tout")
+    .Attr("token: string")
+    .Attr("Tin: list(type) >= 0")
+    .Attr("Tout: list(type) >=0")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Doc(R"doc(
+Eagerly executes a python function to compute func(input)->output. The
+semantics of the input, output, and attributes are the same as those for
+PyFunc.
+)doc");
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/sparse_ops.cc b/tensorflow/core/ops/sparse_ops.cc
index 8b6106f2a40e013635e0f280dcf20a750d1455a4..99f61a3054563bcf66757bbd1496bb1ee1ae7a3f 100644
--- a/tensorflow/core/ops/sparse_ops.cc
+++ b/tensorflow/core/ops/sparse_ops.cc
@@ -190,7 +190,8 @@ REGISTER_OP("SerializeSparse")
     .Input("sparse_values: T")
     .Input("sparse_shape: int64")
     .Attr("T: type")
-    .Output("serialized_sparse: string")
+    .Output("serialized_sparse: out_type")
+    .Attr("out_type: {string, variant} = DT_STRING")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &unused));
@@ -200,11 +201,13 @@ REGISTER_OP("SerializeSparse")
       return Status::OK();
     })
     .Doc(R"doc(
-Serialize a `SparseTensor` into a string 3-vector (1-D `Tensor`) object.
+Serialize a `SparseTensor` into a `[3]` `Tensor` object.
 
 sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
 sparse_values: 1-D.  The `values` of the `SparseTensor`.
 sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+out_type: The `dtype` to use for serialization; the supported types are `string`
+  (default) and `variant`.
 )doc");
 
 REGISTER_OP("SerializeManySparse")
@@ -212,7 +215,8 @@ REGISTER_OP("SerializeManySparse")
     .Input("sparse_values: T")
     .Input("sparse_shape: int64")
     .Attr("T: type")
-    .Output("serialized_sparse: string")
+    .Output("serialized_sparse: out_type")
+    .Attr("out_type: {string, variant} = DT_STRING")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &unused));
@@ -222,7 +226,7 @@ REGISTER_OP("SerializeManySparse")
       return Status::OK();
     })
     .Doc(R"doc(
-Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` string `Tensor`.
+Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
 
 The `SparseTensor` must have rank `R` greater than 1, and the first dimension
 is treated as the minibatch dimension.  Elements of the `SparseTensor`
@@ -235,22 +239,21 @@ The minibatch size `N` is extracted from `sparse_shape[0]`.
 sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
 sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
 sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+out_type: The `dtype` to use for serialization; the supported types are `string`
+  (default) and `variant`.
 )doc");
 
 REGISTER_OP("DeserializeSparse")
-    .Input("serialized_sparse: string")
-    .Attr("dtype: type")
+    .Input("serialized_sparse: Tserialized")
     .Output("sparse_indices: int64")
     .Output("sparse_values: dtype")
     .Output("sparse_shape: int64")
+    .Attr("dtype: type")
+    .Attr("Tserialized: {string, variant} = DT_STRING")
     .SetShapeFn([](InferenceContext* c) {
-      // serialized sparse is [3] vector.
-      ShapeHandle serialized_sparse;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &serialized_sparse));
+      // serialized sparse is [?, ..., ?, 3] vector.
       DimensionHandle unused;
-      TF_RETURN_IF_ERROR(
-          c->WithValue(c->Dim(serialized_sparse, 0), 3, &unused));
-
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), -1), 3, &unused));
       c->set_output(0, c->Matrix(InferenceContext::kUnknownDim,
                                  InferenceContext::kUnknownDim));
       c->set_output(1, c->Vector(InferenceContext::kUnknownDim));
@@ -258,19 +261,61 @@ REGISTER_OP("DeserializeSparse")
       return Status::OK();
     })
     .Doc(R"doc(
-Deserialize `SparseTensor` from a (serialized) string 3-vector (1-D `Tensor`)
-object.
+Deserialize `SparseTensor` objects.
+
+The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
+the last dimension stores serialized `SparseTensor` objects and the other N
+dimensions (N >= 0) correspond to a batch. The ranks of the original
+`SparseTensor` objects must all match. When the final `SparseTensor` is
+created, its rank is the rank of the incoming `SparseTensor` objects plus N;
+the sparse tensors have been concatenated along new dimensions, one for each
+batch.
+
+The output `SparseTensor` object's shape values for the original dimensions
+are the max across the input `SparseTensor` objects' shape values for the
+corresponding dimensions. The new dimensions match the size of the batch.
+
+The input `SparseTensor` objects' indices are assumed ordered in
+standard lexicographic order.  If this is not the case, after this
+step run `SparseReorder` to restore index ordering.
+
+For example, if the serialized input is a `[2 x 3]` matrix representing two
+original `SparseTensor` objects:
+
+    index = [ 0]
+            [10]
+            [20]
+    values = [1, 2, 3]
+    shape = [50]
 
-serialized_sparse: 1-D, The serialized `SparseTensor` object. Must have 3 columns.
-dtype: The `dtype` of the serialized `SparseTensor` object.
+and
+
+    index = [ 2]
+            [10]
+    values = [4, 5]
+    shape = [30]
+
+then the final deserialized `SparseTensor` will be:
+
+    index = [0  0]
+            [0 10]
+            [0 20]
+            [1  2]
+            [1 10]
+    values = [1, 2, 3, 4, 5]
+    shape = [2 50]
+
+serialized_sparse: The serialized `SparseTensor` objects. The last dimension
+  must have 3 columns.
+dtype: The `dtype` of the serialized `SparseTensor` objects.
 )doc");
 
 REGISTER_OP("DeserializeManySparse")
     .Input("serialized_sparse: string")
-    .Attr("dtype: type")
     .Output("sparse_indices: int64")
     .Output("sparse_values: dtype")
     .Output("sparse_shape: int64")
+    .Attr("dtype: type")
     .SetShapeFn([](InferenceContext* c) {
       // serialized sparse is [?,3] matrix.
       ShapeHandle serialized_sparse;
diff --git a/tensorflow/core/ops/spectral_ops_test.cc b/tensorflow/core/ops/spectral_ops_test.cc
index 0f8a3e6ef1366b2de08ee352bc54d1bf874a6bed..b1c5e95fc5ce25496d18202182cc418496349bb6 100644
--- a/tensorflow/core/ops/spectral_ops_test.cc
+++ b/tensorflow/core/ops/spectral_ops_test.cc
@@ -22,7 +22,7 @@ namespace tensorflow {
 TEST(MathOpsTest, FFT_ShapeFn) {
   for (const auto* op_name : {"FFT", "IFFT"}) {
     ShapeInferenceTestOp op(op_name);
-    INFER_OK(op, "?", "?");
+    INFER_OK(op, "?", "in0");
     INFER_ERROR("Shape must be at least rank 1 but is rank 0", op, "[]");
     INFER_OK(op, "[?]", "in0");
     INFER_OK(op, "[1]", "in0");
@@ -31,7 +31,7 @@ TEST(MathOpsTest, FFT_ShapeFn) {
 
   for (const auto* op_name : {"FFT2D", "IFFT2D"}) {
     ShapeInferenceTestOp op(op_name);
-    INFER_OK(op, "?", "?");
+    INFER_OK(op, "?", "in0");
     INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[1]");
     INFER_OK(op, "[?,1]", "in0");
     INFER_OK(op, "[1,2]", "in0");
@@ -40,7 +40,7 @@ TEST(MathOpsTest, FFT_ShapeFn) {
 
   for (const auto* op_name : {"FFT3D", "IFFT3D"}) {
     ShapeInferenceTestOp op(op_name);
-    INFER_OK(op, "?", "?");
+    INFER_OK(op, "?", "in0");
     INFER_ERROR("Shape must be at least rank 3 but is rank 2", op, "[1,2]");
     INFER_OK(op, "[?,1,?]", "in0");
     INFER_OK(op, "[1,2,3]", "in0");
diff --git a/tensorflow/core/ops/state_ops.cc b/tensorflow/core/ops/state_ops.cc
index da5f091e9f1988721b1947ad812851e0322efa9e..5b1f5d2477d662ca911f9d1aca6d495f1d63eb7e 100644
--- a/tensorflow/core/ops/state_ops.cc
+++ b/tensorflow/core/ops/state_ops.cc
@@ -513,6 +513,62 @@ output_ref: Same as ref. Returned as a convenience for operations that want to
   use the updated values after the update is done.
 )doc");
 
+REGISTER_OP("ResourceScatterNdUpdate")
+    .Input("ref: resource")
+    .Input("indices: Tindices")
+    .Input("updates: T")
+    .Attr("T: type")
+    .Attr("Tindices: {int32, int64}")
+    .Attr("use_locking: bool = true")
+    .SetShapeFn(shape_inference::ScatterNdUpdateShape)
+    .Doc(R"doc(
+Applies sparse `updates` to individual values or slices within a given
+variable according to `indices`.
+
+`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+`indices` must be integer tensor, containing indices into `ref`.
+It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+The innermost dimension of `indices` (with length `K`) corresponds to
+indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+dimension of `ref`.
+
+`updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+```
+[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+```
+
+For example, say we want to update 4 scattered elements to a rank-1 tensor to
+8 elements. In Python, that update would look like this:
+
+```python
+    ref = tfe.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+    indices = tf.constant([[4], [3], [1] ,[7]])
+    updates = tf.constant([9, 10, 11, 12])
+    update = tf.scatter_nd_update(ref, indices, updates)
+    with tf.Session() as sess:
+      print sess.run(update)
+```
+
+The resulting update to ref would look like this:
+
+    [1, 11, 3, 10, 9, 6, 7, 12]
+
+See @{tf.scatter_nd} for more details about how to make updates to
+slices.
+
+ref: A resource handle. Must be from a VarHandleOp.
+indices: A Tensor. Must be one of the following types: int32, int64.
+  A tensor of indices into ref.
+updates: A Tensor. Must have the same type as ref. A tensor of updated
+  values to add to ref.
+use_locking: An optional bool. Defaults to True. If True, the assignment will
+  be protected by a lock; otherwise the behavior is undefined,
+  but may exhibit less contention.
+)doc");
+
 REGISTER_OP("ScatterNdAdd")
     .Input("ref: Ref(T)")
     .Input("indices: Tindices")
diff --git a/tensorflow/core/ops/stateless_random_ops.cc b/tensorflow/core/ops/stateless_random_ops.cc
index 7c00fdb99fb59a37751c4cb1797f7c51c801d3af..3e1f8781fcd7718e3443b0b4bee5ea5d33980524 100644
--- a/tensorflow/core/ops/stateless_random_ops.cc
+++ b/tensorflow/core/ops/stateless_random_ops.cc
@@ -38,10 +38,11 @@ static Status StatelessShape(shape_inference::InferenceContext* context) {
 #define REGISTER_STATELESS_OP(name)                  \
   REGISTER_OP(name)                                  \
       .Input("shape: T")                             \
-      .Input("seed: int64")                          \
+      .Input("seed: Tseed")                          \
       .Output("output: dtype")                       \
       .Attr("dtype: {half,float,double} = DT_FLOAT") \
       .Attr("T: {int32, int64} = DT_INT32")          \
+      .Attr("Tseed: {int32, int64} = DT_INT64")      \
       .SetShapeFn(StatelessShape)
 
 // This op is exposed through contrib/stateless only.  The interface may change.
diff --git a/tensorflow/core/ops/summary_ops.cc b/tensorflow/core/ops/summary_ops.cc
index 7f6d8b06cd3bccef9aec2e9f51f73f7b7bd72ad8..aa7458f903cf76af660c04149ff50ac899987eac 100644
--- a/tensorflow/core/ops/summary_ops.cc
+++ b/tensorflow/core/ops/summary_ops.cc
@@ -38,6 +38,7 @@ REGISTER_OP("CreateSummaryFileWriter")
     .Input("max_queue: int32")
     .Input("flush_millis: int32")
     .Input("filename_suffix: string")
+    .SetShapeFn(shape_inference::NoOutputs)
     .Doc(R"doc(
 Creates a summary file writer accessible by the given resource handle.
 
@@ -99,7 +100,7 @@ writer: A handle to the summary writer resource.
 
 REGISTER_OP("WriteSummary")
     .Input("writer: resource")
-    .Input("global_step: int64")
+    .Input("step: int64")
     .Input("tensor: T")
     .Input("tag: string")
     .Input("summary_metadata: string")
@@ -109,7 +110,7 @@ REGISTER_OP("WriteSummary")
 Outputs a `Summary` protocol buffer with a tensor.
 
 writer: A handle to a summary writer.
-global_step: The step to write the summary for.
+step: The step to write the summary for.
 tensor: A tensor to serialize.
 tag: The summary's tag.
 summary_metadata: Serialized SummaryMetadata protocol buffer containing
@@ -132,7 +133,7 @@ event: A string containing a binary-encoded tf.Event proto.
 
 REGISTER_OP("WriteScalarSummary")
     .Input("writer: resource")
-    .Input("global_step: int64")
+    .Input("step: int64")
     .Input("tag: string")
     .Input("value: T")
     .Attr("T: realnumbertype")
@@ -143,14 +144,14 @@ Writes a `Summary` protocol buffer with scalar values.
 The input `tag` and `value` must have the scalars.
 
 writer: A handle to a summary writer.
-global_step: The step to write the summary for.
+step: The step to write the summary for.
 tag: Tag for the summary.
 value: Value for the summary.
 )doc");
 
 REGISTER_OP("WriteHistogramSummary")
     .Input("writer: resource")
-    .Input("global_step: int64")
+    .Input("step: int64")
     .Input("tag: string")
     .Input("values: T")
     .Attr("T: realnumbertype = DT_FLOAT")
@@ -165,14 +166,14 @@ has one summary value containing a histogram for `values`.
 This op reports an `InvalidArgument` error if any value is not finite.
 
 writer: A handle to a summary writer.
-global_step: The step to write the summary for.
+step: The step to write the summary for.
 tag: Scalar.  Tag to use for the `Summary.Value`.
 values: Any shape. Values to use to build the histogram.
 )doc");
 
 REGISTER_OP("WriteImageSummary")
     .Input("writer: resource")
-    .Input("global_step: int64")
+    .Input("step: int64")
     .Input("tag: string")
     .Input("tensor: T")
     .Input("bad_color: uint8")
@@ -217,7 +218,7 @@ replaced by this tensor in the output image.  The default value is the color
 red.
 
 writer: A handle to a summary writer.
-global_step: The step to write the summary for.
+step: The step to write the summary for.
 tag: Scalar. Used to build the `tag` attribute of the summary values.
 tensor: 4-D of shape `[batch_size, height, width, channels]` where
   `channels` is 1, 3, or 4.
@@ -227,7 +228,7 @@ bad_color: Color to use for pixels with non-finite values.
 
 REGISTER_OP("WriteAudioSummary")
     .Input("writer: resource")
-    .Input("global_step: int64")
+    .Input("step: int64")
     .Input("tag: string")
     .Input("tensor: float")
     .Input("sample_rate: float")
@@ -249,7 +250,7 @@ build the `tag` of the summary values:
    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
 
 writer: A handle to a summary writer.
-global_step: The step to write the summary for.
+step: The step to write the summary for.
 tag: Scalar. Used to build the `tag` attribute of the summary values.
 tensor: 2-D of shape `[batch_size, frames]`.
 sample_rate: The sample rate of the signal in hertz.
@@ -258,14 +259,14 @@ max_outputs: Max number of batch elements to generate audio for.
 
 REGISTER_OP("WriteGraphSummary")
     .Input("writer: resource")
-    .Input("global_step: int64")
+    .Input("step: int64")
     .Input("tensor: string")
     .SetShapeFn(shape_inference::NoOutputs)
     .Doc(R"doc(
 Writes a `GraphDef` protocol buffer to a `SummaryWriter`.
 
 writer: Handle of `SummaryWriter`.
-global_step: The step to write the summary for.
+step: The step to write the summary for.
 tensor: A scalar string of the serialized tf.GraphDef proto.
 )doc");
 
diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc
index 6f06b87d589dd4c8f41b375642da01ef37be5e67..405318caf20183ce267e84cd2554ed8c77a5b409 100644
--- a/tensorflow/core/ops/training_ops.cc
+++ b/tensorflow/core/ops/training_ops.cc
@@ -22,6 +22,48 @@ using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
+const char kAddSignCommonDocStr[] = R"doc(
+Update '*var' according to the AddSign update.
+
+m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+update <- (alpha + sign_decay * sign(g) *sign(m)) * g
+variable <- variable - lr_t * update
+
+var: Should be from a Variable().
+m: Should be from a Variable().
+lr: Scaling factor. Must be a scalar.
+sign_decay: Must be a scalar.
+alpha: Must be a scalar.
+beta: Must be a scalar.
+grad: The gradient.
+)doc";
+
+const char kPowerSignCommonDocStr[] = R"doc(
+Update '*var' according to the AddSign update.
+
+m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
+variable <- variable - lr_t * update
+
+var: Should be from a Variable().
+m: Should be from a Variable().
+lr: Scaling factor. Must be a scalar.
+logbase: Must be a scalar.
+sign_decay: Must be a scalar.
+beta: Must be a scalar.
+grad: The gradient.
+)doc";
+
+const char kOutDocStr[] = R"doc(
+out: Same as "var".
+)doc";
+
+const char kLockDocStr[] = R"doc(
+use_locking: If `True`, updating of the var and m tensors is
+  protected by a lock; otherwise the behavior is undefined, but may exhibit less
+  contention.
+)doc";
+
 static ShapeHandle ShapeOrHandleShape(InferenceContext* c, int input) {
   auto* handle_data = c->input_handle_shapes_and_types(input);
   if (handle_data != nullptr && !handle_data->empty() &&
@@ -1796,4 +1838,99 @@ use_locking: If `True`, updating of the var, mg, ms, and mom tensors is
   contention.
 )doc");
 
+static Status ApplyAddSignShapeFn(InferenceContext* c, bool sparse) {
+  ShapeHandle unused;
+  ShapeHandle s = ShapeOrHandleShape(c, 0);                       // var
+  TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s));  // m
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));       // lr
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));       // alpha
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));       // sign_decay
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));       // beta
+  TF_RETURN_IF_ERROR(
+      HandleGradAndIndicesInputs(c, sparse, 6 /* grad_idx */, &s));
+  if (c->num_outputs() > 0) {
+    c->set_output(0, s);
+  }
+  return Status::OK();
+}
+
+REGISTER_OP("ApplyAddSign")
+    .Input("var: Ref(T)")
+    .Input("m: Ref(T)")
+    .Input("lr: T")
+    .Input("alpha: T")
+    .Input("sign_decay: T")
+    .Input("beta: T")
+    .Input("grad: T")
+    .Output("out: Ref(T)")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyAddSignShapeFn(c, /*sparse=*/false);
+    })
+    .Doc(strings::StrCat(kAddSignCommonDocStr, kOutDocStr, kLockDocStr));
+
+REGISTER_OP("ResourceApplyAddSign")
+    .Input("var: resource")
+    .Input("m: resource")
+    .Input("lr: T")
+    .Input("alpha: T")
+    .Input("sign_decay: T")
+    .Input("beta: T")
+    .Input("grad: T")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyAddSignShapeFn(c, /*sparse=*/false);
+    })
+    .Doc(strings::StrCat(kAddSignCommonDocStr, kLockDocStr));
+
+static Status ApplyPowerSignShapeFn(InferenceContext* c, bool sparse) {
+  ShapeHandle unused;
+  ShapeHandle s = ShapeOrHandleShape(c, 0);                       // var
+  TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s));  // m
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));       // lr
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));       // logbase
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));       // sign_delay
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));       // beta
+  TF_RETURN_IF_ERROR(
+      HandleGradAndIndicesInputs(c, sparse, 6 /* grad_idx */, &s));
+  if (c->num_outputs() > 0) {
+    c->set_output(0, s);
+  }
+  return Status::OK();
+}
+
+REGISTER_OP("ApplyPowerSign")
+    .Input("var: Ref(T)")
+    .Input("m: Ref(T)")
+    .Input("lr: T")
+    .Input("logbase: T")
+    .Input("sign_decay: T")
+    .Input("beta: T")
+    .Input("grad: T")
+    .Output("out: Ref(T)")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyPowerSignShapeFn(c, /*sparse=*/false);
+    })
+    .Doc(strings::StrCat(kPowerSignCommonDocStr, kOutDocStr, kLockDocStr));
+
+REGISTER_OP("ResourceApplyPowerSign")
+    .Input("var: resource")
+    .Input("m: resource")
+    .Input("lr: T")
+    .Input("logbase: T")
+    .Input("sign_decay: T")
+    .Input("beta: T")
+    .Input("grad: T")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyPowerSignShapeFn(c, /*sparse=*/false);
+    })
+    .Doc(strings::StrCat(kPowerSignCommonDocStr, kLockDocStr));
+
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/training_ops_test.cc b/tensorflow/core/ops/training_ops_test.cc
index 92d5ad99645404d50c114df6b9a45d4af64a6481..de4e3cd9e70014ea9b29d4d473d94c0abb52eabc 100644
--- a/tensorflow/core/ops/training_ops_test.cc
+++ b/tensorflow/core/ops/training_ops_test.cc
@@ -332,4 +332,38 @@ TEST(TrainingOpsTest, SparseApplyRMSProp_ShapeFn) {
   INFER_ERROR("Shape must be rank 0 but is rank 1", op, "?;?;?;?;?;?;[?];?;?");
 }
 
+TEST(TrainingOpsTest, ApplyAddSign_ShapeFn) {
+  ShapeInferenceTestOp op("ApplyAddSign");
+
+  // Output is a merge of inputs 0, 1, and 6 (var, ms, and grad).
+  INFER_OK(op, "[1,?,?];[?,2,?];[];[];[];[];[?,?,2]", "[d0_0,d1_1,d6_2]");
+  INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 2", op,
+              "[1];[2];[];[];[];[];[1]");
+  INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 2", op,
+              "[1];[1];[];[];[];[];[2]");
+
+  // lr, alpha, sign_decay, and beta must be scalars.
+  INFER_ERROR("Shape must be rank 0 but is rank 1", op, "?;?;[?];?;?;?;?");
+  INFER_ERROR("Shape must be rank 0 but is rank 1", op, "?;?;?;[?];?;?;?");
+  INFER_ERROR("Shape must be rank 0 but is rank 1", op, "?;?;?;?;[?];?;?");
+  INFER_ERROR("Shape must be rank 0 but is rank 1", op, "?;?;?;?;?;[?];?");
+}
+
+TEST(TrainingOpsTest, ApplyPowerSign_ShapeFn) {
+  ShapeInferenceTestOp op("ApplyPowerSign");
+
+  // Output is a merge of inputs 0, 1, and 6 (var, ms, and grad).
+  INFER_OK(op, "[1,?,?];[?,2,?];[];[];[];[];[?,?,2]", "[d0_0,d1_1,d6_2]");
+  INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 2", op,
+              "[1];[2];[];[];[];[];[1]");
+  INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 2", op,
+              "[1];[1];[];[];[];[];[2]");
+
+  // lr, logbase, sign_decay, and beta must be scalars.
+  INFER_ERROR("Shape must be rank 0 but is rank 1", op, "?;?;[?];?;?;?;?");
+  INFER_ERROR("Shape must be rank 0 but is rank 1", op, "?;?;?;[?];?;?;?");
+  INFER_ERROR("Shape must be rank 0 but is rank 1", op, "?;?;?;?;[?];?;?");
+  INFER_ERROR("Shape must be rank 0 but is rank 1", op, "?;?;?;?;?;[?];?");
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/BUILD b/tensorflow/core/platform/cloud/BUILD
index 624145da75194fac7f859d4df0f6f51fe7ac5eff..aaeccc8324bea5237f2e4e2dea07ce630a8f5beb 100644
--- a/tensorflow/core/platform/cloud/BUILD
+++ b/tensorflow/core/platform/cloud/BUILD
@@ -10,6 +10,7 @@ licenses(["notice"])  # Apache 2.0
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_test",
+    "tf_copts",
 )
 
 filegroup(
@@ -29,6 +30,7 @@ filegroup(
 cc_library(
     name = "expiring_lru_cache",
     hdrs = ["expiring_lru_cache.h"],
+    copts = tf_copts(),
     visibility = ["//tensorflow:__subpackages__"],
     deps = ["//tensorflow/core:lib"],
 )
@@ -37,6 +39,7 @@ cc_library(
     name = "file_block_cache",
     srcs = ["file_block_cache.cc"],
     hdrs = ["file_block_cache.h"],
+    copts = tf_copts(),
     visibility = ["//tensorflow:__subpackages__"],
     deps = ["//tensorflow/core:lib"],
 )
@@ -45,6 +48,7 @@ cc_library(
     name = "gcs_dns_cache",
     srcs = ["gcs_dns_cache.cc"],
     hdrs = ["gcs_dns_cache.h"],
+    copts = tf_copts(),
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
         ":http_request",
@@ -56,6 +60,7 @@ cc_library(
     name = "gcs_file_system",
     srcs = ["gcs_file_system.cc"],
     hdrs = ["gcs_file_system.h"],
+    copts = tf_copts(),
     linkstatic = 1,  # Needed since alwayslink is broken in bazel b/27630669
     visibility = ["//visibility:public"],
     deps = [
@@ -78,6 +83,7 @@ cc_library(
 cc_library(
     name = "http_request",
     hdrs = ["http_request.h"],
+    copts = tf_copts(),
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
         "//tensorflow/core:framework_headers_lib",
@@ -89,6 +95,7 @@ cc_library(
     name = "curl_http_request",
     srcs = ["curl_http_request.cc"],
     hdrs = ["curl_http_request.h"],
+    copts = tf_copts(),
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
         ":http_request",
@@ -104,6 +111,7 @@ cc_library(
     hdrs = [
         "http_request_fake.h",
     ],
+    copts = tf_copts(),
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
         ":curl_http_request",
@@ -121,6 +129,7 @@ cc_library(
         "auth_provider.h",
         "google_auth_provider.h",
     ],
+    copts = tf_copts(),
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
         ":curl_http_request",
@@ -136,6 +145,7 @@ cc_library(
     name = "now_seconds_env",
     testonly = 1,
     hdrs = ["now_seconds_env.h"],
+    copts = tf_copts(),
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
         "//tensorflow/core:lib",
@@ -151,6 +161,7 @@ cc_library(
     hdrs = [
         "oauth_client.h",
     ],
+    copts = tf_copts(),
     deps = [
         ":curl_http_request",
         ":http_request",
@@ -169,6 +180,7 @@ cc_library(
     hdrs = [
         "retrying_utils.h",
     ],
+    copts = tf_copts(),
     deps = [
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:lib_internal",
@@ -183,6 +195,7 @@ cc_library(
     hdrs = [
         "retrying_file_system.h",
     ],
+    copts = tf_copts(),
     deps = [
         ":retrying_utils",
         "//tensorflow/core:framework_headers_lib",
@@ -198,6 +211,7 @@ cc_library(
     hdrs = [
         "time_util.h",
     ],
+    copts = tf_copts(),
     deps = [
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:lib_internal",
diff --git a/tensorflow/core/platform/cloud/curl_http_request.cc b/tensorflow/core/platform/cloud/curl_http_request.cc
index d01734ba3a649afa73a5fc8ad59a01a7cc6c3088..c2533b4314e80d6f6586ce2e70962e0a657ffce5 100644
--- a/tensorflow/core/platform/cloud/curl_http_request.cc
+++ b/tensorflow/core/platform/cloud/curl_http_request.cc
@@ -29,16 +29,6 @@ namespace {
 // Set to 1 to enable verbose debug output from curl.
 constexpr uint64 kVerboseOutput = 0;
 
-// Timeout for the whole request. Set only to prevent hanging indefinitely.
-constexpr uint32 kRequestTimeoutSeconds = 3600;  // 1 hour
-
-// Timeout for the connection phase.
-constexpr uint32 kConnectTimeoutSeconds = 120;  // 2 minutes
-
-// The maximum period of request inactivity, after which the request
-// is terminated.
-constexpr uint64 kInactivityTimeoutSeconds = 60;  // 1 minute
-
 // Proxy to the real libcurl implementation.
 class LibCurlProxy : public LibCurl {
  public:
@@ -117,6 +107,10 @@ class LibCurlProxy : public LibCurl {
   }
 
   void curl_free(void* p) override { ::curl_free(p); }
+
+  const char* curl_easy_strerror(CURLcode errornum) override {
+    return ::curl_easy_strerror(errornum);
+  }
 };
 }  // namespace
 
@@ -161,9 +155,6 @@ Status CurlHttpRequest::Init() {
       strings::StrCat("TensorFlow/", TF_VERSION_STRING).c_str());
   // Do not use signals for timeouts - does not work in multi-threaded programs.
   libcurl_->curl_easy_setopt(curl_, CURLOPT_NOSIGNAL, 1L);
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_TIMEOUT, kRequestTimeoutSeconds);
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_CONNECTTIMEOUT,
-                             kConnectTimeoutSeconds);
   libcurl_->curl_easy_setopt(curl_, CURLOPT_HTTP_VERSION,
                              CURL_HTTP_VERSION_2_0);
 
@@ -195,6 +186,7 @@ Status CurlHttpRequest::SetUri(const string& uri) {
   TF_RETURN_IF_ERROR(CheckInitialized());
   TF_RETURN_IF_ERROR(CheckNotSent());
   is_uri_set_ = true;
+  uri_ = uri;
   libcurl_->curl_easy_setopt(curl_, CURLOPT_URL, uri.c_str());
   return Status::OK();
 }
@@ -335,6 +327,16 @@ Status CurlHttpRequest::SetResultBuffer(std::vector<char>* out_buffer) {
   return Status::OK();
 }
 
+Status CurlHttpRequest::SetTimeouts(uint32 connection, uint32 inactivity,
+                                    uint32 total) {
+  TF_RETURN_IF_ERROR(CheckInitialized());
+  TF_RETURN_IF_ERROR(CheckNotSent());
+  connect_timeout_secs_ = connection;
+  inactivity_timeout_secs_ = inactivity;
+  request_timeout_secs_ = total;
+  return Status::OK();
+}
+
 size_t CurlHttpRequest::WriteCallback(const void* ptr, size_t size,
                                       size_t nmemb, void* this_object) {
   CHECK(ptr);
@@ -398,6 +400,10 @@ Status CurlHttpRequest::Send() {
   libcurl_->curl_easy_setopt(curl_, CURLOPT_HEADERFUNCTION,
                              &CurlHttpRequest::HeaderCallback);
 
+  libcurl_->curl_easy_setopt(curl_, CURLOPT_TIMEOUT, request_timeout_secs_);
+  libcurl_->curl_easy_setopt(curl_, CURLOPT_CONNECTTIMEOUT,
+                             connect_timeout_secs_);
+
   char error_buffer[CURL_ERROR_SIZE] = {0};
   libcurl_->curl_easy_setopt(curl_, CURLOPT_ERRORBUFFER, error_buffer);
 
@@ -528,12 +534,37 @@ int CurlHttpRequest::ProgressCallback(void* this_object, curl_off_t dltotal,
     return 0;
   }
 
-  if (now - that->last_progress_timestamp_ > kInactivityTimeoutSeconds) {
+  if (now - that->last_progress_timestamp_ > that->inactivity_timeout_secs_) {
+    double lookup_time = -1;
+    const auto lookup_time_status = that->libcurl_->curl_easy_getinfo(
+        that->curl_, CURLINFO_NAMELOOKUP_TIME, &lookup_time);
+
+    double connect_time = -1;
+    const auto connect_time_status = that->libcurl_->curl_easy_getinfo(
+        that->curl_, CURLINFO_CONNECT_TIME, &connect_time);
+
+    double pretransfer_time = -1;
+    const auto pretransfer_time_status = that->libcurl_->curl_easy_getinfo(
+        that->curl_, CURLINFO_PRETRANSFER_TIME, &pretransfer_time);
+
+    double starttransfer_time = -1;
+    const auto starttransfer_time_status = that->libcurl_->curl_easy_getinfo(
+        that->curl_, CURLINFO_PRETRANSFER_TIME, &starttransfer_time);
+
     LOG(ERROR) << "The transmission  of request " << this_object
-               << " has been stuck at " << current_progress << " of "
-               << dltotal + ultotal << " bytes for "
-               << now - that->last_progress_timestamp_
-               << " seconds and will be aborted.";
+               << " (URI: " << that->uri_ << ") has been stuck at "
+               << current_progress << " of " << dltotal + ultotal
+               << " bytes for " << now - that->last_progress_timestamp_
+               << " seconds and will be aborted. CURL timing information: "
+               << "lookup time: " << lookup_time << " ("
+               << that->libcurl_->curl_easy_strerror(lookup_time_status)
+               << "), connect time: " << connect_time << " ("
+               << that->libcurl_->curl_easy_strerror(connect_time_status)
+               << "), pre-transfer time: " << pretransfer_time << " ("
+               << that->libcurl_->curl_easy_strerror(pretransfer_time_status)
+               << "), start-transfer time: " << starttransfer_time << " ("
+               << that->libcurl_->curl_easy_strerror(starttransfer_time_status)
+               << ")";
     return 1;  // Will abort the request.
   }
 
diff --git a/tensorflow/core/platform/cloud/curl_http_request.h b/tensorflow/core/platform/cloud/curl_http_request.h
index 2396593d6de015d7e002cc59a5ca12a092ab6e86..e4c91dac8d20f85d607feff610b6c50cf8b63e2a 100644
--- a/tensorflow/core/platform/cloud/curl_http_request.h
+++ b/tensorflow/core/platform/cloud/curl_http_request.h
@@ -120,6 +120,9 @@ class CurlHttpRequest : public HttpRequest {
   // Url encodes str and returns a new string.
   string EscapeString(const string& str) override;
 
+  Status SetTimeouts(uint32 connection, uint32 inactivity,
+                     uint32 total) override;
+
  private:
   /// A write callback in the form which can be accepted by libcurl.
   static size_t WriteCallback(const void* ptr, size_t size, size_t nmemb,
@@ -162,12 +165,24 @@ class CurlHttpRequest : public HttpRequest {
   // The last progress in terms of bytes transmitted.
   curl_off_t last_progress_bytes_ = 0;
 
+  // The maximum period of request inactivity.
+  uint32 inactivity_timeout_secs_ = 60;  // 1 minute
+
+  // Timeout for the connection phase.
+  uint32 connect_timeout_secs_ = 120;  // 2 minutes
+
+  // Tiemout for the whole request. Set only to prevent hanging indefinitely.
+  uint32 request_timeout_secs_ = 3600;  // 1 hour
+
   // Members to enforce the usage flow.
   bool is_initialized_ = false;
   bool is_uri_set_ = false;
   bool is_method_set_ = false;
   bool is_sent_ = false;
 
+  // Store the URI to help disambiguate requests when errors occur.
+  string uri_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(CurlHttpRequest);
 };
 
@@ -205,6 +220,8 @@ class LibCurl {
   virtual void curl_slist_free_all(curl_slist* list) = 0;
   virtual char* curl_easy_escape(CURL* curl, const char* str, int length) = 0;
   virtual void curl_free(void* p) = 0;
+
+  virtual const char* curl_easy_strerror(CURLcode errornum) = 0;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/curl_http_request_test.cc b/tensorflow/core/platform/cloud/curl_http_request_test.cc
index 6c0f0818527fdc2610d2f54a965db23a636a98c7..2d3e46edaf8eeaa4ad240b43c23f264a68d9835f 100644
--- a/tensorflow/core/platform/cloud/curl_http_request_test.cc
+++ b/tensorflow/core/platform/cloud/curl_http_request_test.cc
@@ -219,6 +219,10 @@ class FakeLibCurl : public LibCurl {
   }
   void curl_free(void* p) override { port::Free(p); }
 
+  const char* curl_easy_strerror(CURLcode errornum) override {
+    return "<unimplemented>";
+  }
+
   // Variables defining the behavior of this fake.
   string response_content_;
   uint64 response_code_;
@@ -263,7 +267,6 @@ TEST(CurlHttpRequestTest, GetRequest) {
 
   std::vector<char> scratch;
   scratch.insert(scratch.begin(), kTestContent.begin(), kTestContent.end());
-  StringPiece result;
   scratch.reserve(100);
 
   TF_EXPECT_OK(http_request.SetUri("http://www.testuri.com"));
@@ -594,7 +597,6 @@ TEST(CurlHttpRequestTest, ErrorReturnsNoResponse) {
 
   std::vector<char> scratch;
   scratch.insert(scratch.begin(), kTestContent.begin(), kTestContent.end());
-  StringPiece result;
   scratch.reserve(100);
 
   TF_EXPECT_OK(http_request.SetUri("http://www.testuri.com"));
diff --git a/tensorflow/core/platform/cloud/expiring_lru_cache.h b/tensorflow/core/platform/cloud/expiring_lru_cache.h
index 4fe4234e2231e1da8a6ffaf59f4b327be35d406b..3fc23a4306eb96e85099bd63c9c83c6663fe7e3c 100644
--- a/tensorflow/core/platform/cloud/expiring_lru_cache.h
+++ b/tensorflow/core/platform/cloud/expiring_lru_cache.h
@@ -28,7 +28,7 @@ limitations under the License.
 namespace tensorflow {
 
 /// \brief An LRU cache of string keys and arbitrary values, with configurable
-/// max item age and max entries.
+/// max item age (in seconds) and max entries.
 ///
 /// This class is thread safe.
 template <typename T>
@@ -48,16 +48,7 @@ class ExpiringLRUCache {
       return;
     }
     mutex_lock lock(mu_);
-    lru_list_.push_front(key);
-    Entry entry{env_->NowSeconds(), value, lru_list_.begin()};
-    auto insert = cache_.insert(std::make_pair(key, entry));
-    if (!insert.second) {
-      lru_list_.erase(insert.first->second.lru_iterator);
-      insert.first->second = entry;
-    } else if (max_entries_ > 0 && cache_.size() > max_entries_) {
-      cache_.erase(lru_list_.back());
-      lru_list_.pop_back();
-    }
+    InsertLocked(key, value);
   }
 
   /// Look up the entry with key `key` and copy it to `value` if found. Returns
@@ -68,19 +59,33 @@ class ExpiringLRUCache {
       return false;
     }
     mutex_lock lock(mu_);
-    auto it = cache_.find(key);
-    if (it == cache_.end()) {
-      return false;
+    return LookupLocked(key, value);
+  }
+
+  typedef std::function<Status(const string&, T*)> ComputeFunc;
+
+  /// Look up the entry with key `key` and copy it to `value` if found. If not
+  /// found, call `compute_func`. If `compute_func` returns successfully, store
+  /// a copy of the output parameter in the cache, and another copy in `value`.
+  Status LookupOrCompute(const string& key, T* value,
+                         const ComputeFunc& compute_func) {
+    if (max_age_ == 0) {
+      return compute_func(key, value);
     }
-    lru_list_.erase(it->second.lru_iterator);
-    if (env_->NowSeconds() - it->second.timestamp > max_age_) {
-      cache_.erase(it);
-      return false;
+
+    // Note: we hold onto mu_ for the rest of this function. In practice, this
+    // is okay, as stat requests are typically fast, and concurrent requests are
+    // often for the same file. Future work can split this up into one lock per
+    // key if this proves to be a significant performance bottleneck.
+    mutex_lock lock(mu_);
+    if (LookupLocked(key, value)) {
+      return Status::OK();
     }
-    *value = it->second.value;
-    lru_list_.push_front(it->first);
-    it->second.lru_iterator = lru_list_.begin();
-    return true;
+    Status s = compute_func(key, value);
+    if (s.ok()) {
+      InsertLocked(key, *value);
+    }
+    return s;
   }
 
   /// Accessors for cache parameters.
@@ -99,6 +104,36 @@ class ExpiringLRUCache {
     std::list<string>::iterator lru_iterator;
   };
 
+  bool LookupLocked(const string& key, T* value) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    auto it = cache_.find(key);
+    if (it == cache_.end()) {
+      return false;
+    }
+    lru_list_.erase(it->second.lru_iterator);
+    if (env_->NowSeconds() - it->second.timestamp > max_age_) {
+      cache_.erase(it);
+      return false;
+    }
+    *value = it->second.value;
+    lru_list_.push_front(it->first);
+    it->second.lru_iterator = lru_list_.begin();
+    return true;
+  }
+
+  void InsertLocked(const string& key, const T& value)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    lru_list_.push_front(key);
+    Entry entry{env_->NowSeconds(), value, lru_list_.begin()};
+    auto insert = cache_.insert(std::make_pair(key, entry));
+    if (!insert.second) {
+      lru_list_.erase(insert.first->second.lru_iterator);
+      insert.first->second = entry;
+    } else if (max_entries_ > 0 && cache_.size() > max_entries_) {
+      cache_.erase(lru_list_.back());
+      lru_list_.pop_back();
+    }
+  }
+
   /// The maximum age of entries in the cache, in seconds. A value of 0 means
   /// that no entry is ever placed in the cache.
   const uint64 max_age_;
diff --git a/tensorflow/core/platform/cloud/expiring_lru_cache_test.cc b/tensorflow/core/platform/cloud/expiring_lru_cache_test.cc
index bf9bfcd67eb0e2b05dd796002b9de03ca2011a92..8f8d5744a4576991c0056bfefeb30c4bc58549e0 100644
--- a/tensorflow/core/platform/cloud/expiring_lru_cache_test.cc
+++ b/tensorflow/core/platform/cloud/expiring_lru_cache_test.cc
@@ -88,5 +88,69 @@ TEST(ExpiringLRUCacheTest, MaxEntries) {
   EXPECT_EQ(value, 5);
 }
 
+TEST(ExpiringLRUCacheTest, LookupOrCompute) {
+  // max_age of 0 means we should always compute.
+  uint64 num_compute_calls = 0;
+  ExpiringLRUCache<int>::ComputeFunc compute_func =
+      [&num_compute_calls](const string& key, int* value) {
+        *value = num_compute_calls;
+        num_compute_calls++;
+        return Status::OK();
+      };
+  ExpiringLRUCache<int> cache1(0, 4);
+
+  int value = -1;
+  TF_EXPECT_OK(cache1.LookupOrCompute("a", &value, compute_func));
+  EXPECT_EQ(value, 0);
+  EXPECT_EQ(num_compute_calls, 1);
+  // re-read the same value, expect another lookup
+  TF_EXPECT_OK(cache1.LookupOrCompute("a", &value, compute_func));
+  EXPECT_EQ(value, 1);
+  EXPECT_EQ(num_compute_calls, 2);
+
+  // Define a new cache with max_age > 0 and verify correct behavior.
+  ExpiringLRUCache<int> cache2(2, 4);
+  num_compute_calls = 0;
+  value = -1;
+
+  // Read our first value
+  TF_EXPECT_OK(cache2.LookupOrCompute("a", &value, compute_func));
+  EXPECT_EQ(value, 0);
+  EXPECT_EQ(num_compute_calls, 1);
+  // Re-read, exepct no additional function compute_func calls.
+  TF_EXPECT_OK(cache2.LookupOrCompute("a", &value, compute_func));
+  EXPECT_EQ(value, 0);
+  EXPECT_EQ(num_compute_calls, 1);
+
+  // Read a sequence of additional values, eventually evicting "a".
+  TF_EXPECT_OK(cache2.LookupOrCompute("b", &value, compute_func));
+  EXPECT_EQ(value, 1);
+  EXPECT_EQ(num_compute_calls, 2);
+  TF_EXPECT_OK(cache2.LookupOrCompute("c", &value, compute_func));
+  EXPECT_EQ(value, 2);
+  EXPECT_EQ(num_compute_calls, 3);
+  TF_EXPECT_OK(cache2.LookupOrCompute("d", &value, compute_func));
+  EXPECT_EQ(value, 3);
+  EXPECT_EQ(num_compute_calls, 4);
+  TF_EXPECT_OK(cache2.LookupOrCompute("e", &value, compute_func));
+  EXPECT_EQ(value, 4);
+  EXPECT_EQ(num_compute_calls, 5);
+  // Verify the other values remain in the cache.
+  TF_EXPECT_OK(cache2.LookupOrCompute("b", &value, compute_func));
+  EXPECT_EQ(value, 1);
+  EXPECT_EQ(num_compute_calls, 5);
+  TF_EXPECT_OK(cache2.LookupOrCompute("c", &value, compute_func));
+  EXPECT_EQ(value, 2);
+  EXPECT_EQ(num_compute_calls, 5);
+  TF_EXPECT_OK(cache2.LookupOrCompute("d", &value, compute_func));
+  EXPECT_EQ(value, 3);
+  EXPECT_EQ(num_compute_calls, 5);
+
+  // Re-read "a", ensure it is re-computed.
+  TF_EXPECT_OK(cache2.LookupOrCompute("a", &value, compute_func));
+  EXPECT_EQ(value, 5);
+  EXPECT_EQ(num_compute_calls, 6);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/file_block_cache.cc b/tensorflow/core/platform/cloud/file_block_cache.cc
index a05c18c06948aa835da7452451cb649df7a66943..e1afc7b308e740769abca5d95fde34c004df75ee 100644
--- a/tensorflow/core/platform/cloud/file_block_cache.cc
+++ b/tensorflow/core/platform/cloud/file_block_cache.cc
@@ -16,79 +16,137 @@ limitations under the License.
 #include "tensorflow/core/platform/cloud/file_block_cache.h"
 #include <cstring>
 #include <memory>
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/env.h"
 
 namespace tensorflow {
 
-std::shared_ptr<FileBlockCache::Block> FileBlockCache::Lookup(const Key& key) {
-  mutex_lock lock(mu_);
-  auto entry = block_map_.find(key);
-  if (entry == block_map_.end()) {
-    return std::shared_ptr<Block>();
-  }
-  // If we're enforcing max staleness and the block is stale, remove all of the
-  // file's cached blocks so we reload them.
-  if (max_staleness_ > 0 &&
-      env_->NowSeconds() - entry->second->timestamp > max_staleness_) {
-    RemoveFile_Locked(key.first);
-    return std::shared_ptr<Block>();
+bool FileBlockCache::BlockNotStale(const std::shared_ptr<Block>& block) {
+  mutex_lock l(block->mu);
+  if (block->state != FetchState::FINISHED) {
+    return true;  // No need to check for staleness.
   }
-  return entry->second;
+  if (max_staleness_ == 0) return true;  // Not enforcing staleness.
+  return env_->NowSeconds() - block->timestamp <= max_staleness_;
 }
 
-std::shared_ptr<FileBlockCache::Block> FileBlockCache::Insert(
-    const Key& key, std::shared_ptr<Block> block) {
+std::shared_ptr<FileBlockCache::Block> FileBlockCache::Lookup(const Key& key) {
   mutex_lock lock(mu_);
   auto entry = block_map_.find(key);
   if (entry != block_map_.end()) {
-    // Use the block that's already in the cache.
-    return entry->second;
-  }
-  // Sanity check to detect interrupted reads leading to partial blocks: a
-  // partial block must have a higher key than the highest existing key in the
-  // block map for the file. Note that since this check relies on the existence
-  // of a cached block with a higher key, some incomplete reads may still go
-  // undetected (if their key happens to be higher than anything in the cache).
-  if (block->data.size() < block_size_ && !block_map_.empty()) {
-    Key fmax = std::make_pair(key.first, std::numeric_limits<size_t>::max());
-    auto fcmp = block_map_.upper_bound(fmax);
-    if (fcmp != block_map_.begin() && key < (--fcmp)->first) {
-      // We expected to read a full block at this position.
-      return std::shared_ptr<Block>();
+    if (BlockNotStale(entry->second)) {
+      return entry->second;
+    } else {
+      // Remove the stale block and continue.
+      RemoveFile_Locked(key.first);
     }
   }
-  // Add the block to the cache (with necessary bookkeeping).
+
+  // Insert a new empty block, setting the bookkeeping to sentinel values
+  // in order to update them as appropriate.
+  auto new_entry = std::make_shared<Block>();
   lru_list_.push_front(key);
   lra_list_.push_front(key);
-  block->lru_iterator = lru_list_.begin();
-  block->lra_iterator = lra_list_.begin();
-  block->timestamp = env_->NowSeconds();
-  cache_size_ += block->data.size();
-  block_map_.emplace(std::make_pair(key, block));
-  return block;
+  new_entry->lru_iterator = lru_list_.begin();
+  new_entry->lra_iterator = lra_list_.begin();
+  new_entry->timestamp = env_->NowSeconds();
+  block_map_.emplace(std::make_pair(key, new_entry));
+  return new_entry;
 }
 
-// Remove blocks from the cache until there is space for a full sized block.
+// Remove blocks from the cache until we do not exceed our maximum size.
 void FileBlockCache::Trim() {
-  mutex_lock lock(mu_);
-  while (!lru_list_.empty() && cache_size_ + block_size_ > max_bytes_) {
+  while (!lru_list_.empty() && cache_size_ > max_bytes_) {
     RemoveBlock(block_map_.find(lru_list_.back()));
   }
 }
 
 /// Move the block to the front of the LRU list if it isn't already there.
-void FileBlockCache::UpdateLRU(const Key& key,
-                               const std::shared_ptr<Block>& block) {
+Status FileBlockCache::UpdateLRU(const Key& key,
+                                 const std::shared_ptr<Block>& block) {
   mutex_lock lock(mu_);
   if (block->timestamp == 0) {
     // The block was evicted from another thread. Allow it to remain evicted.
-    return;
+    return Status::OK();
   }
   if (block->lru_iterator != lru_list_.begin()) {
     lru_list_.erase(block->lru_iterator);
     lru_list_.push_front(key);
     block->lru_iterator = lru_list_.begin();
   }
+
+  // Check for inconsistent state. If there is a block later in the same file
+  // in the cache, and our current block is not block size, this likely means
+  // we have inconsistent state within the cache. Note: it's possible some
+  // incomplete reads may still go undetected.
+  if (block->data.size() < block_size_) {
+    Key fmax = std::make_pair(key.first, std::numeric_limits<size_t>::max());
+    auto fcmp = block_map_.upper_bound(fmax);
+    if (fcmp != block_map_.begin() && key < (--fcmp)->first) {
+      return errors::Internal("Block cache contents are inconsistent.");
+    }
+  }
+
+  Trim();
+
+  return Status::OK();
+}
+
+Status FileBlockCache::MaybeFetch(const Key& key,
+                                  const std::shared_ptr<Block>& block) {
+  bool downloaded_block = false;
+  auto reconcile_state =
+      gtl::MakeCleanup([this, &downloaded_block, &key, &block] {
+        // Perform this action in a cleanup callback to avoid locking mu_ after
+        // locking block->mu.
+        if (downloaded_block) {
+          mutex_lock l(mu_);
+          // Do not update state if the block is already to be evicted.
+          if (block->timestamp != 0) {
+            cache_size_ += block->data.size();
+            // Put to beginning of LRA list.
+            lra_list_.erase(block->lra_iterator);
+            lra_list_.push_front(key);
+            block->lra_iterator = lra_list_.begin();
+            block->timestamp = env_->NowSeconds();
+          }
+        }
+      });
+  // Loop until either block content is successfully fetched, or our request
+  // encounters an error.
+  mutex_lock l(block->mu);
+  Status status = Status::OK();
+  while (true) {
+    switch (block->state) {
+      case FetchState::ERROR:
+        TF_FALLTHROUGH_INTENDED;
+      case FetchState::CREATED:
+        block->state = FetchState::FETCHING;
+        block->mu.unlock();  // Release the lock while making the API call.
+        status.Update(
+            block_fetcher_(key.first, key.second, block_size_, &block->data));
+        block->mu.lock();  // Reacquire the lock immediately afterwards
+        if (status.ok()) {
+          downloaded_block = true;
+          block->state = FetchState::FINISHED;
+        } else {
+          block->state = FetchState::ERROR;
+        }
+        block->cond_var.notify_all();
+        return status;
+      case FetchState::FETCHING:
+        block->cond_var.wait_for(l, std::chrono::seconds(60));
+        if (block->state == FetchState::FINISHED) {
+          return Status::OK();
+        }
+        // Re-loop in case of errors.
+        break;
+      case FetchState::FINISHED:
+        return Status::OK();
+    }
+  }
+  return errors::Internal(
+      "Control flow should never reach the end of FileBlockCache::Fetch.");
 }
 
 Status FileBlockCache::Read(const string& filename, size_t offset, size_t n,
@@ -114,22 +172,18 @@ Status FileBlockCache::Read(const string& filename, size_t offset, size_t n,
     // Look up the block, fetching and inserting it if necessary, and update the
     // LRU iterator for the key and block.
     std::shared_ptr<Block> block = Lookup(key);
-    if (!block) {
-      Trim();
-      auto fetch = std::make_shared<Block>();
-      auto status = block_fetcher_(filename, pos, block_size_, &fetch->data);
-      if (!(block = Insert(key, fetch))) {
-        return errors::Internal("File contents are inconsistent");
-      }
-    }
-    UpdateLRU(key, block);
+    DCHECK(block) << "No block for key " << key.first << "@" << key.second;
+    TF_RETURN_IF_ERROR(MaybeFetch(key, block));
+    TF_RETURN_IF_ERROR(UpdateLRU(key, block));
     // Copy the relevant portion of the block into the result buffer.
     const auto& data = block->data;
     if (offset >= pos + data.size()) {
       // The requested offset is at or beyond the end of the file. This can
       // happen if `offset` is not block-aligned, and the read returns the last
       // block in the file, which does not extend all the way out to `offset`.
-      return errors::OutOfRange("EOF at offset ", offset);
+      return errors::OutOfRange("EOF at offset ", offset, " in file ", filename,
+                                " at position ", pos, "with data size ",
+                                data.size());
     }
     auto begin = data.begin();
     if (offset > pos) {
@@ -190,11 +244,11 @@ void FileBlockCache::RemoveFile_Locked(const string& filename) {
 }
 
 void FileBlockCache::RemoveBlock(BlockMap::iterator entry) {
-  lru_list_.erase(entry->second->lru_iterator);
-  lra_list_.erase(entry->second->lra_iterator);
   // This signals that the block is removed, and should not be inadvertently
   // reinserted into the cache in UpdateLRU.
   entry->second->timestamp = 0;
+  lru_list_.erase(entry->second->lru_iterator);
+  lra_list_.erase(entry->second->lra_iterator);
   cache_size_ -= entry->second->data.size();
   block_map_.erase(entry);
 }
diff --git a/tensorflow/core/platform/cloud/file_block_cache.h b/tensorflow/core/platform/cloud/file_block_cache.h
index b45d2260957858163585ae845a3867b0c01f3d0f..36dbf9db83238fa05e3b010c2a73cb823623f54b 100644
--- a/tensorflow/core/platform/cloud/file_block_cache.h
+++ b/tensorflow/core/platform/cloud/file_block_cache.h
@@ -115,11 +115,35 @@ class FileBlockCache {
   /// The file block cache key is a {filename, offset} pair.
   typedef std::pair<string, size_t> Key;
 
+  /// \brief The state of a block.
+  ///
+  /// A block begins in the CREATED stage. The first thread will attempt to read
+  /// the block from the filesystem, transitioning the state of the block to
+  /// FETCHING. After completing, if the read was successful the state should
+  /// be FINISHED. Otherwise the state should be ERROR. A subsequent read can
+  /// re-fetch the block if the state is ERROR.
+  enum class FetchState {
+    CREATED,
+    FETCHING,
+    FINISHED,
+    ERROR,
+  };
+
   /// \brief A block of a file.
   ///
   /// A file block consists of the block data, the block's current position in
-  /// the LRU cache, and the timestamp (seconds since epoch) at which the block
-  /// was cached.
+  /// the LRU cache, the timestamp (seconds since epoch) at which the block
+  /// was cached, a coordination lock, and state & condition variables.
+  ///
+  /// Thread safety:
+  /// The iterator and timestamp fields should only be accessed while holding
+  /// the block-cache-wide mu_ instance variable. The state variable should only
+  /// be accessed while holding the Block's mu lock. The data vector should only
+  /// be accessed after state == FINISHED, and it should never be modified.
+  ///
+  /// In order to prevent deadlocks, never grab the block-cache-wide mu_ lock
+  /// AFTER grabbing any block's mu lock. It is safe to grab mu without locking
+  /// mu_.
   struct Block {
     /// The block data.
     std::vector<char> data;
@@ -129,6 +153,12 @@ class FileBlockCache {
     std::list<Key>::iterator lra_iterator;
     /// The timestamp (seconds since epoch) at which the block was cached.
     uint64 timestamp;
+    /// Mutex to guard state variable
+    mutex mu;
+    /// The state of the block.
+    FetchState state GUARDED_BY(mu) = FetchState::CREATED;
+    /// Wait on cond_var if state is FETCHING.
+    condition_variable cond_var;
   };
 
   /// \brief The block map type for the file block cache.
@@ -139,19 +169,20 @@ class FileBlockCache {
   /// Prune the cache by removing files with expired blocks.
   void Prune() LOCKS_EXCLUDED(mu_);
 
+  bool BlockNotStale(const std::shared_ptr<Block>& block)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
   /// Look up a Key in the block cache.
   std::shared_ptr<Block> Lookup(const Key& key) LOCKS_EXCLUDED(mu_);
 
-  /// Insert a block in the block cache with the given key.
-  std::shared_ptr<FileBlockCache::Block> Insert(const Key& key,
-                                                std::shared_ptr<Block> block)
+  Status MaybeFetch(const Key& key, const std::shared_ptr<Block>& block)
       LOCKS_EXCLUDED(mu_);
 
   /// Trim the block cache to make room for another entry.
-  void Trim() LOCKS_EXCLUDED(mu_);
+  void Trim() EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
-  /// Update LRU and LRA iterators for the block at `key`.
-  void UpdateLRU(const Key& key, const std::shared_ptr<Block>& block)
+  /// Update the LRU iterator for the block at `key`.
+  Status UpdateLRU(const Key& key, const std::shared_ptr<Block>& block)
       LOCKS_EXCLUDED(mu_);
 
   /// Remove all blocks of a file, with mu_ already held.
@@ -179,6 +210,9 @@ class FileBlockCache {
 
   /// The LRA (least recently added) list of block keys. The front of the list
   /// identifies the most recently added block.
+  ///
+  /// Note: blocks are added to lra_list_ only after they have successfully been
+  /// fetched from the underlying block store.
   std::list<Key> lra_list_ GUARDED_BY(mu_);
 
   /// The combined number of bytes in all of the cached blocks.
diff --git a/tensorflow/core/platform/cloud/file_block_cache_test.cc b/tensorflow/core/platform/cloud/file_block_cache_test.cc
index 5fa738b45292f3683a2f79aee00de1aa9da619d4..bebed5af10dda5a2a6c545300f3e023aad8079bd 100644
--- a/tensorflow/core/platform/cloud/file_block_cache_test.cc
+++ b/tensorflow/core/platform/cloud/file_block_cache_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/cloud/now_seconds_env.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/notification.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -435,5 +436,39 @@ TEST(FileBlockCacheTest, ParallelReads) {
   // executed, or 10 seconds have passed).
 }
 
+TEST(FileBlockCacheTest, CoalesceConcurrentReads) {
+  // Concurrent reads to the same file blocks should be de-duplicated.
+  const size_t block_size = 16;
+  int num_requests = 0;
+  Notification notification;
+  auto fetcher = [&num_requests, &notification, block_size](
+                     const string& filename, size_t offset, size_t n,
+                     std::vector<char>* out) {
+    EXPECT_EQ(n, block_size);
+    EXPECT_EQ(offset, 0);
+    num_requests++;
+    out->resize(n, 'x');
+    notification.Notify();
+    // Wait for other thread to issue read.
+    Env::Default()->SleepForMicroseconds(100000);  // 0.1 secs
+    return Status::OK();
+  };
+  FileBlockCache cache(block_size, block_size, 0, fetcher);
+  // Fork off thread for parallel read.
+  std::unique_ptr<Thread> concurrent(
+      Env::Default()->StartThread({}, "concurrent", [&cache, block_size] {
+        std::vector<char> out;
+        TF_EXPECT_OK(cache.Read("", 0, block_size / 2, &out));
+        EXPECT_EQ(out.size(), block_size / 2);
+      }));
+  EXPECT_TRUE(WaitForNotificationWithTimeout(&notification, 10000))
+      << "Timeout waiting for concurrent thread to start.";
+  std::vector<char> out;
+  TF_EXPECT_OK(cache.Read("", block_size / 2, block_size / 2, &out));
+  EXPECT_EQ(out.size(), block_size / 2);
+
+  EXPECT_EQ(1, num_requests);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/gcs_dns_cache.cc b/tensorflow/core/platform/cloud/gcs_dns_cache.cc
index 63f2da065db9c85eaac0f6ae1f64a079440a9eaf..87b0dde13627f38bb98ada40ec78c1421245df6d 100644
--- a/tensorflow/core/platform/cloud/gcs_dns_cache.cc
+++ b/tensorflow/core/platform/cloud/gcs_dns_cache.cc
@@ -14,18 +14,48 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/platform/cloud/gcs_dns_cache.h"
-
+#ifndef _WIN32
 #include <arpa/inet.h>
 #include <netdb.h>
+#else
+#include <winsock2.h>
+#include <ws2tcpip.h>
+#include <Windows.h>
+#endif
 #include <sys/types.h>
 
 namespace tensorflow {
 
 namespace {
 
-constexpr char kStorageHost[] = "storage.googleapis.com";
-constexpr char kWwwHost[] = "www.googleapis.com";
+const std::vector<string>& kCachedDomainNames =
+    *new std::vector<string>{"www.googleapis.com", "storage.googleapis.com"};
+
+inline void print_getaddrinfo_error(const string& name, int error_code) {
+#ifndef _WIN32
+  if (error_code == EAI_SYSTEM) {
+    LOG(ERROR) << "Error resolving " << name
+               << " (EAI_SYSTEM): " << strerror(errno);
+  } else {
+    LOG(ERROR) << "Error resolving " << name << ": "
+               << gai_strerror(error_code);
+  }
+#else
+  // TODO:WSAGetLastError is better than gai_strerror
+  LOG(ERROR) << "Error resolving " << name << ": " << gai_strerror(error_code);
+#endif
+}
 
+// Selects one item at random from a vector of items, using a uniform
+// distribution.
+template <typename T>
+const T& SelectRandomItemUniform(std::default_random_engine* random,
+                                 const std::vector<T>& items) {
+  CHECK_GT(items.size(), 0);
+  std::uniform_int_distribution<size_t> distribution(0u, items.size() - 1u);
+  size_t choice_index = distribution(*random);
+  return items[choice_index];
+}
 }  // namespace
 
 GcsDnsCache::GcsDnsCache(Env* env, int64 refresh_rate_secs)
@@ -35,39 +65,38 @@ Status GcsDnsCache::AnnotateRequest(HttpRequest* request) {
   // TODO(saeta): Blacklist failing IP addresses.
   mutex_lock l(mu_);
   if (!started_) {
+    VLOG(1) << "Starting GCS DNS cache.";
     DCHECK(!worker_) << "Worker thread already exists!";
     // Perform DNS resolutions to warm the cache.
-    std::vector<string> www_addresses = ResolveName(kWwwHost);
-    std::vector<string> storage_addresses = ResolveName(kStorageHost);
-    www_addresses.swap(www_addresses_);
-    storage_addresses.swap(storage_addresses_);
+    addresses_ = ResolveNames(kCachedDomainNames);
 
     // Note: we opt to use a thread instead of a delayed closure.
     worker_.reset(env_->StartThread(
         {}, "gcs_dns_worker", std::bind(&GcsDnsCache::WorkerThread, this)));
     started_ = true;
   }
-  if (!storage_addresses_.empty()) {
-    std::uniform_int_distribution<> storage_dist(0,
-                                                 storage_addresses_.size() - 1);
-    size_t index = storage_dist(random_);
-    TF_RETURN_IF_ERROR(request->AddResolveOverride(kStorageHost, 443,
-                                                   storage_addresses_[index]));
-  } else {
-    LOG(WARNING) << "No IP addresses available for " << kStorageHost;
-  }
-  if (!www_addresses_.empty()) {
-    std::uniform_int_distribution<> www_dist(0, www_addresses_.size() - 1);
-    size_t index = www_dist(random_);
-    TF_RETURN_IF_ERROR(
-        request->AddResolveOverride(kWwwHost, 443, www_addresses_[index]));
-  } else {
-    LOG(WARNING) << "No IP addresses available for " << kWwwHost;
+
+  CHECK_EQ(kCachedDomainNames.size(), addresses_.size());
+  for (size_t i = 0; i < kCachedDomainNames.size(); ++i) {
+    const string& name = kCachedDomainNames[i];
+    const std::vector<string>& addresses = addresses_[i];
+    if (!addresses.empty()) {
+      const string& chosen_address =
+          SelectRandomItemUniform(&random_, addresses);
+      TF_RETURN_IF_ERROR(
+          request->AddResolveOverride(name, 443, chosen_address));
+      VLOG(1) << "Annotated DNS mapping: " << name << " --> " << chosen_address;
+    } else {
+      LOG(WARNING) << "No IP addresses available for " << name;
+    }
   }
+
   return Status::OK();
 }
 
 /* static */ std::vector<string> GcsDnsCache::ResolveName(const string& name) {
+  VLOG(1) << "Resolving DNS name: " << name;
+
   addrinfo hints;
   memset(&hints, 0, sizeof(hints));
   hints.ai_family = AF_INET;  // Only use IPv4 for now.
@@ -77,7 +106,7 @@ Status GcsDnsCache::AnnotateRequest(HttpRequest* request) {
 
   std::vector<string> output;
   if (return_code == 0) {
-    for (addrinfo* i = result; i != nullptr; i = i->ai_next) {
+    for (const addrinfo* i = result; i != nullptr; i = i->ai_next) {
       if (i->ai_family != AF_INET || i->ai_addr->sa_family != AF_INET) {
         LOG(WARNING) << "Non-IPv4 address returned. ai_family: " << i->ai_family
                      << ". sa_family: " << i->ai_addr->sa_family << ".";
@@ -93,16 +122,11 @@ Status GcsDnsCache::AnnotateRequest(HttpRequest* request) {
                    << ": " << strerror(errno);
       } else {
         output.emplace_back(buf);
+        VLOG(1) << "... address: " << buf;
       }
     }
   } else {
-    if (return_code == EAI_SYSTEM) {
-      LOG(ERROR) << "Error resolving " << name
-                 << " (EAI_SYSTEM): " << strerror(errno);
-    } else {
-      LOG(ERROR) << "Error resolving " << name << ": "
-                 << gai_strerror(return_code);
-    }
+    print_getaddrinfo_error(name, return_code);
   }
   if (result != nullptr) {
     freeaddrinfo(result);
@@ -110,6 +134,25 @@ Status GcsDnsCache::AnnotateRequest(HttpRequest* request) {
   return output;
 }
 
+// Performs DNS resolution for a set of DNS names. The return vector contains
+// one element for each element in 'names', and each element is itself a
+// vector of IP addresses (in textual form).
+//
+// If DNS resolution fails for any name, then that slot in the return vector
+// will still be present, but will be an empty vector.
+//
+// Ensures: names.size() == return_value.size()
+
+std::vector<std::vector<string>> GcsDnsCache::ResolveNames(
+    const std::vector<string>& names) {
+  std::vector<std::vector<string>> all_addresses;
+  all_addresses.reserve(names.size());
+  for (const string& name : names) {
+    all_addresses.push_back(ResolveName(name));
+  }
+  return all_addresses;
+}
+
 void GcsDnsCache::WorkerThread() {
   while (true) {
     {
@@ -119,15 +162,14 @@ void GcsDnsCache::WorkerThread() {
       cond_var_.wait_for(l, std::chrono::seconds(refresh_rate_secs_));
       if (cancelled_) return;
     }
+
     // Resolve DNS values
-    std::vector<string> www_addresses = ResolveName(kWwwHost);
-    std::vector<string> storage_addresses = ResolveName(kStorageHost);
+    auto new_addresses = ResolveNames(kCachedDomainNames);
 
     {
       mutex_lock l(mu_);
       // Update instance variables.
-      www_addresses.swap(www_addresses_);
-      storage_addresses.swap(storage_addresses_);
+      addresses_.swap(new_addresses);
     }
   }
 }
diff --git a/tensorflow/core/platform/cloud/gcs_dns_cache.h b/tensorflow/core/platform/cloud/gcs_dns_cache.h
index 7a4d3847a5ac82b1ced742a20ca18ba84bf6fa7c..2ef7c9bdbe5279974308cdd947c0e415d5d87f32 100644
--- a/tensorflow/core/platform/cloud/gcs_dns_cache.h
+++ b/tensorflow/core/platform/cloud/gcs_dns_cache.h
@@ -52,6 +52,8 @@ class GcsDnsCache {
 
  private:
   static std::vector<string> ResolveName(const string& name);
+  static std::vector<std::vector<string>> ResolveNames(
+      const std::vector<string>& names);
   void WorkerThread();
 
   // Define a friend class for testing.
@@ -63,10 +65,11 @@ class GcsDnsCache {
   std::default_random_engine random_ GUARDED_BY(mu_);
   bool started_ GUARDED_BY(mu_) = false;
   bool cancelled_ GUARDED_BY(mu_) = false;
-  std::vector<string> www_addresses_ GUARDED_BY(mu_);
-  std::vector<string> storage_addresses_ GUARDED_BY(mu_);
   std::unique_ptr<Thread> worker_ GUARDED_BY(mu_);  // After mutable vars.
   const int64 refresh_rate_secs_;
+
+  // Entries in this vector correspond to entries in kCachedDomainNames.
+  std::vector<std::vector<string>> addresses_ GUARDED_BY(mu_);
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/gcs_dns_cache_test.cc b/tensorflow/core/platform/cloud/gcs_dns_cache_test.cc
index 8d1a108f30dd0461a1cd08dd217badbdf24fc400..2c3819f1e2e10a5046d00aac82fc6df4bf1ef07c 100644
--- a/tensorflow/core/platform/cloud/gcs_dns_cache_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_dns_cache_test.cc
@@ -64,6 +64,11 @@ class TestHttpRequest : public HttpRequest {
   Status Send() override { return Status::OK(); }
   string EscapeString(const string& str) override { return ""; }
 
+  Status SetTimeouts(uint32 connection, uint32 inactivity,
+                     uint32 total) override {
+    return Status::OK();
+  }
+
   std::map<string, string> resolve_overrides_;
 };
 
@@ -83,8 +88,7 @@ class GcsDnsCacheTest : public ::testing::Test {
     {
       mutex_lock l(d.mu_);
       d.started_ = true;  // Avoid creating a thread.
-      d.www_addresses_ = {"192.168.1.1"};
-      d.storage_addresses_ = {"172.134.1.1"};
+      d.addresses_ = {{"192.168.1.1"}, {"172.134.1.1"}};
     }
 
     TestHttpRequest req;
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 9287de7237df4d56a9a6b27e32859b3f60e7da4e..a183fe6fa80d8f9961384f32735c06d69d52038f 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -22,6 +22,9 @@ limitations under the License.
 #include <cstring>
 #include <fstream>
 #include <vector>
+#ifdef _WIN32
+#include <io.h>  //for _mktemp
+#endif
 #include "include/json/json.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
@@ -29,6 +32,7 @@ limitations under the License.
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/cloud/curl_http_request.h"
 #include "tensorflow/core/platform/cloud/file_block_cache.h"
 #include "tensorflow/core/platform/cloud/google_auth_provider.h"
@@ -39,6 +43,12 @@ limitations under the License.
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 
+#ifdef _WIN32
+#ifdef DeleteFile
+#undef DeleteFile
+#endif
+#endif
+
 namespace tensorflow {
 
 namespace {
@@ -93,17 +103,40 @@ const FileStatistics DIRECTORY_STAT(0, 0, true);
 // variable to a positive integer describing the frequency used to refresh the
 // userspace DNS cache.
 constexpr char kResolveCacheSecs[] = "GCS_RESOLVE_REFRESH_SECS";
-
+// The environment variable to configure the http request's connection timeout.
+constexpr char kRequestConnectionTimeout[] =
+    "GCS_REQUEST_CONNECTION_TIMEOUT_SECS";
+// The environment varaible to configure the http request's idle timeout.
+constexpr char kRequestIdleTimeout[] = "GCS_REQUEST_IDLE_TIMEOUT_SECS";
+// The environment variable to configure the overall request timeout for
+// metadata requests.
+constexpr char kMetadataRequestTimeout[] = "GCS_METADATA_REQUEST_TIMEOUT_SECS";
+// The environment variable to configure the overall request timeout for
+// block reads requests.
+constexpr char kReadRequestTimeout[] = "GCS_READ_REQUEST_TIMEOUT_SECS";
+// The environment variable to configure the overall request timeout for
+// upload requests.
+constexpr char kWriteRequestTimeout[] = "GCS_WRITE_REQUEST_TIMEOUT_SECS";
+
+// TODO: DO NOT use a hardcoded path
 Status GetTmpFilename(string* filename) {
   if (!filename) {
     return errors::Internal("'filename' cannot be nullptr.");
   }
+#ifndef _WIN32
   char buffer[] = "/tmp/gcs_filesystem_XXXXXX";
   int fd = mkstemp(buffer);
   if (fd < 0) {
     return errors::Internal("Failed to create a temporary file.");
   }
   close(fd);
+#else
+  char buffer[] = "/tmp/gcs_filesystem_XXXXXX";
+  char* ret = _mktemp(buffer);
+  if (ret == nullptr) {
+    return errors::Internal("Failed to create a temporary file.");
+  }
+#endif
   *filename = buffer;
   return Status::OK();
 }
@@ -280,17 +313,18 @@ class GcsRandomAccessFile : public RandomAccessFile {
 class GcsWritableFile : public WritableFile {
  public:
   GcsWritableFile(const string& bucket, const string& object,
-                  AuthProvider* auth_provider,
-                  HttpRequest::Factory* http_request_factory,
+                  GcsFileSystem* filesystem,
+                  GcsFileSystem::TimeoutConfig* timeouts,
                   std::function<void()> file_cache_erase,
                   int64 initial_retry_delay_usec)
       : bucket_(bucket),
         object_(object),
-        auth_provider_(auth_provider),
-        http_request_factory_(http_request_factory),
+        filesystem_(filesystem),
+        timeouts_(timeouts),
         file_cache_erase_(std::move(file_cache_erase)),
         sync_needed_(true),
         initial_retry_delay_usec_(initial_retry_delay_usec) {
+    // TODO: to make it safer, outfile_ should be constructed from an FD
     if (GetTmpFilename(&tmp_content_filename_).ok()) {
       outfile_.open(tmp_content_filename_,
                     std::ofstream::binary | std::ofstream::app);
@@ -303,15 +337,14 @@ class GcsWritableFile : public WritableFile {
   /// with the content to be appended. The class takes onwnership of the
   /// specified tmp file and deletes it on close.
   GcsWritableFile(const string& bucket, const string& object,
-                  AuthProvider* auth_provider,
-                  const string& tmp_content_filename,
-                  HttpRequest::Factory* http_request_factory,
+                  GcsFileSystem* filesystem, const string& tmp_content_filename,
+                  GcsFileSystem::TimeoutConfig* timeouts,
                   std::function<void()> file_cache_erase,
                   int64 initial_retry_delay_usec)
       : bucket_(bucket),
         object_(object),
-        auth_provider_(auth_provider),
-        http_request_factory_(http_request_factory),
+        filesystem_(filesystem),
+        timeouts_(timeouts),
         file_cache_erase_(std::move(file_cache_erase)),
         sync_needed_(true),
         initial_retry_delay_usec_(initial_retry_delay_usec) {
@@ -415,7 +448,7 @@ class GcsWritableFile : public WritableFile {
       return errors::Internal("'size' cannot be nullptr");
     }
     const auto tellp = outfile_.tellp();
-    if (tellp == -1) {
+    if (tellp == static_cast<std::streampos>(-1)) {
       return errors::Internal(
           "Could not get the size of the internal temporary file.");
     }
@@ -431,20 +464,19 @@ class GcsWritableFile : public WritableFile {
     uint64 file_size;
     TF_RETURN_IF_ERROR(GetCurrentFileSize(&file_size));
 
-    string auth_token;
-    TF_RETURN_IF_ERROR(AuthProvider::GetToken(auth_provider_, &auth_token));
-
     std::vector<char> output_buffer;
-    std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
-    TF_RETURN_IF_ERROR(request->Init());
+    std::unique_ptr<HttpRequest> request;
+    TF_RETURN_IF_ERROR(filesystem_->CreateHttpRequest(&request));
+
     TF_RETURN_IF_ERROR(request->SetUri(strings::StrCat(
         kGcsUploadUriBase, "b/", bucket_,
         "/o?uploadType=resumable&name=", request->EscapeString(object_))));
-    TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
     TF_RETURN_IF_ERROR(request->AddHeader("X-Upload-Content-Length",
                                           std::to_string(file_size)));
     TF_RETURN_IF_ERROR(request->SetPostEmptyBody());
     TF_RETURN_IF_ERROR(request->SetResultBuffer(&output_buffer));
+    TF_RETURN_IF_ERROR(request->SetTimeouts(timeouts_->connect, timeouts_->idle,
+                                            timeouts_->metadata));
     TF_RETURN_WITH_CONTEXT_IF_ERROR(
         request->Send(), " when initiating an upload to ", GetGcsPath());
     *session_uri = request->GetResponseHeader("Location");
@@ -469,13 +501,11 @@ class GcsWritableFile : public WritableFile {
     uint64 file_size;
     TF_RETURN_IF_ERROR(GetCurrentFileSize(&file_size));
 
-    string auth_token;
-    TF_RETURN_IF_ERROR(AuthProvider::GetToken(auth_provider_, &auth_token));
-
-    std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
-    TF_RETURN_IF_ERROR(request->Init());
+    std::unique_ptr<HttpRequest> request;
+    TF_RETURN_IF_ERROR(filesystem_->CreateHttpRequest(&request));
     TF_RETURN_IF_ERROR(request->SetUri(session_uri));
-    TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
+    TF_RETURN_IF_ERROR(request->SetTimeouts(timeouts_->connect, timeouts_->idle,
+                                            timeouts_->metadata));
     TF_RETURN_IF_ERROR(request->AddHeader(
         "Content-Range", strings::StrCat("bytes */", file_size)));
     TF_RETURN_IF_ERROR(request->SetPutEmptyBody());
@@ -518,18 +548,17 @@ class GcsWritableFile : public WritableFile {
     uint64 file_size;
     TF_RETURN_IF_ERROR(GetCurrentFileSize(&file_size));
 
-    string auth_token;
-    TF_RETURN_IF_ERROR(AuthProvider::GetToken(auth_provider_, &auth_token));
-
-    std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
-    TF_RETURN_IF_ERROR(request->Init());
+    std::unique_ptr<HttpRequest> request;
+    TF_RETURN_IF_ERROR(filesystem_->CreateHttpRequest(&request));
     TF_RETURN_IF_ERROR(request->SetUri(session_uri));
-    TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
     if (file_size > 0) {
       TF_RETURN_IF_ERROR(request->AddHeader(
           "Content-Range", strings::StrCat("bytes ", start_offset, "-",
                                            file_size - 1, "/", file_size)));
     }
+    TF_RETURN_IF_ERROR(request->SetTimeouts(timeouts_->connect, timeouts_->idle,
+                                            timeouts_->write));
+
     TF_RETURN_IF_ERROR(
         request->SetPutFromFile(tmp_content_filename_, start_offset));
     TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when uploading ",
@@ -545,10 +574,10 @@ class GcsWritableFile : public WritableFile {
 
   string bucket_;
   string object_;
-  AuthProvider* auth_provider_;
+  GcsFileSystem* const filesystem_;  // Not owned.
   string tmp_content_filename_;
   std::ofstream outfile_;
-  HttpRequest::Factory* http_request_factory_;
+  GcsFileSystem::TimeoutConfig* timeouts_;
   std::function<void()> file_cache_erase_;
   bool sync_needed_;  // whether there is buffered data that needs to be synced
   int64 initial_retry_delay_usec_;
@@ -633,6 +662,30 @@ GcsFileSystem::GcsFileSystem()
   if (GetEnvVar(kResolveCacheSecs, strings::safe_strto64,
                 &resolve_frequency_secs)) {
     dns_cache_.reset(new GcsDnsCache(resolve_frequency_secs));
+    VLOG(1) << "GCS DNS cache is enabled.  " << kResolveCacheSecs << " = "
+            << resolve_frequency_secs;
+  } else {
+    VLOG(1) << "GCS DNS cache is disabled, because " << kResolveCacheSecs
+            << " = 0 (or is not set)";
+  }
+  // Apply the overrides for request timeouts
+  uint32 timeout_value;
+  if (GetEnvVar(kRequestConnectionTimeout, strings::safe_strtou32,
+                &timeout_value)) {
+    timeouts_.connect = timeout_value;
+  }
+  if (GetEnvVar(kRequestIdleTimeout, strings::safe_strtou32, &timeout_value)) {
+    timeouts_.idle = timeout_value;
+  }
+  if (GetEnvVar(kMetadataRequestTimeout, strings::safe_strtou32,
+                &timeout_value)) {
+    timeouts_.metadata = timeout_value;
+  }
+  if (GetEnvVar(kReadRequestTimeout, strings::safe_strtou32, &timeout_value)) {
+    timeouts_.read = timeout_value;
+  }
+  if (GetEnvVar(kWriteRequestTimeout, strings::safe_strtou32, &timeout_value)) {
+    timeouts_.write = timeout_value;
   }
 }
 
@@ -642,7 +695,8 @@ GcsFileSystem::GcsFileSystem(
     size_t block_size, size_t max_bytes, uint64 max_staleness,
     uint64 stat_cache_max_age, size_t stat_cache_max_entries,
     uint64 matching_paths_cache_max_age,
-    size_t matching_paths_cache_max_entries, int64 initial_retry_delay_usec)
+    size_t matching_paths_cache_max_entries, int64 initial_retry_delay_usec,
+    TimeoutConfig timeouts)
     : auth_provider_(std::move(auth_provider)),
       http_request_factory_(std::move(http_request_factory)),
       file_block_cache_(
@@ -650,6 +704,7 @@ GcsFileSystem::GcsFileSystem(
       stat_cache_(new StatCache(stat_cache_max_age, stat_cache_max_entries)),
       matching_paths_cache_(new MatchingPathsCache(
           matching_paths_cache_max_age, matching_paths_cache_max_entries)),
+      timeouts_(timeouts),
       initial_retry_delay_usec_(initial_retry_delay_usec) {}
 
 Status GcsFileSystem::NewRandomAccessFile(
@@ -677,24 +732,37 @@ Status GcsFileSystem::LoadBufferFromGCS(const string& filename, size_t offset,
                                         size_t n, std::vector<char>* out) {
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseGcsPath(filename, false, &bucket, &object));
-  string auth_token;
-  TF_RETURN_IF_ERROR(AuthProvider::GetToken(auth_provider_.get(), &auth_token));
 
-  std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
-  TF_RETURN_IF_ERROR(request->Init());
+  std::unique_ptr<HttpRequest> request;
+  TF_RETURN_IF_ERROR(CreateHttpRequest(&request));
   TF_RETURN_IF_ERROR(
       request->SetUri(strings::StrCat("https://", kStorageHost, "/", bucket,
                                       "/", request->EscapeString(object))));
-  TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
   TF_RETURN_IF_ERROR(request->SetRange(offset, offset + n - 1));
   TF_RETURN_IF_ERROR(request->SetResultBuffer(out));
-
-  if (dns_cache_) {
-    TF_RETURN_IF_ERROR(dns_cache_->AnnotateRequest(request.get()));
-  }
+  TF_RETURN_IF_ERROR(
+      request->SetTimeouts(timeouts_.connect, timeouts_.idle, timeouts_.read));
 
   TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when reading gs://",
                                   bucket, "/", object);
+
+  VLOG(1) << "Successful read of gs://" << bucket << "/" << object << " @ "
+          << offset << " of size: " << out->size();
+
+  if (out->size() < block_size()) {
+    // Check stat cache to see if we encountered an interrupted read.
+    FileStatistics stat;
+    if (stat_cache_->Lookup(filename, &stat)) {
+      if (offset + out->size() < stat.length) {
+        return errors::Internal(strings::Printf(
+            "File contents are inconsistent for file: %s @ %lu.",
+            filename.c_str(), offset));
+      }
+      VLOG(2) << "Successful integrity check for: gs://" << bucket << "/"
+              << object << " @ " << offset;
+    }
+  }
+
   return Status::OK();
 }
 
@@ -703,7 +771,7 @@ Status GcsFileSystem::NewWritableFile(const string& fname,
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseGcsPath(fname, false, &bucket, &object));
   result->reset(new GcsWritableFile(
-      bucket, object, auth_provider_.get(), http_request_factory_.get(),
+      bucket, object, this, &timeouts_,
       [this, fname]() { file_block_cache_->RemoveFile(fname); },
       initial_retry_delay_usec_));
   return Status::OK();
@@ -744,8 +812,7 @@ Status GcsFileSystem::NewAppendableFile(const string& fname,
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseGcsPath(fname, false, &bucket, &object));
   result->reset(new GcsWritableFile(
-      bucket, object, auth_provider_.get(), old_content_filename,
-      http_request_factory_.get(),
+      bucket, object, this, old_content_filename, &timeouts_,
       [this, fname]() { file_block_cache_->RemoveFile(fname); },
       initial_retry_delay_usec_));
   return Status::OK();
@@ -814,67 +881,68 @@ Status GcsFileSystem::StatForObject(const string& fname, const string& bucket,
   if (!stat) {
     return errors::Internal("'stat' cannot be nullptr.");
   }
-  if (stat_cache_->Lookup(fname, stat)) {
-    if (stat->is_directory) {
-      return errors::NotFound(fname, " is a directory.");
-    } else {
-      return Status::OK();
-    }
-  }
   if (object.empty()) {
-    return errors::InvalidArgument("'object' must be a non-empty string.");
-  }
-
-  string auth_token;
-  TF_RETURN_IF_ERROR(AuthProvider::GetToken(auth_provider_.get(), &auth_token));
-
-  std::vector<char> output_buffer;
-  std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
-  TF_RETURN_IF_ERROR(request->Init());
-  TF_RETURN_IF_ERROR(request->SetUri(strings::StrCat(
-      kGcsUriBase, "b/", bucket, "/o/", request->EscapeString(object),
-      "?fields=size%2Cupdated")));
-  TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
-  TF_RETURN_IF_ERROR(request->SetResultBuffer(&output_buffer));
-
-  if (dns_cache_) {
-    TF_RETURN_IF_ERROR(dns_cache_->AnnotateRequest(request.get()));
+    return errors::InvalidArgument(strings::Printf(
+        "'object' must be a non-empty string. (File: %s)", fname.c_str()));
+  }
+
+  StatCache::ComputeFunc compute_func =
+      [this, &bucket, &object](const string& fname, FileStatistics* stat) {
+        std::vector<char> output_buffer;
+        std::unique_ptr<HttpRequest> request;
+        TF_RETURN_IF_ERROR(CreateHttpRequest(&request));
+        TF_RETURN_IF_ERROR(request->SetUri(strings::StrCat(
+            kGcsUriBase, "b/", bucket, "/o/", request->EscapeString(object),
+            "?fields=size%2Cupdated")));
+        TF_RETURN_IF_ERROR(request->SetResultBuffer(&output_buffer));
+        TF_RETURN_IF_ERROR(request->SetTimeouts(
+            timeouts_.connect, timeouts_.idle, timeouts_.metadata));
+
+        TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(),
+                                        " when reading metadata of gs://",
+                                        bucket, "/", object);
+
+        StringPiece response_piece =
+            StringPiece(output_buffer.data(), output_buffer.size());
+        Json::Value root;
+        TF_RETURN_IF_ERROR(ParseJson(response_piece, &root));
+
+        // Parse file size.
+        TF_RETURN_IF_ERROR(GetInt64Value(root, "size", &(stat->length)));
+
+        // Parse file modification time.
+        string updated;
+        TF_RETURN_IF_ERROR(GetStringValue(root, "updated", &updated));
+        TF_RETURN_IF_ERROR(ParseRfc3339Time(updated, &(stat->mtime_nsec)));
+
+        VLOG(1) << "Stat of: gs://" << bucket << "/" << object << " -- "
+                << " length: " << stat->length
+                << "; mtime_nsec: " << stat->mtime_nsec
+                << "; updated: " << updated;
+
+        stat->is_directory = false;
+        return Status::OK();
+      };
+
+  TF_RETURN_IF_ERROR(stat_cache_->LookupOrCompute(fname, stat, compute_func));
+  if (stat->is_directory) {
+    return errors::NotFound(fname, " is a directory.");
+  } else {
+    return Status::OK();
   }
-
-  TF_RETURN_WITH_CONTEXT_IF_ERROR(
-      request->Send(), " when reading metadata of gs://", bucket, "/", object);
-
-  StringPiece response_piece =
-      StringPiece(output_buffer.data(), output_buffer.size());
-  Json::Value root;
-  TF_RETURN_IF_ERROR(ParseJson(response_piece, &root));
-
-  // Parse file size.
-  TF_RETURN_IF_ERROR(GetInt64Value(root, "size", &(stat->length)));
-
-  // Parse file modification time.
-  string updated;
-  TF_RETURN_IF_ERROR(GetStringValue(root, "updated", &updated));
-  TF_RETURN_IF_ERROR(ParseRfc3339Time(updated, &(stat->mtime_nsec)));
-
-  stat->is_directory = false;
-  stat_cache_->Insert(fname, *stat);
-
-  return Status::OK();
 }
 
 Status GcsFileSystem::BucketExists(const string& bucket, bool* result) {
   if (!result) {
     return errors::Internal("'result' cannot be nullptr.");
   }
-  string auth_token;
-  TF_RETURN_IF_ERROR(AuthProvider::GetToken(auth_provider_.get(), &auth_token));
 
-  std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
-  TF_RETURN_IF_ERROR(request->Init());
+  std::unique_ptr<HttpRequest> request;
+  TF_RETURN_IF_ERROR(CreateHttpRequest(&request));
   TF_RETURN_IF_ERROR(
       request->SetUri(strings::StrCat(kGcsUriBase, "b/", bucket)));
-  TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
+  TF_RETURN_IF_ERROR(request->SetTimeouts(timeouts_.connect, timeouts_.idle,
+                                          timeouts_.metadata));
   const Status status = request->Send();
   switch (status.code()) {
     case errors::Code::OK:
@@ -892,19 +960,30 @@ Status GcsFileSystem::FolderExists(const string& dirname, bool* result) {
   if (!result) {
     return errors::Internal("'result' cannot be nullptr.");
   }
+  StatCache::ComputeFunc compute_func = [this](const string& dirname,
+                                               FileStatistics* stat) {
+    std::vector<string> children;
+    TF_RETURN_IF_ERROR(
+        GetChildrenBounded(dirname, 1, &children, true /* recursively */,
+                           true /* include_self_directory_marker */));
+    if (!children.empty()) {
+      *stat = DIRECTORY_STAT;
+      return Status::OK();
+    } else {
+      return errors::InvalidArgument("Not a directory!");
+    }
+  };
   FileStatistics stat;
-  if (stat_cache_->Lookup(dirname, &stat)) {
+  Status s = stat_cache_->LookupOrCompute(dirname, &stat, compute_func);
+  if (s.ok()) {
     *result = stat.is_directory;
     return Status::OK();
   }
-  std::vector<string> children;
-  TF_RETURN_IF_ERROR(
-      GetChildrenBounded(dirname, 1, &children, true /* recursively */,
-                         true /* include_self_directory_marker */));
-  if ((*result = !children.empty())) {
-    stat_cache_->Insert(dirname, DIRECTORY_STAT);
+  if (errors::IsInvalidArgument(s)) {
+    *result = false;
+    return Status::OK();
   }
-  return Status::OK();
+  return s;
 }
 
 Status GcsFileSystem::GetChildren(const string& dirname,
@@ -916,33 +995,35 @@ Status GcsFileSystem::GetChildren(const string& dirname,
 
 Status GcsFileSystem::GetMatchingPaths(const string& pattern,
                                        std::vector<string>* results) {
-  if (matching_paths_cache_->Lookup(pattern, results)) {
-    return Status::OK();
-  }
-  results->clear();
-  // Find the fixed prefix by looking for the first wildcard.
-  const string& fixed_prefix =
-      pattern.substr(0, pattern.find_first_of("*?[\\"));
-  const string& dir = io::Dirname(fixed_prefix).ToString();
-  if (dir.empty()) {
-    return errors::InvalidArgument("A GCS pattern doesn't have a bucket name: ",
-                                   pattern);
-  }
-  std::vector<string> all_files;
+  MatchingPathsCache::ComputeFunc compute_func =
+      [this](const string& pattern, std::vector<string>* results) {
+        results->clear();
+        // Find the fixed prefix by looking for the first wildcard.
+        const string& fixed_prefix =
+            pattern.substr(0, pattern.find_first_of("*?[\\"));
+        const string& dir = io::Dirname(fixed_prefix).ToString();
+        if (dir.empty()) {
+          return errors::InvalidArgument(
+              "A GCS pattern doesn't have a bucket name: ", pattern);
+        }
+        std::vector<string> all_files;
+        TF_RETURN_IF_ERROR(GetChildrenBounded(
+            dir, UINT64_MAX, &all_files, true /* recursively */,
+            false /* include_self_directory_marker */));
+
+        const auto& files_and_folders = AddAllSubpaths(all_files);
+
+        // Match all obtained paths to the input pattern.
+        for (const auto& path : files_and_folders) {
+          const string& full_path = io::JoinPath(dir, path);
+          if (Env::Default()->MatchPath(full_path, pattern)) {
+            results->push_back(full_path);
+          }
+        }
+        return Status::OK();
+      };
   TF_RETURN_IF_ERROR(
-      GetChildrenBounded(dir, UINT64_MAX, &all_files, true /* recursively */,
-                         false /* include_self_directory_marker */));
-
-  const auto& files_and_folders = AddAllSubpaths(all_files);
-
-  // Match all obtained paths to the input pattern.
-  for (const auto& path : files_and_folders) {
-    const string& full_path = io::JoinPath(dir, path);
-    if (Env::Default()->MatchPath(full_path, pattern)) {
-      results->push_back(full_path);
-    }
-  }
-  matching_paths_cache_->Insert(pattern, *results);
+      matching_paths_cache_->LookupOrCompute(pattern, results, compute_func));
   return Status::OK();
 }
 
@@ -961,13 +1042,9 @@ Status GcsFileSystem::GetChildrenBounded(const string& dirname,
   string nextPageToken;
   uint64 retrieved_results = 0;
   while (true) {  // A loop over multiple result pages.
-    string auth_token;
-    TF_RETURN_IF_ERROR(
-        AuthProvider::GetToken(auth_provider_.get(), &auth_token));
-
     std::vector<char> output_buffer;
-    std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
-    TF_RETURN_IF_ERROR(request->Init());
+    std::unique_ptr<HttpRequest> request;
+    TF_RETURN_IF_ERROR(CreateHttpRequest(&request));
     auto uri = strings::StrCat(kGcsUriBase, "b/", bucket, "/o");
     if (recursive) {
       uri = strings::StrCat(uri, "?fields=items%2Fname%2CnextPageToken");
@@ -991,12 +1068,9 @@ Status GcsFileSystem::GetChildrenBounded(const string& dirname,
           strings::StrCat(uri, "&maxResults=", max_results - retrieved_results);
     }
     TF_RETURN_IF_ERROR(request->SetUri(uri));
-    TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
     TF_RETURN_IF_ERROR(request->SetResultBuffer(&output_buffer));
-
-    if (dns_cache_) {
-      TF_RETURN_IF_ERROR(dns_cache_->AnnotateRequest(request.get()));
-    }
+    TF_RETURN_IF_ERROR(request->SetTimeouts(timeouts_.connect, timeouts_.idle,
+                                            timeouts_.metadata));
 
     TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when reading ", dirname);
     Json::Value root;
@@ -1109,15 +1183,14 @@ Status GcsFileSystem::DeleteFile(const string& fname) {
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseGcsPath(fname, false, &bucket, &object));
 
-  string auth_token;
-  TF_RETURN_IF_ERROR(AuthProvider::GetToken(auth_provider_.get(), &auth_token));
-
-  std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
-  TF_RETURN_IF_ERROR(request->Init());
+  std::unique_ptr<HttpRequest> request;
+  TF_RETURN_IF_ERROR(CreateHttpRequest(&request));
   TF_RETURN_IF_ERROR(request->SetUri(strings::StrCat(
       kGcsUriBase, "b/", bucket, "/o/", request->EscapeString(object))));
-  TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
+  TF_RETURN_IF_ERROR(request->SetTimeouts(timeouts_.connect, timeouts_.idle,
+                                          timeouts_.metadata));
   TF_RETURN_IF_ERROR(request->SetDeleteRequest());
+
   TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when deleting ", fname);
   file_block_cache_->RemoveFile(fname);
   return Status::OK();
@@ -1200,17 +1273,15 @@ Status GcsFileSystem::RenameObject(const string& src, const string& target) {
   TF_RETURN_IF_ERROR(
       ParseGcsPath(target, false, &target_bucket, &target_object));
 
-  string auth_token;
-  TF_RETURN_IF_ERROR(AuthProvider::GetToken(auth_provider_.get(), &auth_token));
-
-  std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
-  TF_RETURN_IF_ERROR(request->Init());
+  std::unique_ptr<HttpRequest> request;
+  TF_RETURN_IF_ERROR(CreateHttpRequest(&request));
   TF_RETURN_IF_ERROR(request->SetUri(strings::StrCat(
       kGcsUriBase, "b/", src_bucket, "/o/", request->EscapeString(src_object),
       "/rewriteTo/b/", target_bucket, "/o/",
       request->EscapeString(target_object))));
-  TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
   TF_RETURN_IF_ERROR(request->SetPostEmptyBody());
+  TF_RETURN_IF_ERROR(request->SetTimeouts(timeouts_.connect, timeouts_.idle,
+                                          timeouts_.metadata));
   std::vector<char> output_buffer;
   TF_RETURN_IF_ERROR(request->SetResultBuffer(&output_buffer));
   TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when renaming ", src,
@@ -1310,6 +1381,25 @@ Status GcsFileSystem::DeleteRecursively(const string& dirname,
   return Status::OK();
 }
 
+// Creates an HttpRequest and sets several parameters that are common to all
+// requests.  All code (in GcsFileSystem) that creates an HttpRequest should
+// go through this method, rather than directly using http_request_factory_.
+Status GcsFileSystem::CreateHttpRequest(std::unique_ptr<HttpRequest>* request) {
+  std::unique_ptr<HttpRequest> new_request{http_request_factory_->Create()};
+  TF_RETURN_IF_ERROR(new_request->Init());
+  if (dns_cache_) {
+    TF_RETURN_IF_ERROR(dns_cache_->AnnotateRequest(new_request.get()));
+  }
+
+  string auth_token;
+  TF_RETURN_IF_ERROR(AuthProvider::GetToken(auth_provider_.get(), &auth_token));
+
+  TF_RETURN_IF_ERROR(new_request->AddAuthBearerHeader(auth_token));
+
+  *request = std::move(new_request);
+  return Status::OK();
+}
+
 REGISTER_FILE_SYSTEM("gs", RetryingGcsFileSystem);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.h b/tensorflow/core/platform/cloud/gcs_file_system.h
index 4b4853c838abb2d2cc1a6cf68877a0dedcbcc15c..f4190b3f1ee24c0e3a1bf3fba84141638fd16687 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.h
+++ b/tensorflow/core/platform/cloud/gcs_file_system.h
@@ -35,6 +35,8 @@ namespace tensorflow {
 /// which adds retry logic to GCS operations.
 class GcsFileSystem : public FileSystem {
  public:
+  struct TimeoutConfig;
+
   GcsFileSystem();
   GcsFileSystem(std::unique_ptr<AuthProvider> auth_provider,
                 std::unique_ptr<HttpRequest::Factory> http_request_factory,
@@ -42,7 +44,7 @@ class GcsFileSystem : public FileSystem {
                 uint64 stat_cache_max_age, size_t stat_cache_max_entries,
                 uint64 matching_paths_cache_max_age,
                 size_t matching_paths_cache_max_entries,
-                int64 initial_retry_delay_usec);
+                int64 initial_retry_delay_usec, TimeoutConfig timeouts);
 
   Status NewRandomAccessFile(
       const string& filename,
@@ -87,6 +89,7 @@ class GcsFileSystem : public FileSystem {
   size_t block_size() const { return file_block_cache_->block_size(); }
   size_t max_bytes() const { return file_block_cache_->max_bytes(); }
   uint64 max_staleness() const { return file_block_cache_->max_staleness(); }
+  TimeoutConfig timeouts() const { return timeouts_; }
 
   uint64 stat_cache_max_age() const { return stat_cache_->max_age(); }
   size_t stat_cache_max_entries() const { return stat_cache_->max_entries(); }
@@ -98,6 +101,43 @@ class GcsFileSystem : public FileSystem {
     return matching_paths_cache_->max_entries();
   }
 
+  /// Structure containing the information for timeouts related to accessing the
+  /// GCS APIs.
+  ///
+  /// All values are in seconds.
+  struct TimeoutConfig {
+    // The request connection timeout. If a connection cannot be established
+    // within `connect` seconds, abort the request.
+    uint32 connect = 120;  // 2 minutes
+
+    // The request idle timeout. If a request has seen no activity in `idle`
+    // seconds, abort the request.
+    uint32 idle = 60;  // 1 minute
+
+    // The maximum total time a metadata request can take. If a request has not
+    // completed within `metadata` seconds, the request is aborted.
+    uint32 metadata = 3600;  // 1 hour
+
+    // The maximum total time a block read request can take. If a request has
+    // not completed within `read` seconds, the request is aborted.
+    uint32 read = 3600;  // 1 hour
+
+    // The maximum total time an upload request can take. If a request has not
+    // completed within `write` seconds, the request is aborted.
+    uint32 write = 3600;  // 1 hour
+
+    TimeoutConfig() {}
+    TimeoutConfig(uint32 connect, uint32 idle, uint32 metadata, uint32 read,
+                  uint32 write)
+        : connect(connect),
+          idle(idle),
+          metadata(metadata),
+          read(read),
+          write(write) {}
+  };
+
+  Status CreateHttpRequest(std::unique_ptr<HttpRequest>* request);
+
  private:
   /// \brief Checks if the bucket exists. Returns OK if the check succeeded.
   ///
@@ -150,6 +190,8 @@ class GcsFileSystem : public FileSystem {
   using MatchingPathsCache = ExpiringLRUCache<std::vector<string>>;
   std::unique_ptr<MatchingPathsCache> matching_paths_cache_;
 
+  TimeoutConfig timeouts_;
+
   /// The initial delay for exponential backoffs when retrying failed calls.
   const int64 initial_retry_delay_usec_ = 1000000L;
 
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index 911176365f462c6b3da88d274040c933343adaf9..cdf6c2b97ec7ae5a6cb8c051c699bb3a3ebe9164 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -22,6 +22,8 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+static GcsFileSystem::TimeoutConfig kTestTimeoutConfig(5, 1, 10, 20, 30);
+
 class FakeAuthProvider : public AuthProvider {
  public:
   Status GetToken(string* token) override {
@@ -35,12 +37,14 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_NoBlockCache) {
       {new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 0-5\n",
+           "Range: 0-5\n"
+           "Timeouts: 5 1 20\n",
            "012345"),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 6-11\n",
+           "Range: 6-11\n"
+           "Timeouts: 5 1 20\n",
            "6789")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -49,7 +53,7 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_NoBlockCache) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig);
 
   std::unique_ptr<RandomAccessFile> file;
   TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
@@ -73,12 +77,14 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_NoBlockCache_differentN) {
       {new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 0-2\n",
+           "Range: 0-2\n"
+           "Timeouts: 5 1 20\n",
            "012"),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 3-12\n",
+           "Range: 3-12\n"
+           "Timeouts: 5 1 20\n",
            "3456789")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -87,7 +93,7 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_NoBlockCache_differentN) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig);
 
   std::unique_ptr<RandomAccessFile> file;
   TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
@@ -116,31 +122,30 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache) {
       {new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 0-8\n",
+           "Range: 0-8\n"
+           "Timeouts: 5 1 20\n",
            "012345678"),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 9-17\n",
+           "Range: 9-17\n"
+           "Timeouts: 5 1 20\n",
            "9abcde"),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 18-26\n",
-           ""),
-       new FakeHttpRequest(
-           "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
-           "Auth Token: fake_token\n"
-           "Range: 0-8\n",
-           "012345678")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      9 /* block size */, 18 /* max bytes */, 0 /* max staleness */,
-      0 /* stat cache max age */, 0 /* stat cache max entries */,
-      0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */);
+           "Range: 18-26\n"
+           "Timeouts: 5 1 20\n",
+           "")});
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   9 /* block size */, 18 /* max bytes */,
+                   0 /* max staleness */, 0 /* stat cache max age */,
+                   0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */,
+                   0 /* initial retry delay */, kTestTimeoutConfig);
 
   char scratch[100];
   StringPiece result;
@@ -182,8 +187,8 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache) {
               file->Read(20, 10, &result, scratch).code());
     EXPECT_TRUE(result.empty());
 
-    // The beginning of the file has been evicted from the LRU cache.  This will
-    // result in another request. The buffer size is still 15.
+    // The beginning of the file should still be in the LRU cache. There should
+    // not be another request. The buffer size is still 15.
     TF_EXPECT_OK(file->Read(0, 4, &result, scratch));
   }
 
@@ -196,20 +201,23 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache_MaxStaleness) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest("Uri: https://storage.googleapis.com/bucket/object\n"
                            "Auth Token: fake_token\n"
-                           "Range: 0-7\n",
+                           "Range: 0-7\n"
+                           "Timeouts: 5 1 20\n",
                            "01234567"),
        new FakeHttpRequest("Uri: https://storage.googleapis.com/bucket/object\n"
                            "Auth Token: fake_token\n"
-                           "Range: 8-15\n",
+                           "Range: 8-15\n"
+                           "Timeouts: 5 1 20\n",
                            "89abcdef")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      8 /* block size */, 16 /* max bytes */, 3600 /* max staleness */,
-      0 /* stat cache max age */, 0 /* stat cache max entries */,
-      0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   8 /* block size */, 16 /* max bytes */,
+                   3600 /* max staleness */, 0 /* stat cache max age */,
+                   0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */,
+                   0 /* initial retry delay */, kTestTimeoutConfig);
   char scratch[100];
   StringPiece result;
   // There should only be two HTTP requests issued to GCS even though we iterate
@@ -243,14 +251,15 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache_MaxStaleness) {
 
 TEST(GcsFileSystemTest, NewRandomAccessFile_NoObjectName) {
   std::vector<HttpRequest*> requests;
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      0 /* read ahead bytes */, 0 /* max bytes */, 0 /* max staleness */,
-      0 /* stat cache max age */, 0 /* stat cache max entries */,
-      0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   0 /* read ahead bytes */, 0 /* max bytes */,
+                   0 /* max staleness */, 0 /* stat cache max age */,
+                   0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */,
+                   0 /* initial retry delay */, kTestTimeoutConfig);
 
   std::unique_ptr<RandomAccessFile> file;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -262,24 +271,28 @@ TEST(GcsFileSystemTest, NewWritableFile) {
       {new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fwriteable\n"
            "Auth Token: fake_token\n"
-           "Range: 0-7\n",
+           "Range: 0-7\n"
+           "Timeouts: 5 1 20\n",
            "01234567"),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/upload/storage/v1/b/bucket/o?"
            "uploadType=resumable&name=path%2Fwriteable\n"
            "Auth Token: fake_token\n"
            "Header X-Upload-Content-Length: 17\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "", {{"Location", "https://custom/upload/location"}}),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
                            "Header Content-Range: bytes 0-16/17\n"
+                           "Timeouts: 5 1 30\n"
                            "Put body: content1,content2\n",
                            ""),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fwriteable\n"
            "Auth Token: fake_token\n"
-           "Range: 0-7\n",
+           "Range: 0-7\n"
+           "Timeouts: 5 1 20\n",
            "01234567")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -288,7 +301,7 @@ TEST(GcsFileSystemTest, NewWritableFile) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig);
 
   // Read from the file first, to fill the block cache.
   std::unique_ptr<RandomAccessFile> rfile;
@@ -320,15 +333,18 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadSucceeds) {
            "uploadType=resumable&name=path%2Fwriteable.txt\n"
            "Auth Token: fake_token\n"
            "Header X-Upload-Content-Length: 17\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "", {{"Location", "https://custom/upload/location"}}),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
                            "Header Content-Range: bytes 0-16/17\n"
+                           "Timeouts: 5 1 30\n"
                            "Put body: content1,content2\n",
                            "", errors::Unavailable("503"), 503),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Header Content-Range: bytes */17\n"
                            "Put: yes\n",
                            "", errors::FailedPrecondition("308"), nullptr,
@@ -336,10 +352,12 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadSucceeds) {
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
                            "Header Content-Range: bytes 11-16/17\n"
+                           "Timeouts: 5 1 30\n"
                            "Put body: ntent2\n",
                            "", errors::Unavailable("503"), 503),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Header Content-Range: bytes */17\n"
                            "Put: yes\n",
                            "", errors::FailedPrecondition("308"), nullptr,
@@ -347,6 +365,7 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadSucceeds) {
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
                            "Header Content-Range: bytes 13-16/17\n"
+                           "Timeouts: 5 1 30\n"
                            "Put body: ent2\n",
                            "")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -356,7 +375,7 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadSucceeds) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig);
 
   std::unique_ptr<WritableFile> file;
   TF_EXPECT_OK(fs.NewWritableFile("gs://bucket/path/writeable.txt", &file));
@@ -374,38 +393,44 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadSucceedsOnGetStatus) {
       {new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fwriteable\n"
            "Auth Token: fake_token\n"
-           "Range: 0-7\n",
+           "Range: 0-7\n"
+           "Timeouts: 5 1 20\n",
            "01234567"),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/upload/storage/v1/b/bucket/o?"
            "uploadType=resumable&name=path%2Fwriteable\n"
            "Auth Token: fake_token\n"
            "Header X-Upload-Content-Length: 17\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "", {{"Location", "https://custom/upload/location"}}),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
                            "Header Content-Range: bytes 0-16/17\n"
+                           "Timeouts: 5 1 30\n"
                            "Put body: content1,content2\n",
                            "", errors::Unavailable("503"), 503),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Header Content-Range: bytes */17\n"
                            "Put: yes\n",
                            "", Status::OK(), nullptr, {}, 201),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fwriteable\n"
            "Auth Token: fake_token\n"
-           "Range: 0-7\n",
+           "Range: 0-7\n"
+           "Timeouts: 5 1 20\n",
            "01234567")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      8 /* block size */, 8 /* max bytes */, 3600 /* max staleness */,
-      0 /* stat cache max age */, 0 /* stat cache max entries */,
-      0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   8 /* block size */, 8 /* max bytes */,
+                   3600 /* max staleness */, 0 /* stat cache max age */,
+                   0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */,
+                   0 /* initial retry delay */, kTestTimeoutConfig);
   // Pull the file's first block into the cache. This will trigger the first
   // HTTP request to GCS.
   std::unique_ptr<RandomAccessFile> rfile;
@@ -439,17 +464,20 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadAllAttemptsFail) {
            "uploadType=resumable&name=path%2Fwriteable.txt\n"
            "Auth Token: fake_token\n"
            "Header X-Upload-Content-Length: 17\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "", {{"Location", "https://custom/upload/location"}}),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
                            "Header Content-Range: bytes 0-16/17\n"
+                           "Timeouts: 5 1 30\n"
                            "Put body: content1,content2\n",
                            "", errors::Unavailable("503"), 503)});
   for (int i = 0; i < 10; i++) {
     requests.emplace_back(new FakeHttpRequest(
         "Uri: https://custom/upload/location\n"
         "Auth Token: fake_token\n"
+        "Timeouts: 5 1 10\n"
         "Header Content-Range: bytes */17\n"
         "Put: yes\n",
         "", errors::FailedPrecondition("important HTTP error 308"), nullptr,
@@ -458,6 +486,7 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadAllAttemptsFail) {
         "Uri: https://custom/upload/location\n"
         "Auth Token: fake_token\n"
         "Header Content-Range: bytes 11-16/17\n"
+        "Timeouts: 5 1 30\n"
         "Put body: ntent2\n",
         "", errors::Unavailable("important HTTP error 503"), 503));
   }
@@ -468,12 +497,14 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadAllAttemptsFail) {
       "uploadType=resumable&name=path%2Fwriteable.txt\n"
       "Auth Token: fake_token\n"
       "Header X-Upload-Content-Length: 17\n"
-      "Post: yes\n",
+      "Post: yes\n"
+      "Timeouts: 5 1 10\n",
       "", {{"Location", "https://custom/upload/location"}}));
   requests.emplace_back(
       new FakeHttpRequest("Uri: https://custom/upload/location\n"
                           "Auth Token: fake_token\n"
                           "Header Content-Range: bytes 0-16/17\n"
+                          "Timeouts: 5 1 30\n"
                           "Put body: content1,content2\n",
                           ""));
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -483,7 +514,7 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadAllAttemptsFail) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   2 /* initial retry delay */);
+                   2 /* initial retry delay */, kTestTimeoutConfig);
 
   std::unique_ptr<WritableFile> file;
   TF_EXPECT_OK(fs.NewWritableFile("gs://bucket/path/writeable.txt", &file));
@@ -505,11 +536,13 @@ TEST(GcsFileSystemTest, NewWritableFile_UploadReturns410) {
            "uploadType=resumable&name=path%2Fwriteable.txt\n"
            "Auth Token: fake_token\n"
            "Header X-Upload-Content-Length: 17\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "", {{"Location", "https://custom/upload/location"}}),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
                            "Header Content-Range: bytes 0-16/17\n"
+                           "Timeouts: 5 1 30\n"
                            "Put body: content1,content2\n",
                            "", errors::NotFound("important HTTP error 410"),
                            410),
@@ -520,11 +553,13 @@ TEST(GcsFileSystemTest, NewWritableFile_UploadReturns410) {
            "uploadType=resumable&name=path%2Fwriteable.txt\n"
            "Auth Token: fake_token\n"
            "Header X-Upload-Content-Length: 17\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "", {{"Location", "https://custom/upload/location"}}),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
                            "Header Content-Range: bytes 0-16/17\n"
+                           "Timeouts: 5 1 30\n"
                            "Put body: content1,content2\n",
                            "")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -534,7 +569,7 @@ TEST(GcsFileSystemTest, NewWritableFile_UploadReturns410) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig);
 
   std::unique_ptr<WritableFile> file;
   TF_EXPECT_OK(fs.NewWritableFile("gs://bucket/path/writeable.txt", &file));
@@ -563,7 +598,7 @@ TEST(GcsFileSystemTest, NewWritableFile_NoObjectName) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig);
 
   std::unique_ptr<WritableFile> file;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -575,33 +610,38 @@ TEST(GcsFileSystemTest, NewAppendableFile) {
       {new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fappendable\n"
            "Auth Token: fake_token\n"
-           "Range: 0-31\n",
+           "Range: 0-31\n"
+           "Timeouts: 5 1 20\n",
            "content1,"),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/upload/storage/v1/b/bucket/o?"
            "uploadType=resumable&name=path%2Fappendable\n"
            "Auth Token: fake_token\n"
            "Header X-Upload-Content-Length: 17\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "", {{"Location", "https://custom/upload/location"}}),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
                            "Header Content-Range: bytes 0-16/17\n"
+                           "Timeouts: 5 1 30\n"
                            "Put body: content1,content2\n",
                            ""),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fappendable\n"
            "Auth Token: fake_token\n"
-           "Range: 0-31\n",
+           "Range: 0-31\n"
+           "Timeouts: 5 1 20\n",
            "01234567")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      32 /* block size */, 32 /* max bytes */, 0 /* max staleness */,
-      0 /* stat cache max age */, 0 /* stat cache max entries */,
-      0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   32 /* block size */, 32 /* max bytes */,
+                   0 /* max staleness */, 0 /* stat cache max age */,
+                   0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */,
+                   0 /* initial retry delay */, kTestTimeoutConfig);
 
   // Create an appendable file. This should read the file from GCS, and pull its
   // contents into the block cache.
@@ -634,7 +674,7 @@ TEST(GcsFileSystemTest, NewAppendableFile_NoObjectName) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig);
 
   std::unique_ptr<WritableFile> file;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -647,7 +687,8 @@ TEST(GcsFileSystemTest, NewReadOnlyMemoryRegionFromFile) {
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Frandom_access.txt?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            strings::StrCat("{\"size\": \"", content.size(),
                            "\", \"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        new FakeHttpRequest(
@@ -655,7 +696,7 @@ TEST(GcsFileSystemTest, NewReadOnlyMemoryRegionFromFile) {
                            "path%2Frandom_access.txt\n"
                            "Auth Token: fake_token\n"
                            "Range: 0-",
-                           content.size() - 1, "\n"),
+                           content.size() - 1, "\n", "Timeouts: 5 1 20\n"),
            content)});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -664,7 +705,7 @@ TEST(GcsFileSystemTest, NewReadOnlyMemoryRegionFromFile) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig);
 
   std::unique_ptr<ReadOnlyMemoryRegion> region;
   TF_EXPECT_OK(fs.NewReadOnlyMemoryRegionFromFile(
@@ -683,7 +724,7 @@ TEST(GcsFileSystemTest, NewReadOnlyMemoryRegionFromFile_NoObjectName) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig);
 
   std::unique_ptr<ReadOnlyMemoryRegion> region;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -694,7 +735,8 @@ TEST(GcsFileSystemTest, FileExists_YesAsObject) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
       "path%2Ffile1.txt?fields=size%2Cupdated\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       strings::StrCat("{\"size\": \"1010\","
                       "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -704,7 +746,7 @@ TEST(GcsFileSystemTest, FileExists_YesAsObject) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig);
 
   TF_EXPECT_OK(fs.FileExists("gs://bucket/path/file1.txt"));
 }
@@ -714,13 +756,15 @@ TEST(GcsFileSystemTest, FileExists_YesAsFolder) {
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Fsubfolder?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2Fsubfolder%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path/subfolder/\" }]}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -730,7 +774,7 @@ TEST(GcsFileSystemTest, FileExists_YesAsFolder) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig);
 
   TF_EXPECT_OK(fs.FileExists("gs://bucket/path/subfolder"));
 }
@@ -739,11 +783,13 @@ TEST(GcsFileSystemTest, FileExists_YesAsBucket) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"size\": \"100\"}"),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"size\": \"100\"}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -752,7 +798,7 @@ TEST(GcsFileSystemTest, FileExists_YesAsBucket) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig);
 
   TF_EXPECT_OK(fs.FileExists("gs://bucket1"));
   TF_EXPECT_OK(fs.FileExists("gs://bucket1/"));
@@ -763,13 +809,15 @@ TEST(GcsFileSystemTest, FileExists_NotAsObjectOrFolder) {
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Ffile1.txt?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2Ffile1.txt%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": []}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -778,7 +826,7 @@ TEST(GcsFileSystemTest, FileExists_NotAsObjectOrFolder) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig);
 
   EXPECT_EQ(errors::Code::NOT_FOUND,
             fs.FileExists("gs://bucket/path/file1.txt").code());
@@ -788,11 +836,13 @@ TEST(GcsFileSystemTest, FileExists_NotAsBucket) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket2\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket2\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404)});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -801,7 +851,7 @@ TEST(GcsFileSystemTest, FileExists_NotAsBucket) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig);
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
             fs.FileExists("gs://bucket2/").code());
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -813,29 +863,33 @@ TEST(GcsFileSystemTest, FileExists_StatCache) {
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Ffile1.txt?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            strings::StrCat("{\"size\": \"1010\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Fsubfolder?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2Fsubfolder%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path/subfolder/\" }]}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-      3600 /* stat cache max age */, 0 /* stat cache max entries */,
-      0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   3600 /* stat cache max age */,
+                   0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */,
+                   0 /* initial retry delay */, kTestTimeoutConfig);
 
   // The stat cache will ensure that repeated lookups don't trigger additional
   // HTTP requests.
@@ -850,7 +904,8 @@ TEST(GcsFileSystemTest, GetChildren_NoItems) {
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
       "fields=items%2Fname%2Cprefixes%2CnextPageToken&delimiter=%2F&prefix="
       "path%2F\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{\"prefixes\": [\"path/subpath/\"]}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -859,7 +914,7 @@ TEST(GcsFileSystemTest, GetChildren_NoItems) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path/", &children));
@@ -872,7 +927,8 @@ TEST(GcsFileSystemTest, GetChildren_ThreeFiles) {
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
       "fields=items%2Fname%2Cprefixes%2CnextPageToken&delimiter=%2F&prefix="
       "path%2F\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{\"items\": [ "
       "  { \"name\": \"path/file1.txt\" },"
       "  { \"name\": \"path/file3.txt\" }],"
@@ -884,7 +940,7 @@ TEST(GcsFileSystemTest, GetChildren_ThreeFiles) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path/", &children));
@@ -898,7 +954,8 @@ TEST(GcsFileSystemTest, GetChildren_SelfDirectoryMarker) {
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
       "fields=items%2Fname%2Cprefixes%2CnextPageToken&delimiter=%2F&prefix="
       "path%2F\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{\"items\": [ "
       "  { \"name\": \"path/\" },"
       "  { \"name\": \"path/file3.txt\" }],"
@@ -910,7 +967,7 @@ TEST(GcsFileSystemTest, GetChildren_SelfDirectoryMarker) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path/", &children));
@@ -923,7 +980,8 @@ TEST(GcsFileSystemTest, GetChildren_ThreeFiles_NoSlash) {
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
       "fields=items%2Fname%2Cprefixes%2CnextPageToken&delimiter=%2F&prefix="
       "path%2F\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{\"items\": [ "
       "  { \"name\": \"path/file1.txt\" },"
       "  { \"name\": \"path/file3.txt\" }],"
@@ -935,7 +993,7 @@ TEST(GcsFileSystemTest, GetChildren_ThreeFiles_NoSlash) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path", &children));
@@ -948,7 +1006,8 @@ TEST(GcsFileSystemTest, GetChildren_Root) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket-a-b-c/o?"
       "fields=items%2Fname%2Cprefixes%2CnextPageToken&delimiter=%2F\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -957,7 +1016,7 @@ TEST(GcsFileSystemTest, GetChildren_Root) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket-a-b-c", &children));
@@ -970,7 +1029,8 @@ TEST(GcsFileSystemTest, GetChildren_Empty) {
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
       "fields=items%2Fname%2Cprefixes%2CnextPageToken&delimiter=%2F&prefix="
       "path%2F\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -979,7 +1039,7 @@ TEST(GcsFileSystemTest, GetChildren_Empty) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path/", &children));
@@ -993,7 +1053,8 @@ TEST(GcsFileSystemTest, GetChildren_Pagination) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2Cprefixes%2CnextPageToken&delimiter=%2F&"
            "prefix=path%2F\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"nextPageToken\": \"ABCD==\", "
            "\"items\": [ "
            "  { \"name\": \"path/file1.txt\" },"
@@ -1004,7 +1065,8 @@ TEST(GcsFileSystemTest, GetChildren_Pagination) {
            "fields=items%2Fname%2Cprefixes%2CnextPageToken&delimiter=%2F&"
            "prefix=path%2F"
            "&pageToken=ABCD==\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path/file4.txt\" },"
            "  { \"name\": \"path/file5.txt\" }]}")});
@@ -1016,7 +1078,7 @@ TEST(GcsFileSystemTest, GetChildren_Pagination) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path", &children));
@@ -1030,7 +1092,8 @@ TEST(GcsFileSystemTest, GetMatchingPaths_NoWildcard) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
       "fields=items%2Fname%2CnextPageToken&prefix=path%2Fsubpath%2F\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{\"items\": [ "
       "  { \"name\": \"path/subpath/file2.txt\" }]}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -1040,7 +1103,7 @@ TEST(GcsFileSystemTest, GetMatchingPaths_NoWildcard) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   std::vector<string> result;
   TF_EXPECT_OK(
@@ -1053,7 +1116,8 @@ TEST(GcsFileSystemTest, GetMatchingPaths_BucketAndWildcard) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
       "fields=items%2Fname%2CnextPageToken\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{\"items\": [ "
       "  { \"name\": \"path/file1.txt\" },"
       "  { \"name\": \"path/subpath/file2.txt\" },"
@@ -1065,7 +1129,7 @@ TEST(GcsFileSystemTest, GetMatchingPaths_BucketAndWildcard) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetMatchingPaths("gs://bucket/*/*", &result));
@@ -1079,7 +1143,8 @@ TEST(GcsFileSystemTest, GetMatchingPaths_FolderAndWildcard_Matches) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
       "fields=items%2Fname%2CnextPageToken&prefix=path%2F\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{\"items\": [ "
       "  { \"name\": \"path/file1.txt\" },"
       "  { \"name\": \"path/subpath/file2.txt\" },"
@@ -1091,7 +1156,7 @@ TEST(GcsFileSystemTest, GetMatchingPaths_FolderAndWildcard_Matches) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetMatchingPaths("gs://bucket/path/*/file2.txt", &result));
@@ -1103,7 +1168,8 @@ TEST(GcsFileSystemTest, GetMatchingPaths_SelfDirectoryMarker) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
       "fields=items%2Fname%2CnextPageToken&prefix=path%2F\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{\"items\": [ "
       "  { \"name\": \"path/\" },"
       "  { \"name\": \"path/file3.txt\" }]}")});
@@ -1114,7 +1180,7 @@ TEST(GcsFileSystemTest, GetMatchingPaths_SelfDirectoryMarker) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetMatchingPaths("gs://bucket/path/*", &result));
@@ -1125,7 +1191,8 @@ TEST(GcsFileSystemTest, GetMatchingPaths_FolderAndWildcard_NoMatches) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
       "fields=items%2Fname%2CnextPageToken&prefix=path%2F\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{\"items\": [ "
       "  { \"name\": \"path/file1.txt\" },"
       "  { \"name\": \"path/subpath/file2.txt\" },"
@@ -1137,7 +1204,7 @@ TEST(GcsFileSystemTest, GetMatchingPaths_FolderAndWildcard_NoMatches) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetMatchingPaths("gs://bucket/path/*/file3.txt", &result));
@@ -1153,7 +1220,7 @@ TEST(GcsFileSystemTest, GetMatchingPaths_OnlyWildcard) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   std::vector<string> result;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -1165,13 +1232,15 @@ TEST(GcsFileSystemTest, GetMatchingPaths_Cache) {
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2Fsubpath%2F\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path/subpath/file2.txt\" }]}"),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path/file1.txt\" },"
            "  { \"name\": \"path/subpath/file2.txt\" },"
@@ -1183,7 +1252,7 @@ TEST(GcsFileSystemTest, GetMatchingPaths_Cache) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    3600 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   // Repeated calls to fs.GetMatchingPaths on these patterns should not lead to
   // any additional HTTP requests to GCS.
@@ -1206,26 +1275,30 @@ TEST(GcsFileSystemTest, DeleteFile) {
       {new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Ffile1.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 0-15\n",
+           "Range: 0-15\n"
+           "Timeouts: 5 1 20\n",
            "01234567"),
        new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
                            "/bucket/o/path%2Ffile1.txt\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            ""),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Ffile1.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 0-15\n",
+           "Range: 0-15\n"
+           "Timeouts: 5 1 20\n",
            "76543210")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      16 /* block size */, 16 /* max bytes */, 0 /* max staleness */,
-      0 /* stat cache max age */, 0 /* stat cache max entries */,
-      0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   16 /* block size */, 16 /* max bytes */,
+                   0 /* max staleness */, 0 /* stat cache max age */,
+                   0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */,
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   // Do an initial read of the file to load its contents into the block cache.
   char scratch[100];
@@ -1251,7 +1324,7 @@ TEST(GcsFileSystemTest, DeleteFile_NoObjectName) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
             fs.DeleteFile("gs://bucket/").code());
@@ -1261,7 +1334,8 @@ TEST(GcsFileSystemTest, DeleteDir_Empty) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
       "fields=items%2Fname%2CnextPageToken&prefix=path%2F&maxResults=2\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -1270,7 +1344,7 @@ TEST(GcsFileSystemTest, DeleteDir_Empty) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   TF_EXPECT_OK(fs.DeleteDir("gs://bucket/path/"));
 }
@@ -1280,12 +1354,14 @@ TEST(GcsFileSystemTest, DeleteDir_OnlyDirMarkerLeft) {
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2F&maxResults=2\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path/\" }]}"),
        new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
                            "/bucket/o/path%2F\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            "")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -1295,7 +1371,7 @@ TEST(GcsFileSystemTest, DeleteDir_OnlyDirMarkerLeft) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   TF_EXPECT_OK(fs.DeleteDir("gs://bucket/path/"));
 }
@@ -1303,7 +1379,8 @@ TEST(GcsFileSystemTest, DeleteDir_OnlyDirMarkerLeft) {
 TEST(GcsFileSystemTest, DeleteDir_BucketOnly) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?fields=items%2F"
-      "name%2CnextPageToken&maxResults=2\nAuth Token: fake_token\n",
+      "name%2CnextPageToken&maxResults=2\nAuth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -1312,7 +1389,7 @@ TEST(GcsFileSystemTest, DeleteDir_BucketOnly) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   TF_EXPECT_OK(fs.DeleteDir("gs://bucket"));
 }
@@ -1321,7 +1398,8 @@ TEST(GcsFileSystemTest, DeleteDir_NonEmpty) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
       "fields=items%2Fname%2CnextPageToken&prefix=path%2F&maxResults=2\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{\"items\": [ "
       "  { \"name\": \"path/file1.txt\" }]}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -1331,7 +1409,7 @@ TEST(GcsFileSystemTest, DeleteDir_NonEmpty) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   EXPECT_EQ(error::Code::FAILED_PRECONDITION,
             fs.DeleteDir("gs://bucket/path/").code());
@@ -1341,7 +1419,8 @@ TEST(GcsFileSystemTest, GetFileSize) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
       "file.txt?fields=size%2Cupdated\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       strings::StrCat("{\"size\": \"1010\","
                       "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -1351,7 +1430,7 @@ TEST(GcsFileSystemTest, GetFileSize) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   uint64 size;
   TF_EXPECT_OK(fs.GetFileSize("gs://bucket/file.txt", &size));
@@ -1367,7 +1446,7 @@ TEST(GcsFileSystemTest, GetFileSize_NoObjectName) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   uint64 size;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -1381,14 +1460,16 @@ TEST(GcsFileSystemTest, RenameFile_Folder) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path1%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path1/subfolder/file1.txt\" }]}"),
        // Requesting the full list of files in the folder.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path1%2F\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path1/\" },"  // A directory marker.
            "  { \"name\": \"path1/subfolder/file1.txt\" },"
@@ -1398,13 +1479,15 @@ TEST(GcsFileSystemTest, RenameFile_Folder) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path1%2F/rewriteTo/b/bucket/o/path2%2F\n"
            "Auth Token: fake_token\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "{\"done\": true}"),
        // Deleting the original directory marker.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path1%2F\n"
            "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n"
            "Delete: yes\n",
            ""),
        // Copying the first file.
@@ -1413,13 +1496,15 @@ TEST(GcsFileSystemTest, RenameFile_Folder) {
            "path1%2Fsubfolder%2Ffile1.txt/rewriteTo/b/bucket/o/"
            "path2%2Fsubfolder%2Ffile1.txt\n"
            "Auth Token: fake_token\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "{\"done\": true}"),
        // Deleting the first original file.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path1%2Fsubfolder%2Ffile1.txt\n"
            "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n"
            "Delete: yes\n",
            ""),
        // Copying the second file.
@@ -1427,13 +1512,15 @@ TEST(GcsFileSystemTest, RenameFile_Folder) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path1%2Ffile2.txt/rewriteTo/b/bucket/o/path2%2Ffile2.txt\n"
            "Auth Token: fake_token\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "{\"done\": true}"),
        // Deleting the second original file.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path1%2Ffile2.txt\n"
            "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n"
            "Delete: yes\n",
            "")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -1443,7 +1530,7 @@ TEST(GcsFileSystemTest, RenameFile_Folder) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   TF_EXPECT_OK(fs.RenameFile("gs://bucket/path1", "gs://bucket/path2/"));
 }
@@ -1453,25 +1540,29 @@ TEST(GcsFileSystemTest, RenameFile_Object) {
       {new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fsrc.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 0-15\n",
+           "Range: 0-15\n"
+           "Timeouts: 5 1 20\n",
            "01234567"),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fdst.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 0-15\n",
+           "Range: 0-15\n"
+           "Timeouts: 5 1 20\n",
            "76543210"),
        // IsDirectory is checking whether there are children objects.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2Fsrc.txt%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{}"),
        // IsDirectory is checking if the path exists as an object.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Fsrc.txt?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            strings::StrCat("{\"size\": \"1010\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        // Copying to the new location.
@@ -1479,33 +1570,38 @@ TEST(GcsFileSystemTest, RenameFile_Object) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Fsrc.txt/rewriteTo/b/bucket/o/path%2Fdst.txt\n"
            "Auth Token: fake_token\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "{\"done\": true}"),
        // Deleting the original file.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Fsrc.txt\n"
            "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n"
            "Delete: yes\n",
            ""),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fsrc.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 0-15\n",
+           "Range: 0-15\n"
+           "Timeouts: 5 1 20\n",
            "89abcdef"),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fdst.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 0-15\n",
+           "Range: 0-15\n"
+           "Timeouts: 5 1 20\n",
            "fedcba98")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      16 /* block size */, 64 /* max bytes */, 0 /* max staleness */,
-      0 /* stat cache max age */, 0 /* stat cache max entries */,
-      0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   16 /* block size */, 64 /* max bytes */,
+                   0 /* max staleness */, 0 /* stat cache max age */,
+                   0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */,
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
   // Do an initial read of the source and destination files to load their
   // contents into the block cache.
   char scratch[100];
@@ -1536,13 +1632,15 @@ TEST(GcsFileSystemTest, RenameFile_Object_DeletionRetried) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2Fsrc.txt%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{}"),
        // IsDirectory is checking if the path exists as an object.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Fsrc.txt?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            strings::StrCat("{\"size\": \"1010\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        // Copying to the new location.
@@ -1550,13 +1648,15 @@ TEST(GcsFileSystemTest, RenameFile_Object_DeletionRetried) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Fsrc.txt/rewriteTo/b/bucket/o/path%2Fdst.txt\n"
            "Auth Token: fake_token\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "{\"done\": true}"),
        // Deleting the original file - the deletion returns a failure.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Fsrc.txt\n"
            "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n"
            "Delete: yes\n",
            "", errors::Unavailable("503"), 503),
        // Deleting the original file again - the deletion returns NOT_FOUND.
@@ -1564,6 +1664,7 @@ TEST(GcsFileSystemTest, RenameFile_Object_DeletionRetried) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Fsrc.txt\n"
            "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n"
            "Delete: yes\n",
            "", errors::NotFound("404"), 404)});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -1573,7 +1674,7 @@ TEST(GcsFileSystemTest, RenameFile_Object_DeletionRetried) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   TF_EXPECT_OK(
       fs.RenameFile("gs://bucket/path/src.txt", "gs://bucket/path/dst.txt"));
@@ -1587,13 +1688,15 @@ TEST(GcsFileSystemTest, RenameFile_Object_Incomplete) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2Fsrc.txt%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{}"),
        // IsDirectory is checking if the path exists as an object.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Fsrc.txt?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            strings::StrCat("{\"size\": \"1010\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        // Copying to the new location.
@@ -1601,7 +1704,8 @@ TEST(GcsFileSystemTest, RenameFile_Object_Incomplete) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Fsrc.txt/rewriteTo/b/bucket/o/path%2Fdst.txt\n"
            "Auth Token: fake_token\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "{\"done\": false}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -1610,7 +1714,7 @@ TEST(GcsFileSystemTest, RenameFile_Object_Incomplete) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   EXPECT_EQ(
       errors::Code::UNIMPLEMENTED,
@@ -1622,7 +1726,8 @@ TEST(GcsFileSystemTest, Stat_Object) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
       "file.txt?fields=size%2Cupdated\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       strings::StrCat("{\"size\": \"1010\","
                       "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -1632,7 +1737,7 @@ TEST(GcsFileSystemTest, Stat_Object) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   FileStatistics stat;
   TF_EXPECT_OK(fs.Stat("gs://bucket/file.txt", &stat));
@@ -1646,13 +1751,15 @@ TEST(GcsFileSystemTest, Stat_Folder) {
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "subfolder?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=subfolder%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"subfolder/\" }]}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -1662,7 +1769,7 @@ TEST(GcsFileSystemTest, Stat_Folder) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   FileStatistics stat;
   TF_EXPECT_OK(fs.Stat("gs://bucket/subfolder", &stat));
@@ -1676,13 +1783,15 @@ TEST(GcsFileSystemTest, Stat_ObjectOrFolderNotFound) {
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -1691,7 +1800,7 @@ TEST(GcsFileSystemTest, Stat_ObjectOrFolderNotFound) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   FileStatistics stat;
   EXPECT_EQ(error::Code::NOT_FOUND, fs.Stat("gs://bucket/path", &stat).code());
@@ -1700,7 +1809,8 @@ TEST(GcsFileSystemTest, Stat_ObjectOrFolderNotFound) {
 TEST(GcsFileSystemTest, Stat_Bucket) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -1709,7 +1819,7 @@ TEST(GcsFileSystemTest, Stat_Bucket) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   FileStatistics stat;
   TF_EXPECT_OK(fs.Stat("gs://bucket/", &stat));
@@ -1721,7 +1831,8 @@ TEST(GcsFileSystemTest, Stat_Bucket) {
 TEST(GcsFileSystemTest, Stat_BucketNotFound) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "", errors::NotFound("404"), 404)});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -1730,7 +1841,7 @@ TEST(GcsFileSystemTest, Stat_BucketNotFound) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   FileStatistics stat;
   EXPECT_EQ(error::Code::NOT_FOUND, fs.Stat("gs://bucket/", &stat).code());
@@ -1741,29 +1852,33 @@ TEST(GcsFileSystemTest, Stat_Cache) {
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "file.txt?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            strings::StrCat("{\"size\": \"1010\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "subfolder?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=subfolder%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"subfolder/\" }]}")});
-  GcsFileSystem fs(
-      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
-      std::unique_ptr<HttpRequest::Factory>(
-          new FakeHttpRequestFactory(&requests)),
-      0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
-      3600 /* stat cache max age */, 0 /* stat cache max entries */,
-      0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */);
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   3600 /* stat cache max age */,
+                   0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */,
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   // Repeated calls to fs.Stat on these paths should not lead to any additional
   // HTTP requests to GCS.
@@ -1786,12 +1901,14 @@ TEST(GcsFileSystemTest, IsDirectory_NotFound) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=file.txt%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{}"),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "file.txt?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404)});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -1800,7 +1917,7 @@ TEST(GcsFileSystemTest, IsDirectory_NotFound) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   EXPECT_EQ(error::Code::NOT_FOUND,
             fs.IsDirectory("gs://bucket/file.txt").code());
@@ -1812,12 +1929,14 @@ TEST(GcsFileSystemTest, IsDirectory_NotDirectoryButObject) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=file.txt%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{}"),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "file.txt?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            strings::StrCat("{\"size\": \"1010\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -1827,7 +1946,7 @@ TEST(GcsFileSystemTest, IsDirectory_NotDirectoryButObject) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   EXPECT_EQ(error::Code::FAILED_PRECONDITION,
             fs.IsDirectory("gs://bucket/file.txt").code());
@@ -1839,13 +1958,15 @@ TEST(GcsFileSystemTest, IsDirectory_Yes) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=subfolder%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [{\"name\": \"subfolder/\"}]}"),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=subfolder%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [{\"name\": \"subfolder/\"}]}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -1854,7 +1975,7 @@ TEST(GcsFileSystemTest, IsDirectory_Yes) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   TF_EXPECT_OK(fs.IsDirectory("gs://bucket/subfolder"));
   TF_EXPECT_OK(fs.IsDirectory("gs://bucket/subfolder/"));
@@ -1864,11 +1985,13 @@ TEST(GcsFileSystemTest, IsDirectory_Bucket) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{}"),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -1877,7 +2000,7 @@ TEST(GcsFileSystemTest, IsDirectory_Bucket) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   TF_EXPECT_OK(fs.IsDirectory("gs://bucket"));
   TF_EXPECT_OK(fs.IsDirectory("gs://bucket/"));
@@ -1886,7 +2009,8 @@ TEST(GcsFileSystemTest, IsDirectory_Bucket) {
 TEST(GcsFileSystemTest, IsDirectory_BucketNotFound) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "", errors::NotFound("404"), 404)});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -1895,7 +2019,7 @@ TEST(GcsFileSystemTest, IsDirectory_BucketNotFound) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   EXPECT_EQ(error::Code::NOT_FOUND, fs.IsDirectory("gs://bucket/").code());
 }
@@ -1907,10 +2031,12 @@ TEST(GcsFileSystemTest, CreateDir_Folder) {
            "uploadType=resumable&name=subpath%2F\n"
            "Auth Token: fake_token\n"
            "Header X-Upload-Content-Length: 0\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "", {{"Location", "https://custom/upload/location"}}),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 30\n"
                            "Put body: \n",
                            ""),
        new FakeHttpRequest(
@@ -1918,10 +2044,12 @@ TEST(GcsFileSystemTest, CreateDir_Folder) {
            "uploadType=resumable&name=subpath%2F\n"
            "Auth Token: fake_token\n"
            "Header X-Upload-Content-Length: 0\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "", {{"Location", "https://custom/upload/location"}}),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 30\n"
                            "Put body: \n",
                            "")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -1931,7 +2059,7 @@ TEST(GcsFileSystemTest, CreateDir_Folder) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   TF_EXPECT_OK(fs.CreateDir("gs://bucket/subpath"));
   TF_EXPECT_OK(fs.CreateDir("gs://bucket/subpath/"));
@@ -1941,11 +2069,13 @@ TEST(GcsFileSystemTest, CreateDir_Bucket) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            ""),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -1954,7 +2084,7 @@ TEST(GcsFileSystemTest, CreateDir_Bucket) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   TF_EXPECT_OK(fs.CreateDir("gs://bucket/"));
   TF_EXPECT_OK(fs.CreateDir("gs://bucket"));
@@ -1967,14 +2097,16 @@ TEST(GcsFileSystemTest, DeleteRecursively_Ok) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path/file1.txt\" }]}"),
        // GetChildren recursively.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2F\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path/\" },"  // The current directory's marker.
            "  { \"name\": \"path/file1.txt\" },"
@@ -1984,30 +2116,35 @@ TEST(GcsFileSystemTest, DeleteRecursively_Ok) {
        new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
                            "/bucket/o/path%2F\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            ""),
        // Delete the object - fails and will be retried.
        new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
                            "/bucket/o/path%2Ffile1.txt\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            "", errors::Unavailable("500"), 500),
        // Delete the object again.
        new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
                            "/bucket/o/path%2Ffile1.txt\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            ""),
        // Delete the object.
        new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
                            "/bucket/o/path%2Fsubpath%2Ffile2.txt\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            ""),
        // Delete the object.
        new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
                            "/bucket/o/path%2Ffile3.txt\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            "")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -2017,7 +2154,7 @@ TEST(GcsFileSystemTest, DeleteRecursively_Ok) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   int64 undeleted_files, undeleted_dirs;
   TF_EXPECT_OK(fs.DeleteRecursively("gs://bucket/path", &undeleted_files,
@@ -2033,14 +2170,16 @@ TEST(GcsFileSystemTest, DeleteRecursively_DeletionErrors) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path/file1.txt\" }]}"),
        // Calling GetChildren recursively.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2F\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path/file1.txt\" },"
            "  { \"name\": \"path/subpath/\" },"
@@ -2050,12 +2189,14 @@ TEST(GcsFileSystemTest, DeleteRecursively_DeletionErrors) {
        new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
                            "/bucket/o/path%2Ffile1.txt\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            ""),
        // Deleting the directory marker gs://bucket/path/ - fails with 404.
        new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
                            "/bucket/o/path%2Fsubpath%2F\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            "", errors::NotFound("404"), 404),
        // Checking if gs://bucket/path/subpath/ is a folder - it is.
@@ -2063,19 +2204,22 @@ TEST(GcsFileSystemTest, DeleteRecursively_DeletionErrors) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2Fsubpath%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            strings::StrCat("{\"items\": [ "
                            "    { \"name\": \"path/subpath/\" }]}")),
        // Deleting the object gs://bucket/path/subpath/file2.txt
        new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
                            "/bucket/o/path%2Fsubpath%2Ffile2.txt\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            ""),
        // Deleting the object s://bucket/path/file3.txt - fails with 404.
        new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
                            "/bucket/o/path%2Ffile3.txt\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            "", errors::NotFound("404"), 404),
        // Checking if gs://bucket/path/file3.txt/ is a folder - it's not.
@@ -2083,13 +2227,15 @@ TEST(GcsFileSystemTest, DeleteRecursively_DeletionErrors) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2Ffile3.txt%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{}"),
        // Checking if gs://bucket/path/file3.txt is an object - fails with 404.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Ffile3.txt?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404)});
 
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -2099,7 +2245,7 @@ TEST(GcsFileSystemTest, DeleteRecursively_DeletionErrors) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   int64 undeleted_files, undeleted_dirs;
   TF_EXPECT_OK(fs.DeleteRecursively("gs://bucket/path", &undeleted_files,
@@ -2115,13 +2261,15 @@ TEST(GcsFileSystemTest, DeleteRecursively_NotAFolder) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{}"),
        // IsDirectory is checking if the path exists as an object.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404)});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -2130,7 +2278,7 @@ TEST(GcsFileSystemTest, DeleteRecursively_NotAFolder) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig);
 
   int64 undeleted_files, undeleted_dirs;
   EXPECT_EQ(error::Code::NOT_FOUND,
@@ -2147,6 +2295,11 @@ TEST(GcsFileSystemTest, OverrideCacheParameters) {
   EXPECT_EQ(128 * 1024 * 1024, fs1.block_size());
   EXPECT_EQ(2 * fs1.block_size(), fs1.max_bytes());
   EXPECT_EQ(0, fs1.max_staleness());
+  EXPECT_EQ(120, fs1.timeouts().connect);
+  EXPECT_EQ(60, fs1.timeouts().idle);
+  EXPECT_EQ(3600, fs1.timeouts().metadata);
+  EXPECT_EQ(3600, fs1.timeouts().read);
+  EXPECT_EQ(3600, fs1.timeouts().write);
 
   // Verify legacy readahead buffer override sets block size.
   setenv("GCS_READAHEAD_BUFFER_SIZE_BYTES", "123456789", 1);
@@ -2172,6 +2325,42 @@ TEST(GcsFileSystemTest, OverrideCacheParameters) {
   EXPECT_EQ(32, fs4.stat_cache_max_entries());
   EXPECT_EQ(30, fs4.matching_paths_cache_max_age());
   EXPECT_EQ(64, fs4.matching_paths_cache_max_entries());
+
+  // Verify timeout overrides.
+  setenv("GCS_REQUEST_CONNECTION_TIMEOUT_SECS", "10", 1);
+  setenv("GCS_REQUEST_IDLE_TIMEOUT_SECS", "5", 1);
+  setenv("GCS_METADATA_REQUEST_TIMEOUT_SECS", "20", 1);
+  setenv("GCS_READ_REQUEST_TIMEOUT_SECS", "30", 1);
+  setenv("GCS_WRITE_REQUEST_TIMEOUT_SECS", "40", 1);
+  GcsFileSystem fs5;
+  EXPECT_EQ(10, fs5.timeouts().connect);
+  EXPECT_EQ(5, fs5.timeouts().idle);
+  EXPECT_EQ(20, fs5.timeouts().metadata);
+  EXPECT_EQ(30, fs5.timeouts().read);
+  EXPECT_EQ(40, fs5.timeouts().write);
+}
+
+TEST(GcsFileSystemTest, CreateHttpRequest) {
+  std::vector<HttpRequest*> requests(
+      {// IsDirectory is checking whether there are children objects.
+       new FakeHttpRequest("Uri: https://www.googleapis.com/fake\n"
+                           "Auth Token: fake_token\n"
+                           "Header Hello: world\n",
+                           "{}")});
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */,
+                   0 /* initial retry delay */, kTestTimeoutConfig);
+
+  std::unique_ptr<HttpRequest> request;
+  TF_EXPECT_OK(fs.CreateHttpRequest(&request));
+  TF_EXPECT_OK(request->SetUri("https://www.googleapis.com/fake"));
+  TF_EXPECT_OK(request->AddHeader("Hello", "world"));
+  TF_EXPECT_OK(request->Send());
 }
 
 }  // namespace
diff --git a/tensorflow/core/platform/cloud/google_auth_provider.cc b/tensorflow/core/platform/cloud/google_auth_provider.cc
index f6fd8373cd593da3afdb159640b9cd29fcb795b5..d77f439c5acaa1712ce1f203bafa003aafa6e7c9 100644
--- a/tensorflow/core/platform/cloud/google_auth_provider.cc
+++ b/tensorflow/core/platform/cloud/google_auth_provider.cc
@@ -14,9 +14,12 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/platform/cloud/google_auth_provider.h"
+#ifndef _WIN32
 #include <pwd.h>
-#include <sys/types.h>
 #include <unistd.h>
+#else
+#include <sys/types.h>
+#endif
 #include <fstream>
 #include "include/json/json.h"
 #include "tensorflow/core/lib/core/errors.h"
diff --git a/tensorflow/core/platform/cloud/http_request.h b/tensorflow/core/platform/cloud/http_request.h
index 02d9e9054ad3b22f3cd15cf7b24d917184db264b..95a436c6229b18963985c93e158df030efd4adb3 100644
--- a/tensorflow/core/platform/cloud/http_request.h
+++ b/tensorflow/core/platform/cloud/http_request.h
@@ -118,6 +118,16 @@ class HttpRequest {
   // Url encodes str and returns a new string.
   virtual string EscapeString(const string& str) = 0;
 
+  /// \brief Set timeouts for this request.
+  ///
+  /// The connection parameter controls how long we should wait for the
+  /// connection to be established. The inactivity parameter controls how long
+  /// we should wait between additional responses from the server. Finally the
+  /// total parameter controls the maximum total connection time to prevent
+  /// hanging indefinitely.
+  virtual Status SetTimeouts(uint32 connection, uint32 inactivity,
+                             uint32 total) = 0;
+
   TF_DISALLOW_COPY_AND_ASSIGN(HttpRequest);
 };
 
diff --git a/tensorflow/core/platform/cloud/http_request_fake.h b/tensorflow/core/platform/cloud/http_request_fake.h
index bfe04f6363b6cde227f73333f2351b550be1dde1..f65c15dac77b1c689319a08237ff91cab849a70b 100644
--- a/tensorflow/core/platform/cloud/http_request_fake.h
+++ b/tensorflow/core/platform/cloud/http_request_fake.h
@@ -37,7 +37,8 @@ class FakeHttpRequest : public CurlHttpRequest {
  public:
   /// Return the response for the given request.
   FakeHttpRequest(const string& request, const string& response)
-      : FakeHttpRequest(request, response, Status::OK(), nullptr, {}, 200) {}
+      : FakeHttpRequest(request, response, Status::OK(), nullptr, {}, 200) {
+  }
 
   /// Return the response with headers for the given request.
   FakeHttpRequest(const string& request, const string& response,
@@ -76,7 +77,7 @@ class FakeHttpRequest : public CurlHttpRequest {
 
   Status Init() override { return Status::OK(); }
   Status SetUri(const string& uri) override {
-    actual_request_ += "Uri: " + uri + "\n";
+    actual_uri_ += "Uri: " + uri + "\n";
     return Status::OK();
   }
   Status SetRange(uint64 start, uint64 end) override {
@@ -130,7 +131,8 @@ class FakeHttpRequest : public CurlHttpRequest {
     return Status::OK();
   }
   Status Send() override {
-    EXPECT_EQ(expected_request_, actual_request_) << "Unexpected HTTP request.";
+    EXPECT_EQ(expected_request_, actual_request())
+        << "Unexpected HTTP request.";
     if (buffer_) {
       buffer_->insert(buffer_->begin(), response_.c_str(),
                       response_.c_str() + response_.size());
@@ -160,9 +162,24 @@ class FakeHttpRequest : public CurlHttpRequest {
 
   virtual uint64 GetResponseCode() const override { return response_code_; }
 
+  Status SetTimeouts(uint32 connection, uint32 inactivity,
+                     uint32 total) override {
+    actual_request_ += strings::StrCat("Timeouts: ", connection, " ",
+                                       inactivity, " ", total, "\n");
+    return Status::OK();
+  }
+
  private:
+  string actual_request() const {
+    string s;
+    s.append(actual_uri_);
+    s.append(actual_request_);
+    return s;
+  }
+
   std::vector<char>* buffer_ = nullptr;
   string expected_request_;
+  string actual_uri_;
   string actual_request_;
   string response_;
   Status response_status_;
diff --git a/tensorflow/core/platform/cloud/oauth_client.cc b/tensorflow/core/platform/cloud/oauth_client.cc
index c700b97dc95f85400f9a8c214ea1ccc2b1a3e436..3c2830ccd92acdeaa205063ab4867b0c47d567d4 100644
--- a/tensorflow/core/platform/cloud/oauth_client.cc
+++ b/tensorflow/core/platform/cloud/oauth_client.cc
@@ -14,9 +14,13 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/platform/cloud/oauth_client.h"
+#ifndef _WIN32
 #include <pwd.h>
 #include <sys/types.h>
 #include <unistd.h>
+#else
+#include <sys/types.h>
+#endif
 #include <fstream>
 #include <openssl/bio.h>
 #include <openssl/evp.h>
diff --git a/tensorflow/core/platform/cloud/time_util.cc b/tensorflow/core/platform/cloud/time_util.cc
index 2f8643f3c7f39c53566d481c078d8f71b44bbedd..0587a65c299778b95ccdec86e03c9f5dca8ec878 100644
--- a/tensorflow/core/platform/cloud/time_util.cc
+++ b/tensorflow/core/platform/cloud/time_util.cc
@@ -18,6 +18,9 @@ limitations under the License.
 #include <cmath>
 #include <cstdio>
 #include <ctime>
+#ifdef _WIN32
+#define timegm _mkgmtime
+#endif
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 5eeb861bddfa1701143d3e10da7812fd4b6e33b3..948334d27ba420097d0ea686153638fc45d63606 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -399,13 +399,13 @@ def tf_env_time_srcs():
 def tf_additional_cupti_wrapper_deps():
   return ["//tensorflow/core/platform/default/gpu:cupti_wrapper"]
 
-def tf_additional_gpu_tracer_srcs():
-  return ["platform/default/gpu_tracer.cc"]
+def tf_additional_device_tracer_srcs():
+  return ["platform/default/device_tracer.cc"]
 
-def tf_additional_gpu_tracer_cuda_deps():
+def tf_additional_device_tracer_cuda_deps():
   return []
 
-def tf_additional_gpu_tracer_deps():
+def tf_additional_device_tracer_deps():
   return []
 
 def tf_additional_libdevice_data():
@@ -458,7 +458,6 @@ def tf_additional_lib_deps():
 
 def tf_additional_core_deps():
   return select({
-      "//tensorflow:with_gcp_support_windows_override": [],
       "//tensorflow:with_gcp_support_android_override": [],
       "//tensorflow:with_gcp_support_ios_override": [],
       "//tensorflow:with_gcp_support": [
diff --git a/tensorflow/core/platform/default/build_config_root.bzl b/tensorflow/core/platform/default/build_config_root.bzl
index caeed0aa4a32213e490dd0a05adadeff847d14df..09029a4b256beceeb69c735c15bb1587cb1e06ac 100644
--- a/tensorflow/core/platform/default/build_config_root.bzl
+++ b/tensorflow/core/platform/default/build_config_root.bzl
@@ -10,46 +10,51 @@ def tf_sycl_tests_tags():
 
 def tf_additional_plugin_deps():
   return select({
-      "//tensorflow:with_xla_support": ["//tensorflow/compiler/jit"],
+      str(Label("//tensorflow:with_xla_support")): [
+          str(Label("//tensorflow/compiler/jit"))
+      ],
       "//conditions:default": [],
   })
 
 def tf_additional_xla_deps_py():
   return []
 
+def tf_additional_grpc_deps_py():
+  return []
+
 def tf_additional_license_deps():
   return select({
-      "//tensorflow:with_xla_support": ["@llvm//:LICENSE.TXT"],
+      str(Label("//tensorflow:with_xla_support")): ["@llvm//:LICENSE.TXT"],
       "//conditions:default": [],
   })
 
 def tf_additional_verbs_deps():
   return select({
-      "//tensorflow:with_verbs_support": [
-          "//tensorflow/contrib/verbs:verbs_server_lib",
-          "//tensorflow/contrib/verbs:grpc_verbs_client",
-      ], 
+      str(Label("//tensorflow:with_verbs_support")): [
+          str(Label("//tensorflow/contrib/verbs:verbs_server_lib")),
+          str(Label("//tensorflow/contrib/verbs:grpc_verbs_client")),
+      ],
       "//conditions:default": [],
   })
 
 def tf_additional_mpi_deps():
   return select({
-      "//tensorflow:with_mpi_support": [
-          "//tensorflow/contrib/mpi:mpi_server_lib",
+      str(Label("//tensorflow:with_mpi_support")): [
+          str(Label("//tensorflow/contrib/mpi:mpi_server_lib")),
       ],
       "//conditions:default": [],
   })
 
 def tf_additional_gdr_deps():
   return select({
-      "//tensorflow:with_gdr_support": [
-          "//tensorflow/contrib/gdr:gdr_server_lib",
+      str(Label("//tensorflow:with_gdr_support")): [
+          str(Label("//tensorflow/contrib/gdr:gdr_server_lib")),
       ],
       "//conditions:default": [],
   })
 
 def if_static(extra_deps, otherwise=[]):
   return select({
-      "//tensorflow:framework_shared_object": otherwise,
+      str(Label("//tensorflow:framework_shared_object")): otherwise,
       "//conditions:default": extra_deps,
   })
diff --git a/tensorflow/core/platform/default/gpu_tracer.cc b/tensorflow/core/platform/default/device_tracer.cc
similarity index 93%
rename from tensorflow/core/platform/default/gpu_tracer.cc
rename to tensorflow/core/platform/default/device_tracer.cc
index d6489f2f00497dfd052e1af720188976180b450b..f4b0f16393d70521386ad49fbf010591e5afb08c 100644
--- a/tensorflow/core/platform/default/gpu_tracer.cc
+++ b/tensorflow/core/platform/default/device_tracer.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/platform/gpu_tracer.h"
+#include "tensorflow/core/platform/device_tracer.h"
 
 #if GOOGLE_CUDA
 
@@ -101,7 +101,7 @@ const char *getActivityOverheadKindString(CUpti_ActivityOverheadKind kind) {
 }  // namespace
 
 namespace tensorflow {
-namespace gputracer {
+namespace devicetracer {
 
 // Forward declaration.
 class CUPTIManager;
@@ -286,14 +286,14 @@ CUPTIManager *GetCUPTIManager() {
 // for the duration of the CUPTI API callback.
 TF_STATIC_THREAD_LOCAL_POD(const char *, tls_current_annotation);
 
-class GPUTracerImpl : public GPUTracer,
-                      public CUPTIClient,
-                      public port::Tracing::Engine {
+class DeviceTracerImpl : public DeviceTracer,
+                         public CUPTIClient,
+                         public port::Tracing::Engine {
  public:
-  GPUTracerImpl();
-  ~GPUTracerImpl() override;
+  DeviceTracerImpl();
+  ~DeviceTracerImpl() override;
 
-  // GPUTracer interface:
+  // DeviceTracer interface:
   Status Start() override;
   Status Stop() override;
   Status Collect(StepStatsCollector *collector) override;
@@ -348,7 +348,7 @@ class GPUTracerImpl : public GPUTracer,
   };
 
   // This is the subscriber callback which is invoked directly by CUPTI.
-  // The 'userdata' argument will be a pointer to the active 'GPUTracerImpl'.
+  // The 'userdata' argument will be a pointer to the active 'DeviceTracerImpl'.
   static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain,
                                    CUpti_CallbackId cbid, const void *cbdata);
 
@@ -375,28 +375,28 @@ class GPUTracerImpl : public GPUTracer,
   uint64_t start_timestamp_ GUARDED_BY(mu_);
   uint64_t end_timestamp_ GUARDED_BY(mu_);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GPUTracerImpl);
+  TF_DISALLOW_COPY_AND_ASSIGN(DeviceTracerImpl);
 };
 
-GPUTracerImpl::GPUTracerImpl() {
-  VLOG(1) << "GPUTracer created.";
+DeviceTracerImpl::DeviceTracerImpl() {
+  VLOG(1) << "DeviceTracer created.";
   cupti_manager_ = GetCUPTIManager();
   CHECK(cupti_manager_);
   cupti_wrapper_.reset(new perftools::gputools::profiler::CuptiWrapper());
   enabled_ = false;
 }
 
-GPUTracerImpl::~GPUTracerImpl() {
+DeviceTracerImpl::~DeviceTracerImpl() {
   // Unregister the CUPTI callbacks if needed to prevent them from accessing
   // freed memory.
   Stop().IgnoreError();
 }
 
-Status GPUTracerImpl::Start() {
-  VLOG(1) << "GPUTracer::Start";
+Status DeviceTracerImpl::Start() {
+  VLOG(1) << "DeviceTracer::Start";
   mutex_lock l(mu_);
   if (enabled_) {
-    return errors::FailedPrecondition("GPUTracer is already enabled.");
+    return errors::FailedPrecondition("DeviceTracer is already enabled.");
   }
   // There can only be one CUPTI subscriber.  If we can't create one then
   // there is another trace in progress (possibly by external code).
@@ -451,8 +451,8 @@ Status GPUTracerImpl::Start() {
   return Status::OK();
 }
 
-Status GPUTracerImpl::Stop() {
-  VLOG(1) << "GPUTracer::Stop";
+Status DeviceTracerImpl::Stop() {
+  VLOG(1) << "DeviceTracer::Stop";
   mutex_lock l(mu_);
   if (!enabled_) {
     return Status::OK();
@@ -466,20 +466,20 @@ Status GPUTracerImpl::Stop() {
   return Status::OK();
 }
 
-void GPUTracerImpl::AddCorrelationId(uint32 correlation_id,
-                                     const string &name) {
+void DeviceTracerImpl::AddCorrelationId(uint32 correlation_id,
+                                        const string &name) {
   VLOG(2) << correlation_id << " : " << name;
   mutex_lock l(trace_mu_);
   if (correlations_.size() >= kMaxRecords) return;
   correlations_.emplace(correlation_id, name);
 }
 
-/*static*/ void GPUTracerImpl::ApiCallback(void *userdata,
-                                           CUpti_CallbackDomain domain,
-                                           CUpti_CallbackId cbid,
-                                           const void *cbdata) {
+/*static*/ void DeviceTracerImpl::ApiCallback(void *userdata,
+                                              CUpti_CallbackDomain domain,
+                                              CUpti_CallbackId cbid,
+                                              const void *cbdata) {
   auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata);
-  GPUTracerImpl *tracer = reinterpret_cast<GPUTracerImpl *>(userdata);
+  DeviceTracerImpl *tracer = reinterpret_cast<DeviceTracerImpl *>(userdata);
   VLOG(2) << "ApiCallback " << domain << ":" << cbid
           << " func: " << cbInfo->functionName;
 
@@ -533,7 +533,7 @@ void GPUTracerImpl::AddCorrelationId(uint32 correlation_id,
   }
 }
 
-void GPUTracerImpl::ActivityCallback(const CUpti_Activity &record) {
+void DeviceTracerImpl::ActivityCallback(const CUpti_Activity &record) {
   VLOG(2) << "ActivityCallback " << record.kind;
   mutex_lock l(trace_mu_);
   switch (record.kind) {
@@ -570,10 +570,10 @@ void GPUTracerImpl::ActivityCallback(const CUpti_Activity &record) {
   }
 }
 
-Status GPUTracerImpl::Collect(StepStatsCollector *collector) {
+Status DeviceTracerImpl::Collect(StepStatsCollector *collector) {
   mutex_lock l(mu_);
   if (enabled_) {
-    return errors::FailedPrecondition("GPUTracer is still enabled.");
+    return errors::FailedPrecondition("DeviceTracer is still enabled.");
   }
 
   // TODO(pbar) Handle device IDs and prefix properly.
@@ -630,10 +630,10 @@ Status GPUTracerImpl::Collect(StepStatsCollector *collector) {
   return Status::OK();
 }
 
-}  // namespace gputracer
+}  // namespace devicetracer
 
-std::unique_ptr<GPUTracer> CreateGPUTracer() {
-  std::unique_ptr<GPUTracer> tracer(new gputracer::GPUTracerImpl());
+std::unique_ptr<DeviceTracer> CreateDeviceTracer() {
+  std::unique_ptr<DeviceTracer> tracer(new devicetracer::DeviceTracerImpl());
   return tracer;
 }
 
@@ -643,7 +643,7 @@ std::unique_ptr<GPUTracer> CreateGPUTracer() {
 
 namespace tensorflow {
 
-std::unique_ptr<GPUTracer> CreateGPUTracer() { return nullptr; }
+std::unique_ptr<DeviceTracer> CreateDeviceTracer() { return nullptr; }
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/platform/default/mutex.h b/tensorflow/core/platform/default/mutex.h
index c3e44c42d942326af210e1038da20bf655d14a10..044c754e80bd0dee04c73e969c325a2aa4a89c31 100644
--- a/tensorflow/core/platform/default/mutex.h
+++ b/tensorflow/core/platform/default/mutex.h
@@ -31,6 +31,8 @@ namespace tensorflow {
 
 enum LinkerInitialized { LINKER_INITIALIZED };
 
+class condition_variable;
+
 // Mimic std::mutex + C++17's shared_mutex, adding a LinkerInitialized
 // constructor interface.  This type is as fast as mutex, but is also a shared
 // lock.
diff --git a/tensorflow/core/platform/gpu_tracer.h b/tensorflow/core/platform/device_tracer.h
similarity index 69%
rename from tensorflow/core/platform/gpu_tracer.h
rename to tensorflow/core/platform/device_tracer.h
index 3373d974e3815939989b5abd3fa294025082212b..d0f86a51030710cb97d2c962c460eaf87b9931d4 100644
--- a/tensorflow/core/platform/gpu_tracer.h
+++ b/tensorflow/core/platform/device_tracer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_PLATFORM_GPU_TRACER_H_
-#define TENSORFLOW_CORE_PLATFORM_GPU_TRACER_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_DEVICE_TRACER_H_
+#define TENSORFLOW_CORE_PLATFORM_DEVICE_TRACER_H_
 
 #include <memory>
 
@@ -24,16 +24,16 @@ namespace tensorflow {
 
 class StepStatsCollector;
 
-// 'GPUTracer' is an interface for collecting low-level execution timings
-// of GPU computation and DMA transfers.
+// 'DeviceTracer' is an interface for collecting low-level execution timings
+// of hardware accelerator (e.g. GPU) computation and DMA transfers.
 //
 // Typical usage pattern is as follows:
 //
-// GPUTracer* tracer = CreateGPUTracer();
+// DeviceTracer* tracer = CreateDeviceTracer();
 // if (tracer) {
 //   tracer->Start();
 //
-//   ... perform some GPU computations.
+//   ... perform some computations on a hardware accelerator.
 //
 //   tracer->Stop();
 //
@@ -44,23 +44,23 @@ class StepStatsCollector;
 //
 // Notes:
 // Tracing is not supported on all plaforms.  On platforms
-// with no GPU tracing support, 'CreateGPUTracer' will return 'nullptr'.
-// On most plaforms, GPU tracing will be a system-wide activity and
-// a single 'GPUTracer' will collect activity from all GPUs.
+// with no tracing support, 'CreateDeviceTracer' will return 'nullptr'.
+// On most plaforms, hardware tracing will be a system-wide activity and
+// a single 'DeviceTracer' will collect activity from all devices.
 // It is also common that only a single tracer may be active at any
 // given time.  The 'Start' method will return an error if tracing is
 // already in progress elsewhere.
 //
-class GPUTracer {
+class DeviceTracer {
  public:
-  virtual ~GPUTracer() {}
+  virtual ~DeviceTracer() {}
 
-  // Start GPU tracing.
+  // Start device tracing.
   // Note that only a single trace can be active, in which case this
   // methods will return an 'Unavailable' error.
   virtual Status Start() = 0;
 
-  // Stop GPU tracing.
+  // Stop device tracing.
   // It is safe to call 'Stop' on a tracer which is not enabled.
   virtual Status Stop() = 0;
 
@@ -70,10 +70,10 @@ class GPUTracer {
   virtual Status Collect(StepStatsCollector* collector) = 0;
 };
 
-// Creates a platform-specific GPUTracer.
+// Creates a platform-specific DeviceTracer.
 // Returns 'nullptr' on platforms where tracing is not supported.
-std::unique_ptr<GPUTracer> CreateGPUTracer();
+std::unique_ptr<DeviceTracer> CreateDeviceTracer();
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_PLATFORM_GPU_TRACER_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_DEVICE_TRACER_H_
diff --git a/tensorflow/core/platform/gpu_tracer_test.cc b/tensorflow/core/platform/device_tracer_test.cc
similarity index 84%
rename from tensorflow/core/platform/gpu_tracer_test.cc
rename to tensorflow/core/platform/device_tracer_test.cc
index ce2985fd47c6de819aedd78a047815edb0e29e86..c0c08dabacbcb9fdbbfd9bdbe16bcfaea7328507 100644
--- a/tensorflow/core/platform/gpu_tracer_test.cc
+++ b/tensorflow/core/platform/device_tracer_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/platform/gpu_tracer.h"
+#include "tensorflow/core/platform/device_tracer.h"
 
 #include <map>
 #include <memory>
@@ -50,7 +50,7 @@ std::unique_ptr<Session> CreateSession() {
   return std::unique_ptr<Session>(NewSession(options));
 }
 
-class GPUTracerTest : public ::testing::Test {
+class DeviceTracerTest : public ::testing::Test {
  public:
   void Initialize(std::initializer_list<float> a_values) {
     Graph graph(OpRegistry::Global());
@@ -84,10 +84,10 @@ class GPUTracerTest : public ::testing::Test {
 
  protected:
   void ExpectFailure(const Status& status, error::Code code) {
-    EXPECT_FALSE(status.ok());
+    EXPECT_FALSE(status.ok()) << status.ToString();
     if (!status.ok()) {
       LOG(INFO) << "Status message: " << status.error_message();
-      EXPECT_EQ(code, status.code());
+      EXPECT_EQ(code, status.code()) << status.ToString();
     }
   }
 
@@ -97,22 +97,22 @@ class GPUTracerTest : public ::testing::Test {
   GraphDef def_;
 };
 
-TEST_F(GPUTracerTest, StartStop) {
-  std::unique_ptr<GPUTracer> tracer(CreateGPUTracer());
+TEST_F(DeviceTracerTest, StartStop) {
+  std::unique_ptr<DeviceTracer> tracer(CreateDeviceTracer());
   if (!tracer) return;
   TF_EXPECT_OK(tracer->Start());
   TF_EXPECT_OK(tracer->Stop());
 }
 
-TEST_F(GPUTracerTest, StopBeforeStart) {
-  std::unique_ptr<GPUTracer> tracer(CreateGPUTracer());
+TEST_F(DeviceTracerTest, StopBeforeStart) {
+  std::unique_ptr<DeviceTracer> tracer(CreateDeviceTracer());
   if (!tracer) return;
   TF_EXPECT_OK(tracer->Stop());
   TF_EXPECT_OK(tracer->Stop());
 }
 
-TEST_F(GPUTracerTest, CollectBeforeStart) {
-  std::unique_ptr<GPUTracer> tracer(CreateGPUTracer());
+TEST_F(DeviceTracerTest, CollectBeforeStart) {
+  std::unique_ptr<DeviceTracer> tracer(CreateDeviceTracer());
   if (!tracer) return;
   StepStats stats;
   StepStatsCollector collector(&stats);
@@ -120,8 +120,8 @@ TEST_F(GPUTracerTest, CollectBeforeStart) {
   EXPECT_EQ(stats.dev_stats_size(), 0);
 }
 
-TEST_F(GPUTracerTest, CollectBeforeStop) {
-  std::unique_ptr<GPUTracer> tracer(CreateGPUTracer());
+TEST_F(DeviceTracerTest, CollectBeforeStop) {
+  std::unique_ptr<DeviceTracer> tracer(CreateDeviceTracer());
   if (!tracer) return;
   TF_EXPECT_OK(tracer->Start());
   StepStats stats;
@@ -131,9 +131,9 @@ TEST_F(GPUTracerTest, CollectBeforeStop) {
   TF_EXPECT_OK(tracer->Stop());
 }
 
-TEST_F(GPUTracerTest, StartTwoTracers) {
-  std::unique_ptr<GPUTracer> tracer1(CreateGPUTracer());
-  std::unique_ptr<GPUTracer> tracer2(CreateGPUTracer());
+TEST_F(DeviceTracerTest, StartTwoTracers) {
+  std::unique_ptr<DeviceTracer> tracer1(CreateDeviceTracer());
+  std::unique_ptr<DeviceTracer> tracer2(CreateDeviceTracer());
   if (!tracer1 || !tracer2) return;
 
   TF_EXPECT_OK(tracer1->Start());
@@ -144,9 +144,9 @@ TEST_F(GPUTracerTest, StartTwoTracers) {
   TF_EXPECT_OK(tracer2->Stop());
 }
 
-TEST_F(GPUTracerTest, RunWithTracer) {
-  // On non-GPU platforms, we may not support GPUTracer.
-  std::unique_ptr<GPUTracer> tracer(CreateGPUTracer());
+TEST_F(DeviceTracerTest, RunWithTracer) {
+  // On non-GPU platforms, we may not support DeviceTracer.
+  std::unique_ptr<DeviceTracer> tracer(CreateDeviceTracer());
   if (!tracer) return;
 
   Initialize({3, 2, -1, 0});
@@ -172,8 +172,8 @@ TEST_F(GPUTracerTest, RunWithTracer) {
   EXPECT_FLOAT_EQ(5.0, mat(0, 0));
 }
 
-TEST_F(GPUTracerTest, TraceToStepStatsCollector) {
-  std::unique_ptr<GPUTracer> tracer(CreateGPUTracer());
+TEST_F(DeviceTracerTest, TraceToStepStatsCollector) {
+  std::unique_ptr<DeviceTracer> tracer(CreateDeviceTracer());
   if (!tracer) return;
 
   Initialize({3, 2, -1, 0});
@@ -198,10 +198,10 @@ TEST_F(GPUTracerTest, TraceToStepStatsCollector) {
   collector.Finalize();
   // Depending on whether this runs on CPU or GPU, we will have a
   // different number of devices.
-  EXPECT_GE(stats.dev_stats_size(), 1);
+  EXPECT_GE(stats.dev_stats_size(), 1) << "Saw stats: " << stats.DebugString();
 }
 
-TEST_F(GPUTracerTest, RunWithTraceOption) {
+TEST_F(DeviceTracerTest, RunWithTraceOption) {
   Initialize({3, 2, -1, 0});
   auto session = CreateSession();
   ASSERT_TRUE(session != nullptr);
diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc
index 12ef55ec26e3355f08235cce557b9c7ae0618f04..9a7725da94f6f3d7c381a8d165160a664c5eac4f 100644
--- a/tensorflow/core/platform/env.cc
+++ b/tensorflow/core/platform/env.cc
@@ -20,6 +20,10 @@ limitations under the License.
 #if defined(__APPLE__)
 #include <mach-o/dyld.h>
 #endif
+#if defined(__FreeBSD__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
 #if defined(PLATFORM_WINDOWS)
 #include <windows.h>
 #include "tensorflow/core/platform/windows/windows_file_system.h"
@@ -266,6 +270,14 @@ string Env::GetExecutablePath() {
   char unresolved_path[buffer_size];
   _NSGetExecutablePath(unresolved_path, &buffer_size);
   CHECK(realpath(unresolved_path, exe_path));
+#elif defined(__FreeBSD__)
+  int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1};
+  size_t exe_path_size = PATH_MAX;
+
+  if (sysctl(mib, 4, exe_path, &exe_path_size, NULL, 0) != 0) {
+    // Resolution of path failed
+    return "";
+  }
 #elif defined(PLATFORM_WINDOWS)
   HMODULE hModule = GetModuleHandleW(NULL);
   WCHAR wc_file_path[MAX_PATH] = {0};
@@ -288,30 +300,47 @@ bool Env::LocalTempFilename(string* filename) {
   // Try each directory, as they might be full, have inappropriate
   // permissions or have different problems at times.
   for (const string& dir : dirs) {
+    *filename = io::JoinPath(dir, "tempfile-");
+    if (CreateUniqueFileName(filename, "")) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool Env::CreateUniqueFileName(string* prefix, const string& suffix) {
 #ifdef __APPLE__
-    uint64_t tid64;
-    pthread_threadid_np(nullptr, &tid64);
-    int32 tid = static_cast<int32>(tid64);
-    int32 pid = static_cast<int32>(getpid());
+  uint64_t tid64;
+  pthread_threadid_np(nullptr, &tid64);
+  int32 tid = static_cast<int32>(tid64);
+  int32 pid = static_cast<int32>(getpid());
+#elif defined(__FreeBSD__)
+  // Has to be casted to long first, else this error appears:
+  // static_cast from 'pthread_t' (aka 'pthread *') to 'int32' (aka 'int')
+  // is not allowed
+  int32 tid = static_cast<int32>((long) pthread_self());
+  int32 pid = static_cast<int32>(getpid());
 #elif defined(PLATFORM_WINDOWS)
-    int32 tid = static_cast<int32>(GetCurrentThreadId());
-    int32 pid = static_cast<int32>(GetCurrentProcessId());
+  int32 tid = static_cast<int32>(GetCurrentThreadId());
+  int32 pid = static_cast<int32>(GetCurrentProcessId());
 #else
-    int32 tid = static_cast<int32>(pthread_self());
-    int32 pid = static_cast<int32>(getpid());
+  int32 tid = static_cast<int32>(pthread_self());
+  int32 pid = static_cast<int32>(getpid());
 #endif
-    uint64 now_microsec = NowMicros();
+  uint64 now_microsec = NowMicros();
 
-    *filename = io::JoinPath(
-        dir, strings::Printf("tempfile-%s-%x-%d-%llx", port::Hostname().c_str(),
-                             tid, pid, now_microsec));
-    if (FileExists(*filename).ok()) {
-      filename->clear();
-    } else {
-      return true;
-    }
+  *prefix += strings::Printf("%s-%x-%d-%llx", port::Hostname().c_str(),
+                           tid, pid, now_microsec);
+
+  if (suffix.size()) {
+    *prefix += suffix;
+  }
+  if (FileExists(*prefix).ok()) {
+    prefix->clear();
+    return false;
+  } else {
+    return true;
   }
-  return false;
 }
 
 Thread::~Thread() {}
diff --git a/tensorflow/core/platform/env.h b/tensorflow/core/platform/env.h
index da8c3e2d7e8a50c9d441cd371078fa86aae13179..a0adf70ef4c6d5c4f9facf2c9336330e5ad49fc9 100644
--- a/tensorflow/core/platform/env.h
+++ b/tensorflow/core/platform/env.h
@@ -218,6 +218,10 @@ class Env {
   /// Creates a local unique temporary file name. Returns true if success.
   bool LocalTempFilename(string* filename);
 
+  /// Creates a local unique file name that starts with |prefix| and ends with
+  /// |suffix|. Returns true if success.
+  bool CreateUniqueFileName(string* prefix, const string& suffix);
+
   // TODO(jeff,sanjay): Add back thread/thread-pool support if needed.
   // TODO(jeff,sanjay): if needed, tighten spec so relative to epoch, or
   // provide a routine to get the absolute time.
diff --git a/tensorflow/core/platform/env_test.cc b/tensorflow/core/platform/env_test.cc
index c9b362f18235f8ddec0994bc1110aaec950eef72..233c370a5fe74deea3ddf2163fe3547046a5785a 100644
--- a/tensorflow/core/platform/env_test.cc
+++ b/tensorflow/core/platform/env_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -340,4 +341,18 @@ TEST_F(DefaultEnvTest, LocalTempFilename) {
   EXPECT_FALSE(env->FileExists(filename).ok());
 }
 
+TEST_F(DefaultEnvTest, CreateUniqueFileName) {
+  Env* env = Env::Default();
+
+  string prefix = "tempfile-prefix-";
+  string suffix = ".tmp";
+  string filename = prefix;
+
+  EXPECT_TRUE(env->CreateUniqueFileName(&filename, suffix));
+
+  StringPiece str(filename);
+  EXPECT_TRUE(str.starts_with(prefix));
+  EXPECT_TRUE(str.ends_with(suffix));
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/posix/error.cc b/tensorflow/core/platform/posix/error.cc
index f8b0285c50480c8b23a49062133d9b6f224468b3..cda6d7d8f9d6ad3e7f2c8fa56cc99a8dbe07fa00 100644
--- a/tensorflow/core/platform/posix/error.cc
+++ b/tensorflow/core/platform/posix/error.cc
@@ -131,8 +131,8 @@ error::Code ErrnoToCode(int err_number) {
     case ENETUNREACH:   // Network unreachable
     case ENOLCK:        // No locks available
     case ENOLINK:       // Link has been severed
-#if !(defined(__APPLE__) || defined(__FreeBSD__) || defined(_WIN32) \
-	|| defined(__HAIKU__))
+#if !(defined(__APPLE__) || defined(__FreeBSD__) || defined(_WIN32) || \
+      defined(__HAIKU__))
     case ENONET:  // Machine is not on the network
 #endif
       code = error::UNAVAILABLE;
diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc
index 09f69a95c1ddd6c001f33ab54b395759c35d6b5a..614ee00b0133976e9fe49caf7c75a01194e10237 100644
--- a/tensorflow/core/platform/posix/port.cc
+++ b/tensorflow/core/platform/posix/port.cc
@@ -37,8 +37,8 @@ limitations under the License.
 #ifdef TF_USE_SNAPPY
 #include "snappy.h"
 #endif
-#if (defined(__APPLE__) && defined(__MACH__)) || defined(__FreeBSD__) \
-	|| defined(__HAIKU__)
+#if (defined(__APPLE__) && defined(__MACH__)) || defined(__FreeBSD__) || \
+    defined(__HAIKU__)
 #include <thread>
 #endif
 
@@ -62,8 +62,8 @@ int NumSchedulableCPUs() {
   }
   perror("sched_getaffinity");
 #endif
-#if (defined(__APPLE__) && defined(__MACH__)) || defined(__FreeBSD__) \
-	|| defined(__HAIKU__)
+#if (defined(__APPLE__) && defined(__MACH__)) || defined(__FreeBSD__) || \
+    defined(__HAIKU__)
   unsigned int count = std::thread::hardware_concurrency();
   if (count > 0) return static_cast<int>(count);
 #endif
diff --git a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc
index fb1955edde2abfd3fe5267e1319ea128138ee092..12dc9c58b38d01f6efc5644193fbf38b0e70c8d1 100644
--- a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc
+++ b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc
@@ -118,9 +118,10 @@ int64 AndroidArmV7ACpuUtilsHelper::ReadCpuFrequencyFile(
   const int retval = fscanf(fp, "%lld", &freq_in_khz);
   if (retval < 0) {
     LOG(WARNING) << "Failed to \"" << file_path << "\"";
+    fclose(fp);
     return INVALID_CPU_FREQUENCY;
   }
-  pclose(fp);
+  fclose(fp);
   return freq_in_khz * 1000;  // The file contains cpu frequency in khz
 }
 
diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc
index 51c85592bf43bdfb68c4ba90d19d28582560d6d4..682ad97eec3b3ffd0c69120e5de359ee50c9048e 100644
--- a/tensorflow/core/platform/s3/s3_file_system.cc
+++ b/tensorflow/core/platform/s3/s3_file_system.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/platform/s3/s3_file_system.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/s3/s3_file_system.h"
 #include "tensorflow/core/platform/s3/s3_crypto.h"
 
 #include <aws/core/Aws.h>
@@ -38,7 +38,7 @@ static const size_t kS3ReadAppendableFileBufferSize = 1024 * 1024;
 static const int kS3GetChildrenMaxKeys = 100;
 
 Aws::Client::ClientConfiguration& GetDefaultClientConfig() {
-  static mutex cfg_lock;
+  static mutex cfg_lock(LINKER_INITIALIZED);
   static bool init(false);
   static Aws::Client::ClientConfiguration cfg;
 
@@ -49,9 +49,15 @@ Aws::Client::ClientConfiguration& GetDefaultClientConfig() {
     if (endpoint) {
       cfg.endpointOverride = Aws::String(endpoint);
     }
-    const char* region = getenv("S3_REGION");
+    const char* region = getenv("AWS_REGION");
     if (region) {
       cfg.region = Aws::String(region);
+    } else {
+      // TODO (yongtang): `S3_REGION` should be deprecated after 2.0.
+      const char* region = getenv("S3_REGION");
+      if (region) {
+        cfg.region = Aws::String(region);
+      }
     }
     const char* use_https = getenv("S3_USE_HTTPS");
     if (use_https) {
diff --git a/tensorflow/core/profiler/README.md b/tensorflow/core/profiler/README.md
index 8ca26fa5dcf944cdd2c106233324c03f38f7a13f..9e628b10651423a7ce05392e675453c87f8b6c8c 100644
--- a/tensorflow/core/profiler/README.md
+++ b/tensorflow/core/profiler/README.md
@@ -48,7 +48,7 @@ bazel-bin/tensorflow/python/profiler/profiler_ui \
 # Create options to profile the time and memory information.
 builder = tf.profiler.ProfileOptionBuilder
 opts = builder(builder.time_and_memory()).order_by('micros').build()
-# Create a profiling context, set constructor argument `trace_steps`, 
+# Create a profiling context, set constructor argument `trace_steps`,
 # `dump_steps` to empty for explicit control.
 with tf.contrib.tfprof.ProfileContext('/tmp/train_dir',
                                       trace_steps=[],
diff --git a/tensorflow/core/profiler/g3doc/options.md b/tensorflow/core/profiler/g3doc/options.md
index 4c73e372e3bd9f24c83bdc0d3b8d98b5f8b03f11..dd12f76d6fa9a71b78a672a687b96a985641283b 100644
--- a/tensorflow/core/profiler/g3doc/options.md
+++ b/tensorflow/core/profiler/g3doc/options.md
@@ -60,11 +60,14 @@ Currently, profiler only tracks the allocation of memory. As a result, the
 accumulated memory request is uaually larger than the peak memory of the overall
 model.
 
-bytes: The memory allocations requested by the operation.
-peak_bytes: The peak requested memory (not de-allocated) by the operation.
-residual_bytes: The memory requested by the operation and not de-allocated
+It's recommended to generate timeline to see the allocator memory usage over
+time.
+
+`bytes`: The memory allocations requested by the operation.
+`peak_bytes`: The peak requested memory (not de-allocated) by the operation.
+`residual_bytes`: The memory requested by the operation and not de-allocated
                 when Compute finishes.
-output_bytes: The memory output by the operation. It's not necessarily requested
+`output_bytes`: The memory output by the operation. It's not necessarily requested
               by the current operation. For example, it can be a tensor
               forwarded from input to output, with in-place mutation.
 
diff --git a/tensorflow/core/profiler/internal/tfprof_node.cc b/tensorflow/core/profiler/internal/tfprof_node.cc
index 671b65d708f57713d984331de73ddf305675b792..2945c9510f1c91474a0a998541e394143a0490be 100644
--- a/tensorflow/core/profiler/internal/tfprof_node.cc
+++ b/tensorflow/core/profiler/internal/tfprof_node.cc
@@ -80,10 +80,15 @@ void ExecStep::AddTimeStats(const string& dev, const NodeExecStats& step_stat) {
 
 void ExecStep::AddMemoryStats(const string& dev,
                               const NodeExecStats& step_stat) {
-  if (exec_.memory_intialized()) {
+  ExecMemory exec_mem;
+  if (step_stat.all_start_micros() > 0) {
+    exec_mem.set_memory_micros(step_stat.all_start_micros() +
+                               step_stat.op_end_rel_micros());
+  } else {
+    fprintf(stderr, "%s has no start time, skipping\n",
+            step_stat.node_name().c_str());
     return;
   }
-  exec_.set_memory_intialized(true);
 
   int accelerator_allocator_cnt = 0;
   for (const auto& mem : step_stat.memory()) {
@@ -93,14 +98,12 @@ void ExecStep::AddMemoryStats(const string& dev,
       continue;
     }
     ++accelerator_allocator_cnt;
-    exec_.set_allocator_bytes_in_use(
-        std::max(static_cast<int64>(exec_.allocator_bytes_in_use()),
+    exec_mem.set_allocator_bytes_in_use(
+        std::max(static_cast<int64>(exec_mem.allocator_bytes_in_use()),
                  static_cast<int64>(mem.allocator_bytes_in_use())));
-    Allocation allocation;
     for (const auto& alloc : mem.allocation_records()) {
-      allocation.add_allocation_records()->MergeFrom(alloc);
+      allocations_.push_back(alloc);
     }
-    allocations_.push_back(allocation);
   }
   if (accelerator_allocator_cnt > 1) {
     fprintf(stderr, "found %d gpu allocator for 1 node\n",
@@ -121,24 +124,47 @@ void ExecStep::AddMemoryStats(const string& dev,
       uint64 output_ptr =
           output.tensor_description().allocation_description().ptr();
       total_output_bytes += output_bytes;
-      output_memory_[output.slot()] = std::make_pair(output_bytes, output_ptr);
+
+      auto& mem = (*exec_mem.mutable_output_memory())[output.slot()];
+      mem.set_ptr(output_ptr);
+      mem.set_bytes(output_bytes);
     }
   }
-  exec_.set_output_bytes(total_output_bytes);
+  exec_mem.set_output_bytes(total_output_bytes);
 
   if (step_stat.has_memory_stats()) {
-    exec_.set_host_temp_bytes(exec_.host_temp_bytes() +
-                              step_stat.memory_stats().host_temp_memory_size());
-    exec_.set_host_persistent_bytes(
-        exec_.host_persistent_bytes() +
+    exec_mem.set_host_temp_bytes(
+        exec_mem.host_temp_bytes() +
+        step_stat.memory_stats().host_temp_memory_size());
+    exec_mem.set_host_persistent_bytes(
+        exec_mem.host_persistent_bytes() +
         step_stat.memory_stats().host_persistent_memory_size());
-    exec_.set_accelerator_temp_bytes(
-        exec_.accelerator_temp_bytes() +
+    exec_mem.set_accelerator_temp_bytes(
+        exec_mem.accelerator_temp_bytes() +
         step_stat.memory_stats().device_temp_memory_size());
-    exec_.set_accelerator_persistent_bytes(
-        exec_.accelerator_persistent_bytes() +
+    exec_mem.set_accelerator_persistent_bytes(
+        exec_mem.accelerator_persistent_bytes() +
         step_stat.memory_stats().device_persistent_memory_size());
   }
+
+  // TODO(xpan): Make this more accurate:
+  // High level: Memory tracking is suspicous and requires large scale
+  // clean up.
+  // Investigte the memory usage difference between CPU/GPU with OpViewTest.
+  //
+  // 1. OpKernelConstruction::allocate_xxx is not traced. Below, we only
+  //    discuss OpKernelContext-related allocations.
+  // 2. allocate_output calls allocate_tensor, which is properly tracked in
+  //    'NodeExecStats.memory'.
+  // 3. allocate_temp is only tracked through record_xxx_temp. It appears
+  //    in 'NodeExecStats.memory_stats'.
+  // 4. allocate_persistent calls allocate_tensor, which is properly tracked
+  //    in 'NodeExecStats.memory'. However, there is no way to count it as
+  //    persistent now.
+  // 5. record_xxx_persistent is called when allocate_persistent
+  //    is not used and hence tracks some complementary bytes. It appears in
+  //    'NodeExecStats.memory_stats'. It's suspicious. But we should
+  //    use it now since it covers constant op.
   int64 residual_bytes = 0;
   int64 requested_bytes = 0;
   int64 peak_bytes = 0;
@@ -147,9 +173,20 @@ void ExecStep::AddMemoryStats(const string& dev,
     requested_bytes += mem.total_bytes();
     peak_bytes += mem.peak_bytes();
   }
-  exec_.set_requested_bytes(requested_bytes);
-  exec_.set_residual_bytes(residual_bytes);
-  exec_.set_peak_bytes(peak_bytes);
+  residual_bytes += exec_mem.host_persistent_bytes() +
+                    exec_mem.accelerator_persistent_bytes();
+  requested_bytes += exec_mem.host_persistent_bytes() +
+                     exec_mem.accelerator_persistent_bytes() +
+                     exec_mem.host_temp_bytes() +
+                     exec_mem.accelerator_temp_bytes();
+  peak_bytes += exec_mem.host_persistent_bytes() +
+                exec_mem.accelerator_persistent_bytes() +
+                exec_mem.host_temp_bytes() + exec_mem.accelerator_temp_bytes();
+
+  exec_mem.set_requested_bytes(requested_bytes);
+  exec_mem.set_residual_bytes(residual_bytes);
+  exec_mem.set_peak_bytes(peak_bytes);
+  memory_execs_.emplace_back(exec_mem);
 }
 
 void TFGraphNode::AddStepStat(int64 step, const string& device,
@@ -251,5 +288,8 @@ bool IsPlacedOnAccelerator(const string& device) {
   return device.find("gpu") != device.npos ||
          device.find("sycl") != device.npos;
 }
+bool IsPlacedOnCPU(const string& device) {
+  return device.find("cpu") != device.npos;
+}
 }  // namespace tfprof
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/internal/tfprof_node.h b/tensorflow/core/profiler/internal/tfprof_node.h
index e2d0563a0747d7bec74ce3aeb9d5995f47cff915..5bc2ea3c42210991a01aea1ea731aa3b4da83acc 100644
--- a/tensorflow/core/profiler/internal/tfprof_node.h
+++ b/tensorflow/core/profiler/internal/tfprof_node.h
@@ -109,7 +109,6 @@ class ExecStep {
       const {
     return cpu_execs_;
   }
-
   int64 all_start_micros() const { return exec_.all_start_micros(); }
   int64 latest_end_micros() const { return exec_.latest_end_micros(); }
   int64 lastest_schedule_end_micros() const {
@@ -121,27 +120,73 @@ class ExecStep {
     }
     return ret;
   }
-
-  int64 requested_bytes() const { return exec_.requested_bytes(); }
-  int64 peak_bytes() const { return exec_.peak_bytes(); }
-  int64 residual_bytes() const { return exec_.residual_bytes(); }
-  int64 output_bytes() const { return exec_.output_bytes(); }
+  int64 requested_bytes() const {
+    int64 requested_bytes = 0;
+    for (const ExecMemory& exec : memory_execs_) {
+      requested_bytes += exec.requested_bytes();
+    }
+    return requested_bytes;
+  }
+  int64 peak_bytes() const {
+    int64 peak_bytes = 0;
+    for (const ExecMemory& exec : memory_execs_) {
+      peak_bytes += exec.peak_bytes();
+    }
+    return peak_bytes;
+  }
+  int64 residual_bytes() const {
+    int64 residual_bytes = 0;
+    for (const ExecMemory& exec : memory_execs_) {
+      residual_bytes += exec.residual_bytes();
+    }
+    return residual_bytes;
+  }
+  int64 output_bytes() const {
+    int64 output_bytes = 0;
+    for (const ExecMemory& exec : memory_execs_) {
+      output_bytes += exec.output_bytes();
+    }
+    return output_bytes;
+  }
   int64 accelerator_temp_bytes() const {
-    return exec_.accelerator_temp_bytes();
+    int64 accelerator_temp_bytes = 0;
+    for (const ExecMemory& exec : memory_execs_) {
+      accelerator_temp_bytes += exec.accelerator_temp_bytes();
+    }
+    return accelerator_temp_bytes;
+  }
+  int64 host_temp_bytes() const {
+    int64 host_temp_bytes = 0;
+    for (const ExecMemory& exec : memory_execs_) {
+      host_temp_bytes += exec.host_temp_bytes();
+    }
+    return host_temp_bytes;
   }
-  int64 host_temp_bytes() const { return exec_.host_temp_bytes(); }
   int64 accelerator_persistent_bytes() const {
-    return exec_.accelerator_persistent_bytes();
+    int64 accelerator_persistent_bytes = 0;
+    for (const ExecMemory& exec : memory_execs_) {
+      accelerator_persistent_bytes += exec.accelerator_persistent_bytes();
+    }
+    return accelerator_persistent_bytes;
   }
-  int64 host_persistent_bytes() const { return exec_.host_persistent_bytes(); }
-  const std::map<int32, std::pair<int64, uint64>>& output_memory() const {
-    return output_memory_;
+  int64 host_persistent_bytes() const {
+    int64 host_persistent_bytes = 0;
+    for (const ExecMemory& exec : memory_execs_) {
+      host_persistent_bytes += exec.host_persistent_bytes();
+    }
+    return host_persistent_bytes;
   }
-  int64 allocator_bytes_in_use() const {
-    return exec_.allocator_bytes_in_use();
+  std::map<int64, int64> allocator_bytes_in_use() const {
+    std::map<int64, int64> bytes_in_use;
+    for (const ExecMemory& exec : memory_execs_) {
+      bytes_in_use[exec.memory_micros()] = exec.allocator_bytes_in_use();
+    }
+    return bytes_in_use;
   }
 
-  const std::vector<Allocation>& allocations() const { return allocations_; }
+  const std::vector<AllocationRecord>& allocations() const {
+    return allocations_;
+  }
 
   const ExecProfile& ToProto() {
     exec_.mutable_accelerator_execs()->clear();
@@ -169,19 +214,15 @@ class ExecStep {
     for (const string& d : devices_) {
       exec_.add_devices(d);
     }
-
-    exec_.mutable_output_memory()->clear();
-    for (const auto& mem : output_memory_) {
-      auto& mem_pb = (*exec_.mutable_output_memory())[mem.first];
-      mem_pb.set_bytes(mem.second.first);
-      mem_pb.set_ptr(mem.second.second);
-    }
-
     exec_.mutable_allocations()->Clear();
     for (const auto& r : allocations_) {
       exec_.add_allocations()->MergeFrom(r);
     }
 
+    exec_.mutable_memory_execs()->Clear();
+    for (const auto& m : memory_execs_) {
+      exec_.add_memory_execs()->MergeFrom(m);
+    }
     return exec_;
   }
 
@@ -197,6 +238,7 @@ class ExecStep {
     op_execs_.clear();
 
     allocations_.clear();
+    memory_execs_.clear();
 
     for (const auto& exec_time : exec_.accelerator_execs()) {
       auto& exec = accelerator_execs_[exec_time.first];
@@ -214,15 +256,12 @@ class ExecStep {
         op_exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
       }
     }
-    for (const auto& output_mem : exec_.output_memory()) {
-      auto& mem = output_memory_[output_mem.first];
-      mem.first = output_mem.second.bytes();
-      mem.second = output_mem.second.ptr();
-    }
-
     for (const auto& r : exec_.allocations()) {
       allocations_.push_back(r);
     }
+    for (const auto& m : exec_.memory_execs()) {
+      memory_execs_.push_back(m);
+    }
   }
 
  private:
@@ -237,14 +276,15 @@ class ExecStep {
   std::map<string, std::vector<std::pair<int64, int64>>> cpu_execs_;
   // combines accelerator_execs_ and cpu_execs_.
   std::map<string, std::vector<std::pair<int64, int64>>> op_execs_;
+  // Each ExecMemory corresponds to one scheduling of the op. Normally,
+  // there are multiple schedulings in while_loop.
+  std::vector<ExecMemory> memory_execs_;
   // All devices the op is associated with (e.g. gpu:0 (scheduling),
   // gpu:0:stream:xx (kernel exec), cpu:0 host)
   std::set<string> devices_;
-  // output_idx -> {output_bytes, memory_ptr}
-  std::map<int32, std::pair<int64, uint64>> output_memory_;
 
   // The history of accelerator allocations and deallocations of this step.
-  std::vector<Allocation> allocations_;
+  std::vector<AllocationRecord> allocations_;
 };
 
 #define GRAPH_NODE_BYTES(type)             \
@@ -593,34 +633,20 @@ class TFGraphNode {
   int64 accelerator_persistent_bytes() const {
     int64 persistent_bytes = 0;
     for (const auto& exec : execs_) {
-      persistent_bytes += exec.second.accelerator_persistent_bytes();
+      persistent_bytes = std::max(persistent_bytes,
+                                  exec.second.accelerator_persistent_bytes());
     }
     return persistent_bytes;
   }
-  int64 host_persistent_bytes(int64 step) const {
+  const std::map<int64, int64> allocator_bytes_in_use(int64 step) const {
     auto exec = execs_.find(step);
     if (exec == execs_.end()) {
-      return 0;
-    }
-    return exec->second.host_persistent_bytes();
-  }
-  const std::map<int32, std::pair<int64, uint64>>& output_memory(
-      int64 step) const {
-    auto exec = execs_.find(step);
-    if (exec == execs_.end()) {
-      return empty_output_memory_;
-    }
-    return exec->second.output_memory();
-  }
-  int64 allocator_bytes_in_use(int64 step) const {
-    auto exec = execs_.find(step);
-    if (exec == execs_.end()) {
-      return 0;
+      return empty_bytes_in_use_;
     }
     return exec->second.allocator_bytes_in_use();
   }
 
-  const std::vector<Allocation>& allocations(int64 step) const {
+  const std::vector<AllocationRecord>& allocations(int64 step) const {
     auto exec = execs_.find(step);
     if (exec == execs_.end()) {
       return empty_allocations_;
@@ -725,9 +751,9 @@ class TFGraphNode {
   std::map<int64, ExecStep> execs_;
 
   // Placeholder for empty cases.
-  std::map<int32, std::pair<int64, uint64>> empty_output_memory_;
+  std::map<int64, int64> empty_bytes_in_use_;
   std::map<string, std::vector<std::pair<int64, int64>>> empty_execs_;
-  std::vector<Allocation> empty_allocations_;
+  std::vector<AllocationRecord> empty_allocations_;
 };
 
 class TFMultiGraphNode {
@@ -880,6 +906,7 @@ class TFMultiGraphNode {
   std::map<string, const TFGraphNode*> nodes_;
 };
 
+bool IsPlacedOnCPU(const string& device);
 bool IsPlacedOnAccelerator(const string& device);
 bool CountAsAcceleratorTime(const string& device);
 bool CountAsCPUTime(const string& device);
diff --git a/tensorflow/core/profiler/internal/tfprof_show_test.cc b/tensorflow/core/profiler/internal/tfprof_show_test.cc
index 1f19f8c322a15a726ce354ecf991ea902788d97b..98773ae19ea424fc1d3ca01572d9535367a41321 100644
--- a/tensorflow/core/profiler/internal/tfprof_show_test.cc
+++ b/tensorflow/core/profiler/internal/tfprof_show_test.cc
@@ -105,12 +105,13 @@ TEST_F(TFProfShowTest, DumpScopeMode) {
       "node name | # parameters | # float_ops | requested bytes | peak bytes | "
       "residual bytes | output bytes | total execution time | accelerator "
       "execution time | cpu execution time\n_TFProfRoot (--/451 params, --/0 "
-      "flops, --/0B, --/0B, --/0B, --/2.56KB, --/13us, --/0us, --/13us)\n  DW "
-      "(3x3x3x6, 162/162 params, 0/0 flops, 0B/0B, 0B/0B, 0B/0B, "
-      "1.28KB/1.28KB, 2us/2us, 0us/0us, 2us/2us)\n  DW2 (2x2x6x12, 288/288 "
-      "params, 0/0 flops, 0B/0B, 0B/0B, 0B/0B, 1.28KB/1.28KB, 11us/11us, "
-      "0us/0us, 11us/11us)\n  ScalarW (1, 1/1 params, 0/0 flops, 0B/0B, 0B/0B, "
-      "0B/0B, 0B/0B, 0us/0us, 0us/0us, 0us/0us)\n",
+      "flops, --/2.56KB, --/2.56KB, --/2.56KB, --/2.56KB, --/13us, --/0us, "
+      "--/13us)\n  DW (3x3x3x6, 162/162 params, 0/0 flops, 1.28KB/1.28KB, "
+      "1.28KB/1.28KB, 1.28KB/1.28KB, 1.28KB/1.28KB, 2us/2us, 0us/0us, "
+      "2us/2us)\n  DW2 (2x2x6x12, 288/288 params, 0/0 flops, 1.28KB/1.28KB, "
+      "1.28KB/1.28KB, 1.28KB/1.28KB, 1.28KB/1.28KB, 11us/11us, 0us/0us, "
+      "11us/11us)\n  ScalarW (1, 1/1 params, 0/0 flops, 0B/0B, 0B/0B, 0B/0B, "
+      "0B/0B, 0us/0us, 0us/0us, 0us/0us)\n",
       dump_str);
 
   EXPECT_EQ(dump_str, TestToFromProto("scope", opts));
@@ -178,22 +179,22 @@ TEST_F(TFProfShowTest, DumpOpMode) {
   EXPECT_EQ(
       "nodename|requestedbytes|totalexecutiontime|acceleratorexecutiontime|"
       "cpuexecutiontime|#parameters|#float_ops|opoccurrence(run|defined)|"
-      "inputshapes\nVariableV20B(0.00%,0.00%),13us(100.00%,0.26%),0us(100.00%,"
-      "0.00%),13us(100.00%,0.29%),451params(100.00%,100.00%),0float_ops(100.00%"
-      ",0.00%),2|3\n\ninput_type:\t(run*2|defined*3)\texec_time:13us\n\nAdd0B("
-      "0.00%,0.00%),0us(99.74%,0.00%),0us(100.00%,0.00%),0us(99.71%,0.00%),"
-      "0params(0.00%,0.00%),0float_ops(100.00%,0.00%),0|3\n\ninput_type:0:1,"
-      "\t1:1\t(run*0|defined*1)\texec_time:0us\ninput_type:0:2x2x6x12,\t1:1\t("
-      "run*0|defined*1)\texec_time:0us\ninput_type:0:3x3x3x6,\t1:1\t(run*0|"
-      "defined*1)\texec_time:0us\n\nAssign0B(0.00%,0.00%),0us(99.74%,0.00%),"
-      "0us(100.00%,0.00%),0us(99.71%,0.00%),0params(0.00%,0.00%),0float_ops("
-      "100.00%,0.00%),0|3\n\ninput_type:0:1,\t1:1\t(run*0|defined*1)\texec_"
+      "inputshapes\nVariableV22.56KB(100.00%,8.40%),13us(100.00%,0.26%),0us("
+      "100.00%,0.00%),13us(100.00%,0.29%),451params(100.00%,100.00%),0float_"
+      "ops(100.00%,0.00%),2|3\n\ninput_type:\t(run*2|defined*3)\texec_time:"
+      "13us\n\nAdd0B(0.00%,0.00%),0us(99.74%,0.00%),0us(100.00%,0.00%),0us(99."
+      "71%,0.00%),0params(0.00%,0.00%),0float_ops(100.00%,0.00%),0|3\n\ninput_"
+      "type:0:1,\t1:1\t(run*0|defined*1)\texec_time:0us\ninput_type:0:2x2x6x12,"
+      "\t1:1\t(run*0|defined*1)\texec_time:0us\ninput_type:0:3x3x3x6,\t1:1\t("
+      "run*0|defined*1)\texec_time:0us\n\nAssign0B(0.00%,0.00%),0us(99.74%,0."
+      "00%),0us(100.00%,0.00%),0us(99.71%,0.00%),0params(0.00%,0.00%),0float_"
+      "ops(100.00%,0.00%),0|3\n\ninput_type:0:1,\t1:1\t(run*0|defined*1)\texec_"
       "time:0us\ninput_type:0:2x2x6x12,\t1:2x2x6x12\t(run*0|defined*1)\texec_"
       "time:0us\ninput_type:0:3x3x3x6,\t1:3x3x3x6\t(run*0|defined*1)\texec_"
       "time:0us\n\nConst0B(0.00%,0.00%),2us(99.74%,0.04%),0us(100.00%,0.00%),"
       "2us(99.71%,0.04%),0params(0.00%,0.00%),0float_ops(100.00%,0.00%),1|"
-      "10\n\ninput_type:\t(run*1|defined*10)\texec_time:2us\n\nConv2D14.59KB("
-      "100.00%,100.00%),4.89ms(99.70%,98.87%),404us(100.00%,100.00%),4.49ms(99."
+      "10\n\ninput_type:\t(run*1|defined*10)\texec_time:2us\n\nConv2D27.90KB("
+      "91.60%,91.60%),4.89ms(99.70%,98.87%),404us(100.00%,100.00%),4.49ms(99."
       "67%,98.77%),0params(0.00%,0.00%),10.44kfloat_ops(100.00%,100.00%),2|"
       "2\n\ninput_type:0:2x3x3x6,\t1:2x2x6x12\t(run*1|defined*1)\texec_time:"
       "597us\ninput_type:0:2x6x6x3,\t1:3x3x3x6\t(run*1|defined*1)\texec_time:4."
diff --git a/tensorflow/core/profiler/internal/tfprof_stats_test.cc b/tensorflow/core/profiler/internal/tfprof_stats_test.cc
index 2f2101d76bfd4c0741fff0eb9762444cd8b6fd92..b86a83cb1bb5fd42437692ea9aec240275c26ed8 100644
--- a/tensorflow/core/profiler/internal/tfprof_stats_test.cc
+++ b/tensorflow/core/profiler/internal/tfprof_stats_test.cc
@@ -89,21 +89,27 @@ TEST_F(TFProfStatsTest, CustomOpType) {
 
   GraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
-      "name: \"_TFProfRoot\"\ntotal_exec_micros: 13\ntotal_parameters: "
-      "451\nchildren {\n  name: \"DW\"\n  exec_micros: 2\n  parameters: 162\n  "
-      "total_exec_micros: 2\n  total_parameters: 162\n  devices: "
+      "name: \"_TFProfRoot\"\ntotal_exec_micros: 13\ntotal_requested_bytes: "
+      "2560\ntotal_parameters: 451\nchildren {\n  name: \"DW\"\n  exec_micros: "
+      "2\n  requested_bytes: 1280\n  parameters: 162\n  total_exec_micros: 2\n "
+      " total_requested_bytes: 1280\n  total_parameters: 162\n  devices: "
       "\"/job:localhost/replica:0/task:0/gpu:0\"\n  cpu_exec_micros: 2\n  "
       "total_cpu_exec_micros: 2\n  run_count: 1\n  total_run_count: 1\n  "
-      "total_definition_count: 1\n  output_bytes: 1280\n  total_output_bytes: "
-      "1280\n}\nchildren {\n  name: \"DW2\"\n  exec_micros: 11\n  parameters: "
-      "288\n  total_exec_micros: 11\n  total_parameters: 288\n  devices: "
+      "total_definition_count: 1\n  peak_bytes: 1280\n  residual_bytes: 1280\n "
+      " output_bytes: 1280\n  total_peak_bytes: 1280\n  total_residual_bytes: "
+      "1280\n  total_output_bytes: 1280\n}\nchildren {\n  name: \"DW2\"\n  "
+      "exec_micros: 11\n  requested_bytes: 1280\n  parameters: 288\n  "
+      "total_exec_micros: 11\n  total_requested_bytes: 1280\n  "
+      "total_parameters: 288\n  devices: "
       "\"/job:localhost/replica:0/task:0/gpu:0\"\n  cpu_exec_micros: 11\n  "
       "total_cpu_exec_micros: 11\n  run_count: 1\n  total_run_count: 1\n  "
-      "total_definition_count: 1\n  output_bytes: 1280\n  total_output_bytes: "
-      "1280\n}\nchildren {\n  name: \"ScalarW\"\n  parameters: 1\n  "
-      "total_parameters: 1\n  total_definition_count: "
+      "total_definition_count: 1\n  peak_bytes: 1280\n  residual_bytes: 1280\n "
+      " output_bytes: 1280\n  total_peak_bytes: 1280\n  total_residual_bytes: "
+      "1280\n  total_output_bytes: 1280\n}\nchildren {\n  name: \"ScalarW\"\n  "
+      "parameters: 1\n  total_parameters: 1\n  total_definition_count: "
       "1\n}\ntotal_cpu_exec_micros: 13\ntotal_run_count: "
-      "2\ntotal_definition_count: 3\ntotal_output_bytes: 2560\n",
+      "2\ntotal_definition_count: 3\ntotal_peak_bytes: "
+      "2560\ntotal_residual_bytes: 2560\ntotal_output_bytes: 2560\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
 
@@ -119,21 +125,27 @@ TEST_F(TFProfStatsTest, CheckPointOpType) {
 
   GraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
-      "name: \"_TFProfRoot\"\ntotal_exec_micros: 13\ntotal_parameters: "
-      "451\nchildren {\n  name: \"DW\"\n  exec_micros: 2\n  parameters: 162\n  "
-      "total_exec_micros: 2\n  total_parameters: 162\n  devices: "
+      "name: \"_TFProfRoot\"\ntotal_exec_micros: 13\ntotal_requested_bytes: "
+      "2560\ntotal_parameters: 451\nchildren {\n  name: \"DW\"\n  exec_micros: "
+      "2\n  requested_bytes: 1280\n  parameters: 162\n  total_exec_micros: 2\n "
+      " total_requested_bytes: 1280\n  total_parameters: 162\n  devices: "
       "\"/job:localhost/replica:0/task:0/gpu:0\"\n  cpu_exec_micros: 2\n  "
       "total_cpu_exec_micros: 2\n  run_count: 1\n  total_run_count: 1\n  "
-      "total_definition_count: 1\n  output_bytes: 1280\n  total_output_bytes: "
-      "1280\n}\nchildren {\n  name: \"DW2\"\n  exec_micros: 11\n  parameters: "
-      "288\n  total_exec_micros: 11\n  total_parameters: 288\n  devices: "
+      "total_definition_count: 1\n  peak_bytes: 1280\n  residual_bytes: 1280\n "
+      " output_bytes: 1280\n  total_peak_bytes: 1280\n  total_residual_bytes: "
+      "1280\n  total_output_bytes: 1280\n}\nchildren {\n  name: \"DW2\"\n  "
+      "exec_micros: 11\n  requested_bytes: 1280\n  parameters: 288\n  "
+      "total_exec_micros: 11\n  total_requested_bytes: 1280\n  "
+      "total_parameters: 288\n  devices: "
       "\"/job:localhost/replica:0/task:0/gpu:0\"\n  cpu_exec_micros: 11\n  "
       "total_cpu_exec_micros: 11\n  run_count: 1\n  total_run_count: 1\n  "
-      "total_definition_count: 1\n  output_bytes: 1280\n  total_output_bytes: "
-      "1280\n}\nchildren {\n  name: \"ScalarW\"\n  parameters: 1\n  "
-      "total_parameters: 1\n  total_definition_count: "
+      "total_definition_count: 1\n  peak_bytes: 1280\n  residual_bytes: 1280\n "
+      " output_bytes: 1280\n  total_peak_bytes: 1280\n  total_residual_bytes: "
+      "1280\n  total_output_bytes: 1280\n}\nchildren {\n  name: \"ScalarW\"\n  "
+      "parameters: 1\n  total_parameters: 1\n  total_definition_count: "
       "1\n}\ntotal_cpu_exec_micros: 13\ntotal_run_count: "
-      "2\ntotal_definition_count: 3\ntotal_output_bytes: 2560\n",
+      "2\ntotal_definition_count: 3\ntotal_peak_bytes: "
+      "2560\ntotal_residual_bytes: 2560\ntotal_output_bytes: 2560\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
 
@@ -150,7 +162,7 @@ TEST_F(TFProfStatsTest, TestGraph) {
   GraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\ntotal_exec_micros: 4945\ntotal_requested_bytes: "
-      "14592\ntotal_parameters: 451\nchildren {\n  name: "
+      "30464\ntotal_parameters: 451\nchildren {\n  name: "
       "\"DW/Initializer/random_normal/mul\"\n  children {\n    name: "
       "\"DW/Initializer/random_normal/RandomStandardNormal\"\n    children {\n "
       "     name: \"DW/Initializer/random_normal/shape\"\n      "
@@ -166,7 +178,7 @@ TEST_F(TFProfStatsTest, TestGraph) {
       "4\n}\ntotal_float_ops: 10440\ntotal_accelerator_exec_micros: "
       "404\ntotal_cpu_exec_micros: 4541\ntotal_run_count: "
       "6\ntotal_definition_count: 32\ntotal_peak_bytes: "
-      "9984\ntotal_residual_bytes: 1280\ntotal_output_bytes: 4864\n",
+      "25856\ntotal_residual_bytes: 3840\ntotal_output_bytes: 4864\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
 
@@ -181,9 +193,9 @@ TEST_F(TFProfStatsTest, TestFloatOps) {
   GraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\ntotal_exec_micros: 4945\ntotal_requested_bytes: "
-      "14592\ntotal_parameters: 451\nchildren {\n  name: \"Conv2D\"\n  "
-      "exec_micros: 4292\n  requested_bytes: 9472\n  total_exec_micros: 4292\n "
-      " total_requested_bytes: 9472\n  devices: "
+      "30464\ntotal_parameters: 451\nchildren {\n  name: \"Conv2D\"\n  "
+      "exec_micros: 4292\n  requested_bytes: 18176\n  total_exec_micros: "
+      "4292\n  total_requested_bytes: 18176\n  devices: "
       "\"/job:localhost/replica:0/task:0/gpu:0\"\n  float_ops: 5832\n  "
       "total_float_ops: 5832\n  input_shapes {\n    key: 0\n    value {\n      "
       "dim {\n        size: 2\n      }\n      dim {\n        size: 6\n      "
@@ -194,11 +206,11 @@ TEST_F(TFProfStatsTest, TestFloatOps) {
       "6\n      }\n    }\n  }\n  accelerator_exec_micros: 226\n  "
       "cpu_exec_micros: 4066\n  total_accelerator_exec_micros: 226\n  "
       "total_cpu_exec_micros: 4066\n  run_count: 1\n  total_run_count: 1\n  "
-      "total_definition_count: 1\n  peak_bytes: 5888\n  residual_bytes: 768\n  "
-      "output_bytes: 768\n  total_peak_bytes: 5888\n  total_residual_bytes: "
+      "total_definition_count: 1\n  peak_bytes: 14592\n  residual_bytes: 768\n "
+      " output_bytes: 768\n  total_peak_bytes: 14592\n  total_residual_bytes: "
       "768\n  total_output_bytes: 768\n}\nchildren {\n  name: \"Conv2D_1\"\n  "
-      "exec_micros: 597\n  requested_bytes: 5120\n  total_exec_micros: 597\n  "
-      "total_requested_bytes: 5120\n  devices: "
+      "exec_micros: 597\n  requested_bytes: 9728\n  total_exec_micros: 597\n  "
+      "total_requested_bytes: 9728\n  devices: "
       "\"/job:localhost/replica:0/task:0/gpu:0\"\n  float_ops: 4608\n  "
       "total_float_ops: 4608\n  input_shapes {\n    key: 0\n    value {\n      "
       "dim {\n        size: 2\n      }\n      dim {\n        size: 3\n      "
@@ -209,12 +221,12 @@ TEST_F(TFProfStatsTest, TestFloatOps) {
       "12\n      }\n    }\n  }\n  accelerator_exec_micros: 178\n  "
       "cpu_exec_micros: 419\n  total_accelerator_exec_micros: 178\n  "
       "total_cpu_exec_micros: 419\n  run_count: 1\n  total_run_count: 1\n  "
-      "total_definition_count: 1\n  peak_bytes: 4096\n  residual_bytes: 512\n  "
-      "output_bytes: 512\n  total_peak_bytes: 4096\n  total_residual_bytes: "
+      "total_definition_count: 1\n  peak_bytes: 8704\n  residual_bytes: 512\n  "
+      "output_bytes: 512\n  total_peak_bytes: 8704\n  total_residual_bytes: "
       "512\n  total_output_bytes: 512\n}\ntotal_float_ops: "
       "10440\ntotal_accelerator_exec_micros: 404\ntotal_cpu_exec_micros: "
       "4541\ntotal_run_count: 6\ntotal_definition_count: 35\ntotal_peak_bytes: "
-      "9984\ntotal_residual_bytes: 1280\ntotal_output_bytes: 4864\n",
+      "25856\ntotal_residual_bytes: 3840\ntotal_output_bytes: 4864\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
 
@@ -231,9 +243,9 @@ TEST_F(TFProfStatsTest, TestAccountShownNameOnly) {
   GraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\ntotal_exec_micros: 597\ntotal_requested_bytes: "
-      "5120\nchildren {\n  name: \"Conv2D_1\"\n  exec_micros: 597\n  "
-      "requested_bytes: 5120\n  total_exec_micros: 597\n  "
-      "total_requested_bytes: 5120\n  devices: "
+      "9728\nchildren {\n  name: \"Conv2D_1\"\n  exec_micros: 597\n  "
+      "requested_bytes: 9728\n  total_exec_micros: 597\n  "
+      "total_requested_bytes: 9728\n  devices: "
       "\"/job:localhost/replica:0/task:0/gpu:0\"\n  float_ops: 4608\n  "
       "total_float_ops: 4608\n  input_shapes {\n    key: 0\n    value {\n      "
       "dim {\n        size: 2\n      }\n      dim {\n        size: 3\n      "
@@ -244,12 +256,12 @@ TEST_F(TFProfStatsTest, TestAccountShownNameOnly) {
       "12\n      }\n    }\n  }\n  accelerator_exec_micros: 178\n  "
       "cpu_exec_micros: 419\n  total_accelerator_exec_micros: 178\n  "
       "total_cpu_exec_micros: 419\n  run_count: 1\n  total_run_count: 1\n  "
-      "total_definition_count: 1\n  peak_bytes: 4096\n  residual_bytes: 512\n  "
-      "output_bytes: 512\n  total_peak_bytes: 4096\n  total_residual_bytes: "
+      "total_definition_count: 1\n  peak_bytes: 8704\n  residual_bytes: 512\n  "
+      "output_bytes: 512\n  total_peak_bytes: 8704\n  total_residual_bytes: "
       "512\n  total_output_bytes: 512\n}\ntotal_float_ops: "
       "4608\ntotal_accelerator_exec_micros: 178\ntotal_cpu_exec_micros: "
       "419\ntotal_run_count: 1\ntotal_definition_count: 2\ntotal_peak_bytes: "
-      "4096\ntotal_residual_bytes: 512\ntotal_output_bytes: 512\n",
+      "8704\ntotal_residual_bytes: 512\ntotal_output_bytes: 512\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
 
@@ -265,8 +277,9 @@ TEST_F(TFProfStatsTest, TestShowTensorValue) {
   GraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\ntotal_exec_micros: 4945\ntotal_requested_bytes: "
-      "14592\ntotal_parameters: 451\nchildren {\n  name: \"DW\"\n  "
-      "exec_micros: 2\n  parameters: 162\n  total_exec_micros: 2\n  "
+      "30464\ntotal_parameters: 451\nchildren {\n  name: \"DW\"\n  "
+      "exec_micros: 2\n  requested_bytes: 1280\n  parameters: 162\n  "
+      "total_exec_micros: 2\n  total_requested_bytes: 1280\n  "
       "total_parameters: 162\n  devices: "
       "\"/job:localhost/replica:0/task:0/gpu:0\"\n  tensor_value {\n    dtype: "
       "DT_FLOAT\n    value_double: -0.000534315\n    value_double: "
@@ -351,11 +364,13 @@ TEST_F(TFProfStatsTest, TestShowTensorValue) {
       "value_double: 0.000374641\n    value_double: -0.00149603\n    "
       "value_double: -0.000317367\n    value_double: -0.000417829\n  }\n  "
       "cpu_exec_micros: 2\n  total_cpu_exec_micros: 2\n  run_count: 1\n  "
-      "total_run_count: 1\n  total_definition_count: 10\n  output_bytes: "
-      "1280\n  total_output_bytes: 1280\n}\ntotal_float_ops: "
-      "10440\ntotal_accelerator_exec_micros: 404\ntotal_cpu_exec_micros: "
-      "4541\ntotal_run_count: 6\ntotal_definition_count: 35\ntotal_peak_bytes: "
-      "9984\ntotal_residual_bytes: 1280\ntotal_output_bytes: 4864\n",
+      "total_run_count: 1\n  total_definition_count: 10\n  peak_bytes: 1280\n  "
+      "residual_bytes: 1280\n  output_bytes: 1280\n  total_peak_bytes: 1280\n  "
+      "total_residual_bytes: 1280\n  total_output_bytes: "
+      "1280\n}\ntotal_float_ops: 10440\ntotal_accelerator_exec_micros: "
+      "404\ntotal_cpu_exec_micros: 4541\ntotal_run_count: "
+      "6\ntotal_definition_count: 35\ntotal_peak_bytes: "
+      "25856\ntotal_residual_bytes: 3840\ntotal_output_bytes: 4864\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
 }
diff --git a/tensorflow/core/profiler/internal/tfprof_timeline.cc b/tensorflow/core/profiler/internal/tfprof_timeline.cc
index bdb000747db72900d748c22140ca38e571db6691..b0dd8ce5e0f046325a309060b19467b7c1494568 100644
--- a/tensorflow/core/profiler/internal/tfprof_timeline.cc
+++ b/tensorflow/core/profiler/internal/tfprof_timeline.cc
@@ -153,10 +153,8 @@ void MemoryTracker::TrackNode(int64 step, const GraphNode* node) {
 
   std::map<int64, int64> allocs;
   for (const auto& alloc : node->node->allocations(step)) {
-    for (const auto& r : alloc.allocation_records()) {
-      allocs[r.alloc_micros()] += r.alloc_bytes();
-      dev.tracked_allocations[r.alloc_micros()] += r.alloc_bytes();
-    }
+    allocs[alloc.alloc_micros()] += alloc.alloc_bytes();
+    dev.tracked_allocations[alloc.alloc_micros()] += alloc.alloc_bytes();
   }
   dev.tracked_allocations[0] += node->node->accelerator_persistent_bytes();
   allocs[0] += node->node->accelerator_persistent_bytes();
@@ -167,9 +165,9 @@ void MemoryTracker::TrackNode(int64 step, const GraphNode* node) {
     last += it->second;
     aggregate_allocs[it->first] = last;
   }
-  int64 end_micros = node->node->lastest_schedule_end_micros(step);
-  if (end_micros > 0 && node->node->allocator_bytes_in_use(step) > 0) {
-    dev.allocations[end_micros] = node->node->allocator_bytes_in_use(step);
+  for (const auto& bytes_in_use : node->node->allocator_bytes_in_use(step)) {
+    if (bytes_in_use.first <= 0) continue;
+    dev.allocations[bytes_in_use.first] = bytes_in_use.second;
   }
 }
 
@@ -265,6 +263,10 @@ void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) {
     }
   }
   for (const auto& dev : mem_tracker_.devices()) {
+    if (IsPlacedOnCPU(dev.first)) {
+      // TODO(xpan): Maybe also support CPU allocator memory tracking.
+      continue;
+    }
     int64 pid = AllocatePID();
     chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first), pid);
     int64 pid2 = AllocatePID();
diff --git a/tensorflow/core/profiler/internal/tfprof_timeline_test.cc b/tensorflow/core/profiler/internal/tfprof_timeline_test.cc
index 91eac0cf7617eba54f6938fb893192d2a8fe2eaf..6a7ab01029a4dd1bc26f81b1d3e739812130fcd1 100644
--- a/tensorflow/core/profiler/internal/tfprof_timeline_test.cc
+++ b/tensorflow/core/profiler/internal/tfprof_timeline_test.cc
@@ -71,7 +71,7 @@ TEST_F(TFProfTimelineTest, GraphView) {
 
   string dump_str;
   TF_CHECK_OK(ReadFileToString(Env::Default(), dump_file + "_0", &dump_str));
-  EXPECT_EQ(7932146665024565912ull, Hash64(dump_str));
+  EXPECT_EQ(16556121177519539380ull, Hash64(dump_str));
 }
 
 TEST_F(TFProfTimelineTest, ScopeView) {
diff --git a/tensorflow/core/profiler/tfprof_log.proto b/tensorflow/core/profiler/tfprof_log.proto
index f92301133a3102a2e4233326dd811169e1ecd105..0bf1b477ed855e6ff877faa780d25a08e85ea1e5 100644
--- a/tensorflow/core/profiler/tfprof_log.proto
+++ b/tensorflow/core/profiler/tfprof_log.proto
@@ -90,10 +90,6 @@ message ProfileNode {
   map<int64, ExecProfile> execs = 12;
 }
 
-message Allocation {
-  repeated AllocationRecord allocation_records = 1;
-}
-
 message ExecProfile {
   // Can be larger than 1 if run multiple times in loop.
   int64 run_count = 1;
@@ -110,34 +106,42 @@ message ExecProfile {
   // For cpu, vector size can be larger than 1 if in tf.while_loop.
   map<string, ExecTime> cpu_execs = 5;
 
-  map<int32, Memory> output_memory = 17;
+  // Each entry to memory information of a scheduling of the node.
+  // Normally, there will be multiple entries in while_loop.
+  repeated ExecMemory memory_execs = 7;
+  // The allocation and deallocation times and sizes throughout execution.
+  repeated AllocationRecord allocations = 11;
+  // The devices related to this execution.
+  repeated string devices = 6;
+}
 
-  repeated Allocation allocations = 18;
+message ExecTime {
+  repeated Tuple times = 1;
+}
 
-  repeated string devices = 6;
+message ExecMemory {
+  // This is the timestamp when the memory information was tracked.
+  int64 memory_micros = 1;
+  // NOTE: Please don't depend on the following 4 fields yet. Due to
+  // TensorFlow internal tracing issues, the numbers can be quite wrong.
+  // TODO(xpan): Fix the TensorFlow internal tracing.
+  int64 host_temp_bytes = 2;
+  int64 host_persistent_bytes = 3;
+  int64 accelerator_temp_bytes = 4;
+  int64 accelerator_persistent_bytes = 5;
 
   // Total bytes requested by the op.
-  int64 requested_bytes = 7;
+  int64 requested_bytes = 6;
   // Total bytes requested by the op and released before op end.
-  int64 peak_bytes = 8;
+  int64 peak_bytes = 7;
   // Total bytes requested by the op and not released after op end.
-  int64 residual_bytes = 9;
+  int64 residual_bytes = 8;
   // Total bytes output by the op (not necessarily requested by the op).
-  int64 output_bytes = 10;
-  // Total temporary bytes allocated and released by the op.
-  int64 host_temp_bytes = 11;
-  // Total persistent bytes (e.g. variable) allocated by the op.
-  int64 host_persistent_bytes = 12;
-  int64 accelerator_temp_bytes = 13;
-  int64 accelerator_persistent_bytes = 14;
+  int64 output_bytes = 9;
   // The total number of bytes currently allocated by the allocator if >0.
-  int64 allocator_bytes_in_use = 15;
-
-  bool memory_intialized = 16;
-}
-
-message ExecTime {
-  repeated Tuple times = 1;
+  int64 allocator_bytes_in_use = 10;
+  // The memory of each output of the operation.
+  map<int32, Memory> output_memory = 11;
 }
 
 message Tuple {
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index a956aab3dcaf51c9f5c91784238d36f20948c490..ccab69b9c04cad1fdd95f7ff4304fc60e2f459da 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -42,18 +42,24 @@ message GPUOptions {
   // A comma-separated list of GPU ids that determines the 'visible'
   // to 'virtual' mapping of GPU devices.  For example, if TensorFlow
   // can see 8 GPU devices in the process, and one wanted to map
-  // visible GPU devices 5 and 3 as "/device:GPU:0", and "/device:GPU:1", then one
-  // would specify this field as "5,3".  This field is similar in
+  // visible GPU devices 5 and 3 as "/device:GPU:0", and "/device:GPU:1",
+  // then one would specify this field as "5,3".  This field is similar in
   // spirit to the CUDA_VISIBLE_DEVICES environment variable, except
   // it applies to the visible GPU devices in the process.
   //
-  // NOTE: The GPU driver provides the process with the visible GPUs
-  // in an order which is not guaranteed to have any correlation to
-  // the *physical* GPU id in the machine.  This field is used for
-  // remapping "visible" to "virtual", which means this operates only
-  // after the process starts.  Users are required to use vendor
-  // specific mechanisms (e.g., CUDA_VISIBLE_DEVICES) to control the
-  // physical to visible device mapping prior to invoking TensorFlow.
+  // NOTE:
+  // 1. The GPU driver provides the process with the visible GPUs
+  //    in an order which is not guaranteed to have any correlation to
+  //    the *physical* GPU id in the machine.  This field is used for
+  //    remapping "visible" to "virtual", which means this operates only
+  //    after the process starts.  Users are required to use vendor
+  //    specific mechanisms (e.g., CUDA_VISIBLE_DEVICES) to control the
+  //    physical to visible device mapping prior to invoking TensorFlow.
+  // 2. In the code, the ids in this list are also called "CUDA GPU id"s,
+  //    and the 'virtual' ids of GPU devices (i.e. the ids in the device
+  //    name "/device:GPU:<id>") are also called "TF GPU id"s. Please
+  //    refer to third_party/tensorflow/core/common_runtime/gpu/gpu_id.h
+  //    for more information.
   string visible_device_list = 5;
 
   // In the event polling loop sleep this many microseconds between
@@ -77,6 +83,52 @@ message GPUOptions {
   // memory is unpageable, having too much pinned memory might negatively impact
   // the overall host system performance.
   bool force_gpu_compatible = 8;
+
+  // Everything inside Experimental is subject to change and is not subject
+  // to API stability guarantees in
+  // https://www.tensorflow.org/programmers_guide/version_compat.
+  message Experimental {
+    // Configuration for breaking down a visible GPU into multiple "virtual"
+    // devices.
+    message VirtualDevices {
+      // Per "virtual" device memory limit, in MB. The number of elements in
+      // the list is the number of virtual devices to create on the
+      // corresponding visible GPU (see "virtual_devices" below).
+      // If empty, it will create single virtual device taking all available
+      // memory from the device.
+      //
+      // For the concept of "visible" and "virtual" GPU, see the comments for
+      // "visible_device_list" above for more information.
+      repeated float memory_limit_mb = 1;
+    }
+
+    // The multi virtual device settings. If empty (not set), it will create
+    // single virtual device on each visible GPU, according to the settings
+    // in "visible_device_list" above. Otherwise, the number of elements in the
+    // list must be the same as the number of visible GPUs (after
+    // "visible_device_list" filtering if it is set), and the string represented
+    // device names (e.g. /device:GPU:<id>) will refer to the virtual
+    // devices and have the <id> field assigned sequentially starting from 0,
+    // according to the order they appear in this list and the "memory_limit"
+    // list inside each element. For example,
+    //   visible_device_list = "1,0"
+    //   virtual_devices { memory_limit: 1GB memory_limit: 2GB }
+    //   virtual_devices {}
+    // will create three virtual devices as:
+    //   /device:GPU:0 -> visible GPU 1 with 1GB memory
+    //   /device:GPU:1 -> visible GPU 1 with 2GB memory
+    //   /device:GPU:2 -> visible GPU 0 with all available memory
+    //
+    // NOTE:
+    // 1. It's invalid to set both this and "per_process_gpu_memory_fraction"
+    //    at the same time.
+    // 2. Currently this setting is per-process, not per-session. Using
+    //    different settings in different sessions within same process will
+    //    result in undefined behavior.
+    repeated VirtualDevices virtual_devices = 1;
+  }
+
+  Experimental experimental = 9;
 };
 
 // Options passed to the graph optimizer
@@ -303,7 +355,11 @@ message ConfigProto {
   // Optional list of all workers to use in this session.
   ClusterDef cluster_def = 14;
 
-  // Next: 15
+  // If true, any resources such as Variables used in the session will not be
+  // shared with other sessions.
+  bool isolate_session_state = 15;
+
+  // Next: 16
 };
 
 // Options for a single Run() call.
diff --git a/tensorflow/core/protobuf/control_flow.proto b/tensorflow/core/protobuf/control_flow.proto
index 48f503225447c26f8959ba379656361292052b44..2c9476a08ad946e7f019475055397fcd6cfbbc5a 100644
--- a/tensorflow/core/protobuf/control_flow.proto
+++ b/tensorflow/core/protobuf/control_flow.proto
@@ -66,4 +66,9 @@ message WhileContextDef {
 
   // Values and external values in control flow context.
   ValuesDef values_def = 9;
+
+  // Optional name of the maximum_iterations tensor.
+  string maximum_iterations_name = 11;
+
+  // Next available id: 12.
 }
diff --git a/tensorflow/core/protobuf/debug.proto b/tensorflow/core/protobuf/debug.proto
index 136c627e25f33cb9b4ff2de7725406c0f800a5b1..56983f3b7d464f88cebe608ac15882f04f27b003 100644
--- a/tensorflow/core/protobuf/debug.proto
+++ b/tensorflow/core/protobuf/debug.proto
@@ -60,3 +60,25 @@ message DebugOptions {
   // step count.
   int64 global_step = 10;
 }
+
+message DebuggedSourceFile {
+  // The host name on which a source code file is located.
+  string host = 1;
+
+  // Path to the source code file.
+  string file_path = 2;
+
+  // The timestamp at which the source code file is last modified.
+  int64 last_modified = 3;
+
+  // Byte size of the file.
+  int64 bytes = 4;
+
+  // Line-by-line content of the source code file.
+  repeated string lines = 5;
+}
+
+message DebuggedSourceFiles {
+  // A collection of source code files.
+  repeated DebuggedSourceFile source_files = 1;
+}
diff --git a/tensorflow/core/protobuf/device_properties.proto b/tensorflow/core/protobuf/device_properties.proto
index 9b1497c710d40c4c5a989f80ae0d98ee2a2dc3a8..3bd301590034847369fb18c95b75baf5221f979f 100644
--- a/tensorflow/core/protobuf/device_properties.proto
+++ b/tensorflow/core/protobuf/device_properties.proto
@@ -49,3 +49,8 @@ message DeviceProperties {
   // Memory bandwidth in KB/s
   int64 bandwidth = 13;
 }
+
+message NamedDevice {
+  string name = 1;
+  DeviceProperties properties = 2;
+}
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index 3b5d1563a2695c4b33d596f0493e38ff044b3c38..96b55ce04ba9b791dd841cd6d2325d57aa199b8f 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -35,7 +35,7 @@ message RewriterConfig {
   Toggle constant_folding = 3;
   // Arithmetic optimizations (default is ON)
   Toggle arithmetic_optimization = 7;
-  // Control dependency optimizations (default is OFF).
+  // Control dependency optimizations (default is ON).
   Toggle dependency_optimization = 8;
   // If true, don't remove unnecessary ops from the graph
   bool disable_model_pruning = 2;
diff --git a/tensorflow/core/protobuf/worker.proto b/tensorflow/core/protobuf/worker.proto
index e7b3f36fcc7e66eaaad74ca611230fb061c267fe..385e2dd163b8c668357ea9fabd1dee7d9a675729 100644
--- a/tensorflow/core/protobuf/worker.proto
+++ b/tensorflow/core/protobuf/worker.proto
@@ -59,6 +59,10 @@ message CreateWorkerSessionRequest {
 
   // Defines the configuration of a TensorFlow worker.
   ServerDef server_def = 2;
+
+  // If true, any resources such as Variables used in the session will not be
+  // shared with other sessions.
+  bool isolate_session_state = 3;
 }
 
 message CreateWorkerSessionResponse {
diff --git a/tensorflow/core/public/session.h b/tensorflow/core/public/session.h
index bca384e59fe9412a77398a81f0c8abbfd512e51a..75ad50f6f2d59a8f4b8282d8e7b395e2323d62e1 100644
--- a/tensorflow/core/public/session.h
+++ b/tensorflow/core/public/session.h
@@ -186,7 +186,7 @@ class Session {
   /// the `SessionOptions::target` field).
   virtual Status Close() = 0;
 
-  // NOTE(ashankar): As of July 2017, this method was added to faciliate some
+  // NOTE(ashankar): As of July 2017, this method was added to facilitate some
   // experimentation. Reconsider/re-evaluate after September 2017.
   //
   // Sets `*output` to the `DeviceMgr` that owns accessible devices in the
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index ec077c42837e517f94955956ed75430b7a3d0a30..d8e7df48c2d9f7023f15ffab7a62ccafbce4458d 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -119,5 +119,7 @@ extern const char* tf_compiler_version();
 extern const char* tf_git_version();
 // Value of the _GLIBCXX_USE_CXX11_ABI flag, or 0 if it's not set.
 extern const int tf_cxx11_abi_flag();
+// Returns 1 if build is monolithic, or 0 otherwise.
+extern const int tf_monolithic_build();
 
 #endif  // TENSORFLOW_CORE_PUBLIC_VERSION_H_
diff --git a/tensorflow/core/user_ops/fact.cc b/tensorflow/core/user_ops/fact.cc
index c512275506436d54829b355dbbd9711115d364b3..800008e0b884bee3bcd94c1d90be3d7b2a636615 100644
--- a/tensorflow/core/user_ops/fact.cc
+++ b/tensorflow/core/user_ops/fact.cc
@@ -18,27 +18,27 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 
-using namespace tensorflow;
-
 REGISTER_OP("Fact")
     .Output("fact: string")
     .Doc(R"doc(
 Output a fact about factorials.
 )doc");
 
-class FactOp : public OpKernel {
+class FactOp : public tensorflow::OpKernel {
  public:
-  explicit FactOp(OpKernelConstruction* context) : OpKernel(context) {}
+  explicit FactOp(tensorflow::OpKernelConstruction* context)
+      : OpKernel(context) {}
 
-  void Compute(OpKernelContext* context) override {
+  void Compute(tensorflow::OpKernelContext* context) override {
     // Output a scalar string.
-    Tensor* output_tensor = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, TensorShape(), &output_tensor));
+    tensorflow::Tensor* output_tensor = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                0, tensorflow::TensorShape(), &output_tensor));
+    using tensorflow::string;
     auto output = output_tensor->template scalar<string>();
 
     output() = "0! == 1";
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("Fact").Device(DEVICE_CPU), FactOp);
+REGISTER_KERNEL_BUILDER(Name("Fact").Device(tensorflow::DEVICE_CPU), FactOp);
diff --git a/tensorflow/core/util/cuda_kernel_helper.h b/tensorflow/core/util/cuda_kernel_helper.h
index 8315f208e735ec1e879528bef9c8d53419a0303d..3e32ec79731e1529affb49cf6e1aff3f23b84262 100644
--- a/tensorflow/core/util/cuda_kernel_helper.h
+++ b/tensorflow/core/util/cuda_kernel_helper.h
@@ -374,6 +374,30 @@ __device__ __host__ inline Eigen::half ldg(const Eigen::half* address) {
 #endif
 }
 
+template <>
+__device__ __host__ inline tensorflow::bfloat16 ldg(
+    const tensorflow::bfloat16* address) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+  tensorflow::bfloat16 return_value;
+  asm volatile("ld.global.nc.u16 %0, [%1];"
+               : "=h"(return_value.value)
+               : "l"(address));
+  return return_value;
+#else
+  return *address;
+#endif
+}
+
+template <>
+__device__ __host__ inline bool ldg(const bool* address) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+  return *reinterpret_cast<const bool*>(
+      __ldg(reinterpret_cast<const char*>(address)));
+#else
+  return *address;
+#endif
+}
+
 // CUDA provides atomic ops, but not for all types.  We provide wrappers
 // for some ops and provide implementation for all reasonable types.
 #define CUDA_ATOMIC_WRAPPER(op, T) \
@@ -742,6 +766,12 @@ __device__ EIGEN_ALWAYS_INLINE T CudaShuffleDown(unsigned mask, T value,
   return __shfl_down_sync(mask, value, delta, width);
 }
 
+__device__ EIGEN_ALWAYS_INLINE Eigen::half CudaShuffleDown(
+    unsigned mask, Eigen::half value, int delta, int width = warpSize) {
+  return Eigen::half(
+      __shfl_down_sync(mask, static_cast<uint16>(value), delta, width));
+}
+
 // Variant of the (undocumented) version from the CUDA SDK, but using unsigned
 // instead of float for lo and hi (which is incorrect with ftz, for example).
 // A bug has been filed with NVIDIA and will be fixed in the next CUDA release.
@@ -764,6 +794,12 @@ __device__ EIGEN_ALWAYS_INLINE T CudaShuffleXor(unsigned mask, T value,
   return __shfl_xor_sync(mask, value, laneMask, width);
 }
 
+__device__ EIGEN_ALWAYS_INLINE Eigen::half CudaShuffleXor(
+    unsigned mask, Eigen::half value, int laneMask, int width = warpSize) {
+  return Eigen::half(
+      __shfl_xor_sync(mask, static_cast<uint16>(value), laneMask, width));
+}
+
 // Variant of the (undocumented) version from the CUDA SDK, but using unsigned
 // instead of float for lo and hi (which is incorrect with ftz, for example).
 // A bug has been filed with NVIDIA and will be fixed in the next CUDA release.
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 118ff0d0d6abbeb83fd5e1d5abfe2e3b5a66f296..2caf5fc56dafb5a8879db8026a78bc7bf46346a4 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -24,25 +24,25 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "mkl_service.h"
 #include "mkl_trans.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/graph/mkl_graph_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
-#include "tensorflow/core/graph/mkl_graph_util.h"
 
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
 
+using mkldnn::engine;
 using mkldnn::memory;
-using mkldnn::reorder;
-using mkldnn::primitive;
 using mkldnn::padding_kind;
-using mkldnn::engine;
+using mkldnn::primitive;
+using mkldnn::reorder;
 #endif
 
 // The file contains a number of utility classes and functions used by MKL
@@ -56,8 +56,14 @@ namespace tensorflow {
 // Tensorflow tensor.
 
 typedef enum { W = 0, H = 1, C = 2, N = 3 } MklDims;
-typedef enum { Dim_N = 0, Dim_C = 1, Dim_H = 2, Dim_W = 3,
-               Dim_O = 0, Dim_I = 1 } MklDnnDims;
+typedef enum {
+  Dim_N = 0,
+  Dim_C = 1,
+  Dim_H = 2,
+  Dim_W = 3,
+  Dim_O = 0,
+  Dim_I = 1
+} MklDnnDims;
 
 class MklShape {
  public:
@@ -236,8 +242,7 @@ class MklShape {
   (IS_MKL_TENSOR_OFFSET + sizeof(size_t))  // Location of dimension_
 // Location of sizes. Note dim is not used here, left here
 // to make macros consistent.
-#define SIZES_OFFSET(dims) \
-  (DIMS_OFFSET + sizeof(size_t))
+#define SIZES_OFFSET(dims) (DIMS_OFFSET + sizeof(size_t))
 #define STRIDES_OFFSET(dims) \
   (SIZES_OFFSET(dims) + dims * sizeof(size_t))  // Location of strides
 #define MKL_LAYOUT_OFFSET(dims) \
@@ -323,6 +328,10 @@ class MklShape {
 
 // Forward decl
 TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format);
+memory::dims CalculateTFStrides(const memory::dims& dims_tf_order);
+memory::desc CreateBlockedMemDescHelper(const memory::dims& dim,
+                                        const memory::dims& strides,
+                                        memory::data_type dtype);
 
 class MklDnnShape {
  private:
@@ -332,7 +341,7 @@ class MklDnnShape {
     /// Number of dimensions in Tensorflow format
     size_t dimension_ = 0;
     /// Required by MKLDNN for conversions
-    mkldnn_dims_t sizes_;    // Required by MKL for conversions
+    mkldnn_dims_t sizes_;  // Required by MKL for conversions
     memory::format tf_data_format_ = memory::format::format_undef;
     memory::data_type T_ = memory::data_type::data_undef;
     // MKL layout
@@ -345,15 +354,13 @@ class MklDnnShape {
   typedef std::remove_extent<mkldnn_dims_t>::type mkldnn_dim_t;
 #define INVALID_DIM_SIZE -1
 
-
  public:
   MklDnnShape() {
-    for (size_t i = 0; i < sizeof(data_.sizes_) /
-                           sizeof(data_.sizes_[0]); ++i) {
+    for (size_t i = 0; i < sizeof(data_.sizes_) / sizeof(data_.sizes_[0]);
+         ++i) {
       data_.sizes_[i] = -1;
     }
-    for (size_t i = 0; i < sizeof(data_.map_) /
-                           sizeof(data_.map_[0]); ++i) {
+    for (size_t i = 0; i < sizeof(data_.map_) / sizeof(data_.map_[0]); ++i) {
       data_.map_[i] = -1;
     }
   }
@@ -361,6 +368,52 @@ class MklDnnShape {
   ~MklDnnShape() {}
   TF_DISALLOW_COPY_AND_ASSIGN(MklDnnShape);  // Cannot copy
 
+  /// Helper function to compare memory::desc objects for MklDnn.
+  /// May be this should go into MklDnn directly.
+  inline bool CompareMklDnnLayouts(const memory::desc& md1,
+                                   const memory::desc& md2) const {
+    mkldnn_memory_desc_t mdd1 = md1.data;
+    mkldnn_memory_desc_t mdd2 = md2.data;
+    const char* d1 = reinterpret_cast<const char*>(&mdd1);
+    const char* d2 = reinterpret_cast<const char*>(&mdd2);
+
+    size_t md_size = sizeof(mdd1);
+    for (size_t i = 0; i < md_size; i++) {
+      if (*d1++ != *d2++) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /// Equality function for MklDnnShape objects
+  /// @return true if both are equal; false otherwise.
+  inline bool operator == (const MklDnnShape& input_shape) const {
+    if (this->IsMklTensor() != input_shape.IsMklTensor()) {
+      return false;
+    }
+
+    // If input tensors are in Mkl layout, then we check for dimensions and
+    // sizes.
+    if (this->IsMklTensor()) {
+      return this->GetTfShape() == input_shape.GetTfShape() &&
+             CompareMklDnnLayouts(this->GetMklLayout(),
+                                  input_shape.GetMklLayout());
+    }
+
+    return true;
+  }
+
+  /// Equality operator for MklDnnShape and TFShape.
+  /// Returns: true if TF shapes for both are the same, false otherwise
+  inline bool operator == (const TensorShape& input_shape) const {
+    if (!this->IsMklTensor()) {
+      return false;
+    }
+
+    return this->GetTfShape() == input_shape;
+  }
+
   inline const bool IsMklTensor() const { return data_.is_mkl_tensor_; }
   inline void SetMklTensor(bool is_mkl_tensor) {
     data_.is_mkl_tensor_ = is_mkl_tensor;
@@ -369,26 +422,26 @@ class MklDnnShape {
   inline void SetDimensions(const size_t dimension) {
     data_.dimension_ = dimension;
   }
-  inline size_t GetDimension(char dimension)const {
+  inline size_t GetDimension(char dimension) const {
     int index = GetMklDnnTensorDimIndex(dimension);
     CHECK(index >= 0 && index < this->GetDimension())
-        << "Invalid index from the dimension: " << index << ", " << dimension;
+      << "Invalid index from the dimension: " << index << ", " << dimension;
     return this->DimSize(index);
   }
 
-  inline int32 GetMklDnnTensorDimIndex(char dimension)const {
+  inline int32 GetMklDnnTensorDimIndex(char dimension) const {
     switch (dimension) {
-  case 'N':
-    return MklDnnDims::Dim_N;
-  case 'C':
-    return MklDnnDims::Dim_C;
-  case 'H':
-    return MklDnnDims::Dim_H;
-  case 'W':
-    return MklDnnDims::Dim_W;
-  default:
-    LOG(FATAL) << "Invalid dimension: " << dimension;
-    return -1;  // Avoid compiler warning about missing return value
+      case 'N':
+        return MklDnnDims::Dim_N;
+      case 'C':
+        return MklDnnDims::Dim_C;
+      case 'H':
+        return MklDnnDims::Dim_H;
+      case 'W':
+        return MklDnnDims::Dim_W;
+      default:
+        LOG(FATAL) << "Invalid dimension: " << dimension;
+        return -1;  // Avoid compiler warning about missing return value
     }
   }
 
@@ -402,10 +455,10 @@ class MklDnnShape {
   inline memory::dims GetSizesAsMklDnnDims() const {
     memory::dims retVal;
     if (data_.is_mkl_tensor_) {
-      int dimensions = sizeof(data_.sizes_) / sizeof(data_.sizes_[0]);
-      for (size_t i = 0 ; i < dimensions; i++) {
+      size_t dimensions = sizeof(data_.sizes_) / sizeof(data_.sizes_[0]);
+      for (size_t i = 0; i < dimensions; i++) {
         if (data_.sizes_[i] != INVALID_DIM_SIZE)
-        retVal.push_back(data_.sizes_[i]);
+          retVal.push_back(data_.sizes_[i]);
       }
     } else {
       CHECK_EQ(data_.is_mkl_tensor_, true);
@@ -414,18 +467,27 @@ class MklDnnShape {
   }
 
   inline int64 DimSize(int index) const {
-    CHECK_LT(index, sizeof(data_.sizes_)/sizeof(data_.sizes_[0]));
+    CHECK_LT(index, sizeof(data_.sizes_) / sizeof(data_.sizes_[0]));
     return data_.sizes_[index];
   }
 
   /// Return TensorShape that describes the Tensorflow shape of the tensor
   /// represented by this MklShape.
-  inline TensorShape GetTfShape() {
+  inline TensorShape GetTfShape() const {
     CHECK_EQ(data_.is_mkl_tensor_, true);
 
     std::vector<int32> shape(data_.dimension_, -1);
-    for (size_t idx = 0; idx < data_.dimension_; ++idx) {
-      shape[idx] = data_.sizes_[TfDimIdx(idx)];
+    if (data_.tf_data_format_ != memory::format::blocked) {
+      for (size_t idx = 0; idx < data_.dimension_; ++idx) {
+        shape[idx] = data_.sizes_[TfDimIdx(idx)];
+      }
+    } else {
+      // If Tensorflow shape is in Blocked format, then we don't have dimension
+      // map for it. So we just create Tensorflow shape from sizes in the
+      // specified order.
+      for (size_t idx = 0; idx < data_.dimension_; ++idx) {
+        shape[idx] = data_.sizes_[idx];
+      }
     }
 
     TensorShape ts;
@@ -441,6 +503,12 @@ class MklDnnShape {
     CHECK_NOTNULL(pd);
     data_.mkl_md_ = pd->desc().data;
   }
+
+  inline void SetMklLayout(memory::desc* md) {
+    CHECK_NOTNULL(md);
+    data_.mkl_md_ = md->data;
+  }
+
   inline const memory::desc GetMklLayout() const {
     return memory::desc(data_.mkl_md_);
   }
@@ -449,24 +517,36 @@ class MklDnnShape {
     return data_.tf_data_format_;
   }
   /// We don't create primitive_descriptor for TensorFlow layout now.
-  /// We use lazy evaluation and create it only when needed.
+  /// We use lazy evaluation and create it only when needed. Input format can
+  /// also be Blocked format.
   inline void SetTfLayout(size_t dims, const memory::dims& sizes,
-                   memory::format format) {
+                          memory::format format) {
     CHECK_EQ(dims, sizes.size());
     data_.dimension_ = dims;
     for (size_t ii = 0; ii < dims; ii++) {
       data_.sizes_[ii] = sizes[ii];
     }
     data_.tf_data_format_ = format;
-    SetTfDimOrder(dims, format);
+    if (format != memory::format::blocked) {
+      SetTfDimOrder(dims, format);
+    }
   }
+
   inline const memory::desc GetTfLayout() const {
     memory::dims dims;
     for (size_t ii = 0; ii < data_.dimension_; ii++) {
       dims.push_back(data_.sizes_[ii]);
     }
-    return memory::desc(dims, data_.T_, data_.tf_data_format_);
+
+    // Create Blocked memory desc if input TF format was set like that.
+    if (data_.tf_data_format_ == memory::format::blocked) {
+      auto strides = CalculateTFStrides(dims);
+      return CreateBlockedMemDescHelper(dims, strides, data_.T_);
+    } else {
+      return memory::desc(dims, data_.T_, data_.tf_data_format_);
+    }
   }
+
   inline const memory::desc GetCurLayout() const {
     return IsMklTensor() ? GetMklLayout() : GetTfLayout();
   }
@@ -497,9 +577,7 @@ class MklDnnShape {
     SetTfDimOrder(dimension, data_format);
   }
 
-  inline const mkldnn_dim_t* GetTfToMklDimMap() const {
-    return &data_.map_[0];
-  }
+  inline const mkldnn_dim_t* GetTfToMklDimMap() const { return &data_.map_[0]; }
   inline size_t TfDimIdx(int index) const { return data_.map_[index]; }
   inline int64 TfDimSize(int index) const {
     return data_.sizes_[TfDimIdx(index)];
@@ -553,9 +631,7 @@ class MklDnnShape {
 
   /// Size of buffer to hold the serialized object, the size is computed by
   /// following above mentioned order
-  inline size_t GetSerializeBufferSize() const {
-    return sizeof(MklShapeData);
-  }
+  inline size_t GetSerializeBufferSize() const { return sizeof(MklShapeData); }
 
   void SerializeMklDnnShape(unsigned char* buf, size_t buf_size) const {
     CHECK(buf_size >= GetSerializeBufferSize())
@@ -566,12 +642,12 @@ class MklDnnShape {
   void DeSerializeMklDnnShape(const unsigned char* buf, size_t buf_size) {
     // Make sure buffer holds at least is_mkl_tensor_.
     CHECK(buf_size >= sizeof(data_.is_mkl_tensor_))
-      << "Buffer size is too small in DeSerializeMklDnnShape";
+        << "Buffer size is too small in DeSerializeMklDnnShape";
 
     const bool is_mkl_tensor = *reinterpret_cast<const bool*>(buf);
     if (is_mkl_tensor) {  // If it is an MKL Tensor then read the rest
       CHECK(buf_size >= GetSerializeBufferSize())
-        << "Buffer size is too small in DeSerializeMklDnnShape";
+          << "Buffer size is too small in DeSerializeMklDnnShape";
       data_ = *reinterpret_cast<const MklShapeData*>(buf);
     }
   }
@@ -580,8 +656,13 @@ class MklDnnShape {
 #endif
 
 // List of MklShape objects. Used in Concat/Split layers.
+
 typedef std::vector<MklShape> MklShapeList;
 
+#ifdef INTEL_MKL_DNN
+typedef std::vector<MklDnnShape> MklDnnShapeList;
+#endif
+
 // Check if all tensors specified by MklShapes are MKL tensors.
 inline bool AreAllMklTensors(const MklShapeList& shapes) {
   for (auto& s : shapes) {
@@ -592,6 +673,7 @@ inline bool AreAllMklTensors(const MklShapeList& shapes) {
   return true;
 }
 
+#ifndef INTEL_MKL_DNN
 template <typename T>
 inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
                              const MklShape& mkl_shape) {
@@ -616,32 +698,15 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
 
   return output_tensor;
 }
-
-#ifdef INTEL_MKL_DNN
+#else
 template <typename T>
 inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
                              const MklDnnShape& mkl_shape) {
   Tensor output_tensor;
   TensorShape output_shape;
 
-#if 0
-  // TODO(nhasabni): need to implement
-  for (size_t j = 0; j < mkl_shape.GetDimension(); j++) {
-    // Outermost to innermost dimension
-    output_shape.AddDim(mkl_shape.GetSizes()[mkl_shape.tf_dim_idx(j)]);
-  }
-
-  // Allocate output tensor.
-  context->allocate_temp(DataTypeToEnum<T>::v(), output_shape, &output_tensor);
-
-  dnnLayout_t output_layout = static_cast<dnnLayout_t>(mkl_shape.GetTfLayout());
-  void* input_buffer = const_cast<T*>(mkl_tensor.flat<T>().data());
-  void* output_buffer = const_cast<T*>(output_tensor.flat<T>().data());
-
-  if (mkl_tensor.NumElements() != 0) {
-    mkl_shape.GetConvertedFlatData(output_layout, input_buffer, output_buffer);
-  }
-#endif
+  TF_CHECK_OK(Status(error::Code::UNIMPLEMENTED,
+                     "Unimplemented conversion function"));
 
   return output_tensor;
 }
@@ -660,8 +725,7 @@ inline void GetMklShape(OpKernelContext* ctext, int n, MklShape* mklshape) {
 }
 
 #ifdef INTEL_MKL_DNN
-inline void GetMklShape(OpKernelContext* ctext, int n,
-                        MklDnnShape* mklshape) {
+inline void GetMklShape(OpKernelContext* ctext, int n, MklDnnShape* mklshape) {
   mklshape->DeSerializeMklDnnShape(
       ctext->input(GetTensorMetaDataIndex(n, ctext->num_inputs()))
           .flat<uint8>()
@@ -684,6 +748,9 @@ inline void GetMklInputList(OpKernelContext* ctext, StringPiece name,
   ctext->input_list(name, input_tensors);
 }
 
+
+#ifndef INTEL_MKL_DNN
+
 inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name,
                             MklShapeList* mkl_shapes) {
   OpInputList input_mkl_tensors;
@@ -696,12 +763,27 @@ inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name,
   }
 }
 
+#else
+
+inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name,
+                            MklDnnShapeList* mkl_shapes) {
+  OpInputList input_mkl_tensors;
+  GetMklInputList(ctext, strings::StrCat("mkl_", name), &input_mkl_tensors);
+
+  for (int i = 0; i < input_mkl_tensors.size(); i++) {
+    (*mkl_shapes)[i].DeSerializeMklDnnShape(
+        input_mkl_tensors[i].flat<uint8>().data(),
+        input_mkl_tensors[i].flat<uint8>().size() * sizeof(uint8));
+  }
+}
+
+#endif
+
 #ifdef INTEL_MKL_DNN
 /// Get shape of input tensor pointed by 'input_idx' in TensorShape format.
 /// If the input tensor is in MKL layout, then obtains TensorShape from
 /// MklShape.
-inline TensorShape GetTfShape(OpKernelContext* context,
-                              size_t input_idx) {
+inline TensorShape GetTfShape(OpKernelContext* context, size_t input_idx) {
   // Sanity check.
   CHECK_NOTNULL(context);
   CHECK_LT(input_idx, context->num_inputs());
@@ -821,7 +903,7 @@ inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out,
 
 template <typename T>
 inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out,
-                              TensorShape tf_shape) {
+                           TensorShape tf_shape) {
   OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::v(),
                                                  tf_shape, tensor_out));
 }
@@ -912,6 +994,7 @@ inline void CopyMklTensorInToOut(OpKernelContext* context,
   context->set_output(idx_meta_out, meta_output);
 }
 
+#ifndef INTEL_MKL_DNN
 inline void CopyTfTensorInToOutWithShape(OpKernelContext* context,
                                          int idx_in, int idx_out,
                                          const TensorShape& shape) {
@@ -929,6 +1012,27 @@ inline void CopyTfTensorInToOutWithShape(OpKernelContext* context,
   CHECK(output.CopyFrom(data, shape));
   context->set_output(idx_data_out, output);
 }
+#else
+inline void CopyTfTensorInToOutWithShape(OpKernelContext* context,
+                                         int idx_in, int idx_out,
+                                         const TensorShape& shape) {
+  int num_inputs = context->num_inputs();
+  int num_outputs = context->num_outputs();
+  int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
+  int idx_data_out = GetTensorDataIndex(idx_out, num_outputs);
+
+  const Tensor& data = context->input(idx_data_in);
+  MklDnnShape mkl_shape_output;
+  mkl_shape_output.SetMklTensor(false);
+  AllocateOutputSetMklShape(context, idx_out, mkl_shape_output);
+  Tensor output(data.dtype());
+  // TODO(intel_tf): alternatively, call forward_input_to_output_with_shape(...)
+  CHECK(output.CopyFrom(data, shape));
+  context->set_output(idx_data_out, output);
+}
+#endif
+
+#ifndef INTEL_MKL_DNN
 
 inline void ForwardTfTensorInToOut(OpKernelContext* context,
                                   int idx_in, int idx_out) {
@@ -947,6 +1051,27 @@ inline void ForwardTfTensorInToOut(OpKernelContext* context,
   }
 }
 
+#else
+
+inline void ForwardTfTensorInToOut(OpKernelContext* context,
+                                  int idx_in, int idx_out) {
+  int num_inputs = context->num_inputs();
+  int num_outputs = context->num_outputs();
+  int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
+  int idx_data_out = GetTensorDataIndex(idx_out, num_outputs);
+
+  MklDnnShape dnn_shape_output;
+  dnn_shape_output.SetMklTensor(false);
+  AllocateOutputSetMklShape(context, idx_out, dnn_shape_output);
+  if (IsRefType(context->input_dtype(idx_data_in))) {
+    context->forward_ref_input_to_ref_output(idx_data_in, idx_data_out);
+  } else {
+    context->set_output(idx_data_out, context->input(idx_data_in));
+  }
+}
+
+#endif
+
 inline void ForwardMklTensorInToOut(OpKernelContext* context,
                                    int idx_in, int idx_out) {
   int num_inputs = context->num_inputs();
@@ -965,6 +1090,25 @@ inline void ForwardMklTensorInToOut(OpKernelContext* context,
   }
 }
 
+#ifdef INTEL_MKL_DNN
+inline void ForwardMklTensorInToOutWithMklShape(OpKernelContext* context,
+                                             int idx_in, int idx_out,
+                                             const MklDnnShape& mkl_shape) {
+  int num_inputs = context->num_inputs();
+  int num_outputs = context->num_outputs();
+  int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
+  int idx_data_out = GetTensorDataIndex(idx_out, num_outputs);
+
+  AllocateOutputSetMklShape(context, idx_out, mkl_shape);
+
+  if (IsRefType(context->input_dtype(idx_data_in))) {
+    context->forward_ref_input_to_ref_output(idx_data_in, idx_data_out);
+  } else {
+    context->set_output(idx_data_out, context->input(idx_data_in));
+  }
+}
+#endif
+
 // Forward the MKL shape ONLY (used in elementwise and other ops where
 // we call the eigen implementation and MKL shape is not used)
 inline void ForwardMklMetaDataInToOut(OpKernelContext* context,
@@ -988,6 +1132,10 @@ inline void SetDummyMklShapeOutput(OpKernelContext* context,
   AllocateOutputSetMklShape(context, idx_data_out, mkl_shape_output);
 }
 
+#ifndef INTEL_MKL_DNN
+// We don't need these functions in MKLDNN. We have defined equality operator
+// on MklDnnShape class directly.
+
 // Checks if the TF shape for both MKL tensors is the same or not
 // Returns: true if both TF shapes are the same, false otherwise
 inline bool MklCompareShapes(const MklShape* input_shape_0,
@@ -1054,6 +1202,7 @@ inline bool MklCompareShapes(const TensorShape* input_shape_0,
 
   return true;
 }
+#endif
 
 // These functions do not compile with MKL-DNN since mkl.h is missing.
 // We may need to remove them later.
@@ -1099,7 +1248,8 @@ inline void MklNCHWToNHWC(const Tensor& input, Tensor** output) {
 ///
 /// @input None
 /// @return memory::data_type corresponding to type T
-template<typename T> static memory::data_type MklDnnType();
+template <typename T>
+static memory::data_type MklDnnType();
 
 /// Instantiation for float type. Add similar instantiations for other
 /// type if needed.
@@ -1114,10 +1264,11 @@ memory::data_type MklDnnType<float>() {
 /// @return: memory::format corresponding to TensorFlow data format;
 ///          Fails with an error if invalid data format.
 inline memory::format TFDataFormatToMklDnnDataFormat(TensorFormat format) {
-  if (format == FORMAT_NHWC) return memory::format::nhwc;
-  else if (format == FORMAT_NCHW) return memory::format::nchw;
-  TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT,
-                     "Unsupported data format"));
+  if (format == FORMAT_NHWC)
+    return memory::format::nhwc;
+  else if (format == FORMAT_NCHW)
+    return memory::format::nchw;
+  TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format"));
   // Return to get rid of compiler warning
   return memory::format::format_undef;
 }
@@ -1132,6 +1283,10 @@ inline TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format) {
   else if (format == memory::format::nchw) return FORMAT_NCHW;
   TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT,
                      "Unsupported data format"));
+
+  // Return to prevent compiler warnings, otherwise TF_CHECK_OK will ensure
+  // that we don't come here.
+  return FORMAT_NHWC;
 }
 
 /// Map TensorShape object into memory::dims required by MKL-DNN
@@ -1161,7 +1316,7 @@ inline memory::dims TFShapeToMklDnnDims(const TensorShape& shape) {
 /// @input TensorShape object in shape
 /// @return memory::dims in MKL-DNN required NCHW format
 inline memory::dims TFShapeToMklDnnDimsInNCHW(const TensorShape& shape,
-                                            TensorFormat format) {
+                                              TensorFormat format) {
   // Check validity of format.
   CHECK_NE(TFDataFormatToMklDnnDataFormat(format),
            memory::format::format_undef);
@@ -1175,6 +1330,23 @@ inline memory::dims TFShapeToMklDnnDimsInNCHW(const TensorShape& shape,
   return memory::dims({n, c, h, w});
 }
 
+/// Overloaded version of function above. Input parameters are
+/// self-explanatory.
+inline memory::dims MklDnnDimsInNCHW(const memory::dims& in_dims,
+                                     TensorFormat format) {
+  // Check validity of format.
+  CHECK_NE(TFDataFormatToMklDnnDataFormat(format),
+           memory::format::format_undef);
+
+  int n = in_dims[GetTensorDimIndex(format, 'N')];
+  int c = in_dims[GetTensorDimIndex(format, 'C')];
+  int h = in_dims[GetTensorDimIndex(format, 'H')];
+  int w = in_dims[GetTensorDimIndex(format, 'W')];
+
+  // MKL-DNN requires dimensions in NCHW format.
+  return memory::dims({n, c, h, w});
+}
+
 /// Map MklDnn memory::dims object into TensorShape object.
 ///
 /// This function will simply map input shape in MKL-DNN memory::dims format
@@ -1217,6 +1389,43 @@ inline padding_kind TFPaddingToMklDnnPadding(Padding pad) {
   return padding_kind::zero;
 }
 
+/// Helper function to create memory descriptor in Blocked format
+///
+/// @input: Tensor dimensions
+/// @input: strides corresponding to dimensions. One can use utility
+///         function such as CalculateTFStrides to compute strides
+///         for given dimensions.
+/// @return: memory::desc object corresponding to blocked memory format
+///          for given dimensions and strides.
+inline memory::desc CreateBlockedMemDescHelper(const memory::dims& dim,
+                                               const memory::dims& strides,
+                                               memory::data_type dtype) {
+  CHECK_EQ(dim.size(), strides.size());
+
+  // We have to construct memory descriptor in a C style. This is not at all
+  // ideal but MKLDNN does not offer any API to construct descriptor in
+  // blocked format except a copy constructor that accepts
+  // mkldnn_memory_desc_t.
+  mkldnn_memory_desc_t md;
+  md.primitive_kind = mkldnn_memory;
+  md.ndims = dim.size();
+  md.format = mkldnn_blocked;
+  md.data_type = memory::convert_to_c(dtype);
+
+  for (size_t i = 0; i < dim.size(); i++) {
+    md.layout_desc.blocking.block_dims[i] = 1;
+    md.layout_desc.blocking.strides[1][i] = 1;
+    md.layout_desc.blocking.strides[0][i] = strides[i];
+    md.layout_desc.blocking.padding_dims[i] = dim[i];
+    md.layout_desc.blocking.offset_padding_to_data[i] = 0;
+    md.dims[i] = dim[i];
+  }
+  md.layout_desc.blocking.offset_padding = 0;
+
+  return memory::desc(md);
+}
+
+
 /*
  * Class to represent all the resources corresponding to a tensor in TensorFlow
  * that are required to execute an operation (such as Convolution).
@@ -1237,21 +1446,23 @@ class MklDnnData {
   const engine* cpu_engine_;
 
  public:
-  explicit MklDnnData(const engine* e) : user_memory_(nullptr),
-                                         reorder_memory_(nullptr),
-                                         op_md_(nullptr), cpu_engine_(e) {}
+  explicit MklDnnData(const engine* e)
+      : user_memory_(nullptr),
+        reorder_memory_(nullptr),
+        op_md_(nullptr),
+        cpu_engine_(e) {}
 
   ~MklDnnData() {
     cpu_engine_ = nullptr;  // We don't own this.
-    delete(user_memory_);
-    delete(reorder_memory_);
-    delete(op_md_);
+    delete (user_memory_);
+    delete (reorder_memory_);
+    delete (op_md_);
   }
 
   inline void* GetTensorBuffer(const Tensor* tensor) const {
     CHECK_NOTNULL(tensor);
-    return const_cast<void*>(static_cast<const void*>(
-              tensor->flat<T>().data()));
+    return const_cast<void*>(
+        static_cast<const void*>(tensor->flat<T>().data()));
   }
 
   /// Set user memory primitive using specified dimensions, memory format and
@@ -1283,30 +1494,8 @@ class MklDnnData {
   /// @return: memory::desc object corresponding to blocked memory format
   ///          for given dimensions and strides.
   static inline memory::desc CreateBlockedMemDesc(const memory::dims& dim,
-      const memory::dims& strides) {
-    CHECK_EQ(dim.size(), strides.size());
-
-    // We have to construct memory descriptor in a C style. This is not at all
-    // ideal but MKLDNN does not offer any API to construct descriptor in
-    // blocked format except a copy constructor that accepts
-    // mkldnn_memory_desc_t.
-    mkldnn_memory_desc_t md;
-    md.primitive_kind = mkldnn_memory;
-    md.ndims = dim.size();
-    md.format = mkldnn_blocked;
-    md.data_type = memory::convert_to_c(MklDnnType<T>());
-
-    for (size_t i = 0; i < dim.size(); i++) {
-      md.layout_desc.blocking.block_dims[i] = 1;
-      md.layout_desc.blocking.strides[1][i] = 1;
-      md.layout_desc.blocking.strides[0][i] = strides[i];
-      md.layout_desc.blocking.padding_dims[i] = dim[i];
-      md.layout_desc.blocking.offset_padding_to_data[i] = 0;
-      md.dims[i] = dim[i];
-    }
-    md.layout_desc.blocking.offset_padding = 0;
-
-    return memory::desc(md);
+                                                 const memory::dims& strides) {
+    return CreateBlockedMemDescHelper(dim, strides, MklDnnType<T>());
   }
 
   /// A version of SetUsrMem call that allows user to create memory in blocked
@@ -1352,7 +1541,7 @@ class MklDnnData {
     CHECK_NOTNULL(cpu_engine_);
     // TODO(nhasabni): can we remove dynamic memory allocation?
     if (data_buffer) {
-     user_memory_ = new memory(pd, data_buffer);
+      user_memory_ = new memory(pd, data_buffer);
     } else {
       user_memory_ = new memory(pd);
     }
@@ -1374,6 +1563,7 @@ class MklDnnData {
     return user_memory_->get_primitive_desc();
   }
 
+
   /// Get function for descriptor of user memory.
   inline memory::desc GetUsrMemDesc() {
     // This is ugly. Why MKL-DNN does not provide desc() method of const type??
@@ -1436,6 +1626,17 @@ class MklDnnData {
     return op_pd != user_memory_->get_primitive_desc();
   }
 
+  /// Predicate that checks if we need to reorder user's memory into memory
+  /// based on the provided format.
+  ///
+  /// @input: target_format - memory format of the given input of an
+  ///               operation
+  /// @return: true in case reorder of input is needed; false, otherwise.
+  inline bool IsReorderNeeded(const memory::format& target_format) const {
+    CHECK_NOTNULL(user_memory_);
+    return target_format != user_memory_->get_primitive_desc().desc().data.format;
+  }
+
   /// Function to create a reorder from memory pointed by from to memory pointed
   /// by to. Returns created primitive.
   inline primitive CreateReorder(const memory* from, const memory* to) const {
diff --git a/tensorflow/core/util/mkl_util_test.cc b/tensorflow/core/util/mkl_util_test.cc
index 6aef3d86e9e703efa9466ccdc3c67205df3f1d84..8b73eadb40046518179fcaaa5c244aa7f3d52ebe 100644
--- a/tensorflow/core/util/mkl_util_test.cc
+++ b/tensorflow/core/util/mkl_util_test.cc
@@ -54,7 +54,6 @@ TEST(MklUtilTest, MklDnnTfShape) {
   EXPECT_NE(b_tf_shape_nchw, b_mkldnn_tf_shape);
 }
 
-
 TEST(MklUtilTest, MklDnnBlockedFormatTest) {
   // Let's create 2D tensor of shape {3, 4} with 3 being innermost dimension
   // first (case 1) and then it being outermost dimension (case 2).
diff --git a/tensorflow/core/util/sparse/sparse_tensor.h b/tensorflow/core/util/sparse/sparse_tensor.h
index 0ea74c38b1916f777eaaf7b0907b614e680ea6e7..e816c282c81a8a3cf661b03ee7597ccfd2658648 100644
--- a/tensorflow/core/util/sparse/sparse_tensor.h
+++ b/tensorflow/core/util/sparse/sparse_tensor.h
@@ -69,6 +69,21 @@ class SparseTensor {
     CHECK_EQ(shape.size(), dims_) << "Shape rank must be SparseTensor rank.";
   }
 
+  SparseTensor(const SparseTensor& other)
+      : SparseTensor(other.ix_, other.vals_, other.shape_, other.order_) {}
+
+  SparseTensor(SparseTensor&& other)
+      : SparseTensor(std::move(other.ix_), std::move(other.vals_),
+                     std::move(other.shape_), std::move(other.order_)) {}
+
+  SparseTensor& operator=(const SparseTensor& other) {
+    ix_ = other.ix_;
+    vals_ = other.vals_;
+    shape_ = other.shape_;
+    order_ = other.order_;
+    return *this;
+  }
+
   std::size_t num_entries() const { return ix_.dim_size(0); }
 
   int dims() const { return shape_.size(); }
diff --git a/tensorflow/core/util/strided_slice_op.cc b/tensorflow/core/util/strided_slice_op.cc
index cfe9275a09189b0d72e57a79cd860de9ab5d82b8..aca60b942d15841438329c922a8aaaded7b08430 100644
--- a/tensorflow/core/util/strided_slice_op.cc
+++ b/tensorflow/core/util/strided_slice_op.cc
@@ -218,8 +218,8 @@ Status ValidateStridedSliceOp(
 
   // Step 2: Make a sparse spec into a full index spec
   //
-  // The sparse spec does not corresopnds to the number of dimensions
-  // Make a dense spec that corresponds to thte number of dimensions
+  // The sparse spec does not correspond to the number of dimensions
+  // Make a dense spec that corresponds to the number of dimensions
   //
   // For example suppose foo[...,3:] on foo.shape=(2,2,3) then
   // we need to produce the missing begin_mask for the first two
diff --git a/tensorflow/core/util/tensor_slice_reader.h b/tensorflow/core/util/tensor_slice_reader.h
index 4bb2b246158cb2c3387467d0cd89408a6dee9608..263f56c7fcb2fa822de2e0adb5e346feddc71cc2 100644
--- a/tensorflow/core/util/tensor_slice_reader.h
+++ b/tensorflow/core/util/tensor_slice_reader.h
@@ -15,7 +15,6 @@ limitations under the License.
 
 // The utility to read checkpoints for google brain tensor ops and v3
 // checkpoints for dist_belief.
-//
 
 #ifndef TENSORFLOW_UTIL_TENSOR_SLICE_READER_H_
 #define TENSORFLOW_UTIL_TENSOR_SLICE_READER_H_
diff --git a/tensorflow/core/util/tensor_slice_reader_cache.h b/tensorflow/core/util/tensor_slice_reader_cache.h
index bdd36a2791db690824032f25e339354d23f59441..63a8d0b068d21c8e178f3dd344b15db6484a8453 100644
--- a/tensorflow/core/util/tensor_slice_reader_cache.h
+++ b/tensorflow/core/util/tensor_slice_reader_cache.h
@@ -15,7 +15,6 @@ limitations under the License.
 
 // The utility to read checkpoints for google brain tensor ops and v3
 // checkpoints for dist_belief.
-//
 
 #ifndef TENSORFLOW_UTIL_TENSOR_SLICE_READER_CACHE_H_
 #define TENSORFLOW_UTIL_TENSOR_SLICE_READER_CACHE_H_
diff --git a/tensorflow/core/util/tensor_slice_writer.h b/tensorflow/core/util/tensor_slice_writer.h
index 95d6384afecd28025cc5e14c6f525caeafe1f0a5..bdb4921e1bbf8611d84420c1e52d01fa39c25264 100644
--- a/tensorflow/core/util/tensor_slice_writer.h
+++ b/tensorflow/core/util/tensor_slice_writer.h
@@ -15,7 +15,6 @@ limitations under the License.
 
 // The utility to write checkpoints for google brain tensor ops and v3
 // checkpoints for dist_belief.
-//
 
 #ifndef TENSORFLOW_UTIL_TENSOR_SLICE_WRITER_H_
 #define TENSORFLOW_UTIL_TENSOR_SLICE_WRITER_H_
diff --git a/tensorflow/core/util/transform_output_iterator.h b/tensorflow/core/util/transform_output_iterator.h
index 1640791ad1729a57283ab5f2b91b7734c9447d8f..059206c75b97d8bbb64a663a207717387409c04b 100644
--- a/tensorflow/core/util/transform_output_iterator.h
+++ b/tensorflow/core/util/transform_output_iterator.h
@@ -24,7 +24,7 @@ namespace tensorflow {
 template <typename StoreType, typename InputType, typename ConversionOp,
           typename OffsetT = ptrdiff_t>
 class TransformOutputIterator {
- private:
+ protected:
   // Proxy object
   struct Reference {
     StoreType* ptr;
diff --git a/tensorflow/docs_src/about/uses.md b/tensorflow/docs_src/about/uses.md
index d41818e10c924de21781c352e1a1db252b19c2ff..8818177a288ef16ac1907a20ab563ee3d871f7fd 100644
--- a/tensorflow/docs_src/about/uses.md
+++ b/tensorflow/docs_src/about/uses.md
@@ -5,7 +5,7 @@ This page highlights TensorFlow models in real world use.
 
 ## Model zoo
 
-Please visit our collection of TensorFlow models in the 
+Please visit our collection of TensorFlow models in the
 [TensorFlow Zoo](https://github.com/tensorflow/models).
 
 If you have built a model with TensorFlow, please consider publishing it in
diff --git a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.entropy.md b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.entropy.md
deleted file mode 100644
index fc5d5d70d7ebf42c16294c84c2cc3f8381dae236..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.entropy.md
+++ /dev/null
@@ -1 +0,0 @@
-# BayesFlow Entropy (contrib)
diff --git a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.stochastic_graph.md b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.stochastic_graph.md
deleted file mode 100644
index d855787ae695f115368ab76671182f3a6e490411..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.stochastic_graph.md
+++ /dev/null
@@ -1 +0,0 @@
-# BayesFlow Stochastic Graph (contrib)
diff --git a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.stochastic_tensor.md b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.stochastic_tensor.md
deleted file mode 100644
index 1cc1ac5d7e670a243f1dcda6ef8c59b6c6d8de2d..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.stochastic_tensor.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# BayesFlow Stochastic Tensors (contrib)
-[TOC]
-
diff --git a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.variational_inference.md b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.variational_inference.md
deleted file mode 100644
index 8f08c09c8fbbc9b5b6ab8612f140f4b7ca7d8b73..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.variational_inference.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# BayesFlow Variational Inference (contrib)
-[TOC]
-
-Variational inference.
diff --git a/tensorflow/docs_src/api_guides/python/contrib.copy_graph.md b/tensorflow/docs_src/api_guides/python/contrib.copy_graph.md
deleted file mode 100644
index f61f4c764d289814439bb8c5d33bdfb46d208866..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/api_guides/python/contrib.copy_graph.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# Copying Graph Elements (contrib)
-[TOC]
-
-Functions for copying elements from one graph to another.
diff --git a/tensorflow/docs_src/api_guides/python/contrib.opt.md b/tensorflow/docs_src/api_guides/python/contrib.opt.md
deleted file mode 100644
index 944a80a5ccb0201b5b5a0cf3b57ca31dfc7ce01a..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/api_guides/python/contrib.opt.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# Optimization (contrib)
-[TOC]
-
-opt: A module containing optimization routines.
diff --git a/tensorflow/docs_src/api_guides/python/histogram_ops.md b/tensorflow/docs_src/api_guides/python/histogram_ops.md
deleted file mode 100644
index dbd4555429b2a09bdf32e2e421b2d55fac0c0fd0..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/api_guides/python/histogram_ops.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# Histograms
-[TOC]
-
-## Histograms
-
-*   @{tf.histogram_fixed_width}
diff --git a/tensorflow/docs_src/api_guides/python/image.md b/tensorflow/docs_src/api_guides/python/image.md
index a2c8c3c3c92e2acf177da104304746fb34281de7..051e4547ee6900ded85ae18fb80b51db1eacb009 100644
--- a/tensorflow/docs_src/api_guides/python/image.md
+++ b/tensorflow/docs_src/api_guides/python/image.md
@@ -19,6 +19,7 @@ Note: The PNG encode and decode Ops support RGBA, but the conversions Ops
 presently only support RGB, HSV, and GrayScale. Presently, the alpha channel has
 to be stripped from the image and re-attached using slicing ops.
 
+*   @{tf.image.decode_bmp}
 *   @{tf.image.decode_gif}
 *   @{tf.image.decode_jpeg}
 *   @{tf.image.encode_jpeg}
diff --git a/tensorflow/docs_src/api_guides/python/meta_graph.md b/tensorflow/docs_src/api_guides/python/meta_graph.md
index fa4cee87007cfd77663e74956fcfe0f15c55c52c..0eff9000931666dce742358a290f25bb2b5a7b16 100644
--- a/tensorflow/docs_src/api_guides/python/meta_graph.md
+++ b/tensorflow/docs_src/api_guides/python/meta_graph.md
@@ -221,15 +221,9 @@ Here are some of the typical usage models:
     # Addes loss and train.
     labels = tf.constant(0, tf.int32, shape=[100], name="labels")
     batch_size = tf.size(labels)
-    labels = tf.expand_dims(labels, 1)
-    indices = tf.expand_dims(tf.range(0, batch_size), 1)
-    concated = tf.concat([indices, labels], 1)
-    onehot_labels = tf.sparse_to_dense(
-        concated, tf.stack([batch_size, 10]), 1.0, 0.0)
     logits = tf.get_collection("logits")[0]
-    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
-        labels=onehot_labels, logits=logits, name="xentropy")
-    loss = tf.reduce_mean(cross_entropy, name="xentropy_mean")
+    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels,
+                                                  logits=logits)
 
     tf.summary.scalar('loss', loss)
     # Creates the gradient descent optimizer with the given learning rate.
diff --git a/tensorflow/docs_src/api_guides/python/nn.md b/tensorflow/docs_src/api_guides/python/nn.md
index 75dbb04e7df6f5fef00363bab548fc04bd3c9694..8e6fd1cff93332b84f552c18f627ba05dc67103e 100644
--- a/tensorflow/docs_src/api_guides/python/nn.md
+++ b/tensorflow/docs_src/api_guides/python/nn.md
@@ -73,7 +73,7 @@ The total padding applied along the height and width is computed as:
       pad_along_width = max(filter_width - strides[2], 0)
     else:
       pad_along_width = max(filter_width - (in_width % strides[2]), 0)
-    
+
 Finally, the padding on the top, bottom, left and right are:
 
     pad_top = pad_along_height // 2
@@ -226,6 +226,8 @@ TensorFlow provides several operations that help you perform classification.
 *   @{tf.nn.softmax}
 *   @{tf.nn.log_softmax}
 *   @{tf.nn.softmax_cross_entropy_with_logits}
+*   @{tf.nn.softmax_cross_entropy_with_logits_v2} - identical to the base
+    version, except it allows gradient propagation into the labels.
 *   @{tf.nn.sparse_softmax_cross_entropy_with_logits}
 *   @{tf.nn.weighted_cross_entropy_with_logits}
 
@@ -351,7 +353,7 @@ p_i = max(s\cdot (n_o - 1) + k - n_i, 0)
 \end{equation}
 
 Remember that, for `'SAME'` padding,
-\\(n_o = \left \lceil{\frac{n_i}{s}}\right \rceil\\), as mentioned above. 
+\\(n_o = \left \lceil{\frac{n_i}{s}}\right \rceil\\), as mentioned above.
 We need to analyze in detail two cases:
 
 - \\(n_i \text{ mod } s = 0\\)
diff --git a/tensorflow/docs_src/api_guides/python/reading_data.md b/tensorflow/docs_src/api_guides/python/reading_data.md
index b3ebaa0f0a3645256d4e92632a10a53e4eb243cb..f316cce953da9b425463feffa317b6bf292694e4 100644
--- a/tensorflow/docs_src/api_guides/python/reading_data.md
+++ b/tensorflow/docs_src/api_guides/python/reading_data.md
@@ -1,11 +1,11 @@
 # Reading data
 
 Note: The preferred way to feed data into a tensorflow program is using the
-@{$datasets$Datasets API}.
+@{$datasets$`tf.data` API}.
 
 There are four methods of getting data into a TensorFlow program:
 
-*   `Dataset` API: Easily construct a complex input pipeline. (preferred method)
+*   `tf.data` API: Easily construct a complex input pipeline. (preferred method)
 *   Feeding: Python code provides the data when running each step.
 *   `QueueRunner`: a queue-based input pipeline reads the data from files
     at the beginning of a TensorFlow graph.
@@ -14,26 +14,27 @@ There are four methods of getting data into a TensorFlow program:
 
 [TOC]
 
-## Dataset API
+## `tf.data` API
 
 See the @{$datasets$programmer's guide} for an in-depth explanation of
-@{tf.data.Dataset}. The `Dataset` API allows you to extract and preprocess data
-from different input/file formats, and apply transformations such as batch,
-shuffle, and map to the dataset. This is an improved version of the old input
-methods, feeding and `QueueRunner`.
+@{tf.data.Dataset}. The `tf.data` API enables you to extract and preprocess data
+from different input/file formats, and apply transformations such as batching,
+shuffling, and mapping functions over the dataset. This is an improved version
+of the old input methods---feeding and `QueueRunner`---which are described
+below for historical purposes.
 
 ## Feeding
 
+Warning: "Feeding" is the least efficient way to feed data into a TensorFlow
+program and should only be used for small experiments and debugging.
+
 TensorFlow's feed mechanism lets you inject data into any Tensor in a
-computation graph. A python computation can thus feed data directly into the
+computation graph. A Python computation can thus feed data directly into the
 graph.
 
 Supply feed data through the `feed_dict` argument to a run() or eval() call
 that initiates computation.
 
-Warning: "Feeding" is the least efficient way to feed data into a tensorflow
-program and should only be used for small experiments and debugging.
-
 ```python
 with tf.Session():
   input = tf.placeholder(tf.float32)
@@ -55,6 +56,10 @@ and is described in the @{$mechanics$MNIST tutorial}.
 
 ## `QueueRunner`
 
+Warning: This section discusses implementing input pipelines using the
+queue-based APIs which can be cleanly replaced by the @{$datasets$`tf.data`
+API}.
+
 A typical queue-based pipeline for reading records from files has the following stages:
 
 1.  The list of filenames
@@ -66,9 +71,6 @@ A typical queue-based pipeline for reading records from files has the following
 7.  *Optional* preprocessing
 8.  Example queue
 
-Warning: This section discusses implementing input pipelines using the
-queue-based APIs which can be cleanly replaced by the @{$datasets$Datasets API}.
-
 ### Filenames, shuffling, and epoch limits
 
 For the list of filenames, use either a constant string Tensor (like
@@ -173,14 +175,25 @@ For example,
 [`tensorflow/examples/how_tos/reading_data/convert_to_records.py`](https://www.tensorflow.org/code/tensorflow/examples/how_tos/reading_data/convert_to_records.py)
 converts MNIST data to this format.
 
-To read a file of TFRecords, use
-@{tf.TFRecordReader} with
-the @{tf.parse_single_example}
-decoder. The `parse_single_example` op decodes the example protocol buffers into
-tensors. An MNIST example using the data produced by `convert_to_records` can be
-found in
-[`tensorflow/examples/how_tos/reading_data/fully_connected_reader.py`](https://www.tensorflow.org/code/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py),
-which you can compare with the `fully_connected_feed` version.
+The recommended way to read a TFRecord file is with a @{tf.data.TFRecordDataset}, [as in this example](https://www.tensorflow.org/code/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py):
+
+``` python
+    dataset = tf.data.TFRecordDataset(filename)
+    dataset = dataset.repeat(num_epochs)
+
+    # map takes a python function and applies it to every sample
+    dataset = dataset.map(decode)
+```
+
+To acomplish the same task with a queue based input pipeline requires the following code 
+(using the same `decode` function from the above example): 
+
+``` python
+  filename_queue = tf.train.string_input_producer([filename], num_epochs=num_epochs)
+  reader = tf.TFRecordReader()
+  _, serialized_example = reader.read(filename_queue)
+  image,label = decode(serialized_example)
+```
 
 ### Preprocessing
 
@@ -499,7 +512,7 @@ You can have the train and eval in the same graph in the same process, and share
 their trained variables or layers. See @{$variables$the shared variables tutorial}.
 
 To support the single-graph approach
-@{$programmers_guide/datasets$Datasets} also supplies
+@{$programmers_guide/datasets$`tf.data`} also supplies
 @{$programmers_guide/datasets#creating_an_iterator$advanced iterator types} that
 that allow the user to change the input pipeline without rebuilding the graph or
 session.
diff --git a/tensorflow/docs_src/api_guides/python/script_ops.md b/tensorflow/docs_src/api_guides/python/script_ops.md
deleted file mode 100644
index ab49a570c135fefdcb3f4c7d4e4d35df38092b98..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/api_guides/python/script_ops.md
+++ /dev/null
@@ -1,13 +0,0 @@
-# Wraps python functions
-
-Note: Functions taking `Tensor` arguments can also take anything accepted by
-@{tf.convert_to_tensor}.
-
-[TOC]
-
-## Script Language Operators
-
-TensorFlow provides allows you to wrap python/numpy functions as
-TensorFlow operators.
-
-*   @{tf.py_func}
diff --git a/tensorflow/docs_src/community/documentation.md b/tensorflow/docs_src/community/documentation.md
index 77d4e0caece4b50222c6e8abdd7ebba006159f26..003e0a25ecd7c6afcc42aed08bd5d91f7c85a9bb 100644
--- a/tensorflow/docs_src/community/documentation.md
+++ b/tensorflow/docs_src/community/documentation.md
@@ -10,10 +10,10 @@ particular, this document explains the following:
 
 You can view TensorFlow documentation on https://www.tensorflow.org, and you
 can view and edit the raw files on
-[GitHub](https://www.tensorflow.org/code/tensorflow/docs_src/). 
+[GitHub](https://www.tensorflow.org/code/tensorflow/docs_src/).
 We're publishing our docs on GitHub so everybody can contribute. Whatever gets
 checked in to `tensorflow/docs_src` will be published soon after on
-https://www.tensorflow.org. 
+https://www.tensorflow.org.
 
 Republishing TensorFlow documentation in different forms is absolutely allowed,
 but we are unlikely to accept other documentation formats (or the tooling to
@@ -237,7 +237,7 @@ If a module is accidentally imported, it typically breaks the doc generator
 even if the doc generator succeeds, unwanted symbols may show up in the
 docs. Check the generated docs to make sure that all symbols that are documented
 are expected. If there are symbols that shouldn’t be there, you have the
-following options for dealing with them: 
+following options for dealing with them:
 
 - Private symbols and imports
 - The `remove_undocumented` filter
diff --git a/tensorflow/docs_src/community/style_guide.md b/tensorflow/docs_src/community/style_guide.md
index 40a75a4736d0e28e3c1822f5941fcb855f939f46..a4c4e2674ee78b2248323a0275a737d6417c5f99 100644
--- a/tensorflow/docs_src/community/style_guide.md
+++ b/tensorflow/docs_src/community/style_guide.md
@@ -162,7 +162,7 @@ operation.
              it's present in the scope.
 
 * Layers that behave differently during training should take:
-  - `is_training`: `bool` indicator to conditionally choose different 
+  - `is_training`: `bool` indicator to conditionally choose different
                    computation paths (e.g. using `tf.cond`) during execution.
 
 Example:
diff --git a/tensorflow/docs_src/community/welcome.md b/tensorflow/docs_src/community/welcome.md
index 33740de5d5af11cb6a8f1f6d57baa4c0e0dbefff..a3abf2550757e825ae2d023018def919de1bcd8f 100644
--- a/tensorflow/docs_src/community/welcome.md
+++ b/tensorflow/docs_src/community/welcome.md
@@ -65,5 +65,5 @@ please read the following list carefully:
     [TensorFlow issues tracker](https://github.com/tensorflow/tensorflow/issues)
     on GitHub.  For example, use the issue tracker to request a
     new operation in TensorFlow.
-    
+
 
diff --git a/tensorflow/docs_src/deploy/hadoop.md b/tensorflow/docs_src/deploy/hadoop.md
index 7592cf828beb1f45a60ecc58b7fbfc5f4c4308ab..c4471562b9e64dda2fade7759e06fb8eecd09f5c 100644
--- a/tensorflow/docs_src/deploy/hadoop.md
+++ b/tensorflow/docs_src/deploy/hadoop.md
@@ -32,8 +32,8 @@ be set:
     source ${HADOOP_HOME}/libexec/hadoop-config.sh
     ```
 
-*   **LD_LIBRARY_PATH**: To include the path to libjvm.so, and optionally the path 
-    to libhdfs.so if your Hadoop distribution does not install libhdfs.so in 
+*   **LD_LIBRARY_PATH**: To include the path to libjvm.so, and optionally the path
+    to libhdfs.so if your Hadoop distribution does not install libhdfs.so in
     `$HADOOP_HDFS_HOME/lib/native`. On Linux:
 
     ```shell
diff --git a/tensorflow/docs_src/extend/add_filesys.md b/tensorflow/docs_src/extend/add_filesys.md
index ea3a6fe53af3e960eaccb4f7b6836364244fbe05..f0591b7b7d8af478db067ecd3bdd949e75d813c9 100644
--- a/tensorflow/docs_src/extend/add_filesys.md
+++ b/tensorflow/docs_src/extend/add_filesys.md
@@ -32,9 +32,10 @@ Note that TensorFlow already includes many filesystem implementations, such as:
 
     Note: NFS filesystems often mount as a POSIX interface, and so standard
     TensorFlow can work on top of NFS-mounted remote filesystems.
-    
+
 *   HDFS - the Hadoop File System
 *   GCS - Google Cloud Storage filesystem
+*   S3 - Amazon Simple Storage Service filesystem
 *   A "memory-mapped-file" filesystem
 
 The rest of this guide describes how to implement a custom filesystem.
diff --git a/tensorflow/docs_src/extend/adding_an_op.md b/tensorflow/docs_src/extend/adding_an_op.md
index a3a02720591954a908bb4135ab597e283388fee0..c52279b212f46215125a20815f97b07b012a5513 100644
--- a/tensorflow/docs_src/extend/adding_an_op.md
+++ b/tensorflow/docs_src/extend/adding_an_op.md
@@ -341,9 +341,9 @@ Assuming you have `g++` installed, here is the sequence of commands you can use
 to compile your op into a dynamic library.
 
 ```bash
-TF_INC=$(python -c 'import tensorflow as tf; print(tf.sysconfig.get_include())')
-TF_LIB=$(python -c 'import tensorflow as tf; print(tf.sysconfig.get_lib())')
-g++ -std=c++11 -shared zero_out.cc -o zero_out.so -fPIC -I$TF_INC -I$TF_INC/external/nsync/public -L$TF_LIB -ltensorflow_framework -O2
+TF_CFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') )
+TF_LFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') )
+g++ -std=c++11 -shared zero_out.cc -o zero_out.so -fPIC ${TF_CFLAGS[@]} ${TF_LFLAGS[@]} -O2
 ```
 
 On Mac OS X, the additional flag "-undefined dynamic_lookup" is required when
@@ -1228,10 +1228,10 @@ into a single dynamically loadable library:
 
 ```bash
 nvcc -std=c++11 -c -o cuda_op_kernel.cu.o cuda_op_kernel.cu.cc \
--I $TF_INC -I$TF_INC/external/nsync/public -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC
+  ${TF_CFLAGS[@]} -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC
 
 g++ -std=c++11 -shared -o cuda_op_kernel.so cuda_op_kernel.cc \
-cuda_op_kernel.cu.o -I $TF_INC -I$TF_INC/external/nsync/public -fPIC -lcudart -L$TF_LIB -ltensorflow_framework
+  cuda_op_kernel.cu.o ${TF_CFLAGS[@]} -fPIC -lcudart ${TF_LFLAGS[@]}
 ```
 
 `cuda_op_kernel.so` produced above can be loaded as usual in Python, using the
diff --git a/tensorflow/docs_src/extend/estimators.md b/tensorflow/docs_src/extend/estimators.md
index 7e6507c5840fe621aeb91842c9a83554e568db99..96fc9fae4720b5d29ff94bffe8f30e40aada0a27 100644
--- a/tensorflow/docs_src/extend/estimators.md
+++ b/tensorflow/docs_src/extend/estimators.md
@@ -515,7 +515,7 @@ using `mean_squared_error()` (in bold):
   loss = tf.losses.mean_squared_error(labels, predictions)</strong>
   ...</code></pre>
 
-See the @{$python/contrib.losses$API guide} for a
+See the @{tf.losses$API guide} for a
 full list of loss functions and more details on supported arguments and usage.
 
 Supplementary metrics for evaluation can be added to an `eval_metric_ops` dict.
@@ -694,5 +694,5 @@ For additional reference materials on building `Estimator`s, see the following
 sections of the API guides:
 
 *   @{$python/contrib.layers$Layers}
-*   @{$python/contrib.losses$Losses}
+*   @{tf.losses$Losses}
 *   @{$python/contrib.layers#optimization$Optimization}
diff --git a/tensorflow/docs_src/extend/index.md b/tensorflow/docs_src/extend/index.md
index 3f30b9a8c243728f6dd2a47ffa0b35fb92ee68fe..00b168c6be96a158c3be69fbcefbf941c0fbbe4d 100644
--- a/tensorflow/docs_src/extend/index.md
+++ b/tensorflow/docs_src/extend/index.md
@@ -20,7 +20,7 @@ TensorFlow:
 
 Python is currently the only language supported by TensorFlow's API stability
 promises.  However, TensorFlow also provides functionality in C++, Java, and Go,
-plus community support for [Haskell](https://github.com/tensorflow/haskell) and 
+plus community support for [Haskell](https://github.com/tensorflow/haskell) and
 [Rust](https://github.com/tensorflow/rust).  If you'd like to create or
 develop TensorFlow features in a language other than these languages, read the
 following guide:
diff --git a/tensorflow/docs_src/get_started/custom_estimators.md b/tensorflow/docs_src/get_started/custom_estimators.md
new file mode 100644
index 0000000000000000000000000000000000000000..81ab68a8032ae23926190eebe874062979ec37d2
--- /dev/null
+++ b/tensorflow/docs_src/get_started/custom_estimators.md
@@ -0,0 +1,590 @@
+
+# Creating Custom Estimators
+This document introduces custom Estimators. In particular, this document
+demonstrates how to create a custom @{tf.estimator.Estimator$Estimator} that
+mimics the behavior of the pre-made Estimator
+@{tf.estimator.DNNClassifier$`DNNClassifier`} in solving the Iris problem. See
+the @{$get_started/premade_estimators$Pre-Made Estimators chapter} for details
+on the Iris problem.
+
+To download and access the example code invoke the following two commands:
+
+```shell
+git clone https://github.com/tensorflow/models/
+cd models/samples/core/get_started
+```
+
+In this document we wil be looking at
+[`custom_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/custom_estimator.py).
+You can run it with the following command:
+
+```bsh
+python custom_estimator.py
+```
+
+If you are feeling impatient, feel free to compare and contrast
+[`custom_estimatr.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/custom_estimator.py)
+with
+[`premade_estimatr.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py).
+(which is in the same directory).
+
+
+
+## Pre-made vs. custom
+
+As the following figure shows, pre-made Estimators are subclasses of the
+@{tf.estimator.Estimator} base class, while custom Estimators are an instance
+of tf.estimator.Estimator:
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%"
+  alt="Premade estimators are sub-classes of `Estimator`. Custom Estimators are usually (direct) instances of `Estimator`"
+  src="../images/custom_estimators/estimator_types.png">
+</div>
+<div style="text-align: center">
+Pre-made and custom Estimators are all Estimators.
+</div>
+
+Pre-made Estimators are fully baked. Sometimes though, you need more control
+over an Estimator's behavior.  That's where custom Estimators come in. You can
+create a custom Estimator to do just about anything. If you want hidden layers
+connected in some unusual fashion, write a custom Estimator. If you want to
+calculate a unique
+[metric](https://developers.google.com/machine-learning/glossary/#metric)
+for your model, write a custom Estimator.  Basically, if you want an Estimator
+optimized for your specific problem, write a custom Estimator.
+
+A model function (or `model_fn`) implements the ML algorithm. The
+only difference between working with pre-made Estimators and custom Estimators
+is:
+
+* With pre-made Estimators, someone already wrote the model function for you.
+* With custom Estimators, you must write the model function.
+
+Your model function could implement a wide range of algorithms, defining all
+sorts of hidden layers and metrics.  Like input functions, all model functions
+must accept a standard group of input parameters and return a standard group of
+output values. Just as input functions can leverage the Dataset API, model
+functions can leverage the Layers API and the Metrics API.
+
+Let's see how to solve the Iris problem with a custom Estimator. A quick
+reminder--here's the organization of the Iris model that we're trying to mimic:
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="height:260px"
+  alt="A diagram of the network architecture: Inputs, 2 hidden layers, and outputs"
+  src="../images/custom_estimators/full_network.png">
+</div>
+<div style="text-align: center">
+Our implementation of Iris contains four features, two hidden layers,
+and a logits output layer.
+</div>
+
+## Write an Input function
+
+Our custom Estimator implementation uses the same input function as our
+@{$get_started/premade_estimators$pre-made Estimator implementation}, from
+[`iris_data.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py).
+Namely:
+
+```python
+def train_input_fn(features, labels, batch_size):
+    """An input function for training"""
+    # Convert the inputs to a Dataset.
+    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
+
+    # Shuffle, repeat, and batch the examples.
+    dataset = dataset.shuffle(1000).repeat().batch(batch_size)
+
+    # Return the read end of the pipeline.
+    return dataset.make_one_shot_iterator().get_next()
+```
+
+This input function builds an input pipeline that yields batches of
+`(features, labels)` pairs, where `features` is a dictionary features.
+
+## Create feature columns
+
+As detailed in the @{$get_started/estimator$Premade Estimators} and
+@{$get_started/feature_columns$Feature Columns} chapters, you must define
+your model's feature columns to specify how the model should use each feature.
+Whether working with pre-made Estimators or custom Estimators, you define
+feature columns in the same fashion.
+
+The following code creates a simple `numeric_column` for each input feature,
+indicating that the value of the input feature should be used directly as an
+input to the model:
+
+```python
+# Feature columns describe how to use the input.
+my_feature_columns = []
+for key in train_x.keys():
+    my_feature_columns.append(tf.feature_column.numeric_column(key=key))
+```
+
+## Write a model function
+
+The model function we'll use has the following call signature:
+
+```python
+def my_model_fn(
+   features, # This is batch_features from input_fn
+   labels,   # This is batch_labels from input_fn
+   mode,     # An instance of tf.estimator.ModeKeys
+   params):  # Additional configuration
+```
+
+The first two arguments are the batches of features and labels returned from
+the input function; that is, `features` and `labels` are the handles to the
+data your model will use. The `mode` argument indicates whether the caller is
+requesting training, predicting, or evaluation.
+
+The caller may pass `params` to an Estimator's constructor. Any `params` passed
+to the constructor are in turn passed on to the `model_fn`. In
+[`custom_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/custom_estimator.py)
+the following lines create the estimator and set the params to configure the
+model. This configuration step is similar to how we configured the @{tf.estimator.DNNClassifier} in
+@{$get_started/premade_estimators}.
+
+```python
+classifier = tf.estimator.Estimator(
+    model_fn=my_model,
+    params={
+        'feature_columns': my_feature_columns,
+        # Two hidden layers of 10 nodes each.
+        'hidden_units': [10, 10],
+        # The model must choose between 3 classes.
+        'n_classes': 3,
+    })
+```
+
+To implement a typical model function, you must do the following:
+
+* (Define the model)[#define_the_model].
+* Specify additional calculations for each of
+  the [three different modes](#modes):
+  * [Predict](#predict)
+  * [Evaluate](#evaluate)
+  * [Train](#train)
+
+## Define the model
+
+The basic deep neural network model must define the following three sections:
+
+* An [input layer](https://developers.google.com/machine-learning/glossary/#input_layer)
+* One or more [hidden layers](https://developers.google.com/machine-learning/glossary/#hidden_layer)
+* An [output layer](https://developers.google.com/machine-learning/glossary/#output_layer)
+
+### Define the input layer
+
+Call @{tf.feature_column.input_layer} to convert your feature dictionary and
+feature columns into input for your model. For example:
+
+```python
+    # Use `input_layer` to apply the feature columns.
+    net = tf.feature_column.input_layer(features, params['feature_columns'])
+```
+
+The preceding line applies the transformations defined by your feature columns,
+creating the model's input layer.
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="height:260px"
+  alt="A diagram of the input layer, in this case a 1:1 mapping from raw-inputs to features."
+  src="../images/custom_estimators/input_layer.png">
+</div>
+
+
+### Hidden Layers
+
+If you are creating a deep neural network, you must define one or more hidden
+layers. The Layers API provides a rich set of functions to define all types of
+hidden layers, including convolutional, pooling, and dropout layers. For Iris,
+we're simply going to call @{tf.layers.dense} to create hidden layers, with
+dimensions defined by `params['hidden_layers']`. In a `dense` layer each node
+is connected to every node in the preceding layer.  Here's the relevant code:
+
+``` python
+    # Build the hidden layers, sized according to the 'hidden_units' param.
+    for units in params['hidden_units']:
+        net = tf.layers.dense(net, units=units, activation=tf.nn.relu)
+```
+
+* The `units` parameter defines the number of output neurons in a given layer.
+* The `activation` parameter defines the [activation function](https://developers.google.com/machine-learning/glossary/#a) —
+  [Relu](https://developers.google.com/machine-learning/glossary/#ReLU) in this
+  case.
+
+The variable `net` here signifies the current top layer of the network. During
+the first iteration, `net` signifies the input layer. On each loop iteration
+`tf.layers.dense` creates a new layer, which takes the previous layer's output
+as its input, using the variable `net`.
+
+After creating two hidden layers, our network looks as follows. For
+simplicity, the figure does not show all the units in each layer.
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="height:260px"
+  alt="The input layer with two hidden layers added."
+  src="../images/custom_estimators/add_hidden_layer.png">
+</div>
+
+Note that @{tf.layers.dense} provides many additional capabilities, including
+the ability to set a multitude of regularization parameters. For the sake of
+simplicity, though, we're going to simply accept the default values of the
+other parameters.
+
+### Output Layer
+
+We'll define the output layer by calling @{tf.layers.dense} yet again, this
+time without an activation function:
+
+```python
+    # Compute logits (1 per class).
+    logits = tf.layers.dense(net, params['n_classes'], activation=None)
+```
+
+Here, `net` signifies the final hidden layer. Therefore, the full set of layers
+is now connected as follows:
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="height:260px"
+  alt="A logit output layer connected to the top hidden layer"
+  src="../images/custom_estimators/add_logits.png">
+</div>
+<div style="text-align: center">
+The final hidden layer feeds into the output layer.
+</div>
+
+When defining an output layer, the `units` parameter specifies the number of
+outputs. So, by setting `units` to `params['n_classes']`, the model produces
+one output value per class. Each element of the output vector will contain the
+score, or "logit", calculated for the associated class of Iris: Setosa,
+Versicolor, or Virginica, respectively.
+
+Later on, these logits will be transformed into probabilities by the
+@{tf.nn.softmax} function.
+
+## Implement training, evaluation, and prediction {modes}
+
+The final step in creating a model function is to write branching code that
+implements prediction, evaluation, and training.
+
+The model function gets invoked whenever someone calls the Estimator's `train`,
+`evaluate`, or `predict` methods. Recall that the signature for the model
+function looks like this:
+
+``` python
+def my_model_fn(
+   features, # This is batch_features from input_fn
+   labels,   # This is batch_labels from input_fn
+   mode,     # An instance of tf.estimator.ModeKeys, see below
+   params):  # Additional configuration
+```
+
+Focus on that third argument, mode. As the following table shows, when someone
+calls `train`, `evaluate`, or `predict`, the Estimator framework invokes your model
+function with the mode parameter set as follows:
+
+| Estimator method                 |    Estimator Mode |
+|:---------------------------------|:------------------|
+|@{tf.estimator.Estimator.train$`train()`} |@{tf.estimator.ModeKeys.TRAIN$`ModeKeys.TRAIN`} |
+|@{tf.estimator.Estimator.evaluate$`evaluate()`}  |@{tf.estimator.ModeKeys.EVAL$`ModeKeys.EVAL`}      |
+|@{tf.estimator.Estimator.predict$`predict()`}|@{tf.estimator.ModeKeys.PREDICT$`ModeKeys.PREDICT`} |
+
+For example, suppose you instantiate a custom Estimator to generate an object
+named `classifier`. Then, you make the following call:
+
+``` python
+classifier = tf.estimator.Estimator(...)
+classifier.train(input_fn=lambda: my_input_fn(FILE_TRAIN, True, 500))
+```
+The Estimator framework then calls your model function with mode set to
+`ModeKeys.TRAIN`.
+
+Your model function must provide code to handle all three of the mode values.
+For each mode value, your code must return an instance of
+`tf.estimator.EstimatorSpec`, which contains the information the caller
+requires. Let's examine each mode.
+
+### Predict
+
+When the Estimator's `predict` method is called, the `model_fn` receives
+`mode = ModeKeys.PREDICT`. In this case, the model function must return a
+`tf.estimator.EstimatorSpec` containing the prediction.
+
+The model must have been trained prior to making a prediction. The trained model
+is stored on disk in the `model_dir` directory established when you
+instantiated the Estimator.
+
+The code to generate the prediction for this model looks as follows:
+
+```python
+# Compute predictions.
+predicted_classes = tf.argmax(logits, 1)
+if mode == tf.estimator.ModeKeys.PREDICT:
+    predictions = {
+        'class_ids': predicted_classes[:, tf.newaxis],
+        'probabilities': tf.nn.softmax(logits),
+        'logits': logits,
+    }
+    return tf.estimator.EstimatorSpec(mode, predictions=predictions)
+```
+The prediction dictionary contains everything that your model returns when run
+in prediction mode.
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="height:260px"
+  alt="Additional outputs added to the output layer."
+  src="../images/custom_estimators/full_network.png">
+</div>
+
+The `predictions` holds the following three key/value pairs:
+
+*   `class_ids` holds the class id (0, 1, or 2) representing the model's
+    prediction of the most likely species for this example.
+*   `probabilities` holds the three probabilities (in this example, 0.02, 0.95,
+    and 0.03)
+*   `logit` holds the raw logit values (in this example, -1.3, 2.6, and -0.9)
+
+We return that dictionary to the caller via the `predictions` parameter of the
+@{tf.estimator.EstimatorSpec}. The Estimator's
+@{tf.estimator.Estimator.predict$`predict`} method will yield these
+dictionaries.
+
+### Calculate the loss
+
+For both [training](#train) and [evaluation](#evaluate) we need to calculate the
+model's loss. This is the
+[objective](https://developers.google.com/machine-learning/glossary/#objective)
+that will be optimized.
+
+We can calculate the loss by calling @{tf.losses.sparse_softmax_cross_entropy}.
+The value returned by this function will be lowest, approximately 0,
+probability of the correct class (at index `label`) is near 1.0. The loss value
+returned is progressively larger as the probability of the correct class
+decreases.
+
+This function returns the average over the whole batch.
+
+```python
+# Compute loss.
+loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
+```
+
+### Evaluate
+
+When the Estimator's `evaluate` method is called, the `model_fn` receives
+`mode = ModeKeys.EVAL`. In this case, the model function must return a
+`tf.estimator.EstimatorSpec` containing the model's loss and optionally one
+or more metrics.
+
+Although returning metrics is optional, most custom Estimators do return at
+least one metric. TensorFlow provides a Metrics module @{tf.metrics} to
+calculate common metrics.  For brevity's sake, we'll only return accuracy. The
+@{tf.metrics.accuracy} function compares our predictions against the
+true values, that is, against the labels provided by the input function. The
+@{tf.metrics.accuracy} function requires the labels and predictions to have the
+same shape. Here's the call to @{tf.metrics.accuracy}:
+
+``` python
+# Compute evaluation metrics.
+accuracy = tf.metrics.accuracy(labels=labels,
+                               predictions=predicted_classes,
+                               name='acc_op')
+```
+
+The @{tf.estimator.EstimatorSpec$`EstimatorSpec`} returned for evaluation
+typically contains the following information:
+
+* `loss`, which is the model's loss
+* `eval_metric_ops`, which is an optional dictionary of metrics.
+
+So, we'll create a dictionary containing our sole metric. If we had calculated
+other metrics, we would have added them as additional key/value pairs to that
+same dictionary.  Then, we'll pass that dictionary in the `eval_metric_ops`
+argument of `tf.estimator.EstimatorSpec`. Here's the code:
+
+```python
+metrics = {'accuracy': accuracy}
+tf.summary.scalar('accuracy', accuracy[1])
+
+if mode == tf.estimator.ModeKeys.EVAL:
+    return tf.estimator.EstimatorSpec(
+        mode, loss=loss, eval_metric_ops=metrics)
+```
+
+The @{tf.summary.scalar} will make accuracy available to TensorBoard
+in both `TRAIN` and `EVAL` modes. (More on this later).
+
+### Train
+
+When the Estimator's `train` method is called, the `model_fn` is called
+with `mode = ModeKeys.TRAIN`. In this case, the model function must return an
+`EstimatorSpec` that contains the loss and a training operation.
+
+Building the training operation will require an optimizer. We will use
+@{tf.train.AdagradOptimizer} because we're mimicking the `DNNClassifier`, which
+also uses `Adagrad` by default. The `tf.train` package provides many other
+optimizers—feel free to experiment with them.
+
+Here is the code that builds the optimizer:
+
+``` python
+optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
+```
+
+Next, we build the training operation using the optimizer's
+@{tf.train.Optimizer.minimize$`minimize`} method on the loss we calculated
+earlier.
+
+The `minimize` method also takes a `global_step` parameter. TensorFlow uses this
+parameter to count the number of training steps that have been processed
+(to know when to end a training run). Furthermore, the `global_step` is
+essential for TensorBoard graphs to work correctly. Simply call
+@{tf.train.get_global_step} and pass the result to the `global_step`
+argument of `minimize`.
+
+Here's the code to train the model:
+
+``` python
+train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
+```
+
+The @{tf.estimator.EstimatorSpec$`EstimatorSpec`} returned for training
+must have the following fields set:
+
+* `loss`, which contains the value of the loss function.
+* `train_op`, which executes a training step.
+
+Here's our code to call `EstimatorSpec`:
+
+```python
+return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
+```
+
+The model function is now complete.
+
+## The custom Estimator
+
+Instantiate the custom Estimator through the Estimator base class as follows:
+
+```python
+    # Build 2 hidden layer DNN with 10, 10 units respectively.
+    classifier = tf.estimator.Estimator(
+        model_fn=my_model,
+        params={
+            'feature_columns': my_feature_columns,
+            # Two hidden layers of 10 nodes each.
+            'hidden_units': [10, 10],
+            # The model must choose between 3 classes.
+            'n_classes': 3,
+        })
+```
+Here the `params` dictionary serves the same purpose as the key-word
+arguments of `DNNClassifier`; that is, the `params` dictionary lets you
+configure your Estimator without modifying the code in the `model_fn`.
+
+The rest of the code to train, evaluate, and generate predictions using our
+Estimator is the same as in the
+@{$get_started/premade_estimators$Premade Estimators} chapter. For
+example, the following line will train the model:
+
+```python
+# Train the Model.
+classifier.train(
+    input_fn=lambda:iris_data.train_input_fn(train_x, train_y, args.batch_size),
+    steps=args.train_steps)
+```
+
+## TensorBoard
+
+You can view training results for your custom Estimator in TensorBoard. To see
+this reporting, start TensorBoard from your command line as follows:
+
+```bsh
+# Replace PATH with the actual path passed as model_dir
+tensorboard --logdir=PATH
+```
+
+Then, open TensorBoard by browsing to: [http://localhost:6006](http://localhost:6006)
+
+All the pre-made Estimators automatically log a lot of information to
+TensorBoard. With custom Estimators, however, TensorBoard only provides one
+default log (a graph of the loss) plus the information you explicitly tell
+TensorBoard to log. For the custom Estimator you just created, TensorBoard
+generates the following:
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="height:260px"
+  alt="Accuracy, steps/second, and loss 'scalar' graphs from tensorboard"
+  src="../images/custom_estimators/tensorboard.png">
+</div>
+<div style="text-align: center">
+TensorBoard displays three graphs.
+</div>
+
+In brief, here's what the three graphs tell you:
+
+* global_step/sec: A performance indicator showing how many batches (gradient
+  updates) we processed per second as the model trains.
+
+* loss: The loss reported.
+
+* accuracy: The accuracy is recorded by the following two lines:
+
+  * `eval_metric_ops={'my_accuracy': accuracy})`, during evaluation.
+  * `tf.summary.scalar('accuracy', accuracy[1])`, during training.
+
+These tensorboard graphs are one of the main reasons it's important to pass a
+`global_step` to your optimizer's `minimize` method. The model can't record
+the x-coordinate for these graphs without it.
+
+Note the following in the `my_accuracy` and `loss` graphs:
+
+* The orange line represents training.
+* The blue dot represents evaluation.
+
+During training, summaries (the orange line) are recorded periodically as
+batches are processed, which is why it becomes a graph spanning x-axis range.
+
+By contrast, evaluation produces only a single point on the graph for each call
+to `evaluate`. This point contains the average over the entire evaluation call.
+This has no width on the graph as it is evaluated entirely from the model state
+at a particular training step (from a single checkpoint).
+
+As suggested in the following figure, you may see and also selectively
+disable/enable the reporting using the controls on the left side.
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="margin:auto;display:block;"
+  alt="Check-boxes allowing the user to select which runs are shown."
+  src="../images/custom_estimators/select_run.jpg">
+</div>
+<div style="text-align: center">
+Enable or disable reporting.
+</div>
+
+
+## Summary
+
+Although pre-made Estimators can be an effective way to quickly create new
+models, you will often need the additional flexibility that custom Estimators
+provide. Fortunately, pre-made and custom Estimators follow the same
+programming model. The only practical difference is that you must write a model
+function for custom Estimators; everything else is the same.
+
+For more details, be sure to check out:
+
+* The
+[official TensorFlow implementation of MNIST](https://github.com/tensorflow/models/tree/master/official/mnist),
+which uses a custom estimator.
+
+* The TensorFlow
+[official models repository](https://github.com/tensorflow/models/tree/master/official),
+which contains more curated examples using custom estimators.
+
+* This [TensorBoard video](https://youtu.be/eBbEDRsCmv4), which introduces
+TensorBoard.
+
+
diff --git a/tensorflow/docs_src/get_started/datasets_quickstart.md b/tensorflow/docs_src/get_started/datasets_quickstart.md
new file mode 100644
index 0000000000000000000000000000000000000000..7daa08454c3743d736554395e7ecf52d97bebb53
--- /dev/null
+++ b/tensorflow/docs_src/get_started/datasets_quickstart.md
@@ -0,0 +1,398 @@
+# Datasets Quick Start
+
+The @{tf.data} module contains a collection of classes that allows you to
+easily load data, manipulate it, and pipe it into your model. This document
+introduces the API by walking through two simple examples:
+
+* Reading in-memory data from numpy arrays.
+* Reading lines from a csv file.
+
+<!-- TODO(markdaoust): Add links to an example reading from multiple-files
+(image_retraining), and a from_generator example. -->
+
+## Basic input
+
+Taking slices from an array is the simplest way to get started with `tf.data`.
+
+The @{$get_started/premade_estimators$Premade Estimators} chapter describes
+the following `train_input_fn`, from
+[`iris_data.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py),
+to pipe the data into the Estimator:
+
+``` python
+def train_input_fn(features, labels, batch_size):
+    """An input function for training"""
+    # Convert the inputs to a Dataset.
+    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
+
+    # Shuffle, repeat, and batch the examples.
+    dataset = dataset.shuffle(1000).repeat().batch(batch_size)
+
+    # Build the Iterator, and return the read end of the pipeline.
+    return dataset.make_one_shot_iterator().get_next()
+```
+
+Let's look at this more closely.
+
+### Arguments
+
+This function expects three arguments. Arguments expecting an "array" can
+accept nearly anything that can be converted to an array with `numpy.array`.
+One exception is
+[`tuple`](https://docs.python.org/3/tutorial/datastructures.html#tuples-and-sequences)
+which has special meaning for `Datasets`.
+
+* `features`: A `{'feature_name':array}` dictionary (or
+  [`DataFrame`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html))
+  containing the raw input features.
+* `labels` : An array containing the
+  [label](https://developers.google.com/machine-learning/glossary/#label)
+  for each example.
+* `batch_size` : An integer indicating the desired batch size.
+
+In [`premade_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py)
+we retrieved the Iris data using the `iris_data.load_data()` function.
+You can run it, and unpack the results as follows:
+
+``` python
+import iris_data
+
+# Fetch the data
+train, test = iris_data.load_data()
+features, labels = train
+```
+
+Then we passed this data to the input function, with a line similar to this:
+
+``` python
+batch_size=100
+iris_data.train_input_fn(features, labels, batch_size)
+```
+
+Let's walk through the `train_input_fn()`.
+
+### Slices
+
+In the simplest cases, @{tf.data.Dataset.from_tensor_slices} function takes an
+array and returns a @{tf.data.Dataset} representing slices of the array. For
+example, an array containing the @{$mnist/beginners$mnist training data}
+has a shape of `(60000, 28, 28)`. Passing this to `from_tensor_slices` returns
+a `Dataset` object containing 60000 slices, each one a 28x28 image.
+
+The code that returns this `Dataset` is as follows:
+
+``` python
+train, test = tf.keras.datasets.mnist.load_data()
+mnist_x, mnist_y = train
+
+mnist_ds = tf.data.Dataset.from_tensor_slices(mnist_x)
+print(mnist_ds)
+```
+
+This will print the following line, showing the @{$programmers_guide/tensors#shapes$shapes} and @{$programmers_guide/tensors#data_types$types} of the items in
+the dataset. Note that the dataset does not know how many items it contains.
+
+``` None
+<TensorSliceDataset shapes: (28,28), types: tf.uint8>
+```
+
+The dataset above represents a collection of simple arrays, but datasets are
+much more powerful than this. Datasets transparently handle any nested
+combination of dictionaries or tuples. For example, ensuring that `features`
+is a standard dictionary, you can then convert the dictionary of arrays to
+a `Dataset` of dictionaries as follows:
+
+``` python
+dataset = tf.data.Dataset.from_tensor_slices(dict(features))
+print(dataset)
+```
+``` None
+<TensorSliceDataset
+
+  shapes: {
+    SepalLength: (), PetalWidth: (),
+    PetalLength: (), SepalWidth: ()},
+
+  types: {
+      SepalLength: tf.float64, PetalWidth: tf.float64,
+      PetalLength: tf.float64, SepalWidth: tf.float64}
+>
+```
+
+Here we see that when a `Dataset` contains structured elements, the `shapes`
+and `types` of the `Dataset` take on the same structure. This dataset contains
+dictionaries of @{$programmers_guide/tensors#rank$scalars}, all of type
+`tf.float64`.
+
+The first line of `train_input_fn` uses the same functionality, but adds
+another level of structure. It creates a dataset containing
+`(features, labels)` pairs.
+
+The following code shows that the label is a scalar with type `int64`:
+
+``` python
+# Convert the inputs to a Dataset.
+dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
+print(dataset)
+```
+```
+<TensorSliceDataset
+    shapes: (
+        {
+          SepalLength: (), PetalWidth: (),
+          PetalLength: (), SepalWidth: ()},
+        ()),
+
+    types: (
+        {
+          SepalLength: tf.float64, PetalWidth: tf.float64,
+          PetalLength: tf.float64, SepalWidth: tf.float64},
+        tf.int64)>
+```
+
+### Manipulation
+
+Currently the `Dataset` would iterate over the data once, in a fixed order, and
+only produce a single element at a time. It needs further processing before it
+can be used for training. Fortunately, the `tf.data.Dataset` class provides
+methods to better prepare the data for training. The next line of the input
+function takes advantage of several of these methods:
+
+``` python
+# Shuffle, repeat, and batch the examples.
+dataset = dataset.shuffle(1000).repeat().batch(batch_size)
+```
+
+The @{tf.data.Dataset.shuffle$`shuffle`} method uses a fixed-size buffer to
+shuffle the items as they pass through. Setting a `buffer_size` greater than
+the number of examples in the `Dataset` ensures that the data is completely
+shuffled. The Iris data set only contains 150 examples.
+
+The @{tf.data.Dataset.repeat$`repeat`} method has the `Dataset` restart when
+it reaches the end. To limit the number of epochss, set the `count` argument.
+
+The @{tf.data.Dataset.repeat$`batch`} method collects a number of examples and
+stacks them, to create batches. This adds a dimension to their shape. The new
+dimension is added as the first dimension. The following code uses
+the `batch` method on the MNIST `Dataset`, from earlier. This results in a
+`Dataset` containing 3D arrays representing stacks of `(28,28)` images:
+
+``` python
+print(mnist_ds.batch(100))
+```
+
+``` none
+<BatchDataset
+  shapes: (?, 28, 28),
+  types: tf.uint8>
+```
+Note that the dataset has an unknown batch size because the last batch will
+have fewer elements.
+
+In `train_input_fn`, after batching the `Dataset` contains 1D vectors of
+elements where each scalar was previously:
+
+```python
+print(dataset)
+```
+```
+<TensorSliceDataset
+    shapes: (
+        {
+          SepalLength: (?,), PetalWidth: (?,),
+          PetalLength: (?,), SepalWidth: (?,)},
+        (?,)),
+
+    types: (
+        {
+          SepalLength: tf.float64, PetalWidth: tf.float64,
+          PetalLength: tf.float64, SepalWidth: tf.float64},
+        tf.int64)>
+```
+
+
+### Return
+
+<!-- TODO(markdaoust) This line can be simplified to "return dataset" -->
+
+The `train`, `evaluate`, and `predict` methods of every Estimator require
+input functions to return a `(features, label)` pair containing
+@{$programmers_guide/tensors$tensorflow tensors}. The `train_input_fn` uses
+the following line to convert the Dataset into the expected format:
+
+```python
+# Build the Iterator, and return the read end of the pipeline.
+features_result, labels_result = dataset.make_one_shot_iterator().get_next()
+```
+
+The result is a structure of @{$programmers_guide/tensors$TensorFlow tensors},
+matching the layout of the items in the `Dataset`.
+For an introduction to what these objects are and how to work with them,
+see @{$get_started/get_started}.
+
+``` python
+print((features_result, labels_result))
+```
+
+```None
+({
+    'SepalLength': <tf.Tensor 'IteratorGetNext:2' shape=(?,) dtype=float64>,
+    'PetalWidth': <tf.Tensor 'IteratorGetNext:1' shape=(?,) dtype=float64>,
+    'PetalLength': <tf.Tensor 'IteratorGetNext:0' shape=(?,) dtype=float64>,
+    'SepalWidth': <tf.Tensor 'IteratorGetNext:3' shape=(?,) dtype=float64>},
+Tensor("IteratorGetNext_1:4", shape=(?,), dtype=int64))
+```
+
+## Reading a CSV File
+
+The most common real-world use case for the `Dataset` class is to stream data
+from files on disk. The @{tf.data} module includes a variety of
+file readers. Let's see how parsing the Iris dataset from the csv file looks
+using a `Dataset`.
+
+The following call to the `iris_data.maybe_download` function downloads the
+data if necessary, and returns the pathnames of the resulting files:
+
+``` python
+import iris_data
+train_path, test_path = iris_data.maybe_download()
+```
+
+The [`iris_data.csv_input_fn`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py)
+function contains an alternative implementation that parses the csv files using
+a `Dataset`.
+
+Let's look at how to build an Estimator-compatible input function that reads
+from the local files.
+
+### Build the `Dataset`
+
+We start by building a @{tf.data.TextLineDataset$`TextLineDataset`} object to
+read the file one line at a time. Then, we call the
+@{tf.data.Dataset.skip$`skip`} method to skip over the first line of the file, which contains a header, not an example:
+
+``` python
+ds = tf.data.TextLineDataset(train_path).skip(1)
+```
+
+### Build a csv line parser
+
+Ultimately we will need to parse each of the lines in the dataset, to
+produce the necessary `(features, label)` pairs.
+
+We will start by building a function to parse a single line.
+
+The following `iris_data.parse_line` function acomplishes this taks using the
+@{tf.decode_csv} function, and some simple python code:
+
+We must parse each of the lines in the dataset in order to generate the
+necessary `(features, label)` pairs. The following `_parse_line` function
+calls @{tf.decode_csv} to parse a single line into its features
+and the label. Since Estimators require that features be represented as a
+dictionary, we rely on Python's built-in `dict` and `zip` functions to build
+that dictionary.  The feature names are the keys of that dictionary.
+We then then call the dictionary's `pop` method to remove the label field from
+the features dictionary:
+
+``` python
+# Metadata describing the text columns
+COLUMNS = ['SepalLength', 'SepalWidth',
+           'PetalLength', 'PetalWidth',
+           'label']
+FIELD_DEFAULTS = [[0.0], [0.0], [0.0], [0.0], [0]]
+def _parse_line(line):
+    # Decode the line into its fields
+    fields = tf.decode_csv(line, FIELD_DEFAULTS)
+
+    # Pack the result into a dictionary
+    features = dict(zip(COLUMNS,fields))
+
+    # Separate the label from the features
+    label = features.pop('label')
+
+    return features, label
+```
+
+### Parse the lines
+
+Datasets have many methods for manipulating the data while it is being piped
+to a model. The most heavily-used method is @{tf.data.Dataset.map$`map`}, which
+applies a transformation to each element of the `Dataset`.
+
+The `map` method takes a `map_func` argument that describes how each item in the
+`Dataset` should be transformed.
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/datasets/map.png">
+</div>
+<div style="text-align: center">
+The @{tf.data.Dataset.map$`map`} method applies the `map_func` to
+transform each item in the <code>Dataset</code>.
+</div>
+
+So to parse the lines as they are streamed out of the csv file, we pass our
+`_parse_line` function to the `map` method:
+
+``` python
+ds = ds.map(_parse_line)
+print(ds)
+```
+``` None
+<MapDataset
+shapes: (
+    {SepalLength: (), PetalWidth: (), ...},
+    ()),
+types: (
+    {SepalLength: tf.float32, PetalWidth: tf.float32, ...},
+    tf.int32)>
+```
+
+Now instead of simple scalar strings, the dataset contains `(features, label)`
+pairs.
+
+the remainder of the `iris_data.csv_input_fn` function is identical
+to `iris_data.train_input_fn` which was covered in the in the
+[Basic input](#basic_input) section.
+
+### Try it out
+
+This function can be used as a replacement for
+`iris_data.train_input_fn`. It can be used to feed an estimator as follows:
+
+``` python
+train_path, test_path = iris_data.maybe_download()
+
+# All the inputs are numeric
+feature_columns = [
+    tf.feature_column.numeric_column(name)
+    for name in iris_data.CSV_COLUMN_NAMES[:-1]]
+
+# Build the estimator
+est = tf.estimator.LinearClassifier(feature_columns,
+                                    n_classes=3)
+# Train the estimator
+batch_size = 100
+est.train(
+    steps=1000,
+    input_fn=lambda : iris_data.csv_input_fn(train_path, batch_size))
+```
+
+Estimators expect an `input_fn` to take no arguments. To work around this
+restriction, we use `lambda` to capture the arguments and provide the expected
+interface.
+
+## Summary
+
+The `tf.data` module provides a collection of classes and functions for easily
+reading data from a variety of sources. Furthermore, `tf.data` has simple
+powerful methods for applying a wide variety of standard and custom
+transformations.
+
+Now that you have the basic idea of how to efficiently load data for an
+Estimator. The next step is to learn how to build your own custom estimator in:
+
+* @{$get_started/custom_estimators}
+
+If you'd like to learn more about additional functionality of `Datasets` see:
+
+* @{$programmers_guide/datasets}
diff --git a/tensorflow/docs_src/get_started/feature_columns.md b/tensorflow/docs_src/get_started/feature_columns.md
new file mode 100644
index 0000000000000000000000000000000000000000..e034483508e9f156c4665995ce205fef4795ac27
--- /dev/null
+++ b/tensorflow/docs_src/get_started/feature_columns.md
@@ -0,0 +1,570 @@
+# Feature Columns
+
+This document details feature columns. Think of **feature columns** as the
+intermediaries between raw data and Estimators. Feature columns are very rich,
+enabling you to transform a diverse range of raw data into formats that
+Estimators can use, allowing easy experimentation.
+
+In @{$get_started/estimator$Premade Estimators}, we used the premade Estimator,
+@{tf.estimator.DNNClassifier$`DNNClassifier`} to train a model to predict
+different types of Iris flowers from four input features. That example created
+only numerical feature columns (of type @{tf.feature_column.numeric_column}).
+Although numerical feature columns model the lengths of petals and sepals
+effectively, real world data sets contain all kinds of features, many of which
+are non-numerical.
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/feature_cloud.jpg">
+</div>
+<div style="text-align: center">
+Some real-world features (such as, longitude) are numerical, but many are not.
+</div>
+
+## Input to a Deep Neural Network
+
+What kind of data can a deep neural network operate on? The answer
+is, of course, numbers (for example, `tf.float32`). After all, every neuron in
+a neural network performs multiplication and addition operations on weights and
+input data. Real-life input data, however, often contains non-numerical
+(categorical) data. For example, consider a `product_class` feature that can
+contain the following three non-numerical values:
+
+* `kitchenware`
+* `electronics`
+* `sports`
+
+ML models generally represent categorical values as simple vectors in which a
+1 represents the presence of a value and a 0 represents the absence of a value.
+For example, when `product_class` is set to `sports`, an ML model would usually
+represent `product_class` as  `[0, 0, 1]`, meaning:
+
+* `0`: `kitchenware` is absent
+* `0`: `electronics` is absent
+* `1`: `sports` is present
+
+So, although raw data can be numerical or categorical, an ML model represents
+all features as numbers.
+
+## Feature Columns
+
+As the following figure suggests, you specify the input to a model through the
+`feature_columns` argument of an Estimator (`DNNClassifier` for Iris).
+Feature Columns bridge input data (as returned by `input_fn`) with your model.
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/inputs_to_model_bridge.jpg">
+</div>
+<div style="text-align: center">
+Feature columns bridge raw data with the data your model needs.
+</div>
+
+To create feature columns, call functions from the
+@{tf.feature_column} module. This document explains nine of the functions in
+that module. As the following figure shows, all nine functions return either a
+Categorical-Column or a Dense-Column object, except `bucketized_column`, which
+inherits from both classes:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/some_constructors.jpg">
+</div>
+<div style="text-align: center">
+Feature column methods fall into two main categories and one hybrid category.
+</div>
+
+Let's look at these functions in more detail.
+
+### Numeric column
+
+The Iris classifier calls the @{tf.feature_column.numeric_column} function for
+all input features:
+
+  * `SepalLength`
+  * `SepalWidth`
+  * `PetalLength`
+  * `PetalWidth`
+
+Although `tf.numeric_column` provides optional arguments, calling
+`tf.numeric_column` without any arguments, as follows, is a fine way to specify
+a numerical value with the default data type (`tf.float32`) as input to your
+model:
+
+```python
+# Defaults to a tf.float32 scalar.
+numeric_feature_column = tf.feature_column.numeric_column(key="SepalLength")
+```
+
+To specify a non-default numerical data type, use the `dtype` argument. For
+example:
+
+``` python
+# Represent a tf.float64 scalar.
+numeric_feature_column = tf.feature_column.numeric_column(key="SepalLength",
+                                                          dtype=tf.float64)
+```
+
+By default, a numeric column creates a single value (scalar). Use the shape
+argument to specify another shape. For example:
+
+<!--TODO(markdaoust) link to full example-->
+```python
+# Represent a 10-element vector in which each cell contains a tf.float32.
+vector_feature_column = tf.feature_column.numeric_column(key="Bowling",
+                                                         shape=10)
+
+# Represent a 10x5 matrix in which each cell contains a tf.float32.
+matrix_feature_column = tf.feature_column.numeric_column(key="MyMatrix",
+                                                         shape=[10,5])
+```
+### Bucketized column
+
+Often, you don't want to feed a number directly into the model, but instead
+split its value into different categories based on numerical ranges.  To do so,
+create a @{tf.feature_column.bucketized_column$bucketized column}. For
+example, consider raw data that represents the year a house was built. Instead
+of representing that year as a scalar numeric column, we could split the year
+into the following four buckets:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/bucketized_column.jpg">
+</div>
+<div style="text-align: center">
+Dividing year data into four buckets.
+</div>
+
+The model will represent the buckets as follows:
+
+|Date Range |Represented as... |
+|:----------|:-----------------|
+|< 1960               | [1, 0, 0, 0] |
+|>= 1960 but < 1980   | [0, 1, 0, 0] |
+|>= 1980 but < 2000   | [0, 0, 1, 0] |
+|> 2000               | [0, 0, 0, 1] |
+
+Why would you want to split a number—a perfectly valid input to your
+model—into a categorical value? Well, notice that the categorization splits a
+single input number into a four-element vector. Therefore, the model now can
+learn _four individual weights_ rather than just one; four weights creates a
+richer model than one weight. More importantly, bucketizing enables the model
+to clearly distinguish between different year categories since only one of the
+elements is set (1) and the other three elements are cleared (0). When we just
+use a single number (a year) as input, the model can only learn a linear
+relationship. So, bucketing provides the model with additional flexibility that
+the model can use to learn.
+
+The following code demonstrates how to create a bucketized feature:
+
+<!--TODO(markdaoust) link to full example - housing price grid?-->
+```python
+# First, convert the raw input to a numeric column.
+numeric_feature_column = tf.feature_column.numeric_column("Year")
+
+# Then, bucketize the numeric column on the years 1960, 1980, and 2000.
+bucketized_feature_column = tf.feature_column.bucketized_column(
+    source_column = numeric_feature_column,
+    boundaries = [1960, 1980, 2000])
+```
+Note that specifying a _three_-element boundaries vector creates a
+_four_-element bucketized vector.
+
+
+### Categorical identity column
+
+**Categorical identity columns** can be seen as a special case of bucketized
+columns. In traditional bucketized columns, each bucket represents a range of
+values (for example, from 1960 to 1979). In a categorical identity column, each
+bucket represents a single, unique integer. For example, let's say you want to
+represent the integer range `[0, 4)`.  That is, you want to represent the
+integers 0, 1, 2, or 3. In this case, the categorical identity mapping looks
+like this:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/categorical_column_with_identity.jpg">
+</div>
+<div style="text-align: center">
+A categorical identity column mapping. Note that this is a one-hot
+encoding, not a binary numerical encoding.
+</div>
+
+As with bucketized columns, a model can learn a separate weight for each class
+in a categorical identity column. For example, instead of using a string to
+represent the `product_class`, let's represent each class with a unique integer
+value. That is:
+
+* `0="kitchenware"`
+* `1="electronics"`
+* `2="sport"`
+
+Call @{tf.feature_column.categorical_column_with_identity} to implement a
+categorical identity column. For example:
+
+``` python
+# Create categorical output for an integer feature named "my_feature_b",
+# The values of my_feature_b must be >= 0 and < num_buckets
+identity_feature_column = tf.feature_column.categorical_column_with_identity(
+    key='my_feature_b',
+    num_buckets=4) # Values [0, 4)
+
+# In order for the preceding call to work, the input_fn() must return
+# a dictionary containing 'my_feature_b' as a key. Furthermore, the values
+# assigned to 'my_feature_b' must belong to the set [0, 4).
+def input_fn():
+    ...
+    return ({ 'my_feature_a':[7, 9, 5, 2], 'my_feature_b':[3, 1, 2, 2] },
+            [Label_values])
+```
+
+### Categorical vocabulary column
+
+We cannot input strings directly to a model. Instead, we must first map strings
+to numeric or categorical values. Categorical vocabulary columns provide a good
+way to represent strings as a one-hot vector. For example:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/categorical_column_with_vocabulary.jpg">
+</div>
+<div style="text-align: center">
+Mapping string values to vocabulary columns.
+</div>
+
+As you can see, categorical vocabulary columns are kind of an enum version of
+categorical identity columns. TensorFlow provides two different functions to
+create categorical vocabulary columns:
+
+* @{tf.feature_column.categorical_column_with_vocabulary_list}
+* @{tf.feature_column.categorical_column_with_vocabulary_file}
+
+`categorical_column_with_vocabulary_list` maps each string to an integer based
+on an explicit vocabulary list. For example:
+
+```python
+# Given input "feature_name_from_input_fn" which is a string,
+# create a categorical feature by mapping the input to one of
+# the elements in the vocabulary list.
+vocabulary_feature_column =
+    tf.feature_column.categorical_column_with_vocabulary_list(
+        key="a feature returned by input_fn()",
+        vocabulary_list=["kitchenware", "electronics", "sports"])
+```
+
+The preceding function is pretty straightforward, but it has a significant
+drawback. Namely, there's way too much typing when the vocabulary list is long.
+For these cases, call
+`tf.feature_column.categorical_column_with_vocabulary_file` instead, which lets
+you place the vocabulary words in a separate file. For example:
+
+```python
+
+# Given input "feature_name_from_input_fn" which is a string,
+# create a categorical feature to our model by mapping the input to one of
+# the elements in the vocabulary file
+vocabulary_feature_column =
+    tf.feature_column.categorical_column_with_vocabulary_file(
+        key="a feature returned by input_fn()",
+        vocabulary_file="product_class.txt",
+        vocabulary_size=3)
+```
+
+`product_class.txt` should contain one line for each vocabulary element. In our
+case:
+
+```None
+kitchenware
+electronics
+sports
+```
+
+### Hashed Column
+
+So far, we've worked with a naively small number of categories. For example,
+our product_class example has only 3 categories. Often though, the number of
+categories can be so big that it's not possible to have individual categories
+for each vocabulary word or integer because that would consume too much memory.
+For these cases, we can instead turn the question around and ask, "How many
+categories am I willing to have for my input?"  In fact, the
+@{tf.feature_column.categorical_column_with_hash_bucket} function enables you
+to specify the number of categories. For this type of feature column the model
+calculates a hash value of the input, then puts it into one of
+the `hash_bucket_size` categories using the modulo operator, as in the following
+pseudocode:
+
+```python
+# pseudocode
+feature_id = hash(raw_feature) % hash_buckets_size
+```
+
+The code to create the `feature_column` might look something like this:
+
+``` python
+hashed_feature_column =
+    tf.feature_column.categorical_column_with_hash_bucket(
+        key = "some_feature",
+        hash_buckets_size = 100) # The number of categories
+```
+At this point, you might rightfully think: "This is crazy!" After all, we are
+forcing the different input values to a smaller set of categories. This means
+that two probably unrelated inputs will be mapped to the same
+category, and consequently mean the same thing to the neural network. The
+following figure illustrates this dilemma, showing that kitchenware and sports
+both get assigned to category (hash bucket) 12:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/hashed_column.jpg">
+</div>
+<div style="text-align: center">
+Representing data with hash buckets.
+</div>
+
+As with many counterintuitive phenomena in machine learning, it turns out that
+hashing often works well in practice. That's because hash categories provide
+the model with some separation. The model can use additional features to further
+separate kitchenware from sports.
+
+### Crossed column
+
+Combining features into a single feature, better known as
+[feature crosses](https://developers.google.com/machine-learning/glossary/#feature_cross),
+enables the model to learn separate weights for each combination of
+features.
+
+More concretely, suppose we want our model to calculate real estate prices in
+Atlanta, GA. Real-estate prices within this city vary greatly depending on
+location. Representing latitude and longitude as separate features isn't very
+useful in identifying real-estate location dependencies; however, crossing
+latitude and longitude into a single feature can pinpoint locations. Suppose we
+represent Atlanta as a grid of 100x100 rectangular sections, identifying each
+of the 10,000 sections by a feature cross of latitude and longitude. This
+feature cross enables the model to train on pricing conditions related to each
+individual section, which is a much stronger signal than latitude and longitude
+alone.
+
+The following figure shows our plan, with the latitude & longitude values for
+the corners of the city in red text:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/Atlanta.jpg">
+</div>
+<div style="text-align: center">
+Map of Atlanta. Imagine this map divided into 10,000 sections of
+equal size.
+</div>
+
+For the solution, we used a combination of the `bucketized_column` we looked at
+earlier, with the @{tf.feature_column.crossed_column} function.
+
+<!--TODO(markdaoust) link to full example-->
+
+``` python
+def make_dataset(latitude, longitude, labels):
+    assert latitude.shape == longitude.shape == labels.shape
+
+    features = {'latitude': latitude.flatten(),
+                'longitude': longitude.flatten()}
+    labels=labels.flatten()
+
+    return tf.data.Dataset.from_tensor_slices((features, labels))
+
+
+# Bucketize the latitude and longitude usig the `edges`
+latitude_bucket_fc = tf.feature_column.bucketized_column(
+    tf.feature_column.numeric_column('latitude'),
+    list(atlanta.latitude.edges))
+
+longitude_bucket_fc = tf.feature_column.bucketized_column(
+    tf.feature_column.numeric_column('longitude'),
+    list(atlanta.longitude.edges))
+
+# Cross the bucketized columns, using 5000 hash bins.
+crossed_lat_lon_fc = tf.feature_column.crossed_column(
+    [latitude_bucket_fc, longitude_bucket_fc], 5000)
+
+fc = [
+    latitude_bucket_fc,
+    longitude_bucket_fc,
+    crossed_lat_lon_fc]
+
+# Build and train the Estimator.
+est = tf.estimator.LinearRegressor(fc, ...)
+```
+
+You may create a feature cross from either of the following:
+
+* Feature names; that is, names from the `dict` returned from `input_fn`.
+* Any categorical column, except `categorical_column_with_hash_bucket`
+  (since `crossed_column` hashes the input).
+
+When the feature columns `latitude_bucket_fc` and `longitude_bucket_fc` are
+crossed, TensorFlow will create `(latitude_fc, longitude_fc)` pairs for each
+example. This would produce a full grid of possibilities as follows:
+
+``` None
+ (0,0),  (0,1)...  (0,99)
+ (1,0),  (1,1)...  (1,99)
+   ...     ...       ...
+(99,0), (99,1)...(99, 99)
+```
+
+Except that a full grid would only be tractable for inputs with limited
+vocabularies. Instead of building this, potentially huge, table of inputs,
+the `crossed_column` only builds the number requested by the `hash_bucket_size`
+argument. The feature column assigns an example to a index by running a hash
+function on the tuple of inputs, followed by a modulo operation with
+`hash_bucket_size`.
+
+As discussed earlier, performing the
+hash and modulo function limits the number of categories, but can cause category
+collisions; that is, multiple (latitude, longitude) feature crosses will end
+up in the same hash bucket. In practice though, performing feature crosses
+still adds significant value to the learning capability of your models.
+
+Somewhat counterintuitively, when creating feature crosses, you typically still
+should include the original (uncrossed) features in your model (as in the
+preceding code snippet). The independent latitude and longitude features help the
+model distinguish between examples where a hash collision has occurred in the
+crossed feature.
+
+## Indicator and embedding columns
+
+Indicator columns and embedding columns never work on features directly, but
+instead take categorical columns as input.
+
+When using an indicator column, we're telling TensorFlow to do exactly what
+we've seen in our categorical product_class example. That is, an
+**indicator column** treats each category as an element in a one-hot vector,
+where the matching category has value 1 and the rest have 0s:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/categorical_column_with_identity.jpg">
+</div>
+<div style="text-align: center">
+Representing data in indicator columns.
+</div>
+
+Here's how you create an indicator column by calling
+@{tf.feature_column.indicator_column}:
+
+``` python
+categorical_column = ... # Create any type of categorical column.
+
+# Represent the categorical column as an indicator column.
+indicator_column = tf.feature_column.indicator_column(categorical_column)
+```
+
+Now, suppose instead of having just three possible classes, we have a million.
+Or maybe a billion. For a number of reasons, as the number of categories grow
+large, it becomes infeasible to train a neural network using indicator columns.
+
+We can use an embedding column to overcome this limitation. Instead of
+representing the data as a one-hot vector of many dimensions, an
+**embedding column** represents that data as a lower-dimensional, ordinary
+vector in which each cell can contain any number, not just 0 or 1. By
+permitting a richer palette of numbers for every cell, an embedding column
+contains far fewer cells than an indicator column.
+
+Let's look at an example comparing indicator and embedding columns. Suppose our
+input examples consists of different words from a limited palette of only 81
+words. Further suppose that the data set provides provides the following input
+words in 4 separate examples:
+
+* `"dog"`
+* `"spoon"`
+* `"scissors"`
+* `"guitar"`
+
+In that case, the following figure illustrates the processing path for
+embedding columns or indicator columns.
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/embedding_vs_indicator.jpg">
+</div>
+<div style="text-align: center">
+An embedding column stores categorical data in a lower-dimensional
+vector than an indicator column. (We just placed random numbers into the
+embedding vectors; training determines the actual numbers.)
+</div>
+
+When an example is processed, one of the `categorical_column_with...` functions
+maps the example string to a numerical categorical value. For example, a
+function maps "spoon" to `[32]`. (The 32 comes from our imagination—the actual
+values depend on the mapping function.) You may then represent these numerical
+categorical values in either of the following two ways:
+
+* As an indicator column. A function converts each numeric categorical value
+  into an 81-element vector (because our palette consists of 81 words), placing
+  a 1 in the index of the categorical value (0, 32, 79, 80) and a 0 in all the
+  other positions.
+
+* As an embedding column. A function uses the numerical categorical values
+  `(0, 32, 79, 80)` as indices to a lookup table. Each slot in that lookup table
+  contains a 3-element vector.
+
+How do the values in the embeddings vectors magically get assigned? Actually,
+the assignments happen during training. That is, the model learns the best way
+to map your input numeric categorical values to the embeddings vector value in
+order to solve your problem. Embedding columns increase your model's
+capabilities, since an embeddings vector learns new relationships between
+categories from the training data.
+
+Why is the embedding vector size 3 in our example? Well, the following "formula"
+provides a general rule of thumb about the number of embedding dimensions:
+
+```python
+embedding_dimensions =  number_of_categories**0.25
+```
+
+That is, the embedding vector dimension should be the 4th root of the number of
+categories. Since our vocabulary size in this example is 81, the recommended
+number of dimensions is 3:
+
+``` python
+3 =  81**0.25
+```
+Note that this is just a general guideline; you can set the number of embedding
+dimensions as you please.
+
+Call @{tf.feature_column.embedding_column} to create an `embedding_column` as
+suggested by the following snippet:
+
+``` python
+categorical_column = ... # Create any categorical column
+
+# Represent the categorical column as an embedding column.
+# This means creating a one-hot vector with one element for each category.
+embedding_column = tf.feature_column.embedding_column(
+    categorical_column=categorical_column,
+    dimension=dimension_of_embedding_vector)
+```
+
+@{$programmers_guide/embedding$Embeddings} is a significant topic within machine
+learning. This information was just to get you started using them as feature
+columns.
+
+## Passing feature columns to Estimators
+
+As the following list indicates, not all Estimators permit all types of
+`feature_columns` argument(s):
+
+* @{tf.estimator.LinearClassifier$`LinearClassifier`} and
+  @{tf.estimator.LinearRegressor$`LinearRegressor`}: Accept all types of
+  feature column.
+* @{tf.estimator.DNNClassifier$`DNNClassifier`} and
+  @{tf.estimator.DNNRegressor$`DNNRegressor`}: Only accept dense columns. Other
+  column types must be wrapped in either an `indicator_column` or
+  `embedding_column`.
+* @{tf.estimator.DNNLinearCombinedClassifier$`DNNLinearCombinedClassifier`} and
+  @{tf.estimator.DNNLinearCombinedRegressor$`DNNLinearCombinedRegressor`}:
+    * The `linear_feature_columns` argument accepts any feature column type.
+    * The `dnn_feature_columns` argument only accepts dense columns.
+
+## Other Sources
+
+For more examples on feature columns, view the following:
+
+* The @{$wide_and_deep$Wide & Deep Tutorial}
+* [Examples](https://github.com/tensorflow/models/tree/master/samples/cookbook/regression)
+  of DNNs and linear models that use feature columns.
+
+To learn more about embeddings, see the following:
+
+* [Deep Learning, NLP, and representations](http://colah.github.io/posts/2014-07-NLP-RNNs-Representations/)
+  (Chris Olah's blog)
+* The TensorFlow [Embedding Projector](http://projector.tensorflow.org)
diff --git a/tensorflow/docs_src/get_started/get_started.md b/tensorflow/docs_src/get_started/get_started.md
index be14ab4026873b2874d47d60d50ff8121b8c4844..231108215ac73bc9ab87a896b3441a7da5f2b507 100644
--- a/tensorflow/docs_src/get_started/get_started.md
+++ b/tensorflow/docs_src/get_started/get_started.md
@@ -330,8 +330,8 @@ When run, it produces
 W: [-0.9999969] b: [ 0.99999082] loss: 5.69997e-11
 ```
 
-Notice that the loss is a very small number (very close to zero). If you run 
-this program, your loss may not be exactly the same as the aforementioned loss 
+Notice that the loss is a very small number (very close to zero). If you run
+this program, your loss may not be exactly the same as the aforementioned loss
 because the model is initialized with pseudorandom values.
 
 This more complicated program can still be visualized in TensorBoard
diff --git a/tensorflow/docs_src/get_started/input_fn.md b/tensorflow/docs_src/get_started/input_fn.md
index f0dcdc47ff1fd70bc8fce670a51d0cef8234e4ba..24bfdbdd2e91a6d87a5ab1ec2ba264d90ef8e148 100644
--- a/tensorflow/docs_src/get_started/input_fn.md
+++ b/tensorflow/docs_src/get_started/input_fn.md
@@ -292,7 +292,7 @@ prediction_set = pd.read_csv("boston_predict.csv", skipinitialspace=True,
 Next, create a list of `FeatureColumn`s for the input data, which formally
 specify the set of features to use for training. Because all features in the
 housing data set contain continuous values, you can create their
-`FeatureColumn`s using the `tf.contrib.layers.real_valued_column()` function:
+`FeatureColumn`s using the `tf.feature_column.numeric_column()` function:
 
 ```python
 feature_cols = [tf.feature_column.numeric_column(k) for k in FEATURES]
diff --git a/tensorflow/docs_src/get_started/mnist/beginners.md b/tensorflow/docs_src/get_started/mnist/beginners.md
index 38c467ddc32c9ca21432cc7fe74a594446804293..c419ca87c363bc6c4507f70c25d1293e27612253 100644
--- a/tensorflow/docs_src/get_started/mnist/beginners.md
+++ b/tensorflow/docs_src/get_started/mnist/beginners.md
@@ -347,11 +347,10 @@ over all the examples in the batch.
 
 Note that in the source code, we don't use this formulation, because it is
 numerically unstable.  Instead, we apply
-`tf.nn.softmax_cross_entropy_with_logits` on the unnormalized logits (e.g., we
-call `softmax_cross_entropy_with_logits` on `tf.matmul(x, W) + b`), because this
-more numerically stable function internally computes the softmax activation.  In
-your code, consider using `tf.nn.softmax_cross_entropy_with_logits`
-instead.
+`tf.losses.sparse_softmax_cross_entropy` on the unnormalized logits (e.g., we
+call `sparse_softmax_cross_entropy` on the output of `tf.matmul(x, W) + b`),
+because this more numerically stable function internally computes the softmax
+activation.
 
 Now that we know what we want our model to do, it's very easy to have TensorFlow
 train it to do so.  Because TensorFlow knows the entire graph of your
diff --git a/tensorflow/docs_src/get_started/mnist/mechanics.md b/tensorflow/docs_src/get_started/mnist/mechanics.md
index 27fae45b5b0b4126132556cfac312fbb3c4f515a..dac00498e12d180d88fae0dc405dcda013441eff 100644
--- a/tensorflow/docs_src/get_started/mnist/mechanics.md
+++ b/tensorflow/docs_src/get_started/mnist/mechanics.md
@@ -47,7 +47,7 @@ training folder and then unpack that data to return a dictionary of `DataSet`
 instances.
 
 ```python
-data_sets = input_data.read_data_sets(FLAGS.train_dir, FLAGS.fake_data)
+data_sets = input_data.read_data_sets(FLAGS.input_data_dir, FLAGS.fake_data)
 ```
 
 **NOTE**: The `fake_data` flag is used for unit-testing purposes and may be
@@ -167,20 +167,15 @@ Finally, the `logits` tensor that will contain the output is returned.
 The `loss()` function further builds the graph by adding the required loss
 ops.
 
-First, the values from the `labels_placeholder` are converted to 64-bit integers. Then, a @{tf.nn.sparse_softmax_cross_entropy_with_logits} op is added to automatically produce 1-hot labels from the `labels_placeholder` and compare the output logits from the `inference()` function with those 1-hot labels.
+First, the values from the `labels_placeholder` are converted to 64-bit
+integers. Then, a @{tf.losses.sparse_softmax_cross_entropy} op is used to
+calculate the batch's average cross entropy, of the `inference()` result,
+compared to the labels.
 
 ```python
 labels = tf.to_int64(labels)
-cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
-    labels=labels, logits=logits, name='xentropy')
-```
-
-It then uses @{tf.reduce_mean}
-to average the cross entropy values across the batch dimension (the first
-dimension) as the total loss.
-
-```python
-loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
+cross_entropy = tf.losses.sparse_softmax_cross_entropy(
+    labels=labels, logits=logits)
 ```
 
 And the tensor that will then contain the loss value is returned.
@@ -369,7 +364,7 @@ may be instantiated to write the events files, which
 contain both the graph itself and the values of the summaries.
 
 ```python
-summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)
+summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)
 ```
 
 Lastly, the events file will be updated with new summary values every time the
@@ -403,7 +398,7 @@ method will periodically be called to write a checkpoint file to the training
 directory with the current values of all the trainable variables.
 
 ```python
-saver.save(sess, FLAGS.train_dir, global_step=step)
+saver.save(sess, checkpoint_file, global_step=step)
 ```
 
 At some later point in the future, training might be resumed by using the
@@ -411,7 +406,7 @@ At some later point in the future, training might be resumed by using the
 method to reload the model parameters.
 
 ```python
-saver.restore(sess, FLAGS.train_dir)
+saver.restore(sess, checkpoint_file)
 ```
 
 ## Evaluate the Model
diff --git a/tensorflow/docs_src/get_started/mnist/pros.md b/tensorflow/docs_src/get_started/mnist/pros.md
index 4933dd28cd37e695a10ab28832f26a613589d01a..c52e960bb34f53643bb2f8973595245e40932128 100644
--- a/tensorflow/docs_src/get_started/mnist/pros.md
+++ b/tensorflow/docs_src/get_started/mnist/pros.md
@@ -49,7 +49,7 @@ these two lines of code which will download and read in the data automatically:
 
 ```python
 from tensorflow.examples.tutorials.mnist import input_data
-mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
+mnist = input_data.read_data_sets('MNIST_data')
 ```
 
 Here `mnist` is a lightweight class which stores the training, validation, and
@@ -172,8 +172,7 @@ between the target and the softmax activation function applied to the model's
 prediction.  As in the beginners tutorial, we use the stable formulation:
 
 ```python
-cross_entropy = tf.reduce_mean(
-    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
+cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=y))
 ```
 
 Note that `tf.nn.softmax_cross_entropy_with_logits` internally applies the
diff --git a/tensorflow/docs_src/get_started/premade_estimators.md b/tensorflow/docs_src/get_started/premade_estimators.md
new file mode 100644
index 0000000000000000000000000000000000000000..00b936905d55e8a7180e6fbc753fa0bd330f06a7
--- /dev/null
+++ b/tensorflow/docs_src/get_started/premade_estimators.md
@@ -0,0 +1,442 @@
+
+# Getting Started with TensorFlow
+
+This document introduces the TensorFlow programming environment and shows you
+how to write the Iris classification problem in TensorFlow.
+
+Prior to reading this document, do the following:
+
+* @{$install$Install TensorFlow}.
+* If you installed TensorFlow with virtualenv or Anaconda, activate your
+  TensorFlow environment.
+* To keep the data import simple, our Iris example uses Pandas. You can
+  install Pandas with:
+
+      `pip install pandas`
+
+## Getting the sample code
+
+Take the following steps to get the sample code for this program:
+
+1. Clone the TensorFlow Models repository from github by entering the following
+   command:
+
+       `git clone https://github.com/tensorflow/models`
+
+1. Change directory within that branch to the location containing the examples
+   used in this document:
+
+       `cd models/samples/core/get_started/`
+
+The program described in this document is
+[`premade_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py).
+This program uses
+[`iris_data.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py)
+To fetch its training data.
+
+### Running the program
+
+You run TensorFlow programs as you would run any Python program. For example:
+
+``` bsh
+python premade_estimator.py
+```
+
+The program should output training logs followed by some predictions against
+the test set. For example, the first line in the following output shows that
+the model thinks there is a 99.6% chance that the first example in the test
+set is a Setosa. Since the test set `expected "Setosa"`, this appears to be
+a good prediction.
+
+``` None
+...
+Prediction is "Setosa" (99.6%), expected "Setosa"
+
+Prediction is "Versicolor" (99.8%), expected "Versicolor"
+
+Prediction is "Virginica" (97.9%), expected "Virginica"
+```
+
+If the program generates errors instead of answers, ask yourself the following
+questions:
+
+* Did you install TensorFlow properly?
+* Are you using the correct version of tensorflow?
+* Did you activate the environment you installed TensorFlow in? (This is
+  only relevant in certain installation environments.)
+
+## The programming stack
+
+Before getting into the details of the program itself, let's investigate the
+programming environment. As the following illustration shows, TensorFlow
+provides a programming stack consisting of multiple API layers:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/tensorflow_programming_environment.png">
+</div>
+<div style="text-align: center">
+The TensorFlow Programming Environment
+</div>
+
+We strongly recommend writing TensorFlow programs with the following APIs:
+
+* @{tf.estimator$Estimators}, which represent a complete model.
+  The Estimator API provides methods to train the model, to judge the model's
+  accuracy, and to generate predictions.
+* @{$get_started/datasets_quickstart$Datasets}, which build a data input
+  pipeline. The Dataset API has methods to load and manipulate data, and feed
+  it into your model. The Datasets API meshes well with the Estimators API.
+
+## Classifying irises: an overview
+
+The sample program in this document builds and tests a model that
+classifies Iris flowers into three different species based on the size of their
+[sepals](https://en.wikipedia.org/wiki/Sepal) and
+[petals](https://en.wikipedia.org/wiki/Petal).
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%"
+  alt="Petal geometry compared for three iris species: Iris setosa, Iris virginica, and Iris versicolor"
+  src="../images/iris_three_species.jpg">
+</div>
+**From left to right,
+[*Iris setosa*](https://commons.wikimedia.org/w/index.php?curid=170298) (by
+[Radomil](https://commons.wikimedia.org/wiki/User:Radomil), CC BY-SA 3.0),
+[*Iris versicolor*](https://commons.wikimedia.org/w/index.php?curid=248095) (by
+[Dlanglois](https://commons.wikimedia.org/wiki/User:Dlanglois), CC BY-SA 3.0),
+and [*Iris virginica*](https://www.flickr.com/photos/33397993@N05/3352169862)
+(by [Frank Mayfield](https://www.flickr.com/photos/33397993@N05), CC BY-SA
+2.0).**
+
+### The data set
+
+The Iris data set contains four features and one
+[label](https://developers.google.com/machine-learning/glossary/#label).
+The four features identify the following botanical characteristics of
+individual Iris flowers:
+
+* sepal length
+* sepal width
+* petal length
+* petal width
+
+Our model will represent these features as float32 numerical data.
+
+The label identifies the Iris species, which must be one of the following:
+
+* Iris setosa (0)
+* Iris versicolor (1)
+* Iris virginica (2)
+
+Our model will represent the label as `int32` categorical data.
+
+The following table shows three examples in the data set:
+
+|sepal length | sepal width | petal length | petal width| species (label) |
+|------------:|------------:|-------------:|-----------:|:---------------:|
+|         5.1 |         3.3 |          1.7 |        0.5 |   0 (Setosa)   |
+|         5.0 |         2.3 |          3.3 |        1.0 |   1 (versicolor)|
+|         6.4 |         2.8 |          5.6 |        2.2 |   2 (virginica) |
+
+### The algorithm
+
+The program trains a Deep Neural Network classifier model having the following
+topology:
+
+* 2 hidden layers.
+* Each hidden layer contains 10 nodes.
+
+The following figure illustrates the features, hidden layers, and predictions
+(not all of the nodes in the hidden layers are shown):
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%"
+  alt="A diagram of the network architecture: Inputs, 2 hidden layers, and outputs"
+  src="../images/custom_estimators/full_network.png">
+</div>
+<div style="text-align: center">
+The Model.
+</div>
+
+### Inference
+
+Running the trained model on an unlabeled example yields three predictions,
+namely, the likelihood that this flower is the given Iris species. The sum of
+those output predictions will be 1.0. For example, the prediction on an
+unlabeled example might be something like the following:
+
+* 0.03 for Iris Setosa
+* 0.95 for Iris Versicolor
+* 0.02 for Iris Virginica
+
+The preceding prediction indicates a 95% probability that the given unlabeled
+example is an Iris Versicolor.
+
+## Overview of programming with Estimators
+
+An Estimator is TensorFlow's high level representation of a complete model. It
+handles the details of initialization, logging, saving and restoring, and many
+other features so you can concentrate on your model. For more details see
+@{$programmers_guide/estimators}.
+
+An "Estimator" is any class derived from @{tf.estimator.Estimator}. TensorFlow
+provides a collection of
+[pre-made Estimators](https://developers.google.com/machine-learning/glossary/#pre-made_Estimator)
+(for example, `LinearRegressor`) to implement common ML algorithms. Beyond
+those, you may write your own
+[custom Estimators](https://developers.google.com/machine-learning/glossary/#custom_Estimator).
+We recommend using pre-made Estimators when just getting started with
+TensorFlow. After gaining expertise with the pre-made Estimators, we recommend
+optimizing your model by creating your own custom Estimators.
+
+To write a TensorFlow program based on pre-made Estimators, you must perform the
+following tasks:
+
+* Create one or more input functions.
+* Define the model's feature columns.
+* Instantiate an Estimator, specifying the feature columns and various
+  hyperparameters.
+* Call one or more methods on the Estimator object, passing the appropriate
+  input function as the source of the data.
+
+Let's see how those tasks are implemented in Iris.
+
+## Create input functions
+
+You must create input functions to supply data for training,
+evaluating, and prediction.
+
+An **input function** is a function that returns the following two-element
+tuple:
+
+* "features" - A Python dictionary in which:
+    * Each key is the name of a feature.
+    * Each value is an array containing all of that feature's values.
+* "label" - An array containing the values of the
+  [label](https://developers.google.com/machine-learning/glossary/#label) for
+  every example.
+
+Just to demonstrate the format of the input function here's a simple
+implementation:
+
+```python
+def input_evaluation_set():
+    features = {'SepalLength': np.array([6.4, 5.0]),
+                'SepalWidth':  np.array([2.8, 2.3]),
+                'PetalLength': np.array([5.6, 3.3]),
+                'PetalWidth':  np.array([2.2, 1.0])}
+    labels = np.array([2, 1])
+    return features, labels
+```
+
+Your input function may generate the "features" dictionary and "label" list any
+way you like. However, we recommend using TensorFlow's Dataset API, which can
+deftly parse all sorts of data. At a high-level, the Datasets API consists of
+the following classes:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%"
+  alt="A diagram showing subclasses of the Dataset class"
+  src="../images/dataset_classes.png">
+</div>
+
+
+Where:
+
+* Dataset: Base class containing methods to create and transform datasets. Also
+  allows you to initialize a dataset from data in memory, or from a Python
+  generator.
+* TextLineDataset: Reads lines from text files.
+* TFRecordDataset: Reads records from TFRecord files.
+* FixedLengthRecordDataset: Reads fixed size records from binary files.
+* Iterator: Provides a way to access one data set element at a time.
+
+The Dataset API can handle a lot of common cases for you. For example,
+using the Dataset API, you can easily read in records from a large collection
+of files in parallel and join them into a single stream.
+
+To keep things simple in this example we are going to load the data with pandas,
+and build our input pipeline from this in-memory data.
+
+Here is the input function used for training in this program, which is available
+in [`iris_data.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py):
+
+``` python
+def train_input_fn(features, labels, batch_size):
+    """An input function for training"""
+    # Convert the inputs to a Dataset.
+    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
+
+    # Shuffle, repeat, and batch the examples.
+    dataset = dataset.shuffle(1000).repeat().batch(batch_size)
+
+    # Build the Iterator, and return the read end of the pipeline.
+    return dataset.make_one_shot_iterator().get_next()
+```
+
+## Define the Feature Columns
+
+A [**Feature Column**](https://developers.google.com/machine-learning/glossary/#feature_columns)
+is an object describing how the model should use raw input data from the
+features dictionary. When you build an Estimator model, you pass it a list of
+feature columns that describes each of the features you want the model to use.
+The @{tf.feature_column} module provides many options for representing data
+to the model.
+
+For Iris, the 4 raw features are numeric values, so we'll build a list of
+feature columns to tell the Estimator model to represent each of the four
+features as 32-bit floating-point values. Therefore, the code to create the
+Feature Column is simply:
+
+```python
+# Feature columns describe how to use the input.
+my_feature_columns = []
+for key in train_x.keys():
+    my_feature_columns.append(tf.feature_column.numeric_column(key=key))
+```
+
+Feature Columns can be far more sophisticated than those we're showing here.
+We detail feature columns @{$get_started/feature_columns$later on} in
+getting started.
+
+Now that we have the description of how we want the model to represent the raw
+features, we can build the estimator.
+
+
+## Instantiate an Estimator
+
+The Iris problem is a classic classifier problem. Fortunately, TensorFlow
+provides several pre-made classifier Estimators, including:
+
+* @{tf.estimator.DNNClassifier}—for deep models that perform multi-class
+  classification.
+* @{tf.estimator.DNNLinearCombinedClassifier}—for wide-n-deep models.
+* @{tf.estimator.LinearClassifier}— for classifiers based on linear models.
+
+For the Iris problem, `tf.estimator.DNNClassifier` seems like the best choice.
+Here's how we instantiated this Estimator:
+
+```python
+# Build 2 hidden layer DNN with 10, 10 units respectively.
+classifier = tf.estimator.DNNClassifier(
+    feature_columns=my_feature_columns,
+    # Two hidden layers of 10 nodes each.
+    hidden_units=[10, 10],
+    # The model must choose between 3 classes.
+    n_classes=3)
+```
+
+## Train, Evaluate, and Predict
+
+Now that we have an Estimator object, we can call methods to do the following:
+
+* Train the model.
+* Evaluate the trained model.
+* Use the trained model to make predictions.
+
+### Train the model
+
+Train the model by calling the Estimator's `train` method as follows:
+
+```python
+# Train the Model.
+classifier.train(
+    input_fn=lambda:iris_data.train_input_fn(train_x, train_y, args.batch_size),
+    steps=args.train_steps)
+```
+
+Here we wrap up our `input_fn` call in a
+[`lambda`](https://docs.python.org/3/tutorial/controlflow.html)
+to capture the arguments while providing an input function that takes no
+arguments, as expected by the Estimator. The `steps` argument tells the method
+to stop training after a number of training steps.
+
+### Evaluate the trained model
+
+Now that the model has been trained, we can get some statistics on its
+performance. The following code block evaluates the accuracy of the trained
+model on the test data:
+
+```python
+# Evaluate the model.
+eval_result = classifier.evaluate(
+    input_fn=lambda:iris_data.eval_input_fn(test_x, test_y, args.batch_size))
+
+print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))
+```
+
+Unlike our call to the `train` method, we did not pass the `steps`
+argument to evaluate. Our `eval_input_fn` only yields a single
+[epoch](https://developers.google.com/machine-learning/glossary/#epoch) of data.
+
+Running this code yields the following output (or something similar):
+
+```none
+Test set accuracy: 0.967
+```
+
+### Making predictions (inferring) from the trained model
+
+We now have a trained model that produces good evaluation results.
+We can now use the trained model to predict the species of an Iris flower
+based on some unlabeled measurments. As with training and evaluation, we make
+predictions using a single function call:
+
+```python
+# Generate predictions from the model
+expected = ['Setosa', 'Versicolor', 'Virginica']
+predict_x = {
+    'SepalLength': [5.1, 5.9, 6.9],
+    'SepalWidth': [3.3, 3.0, 3.1],
+    'PetalLength': [1.7, 4.2, 5.4],
+    'PetalWidth': [0.5, 1.5, 2.1],
+}
+
+predictions = classifier.predict(
+    input_fn=lambda:iris_data.eval_input_fn(predict_x,
+                                            batch_size=args.batch_size))
+```
+
+The `predict` method returns a Python iterable, yielding a dictionary of
+prediction results for each example. The following code prints a few
+predictions and their probabilities:
+
+
+``` python
+for pred_dict, expec in zip(predictions, expected):
+    template = ('\nPrediction is "{}" ({:.1f}%), expected "{}"')
+
+    class_id = pred_dict['class_ids'][0]
+    probability = pred_dict['probabilities'][class_id]
+
+    print(template.format(iris_data.SPECIES[class_id],
+                          100 * probability, expec))
+```
+
+Running the preceding code yields the following output:
+
+``` None
+...
+Prediction is "Setosa" (99.6%), expected "Setosa"
+
+Prediction is "Versicolor" (99.8%), expected "Versicolor"
+
+Prediction is "Virginica" (97.9%), expected "Virginica"
+```
+
+
+## Summary
+
+Pre-made Estimators are an effective way to quickly create standard models.
+
+Now that you've gotten started writing TensorFlow programs, consider the
+following material:
+
+* @{$get_started/saving_models$Checkpoints} to learn how to save and restore
+  models.
+* @{$get_started/datasets_quickstart$Datasets} to learn more about importing
+  data into your
+  model.
+* @{$get_started/custom_estimators$Creating Custom Estimators} to learn how to
+  write your own Estimator, customized for a particular problem.
+
diff --git a/tensorflow/docs_src/get_started/saving_models.md b/tensorflow/docs_src/get_started/saving_models.md
new file mode 100644
index 0000000000000000000000000000000000000000..680e1c0d3f58166a4f6b352816914f5220d84996
--- /dev/null
+++ b/tensorflow/docs_src/get_started/saving_models.md
@@ -0,0 +1,238 @@
+# Checkpoints
+
+This document examines how to save and restore TensorFlow models built with
+Estimators. TensorFlow provides two model formats:
+
+*   checkpoints, which is a format dependent on the code that created
+    the model.
+*   SavedModel, which is a format independent of the code that created
+    the model.
+
+This document focuses on checkpoints. For details on SavedModel, see the
+@{$saved_model$Saving and Restoring} chapter of the
+*TensorFlow Programmer's Guide*.
+
+
+## Sample code
+
+This document relies on the same
+[https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py](Iris classification example) detailed in @{$premade_estimators$Getting Started with TensorFlow}.
+To download and access the example, invoke the following two commands:
+
+```shell
+git clone https://github.com/tensorflow/models/
+cd models/samples/core/get_started
+```
+
+Most of the code snippets in this document are minor variations
+on `premade_estimator.py`.
+
+
+## Saving partially-trained models
+
+Estimators automatically write the following to disk:
+
+*   **checkpoints**, which are versions of the model created during training.
+*   **event files**, which contain information that
+    [TensorBoard](https://developers.google.com/machine-learning/glossary/#TensorBoard)
+    uses to create visualizations.
+
+To specify the top-level directory in which the Estimator stores its
+information, assign a value to the optional `model_dir` argument of any
+Estimator's constructor.  For example, the following code sets the `model_dir`
+argument to the `models/iris` directory:
+
+```python
+classifier = tf.estimator.DNNClassifier(
+    feature_columns=my_feature_columns,
+    hidden_units=[10, 10],
+    n_classes=3,
+    model_dir='models/iris')
+```
+
+Suppose you call the Estimator's `train` method. For example:
+
+
+```python
+classifier.train(
+        input_fn=lambda:train_input_fn(train_x, train_y, batch_size=100),
+                steps=200)
+```
+
+As suggested by the following diagrams, the first call to `train`
+adds checkpoints and other files to the `model_dir` directory:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/first_train_calls.png">
+</div>
+<div style="text-align: center">
+The first call to train().
+</div>
+
+
+To see the objects in the created `model_dir` directory on a
+UNIX-based system, just call `ls` as follows:
+
+```none
+$ ls -1 models/iris
+checkpoint
+events.out.tfevents.timestamp.hostname
+graph.pbtxt
+model.ckpt-1.data-00000-of-00001
+model.ckpt-1.index
+model.ckpt-1.meta
+model.ckpt-200.data-00000-of-00001
+model.ckpt-200.index
+model.ckpt-200.meta
+```
+
+The preceding `ls` command shows that the Estimator created checkpoints
+at steps 1 (the start of training) and 200 (the end of training).
+
+
+### Default checkpoint directory
+
+If you don't specify `model_dir` in an Estimator's constructor, the Estimator
+writes checkpoint files to a temporary directory chosen by Python's
+[tempfile.mkdtemp](https://docs.python.org/3/library/tempfile.html#tempfile.mkdtemp)
+function. For example, the following Estimator constructor does *not* specify
+the `model_dir` argument:
+
+```python
+classifier = tf.estimator.DNNClassifier(
+    feature_columns=my_feature_columns,
+    hidden_units=[10, 10],
+    n_classes=3)
+
+print(classifier.model_dir)
+```
+
+The `tempfile.mkdtemp` function picks a secure, temporary directory
+appropriate for your operating system. For example, a typical temporary
+directory on macOS might be something like the following:
+
+```None
+/var/folders/0s/5q9kfzfj3gx2knj0vj8p68yc00dhcr/T/tmpYm1Rwa
+```
+
+### Checkpointing Frequency
+
+By default, the Estimator saves
+[checkpoints](https://developers.google.com/machine-learning/glossary/#checkpoint)
+in the `model_dir` according to the following schedule:
+
+*   Writes a checkpoint every 10 minutes (600 seconds).
+*   Writes a checkpoint when the `train` method starts (first iteration)
+    and completes (final iteration).
+*   Retains only the 5 most recent checkpoints in the directory.
+
+You may alter the default schedule by taking the following steps:
+
+1.  Create a @{tf.estimator.RunConfig$`RunConfig`} object that defines the
+    desired schedule.
+2.  When instantiating the Estimator, pass that `RunConfig` object to the
+    Estimator's `config` argument.
+
+For example, the following code changes the checkpointing schedule to every
+20 minutes and retains the 10 most recent checkpoints:
+
+```python
+my_checkpointing_config = tf.estimator.RunConfig(
+    save_checkpoints_secs = 20*60,  # Save checkpoints every 20 minutes.
+    keep_checkpoint_max = 10,       # Retain the 10 most recent checkpoints.
+)
+
+classifier = tf.estimator.DNNClassifier(
+    feature_columns=my_feature_columns,
+    hidden_units=[10, 10],
+    n_classes=3,
+    model_dir='models/iris',
+    config=my_checkpointing_config)
+```
+
+## Restoring your model
+
+The first time you call an Estimator's `train` method, TensorFlow saves a
+checkpoint to the `model_dir`. Each subsequent call to the Estimator's
+`train`, `eval`, or `predict` method causes the following:
+
+1.  The Estimator builds the model's
+    [graph](https://developers.google.com/machine-learning/glossary/#graph)
+    by running the `model_fn()`.  (For details on the `model_fn()`, see
+    @{$custom_estimators$Creating Custom Estimators.})
+2.  The Estimator initializes the weights of the new model from the data
+    stored in the most recent checkpoint.
+
+In other words, as the following illustration suggests, once checkpoints
+exist, TensorFlow rebuilds the model each time you call `train()`,
+`evaluate()`, or `predict()`.
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/subsequent_calls.png">
+</div>
+<div style="text-align: center">
+Subsequent calls to train(), evaluate(), or predict()
+</div>
+
+
+### Avoiding a bad restoration
+
+Restoring a model's state from a checkpoint only works if the model
+and checkpoint are compatible.  For example, suppose you trained a
+`DNNClassifier` Estimator containing two hidden layers,
+each having 10 nodes:
+
+```python
+classifier = tf.estimator.DNNClassifier(
+    feature_columns=feature_columns,
+    hidden_units=[10, 10],
+    n_classes=3,
+    model_dir='models/iris')
+
+classifier.train(
+    input_fn=lambda:train_input_fn(train_x, train_y, batch_size=100),
+        steps=200)
+```
+
+After training (and, therefore, after creating checkpoints in `models/iris`),
+imagine that you changed the number of neurons in each hidden layer from 10 to
+20 and then attempted to retrain the model:
+
+``` python
+classifier2 = tf.estimator.DNNClassifier(
+    feature_columns=my_feature_columns,
+    hidden_units=[20, 20],  # Change the number of neurons in the model.
+    n_classes=3,
+    model_dir='models/iris')
+
+classifier.train(
+    input_fn=lambda:train_input_fn(train_x, train_y, batch_size=100),
+        steps=200)
+```
+
+Since the state in the checkpoint is incompatible with the model described
+in `classifier2`, retraining fails with the following error:
+
+```None
+...
+InvalidArgumentError (see above for traceback): tensor_name =
+dnn/hiddenlayer_1/bias/t_0/Adagrad; shape in shape_and_slice spec [10]
+does not match the shape stored in checkpoint: [20]
+```
+
+To run experiments in which you train and compare slightly different
+versions of a model, save a copy of the code that created each
+`model-dir`, possibly by creating a separate git branch for each version.
+This separation will keep your checkpoints recoverable.
+
+## Summary
+
+Checkpoints provide an easy automatic mechanism for saving and restoring
+models created by Estimators.
+
+See the @{$saved_model$Saving and Restoring}
+chapter of the *TensorFlow Programmer's Guide* for details on:
+
+*   Saving and restoring models using low-level TensorFlow APIs.
+*   Exporting and importing models in the SavedModel format, which is a
+    language-neutral, recoverable, serialization format.
diff --git a/tensorflow/docs_src/get_started/summaries_and_tensorboard.md b/tensorflow/docs_src/get_started/summaries_and_tensorboard.md
index ce5db079ba3a502ffdec96191b03a8b951ac3db6..32f387ae8e0da0ef3d6f6cad62001a7e9f99961b 100644
--- a/tensorflow/docs_src/get_started/summaries_and_tensorboard.md
+++ b/tensorflow/docs_src/get_started/summaries_and_tensorboard.md
@@ -137,12 +137,10 @@ with tf.name_scope('cross_entropy'):
   #
   # can be numerically unstable.
   #
-  # So here we use tf.nn.softmax_cross_entropy_with_logits on the
-  # raw outputs of the nn_layer above, and then average across
-  # the batch.
-  diff = tf.nn.softmax_cross_entropy_with_logits(targets=y_, logits=y)
+  # So here we use tf.losses.sparse_softmax_cross_entropy on the
+  # raw logit outputs of the nn_layer above.
   with tf.name_scope('total'):
-    cross_entropy = tf.reduce_mean(diff)
+    cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=y)
 tf.summary.scalar('cross_entropy', cross_entropy)
 
 with tf.name_scope('train'):
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index f7380bac8a7a378ff8682064147f443716576ad3..e3d5b80aa75614820dfc3b2e89816fb3480ba7e5 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -51,15 +51,15 @@ must be installed on your system:
     <pre>
     $ <b>sudo apt-get install cuda-command-line-tools</b>
     </pre>
-    
+
     and add its path to your `LD_LIBRARY_PATH` environment variable:
 
-    <pre> 
-    $ <b>export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/extras/CUPTI/lib64</b> 
+    <pre>
+    $ <b>export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/extras/CUPTI/lib64</b>
     </pre>
 
     For CUDA Toolkit <= 7.5 do:
-    
+
     <pre>
     $ <b>sudo apt-get install libcupti-dev</b>
     </pre>
@@ -718,44 +718,3 @@ https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0-cp36-cp
 
 Note that GPU support requires the NVIDIA hardware and software described in
 [NVIDIA requirements to run TensorFlow with GPU support](#NVIDIARequirements).
-
-<a name="Protobuf31"></a>
-## Protobuf pip package 3.1
-
-You can skip this section unless you are seeing problems related
-to the protobuf pip package.
-
-**NOTE:** If your TensorFlow programs are running slowly, you might
-have a problem related to the protobuf pip package.
-
-The TensorFlow pip package depends on protobuf pip package version 3.1. The
-protobuf pip package downloaded from PyPI (when invoking
-<tt>pip install protobuf</tt>) is a Python-only library containing
-Python implementations of proto serialization/deserialization that can run
-**10x-50x slower** than the C++ implementation. Protobuf also supports a
-binary extension for the Python package that contains fast
-C++ based proto parsing.  This extension is not available in the
-standard Python-only pip package.  We have created a custom binary
-pip package for protobuf that contains the binary extension. To install
-the custom binary protobuf pip package, invoke one of the following commands:
-
-  * for Python 2.7:
-
-  <pre>
-  $ <b>pip install --upgrade \
-  https://storage.googleapis.com/tensorflow/linux/cpu/protobuf-3.1.0-cp27-none-linux_x86_64.whl</b></pre>
-
-  * for Python 3.5:
-
-  <pre>
-  $ <b>pip3 install --upgrade \
-  https://storage.googleapis.com/tensorflow/linux/cpu/protobuf-3.1.0-cp35-none-linux_x86_64.whl</b></pre>
-
-Installing this protobuf package will overwrite the existing protobuf package.
-Note that the binary pip package already has support for protobufs
-larger than 64MB, which should fix errors such as these:
-
-<pre>[libprotobuf ERROR google/protobuf/src/google/protobuf/io/coded_stream.cc:207]
-A protocol message was rejected because it was too big (more than 67108864 bytes).
-To increase the limit (or to disable these warnings), see
-CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.</pre>
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index 79b383817b4865dab20232b453d522c2613f9e9d..d4ab5475fa7af9abe4ca998280927c43285ea153 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -79,22 +79,23 @@ Take the following steps to install TensorFlow with Virtualenv:
   4. Activate the Virtualenv environment by issuing one of the
      following commands:
 
-     <pre>$ <b>source ~/tensorflow/bin/activate</b>      # If using bash, sh, ksh, or zsh
-    $ <b>source ~/tensorflow/bin/activate.csh</b>  # If using csh or tcsh </pre>
+     <pre>$ <b>cd <i>targetDirectory</i></b>
+    $ <b>source ./bin/activate</b>      # If using bash, sh, ksh, or zsh
+    $ <b>source ./bin/activate.csh</b>  # If using csh or tcsh </pre>
 
      The preceding `source` command should change your prompt to the following:
 
-     <pre> (tensorflow)$ </pre>
+     <pre> (<i>targetDirectory</i>)$ </pre>
 
   5. Ensure pip ≥8.1 is installed:
 
-     <pre> (tensorflow)$ <b>easy_install -U pip</b></pre>
+     <pre> (<i>targetDirectory</i>)$ <b>easy_install -U pip</b></pre>
 
   6. Issue one of the following commands to install TensorFlow and all the
      packages that TensorFlow requires into the active Virtualenv environment:
 
-     <pre> (tensorflow)$ <b>pip install --upgrade tensorflow</b>      # for Python 2.7
-     (tensorflow)$ <b>pip3 install --upgrade tensorflow</b>     # for Python 3.n
+     <pre> (<i>targetDirectory</i>)$ <b>pip install --upgrade tensorflow</b>      # for Python 2.7
+     (<i>targetDirectory</i>)$ <b>pip3 install --upgrade tensorflow</b>     # for Python 3.n
 
   7. Optional. If Step 6 failed (typically because you invoked a pip version
      lower than 8.1), install TensorFlow in the active
@@ -128,16 +129,18 @@ to confirm that the installation worked properly.
 
 Note that you must activate the Virtualenv environment each time you
 use TensorFlow in a new shell.  If the Virtualenv environment is not
-currently active (that is, the prompt is not `(tensorflow)`, invoke
+currently active (that is, the prompt is not `(<i>targetDirectory</i>)`, invoke
 one of the following commands:
 
-<pre>$ <b>source ~/tensorflow/bin/activate</b>      # bash, sh, ksh, or zsh
-$ <b>source ~/tensorflow/bin/activate.csh</b>  # csh or tcsh </pre>
+<pre>$ <b>cd <i>targetDirectory</i></b>
+$ <b>source ./bin/activate</b>      # If using bash, sh, ksh, or zsh
+$ <b>source ./bin/activate.csh</b>  # If using csh or tcsh </pre>
+
 
 Your prompt will transform to the following to indicate that your
 tensorflow environment is active:
 
-<pre> (tensorflow)$ </pre>
+<pre> (<i>targetDirectory</i>)$ </pre>
 
 When the Virtualenv environment is active, you may run
 TensorFlow programs from this shell.
@@ -145,7 +148,7 @@ TensorFlow programs from this shell.
 When you are done using TensorFlow, you may deactivate the
 environment by issuing the following command:
 
-<pre> (tensorflow)$ <b>deactivate</b> </pre>
+<pre> (<i>targetDirectory</i>)$ <b>deactivate</b> </pre>
 
 The prompt will revert back to your default prompt (as defined by `PS1`).
 
@@ -331,19 +334,19 @@ Take the following steps to install TensorFlow in an Anaconda environment:
   3. Activate the conda environment by issuing the following command:
 
      <pre>$ <b>source activate tensorflow</b>
-     (tensorflow)$  # Your prompt should change</pre>
+     (<i>targetDirectory</i>)$  # Your prompt should change</pre>
 
   4. Issue a command of the following format to install
      TensorFlow inside your conda environment:
 
-     <pre>(tensorflow)<b>$ pip install --ignore-installed --upgrade</b> <i>TF_PYTHON_URL</i></pre>
+     <pre>(<i>targetDirectory</i>)<b>$ pip install --ignore-installed --upgrade</b> <i>TF_PYTHON_URL</i></pre>
 
      where <i>TF_PYTHON_URL</i> is the
      [URL of the TensorFlow Python package](#the_url_of_the_tensorflow_python_package).
      For example, the following command installs the CPU-only version of
      TensorFlow for Python 2.7:
 
-     <pre> (tensorflow)$ <b>pip install --ignore-installed --upgrade \
+     <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
      https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0-py2-none-any.whl</b></pre>
 
 
@@ -527,44 +530,3 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0-py2-none-any.
 <pre>
 https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0-py3-none-any.whl
 </pre>
-
-
-
-<a name="Protobuf31"></a>
-## Protobuf pip package 3.1
-
-You can skip this section unless you are seeing problems related
-to the protobuf pip package.
-
-**NOTE:** If your TensorFlow programs are running slowly, you might
-have a problem related to the protobuf pip package.
-
-The TensorFlow pip package depends on protobuf pip package version 3.1. The
-protobuf pip package downloaded from PyPI (when invoking
-<tt>pip install protobuf</tt>) is a Python-only library containing
-Python implementations of proto serialization/deserialization that can run
-**10x-50x slower** than the C++ implementation. Protobuf also supports a
-binary extension for the Python package that contains fast
-C++ based proto parsing.  This extension is not available in the
-standard Python-only pip package.  We have created a custom binary
-pip package for protobuf that contains the binary extension. To install
-the custom binary protobuf pip package, invoke one of the following commands:
-
-  * for Python 2.7:
-
-    <pre>$ <b>pip install --upgrade \
-    https://storage.googleapis.com/tensorflow/mac/cpu/protobuf-3.1.0-cp27-none-macosx_10_11_x86_64.whl</b></pre>
-
-  * for Python 3.n:
-
-    <pre>$ <b>pip3 install --upgrade \
-    https://storage.googleapis.com/tensorflow/mac/cpu/protobuf-3.1.0-cp35-none-macosx_10_11_x86_64.whl</b></pre>
-
-Installing this protobuf package will overwrite the existing protobuf package.
-Note that the binary pip package already has support for protobufs
-larger than 64MB, which should fix errors such as these:
-
-<pre>[libprotobuf ERROR google/protobuf/src/google/protobuf/io/coded_stream.cc:207]
-A protocol message was rejected because it was too big (more than 67108864 bytes).
-To increase the limit (or to disable these warnings), see
-CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.</pre>
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index aa4ae6c876fbe85e55be6b633d47ef97afa8c1e6..e453bd6ca19f1e9ca9acfd80d8df28d0611c152e 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -138,12 +138,12 @@ The following NVIDIA <i>software</i> must be installed on your system:
     `LD_LIBRARY_PATH` environment variable as described in the
     NVIDIA documentation.
   * The NVIDIA drivers associated with NVIDIA's Cuda Toolkit.
-  * cuDNN (>= v3). We recommend version 5.1. For details, see
+  * cuDNN (>= v3). We recommend version 6.0. For details, see
     [NVIDIA's documentation](https://developer.nvidia.com/cudnn),
     particularly the description of appending the appropriate pathname
     to your `LD_LIBRARY_PATH` environment variable.
 
-Finally, you must also install `libcupti` which for Cuda Toolkit >= 8.0 you do via 
+Finally, you must also install `libcupti` which for Cuda Toolkit >= 8.0 you do via
 
 <pre> $ <b>sudo apt-get install cuda-command-line-tools</b> </pre>
 
@@ -180,7 +180,7 @@ If bazel is not installed on your system, install it now by following
 
 ### Install python dependencies
 
-To install TensorFlow, you must install the following packages:
+To build TensorFlow, you must install the following packages:
 
   * six
   * numpy, which is a numerical processing package that TensorFlow requires.
@@ -196,7 +196,11 @@ After installing pip, invoke the following commands:
 
 <pre> $ <b>sudo pip install six numpy wheel</b> </pre>
 
-
+Note: These are just the minimum requirements to _build_ tensorflow. Installing
+the pip package will download additional packages required to _run_ it. If you
+plan on executing tasks directly with `bazel` , without the pip installation,
+you may need to install additional python packages. For example, you should
+`pip install mock enum34` before running TensorFlow's tests with bazel.
 
 ### Optional: install TensorFlow for GPU prerequisites
 
@@ -441,6 +445,15 @@ Stack Overflow and specify the `tensorflow` tag.
   <td>Invoking `python` or `ipython` generates the following error:
   <pre>ImportError: cannot import name pywrap_tensorflow</pre></td>
 </tr>
+
+<tr>
+  <td><a href="https://stackoverflow.com/questions/45276830">45276830</a></td>
+  <td><pre>external/local_config_cc/BUILD:50:5: in apple_cc_toolchain rule
+  @local_config_cc//:cc-compiler-darwin_x86_64: Xcode version must be specified
+  to use an Apple CROSSTOOL.</pre>
+  </td>
+</tr>
+
 </table>
 
 ## Tested source configurations
diff --git a/tensorflow/docs_src/install/install_windows.md b/tensorflow/docs_src/install/install_windows.md
index 2e5d797958f64e478106c91f00e403822a307ee5..8d0eb7966fdf17be1c259627a64803f0a392943a 100644
--- a/tensorflow/docs_src/install/install_windows.md
+++ b/tensorflow/docs_src/install/install_windows.md
@@ -36,7 +36,7 @@ installed on your system:
     Ensure that you append the relevant Cuda pathnames to the `%PATH%`
     environment variable as described in the NVIDIA documentation.
   * The NVIDIA drivers associated with CUDA Toolkit 8.0.
-  * cuDNN v6.1. For details, see
+  * cuDNN v6.0. For details, see
     [NVIDIA's documentation](https://developer.nvidia.com/cudnn).
     Note that cuDNN is typically installed in a different location from the
     other CUDA DLLs. Ensure that you add the directory where you installed
diff --git a/tensorflow/docs_src/mobile/android_build.md b/tensorflow/docs_src/mobile/android_build.md
index 030cd0d051103e0d4bf903663d6fb7300c884b18..b5a1d5d7d1bf3b456ab24165e273969bdbd7bfca 100644
--- a/tensorflow/docs_src/mobile/android_build.md
+++ b/tensorflow/docs_src/mobile/android_build.md
@@ -66,7 +66,7 @@ them.
 
 ## Adding TensorFlow to your apps using Android Studio
 
-To add TensorFlow to your own apps on Android, the simplest way is to add the 
+To add TensorFlow to your own apps on Android, the simplest way is to add the
 following lines to your Gradle build file:
 
     allprojects {
@@ -74,7 +74,7 @@ following lines to your Gradle build file:
             jcenter()
         }
 	}
-											
+
     dependencies {
         compile 'org.tensorflow:tensorflow-android:+'
     }
diff --git a/tensorflow/docs_src/mobile/index.md b/tensorflow/docs_src/mobile/index.md
index 6bcd7d09d9c2c42492961599fbc52d7d27a7699f..419ae7094a180fb166eb5b00cc382773b95b91f4 100644
--- a/tensorflow/docs_src/mobile/index.md
+++ b/tensorflow/docs_src/mobile/index.md
@@ -2,7 +2,7 @@
 
 TensorFlow was designed to be a good deep learning solution for mobile
 platforms. Currently we have two solutions for deploying machine learning
-applications on mobile and embedded devices: 
+applications on mobile and embedded devices:
 @{$mobile/mobile_intro$TensorFlow for Mobile} and @{$mobile/tflite$TensorFlow Lite}.
 
 ## TensorFlow Lite versus TensorFlow Mobile
diff --git a/tensorflow/docs_src/mobile/ios_build.md b/tensorflow/docs_src/mobile/ios_build.md
index 6943b3c4b8fe161c2115d24161f784582e5975c6..4c84a1214a26eeb90c1b6a186a369212377b06cd 100644
--- a/tensorflow/docs_src/mobile/ios_build.md
+++ b/tensorflow/docs_src/mobile/ios_build.md
@@ -24,7 +24,7 @@ If you'd like to add TensorFlow capabilities to your own app, do the following:
 
 - Open `YourProjectName.xcworkspace` and add your code.
 
-- In your app's **Build Settings**, make sure to add `$(inherited)` to the 
+- In your app's **Build Settings**, make sure to add `$(inherited)` to the
   **Other Linker Flags**, and **Header Search Paths** sections.
 
 ## Running the Samples
diff --git a/tensorflow/docs_src/mobile/mobile_intro.md b/tensorflow/docs_src/mobile/mobile_intro.md
index 73b2396e696526b9b76ead0ffbd31762efdca5eb..17dbf1c3e6ad89768529864ba884274a51b3dfb2 100644
--- a/tensorflow/docs_src/mobile/mobile_intro.md
+++ b/tensorflow/docs_src/mobile/mobile_intro.md
@@ -82,7 +82,7 @@ new object enters or leaves the scene. We have some sample code for this
 available for Android [on
 Github](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android),
 and also a [more general object detection
-model](https://github.com/tensorflow/models/tree/master/object_detection/README.md)
+model](https://github.com/tensorflow/models/tree/master/research/object_detection/README.md)
 available as well.
 
 ### Gesture Recognition
@@ -134,7 +134,7 @@ that covers everything from sentiment analysis to topic discovery. You’re like
 to have your own categories or labels that you want to apply, so the best place
 to start is with an example
 like
-[Skip-Thoughts](https://github.com/tensorflow/models/tree/master/skip_thoughts/),
+[Skip-Thoughts](https://github.com/tensorflow/models/tree/master/research/skip_thoughts/),
 and then train on your own examples.
 
 ### Voice Synthesis
diff --git a/tensorflow/docs_src/mobile/optimizing.md b/tensorflow/docs_src/mobile/optimizing.md
index 5abc68bb61b4b24a16045a6ed31446bd54c1bd82..44cacff5dbbcb0685044c342184464b47a8ed090 100644
--- a/tensorflow/docs_src/mobile/optimizing.md
+++ b/tensorflow/docs_src/mobile/optimizing.md
@@ -57,7 +57,7 @@ get one inference every two seconds.
 
 Having this estimate helps you plan for what you’ll be able to realistically
 achieve on a device. If the model is using too many ops, then there are a lot of
-opportunities to optimize the architecture to reduce that number. 
+opportunities to optimize the architecture to reduce that number.
 
 Advanced techniques include [SqueezeNet](https://arxiv.org/abs/1602.07360)
 and [MobileNet](https://arxiv.org/abs/1704.04861), which are architectures
@@ -278,7 +278,7 @@ The run above was on your desktop, but the tool also works on Android, which is
 where it’s most useful for mobile development. Here’s an example command line to
 run it on a 64-bit ARM device:
 
-    bazel build -c opt --config=android_arm64 \ 
+    bazel build -c opt --config=android_arm64 \
     tensorflow/tools/benchmark:benchmark_model
     adb push bazel-bin/tensorflow/tools/benchmark/benchmark_model /data/local/tmp
     adb push /tmp/tensorflow_inception_graph.pb /data/local/tmp/
diff --git a/tensorflow/docs_src/mobile/prepare_models.md b/tensorflow/docs_src/mobile/prepare_models.md
index 8fc65be35adee3338091d82cced4382e188f17d5..360ee302aa96bc3a0b65eab7b39c3dacf56b42c0 100644
--- a/tensorflow/docs_src/mobile/prepare_models.md
+++ b/tensorflow/docs_src/mobile/prepare_models.md
@@ -131,9 +131,9 @@ needs to understand which parts of the graph are actually needed, and which are
 artifacts of the training process, like summarization ops. Only ops that
 contribute to calculating the given output nodes will be kept. If you know how
 your graph is going to be used, these should just be the names of the nodes you
-pass into `Session::Run()` as your fetch targets. The easiest way to find the 
+pass into `Session::Run()` as your fetch targets. The easiest way to find the
 node names is to inspect the Node objects while building your graph in python.
-Inspecting your graph in TensorBoard is another simple way.  You can get some 
+Inspecting your graph in TensorBoard is another simple way.  You can get some
 suggestions on likely outputs by running the [`summarize_graph` tool](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/graph_transforms/README.md#inspecting-graphs).
 
 Because the output format for TensorFlow has changed over time, there are a
@@ -164,7 +164,7 @@ The trickiest part of this process is figuring out the names of the nodes you
 want to use as inputs and outputs during inference.  You'll need these anyway
 once you start to run inference, but you also need them here so that the
 transform can calculate which nodes are not needed on the inference-only
-path. These may not be obvious from the training code. The easiest way to 
+path. These may not be obvious from the training code. The easiest way to
 determine the node name is to explore the graph with TensorBoard.
 
 Remember that mobile applications typically gather their data from sensors and
@@ -187,9 +187,9 @@ output nodes.
 If you’ve just been given a frozen `GraphDef` file, and are not sure about the
 contents, try using the `summarize_graph` tool to print out information
 about the inputs and outputs it finds from the graph structure. Here’s an
-example with the original Inception v3 file: 
+example with the original Inception v3 file:
 
-    bazel run tensorflow/tools/graph_transforms:summarize_graph -- 
+    bazel run tensorflow/tools/graph_transforms:summarize_graph --
     --in_graph=tensorflow_inception_graph.pb
 
 Once you have an idea of what the input and output nodes are, you can feed them
@@ -259,7 +259,7 @@ on how to do this, and also see @{$mobile/optimizing#binary_size$Optimizing} for
 more on reducing your binary size.
 
 ### Locate the implementation
-   
+
 Operations are broken into two parts. The first is the op definition, which
 declares the signature of the operation, which inputs, outputs, and attributes
 it has. These take up very little space, and so all are included by default. The
@@ -267,7 +267,7 @@ implementations of the op computations are done in kernels, which live in the
 `tensorflow/core/kernels` folder. You need to compile the C++ file containing
 the kernel implementation of the op you need into the library. To figure out
 which file that is, you can search for the operation name in the source
-files. 
+files.
 
 [Here’s an example search in github](https://github.com/search?utf8=%E2%9C%93&q=repo%3Atensorflow%2Ftensorflow+extension%3Acc+path%3Atensorflow%2Fcore%2Fkernels+REGISTER+Mul&type=Code&ref=searchresults).
 
diff --git a/tensorflow/docs_src/mobile/tflite/index.md b/tensorflow/docs_src/mobile/tflite/index.md
index 59daa2fe25090595d4d9be4e1e2e46c22972ba67..6c4589d6937615da5b1de002314e2f6382181d91 100644
--- a/tensorflow/docs_src/mobile/tflite/index.md
+++ b/tensorflow/docs_src/mobile/tflite/index.md
@@ -40,7 +40,7 @@ TensorFlow Lite provides an interface to leverage hardware acceleration, if
 available on the device. It does so via the Android Neural Networks library,
 released as part of Android O-MR1.
 
-## Why do we need a new mobile-specific library? 
+## Why do we need a new mobile-specific library?
 
 Machine Learning is changing the computing paradigm, and we see an emerging
 trend of new use cases on mobile and embedded devices. Consumer expectations are
@@ -67,7 +67,7 @@ There are several factors which are fueling interest in this domain:
   connected to a network.
 
 We believe the next wave of machine learning applications will have significant
-processing on mobile and embedded devices. 
+processing on mobile and embedded devices.
 
 ## TensorFlow Lite developer preview highlights
 
@@ -155,7 +155,7 @@ retraining for both floating point and quantized inference.
 
 The following diagram shows the architectural design of TensorFlow Lite:
 
-<img src = "/images/tflite-architecture.jpg">
+![tensorflow lite architecture](https://www.tensorflow.org/images/tflite-architecture.jpg)
 
 Starting with a trained TensorFlow model on disk, you'll convert that model to
 the TensorFlow Lite file format (`.tflite`) using the TensorFlow Lite
diff --git a/tensorflow/docs_src/performance/performance_guide.md b/tensorflow/docs_src/performance/performance_guide.md
index 17f71a6d7705c75e7322932cc652ec6728c8c626..3ebafb907455cc91f799997fd9bd18000c979cf8 100644
--- a/tensorflow/docs_src/performance/performance_guide.md
+++ b/tensorflow/docs_src/performance/performance_guide.md
@@ -18,6 +18,7 @@ following sections:
 *   [Input pipeline optimizations](#input-pipeline-optimization)
 *   [Data formats](#data-formats)
 *   [Common fused Ops](#common-fused-ops)
+*   [RNN Performance](#rnn-performance)
 *   [Building and installing from source](#building-and-installing-from-source)
 
 ### Input pipeline optimization
@@ -197,6 +198,57 @@ since before TensorFlow 1.0.
 bn = tf.contrib.layers.batch_norm(input_layer, fused=True, data_format='NCHW')
 ```
 
+### RNN Performance
+
+There are many ways to specify an RNN computation in Tensorflow and they have
+have trade-offs with respect to model flexibility and performance. The
+@{tf.nn.rnn_cell.BasicLSTMCell} should be considered a reference implementation
+and used only as a last resort when no other options will work.
+
+When using one of the cells, rather than the fully fused RNN layers, you have a
+choice of whether to use @{tf.nn.static_rnn} or @{tf.nn.dynamic_rnn}.  There
+shouldn't generally be a performance difference at runtime, but large unroll
+amounts can increase the graph size of the @{tf.nn.static_rnn} and cause long
+compile times.  An additional advantage of @{tf.nn.dynamic_rnn} is that it can
+optionally swap memory from the GPU to the CPU to enable training of very long
+sequences.  Depending on the model and hardware configuration, this can come at
+a performance cost.  It is also possible to run multiple iterations of
+@{tf.nn.dynamic_rnn} and the underlying @{tf.while_loop} construct in parallel,
+although this is rarely useful with RNN models as they are inherently
+sequential.
+
+On NVIDIA GPUs, the use of @{tf.contrib.cudnn_rnn} should always be preferred
+unless you want layer normalization, which it doesn't support.  It is often at
+least an order of magnitude faster than @{tf.contrib.rnn.BasicLSTMCell} and
+@{tf.contrib.rnn.LSTMBlockCell} and uses 3-4x less memory than
+@{tf.contrib.rnn.BasicLSTMCell}.  Unfortunately, @{tf.contrib.cudnn_rnn} is not
+compatible with @{tf.train.SyncReplicasOptimizer} so you should either use a
+different synchronization mechanism (consider an all-reduce based strategy) or
+use the @{tf.contrib.rnn.LSTMBlockFusedCell} (at a significant performance
+penalty).
+
+If you need to run one step of the RNN at a time, as might be the case in
+reinforcement learning with a recurrent policy, then you should use the
+@{tf.contrib.rnn.LSTMBlockCell} with your own environment interaction loop
+inside a @{tf.while_loop} construct. Running one step of the RNN at a time and
+returning to python is possible but it will be slower.
+
+On CPUs, mobile devices, and if @{tf.contrib.cudnn_rnn} is not available on
+your GPU, the fastest and most memory efficient option is
+@{tf.contrib.rnn.LSTMBlockFusedCell}.
+
+For all of the less common cell types like @{tf.contrib.rnn.NASCell},
+@{tf.contrib.rnn.PhasedLSTMCell}, @{tf.contrib.rnn.UGRNNCell},
+@{tf.contrib.rnn.GLSTMCell}, @{tf.contrib.rnn.Conv1DLSTMCell},
+@{tf.contrib.rnn.Conv2DLSTMCell}, @{tf.contrib.rnn.LayerNormBasicLSTMCell},
+etc., one should be aware that they are implemented in the graph like
+@{tf.contrib.rnn.BasicLSTMCell} and as such will suffer from the same poor
+performance and high memory usage.  One should consider whether or not those
+trade-offs are worth it before using these cells. For example, while layer
+normalization can speed up convergence, because cuDNN is 20x faster the fastest
+wall clock time to convergence is usually obtained without it.
+
+
 ### Building and installing from source
 
 The default TensorFlow binaries target the broadest range of hardware to make
diff --git a/tensorflow/docs_src/performance/xla/broadcasting.md b/tensorflow/docs_src/performance/xla/broadcasting.md
index 8dbf0d0446f41b26489912734bc11704e61efeab..ca3bddf758cf64e7c580f9babfe559ae23708705 100644
--- a/tensorflow/docs_src/performance/xla/broadcasting.md
+++ b/tensorflow/docs_src/performance/xla/broadcasting.md
@@ -33,11 +33,11 @@ In Numpy, this is called [broadcasting]
 
 ## Principles
 
-XLA is a low-level infrastructure with a XLA language this is as strict and
-explicit as possible, avoiding implicit and "magical" features that may make
-some computations slightly easier to define, at the cost of more assumptions
-baked into user code that will be difficult to change in the long term. If
-necessary, implicit and magical features can be added in client-level wrappers.
+The XLA language is as strict and explicit as possible, avoiding implicit and
+"magical" features. Such features may make some computations slightly easier to
+define, at the cost of more assumptions baked into user code that will be
+difficult to change in the long term. If necessary, implicit and magical
+features can be added in client-level wrappers.
 
 In regards to broadcasting, explicit broadcasting specifications on operations
 between arrays of different ranks is required. This is different from Numpy,
diff --git a/tensorflow/docs_src/performance/xla/developing_new_backend.md b/tensorflow/docs_src/performance/xla/developing_new_backend.md
index 28010ff1b785813e15c56d4bb5c26b0bcedce3d9..74ea15bb2bac2014257f0b1719820f7ee313b66b 100644
--- a/tensorflow/docs_src/performance/xla/developing_new_backend.md
+++ b/tensorflow/docs_src/performance/xla/developing_new_backend.md
@@ -62,11 +62,11 @@ If it is not possible to utilize LLVM, then the best option is to implement a
 new backend for XLA for the desired hardware. This option requires the most
 effort. The classes that need to be implemented are as follows:
 
-*   [StreamExecutor](https://www.tensorflow.org/code/tensorflow/stream_executor/stream_executor.h):
+*   [`StreamExecutor`](https://www.tensorflow.org/code/tensorflow/stream_executor/stream_executor.h):
     For many devices not all methods of `StreamExecutor` are needed. See
     existing `StreamExecutor` implementations for details.
-*   [xla::Compiler](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/compiler.h):
-    This class encapsulates the compilation of a HLO computation into an
+*   [`xla::Compiler`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/compiler.h):
+    This class encapsulates the compilation of an HLO computation into an
     `xla::Executable`.
 *   [`xla::Executable`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/executable.h):
     This class is used to launch a compiled computation on the platform.
diff --git a/tensorflow/docs_src/performance/xla/index.md b/tensorflow/docs_src/performance/xla/index.md
index 19045b45d92a2ca42c3943bc0662ca42bd0c2c24..a8847830740302a0de6f57cb3b7a0d6c7e096d32 100644
--- a/tensorflow/docs_src/performance/xla/index.md
+++ b/tensorflow/docs_src/performance/xla/index.md
@@ -65,18 +65,19 @@ The following diagram shows the compilation process in XLA:
   <img src="https://www.tensorflow.org/images/how-does-xla-work.png">
 </div>
 
-XLA comes with several optimizations and analyzes that are target-independent,
-such as [CSE](https://en.wikipedia.org/wiki/Common_subexpression_elimination),
+XLA comes with several optimizations and analysis passes that are
+target-independent, such as
+[CSE](https://en.wikipedia.org/wiki/Common_subexpression_elimination),
 target-independent operation fusion, and buffer analysis for allocating runtime
 memory for the computation.
 
 After the target-independent step, XLA sends the HLO computation to a backend.
-The backend can perform further HLO-level analyzes and optimizations, this time
-with target specific information and needs in mind. For example, the XLA GPU
-backend may perform operation fusion beneficial specifically for the GPU
-programming model and determine how to partition the computation into streams.
-At this stage, backends may also pattern-match certain operations or
-combinations thereof to optimized library calls.
+The backend can perform further HLO-level optimizations, this time with target
+specific information and needs in mind. For example, the XLA GPU backend may
+perform operation fusion beneficial specifically for the GPU programming model
+and determine how to partition the computation into streams. At this stage,
+backends may also pattern-match certain operations or combinations thereof to
+optimized library calls.
 
 The next step is target-specific code generation. The CPU and GPU backends
 included with XLA use [LLVM](http://llvm.org) for low-level IR, optimization,
diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md
index d532efea0c5e8c23f6773e4d84ba4ebca5ebddf3..71e5db5d9fd4ebd4059fc38f46ea1120ce3b27e6 100644
--- a/tensorflow/docs_src/performance/xla/operation_semantics.md
+++ b/tensorflow/docs_src/performance/xla/operation_semantics.md
@@ -13,6 +13,176 @@ arbitrary-dimensional array. For convenience, special cases have more specific
 and familiar names; for example a *vector* is a 1-dimensional array and a
 *matrix* is a 2-dimensional array.
 
+## BatchNormGrad
+
+See also
+[`ComputationBuilder::BatchNormGrad`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h)
+and [the original batch normalization paper](https://arxiv.org/abs/1502.03167)
+for a detailed description of the algorithm.
+
+Calculates gradients of batch norm.
+
+<b> `BatchNormGrad(operand, scale, mean, variance, grad_output, epsilon, feature_index)` </b>
+
+| Arguments       | Type                    | Semantics                        |
+| --------------  | ----------------------- | -------------------------------- |
+| `operand`       | `ComputationDataHandle` | n dimensional array to be        |
+:                 :                         : normalized (x)                   :
+| `scale`         | `ComputationDataHandle` | 1 dimensional array              |
+:                 :                         : (\\(\gamma\\))                   :
+| `mean`          | `ComputationDataHandle` | 1 dimensional array (\\(\mu\\))  |
+| `variance`      | `ComputationDataHandle` | 1 dimensional array              |
+:                 :                         : (\\(\sigma^2\\))                 :
+| `grad_output`   | `ComputationDataHandle` | Gradients passed to              |
+:                 :                         : `BatchNormTraining`              :
+:                 :                         : (\\( \nabla y\\))                :
+| `epsilon`       | `float`                 | Epsilon value (\\(\epsilon\\))   |
+| `feature_index` | `int64`                 | Index to feature dimension in    |
+:                 :                         : `operand`                        :
+
+For each feature in the feature dimension (`feature_index` is the index for the
+feature dimension in `operand`), the operation calculates the gradients with
+respect to `operand`, `offset` and `scale` across all the other dimensions. The
+`feature_index` must be a valid index for the feature dimension in `operand`.
+
+The three gradients are defined by the following formulas:
+
+\\( \nabla x = \nabla y * \gamma * \sqrt{\sigma^2+\epsilon} \\)
+
+\\( \nabla \gamma = sum(\nabla y * (x - \mu) * \sqrt{\sigma^2 + \epsilon}) \\)
+
+\\( \nabla \beta = sum(\nabla y) \\)
+
+The inputs `mean` and `variance` represents moments value
+across batch and spatial dimensions.
+
+The output type is a tuple of three handles:
+
+|Outputs       | Type                    | Semantics                           |
+|------------- | ----------------------- | ------------------------------------|
+|`grad_operand`| `ComputationDataHandle` | gradient with respect to input      |
+:              :                         : `operand`                           :
+|`grad_scale`  | `ComputationDataHandle` | gradient with respect to input      |
+:              :                         : `scale`                             :
+|`grad_offset` | `ComputationDataHandle` | gradient with respect to input      |
+:              :                         : `offset`                            :
+
+
+## BatchNormInference
+
+See also
+[`ComputationBuilder::BatchNormInference`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h) and
+[the original batch normalization paper](https://arxiv.org/abs/1502.03167)
+for a detailed description of the algorithm.
+
+Normalizes an array across batch and spatial dimensions.
+
+<b> `BatchNormInference(operand, scale, offset, mean, variance, epsilon, feature_index)` </b>
+
+| Arguments       | Type                    | Semantics                       |
+| --------------  | ----------------------- | ------------------------------- |
+| `operand`       | `ComputationDataHandle` | n dimensional array to be       |
+:                 :                         : normalized                      :
+| `scale`         | `ComputationDataHandle` | 1 dimensional array             |
+| `offset`        | `ComputationDataHandle` | 1 dimensional array             |
+| `mean`          | `ComputationDataHandle` | 1 dimensional array             |
+| `variance`      | `ComputationDataHandle` | 1 dimensional array             |
+| `epsilon`       | `float`                 | Epsilon value                   |
+| `feature_index` | `int64`                 | Index to feature dimension in   |
+:                 :                         : `operand`                       :
+
+For each feature in the feature dimension (`feature_index` is the index for the
+feature dimension in `operand`), the operation calculates the mean and variance
+across all the other dimensions and uses the mean and variance to normalize each
+element in `operand`. The `feature_index` must be a valid index for the feature
+dimension in `operand`.
+
+`BatchNormInference`  is equivalent to calling `BatchNormTraining` without
+computing `mean` and `variance` for each batch. It uses the input `mean` and
+`variance` instead as estimated values. The purpose of this op is to reduce
+latency in inference, hence the name `BatchNormInference`.
+
+The output is an n-dimensional, normalized array with the same shape as input
+`operand`.
+
+## BatchNormTraining
+
+See also
+[`ComputationBuilder::BatchNormTraining`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h) and
+[`the original batch normalization paper`](https://arxiv.org/abs/1502.03167)
+for a detailed description of the algorithm.
+
+Normalizes an array across batch and spatial dimensions.
+
+<b> `BatchNormTraining(operand, scale, offset, epsilon, feature_index)` </b>
+
+| Arguments       | Type                    | Semantics                        |
+| --------------- | ----------------------- | -------------------------------- |
+| `operand`       | `ComputationDataHandle` | n dimensional array to be        |
+:                 :                         : normalized                       :
+| `scale`         | `ComputationDataHandle` | 1 dimensional array              |
+:                 :                         : (\\(\gamma\\))                   :
+| `offset`        | `ComputationDataHandle` | 1 dimensional array              |
+:                 :                         : (\\(\beta\\ )                    :
+| `epsilon`       | `float`                 | Epsilon value (\\(\epsilon\\))   |
+| `feature_index` | `int64`                 | Index to feature dimension       |
+:                 :                         : in `operand`                     :
+
+For each feature in the feature dimension (`feature_index` is the index for the
+feature dimension in `operand`), the operation calculates the mean and variance
+across all the other dimensions and uses the mean and variance to normalize each
+element in `operand`. The `feature_index` must be a valid index for the feature
+dimension in `operand`.
+
+The algorithm goes as follows for each batch in `operand` \\(x\\) that
+contains `m` elements with `w` and `h` as the size of spatial dimensions (
+assuming `operand` is an 4 dimensional array):
+
+- Calculates batch mean \\(\mu_l\\) for each feature `l` in feature dimension:
+\\(\mu_l=\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h x_{ijkl}\\)
+
+- Calculates batch variance \\(\sigma^2_l\\):
+\\(\sigma^2_l=\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h (x_{ijkl} - \mu_l)^2\\)
+
+- Normalizes, scales and shifts:
+\\(y_{ijkl}=\frac{\gamma_l(x_{ijkl}-\mu_l)}{\sqrt[2]{\sigma^2_l+\epsilon}}+\beta_l\\)
+
+The epsilon value, usually a small number, is added to avoid divide-by-zero errors.
+
+The output type is a tuple of three `ComputationDataHandle`s:
+
+| Outputs      | Type                    | Semantics                            |
+| ------------ | ----------------------- | -------------------------------------|
+| `output`     | `ComputationDataHandle` | n dimensional array with the same    |
+:              :                         : shape as input `operand` (y)         :
+| `batch_mean` | `ComputationDataHandle` | 1 dimensional array (\\(\mu\\))      |
+| `batch_var`  | `ComputationDataHandle` | 1 dimensional array (\\(\sigma^2\\)) |
+
+The `batch_mean` and `batch_var` are moments calculated across the batch and
+spatial dimensions using the formulas above.
+
+## BitcastConvertType
+
+See also
+[`ComputationBuilder::BitcastConvertType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+
+Similar to a `tf.bitcast` in TensorFlow, performs an element-wise bitcast
+operation from a data shape to a target shape. The dimensions must match, and
+the conversion is an element-wise one; e.g. `s32` elements become `f32` elements
+via bitcast routine. Bitcast is implemented as a low-level cast, so machines
+with different floating point representations will give different results.
+
+<b> `BitcastConvertType(operand, new_element_type)` </b>
+
+Arguments          | Type                    | Semantics
+------------------ | ----------------------- | ---------------------------
+`operand`          | `ComputationDataHandle` | array of type T with dims D
+`new_element_type` | `PrimitiveType`         | type U
+
+The dimensions of the operand and the target shape must match. The bit-width of
+the source and destination element types must be equal. The source
+and destination element types must not be tuples.
+
 ## Broadcast
 
 See also
@@ -75,14 +245,14 @@ Clamps an operand to within the range between a minimum and maximum value.
 | `computation` | `Computation`           | computation of type `T_0, T_1,   |
 :               :                         : ..., T_N -> S` with N parameters :
 :               :                         : of arbitrary type                :
-| `operand`     | `ComputationDataHandle` | array of type T                  |
 | `min`         | `ComputationDataHandle` | array of type T                  |
+| `operand`     | `ComputationDataHandle` | array of type T                  |
 | `max`         | `ComputationDataHandle` | array of type T                  |
 
 Given an operand and minimum and maximum values, returns the operand if it is in
 the range between the minimum and maximum, else returns the minimum value if the
 operand is below this range or the maximum value if the operand is above this
-range.  That is, `clamp(x, a, b) =  max(min(x, a), b)`.
+range.  That is, `clamp(a, x, b) =  max(min(a, x), b)`.
 
 All three arrays must be the same shape. Alternately, as a restricted form of
 [broadcasting](broadcasting.md), `min` and/or `max` can be a scalar of type `T`.
@@ -94,7 +264,7 @@ let operand: s32[3] = {-1, 5, 9};
 let min: s32 = 0;
 let max: s32 = 6;
 ==>
-Clamp(operand, min, max) = s32[3]{0, 5, 6};
+Clamp(min, operand, max) = s32[3]{0, 5, 6};
 ```
 
 ## Collapse
@@ -217,40 +387,34 @@ Diagram:
   <img style="width:100%" src="https://www.tensorflow.org/images/ops_concatenate.png">
 </div>
 
-## ConvertElementType
-
-See also
-[`ComputationBuilder::ConvertElementType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
-
-Similar to an element-wise `static_cast` in C++, performs an element-wise
-conversion operation from a data shape to a target shape. The dimensions must
-match, and the conversion is an element-wise one; e.g. `s32` elements become
-`f32` elements via an `s32`-to-`f32` conversion routine.
+## Conditional
 
-<b> `ConvertElementType(operand, new_element_type)` </b>
+See also [`ComputationBuilder::Conditional`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
 
-Arguments          | Type                    | Semantics
------------------- | ----------------------- | ---------------------------
-`operand`          | `ComputationDataHandle` | array of type T with dims D
-`new_element_type` | `PrimitiveType`         | type U
+<b> `Conditional(pred, true_operand, true_computation, false_operand,
+    false_computation)` </b>
 
-If the dimensions of the operand and the target shape do not match, or an
-invalid conversion is requested (e.g. to/from a tuple) an error will be
-produced.
+| Arguments           | Type                    | Semantics                   |
+| ------------------- | ----------------------- | --------------------------- |
+| `pred`              | `ComputationDataHandle` | Scalar of type `PRED`       |
+| `true_operand`      | `ComputationDataHandle` | Argument of type `T_0`      |
+| `true_computation`  | `Computation`           | Computation of type `T_0 -> |
+:                     :                         : S`                          :
+| `false_operand`     | `ComputationDataHandle` | Argument of type `T_1`      |
+| `false_computation` | `Computation`           | Computation of type `T_1 -> |
+:                     :                         : S`                          :
 
-A conversion such as `T=s32` to `U=f32` will perform a normalizing int-to-float
-conversion routine such as round-to-nearest-even.
+Executes `true_computation` if `pred` is `true`, `false_computation` if `pred`
+is `false`, and returns the result.
 
-> Note: The precise float-to-int and visa-versa conversions are currently
-> unspecified, but may become additional arguments to the convert operation in
-> the future.  Not all possible conversions have been implemented for all
->targets.
+The `true_computation` must take in a single argument of type `T_0` and will be
+invoked with `true_operand` which must be of the same type. The
+`false_computation` must take in a single argument of type `T_1` and will be
+invoked with `false_operand` which must be of the same type. The type of the
+returned value of `true_computation` and `false_computation` must be the same.
 
-```
-let a: s32[3] = {0, 1, 2};
-let b: f32[3] = convert(a, f32);
-then b == f32[3]{0.0, 1.0, 2.0}
-```
+Note that only one of `true_computation` and `false_computation` will be
+executed depending on the value of `pred`.
 
 ## Conv (convolution)
 
@@ -374,6 +538,40 @@ for (b, oz, oy, ox) {  // output coordinates
 }
 ```
 
+## ConvertElementType
+
+See also
+[`ComputationBuilder::ConvertElementType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+
+Similar to an element-wise `static_cast` in C++, performs an element-wise
+conversion operation from a data shape to a target shape. The dimensions must
+match, and the conversion is an element-wise one; e.g. `s32` elements become
+`f32` elements via an `s32`-to-`f32` conversion routine.
+
+<b> `ConvertElementType(operand, new_element_type)` </b>
+
+Arguments          | Type                    | Semantics
+------------------ | ----------------------- | ---------------------------
+`operand`          | `ComputationDataHandle` | array of type T with dims D
+`new_element_type` | `PrimitiveType`         | type U
+
+The dimensions of the operand and the target shape must match. The source and
+destination element types must not be tuples.
+
+A conversion such as `T=s32` to `U=f32` will perform a normalizing int-to-float
+conversion routine such as round-to-nearest-even.
+
+> Note: The precise float-to-int and visa-versa conversions are currently
+> unspecified, but may become additional arguments to the convert operation in
+> the future.  Not all possible conversions have been implemented for all
+>targets.
+
+```
+let a: s32[3] = {0, 1, 2};
+let b: f32[3] = convert(a, f32);
+then b == f32[3]{0.0, 1.0, 2.0}
+```
+
 ## CrossReplicaSum
 
 See also
@@ -388,9 +586,9 @@ Computes a sum across replicas.
 | `operand`    | `ComputationDataHandle` | Array to sum across replicas.      |
 
 The output shape is the same as the input shape. For example, if there are two
-replicas and the operand has the value `(1.0, 2.5)` and `(3.0, 5.1)`
+replicas and the operand has the value `(1.0, 2.5)` and `(3.0, 5.25)`
 respectively on the two replicas, then the output value from this op will be
-`(4.0, 7.6)` on both replicas.
+`(4.0, 7.75)` on both replicas.
 
 Computing the result of CrossReplicaSum requires having one input from each
 replica, so if one replica executes a CrossReplicaSum node more times than
@@ -490,282 +688,338 @@ contracted dimensions of `lhs` and `rhs` must be of the same size. In practice,
 it can be used to perform dot products between vectors, vector/matrix
 multiplications or matrix/matrix multiplications.
 
-## Element-wise binary arithmetic operations
+## DotGeneral
 
 See also
-[`ComputationBuilder::Add`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`ComputationBuilder::DotGeneral`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
 
-A set of element-wise binary arithmetic operations is supported.
+<b> `DotGeneral(lhs, rhs, dimension_numbers)` </b>
 
-<b> `Op(lhs, rhs)` </b>
+| Arguments | Type                    | Semantics
+| --------- | ----------------------- | ---------------
+| `lhs`     | `ComputationDataHandle` | array of type T
+| `rhs`     | `ComputationDataHandle` | array of type T
+| `dimension_numbers` | `DotDimensionNumbers` | array of type T
 
-Where `Op` is one of `Add` (addition), `Sub` (subtraction), `Mul`
-(multiplication), `Div` (division), `Rem` (remainder), `Max` (maximum), `Min`
-(minimum), `LogicalAnd` (logical AND), or `LogicalOr` (logical OR).
+As Dot, but allows contracting and batch dimension numbers to be specified for
+both the 'lhs' and 'rhs'.
 
-Arguments | Type                    | Semantics
---------- | ----------------------- | ----------------------------------------
-`lhs`     | `ComputationDataHandle` | left-hand-side operand: array of type T
-`rhs`     | `ComputationDataHandle` | right-hand-side operand: array of type T
+| DotDimensionNumbers Fields | Type                    | Semantics
+| --------- | ----------------------- | ---------------
+| 'lhs_contracting_dimensions' | repeated int64 | 'lhs' contracting dimension numbers |
+| 'rhs_contracting_dimensions' | repeated int64 | 'rhs' contracting dimension numbers |
+| 'lhs_batch_dimensions' | repeated int64 | 'lhs' batch dimension numbers |
+| 'rhs_batch_dimensions' | repeated int64 | 'rhs' batch dimension numbers |
 
-The arguments' shapes have to be either similar or compatible. See the
-@{$broadcasting$broadcasting} documentation about what it means for shapes to
-be compatible. The result of an operation has a shape which is the result of
-broadcasting the two input arrays. In this variant, operations between arrays of
-different ranks are *not* supported, unless one of the operands is a scalar.
+DotGeneral performs the sum of products over contracting dimensions specified
+in 'dimension_numbers'.
 
-When `Op` is `Rem`, the sign of the result is taken from the dividend, and the
-absolute value of the result is always less than the divisor's absolute value.
+Associated contracting dimension numbers from the 'lhs' and 'rhs' do not need
+to be the same, but must be listed in the same order in both
+'lhs/rhs_contracting_dimensions' arrays and have the same dimension sizes.
 
-An alternative variant with different-rank broadcasting support exists for these
-operations:
+Example with contracting dimension numbers:
 
-<b> `Op(lhs, rhs, broadcast_dimensions)` </b>
+```
+lhs = { {1.0, 2.0, 3.0},
+        {4.0, 5.0, 6.0} }
 
-Where `Op` is the same as above. This variant of the operation should be used
-for arithmetic operations between arrays of different ranks (such as adding a
-matrix to a vector).
+rhs = { {1.0, 1.0, 1.0},
+        {2.0, 2.0, 2.0} }
 
-The additional `broadcast_dimensions` operand is a slice of integers used to
-expand the rank of the lower-rank operand up to the rank of the higher-rank
-operand. `broadcast_dimensions` maps the dimensions of the lower-rank shape to
-the dimensions of the higher-rank shape. The unmapped dimensions of the expanded
-shape are filled with dimensions of size one. Degenerate-dimension broadcasting
-then broadcasts the shapes along these degenerate dimension to equalize the
-shapes of both operands. The semantics are described in detail on the
-@{$broadcasting$broadcasting page}.
+DotDimensionNumbers dnums;
+dnums.add_lhs_contracting_dimensions(1);
+dnums.add_rhs_contracting_dimensions(1);
 
-## Element-wise comparison operations
+DotGeneral(lhs, rhs, dnums) -> { {6.0, 12.0},
+                                 {15.0, 30.0} }
+```
 
-See also
-[`ComputationBuilder::Eq`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+Associated batch dimension numbers from the 'lhs' and 'rhs' must have the same
+dimension number, must be listed in the same order in both arrays, and must
+have the same dimension sizes.
 
-A set of standard element-wise binary comparison operations is supported. Note
-that standard IEEE 754 floating-point comparison semantics apply when comparing
-floating-point types.
+Example with batch dimension numbers (batch size 2, 2x2 matrices):
 
-<b> `Op(lhs, rhs)` </b>
+```
+lhs = { { {1.0, 2.0},
+          {3.0, 4.0} },
+        { {5.0, 6.0},
+          {7.0, 8.0} } }
+
+rhs = { { {1.0, 0.0},
+          {0.0, 1.0} },
+        { {1.0, 0.0},
+          {0.0, 1.0} } }
+
+DotDimensionNumbers dnums;
+dnums.add_lhs_contracting_dimensions(2);
+dnums.add_rhs_contracting_dimensions(1);
+dnums.add_lhs_batch_dimensions(0);
+dnums.add_rhs_batch_dimensions(0);
+
+DotGeneral(lhs, rhs, dnums) -> { { {1.0, 2.0},
+                                   {3.0, 4.0} },
+                                 { {5.0, 6.0},
+                                   {7.0, 8.0} } }
+```
 
-Where `Op` is one of `Eq` (equal-to), `Ne` (not equal-to), `Ge`
-(greater-or-equal-than), `Gt` (greater-than), `Le` (less-or-equal-than), `Lt`
-(less-than).
+| Input                               | Output            | Semantics        |
+| ----------------------------------- | ----------------- | ---------------- |
+| [b0, m, k] `dot` [b0, k, n]         | [b0, m, n]        |  batch matmul    |
+| [b0, b1, m, k] `dot` [b0, b1, k, n] | [b0, b1, m, n]    |  batch matmul    |
 
-Arguments | Type                    | Semantics
---------- | ----------------------- | ----------------------------------------
-`lhs`     | `ComputationDataHandle` | left-hand-side operand: array of type T
-`rhs`     | `ComputationDataHandle` | right-hand-side operand: array of type T
+## DynamicSlice
 
-The arguments' shapes have to be either similar or compatible. See the
-@{$broadcasting$broadcasting} documentation about what it means for shapes to
-be compatible. The result of an operation has a shape which is the result of
-broadcasting the two input arrays with the element type `PRED`. In this variant,
-operations between arrays of different ranks are *not* supported, unless one of
-the operands is a scalar.
-
-An alternative variant with different-rank broadcasting support exists for these
-operations:
+See also
+[`ComputationBuilder::DynamicSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
 
-<b> `Op(lhs, rhs, broadcast_dimensions)` </b>
+DynamicSlice extracts a sub-array from the input array at dynamic
+`start_indices`. The size of the slice in each dimension is passed in
+`size_indices`, which specify the end point of exclusive slice intervals in each
+dimension: [start, start + size). The shape of `start_indices` must be rank ==
+1, with dimension size equal to the rank of `operand`.
+Note: handling of out-of-bounds slice indices (generated by incorrect runtime
+calculation of 'start_indices') is currently implementation-defined. Currently,
+slice indices are computed modulo input dimension sizes to prevent out-of-bound
+array accesses, but this behavior may change in future implementations.
 
-Where `Op` is the same as above. This variant of the operation should be used
-for comparison operations between arrays of different ranks (such as adding a
-matrix to a vector).
+<b> `DynamicSlice(operand, start_indices, size_indices)` </b>
 
-The additional `broadcast_dimensions` operand is a slice of integers specifying
-the dimensions to use for broadcasting the operands. The semantics are described
-in detail on the @{$broadcasting$broadcasting page}.
+| Arguments       | Type                    | Semantics                        |
+| --------------- | ----------------------- | -------------------------------- |
+| `operand`       | `ComputationDataHandle` | N dimensional array of type T    |
+| `start_indices` | `ComputationDataHandle` | Rank 1 array of N integers       |
+:                 :                         : containing the starting indices  :
+:                 :                         : of the slice for each dimension. :
+:                 :                         : Value must be greater than or    :
+:                 :                         : equal to zero.                   :
+| `size_indices`  | `ArraySlice<int64>`     | List of N integers containing    |
+:                 :                         : the slice size for each          :
+:                 :                         : dimension. Each value must be    :
+:                 :                         : strictly greater than zero, and  :
+:                 :                         : start + size must be less than   :
+:                 :                         : or equal to the size of the      :
+:                 :                         : dimension to avoid wrapping      :
+:                 :                         : modulo dimension size.           :
 
-## Element-wise unary functions
+1-dimensional example:
 
-ComputationBuilder supports these element-wise unary functions:
+```
+let a = {0.0, 1.0, 2.0, 3.0, 4.0}
+let s = {2}
 
-<b>`Abs(operand)`</b> Element-wise abs `x -> |x|`.
+DynamicSlice(a, s, {2}) produces:
+  {2.0, 3.0}
+```
 
-<b>`Ceil(operand)`</b> Element-wise ceil `x -> ⌈x⌉`.
+2-dimensional example:
 
-<b>`Cos(operand)`</b> Element-wise cosine `x -> cos(x)`.
+```
+let b =
+ { {0.0,  1.0,  2.0},
+   {3.0,  4.0,  5.0},
+   {6.0,  7.0,  8.0},
+   {9.0, 10.0, 11.0} }
+let s = {2, 1}
 
-<b>`Exp(operand)`</b> Element-wise natural exponential `x -> e^x`.
+DynamicSlice(b, s, {2, 2}) produces:
+  { { 7.0,  8.0},
+    {10.0, 11.0} }
+```
+## DynamicUpdateSlice
 
-<b>`Floor(operand)`</b> Element-wise floor `x -> ⌊x⌋`.
+See also
+[`ComputationBuilder::DynamicUpdateSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
 
-<b>`IsFinite(operand)`</b> Tests whether each element of `operand` is finite,
-i.e., is not positive or negative infinity, and is not `NaN`. Returns an array
-of `PRED` values with the same shape as the input, where each element is `true`
-if and only if the corresponding input element is finite.
+DynamicUpdateSlice generates a result which is the value of the input array
+`operand`, with a slice `update` overwritten at `start_indices`.
+The shape of `update` determines the shape of the sub-array of the result which
+is updated.
+The shape of `start_indices` must be rank == 1, with dimension size equal to
+the rank of `operand`.
+Note: handling of out-of-bounds slice indices (generated by incorrect runtime
+calculation of 'start_indices') is currently implementation-defined. Currently,
+slice indices are computed modulo update dimension sizes to prevent out-of-bound
+array accesses, but this behavior may change in future implementations.
 
-<b>`Log(operand)`</b> Element-wise natural logarithm `x -> ln(x)`.
+<b> `DynamicUpdateSlice(operand, update, start_indices)` </b>
 
-<b>`LogicalNot(operand)`</b> Element-wise logical not `x -> !(x)`.
+| Arguments       | Type                    | Semantics                        |
+| --------------- | ----------------------- | -------------------------------- |
+| `operand`       | `ComputationDataHandle` | N dimensional array of type T    |
+| `update`        | `ComputationDataHandle` | N dimensional array of type T    |
+:                 :                         : containing the slice update.     :
+:                 :                         : Each dimension of update shape    :
+:                 :                         : must be strictly greater than    :
+:                 :                         : zero, and start + update must be :
+:                 :                         : less than operand size for each  :
+:                 :                         : dimension to avoid generating    :
+:                 :                         : out-of-bounds update indices.    :
+| `start_indices` | `ComputationDataHandle` | Rank 1 array of N integers       |
+:                 :                         : containing the starting indices  :
+:                 :                         : of the slice for each dimension. :
+:                 :                         : Value must be greater than or    :
+:                 :                         : equal to zero.                   :
 
-<b>`Neg(operand)`</b> Element-wise negation `x -> -x`.
+1-dimensional example:
 
-<b>`Sign(operand)`</b> Element-wise sign operation `x -> sgn(x)` where
+```
+let a = {0.0, 1.0, 2.0, 3.0, 4.0}
+let u = {5.0, 6.0}
+let s = {2}
 
-$$\text{sgn}(x) = \begin{cases} -1 & x < 0\\ 0 & x = 0\\ 1 & x > 0 \end{cases}$$
+DynamicUpdateSlice(a, u, s) produces:
+  {0.0, 1.0, 5.0, 6.0, 4.0}
+```
 
-using the comparison operator of the element type of `operand`.
+2-dimensional example:
 
-<b>`Tanh(operand)`</b> Element-wise hyperbolic tangent `x -> tanh(x)`.
+```
+let b =
+ { {0.0,  1.0,  2.0},
+   {3.0,  4.0,  5.0},
+   {6.0,  7.0,  8.0},
+   {9.0, 10.0, 11.0} }
+let u =
+ { {12.0,  13.0},
+   {14.0,  15.0},
+   {16.0,  17.0} }
 
+let s = {1, 1}
 
-Arguments | Type                    | Semantics
---------- | ----------------------- | ---------------------------
-`operand` | `ComputationDataHandle` | The operand to the function
+DynamicUpdateSlice(b, u, s) produces:
+ { {0.0,  1.0,  2.0},
+   {3.0, 12.0, 13.0},
+   {6.0, 14.0, 15.0},
+   {9.0, 16.0, 17.0} }
+```
 
-The function is applied to each element in the `operand` array, resulting in an
-array with the same shape. It is allowed for `operand` to be a scalar (rank 0).
+## Element-wise binary arithmetic operations
 
+See also
+[`ComputationBuilder::Add`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
 
-## BatchNormTraining
+A set of element-wise binary arithmetic operations is supported.
 
-See also
-[`ComputationBuilder::BatchNormTraining`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h) and
-[`the original batch normalization paper`](https://arxiv.org/abs/1502.03167)
-for a detailed description of the algorithm.
+<b> `Op(lhs, rhs)` </b>
 
-<b> Warning: Not implemented on GPU backend yet. </b>
+Where `Op` is one of `Add` (addition), `Sub` (subtraction), `Mul`
+(multiplication), `Div` (division), `Rem` (remainder), `Max` (maximum), `Min`
+(minimum), `LogicalAnd` (logical AND), or `LogicalOr` (logical OR).
 
-Normalizes an array across batch and spatial dimensions.
+Arguments | Type                    | Semantics
+--------- | ----------------------- | ----------------------------------------
+`lhs`     | `ComputationDataHandle` | left-hand-side operand: array of type T
+`rhs`     | `ComputationDataHandle` | right-hand-side operand: array of type T
 
-<b> `BatchNormTraining(operand, scale, offset, epsilon, feature_index)` </b>
+The arguments' shapes have to be either similar or compatible. See the
+@{$broadcasting$broadcasting} documentation about what it means for shapes to
+be compatible. The result of an operation has a shape which is the result of
+broadcasting the two input arrays. In this variant, operations between arrays of
+different ranks are *not* supported, unless one of the operands is a scalar.
 
-| Arguments       | Type                    | Semantics                        |
-| --------------- | ----------------------- | -------------------------------- |
-| `operand`       | `ComputationDataHandle` | n dimensional array to be        |
-:                 :                         : normalized                       :
-| `scale`         | `ComputationDataHandle` | 1 dimensional array              |
-:                 :                         : (\\(\gamma\\))                   :
-| `offset`        | `ComputationDataHandle` | 1 dimensional array              |
-:                 :                         : (\\(\beta\\ )                    :
-| `epsilon`       | `float`                 | Epsilon value (\\(\epsilon\\))   |
-| `feature_index` | `int64`                 | Index to feature dimension       |
-:                 :                         : in `operand`                     :
+When `Op` is `Rem`, the sign of the result is taken from the dividend, and the
+absolute value of the result is always less than the divisor's absolute value.
 
+An alternative variant with different-rank broadcasting support exists for these
+operations:
 
-For each feature in the feature dimension (`feature_index` is the index for the
-feature dimension in `operand`), the operation calculates the mean and variance
-across all the other dimensions and use the mean and variance to normalize each
-element in `operand`. If an invalid `feature_index` is passed, an error is
-produced.
+<b> `Op(lhs, rhs, broadcast_dimensions)` </b>
 
-The algorithm goes as follows for each batch in `operand` \\(x\\) that
-contains `m` elements with `w` and `h` as the size of spatial dimensions (
-assuming `operand` is an 4 dimensional array):
+Where `Op` is the same as above. This variant of the operation should be used
+for arithmetic operations between arrays of different ranks (such as adding a
+matrix to a vector).
 
-- Calculates batch mean \\(\mu_l\\) for each feature `l` in feature dimension:
-\\(\mu_l=\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h x_{ijkl}\\)
+The additional `broadcast_dimensions` operand is a slice of integers used to
+expand the rank of the lower-rank operand up to the rank of the higher-rank
+operand. `broadcast_dimensions` maps the dimensions of the lower-rank shape to
+the dimensions of the higher-rank shape. The unmapped dimensions of the expanded
+shape are filled with dimensions of size one. Degenerate-dimension broadcasting
+then broadcasts the shapes along these degenerate dimension to equalize the
+shapes of both operands. The semantics are described in detail on the
+@{$broadcasting$broadcasting page}.
 
-- Calculates batch variance \\(\sigma^2_l\\):
-\\(\sigma^2_l=\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h (x_{ijkl} - \mu_l)^2\\)
+## Element-wise comparison operations
 
-- Normalizes, scales and shifts:
-\\(y_{ijkl}=\frac{\gamma_l(x_{ijkl}-\mu_l)}{\sqrt[2]{\sigma^2_l+\epsilon}}+\beta_l\\)
+See also
+[`ComputationBuilder::Eq`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
 
-The epsilon value, usually a small number, is added to avoid divide-by-zero errors.
+A set of standard element-wise binary comparison operations is supported. Note
+that standard IEEE 754 floating-point comparison semantics apply when comparing
+floating-point types.
 
-The output type is a tuple of three ComputationDataHandles:
+<b> `Op(lhs, rhs)` </b>
 
-| Outputs      | Type                    | Semantics                            |
-| ------------ | ----------------------- | -------------------------------------|
-| `output`     | `ComputationDataHandle` | n dimensional array with the same    |
-:              :                         : shape as input `operand` (y)         :
-| `batch_mean` | `ComputationDataHandle` | 1 dimensional array (\\(\mu\\))      |
-| `batch_var`  | `ComputationDataHandle` | 1 dimensional array (\\(\sigma^2\\)) |
+Where `Op` is one of `Eq` (equal-to), `Ne` (not equal-to), `Ge`
+(greater-or-equal-than), `Gt` (greater-than), `Le` (less-or-equal-than), `Lt`
+(less-than).
 
-The `batch_mean` and `batch_var` are moments calculated across the batch and
-spatial dimensions using the formulas above.
+Arguments | Type                    | Semantics
+--------- | ----------------------- | ----------------------------------------
+`lhs`     | `ComputationDataHandle` | left-hand-side operand: array of type T
+`rhs`     | `ComputationDataHandle` | right-hand-side operand: array of type T
 
-## BatchNormInference
+The arguments' shapes have to be either similar or compatible. See the
+@{$broadcasting$broadcasting} documentation about what it means for shapes to
+be compatible. The result of an operation has a shape which is the result of
+broadcasting the two input arrays with the element type `PRED`. In this variant,
+operations between arrays of different ranks are *not* supported, unless one of
+the operands is a scalar.
 
-See also
-[`ComputationBuilder::BatchNormInference`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+An alternative variant with different-rank broadcasting support exists for these
+operations:
 
-<b> Warning: Not implemented yet. </b>
+<b> `Op(lhs, rhs, broadcast_dimensions)` </b>
 
-Normalizes an array across batch and spatial dimensions.
+Where `Op` is the same as above. This variant of the operation should be used
+for comparison operations between arrays of different ranks (such as adding a
+matrix to a vector).
 
-<b> `BatchNormInference(operand, scale, offset, mean, variance, epsilon, feature_index)` </b>
+The additional `broadcast_dimensions` operand is a slice of integers specifying
+the dimensions to use for broadcasting the operands. The semantics are described
+in detail on the @{$broadcasting$broadcasting page}.
 
-| Arguments       | Type                    | Semantics                       |
-| --------------  | ----------------------- | ------------------------------- |
-| `operand`       | `ComputationDataHandle` | n dimensional array to be       |
-:                 :                         : normalized                      :
-| `scale`         | `ComputationDataHandle` | 1 dimensional array             |
-| `offset`        | `ComputationDataHandle` | 1 dimensional array             |
-| `mean`          | `ComputationDataHandle` | 1 dimensional array             |
-| `variance`      | `ComputationDataHandle` | 1 dimensional array             |
-| `epsilon`       | `float`                 | Epsilon value                   |
-| `feature_index` | `int64`                 | Index to feature dimension in   |
-:                 :                         : `operand`                       :
+## Element-wise unary functions
 
-For each feature in the feature dimension (`feature_index` is the index for the
-feature dimension in `operand`), the operation calculates the mean and variance
-across all the other dimensions and use the mean and variance to normalize each
-element in `operand`. If an invalid `feature_index` is passed, an error is
-produced.
+ComputationBuilder supports these element-wise unary functions:
 
-`BatchNormInference`  is equivalent to calling `BatchNormTraining` without
-computing `mean` and `variance` for each batch. It uses the input `mean` and
-`variance` instead as estimated values. The purpose of this op is to reduce
-latency in inference, hence the name `BatchNormInference`.
+<b>`Abs(operand)`</b> Element-wise abs `x -> |x|`.
 
-The output is a n dimensional, normalized array with the same shape as input
-`operand`.
+<b>`Ceil(operand)`</b> Element-wise ceil `x -> ⌈x⌉`.
 
-## BatchNormGrad
+<b>`Cos(operand)`</b> Element-wise cosine `x -> cos(x)`.
 
-See also
-[`ComputationBuilder::BatchNormGrad`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+<b>`Exp(operand)`</b> Element-wise natural exponential `x -> e^x`.
 
-<b> Warning: Not implemented yet. </b>
+<b>`Floor(operand)`</b> Element-wise floor `x -> ⌊x⌋`.
 
-Calculates gradients of batch norm.
+<b>`IsFinite(operand)`</b> Tests whether each element of `operand` is finite,
+i.e., is not positive or negative infinity, and is not `NaN`. Returns an array
+of `PRED` values with the same shape as the input, where each element is `true`
+if and only if the corresponding input element is finite.
 
-<b> `BatchNormGrad(operand, scale, mean, variance, grad_output, epsilon, feature_index)` </b>
+<b>`Log(operand)`</b> Element-wise natural logarithm `x -> ln(x)`.
 
-| Arguments       | Type                    | Semantics                        |
-| --------------  | ----------------------- | -------------------------------- |
-| `operand`       | `ComputationDataHandle` | n dimensional array to be        |
-:                 :                         : normalized (x)                   :
-| `scale`         | `ComputationDataHandle` | 1 dimensional array              |
-:                 :                         : (\\(\gamma\\))                   :
-| `mean`          | `ComputationDataHandle` | 1 dimensional array (\\(\mu\\))  |
-| `variance`      | `ComputationDataHandle` | 1 dimensional array              |
-:                 :                         : (\\(\sigma^2\\))                 :
-| `grad_output`   | `ComputationDataHandle` | Gradients passed to              |
-:                 :                         : `BatchNormTraining`              :
-:                 :                         : (\\( \nabla y\\))                :
-| `epsilon`       | `float`                 | Epsilon value (\\(\epsilon\\))   |
-| `feature_index` | `int64`                 | Index to feature dimension in    |
-:                 :                         : `operand`                        :
+<b>`LogicalNot(operand)`</b> Element-wise logical not `x -> !(x)`.
 
-For each feature in the feature dimension (`feature_index` is the index for the
-feature dimension in `operand`), the operation calculates the gradients with
-respect to `operand`, `offset` and `scale` across all the other dimensions. If
-an invalid `feature_index` is passed, an error is produced.
+<b>`Neg(operand)`</b> Element-wise negation `x -> -x`.
 
-The three gradients are defined by the following formulas:
+<b>`Sign(operand)`</b> Element-wise sign operation `x -> sgn(x)` where
 
-\\( \nabla x = \nabla y * \gamma * \sqrt{\sigma^2+\epsilon} \\)
+$$\text{sgn}(x) = \begin{cases} -1 & x < 0\\ 0 & x = 0\\ 1 & x > 0 \end{cases}$$
 
-\\( \nabla \gamma = sum(\nabla y * (x - \mu) * \sqrt{\sigma^2 + \epsilon}) \\)
+using the comparison operator of the element type of `operand`.
 
-\\( \nabla \beta = sum(\nabla y) \\)
+<b>`Tanh(operand)`</b> Element-wise hyperbolic tangent `x -> tanh(x)`.
 
-The inputs `mean` and `variance` represents moments value
-across batch and spatial dimensions.
 
-The output type is a tuple of three ComputationDataHandles:
+Arguments | Type                    | Semantics
+--------- | ----------------------- | ---------------------------
+`operand` | `ComputationDataHandle` | The operand to the function
 
-|Outputs       | Type                    | Semantics                           |
-|------------- | ----------------------- | ------------------------------------|
-|`grad_operand`| `ComputationDataHandle` | gradient with respect to input      |
-:              :                         : `operand`                           :
-|`grad_offset` | `ComputationDataHandle` | gradient with respect to input      |
-:              :                         : `offset`                            :
-|`grad_scale`  | `ComputationDataHandle` | gradient with respect to input      |
-:              :                         : `scale`                             :
+The function is applied to each element in the `operand` array, resulting in an
+array with the same shape. It is allowed for `operand` to be a scalar (rank 0).
 
 
 ## GetTupleElement
@@ -808,8 +1062,7 @@ device, interpreting the data as the given shape and its layout, and returns a
 `ComputationDataHandle` of the data. Multiple Infeed operations are allowed in a
 computation, but there must be a total order among the Infeed operations. For
 example, two Infeeds in the code below have a total order since there is a
-dependency between the while loops. The compiler issues an error if there isn't
-a total order.
+dependency between the while loops.
 
 ```
 result1 = while (condition, init = init_value) {
@@ -935,61 +1188,6 @@ transfer. The context is a tuple of {receive buffer (shape), request identifier
 Given a context created by a `Recv` instruction, waits for the data transfer to
 complete and returns the received data.
 
-## Send
-
-See also
-[`ComputationBuilder::Send`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
-
-<b> `Send(operand, channel_handle)` </b>
-
-| Arguments        | Type                    | Semantics                        |
-| ---------------- | ----------------------- | -------------------------------- |
-| `operand`        | `ComputationDataHandle` | data to send (array of type T)   |
-| `channel_handle` | `ChannelHandle`         | unique identifier for each send/recv pair |
-
-Sends the given operand data to a `Recv` instruction in another computation
-that shares the same channel handle. Does not return any data.
-
-Similar to the `Recv` operation, the client API of `Send` operation represents
-synchronous communication, and is internally decomposed into 2 HLO instructions
-(`Send` and `SendDone`) to enable asynchronous data transfers. See also
-[`HloInstruction::CreateSend` and `HloInstruction::CreateSendDone`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/hlo_instruction.h).
-
-<b>`Send(HloInstruction operand, int64 channel_id)`</b>
-
-Initiates an asynchronous transfer of the operand to the resources allocated by
-the `Recv` instruction with the same channel id. Returns a context, which is
-used by a following `SendDone` instruction to wait for the completion of the
-data transfer. The context is a tuple of {operand (shape), request identifier
-(U32)} and it can only be used by a `SendDone` instruction.
-
-<b> `SendDone(HloInstruction context)` </b>
-
-Given a context created by a `Send` instruction, waits for the data transfer to
-complete.  The instruction does not return any data.
-
-<b> Scheduling of channel instructions </b>
-
-The execution order of the 4 instructions for each channel (`Recv`, `RecvDone`,
-`Send`, `SendDone`) is as below.
-
-<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:70%" src="../../images/send_recv_order.png">
-</div>
-
-* `Recv` happens before `Send`
-* `Send` happens before `RecvDone`
-* `Recv` happens before `RecvDone`
-* `Send` happens before `SendDone`
-
-When the backend compilers generate a linear schedule for each computation that
-communicates via channel instructions, there must not be cycles across the
-computations. For example, below schedules lead to deadlocks.
-
-<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/send_recv_schedule.png">
-</div>
-
 ## Reduce
 
 See also
@@ -1143,7 +1341,6 @@ must have a non-negative number of mantissa bits.  The number of exponent or
 mantissa bits may exceed the corresponding value for type `T`; the corresponding
 portion of the conversion is then simply a no-op.
 
-
 ## ReduceWindow
 
 See also
@@ -1357,34 +1554,85 @@ be scalar valued.
 
 <b>`RngNormal(mean, sigma, shape)`</b>
 
-| Arguments | Type                    | Semantics                              |
-| --------- | ----------------------- | -------------------------------------- |
-| `mu`      | `ComputationDataHandle` | Scalar of type F32 specifying mean of  |
-:           :                         : generated numbers                      :
-| `sigma`   | `ComputationDataHandle` | Scalar of type F32 specifying standard |
-:           :                         : deviation of generated numbers         :
-| `shape`   | `Shape`                 | Output shape of type F32               |
+| Arguments | Type                    | Semantics                              |
+| --------- | ----------------------- | -------------------------------------- |
+| `mu`      | `ComputationDataHandle` | Scalar of type F32 specifying mean of  |
+:           :                         : generated numbers                      :
+| `sigma`   | `ComputationDataHandle` | Scalar of type F32 specifying standard |
+:           :                         : deviation of generated numbers         :
+| `shape`   | `Shape`                 | Output shape of type F32               |
+
+## RngUniform
+
+See also
+[`ComputationBuilder::RngUniform`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+
+Constructs an output of a given shape with random numbers generated following
+the uniform distribution over the interval $$[a,b)$$. The parameters and output
+shape may be either F32, S32 or U32, but the types have to be consistent.
+Furthermore, the parameters need to be scalar valued. If $$b <= a$$ the result
+is implementation-defined.
+
+<b>`RngUniform(a, b, shape)`</b>
+
+| Arguments | Type                    | Semantics                         |
+| --------- | ----------------------- | --------------------------------- |
+| `a`       | `ComputationDataHandle` | Scalar of type T specifying lower |
+:           :                         : limit of interval                 :
+| `b`       | `ComputationDataHandle` | Scalar of type T specifying upper |
+:           :                         : limit of interval                 :
+| `shape`   | `Shape`                 | Output shape of type T            |
+
+## Select
+
+See also
+[`ComputationBuilder::Select`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+
+Constructs an output array from elements of two input arrays, based on the
+values of a predicate array.
+
+<b> `Select(pred, on_true, on_false)` </b>
+
+Arguments  | Type                    | Semantics
+---------- | ----------------------- | ------------------
+`pred`     | `ComputationDataHandle` | array of type PRED
+`on_true`  | `ComputationDataHandle` | array of type T
+`on_false` | `ComputationDataHandle` | array of type T
+
+The arrays `on_true` and `on_false` must have the same shape. This is also the
+shape of the output array. The array `pred` must have the same dimensionality as
+`on_true` and `on_false`, with the `PRED` element type.
+
+For each element `P` of `pred`, the corresponding element of the output array is
+taken from `on_true` if the value of `P` is `true`, and from `on_false` if the
+value of `P` is `false`. As a restricted form of [broadcasting]
+(broadcasting.md), `pred` can be a scalar of type `PRED`. In this case, the
+output array is taken wholly from `on_true` if `pred` is `true`, and from
+`on_false` if `pred` is `false`.
 
-## RngUniform
+Example with non-scalar `pred`:
 
-See also
-[`ComputationBuilder::RngUniform`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+```
+let pred: PRED[4] = {true, false, false, true};
+let v1: s32[4] = {1, 2, 3, 4};
+let v2: s32[4] = {100, 200, 300, 400};
+==>
+Select(pred, v1, v2) = s32[4]{1, 200, 300, 4};
+```
 
-Constructs an output of a given shape with random numbers generated following
-the uniform distribution over the interval $$[a,b)$$. The parameters and output
-shape may be either F32, S32 or U32, but the types have to be consistent.
-Furthermore, the parameters need to be scalar valued. If $$b <= a$$ the result
-is implementation-defined.
+Example with scalar `pred`:
 
-<b>`RngUniform(a, b, shape)`</b>
+```
+let pred: PRED = true;
+let v1: s32[4] = {1, 2, 3, 4};
+let v2: s32[4] = {100, 200, 300, 400};
+==>
+Select(pred, v1, v2) = s32[4]{1, 2, 3, 4};
+```
 
-| Arguments | Type                    | Semantics                         |
-| --------- | ----------------------- | --------------------------------- |
-| `a`       | `ComputationDataHandle` | Scalar of type T specifying lower |
-:           :                         : limit of interval                 :
-| `b`       | `ComputationDataHandle` | Scalar of type T specifying upper |
-:           :                         : limit of interval                 :
-| `shape`   | `Shape`                 | Output shape of type T            |
+Selections between tuples are supported. Tuples are considered to be scalar
+types for this purpose. If `on_true` and `on_false` are tuples (which must have
+the same shape!) then `pred` has to be a scalar of type `PRED`.
 
 ## SelectAndScatter
 
@@ -1467,56 +1715,60 @@ non-deterministic. Therefore, the `scatter` function should not be overly
 sensitive to reassociation. See the discussion about associativity in the
 context of [`Reduce`](#reduce) for more details.
 
-## Select
+## Send
 
 See also
-[`ComputationBuilder::Select`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`ComputationBuilder::Send`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
 
-Constructs an output array from elements of two input arrays, based on the
-values of a predicate array.
+<b> `Send(operand, channel_handle)` </b>
 
-<b> `Select(pred, on_true, on_false)` </b>
+| Arguments        | Type                    | Semantics                        |
+| ---------------- | ----------------------- | -------------------------------- |
+| `operand`        | `ComputationDataHandle` | data to send (array of type T)   |
+| `channel_handle` | `ChannelHandle`         | unique identifier for each send/recv pair |
 
-Arguments  | Type                    | Semantics
----------- | ----------------------- | ------------------
-`pred`     | `ComputationDataHandle` | array of type PRED
-`on_true`  | `ComputationDataHandle` | array of type T
-`on_false` | `ComputationDataHandle` | array of type T
+Sends the given operand data to a `Recv` instruction in another computation
+that shares the same channel handle. Does not return any data.
 
-The arrays `on_true` and `on_false` must have the same shape. This is also the
-shape of the output array. The array `pred` must have the same dimensionality as
-`on_true` and `on_false`, with the `PRED` element type.
+Similar to the `Recv` operation, the client API of `Send` operation represents
+synchronous communication, and is internally decomposed into 2 HLO instructions
+(`Send` and `SendDone`) to enable asynchronous data transfers. See also
+[`HloInstruction::CreateSend` and `HloInstruction::CreateSendDone`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/hlo_instruction.h).
 
-For each element `P` of `pred`, the corresponding element of the output array is
-taken from `on_true` if the value of `P` is `true`, and from `on_false` if the
-value of `P` is `false`. As a restricted form of [broadcasting]
-(broadcasting.md), `pred` can be a scalar of type `PRED`. In this case, the
-output array is taken wholly from `on_true` if `pred` is `true`, and from
-`on_false` if `pred` is `false`.
+<b>`Send(HloInstruction operand, int64 channel_id)`</b>
 
-Example with non-scalar `pred`:
+Initiates an asynchronous transfer of the operand to the resources allocated by
+the `Recv` instruction with the same channel id. Returns a context, which is
+used by a following `SendDone` instruction to wait for the completion of the
+data transfer. The context is a tuple of {operand (shape), request identifier
+(U32)} and it can only be used by a `SendDone` instruction.
 
-```
-let pred: PRED[4] = {true, false, false, true};
-let v1: s32[4] = {1, 2, 3, 4};
-let v2: s32[4] = {100, 200, 300, 400};
-==>
-Select(pred, v1, v2) = s32[4]{1, 200, 300, 4};
-```
+<b> `SendDone(HloInstruction context)` </b>
 
-Example with scalar `pred`:
+Given a context created by a `Send` instruction, waits for the data transfer to
+complete.  The instruction does not return any data.
 
-```
-let pred: PRED = true;
-let v1: s32[4] = {1, 2, 3, 4};
-let v2: s32[4] = {100, 200, 300, 400};
-==>
-Select(pred, v1, v2) = s32[4]{1, 2, 3, 4};
-```
+<b> Scheduling of channel instructions </b>
 
-Selections between tuples are supported. Tuples are considered to be scalar
-types for this purpose. If `on_true` and `on_false` are tuples (which must have
-the same shape!) then `pred` has to be a scalar of type `PRED`.
+The execution order of the 4 instructions for each channel (`Recv`, `RecvDone`,
+`Send`, `SendDone`) is as below.
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:70%" src="../../images/send_recv_order.png">
+</div>
+
+* `Recv` happens before `Send`
+* `Send` happens before `RecvDone`
+* `Recv` happens before `RecvDone`
+* `Send` happens before `SendDone`
+
+When the backend compilers generate a linear schedule for each computation that
+communicates via channel instructions, there must not be cycles across the
+computations. For example, below schedules lead to deadlocks.
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" src="../../images/send_recv_schedule.png">
+</div>
 
 ## Slice
 
@@ -1570,132 +1822,6 @@ Slice(b, {2, 1}, {4, 3}) produces:
     {10.0, 11.0} }
 ```
 
-## DynamicSlice
-
-See also
-[`ComputationBuilder::DynamicSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
-
-DynamicSlice extracts a sub-array from the input array at dynamic
-`start_indices`. The size of the slice in each dimension is passed in
-`size_indices`, which specify the end point of exclusive slice intervals in each
-dimension: [start, start + size). The shape of `start_indices` must be rank ==
-1, with dimension size equal to the rank of `operand`.
-Note: handling of out-of-bounds slice indices (generated by incorrect runtime
-calculation of 'start_indices') is currently implementation-defined. Currently,
-slice indices are computed modulo input dimension sizes to prevent out-of-bound
-array accesses, but this behavior may change in future implementations.
-
-<b> `DynamicSlice(operand, start_indices, size_indices)` </b>
-
-| Arguments       | Type                    | Semantics                        |
-| --------------- | ----------------------- | -------------------------------- |
-| `operand`       | `ComputationDataHandle` | N dimensional array of type T    |
-| `start_indices` | `ComputationDataHandle` | Rank 1 array of N integers       |
-:                 :                         : containing the starting indices  :
-:                 :                         : of the slice for each dimension. :
-:                 :                         : Value must be greater than or    :
-:                 :                         : equal to zero.                   :
-| `size_indices`  | `ArraySlice<int64>`     | List of N integers containing    |
-:                 :                         : the slice size for each          :
-:                 :                         : dimension. Each value must be    :
-:                 :                         : strictly greater than zero, and  :
-:                 :                         : start + size must be less than   :
-:                 :                         : or equal to the size of the      :
-:                 :                         : dimension to avoid wrapping      :
-:                 :                         : modulo dimension size.           :
-
-1-dimensional example:
-
-```
-let a = {0.0, 1.0, 2.0, 3.0, 4.0}
-let s = {2}
-
-DynamicSlice(a, s, {2}) produces:
-  {2.0, 3.0}
-```
-
-2-dimensional example:
-
-```
-let b =
- { {0.0,  1.0,  2.0},
-   {3.0,  4.0,  5.0},
-   {6.0,  7.0,  8.0},
-   {9.0, 10.0, 11.0} }
-let s = {2, 1}
-
-DynamicSlice(b, s, {2, 2}) produces:
-  { { 7.0,  8.0},
-    {10.0, 11.0} }
-```
-## DynamicUpdateSlice
-
-See also
-[`ComputationBuilder::DynamicUpdateSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
-
-DynamicUpdateSlice generates a result which is the value of the input array
-`operand`, with a slice `update` overwritten at `start_indices`.
-The shape of `update` determines the shape of the sub-array of the result which
-is updated.
-The shape of `start_indices` must be rank == 1, with dimension size equal to
-the rank of `operand`.
-Note: handling of out-of-bounds slice indices (generated by incorrect runtime
-calculation of 'start_indices') is currently implementation-defined. Currently,
-slice indices are computed modulo update dimension sizes to prevent out-of-bound
-array accesses, but this behavior may change in future implementations.
-
-<b> `DynamicUpdateSlice(operand, update, start_indices)` </b>
-
-| Arguments       | Type                    | Semantics                        |
-| --------------- | ----------------------- | -------------------------------- |
-| `operand`       | `ComputationDataHandle` | N dimensional array of type T    |
-| `update`        | `ComputationDataHandle` | N dimensional array of type T    |
-:                 :                         : containing the slice update.     :
-:                 :                         : Each dimension of update shape    :
-:                 :                         : must be strictly greater than    :
-:                 :                         : zero, and start + update must be :
-:                 :                         : less than operand size for each  :
-:                 :                         : dimension to avoid generating    :
-:                 :                         : out-of-bounds update indices.    :
-| `start_indices` | `ComputationDataHandle` | Rank 1 array of N integers       |
-:                 :                         : containing the starting indices  :
-:                 :                         : of the slice for each dimension. :
-:                 :                         : Value must be greater than or    :
-:                 :                         : equal to zero.                   :
-
-1-dimensional example:
-
-```
-let a = {0.0, 1.0, 2.0, 3.0, 4.0}
-let u = {5.0, 6.0}
-let s = {2}
-
-DynamicUpdateSlice(a, u, s) produces:
-  {0.0, 1.0, 5.0, 6.0, 4.0}
-```
-
-2-dimensional example:
-
-```
-let b =
- { {0.0,  1.0,  2.0},
-   {3.0,  4.0,  5.0},
-   {6.0,  7.0,  8.0},
-   {9.0, 10.0, 11.0} }
-let u =
- { {12.0,  13.0},
-   {14.0,  15.0},
-   {16.0,  17.0} }
-
-let s = {1, 1}
-
-DynamicUpdateSlice(b, u, s) produces:
- { {0.0,  1.0,  2.0},
-   {3.0, 12.0, 13.0},
-   {6.0, 14.0, 15.0},
-   {9.0, 16.0, 17.0} }
-```
-
 ## Sort
 
 See also
diff --git a/tensorflow/docs_src/programmers_guide/datasets.md b/tensorflow/docs_src/programmers_guide/datasets.md
index f458cbcef228b60fcce095a9326b5ea36494cde3..308cbad376468b4ae29b8e321ec8ce85c102cd47 100644
--- a/tensorflow/docs_src/programmers_guide/datasets.md
+++ b/tensorflow/docs_src/programmers_guide/datasets.md
@@ -1,16 +1,16 @@
 # Importing Data
 
-The @{tf.data.Dataset$`Dataset`} API enables you to build complex input pipelines from
+The `tf.data` API enables you to build complex input pipelines from
 simple, reusable pieces. For example, the pipeline for an image model might
 aggregate data from files in a distributed file system, apply random
 perturbations to each image, and merge randomly selected images into a batch
 for training. The pipeline for a text model might involve extracting symbols
 from raw text data, converting them to embedding identifiers with a lookup
-table, and batching together sequences of different lengths. The `Dataset` API
+table, and batching together sequences of different lengths. The `tf.data` API
 makes it easy to deal with large amounts of data, different data formats, and
 complicated transformations.
 
-The `Dataset` API introduces two new abstractions to TensorFlow:
+The `tf.data` API introduces two new abstractions to TensorFlow:
 
 * A `tf.data.Dataset` represents a sequence of elements, in which
   each element contains one or more `Tensor` objects. For example, in an image
@@ -121,7 +121,7 @@ dataset3 = dataset3.filter(lambda x, (y, z): ...)
 ### Creating an iterator
 
 Once you have built a `Dataset` to represent your input data, the next step is to
-create an `Iterator` to access elements from that dataset.  The `Dataset` API
+create an `Iterator` to access elements from that dataset.  The `tf.data` API
 currently supports the following iterators, in increasing level of
 sophistication:
 
@@ -190,8 +190,8 @@ validation_dataset = tf.data.Dataset.range(50)
 # A reinitializable iterator is defined by its structure. We could use the
 # `output_types` and `output_shapes` properties of either `training_dataset`
 # or `validation_dataset` here, because they are compatible.
-iterator = Iterator.from_structure(training_dataset.output_types,
-                                   training_dataset.output_shapes)
+iterator = tf.data.Iterator.from_structure(training_dataset.output_types,
+                                           training_dataset.output_shapes)
 next_element = iterator.get_next()
 
 training_init_op = iterator.make_initializer(training_dataset)
@@ -379,7 +379,7 @@ sess.run(iterator.initializer, feed_dict={features_placeholder: features,
 
 ### Consuming TFRecord data
 
-The `Dataset` API supports a variety of file formats so that you can process
+The `tf.data` API supports a variety of file formats so that you can process
 large datasets that do not fit in memory. For example, the TFRecord file format
 is a simple record-oriented binary format that many TensorFlow applications use
 for training data. The `tf.data.TFRecordDataset` class enables you to
@@ -628,7 +628,7 @@ TODO(mrry): Add this section.
 
 ### Processing multiple epochs
 
-The `Dataset` API offers two main ways to process multiple epochs of the same
+The `tf.data` API offers two main ways to process multiple epochs of the same
 data.
 
 The simplest way to iterate over a dataset in multiple epochs is to use the
@@ -693,7 +693,7 @@ dataset = dataset.repeat()
 The @{tf.train.MonitoredTrainingSession} API simplifies many aspects of running
 TensorFlow in a distributed setting. `MonitoredTrainingSession` uses the
 @{tf.errors.OutOfRangeError} to signal that training has completed, so to use it
-with the `Dataset` API, we recommend using
+with the `tf.data` API, we recommend using
 `Dataset.make_one_shot_iterator()`. For example:
 
 ```python
@@ -735,7 +735,7 @@ def dataset_input_fn():
     parsed = tf.parse_single_example(record, keys_to_features)
 
     # Perform additional preprocessing on the parsed data.
-    image = tf.decode_jpeg(parsed["image_data"])
+    image = tf.image.decode_jpeg(parsed["image_data"])
     image = tf.reshape(image, [299, 299, 1])
     label = tf.cast(parsed["label"], tf.int32)
 
diff --git a/tensorflow/docs_src/programmers_guide/debugger.md b/tensorflow/docs_src/programmers_guide/debugger.md
index 25cb72008d5a5418f46aa543871e97cee996ecb5..1a32882121efb1aa906bf6fb846194709d0f700e 100644
--- a/tensorflow/docs_src/programmers_guide/debugger.md
+++ b/tensorflow/docs_src/programmers_guide/debugger.md
@@ -392,7 +392,7 @@ diff = -(y_ * tf.log(y))
 to the built-in, numerically-stable implementation of softmax cross-entropy:
 
 ```python
-diff = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=logits)
+diff = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=logits)
 ```
 
 Rerun with the `--debug` flag as follows:
diff --git a/tensorflow/docs_src/programmers_guide/estimators.md b/tensorflow/docs_src/programmers_guide/estimators.md
index d465679817b030fe65f038750d1006d9749ad748..8b6cbbcd170efaa101af93e72c1ec24191e5759d 100644
--- a/tensorflow/docs_src/programmers_guide/estimators.md
+++ b/tensorflow/docs_src/programmers_guide/estimators.md
@@ -166,11 +166,29 @@ keras_inception_v3 = tf.keras.applications.inception_v3.InceptionV3(weights=None
 keras_inception_v3.compile(optimizer=tf.keras.optimizers.SGD(lr=0.0001, momentum=0.9),
                           loss='categorical_crossentropy',
                           metric='accuracy')
-# Create an Estimator from the compiled Keras model.
+# Create an Estimator from the compiled Keras model. Note the initial model
+# state of the keras model is preserved in the created Estimator.
 est_inception_v3 = tf.keras.estimator.model_to_estimator(keras_model=keras_inception_v3)
-# Treat the derived Estimator as you would any other Estimator. For example,
-# the following derived Estimator calls the train method:
-est_inception_v3.train(input_fn=my_training_set, steps=2000)
+
+# Treat the derived Estimator as you would with any other Estimator.
+# First, recover the input name(s) of Keras model, so we can use them as the
+# feature column name(s) of the Estimator input function:
+keras_inception_v3.input_names  # print out: ['input_1']
+# Once we have the input name(s), we can create the input function, for example,
+# for input(s) in the format of numpy ndarray:
+train_input_fn = tf.estimator.inputs.numpy_input_fn(
+    x={"input_1": train_data},
+    y=train_labels,
+    num_epochs=1,
+    shuffle=False)
+# To train, we call Estimator's train function:
+est_inception_v3.train(input_fn=train_input_fn, steps=2000)
 ```
+Note that the names of feature columns and labels of a keras estimator come from
+the corresponding compiled keras model. For example, the input key names for
+@{$get_started/input_fn} in above `est_inception_v3` estimator can be obtained
+from `keras_inception_v3.input_names`, and similarly, the predicted output
+names can be obtained from `keras_inception_v3.output_names`.
+
 For more details, please refer to the documentation for
 @{tf.keras.estimator.model_to_estimator}.
diff --git a/tensorflow/docs_src/programmers_guide/saved_model.md b/tensorflow/docs_src/programmers_guide/saved_model.md
index 8731cae0d75d1fdd06f9f0267af2ded4d43f7ed1..54693f3d4d356da93e6e31595d04ed58e173e061 100644
--- a/tensorflow/docs_src/programmers_guide/saved_model.md
+++ b/tensorflow/docs_src/programmers_guide/saved_model.md
@@ -33,7 +33,7 @@ roughly speaking, map variable names to tensor values.
 
 Create a `Saver` with `tf.train.Saver()` to manage all variables in the
 model. For example, the following snippet demonstrates how to call the
-`tf.train.Saver.save` method to save variables to a checkpoint file:
+`tf.train.Saver.save` method to save variables to checkpoint files:
 
 ```python
 # Create some variables.
@@ -58,7 +58,7 @@ with tf.Session() as sess:
   dec_v2.op.run()
   # Save the variables to disk.
   save_path = saver.save(sess, "/tmp/model.ckpt")
-  print("Model saved in file: %s" % save_path)
+  print("Model saved in path: %s" % save_path)
 ```
 
 
@@ -66,10 +66,10 @@ with tf.Session() as sess:
 ### Restoring variables
 
 The `tf.train.Saver` object not only saves variables to checkpoint files, it
-also restores variables.  Note that when you restore variables from a file you
-do not have to initialize them beforehand. For example, the following snippet
-demonstrates how to call the `tf.train.Saver.restore` method to restore
-variables from a checkpoint file:
+also restores variables. Note that when you restore variables you do not have
+to initialize them beforehand. For example, the following snippet demonstrates
+how to call the `tf.train.Saver.restore` method to restore variables from the
+checkpoint files:
 
 ```python
 tf.reset_default_graph()
@@ -92,6 +92,12 @@ with tf.Session() as sess:
   print("v2 : %s" % v2.eval())
 ```
 
+Notes:
+
+*  There is not a physical file called "/tmp/model.ckpt". It is the **prefix**
+   of filenames created for the checkpoint. Users only interact with the
+   prefix instead of physical checkpoint files.
+
 
 ### Choosing which variables to save and restore
 
@@ -160,7 +166,7 @@ Notes:
 
 ### Inspect variables in a checkpoint
 
-We can quickly inspect variables in a checkpoint with the 
+We can quickly inspect variables in a checkpoint with the
 [`inspect_checkpoint`](https://www.tensorflow.org/code/tensorflow/python/tools/inspect_checkpoint.py) library.
 
 Continuing from the save/restore examples shown earlier:
diff --git a/tensorflow/docs_src/programmers_guide/tensors.md b/tensorflow/docs_src/programmers_guide/tensors.md
index 88eb277e3514f73494107046698f74a0adde9fe0..58a80d533927e4f0d1458f87406914c1efa00605 100644
--- a/tensorflow/docs_src/programmers_guide/tensors.md
+++ b/tensorflow/docs_src/programmers_guide/tensors.md
@@ -43,8 +43,8 @@ generating a random number.
 
 The **rank** of a `tf.Tensor` object is its number of dimensions. Synonyms for
 rank include **order** or **degree** or **n-dimension**.
-Note that rank in TensorFlow is not the same as matrix rank in mathematics. 
-As the following table shows, each rank in TensorFlow corresponds to a 
+Note that rank in TensorFlow is not the same as matrix rank in mathematics.
+As the following table shows, each rank in TensorFlow corresponds to a
 different mathematical entity:
 
 Rank | Math entity
@@ -56,7 +56,7 @@ Rank | Math entity
 n | n-Tensor (you get the idea)
 
 
-### Rank 0 
+### Rank 0
 
 The following snippet demonstrates creating a few rank 0 variables:
 
@@ -108,12 +108,12 @@ my_image = tf.zeros([10, 299, 299, 3])  # batch x height x width x color
 ### Getting a `tf.Tensor` object's rank
 
 To determine the rank of a `tf.Tensor` object, call the `tf.rank` method.
-For example, the following method programmatically determines the rank 
+For example, the following method programmatically determines the rank
 of the `tf.Tensor` defined in the previous section:
 
 ```python
-r = tf.rank(my3d)
-# After the graph runs, r will hold the value 3.
+r = tf.rank(my_image)
+# After the graph runs, r will hold the value 4.
 ```
 
 ### Referring to `tf.Tensor` slices
diff --git a/tensorflow/docs_src/programmers_guide/variables.md b/tensorflow/docs_src/programmers_guide/variables.md
index bda39cc28e9a9e8805a9d502ea559a1e9f03244a..64250738056043e236b5eb236bcbf29375655260 100644
--- a/tensorflow/docs_src/programmers_guide/variables.md
+++ b/tensorflow/docs_src/programmers_guide/variables.md
@@ -37,7 +37,7 @@ You may optionally specify the `dtype` and initializer to `tf.get_variable`. For
 example:
 
 ``` python
-my_int_variable = tf.get_variable("my_int_variable", [1, 2, 3], dtype=tf.int32, 
+my_int_variable = tf.get_variable("my_int_variable", [1, 2, 3], dtype=tf.int32,
   initializer=tf.zeros_initializer)
 ```
 
@@ -45,7 +45,7 @@ TensorFlow provides many convenient initializers. Alternatively, you may
 initialize a `tf.Variable` to have the value of a `tf.Tensor`. For example:
 
 ``` python
-other_variable = tf.get_variable("other_variable", dtype=tf.int32, 
+other_variable = tf.get_variable("other_variable", dtype=tf.int32,
   initializer=tf.constant([23, 42]))
 ```
 
@@ -66,13 +66,13 @@ By default every `tf.Variable` gets placed in the following two collections:
 multiple devices,
  * `tf.GraphKeys.TRAINABLE_VARIABLES`--- variables for which TensorFlow will
    calculate gradients.
- 
+
 If you don't want a variable to be trainable, add it to the
 `tf.GraphKeys.LOCAL_VARIABLES` collection instead. For example, the following
 snippet demonstrates how to add a variable named `my_local` to this collection:
 
 ``` python
-my_local = tf.get_variable("my_local", shape=(), 
+my_local = tf.get_variable("my_local", shape=(),
 collections=[tf.GraphKeys.LOCAL_VARIABLES])
 ```
 
@@ -80,8 +80,8 @@ Alternatively, you can specify `trainable=False` as an argument to
 `tf.get_variable`:
 
 ``` python
-my_non_trainable = tf.get_variable("my_non_trainable", 
-                                   shape=(), 
+my_non_trainable = tf.get_variable("my_non_trainable",
+                                   shape=(),
                                    trainable=False)
 ```
 
@@ -126,7 +126,7 @@ cluster_spec = {
     "ps": ["ps0:2222", "ps1:2222"],
     "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]}
 with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
-  v = tf.get_variable("v", shape=[20, 20])  # this variable is placed 
+  v = tf.get_variable("v", shape=[20, 20])  # this variable is placed
                                             # in the parameter server
                                             # by the replica_device_setter
 ```
@@ -142,7 +142,7 @@ high-level frameworks such as `tf.contrib.slim`, `tf.estimator.Estimator` and
 Explicit initialization is otherwise useful because it allows you not to rerun
 potentially expensive initializers when reloading a model from a checkpoint as
 well as allowing determinism when randomly-initialized variables are shared in a
-distributed setting. 
+distributed setting.
 
 To initialize all trainable variables in one go, before training starts, call
 `tf.global_variables_initializer()`. This function returns a single operation
@@ -205,7 +205,7 @@ methods:
 v = tf.get_variable("v", shape=(), initializer=tf.zeros_initializer())
 assignment = v.assign_add(1)
 tf.global_variables_initializer().run()
-assignment.eval()
+sess.run(assignment)  # or assignment.op.run(), or assignment.eval()
 ```
 
 Most TensorFlow optimizers have specialized ops that efficiently update the
diff --git a/tensorflow/docs_src/tutorials/deep_cnn.md b/tensorflow/docs_src/tutorials/deep_cnn.md
index 6f802fd106d0e7cc8b2049af2548c51803b43195..679754020470dddfcffa76e62ca8f55a439ec4f5 100644
--- a/tensorflow/docs_src/tutorials/deep_cnn.md
+++ b/tensorflow/docs_src/tutorials/deep_cnn.md
@@ -195,9 +195,8 @@ The usual method for training a network to perform N-way classification is
 aka. *softmax regression*. Softmax regression applies a
 @{tf.nn.softmax$softmax} nonlinearity to the
 output of the network and calculates the
-@{tf.nn.softmax_cross_entropy_with_logits$cross-entropy}
-between the normalized predictions and a
-@{tf.sparse_to_dense$1-hot encoding} of the label.
+@{tf.nn.sparse_softmax_cross_entropy_with_logits$cross-entropy}
+between the normalized predictions and the label index.
 For regularization, we also apply the usual
 @{tf.nn.l2_loss$weight decay} losses to all learned
 variables.  The objective function for the model is the sum of the cross entropy
diff --git a/tensorflow/docs_src/tutorials/image_recognition.md b/tensorflow/docs_src/tutorials/image_recognition.md
index ddb771700a03d0d4f60ff3d26afbef9d861b5691..32257f87d6662f44536f45510b6a7c82628de2ff 100644
--- a/tensorflow/docs_src/tutorials/image_recognition.md
+++ b/tensorflow/docs_src/tutorials/image_recognition.md
@@ -5,7 +5,7 @@ tell apart a lion and a jaguar, read a sign, or recognize a human's face.
 But these are actually hard problems to solve with a computer: they only
 seem easy because our brains are incredibly good at understanding images.
 
-In the last few years the field of machine learning has made tremendous
+In the last few years, the field of machine learning has made tremendous
 progress on addressing these difficult problems. In particular, we've
 found that a kind of model called a deep
 [convolutional neural network](https://colah.github.io/posts/2014-07-Conv-Nets-Modular/)
@@ -42,7 +42,7 @@ For example, here are the results from [AlexNet] classifying some images:
 To compare models, we examine how often the model fails to predict the
 correct answer as one of their top 5 guesses -- termed "top-5 error rate".
 [AlexNet] achieved by setting a top-5 error rate of 15.3% on the 2012
-validation data set; [Inception (GoogLeNet)] achieved 6.67%; 
+validation data set; [Inception (GoogLeNet)] achieved 6.67%;
 [BN-Inception-v2] achieved 4.9%; [Inception-v3] reaches 3.46%.
 
 > How well do humans do on ImageNet Challenge? There's a [blog post] by
diff --git a/tensorflow/docs_src/tutorials/image_retraining.md b/tensorflow/docs_src/tutorials/image_retraining.md
index ad565e6d8be5e1e1c0efe5993608a4c1083e562b..52e6980e0070cdc6d03275c891283c25df4b31a1 100644
--- a/tensorflow/docs_src/tutorials/image_retraining.md
+++ b/tensorflow/docs_src/tutorials/image_retraining.md
@@ -44,8 +44,14 @@ following command (these examples are not included in the installation):
 
 ```sh
 git clone https://github.com/tensorflow/tensorflow
+```
+
+Then checkout the version of the tensorflow repository matching your
+installation and this tutorial as follows:
 
+``` sh
 cd tensorflow
+git checkout {version}
 ```
 
 In the simplest cases the retrainer can then be run like this:
diff --git a/tensorflow/docs_src/tutorials/index.md b/tensorflow/docs_src/tutorials/index.md
index a34dbd69569be9cd234e98009ed148080fbbdb70..15f8b54a295c8ebfb0076f58f2d39b7bcc344e90 100644
--- a/tensorflow/docs_src/tutorials/index.md
+++ b/tensorflow/docs_src/tutorials/index.md
@@ -46,6 +46,10 @@ The following tutorials focus on linear models:
   * @{$audio_recognition$Simple Audio Recognition}, which shows how to
     build a basic speech recognition network.
 
+The following tutorial covers building a classification model for sequences:
+
+  * @{$recurrent_quickdraw$Classifying Drawings using Recurrent Neural Networks}
+
 Although TensorFlow specializes in machine learning, you may also use
 TensorFlow to solve other kinds of math problems.  For example:
 
diff --git a/tensorflow/docs_src/tutorials/layers.md b/tensorflow/docs_src/tutorials/layers.md
index e808a3677f2a3e89597ef82cc86dd3646775d693..7c2029c4428b298ffb393d12af704a96b368f723 100644
--- a/tensorflow/docs_src/tutorials/layers.md
+++ b/tensorflow/docs_src/tutorials/layers.md
@@ -169,9 +169,7 @@ def cnn_model_fn(features, labels, mode):
     return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
 
   # Calculate Loss (for both TRAIN and EVAL modes)
-  onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10)
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
   # Configure the Training Op (for TRAIN mode)
   if mode == tf.estimator.ModeKeys.TRAIN:
diff --git a/tensorflow/docs_src/tutorials/leftnav_files b/tensorflow/docs_src/tutorials/leftnav_files
index 5a5d6ca558867e1c8f3dca221a98ca7c0a7ee986..e612961ae05b6d8542cf0cd6d2064a7f972dc7cd 100644
--- a/tensorflow/docs_src/tutorials/leftnav_files
+++ b/tensorflow/docs_src/tutorials/leftnav_files
@@ -6,6 +6,7 @@ layers.md
 deep_cnn.md
 word2vec.md
 recurrent.md
+recurrent_quickdraw.md
 seq2seq.md
 linear.md
 wide.md
diff --git a/tensorflow/docs_src/tutorials/linear.md b/tensorflow/docs_src/tutorials/linear.md
index d333d01279067de47819410795505f731e14fed3..dddb0341076836f914675b38a45b75ec97bef28e 100644
--- a/tensorflow/docs_src/tutorials/linear.md
+++ b/tensorflow/docs_src/tutorials/linear.md
@@ -1,36 +1,40 @@
 # Large-scale Linear Models with TensorFlow
 
-The tf.estimator API provides (among other things) a rich set of tools for
+@{tf.estimator$Estimators} provides (among other things) a rich set of tools for
 working with linear models in TensorFlow. This document provides an overview of
 those tools. It explains:
 
-   * what a linear model is.
-   * why you might want to use a linear model.
-   * how tf.estimator makes it easy to build linear models in TensorFlow.
-   * how you can use tf.estimator to combine linear models with
-   deep learning to get the advantages of both.
+   * What a linear model is.
+   * Why you might want to use a linear model.
+   * How Estimators make it easy to build linear models in TensorFlow.
+   * How you can use Estimators to combine linear models with.
+     deep learning to get the advantages of both.
 
-Read this overview to decide whether the tf.estimator linear model tools might
+Read this overview to decide whether the Estimator's linear model tools  might
 be useful to you. Then do the @{$wide$Linear Models tutorial} to
 give it a try. This overview uses code samples from the tutorial, but the
 tutorial walks through the code in greater detail.
 
 To understand this overview it will help to have some familiarity
-with basic machine learning concepts, and also with @{$get_started/estimator$`tf.estimator`}.
+with basic machine learning concepts, and also with
+@{$get_started/estimator$Estimators}.
 
 [TOC]
 
 ## What is a linear model?
 
-A *linear model* uses a single weighted sum of features to make a prediction.
-For example, if you have [data](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names)
+A **linear model** uses a single weighted sum of features to make a prediction.
+For example, if you have
+[data](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names)
 on age, years of education, and weekly hours of
-work for a population, you can learn weights for each of those numbers so that
+work for a population, a model can learn weights for each of those numbers so that
 their weighted sum estimates a person's salary. You can also use linear models
 for classification.
 
 Some linear models transform the weighted sum into a more convenient form. For
-example, *logistic regression* plugs the weighted sum into the logistic
+example, 
+[**logistic regression**](https://developers.google.com/machine-learning/glossary/#logistic_regression)
+plugs the weighted sum into the logistic
 function to turn the output into a value between 0 and 1. But you still just
 have one weight for each input feature.
 
@@ -51,10 +55,10 @@ Linear models:
    * provide an excellent starting point for learning about machine learning.
    * are widely used in industry.
 
-## How does tf.estimator help you build linear models?
+## How do Estimators help you build linear models?
 
 You can build a linear model from scratch in TensorFlow without the help of a
-special API. But tf.estimator provides some tools that make it easier to build
+special API. But Estimators provides some tools that make it easier to build
 effective large-scale linear models.
 
 ### Feature columns and transformations
@@ -86,10 +90,10 @@ become [0, 1, 0] and 'green' would become [0, 0, 1]. These vectors are called
 "sparse" because they may be very long, with many zeros, when the set of
 possible values is very large (such as all English words).
 
-While you don't need to use categorical columns to use tf.estimator linear
-models, one of the strengths of linear models is their ability to deal with
-large sparse vectors. Sparse features are a primary use case for the
-tf.estimator linear model tools.
+While you don't need to use categorical columns to use the linear model tools
+provided by Estimators, one of the strengths of linear models is their ability
+to deal with large sparse vectors. Sparse features are a primary use case for
+the linear model tools provided by Estimators.
 
 ##### Encoding sparse columns
 
@@ -173,7 +177,7 @@ the data itself. You provide the data through an input function.
 The input function must return a dictionary of tensors. Each key corresponds to
 the name of a `FeatureColumn`. Each key's value is a tensor containing the
 values of that feature for all data instances. See
-@{$input_fn$Building Input Functions with tf.estimator} for a
+@{$input_fn$Building Input Functions} for a
 more comprehensive look at input functions, and `input_fn` in the
 [linear models tutorial code](https://github.com/tensorflow/models/tree/master/official/wide_deep/wide_deep.py)
 for an example implementation of an input function.
@@ -220,7 +224,7 @@ for key in sorted(results):
 
 ### Wide and deep learning
 
-The tf.estimator API also provides an estimator class that lets you jointly
+The `tf.estimator` module also provides an estimator class that lets you jointly
 train a linear model and a deep neural network. This novel approach combines the
 ability of linear models to "memorize" key features with the generalization
 ability of neural nets. Use `tf.estimator.DNNLinearCombinedClassifier` to
diff --git a/tensorflow/docs_src/tutorials/recurrent_quickdraw.md b/tensorflow/docs_src/tutorials/recurrent_quickdraw.md
new file mode 100644
index 0000000000000000000000000000000000000000..7306b4bf568397470ff3e52a7aa83e75208b1af9
--- /dev/null
+++ b/tensorflow/docs_src/tutorials/recurrent_quickdraw.md
@@ -0,0 +1,410 @@
+# Recurrent Neural Networks for Drawing Classification
+
+[Quick, Draw!]: http://quickdraw.withgoogle.com
+
+[Quick, Draw!] is a game where a player is challenged to draw a number of
+objects and see if a computer can recognize the drawing.
+
+The recognition in [Quick, Draw!] is performed by a classifier that takes the
+user input, given as a sequence of strokes of points in x and y, and recognizes
+the object category that the user tried to draw.
+
+In this tutorial we'll show how to build an RNN-based recognizer for this
+problem. The model will use a combination of convolutional layers, LSTM layers,
+and a softmax output layer to classify the drawings:
+
+<center> ![RNN model structure](../images/quickdraw_model.png) </center>
+
+The figure above shows the structure of the model that we will build in this
+tutorial. The input is a drawing that is encoded as a sequence of strokes of
+points in x, y, and n, where n indicates whether a the point is the first point
+in a new stroke.
+
+Then, a series of 1-dimensional convolutions is applied. Then LSTM layers are
+applied and the sum of the outputs of all LSTM steps is fed into a softmax layer
+to make a classification decision among the classes of drawings that we know.
+
+This tutorial uses the data from actual [Quick, Draw!] games [that is publicly
+available](https://quickdraw.withgoogle.com/data). This dataset contains of 50M
+drawings in 345 categories.
+
+## Run the tutorial code
+
+To try the code for this tutorial:
+
+1.  @{$install$Install TensorFlow} if you haven't already.
+1.  Download the [tutorial code]
+(https://github.com/tensorflow/models/tree/master/tutorials/rnn/quickdraw/train_model.py).
+1.  [Download the data](#download-the-data) in `TFRecord` format from
+    [here](http://download.tensorflow.org/data/quickdraw_tutorial_dataset_v1.tar.gz) and unzip it. More details about [how to
+    obtain the original Quick, Draw!
+    data](#optional-download-the-full-quick-draw-data) and [how to convert that
+    to `TFRecord` files](#optional-converting-the-data) is available below.
+
+1.  Execute the tutorial code with the following command to train the RNN-based
+    model described in this tutorial. Make sure to adjust the paths to point to
+    the unzipped data from the download in step 3.
+
+```shell
+  python train_model.py \
+    --training_data=rnn_tutorial_data/training.tfrecord-?????-of-????? \
+    --eval_data=rnn_tutorial_data/eval.tfrecord-?????-of-????? \
+    --classes_file=rnn_tutorial_data/training.tfrecord.classes
+```
+
+## Tutorial details
+
+### Download the data
+
+We make the data that we use in this tutorial available as `TFRecord` files
+containing `TFExamples`. You can download the data from here:
+
+http://download.tensorflow.org/data/quickdraw_tutorial_dataset_v1.tar.gz
+
+Alternatively you can download the original data in `ndjson` format from the
+Google cloud and convert it to the `TFRecord` files containing `TFExamples`
+yourself as described in the next section.
+
+### Optional: Download the full Quick Draw Data
+
+The full [Quick, Draw!](https://quickdraw.withgoogle.com)
+[dataset](https://quickdraw.withgoogle.com/data) is available on Google Cloud
+Storage as [ndjson](http://ndjson.org/) files separated by category. You can
+[browse the list of files in Cloud
+Console](https://console.cloud.google.com/storage/quickdraw_dataset).
+
+To download the data we recommend using
+[gsutil](https://cloud.google.com/storage/docs/gsutil_install#install) to
+download the entire dataset. Note that the original .ndjson files require
+downloading ~22GB.
+
+Then use the following command to check that your gsutil installation works and
+that you can access the data bucket:
+
+```shell
+gsutil ls -r "gs://quickdraw_dataset/full/simplified/*"
+```
+
+which will output a long list of files like the following:
+
+```shell
+gs://quickdraw_dataset/full/simplified/The Eiffel Tower.ndjson
+gs://quickdraw_dataset/full/simplified/The Great Wall of China.ndjson
+gs://quickdraw_dataset/full/simplified/The Mona Lisa.ndjson
+gs://quickdraw_dataset/full/simplified/aircraft carrier.ndjson
+...
+```
+
+Then create a folder and download the dataset there.
+
+```shell
+mkdir rnn_tutorial_data
+cd rnn_tutorial_data
+gsutil -m cp "gs://quickdraw_dataset/full/simplified/*" .
+```
+
+This download will take a while and download a bit more than 23GB of data.
+
+### Optional: Converting the data
+
+To convert the `ndjson` files to
+@{$python/python_io#tfrecords_format_details$TFRecord} files containing
+${tf.train.Example} protos run the following command.
+
+```shell
+   python create_dataset.py --ndjson_path rnn_tutorial_data \
+      --output_path rnn_tutorial_data
+```
+
+This will store the data in 10 shards of
+@{$python/python_io#tfrecords_format_details$TFRecord} files with 10000 items
+per class for the training data and 1000 items per class as eval data.
+
+This conversion process is described in more detail in the following.
+
+The original QuickDraw data is formatted as `ndjson` files where each line
+contains a JSON object like the following:
+
+```json
+{"word":"cat",
+ "countrycode":"VE",
+ "timestamp":"2017-03-02 23:25:10.07453 UTC",
+ "recognized":true,
+ "key_id":"5201136883597312",
+ "drawing":[
+   [
+     [130,113,99,109,76,64,55,48,48,51,59,86,133,154,170,203,214,217,215,208,186,176,162,157,132],
+     [72,40,27,79,82,88,100,120,134,152,165,184,189,186,179,152,131,114,100,89,76,0,31,65,70]
+   ],[
+     [76,28,7],
+     [136,128,128]
+   ],[
+     [76,23,0],
+     [160,164,175]
+   ],[
+     [87,52,37],
+     [175,191,204]
+   ],[
+     [174,220,246,251],
+     [134,132,136,139]
+   ],[
+     [175,255],
+     [147,168]
+   ],[
+     [171,208,215],
+     [164,198,210]
+   ],[
+     [130,110,108,111,130,139,139,119],
+     [129,134,137,144,148,144,136,130]
+   ],[
+     [107,106],
+     [96,113]
+   ]
+ ]
+}
+```
+
+For our purpose of building a classifier we only care about the fields "`word`"
+and "`drawing`". While parsing the ndjson files, we process them line by line
+using a function that converts the strokes from the `drawing` field into a
+tensor of size `[number of points, 3]` containing the differences of consecutive
+points. This function also returns the class name as a string.
+
+```python
+def parse_line(ndjson_line):
+  """Parse an ndjson line and return ink (as np array) and classname."""
+  sample = json.loads(ndjson_line)
+  class_name = sample["word"]
+  inkarray = sample["drawing"]
+  stroke_lengths = [len(stroke[0]) for stroke in inkarray]
+  total_points = sum(stroke_lengths)
+  np_ink = np.zeros((total_points, 3), dtype=np.float32)
+  current_t = 0
+  for stroke in inkarray:
+    for i in [0, 1]:
+      np_ink[current_t:(current_t + len(stroke[0])), i] = stroke[i]
+    current_t += len(stroke[0])
+    np_ink[current_t - 1, 2] = 1  # stroke_end
+  # Preprocessing.
+  # 1. Size normalization.
+  lower = np.min(np_ink[:, 0:2], axis=0)
+  upper = np.max(np_ink[:, 0:2], axis=0)
+  scale = upper - lower
+  scale[scale == 0] = 1
+  np_ink[:, 0:2] = (np_ink[:, 0:2] - lower) / scale
+  # 2. Compute deltas.
+  np_ink = np_ink[1:, 0:2] - np_ink[0:-1, 0:2]
+  return np_ink, class_name
+```
+
+Since we want the data to be shuffled for writing we read from each of the
+category files in random order and write to a random shard.
+
+For the training data we read the first 10000 items for each class and for the
+eval data we read the next 1000 items for each class.
+
+This data is then reformatted into a tensor of shape `[num_training_samples,
+max_length, 3]`. Then we determine the bounding box of the original drawing in
+screen coordinates and normalize the size such that the drawing has unit height.
+
+<center> ![Size normalization](../images/quickdraw_sizenormalization.png) </center>
+
+Finally, we compute the differences between consecutive points and store these
+as a `VarLenFeature` in a
+[tensorflow.Example](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
+under the key `ink`. In addition we store the `class_index` as a single entry
+`FixedLengthFeature` and the `shape` of the `ink` as a `FixedLengthFeature` of
+length 2.
+
+### Defining the model
+
+To define the model we create a new `Estimator`. If you want to read more about
+estimators, we recommend @{$extend/estimators$this tutorial}.
+
+To build the model, we:
+
+1.  reshape the input back into the original shape - where the mini batch is
+    padded to the maximal length of its contents. In addition to the ink data we
+    also have the lengths for each example and the target class. This happens in
+    the function [`_get_input_tensors`](#-get-input-tensors).
+
+1.  pass the input through to a series of convolution layers in
+    [`_add_conv_layers`](#-add-conv-layers).
+
+1.  pass the output of the convolutions into a series of bidirectional LSTM
+    layers in [`_add_rnn_layers`](#-add-rnn-layers). At the end of that, the
+    outputs for each time step are summed up to have a compact, fixed length
+    embedding of the input.
+
+1.  classify this embedding using a softmax layer in
+    [`_add_fc_layers`](#-add-fc-layers).
+
+In code this looks like:
+
+```python
+inks, lengths, targets = _get_input_tensors(features, targets)
+convolved = _add_conv_layers(inks)
+final_state = _add_rnn_layers(convolved, lengths)
+logits =_add_fc_layers(final_state)
+```
+
+### _get_input_tensors
+
+To obtain the input features we first obtain the shape from the features dict
+and then create a 1D tensor of size `[batch_size]` containing the lengths of the
+input sequences. The ink is stored as a SparseTensor in the features dict which
+we convert into a dense tensor and then reshape to be `[batch_size, ?, 3]`. And
+finally, if targets were passed in we make sure they are stored as a 1D tensor
+of size `[batch_size]`
+
+In code this looks like this:
+
+```python
+shapes = features["shape"]
+lengths = tf.squeeze(
+    tf.slice(shapes, begin=[0, 0], size=[params["batch_size"], 1]))
+inks = tf.reshape(
+    tf.sparse_tensor_to_dense(features["ink"]),
+    [params["batch_size"], -1, 3])
+if targets is not None:
+  targets = tf.squeeze(targets)
+```
+
+### _add_conv_layers
+
+The desired number of convolution layers and the lengths of the filters is
+configured through the parameters `num_conv` and `conv_len` in the `params`
+dict.
+
+The input is a sequence where each point has dimensionality 3. We are going to
+use 1D convolutions where we treat the 3 input features as channels. That means
+that the input is a `[batch_size, length, 3]` tensor and the output will be a
+`[batch_size, length, number_of_filters]` tensor.
+
+```python
+convolved = inks
+for i in range(len(params.num_conv)):
+  convolved_input = convolved
+  if params.batch_norm:
+    convolved_input = tf.layers.batch_normalization(
+        convolved_input,
+        training=(mode == tf.estimator.ModeKeys.TRAIN))
+  # Add dropout layer if enabled and not first convolution layer.
+  if i > 0 and params.dropout:
+    convolved_input = tf.layers.dropout(
+        convolved_input,
+        rate=params.dropout,
+        training=(mode == tf.estimator.ModeKeys.TRAIN))
+  convolved = tf.layers.conv1d(
+      convolved_input,
+      filters=params.num_conv[i],
+      kernel_size=params.conv_len[i],
+      activation=None,
+      strides=1,
+      padding="same",
+      name="conv1d_%d" % i)
+return convolved, lengths
+```
+
+### _add_rnn_layers
+
+We pass the output from the convolutions into bidirectional LSTM layers for
+which we use a helper function from contrib.
+
+```python
+outputs, _, _ = contrib_rnn.stack_bidirectional_dynamic_rnn(
+    cells_fw=[cell(params.num_nodes) for _ in range(params.num_layers)],
+    cells_bw=[cell(params.num_nodes) for _ in range(params.num_layers)],
+    inputs=convolved,
+    sequence_length=lengths,
+    dtype=tf.float32,
+    scope="rnn_classification")
+```
+
+see the code for more details and how to use `CUDA` accelerated implementations.
+
+To create a compact, fixed-length embedding, we sum up the output of the LSTMs.
+We first zero out the regions of the batch where the sequences have no data.
+
+```python
+mask = tf.tile(
+    tf.expand_dims(tf.sequence_mask(lengths, tf.shape(outputs)[1]), 2),
+    [1, 1, tf.shape(outputs)[2]])
+zero_outside = tf.where(mask, outputs, tf.zeros_like(outputs))
+outputs = tf.reduce_sum(zero_outside, axis=1)
+```
+
+### _add_fc_layers
+
+The embedding of the input is passed into a fully connected layer which we then
+use as a softmax layer.
+
+```python
+tf.layers.dense(final_state, params.num_classes)
+```
+
+### Loss, predictions, and optimizer
+
+Finally, we need to add a loss, a training op, and predictions to create the
+`ModelFn`:
+
+```python
+cross_entropy = tf.reduce_mean(
+    tf.nn.sparse_softmax_cross_entropy_with_logits(
+        labels=targets, logits=logits))
+# Add the optimizer.
+train_op = tf.contrib.layers.optimize_loss(
+    loss=cross_entropy,
+    global_step=tf.train.get_global_step(),
+    learning_rate=params.learning_rate,
+    optimizer="Adam",
+    # some gradient clipping stabilizes training in the beginning.
+    clip_gradients=params.gradient_clipping_norm,
+    summaries=["learning_rate", "loss", "gradients", "gradient_norm"])
+predictions = tf.argmax(logits, axis=1)
+return model_fn_lib.ModelFnOps(
+    mode=mode,
+    predictions={"logits": logits,
+                 "predictions": predictions},
+    loss=cross_entropy,
+    train_op=train_op,
+    eval_metric_ops={"accuracy": tf.metrics.accuracy(targets, predictions)})
+```
+
+### Training and evaluating the model
+
+To train and evaluate the model we can rely on the functionalities of the
+`Estimator` APIs and easily run training and evaluation with the `Experiment`
+APIs:
+
+```python
+  estimator = tf.estimator.Estimator(
+      model_fn=model_fn,
+      model_dir=output_dir,
+      config=config,
+      params=model_params)
+  # Train the model.
+  tf.contrib.learn.Experiment(
+      estimator=estimator,
+      train_input_fn=get_input_fn(
+          mode=tf.contrib.learn.ModeKeys.TRAIN,
+          tfrecord_pattern=FLAGS.training_data,
+          batch_size=FLAGS.batch_size),
+      train_steps=FLAGS.steps,
+      eval_input_fn=get_input_fn(
+          mode=tf.contrib.learn.ModeKeys.EVAL,
+          tfrecord_pattern=FLAGS.eval_data,
+          batch_size=FLAGS.batch_size),
+      min_eval_frequency=1000)
+```
+
+Note that this tutorial is just a quick example on a relatively small dataset to
+get you familiar with the APIs of recurrent neural networks and estimators. Such
+models can be even more powerful if you try them on a large dataset.
+
+When training the model for 1M steps you can expect to get an accuracy of
+approximately of approximately 70% on the top-1 candidate. Note that this
+accuracy is sufficient to build the quickdraw game because of the game dynamics
+the user will be able to adjust their drawing until it is ready. Also, the game
+does not use the top-1 candidate only but accepts a drawing as correct if the
+target category shows up with a score better than a fixed threshold.
diff --git a/tensorflow/examples/android/README.md b/tensorflow/examples/android/README.md
index 79202a38d7199033a9fefa8c6ba71e383aa0bf19..30a26d13c5734c5cf4a3b565c793db3e093c8271 100644
--- a/tensorflow/examples/android/README.md
+++ b/tensorflow/examples/android/README.md
@@ -126,6 +126,10 @@ the Android NDK and SDK must be installed on your system.
 2.  The Android NDK is required to build the native (C/C++) TensorFlow code. The
     current recommended version is 14b, which may be found
     [here](https://developer.android.com/ndk/downloads/older_releases.html#ndk-14b-downloads).
+
+      * NDK 16, the revision released in November 2017, is **incompatible** with
+        Bazel. See [here](https://github.com/tensorflow/tensorflow/issues/14918).
+
 3.  The Android SDK and build tools may be obtained
     [here](https://developer.android.com/tools/revisions/build-tools.html), or
     alternatively as part of [Android
@@ -133,8 +137,16 @@ the Android NDK and SDK must be installed on your system.
     23 is required to build the TF Android demo (though it will run on API >= 21
     devices).
 
+      - The Android Studio SDK Manager's NDK installer will install the latest
+        revision of the NDK, which is **incompatible** with Bazel. You'll need
+        to download an older version manually, as (2) suggests.
+
 ##### Edit WORKSPACE
 
+NOTE: As long as you have the SDK and NDK installed, the `./configure` script
+will create these rules for you. Answer "Yes" when the script asks to
+automatically configure the `./WORKSPACE`.
+
 The Android entries in
 [`<workspace_root>/WORKSPACE`](../../../WORKSPACE#L19-L36) must be uncommented
 with the paths filled in appropriately depending on where you installed the NDK
@@ -156,7 +168,7 @@ download-models.gradle.
 
 **Optional**: If you wish to place the models in your assets manually, remove
 all of the `model_files` entries from the `assets` list in `tensorflow_demo`
-found in the `[BUILD](BUILD)` file. Then download and extract the archives
+found in the [`BUILD`](BUILD#L92) file. Then download and extract the archives
 yourself to the `assets` directory in the source tree:
 
 ```bash
diff --git a/tensorflow/examples/android/build.gradle b/tensorflow/examples/android/build.gradle
index 48f566f825d2714fe5970531e3d9c9f0f7ca940e..f7bdf8b816a8191770bc1ad59b890041b8e39912 100644
--- a/tensorflow/examples/android/build.gradle
+++ b/tensorflow/examples/android/build.gradle
@@ -28,8 +28,8 @@ buildscript {
     }
 
     dependencies {
-        classpath 'com.android.tools.build:gradle:2.3.0'
-        classpath 'org.apache.httpcomponents:httpclient:4.5.2'
+        classpath 'com.android.tools.build:gradle:3.0.1'
+        classpath 'org.apache.httpcomponents:httpclient:4.5.4'
     }
 }
 
@@ -75,7 +75,7 @@ apply plugin: 'com.android.application'
 
 android {
     compileSdkVersion 23
-    buildToolsVersion "25.0.2"
+    buildToolsVersion '26.0.2'
 
     if (nativeBuildSystem == 'cmake') {
         defaultConfig {
diff --git a/tensorflow/examples/android/gradle/wrapper/gradle-wrapper.jar b/tensorflow/examples/android/gradle/wrapper/gradle-wrapper.jar
new file mode 100644
index 0000000000000000000000000000000000000000..13372aef5e24af05341d49695ee84e5f9b594659
Binary files /dev/null and b/tensorflow/examples/android/gradle/wrapper/gradle-wrapper.jar differ
diff --git a/tensorflow/examples/android/gradle/wrapper/gradle-wrapper.properties b/tensorflow/examples/android/gradle/wrapper/gradle-wrapper.properties
new file mode 100644
index 0000000000000000000000000000000000000000..bd9ee87db3742e9f8c62df2ec9a7852550d9bbc9
--- /dev/null
+++ b/tensorflow/examples/android/gradle/wrapper/gradle-wrapper.properties
@@ -0,0 +1,6 @@
+#Sat Nov 18 15:06:47 CET 2017
+distributionBase=GRADLE_USER_HOME
+distributionPath=wrapper/dists
+zipStoreBase=GRADLE_USER_HOME
+zipStorePath=wrapper/dists
+distributionUrl=https\://services.gradle.org/distributions/gradle-4.1-all.zip
diff --git a/tensorflow/examples/android/gradlew b/tensorflow/examples/android/gradlew
new file mode 100644
index 0000000000000000000000000000000000000000..9d82f78915133e1c35a6ea51252590fb38efac2f
--- /dev/null
+++ b/tensorflow/examples/android/gradlew
@@ -0,0 +1,160 @@
+#!/usr/bin/env bash
+
+##############################################################################
+##
+##  Gradle start up script for UN*X
+##
+##############################################################################
+
+# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+DEFAULT_JVM_OPTS=""
+
+APP_NAME="Gradle"
+APP_BASE_NAME=`basename "$0"`
+
+# Use the maximum available, or set MAX_FD != -1 to use that value.
+MAX_FD="maximum"
+
+warn ( ) {
+    echo "$*"
+}
+
+die ( ) {
+    echo
+    echo "$*"
+    echo
+    exit 1
+}
+
+# OS specific support (must be 'true' or 'false').
+cygwin=false
+msys=false
+darwin=false
+case "`uname`" in
+  CYGWIN* )
+    cygwin=true
+    ;;
+  Darwin* )
+    darwin=true
+    ;;
+  MINGW* )
+    msys=true
+    ;;
+esac
+
+# Attempt to set APP_HOME
+# Resolve links: $0 may be a link
+PRG="$0"
+# Need this for relative symlinks.
+while [ -h "$PRG" ] ; do
+    ls=`ls -ld "$PRG"`
+    link=`expr "$ls" : '.*-> \(.*\)$'`
+    if expr "$link" : '/.*' > /dev/null; then
+        PRG="$link"
+    else
+        PRG=`dirname "$PRG"`"/$link"
+    fi
+done
+SAVED="`pwd`"
+cd "`dirname \"$PRG\"`/" >/dev/null
+APP_HOME="`pwd -P`"
+cd "$SAVED" >/dev/null
+
+CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
+
+# Determine the Java command to use to start the JVM.
+if [ -n "$JAVA_HOME" ] ; then
+    if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
+        # IBM's JDK on AIX uses strange locations for the executables
+        JAVACMD="$JAVA_HOME/jre/sh/java"
+    else
+        JAVACMD="$JAVA_HOME/bin/java"
+    fi
+    if [ ! -x "$JAVACMD" ] ; then
+        die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+    fi
+else
+    JAVACMD="java"
+    which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+fi
+
+# Increase the maximum file descriptors if we can.
+if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
+    MAX_FD_LIMIT=`ulimit -H -n`
+    if [ $? -eq 0 ] ; then
+        if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
+            MAX_FD="$MAX_FD_LIMIT"
+        fi
+        ulimit -n $MAX_FD
+        if [ $? -ne 0 ] ; then
+            warn "Could not set maximum file descriptor limit: $MAX_FD"
+        fi
+    else
+        warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
+    fi
+fi
+
+# For Darwin, add options to specify how the application appears in the dock
+if $darwin; then
+    GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
+fi
+
+# For Cygwin, switch paths to Windows format before running java
+if $cygwin ; then
+    APP_HOME=`cygpath --path --mixed "$APP_HOME"`
+    CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
+    JAVACMD=`cygpath --unix "$JAVACMD"`
+
+    # We build the pattern for arguments to be converted via cygpath
+    ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
+    SEP=""
+    for dir in $ROOTDIRSRAW ; do
+        ROOTDIRS="$ROOTDIRS$SEP$dir"
+        SEP="|"
+    done
+    OURCYGPATTERN="(^($ROOTDIRS))"
+    # Add a user-defined pattern to the cygpath arguments
+    if [ "$GRADLE_CYGPATTERN" != "" ] ; then
+        OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
+    fi
+    # Now convert the arguments - kludge to limit ourselves to /bin/sh
+    i=0
+    for arg in "$@" ; do
+        CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
+        CHECK2=`echo "$arg"|egrep -c "^-"`                                 ### Determine if an option
+
+        if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then                    ### Added a condition
+            eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
+        else
+            eval `echo args$i`="\"$arg\""
+        fi
+        i=$((i+1))
+    done
+    case $i in
+        (0) set -- ;;
+        (1) set -- "$args0" ;;
+        (2) set -- "$args0" "$args1" ;;
+        (3) set -- "$args0" "$args1" "$args2" ;;
+        (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
+        (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
+        (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
+        (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
+        (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
+        (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
+    esac
+fi
+
+# Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
+function splitJvmOpts() {
+    JVM_OPTS=("$@")
+}
+eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
+JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
+
+exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
diff --git a/tensorflow/examples/android/gradlew.bat b/tensorflow/examples/android/gradlew.bat
new file mode 100644
index 0000000000000000000000000000000000000000..8a0b282aa6885fb573c106b3551f7275c5f17e8e
--- /dev/null
+++ b/tensorflow/examples/android/gradlew.bat
@@ -0,0 +1,90 @@
+@if "%DEBUG%" == "" @echo off
+@rem ##########################################################################
+@rem
+@rem  Gradle startup script for Windows
+@rem
+@rem ##########################################################################
+
+@rem Set local scope for the variables with windows NT shell
+if "%OS%"=="Windows_NT" setlocal
+
+@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+set DEFAULT_JVM_OPTS=
+
+set DIRNAME=%~dp0
+if "%DIRNAME%" == "" set DIRNAME=.
+set APP_BASE_NAME=%~n0
+set APP_HOME=%DIRNAME%
+
+@rem Find java.exe
+if defined JAVA_HOME goto findJavaFromJavaHome
+
+set JAVA_EXE=java.exe
+%JAVA_EXE% -version >NUL 2>&1
+if "%ERRORLEVEL%" == "0" goto init
+
+echo.
+echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:findJavaFromJavaHome
+set JAVA_HOME=%JAVA_HOME:"=%
+set JAVA_EXE=%JAVA_HOME%/bin/java.exe
+
+if exist "%JAVA_EXE%" goto init
+
+echo.
+echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:init
+@rem Get command-line arguments, handling Windowz variants
+
+if not "%OS%" == "Windows_NT" goto win9xME_args
+if "%@eval[2+2]" == "4" goto 4NT_args
+
+:win9xME_args
+@rem Slurp the command line arguments.
+set CMD_LINE_ARGS=
+set _SKIP=2
+
+:win9xME_args_slurp
+if "x%~1" == "x" goto execute
+
+set CMD_LINE_ARGS=%*
+goto execute
+
+:4NT_args
+@rem Get arguments from the 4NT Shell from JP Software
+set CMD_LINE_ARGS=%$
+
+:execute
+@rem Setup the command line
+
+set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
+
+@rem Execute Gradle
+"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
+
+:end
+@rem End local scope for the variables with windows NT shell
+if "%ERRORLEVEL%"=="0" goto mainEnd
+
+:fail
+rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
+rem the _cmd.exe /c_ return code!
+if  not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
+exit /b 1
+
+:mainEnd
+if "%OS%"=="Windows_NT" endlocal
+
+:omega
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java
index 4e45f42d0c97ed9dad9f9702adc3c1efe658699f..8bd4abb154a8f8c74f2195d4acbb99d3d5d498ea 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java
@@ -333,8 +333,12 @@ public abstract class CameraActivity extends Activity
           continue;
         }
 
-        useCamera2API = isHardwareLevelSupported(characteristics,
-            CameraCharacteristics.INFO_SUPPORTED_HARDWARE_LEVEL_FULL);
+        // Fallback to camera1 API for internal cameras that don't have full support.
+        // This should help with legacy situations where using the camera2 API causes
+        // distorted or otherwise broken previews.
+        useCamera2API = (facing == CameraCharacteristics.LENS_FACING_EXTERNAL)
+            || isHardwareLevelSupported(characteristics, 
+                                        CameraCharacteristics.INFO_SUPPORTED_HARDWARE_LEVEL_FULL);
         LOGGER.i("Camera API lv2?: %s", useCamera2API);
         return cameraId;
       }
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/Classifier.java b/tensorflow/examples/android/src/org/tensorflow/demo/Classifier.java
index eabc724f7fd93136c49c31adc4f096865ab1c8a5..07995febaf5caab65dd4dfcc262ccf3750cfa303 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/Classifier.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/Classifier.java
@@ -100,7 +100,7 @@ public interface Classifier {
   List<Recognition> recognizeImage(Bitmap bitmap);
 
   void enableStatLogging(final boolean debug);
-  
+
   String getStatString();
 
   void close();
diff --git a/tensorflow/examples/how_tos/reading_data/convert_to_records.py b/tensorflow/examples/how_tos/reading_data/convert_to_records.py
index a402eac053cb474db0fd90876501a9c13906ea82..c89e83956322cb87a4cf41c6b7172f03d941b429 100644
--- a/tensorflow/examples/how_tos/reading_data/convert_to_records.py
+++ b/tensorflow/examples/how_tos/reading_data/convert_to_records.py
@@ -55,12 +55,15 @@ def convert_to(data_set, name):
   with tf.python_io.TFRecordWriter(filename) as writer:
     for index in range(num_examples):
       image_raw = images[index].tostring()
-      example = tf.train.Example(features=tf.train.Features(feature={
-          'height': _int64_feature(rows),
-          'width': _int64_feature(cols),
-          'depth': _int64_feature(depth),
-          'label': _int64_feature(int(labels[index])),
-          'image_raw': _bytes_feature(image_raw)}))
+      example = tf.train.Example(
+          features=tf.train.Features(
+              feature={
+                  'height': _int64_feature(rows),
+                  'width': _int64_feature(cols),
+                  'depth': _int64_feature(depth),
+                  'label': _int64_feature(int(labels[index])),
+                  'image_raw': _bytes_feature(image_raw)
+              }))
       writer.write(example.SerializeToString())
 
 
diff --git a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
index a9ed02dd1a60ad79c2943212155bad864a750a99..fa4c1c0da5f31863aa4d99b6ec84e1e50e1a1551 100644
--- a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
+++ b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
@@ -45,9 +45,7 @@ TRAIN_FILE = 'train.tfrecords'
 VALIDATION_FILE = 'validation.tfrecords'
 
 
-def read_and_decode(filename_queue):
-  reader = tf.TFRecordReader()
-  _, serialized_example = reader.read(filename_queue)
+def decode(serialized_example):
   features = tf.parse_single_example(
       serialized_example,
       # Defaults are not specified since both keys are required.
@@ -60,22 +58,26 @@ def read_and_decode(filename_queue):
   # length mnist.IMAGE_PIXELS) to a uint8 tensor with shape
   # [mnist.IMAGE_PIXELS].
   image = tf.decode_raw(features['image_raw'], tf.uint8)
-  image.set_shape([mnist.IMAGE_PIXELS])
+  image.set_shape((mnist.IMAGE_PIXELS))
 
+  # Convert label from a scalar uint8 tensor to an int32 scalar.
+  label = tf.cast(features['label'], tf.int32)
+
+  return image, label
+
+def augment(image, label):
   # OPTIONAL: Could reshape into a 28x28 image and apply distortions
   # here.  Since we are not applying any distortions in this
   # example, and the next step expects the image to be flattened
   # into a vector, we don't bother.
+  return image, label
 
+def normalize(image, label):
   # Convert from [0, 255] -> [-0.5, 0.5] floats.
   image = tf.cast(image, tf.float32) * (1. / 255) - 0.5
 
-  # Convert label from a scalar uint8 tensor to an int32 scalar.
-  label = tf.cast(features['label'], tf.int32)
-
   return image, label
 
-
 def inputs(train, batch_size, num_epochs):
   """Reads input data num_epochs times.
 
@@ -91,31 +93,32 @@ def inputs(train, batch_size, num_epochs):
       in the range [-0.5, 0.5].
     * labels is an int32 tensor with shape [batch_size] with the true label,
       a number in the range [0, mnist.NUM_CLASSES).
-    Note that an tf.train.QueueRunner is added to the graph, which
-    must be run using e.g. tf.train.start_queue_runners().
+
+    This function creates a one_shot_iterator, meaning that it will only iterate
+    over the dataset once. On the other hand there is no special initialization
+    required.
   """
   if not num_epochs: num_epochs = None
   filename = os.path.join(FLAGS.train_dir,
                           TRAIN_FILE if train else VALIDATION_FILE)
 
   with tf.name_scope('input'):
-    filename_queue = tf.train.string_input_producer(
-        [filename], num_epochs=num_epochs)
+    # TFRecordDataset opens a protobuf and reads entries line by line
+    # could also be [list, of, filenames]
+    dataset = tf.data.TFRecordDataset(filename)
+    dataset = dataset.repeat(num_epochs)
 
-    # Even when reading in multiple threads, share the filename
-    # queue.
-    image, label = read_and_decode(filename_queue)
+    # map takes a python function and applies it to every sample
+    dataset = dataset.map(decode)
+    dataset = dataset.map(augment)
+    dataset = dataset.map(normalize)
 
-    # Shuffle the examples and collect them into batch_size batches.
-    # (Internally uses a RandomShuffleQueue.)
-    # We run this in two threads to avoid being a bottleneck.
-    images, sparse_labels = tf.train.shuffle_batch(
-        [image, label], batch_size=batch_size, num_threads=2,
-        capacity=1000 + 3 * batch_size,
-        # Ensures a minimum amount of shuffling of examples.
-        min_after_dequeue=1000)
+    #the parameter is the queue size
+    dataset = dataset.shuffle(1000 + 3 * batch_size)
+    dataset = dataset.batch(batch_size)
 
-    return images, sparse_labels
+    iterator = dataset.make_one_shot_iterator()
+  return iterator.get_next()
 
 
 def run_training():
@@ -124,16 +127,16 @@ def run_training():
   # Tell TensorFlow that the model will be built into the default Graph.
   with tf.Graph().as_default():
     # Input images and labels.
-    images, labels = inputs(train=True, batch_size=FLAGS.batch_size,
-                            num_epochs=FLAGS.num_epochs)
+    image_batch, label_batch = inputs(train=True, batch_size=FLAGS.batch_size,
+                               num_epochs=FLAGS.num_epochs)
 
     # Build a Graph that computes predictions from the inference model.
-    logits = mnist.inference(images,
+    logits = mnist.inference(image_batch,
                              FLAGS.hidden1,
                              FLAGS.hidden2)
 
     # Add to the Graph the loss calculation.
-    loss = mnist.loss(logits, labels)
+    loss = mnist.loss(logits, label_batch)
 
     # Add to the Graph operations that train the model.
     train_op = mnist.training(loss, FLAGS.learning_rate)
@@ -143,46 +146,32 @@ def run_training():
                        tf.local_variables_initializer())
 
     # Create a session for running operations in the Graph.
-    sess = tf.Session()
-
-    # Initialize the variables (the trained variables and the
-    # epoch counter).
-    sess.run(init_op)
-
-    # Start input enqueue threads.
-    coord = tf.train.Coordinator()
-    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
-
-    try:
-      step = 0
-      while not coord.should_stop():
-        start_time = time.time()
-
-        # Run one step of the model.  The return values are
-        # the activations from the `train_op` (which is
-        # discarded) and the `loss` op.  To inspect the values
-        # of your ops or variables, you may include them in
-        # the list passed to sess.run() and the value tensors
-        # will be returned in the tuple from the call.
-        _, loss_value = sess.run([train_op, loss])
-
-        duration = time.time() - start_time
-
-        # Print an overview fairly often.
-        if step % 100 == 0:
-          print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value,
+    with tf.Session() as sess:
+      # Initialize the variables (the trained variables and the
+      # epoch counter).
+      sess.run(init_op)
+      try:
+        step = 0
+        while True: #train until OutOfRangeError
+          start_time = time.time()
+
+          # Run one step of the model.  The return values are
+          # the activations from the `train_op` (which is
+          # discarded) and the `loss` op.  To inspect the values
+          # of your ops or variables, you may include them in
+          # the list passed to sess.run() and the value tensors
+          # will be returned in the tuple from the call.
+          _, loss_value = sess.run([train_op, loss])
+
+          duration = time.time() - start_time
+
+          # Print an overview fairly often.
+          if step % 100 == 0:
+            print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value,
                                                      duration))
-        step += 1
-    except tf.errors.OutOfRangeError:
-      print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
-    finally:
-      # When done, ask the threads to stop.
-      coord.request_stop()
-
-    # Wait for threads to finish.
-    coord.join(threads)
-    sess.close()
-
+          step += 1
+      except tf.errors.OutOfRangeError:
+        print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
 
 def main(_):
   run_training()
diff --git a/tensorflow/examples/image_retraining/retrain.py b/tensorflow/examples/image_retraining/retrain.py
index ebddfb20f4b60986fba1cdbfe3fcb184149b0a99..ec22684eaf63700c608c6ce45f22941555246b99 100644
--- a/tensorflow/examples/image_retraining/retrain.py
+++ b/tensorflow/examples/image_retraining/retrain.py
@@ -539,10 +539,8 @@ def get_random_cached_bottlenecks(sess, image_lists, how_many, category,
           sess, image_lists, label_name, image_index, image_dir, category,
           bottleneck_dir, jpeg_data_tensor, decoded_image_tensor,
           resized_input_tensor, bottleneck_tensor, architecture)
-      ground_truth = np.zeros(class_count, dtype=np.float32)
-      ground_truth[label_index] = 1.0
       bottlenecks.append(bottleneck)
-      ground_truths.append(ground_truth)
+      ground_truths.append(label_index)
       filenames.append(image_name)
   else:
     # Retrieve all bottlenecks.
@@ -555,10 +553,8 @@ def get_random_cached_bottlenecks(sess, image_lists, how_many, category,
             sess, image_lists, label_name, image_index, image_dir, category,
             bottleneck_dir, jpeg_data_tensor, decoded_image_tensor,
             resized_input_tensor, bottleneck_tensor, architecture)
-        ground_truth = np.zeros(class_count, dtype=np.float32)
-        ground_truth[label_index] = 1.0
         bottlenecks.append(bottleneck)
-        ground_truths.append(ground_truth)
+        ground_truths.append(label_index)
         filenames.append(image_name)
   return bottlenecks, ground_truths, filenames
 
@@ -610,10 +606,8 @@ def get_random_distorted_bottlenecks(
     bottleneck_values = sess.run(bottleneck_tensor,
                                  {resized_input_tensor: distorted_image_data})
     bottleneck_values = np.squeeze(bottleneck_values)
-    ground_truth = np.zeros(class_count, dtype=np.float32)
-    ground_truth[label_index] = 1.0
     bottlenecks.append(bottleneck_values)
-    ground_truths.append(ground_truth)
+    ground_truths.append(label_index)
   return bottlenecks, ground_truths
 
 
@@ -774,9 +768,8 @@ def add_final_training_ops(class_count, final_tensor_name, bottleneck_tensor,
         shape=[None, bottleneck_tensor_size],
         name='BottleneckInputPlaceholder')
 
-    ground_truth_input = tf.placeholder(tf.float32,
-                                        [None, class_count],
-                                        name='GroundTruthInput')
+    ground_truth_input = tf.placeholder(
+        tf.int64, [None], name='GroundTruthInput')
 
   # Organizing the following ops as `final_training_ops` so they're easier
   # to see in TensorBoard
@@ -823,10 +816,8 @@ def add_final_training_ops(class_count, final_tensor_name, bottleneck_tensor,
   tf.summary.histogram('activations', final_tensor)
 
   with tf.name_scope('cross_entropy'):
-    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
+    cross_entropy_mean = tf.losses.sparse_softmax_cross_entropy(
         labels=ground_truth_input, logits=logits)
-    with tf.name_scope('total'):
-      cross_entropy_mean = tf.reduce_mean(cross_entropy)
 
   tf.summary.scalar('cross_entropy', cross_entropy_mean)
 
@@ -852,8 +843,7 @@ def add_evaluation_step(result_tensor, ground_truth_tensor):
   with tf.name_scope('accuracy'):
     with tf.name_scope('correct_prediction'):
       prediction = tf.argmax(result_tensor, 1)
-      correct_prediction = tf.equal(
-          prediction, tf.argmax(ground_truth_tensor, 1))
+      correct_prediction = tf.equal(prediction, ground_truth_tensor)
     with tf.name_scope('accuracy'):
       evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
   tf.summary.scalar('accuracy', evaluation_step)
@@ -1178,7 +1168,7 @@ def main(_):
     if FLAGS.print_misclassified_test_images:
       tf.logging.info('=== MISCLASSIFIED TEST IMAGES ===')
       for i, test_filename in enumerate(test_filenames):
-        if predictions[i] != test_ground_truth[i].argmax():
+        if predictions[i] != test_ground_truth[i]:
           tf.logging.info('%70s  %s' %
                           (test_filename,
                            list(image_lists.keys())[predictions[i]]))
diff --git a/tensorflow/examples/image_retraining/retrain_test.py b/tensorflow/examples/image_retraining/retrain_test.py
index 2de4c4ec99f87544bfda9d0fe5977f60742d82a0..8b8dd45fd72e3d29bdb7f6291cc53b912adf3644 100644
--- a/tensorflow/examples/image_retraining/retrain_test.py
+++ b/tensorflow/examples/image_retraining/retrain_test.py
@@ -87,7 +87,7 @@ class ImageRetrainingTest(test_util.TensorFlowTestCase):
   def testAddEvaluationStep(self):
     with tf.Graph().as_default():
       final = tf.placeholder(tf.float32, [1], name='final')
-      gt = tf.placeholder(tf.float32, [1], name='gt')
+      gt = tf.placeholder(tf.int64, [1], name='gt')
       self.assertIsNotNone(retrain.add_evaluation_step(final, gt))
 
   def testAddJpegDecoding(self):
diff --git a/tensorflow/examples/ios/.gitignore b/tensorflow/examples/ios/.gitignore
index e572b3012c600ab856ac8e5bd71e4291b1ba7bcf..dbabfb33bf11e0436d8900ba9f2d1ba6195a9a47 100644
--- a/tensorflow/examples/ios/.gitignore
+++ b/tensorflow/examples/ios/.gitignore
@@ -2,3 +2,6 @@ project.xcworkspace
 xcuserdata
 imagenet_comp_graph_label_strings.txt
 tensorflow_inception_graph.pb
+simple/data/LICENSE
+camera/data/LICENSE
+benchmark/data/LICENSE
diff --git a/tensorflow/examples/ios/README.md b/tensorflow/examples/ios/README.md
index 7d2eb870be2c23bf52cd335f7b3b4cb7f4baac52..5bdaeb43ce143e36e78cfe301fd9b59e8b85b034 100644
--- a/tensorflow/examples/ios/README.md
+++ b/tensorflow/examples/ios/README.md
@@ -6,7 +6,7 @@ This folder contains examples of how to build applications for iOS devices using
  - You'll need Xcode 7.3 or later.
 
  - There are currently three examples: simple, benchmark, and camera. For now,
-   you can download the sample code by cloning the main tensorflow repository 
+   you can download the sample code by cloning the main tensorflow repository
    (we are planning to make the samples available as a separate repository
    later).
 
@@ -48,8 +48,8 @@ open tf_simple_example.xcworkspace # obs, not the .xcodeproj directory
 ### Troubleshooting
 
  - Make sure you use the TensorFlow-experimental pod (and not TensorFlow).
-  
- - The TensorFlow-experimental pod is current about ~450MB. The reason it is 
+
+ - The TensorFlow-experimental pod is current about ~450MB. The reason it is
    so big is because we are bundling multiple platforms, and the pod includes
    all TensorFlow functionality (e.g. operations). The final app size after
    build is substantially smaller though (~25MB). Working with the complete
@@ -91,7 +91,7 @@ target 'YourProjectName'
    open up the Xcode project in the `camera` subfolder. Once you build and run
    that, you should get a live camera view that you can point at objects to get
    real-time recognition results.
-   
+
 ### Troubleshooting
 
 If you're hitting problems, here's a checklist of common things to investigate:
diff --git a/tensorflow/examples/learn/iris_custom_decay_dnn.py b/tensorflow/examples/learn/iris_custom_decay_dnn.py
index 072357e51c418ae1163debe29516c31ccc367386..4a219694d10ef075e0e0403cdd7ed100c39ddadd 100644
--- a/tensorflow/examples/learn/iris_custom_decay_dnn.py
+++ b/tensorflow/examples/learn/iris_custom_decay_dnn.py
@@ -46,12 +46,8 @@ def my_model(features, labels, mode):
     }
     return tf.estimator.EstimatorSpec(mode, predictions=predictions)
 
-  # Convert the labels to a one-hot tensor of shape (length of features, 3) and
-  # with a on-value of 1 for each one-hot vector of length 3.
-  onehot_labels = tf.one_hot(labels, 3, 1, 0)
   # Compute loss.
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
   # Create training op with exponentially decaying learning rate.
   if mode == tf.estimator.ModeKeys.TRAIN:
diff --git a/tensorflow/examples/learn/iris_custom_model.py b/tensorflow/examples/learn/iris_custom_model.py
index 471a99ba76dd8012ba3b1a519d5d07fb378f89e7..c6bdb86ba52b9715b977909d9b7d0fbc59161a53 100644
--- a/tensorflow/examples/learn/iris_custom_model.py
+++ b/tensorflow/examples/learn/iris_custom_model.py
@@ -47,12 +47,8 @@ def my_model(features, labels, mode):
     }
     return tf.estimator.EstimatorSpec(mode, predictions=predictions)
 
-  # Convert the labels to a one-hot tensor of shape (length of features, 3) and
-  # with a on-value of 1 for each one-hot vector of length 3.
-  onehot_labels = tf.one_hot(labels, 3, 1, 0)
   # Compute loss.
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
   # Create training op.
   if mode == tf.estimator.ModeKeys.TRAIN:
diff --git a/tensorflow/examples/learn/mnist.py b/tensorflow/examples/learn/mnist.py
index 88425ea0d0bf72fb7e7d9cbab27da023f3ade122..98819b20bfea5021d52e2c50b004bccdaf1f25e7 100644
--- a/tensorflow/examples/learn/mnist.py
+++ b/tensorflow/examples/learn/mnist.py
@@ -77,9 +77,7 @@ def conv_model(features, labels, mode):
     return tf.estimator.EstimatorSpec(mode, predictions=predictions)
 
   # Compute loss.
-  onehot_labels = tf.one_hot(tf.cast(labels, tf.int32), N_DIGITS, 1, 0)
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
   # Create training op.
   if mode == tf.estimator.ModeKeys.TRAIN:
diff --git a/tensorflow/examples/learn/multiple_gpu.py b/tensorflow/examples/learn/multiple_gpu.py
index a294950a386a7207858bbcff345f14de44ffb9ca..3bad22ddf66b7981930637d64cc8653e3fb29cdf 100644
--- a/tensorflow/examples/learn/multiple_gpu.py
+++ b/tensorflow/examples/learn/multiple_gpu.py
@@ -65,12 +65,8 @@ def my_model(features, labels, mode):
       }
       return tf.estimator.EstimatorSpec(mode, predictions=predictions)
 
-    # Convert the labels to a one-hot tensor of shape (length of features, 3)
-    # and with a on-value of 1 for each one-hot vector of length 3.
-    onehot_labels = tf.one_hot(labels, 3, 1, 0)
     # Compute loss.
-    loss = tf.losses.softmax_cross_entropy(
-        onehot_labels=onehot_labels, logits=logits)
+    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
     # Create training op.
     if mode == tf.estimator.ModeKeys.TRAIN:
diff --git a/tensorflow/examples/learn/resnet.py b/tensorflow/examples/learn/resnet.py
index 1e0966475b01d067330dc4797032d561857fd208..9542e552504580a6614f8bd2f43c38dfa795750f 100755
--- a/tensorflow/examples/learn/resnet.py
+++ b/tensorflow/examples/learn/resnet.py
@@ -151,9 +151,7 @@ def res_net_model(features, labels, mode):
     return tf.estimator.EstimatorSpec(mode, predictions=predictions)
 
   # Compute loss.
-  onehot_labels = tf.one_hot(tf.cast(labels, tf.int32), N_DIGITS, 1, 0)
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
   # Create training op.
   if mode == tf.estimator.ModeKeys.TRAIN:
diff --git a/tensorflow/examples/learn/text_classification.py b/tensorflow/examples/learn/text_classification.py
index ba89c532be5fa0e13a2dcb1f7894be4c631507d7..eb117c39a122f4f6c108dd18f8f8035edf05eaa1 100644
--- a/tensorflow/examples/learn/text_classification.py
+++ b/tensorflow/examples/learn/text_classification.py
@@ -46,9 +46,7 @@ def estimator_spec_for_softmax_classification(
             'prob': tf.nn.softmax(logits)
         })
 
-  onehot_labels = tf.one_hot(labels, MAX_LABEL, 1, 0)
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
   if mode == tf.estimator.ModeKeys.TRAIN:
     optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
     train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
diff --git a/tensorflow/examples/learn/text_classification_character_cnn.py b/tensorflow/examples/learn/text_classification_character_cnn.py
index 363ff003628e03be40c1be6b7b32e12a07533047..afda170e2a9c1b0281fdd3d7ed210a1bfcd4481b 100644
--- a/tensorflow/examples/learn/text_classification_character_cnn.py
+++ b/tensorflow/examples/learn/text_classification_character_cnn.py
@@ -88,9 +88,7 @@ def char_cnn_model(features, labels, mode):
             'prob': tf.nn.softmax(logits)
         })
 
-  onehot_labels = tf.one_hot(labels, MAX_LABEL, 1, 0)
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
   if mode == tf.estimator.ModeKeys.TRAIN:
     optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
     train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
diff --git a/tensorflow/examples/learn/text_classification_character_rnn.py b/tensorflow/examples/learn/text_classification_character_rnn.py
index 86adc056add508c309b3a5b93e58e9c195995642..15733821fb17eb17269fea295020f6690bb62854 100644
--- a/tensorflow/examples/learn/text_classification_character_rnn.py
+++ b/tensorflow/examples/learn/text_classification_character_rnn.py
@@ -59,9 +59,7 @@ def char_rnn_model(features, labels, mode):
             'prob': tf.nn.softmax(logits)
         })
 
-  onehot_labels = tf.one_hot(labels, MAX_LABEL, 1, 0)
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
   if mode == tf.estimator.ModeKeys.TRAIN:
     optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
     train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
diff --git a/tensorflow/examples/learn/text_classification_cnn.py b/tensorflow/examples/learn/text_classification_cnn.py
index be262285a3a7aa0d6b9430a2226b448fe674cd7f..9e21aee87f629835222ab367dc3ed55863f553e4 100644
--- a/tensorflow/examples/learn/text_classification_cnn.py
+++ b/tensorflow/examples/learn/text_classification_cnn.py
@@ -87,9 +87,7 @@ def cnn_model(features, labels, mode):
             'prob': tf.nn.softmax(logits)
         })
 
-  onehot_labels = tf.one_hot(labels, MAX_LABEL, 1, 0)
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
   if mode == tf.estimator.ModeKeys.TRAIN:
     optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
     train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
diff --git a/tensorflow/examples/speech_commands/input_data.py b/tensorflow/examples/speech_commands/input_data.py
index 751652b330cd203efe216567172fd3dbb4a5b401..e7db9cddf02daf9a32d3ed859ee9bd35b2cae838 100644
--- a/tensorflow/examples/speech_commands/input_data.py
+++ b/tensorflow/examples/speech_commands/input_data.py
@@ -417,8 +417,7 @@ class AudioProcessor(object):
       sess: TensorFlow session that was active when processor was created.
 
     Returns:
-      List of sample data for the transformed samples, and list of labels in
-      one-hot form.
+      List of sample data for the transformed samples, and list of label indexes
     """
     # Pick one of the partitions to choose samples from.
     candidates = self.data_index[mode]
@@ -428,7 +427,7 @@ class AudioProcessor(object):
       sample_count = max(0, min(how_many, len(candidates) - offset))
     # Data and labels will be populated and returned.
     data = np.zeros((sample_count, model_settings['fingerprint_size']))
-    labels = np.zeros((sample_count, model_settings['label_count']))
+    labels = np.zeros(sample_count)
     desired_samples = model_settings['desired_samples']
     use_background = self.background_data and (mode == 'training')
     pick_deterministically = (mode != 'training')
@@ -483,7 +482,7 @@ class AudioProcessor(object):
       # Run the graph to produce the output audio.
       data[i - offset, :] = sess.run(self.mfcc_, feed_dict=input_dict).flatten()
       label_index = self.word_to_index[sample['label']]
-      labels[i - offset, label_index] = 1
+      labels[i - offset] = label_index
     return data, labels
 
   def get_unprocessed_data(self, how_many, model_settings, mode):
diff --git a/tensorflow/examples/speech_commands/train.py b/tensorflow/examples/speech_commands/train.py
index f46d5e59b46a9be8b261aade7dbeb4b41ba69b97..a4e80041f82191d7c58a3e52c929340eb604ec9d 100644
--- a/tensorflow/examples/speech_commands/train.py
+++ b/tensorflow/examples/speech_commands/train.py
@@ -133,7 +133,7 @@ def main(_):
 
   # Define loss and optimizer
   ground_truth_input = tf.placeholder(
-      tf.float32, [None, label_count], name='groundtruth_input')
+      tf.int64, [None], name='groundtruth_input')
 
   # Optionally we can add runtime checks to spot when NaNs or other symptoms of
   # numerical errors start occurring during training.
@@ -144,9 +144,8 @@ def main(_):
 
   # Create the back propagation and training evaluation machinery in the graph.
   with tf.name_scope('cross_entropy'):
-    cross_entropy_mean = tf.reduce_mean(
-        tf.nn.softmax_cross_entropy_with_logits(
-            labels=ground_truth_input, logits=logits))
+    cross_entropy_mean = tf.losses.sparse_softmax_cross_entropy(
+        labels=ground_truth_input, logits=logits)
   tf.summary.scalar('cross_entropy', cross_entropy_mean)
   with tf.name_scope('train'), tf.control_dependencies(control_dependencies):
     learning_rate_input = tf.placeholder(
@@ -154,13 +153,13 @@ def main(_):
     train_step = tf.train.GradientDescentOptimizer(
         learning_rate_input).minimize(cross_entropy_mean)
   predicted_indices = tf.argmax(logits, 1)
-  expected_indices = tf.argmax(ground_truth_input, 1)
-  correct_prediction = tf.equal(predicted_indices, expected_indices)
-  confusion_matrix = tf.confusion_matrix(expected_indices, predicted_indices, num_classes=label_count)
+  correct_prediction = tf.equal(predicted_indices, ground_truth_input)
+  confusion_matrix = tf.confusion_matrix(
+      ground_truth_input, predicted_indices, num_classes=label_count)
   evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
   tf.summary.scalar('accuracy', evaluation_step)
 
-  global_step = tf.contrib.framework.get_or_create_global_step()
+  global_step = tf.train.get_or_create_global_step()
   increment_global_step = tf.assign(global_step, global_step + 1)
 
   saver = tf.train.Saver(tf.global_variables())
diff --git a/tensorflow/examples/tutorials/deepdream/README.md b/tensorflow/examples/tutorials/deepdream/README.md
index 3a715f622488d260834db6b35a0da4c7ccdcd9c0..403e4b34f9bbf161cb2aad614f352679443595d4 100644
--- a/tensorflow/examples/tutorials/deepdream/README.md
+++ b/tensorflow/examples/tutorials/deepdream/README.md
@@ -2,7 +2,7 @@
 
 by [Alexander Mordvintsev](mailto:moralex@google.com)
 
-This directory contains Jupyter notebook that demonstrates a number of Convolutional Neural Network 
+This directory contains Jupyter notebook that demonstrates a number of Convolutional Neural Network
 image generation techniques implemented with TensorFlow:
 
 - visualizing individual feature channels and their combinations to explore the space of patterns learned by the neural network (see [GoogLeNet](http://storage.googleapis.com/deepdream/visualz/tensorflow_inception/index.html) and [VGG16](http://storage.googleapis.com/deepdream/visualz/vgg16/index.html) galleries)
@@ -11,8 +11,8 @@ image generation techniques implemented with TensorFlow:
 - using Laplacian Pyramid Gradient Normalization to produce smooth and colorful visuals at low cost
 - generating DeepDream-like images with TensorFlow
 
-You can view "deepdream.ipynb" directly on GitHub. Note that GitHub Jupyter notebook preview removes 
-embedded graph visualizations. You can still see them online 
+You can view "deepdream.ipynb" directly on GitHub. Note that GitHub Jupyter notebook preview removes
+embedded graph visualizations. You can still see them online
 [using nbviewer](http://nbviewer.jupyter.org/github/tensorflow/tensorflow/blob/master/tensorflow/examples/tutorials/deepdream/deepdream.ipynb)
 service.
 
@@ -23,5 +23,5 @@ In order to run the notebook locally, the following dependencies must be install
 - NumPy
 - Jupyter Notebook
 
-To open the notebook, run `ipython notebook` command in this directory, and 
+To open the notebook, run `ipython notebook` command in this directory, and
 select 'deepdream.ipynb' in the opened browser window.
diff --git a/tensorflow/examples/tutorials/layers/cnn_mnist.py b/tensorflow/examples/tutorials/layers/cnn_mnist.py
index 2124843fcb21d0c4a28ef9a11aba012a5a116e84..1e8d7d05e1c6af08d788857e74c04134333d019c 100644
--- a/tensorflow/examples/tutorials/layers/cnn_mnist.py
+++ b/tensorflow/examples/tutorials/layers/cnn_mnist.py
@@ -97,9 +97,7 @@ def cnn_model_fn(features, labels, mode):
     return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
 
   # Calculate Loss (for both TRAIN and EVAL modes)
-  onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10)
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
   # Configure the Training Op (for TRAIN mode)
   if mode == tf.estimator.ModeKeys.TRAIN:
diff --git a/tensorflow/examples/tutorials/mnist/mnist.py b/tensorflow/examples/tutorials/mnist/mnist.py
index 3585043a2a9f1920422c50cd60ce18fcfa646419..7cedd0e264f35ac4ab924c93032b019e2aae78cf 100644
--- a/tensorflow/examples/tutorials/mnist/mnist.py
+++ b/tensorflow/examples/tutorials/mnist/mnist.py
@@ -94,9 +94,7 @@ def loss(logits, labels):
     loss: Loss tensor of type float.
   """
   labels = tf.to_int64(labels)
-  cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
-      labels=labels, logits=logits, name='xentropy')
-  return tf.reduce_mean(cross_entropy, name='xentropy_mean')
+  return tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
 
 def training(loss, learning_rate):
diff --git a/tensorflow/examples/tutorials/mnist/mnist_deep.py b/tensorflow/examples/tutorials/mnist/mnist_deep.py
index a4dbab5123d49ee97445a5921a14bd1764593025..1e0294db27bc675870afceca77a2cdcd4b3f5ad3 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_deep.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_deep.py
@@ -125,27 +125,27 @@ def bias_variable(shape):
 
 def main(_):
   # Import data
-  mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
+  mnist = input_data.read_data_sets(FLAGS.data_dir)
 
   # Create the model
   x = tf.placeholder(tf.float32, [None, 784])
 
   # Define loss and optimizer
-  y_ = tf.placeholder(tf.float32, [None, 10])
+  y_ = tf.placeholder(tf.int64, [None])
 
   # Build the graph for the deep net
   y_conv, keep_prob = deepnn(x)
 
   with tf.name_scope('loss'):
-    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=y_,
-                                                            logits=y_conv)
+    cross_entropy = tf.losses.sparse_softmax_cross_entropy(
+        labels=y_, logits=y_conv)
   cross_entropy = tf.reduce_mean(cross_entropy)
 
   with tf.name_scope('adam_optimizer'):
     train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
 
   with tf.name_scope('accuracy'):
-    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
+    correct_prediction = tf.equal(tf.argmax(y_conv, 1), y_)
     correct_prediction = tf.cast(correct_prediction, tf.float32)
   accuracy = tf.reduce_mean(correct_prediction)
 
diff --git a/tensorflow/examples/tutorials/mnist/mnist_softmax.py b/tensorflow/examples/tutorials/mnist/mnist_softmax.py
index addd2d3810219f70ffb5f7c919f01de35dd816d9..fb3ac942039e670fb5ca975c5d9835ba065190a2 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_softmax.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_softmax.py
@@ -34,7 +34,7 @@ FLAGS = None
 
 def main(_):
   # Import data
-  mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
+  mnist = input_data.read_data_sets(FLAGS.data_dir)
 
   # Create the model
   x = tf.placeholder(tf.float32, [None, 784])
@@ -43,7 +43,7 @@ def main(_):
   y = tf.matmul(x, W) + b
 
   # Define loss and optimizer
-  y_ = tf.placeholder(tf.float32, [None, 10])
+  y_ = tf.placeholder(tf.int64, [None])
 
   # The raw formulation of cross-entropy,
   #
@@ -52,10 +52,9 @@ def main(_):
   #
   # can be numerically unstable.
   #
-  # So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
+  # So here we use tf.losses.sparse_softmax_cross_entropy on the raw
   # outputs of 'y', and then average across the batch.
-  cross_entropy = tf.reduce_mean(
-      tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
+  cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=y)
   train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
 
   sess = tf.InteractiveSession()
@@ -66,7 +65,7 @@ def main(_):
     sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
 
   # Test trained model
-  correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
+  correct_prediction = tf.equal(tf.argmax(y, 1), y_)
   accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
   print(sess.run(accuracy, feed_dict={x: mnist.test.images,
                                       y_: mnist.test.labels}))
diff --git a/tensorflow/examples/tutorials/mnist/mnist_softmax_xla.py b/tensorflow/examples/tutorials/mnist/mnist_softmax_xla.py
index eaff05913af756c6ab0bf80e8f0893b1d239d60d..e89317494f9b7171a93b2706d9d612d456ddf937 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_softmax_xla.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_softmax_xla.py
@@ -32,7 +32,7 @@ FLAGS = None
 
 def main(_):
   # Import data
-  mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
+  mnist = input_data.read_data_sets(FLAGS.data_dir)
 
   # Create the model
   x = tf.placeholder(tf.float32, [None, 784])
@@ -41,7 +41,7 @@ def main(_):
   y = tf.matmul(x, w) + b
 
   # Define loss and optimizer
-  y_ = tf.placeholder(tf.float32, [None, 10])
+  y_ = tf.placeholder(tf.int64, [None])
 
   # The raw formulation of cross-entropy,
   #
@@ -50,10 +50,9 @@ def main(_):
   #
   # can be numerically unstable.
   #
-  # So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
-  # outputs of 'y', and then average across the batch.
-  cross_entropy = tf.reduce_mean(
-      tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
+  # So here we use tf.losses.sparse_softmax_cross_entropy on the raw
+  # logit outputs of 'y', and then average across the batch.
+  cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=y)
   train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
 
   config = tf.ConfigProto()
@@ -86,7 +85,7 @@ def main(_):
       sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
 
   # Test trained model
-  correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
+  correct_prediction = tf.equal(tf.argmax(y, 1), y_)
   accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
   print(sess.run(accuracy,
                  feed_dict={x: mnist.test.images,
diff --git a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
index c401d09df8ca5132178ab31e3b14b3a5cf98e70d..7967e22d6a0319a530cb2f00e54872f022ac0095 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
@@ -38,7 +38,6 @@ FLAGS = None
 def train():
   # Import data
   mnist = input_data.read_data_sets(FLAGS.data_dir,
-                                    one_hot=True,
                                     fake_data=FLAGS.fake_data)
 
   sess = tf.InteractiveSession()
@@ -47,7 +46,7 @@ def train():
   # Input placeholders
   with tf.name_scope('input'):
     x = tf.placeholder(tf.float32, [None, 784], name='x-input')
-    y_ = tf.placeholder(tf.float32, [None, 10], name='y-input')
+    y_ = tf.placeholder(tf.int64, [None], name='y-input')
 
   with tf.name_scope('input_reshape'):
     image_shaped_input = tf.reshape(x, [-1, 28, 28, 1])
@@ -117,12 +116,12 @@ def train():
     #
     # can be numerically unstable.
     #
-    # So here we use tf.nn.softmax_cross_entropy_with_logits on the
-    # raw outputs of the nn_layer above, and then average across
+    # So here we use tf.losses.sparse_softmax_cross_entropy on the
+    # raw logit outputs of the nn_layer above, and then average across
     # the batch.
-    diff = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y)
     with tf.name_scope('total'):
-      cross_entropy = tf.reduce_mean(diff)
+      cross_entropy = tf.losses.sparse_softmax_cross_entropy(
+          labels=y_, logits=y)
   tf.summary.scalar('cross_entropy', cross_entropy)
 
   with tf.name_scope('train'):
@@ -131,7 +130,7 @@ def train():
 
   with tf.name_scope('accuracy'):
     with tf.name_scope('correct_prediction'):
-      correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
+      correct_prediction = tf.equal(tf.argmax(y, 1), y_)
     with tf.name_scope('accuracy'):
       accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
   tf.summary.scalar('accuracy', accuracy)
diff --git a/tensorflow/examples/udacity/1_notmnist.ipynb b/tensorflow/examples/udacity/1_notmnist.ipynb
index 39674e1aa49ad70216b778444d2448d89f44d952..dffe5d37c64c33fe3d5ce632ad4671abe0b6f673 100644
--- a/tensorflow/examples/udacity/1_notmnist.ipynb
+++ b/tensorflow/examples/udacity/1_notmnist.ipynb
@@ -46,13 +46,13 @@
         "# These are all the modules we'll be using later. Make sure you can import them\n",
         "# before proceeding further.\n",
         "from __future__ import print_function\n",
+        "import imageio\n",
         "import matplotlib.pyplot as plt\n",
         "import numpy as np\n",
         "import os\n",
         "import sys\n",
         "import tarfile\n",
         "from IPython.display import display, Image\n",
-        "from scipy import ndimage\n",
         "from sklearn.linear_model import LogisticRegression\n",
         "from six.moves.urllib.request import urlretrieve\n",
         "from six.moves import cPickle as pickle\n",
@@ -325,13 +325,13 @@
         "  for image in image_files:\n",
         "    image_file = os.path.join(folder, image)\n",
         "    try:\n",
-        "      image_data = (ndimage.imread(image_file).astype(float) - \n",
+        "      image_data = (imageio.imread(image_file).astype(float) - \n",
         "                    pixel_depth / 2) / pixel_depth\n",
         "      if image_data.shape != (image_size, image_size):\n",
         "        raise Exception('Unexpected image shape: %s' % str(image_data.shape))\n",
         "      dataset[num_images, :, :] = image_data\n",
         "      num_images = num_images + 1\n",
-        "    except IOError as e:\n",
+        "    except (IOError, ValueError) as e:\n",
         "      print('Could not read:', image_file, ':', e, '- it\\'s ok, skipping.')\n",
         "    \n",
         "  dataset = dataset[0:num_images, :, :]\n",
diff --git a/tensorflow/examples/udacity/README.md b/tensorflow/examples/udacity/README.md
index 6faad294c2df59f480ed15e7cf3f216311d553bc..f80c56d1c181edcb26c93c01bf9ba4e486c6d146 100644
--- a/tensorflow/examples/udacity/README.md
+++ b/tensorflow/examples/udacity/README.md
@@ -43,15 +43,15 @@ In addition, you may need to pass `--memory=8g` as an extra argument to
 `docker-machine` is a tool to provision and manage docker hosts, it supports multiple platform (ex. aws, gce, azure, virtualbox, ...). To create a new virtual machine locally with built-in docker engine, you can use
 
     docker-machine create -d virtualbox --virtualbox-memory 8196 tensorflow
-    
+
 `-d` means the driver for the cloud platform, supported drivers listed [here](https://docs.docker.com/machine/drivers/). Here we use virtualbox to create a new virtual machine locally. `tensorflow` means the name of the virtual machine, feel free to use whatever you like. You can use
 
     docker-machine ip tensorflow
-    
+
 to get the ip of the new virtual machine. To switch from default virtual machine to a new one (here we use tensorflow), type
 
     eval $(docker-machine env tensorflow)
-    
+
 Note that `docker-machine env tensorflow` outputs some environment variables such like `DOCKER_HOST`. Then your docker client is now connected to the docker host in virtual machine `tensorflow`
 
 * **I'm getting a TLS connection error.**
diff --git a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
index 1e375ed48edcc779509179d7eae0ff93bbc87b16..4a429837b7b997f0f6571060280a9a15543b9f54 100644
--- a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
+++ b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
@@ -53,7 +53,8 @@ tensorflow::Status WavToSpectrogram(const tensorflow::string& input_wav,
   //  - Scales, clamps, and converts that spectrogram to 0 to 255 uint8's.
   //  - Reshapes the tensor so that it's [height, width, 1] for imaging.
   //  - Encodes it as a PNG stream and saves it out to a file.
-  Output file_reader = ReadFile(root.WithOpName("input_wav"), input_wav);
+  Output file_reader =
+      tensorflow::ops::ReadFile(root.WithOpName("input_wav"), input_wav);
   DecodeWav wav_decoder =
       DecodeWav(root.WithOpName("wav_decoder"), file_reader);
   Output spectrogram = AudioSpectrogram(root.WithOpName("spectrogram"),
@@ -71,8 +72,8 @@ tensorflow::Status WavToSpectrogram(const tensorflow::string& input_wav,
   Output squeeze = Squeeze(root.WithOpName("squeeze"), expand_dims,
                            Squeeze::Attrs().Axis({0}));
   Output png_encoder = EncodePng(root.WithOpName("png_encoder"), squeeze);
-  WriteFile file_writer =
-      WriteFile(root.WithOpName("output_image"), output_image, png_encoder);
+  tensorflow::ops::WriteFile file_writer = tensorflow::ops::WriteFile(
+      root.WithOpName("output_image"), output_image, png_encoder);
   tensorflow::GraphDef graph;
   TF_RETURN_IF_ERROR(root.ToGraphDef(&graph));
 
diff --git a/tensorflow/g3doc/README.txt b/tensorflow/g3doc/README.txt
index 6eaf1e1bda1e6c43df96195a682961cd28dc177b..ed648f8b6b8895010be84becd4fda25ded5859fb 100644
--- a/tensorflow/g3doc/README.txt
+++ b/tensorflow/g3doc/README.txt
@@ -7,7 +7,7 @@ Documentation (on Github, tensorflow.org, and anywhere else we decide to
 serve it from) is now generated from the files in
 tensorflow/docs_src/ (for tutorials and other guides) and
 TensorFlow source code (for the API reference pages). If you see a problem with
-API reference, edit the code comments in the appropriate language. If you see a 
+API reference, edit the code comments in the appropriate language. If you see a
 problem with our other docs, edit the files in docs_src.
 
 To preview the results of your changes, or generate an offline copy of
diff --git a/tensorflow/go/README.md b/tensorflow/go/README.md
index 376e22b38082f7ebeacf49edd44e85c12be2d95f..b1bd87eb0c3b3a498a1db45f11d9a48552e08079 100644
--- a/tensorflow/go/README.md
+++ b/tensorflow/go/README.md
@@ -26,9 +26,12 @@ from source.
     ([Linux](https://www.tensorflow.org/install/install_sources#PrepareLinux)
     or [OS
     X](https://www.tensorflow.org/install/install_sources#PrepareMac)).
-    If you don't need GPU support, then try the following: `sh # Linux sudo
-    apt-get install python swig python-numpy # OS X with homebrew brew install
-    swig`
+    If you don't need GPU support, then try the following:
+
+    ```sh
+    sudo apt-get install python swig python-numpy # Linux
+    brew install swig                             # OS X with homebrew
+    ```
 
 ### Build
 
diff --git a/tensorflow/go/android.go b/tensorflow/go/android.go
index f7d666b7a920696d5ac516b9ef733bc2b59a97a9..3db3ddfec5cc16dbb47bc847513989dfd3810ea3 100644
--- a/tensorflow/go/android.go
+++ b/tensorflow/go/android.go
@@ -1,3 +1,17 @@
+// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 // +build android
 
 package tensorflow
diff --git a/tensorflow/go/genop/generate.sh b/tensorflow/go/genop/generate.sh
index 01fcfb9058378b49d1315ddbbcc08e6a5de09d7d..a894c87c2765d01d2310159b19092904ad50a8b3 100644
--- a/tensorflow/go/genop/generate.sh
+++ b/tensorflow/go/genop/generate.sh
@@ -19,6 +19,11 @@ set -e
 go get github.com/golang/protobuf/proto
 go get github.com/golang/protobuf/protoc-gen-go
 
+if [ -z "${GOPATH}" ]
+then
+  GOPATH=$(go env GOPATH)
+fi
+
 cd $(dirname $0)
 for g in $(echo "${GOPATH//:/ }"); do
     TF_DIR="${g}/src/github.com/tensorflow/tensorflow"
diff --git a/tensorflow/go/genop/main.go b/tensorflow/go/genop/main.go
index b6f8e2d5a8e30c4721b5c49f64b15f72cc70a794..0c7d9be5c137b384da21b5afac099fee4df04e6f 100644
--- a/tensorflow/go/genop/main.go
+++ b/tensorflow/go/genop/main.go
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-//go:generate sh generate.sh
+//go:generate bash generate.sh
 
 // Command genop generates a Go source file with functions for TensorFlow ops.
 package main
diff --git a/tensorflow/go/graph.go b/tensorflow/go/graph.go
index 46c600eab17c6c467d0b3a3312f848541f382e80..fc087d9d995dfe031e61fd0fa15d649c2ee35cc9 100644
--- a/tensorflow/go/graph.go
+++ b/tensorflow/go/graph.go
@@ -20,6 +20,25 @@ package tensorflow
 //
 // #include <stdlib.h>
 // #include <string.h>
+//
+// void TF_SetAttrShapeList_Helper(TF_OperationDescription* desc,
+//                                 const char* attr_name,
+//                                 const int64_t* flat_dims,
+//                                 const int* num_dims,
+//                                 int num_shapes) {
+//  const int64_t** dims =
+//    (const int64_t**)malloc(sizeof(const int64_t*) * num_shapes);
+//  int i = 0;
+//  for (i = 0; i < num_shapes; i++) {
+//    dims[i] = flat_dims;
+//    if (num_dims[i] > 0) {
+//      // flat_dims will be NULL iff num_shapes is 0 or all elements in num_dims are <= 0.
+//      flat_dims += num_dims[i];
+//    }
+//  }
+//  TF_SetAttrShapeList(desc, attr_name, dims, num_dims, num_shapes);
+//  free(dims);
+// }
 import "C"
 
 import (
@@ -114,6 +133,20 @@ func (g *Graph) Operation(name string) *Operation {
 	return &Operation{cop, g}
 }
 
+// Operations returns a list of all operations in the graph
+func (g *Graph) Operations() []Operation {
+	var pos C.size_t = 0
+	ops := []Operation{}
+	for {
+		cop := C.TF_GraphNextOperation(g.c, &pos)
+		if cop == nil {
+			break
+		}
+		ops = append(ops, Operation{cop, g})
+	}
+	return ops
+}
+
 // OpSpec is the specification of an Operation to be added to a Graph
 // (using Graph.AddOperation).
 type OpSpec struct {
@@ -289,41 +322,37 @@ func setAttr(cdesc *C.TF_OperationDescription, status *status, name string, valu
 			return fmt.Errorf("bad value for attribute %q: %v", name, err)
 		}
 	case Shape:
-		ndims, dims := cshape(value)
+		ndims := C.int(value.NumDimensions())
 		var dimsp *C.int64_t
 		if ndims > 0 {
+			dims := make([]C.int64_t, ndims)
+			for i, d := range value.dims {
+				dims[i] = C.int64_t(d)
+			}
 			dimsp = &dims[0]
 		}
 		C.TF_SetAttrShape(cdesc, cAttrName, dimsp, ndims)
 	case []Shape:
-		ndims := make([]C.int, len(value))
-		dims := make([][]C.int64_t, len(value))
-		dimsp := make([]*C.int64_t, len(value))
-		for i, s := range value {
-			ndims[i], dims[i] = cshape(s)
-			if ndims[i] > 0 {
-				dimsp[i] = &dims[i][0]
-			}
-		}
-		if len(value) > 0 {
-			C.TF_SetAttrShapeList(cdesc, cAttrName, &dimsp[0], &ndims[0], C.int(len(value)))
-		} else {
+		if len(value) == 0 {
 			C.TF_SetAttrShapeList(cdesc, cAttrName, nil, nil, 0)
+		} else {
+			var flatDims []C.int64_t
+			ndims := make([]C.int, len(value))
+			for i, s := range value {
+				nd := s.NumDimensions()
+				ndims[i] = C.int(nd)
+				for _, d := range s.dims {
+					flatDims = append(flatDims, C.int64_t(d))
+				}
+			}
+			var flatDimsp *C.int64_t
+			if len(flatDims) > 0 {
+				flatDimsp = &flatDims[0]
+			}
+			C.TF_SetAttrShapeList_Helper(cdesc, cAttrName, flatDimsp, &ndims[0], C.int(len(value)))
 		}
 	default:
 		return fmt.Errorf("attribute %q has a type (%T) which is not valid for operation attributes", name, value)
 	}
 	return nil
 }
-
-func cshape(s Shape) (C.int, []C.int64_t) {
-	ndims := C.int(s.NumDimensions())
-	if ndims < 0 {
-		return -1, nil
-	}
-	dims := make([]C.int64_t, ndims)
-	for i, s := range s.dims {
-		dims[i] = C.int64_t(s)
-	}
-	return ndims, dims
-}
diff --git a/tensorflow/go/graph_test.go b/tensorflow/go/graph_test.go
index c3120bc720308402b22884f29b7ff87ef035874b..b8d65c54f697153ad236f5e27d9f27d048c3a22e 100644
--- a/tensorflow/go/graph_test.go
+++ b/tensorflow/go/graph_test.go
@@ -29,10 +29,26 @@ func hasOperations(g *Graph, ops ...string) error {
 			missing = append(missing, op)
 		}
 	}
-	if len(missing) == 0 {
-		return nil
+	if len(missing) != 0 {
+		return fmt.Errorf("Graph does not have the operations %v", missing)
 	}
-	return fmt.Errorf("Graph does not have the operations %v", missing)
+
+	inList := map[string]bool{}
+	for _, op := range g.Operations() {
+		inList[op.Name()] = true
+	}
+
+	for _, op := range ops {
+		if !inList[op] {
+			missing = append(missing, op)
+		}
+	}
+
+	if len(missing) != 0 {
+		return fmt.Errorf("Operations %v are missing from graph.Operations()", missing)
+	}
+
+	return nil
 }
 
 func TestGraphWriteToAndImport(t *testing.T) {
diff --git a/tensorflow/go/op/op_test.go b/tensorflow/go/op/op_test.go
index 2451ba360699a7ac24f64209339e7b4f92ffb548..842dee9ffe396c44cfa26bbc7fd34a598e62bf89 100644
--- a/tensorflow/go/op/op_test.go
+++ b/tensorflow/go/op/op_test.go
@@ -58,3 +58,76 @@ func TestAddOperationFailure(t *testing.T) {
 	_ = resize.Shape()
 	t.Errorf("resize.Shape() should have paniced since the underlying Operation was not created")
 }
+
+func TestShapeAttribute(t *testing.T) {
+	s := NewScope()
+	x := Placeholder(s.SubScope("x"), tf.Int32, PlaceholderShape(tf.MakeShape(1)))
+	y := Placeholder(s.SubScope("y"), tf.Int32, PlaceholderShape(tf.Shape{}))
+	z := Add(s, x, y)
+	graph, err := s.Finalize()
+	if err != nil {
+		t.Fatal(err)
+	}
+	sess, err := tf.NewSession(graph, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	value, err := tf.NewTensor([]int32{7})
+	if err != nil {
+		t.Fatal(err)
+	}
+	feeds := map[tf.Output]*tf.Tensor{
+		x: value,
+		y: value,
+	}
+	fetched, err := sess.Run(feeds, []tf.Output{z}, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if got, want := len(fetched), 1; got != want {
+		t.Fatalf("Fetched %d tensors, expected %d", got, want)
+	}
+	if got, want := fetched[0].Value().([]int32), []int32{14}; len(got) != len(want) || len(got) != 1 || got[0] != want[0] {
+		t.Fatalf("Got %v, want %v", got, want)
+	}
+}
+
+func TestDataset(t *testing.T) {
+	var (
+		s = NewScope()
+
+		// The use of a non-scalar here is inspired by
+		// https://github.com/tensorflow/tensorflow/issues/14891
+		c       = Const(s, []int32{21718, 31415})
+		types   = []tf.DataType{c.DataType()}
+		shapes  = []tf.Shape{c.Shape()}
+		dataset = TensorDataset(s, []tf.Output{c}, shapes)
+
+		iterator = Iterator(s, "", "", types, shapes)
+		next     = IteratorGetNext(s, iterator, types, shapes)
+		init     = MakeIterator(s, dataset, iterator)
+	)
+	graph, err := s.Finalize()
+	if err != nil {
+		t.Fatal(err)
+	}
+	sess, err := tf.NewSession(graph, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if _, err := sess.Run(nil, nil, []*tf.Operation{init}); err != nil {
+		t.Fatal(err)
+	}
+	results, err := sess.Run(nil, next, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	got := results[0].Value().([]int32)
+	if len(got) != 2 || got[0] != 21718 || got[1] != 31415 {
+		t.Errorf("Got %v, want {21718, 31415}", got)
+	}
+	if _, err := sess.Run(nil, next, nil); err == nil {
+		t.Errorf("Expected sess.Run() to fail since the iterator should have reached the end of the dataset")
+	}
+}
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index a910b51fb97d130ffc111922c0a3aa11535fb37a..091f64de703733c62c7bf91c5df705bf1829188f 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -44,19 +44,19 @@ func makeOutputList(op *tf.Operation, start int, output string) ([]tf.Output, in
 //
 // Arguments:
 //	writer: A handle to a summary writer.
-//	global_step: The step to write the summary for.
+//	step: The step to write the summary for.
 //	tag: Tag for the summary.
 //	value: Value for the summary.
 //
 // Returns the created operation.
-func WriteScalarSummary(scope *Scope, writer tf.Output, global_step tf.Output, tag tf.Output, value tf.Output) (o *tf.Operation) {
+func WriteScalarSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
 		Type: "WriteScalarSummary",
 		Input: []tf.Input{
-			writer, global_step, tag, value,
+			writer, step, tag, value,
 		},
 	}
 	return scope.AddOperation(opspec)
@@ -89,21 +89,21 @@ func ImportEvent(scope *Scope, writer tf.Output, event tf.Output) (o *tf.Operati
 //
 // Arguments:
 //	writer: A handle to a summary writer.
-//	global_step: The step to write the summary for.
+//	step: The step to write the summary for.
 //	tensor: A tensor to serialize.
 //	tag: The summary's tag.
 //	summary_metadata: Serialized SummaryMetadata protocol buffer containing
 // plugin-related metadata for this summary.
 //
 // Returns the created operation.
-func WriteSummary(scope *Scope, writer tf.Output, global_step tf.Output, tensor tf.Output, tag tf.Output, summary_metadata tf.Output) (o *tf.Operation) {
+func WriteSummary(scope *Scope, writer tf.Output, step tf.Output, tensor tf.Output, tag tf.Output, summary_metadata tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
 		Type: "WriteSummary",
 		Input: []tf.Input{
-			writer, global_step, tensor, tag, summary_metadata,
+			writer, step, tensor, tag, summary_metadata,
 		},
 	}
 	return scope.AddOperation(opspec)
@@ -1484,6 +1484,61 @@ func Slice(scope *Scope, input tf.Output, begin tf.Output, size tf.Output) (outp
 	return op.Output(0)
 }
 
+// UniqueV2Attr is an optional argument to UniqueV2.
+type UniqueV2Attr func(optionalAttr)
+
+// UniqueV2OutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueV2OutIdx(value tf.DataType) UniqueV2Attr {
+	return func(m optionalAttr) {
+		m["out_idx"] = value
+	}
+}
+
+// Finds unique elements in a 1-D tensor.
+//
+// This operation returns a tensor `y` containing all of the unique elements of `x`
+// sorted in the same order that they occur in `x`. This operation also returns a
+// tensor `idx` the same size as `x` that contains the index of each value of `x`
+// in the unique output `y`. In other words:
+//
+// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+//
+// For example:
+//
+// ```
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx = unique(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`.
+//	axis: A `Tensor` of type `int64` (default: 0). The axis of the Tensor to
+// find the unique elements.
+//
+// Returns A `Tensor`. Unique elements along the `axis` of `Tensor` x.A 1-D Tensor. Has the same type as x that contains the index of each
+// value of x in the output y.
+func UniqueV2(scope *Scope, x tf.Output, axis tf.Output, optional ...UniqueV2Attr) (y tf.Output, idx tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UniqueV2",
+		Input: []tf.Input{
+			x, axis,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
 // Shuffle dimensions of x according to a permutation and conjugate the result.
 //
 // The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
@@ -2019,6 +2074,28 @@ func ZerosLike(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// Gives a guarantee to the TF runtime that the input tensor is a constant.
+//
+// The runtime is then free to make optimizations based on this.
+//
+// Only accepts value typed tensors as inputs and rejects resource variable handles
+// as input.
+//
+// Returns the input tensor without modification.
+func GuaranteeConst(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GuaranteeConst",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Splits a tensor into `num_split` tensors along one dimension.
 //
 // Arguments:
@@ -2147,19 +2224,19 @@ func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset
 //
 // Arguments:
 //	writer: A handle to a summary writer.
-//	global_step: The step to write the summary for.
+//	step: The step to write the summary for.
 //	tag: Scalar.  Tag to use for the `Summary.Value`.
 //	values: Any shape. Values to use to build the histogram.
 //
 // Returns the created operation.
-func WriteHistogramSummary(scope *Scope, writer tf.Output, global_step tf.Output, tag tf.Output, values tf.Output) (o *tf.Operation) {
+func WriteHistogramSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Output, values tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
 		Type: "WriteHistogramSummary",
 		Input: []tf.Input{
-			writer, global_step, tag, values,
+			writer, step, tag, values,
 		},
 	}
 	return scope.AddOperation(opspec)
@@ -2681,21 +2758,6 @@ func Abort(scope *Scope, optional ...AbortAttr) (o *tf.Operation) {
 	return scope.AddOperation(opspec)
 }
 
-// Does nothing. Serves as a control trigger for scheduling.
-//
-// Only useful as a placeholder for control edges.
-//
-// Returns the created operation.
-func ControlTrigger(scope *Scope) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ControlTrigger",
-	}
-	return scope.AddOperation(opspec)
-}
-
 // SpaceToDepthAttr is an optional argument to SpaceToDepth.
 type SpaceToDepthAttr func(optionalAttr)
 
@@ -2726,7 +2788,7 @@ func SpaceToDepthDataFormat(value string) SpaceToDepthAttr {
 //   "NHWC": `[ batch, height, width, channels ]`
 //   "NCHW": `[ batch, channels, height, width ]`
 //   "NCHW_VECT_C":
-//       `qint8 [ batch, channels / 4, height, width, channels % 4 ]`
+//       `qint8 [ batch, channels / 4, height, width, 4 ]`
 //
 // It is useful to consider the operation as transforming a 6-D Tensor.
 // e.g. for data_format = NHWC,
@@ -3656,7 +3718,7 @@ func DepthToSpaceDataFormat(value string) DepthToSpaceAttr {
 //   "NHWC": `[ batch, height, width, channels ]`
 //   "NCHW": `[ batch, channels, height, width ]`
 //   "NCHW_VECT_C":
-//       `qint8 [ batch, channels / 4, height, width, channels % 4 ]`
+//       `qint8 [ batch, channels / 4, height, width, 4 ]`
 //
 // It is useful to consider the operation as transforming a 6-D Tensor.
 // e.g. for data_format = NHWC,
@@ -5334,6 +5396,21 @@ func DynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged
 	return op.Output(0)
 }
 
+// Produces a summary of any statistics recorded by the given statistics manager.
+func StatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "StatsAggregatorSummary",
+		Input: []tf.Input{
+			iterator,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // FIFOQueueV2Attr is an optional argument to FIFOQueueV2.
 type FIFOQueueV2Attr func(optionalAttr)
 
@@ -5807,6 +5884,64 @@ func CacheDataset(scope *Scope, input_dataset tf.Output, filename tf.Output, out
 	return op.Output(0)
 }
 
+// Creates a dataset that shuffles and repeats elements from `input_dataset`
+//
+// pseudorandomly.
+//
+// Arguments:
+//
+//	buffer_size: The number of output elements to buffer in an iterator over
+// this dataset. Compare with the `min_after_dequeue` attr when creating a
+// `RandomShuffleQueue`.
+//	seed: A scalar seed for the random number generator. If either `seed` or
+// `seed2` is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
+//	count: A scalar representing the number of times the underlying dataset
+// should be repeated. The default is `-1`, which results in infinite repetition.
+//
+//
+func ShuffleAndRepeatDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ShuffleAndRepeatDataset",
+		Input: []tf.Input{
+			input_dataset, buffer_size, seed, seed2, count,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a Dataset that returns pseudorandom numbers.
+//
+// Arguments:
+//	seed: A scalar seed for the random number generator. If either seed or
+// seed2 is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
+//
+//
+func RandomDataset(scope *Scope, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "RandomDataset",
+		Input: []tf.Input{
+			seed, seed2,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Identity op for gradient debugging.
 //
 // This op is hidden from public in Python. It is used by TensorFlow Debugger to
@@ -5950,6 +6085,23 @@ func DeserializeIterator(scope *Scope, resource_handle tf.Output, serialized tf.
 	return scope.AddOperation(opspec)
 }
 
+// Records the latency of producing `input_dataset` elements in a StatsAggregator.
+func LatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "LatencyStatsDataset",
+		Input: []tf.Input{
+			input_dataset, tag,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Concatenates tensors along one dimension.
 //
 // Arguments:
@@ -6146,6 +6298,43 @@ func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_ou
 	return op.Output(0)
 }
 
+// StatsAggregatorHandleAttr is an optional argument to StatsAggregatorHandle.
+type StatsAggregatorHandleAttr func(optionalAttr)
+
+// StatsAggregatorHandleContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StatsAggregatorHandleContainer(value string) StatsAggregatorHandleAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// StatsAggregatorHandleSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StatsAggregatorHandleSharedName(value string) StatsAggregatorHandleAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a statistics manager resource.
+func StatsAggregatorHandle(scope *Scope, optional ...StatsAggregatorHandleAttr) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StatsAggregatorHandle",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // CropAndResizeGradBoxesAttr is an optional argument to CropAndResizeGradBoxes.
 type CropAndResizeGradBoxesAttr func(optionalAttr)
 
@@ -6223,8 +6412,8 @@ func ShuffleDatasetReshuffleEachIteration(value bool) ShuffleDatasetAttr {
 //	buffer_size: The number of output elements to buffer in an iterator over
 // this dataset. Compare with the `min_after_dequeue` attr when creating a
 // `RandomShuffleQueue`.
-//	seed: A scalar seed for the random number generator. If either seed or
-// seed2 is set to be non-zero, the random number generator is seeded
+//	seed: A scalar seed for the random number generator. If either `seed` or
+// `seed2` is set to be non-zero, the random number generator is seeded
 // by the given seed.  Otherwise, a random seed is used.
 //	seed2: A second scalar seed to avoid seed collision.
 //
@@ -8054,30 +8243,71 @@ func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
-// QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3.
-type QuantizeAndDequantizeV3Attr func(optionalAttr)
-
-// QuantizeAndDequantizeV3SignedInput sets the optional signed_input attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeV3SignedInput(value bool) QuantizeAndDequantizeV3Attr {
-	return func(m optionalAttr) {
-		m["signed_input"] = value
+// Merges summaries.
+//
+// This op creates a
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// protocol buffer that contains the union of all the values in the input
+// summaries.
+//
+// When the Op is run, it reports an `InvalidArgument` error if multiple values
+// in the summaries to merge use the same tag.
+//
+// Arguments:
+//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
+// buffers.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "MergeSummary",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// QuantizeAndDequantizeV3RangeGiven sets the optional range_given attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeV3RangeGiven(value bool) QuantizeAndDequantizeV3Attr {
+// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
+type AudioSummaryV2Attr func(optionalAttr)
+
+// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
+//
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
 	return func(m optionalAttr) {
-		m["range_given"] = value
+		m["max_outputs"] = value
 	}
 }
 
-// Quantizes then dequantizes a tensor.
+// Outputs a `Summary` protocol buffer with audio.
 //
-// This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
-// tensor, so its value can change during training.
-func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, num_bits tf.Output, optional ...QuantizeAndDequantizeV3Attr) (output tf.Output) {
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+//
+// Arguments:
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8086,9 +8316,9 @@ func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantizeV3",
+		Type: "AudioSummaryV2",
 		Input: []tf.Input{
-			input, input_min, input_max, num_bits,
+			tag, tensor, sample_rate,
 		},
 		Attrs: attrs,
 	}
@@ -8096,124 +8326,85 @@ func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output,
 	return op.Output(0)
 }
 
-// AvgPool3DAttr is an optional argument to AvgPool3D.
-type AvgPool3DAttr func(optionalAttr)
+// ImageSummaryAttr is an optional argument to ImageSummary.
+type ImageSummaryAttr func(optionalAttr)
 
-// AvgPool3DDataFormat sets the optional data_format attribute to value.
+// ImageSummaryMaxImages sets the optional max_images attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func AvgPool3DDataFormat(value string) AvgPool3DAttr {
+// value: Max number of batch elements to generate images for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func ImageSummaryMaxImages(value int64) ImageSummaryAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["max_images"] = value
 	}
 }
 
-// Performs 3D average pooling on the input.
-//
-// Arguments:
-//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+// ImageSummaryBadColor sets the optional bad_color attribute to value.
 //
-// Returns The average pooled output tensor.
-func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "AvgPool3D",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
+// value: Color to use for pixels with non-finite values.
+// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 >
+func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
+	return func(m optionalAttr) {
+		m["bad_color"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Produces the max pool of the input tensor for quantized types.
+// Outputs a `Summary` protocol buffer with images.
 //
-// Arguments:
-//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	ksize: The size of the window for each dimension of the input tensor.
-// The length must be 4 to match the number of dimensions of the input.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor. The length must be 4 to match the number of dimensions of the input.
-//	padding: The type of padding algorithm to use.
+// The summary has up to `max_images` summary values containing images. The
+// images are built from `tensor` which must be 4-D with shape `[batch_size,
+// height, width, channels]` and where `channels` can be:
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "QuantizedMaxPool",
-		Input: []tf.Input{
-			input, min_input, max_input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Conv3DBackpropInputV2Attr is an optional argument to Conv3DBackpropInputV2.
-type Conv3DBackpropInputV2Attr func(optionalAttr)
-
-// Conv3DBackpropInputV2DataFormat sets the optional data_format attribute to value.
+// *  1: `tensor` is interpreted as Grayscale.
+// *  3: `tensor` is interpreted as RGB.
+// *  4: `tensor` is interpreted as RGBA.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes the gradients of 3-D convolution with respect to the input.
+// The images have the same number of channels as the input tensor. For float
+// input, the values are normalized one image at a time to fit in the range
+// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
+// normalization algorithms:
+//
+// *  If the input values are all positive, they are rescaled so the largest one
+//    is 255.
+//
+// *  If any input value is negative, the values are shifted so input value 0.0
+//    is at 127.  They are then rescaled so that either the smallest value is 0,
+//    or the largest one is 255.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
+// *  If `max_images` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+//
+// The `bad_color` argument is the color to use in the generated images for
+// non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
+// Each element must be in the range `[0, 255]` (It represents the value of a
+// pixel in the output image).  Non-finite values in the input tensor are
+// replaced by this tensor in the output image.  The default value is the color
+// red.
 //
 // Arguments:
-//	input_sizes: An integer vector representing the tensor shape of `input`,
-// where `input` is a 5-D
-// `[batch, depth, rows, cols, in_channels]` tensor.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputV2Attr) (output tf.Output) {
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
+// `channels` is 1, 3, or 4.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropInputV2",
+		Type: "ImageSummary",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			tag, tensor,
 		},
 		Attrs: attrs,
 	}
@@ -8221,308 +8412,237 @@ func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output
 	return op.Output(0)
 }
 
-// Returns a tensor of ones with the same shape and type as x.
+// Computes the number of elements in the given queue.
 //
 // Arguments:
-//	x: a tensor of type T.
+//	handle: The handle to a queue.
 //
-// Returns a tensor of the same shape and type as x but filled with ones.
-func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns The number of elements in the given queue.
+func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "OnesLike",
+		Type: "QueueSizeV2",
 		Input: []tf.Input{
-			x,
+			handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns element-wise remainder of division. This emulates C semantics in that
-//
-// the result here is consistent with a truncating divide. E.g.
-// `tf.truncatediv(x, y) * y + truncate_mod(x, y) = x`.
+// Outputs a `Summary` protocol buffer with a histogram.
 //
-// *NOTE*: `Mod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Mod",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the gradients of 3-D convolution with respect to the filter.
+// The generated
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// has one summary value containing a histogram for `values`.
 //
-// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
+// This op reports an `InvalidArgument` error if any value is not finite.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
+//	tag: Scalar.  Tag to use for the `Summary.Value`.
+//	values: Any shape. Values to use to build the histogram.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropFilter",
+		Type: "HistogramSummary",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			tag, values,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradients of 3-D convolution with respect to the input.
+// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
+type RandomShuffleQueueV2Attr func(optionalAttr)
+
+// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
 //
-// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
+// If not specified, defaults to <>
 //
-// Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropInput",
-		Input: []tf.Input{
-			input, filter, out_backprop,
-		},
-		Attrs: attrs,
+// REQUIRES: len(value) >= 0
+func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shapes"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// ReverseSequenceAttr is an optional argument to ReverseSequence.
-type ReverseSequenceAttr func(optionalAttr)
-
-// ReverseSequenceBatchDim sets the optional batch_dim attribute to value.
+// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
 //
-// value: The dimension along which reversal is performed.
-// If not specified, defaults to 0
-func ReverseSequenceBatchDim(value int64) ReverseSequenceAttr {
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
-		m["batch_dim"] = value
+		m["capacity"] = value
 	}
 }
 
-// Reverses variable length slices.
-//
-// This op first slices `input` along the dimension `batch_dim`, and for each
-// slice `i`, reverses the first `seq_lengths[i]` elements along
-// the dimension `seq_dim`.
-//
-// The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,
-// and `seq_lengths` must be a vector of length `input.dims[batch_dim]`.
-//
-// The output slice `i` along dimension `batch_dim` is then given by input
-// slice `i`, with the first `seq_lengths[i]` slices along dimension
-// `seq_dim` reversed.
-//
-// For example:
-//
-// ```
-// # Given this:
-// batch_dim = 0
-// seq_dim = 1
-// input.dims = (4, 8, ...)
-// seq_lengths = [7, 2, 3, 5]
-//
-// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
-// output[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]
-// output[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]
-// output[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]
-// output[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]
+// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
 //
-// # while entries past seq_lens are copied through:
-// output[0, 7:, :, ...] = input[0, 7:, :, ...]
-// output[1, 2:, :, ...] = input[1, 2:, :, ...]
-// output[2, 3:, :, ...] = input[2, 3:, :, ...]
-// output[3, 2:, :, ...] = input[3, 2:, :, ...]
-// ```
+// value: Dequeue will block unless there would be this
+// many elements after the dequeue or the queue is closed. This
+// ensures a minimum level of mixing of elements.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["min_after_dequeue"] = value
+	}
+}
+
+// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
 //
-// In contrast, if:
+// value: If either seed or seed2 is set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
 //
-// ```
-// # Given this:
-// batch_dim = 2
-// seq_dim = 0
-// input.dims = (8, ?, 4, ...)
-// seq_lengths = [7, 2, 3, 5]
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// RandomShuffleQueueV2Container sets the optional container attribute to value.
 //
-// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
-// output[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]
-// output[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]
-// output[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]
-// output[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
 //
-// # while entries past seq_lens are copied through:
-// output[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]
-// output[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]
-// output[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]
-// output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]
-// ```
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A queue that randomizes the order of elements.
 //
 // Arguments:
-//	input: The input to reverse.
-//	seq_lengths: 1-D with length `input.dims(batch_dim)` and
-// `max(seq_lengths) <= input.dims(seq_dim)`
-//	seq_dim: The dimension which is partially reversed.
+//	component_types: The type of each component in a value.
 //
-// Returns The partially reversed input. It has the same shape as `input`.
-func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_dim int64, optional ...ReverseSequenceAttr) (output tf.Output) {
+// Returns The handle to the queue.
+func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"seq_dim": seq_dim}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ReverseSequence",
-		Input: []tf.Input{
-			input, seq_lengths,
-		},
+		Type: "RandomShuffleQueueV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradient for the rsqrt of `x` wrt its input.
+// Outputs a `Summary` protocol buffer with scalar values.
 //
-// Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func RsqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RsqrtGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Makes its input available to the next iteration.
+// The input `tags` and `values` must have the same shape.  The generated summary
+// has a summary value for each tag-value pair in `tags` and `values`.
 //
 // Arguments:
-//	data: The tensor to be made available to the next iteration.
+//	tags: Tags for the summary.
+//	values: Same shape as `tags.  Values for the summary.
 //
-// Returns The same tensor as `data`.
-func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
+// Returns Scalar.  Serialized `Summary` protocol buffer.
+func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NextIteration",
+		Type: "ScalarSummary",
 		Input: []tf.Input{
-			data,
+			tags, values,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Does nothing. Only useful as a placeholder for control edges.
+// TensorSummaryAttr is an optional argument to TensorSummary.
+type TensorSummaryAttr func(optionalAttr)
+
+// TensorSummaryDescription sets the optional description attribute to value.
 //
-// Returns the created operation.
-func NoOp(scope *Scope) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "NoOp",
+// value: A json-encoded SummaryDescription proto.
+// If not specified, defaults to ""
+func TensorSummaryDescription(value string) TensorSummaryAttr {
+	return func(m optionalAttr) {
+		m["description"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative.
-type DepthwiseConv2dNativeAttr func(optionalAttr)
-
-// DepthwiseConv2dNativeDataFormat sets the optional data_format attribute to value.
+// TensorSummaryLabels sets the optional labels attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr {
+// value: An unused list of strings.
+// If not specified, defaults to <>
+func TensorSummaryLabels(value []string) TensorSummaryAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["labels"] = value
 	}
 }
 
-// Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors.
-//
-// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
-// and a filter / kernel tensor of shape
-// `[filter_height, filter_width, in_channels, channel_multiplier]`, containing
-// `in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies
-// a different filter to each input channel (expanding from 1 channel to
-// `channel_multiplier` channels for each), then concatenates the results
-// together. Thus, the output has `in_channels * channel_multiplier` channels.
+// TensorSummaryDisplayName sets the optional display_name attribute to value.
 //
-// ```
-// for k in 0..in_channels-1
-//   for q in 0..channel_multiplier-1
-//     output[b, i, j, k * channel_multiplier + q] =
-//       sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
-//                         filter[di, dj, k, q]
-// ```
+// value: An unused string.
+// If not specified, defaults to ""
+func TensorSummaryDisplayName(value string) TensorSummaryAttr {
+	return func(m optionalAttr) {
+		m["display_name"] = value
+	}
+}
+
+// Outputs a `Summary` protocol buffer with a tensor.
 //
-// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+// This op is being phased out in favor of TensorSummaryV2, which lets callers pass
+// a tag as well as a serialized SummaryMetadata proto string that contains
+// plugin-specific data. We will keep this op to maintain backwards compatibility.
 //
 // Arguments:
-//
-//
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`.
-//	padding: The type of padding algorithm to use.
-func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeAttr) (output tf.Output) {
+//	tensor: A tensor to serialize.
+func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNative",
+		Type: "TensorSummary",
 		Input: []tf.Input{
-			input, filter,
+			tensor,
 		},
 		Attrs: attrs,
 	}
@@ -8530,125 +8650,105 @@ func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, stri
 	return op.Output(0)
 }
 
-// CropAndResizeAttr is an optional argument to CropAndResize.
-type CropAndResizeAttr func(optionalAttr)
-
-// CropAndResizeMethod sets the optional method attribute to value.
+// Creates a dataset that asynchronously prefetches elements from `input_dataset`.
 //
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeMethod(value string) CropAndResizeAttr {
-	return func(m optionalAttr) {
-		m["method"] = value
-	}
-}
-
-// CropAndResizeExtrapolationValue sets the optional extrapolation_value attribute to value.
+// Arguments:
 //
-// value: Value used for extrapolation, when applicable.
-// If not specified, defaults to 0
-func CropAndResizeExtrapolationValue(value float32) CropAndResizeAttr {
-	return func(m optionalAttr) {
-		m["extrapolation_value"] = value
-	}
-}
-
-// Extracts crops from the input image tensor and bilinearly resizes them (possibly
+//	buffer_size: The maximum number of elements to buffer in an iterator over
+// this dataset.
 //
-// with aspect ratio change) to a common output size specified by `crop_size`. This
-// is more general than the `crop_to_bounding_box` op which extracts a fixed size
-// slice from the input image and does not allow resizing or aspect ratio change.
 //
-// Returns a tensor with `crops` from the input `image` at positions defined at the
-// bounding box locations in `boxes`. The cropped boxes are all resized (with
-// bilinear interpolation) to a fixed `size = [crop_height, crop_width]`. The
-// result is a 4-D tensor `[num_boxes, crop_height, crop_width, depth]`. The
-// resizing is corner aligned. In particular, if `boxes = [[0, 0, 1, 1]]`, the
-// method will give identical results to using `tf.image.resize_bilinear()`
-// with `align_corners=True`.
+func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "PrefetchDataset",
+		Input: []tf.Input{
+			input_dataset, buffer_size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Outputs a `Summary` protocol buffer with a tensor and per-plugin data.
 //
 // Arguments:
-//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-// Both `image_height` and `image_width` need to be positive.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1]` in image height coordinates. We do allow `y1` > `y2`, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-//	crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All
-// cropped image patches are resized to this size. The aspect ratio of the image
-// content is not preserved. Both `crop_height` and `crop_width` need to be
-// positive.
-//
-// Returns A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Output, crop_size tf.Output, optional ...CropAndResizeAttr) (crops tf.Output) {
+//	tag: A string attached to this summary. Used for organization in TensorBoard.
+//	tensor: A tensor to serialize.
+//	serialized_summary_metadata: A serialized SummaryMetadata proto. Contains plugin
+// data.
+func TensorSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, serialized_summary_metadata tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResize",
+		Type: "TensorSummaryV2",
 		Input: []tf.Input{
-			image, boxes, box_ind, crop_size,
+			tag, tensor, serialized_summary_metadata,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxPoolGradAttr is an optional argument to MaxPoolGrad.
-type MaxPoolGradAttr func(optionalAttr)
+// PrintAttr is an optional argument to Print.
+type PrintAttr func(optionalAttr)
 
-// MaxPoolGradDataFormat sets the optional data_format attribute to value.
+// PrintMessage sets the optional message attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradDataFormat(value string) MaxPoolGradAttr {
+// value: A string, prefix of the error message.
+// If not specified, defaults to ""
+func PrintMessage(value string) PrintAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["message"] = value
 	}
 }
 
-// Computes gradients of the maxpooling function.
+// PrintFirstN sets the optional first_n attribute to value.
+//
+// value: Only log `first_n` number of times. -1 disables logging.
+// If not specified, defaults to -1
+func PrintFirstN(value int64) PrintAttr {
+	return func(m optionalAttr) {
+		m["first_n"] = value
+	}
+}
+
+// PrintSummarize sets the optional summarize attribute to value.
+//
+// value: Only print this many entries of each tensor.
+// If not specified, defaults to 3
+func PrintSummarize(value int64) PrintAttr {
+	return func(m optionalAttr) {
+		m["summarize"] = value
+	}
+}
+
+// Prints a list of tensors.
+//
+// Passes `input` through to `output` and prints `data` when evaluating.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	input: The tensor passed to `output`
+//	data: A list of tensors to print out when op is evaluated.
 //
-// Returns Gradients w.r.t. the input to `max_pool`.
-func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradAttr) (output tf.Output) {
+// Returns = The unmodified `input` tensor
+func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGrad",
+		Type: "Print",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			input, tf.OutputList(data),
 		},
 		Attrs: attrs,
 	}
@@ -8656,150 +8756,148 @@ func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad
 	return op.Output(0)
 }
 
-// Adds `bias` to `value`.
-//
-// This is a deprecated version of BiasAdd and will be soon removed.
-//
-// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-// Broadcasting is supported, so `value` may have any number of dimensions.
+// Makes its input available to the next iteration.
 //
 // Arguments:
-//	value: Any number of dimensions.
-//	bias: 1-D with size the last dimension of `value`.
+//	data: The tensor to be made available to the next iteration.
 //
-// Returns Broadcasted sum of `value` and `bias`.
-func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) {
+// Returns The same tensor as `data`.
+func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BiasAddV1",
+		Type: "NextIteration",
 		Input: []tf.Input{
-			value, bias,
+			data,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// EncodeJpegAttr is an optional argument to EncodeJpeg.
-type EncodeJpegAttr func(optionalAttr)
-
-// EncodeJpegFormat sets the optional format attribute to value.
+// Does nothing. Only useful as a placeholder for control edges.
 //
-// value: Per pixel image format.
-// If not specified, defaults to ""
-func EncodeJpegFormat(value string) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["format"] = value
+// Returns the created operation.
+func NoOp(scope *Scope) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// EncodeJpegQuality sets the optional quality attribute to value.
-//
-// value: Quality of the compression from 0 to 100 (higher is better and slower).
-// If not specified, defaults to 95
-func EncodeJpegQuality(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["quality"] = value
+	opspec := tf.OpSpec{
+		Type: "NoOp",
 	}
+	return scope.AddOperation(opspec)
 }
 
-// EncodeJpegProgressive sets the optional progressive attribute to value.
+// DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative.
+type DepthwiseConv2dNativeAttr func(optionalAttr)
+
+// DepthwiseConv2dNativeDataFormat sets the optional data_format attribute to value.
 //
-// value: If True, create a JPEG that loads progressively (coarse to fine).
-// If not specified, defaults to false
-func EncodeJpegProgressive(value bool) EncodeJpegAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr {
 	return func(m optionalAttr) {
-		m["progressive"] = value
+		m["data_format"] = value
 	}
 }
 
-// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
+// DepthwiseConv2dNativeDilations sets the optional dilations attribute to value.
 //
-// value: If True, spend CPU/RAM to reduce size with no quality change.
-// If not specified, defaults to false
-func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr {
 	return func(m optionalAttr) {
-		m["optimize_size"] = value
+		m["dilations"] = value
 	}
 }
 
-// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
+// Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors.
 //
-// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
-// If not specified, defaults to true
-func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["chroma_downsampling"] = value
+// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+// and a filter / kernel tensor of shape
+// `[filter_height, filter_width, in_channels, channel_multiplier]`, containing
+// `in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies
+// a different filter to each input channel (expanding from 1 channel to
+// `channel_multiplier` channels for each), then concatenates the results
+// together. Thus, the output has `in_channels * channel_multiplier` channels.
+//
+// ```
+// for k in 0..in_channels-1
+//   for q in 0..channel_multiplier-1
+//     output[b, i, j, k * channel_multiplier + q] =
+//       sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
+//                         filter[di, dj, k, q]
+// ```
+//
+// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+//
+// Arguments:
+//
+//
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`.
+//	padding: The type of padding algorithm to use.
+func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DepthwiseConv2dNative",
+		Input: []tf.Input{
+			input, filter,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
+// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
+type DataFormatDimMapAttr func(optionalAttr)
+
+// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
 //
-// value: Unit used to specify `x_density` and `y_density`:
-// pixels per inch (`'in'`) or centimeter (`'cm'`).
-// If not specified, defaults to "in"
-func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
+// value: source data format.
+// If not specified, defaults to "NHWC"
+func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
 	return func(m optionalAttr) {
-		m["density_unit"] = value
+		m["src_format"] = value
 	}
 }
 
-// EncodeJpegXDensity sets the optional x_density attribute to value.
+// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
 //
-// value: Horizontal pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegXDensity(value int64) EncodeJpegAttr {
+// value: destination data format.
+// If not specified, defaults to "NCHW"
+func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
 	return func(m optionalAttr) {
-		m["x_density"] = value
+		m["dst_format"] = value
 	}
 }
 
-// EncodeJpegYDensity sets the optional y_density attribute to value.
+// Returns the dimension index in the destination data format given the one in
 //
-// value: Vertical pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegYDensity(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["y_density"] = value
-	}
-}
-
-// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
-//
-// value: If not empty, embed this XMP metadata in the image header.
-// If not specified, defaults to ""
-func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["xmp_metadata"] = value
-	}
-}
-
-// JPEG-encode an image.
-//
-// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
-//
-// The attr `format` can be used to override the color format of the encoded
-// output.  Values can be:
-//
-// *   `''`: Use a default format based on the number of channels in the image.
-// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
-//     of `image` must be 1.
-// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
-//     of `image` must be 3.
-//
-// If `format` is not specified or is the empty string, a default format is picked
-// in function of the number of channels in `image`:
-//
-// *   1: Output a grayscale image.
-// *   3: Output an RGB image.
+// the source data format.
 //
 // Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
+//	x: Scalar. Dimension index in source data format. Must be in the range [-4, 4).
 //
-// Returns 0-D. JPEG-encoded image.
-func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
+// Returns Scalar. Dimension index in destination data format.
+func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8808,9 +8906,9 @@ func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (cont
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeJpeg",
+		Type: "DataFormatDimMap",
 		Input: []tf.Input{
-			image,
+			x,
 		},
 		Attrs: attrs,
 	}
@@ -8818,99 +8916,115 @@ func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (cont
 	return op.Output(0)
 }
 
-// Gradients for batch normalization.
+// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
+type ResourceApplyPowerSignAttr func(optionalAttr)
+
+// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
 //
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the AddSign update.
 //
-// This op is deprecated. See `tf.nn.batch_normalization`.
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
+// variable <- variable - lr_t * update
 //
 // Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this Tensor will be multiplied
-// with the normalized Tensor.
-//	backprop: 4D backprop Tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	logbase: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
+//	grad: The gradient.
 //
-// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma.
-func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
+// Returns the created operation.
+func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalizationGrad",
+		Type: "ResourceApplyPowerSign",
 		Input: []tf.Input{
-			t, m, v, gamma, backprop,
+			var_, m, lr, logbase, sign_decay, beta, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return scope.AddOperation(opspec)
 }
 
-// FusedBatchNormV2Attr is an optional argument to FusedBatchNormV2.
-type FusedBatchNormV2Attr func(optionalAttr)
+// CropAndResizeAttr is an optional argument to CropAndResize.
+type CropAndResizeAttr func(optionalAttr)
 
-// FusedBatchNormV2Epsilon sets the optional epsilon attribute to value.
+// CropAndResizeMethod sets the optional method attribute to value.
 //
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormV2Epsilon(value float32) FusedBatchNormV2Attr {
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeMethod(value string) CropAndResizeAttr {
 	return func(m optionalAttr) {
-		m["epsilon"] = value
+		m["method"] = value
 	}
 }
 
-// FusedBatchNormV2DataFormat sets the optional data_format attribute to value.
+// CropAndResizeExtrapolationValue sets the optional extrapolation_value attribute to value.
 //
-// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormV2DataFormat(value string) FusedBatchNormV2Attr {
+// value: Value used for extrapolation, when applicable.
+// If not specified, defaults to 0
+func CropAndResizeExtrapolationValue(value float32) CropAndResizeAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["extrapolation_value"] = value
 	}
 }
 
-// FusedBatchNormV2IsTraining sets the optional is_training attribute to value.
+// Extracts crops from the input image tensor and bilinearly resizes them (possibly
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormV2IsTraining(value bool) FusedBatchNormV2Attr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// Batch normalization.
+// with aspect ratio change) to a common output size specified by `crop_size`. This
+// is more general than the `crop_to_bounding_box` op which extracts a fixed size
+// slice from the input image and does not allow resizing or aspect ratio change.
 //
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// Returns a tensor with `crops` from the input `image` at positions defined at the
+// bounding box locations in `boxes`. The cropped boxes are all resized (with
+// bilinear interpolation) to a fixed `size = [crop_height, crop_width]`. The
+// result is a 4-D tensor `[num_boxes, crop_height, crop_width, depth]`. The
+// resizing is corner aligned. In particular, if `boxes = [[0, 0, 1, 1]]`, the
+// method will give identical results to using `tf.image.resize_bilinear()`
+// with `align_corners=True`.
 //
 // Arguments:
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	offset: A 1D Tensor for offset, to shift to the normalized x.
-//	mean: A 1D Tensor for population mean. Used for inference only;
-// must be empty for training.
-//	variance: A 1D Tensor for population variance. Used for inference only;
-// must be empty for training.
+//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+// Both `image_height` and `image_width` need to be positive.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1]` in image height coordinates. We do allow `y1` > `y2`, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All
+// cropped image patches are resized to this size. The aspect ratio of the image
+// content is not preserved. Both `crop_height` and `crop_width` need to be
+// positive.
 //
-// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
-// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
-// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be reused in the gradient computation.
-func FusedBatchNormV2(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormV2Attr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
+// Returns A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Output, crop_size tf.Output, optional ...CropAndResizeAttr) (crops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8919,28 +9033,20 @@ func FusedBatchNormV2(scope *Scope, x tf.Output, scale tf.Output, offset tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNormV2",
+		Type: "CropAndResize",
 		Input: []tf.Input{
-			x, scale, offset, mean, variance,
+			image, boxes, box_ind, crop_size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput.
-type Conv2DBackpropInputAttr func(optionalAttr)
-
-// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
-	}
-}
+// MaxPoolGradAttr is an optional argument to MaxPoolGrad.
+type MaxPoolGradAttr func(optionalAttr)
 
-// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
+// MaxPoolGradDataFormat sets the optional data_format attribute to value.
 //
 // value: Specify the data format of the input and output data. With the
 // default format "NHWC", the data is stored in the order of:
@@ -8948,40 +9054,36 @@ func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
 // Alternatively, the format could be "NCHW", the data storage order of:
 //     [batch, in_channels, in_height, in_width].
 // If not specified, defaults to "NHWC"
-func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr {
+func MaxPoolGradDataFormat(value string) MaxPoolGradAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// Computes the gradients of convolution with respect to the input.
+// Computes gradients of the maxpooling function.
 //
 // Arguments:
-//	input_sizes: An integer vector representing the shape of `input`,
-// where `input` is a 4-D `[batch, height, width, channels]` tensor.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution. Must be in the same order as the dimension specified with
-// format.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
 //	padding: The type of padding algorithm to use.
 //
-// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
-// w.r.t. the input of the convolution.
-func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) {
+// Returns Gradients w.r.t. the input to `max_pool`.
+func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2DBackpropInput",
+		Type: "MaxPoolGrad",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
@@ -8989,124 +9091,135 @@ func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output,
 	return op.Output(0)
 }
 
-// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
-type FusedBatchNormAttr func(optionalAttr)
+// EncodeJpegAttr is an optional argument to EncodeJpeg.
+type EncodeJpegAttr func(optionalAttr)
 
-// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
+// EncodeJpegFormat sets the optional format attribute to value.
 //
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
+// value: Per pixel image format.
+// If not specified, defaults to ""
+func EncodeJpegFormat(value string) EncodeJpegAttr {
 	return func(m optionalAttr) {
-		m["epsilon"] = value
+		m["format"] = value
 	}
 }
 
-// FusedBatchNormDataFormat sets the optional data_format attribute to value.
+// EncodeJpegQuality sets the optional quality attribute to value.
 //
-// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
+// value: Quality of the compression from 0 to 100 (higher is better and slower).
+// If not specified, defaults to 95
+func EncodeJpegQuality(value int64) EncodeJpegAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["quality"] = value
 	}
 }
 
-// FusedBatchNormIsTraining sets the optional is_training attribute to value.
+// EncodeJpegProgressive sets the optional progressive attribute to value.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
+// value: If True, create a JPEG that loads progressively (coarse to fine).
+// If not specified, defaults to false
+func EncodeJpegProgressive(value bool) EncodeJpegAttr {
 	return func(m optionalAttr) {
-		m["is_training"] = value
+		m["progressive"] = value
 	}
 }
 
-// Batch normalization.
-//
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
 //
-// Arguments:
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	offset: A 1D Tensor for offset, to shift to the normalized x.
-//	mean: A 1D Tensor for population mean. Used for inference only;
-// must be empty for training.
-//	variance: A 1D Tensor for population variance. Used for inference only;
-// must be empty for training.
-//
-// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
-// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
-// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be reused in the gradient computation.
-func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If True, spend CPU/RAM to reduce size with no quality change.
+// If not specified, defaults to false
+func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["optimize_size"] = value
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+}
+
+// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
+//
+// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
+// If not specified, defaults to true
+func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["chroma_downsampling"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "FusedBatchNorm",
-		Input: []tf.Input{
-			x, scale, offset, mean, variance,
-		},
-		Attrs: attrs,
+}
+
+// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
+//
+// value: Unit used to specify `x_density` and `y_density`:
+// pixels per inch (`'in'`) or centimeter (`'cm'`).
+// If not specified, defaults to "in"
+func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["density_unit"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
-type RandomStandardNormalAttr func(optionalAttr)
+// EncodeJpegXDensity sets the optional x_density attribute to value.
+//
+// value: Horizontal pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegXDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["x_density"] = value
+	}
+}
 
-// RandomStandardNormalSeed sets the optional seed attribute to value.
+// EncodeJpegYDensity sets the optional y_density attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
+// value: Vertical pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegYDensity(value int64) EncodeJpegAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["y_density"] = value
 	}
 }
 
-// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
+// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
+// value: If not empty, embed this XMP metadata in the image header.
+// If not specified, defaults to ""
+func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["xmp_metadata"] = value
 	}
 }
 
-// Outputs random values from a normal distribution.
+// JPEG-encode an image.
 //
-// The generated values will have mean 0 and standard deviation 1.
+// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
+//
+// The attr `format` can be used to override the color format of the encoded
+// output.  Values can be:
+//
+// *   `''`: Use a default format based on the number of channels in the image.
+// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
+//     of `image` must be 1.
+// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
+//     of `image` must be 3.
+//
+// If `format` is not specified or is the empty string, a default format is picked
+// in function of the number of channels in `image`:
+//
+// *   1: Output a grayscale image.
+// *   3: Output an RGB image.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	image: 3-D with shape `[height, width, channels]`.
 //
-// Returns A tensor of the specified shape filled with random normal values.
-func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
+// Returns 0-D. JPEG-encoded image.
+func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomStandardNormal",
+		Type: "EncodeJpeg",
 		Input: []tf.Input{
-			shape,
+			image,
 		},
 		Attrs: attrs,
 	}
@@ -9114,323 +9227,633 @@ func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, opti
 	return op.Output(0)
 }
 
-// Computes sigmoid of `x` element-wise.
+// Gradients for batch normalization.
 //
-// Specifically, `y = 1 / (1 + exp(-x))`.
-func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
+//
+// This op is deprecated. See `tf.nn.batch_normalization`.
+//
+// Arguments:
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this Tensor will be multiplied
+// with the normalized Tensor.
+//	backprop: 4D backprop Tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
+//
+// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma.
+func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "Sigmoid",
+		Type: "BatchNormWithGlobalNormalizationGrad",
 		Input: []tf.Input{
-			x,
+			t, m, v, gamma, backprop,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// ComputeAccidentalHitsAttr is an optional argument to ComputeAccidentalHits.
-type ComputeAccidentalHitsAttr func(optionalAttr)
+// FusedBatchNormV2Attr is an optional argument to FusedBatchNormV2.
+type FusedBatchNormV2Attr func(optionalAttr)
 
-// ComputeAccidentalHitsSeed sets the optional seed attribute to value.
+// FusedBatchNormV2Epsilon sets the optional epsilon attribute to value.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ComputeAccidentalHitsSeed(value int64) ComputeAccidentalHitsAttr {
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormV2Epsilon(value float32) FusedBatchNormV2Attr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["epsilon"] = value
 	}
 }
 
-// ComputeAccidentalHitsSeed2 sets the optional seed2 attribute to value.
+// FusedBatchNormV2DataFormat sets the optional data_format attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ComputeAccidentalHitsSeed2(value int64) ComputeAccidentalHitsAttr {
+// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormV2DataFormat(value string) FusedBatchNormV2Attr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["data_format"] = value
 	}
 }
 
-// Computes the ids of the positions in sampled_candidates that match true_labels.
+// FusedBatchNormV2IsTraining sets the optional is_training attribute to value.
 //
-// When doing log-odds NCE, the result of this op should be passed through a
-// SparseToDense op, then added to the logits of the sampled candidates. This has
-// the effect of 'removing' the sampled labels that match the true labels by
-// making the classifier sure that they are sampled labels.
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormV2IsTraining(value bool) FusedBatchNormV2Attr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// Batch normalization.
+//
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
 // Arguments:
-//	true_classes: The true_classes output of UnpackSparseLabels.
-//	sampled_candidates: The sampled_candidates output of CandidateSampler.
-//	num_true: Number of true labels per context.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	offset: A 1D Tensor for offset, to shift to the normalized x.
+//	mean: A 1D Tensor for population mean. Used for inference only;
+// must be empty for training.
+//	variance: A 1D Tensor for population variance. Used for inference only;
+// must be empty for training.
 //
-// Returns A vector of indices corresponding to rows of true_candidates.A vector of IDs of positions in sampled_candidates that match a true_label
-// for the row with the corresponding index in indices.A vector of the same length as indices and ids, in which each element
-// is -FLOAT_MAX.
-func ComputeAccidentalHits(scope *Scope, true_classes tf.Output, sampled_candidates tf.Output, num_true int64, optional ...ComputeAccidentalHitsAttr) (indices tf.Output, ids tf.Output, weights tf.Output) {
+// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
+// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
+// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be reused in the gradient computation.
+func FusedBatchNormV2(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormV2Attr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ComputeAccidentalHits",
+		Type: "FusedBatchNormV2",
 		Input: []tf.Input{
-			true_classes, sampled_candidates,
+			x, scale, offset, mean, variance,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// StageClearAttr is an optional argument to StageClear.
-type StageClearAttr func(optionalAttr)
+// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput.
+type Conv2DBackpropInputAttr func(optionalAttr)
 
-// StageClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func StageClearCapacity(value int64) StageClearAttr {
+// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["use_cudnn_on_gpu"] = value
 	}
 }
 
-// StageClearMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
 //
-// REQUIRES: value >= 0
-func StageClearMemoryLimit(value int64) StageClearAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["data_format"] = value
 	}
 }
 
-// StageClearContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func StageClearContainer(value string) StageClearAttr {
+// Conv2DBackpropInputDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["dilations"] = value
 	}
 }
 
-// StageClearSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func StageClearSharedName(value string) StageClearAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes all elements in the underlying container.
+// Computes the gradients of convolution with respect to the input.
 //
-// Returns the created operation.
-func StageClear(scope *Scope, dtypes []tf.DataType, optional ...StageClearAttr) (o *tf.Operation) {
+// Arguments:
+//	input_sizes: An integer vector representing the shape of `input`,
+// where `input` is a 4-D `[batch, height, width, channels]` tensor.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution. Must be in the same order as the dimension specified with
+// format.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
+// w.r.t. the input of the convolution.
+func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StageClear",
-
+		Type: "Conv2DBackpropInput",
+		Input: []tf.Input{
+			input_sizes, filter, out_backprop,
+		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
-type AvgPoolGradAttr func(optionalAttr)
+// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
+type FusedBatchNormAttr func(optionalAttr)
 
-// AvgPoolGradDataFormat sets the optional data_format attribute to value.
+// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["epsilon"] = value
+	}
+}
+
+// FusedBatchNormDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
 // If not specified, defaults to "NHWC"
-func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
+func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// Computes gradients of the average pooling function.
+// FusedBatchNormIsTraining sets the optional is_training attribute to value.
+//
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// Batch normalization.
+//
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
 // Arguments:
-//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
-// the output of `avg_pool`.
-//	ksize: The size of the sliding window for each dimension of the input.
-//	strides: The stride of the sliding window for each dimension of the input.
-//	padding: The type of padding algorithm to use.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	offset: A 1D Tensor for offset, to shift to the normalized x.
+//	mean: A 1D Tensor for population mean. Used for inference only;
+// must be empty for training.
+//	variance: A 1D Tensor for population variance. Used for inference only;
+// must be empty for training.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
-func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
+// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
+// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
+// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be reused in the gradient computation.
+func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPoolGrad",
+		Type: "FusedBatchNorm",
 		Input: []tf.Input{
-			orig_input_shape, grad,
+			x, scale, offset, mean, variance,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Computes the maximum along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
+type RandomStandardNormalAttr func(optionalAttr)
+
+// RandomStandardNormalSeed sets the optional seed attribute to value.
 //
-// Computes a tensor such that
-// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
-// that `segment_ids[j] == i`.
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
 //
-// If the max is empty for a given segment ID `i`, `output[i] = 0`.
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a normal distribution.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
-// </div>
+// The generated values will have mean 0 and standard deviation 1.
 //
 // Arguments:
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns A tensor of the specified shape filled with random normal values.
+func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMax",
+		Type: "RandomStandardNormal",
 		Input: []tf.Input{
-			data, segment_ids,
+			shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the rank of a tensor.
-//
-// This operation returns an integer representing the rank of `input`.
-//
-// For example:
-//
-// ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// # shape of tensor 't' is [2, 2, 3]
-// rank(t) ==> 3
-// ```
+// Computes sigmoid of `x` element-wise.
 //
-// **Note**: The rank of a tensor is not the same as the rank of a matrix. The rank
-// of a tensor is the number of indices required to uniquely select each element
-// of the tensor. Rank is also known as "order", "degree", or "ndims."
-func Rank(scope *Scope, input tf.Output) (output tf.Output) {
+// Specifically, `y = 1 / (1 + exp(-x))`.
+func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Rank",
+		Type: "Sigmoid",
 		Input: []tf.Input{
-			input,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodeCSVAttr is an optional argument to DecodeCSV.
-type DecodeCSVAttr func(optionalAttr)
-
-// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
-//
-// value: char delimiter to separate fields in a record.
-// If not specified, defaults to ","
-func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["field_delim"] = value
-	}
-}
+// ComputeAccidentalHitsAttr is an optional argument to ComputeAccidentalHits.
+type ComputeAccidentalHitsAttr func(optionalAttr)
 
-// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
+// ComputeAccidentalHitsSeed sets the optional seed attribute to value.
 //
-// value: If false, treats double quotation marks as regular
-// characters inside of the string fields (ignoring RFC 4180, Section 2,
-// Bullet 5).
-// If not specified, defaults to true
-func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ComputeAccidentalHitsSeed(value int64) ComputeAccidentalHitsAttr {
 	return func(m optionalAttr) {
-		m["use_quote_delim"] = value
+		m["seed"] = value
 	}
 }
 
-// DecodeCSVNaValue sets the optional na_value attribute to value.
+// ComputeAccidentalHitsSeed2 sets the optional seed2 attribute to value.
 //
-// value: Additional string to recognize as NA/NaN.
-// If not specified, defaults to ""
-func DecodeCSVNaValue(value string) DecodeCSVAttr {
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ComputeAccidentalHitsSeed2(value int64) ComputeAccidentalHitsAttr {
 	return func(m optionalAttr) {
-		m["na_value"] = value
+		m["seed2"] = value
 	}
 }
 
-// Convert CSV records to tensors. Each column maps to one tensor.
+// Computes the ids of the positions in sampled_candidates that match true_labels.
 //
-// RFC 4180 format is expected for the CSV records.
-// (https://tools.ietf.org/html/rfc4180)
-// Note that we allow leading and trailing spaces with int or float field.
+// When doing log-odds NCE, the result of this op should be passed through a
+// SparseToDense op, then added to the logits of the sampled candidates. This has
+// the effect of 'removing' the sampled labels that match the true labels by
+// making the classifier sure that they are sampled labels.
 //
 // Arguments:
-//	records: Each string is a record/row in the csv and all records should have
-// the same format.
-//	record_defaults: One tensor per column of the input record, with either a
-// scalar default value for that column or empty if the column is required.
+//	true_classes: The true_classes output of UnpackSparseLabels.
+//	sampled_candidates: The sampled_candidates output of CandidateSampler.
+//	num_true: Number of true labels per context.
 //
-// Returns Each tensor will have the same shape as records.
-func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
+// Returns A vector of indices corresponding to rows of true_candidates.A vector of IDs of positions in sampled_candidates that match a true_label
+// for the row with the corresponding index in indices.A vector of the same length as indices and ids, in which each element
+// is -FLOAT_MAX.
+func ComputeAccidentalHits(scope *Scope, true_classes tf.Output, sampled_candidates tf.Output, num_true int64, optional ...ComputeAccidentalHitsAttr) (indices tf.Output, ids tf.Output, weights tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_true": num_true}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeCSV",
+		Type: "ComputeAccidentalHits",
 		Input: []tf.Input{
-			records, tf.OutputList(record_defaults),
+			true_classes, sampled_candidates,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("DecodeCSV", err)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// StageClearAttr is an optional argument to StageClear.
+type StageClearAttr func(optionalAttr)
+
+// StageClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StageClearCapacity(value int64) StageClearAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// StageClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StageClearMemoryLimit(value int64) StageClearAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// StageClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StageClearContainer(value string) StageClearAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// StageClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StageClearSharedName(value string) StageClearAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes all elements in the underlying container.
+//
+// Returns the created operation.
+func StageClear(scope *Scope, dtypes []tf.DataType, optional ...StageClearAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StageClear",
+
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
+type AvgPoolGradAttr func(optionalAttr)
+
+// AvgPoolGradDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes gradients of the average pooling function.
+//
+// Arguments:
+//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
+// the output of `avg_pool`.
+//	ksize: The size of the sliding window for each dimension of the input.
+//	strides: The stride of the sliding window for each dimension of the input.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
+func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "AvgPoolGrad",
+		Input: []tf.Input{
+			orig_input_shape, grad,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the maximum along segments of a tensor.
+//
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Computes a tensor such that
+// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the max is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SegmentMax",
+		Input: []tf.Input{
+			data, segment_ids,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the rank of a tensor.
+//
+// This operation returns an integer representing the rank of `input`.
+//
+// For example:
+//
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// # shape of tensor 't' is [2, 2, 3]
+// rank(t) ==> 3
+// ```
+//
+// **Note**: The rank of a tensor is not the same as the rank of a matrix. The rank
+// of a tensor is the number of indices required to uniquely select each element
+// of the tensor. Rank is also known as "order", "degree", or "ndims."
+func Rank(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Rank",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DecodeCSVAttr is an optional argument to DecodeCSV.
+type DecodeCSVAttr func(optionalAttr)
+
+// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
+//
+// value: char delimiter to separate fields in a record.
+// If not specified, defaults to ","
+func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["field_delim"] = value
+	}
+}
+
+// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
+//
+// value: If false, treats double quotation marks as regular
+// characters inside of the string fields (ignoring RFC 4180, Section 2,
+// Bullet 5).
+// If not specified, defaults to true
+func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["use_quote_delim"] = value
+	}
+}
+
+// DecodeCSVNaValue sets the optional na_value attribute to value.
+//
+// value: Additional string to recognize as NA/NaN.
+// If not specified, defaults to ""
+func DecodeCSVNaValue(value string) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["na_value"] = value
+	}
+}
+
+// Convert CSV records to tensors. Each column maps to one tensor.
+//
+// RFC 4180 format is expected for the CSV records.
+// (https://tools.ietf.org/html/rfc4180)
+// Note that we allow leading and trailing spaces with int or float field.
+//
+// Arguments:
+//	records: Each string is a record/row in the csv and all records should have
+// the same format.
+//	record_defaults: One tensor per column of the input record, with either a
+// scalar default value for that column or empty if the column is required.
+//
+// Returns Each tensor will have the same shape as records.
+func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeCSV",
+		Input: []tf.Input{
+			records, tf.OutputList(record_defaults),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("DecodeCSV", err)
 		return
 	}
 	return output
@@ -10042,80 +10465,10 @@ func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf
 	return op.Output(0)
 }
 
-// Convert JSON-encoded Example records to binary protocol buffer strings.
+// Delete the TensorArray from its resource container.
 //
-// This op translates a tensor containing Example records, encoded using
-// the [standard JSON
-// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
-// into a tensor containing the same records encoded as binary protocol
-// buffers. The resulting tensor can then be fed to any of the other
-// Example-parsing ops.
-//
-// Arguments:
-//	json_examples: Each string is a JSON object serialized according to the JSON
-// mapping of the Example proto.
-//
-// Returns Each string is a binary Example protocol buffer corresponding
-// to the respective element of `json_examples`.
-func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeJSONExample",
-		Input: []tf.Input{
-			json_examples,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Adds sparse updates to the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] += updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] += updates[i, ...]
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
-//
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions add.
-//
-// Requires `updates.shape = indices.shape + ref.shape[1:]`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
-//
-// Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
-//
-// Returns the created operation.
-func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceScatterAdd",
-		Input: []tf.Input{
-			resource, indices, updates,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Delete the TensorArray from its resource container.
-//
-// This enables the user to close and release the resource in the middle
-// of a step/run.
+// This enables the user to close and release the resource in the middle
+// of a step/run.
 //
 // Arguments:
 //	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
@@ -10190,6 +10543,20 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
 	}
 }
 
+// QuantizedConv2DDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
 // Computes a 2D convolution given quantized 4D input and filter tensors.
 //
 // The inputs are quantized tensors where the lowest value represents the real
@@ -10556,6 +10923,175 @@ func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
+// Convert JSON-encoded Example records to binary protocol buffer strings.
+//
+// This op translates a tensor containing Example records, encoded using
+// the [standard JSON
+// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
+// into a tensor containing the same records encoded as binary protocol
+// buffers. The resulting tensor can then be fed to any of the other
+// Example-parsing ops.
+//
+// Arguments:
+//	json_examples: Each string is a JSON object serialized according to the JSON
+// mapping of the Example proto.
+//
+// Returns Each string is a binary Example protocol buffer corresponding
+// to the respective element of `json_examples`.
+func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeJSONExample",
+		Input: []tf.Input{
+			json_examples,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Adds sparse updates to the variable referenced by `resource`.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] += updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] += updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions add.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterAdd",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Eagerly executes a python function to compute func(input)->output. The
+//
+// semantics of the input, output, and attributes are the same as those for
+// PyFunc.
+func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"token": token, "Tout": Tout}
+	opspec := tf.OpSpec{
+		Type: "EagerPyFunc",
+		Input: []tf.Input{
+			tf.OutputList(input),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("EagerPyFunc", err)
+		return
+	}
+	return output
+}
+
+// DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput.
+type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr)
+
+// DepthwiseConv2dNativeBackpropInputDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// DepthwiseConv2dNativeBackpropInputDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of depthwise convolution with respect to the input.
+//
+// Arguments:
+//	input_sizes: An integer vector representing the shape of `input`, based
+// on `data_format`.  For example, if `data_format` is 'NHWC' then
+//  `input` is a 4-D `[batch, height, width, channels]` tensor.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]`.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape according to `data_format`.  For example, if
+// `data_format` is 'NHWC', output shape is `[batch, in_height,
+// in_width, in_channels]`.  Gradient w.r.t. the input of the
+// convolution.
+func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DepthwiseConv2dNativeBackpropInput",
+		Input: []tf.Input{
+			input_sizes, filter, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Creates a dataset with a range of values. Corresponds to python's xrange.
 //
 // Arguments:
@@ -11032,13 +11568,13 @@ func WriteAudioSummaryMaxOutputs(value int64) WriteAudioSummaryAttr {
 //
 // Arguments:
 //	writer: A handle to a summary writer.
-//	global_step: The step to write the summary for.
+//	step: The step to write the summary for.
 //	tag: Scalar. Used to build the `tag` attribute of the summary values.
 //	tensor: 2-D of shape `[batch_size, frames]`.
 //	sample_rate: The sample rate of the signal in hertz.
 //
 // Returns the created operation.
-func WriteAudioSummary(scope *Scope, writer tf.Output, global_step tf.Output, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...WriteAudioSummaryAttr) (o *tf.Operation) {
+func WriteAudioSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...WriteAudioSummaryAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11049,7 +11585,7 @@ func WriteAudioSummary(scope *Scope, writer tf.Output, global_step tf.Output, ta
 	opspec := tf.OpSpec{
 		Type: "WriteAudioSummary",
 		Input: []tf.Input{
-			writer, global_step, tag, tensor, sample_rate,
+			writer, step, tag, tensor, sample_rate,
 		},
 		Attrs: attrs,
 	}
@@ -12147,197 +12683,25 @@ func SigmoidGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// Subtracts a value from the current value of a variable.
-//
-// Any ReadVariableOp which depends directly or indirectly on this assign is
-// guaranteed to see the incremented value or a subsequent newer one.
-//
-// Outputs the incremented value, which can be used to totally order the
-// increments to this variable.
-//
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
+// Computes numerical negative value element-wise.
 //
-// Returns the created operation.
-func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+// I.e., \\(y = -x\\).
+func Neg(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AssignSubVariableOp",
+		Type: "Neg",
 		Input: []tf.Input{
-			resource, value,
+			x,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SparseReduceMaxAttr is an optional argument to SparseReduceMax.
-type SparseReduceMaxAttr func(optionalAttr)
-
-// SparseReduceMaxKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceMaxKeepDims(value bool) SparseReduceMaxAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the max of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
-// instead of a sparse one.
-//
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
-//
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
-//
-// Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-//
-// Returns `R-K`-D.  The reduced Tensor.
-func SparseReduceMax(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseReduceMax",
-		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2.
-type Conv3DBackpropFilterV2Attr func(optionalAttr)
-
-// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes the gradients of 3-D convolution with respect to the filter.
-//
-// Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 5-D
-// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
-// tensor.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropFilterV2",
-		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Execute a sub graph on a remote processor.
-//
-// The graph specifications(such as graph itself, input tensors and output names)
-// are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo
-// as serialized_remote_fused_graph_execute_info.
-// The specifications will be passed to a dedicated registered
-// remote fused graph executor.  The executor will send the graph specifications
-// to a remote processor and execute that graph.  The execution results
-// will be passed to consumer nodes as outputs of this node.
-//
-// Arguments:
-//	inputs: Arbitrary number of tensors with arbitrary data types
-//
-//	serialized_remote_fused_graph_execute_info: Serialized protocol buffer
-// of RemoteFusedGraphExecuteInfo which contains graph specifications.
-//
-// Returns Arbitrary number of tensors with arbitrary data types
-func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info}
-	opspec := tf.OpSpec{
-		Type: "RemoteFusedGraphExecute",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("RemoteFusedGraphExecute", err)
-		return
-	}
-	return outputs
-}
-
-// Computes numerical negative value element-wise.
-//
-// I.e., \\(y = -x\\).
-func Neg(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Neg",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
-type SparseToSparseSetOperationAttr func(optionalAttr)
+// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
+type SparseToSparseSetOperationAttr func(optionalAttr)
 
 // SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
 // If not specified, defaults to true
@@ -12899,148 +13263,463 @@ func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "FusedPadConv2D",
+		Input: []tf.Input{
+			input, paddings, filter,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns immutable tensor from memory region.
+//
+// The current implementation memmaps the tensor from a file.
+//
+// Arguments:
+//	dtype: Type of the returned tensor.
+//	shape: Shape of the returned tensor.
+//	memory_region_name: Name of readonly memory region used by the tensor, see
+// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
+func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
+	opspec := tf.OpSpec{
+		Type: "ImmutableConst",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
+//
+// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
+// `N` is the minibatch size and the rows correspond to packed outputs of
+// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
+// must all match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension).
+//
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
+//
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
+//
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
+//
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+//
+// and
+//
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+//
+// then the final deserialized `SparseTensor` will be:
+//
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+//
+// Arguments:
+//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
+// Must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "DeserializeManySparse",
+		Input: []tf.Input{
+			serialized_sparse,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
+type SparseTensorDenseMatMulAttr func(optionalAttr)
+
+// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
+//
+// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
+// is transpose(conj(A)).  Otherwise it's transpose(A).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
+	return func(m optionalAttr) {
+		m["adjoint_a"] = value
+	}
+}
+
+// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
+//
+// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
+// is transpose(conj(B)).  Otherwise it's transpose(B).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
+	return func(m optionalAttr) {
+		m["adjoint_b"] = value
+	}
+}
+
+// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
+//
+// No validity checking is performed on the indices of A.  However, the following
+// input format is recommended for optimal behavior:
+//
+// if adjoint_a == false:
+//   A should be sorted in lexicographically increasing order.  Use SparseReorder
+//   if you're not sure.
+// if adjoint_a == true:
+//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
+//   order instead of "row major" order).
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
+//	b: 2-D.  A dense Matrix.
+func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseTensorDenseMatMul",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// WriteImageSummaryAttr is an optional argument to WriteImageSummary.
+type WriteImageSummaryAttr func(optionalAttr)
+
+// WriteImageSummaryMaxImages sets the optional max_images attribute to value.
+//
+// value: Max number of batch elements to generate images for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func WriteImageSummaryMaxImages(value int64) WriteImageSummaryAttr {
+	return func(m optionalAttr) {
+		m["max_images"] = value
+	}
+}
+
+// Writes a `Summary` protocol buffer with images.
+//
+// The summary has up to `max_images` summary values containing images. The
+// images are built from `tensor` which must be 4-D with shape `[batch_size,
+// height, width, channels]` and where `channels` can be:
+//
+// *  1: `tensor` is interpreted as Grayscale.
+// *  3: `tensor` is interpreted as RGB.
+// *  4: `tensor` is interpreted as RGBA.
+//
+// The images have the same number of channels as the input tensor. For float
+// input, the values are normalized one image at a time to fit in the range
+// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
+// normalization algorithms:
+//
+// *  If the input values are all positive, they are rescaled so the largest one
+//    is 255.
+//
+// *  If any input value is negative, the values are shifted so input value 0.0
+//    is at 127.  They are then rescaled so that either the smallest value is 0,
+//    or the largest one is 255.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
+// *  If `max_images` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+//
+// The `bad_color` argument is the color to use in the generated images for
+// non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
+// Each element must be in the range `[0, 255]` (It represents the value of a
+// pixel in the output image).  Non-finite values in the input tensor are
+// replaced by this tensor in the output image.  The default value is the color
+// red.
+//
+// Arguments:
+//	writer: A handle to a summary writer.
+//	step: The step to write the summary for.
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
+// `channels` is 1, 3, or 4.
+//	bad_color: Color to use for pixels with non-finite values.
+//
+// Returns the created operation.
+func WriteImageSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Output, tensor tf.Output, bad_color tf.Output, optional ...WriteImageSummaryAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "WriteImageSummary",
+		Input: []tf.Input{
+			writer, step, tag, tensor, bad_color,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Pads a tensor with zeros.
+//
+// This operation pads a `input` with zeros according to the `paddings` you
+// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
+// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many zeros to add before the contents of `input` in that dimension, and
+// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
+// in that dimension.
+//
+// The padded size of each dimension D of the output is:
+//
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+//
+// For example:
+//
+// ```
+// # 't' is [[1, 1], [2, 2]]
+// # 'paddings' is [[1, 1], [2, 2]]
+// # rank of 't' is 2
+// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+//                       [0, 0, 1, 1, 0, 0]
+//                       [0, 0, 2, 2, 0, 0]
+//                       [0, 0, 0, 0, 0, 0]]
+// ```
+func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Pad",
+		Input: []tf.Input{
+			input, paddings,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that emits the lines of one or more text files.
+//
+// Arguments:
+//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
+// read.
+//	compression_type: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+//	buffer_size: A scalar containing the number of bytes to buffer.
+func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TextLineDataset",
+		Input: []tf.Input{
+			filenames, compression_type, buffer_size,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the number of records this Reader has produced.
+//
+// This is the same as the number of ReaderRead executions that have
+// succeeded.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderNumRecordsProducedV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes exponential of x - 1 element-wise.
+//
+// I.e., \\(y = (\exp x) - 1\\).
+func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
 	opspec := tf.OpSpec{
-		Type: "FusedPadConv2D",
+		Type: "Expm1",
 		Input: []tf.Input{
-			input, paddings, filter,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns immutable tensor from memory region.
+// Batch normalization.
 //
-// The current implementation memmaps the tensor from a file.
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
+//
+// This op is deprecated. Prefer `tf.nn.batch_normalization`.
 //
 // Arguments:
-//	dtype: Type of the returned tensor.
-//	shape: Shape of the returned tensor.
-//	memory_region_name: Name of readonly memory region used by the tensor, see
-// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
-func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	beta: A 1D beta Tensor with size matching the last dimension of t.
+// An offset to be added to the normalized tensor.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this tensor will be multiplied
+// with the normalized tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
+func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
+	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "ImmutableConst",
-
+		Type: "BatchNormWithGlobalNormalization",
+		Input: []tf.Input{
+			t, m, v, beta, gamma,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
-//
-// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
-// `N` is the minibatch size and the rows correspond to packed outputs of
-// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
-// must all match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension).
-//
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the serialized input is a `[2 x 3]` matrix representing two
-// original `SparseTensor` objects:
-//
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-//
-// and
-//
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
-//
-// then the final deserialized `SparseTensor` will be:
+// MaxPoolV2Attr is an optional argument to MaxPoolV2.
+type MaxPoolV2Attr func(optionalAttr)
+
+// MaxPoolV2DataFormat sets the optional data_format attribute to value.
 //
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Performs max pooling on the input.
 //
 // Arguments:
-//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
-// Must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` objects.
-func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+//	input: 4-D input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The max pooled output tensor.
+func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DeserializeManySparse",
+		Type: "MaxPoolV2",
 		Input: []tf.Input{
-			serialized_sparse,
+			input, ksize, strides,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
-type SparseTensorDenseMatMulAttr func(optionalAttr)
+// SparseReduceMaxAttr is an optional argument to SparseReduceMax.
+type SparseReduceMaxAttr func(optionalAttr)
 
-// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
+// SparseReduceMaxKeepDims sets the optional keep_dims attribute to value.
 //
-// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
-// is transpose(conj(A)).  Otherwise it's transpose(A).
+// value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
+func SparseReduceMaxKeepDims(value bool) SparseReduceMaxAttr {
 	return func(m optionalAttr) {
-		m["adjoint_a"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
+// Computes the max of elements across dimensions of a SparseTensor.
 //
-// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
-// is transpose(conj(B)).  Otherwise it's transpose(B).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
-	return func(m optionalAttr) {
-		m["adjoint_b"] = value
-	}
-}
-
-// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
+// instead of a sparse one.
 //
-// No validity checking is performed on the indices of A.  However, the following
-// input format is recommended for optimal behavior:
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
 //
-// if adjoint_a == false:
-//   A should be sorted in lexicographically increasing order.  Use SparseReorder
-//   if you're not sure.
-// if adjoint_a == true:
-//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
-//   order instead of "row major" order).
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
 // Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
-//	b: 2-D.  A dense Matrix.
-func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+//
+// Returns `R-K`-D.  The reduced Tensor.
+func SparseReduceMax(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13049,9 +13728,9 @@ func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseMatMul",
+		Type: "SparseReduceMax",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
 		Attrs: attrs,
 	}
@@ -13059,83 +13738,130 @@ func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Outp
 	return op.Output(0)
 }
 
-// Batch normalization.
+// Subtracts a value from the current value of a variable.
 //
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
+// Any ReadVariableOp which depends directly or indirectly on this assign is
+// guaranteed to see the incremented value or a subsequent newer one.
 //
-// This op is deprecated. Prefer `tf.nn.batch_normalization`.
+// Outputs the incremented value, which can be used to totally order the
+// increments to this variable.
 //
 // Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	beta: A 1D beta Tensor with size matching the last dimension of t.
-// An offset to be added to the normalized tensor.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this tensor will be multiplied
-// with the normalized tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be incremented.
+//
+// Returns the created operation.
+func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalization",
+		Type: "AssignSubVariableOp",
 		Input: []tf.Input{
-			t, m, v, beta, gamma,
+			resource, value,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Execute a sub graph on a remote processor.
+//
+// The graph specifications(such as graph itself, input tensors and output names)
+// are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo
+// as serialized_remote_fused_graph_execute_info.
+// The specifications will be passed to a dedicated registered
+// remote fused graph executor.  The executor will send the graph specifications
+// to a remote processor and execute that graph.  The execution results
+// will be passed to consumer nodes as outputs of this node.
+//
+// Arguments:
+//	inputs: Arbitrary number of tensors with arbitrary data types
+//
+//	serialized_remote_fused_graph_execute_info: Serialized protocol buffer
+// of RemoteFusedGraphExecuteInfo which contains graph specifications.
+//
+// Returns Arbitrary number of tensors with arbitrary data types
+func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info}
+	opspec := tf.OpSpec{
+		Type: "RemoteFusedGraphExecute",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("RemoteFusedGraphExecute", err)
+		return
+	}
+	return outputs
 }
 
-// MaxPoolV2Attr is an optional argument to MaxPoolV2.
-type MaxPoolV2Attr func(optionalAttr)
+// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2.
+type Conv3DBackpropFilterV2Attr func(optionalAttr)
 
-// MaxPoolV2DataFormat sets the optional data_format attribute to value.
+// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// Performs max pooling on the input.
+// Conv3DBackpropFilterV2Dilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of 3-D convolution with respect to the filter.
 //
 // Arguments:
-//	input: 4-D input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 5-D
+// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
+// tensor.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
 //	padding: The type of padding algorithm to use.
-//
-// Returns The max pooled output tensor.
-func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
+func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"padding": padding}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolV2",
+		Type: "Conv3DBackpropFilterV2",
 		Input: []tf.Input{
-			input, ksize, strides,
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -13215,30 +13941,51 @@ func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataTyp
 	return key, values
 }
 
-// Merges summaries.
+// DataFormatVecPermuteAttr is an optional argument to DataFormatVecPermute.
+type DataFormatVecPermuteAttr func(optionalAttr)
+
+// DataFormatVecPermuteSrcFormat sets the optional src_format attribute to value.
 //
-// This op creates a
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// protocol buffer that contains the union of all the values in the input
-// summaries.
+// value: source data format.
+// If not specified, defaults to "NHWC"
+func DataFormatVecPermuteSrcFormat(value string) DataFormatVecPermuteAttr {
+	return func(m optionalAttr) {
+		m["src_format"] = value
+	}
+}
+
+// DataFormatVecPermuteDstFormat sets the optional dst_format attribute to value.
 //
-// When the Op is run, it reports an `InvalidArgument` error if multiple values
-// in the summaries to merge use the same tag.
+// value: destination data format.
+// If not specified, defaults to "NCHW"
+func DataFormatVecPermuteDstFormat(value string) DataFormatVecPermuteAttr {
+	return func(m optionalAttr) {
+		m["dst_format"] = value
+	}
+}
+
+// Returns the permuted vector/tensor in the destination data format given the
+//
+// one in the source data format.
 //
 // Arguments:
-//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
-// buffers.
+//	x: Vector of size 4 or Tensor of shape (4, 2) in source data format.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
+// Returns Vector of size 4 or Tensor of shape (4, 2) in destination data format.
+func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPermuteAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MergeSummary",
+		Type: "DataFormatVecPermute",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			x,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -13319,6 +14066,20 @@ func Conv3DDataFormat(value string) Conv3DAttr {
 	}
 }
 
+// Conv3DDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DDilations(value []int64) Conv3DAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
 // Computes a 3-D convolution given 5-D `input` and `filter` tensors.
 //
 // In signal processing, cross-correlation is a measure of similarity of
@@ -13613,6 +14374,20 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d
 	}
 }
 
+// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
 // Computes the gradients of depthwise convolution with respect to the filter.
 //
 // Arguments:
@@ -13937,6 +14712,70 @@ func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Ou
 	return op.Output(0)
 }
 
+// Does nothing. Serves as a control trigger for scheduling.
+//
+// Only useful as a placeholder for control edges.
+//
+// Returns the created operation.
+func ControlTrigger(scope *Scope) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ControlTrigger",
+	}
+	return scope.AddOperation(opspec)
+}
+
+// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign.
+type ResourceApplyAddSignAttr func(optionalAttr)
+
+// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the AddSign update.
+//
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- (alpha + sign_decay * sign(g) *sign(m)) * g
+// variable <- variable - lr_t * update
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	alpha: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyAddSign",
+		Input: []tf.Input{
+			var_, m, lr, alpha, sign_decay, beta, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Reorders a SparseTensor into the canonical, row-major ordering.
 //
 // Note that by convention, all sparse ops preserve the canonical ordering along
@@ -13994,23 +14833,147 @@ func PackAxis(value int64) PackAttr {
 // if `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`.
 // Etc.
 //
-// For example:
+// For example:
+//
+// ```
+// # 'x' is [1, 4]
+// # 'y' is [2, 5]
+// # 'z' is [3, 6]
+// pack([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
+// pack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]]
+// ```
+//
+// This is the opposite of `unpack`.
+//
+// Arguments:
+//	values: Must be of same shape and type.
+//
+// Returns The packed tensor.
+func Pack(scope *Scope, values []tf.Output, optional ...PackAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Pack",
+		Input: []tf.Input{
+			tf.OutputList(values),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Deprecated. Use TensorArraySplitV3
+func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArraySplitV2",
+		Input: []tf.Input{
+			handle, value, lengths, flow_in,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// QuantizedReluAttr is an optional argument to QuantizedRelu.
+type QuantizedReluAttr func(optionalAttr)
+
+// QuantizedReluOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Computes Quantized Rectified Linear: `max(features, 0)`
+//
+// Arguments:
+//
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedRelu",
+		Input: []tf.Input{
+			features, min_features, max_features,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Records the bytes size of each element of `input_dataset` in a StatsAggregator.
+func BytesProducedStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "BytesProducedStatsDataset",
+		Input: []tf.Input{
+			input_dataset, tag,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// QrAttr is an optional argument to Qr.
+type QrAttr func(optionalAttr)
+
+// QrFullMatrices sets the optional full_matrices attribute to value.
+//
+// value: If true, compute full-sized `q` and `r`. If false
+// (the default), compute only the leading `P` columns of `q`.
+// If not specified, defaults to false
+func QrFullMatrices(value bool) QrAttr {
+	return func(m optionalAttr) {
+		m["full_matrices"] = value
+	}
+}
+
+// Computes the QR decompositions of one or more matrices.
+//
+// Computes the QR decomposition of each inner matrix in `tensor` such that
+// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
 //
+// ```python
+// # a is a tensor.
+// # q is a tensor of orthonormal matrices.
+// # r is a tensor of upper triangular matrices.
+// q, r = qr(a)
+// q_full, r_full = qr(a, full_matrices=True)
 // ```
-// # 'x' is [1, 4]
-// # 'y' is [2, 5]
-// # 'z' is [3, 6]
-// pack([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
-// pack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]]
-// ```
-//
-// This is the opposite of `unpack`.
 //
 // Arguments:
-//	values: Must be of same shape and type.
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
 //
-// Returns The packed tensor.
-func Pack(scope *Scope, values []tf.Output, optional ...PackAttr) (output tf.Output) {
+// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
+// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
+// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
+func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14019,67 +14982,70 @@ func Pack(scope *Scope, values []tf.Output, optional ...PackAttr) (output tf.Out
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Pack",
+		Type: "Qr",
 		Input: []tf.Input{
-			tf.OutputList(values),
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Deprecated. Use TensorArraySplitV3
-func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArraySplitV2",
-		Input: []tf.Input{
-			handle, value, lengths, flow_in,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// QuantizedReluAttr is an optional argument to QuantizedRelu.
-type QuantizedReluAttr func(optionalAttr)
+// AudioSummaryAttr is an optional argument to AudioSummary.
+type AudioSummaryAttr func(optionalAttr)
 
-// QuantizedReluOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
+// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
+//
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["max_outputs"] = value
 	}
 }
 
-// Computes Quantized Rectified Linear: `max(features, 0)`
+// Outputs a `Summary` protocol buffer with audio.
 //
-// Arguments:
+// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+//
+// Arguments:
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"sample_rate": sample_rate}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedRelu",
+		Type: "AudioSummary",
 		Input: []tf.Input{
-			features, min_features, max_features,
+			tag, tensor,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
 // Reverses specific dimensions of a tensor.
@@ -14300,62 +15266,6 @@ func LookupTableFindV2(scope *Scope, table_handle tf.Output, keys tf.Output, def
 	return op.Output(0)
 }
 
-// DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput.
-type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr)
-
-// DepthwiseConv2dNativeBackpropInputDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes the gradients of depthwise convolution with respect to the input.
-//
-// Arguments:
-//	input_sizes: An integer vector representing the shape of `input`, based
-// on `data_format`.  For example, if `data_format` is 'NHWC' then
-//  `input` is a 4-D `[batch, height, width, channels]` tensor.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]`.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
-//	padding: The type of padding algorithm to use.
-//
-// Returns 4-D with shape according to `data_format`.  For example, if
-// `data_format` is 'NHWC', output shape is `[batch, in_height,
-// in_width, in_channels]`.  Gradient w.r.t. the input of the
-// convolution.
-func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropInput",
-		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // MatrixSolveAttr is an optional argument to MatrixSolve.
 type MatrixSolveAttr func(optionalAttr)
 
@@ -14602,6 +15512,39 @@ func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.O
 	return op.Output(0), op.Output(1)
 }
 
+// Returns x - y element-wise.
+//
+// *NOTE*: `Sub` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Sub",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns a copy of the input tensor.
+func Snapshot(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Snapshot",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Get the value of the tensor specified by its handle.
 //
 // Arguments:
@@ -14916,6 +15859,14 @@ func MultinomialSeed2(value int64) MultinomialAttr {
 	}
 }
 
+// MultinomialOutputDtype sets the optional output_dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func MultinomialOutputDtype(value tf.DataType) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["output_dtype"] = value
+	}
+}
+
 // Draws samples from a multinomial distribution.
 //
 // Arguments:
@@ -15007,76 +15958,16 @@ func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumul
 // Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
 func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
 	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSoftmaxCrossEntropyWithLogits",
-		Input: []tf.Input{
-			features, labels,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// TensorSummaryAttr is an optional argument to TensorSummary.
-type TensorSummaryAttr func(optionalAttr)
-
-// TensorSummaryDescription sets the optional description attribute to value.
-//
-// value: A json-encoded SummaryDescription proto.
-// If not specified, defaults to ""
-func TensorSummaryDescription(value string) TensorSummaryAttr {
-	return func(m optionalAttr) {
-		m["description"] = value
-	}
-}
-
-// TensorSummaryLabels sets the optional labels attribute to value.
-//
-// value: An unused list of strings.
-// If not specified, defaults to <>
-func TensorSummaryLabels(value []string) TensorSummaryAttr {
-	return func(m optionalAttr) {
-		m["labels"] = value
-	}
-}
-
-// TensorSummaryDisplayName sets the optional display_name attribute to value.
-//
-// value: An unused string.
-// If not specified, defaults to ""
-func TensorSummaryDisplayName(value string) TensorSummaryAttr {
-	return func(m optionalAttr) {
-		m["display_name"] = value
-	}
-}
-
-// Outputs a `Summary` protocol buffer with a tensor.
-//
-// This op is being phased out in favor of TensorSummaryV2, which lets callers pass
-// a tag as well as a serialized SummaryMetadata proto string that contains
-// plugin-specific data. We will keep this op to maintain backwards compatibility.
-//
-// Arguments:
-//	tensor: A tensor to serialize.
-func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorSummary",
+		Type: "SparseSoftmaxCrossEntropyWithLogits",
 		Input: []tf.Input{
-			tensor,
+			features, labels,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
 // Gradient op for `MirrorPad` op. This op folds a mirror-padded tensor.
@@ -15313,6 +16204,20 @@ func Conv2DDataFormat(value string) Conv2DAttr {
 	}
 }
 
+// Conv2DDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DDilations(value []int64) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
 // Computes a 2-D convolution given 4-D `input` and `filter` tensors.
 //
 // Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
@@ -15344,7 +16249,7 @@ func Conv2DDataFormat(value string) Conv2DAttr {
 // `[filter_height, filter_width, in_channels, out_channels]`
 //	strides: 1-D tensor of length 4.  The stride of the sliding window for each
 // dimension of `input`. The dimension order is determined by the value of
-//   `data_format`, see below for details.
+// `data_format`, see below for details.
 //	padding: The type of padding algorithm to use.
 //
 // Returns A 4-D tensor. The dimension order is determined by the value of
@@ -16535,6 +17440,109 @@ func StageSize(scope *Scope, dtypes []tf.DataType, optional ...StageSizeAttr) (s
 	return op.Output(0)
 }
 
+// ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate.
+type ResourceScatterNdUpdateAttr func(optionalAttr)
+
+// ResourceScatterNdUpdateUseLocking sets the optional use_locking attribute to value.
+//
+// value: An optional bool. Defaults to True. If True, the assignment will
+// be protected by a lock; otherwise the behavior is undefined,
+// but may exhibit less contention.
+// If not specified, defaults to true
+func ResourceScatterNdUpdateUseLocking(value bool) ResourceScatterNdUpdateAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Applies sparse `updates` to individual values or slices within a given
+//
+// variable according to `indices`.
+//
+// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+//
+// `indices` must be integer tensor, containing indices into `ref`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+//
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+// dimension of `ref`.
+//
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+//
+// ```
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+// ```
+//
+// For example, say we want to update 4 scattered elements to a rank-1 tensor to
+// 8 elements. In Python, that update would look like this:
+//
+// ```python
+//     ref = tfe.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+//     indices = tf.constant([[4], [3], [1] ,[7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     update = tf.scatter_nd_update(ref, indices, updates)
+//     with tf.Session() as sess:
+//       print sess.run(update)
+// ```
+//
+// The resulting update to ref would look like this:
+//
+//     [1, 11, 3, 10, 9, 6, 7, 12]
+//
+// See @{tf.scatter_nd} for more details about how to make updates to
+// slices.
+//
+// Arguments:
+//	ref: A resource handle. Must be from a VarHandleOp.
+//	indices: A Tensor. Must be one of the following types: int32, int64.
+// A tensor of indices into ref.
+//	updates: A Tensor. Must have the same type as ref. A tensor of updated
+// values to add to ref.
+//
+// Returns the created operation.
+func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdUpdateAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterNdUpdate",
+		Input: []tf.Input{
+			ref, indices, updates,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Computes the power of one value to another.
+//
+// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
+// corresponding elements in `x` and `y`. For example:
+//
+// ```
+// # tensor 'x' is [[2, 2]], [3, 3]]
+// # tensor 'y' is [[8, 16], [2, 3]]
+// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
+// ```
+func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Pow",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // SizeAttr is an optional argument to Size.
 type SizeAttr func(optionalAttr)
 
@@ -16724,13 +17732,54 @@ func FFT3D(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
-// Deserialize `SparseTensor` from a (serialized) string 3-vector (1-D `Tensor`)
+// Deserialize `SparseTensor` objects.
+//
+// The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
+// the last dimension stores serialized `SparseTensor` objects and the other N
+// dimensions (N >= 0) correspond to a batch. The ranks of the original
+// `SparseTensor` objects must all match. When the final `SparseTensor` is
+// created, its rank is the rank of the incoming `SparseTensor` objects plus N;
+// the sparse tensors have been concatenated along new dimensions, one for each
+// batch.
+//
+// The output `SparseTensor` object's shape values for the original dimensions
+// are the max across the input `SparseTensor` objects' shape values for the
+// corresponding dimensions. The new dimensions match the size of the batch.
+//
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
+//
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
+//
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+//
+// and
+//
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
 //
-// object.
+// then the final deserialized `SparseTensor` will be:
+//
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
 //
 // Arguments:
-//	serialized_sparse: 1-D, The serialized `SparseTensor` object. Must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` object.
+//	serialized_sparse: The serialized `SparseTensor` objects. The last dimension
+// must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
 func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -16936,67 +17985,6 @@ func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optio
 	return op.Output(0)
 }
 
-// PrintAttr is an optional argument to Print.
-type PrintAttr func(optionalAttr)
-
-// PrintMessage sets the optional message attribute to value.
-//
-// value: A string, prefix of the error message.
-// If not specified, defaults to ""
-func PrintMessage(value string) PrintAttr {
-	return func(m optionalAttr) {
-		m["message"] = value
-	}
-}
-
-// PrintFirstN sets the optional first_n attribute to value.
-//
-// value: Only log `first_n` number of times. -1 disables logging.
-// If not specified, defaults to -1
-func PrintFirstN(value int64) PrintAttr {
-	return func(m optionalAttr) {
-		m["first_n"] = value
-	}
-}
-
-// PrintSummarize sets the optional summarize attribute to value.
-//
-// value: Only print this many entries of each tensor.
-// If not specified, defaults to 3
-func PrintSummarize(value int64) PrintAttr {
-	return func(m optionalAttr) {
-		m["summarize"] = value
-	}
-}
-
-// Prints a list of tensors.
-//
-// Passes `input` through to `output` and prints `data` when evaluating.
-//
-// Arguments:
-//	input: The tensor passed to `output`
-//	data: A list of tensors to print out when op is evaluated.
-//
-// Returns = The unmodified `input` tensor
-func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Print",
-		Input: []tf.Input{
-			input, tf.OutputList(data),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // LoadAndRemapMatrixAttr is an optional argument to LoadAndRemapMatrix.
 type LoadAndRemapMatrixAttr func(optionalAttr)
 
@@ -17482,54 +18470,6 @@ func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output)
 	return op.Output(0)
 }
 
-// AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
-type AvgPool3DGradAttr func(optionalAttr)
-
-// AvgPool3DGradDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func AvgPool3DGradDataFormat(value string) AvgPool3DGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of average pooling function.
-//
-// Arguments:
-//	orig_input_shape: The original input dimensions.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-//
-// Returns The backprop for input.
-func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DGradAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "AvgPool3DGrad",
-		Input: []tf.Input{
-			orig_input_shape, grad,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Inverse fast Fourier transform.
 //
 // Computes the inverse 1-dimensional discrete Fourier transform over the
@@ -17895,28 +18835,54 @@ func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents t
 // values of A and B.
 //
 // Arguments:
-//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
-// the non-empty values of the sum.
-//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
-//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
-//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
-// `[nnz(sum), ndims]`.
+//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
+// the non-empty values of the sum.
+//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
+//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
+//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
+// `[nnz(sum), ndims]`.
+//
+// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
+// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
+// non-empty values of B.
+func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseAddGrad",
+		Input: []tf.Input{
+			backprop_val_grad, a_indices, b_indices, sum_indices,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Adds `bias` to `value`.
+//
+// This is a deprecated version of BiasAdd and will be soon removed.
+//
+// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+// Broadcasting is supported, so `value` may have any number of dimensions.
+//
+// Arguments:
+//	value: Any number of dimensions.
+//	bias: 1-D with size the last dimension of `value`.
 //
-// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
-// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
-// non-empty values of B.
-func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
+// Returns Broadcasted sum of `value` and `bias`.
+func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseAddGrad",
+		Type: "BiasAddV1",
 		Input: []tf.Input{
-			backprop_val_grad, a_indices, b_indices, sum_indices,
+			value, bias,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
 // FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
@@ -17996,294 +18962,131 @@ func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...Fix
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"record_bytes": record_bytes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FixedLengthRecordReaderV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
-type QuantizedRelu6Attr func(optionalAttr)
-
-// QuantizedRelu6OutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
-//
-// Arguments:
-//
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
-//
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedRelu6",
-		Input: []tf.Input{
-			features, min_features, max_features,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// CumsumAttr is an optional argument to Cumsum.
-type CumsumAttr func(optionalAttr)
-
-// CumsumExclusive sets the optional exclusive attribute to value.
-//
-// value: If `True`, perform exclusive cumsum.
-// If not specified, defaults to false
-func CumsumExclusive(value bool) CumsumAttr {
-	return func(m optionalAttr) {
-		m["exclusive"] = value
-	}
-}
-
-// CumsumReverse sets the optional reverse attribute to value.
-//
-// value: A `bool` (default: False).
-// If not specified, defaults to false
-func CumsumReverse(value bool) CumsumAttr {
-	return func(m optionalAttr) {
-		m["reverse"] = value
-	}
-}
-
-// Compute the cumulative sum of the tensor `x` along `axis`.
-//
-// By default, this op performs an inclusive cumsum, which means that the first
-// element of the input is identical to the first element of the output:
-//
-// ```python
-// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
-// ```
-//
-// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
-// performed instead:
-//
-// ```python
-// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
-// ```
-//
-// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
-// opposite direction:
-//
-// ```python
-// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
-// ```
-//
-// This is more efficient than using separate `tf.reverse` ops.
-//
-// The `reverse` and `exclusive` kwargs can also be combined:
-//
-// ```python
-// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
-// ```
-//
-// Arguments:
-//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
-// `[-rank(x), rank(x))`.
-func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Cumsum",
-		Input: []tf.Input{
-			x, axis,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// WriteImageSummaryAttr is an optional argument to WriteImageSummary.
-type WriteImageSummaryAttr func(optionalAttr)
-
-// WriteImageSummaryMaxImages sets the optional max_images attribute to value.
-//
-// value: Max number of batch elements to generate images for.
-// If not specified, defaults to 3
-//
-// REQUIRES: value >= 1
-func WriteImageSummaryMaxImages(value int64) WriteImageSummaryAttr {
-	return func(m optionalAttr) {
-		m["max_images"] = value
-	}
-}
-
-// Writes a `Summary` protocol buffer with images.
-//
-// The summary has up to `max_images` summary values containing images. The
-// images are built from `tensor` which must be 4-D with shape `[batch_size,
-// height, width, channels]` and where `channels` can be:
-//
-// *  1: `tensor` is interpreted as Grayscale.
-// *  3: `tensor` is interpreted as RGB.
-// *  4: `tensor` is interpreted as RGBA.
-//
-// The images have the same number of channels as the input tensor. For float
-// input, the values are normalized one image at a time to fit in the range
-// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
-// normalization algorithms:
-//
-// *  If the input values are all positive, they are rescaled so the largest one
-//    is 255.
-//
-// *  If any input value is negative, the values are shifted so input value 0.0
-//    is at 127.  They are then rescaled so that either the smallest value is 0,
-//    or the largest one is 255.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
-//
-// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
-// *  If `max_images` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
-//
-// The `bad_color` argument is the color to use in the generated images for
-// non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
-// Each element must be in the range `[0, 255]` (It represents the value of a
-// pixel in the output image).  Non-finite values in the input tensor are
-// replaced by this tensor in the output image.  The default value is the color
-// red.
-//
-// Arguments:
-//	writer: A handle to a summary writer.
-//	global_step: The step to write the summary for.
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
-// `channels` is 1, 3, or 4.
-//	bad_color: Color to use for pixels with non-finite values.
-//
-// Returns the created operation.
-func WriteImageSummary(scope *Scope, writer tf.Output, global_step tf.Output, tag tf.Output, tensor tf.Output, bad_color tf.Output, optional ...WriteImageSummaryAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "WriteImageSummary",
-		Input: []tf.Input{
-			writer, global_step, tag, tensor, bad_color,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Pads a tensor with zeros.
-//
-// This operation pads a `input` with zeros according to the `paddings` you
-// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
-// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many zeros to add before the contents of `input` in that dimension, and
-// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
-// in that dimension.
-//
-// The padded size of each dimension D of the output is:
-//
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
-//
-// For example:
-//
-// ```
-// # 't' is [[1, 1], [2, 2]]
-// # 'paddings' is [[1, 1], [2, 2]]
-// # rank of 't' is 2
-// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-//                       [0, 0, 1, 1, 0, 0]
-//                       [0, 0, 2, 2, 0, 0]
-//                       [0, 0, 0, 0, 0, 0]]
-// ```
-func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
+	attrs := map[string]interface{}{"record_bytes": record_bytes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Pad",
-		Input: []tf.Input{
-			input, paddings,
-		},
+		Type: "FixedLengthRecordReaderV2",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the number of elements in the given queue.
+// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
+type QuantizedRelu6Attr func(optionalAttr)
+
+// QuantizedRelu6OutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
 //
 // Arguments:
-//	handle: The handle to a queue.
 //
-// Returns The number of elements in the given queue.
-func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "QueueSizeV2",
+		Type: "QuantizedRelu6",
 		Input: []tf.Input{
-			handle,
+			features, min_features, max_features,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Outputs a `Summary` protocol buffer with a histogram.
+// CumsumAttr is an optional argument to Cumsum.
+type CumsumAttr func(optionalAttr)
+
+// CumsumExclusive sets the optional exclusive attribute to value.
 //
-// The generated
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// has one summary value containing a histogram for `values`.
+// value: If `True`, perform exclusive cumsum.
+// If not specified, defaults to false
+func CumsumExclusive(value bool) CumsumAttr {
+	return func(m optionalAttr) {
+		m["exclusive"] = value
+	}
+}
+
+// CumsumReverse sets the optional reverse attribute to value.
 //
-// This op reports an `InvalidArgument` error if any value is not finite.
+// value: A `bool` (default: False).
+// If not specified, defaults to false
+func CumsumReverse(value bool) CumsumAttr {
+	return func(m optionalAttr) {
+		m["reverse"] = value
+	}
+}
+
+// Compute the cumulative sum of the tensor `x` along `axis`.
 //
-// Arguments:
-//	tag: Scalar.  Tag to use for the `Summary.Value`.
-//	values: Any shape. Values to use to build the histogram.
+// By default, this op performs an inclusive cumsum, which means that the first
+// element of the input is identical to the first element of the output:
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
+// ```python
+// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
+// ```
+//
+// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
+// performed instead:
+//
+// ```python
+// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
+// ```
+//
+// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
+// opposite direction:
+//
+// ```python
+// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
+// ```
+//
+// This is more efficient than using separate `tf.reverse` ops.
+//
+// The `reverse` and `exclusive` kwargs can also be combined:
+//
+// ```python
+// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+// `[-rank(x), rank(x))`.
+func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "HistogramSummary",
+		Type: "Cumsum",
 		Input: []tf.Input{
-			tag, values,
+			x, axis,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -18854,45 +19657,196 @@ func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Out
 // position = [1, 5, 7]
 // length =   [3, 2, 1]
 //
-// output = [b'hir', b'ee', b'n']
-// ```
+// output = [b'hir', b'ee', b'n']
+// ```
+//
+// Arguments:
+//	input: Tensor of strings
+//	pos: Scalar defining the position of first character in each substring
+//	len: Scalar defining the number of characters to include in each substring
+//
+// Returns Tensor of substrings
+func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Substr",
+		Input: []tf.Input{
+			input, pos, len,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
+type StatelessRandomNormalAttr func(optionalAttr)
+
+// StatelessRandomNormalDtype sets the optional dtype attribute to value.
+//
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs deterministic pseudorandom values from a normal distribution.
+//
+// The generated values will have mean 0 and standard deviation 1.
+//
+// The outputs are a deterministic function of `shape` and `seed`.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
+//
+// Returns Random values with specified shape.
+func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StatelessRandomNormal",
+		Input: []tf.Input{
+			shape, seed,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// UniqueWithCountsAttr is an optional argument to UniqueWithCounts.
+type UniqueWithCountsAttr func(optionalAttr)
+
+// UniqueWithCountsOutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueWithCountsOutIdx(value tf.DataType) UniqueWithCountsAttr {
+	return func(m optionalAttr) {
+		m["out_idx"] = value
+	}
+}
+
+// Finds unique elements in a 1-D tensor.
+//
+// This operation returns a tensor `y` containing all of the unique elements of `x`
+// sorted in the same order that they occur in `x`. This operation also returns a
+// tensor `idx` the same size as `x` that contains the index of each value of `x`
+// in the unique output `y`. Finally, it returns a third tensor `count` that
+// contains the count of each element of `y` in `x`. In other words:
+//
+// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+//
+// For example:
+//
+// ```
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx, count = unique_with_counts(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// count ==> [2, 1, 3, 1, 2]
+// ```
+//
+// Arguments:
+//	x: 1-D.
+//
+// Returns 1-D.1-D.1-D.
+func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAttr) (y tf.Output, idx tf.Output, count tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UniqueWithCounts",
+		Input: []tf.Input{
+			x,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// RestoreSliceAttr is an optional argument to RestoreSlice.
+type RestoreSliceAttr func(optionalAttr)
+
+// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
+//
+// value: Index of file to open first if multiple files match
+// `file_pattern`. See the documentation for `Restore`.
+// If not specified, defaults to -1
+func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
+	return func(m optionalAttr) {
+		m["preferred_shard"] = value
+	}
+}
+
+// Restores a tensor from checkpoint files.
+//
+// This is like `Restore` except that restored tensor can be listed as filling
+// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
+// larger tensor and the slice that the restored tensor covers.
+//
+// The `shape_and_slice` input has the same format as the
+// elements of the `shapes_and_slices` input of the `SaveSlices` op.
 //
 // Arguments:
-//	input: Tensor of strings
-//	pos: Scalar defining the position of first character in each substring
-//	len: Scalar defining the number of characters to include in each substring
+//	file_pattern: Must have a single element. The pattern of the files from
+// which we read the tensor.
+//	tensor_name: Must have a single element. The name of the tensor to be
+// restored.
+//	shape_and_slice: Scalar. The shapes and slice specifications to use when
+// restoring a tensors.
+//	dt: The type of the tensor to be restored.
 //
-// Returns Tensor of substrings
-func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output) (output tf.Output) {
+// Returns The restored tensor.
+func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dt": dt}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Substr",
+		Type: "RestoreSlice",
 		Input: []tf.Input{
-			input, pos, len,
+			file_pattern, tensor_name, shape_and_slice,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
-type StatelessRandomNormalAttr func(optionalAttr)
+// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
+type StatelessTruncatedNormalAttr func(optionalAttr)
 
-// StatelessRandomNormalDtype sets the optional dtype attribute to value.
+// StatelessTruncatedNormalDtype sets the optional dtype attribute to value.
 //
 // value: The type of the output.
 // If not specified, defaults to DT_FLOAT
-func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
+func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
 	return func(m optionalAttr) {
 		m["dtype"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom values from a normal distribution.
+// Outputs deterministic pseudorandom values from a truncated normal distribution.
 //
-// The generated values will have mean 0 and standard deviation 1.
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
 //
 // The outputs are a deterministic function of `shape` and `seed`.
 //
@@ -18901,7 +19855,7 @@ func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
 //	seed: 2 seeds (shape [2]).
 //
 // Returns Random values with specified shape.
-func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
+func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18910,7 +19864,7 @@ func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, option
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomNormal",
+		Type: "StatelessTruncatedNormal",
 		Input: []tf.Input{
 			shape, seed,
 		},
@@ -19067,6 +20021,22 @@ func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value
 	return op.Output(0)
 }
 
+// Associates the given iterator with the given statistics aggregator.
+//
+// Returns the created operation.
+func IteratorSetStatsAggregator(scope *Scope, iterator_handle tf.Output, stats_aggregator_handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IteratorSetStatsAggregator",
+		Input: []tf.Input{
+			iterator_handle, stats_aggregator_handle,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2.
 type ResourceSparseApplyFtrlV2Attr func(optionalAttr)
 
@@ -19806,6 +20776,119 @@ func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
+// Creates a dataset that skips `count` elements from the `input_dataset`.
+//
+// Arguments:
+//
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be skipped.  If count is -1, skips everything.
+//
+//
+func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "SkipDataset",
+		Input: []tf.Input{
+			input_dataset, count,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ImagAttr is an optional argument to Imag.
+type ImagAttr func(optionalAttr)
+
+// ImagTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ImagTout(value tf.DataType) ImagAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Returns the imaginary part of a complex number.
+//
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the imaginary part of each element in `input`. All
+// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part returned by this operation.
+//
+// For example:
+//
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.imag(input) ==> [4.75, 5.75]
+// ```
+func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Imag",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ComplexAttr is an optional argument to Complex.
+type ComplexAttr func(optionalAttr)
+
+// ComplexTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_COMPLEX64
+func ComplexTout(value tf.DataType) ComplexAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Converts two real numbers to a complex number.
+//
+// Given a tensor `real` representing the real part of a complex number, and a
+// tensor `imag` representing the imaginary part of a complex number, this
+// operation returns complex numbers elementwise of the form \\(a + bj\\), where
+// *a* represents the `real` part and *b* represents the `imag` part.
+//
+// The input tensors `real` and `imag` must have the same shape.
+//
+// For example:
+//
+// ```
+// # tensor 'real' is [2.25, 3.25]
+// # tensor `imag` is [4.75, 5.75]
+// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
+// ```
+func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Complex",
+		Input: []tf.Input{
+			real, imag,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Inverse real-valued fast Fourier transform.
 //
 // Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
@@ -20226,51 +21309,124 @@ func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Outp
 	return op.Output(0)
 }
 
-// RestoreSliceAttr is an optional argument to RestoreSlice.
-type RestoreSliceAttr func(optionalAttr)
+// Computes the gradients of 3-D convolution with respect to the input.
+//
+// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
+//
+// Arguments:
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "Conv3DBackpropInput",
+		Input: []tf.Input{
+			input, filter, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
+// ReverseSequenceAttr is an optional argument to ReverseSequence.
+type ReverseSequenceAttr func(optionalAttr)
+
+// ReverseSequenceBatchDim sets the optional batch_dim attribute to value.
 //
-// value: Index of file to open first if multiple files match
-// `file_pattern`. See the documentation for `Restore`.
-// If not specified, defaults to -1
-func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
+// value: The dimension along which reversal is performed.
+// If not specified, defaults to 0
+func ReverseSequenceBatchDim(value int64) ReverseSequenceAttr {
 	return func(m optionalAttr) {
-		m["preferred_shard"] = value
+		m["batch_dim"] = value
 	}
 }
 
-// Restores a tensor from checkpoint files.
+// Reverses variable length slices.
+//
+// This op first slices `input` along the dimension `batch_dim`, and for each
+// slice `i`, reverses the first `seq_lengths[i]` elements along
+// the dimension `seq_dim`.
+//
+// The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,
+// and `seq_lengths` must be a vector of length `input.dims[batch_dim]`.
+//
+// The output slice `i` along dimension `batch_dim` is then given by input
+// slice `i`, with the first `seq_lengths[i]` slices along dimension
+// `seq_dim` reversed.
+//
+// For example:
+//
+// ```
+// # Given this:
+// batch_dim = 0
+// seq_dim = 1
+// input.dims = (4, 8, ...)
+// seq_lengths = [7, 2, 3, 5]
+//
+// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
+// output[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]
+// output[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]
+// output[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]
+// output[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]
+//
+// # while entries past seq_lens are copied through:
+// output[0, 7:, :, ...] = input[0, 7:, :, ...]
+// output[1, 2:, :, ...] = input[1, 2:, :, ...]
+// output[2, 3:, :, ...] = input[2, 3:, :, ...]
+// output[3, 2:, :, ...] = input[3, 2:, :, ...]
+// ```
+//
+// In contrast, if:
+//
+// ```
+// # Given this:
+// batch_dim = 2
+// seq_dim = 0
+// input.dims = (8, ?, 4, ...)
+// seq_lengths = [7, 2, 3, 5]
 //
-// This is like `Restore` except that restored tensor can be listed as filling
-// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
-// larger tensor and the slice that the restored tensor covers.
+// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
+// output[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]
+// output[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]
+// output[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]
+// output[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]
 //
-// The `shape_and_slice` input has the same format as the
-// elements of the `shapes_and_slices` input of the `SaveSlices` op.
+// # while entries past seq_lens are copied through:
+// output[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]
+// output[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]
+// output[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]
+// output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]
+// ```
 //
 // Arguments:
-//	file_pattern: Must have a single element. The pattern of the files from
-// which we read the tensor.
-//	tensor_name: Must have a single element. The name of the tensor to be
-// restored.
-//	shape_and_slice: Scalar. The shapes and slice specifications to use when
-// restoring a tensors.
-//	dt: The type of the tensor to be restored.
+//	input: The input to reverse.
+//	seq_lengths: 1-D with length `input.dims(batch_dim)` and
+// `max(seq_lengths) <= input.dims(seq_dim)`
+//	seq_dim: The dimension which is partially reversed.
 //
-// Returns The restored tensor.
-func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
+// Returns The partially reversed input. It has the same shape as `input`.
+func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_dim int64, optional ...ReverseSequenceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dt": dt}
+	attrs := map[string]interface{}{"seq_dim": seq_dim}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RestoreSlice",
+		Type: "ReverseSequence",
 		Input: []tf.Input{
-			file_pattern, tensor_name, shape_and_slice,
+			input, seq_lengths,
 		},
 		Attrs: attrs,
 	}
@@ -20278,44 +21434,46 @@ func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, s
 	return op.Output(0)
 }
 
-// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
-type StatelessTruncatedNormalAttr func(optionalAttr)
-
-// StatelessTruncatedNormalDtype sets the optional dtype attribute to value.
+// Computes the gradient for the rsqrt of `x` wrt its input.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
+// Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
+// is the corresponding input gradient.
+func RsqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RsqrtGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Outputs deterministic pseudorandom values from a truncated normal distribution.
-//
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
+// Computes the gradients of 3-D convolution with respect to the filter.
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
-//
-// Returns Random values with specified shape.
-func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "StatelessTruncatedNormal",
+		Type: "Conv3DBackpropFilter",
 		Input: []tf.Input{
-			shape, seed,
+			input, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -20323,155 +21481,134 @@ func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, opt
 	return op.Output(0)
 }
 
-// UniqueWithCountsAttr is an optional argument to UniqueWithCounts.
-type UniqueWithCountsAttr func(optionalAttr)
+// Conv3DBackpropInputV2Attr is an optional argument to Conv3DBackpropInputV2.
+type Conv3DBackpropInputV2Attr func(optionalAttr)
 
-// UniqueWithCountsOutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func UniqueWithCountsOutIdx(value tf.DataType) UniqueWithCountsAttr {
+// Conv3DBackpropInputV2DataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr {
 	return func(m optionalAttr) {
-		m["out_idx"] = value
+		m["data_format"] = value
 	}
 }
 
-// Finds unique elements in a 1-D tensor.
-//
-// This operation returns a tensor `y` containing all of the unique elements of `x`
-// sorted in the same order that they occur in `x`. This operation also returns a
-// tensor `idx` the same size as `x` that contains the index of each value of `x`
-// in the unique output `y`. Finally, it returns a third tensor `count` that
-// contains the count of each element of `y` in `x`. In other words:
-//
-// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
-//
-// For example:
+// Conv3DBackpropInputV2Dilations sets the optional dilations attribute to value.
 //
-// ```
-// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-// y, idx, count = unique_with_counts(x)
-// y ==> [1, 2, 4, 7, 8]
-// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-// count ==> [2, 1, 3, 1, 2]
-// ```
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of 3-D convolution with respect to the input.
 //
 // Arguments:
-//	x: 1-D.
-//
-// Returns 1-D.1-D.1-D.
-func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAttr) (y tf.Output, idx tf.Output, count tf.Output) {
+//	input_sizes: An integer vector representing the tensor shape of `input`,
+// where `input` is a 5-D
+// `[batch, depth, rows, cols, in_channels]` tensor.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "UniqueWithCounts",
+		Type: "Conv3DBackpropInputV2",
 		Input: []tf.Input{
-			x,
+			input_sizes, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Creates a dataset that skips `count` elements from the `input_dataset`.
+// Returns a tensor of ones with the same shape and type as x.
 //
 // Arguments:
+//	x: a tensor of type T.
 //
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be skipped.  If count is -1, skips everything.
-//
-//
-func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns a tensor of the same shape and type as x but filled with ones.
+func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SkipDataset",
+		Type: "OnesLike",
 		Input: []tf.Input{
-			input_dataset, count,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ComplexAttr is an optional argument to Complex.
-type ComplexAttr func(optionalAttr)
-
-// ComplexTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_COMPLEX64
-func ComplexTout(value tf.DataType) ComplexAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Converts two real numbers to a complex number.
-//
-// Given a tensor `real` representing the real part of a complex number, and a
-// tensor `imag` representing the imaginary part of a complex number, this
-// operation returns complex numbers elementwise of the form \\(a + bj\\), where
-// *a* represents the `real` part and *b* represents the `imag` part.
-//
-// The input tensors `real` and `imag` must have the same shape.
+// Returns element-wise remainder of division. This emulates C semantics in that
 //
-// For example:
+// the result here is consistent with a truncating divide. E.g.
+// `tf.truncatediv(x, y) * y + truncate_mod(x, y) = x`.
 //
-// ```
-// # tensor 'real' is [2.25, 3.25]
-// # tensor `imag` is [4.75, 5.75]
-// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
-// ```
-func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
+// *NOTE*: `Mod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Complex",
+		Type: "Mod",
 		Input: []tf.Input{
-			real, imag,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ImagAttr is an optional argument to Imag.
-type ImagAttr func(optionalAttr)
+// QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3.
+type QuantizeAndDequantizeV3Attr func(optionalAttr)
 
-// ImagTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ImagTout(value tf.DataType) ImagAttr {
+// QuantizeAndDequantizeV3SignedInput sets the optional signed_input attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV3SignedInput(value bool) QuantizeAndDequantizeV3Attr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["signed_input"] = value
 	}
 }
 
-// Returns the imaginary part of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the imaginary part of each element in `input`. All
-// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part returned by this operation.
-//
-// For example:
+// QuantizeAndDequantizeV3RangeGiven sets the optional range_given attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV3RangeGiven(value bool) QuantizeAndDequantizeV3Attr {
+	return func(m optionalAttr) {
+		m["range_given"] = value
+	}
+}
+
+// Quantizes then dequantizes a tensor.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.imag(input) ==> [4.75, 5.75]
-// ```
-func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
+// This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
+// tensor, so its value can change during training.
+func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, num_bits tf.Output, optional ...QuantizeAndDequantizeV3Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20480,9 +21617,9 @@ func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Imag",
+		Type: "QuantizeAndDequantizeV3",
 		Input: []tf.Input{
-			input,
+			input, input_min, input_max, num_bits,
 		},
 		Attrs: attrs,
 	}
@@ -20490,79 +21627,125 @@ func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output
 	return op.Output(0)
 }
 
-// Creates a dataset that emits the lines of one or more text files.
+// AvgPool3DAttr is an optional argument to AvgPool3D.
+type AvgPool3DAttr func(optionalAttr)
+
+// AvgPool3DDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func AvgPool3DDataFormat(value string) AvgPool3DAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Performs 3D average pooling on the input.
 //
 // Arguments:
-//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
-// read.
-//	compression_type: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-//	buffer_size: A scalar containing the number of bytes to buffer.
-func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
+//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The average pooled output tensor.
+func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TextLineDataset",
+		Type: "AvgPool3D",
 		Input: []tf.Input{
-			filenames, compression_type, buffer_size,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the number of records this Reader has produced.
-//
-// This is the same as the number of ReaderRead executions that have
-// succeeded.
+// Produces the max pool of the input tensor for quantized types.
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
+//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	ksize: The size of the window for each dimension of the input tensor.
+// The length must be 4 to match the number of dimensions of the input.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor. The length must be 4 to match the number of dimensions of the input.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "ReaderNumRecordsProducedV2",
+		Type: "QuantizedMaxPool",
 		Input: []tf.Input{
-			reader_handle,
+			input, min_input, max_input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes exponential of x - 1 element-wise.
+// AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
+type AvgPool3DGradAttr func(optionalAttr)
+
+// AvgPool3DGradDataFormat sets the optional data_format attribute to value.
 //
-// I.e., \\(y = (\exp x) - 1\\).
-func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Expm1",
-		Input: []tf.Input{
-			x,
-		},
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func AvgPool3DGradDataFormat(value string) AvgPool3DGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns x - y element-wise.
+// Computes gradients of average pooling function.
 //
-// *NOTE*: `Sub` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	orig_input_shape: The original input dimensions.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The backprop for input.
+func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Sub",
+		Type: "AvgPool3DGrad",
 		Input: []tf.Input{
-			x, y,
+			orig_input_shape, grad,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -20572,18 +21755,18 @@ func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 //
 // Arguments:
 //	writer: Handle of `SummaryWriter`.
-//	global_step: The step to write the summary for.
+//	step: The step to write the summary for.
 //	tensor: A scalar string of the serialized tf.GraphDef proto.
 //
 // Returns the created operation.
-func WriteGraphSummary(scope *Scope, writer tf.Output, global_step tf.Output, tensor tf.Output) (o *tf.Operation) {
+func WriteGraphSummary(scope *Scope, writer tf.Output, step tf.Output, tensor tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
 		Type: "WriteGraphSummary",
 		Input: []tf.Input{
-			writer, global_step, tensor,
+			writer, step, tensor,
 		},
 	}
 	return scope.AddOperation(opspec)
@@ -21815,147 +22998,16 @@ func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (ima
 //
 // Arguments:
 //	logits: 2-D with shape `[batch_size, num_classes]`.
-//
-// Returns Same shape as `logits`.
-func Softmax(scope *Scope, logits tf.Output) (softmax tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Softmax",
-		Input: []tf.Input{
-			logits,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
-type RandomShuffleQueueV2Attr func(optionalAttr)
-
-// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
-//
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shapes"] = value
-	}
-}
-
-// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
-//
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
-//
-// value: Dequeue will block unless there would be this
-// many elements after the dequeue or the queue is closed. This
-// ensures a minimum level of mixing of elements.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["min_after_dequeue"] = value
-	}
-}
-
-// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 is set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// RandomShuffleQueueV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A queue that randomizes the order of elements.
-//
-// Arguments:
-//	component_types: The type of each component in a value.
-//
-// Returns The handle to the queue.
-func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"component_types": component_types}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RandomShuffleQueueV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Outputs a `Summary` protocol buffer with scalar values.
-//
-// The input `tags` and `values` must have the same shape.  The generated summary
-// has a summary value for each tag-value pair in `tags` and `values`.
-//
-// Arguments:
-//	tags: Tags for the summary.
-//	values: Same shape as `tags.  Values for the summary.
-//
-// Returns Scalar.  Serialized `Summary` protocol buffer.
-func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
+//
+// Returns Same shape as `logits`.
+func Softmax(scope *Scope, logits tf.Output) (softmax tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ScalarSummary",
+		Type: "Softmax",
 		Input: []tf.Input{
-			tags, values,
+			logits,
 		},
 	}
 	op := scope.AddOperation(opspec)
@@ -23670,7 +24722,21 @@ func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...
 	return op.Output(0)
 }
 
-// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` string `Tensor`.
+// SerializeManySparseAttr is an optional argument to SerializeManySparse.
+type SerializeManySparseAttr func(optionalAttr)
+
+// SerializeManySparseOutType sets the optional out_type attribute to value.
+//
+// value: The `dtype` to use for serialization; the supported types are `string`
+// (default) and `variant`.
+// If not specified, defaults to DT_STRING
+func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
 //
 // The `SparseTensor` must have rank `R` greater than 1, and the first dimension
 // is treated as the minibatch dimension.  Elements of the `SparseTensor`
@@ -23684,15 +24750,20 @@ func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...
 //	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
 //	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
 //	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) (serialized_sparse tf.Output) {
+func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
 		Type: "SerializeManySparse",
 		Input: []tf.Input{
 			sparse_indices, sparse_values, sparse_shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -23834,6 +24905,20 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr {
 	}
 }
 
+// Conv2DBackpropFilterDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
 // Computes the gradients of convolution with respect to the filter.
 //
 // Arguments:
@@ -24045,265 +25130,31 @@ func Sin(scope *Scope, x tf.Output) (y tf.Output) {
 // form square matrices. The output is a tensor containing the determinants
 // for all input submatrices `[..., :, :]`.
 //
-// Arguments:
-//	input: Shape is `[..., M, M]`.
-//
-// Returns Shape is `[...]`.
-func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MatrixDeterminant",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes cos of x element-wise.
-func Cos(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Cos",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// BatchToSpace for 4-D tensors of type T.
-//
-// This is a legacy version of the more general BatchToSpaceND.
-//
-// Rearranges (permutes) data from batch into blocks of spatial data, followed by
-// cropping. This is the reverse transformation of SpaceToBatch. More specifically,
-// this op outputs a copy of the input tensor where values from the `batch`
-// dimension are moved in spatial blocks to the `height` and `width` dimensions,
-// followed by cropping along the `height` and `width` dimensions.
-//
-// Arguments:
-//	input: 4-D tensor with shape
-// `[batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
-//   depth]`. Note that the batch size of the input tensor must be divisible by
-// `block_size * block_size`.
-//	crops: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
-// how many elements to crop from the intermediate result across the spatial
-// dimensions as follows:
-//
-//     crops = [[crop_top, crop_bottom], [crop_left, crop_right]]
-//
-//
-// Returns 4-D with shape `[batch, height, width, depth]`, where:
-//
-//       height = height_pad - crop_top - crop_bottom
-//       width = width_pad - crop_left - crop_right
-//
-// The attr `block_size` must be greater than one. It indicates the block size.
-//
-// Some examples:
-//
-// (1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
-//
-// ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 1]` and value:
-//
-// ```
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
-//
-// (2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
-//
-// ```
-// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 3]` and value:
-//
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
-//
-// (3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
-//
-// ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-//
-// The output tensor has shape `[1, 4, 4, 1]` and value:
-//
-// ```
-// x = [[[1],   [2],  [3],  [4]],
-//      [[5],   [6],  [7],  [8]],
-//      [[9],  [10], [11],  [12]],
-//      [[13], [14], [15],  [16]]]
-// ```
-//
-// (4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
-//
-// ```
-// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
-//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
-// ```
-//
-// The output tensor has shape `[2, 2, 4, 1]` and value:
-//
-// ```
-// x = [[[[1], [3]], [[5], [7]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int64) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"block_size": block_size}
-	opspec := tf.OpSpec{
-		Type: "BatchToSpace",
-		Input: []tf.Input{
-			input, crops,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SparseToDenseAttr is an optional argument to SparseToDense.
-type SparseToDenseAttr func(optionalAttr)
-
-// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
-//
-// value: If true, indices are checked to make sure they are sorted in
-// lexicographic order and that there are no repeats.
-// If not specified, defaults to true
-func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
-}
-
-// Converts a sparse representation into a dense tensor.
-//
-// Builds an array `dense` with shape `output_shape` such that
-//
-// ```
-// # If sparse_indices is scalar
-// dense[i] = (i == sparse_indices ? sparse_values : default_value)
-//
-// # If sparse_indices is a vector, then for each i
-// dense[sparse_indices[i]] = sparse_values[i]
-//
-// # If sparse_indices is an n by d matrix, then for each i in [0, n)
-// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
-// ```
-//
-// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
-// scalar, all sparse indices are set to this single value.
-//
-// Indices should be sorted in lexicographic order, and indices must not
-// contain any repeats. If `validate_indices` is true, these properties
-// are checked during execution.
-//
-// Arguments:
-//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
-// index where `sparse_values[i]` will be placed.
-//	output_shape: 1-D.  Shape of the dense output tensor.
-//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
-// or a scalar value to be used for all sparse indices.
-//	default_value: Scalar value to set for indices not specified in
-// `sparse_indices`.
-//
-// Returns Dense output tensor of shape `output_shape`.
-func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseToDense",
-		Input: []tf.Input{
-			sparse_indices, output_shape, sparse_values, default_value,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// NthElementAttr is an optional argument to NthElement.
-type NthElementAttr func(optionalAttr)
-
-// NthElementReverse sets the optional reverse attribute to value.
-//
-// value: When set to True, find the nth-largest value in the vector and vice
-// versa.
-// If not specified, defaults to false
-func NthElementReverse(value bool) NthElementAttr {
-	return func(m optionalAttr) {
-		m["reverse"] = value
-	}
-}
-
-// Finds values of the `n`-th order statistic for the last dimension.
-//
-// If the input is a vector (rank-1), finds the entries which is the nth-smallest
-// value in the vector and outputs their values as scalar tensor.
-//
-// For matrices (resp. higher rank input), computes the entries which is the
-// nth-smallest value in each row (resp. vector along the last dimension). Thus,
-//
-//     values.shape = input.shape[:-1]
-//
-// Arguments:
-//	input: 1-D or higher with last dimension at least `n+1`.
-//	n: 0-D. Position of sorted vector to select along the last dimension (along
-// each row for matrices). Valid range of n is `[0, input.shape[:-1])`
-//
-// Returns The `n`-th order statistic along each last dimensional slice.
-func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) {
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[...]`.
+func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "NthElement",
+		Type: "MatrixDeterminant",
 		Input: []tf.Input{
-			input, n,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes asin of x element-wise.
-func Asin(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes cos of x element-wise.
+func Cos(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Asin",
+		Type: "Cos",
 		Input: []tf.Input{
 			x,
 		},
@@ -24785,116 +25636,6 @@ func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Ou
 	return scope.AddOperation(opspec)
 }
 
-// QrAttr is an optional argument to Qr.
-type QrAttr func(optionalAttr)
-
-// QrFullMatrices sets the optional full_matrices attribute to value.
-//
-// value: If true, compute full-sized `q` and `r`. If false
-// (the default), compute only the leading `P` columns of `q`.
-// If not specified, defaults to false
-func QrFullMatrices(value bool) QrAttr {
-	return func(m optionalAttr) {
-		m["full_matrices"] = value
-	}
-}
-
-// Computes the QR decompositions of one or more matrices.
-//
-// Computes the QR decomposition of each inner matrix in `tensor` such that
-// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
-//
-// ```python
-// # a is a tensor.
-// # q is a tensor of orthonormal matrices.
-// # r is a tensor of upper triangular matrices.
-// q, r = qr(a)
-// q_full, r_full = qr(a, full_matrices=True)
-// ```
-//
-// Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
-//
-// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
-// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
-// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
-func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Qr",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// AudioSummaryAttr is an optional argument to AudioSummary.
-type AudioSummaryAttr func(optionalAttr)
-
-// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
-//
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
-//
-// REQUIRES: value >= 1
-func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
-	return func(m optionalAttr) {
-		m["max_outputs"] = value
-	}
-}
-
-// Outputs a `Summary` protocol buffer with audio.
-//
-// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
-//
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
-//
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
-//
-// Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
-//
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"sample_rate": sample_rate}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "AudioSummary",
-		Input: []tf.Input{
-			tag, tensor,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // BiasAddAttr is an optional argument to BiasAdd.
 type BiasAddAttr func(optionalAttr)
 
@@ -25096,74 +25837,195 @@ func LogUniformCandidateSamplerSeed(value int64) LogUniformCandidateSamplerAttr
 	}
 }
 
-// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a log-uniform distribution.
+//
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
+//
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LogUniformCandidateSampler",
+		Input: []tf.Input{
+			true_classes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Returns the truth value of (x < y) element-wise.
+//
+// *NOTE*: `Less` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Less",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// FakeQuantWithMinMaxVarsGradientAttr is an optional argument to FakeQuantWithMinMaxVarsGradient.
+type FakeQuantWithMinMaxVarsGradientAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxVarsGradientNumBits sets the optional num_bits attribute to value.
+//
+// value: The bitwidth of the quantization; between 2 and 8, inclusive.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsGradientNumBits(value int64) FakeQuantWithMinMaxVarsGradientAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
+// FakeQuantWithMinMaxVarsGradientNarrowRange sets the optional narrow_range attribute to value.
+//
+// value: Whether to quantize into 2^num_bits - 1 distinct values.
+// If not specified, defaults to false
+func FakeQuantWithMinMaxVarsGradientNarrowRange(value bool) FakeQuantWithMinMaxVarsGradientAttr {
+	return func(m optionalAttr) {
+		m["narrow_range"] = value
+	}
+}
+
+// Compute gradients for a FakeQuantWithMinMaxVars operation.
+//
+// Arguments:
+//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation.
+//	inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation.
+// min, max: Quantization interval, scalar floats.
+//
+//
+//
+// Returns Backpropagated gradients w.r.t. inputs:
+// `gradients * (inputs >= min && inputs <= max)`.Backpropagated gradients w.r.t. min parameter:
+// `sum(gradients * (inputs < min))`.Backpropagated gradients w.r.t. max parameter:
+// `sum(gradients * (inputs > max))`.
+func FakeQuantWithMinMaxVarsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsGradientAttr) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FakeQuantWithMinMaxVarsGradient",
+		Input: []tf.Input{
+			gradients, inputs, min, max,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// MaxPoolGradV2Attr is an optional argument to MaxPoolGradV2.
+type MaxPoolGradV2Attr func(optionalAttr)
+
+// MaxPoolGradV2DataFormat sets the optional data_format attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradV2DataFormat(value string) MaxPoolGradV2Attr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["data_format"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a log-uniform distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
-//
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// Computes gradients of the maxpooling function.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns Gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{"padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LogUniformCandidateSampler",
+		Type: "MaxPoolGradV2",
 		Input: []tf.Input{
-			true_classes,
+			orig_input, orig_output, grad, ksize, strides,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Returns the truth value of (x < y) element-wise.
+// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
 //
-// *NOTE*: `Less` supports broadcasting. More about broadcasting
+// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
 // [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Less",
+		Type: "Minimum",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -25220,30 +26082,6 @@ func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAt
 	return op.Output(0)
 }
 
-// Computes the power of one value to another.
-//
-// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
-// corresponding elements in `x` and `y`. For example:
-//
-// ```
-// # tensor 'x' is [[2, 2]], [3, 3]]
-// # tensor 'y' is [[8, 16], [2, 3]]
-// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
-// ```
-func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Pow",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Compute the upper regularized incomplete Gamma function `Q(a, x)`.
 //
 // The upper regularized incomplete Gamma function is defined as:
@@ -25598,280 +26436,515 @@ func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgM
 // If the sum is empty for a given segment ID `i`, `output[i] = 0`.
 //
 // <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SegmentSum",
+		Input: []tf.Input{
+			data, segment_ids,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Bucketizes 'input' based on 'boundaries'.
+//
+// For example, if the inputs are
+//     boundaries = [0, 10, 100]
+//     input = [[-5, 10000]
+//              [150,   10]
+//              [5,    100]]
+//
+// then the output will be
+//     output = [[0, 3]
+//               [3, 2]
+//               [1, 3]]
+//
+// Arguments:
+//	input: Any shape of Tensor contains with int or float type.
+//	boundaries: A sorted list of floats gives the boundary of the buckets.
+//
+// Returns Same shape with 'input', each value of input replaced with bucket index.
+//
+// @compatibility(numpy)
+// Equivalent to np.digitize.
+// @end_compatibility
+func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"boundaries": boundaries}
+	opspec := tf.OpSpec{
+		Type: "Bucketize",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Reshapes a SparseTensor to represent values in a new dense shape.
+//
+// This operation has the same semantics as reshape on the represented dense
+// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
+//
+// If one component of `new_shape` is the special value -1, the size of that
+// dimension is computed so that the total dense size remains constant.  At
+// most one component of `new_shape` can be -1.  The number of dense elements
+// implied by `new_shape` must be the same as the number of dense elements
+// originally implied by `input_shape`.
+//
+// Reshaping does not affect the order of values in the SparseTensor.
+//
+// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
+// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
+// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
+// `output_shape` has length `R_out`.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
+// SparseTensor.
+//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
+//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
+//
+// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
+// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
+// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
+// filled in.
+func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseReshape",
+		Input: []tf.Input{
+			input_indices, input_shape, new_shape,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Computes the product along segments of a tensor.
+//
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Computes a tensor such that
+// \\(output_i = \prod_j data_j\\) where the product is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the product is empty for a given segment ID `i`, `output[i] = 1`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SegmentProd",
+		Input: []tf.Input{
+			data, segment_ids,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the sum along segments of a tensor.
+//
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Computes a tensor such that
+// `(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such
+// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
+// need not be sorted and need not cover all values in the full
+// range of valid values.
+//
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+// If the given segment ID `i` is negative, the value is dropped and will not be
+// added to the sum of the segment.
+//
+// `num_segments` should equal the number of distinct segment IDs.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
 // </div>
 //
 // Arguments:
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentSum",
+		Type: "UnsortedSegmentSum",
 		Input: []tf.Input{
-			data, segment_ids,
+			data, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ImageSummaryAttr is an optional argument to ImageSummary.
-type ImageSummaryAttr func(optionalAttr)
-
-// ImageSummaryMaxImages sets the optional max_images attribute to value.
-//
-// value: Max number of batch elements to generate images for.
-// If not specified, defaults to 3
-//
-// REQUIRES: value >= 1
-func ImageSummaryMaxImages(value int64) ImageSummaryAttr {
-	return func(m optionalAttr) {
-		m["max_images"] = value
+// Computes hyperbolic sine of x element-wise.
+func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// ImageSummaryBadColor sets the optional bad_color attribute to value.
-//
-// value: Color to use for pixels with non-finite values.
-// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 >
-func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
-	return func(m optionalAttr) {
-		m["bad_color"] = value
+	opspec := tf.OpSpec{
+		Type: "Sinh",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with images.
+// Computes the sum along sparse segments of a tensor.
 //
-// The summary has up to `max_images` summary values containing images. The
-// images are built from `tensor` which must be 4-D with shape `[batch_size,
-// height, width, channels]` and where `channels` can be:
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// *  1: `tensor` is interpreted as Grayscale.
-// *  3: `tensor` is interpreted as RGB.
-// *  4: `tensor` is interpreted as RGBA.
+// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
 //
-// The images have the same number of channels as the input tensor. For float
-// input, the values are normalized one image at a time to fit in the range
-// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
-// normalization algorithms:
+// For example:
 //
-// *  If the input values are all positive, they are rescaled so the largest one
-//    is 255.
+// ```python
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
 //
-// *  If any input value is negative, the values are shifted so input value 0.0
-//    is at 127.  They are then rescaled so that either the smallest value is 0,
-//    or the largest one is 255.
+// # Select two rows, one segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
+// # => [[0 0 0 0]]
 //
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
+// # Select two rows, two segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
+// # => [[ 1  2  3  4]
+// #     [-1 -2 -3 -4]]
 //
-// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
-// *  If `max_images` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+// # Select all rows, two segments.
+// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
+// # => [[0 0 0 0]
+// #     [5 6 7 8]]
 //
-// The `bad_color` argument is the color to use in the generated images for
-// non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
-// Each element must be in the range `[0, 255]` (It represents the value of a
-// pixel in the output image).  Non-finite values in the input tensor are
-// replaced by this tensor in the output image.  The default value is the color
-// red.
+// # Which is equivalent to:
+// tf.segment_sum(c, tf.constant([0, 0, 1]))
+// ```
 //
 // Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
-// `channels` is 1, 3, or 4.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ImageSummary",
+		Type: "SparseSegmentSum",
 		Input: []tf.Input{
-			tag, tensor,
+			data, indices, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Bucketizes 'input' based on 'boundaries'.
+// Counts the number of occurrences of each value in an integer array.
 //
-// For example, if the inputs are
-//     boundaries = [0, 10, 100]
-//     input = [[-5, 10000]
-//              [150,   10]
-//              [5,    100]]
+// Outputs a vector with length `size` and the same dtype as `weights`. If
+// `weights` are empty, then index `i` stores the number of times the value `i` is
+// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
+// the value in `weights` at each index where the corresponding value in `arr` is
+// `i`.
 //
-// then the output will be
-//     output = [[0, 3]
-//               [3, 2]
-//               [1, 3]]
+// Values in `arr` outside of the range [0, size) are ignored.
 //
 // Arguments:
-//	input: Any shape of Tensor contains with int or float type.
-//	boundaries: A sorted list of floats gives the boundary of the buckets.
-//
-// Returns Same shape with 'input', each value of input replaced with bucket index.
+//	arr: int32 `Tensor`.
+//	size: non-negative int32 scalar `Tensor`.
+//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
+// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
+// equal to 1.
 //
-// @compatibility(numpy)
-// Equivalent to np.digitize.
-// @end_compatibility
-func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
+// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
+// each value in the range [0, size).
+func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"boundaries": boundaries}
 	opspec := tf.OpSpec{
-		Type: "Bucketize",
+		Type: "Bincount",
 		Input: []tf.Input{
-			input,
+			arr, size, weights,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Reshapes a SparseTensor to represent values in a new dense shape.
+// BatchToSpace for 4-D tensors of type T.
 //
-// This operation has the same semantics as reshape on the represented dense
-// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
+// This is a legacy version of the more general BatchToSpaceND.
 //
-// If one component of `new_shape` is the special value -1, the size of that
-// dimension is computed so that the total dense size remains constant.  At
-// most one component of `new_shape` can be -1.  The number of dense elements
-// implied by `new_shape` must be the same as the number of dense elements
-// originally implied by `input_shape`.
+// Rearranges (permutes) data from batch into blocks of spatial data, followed by
+// cropping. This is the reverse transformation of SpaceToBatch. More specifically,
+// this op outputs a copy of the input tensor where values from the `batch`
+// dimension are moved in spatial blocks to the `height` and `width` dimensions,
+// followed by cropping along the `height` and `width` dimensions.
 //
-// Reshaping does not affect the order of values in the SparseTensor.
+// Arguments:
+//	input: 4-D tensor with shape
+// `[batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
+//   depth]`. Note that the batch size of the input tensor must be divisible by
+// `block_size * block_size`.
+//	crops: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
+// how many elements to crop from the intermediate result across the spatial
+// dimensions as follows:
 //
-// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
-// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
-// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
-// `output_shape` has length `R_out`.
+//     crops = [[crop_top, crop_bottom], [crop_left, crop_right]]
 //
-// Arguments:
-//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
-// SparseTensor.
-//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
-//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
 //
-// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
-// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
-// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
-// filled in.
-func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
+// Returns 4-D with shape `[batch, height, width, depth]`, where:
+//
+//       height = height_pad - crop_top - crop_bottom
+//       width = width_pad - crop_left - crop_right
+//
+// The attr `block_size` must be greater than one. It indicates the block size.
+//
+// Some examples:
+//
+// (1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
+//
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [2]], [[3], [4]]]]
+// ```
+//
+// (2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
+//
+// ```
+// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 3]` and value:
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// (3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[1, 4, 4, 1]` and value:
+//
+// ```
+// x = [[[1],   [2],  [3],  [4]],
+//      [[5],   [6],  [7],  [8]],
+//      [[9],  [10], [11],  [12]],
+//      [[13], [14], [15],  [16]]]
+// ```
+//
+// (4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
+//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[2, 2, 4, 1]` and value:
+//
+// ```
+// x = [[[[1], [3]], [[5], [7]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"block_size": block_size}
 	opspec := tf.OpSpec{
-		Type: "SparseReshape",
+		Type: "BatchToSpace",
 		Input: []tf.Input{
-			input_indices, input_shape, new_shape,
+			input, crops,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Computes the product along segments of a tensor.
+// SparseToDenseAttr is an optional argument to SparseToDense.
+type SparseToDenseAttr func(optionalAttr)
+
+// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// value: If true, indices are checked to make sure they are sorted in
+// lexicographic order and that there are no repeats.
+// If not specified, defaults to true
+func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Converts a sparse representation into a dense tensor.
 //
-// Computes a tensor such that
-// \\(output_i = \prod_j data_j\\) where the product is over `j` such
-// that `segment_ids[j] == i`.
+// Builds an array `dense` with shape `output_shape` such that
 //
-// If the product is empty for a given segment ID `i`, `output[i] = 1`.
+// ```
+// # If sparse_indices is scalar
+// dense[i] = (i == sparse_indices ? sparse_values : default_value)
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
-// </div>
+// # If sparse_indices is a vector, then for each i
+// dense[sparse_indices[i]] = sparse_values[i]
 //
-// Arguments:
+// # If sparse_indices is an n by d matrix, then for each i in [0, n)
+// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
+// ```
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
+// scalar, all sparse indices are set to this single value.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Indices should be sorted in lexicographic order, and indices must not
+// contain any repeats. If `validate_indices` is true, these properties
+// are checked during execution.
+//
+// Arguments:
+//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
+// index where `sparse_values[i]` will be placed.
+//	output_shape: 1-D.  Shape of the dense output tensor.
+//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
+// or a scalar value to be used for all sparse indices.
+//	default_value: Scalar value to set for indices not specified in
+// `sparse_indices`.
+//
+// Returns Dense output tensor of shape `output_shape`.
+func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SegmentProd",
+		Type: "SparseToDense",
 		Input: []tf.Input{
-			data, segment_ids,
+			sparse_indices, output_shape, sparse_values, default_value,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// NthElementAttr is an optional argument to NthElement.
+type NthElementAttr func(optionalAttr)
+
+// NthElementReverse sets the optional reverse attribute to value.
 //
-// Computes a tensor such that
-// `(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such
-// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
-// need not be sorted and need not cover all values in the full
-// range of valid values.
+// value: When set to True, find the nth-largest value in the vector and vice
+// versa.
+// If not specified, defaults to false
+func NthElementReverse(value bool) NthElementAttr {
+	return func(m optionalAttr) {
+		m["reverse"] = value
+	}
+}
+
+// Finds values of the `n`-th order statistic for the last dimension.
 //
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+// If the input is a vector (rank-1), finds the entries which is the nth-smallest
+// value in the vector and outputs their values as scalar tensor.
 //
-// `num_segments` should equal the number of distinct segment IDs.
+// For matrices (resp. higher rank input), computes the entries which is the
+// nth-smallest value in each row (resp. vector along the last dimension). Thus,
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
-// </div>
+//     values.shape = input.shape[:-1]
 //
 // Arguments:
+//	input: 1-D or higher with last dimension at least `n+1`.
+//	n: 0-D. Position of sorted vector to select along the last dimension (along
+// each row for matrices). Valid range of n is `[0, input.shape[:-1])`
 //
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
-//
-//
-// Returns Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Returns The `n`-th order statistic along each last dimensional slice.
+func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentSum",
+		Type: "NthElement",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			input, n,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes hyperbolic sine of x element-wise.
-func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes asin of x element-wise.
+func Asin(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sinh",
+		Type: "Asin",
 		Input: []tf.Input{
 			x,
 		},
@@ -25882,83 +26955,49 @@ func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
 
 // Computes the sum along sparse segments of a tensor.
 //
+// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
+//
 // Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
 // segments.
 //
-// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
-//
 // For example:
 //
 // ```python
 // c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
 //
-// # Select two rows, one segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
-// # => [[0 0 0 0]]
-//
-// # Select two rows, two segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
-// # => [[ 1  2  3  4]
-// #     [-1 -2 -3 -4]]
-//
-// # Select all rows, two segments.
-// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
+// tf.sparse_segment_sum_with_num_segments(
+//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
 // # => [[0 0 0 0]
-// #     [5 6 7 8]]
+// #     [0 0 0 0]
+// #     [0 0 0 0]]
 //
-// # Which is equivalent to:
-// tf.segment_sum(c, tf.constant([0, 0, 1]))
+// tf.sparse_segment_sum_with_num_segments(c,
+//                                         tf.constant([0, 1]),
+//                                         tf.constant([0, 2],
+//                                         num_segments=4))
+// # => [[ 1  2  3  4]
+// #     [ 0  0  0  0]
+// #     [-1 -2 -3 -4]
+// #     [ 0  0  0  0]]
 // ```
 //
 // Arguments:
 //
 //	indices: A 1-D tensor. Has same rank as `segment_ids`.
 //	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
 //
 // Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSegmentSum",
-		Input: []tf.Input{
-			data, indices, segment_ids,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Counts the number of occurrences of each value in an integer array.
-//
-// Outputs a vector with length `size` and the same dtype as `weights`. If
-// `weights` are empty, then index `i` stores the number of times the value `i` is
-// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
-// the value in `weights` at each index where the corresponding value in `arr` is
-// `i`.
-//
-// Values in `arr` outside of the range [0, size) are ignored.
-//
-// Arguments:
-//	arr: int32 `Tensor`.
-//	size: non-negative int32 scalar `Tensor`.
-//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
-// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
-// equal to 1.
-//
-// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
-// each value in the range [0, size).
-func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
+// has size `num_segments`.
+func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Bincount",
+		Type: "SparseSegmentSumWithNumSegments",
 		Input: []tf.Input{
-			arr, size, weights,
+			data, indices, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
@@ -26018,47 +27057,109 @@ func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr {
 // To use, enqueue filenames in a Queue.  The output of ReaderRead will
 // be a filename (key) and the contents of that file (value).
 //
-// Returns The handle to reference the Reader.
-func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
+// Returns The handle to reference the Reader.
+func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "WholeFileReaderV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the mean along sparse segments of a tensor.
+//
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentMean",
+		Input: []tf.Input{
+			data, indices, segment_ids,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the mean along sparse segments of a tensor.
+//
+// Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
+//
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
+//
+// Returns Has same shape as data, except for dimension 0 which has size
+// `num_segments`.
+func SparseSegmentMeanWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "WholeFileReaderV2",
-
-		Attrs: attrs,
+		Type: "SparseSegmentMeanWithNumSegments",
+		Input: []tf.Input{
+			data, indices, segment_ids, num_segments,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the mean along sparse segments of a tensor.
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
+//
+// N is the size of the segment being reduced.
+//
+// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
 //
 // Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
 // segments.
 //
-// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
-//
 // Arguments:
 //
 //	indices: A 1-D tensor. Has same rank as `segment_ids`.
 //	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
 //
 // Returns Has same shape as data, except for dimension 0 which
 // has size `k`, the number of segments.
-func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMean",
+		Type: "SparseSegmentSqrtNWithNumSegments",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			data, indices, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
@@ -26993,21 +28094,40 @@ func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_pa
 	return outputs
 }
 
-// Serialize a `SparseTensor` into a string 3-vector (1-D `Tensor`) object.
+// SerializeSparseAttr is an optional argument to SerializeSparse.
+type SerializeSparseAttr func(optionalAttr)
+
+// SerializeSparseOutType sets the optional out_type attribute to value.
+//
+// value: The `dtype` to use for serialization; the supported types are `string`
+// (default) and `variant`.
+// If not specified, defaults to DT_STRING
+func SerializeSparseOutType(value tf.DataType) SerializeSparseAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Serialize a `SparseTensor` into a `[3]` `Tensor` object.
 //
 // Arguments:
 //	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
 //	sparse_values: 1-D.  The `values` of the `SparseTensor`.
 //	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
-func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) (serialized_sparse tf.Output) {
+func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeSparseAttr) (serialized_sparse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
 		Type: "SerializeSparse",
 		Input: []tf.Input{
 			sparse_indices, sparse_values, sparse_shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -27033,223 +28153,3 @@ func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, val
 	}
 	return scope.AddOperation(opspec)
 }
-
-// FakeQuantWithMinMaxVarsGradientAttr is an optional argument to FakeQuantWithMinMaxVarsGradient.
-type FakeQuantWithMinMaxVarsGradientAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxVarsGradientNumBits sets the optional num_bits attribute to value.
-//
-// value: The bitwidth of the quantization; between 2 and 8, inclusive.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxVarsGradientNumBits(value int64) FakeQuantWithMinMaxVarsGradientAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
-
-// FakeQuantWithMinMaxVarsGradientNarrowRange sets the optional narrow_range attribute to value.
-//
-// value: Whether to quantize into 2^num_bits - 1 distinct values.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxVarsGradientNarrowRange(value bool) FakeQuantWithMinMaxVarsGradientAttr {
-	return func(m optionalAttr) {
-		m["narrow_range"] = value
-	}
-}
-
-// Compute gradients for a FakeQuantWithMinMaxVars operation.
-//
-// Arguments:
-//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation.
-//	inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation.
-// min, max: Quantization interval, scalar floats.
-//
-//
-//
-// Returns Backpropagated gradients w.r.t. inputs:
-// `gradients * (inputs >= min && inputs <= max)`.Backpropagated gradients w.r.t. min parameter:
-// `sum(gradients * (inputs < min))`.Backpropagated gradients w.r.t. max parameter:
-// `sum(gradients * (inputs > max))`.
-func FakeQuantWithMinMaxVarsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsGradientAttr) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxVarsGradient",
-		Input: []tf.Input{
-			gradients, inputs, min, max,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// MaxPoolGradV2Attr is an optional argument to MaxPoolGradV2.
-type MaxPoolGradV2Attr func(optionalAttr)
-
-// MaxPoolGradV2DataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradV2DataFormat(value string) MaxPoolGradV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of the maxpooling function.
-//
-// Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns Gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MaxPoolGradV2",
-		Input: []tf.Input{
-			orig_input, orig_output, grad, ksize, strides,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
-//
-// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Minimum",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that asynchronously prefetches elements from `input_dataset`.
-//
-// Arguments:
-//
-//	buffer_size: The maximum number of elements to buffer in an iterator over
-// this dataset.
-//
-//
-func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "PrefetchDataset",
-		Input: []tf.Input{
-			input_dataset, buffer_size,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Outputs a `Summary` protocol buffer with a tensor and per-plugin data.
-//
-// Arguments:
-//	tag: A string attached to this summary. Used for organization in TensorBoard.
-//	tensor: A tensor to serialize.
-//	serialized_summary_metadata: A serialized SummaryMetadata proto. Contains plugin
-// data.
-func TensorSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, serialized_summary_metadata tf.Output) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorSummaryV2",
-		Input: []tf.Input{
-			tag, tensor, serialized_summary_metadata,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
-type AudioSummaryV2Attr func(optionalAttr)
-
-// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
-//
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
-//
-// REQUIRES: value >= 1
-func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
-	return func(m optionalAttr) {
-		m["max_outputs"] = value
-	}
-}
-
-// Outputs a `Summary` protocol buffer with audio.
-//
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
-//
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
-//
-// Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
-//
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "AudioSummaryV2",
-		Input: []tf.Input{
-			tag, tensor, sample_rate,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
diff --git a/tensorflow/go/tensor.go b/tensorflow/go/tensor.go
index 1326a952787f207b16e48a838f37d4ca80b8f6d8..2d25c04dc9b1d0bc2ae831f98c0879e73a6bfafa 100644
--- a/tensorflow/go/tensor.go
+++ b/tensorflow/go/tensor.go
@@ -270,7 +270,7 @@ func typeOf(dt DataType, shape []int64) reflect.Type {
 		}
 	}
 	if ret == nil {
-		panic(bug("DataType %v is not supported", dt))
+		panic(bug("DataType %v is not supported (see https://www.tensorflow.org/code/tensorflow/core/framework/types.proto)", dt))
 	}
 	for range shape {
 		ret = reflect.SliceOf(ret)
@@ -328,6 +328,14 @@ func encodeTensor(w *bytes.Buffer, v reflect.Value, shape []int64) error {
 			}
 		}
 
+		// Optimisation: if only one dimension is left we can use binary.Write() directly for this slice
+		if len(shape) == 1 && v.Len() > 0 {
+			switch v.Index(0).Kind() {
+			case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128:
+				return binary.Write(w, nativeEndian, v.Interface())
+			}
+		}
+
 		subShape := shape[1:]
 		for i := 0; i < v.Len(); i++ {
 			err := encodeTensor(w, v.Index(i), subShape)
@@ -360,6 +368,15 @@ func decodeTensor(r *bytes.Reader, shape []int64, typ reflect.Type, ptr reflect.
 	case reflect.Slice:
 		val := reflect.Indirect(ptr)
 		val.Set(reflect.MakeSlice(typ, int(shape[0]), int(shape[0])))
+
+		// Optimization: if only one dimension is left we can use binary.Read() directly for this slice
+		if len(shape) == 1 && val.Len() > 0 {
+			switch val.Index(0).Kind() {
+			case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128:
+				return binary.Read(r, nativeEndian, val.Interface())
+			}
+		}
+
 		for i := 0; i < val.Len(); i++ {
 			if err := decodeTensor(r, shape[1:], typ.Elem(), val.Index(i).Addr()); err != nil {
 				return err
diff --git a/tensorflow/go/tensor_test.go b/tensorflow/go/tensor_test.go
index 674a8ce86f8d6e5e5733d045f1712cee242750d2..793c36dd4db28fc5fdb713095c6d1d6713367a7a 100644
--- a/tensorflow/go/tensor_test.go
+++ b/tensorflow/go/tensor_test.go
@@ -243,3 +243,23 @@ func BenchmarkNewTensor(b *testing.B) {
 	)
 	b.Run("[150528]", func(b *testing.B) { benchmarkNewTensor(b, vector) })
 }
+
+func benchmarkDecodeTensor(b *testing.B, t *Tensor) {
+	for i := 0; i < b.N; i++ {
+		_ = t.Value()
+	}
+}
+
+func BenchmarkDecodeTensor(b *testing.B) {
+	var (
+		// Some sample sizes from the Inception image labeling model.
+		// Where input tensors correspond to a 224x224 RGB image
+		// flattened into a vector.
+		vector [224 * 224 * 3]int32
+	)
+	t, err := NewTensor(vector)
+	if err != nil {
+		b.Fatalf("(%v, %v)", t, err)
+	}
+	b.Run("[150528]", func(b *testing.B) { benchmarkDecodeTensor(b, t) })
+}
diff --git a/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java b/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java
index 45e42878c770b3c19d96790e5b4bf2ed41a0de29..11fda4fc22aeec9c2d94b5e884c11ceb2a66d29e 100644
--- a/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java
+++ b/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java
@@ -77,7 +77,7 @@ public final class OperatorProcessor extends AbstractProcessor {
     TypeElement annotation = annotations.iterator().next();
     Set<? extends Element> annotated = roundEnv.getElementsAnnotatedWith(annotation);
 
-    // If there are no annotated elements, claim the annotion but do nothing.
+    // If there are no annotated elements, claim the annotation but do nothing.
     if (annotated.size() == 0) {
       return true;
     }
diff --git a/tensorflow/java/src/gen/perl/tftypes-runall.pl b/tensorflow/java/src/gen/perl/tftypes-runall.pl
index a451ce92aa272ece591e71f85d08bd08acd6430e..65fe3b150667e3a5ed73e6bbd7e9da74157b72d8 100644
--- a/tensorflow/java/src/gen/perl/tftypes-runall.pl
+++ b/tensorflow/java/src/gen/perl/tftypes-runall.pl
@@ -1,13 +1,13 @@
 #!/usr/bin/perl
 #
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/tensorflow/java/src/gen/perl/tftypes.pl b/tensorflow/java/src/gen/perl/tftypes.pl
index 115723ac8a8553966dc0906031c1962007ee6a82..c7c62e916f4860aa16503ae098eed9e90e9150e4 100644
--- a/tensorflow/java/src/gen/perl/tftypes.pl
+++ b/tensorflow/java/src/gen/perl/tftypes.pl
@@ -1,13 +1,13 @@
 #!/usr/bin/perl
 #
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -156,7 +156,7 @@ for (my $i = 1; $i <= $#info; $i++) {
                            ."   *  String elements are sequences of bytes from the last array dimension.\n";
             }
 
-    
+
             my $intro = ($trank > 0)
                 ?  "Creates a rank-$trank tensor of {\@code $jtype} elements."
                 :  "Creates a scalar tensor containing a single {\@code $jtype} element.";
diff --git a/tensorflow/java/src/gen/resources/Tensors.java.tmpl b/tensorflow/java/src/gen/resources/Tensors.java.tmpl
index 98e15885594ed4dd06201a7252817cb66d8bc0ba..e615524c8e59f056b1aeac322afdff1739cd90bf 100644
--- a/tensorflow/java/src/gen/resources/Tensors.java.tmpl
+++ b/tensorflow/java/src/gen/resources/Tensors.java.tmpl
@@ -11,7 +11,7 @@ public final class Tensors {
   private Tensors() {}
 
   /** Creates a scalar String tensor using the default, UTF-8 encoding.
-   * 
+   *
    *  @param data  The string to put into the new scalar tensor.
    */
   public static Tensor<String> create(String data) {
@@ -19,7 +19,7 @@ public final class Tensors {
   }
 
   /** Creates a scalar String tensor using a specified encoding.
-   * 
+   *
    *  @param charset The encoding from String to bytes.
    *  @param data    The string to put into the new scalar tensor.
    */
diff --git a/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java b/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java
index beb3635585c33f5a3942e4f7d44ac597daf8ff72..a24150484e83dcccf3e1869155569431969b74cf 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java
@@ -352,7 +352,8 @@ public final class OperationBuilder {
 
   private static native void setAttrShape(long handle, String name, long[] shape, int numDims);
 
-  private static native void setAttrShapeList(long handle, String name, long[] shapes, int[] numDims);
+  private static native void setAttrShapeList(
+      long handle, String name, long[] shapes, int[] numDims);
 
   private static native void setAttrStringList(long handle, String name, Object[] value);
 }
diff --git a/tensorflow/java/src/main/native/operation_builder_jni.cc b/tensorflow/java/src/main/native/operation_builder_jni.cc
index 71a451ad1309659a9f96d9b9eedf60a8b3fd9683..55d214a7c4b81a01e48121214e91397626652f11 100644
--- a/tensorflow/java/src/main/native/operation_builder_jni.cc
+++ b/tensorflow/java/src/main/native/operation_builder_jni.cc
@@ -275,15 +275,15 @@ JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrShapeList(
   if (num_dims_length > 0) {
     const int shapes_length = env->GetArrayLength(shapes);
     cshapes.reset(new int64_t[shapes_length]);
-    cdims.reset(new int64_t* [num_dims_length]);
+    cdims.reset(new int64_t*[num_dims_length]);
     cnum_dims.reset(new int[num_dims_length]);
     jlong* shapes_elems =
-        (jlong*) env->GetPrimitiveArrayCritical(shapes, nullptr);
+        static_cast<jlong*>(env->GetPrimitiveArrayCritical(shapes, nullptr));
     std::memcpy(cshapes.get(), shapes_elems, shapes_length << 3);
     env->ReleasePrimitiveArrayCritical(shapes, shapes_elems, JNI_ABORT);
     int64_t* cshapes_ptr = cshapes.get();
     jint* num_dims_elems =
-        (jint*) env->GetPrimitiveArrayCritical(num_dims, nullptr);
+        static_cast<jint*>(env->GetPrimitiveArrayCritical(num_dims, nullptr));
     for (int i = 0; i < num_dims_length; ++i) {
       cnum_dims[i] = static_cast<int>(num_dims_elems[i]);
       cdims[i] = cshapes_ptr;
diff --git a/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java b/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java
index 2430816725abdd664cd016cdfefa6c94b3d0b9b1..0a4a8cf4e3f65311ba887b4d47bc79080bfd5382 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java
@@ -151,10 +151,10 @@ public class OperationBuilderTest {
   @Test
   public void setAttrShapeList() {
     // Those shapes match tensors ones, so no exception is thrown
-    testSetAttrShapeList(new Shape[] { Shape.make(2, 2), Shape.make(2, 2, 2) });
+    testSetAttrShapeList(new Shape[] {Shape.make(2, 2), Shape.make(2, 2, 2)});
     try {
       // Those shapes do not match tensors ones, exception is thrown
-      testSetAttrShapeList(new Shape[] { Shape.make(2, 2), Shape.make(2, 2, 2, 2) });
+      testSetAttrShapeList(new Shape[] {Shape.make(2, 2), Shape.make(2, 2, 2, 2)});
       fail("Shapes are incompatible and an exception was expected");
     } catch (IllegalArgumentException e) {
       // expected
@@ -189,20 +189,23 @@ public class OperationBuilderTest {
   }
 
   private static void testSetAttrShapeList(Shape[] shapes) {
-    try (Graph g = new Graph(); Session s = new Session(g)) {
-      int[][] matrix = new int[][] { { 0, 0 }, { 0, 0 } };
-      Output<?> queue = g.opBuilder("FIFOQueue", "queue")
-          .setAttr("component_types", new DataType[] { DataType.INT32, DataType.INT32 }) 
-          .setAttr("shapes", shapes)
-          .build()
-          .output(0);
+    try (Graph g = new Graph();
+        Session s = new Session(g)) {
+      int[][] matrix = new int[][] {{0, 0}, {0, 0}};
+      Output<?> queue =
+          g.opBuilder("FIFOQueue", "queue")
+              .setAttr("component_types", new DataType[] {DataType.INT32, DataType.INT32})
+              .setAttr("shapes", shapes)
+              .build()
+              .output(0);
       assertTrue(hasNode(g, "queue"));
       Output<Integer> c1 = TestUtil.constant(g, "const1", matrix);
-      Output<Integer> c2 = TestUtil.constant(g, "const2", new int[][][] { matrix, matrix });
-      Operation enqueue = g.opBuilder("QueueEnqueue", "enqueue")
-          .addInput(queue)
-          .addInputList(new Output<?>[] { c1, c2 })
-          .build();
+      Output<Integer> c2 = TestUtil.constant(g, "const2", new int[][][] {matrix, matrix});
+      Operation enqueue =
+          g.opBuilder("QueueEnqueue", "enqueue")
+              .addInput(queue)
+              .addInputList(new Output<?>[] {c1, c2})
+              .build();
       assertTrue(hasNode(g, "enqueue"));
 
       s.runner().addTarget(enqueue).run();
diff --git a/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java b/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java
index a86b4dd117ede64d2b105ceb189220a5dd5d9740..e8cc76c2a6458193161a98e17483fe73de107b77 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java
@@ -151,7 +151,7 @@ public class SessionTest {
       s.close();
       try {
         s.runner().run();
-        fail("methods on a close()d session should fail");
+        fail("methods on a session should fail after close() is called");
       } catch (IllegalStateException e) {
         // expected exception
       }
diff --git a/tensorflow/java/src/test/java/org/tensorflow/ShapeTest.java b/tensorflow/java/src/test/java/org/tensorflow/ShapeTest.java
index 92cc3bd60e2de0f7e936cb9e0d0ba9da56d6f811..313c09e1e40a9bf4e79933ff2a9ca1d3ce58473e 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/ShapeTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/ShapeTest.java
@@ -84,11 +84,10 @@ public class ShapeTest {
     assertEquals(Shape.scalar(), Shape.scalar());
     assertEquals(Shape.make(1, 2, 3), Shape.make(1, 2, 3));
 
-    assertNotEquals(Shape.make(1,2), null);
-    assertNotEquals(Shape.make(1,2), new Object());
+    assertNotEquals(Shape.make(1, 2), null);
+    assertNotEquals(Shape.make(1, 2), new Object());
     assertNotEquals(Shape.make(1, 2, 3), Shape.make(1, 2, 4));
 
-
     assertNotEquals(Shape.unknown(), Shape.unknown());
     assertNotEquals(Shape.make(-1), Shape.make(-1));
     assertNotEquals(Shape.make(1, -1, 3), Shape.make(1, -1, 3));
@@ -103,4 +102,3 @@ public class ShapeTest {
     assertNotEquals(Shape.make(1, 2).hashCode(), Shape.make(1, 3).hashCode());
   }
 }
-
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index a438768809824e91c476ac78249d3a40129d9578..a40b87e84b13bc907a8f4b0b660f8fe9f96d8457 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -171,7 +171,10 @@ tf_py_test(
     name = "flags_test",
     size = "small",
     srcs = ["platform/flags_test.py"],
-    additional_deps = [":platform"],
+    additional_deps = [
+        ":client_testlib",
+        ":platform",
+    ],
 )
 
 tf_py_test(
@@ -179,10 +182,7 @@ tf_py_test(
     size = "small",
     srcs = ["platform/app_test.py"],
     additional_deps = [":platform"],
-    tags = [
-        "manual",
-        "notap",
-    ],
+    tags = ["notap"],
 )
 
 cc_library(
@@ -207,11 +207,11 @@ cc_library(
     srcs = ["grappler/model_analyzer.cc"],
     hdrs = ["grappler/model_analyzer.h"],
     deps = [
+        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler/costs:graph_properties",
-        "//tensorflow/core/grappler/costs:utils",
     ],
 )
 
@@ -227,11 +227,25 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "bfloat16_lib",
+    srcs = ["lib/core/bfloat16.cc"],
+    hdrs = ["lib/core/bfloat16.h"],
+    deps = [
+        ":numpy_lib",
+        ":safe_ptr",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//util/python:python_headers",
+    ],
+)
+
 cc_library(
     name = "ndarray_tensor_bridge",
     srcs = ["lib/core/ndarray_tensor_bridge.cc"],
     hdrs = ["lib/core/ndarray_tensor_bridge.h"],
     deps = [
+        ":bfloat16_lib",
         ":numpy_lib",
         "//tensorflow/c:c_api",
         "//tensorflow/core:lib",
@@ -268,10 +282,15 @@ cc_library(
     deps = [
         ":ndarray_tensor_bridge",
         ":numpy_lib",
+        ":py_util",
+        ":safe_ptr",
+        "//tensorflow/c:tf_status_helper",
+        "//tensorflow/c/eager:c_api",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:script_ops_op_lib",
+        "//tensorflow/python/eager:pywrap_tfe_lib",
         "//third_party/py/numpy:headers",
         "//util/python:python_headers",
     ],
@@ -293,6 +312,7 @@ cc_library(
     srcs = ["lib/core/ndarray_tensor.cc"],
     hdrs = ["lib/core/ndarray_tensor.h"],
     deps = [
+        ":bfloat16_lib",
         ":ndarray_tensor_bridge",
         ":numpy_lib",
         ":safe_ptr",
@@ -309,6 +329,7 @@ cc_library(
     hdrs = ["lib/core/py_seq_tensor.h"],
     deps = [
         ":numpy_lib",
+        ":py_util",
         ":safe_ptr",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -316,6 +337,17 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "py_util",
+    srcs = ["lib/core/py_util.cc"],
+    hdrs = ["lib/core/py_util.h"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:script_ops_op_lib",
+        "//util/python:python_headers",
+    ],
+)
+
 cc_library(
     name = "py_record_reader_lib",
     srcs = ["lib/io/py_record_reader.cc"],
@@ -586,6 +618,7 @@ py_library(
     srcs = ["framework/dtypes.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":pywrap_tensorflow",
         "//tensorflow/core:protos_all_py",
     ],
 )
@@ -676,6 +709,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":c_api_util",
+        ":control_flow_util",
         ":device",
         ":dtypes",
         ":op_def_registry",
@@ -766,15 +800,23 @@ py_library(
     srcs = ["framework/test_util.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":array_ops",
         ":client",
         ":errors",
-        ":framework",
         ":framework_for_generated_wrappers",
         ":platform",
         ":platform_test",
         ":pywrap_tensorflow",
+        ":random_seed",
+        ":resource_variable_ops",
+        ":session",
         ":training",
         ":util",
+        ":variables",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:tape",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -1074,6 +1116,7 @@ py_test(
         ":variables",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:function",
     ],
 )
 
@@ -1180,6 +1223,11 @@ py_test(
         ":framework_test_lib",
         ":platform_test",
         ":random_ops",
+        ":resource_variable_ops",
+        ":session",
+        ":variables",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
     ],
 )
@@ -1258,7 +1306,10 @@ tf_gen_op_wrapper_private_py(
 
 tf_gen_op_wrapper_private_py(
     name = "control_flow_ops_gen",
-    visibility = ["//learning/brain/python/ops:__pkg__"],
+    visibility = [
+        "//learning/brain/python/ops:__pkg__",
+        "//tensorflow/python/kernel_tests:__pkg__",
+    ],
     deps = [
         "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core:no_op_op_lib",
@@ -1526,6 +1577,7 @@ py_library(
     deps = [
         ":control_flow_ops",
         ":control_flow_ops_gen",
+        ":control_flow_util",
         ":framework",
         ":framework_for_generated_wrappers",
         ":math_ops",
@@ -1542,6 +1594,7 @@ py_library(
         ":array_ops_gen",
         ":constant_op",
         ":control_flow_ops_gen",
+        ":control_flow_util",
         ":data_flow_ops_gen",
         ":dtypes",
         ":framework_ops",
@@ -1557,6 +1610,15 @@ py_library(
     ],
 )
 
+py_library(
+    name = "control_flow_util",
+    srcs = ["ops/control_flow_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":platform",
+    ],
+)
+
 py_library(
     name = "ctc_ops",
     srcs = ["ops/ctc_ops.py"],
@@ -1630,6 +1692,7 @@ py_library(
         ":bitwise_ops",
         ":control_flow_grad",
         ":control_flow_ops",
+        ":control_flow_util",
         ":framework",
         ":framework_for_generated_wrappers",
         ":functional_ops",
@@ -1982,6 +2045,7 @@ py_library(
     deps = [
         ":array_ops",
         ":control_flow_ops",
+        ":control_flow_util",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":rnn_cell",
@@ -2581,7 +2645,7 @@ cuda_py_test(
         ":nn_grad",
         "//third_party/py/numpy",
     ],
-    shard_count = 4,
+    shard_count = 16,
 )
 
 cuda_py_test(
@@ -2972,6 +3036,7 @@ tf_cuda_library(
         "//tensorflow/core:direct_session",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//third_party/py/numpy:headers",
@@ -3007,6 +3072,7 @@ tf_py_wrap_cc(
         "grappler/item.i",
         "grappler/model_analyzer.i",
         "grappler/tf_optimizer.i",
+        "lib/core/bfloat16.i",
         "lib/core/py_func.i",
         "lib/core/strings.i",
         "lib/io/file_io.i",
@@ -3025,6 +3091,7 @@ tf_py_wrap_cc(
         "util/util.i",
     ],
     deps = [
+        ":bfloat16_lib",
         ":cost_analyzer_lib",
         ":model_analyzer_lib",
         ":cpp_python_util",
@@ -3046,7 +3113,9 @@ tf_py_wrap_cc(
         "//tensorflow/core/distributed_runtime/rpc:grpc_session",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:grappler_item_builder",
+        "//tensorflow/core/grappler/clusters:cluster",
         "//tensorflow/core/grappler/clusters:single_machine",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
         "//tensorflow/core/grappler/costs:graph_memory",
         "//tensorflow/core/grappler/optimizers:meta_optimizer",
         "//tensorflow/core:lib",
@@ -3097,130 +3166,124 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "server_lib_test",
     size = "small",
     srcs = ["training/server_lib_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
         ":data_flow_ops",
         ":errors",
-        ":extra_py_tests_deps",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":training",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
     ],
+    grpc_enabled = True,
 )
 
-py_test(
+tf_py_test(
     name = "server_lib_multiple_containers_test",
     size = "small",
     srcs = ["training/server_lib_multiple_containers_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
         ":data_flow_ops",
         ":errors",
-        ":extra_py_tests_deps",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":training",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
     ],
+    grpc_enabled = True,
 )
 
-py_test(
+tf_py_test(
     name = "server_lib_same_variables_clear_container_test",
     size = "small",
     srcs = ["training/server_lib_same_variables_clear_container_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
         ":data_flow_ops",
         ":errors",
-        ":extra_py_tests_deps",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":training",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
     ],
+    grpc_enabled = True,
 )
 
-py_test(
+tf_py_test(
     name = "server_lib_same_variables_clear_test",
     size = "small",
     srcs = ["training/server_lib_same_variables_clear_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
         ":data_flow_ops",
         ":errors",
-        ":extra_py_tests_deps",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":training",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
     ],
+    grpc_enabled = True,
 )
 
-py_test(
+tf_py_test(
     name = "server_lib_same_variables_no_clear_test",
     size = "small",
     srcs = ["training/server_lib_same_variables_no_clear_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
         ":data_flow_ops",
         ":errors",
-        ":extra_py_tests_deps",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":training",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
     ],
+    grpc_enabled = True,
 )
 
-py_test(
+tf_py_test(
     name = "server_lib_sparse_job_test",
     size = "small",
     srcs = ["training/server_lib_sparse_job_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
         ":data_flow_ops",
         ":errors",
-        ":extra_py_tests_deps",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":training",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
     ],
+    grpc_enabled = True,
 )
 
 cuda_py_test(
@@ -3240,6 +3303,7 @@ cuda_py_test(
         ":variables",
         "//third_party/py/numpy",
     ],
+    grpc_enabled = True,
     tags = [
         "no_oss",  # Test flaky due to port collisions.
         "oss_serial",
@@ -3258,6 +3322,7 @@ tf_py_test(
         ":training",
         ":variables",
     ],
+    grpc_enabled = True,
     tags = [
         "no_oss",  # Test flaky due to port collisions.
         "notsan",  # data race due to b/62910646
@@ -3288,17 +3353,11 @@ tf_cuda_library(
     alwayslink = 1,
 )
 
-py_test(
+tf_py_test(
     name = "session_test",
     size = "small",
     srcs = ["client/session_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_gpu",
-        "no_pip_gpu",  # testInteractivePlacePrunedGraph fails on invalid assumption about GPU ops.
-        "no_windows",
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":control_flow_ops",
@@ -3316,20 +3375,19 @@ py_test(
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
+    grpc_enabled = True,
+    tags = [
+        "no_gpu",
+        "no_pip_gpu",  # testInteractivePlacePrunedGraph fails on invalid assumption about GPU ops.
+        "no_windows",
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "session_clusterspec_prop_test",
     size = "small",
     srcs = ["client/session_clusterspec_prop_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_gpu",
-        "no_oss",
-        "no_pip_gpu",
-        "notap",
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
@@ -3344,36 +3402,40 @@ py_test(
         ":variables",
         "//third_party/py/numpy",
     ],
+    grpc_enabled = True,
+    tags = [
+        "no_gpu",
+        "no_oss",
+        "no_pip",
+        "no_pip_gpu",
+        "notap",
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "session_list_devices_test",
     size = "small",
     srcs = ["client/session_list_devices_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_gpu",
-        "no_pip_gpu",
-        "notsan",  # data race due to b/62910646
-    ],
-    deps = [
+    additional_deps = [
         ":client",
         ":framework",
         ":framework_test_lib",
         ":platform_test",
         ":training",
     ],
+    grpc_enabled = True,
+    tags = [
+        "no_gpu",
+        "no_pip_gpu",
+        "notsan",  # data race due to b/62910646
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "session_partial_run_test",
     size = "small",
     srcs = ["client/session_partial_run_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_gpu",
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":errors",
@@ -3386,6 +3448,11 @@ py_test(
         ":util",
         "@six_archive//:six",
     ],
+    grpc_enabled = True,
+    tags = [
+        "no_gpu",
+        "no_windows",
+    ],
 )
 
 cuda_py_test(
@@ -3401,6 +3468,19 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "virtual_gpu_test",
+    size = "small",
+    srcs = ["client/virtual_gpu_test.py"],
+    additional_deps = [
+        ":client",
+        ":client_testlib",
+        ":framework_for_generated_wrappers",
+        ":math_ops",
+        "//tensorflow/core:protos_all_py",
+    ],
+)
+
 py_test(
     name = "graph_util_test",
     size = "small",
@@ -3418,6 +3498,18 @@ py_test(
     ],
 )
 
+py_test(
+    name = "bfloat16_test",
+    size = "small",
+    srcs = ["lib/core/bfloat16_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":client_testlib",
+        ":lib",
+        ":pywrap_tensorflow",
+    ],
+)
+
 py_test(
     name = "file_io_test",
     size = "small",
@@ -3571,7 +3663,9 @@ cuda_py_test(
         "//third_party/py/numpy",
         "@six_archive//:six",
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/data/ops:dataset_ops",
     ],
+    tags = ["multi_gpu"],
 )
 
 py_test(
@@ -3628,20 +3722,18 @@ cuda_py_test(
         ":training",
         ":variables",
     ],
+    grpc_enabled = True,
     main = "training/session_manager_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "supervisor_test",
     size = "small",
     srcs = ["training/supervisor_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":errors",
-        ":extra_py_tests_deps",
         ":framework",
         ":framework_for_generated_wrappers",
         ":io_ops",
@@ -3652,6 +3744,8 @@ py_test(
         ":variables",
         "//tensorflow/core:protos_all_py",
     ],
+    grpc_enabled = True,
+    tags = ["no_windows"],
 )
 
 py_test(
@@ -4265,6 +4359,7 @@ cuda_py_test(
         ":variables",
         "//third_party/py/numpy",
     ],
+    grpc_enabled = True,
     main = "client/session_benchmark.py",
 )
 
@@ -4348,7 +4443,10 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
-    deps = [":pywrap_tensorflow_internal"],
+    deps = [
+        ":pywrap_tensorflow_internal",
+        ":tf_cluster",
+    ],
 )
 
 py_test(
@@ -4412,6 +4510,7 @@ cuda_py_test(
         ":nn",
         ":ops",
         ":random_ops",
+        ":tf_cluster",
         ":tf_optimizer",
         ":training",
         "//third_party/py/numpy",
@@ -4426,7 +4525,11 @@ py_library(
         "grappler/cost_analyzer.py",
     ],
     srcs_version = "PY2AND3",
-    deps = [":pywrap_tensorflow_internal"],
+    deps = [
+        ":pywrap_tensorflow_internal",
+        ":tf_cluster",
+        ":tf_item",
+    ],
 )
 
 py_binary(
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index af34aca3e345ff6d12f471f289b77001b40c00bf..bc9ddec2a54a784027120828e9b15a2bf500414e 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -263,6 +263,7 @@ _allowed_symbols.extend([
     'GIT_VERSION',
     'COMPILER_VERSION',
     'CXX11_ABI_FLAG',
+    'MONOLITHIC_BUILD',
 ])
 
 # Remove all extra symbols that don't have a docstring or are not explicitly
@@ -282,6 +283,7 @@ _exported_dunders = set([
     '__git_version__',
     '__compiler_version__',
     '__cxx11_abi_flag__',
+    '__monolithic_build__',
 ])
 
 # Expose symbols minus dunders, unless they are whitelisted above.
diff --git a/tensorflow/python/build_defs.bzl b/tensorflow/python/build_defs.bzl
index 2d8625933f9ea4ab3bedf8d3157430d821f3e584..48b03fab0fd639768b3e8bcfcb38429c1e536ecc 100644
--- a/tensorflow/python/build_defs.bzl
+++ b/tensorflow/python/build_defs.bzl
@@ -27,4 +27,8 @@ def tf_gen_op_wrapper_private_py(name, out=None, deps=[],
     deps=deps,
     require_shape_functions=require_shape_functions,
     generated_target_name=name,
+    api_def_srcs = [
+        "//tensorflow/core:base_api_def",
+        "//tensorflow/core:python_api_def",
+    ],
   )
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index 759c36ad72e922671288b0d57fe9e442b915c144..1481a4d035cbc63aa655be6c4d441e6f6741e118 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -126,6 +126,12 @@ _REGISTERED_EXPANSIONS = [
      lambda feed: [feed])]
 # pylint: enable=g-long-lambda
 
+
+def _convert_to_numpy_obj(numpy_dtype, obj):
+  """Explicitly convert obj based on numpy type except for string type."""
+  return numpy_dtype(obj) if numpy_dtype is not object else str(obj)
+
+
 def register_session_run_conversion_functions(tensor_type, fetch_function,
     feed_function=None, feed_function_for_partial_run=None):
   """Register fetch and feed conversion functions for `tf.Session.run()`.
@@ -1072,12 +1078,14 @@ class BaseSession(SessionInterface):
                             'strings, lists, numpy ndarrays, or TensorHandles.')
 
           subfeed_dtype = subfeed_t.dtype.as_numpy_dtype
-          if isinstance(subfeed_val,
-                        int) and subfeed_dtype(subfeed_val) != subfeed_val:
+          if isinstance(subfeed_val, int) and _convert_to_numpy_obj(
+              subfeed_dtype, subfeed_val) != subfeed_val:
             raise TypeError(
-                'Type of feed value ' + str(subfeed_val) + ' is not'
-                ' compatible with Tensor type ' + str(subfeed_dtype) + '.'
-                ' Try explicitly setting the type of the feed tensor'
+                'Type of feed value ' + str(subfeed_val) + ' with type ' +
+                str(type(subfeed_val)) +
+                ' is not compatible with Tensor type ' +
+                str(subfeed_dtype) +
+                '. Try explicitly setting the type of the feed tensor'
                 ' to a larger type (e.g. int64).')
 
           is_tensor_handle_feed = isinstance(subfeed_val,
@@ -1160,9 +1168,6 @@ class BaseSession(SessionInterface):
       TypeError: If `fetches` or `feed_list` cannot be interpreted
         as arguments to @{tf.Session.run}.
     """
-    assert not self._created_with_new_api, ('session.make_callable() doesn\'t '
-                                            'work with C API')
-
     if feed_list is not None:
       if not isinstance(feed_list, (list, tuple)):
         raise TypeError('`feed_list` must be a list or tuple.')
@@ -1184,12 +1189,18 @@ class BaseSession(SessionInterface):
 
     # Create a fetch handler to take care of the structure of fetches.
     fetch_handler = _FetchHandler(self._graph, fetches, {})
-    fetch_list_as_strings = _name_list(fetch_handler.fetches())
-    target_list_as_strings = _name_list(fetch_handler.targets())
+    if self._created_with_new_api:
+      # pylint: disable=protected-access
+      fetch_list = [t._as_tf_output() for t in fetch_handler.fetches()]
+      target_list = [op._c_op for op in fetch_handler.targets()]
+      # pylint: enable=protected-access
+    else:
+      fetch_list = _name_list(fetch_handler.fetches())
+      target_list = _name_list(fetch_handler.targets())
 
     def _callable_template_with_options_and_metadata(
-        fetch_list_as_strings,
-        target_list_as_strings,
+        fetch_list,
+        target_list,
         fetch_handler,
         options=None,
         run_metadata=None):
@@ -1199,9 +1210,14 @@ class BaseSession(SessionInterface):
       run_metadata_ptr = tf_session.TF_NewBuffer() if run_metadata else None
       try:
         with errors.raise_exception_on_not_ok_status() as status:
-          results = tf_session.TF_Run(
-              self._session, options_ptr, {}, fetch_list_as_strings,
-              target_list_as_strings, status, run_metadata_ptr)
+          if self._created_with_new_api:
+            results = tf_session.TF_SessionRun_wrapper(
+                self._session, options_ptr, {}, fetch_list, target_list,
+                run_metadata_ptr, status)
+          else:
+            results = tf_session.TF_Run(
+                self._session, options_ptr, {}, fetch_list, target_list, status,
+                run_metadata_ptr)
           if fetch_handler:
             results = fetch_handler.build_results(self, results)
           else:
@@ -1218,27 +1234,35 @@ class BaseSession(SessionInterface):
 
     if accept_options:
       return functools.partial(
-          _callable_template_with_options_and_metadata, fetch_list_as_strings,
-          target_list_as_strings, fetch_handler)
+          _callable_template_with_options_and_metadata, fetch_list,
+          target_list, fetch_handler)
     elif isinstance(fetches, ops.Operation):
       # Special case for fetching a single operation, because the
       # function will have no return value.
-      assert not fetch_list_as_strings
-      assert len(target_list_as_strings) == 1
+      assert not fetch_list
+      assert len(target_list) == 1
       def _single_operation_run():
         with errors.raise_exception_on_not_ok_status() as status:
-          tf_session.TF_Run(self._session, None, {}, [],
-                            target_list_as_strings, status, None)
+          if self._created_with_new_api:
+            tf_session.TF_SessionRun_wrapper(
+                self._session, None, {}, [], target_list, None, status)
+          else:
+            tf_session.TF_Run(
+                self._session, None, {}, [], target_list, status, None)
       return _single_operation_run
     elif isinstance(fetches, ops.Tensor):
       # Special case for fetching a single tensor, because the
       # function can return the result of `TF_Run()` directly.
-      assert len(fetch_list_as_strings) == 1
-      assert not target_list_as_strings
+      assert len(fetch_list) == 1
+      assert not target_list
       def _single_tensor_run():
         with errors.raise_exception_on_not_ok_status() as status:
-          results = tf_session.TF_Run(self._session, None, {},
-                                      fetch_list_as_strings, [], status, None)
+          if self._created_with_new_api:
+            results = tf_session.TF_SessionRun_wrapper(
+                self._session, None, {}, fetch_list, [], None, status)
+          else:
+            results = tf_session.TF_Run(
+                self._session, None, {}, fetch_list, [], status, None)
         return results[0]
       return _single_tensor_run
     else:
@@ -1246,9 +1270,12 @@ class BaseSession(SessionInterface):
       # results for us.
       def _fetch_handler_run():
         with errors.raise_exception_on_not_ok_status() as status:
-          results = tf_session.TF_Run(self._session, None, {},
-                                      fetch_list_as_strings,
-                                      target_list_as_strings, status, None)
+          if self._created_with_new_api:
+            results = tf_session.TF_SessionRun_wrapper(
+                self._session, None, {}, fetch_list, target_list, None, status)
+          else:
+            results = tf_session.TF_Run(
+                self._session, None, {}, fetch_list, target_list, status, None)
         return fetch_handler.build_results(self, results)
       return _fetch_handler_run
 
diff --git a/tensorflow/python/client/session_clusterspec_prop_test.py b/tensorflow/python/client/session_clusterspec_prop_test.py
index 28a4dd27a7607e417226c4eaa6036246e420d6a4..c85b22eb156407fcb78302c43b9cb17b8f6b5e06 100644
--- a/tensorflow/python/client/session_clusterspec_prop_test.py
+++ b/tensorflow/python/client/session_clusterspec_prop_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -415,6 +416,48 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
               node_stats.node_name.startswith('Const')
           ]), run_metadata)
 
+  def testClusterSpecPropagationIsolation(self):
+    """Test that two sessions using ClusterSpec propagation are isolated."""
+    server = server_lib.Server.create_local_server()
+    init_value = array_ops.placeholder(dtypes.int32, shape=[])
+    v = variables.Variable(init_value)
+
+    cluster_def = cluster_pb2.ClusterDef()
+    job = cluster_def.job.add()
+    job.name = 'worker'
+    job.tasks[0] = server.target[len('grpc://'):]
+    config = config_pb2.ConfigProto(cluster_def=cluster_def)
+
+    sess1 = session.Session(server.target, config=config)
+    sess2 = session.Session(server.target, config=config)
+
+    # Initially, the variable is uninitialized in both sessions.
+    with self.assertRaises(errors.FailedPreconditionError):
+      sess1.run(v)
+    with self.assertRaises(errors.FailedPreconditionError):
+      sess2.run(v)
+
+    # An update in sess1 should be visible in sess1 only.
+    sess1.run(v.initializer, feed_dict={init_value: 37})
+    self.assertEqual(37, sess1.run(v))
+    with self.assertRaises(errors.FailedPreconditionError):
+      sess2.run(v)
+
+    # An update in sess2 should be visible in sess2 only.
+    sess2.run(v.initializer, feed_dict={init_value: 86})
+    self.assertEqual(37, sess1.run(v))
+    self.assertEqual(86, sess2.run(v))
+
+    # Closing sess2 has no effect on the state of sess1.
+    sess2.close()
+    self.assertEqual(37, sess1.run(v))
+
+    # Subsequent sessions will not see the state of existing sessions.
+    sess3 = session.Session(server.target, config=config)
+    self.assertEqual(37, sess1.run(v))
+    with self.assertRaises(errors.FailedPreconditionError):
+      sess3.run(v)
+
   @test_util.disable_c_api  # Partial runs don't work with C API
   def testClusterSpecPropagationPartialRun(self):
     """Test successful partial run with ClusterSpec propagation."""
diff --git a/tensorflow/python/client/session_list_devices_test.py b/tensorflow/python/client/session_list_devices_test.py
index 584b1abe55c0df09afad0c432837646e75beb653..5a7413c12e9db92cb85d54a69602753ff6476425 100644
--- a/tensorflow/python/client/session_list_devices_test.py
+++ b/tensorflow/python/client/session_list_devices_test.py
@@ -39,7 +39,6 @@ class SessionListDevicesTestMethods(object):
       devices = sess.list_devices()
       self.assertTrue('/job:localhost/replica:0/task:0/device:CPU:0' in set(
           [d.name for d in devices]), devices)
-      self.assertGreaterEqual(1, len(devices), devices)
 
   def testInvalidDeviceNumber(self):
     opts = tf_session.TF_NewSessionOptions()
@@ -65,7 +64,6 @@ class SessionListDevicesTestMethods(object):
       devices = sess.list_devices()
       self.assertTrue('/job:local/replica:0/task:0/device:CPU:0' in set(
           [d.name for d in devices]), devices)
-      self.assertGreaterEqual(1, len(devices), devices)
 
   def testListDevicesClusterSpecPropagation(self):
     server1 = server_lib.Server.create_local_server()
@@ -84,7 +82,6 @@ class SessionListDevicesTestMethods(object):
           '/job:worker/replica:0/task:0/device:CPU:0' in device_names)
       self.assertTrue(
           '/job:worker/replica:0/task:1/device:CPU:0' in device_names)
-      self.assertGreaterEqual(2, len(devices), devices)
 
 
 class SessionListDevicesTest(SessionListDevicesTestMethods,
diff --git a/tensorflow/python/client/session_partial_run_test.py b/tensorflow/python/client/session_partial_run_test.py
index 6ecf0fc6c7b5d55d9f0f139f67f69efa0d51daf1..6a389b078a54adea18bedb1e0412835c0e997a7f 100644
--- a/tensorflow/python/client/session_partial_run_test.py
+++ b/tensorflow/python/client/session_partial_run_test.py
@@ -199,11 +199,11 @@ class PartialRunTestMethods(object):
   def testPartialRunSetupNoFeedsPassed(self):
     sess = session.Session()
     r1 = constant_op.constant([6.0])
-   
+
     h = sess.partial_run_setup([r1])
     result1 = sess.partial_run(h, r1)
     self.assertEqual([6.0], result1)
-      
+
   def testPartialRunDirect(self):
     self.RunTestPartialRun(session.Session())
 
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index 6b45a5f3134a8a60445413d6afba3b2d6b8eb87e..a563f5ef4aa245bbca5077d6f382f8cfec77441d 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -28,6 +28,8 @@ import numpy as np
 import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.core.framework import attr_value_pb2
+from tensorflow.core.framework import types_pb2
 from tensorflow.core.lib.core import error_codes_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
@@ -55,13 +57,13 @@ from tensorflow.python.platform import googletest
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
 
-ops._USE_C_API = True
 
 # NOTE(mrry): Dummy shape registration for ops used in the tests, since they
 # don't have C++ op registrations on which to attach C++ shape fns.
 ops.RegisterShape('ConstructionFails')(common_shapes.unknown_shape)
 
 
+@test_util.with_c_api
 class SessionTest(test_util.TensorFlowTestCase):
 
   def testUseExistingGraph(self):
@@ -163,8 +165,9 @@ class SessionTest(test_util.TensorFlowTestCase):
         # Run with a bogus handle.
         s.partial_run('foo', r1, feed_dict={a: 1, b: 2})
 
-  @test_util.disable_c_api  # No shape registration for 'ConstructionFails'
   def testOpConstructionErrorPayload(self):
+    if ops._USE_C_API: return  # No shape registration for 'ConstructionFails'
+
     with session.Session():
       failing_op = ops.get_default_graph().create_op(
           'ConstructionFails', [], [], name='f')
@@ -206,7 +209,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       with self.assertRaises(TypeError):
         s.run({'a': a, 'b': None})
 
-  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
   def testFetchSingleton(self):
     with session.Session() as sess:
       a = constant_op.constant(42.0)
@@ -229,7 +231,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       res = sess.run(a.op)  # An op, not a tensor.
       self.assertEqual(None, res)
 
-  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
   def testFetchList(self):
     with session.Session() as sess:
       a = constant_op.constant(42.0)
@@ -245,7 +246,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertTrue(isinstance(res, list))
       self.assertEqual([42.0, None, 44.0, 42.0, None], res)
 
-  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
   def testFetchTuple(self):
     with session.Session() as sess:
       a = constant_op.constant(42.0)
@@ -259,7 +259,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertTrue(isinstance(res, tuple))
       self.assertEqual((42.0, None, 44.0, 42.0), res)
 
-  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
   def testFetchNamedTuple(self):
     # pylint: disable=invalid-name
     ABC = collections.namedtuple('ABC', ['a', 'b', 'c'])
@@ -1176,7 +1175,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(b_val, [[2.0, 2.0, 2.0]])
       self.assertAllEqual(a2_val, [[1.0, 1.0]])
 
-  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
   def testFeedAndFetch(self):
     with session.Session() as sess:
       for dtype in [dtypes.float16,
@@ -1223,7 +1221,6 @@ class SessionTest(test_util.TensorFlowTestCase):
           self.assertAllEqual(np_array, out_v)
           self.assertAllEqual(np_array, feed_v)
 
-  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
   def testMakeCallableOnTensorWithRunOptions(self):
     with session.Session() as sess:
       a = constant_op.constant(42.0)
@@ -1236,7 +1233,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertEqual(42.0, res)
       self.assertGreater(len(run_metadata.step_stats.dev_stats), 0)
 
-  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
   def testMakeCallableOnOperationWithRunOptions(self):
     with session.Session() as sess:
       a = variables.Variable(42.0)
@@ -1251,7 +1247,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertEqual(43.0, sess.run(a))
       self.assertGreater(len(run_metadata.step_stats.dev_stats), 0)
 
-  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
   def testMakeCallableWithFeedListAndRunOptions(self):
     with session.Session() as sess:
       ph = array_ops.placeholder(dtypes.float32)
@@ -1459,6 +1454,9 @@ class SessionTest(test_util.TensorFlowTestCase):
         self.assertEquals(len(run_metadata.step_stats.dev_stats), 1)
 
   def testFeedShapeCompatibility(self):
+    # TODO(nolivia): C API doesn't yet handle marking nodes as not feedable.
+    if ops._USE_C_API: return
+
     with session.Session() as sess:
       some_tensor = constant_op.constant([2.0, 2.0, 2.0, 2.0])
       new_shape = constant_op.constant([2, 2])
@@ -1583,7 +1581,6 @@ class SessionTest(test_util.TensorFlowTestCase):
         sess.run(enqueue_op)
       self.assertEqual(sess.run(q.size()), num_epochs * 2)
 
-  @test_util.disable_c_api  # set_device does not work with C API
   def testRegisterFetchAndFeedConversionFunctions(self):
     class SquaredTensor(object):
       def __init__(self, tensor):
@@ -1733,15 +1730,159 @@ class SessionTest(test_util.TensorFlowTestCase):
       result = sess.run(f)
       self.assertEqual(result, 2.0)
 
-  @test_util.disable_c_api  # functions don't work with C API
   def testAddFunctionToSession(self):
     self.runTestAddFunctionToSession()
 
-  @test_util.disable_c_api  # functions don't work with C API
   def testAddFunctionToGrpcSession(self):
     server = server_lib.Server.create_local_server()
     self.runTestAddFunctionToSession(server.target)
 
+  def testAutoConvertAndCheckData(self):
+    with self.test_session() as sess:
+      a = array_ops.placeholder(dtype=dtypes.string)
+      with self.assertRaisesRegexp(
+          TypeError, 'Type of feed value 1 with type <(\w+) \'int\'> is not'):
+        sess.run(a, feed_dict={a: 1})
+
+class GraphMutationTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    self._original_use_c_api_value = ops._USE_C_API
+    ops._USE_C_API = True
+    super(GraphMutationTest, self).setUp()
+
+  def tearDown(self):
+    ops._USE_C_API = self._original_use_c_api_value
+    super(GraphMutationTest, self).tearDown()
+
+  def testUpdateInputAfterRunning(self):
+    with ops.Graph().as_default() as g:
+      a = constant_op.constant(1.0)
+      b = constant_op.constant(2.0)
+      c = a + b
+
+    with session.Session(graph=g) as sess:
+      self.assertAllEqual(3.0, sess.run(c))
+      c.op._update_input(1, a)  # pylint: disable=protected-access
+      with self.assertRaisesRegexp(
+          errors.FailedPreconditionError,
+          'add.*was changed by updating input tensor after it was run'):
+        sess.run(c)
+
+      # Check that running the graph with a new session is fine
+      with session.Session(graph=g) as sess2:
+        self.assertAllEqual(2.0, sess2.run(c))
+
+  def testSetDeviceAfterRunning(self):
+    with ops.Graph().as_default() as g:
+      a = constant_op.constant(1.0)
+      b = constant_op.constant(2.0)
+      c = a + b
+
+    with session.Session(graph=g) as sess:
+      self.assertAllEqual(3.0, sess.run(c))
+      c.op._set_device('/cpu:0')  # pylint: disable=protected-access
+      with self.assertRaisesRegexp(
+          errors.FailedPreconditionError,
+          'add.*was changed by setting device after it was run'):
+        sess.run(c)
+
+  def testSetAttrAfterRunning(self):
+    with ops.Graph().as_default() as g:
+      a = constant_op.constant(1.0, dtype=dtypes.float32)
+      b = math_ops.cast(a, dtypes.float64)
+
+    with session.Session(graph=g) as sess:
+      self.assertAllEqual(1.0, sess.run(b))
+      b.op._set_attr('DstT',
+                     attr_value_pb2.AttrValue(type=types_pb2.DT_FLOAT))
+      with self.assertRaisesRegexp(
+          errors.FailedPreconditionError,
+          'Cast.*was changed by setting attribute after it was run'):
+        sess.run(b)
+
+  def testRunModifyRun(self):
+    with ops.Graph().as_default() as g:
+      a = constant_op.constant(1.0)
+      b = constant_op.constant(2.0)
+      c = a + b
+
+      with session.Session(graph=g) as sess:
+        self.assertAllEqual(3.0, sess.run(c))
+
+        d = b + c
+        d.op._update_input(0, a)  # pylint: disable=protected-access
+        self.assertAllEqual(3.0, sess.run(c))
+        self.assertAllEqual(4.0, sess.run(d))
+
+  def testRunModifyRunTwoSessions(self):
+    with ops.Graph().as_default() as g:
+      a = constant_op.constant(1.0)
+      b = constant_op.constant(2.0)
+      c = a + b
+
+      with session.Session(graph=g) as sess1:
+        with session.Session(graph=g) as sess2:
+          self.assertAllEqual(3.0, sess1.run(c))
+          self.assertAllEqual(3.0, sess2.run(c))
+
+          d = b + c
+          d.op._update_input(0, a)  # pylint: disable=protected-access
+          self.assertAllEqual(3.0, sess2.run(c))
+          self.assertAllEqual(4.0, sess2.run(d))
+
+          d.op._update_input(0, b)  # pylint: disable=protected-access
+          self.assertAllEqual(3.0, sess1.run(c))
+          self.assertAllEqual(5.0, sess1.run(d))
+
+          with self.assertRaisesRegexp(
+              errors.FailedPreconditionError,
+              'add.*was changed by updating input tensor after it was run'):
+            sess2.run(c)
+
+  def testTwoSessionsOneRunBeforeModification(self):
+    with ops.Graph().as_default() as g, ops.device('/cpu:0'):
+      a = constant_op.constant(1.0)
+      b = constant_op.constant(2.0)
+      c = a + b
+
+    with session.Session(graph=g) as sess1:
+      with session.Session(graph=g) as sess2:
+        sess1.run(c)
+
+        c.op._set_device('/cpu:0')  # pylint: disable=protected-access
+
+        with self.assertRaisesRegexp(
+            errors.FailedPreconditionError,
+            'add.*was changed by setting device after it was run'):
+          sess1.run(c)
+
+        # sess2 was not run before modification
+        self.assertAllEqual(3.0, sess2.run(c))
+
+  def testTwoSessionsBothRunBeforeModification(self):
+    with ops.Graph().as_default() as g, ops.device('/cpu:0'):
+      a = constant_op.constant(1.0)
+      b = constant_op.constant(2.0)
+      c = a + b
+
+    with session.Session(graph=g) as sess1:
+      with session.Session(graph=g) as sess2:
+        sess1.run(c)
+        sess2.run(c)
+
+        c.op._set_device('/cpu:0')  # pylint: disable=protected-access
+
+        with self.assertRaisesRegexp(
+            errors.FailedPreconditionError,
+            'add.*was changed by setting device after it was run'):
+          sess1.run(c)
+
+        with self.assertRaisesRegexp(
+            errors.FailedPreconditionError,
+            'add.*was changed by setting device after it was run'):
+          sess2.run(c)
+
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i
index 40731aba7d4ed8bb281191d719b3ddfcd2db1ddc..a94910042fd8618cd008bbe57e9d65fcbb1ae8e3 100644
--- a/tensorflow/python/client/tf_session.i
+++ b/tensorflow/python/client/tf_session.i
@@ -24,6 +24,49 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/public/version.h"
 
+// Helper function to convert a Python list of Tensors to a C++ vector of
+// TF_Outputs.
+//
+// Returns true if successful. Otherwise, returns false and sets error_msg.
+bool PyTensorListToVector(PyObject* py_tensor_list,
+                          std::vector<TF_Output>* vec,
+                          string* error_msg) {
+  if (!PyList_Check(py_tensor_list)) {
+    *error_msg = "expected Python list.";
+    return false;
+  }
+  size_t size = PyList_Size(py_tensor_list);
+  for (int i = 0; i < size; ++i) {
+    PyObject* item = PyList_GetItem(py_tensor_list, i);
+    TF_Output* input_ptr;
+    if (!SWIG_IsOK(SWIG_ConvertPtr(item, reinterpret_cast<void**>(&input_ptr),
+                                   SWIGTYPE_p_TF_Output, 0))) {
+      *error_msg = "expected Python list of wrapped TF_Output objects. "
+          "Found python list of something else.";
+      return false;
+    }
+    vec->push_back(*input_ptr);
+  }
+  return true;
+}
+
+// Helper function to convert a TF_Output to a wrapped TF_Output Python object.
+PyObject* CreateWrappedTFOutput(TF_Output tf_output) {
+  // We used heap-allocated pointers in the Python runtime (this is what SWIG
+  // generates by default for functions returning TF_Output).
+  TF_Output* tf_output_ptr = new TF_Output(tf_output);
+  // Use SWIG_POINTER_OWN so the TF_Output* is deleted by Python.
+  return SWIG_NewPointerObj(tf_output_ptr, SWIGTYPE_p_TF_Output,
+                            SWIG_POINTER_OWN);
+}
+
+// Helper function to convert a TF_Operation to a wrapped TF_Operation Python
+// object.
+PyObject* CreateWrappedTFOperation(TF_Operation* tf_operation) {
+  // No flags since operation is owned by TF_Graph.
+  return SWIG_NewPointerObj(tf_operation, SWIGTYPE_p_TF_Operation, 0);
+}
+
 %}
 
 %include "tensorflow/python/client/tf_sessionrun_wrapper.i"
@@ -48,6 +91,9 @@ tensorflow::ImportNumpy();
 // _GLIBCXX_USE_CXX11_ABI flag value
 %constant const int __cxx11_abi_flag__ = tf_cxx11_abi_flag();
 
+// Flag indicating whether the build is monolithic
+%constant const int __monolithic_build__ = tf_monolithic_build();
+
 // Release the Python GIL for the duration of most methods.
 %exception {
   Py_BEGIN_ALLOW_THREADS;
@@ -98,8 +144,26 @@ tensorflow::ImportNumpy();
   }
 
   for (size_t i = 0; i < $1.size(); ++i) {
-    PyList_SET_ITEM($result, i, SWIG_NewPointerObj(
-                            $1[i], SWIGTYPE_p_TF_Operation, 0));
+    PyList_SET_ITEM($result, i, CreateWrappedTFOperation($1[i]));
+  }
+}
+
+%ignore TF_OperationOutputConsumers;
+%unignore TF_OperationOutputConsumers_wrapper;
+// See comment for "%noexception TF_SessionRun_wrapper;"
+%noexception TF_OperationGetOutputConsumers_wrapper;
+
+// Build a Python list of unicode strings and return it. (Operation names are
+// always represented as unicode.)
+%typemap(out) std::vector<const char*>
+tensorflow::TF_OperationOutputConsumers_wrapper {
+  $result = PyList_New($1.size());
+  if (!$result) {
+    SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create list");
+  }
+
+  for (size_t i = 0; i < $1.size(); ++i) {
+    PyList_SET_ITEM($result, i, PyUnicode_FromString($1[i]));
   }
 }
 
@@ -115,19 +179,30 @@ tensorflow::ImportNumpy();
     SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create list");
   }
 
-  // Unwrap the generated SwigValueWrapper<std::vector<TF_Output>> via &
-  std::vector<TF_Output>* tf_outputs = &$1;
-  for (size_t i = 0; i < $1.size(); ++i) {
-    // We used wrapped heap-allocated pointers in the Python runtime (this is
-    // what SWIG generates by default for functions returning TF_Output).
-    TF_Output* tf_output_ptr = new TF_Output((*tf_outputs)[i]);
-    // Use SWIG_POINTER_OWN so the TF_Output* is deleted by Python.
-    PyList_SET_ITEM($result, i,
-                    SWIG_NewPointerObj(tf_output_ptr, SWIGTYPE_p_TF_Output,
-                                       SWIG_POINTER_OWN));
+  // Unwrap the generated SwigValueWrapper<std::vector<TF_Output>>
+  const std::vector<TF_Output>& tf_outputs = $1;
+  for (size_t i = 0; i < tf_outputs.size(); ++i) {
+    PyList_SET_ITEM($result, i, CreateWrappedTFOutput(tf_outputs[i]));
   }
 }
 
+%ignore TF_ImportGraphDefResultsMissingUnusedInputMappings;
+%unignore TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper;
+// See comment for "%noexception TF_SessionRun_wrapper;"
+%noexception TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper;
+
+%typemap(out) std::vector<string>
+TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper{
+  $result = PyList_New($1.size());
+  if (!$result) {
+    SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create list");
+  }
+  for (size_t i = 0; i < $1.size(); ++i) {
+    const string& input_str = $1[i];
+    PyList_SET_ITEM($result, i, PyBytes_FromStringAndSize(input_str.data(),
+                                                          input_str.size()));
+  }
+}
 
 ////////////////////////////////////////////////////////////////////////////////
 // BEGIN TYPEMAPS FOR tensorflow::TF_Run_wrapper()
@@ -268,34 +343,6 @@ tensorflow::ImportNumpy();
       reinterpret_cast<const char*>($1.data), $1.length);
 }
 
-%inline %{
-// Helper function to convert a Python list of Tensors to a C++ vector of
-// TF_Outputs.
-//
-// Returns true if successful. Otherwise, returns false and sets error_msg.
-bool PyTensorListToVector(PyObject* py_tensor_list,
-                          std::vector<TF_Output>* vec,
-                          string* error_msg) {
-  if (!PyList_Check(py_tensor_list)) {
-    *error_msg = "expected Python list.";
-    return false;
-  }
-  size_t size = PyList_Size(py_tensor_list);
-  for (int i = 0; i < size; ++i) {
-    PyObject* item = PyList_GetItem(py_tensor_list, i);
-    TF_Output* input_ptr;
-    if (!SWIG_IsOK(SWIG_ConvertPtr(item, reinterpret_cast<void**>(&input_ptr),
-                                   SWIGTYPE_p_TF_Output, 0))) {
-      *error_msg = "expected Python list of wrapped TF_Output objects. "
-          "Found python list of something else.";
-      return false;
-    }
-    vec->push_back(*input_ptr);
-  }
-  return true;
-}
-%}
-
 // Converts input Python list of wrapped TF_Outputs into a single array
 %typemap(in) (const TF_Output* inputs, int num_inputs)
     (std::vector<TF_Output> inputs) {
@@ -307,6 +354,62 @@ bool PyTensorListToVector(PyObject* py_tensor_list,
   $2 = inputs.size();
 }
 
+// Typemaps for TF_ImportGraphDefResultsReturnOutputs
+%typemap(in, numinputs=0) (int* num_outputs, TF_Output** outputs)
+     (int num_outputs, TF_Output* outputs) {
+  $1 = &num_outputs;
+  $2 = &outputs;
+}
+
+%typemap(argout) (int* num_outputs, TF_Output** outputs) {
+  $result = PyList_New(*$1);
+  if (!$result) {
+    SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create list");
+  }
+  int num_outputs = *$1;
+  TF_Output* outputs = *$2;
+  for (int i = 0; i < num_outputs; ++i) {
+    PyList_SET_ITEM($result, i, CreateWrappedTFOutput(outputs[i]));
+  }
+}
+
+// Typemaps for TF_ImportGraphDefResultsReturnOperations
+%typemap(in, numinputs=0) (int* num_opers, TF_Operation*** opers)
+     (int num_opers, TF_Operation** opers) {
+  $1 = &num_opers;
+  $2 = &opers;
+}
+
+%typemap(argout) (int* num_opers, TF_Operation*** opers) {
+  $result = PyList_New(*$1);
+  if (!$result) {
+    SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create list");
+  }
+  int num_opers = *$1;
+  TF_Operation** opers = *$2;
+  for (int i = 0; i < num_opers; ++i) {
+    PyList_SET_ITEM($result, i, CreateWrappedTFOperation(opers[i]));
+  }
+}
+
+// Typemaps for TF_GraphNextOperation().
+%typemap(in) size_t* pos (size_t pos) {
+  pos = PyLong_AsUnsignedLong($input);
+  $1 = &pos;
+}
+
+// Returns a (TF_Operation*, int pos) tuple.
+%typemap(argout) size_t* pos {
+  PyObject* new_result = PyTuple_New(2);
+  if (!new_result) {
+    SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create tuple");
+  }
+  // Steals $result reference
+  PyTuple_SET_ITEM(new_result, 0, $result);
+  PyTuple_SET_ITEM(new_result, 1, PyLong_FromSize_t(*$1));
+  $result = new_result;
+}
+
 // TODO(skyewm): SWIG emits a warning for the const char* in TF_WhileParams,
 // skip for now
 %ignore TF_WhileParams;
@@ -433,6 +536,84 @@ def TF_Reset(target, containers=None, config=None):
   }
 }
 
+// Typemaps for TF_GraphGetTensorShapeHelper.
+
+// Convert from C++ integer vector to Python list of ints.
+%typemap(out) tensorflow::gtl::InlinedVector<int64_t, 6>
+     tensorflow::TF_GraphGetTensorShapeHelper {
+  $result = PyList_New($1.size());
+  if (!$result) {
+    SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create list");
+  }
+
+  for (size_t i = 0; i < $1.size(); ++i) {
+    PyList_SET_ITEM($result, i, PyInt_FromLong($1[i]));
+  }
+}
+
+%typemap(in, numinputs=0) bool* unknown_shape (bool temp) {
+  $1=&temp;
+}
+
+// Returns a (list(int), bool) tuple.
+%typemap(argout) bool* unknown_shape {
+  PyObject* new_result = PyTuple_New(2);
+  if (!new_result) {
+    SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create tuple");
+  }
+  // Steals $result reference
+  PyTuple_SET_ITEM(new_result, 0, $result);
+  PyTuple_SET_ITEM(new_result, 1, PyBool_FromLong(*$1));
+  $result = new_result;
+}
+
+%unignore tensorflow;
+%unignore TF_GraphGetTensorShapeHelper;
+%ignore TF_GraphGetTensorShape;
+
+// We use TF_GraphSetTensorShape_wrapper instead of
+// TF_GraphSetTensorShape
+%ignore TF_GraphSetTensorShape;
+%unignore tensorflow;
+%unignore TF_GraphSetTensorShape_wrapper;
+
+// $input is a Python list of ints to a vector<int> for TF_GraphSetTensorShape_wrapper
+%typemap(in) (const std::vector<int64_t>& dims)
+    (std::vector<int64_t> dims_local){
+  if ($input != Py_None) {
+    if (!PyList_Check($input)) {
+      SWIG_exception_fail(SWIG_TypeError, tensorflow::strings::Printf(
+              "$symname: expected list but got %s ", Py_TYPE($input)->tp_name).c_str());
+    }
+    size_t size = PyList_Size($input);
+    for (int i = 0; i < size; ++i) {
+      PyObject* item = PyList_GetItem($input, i);
+      dims_local.push_back(PyInt_AsLong(item));
+    }
+    $1 = &dims_local;
+  } else {
+    $1 = nullptr;
+  }
+}
+
+// We use TF_GraphGetTensorShape_wrapper instead of
+// TF_GraphGetTensorShape
+%ignore TF_GraphGetTensorShape;
+%unignore tensorflow;
+%unignore TF_GraphGetTensorShape_wrapper;
+
+// Build a Python list of ints and return it.
+%typemap(out) std::vector<int64_t> tensorflow::TF_GraphGetTensorShape_wrapper {
+  $result = PyList_New($1.size());
+  if (!$result) {
+    SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create list");
+  }
+
+  for (size_t i = 0; i < $1.size(); ++i) {
+    PyList_SET_ITEM($result, i, PyInt_FromLong($1[i]));
+  }
+}
+
 %include "tensorflow/python/client/tf_session_helper.h"
 
 %unignoreall
diff --git a/tensorflow/python/client/tf_session_helper.cc b/tensorflow/python/client/tf_session_helper.cc
index f5472f316dcd86c3bb15a042b68c51b4f04b4b10..efe50dc2473584b5bf0fc54e6fc234daa449c3b0 100644
--- a/tensorflow/python/client/tf_session_helper.cc
+++ b/tensorflow/python/client/tf_session_helper.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/equal_graph_def.h"
@@ -299,6 +300,33 @@ string EqualGraphDefWrapper(const string& actual, const string& expected) {
   return EqualGraphDef(actual_def, expected_def, &diff) ? "" : diff;
 }
 
+// Return value set to 6 inlined elements so it fits in a 64-byte cache line.
+tensorflow::gtl::InlinedVector<int64_t, 6> TF_GraphGetTensorShapeHelper(
+    TF_Graph* graph, TF_Output output, TF_Status* out_status,
+    bool* unknown_shape) {
+  // Allocate a single variable for holding the result for RVO.
+  tensorflow::gtl::InlinedVector<int64_t, 6> result;
+  *unknown_shape = false;
+  int num_dims = TF_GraphGetTensorNumDims(graph, output, out_status);
+  if (TF_GetCode(out_status) != TF_OK) {
+    return result;
+  }
+  // If shape is unknown, set boolean and return.
+  if (num_dims == -1) {
+    *unknown_shape = true;
+    return result;
+  }
+
+  // If shape is a scalar, avoid another C call and just return {}.
+  if (num_dims == 0) {
+    return result;
+  }
+
+  result.resize(num_dims);
+  TF_GraphGetTensorShape(graph, output, result.data(), num_dims, out_status);
+  return result;
+}
+
 void TF_SessionPRunSetup_wrapper(TF_Session* session,
                                  const std::vector<TF_Output>& inputs,
                                  const std::vector<TF_Output>& outputs,
@@ -347,6 +375,19 @@ std::vector<TF_Operation*> TF_OperationGetControlInputs_wrapper(
   return control_inputs;
 }
 
+std::vector<const char*> TF_OperationOutputConsumers_wrapper(
+    TF_Output oper_out) {
+  int num_consumers = TF_OperationOutputNumConsumers(oper_out);
+  std::vector<TF_Input> consumers(num_consumers);
+  TF_OperationOutputConsumers(oper_out, consumers.data(), num_consumers);
+
+  std::vector<const char*> consumer_names(num_consumers);
+  for (int i = 0; i < num_consumers; ++i) {
+    consumer_names[i] = TF_OperationName(consumers[i].oper);
+  }
+  return consumer_names;
+}
+
 TF_Function* TF_GraphToFunction_wrapper(
     const TF_Graph* fn_body, const char* fn_name, bool append_hash_to_fn_name,
     const std::vector<TF_Operation*>* opers,
@@ -380,4 +421,37 @@ TF_Function* TF_GraphToFunction_wrapper(
                             opts, description, out_status);
 }
 
+void TF_GraphSetTensorShape_wrapper(TF_Graph* graph, TF_Output output,
+                                    const std::vector<int64_t>& dims,
+                                    bool unknown_shape, TF_Status* status) {
+  if (unknown_shape) {
+    TF_GraphSetTensorShape(graph, output, nullptr, -1, status);
+    return;
+  }
+  TF_GraphSetTensorShape(graph, output, dims.data(), dims.size(), status);
+}
+
+std::vector<int64_t> TF_GraphGetTensorShape_wrapper(TF_Graph* graph,
+                                                    TF_Output output,
+                                                    int num_dims,
+                                                    TF_Status* status) {
+  std::vector<int64_t> dims(num_dims);
+  TF_GraphGetTensorShape(graph, output, dims.data(), num_dims, status);
+  return dims;
+}
+
+std::vector<string> TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper(
+    TF_ImportGraphDefResults* results) {
+  int num_missing_unused_input_mappings;
+  const char** src_names;
+  int* src_indexes;
+  TF_ImportGraphDefResultsMissingUnusedInputMappings(
+      results, &num_missing_unused_input_mappings, &src_names, &src_indexes);
+  std::vector<string> input_strs(num_missing_unused_input_mappings);
+  for (int i = 0; i < num_missing_unused_input_mappings; ++i) {
+    input_strs[i] = TensorId(src_names[i], src_indexes[i]).ToString();
+  }
+  return input_strs;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/python/client/tf_session_helper.h b/tensorflow/python/client/tf_session_helper.h
index 0aca61a2b69752634d23511084721f94911a9ac4..cdb68d2a23de9ef9036cabdb606350d83132981f 100644
--- a/tensorflow/python/client/tf_session_helper.h
+++ b/tensorflow/python/client/tf_session_helper.h
@@ -97,6 +97,16 @@ void TF_Reset_wrapper(const TF_SessionOptions* opt,
 // for no difference.
 string EqualGraphDefWrapper(const string& actual, const string& expected);
 
+// Gets shape from C API Graph object.
+//
+// If shape is known, returns shape vector where -1 means "unknown
+// dimension".  Sets unknown_shape to false.
+//
+// If shape is unknown, sets unknown_shape to true.
+tensorflow::gtl::InlinedVector<int64_t, 6> TF_GraphGetTensorShapeHelper(
+    TF_Graph* graph, TF_Output output, TF_Status* out_status,
+    bool* unknown_shape);
+
 // Runs the graph associated with the session starting with the supplied inputs.
 // On success, `py_outputs` is populated with a numpy ndarray for each output
 // (the caller must decref these ndarrays, although this will likely be handled
@@ -150,6 +160,11 @@ std::vector<TF_Output> GetOperationInputs(TF_Operation* oper);
 std::vector<TF_Operation*> TF_OperationGetControlInputs_wrapper(
     TF_Operation* oper);
 
+// Retrieves the op names of the consumers of `oper_out`. The returned strings
+// have the lifetime of the underlying TF_Graph.
+std::vector<const char*> TF_OperationOutputConsumers_wrapper(
+    TF_Output oper_out);
+
 // `opers` equaling NULL are converted to `nopers = -1`.
 // `output_names` must be empty or have the same length as `outputs`.
 TF_Function* TF_GraphToFunction_wrapper(
@@ -158,6 +173,24 @@ TF_Function* TF_GraphToFunction_wrapper(
     const std::vector<TF_Output>& inputs, const std::vector<TF_Output>& outputs,
     const NameVector& output_names, const TF_FunctionOptions* opts,
     const char* description, TF_Status* out_status);
+
+// Set the shape of output. If unknown is true, `num_dims` must be set to
+// -1 and `dims` is set to nullptr.
+void TF_GraphSetTensorShape_wrapper(TF_Graph* graph, TF_Output output,
+                                    const std::vector<int64_t>& dims,
+                                    bool unknown_shape, TF_Status* status);
+
+// Return the shape of output. `num_dims` should be the output of
+// TF_GraphGetTensorNumDims. If `num_dims = -1`, this should not be called.
+std::vector<int64_t> TF_GraphGetTensorShape_wrapper(TF_Graph* graph,
+                                                    TF_Output output,
+                                                    int num_dims,
+                                                    TF_Status* status);
+
+// Returns the string representations of the missing unused input mappings.
+std::vector<string> TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper(
+    TF_ImportGraphDefResults* results);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_PYTHON_CLIENT_TF_SESSION_HELPER_H_
diff --git a/tensorflow/python/client/virtual_gpu_test.py b/tensorflow/python/client/virtual_gpu_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..addf63474c9ba213cf0c1eeffa9d31e94f15eac1
--- /dev/null
+++ b/tensorflow/python/client/virtual_gpu_test.py
@@ -0,0 +1,245 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for multiple virtual GPU support."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import random
+
+import numpy as np
+
+from google.protobuf import text_format
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
+
+
+class VirtualGpuTestUtil(object):
+
+  def __init__(self,
+               dim=1000,
+               num_ops=100,
+               virtual_devices_per_gpu=None,
+               device_probabilities=None):
+    self._dim = dim
+    self._num_ops = num_ops
+    if virtual_devices_per_gpu is None:
+      self._virtual_devices_per_gpu = [3]
+    else:
+      self._virtual_devices_per_gpu = virtual_devices_per_gpu
+    self._visible_device_list = [
+        i for i in range(len(self._virtual_devices_per_gpu))
+    ]
+    gpu_devices = [
+        ('/gpu:' + str(i)) for i in range(sum(self._virtual_devices_per_gpu))
+    ]
+    self.devices = ['/cpu:0'] + gpu_devices
+    self._num_devices = len(self.devices)
+    # Each virtual device gets 2GB memory.
+    self._mem_limits_mb = [
+        ([1 << 11] * i) for i in self._virtual_devices_per_gpu
+    ]
+    self.config = self._GetSessionConfig()
+
+    if device_probabilities is not None:
+      self._device_probabilities = list(device_probabilities)  # Deep copy
+      for i in range(1, self._num_devices):
+        self._device_probabilities[i] += self._device_probabilities[i - 1]
+    else:
+      # Each device gets same probability to be assigned an operation.
+      step = 1.0 / self._num_devices
+      self._device_probabilities = [
+          (x + 1) * step for x in range(self._num_devices)
+      ]
+    # To prevent rounding error causing problems.
+    self._device_probabilities[self._num_devices - 1] = 1.1
+
+    logging.info('dim: %d', self._dim)
+    logging.info('num_ops: %d', self._num_ops)
+    logging.info('visible_device_list: %s', str(self._visible_device_list))
+    logging.info('virtual_devices_per_gpu: %s',
+                 str(self._virtual_devices_per_gpu))
+    logging.info('mem_limits: %s', str(self._mem_limits_mb))
+    logging.info('devices: %s', str(self.devices))
+    logging.info('config: %s', text_format.MessageToString(self.config))
+    logging.info('device_probabilities: %s', str(self._device_probabilities))
+
+  # Creates virtual GPU devices
+  def _GetSessionConfig(self):
+    virtual_device_gpu_options = config_pb2.GPUOptions(
+        visible_device_list=','.join(str(d) for d in self._visible_device_list),
+        experimental=config_pb2.GPUOptions.Experimental(virtual_devices=[
+            config_pb2.GPUOptions.Experimental.VirtualDevices(
+                memory_limit_mb=i) for i in self._mem_limits_mb
+        ]))
+    return config_pb2.ConfigProto(gpu_options=virtual_device_gpu_options)
+
+  # Generates a list of 3-tuples, each tuple contains the source and destination
+  # device index for a binary operation like 'add', like:
+  # (src_devcie_1, src_device_2, dst_device)
+  def _GenerateOperationPlacement(self):
+    result = []
+    for unused_i in range(self._num_ops):
+      op_device = ()
+      for unused_j in range(3):
+        random_num = random.random()
+        for device_index in range(self._num_devices):
+          if self._device_probabilities[device_index] > random_num:
+            op_device += (device_index,)
+            break
+      result.append(op_device)
+    return result
+
+  # Logs part of the matrix for debugging purposes.
+  def _LogMatrix(self, mat, dim):
+    logging.info('---- printing the first 10*10 submatrix ----')
+    for i in range(min(10, dim)):
+      row = ''
+      for j in range(min(10, dim)):
+        row += ' ' + str(mat[i][j])
+      logging.info(row)
+
+  # Runs a list of 'add' operations where each operation satisfies the device
+  # placement constraints in `op_placement`, and returns the result.
+  def _TestRandomGraphWithDevices(self,
+                                  sess,
+                                  seed,
+                                  op_placement,
+                                  devices,
+                                  debug_mode=False):
+    data = []
+    shape = (self._dim, self._dim)
+    feed_dict = {}
+    # Initialize the matrices
+    for i in range(len(devices)):
+      with ops.device(devices[i]):
+        var = array_ops.placeholder(dtypes.float32, shape=shape)
+        np.random.seed(seed + i)
+        feed_dict[var] = np.random.uniform(
+            low=0, high=0.1, size=shape).astype(np.float32)
+        data.append(var)
+    # Run the 'add' operations on those matrices
+    for op in op_placement:
+      with ops.device(devices[op[2]]):
+        data[op[2]] = math_ops.add(data[op[0]], data[op[1]])
+    with ops.device('/cpu:0'):
+      s = data[0]
+      for i in range(1, len(data)):
+        s = math_ops.add(s, data[i])
+    if debug_mode:
+      logging.info(ops.get_default_graph().as_graph_def())
+    result = sess.run(s, feed_dict=feed_dict)
+    self._LogMatrix(result, self._dim)
+    return result
+
+  # Generates a random graph with `self._num_ops` 'add' operations with each
+  # operation placed on different virtual device, test that the result is
+  # identical to the result obtained by running the same graph on cpu only.
+  def TestRandomGraph(self, sess, op_placement=None, random_seed=None):
+    debug_mode = False
+    if op_placement is None:
+      op_placement = self._GenerateOperationPlacement()
+    else:
+      debug_mode = True
+    if random_seed is None:
+      random_seed = random.randint(0, 1 << 31)
+    else:
+      debug_mode = True
+    logging.info('Virtual gpu functional test for random graph...')
+    logging.info('operation placement: %s', str(op_placement))
+    logging.info('random seed: %d', random_seed)
+
+    # Run with multiple virtual gpus.
+    result_vgd = self._TestRandomGraphWithDevices(
+        sess, random_seed, op_placement, self.devices, debug_mode=debug_mode)
+    # Run with single cpu.
+    result_cpu = self._TestRandomGraphWithDevices(
+        sess,
+        random_seed,
+        op_placement, ['/cpu:0'] * self._num_devices,
+        debug_mode=debug_mode)
+    # Test the result
+    for i in range(self._dim):
+      for j in range(self._dim):
+        if result_vgd[i][j] != result_cpu[i][j]:
+          logging.error(
+              'Result mismatch at row %d column %d: expected %f, actual %f', i,
+              j, result_cpu[i][j], result_vgd[i][j])
+          logging.error('Devices: %s', self.devices)
+          logging.error('Memory limits (in MB): %s', self._mem_limits_mb)
+          return False
+    return True
+
+
+@test_util.with_c_api
+class VirtualGpuTest(test_util.TensorFlowTestCase):
+
+  def __init__(self, method_name):
+    super(VirtualGpuTest, self).__init__(method_name)
+    self._util = VirtualGpuTestUtil()
+
+  def testStatsContainAllDeviceNames(self):
+    with self.test_session(config=self._util.config) as sess:
+      # TODO(laigd): b/70811538. The is_gpu_available() call will invoke
+      # DeviceFactory::AddDevices() with a default SessionOption, which prevents
+      # adding virtual devices in the future, thus must be called within a
+      # context of a session within which virtual devices are created. Same in
+      # the following test case.
+      if not test.is_gpu_available(cuda_only=True):
+        self.skipTest('No GPU available')
+      run_options = config_pb2.RunOptions(
+          trace_level=config_pb2.RunOptions.FULL_TRACE)
+      run_metadata = config_pb2.RunMetadata()
+
+      mat_shape = [10, 10]
+      data = []
+      for d in self._util.devices:
+        with ops.device(d):
+          var = variables.Variable(random_ops.random_uniform(mat_shape))
+          sess.run(var.initializer)
+          data.append(var)
+      s = data[0]
+      for i in range(1, len(data)):
+        s = math_ops.add(s, data[i])
+      sess.run(s, options=run_options, run_metadata=run_metadata)
+
+    self.assertTrue(run_metadata.HasField('step_stats'))
+    step_stats = run_metadata.step_stats
+    devices = [d.device for d in step_stats.dev_stats]
+    self.assertTrue('/job:localhost/replica:0/task:0/device:CPU:0' in devices)
+    self.assertTrue('/job:localhost/replica:0/task:0/device:GPU:0' in devices)
+    self.assertTrue('/job:localhost/replica:0/task:0/device:GPU:1' in devices)
+    self.assertTrue('/job:localhost/replica:0/task:0/device:GPU:2' in devices)
+
+  def testLargeRandomGraph(self):
+    with self.test_session(config=self._util.config) as sess:
+      if not test.is_gpu_available(cuda_only=True):
+        self.skipTest('No GPU available')
+      for _ in range(10):
+        if not self._util.TestRandomGraph(sess):
+          return
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/data/__init__.py b/tensorflow/python/data/__init__.py
index 504500d2454e90d314ea539962ee35cd4472d822..239f9b0d5923451f3967eca572b1db099d463466 100644
--- a/tensorflow/python/data/__init__.py
+++ b/tensorflow/python/data/__init__.py
@@ -21,7 +21,6 @@ See the @{$datasets$Importing Data} Programmer's Guide for an overview.
 @@FixedLengthRecordDataset
 @@TextLineDataset
 @@TFRecordDataset
-@@SparseType
 """
 
 from __future__ import absolute_import
@@ -34,7 +33,6 @@ from tensorflow.python.data.ops.iterator_ops import Iterator
 from tensorflow.python.data.ops.readers import FixedLengthRecordDataset
 from tensorflow.python.data.ops.readers import TextLineDataset
 from tensorflow.python.data.ops.readers import TFRecordDataset
-from tensorflow.python.data.util.sparse import SparseType
 # pylint: enable=unused-import
 
 from tensorflow.python.util.all_util import remove_undocumented
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..5fb389cf92818c7a464cf4a4479d86377185d5cf
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -0,0 +1,378 @@
+# Tests of TensorFlow kernels written using the Python API.
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+load("//tensorflow:tensorflow.bzl", "sycl_py_test")
+
+tf_py_test(
+    name = "batch_dataset_op_test",
+    size = "small",
+    srcs = ["batch_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "dataset_constructor_op_test",
+    size = "small",
+    srcs = ["dataset_constructor_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+    tags = [
+        "manual",
+        "nomac",  # b/62040583
+    ],
+)
+
+tf_py_test(
+    name = "dataset_from_generator_op_test",
+    size = "small",
+    srcs = ["dataset_from_generator_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+tf_py_test(
+    name = "filter_dataset_op_test",
+    size = "small",
+    srcs = ["filter_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "flat_map_dataset_op_test",
+    size = "small",
+    srcs = ["flat_map_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:session",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:training",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+    grpc_enabled = True,
+)
+
+tf_py_test(
+    name = "list_files_dataset_op_test",
+    size = "small",
+    srcs = ["list_files_dataset_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "interleave_dataset_op_test",
+    size = "small",
+    srcs = ["interleave_dataset_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:session",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:training",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "map_dataset_op_test",
+    size = "small",
+    srcs = ["map_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:lookup_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "prefetch_dataset_op_test",
+    size = "small",
+    srcs = ["prefetch_dataset_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "range_dataset_op_test",
+    size = "small",
+    srcs = ["range_dataset_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+    ],
+)
+
+tf_py_test(
+    name = "reader_dataset_ops_test",
+    size = "small",
+    srcs = ["reader_dataset_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/ops:readers",
+    ],
+)
+
+tf_py_test(
+    name = "sequence_dataset_op_test",
+    size = "small",
+    srcs = ["sequence_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "shuffle_dataset_op_test",
+    size = "small",
+    srcs = ["shuffle_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+    ],
+)
+
+tf_py_test(
+    name = "shard_dataset_op_test",
+    size = "small",
+    srcs = ["shard_dataset_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "cache_dataset_op_test",
+    size = "small",
+    srcs = ["cache_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+    ],
+)
+
+tf_py_test(
+    name = "zip_dataset_op_test",
+    size = "small",
+    srcs = ["zip_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "concatenate_dataset_op_test",
+    size = "small",
+    srcs = ["concatenate_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+    ],
+)
+
+tf_py_test(
+    name = "iterator_ops_test",
+    size = "small",
+    srcs = ["iterator_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python/data/ops:readers",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:function",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:training",
+    ],
+    grpc_enabled = True,
+)
+
+tf_py_test(
+    name = "iterator_ops_cluster_test",
+    size = "small",
+    srcs = ["iterator_ops_cluster_test.py"],
+    additional_deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:function",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+    ],
+    grpc_enabled = True,
+    tags = [
+        "no_oss",  # Test flaky due to port collisions.
+        "no_windows",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
similarity index 72%
rename from tensorflow/python/kernel_tests/batch_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
index 236c5bc4ff9b5c92bb379aea3b4d93620bd5a60f..53c8be1d1dc8b2f23b4faef7d64350edffede34a 100644
--- a/tensorflow/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
@@ -101,13 +101,111 @@ class BatchDatasetTest(test.TestCase):
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(init_op, feed_dict={count: 14, batch_size: 0})
 
-  def testBatchSparseError(self):
-    def _map_fn(i):
-      return sparse_tensor.SparseTensor(
-          indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1]), i
+  def assertSparseValuesEqual(self, a, b):
+    self.assertAllEqual(a.indices, b.indices)
+    self.assertAllEqual(a.values, b.values)
+    self.assertAllEqual(a.dense_shape, b.dense_shape)
 
-    with self.assertRaises(TypeError):
-      _ = dataset_ops.Dataset.range(10).map(_map_fn).batch(10)
+  def testBatchSparse(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+    iterator = dataset_ops.Dataset.range(10).map(_sparse).batch(
+        5).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(2):
+        actual = sess.run(get_next)
+        expected = sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
+            values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
+            dense_shape=[5, 1])
+        self.assertTrue(sparse_tensor.is_sparse(actual))
+        self.assertSparseValuesEqual(actual, expected)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testBatchSparseWithDifferentDenseShapes(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=array_ops.expand_dims(
+              math_ops.range(i, dtype=dtypes.int64), 1),
+          values=array_ops.fill([math_ops.to_int32(i)], i),
+          dense_shape=[i])
+
+    iterator = dataset_ops.Dataset.range(10).map(_sparse).batch(
+        5).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(2):
+        actual = sess.run(get_next)
+        expected_indices = []
+        expected_values = []
+        for j in range(5):
+          for k in range(i * 5 + j):
+            expected_indices.append([j, k])
+            expected_values.append(i * 5 + j)
+        expected = sparse_tensor.SparseTensorValue(
+            indices=expected_indices,
+            values=expected_values,
+            dense_shape=[5, (i + 1) * 5 - 1])
+        self.assertTrue(sparse_tensor.is_sparse(actual))
+        self.assertSparseValuesEqual(actual, expected)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testNestedBatchSparse(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+    iterator = dataset_ops.Dataset.range(10).map(_sparse).batch(5).batch(
+        2).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      actual = sess.run(get_next)
+      expected = sparse_tensor.SparseTensorValue(
+          indices=[[0, 0, 0], [0, 1, 0], [0, 2, 0], [0, 3, 0], [0, 4, 0],
+                   [1, 0, 0], [1, 1, 0], [1, 2, 0], [1, 3, 0], [1, 4, 0]],
+          values=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+          dense_shape=[2, 5, 1])
+      self.assertTrue(sparse_tensor.is_sparse(actual))
+      self.assertSparseValuesEqual(actual, expected)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testBatchShapeError(self):
+    def generator():
+      yield [1.0, 2.0, 3.0]
+      yield [4.0, 5.0, 6.0]
+      yield [7.0, 8.0, 9.0, 10.0]
+
+    iterator = (dataset_ops.Dataset.from_generator(generator, dtypes.float32,
+                                                   output_shapes=[None])
+                .batch(3)
+                .make_initializable_iterator())
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(iterator.initializer)
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r"Cannot batch tensors with different shapes in component 0. "
+          r"First element had shape \[3\] and element 2 had shape \[4\]."):
+        sess.run(next_element)
 
   def testPaddedBatchDataset(self):
     seq_lens = array_ops.placeholder(dtypes.int32, shape=[None])
@@ -236,7 +334,7 @@ class BatchDatasetTest(test.TestCase):
 
   def testPaddedBatchSparseError(self):
     def _map_fn(i):
-      return sparse_tensor.SparseTensor(
+      return sparse_tensor.SparseTensorValue(
           indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1]), i
 
     with self.assertRaises(TypeError):
diff --git a/tensorflow/python/kernel_tests/cache_dataset_op_test.py b/tensorflow/python/data/kernel_tests/cache_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/cache_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/cache_dataset_op_test.py
diff --git a/tensorflow/python/kernel_tests/concatenate_dataset_op_test.py b/tensorflow/python/data/kernel_tests/concatenate_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/concatenate_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/concatenate_dataset_op_test.py
diff --git a/tensorflow/python/kernel_tests/dataset_constructor_op_test.py b/tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py
similarity index 62%
rename from tensorflow/python/kernel_tests/dataset_constructor_op_test.py
rename to tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py
index b51d483b5b6611d9596e59fd750c496bbb9c67d3..85ff228eb2838522d7a8264d14a79c918aba4b75 100644
--- a/tensorflow/python/kernel_tests/dataset_constructor_op_test.py
+++ b/tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -35,8 +36,8 @@ from tensorflow.python.platform import test
 
 class DatasetConstructorTest(test.TestCase):
 
-  def testTensorDataset(self):
-    """Test an dataset that represents a single tuple of tensors."""
+  def testFromTensors(self):
+    """Test a dataset that represents a single tuple of tensors."""
     components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
 
     iterator = (dataset_ops.Dataset.from_tensors(components)
@@ -55,8 +56,76 @@ class DatasetConstructorTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-  def testTensorSliceDataset(self):
-    """Test an dataset that represents the slices from a tuple of tensors."""
+  def assertSparseValuesEqual(self, a, b):
+    self.assertAllEqual(a.indices, b.indices)
+    self.assertAllEqual(a.values, b.values)
+    self.assertAllEqual(a.dense_shape, b.dense_shape)
+
+  def testFromTensorsSparse(self):
+    """Test a dataset that represents a single tuple of tensors."""
+    components = (sparse_tensor.SparseTensorValue(
+        indices=np.array([[0]]),
+        values=np.array([0]),
+        dense_shape=np.array([1])),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 1]]),
+                      values=np.array([-1, 1]),
+                      dense_shape=np.array([2, 2])))
+
+    iterator = (
+        dataset_ops.Dataset.from_tensors(components)
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual(
+        [tensor_shape.TensorShape(c.dense_shape) for c in components],
+        [shape for shape in iterator.output_shapes])
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      results = sess.run(get_next)
+      for component, result_component in zip(components, results):
+        self.assertSparseValuesEqual(component, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testFromTensorsMixed(self):
+    """Test an dataset that represents a single tuple of tensors."""
+    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0]]),
+                      values=np.array([0]),
+                      dense_shape=np.array([1])),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 1]]),
+                      values=np.array([-1, 1]),
+                      dense_shape=np.array([2, 2])))
+
+    iterator = (
+        dataset_ops.Dataset.from_tensors(components)
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual([
+        tensor_shape.TensorShape(c.dense_shape)
+        if sparse_tensor.is_sparse(c) else c.shape for c in components
+    ], [shape for shape in iterator.output_shapes])
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      results = sess.run(get_next)
+      for component, result_component in zip(components, results):
+        if sparse_tensor.is_sparse(component):
+          self.assertSparseValuesEqual(component, result_component)
+        else:
+          self.assertAllEqual(component, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testFromTensorSlices(self):
+    """Test a dataset that represents the slices from a tuple of tensors."""
     components = (
         np.tile(np.array([[1], [2], [3], [4]]), 20), np.tile(
             np.array([[12], [13], [14], [15]]), 22),
@@ -80,7 +149,127 @@ class DatasetConstructorTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-  def testTensorSliceDatasetWithDict(self):
+  def testFromTensorSlicesSparse(self):
+    """Test a dataset that represents the slices from a tuple of tensors."""
+    components = (sparse_tensor.SparseTensorValue(
+        indices=np.array([[0, 0], [1, 0], [2, 0]]),
+        values=np.array([0, 0, 0]),
+        dense_shape=np.array([3, 1])),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 1], [2, 2]]),
+                      values=np.array([1, 2, 3]),
+                      dense_shape=np.array([3, 3])))
+
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual(
+        [tensor_shape.TensorShape(c.dense_shape[1:]) for c in components],
+        [shape for shape in iterator.output_shapes])
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      expected = [
+          (sparse_tensor.SparseTensorValue(
+              indices=np.array([[0]]),
+              values=np.array([0]),
+              dense_shape=np.array([1])),
+           sparse_tensor.SparseTensorValue(
+               indices=np.array([[0]]),
+               values=np.array([1]),
+               dense_shape=np.array([3]))),
+          (sparse_tensor.SparseTensorValue(
+              indices=np.array([[0]]),
+              values=np.array([0]),
+              dense_shape=np.array([1])),
+           sparse_tensor.SparseTensorValue(
+               indices=np.array([[1]]),
+               values=np.array([2]),
+               dense_shape=np.array([3]))),
+          (sparse_tensor.SparseTensorValue(
+              indices=np.array([[0]]),
+              values=np.array([0]),
+              dense_shape=np.array([1])),
+           sparse_tensor.SparseTensorValue(
+               indices=np.array([[2]]),
+               values=np.array([3]),
+               dense_shape=np.array([3]))),
+      ]
+      for i in range(3):
+        results = sess.run(get_next)
+        for component, result_component in zip(expected[i], results):
+          self.assertSparseValuesEqual(component, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testFromTensorSlicesMixed(self):
+    """Test a dataset that represents the slices from a tuple of tensors."""
+    components = (np.tile(np.array([[1], [2], [3]]), 20),
+                  np.tile(np.array([[12], [13], [14]]), 22),
+                  np.array([37.0, 38.0, 39.0]),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 0], [2, 0]]),
+                      values=np.array([0, 0, 0]),
+                      dense_shape=np.array([3, 1])),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 1], [2, 2]]),
+                      values=np.array([1, 2, 3]),
+                      dense_shape=np.array([3, 3])))
+
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual([
+        tensor_shape.TensorShape(c.dense_shape[1:])
+        if sparse_tensor.is_sparse(c) else c.shape[1:] for c in components
+    ], [shape for shape in iterator.output_shapes])
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      expected = [
+          (sparse_tensor.SparseTensorValue(
+              indices=np.array([[0]]),
+              values=np.array([0]),
+              dense_shape=np.array([1])),
+           sparse_tensor.SparseTensorValue(
+               indices=np.array([[0]]),
+               values=np.array([1]),
+               dense_shape=np.array([3]))),
+          (sparse_tensor.SparseTensorValue(
+              indices=np.array([[0]]),
+              values=np.array([0]),
+              dense_shape=np.array([1])),
+           sparse_tensor.SparseTensorValue(
+               indices=np.array([[1]]),
+               values=np.array([2]),
+               dense_shape=np.array([3]))),
+          (sparse_tensor.SparseTensorValue(
+              indices=np.array([[0]]),
+              values=np.array([0]),
+              dense_shape=np.array([1])),
+           sparse_tensor.SparseTensorValue(
+               indices=np.array([[2]]),
+               values=np.array([3]),
+               dense_shape=np.array([3]))),
+      ]
+      for i in range(3):
+        results = sess.run(get_next)
+        for component, result_component in zip(
+            (zip(*components[:3])[i] + expected[i]), results):
+          if sparse_tensor.is_sparse(component):
+            self.assertSparseValuesEqual(component, result_component)
+          else:
+            self.assertAllEqual(component, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testFromTensorSlicesWithDict(self):
     components = {"foo": [1, 2, 3], "bar": [[4.0], [5.0], [6.0]]}
     iterator = (dataset_ops.Dataset.from_tensor_slices(components)
                 .make_initializable_iterator())
@@ -101,7 +290,7 @@ class DatasetConstructorTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-  def testSparseTensorSliceDataset(self):
+  def testFromSparseTensorSlices(self):
     """Test a dataset based on slices of a `tf.SparseTensor`."""
     st = array_ops.sparse_placeholder(dtypes.float64)
     iterator = (dataset_ops.Dataset.from_sparse_tensor_slices(st)
diff --git a/tensorflow/python/kernel_tests/dataset_from_generator_op_test.py b/tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/dataset_from_generator_op_test.py
rename to tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py
diff --git a/tensorflow/python/kernel_tests/filter_dataset_op_test.py b/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
similarity index 95%
rename from tensorflow/python/kernel_tests/filter_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
index 6eb445445f0156a3e0040a1eb9cb743cdced0352..b9258b720edd4ecd620c61eed18f6f975cb7f439 100644
--- a/tensorflow/python/kernel_tests/filter_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
@@ -131,9 +131,12 @@ class FilterDatasetTest(test.TestCase):
     self.assertAllEqual(a.dense_shape, b.dense_shape)
 
   def testSparse(self):
+
     def _map_fn(i):
-      return sparse_tensor.SparseTensor(
-          indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1]), i
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1])),
+          dense_shape=np.array([1, 1])), i
 
     def _filter_fn(_, i):
       return math_ops.equal(i % 2, 0)
@@ -148,10 +151,8 @@ class FilterDatasetTest(test.TestCase):
       sess.run(init_op)
       for i in range(5):
         actual = sess.run(get_next)
-        expected = sparse_tensor.SparseTensor(
-            indices=[[0, 0]], values=[i*2], dense_shape=[1, 1])
         self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
-        self.assertSparseValuesEqual(actual, expected.eval())
+        self.assertSparseValuesEqual(actual, _map_fn(i * 2)[0])
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
diff --git a/tensorflow/python/kernel_tests/flat_map_dataset_op_test.py b/tensorflow/python/data/kernel_tests/flat_map_dataset_op_test.py
similarity index 99%
rename from tensorflow/python/kernel_tests/flat_map_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/flat_map_dataset_op_test.py
index 895f36382a440bb7e6baaaa9203d53875bcfff23..350234a8396a7e2d69cd016010aee4227fe222b7 100644
--- a/tensorflow/python/kernel_tests/flat_map_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/flat_map_dataset_op_test.py
@@ -124,7 +124,7 @@ class FlatMapDatasetTest(test.TestCase):
 
   def testSparse(self):
     def _map_fn(i):
-      return sparse_tensor.SparseTensor(
+      return sparse_tensor.SparseTensorValue(
           indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])
 
     def _flat_map_fn(x):
diff --git a/tensorflow/python/kernel_tests/interleave_dataset_op_test.py b/tensorflow/python/data/kernel_tests/interleave_dataset_op_test.py
similarity index 99%
rename from tensorflow/python/kernel_tests/interleave_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/interleave_dataset_op_test.py
index 0a3c4af9e0c8d16811d10c4c631c2b2402537930..28cb50c00208f95e64bb11ae80656383b1f41e1e 100644
--- a/tensorflow/python/kernel_tests/interleave_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/interleave_dataset_op_test.py
@@ -177,8 +177,9 @@ class InterleaveDatasetTest(test.TestCase):
         sess.run(next_element)
 
   def testSparse(self):
+
     def _map_fn(i):
-      return sparse_tensor.SparseTensor(
+      return sparse_tensor.SparseTensorValue(
           indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])
 
     def _interleave_fn(x):
diff --git a/tensorflow/python/kernel_tests/iterator_ops_cluster_test.py b/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/iterator_ops_cluster_test.py
rename to tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py
diff --git a/tensorflow/python/kernel_tests/iterator_ops_test.py b/tensorflow/python/data/kernel_tests/iterator_ops_test.py
similarity index 98%
rename from tensorflow/python/kernel_tests/iterator_ops_test.py
rename to tensorflow/python/data/kernel_tests/iterator_ops_test.py
index 513c36d64fa3e8aa00410b7fd06fa2e061aec4c5..23c6d7385f8d4a12019fa514f349f2598d9629de 100644
--- a/tensorflow/python/kernel_tests/iterator_ops_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_ops_test.py
@@ -18,6 +18,8 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import warnings
+
 import numpy as np
 
 from tensorflow.core.protobuf import config_pb2
@@ -633,6 +635,18 @@ class IteratorTest(test.TestCase):
         with self.assertRaises(errors.InvalidArgumentError):
           sess.run(restore_op)
 
+  def testRepeatedGetNextWarning(self):
+    iterator = dataset_ops.Dataset.range(10).make_one_shot_iterator()
+    warnings.simplefilter("always")
+    with warnings.catch_warnings(record=True) as w:
+      for _ in range(100):
+        iterator.get_next()
+    self.assertEqual(100 - iterator_ops.GET_NEXT_CALL_WARNING_THRESHOLD,
+                     len(w))
+    for warning in w:
+      self.assertTrue(
+          iterator_ops.GET_NEXT_CALL_WARNING_MESSAGE in str(warning.message))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/list_files_dataset_op_test.py b/tensorflow/python/data/kernel_tests/list_files_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/list_files_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/list_files_dataset_op_test.py
diff --git a/tensorflow/python/kernel_tests/map_dataset_op_test.py b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
similarity index 88%
rename from tensorflow/python/kernel_tests/map_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/map_dataset_op_test.py
index c6c36d133c956e80d6c26634864edbb0399bfbb2..ad6bbc043db9e44ec7893cd9ae29898a8c7fedaa 100644
--- a/tensorflow/python/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
@@ -19,13 +19,16 @@ from __future__ import print_function
 
 from collections import namedtuple
 import threading
+import time
 
 import numpy as np
 
+from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
@@ -281,9 +284,8 @@ class MapDatasetTest(test.TestCase):
     with self.test_session() as sess:
       sess.run(table.init)
       sess.run(init_op)
-
-      print(sess.run(get_next))
-      print(sess.run(get_next))
+      sess.run(get_next)
+      sess.run(get_next)
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -550,9 +552,13 @@ class MapDatasetTest(test.TestCase):
     self.assertAllEqual(a.dense_shape, b.dense_shape)
 
   def testSparse(self):
+
     def _sparse(i):
-      return sparse_tensor.SparseTensor(
-          indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1])
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1])),
+          dense_shape=np.array([1, 1]))
+
     iterator = (dataset_ops.Dataset.range(10)
                 .map(_sparse)
                 .make_initializable_iterator())
@@ -563,24 +569,26 @@ class MapDatasetTest(test.TestCase):
       sess.run(init_op)
       for i in range(10):
         actual = sess.run(get_next)
-        expected = sparse_tensor.SparseTensor(
-            indices=[[0, 0]], values=[i], dense_shape=[1, 1])
         self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
-        self.assertSparseValuesEqual(actual, expected.eval())
+        self.assertSparseValuesEqual(actual, _sparse(i))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
   def testSparseChain(self):
+
     def _sparse(i):
-      return sparse_tensor.SparseTensor(
-          indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1])
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1])),
+          dense_shape=np.array([1, 1]))
+
     def _check(i):
-      self.assertTrue(isinstance(i, sparse_tensor.SparseTensor))
+      self.assertTrue(sparse_tensor.is_sparse(i))
       return sparse_ops.sparse_concat(0, [i, i])
 
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(_sparse).map(_check)
-                .make_initializable_iterator())
+    iterator = (
+        dataset_ops.Dataset.range(10).map(_sparse).map(_check)
+        .make_initializable_iterator())
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -588,12 +596,69 @@ class MapDatasetTest(test.TestCase):
       sess.run(init_op)
       for i in range(10):
         actual = sess.run(get_next)
-        expected = sparse_tensor.SparseTensor(
-            indices=[[0, 0], [1, 0]], values=[i, i], dense_shape=[2, 1])
         self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
-        self.assertSparseValuesEqual(actual, expected.eval())
+        self.assertSparseValuesEqual(actual, _check(_sparse(i)).eval())
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+
+class MapDatasetBenchmark(test.Benchmark):
+
+  def benchmarkChainOfMaps(self):
+    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
+    for chain_length in chain_lengths:
+      with ops.Graph().as_default():
+        dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
+        for _ in range(chain_length):
+          dataset = dataset.map(lambda x: x)
+        iterator = dataset.make_one_shot_iterator()
+        next_element = iterator.get_next()
+
+        with session.Session() as sess:
+          for _ in range(5):
+            sess.run(next_element.op)
+          deltas = []
+          for _ in range(100):
+            start = time.time()
+            for _ in range(100):
+              sess.run(next_element.op)
+            end = time.time()
+            deltas.append(end - start)
+
+          median_wall_time = np.median(deltas) / 100
+          print("Map dataset chain length: %d Median wall time: %f"
+                % (chain_length, median_wall_time))
+          self.report_benchmark(
+              iters=1000, wall_time=median_wall_time,
+              name="benchmark_map_dataset_chain_latency_%d" % chain_length)
+
+  def benchmarkMapFanOut(self):
+    fan_outs = [1, 2, 5, 10, 20, 50, 100]
+    for fan_out in fan_outs:
+      with ops.Graph().as_default():
+        dataset = dataset_ops.Dataset.from_tensors(
+            tuple(0 for _ in range(fan_out))).repeat(None).map(lambda *xs: xs)
+        iterator = dataset.make_one_shot_iterator()
+        next_element = iterator.get_next()
+
+        with session.Session() as sess:
+          for _ in range(5):
+            sess.run(next_element[0].op)
+          deltas = []
+          for _ in range(100):
+            start = time.time()
+            for _ in range(100):
+              sess.run(next_element[0].op)
+            end = time.time()
+            deltas.append(end - start)
+
+          median_wall_time = np.median(deltas) / 100
+          print("Map dataset fan out: %d Median wall time: %f"
+                % (fan_out, median_wall_time))
+          self.report_benchmark(
+              iters=1000, wall_time=median_wall_time,
+              name="benchmark_map_dataset_fan_out_%d" % fan_out)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/prefetch_dataset_op_test.py b/tensorflow/python/data/kernel_tests/prefetch_dataset_op_test.py
similarity index 94%
rename from tensorflow/python/kernel_tests/prefetch_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/prefetch_dataset_op_test.py
index edea9c9027e72db33074adc31af71dc74e578f3b..646324cb95df6fc1fa0a901ebdccc8d4ef74a66c 100644
--- a/tensorflow/python/kernel_tests/prefetch_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/prefetch_dataset_op_test.py
@@ -25,10 +25,11 @@ from tensorflow.python.platform import test
 
 
 class PrefetchDatasetTest(test.TestCase):
+
   def testBufferSize(self):
     buffer_size = array_ops.placeholder(dtypes.int64, shape=[])
     iterator = dataset_ops.Dataset.range(10).prefetch(
-      buffer_size=buffer_size).make_initializable_iterator()
+        buffer_size=buffer_size).make_initializable_iterator()
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -42,7 +43,7 @@ class PrefetchDatasetTest(test.TestCase):
   def testInvalidBufferSize(self):
     buffer_size = array_ops.placeholder(dtypes.int64, shape=[])
     iterator = dataset_ops.Dataset.range(10).prefetch(
-      buffer_size=buffer_size).make_initializable_iterator()
+        buffer_size=buffer_size).make_initializable_iterator()
     init_op = iterator.initializer
 
     with self.assertRaisesRegexp(errors.InvalidArgumentError, "buffer_size"):
diff --git a/tensorflow/python/kernel_tests/range_dataset_op_test.py b/tensorflow/python/data/kernel_tests/range_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/range_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/range_dataset_op_test.py
diff --git a/tensorflow/python/kernel_tests/reader_dataset_ops_test.py b/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/reader_dataset_ops_test.py
rename to tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py
diff --git a/tensorflow/python/kernel_tests/sequence_dataset_op_test.py b/tensorflow/python/data/kernel_tests/sequence_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/sequence_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/sequence_dataset_op_test.py
diff --git a/tensorflow/python/kernel_tests/shard_dataset_op_test.py b/tensorflow/python/data/kernel_tests/shard_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/shard_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/shard_dataset_op_test.py
diff --git a/tensorflow/python/kernel_tests/shuffle_dataset_op_test.py b/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/shuffle_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
diff --git a/tensorflow/python/kernel_tests/zip_dataset_op_test.py b/tensorflow/python/data/kernel_tests/zip_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/zip_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/zip_dataset_op_test.py
diff --git a/tensorflow/python/data/ops/BUILD b/tensorflow/python/data/ops/BUILD
index 05acfe4de7855f398d4e14f7478f5909f3e20431..695d3ef7904b160a46e8755b84b2955c7a0fa882 100644
--- a/tensorflow/python/data/ops/BUILD
+++ b/tensorflow/python/data/ops/BUILD
@@ -21,6 +21,7 @@ py_library(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:tensor_util",
+        "//tensorflow/python:util",
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python/data/util:sparse",
         "//third_party/py/numpy",
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 5f981e2670492d31213eccfcdb1d7eca32555d59..eba9637bdceed0a66a2e2dadeb2ddbe45a6ca93f 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -40,6 +40,7 @@ from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import gen_io_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
+from tensorflow.python.util import deprecation
 
 
 class Dataset(object):
@@ -97,13 +98,15 @@ class Dataset(object):
         container="",
         shared_name=shared_name,
         output_types=nest.flatten(
-            sparse.unwrap_sparse_types(self.output_types)),
-        output_shapes=nest.flatten(self.output_shapes))
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
     with ops.colocate_with(iterator_resource):
       initializer = gen_dataset_ops.make_iterator(self._as_variant_tensor(),
                                                   iterator_resource)
     return iterator_ops.Iterator(iterator_resource, initializer,
-                                 self.output_types, self.output_shapes)
+                                 self.output_types, self.output_shapes,
+                                 self.output_classes)
 
   def make_one_shot_iterator(self):
     """Creates an `Iterator` for enumerating the elements of this dataset.
@@ -144,9 +147,23 @@ class Dataset(object):
         gen_dataset_ops.one_shot_iterator(
             dataset_factory=_make_dataset,
             output_types=nest.flatten(
-                sparse.unwrap_sparse_types(self.output_types)),
-            output_shapes=nest.flatten(self.output_shapes)), None,
-        self.output_types, self.output_shapes)
+                sparse.as_dense_types(self.output_types, self.output_classes)),
+            output_shapes=nest.flatten(
+                sparse.as_dense_shapes(self.output_shapes,
+                                       self.output_classes))), None,
+        self.output_types, self.output_shapes, self.output_classes)
+
+  @abc.abstractproperty
+  def output_classes(self):
+    """Returns the class of each component of an element of this dataset.
+
+    The expected values are `tf.Tensor` and `tf.SparseTensor`.
+
+    Returns:
+      A nested structure of Python `type` objects corresponding to each
+      component of an element of this dataset.
+    """
+    raise NotImplementedError("Dataset.output_classes")
 
   @abc.abstractproperty
   def output_shapes(self):
@@ -163,9 +180,8 @@ class Dataset(object):
     """Returns the type of each component of an element of this dataset.
 
     Returns:
-      A nested structure of `tf.DType` (or `tf.data.SparseType`) objects
-      corresponding to each `tf.Tensor` (or `tf.SparseTensor`) component of an
-      element of this dataset.
+      A nested structure of `tf.DType` objects corresponding to each component
+      of an element of this dataset.
     """
     raise NotImplementedError("Dataset.output_types")
 
@@ -203,6 +219,7 @@ class Dataset(object):
     return TensorSliceDataset(tensors)
 
   @staticmethod
+  @deprecation.deprecated(None, "Use `tf.data.Dataset.from_tensor_slices()`.")
   def from_sparse_tensor_slices(sparse_tensor):
     """Splits each rank-N `tf.SparseTensor` in this dataset row-wise.
 
@@ -269,6 +286,23 @@ class Dataset(object):
     sess.run(value)  # (2, array([1, 1]))
     ```
 
+    NOTE: The current implementation of `Dataset.from_generator()` uses
+    @{tf.py_func} and inherits the same constraints. In particular, it
+    requires the `Dataset`- and `Iterator`-related operations to be placed
+    on a device in the same process as the Python program that called
+    `Dataset.from_generator()`. The body of `generator` will not be
+    serialized in a `GraphDef`, and you should not use this method if you
+    need to serialize your model and restore it in a different environment.
+
+    NOTE: If `generator` depends on mutable global variables or other external
+    state, be aware that the runtime may invoke `generator` multiple times
+    (in order to support repeating the `Dataset`) and at any time
+    between the call to `Dataset.from_generator()` and the production of the
+    first element from the generator. Mutating global variables or external
+    state can cause undefined behavior, and we recommend that you explicitly
+    cache any external state in `generator` before calling
+    `Dataset.from_generator()`.
+
     Args:
       generator: A callable object that takes no arguments and returns an
         object that supports the `iter()` protocol.
@@ -534,11 +568,14 @@ class Dataset(object):
   def repeat(self, count=None):
     """Repeats this dataset `count` times.
 
+    NOTE: If this dataset is a function of global state (e.g. a random number
+    generator), then different repetitions may produce different elements.
+
     Args:
       count: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
-        number of times the elements of this dataset should be repeated. The
-        default behavior (if `count` is `None` or `-1`) is for the elements to
-        be repeated indefinitely.
+        number of times the dataset should be repeated. The default behavior
+        (if `count` is `None` or `-1`) is for the dataset be repeated
+        indefinitely.
 
     Returns:
       A `Dataset`.
@@ -874,25 +911,37 @@ class TensorDataset(Dataset):
     """See `Dataset.from_tensors()` for details."""
     super(TensorDataset, self).__init__()
     with ops.name_scope("tensors"):
-      self._tensors = nest.pack_sequence_as(tensors, [
-          ops.convert_to_tensor(t, name="component_%d" % i)
+      tensors = nest.pack_sequence_as(tensors, [
+          sparse_tensor_lib.SparseTensor.from_value(t)
+          if sparse_tensor_lib.is_sparse(t) else ops.convert_to_tensor(
+              t, name="component_%d" % i)
           for i, t in enumerate(nest.flatten(tensors))
       ])
 
+    self._tensors = sparse.serialize_sparse_tensors(tensors)
+    self._output_classes = sparse.get_classes(tensors)
+    self._output_shapes = nest.pack_sequence_as(
+        tensors, [t.get_shape() for t in nest.flatten(tensors)])
+    self._output_types = nest.pack_sequence_as(
+        tensors, [t.dtype for t in nest.flatten(tensors)])
+
   def _as_variant_tensor(self):
     return gen_dataset_ops.tensor_dataset(
         nest.flatten(self._tensors),
-        output_shapes=nest.flatten(self.output_shapes))
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._output_classes
 
   @property
   def output_shapes(self):
-    return nest.pack_sequence_as(self._tensors,
-                                 [t.shape for t in nest.flatten(self._tensors)])
+    return self._output_shapes
 
   @property
   def output_types(self):
-    return nest.pack_sequence_as(self._tensors,
-                                 [t.dtype for t in nest.flatten(self._tensors)])
+    return self._output_types
 
 
 class TensorSliceDataset(Dataset):
@@ -902,32 +951,41 @@ class TensorSliceDataset(Dataset):
     """See `Dataset.from_tensor_slices()` for details."""
     super(TensorSliceDataset, self).__init__()
     with ops.name_scope("tensors"):
-      flat_tensors = [
-          ops.convert_to_tensor(t, name="component_%d" % i)
+      tensors = nest.pack_sequence_as(tensors, [
+          sparse_tensor_lib.SparseTensor.from_value(t)
+          if sparse_tensor_lib.is_sparse(t) else ops.convert_to_tensor(
+              t, name="component_%d" % i)
           for i, t in enumerate(nest.flatten(tensors))
-      ]
+      ])
+      flat_tensors = nest.flatten(tensors)
 
-    self._tensors = nest.pack_sequence_as(tensors, flat_tensors)
     batch_dim = flat_tensors[0].get_shape()[0]
     for t in flat_tensors[1:]:
       batch_dim.assert_is_compatible_with(t.get_shape()[0])
+    self._tensors = sparse.serialize_many_sparse_tensors(tensors)
+    self._output_classes = sparse.get_classes(tensors)
+    self._output_shapes = nest.pack_sequence_as(
+        tensors, [t.get_shape()[1:] for t in nest.flatten(tensors)])
+    self._output_types = nest.pack_sequence_as(
+        tensors, [t.dtype for t in nest.flatten(tensors)])
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.tensor_slice_dataset(
         nest.flatten(self._tensors),
-        output_shapes=nest.flatten(self.output_shapes))
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._output_classes
 
   @property
   def output_shapes(self):
-    return nest.pack_sequence_as(self._tensors, [
-        tensor_shape.TensorShape(t.shape[1:])
-        for t in nest.flatten(self._tensors)
-    ])
+    return self._output_shapes
 
   @property
   def output_types(self):
-    return nest.pack_sequence_as(self._tensors,
-                                 [t.dtype for t in nest.flatten(self._tensors)])
+    return self._output_types
 
 
 class SparseTensorSliceDataset(Dataset):
@@ -945,6 +1003,10 @@ class SparseTensorSliceDataset(Dataset):
         self._sparse_tensor.indices, self._sparse_tensor.values,
         self._sparse_tensor.dense_shape)
 
+  @property
+  def output_classes(self):
+    return (ops.Tensor, ops.Tensor, ops.Tensor)
+
   @property
   def output_shapes(self):
     indices_shape = self._sparse_tensor.indices.get_shape()
@@ -994,6 +1056,12 @@ class ZipDataset(Dataset):
         ])
     # pylint: enable=protected-access
 
+  @property
+  def output_classes(self):
+    return nest.pack_sequence_as(
+        self._datasets,
+        [ds.output_classes for ds in nest.flatten(self._datasets)])
+
   @property
   def output_shapes(self):
     return nest.pack_sequence_as(
@@ -1030,11 +1098,16 @@ class ConcatenateDataset(Dataset):
     return gen_dataset_ops.concatenate_dataset(
         self._input_dataset._as_variant_tensor(),
         self._dataset_to_concatenate._as_variant_tensor(),
-        output_shapes=nest.flatten(self.output_shapes),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
         output_types=nest.flatten(
-            sparse.unwrap_sparse_types(self.output_types)))
+            sparse.as_dense_types(self.output_types, self.output_classes)))
     # pylint: enable=protected-access
 
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
   @property
   def output_shapes(self):
     return nest.pack_sequence_as(self._input_dataset.output_shapes, [
@@ -1066,9 +1139,14 @@ class RepeatDataset(Dataset):
     return gen_dataset_ops.repeat_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         count=self._count,
-        output_shapes=nest.flatten(self.output_shapes),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
         output_types=nest.flatten(
-            sparse.unwrap_sparse_types(self.output_types)))
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
 
   @property
   def output_shapes(self):
@@ -1111,9 +1189,14 @@ class RangeDataset(Dataset):
         start=self._start,
         stop=self._stop,
         step=self._step,
-        output_shapes=nest.flatten(self.output_shapes),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
         output_types=nest.flatten(
-            sparse.unwrap_sparse_types(self.output_types)))
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return ops.Tensor
 
   @property
   def output_shapes(self):
@@ -1138,9 +1221,14 @@ class CacheDataset(Dataset):
     return gen_dataset_ops.cache_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         filename=self._filename,
-        output_shapes=nest.flatten(self.output_shapes),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
         output_types=nest.flatten(
-            sparse.unwrap_sparse_types(self.output_types)))
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
 
   @property
   def output_shapes(self):
@@ -1159,7 +1247,26 @@ class ShuffleDataset(Dataset):
                buffer_size,
                seed=None,
                reshuffle_each_iteration=None):
-    """See `Dataset.shuffle()` for details."""
+    """Randomly shuffles the elements of this dataset.
+
+    Args:
+      input_dataset: The input dataset.
+      buffer_size: A `tf.int64` scalar `tf.Tensor`, representing the
+        number of elements from this dataset from which the new
+        dataset will sample.
+      seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
+        random seed that will be used to create the distribution. See
+        @{tf.set_random_seed} for behavior.
+      reshuffle_each_iteration: (Optional.) A boolean, which if true indicates
+        that the dataset should be pseudorandomly reshuffled each time it is
+        iterated over. (Defaults to `True`.)
+
+    Returns:
+      A `Dataset`.
+
+    Raises:
+      ValueError: if invalid arguments are provided.
+    """
     super(ShuffleDataset, self).__init__()
     self._input_dataset = input_dataset
     self._buffer_size = ops.convert_to_tensor(
@@ -1186,9 +1293,14 @@ class ShuffleDataset(Dataset):
         seed=self._seed,
         seed2=self._seed2,
         reshuffle_each_iteration=self._reshuffle_each_iteration,
-        output_shapes=nest.flatten(self.output_shapes),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
         output_types=nest.flatten(
-            sparse.unwrap_sparse_types(self.output_types)))
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
 
   @property
   def output_shapes(self):
@@ -1212,9 +1324,14 @@ class TakeDataset(Dataset):
     return gen_dataset_ops.take_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         count=self._count,
-        output_shapes=nest.flatten(self.output_shapes),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
         output_types=nest.flatten(
-            sparse.unwrap_sparse_types(self.output_types)))
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
 
   @property
   def output_shapes(self):
@@ -1238,9 +1355,14 @@ class SkipDataset(Dataset):
     return gen_dataset_ops.skip_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         count=self._count,
-        output_shapes=nest.flatten(self.output_shapes),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
         output_types=nest.flatten(
-            sparse.unwrap_sparse_types(self.output_types)))
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
 
   @property
   def output_shapes(self):
@@ -1257,9 +1379,6 @@ class BatchDataset(Dataset):
   def __init__(self, input_dataset, batch_size):
     """See `Dataset.batch()` for details."""
     super(BatchDataset, self).__init__()
-    if sparse.any_sparse(input_dataset.output_types):
-      # TODO(b/63669786): support batching of sparse tensors
-      raise TypeError("Batching of sparse tensors is not currently supported")
     self._input_dataset = input_dataset
     self._batch_size = ops.convert_to_tensor(
         batch_size, dtype=dtypes.int64, name="batch_size")
@@ -1268,9 +1387,14 @@ class BatchDataset(Dataset):
     return gen_dataset_ops.batch_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         batch_size=self._batch_size,
-        output_shapes=nest.flatten(self.output_shapes),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
         output_types=nest.flatten(
-            sparse.unwrap_sparse_types(self.output_types)))
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
 
   @property
   def output_shapes(self):
@@ -1330,9 +1454,10 @@ class PaddedBatchDataset(Dataset):
   def __init__(self, input_dataset, batch_size, padded_shapes, padding_values):
     """See `Dataset.batch()` for details."""
     super(PaddedBatchDataset, self).__init__()
-    if sparse.any_sparse(input_dataset.output_types):
+    if sparse.any_sparse(input_dataset.output_classes):
       # TODO(b/63669786): support batching of sparse tensors
-      raise TypeError("Batching of sparse tensors is not currently supported")
+      raise TypeError(
+          "Batching of padded sparse tensors is not currently supported")
     self._input_dataset = input_dataset
     self._batch_size = ops.convert_to_tensor(
         batch_size, dtype=dtypes.int64, name="batch_size")
@@ -1364,7 +1489,12 @@ class PaddedBatchDataset(Dataset):
             for s in nest.flatten(self._padded_shapes)
         ],
         padding_values=nest.flatten(self._padding_values),
-        output_shapes=nest.flatten(self.output_shapes))
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
 
   @property
   def output_shapes(self):
@@ -1393,20 +1523,25 @@ class MapDataset(Dataset):
     super(MapDataset, self).__init__()
     self._input_dataset = input_dataset
 
+    self._output_classes = None
     self._output_shapes = None
     self._output_types = None
 
-    @function.Defun(
-        *nest.flatten(sparse.unwrap_sparse_types(input_dataset.output_types)))
+    @function.Defun(*nest.flatten(
+        sparse.as_dense_types(input_dataset.output_types,
+                              input_dataset.output_classes)))
     def tf_map_func(*args):
       """A wrapper for Defun that facilitates shape inference."""
       # Pass in shape information from the input_dataset.
-      for arg, shape in zip(args, nest.flatten(input_dataset.output_shapes)):
+      dense_shapes = sparse.as_dense_shapes(input_dataset.output_shapes,
+                                            input_dataset.output_classes)
+      for arg, shape in zip(args, nest.flatten(dense_shapes)):
         arg.set_shape(shape)
 
       nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
       nested_args = sparse.deserialize_sparse_tensors(
-          nested_args, input_dataset.output_types)
+          nested_args, input_dataset.output_types, input_dataset.output_shapes,
+          input_dataset.output_classes)
       if _should_unpack_args(nested_args):
         ret = map_func(*nested_args)
       else:
@@ -1425,16 +1560,23 @@ class MapDataset(Dataset):
       if isinstance(ret, list):
         ret = tuple(ret)
 
-      # Identify components that hold sparse tensor values.
-      types = sparse.get_sparse_types(ret)
+      # Convert any `SparseTensorValue`s to `SparseTensor`s.
+      ret = nest.pack_sequence_as(ret, [
+          sparse_tensor_lib.SparseTensor.from_value(t)
+          if sparse_tensor_lib.is_sparse(t) else t for t in nest.flatten(ret)
+      ])
+
+      self._output_classes = sparse.get_classes(ret)
+      self._output_shapes = nest.pack_sequence_as(
+          ret, [t.get_shape() for t in nest.flatten(ret)])
+      self._output_types = nest.pack_sequence_as(
+          ret, [t.dtype for t in nest.flatten(ret)])
+
       # Serialize any sparse tensors and convert result to tensors.
       ret = nest.pack_sequence_as(ret, [
           ops.convert_to_tensor(t)
           for t in nest.flatten(sparse.serialize_sparse_tensors(ret))
       ])
-      self._output_shapes = nest.pack_sequence_as(
-          types, [t.get_shape() for t in nest.flatten(ret)])
-      self._output_types = sparse.wrap_sparse_types(ret, types)
       return nest.flatten(ret)
 
     self._map_func = tf_map_func
@@ -1447,8 +1589,13 @@ class MapDataset(Dataset):
         self._map_func.captured_inputs,
         f=self._map_func,
         output_types=nest.flatten(
-            sparse.unwrap_sparse_types(self.output_types)),
-        output_shapes=nest.flatten(self.output_shapes))
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._output_classes
 
   @property
   def output_shapes(self):
@@ -1478,8 +1625,9 @@ class ParallelMapDataset(MapDataset):
         f=self._map_func,
         num_parallel_calls=self._num_parallel_calls,
         output_types=nest.flatten(
-            sparse.unwrap_sparse_types(self.output_types)),
-        output_shapes=nest.flatten(self.output_shapes))
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
     # pylint: enable=protected-access
 
 
@@ -1491,17 +1639,21 @@ class FlatMapDataset(Dataset):
     super(FlatMapDataset, self).__init__()
     self._input_dataset = input_dataset
 
-    @function.Defun(
-        *nest.flatten(sparse.unwrap_sparse_types(input_dataset.output_types)))
+    @function.Defun(*nest.flatten(
+        sparse.as_dense_types(input_dataset.output_types,
+                              input_dataset.output_classes)))
     def tf_map_func(*args):
       """A wrapper for Defun that facilitates shape inference."""
       # Pass in shape information from the input_dataset.
-      for arg, shape in zip(args, nest.flatten(input_dataset.output_shapes)):
+      dense_shapes = sparse.as_dense_shapes(input_dataset.output_shapes,
+                                            input_dataset.output_classes)
+      for arg, shape in zip(args, nest.flatten(dense_shapes)):
         arg.set_shape(shape)
 
       nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
       nested_args = sparse.deserialize_sparse_tensors(
-          nested_args, input_dataset.output_types)
+          nested_args, input_dataset.output_types, input_dataset.output_shapes,
+          input_dataset.output_classes)
       if _should_unpack_args(nested_args):
         dataset = map_func(*nested_args)
       else:
@@ -1510,6 +1662,7 @@ class FlatMapDataset(Dataset):
       if not isinstance(dataset, Dataset):
         raise TypeError("`map_func` must return a `Dataset` object.")
 
+      self._output_classes = dataset.output_classes
       self._output_types = dataset.output_types
       self._output_shapes = dataset.output_shapes
 
@@ -1524,8 +1677,13 @@ class FlatMapDataset(Dataset):
         self._map_func.captured_inputs,
         f=self._map_func,
         output_types=nest.flatten(
-            sparse.unwrap_sparse_types(self.output_types)),
-        output_shapes=nest.flatten(self.output_shapes))
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._output_classes
 
   @property
   def output_shapes(self):
@@ -1545,17 +1703,21 @@ class InterleaveDataset(Dataset):
     super(InterleaveDataset, self).__init__()
     self._input_dataset = input_dataset
 
-    @function.Defun(
-        *nest.flatten(sparse.unwrap_sparse_types(input_dataset.output_types)))
+    @function.Defun(*nest.flatten(
+        sparse.as_dense_types(input_dataset.output_types,
+                              input_dataset.output_classes)))
     def tf_map_func(*args):
       """A wrapper for Defun that facilitates shape inference."""
       # Pass in shape information from the input_dataset.
-      for arg, shape in zip(args, nest.flatten(input_dataset.output_shapes)):
+      dense_shapes = sparse.as_dense_shapes(input_dataset.output_shapes,
+                                            input_dataset.output_classes)
+      for arg, shape in zip(args, nest.flatten(dense_shapes)):
         arg.set_shape(shape)
 
       nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
       nested_args = sparse.deserialize_sparse_tensors(
-          nested_args, input_dataset.output_types)
+          nested_args, input_dataset.output_types, input_dataset.output_shapes,
+          input_dataset.output_classes)
       if _should_unpack_args(nested_args):
         dataset = map_func(*nested_args)
       else:
@@ -1564,6 +1726,7 @@ class InterleaveDataset(Dataset):
       if not isinstance(dataset, Dataset):
         raise TypeError("`map_func` must return a `Dataset` object.")
 
+      self._output_classes = dataset.output_classes
       self._output_types = dataset.output_types
       self._output_shapes = dataset.output_shapes
 
@@ -1585,8 +1748,13 @@ class InterleaveDataset(Dataset):
         self._block_length,
         f=self._map_func,
         output_types=nest.flatten(
-            sparse.unwrap_sparse_types(self.output_types)),
-        output_shapes=nest.flatten(self.output_shapes))
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._output_classes
 
   @property
   def output_shapes(self):
@@ -1605,17 +1773,21 @@ class FilterDataset(Dataset):
     super(FilterDataset, self).__init__()
     self._input_dataset = input_dataset
 
-    @function.Defun(
-        *nest.flatten(sparse.unwrap_sparse_types(input_dataset.output_types)))
+    @function.Defun(*nest.flatten(
+        sparse.as_dense_types(input_dataset.output_types,
+                              input_dataset.output_classes)))
     def tf_predicate(*args):
       """A wrapper for Defun that facilitates shape inference."""
       # Pass in shape information from the input_dataset.
-      for arg, shape in zip(args, nest.flatten(input_dataset.output_shapes)):
+      dense_shapes = sparse.as_dense_shapes(input_dataset.output_shapes,
+                                            input_dataset.output_classes)
+      for arg, shape in zip(args, nest.flatten(dense_shapes)):
         arg.set_shape(shape)
 
       nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
       nested_args = sparse.deserialize_sparse_tensors(
-          nested_args, input_dataset.output_types)
+          nested_args, input_dataset.output_types, input_dataset.output_shapes,
+          input_dataset.output_classes)
       if _should_unpack_args(nested_args):
         ret = predicate(*nested_args)
       else:
@@ -1637,8 +1809,13 @@ class FilterDataset(Dataset):
         other_arguments=self._predicate.captured_inputs,
         predicate=self._predicate,
         output_types=nest.flatten(
-            sparse.unwrap_sparse_types(self.output_types)),
-        output_shapes=nest.flatten(self.output_shapes))
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
 
   @property
   def output_shapes(self):
@@ -1663,9 +1840,14 @@ class PrefetchDataset(Dataset):
     return gen_dataset_ops.prefetch_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         buffer_size=self._buffer_size,
-        output_shapes=nest.flatten(self.output_shapes),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
         output_types=nest.flatten(
-            sparse.unwrap_sparse_types(self.output_types)))
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
 
   @property
   def output_shapes(self):
diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py
index 987a9b53ad2c19462e7f13da9689811c2fca9628..0cbdb3ab19d8f1b966a867dfcf709c1a4a49b871 100644
--- a/tensorflow/python/data/ops/iterator_ops.py
+++ b/tensorflow/python/data/ops/iterator_ops.py
@@ -17,6 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import warnings
+
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
 from tensorflow.python.framework import dtypes
@@ -25,11 +27,31 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_dataset_ops
 
 
+# NOTE(mrry): It is legitimate to call `Iterator.get_next()` multiple
+# times, e.g. when you are distributing different elements to multiple
+# devices in a single step. However, a common pitfall arises when
+# users call `Iterator.get_next()` in each iteration of their training
+# loop. `Iterator.get_next()` adds ops to the graph, and executing
+# each op allocates resources (including threads); as a consequence,
+# invoking it in every iteration of a training loop causes slowdown
+# and eventual resource exhaustion. To guard against this outcome, we
+# log a warning when the number of uses crosses a threshold of suspicion.
+GET_NEXT_CALL_WARNING_THRESHOLD = 32
+
+GET_NEXT_CALL_WARNING_MESSAGE = (
+    "An unusually high number of `Iterator.get_next()` calls was detected. "
+    "This often indicates that `Iterator.get_next()` is being called inside "
+    "a training loop, which will cause gradual slowdown and eventual resource "
+    "exhaustion. If this is the case, restructure your code to call "
+    "`next_element = iterator.get_next() once outside the loop, and use "
+    "`next_element` inside the loop.")
+
+
 class Iterator(object):
   """Represents the state of iterating through a `Dataset`."""
 
   def __init__(self, iterator_resource, initializer, output_types,
-               output_shapes):
+               output_shapes, output_classes):
     """Creates a new iterator from the given iterator resource.
 
     Note: Most users will not call this initializer directly, and will
@@ -41,21 +63,28 @@ class Iterator(object):
         iterator.
       initializer: A `tf.Operation` that should be run to initialize this
         iterator.
-      output_types: A nested structure of `tf.DType` (or `tf.data.SparseType`)
-        objects corresponding to each `tf.Tensor` (or `tf.SparseTensor`)
-        component of an element of this dataset.
+      output_types: A nested structure of `tf.DType` objects corresponding to
+        each component of an element of this dataset.
       output_shapes: A nested structure of `tf.TensorShape` objects
         corresponding to each component of an element of this dataset.
+      output_classes: A nested structure of Python `type` object corresponding
+        to each
+        component of an element of this iterator.
     """
     self._iterator_resource = iterator_resource
     self._initializer = initializer
+    self._output_classes = output_classes
     self._output_types = output_types
     self._output_shapes = output_shapes
     self._string_handle = gen_dataset_ops.iterator_to_string_handle(
         self._iterator_resource)
+    self._get_next_call_count = 0
 
   @staticmethod
-  def from_structure(output_types, output_shapes=None, shared_name=None):
+  def from_structure(output_types,
+                     output_shapes=None,
+                     shared_name=None,
+                     output_classes=None):
     """Creates a new, uninitialized `Iterator` with the given structure.
 
     This iterator-constructing method can be used to create an iterator that
@@ -102,15 +131,17 @@ class Iterator(object):
     ```
 
     Args:
-      output_types: A nested structure of `tf.DType` (or `tf.data.SparseType`)
-        objects corresponding to each `tf.Tensor` (or `tf.SparseTensor`)
-        component of an element of this dataset.
+      output_types: A nested structure of `tf.DType` objects corresponding to
+        each component of an element of this dataset.
       output_shapes: (Optional.) A nested structure of `tf.TensorShape` objects
         corresponding to each component of an element of this dataset. If
         omitted, each component will have an unconstrainted shape.
       shared_name: (Optional.) If non-empty, this iterator will be shared under
         the given name across multiple sessions that share the same devices
         (e.g. when using a remote server).
+      output_classes: (Optional.) A nested structure of Python `type` objects
+        corresponding to each component of an element of this iterator. If
+        omitted, each component is assumed to be of type `tf.Tensor`.
 
     Returns:
       An `Iterator`.
@@ -126,18 +157,24 @@ class Iterator(object):
     else:
       output_shapes = nest.map_structure_up_to(
           output_types, tensor_shape.as_shape, output_shapes)
+    if output_classes is None:
+      output_classes = nest.map_structure(lambda _: ops.Tensor, output_types)
     nest.assert_same_structure(output_types, output_shapes)
     if shared_name is None:
       shared_name = ""
     iterator_resource = gen_dataset_ops.iterator(
         container="",
         shared_name=shared_name,
-        output_types=nest.flatten(sparse.unwrap_sparse_types(output_types)),
+        output_types=nest.flatten(output_types),
         output_shapes=nest.flatten(output_shapes))
-    return Iterator(iterator_resource, None, output_types, output_shapes)
+    return Iterator(iterator_resource, None, output_types, output_shapes,
+                    output_classes)
 
   @staticmethod
-  def from_string_handle(string_handle, output_types, output_shapes=None):
+  def from_string_handle(string_handle,
+                         output_types,
+                         output_shapes=None,
+                         output_classes=None):
     """Creates a new, uninitialized `Iterator` based on the given handle.
 
     This method allows you to define a "feedable" iterator where you can choose
@@ -170,12 +207,14 @@ class Iterator(object):
     Args:
       string_handle: A scalar `tf.Tensor` of type `tf.string` that evaluates
         to a handle produced by the `Iterator.string_handle()` method.
-      output_types: A nested structure of `tf.DType` (or `tf.data.SparseType`)
-        objects corresponding to each `tf.Tensor` (or `tf.SparseTensor`)
-        component of an element of this dataset.
+      output_types: A nested structure of `tf.DType` objects corresponding to
+        each component of an element of this dataset.
       output_shapes: (Optional.) A nested structure of `tf.TensorShape` objects
         corresponding to each component of an element of this dataset. If
         omitted, each component will have an unconstrainted shape.
+      output_classes: (Optional.) A nested structure of Python `type` objects
+        corresponding to each component of an element of this iterator. If
+        omitted, each component is assumed to be of type `tf.Tensor`.
 
     Returns:
       An `Iterator`.
@@ -187,13 +226,16 @@ class Iterator(object):
     else:
       output_shapes = nest.map_structure_up_to(
           output_types, tensor_shape.as_shape, output_shapes)
+    if output_classes is None:
+      output_classes = nest.map_structure(lambda _: ops.Tensor, output_types)
     nest.assert_same_structure(output_types, output_shapes)
     string_handle = ops.convert_to_tensor(string_handle, dtype=dtypes.string)
     iterator_resource = gen_dataset_ops.iterator_from_string_handle(
         string_handle,
-        output_types=nest.flatten(sparse.unwrap_sparse_types(output_types)),
+        output_types=nest.flatten(output_types),
         output_shapes=nest.flatten(output_shapes))
-    return Iterator(iterator_resource, None, output_types, output_shapes)
+    return Iterator(iterator_resource, None, output_types, output_shapes,
+                    output_classes)
 
   @property
   def initializer(self):
@@ -230,6 +272,13 @@ class Iterator(object):
     with ops.name_scope(name, "make_initializer") as name:
       nest.assert_same_structure(self._output_types, dataset.output_types)
       nest.assert_same_structure(self._output_shapes, dataset.output_shapes)
+      for iterator_class, dataset_class in zip(
+          nest.flatten(self._output_classes),
+          nest.flatten(dataset.output_classes)):
+        if iterator_class is not dataset_class:
+          raise TypeError(
+              "Expected output classes %r but got dataset with output class %r."
+              % (self._output_classes, dataset.output_classes))
       for iterator_dtype, dataset_dtype in zip(
           nest.flatten(self._output_types), nest.flatten(dataset.output_types)):
         if iterator_dtype != dataset_dtype:
@@ -237,8 +286,8 @@ class Iterator(object):
               "Expected output types %r but got dataset with output types %r." %
               (self._output_types, dataset.output_types))
       for iterator_shape, dataset_shape in zip(
-          nest.flatten(self._output_shapes),
-          nest.flatten(dataset.output_shapes)):
+          nest.flatten(self._output_shapes), nest.flatten(
+              dataset.output_shapes)):
         if not iterator_shape.is_compatible_with(dataset_shape):
           raise TypeError("Expected output shapes compatible with %r but got "
                           "dataset with output shapes %r." %
@@ -256,16 +305,24 @@ class Iterator(object):
     Returns:
       A nested structure of `tf.Tensor` objects.
     """
+    self._get_next_call_count += 1
+    if self._get_next_call_count > GET_NEXT_CALL_WARNING_THRESHOLD:
+      warnings.warn(GET_NEXT_CALL_WARNING_MESSAGE)
+
     return sparse.deserialize_sparse_tensors(
         nest.pack_sequence_as(self._output_types,
                               gen_dataset_ops.iterator_get_next(
                                   self._iterator_resource,
                                   output_types=nest.flatten(
-                                      sparse.unwrap_sparse_types(
-                                          self._output_types)),
+                                      sparse.as_dense_types(
+                                          self._output_types,
+                                          self._output_classes)),
                                   output_shapes=nest.flatten(
-                                      self._output_shapes),
-                                  name=name)), self._output_types)
+                                      sparse.as_dense_shapes(
+                                          self._output_shapes,
+                                          self._output_classes)),
+                                  name=name)), self._output_types,
+        self._output_shapes, self._output_classes)
 
   def string_handle(self, name=None):
     """Returns a string-valued `tf.Tensor` that represents this iterator.
@@ -282,13 +339,25 @@ class Iterator(object):
       return gen_dataset_ops.iterator_to_string_handle(
           self._iterator_resource, name=name)
 
+  @property
+  def output_classes(self):
+    """Returns the class of each component of an element of this iterator.
+
+    The expected values are `tf.Tensor` and `tf.SparseTensor`.
+
+    Returns:
+      A nested structure of Python `type` objects corresponding to each
+      component of an element of this dataset.
+    """
+    return self._output_classes
+
   @property
   def output_shapes(self):
     """Returns the shape of each component of an element of this iterator.
 
     Returns:
       A nested structure of `tf.TensorShape` objects corresponding to each
-      component of an element of this iterator.
+      component of an element of this dataset.
     """
     return self._output_shapes
 
@@ -297,8 +366,7 @@ class Iterator(object):
     """Returns the type of each component of an element of this iterator.
 
     Returns:
-      A nested structure of `tf.DType` (or `tf.data.SparseType`) objects
-      corresponding to each `tf.Tensor` (or `tf.SparseTensor`) component of an
-      element of this dataset.
+      A nested structure of `tf.DType` objects corresponding to each component
+      of an element of this dataset.
     """
     return self._output_types
diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py
index 531716581ffbc2daeac8eb41c24a848bf5fbb7ad..c6fb8531aea13850524e6b9a83911d7afe950395 100644
--- a/tensorflow/python/data/ops/readers.py
+++ b/tensorflow/python/data/ops/readers.py
@@ -70,6 +70,10 @@ class TextLineDataset(Dataset):
     return gen_dataset_ops.text_line_dataset(
         self._filenames, self._compression_type, self._buffer_size)
 
+  @property
+  def output_classes(self):
+    return ops.Tensor
+
   @property
   def output_shapes(self):
     return tensor_shape.scalar()
@@ -110,6 +114,10 @@ class TFRecordDataset(Dataset):
     return gen_dataset_ops.tf_record_dataset(
         self._filenames, self._compression_type, self._buffer_size)
 
+  @property
+  def output_classes(self):
+    return ops.Tensor
+
   @property
   def output_shapes(self):
     return tensor_shape.TensorShape([])
@@ -159,6 +167,10 @@ class FixedLengthRecordDataset(Dataset):
         self._filenames, self._header_bytes, self._record_bytes,
         self._footer_bytes, self._buffer_size)
 
+  @property
+  def output_classes(self):
+    return ops.Tensor
+
   @property
   def output_shapes(self):
     return tensor_shape.scalar()
diff --git a/tensorflow/python/data/util/BUILD b/tensorflow/python/data/util/BUILD
index 41d8513b16ce2a74d47d42cd821b2d0ff00cab57..f7d7fe98d3eca10b6481e3c0f7d08b42e95ef81a 100644
--- a/tensorflow/python/data/util/BUILD
+++ b/tensorflow/python/data/util/BUILD
@@ -38,8 +38,10 @@ py_library(
     deps = [
         ":nest",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:ops",
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_shape",
         "//tensorflow/python:util",
         "@six_archive//:six",
     ],
@@ -56,6 +58,7 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_shape",
     ],
 )
 
diff --git a/tensorflow/python/data/util/nest.py b/tensorflow/python/data/util/nest.py
index 3ee490dbcfe879d104ddf00e9d40b14b6780b69c..2455395635c4c8fa5d157a38d4e7a118f554fd9f 100644
--- a/tensorflow/python/data/util/nest.py
+++ b/tensorflow/python/data/util/nest.py
@@ -17,17 +17,22 @@
 """## Functions for working with arbitrarily nested sequences of elements.
 
 NOTE(mrry): This fork of the `tensorflow.python.util.nest` module
-makes two changes:
+makes three changes:
 
 1. It adds support for dictionaries as a level of nesting in nested structures.
 2. It removes support for lists as a level of nesting in nested structures.
+3. It adds support for `SparseTensorValue` as an atomic element.
 
-The motivation for this change is twofold:
+The motivation for this change is threefold:
 
 1. Many input-processing functions (e.g. `tf.parse_example()`) return
    dictionaries, and we would like to support them natively in datasets.
 2. It seems more natural for lists to be treated (e.g. in Dataset constructors)
    as tensors, rather than lists of (lists of...) tensors.
+3. This is needed because `SparseTensorValue` is implemented as a `namedtuple`
+   that would normally be flattened and we want to be able to create sparse
+   tensor from `SparseTensorValue's similarly to creating tensors from numpy
+   arrays.
 """
 
 from __future__ import absolute_import
@@ -38,6 +43,7 @@ import collections as _collections
 
 import six as _six
 
+from tensorflow.python.framework import sparse_tensor as _sparse_tensor
 from tensorflow.python.util.all_util import remove_undocumented
 
 
@@ -87,6 +93,8 @@ def _yield_value(iterable):
     # corresponding `OrderedDict` to pack it back).
     for key in _sorted(iterable):
       yield iterable[key]
+  elif isinstance(iterable, _sparse_tensor.SparseTensorValue):
+    yield iterable
   else:
     for value in iterable:
       yield value
@@ -116,8 +124,9 @@ def is_sequence(seq):
     True if the sequence is a not a string or list and is a
     collections.Sequence.
   """
-  return (isinstance(seq, (_collections.Sequence, dict))
-          and not isinstance(seq, (list, _six.string_types)))
+  return (isinstance(seq, (_collections.Sequence, dict)) and
+          not isinstance(seq, _sparse_tensor.SparseTensorValue) and
+          not isinstance(seq, (list, _six.string_types)))
 
 
 def flatten(nest):
@@ -370,9 +379,9 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
     if check_types and isinstance(shallow_tree, dict):
       if set(input_tree) != set(shallow_tree):
         raise ValueError(
-          "The two structures don't have the same keys. Input "
-          "structure has keys %s, while shallow structure has keys %s."
-          % (list(_six.iterkeys(input_tree)),
+            "The two structures don't have the same keys. Input "
+            "structure has keys %s, while shallow structure has keys %s." %
+            (list(_six.iterkeys(input_tree)),
              list(_six.iterkeys(shallow_tree))))
       input_tree = list(_six.iteritems(input_tree))
       shallow_tree = list(_six.iteritems(shallow_tree))
diff --git a/tensorflow/python/data/util/nest_test.py b/tensorflow/python/data/util/nest_test.py
index 47547eb49f993e27f105e52f15fcd988e7593123..90dd7dfe7775b2f10611e5579784fbda63fc9669 100644
--- a/tensorflow/python/data/util/nest_test.py
+++ b/tensorflow/python/data/util/nest_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -86,7 +87,7 @@ class NestTest(test.TestCase):
         ordered_reconstruction)
     self.assertEqual({"d": 3, "b": 1, "a": 0, "c": 2}, plain_reconstruction)
 
-  def testFlattenAndPack_withDicts(self):
+  def testFlattenAndPackWithDicts(self):
     # A nice messy mix of tuples, lists, dicts, and `OrderedDict`s.
     named_tuple = collections.namedtuple("A", ("b", "c"))
     mess = (
@@ -132,6 +133,17 @@ class NestTest(test.TestCase):
     self.assertIsInstance(unflattened_ordered_dict, collections.OrderedDict)
     self.assertEqual(list(unflattened_ordered_dict.keys()), ["b", "a"])
 
+  def testFlattenSparseValue(self):
+    st = sparse_tensor.SparseTensorValue([[0]], [0], [1])
+    single_value = st
+    list_of_values = [st, st, st]
+    nest_of_values = ((st), ((st), (st)))
+    dict_of_values = {"foo": st, "bar": st, "baz": st}
+    self.assertEqual([st], nest.flatten(single_value))
+    self.assertEqual([[st, st, st]], nest.flatten(list_of_values))
+    self.assertEqual([st, st, st], nest.flatten(nest_of_values))
+    self.assertEqual([st, st, st], nest.flatten(dict_of_values))
+
   def testIsSequence(self):
     self.assertFalse(nest.is_sequence("1234"))
     self.assertFalse(nest.is_sequence([1, 3, [4, 5]]))
@@ -143,6 +155,8 @@ class NestTest(test.TestCase):
     self.assertFalse(nest.is_sequence(math_ops.tanh(ones)))
     self.assertFalse(nest.is_sequence(np.ones((4, 5))))
     self.assertTrue(nest.is_sequence({"foo": 1, "bar": 2}))
+    self.assertFalse(
+        nest.is_sequence(sparse_tensor.SparseTensorValue([[0]], [0], [1])))
 
   def testAssertSameStructure(self):
     structure1 = (((1, 2), 3), 4, (5, 6))
@@ -257,8 +271,9 @@ class NestTest(test.TestCase):
     inp_ab1 = {"a": (1, 1), "b": {"c": (2, 2)}}
     inp_ab2 = {"a": (1, 1), "b": {"d": (2, 2)}}
     expected_message = (
-        "The two structures don't have the same keys. Input "
-        "structure has keys \['c'\], while shallow structure has keys \['d'\].")
+        r"The two structures don't have the same keys. Input "
+        r"structure has keys \['c'\], while shallow structure has "
+        r"keys \['d'\].")
     with self.assertRaisesRegexp(ValueError, expected_message):
       nest.assert_shallow_structure(inp_ab2, inp_ab1)
 
diff --git a/tensorflow/python/data/util/sparse.py b/tensorflow/python/data/util/sparse.py
index 673fac095c9384201c190138a0467a71221c185c..5ebcb4ea81b23b60dc46bae78bfa792f4a8ab6d8 100644
--- a/tensorflow/python/data/util/sparse.py
+++ b/tensorflow/python/data/util/sparse.py
@@ -19,145 +19,134 @@ from __future__ import print_function
 
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import sparse_ops
 
 
-def any_sparse(types):
-  """Checks for sparse tensor types.
+def any_sparse(classes):
+  """Checks for sparse tensor.
 
   Args:
-    types: a structure with tensor types.
+    classes: a structure of objects that identify the dataset item classes
 
   Returns:
-    `True` if `types` contains a sparse tensor type and `False` otherwise.
+    `True` if `classes` contains a sparse tensor type and `False` otherwise.
   """
-  return any([isinstance(ty, SparseType) for ty in nest.flatten(types)])
+  return any([c is sparse_tensor.SparseTensor for c in nest.flatten(classes)])
 
 
-def deserialize_sparse_tensors(tensors, types):
-  """Deserializes sparse tensors.
+def as_dense_shapes(shapes, classes):
+  """Converts sparse tensor shapes to their physical shapes.
 
   Args:
-    tensors: a structure of tensors to deserialize.
-    types: a structure object the holds information about which tensors in
-      `tensors` represent serialized sparse tensors
+    shapes: a structure of shapes to convert.
+    classes: a structure of objects that identify the dataset item classes
 
   Returns:
-    `tensors` with any serialized sparse tensors replaced by their deserialized
-    version.
+    a structure matching the nested structure of `shapes`, containing
+    `tensor_shape.unknown_shape()` at positions where `classes` contains
+    `tf.SparseTensor` and matching contents of `shapes` otherwise
   """
-  # TODO(b/63669786): support batching of sparse tensors
-  ret = nest.pack_sequence_as(types, [
-      sparse_ops.deserialize_sparse(tensor, ty.dtype)
-      if isinstance(ty, SparseType) else tensor
-      for (tensor, ty) in zip(nest.flatten(tensors), nest.flatten(types))
+  ret = nest.pack_sequence_as(shapes, [
+      tensor_shape.unknown_shape() if c is sparse_tensor.SparseTensor else shape
+      for shape, c in zip(nest.flatten(shapes), nest.flatten(classes))
   ])
   return ret
 
 
-def get_sparse_types(tensors):
-  """Gets sparse types for a structure of tensors.
+def as_dense_types(types, classes):
+  """Converts sparse tensor types to `dtypes.variant`.
 
   Args:
-    tensors: the tensor structure to get sparse types for.
+    types: a structure of types to convert.
+    classes: a structure of objects that identify the dataset item classes
 
   Returns:
-    a structure matching the nested structure of `tensors`, containing
-    `SparseType` at positions where `tensors` contains a sparse tensor and
-    `None` otherwise
+    a structure matching the nested structure of `types`, containing
+    `dtypes.variant` at positions where `classes` contains `tf.SparseTensor` and
+    matching contents of `types` otherwise
   """
-  return nest.pack_sequence_as(tensors, [
-      SparseType(tensor.dtype)
-      if isinstance(tensor, sparse_tensor.SparseTensor) else None
-      for tensor in nest.flatten(tensors)
+  ret = nest.pack_sequence_as(types, [
+      dtypes.variant if c is sparse_tensor.SparseTensor else ty
+      for ty, c in zip(nest.flatten(types), nest.flatten(classes))
   ])
+  return ret
 
 
-def serialize_sparse_tensors(tensors):
-  """Serializes sparse tensors.
+def deserialize_sparse_tensors(tensors, types, shapes, classes):
+  """Deserializes sparse tensors.
 
   Args:
-    tensors: a tensor structure to serialize.
+    tensors: a structure of tensors to deserialize.
+    types: a structure that holds information about types of `tensors`
+    shapes: a structure that holds information about shapes of `tensors`
+    classes: a structure of objects that identify the dataset item classes
 
   Returns:
-    `tensors` with any sparse tensors replaced by the their serialized version.
+    `tensors` with any serialized sparse tensors replaced by their deserialized
+    version.
   """
-
-  ret = nest.pack_sequence_as(tensors, [
-      sparse_ops.serialize_sparse(tensor)
-      if isinstance(tensor, sparse_tensor.SparseTensor) else tensor
-      for tensor in nest.flatten(tensors)
+  ret = nest.pack_sequence_as(types, [
+      sparse_ops.deserialize_sparse(tensor, dtype=ty, rank=shape.ndims)
+      if c is sparse_tensor.SparseTensor else tensor
+      for (tensor, ty, shape, c) in zip(
+          nest.flatten(tensors), nest.flatten(types), nest.flatten(shapes),
+          nest.flatten(classes))
   ])
   return ret
 
 
-def unwrap_sparse_types(types):
-  """Unwraps sparse tensor types as `dtypes.string`.
+def get_classes(tensors):
+  """Gets classes for a structure of tensors.
 
   Args:
-    types: a structure of types to unwrap.
+    tensors: the tensor structure to get classes for.
 
   Returns:
-    a structure matching the nested structure of `types`, containing
-    `dtypes.string` at positions where `types` contains a sparse tensor and
-    matching contents of `types` otherwise
+    a structure matching the nested structure of `tensors`, containing
+    `tf.SparseTensor` at positions where `tensors` contains a sparse tensor and
+    `tf.Tensor` otherwise
   """
-  ret = nest.pack_sequence_as(types, [
-      dtypes.string if isinstance(ty, SparseType) else ty
-      for ty in nest.flatten(types)
+  return nest.pack_sequence_as(tensors, [
+      sparse_tensor.SparseTensor
+      if isinstance(tensor, sparse_tensor.SparseTensor) else ops.Tensor
+      for tensor in nest.flatten(tensors)
   ])
-  return ret
 
 
-def wrap_sparse_types(tensors, types):
-  """Wraps sparse tensor types in `SparseType`.
+def serialize_many_sparse_tensors(tensors):
+  """Serializes many sparse tensors into a batch.
 
   Args:
-    tensors: a structure of tensors for which to wrap types.
-    types: a structure that holds information about which tensors in
-      `tensors` represent serialized sparse tensors
+    tensors: a tensor structure to serialize.
 
   Returns:
-    a structure matching the nested structure of `tensors`, containing
-    `SparseType` at positions where `tensors` contains a sparse tensor and
-    `DType` otherwise
+    `tensors` with any sparse tensors replaced by the serialized batch.
   """
-  ret = nest.pack_sequence_as(types, [
-      tensor.dtype if ty is None else ty
-      for tensor, ty in zip(nest.flatten(tensors), nest.flatten(types))
+
+  ret = nest.pack_sequence_as(tensors, [
+      sparse_ops.serialize_many_sparse(tensor, out_type=dtypes.variant)
+      if sparse_tensor.is_sparse(tensor) else tensor
+      for tensor in nest.flatten(tensors)
   ])
   return ret
 
 
-class SparseType(object):
-  """Wrapper class for representing types of sparse tensors in tf.data."""
-
-  def __init__(self, dtype):
-    """Creates a new instace of `SparseType`.
-
-    Args:
-      dtype: the sparse tensor type to wrap.
-    """
-    self._dtype = dtype
-
-  def __repr__(self):
-    return "SparseType({0!r})".format(self._dtype)
-
-  def __eq__(self, other):
-    """Returns `True` iff `self == other`."""
-    if not isinstance(other, SparseType):
-      return False
-    return self._dtype == other.dtype
+def serialize_sparse_tensors(tensors):
+  """Serializes sparse tensors.
 
-  def __ne__(self, other):
-    """Returns `True` iff `self != other`."""
-    return not self.__eq__(other)
+  Args:
+    tensors: a tensor structure to serialize.
 
-  def __hash__(self):
-    return self._dtype.__hash__()
+  Returns:
+    `tensors` with any sparse tensors replaced by the their serialized version.
+  """
 
-  @property
-  def dtype(self):
-    """Returns the wrapped sparse tensor type."""
-    return self._dtype
+  ret = nest.pack_sequence_as(tensors, [
+      sparse_ops.serialize_sparse(tensor, out_type=dtypes.variant)
+      if isinstance(tensor, sparse_tensor.SparseTensor) else tensor
+      for tensor in nest.flatten(tensors)
+  ])
+  return ret
diff --git a/tensorflow/python/data/util/sparse_test.py b/tensorflow/python/data/util/sparse_test.py
index e30ed639c23386e81ca88325768f6cbc3e438126..d49b3ff34bd0ebd6beef1bea168dad22059317be 100644
--- a/tensorflow/python/data/util/sparse_test.py
+++ b/tensorflow/python/data/util/sparse_test.py
@@ -22,7 +22,9 @@ from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.platform import test
 
 
@@ -30,17 +32,258 @@ class SparseTest(test.TestCase):
 
   def testAnySparse(self):
     test_cases = (
-        ((), False),
-        ((None), False),
-        ((dtypes.string), False),
-        ((None, -1, dtypes.string), False),
-        ((sparse.SparseType(dtypes.string)), True),
-        ((None, sparse.SparseType(dtypes.string)), True),
-        ((sparse.SparseType(dtypes.string), dtypes.string), True),
-        ((((sparse.SparseType(dtypes.string)))), True)
+        {
+            "classes": (),
+            "expected": False
+        },
+        {
+            "classes": (ops.Tensor),
+            "expected": False
+        },
+        {
+            "classes": (((ops.Tensor))),
+            "expected": False
+        },
+        {
+            "classes": (ops.Tensor, ops.Tensor),
+            "expected": False
+        },
+        {
+            "classes": (ops.Tensor, sparse_tensor.SparseTensor),
+            "expected": True
+        },
+        {
+            "classes": (sparse_tensor.SparseTensor, sparse_tensor.SparseTensor),
+            "expected":
+                True
+        },
+        {
+            "classes": (sparse_tensor.SparseTensor, ops.Tensor),
+            "expected": True
+        },
+        {
+            "classes": (((sparse_tensor.SparseTensor))),
+            "expected": True
+        },
     )
     for test_case in test_cases:
-      self.assertEqual(sparse.any_sparse(test_case[0]), test_case[1])
+      self.assertEqual(
+          sparse.any_sparse(test_case["classes"]), test_case["expected"])
+
+  def assertShapesEqual(self, a, b):
+    for a, b in zip(nest.flatten(a), nest.flatten(b)):
+      self.assertEqual(a.ndims, b.ndims)
+      if a.ndims is None:
+        continue
+      for c, d in zip(a.as_list(), b.as_list()):
+        self.assertEqual(c, d)
+
+  def testAsDenseShapes(self):
+    test_cases = (
+        {
+            "types": (),
+            "classes": (),
+            "expected": ()
+        },
+        {
+            "types": tensor_shape.scalar(),
+            "classes": ops.Tensor,
+            "expected": tensor_shape.scalar()
+        },
+        {
+            "types": tensor_shape.scalar(),
+            "classes": sparse_tensor.SparseTensor,
+            "expected": tensor_shape.unknown_shape()
+        },
+        {
+            "types": (tensor_shape.scalar()),
+            "classes": (ops.Tensor),
+            "expected": (tensor_shape.scalar())
+        },
+        {
+            "types": (tensor_shape.scalar()),
+            "classes": (sparse_tensor.SparseTensor),
+            "expected": (tensor_shape.unknown_shape())
+        },
+        {
+            "types": (tensor_shape.scalar(), ()),
+            "classes": (ops.Tensor, ()),
+            "expected": (tensor_shape.scalar(), ())
+        },
+        {
+            "types": ((), tensor_shape.scalar()),
+            "classes": ((), ops.Tensor),
+            "expected": ((), tensor_shape.scalar())
+        },
+        {
+            "types": (tensor_shape.scalar(), ()),
+            "classes": (sparse_tensor.SparseTensor, ()),
+            "expected": (tensor_shape.unknown_shape(), ())
+        },
+        {
+            "types": ((), tensor_shape.scalar()),
+            "classes": ((), sparse_tensor.SparseTensor),
+            "expected": ((), tensor_shape.unknown_shape())
+        },
+        {
+            "types": (tensor_shape.scalar(), (), tensor_shape.scalar()),
+            "classes": (ops.Tensor, (), ops.Tensor),
+            "expected": (tensor_shape.scalar(), (), tensor_shape.scalar())
+        },
+        {
+            "types": (tensor_shape.scalar(), (), tensor_shape.scalar()),
+            "classes": (sparse_tensor.SparseTensor, (),
+                        sparse_tensor.SparseTensor),
+            "expected": (tensor_shape.unknown_shape(), (),
+                         tensor_shape.unknown_shape())
+        },
+        {
+            "types": ((), tensor_shape.scalar(), ()),
+            "classes": ((), ops.Tensor, ()),
+            "expected": ((), tensor_shape.scalar(), ())
+        },
+        {
+            "types": ((), tensor_shape.scalar(), ()),
+            "classes": ((), sparse_tensor.SparseTensor, ()),
+            "expected": ((), tensor_shape.unknown_shape(), ())
+        },
+    )
+    for test_case in test_cases:
+      self.assertShapesEqual(
+          sparse.as_dense_shapes(test_case["types"], test_case["classes"]),
+          test_case["expected"])
+
+  def testAsDenseTypes(self):
+    test_cases = (
+        {
+            "types": (),
+            "classes": (),
+            "expected": ()
+        },
+        {
+            "types": dtypes.int32,
+            "classes": ops.Tensor,
+            "expected": dtypes.int32
+        },
+        {
+            "types": dtypes.int32,
+            "classes": sparse_tensor.SparseTensor,
+            "expected": dtypes.variant
+        },
+        {
+            "types": (dtypes.int32),
+            "classes": (ops.Tensor),
+            "expected": (dtypes.int32)
+        },
+        {
+            "types": (dtypes.int32),
+            "classes": (sparse_tensor.SparseTensor),
+            "expected": (dtypes.variant)
+        },
+        {
+            "types": (dtypes.int32, ()),
+            "classes": (ops.Tensor, ()),
+            "expected": (dtypes.int32, ())
+        },
+        {
+            "types": ((), dtypes.int32),
+            "classes": ((), ops.Tensor),
+            "expected": ((), dtypes.int32)
+        },
+        {
+            "types": (dtypes.int32, ()),
+            "classes": (sparse_tensor.SparseTensor, ()),
+            "expected": (dtypes.variant, ())
+        },
+        {
+            "types": ((), dtypes.int32),
+            "classes": ((), sparse_tensor.SparseTensor),
+            "expected": ((), dtypes.variant)
+        },
+        {
+            "types": (dtypes.int32, (), dtypes.int32),
+            "classes": (ops.Tensor, (), ops.Tensor),
+            "expected": (dtypes.int32, (), dtypes.int32)
+        },
+        {
+            "types": (dtypes.int32, (), dtypes.int32),
+            "classes": (sparse_tensor.SparseTensor, (),
+                        sparse_tensor.SparseTensor),
+            "expected": (dtypes.variant, (), dtypes.variant)
+        },
+        {
+            "types": ((), dtypes.int32, ()),
+            "classes": ((), ops.Tensor, ()),
+            "expected": ((), dtypes.int32, ())
+        },
+        {
+            "types": ((), dtypes.int32, ()),
+            "classes": ((), sparse_tensor.SparseTensor, ()),
+            "expected": ((), dtypes.variant, ())
+        },
+    )
+    for test_case in test_cases:
+      self.assertEqual(
+          sparse.as_dense_types(test_case["types"], test_case["classes"]),
+          test_case["expected"])
+
+  def testGetClasses(self):
+    s = sparse_tensor.SparseTensor(indices=[[0]], values=[1], dense_shape=[1])
+    d = ops.Tensor
+    t = sparse_tensor.SparseTensor
+    test_cases = (
+        {
+            "classes": (),
+            "expected": ()
+        },
+        {
+            "classes": s,
+            "expected": t
+        },
+        {
+            "classes": constant_op.constant([1]),
+            "expected": d
+        },
+        {
+            "classes": (s),
+            "expected": (t)
+        },
+        {
+            "classes": (constant_op.constant([1])),
+            "expected": (d)
+        },
+        {
+            "classes": (s, ()),
+            "expected": (t, ())
+        },
+        {
+            "classes": ((), s),
+            "expected": ((), t)
+        },
+        {
+            "classes": (constant_op.constant([1]), ()),
+            "expected": (d, ())
+        },
+        {
+            "classes": ((), constant_op.constant([1])),
+            "expected": ((), d)
+        },
+        {
+            "classes": (s, (), constant_op.constant([1])),
+            "expected": (t, (), d)
+        },
+        {
+            "classes": ((), s, ()),
+            "expected": ((), t, ())
+        },
+        {
+            "classes": ((), constant_op.constant([1]), ()),
+            "expected": ((), d, ())
+        },
+    )
+    for test_case in test_cases:
+      self.assertEqual(
+          sparse.get_classes(test_case["classes"]), test_case["expected"])
 
   def assertSparseValuesEqual(self, a, b):
     if not isinstance(a, sparse_tensor.SparseTensor):
@@ -66,75 +309,50 @@ class SparseTest(test.TestCase):
             indices=[[0, 0]], values=[1], dense_shape=[1, 1])),
         (sparse_tensor.SparseTensor(
             indices=[[0, 0]], values=[1], dense_shape=[1, 1]), ()),
-        ((), sparse_tensor.SparseTensor(
-            indices=[[0, 0]], values=[1], dense_shape=[1, 1])),
+        ((),
+         sparse_tensor.SparseTensor(
+             indices=[[0, 0]], values=[1], dense_shape=[1, 1])),
     )
     for expected in test_cases:
+      classes = sparse.get_classes(expected)
+      shapes = nest.map_structure(lambda _: tensor_shape.TensorShape(None),
+                                  classes)
+      types = nest.map_structure(lambda _: dtypes.int32, classes)
       actual = sparse.deserialize_sparse_tensors(
-          sparse.serialize_sparse_tensors(expected),
-          sparse.get_sparse_types(expected))
+          sparse.serialize_sparse_tensors(expected), types, shapes,
+          sparse.get_classes(expected))
       nest.assert_same_structure(expected, actual)
       for a, e in zip(nest.flatten(actual), nest.flatten(expected)):
         self.assertSparseValuesEqual(a, e)
 
-  def testGetSparseTypes(self):
-    s = sparse_tensor.SparseTensor(
-        indices=[[0, 0]], values=[1], dense_shape=[1, 1])
-    t = sparse.SparseType(dtypes.int32)
+  def testSerializeManyDeserialize(self):
     test_cases = (
-        ((), ()),
-        (s, t),
-        ((s), (t)),
-        ((s, ()), (t, ())),
-        (((), s), ((), t)),
-    )
-    for test_case in test_cases:
-      self.assertEqual(sparse.get_sparse_types(test_case[0]), test_case[1])
-
-  def testWrapSparseTypes(self):
-    c = constant_op.constant([1])
-    d = dtypes.int32
-    s = sparse_tensor.SparseTensor(
-        indices=[[0, 0]], values=[1], dense_shape=[1, 1])
-    t = sparse.SparseType(dtypes.int32)
-    test_cases = (
-        ((), ()),
-        (s, t),
-        (c, d),
-        ((s), (t)),
-        ((c), (d)),
-        ((s, ()), (t, ())),
-        (((), s), ((), t)),
-        ((c, ()), (d, ())),
-        (((), c), ((), d)),
-        ((s, (), c), (t, (), d)),
-        (((), s, ()), ((), t, ())),
-        (((), c, ()), ((), d, ())),
-    )
-    for test_case in test_cases:
-      self.assertEqual(
-          sparse.wrap_sparse_types(test_case[0], sparse.get_sparse_types(
-              test_case[0])), test_case[1])
-
-  def testUnwrapSparseTypes(self):
-    d = dtypes.string
-    t = sparse.SparseType(dtypes.int32)
-    test_cases = (
-        ((), ()),
-        (t, d),
-        (d, d),
-        ((t), (d)),
-        ((d), (d)),
-        ((t, ()), (d, ())),
-        (((), t), ((), d)),
-        ((d, ()), (d, ())),
-        (((), d), ((), d)),
-        ((t, (), d), (d, (), d)),
-        (((), t, ()), ((), d, ())),
-        (((), d, ()), ((), d, ())),
+        (),
+        sparse_tensor.SparseTensor(
+            indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
+        sparse_tensor.SparseTensor(
+            indices=[[3, 4]], values=[-1], dense_shape=[4, 5]),
+        sparse_tensor.SparseTensor(
+            indices=[[0, 0], [3, 4]], values=[1, -1], dense_shape=[4, 5]),
+        (sparse_tensor.SparseTensor(
+            indices=[[0, 0]], values=[1], dense_shape=[1, 1])),
+        (sparse_tensor.SparseTensor(
+            indices=[[0, 0]], values=[1], dense_shape=[1, 1]), ()),
+        ((),
+         sparse_tensor.SparseTensor(
+             indices=[[0, 0]], values=[1], dense_shape=[1, 1])),
     )
-    for test_case in test_cases:
-      self.assertEqual(sparse.unwrap_sparse_types(test_case[0]), test_case[1])
+    for expected in test_cases:
+      classes = sparse.get_classes(expected)
+      shapes = nest.map_structure(lambda _: tensor_shape.TensorShape(None),
+                                  classes)
+      types = nest.map_structure(lambda _: dtypes.int32, classes)
+      actual = sparse.deserialize_sparse_tensors(
+          sparse.serialize_many_sparse_tensors(expected), types, shapes,
+          sparse.get_classes(expected))
+      nest.assert_same_structure(expected, actual)
+      for a, e in zip(nest.flatten(actual), nest.flatten(expected)):
+        self.assertSparseValuesEqual(a, e)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index 68b97ddbe3048b7aef18fcf8cc2b41ee545ee55f..789771508e2deaa7dfca1f80853e0d4d0aeb10d8 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -31,6 +31,7 @@ py_library(
         ":debug_graphs",
         ":debug_utils",
         ":grpc_debug_server",
+        ":grpc_debug_test_server",
         ":hooks",
         ":local_cli_wrapper",
         "//tensorflow/python:util",
@@ -45,6 +46,7 @@ py_library(
         ":grpc_debug_test_server",
         ":offline_analyzer",
         ":session_debug_testlib",
+        ":source_remote",
     ] + if_not_windows([
         ":debug_examples",
     ]),
@@ -110,6 +112,17 @@ py_library(
     ],
 )
 
+py_library(
+    name = "source_remote",
+    srcs = ["lib/source_remote.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":debug_service_pb2_grpc",
+        "//tensorflow/core/debug:debug_service_proto_py",
+        "//tensorflow/python/profiler:tfprof_logger",
+    ],
+)
+
 py_library(
     name = "stepper",
     srcs = ["lib/stepper.py"],
@@ -515,6 +528,32 @@ py_test(
     ],
 )
 
+py_test(
+    name = "source_remote_test",
+    size = "small",
+    srcs = ["lib/source_remote_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_windows",
+        "nomac",
+        "oss_serial",
+    ],
+    deps = [
+        ":grpc_debug_test_server",
+        ":source_remote",
+        ":source_utils",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variables",
+    ],
+)
+
 cuda_py_test(
     name = "stepper_test",
     size = "small",
@@ -924,6 +963,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     data = ["//tensorflow/tools/dist_test/server:grpc_tensorflow_server"],
+    grpc_enabled = True,
     tags = [
         "no_oss",  # Incompatible with bazel_pip.
         "no_windows",
diff --git a/tensorflow/python/debug/__init__.py b/tensorflow/python/debug/__init__.py
index 821350ee907c46aaa52b5f47ca763f34458eeb3e..34da44b60df9dbda836d6c91089c5ee90f11c584 100644
--- a/tensorflow/python/debug/__init__.py
+++ b/tensorflow/python/debug/__init__.py
@@ -30,6 +30,8 @@ See the @{$python/tfdbg} guide.
 @@GrpcDebugWrapperSession
 @@LocalCLIDebugHook
 @@LocalCLIDebugWrapperSession
+@@TensorBoardDebugHook
+@@TensorBoardDebugWrapperSession
 @@WatchOptions
 
 @@reconstruct_non_debug_graph_def
@@ -60,9 +62,11 @@ from tensorflow.python.debug.lib.debug_utils import watch_graph_with_blacklists
 from tensorflow.python.debug.wrappers.dumping_wrapper import DumpingDebugWrapperSession
 from tensorflow.python.debug.wrappers.framework import WatchOptions
 from tensorflow.python.debug.wrappers.grpc_wrapper import GrpcDebugWrapperSession
+from tensorflow.python.debug.wrappers.grpc_wrapper import TensorBoardDebugWrapperSession
 from tensorflow.python.debug.wrappers.hooks import DumpingDebugHook
 from tensorflow.python.debug.wrappers.hooks import GrpcDebugHook
 from tensorflow.python.debug.wrappers.hooks import LocalCLIDebugHook
+from tensorflow.python.debug.wrappers.hooks import TensorBoardDebugHook
 from tensorflow.python.debug.wrappers.local_cli_wrapper import LocalCLIDebugWrapperSession
 
 from tensorflow.python.util import all_util as _all_util
diff --git a/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py b/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py
index 442dfb7b3f52e74d3bbbc36391e7ec052365a017..bd00f738610627a4b3bc7c61476164188a7b460c 100644
--- a/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py
+++ b/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py
@@ -22,6 +22,7 @@ import tempfile
 
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.debug.lib import debug_data
 from tensorflow.python.debug.lib import debug_graphs
@@ -41,6 +42,12 @@ class ReconstructNonDebugGraphTest(test_util.TensorFlowTestCase):
   _OP_TYPE_BLACKLIST = (
       "_Send", "_Recv", "_HostSend", "_HostRecv", "_Retval")
 
+  def _no_rewrite_session_config(self):
+    rewriter_config = rewriter_config_pb2.RewriterConfig(
+        dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF)
+    graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
+    return config_pb2.ConfigProto(graph_options=graph_options)
+
   def setUp(self):
     super(ReconstructNonDebugGraphTest, self).setUp()
     self._dump_dir = tempfile.mkdtemp()
@@ -136,7 +143,7 @@ class ReconstructNonDebugGraphTest(test_util.TensorFlowTestCase):
           sess, c, expected_output=400.0)
 
   def testReonstructGraphWithCond(self):
-    with session.Session() as sess:
+    with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = variables.Variable(10.0, name="x")
       y = variables.Variable(20.0, name="y")
       cond = control_flow_ops.cond(
@@ -157,7 +164,7 @@ class ReconstructNonDebugGraphTest(test_util.TensorFlowTestCase):
       self._compareOriginalAndReconstructedGraphDefs(sess, loop)
 
   def testReconstructGraphWithGradients(self):
-    with session.Session() as sess:
+    with session.Session(config=self._no_rewrite_session_config()) as sess:
       u = variables.Variable(12.0, name="u")
       v = variables.Variable(30.0, name="v")
       x = constant_op.constant(1.1, name="x")
diff --git a/tensorflow/python/debug/lib/debug_service_pb2_grpc.py b/tensorflow/python/debug/lib/debug_service_pb2_grpc.py
index 98adc3284b94afc8190f7ee4240d7c5fbf37b4b5..16573eab6f0e61c12020c4becb72369c38f05b42 100755
--- a/tensorflow/python/debug/lib/debug_service_pb2_grpc.py
+++ b/tensorflow/python/debug/lib/debug_service_pb2_grpc.py
@@ -23,6 +23,7 @@ from __future__ import print_function
 import grpc
 
 from tensorflow.core.debug import debug_service_pb2 as tensorflow_dot_core_dot_debug_dot_debug__service__pb2
+from tensorflow.core.protobuf import debug_pb2 as tensorflow_dot_core_dot_protobuf_dot_debug__pb2
 from tensorflow.core.util import event_pb2 as tensorflow_dot_core_dot_util_dot_event__pb2
 
 
@@ -42,6 +43,16 @@ class EventListenerStub(object):
         request_serializer=tensorflow_dot_core_dot_util_dot_event__pb2.Event.SerializeToString,
         response_deserializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.EventReply.FromString,
         )
+    self.SendTracebacks = channel.unary_unary(
+        '/tensorflow.EventListener/SendTracebacks',
+        request_serializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.CallTraceback.SerializeToString,
+        response_deserializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.EventReply.FromString,
+        )
+    self.SendSourceFiles = channel.unary_unary(
+        '/tensorflow.EventListener/SendSourceFiles',
+        request_serializer=tensorflow_dot_core_dot_protobuf_dot_debug__pb2.DebuggedSourceFiles.SerializeToString,
+        response_deserializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.EventReply.FromString,
+        )
 
 
 class EventListenerServicer(object):
@@ -62,6 +73,20 @@ class EventListenerServicer(object):
     context.set_details('Method not implemented!')
     raise NotImplementedError('Method not implemented!')
 
+  def SendTracebacks(self, request, context):
+    """Send the tracebacks of ops in a Python graph definition.
+    """
+    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+    context.set_details('Method not implemented!')
+    raise NotImplementedError('Method not implemented!')
+
+  def SendSourceFiles(self, request, context):
+    """Send a collection of source code files being debugged.
+    """
+    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+    context.set_details('Method not implemented!')
+    raise NotImplementedError('Method not implemented!')
+
 
 def add_EventListenerServicer_to_server(servicer, server):
   rpc_method_handlers = {
@@ -70,6 +95,16 @@ def add_EventListenerServicer_to_server(servicer, server):
           request_deserializer=tensorflow_dot_core_dot_util_dot_event__pb2.Event.FromString,
           response_serializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.EventReply.SerializeToString,
       ),
+      'SendTracebacks': grpc.unary_unary_rpc_method_handler(
+          servicer.SendTracebacks,
+          request_deserializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.CallTraceback.FromString,
+          response_serializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.EventReply.SerializeToString,
+      ),
+      'SendSourceFiles': grpc.unary_unary_rpc_method_handler(
+          servicer.SendSourceFiles,
+          request_deserializer=tensorflow_dot_core_dot_protobuf_dot_debug__pb2.DebuggedSourceFiles.FromString,
+          response_serializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.EventReply.SerializeToString,
+      ),
   }
   generic_handler = grpc.method_handlers_generic_handler(
       'tensorflow.EventListener', rpc_method_handlers)
diff --git a/tensorflow/python/debug/lib/grpc_debug_server.py b/tensorflow/python/debug/lib/grpc_debug_server.py
index 5ab910fb0c9d89bc31a15ecbec48516f07a02979..1b559f1f27538364d8e12339d321e41d33c52590 100644
--- a/tensorflow/python/debug/lib/grpc_debug_server.py
+++ b/tensorflow/python/debug/lib/grpc_debug_server.py
@@ -458,3 +458,36 @@ class EventListenerBaseServicer(debug_service_pb2_grpc.EventListenerServicer):
         `debug_op` as a `str`.
     """
     return list(self._gated_grpc_debug_watches)
+
+  def SendTracebacks(self, request, context):
+    """Base implementation of the handling of SendTracebacks calls.
+
+    The base implementation does nothing with the incoming request.
+    Override in an implementation of the server if necessary.
+
+    Args:
+      request: A `CallTraceback` proto, containing information about the
+        type (e.g., graph vs. eager execution) and source-code traceback of the
+        call and (any) associated `tf.Graph`s.
+      context: Server context.
+
+    Returns:
+      A `EventReply` proto.
+    """
+    return debug_service_pb2.EventReply()
+
+  def SendSourceFiles(self, request, context):
+    """Base implementation of the handling of SendSourceFiles calls.
+
+    The base implementation does nothing with the incoming request.
+    Override in an implementation of the server if necessary.
+
+    Args:
+      request: A `DebuggedSourceFiles` proto, containing the path, content, size
+        and last-modified timestamp of source files.
+      context: Server context.
+
+    Returns:
+      A `EventReply` proto.
+    """
+    return debug_service_pb2.EventReply()
diff --git a/tensorflow/python/debug/lib/grpc_debug_test_server.py b/tensorflow/python/debug/lib/grpc_debug_test_server.py
index 76e45c0bedbb463c872bfca466c6991c9d459e49..a637677d7d092152cd58c20b45520fad97eb90ff 100644
--- a/tensorflow/python/debug/lib/grpc_debug_test_server.py
+++ b/tensorflow/python/debug/lib/grpc_debug_test_server.py
@@ -238,6 +238,15 @@ class EventListenerTestServicer(grpc_debug_server.EventListenerBaseServicer):
         self, server_port,
         functools.partial(EventListenerTestStreamHandler, dump_dir, self))
 
+    # Members for storing the graph ops traceback and source files.
+    self._call_types = []
+    self._call_keys = []
+    self._origin_stacks = []
+    self._origin_id_to_strings = []
+    self._graph_tracebacks = []
+    self._graph_versions = []
+    self._source_files = None
+
   def _initialize_toggle_watch_state(self, toggle_watches):
     self._toggle_watches = toggle_watches
     self._toggle_watch_state = dict()
@@ -259,6 +268,97 @@ class EventListenerTestServicer(grpc_debug_server.EventListenerBaseServicer):
     self.core_metadata_json_strings = []
     self.partition_graph_defs = []
     self.debug_tensor_values = collections.defaultdict(list)
+    self._call_types = []
+    self._call_keys = []
+    self._origin_stacks = []
+    self._origin_id_to_strings = []
+    self._graph_tracebacks = []
+    self._graph_versions = []
+    self._source_files = None
+
+  def SendTracebacks(self, request, context):
+    self._call_types.append(request.call_type)
+    self._call_keys.append(request.call_key)
+    self._origin_stacks.append(request.origin_stack)
+    self._origin_id_to_strings.append(request.origin_id_to_string)
+    self._graph_tracebacks.append(request.graph_traceback)
+    self._graph_versions.append(request.graph_version)
+    return debug_service_pb2.EventReply()
+
+  def SendSourceFiles(self, request, context):
+    self._source_files = request
+    return debug_service_pb2.EventReply()
+
+  def query_op_traceback(self, op_name):
+    """Query the traceback of an op.
+
+    Args:
+      op_name: Name of the op to query.
+
+    Returns:
+      The traceback of the op, as a list of 3-tuples:
+        (filename, lineno, function_name)
+
+    Raises:
+      ValueError: If the op cannot be found in the tracebacks received by the
+        server so far.
+    """
+    for op_log_proto in self._graph_tracebacks:
+      for log_entry in op_log_proto.log_entries:
+        if log_entry.name == op_name:
+          return self._code_def_to_traceback(log_entry.code_def,
+                                             op_log_proto.id_to_string)
+    raise ValueError(
+        "Op '%s' does not exist in the tracebacks received by the debug "
+        "server.")
+
+  def query_origin_stack(self):
+    """Query the stack of the origin of the execution call.
+
+    Returns:
+      A `list` of all tracebacks. Each item corresponds to an execution call,
+        i.e., a `SendTracebacks` request. Each item is a `list` of 3-tuples:
+        (filename, lineno, function_name).
+    """
+    ret = []
+    for stack, id_to_string in zip(
+        self._origin_stacks, self._origin_id_to_strings):
+      ret.append(self._code_def_to_traceback(stack, id_to_string))
+    return ret
+
+  def query_call_types(self):
+    return self._call_types
+
+  def query_call_keys(self):
+    return self._call_keys
+
+  def query_graph_versions(self):
+    return self._graph_versions
+
+  def query_source_file_line(self, file_path, lineno):
+    """Query the content of a given line in a source file.
+
+    Args:
+      file_path: Path to the source file.
+      lineno: Line number as an `int`.
+
+    Returns:
+      Content of the line as a string.
+
+    Raises:
+      ValueError: If no source file is found at the given file_path.
+    """
+    for source_file_proto in self._source_files.source_files:
+      if source_file_proto.file_path == file_path:
+        return source_file_proto.lines[lineno - 1]
+    raise ValueError(
+        "Source file at path %s has not been received by the debug server",
+        file_path)
+
+  def _code_def_to_traceback(self, code_def, id_to_string):
+    return [(id_to_string[trace.file_id],
+             trace.lineno,
+             id_to_string[trace.function_id]) for trace in code_def.traces]
 
 
 def start_server_on_separate_thread(dump_to_filesystem=True,
diff --git a/tensorflow/python/debug/lib/session_debug_grpc_test.py b/tensorflow/python/debug/lib/session_debug_grpc_test.py
index e1ddd4ee642f2a11cf4bb65b1d60b8f731b9c8f6..99781bd9d900eaa848b79c8a5868d37895de43f2 100644
--- a/tensorflow/python/debug/lib/session_debug_grpc_test.py
+++ b/tensorflow/python/debug/lib/session_debug_grpc_test.py
@@ -248,10 +248,24 @@ class SessionDebugGrpcTest(session_debug_testlib.SessionDebugTestBase):
     self.assertEqual(
         14, len(dump.get_tensors("v/read", 0, "DebugNumericSummary")[0]))
 
-  def testConstructGrpcDebugHookWithGrpcInUrlRaisesValueError(self):
-    """Tests that the hook raises an error if the URL starts with grpc://."""
-    with self.assertRaises(ValueError):
-      hooks.GrpcDebugHook(["grpc://foo:42"])
+  def testTensorBoardDebugHooWorks(self):
+    u = variables.Variable(2.1, name="u")
+    v = variables.Variable(20.0, name="v")
+    w = math_ops.multiply(u, v, name="w")
+
+    sess = session.Session(config=no_rewrite_session_config())
+    sess.run(u.initializer)
+    sess.run(v.initializer)
+
+    grpc_debug_hook = hooks.TensorBoardDebugHook(
+        ["localhost:%d" % self._server_port])
+    sess = monitored_session._HookedSession(sess, [grpc_debug_hook])
+
+    self.assertAllClose(42.0, sess.run(w))
+
+  def testConstructGrpcDebugHookWithOrWithouGrpcInUrlWorks(self):
+    hooks.GrpcDebugHook(["grpc://foo:42424"])
+    hooks.GrpcDebugHook(["foo:42424"])
 
 
 class LargeGraphAndLargeTensorsDebugTest(test_util.TensorFlowTestCase):
@@ -684,6 +698,56 @@ class SessionDebugGrpcGatingTest(test_util.TensorFlowTestCase):
           # to disable the breakpoint at delta:0:DebugIdentity.
           self.assertSetEqual(set(), self._server_1.breakpoints)
 
+  def testTensorBoardDebuggerWrapperToggleBreakpointsWorks(self):
+    with session.Session(config=no_rewrite_session_config()) as sess:
+      v_1 = variables.Variable(50.0, name="v_1")
+      v_2 = variables.Variable(-50.0, name="v_2")
+      delta_1 = constant_op.constant(5.0, name="delta_1")
+      delta_2 = constant_op.constant(-5.0, name="delta_2")
+      inc_v_1 = state_ops.assign_add(v_1, delta_1, name="inc_v_1")
+      inc_v_2 = state_ops.assign_add(v_2, delta_2, name="inc_v_2")
+
+      sess.run([v_1.initializer, v_2.initializer])
+
+      # The TensorBoardDebugWrapperSession should add a DebugIdentity debug op
+      # with attribute gated_grpc=True for every tensor in the graph.
+      sess = grpc_wrapper.TensorBoardDebugWrapperSession(
+          sess, self._debug_server_url_1)
+
+      for i in xrange(4):
+        self._server_1.clear_data()
+
+        if i in (0, 2):
+          # Enable breakpoint at delta_[1,2]:0:DebugIdentity in runs 0 and 2.
+          self._server_1.request_watch(
+              "delta_1", 0, "DebugIdentity", breakpoint=True)
+          self._server_1.request_watch(
+              "delta_2", 0, "DebugIdentity", breakpoint=True)
+        else:
+          # Disable the breakpoint in runs 1 and 3.
+          self._server_1.request_unwatch("delta_1", 0, "DebugIdentity")
+          self._server_1.request_unwatch("delta_2", 0, "DebugIdentity")
+
+        output = sess.run([inc_v_1, inc_v_2])
+        self.assertAllClose([50.0 + 5.0 * (i + 1), -50 - 5.0 * (i + 1)], output)
+
+        if i in (0, 2):
+          # During runs 0 and 2, the server should have received the published
+          # debug tensor delta:0:DebugIdentity. The breakpoint should have been
+          # unblocked by EventReply reponses from the server.
+          self.assertAllClose(
+              [5.0],
+              self._server_1.debug_tensor_values["delta_1:0:DebugIdentity"])
+          self.assertAllClose(
+              [-5.0],
+              self._server_1.debug_tensor_values["delta_2:0:DebugIdentity"])
+          # After the runs, the server should have properly registered the
+          # breakpoints.
+        else:
+          # After the end of runs 1 and 3, the server has received the requests
+          # to disable the breakpoint at delta:0:DebugIdentity.
+          self.assertSetEqual(set(), self._server_1.breakpoints)
+
   def testGetGrpcDebugWatchesReturnsCorrectAnswer(self):
     with session.Session() as sess:
       v = variables.Variable(50.0, name="v")
diff --git a/tensorflow/python/debug/lib/session_debug_testlib.py b/tensorflow/python/debug/lib/session_debug_testlib.py
index ed31a8c8cd36668450e2e9d387ddda498c1d7a32..20a40018bf9c67c5b743963489c8fc5616efa2db 100644
--- a/tensorflow/python/debug/lib/session_debug_testlib.py
+++ b/tensorflow/python/debug/lib/session_debug_testlib.py
@@ -58,7 +58,8 @@ from tensorflow.python.training import gradient_descent
 def no_rewrite_session_config():
   rewriter_config = rewriter_config_pb2.RewriterConfig(
       disable_model_pruning=True,
-      arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF)
+      arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
+      dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF)
   graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
   return config_pb2.ConfigProto(graph_options=graph_options)
 
@@ -963,7 +964,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
   def testOutputSlotWithoutOutgoingEdgeCanBeWatched(self):
     """Test watching output slots not attached to any outgoing edges."""
 
-    with session.Session() as sess:
+    with session.Session(config=no_rewrite_session_config()) as sess:
       u_init_val = np.array([[5.0, 3.0], [-1.0, 0.0]])
       u = constant_op.constant(u_init_val, shape=[2, 2], name="u")
 
diff --git a/tensorflow/python/debug/lib/source_remote.py b/tensorflow/python/debug/lib/source_remote.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d10d5a8d11aadcb7c13e498265a4a00dbc8a1fc
--- /dev/null
+++ b/tensorflow/python/debug/lib/source_remote.py
@@ -0,0 +1,205 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Communicating tracebacks and source code with debug server."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import socket
+
+import grpc
+
+from tensorflow.core.debug import debug_service_pb2
+from tensorflow.core.protobuf import debug_pb2
+from tensorflow.python.debug.lib import debug_service_pb2_grpc
+from tensorflow.python.debug.lib import source_utils
+from tensorflow.python.platform import gfile
+from tensorflow.python.profiler import tfprof_logger
+
+
+def _load_debugged_source_file(file_path, source_file_proto):
+  file_stat = gfile.Stat(file_path)
+  source_file_proto.host = socket.gethostname()
+  source_file_proto.file_path = file_path
+  source_file_proto.last_modified = file_stat.mtime_nsec
+  source_file_proto.bytes = file_stat.length
+  try:
+    with gfile.Open(file_path, "r") as f:
+      source_lines = f.readlines()
+      for line in source_lines:
+        source_file_proto.lines.append(line.strip())
+  except IOError:
+    pass
+
+
+def _string_to_id(string, string_to_id):
+  if string not in string_to_id:
+    string_to_id[string] = len(string_to_id)
+  return string_to_id[string]
+
+
+def _format_origin_stack(origin_stack, call_traceback_proto):
+  """Format a traceback stack for a `CallTraceback` proto.
+
+  Args:
+    origin_stack: The stack list as returned by `traceback.extract_stack()`.
+    call_traceback_proto: A `CallTraceback` proto whose fields are to be
+      populated.
+  """
+  string_to_id = dict()
+  string_to_id[None] = 0
+  for frame in origin_stack:
+    file_path, lineno, func_name, line_text = frame
+    call_traceback_proto.origin_stack.traces.add(
+        file_id=_string_to_id(file_path, string_to_id),
+        lineno=lineno,
+        function_id=_string_to_id(func_name, string_to_id),
+        line_id=_string_to_id(line_text, string_to_id))
+
+  id_to_string = call_traceback_proto.origin_id_to_string
+  for key, value in string_to_id.items():
+    id_to_string[value] = key if key is not None else ""
+
+
+def _source_file_paths_outside_tensorflow_py_library(code_defs, id_to_string):
+  """Extract source file paths outside TensorFlow Python library.
+
+  Args:
+    code_defs: An iterable of `CodeDef` protos, i.e., an iterable of stack
+      traces.
+    id_to_string: A proto map from integer ids to strings.
+
+  Returns:
+    An iterable of source file paths outside the TensorFlow Python library.
+  """
+  file_ids = set()
+  for code_def in code_defs:
+    for trace in code_def.traces:
+      file_ids.add(trace.file_id)
+  non_tf_files = (id_to_string[file_id] for file_id in file_ids)
+  non_tf_files = (
+      f for f in non_tf_files
+      if not source_utils.guess_is_tensorflow_py_library(f) and gfile.Exists(f))
+  return non_tf_files
+
+
+def _send_call_tracebacks(destinations,
+                          origin_stack,
+                          is_eager_execution=False,
+                          call_key=None,
+                          graph=None,
+                          send_source=True):
+  """Send the tracebacks of a TensorFlow execution call.
+
+  To gRPC debug server(s). This applies to graph execution (`tf.Session.run()`)
+  calls and eager execution calls.
+
+  If `send_source`, also sends the underlying source files outside the
+  TensorFlow library.
+
+  Args:
+    destinations: gRPC destination addresses, a `str` or a `list` of `str`s,
+      e.g., "localhost:4242". If a `list`, gRPC requests containing the same
+      `CallTraceback` proto payload will be sent to all the destinations.
+    origin_stack: The traceback stack for the origin of the execution call. For
+      graph execution, this is the traceback of the `tf.Session.run()`
+      invocation. For eager execution, this is the traceback of the Python
+      line that executes the eager opertion.
+    is_eager_execution: (`bool`) whether an eager execution call (i.e., not a
+      `tf.Session.run` or derived methods) is being sent.
+    call_key: The key of the execution call, as a string. For graph execution,
+      this is a string describing the feeds, fetches (and targets) names of the
+      `tf.Session.run` call. For eager execution, this is ignored.
+    graph: A Python `tf.Graph` object (i.e., *not* a `tf.GraphDef`), which
+      contains op tracebacks, if applicable.
+    send_source: Whether the source files involved in the op tracebacks but
+      outside the TensorFlow library are to be sent.
+  """
+  if not isinstance(destinations, list):
+    destinations = [destinations]
+
+  call_type = (debug_service_pb2.CallTraceback.EAGER_EXECUTION
+               if is_eager_execution
+               else debug_service_pb2.CallTraceback.GRAPH_EXECUTION)
+  graph_traceback = tfprof_logger.merge_default_with_oplog(
+      graph, add_trainable_var=False) if graph else None
+  call_traceback = debug_service_pb2.CallTraceback(
+      call_type=call_type, call_key=call_key, graph_traceback=graph_traceback,
+      graph_version=graph.version if graph else None)
+
+  _format_origin_stack(origin_stack, call_traceback)
+
+  if send_source:
+    source_file_paths = set()
+    source_file_paths.update(_source_file_paths_outside_tensorflow_py_library(
+        (log_entry.code_def for log_entry
+         in call_traceback.graph_traceback.log_entries),
+        call_traceback.graph_traceback.id_to_string))
+    source_file_paths.update(_source_file_paths_outside_tensorflow_py_library(
+        [call_traceback.origin_stack], call_traceback.origin_id_to_string))
+
+    debugged_source_files = debug_pb2.DebuggedSourceFiles()
+    for file_path in source_file_paths:
+      _load_debugged_source_file(
+          file_path, debugged_source_files.source_files.add())
+
+  for destination in destinations:
+    channel = grpc.insecure_channel(destination)
+    stub = debug_service_pb2_grpc.EventListenerStub(channel)
+    stub.SendTracebacks(call_traceback)
+    if send_source:
+      stub.SendSourceFiles(debugged_source_files)
+
+
+def send_graph_tracebacks(destinations,
+                          run_key,
+                          origin_stack,
+                          graph,
+                          send_source=True):
+  """Send the tracebacks of a graph execution call to debug server(s).
+
+  Args:
+    destinations: gRPC destination addresses, a `str` or a `list` of `str`s,
+      e.g., "localhost:4242". If a `list`, gRPC requests containing the same
+      `CallTraceback` proto payload will be sent to all the destinations.
+    run_key: A string describing the feeds, fetches (and targets) names of the
+      `tf.Session.run` call.
+    origin_stack: The traceback of the `tf.Session.run()` invocation.
+    graph: A Python `tf.Graph` object (i.e., *not* a `tf.GraphDef`), which
+      contains op tracebacks.
+    send_source: Whether the source files involved in the op tracebacks but
+      outside the TensorFlow library are to be sent.
+  """
+  _send_call_tracebacks(
+      destinations, origin_stack, is_eager_execution=False, call_key=run_key,
+      graph=graph, send_source=send_source)
+
+
+def send_eager_tracebacks(destinations,
+                          origin_stack,
+                          send_source=True):
+  """Send the tracebacks of an eager execution call to debug server(s).
+
+  Args:
+    destinations: gRPC destination addresses, a `str` or a `list` of `str`s,
+      e.g., "localhost:4242". If a `list`, gRPC requests containing the same
+    origin_stack: The traceback of the eager operation invocation.
+    send_source: Whether the source files involved in the op tracebacks but
+      outside the TensorFlow library are to be sent.
+  """
+  _send_call_tracebacks(
+      destinations, origin_stack, is_eager_execution=True,
+      send_source=send_source)
diff --git a/tensorflow/python/debug/lib/source_remote_test.py b/tensorflow/python/debug/lib/source_remote_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c4517f681dbd5414de6d4df269356db3a4b654d
--- /dev/null
+++ b/tensorflow/python/debug/lib/source_remote_test.py
@@ -0,0 +1,171 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Unit tests for source_remote."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import traceback
+
+from tensorflow.core.debug import debug_service_pb2
+from tensorflow.python.client import session
+from tensorflow.python.debug.lib import grpc_debug_test_server
+from tensorflow.python.debug.lib import source_remote
+from tensorflow.python.debug.lib import source_utils
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import math_ops
+# Import resource_variable_ops for the variables-to-tensor implicit conversion.
+from tensorflow.python.ops import resource_variable_ops  # pylint: disable=unused-import
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+from tensorflow.python.util import tf_inspect
+
+
+def line_number_above():
+  return tf_inspect.stack()[1][2] - 1
+
+
+class SendTracebacksTest(test_util.TensorFlowTestCase):
+
+  @classmethod
+  def setUpClass(cls):
+    test_util.TensorFlowTestCase.setUpClass()
+    (cls._server_port, cls._debug_server_url, cls._server_dump_dir,
+     cls._server_thread,
+     cls._server) = grpc_debug_test_server.start_server_on_separate_thread()
+    cls._server_address = "localhost:%d" % cls._server_port
+    (cls._server_port_2, cls._debug_server_url_2, cls._server_dump_dir_2,
+     cls._server_thread_2,
+     cls._server_2) = grpc_debug_test_server.start_server_on_separate_thread()
+    cls._server_address_2 = "localhost:%d" % cls._server_port_2
+    cls._curr_file_path = os.path.normpath(os.path.abspath(__file__))
+
+  @classmethod
+  def tearDownClass(cls):
+    # Stop the test server and join the thread.
+    cls._server.stop_server().wait()
+    cls._server_thread.join()
+    cls._server_2.stop_server().wait()
+    cls._server_thread_2.join()
+    test_util.TensorFlowTestCase.tearDownClass()
+
+  def tearDown(self):
+    ops.reset_default_graph()
+    self._server.clear_data()
+    self._server_2.clear_data()
+    super(SendTracebacksTest, self).tearDown()
+
+  def _findFirstTraceInsideTensorFlowPyLibrary(self, op):
+    """Find the first trace of an op that belongs to the TF Python library."""
+    for trace in op.traceback:
+      if source_utils.guess_is_tensorflow_py_library(trace[0]):
+        return trace
+
+  def testSendGraphTracebacksToSingleDebugServer(self):
+    this_func_name = "testSendGraphTracebacksToSingleDebugServer"
+    with session.Session() as sess:
+      a = variables.Variable(21.0, name="a")
+      a_lineno = line_number_above()
+      b = variables.Variable(2.0, name="b")
+      b_lineno = line_number_above()
+      math_ops.add(a, b, name="x")
+      x_lineno = line_number_above()
+
+      send_stack = traceback.extract_stack()
+      send_lineno = line_number_above()
+      source_remote.send_graph_tracebacks(
+          self._server_address, "dummy_run_key", send_stack, sess.graph)
+
+      tb = self._server.query_op_traceback("a")
+      self.assertIn((self._curr_file_path, a_lineno, this_func_name), tb)
+      tb = self._server.query_op_traceback("b")
+      self.assertIn((self._curr_file_path, b_lineno, this_func_name), tb)
+      tb = self._server.query_op_traceback("x")
+      self.assertIn((self._curr_file_path, x_lineno, this_func_name), tb)
+
+      self.assertIn(
+          (self._curr_file_path, send_lineno, this_func_name),
+          self._server.query_origin_stack()[-1])
+
+      self.assertEqual(
+          "a = variables.Variable(21.0, name=\"a\")",
+          self._server.query_source_file_line(__file__, a_lineno))
+      # Files in the TensorFlow code base shouldn not have been sent.
+      tf_trace_file_path = self._findFirstTraceInsideTensorFlowPyLibrary(a.op)
+      with self.assertRaises(ValueError):
+        self._server.query_source_file_line(tf_trace_file_path, 0)
+      self.assertEqual([debug_service_pb2.CallTraceback.GRAPH_EXECUTION],
+                       self._server.query_call_types())
+      self.assertEqual(["dummy_run_key"], self._server.query_call_keys())
+      self.assertEqual(
+          [sess.graph.version], self._server.query_graph_versions())
+
+  def testSendGraphTracebacksToTwoDebugServers(self):
+    this_func_name = "testSendGraphTracebacksToTwoDebugServers"
+    with session.Session() as sess:
+      a = variables.Variable(21.0, name="two/a")
+      a_lineno = line_number_above()
+      b = variables.Variable(2.0, name="two/b")
+      b_lineno = line_number_above()
+      x = math_ops.add(a, b, name="two/x")
+      x_lineno = line_number_above()
+
+      send_traceback = traceback.extract_stack()
+      send_lineno = line_number_above()
+      source_remote.send_graph_tracebacks(
+          [self._server_address, self._server_address_2],
+          "dummy_run_key", send_traceback, sess.graph)
+
+      servers = [self._server, self._server_2]
+      for server in servers:
+        tb = server.query_op_traceback("two/a")
+        self.assertIn((self._curr_file_path, a_lineno, this_func_name), tb)
+        tb = server.query_op_traceback("two/b")
+        self.assertIn((self._curr_file_path, b_lineno, this_func_name), tb)
+        tb = server.query_op_traceback("two/x")
+        self.assertIn((self._curr_file_path, x_lineno, this_func_name), tb)
+
+        self.assertIn(
+            (self._curr_file_path, send_lineno, this_func_name),
+            server.query_origin_stack()[-1])
+
+        self.assertEqual(
+            "x = math_ops.add(a, b, name=\"two/x\")",
+            server.query_source_file_line(__file__, x_lineno))
+        tf_trace_file_path = self._findFirstTraceInsideTensorFlowPyLibrary(x.op)
+        with self.assertRaises(ValueError):
+          server.query_source_file_line(tf_trace_file_path, 0)
+        self.assertEqual([debug_service_pb2.CallTraceback.GRAPH_EXECUTION],
+                         server.query_call_types())
+        self.assertEqual(["dummy_run_key"], server.query_call_keys())
+        self.assertEqual([sess.graph.version], server.query_graph_versions())
+
+  def testSendEagerTracebacksToSingleDebugServer(self):
+    this_func_name = "testSendEagerTracebacksToSingleDebugServer"
+    send_traceback = traceback.extract_stack()
+    send_lineno = line_number_above()
+    source_remote.send_eager_tracebacks(self._server_address, send_traceback)
+
+    self.assertEqual([debug_service_pb2.CallTraceback.EAGER_EXECUTION],
+                     self._server.query_call_types())
+    self.assertIn((self._curr_file_path, send_lineno, this_func_name),
+                  self._server.query_origin_stack()[-1])
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/python/debug/lib/stepper.py b/tensorflow/python/debug/lib/stepper.py
index 1fa0b3dba2b547bf1d311e42e1005a8e501f9829..c27b3f51cddb51654b1ff5a35fd7d689fc4109c4 100644
--- a/tensorflow/python/debug/lib/stepper.py
+++ b/tensorflow/python/debug/lib/stepper.py
@@ -80,7 +80,7 @@ class NodeStepper(object):
   when they are required as data dependencies.
 
   The temporary directories are automatically clean when the NodeStepper
-  instance exits as a context mananger.
+  instance exits as a context manager.
 
   Once the tracing is complete, it will issue a run() call on the
   underlying session, using the aforementioned feed_dict prepared by the input
diff --git a/tensorflow/python/debug/wrappers/framework.py b/tensorflow/python/debug/wrappers/framework.py
index 4e243cb6c9649a24009a0c9ac501c59eaac3bd79..909150eb6aa21b45af39f7cbfd6248c701ae1fb5 100644
--- a/tensorflow/python/debug/wrappers/framework.py
+++ b/tensorflow/python/debug/wrappers/framework.py
@@ -706,7 +706,8 @@ class BaseDebugWrapperSession(session.SessionInterface):
         exec_type, exec_value, exec_tb)
 
   def __del__(self):
-    self._sess.__del__()
+    if hasattr(self._sess, "__del__"):
+      self._sess.__del__()
 
   def close(self):
     self._sess.close()
diff --git a/tensorflow/python/debug/wrappers/grpc_wrapper.py b/tensorflow/python/debug/wrappers/grpc_wrapper.py
index 4062016607c8a56eb275fe4712a47c84bc7ed01c..c3bd1da7aa44111b411365034e1952e76fe12741 100644
--- a/tensorflow/python/debug/wrappers/grpc_wrapper.py
+++ b/tensorflow/python/debug/wrappers/grpc_wrapper.py
@@ -38,7 +38,7 @@ class GrpcDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
       sess: The TensorFlow `Session` object being wrapped.
       grpc_debug_server_addresses: (`str` or `list` of `str`) Single or a list
         of the gRPC debug server addresses, in the format of
-        <host:port>, without the "grpc://" prefix. For example:
+        <host:port>, with or without the "grpc://" prefix. For example:
           "localhost:7000",
           ["localhost:7000", "192.168.0.2:8000"]
       watch_fn: (`Callable`) A Callable that can be used to define per-run
@@ -62,8 +62,7 @@ class GrpcDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
 
     if isinstance(grpc_debug_server_addresses, str):
       self._grpc_debug_server_urls = [
-          self._GRPC_URL_PREFIX + grpc_debug_server_addresses
-      ]
+          self._normalize_grpc_url(grpc_debug_server_addresses)]
     elif isinstance(grpc_debug_server_addresses, list):
       self._grpc_debug_server_urls = []
       for address in grpc_debug_server_addresses:
@@ -71,7 +70,7 @@ class GrpcDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
           raise TypeError(
               "Expected type str in list grpc_debug_server_addresses, "
               "received type %s" % type(address))
-        self._grpc_debug_server_urls.append(self._GRPC_URL_PREFIX + address)
+        self._grpc_debug_server_urls.append(self._normalize_grpc_url(address))
     else:
       raise TypeError(
           "Expected type str or list in grpc_debug_server_addresses, "
@@ -93,3 +92,37 @@ class GrpcDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
     """
 
     return self._grpc_debug_server_urls
+
+  def _normalize_grpc_url(self, address):
+    return (self._GRPC_URL_PREFIX + address
+            if not address.startswith(self._GRPC_URL_PREFIX) else address)
+
+
+class TensorBoardDebugWrapperSession(GrpcDebugWrapperSession):
+  """A tfdbg Session wrapper that can be used with TensorBoard Debugger Plugin.
+
+  This wrapper is the same as `GrpcDebugWrapperSession`, except that it uses a
+    predefined `watch_fn` that
+    1) uses `DebugIdentity` debug ops with the `gated_grpc` attribute set to
+        `True` to allow the interactive enabling and disabling of tensor
+       breakpoints.
+    2) watches all tensors in the graph.
+  This saves the need for the user to define a `watch_fn`.
+  """
+
+  def __init__(self,
+               sess,
+               grpc_debug_server_addresses,
+               thread_name_filter=None,
+               log_usage=True):
+    def _gated_grpc_watch_fn(fetches, feeds):
+      del fetches, feeds  # Unused.
+      return framework.WatchOptions(
+          debug_ops=["DebugIdentity(gated_grpc=true)"])
+
+    super(TensorBoardDebugWrapperSession, self).__init__(
+        sess,
+        grpc_debug_server_addresses,
+        watch_fn=_gated_grpc_watch_fn,
+        thread_name_filter=thread_name_filter,
+        log_usage=log_usage)
diff --git a/tensorflow/python/debug/wrappers/hooks.py b/tensorflow/python/debug/wrappers/hooks.py
index 4efa97973eb893a0105ca6abce6d306c1f6867d8..430669962484211e1d07555a605b85bf149465e5 100644
--- a/tensorflow/python/debug/wrappers/hooks.py
+++ b/tensorflow/python/debug/wrappers/hooks.py
@@ -27,9 +27,6 @@ from tensorflow.python.debug.wrappers import grpc_wrapper
 from tensorflow.python.debug.wrappers import local_cli_wrapper
 from tensorflow.python.training import session_run_hook
 
-# The prefix for GRPC endpoint URLs.
-_GRPC_ENDPOINT_PREFIX = "grpc://"
-
 
 class LocalCLIDebugHook(session_run_hook.SessionRunHook):
   """Command-line-interface debugger hook.
@@ -249,8 +246,8 @@ class GrpcDebugHook(session_run_hook.SessionRunHook):
 
     Args:
       grpc_debug_server_addresses: (`list` of `str`) A list of the gRPC debug
-        server addresses, in the format of <host:port>, without the "grpc://"
-        prefix. For example: ["localhost:7000", "192.168.0.2:8000"]
+        server addresses, in the format of <host:port>, with or without the
+        "grpc://" prefix. For example: ["localhost:7000", "192.168.0.2:8000"]
       watch_fn: A function that allows for customizing which ops to watch at
         which specific steps. See doc of
         `dumping_wrapper.DumpingDebugWrapperSession.__init__` for details.
@@ -258,23 +255,14 @@ class GrpcDebugHook(session_run_hook.SessionRunHook):
         wrapper session will be active. See doc of `BaseDebugWrapperSession` for
         more details.
       log_usage: (bool) Whether usage is to be logged.
-
-    Raises:
-      ValueError: if any debugger server addresses start with grpc://.
     """
-
-    for address in grpc_debug_server_addresses:
-      if address.startswith(_GRPC_ENDPOINT_PREFIX):
-        raise ValueError(
-            ("Debug server address %r starts with %r. It should not because "
-             "the hook already automatically adds the prefix.") % (
-                 address, _GRPC_ENDPOINT_PREFIX))
-
-    # A wrapper session responsible for GRPC communication.
     self._grpc_debug_wrapper_session = None
     self._thread_name_filter = thread_name_filter
+    self._grpc_debug_server_addresses = (
+        grpc_debug_server_addresses
+        if isinstance(grpc_debug_server_addresses, list)
+        else [grpc_debug_server_addresses])
 
-    self._grpc_debug_server_addresses = grpc_debug_server_addresses
     self._watch_fn = watch_fn
     self._log_usage = log_usage
 
@@ -315,3 +303,31 @@ class GrpcDebugHook(session_run_hook.SessionRunHook):
 
     return session_run_hook.SessionRunArgs(
         None, feed_dict=None, options=run_options)
+
+
+class TensorBoardDebugHook(GrpcDebugHook):
+  """A tfdbg hook that can be used with TensorBoard Debugger Plugin.
+
+  This hook is the same as `GrpcDebugHook`, except that it uses a predefined
+    `watch_fn` that
+    1) uses `DebugIdentity` debug ops with the `gated_grpc` attribute set to
+        `True`, to allow the interactive enabling and disabling of tensor
+       breakpoints.
+    2) watches all tensors in the graph.
+  This saves the need for the user to define a `watch_fn`.
+  """
+
+  def __init__(self,
+               grpc_debug_server_addresses,
+               thread_name_filter=None,
+               log_usage=True):
+    def _gated_grpc_watch_fn(fetches, feeds):
+      del fetches, feeds  # Unused.
+      return framework.WatchOptions(
+          debug_ops=["DebugIdentity(gated_grpc=true)"])
+
+    super(TensorBoardDebugHook, self).__init__(
+        grpc_debug_server_addresses,
+        watch_fn=_gated_grpc_watch_fn,
+        thread_name_filter=thread_name_filter,
+        log_usage=log_usage)
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index b491a637bacccd181cab0960f08a5306b719bdd0..f470e181200f19d672cced3ea21d05aa2eee0bea 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -110,6 +110,7 @@ cuda_py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
+        "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:training",
@@ -144,6 +145,7 @@ cuda_py_test(
         ":test",
         "//tensorflow/python:clip_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
     ],
 )
 
@@ -415,6 +417,7 @@ cuda_py_test(
         "//tensorflow/python:layers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:tensor_shape",
     ],
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index 25f7ae785e6582682f9e2e98b6ecffe83d569916..dc1142705abb80abe3729aa42b44f2ca1e97d31f 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -540,7 +540,7 @@ def _ensure_unique_tensor_objects(parameter_positions, args):
     if i in parameter_positions:
       tid = ops.tensor_id(t)
       if tid in s:
-        args[i] = args[i]._dup()  # pylint: disable=protected-access
+        args[i] = gen_array_ops.identity(args[i])
       else:
         s.add(tid)
   return args
@@ -798,13 +798,41 @@ class GradientTape(object):
   grad = g.gradient(y, [x])[0]
   assert grad.numpy() == 6.0
   ```
+
+  By default, the resources held by a GradientTape are released as soon as
+  GradientTape.gradient() method is called. However, if one need to compute
+  multiple gradients over the same computation, she can create a persistent
+  GradientTape. Persistent tapes allow multiple calls to the gradient() method
+  and release resources when the tape object is destructed.
+
+  Example usage:
+
+  ```python
+  with tfe.GradientTape(persistent=True) as g:
+    x = tf.constant(3.0)
+    g.watch(x)
+    y = x * x
+    z = y * y
+  dz_dx = g.gradient(z, [x])[0]
+  assert dz_dx.numpy() == 108.0   # 4*x^3 at x = 3
+  dy_dx = g.gradient(y, [x])[0]
+  assert dy_dx.numpy() == 6.0
+  del g  # Drop the reference to the tape
   """
 
-  def __init__(self):
+  def __init__(self, persistent=False):
+    """Creates a new GradientTape.
+
+    Args:
+      persistent: Boolean controlling whether a persistent gradient tape
+        is created. Must be True or False.
+
+    """
     self._tape = None
+    self._persistent = persistent
 
   def __enter__(self):
-    tape.push_new_tape()
+    tape.push_new_tape(persistent=self._persistent)
     return self
 
   def __exit__(self, typ, value, traceback):
@@ -838,12 +866,14 @@ class GradientTape(object):
        than once.
     """
     if self._tape is None:
-      raise RuntimeError("GradientTape.gradient can only be called once, and "
+      raise RuntimeError("GradientTape.gradient can only be called once "
+                         "on non-persistent tapes, and "
                          "only when the context manager has exited.")
     sources = [x.handle if isinstance(x, resource_variable_ops.ResourceVariable)
                else x
                for x in sources]
     grad = imperative_grad.imperative_grad(
         _default_vspace, self._tape, [target], sources)
-    self._tape = None
+    if not self._persistent:
+      self._tape = None
     return grad
diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py
index 86c9cce3fd8252482163277a87d83fa0b6e9ca21..3da22d4c3416eeabaf311433ffcb9d09d600bf78 100644
--- a/tensorflow/python/eager/backprop_test.py
+++ b/tensorflow/python/eager/backprop_test.py
@@ -28,9 +28,9 @@ from tensorflow.python.eager import tape
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import gradients
@@ -111,7 +111,7 @@ class BackpropTest(test.TestCase):
       return x, grad
 
     # TODO(apassos) raise the right error here
-    with self.assertRaises(errors_impl.InternalError):
+    with self.assertRaises(RuntimeError):
       backprop.gradients_function(f)(constant_op.constant(1.0))
 
   def testImplicitGradOverEmbeddingLookup(self):
@@ -152,6 +152,7 @@ class BackpropTest(test.TestCase):
     opt.apply_gradients([(grad, embedding)])
     self.assertAllClose(expected, embedding.read_value())
 
+  @test_util.assert_no_new_tensors
   def testGradientNone(self):
 
     def loss(x, l):
@@ -166,6 +167,7 @@ class BackpropTest(test.TestCase):
     g, = backprop.gradients_function(loss, [0])(logits, labels)
     self.assertAllEqual(g.numpy(), [[-0.5, 0.5]])
 
+  @test_util.assert_no_new_tensors
   def testSecondGrad(self):
 
     def first(x):
@@ -182,6 +184,7 @@ class BackpropTest(test.TestCase):
     grad = backprop.gradients_function(second, [0])(f)[0]
     self.assertAllEqual([[0.0]], grad)
 
+  @test_util.assert_no_new_tensors
   def testMakeVJP(self):
 
     def f(x):
@@ -192,6 +195,7 @@ class BackpropTest(test.TestCase):
     self.assertAllEqual(result, 9.0)
     self.assertAllEqual(vjp(2.0)[0], 12.0)
 
+  @test_util.assert_no_new_tensors
   def testGradGrad(self):
 
     def sq(x):
@@ -205,6 +209,7 @@ class BackpropTest(test.TestCase):
 
     self.assertAllEqual(gradgrad(constant_op.constant(3.0))[0], 2.0)
 
+  @test_util.assert_no_new_tensors
   def testGradGradExp(self):
 
     def grad(x):
@@ -215,6 +220,22 @@ class BackpropTest(test.TestCase):
 
     self.assertAllEqual(gradgrad(constant_op.constant(0.0))[0], 1.0)
 
+  @test_util.assert_no_new_tensors
+  def testStopGradient(self):
+    grad = backprop.gradients_function(
+        lambda x: array_ops.stop_gradient(math_ops.argmax(x)))
+    self.assertAllEqual(grad([0.0])[0], None)
+
+  @test_util.assert_no_new_tensors
+  def testArgmax(self):
+    def argmax(x):
+      i = math_ops.argmax(x)
+      return array_ops.stop_gradient(i)
+
+    grad = backprop.gradients_function(argmax)
+    self.assertAllEqual(grad([0.0])[0], None)
+
+  @test_util.assert_no_new_tensors
   def testGPU(self):
     if not context.context().num_gpus():
       self.skipTest('No GPUs found')
@@ -230,6 +251,7 @@ class BackpropTest(test.TestCase):
     grad = backprop.gradients_function(fn, [0])(constant_op.constant(1.0))[0]
     self.assertAllEqual(grad, 1.0)
 
+  @test_util.assert_no_new_tensors
   def testGPUImplicitGrad(self):
     if not context.context().num_gpus():
       self.skipTest('No GPU found')
@@ -245,6 +267,7 @@ class BackpropTest(test.TestCase):
     self.assertEqual(
         backprop.implicit_grad(f)()[0][0].cpu().numpy(), 1.0)
 
+  @test_util.assert_no_new_tensors
   def testCPU(self):
 
     def fn(x):
@@ -255,6 +278,7 @@ class BackpropTest(test.TestCase):
     grad = backprop.gradients_function(fn, [0])(constant_op.constant(1.0))[0]
     self.assertAllEqual(grad, 1.0)
 
+  @test_util.assert_no_new_tensors
   def testTensorCopyGPU2CPU2GPU(self):
     if not context.context().num_gpus():
       self.skipTest('No GPUs found')
@@ -269,6 +293,7 @@ class BackpropTest(test.TestCase):
     grad = backprop.gradients_function(f, [0])(a, b)[0]
     self.assertAllEqual(grad, 1.0)
 
+  @test_util.assert_no_new_tensors
   def testEmptyParams(self):
 
     def fn(a, b):
@@ -280,6 +305,7 @@ class BackpropTest(test.TestCase):
     self.assertAllEqual(dx, y.numpy())
     self.assertAllEqual(dy, x.numpy())
 
+  @test_util.assert_no_new_tensors
   def testUnconnectedNone(self):
     v = resource_variable_ops.ResourceVariable(
         1.0, name='testUnconnectedNone')
@@ -290,6 +316,7 @@ class BackpropTest(test.TestCase):
 
     self.assertEqual(backprop.implicit_grad(f)()[0][0], None)
 
+  @test_util.assert_no_new_tensors
   def testGradientTape(self):
     with backprop.GradientTape() as g:
       x = constant_op.constant(3.0)
@@ -304,6 +331,7 @@ class BackpropTest(test.TestCase):
     grad = g.gradient(y, [x])[0]
     self.assertEqual(grad.numpy(), 6.0)
 
+  @test_util.assert_no_new_tensors
   def testGradientTapeGradientCalledMultipleTimes(self):
     with backprop.GradientTape() as g:
       x = constant_op.constant(3.0)
@@ -315,6 +343,40 @@ class BackpropTest(test.TestCase):
         RuntimeError, 'GradientTape.gradient can only be called once'):
       g.gradient(y, [x])
 
+  @test_util.assert_no_new_tensors
+  def testPersistentTape(self):
+    with backprop.GradientTape(persistent=True) as g:
+      x = constant_op.constant(3.0)
+      g.watch(x)
+      y = x * x
+      z = y * y
+    dz_dx = g.gradient(z, [x])[0]
+    self.assertEqual(dz_dx.numpy(), 4*3*3*3)
+    dy_dx = g.gradient(y, [x])[0]
+    self.assertEqual(dy_dx.numpy(), 2*3)
+    del g
+
+  @test_util.assert_no_new_tensors
+  def testPersistentNestedTape(self):
+    with backprop.GradientTape(persistent=True) as g:
+      x = constant_op.constant(3.0)
+      g.watch(x)
+      y = x * x
+      with backprop.GradientTape(persistent=True) as gg:
+        gg.watch(y)
+        z = 2 * y
+      for _ in range(2):
+        inner_grad = gg.gradient(z, [y])[0]
+        self.assertEqual(inner_grad.numpy(), 2.0)
+      y += inner_grad
+      del gg
+    grad = g.gradient(y, [x])[0]
+    self.assertEqual(grad.numpy(), 6.0)
+    grad = g.gradient(z, [x])[0]
+    self.assertEqual(grad.numpy(), 12.0)
+    del g
+
+  @test_util.assert_no_new_tensors
   def testGradientTapeVariable(self):
     v = resource_variable_ops.ResourceVariable(1.0, name='v')
     with backprop.GradientTape() as g:
@@ -322,6 +384,7 @@ class BackpropTest(test.TestCase):
     grad = g.gradient(y, [v])[0]
     self.assertAllEqual(grad, 2.0)
 
+  @test_util.assert_no_new_tensors
   def testEmptyParamsForValueAndGradFunction(self):
     def fn(a, b):
       return a * b
@@ -334,6 +397,7 @@ class BackpropTest(test.TestCase):
     self.assertAllEqual(dx, y)
     self.assertAllEqual(dy, x)
 
+  @test_util.assert_no_new_tensors
   def testNonEmptyParamsForValueAndGradFunction(self):
     def fn(a, b):
       return a * b
@@ -346,6 +410,7 @@ class BackpropTest(test.TestCase):
     self.assertEqual(1, len(grads))
     self.assertAllEqual(grads[0], x)
 
+  @test_util.assert_no_new_tensors
   def testTensorCopyCPU2GPU2CPU(self):
     if not context.context().num_gpus():
       self.skipTest('No GPUs found')
@@ -430,6 +495,7 @@ class BackpropTest(test.TestCase):
 
     self.assertAllEqual(backprop.gradients_function(f)(1.0)[0], 3.0)
 
+  @test_util.assert_no_new_tensors
   def testExceptionSafety(self):
 
     def f(unused_x):
@@ -445,6 +511,7 @@ class BackpropTest(test.TestCase):
 
     self.assertAllEqual(backprop.gradients_function(real_f)(1.0)[0], 2.0)
 
+  @test_util.assert_no_new_tensors
   def testMultiValueConvertToTensor(self):
     x = resource_variable_ops.ResourceVariable(
         initial_value=array_ops.constant([1.0]), name='x')
@@ -505,6 +572,7 @@ class BackpropTest(test.TestCase):
         initial_value=1., name='testSameObjectForMultipleArguments.Variable')
     self.assertAllEqual([1., 1.], np_g(v, v))
 
+  @test_util.assert_no_new_tensors
   def testImplicitGradientsCustomGradientAndCachedVariableValue(self):
 
     @custom_gradient.custom_gradient
@@ -530,6 +598,7 @@ class BackpropTest(test.TestCase):
     self.assertAllEqual(7, grad)
     self.assertAllEqual(x, var)
 
+  @test_util.assert_no_new_tensors
   def testCustomGradient(self):
 
     @custom_gradient.custom_gradient
@@ -556,6 +625,7 @@ class BackpropTest(test.TestCase):
         var.assign_sub(lr*grad)
     self.assertAllEqual(losses, [4.0, 3., 2., 1., 0.])
 
+  @test_util.assert_no_new_tensors
   def testCustomGradientIdentity(self):
 
     @custom_gradient.custom_gradient
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index 92f4e15c054bd8cf3886b8c22e414abdfccbdae5..8aec242f1de06f734cd853427599fcf37a21a747 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import contextlib
 import copy
 import random
@@ -62,6 +63,41 @@ class _EagerContext(threading.local):
     self.scalar_cache = {}
 
 
+ContextStackEntry = collections.namedtuple(
+    "ContextStackEntry", ["is_building_function", "enter_context_fn"])
+
+
+class ContextStack(threading.local):
+  """A thread-local stack of context switches."""
+
+  def __init__(self):
+    super(ContextStack, self).__init__()
+    self.stack = []
+
+  def push(self, is_building_function, enter_context_fn):
+    """Push metadata about a context switch onto the stack.
+
+    A context switch can take one of two forms: installing a graph as the
+    default graph, or entering the eager context.
+
+    Args:
+      is_building_function: (bool.) Whether the context is building a function.
+      enter_context_fn: (function.) A callable that executes the context switch.
+        For example, `graph.as_default` or `eager_mode`.
+    """
+
+    self.stack.append(
+        ContextStackEntry(is_building_function, enter_context_fn))
+
+  def pop(self):
+    """Pop the stack."""
+
+    self.stack.pop()
+
+
+context_stack = ContextStack()
+
+
 # TODO(agarwal): rename to EagerContext / EagerRuntime ?
 # TODO(agarwal): consider keeping the corresponding Graph here.
 class Context(object):
@@ -183,10 +219,14 @@ class Context(object):
     ctx = self._eager_context
     old_mode = ctx.mode
     ctx.mode = mode
+    if mode == EAGER_MODE:
+      context_stack.push(False, eager_mode)
     try:
       yield
     finally:
       ctx.mode = old_mode
+      if mode == EAGER_MODE:
+        context_stack.pop()
 
   def in_graph_mode(self):
     """Returns True if current thread is in GRAPH mode."""
@@ -288,6 +328,21 @@ class Context(object):
     self._initialize_handle_and_devices()
     return self._num_gpus
 
+  def add_function(self, fn):
+    """Add a function definition to the context.
+
+    Once added, the function (identified by its name) can be executed like any
+    other operation.
+
+    Args:
+      fn: A wrapped TF_Function (returned from TF_GraphToFunction_wrapper).
+    """
+    with errors.raise_exception_on_not_ok_status() as status:
+      pywrap_tensorflow.TFE_ContextAddFunction(
+          self._handle,  # pylint: disable=protected-access
+          fn,
+          status)
+
   def add_function_def(self, fdef):
     """Add a function definition to the context.
 
diff --git a/tensorflow/python/eager/core_test.py b/tensorflow/python/eager/core_test.py
index 2449162dcaa47cb71dde3be70675654709fec794..02694b34fe9279c0ce6dd5946e7779f4f9ca2c85 100644
--- a/tensorflow/python/eager/core_test.py
+++ b/tensorflow/python/eager/core_test.py
@@ -84,6 +84,20 @@ class TFETest(test_util.TensorFlowTestCase):
     self.assertTrue(has_cpu_device)
     del ctx
 
+  def testContextStackContainsEagerMode(self):
+    # Eager execution has been enabled, and no other context
+    # switch has occurred, so `context_stack` should contain
+    # exactly one entry.
+    self.assertEqual(len(context.context_stack.stack), 1)
+    stack_entry = context.context_stack.stack[0]
+
+    # The entry should log that eager mode was entered.
+    self.assertIs(stack_entry.enter_context_fn, context.eager_mode)
+
+    # It is not possible to build a graph function when eager execution
+    # is enabled; the stack entry should reflect this fact.
+    self.assertFalse(stack_entry.is_building_function)
+
   def _runInThread(self, target, args):
     t = threading.Thread(target=target, args=args)
     try:
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 9bcd9c23c7bad4d4e3b93fa4bb5fc2c316d5c828..d94a7acd09cda6aa14ef205b88e369e581ee81af 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -25,15 +25,19 @@ import threading
 
 import numpy as np
 
+from tensorflow.core.framework import function_pb2
+from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.eager import execute
 from tensorflow.python.eager import tape
 from tensorflow.python.eager.graph_only_ops import graph_placeholder
+from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import graph_to_function_def
+from tensorflow.python.framework import dtypes as dtypes_module
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.util import compat
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 
@@ -47,28 +51,6 @@ _scoped_captures = threading.local()
 _scoped_captures.tensors = None
 
 
-def make_function_def(graph, operations, inputs, outputs):
-  """Makes function def where accesses to resources are serialized."""
-  last_op_using_resource_tensor = {}
-
-  # TODO(apassos) probably control flow has to be handled delicately here as in
-  # if a resource is accessed inside a control flow context we need the control
-  # dependency to point to something outside the context which is guaranteed to
-  # happen after the access.
-  #
-  # TODO(apassos) this should do some form of alias analysis as ops which
-  # forward the resources such as Identity and Switch can cause serialization to
-  # fail.
-  for op in operations:
-    for t in op.inputs:
-      if t.dtype == dtypes.resource:
-        if t.name in last_op_using_resource_tensor:
-          op._add_control_input(last_op_using_resource_tensor[t.name])  # pylint: disable=protected-access
-        last_op_using_resource_tensor[t.name] = op
-  return graph_to_function_def.graph_to_function_def(
-      graph, operations, inputs, outputs)
-
-
 @contextlib.contextmanager
 def capture_tensors(captures):
   old = _scoped_captures.__dict__.get("tensors", None)
@@ -85,7 +67,7 @@ def capture_value(tensor_map, value, dtype, name):
   if captured_value is None:
     captured_value = graph_placeholder(
         dtype=dtype or value.dtype, shape=value.shape, name=name)
-    if captured_value.dtype == dtypes.resource:
+    if captured_value.dtype == dtypes_module.resource:
       captured_value._handle_data = value._handle_data  # pylint: disable=protected-access
     tensor_map[ops.tensor_id(value)] = (value, captured_value)
   else:
@@ -101,7 +83,7 @@ def _convert_to_graph_tensor(value, dtype=None, name=None, as_ref=False):
   Arguments:
     value: A Tensor object.
     dtype: The datatype of the value produced by the node in the graph.
-    name:  Name of the node in the graph.
+    name:  str, Name of the node in the graph.
     as_ref: Ignored (required by register_tensor_conversion_function).
 
   Returns:
@@ -109,22 +91,45 @@ def _convert_to_graph_tensor(value, dtype=None, name=None, as_ref=False):
     is not enabled. A placeholder which will have the value of the
     tensor at runtime otherwise.
   """
+  del as_ref  # Unused.
+
   if context.in_eager_mode():
     return value
-  _ = as_ref
+
+  default_graph = ops.get_default_graph()
+  if not default_graph.building_function:
+    return value
+
   tensor_map = _scoped_captures.tensors
   if tensor_map is None:
     # Capturing is not enabled.
     return constant_op.constant(value.numpy())
+  if type(value) == ops.Tensor and value.graph is default_graph:
+    # The tensor has already been converted and captured. The type check
+    # is intentional: we are checking that value is a Tensor and not an
+    # EagerTensor.
+    return value
   return capture_value(tensor_map, value, dtype, name)
 
 
 class CapturingGraph(ops.Graph):
+  """Graph used when constructing eager functions."""
 
   def __init__(self, captures):
     super(CapturingGraph, self).__init__()
     self._building_function = True
     self.captures = captures
+    # Map from resource tensor name to last op (in program order) which uses
+    # this tensor. Used to enforce that execution order matches program order
+    # for resource tensors.
+    self._last_op_using_resource_tensor = {}
+
+  # TODO(apassos) remove once the C API is used by default.
+  def _use_c_api_hack(self):
+    return True
+
+  def clear_resource_control_flow_state(self):
+    self._last_op_using_resource_tensor = {}
 
   def create_op(
       self,
@@ -137,12 +142,31 @@ class CapturingGraph(ops.Graph):
       op_def=None,
       compute_shapes=True,
       compute_device=True):
+    # TODO(apassos) probably control flow has to be handled delicately here as
+    # in if a resource is accessed inside a control flow context we need the
+    # control dependency to point to something outside the context which is
+    # guaranteed to happen after the access.
+    #
+    # TODO(apassos) this should do some form of alias analysis as ops which
+    # forward the resources such as Identity and Switch can cause serialization
+    # to fail.
+    resource_inputs = set()
+    control_inputs = set()
     for i, inp in enumerate(inputs):
       if inp.graph is not self:
         inputs[i] = capture_value(self.captures, inp, inp.dtype, inp.op.name)
-    return super(CapturingGraph, self).create_op(
-        op_type, inputs, dtypes, input_types, name, attrs, op_def,
-        compute_shapes, compute_device)
+      inp = inputs[i]
+      if inp.dtype == dtypes_module.resource:
+        if inp.name in self._last_op_using_resource_tensor:
+          control_inputs.add(self._last_op_using_resource_tensor[inp.name])
+        resource_inputs.add(inp.name)
+    with self.control_dependencies(list(control_inputs)):
+      op = super(CapturingGraph, self).create_op(
+          op_type, inputs, dtypes, input_types, name, attrs, op_def,
+          compute_shapes, compute_device)
+    for name in resource_inputs:
+      self._last_op_using_resource_tensor[name] = op
+    return op
 
 
 # TODO(apassos): it'd be really nice if we could scope this registration.
@@ -196,14 +220,52 @@ def _inference_name(n):
   return "__inference_%s_%s" % (n, ops.uid())
 
 
-class _DefinedFunction(object):
-  """Mocks the interface of tf _DefinedFunction."""
+# TODO(apassos) get rid of this by splitting framework.function._DefinedFunction
+# so it doesn't have the definition-generating logic and is just a container for
+# an already-defined function.
+class _EagerDefinedFunction(object):
+  """Function object with the interface of tf _DefinedFunction."""
+
+  def __init__(self, name, graph, operations, inputs, outputs):
+    """Initializes an eager defined function.
 
-  def __init__(self, fdef):
-    self.definition = fdef
-    self.name = fdef.signature.name
+    Args:
+      name: str, the name for the created function.
+      graph: Graph, the graph containing the operations in the function
+      operations: list of Operation; the subset of operations in the graph
+        which will be in the function
+      inputs: the tensors in the graph to be used as inputs to the function
+      outputs: the tensors in the graph which will be outputs to the function
+    """
+    with errors.raise_exception_on_not_ok_status() as status:
+      fn = pywrap_tensorflow.TF_GraphToFunction_wrapper(
+          graph._c_graph,  # pylint: disable=protected-access
+          compat.as_str(name),
+          False,
+          [o._c_op for o in operations],  # pylint: disable=protected-access
+          [t._as_tf_output() for t in inputs],  # pylint: disable=protected-access
+          [t._as_tf_output() for t in outputs],  # pylint: disable=protected-access
+          [],
+          None,
+          compat.as_str(""),
+          status)
+    # TODO(apassos) avoid creating a FunctionDef (specially to grab the
+    # signature, but also in general it's nice not to depend on it.
+    with c_api_util.tf_buffer() as buffer_:
+      with errors.raise_exception_on_not_ok_status() as status:
+        pywrap_tensorflow.TF_FunctionToFunctionDef(fn, buffer_, status)
+      proto_data = pywrap_tensorflow.TF_GetBuffer(buffer_)
+    function_def = function_pb2.FunctionDef()
+    function_def.ParseFromString(compat.as_bytes(proto_data))
+    if context.in_eager_mode():
+      _register(fn)
+    self.definition = function_def
+    self.name = function_def.signature.name
+    self.signature = function_def.signature
     self.grad_func_name = None
     self.python_grad_func = None
+    self._c_func = fn
+    self._grad_func = None
 
 
 def _map_sequence_obj_to_idx(sequence):
@@ -215,50 +277,51 @@ class GraphModeFunction(object):
   """Callable object representing a graph-mode function.
 
   Args:
-    input_placeholders: list of placeholder values to feed when calling
-      the wrapped function.
+    name: str the name of the created function
+    input_placeholders: list of placeholder values (tensors) to feed when
+      calling the wrapped function.
     extra_inputs: Tensor inputs this function definition closed over which
       are passed as arguments. Need to track so gradients are supported
       correctly.
-    fdef: the function definition we want to call.
-    graph: the graph from which the fdef operations were pulled. Used as
+    graph: the Graph from which the operations will be pulled. Used as
       a context when computing gradients.
-    operations: the subset of operations in the graph used in the function
+    operations: the subset of Operations in the graph used in the function
       definition.
-    func_outputs: the python outputs of the graph-mode function, with
-      tensorflow.Tensor objects to be replaced by tfe values when called.
-    func_outputs_to_fdef_outputs: Maps id(obj) in func_outputs to index of
-      fdef's outputs. It allows mapping fdef output tensors to nested
-      func_outputs structure.
-    output_shapes: List of shapes of all tensors which are output by the
-      internal function.
+    outputs: a flat list of the Tensors in the graph used as outputs to the
+      function
+    func_outputs: a possibly nested python object which will be returned by
+      this function. The Tensors in this structure will be replaced by their
+      corresponding values in outputs.
+    output_shapes: List of shapes of all tensors in outputs
     variables: (optional) List of variables to watch during function execution.
   """
 
   def __init__(self,
+               name,
                input_placeholders,
                extra_inputs,
-               fdef,
                graph,
                operations,
+               outputs,
                func_outputs,
-               func_outputs_to_fdef_outputs,
                output_shapes,
                variables=None):
-    assert len(input_placeholders) == len(fdef.signature.input_arg), "%s %s" % (
-        len(input_placeholders), len(fdef.signature.input_arg))
+    defined_function = _EagerDefinedFunction(
+        name, graph, operations, input_placeholders, outputs)
+    if len(input_placeholders) != len(defined_function.signature.input_arg):
+      raise ValueError("Internal error: invalid lengths. %s %s" % (
+          len(input_placeholders), len(defined_function.signature.input_arg)))
     self._input_placeholders = input_placeholders
     self._extra_inputs = list(extra_inputs)
     self._graph = graph
     self._has_backprop = False
-    self._func_name = fdef.signature.name
-    self._fdef = _DefinedFunction(fdef)
-    self._num_outputs = len(fdef.signature.output_arg)
+    self._func_name = name
+    self._function_def = defined_function
+    self._num_outputs = len(defined_function.signature.output_arg)
     self._ops = operations
     self._func_outputs = func_outputs
     self._returns = [func_outputs] if isinstance(
         func_outputs, (ops.Tensor, type(None))) else list(func_outputs)
-    self._returns_to_fedf_outputs = func_outputs_to_fdef_outputs
     self._output_shapes = output_shapes
     self._variables = variables if variables is not None else []
 
@@ -272,49 +335,47 @@ class GraphModeFunction(object):
     with self._graph.as_default(), context.graph_mode():
       c = _CapturingContext()
       with c:
-        filtered_outputs = [
-            x for x in self._returns if x is not None
-        ]
+        filtered_outputs = [x for x in self._returns if x is not None]
         self._out_grad_placeholders = [
-            graph_placeholder(x.dtype, x.shape) for x in filtered_outputs
-        ]
+            graph_placeholder(x.dtype, x.shape) for x in filtered_outputs]
         in_gradients = gradients_impl.gradients(
             filtered_outputs,
             self._input_placeholders,
             grad_ys=self._out_grad_placeholders)
-        shapes = [x.shape for x in in_gradients if x is not None]
+        shapes = tuple(x.shape for x in in_gradients if x is not None)
     captures = list(sorted(c.captured_tensors, key=lambda x: x.name))
-    forward_function_def = make_function_def(
-        self._graph, self._ops, self._input_placeholders,
+    forward_name = _forward_name(self._func_name)
+    self._forward_fdef = _EagerDefinedFunction(
+        forward_name, self._graph, self._ops, self._input_placeholders,
         filtered_outputs + captures)
-    self._forward_fdef = _DefinedFunction(forward_function_def)
-    _register_with_name(_forward_name(self._func_name), forward_function_def)
-    backward_outputs = [x for x in in_gradients if x is not None]
+    backward_outputs = tuple(x for x in in_gradients if x is not None)
     all_inputs = self._out_grad_placeholders + captures
-    backward_function_def = make_function_def(
-        self._graph, [x.op for x in self._out_grad_placeholders
-                     ] + list(sorted(c.known_ops, key=lambda x: x.name)),
-        all_inputs, backward_outputs)
-    _register_with_name(_backward_name(self._func_name), backward_function_def)
+    # Excluding input ops from the body as we do not intend to execute these
+    # operations when the function is executed.
+    all_ignored_ops = frozenset(x.op for x in all_inputs)
+    # Enforce a deterministic order of operations in the generated graph. This
+    # means rerunning the function-defining code will always define the same
+    # function, which is useful if we serialize this etc.
+    function_def_ops = tuple(x
+                             for x in sorted(c.known_ops, key=lambda x: x.name)
+                             if x not in all_ignored_ops)
+    bname = _backward_name(self._func_name)
     self._backward_function = GraphModeFunction(
-        all_inputs, [], backward_function_def, self._graph, c.known_ops,
-        in_gradients, _map_sequence_obj_to_idx(backward_outputs), shapes)
+        bname, all_inputs, [], self._graph, function_def_ops,
+        backward_outputs, in_gradients, shapes)
 
   def _backprop_call(self, args):
     """Calls the wrapped function and records the result on a tape."""
     all_args = args + self._extra_inputs
-    signature = self._forward_fdef.definition.signature
+    signature = self._forward_fdef.signature
     ctx = context.context()
     if ctx.in_graph_mode():
       g = ops.get_default_graph()
       g._add_function(self._forward_fdef)  # pylint: disable=protected-access
-      def make_tensor(x):
-        if isinstance(x, ops.Tensor):
-          return x
-        return ops.internal_convert_to_tensor(x, ctx=ctx)
       op = g.create_op(
-          signature.name, [make_tensor(x) for x in all_args],
-          [dtypes.DType(x.type) for x in signature.output_arg],
+          signature.name,
+          [ops.internal_convert_to_tensor(x, ctx=ctx) for x in all_args],
+          tuple(dtypes_module.DType(x.type) for x in signature.output_arg),
           op_def=signature,
           name="FunctionCall",
           compute_shapes=False)
@@ -334,7 +395,7 @@ class GraphModeFunction(object):
     side_outputs = outputs[len(self._returns):]
 
     def backward_function(*args):
-      return self._backward_function(*(list(args) + side_outputs))
+      return self._backward_function(*(list(args) + side_outputs))  # pylint: disable=not-callable
 
     tape.record_operation(
         signature.name,
@@ -344,17 +405,40 @@ class GraphModeFunction(object):
 
     return self._build_call_outputs(real_outputs)
 
+  @property
+  def output_shapes(self):
+    # TODO(ebrevdo): Should we only keep the output shapes associated
+    # with len(self._returns) outputs?
+    return nest.pack_sequence_as(self._func_outputs, self._output_shapes)
+
+  @property
+  def output_dtypes(self):
+    return nest.map_structure(
+        lambda x: x.dtype if x is not None else None, self._func_outputs)
+
+  @property
+  def captured_inputs(self):
+    return self._extra_inputs
+
+  @property
+  def name(self):
+    return self._function_def.name
+
+  def add_to_graph(self, g):
+    if self._function_def.name not in g._functions:  # pylint: disable=protected-access
+      g._add_function(self._function_def)  # pylint: disable=protected-access
+    for f in self._graph._functions.values():  # pylint: disable=protected-access
+      if f.name not in g._functions:  # pylint: disable=protected-access
+        g._add_function(f)  # pylint: disable=protected-access
+
   def __call__(self, *args):
     """Executes the passed function in eager mode."""
     for v in self._variables:
       if v._trainable:  # pylint: disable=protected-access
         tape.watch_variable(v)
 
-    tensor_inputs = [
-        x for x in nest.flatten(args)
-        if isinstance(x, ops.Tensor)
-    ]
-
+    tensor_inputs = [x for x in nest.flatten(args)
+                     if isinstance(x, ops.Tensor)]
     if tape.should_record(tensor_inputs) or tape.should_record(
         self._extra_inputs):
       if not self._has_backprop:
@@ -364,16 +448,13 @@ class GraphModeFunction(object):
     ctx = context.context()
     if ctx.in_graph_mode():
       g = ops.get_default_graph()
-      if self._fdef.name not in g._functions:  # pylint: disable=protected-access
-        g._add_function(self._fdef)  # pylint: disable=protected-access
-      for f in self._graph._functions.values():  # pylint: disable=protected-access
-        if f.name not in g._functions:  # pylint: disable=protected-access
-          g._add_function(f)  # pylint: disable=protected-access
-      signature = self._fdef.definition.signature
+      self.add_to_graph(g)
+      signature = self._function_def.definition.signature
       args = list(tensor_inputs) + self._extra_inputs
       op = g.create_op(
-          signature.name, [ops.convert_to_tensor(x) for x in args],
-          [dtypes.DType(x.type) for x in signature.output_arg],
+          signature.name,
+          [ops.internal_convert_to_tensor(x, ctx=ctx) for x in args],
+          tuple(dtypes_module.DType(x.type) for x in signature.output_arg),
           op_def=signature,
           name="FunctionCall",
           compute_shapes=False)
@@ -451,40 +532,40 @@ def _defun_internal(name, func, args, kwds):
           func_outputs = func(*func_inputs, **kwds)
         finally:
           variables = tape.pop_tape().watched_variables()
+
+        # Returning a closed-over tensor as an output does not trigger a
+        # call to convert_to_tensor, so we manually capture all such tensors.
+        outputs_list = nest.flatten(func_outputs)
+        func_def_outputs = [
+            _convert_to_graph_tensor(x) for x in outputs_list if x is not None
+        ]
+
       ids = list(sorted(captures.keys()))
       if ids:
         extra_inputs, extra_placeholders = zip(* [captures[x] for x in ids])
       else:
         extra_inputs = []
         extra_placeholders = []
-      outputs_list = nest.flatten(func_outputs)
-      output_shapes = [x.shape for x in outputs_list if x is not None]
+      output_shapes = tuple(
+          x.shape if isinstance(x, ops.Tensor) else None
+          for x in outputs_list)
 
-  flat_inputs = [
-      x for x in nest.flatten(func_inputs) if isinstance(x, ops.Tensor)
-  ]
+  flat_inputs = [x for x in nest.flatten(func_inputs)
+                 if isinstance(x, ops.Tensor)]
   all_inputs = flat_inputs + list(extra_placeholders)
-
-  func_def_outputs = [x for x in outputs_list if x is not None]
-  inference_function_def = make_function_def(
-      tmp_graph, tmp_graph.get_operations(), all_inputs, func_def_outputs)
+  all_ignored_ops = frozenset(x.op for x in all_inputs)
+  fname = _inference_name(name)
+  operations = tuple(x for x in tmp_graph.get_operations()
+                     if x not in all_ignored_ops)
   # Register any other functions defined in the graph
   # TODO(ashankar): Oh lord, forgive me for this lint travesty.
-  for f in tmp_graph._functions.values():  # pylint: disable=protected-access
-    # TODO(ashankar): What about the gradient registry?
-    _register_with_name(f.name, f.definition)
-  _register_with_name(_inference_name(name), inference_function_def)
-
+  if context.in_eager_mode():
+    for f in tmp_graph._functions.values():  # pylint: disable=protected-access
+      # TODO(ashankar): What about the gradient registry?
+      _register(f._c_func)  # pylint: disable=protected-access
   return GraphModeFunction(
-      all_inputs,
-      extra_inputs,
-      inference_function_def,
-      tmp_graph,
-      tmp_graph.get_operations(),
-      func_outputs,
-      _map_sequence_obj_to_idx(func_def_outputs),
-      output_shapes,
-      variables=variables)
+      fname, all_inputs, extra_inputs, tmp_graph, operations, func_def_outputs,
+      func_outputs, output_shapes, variables)
 
 
 # Defun uses this instead of Tensor as a cache key. Using dtype because
@@ -506,10 +587,9 @@ def _cache_key(x):
   return x
 
 
-def _register_with_name(name, fdef):
-  """Registers the function `fdef` with the name `name`."""
-  fdef.signature.name = name
-  context.context().add_function_def(fdef)
+def _register(fn):
+  """Registers the function `fn`."""
+  context.context().add_function(fn)
 
 
 # TODO(apassos): better error messages for non-hashable arguments.
@@ -532,7 +612,8 @@ def named_defun(func, name):
     """Decorated version of func."""
     # Macroexpand on non-Tensor arguments
     cache_key = tuple(_cache_key(x) for x in args)
-    assert all(not isinstance(x, ops.EagerTensor) for x in kwds.values())
+    if any(isinstance(x, ops.EagerTensor) for x in kwds.values()):
+      raise ValueError("Tensor keyword arguments are not supported.")
     cache_key = (cache_key, tuple(kwds.items()))
 
     if cache_key not in arguments_to_functions:
@@ -595,3 +676,55 @@ def defun(func):
   """
   # TODO(apassos): deal with captured global state. Deal with control flow.
   return tf_decorator.make_decorator(func, named_defun(func, func.__name__))
+
+
+def make_defun_op(func, *args, **kwds):
+  """Compile func into graph_mode, assuming func arguments are *args, **kwargs.
+
+  `make_defun_op` converts a function that constructs a TensorFlow graph into
+  a function object and attaches it to the graph.  The resulting function
+  object can be queried for its properties, and called directly with different
+  inputs to execute.
+
+  More details on use cases and limitations are available in the
+  documentation for `defun`.
+
+  Example:
+  ```python
+  def f(x, y):
+    return tf.reduce_mean(tf.multiply(x ** 2, 3) + y)
+
+  def g(x, y):
+    return tf.reduce_mean(tf.multiply(x ** 2, 3) + y)
+
+  z = tf.constant([[0.0, 0.0]])
+  g_op = make_defun_op(g, z, z)
+
+  assert g_op.output_shapes == tf.TensorShape([])
+  assert g_op.output_types == tf.float32
+
+  x = tf.constant([[2.0, 3.0]])
+  y = tf.constant([[3.0, -2.0]])
+
+  # The plain function and defun-compiled function should return the same value.
+  assert f(x, y).numpy() == g_op(x, y).numpy()
+  ```
+
+  Args:
+    func: function to be compiled.
+    *args: List arguments to pass to `func` when attaching to the graph.
+    **kwds: Keyword arguments to pass to `func` when attaching to the graph.
+
+  Returns:
+     A wrapper object which can be queried for its output properties,
+     and which can be called directly the way a `@defun` wrapped function
+     can.
+
+  Raises:
+    ValueError: if any of the keyword arguments to `func` are `EagerTensor`
+      objects (not yet supported).
+  """
+  name = func.__name__
+  if any(isinstance(x, ops.EagerTensor) for x in kwds.values()):
+    raise ValueError("Tensor keyword arguments are not supported.")
+  return _defun_internal(name, func, args, kwds)
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index c55f2f1d5957cabfaf3bae617d88dca55f7b8e4b..e3ea35a64009c702252cf717d89454852a26369a 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import function as tf_function
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import math_ops
@@ -68,6 +69,65 @@ class FunctionTest(test.TestCase):
 
     self.assertAllEqual(step(), 2.0)
 
+  def testBasicDefunOpGraphMode(self):
+    matmul = function.defun(math_ops.matmul)
+
+    def sq(a):
+      return matmul(a, a)
+
+    t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
+
+    sq_op = function.make_defun_op(sq, t)
+
+    self.assertEqual(sq_op.output_shapes, tensor_shape.TensorShape([2, 2]))
+    out = sq_op(t)
+    self.assertAllEqual(out, math_ops.matmul(t, t).numpy())
+
+  def testNestedOutputDefunOpGraphMode(self):
+    matmul = function.defun(math_ops.matmul)
+
+    def sq(a):
+      return (matmul(a, a), {'b': constant_op.constant(1.0)})
+
+    t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
+
+    sq_op = function.make_defun_op(sq, t)
+
+    self.assertEqual(sq_op.output_shapes,
+                     (tensor_shape.TensorShape([2, 2]),
+                      {'b': tensor_shape.TensorShape([])}))
+    self.assertEqual(sq_op.output_dtypes,
+                     (dtypes.float32, {'b': dtypes.float32}))
+    (a, b) = sq_op(t)
+    self.assertAllEqual(a, math_ops.matmul(t, t).numpy())
+    self.assertAllEqual(b['b'].numpy(), 1.0)
+
+  def testDefunOpGraphModeWithGradients(self):
+    v = resource_variable_ops.ResourceVariable(1.0, name='v')
+
+    def step():
+      def inner():
+        return v * v
+
+      return backprop.implicit_grad(inner)()[0][0]
+
+    step_op = function.make_defun_op(step)
+
+    self.assertEqual(step_op.output_dtypes, dtypes.float32)
+    self.assertEqual(step_op.output_shapes, tensor_shape.TensorShape(None))
+    self.assertAllEqual(step_op(), 2.0)
+
+  def testDefunOpGraphModeNoneOutput(self):
+    def fn(unused_a, unused_b):
+      return None
+
+    x = constant_op.constant(1)
+    fn_op = function.make_defun_op(fn, x, x)
+
+    self.assertEqual(fn_op.output_dtypes, None)
+    self.assertEqual(fn_op.output_shapes, None)
+    self.assertAllEqual(fn_op(x, x), None)
+
   def testDefunReadVariable(self):
     v = resource_variable_ops.ResourceVariable(1.0)
 
@@ -310,6 +370,38 @@ class FunctionTest(test.TestCase):
 
     self.assertAllEqual(3, add_one(constant_op.constant(2)))
 
+  def testVariableCaptureInNestedFunctions(self):
+    v = resource_variable_ops.ResourceVariable(1)
+
+    @function.defun
+    def read():
+      return v.read_value()
+
+    @function.defun
+    def outer():
+      return read()
+
+    self.assertEqual(1, int(outer()))
+
+  def testReturnCapturedEagerTensor(self):
+    t = constant_op.constant(1)
+
+    @function.defun
+    def read():
+      return t
+
+    self.assertEqual(1, int(read()))
+
+  def testReturnCapturedGraphTensor(self):
+    with context.graph_mode(), self.test_session():
+      t = constant_op.constant(1)
+
+      @function.defun
+      def read():
+        return t
+
+      self.assertEqual(1, int(self.evaluate(read())))
+
   def testSequenceInputs(self):
     clip_by_global_norm = function.defun(clip_ops.clip_by_global_norm)
     t_list = [constant_op.constant(1.0), constant_op.constant(2.0)]
diff --git a/tensorflow/python/eager/gen_op.bzl b/tensorflow/python/eager/gen_op.bzl
index 1c99d342befaf04112ac83aeecce2b122eb361c5..8bc1d6c10a60b89a026cb34dbf6fd98d29e909c2 100644
--- a/tensorflow/python/eager/gen_op.bzl
+++ b/tensorflow/python/eager/gen_op.bzl
@@ -10,7 +10,9 @@ def tfe_gen_op_wrapper_py(name,
                           out=None,
                           visibility=None,
                           deps=[],
-                          generated_target_name=None):
+                          generated_target_name=None,
+                          # ApiDefs will be loaded in the order specified in this list.
+                          api_def_srcs=[]):
   """Generate an eager-mode Python op wrapper for an op library."""
   # Construct a cc_binary containing the specified ops.
   tool_name = "gen_" + name + "_py_wrappers_cc"
@@ -30,11 +32,25 @@ def tfe_gen_op_wrapper_py(name,
   if not out:
     out = "gen_" + name + ".py"
 
+  if not api_def_srcs:
+    api_def_args_str = ","
+  else:
+    api_def_args = []
+    for api_def_src in api_def_srcs:
+      # Add directory of the first ApiDef source to args.
+      # We are assuming all ApiDefs in a single api_def_src are in the
+      # same directory.
+      api_def_args.append(
+          "$$(dirname $$(echo $(locations " + api_def_src +
+          ") | cut -d\" \" -f1))")
+    api_def_args_str = ",".join(api_def_args)
+
   native.genrule(
       name=name + "_pygenrule",
       outs=[out],
+      srcs=api_def_srcs,
       tools=[tool_name] + tf_binary_additional_srcs(),
-      cmd=("$(location " + tool_name + ")  > $@"))
+      cmd=("$(location " + tool_name + ") " + api_def_args_str + " > $@"))
 
   # Make a py_library out of the generated python file.
   if not generated_target_name:
diff --git a/tensorflow/python/eager/graph_callable.py b/tensorflow/python/eager/graph_callable.py
index 837a75c808f94d4561a0eb68c8e77700d0e413da..5c13ea89081a7d060c0ed1201f0169b739a204c2 100644
--- a/tensorflow/python/eager/graph_callable.py
+++ b/tensorflow/python/eager/graph_callable.py
@@ -296,6 +296,7 @@ def _graph_callable_internal(func, shape_and_dtypes):
       # Call the function again, now replacing usages of variables with
       # placeholders. This assumes the variable capturing scope created above
       # knows about all variables.
+      tmp_graph.clear_resource_control_flow_state()
       with variable_captures.capturing_scope(), function.capture_tensors(
           captures):
         captured_outputs = func(*func_inputs)
@@ -317,46 +318,33 @@ def _graph_callable_internal(func, shape_and_dtypes):
   placeholder_inputs = flat_inputs+ list(extra_placeholders)
 
   func_def_outputs = [x for x in outputs_list if isinstance(x, tf_ops.Tensor)]
-  initializer_function_def = function.make_function_def(
-      tmp_graph,
-      initializing_operations,
-      placeholder_inputs,
-      func_def_outputs)
+  initialization_name = function._inference_name(func.__name__)  # pylint: disable=protected-access
   # TODO(ashankar): Oh lord, forgive me for this lint travesty.
   # Also, what about the gradient registry of these functions? Those need to be
   # addressed as well.
   for f in tmp_graph._functions.values():  # pylint: disable=protected-access
-    function._register_with_name(f.name, f.definition)  # pylint: disable=protected-access
-  function._register_with_name(function._inference_name(func.__name__),  # pylint: disable=protected-access
-                               initializer_function_def)
+    function._register(f._c_func)  # pylint: disable=protected-access
   initializer_function = function.GraphModeFunction(
+      initialization_name,
       placeholder_inputs,
       extra_inputs,
-      initializer_function_def,
       tmp_graph,
       initializing_operations,
+      func_def_outputs,
       func_outputs,
-      function._map_sequence_obj_to_idx(func_def_outputs),  # pylint: disable=protected-access
       output_shapes)
 
   capture_func_def_outputs = [
       x for x in captured_outlist if isinstance(x, tf_ops.Tensor)]
-  captured_function_def = function.make_function_def(
-      tmp_graph,
-      capturing_operations,
-      placeholder_inputs,
-      capture_func_def_outputs)
-  function._register_with_name(function._inference_name(func.__name__),  # pylint: disable=protected-access
-                               captured_function_def)
-
+  captured_function_name = function._inference_name(func.__name__)  # pylint: disable=protected-access
   captured_function = function.GraphModeFunction(
+      captured_function_name,
       placeholder_inputs,
       extra_inputs,
-      captured_function_def,
       tmp_graph,
       capturing_operations,
+      capture_func_def_outputs,
       captured_outputs,
-      function._map_sequence_obj_to_idx(capture_func_def_outputs),  # pylint: disable=protected-access
       output_shapes,
       variables=[x.variable for x in sorted_variables])
 
diff --git a/tensorflow/python/eager/graph_callable_test.py b/tensorflow/python/eager/graph_callable_test.py
index 548e16a909f8fe846ea6d5a7a33c4247c5d90054..b9e6ca2a93ac6ff02b741051234dbdd8a55bf12b 100644
--- a/tensorflow/python/eager/graph_callable_test.py
+++ b/tensorflow/python/eager/graph_callable_test.py
@@ -152,7 +152,6 @@ class GraphCallableTest(test.TestCase):
     self.assertAllEqual(5, f(constant_op.constant(2)))
 
   def testNestedFunction(self):
-
     # TensorFlow function (which is what would be used in TensorFlow graph
     # construction).
     @function.Defun(dtypes.int32, dtypes.int32)
diff --git a/tensorflow/python/eager/ops_test.py b/tensorflow/python/eager/ops_test.py
index 70e23b9311792fd7e5243bbc9fd6e4009f1493a9..48dcb4830ccf4eda649c939c81f88a10750b23da 100644
--- a/tensorflow/python/eager/ops_test.py
+++ b/tensorflow/python/eager/ops_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import sparse_ops
 
 
@@ -322,6 +323,13 @@ class OpsTest(test_util.TensorFlowTestCase):
   def testIdentity(self):
     self.assertAllEqual(2, array_ops.identity(2))
 
+  def testIdentityOnVariable(self):
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found')
+    with context.device('/gpu:0'):
+      v = resource_variable_ops.ResourceVariable(True)
+    self.assertAllEqual(True, array_ops.identity(v))
+
   def testIncompatibleSetShape(self):
     x = constant_op.constant(1)
     with self.assertRaises(ValueError):
diff --git a/tensorflow/python/eager/python_eager_op_gen.cc b/tensorflow/python/eager/python_eager_op_gen.cc
index 956fbdac50d05fbd23ab93ec97145645805ac5e7..90a8779ff845b2fd63d1ba1019e8601fef257e42 100644
--- a/tensorflow/python/eager/python_eager_op_gen.cc
+++ b/tensorflow/python/eager/python_eager_op_gen.cc
@@ -99,6 +99,15 @@ string TensorPBString(const TensorProto& pb) {
   return strings::StrCat("\"\"\"", ProtoShortDebugString(pb), "\"\"\"");
 }
 
+const ApiDef::Arg* FindInputArg(StringPiece name, const ApiDef& api_def) {
+  for (int i = 0; i < api_def.in_arg_size(); ++i) {
+    if (api_def.in_arg(i).name() == name) {
+      return &api_def.in_arg(i);
+    }
+  }
+  return nullptr;
+}
+
 class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp {
  public:
   GenEagerPythonOp(const OpDef& op_def, const ApiDef& api_def,
@@ -164,14 +173,14 @@ string GenEagerPythonOp::FlattenInputs(
       } else if (inputs_state == WAS_LIST_INPUT) {
         strings::StrAppend(&inputs, " + ");
       }
-      strings::StrAppend(&inputs, "list(", param_names_[i], ")");
+      strings::StrAppend(&inputs, "list(", param_names_[i].GetRenameTo(), ")");
       inputs_state = WAS_LIST_INPUT;
       if (output_sizes != nullptr) {
         if (!arg.number_attr().empty()) {
           output_sizes->emplace_back(AttrVarName(arg.number_attr(), nullptr));
         } else {
           output_sizes->emplace_back(
-              strings::StrCat("len(", param_names_[i], ")"));
+              strings::StrCat("len(", param_names_[i].GetRenameTo(), ")"));
         }
       }
     } else {
@@ -182,7 +191,7 @@ string GenEagerPythonOp::FlattenInputs(
       } else {
         strings::StrAppend(&inputs, "[");
       }
-      strings::StrAppend(&inputs, param_names_[i]);
+      strings::StrAppend(&inputs, param_names_[i].GetRenameTo());
       inputs_state = WAS_SOLO_INPUT;
       if (output_sizes != nullptr) output_sizes->emplace_back();
     }
@@ -195,15 +204,21 @@ string GenEagerPythonOp::FlattenInputs(
 }
 
 string GenEagerPythonOp::Code() {
+  if (api_def_.visibility() == ApiDef::SKIP) {
+    return "";
+  }
   // This has all the input args followed by those attrs that don't have
   // defaults.
-  std::vector<string> args_no_default;
+  std::vector<python_op_gen_internal::ParamNames> params_no_default;
   // The parameters with defaults (these have to be listed after those without).
   // No input args are included, just attrs.
-  std::vector<std::pair<string, string>> args_with_defaults;
-  for (int i = 0; i < op_def_.input_arg_size(); ++i) {
-    const auto& arg(op_def_.input_arg(i));
-    args_no_default.push_back(arg.name());
+  std::vector<std::pair<python_op_gen_internal::ParamNames, string>>
+      params_with_default;
+
+  for (int i = 0; i < api_def_.arg_order_size(); ++i) {
+    const auto& arg = *FindInputArg(api_def_.arg_order(i), op_def_);
+    const auto& api_def_arg = *FindInputArg(api_def_.arg_order(i), api_def_);
+    params_no_default.emplace_back(api_def_arg.name(), api_def_arg.rename_to());
     if (!arg.type_attr().empty()) {
       AddAttrForArg(arg.type_attr(), i);
     } else if (!arg.type_list_attr().empty()) {
@@ -215,31 +230,39 @@ string GenEagerPythonOp::Code() {
   }
   for (int i = 0; i < op_def_.attr_size(); ++i) {
     const auto& attr(op_def_.attr(i));
+    const auto& api_def_attr(api_def_.attr(i));
     // Do not add inferred attrs to the Python function signature.
     if (inferred_attrs_.find(attr.name()) == inferred_attrs_.end()) {
-      if (attr.has_default_value()) {
+      if (api_def_attr.has_default_value()) {
         if (attr.type() == "tensor") {
-          args_with_defaults.emplace_back(
-              attr.name(),
-              strings::StrCat("_execute.make_tensor(",
-                              TensorPBString(attr.default_value().tensor()),
-                              ", \"", attr.name(), "\")"));
+          params_with_default.emplace_back(
+              python_op_gen_internal::ParamNames(api_def_attr.name(),
+                                                 api_def_attr.rename_to()),
+              strings::StrCat(
+                  "_execute.make_tensor(",
+                  TensorPBString(api_def_attr.default_value().tensor()), ", \"",
+                  api_def_attr.rename_to(), "\")"));
         } else if (attr.type() == "list(tensor)") {
           std::vector<string> pbtxt;
-          for (const auto& pb : attr.default_value().list().tensor()) {
+          for (const auto& pb : api_def_attr.default_value().list().tensor()) {
             pbtxt.emplace_back(TensorPBString(pb));
           }
-          args_with_defaults.emplace_back(
-              attr.name(),
-              strings::StrCat("[_execute.make_tensor(_pb, \"", attr.name(),
-                              "\") for _pb in ", VectorToTuple(pbtxt), "]"));
+          params_with_default.emplace_back(
+              python_op_gen_internal::ParamNames(api_def_attr.name(),
+                                                 api_def_attr.rename_to()),
+              strings::StrCat("[_execute.make_tensor(_pb, \"",
+                              api_def_attr.rename_to(), "\") for _pb in ",
+                              VectorToTuple(pbtxt), "]"));
         } else {
-          args_with_defaults.emplace_back(
-              attr.name(), python_op_gen_internal::AttrValueToPython(
-                               attr.type(), attr.default_value(), "_dtypes."));
+          params_with_default.emplace_back(
+              python_op_gen_internal::ParamNames(api_def_attr.name(),
+                                                 api_def_attr.rename_to()),
+              python_op_gen_internal::AttrValueToPython(
+                  attr.type(), api_def_attr.default_value(), "_dtypes."));
         }
       } else {
-        args_no_default.push_back(attr.name());
+        params_no_default.emplace_back(api_def_attr.name(),
+                                       api_def_attr.rename_to());
       }
     }
   }
@@ -247,34 +270,37 @@ string GenEagerPythonOp::Code() {
   // Save the list of attr parameters (attrs that won't be inferred),
   // those with defaults go at the end.
   // Get the attrs in the order we want by taking the attrs without defaults
-  // from the end of args_no_default, and adding args_no_default.
-  attrs_.reserve(args_no_default.size() - op_def_.input_arg_size() +
-                 args_with_defaults.size());
-  attrs_.insert(attrs_.end(),
-                args_no_default.begin() + op_def_.input_arg_size(),
-                args_no_default.end());
-  for (const auto& a : args_with_defaults) {
-    attrs_.push_back(a.first);
+  // from the end of params_no_default, and adding params_no_default.
+  attrs_.reserve(params_no_default.size() - op_def_.input_arg_size() +
+                 params_with_default.size());
+  for (int i = op_def_.input_arg_size(); i < params_no_default.size(); ++i) {
+    attrs_.push_back(params_no_default[i].GetName());
+  }
+  for (const auto& p : params_with_default) {
+    attrs_.push_back(p.first.GetName());
+  }
+
+  param_names_.reserve(params_no_default.size() + params_with_default.size());
+  param_names_.insert(param_names_.begin(), params_no_default.begin(),
+                      params_no_default.end());
+  for (const auto& param_and_default : params_with_default) {
+    param_names_.push_back(param_and_default.first);
   }
 
-  param_names_.reserve(args_no_default.size() + args_with_defaults.size());
   string parameters;
-  for (const string& name : args_no_default) {
+  for (const auto& param : params_no_default) {
     if (!parameters.empty()) strings::StrAppend(&parameters, ", ");
-    const string param = python_op_gen_internal::AvoidPythonReserved(name);
-    strings::StrAppend(&parameters, param);
-    param_names_.push_back(param);
+    strings::StrAppend(&parameters, param.GetRenameTo());
   }
-  for (const auto& name_default : args_with_defaults) {
+  for (const auto& param_and_default : params_with_default) {
     if (!parameters.empty()) strings::StrAppend(&parameters, ", ");
-    const string param =
-        python_op_gen_internal::AvoidPythonReserved(name_default.first);
-    strings::StrAppend(&parameters, param, "=", name_default.second);
-    param_names_.push_back(param);
+    strings::StrAppend(&parameters, param_and_default.first.GetRenameTo(), "=",
+                       param_and_default.second);
   }
   if (!parameters.empty()) strings::StrAppend(&parameters, ", ");
   strings::StrAppend(&parameters, "name=None");
 
+  AddExport();
   AddDefLine(parameters);
   AddDocStringDescription();
   AddDocStringArgs();
@@ -297,25 +323,26 @@ string GenEagerPythonOp::Code() {
         // inputs are lists and have the same length.
         for (auto iter = arg_list->second.begin();
              iter != arg_list->second.end(); ++iter) {
-          const string& arg_name = param_names_[*iter];
-          ExpectListArg(arg_name);
+          const string& arg_api_name = param_names_[*iter].GetRenameTo();
+          ExpectListArg(arg_api_name);
           if (iter == arg_list->second.begin()) {
-            AddInferredAttr(attr.name(), strings::StrCat("len(", arg_name, ")"),
+            AddInferredAttr(attr.name(),
+                            strings::StrCat("len(", arg_api_name, ")"),
                             &result_, &attr_expressions_);
           } else {
             const auto& attr_var = attr_expressions_[attr.name()];
-            strings::StrAppend(&result_, "  if len(", arg_name,
+            strings::StrAppend(&result_, "  if len(", arg_api_name,
                                ") != ", attr_var,
                                ":\n"
                                "    raise ValueError(\n"
                                "        \"List argument '",
-                               arg_name, "' to '", op_name_,
+                               arg_api_name, "' to '", op_name_,
                                "' Op with length %d \"\n"
                                "        \"must match length %d of argument '",
                                inferred_attrs_[attr.name()],
                                "'.\" %\n"
                                "        (len(",
-                               arg_name, "), ", attr_var, "))\n");
+                               arg_api_name, "), ", attr_var, "))\n");
           }
         }
       }
@@ -325,65 +352,76 @@ string GenEagerPythonOp::Code() {
   // Values for non-inferred attrs.
   for (int i = 0; i < attrs_.size(); ++i) {
     const string& attr_name = attrs_[i];
-    const string& param = param_names_[i + op_def_.input_arg_size()];
+    const auto& param = param_names_[i + op_def_.input_arg_size()];
     const auto& attr = *FindAttr(attr_name, op_def_);
+    const string& attr_api_name = param.GetRenameTo();
     StringPiece attr_type = attr.type();
-    attr_expressions_[attr_name] = param;
-    const int default_index = i - (attrs_.size() - args_with_defaults.size());
+    attr_expressions_[attr_name] = attr_api_name;
+    const int default_index = i - (attrs_.size() - params_with_default.size());
     if (default_index >= 0) {
-      const string& default_value = args_with_defaults[default_index].second;
-      strings::StrAppend(&result_, "  if ", param, " is None:\n");
-      strings::StrAppend(&result_, "    ", param, " = ", default_value, "\n");
+      const string& default_value = params_with_default[default_index].second;
+      strings::StrAppend(&result_, "  if ", attr_api_name, " is None:\n");
+      strings::StrAppend(&result_, "    ", attr_api_name, " = ", default_value,
+                         "\n");
     }
     if (attr_type.starts_with("list(")) {
-      ExpectListArg(param);
+      ExpectListArg(attr_api_name);
     }
 
     if (attr_type == "string") {
-      strings::StrAppend(&result_, "  ", param, " = _execute.make_str(", param,
-                         ", \"", param, "\")\n");
+      strings::StrAppend(&result_, "  ", attr_api_name, " = _execute.make_str(",
+                         attr_api_name, ", \"", attr_api_name, "\")\n");
     } else if (attr_type == "list(string)") {
-      strings::StrAppend(&result_, "  ", param, " = [_execute.make_str(_s, \"",
-                         param, "\") for _s in ", param, "]\n");
+      strings::StrAppend(&result_, "  ", attr_api_name,
+                         " = [_execute.make_str(_s, \"", attr_api_name,
+                         "\") for _s in ", attr_api_name, "]\n");
     } else if (attr_type == "int") {
-      strings::StrAppend(&result_, "  ", param, " = _execute.make_int(", param,
-                         ", \"", param, "\")\n");
+      strings::StrAppend(&result_, "  ", attr_api_name, " = _execute.make_int(",
+                         attr_api_name, ", \"", attr_api_name, "\")\n");
     } else if (attr_type == "list(int)") {
-      strings::StrAppend(&result_, "  ", param, " = [_execute.make_int(_i, \"",
-                         param, "\") for _i in ", param, "]\n");
+      strings::StrAppend(&result_, "  ", attr_api_name,
+                         " = [_execute.make_int(_i, \"", attr_api_name,
+                         "\") for _i in ", attr_api_name, "]\n");
     } else if (attr_type == "float") {
-      strings::StrAppend(&result_, "  ", param, " = _execute.make_float(",
-                         param, ", \"", param, "\")\n");
+      strings::StrAppend(&result_, "  ", attr_api_name,
+                         " = _execute.make_float(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
     } else if (attr_type == "list(float)") {
-      strings::StrAppend(&result_, "  ", param,
-                         " = [_execute.make_float(_f, \"", param,
-                         "\") for _f in ", param, "]\n");
+      strings::StrAppend(&result_, "  ", attr_api_name,
+                         " = [_execute.make_float(_f, \"", attr_api_name,
+                         "\") for _f in ", attr_api_name, "]\n");
     } else if (attr_type == "bool") {
-      strings::StrAppend(&result_, "  ", param, " = _execute.make_bool(", param,
-                         ", \"", param, "\")\n");
+      strings::StrAppend(&result_, "  ", attr_api_name,
+                         " = _execute.make_bool(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
     } else if (attr_type == "list(bool)") {
-      strings::StrAppend(&result_, "  ", param, " = [_execute.make_bool(_b, \"",
-                         param, "\") for _b in ", param, "]\n");
+      strings::StrAppend(&result_, "  ", attr_api_name,
+                         " = [_execute.make_bool(_b, \"", attr_api_name,
+                         "\") for _b in ", attr_api_name, "]\n");
     } else if (attr_type == "type") {
-      strings::StrAppend(&result_, "  ", param, " = _execute.make_type(", param,
-                         ", \"", param, "\")\n");
+      strings::StrAppend(&result_, "  ", attr_api_name,
+                         " = _execute.make_type(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
     } else if (attr_type == "list(type)") {
-      strings::StrAppend(&result_, "  ", param, " = [_execute.make_type(_t, \"",
-                         param, "\") for _t in ", param, "]\n");
+      strings::StrAppend(&result_, "  ", attr_api_name,
+                         " = [_execute.make_type(_t, \"", attr_api_name,
+                         "\") for _t in ", attr_api_name, "]\n");
     } else if (attr_type == "shape") {
-      strings::StrAppend(&result_, "  ", param, " = _execute.make_shape(",
-                         param, ", \"", param, "\")\n");
+      strings::StrAppend(&result_, "  ", attr_api_name,
+                         " = _execute.make_shape(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
     } else if (attr_type == "list(shape)") {
-      strings::StrAppend(&result_, "  ", param,
-                         " = [_execute.make_shape(_s, \"", param,
-                         "\") for _s in ", param, "]\n");
+      strings::StrAppend(&result_, "  ", attr_api_name,
+                         " = [_execute.make_shape(_s, \"", attr_api_name,
+                         "\") for _s in ", attr_api_name, "]\n");
     } else if (attr_type == "tensor") {
-      strings::StrAppend(&result_, "  ", param, " = _execute.make_tensor(",
-                         param, ", \"", param, "\")\n");
+      strings::StrAppend(&result_, "  ", attr_api_name,
+                         " = _execute.make_tensor(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
     } else if (attr_type == "list(tensor)") {
-      strings::StrAppend(&result_, "  ", param,
-                         " = [_execute.make_tensor(_t, \"", param,
-                         "\") for _t in ", param, "]\n");
+      strings::StrAppend(&result_, "  ", attr_api_name,
+                         " = [_execute.make_tensor(_t, \"", attr_api_name,
+                         "\") for _t in ", attr_api_name, "]\n");
     } else if (attr_type != "func") {
       return strings::StrCat("# No definition for ", function_name_,
                              " since we don't support attrs with type\n"
@@ -484,16 +522,20 @@ string GenEagerPythonOp::Code() {
 
   bool eager_allowed = true;
   string ref_arg;
-  for (const auto& arg : op_def_.input_arg()) {
+  for (int i = 0; i < op_def_.input_arg_size(); ++i) {
+    const auto& arg = op_def_.input_arg(i);
     if (arg.is_ref()) {
       eager_allowed = false;
-      ref_arg = arg.name();
+      DCHECK_EQ(op_def_.input_arg(i).name(), api_def_.in_arg(i).name());
+      ref_arg = api_def_.in_arg(i).rename_to();
     }
   }
-  for (const auto& arg : op_def_.output_arg()) {
+  for (int i = 0; i < op_def_.output_arg_size(); ++i) {
+    const auto& arg = op_def_.output_arg(i);
     if (arg.is_ref()) {
       eager_allowed = false;
-      ref_arg = arg.name();
+      DCHECK_EQ(op_def_.output_arg(i).name(), api_def_.out_arg(i).name());
+      ref_arg = api_def_.out_arg(i).rename_to();
     }
   }
 
@@ -553,6 +595,7 @@ void GenEagerPythonOp::AddEagerInferredAttrs() {
   // Figure out values for inferred attrs, and cast to eager tensors.
   for (int i = 0; i < op_def_.attr_size(); ++i) {
     const auto& attr(op_def_.attr(i));
+    const auto& api_def_attr(api_def_.attr(i));
     auto arg_list = attr_to_args_.find(attr.name());
     if (arg_list != attr_to_args_.end()) {
       if (attr.type() == "type") {
@@ -565,14 +608,15 @@ void GenEagerPythonOp::AddEagerInferredAttrs() {
           strings::StrAppend(
               &conversion, ", ",
               python_op_gen_internal::AttrValueToPython(
-                  attr.type(), attr.default_value(), "_dtypes."));
+                  attr.type(), api_def_attr.default_value(), "_dtypes."));
         }
         strings::StrAppend(&conversion, ")");
         const string var_name = AttrVarName(attr.name(), &attr_expressions_);
         if (output_sizes.size() == 1) {
           // Avoid creating a temporary variable in the case where
           // we can easily assign to the right value directly.
-          const string inputs_var = param_names_[arg_list->second.front()];
+          const string inputs_var =
+              param_names_[arg_list->second.front()].GetRenameTo();
           if (output_sizes.front().empty()) {
             strings::StrAppend(&result_, "    ", var_name, ", (", inputs_var,
                                ",) = ", conversion, "\n");
@@ -589,7 +633,7 @@ void GenEagerPythonOp::AddEagerInferredAttrs() {
           Unflatten("    ", output_sizes, inputs_var, &result_);
           std::vector<string> p;
           for (int j : arg_list->second) {
-            p.emplace_back(param_names_[j]);
+            p.emplace_back(param_names_[j].GetRenameTo());
           }
           strings::StrAppend(&result_, "    ", VectorToTuple(p), " = ",
                              inputs_var, "\n");
@@ -608,14 +652,14 @@ void GenEagerPythonOp::AddEagerInferredAttrs() {
           std::vector<string> lists;
           for (auto iter = arg_list->second.begin();
                iter != arg_list->second.end(); ++iter) {
-            lists.push_back(param_names_[*iter]);
+            lists.push_back(param_names_[*iter].GetRenameTo());
           }
           inputs_var = VectorToTuple(lists);
           conversion = "_execute.args_to_mixed_eager_tensors";
         } else {
           // For one list(tensor) argument, we just convert every
           // element of the list to an eager tensor.
-          inputs_var = param_names_[arg_list->second.front()];
+          inputs_var = param_names_[arg_list->second.front()].GetRenameTo();
           conversion = "_execute.convert_to_mixed_eager_tensors";
         }
         strings::StrAppend(&result_, "    ", var_name, ", ", inputs_var, " = ",
@@ -630,7 +674,7 @@ void GenEagerPythonOp::AddEagerInputCasts() {
   for (int i = 0; i < op_def_.input_arg_size(); ++i) {
     const auto& arg(op_def_.input_arg(i));
     if (!arg.type_attr().empty() || !arg.type_list_attr().empty()) continue;
-    const string& param = param_names_[i];
+    const string& param = param_names_[i].GetRenameTo();
     const string fn = arg.number_attr().empty() ? "" : "n_";
     const string dtype =
         python_op_gen_internal::DataTypeToPython(arg.type(), "_dtypes.");
diff --git a/tensorflow/python/eager/python_eager_op_gen_main.cc b/tensorflow/python/eager/python_eager_op_gen_main.cc
index cd74c438ec6f5cd7f807a7205f76eff7421aeb74..05351bd8b115ae07482b82166974e86758bc7712 100644
--- a/tensorflow/python/eager/python_eager_op_gen_main.cc
+++ b/tensorflow/python/eager/python_eager_op_gen_main.cc
@@ -21,34 +21,32 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/op_gen_lib.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
 
 namespace tensorflow {
 namespace {
 
-constexpr char kBaseApiDef[] =
-    "tensorflow/core/api_def/base_api/*.pbtxt";
-constexpr char kPythonApiDef[] =
-    "tensorflow/core/api_def/python_api/*.pbtxt";
-constexpr bool kUseApiDef = false;
-
-void PrintAllPythonOps(const std::vector<string>& hidden_ops) {
+void PrintAllPythonOps(const std::vector<string>& hidden_ops,
+                       const std::vector<string>& api_def_dirs) {
   OpList ops;
   OpRegistry::Global()->Export(false, &ops);
 
   ApiDefMap api_def_map(ops);
-  if (kUseApiDef) {
+  if (!api_def_dirs.empty()) {
     Env* env = Env::Default();
 
-    std::vector<string> base_api_files;
-    std::vector<string> python_api_files;
-    TF_CHECK_OK(env->GetMatchingPaths(kBaseApiDef, &base_api_files));
-    TF_CHECK_OK(env->GetMatchingPaths(kPythonApiDef, &python_api_files));
-
-    TF_CHECK_OK(api_def_map.LoadFileList(env, base_api_files));
-    TF_CHECK_OK(api_def_map.LoadFileList(env, python_api_files));
+    for (const auto& api_def_dir : api_def_dirs) {
+      std::vector<string> api_files;
+      TF_CHECK_OK(env->GetMatchingPaths(io::JoinPath(api_def_dir, "*.pbtxt"),
+                                        &api_files));
+      TF_CHECK_OK(api_def_map.LoadFileList(env, api_files));
+    }
+    api_def_map.UpdateDocs();
   }
+
   PrintEagerPythonOps(ops, api_def_map, hidden_ops, true /* require_shapes */);
 }
 
@@ -58,8 +56,15 @@ void PrintAllPythonOps(const std::vector<string>& hidden_ops) {
 int main(int argc, char* argv[]) {
   tensorflow::port::InitMain(argv[0], &argc, &argv);
 
+  // Usage:
+  //   python_eager_op_gen_main api_def_dir1,api_def_dir2,...
   if (argc == 1) {
-    tensorflow::PrintAllPythonOps({});
+    tensorflow::PrintAllPythonOps({}, {});
+  } else if (argc == 2) {
+    const std::vector<tensorflow::string> api_def_dirs =
+        tensorflow::str_util::Split(argv[1], ",",
+                                    tensorflow::str_util::SkipEmpty());
+    tensorflow::PrintAllPythonOps({}, api_def_dirs);
   } else {
     return -1;
   }
diff --git a/tensorflow/python/eager/pywrap_tfe.h b/tensorflow/python/eager/pywrap_tfe.h
index f96245f7a5316919a36e751aab6d0986144d99e9..a33b17ada6f94e43ac16696c502be4b885e9d33a 100644
--- a/tensorflow/python/eager/pywrap_tfe.h
+++ b/tensorflow/python/eager/pywrap_tfe.h
@@ -88,7 +88,8 @@ TFE_TensorHandle* EagerTensor_Handle(const PyObject* o);
 PyObject* TFE_Py_InitEagerTensor(PyObject* base_class);
 
 // Pushes a new tape into the thread-local stack.
-void TFE_Py_TapeStackPushNew();
+// `persistent` must be a PyBool_Type, i.e either Py_True or Py_False
+void TFE_Py_TapeStackPushNew(PyObject* persistent);
 
 // Pops the tape from the top of the stack and returns it.
 PyObject* TFE_Py_TapeStackPop();
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 387eec1358418a3ad532b93da0b4ddbd45256ad0..3ba81fb3d04422b1929b08e057a7858e18a7ca17 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -469,11 +469,22 @@ static tensorflow::int64 FastTensorId(PyObject* tensor) {
 class GradientTape
     : public tensorflow::eager::GradientTape<PyObject, PyObject> {
  public:
-  GradientTape() {}
+  explicit GradientTape(bool persistent)
+      : tensorflow::eager::GradientTape<PyObject, PyObject>(persistent) {}
+
+  virtual ~GradientTape() {
+    for (PyObject* v : watched_variables_) {
+      Py_DECREF(v);
+    }
+  }
 
   void WatchVariable(PyObject* v) {
-    watched_variables_.insert(v);
-    Py_INCREF(v);
+    auto insert_result = watched_variables_.insert(v);
+    if (insert_result.second) {
+      // Only increment the reference count if we aren't already watching this
+      // variable.
+      Py_INCREF(v);
+    }
     PyObject* handle = PyObject_GetAttrString(v, "handle");
     if (handle == nullptr) {
       return;
@@ -530,12 +541,9 @@ static PyTypeObject TFE_Py_Tape_Type = {
 // xcode 7 doesn't define thread_local, so for compatibility we implement our
 // own. TODO(apassos) remove once we can deprecate xcode 7.
 #ifndef __APPLE__
-thread_local std::vector<TFE_Py_Tape*>* tape_stack = nullptr;
 std::vector<TFE_Py_Tape*>* GetTapeStack() {
-  if (tape_stack == nullptr) {
-    tape_stack = new std::vector<TFE_Py_Tape*>;
-  }
-  return tape_stack;
+  thread_local std::vector<TFE_Py_Tape*> tape_stack;
+  return &tape_stack;
 }
 #else
 static tensorflow::mutex stack_mu(tensorflow::LINKER_INITIALIZED);
@@ -557,11 +565,11 @@ std::vector<TFE_Py_Tape*>* GetTapeStack() {
 }
 #endif
 
-void TFE_Py_TapeStackPushNew() {
+void TFE_Py_TapeStackPushNew(PyObject* persistent) {
   TFE_Py_Tape_Type.tp_new = PyType_GenericNew;
   if (PyType_Ready(&TFE_Py_Tape_Type) < 0) return;
   TFE_Py_Tape* tape = PyObject_NEW(TFE_Py_Tape, &TFE_Py_Tape_Type);
-  tape->tape = new GradientTape();
+  tape->tape = new GradientTape(persistent == Py_True);
   GetTapeStack()->push_back(tape);
 }
 
@@ -704,6 +712,7 @@ std::vector<tensorflow::int64> MakeTensorIDList(PyObject* tensors) {
     PyObject* tensor = PySequence_Fast_GET_ITEM(seq, i);
     list.push_back(FastTensorId(tensor));
     if (PyErr_Occurred()) {
+      Py_DECREF(seq);
       return list;
     }
   }
@@ -723,7 +732,6 @@ PyObject* TFE_Py_TapeWatchedVariables(PyObject* tape) {
   PyObject* result = PySet_New(nullptr);
   for (PyObject* variable : watched_variables) {
     PySet_Add(result, variable);
-    Py_DECREF(variable);
   }
   return result;
 }
@@ -889,12 +897,7 @@ class PyVSpace : public tensorflow::eager::VSpace<PyObject, PyObject> {
     PyObject* py_result = PyEval_CallObject(
         reinterpret_cast<PyObject*>(backward_function), grads);
     Py_DECREF(grads);
-    Py_DECREF(backward_function);
     if (py_result == nullptr) {
-      VLOG(1) << "Gradient function threw exceptions";
-      if (VLOG_IS_ON(1)) {
-        PyErr_Print();
-      }
       return tensorflow::errors::Internal("gradient function threw exceptions");
     }
     result->clear();
@@ -921,6 +924,10 @@ class PyVSpace : public tensorflow::eager::VSpace<PyObject, PyObject> {
     return tensorflow::Status::OK();
   }
 
+  void ReleaseBackwardFunction(PyObject* backward_function) const final {
+    Py_DECREF(backward_function);
+  }
+
   void DeleteGradient(PyObject* tensor) const final { Py_XDECREF(tensor); }
 
  private:
@@ -981,6 +988,11 @@ PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* vspace,
   status->status = tape_obj->tape->ComputeGradient(
       c_vspace, target_vec, sources_vec, outgrad_vec, &result);
   if (!status->status.ok()) {
+    if (PyErr_Occurred()) {
+      // Do not propagate the erroneous status as that would swallow the
+      // exception which caused the problem.
+      status->status = tensorflow::Status::OK();
+    }
     return nullptr;
   }
   if (!result.empty()) {
diff --git a/tensorflow/python/eager/tape.py b/tensorflow/python/eager/tape.py
index 440c84b7ea97a4672ff20328ca0af3527d51ead2..14b5238f74039ec23bd197699de68c4c0254e8d3 100644
--- a/tensorflow/python/eager/tape.py
+++ b/tensorflow/python/eager/tape.py
@@ -33,9 +33,9 @@ class Tape(object):
     return pywrap_tensorflow.TFE_Py_TapeWatchedVariables(self._tape)
 
 
-def push_new_tape():
+def push_new_tape(persistent=False):
   """Pushes a new tape onto the tape stack."""
-  pywrap_tensorflow.TFE_Py_TapeStackPushNew()
+  pywrap_tensorflow.TFE_Py_TapeStackPushNew(persistent)
 
 
 def watch(tensor):
diff --git a/tensorflow/python/eager/tensor_test.py b/tensorflow/python/eager/tensor_test.py
index 7a4593ec464ab1834a555a131b8b717f5010de62..727f80efb4a5aa5b5d9bee72aed1c56c3649d3bc 100644
--- a/tensorflow/python/eager/tensor_test.py
+++ b/tensorflow/python/eager/tensor_test.py
@@ -106,6 +106,11 @@ class TFETensorTest(test_util.TensorFlowTestCase):
     t = _create_tensor(n)
     self.assertAllEqual([[1, 2], [3, 4]], t)
 
+  def testNumpyArrayDtype(self):
+    tensor = constant_op.constant([1.0, 2.0, 3.0])
+    numpy_tensor = np.asarray(tensor, dtype=np.int32)
+    self.assertAllEqual(numpy_tensor, [1, 2, 3])
+
   def testCopy(self):
     t = constant_op.constant(1.0)
     tt = copy.copy(t)
diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index 03f386e9cf885fb88cbb557a99b9d0abe78b3062..e062e1fbfe64df2c5e6068b6f748e885b9b493a6 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -215,6 +215,7 @@ py_test(
     srcs_version = "PY2AND3",
     tags = [
         "no_pip",
+        "noasan",  # test flakily times out in asan mode.
         "notsan",  # b/67510291
     ],
     deps = [
@@ -433,6 +434,7 @@ py_library(
         "//tensorflow/python:summary",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
+        "//tensorflow/python/data",
         "//tensorflow/python/saved_model:builder",
         "//tensorflow/python/saved_model:tag_constants",
         "//third_party/py/numpy",
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index f267f4a54e541c8942fd6430a802798e430a5a47..63103ef4c123fe5d7e6a3609aa0f8d1d01a8bf94 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -30,6 +30,7 @@ from google.protobuf import message
 from tensorflow.core.framework import summary_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session as tf_session
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import run_config
@@ -416,7 +417,7 @@ class Estimator(object):
     with ops.Graph().as_default() as g:
       random_seed.set_random_seed(self._config.tf_random_seed)
       self._create_and_assert_global_step(g)
-      features = self._get_features_from_input_fn(
+      features, input_hooks = self._get_features_from_input_fn(
           input_fn, model_fn_lib.ModeKeys.PREDICT)
       estimator_spec = self._call_model_fn(
           features, None, model_fn_lib.ModeKeys.PREDICT, self.config)
@@ -426,7 +427,7 @@ class Estimator(object):
               checkpoint_filename_with_path=checkpoint_path,
               scaffold=estimator_spec.scaffold,
               config=self._session_config),
-          hooks=hooks) as mon_sess:
+          hooks=input_hooks + hooks) as mon_sess:
         while not mon_sess.should_stop():
           preds_evaluated = mon_sess.run(predictions)
           if not isinstance(predictions, dict):
@@ -582,6 +583,11 @@ class Estimator(object):
   def _get_features_from_input_fn(self, input_fn, mode):
     """Extracts the `features` from return values of `input_fn`."""
     result = self._call_input_fn(input_fn, mode)
+    input_hooks = []
+    if isinstance(result, dataset_ops.Dataset):
+      iterator = result.make_initializable_iterator()
+      input_hooks.append(_DatasetInitializerHook(iterator))
+      result = iterator.get_next()
     if isinstance(result, (list, tuple)):
       # Unconditionally drop the label (the second element of result).
       result = result[0]
@@ -590,16 +596,22 @@ class Estimator(object):
       logging.warning('Input graph does not use tf.data.Dataset or contain a '
                       'QueueRunner. That means predict yields forever. '
                       'This is probably a mistake.')
-    return result
+    return result, input_hooks
 
   def _get_features_and_labels_from_input_fn(self, input_fn, mode):
+    """Extracts the `features` and labels from return values of `input_fn`."""
     result = self._call_input_fn(input_fn, mode)
+    input_hooks = []
+    if isinstance(result, dataset_ops.Dataset):
+      iterator = result.make_initializable_iterator()
+      input_hooks.append(_DatasetInitializerHook(iterator))
+      result = iterator.get_next()
     if isinstance(result, (list, tuple)):
       if len(result) != 2:
         raise ValueError(
             'input_fn should return (feautures, labels) as a len 2 tuple.')
-      return result
-    return result, None
+      return result[0], result[1], input_hooks
+    return result, None, input_hooks
 
   def _extract_batch_length(self, preds_evaluated):
     """Extracts batch length of predictions."""
@@ -723,8 +735,10 @@ class Estimator(object):
       random_seed.set_random_seed(self._config.tf_random_seed)
       global_step_tensor = self._create_and_assert_global_step(g)
       training_util._get_or_create_global_step_read()  # pylint: disable=protected-access
-      features, labels = self._get_features_and_labels_from_input_fn(
-          input_fn, model_fn_lib.ModeKeys.TRAIN)
+      features, labels, input_hooks = (
+          self._get_features_and_labels_from_input_fn(
+              input_fn, model_fn_lib.ModeKeys.TRAIN))
+      worker_hooks.extend(input_hooks)
       estimator_spec = self._call_model_fn(
           features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
       # Check if the user created a loss summary, and add one if they didn't.
@@ -822,8 +836,9 @@ class Estimator(object):
     with ops.Graph().as_default() as g:
       random_seed.set_random_seed(self._config.tf_random_seed)
       global_step_tensor = self._create_and_assert_global_step(g)
-      features, labels = self._get_features_and_labels_from_input_fn(
-          input_fn, model_fn_lib.ModeKeys.EVAL)
+      features, labels, input_hooks = (
+          self._get_features_and_labels_from_input_fn(
+              input_fn, model_fn_lib.ModeKeys.EVAL))
       estimator_spec = self._call_model_fn(
           features, labels, model_fn_lib.ModeKeys.EVAL, self.config)
 
@@ -844,7 +859,8 @@ class Estimator(object):
             'already defines a default metric with the same name.')
       eval_dict[ops.GraphKeys.GLOBAL_STEP] = global_step_tensor
 
-      all_hooks = list(hooks or [])
+      all_hooks = list(input_hooks)
+      all_hooks.extend(hooks)
       all_hooks.extend(list(estimator_spec.evaluation_hooks or []))
 
       eval_results = evaluation._evaluate_once(  # pylint: disable=protected-access
@@ -1039,3 +1055,16 @@ def _has_dataset_or_queue_runner(maybe_tensor):
 
   # Now, check queue.
   return ops.get_default_graph().get_collection(ops.GraphKeys.QUEUE_RUNNERS)
+
+
+class _DatasetInitializerHook(training.SessionRunHook):
+
+  def __init__(self, iterator):
+    self._iterator = iterator
+
+  def begin(self):
+    self._initializer = self._iterator.initializer
+
+  def after_create_session(self, session, coord):
+    del coord
+    session.run(self._initializer)
diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py
index c1b773b8c408dbfe7df685d5dcf2748ae5428adf..db64fbc9ccc3a212e7dfa1ad4d82e3138e3a3d56 100644
--- a/tensorflow/python/estimator/estimator_test.py
+++ b/tensorflow/python/estimator/estimator_test.py
@@ -913,6 +913,80 @@ class EstimatorGetVariablesTest(test.TestCase):
     self.assertEqual(3., est.get_variable_value('three'))
 
 
+class EstimatorDatasetIntegrationTest(test.TestCase):
+  """Tests dataset integration."""
+
+  def test_returned_by_input_fn(self):
+
+    def _input_fn():
+      return dataset_ops.Dataset.from_tensors(([1.], [2.]))
+
+    def _model_fn(features, labels, mode):
+      return model_fn_lib.EstimatorSpec(
+          mode,
+          loss=features + labels,  # 1 + 2
+          train_op=state_ops.assign_add(training.get_global_step(), 1))
+
+    est = estimator.Estimator(model_fn=_model_fn)
+    est.train(_input_fn, steps=1)
+    scores = est.evaluate(_input_fn, steps=1)
+    self.assertEqual(3., scores[model_fn_lib.LOSS_METRIC_KEY])
+
+  def test_with_none_labels(self):
+
+    def _input_fn():
+      return dataset_ops.Dataset.from_tensors([7.])
+
+    def _model_fn(features, labels, mode):
+      self.assertIsNone(labels)
+      return model_fn_lib.EstimatorSpec(
+          mode,
+          loss=features,  # 7
+          train_op=state_ops.assign_add(training.get_global_step(), 1))
+
+    est = estimator.Estimator(model_fn=_model_fn)
+    est.train(_input_fn, steps=1)
+    scores = est.evaluate(_input_fn, steps=1)
+    self.assertEqual(7., scores[model_fn_lib.LOSS_METRIC_KEY])
+
+  def test_with_predict(self):
+
+    def _input_fn():
+      return dataset_ops.Dataset.from_tensors([10.])
+
+    def _model_fn(features, labels, mode):
+      _ = labels
+      return model_fn_lib.EstimatorSpec(
+          mode,
+          predictions=features,  # 10
+          loss=features,  # 10
+          train_op=state_ops.assign_add(training.get_global_step(), 1))
+
+    est = estimator.Estimator(model_fn=_model_fn)
+    est.train(_input_fn, steps=1)
+    self.assertEqual([10.], next(est.predict(input_fn=_input_fn)))
+
+  def test_batching(self):
+
+    def _input_fn():
+      return dataset_ops.Dataset.from_tensor_slices(([[1.], [2.]],
+                                                     [[10.], [20.]])).batch(1)
+
+    def _model_fn(features, labels, mode):
+      return model_fn_lib.EstimatorSpec(
+          mode,
+          predictions=features,
+          loss=features + (0 if labels is None else labels),  # 11, 22
+          train_op=state_ops.assign_add(training.get_global_step(), 1))
+
+    est = estimator.Estimator(model_fn=_model_fn)
+    est.train(_input_fn)
+    scores = est.evaluate(_input_fn)
+    # (11 + 22)/2 = 16.5
+    self.assertEqual(16.5, scores[model_fn_lib.LOSS_METRIC_KEY])
+    self.assertEqual([1., 2.], list(est.predict(_input_fn)))
+
+
 class EstimatorEvaluateTest(test.TestCase):
 
   def test_input_fn_args(self):
diff --git a/tensorflow/python/estimator/export/export.py b/tensorflow/python/estimator/export/export.py
index 31e9933c6f702393eb21b10c5bdd770739056032..51075731ddc52a55799958c3bfa6140f77404541 100644
--- a/tensorflow/python/estimator/export/export.py
+++ b/tensorflow/python/estimator/export/export.py
@@ -57,7 +57,7 @@ class ServingInputReceiver(collections.namedtuple(
       groups of receiver tensors, each of which may be a `Tensor` or a dict of
       string to `Tensor`.  These named receiver tensor alternatives generate
       additional serving signatures, which may be used to feed inputs at
-      different points within the input reciever subgraph.  A typical usage is
+      different points within the input receiver subgraph.  A typical usage is
       to allow feeding raw feature `Tensor`s *downstream* of the
       tf.parse_example() op.  Defaults to None.
   """
@@ -191,7 +191,8 @@ def build_all_signature_defs(receiver_tensors,
   if not isinstance(receiver_tensors, dict):
     receiver_tensors = {_SINGLE_RECEIVER_DEFAULT_NAME: receiver_tensors}
   if export_outputs is None or not isinstance(export_outputs, dict):
-    raise ValueError('export_outputs must be a dict.')
+    raise ValueError('export_outputs must be a dict and not'
+                     '{}'.format(type(export_outputs)))
 
   signature_def_map = {}
   excluded_signatures = {}
diff --git a/tensorflow/python/estimator/export/export_test.py b/tensorflow/python/estimator/export/export_test.py
index 3cbef4707a536128e0cc6ca9a14dc2aea8a44707..8442bf04accbd0bc15f5958069bf3060debd42bc 100644
--- a/tensorflow/python/estimator/export/export_test.py
+++ b/tensorflow/python/estimator/export/export_test.py
@@ -358,7 +358,8 @@ class ExportTest(test_util.TensorFlowTestCase):
     with self.assertRaises(ValueError) as e:
       export.build_all_signature_defs(receiver_tensor, None)
 
-    self.assertEqual("export_outputs must be a dict.", str(e.exception))
+    self.assertTrue(str(e.exception).startswith(
+        "export_outputs must be a dict"))
 
   def test_get_timestamped_export_dir(self):
     export_dir_base = tempfile.mkdtemp() + "export/"
diff --git a/tensorflow/python/estimator/inputs/numpy_io.py b/tensorflow/python/estimator/inputs/numpy_io.py
index 3512f66284f93806f897d65afd765fba54bc2af1..750af20e8a1e27c0f9c4fcf3ebf586c41bc9c66c 100644
--- a/tensorflow/python/estimator/inputs/numpy_io.py
+++ b/tensorflow/python/estimator/inputs/numpy_io.py
@@ -117,11 +117,11 @@ def numpy_input_fn(x,
         raise ValueError('y cannot be empty dict, use None instead.')
 
       ordered_dict_y = collections.OrderedDict(
-        sorted(y.items(), key=lambda t: t[0]))
+          sorted(y.items(), key=lambda t: t[0]))
       target_keys = list(ordered_dict_y.keys())
 
       duplicate_keys = set(feature_keys).intersection(set(target_keys))
-      if len(duplicate_keys):
+      if duplicate_keys:
         raise ValueError('{} duplicate keys are found in both x and y: '
                          '{}'.format(len(duplicate_keys), duplicate_keys))
 
@@ -131,16 +131,14 @@ def numpy_input_fn(x,
       ordered_dict_data[target_keys] = y
 
     if len(set(v.shape[0] for v in ordered_dict_data.values())) != 1:
-      shape_dict_of_x = {k: ordered_dict_data[k].shape
-                         for k in feature_keys}
+      shape_dict_of_x = {k: ordered_dict_data[k].shape for k in feature_keys}
 
       if target_keys is None:
         shape_of_y = None
       elif isinstance(target_keys, string_types):
         shape_of_y = y.shape
       else:
-        shape_of_y = {k: ordered_dict_data[k].shape
-                      for k in target_keys}
+        shape_of_y = {k: ordered_dict_data[k].shape for k in target_keys}
 
       raise ValueError('Length of tensors in x and y is mismatched. All '
                        'elements in x and y must have the same length.\n'
@@ -155,11 +153,12 @@ def numpy_input_fn(x,
         enqueue_size=batch_size,
         num_epochs=num_epochs)
 
-    batch = (queue.dequeue_many(batch_size) if num_epochs is None
-                else queue.dequeue_up_to(batch_size))
+    batch = (
+        queue.dequeue_many(batch_size)
+        if num_epochs is None else queue.dequeue_up_to(batch_size))
 
     # Remove the first `Tensor` in `batch`, which is the row number.
-    if len(batch) > 0:
+    if batch:
       batch.pop(0)
 
     features = dict(zip(feature_keys, batch[:len(feature_keys)]))
diff --git a/tensorflow/python/estimator/inputs/numpy_io_test.py b/tensorflow/python/estimator/inputs/numpy_io_test.py
index 65eae7a7dcb1b80d57c7da96f6e26ee4c964ea47..1374e3f7e12e76683f14737747b490c9a5e319eb 100644
--- a/tensorflow/python/estimator/inputs/numpy_io_test.py
+++ b/tensorflow/python/estimator/inputs/numpy_io_test.py
@@ -255,7 +255,7 @@ class NumpyIoTest(test.TestCase):
 
     with self.test_session() as session:
       input_fn = numpy_io.numpy_input_fn(
-        x, y, batch_size=2, shuffle=False, num_epochs=1)
+          x, y, batch_size=2, shuffle=False, num_epochs=1)
       features_tensor = input_fn()
 
       coord = coordinator.Coordinator()
@@ -327,7 +327,7 @@ class NumpyIoTest(test.TestCase):
 
     with self.test_session() as session:
       input_fn = numpy_io.numpy_input_fn(
-        x, y, batch_size=2, shuffle=False, num_epochs=1)
+          x, y, batch_size=2, shuffle=False, num_epochs=1)
       features_tensor, targets_tensor = input_fn()
 
       coord = coordinator.Coordinator()
@@ -362,13 +362,10 @@ class NumpyIoTest(test.TestCase):
     a = np.arange(4) * 1.0
     b = np.arange(32, 36)
     x = {'a': a, 'b': b}
-    y = {'y1': np.arange(-32, -28),
-         'a': a,
-         'y2': np.arange(32, 28, -1),
-         'b': b}
+    y = {'y1': np.arange(-32, -28), 'a': a, 'y2': np.arange(32, 28, -1), 'b': b}
     with self.test_session():
       with self.assertRaisesRegexp(
-              ValueError, '2 duplicate keys are found in both x and y'):
+          ValueError, '2 duplicate keys are found in both x and y'):
         failing_input_fn = numpy_io.numpy_input_fn(x, y, shuffle=False)
         failing_input_fn()
 
diff --git a/tensorflow/python/estimator/inputs/queues/feeding_functions.py b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
index c0a287e922223e8999c45da16d291c95842718f9..75c0e61d47b37110b14aa57f6a185cab822a70bb 100644
--- a/tensorflow/python/estimator/inputs/queues/feeding_functions.py
+++ b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
@@ -47,13 +47,13 @@ except ImportError:
 
 
 def _fill_array(arr, seq, fillvalue=0):
-  """ 
-  Recursively fills padded arr with elements from seq. 
+  """
+  Recursively fills padded arr with elements from seq.
   If length of seq is less than arr padded length, fillvalue used.
 
   Args:
     arr: Padded tensor of shape [batch_size, ..., max_padded_dim_len].
-    seq: Non-padded list of data sampels of shape 
+    seq: Non-padded list of data sampels of shape
       [batch_size, ..., padded_dim(None)]
     fillvalue: Default fillvalue to use.
   """
@@ -73,12 +73,12 @@ def _pad_if_needed(batch_key_item, fillvalue=0):
   """ Returns padded batch.
 
   Args:
-    batch_key_item: List of data samples of any type with shape 
+    batch_key_item: List of data samples of any type with shape
       [batch_size, ..., padded_dim(None)].
     fillvalue: Default fillvalue to use.
 
   Returns:
-    Padded with zeros tensor of same type and shape 
+    Padded with zeros tensor of same type and shape
       [batch_size, ..., max_padded_dim_len].
 
   Raises:
@@ -375,7 +375,7 @@ def _enqueue_data(data,
       arrays, a numpy `ndarray`, or a generator producing these.
     NotImplementedError: padding and shuffling data at the same time.
     NotImplementedError: padding usage with non generator data type.
-  """ 
+  """
   with ops.name_scope(name):
     if isinstance(data, np.ndarray):
       types = [dtypes.int64, dtypes.as_dtype(data.dtype)]
diff --git a/tensorflow/python/estimator/run_config.py b/tensorflow/python/estimator/run_config.py
index d71964d2ec8e8ce21934428c3fff88f65b2751da..3893f48caef1b69ccef3f13f35577a4de3c8af1d 100644
--- a/tensorflow/python/estimator/run_config.py
+++ b/tensorflow/python/estimator/run_config.py
@@ -80,6 +80,13 @@ def _get_master(cluster_spec, task_type, task_id):
         '%s\n\n'
         'Note that these values may be coming from the TF_CONFIG environment '
         'variable.' % (task_id, task_type, cluster_spec))
+
+  # If there is only one node in the cluster, do things locally by setting
+  # master to ''.  If a service or user sets TF_CONFIG with a single node, it's
+  # more performant to use a direct master rather than an RPC service.
+  if len(jobs) == 1 and len(cluster_spec.job_tasks(jobs[0])) == 1:
+    return _LOCAL_MASTER
+
   return _GRPC_SCHEME + addresses[task_id]
 
 
diff --git a/tensorflow/python/estimator/run_config_test.py b/tensorflow/python/estimator/run_config_test.py
index ecc850d5405837e8bf803b9a7c8c156ff19b7a90..6a62c061ff83057525424c36364bc7baea7e1d97 100644
--- a/tensorflow/python/estimator/run_config_test.py
+++ b/tensorflow/python/estimator/run_config_test.py
@@ -344,7 +344,7 @@ class RunConfigDistributedSettingTest(test.TestCase):
         expected_cluster_spec=tf_config['cluster'],
         expected_task_type=run_config_lib.TaskType.CHIEF,
         expected_task_id=0,
-        expected_master='grpc://host0:0',
+        expected_master='',
         expected_evaluation_master='',
         expected_is_chief=True,
         expected_num_worker_replicas=1,
@@ -572,7 +572,7 @@ class RunConfigDistributedSettingWithMasterTest(test.TestCase):
         expected_cluster_spec=tf_config['cluster'],
         expected_task_type=run_config_lib.TaskType.MASTER,
         expected_task_id=0,
-        expected_master='grpc://host0:0',
+        expected_master='',
         expected_evaluation_master='',
         expected_is_chief=True,
         expected_num_worker_replicas=1,
diff --git a/tensorflow/python/estimator/training.py b/tensorflow/python/estimator/training.py
index 1131995b3ef1a832c3312d27a46d8395d62cecc7..569ea04f01b3ebcd4350c79702da7140e55ebb72 100644
--- a/tensorflow/python/estimator/training.py
+++ b/tensorflow/python/estimator/training.py
@@ -43,6 +43,8 @@ _DELAY_SECS_PER_WORKER = 5
 _TF_CONFIG_ENV = 'TF_CONFIG'
 _ENVIRONMENT_KEY = 'environment'
 _ENVIRONMENT_GOOGLE_VALUE = 'google'
+_TRAINER_JOBS = (run_config_lib.TaskType.CHIEF, run_config_lib.TaskType.MASTER,
+                 run_config_lib.TaskType.WORKER)
 
 
 def _validate_input_fn(input_fn):
@@ -486,7 +488,11 @@ class _TrainingExecutor(object):
   training and evaluation based on the setting in `tf.estimator.RunConfig`.
   """
 
-  def __init__(self, estimator, train_spec, eval_spec):
+  def __init__(self,
+               estimator,
+               train_spec,
+               eval_spec,
+               continuous_eval_listener=None):
     if not isinstance(estimator, estimator_lib.Estimator):
       raise TypeError('`estimator` must have type `tf.estimator.Estimator`.')
     self._estimator = estimator
@@ -499,6 +505,13 @@ class _TrainingExecutor(object):
       raise TypeError('`eval_spec` must have type `tf.estimator.EvalSpec`.')
     self._eval_spec = eval_spec
 
+    if (continuous_eval_listener and
+        not isinstance(continuous_eval_listener, _ContinuousEvalListener)):
+      raise TypeError('`continuous_eval_listener` must have type '
+                      '`_ContinuousEvalListener`.')
+    self._continuous_eval_listener = (
+        continuous_eval_listener or _ContinuousEvalListener())
+
   @property
   def estimator(self):
     return self._estimator
@@ -613,22 +626,42 @@ class _TrainingExecutor(object):
       # _should_stop_local_train will then end the while True as the stopping
       # condition is satisfied (both checks use the same global_step value,
       # i.e., no race condition)
-      metrics = evaluator.evaluate_and_export()
+      eval_result = evaluator.evaluate_and_export()
 
-      if not metrics:
-        #  This is unexpected. Training should always end with a new checkpoint.
-        raise RuntimeError('There was no new checkpoint after the training.')
+      if eval_result.status != _EvalStatus.EVALUATED:
+        #  This is unexpected; should never happen.
+        #  Training should always end with a new checkpoint.
+        raise RuntimeError('There was no new checkpoint after the training. '
+                           'Eval status: {}'.format(eval_result.status))
 
-      if _should_stop_local_train(metrics[ops.GraphKeys.GLOBAL_STEP]):
+      if _should_stop_local_train(
+          eval_result.metrics[ops.GraphKeys.GLOBAL_STEP]):
         break
 
   def _start_std_server(self, config):
     """Creates, starts, and returns a server_lib.Server."""
-    if (not config.cluster_spec or not config.task_type or not config.master or
+    if (not config.cluster_spec or not config.task_type or
         config.task_id is None):
       raise RuntimeError('Could not start server; be sure to specify '
-                         'cluster_spec, task_type, master, and task in '
+                         'cluster_spec, task_type, and task in '
                          'RunConfig or set the TF_CONFIG environment variable.')
+
+    if not config.master:
+      jobs = config.cluster_spec.jobs
+      if (len(jobs) == 1 and len(config.cluster_spec.job_tasks(jobs[0])) == 1
+          and config.task_type in _TRAINER_JOBS):
+        # For distributed training, config.master is empty if and only if it has
+        # a single node in the cluster spec. In this case, we should not start
+        # the server.
+        logging.info('Skip starting Tensorflow server as there is only one '
+                     'node in the cluster.')
+        return
+      else:
+        raise RuntimeError(
+            'Could not start server; be sure to specify master in '
+            'RunConfig or set the TF_CONFIG environment variable.')
+
+    logging.info('Start Tensorflow server.')
     server = server_lib.Server(
         config.cluster_spec,
         job_name=config.task_type,
@@ -678,9 +711,11 @@ class _TrainingExecutor(object):
     evaluator = _TrainingExecutor._Evaluator(self._estimator, self._eval_spec,
                                              self._train_spec.max_steps)
 
-    while True:
-      if latest_eval_result:
-        global_step = latest_eval_result.get(ops.GraphKeys.GLOBAL_STEP)
+    should_early_stop = False
+    while not should_early_stop:
+      if (latest_eval_result and
+          latest_eval_result.status == _EvalStatus.EVALUATED):
+        global_step = latest_eval_result.metrics.get(ops.GraphKeys.GLOBAL_STEP)
         if (global_step and self._train_spec.max_steps and
             global_step >= self._train_spec.max_steps):
           logging.info(
@@ -689,21 +724,46 @@ class _TrainingExecutor(object):
               self._train_spec.max_steps)
           return
 
-      # Final export signal: For any eval result with global_step >= train
-      # max_steps, the evaluator will send the final export signal. The next
-      # iteration of while loop will end the continuous eval as the stopping
-      # condition is satisfied (both checks use the same global_step value,
-      # i.e., no race condition)
-      start = time.time()
-      latest_eval_result = evaluator.evaluate_and_export()
+      latest_eval_result, should_early_stop = self._execute_evaluator_once(
+          evaluator, self._continuous_eval_listener,
+          self._eval_spec.throttle_secs)
+
+  def _execute_evaluator_once(self, evaluator, continuous_eval_listener,
+                              throttle_secs):
+    """Executes the `evaluator`."""
+    start = time.time()
 
-      # Throttle if necessary.
-      elapsed_time = time.time() - start
-      difference = self._eval_spec.throttle_secs  - elapsed_time
-      if difference > 0:
-        logging.info('Waiting %f secs before starting next eval run.',
-                     difference)
-        time.sleep(difference)
+    eval_result = None
+    should_early_stop = False
+
+    if not continuous_eval_listener.before_eval():
+      logging.info('Exiting evaluation, as requested by '
+                   '_ContinuousEvalListener.before_eval.')
+      should_early_stop = True
+      return (eval_result, should_early_stop)
+
+    # Final export signal: For any eval result with global_step >= train
+    # max_steps, the evaluator will send the final export signal. The next
+    # iteration of while loop will end the continuous eval as the stopping
+    # condition is satisfied (both checks use the same global_step value,
+    # i.e., no race condition)
+    eval_result = evaluator.evaluate_and_export()
+
+    if not self._continuous_eval_listener.after_eval(eval_result):
+      logging.info('Exiting evaluation, as requested by '
+                   '_ContinuousEvalListener.after_eval.')
+      should_early_stop = True
+      return (eval_result, should_early_stop)
+
+    # Throttle if necessary.
+    elapsed_time = time.time() - start
+    difference = throttle_secs  - elapsed_time
+    if difference > 0:
+      logging.info('Waiting %f secs before starting next eval run.',
+                   difference)
+      time.sleep(difference)
+
+    return (eval_result, should_early_stop)
 
   class _Evaluator(object):
     """A helper class to call `Estimator.evaluate` and export model."""
@@ -724,8 +784,7 @@ class _TrainingExecutor(object):
       """Evaluate and (maybe) export the current model.
 
       Returns:
-        Evaluation results. Returns `None` if current round of evaluation is
-        skipped.
+        An `EvalResult` instance.
 
       Raises:
         RuntimeError: for any unexpected internal error.
@@ -735,39 +794,32 @@ class _TrainingExecutor(object):
       if not latest_ckpt_path:
         self._log_err_msg('Estimator is not trained yet. Will start an '
                           'evaluation when a checkpoint is ready.')
-        return None
+        return _EvalResult(status=_EvalStatus.MISSING_CHECKPOINT)
 
       if latest_ckpt_path == self._previous_ckpt_path:
         self._log_err_msg(
             'No new checkpoint ready for evaluation. Skip the current '
             'evaluation pass as evaluation results are expected to be same '
             'for the same checkpoint.')
-        return None
-      eval_result = self._estimator.evaluate(
+        return _EvalResult(status=_EvalStatus.NO_NEW_CHECKPOINT)
+
+      metrics = self._estimator.evaluate(
           input_fn=self._eval_spec.input_fn,
           steps=self._eval_spec.steps,
           name=self._eval_spec.name,
           checkpoint_path=latest_ckpt_path,
           hooks=self._eval_spec.hooks)
 
-      if not eval_result:
-        raise RuntimeError(
-            'Internal error: `Estimator.evaluate` should never return empty '
-            'result.')
-      if not isinstance(eval_result, dict):
-        raise TypeError(
-            '`Estimator.evaluate` should return dict. Given {}.'.format(
-                type(eval_result)))
-      if ops.GraphKeys.GLOBAL_STEP not in eval_result:
-        raise RuntimeError(
-            'Internal error: `Estimator.evaluate` result should have '
-            '`global_step` in result. Given {}'.format(eval_result))
+      # _EvalResult validates the metrics.
+      eval_result = _EvalResult(
+          status=_EvalStatus.EVALUATED,
+          metrics=metrics,
+          checkpoint_path=latest_ckpt_path)
 
-      is_the_final_export = (eval_result[ops.GraphKeys.GLOBAL_STEP] >=
-                             self._max_training_steps
-                             if self._max_training_steps else False)
-      self._export_eval_result(eval_result, latest_ckpt_path,
-                               is_the_final_export)
+      is_the_final_export = (
+          eval_result.metrics[ops.GraphKeys.GLOBAL_STEP] >=
+          self._max_training_steps if self._max_training_steps else False)
+      self._export_eval_result(eval_result, is_the_final_export)
 
       if is_the_final_export:
         logging.debug('Calling exporter with the `is_the_final_export=True`.')
@@ -784,8 +836,7 @@ class _TrainingExecutor(object):
         logging.warning(message)
         self._last_warning_time = current_time
 
-    def _export_eval_result(self, eval_result, checkpoint_path,
-                            is_the_final_export):
+    def _export_eval_result(self, eval_result, is_the_final_export):
       """Export `eval_result` according to exporters in `EvalSpec`."""
       export_dir_base = os.path.join(
           compat.as_str_any(self._estimator.model_dir),
@@ -797,6 +848,114 @@ class _TrainingExecutor(object):
             export_path=os.path.join(
                 compat.as_str_any(export_dir_base),
                 compat.as_str_any(exporter.name)),
-            checkpoint_path=checkpoint_path,
-            eval_result=eval_result,
+            checkpoint_path=eval_result.checkpoint_path,
+            eval_result=eval_result.metrics,
             is_the_final_export=is_the_final_export)
+
+
+class _EvalStatus(object):
+  """The status of an evaluation event.
+
+  For local training and evaluation, the status can only be `EVALUATED` as
+  `Estimator.train` always generates a new checkpoint.
+
+  For distributed training and evaluation, a separated evaluator keeps looking
+  for new checkpoint. So, multiple situations might occur:
+
+  - EVALUATED: A new checkpoint is found since last evaluation.
+      `Estimator.evaluate` will be invoked.
+  - MISSING_CHECKPOINT: No checkpoint can be found. Typically, this means
+      the trainer has not yet produced any checkpoint.
+  - NO_NEW_CHECKPOINT: No new checkpoint can be found since last evaluation.
+      Typically, this means the trainer has not yet produced any new checkpoint.
+  """
+
+  EVALUATED = 'evaluated'
+  MISSING_CHECKPOINT = 'missing checkpoint'
+  NO_NEW_CHECKPOINT = 'no new checkpoint'
+
+
+class _EvalResult(
+    collections.namedtuple('EvalResult',
+                           ['status', 'metrics', 'checkpoint_path'])):
+  """_EvalResult holds the result of an evaluation event."""
+
+  def __new__(cls, status, metrics=None, checkpoint_path=None):
+    """Creates a validated `_EvalResult`.
+
+    Args:
+      status: See `_EvalStatus`.
+      metrics: The evaluation results returned by `Estimator.evaluate`. Only set
+          if status is `EVALUATED`.
+      checkpoint_path: The corresponding checkpoint path for the `metrics`. Only
+          set if status is `EVALUATED`.
+    Returns:
+      A validated `_EvalResult` object.
+
+    Raises:
+      ValueError: If validation fails.
+      TypeError: If any of the arguments is not the expected type.
+    """
+
+    if status != _EvalStatus.EVALUATED:
+      if metrics:
+        raise ValueError(
+            'metrics must be `None` if status is not {}; got status {},'
+            ' metrics {}'.format(_EvalStatus.EVALUATED, status, metrics))
+      if checkpoint_path:
+        raise ValueError(
+            'checkpoint must be `None` if status is not {}; got status {}, '
+            'checkpoint_path {}'.format(
+                _EvalStatus.EVALUATED, status, checkpoint_path))
+      return super(_EvalResult, cls).__new__(cls, status, metrics,
+                                             checkpoint_path)
+
+    # Now, evaluated case.
+    assert status == _EvalStatus.EVALUATED
+
+    # Validates metrics.
+    if not metrics:
+      raise ValueError(
+          'Internal error: `Estimator.evaluate` should never return empty '
+          'metrics.')
+    if not isinstance(metrics, dict):
+      raise TypeError(
+          '`Estimator.evaluate` should return dict. Given {}.'.format(
+              type(metrics)))
+    if ops.GraphKeys.GLOBAL_STEP not in metrics:
+      raise ValueError(
+          'Internal error: `Estimator.evaluate` result should have '
+          '`global_step` in result. Given {}'.format(metrics))
+
+    # Validates checkpoint_path.
+    if not checkpoint_path:
+      raise ValueError(
+          'Internal error: `checkpoint_path` should never be empty.')
+
+    return super(_EvalResult, cls).__new__(cls, status, metrics,
+                                           checkpoint_path)
+
+
+class _ContinuousEvalListener(object):
+  """Interface for listeners that take action before or after evaluation."""
+
+  def before_eval(self):
+    """Called before evaluation.
+
+    Returns:
+      `False` if you want to skip the current evaluation and early stop the
+      continuous evaluation; `True` otherwise.
+    """
+    return True
+
+  def after_eval(self, eval_result):
+    """Called after the evaluation is executed.
+
+    Args:
+      eval_result: An `_EvalResult` instance.
+
+    Returns:
+      False if you want to early stop continuous evaluation; `True` otherwise.
+    """
+    del eval_result
+    return True
diff --git a/tensorflow/python/estimator/training_test.py b/tensorflow/python/estimator/training_test.py
index 1862e325e2b65ae2141132c4b900673c755e179e..6390a67762a9497dcd1f2cd0a39a293225aeacf6 100644
--- a/tensorflow/python/estimator/training_test.py
+++ b/tensorflow/python/estimator/training_test.py
@@ -81,7 +81,7 @@ _INVALID_TASK_TYPE = '`estimator.config` must have task_type set.'
 _INVALID_TASK_TO_RUN = (
     'Task type .* is not supported. Supported task types are ((?!local).)*$')
 _INVALID_EMPTY_EVAL_RESULT_ERR = (
-    'Internal error: `Estimator.evaluate` should never return empty result')
+    'Internal error: `Estimator.evaluate` should never return empty metrics')
 _INVALID_EVAL_RESULT_TYPE_ERR = '`Estimator.evaluate` should return dict.'
 _MISSING_GLOBAL_STEP_IN_EVAL_RESULT_ERR = (
     'Internal error: `Estimator.evaluate` result should have `global_step`')
@@ -480,7 +480,7 @@ class TrainAndEvaluteTest(test.TestCase):
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.Mock()
-    mock_est.config.cluster_spec = {'1': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'1': ['dummy']})
     mock_est.config.task_type = ''
 
     with self.assertRaisesRegexp(ValueError, _INVALID_TASK_TYPE):
@@ -598,7 +598,8 @@ class _TrainingExecutorTrainingTest(object):
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'worker': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec(
+        {'worker': ['dummy', 'dummy1']})
     mock_est.config.master = ''
     mock_est.config.task_type = 'worker'
     mock_est.config.task_id = 2
@@ -608,13 +609,33 @@ class _TrainingExecutorTrainingTest(object):
       self._run_task(training._TrainingExecutor(mock_est, mock_train_spec,
                                                 mock_eval_spec))
 
+  @test.mock.patch.object(time, 'sleep')
+  @test.mock.patch.object(server_lib, 'Server')
+  def test_single_worker_node_with_empty_tf_master(
+      self, mock_server, unused_mock_sleep):
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
+    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
+
+    mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
+    # Single node cluster.
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'worker': ['dummy']})
+    mock_est.config.master = ''
+    mock_est.config.task_type = 'worker'
+    mock_est.config.task_id = 2
+
+    self._run_task(training._TrainingExecutor(mock_est, mock_train_spec,
+                                              mock_eval_spec))
+    self.assertTrue(mock_est.train.called)
+    mock_server.assert_not_called()
+
   def test_fail_with_empty_task_type(self):
     mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
     mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'worker': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'worker': ['dummy']})
     mock_est.config.master = 'grpc://...'
     mock_est.config.task_type = ''
     mock_est.config.task_id = 2
@@ -630,7 +651,7 @@ class _TrainingExecutorTrainingTest(object):
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'worker': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'worker': ['dummy']})
     mock_est.config.master = 'grpc://...'
     mock_est.config.task_type = 'worker'
     mock_est.config.task_id = None
@@ -768,7 +789,7 @@ class TrainingExecutorRunMasterTest(test.TestCase):
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
     mock_est.config.cluster_spec = None
     mock_est.config.master = 'grpc://...'
-    mock_est.config.task_type = 'worker'
+    mock_est.config.task_type = 'master'
     mock_est.config.task_id = 2
 
     with self.assertRaisesRegexp(RuntimeError,
@@ -782,23 +803,48 @@ class TrainingExecutorRunMasterTest(test.TestCase):
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'worker': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec(
+        {'master': ['dummy'], 'worker': ['dummy1']})
     mock_est.config.master = ''
-    mock_est.config.task_type = 'worker'
-    mock_est.config.task_id = 2
+    mock_est.config.task_type = 'master'
+    mock_est.config.task_id = 0
 
     with self.assertRaisesRegexp(RuntimeError,
                                  _INVALID_CONFIG_FOR_STD_SERVER_MSG):
       training._TrainingExecutor(
           mock_est, mock_train_spec, mock_eval_spec).run_master()
 
+  @test.mock.patch.object(time, 'sleep')
+  @test.mock.patch.object(server_lib, 'Server')
+  def test_single_master_node_with_empty_tf_master(
+      self, mock_server, unused_mock_sleep):
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    mock_est.evaluate = lambda *args, **kw: {ops.GraphKeys.GLOBAL_STEP: 123}
+
+    mock_train_spec = test.mock.Mock(spec=training.TrainSpec, max_steps=123)
+    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec, exporters=[])
+
+    mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
+    mock_est.config.cluster_spec = server_lib.ClusterSpec(
+        {'master': ['dummy']})
+    mock_est.config.master = ''
+    mock_est.config.task_type = 'master'
+    mock_est.config.task_id = 0
+
+    executor = training._TrainingExecutor(
+        mock_est, mock_train_spec, mock_eval_spec)
+    executor.run_master()
+
+    mock_server.assert_not_called()
+    self.assertTrue(mock_est.train.called)
+
   def test_fail_with_empty_task_type(self):
     mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
     mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'worker': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'master': ['dummy']})
     mock_est.config.master = 'grpc://...'
     mock_est.config.task_type = ''
     mock_est.config.task_id = 2
@@ -814,9 +860,9 @@ class TrainingExecutorRunMasterTest(test.TestCase):
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'worker': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'master': ['dummy']})
     mock_est.config.master = 'grpc://...'
-    mock_est.config.task_type = 'worker'
+    mock_est.config.task_type = 'master'
     mock_est.config.task_id = None
 
     with self.assertRaisesRegexp(RuntimeError,
@@ -1016,7 +1062,7 @@ class TrainingExecutorRunEvaluatorTest(test.TestCase):
                is_the_final_export):
       del export_path, checkpoint_path, eval_result
       estimator.times_export_was_called += 1
-      # final_export is happend at the end.
+      # final_export is happened at the end.
       self.assertEqual(0, estimator.times_final_export_was_called)
       if is_the_final_export:
         estimator.times_final_export_was_called += 1
@@ -1036,6 +1082,86 @@ class TrainingExecutorRunEvaluatorTest(test.TestCase):
     self.assertEqual(2, mock_est.times_export_was_called)
     self.assertEqual(1, mock_est.times_final_export_was_called)
 
+  def test_evaluate_listener_before_eval(self):
+    training_max_step = 200
+
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    mock_est.model_dir = compat.as_bytes(test.get_temp_dir())
+    # Without early stopping, this eval will be run twice.
+    mock_est.evaluate.side_effect = [{
+        _GLOBAL_STEP_KEY: training_max_step // 2
+    }, {
+        _GLOBAL_STEP_KEY: training_max_step
+    }]
+    mock_est.latest_checkpoint.side_effect = ['path_1', 'path_2']
+
+    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
+    mock_train_spec.max_steps = training_max_step
+
+    class _Listener(training._ContinuousEvalListener):
+
+      def __init__(self):
+        self.call_count = 0
+
+      def before_eval(self):
+        self.call_count += 1
+        return  self.call_count == 1
+
+    listener = _Listener()
+
+    eval_spec = training.EvalSpec(
+        input_fn=lambda: 1, start_delay_secs=0, throttle_secs=0)
+
+    training._TrainingExecutor(mock_est, mock_train_spec, eval_spec,
+                               listener).run_evaluator()
+
+    # Before_eval returns False during the second time, so, evaluate will be
+    # called once.
+    self.assertEqual(1, mock_est.evaluate.call_count)
+    self.assertEqual(2, listener.call_count)
+
+  def test_evaluate_listener_after_eval(self):
+    training_max_step = 200
+
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    mock_est.model_dir = compat.as_bytes(test.get_temp_dir())
+    # Without early stopping, this eval will be run twice.
+    expected_eval_metrics = [{
+        _GLOBAL_STEP_KEY: training_max_step // 2
+    }, {
+        _GLOBAL_STEP_KEY: training_max_step
+    }]
+    mock_est.evaluate.side_effect = expected_eval_metrics
+    mock_est.latest_checkpoint.side_effect = ['path_1', 'path_2']
+
+    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
+    mock_train_spec.max_steps = training_max_step
+
+    class _Listener(training._ContinuousEvalListener):
+
+      def __init__(self):
+        self.call_count = 0
+
+      def after_eval(self, eval_result):
+        self.call_count += 1
+        self.eval_result = eval_result
+        return False
+
+    listener = _Listener()
+
+    eval_spec = training.EvalSpec(
+        input_fn=lambda: 1, start_delay_secs=0, throttle_secs=0)
+
+    training._TrainingExecutor(mock_est, mock_train_spec, eval_spec,
+                               listener).run_evaluator()
+
+    # after_eval returns False during the first time, so, evaluate will be
+    # called once.
+    self.assertEqual(1, mock_est.evaluate.call_count)
+    self.assertEqual(1, listener.call_count)
+    self.assertAllEqual(expected_eval_metrics[0], listener.eval_result.metrics)
+    self.assertEqual('path_1', listener.eval_result.checkpoint_path)
+
   def test_final_export_is_true_in_the_end(self):
     training_max_step = 200
 
@@ -1108,6 +1234,67 @@ class TrainingExecutorRunEvaluatorTest(test.TestCase):
     # successuful evaluation)
     self.assertEqual(2, mock_log.call_count)
 
+  def test_continuous_eval_listener_eval_result(self):
+    training_max_step = 200
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    expected_eval_metrics = [{
+        _GLOBAL_STEP_KEY: training_max_step // 2
+    }, {
+        _GLOBAL_STEP_KEY: training_max_step
+    }]
+    mock_est.evaluate.side_effect = expected_eval_metrics
+    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
+    mock_train_spec.max_steps = training_max_step
+
+    class _Listener(training._ContinuousEvalListener):
+
+      def __init__(self):
+        self.eval_results = []
+
+      def after_eval(self, eval_result):
+        self.eval_results.append(eval_result)
+        return True
+
+    continuous_eval_listener = _Listener()
+
+    self._set_up_mock_est_to_train_and_evaluate_once(mock_est, mock_train_spec)
+
+    # First two items are invalid, next two items are same.
+    mock_est.latest_checkpoint.side_effect = [
+        None, '', 'same', 'same', 'path_2'
+    ]
+    expected_eval_results = [
+        training._EvalResult(training._EvalStatus.MISSING_CHECKPOINT),
+        training._EvalResult(training._EvalStatus.MISSING_CHECKPOINT),
+        training._EvalResult(
+            training._EvalStatus.EVALUATED,
+            metrics=expected_eval_metrics[0],
+            checkpoint_path='same'),
+        training._EvalResult(training._EvalStatus.NO_NEW_CHECKPOINT),
+        training._EvalResult(
+            training._EvalStatus.EVALUATED,
+            metrics=expected_eval_metrics[1],
+            checkpoint_path='path_2'),
+    ]
+
+    eval_spec = training.EvalSpec(
+        input_fn=lambda: 1, start_delay_secs=0, throttle_secs=0)
+
+    executor = training._TrainingExecutor(mock_est, mock_train_spec, eval_spec,
+                                          continuous_eval_listener)
+    executor.run_evaluator()
+
+    # Three checkpoint paths are invalid.
+    self.assertEqual(5, mock_est.latest_checkpoint.call_count)
+    self.assertEqual(2, mock_est.evaluate.call_count)
+
+    self.assertEqual(5, len(continuous_eval_listener.eval_results))
+    for i, result in enumerate(continuous_eval_listener.eval_results):
+      self.assertEqual(expected_eval_results[i].status, result.status)
+      self.assertAllEqual(expected_eval_results[i].metrics, result.metrics)
+      self.assertEqual(expected_eval_results[i].checkpoint_path,
+                       result.checkpoint_path)
+
   def test_sleep_start_delay_secs(self):
     training_max_step = 200
     start_delay_secs = 123
@@ -1184,7 +1371,7 @@ class TrainingExecutorRunEvaluatorTest(test.TestCase):
     mock_est.evaluate.return_value = {}
 
     executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
-    with self.assertRaisesRegexp(RuntimeError, _INVALID_EMPTY_EVAL_RESULT_ERR):
+    with self.assertRaisesRegexp(ValueError, _INVALID_EMPTY_EVAL_RESULT_ERR):
       executor.run_evaluator()
 
   def test_errors_out_if_evaluate_returns_non_dict(self):
@@ -1206,7 +1393,7 @@ class TrainingExecutorRunEvaluatorTest(test.TestCase):
     mock_est.evaluate.return_value = {'loss': 123}
 
     executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
-    with self.assertRaisesRegexp(RuntimeError,
+    with self.assertRaisesRegexp(ValueError,
                                  _MISSING_GLOBAL_STEP_IN_EVAL_RESULT_ERR):
       executor.run_evaluator()
 
@@ -1246,7 +1433,7 @@ class TrainingExecutorRunPsTest(test.TestCase):
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
     mock_est.config.cluster_spec = None
     mock_est.config.master = 'grpc://...'
-    mock_est.config.task_type = 'gs'
+    mock_est.config.task_type = 'ps'
     mock_est.config.task_id = 2
 
     with self.assertRaisesRegexp(RuntimeError,
@@ -1260,9 +1447,9 @@ class TrainingExecutorRunPsTest(test.TestCase):
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'gs': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'ps': ['dummy']})
     mock_est.config.master = ''
-    mock_est.config.task_type = 'gs'
+    mock_est.config.task_type = 'ps'
     mock_est.config.task_id = 2
 
     with self.assertRaisesRegexp(RuntimeError,
@@ -1276,7 +1463,7 @@ class TrainingExecutorRunPsTest(test.TestCase):
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'gs': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'ps': ['dummy']})
     mock_est.config.master = 'grpc://...'
     mock_est.config.task_type = ''
     mock_est.config.task_id = 2
@@ -1292,9 +1479,9 @@ class TrainingExecutorRunPsTest(test.TestCase):
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'gs': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'ps': ['dummy']})
     mock_est.config.master = 'grpc://...'
-    mock_est.config.task_type = 'gs'
+    mock_est.config.task_type = 'ps'
     mock_est.config.task_id = None
 
     with self.assertRaisesRegexp(RuntimeError,
@@ -1361,7 +1548,7 @@ class TrainingExecutorRunLocalTest(test.TestCase):
                is_the_final_export):
       del export_path, checkpoint_path, eval_result
       estimator.times_export_was_called += 1
-      # final_export is happend at the end.
+      # final_export is happened at the end.
       self.assertEqual(0, estimator.times_final_export_was_called)
       if is_the_final_export:
         estimator.times_final_export_was_called += 1
@@ -1527,7 +1714,7 @@ class TrainingExecutorRunLocalTest(test.TestCase):
     mock_est.evaluate.return_value = {}
 
     executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
-    with self.assertRaisesRegexp(RuntimeError, _INVALID_EMPTY_EVAL_RESULT_ERR):
+    with self.assertRaisesRegexp(ValueError, _INVALID_EMPTY_EVAL_RESULT_ERR):
       executor.run_local()
 
   def test_errors_out_if_evaluate_returns_non_dict(self):
@@ -1547,7 +1734,7 @@ class TrainingExecutorRunLocalTest(test.TestCase):
     mock_est.evaluate.return_value = {'loss': 123}
 
     executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
-    with self.assertRaisesRegexp(RuntimeError,
+    with self.assertRaisesRegexp(ValueError,
                                  _MISSING_GLOBAL_STEP_IN_EVAL_RESULT_ERR):
       executor.run_local()
 
diff --git a/tensorflow/python/estimator/util.py b/tensorflow/python/estimator/util.py
index 12f2592d848c3ce55777ffdae5cee7ac602ee87f..b31486dfa1122c2549ba3e9f6a730fd26444450a 100644
--- a/tensorflow/python/estimator/util.py
+++ b/tensorflow/python/estimator/util.py
@@ -52,7 +52,7 @@ def fn_args(fn):
   else:
     if _is_callable_object(fn):
       fn = fn.__call__
-    args = tf_inspect.getargspec(fn).args
+    args = tf_inspect.getfullargspec(fn).args
     if _is_bounded_method(fn):
       args.remove('self')
   return tuple(args)
diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD
index b1c81dd58c7d2d9cf95821ea78eda2e7ee675d25..76d44fc474f936733f4eeeefd5d9510964ebb430 100644
--- a/tensorflow/python/feature_column/BUILD
+++ b/tensorflow/python/feature_column/BUILD
@@ -48,6 +48,7 @@ py_library(
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
+        "//tensorflow/python:template",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 5ee93be7c3e51badac6bfb966c143a488ce655bf..a7fe528ee1d85c3c06d4e9376ca4937aaf168b8a 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -134,6 +134,7 @@ import math
 import numpy as np
 import six
 
+
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
@@ -149,13 +150,65 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import string_ops
+from tensorflow.python.ops import template
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
+from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_utils
 from tensorflow.python.util import nest
 
 
+def _internal_input_layer(features,
+                          feature_columns,
+                          weight_collections=None,
+                          trainable=True,
+                          cols_to_vars=None,
+                          scope=None):
+  """See input_layer. `scope` is a name or variable scope to use."""
+
+  feature_columns = _clean_feature_columns(feature_columns)
+  for column in feature_columns:
+    if not isinstance(column, _DenseColumn):
+      raise ValueError(
+          'Items of feature_columns must be a _DenseColumn. '
+          'You can wrap a categorical column with an '
+          'embedding_column or indicator_column. Given: {}'.format(column))
+  weight_collections = list(weight_collections or [])
+  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
+    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
+  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
+    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
+
+  # a non-None `scope` can allow for variable reuse, when, e.g., this function
+  # is wrapped by a `make_template`.
+  with variable_scope.variable_scope(
+      scope, default_name='input_layer', values=features.values()):
+    builder = _LazyBuilder(features)
+    output_tensors = []
+    ordered_columns = []
+    for column in sorted(feature_columns, key=lambda x: x.name):
+      ordered_columns.append(column)
+      with variable_scope.variable_scope(
+          None, default_name=column._var_scope_name):  # pylint: disable=protected-access
+        tensor = column._get_dense_tensor(  # pylint: disable=protected-access
+            builder,
+            weight_collections=weight_collections,
+            trainable=trainable)
+        num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
+        batch_size = array_ops.shape(tensor)[0]
+        output_tensors.append(
+            array_ops.reshape(tensor, shape=(batch_size, num_elements)))
+        if cols_to_vars is not None:
+          # Retrieve any variables created (some _DenseColumn's don't create
+          # variables, in which case an empty list is returned).
+          cols_to_vars[column] = ops.get_collection(
+              ops.GraphKeys.GLOBAL_VARIABLES,
+              scope=variable_scope.get_variable_scope().name)
+    _verify_static_batch_size_equality(output_tensors, ordered_columns)
+    return array_ops.concat(output_tensors, 1)
+
+
 def input_layer(features,
                 feature_columns,
                 weight_collections=None,
@@ -192,7 +245,7 @@ def input_layer(features,
       `bucketized_column`, `indicator_column`. If you have categorical features,
       you can wrap them with an `embedding_column` or `indicator_column`.
     weight_collections: A list of collection names to which the Variable will be
-      added. Note that, variables will also be added to collections
+      added. Note that variables will also be added to collections
       `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
     trainable: If `True` also add the variable to the graph collection
       `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
@@ -214,43 +267,66 @@ def input_layer(features,
   Raises:
     ValueError: if an item in `feature_columns` is not a `_DenseColumn`.
   """
-  feature_columns = _clean_feature_columns(feature_columns)
-  for column in feature_columns:
-    if not isinstance(column, _DenseColumn):
-      raise ValueError(
-          'Items of feature_columns must be a _DenseColumn. '
-          'You can wrap a categorical column with an '
-          'embedding_column or indicator_column. Given: {}'.format(column))
-  weight_collections = list(weight_collections or [])
-  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
-    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
-  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
-    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
-  with variable_scope.variable_scope(
-      None, default_name='input_layer', values=features.values()):
-    builder = _LazyBuilder(features)
-    output_tensors = []
-    ordered_columns = []
-    for column in sorted(feature_columns, key=lambda x: x.name):
-      ordered_columns.append(column)
-      with variable_scope.variable_scope(
-          None, default_name=column._var_scope_name):  # pylint: disable=protected-access
-        tensor = column._get_dense_tensor(  # pylint: disable=protected-access
-            builder,
-            weight_collections=weight_collections,
-            trainable=trainable)
-        if cols_to_vars is not None:
-          # Retrieve any variables created (some _DenseColumn's don't create
-          # variables, in which case an empty list is returned).
-          cols_to_vars[column] = ops.get_collection(
-              ops.GraphKeys.GLOBAL_VARIABLES,
-              scope=variable_scope.get_variable_scope().name)
-        num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
-        batch_size = array_ops.shape(tensor)[0]
-        tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
-        output_tensors.append(tensor)
-    _verify_static_batch_size_equality(output_tensors, ordered_columns)
-    return array_ops.concat(output_tensors, 1)
+  return _internal_input_layer(features, feature_columns, weight_collections,
+                               trainable, cols_to_vars)
+
+
+# TODO(akshayka): InputLayer should be a subclass of Layer, and it
+# should implement the logic in input_layer using Layer's build-and-call
+# paradigm; input_layer should create an instance of InputLayer and
+# return the result of inovking its apply method, just as functional layers do.
+class InputLayer(object):
+  """An object-oriented version of `input_layer` that reuses variables."""
+
+  def __init__(self,
+               feature_columns,
+               weight_collections=None,
+               trainable=True,
+               cols_to_vars=None):
+    """See `input_layer`."""
+
+    self._feature_columns = feature_columns
+    self._weight_collections = weight_collections
+    self._trainable = trainable
+    self._cols_to_vars = cols_to_vars
+    self._input_layer_template = template.make_template(
+        'feature_column_input_layer',
+        _internal_input_layer,
+        create_scope_now_=True)
+    self._scope = self._input_layer_template.variable_scope
+
+  def __call__(self, features):
+    return self._input_layer_template(
+        features=features,
+        feature_columns=self._feature_columns,
+        weight_collections=self._weight_collections,
+        trainable=self._trainable,
+        cols_to_vars=None,
+        scope=self._scope)
+
+  @property
+  def non_trainable_variables(self):
+    return self._input_layer_template.non_trainable_variables
+
+  @property
+  def non_trainable_weights(self):
+    return self._input_layer_template.non_trainable_weights
+
+  @property
+  def trainable_variables(self):
+    return self._input_layer_template.trainable_variables
+
+  @property
+  def trainable_weights(self):
+    return self._input_layer_template.trainable_weights
+
+  @property
+  def variables(self):
+    return self._input_layer_template.variables
+
+  @property
+  def weights(self):
+    return self._input_layer_template.weights
 
 
 def linear_model(features,
@@ -344,13 +420,13 @@ def linear_model(features,
       with variable_scope.variable_scope(
           None, default_name=column._var_scope_name):  # pylint: disable=protected-access
         ordered_columns.append(column)
-        if isinstance(column, _CategoricalColumn):
-          weighted_sum = _create_categorical_column_weighted_sum(
-              column, builder, units, sparse_combiner, weight_collections,
-              trainable)
-        else:
-          weighted_sum = _create_dense_column_weighted_sum(
-              column, builder, units, weight_collections, trainable)
+        weighted_sum = _create_weighted_sum(
+            column=column,
+            builder=builder,
+            units=units,
+            sparse_combiner=sparse_combiner,
+            weight_collections=weight_collections,
+            trainable=trainable)
         weighted_sums.append(weighted_sum)
         if cols_to_vars is not None:
           # Retrieve the variables created.
@@ -554,6 +630,7 @@ def embedding_column(
     ValueError: if exactly one of `ckpt_to_load_from` and `tensor_name_in_ckpt`
       is specified.
     ValueError: if `initializer` is specified and is not callable.
+    RuntimeError: If eager execution is enabled.
   """
   if (dimension is None) or (dimension < 1):
     raise ValueError('Invalid dimension {}.'.format(dimension))
@@ -574,7 +651,6 @@ def embedding_column(
       dimension=dimension,
       combiner=combiner,
       initializer=initializer,
-      shared_embedding_collection_name=None,
       ckpt_to_load_from=ckpt_to_load_from,
       tensor_name_in_ckpt=tensor_name_in_ckpt,
       max_norm=max_norm,
@@ -689,18 +765,36 @@ def _shared_embedding_columns(
     raise ValueError('initializer must be callable if specified.')
   if initializer is None:
     initializer = init_ops.truncated_normal_initializer(
-        mean=0.0, stddev=1 / math.sqrt(dimension))
-  # TODO(b/67952670): Validate categorical_columns.
+        mean=0.0, stddev=1. / math.sqrt(dimension))
+
+  # Sort the columns so the default collection name is deterministic even if the
+  # user passes columns from an unsorted collection, such as dict.values().
+  sorted_columns = sorted(categorical_columns, key=lambda x: x.name)
+
+  c0 = sorted_columns[0]
+  if not isinstance(c0, _CategoricalColumn):
+    raise ValueError(
+        'All categorical_columns must be subclasses of _CategoricalColumn. '
+        'Given: {}, of type: {}'.format(c0, type(c0)))
+  if isinstance(c0, _WeightedCategoricalColumn):
+    c0 = c0.categorical_column
+  for c in sorted_columns[1:]:
+    if isinstance(c, _WeightedCategoricalColumn):
+      c = c.categorical_column
+    if not isinstance(c, type(c0)):
+      raise ValueError(
+          'To use shared_embedding_column, all categorical_columns must have '
+          'the same type, or be weighted_categorical_column of the same type. '
+          'Given column: {} of type: {} does not match given column: {} of '
+          'type: {}'.format(c0, type(c0), c, type(c)))
+
   if not shared_embedding_collection_name:
-    # Sort the columns so the name is deterministic even if the user passes
-    # columns from an unsorted collection, such as dict.values().
-    sorted_columns = sorted(categorical_columns, key=lambda x: x.name)
     shared_embedding_collection_name = '_'.join(c.name for c in sorted_columns)
     shared_embedding_collection_name += '_shared_embedding'
 
   result = []
   for column in categorical_columns:
-    result.append(_EmbeddingColumn(
+    result.append(_SharedEmbeddingColumn(
         categorical_column=column,
         dimension=dimension,
         combiner=combiner,
@@ -932,9 +1026,12 @@ def categorical_column_with_hash_bucket(key,
   return _HashedCategoricalColumn(key, hash_bucket_size, dtype)
 
 
-def categorical_column_with_vocabulary_file(
-    key, vocabulary_file, vocabulary_size, num_oov_buckets=0,
-    default_value=None, dtype=dtypes.string):
+def categorical_column_with_vocabulary_file(key,
+                                            vocabulary_file,
+                                            vocabulary_size=None,
+                                            num_oov_buckets=0,
+                                            default_value=None,
+                                            dtype=dtypes.string):
   """A `_CategoricalColumn` with a vocabulary file.
 
   Use this when your inputs are in string or integer format, and you have a
@@ -993,7 +1090,7 @@ def categorical_column_with_vocabulary_file(
     vocabulary_file: The vocabulary file name.
     vocabulary_size: Number of the elements in the vocabulary. This must be no
       greater than length of `vocabulary_file`, if less than length, later
-      values are ignored.
+      values are ignored. If None, it is set to the length of `vocabulary_file`.
     num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
       buckets. All out-of-vocabulary inputs will be assigned IDs in the range
       `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of
@@ -1008,7 +1105,7 @@ def categorical_column_with_vocabulary_file(
     A `_CategoricalColumn` with a vocabulary file.
 
   Raises:
-    ValueError: `vocabulary_file` is missing.
+    ValueError: `vocabulary_file` is missing or cannot be opened.
     ValueError: `vocabulary_size` is missing or < 1.
     ValueError: `num_oov_buckets` is a negative integer.
     ValueError: `num_oov_buckets` and `default_value` are both specified.
@@ -1016,8 +1113,19 @@ def categorical_column_with_vocabulary_file(
   """
   if not vocabulary_file:
     raise ValueError('Missing vocabulary_file in {}.'.format(key))
+
+  if vocabulary_size is None:
+    if not gfile.Exists(vocabulary_file):
+      raise ValueError('vocabulary_file in {} does not exist.'.format(key))
+
+    with gfile.GFile(vocabulary_file) as f:
+      vocabulary_size = sum(1 for _ in f)
+    logging.info(
+        'vocabulary_size = %d in %s is inferred from the number of elements '
+        'in the vocabulary_file %s.', vocabulary_size, key, vocabulary_file)
+
   # `vocabulary_size` isn't required for lookup, but it is for `_num_buckets`.
-  if (vocabulary_size is None) or (vocabulary_size < 1):
+  if vocabulary_size < 1:
     raise ValueError('Invalid vocabulary_size in {}.'.format(key))
   if num_oov_buckets:
     if default_value is not None:
@@ -1462,7 +1570,7 @@ class _FeatureColumn(object):
 
   @abc.abstractproperty
   def name(self):
-    """Returns string. Used for naming."""
+    """Returns string. Used for naming and for name_scope."""
     pass
 
   @property
@@ -1560,6 +1668,31 @@ class _DenseColumn(_FeatureColumn):
     pass
 
 
+def _create_weighted_sum(
+    column,
+    builder,
+    units,
+    sparse_combiner,
+    weight_collections,
+    trainable):
+  """Creates a weighted sum for a dense or sparse column for linear_model."""
+  if isinstance(column, _CategoricalColumn):
+    return _create_categorical_column_weighted_sum(
+        column=column,
+        builder=builder,
+        units=units,
+        sparse_combiner=sparse_combiner,
+        weight_collections=weight_collections,
+        trainable=trainable)
+  else:
+    return _create_dense_column_weighted_sum(
+        column=column,
+        builder=builder,
+        units=units,
+        weight_collections=weight_collections,
+        trainable=trainable)
+
+
 def _create_dense_column_weighted_sum(
     column, builder, units, weight_collections, trainable):
   """Create a weighted sum of a dense column for linear_model."""
@@ -1825,29 +1958,26 @@ def _to_sparse_input(input_tensor, ignore_value=None):
   if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
     return input_tensor
   with ops.name_scope(None, 'to_sparse_input', (input_tensor, ignore_value,)):
-    input_rank = input_tensor.get_shape().ndims
-    if input_rank is None:
-      # TODO(b/32318825): Implement dense_to_sparse_tensor for undefined rank.
-      raise ValueError('Undefined input_tensor shape.')
     if ignore_value is None:
-      ignore_value = '' if input_tensor.dtype == dtypes.string else -1
-    dense_shape = math_ops.cast(array_ops.shape(input_tensor), dtypes.int64)
-    indices = array_ops.where(math_ops.not_equal(
-        input_tensor, math_ops.cast(ignore_value, input_tensor.dtype)))
-    # Flattens the tensor and indices for use with gather.
-    flat_tensor = array_ops.reshape(input_tensor, [-1])
-    flat_indices = indices[:, input_rank - 1]
-    # Computes the correct flattened indices for 2d (or higher) tensors.
-    if input_rank > 1:
-      higher_dims = indices[:, :input_rank - 1]
-      shape_offsets = array_ops.stack(
-          _shape_offsets(array_ops.unstack(dense_shape)[1:]))
-      offsets = math_ops.reduce_sum(
-          math_ops.multiply(higher_dims, shape_offsets),
-          reduction_indices=[1])
-      flat_indices = math_ops.add(flat_indices, offsets)
-    values = array_ops.gather(flat_tensor, flat_indices)
-    return sparse_tensor_lib.SparseTensor(indices, values, dense_shape)
+      if input_tensor.dtype == dtypes.string:
+        # Exception due to TF strings are converted to numpy objects by default.
+        ignore_value = ''
+      elif input_tensor.dtype.is_integer:
+        ignore_value = -1  # -1 has a special meaning of missing feature
+      else:
+        # NOTE: `as_numpy_dtype` is a property, so with the parentheses this is
+        # constructing a new numpy object of the given type, which yields the
+        # default value for that type.
+        ignore_value = input_tensor.dtype.as_numpy_dtype()
+    ignore_value = math_ops.cast(
+        ignore_value, input_tensor.dtype, name='ignore_value')
+    indices = array_ops.where(
+        math_ops.not_equal(input_tensor, ignore_value), name='indices')
+    return sparse_tensor_lib.SparseTensor(
+        indices=indices,
+        values=array_ops.gather_nd(input_tensor, indices, name='values'),
+        dense_shape=array_ops.shape(
+            input_tensor, out_type=dtypes.int64, name='dense_shape'))
 
 
 def _clean_feature_columns(feature_columns):
@@ -2008,24 +2138,16 @@ class _EmbeddingColumn(
     _DenseColumn,
     collections.namedtuple('_EmbeddingColumn', (
         'categorical_column', 'dimension', 'combiner', 'initializer',
-        'shared_embedding_collection_name', 'ckpt_to_load_from',
-        'tensor_name_in_ckpt', 'max_norm', 'trainable'
+        'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable'
     ))):
   """See `embedding_column`."""
 
   @property
   def name(self):
     if not hasattr(self, '_name'):
-      if self.shared_embedding_collection_name:
-        self._name = '{}_shared_embedding'.format(self.categorical_column.name)
-      else:
-        self._name = '{}_embedding'.format(self.categorical_column.name)
+      self._name = '{}_embedding'.format(self.categorical_column.name)
     return self._name
 
-  @property
-  def _var_scope_name(self):
-    return self.shared_embedding_collection_name or self.name
-
   @property
   def _parse_example_spec(self):
     return self.categorical_column._parse_example_spec  # pylint: disable=protected-access
@@ -2047,7 +2169,75 @@ class _EmbeddingColumn(
     sparse_weights = sparse_tensors.weight_tensor
 
     embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
-    if self.shared_embedding_collection_name:
+    embedding_weights = variable_scope.get_variable(
+        name='embedding_weights',
+        shape=embedding_shape,
+        dtype=dtypes.float32,
+        initializer=self.initializer,
+        trainable=self.trainable and trainable,
+        collections=weight_collections)
+    if self.ckpt_to_load_from is not None:
+      to_restore = embedding_weights
+      if isinstance(to_restore, variables.PartitionedVariable):
+        to_restore = to_restore._get_variable_list()  # pylint: disable=protected-access
+      checkpoint_utils.init_from_checkpoint(self.ckpt_to_load_from, {
+          self.tensor_name_in_ckpt: to_restore
+      })
+
+    # Return embedding lookup result.
+    return _safe_embedding_lookup_sparse(
+        embedding_weights=embedding_weights,
+        sparse_ids=sparse_ids,
+        sparse_weights=sparse_weights,
+        combiner=self.combiner,
+        name='%s_weights' % self.name,
+        max_norm=self.max_norm)
+
+
+class _SharedEmbeddingColumn(
+    _DenseColumn,
+    collections.namedtuple('_SharedEmbeddingColumn', (
+        'categorical_column', 'dimension', 'combiner', 'initializer',
+        'shared_embedding_collection_name', 'ckpt_to_load_from',
+        'tensor_name_in_ckpt', 'max_norm', 'trainable'
+    ))):
+  """See `embedding_column`."""
+
+  @property
+  def name(self):
+    if not hasattr(self, '_name'):
+      self._name = '{}_shared_embedding'.format(self.categorical_column.name)
+    return self._name
+
+  @property
+  def _var_scope_name(self):
+    return self.shared_embedding_collection_name
+
+  @property
+  def _parse_example_spec(self):
+    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access
+
+  def _transform_feature(self, inputs):
+    return inputs.get(self.categorical_column)
+
+  @property
+  def _variable_shape(self):
+    if not hasattr(self, '_shape'):
+      self._shape = tensor_shape.vector(self.dimension)
+    return self._shape
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    # This method is called from a variable_scope with name _var_scope_name,
+    # which is shared among all shared embeddings. Open a name_scope here, so
+    # that the ops for different columns have distinct names.
+    with ops.name_scope(None, default_name=self.name):
+      # Get sparse IDs and weights.
+      sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
+          inputs, weight_collections=weight_collections, trainable=trainable)
+      sparse_ids = sparse_tensors.id_tensor
+      sparse_weights = sparse_tensors.weight_tensor
+
+      embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
       shared_embedding_collection = ops.get_collection(
           self.shared_embedding_collection_name)
       if shared_embedding_collection:
@@ -2059,7 +2249,7 @@ class _EmbeddingColumn(
               'The feature_column library already adds a variable under the '
               'hood.'.format(shared_embedding_collection))
         embedding_weights = shared_embedding_collection[0]
-        if embedding_weights.shape != embedding_shape:
+        if embedding_weights.get_shape() != embedding_shape:
           raise ValueError(
               'Shared embedding collection {} contains variable {} of '
               'unexpected shape {}. Expected shape is {}. '
@@ -2068,7 +2258,7 @@ class _EmbeddingColumn(
               'The feature_column library already adds a variable under the '
               'hood.'.format(
                   self.shared_embedding_collection_name, embedding_weights.name,
-                  embedding_weights.shape, embedding_shape))
+                  embedding_weights.get_shape(), embedding_shape))
       else:
         embedding_weights = variable_scope.get_variable(
             name='embedding_weights',
@@ -2079,30 +2269,22 @@ class _EmbeddingColumn(
             collections=weight_collections)
         ops.add_to_collection(
             self.shared_embedding_collection_name, embedding_weights)
-    else:
-      embedding_weights = variable_scope.get_variable(
-          name='embedding_weights',
-          shape=embedding_shape,
-          dtype=dtypes.float32,
-          initializer=self.initializer,
-          trainable=self.trainable and trainable,
-          collections=weight_collections)
-    if self.ckpt_to_load_from is not None:
-      to_restore = embedding_weights
-      if isinstance(to_restore, variables.PartitionedVariable):
-        to_restore = to_restore._get_variable_list()  # pylint: disable=protected-access
-      checkpoint_utils.init_from_checkpoint(self.ckpt_to_load_from, {
-          self.tensor_name_in_ckpt: to_restore
-      })
-
-    # Return embedding lookup result.
-    return _safe_embedding_lookup_sparse(
-        embedding_weights=embedding_weights,
-        sparse_ids=sparse_ids,
-        sparse_weights=sparse_weights,
-        combiner=self.combiner,
-        name='%s_weights' % self.name,
-        max_norm=self.max_norm)
+      if self.ckpt_to_load_from is not None:
+        to_restore = embedding_weights
+        if isinstance(to_restore, variables.PartitionedVariable):
+          to_restore = to_restore._get_variable_list()  # pylint: disable=protected-access
+        checkpoint_utils.init_from_checkpoint(self.ckpt_to_load_from, {
+            self.tensor_name_in_ckpt: to_restore
+        })
+
+      # Return embedding lookup result.
+      return _safe_embedding_lookup_sparse(
+          embedding_weights=embedding_weights,
+          sparse_ids=sparse_ids,
+          sparse_weights=sparse_weights,
+          combiner=self.combiner,
+          name='%s_weights' % self.name,
+          max_norm=self.max_norm)
 
 
 def _create_tuple(shape, value):
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index 9981f358b15997c537f53a9ae59e8313516996cc..2374680b968813b76d0ec115aa46c547eb9ab036 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -26,6 +26,8 @@ import numpy as np
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
 from tensorflow.python.client import session
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
 from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column as fc_lib
 from tensorflow.python.feature_column import feature_column_lib as fc
@@ -34,11 +36,13 @@ from tensorflow.python.feature_column.feature_column import _DenseColumn
 from tensorflow.python.feature_column.feature_column import _FeatureColumn
 from tensorflow.python.feature_column.feature_column import _LazyBuilder
 from tensorflow.python.feature_column.feature_column import _transform_features
+from tensorflow.python.feature_column.feature_column import InputLayer
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import parsing_ops
@@ -1646,8 +1650,9 @@ class LinearModelTest(test.TestCase):
         indices=((0,), (1,)),
         values=('sedan', 'hardtop'),
         dense_shape=(2,))
+    country_data = np.array(['US', 'CA'])
 
-    net = fc.linear_model(features, [price_buckets, body_style])
+    net = fc.linear_model(features, [price_buckets, body_style, country])
     bias = get_linear_model_bias()
     price_buckets_var = get_linear_model_column_var(price_buckets)
     body_style_var = get_linear_model_column_var(body_style)
@@ -1656,15 +1661,14 @@ class LinearModelTest(test.TestCase):
       sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
       sess.run(bias.assign([5.]))
 
-      self.assertAllClose(
-          [[10 - 1000 + 5.], [1000 - 10 + 5.]],
-          sess.run(net, feed_dict={
-              features['price']: price_data,
-              features['body-style']: body_style_data}))
-
-    # Dense categorical_column with unknown shape is not allowed.
-    with self.assertRaisesRegexp(ValueError, 'Undefined input_tensor shape.'):
-      fc.linear_model(features, [price_buckets, body_style, country])
+      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]],
+                          sess.run(
+                              net,
+                              feed_dict={
+                                  features['price']: price_data,
+                                  features['body-style']: body_style_data,
+                                  features['country']: country_data
+                              }))
 
   def test_with_rank_0_feature(self):
     price = fc.numeric_column('price')
@@ -1690,6 +1694,105 @@ class LinearModelTest(test.TestCase):
 
 class InputLayerTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
+  def test_retrieving_input(self):
+    features = {'a': [0.]}
+    input_layer = InputLayer(fc.numeric_column('a'))
+    inputs = self.evaluate(input_layer(features))
+    self.assertAllClose([[0.]], inputs)
+
+  def test_reuses_variables(self):
+    with context.eager_mode():
+      sparse_input = sparse_tensor.SparseTensor(
+          indices=((0, 0), (1, 0), (2, 0)),
+          values=(0, 1, 2),
+          dense_shape=(3, 3))
+
+      # Create feature columns (categorical and embedding).
+      categorical_column = fc.categorical_column_with_identity(key='a',
+                                                               num_buckets=3)
+      embedding_dimension = 2
+      def _embedding_column_initializer(shape, dtype, partition_info):
+        del shape  # unused
+        del dtype  # unused
+        del partition_info  # unused
+        embedding_values = (
+            (1, 0),  # id 0
+            (0, 1),  # id 1
+            (1, 1))  # id 2
+        return embedding_values
+      embedding_column = fc.embedding_column(
+          categorical_column,
+          dimension=embedding_dimension,
+          initializer=_embedding_column_initializer)
+
+      input_layer = InputLayer([embedding_column])
+      features = {'a': sparse_input}
+
+      inputs = input_layer(features)
+      variables = input_layer.variables
+
+      # Sanity check: test that the inputs are correct.
+      self.assertAllEqual([[1, 0], [0, 1], [1, 1]], inputs)
+
+      # Check that only one variable was created.
+      self.assertEqual(1, len(variables))
+
+      # Check that invoking input_layer on the same features does not create
+      # additional variables
+      _ = input_layer(features)
+      self.assertEqual(1, len(variables))
+      self.assertEqual(variables[0], input_layer.variables[0])
+
+  def test_feature_column_input_layer_gradient(self):
+    with context.eager_mode():
+      sparse_input = sparse_tensor.SparseTensor(
+          indices=((0, 0), (1, 0), (2, 0)),
+          values=(0, 1, 2),
+          dense_shape=(3, 3))
+
+      # Create feature columns (categorical and embedding).
+      categorical_column = fc.categorical_column_with_identity(key='a',
+                                                               num_buckets=3)
+      embedding_dimension = 2
+
+      def _embedding_column_initializer(shape, dtype, partition_info):
+        del shape  # unused
+        del dtype  # unused
+        del partition_info  # unused
+        embedding_values = (
+            (1, 0),  # id 0
+            (0, 1),  # id 1
+            (1, 1))  # id 2
+        return embedding_values
+
+      embedding_column = fc.embedding_column(
+          categorical_column,
+          dimension=embedding_dimension,
+          initializer=_embedding_column_initializer)
+
+      input_layer = InputLayer([embedding_column])
+      features = {'a': sparse_input}
+
+      def scale_matrix():
+        matrix = input_layer(features)
+        return 2 * matrix
+
+      # Sanity check: Verify that scale_matrix returns the correct output.
+      self.assertAllEqual([[2, 0], [0, 2], [2, 2]], scale_matrix())
+
+      # Check that the returned gradient is correct.
+      grad_function = backprop.implicit_grad(scale_matrix)
+      grads_and_vars = grad_function()
+      indexed_slice = grads_and_vars[0][0]
+      gradient = grads_and_vars[0][0].values
+
+      self.assertAllEqual([0, 1, 2], indexed_slice.indices)
+      self.assertAllEqual([[2, 2], [2, 2], [2, 2]], gradient)
+
+
+class FunctionalInputLayerTest(test.TestCase):
+
   def test_raises_if_empty_feature_columns(self):
     with self.assertRaisesRegexp(ValueError,
                                  'feature_columns must not be empty'):
@@ -2016,9 +2119,9 @@ class InputLayerTest(test.TestCase):
 
   def test_with_1d_unknown_shape_sparse_tensor(self):
     embedding_values = (
-        (1., 2., 3., 4., 5.),  # id 0
-        (6., 7., 8., 9., 10.),  # id 1
-        (11., 12., 13., 14., 15.)  # id 2
+        (1., 2.),  # id 0
+        (6., 7.),  # id 1
+        (11., 12.)  # id 2
     )
     def _initializer(shape, dtype, partition_info):
       del shape, dtype, partition_info
@@ -2035,8 +2138,8 @@ class InputLayerTest(test.TestCase):
     # embedded_body_style has 5 dims in input_layer.
     country = fc.categorical_column_with_vocabulary_list(
         'country', vocabulary_list=['US', 'JP', 'CA'])
-    embedded_country = fc.embedding_column(country, dimension=5,
-                                           initializer=_initializer)
+    embedded_country = fc.embedding_column(
+        country, dimension=2, initializer=_initializer)
 
     # Provides 1-dim tensor and dense tensor.
     features = {
@@ -2054,22 +2157,24 @@ class InputLayerTest(test.TestCase):
         indices=((0,), (1,)),
         values=('sedan', 'hardtop'),
         dense_shape=(2,))
+    country_data = np.array([['US'], ['CA']])
 
-    # Dense categorical_column with unknown shape is not allowed.
-    with self.assertRaisesRegexp(ValueError, 'Undefined input_tensor shape.'):
-      fc.input_layer(features, [price, one_hot_body_style, embedded_country])
-
-    net = fc.input_layer(features, [price, one_hot_body_style])
-    self.assertEqual(1 + 3, net.shape[1])
+    net = fc.input_layer(features,
+                         [price, one_hot_body_style, embedded_country])
+    self.assertEqual(1 + 3 + 2, net.shape[1])
     with _initialized_session() as sess:
 
       # Each row is formed by concatenating `embedded_body_style`,
       # `one_hot_body_style`, and `price` in order.
       self.assertAllEqual(
-          [[0., 0., 1., 11.], [1., 0., 0., 12.]],
-          sess.run(net, feed_dict={
-              features['price']: price_data,
-              features['body-style']: body_style_data}))
+          [[0., 0., 1., 1., 2., 11.], [1., 0., 0., 11., 12., 12.]],
+          sess.run(
+              net,
+              feed_dict={
+                  features['price']: price_data,
+                  features['body-style']: body_style_data,
+                  features['country']: country_data
+              }))
 
   def test_with_rank_0_feature(self):
     # price has 1 dimension in input_layer
@@ -2255,10 +2360,6 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         lookup_ops.tables_initializer().run()
 
   def test_invalid_vocabulary_size(self):
-    with self.assertRaisesRegexp(ValueError, 'Invalid vocabulary_size'):
-      fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file=self._wire_vocabulary_file_name,
-          vocabulary_size=None)
     with self.assertRaisesRegexp(ValueError, 'Invalid vocabulary_size'):
       fc.categorical_column_with_vocabulary_file(
           key='aaa', vocabulary_file=self._wire_vocabulary_file_name,
@@ -2372,6 +2473,24 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  def test_get_sparse_tensors_none_vocabulary_size(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa', vocabulary_file=self._wire_vocabulary_file_name)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('marlo', 'skywalker', 'omar'),
+        dense_shape=(2, 2))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(self,
+                                  sparse_tensor.SparseTensorValue(
+                                      indices=inputs.indices,
+                                      values=np.array(
+                                          (2, -1, 0), dtype=np.int64),
+                                      dense_shape=inputs.dense_shape),
+                                  id_weight_pair.id_tensor.eval())
+
   def test_transform_feature(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa',
@@ -3432,7 +3551,6 @@ class EmbeddingColumnTest(test.TestCase):
     self.assertEqual('mean', embedding_column.combiner)
     self.assertIsNotNone(embedding_column.initializer)
     self.assertIsNone(embedding_column.ckpt_to_load_from)
-    self.assertIsNone(embedding_column.shared_embedding_collection_name)
     self.assertIsNone(embedding_column.tensor_name_in_ckpt)
     self.assertIsNone(embedding_column.max_norm)
     self.assertTrue(embedding_column.trainable)
@@ -3457,7 +3575,6 @@ class EmbeddingColumnTest(test.TestCase):
     self.assertEqual(embedding_dimension, embedding_column.dimension)
     self.assertEqual('my_combiner', embedding_column.combiner)
     self.assertEqual('my_initializer', embedding_column.initializer())
-    self.assertIsNone(embedding_column.shared_embedding_collection_name)
     self.assertEqual('my_ckpt', embedding_column.ckpt_to_load_from)
     self.assertEqual('my_ckpt_tensor', embedding_column.tensor_name_in_ckpt)
     self.assertEqual(42., embedding_column.max_norm)
@@ -3489,7 +3606,6 @@ class EmbeddingColumnTest(test.TestCase):
       self.assertEqual(embedding_dimension, embedding_column.dimension)
       self.assertEqual('my_combiner', embedding_column.combiner)
       self.assertEqual('my_initializer', embedding_column.initializer())
-      self.assertIsNone(embedding_column.shared_embedding_collection_name)
       self.assertEqual('my_ckpt', embedding_column.ckpt_to_load_from)
       self.assertEqual('my_ckpt_tensor', embedding_column.tensor_name_in_ckpt)
       self.assertEqual(42., embedding_column.max_norm)
@@ -4162,6 +4278,38 @@ class SharedEmbeddingColumnTest(test.TestCase):
           [categorical_column_a, categorical_column_b], dimension=2,
           initializer='not_fn')
 
+  def test_incompatible_column_type(self):
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=3)
+    categorical_column_c = fc.categorical_column_with_hash_bucket(
+        key='ccc', hash_bucket_size=3)
+    with self.assertRaisesRegexp(
+        ValueError,
+        'all categorical_columns must have the same type.*'
+        '_IdentityCategoricalColumn.*_HashedCategoricalColumn'):
+      fc_lib._shared_embedding_columns(
+          [categorical_column_a, categorical_column_b, categorical_column_c],
+          dimension=2)
+
+  def test_weighted_categorical_column_ok(self):
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    weighted_categorical_column_a = fc.weighted_categorical_column(
+        categorical_column_a, weight_feature_key='aaa_weights')
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=3)
+    weighted_categorical_column_b = fc.weighted_categorical_column(
+        categorical_column_b, weight_feature_key='bbb_weights')
+    fc_lib._shared_embedding_columns(
+        [weighted_categorical_column_a, categorical_column_b], dimension=2)
+    fc_lib._shared_embedding_columns(
+        [categorical_column_a, weighted_categorical_column_b], dimension=2)
+    fc_lib._shared_embedding_columns(
+        [weighted_categorical_column_a, weighted_categorical_column_b],
+        dimension=2)
+
   def test_parse_example(self):
     a = fc.categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
@@ -4199,25 +4347,256 @@ class SharedEmbeddingColumnTest(test.TestCase):
               dense_shape=[1, 2]),
           features['bbb'].eval())
 
-  def test_input_layer(self):
+  def test_transform_feature(self):
+    a = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    b = fc.categorical_column_with_identity(key='bbb', num_buckets=3)
+    a_embedded, b_embedded = fc_lib._shared_embedding_columns(
+        [a, b], dimension=2)
+    features = {
+        'aaa': sparse_tensor.SparseTensor(
+            indices=((0, 0), (1, 0), (1, 1)),
+            values=(0, 1, 0),
+            dense_shape=(2, 2)),
+        'bbb': sparse_tensor.SparseTensor(
+            indices=((0, 0), (1, 0), (1, 1)),
+            values=(1, 2, 1),
+            dense_shape=(2, 2)),
+    }
+    outputs = _transform_features(features, [a, a_embedded, b, b_embedded])
+    output_a = outputs[a]
+    output_a_embedded = outputs[a_embedded]
+    output_b = outputs[b]
+    output_b_embedded = outputs[b_embedded]
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self, output_a.eval(), output_a_embedded.eval())
+      _assert_sparse_tensor_value(
+          self, output_b.eval(), output_b_embedded.eval())
+
+  def test_get_dense_tensor(self):
+    # Inputs.
+    vocabulary_size = 3
+    # -1 values are ignored.
+    input_a = np.array(
+        [[2, -1, -1],  # example 0, ids [2]
+         [0, 1, -1]])  # example 1, ids [0, 1]
+    input_b = np.array(
+        [[0, -1, -1],  # example 0, ids [0]
+         [-1, -1, -1]])  # example 1, ids []
+    input_features = {
+        'aaa': input_a,
+        'bbb': input_b
+    }
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups_a = (
+        # example 0:
+        (7., 11.),  # ids [2], embedding = [7, 11]
+        # example 1:
+        (2., 3.5),  # ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+    )
+    expected_lookups_b = (
+        # example 0:
+        (1., 2.),  # ids [0], embedding = [1, 2]
+        # example 1:
+        (0., 0.),  # ids [], embedding = [0, 0]
+    )
+
+    # Build columns.
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    embedding_column_a, embedding_column_b = fc_lib._shared_embedding_columns(
+        [categorical_column_a, categorical_column_b],
+        dimension=embedding_dimension, initializer=_initializer)
+
+    # Provide sparse input and get dense result.
+    embedding_lookup_a = embedding_column_a._get_dense_tensor(
+        _LazyBuilder(input_features))
+    embedding_lookup_b = embedding_column_b._get_dense_tensor(
+        _LazyBuilder(input_features))
+
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    embedding_var = global_vars[0]
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, embedding_var.eval())
+      self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval())
+      self.assertAllEqual(expected_lookups_b, embedding_lookup_b.eval())
+
+  def test_get_dense_tensor_placeholder_inputs(self):
+    # Inputs.
+    vocabulary_size = 3
+    # -1 values are ignored.
+    input_a = np.array(
+        [[2, -1, -1],  # example 0, ids [2]
+         [0, 1, -1]])  # example 1, ids [0, 1]
+    input_b = np.array(
+        [[0, -1, -1],  # example 0, ids [0]
+         [-1, -1, -1]])  # example 1, ids []
+    # Specify shape, because dense input must have rank specified.
+    input_a_placeholder = array_ops.placeholder(
+        dtype=dtypes.int64, shape=[None, 3])
+    input_b_placeholder = array_ops.placeholder(
+        dtype=dtypes.int64, shape=[None, 3])
+    input_features = {
+        'aaa': input_a_placeholder,
+        'bbb': input_b_placeholder,
+    }
+    feed_dict = {
+        input_a_placeholder: input_a,
+        input_b_placeholder: input_b,
+    }
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Build columns.
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    embedding_column_a, embedding_column_b = fc_lib._shared_embedding_columns(
+        [categorical_column_a, categorical_column_b],
+        dimension=embedding_dimension, initializer=_initializer)
+
+    # Provide sparse input and get dense result.
+    embedding_lookup_a = embedding_column_a._get_dense_tensor(
+        _LazyBuilder(input_features))
+    embedding_lookup_b = embedding_column_b._get_dense_tensor(
+        _LazyBuilder(input_features))
+
+    with _initialized_session() as sess:
+      sess.run([embedding_lookup_a, embedding_lookup_b], feed_dict=feed_dict)
+
+  def test_linear_model(self):
+    # Inputs.
+    batch_size = 2
+    vocabulary_size = 3
+    # -1 values are ignored.
+    input_a = np.array(
+        [[2, -1, -1],  # example 0, ids [2]
+         [0, 1, -1]])  # example 1, ids [0, 1]
+    input_b = np.array(
+        [[0, -1, -1],  # example 0, ids [0]
+         [-1, -1, -1]])  # example 1, ids []
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_shape = (vocabulary_size, embedding_dimension)
+    zeros_embedding_values = np.zeros(embedding_shape)
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual(embedding_shape, shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return zeros_embedding_values
+
+    # Build columns.
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    embedding_column_a, embedding_column_b = fc_lib._shared_embedding_columns(
+        [categorical_column_a, categorical_column_b],
+        dimension=embedding_dimension, initializer=_initializer)
+
+    with ops.Graph().as_default():
+      predictions = fc.linear_model({
+          categorical_column_a.name: input_a,
+          categorical_column_b.name: input_b,
+      }, (embedding_column_a, embedding_column_b))
+      # Linear weights do not follow the column name. But this is a rare use
+      # case, and fixing it would add too much complexity to the code.
+      expected_var_names = (
+          'linear_model/bias_weights:0',
+          'linear_model/aaa_bbb_shared_embedding/weights:0',
+          'linear_model/aaa_bbb_shared_embedding/embedding_weights:0',
+          'linear_model/aaa_bbb_shared_embedding_1/weights:0',
+      )
+      self.assertItemsEqual(
+          expected_var_names,
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+      trainable_vars = {
+          v.name: v for v in ops.get_collection(
+              ops.GraphKeys.TRAINABLE_VARIABLES)
+      }
+      self.assertItemsEqual(expected_var_names, trainable_vars.keys())
+      bias = trainable_vars['linear_model/bias_weights:0']
+      embedding_weights = trainable_vars[
+          'linear_model/aaa_bbb_shared_embedding/embedding_weights:0']
+      linear_weights_a = trainable_vars[
+          'linear_model/aaa_bbb_shared_embedding/weights:0']
+      linear_weights_b = trainable_vars[
+          'linear_model/aaa_bbb_shared_embedding_1/weights:0']
+      with _initialized_session():
+        # Predictions with all zero weights.
+        self.assertAllClose(np.zeros((1,)), bias.eval())
+        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), linear_weights_a.eval())
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), linear_weights_b.eval())
+        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+
+        # Predictions with all non-zero weights.
+        embedding_weights.assign((
+            (1., 2.),  # id 0
+            (3., 5.),  # id 1
+            (7., 11.)  # id 2
+        )).eval()
+        linear_weights_a.assign(((4.,), (6.,))).eval()
+        # example 0, ids [2], embedding[0] = [7, 11]
+        # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
+        # sum(embeddings * linear_weights)
+        # = [4*7 + 6*11, 4*2 + 6*3.5] = [94, 29]
+        linear_weights_b.assign(((3.,), (5.,))).eval()
+        # example 0, ids [0], embedding[0] = [1, 2]
+        # example 1, ids [], embedding[1] = 0, 0]
+        # sum(embeddings * linear_weights)
+        # = [3*1 + 5*2, 3*0 +5*0] = [13, 0]
+        self.assertAllClose([[94. + 13.], [29.]], predictions.eval())
+
+  def _test_input_layer(self, trainable=True):
     # Inputs.
     vocabulary_size = 3
     sparse_input_a = sparse_tensor.SparseTensorValue(
         # example 0, ids [2]
         # example 1, ids [0, 1]
-        # example 2, ids []
-        # example 3, ids [1]
-        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
-        values=(2, 0, 1, 1),
-        dense_shape=(4, 5))
+        indices=((0, 0), (1, 0), (1, 4)),
+        values=(2, 0, 1),
+        dense_shape=(2, 5))
     sparse_input_b = sparse_tensor.SparseTensorValue(
         # example 0, ids [0]
         # example 1, ids []
-        # example 2, ids []
-        # example 3, ids [1]
-        indices=((0, 0), (3, 0)),
-        values=(0, 1),
-        dense_shape=(4, 5))
+        indices=((0, 0),),
+        values=(0,),
+        dense_shape=(2, 5))
 
     # Embedding variable.
     embedding_dimension = 2
@@ -4242,14 +4621,6 @@ class SharedEmbeddingColumnTest(test.TestCase):
         # A ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
         # B ids [], embedding = [0, 0]
         (2., 3.5, 0., 0.),
-        # example 2:
-        # A ids [], embedding = [0, 0]
-        # B ids [], embedding = [0, 0]
-        (0., 0., 0., 0.),
-        # example 3:
-        # A ids [1], embedding = [3, 5]
-        # B ids [1], embedding = [3, 5]
-        (3., 5., 3., 5.),
     )
 
     # Build columns.
@@ -4259,7 +4630,8 @@ class SharedEmbeddingColumnTest(test.TestCase):
         key='bbb', num_buckets=vocabulary_size)
     embedding_column_a, embedding_column_b = fc_lib._shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
-        dimension=embedding_dimension, initializer=_initializer)
+        dimension=embedding_dimension, initializer=_initializer,
+        trainable=trainable)
 
     # Provide sparse input and get dense result.
     input_layer = fc.input_layer(
@@ -4272,17 +4644,26 @@ class SharedEmbeddingColumnTest(test.TestCase):
         ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
         tuple([v.name for v in global_vars]))
     trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
-    self.assertItemsEqual(
-        ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
-        tuple([v.name for v in trainable_vars]))
+    if trainable:
+      self.assertItemsEqual(
+          ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
+          tuple([v.name for v in trainable_vars]))
+    else:
+      self.assertItemsEqual([], tuple([v.name for v in trainable_vars]))
     shared_embedding_vars = ops.get_collection('aaa_bbb_shared_embedding')
     self.assertItemsEqual(
         ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
         tuple([v.name for v in shared_embedding_vars]))
     with _initialized_session():
-      self.assertAllEqual(embedding_values, trainable_vars[0].eval())
+      self.assertAllEqual(embedding_values, shared_embedding_vars[0].eval())
       self.assertAllEqual(expected_lookups, input_layer.eval())
 
+  def test_input_layer(self):
+    self._test_input_layer()
+
+  def test_input_layer_no_trainable(self):
+    self._test_input_layer(trainable=False)
+
 
 class WeightedCategoricalColumnTest(test.TestCase):
 
diff --git a/tensorflow/python/framework/c_api_util.py b/tensorflow/python/framework/c_api_util.py
index 1d0dd88dc5ce75504817eea9de9bf970a5660011..6c522de452b59ea9a200ccf89cfb428a26970db1 100644
--- a/tensorflow/python/framework/c_api_util.py
+++ b/tensorflow/python/framework/c_api_util.py
@@ -94,3 +94,57 @@ def tf_buffer(data=None):
     yield buf
   finally:
     c_api.TF_DeleteBuffer(buf)
+
+
+def tf_output(c_op, index):
+  """Returns a wrapped TF_Output with specified operation and index.
+
+  Args:
+    c_op: wrapped TF_Operation
+    index: integer
+
+  Returns:
+    Wrapped TF_Output
+  """
+  ret = c_api.TF_Output()
+  ret.oper = c_op
+  ret.index = index
+  return ret
+
+
+def tf_operations(graph):
+  """Generator that yields every TF_Operation in `graph`.
+
+  Args:
+    graph: Graph
+
+  Yields:
+    wrapped TF_Operation
+  """
+  # pylint: disable=protected-access
+  pos = 0
+  c_op, pos = c_api.TF_GraphNextOperation(graph._c_graph, pos)
+  while c_op is not None:
+    yield c_op
+    c_op, pos = c_api.TF_GraphNextOperation(graph._c_graph, pos)
+  # pylint: enable=protected-access
+
+
+def new_tf_operations(graph):
+  """Generator that yields newly-added TF_Operations in `graph`.
+
+  Specifically, yields TF_Operations that don't have associated Operations in
+  `graph`. This is useful for processing nodes added by the C API.
+
+  Args:
+    graph: Graph
+
+  Yields:
+    wrapped TF_Operation
+  """
+  # TODO(b/69679162): do this more efficiently
+  for c_op in tf_operations(graph):
+    try:
+      graph._get_operation_by_tf_operation(c_op)  # pylint: disable=protected-access
+    except KeyError:
+      yield c_op
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index db124ab12acdfb9724f9800f5be36b9f1d45f323..b0422eb6be091a3fcf4b213f04a2e13a3ae8a963 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -18,9 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+
 import numpy as np
 
 from tensorflow.core.framework import types_pb2
+from tensorflow.python import pywrap_tensorflow
+
+
+_np_bfloat16 = pywrap_tensorflow.TF_bfloat16_type()
 
 
 class DType(object):
@@ -146,8 +151,9 @@ class DType(object):
   @property
   def is_floating(self):
     """Returns whether this is a (non-quantized, real) floating point type."""
-    return self.is_numpy_compatible and np.issubdtype(self.as_numpy_dtype,
-                                                      np.floating)
+    return ((self.is_numpy_compatible and np.issubdtype(self.as_numpy_dtype,
+                                                        np.floating))
+            or self.base_dtype == bfloat16)
 
   @property
   def is_complex(self):
@@ -157,7 +163,7 @@ class DType(object):
   @property
   def is_quantized(self):
     """Returns whether this is a quantized data type."""
-    return self.base_dtype in [qint8, quint8, qint16, quint16, qint32, bfloat16]
+    return self.base_dtype in [qint8, quint8, qint16, quint16, qint32]
 
   @property
   def is_unsigned(self):
@@ -194,6 +200,8 @@ class DType(object):
       try:
         return np.iinfo(self.as_numpy_dtype()).min
       except:
+        if self.base_dtype == bfloat16:
+          return _np_bfloat16(float.fromhex("-0x1.FEp127"))
         raise TypeError("Cannot find minimum value of %s." % self)
 
   @property
@@ -216,6 +224,8 @@ class DType(object):
       try:
         return np.iinfo(self.as_numpy_dtype()).max
       except:
+        if self.base_dtype == bfloat16:
+          return _np_bfloat16(float.fromhex("0x1.FEp127"))
         raise TypeError("Cannot find maximum value of %s." % self)
 
   @property
@@ -486,6 +496,8 @@ _np_qint16 = np.dtype([("qint16", np.int16, 1)])
 _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
 _np_qint32 = np.dtype([("qint32", np.int32, 1)])
 
+# _np_bfloat16 is defined by a module import.
+
 # Custom struct dtype for directly-fed ResourceHandles of supported type(s).
 np_resource = np.dtype([("resource", np.ubyte, 1)])
 
@@ -511,7 +523,7 @@ _NP_TO_TF = frozenset([
     (_np_qint16, qint16),
     (_np_quint16, quint16),
     (_np_qint32, qint32),
-    # NOTE(touts): Intentionally no way to feed a DT_BFLOAT16.
+    (_np_bfloat16, bfloat16),
 ])
 _TF_TO_NP = {
     types_pb2.DT_HALF: np.float16,
@@ -536,7 +548,7 @@ _TF_TO_NP = {
     types_pb2.DT_QINT16: _np_qint16,
     types_pb2.DT_QUINT16: _np_quint16,
     types_pb2.DT_QINT32: _np_qint32,
-    types_pb2.DT_BFLOAT16: np.uint16,
+    types_pb2.DT_BFLOAT16: _np_bfloat16,
 
     # Ref types
     types_pb2.DT_HALF_REF: np.float16,
@@ -559,7 +571,7 @@ _TF_TO_NP = {
     types_pb2.DT_QINT16_REF: _np_qint16,
     types_pb2.DT_QUINT16_REF: _np_quint16,
     types_pb2.DT_QINT32_REF: _np_qint32,
-    types_pb2.DT_BFLOAT16_REF: np.uint16,
+    types_pb2.DT_BFLOAT16_REF: _np_bfloat16,
 }
 
 
diff --git a/tensorflow/python/framework/dtypes_test.py b/tensorflow/python/framework/dtypes_test.py
index 67842e14b1077fdf69aa3405f4f43fc92e499b4d..e49e2fda5d84da4f8f87fae73874351afe0a20f2 100644
--- a/tensorflow/python/framework/dtypes_test.py
+++ b/tensorflow/python/framework/dtypes_test.py
@@ -176,7 +176,7 @@ class TypesTest(test_util.TensorFlowTestCase):
     self.assertEqual(dtypes.as_dtype("float64").is_floating, True)
     self.assertEqual(dtypes.as_dtype("string").is_floating, False)
     self.assertEqual(dtypes.as_dtype("bool").is_floating, False)
-    self.assertEqual(dtypes.as_dtype("bfloat16").is_integer, False)
+    self.assertEqual(dtypes.as_dtype("bfloat16").is_floating, True)
     self.assertEqual(dtypes.as_dtype("qint8").is_floating, False)
     self.assertEqual(dtypes.as_dtype("qint16").is_floating, False)
     self.assertEqual(dtypes.as_dtype("qint32").is_floating, False)
@@ -276,6 +276,9 @@ class TypesTest(test_util.TensorFlowTestCase):
       if numpy_dtype in (np.float16, np.float32, np.float64):
         self.assertEquals(dtype.min, np.finfo(numpy_dtype).min)
         self.assertEquals(dtype.max, np.finfo(numpy_dtype).max)
+      if numpy_dtype == dtypes.bfloat16.as_numpy_dtype:
+        self.assertEquals(dtype.min, float.fromhex("-0x1.FEp127"))
+        self.assertEquals(dtype.max, float.fromhex("0x1.FEp127"))
 
   def testRepr(self):
     for enum, name in dtypes._TYPE_TO_STRING.items():
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index 29cf2237244810a888d53927f44889b4a4e9704e..e06899f81d08c06ece206ab0f49c817f8c570bde 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -82,8 +82,8 @@ class Defun(object):
     return x + y, x - y
 
   # Building the graph.
-  a = tf.Constant([1.0])
-  b = tf.Constant([2.0])
+  a = tf.constant([1.0])
+  b = tf.constant([2.0])
   c, d = MyFunc(a, b, name='mycall')
   ```
   """
@@ -692,7 +692,10 @@ class _FuncGraph(ops.Graph):
         else:
           # Substitute with a placeholder.
           self.extra_inputs.append(x)
-          ph = array_ops.placeholder(x.dtype, shape=x.get_shape())
+          # Hoist the new input placeholder out of any control flow context
+          # we're currently in.
+          with ops.control_dependencies(None):
+            ph = array_ops.placeholder(x.dtype, shape=x.get_shape())
           # pylint: disable=protected-access
           ph._handle_data = x._handle_data
           # pylint: enable=protected-access
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index ba43e9199b4764fef4b86056a1ae57bd9070003e..f5a97eb197120410fafdd2d1ae98249562a94196 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -724,6 +724,38 @@ class FunctionTest(test.TestCase):
         # NOTE: We still do not support capturing control deps.
         _ = Foo(x)
 
+  def testCaptureInWhileLoop(self):
+    g = ops.Graph()
+    with g.as_default():
+      x = constant_op.constant(1)
+
+      @function.Defun()
+      def Foo():
+        return control_flow_ops.while_loop(lambda i: i < 10,
+                                           lambda i: i + x,
+                                           [0])
+      y = Foo()
+
+    with self.test_session(graph=g) as sess:
+      self.assertEqual(sess.run(y), 10)
+
+  def testCaptureInCond(self):
+    g = ops.Graph()
+    with g.as_default():
+      x = constant_op.constant(1)
+
+      @function.Defun(dtypes.bool)
+      def Foo(pred):
+        return control_flow_ops.cond(pred,
+                                     lambda: x,
+                                     lambda: x + 1)
+      y = Foo(True)
+      z = Foo(False)
+
+    with self.test_session(graph=g) as sess:
+      self.assertEqual(sess.run(y), 1)
+      self.assertEqual(sess.run(z), 2)
+
   def testStableName(self):
 
     @function.Defun()
@@ -882,6 +914,75 @@ class FunctionTest(test.TestCase):
           np.array([1.0, 0.0]).astype(np.float32),
           sess.run(dinp, {inp: x}))
 
+  def testFunctionMarkedStateful(self):
+
+    @function.Defun(dtypes.int32, dtypes.float32)
+    def Foo(t, x):
+      return x[t]
+
+    @function.Defun(dtypes.int64)
+    def Bar(x):
+      return x
+
+    # NOTE(mrry): All functions are currently considered stateless by the
+    # runtime, so we simulate a "stateful" function.
+    # TODO(b/70565970): Remove this hack when we are able to build stateful
+    # functions using the API.
+    # pylint: disable=protected-access
+    Foo._signature.is_stateful = True
+    Bar._signature.is_stateful = True
+    # pylint: enable=protected-access
+
+    result_1 = Foo(3, [1.0, 2.0, 3.0, 4.0])
+    result_2 = Bar(constant_op.constant(100, dtype=dtypes.int64))
+
+    with session.Session() as sess:
+      self.assertEqual(4.0, sess.run(result_1))
+      self.assertEqual(100, sess.run(result_2))
+      self.assertEqual((4.0, 100), sess.run((result_1, result_2)))
+
+  def testStatefulFunction(self):
+
+    @function.Defun()
+    def FunctionWithStatelessOp():
+      return constant_op.constant(42.0)
+
+    @function.Defun()
+    def FunctionWithStatefulOp():
+      return random_ops.random_uniform([100], maxval=10, dtype=dtypes.int32)
+
+    @function.Defun()
+    def FunctionWithStatelessFunctionCall():
+      return FunctionWithStatelessOp()
+
+    @function.Defun()
+    def FunctionWithStatefulFunctionCall():
+      return FunctionWithStatefulOp()
+
+    # Test that the `is_stateful` bit is propagated.
+    self.assertFalse(FunctionWithStatelessOp.definition.signature.is_stateful)
+    self.assertTrue(FunctionWithStatefulOp.definition.signature.is_stateful)
+    self.assertFalse(
+        FunctionWithStatelessFunctionCall.definition.signature.is_stateful)
+    self.assertTrue(
+        FunctionWithStatefulFunctionCall.definition.signature.is_stateful)
+
+    # Ensure that two invocations of the same random-number-generating
+    # function produce different results.
+    result1 = FunctionWithStatefulFunctionCall()
+    result2 = FunctionWithStatefulFunctionCall()
+
+    # Statefulness affects how the function is treated by the various
+    # optimization passes, so run the test in each optimizer
+    # configuration.
+    for config in _OptimizerOptions():
+      with session.Session(config=config) as sess:
+        val1, val2 = sess.run((result1, result2))
+        self.assertFalse(all(val1 == val2))
+        val3, val4 = sess.run((result1, result2))
+        self.assertFalse(all(val3 == val1))
+        self.assertFalse(all(val4 == val2))
+
 
 @test_util.with_c_api
 class FunctionsFromProtos(test.TestCase):
diff --git a/tensorflow/python/framework/graph_to_function_def.py b/tensorflow/python/framework/graph_to_function_def.py
index 448f87aa6ee31127113ed10aee8e4e0fa06482f1..625f31146be89f09481b634127484d15f0631fc6 100644
--- a/tensorflow/python/framework/graph_to_function_def.py
+++ b/tensorflow/python/framework/graph_to_function_def.py
@@ -110,6 +110,13 @@ def _add_op_node(op, func, input_dict):
                                                (node_def.input[i],
                                                 input_dict.items()))
       node_def.input[i] = input_dict[node_def.input[i]]
+  # The function is stateful if any of its operations are stateful.
+  # NOTE(mrry): The "Const" node typically does not have an `OpDef` associated
+  # with it, so we assume any nodes without an `OpDef` are stateless.
+  # TODO(skyewm): Remove the `is not None` test after we transition to the C
+  # API.
+  if op.op_def is not None and op.op_def.is_stateful:
+    func.signature.is_stateful = True
 
 
 def graph_to_function_def(graph, operations, inputs, outputs, out_names=None):
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index e4b94e1a344826ec8329e5de2da39de17a700e6c..33c966ad88ea9e27ef50d048672bc30d40a87ede 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -179,12 +179,11 @@ def _ProcessInputMapParam(input_map):
 
 def _ProcessReturnElementsParam(return_elements):
   """Type-checks and possibly canonicalizes `return_elements`."""
-  if return_elements is not None:
-    return_elements = tuple(return_elements)
-    if not all(isinstance(x, compat.bytes_or_text_types)
-               for x in return_elements):
-      raise TypeError('return_elements must be a list of strings.')
-  return return_elements
+  if return_elements is None: return None
+  if not all(isinstance(x, compat.bytes_or_text_types)
+             for x in return_elements):
+    raise TypeError('return_elements must be a list of strings.')
+  return tuple(compat.as_str(x) for x in return_elements)
 
 
 def _FindAttrInOpDef(attr_name, op_def):
@@ -194,6 +193,182 @@ def _FindAttrInOpDef(attr_name, op_def):
   return None
 
 
+def _RemoveDefaultAttrs(op_dict, producer_op_list, graph_def):
+  """Removes unknown default attrs according to `producer_op_list`.
+
+  Removes any unknown attrs in `graph_def` (i.e. attrs that do not appear in
+  the OpDefs in `op_dict`) that have a default value in `producer_op_list`.
+
+  Args:
+    op_dict: dict mapping operation name to OpDef.
+    producer_op_list: OpList proto.
+    graph_def: GraphDef proto
+  """
+  producer_op_dict = {op.name: op for op in producer_op_list.op}
+  for node in graph_def.node:
+    # Remove any default attr values that aren't in op_def.
+    if node.op in producer_op_dict:
+      op_def = op_dict[node.op]
+      producer_op_def = producer_op_dict[node.op]
+      # We make a copy of node.attr to iterate through since we may modify
+      # node.attr inside the loop.
+      for key in list(node.attr):
+        if _FindAttrInOpDef(key, op_def) is None:
+          # No attr_def in consumer, look in producer.
+          attr_def = _FindAttrInOpDef(key, producer_op_def)
+          if (attr_def and attr_def.HasField('default_value') and
+              node.attr[key] == attr_def.default_value):
+            # Unknown attr had default value in producer, delete it so it can be
+            # understood by consumer.
+            del node.attr[key]
+
+
+def _ConvertInputMapValues(name, input_map):
+  """Ensures all input map values are tensors.
+
+  This should be called from inside the import name scope.
+
+  Args:
+    name: the `name` argument passed to import_graph_def
+    input_map: the `input_map` argument passed to import_graph_def.
+
+  Returns:
+    An possibly-updated version of `input_map`.
+
+  Raises:
+    ValueError: if input map values cannot be converted due to empty name scope.
+  """
+  if not all(isinstance(v, ops.Tensor) for v in input_map.values()):
+    if name == '':  # pylint: disable=g-explicit-bool-comparison
+      raise ValueError(
+          'tf.import_graph_def() requires a non-empty `name` if `input_map` '
+          'contains non-Tensor values. Try calling tf.convert_to_tensor() on '
+          '`input_map` values before calling tf.import_graph_def().')
+    with ops.name_scope('_inputs'):
+      input_map = {k: ops.convert_to_tensor(v) for k, v in input_map.items()}
+  return input_map
+
+
+def _PopulateTFImportGraphDefOptions(options, prefix, input_map,
+                                     return_elements):
+  """Populates the TF_ImportGraphDefOptions `options`."""
+  c_api.TF_ImportGraphDefOptionsSetPrefix(options, prefix)
+  c_api.TF_ImportGraphDefOptionsSetUniquifyNames(options, True)
+  c_api.TF_ImportGraphDefOptionsSetUniquifyPrefix(options, True)
+
+  for input_src, input_dst in input_map.items():
+    input_src = compat.as_str(input_src)
+    if input_src.startswith('^'):
+      src_name = compat.as_bytes(input_src[1:])
+      dst_op = input_dst._as_tf_output().oper  # pylint: disable=protected-access
+      c_api.TF_ImportGraphDefOptionsRemapControlDependency(options, src_name,
+                                                           dst_op)
+    else:
+      src_name, src_idx = _ParseTensorName(input_src)
+      src_name = compat.as_str(src_name)
+      dst_output = input_dst._as_tf_output()  # pylint: disable=protected-access
+      c_api.TF_ImportGraphDefOptionsAddInputMapping(options, src_name,
+                                                    src_idx, dst_output)
+  for name in return_elements or []:
+    if ':' in name:
+      op_name, index = _ParseTensorName(name)
+      op_name = compat.as_str(op_name)
+      c_api.TF_ImportGraphDefOptionsAddReturnOutput(options, op_name, index)
+    else:
+      c_api.TF_ImportGraphDefOptionsAddReturnOperation(options,
+                                                       compat.as_str(name))
+
+
+def _ProcessNewOps(graph):
+  """Processes the newly-added TF_Operations in `graph`."""
+  # Maps from a node to the names of the ops it's colocated with, if colocation
+  # is specified in the attributes.
+  colocation_pairs = {}
+
+  for new_op in graph._add_new_tf_operations(compute_devices=False):  # pylint: disable=protected-access
+    colocation_names = _GetColocationNames(new_op)
+    if colocation_names:
+      colocation_pairs[new_op] = colocation_names
+      # Don't apply this op's device function, since colocation constraints
+      # override device functions. Note that this op's device may still be set
+      # by the loop below.
+    else:
+      with _MaybeDevice(new_op.device):
+        graph._apply_device_functions(new_op)  # pylint: disable=protected-access
+
+  # The following loop populates the device field of ops that are colocated
+  # with another op.  This is implied by the colocation attribute, but we
+  # propagate the device field for completeness.
+  for op, coloc_op_list in colocation_pairs.items():
+    coloc_device = None
+    # Find any device in the list of colocated ops that have a device, if it
+    # exists.  We assume that if multiple ops have devices, they refer to the
+    # same device.  Otherwise, a runtime error will occur since the colocation
+    # property cannot be guaranteed.
+    #
+    # One possible improvement is to try to check for compatibility of all
+    # devices in this list at import time here, which would require
+    # implementing a compatibility function for device specs in python.
+    for coloc_op_name in coloc_op_list:
+      try:
+        coloc_op = graph._get_operation_by_name_unsafe(coloc_op_name)  # pylint: disable=protected-access
+      except KeyError:
+        raise ValueError('Specified colocation to an op that '
+                         'does not exist during import: %s in %s' % (
+                             coloc_op_name, op.name))
+      if coloc_op.device:
+        coloc_device = pydev.DeviceSpec.from_string(coloc_op.device)
+        break
+    if coloc_device:
+      op._set_device(coloc_device)  # pylint: disable=protected-access
+
+
+def _GetColocationNames(op):
+  """Returns names of the ops that `op` should be colocated with."""
+  colocation_names = []
+  try:
+    class_values = op.get_attr('_class')
+  except ValueError:
+    # No _class attr
+    return
+  for val in class_values:
+    val = compat.as_str(val)
+    if val.startswith('loc:@'):
+      colocation_node_name = val[len('loc:@'):]
+      if colocation_node_name != op.name:
+        colocation_names.append(colocation_node_name)
+  return colocation_names
+
+
+def _GatherReturnElements(requested_return_elements, graph, results):
+  """Returns the requested return elements from results.
+
+  Args:
+    requested_return_elements: list of strings of operation and tensor names
+    graph: Graph
+    results: wrapped TF_ImportGraphDefResults
+
+  Returns:
+    list of `Operation` and/or `Tensor` objects
+  """
+  return_outputs = c_api.TF_ImportGraphDefResultsReturnOutputs(results)
+  return_opers = c_api.TF_ImportGraphDefResultsReturnOperations(results)
+
+  combined_return_elements = []
+  outputs_idx = 0
+  opers_idx = 0
+  for name in requested_return_elements:
+    if ':' in name:
+      combined_return_elements.append(
+          graph._get_tensor_by_tf_output(return_outputs[outputs_idx]))  # pylint: disable=protected-access
+      outputs_idx += 1
+    else:
+      combined_return_elements.append(
+          graph._get_operation_by_tf_operation(return_opers[opers_idx]))  # pylint: disable=protected-access
+      opers_idx += 1
+  return combined_return_elements
+
+
 @deprecated_args(None, 'Please file an issue at '
                  'https://github.com/tensorflow/tensorflow/issues if you depend'
                  ' on this feature.',
@@ -247,24 +422,72 @@ def import_graph_def(graph_def, input_map=None, return_elements=None,
 
   op_dict = op_def_registry.get_registered_ops()
 
-  if producer_op_list is None:
-    producer_op_dict = None
-  else:
-    producer_op_dict = {op.name: op for op in producer_op_list.op}
+  if producer_op_list is not None:
+    # TODO(skyewm): make a copy of graph_def so we're not mutating the argument?
+    _RemoveDefaultAttrs(op_dict, producer_op_list, graph_def)
 
   graph = ops.get_default_graph()
 
   if graph._c_graph:  # pylint: disable=protected-access
-    scoped_options = c_api_util.ScopedTFImportGraphDefOptions()
+    with ops.name_scope(name, 'import', input_map.values()) as scope:
+      # Save unique prefix generated by name_scope
+      if scope:
+        assert scope.endswith('/')
+        prefix = scope[:-1]
+      else:
+        prefix = ''
 
-    with errors.raise_exception_on_not_ok_status() as status:
-      with c_api_util.tf_buffer(graph_def.SerializeToString()) as serialized:
-        c_api.TF_GraphImportGraphDefWithResults(
-            graph._c_graph, serialized, scoped_options.options, status)  # pylint: disable=protected-access
+      # Generate any input map tensors inside name scope
+      input_map = _ConvertInputMapValues(name, input_map)
 
-    if return_elements is not None:
-      raise ValueError('return_elements not yet implemented with C API')
-    return None
+    scoped_options = c_api_util.ScopedTFImportGraphDefOptions()
+    options = scoped_options.options
+    _PopulateTFImportGraphDefOptions(options, prefix, input_map,
+                                     return_elements)
+
+    with c_api_util.tf_buffer(graph_def.SerializeToString()) as serialized:
+      try:
+        with errors.raise_exception_on_not_ok_status() as status:
+          results = c_api.TF_GraphImportGraphDefWithResults(
+              graph._c_graph, serialized, options, status)  # pylint: disable=protected-access
+      except errors.InvalidArgumentError as e:
+        # Convert to ValueError for backwards compatibility.
+        raise ValueError(str(e))
+
+    _ProcessNewOps(graph)
+
+    # Create _DefinedFunctions for any imported functions.
+    #
+    # We do this by creating _DefinedFunctions directly from `graph_def`, and
+    # adding them to `graph`. Adding an existing function to a TF_Graph is a
+    # no-op, so this only has the effect of updating the Python state (usually
+    # _DefinedFunction.add_to_graph also adds the function to the TF_Graph).
+    #
+    # TODO(skyewm): fetch the TF_Functions directly from the TF_Graph
+    # TODO(skyewm): avoid sending serialized FunctionDefs back to the TF_Graph
+    if graph_def.library and graph_def.library.function:
+      # pylint: disable=protected-access
+      functions = function._from_library(graph_def.library)
+      for f in functions:
+        f.add_to_graph(graph)
+      # pylint: enable=protected-access
+
+    # Treat input mappings that don't appear in the graph as an error, because
+    # they are likely to be due to a typo.
+    missing_unused_input_keys = (
+        c_api.TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper(
+            results))
+    if missing_unused_input_keys:
+      missing_unused_input_keys = [compat.as_str(s)
+                                   for s in missing_unused_input_keys]
+      raise ValueError(
+          'Attempted to map inputs that were not found in graph_def: [%s]'
+          % ', '.join(missing_unused_input_keys))
+
+    if return_elements is None:
+      return None
+    else:
+      return _GatherReturnElements(return_elements, graph, results)
 
   else:
     g = graph
@@ -297,16 +520,7 @@ def import_graph_def(graph_def, input_map=None, return_elements=None,
       # more nuanced.
       g.graph_def_versions.CopyFrom(graph_def.versions)
 
-      if not all(isinstance(v, ops.Tensor) for v in input_map.values()):
-        if not scope:
-          # The caller must have passed `name=''`.
-          raise ValueError(
-              'tf.import_graph_def() requires a non-empty `name` if `input_map`'
-              ' contains non-Tensor values. Try calling tf.convert_to_tensor() '
-              'on `input_map` values before calling tf.import_graph_def().')
-        with ops.name_scope('_inputs'):
-          input_map = {k: ops.convert_to_tensor(v)
-                       for k, v in input_map.items()}
+      input_map = _ConvertInputMapValues(name, input_map)
 
       # NOTE(mrry): We do this in two passes, because there may be a cycle in
       # `graph_def`.
@@ -326,21 +540,6 @@ def import_graph_def(graph_def, input_map=None, return_elements=None,
             value = node.attr[key]
             if value is None or value.WhichOneof('value') is None:
               node.attr[key].CopyFrom(attr_def.default_value)
-        if producer_op_dict:
-          # Remove any default attr values that aren't in op_def.
-          if node.op in producer_op_dict:
-            producer_op_def = producer_op_dict[node.op]
-            # We make a copy of node.attr to iterate through since we
-            # may modify node.attr inside the loop.
-            for key in list(node.attr):
-              if _FindAttrInOpDef(key, op_def) is None:
-                # No attr_def in consumer, look in producer.
-                attr_def = _FindAttrInOpDef(key, producer_op_def)
-                if (attr_def and attr_def.HasField('default_value') and
-                    node.attr[key] == attr_def.default_value):
-                  # Unknown attr had default value in producer, delete it
-                  # so it can be understood by consumer.
-                  del node.attr[key]
 
         output_types = _OutputTypes(node, op_dict)
         name_to_op[node.name] = g.create_op(
diff --git a/tensorflow/python/framework/importer_test.py b/tensorflow/python/framework/importer_test.py
index d27ec1e30ca88ca9853c4e1f143396c0e332b07f..c57b7d47b8ff61cd16300ae8b9444d6c69e496c8 100644
--- a/tensorflow/python/framework/importer_test.py
+++ b/tensorflow/python/framework/importer_test.py
@@ -34,6 +34,7 @@ from tensorflow.python.framework import test_ops  # pylint: disable=unused-impor
 from tensorflow.python.framework import test_util
 from tensorflow.python.framework import versions
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
@@ -43,6 +44,7 @@ import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 
 
+@test_util.with_c_api
 class ImportGraphDefTest(test.TestCase):
 
   def _MakeGraphDef(self,
@@ -56,28 +58,6 @@ class ImportGraphDefTest(test.TestCase):
     text_format.Merge(text, ret)
     return ret
 
-  # The C API doesn't currently support return elements (or anything else beyond
-  # the most basic import). This test only checks that the import can run
-  # without error, and will be removed once more functionality is implemented
-  # and we can get coverage from the other tests.
-  @test_util.enable_c_api
-  def testCApi(self):
-    importer.import_graph_def(
-        self._MakeGraphDef("""
-        node { name: 'A' op: 'IntOutputFloatOutput' }
-          node { name: 'B' op: 'ListOutput'
-                 attr { key: 'T'
-                        value { list { type: DT_INT32 type: DT_FLOAT } } } }
-          node { name: 'C' op: 'ListInput'
-                 attr { key: 'N' value { i: 2 } }
-                 attr { key: 'T' value { type: DT_INT32 } }
-                 input: 'A:0' input: 'B:0' }
-          node { name: 'D' op: 'ListInput'
-                 attr { key: 'N' value { i: 2 } }
-                 attr { key: 'T' value { type: DT_FLOAT } }
-                 input: 'A:1' input: 'B:1' }
-          """))
-
   def testBasic(self):
     with ops.Graph().as_default():
       a, b, c, d = importer.import_graph_def(
@@ -175,16 +155,16 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual(list(b3.inputs), [a3.outputs[0]])
 
       # Import with existing de-duped node names
-      a4, b4 = importer.import_graph_def(
+      a1_1, b1_1 = importer.import_graph_def(
           self._MakeGraphDef("""
           node { name: 'A_1' op: 'IntOutput' }
           node { name: 'B_1' op: 'IntInput' input: 'A_1:0' }
           """),
           return_elements=["A_1", "B_1"],
           name="")
-      self.assertEqual(a4.name, "A_1_1")
-      self.assertEqual(b4.name, "B_1_1")
-      self.assertEqual(list(b4.inputs), [a4.outputs[0]])
+      self.assertEqual(a1_1.name, "A_1_1")
+      self.assertEqual(b1_1.name, "B_1_1")
+      self.assertEqual(list(b1_1.inputs), [a1_1.outputs[0]])
 
       # Create a name scope and then import node with same name
       with ops.name_scope("foo"):
@@ -357,31 +337,37 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual(d._input_dtypes, [dtypes.int32_ref, dtypes.int32])
       self.assertEqual(d.outputs, [])
 
-  def testCyclic(self):
-    with ops.Graph().as_default():
-      a, b = importer.import_graph_def(
-          self._MakeGraphDef("""
-          node { name: 'A' op: 'Unary'
-                 attr { key: 'T' value { type: DT_INT32 } } input: 'B:0' }
-          node { name: 'B' op: 'Unary'
-                 attr { key: 'T' value { type: DT_INT32 } } input: 'A:0' }
-          """),
-          return_elements=["A", "B"])
+  def testWhileLoop(self):
+    # Produce GraphDef containing while loop.
+    graph = ops.Graph()
+    with graph.as_default():
+      r = control_flow_ops.while_loop(lambda i: i < 10, lambda i: i + 1, [0])
+    graph_def = graph.as_graph_def()
 
-      self.assertEqual(a.inputs[0], b.outputs[0])
-      self.assertEqual(b.inputs[0], a.outputs[0])
+    # Import the GraphDef and make sure it runs.
+    with ops.Graph().as_default():
+      imported_r, = importer.import_graph_def(graph_def,
+                                              return_elements=[r.name])
+      self.assertEqual(imported_r.name, "import/" + r.name)
+      with self.test_session() as sess:
+        self.assertEqual(sess.run(imported_r), 10)
 
   def testTypeMismatchInGraphDef(self):
+    if ops._USE_C_API:
+      # TODO(skyewm): improve error message
+      error_msg = ("Input 0 of node import/B was passed int32 from import/A:0 "
+                   "incompatible with expected float.")
+    else:
+      error_msg = ("Cannot convert a tensor of type int32 to an input of type "
+                   "float")
+
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             node { name: 'B' op: 'FloatInput' input: 'A:0' }
             """))
-      self.assertTrue(
-          "Cannot convert a tensor of type int32 to an input of type float" in
-          str(e.exception))
 
   def testShapeWhitelist(self):
     # Barrier's shape is an output vector of 2, but the
@@ -391,7 +377,9 @@ class ImportGraphDefTest(test.TestCase):
           self._MakeGraphDef("""
           node { name: 'A' op: 'Barrier'
                  attr { key: '_output_shapes'
-                        value { list { shape { } } } } }
+                        value { list { shape { } } } }
+                 attr { key: 'component_types'
+                        value { list { type: DT_FLOAT } } } }
           """),
           return_elements=["A"],
           name="import")
@@ -416,35 +404,49 @@ class ImportGraphDefTest(test.TestCase):
             "Shapes () and (43,) are not compatible" in str(e.exception))
 
   def testInvalidSignatureTooManyInputsInGraphDef(self):
+    if ops._USE_C_API:
+      # TODO(skyewm): improve error message
+      error_msg = "NodeDef expected inputs '' do not match 1 inputs specified"
+    else:
+      error_msg = r"More inputs specified \('A:0'\) than the op expects"
+
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             node { name: 'B' op: 'None' input: 'A:0' }
             """))
-      self.assertTrue("More inputs specified ('A:0') than the op expects" in
-                      str(e.exception))
 
   def testInvalidSignatureNotEnoughInputsInGraphDef(self):
+    if ops._USE_C_API:
+      # TODO(skyewm): improve error message
+      error_msg = ("NodeDef expected inputs 'int32, float' do not match 1 "
+                   "inputs specified")
+    else:
+      error_msg = (r"Input types mismatch \(expected 'int32, float32' but "
+                   r"got 'int32'\)")
+
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             node { name: 'B' op: 'IntInputFloatInput' input: 'A:0' }
             """))
-      self.assertTrue("Input types mismatch (expected 'int32, float32' but "
-                      "got 'int32')" in str(e.exception))
 
   def testMissingInputOpInGraphDef(self):
+    if ops._USE_C_API:
+      error_msg = "Node 'B': Unknown input node 'A:0'"
+    else:
+      error_msg = "Input tensor 'A:0' not found"
+
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'B' op: 'FloatInput' input: 'A:0' }
             """))
-      self.assertTrue("Input tensor 'A:0' not found" in str(e.exception))
 
   def testMissingInputOpInGraphDefButAppearsInInputMap(self):
     with ops.Graph().as_default():
@@ -458,93 +460,122 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual(b.inputs[0], feed_a_0)
 
   def testMissingInputTensorInGraphDef(self):
+    if ops._USE_C_API:
+      error_msg = ("Node 'B': Connecting to invalid output 1 of source node A "
+                   "which has 1 outputs")
+    else:
+      error_msg = "Input tensor 'A:1' not found"
+
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'FloatOutput' }
             node { name: 'B' op: 'FloatInput' input: 'A:1' }
             """))
-      self.assertTrue("Input tensor 'A:1' not found" in str(e.exception))
 
   def testMissingControlInputInGraphDef(self):
+    if ops._USE_C_API:
+      error_msg = r"Node 'B': Unknown input node '\^A'"
+    else:
+      error_msg = r"Control input '\^A' not found"
+
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'B' op: 'None' input: '^A' }
             """))
-      self.assertTrue("Control input '^A' not found" in str(e.exception))
 
   def testInvalidTensorNameOutputIndexInGraphDef(self):
+    if ops._USE_C_API:
+      error_msg = "Node 'B': Unknown input node 'A:B'"
+    else:
+      error_msg = "Cannot convert 'A:B' to a tensor name."
+
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'B' op: 'None' input: 'A:B' }
             """))
-      self.assertEqual("Cannot convert 'A:B' to a tensor name.",
-                       str(e.exception))
 
   def testInvalidTensorNameInGraphDef(self):
+    if ops._USE_C_API:
+      error_msg = "Node 'B': Unknown input node 'A:B:0'"
+    else:
+      error_msg = "Cannot convert 'A:B:0' to a tensor name."
+
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'B' op: 'None' input: 'A:B:0' }
             """))
-      self.assertEqual("Cannot convert 'A:B:0' to a tensor name.",
-                       str(e.exception))
 
   def testMissingReturnOperation(self):
+    if ops._USE_C_API:
+      error_msg = "Requested return node 'B' not found in graph def"
+    else:
+      error_msg = "return_element 'B' not found in graph_def."
+
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'None' }
             """),
             return_elements=["B"])
-      self.assertTrue(
-          "return_element 'B' not found in graph_def." in str(e.exception))
 
   def testMissingReturnTensor(self):
+    if ops._USE_C_API:
+      error_msg = (r"Invalid return output 1 of node 'A', which has 1 "
+                   r"output\(s\)")
+    else:
+      error_msg = "return_element 'A:1' not found in graph_def."
+
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             """),
             return_elements=["A:1"])
-      self.assertTrue(
-          "return_element 'A:1' not found in graph_def." in str(e.exception))
 
-      with self.assertRaises(ValueError) as e:
+      if ops._USE_C_API:
+        error_msg = "Requested return tensor 'B:0' not found in graph def"
+      else:
+        error_msg = "return_element 'B:0' not found in graph_def."
+
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             """),
             return_elements=["B:0"])
-      self.assertTrue(
-          "return_element 'B:0' not found in graph_def." in str(e.exception))
 
-      with self.assertRaises(ValueError) as e:
+      if ops._USE_C_API:
+        error_msg = "Cannot convert 'A:B:0' to a tensor name."
+      else:
+        error_msg = "return_element 'A:B:0' not found in graph_def."
+
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             """),
             return_elements=["A:B:0"])
-      self.assertTrue(
-          "return_element 'A:B:0' not found in graph_def." in str(e.exception))
 
   def testMissingInputMap(self):
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(
+          ValueError,
+          r"Attempted to map inputs that were not found in graph_def: \[B:0\]"):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'None' }
             """),
             input_map={"B:0": constant_op.constant(5.0)})
-      self.assertTrue("not found in graph_def: [B:0]" in str(e.exception))
 
   def testInputMapUnusedAsInput(self):
     with ops.Graph().as_default():
@@ -556,26 +587,30 @@ class ImportGraphDefTest(test.TestCase):
           input_map={"A:0": constant_op.constant(5.0)})
 
       # Mapping a non-existent output of an existing node should fail.
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(
+          ValueError,
+          r"Attempted to map inputs that were not found in graph_def: \[A:2\]"):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             """),
             input_map={"A:2": constant_op.constant(5.0)})
-      self.assertTrue("not found in graph_def: [A:2]" in str(e.exception))
 
   def testInputMapTypeMismatch(self):
+    if ops._USE_C_API:
+      error_msg = ("Input 0 of node import/B was passed float from Const:0 "
+                   "incompatible with expected int32.")
+    else:
+      error_msg = ("Cannot convert a tensor of type float32 to an input of "
+                   "type int32.")
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             node { name: 'B' op: 'IntInput' input: 'A:0' }
             """),
             input_map={"A:0": constant_op.constant(5.0)})
-      self.assertTrue(
-          "Cannot convert a tensor of type float32 to an input of type int32."
-          in str(e.exception))
 
   def testNoReturns(self):
     with ops.Graph().as_default() as g:
@@ -598,6 +633,16 @@ class ImportGraphDefTest(test.TestCase):
           name="imported_graph")
       self.assertEqual(a.name, "imported_graph/A")
 
+  def testDefaultNamePrefix(self):
+    with ops.Graph().as_default():
+      a, = importer.import_graph_def(
+          self._MakeGraphDef("""
+          node { name: 'A' op: 'None' }
+          """),
+          return_elements=["A"],
+          name=None)
+      self.assertEqual(a.name, "import/A")
+
   def testNamePrefixColocationAttrs(self):
     original_graph_def = self._MakeGraphDef("""
           node { name: 'A' op: 'None' }
@@ -609,12 +654,10 @@ class ImportGraphDefTest(test.TestCase):
     with ops.Graph().as_default():
       b, = importer.import_graph_def(
           original_graph_def, return_elements=["B"], name="imported_graph")
-      self.assertProtoEqualsVersion("""
-          node { name: 'imported_graph/A' op: 'None' }
-          node { name: 'imported_graph/B' op: 'None'  attr {
-            key: '_class'
-            value { list { s: 'loc:@imported_graph/A' } }
-          } }""", b.graph.as_graph_def())
+      self.assertTrue("_class" in b.node_def.attr)
+      self.assertProtoEquals(
+          "list { s: 'loc:@imported_graph/A' }",
+          b.node_def.attr["_class"])
 
   def testColocationWithDeviceFn(self):
     original_graph_def = self._MakeGraphDef("""
@@ -638,23 +681,17 @@ class ImportGraphDefTest(test.TestCase):
 
     with ops.Graph().as_default():
       with ops.device(CustomDeviceFn):
-        b, = importer.import_graph_def(
-            original_graph_def, return_elements=["B"], name="imported_graph")
-
-      self.assertProtoEqualsVersion("""
-          node { name: 'imported_graph/A' op: 'None' device: "/device:A:0"
-                attr {
-                  key: '_class' value { list { s: 'loc:@imported_graph/A' } }
-                }
-          }
-          node { name: 'imported_graph/B' op: 'None' device: "/device:A:0"
-                attr {
-                  key: '_class' value { list { s: 'loc:@imported_graph/A' } }
-          } }""", b.graph.as_graph_def())
-
-    # Test a scenario where 'A' doesn't get a device; 'A' should
-    # not have a device, but during runtime will get colocated with
-    # 'B' because of the colocation attribute.
+        a, b = importer.import_graph_def(original_graph_def,
+                                         return_elements=["A", "B"],
+                                         name="imported_graph")
+      self.assertEqual(a.device, "/device:A:0")
+      self.assertEqual(b.device, "/device:A:0")
+      self.assertEqual(a.colocation_groups(), [b"loc:@imported_graph/A"])
+      self.assertEqual(b.colocation_groups(), [b"loc:@imported_graph/A"])
+
+    # Test a scenario where 'A' doesn't get a device; 'A' should not have a
+    # device, but during runtime will get colocated with 'B' because of the
+    # colocation attribute. B's device function is still overridden by A.
     def BDeviceFn(op):
       if "B" in op.name:
         return "/device:B:0"
@@ -662,19 +699,13 @@ class ImportGraphDefTest(test.TestCase):
 
     with ops.Graph().as_default():
       with ops.device(BDeviceFn):
-        b, = importer.import_graph_def(
-            original_graph_def, return_elements=["B"], name="imported_graph")
-
-      self.assertProtoEqualsVersion("""
-          node { name: 'imported_graph/A' op: 'None'
-                attr {
-                  key: '_class' value { list { s: 'loc:@imported_graph/A' } }
-                }
-          }
-          node { name: 'imported_graph/B' op: 'None'
-                attr {
-                  key: '_class' value { list { s: 'loc:@imported_graph/A' } }
-          } }""", b.graph.as_graph_def())
+        a, b = importer.import_graph_def(original_graph_def,
+                                         return_elements=["A", "B"],
+                                         name="imported_graph")
+      self.assertEqual(a.device, "")
+      self.assertEqual(b.device, "")
+      self.assertEqual(a.colocation_groups(), [b"loc:@imported_graph/A"])
+      self.assertEqual(b.colocation_groups(), [b"loc:@imported_graph/A"])
 
     # Only A gets a device, so B inherits it implicitly.
     def ADeviceFn(op):
@@ -684,19 +715,13 @@ class ImportGraphDefTest(test.TestCase):
 
     with ops.Graph().as_default():
       with ops.device(ADeviceFn):
-        b, = importer.import_graph_def(
-            original_graph_def, return_elements=["B"], name="imported_graph")
-
-      self.assertProtoEqualsVersion("""
-          node { name: 'imported_graph/A' op: 'None' device: "/device:A:0"
-                attr {
-                  key: '_class' value { list { s: 'loc:@imported_graph/A' } }
-                }
-          }
-          node { name: 'imported_graph/B' op: 'None' device: "/device:A:0"
-                attr {
-                  key: '_class' value { list { s: 'loc:@imported_graph/A' } }
-          } }""", b.graph.as_graph_def())
+        a, b = importer.import_graph_def(original_graph_def,
+                                         return_elements=["A", "B"],
+                                         name="imported_graph")
+      self.assertEqual(a.device, "/device:A:0")
+      self.assertEqual(b.device, "/device:A:0")
+      self.assertEqual(a.colocation_groups(), [b"loc:@imported_graph/A"])
+      self.assertEqual(b.colocation_groups(), [b"loc:@imported_graph/A"])
 
   def testMultipleColocationWithDeviceFn(self):
     original_graph_def = self._MakeGraphDef("""
@@ -719,20 +744,16 @@ class ImportGraphDefTest(test.TestCase):
 
     with ops.Graph().as_default():
       with ops.device(CustomDeviceFn):
-        c, = importer.import_graph_def(
-            original_graph_def, return_elements=["C"], name="imported_graph")
-
-      self.assertProtoEqualsVersion("""
-          node { name: 'imported_graph/A' op: 'None' }
-          node { name: 'imported_graph/B' op: 'None' device: "/device:B:0" }
-          node { name: 'imported_graph/C' op: 'None' device: "/device:B:0"
-                 attr {
-                   key: '_class' value {
-                     list { s: 'loc:@imported_graph/A'
-                            s: 'loc:@imported_graph/B' }
-                   }
-                 }
-               }""", c.graph.as_graph_def())
+        a, b, c = importer.import_graph_def(original_graph_def,
+                                            return_elements=["A", "B", "C"],
+                                            name="imported_graph")
+      self.assertEqual(a.device, "")
+      self.assertEqual(b.device, "/device:B:0")
+      self.assertEqual(c.device, "/device:B:0")
+      self.assertEqual(a.colocation_groups(), [b"loc:@imported_graph/A"])
+      self.assertEqual(b.colocation_groups(), [b"loc:@imported_graph/B"])
+      self.assertEqual(c.colocation_groups(),
+                       [b"loc:@imported_graph/A", b"loc:@imported_graph/B"])
 
   def testNamePrefixColocationAttrsMultipleImport(self):
     original_graph_def = self._MakeGraphDef("""
@@ -743,21 +764,18 @@ class ImportGraphDefTest(test.TestCase):
           } }""")
 
     with ops.Graph().as_default():
-      b, = importer.import_graph_def(
-          original_graph_def, return_elements=["B"], name="")
-      _, = importer.import_graph_def(
-          original_graph_def, return_elements=["B"], name="")
-      self.assertProtoEqualsVersion("""
-          node { name: 'A' op: 'None' }
-          node { name: 'B' op: 'None'  attr {
-            key: '_class'
-            value { list { s: 'loc:@A' } }
-          } }
-          node { name: 'A_1' op: 'None' }
-          node { name: 'B_1' op: 'None'  attr {
-            key: '_class'
-            value { list { s: 'loc:@A_1' } }
-          } }""", b.graph.as_graph_def())
+      a, b = importer.import_graph_def(
+          original_graph_def, return_elements=["A", "B"], name="")
+      a_1, b_1 = importer.import_graph_def(
+          original_graph_def, return_elements=["A", "B"], name="")
+
+      self.assertEqual(a.name, "A")
+      self.assertEqual(b.name, "B")
+      self.assertEqual(b.colocation_groups(), [b"loc:@A"])
+
+      self.assertEqual(a_1.name, "A_1")
+      self.assertEqual(b_1.name, "B_1")
+      self.assertEqual(b_1.colocation_groups(), [b"loc:@A_1"])
 
   def testNamePrefixColocationAttrsNotFound(self):
     original_graph_def = self._MakeGraphDef("""
@@ -765,8 +783,14 @@ class ImportGraphDefTest(test.TestCase):
             key: '_class'
             value { list { s: 'loc:@A' } }
           } }""")
+
+    if ops._USE_C_API:
+      error_msg = "Node 'B' expects to be colocated with unknown node 'A'"
+    else:
+      error_msg = "does not exist during import"
+
     with ops.Graph().as_default():
-      with self.assertRaisesRegexp(ValueError, "does not exist during import"):
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             original_graph_def, return_elements=["B"], name="imported_graph")
 
@@ -814,21 +838,32 @@ class ImportGraphDefTest(test.TestCase):
 
   def testInvalidInputForReturnOperations(self):
     with ops.Graph().as_default():
-      with self.assertRaises(TypeError) as e:
+      with self.assertRaisesRegexp(
+          TypeError, "return_elements must be a list of strings."):
         importer.import_graph_def(self._MakeGraphDef(""), return_elements=[7])
-      self.assertEqual("return_elements must be a list of strings.",
-                       str(e.exception))
+
+      if ops._USE_C_API:
+        error_msg = "Cannot convert 'a:b:c' to a tensor name."
+      else:
+        error_msg = "Requested return_element 'a:b:c' not found in graph_def."
+      with self.assertRaisesRegexp(ValueError, error_msg):
+        importer.import_graph_def(self._MakeGraphDef(""),
+                                  return_elements=["a:b:c"])
 
   def testDuplicateOperationNames(self):
+    if ops._USE_C_API:
+      error_msg = "Node 'A' is not unique"
+    else:
+      error_msg = "Duplicate name 'A' in GraphDef."
+
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             node { name: 'B' op: 'IntOutput' }
             node { name: 'A' op: 'IntOutput' }
             """))
-      self.assertEqual("Duplicate name 'A' in GraphDef.", str(e.exception))
 
   def testWithExtensionAndAttr(self):
     with ops.Graph().as_default() as g:
@@ -973,24 +1008,36 @@ class ImportGraphDefTest(test.TestCase):
       pat = (r"GraphDef producer version -1 below min producer %d supported "
              r"by TensorFlow \S+\.  Please regenerate your graph.$" %
              versions.GRAPH_DEF_VERSION_MIN_PRODUCER)
-      importer.import_graph_def(self._MakeGraphDef("", producer=-1))
-      x = constant_op.constant(
-          7)  # Need at least one op to get a C++ graph generated
-      with self.test_session(graph=g) as sess:
+      # C API throws error during import, Python-only throws error during run
+      if ops._USE_C_API:
         with self.assertRaisesRegexp(Exception, pat):
-          sess.run(x)
+          importer.import_graph_def(self._MakeGraphDef("", producer=-1))
+      else:
+        importer.import_graph_def(self._MakeGraphDef("", producer=-1))
+        x = constant_op.constant(
+            7)  # Need at least one op to get a C++ graph generated
+        with self.test_session(graph=g) as sess:
+          with self.assertRaisesRegexp(Exception, pat):
+            sess.run(x)
 
   def testVersionHigh(self):
     with ops.Graph().as_default() as g:
       pat = (r"GraphDef min consumer version %d above current version %d "
              r"for TensorFlow \S+\.  Please upgrade TensorFlow\.$" %
              (1 << 30, versions.GRAPH_DEF_VERSION))
-      importer.import_graph_def(self._MakeGraphDef("", min_consumer=1 << 30))
-      x = constant_op.constant(
-          7)  # Need at least one op to get a C++ graph generated
-      with self.test_session(graph=g) as sess:
-        with self.assertRaisesRegexp(Exception, pat):
-          sess.run(x)
+
+      if ops._USE_C_API:
+        with self.assertRaisesRegexp(ValueError, pat):
+          importer.import_graph_def(self._MakeGraphDef("",
+                                                       min_consumer=1 << 30))
+      else:
+        # Python API only throws when graph is run
+        importer.import_graph_def(self._MakeGraphDef("", min_consumer=1 << 30))
+        x = constant_op.constant(
+            7)  # Need at least one op to get a C++ graph generated
+        with self.test_session(graph=g) as sess:
+          with self.assertRaisesRegexp(Exception, pat):
+            sess.run(x)
 
   def testVersionAppliesToOpConstruction(self):
     """These tests rely on shape fns in test_ops.cc."""
@@ -1036,19 +1083,26 @@ class ImportGraphDefTest(test.TestCase):
           """),
           return_elements=["A"],
           producer_op_list=producer_op_list)
-      with self.assertRaisesRegexp(ValueError, "No attr named 'default_int'"):
+      if ops._USE_C_API:
+        error_msg = "Operation 'import/A' has no attr named 'default_int'."
+      else:
+        error_msg = "No attr named 'default_int'"
+      with self.assertRaisesRegexp(ValueError, error_msg):
         a[0].get_attr("default_int")
 
-    # Attr only in producer_op_list with non-default value is preserved.
-    with ops.Graph().as_default():
-      a = importer.import_graph_def(
-          self._MakeGraphDef("""
-          node { name: 'A' op: 'OpWithFutureDefaultAttr'
-                 attr { key: 'default_int' value { i: 987 } } }
-          """),
-          return_elements=["A"],
-          producer_op_list=producer_op_list)
-      self.assertEqual(987, a[0].get_attr("default_int"))
+    # Unknown attrs cannot be imported using C API. This test will eventually be
+    # deleted.
+    if not ops._USE_C_API:
+      # Attr only in producer_op_list with non-default value is preserved.
+      with ops.Graph().as_default():
+        a = importer.import_graph_def(
+            self._MakeGraphDef("""
+            node { name: 'A' op: 'OpWithFutureDefaultAttr'
+                   attr { key: 'default_int' value { i: 987 } } }
+            """),
+            return_elements=["A"],
+            producer_op_list=producer_op_list)
+        self.assertEqual(987, a[0].get_attr("default_int"))
 
   def testFunctions(self):
     dtype = dtypes.float32
diff --git a/tensorflow/python/framework/meta_graph.py b/tensorflow/python/framework/meta_graph.py
index a8bc2d2e3fb1bdddf163ff226d6430a9222bb769..c839d7a9a693a4e1201c558173662fd24b5036dd 100644
--- a/tensorflow/python/framework/meta_graph.py
+++ b/tensorflow/python/framework/meta_graph.py
@@ -663,7 +663,7 @@ def import_scoped_meta_graph(meta_graph_or_file,
         [part for part in [graph.get_name_scope(), import_scope] if part])
 
     # Restores all the other collections.
-    for key, col_def in meta_graph_def.collection_def.items():
+    for key, col_def in sorted(meta_graph_def.collection_def.items()):
       # Don't add unbound_inputs to the new graph.
       if key == unbound_inputs_col_name:
         continue
@@ -773,6 +773,7 @@ def export_scoped_meta_graph(filename=None,
     if graph_def:
       new_graph_def = graph_pb2.GraphDef()
       new_graph_def.versions.CopyFrom(graph_def.versions)
+      new_graph_def.library.CopyFrom(graph_def.library)
 
       if clear_extraneous_savers:
         exclude_nodes = _find_extraneous_saver_nodes(graph_def, saver_def)
@@ -810,6 +811,9 @@ def export_scoped_meta_graph(filename=None,
           bytesize += value.node_def.ByteSize()
           if bytesize >= (1 << 31) or bytesize < 0:
             raise ValueError("GraphDef cannot be larger than 2GB.")
+
+      graph._copy_functions_to_graph_def(graph_def, bytesize)  # pylint: disable=protected-access
+
     # It's possible that not all the inputs are in the export_scope.
     # If we would like such information included in the exported meta_graph,
     # add them to a special unbound_inputs collection.
diff --git a/tensorflow/python/framework/meta_graph_test.py b/tensorflow/python/framework/meta_graph_test.py
index 06cee46bf623ff0521f4ebe91ff1909aa45e00e3..4c22c913b850685bd6e50b03b5fbb09a01441b68 100644
--- a/tensorflow/python/framework/meta_graph_test.py
+++ b/tensorflow/python/framework/meta_graph_test.py
@@ -662,22 +662,36 @@ class MetaGraphWithVariableScopeTest(test.TestCase):
 class ExportImportAcrossScopesTest(test.TestCase):
 
   def testPartionedVariables(self):
-    def make_graph_with_partitioned_variables():
+
+    def make_graph_with_partitioned_variables(use_resource):
       variable_scope.get_variable(
           name="weights",
           partitioner=partitioned_variables.fixed_size_partitioner(3, axis=0),
-          initializer=random_ops.truncated_normal([100, 10]))
-    self._testExportImportAcrossScopes(make_graph_with_partitioned_variables)
+          initializer=random_ops.truncated_normal([100, 10]),
+          use_resource=use_resource)
+      # The next variable illustrates the necessity of restoring collections
+      # in a deterministic fashion when using ResourceVariables.
+      variable_scope.get_variable(
+          name="another",
+          shape=[],
+          collections=["a", "b", "z", "f", "e", "d", "g"],
+          use_resource=use_resource)
+
+    self._testExportImportAcrossScopes(
+        make_graph_with_partitioned_variables, use_resource=False)
+    self._testExportImportAcrossScopes(
+        make_graph_with_partitioned_variables, use_resource=True)
 
-  def _testExportImportAcrossScopes(self, graph_fn):
+  def _testExportImportAcrossScopes(self, graph_fn, use_resource):
     """Tests export and importing a graph across scopes.
 
     Args:
       graph_fn: A closure that creates a graph on the current scope.
+      use_resource: A bool indicating whether or not to use ResourceVariables.
     """
     with ops.Graph().as_default() as original_graph:
       with variable_scope.variable_scope("dropA/dropB/keepA"):
-        graph_fn()
+        graph_fn(use_resource=use_resource)
     exported_meta_graph_def = meta_graph.export_scoped_meta_graph(
         graph=original_graph,
         export_scope="dropA/dropB")[0]
@@ -689,10 +703,32 @@ class ExportImportAcrossScopesTest(test.TestCase):
 
     with ops.Graph().as_default() as expected_graph:
       with variable_scope.variable_scope("importA/keepA"):
-        graph_fn()
+        graph_fn(use_resource=use_resource)
+
+      if use_resource:
+        # Bringing in a collection that contains ResourceVariables adds ops
+        # to the graph, so mimic the same behavior.
+        for collection_key in sorted([
+            ops.GraphKeys.GLOBAL_VARIABLES,
+            ops.GraphKeys.TRAINABLE_VARIABLES,
+        ]):
+          for var in expected_graph.get_collection(collection_key):
+            var._read_variable_op()
 
     result = meta_graph.export_scoped_meta_graph(graph=imported_graph)[0]
     expected = meta_graph.export_scoped_meta_graph(graph=expected_graph)[0]
+
+    if use_resource:
+      # Clear all shared_name attributes before comparing, since they are
+      # supposed to be orthogonal to scopes.
+      for meta_graph_def in [result, expected]:
+        for node in meta_graph_def.graph_def.node:
+          shared_name_attr = "shared_name"
+          shared_name_value = node.attr.get(shared_name_attr, None)
+          if shared_name_value and shared_name_value.HasField("s"):
+            if shared_name_value.s:
+              node.attr[shared_name_attr].s = b""
+
     self.assertProtoEquals(expected, result)
 
 
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 503e76577010373d72dc865b783f275532c01d1e..721836f0257b3d8f26b2b78a29e02783e3edaeca 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import collections
 import copy
 import linecache
+import os
 import re
 import sys
 import threading
@@ -35,6 +36,7 @@ from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
 from tensorflow.core.framework import op_def_pb2
 from tensorflow.core.framework import versions_pb2
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import pywrap_tensorflow as c_api
 from tensorflow.python.eager import context
 from tensorflow.python.eager import core
@@ -47,28 +49,19 @@ from tensorflow.python.framework import op_def_registry
 from tensorflow.python.framework import registry
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import versions
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.platform import app
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.python.util import decorator_utils
 from tensorflow.python.util import tf_contextlib
 
+
 # Temporary global switch determining if we should enable the work-in-progress
 # calls to the C API. Currently disabled by default but can be manually enabled
-# e.g. in tests. This will be removed once all functionality is supported and
-# there's no performance penalty with it enabled.
-#
-# TODO(skyewm) before we can remove this:
-# - functions
-# - import_graph_def() incrementally adds inputs to ops (i.e. creates an
-#   Operation and then calls _add_input()). The current code requires that all
-#   inputs be specified when creating the Operation (since we call
-#   TF_FinishOperation()).
-# - ops_test.py (and others?) create unregistered op types
-# - while loop
-# - performance (e.g. delete/refactor redundant Python functionality, switch to
-#   new session API)
-_USE_C_API = False
+# in code or via the environment variable. This will be removed once all
+# functionality is supported and there's no performance penalty with it enabled.
+_USE_C_API = os.getenv("TF_C_API_GRAPH_CONSTRUCTION", "0") is not "0"
 
 
 def tensor_id(tensor):
@@ -157,14 +150,18 @@ def register_dense_tensor_like_type(tensor_type):
   """
   try:
     if not isinstance(tensor_type.name, property):
-      raise TypeError("Type %s does not define a `name` property")
+      raise TypeError("Type %s does not define a `name` property" %
+                      tensor_type.__name__)
   except AttributeError:
-    raise TypeError("Type %s does not define a `name` property")
+    raise TypeError("Type %s does not define a `name` property" %
+                    tensor_type.__name__)
   try:
     if not isinstance(tensor_type.dtype, property):
-      raise TypeError("Type %s does not define a `dtype` property")
+      raise TypeError("Type %s does not define a `dtype` property" %
+                      tensor_type.__name__)
   except AttributeError:
-    raise TypeError("Type %s does not define a `dtype` property")
+    raise TypeError("Type %s does not define a `dtype` property" %
+                    tensor_type.__name__)
   # We expect this list to be small, so choose quadratic complexity
   # for registration, so that we have a tuple that can be used for
   # more efficient `isinstance` checks later.
@@ -369,6 +366,19 @@ class Tensor(_TensorLike):
       A `TensorShape` representing the shape of this tensor.
 
     """
+    if _USE_C_API:
+      graph = self._op._graph._c_graph  # pylint: disable=protected-access
+      with errors.raise_exception_on_not_ok_status() as status:
+        num_dims = c_api.TF_GraphGetTensorNumDims(graph, self._as_tf_output(),
+                                                  status)
+      if num_dims == -1:
+        dim_list = None
+      else:
+        with errors.raise_exception_on_not_ok_status() as status:
+          dim_list = c_api.TF_GraphGetTensorShape_wrapper(
+              graph, self._as_tf_output(), num_dims, status)
+        dim_list = [None if i == -1 else i for i in dim_list]
+      return tensor_shape.TensorShape(dim_list)
     return self._shape
 
   def __iter__(self):
@@ -388,8 +398,8 @@ class Tensor(_TensorLike):
       yield self[i]
 
   def _shape_as_list(self):
-    if self._shape.ndims is not None:
-      return [dim.value for dim in self._shape.dims]
+    if self.shape.ndims is not None:
+      return [dim.value for dim in self.shape.dims]
     else:
       return None
 
@@ -405,7 +415,7 @@ class Tensor(_TensorLike):
     Returns:
       Integer rank or None
     """
-    return self._shape.ndims
+    return self.shape.ndims
 
   def get_shape(self):
     """Alias of Tensor.shape."""
@@ -436,14 +446,35 @@ class Tensor(_TensorLike):
     ```
 
     Args:
-      shape: A `TensorShape` representing the shape of this tensor.
+      shape: A `TensorShape` representing the shape of this tensor, a
+      `TensorShapeProto`, a list, a tuple, or None.
 
     Raises:
       ValueError: If `shape` is not compatible with the current shape of
         this tensor.
     """
-    # TODO(skyewm): call C API
-    self._shape = self._shape.merge_with(shape)
+    if not _USE_C_API:
+      self._shape = self._shape.merge_with(shape)  # pylint: disable=protected-access
+      return
+    if not isinstance(shape, tensor_shape.TensorShape):
+      shape = tensor_shape.TensorShape(shape)
+    dim_list = []
+    if shape.dims is None:
+      unknown_shape = True
+    else:
+      unknown_shape = False
+      for dim in shape.dims:
+        if dim.value is None:
+          dim_list.append(-1)
+        else:
+          dim_list.append(dim.value)
+    with errors.raise_exception_on_not_ok_status() as status:
+      c_api.TF_GraphSetTensorShape_wrapper(
+          self._op._graph._c_graph,  # pylint: disable=protected-access
+          self._as_tf_output(),
+          dim_list,
+          unknown_shape,
+          status)
 
   @property
   def value_index(self):
@@ -456,7 +487,17 @@ class Tensor(_TensorLike):
     Returns:
       A list of `Operation`s.
     """
-    return self._consumers
+    if self._op._c_op:  # pylint: disable=protected-access
+      consumer_names = c_api.TF_OperationOutputConsumers_wrapper(
+          self._as_tf_output())
+      # pylint: disable=protected-access
+      return [
+          self.graph._get_operation_by_name_unsafe(name)
+          for name in consumer_names
+      ]
+      # pylint: enable=protected-access
+    else:
+      return self._consumers
 
   def _add_consumer(self, consumer):
     """Add a consumer to this tensor.
@@ -467,6 +508,9 @@ class Tensor(_TensorLike):
     Raises:
       TypeError: if the consumer is not an Operation.
     """
+    # pylint: disable=protected-access
+    assert not self._op._c_op, "Tensor._add_consumer doesn't work with C API"
+    # pylint: enable=protected-access
     if not isinstance(consumer, Operation):
       raise TypeError("Consumer must be an Operation: %s" % consumer)
     self._consumers.append(consumer)
@@ -491,11 +535,10 @@ class Tensor(_TensorLike):
       return "%s:%d" % (self._op.name, self._value_index)
 
   def _as_tf_output(self):
-    assert self.op._c_op  # pylint: disable=protected-access
-    tf_output = c_api.TF_Output()
-    tf_output.oper = self.op._c_op  # pylint: disable=protected-access
-    tf_output.index = self.value_index
-    return tf_output
+    # pylint: disable=protected-access
+    assert self.op._c_op
+    return c_api_util.tf_output(self.op._c_op, self.value_index)
+    # pylint: enable=protected-access
 
   def __str__(self):
     return "Tensor(\"%s\"%s%s%s)" % (
@@ -595,11 +638,6 @@ class Tensor(_TensorLike):
     """
     return _eval_using_default_session(self, feed_dict, self.graph, session)
 
-  def _dup(self):
-    ret = copy.copy(self)
-    ret._id = uid()  # pylint: disable=protected-access
-    return ret
-
 
 # TODO(agarwal): consider getting rid of this.
 class _EagerTensorBase(Tensor):
@@ -638,8 +676,8 @@ class _EagerTensorBase(Tensor):
   def __float__(self):
     return float(self.numpy())
 
-  def __array__(self):
-    return np.array(self.numpy())
+  def __array__(self, dtype=None):
+    return np.array(self.numpy(), dtype=dtype)
 
   def __format__(self, format_spec):
     return self.numpy().__format__(format_spec)
@@ -725,9 +763,6 @@ class _EagerTensorBase(Tensor):
     return new_tensor
     # pylint: enable=protected-access
 
-  def _dup(self):
-    return self._copy(device_name=self.device)
-
   @property
   def shape(self):
     return tensor_shape.TensorShape(self._shape_tuple())
@@ -935,7 +970,7 @@ def internal_convert_to_tensor(value,
     # Fast path for EagerTensors that don't need any conversion.
     if isinstance(value, EagerTensor):
       # Note that we don't check that value's dtype matches the dtype
-      # argument.  We exepct that the C runtime will do that checking
+      # argument.  We expect that the C runtime will do that checking
       # when we execute the kernel.
       return value
 
@@ -1436,8 +1471,12 @@ def _create_c_op(graph, node_def, inputs, control_inputs):
       c_api.TF_SetAttrValueProto(op_desc,
                                  compat.as_str(name), serialized, status)
 
-  with errors.raise_exception_on_not_ok_status() as status:
-    c_op = c_api.TF_FinishOperation(op_desc, status)
+  try:
+    with errors.raise_exception_on_not_ok_status() as status:
+      c_op = c_api.TF_FinishOperation(op_desc, status)
+  except errors.InvalidArgumentError as e:
+    # Convert to ValueError for backwards compatibility.
+    raise ValueError(str(e))
 
   return c_op
 
@@ -1509,16 +1548,34 @@ class Operation(object):
         or if `inputs` and `input_types` are incompatible.
       ValueError: if the `node_def` name is not valid.
     """
-    if not isinstance(node_def, node_def_pb2.NodeDef):
+    # For internal use only: `node_def` can be set to a TF_Operation to create
+    # an Operation for that op. This is useful for creating Operations for ops
+    # indirectly created by C API methods, e.g. the ops created by
+    # TF_ImportGraphDef. When `node_def` is a TF_Operation, all optional fields
+    # should be None.
+
+    if isinstance(node_def, node_def_pb2.NodeDef):
+      if node_def.ByteSize() >= (1 << 31) or node_def.ByteSize() < 0:
+        raise ValueError(
+            "Cannot create a tensor proto whose content is larger than 2GB.")
+      if not _VALID_OP_NAME_REGEX.match(node_def.name):
+        raise ValueError("'%s' is not a valid node name" % node_def.name)
+      self._node_def = copy.deepcopy(node_def)
+      c_op = None
+    elif type(node_def).__name__ == "SwigPyObject":
+      assert inputs is None
+      assert output_types is None
+      assert control_inputs is None
+      assert input_types is None
+      assert original_op is None
+      assert op_def is None
+      self._node_def = None
+      c_op = node_def
+    else:
       raise TypeError("node_def needs to be a NodeDef: %s" % node_def)
-    if node_def.ByteSize() >= (1 << 31) or node_def.ByteSize() < 0:
-      raise ValueError(
-          "Cannot create a tensor proto whose content is larger than 2GB.")
-    if not _VALID_OP_NAME_REGEX.match(node_def.name):
-      raise ValueError("'%s' is not a valid node name" % node_def.name)
+
     if not isinstance(g, Graph):
       raise TypeError("g needs to be a Graph: %s" % g)
-    self._node_def = copy.deepcopy(node_def)
     self._graph = g
     if inputs is None:
       inputs = []
@@ -1528,8 +1585,6 @@ class Operation(object):
     for a in self._inputs:
       if not isinstance(a, Tensor):
         raise TypeError("input needs to be a Tensor: %s" % a)
-      # Mark that we consume the inputs.
-      a._add_consumer(self)  # pylint: disable=protected-access
     if input_types is None:
       input_types = [i.dtype.base_dtype for i in self._inputs]
     else:
@@ -1546,21 +1601,28 @@ class Operation(object):
     self._control_inputs = []
     if control_inputs:
       for c in control_inputs:
-        c_op = None
+        control_op = None
         if isinstance(c, Operation):
-          c_op = c
+          control_op = c
         elif isinstance(c, (Tensor, IndexedSlices)):
-          c_op = c.op
+          control_op = c.op
         else:
           raise TypeError("Control input must be an Operation, "
                           "a Tensor, or IndexedSlices: %s" % c)
-        self._control_inputs.append(c_op)
+        self._control_inputs.append(control_op)
 
+    self._id_value = self._graph._next_id()  # pylint: disable=protected-access
     self._original_op = original_op
     self._op_def = op_def
     self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access
-
-    if self._graph._c_graph:  # pylint: disable=protected-access
+    self._control_flow_context = self.graph._get_control_flow_context()  # pylint: disable=protected-access
+
+    # Initialize self._c_op.
+    if c_op:
+      # TODO(skyewm): remove this assert when we remove USE_C_API
+      assert self._graph._c_graph  # pylint: disable=protected-access
+      self._c_op = c_op
+    elif self._graph._c_graph:  # pylint: disable=protected-access
       if self._op_def:
         # TODO(skyewm): op_def_library.apply_op() flattens the incoming
         # inputs. Refactor so we don't have to do this here.
@@ -1575,8 +1637,20 @@ class Operation(object):
     else:
       self._c_op = None
 
-    # Initialize self._outputs
-    if output_types is None:
+    # Mark that we consume the inputs. This is unnecessary and unsupported with
+    # the C API enabled, since the C API tracks the tensor consumers instead.
+    if not self._c_op:
+      for input_tensor in self._inputs:
+        input_tensor._add_consumer(self)  # pylint: disable=protected-access
+
+    # Initialize self._outputs.
+    if self._c_op:
+      num_outputs = c_api.TF_OperationNumOutputs(self._c_op)
+      output_types = [
+          c_api.TF_OperationOutputType(c_api_util.tf_output(self._c_op, i))
+          for i in range(num_outputs)]
+      assert output_types is not None
+    elif output_types is None:
       output_types = []
     self._output_types_val = output_types
     self._outputs = [
@@ -1584,16 +1658,19 @@ class Operation(object):
         for i, output_type in enumerate(output_types)
     ]
 
-    # Add this op to the current control flow context:
-    self._control_flow_context = g._get_control_flow_context()  # pylint: disable=protected-access
+    if not c_op:
+      self._control_flow_post_processing()
+
+  def _control_flow_post_processing(self):
+    """Add this op to its control flow context.
+
+    This may add new ops and change this op's inputs. self.inputs must be
+    available before calling this method.
+    """
+    for input_tensor in self.inputs:
+      control_flow_util.CheckInputFromValidContext(self, input_tensor.op)
     if self._control_flow_context is not None:
       self._control_flow_context.AddOp(self)
-    # NOTE(keveman): Control flow context's AddOp could be creating new ops and
-    # setting op.inputs[index] = new_op. Thus the new ops' id could be larger
-    # than this op's id even though this op depend on them. Therefore, delaying
-    # assigning id to this op until all ops this could be dependent on are
-    # created.
-    self._id_value = self._graph._next_id()  # pylint: disable=protected-access
     self._recompute_node_def()
 
   def _reconstruct_sequence_inputs(self, op_def, inputs, attrs):
@@ -1676,9 +1753,6 @@ class Operation(object):
   def name(self):
     """The full name of this operation."""
     if self._c_op:
-      # TODO(iga): Remove this assert after converting to C API by default.
-      # Just being a bit paranoid here.
-      assert self._node_def.name == c_api.TF_OperationName(self._c_op)
       return c_api.TF_OperationName(self._c_op)
     else:
       return self._node_def.name
@@ -1698,9 +1772,6 @@ class Operation(object):
       device.
     """
     if self._c_op:
-      # TODO(iga): Remove this assert after converting to C API by default.
-      # Just being a bit paranoid here
-      assert self._node_def.device == c_api.TF_OperationDevice(self._c_op)
       return c_api.TF_OperationDevice(self._c_op)
     else:
       return self._node_def.device
@@ -1761,9 +1832,9 @@ class Operation(object):
       c_api.SetRequestedDevice(
           self._graph._c_graph,  # pylint: disable=protected-access
           self._c_op,  # pylint: disable=protected-access
-          _device_string(device))
-    # TODO(nolivia): remove this line when switch to C api
-    self._node_def.device = _device_string(device)
+          compat.as_str(_device_string(device)))
+    else:
+      self._node_def.device = _device_string(device)
 
   def _add_input(self, tensor, dtype=None):
     """Add a new input to this operation.
@@ -1868,8 +1939,18 @@ class Operation(object):
     else:
       self._add_control_inputs([op])
 
+  def _remove_all_control_inputs(self):
+    """Removes any control inputs to this operation."""
+    if self._c_op:
+      c_api.RemoveAllControlInputs(self._graph._c_graph, self._c_op)  # pylint: disable=protected-access
+    else:
+      del self.control_inputs[:]
+
   # Methods below are used when building the NodeDef and Graph proto.
   def _recompute_node_def(self):
+    # TODO(skyewm): remove this function when we switch to C API
+    if self._c_op: return
+
     del self._node_def.input[:]
     # pylint: disable=protected-access
     self._node_def.input.extend([t._as_node_def_input() for t in self._inputs])
@@ -1894,23 +1975,23 @@ class Operation(object):
   class _InputList(object):
     """Immutable input list wrapper."""
 
-    def __init__(self, op):
-      self._op = op
+    def __init__(self, inputs):
+      self._inputs = inputs
 
     def __iter__(self):
-      return iter(self._op._inputs)
+      return iter(self._inputs)
 
     def __len__(self):
-      return len(self._op._inputs)
+      return len(self._inputs)
 
     def __bool__(self):
-      return bool(self._op._inputs)
+      return bool(self._inputs)
 
     # Python 3 wants __bool__, Python 2.7 wants __nonzero__
     __nonzero__ = __bool__
 
     def __getitem__(self, i):
-      return self._op._inputs[i]
+      return self._inputs[i]
 
 # pylint: enable=protected-access
 
@@ -1919,13 +2000,14 @@ class Operation(object):
     """The list of `Tensor` objects representing the data inputs of this op."""
     if self._c_op:
       tf_outputs = c_api.GetOperationInputs(self._c_op)
-      # TODO(skyewm): return Operation._InputList
       # pylint: disable=protected-access
-      return [self.graph._get_tensor_by_tf_output(tf_output)
-              for tf_output in tf_outputs]
+      retval = [
+          self.graph._get_tensor_by_tf_output(tf_output)
+          for tf_output in tf_outputs
+      ]
       # pylint: enable=protected-access
-    else:
-      return Operation._InputList(self)
+      return Operation._InputList(retval)
+    return Operation._InputList(self._inputs)
 
   @property
   def _input_dtypes(self):
@@ -1939,9 +2021,6 @@ class Operation(object):
           dtypes.as_dtype(c_api.TF_OperationInputType(self._tf_input(i)))
           for i in xrange(num_inputs)
       ]
-      # TODO(iga): Remove this assert after converting to C API by default.
-      # Just being a bit paranoid here.
-      assert self._input_types_val == input_types
       return input_types
     else:
       return self._input_types_val
@@ -1976,14 +2055,6 @@ class Operation(object):
     """The type of the op (e.g. `"MatMul"`)."""
     if self._c_op:
       op_type = c_api.TF_OperationOpType(self._c_op)
-      # TODO(iga): Remove these asserts after converting to C API by default.
-      # Just being a bit paranoid here.
-      # pylint: disable=unidiomatic-typecheck
-      assert type(op_type) == type(self._node_def.op), (
-          "Expected same types %s vs %s" % (type(op_type),
-                                            type(self._node_def.op)))
-      # pylint: enable=unidiomatic-typecheck
-      assert op_type == self._node_def.op
       return op_type
     else:
       return self._node_def.op
@@ -2058,7 +2129,7 @@ class Operation(object):
 
   def _set_attr(self, attr_name, attr_value):
     """Private method used to set an attribute in the node_def."""
-    if _USE_C_API:
+    if self._c_op:
       buf = c_api.TF_NewBufferFromString(
           compat.as_bytes(attr_value.SerializeToString()))
       try:
@@ -2296,8 +2367,28 @@ class RegisterShape(object):
     return f
 
 
-def set_shapes_for_outputs(op):
-  """Uses the registered shape functions to set the shapes for op's outputs."""
+def _set_shapes_for_outputs_c_api(op):
+  """set_shapes_for_outputs implementation when C API is enabled."""
+  # The C API computes the shapes when the TF_Operation is created. Fetch the
+  # output shapes from the C object.
+  for output in op.outputs:
+    with errors.raise_exception_on_not_ok_status() as status:
+      # pylint: disable=protected-access
+      shape_vector, unknown_shape = c_api.TF_GraphGetTensorShapeHelper(
+          op._graph._c_graph, output._as_tf_output(), status)
+      # pylint: enable=protected-access
+    if unknown_shape:
+      output.set_shape(tensor_shape.unknown_shape())
+    elif not shape_vector:
+      output.set_shape(tensor_shape.scalar())
+    else:
+      shape_vector = [None if d == -1 else d for d in shape_vector]
+      output.set_shape(tensor_shape.TensorShape(shape_vector))
+
+
+# TODO(skyewm): remove this when _USE_C_API flag is removed.
+def _set_shapes_for_outputs(op):
+  """set_shapes_for_outputs implementation when C API is disabled."""
   try:
     shape_func = _shape_registry.lookup(op.type)
   except LookupError:
@@ -2328,6 +2419,14 @@ def set_shapes_for_outputs(op):
     output.set_shape(s)
 
 
+def set_shapes_for_outputs(op):
+  """Set the shapes for op's outputs."""
+  if op._c_op:  # pylint: disable=protected-access
+    return _set_shapes_for_outputs_c_api(op)
+  else:
+    return _set_shapes_for_outputs(op)
+
+
 class OpStats(object):
   """A holder for statistics about an operator.
 
@@ -2599,11 +2698,16 @@ class Graph(object):
 
     # TODO(skyewm): fold as much of the above as possible into the C
     # implementation
-    if _USE_C_API:
+    if _USE_C_API or self._use_c_api_hack():
       self._scoped_c_graph = c_api_util.ScopedTFGraph()
     else:
       self._scoped_c_graph = None
 
+  # TODO(apassos) remove once the C API is used by default.
+  def _use_c_api_hack(self):
+    """Temporary hack; can be overridden to force C API usage."""
+    return False
+
   def _convert_stack(self, stack, include_func_start_lineno=False):
     """Converts a stack extracted using _extract_stack() to a traceback stack.
 
@@ -2804,6 +2908,20 @@ class Graph(object):
     """
     self._control_flow_context = ctx
 
+  def _copy_functions_to_graph_def(self, graph_def, starting_bytesize):
+    """If this graph contains functions, copy them to `graph_def`."""
+    bytesize = starting_bytesize
+    for f in self._functions.values():
+      bytesize += f.definition.ByteSize()
+      if bytesize >= (1 << 31) or bytesize < 0:
+        raise ValueError("GraphDef cannot be larger than 2GB.")
+      graph_def.library.function.extend([f.definition])
+      if f.grad_func_name:
+        grad_def = function_pb2.GradientDef()
+        grad_def.function_name = f.name
+        grad_def.gradient_func = f.grad_func_name
+        graph_def.library.gradient.extend([grad_def])
+
   def _as_graph_def(self, from_version=None, add_shapes=False):
     # pylint: disable=line-too-long
     """Returns a serialized `GraphDef` representation of this graph.
@@ -2847,17 +2965,7 @@ class Graph(object):
           bytesize += op.node_def.ByteSize()
           if bytesize >= (1 << 31) or bytesize < 0:
             raise ValueError("GraphDef cannot be larger than 2GB.")
-      if self._functions:
-        for f in self._functions.values():
-          bytesize += f.definition.ByteSize()
-          if bytesize >= (1 << 31) or bytesize < 0:
-            raise ValueError("GraphDef cannot be larger than 2GB.")
-          graph.library.function.extend([f.definition])
-          if f.grad_func_name:
-            grad_def = function_pb2.GradientDef()
-            grad_def.function_name = f.name
-            grad_def.gradient_func = f.grad_func_name
-            graph.library.gradient.extend([grad_def])
+      self._copy_functions_to_graph_def(graph, bytesize)
       return graph, self._version
 
   def as_graph_def(self, from_version=None, add_shapes=False):
@@ -2932,9 +3040,14 @@ class Graph(object):
     # Add function to graph
     # pylint: disable=protected-access
     if self._c_graph:
-      assert function._c_func, (
-          "Cannot add function created without C API support to graph "
-          "created with C API support")
+      # Handle functions created without using the C API. TODO(apassos,skyewm)
+      # remove this when all functions are generated using the C API by default
+      # as this will be unnecessary.
+      if not function._c_func:
+        with errors.raise_exception_on_not_ok_status() as status:
+          serialized = function.definition.SerializeToString()
+          function._c_func = c_api.TF_FunctionImportFunctionDef(
+              serialized, status)
       with errors.raise_exception_on_not_ok_status() as status:
         gradient = function._grad_func._c_func if function._grad_func else None
         c_api.TF_GraphCopyFunction(self._c_graph, function._c_func, gradient,
@@ -3034,75 +3147,120 @@ class Graph(object):
 
     node_def = _NodeDef(op_type, name, device=None, attrs=attrs)
 
+    input_ops = set([t.op for t in inputs])
+    control_inputs = self._control_dependencies_for_inputs(input_ops)
+    ret = Operation(
+        node_def,
+        self,
+        inputs=inputs,
+        output_types=dtypes,
+        control_inputs=control_inputs,
+        input_types=input_types,
+        original_op=self._default_original_op,
+        op_def=op_def)
+    self._create_op_helper(ret, compute_shapes=compute_shapes,
+                           compute_device=compute_device)
+    return ret
+
+  def _create_op_from_tf_operation(self, c_op, compute_device=True):
+    """Creates an `Operation` in this graph from the supplied TF_Operation.
+
+    This method is like create_op() except the new Operation is constructed
+    using `c_op`. The returned Operation will have `c_op` as its _c_op
+    field. This is used to create Operation objects around TF_Operations created
+    indirectly by the C API (e.g. by TF_ImportGraphDef, TF_FinishWhile).
+
+    This function does not call Operation._control_flow_post_processing or
+    Graph._control_dependencies_for_inputs (since the inputs may not be
+    available yet). The caller is responsible for calling these methods.
+
+    Args:
+      c_op: a wrapped TF_Operation
+      compute_device: (Optional.) If True, device functions will be executed
+        to compute the device property of the Operation.
+
+    Returns:
+      An `Operation` object.
+    """
+    self._check_not_finalized()
+    ret = Operation(c_op, self)
+    assert ret.name not in self._names_in_use
+    self._names_in_use[ret.name] = 1
+    self._create_op_helper(ret, compute_device=compute_device)
+    return ret
+
+  def _create_op_helper(self, op, compute_shapes=True, compute_device=True):
+    """Common logic for creating an op in this graph."""
+    # TODO(vrv): Instead of eagerly filling in shape property for every op, only
+    # populate the shape when requested.
+    #
+    # TODO(skyewm): unlike in the original Python implementation, the C API
+    # always computes shape information (even for function calls, which the
+    # original Python shape inference code doesn't handle). Deprecate the
+    # compute_shapes argument.
+    if op._c_op or compute_shapes:  # pylint: disable=protected-access
+      set_shapes_for_outputs(op)
+    # TODO(b/XXXX): move to Operation.__init__ once _USE_C_API flag is removed.
+    self._add_op(op)
+
     # Apply any additional attributes requested. Do not overwrite any existing
     # attributes.
     for key, value in self._attr_scope_map.items():
-      if key not in node_def.attr:
+      try:
+        op.get_attr(key)
+      except ValueError:
         if callable(value):
-          value = value(node_def)
+          value = value(op.node_def)
           if not isinstance(value, (type(None), attr_value_pb2.AttrValue)):
             raise TypeError(
                 "Callable for scope map key '%s' must return either None or "
                 "an AttrValue protocol buffer; but it returned: %s" % (key,
                                                                        value))
-        node_def.attr[key].CopyFrom(value)
+        if value:
+          op._set_attr(key, value)  # pylint: disable=protected-access
 
-    # Apply a kernel label if one has been specified for this op_type.
+    # Apply a kernel label if one has been specified for this op type.
     try:
-      kernel_label = self._op_to_kernel_label_map[op_type]
-      node_def.attr["_kernel"].CopyFrom(
-          attr_value_pb2.AttrValue(s=compat.as_bytes(kernel_label)))
+      kernel_label = self._op_to_kernel_label_map[op.type]
+      op._set_attr("_kernel",  # pylint: disable=protected-access
+                   attr_value_pb2.AttrValue(s=compat.as_bytes(kernel_label)))
     except KeyError:
       pass
 
-    # Apply the overriding op_type for gradients if one has been
-    # specified for this op_type.
+    # Apply the overriding op type for gradients if one has been specified for
+    # this op type.
     try:
-      mapped_op_type = self._gradient_override_map[op_type]
-      node_def.attr["_gradient_op_type"].CopyFrom(
-          attr_value_pb2.AttrValue(s=compat.as_bytes(mapped_op_type)))
+      mapped_op_type = self._gradient_override_map[op.type]
+      op._set_attr("_gradient_op_type",  # pylint: disable=protected-access
+                   attr_value_pb2.AttrValue(s=compat.as_bytes(mapped_op_type)))
     except KeyError:
       pass
 
-    control_inputs = self._control_dependencies_for_inputs(inputs)
-    ret = Operation(
-        node_def,
-        self,
-        inputs=inputs,
-        output_types=dtypes,
-        control_inputs=control_inputs,
-        input_types=input_types,
-        original_op=self._default_original_op,
-        op_def=op_def)
-    if compute_shapes:
-      set_shapes_for_outputs(ret)
-    self._add_op(ret)
-    self._record_op_seen_by_control_dependencies(ret)
+    self._record_op_seen_by_control_dependencies(op)
 
     if compute_device:
-      self._apply_device_functions(ret)
+      self._apply_device_functions(op)
 
     if self._colocation_stack:
       all_colocation_groups = []
       for colocation_op in self._colocation_stack:
         all_colocation_groups.extend(colocation_op.colocation_groups())
         if colocation_op.device:
-          # Make this device match the device of the colocated op, to
-          # provide consistency between the device and the colocation
-          # property.
-          if (ret.device and pydev.canonical_name(ret.device) !=
+          # Make this device match the device of the colocated op, to provide
+          # consistency between the device and the colocation property.
+          if (op.device and pydev.canonical_name(op.device) !=
               pydev.canonical_name(colocation_op.device)):
             logging.warning("Tried to colocate %s with an op %s that had "
                             "a different device: %s vs %s. "
-                            "Ignoring colocation property.", name,
-                            colocation_op.name, ret.device,
+                            "Ignoring colocation property.", op.name,
+                            colocation_op.name, op.device,
                             colocation_op.device)
           else:
-            ret._set_device(colocation_op.device)  # pylint: disable=protected-access
+            op._set_device(colocation_op.device)  # pylint: disable=protected-access
 
       all_colocation_groups = sorted(set(all_colocation_groups))
       # pylint: disable=protected-access
-      ret._set_attr("_class", attr_value_pb2.AttrValue(
+      op._set_attr("_class", attr_value_pb2.AttrValue(
           list=attr_value_pb2.AttrValue.ListValue(s=all_colocation_groups)))
       # pylint: enable=protected-access
 
@@ -3111,14 +3269,48 @@ class Graph(object):
     # (2) "is_stateful" is set in OpDef
     # (3) "container" attribute is in OpDef
     # (4) "container" attribute is None
-    if (self._container and op_type in self._registered_ops and
-        self._registered_ops[op_type].is_stateful and
-        "container" in ret.node_def.attr and
-        not ret.node_def.attr["container"].s):
-      ret.node_def.attr["container"].CopyFrom(
-          attr_value_pb2.AttrValue(s=compat.as_bytes(self._container)))
+    if (self._container and op.type in self._registered_ops and
+        self._registered_ops[op.type].is_stateful):
+      try:
+        container_attr = op.get_attr("container")
+      except ValueError:
+        # "container" attribute is not in OpDef
+        pass
+      else:
+        if not container_attr:
+          op._set_attr("container", attr_value_pb2.AttrValue(  # pylint: disable=protected-access
+              s=compat.as_bytes(self._container)))
 
-    return ret
+  def _add_new_tf_operations(self, compute_devices=True):
+    """Creates `Operations` in this graph for any new TF_Operations.
+
+    This is useful for when TF_Operations are indirectly created by the C API
+    outside of the Operation constructor (e.g. by TF_ImportGraphDef,
+    TF_FinishWhile). This ensures there are corresponding Operations for all
+    TF_Operations in the underlying TF_Graph.
+
+    Args:
+      compute_devices: (Optional.) If True, device functions will be executed
+        to compute the device properties of each new Operation.
+
+    Returns:
+      A list of the new `Operation` objects.
+    """
+    # Create all Operation objects before accessing their inputs since an op may
+    # be created before its inputs.
+    new_ops = [
+        self._create_op_from_tf_operation(c_op, compute_device=compute_devices)
+        for c_op in c_api_util.new_tf_operations(self)
+    ]
+
+    for op in new_ops:
+      new_control_inputs = self._control_dependencies_for_inputs(op.inputs)
+      # pylint: disable=protected-access
+      op._add_control_inputs(new_control_inputs)
+      op._control_flow_post_processing()
+      # pylint: enable=protected-access
+
+    return new_ops
 
   def as_graph_element(self, obj, allow_tensor=True, allow_operation=True):
     """Returns the object referred to by `obj`, as an `Operation` or `Tensor`.
@@ -3306,6 +3498,10 @@ class Graph(object):
     with self._lock:
       return self._nodes_by_name[name]
 
+  def _get_operation_by_tf_operation(self, tf_oper):
+    op_name = c_api.TF_OperationName(tf_oper)
+    return self._get_operation_by_name_unsafe(op_name)
+
   def get_tensor_by_name(self, name):
     """Returns the `Tensor` with the given `name`.
 
@@ -3340,8 +3536,7 @@ class Graph(object):
     Returns:
       The `Tensor` that represents `tf_output`.
     """
-    op_name = c_api.TF_OperationName(tf_output.oper)
-    op = self._get_operation_by_name_unsafe(op_name)
+    op = self._get_operation_by_tf_operation(tf_output.oper)
     return op.outputs[tf_output.index]
 
   def _next_id(self):
@@ -4007,8 +4202,8 @@ class Graph(object):
         ret.add(op)
     return ret
 
-  def _control_dependencies_for_inputs(self, input_tensors):
-    """For an op that takes `input_tensors` as inputs, compute control inputs.
+  def _control_dependencies_for_inputs(self, input_ops):
+    """For an op that takes `input_ops` as inputs, compute control inputs.
 
     The returned control dependencies should yield an execution that
     is equivalent to adding all control inputs in
@@ -4019,13 +4214,12 @@ class Graph(object):
     the explicit approach redundant.
 
     Args:
-      input_tensors: The direct data dependencies for an op to be created.
+      input_ops: The data input ops for an op to be created.
 
     Returns:
       A list of control inputs for the op to be created.
     """
     ret = []
-    input_ops = set([t.op for t in input_tensors])
     for controller in self._control_dependencies_stack:
       # If any of the input_ops already depends on the inputs from controller,
       # we say that the new op is dominated (by that input), and we therefore
@@ -4648,10 +4842,71 @@ class _DefaultGraphStack(_DefaultStack):  # pylint: disable=protected-access
     super(_DefaultGraphStack, self).reset()
     self._global_default_graph = None
 
+  @tf_contextlib.contextmanager
+  def get_controller(self, default):
+    try:
+      context.context_stack.push(default.building_function, default.as_default)
+      with super(_DefaultGraphStack, self).get_controller(default) as g:
+        yield g
+    finally:
+      context.context_stack.pop()
+
 
 _default_graph_stack = _DefaultGraphStack()
 
 
+# pylint: disable=g-doc-return-or-yield,line-too-long
+@tf_contextlib.contextmanager
+def init_scope():
+  """A context manager that lifts ops out of control-flow scopes and function-building graphs.
+
+  There is often a need to lift variable initialization ops out of control-flow
+  scopes, function-building graphs, and gradient tapes. Entering an
+  `init_scope` is a mechanism for satisfying these desiderata. In particular,
+  entering an `init_scope` has three effects:
+
+    (1) All control dependencies are cleared the moment the scope is entered;
+        this is equivalent to entering the context manager returned from
+        `control_dependencies(None)`, which has the side-effect of exiting
+        control-flow scopes like `tf.cond` and `tf.while_loop`.
+
+    (2) All operations that are created while the scope is active are lifted
+        into the lowest context on the `context_stack` that is not building a
+        graph function. Here, a context is defined as either a graph or an eager
+        context. Every context switch, i.e., every installation of a graph as
+        the default graph and every switch into eager mode, is logged in a
+        thread-local stack called the `context_stack`; the log entry for a
+        context switch is popped from the stack when the context is exited.
+        Entering an `init_scope` is equivalent to crawling up the
+        `context_stack`, finding the first context that is not building a graph
+        function, and entering it.
+
+    (3) The gradient tape is paused while the scope is active.
+  """
+  # pylint: enable=g-doc-return-or-yield,line-too-long
+
+  outer_context = None
+  if not context.context_stack.stack:
+    # This is correct because of an invariant: the stack is
+    # empty if and only if eager execution has not been enabled.
+    outer_context = get_default_graph().as_default
+  else:
+    for stack_entry in reversed(context.context_stack.stack):
+      if not stack_entry.is_building_function:
+        outer_context = stack_entry.enter_context_fn
+        break
+
+  if outer_context is None:
+    raise AssertionError("All graphs are building functions, and no "
+                         "eager context was previously active.")
+
+  try:
+    with outer_context(), control_dependencies(None), tape.stop_recording():
+      yield
+  finally:
+    pass
+
+
 def enable_eager_execution(config=None, device_policy=None):
   """Enables, for the rest of the lifetime of this program, eager execution.
 
@@ -4686,6 +4941,16 @@ def enable_eager_execution(config=None, device_policy=None):
      or if trying to create a context with nontrivial options which differ
      from those of the existing context.
   """
+  if config is not None and not isinstance(config, config_pb2.ConfigProto):
+    raise TypeError(
+        "config must be a tf.ConfigProto, but got %s" % type(config))
+  if device_policy not in (None, context.DEVICE_PLACEMENT_EXPLICIT,
+                           context.DEVICE_PLACEMENT_WARN,
+                           context.DEVICE_PLACEMENT_SILENT):
+    raise ValueError(
+        "device_policy must be one of None, tfe.DEVICE_PLACEMENT_EXPLICIT, "
+        "tfe.DEVICE_PLACEMENT_WARN, tfe.DEVICE_PLACEMENT_SILENT"
+    )
   # pylint: disable=protected-access
   if context._default_mode == context.GRAPH_MODE:
     graph_mode_has_been_used = (
@@ -4698,6 +4963,13 @@ def enable_eager_execution(config=None, device_policy=None):
   if context._context is None:
     context._context = context.Context(config=config,
                                        device_policy=device_policy)
+    if context.context_stack.stack:
+      raise AssertionError("Invariant violated: The context stack must "
+                           "be empty when eager execution is enabled.")
+    # Log that eager execution has been enabled by pushing an entry onto the
+    # context stack; this entry won't ever be popped, as it's impossible to
+    # disable eager execution
+    context.context_stack.push(False, context.eager_mode)
   elif ((config is not None and config is not context._context._config)
         or (device_policy is not None
             and device_policy is not context._context._device_policy)):
@@ -4707,6 +4979,9 @@ def enable_eager_execution(config=None, device_policy=None):
                      " policy: %s." % (config, context._context._config,
                                        device_policy,
                                        context._context._device_policy))
+  else:
+    raise ValueError(
+        "tfe.enable_eager_execution has to be called at program startup.")
 
 
 def eager_run(main=None, argv=None):
@@ -4995,6 +5270,9 @@ class GraphKeys(object):
   COND_CONTEXT = "cond_context"
   WHILE_CONTEXT = "while_context"
 
+  # Used to store v2 summary names.
+  _SUMMARY_COLLECTION = "_SUMMARY_V2"
+
   # List of all collections that keep track of variables.
   _VARIABLE_COLLECTIONS = [
       GLOBAL_VARIABLES,
@@ -5167,11 +5445,18 @@ class name_scope(object):  # pylint: disable=invalid-name
     """
     if self._in_eager_mode:
       self._old_name = self._ctx.scope_name
-      if self._name:
-        scope_name = (self._old_name + self._name + "/"
-                      if self._old_name else self._name + "/")
-      else:
+      if not self._name:
         scope_name = ""
+      else:
+        if self._name[-1] == "/":
+          # A trailing slash breaks out of nested name scopes, indicating a
+          # fully specified scope name, for compatibility with Graph.name_scope.
+          scope_name = self._name
+        else:
+          name_with_trailing_slash = self._name + "/"
+          scope_name = (
+              self._old_name + name_with_trailing_slash
+              if self._old_name else name_with_trailing_slash)
       self._ctx.scope_name = scope_name
       return scope_name
     else:
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index 1be306ddc598e3ea442bd1ac7e3ed3c951c71505..bfaddefc46c197624f569481470d8be3abe2ea85 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -26,6 +26,7 @@ from tensorflow.core.framework import types_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function as eager_function
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import device as pydev
@@ -43,6 +44,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import resources
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
@@ -80,7 +82,7 @@ class ResourceTest(test_util.TensorFlowTestCase):
 
 
 @test_util.with_c_api
-class TensorTest(test_util.TensorFlowTestCase):
+class TensorAndShapeTest(test_util.TensorFlowTestCase):
 
   def testShape(self):
     op = ops.Operation(
@@ -99,6 +101,44 @@ class TensorTest(test_util.TensorFlowTestCase):
       for _ in t:
         pass
 
+  def testAddShape(self):
+    with self.test_session():
+      a = array_ops.zeros([2, 3])
+      b = array_ops.ones([1, 3])
+      c = a + b
+      self.assertEqual([2, 3], c.shape)
+
+  def testUnknownDim(self):
+    with self.test_session():
+      a = array_ops.placeholder(dtype=dtypes.float32, shape=[2, None, 3])
+      b = array_ops.placeholder(dtype=dtypes.float32, shape=[2, None, 3])
+      c = a + b
+      self.assertEqual([2, None, 3], c.shape.as_list())
+
+  def testUnknownShape(self):
+    with self.test_session():
+      a = array_ops.placeholder(dtype=dtypes.float32, shape=None)
+      b = array_ops.ones([1, 3])
+      c = a + b
+      self.assertEqual(tensor_shape.unknown_shape(), c.shape)
+
+  def testScalarShape(self):
+    with self.test_session():
+      a = array_ops.placeholder(dtype=dtypes.float32, shape=[])
+      b = array_ops.ones([])
+      c = a + b
+      self.assertEqual(tensor_shape.scalar(), c.shape)
+
+  def testShapeFunctionError(self):
+    with self.test_session():
+      a = array_ops.ones([1, 2, 3])
+      b = array_ops.ones([4, 5, 6])
+      with self.assertRaisesRegexp(
+          ValueError,
+          r"Dimensions must be equal, but are 2 and 5 for 'add' \(op: 'Add'\) "
+          r"with input shapes: \[1,2,3\], \[4,5,6\]."):
+        _ = a + b
+
 
 @test_util.with_c_api
 class IndexedSlicesTest(test_util.TensorFlowTestCase):
@@ -165,13 +205,13 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(dtypes.float32, float_t.dtype)
     self.assertEqual(op, float_t.op)
     self.assertEqual(0, float_t._value_index)
-    self.assertEqual(0, len(float_t._consumers))
+    self.assertEqual(0, len(float_t.consumers()))
     self.assertEqual("myop", float_t._as_node_def_input())
 
     self.assertEqual(dtypes.string, label_str_t.dtype)
     self.assertEqual(op, label_str_t.op)
     self.assertEqual(1, label_str_t._value_index)
-    self.assertEqual(0, len(label_str_t._consumers))
+    self.assertEqual(0, len(label_str_t.consumers()))
     self.assertEqual("myop:1", label_str_t._as_node_def_input())
 
     self.assertProtoEquals("op:'FloatOutputStringOutput' name:'myop'",
@@ -185,8 +225,8 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(1, len(op2.inputs))
     self.assertIs(float_t, op2.inputs[0])
 
-    self.assertEqual(1, len(float_t._consumers))
-    self.assertEqual(op2, float_t._consumers[0])
+    self.assertEqual(1, len(float_t.consumers()))
+    self.assertEqual(op2, float_t.consumers()[0])
 
     self.assertProtoEquals("op:'FloatOutput' name:'myop1'", op1.node_def)
     self.assertProtoEquals("op:'FloatInput' name:'myop2' input:'myop1'",
@@ -205,14 +245,14 @@ class OperationTest(test_util.TensorFlowTestCase):
     op3 = test_ops.foo2(float1_t, label2_str_t, label2_str_t, name="myop3").d.op
     self.assertEqual(2, len(op3.values()))
 
-    self.assertEqual(1, len(float1_t._consumers))
-    self.assertEqual(op3, float1_t._consumers[0])
+    self.assertEqual(1, len(float1_t.consumers()))
+    self.assertEqual(op3, float1_t.consumers()[0])
 
-    self.assertEqual(0, len(float2_t._consumers))
+    self.assertEqual(0, len(float2_t.consumers()))
 
-    self.assertEqual(2, len(label2_str_t._consumers))
-    self.assertEqual(op3, label2_str_t._consumers[0])
-    self.assertEqual(op3, label2_str_t._consumers[1])
+    self.assertEqual(2, len(label2_str_t.consumers()))
+    self.assertEqual(op3, label2_str_t.consumers()[0])
+    self.assertEqual(op3, label2_str_t.consumers()[1])
 
     self.assertProtoEquals("""
     op:'Foo2' name:'myop3'
@@ -236,18 +276,23 @@ class OperationTest(test_util.TensorFlowTestCase):
     op1 = ops.Operation(
         ops._NodeDef("RefOutputFloatOutput", "op1"), g, [],
         [dtypes.float32_ref, dtypes.float32])
+    g._add_op(op1)
     self.assertProtoEquals("op:'RefOutputFloatOutput' name:'op1'", op1.node_def)
+    self.assertEquals([], list(op1.inputs))
     ref_t, nonref_t = op1.values()
     # NOTE(mrry): Must specify input_types to preserve ref-typed input.
     op2 = ops.Operation(
         ops._NodeDef("RefInputFloatInput", "op2"),
         g, [ref_t, nonref_t], [],
         input_types=[dtypes.float32_ref, dtypes.float32])
+    g._add_op(op2)
     self.assertProtoEquals(
         "op:'RefInputFloatInput' name:'op2' input:'op1' input:'op1:1'",
         op2.node_def)
+    self.assertEquals([ref_t, nonref_t], list(op2.inputs))
     op3 = ops.Operation(
         ops._NodeDef("TwoFloatInputs", "op3"), g, [ref_t, nonref_t], [])
+    g._add_op(op3)
     self.assertProtoEquals(
         "op:'TwoFloatInputs' name:'op3' input:'op1' input:'op1:1'",
         op3.node_def)
@@ -442,6 +487,30 @@ class OperationTest(test_util.TensorFlowTestCase):
     z._add_control_inputs([x, y, y])  # pylint: disable=protected-access
     self.assertEqual(z.control_inputs, [x, y])
 
+  def testRemoveAllControlInputs(self):
+    a = constant_op.constant(1)
+    with ops.control_dependencies([a]):
+      b = constant_op.constant(2)
+    c = constant_op.constant(3)
+    d = constant_op.constant(4)
+    e = constant_op.constant(5)
+    with ops.control_dependencies([a, c]):
+      f = d + e
+
+    self.assertEqual(a.op.control_inputs, [])
+    self.assertEqual(b.op.control_inputs, [a.op])
+    self.assertEqual(f.op.control_inputs, [a.op, c.op])
+
+    a.op._remove_all_control_inputs()  # pylint: disable=protected-access
+    self.assertEqual(a.op.control_inputs, [])
+
+    b.op._remove_all_control_inputs()  # pylint: disable=protected-access
+    self.assertEqual(b.op.control_inputs, [])
+
+    f.op._remove_all_control_inputs()  # pylint: disable=protected-access
+    self.assertEqual(f.op.control_inputs, [])
+    self.assertEqual(list(f.op.inputs), [d, e])
+
   def testControlInputCycle(self):
     # Non-C API path has a different error message
     if not ops._USE_C_API: return
@@ -468,16 +537,22 @@ class OperationTest(test_util.TensorFlowTestCase):
 
     z.op._update_input(0, y)  # pylint: disable=protected-access
     self.assertEquals(list(z.op.inputs), [y, y])
+    self.assertEquals(x.consumers(), [])
+    self.assertEquals(y.consumers(), [z.op, z.op])
     with session.Session(graph=g) as sess:
       self.assertEquals(sess.run(z), 4)
 
     z.op._update_input(0, x)  # pylint: disable=protected-access
     self.assertEquals(list(z.op.inputs), [x, y])
+    self.assertEquals(x.consumers(), [z.op])
+    self.assertEquals(y.consumers(), [z.op])
     with session.Session(graph=g) as sess:
       self.assertEquals(sess.run(z), 3)
 
     z.op._update_input(1, y)  # pylint: disable=protected-access
     self.assertEquals(list(z.op.inputs), [x, y])
+    self.assertEquals(x.consumers(), [z.op])
+    self.assertEquals(y.consumers(), [z.op])
     with session.Session(graph=g) as sess:
       self.assertEquals(sess.run(z), 3)
 
@@ -575,6 +650,25 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(len(z.op.op_def.input_arg), 2)
     self.assertEqual(len(z.op.op_def.output_arg), 1)
 
+  def testInputFromDifferentGraphError(self):
+    g_0 = ops.Graph()
+    g_1 = ops.Graph()
+    with g_0.as_default():
+      x = constant_op.constant(1)
+    with g_1.as_default():
+      y = constant_op.constant(2)
+      with self.assertRaisesRegexp(ValueError, "must be from the same graph"):
+        y * x  # pylint: disable=pointless-statement
+
+  def testInputsAreImmutable(self):
+    g = ops.Graph()
+    with g.as_default():
+      x = test_ops.int_output()
+      op = test_ops.int_input_int_output(x, name="myop").op
+    with self.assertRaisesRegexp(
+        AttributeError, "'_InputList' object has no attribute 'append'"):
+      op.inputs.append(None)
+
 
 @test_util.with_c_api
 class CreateOpTest(test_util.TensorFlowTestCase):
@@ -634,6 +728,200 @@ class CreateOpTest(test_util.TensorFlowTestCase):
     g.create_op("FloatOutput", [], [dtypes.float32], None, name="myop1")
 
 
+# NOTE(skyewm): these cases test the private Graph._create_op_from_tf_operation
+# method. Arguably we should only test the public APIs that depend on this
+# method. However, this logic is complex and tricky, and it can be difficult to
+# ascertain if we have adequate coverage (e.g. a graph may run successfully if
+# the control flow context isn't set properly, but a more complicated use case
+# that might not be obvious to test will fail). Thus we instead explicitly test
+# the low-level behavior.
+@test_util.with_c_api
+class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
+
+  def testBasic(self):
+    g = ops.Graph()
+    with g.as_default():
+      x = test_ops.int_output()
+      if ops._USE_C_API:
+        c_op = ops._create_c_op(
+            g, ops._NodeDef("IntInputIntOutput", "myop"), [x], [])
+        op = g._create_op_from_tf_operation(c_op)
+      else:
+        # Test pure-Python version to make sure C API has same behavior.
+        op = test_ops.int_input_int_output(x, name="myop").op
+
+    self.assertEqual(op.name, "myop")
+    self.assertEqual(op.type, "IntInputIntOutput")
+    self.assertEqual(len(op.outputs), 1)
+    self.assertEqual(op.outputs[0].shape, tensor_shape.unknown_shape())
+    self.assertEqual(list(op.inputs), [x])
+    self.assertEqual(op.control_inputs, [])
+    self.assertEqual(op.graph, g)
+    self.assertEqual(x.consumers(), [op])
+    self.assertIsNotNone(op.traceback)
+    self.assertEqual(g.get_operation_by_name("myop"), op)
+    self.assertEqual(g.get_tensor_by_name("myop:0"), op.outputs[0])
+
+  def testShape(self):
+    g = ops.Graph()
+    with g.as_default():
+      x = constant_op.constant([[1, 2, 3], [4, 5, 6]])
+      if ops._USE_C_API:
+        c_op = ops._create_c_op(g, ops._NodeDef("Identity", "myop"), [x], [])
+        op = g._create_op_from_tf_operation(c_op)
+      else:
+        # Test pure-Python version to make sure C API has same behavior.
+        op = array_ops.identity(x, name="myop").op
+
+    self.assertEqual(op.name, "myop")
+    self.assertEqual(op.type, "Identity")
+    self.assertEqual(len(op.outputs), 1)
+    self.assertEqual(op.outputs[0].shape, tensor_shape.matrix(2, 3))
+
+  def testUniqueName(self):
+    g = ops.Graph()
+    with g.as_default():
+      if ops._USE_C_API:
+        c_op = ops._create_c_op(g, ops._NodeDef("IntOutput", "myop"), [], [])
+        c_op2 = ops._create_c_op(g, ops._NodeDef("IntOutput", "myop_1"), [], [])
+        op = g._create_op_from_tf_operation(c_op)
+        op2 = g._create_op_from_tf_operation(c_op2)
+      else:
+        # Test pure-Python version to make sure C API has same behavior.
+        op = test_ops.int_output(name="myop").op
+        op2 = test_ops.int_output(name="myop_1").op
+
+      # Create ops with same names as op1 and op2. We expect the new names to be
+      # uniquified.
+      op3 = test_ops.int_output(name="myop").op
+      op4 = test_ops.int_output(name="myop_1").op
+
+    self.assertEqual(op.name, "myop")
+    self.assertEqual(op2.name, "myop_1")
+    self.assertEqual(op3.name, "myop_2")
+    self.assertEqual(op4.name, "myop_1_1")
+
+  def testCond(self):
+    g = ops.Graph()
+    with g.as_default():
+      x = test_ops.int_output()
+
+      def true_fn():
+        if ops._USE_C_API:
+          ops._create_c_op(ops.get_default_graph(),
+                           ops._NodeDef("IntInput", "cond/myop"), [x], [])
+          new_ops = g._add_new_tf_operations()
+          self.assertEqual(len(new_ops), 1)
+        else:
+          # Test pure-Python version to make sure C API has same behavior.
+          test_ops.int_input(x, name="myop")
+        return x
+
+      control_flow_ops.cond(x < 10, true_fn, lambda: x)
+
+    op = g.get_operation_by_name("cond/myop")
+    self.assertIsNotNone(op)
+    self.assertEqual(op.name, "cond/myop")
+    self.assertEqual(op.type, "IntInput")
+    self.assertEqual(op.outputs, [])
+    op_input = op.inputs[0].op
+    self.assertEqual(op_input.type, "Switch")
+    self.assertEqual(op_input.inputs[0], x)
+    self.assertEqual(op.graph, g)
+    # pylint: disable=protected-access
+    self.assertIsNotNone(op._get_control_flow_context())
+    self.assertEqual(op._get_control_flow_context().name,
+                     "cond/cond_text")
+    # pylint: enable=protected-access
+
+  def testWhileLoop(self):
+    g = ops.Graph()
+    with g.as_default():
+      x = test_ops.int_output()
+
+      def body(i):
+        if ops._USE_C_API:
+          ops._create_c_op(ops.get_default_graph(),
+                           ops._NodeDef("IntInput", "myloop/myop"), [x], [])
+          new_ops = g._add_new_tf_operations()
+          self.assertEqual(len(new_ops), 1)
+        else:
+          # Test pure-Python version to make sure C API has same behavior.
+          test_ops.int_input(x, name="myop")
+        return i
+
+      control_flow_ops.while_loop(lambda i: i < 10, body, [0], name="myloop")
+
+    op = g.get_operation_by_name("myloop/myop")
+    self.assertIsNotNone(op)
+    self.assertEqual(op.name, "myloop/myop")
+    self.assertEqual(op.type, "IntInput")
+    self.assertEqual(op.outputs, [])
+    op_input = op.inputs[0].op
+    self.assertEqual(op_input.type, "Enter")
+    self.assertEqual(list(op_input.inputs), [x])
+    self.assertEqual(op.graph, g)
+    # pylint: disable=protected-access
+    self.assertIsNotNone(op._get_control_flow_context())
+    self.assertEqual(op._get_control_flow_context().name,
+                     "myloop/while_context")
+    # pylint: enable=protected-access
+
+  def testWhileLoopWithInternalControlDep(self):
+    g = ops.Graph()
+    with g.as_default():
+      x = test_ops.int_output()
+
+      def body(i):
+        c = constant_op.constant(1.0, name="c")
+        if ops._USE_C_API:
+          ops._create_c_op(ops.get_default_graph(),
+                           ops._NodeDef("IntInput", "myloop/myop"), [x], [])
+          with ops.control_dependencies([c]):
+            new_ops = g._add_new_tf_operations()
+            self.assertEqual(len(new_ops), 1)
+        else:
+          with ops.control_dependencies([c]):
+            test_ops.int_input(x, name="myop")
+        return i
+
+      control_flow_ops.while_loop(lambda i: i < 10, body, [0], name="myloop")
+
+    op = g.get_operation_by_name("myloop/myop")
+    self.assertIsNotNone(op)
+    c = g.get_operation_by_name("myloop/c")
+    self.assertIsNotNone(c)
+    # Internal control dep is preserved
+    self.assertEqual(op.control_inputs, [c])
+
+  def testWhileLoopWithExternalControlDep(self):
+    g = ops.Graph()
+    with g.as_default():
+      x = test_ops.int_output()
+      c = constant_op.constant(1.0)
+
+      def body(i):
+        if ops._USE_C_API:
+          ops._create_c_op(ops.get_default_graph(),
+                           ops._NodeDef("IntInput", "myloop/myop"), [x], [])
+          with ops.control_dependencies([c]):
+            new_ops = g._add_new_tf_operations()
+            self.assertEqual(len(new_ops), 1)
+        else:
+          with ops.control_dependencies([c]):
+            test_ops.int_input(x, name="myop")
+        return i
+
+      control_flow_ops.while_loop(lambda i: i < 10, body, [0], name="myloop")
+
+    op = g.get_operation_by_name("myloop/myop")
+    self.assertIsNotNone(op)
+    self.assertEqual(len(op.control_inputs), 1)
+    # External control dep is removed and replaced with internal control dep
+    self.assertNotEqual(op.control_inputs[0], c.op)
+    self.assertIsNotNone(op.control_inputs[0]._get_control_flow_context())
+
+
 @test_util.with_c_api
 class ApplyOpTest(test_util.TensorFlowTestCase):
 
@@ -1294,6 +1582,29 @@ class ControlDependenciesTest(test_util.TensorFlowTestCase):
     # e should be dominated by c.
     self.assertEqual(e.op.control_inputs, [])
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testEager(self):
+    def future():
+      future.calls += 1
+      return constant_op.constant(2.0)
+    future.calls = 0
+
+    if context.in_graph_mode():
+      g = ops.Graph()
+      with g.as_default():
+        a = constant_op.constant(1.0)
+        b = future()
+        with g.control_dependencies([a, b]):
+          c = constant_op.constant(3.0)
+      self.assertEqual(c.op.control_inputs, [a.op, b.op])
+      self.assertEqual(future.calls, 1)
+    else:
+      a = constant_op.constant(1.0)
+      b = future()
+      with ops.control_dependencies([a, b]):
+        c = constant_op.constant(3.0)
+      self.assertEqual(future.calls, 1)
+
   def testBasicWithConversion(self):
     g = ops.Graph()
     a = _apply_op(g, "FloatOutput", [], [dtypes.float32])
@@ -1457,6 +1768,37 @@ class ControlDependenciesTest(test_util.TensorFlowTestCase):
 @test_util.with_c_api
 class OpScopeTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testNames(self):
+    with ops.name_scope("foo") as foo:
+      self.assertEqual("foo/", foo)
+      with ops.name_scope("foo2") as foo2:
+        self.assertEqual("foo/foo2/", foo2)
+      with ops.name_scope(None) as empty1:
+        self.assertEqual("", empty1)
+        with ops.name_scope("foo3") as foo3:
+          self.assertEqual("foo3/", foo3)
+      with ops.name_scope("") as empty2:
+        self.assertEqual("", empty2)
+    with ops.name_scope("foo/") as outer_foo:
+      self.assertEqual("foo/", outer_foo)
+      with ops.name_scope("") as empty3:
+        self.assertEqual("", empty3)
+      with ops.name_scope("foo4") as foo4:
+        self.assertEqual("foo/foo4/", foo4)
+      with ops.name_scope("foo5//") as foo5:
+        self.assertEqual("foo5//", foo5)
+        with ops.name_scope("foo6") as foo6:
+          self.assertEqual("foo5//foo6/", foo6)
+      with ops.name_scope("/") as foo7:
+        self.assertEqual("/", foo7)
+      with ops.name_scope("//") as foo8:
+        self.assertEqual("//", foo8)
+      with ops.name_scope("a//b/c") as foo9:
+        self.assertEqual("foo/a//b/c/", foo9)
+    with ops.name_scope("a//b/c") as foo10:
+      self.assertEqual("a//b/c/", foo10)
+
   @test_util.run_in_graph_and_eager_modes()
   def testEagerDefaultScopeName(self):
     with ops.name_scope(None, "default") as scope:
@@ -1537,6 +1879,195 @@ class OpScopeTest(test_util.TensorFlowTestCase):
     self._testGraphElements([a, variable, b])
 
 
+class InitScopeTest(test_util.TensorFlowTestCase):
+
+  def testClearsControlDependencies(self):
+    g = ops.Graph()
+    a_1 = _apply_op(g, "FloatOutput", [], [dtypes.float32])
+    a_2 = _apply_op(g, "FloatOutput", [], [dtypes.float32])
+    a_3 = _apply_op(g, "FloatOutput", [], [dtypes.float32])
+    a_4 = _apply_op(g, "FloatOutput", [], [dtypes.float32])
+
+    with g.as_default():
+      with g.control_dependencies([a_1]):
+        with g.control_dependencies([a_2]):
+          with ops.init_scope():
+            with g.control_dependencies([a_3]):
+              with g.control_dependencies([a_4]):
+                # deps [a_3, a_4]
+                b_3_4 = _apply_op(g, "FloatOutput", [], [dtypes.float32])
+              # deps = [a_3]
+              b_3 = _apply_op(g, "FloatOutput", [], [dtypes.float32])
+            # deps back to None
+            b_none = _apply_op(g, "FloatOutput", [], [dtypes.float32])
+          # deps back to [a_1, a_2]
+          b_1_2 = _apply_op(g, "FloatOutput", [], [dtypes.float32])
+        # deps back to [a_1]
+        b_1 = _apply_op(g, "FloatOutput", [], [dtypes.float32])
+        with ops.init_scope():
+          # deps are None again
+          b_none2 = _apply_op(g, "FloatOutput", [], [dtypes.float32])
+
+    self.assertItemsEqual([a_3.op, a_4.op], b_3_4.op.control_inputs)
+    self.assertItemsEqual([a_3.op], b_3.op.control_inputs)
+    self.assertItemsEqual([], b_none.op.control_inputs)
+    self.assertItemsEqual([a_1.op, a_2.op], b_1_2.op.control_inputs)
+    self.assertItemsEqual([a_1.op], b_1.op.control_inputs)
+    self.assertItemsEqual([], b_none2.op.control_inputs)
+
+  def testLiftsOpsFromFunctions(self):
+    g0 = ops.Graph()
+    g1 = ops.Graph()
+    g1._building_function = True  # pylint: disable=protected-access
+    g2 = ops.Graph()
+    g2._building_function = True  # pylint: disable=protected-access
+
+    with g0.as_default():
+      with g1.as_default():
+        with g2.as_default():
+          with ops.init_scope():
+            _ = constant_op.constant(1.0)
+
+    self.assertEqual(len(g2.get_operations()), 0)
+    self.assertEqual(len(g1.get_operations()), 0)
+    self.assertEqual(len(g0.get_operations()), 1)
+
+  def testComposes(self):
+    g0 = ops.Graph()
+    g1 = ops.Graph()
+    g1._building_function = True  # pylint: disable=protected-access
+    g2 = ops.Graph()
+    g2._building_function = True  # pylint: disable=protected-access
+    g3 = ops.Graph()
+    g3._building_function = False  # pylint: disable=protected-access
+
+    with g0.as_default():
+      with g1.as_default():
+        with ops.init_scope():
+          # This op should be lifted into g0.
+          _ = constant_op.constant(1.0)
+          self.assertIs(g0, ops.get_default_graph())
+          self.assertEqual(len(g2.get_operations()), 0)
+          self.assertEqual(len(g1.get_operations()), 0)
+          self.assertEqual(len(g0.get_operations()), 1)
+        with g2.as_default():
+          with ops.init_scope():
+            # This op should be lifted into g0.
+            _ = constant_op.constant(1.0)
+            self.assertIs(g0, ops.get_default_graph())
+            with g3.as_default():
+              with ops.init_scope():
+                # This op should be lifted into g3, because g3 is not building a
+                # function.
+                _ = constant_op.constant(1.0)
+                self.assertIs(g3, ops.get_default_graph())
+
+    self.assertEqual(len(g3.get_operations()), 1)
+    self.assertEqual(len(g2.get_operations()), 0)
+    self.assertEqual(len(g1.get_operations()), 0)
+    self.assertEqual(len(g0.get_operations()), 2)
+
+  def testEscapesToEagerContext(self):
+    g = ops.Graph()
+    g._building_function = True  # pylint: disable=protected-access
+    with context.eager_mode():
+      with context.graph_mode():
+        with g.as_default():
+          with ops.init_scope():
+            # Because g is building a function, init_scope should
+            # escape out to the eager context.
+            self.assertTrue(context.in_eager_mode())
+          # g should be reinstated as the default graph, and the
+          # graph context should be re-entered.
+          self.assertIs(g, ops.get_default_graph())
+          self.assertTrue(context.in_graph_mode())
+
+  def testAllGraphsBuildingFunctionsRaisesError(self):
+    g = ops.Graph()
+    g._building_function = True  # pylint: disable=protected-access
+    with g.as_default():
+      with self.assertRaises(AssertionError):
+        with ops.init_scope():
+          pass
+
+  def testStaysInEagerWhenOnlyEagerContextActive(self):
+    with context.eager_mode():
+      with ops.init_scope():
+        self.assertTrue(context.eager_mode())
+      self.assertTrue(context.eager_mode())
+
+  def testEscapesDefunWhenInEagerMode(self):
+
+    def function_with_variables():
+      with ops.init_scope():
+        v = resource_variable_ops.ResourceVariable(3)
+      return v.assign_add(1)
+
+    with context.eager_mode():
+      # Each invocation of function_with_variables recreates a variable.
+      self.assertEqual(4, int(function_with_variables()))
+      self.assertEqual(4, int(function_with_variables()))
+
+      compiled = eager_function.defun(function_with_variables)
+      # The init_scope in function_with_variables lifts the variable out
+      # of the graph function constructed by defun; hence,
+      # compiled now appears to be stateful.
+      self.assertEqual(4, int(compiled()))
+      self.assertEqual(5, int(compiled()))
+
+  def testEscapesDefunWhenInGraphMode(self):
+    def function_with_variables(name):
+      with ops.init_scope():
+        _ = variable_scope.get_variable(name, shape=(1,))
+
+    g = ops.Graph()
+    with g.as_default():
+      with self.test_session():
+        # First ensure that graphs that are not building functions are
+        # not escaped.
+        function_with_variables("foo")
+        with self.assertRaisesRegexp(ValueError,
+                                     r"Variable foo already exists.*"):
+          # This will fail because reuse is not set to True.
+          function_with_variables("foo")
+
+        compiled = eager_function.defun(function_with_variables)
+        compiled("bar")
+        self.assertEqual(
+            len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)), 2)
+
+        # The second call to `compiled` should not create variables: the
+        # init_scope has lifted the variable creation code out of the defun.
+        compiled("bar")
+        self.assertEqual(
+            len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)), 2)
+
+  def testEscapesNestedDefun(self):
+
+    def inner_function():
+      with ops.init_scope():
+        v = resource_variable_ops.ResourceVariable(1)
+      return v.assign_add(2)
+
+    def outer_function(inner=None):
+      with ops.init_scope():
+        v0 = resource_variable_ops.ResourceVariable(0)
+      return v0.assign_add(1) + inner()
+
+    with context.eager_mode():
+      # Each invocation of outer_function recreates variables.
+      self.assertEqual(4, int(outer_function(inner=inner_function)))
+      self.assertEqual(4, int(outer_function(inner=inner_function)))
+
+      compiled_inner = eager_function.defun(inner_function)
+      compiled_outer = eager_function.defun(outer_function)
+      # The init_scope lifts variables out of the graph functions
+      # constructed by defun; hence, compiled_outer should now appear to be
+      # stateful.
+      self.assertEqual(4, int(compiled_outer(inner=compiled_inner)))
+      self.assertEqual(7, int(compiled_outer(inner=compiled_inner)))
+
+
 @test_util.with_c_api
 class GraphTest(test_util.TensorFlowTestCase):
 
@@ -1571,6 +2102,20 @@ class GraphTest(test_util.TensorFlowTestCase):
       self._AssertDefault(g0)
     self._AssertDefault(orig)
 
+  def testPreventFeeding(self):
+    g = ops.Graph()
+    a = constant_op.constant(2.0)
+    self.assertTrue(g.is_feedable(a))
+    g.prevent_feeding(a)
+    self.assertFalse(g.is_feedable(a))
+
+  def testPreventFetching(self):
+    g = ops.Graph()
+    a = constant_op.constant(2.0)
+    self.assertTrue(g.is_fetchable(a))
+    g.prevent_fetching(a.op)
+    self.assertFalse(g.is_fetchable(a))
+
   def testAsGraphElementConversions(self):
 
     class ConvertibleObj(object):
@@ -1614,6 +2159,24 @@ class GraphTest(test_util.TensorFlowTestCase):
     gc.collect()
     self.assertIsNone(g_ref())
 
+  def testRunnableAfterInvalidShape(self):
+    with ops.Graph().as_default():
+      with self.assertRaises(ValueError):
+        math_ops.add([1, 2], [1, 2, 3])
+      a = constant_op.constant(1)
+      with session.Session() as sess:
+        sess.run(a)
+
+  def testRunnableAfterInvalidShapeWithKernelLabelMap(self):
+    g = ops.Graph()
+    with g.as_default():
+      with g._kernel_label_map({"KernelLabelRequired": "overload_1"}):
+        with self.assertRaises(ValueError):
+          test_ops.kernel_label_required(1)
+      a = constant_op.constant(1)
+      with session.Session() as sess:
+        sess.run(a)
+
 
 @test_util.with_c_api
 class AttrScopeTest(test_util.TensorFlowTestCase):
@@ -1628,7 +2191,6 @@ class AttrScopeTest(test_util.TensorFlowTestCase):
       b = compat.as_text(x.get_attr("_B"))
     except ValueError:
       b = None
-    print(a, b)
     return (a, b)
 
   def testNoLabel(self):
@@ -1934,7 +2496,7 @@ class DenseTensorLikeTypeTest(test_util.TensorFlowTestCase):
 
   def testSuccess(self):
     op = ops.Operation(
-        ops._NodeDef("None", "myop"), ops.Graph(), [], [dtypes.float32])
+        ops._NodeDef("FloatOutput", "myop"), ops.Graph(), [], [dtypes.float32])
     t = op.outputs[0]
     self.assertTrue(ops.is_dense_tensor_like(t))
 
@@ -2133,6 +2695,13 @@ class InputTypesTest(test_util.TensorFlowTestCase):
       self.assertEqual([dtypes.double, dtypes.double], z.op._input_dtypes)
       # pylint: enable=protected-access
 
+  def testBadArgumentsToEnableEagerExecution(self):
+    with self.assertRaisesRegexp(TypeError, "config must be a tf.ConfigProto"):
+      ops.enable_eager_execution(context.DEVICE_PLACEMENT_SILENT)
+    with self.assertRaisesRegexp(ValueError, "device_policy must be one of"):
+      c = config_pb2.ConfigProto()
+      ops.enable_eager_execution(c, c)
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc
index c57f0a98421fa88e5faa870157116c1617c19620..72d3ea90fd60dd532ecd71ba4257f651db963625 100644
--- a/tensorflow/python/framework/python_op_gen.cc
+++ b/tensorflow/python/framework/python_op_gen.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <stdio.h>
 #include <sstream>
 #include <unordered_map>
+#include "tensorflow/core/framework/api_def.pb.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb_text.h"
@@ -480,15 +481,15 @@ string GenPythonOp::Code() {
   }
   // This has all the input args followed by those attrs that don't have
   // defaults.
-  std::vector<string> args_no_default;
+  std::vector<ParamNames> params_no_default;
   // The parameters with defaults (these have to be listed after those without).
   // No input args are included, just attrs.
-  std::vector<string> args_with_defaults;
+  std::vector<ParamNames> params_with_default;
 
   for (int i = 0; i < api_def_.arg_order_size(); ++i) {
     const auto& arg = *FindInputArg(api_def_.arg_order(i), op_def_);
     const auto& api_def_arg = *FindInputArg(api_def_.arg_order(i), api_def_);
-    args_no_default.push_back(api_def_arg.rename_to());
+    params_no_default.emplace_back(api_def_arg.name(), api_def_arg.rename_to());
     if (!arg.type_attr().empty()) {
       gtl::InsertIfNotPresent(&inferred_attrs_, arg.type_attr(), arg.name());
     } else if (!arg.type_list_attr().empty()) {
@@ -504,9 +505,9 @@ string GenPythonOp::Code() {
     // Do not add inferred attrs to the Python function signature.
     if (inferred_attrs_.find(attr.name()) == inferred_attrs_.end()) {
       if (attr.has_default_value()) {
-        args_with_defaults.push_back(attr.rename_to());
+        params_with_default.emplace_back(attr.name(), attr.rename_to());
       } else {
-        args_no_default.push_back(attr.rename_to());
+        params_no_default.emplace_back(attr.name(), attr.rename_to());
       }
     }
   }
@@ -515,27 +516,30 @@ string GenPythonOp::Code() {
   // those with defaults go at the end.
   // Get the attrs in the order we want by taking the attrs without defaults
   // from the end of args_no_default, and adding args_no_default.
-  attrs_.reserve(args_no_default.size() - op_def_.input_arg_size() +
-                 args_with_defaults.size());
-  attrs_.insert(attrs_.end(),
-                args_no_default.begin() + op_def_.input_arg_size(),
-                args_no_default.end());
-  attrs_.insert(attrs_.end(), args_with_defaults.begin(),
-                args_with_defaults.end());
-
-  param_names_.reserve(args_no_default.size() + args_with_defaults.size());
+  attrs_.reserve(params_no_default.size() - op_def_.input_arg_size() +
+                 params_with_default.size());
+  for (int i = op_def_.input_arg_size(); i < params_no_default.size(); ++i) {
+    attrs_.push_back(params_no_default[i].GetName());
+  }
+  for (int i = 0; i < params_with_default.size(); ++i) {
+    attrs_.push_back(params_with_default[i].GetName());
+  }
+
+  param_names_.reserve(params_no_default.size() + params_with_default.size());
+  param_names_.insert(param_names_.begin(), params_no_default.begin(),
+                      params_no_default.end());
+  for (const auto& param : params_with_default) {
+    param_names_.push_back(param);
+  }
+
   string parameters;
-  for (const string& name : args_no_default) {
+  for (const auto& param : params_no_default) {
     AddDelimiter(&parameters, ", ");
-    const string param = AvoidPythonReserved(name);
-    strings::StrAppend(&parameters, param);
-    param_names_.push_back(param);
+    strings::StrAppend(&parameters, param.GetRenameTo());
   }
-  for (const string& name : args_with_defaults) {
+  for (const auto& param_and_default : params_with_default) {
     AddDelimiter(&parameters, ", ");
-    const string param = AvoidPythonReserved(name);
-    strings::StrAppend(&parameters, param, "=None");
-    param_names_.push_back(param);
+    strings::StrAppend(&parameters, param_and_default.GetRenameTo(), "=None");
   }
   AddDelimiter(&parameters, ", ");
   strings::StrAppend(&parameters, "name=None");
@@ -557,10 +561,11 @@ string GenPythonOp::Code() {
 }
 
 void GenPythonOp::AddExport() {
-  if (api_def_.visibility() != api_def_.VISIBLE) {
+  if (api_def_.visibility() != ApiDef::VISIBLE) {
     return;
   }
-  strings::StrAppend(&result_, "tf_export(");
+
+  strings::StrAppend(&result_, "@tf_export(");
 
   // Add all endpoint names to tf_export.
   bool first_endpoint = true;
@@ -603,9 +608,9 @@ void GenPythonOp::AddDocStringInputs() {
     StringPiece description = api_def_arg.description();
     string desc;
     if (ConsumeEquals(&description)) {  // Skip the generated type info.
-      desc = strings::StrCat(param_names_[i], ": ");
+      desc = strings::StrCat(param_names_[i].GetRenameTo(), ": ");
     } else {
-      desc = strings::StrCat(param_names_[i], ": ",
+      desc = strings::StrCat(param_names_[i].GetRenameTo(), ": ",
                              ArgTypeName(op_def_, arg, inferred_attrs_, false));
     }
     if (!description.empty()) {
@@ -750,7 +755,8 @@ void GenPythonOp::AddBody(const string& prefix) {
 void GenPythonOp::AddBodyNoReturn(const string& apply_prefix) {
   string args = strings::StrCat("\"", op_def_.name(), "\", ");
   for (size_t i = 0; i < param_names_.size(); ++i) {
-    strings::StrAppend(&args, param_names_[i], "=", param_names_[i], ", ");
+    strings::StrAppend(&args, AvoidPythonReserved(param_names_[i].GetName()),
+                       "=", param_names_[i].GetRenameTo(), ", ");
   }
   strings::StrAppend(&args, "name=name)");
 
diff --git a/tensorflow/python/framework/python_op_gen_internal.h b/tensorflow/python/framework/python_op_gen_internal.h
index c1efbf9be2277dbc047868dde5110b5505fc9e23..6b53825a6d325c00eaf9f60fbcd9d4e0f9c9183c 100644
--- a/tensorflow/python/framework/python_op_gen_internal.h
+++ b/tensorflow/python/framework/python_op_gen_internal.h
@@ -41,6 +41,28 @@ void GenerateLowerCaseOpName(const string& str, string* result);
 
 string DataTypeToPython(DataType dtype, const string& dtype_module);
 
+// Names that corresponds to a single input parameter.
+class ParamNames {
+ public:
+  // Create param based on Arg.
+  ParamNames(const string& name, const string& rename_to) : name_(name) {
+    rename_to_ = AvoidPythonReserved(rename_to);
+  }
+
+  // Get original parameter name.
+  string GetName() const { return name_; }
+
+  // Get the name to rename the parameter to. Note that AvoidPythonReserved
+  // has already been applied.
+  string GetRenameTo() const { return rename_to_; }
+
+ private:
+  // Original parameter name.
+  string name_;
+  // API name for this parameter.
+  string rename_to_;
+};
+
 class GenPythonOp {
  public:
   GenPythonOp(const OpDef& op_def, const ApiDef& api_def,
@@ -84,7 +106,7 @@ class GenPythonOp {
 
   // All parameters, including inputs & non-inferred attrs, required and those
   // with defaults, except "name"
-  std::vector<string> param_names_;
+  std::vector<ParamNames> param_names_;
 };
 
 }  // namespace python_op_gen_internal
diff --git a/tensorflow/python/framework/python_op_gen_main.cc b/tensorflow/python/framework/python_op_gen_main.cc
index 61b1d02a5e85f40c884ffe77104b425b3554b796..bc5ca195da50499c6fbab822a9a093be3f0277e0 100644
--- a/tensorflow/python/framework/python_op_gen_main.cc
+++ b/tensorflow/python/framework/python_op_gen_main.cc
@@ -34,12 +34,6 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-constexpr char kBaseApiDef[] =
-    "tensorflow/core/api_def/base_api/*.pbtxt";
-constexpr char kPythonApiDef[] =
-    "tensorflow/core/api_def/python_api/*.pbtxt";
-constexpr bool kUseApiDef = false;
-
 Status ReadOpListFromFile(const string& filename,
                           std::vector<string>* op_list) {
   std::unique_ptr<RandomAccessFile> file;
@@ -110,22 +104,23 @@ string InferSourceFileName(const char* argv_zero) {
 }
 
 void PrintAllPythonOps(const std::vector<string>& op_list,
+                       const std::vector<string>& api_def_dirs,
                        const string& source_file_name, bool require_shapes,
                        bool op_list_is_whitelist) {
   OpList ops;
   OpRegistry::Global()->Export(false, &ops);
 
   ApiDefMap api_def_map(ops);
-  if (kUseApiDef) {
+  if (!api_def_dirs.empty()) {
     Env* env = Env::Default();
 
-    std::vector<string> base_api_files;
-    std::vector<string> python_api_files;
-    TF_CHECK_OK(env->GetMatchingPaths(kBaseApiDef, &base_api_files));
-    TF_CHECK_OK(env->GetMatchingPaths(kPythonApiDef, &python_api_files));
-
-    TF_CHECK_OK(api_def_map.LoadFileList(env, base_api_files));
-    TF_CHECK_OK(api_def_map.LoadFileList(env, python_api_files));
+    for (const auto& api_def_dir : api_def_dirs) {
+      std::vector<string> api_files;
+      TF_CHECK_OK(env->GetMatchingPaths(io::JoinPath(api_def_dir, "*.pbtxt"),
+                                        &api_files));
+      TF_CHECK_OK(api_def_map.LoadFileList(env, api_files));
+    }
+    api_def_map.UpdateDocs();
   }
 
   if (op_list_is_whitelist) {
@@ -154,23 +149,30 @@ int main(int argc, char* argv[]) {
       tensorflow::InferSourceFileName(argv[0]);
 
   // Usage:
-  //   gen_main [ @FILENAME | OpName[,OpName]* ] (0 | 1) [0 | 1]
-  if (argc == 2) {
-    tensorflow::PrintAllPythonOps({}, source_file_name,
-                                  tensorflow::string(argv[1]) == "1",
-                                  false /* op_list_is_whitelist */);
-  } else if (argc == 3) {
-    std::vector<tensorflow::string> hidden_ops;
-    TF_CHECK_OK(tensorflow::ParseOpListCommandLine(argv[1], &hidden_ops));
-    tensorflow::PrintAllPythonOps(hidden_ops, source_file_name,
+  //   gen_main api_def_dir1,api_def_dir2,...
+  //       [ @FILENAME | OpName[,OpName]* ] (0 | 1) [0 | 1]
+  if (argc < 3) {
+    return -1;
+  }
+  std::vector<tensorflow::string> api_def_dirs = tensorflow::str_util::Split(
+      argv[1], ",", tensorflow::str_util::SkipEmpty());
+
+  if (argc == 3) {
+    tensorflow::PrintAllPythonOps({}, api_def_dirs, source_file_name,
                                   tensorflow::string(argv[2]) == "1",
                                   false /* op_list_is_whitelist */);
   } else if (argc == 4) {
+    std::vector<tensorflow::string> hidden_ops;
+    TF_CHECK_OK(tensorflow::ParseOpListCommandLine(argv[2], &hidden_ops));
+    tensorflow::PrintAllPythonOps(hidden_ops, api_def_dirs, source_file_name,
+                                  tensorflow::string(argv[3]) == "1",
+                                  false /* op_list_is_whitelist */);
+  } else if (argc == 5) {
     std::vector<tensorflow::string> op_list;
-    TF_CHECK_OK(tensorflow::ParseOpListCommandLine(argv[1], &op_list));
-    tensorflow::PrintAllPythonOps(op_list, source_file_name,
-                                  tensorflow::string(argv[2]) == "1",
-                                  tensorflow::string(argv[3]) == "1");
+    TF_CHECK_OK(tensorflow::ParseOpListCommandLine(argv[2], &op_list));
+    tensorflow::PrintAllPythonOps(op_list, api_def_dirs, source_file_name,
+                                  tensorflow::string(argv[3]) == "1",
+                                  tensorflow::string(argv[4]) == "1");
   } else {
     return -1;
   }
diff --git a/tensorflow/python/framework/sparse_tensor.py b/tensorflow/python/framework/sparse_tensor.py
index 10f5579ae599bcff641ada8bb7c2b50f7a54de63..6218cc34cad50aa6e291dcffcf352c717e0d85f0 100644
--- a/tensorflow/python/framework/sparse_tensor.py
+++ b/tensorflow/python/framework/sparse_tensor.py
@@ -93,8 +93,7 @@ class SparseTensor(_TensorLike):
 
   @classmethod
   def from_value(cls, sparse_tensor_value):
-    if not (isinstance(sparse_tensor_value, SparseTensor) or
-            isinstance(sparse_tensor_value, SparseTensorValue)):
+    if not is_sparse(sparse_tensor_value):
       raise TypeError("Neither a SparseTensor nor SparseTensorValue: %s." %
                       sparse_tensor_value)
     return SparseTensor(
@@ -253,3 +252,17 @@ def convert_to_tensor_or_sparse_tensor(value, dtype=None, name=None):
     return value
   return ops.internal_convert_to_tensor(
       value, dtype=dtype, name=name)
+
+
+def is_sparse(x):
+  """Check whether `x` is sparse.
+
+  Check whether an object is a `tf.SparseTensor` or `tf.SparseTensorValue`.
+
+  Args:
+    x: A python object to check.
+
+  Returns:
+    `True` iff `x` is a `tf.SparseTensor` or `tf.SparseTensorValue`.
+  """
+  return isinstance(x, (SparseTensor, SparseTensorValue))
diff --git a/tensorflow/python/framework/sparse_tensor_test.py b/tensorflow/python/framework/sparse_tensor_test.py
index e709eaeda14e1eaae93ff39a4dc6b85970e976e1..c001fed3b058fe1e7f01f6a4f32b125783ed935e 100644
--- a/tensorflow/python/framework/sparse_tensor_test.py
+++ b/tensorflow/python/framework/sparse_tensor_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
@@ -51,6 +53,16 @@ class SparseTensorTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(sess_run_value.values, value.values)
         self.assertAllEqual(sess_run_value.dense_shape, value.dense_shape)
 
+  def testIsSparse(self):
+    self.assertFalse(sparse_tensor.is_sparse(3))
+    self.assertFalse(sparse_tensor.is_sparse("foo"))
+    self.assertFalse(sparse_tensor.is_sparse(np.array(3)))
+    self.assertTrue(
+        sparse_tensor.is_sparse(sparse_tensor.SparseTensor([[0]], [0], [1])))
+    self.assertTrue(
+        sparse_tensor.is_sparse(
+            sparse_tensor.SparseTensorValue([[0]], [0], [1])))
+
 
 class ConvertToTensorOrSparseTensorTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index e2835421729d90e7fb4b354764c9564ead41f128..1b90c7ad4d68287bfa5c1c74c82d2936a20e4a80 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -49,8 +49,20 @@ def SlowAppendFloat16ArrayToTensorProto(tensor_proto, proto_values):
   tensor_proto.half_val.extend([
       ExtractBitsFromFloat16(x) for x in proto_values])
 
+
+def ExtractBitsFromBFloat16(x):
+  return np.asscalar(
+      np.asarray(x, dtype=dtypes.bfloat16.as_numpy_dtype).view(np.uint16))
+
+
+def SlowAppendBFloat16ArrayToTensorProto(tensor_proto, proto_values):
+  tensor_proto.half_val.extend([
+      ExtractBitsFromBFloat16(x) for x in proto_values])
+
+
 if _FAST_TENSOR_UTIL_AVAILABLE:
   _NP_TO_APPEND_FN = {
+      dtypes.bfloat16.as_numpy_dtype: SlowAppendBFloat16ArrayToTensorProto,
       # TODO(sesse): We should have a
       # fast_tensor_util.AppendFloat16ArrayToTensorProto,
       # but it seems np.float16_t doesn't exist?
@@ -121,6 +133,7 @@ else:
     tensor_proto.bool_val.extend([np.asscalar(x) for x in proto_values])
 
   _NP_TO_APPEND_FN = {
+      dtypes.bfloat16.as_numpy_dtype: SlowAppendBFloat16ArrayToTensorProto,
       np.float16: SlowAppendFloat16ArrayToTensorProto,
       np.float32: SlowAppendFloat32ArrayToTensorProto,
       np.float64: SlowAppendFloat64ArrayToTensorProto,
@@ -874,7 +887,7 @@ def is_tensor(x):  # pylint: disable=invalid-name
   `isinstance(x, [tf.Tensor, tf.SparseTensor, tf.Variable])`.
 
   Args:
-    x: An python object to check.
+    x: A python object to check.
 
   Returns:
     `True` if `x` is a tensor, `False` if not.
diff --git a/tensorflow/python/framework/tensor_util_test.py b/tensorflow/python/framework/tensor_util_test.py
index b4f28cfce0d1897c2b3be649971a8ddc06f6998d..f2de69e159646b4a085645fa1bfef7782e78cd59 100644
--- a/tensorflow/python/framework/tensor_util_test.py
+++ b/tensorflow/python/framework/tensor_util_test.py
@@ -51,9 +51,9 @@ class TensorUtilTest(test.TestCase):
     t = tensor_util.make_tensor_proto([10.0, 20.0, 30.0])
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_FLOAT  
-        tensor_shape { dim { size: 3 } }  
-        tensor_content: "A \000\000A\240\000\000A\360\000\000"  
+        dtype: DT_FLOAT
+        tensor_shape { dim { size: 3 } }
+        tensor_content: "A \000\000A\240\000\000A\360\000\000"
         """, t)
     else:
       self.assertProtoEquals("""
@@ -69,9 +69,9 @@ class TensorUtilTest(test.TestCase):
     t = tensor_util.make_tensor_proto([10.0, 20.0, 30.0], dtype=dtypes.float32)
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_FLOAT  
-        tensor_shape { dim { size: 3 } }  
-        tensor_content: "A \000\000A\240\000\000A\360\000\000"  
+        dtype: DT_FLOAT
+        tensor_shape { dim { size: 3 } }
+        tensor_content: "A \000\000A\240\000\000A\360\000\000"
         """, t)
     else:
       self.assertProtoEquals("""
@@ -87,9 +87,9 @@ class TensorUtilTest(test.TestCase):
     t = tensor_util.make_tensor_proto([10, 20, 30], dtype=dtypes.float32)
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_FLOAT  
-        tensor_shape { dim { size: 3 } }  
-        tensor_content: "A \000\000A\240\000\000A\360\000\000"  
+        dtype: DT_FLOAT
+        tensor_shape { dim { size: 3 } }
+        tensor_content: "A \000\000A\240\000\000A\360\000\000"
         """, t)
     else:
       self.assertProtoEquals("""
@@ -106,9 +106,9 @@ class TensorUtilTest(test.TestCase):
     t = tensor_util.make_tensor_proto(arr, dtype=dtypes.float32)
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_FLOAT  
-        tensor_shape { dim { size: 3 } }  
-        tensor_content: "A \000\000A\240\000\000A\360\000\000"  
+        dtype: DT_FLOAT
+        tensor_shape { dim { size: 3 } }
+        tensor_content: "A \000\000A\240\000\000A\360\000\000"
         """, t)
     else:
       self.assertProtoEquals("""
@@ -124,9 +124,9 @@ class TensorUtilTest(test.TestCase):
     t = tensor_util.make_tensor_proto([10.0, 20.0, 30.0], shape=[1, 3])
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_FLOAT  
-        tensor_shape { dim { size: 1 } dim { size: 3 } }  
-        tensor_content: "A \000\000A\240\000\000A\360\000\000"  
+        dtype: DT_FLOAT
+        tensor_shape { dim { size: 1 } dim { size: 3 } }
+        tensor_content: "A \000\000A\240\000\000A\360\000\000"
         """, t)
     else:
       self.assertProtoEquals("""
@@ -142,9 +142,9 @@ class TensorUtilTest(test.TestCase):
     t = tensor_util.make_tensor_proto([10.0, 20.0, 30.0], shape=[3, 1])
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_FLOAT  
-        tensor_shape { dim { size: 3 } dim { size: 1 } }  
-        tensor_content: "A \000\000A\240\000\000A\360\000\000"  
+        dtype: DT_FLOAT
+        tensor_shape { dim { size: 3 } dim { size: 1 } }
+        tensor_content: "A \000\000A\240\000\000A\360\000\000"
         """, t)
     else:
       self.assertProtoEquals("""
@@ -170,9 +170,9 @@ class TensorUtilTest(test.TestCase):
         np.array([[10.0, 20.0, 30.0]], dtype=np.float64))
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_DOUBLE  
-        tensor_shape { dim { size: 1 } dim { size: 3 } }  
-        tensor_content: "@$\000\000\000\000\000\000@4\000\000\000\000\000\000@>\000\000\000\000\000\000"  
+        dtype: DT_DOUBLE
+        tensor_shape { dim { size: 1 } dim { size: 3 } }
+        tensor_content: "@$\000\000\000\000\000\000@4\000\000\000\000\000\000@>\000\000\000\000\000\000"
         """, t)
     else:
       self.assertProtoEquals("""
@@ -261,9 +261,9 @@ class TensorUtilTest(test.TestCase):
     t = tensor_util.make_tensor_proto([10, 20, 30, 40], shape=[2, 2])
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_INT32  
-        tensor_shape { dim { size: 2 } dim { size: 2 } }  
-        tensor_content: "\000\000\000\\n\000\000\000\024\000\000\000\036\000\000\000("  
+        dtype: DT_INT32
+        tensor_shape { dim { size: 2 } dim { size: 2 } }
+        tensor_content: "\000\000\000\\n\000\000\000\024\000\000\000\036\000\000\000("
         """, t)
     else:
       self.assertProtoEquals("""
@@ -342,9 +342,9 @@ class TensorUtilTest(test.TestCase):
         [10, 20, 30], shape=[1, 3], dtype=dtypes.int64)
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_INT64  
-        tensor_shape { dim { size: 1 } dim { size: 3 } }  
-        tensor_content: "\000\000\000\000\000\000\000\\n\000\000\000\000\000\000\000\024\000\000\000\000\000\000\000\036"  
+        dtype: DT_INT64
+        tensor_shape { dim { size: 1 } dim { size: 3 } }
+        tensor_content: "\000\000\000\000\000\000\000\\n\000\000\000\000\000\000\000\024\000\000\000\000\000\000\000\036"
         """, t)
     else:
       self.assertProtoEquals("""
@@ -360,9 +360,9 @@ class TensorUtilTest(test.TestCase):
     t = tensor_util.make_tensor_proto(np.array([10, 20, 30]))
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_INT64  
-        tensor_shape { dim { size: 3 } }  
-        tensor_content: "\000\000\000\000\000\000\000\\n\000\000\000\000\000\000\000\024\000\000\000\000\000\000\000\036"  
+        dtype: DT_INT64
+        tensor_shape { dim { size: 3 } }
+        tensor_content: "\000\000\000\000\000\000\000\\n\000\000\000\000\000\000\000\024\000\000\000\000\000\000\000\036"
         """, t)
     else:
       self.assertProtoEquals("""
@@ -381,9 +381,9 @@ class TensorUtilTest(test.TestCase):
     t = tensor_util.make_tensor_proto(data, dtype=dtypes.qint32)
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_QINT32  
-        tensor_shape { dim { size: 3 } }  
-        tensor_content: "\000\000\000\025\000\000\000\026\000\000\000\027"  
+        dtype: DT_QINT32
+        tensor_shape { dim { size: 3 } }
+        tensor_content: "\000\000\000\025\000\000\000\026\000\000\000\027"
         """, t)
     else:
       self.assertProtoEquals("""
@@ -418,9 +418,9 @@ class TensorUtilTest(test.TestCase):
     t = tensor_util.make_tensor_proto(data, dtype=dtypes.quint16)
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_QUINT16  
-        tensor_shape { dim { size: 3 } }  
-        tensor_content: "\000\025\000\026\000\027"  
+        dtype: DT_QUINT16
+        tensor_shape { dim { size: 3 } }
+        tensor_content: "\000\025\000\026\000\027"
         """, t)
     else:
       self.assertProtoEquals("""
@@ -435,9 +435,9 @@ class TensorUtilTest(test.TestCase):
     t = tensor_util.make_tensor_proto(data, dtype=dtypes.qint16)
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_QINT16  
-        tensor_shape { dim { size: 3 } }  
-        tensor_content: "\000\025\000\026\000\027"  
+        dtype: DT_QINT16
+        tensor_shape { dim { size: 3 } }
+        tensor_content: "\000\025\000\026\000\027"
         """, t)
     else:
       self.assertProtoEquals("""
diff --git a/tensorflow/python/framework/test_ops.cc b/tensorflow/python/framework/test_ops.cc
index 35e0167b2601620cd82ff37d451e4496ece9daff..dbabce096294608b7d7df06e2b5355f5f0a6e9c2 100644
--- a/tensorflow/python/framework/test_ops.cc
+++ b/tensorflow/python/framework/test_ops.cc
@@ -26,6 +26,16 @@ REGISTER_OP("KernelLabel")
     .Output("result: string")
     .SetShapeFn(shape_inference::ScalarShape);
 
+REGISTER_OP("KernelLabelRequired")
+    .Input("input: int32")
+    .Output("result: string")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle out;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &out));
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    });
+
 REGISTER_OP("GraphDefVersion")
     .Output("version: int32")
     .SetIsStateful()
@@ -104,6 +114,14 @@ REGISTER_KERNEL_BUILDER(Name("KernelLabel")
                             .Label("overload_2"),
                         KernelLabelOp<OVERLOAD_2_LABEL>);
 
+// All "KernelLabelRequired" kernels have labels
+REGISTER_KERNEL_BUILDER(
+    Name("KernelLabelRequired").Device(DEVICE_CPU).Label("overload_1"),
+    KernelLabelOp<OVERLOAD_1_LABEL>);
+REGISTER_KERNEL_BUILDER(
+    Name("KernelLabelRequired").Device(DEVICE_CPU).Label("overload_2"),
+    KernelLabelOp<OVERLOAD_2_LABEL>);
+
 class GraphDefVersionOp : public OpKernel {
  public:
   explicit GraphDefVersionOp(OpKernelConstruction* ctx)
@@ -252,6 +270,11 @@ REGISTER_OP("IntInput")
     .Input("a: int32")
     .SetShapeFn(shape_inference::UnknownShape);
 
+REGISTER_OP("IntInputIntOutput")
+    .Input("a: int32")
+    .Output("b: int32")
+    .SetShapeFn(shape_inference::UnknownShape);
+
 REGISTER_OP("FloatInput")
     .Input("a: float32")
     .SetShapeFn(shape_inference::UnknownShape);
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index d796b0ebea1d4047ecdc00d20a976fde4028fa34..7627fb3e69d5e8c71363c6f2dff24a069b139f42 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -47,19 +47,23 @@ from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.client import device_lib
 from tensorflow.python.client import session
+from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import device as pydev
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import versions
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
+from tensorflow.python.util import nest
 from tensorflow.python.util.protobuf import compare
 
 
@@ -453,6 +457,62 @@ class IsolateTest(object):
         type_arg, value_arg, traceback_arg)
 
 
+def assert_no_new_tensors(f):
+  """Decorator for asserting that no new Tensors persist after a test.
+
+  Mainly useful for checking that code using the Python C API has correctly
+  manipulated reference counts.
+
+  Clears the caches that it knows about, runs the garbage collector, then checks
+  that there are no Tensor or Tensor-like objects still around. This includes
+  Tensors to which something still has a reference (e.g. from missing
+  Py_DECREFs) and uncollectable cycles (i.e. Python reference cycles where one
+  of the objects has __del__ defined).
+
+  Args:
+    f: The test case to run.
+  Returns:
+    The decorated test case.
+  """
+
+  def decorator(self, **kwargs):
+    """Finds existing Tensors, runs the test, checks for new Tensors."""
+
+    def _is_tensor(obj):
+      try:
+        return (isinstance(obj, ops.Tensor) or
+                isinstance(obj, variables.Variable))
+      except ReferenceError:
+        # If the object no longer exists, we don't care about it.
+        return False
+
+    tensors_before = set(id(obj) for obj in gc.get_objects() if _is_tensor(obj))
+    outside_container_prefix = ops.get_default_graph()._container_prefix
+    with IsolateTest():
+      # Run the test in a new graph so that collections get cleared when it's
+      # done, but inherit the container prefix so that we can print the values
+      # of variables which get leaked when executing eagerly.
+      ops.get_default_graph()._container_prefix = outside_container_prefix
+      f(self, **kwargs)
+    # Make an effort to clear caches, which would otherwise look like leaked
+    # Tensors.
+    backprop._last_zero = [None]
+    backprop._shape_dtype = [None, None]
+    context.get_default_context().scalar_cache().clear()
+    gc.collect()
+    tensors_after = [
+        obj for obj in gc.get_objects()
+        if _is_tensor(obj) and id(obj) not in tensors_before
+    ]
+    if tensors_after:
+      raise AssertionError(("%d Tensors not deallocated after test: %s" % (
+          len(tensors_after),
+          str(tensors_after),
+      )))
+
+  return decorator
+
+
 def assert_no_garbage_created(f):
   """Test method decorator to assert that no garbage has been created.
 
@@ -507,7 +567,8 @@ def run_in_graph_and_eager_modes(
       garbage for legitimate reasons (e.g. they define a class which inherits
       from `object`), and because DEBUG_SAVEALL is sticky in some Python
       interpreters (meaning that tests which rely on objects being collected
-      elsewhere in the unit test file will not work).
+      elsewhere in the unit test file will not work). Additionally, checks that
+      nothing still has a reference to Tensors that the test allocated.
   Returns:
     Returns a decorator that will run the decorated test function
         using both a graph and using eager execution.
@@ -544,7 +605,8 @@ def run_in_graph_and_eager_modes(
             f(self, **kwargs)
 
       if assert_no_eager_garbage:
-        run_eager_mode = assert_no_garbage_created(run_eager_mode)
+        run_eager_mode = assert_no_new_tensors(
+            assert_no_garbage_created(run_eager_mode))
 
       with context.eager_mode():
         with IsolateTest():
@@ -715,23 +777,22 @@ class TensorFlowTestCase(googletest.TestCase):
       fail_msg += " : %r" % (msg) if msg else ""
       self.fail(fail_msg)
 
-  def _eval_helper(self, tensors):
-    if isinstance(tensors, ops.EagerTensor):
-      return tensors.numpy()
-    if isinstance(tensors, resource_variable_ops.ResourceVariable):
-      return tensors.read_value().numpy()
-
-    if isinstance(tensors, tuple):
-      return tuple([self._eval_helper(t) for t in tensors])
-    elif isinstance(tensors, list):
-      return [self._eval_helper(t) for t in tensors]
-    elif isinstance(tensors, dict):
-      assert not tensors, "Only support empty dict now."
-      return dict()
-    elif tensors is None:
+  def _eval_tensor(self, tensor):
+    if tensor is None:
       return None
+    elif isinstance(tensor, ops.EagerTensor):
+      return tensor.numpy()
+    elif isinstance(tensor, resource_variable_ops.ResourceVariable):
+      return tensor.read_value().numpy()
+    elif callable(tensor):
+      return self._eval_helper(tensor())
     else:
-      raise ValueError("Unsupported type %s." % type(tensors))
+      raise ValueError("Unsupported type %s." % type(tensor))
+
+  def _eval_helper(self, tensors):
+    if tensors is None:
+      return None
+    return nest.map_structure(self._eval_tensor, tensors)
 
   def evaluate(self, tensors):
     """Evaluates tensors and returns numpy values.
@@ -985,10 +1046,9 @@ class TensorFlowTestCase(googletest.TestCase):
       msg: An optional string message to append to the failure message.
     """
     # f1 == f2 is needed here as we might have: f1, f2 = inf, inf
-    self.assertTrue(
-        f1 == f2 or math.fabs(f1 - f2) <= err,
-        "%f != %f +/- %f%s" % (f1, f2, err, " (%s)" % msg
-                               if msg is not None else ""))
+    self.assertTrue(f1 == f2 or math.fabs(f1 - f2) <= err,
+                    "%f != %f +/- %f%s" % (f1, f2, err, " (%s)" % msg
+                                           if msg is not None else ""))
 
   def assertArrayNear(self, farray1, farray2, err):
     """Asserts that two float arrays are near each other.
@@ -1091,7 +1151,9 @@ class TensorFlowTestCase(googletest.TestCase):
                                     float_rtol=1e-6,
                                     float_atol=1e-6,
                                     half_rtol=1e-3,
-                                    half_atol=1e-3):
+                                    half_atol=1e-3,
+                                    bfloat16_rtol=1e-2,
+                                    bfloat16_atol=1e-2):
     """Like assertAllClose, but also suitable for comparing fp16 arrays.
 
     In particular, the tolerance is reduced to 1e-3 if at least
@@ -1106,9 +1168,12 @@ class TensorFlowTestCase(googletest.TestCase):
       float_atol: absolute tolerance for float32.
       half_rtol: relative tolerance for float16.
       half_atol: absolute tolerance for float16.
+      bfloat16_rtol: relative tolerance for bfloat16.
+      bfloat16_atol: absolute tolerance for bfloat16.
     """
     a = self._GetNdArray(a)
     b = self._GetNdArray(b)
+    # types with lower tol are put later to overwrite previous ones.
     if (a.dtype == np.float32 or b.dtype == np.float32 or
         a.dtype == np.complex64 or b.dtype == np.complex64):
       rtol = max(rtol, float_rtol)
@@ -1116,6 +1181,10 @@ class TensorFlowTestCase(googletest.TestCase):
     if a.dtype == np.float16 or b.dtype == np.float16:
       rtol = max(rtol, half_rtol)
       atol = max(atol, half_atol)
+    if (a.dtype == dtypes.bfloat16.as_numpy_dtype or
+        b.dtype == dtypes.bfloat16.as_numpy_dtype):
+      rtol = max(rtol, bfloat16_rtol)
+      atol = max(atol, bfloat16_atol)
 
     self.assertAllClose(a, b, rtol=rtol, atol=atol)
 
diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py
index 1c5db945005b6a8c1006dc75c2f1507b2ecb3077..f6aed118ca478daf1f1926ec9d9653015194cab3 100644
--- a/tensorflow/python/framework/test_util_test.py
+++ b/tensorflow/python/framework/test_util_test.py
@@ -183,11 +183,13 @@ class TestUtilTest(test_util.TensorFlowTestCase):
 
   def _WeMustGoDeeper(self, msg):
     with self.assertRaisesOpError(msg):
-      node_def = ops._NodeDef("op_type", "name")
-      node_def_orig = ops._NodeDef("op_type_orig", "orig")
-      op_orig = ops.Operation(node_def_orig, ops.get_default_graph())
-      op = ops.Operation(node_def, ops.get_default_graph(), original_op=op_orig)
-      raise errors.UnauthenticatedError(node_def, op, "true_err")
+      with ops.Graph().as_default():
+        node_def = ops._NodeDef("op_type", "name")
+        node_def_orig = ops._NodeDef("op_type_orig", "orig")
+        op_orig = ops.Operation(node_def_orig, ops.get_default_graph())
+        op = ops.Operation(node_def, ops.get_default_graph(),
+                           original_op=op_orig)
+        raise errors.UnauthenticatedError(node_def, op, "true_err")
 
   def testAssertRaisesOpErrorDoesNotPassMessageDueToLeakedStack(self):
     with self.assertRaises(AssertionError):
@@ -328,6 +330,25 @@ class TestUtilTest(test_util.TensorFlowTestCase):
     self.assertEqual(a_np_rand, b_np_rand)
     self.assertEqual(a_rand, b_rand)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def test_callable_evaluate(self):
+    def model():
+      return resource_variable_ops.ResourceVariable(
+          name="same_name",
+          initial_value=1) + 1
+    with context.eager_mode():
+      self.assertEqual(2, self.evaluate(model))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_nested_tensors_evaluate(self):
+    expected = {"a": 1, "b": 2, "nested": {"d": 3, "e": 4}}
+    nested = {"a": constant_op.constant(1),
+              "b": constant_op.constant(2),
+              "nested": {"d": constant_op.constant(3),
+                         "e": constant_op.constant(4)}}
+
+    self.assertEqual(expected, self.evaluate(nested))
+
 
 class GarbageCollectionTest(test_util.TensorFlowTestCase):
 
@@ -352,6 +373,26 @@ class GarbageCollectionTest(test_util.TensorFlowTestCase):
 
     ReferenceCycleTest().test_has_no_cycle()
 
+  def test_no_leaked_tensor_decorator(self):
+
+    class LeakedTensorTest(object):
+
+      def __init__(inner_self):  # pylint: disable=no-self-argument
+        inner_self.assertEqual = self.assertEqual  # pylint: disable=invalid-name
+
+      @test_util.assert_no_new_tensors
+      def test_has_leak(self):
+        self.a = constant_op.constant([3.])
+
+      @test_util.assert_no_new_tensors
+      def test_has_no_leak(self):
+        constant_op.constant([3.])
+
+    with self.assertRaisesRegexp(AssertionError, "Tensors not deallocated"):
+      LeakedTensorTest().test_has_leak()
+
+    LeakedTensorTest().test_has_no_leak()
+
 
 @test_util.with_c_api
 class IsolationTest(test_util.TensorFlowTestCase):
@@ -419,6 +460,5 @@ class IsolationTest(test_util.TensorFlowTestCase):
         with self.assertRaises(ValueError):
           first_container_variable.read_value()
 
-
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/framework/versions.py b/tensorflow/python/framework/versions.py
index 81529e2b1e06e70fb2839c037c555ef41bcdd291..f03b81eb28a7073873579390eae133d3c930c5a0 100644
--- a/tensorflow/python/framework/versions.py
+++ b/tensorflow/python/framework/versions.py
@@ -25,11 +25,13 @@ __version__ = pywrap_tensorflow.__version__
 __git_version__ = pywrap_tensorflow.__git_version__
 __compiler_version__ = pywrap_tensorflow.__compiler_version__
 __cxx11_abi_flag__ = pywrap_tensorflow.__cxx11_abi_flag__
+__monolithic_build__ = pywrap_tensorflow.__monolithic_build__
 
 VERSION = __version__
 GIT_VERSION = __git_version__
 COMPILER_VERSION = __compiler_version__
 CXX11_ABI_FLAG = __cxx11_abi_flag__
+MONOLITHIC_BUILD = __monolithic_build__
 
 GRAPH_DEF_VERSION = pywrap_tensorflow.GRAPH_DEF_VERSION
 GRAPH_DEF_VERSION_MIN_CONSUMER = (
@@ -42,6 +44,7 @@ __all__ = [
     "__git_version__",
     "__compiler_version__",
     "__cxx11_abi_flag__",
+    "__monolithic_build__",
     "COMPILER_VERSION",
     "CXX11_ABI_FLAG",
     "GIT_VERSION",
@@ -49,4 +52,5 @@ __all__ = [
     "GRAPH_DEF_VERSION_MIN_CONSUMER",
     "GRAPH_DEF_VERSION_MIN_PRODUCER",
     "VERSION",
+    "MONOLITHIC_BUILD",
 ]
diff --git a/tensorflow/python/grappler/cluster.i b/tensorflow/python/grappler/cluster.i
index 3df9431282c1fa6a62778e474a14d01e5738c578..9981c1d22d69d7372ef7bb6d033bede219cec2af 100644
--- a/tensorflow/python/grappler/cluster.i
+++ b/tensorflow/python/grappler/cluster.i
@@ -14,6 +14,62 @@ limitations under the License.
 ==============================================================================*/
 
 %include "tensorflow/python/platform/base.i"
+%include <std_shared_ptr.i>
+%include "item.i"
+
+// Wrap the cluster into an object that swig can manipulate. This ensures it will call the object
+// destructor upon garbage collection instead of leaking memory.
+struct GCluster {
+  std::shared_ptr<tensorflow::grappler::Cluster> cluster_;
+};
+
+%{
+#include "tensorflow/core/protobuf/device_properties.pb.h"
+
+template <>
+bool _PyObjAs(PyObject *input, tensorflow::NamedDevice *out) {
+  char* c_string;
+  Py_ssize_t py_size;
+  if (PyBytes_AsStringAndSize(input, &c_string, &py_size) == -1) {
+    // Python has raised an error (likely TypeError or UnicodeEncodeError).
+    return false;
+  }
+
+  tensorflow::NamedDevice named_device;
+  if (!named_device.ParseFromString(string(c_string, py_size))) {
+    PyErr_SetString(
+        PyExc_TypeError,
+        "The NamedDevice could not be parsed as a valid protocol buffer");
+    return false;
+  }
+  if (out) *out = named_device;
+  return true;
+}
+%}
+
+%typemap(in) const std::vector<tensorflow::NamedDevice>& (std::vector<tensorflow::NamedDevice> temp) {
+  if (!tf_vector_input_helper($input, &temp, &_PyObjAs<tensorflow::NamedDevice>)) {
+    SWIG_fail;
+  }
+  $1 = &temp;
+}
+
+%typemap(in) const tensorflow::NamedDevice& (tensorflow::NamedDevice temp) {
+  char* c_string;
+  Py_ssize_t py_size;
+  if (PyBytes_AsStringAndSize($input, &c_string, &py_size) == -1) {
+    // Python has raised an error (likely TypeError or UnicodeEncodeError).
+    SWIG_fail;
+  }
+
+  if (!temp.ParseFromString(string(c_string, py_size))) {
+    PyErr_SetString(
+        PyExc_TypeError,
+        "The NamedDevice could not be parsed as a valid protocol buffer");
+    SWIG_fail;
+  }
+  $1 = &temp;
+}
 
 %typemap(in) const tensorflow::RunMetadata& (tensorflow::RunMetadata temp) {
   char* c_string;
@@ -26,7 +82,7 @@ limitations under the License.
   if (!temp.ParseFromString(string(c_string, py_size))) {
     PyErr_SetString(
         PyExc_TypeError,
-        "The MetaGraphDef could not be parsed as a valid protocol buffer");
+        "The RunMetadata could not be parsed as a valid protocol buffer");
     SWIG_fail;
   }
   $1 = &temp;
@@ -41,36 +97,78 @@ limitations under the License.
 }
 
 %{
+#include <memory>
+#include <vector>
 #include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/clusters/single_machine.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/costs/graph_memory.h"
 #include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
 #include "tensorflow/core/grappler/costs/measuring_cost_estimator.h"
 #include "tensorflow/core/grappler/costs/utils.h"
+#include "tensorflow/core/protobuf/device_properties.pb.h"
+
+// Provide the implementation of the GCluster struct here.
+struct GCluster {
+  GCluster() {}
+  GCluster(tensorflow::grappler::Cluster* cluster) : cluster_(cluster) {}
 
-static tensorflow::grappler::Cluster* TF_NewCluster(
-    bool allow_soft_placement, bool disable_detailed_stats, TF_Status* out_status) {
-  int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores();
-  int num_gpus = tensorflow::grappler::GetNumAvailableGPUs();;
+  tensorflow::grappler::Cluster* operator->() const {
+    return cluster_.get();
+  }
+  tensorflow::grappler::Cluster* get() const {
+    return cluster_.get();
+  }
+  bool is_none() const {
+    return cluster_.get() == nullptr;
+  }
+
+  std::shared_ptr<tensorflow::grappler::Cluster> cluster_;
+};
+
+
+static GCluster TF_NewCluster(bool allow_soft_placement,
+                   bool disable_detailed_stats, TF_Status* out_status) {
+    int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores();
+  int num_gpus = tensorflow::grappler::GetNumAvailableGPUs();
   int timeout_s = 60 * 10;
-  tensorflow::grappler::Cluster* cluster = new tensorflow::grappler::SingleMachine(
-      timeout_s, num_cpu_cores, num_gpus);
-  cluster->DisableDetailedStats(disable_detailed_stats);
-  cluster->AllowSoftPlacement(allow_soft_placement);
-  tensorflow::Status status = cluster->Provision();
+  tensorflow::grappler::Cluster* cluster_ =
+      new tensorflow::grappler::SingleMachine(
+          timeout_s, num_cpu_cores, num_gpus);
+  cluster_->DisableDetailedStats(disable_detailed_stats);
+  cluster_->AllowSoftPlacement(allow_soft_placement);
+  tensorflow::Status status = cluster_->Provision();
   tensorflow::Set_TF_Status_from_Status(out_status, status);
-  return cluster;
+  return GCluster(cluster_);
 }
 
-static void TF_DeleteCluster(tensorflow::grappler::Cluster* cluster) {
+static GCluster TF_NewVirtualCluster(
+    const std::vector<tensorflow::NamedDevice>& named_devices,
+    TF_Status* out_status) {
+  std::unordered_map<string, tensorflow::DeviceProperties> devices;
+  for (const auto& named_device : named_devices) {
+    devices[named_device.name()]= named_device.properties();
+  }
+  tensorflow::grappler::Cluster*cluster_ =
+      new tensorflow::grappler::VirtualCluster(devices);
+  PyGILState_STATE gstate = PyGILState_Ensure();
+  tensorflow::Status status = cluster_->Provision();
+  PyGILState_Release(gstate);
+  tensorflow::Set_TF_Status_from_Status(out_status, status);
+  return GCluster(cluster_);
+}
+
+static void TF_ShutdownCluster(GCluster cluster) {
+  PyGILState_STATE gstate = PyGILState_Ensure();
   cluster->Shutdown();
-  delete cluster;
+  PyGILState_Release(gstate);
 }
 
-tensorflow::Status _GetOpPerformanceDataAndRunTime(const tensorflow::grappler::GrapplerItem& item,
-                                       tensorflow::grappler::CostEstimator* cost_measure,
-                                       tensorflow::OpPerformanceList* op_performance_data,
-                                       tensorflow::grappler::Costs* costs) {
+tensorflow::Status _GetOpPerformanceDataAndRunTime(
+    const tensorflow::grappler::GrapplerItem& item,
+    tensorflow::grappler::CostEstimator* cost_measure,
+    tensorflow::OpPerformanceList* op_performance_data,
+    tensorflow::grappler::Costs* costs) {
   tensorflow::Status status = cost_measure->Initialize(item);
   if (!status.ok()) return status;
 
@@ -85,24 +183,64 @@ tensorflow::Status _GetOpPerformanceDataAndRunTime(const tensorflow::grappler::G
   return tensorflow::Status::OK();
 }
 
+static PyObject* TF_ListDevices(GCluster cluster) {
+  const std::unordered_map<string, tensorflow::DeviceProperties>& devices = cluster->GetDevices();
+  PyGILState_STATE gstate = PyGILState_Ensure();
+  PyObject* result = PyList_New(devices.size());
+  int i = 0;
+  for (auto& dev : devices) {
+    tensorflow::NamedDevice d;
+    d.set_name(dev.first);
+    *d.mutable_properties() = dev.second;
+    string dev_str = d.SerializeAsString();
+    PyObject* dev_obj = PyBytes_FromStringAndSize(dev_str.data(),
+                                                  dev_str.size());
+    PyList_SetItem(result, i, dev_obj);
+    ++i;
+  }
+  PyGILState_Release(gstate);
+  return result;
+}
+
+static std::vector<string> TF_ListAvailableOps() {
+  tensorflow::OpRegistry* registry = tensorflow::OpRegistry::Global();
+  std::vector<tensorflow::OpDef> ops;
+  registry->GetRegisteredOps(&ops);
+  std::vector<string> op_names;
+  for (const tensorflow::OpDef& op : ops) {
+    op_names.push_back(op.name());
+  }
+  std::sort(op_names.begin(), op_names.end());
+  return op_names;
+}
+
+static double TF_EstimatePerformance(const tensorflow::NamedDevice& device) {
+  tensorflow::grappler::OpLevelCostEstimator estimator;
+  tensorflow::grappler::OpLevelCostEstimator::DeviceInfo info =
+      estimator.GetDeviceInfo(device.properties());
+  return info.gigaops;
+}
+
 static PyObject* TF_MeasureCosts(
-    const tensorflow::grappler::GrapplerItem* item, tensorflow::grappler::Cluster* cluster,
+    GItem item,
+    GCluster cluster,
     bool generate_timeline, TF_Status* out_status) {
   tensorflow::OpPerformanceList op_performance_data;
   tensorflow::StepStats step_stats;
 
-  tensorflow::grappler::MeasuringCostEstimator cost_measure(cluster, 10, 0);
+  tensorflow::grappler::MeasuringCostEstimator cost_measure(cluster.get(), 10, 0);
 
   tensorflow::grappler::Costs costs;
-  tensorflow::Status status = _GetOpPerformanceDataAndRunTime(*item, &cost_measure,
-                                                 &op_performance_data, &costs);
+  tensorflow::Status status = _GetOpPerformanceDataAndRunTime(
+      *item, &cost_measure, &op_performance_data, &costs);
   double run_time = FLT_MAX;
   if (status.ok()) {
     run_time = static_cast<double>(costs.execution_time.count()) / 1e9;
   }
   if (generate_timeline) {
     tensorflow::RunMetadata metadata;
-    tensorflow::Status s = cluster->Run(item->graph, item->feed, item->fetch, &metadata);
+    tensorflow::Status s = cluster->Run(
+        item->graph, item->feed, item->fetch, &metadata);
     if (s.ok()) {
       step_stats = metadata.step_stats();
     } else {
@@ -114,9 +252,12 @@ static PyObject* TF_MeasureCosts(
   if (!status.ok()) {
     Py_RETURN_NONE;
   }
-  PyObject* op_perf_objs = PyList_New(op_performance_data.op_performance_size());
+  PyGILState_STATE gstate = PyGILState_Ensure();
+  PyObject* op_perf_objs = PyList_New(
+      op_performance_data.op_performance_size());
   for (int i = 0; i < op_performance_data.op_performance_size(); i++) {
-    string op_perf_str = op_performance_data.op_performance(i).SerializeAsString();
+    string op_perf_str =
+        op_performance_data.op_performance(i).SerializeAsString();
     PyObject* op_perf_obj = PyBytes_FromStringAndSize(op_perf_str.data(),
                                                       op_perf_str.size());
     PyList_SetItem(op_perf_objs, i, op_perf_obj);
@@ -139,16 +280,19 @@ static PyObject* TF_MeasureCosts(
     status = tensorflow::Status(tensorflow::error::Code::INTERNAL,
                                 "Error setting return tuples.");
     tensorflow::Set_TF_Status_from_Status(out_status, status);
-    Py_RETURN_NONE;
+    Py_INCREF(Py_None);
+    ret = Py_None;
   }
+  PyGILState_Release(gstate);
   return ret;
 }
 
 
 static PyObject* TF_DeterminePeakMemoryUsage(
-    const tensorflow::grappler::GrapplerItem* item, tensorflow::grappler::Cluster* cluster,
+    GItem item,
+    GCluster cluster,
     TF_Status* out_status) {
-  if (!item || !cluster) {
+  if (item.is_none() || cluster.is_none()) {
     tensorflow::Status status(tensorflow::error::Code::INTERNAL,
                               "You need both a cluster and an item to determine peak memory usage");
     tensorflow::Set_TF_Status_from_Status(out_status, status);
@@ -158,7 +302,7 @@ static PyObject* TF_DeterminePeakMemoryUsage(
 
   tensorflow::Status status;
   if (cluster->DetailedStatsEnabled()) {
-    status = memory.InferDynamically(cluster);
+    status = memory.InferDynamically(cluster.get());
   } else {
     status = memory.InferStatically(cluster->GetDevices());
   }
@@ -167,6 +311,7 @@ static PyObject* TF_DeterminePeakMemoryUsage(
     Py_RETURN_NONE;
   }
 
+  PyGILState_STATE gstate = PyGILState_Ensure();
   PyObject* result = PyDict_New();
   for (const auto& device : cluster->GetDevices()) {
     const tensorflow::grappler::GraphMemory::MemoryUsage& usage =
@@ -188,19 +333,25 @@ static PyObject* TF_DeterminePeakMemoryUsage(
     PyTuple_SetItem(ret, 1, per_device);
     PyDict_SetItem(result, PyString_FromString(device.first.c_str()), ret);
   }
+  PyGILState_Release(gstate);
   return result;
 }
 
 %}
 
 // Wrap these functions.
-
-static tensorflow::grappler::Cluster* TF_NewCluster(
+static GCluster TF_NewCluster(
     bool allow_soft_placement, bool disable_detailed_stats, TF_Status* out_status);
-static void TF_DeleteCluster(tensorflow::grappler::Cluster* cluster);
+static GCluster TF_NewVirtualCluster(
+    const std::vector<tensorflow::NamedDevice>& named_devices,
+    TF_Status* out_status);
+static void TF_ShutdownCluster(GCluster cluster);
+static PyObject* TF_ListDevices(GCluster cluster);
+static std::vector<string> TF_ListAvailableOps();
+static float TF_EstimatePerformance(const tensorflow::NamedDevice& device);
 static PyObject* TF_MeasureCosts(
-    const tensorflow::grappler::GrapplerItem* item, tensorflow::grappler::Cluster* cluster,
+    GItem item, GCluster cluster,
     bool generate_timeline, TF_Status* out_status);
 static PyObject* TF_DeterminePeakMemoryUsage(
-    const tensorflow::grappler::GrapplerItem* item, tensorflow::grappler::Cluster* cluster,
+    GItem item, GCluster cluster,
     TF_Status* out_status);
diff --git a/tensorflow/python/grappler/cluster.py b/tensorflow/python/grappler/cluster.py
index baac604f411b3fb48ab2336e9479853f26fd690c..ba1a734ee04b2ddba801779bbb61e38bec82d4b8 100644
--- a/tensorflow/python/grappler/cluster.py
+++ b/tensorflow/python/grappler/cluster.py
@@ -18,8 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import contextlib
+
 from tensorflow.core.framework import step_stats_pb2
 from tensorflow.core.grappler.costs import op_performance_data_pb2
+from tensorflow.core.protobuf import device_properties_pb2
 from tensorflow.python import pywrap_tensorflow as tf_cluster
 from tensorflow.python.framework import errors
 
@@ -30,33 +33,68 @@ class Cluster(object):
   def __init__(self,
                allow_soft_placement=True,
                disable_detailed_stats=True,
-               disable_timeline=True):
+               disable_timeline=True,
+               devices=None):
     """Creates a Cluster.
 
     Args:
-      allow_soft_placement: if True, TF will automatically fix illegal
+      allow_soft_placement: If True, TF will automatically fix illegal
         placements instead of erroring out if the placement isn't legal.
-      disable_detailed_stats: if True, detailed statistics will not be
+      disable_detailed_stats: If True, detailed statistics will not be
         available.
-      disable_timeline: if True, the timeline information will not be
-        reported.
+      disable_timeline: If True, the timeline information will not be reported.
+      devices: A list of devices of type device_properties_pb2.NamedDevice.
+        If None, a device list will be created based on the spec of
+        the local machine.
     """
     self._tf_cluster = None
-    with errors.raise_exception_on_not_ok_status() as status:
-      self._tf_cluster = tf_cluster.TF_NewCluster(
-          allow_soft_placement, disable_detailed_stats, status)
     self._generate_timeline = not disable_timeline
+    with errors.raise_exception_on_not_ok_status() as status:
+      if devices is None:
+        self._tf_cluster = tf_cluster.TF_NewCluster(
+            allow_soft_placement, disable_detailed_stats, status)
+      else:
+        devices_serialized = [device.SerializeToString() for device in devices]
+        self._tf_cluster = tf_cluster.TF_NewVirtualCluster(
+            devices_serialized, status)
+
+  def Shutdown(self):
+    if self._tf_cluster is not None:
+      tf_cluster.TF_ShutdownCluster(self._tf_cluster)
+      self._tf_cluster = None
 
   def __del__(self):
+    self.Shutdown()
+
+  @property
+  def tf_cluster(self):
+    return self._tf_cluster
+
+  def ListDevices(self):
+    """Returns the list of available hardware devices."""
+    devices = []
     if self._tf_cluster is not None:
-      tf_cluster.TF_DeleteCluster(self._tf_cluster)
+      ret_from_swig = tf_cluster.TF_ListDevices(self._tf_cluster)
+      devices = []
+      for raw_dev in ret_from_swig:
+        devices.append(device_properties_pb2.NamedDevice.FromString(raw_dev))
+    return devices
+
+  def ListAvailableOps(self):
+    """Returns a list of all the available operations (sorted alphatically)."""
+    return tf_cluster.TF_ListAvailableOps()
+
+  def EstimatePerformance(self, device):
+    """Estimate the performance of the specified device."""
+    serialized = device.SerializeToString()
+    return tf_cluster.TF_EstimatePerformance(serialized)
 
   def MeasureCosts(self, item):
     """Returns the cost of running the specified item.
 
     Args:
-      item: the item for which to measure the costs.
-    Returns: the triplet op_perfs, runtime, step_stats.
+      item: The item for which to measure the costs.
+    Returns: The triplet op_perfs, runtime, step_stats.
     """
     with errors.raise_exception_on_not_ok_status() as status:
       ret_from_swig = tf_cluster.TF_MeasureCosts(
@@ -77,11 +115,22 @@ class Cluster(object):
     """Returns a snapshot of the peak memory usage.
 
     Args:
-      item: the item for which to measure the costs.
-    Returns: a hashtable indexed by device name.
+      item: The item for which to measure the costs.
+    Returns: A hashtable indexed by device name.
     """
     with errors.raise_exception_on_not_ok_status() as status:
       ret_from_swig = tf_cluster.TF_DeterminePeakMemoryUsage(
           item.tf_item, self._tf_cluster, status)
 
     return ret_from_swig
+
+
+@contextlib.contextmanager
+def Provision(allow_soft_placement=True,
+              disable_detailed_stats=True,
+              disable_timeline=True,
+              devices=None):
+  cluster = Cluster(allow_soft_placement, disable_detailed_stats,
+                    disable_timeline, devices)
+  yield cluster
+  cluster.Shutdown()
diff --git a/tensorflow/python/grappler/cluster_test.py b/tensorflow/python/grappler/cluster_test.py
index de4ded571f79313f075ef8f63ee643332dd68033..26feac0a23149d4f797dbaee87c14585f84f20ea 100644
--- a/tensorflow/python/grappler/cluster_test.py
+++ b/tensorflow/python/grappler/cluster_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.core.protobuf import device_properties_pb2
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.grappler import cluster
@@ -42,7 +43,7 @@ class ClusterTest(test.TestCase):
       op_perfs, run_time, step_stats = grappler_cluster.MeasureCosts(
           grappler_item)
       self.assertTrue(run_time > 0)
-      self.assertEqual(len(op_perfs), 10)
+      self.assertEqual(len(op_perfs), 9)
       self.assertTrue(step_stats.dev_stats)
 
   def testNoDetailedStats(self):
@@ -82,6 +83,56 @@ class ClusterTest(test.TestCase):
         live_tensors = snapshot[1]
         self.assertEqual(15, len(live_tensors))
 
+  def testVirtualCluster(self):
+    with ops.Graph().as_default() as g:
+      a = random_ops.random_uniform(shape=())
+      b = random_ops.random_uniform(shape=())
+      c = a + b
+      train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+      train_op.append(c)
+      mg = meta_graph.create_meta_graph_def(graph=g)
+      grappler_item = item.Item(mg)
+      device_properties = device_properties_pb2.DeviceProperties(
+          type='GPU',
+          frequency=1000,
+          num_cores=60,
+          environment={
+              'architecture': '7'
+          })
+      named_device = device_properties_pb2.NamedDevice(
+          properties=device_properties, name='/GPU:0')
+      grappler_cluster = cluster.Cluster(devices=[named_device])
+      op_perfs, run_time, _ = grappler_cluster.MeasureCosts(grappler_item)
+      self.assertGreater(run_time, 0)
+      self.assertEqual(len(op_perfs), 15)
+
+      estimated_perf = grappler_cluster.EstimatePerformance(named_device)
+      self.assertEqual(7680.0, estimated_perf)
+
+  def testContext(self):
+    with ops.Graph().as_default() as g:
+      a = random_ops.random_uniform(shape=())
+      b = random_ops.random_uniform(shape=())
+      c = a + b
+      train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+      train_op.append(c)
+      mg = meta_graph.create_meta_graph_def(graph=g)
+      grappler_item = item.Item(mg)
+
+    with cluster.Provision(
+        disable_detailed_stats=False, disable_timeline=False) as gcluster:
+      op_perfs, run_time, step_stats = gcluster.MeasureCosts(grappler_item)
+      self.assertTrue(run_time > 0)
+      self.assertEqual(len(op_perfs), 9)
+      self.assertTrue(step_stats.dev_stats)
+
+  def testAvailableOps(self):
+    with cluster.Provision() as gcluster:
+      op_names = gcluster.ListAvailableOps()
+      self.assertTrue(b'Add' in op_names)
+      self.assertTrue(b'MatMul' in op_names)
+      self.assertEqual(op_names, sorted(op_names))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/grappler/cost_analyzer.i b/tensorflow/python/grappler/cost_analyzer.i
index 1f024e439d8d4d819e5f603ae3b8a843063baeeb..4c0953435ba3fa6423bbc869fcca909d0c2ccb25 100644
--- a/tensorflow/python/grappler/cost_analyzer.i
+++ b/tensorflow/python/grappler/cost_analyzer.i
@@ -15,6 +15,7 @@ limitations under the License.
 
 %include "tensorflow/python/lib/core/strings.i"
 %include "tensorflow/python/platform/base.i"
+%include "cluster.i"
 
 %typemap(in) const tensorflow::MetaGraphDef& (tensorflow::MetaGraphDef temp) {
   char* c_string;
@@ -42,8 +43,8 @@ limitations under the License.
 %}
 
 %{
-string GenerateCostReport(const tensorflow::MetaGraphDef& metagraph, bool
-per_node_report) {
+string GenerateCostReport(const tensorflow::MetaGraphDef& metagraph, bool per_node_report,
+                          GCluster cluster) {
   tensorflow::grappler::ItemConfig cfg;
   cfg.apply_optimizations = false;
   std::unique_ptr<tensorflow::grappler::GrapplerItem> item =
@@ -51,20 +52,9 @@ per_node_report) {
   if (!item) {
     return "Error: failed to preprocess metagraph: check your log file for errors";
   }
-  
-  // TODO(bsteiner): we should wrap the tf session instead to properly handle the case of a
-  // distributed setup.
-  const int timeout_s = 3600;
-  int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores();
-  int num_gpus = tensorflow::grappler::GetNumAvailableGPUs();
-  tensorflow::grappler::SingleMachine cluster(timeout_s, num_cpu_cores, num_gpus);
-  cluster.SetNumWarmupSteps(10);
-  cluster.AllowSoftPlacement(true);
-  cluster.DisableDetailedStats(false);
-  TF_CHECK_OK(cluster.Provision());
 
   string suffix;
-  tensorflow::grappler::CostAnalyzer analyzer(*item, &cluster, suffix);
+  tensorflow::grappler::CostAnalyzer analyzer(*item, cluster.get(), suffix);
 
   std::stringstream os;
   analyzer.GenerateReport(os, per_node_report);
@@ -73,5 +63,5 @@ per_node_report) {
 
 %}
 
-string GenerateCostReport(const tensorflow::MetaGraphDef& metagraph, bool
-per_node_report);
+string GenerateCostReport(const tensorflow::MetaGraphDef& metagraph, bool per_node_report,
+                          GCluster cluster);
diff --git a/tensorflow/python/grappler/cost_analyzer.py b/tensorflow/python/grappler/cost_analyzer.py
index 75c21e572719128cfd5f9a36191b5765386c43dc..a1ff915c61ba14d9a899d7f6c9a2c49855969b00 100644
--- a/tensorflow/python/grappler/cost_analyzer.py
+++ b/tensorflow/python/grappler/cost_analyzer.py
@@ -20,21 +20,64 @@ from __future__ import print_function
 
 from tensorflow.python import pywrap_tensorflow as tf_wrap
 from tensorflow.python.framework import errors
+from tensorflow.python.grappler import cluster as gcluster
+from tensorflow.python.grappler import item as gitem
 
 
-def GenerateCostReport(metagraph, per_node_report=False):
+def GenerateCostReport(metagraph, per_node_report=False, cluster=None):
   """Analyze the cost of each TensorFlow op and node in the provided metagraph.
 
   Args:
-    metagraph: An TensorFlow MetaGraphDef.
+    metagraph: A TensorFlow MetaGraphDef.
     per_node_report: by default the report contains stats aggregated on a per op
       type basis, setting per_node_report to True adds results for each
       individual node to the report.
+    cluster: Analyze the costs using the specified cluster, or the local machine
+      if no cluster was specified.
 
   Returns:
     A string of cost report.
   """
+  if cluster is None:
+    cluster = gcluster.Cluster(disable_detailed_stats=False)
+
   with errors.raise_exception_on_not_ok_status():
-    ret_from_swig = tf_wrap.GenerateCostReport(metagraph.SerializeToString(),
-                                               per_node_report)
+    ret_from_swig = tf_wrap.GenerateCostReport(
+        metagraph.SerializeToString(), per_node_report, cluster.tf_cluster)
   return ret_from_swig
+
+
+def GenerateMemoryReport(metagraph, detailed_report=True, cluster=None):
+  """Analyze the peak memory usage for the provided metagraph.
+
+  Args:
+    metagraph: A TensorFlow MetaGraphDef.
+    detailed_report: print the live tensors in addition to the peak memory
+      usage.
+    cluster: Analyze the memory using the specified cluster, or the local
+      machine if no cluster was specified.
+
+  Returns:
+    A string with the formatted memory usage.
+  """
+  if cluster is None:
+    cluster = gcluster.Cluster(
+        disable_detailed_stats=True, disable_timeline=True)
+
+  item = gitem.Item(metagraph)
+  peak_usage = cluster.DeterminePeakMemoryUsage(item)
+  report = ""
+  for device, snapshot in peak_usage.items():
+    peak_usage = snapshot[0]
+    report += "Peak usage for device " + device + ": " + str(
+        peak_usage) + " bytes\n"
+    if detailed_report:
+      live_tensors = snapshot[1]
+      for tensor in live_tensors:
+        op_name = tensor[0]
+        output_id = tensor[1]
+        mem_used = tensor[2]
+        report += "  " + str(op_name) + ":" + str(output_id) + " uses " + str(
+            mem_used) + " bytes\n"
+
+  return report
diff --git a/tensorflow/python/grappler/cost_analyzer_test.py b/tensorflow/python/grappler/cost_analyzer_test.py
index d59f1d04f6135163152283d5d4922df800c51a00..325ff0fb00b006cf29d3ef2028d37745d7480d34 100644
--- a/tensorflow/python/grappler/cost_analyzer_test.py
+++ b/tensorflow/python/grappler/cost_analyzer_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.grappler import cost_analyzer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -35,9 +36,9 @@ from tensorflow.python.platform import test
 from tensorflow.python.training import adam
 
 
-class PyWrapOptimizeGraphTest(test.TestCase):
+class CostAnalysisTest(test.TestCase):
 
-  def testBasic(self):
+  def testBasicCost(self):
     """Make sure arguments can be passed correctly."""
     a = constant_op.constant(10, name="a")
     b = constant_op.constant(20, name="b")
@@ -60,7 +61,7 @@ class PyWrapOptimizeGraphTest(test.TestCase):
     # Also print the report to make it easier to debug
     print("{}".format(report))
 
-  def testSmallNetwork(self):
+  def testSmallNetworkCost(self):
     image = array_ops.placeholder(dtypes.float32, shape=[1, 28, 28, 1])
     label = array_ops.placeholder(dtypes.float32, shape=[1, 10])
     w = variables.Variable(
@@ -88,13 +89,10 @@ class PyWrapOptimizeGraphTest(test.TestCase):
     self.assertTrue(b"MatMul" in report)
     self.assertTrue(b"ApplyAdam" in report)
     self.assertTrue(b"Conv2D" in report)
-    self.assertTrue(b"Conv2DBackpropInput" in report)
     self.assertTrue(b"Conv2DBackpropFilter" in report)
     self.assertTrue(b"Softmax" in report)
 
-    for op_type in [
-        b"MatMul", b"Conv2D", b"Conv2DBackpropInput", b"Conv2DBackpropFilter"
-    ]:
+    for op_type in [b"MatMul", b"Conv2D", b"Conv2DBackpropFilter"]:
       matcher = re.compile(
           br"\s+" + op_type + br",\s*(\d+),\s*(\d+),\s*([\d\.eE+-]+)%,\s*" +
           br"([\d\.eE+-]+)%,\s*(-?\d+),\s*(\d+),", re.MULTILINE)
@@ -111,6 +109,31 @@ class PyWrapOptimizeGraphTest(test.TestCase):
       # self.assertTrue(0 < upper)
       # self.assertTrue(lower <= upper)
 
+  def testBasicMemory(self):
+    """Make sure arguments can be passed correctly."""
+    with test_util.device(use_gpu=False):
+      a = constant_op.constant(10, name="a")
+      b = constant_op.constant(20, name="b")
+      c = math_ops.add_n([a, b], name="c")
+      d = math_ops.add_n([b, c], name="d")
+      train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+      train_op.append(d)
+      mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
+
+    report = cost_analyzer.GenerateMemoryReport(mg)
+
+    # Print the report to make it easier to debug
+    print("{}".format(report))
+
+    # Check the report
+    self.assertTrue(
+        "Peak usage for device /job:localhost/replica:0/task:0/cpu:0: 16 bytes"
+        in report)
+    self.assertTrue("  a:0 uses 4 bytes" in report)
+    self.assertTrue("  b:0 uses 4 bytes" in report)
+    self.assertTrue("  c:0 uses 4 bytes" in report)
+    self.assertTrue("  d:0 uses 4 bytes" in report)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/grappler/item.i b/tensorflow/python/grappler/item.i
index 632f614558774169f03f23c2e29719bec5740f75..eb396ef1ad46c50b5efafd59fae28cc1c0f2d459 100644
--- a/tensorflow/python/grappler/item.i
+++ b/tensorflow/python/grappler/item.i
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+%include <std_shared_ptr.i>
 %typemap(in) const tensorflow::MetaGraphDef& (tensorflow::MetaGraphDef temp) {
   char* c_string;
   Py_ssize_t py_size;
@@ -30,6 +31,13 @@ limitations under the License.
   $1 = &temp;
 }
 
+// Wrap the item into an object that swig can manipulate. This ensures it will call the object
+// destructor upon garbage collection instead of leaking memory.
+struct GItem {
+  std::shared_ptr<tensorflow::grappler::GrapplerItem> item_;
+};
+
+
 %{
 #include <unordered_set>
 #include <map>
@@ -37,11 +45,30 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
 #include "tensorflow/core/grappler/grappler_item_builder.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+// Provide the implementation fo the GItem struct here.
+struct GItem {
+  GItem() {}
+  GItem(tensorflow::grappler::GrapplerItem* item) : item_(item) {}
 
-static tensorflow::grappler::GrapplerItem* TF_NewItem(
+  tensorflow::grappler::GrapplerItem* operator->() const {
+    return item_.get();
+  }
+  const tensorflow::grappler::GrapplerItem& operator*() const {
+    return *item_.get();
+  }
+  bool is_none() const {
+    return item_.get() == nullptr;
+  }
+  std::shared_ptr<tensorflow::grappler::GrapplerItem> item_;
+};
+
+static GItem TF_NewItem(
     const tensorflow::MetaGraphDef& meta_graph, bool ignore_colocation,
     bool ignore_user_placement, TF_Status* out_status) {
   if (meta_graph.collection_def().count("train_op") == 0) {
@@ -54,6 +81,7 @@ static tensorflow::grappler::GrapplerItem* TF_NewItem(
   tensorflow::grappler::ItemConfig cfg;
   cfg.ignore_user_placement = ignore_user_placement;
   cfg.ignore_colocation = ignore_colocation;
+  cfg.inline_functions = true;
   std::unique_ptr<tensorflow::grappler::GrapplerItem> item =
       tensorflow::grappler::GrapplerItemFromMetaGraphDef("item", meta_graph, cfg);
   if (!item) {
@@ -63,15 +91,12 @@ static tensorflow::grappler::GrapplerItem* TF_NewItem(
     return nullptr;
   }
   tensorflow::Set_TF_Status_from_Status(out_status, tensorflow::Status::OK());
-  return item.release();
-}
-
-static void TF_DeleteItem(tensorflow::grappler::GrapplerItem* item) {
-  delete item;
+  return GItem(item.release());
 }
 
-static std::vector<string> TF_IdentifyImportantOps(const tensorflow::grappler::GrapplerItem* item) {
-  if (!item) {
+static std::vector<string> TF_IdentifyImportantOps(GItem item, bool sort_topologically,
+                                                   TF_Status* status) {
+  if (item.is_none()) {
     return {};
   }
 
@@ -86,23 +111,39 @@ static std::vector<string> TF_IdentifyImportantOps(const tensorflow::grappler::G
   }
 
   std::vector<string> ops;
-  for (const auto& op_name : op_names) {
-    ops.push_back(op_name);
+  if (sort_topologically) {
+    tensorflow::GraphDef subgraph;
+    for (const tensorflow::NodeDef& node : item->graph.node()) {
+      if (op_names.find(node.name()) != op_names.end()) {
+        *subgraph.add_node() = node;
+      }
+    }
+    tensorflow::Status s = tensorflow::grappler::TopologicalSort(&subgraph);
+    tensorflow::Set_TF_Status_from_Status(status, s);
+    for (const tensorflow::NodeDef& node : subgraph.node()) {
+      ops.push_back(node.name());
+    }
+  }
+  else {
+    for (const auto& op_name : op_names) {
+      ops.push_back(op_name);
+    }
   }
 
   return ops;
 }
 
-static PyObject* TF_GetOpProperties(const tensorflow::grappler::GrapplerItem* item) {
-  if (!item) {
+static PyObject* TF_GetOpProperties(GItem item) {
+  if (item.is_none()) {
     Py_RETURN_NONE;
   }
   tensorflow::grappler::GraphProperties properties(*item);
-  tensorflow::Status status = properties.InferStatically();
+  tensorflow::Status status = properties.InferStatically(false);
   if (!status.ok()) {
     Py_RETURN_NONE;
   }
 
+  PyGILState_STATE gstate = PyGILState_Ensure();
   PyObject* props = PyDict_New();
   for (const auto& node : item->graph.node()) {
     const string& node_name = node.name();
@@ -117,8 +158,8 @@ static PyObject* TF_GetOpProperties(const tensorflow::grappler::GrapplerItem* it
       PyList_SetItem(prop, i, output_prop);
     }
     CHECK_EQ(0, PyDict_SetItem(props, PyString_FromString(node_name.c_str()), prop));
-   }
-
+  }
+  PyGILState_Release(gstate);
   return props;
 }
 
@@ -126,9 +167,9 @@ static PyObject* TF_GetOpProperties(const tensorflow::grappler::GrapplerItem* it
 
 
 // Wrap these functions.
-static tensorflow::grappler::GrapplerItem* TF_NewItem(
+static GItem TF_NewItem(
     const tensorflow::MetaGraphDef& meta_graph, bool ignore_colocation,
     bool ignore_user_placement, TF_Status* out_status);
-static void TF_DeleteItem(tensorflow::grappler::GrapplerItem* item);
-static std::vector<string> TF_IdentifyImportantOps(const tensorflow::grappler::GrapplerItem* item);
-static PyObject* TF_GetOpProperties(const tensorflow::grappler::GrapplerItem* item);
+static std::vector<string> TF_IdentifyImportantOps(GItem item, bool sort_topologically,
+                                                   TF_Status* status);
+static PyObject* TF_GetOpProperties(GItem item);
diff --git a/tensorflow/python/grappler/item.py b/tensorflow/python/grappler/item.py
index f53fc7f337d9d76699a89e6808098531d9ed20eb..c6e66d3c27e245de2cdb4aa84f6c9ed43f5c1de3 100644
--- a/tensorflow/python/grappler/item.py
+++ b/tensorflow/python/grappler/item.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.grappler.costs import op_performance_data_pb2
+from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.python import pywrap_tensorflow as tf_item
 from tensorflow.python.framework import errors
 
@@ -42,21 +43,20 @@ class Item(object):
       ValueError: the metagraph is incomplete or invalid.
     """
     self._metagraph = metagraph
+    self._item_graph = meta_graph_pb2.MetaGraphDef()
+    self._item_graph.CopyFrom(metagraph)
+    self._ignore_colocation = ignore_colocation
+    self._ignore_user_placement = ignore_user_placement
     self._tf_item = None
-    with errors.raise_exception_on_not_ok_status() as status:
-      self._tf_item = tf_item.TF_NewItem(metagraph.SerializeToString(),
-                                         ignore_colocation,
-                                         ignore_user_placement, status)
-
-  def __del__(self):
-    if self._tf_item:
-      tf_item.TF_DeleteItem(self._tf_item)
+    self._BuildTFItem()
 
-  def IdentifyImportantOps(self):
-    return tf_item.TF_IdentifyImportantOps(self._tf_item)
+  def IdentifyImportantOps(self, sort_topologically=False):
+    with errors.raise_exception_on_not_ok_status() as status:
+      return tf_item.TF_IdentifyImportantOps(self.tf_item, sort_topologically,
+                                             status)
 
   def GetOpProperties(self):
-    ret_from_swig = tf_item.TF_GetOpProperties(self._tf_item)
+    ret_from_swig = tf_item.TF_GetOpProperties(self.tf_item)
     properties = {}
     for key, values in ret_from_swig.items():
       prop = []
@@ -72,4 +72,13 @@ class Item(object):
 
   @property
   def tf_item(self):
+    if self._item_graph != self._metagraph:
+      self._BuildTFItem()
+      self._item_graph.CopyFrom(self._metagraph)
     return self._tf_item
+
+  def _BuildTFItem(self):
+    with errors.raise_exception_on_not_ok_status() as status:
+      self._tf_item = tf_item.TF_NewItem(self._metagraph.SerializeToString(),
+                                         self._ignore_colocation,
+                                         self._ignore_user_placement, status)
diff --git a/tensorflow/python/grappler/item_test.py b/tensorflow/python/grappler/item_test.py
index 0739a7a0e4c8c142d3c46ac1697ab243d7982cde..71c68d25cd928d5cb2dc5028ed331d468c5b9826 100644
--- a/tensorflow/python/grappler/item_test.py
+++ b/tensorflow/python/grappler/item_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.grappler import item
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.platform import test
 
 
@@ -59,6 +60,7 @@ class ItemTest(test.TestCase):
       a = constant_op.constant(10)
       b = constant_op.constant(20)
       c = a + b
+      z = control_flow_ops.no_op()
       train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
       train_op.append(c)
       mg = meta_graph.create_meta_graph_def(graph=g)
@@ -69,9 +71,38 @@ class ItemTest(test.TestCase):
       for node in grappler_item.metagraph.graph_def.node:
         node_prop = op_properties[node.name]
 
-        self.assertEqual(1, len(node_prop))
-        self.assertEqual(dtypes.int32, node_prop[0].dtype)
-        self.assertEqual(tensor_shape.scalar(), node_prop[0].shape)
+        if node.name == z.name:
+          self.assertEqual(0, len(node_prop))
+        else:
+          self.assertEqual(1, len(node_prop))
+          self.assertEqual(dtypes.int32, node_prop[0].dtype)
+          self.assertEqual(tensor_shape.scalar(), node_prop[0].shape)
+
+  def testUpdates(self):
+    with ops.Graph().as_default() as g:
+      a = constant_op.constant(10)
+      b = constant_op.constant(20)
+      c = a + b
+      train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+      train_op.append(c)
+      mg = meta_graph.create_meta_graph_def(graph=g)
+      grappler_item = item.Item(mg)
+
+    initial_tf_item = grappler_item.tf_item
+    no_change_tf_item = grappler_item.tf_item
+    self.assertEqual(initial_tf_item, no_change_tf_item)
+
+    # Modify the placement.
+    for node in grappler_item.metagraph.graph_def.node:
+      node.device = '/cpu:0'
+    new_tf_item = grappler_item.tf_item
+    self.assertNotEqual(initial_tf_item, new_tf_item)
+
+    # Assign the same placement.
+    for node in grappler_item.metagraph.graph_def.node:
+      node.device = '/cpu:0'
+    newest_tf_item = grappler_item.tf_item
+    self.assertEqual(new_tf_item, newest_tf_item)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py
index 99a4d23b6aa0b91deb91d9b25d99bf659a96222d..d677385ebe319a920ae3f1cc5a31a4e90f90d3f3 100644
--- a/tensorflow/python/grappler/layout_optimizer_test.py
+++ b/tensorflow/python/grappler/layout_optimizer_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import device_properties_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.core.protobuf import saver_pb2
 from tensorflow.python.client import session
@@ -28,6 +29,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.grappler import cluster as gcluster
 from tensorflow.python.grappler import tf_optimizer
 from tensorflow.python.layers import convolutional as conv_layers
 from tensorflow.python.ops import array_ops
@@ -41,53 +43,107 @@ from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import saver as saver_lib
 
 
-def weight(shape):
-  """weights generates a weight of a given shape."""
+def _weight(shape):
+  """Generates a weight of a given shape."""
   return random_ops.truncated_normal(shape, seed=0, stddev=0.1)
 
 
-def bias(shape):
-  """bias generates a bias of a given shape."""
+def _bias(shape):
+  """Generates a bias of a given shape."""
   return constant_op.constant(0.1, shape=shape)
 
 
-def conv2d(x, w):
-  """conv2d returns a 2d convolution layer with full stride."""
+def _conv2d(x, w):
+  """Returns a 2d convolution layer with full stride."""
   return nn.conv2d(x, w, strides=[1, 1, 1, 1], padding='SAME')
 
 
-def max_pool_2x2(x):
-  """max_pool_2x2 downsamples a feature map by 2X."""
+def _max_pool_2x2(x):
+  """Downsamples a feature map by 2X."""
   return nn.max_pool(
       x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
 
 
 # Taken from tensorflow/examples/tutorials/mnist/mnist_deep.py
-def two_layer_model(x):
+def _two_layer_model(x):
   x_image = array_ops.reshape(x, [-1, 28, 28, 1])
-  w_conv1 = weight([5, 5, 1, 32])
-  b_conv1 = bias([32])
-  h_conv1 = nn.relu(conv2d(x_image, w_conv1) + b_conv1)
-  h_pool1 = max_pool_2x2(h_conv1)
-  w_conv2 = weight([5, 5, 32, 64])
-  b_conv2 = bias([64])
-  h_conv2 = nn.relu(conv2d(h_pool1, w_conv2) + b_conv2)
-  h_pool2 = max_pool_2x2(h_conv2)
+  w_conv1 = _weight([5, 5, 1, 32])
+  b_conv1 = _bias([32])
+  h_conv1 = nn.relu(_conv2d(x_image, w_conv1) + b_conv1)
+  h_pool1 = _max_pool_2x2(h_conv1)
+  w_conv2 = _weight([5, 5, 32, 64])
+  b_conv2 = _bias([64])
+  h_conv2 = nn.relu(_conv2d(h_pool1, w_conv2) + b_conv2)
+  h_pool2 = _max_pool_2x2(h_conv2)
   return h_pool2
 
 
-def loop():
+def _model_with_second_port():
+  random_seed.set_random_seed(0)
+  x = random_ops.truncated_normal([2, 5, 5, 4], seed=0)
+  scale = constant_op.constant(0.1, shape=[4])
+  offset = constant_op.constant(0.3, shape=[4])
+  y, mean, _ = nn.fused_batch_norm(x, scale, offset)
+  mul = math_ops.add(y, mean)
+  output = array_ops.identity(mul)
+  return output
+
+
+def _model_with_branch(x):
+  x_image = array_ops.reshape(x, [-1, 28, 28, 1])
+  w_conv1 = _weight([5, 5, 1, 32])
+  w_conv2 = _weight([5, 5, 1, 32])
+  c_conv1 = _conv2d(x_image, w_conv1)
+  c_conv2 = _conv2d(x_image, w_conv2)
+  add = math_ops.add(c_conv1, c_conv2)
+  return add
+
+
+def _model_with_vec_and_4d(x):
+  x_image = array_ops.reshape(x, [-1, 28, 28, 1])
+  w_conv1 = _weight([5, 5, 1, 32])
+  c_conv1 = _conv2d(x_image, w_conv1)
+  vector = constant_op.constant(6.4, shape=[32])
+  add = math_ops.add(c_conv1, vector)
+  return add
+
+
+def _loop():
+  random_seed.set_random_seed(0)
+  x1 = random_ops.truncated_normal([1, 784], seed=0)
+  x2 = random_ops.truncated_normal([1, 784], seed=0)
+  x3 = random_ops.truncated_normal([1, 784], seed=0)
+  x4 = random_ops.truncated_normal([1, 784], seed=0)
+  elems = (x1, x2, x3, x4)
+  outputs = functional_ops.map_fn(_two_layer_model, elems, dtype=dtypes.float32)
+  return outputs
+
+
+def _loop_with_branch():
+  random_seed.set_random_seed(0)
+  x1 = random_ops.truncated_normal([1, 784], seed=0)
+  x2 = random_ops.truncated_normal([1, 784], seed=0)
+  x3 = random_ops.truncated_normal([1, 784], seed=0)
+  x4 = random_ops.truncated_normal([1, 784], seed=0)
+  elems = (x1, x2, x3, x4)
+  outputs = functional_ops.map_fn(
+      _model_with_branch, elems, dtype=dtypes.float32)
+  return outputs
+
+
+def _loop_with_vec_and_4d():
   random_seed.set_random_seed(0)
   x1 = random_ops.truncated_normal([1, 784], seed=0)
   x2 = random_ops.truncated_normal([1, 784], seed=0)
   x3 = random_ops.truncated_normal([1, 784], seed=0)
   x4 = random_ops.truncated_normal([1, 784], seed=0)
   elems = (x1, x2, x3, x4)
-  outputs = functional_ops.map_fn(two_layer_model, elems, dtype=dtypes.float32)
+  outputs = functional_ops.map_fn(
+      _model_with_vec_and_4d, elems, dtype=dtypes.float32)
   return outputs
 
 
-def get_config(layout_optimizer=True):
+def _get_config(layout_optimizer=True):
   if layout_optimizer:
     rewrite_options = rewriter_config_pb2.RewriterConfig(
         layout_optimizer=rewriter_config_pb2.RewriterConfig.ON)
@@ -100,6 +156,30 @@ def get_config(layout_optimizer=True):
   return config
 
 
+def _simple_metagraph(depthwise=False):
+  random_seed.set_random_seed(0)
+  x = variables.Variable(random_ops.truncated_normal([1, 200, 200, 3], seed=0))
+  conv = conv_layers.separable_conv2d if depthwise else conv_layers.conv2d
+  y = conv(x, 32, [3, 3])
+  z = conv(y, 32, [3, 3])
+  optimizer = gradient_descent.GradientDescentOptimizer(1e-4)
+  loss = math_ops.reduce_mean(z)
+  train_op = optimizer.minimize(loss)
+  graph = ops.get_default_graph()
+  graph.add_to_collection('train_op', train_op)
+  meta_graph = saver_lib.export_meta_graph(graph_def=graph.as_graph_def())
+  return meta_graph
+
+
+def _get_cluster():
+  named_device = device_properties_pb2.NamedDevice()
+  named_device.name = '/GPU:0'
+  named_device.properties.type = 'GPU'
+  named_device.properties.environment['architecture'] = '4'
+  cluster = gcluster.Cluster(devices=[named_device])
+  return cluster
+
+
 class LayoutOptimizerTest(test.TestCase):
   """Tests the Grappler layout optimizer."""
 
@@ -107,7 +187,7 @@ class LayoutOptimizerTest(test.TestCase):
     ops.reset_default_graph()
     graph = ops.get_default_graph()
     with session.Session(
-        config=get_config(layout_optimizer), graph=graph) as sess:
+        config=_get_config(layout_optimizer), graph=graph) as sess:
       batch = 2
       height = 6
       width = 7
@@ -142,12 +222,12 @@ class LayoutOptimizerTest(test.TestCase):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
       x = random_ops.truncated_normal([1, 784], seed=0)
-      output = two_layer_model(x)
+      output = _two_layer_model(x)
 
       with session.Session() as sess:
         output_val_ref = sess.run(output)
 
-      with session.Session(config=get_config()) as sess:
+      with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
         output_val = sess.run(output, run_metadata=metadata)
 
@@ -162,45 +242,292 @@ class LayoutOptimizerTest(test.TestCase):
       # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
       expected_num_transposes = 2
       self.assertEqual(expected_num_transposes, num_transposes)
-      self.assertIn('LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Reshape-0',
-                    nodes)
-      self.assertIn('LayoutOptimizerTransposeNCHWToNHWC-Relu_1-MaxPool_1',
-                    nodes)
+      self.assertIn('LayoutOptimizerTransposeNHWCToNCHW-Conv2D-0', nodes)
+      self.assertIn('LayoutOptimizerTransposeNCHWToNHWC-Relu_1-0-0', nodes)
+
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testSplitWithNonConstAxis(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      dim = array_ops.placeholder(dtype='int32')
+      split = array_ops.split(conv, 2, axis=dim)
+      output = math_ops.reduce_sum(split[0])
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output, feed_dict={dim: 3})
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata, feed_dict={dim: 3})
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if node.name.startswith('LayoutOptimizerTranspose'):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Four transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self.assertIn('LayoutOptimizerTransposeNHWCToNCHW-Conv2D-0', nodes)
+      self.assertIn('LayoutOptimizerTransposeNCHWToNHWC-split-0-0', nodes)
+      self.assertIn('LayoutOptimizerVecPermuteNHWCToNCHW_split_0', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testPadWithConstPaddings(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      paddings_val = [[1, 2], [3, 4], [5, 6], [7, 8]]
+      paddings = constant_op.constant(
+          paddings_val, dtype='int32', name='PaddingsConst')
+      pad = array_ops.pad(conv, paddings)
+      output = array_ops.identity(pad)
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output)
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata)
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if node.name.startswith('LayoutOptimizerTranspose'):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Four transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self.assertIn('LayoutOptimizerTransposeNHWCToNCHW-Conv2D-0', nodes)
+      self.assertIn('LayoutOptimizerTransposeNCHWToNHWC-Pad-0-0', nodes)
+      self.assertIn('LayoutOptimizer-Pad-PaddingsConst', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testPadWithNonConstPaddings(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      paddings = array_ops.placeholder(dtype='int32')
+      pad = array_ops.pad(conv, paddings)
+      output = array_ops.identity(pad)
+
+      paddings_val = [[1, 2], [3, 4], [5, 6], [7, 8]]
+      with session.Session() as sess:
+        output_val_ref = sess.run(output, feed_dict={paddings: paddings_val})
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(
+            output, run_metadata=metadata, feed_dict={
+                paddings: paddings_val
+            })
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if node.name.startswith('LayoutOptimizerTranspose'):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Four transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self.assertIn('LayoutOptimizerTransposeNHWCToNCHW-Conv2D-0', nodes)
+      self.assertIn('LayoutOptimizerTransposeNCHWToNHWC-Pad-0-0', nodes)
+      self.assertIn('LayoutOptimizerVecPermuteNHWCToNCHW_Pad_1', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testSliceWithNonConstAxis(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      size = array_ops.placeholder(dtype='int32')
+      s = array_ops.slice(conv, [0, 0, 0, 0], size)
+      output = array_ops.identity(s)
+
+      size_val = [1, 2, 3, 4]
+      with session.Session() as sess:
+        output_val_ref = sess.run(output, feed_dict={size: size_val})
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(
+            output, run_metadata=metadata, feed_dict={
+                size: size_val
+            })
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if node.name.startswith('LayoutOptimizerTranspose'):
+          num_transposes += 1
+        nodes.append(node.name)
 
+      # Four transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self.assertIn('LayoutOptimizerTransposeNHWCToNCHW-Conv2D-0', nodes)
+      self.assertIn('LayoutOptimizerTransposeNCHWToNHWC-Slice-0-0', nodes)
+      self.assertIn('LayoutOptimizerVecPermuteNHWCToNCHW_Slice_2', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  def testShapeN(self):
+    if test.is_gpu_available(cuda_only=True):
+      x = array_ops.placeholder(dtype='float32')
+      conv = _two_layer_model(x)
+      shapen = array_ops.shape_n([conv, conv])
+      output = math_ops.add(shapen[0], shapen[1])
+
+      x_val = [1.7] * 784
+      with session.Session() as sess:
+        output_val_ref = sess.run(output, feed_dict={x: x_val})
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(
+            output, run_metadata=metadata, feed_dict={
+                x: x_val
+            })
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if node.name.startswith('LayoutOptimizerTranspose'):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      expected_num_transposes = 1
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self.assertIn('LayoutOptimizerTransposeNHWCToNCHW-Conv2D-0', nodes)
+      self.assertIn('LayoutOptimizerVecPermuteNCHWToNHWC-ShapeN-0-0', nodes)
+      self.assertAllEqual(output_val_ref, output_val)
+
   def testLoop(self):
     if test.is_gpu_available(cuda_only=True):
-      output = loop()
+      output = _loop()
 
       with session.Session() as sess:
         output_val_ref = sess.run(output)
 
-      with session.Session(config=get_config()) as sess:
+      with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
         output_val = sess.run(output, run_metadata=metadata)
 
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if node.name.startswith('LayoutOptimizerTranspose'):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Four transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self.assertIn('LayoutOptimizerTransposeNHWCToNCHW-map/while/Conv2D-0',
+                    nodes)
+      self.assertIn(
+          'LayoutOptimizerTransposeNCHWToNHWC-map/while/MaxPool_1-0-2', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
-  def testGradient(self):
-    if not test.is_gpu_available(cuda_only=True):
-      self.skipTest('GPU required')
+  def testLoopWithBranch(self):
+    if test.is_gpu_available(cuda_only=True):
+      output = _loop_with_branch()
 
-    random_seed.set_random_seed(0)
-    x = variables.Variable(
-        random_ops.truncated_normal([1, 200, 200, 3], seed=0))
-    y = conv_layers.conv2d(x, 32, [3, 3])
-    z = conv_layers.conv2d(y, 32, [3, 3])
-    optimizer = gradient_descent.GradientDescentOptimizer(1e-4)
-    loss = math_ops.reduce_mean(z)
-    train_op = optimizer.minimize(loss)
-    graph = ops.get_default_graph()
-    graph.add_to_collection('train_op', train_op)
-    meta_graph = saver_lib.export_meta_graph(graph_def=graph.as_graph_def())
+      with session.Session() as sess:
+        output_val_ref = sess.run(output)
 
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata)
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if node.name.startswith('LayoutOptimizerTranspose'):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self.assertIn('LayoutOptimizerTransposeNHWCToNCHW-map/while/Conv2D-0',
+                    nodes)
+      self.assertIn('LayoutOptimizerTransposeNCHWToNHWC-map/while/Add-0-2',
+                    nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testLoopWithVecAnd4D(self):
+    if test.is_gpu_available(cuda_only=True):
+      output = _loop_with_vec_and_4d()
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output)
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata)
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if node.name.startswith('LayoutOptimizerTranspose'):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self.assertIn('LayoutOptimizerTransposeNHWCToNCHW-map/while/Conv2D-0',
+                    nodes)
+      self.assertIn('LayoutOptimizerTransposeNCHWToNHWC-map/while/Add-0-2',
+                    nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testBinaryOpSecondPort(self):
+    if test.is_gpu_available(cuda_only=True):
+      output = _model_with_second_port()
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output)
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata)
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if node.name.startswith('LayoutOptimizerTranspose'):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self.assertIn('LayoutOptimizerTransposeNHWCToNCHW-FusedBatchNorm-0',
+                    nodes)
+      self.assertIn('LayoutOptimizerTransposeNCHWToNHWC-Add-0-0', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testGradient(self):
+    meta_graph = _simple_metagraph()
     rewrite_options = rewriter_config_pb2.RewriterConfig(
         layout_optimizer=rewriter_config_pb2.RewriterConfig.ON)
-    optimized_graph = tf_optimizer.OptimizeGraph(rewrite_options, meta_graph)
+    optimized_graph = tf_optimizer.OptimizeGraph(
+        rewrite_options, meta_graph, cluster=_get_cluster())
 
     found = 0
     for node in optimized_graph.node:
@@ -209,7 +536,27 @@ class LayoutOptimizerTest(test.TestCase):
         self.assertEqual(node.attr['data_format'].s, 'NCHW')
     self.assertEqual(found, 5)
 
+  def testDepthwise(self):
+    meta_graph = _simple_metagraph(depthwise=True)
+    rewrite_options = rewriter_config_pb2.RewriterConfig(
+        layout_optimizer=rewriter_config_pb2.RewriterConfig.ON)
+    optimized_graph = tf_optimizer.OptimizeGraph(
+        rewrite_options, meta_graph, cluster=_get_cluster())
+
+    found = 0
+    for node in optimized_graph.node:
+      if node.op in [
+          'DepthwiseConv2dNative', 'DepthwiseConv2dNativeBackpropFilter',
+          'DepthwiseConv2dNativeBackpropInput'
+      ]:
+        found += 1
+        self.assertEqual(node.attr['data_format'].s, 'NCHW')
+    self.assertEqual(found, 6)
+
   def testCheckpointCompatibility(self):
+    if not test.is_gpu_available(cuda_only=True):
+      self.skipTest('GPU required')
+
     checkpoint_path = self.get_temp_dir()
     self._train(checkpoint_path)
     vars_expected = self._train(checkpoint_path, restore=True)
diff --git a/tensorflow/python/grappler/memory_optimizer_test.py b/tensorflow/python/grappler/memory_optimizer_test.py
index 09cf5f2270424f9b3274c0c07d17f9166cdbb79e..9fbadeceb3b1a8c9f949bc59a5ec75c5b7420cac 100644
--- a/tensorflow/python/grappler/memory_optimizer_test.py
+++ b/tensorflow/python/grappler/memory_optimizer_test.py
@@ -128,6 +128,7 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
         rewriter_config_pb2.RewriterConfig(
             disable_model_pruning=True,
             constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
+            layout_optimizer=rewriter_config_pb2.RewriterConfig.OFF,
             arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
             memory_optimization=rewriter_config_pb2.RewriterConfig.
             RECOMPUTATION_HEURISTICS), original_metagraph)
@@ -151,6 +152,7 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
         rewriter_config_pb2.RewriterConfig(
             disable_model_pruning=True,
             constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
+            layout_optimizer=rewriter_config_pb2.RewriterConfig.OFF,
             arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
             memory_optimization=rewriter_config_pb2.RewriterConfig.
             RECOMPUTATION_HEURISTICS,
diff --git a/tensorflow/python/grappler/model_analyzer.cc b/tensorflow/python/grappler/model_analyzer.cc
index 7d365c3be923e216b44149921b76d734c2b9a82f..d23eb811ac2b0a6a8802979b4d966b5617c8a8d9 100644
--- a/tensorflow/python/grappler/model_analyzer.cc
+++ b/tensorflow/python/grappler/model_analyzer.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/python/grappler/model_analyzer.h"
 
 #include <iomanip>
+#include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -25,26 +26,26 @@ namespace grappler {
 
 ModelAnalyzer::ModelAnalyzer(const GrapplerItem& item) : item_(item) {}
 
-Status ModelAnalyzer::GenerateReport(std::ostream& os) {
+Status ModelAnalyzer::GenerateReport(bool debug, std::ostream& os) {
   GraphProperties properties(item_);
-  TF_RETURN_IF_ERROR(properties.InferStatically());
+  TF_RETURN_IF_ERROR(properties.InferStatically(false));
 
   for (const auto& node : item_.MainOpsFanin()) {
-    PrintNodeInfo(node, properties, os);
+    PrintNodeInfo(node, properties, debug, os);
   }
   for (const auto& node : item_.EnqueueOpsFanin()) {
-    PrintNodeInfo(node, properties, os);
+    PrintNodeInfo(node, properties, debug, os);
   }
 
   return Status::OK();
 }
 
 void ModelAnalyzer::PrintNodeInfo(const NodeDef* node,
-                                  const GraphProperties& properties,
+                                  const GraphProperties& properties, bool debug,
                                   std::ostream& os) const {
   os << node->name() << " [" << node->op() << "]" << std::endl;
   if (properties.HasOutputProperties(node->name())) {
-    std::vector<OpInfo::TensorProperties> props =
+    const std::vector<OpInfo::TensorProperties>& props =
         properties.GetOutputProperties(node->name());
     for (int i = 0; i < props.size(); ++i) {
       const OpInfo::TensorProperties& prop = props[i];
@@ -75,6 +76,27 @@ void ModelAnalyzer::PrintNodeInfo(const NodeDef* node,
       os << std::endl;
     }
   }
+
+  if (debug) {
+    const OpRegistrationData* op_reg_data;
+    Status status = OpRegistry::Global()->LookUp(node->op(), &op_reg_data);
+    if (!status.ok()) {
+      os << "\tCouldn't find op registration for " << node->op() << std::endl;
+    } else if (!op_reg_data->shape_inference_fn) {
+      os << "\tCouldn't find shape function for op " << node->op() << std::endl;
+    } else if (properties.HasInputProperties(node->name())) {
+      const std::vector<OpInfo::TensorProperties>& props =
+          properties.GetInputProperties(node->name());
+      for (int i = 0; i < props.size(); ++i) {
+        const OpInfo::TensorProperties& prop = props[i];
+        if (prop.has_value()) {
+          os << "\t"
+             << "input " << i << " (" << DataTypeString(prop.dtype())
+             << ") has known value" << std::endl;
+        }
+      }
+    }
+  }
 }
 
 }  // end namespace grappler
diff --git a/tensorflow/python/grappler/model_analyzer.h b/tensorflow/python/grappler/model_analyzer.h
index a14034103ca70e59ac24d88318edc198e7d1c5f4..5bc551927d88db723e21b29903d6f5b941048139 100644
--- a/tensorflow/python/grappler/model_analyzer.h
+++ b/tensorflow/python/grappler/model_analyzer.h
@@ -31,11 +31,11 @@ class GraphProperties;
 class ModelAnalyzer {
  public:
   explicit ModelAnalyzer(const GrapplerItem& item);
-  Status GenerateReport(std::ostream& os);
+  Status GenerateReport(bool debug, std::ostream& os);
 
  private:
   void PrintNodeInfo(const NodeDef* node, const GraphProperties& properties,
-                     std::ostream& os) const;
+                     bool debug, std::ostream& os) const;
 
   const GrapplerItem& item_;
 };
diff --git a/tensorflow/python/grappler/model_analyzer.i b/tensorflow/python/grappler/model_analyzer.i
index d74bd37c6372733d25d2b5766a302aa1701dac17..7c3a692d0efc501341ff1dff3cf24b8a4830ec84 100644
--- a/tensorflow/python/grappler/model_analyzer.i
+++ b/tensorflow/python/grappler/model_analyzer.i
@@ -40,7 +40,7 @@ limitations under the License.
 %}
 
 %{
-string GenerateModelReport(const tensorflow::MetaGraphDef& metagraph) {
+string GenerateModelReport(const tensorflow::MetaGraphDef& metagraph, bool debug) {
   tensorflow::grappler::ItemConfig cfg;
   cfg.apply_optimizations = false;
   std::unique_ptr<tensorflow::grappler::GrapplerItem> item =
@@ -48,15 +48,15 @@ string GenerateModelReport(const tensorflow::MetaGraphDef& metagraph) {
   if (!item) {
     return "Error: failed to preprocess metagraph: check your log file for errors";
   }
-  
+
   string suffix;
   tensorflow::grappler::ModelAnalyzer analyzer(*item);
 
   std::stringstream os;
-  analyzer.GenerateReport(os);
+  analyzer.GenerateReport(debug, os);
   return os.str();
 }
 
 %}
 
-string GenerateModelReport(const tensorflow::MetaGraphDef& metagraph);
+string GenerateModelReport(const tensorflow::MetaGraphDef& metagraph, bool debug);
diff --git a/tensorflow/python/grappler/model_analyzer.py b/tensorflow/python/grappler/model_analyzer.py
index c852d71ad8b047f5437ca62c49a5500bc29cec60..535889e1c4034952562a05e4d044fcafeddbc0ca 100644
--- a/tensorflow/python/grappler/model_analyzer.py
+++ b/tensorflow/python/grappler/model_analyzer.py
@@ -22,16 +22,18 @@ from tensorflow.python import pywrap_tensorflow as tf_wrap
 from tensorflow.python.framework import errors
 
 
-def GenerateModelReport(metagraph):
+def GenerateModelReport(metagraph, debug=False):
   """Report what's known statically about each node in the provided metagraph.
 
   Args:
     metagraph: A TensorFlow MetaGraphDef.
+    debug: Add some information useful for debugging.
 
   Returns:
     A string containing the report.
   """
   with errors.raise_exception_on_not_ok_status():
-    ret_from_swig = tf_wrap.GenerateModelReport(metagraph.SerializeToString())
+    ret_from_swig = tf_wrap.GenerateModelReport(metagraph.SerializeToString(),
+                                                debug)
 
   return ret_from_swig
diff --git a/tensorflow/python/grappler/model_analyzer_test.py b/tensorflow/python/grappler/model_analyzer_test.py
index b59d1650f4b5e4c7239c2275213e9a26c3aafafe..ec172755f1ae43fc7581e97c6a18471da45f9100 100644
--- a/tensorflow/python/grappler/model_analyzer_test.py
+++ b/tensorflow/python/grappler/model_analyzer_test.py
@@ -49,6 +49,24 @@ class PyWrapOptimizeGraphTest(test.TestCase):
     # Also print the report to make it easier to debug
     print("{}".format(report))
 
+  def testDebugMode(self):
+    """Make sure arguments can be passed correctly."""
+    a = constant_op.constant([10, 11], name="a")
+    b = constant_op.constant([10], name="b")
+    c = math_ops.add(a, b, name="c")
+    train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+    train_op.append(c)
+    mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
+
+    report = model_analyzer.GenerateModelReport(mg, debug=True)
+
+    # Check the report headers
+    self.assertTrue(b"input 0 (int32) has known value" in report)
+    self.assertTrue(b"input 1 (int32) has known value" in report)
+
+    # Also print the report to make it easier to debug
+    print("{}".format(report))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/grappler/tf_optimizer.i b/tensorflow/python/grappler/tf_optimizer.i
index f3d8fe194b848f94037044b5d3b5e2d8ebe01d1c..f0dd4483a635ddf39e7f51ad0008390c1feb2e13 100644
--- a/tensorflow/python/grappler/tf_optimizer.i
+++ b/tensorflow/python/grappler/tf_optimizer.i
@@ -15,6 +15,7 @@ limitations under the License.
 
 
 %include "tensorflow/python/platform/base.i"
+%include "cluster.i"
 
 %typemap(in) const tensorflow::MetaGraphDef& (tensorflow::MetaGraphDef temp) {
   char* c_string;
@@ -62,6 +63,7 @@ limitations under the License.
   #include "tensorflow/core/framework/graph.pb.h"
   #include "tensorflow/core/grappler/grappler_item.h"
   #include "tensorflow/core/grappler/grappler_item_builder.h"
+  #include "tensorflow/core/grappler/clusters/cluster.h"
   #include "tensorflow/core/grappler/clusters/utils.h"
   #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
   #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
@@ -91,22 +93,23 @@ void DetectDevices(std::unordered_map<string, tensorflow::DeviceProperties>* dev
 }
 
 PyObject* TF_OptimizeGraph(
+      GCluster cluster,
       const tensorflow::RewriterConfig& rewriter_config,
       const tensorflow::MetaGraphDef& metagraph,
-      const string& graph_id, TF_Status* out_status) {
+      bool verbose, const string& graph_id, TF_Status* out_status) {
     tensorflow::grappler::ItemConfig item_config;
     item_config.inline_functions = false;
     item_config.apply_optimizations = false;
     std::unique_ptr<tensorflow::grappler::GrapplerItem> grappler_item =
         tensorflow::grappler::GrapplerItemFromMetaGraphDef(graph_id, metagraph, item_config);
-    std::unordered_map<string, tensorflow::DeviceProperties> device_map;
-    DetectDevices(&device_map);
+
     tensorflow::DeviceBase* cpu_device = nullptr;
-    tensorflow::grappler::VirtualCluster cluster(device_map);
     tensorflow::GraphDef out_graph;
     tensorflow::grappler::MetaOptimizer optimizer(cpu_device, rewriter_config);
-    tensorflow::Status status = optimizer.Optimize(&cluster, *grappler_item, &out_graph);
-    optimizer.PrintResult();
+    tensorflow::Status status = optimizer.Optimize(cluster.get(), *grappler_item, &out_graph);
+    if (verbose) {
+      optimizer.PrintResult();
+    }
     tensorflow::Set_TF_Status_from_Status(out_status, status);
     string out_graph_str = out_graph.SerializeAsString();
     PyObject* ret = PyBytes_FromStringAndSize(out_graph_str.data(),
@@ -118,8 +121,9 @@ PyObject* TF_OptimizeGraph(
 
 // Wrap this function
 PyObject* TF_OptimizeGraph(
+    GCluster cluster,
     const tensorflow::RewriterConfig& rewriter_config,
-    const tensorflow::MetaGraphDef& metagraph,
+    const tensorflow::MetaGraphDef& metagraph, bool verbose,
     const string& graph_id, TF_Status* out_status);
 
 
diff --git a/tensorflow/python/grappler/tf_optimizer.py b/tensorflow/python/grappler/tf_optimizer.py
index d0464c6054293b8499231526317d5bd42bc88752..a73a4a98fc5a883cf8681a20ca332f16f3b7f0ce 100644
--- a/tensorflow/python/grappler/tf_optimizer.py
+++ b/tensorflow/python/grappler/tf_optimizer.py
@@ -21,14 +21,22 @@ from __future__ import print_function
 from tensorflow.core.framework import graph_pb2
 from tensorflow.python import pywrap_tensorflow as tf_opt
 from tensorflow.python.framework import errors
+from tensorflow.python.grappler import cluster as gcluster
 
 
-def OptimizeGraph(rewriter_config, metagraph, graph_id=b'graph_to_optimize'):
+def OptimizeGraph(rewriter_config,
+                  metagraph,
+                  verbose=True,
+                  graph_id=b'graph_to_optimize',
+                  cluster=None):
   """Optimize the provided metagraph."""
   with errors.raise_exception_on_not_ok_status() as status:
-    ret_from_swig = tf_opt.TF_OptimizeGraph(rewriter_config.SerializeToString(),
+    if cluster is None:
+      cluster = gcluster.Cluster()
+    ret_from_swig = tf_opt.TF_OptimizeGraph(cluster.tf_cluster,
+                                            rewriter_config.SerializeToString(),
                                             metagraph.SerializeToString(),
-                                            graph_id, status)
+                                            verbose, graph_id, status)
   if ret_from_swig is None:
     return None
   out_graph = graph_pb2.GraphDef().FromString(ret_from_swig)
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
old mode 100644
new mode 100755
index e4992afbca7a12366554fc810f37908a85f2413a..4a60b7835ec3ee6224a84dcfbd7e380f9454d8eb
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -150,6 +150,7 @@ py_library(
         "//tensorflow/python:variables",
         "//tensorflow/python/estimator",
         "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/saved_model",
         "@six_archive//:six",
     ],
 )
@@ -552,10 +553,11 @@ py_test(
 
 py_test(
     name = "data_utils_test",
-    size = "small",
+    size = "medium",
     srcs = ["_impl/keras/utils/data_utils_test.py"],
     srcs_version = "PY2AND3",
     tags = [
+        "no_windows",
         "noasan",  # times out
         "notsan",
     ],
diff --git a/tensorflow/python/keras/_impl/keras/__init__.py b/tensorflow/python/keras/_impl/keras/__init__.py
index 74cc9d0488c88de04bf29aafcd0e23895c59826a..a70250d796b4dd8d08ac65ebdac84b307b917b13 100644
--- a/tensorflow/python/keras/_impl/keras/__init__.py
+++ b/tensorflow/python/keras/_impl/keras/__init__.py
@@ -40,4 +40,4 @@ from tensorflow.python.keras._impl.keras.layers import Input
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.models import Sequential
 
-__version__ = '2.1.1-tf'
+__version__ = '2.1.2-tf'
diff --git a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py b/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
index 58841e5db06229727ea088388a901633216aa6fe..63ee83cb51e8366f391f192a9408566076cad468 100644
--- a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
+++ b/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
@@ -20,6 +20,8 @@ from __future__ import print_function
 
 import json
 
+import numpy as np
+
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
@@ -28,12 +30,15 @@ from tensorflow.python.platform import tf_logging as logging
 CLASS_INDEX = None
 CLASS_INDEX_PATH = 'https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json'
 
+# Global tensor of imagenet mean for preprocessing symbolic inputs
+_IMAGENET_MEAN = None
 
-def preprocess_input(x, data_format=None, mode='caffe'):
-  """Preprocesses a tensor encoding a batch of images.
+
+def _preprocess_numpy_input(x, data_format, mode):
+  """Preprocesses a image tensor as a Numpy array.
 
   Arguments:
-      x: input Numpy tensor, 4D.
+      x: input Numpy, 3D or 4D.
       data_format: data format of the image tensor.
       mode: One of "caffe", "tf".
           - caffe: will convert the images from RGB to BGR,
@@ -44,16 +49,11 @@ def preprocess_input(x, data_format=None, mode='caffe'):
               sample-wise.
 
   Returns:
-      Preprocessed tensor.
+      Preprocessed array.
   """
-  if data_format is None:
-    data_format = K.image_data_format()
-  assert data_format in {'channels_last', 'channels_first'}
-
   if mode == 'tf':
-    x /= 255.
-    x -= 0.5
-    x *= 2.
+    x /= 127.5
+    x -= 1.
     return x
 
   if data_format == 'channels_first':
@@ -79,6 +79,81 @@ def preprocess_input(x, data_format=None, mode='caffe'):
   return x
 
 
+def _preprocess_symbolic_input(x, data_format, mode):
+  """Preprocesses a symbolic image tensor.
+
+  Arguments:
+      x: symoblic tensor, 3D or 4D.
+      data_format: data format of the image tensor.
+      mode: One of "caffe", "tf".
+          - caffe: will convert the images from RGB to BGR,
+              then will zero-center each color channel with
+              respect to the ImageNet dataset,
+              without scaling.
+          - tf: will scale pixels between -1 and 1,
+              sample-wise.
+
+  Returns:
+      Preprocessed tensor.
+  """
+  global _IMAGENET_MEAN
+
+  if mode == 'tf':
+    x /= 127.5
+    x -= 1.
+    return x
+
+  if data_format == 'channels_first':
+    # 'RGB'->'BGR'
+    if K.ndim(x) == 3:
+      x = x[::-1, ...]
+    else:
+      x = x[:, ::-1, ...]
+  else:
+    # 'RGB'->'BGR'
+    x = x[..., ::-1]
+
+  if _IMAGENET_MEAN is None:
+    _IMAGENET_MEAN = K.constant(-np.array([103.939, 116.779, 123.68]))
+  # Zero-center by mean pixel
+  if K.dtype(x) != K.dtype(_IMAGENET_MEAN):
+    x = K.bias_add(x, K.cast(_IMAGENET_MEAN, K.dtype(x)), data_format)
+  else:
+    x = K.bias_add(x, _IMAGENET_MEAN, data_format)
+  return x
+
+
+def preprocess_input(x, data_format=None, mode='caffe'):
+  """Preprocesses a tensor encoding a batch of images.
+
+  Arguments:
+      x: input Numpy or symoblic tensor, 3D or 4D.
+      data_format: data format of the image tensor.
+      mode: One of "caffe", "tf".
+          - caffe: will convert the images from RGB to BGR,
+              then will zero-center each color channel with
+              respect to the ImageNet dataset,
+              without scaling.
+          - tf: will scale pixels between -1 and 1,
+              sample-wise.
+
+  Returns:
+      Preprocessed tensor.
+
+  Raises:
+      ValueError: in case of incorrect data_format.
+  """
+  if data_format is None:
+    data_format = K.image_data_format()
+  if data_format not in {'channels_first', 'channels_last'}:
+    raise ValueError('Unknown data_format ' + str(data_format))
+
+  if isinstance(x, np.ndarray):
+    return _preprocess_numpy_input(x, data_format=data_format, mode=mode)
+  else:
+    return _preprocess_symbolic_input(x, data_format=data_format, mode=mode)
+
+
 def decode_predictions(preds, top=5):
   """Decodes the prediction of an ImageNet model.
 
diff --git a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils_test.py b/tensorflow/python/keras/_impl/keras/applications/imagenet_utils_test.py
index 517ba91219fc0ec0b61ccd673b420021a0db483d..d843dace59f1c88744217fbaee605d2ac859ec55 100644
--- a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils_test.py
+++ b/tensorflow/python/keras/_impl/keras/applications/imagenet_utils_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.keras._impl import keras
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input
 from tensorflow.python.platform import test
 
 
@@ -29,22 +30,62 @@ class ImageNetUtilsTest(test.TestCase):
   def test_preprocess_input(self):
     # Test batch of images
     x = np.random.uniform(0, 255, (2, 10, 10, 3))
-    self.assertEqual(
-        keras.applications.imagenet_utils.preprocess_input(x).shape, x.shape)
-    out1 = keras.applications.imagenet_utils.preprocess_input(
-        x, 'channels_last')
-    out2 = keras.applications.imagenet_utils.preprocess_input(
-        np.transpose(x, (0, 3, 1, 2)), 'channels_first')
+    self.assertEqual(preprocess_input(x).shape, x.shape)
+    out1 = preprocess_input(x, 'channels_last')
+    out2 = preprocess_input(np.transpose(x, (0, 3, 1, 2)), 'channels_first')
     self.assertAllClose(out1, out2.transpose(0, 2, 3, 1))
 
     # Test single image
     x = np.random.uniform(0, 255, (10, 10, 3))
-    self.assertEqual(
-        keras.applications.imagenet_utils.preprocess_input(x).shape, x.shape)
-    out1 = keras.applications.imagenet_utils.preprocess_input(
-        x, 'channels_last')
-    out2 = keras.applications.imagenet_utils.preprocess_input(
-        np.transpose(x, (2, 0, 1)), 'channels_first')
+    self.assertEqual(preprocess_input(x).shape, x.shape)
+    out1 = preprocess_input(x, 'channels_last')
+    out2 = preprocess_input(np.transpose(x, (2, 0, 1)), 'channels_first')
+    self.assertAllClose(out1, out2.transpose(1, 2, 0))
+
+  def test_preprocess_input_symbolic(self):
+    # Test image batch
+    x = np.random.uniform(0, 255, (2, 10, 10, 3))
+    inputs = keras.layers.Input(shape=x.shape[1:])
+    outputs = keras.layers.Lambda(
+        preprocess_input, output_shape=x.shape[1:])(inputs)
+    model = keras.models.Model(inputs, outputs)
+    assert model.predict(x).shape == x.shape
+    # pylint: disable=g-long-lambda
+    outputs1 = keras.layers.Lambda(lambda x:
+                                   preprocess_input(x, 'channels_last'),
+                                   output_shape=x.shape[1:])(inputs)
+    model1 = keras.models.Model(inputs, outputs1)
+    out1 = model1.predict(x)
+    x2 = np.transpose(x, (0, 3, 1, 2))
+    inputs2 = keras.layers.Input(shape=x2.shape[1:])
+    # pylint: disable=g-long-lambda
+    outputs2 = keras.layers.Lambda(lambda x:
+                                   preprocess_input(x, 'channels_first'),
+                                   output_shape=x2.shape[1:])(inputs2)
+    model2 = keras.models.Model(inputs2, outputs2)
+    out2 = model2.predict(x2)
+    self.assertAllClose(out1, out2.transpose(0, 2, 3, 1))
+
+    # Test single image
+    x = np.random.uniform(0, 255, (10, 10, 3))
+    inputs = keras.layers.Input(shape=x.shape)
+    outputs = keras.layers.Lambda(preprocess_input,
+                                  output_shape=x.shape)(inputs)
+    model = keras.models.Model(inputs, outputs)
+    assert model.predict(x[np.newaxis])[0].shape == x.shape
+    # pylint: disable=g-long-lambda
+    outputs1 = keras.layers.Lambda(lambda x:
+                                   preprocess_input(x, 'channels_last'),
+                                   output_shape=x.shape)(inputs)
+    model1 = keras.models.Model(inputs, outputs1)
+    out1 = model1.predict(x[np.newaxis])[0]
+    x2 = np.transpose(x, (2, 0, 1))
+    inputs2 = keras.layers.Input(shape=x2.shape)
+    outputs2 = keras.layers.Lambda(lambda x:
+                                   preprocess_input(x, 'channels_first'),
+                                   output_shape=x2.shape)(inputs2)  # pylint: disable=g-long-lambda
+    model2 = keras.models.Model(inputs2, outputs2)
+    out2 = model2.predict(x2[np.newaxis])[0]
     self.assertAllClose(out1, out2.transpose(1, 2, 0))
 
   def test_obtain_input_shape(self):
diff --git a/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py b/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py
index de29b92575e48410614d3b32520d99436891344a..2e73cefb6ce32c2a770eb9bde5ffb220be2da92c 100644
--- a/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py
+++ b/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py
@@ -23,6 +23,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.applications import imagenet_utils
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
@@ -208,8 +210,9 @@ def InceptionResNetV2(include_top=True,  # pylint: disable=invalid-name
   Arguments:
       include_top: whether to include the fully-connected
           layer at the top of the network.
-      weights: one of `None` (random initialization)
-          or `'imagenet'` (pre-training on ImageNet).
+      weights: one of `None` (random initialization),
+          'imagenet' (pre-training on ImageNet),
+          or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
@@ -239,10 +242,11 @@ def InceptionResNetV2(include_top=True,  # pylint: disable=invalid-name
       ValueError: in case of invalid argument for `weights`,
           or invalid input shape.
   """
-  if weights not in {'imagenet', None}:
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization) or `imagenet` '
-                     '(pre-training on ImageNet).')
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
 
   if weights == 'imagenet' and include_top and classes != 1000:
     raise ValueError('If using `weights` as imagenet with `include_top`'
@@ -365,5 +369,7 @@ def InceptionResNetV2(include_top=True,  # pylint: disable=invalid-name
           cache_subdir='models',
           file_hash='d19885ff4a710c122648d3b5c3b684e4')
     model.load_weights(weights_path)
+  elif weights is not None:
+    model.load_weights(weights)
 
   return model
diff --git a/tensorflow/python/keras/_impl/keras/applications/inception_v3.py b/tensorflow/python/keras/_impl/keras/applications/inception_v3.py
index d4fea4fbb0223d079149224e2d3d89487834ca40..4424b9280413bb8e556ab376b0c0acccf4030c73 100644
--- a/tensorflow/python/keras/_impl/keras/applications/inception_v3.py
+++ b/tensorflow/python/keras/_impl/keras/applications/inception_v3.py
@@ -29,6 +29,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import layers
 from tensorflow.python.keras._impl.keras.applications import imagenet_utils
@@ -118,8 +120,9 @@ def InceptionV3(include_top=True,
   Arguments:
       include_top: whether to include the fully-connected
           layer at the top of the network.
-      weights: one of `None` (random initialization)
-          or "imagenet" (pre-training on ImageNet).
+      weights: one of `None` (random initialization),
+          "imagenet" (pre-training on ImageNet),
+          or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
@@ -151,10 +154,11 @@ def InceptionV3(include_top=True,
       ValueError: in case of invalid argument for `weights`,
           or invalid input shape.
   """
-  if weights not in {'imagenet', None}:
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization) or `imagenet` '
-                     '(pre-training on ImageNet).')
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
 
   if weights == 'imagenet' and include_top and classes != 1000:
     raise ValueError('If using `weights` as imagenet with `include_top`'
@@ -383,6 +387,8 @@ def InceptionV3(include_top=True,
           cache_subdir='models',
           file_hash='bcbd6486424b2319ff4ef7d526e38f63')
     model.load_weights(weights_path)
+  elif weights is not None:
+    model.load_weights(weights)
   return model
 
 
diff --git a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
index 653bd8c09f2d7a4ac2f6cb5e6c792b2285b378cc..5f97c138fc038688a009dfa83b48c8f367ee8df2 100644
--- a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
+++ b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
@@ -67,7 +67,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import warnings
+import os
 
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import constraints
@@ -89,6 +89,7 @@ from tensorflow.python.keras._impl.keras.layers import Reshape
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils import conv_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.platform import tf_logging as logging
 
 BASE_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.6/'
 
@@ -348,8 +349,9 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
       dropout: dropout rate
       include_top: whether to include the fully-connected
           layer at the top of the network.
-      weights: `None` (random initialization) or
-          `imagenet` (ImageNet weights)
+      weights: one of `None` (random initialization),
+          'imagenet' (pre-training on ImageNet),
+          or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of
           `layers.Input()`)
           to use as image input for the model.
@@ -378,16 +380,11 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
       RuntimeError: If attempting to run this model with a
           backend that does not support separable convolutions.
   """
-
-  if K.backend() != 'tensorflow':
-    raise RuntimeError('Only TensorFlow backend is currently supported, '
-                       'as other backends do not support '
-                       'depthwise convolution.')
-
-  if weights not in {'imagenet', None}:
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization) or `imagenet` '
-                     '(pre-training on ImageNet).')
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
 
   if weights == 'imagenet' and include_top and classes != 1000:
     raise ValueError('If using `weights` as ImageNet with `include_top` '
@@ -438,15 +435,15 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
                        ' Input shape provided = %s' % (input_shape,))
 
   if K.image_data_format() != 'channels_last':
-    warnings.warn('The MobileNet family of models is only available '
-                  'for the input data format "channels_last" '
-                  '(width, height, channels). '
-                  'However your settings specify the default '
-                  'data format "channels_first" (channels, width, height).'
-                  ' You should set `image_data_format="channels_last"` '
-                  'in your Keras config located at ~/.keras/keras.json. '
-                  'The model being returned right now will expect inputs '
-                  'to follow the "channels_last" data format.')
+    logging.warning('The MobileNet family of models is only available '
+                    'for the input data format "channels_last" '
+                    '(width, height, channels). '
+                    'However your settings specify the default '
+                    'data format "channels_first" (channels, width, height).'
+                    ' You should set `image_data_format="channels_last"` '
+                    'in your Keras config located at ~/.keras/keras.json. '
+                    'The model being returned right now will expect inputs '
+                    'to follow the "channels_last" data format.')
     K.set_image_data_format('channels_last')
     old_data_format = 'channels_first'
   else:
@@ -534,9 +531,13 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
       weigh_path = BASE_WEIGHT_PATH + model_name
       weights_path = get_file(model_name, weigh_path, cache_subdir='models')
     model.load_weights(weights_path)
+  elif weights is not None:
+    model.load_weights(weights)
 
   if old_data_format:
     K.set_image_data_format(old_data_format)
+  elif weights is not None:
+    model.load_weights(weights)
   return model
 
 
diff --git a/tensorflow/python/keras/_impl/keras/applications/resnet50.py b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
index 717b626fdc3c65d510cf190e53b4b1c04a89ebfa..8ab46693aa6e46de6c6df320c745ca9ed01fbe0b 100644
--- a/tensorflow/python/keras/_impl/keras/applications/resnet50.py
+++ b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
@@ -26,6 +26,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import layers
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
@@ -161,8 +163,9 @@ def ResNet50(include_top=True,
   Arguments:
       include_top: whether to include the fully-connected
           layer at the top of the network.
-      weights: one of `None` (random initialization)
-          or "imagenet" (pre-training on ImageNet).
+      weights: one of `None` (random initialization),
+          'imagenet' (pre-training on ImageNet),
+          or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
@@ -194,10 +197,11 @@ def ResNet50(include_top=True,
       ValueError: in case of invalid argument for `weights`,
           or invalid input shape.
   """
-  if weights not in {'imagenet', None}:
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization) or `imagenet` '
-                     '(pre-training on ImageNet).')
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
 
   if weights == 'imagenet' and include_top and classes != 1000:
     raise ValueError('If using `weights` as imagenet with `include_top`'
@@ -283,4 +287,6 @@ def ResNet50(include_top=True,
           cache_subdir='models',
           md5_hash='a268eb855778b3df3c7506639542a6af')
     model.load_weights(weights_path)
+  elif weights is not None:
+    model.load_weights(weights)
   return model
diff --git a/tensorflow/python/keras/_impl/keras/applications/vgg16.py b/tensorflow/python/keras/_impl/keras/applications/vgg16.py
index a0862e6407747cd0ad3d698c63da77b17c272e1b..38dbbdc809e708cc19d5529665352fe4807fad90 100644
--- a/tensorflow/python/keras/_impl/keras/applications/vgg16.py
+++ b/tensorflow/python/keras/_impl/keras/applications/vgg16.py
@@ -25,6 +25,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
@@ -68,8 +70,9 @@ def VGG16(include_top=True,
   Arguments:
       include_top: whether to include the 3 fully-connected
           layers at the top of the network.
-      weights: one of `None` (random initialization)
-          or "imagenet" (pre-training on ImageNet).
+      weights: one of `None` (random initialization),
+          'imagenet' (pre-training on ImageNet),
+          or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
@@ -101,10 +104,11 @@ def VGG16(include_top=True,
       ValueError: in case of invalid argument for `weights`,
           or invalid input shape.
   """
-  if weights not in {'imagenet', None}:
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization) or `imagenet` '
-                     '(pre-training on ImageNet).')
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
 
   if weights == 'imagenet' and include_top and classes != 1000:
     raise ValueError('If using `weights` as imagenet with `include_top`'
@@ -211,4 +215,6 @@ def VGG16(include_top=True,
         dense = model.get_layer(name='fc1')
         layer_utils.convert_dense_weights_data_format(dense, shape,
                                                       'channels_first')
+  elif weights is not None:
+    model.load_weights(weights)
   return model
diff --git a/tensorflow/python/keras/_impl/keras/applications/vgg19.py b/tensorflow/python/keras/_impl/keras/applications/vgg19.py
index cfa1c95336e8ab798e4d5bd67f9c7f89e4705ca7..126c64260b51a7d4e6ca653e850e22c03799dcb0 100644
--- a/tensorflow/python/keras/_impl/keras/applications/vgg19.py
+++ b/tensorflow/python/keras/_impl/keras/applications/vgg19.py
@@ -25,6 +25,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
@@ -68,8 +70,9 @@ def VGG19(include_top=True,
   Arguments:
       include_top: whether to include the 3 fully-connected
           layers at the top of the network.
-      weights: one of `None` (random initialization)
-          or "imagenet" (pre-training on ImageNet).
+      weights: one of `None` (random initialization),
+         'imagenet' (pre-training on ImageNet),
+         or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
@@ -101,10 +104,11 @@ def VGG19(include_top=True,
       ValueError: in case of invalid argument for `weights`,
           or invalid input shape.
   """
-  if weights not in {'imagenet', None}:
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization) or `imagenet` '
-                     '(pre-training on ImageNet).')
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
 
   if weights == 'imagenet' and include_top and classes != 1000:
     raise ValueError('If using `weights` as imagenet with `include_top`'
@@ -207,8 +211,6 @@ def VGG19(include_top=True,
           cache_subdir='models',
           file_hash='253f8cb515780f3b799900260a226db6')
     model.load_weights(weights_path)
-    if K.backend() == 'theano':
-      layer_utils.convert_all_kernels_in_model(model)
 
     if K.image_data_format() == 'channels_first':
       if include_top:
@@ -217,4 +219,6 @@ def VGG19(include_top=True,
         dense = model.get_layer(name='fc1')
         layer_utils.convert_dense_weights_data_format(dense, shape,
                                                       'channels_first')
+  elif weights is not None:
+    model.load_weights(weights)
   return model
diff --git a/tensorflow/python/keras/_impl/keras/applications/xception.py b/tensorflow/python/keras/_impl/keras/applications/xception.py
index 14f6ad809015aae451f8ddc13fa64166b06995a6..821983140852b9f1ab505376d824db2392f54391 100644
--- a/tensorflow/python/keras/_impl/keras/applications/xception.py
+++ b/tensorflow/python/keras/_impl/keras/applications/xception.py
@@ -36,6 +36,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import layers
 from tensorflow.python.keras._impl.keras.applications import imagenet_utils
@@ -80,8 +82,9 @@ def Xception(include_top=True,
   Arguments:
       include_top: whether to include the fully-connected
           layer at the top of the network.
-      weights: one of `None` (random initialization)
-          or "imagenet" (pre-training on ImageNet).
+      weights: one of `None` (random initialization),
+          'imagenet' (pre-training on ImageNet),
+          or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
@@ -114,18 +117,16 @@ def Xception(include_top=True,
       RuntimeError: If attempting to run this model with a
           backend that does not support separable convolutions.
   """
-  if weights not in {'imagenet', None}:
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization) or `imagenet` '
-                     '(pre-training on ImageNet).')
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
 
   if weights == 'imagenet' and include_top and classes != 1000:
     raise ValueError('If using `weights` as imagenet with `include_top`'
                      ' as true, `classes` should be 1000')
 
-  if K.backend() != 'tensorflow':
-    raise RuntimeError('The Xception model is only available with '
-                       'the TensorFlow backend.')
   if K.image_data_format() != 'channels_last':
     logging.warning(
         'The Xception model is only available for the '
@@ -297,9 +298,13 @@ def Xception(include_top=True,
           cache_subdir='models',
           file_hash='b0042744bf5b25fce3cb969f33bebb97')
     model.load_weights(weights_path)
+  elif weights is not None:
+    model.load_weights(weights)
 
   if old_data_format:
     K.set_image_data_format(old_data_format)
+  elif weights is not None:
+    model.load_weights(weights)
   return model
 
 
diff --git a/tensorflow/python/keras/_impl/keras/backend.py b/tensorflow/python/keras/_impl/keras/backend.py
index b029e5161f7f61cfbaa5a417da2d94b8f70637a5..9476085bd8cbc36f63d3c6c8ecad732b557a4f8a 100644
--- a/tensorflow/python/keras/_impl/keras/backend.py
+++ b/tensorflow/python/keras/_impl/keras/backend.py
@@ -2487,7 +2487,7 @@ class Function(object):
   """Runs a computation graph.
 
   It's possible to pass arguments to `tf.Session.run()` via `session_kwargs`.
-  In particular additonal operations via `fetches` argument and additional
+  In particular additional operations via `fetches` argument and additional
   tensor substitutions via `feed_dict` arguments. Note that given
   substitutions are merged with substitutions from `inputs`. Even though
   `feed_dict` is passed once in the constructor (called in `model.compile()`)
@@ -3120,8 +3120,8 @@ def sparse_categorical_crossentropy(target, output, from_logits=False):
   logits = array_ops.reshape(output, [-1, int(output_shape[-1])])
   res = nn.sparse_softmax_cross_entropy_with_logits(
       labels=targets, logits=logits)
-  if len(output_shape) == 3:
-    # if our output includes timesteps we need to reshape
+  if len(output_shape) >= 3:
+    # If our output includes timesteps or spatial dimensions we need to reshape
     return array_ops.reshape(res, array_ops.shape(output)[:-1])
   else:
     return res
diff --git a/tensorflow/python/keras/_impl/keras/callbacks.py b/tensorflow/python/keras/_impl/keras/callbacks.py
index 40a996a03f70051e8c8603bef2e8951669b12811..8da3b857182237a47daa0f00a2340959a448160e 100644
--- a/tensorflow/python/keras/_impl/keras/callbacks.py
+++ b/tensorflow/python/keras/_impl/keras/callbacks.py
@@ -189,6 +189,7 @@ class Callback(object):
 
   def __init__(self):
     self.validation_data = None
+    self.model = None
 
   def set_params(self, params):
     self.params = params
@@ -768,7 +769,7 @@ class TensorBoard(Callback):
       self.writer.add_summary(summary, epoch)
     self.writer.flush()
 
-  def on_train_end(self, _):
+  def on_train_end(self, logs=None):
     self.writer.close()
 
 
diff --git a/tensorflow/python/keras/_impl/keras/callbacks_test.py b/tensorflow/python/keras/_impl/keras/callbacks_test.py
index 97a650a9920608094356b783d7d90e1fddf52549..79dfcd1bb669db09de0cbaa103914efaaf19c6fb 100644
--- a/tensorflow/python/keras/_impl/keras/callbacks_test.py
+++ b/tensorflow/python/keras/_impl/keras/callbacks_test.py
@@ -19,16 +19,18 @@ from __future__ import division
 from __future__ import print_function
 
 import csv
-import multiprocessing
 import os
 import re
 import shutil
+import threading
+import unittest
 
 import numpy as np
 
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.platform import test
+from tensorflow.python.summary.writer import writer_cache
 
 try:
   import h5py  # pylint:disable=g-import-not-at-top
@@ -498,7 +500,10 @@ class KerasCallbacksTest(test.TestCase):
       values = []
       with open(fp) as f:
         for x in csv.reader(f):
-          values.append(x)
+          # In windows, due to \r\n line ends we may end up reading empty lines
+          # after each line. Skip empty lines.
+          if x:
+            values.append(x)
       assert 'nan' in values[-1], 'The last epoch was not logged.'
 
   def test_TerminateOnNaN(self):
@@ -678,23 +683,41 @@ class KerasCallbacksTest(test.TestCase):
             batch_size=5)]
 
       # fit w/o validation data should raise ValueError if histogram_freq > 0
+      cbs = callbacks_factory(histogram_freq=1)
       with self.assertRaises(ValueError):
-        model.fit(x_train, y_train, batch_size=BATCH_SIZE,
-                  callbacks=callbacks_factory(histogram_freq=1), epochs=3)
+        model.fit(
+            x_train, y_train, batch_size=BATCH_SIZE, callbacks=cbs, epochs=3)
+
+      for cb in cbs:
+        cb.on_train_end()
 
       # fit generator without validation data should raise ValueError if
       # histogram_freq > 0
+      cbs = callbacks_factory(histogram_freq=1)
       with self.assertRaises(ValueError):
-        model.fit_generator(data_generator(True), len(x_train), epochs=2,
-                            callbacks=callbacks_factory(histogram_freq=1))
+        model.fit_generator(
+            data_generator(True), len(x_train), epochs=2, callbacks=cbs)
+
+      for cb in cbs:
+        cb.on_train_end()
 
       # fit generator with validation data generator should raise ValueError if
       # histogram_freq > 0
+      cbs = callbacks_factory(histogram_freq=1)
       with self.assertRaises(ValueError):
-        model.fit_generator(data_generator(True), len(x_train), epochs=2,
-                            validation_data=data_generator(False),
-                            validation_steps=1,
-                            callbacks=callbacks_factory(histogram_freq=1))
+        model.fit_generator(
+            data_generator(True),
+            len(x_train),
+            epochs=2,
+            validation_data=data_generator(False),
+            validation_steps=1,
+            callbacks=cbs)
+
+      for cb in cbs:
+        cb.on_train_end()
+
+      # Make sure file writer cache is clear to avoid failures during cleanup.
+      writer_cache.FileWriterCache.clear()
 
   def test_TensorBoard_multi_input_output(self):
     np.random.seed(1337)
@@ -767,6 +790,9 @@ class KerasCallbacksTest(test.TestCase):
                           callbacks=callbacks_factory(histogram_freq=1))
       assert os.path.isdir(filepath)
 
+  @unittest.skipIf(
+      os.name == 'nt',
+      'use_multiprocessing=True does not work on windows properly.')
   def test_LambdaCallback(self):
     with self.test_session():
       np.random.seed(1337)
@@ -789,14 +815,15 @@ class KerasCallbacksTest(test.TestCase):
 
       # Start an arbitrary process that should run during model
       # training and be terminated after training has completed.
+      e = threading.Event()
+
       def target():
-        while True:
-          pass
+        e.wait()
 
-      p = multiprocessing.Process(target=target)
-      p.start()
+      t = threading.Thread(target=target)
+      t.start()
       cleanup_callback = keras.callbacks.LambdaCallback(
-          on_train_end=lambda logs: p.terminate())
+          on_train_end=lambda logs: e.set())
 
       cbks = [cleanup_callback]
       model.fit(
@@ -807,8 +834,8 @@ class KerasCallbacksTest(test.TestCase):
           callbacks=cbks,
           epochs=5,
           verbose=0)
-      p.join()
-      assert not p.is_alive()
+      t.join()
+      assert not t.is_alive()
 
   def test_TensorBoard_with_ReduceLROnPlateau(self):
     with self.test_session():
diff --git a/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py b/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py
index 4359be89280f7ffa3479af38cd66ebd3aaf6c30e..5d5d2c4f75003847306aad88a7a1f4804ee48707 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py
@@ -48,7 +48,7 @@ def load_data(path='boston_housing.npz', seed=113, test_split=0.2):
   f.close()
 
   np.random.seed(seed)
-  indices = np.arrange(len(x))
+  indices = np.arange(len(x))
   np.random.shuffle(indices)
   x = x[indices]
   y = y[indices]
diff --git a/tensorflow/python/keras/_impl/keras/datasets/imdb.py b/tensorflow/python/keras/_impl/keras/datasets/imdb.py
index 0e83473899c303e3ad96d253cf31a1def476fa52..c5b3f0476b0f6f4c512329a6c9fab33d898dfbfe 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/imdb.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/imdb.py
@@ -43,7 +43,7 @@ def load_data(path='imdb.npz',
           the most frequent words are kept
       skip_top: skip the top N most frequently occurring words
           (which may not be informative).
-      maxlen: truncate sequences after this length.
+      maxlen: sequences longer than this will be filtered out.
       seed: random seed for sample shuffling.
       start_char: The start of a sequence will be marked with this character.
           Set to 1 because 0 is usually the padding character.
diff --git a/tensorflow/python/keras/_impl/keras/engine/topology.py b/tensorflow/python/keras/_impl/keras/engine/topology.py
index 4a7bb2e83894f06c433964409ccb2bd3ebfed128..0ccb1722694af636162abd0d1495ff727f7e1814 100644
--- a/tensorflow/python/keras/_impl/keras/engine/topology.py
+++ b/tensorflow/python/keras/_impl/keras/engine/topology.py
@@ -1426,10 +1426,11 @@ def preprocess_weights_for_loading(layer,
         weights[1] = np.transpose(weights[1], (3, 2, 0, 1))
 
   # convert the weights of CuDNNLSTM so that they could be loaded into LSTM
-  if layer.__class__.__name__ == 'LSTM':
+  if layer.__class__.__name__ == 'LSTM' and len(weights) == 3:
     # determine if we're loading a CuDNNLSTM layer from the number of bias
     # weights:
     # CuDNNLSTM has (units * 8) weights; while LSTM has (units * 4)
+    # if there's no bias weight in the file, skip this conversion
     units = weights[1].shape[0]
     bias = weights[2]
     if len(bias) == units * 8:
diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py
index b4205bf4a397690ce6dd3424e0dd4076d9860e9d..debea2503ee2e440000847c0ce92185e3d230138 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training.py
@@ -194,7 +194,7 @@ def _standardize_sample_or_class_weights(x_weight, output_names, weight_type):
     return x_weights
   else:
     raise TypeError('The model has multiple outputs, so `' + weight_type + '` '
-                    'should be either a list of a dict. '
+                    'should be either a list or a dict. '
                     'Provided `' + weight_type + '` type not understood: ' +
                     str(x_weight))
 
@@ -375,7 +375,7 @@ def _make_batches(size, batch_size):
   Returns:
       A list of tuples of array indices.
   """
-  num_batches = int(np.ceil(size / float(batch_size)))
+  num_batches = (size + batch_size - 1) // batch_size  # round up
   return [(i * batch_size, min(size, (i + 1) * batch_size))
           for i in range(num_batches)]
 
@@ -657,7 +657,7 @@ class Model(Network):
     elif isinstance(loss, list):
       if len(loss) != len(self.outputs):
         raise ValueError('When passing a list as loss, '
-                         'it should have one entry per model outputs. '
+                         'it should have one entry per model output. '
                          'The model has ' + str(len(self.outputs)) +
                          ' outputs, but you passed loss=' + str(loss))
       loss_functions = [losses.get(l) for l in loss]
@@ -700,7 +700,7 @@ class Model(Network):
     elif isinstance(loss_weights, list):
       if len(loss_weights) != len(self.outputs):
         raise ValueError('When passing a list as loss_weights, '
-                         'it should have one entry per model outputs. '
+                         'it should have one entry per model output. '
                          'The model has ' + str(len(self.outputs)) +
                          ' outputs, but you passed loss_weights=' +
                          str(loss_weights))
@@ -716,7 +716,7 @@ class Model(Network):
       if isinstance(target_tensors, list):
         if len(target_tensors) != len(self.outputs):
           raise ValueError('When passing a list as `target_tensors`, '
-                           'it should have one entry per model outputs. '
+                           'it should have one entry per model output. '
                            'The model has ' + str(len(self.outputs)) +
                            ' outputs, but you passed target_tensors=' +
                            str(target_tensors))
@@ -789,7 +789,7 @@ class Model(Network):
     elif isinstance(sample_weight_mode, list):
       if len(sample_weight_mode) != len(self.outputs):
         raise ValueError('When passing a list as sample_weight_mode, '
-                         'it should have one entry per model outputs. '
+                         'it should have one entry per model output. '
                          'The model has ' + str(len(self.outputs)) +
                          ' outputs, but you passed '
                          'sample_weight_mode=' + str(sample_weight_mode))
@@ -1414,6 +1414,13 @@ class Model(Network):
                                      self._feed_loss_fns):
       if loss_fn is losses.sparse_categorical_crossentropy:
         output_shapes.append(output_shape[:-1] + (1,))
+      elif (not hasattr(loss_fn, '__name__') or
+            getattr(losses, loss_fn.__name__, None) is None):
+        # If `loss_fn` is not a function (e.g. callable class)
+        # or if it not in the `losses` module, then
+        # it is a user-defined loss and we make no assumptions
+        # about it.
+        output_shapes.append(None)
       else:
         output_shapes.append(output_shape)
     x = _standardize_input_data(
@@ -1919,7 +1926,7 @@ class Model(Network):
 
   def fit_generator(self,
                     generator,
-                    steps_per_epoch,
+                    steps_per_epoch=None,
                     epochs=1,
                     verbose=1,
                     callbacks=None,
@@ -1956,7 +1963,9 @@ class Model(Network):
             to yield from `generator` before declaring one epoch
             finished and starting the next epoch. It should typically
             be equal to the number of unique samples of your dataset
-            divided by the batch size. Not used if using `Sequence`.
+            divided by the batch size.
+            Optional for `Sequence`: if unspecified, will use
+            `len(generator)` as a number of steps.
         epochs: Integer, total number of iterations on the data.
         verbose: Verbosity mode, 0, 1, or 2.
         callbacks: List of callbacks to be called during training.
@@ -1967,11 +1976,15 @@ class Model(Network):
         validation_steps: Only relevant if `validation_data`
             is a generator. Total number of steps (batches of samples)
             to yield from `generator` before stopping.
+            Optional for `Sequence`: if unspecified, will use
+            `len(generator)` as a number of steps.
         class_weight: Dictionary mapping class indices to a weight
             for the class.
-        max_queue_size: Maximum size for the generator queue
-        workers: Maximum number of processes to spin up
-            when using process-based threading.
+        max_queue_size: Maximum size for the generator queue.
+        workers: Integer. Maximum number of processes to spin up
+            when using process based threading.
+            If unspecified, `workers` will default to 1. If 0, will
+            execute the generator on the main thread.
         use_multiprocessing: If True, use process based threading.
             Note that because
             this implementation relies on multiprocessing,
@@ -2031,15 +2044,33 @@ class Model(Network):
     if do_validation:
       self._make_test_function()
 
+    is_sequence = isinstance(generator, Sequence)
+    if not is_sequence and use_multiprocessing and workers > 1:
+      logging.warning('Using a generator with `use_multiprocessing=True`'
+                      ' and multiple workers may duplicate your data.'
+                      ' Please consider using the`keras.utils.Sequence'
+                      ' class.')
+    if steps_per_epoch is None:
+      if is_sequence:
+        steps_per_epoch = len(generator)
+      else:
+        raise ValueError('`steps_per_epoch=None` is only valid for a'
+                         ' generator based on the `keras.utils.Sequence`'
+                         ' class. Please specify `steps_per_epoch` or use'
+                         ' the `keras.utils.Sequence` class.')
+
     # python 2 has 'next', 3 has '__next__'
     # avoid any explicit version checks
-    val_gen = (hasattr(validation_data, 'next') or
-               hasattr(validation_data, '__next__') or
-               isinstance(validation_data, Sequence))
-    if val_gen and not validation_steps:
-      raise ValueError('When using a generator for validation data, '
-                       'you must specify a value for '
-                       '`validation_steps`.')
+    val_gen = (
+        hasattr(validation_data, 'next') or
+        hasattr(validation_data, '__next__') or
+        isinstance(validation_data, Sequence))
+    if (val_gen and not isinstance(validation_data, Sequence) and
+        not validation_steps):
+      raise ValueError('`validation_steps=None` is only valid for a'
+                       ' generator based on the `keras.utils.Sequence`'
+                       ' class. Please specify `validation_steps` or use'
+                       ' the `keras.utils.Sequence` class.')
 
     # Prepare display labels.
     out_labels = self._get_deduped_metrics_names()
@@ -2084,28 +2115,24 @@ class Model(Network):
         val_data += [0.]
       for cbk in callbacks:
         cbk.validation_data = val_data
-    is_sequence = isinstance(generator, Sequence)
-    if not is_sequence and use_multiprocessing and workers > 1:
-      logging.warning(
-          logging.warning('Using a generator with `use_multiprocessing=True`'
-                          ' and multiple workers may duplicate your data.'
-                          ' Please consider using the`keras.utils.Sequence'
-                          ' class.'))
-    if is_sequence:
-      steps_per_epoch = len(generator)
     enqueuer = None
 
     try:
-      if is_sequence:
-        enqueuer = OrderedEnqueuer(
-            generator, use_multiprocessing=use_multiprocessing, shuffle=shuffle)
+      if workers > 0:
+        if is_sequence:
+          enqueuer = OrderedEnqueuer(
+              generator,
+              use_multiprocessing=use_multiprocessing,
+              shuffle=shuffle)
+        else:
+          enqueuer = GeneratorEnqueuer(
+              generator,
+              use_multiprocessing=use_multiprocessing,
+              wait_time=wait_time)
+        enqueuer.start(workers=workers, max_queue_size=max_queue_size)
+        output_generator = enqueuer.get()
       else:
-        enqueuer = GeneratorEnqueuer(
-            generator,
-            use_multiprocessing=use_multiprocessing,
-            wait_time=wait_time)
-      enqueuer.start(workers=workers, max_queue_size=max_queue_size)
-      output_generator = enqueuer.get()
+        output_generator = generator
 
       callback_model.stop_training = False
       while epoch < epochs:
@@ -2119,6 +2146,7 @@ class Model(Network):
             raise ValueError('Output of generator should be '
                              'a tuple `(x, y, sample_weight)` '
                              'or `(x, y)`. Found: ' + str(generator_output))
+
           if len(generator_output) == 2:
             x, y = generator_output
             sample_weight = None
@@ -2196,7 +2224,7 @@ class Model(Network):
 
   def evaluate_generator(self,
                          generator,
-                         steps,
+                         steps=None,
                          max_queue_size=10,
                          workers=1,
                          use_multiprocessing=False,
@@ -2214,10 +2242,13 @@ class Model(Network):
             when using multiprocessing.
         steps: Total number of steps (batches of samples)
             to yield from `generator` before stopping.
-            Not used if using `Sequence`.
+            Optional for `Sequence`: if unspecified, will use
+            the `len(generator)` as a number of steps.
         max_queue_size: maximum size for the generator queue
-        workers: maximum number of processes to spin up
-            when using process-based threading.
+        workers: Integer. Maximum number of processes to spin up
+            when using process based threading.
+            If unspecified, `workers` will default to 1. If 0, will
+            execute the generator on the main thread.
         use_multiprocessing: if True, use process based threading.
             Note that because
             this implementation relies on multiprocessing,
@@ -2258,26 +2289,34 @@ class Model(Network):
     batch_sizes = []
     is_sequence = isinstance(generator, Sequence)
     if not is_sequence and use_multiprocessing and workers > 1:
-      logging.warning(
-          logging.warning('Using a generator with `use_multiprocessing=True`'
-                          ' and multiple workers may duplicate your data.'
-                          ' Please consider using the`keras.utils.Sequence'
-                          ' class.'))
-    if is_sequence:
-      steps = len(generator)
+      logging.warning('Using a generator with `use_multiprocessing=True`'
+                      ' and multiple workers may duplicate your data.'
+                      ' Please consider using the`keras.utils.Sequence'
+                      ' class.')
+    if steps is None:
+      if is_sequence:
+        steps = len(generator)
+      else:
+        raise ValueError('`steps=None` is only valid for a generator'
+                         ' based on the `keras.utils.Sequence` class.'
+                         ' Please specify `steps` or use the'
+                         ' `keras.utils.Sequence` class.')
     enqueuer = None
 
     try:
-      if is_sequence:
-        enqueuer = OrderedEnqueuer(
-            generator, use_multiprocessing=use_multiprocessing)
+      if workers > 0:
+        if is_sequence:
+          enqueuer = OrderedEnqueuer(
+              generator, use_multiprocessing=use_multiprocessing)
+        else:
+          enqueuer = GeneratorEnqueuer(
+              generator,
+              use_multiprocessing=use_multiprocessing,
+              wait_time=wait_time)
+        enqueuer.start(workers=workers, max_queue_size=max_queue_size)
+        output_generator = enqueuer.get()
       else:
-        enqueuer = GeneratorEnqueuer(
-            generator,
-            use_multiprocessing=use_multiprocessing,
-            wait_time=wait_time)
-      enqueuer.start(workers=workers, max_queue_size=max_queue_size)
-      output_generator = enqueuer.get()
+        output_generator = generator
 
       while steps_done < steps:
         generator_output = next(output_generator)
@@ -2297,11 +2336,11 @@ class Model(Network):
         outs = self.test_on_batch(x, y, sample_weight=sample_weight)
 
         if isinstance(x, list):
-          batch_size = len(x[0])
+          batch_size = x[0].shape[0]
         elif isinstance(x, dict):
-          batch_size = len(list(x.values())[0])
+          batch_size = list(x.values())[0].shape[0]
         else:
-          batch_size = len(x)
+          batch_size = x.shape[0]
         if batch_size == 0:
           raise ValueError('Received an empty batch. '
                            'Batches should at least contain one item.')
@@ -2325,7 +2364,7 @@ class Model(Network):
 
   def predict_generator(self,
                         generator,
-                        steps,
+                        steps=None,
                         max_queue_size=10,
                         workers=1,
                         use_multiprocessing=False,
@@ -2343,10 +2382,13 @@ class Model(Network):
                 when using multiprocessing.
         steps: Total number of steps (batches of samples)
             to yield from `generator` before stopping.
+            Optional for `Sequence`: if unspecified, will use
+            the `len(generator)` as a number of steps.
         max_queue_size: Maximum size for the generator queue.
-          Not used if using `Sequence`.
-        workers: Maximum number of processes to spin up
-            when using process-based threading.
+        workers: Integer. Maximum number of processes to spin up
+            when using process based threading.
+            If unspecified, `workers` will default to 1. If 0, will
+            execute the generator on the main thread.
         use_multiprocessing: If `True`, use process based threading.
             Note that because
             this implementation relies on multiprocessing,
@@ -2382,26 +2424,34 @@ class Model(Network):
     all_outs = []
     is_sequence = isinstance(generator, Sequence)
     if not is_sequence and use_multiprocessing and workers > 1:
-      logging.warning(
-          logging.warning('Using a generator with `use_multiprocessing=True`'
-                          ' and multiple workers may duplicate your data.'
-                          ' Please consider using the`keras.utils.Sequence'
-                          ' class.'))
-    if is_sequence:
-      steps = len(generator)
+      logging.warn('Using a generator with `use_multiprocessing=True`'
+                   ' and multiple workers may duplicate your data.'
+                   ' Please consider using the`keras.utils.Sequence'
+                   ' class.')
+    if steps is None:
+      if is_sequence:
+        steps = len(generator)
+      else:
+        raise ValueError('`steps=None` is only valid for a generator'
+                         ' based on the `keras.utils.Sequence` class.'
+                         ' Please specify `steps` or use the'
+                         ' `keras.utils.Sequence` class.')
     enqueuer = None
 
     try:
-      if is_sequence:
-        enqueuer = OrderedEnqueuer(
-            generator, use_multiprocessing=use_multiprocessing)
+      if workers > 0:
+        if is_sequence:
+          enqueuer = OrderedEnqueuer(
+              generator, use_multiprocessing=use_multiprocessing)
+        else:
+          enqueuer = GeneratorEnqueuer(
+              generator,
+              use_multiprocessing=use_multiprocessing,
+              wait_time=wait_time)
+        enqueuer.start(workers=workers, max_queue_size=max_queue_size)
+        output_generator = enqueuer.get()
       else:
-        enqueuer = GeneratorEnqueuer(
-            generator,
-            use_multiprocessing=use_multiprocessing,
-            wait_time=wait_time)
-      enqueuer.start(workers=workers, max_queue_size=max_queue_size)
-      output_generator = enqueuer.get()
+        output_generator = generator
 
       if verbose == 1:
         progbar = Progbar(target=steps)
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_test.py b/tensorflow/python/keras/_impl/keras/engine/training_test.py
index e2a06e8e778c5013b72005e5fe9f01fe5c94f127..78224814d3baa4e343bc8aac6af6415959159612 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_test.py
@@ -18,6 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+import unittest
+
 import numpy as np
 
 from tensorflow.python.keras._impl import keras
@@ -783,6 +786,9 @@ class TestDynamicTrainability(test.TestCase):
 
 class TestGeneratorMethods(test.TestCase):
 
+  @unittest.skipIf(
+      os.name == 'nt',
+      'use_multiprocessing=True does not work on windows properly.')
   def test_generator_methods(self):
     arr_data = np.random.random((50, 2))
     arr_labels = np.random.random((50,))
@@ -830,6 +836,11 @@ class TestGeneratorMethods(test.TestCase):
                             use_multiprocessing=False,
                             validation_data=custom_generator(),
                             validation_steps=10)
+        model.fit_generator(custom_generator(),
+                            steps_per_epoch=5,
+                            validation_data=custom_generator(),
+                            validation_steps=1,
+                            workers=0)
         model.predict_generator(custom_generator(),
                                 steps=5,
                                 max_queue_size=10,
@@ -839,6 +850,10 @@ class TestGeneratorMethods(test.TestCase):
                                 steps=5,
                                 max_queue_size=10,
                                 use_multiprocessing=False)
+        model.predict_generator(custom_generator(),
+                                steps=5,
+                                max_queue_size=10,
+                                workers=0)
         model.evaluate_generator(custom_generator(),
                                  steps=5,
                                  max_queue_size=10,
@@ -848,6 +863,11 @@ class TestGeneratorMethods(test.TestCase):
                                  steps=5,
                                  max_queue_size=10,
                                  use_multiprocessing=False)
+        model.evaluate_generator(custom_generator(),
+                                 steps=5,
+                                 max_queue_size=10,
+                                 use_multiprocessing=False,
+                                 workers=0)
 
         # Test legacy API
         model.fit_generator(custom_generator(),
diff --git a/tensorflow/python/keras/_impl/keras/estimator.py b/tensorflow/python/keras/_impl/keras/estimator.py
index 125e63e1b84603416237250d46cfd95441dff78b..624e92a04b8860d9a3974f2edb4a443482958259 100644
--- a/tensorflow/python/keras/_impl/keras/estimator.py
+++ b/tensorflow/python/keras/_impl/keras/estimator.py
@@ -19,9 +19,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 
 from tensorflow.python.client import session
 from tensorflow.python.estimator import estimator as estimator_lib
+from tensorflow.python.estimator import export as export_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
@@ -29,11 +31,23 @@ from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import models
 from tensorflow.python.keras._impl.keras.utils.generic_utils import CustomObjectScope
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics as metrics_module
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import training_util
 
+_DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+
+
+def _cast_tensor_to_floatx(x):
+  """Cast tensor to keras's floatx dtype if it is not already the same dtype."""
+  if x.dtype == K.floatx():
+    return x
+  else:
+    return math_ops.cast(x, K.floatx())
+
 
 def _create_ordered_io(keras_model, estimator_io_dict, is_input=True):
   """Create a list of tensors from IO dictionary based on Keras IO order.
@@ -63,7 +77,7 @@ def _create_ordered_io(keras_model, estimator_io_dict, is_input=True):
                                         ', '.join(keras_io_names)))
   tensors = []
   for io_name in keras_io_names:
-    tensors.append(estimator_io_dict[io_name])
+    tensors.append(_cast_tensor_to_floatx(estimator_io_dict[io_name]))
   return tensors
 
 
@@ -111,7 +125,8 @@ def _clone_and_build_model(mode,
       target_tensors = _create_ordered_io(keras_model, labels, is_input=False)
     else:
       target_tensors = [
-          sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(labels)
+          _cast_tensor_to_floatx(
+              sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(labels))
       ]
 
     model.compile(
@@ -184,7 +199,11 @@ def _create_keras_model_fn(keras_model, custom_objects=None):
         predictions=predictions,
         loss=loss,
         train_op=train_op,
-        eval_metric_ops=eval_metric_ops)
+        eval_metric_ops=eval_metric_ops,
+        export_outputs={
+            _DEFAULT_SERVING_KEY:
+            export_lib.export_output.PredictOutput(predictions)
+        })
 
   return model_fn
 
@@ -222,7 +241,7 @@ def _save_first_checkpoint(keras_model, estimator, custom_objects,
           K._initialize_variables(sess)
           # pylint: enable=protected-access
         saver = saver_lib.Saver()
-        saver.save(sess, estimator.model_dir + '/')
+        saver.save(sess, os.path.join(estimator.model_dir, 'keras_model.ckpt'))
 
 
 def model_to_estimator(keras_model=None,
@@ -232,6 +251,9 @@ def model_to_estimator(keras_model=None,
                        config=None):
   """Constructs an `Estimator` instance from given keras model.
 
+  For usage example, please see
+  @{$programmers_guide/estimators$creating_estimators_from_keras_models}.
+
   Args:
     keras_model: Keras model in memory.
     keras_model_path: Directory to a keras model on disk.
diff --git a/tensorflow/python/keras/_impl/keras/estimator_test.py b/tensorflow/python/keras/_impl/keras/estimator_test.py
index 1144aa3152b79860b6fa9c4f4c361028e10ee469..9fc48b4117e7ee2c717d5418754254aa02b82869 100644
--- a/tensorflow/python/keras/_impl/keras/estimator_test.py
+++ b/tensorflow/python/keras/_impl/keras/estimator_test.py
@@ -25,14 +25,13 @@ import numpy as np
 
 from tensorflow.python.estimator import run_config as run_config_lib
 from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.keras._impl.keras.applications import mobilenet
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
+from tensorflow.python.summary.writer import writer_cache
 
 
 try:
@@ -79,22 +78,17 @@ def get_resource_for_simple_model(is_sequential, is_evaluate):
   y_test = keras.utils.to_categorical(y_test)
 
   train_input_fn = numpy_io.numpy_input_fn(
-      x={input_name: np.array(x_train, dtype=np.float32)},
-      y=np.array(y_train, dtype=np.float32),
+      x={input_name: x_train},
+      y=y_train,
       shuffle=False,
       num_epochs=None,
       batch_size=16)
 
   evaluate_input_fn = numpy_io.numpy_input_fn(
-      x={input_name: np.array(x_test, dtype=np.float32)},
-      y=np.array(y_test, dtype=np.float32),
-      num_epochs=1,
-      shuffle=False)
+      x={input_name: x_test}, y=y_test, num_epochs=1, shuffle=False)
 
   predict_input_fn = numpy_io.numpy_input_fn(
-      x={input_name: np.array(x_test, dtype=np.float32)},
-      num_epochs=1,
-      shuffle=False)
+      x={input_name: x_test}, num_epochs=1, shuffle=False)
 
   inference_input_fn = evaluate_input_fn if is_evaluate else predict_input_fn
 
@@ -132,6 +126,8 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
         tf_random_seed=_RANDOM_SEED, model_dir=self._base_dir)
 
   def tearDown(self):
+    # Make sure nothing is stuck in limbo.
+    writer_cache.FileWriterCache.clear()
     if os.path.isdir(self._base_dir):
       gfile.DeleteRecursively(self._base_dir)
 
@@ -153,6 +149,8 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
         est_keras.train(input_fn=train_input_fn, steps=_TRAIN_SIZE / 16)
         after_eval_results = est_keras.evaluate(input_fn=eval_input_fn, steps=1)
         self.assertLess(after_eval_results['loss'], before_eval_results['loss'])
+
+      writer_cache.FileWriterCache.clear()
       gfile.DeleteRecursively(self._config.model_dir)
 
   def test_evaluate(self):
@@ -238,41 +236,13 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
     d_test = keras.utils.to_categorical(d_test)
 
     def train_input_fn():
-      input_dict = {
-          'input_a':
-              ops.convert_to_tensor(
-                  np.array(a_train, dtype=np.float32), dtype=dtypes.float32),
-          'input_b':
-              ops.convert_to_tensor(
-                  np.array(b_train, dtype=np.float32), dtype=dtypes.float32)
-      }
-      output_dict = {
-          'dense_2':
-              ops.convert_to_tensor(
-                  np.array(c_train, dtype=np.float32), dtype=dtypes.float32),
-          'dense_3':
-              ops.convert_to_tensor(
-                  np.array(d_train, dtype=np.float32), dtype=dtypes.float32)
-      }
+      input_dict = {'input_a': a_train, 'input_b': b_train}
+      output_dict = {'dense_2': c_train, 'dense_3': d_train}
       return input_dict, output_dict
 
     def eval_input_fn():
-      input_dict = {
-          'input_a':
-              ops.convert_to_tensor(
-                  np.array(a_test, dtype=np.float32), dtype=dtypes.float32),
-          'input_b':
-              ops.convert_to_tensor(
-                  np.array(b_test, dtype=np.float32), dtype=dtypes.float32)
-      }
-      output_dict = {
-          'dense_2':
-              ops.convert_to_tensor(
-                  np.array(c_test, dtype=np.float32), dtype=dtypes.float32),
-          'dense_3':
-              ops.convert_to_tensor(
-                  np.array(d_test, dtype=np.float32), dtype=dtypes.float32)
-      }
+      input_dict = {'input_a': a_test, 'input_b': b_test}
+      output_dict = {'dense_2': c_test, 'dense_3': d_test}
       return input_dict, output_dict
 
     with self.test_session():
@@ -342,26 +312,12 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
     y_train = keras.utils.to_categorical(y_train)
 
     def invald_input_name_input_fn():
-      input_dict = {
-          'invalid_input_name':
-              ops.convert_to_tensor(
-                  np.array(x_train, dtype=np.float32), dtype=dtypes.float32),
-      }
-      output = ops.convert_to_tensor(
-          np.array(y_train, dtype=np.float32), dtype=dtypes.float32)
-      return input_dict, output
+      input_dict = {'invalid_input_name': x_train}
+      return input_dict, y_train
 
     def invald_output_name_input_fn():
-      input_dict = {
-          'input_1':
-              ops.convert_to_tensor(
-                  np.array(x_train, dtype=np.float32), dtype=dtypes.float32),
-      }
-      output_dict = {
-          'invalid_output_name':
-              ops.convert_to_tensor(
-                  np.array(y_train, dtype=np.float32), dtype=dtypes.float32),
-      }
+      input_dict = {'input_1': x_train}
+      output_dict = {'invalid_output_name': y_train}
       return input_dict, output_dict
 
     model = simple_functional_model()
diff --git a/tensorflow/python/keras/_impl/keras/layers/core.py b/tensorflow/python/keras/_impl/keras/layers/core.py
index 517129fab05a504245725032e715b624a3b975a7..6a745844b2dd681bd2a9cc73f0207e8aa88044c8 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core.py
@@ -104,13 +104,13 @@ class Dropout(tf_core_layers.Dropout, Layer):
   """
 
   def __init__(self, rate, noise_shape=None, seed=None, **kwargs):
-    self.supports_masking = True
     # Inheritance call order:
     # 1) tf.layers.Dropout, 2) keras.layers.Layer, 3) tf.layers.Layer
     super(Dropout, self).__init__(rate=rate,
                                   noise_shape=noise_shape,
                                   seed=seed,
                                   **kwargs)
+    self.supports_masking = True
 
   def call(self, inputs, training=None):
     if training is None:
@@ -547,8 +547,19 @@ class Lambda(Layer):
   Arguments:
       function: The function to be evaluated.
           Takes input tensor as first argument.
+      output_shape: Expected output shape from function.
+            This argument can be inferred if not explicitly provided.
+            Can be a tuple or function.
+            If a tuple, it only specifies the first dimension onward;
+                 sample dimension is assumed either the same as the input:
+                 `output_shape = (input_shape[0], ) + output_shape`
+                 or, the input is `None` and
+                 the sample dimension is also `None`:
+                 `output_shape = (None, ) + output_shape`
+            If a function, it specifies the entire shape as a function of the
+            input shape: `output_shape = f(input_shape)`
       arguments: optional dictionary of keyword arguments to be passed
-          to the function.
+            to the function.
 
   Input shape:
       Arbitrary. Use the keyword argument input_shape
@@ -557,16 +568,52 @@ class Lambda(Layer):
 
   Output shape:
       Specified by `output_shape` argument
-      (or auto-inferred when using TensorFlow).
   """
 
-  def __init__(self, function, mask=None, arguments=None, **kwargs):
+  def __init__(self, function, output_shape=None, mask=None, arguments=None,
+               **kwargs):
     super(Lambda, self).__init__(**kwargs)
     self.function = function
     self.arguments = arguments if arguments else {}
     if mask is not None:
       self.supports_masking = True
     self.mask = mask
+    if output_shape is None:
+      self._output_shape = None
+    elif isinstance(output_shape, (tuple, list)):
+      self._output_shape = tuple(output_shape)
+    else:
+      if not callable(output_shape):
+        raise TypeError('In Lambda, `output_shape` '
+                        'must be a list, a tuple, or a function.')
+      self._output_shape = output_shape
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
+
+    if self._output_shape is None:
+      x = K.placeholder(shape=input_shape)
+      x = self.call(x)
+      if isinstance(x, list):
+        return [tensor_shape.TensorShape(K.int_shape(x_elem)) for x_elem in x]
+      else:
+        return tensor_shape.TensorShape(K.int_shape(x))
+    elif isinstance(self._output_shape, (tuple, list)):
+      if isinstance(input_shape, list):
+        num_samples = input_shape[0][0]
+      else:
+        num_samples = input_shape[0] if input_shape else None
+      return tensor_shape.TensorShape((num_samples,) +
+                                      tuple(self._output_shape))
+    else:
+      shape = self._output_shape(input_shape)
+      if not isinstance(shape, (list, tuple)):
+        raise ValueError(
+            '`output_shape` function must return a tuple or a list of tuples.')
+      if isinstance(shape, list):
+        if isinstance(shape[0], int) or shape[0] is None:
+          shape = tuple(shape)
+      return tensor_shape.TensorShape(shape)
 
   def call(self, inputs, mask=None):
     arguments = self.arguments
@@ -587,9 +634,21 @@ class Lambda(Layer):
       function = self.function.__name__
       function_type = 'function'
 
+    if isinstance(self._output_shape, python_types.LambdaType):
+      output_shape = func_dump(self._output_shape)
+      output_shape_type = 'lambda'
+    elif callable(self._output_shape):
+      output_shape = self._output_shape.__name__
+      output_shape_type = 'function'
+    else:
+      output_shape = self._output_shape
+      output_shape_type = 'raw'
+
     config = {
         'function': function,
         'function_type': function_type,
+        'output_shape': output_shape,
+        'output_shape_type': output_shape_type,
         'arguments': self.arguments
     }
     base_config = super(Lambda, self).get_config()
@@ -614,6 +673,19 @@ class Lambda(Layer):
     else:
       raise TypeError('Unknown function type:', function_type)
 
+    output_shape_type = config.pop('output_shape_type')
+    if output_shape_type == 'function':
+      # Simple lookup in custom objects
+      output_shape = deserialize_keras_object(
+          config['output_shape'],
+          custom_objects=custom_objects,
+          printable_module_name='output_shape function in Lambda layer')
+    elif output_shape_type == 'lambda':
+      # Unsafe deserialization from bytecode
+      output_shape = func_load(config['output_shape'], globs=globs)
+    else:
+      output_shape = config['output_shape']
+
     # If arguments were numpy array, they have been saved as
     # list. We need to recover the ndarray
     if 'arguments' in config:
@@ -625,6 +697,7 @@ class Lambda(Layer):
             config['arguments'][key] = np.array(arg_dict['value'])
 
     config['function'] = function
+    config['output_shape'] = output_shape
     return cls(**config)
 
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/core_test.py b/tensorflow/python/keras/_impl/keras/layers/core_test.py
index dd768dc268ef6b39f64b522fd88393610c832287..bdb99c91c289cf808fec7b891376dbfcf5504aca 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core_test.py
@@ -47,6 +47,11 @@ class CoreLayersTest(test.TestCase):
                   'noise_shape': [3, 1]},
           input_shape=(3, 2))
 
+    # https://github.com/tensorflow/tensorflow/issues/14819
+    with self.test_session():
+      dropout = keras.layers.Dropout(0.5)
+      self.assertEqual(True, dropout.supports_masking)
+
     with self.test_session():
       testing_utils.layer_test(
           keras.layers.SpatialDropout1D,
@@ -220,6 +225,34 @@ class CoreLayersTest(test.TestCase):
       self.assertEqual(1, len(layer.losses))
       _ = layer.get_config()
 
+  def test_lambda_output_shape(self):
+    with self.test_session():
+      l = keras.layers.Lambda(lambda x: x + 1, output_shape=(1, 1))
+      l(keras.backend.variable(np.ones((1, 1))))
+      self.assertEqual((1, 1), l.get_config()['output_shape'])
+
+  def test_lambda_output_shape_function(self):
+    def get_output_shape(input_shape):
+      return 1 * input_shape
+
+    with self.test_session():
+      l = keras.layers.Lambda(lambda x: x + 1, output_shape=get_output_shape)
+      l(keras.backend.variable(np.ones((1, 1))))
+      self.assertEqual('lambda', l.get_config()['output_shape_type'])
+
+  def test_lambda_config_serialization(self):
+    with self.test_session():
+      # test serialization with output_shape and output_shape_type
+      layer = keras.layers.Lambda(lambda x: x + 1, output_shape=(1, 1))
+      layer(keras.backend.variable(np.ones((1, 1))))
+      config = layer.get_config()
+      layer = keras.layers.deserialize({
+          'class_name': 'Lambda',
+          'config': config
+      })
+
+      layer = keras.layers.Lambda.from_config(config)
 
 if __name__ == '__main__':
   test.main()
+
diff --git a/tensorflow/python/keras/_impl/keras/losses.py b/tensorflow/python/keras/_impl/keras/losses.py
index 19212aeee8cd4fbc723ba3e47c9d3e226ec339a9..1d6319abb13619932fe76966a69004dcfcd0e022 100644
--- a/tensorflow/python/keras/_impl/keras/losses.py
+++ b/tensorflow/python/keras/_impl/keras/losses.py
@@ -61,10 +61,10 @@ def categorical_hinge(y_true, y_pred):
 
 def logcosh(y_true, y_pred):
 
-  def cosh(x):
-    return (K.exp(x) + K.exp(-x)) / 2
+  def _logcosh(x):
+    return x + K.softplus(-2. * x) - K.log(2.)
 
-  return K.mean(K.log(cosh(y_pred - y_true)), axis=-1)
+  return K.mean(_logcosh(y_pred - y_true), axis=-1)
 
 
 def categorical_crossentropy(y_true, y_pred):
diff --git a/tensorflow/python/keras/_impl/keras/models.py b/tensorflow/python/keras/_impl/keras/models.py
index ba202827ce3fca397ab487f58c01667b9b0c4444..e262cc8c8e9d728c1e7f504ffaf543faa1f3db50 100644
--- a/tensorflow/python/keras/_impl/keras/models.py
+++ b/tensorflow/python/keras/_impl/keras/models.py
@@ -1070,7 +1070,7 @@ class Sequential(Model):
 
   def fit_generator(self,
                     generator,
-                    steps_per_epoch,
+                    steps_per_epoch=None,
                     epochs=1,
                     verbose=1,
                     callbacks=None,
@@ -1101,8 +1101,10 @@ class Sequential(Model):
         steps_per_epoch: Total number of steps (batches of samples)
             to yield from `generator` before declaring one epoch
             finished and starting the next epoch. It should typically
-            be equal to the number of unique samples of your dataset
+            be equal to the number of samples of your dataset
             divided by the batch size.
+            Optional for `Sequence`: if unspecified, will use
+            the `len(generator)` as a number of steps.
         epochs: Integer, total number of iterations on the data.
             Note that in conjunction with initial_epoch, the parameter
             epochs is to be understood as "final epoch". The model is
@@ -1118,8 +1120,10 @@ class Sequential(Model):
             is a generator.
             Number of steps to yield from validation generator
             at the end of every epoch. It should typically
-            be equal to the number of unique samples of your
+            be equal to the number of samples of your
             validation dataset divided by the batch size.
+            Optional for `Sequence`: if unspecified, will use
+            the `len(validation_data)` as a number of steps.
         class_weight: Dictionary mapping class indices to a weight
             for the class.
         max_queue_size: Maximum size for the generator queue
@@ -1195,7 +1199,7 @@ class Sequential(Model):
 
   def evaluate_generator(self,
                          generator,
-                         steps,
+                         steps=None,
                          max_queue_size=10,
                          workers=1,
                          use_multiprocessing=False,
@@ -1210,6 +1214,8 @@ class Sequential(Model):
             or (inputs, targets, sample_weights)
         steps: Total number of steps (batches of samples)
             to yield from `generator` before stopping.
+            Optional for `Sequence`: if unspecified, will use
+            the `len(generator)` as a number of steps.
         max_queue_size: maximum size for the generator queue
         workers: maximum number of processes to spin up
         use_multiprocessing: if True, use process based threading.
@@ -1254,7 +1260,7 @@ class Sequential(Model):
 
   def predict_generator(self,
                         generator,
-                        steps,
+                        steps=None,
                         max_queue_size=10,
                         workers=1,
                         use_multiprocessing=False,
@@ -1269,6 +1275,8 @@ class Sequential(Model):
         generator: generator yielding batches of input samples.
         steps: Total number of steps (batches of samples)
             to yield from `generator` before stopping.
+            Optional for `Sequence`: if unspecified, will use
+            the `len(generator)` as a number of steps.
         max_queue_size: maximum size for the generator queue
         workers: maximum number of processes to spin up
         use_multiprocessing: if True, use process based threading.
diff --git a/tensorflow/python/keras/_impl/keras/models_test.py b/tensorflow/python/keras/_impl/keras/models_test.py
index 86acac4604a2b87919704ae86f86ac2dd4d6b25f..61938066b98b9f6bb48e7e68870d15ed60ad3dd9 100644
--- a/tensorflow/python/keras/_impl/keras/models_test.py
+++ b/tensorflow/python/keras/_impl/keras/models_test.py
@@ -54,10 +54,11 @@ class TestModelSaving(test.TestCase):
       model.train_on_batch(x, y)
 
       out = model.predict(x)
-      _, fname = tempfile.mkstemp('.h5')
+      fd, fname = tempfile.mkstemp('.h5')
       keras.models.save_model(model, fname)
 
       new_model = keras.models.load_model(fname)
+      os.close(fd)
       os.remove(fname)
 
       out2 = new_model.predict(x)
@@ -95,13 +96,14 @@ class TestModelSaving(test.TestCase):
       model.train_on_batch(x, y)
 
       out = model.predict(x)
-      _, fname = tempfile.mkstemp('.h5')
+      fd, fname = tempfile.mkstemp('.h5')
       keras.models.save_model(model, fname)
 
       model = keras.models.load_model(
           fname,
           custom_objects={'CustomOp': CustomOp,
                           'custom_loss': custom_loss})
+      os.close(fd)
       os.remove(fname)
 
       out2 = model.predict(x)
@@ -125,10 +127,11 @@ class TestModelSaving(test.TestCase):
       model.train_on_batch(x, y)
 
       out = model.predict(x)
-      _, fname = tempfile.mkstemp('.h5')
+      fd, fname = tempfile.mkstemp('.h5')
       keras.models.save_model(model, fname)
 
       model = keras.models.load_model(fname)
+      os.close(fd)
       os.remove(fname)
 
       out2 = model.predict(x)
@@ -144,9 +147,10 @@ class TestModelSaving(test.TestCase):
       model.add(keras.layers.Dense(3))
       model.compile(loss='mse', optimizer='sgd', metrics=['acc'])
 
-      _, fname = tempfile.mkstemp('.h5')
+      fd, fname = tempfile.mkstemp('.h5')
       keras.models.save_model(model, fname)
       model = keras.models.load_model(fname)
+      os.close(fd)
       os.remove(fname)
 
   def test_saving_with_tf_optimizer(self):
@@ -161,9 +165,10 @@ class TestModelSaving(test.TestCase):
                     optimizer=training_module.AdadeltaOptimizer(0.1),
                     metrics=['acc'])
 
-      _, fname = tempfile.mkstemp('.h5')
+      fd, fname = tempfile.mkstemp('.h5')
       keras.models.save_model(model, fname)
       model = keras.models.load_model(fname)
+      os.close(fd)
       os.remove(fname)
 
   def test_saving_right_after_compilation(self):
@@ -177,9 +182,10 @@ class TestModelSaving(test.TestCase):
       model.compile(loss='mse', optimizer='sgd', metrics=['acc'])
       model.model._make_train_function()
 
-      _, fname = tempfile.mkstemp('.h5')
+      fd, fname = tempfile.mkstemp('.h5')
       keras.models.save_model(model, fname)
       model = keras.models.load_model(fname)
+      os.close(fd)
       os.remove(fname)
 
   def test_saving_lambda_numpy_array_arguments(self):
@@ -194,10 +200,11 @@ class TestModelSaving(test.TestCase):
     model = keras.models.Model(inputs, output)
     model.compile(loss='mse', optimizer='sgd', metrics=['acc'])
 
-    _, fname = tempfile.mkstemp('.h5')
+    fd, fname = tempfile.mkstemp('.h5')
     keras.models.save_model(model, fname)
 
     model = keras.models.load_model(fname)
+    os.close(fd)
     os.remove(fname)
 
     self.assertAllClose(mean, model.layers[1].arguments['mu'])
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/image.py b/tensorflow/python/keras/_impl/keras/preprocessing/image.py
index 12dc718cd791d0a5829c4809474a83783ed561f9..82441de5925cac0d66af95202c613b3e5e9aeb79 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/image.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/image.py
@@ -556,7 +556,8 @@ class ImageDataGenerator(object):
                           save_to_dir=None,
                           save_prefix='',
                           save_format='png',
-                          follow_links=False):
+                          follow_links=False,
+                          interpolation='nearest'):
     return DirectoryIterator(
         directory,
         self,
@@ -571,7 +572,8 @@ class ImageDataGenerator(object):
         save_to_dir=save_to_dir,
         save_prefix=save_prefix,
         save_format=save_format,
-        follow_links=follow_links)
+        follow_links=follow_links,
+        interpolation=interpolation)
 
   def standardize(self, x):
     """Apply the normalization configuration to a batch of inputs.
@@ -596,7 +598,7 @@ class ImageDataGenerator(object):
         x -= self.mean
       else:
         logging.warning('This ImageDataGenerator specifies '
-                        '`featurewise_center`, but it hasn\'t'
+                        '`featurewise_center`, but it hasn\'t '
                         'been fit on any training data. Fit it '
                         'first by calling `.fit(numpy_data)`.')
     if self.featurewise_std_normalization:
@@ -604,7 +606,7 @@ class ImageDataGenerator(object):
         x /= (self.std + 1e-7)
       else:
         logging.warning('This ImageDataGenerator specifies '
-                        '`featurewise_std_normalization`, but it hasn\'t'
+                        '`featurewise_std_normalization`, but it hasn\'t '
                         'been fit on any training data. Fit it '
                         'first by calling `.fit(numpy_data)`.')
     if self.zca_whitening:
@@ -614,7 +616,7 @@ class ImageDataGenerator(object):
         x = np.reshape(whitex, x.shape)
       else:
         logging.warning('This ImageDataGenerator specifies '
-                        '`zca_whitening`, but it hasn\'t'
+                        '`zca_whitening`, but it hasn\'t '
                         'been fit on any training data. Fit it '
                         'first by calling `.fit(numpy_data)`.')
     return x
@@ -833,8 +835,7 @@ class Iterator(Sequence):
     return self._get_batches_of_transformed_samples(index_array)
 
   def __len__(self):
-    length = int(np.ceil(self.n / float(self.batch_size)))
-    return np.maximum(length, 0)
+    return (self.n + self.batch_size - 1) // self.batch_size  # round up
 
   def on_epoch_end(self):
     self._set_index_array()
@@ -1091,6 +1092,12 @@ class DirectoryIterator(Iterator):
           images (if `save_to_dir` is set).
       save_format: Format to use for saving sample images
           (if `save_to_dir` is set).
+      interpolation: Interpolation method used to resample the image if the
+          target size is different from that of the loaded image.
+          Supported methods are "nearest", "bilinear", and "bicubic".
+          If PIL version 1.1.3 or newer is installed, "lanczos" is also
+          supported. If PIL version 3.4.0 or newer is installed, "box" and
+          "hamming" are also supported. By default, "nearest" is used.
   """
 
   def __init__(self,
@@ -1107,7 +1114,8 @@ class DirectoryIterator(Iterator):
                save_to_dir=None,
                save_prefix='',
                save_format='png',
-               follow_links=False):
+               follow_links=False,
+               interpolation='nearest'):
     if data_format is None:
       data_format = K.image_data_format()
     self.directory = directory
@@ -1138,6 +1146,7 @@ class DirectoryIterator(Iterator):
     self.save_to_dir = save_to_dir
     self.save_prefix = save_prefix
     self.save_format = save_format
+    self.interpolation = interpolation
 
     white_list_formats = {'png', 'jpg', 'jpeg', 'bmp', 'ppm'}
 
@@ -1192,7 +1201,8 @@ class DirectoryIterator(Iterator):
       fname = self.filenames[j]
       img = load_img(os.path.join(self.directory, fname),
                      grayscale=grayscale,
-                     target_size=self.target_size)
+                     target_size=self.target_size,
+                     interpolation=self.interpolation)
       x = img_to_array(img, data_format=self.data_format)
       x = self.image_data_generator.random_transform(x)
       x = self.image_data_generator.standardize(x)
diff --git a/tensorflow/python/keras/_impl/keras/utils/data_utils.py b/tensorflow/python/keras/_impl/keras/utils/data_utils.py
index 1f2e9ac44076582c7aea083203b13fddaa597474..df76e6712a478a14f4461b73901dfda53ca5a099 100644
--- a/tensorflow/python/keras/_impl/keras/utils/data_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/data_utils.py
@@ -28,6 +28,7 @@ import sys
 import tarfile
 import threading
 import time
+import traceback
 import zipfile
 
 import numpy as np
@@ -560,9 +561,9 @@ class OrderedEnqueuer(SequenceEnqueuer):
         self.queue.task_done()
         if inputs is not None:
           yield inputs
-    except Exception as e:
+    except Exception as e:  # pylint: disable=broad-except
       self.stop()
-      raise StopIteration(e)
+      six.raise_from(StopIteration(e), e)
 
   def _send_sequence(self):
     """Send current Sequence to all workers."""
@@ -623,6 +624,7 @@ class GeneratorEnqueuer(SequenceEnqueuer):
     self._use_multiprocessing = use_multiprocessing
     self._threads = []
     self._stop_event = None
+    self._manager = None
     self.queue = None
     self.seed = seed
 
@@ -640,18 +642,27 @@ class GeneratorEnqueuer(SequenceEnqueuer):
         try:
           if self._use_multiprocessing or self.queue.qsize() < max_queue_size:
             generator_output = next(self._generator)
-            self.queue.put(generator_output)
+            self.queue.put((True, generator_output))
           else:
             time.sleep(self.wait_time)
         except StopIteration:
           break
-        except Exception:
+        except Exception as e:  # pylint: disable=broad-except
+          # Can't pick tracebacks.
+          # As a compromise, print the traceback and pickle None instead.
+          if self._use_multiprocessing:
+            traceback.print_exc()
+            setattr(e, '__traceback__', None)
+          elif not hasattr(e, '__traceback__'):
+            setattr(e, '__traceback__', sys.exc_info()[2])
+          self.queue.put((False, e))
           self._stop_event.set()
-          raise
+          break
 
     try:
       if self._use_multiprocessing:
-        self.queue = multiprocessing.Queue(maxsize=max_queue_size)
+        self._manager = multiprocessing.Manager()
+        self.queue = self._manager.Queue(maxsize=max_queue_size)
         self._stop_event = multiprocessing.Event()
       else:
         self.queue = queue.Queue()
@@ -695,9 +706,8 @@ class GeneratorEnqueuer(SequenceEnqueuer):
         else:
           thread.join(timeout)
 
-    if self._use_multiprocessing:
-      if self.queue is not None:
-        self.queue.close()
+    if self._manager:
+      self._manager.shutdown()
 
     self._threads = []
     self._stop_event = None
@@ -713,12 +723,23 @@ class GeneratorEnqueuer(SequenceEnqueuer):
     """
     while self.is_running():
       if not self.queue.empty():
-        inputs = self.queue.get()
-        if inputs is not None:
-          yield inputs
+        success, value = self.queue.get()
+        # Rethrow any exceptions found in the queue
+        if not success:
+          six.reraise(value.__class__, value, value.__traceback__)
+        # Yield regular values
+        if value is not None:
+          yield value
       else:
         all_finished = all([not thread.is_alive() for thread in self._threads])
         if all_finished and self.queue.empty():
           raise StopIteration()
         else:
           time.sleep(self.wait_time)
+
+      # Make sure to rethrow the first exception in the queue, if any
+    while not self.queue.empty():
+      success, value = self.queue.get()
+      if not success:
+        six.reraise(value.__class__, value, value.__traceback__)
+
diff --git a/tensorflow/python/keras/_impl/keras/utils/data_utils_test.py b/tensorflow/python/keras/_impl/keras/utils/data_utils_test.py
index 14b2f084423327cda8211fce53b3386a3e5635f2..d541cccbe51485381f07d220a88b916494813a30 100644
--- a/tensorflow/python/keras/_impl/keras/utils/data_utils_test.py
+++ b/tensorflow/python/keras/_impl/keras/utils/data_utils_test.py
@@ -22,6 +22,7 @@ from itertools import cycle
 import os
 import tarfile
 import threading
+import unittest
 import zipfile
 
 import numpy as np
@@ -164,6 +165,9 @@ class TestEnqueuers(test.TestCase):
     self.assertEqual(len(set(acc) - set(range(100))), 0)
     enqueuer.stop()
 
+  @unittest.skipIf(
+      os.name == 'nt',
+      'use_multiprocessing=True does not work on windows properly.')
   def test_generator_enqueuer_processes(self):
     enqueuer = keras.utils.data_utils.GeneratorEnqueuer(
         create_generator_from_sequence_pcs(TestSequence([3, 200, 200, 3])),
@@ -182,16 +186,19 @@ class TestEnqueuers(test.TestCase):
         use_multiprocessing=False)
     enqueuer.start(3, 10)
     gen_output = enqueuer.get()
-    with self.assertRaises(StopIteration):
+    with self.assertRaises(IndexError):
       next(gen_output)
 
+  @unittest.skipIf(
+      os.name == 'nt',
+      'use_multiprocessing=True does not work on windows properly.')
   def test_generator_enqueuer_fail_processes(self):
     enqueuer = keras.utils.data_utils.GeneratorEnqueuer(
         create_generator_from_sequence_pcs(FaultSequence()),
         use_multiprocessing=True)
     enqueuer.start(3, 10)
     gen_output = enqueuer.get()
-    with self.assertRaises(StopIteration):
+    with self.assertRaises(IndexError):
       next(gen_output)
 
   def test_ordered_enqueuer_threads(self):
diff --git a/tensorflow/python/keras/_impl/keras/utils/generic_utils.py b/tensorflow/python/keras/_impl/keras/utils/generic_utils.py
index 025e5d30a597c560804293b12b0bd063764c87fe..e9e54c2a2a713423b77e8279740f0338263206eb 100644
--- a/tensorflow/python/keras/_impl/keras/utils/generic_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/generic_utils.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import codecs
 import marshal
 import os
 import sys
@@ -197,10 +198,11 @@ def func_dump(func):
       A tuple `(code, defaults, closure)`.
   """
   if os.name == 'nt':
-    code = marshal.dumps(
-        func.__code__).replace(b'\\', b'/').decode('raw_unicode_escape')
+    raw_code = marshal.dumps(func.__code__).replace(b'\\', b'/')
+    code = codecs.encode(raw_code, 'base64').decode('ascii')
   else:
-    code = marshal.dumps(func.__code__).decode('raw_unicode_escape')
+    raw_code = marshal.dumps(func.__code__)
+    code = codecs.encode(raw_code, 'base64').decode('ascii')
   defaults = func.__defaults__
   if func.__closure__:
     closure = tuple(c.cell_contents for c in func.__closure__)
@@ -225,7 +227,30 @@ def func_load(code, defaults=None, closure=None, globs=None):
     code, defaults, closure = code
     if isinstance(defaults, list):
       defaults = tuple(defaults)
-  code = marshal.loads(code.encode('raw_unicode_escape'))
+
+  def ensure_value_to_cell(value):
+    """Ensures that a value is converted to a python cell object.
+
+    Arguments:
+        value: Any value that needs to be casted to the cell type
+
+    Returns:
+        A value wrapped as a cell object (see function "func_load")
+    """
+    def dummy_fn():
+      # pylint: disable=pointless-statement
+      value  # just access it so it gets captured in .__closure__
+
+    cell_value = dummy_fn.__closure__[0]
+    if not isinstance(value, type(cell_value)):
+      return cell_value
+    else:
+      return value
+
+  if closure is not None:
+    closure = tuple(ensure_value_to_cell(_) for _ in closure)
+  raw_code = codecs.decode(code.encode('ascii'), 'base64')
+  code = marshal.loads(raw_code)
   if globs is None:
     globs = globals()
   return python_types.FunctionType(
diff --git a/tensorflow/python/keras/_impl/keras/utils/io_utils.py b/tensorflow/python/keras/_impl/keras/utils/io_utils.py
index 1c8299c27d2cf00fa9402fc770ee4742a0bdc242..a8fc18c17aee58fa406c3057cc98844d9687a9ba 100644
--- a/tensorflow/python/keras/_impl/keras/utils/io_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/io_utils.py
@@ -63,11 +63,11 @@ class HDF5Matrix(object):
                         'HDF5 and h5py installed.')
 
     if datapath not in list(self.refs.keys()):
-      f = h5py.File(datapath)
-      self.refs[datapath] = f
+      self._f = h5py.File(datapath)
+      self.refs[datapath] = self._f
     else:
-      f = self.refs[datapath]
-    self.data = f[dataset]
+      self._f = self.refs[datapath]
+    self.data = self._f[dataset]
     self.start = start
     if end is None:
       self.end = self.data.shape[0]
@@ -78,6 +78,9 @@ class HDF5Matrix(object):
   def __len__(self):
     return self.end - self.start
 
+  def __del__(self):
+    self._f.close()
+
   def __getitem__(self, key):
     if isinstance(key, slice):
       start, stop = key.start, key.stop
diff --git a/tensorflow/python/keras/_impl/keras/utils/np_utils.py b/tensorflow/python/keras/_impl/keras/utils/np_utils.py
index 896016d4d8bb48192e32ab094f7b7a0e6799921c..67d83bf42c4387be6e5ba578663ecf02ade054c8 100644
--- a/tensorflow/python/keras/_impl/keras/utils/np_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/np_utils.py
@@ -35,7 +35,7 @@ def to_categorical(y, num_classes=None):
   """
   y = np.array(y, dtype='int')
   input_shape = y.shape
-  if input_shape and input_shape[-1] == 1:
+  if input_shape and input_shape[-1] == 1 and len(input_shape) > 1:
     input_shape = tuple(input_shape[:-1])
   y = y.ravel()
   if not num_classes:
diff --git a/tensorflow/python/keras/_impl/keras/utils/np_utils_test.py b/tensorflow/python/keras/_impl/keras/utils/np_utils_test.py
index 9680c295cd31c40114726a919d4e327c07ddd240..1e974c2ef2aee3b6a83ad777673505f8c75b2b58 100644
--- a/tensorflow/python/keras/_impl/keras/utils/np_utils_test.py
+++ b/tensorflow/python/keras/_impl/keras/utils/np_utils_test.py
@@ -28,8 +28,9 @@ class TestNPUtils(test.TestCase):
 
   def test_to_categorical(self):
     num_classes = 5
-    shapes = [(3,), (4, 3), (5, 4, 3), (3, 1), (3, 2, 1)]
-    expected_shapes = [(3, num_classes),
+    shapes = [(1,), (3,), (4, 3), (5, 4, 3), (3, 1), (3, 2, 1)]
+    expected_shapes = [(1, num_classes),
+                       (3, num_classes),
                        (4, 3, num_classes),
                        (5, 4, 3, num_classes),
                        (3, num_classes)]
diff --git a/tensorflow/python/keras/_impl/keras/utils/training_utils.py b/tensorflow/python/keras/_impl/keras/utils/training_utils.py
index 8939c814cf3f9c6fa2f2af79e71919c6666e5561..0bf4ac8a24d3011e05f2db101cd02931e0b65849 100644
--- a/tensorflow/python/keras/_impl/keras/utils/training_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/training_utils.py
@@ -112,12 +112,22 @@ def multi_gpu_model(model, gpus):
   from tensorflow.python.keras._impl.keras.layers.core import Lambda
   from tensorflow.python.keras._impl.keras.layers.merge import concatenate
 
-  if gpus <= 1:
-    raise ValueError('For multi-gpu usage to be effective, '
-                     'call `multi_gpu_model` with `gpus >= 2`. '
-                     'Received: `gpus=%d`' % gpus)
-
-  target_devices = ['/cpu:0'] + ['/gpu:%d' % i for i in range(gpus)]
+  if isinstance(gpus, (list, tuple)):
+    if len(gpus) <= 1:
+      raise ValueError('For multi-gpu usage to be effective, '
+                       'call `multi_gpu_model` with `len(gpus) >= 2`. '
+                       'Received: `gpus=%s`' % gpus)
+    num_gpus = len(gpus)
+    target_gpu_ids = gpus
+  else:
+    if gpus <= 1:
+      raise ValueError('For multi-gpu usage to be effective, '
+                       'call `multi_gpu_model` with `gpus >= 2`. '
+                       'Received: `gpus=%d`' % gpus)
+    num_gpus = gpus
+    target_gpu_ids = range(num_gpus)
+
+  target_devices = ['/cpu:0'] + ['/gpu:%d' % i for i in target_gpu_ids]
   available_devices = _get_available_devices()
   available_devices = [
       _normalize_device_name(name) for name in available_devices
@@ -145,7 +155,7 @@ def multi_gpu_model(model, gpus):
     batch_size = shape[:1]
     input_shape = shape[1:]
     step = batch_size // parts
-    if i == gpus - 1:
+    if i == num_gpus - 1:
       size = batch_size - step * i
     else:
       size = step
@@ -160,9 +170,9 @@ def multi_gpu_model(model, gpus):
 
   # Place a copy of the model on each GPU,
   # each getting a slice of the inputs.
-  for i in range(gpus):
-    with ops.device('/gpu:%d' % i):
-      with ops.name_scope('replica_%d' % i):
+  for i, gpu_id in enumerate(target_gpu_ids):
+    with ops.device('/gpu:%d' % gpu_id):
+      with ops.name_scope('replica_%d' % gpu_id):
         inputs = []
         # Retrieve a slice of the input.
         for x in model.inputs:
@@ -172,8 +182,9 @@ def multi_gpu_model(model, gpus):
               output_shape=input_shape,
               arguments={
                   'i': i,
-                  'parts': gpus
-              })(x)
+                  'parts': num_gpus
+              })(
+                  x)
           inputs.append(slice_i)
 
         # Apply model on slice
@@ -189,6 +200,7 @@ def multi_gpu_model(model, gpus):
   # Merge outputs on CPU.
   with ops.device('/cpu:0'):
     merged = []
-    for outputs in all_outputs:
-      merged.append(concatenate(outputs, axis=0))
+    for name, outputs in zip(model.output_names, all_outputs):
+      merged.append(concatenate(outputs, axis=0, name=name))
     return Model(model.inputs, merged)
+
diff --git a/tensorflow/python/keras/_impl/keras/utils/training_utils_test.py b/tensorflow/python/keras/_impl/keras/utils/training_utils_test.py
index 51fbd041a4943b1837c5f725a06c0c08fb9cb216..12354c49ca72cddc0f395bcfcfabab18c1189227 100644
--- a/tensorflow/python/keras/_impl/keras/utils/training_utils_test.py
+++ b/tensorflow/python/keras/_impl/keras/utils/training_utils_test.py
@@ -33,6 +33,7 @@ class TestMultiGPUModel(test.TestCase):
     output_dim = 1
     hidden_dim = 10
     epochs = 2
+    target_gpu_id = [0, 2, 4]
 
     with self.test_session():
       model = keras.models.Sequential()
@@ -42,8 +43,12 @@ class TestMultiGPUModel(test.TestCase):
 
       x = np.random.random((num_samples, input_dim))
       y = np.random.random((num_samples, output_dim))
+
       parallel_model = keras.utils.multi_gpu_model(model, gpus=gpus)
+      parallel_model.compile(loss='mse', optimizer='rmsprop')
+      parallel_model.fit(x, y, epochs=epochs)
 
+      parallel_model = keras.utils.multi_gpu_model(model, gpus=target_gpu_id)
       parallel_model.compile(loss='mse', optimizer='rmsprop')
       parallel_model.fit(x, y, epochs=epochs)
 
@@ -56,6 +61,7 @@ class TestMultiGPUModel(test.TestCase):
     output_dim_b = 2
     hidden_dim = 10
     epochs = 2
+    target_gpu_id = [0, 2, 4]
 
     with self.test_session():
       input_a = keras.Input((input_dim_a,))
@@ -76,6 +82,10 @@ class TestMultiGPUModel(test.TestCase):
       parallel_model.compile(loss='mse', optimizer='rmsprop')
       parallel_model.fit([a_x, b_x], [a_y, b_y], epochs=epochs)
 
+      parallel_model = keras.utils.multi_gpu_model(model, gpus=target_gpu_id)
+      parallel_model.compile(loss='mse', optimizer='rmsprop')
+      parallel_model.fit([a_x, b_x], [a_y, b_y], epochs=epochs)
+
   def multi_gpu_test_invalid_devices(self):
     with self.test_session():
       input_shape = (1000, 10)
@@ -92,3 +102,16 @@ class TestMultiGPUModel(test.TestCase):
         parallel_model = keras.utils.multi_gpu_model(
             model, gpus=len(keras.backend._get_available_gpus()) + 1)
         parallel_model.fit(x, y, epochs=2)
+
+      with self.assertRaises(ValueError):
+        parallel_model = keras.utils.multi_gpu_model(
+            model, gpus=[0, 2, 4, 6, 8])
+        parallel_model.fit(x, y, epochs=2)
+
+      with self.assertRaises(ValueError):
+        parallel_model = keras.utils.multi_gpu_model(model, gpus=1)
+        parallel_model.fit(x, y, epochs=2)
+
+      with self.assertRaises(ValueError):
+        parallel_model = keras.utils.multi_gpu_model(model, gpus=[0])
+        parallel_model.fit(x, y, epochs=2)
diff --git a/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py b/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py
index 31ef4773ad6481264aea09c72f955a5a6ef8a11d..bc788d874f663caefd46d56fbf715a802fe08ec1 100644
--- a/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py
+++ b/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py
@@ -38,18 +38,18 @@ class BaseWrapper(object):
       build_fn: callable function or class instance
       **sk_params: model parameters & fitting parameters
 
-  The build_fn should construct, compile and return a Keras model, which
+  The `build_fn` should construct, compile and return a Keras model, which
   will then be used to fit/predict. One of the following
-  three values could be passed to build_fn:
+  three values could be passed to `build_fn`:
   1. A function
-  2. An instance of a class that implements the __call__ method
+  2. An instance of a class that implements the `__call__` method
   3. None. This means you implement a class that inherits from either
-  `KerasClassifier` or `KerasRegressor`. The __call__ method of the
-  present class will then be treated as the default build_fn.
+  `KerasClassifier` or `KerasRegressor`. The `__call__` method of the
+  present class will then be treated as the default `build_fn`.
 
   `sk_params` takes both model parameters and fitting parameters. Legal model
   parameters are the arguments of `build_fn`. Note that like all other
-  estimators in scikit-learn, 'build_fn' should provide default values for
+  estimators in scikit-learn, `build_fn` should provide default values for
   its arguments, so that you could create the estimator without passing any
   values to `sk_params`.
 
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 4fffdfda7d082cf254ceb37d0113f6e14ab40fa3..640edb26bcf9e7fc2686aee448cff04d94b6254b 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -294,6 +294,19 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "decode_compressed_op_test",
+    size = "small",
+    srcs = ["decode_compressed_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:parsing_ops",
+    ],
+)
+
 cuda_py_test(
     name = "determinant_op_test",
     size = "small",
@@ -483,6 +496,7 @@ tf_py_test(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
     ],
+    grpc_enabled = True,
 )
 
 tf_py_test(
@@ -676,6 +690,7 @@ cuda_py_test(
         "//tensorflow/python:gradients",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:variables",
+        "//tensorflow/python:resource_variable_ops",
     ],
     tags = ["noasan"],  # http://b/32635055
 )
@@ -1230,7 +1245,9 @@ cuda_py_test(
 
 cuda_py_test(
     name = "control_flow_ops_py_test",
-    size = "small",
+    # TOOD(b/70473603): change this back to "small" once the C API is
+    # permanently enabled
+    size = "medium",
     srcs = ["control_flow_ops_py_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -1264,6 +1281,19 @@ cuda_py_test(
     ],
 )
 
+tf_py_test(
+    name = "control_flow_util_test",
+    size = "small",
+    srcs = ["control_flow_util_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:control_flow_ops_gen",
+        "//tensorflow/python:control_flow_util",
+        "//tensorflow/python:test_ops",
+    ],
+)
+
 cuda_py_test(
     name = "conv1d_test",
     size = "small",
@@ -1370,7 +1400,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "dynamic_partition_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["dynamic_partition_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -1427,6 +1457,7 @@ cuda_py_test(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
     ],
+    grpc_enabled = True,
     tags = ["no_windows"],
 )
 
@@ -1631,6 +1662,8 @@ cuda_py_test(
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:script_ops",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:function",
     ],
     tags = ["no_windows"],
 )
@@ -2055,7 +2088,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "transpose_op_test",
-    size = "medium",
+    size = "large",
     srcs = ["transpose_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -2063,6 +2096,11 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    shard_count = 2,
+    tags = [
+        "no_gpu",
+        "no_oss",
+    ],
 )
 
 cuda_py_test(
@@ -2338,6 +2376,7 @@ cuda_py_test(
         "//tensorflow/python:rnn_cell",
         "//tensorflow/python:sparse_grad",
         "//tensorflow/python:tensor_array_grad",
+        "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:context",
     ],
@@ -2775,104 +2814,6 @@ tf_py_test(
     ],
 )
 
-tf_py_test(
-    name = "batch_dataset_op_test",
-    size = "small",
-    srcs = ["batch_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:string_ops",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "dataset_constructor_op_test",
-    size = "small",
-    srcs = ["dataset_constructor_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:session",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-    ],
-    tags = [
-        "manual",
-        "nomac",  # b/62040583
-    ],
-)
-
-tf_py_test(
-    name = "dataset_from_generator_op_test",
-    size = "small",
-    srcs = ["dataset_from_generator_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:session",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-    ],
-)
-
-tf_py_test(
-    name = "filter_dataset_op_test",
-    size = "small",
-    srcs = ["filter_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:functional_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "flat_map_dataset_op_test",
-    size = "small",
-    srcs = ["flat_map_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:session",
-        "//tensorflow/python:sparse_ops",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:training",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
 tf_py_test(
     name = "garbage_collection_test",
     size = "small",
@@ -2887,259 +2828,6 @@ tf_py_test(
     ],
 )
 
-tf_py_test(
-    name = "list_files_dataset_op_test",
-    size = "small",
-    srcs = ["list_files_dataset_op_test.py"],
-    additional_deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "interleave_dataset_op_test",
-    size = "small",
-    srcs = ["interleave_dataset_op_test.py"],
-    additional_deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:session",
-        "//tensorflow/python:sparse_ops",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:training",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "map_dataset_op_test",
-    size = "small",
-    srcs = ["map_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:data_flow_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:functional_ops",
-        "//tensorflow/python:lookup_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:script_ops",
-        "//tensorflow/python:sparse_ops",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:string_ops",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "prefetch_dataset_op_test",
-    size = "small",
-    srcs = ["prefetch_dataset_op_test.py"],
-    additional_deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "range_dataset_op_test",
-    size = "small",
-    srcs = ["range_dataset_op_test.py"],
-    additional_deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:io_ops",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-    ],
-)
-
-tf_py_test(
-    name = "reader_dataset_ops_test",
-    size = "small",
-    srcs = ["reader_dataset_ops_test.py"],
-    additional_deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:io_ops",
-        "//tensorflow/python:lib",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/ops:iterator_ops",
-        "//tensorflow/python/data/ops:readers",
-    ],
-)
-
-tf_py_test(
-    name = "sequence_dataset_op_test",
-    size = "small",
-    srcs = ["sequence_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "shuffle_dataset_op_test",
-    size = "small",
-    srcs = ["shuffle_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-    ],
-)
-
-tf_py_test(
-    name = "shard_dataset_op_test",
-    size = "small",
-    srcs = ["shard_dataset_op_test.py"],
-    additional_deps = [
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "cache_dataset_op_test",
-    size = "small",
-    srcs = ["cache_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-    ],
-)
-
-tf_py_test(
-    name = "zip_dataset_op_test",
-    size = "small",
-    srcs = ["zip_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "concatenate_dataset_op_test",
-    size = "small",
-    srcs = ["concatenate_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-    ],
-)
-
-tf_py_test(
-    name = "iterator_ops_test",
-    size = "small",
-    srcs = ["iterator_ops_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python/data/ops:readers",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:function",
-        "//tensorflow/python:functional_ops",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:io_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:script_ops",
-        "//tensorflow/python:session",
-        "//tensorflow/python:training",
-    ],
-)
-
-tf_py_test(
-    name = "iterator_ops_cluster_test",
-    size = "small",
-    srcs = ["iterator_ops_cluster_test.py"],
-    additional_deps = [
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:function",
-        "//tensorflow/python:functional_ops",
-        "//tensorflow/python:session",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-    ],
-    tags = [
-        "no_oss",  # Test flaky due to port collisions.
-        "no_windows",
-    ],
-)
-
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index 76b80e60ea19162a46a2dd64f7fae63dd27f0d5a..17492e9255ca9f8cdae65a9acab33ed9156de10c 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -33,10 +33,13 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test as test_lib
 
@@ -114,21 +117,21 @@ class BooleanMaskTest(test_util.TensorFlowTestCase):
     arr = np.random.rand(*arr_shape)
     mask = make_mask(arr_shape[:ndims_mask])
     if axis is not None:
-      mask = make_mask(arr_shape[axis:ndims_mask+axis])
+      mask = make_mask(arr_shape[axis:ndims_mask + axis])
     if axis is None or axis == 0:
       masked_arr = arr[mask]
     elif axis == 1:
-      masked_arr = arr[:,mask]
+      masked_arr = arr[:, mask]
     elif axis == 2:
-      masked_arr = arr[:,:,mask]
-    with self.test_session() as sess:
+      masked_arr = arr[:, :, mask]
+    with self.test_session():
       masked_tensor = array_ops.boolean_mask(arr, mask, axis=axis)
 
       # Leading dimension size of masked_tensor is always unknown until runtime
       # since we don't how many elements will be kept.
       leading = 1 if axis is None else axis + 1
       self.assertAllEqual(masked_tensor.get_shape()[leading:],
-          masked_arr.shape[leading:])
+                          masked_arr.shape[leading:])
 
       self.assertAllClose(masked_arr, masked_tensor.eval())
 
@@ -1078,6 +1081,7 @@ class PadTest(test_util.TensorFlowTestCase):
                            [0, 0, 4, 5, 6, 0, 0],
                            [0, 0, 0, 0, 0, 0, 0]])
 
+
 class InvertPermutationTest(test_util.TensorFlowTestCase):
 
   def testInvertPermutation(self):
@@ -1089,5 +1093,47 @@ class InvertPermutationTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(y.eval(), [2, 4, 3, 0, 1])
 
 
+class GuaranteeConstOpTest(test_util.TensorFlowTestCase):
+
+  def testSimple(self):
+    with self.test_session():
+      a = array_ops.constant(10)
+      guarantee_a = array_ops.guarantee_const(a)
+      self.assertEqual(10, guarantee_a.eval())
+
+  def testVariables(self):
+    with self.test_session() as sess:
+      for use_resource in [False, True]:
+        a = variable_scope.get_variable(
+            "var_{}".format(use_resource), [],
+            initializer=init_ops.constant_initializer(10.0),
+            use_resource=use_resource)
+        guarantee_a = array_ops.guarantee_const(a)
+        sess.run(variables.global_variables_initializer())
+        self.assertEqual(10.0, guarantee_a.eval())
+
+  def testResourceRejection(self):
+    with self.test_session() as sess:
+      a = variable_scope.get_variable(
+          "resource_var", [],
+          initializer=init_ops.constant_initializer(10.0),
+          use_resource=True)
+      guarantee_a = array_ops.guarantee_const(a.handle)
+      sess.run(variables.global_variables_initializer())
+      with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
+                                               "cannot be a resource variable"):
+        guarantee_a.eval()
+
+
+class SnapshotOpTest(test_util.TensorFlowTestCase):
+
+  def testInvertPermutation(self):
+    for dtype in [dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64]:
+      with self.test_session(use_gpu=True):
+        x = constant_op.constant([0, 1, 2, 3], dtype=dtype)
+        y = gen_array_ops._snapshot(x)
+        self.assertAllEqual(y.eval(), [0, 1, 2, 3])
+
+
 if __name__ == "__main__":
   test_lib.main()
diff --git a/tensorflow/python/kernel_tests/bincount_op_test.py b/tensorflow/python/kernel_tests/bincount_op_test.py
index 79285476b4489f033cf252177302b5f14ec5003d..2767df127e324fe54fb1b6d068e75588d4209f98 100644
--- a/tensorflow/python/kernel_tests/bincount_op_test.py
+++ b/tensorflow/python/kernel_tests/bincount_op_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import googletest
 
+
 class BincountTest(test_util.TensorFlowTestCase):
 
   def test_empty(self):
@@ -72,8 +73,7 @@ class BincountTest(test_util.TensorFlowTestCase):
         else:
           weights = np.random.random(num_samples)
         self.assertAllClose(
-            math_ops.bincount(arr, weights).eval(),
-            np.bincount(arr, weights))
+            math_ops.bincount(arr, weights).eval(), np.bincount(arr, weights))
 
   def test_random_without_weights(self):
     num_samples = 10000
@@ -83,8 +83,7 @@ class BincountTest(test_util.TensorFlowTestCase):
         arr = np.random.randint(0, 1000, num_samples)
         weights = np.ones(num_samples).astype(dtype)
         self.assertAllClose(
-            math_ops.bincount(arr, None).eval(),
-            np.bincount(arr, weights))
+            math_ops.bincount(arr, None).eval(), np.bincount(arr, weights))
 
   def test_zero_weights(self):
     with self.test_session(use_gpu=True):
diff --git a/tensorflow/python/kernel_tests/cast_op_test.py b/tensorflow/python/kernel_tests/cast_op_test.py
index c785f2358d5e659c71acf02457e2146616a9e880..214d5cb3c064dc4b046d09959eaa1d770bcabc3d 100644
--- a/tensorflow/python/kernel_tests/cast_op_test.py
+++ b/tensorflow/python/kernel_tests/cast_op_test.py
@@ -144,9 +144,9 @@ class CastOpTest(test.TestCase):
 
     self._compare(np.inf, np.float32, np.inf, False)
     self._compare(np.inf, np.float64, np.inf, False)
-    if sys.byteorder == "big":  
-      self._compare(np.inf, np.int32, i4.max, False)  
-      self._compare(np.inf, np.int64, i8.max, False)  
+    if sys.byteorder == "big":
+      self._compare(np.inf, np.int32, i4.max, False)
+      self._compare(np.inf, np.int64, i8.max, False)
     else:
       # np.float64("np.inf").astype(np.int32) is negative on x86 but positive on ppc64le
       # Numpy link to relevant discussion - https://github.com/numpy/numpy/issues/9040
@@ -156,7 +156,7 @@ class CastOpTest(test.TestCase):
         self._compare(-np.inf, np.int64, i8.min, False)
       else:
         self._compare(np.inf, np.int32, i4.min, False)
-        self._compare(np.inf, np.int64, i8.min, False)  
+        self._compare(np.inf, np.int64, i8.min, False)
     self._compare(-np.inf, np.float32, -np.inf, False)
     self._compare(-np.inf, np.float64, -np.inf, False)
     self._compare(-np.inf, np.int32, i4.min, False)
diff --git a/tensorflow/python/kernel_tests/check_ops_test.py b/tensorflow/python/kernel_tests/check_ops_test.py
index 43785adceeccfbeef5cb80af3499425520f3d874..7ce0f1e7b8a4df7c8c3acb36c0d46f60cbf0f703 100644
--- a/tensorflow/python/kernel_tests/check_ops_test.py
+++ b/tensorflow/python/kernel_tests/check_ops_test.py
@@ -34,38 +34,45 @@ from tensorflow.python.platform import test
 
 class AssertProperIterableTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_single_tensor_raises(self):
     tensor = constant_op.constant(1)
     with self.assertRaisesRegexp(TypeError, "proper"):
       check_ops.assert_proper_iterable(tensor)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_single_sparse_tensor_raises(self):
     ten = sparse_tensor.SparseTensor(
         indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])
     with self.assertRaisesRegexp(TypeError, "proper"):
       check_ops.assert_proper_iterable(ten)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_single_ndarray_raises(self):
     array = np.array([1, 2, 3])
     with self.assertRaisesRegexp(TypeError, "proper"):
       check_ops.assert_proper_iterable(array)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_single_string_raises(self):
     mystr = "hello"
     with self.assertRaisesRegexp(TypeError, "proper"):
       check_ops.assert_proper_iterable(mystr)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_non_iterable_object_raises(self):
     non_iterable = 1234
     with self.assertRaisesRegexp(TypeError, "to be iterable"):
       check_ops.assert_proper_iterable(non_iterable)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_list_does_not_raise(self):
     list_of_stuff = [
         constant_op.constant([11, 22]), constant_op.constant([1, 2])
     ]
     check_ops.assert_proper_iterable(list_of_stuff)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_generator_does_not_raise(self):
     generator_of_stuff = (constant_op.constant([11, 22]), constant_op.constant(
         [1, 2]))
@@ -333,265 +340,283 @@ class AssertLessTest(test.TestCase):
 
 class AssertLessEqualTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_equal(self):
-    with self.test_session():
-      small = constant_op.constant([1, 2], name="small")
-      with ops.control_dependencies(
-          [check_ops.assert_less_equal(small, small)]):
-        out = array_ops.identity(small)
-      out.eval()
+    small = constant_op.constant([1, 2], name="small")
+    with ops.control_dependencies(
+        [check_ops.assert_less_equal(small, small)]):
+      out = array_ops.identity(small)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_greater(self):
-    with self.test_session():
-      small = constant_op.constant([1, 2], name="small")
-      big = constant_op.constant([3, 4], name="big")
+    small = constant_op.constant([1, 2], name="small")
+    big = constant_op.constant([3, 4], name="big")
+    with self.assertRaisesOpError("fail"):
       with ops.control_dependencies(
           [check_ops.assert_less_equal(
               big, small, message="fail")]):
         out = array_ops.identity(small)
-      with self.assertRaisesOpError("fail.*big.*small"):
-        out.eval()
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_less_equal(self):
-    with self.test_session():
-      small = constant_op.constant([1, 2], name="small")
-      big = constant_op.constant([3, 2], name="big")
-      with ops.control_dependencies([check_ops.assert_less_equal(small, big)]):
-        out = array_ops.identity(small)
-      out.eval()
+    small = constant_op.constant([1, 2], name="small")
+    big = constant_op.constant([3, 2], name="big")
+    with ops.control_dependencies([check_ops.assert_less_equal(small, big)]):
+      out = array_ops.identity(small)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_less_equal_and_broadcastable_shapes(self):
-    with self.test_session():
-      small = constant_op.constant([1], name="small")
-      big = constant_op.constant([3, 1], name="big")
-      with ops.control_dependencies([check_ops.assert_less_equal(small, big)]):
-        out = array_ops.identity(small)
-      out.eval()
+    small = constant_op.constant([1], name="small")
+    big = constant_op.constant([3, 1], name="big")
+    with ops.control_dependencies([check_ops.assert_less_equal(small, big)]):
+      out = array_ops.identity(small)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_less_equal_but_non_broadcastable_shapes(self):
-    with self.test_session():
-      small = constant_op.constant([1, 1, 1], name="small")
-      big = constant_op.constant([3, 1], name="big")
-      with self.assertRaisesRegexp(ValueError, "must be"):
-        with ops.control_dependencies(
-            [check_ops.assert_less_equal(small, big)]):
-          out = array_ops.identity(small)
-        out.eval()
+    small = constant_op.constant([3, 1], name="small")
+    big = constant_op.constant([1, 1, 1], name="big")
+    # The exception in eager and non-eager mode is different because
+    # eager mode relies on shape check done as part of the C++ op, while
+    # graph mode does shape checks when creating the `Operation` instance.
+    with self.assertRaisesRegexp(
+        (errors.InvalidArgumentError, ValueError),
+        (r"Incompatible shapes: \[2\] vs. \[3\]|"
+         r"Dimensions must be equal, but are 2 and 3")):
+      with ops.control_dependencies(
+          [check_ops.assert_less_equal(small, big)]):
+        out = array_ops.identity(small)
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_both_empty(self):
-    with self.test_session():
-      larry = constant_op.constant([])
-      curly = constant_op.constant([])
-      with ops.control_dependencies(
-          [check_ops.assert_less_equal(larry, curly)]):
-        out = array_ops.identity(larry)
-      out.eval()
+    larry = constant_op.constant([])
+    curly = constant_op.constant([])
+    with ops.control_dependencies(
+        [check_ops.assert_less_equal(larry, curly)]):
+      out = array_ops.identity(larry)
+    self.evaluate(out)
 
 
 class AssertGreaterTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_equal(self):
-    with self.test_session():
-      small = constant_op.constant([1, 2], name="small")
+    small = constant_op.constant([1, 2], name="small")
+    with self.assertRaisesOpError("fail"):
       with ops.control_dependencies(
           [check_ops.assert_greater(
               small, small, message="fail")]):
         out = array_ops.identity(small)
-      with self.assertRaisesOpError("fail.*small.*small"):
-        out.eval()
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_less(self):
-    with self.test_session():
-      small = constant_op.constant([1, 2], name="small")
-      big = constant_op.constant([3, 4], name="big")
+    small = constant_op.constant([1, 2], name="small")
+    big = constant_op.constant([3, 4], name="big")
+    with self.assertRaisesOpError("x > y did not hold"):
       with ops.control_dependencies([check_ops.assert_greater(small, big)]):
         out = array_ops.identity(big)
-      with self.assertRaisesOpError("small.*big"):
-        out.eval()
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_greater(self):
-    with self.test_session():
-      small = constant_op.constant([3, 1], name="small")
-      big = constant_op.constant([4, 2], name="big")
-      with ops.control_dependencies([check_ops.assert_greater(big, small)]):
-        out = array_ops.identity(small)
-      out.eval()
+    small = constant_op.constant([3, 1], name="small")
+    big = constant_op.constant([4, 2], name="big")
+    with ops.control_dependencies([check_ops.assert_greater(big, small)]):
+      out = array_ops.identity(small)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_greater_and_broadcastable_shapes(self):
-    with self.test_session():
-      small = constant_op.constant([1], name="small")
-      big = constant_op.constant([3, 2], name="big")
-      with ops.control_dependencies([check_ops.assert_greater(big, small)]):
-        out = array_ops.identity(small)
-      out.eval()
+    small = constant_op.constant([1], name="small")
+    big = constant_op.constant([3, 2], name="big")
+    with ops.control_dependencies([check_ops.assert_greater(big, small)]):
+      out = array_ops.identity(small)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_greater_but_non_broadcastable_shapes(self):
-    with self.test_session():
-      small = constant_op.constant([1, 1, 1], name="small")
-      big = constant_op.constant([3, 2], name="big")
-      with self.assertRaisesRegexp(ValueError, "must be"):
-        with ops.control_dependencies([check_ops.assert_greater(big, small)]):
-          out = array_ops.identity(small)
-        out.eval()
+    small = constant_op.constant([1, 1, 1], name="small")
+    big = constant_op.constant([3, 2], name="big")
+    # The exception in eager and non-eager mode is different because
+    # eager mode relies on shape check done as part of the C++ op, while
+    # graph mode does shape checks when creating the `Operation` instance.
+    with self.assertRaisesRegexp(
+        (errors.InvalidArgumentError, ValueError),
+        (r"Incompatible shapes: \[2\] vs. \[3\]|"
+         r"Dimensions must be equal, but are 2 and 3")):
+      with ops.control_dependencies([check_ops.assert_greater(big, small)]):
+        out = array_ops.identity(small)
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_both_empty(self):
-    with self.test_session():
-      larry = constant_op.constant([])
-      curly = constant_op.constant([])
-      with ops.control_dependencies([check_ops.assert_greater(larry, curly)]):
-        out = array_ops.identity(larry)
-      out.eval()
+    larry = constant_op.constant([])
+    curly = constant_op.constant([])
+    with ops.control_dependencies([check_ops.assert_greater(larry, curly)]):
+      out = array_ops.identity(larry)
+    self.evaluate(out)
 
 
 class AssertGreaterEqualTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_equal(self):
-    with self.test_session():
-      small = constant_op.constant([1, 2], name="small")
-      with ops.control_dependencies(
-          [check_ops.assert_greater_equal(small, small)]):
-        out = array_ops.identity(small)
-      out.eval()
+    small = constant_op.constant([1, 2], name="small")
+    with ops.control_dependencies(
+        [check_ops.assert_greater_equal(small, small)]):
+      out = array_ops.identity(small)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_less(self):
-    with self.test_session():
-      small = constant_op.constant([1, 2], name="small")
-      big = constant_op.constant([3, 4], name="big")
+    small = constant_op.constant([1, 2], name="small")
+    big = constant_op.constant([3, 4], name="big")
+    with self.assertRaisesOpError("fail"):
       with ops.control_dependencies(
           [check_ops.assert_greater_equal(
               small, big, message="fail")]):
         out = array_ops.identity(small)
-      with self.assertRaisesOpError("fail.*small.*big"):
-        out.eval()
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_greater_equal(self):
-    with self.test_session():
-      small = constant_op.constant([1, 2], name="small")
-      big = constant_op.constant([3, 2], name="big")
-      with ops.control_dependencies(
-          [check_ops.assert_greater_equal(big, small)]):
-        out = array_ops.identity(small)
-      out.eval()
+    small = constant_op.constant([1, 2], name="small")
+    big = constant_op.constant([3, 2], name="big")
+    with ops.control_dependencies(
+        [check_ops.assert_greater_equal(big, small)]):
+      out = array_ops.identity(small)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_greater_equal_and_broadcastable_shapes(self):
-    with self.test_session():
-      small = constant_op.constant([1], name="small")
-      big = constant_op.constant([3, 1], name="big")
+    small = constant_op.constant([1], name="small")
+    big = constant_op.constant([3, 1], name="big")
+    with ops.control_dependencies(
+        [check_ops.assert_greater_equal(big, small)]):
+      out = array_ops.identity(small)
+    self.evaluate(out)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_raises_when_less_equal_but_non_broadcastable_shapes(self):
+    small = constant_op.constant([1, 1, 1], name="big")
+    big = constant_op.constant([3, 1], name="small")
+    # The exception in eager and non-eager mode is different because
+    # eager mode relies on shape check done as part of the C++ op, while
+    # graph mode does shape checks when creating the `Operation` instance.
+    with self.assertRaisesRegexp(
+        (errors.InvalidArgumentError, ValueError),
+        (r"Incompatible shapes: \[2\] vs. \[3\]|"
+         r"Dimensions must be equal, but are 2 and 3")):
       with ops.control_dependencies(
           [check_ops.assert_greater_equal(big, small)]):
         out = array_ops.identity(small)
-      out.eval()
-
-  def test_raises_when_less_equal_but_non_broadcastable_shapes(self):
-    with self.test_session():
-      small = constant_op.constant([1, 1, 1], name="big")
-      big = constant_op.constant([3, 1], name="small")
-      with self.assertRaisesRegexp(ValueError, "Dimensions must be equal"):
-        with ops.control_dependencies(
-            [check_ops.assert_greater_equal(big, small)]):
-          out = array_ops.identity(small)
-        out.eval()
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_both_empty(self):
-    with self.test_session():
-      larry = constant_op.constant([])
-      curly = constant_op.constant([])
-      with ops.control_dependencies(
-          [check_ops.assert_greater_equal(larry, curly)]):
-        out = array_ops.identity(larry)
-      out.eval()
+    larry = constant_op.constant([])
+    curly = constant_op.constant([])
+    with ops.control_dependencies(
+        [check_ops.assert_greater_equal(larry, curly)]):
+      out = array_ops.identity(larry)
+    self.evaluate(out)
 
 
 class AssertNegativeTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_negative(self):
-    with self.test_session():
-      frank = constant_op.constant([-1, -2], name="frank")
-      with ops.control_dependencies([check_ops.assert_negative(frank)]):
-        out = array_ops.identity(frank)
-      out.eval()
+    frank = constant_op.constant([-1, -2], name="frank")
+    with ops.control_dependencies([check_ops.assert_negative(frank)]):
+      out = array_ops.identity(frank)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_positive(self):
-    with self.test_session():
-      doug = constant_op.constant([1, 2], name="doug")
+    doug = constant_op.constant([1, 2], name="doug")
+    with self.assertRaisesOpError("fail"):
       with ops.control_dependencies(
           [check_ops.assert_negative(
               doug, message="fail")]):
         out = array_ops.identity(doug)
-      with self.assertRaisesOpError("fail.*doug"):
-        out.eval()
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_zero(self):
-    with self.test_session():
-      claire = constant_op.constant([0], name="claire")
+    claire = constant_op.constant([0], name="claire")
+    with self.assertRaisesOpError("x < 0 did not hold"):
       with ops.control_dependencies([check_ops.assert_negative(claire)]):
         out = array_ops.identity(claire)
-      with self.assertRaisesOpError("claire"):
-        out.eval()
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_empty_tensor_doesnt_raise(self):
     # A tensor is negative when it satisfies:
     #   For every element x_i in x, x_i < 0
     # and an empty tensor has no elements, so this is trivially satisfied.
     # This is standard set theory.
-    with self.test_session():
-      empty = constant_op.constant([], name="empty")
-      with ops.control_dependencies([check_ops.assert_negative(empty)]):
-        out = array_ops.identity(empty)
-      out.eval()
+    empty = constant_op.constant([], name="empty")
+    with ops.control_dependencies([check_ops.assert_negative(empty)]):
+      out = array_ops.identity(empty)
+    self.evaluate(out)
 
 
 class AssertPositiveTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_negative(self):
-    with self.test_session():
-      freddie = constant_op.constant([-1, -2], name="freddie")
+    freddie = constant_op.constant([-1, -2], name="freddie")
+    with self.assertRaisesOpError("fail"):
       with ops.control_dependencies(
           [check_ops.assert_positive(
               freddie, message="fail")]):
         out = array_ops.identity(freddie)
-      with self.assertRaisesOpError("fail.*freddie"):
-        out.eval()
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_positive(self):
-    with self.test_session():
-      remmy = constant_op.constant([1, 2], name="remmy")
-      with ops.control_dependencies([check_ops.assert_positive(remmy)]):
-        out = array_ops.identity(remmy)
-      out.eval()
+    remmy = constant_op.constant([1, 2], name="remmy")
+    with ops.control_dependencies([check_ops.assert_positive(remmy)]):
+      out = array_ops.identity(remmy)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_zero(self):
-    with self.test_session():
-      meechum = constant_op.constant([0], name="meechum")
+    meechum = constant_op.constant([0], name="meechum")
+    with self.assertRaisesOpError("x > 0 did not hold"):
       with ops.control_dependencies([check_ops.assert_positive(meechum)]):
         out = array_ops.identity(meechum)
-      with self.assertRaisesOpError("meechum"):
-        out.eval()
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_empty_tensor_doesnt_raise(self):
     # A tensor is positive when it satisfies:
     #   For every element x_i in x, x_i > 0
     # and an empty tensor has no elements, so this is trivially satisfied.
     # This is standard set theory.
-    with self.test_session():
-      empty = constant_op.constant([], name="empty")
-      with ops.control_dependencies([check_ops.assert_positive(empty)]):
-        out = array_ops.identity(empty)
-      out.eval()
+    empty = constant_op.constant([], name="empty")
+    with ops.control_dependencies([check_ops.assert_positive(empty)]):
+      out = array_ops.identity(empty)
+    self.evaluate(out)
 
 
 class AssertRankTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_zero_tensor_raises_if_rank_too_small_static_rank(self):
-    with self.test_session():
-      tensor = constant_op.constant(1, name="my_tensor")
-      desired_rank = 1
-      with self.assertRaisesRegexp(ValueError,
-                                   "fail.*my_tensor.*must have rank 1"):
-        with ops.control_dependencies(
-            [check_ops.assert_rank(
-                tensor, desired_rank, message="fail")]):
-          array_ops.identity(tensor).eval()
+    tensor = constant_op.constant(1, name="my_tensor")
+    desired_rank = 1
+    with self.assertRaisesRegexp(ValueError,
+                                 "fail.*must have rank 1"):
+      with ops.control_dependencies(
+          [check_ops.assert_rank(
+              tensor, desired_rank, message="fail")]):
+        self.evaluate(array_ops.identity(tensor))
 
   def test_rank_zero_tensor_raises_if_rank_too_small_dynamic_rank(self):
     with self.test_session():
@@ -603,13 +628,13 @@ class AssertRankTest(test.TestCase):
         with self.assertRaisesOpError("fail.*my_tensor.*rank"):
           array_ops.identity(tensor).eval(feed_dict={tensor: 0})
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_zero_tensor_doesnt_raise_if_rank_just_right_static_rank(self):
-    with self.test_session():
-      tensor = constant_op.constant(1, name="my_tensor")
-      desired_rank = 0
-      with ops.control_dependencies(
-          [check_ops.assert_rank(tensor, desired_rank)]):
-        array_ops.identity(tensor).eval()
+    tensor = constant_op.constant(1, name="my_tensor")
+    desired_rank = 0
+    with ops.control_dependencies(
+        [check_ops.assert_rank(tensor, desired_rank)]):
+      self.evaluate(array_ops.identity(tensor))
 
   def test_rank_zero_tensor_doesnt_raise_if_rank_just_right_dynamic_rank(self):
     with self.test_session():
@@ -619,14 +644,14 @@ class AssertRankTest(test.TestCase):
           [check_ops.assert_rank(tensor, desired_rank)]):
         array_ops.identity(tensor).eval(feed_dict={tensor: 0})
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_one_tensor_raises_if_rank_too_large_static_rank(self):
-    with self.test_session():
-      tensor = constant_op.constant([1, 2], name="my_tensor")
-      desired_rank = 0
-      with self.assertRaisesRegexp(ValueError, "my_tensor.*rank"):
-        with ops.control_dependencies(
-            [check_ops.assert_rank(tensor, desired_rank)]):
-          array_ops.identity(tensor).eval()
+    tensor = constant_op.constant([1, 2], name="my_tensor")
+    desired_rank = 0
+    with self.assertRaisesRegexp(ValueError, "rank"):
+      with ops.control_dependencies(
+          [check_ops.assert_rank(tensor, desired_rank)]):
+        self.evaluate(array_ops.identity(tensor))
 
   def test_rank_one_tensor_raises_if_rank_too_large_dynamic_rank(self):
     with self.test_session():
@@ -637,13 +662,13 @@ class AssertRankTest(test.TestCase):
         with self.assertRaisesOpError("my_tensor.*rank"):
           array_ops.identity(tensor).eval(feed_dict={tensor: [1, 2]})
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_one_tensor_doesnt_raise_if_rank_just_right_static_rank(self):
-    with self.test_session():
-      tensor = constant_op.constant([1, 2], name="my_tensor")
-      desired_rank = 1
-      with ops.control_dependencies(
-          [check_ops.assert_rank(tensor, desired_rank)]):
-        array_ops.identity(tensor).eval()
+    tensor = constant_op.constant([1, 2], name="my_tensor")
+    desired_rank = 1
+    with ops.control_dependencies(
+        [check_ops.assert_rank(tensor, desired_rank)]):
+      self.evaluate(array_ops.identity(tensor))
 
   def test_rank_one_tensor_doesnt_raise_if_rank_just_right_dynamic_rank(self):
     with self.test_session():
@@ -653,14 +678,14 @@ class AssertRankTest(test.TestCase):
           [check_ops.assert_rank(tensor, desired_rank)]):
         array_ops.identity(tensor).eval(feed_dict={tensor: [1, 2]})
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_one_tensor_raises_if_rank_too_small_static_rank(self):
-    with self.test_session():
-      tensor = constant_op.constant([1, 2], name="my_tensor")
-      desired_rank = 2
-      with self.assertRaisesRegexp(ValueError, "my_tensor.*rank"):
-        with ops.control_dependencies(
-            [check_ops.assert_rank(tensor, desired_rank)]):
-          array_ops.identity(tensor).eval()
+    tensor = constant_op.constant([1, 2], name="my_tensor")
+    desired_rank = 2
+    with self.assertRaisesRegexp(ValueError, "rank"):
+      with ops.control_dependencies(
+          [check_ops.assert_rank(tensor, desired_rank)]):
+        self.evaluate(array_ops.identity(tensor))
 
   def test_rank_one_tensor_raises_if_rank_too_small_dynamic_rank(self):
     with self.test_session():
@@ -671,11 +696,11 @@ class AssertRankTest(test.TestCase):
         with self.assertRaisesOpError("my_tensor.*rank"):
           array_ops.identity(tensor).eval(feed_dict={tensor: [1, 2]})
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_if_rank_is_not_scalar_static(self):
-    with self.test_session():
-      tensor = constant_op.constant([1, 2], name="my_tensor")
-      with self.assertRaisesRegexp(ValueError, "Rank must be a scalar"):
-        check_ops.assert_rank(tensor, np.array([], dtype=np.int32))
+    tensor = constant_op.constant([1, 2], name="my_tensor")
+    with self.assertRaisesRegexp(ValueError, "Rank must be a scalar"):
+      check_ops.assert_rank(tensor, np.array([], dtype=np.int32))
 
   def test_raises_if_rank_is_not_scalar_dynamic(self):
     with self.test_session():
@@ -687,12 +712,12 @@ class AssertRankTest(test.TestCase):
             [check_ops.assert_rank(tensor, rank_tensor)]):
           array_ops.identity(tensor).eval(feed_dict={rank_tensor: [1, 2]})
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_if_rank_is_not_integer_static(self):
-    with self.test_session():
-      tensor = constant_op.constant([1, 2], name="my_tensor")
-      with self.assertRaisesRegexp(TypeError,
-                                   "must be of type <dtype: 'int32'>"):
-        check_ops.assert_rank(tensor, .5)
+    tensor = constant_op.constant([1, 2], name="my_tensor")
+    with self.assertRaisesRegexp(TypeError,
+                                 "must be of type <dtype: 'int32'>"):
+      check_ops.assert_rank(tensor, .5)
 
   def test_raises_if_rank_is_not_integer_dynamic(self):
     with self.test_session():
@@ -708,14 +733,14 @@ class AssertRankTest(test.TestCase):
 
 class AssertRankInTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_zero_tensor_raises_if_rank_mismatch_static_rank(self):
-    with self.test_session():
-      tensor_rank0 = constant_op.constant(42, name="my_tensor")
-      with self.assertRaisesRegexp(
-          ValueError, "fail.*my_tensor.*must have rank.*in.*1.*2"):
-        with ops.control_dependencies([
-            check_ops.assert_rank_in(tensor_rank0, (1, 2), message="fail")]):
-          array_ops.identity(tensor_rank0).eval()
+    tensor_rank0 = constant_op.constant(42, name="my_tensor")
+    with self.assertRaisesRegexp(
+        ValueError, "fail.*must have rank.*in.*1.*2"):
+      with ops.control_dependencies([
+          check_ops.assert_rank_in(tensor_rank0, (1, 2), message="fail")]):
+        self.evaluate(array_ops.identity(tensor_rank0))
 
   def test_rank_zero_tensor_raises_if_rank_mismatch_dynamic_rank(self):
     with self.test_session():
@@ -725,13 +750,13 @@ class AssertRankInTest(test.TestCase):
         with self.assertRaisesOpError("fail.*my_tensor.*rank"):
           array_ops.identity(tensor_rank0).eval(feed_dict={tensor_rank0: 42.0})
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_zero_tensor_doesnt_raise_if_rank_matches_static_rank(self):
-    with self.test_session():
-      tensor_rank0 = constant_op.constant(42, name="my_tensor")
-      for desired_ranks in ((0, 1, 2), (1, 0, 2), (1, 2, 0)):
-        with ops.control_dependencies([
-            check_ops.assert_rank_in(tensor_rank0, desired_ranks)]):
-          array_ops.identity(tensor_rank0).eval()
+    tensor_rank0 = constant_op.constant(42, name="my_tensor")
+    for desired_ranks in ((0, 1, 2), (1, 0, 2), (1, 2, 0)):
+      with ops.control_dependencies([
+          check_ops.assert_rank_in(tensor_rank0, desired_ranks)]):
+        self.evaluate(array_ops.identity(tensor_rank0))
 
   def test_rank_zero_tensor_doesnt_raise_if_rank_matches_dynamic_rank(self):
     with self.test_session():
@@ -741,13 +766,13 @@ class AssertRankInTest(test.TestCase):
             check_ops.assert_rank_in(tensor_rank0, desired_ranks)]):
           array_ops.identity(tensor_rank0).eval(feed_dict={tensor_rank0: 42.0})
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_one_tensor_doesnt_raise_if_rank_matches_static_rank(self):
-    with self.test_session():
-      tensor_rank1 = constant_op.constant([42, 43], name="my_tensor")
-      for desired_ranks in ((0, 1, 2), (1, 0, 2), (1, 2, 0)):
-        with ops.control_dependencies([
-            check_ops.assert_rank_in(tensor_rank1, desired_ranks)]):
-          array_ops.identity(tensor_rank1).eval()
+    tensor_rank1 = constant_op.constant([42, 43], name="my_tensor")
+    for desired_ranks in ((0, 1, 2), (1, 0, 2), (1, 2, 0)):
+      with ops.control_dependencies([
+          check_ops.assert_rank_in(tensor_rank1, desired_ranks)]):
+        self.evaluate(array_ops.identity(tensor_rank1))
 
   def test_rank_one_tensor_doesnt_raise_if_rank_matches_dynamic_rank(self):
     with self.test_session():
@@ -759,13 +784,13 @@ class AssertRankInTest(test.TestCase):
               tensor_rank1: (42.0, 43.0)
           })
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_one_tensor_raises_if_rank_mismatches_static_rank(self):
-    with self.test_session():
-      tensor_rank1 = constant_op.constant((42, 43), name="my_tensor")
-      with self.assertRaisesRegexp(ValueError, "my_tensor.*rank"):
-        with ops.control_dependencies([
-            check_ops.assert_rank_in(tensor_rank1, (0, 2))]):
-          array_ops.identity(tensor_rank1).eval()
+    tensor_rank1 = constant_op.constant((42, 43), name="my_tensor")
+    with self.assertRaisesRegexp(ValueError, "rank"):
+      with ops.control_dependencies([
+          check_ops.assert_rank_in(tensor_rank1, (0, 2))]):
+        self.evaluate(array_ops.identity(tensor_rank1))
 
   def test_rank_one_tensor_raises_if_rank_mismatches_dynamic_rank(self):
     with self.test_session():
@@ -777,14 +802,14 @@ class AssertRankInTest(test.TestCase):
               tensor_rank1: (42.0, 43.0)
           })
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_if_rank_is_not_scalar_static(self):
-    with self.test_session():
-      tensor = constant_op.constant((42, 43), name="my_tensor")
-      desired_ranks = (
-          np.array(1, dtype=np.int32),
-          np.array((2, 1), dtype=np.int32))
-      with self.assertRaisesRegexp(ValueError, "Rank must be a scalar"):
-        check_ops.assert_rank_in(tensor, desired_ranks)
+    tensor = constant_op.constant((42, 43), name="my_tensor")
+    desired_ranks = (
+        np.array(1, dtype=np.int32),
+        np.array((2, 1), dtype=np.int32))
+    with self.assertRaisesRegexp(ValueError, "Rank must be a scalar"):
+      check_ops.assert_rank_in(tensor, desired_ranks)
 
   def test_raises_if_rank_is_not_scalar_dynamic(self):
     with self.test_session():
@@ -801,12 +826,12 @@ class AssertRankInTest(test.TestCase):
               desired_ranks[1]: [2, 1],
           })
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_if_rank_is_not_integer_static(self):
-    with self.test_session():
-      tensor = constant_op.constant((42, 43), name="my_tensor")
-      with self.assertRaisesRegexp(TypeError,
-                                   "must be of type <dtype: 'int32'>"):
-        check_ops.assert_rank_in(tensor, (1, .5,))
+    tensor = constant_op.constant((42, 43), name="my_tensor")
+    with self.assertRaisesRegexp(TypeError,
+                                 "must be of type <dtype: 'int32'>"):
+      check_ops.assert_rank_in(tensor, (1, .5,))
 
   def test_raises_if_rank_is_not_integer_dynamic(self):
     with self.test_session():
@@ -822,14 +847,14 @@ class AssertRankInTest(test.TestCase):
 
 class AssertRankAtLeastTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_zero_tensor_raises_if_rank_too_small_static_rank(self):
-    with self.test_session():
-      tensor = constant_op.constant(1, name="my_tensor")
-      desired_rank = 1
-      with self.assertRaisesRegexp(ValueError, "my_tensor.*rank at least 1"):
-        with ops.control_dependencies(
-            [check_ops.assert_rank_at_least(tensor, desired_rank)]):
-          array_ops.identity(tensor).eval()
+    tensor = constant_op.constant(1, name="my_tensor")
+    desired_rank = 1
+    with self.assertRaisesRegexp(ValueError, "rank at least 1"):
+      with ops.control_dependencies(
+          [check_ops.assert_rank_at_least(tensor, desired_rank)]):
+        self.evaluate(array_ops.identity(tensor))
 
   def test_rank_zero_tensor_raises_if_rank_too_small_dynamic_rank(self):
     with self.test_session():
@@ -840,13 +865,13 @@ class AssertRankAtLeastTest(test.TestCase):
         with self.assertRaisesOpError("my_tensor.*rank"):
           array_ops.identity(tensor).eval(feed_dict={tensor: 0})
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_zero_tensor_doesnt_raise_if_rank_just_right_static_rank(self):
-    with self.test_session():
-      tensor = constant_op.constant(1, name="my_tensor")
-      desired_rank = 0
-      with ops.control_dependencies(
-          [check_ops.assert_rank_at_least(tensor, desired_rank)]):
-        array_ops.identity(tensor).eval()
+    tensor = constant_op.constant(1, name="my_tensor")
+    desired_rank = 0
+    with ops.control_dependencies(
+        [check_ops.assert_rank_at_least(tensor, desired_rank)]):
+      self.evaluate(array_ops.identity(tensor))
 
   def test_rank_zero_tensor_doesnt_raise_if_rank_just_right_dynamic_rank(self):
     with self.test_session():
@@ -856,13 +881,13 @@ class AssertRankAtLeastTest(test.TestCase):
           [check_ops.assert_rank_at_least(tensor, desired_rank)]):
         array_ops.identity(tensor).eval(feed_dict={tensor: 0})
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_one_ten_doesnt_raise_raise_if_rank_too_large_static_rank(self):
-    with self.test_session():
-      tensor = constant_op.constant([1, 2], name="my_tensor")
-      desired_rank = 0
-      with ops.control_dependencies(
-          [check_ops.assert_rank_at_least(tensor, desired_rank)]):
-        array_ops.identity(tensor).eval()
+    tensor = constant_op.constant([1, 2], name="my_tensor")
+    desired_rank = 0
+    with ops.control_dependencies(
+        [check_ops.assert_rank_at_least(tensor, desired_rank)]):
+      self.evaluate(array_ops.identity(tensor))
 
   def test_rank_one_ten_doesnt_raise_if_rank_too_large_dynamic_rank(self):
     with self.test_session():
@@ -872,13 +897,13 @@ class AssertRankAtLeastTest(test.TestCase):
           [check_ops.assert_rank_at_least(tensor, desired_rank)]):
         array_ops.identity(tensor).eval(feed_dict={tensor: [1, 2]})
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_one_tensor_doesnt_raise_if_rank_just_right_static_rank(self):
-    with self.test_session():
-      tensor = constant_op.constant([1, 2], name="my_tensor")
-      desired_rank = 1
-      with ops.control_dependencies(
-          [check_ops.assert_rank_at_least(tensor, desired_rank)]):
-        array_ops.identity(tensor).eval()
+    tensor = constant_op.constant([1, 2], name="my_tensor")
+    desired_rank = 1
+    with ops.control_dependencies(
+        [check_ops.assert_rank_at_least(tensor, desired_rank)]):
+      self.evaluate(array_ops.identity(tensor))
 
   def test_rank_one_tensor_doesnt_raise_if_rank_just_right_dynamic_rank(self):
     with self.test_session():
@@ -888,14 +913,14 @@ class AssertRankAtLeastTest(test.TestCase):
           [check_ops.assert_rank_at_least(tensor, desired_rank)]):
         array_ops.identity(tensor).eval(feed_dict={tensor: [1, 2]})
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_one_tensor_raises_if_rank_too_small_static_rank(self):
-    with self.test_session():
-      tensor = constant_op.constant([1, 2], name="my_tensor")
-      desired_rank = 2
-      with self.assertRaisesRegexp(ValueError, "my_tensor.*rank"):
-        with ops.control_dependencies(
-            [check_ops.assert_rank_at_least(tensor, desired_rank)]):
-          array_ops.identity(tensor).eval()
+    tensor = constant_op.constant([1, 2], name="my_tensor")
+    desired_rank = 2
+    with self.assertRaisesRegexp(ValueError, "rank at least 2"):
+      with ops.control_dependencies(
+          [check_ops.assert_rank_at_least(tensor, desired_rank)]):
+        self.evaluate(array_ops.identity(tensor))
 
   def test_rank_one_tensor_raises_if_rank_too_small_dynamic_rank(self):
     with self.test_session():
@@ -909,144 +934,165 @@ class AssertRankAtLeastTest(test.TestCase):
 
 class AssertNonNegativeTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_negative(self):
-    with self.test_session():
-      zoe = constant_op.constant([-1, -2], name="zoe")
+    zoe = constant_op.constant([-1, -2], name="zoe")
+    with self.assertRaisesOpError("x >= 0 did not hold"):
       with ops.control_dependencies([check_ops.assert_non_negative(zoe)]):
         out = array_ops.identity(zoe)
-      with self.assertRaisesOpError("zoe"):
-        out.eval()
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_zero_and_positive(self):
-    with self.test_session():
-      lucas = constant_op.constant([0, 2], name="lucas")
-      with ops.control_dependencies([check_ops.assert_non_negative(lucas)]):
-        out = array_ops.identity(lucas)
-      out.eval()
+    lucas = constant_op.constant([0, 2], name="lucas")
+    with ops.control_dependencies([check_ops.assert_non_negative(lucas)]):
+      out = array_ops.identity(lucas)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_empty_tensor_doesnt_raise(self):
     # A tensor is non-negative when it satisfies:
     #   For every element x_i in x, x_i >= 0
     # and an empty tensor has no elements, so this is trivially satisfied.
     # This is standard set theory.
-    with self.test_session():
-      empty = constant_op.constant([], name="empty")
-      with ops.control_dependencies([check_ops.assert_non_negative(empty)]):
-        out = array_ops.identity(empty)
-      out.eval()
+    empty = constant_op.constant([], name="empty")
+    with ops.control_dependencies([check_ops.assert_non_negative(empty)]):
+      out = array_ops.identity(empty)
+    self.evaluate(out)
 
 
 class AssertNonPositiveTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_zero_and_negative(self):
-    with self.test_session():
-      tom = constant_op.constant([0, -2], name="tom")
-      with ops.control_dependencies([check_ops.assert_non_positive(tom)]):
-        out = array_ops.identity(tom)
-      out.eval()
+    tom = constant_op.constant([0, -2], name="tom")
+    with ops.control_dependencies([check_ops.assert_non_positive(tom)]):
+      out = array_ops.identity(tom)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_positive(self):
-    with self.test_session():
-      rachel = constant_op.constant([0, 2], name="rachel")
+    rachel = constant_op.constant([0, 2], name="rachel")
+    with self.assertRaisesOpError("x <= 0 did not hold"):
       with ops.control_dependencies([check_ops.assert_non_positive(rachel)]):
         out = array_ops.identity(rachel)
-      with self.assertRaisesOpError("rachel"):
-        out.eval()
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_empty_tensor_doesnt_raise(self):
     # A tensor is non-positive when it satisfies:
     #   For every element x_i in x, x_i <= 0
     # and an empty tensor has no elements, so this is trivially satisfied.
     # This is standard set theory.
-    with self.test_session():
-      empty = constant_op.constant([], name="empty")
-      with ops.control_dependencies([check_ops.assert_non_positive(empty)]):
-        out = array_ops.identity(empty)
-      out.eval()
+    empty = constant_op.constant([], name="empty")
+    with ops.control_dependencies([check_ops.assert_non_positive(empty)]):
+      out = array_ops.identity(empty)
+    self.evaluate(out)
 
 
 class AssertIntegerTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_integer(self):
-    with self.test_session():
-      integers = constant_op.constant([1, 2], name="integers")
-      with ops.control_dependencies([check_ops.assert_integer(integers)]):
-        out = array_ops.identity(integers)
-      out.eval()
+    integers = constant_op.constant([1, 2], name="integers")
+    with ops.control_dependencies([check_ops.assert_integer(integers)]):
+      out = array_ops.identity(integers)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_float(self):
-    with self.test_session():
-      floats = constant_op.constant([1.0, 2.0], name="floats")
-      with self.assertRaisesRegexp(TypeError, "Expected.*integer"):
-        check_ops.assert_integer(floats)
+    floats = constant_op.constant([1.0, 2.0], name="floats")
+    with self.assertRaisesRegexp(TypeError, "Expected.*integer"):
+      check_ops.assert_integer(floats)
+
+
+class AssertTypeTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_doesnt_raise_when_correct_type(self):
+    integers = constant_op.constant([1, 2], dtype=dtypes.int64)
+    with ops.control_dependencies([
+        check_ops.assert_type(integers, dtypes.int64)]):
+      out = array_ops.identity(integers)
+    self.evaluate(out)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_raises_when_wrong_type(self):
+    floats = constant_op.constant([1.0, 2.0], dtype=dtypes.float16)
+    with self.assertRaisesRegexp(TypeError, "must be of type.*float32"):
+      check_ops.assert_type(floats, dtypes.float32)
 
 
 class IsStrictlyIncreasingTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_constant_tensor_is_not_strictly_increasing(self):
-    with self.test_session():
-      self.assertFalse(check_ops.is_strictly_increasing([1, 1, 1]).eval())
+    self.assertFalse(self.evaluate(check_ops.is_strictly_increasing([1, 1, 1])))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_decreasing_tensor_is_not_strictly_increasing(self):
-    with self.test_session():
-      self.assertFalse(check_ops.is_strictly_increasing([1, 0, -1]).eval())
+    self.assertFalse(self.evaluate(
+        check_ops.is_strictly_increasing([1, 0, -1])))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_2d_decreasing_tensor_is_not_strictly_increasing(self):
-    with self.test_session():
-      self.assertFalse(
-          check_ops.is_strictly_increasing([[1, 3], [2, 4]]).eval())
+    self.assertFalse(
+        self.evaluate(check_ops.is_strictly_increasing([[1, 3], [2, 4]])))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_increasing_tensor_is_increasing(self):
-    with self.test_session():
-      self.assertTrue(check_ops.is_strictly_increasing([1, 2, 3]).eval())
+    self.assertTrue(self.evaluate(check_ops.is_strictly_increasing([1, 2, 3])))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_increasing_rank_two_tensor(self):
-    with self.test_session():
-      self.assertTrue(
-          check_ops.is_strictly_increasing([[-1, 2], [3, 4]]).eval())
+    self.assertTrue(
+        self.evaluate(check_ops.is_strictly_increasing([[-1, 2], [3, 4]])))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_tensor_with_one_element_is_strictly_increasing(self):
-    with self.test_session():
-      self.assertTrue(check_ops.is_strictly_increasing([1]).eval())
+    self.assertTrue(self.evaluate(check_ops.is_strictly_increasing([1])))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_empty_tensor_is_strictly_increasing(self):
-    with self.test_session():
-      self.assertTrue(check_ops.is_strictly_increasing([]).eval())
+    self.assertTrue(self.evaluate(check_ops.is_strictly_increasing([])))
 
 
 class IsNonDecreasingTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_constant_tensor_is_non_decreasing(self):
-    with self.test_session():
-      self.assertTrue(check_ops.is_non_decreasing([1, 1, 1]).eval())
+    self.assertTrue(self.evaluate(check_ops.is_non_decreasing([1, 1, 1])))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_decreasing_tensor_is_not_non_decreasing(self):
-    with self.test_session():
-      self.assertFalse(check_ops.is_non_decreasing([3, 2, 1]).eval())
+    self.assertFalse(self.evaluate(check_ops.is_non_decreasing([3, 2, 1])))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_2d_decreasing_tensor_is_not_non_decreasing(self):
-    with self.test_session():
-      self.assertFalse(check_ops.is_non_decreasing([[1, 3], [2, 4]]).eval())
+    self.assertFalse(self.evaluate(
+        check_ops.is_non_decreasing([[1, 3], [2, 4]])))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_increasing_rank_one_tensor_is_non_decreasing(self):
-    with self.test_session():
-      self.assertTrue(check_ops.is_non_decreasing([1, 2, 3]).eval())
+    self.assertTrue(self.evaluate(check_ops.is_non_decreasing([1, 2, 3])))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_increasing_rank_two_tensor(self):
-    with self.test_session():
-      self.assertTrue(check_ops.is_non_decreasing([[-1, 2], [3, 3]]).eval())
+    self.assertTrue(self.evaluate(
+        check_ops.is_non_decreasing([[-1, 2], [3, 3]])))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_tensor_with_one_element_is_non_decreasing(self):
-    with self.test_session():
-      self.assertTrue(check_ops.is_non_decreasing([1]).eval())
+    self.assertTrue(self.evaluate(check_ops.is_non_decreasing([1])))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_empty_tensor_is_non_decreasing(self):
-    with self.test_session():
-      self.assertTrue(check_ops.is_non_decreasing([]).eval())
+    self.assertTrue(self.evaluate(check_ops.is_non_decreasing([])))
 
 
 class FloatDTypeTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_assert_same_float_dtype(self):
     self.assertIs(dtypes.float32,
                   check_ops.assert_same_float_dtype(None, None))
@@ -1100,6 +1146,7 @@ class FloatDTypeTest(test.TestCase):
 
 class AssertScalarTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_assert_scalar(self):
     check_ops.assert_scalar(constant_op.constant(3))
     check_ops.assert_scalar(constant_op.constant("foo"))
diff --git a/tensorflow/python/kernel_tests/constant_op_eager_test.py b/tensorflow/python/kernel_tests/constant_op_eager_test.py
index 3b71586b55451df86bf214437be3ceec8a4265eb..8e9d75667d49bf9e377ccb9290a3a91786b5a1cb 100644
--- a/tensorflow/python/kernel_tests/constant_op_eager_test.py
+++ b/tensorflow/python/kernel_tests/constant_op_eager_test.py
@@ -237,6 +237,39 @@ class ConstantTest(test.TestCase):
     self._testAll((1, x))
     self._testAll((x, 1))
 
+  def testInvalidLength(self):
+
+    class BadList(list):
+
+      def __init__(self):
+        super(BadList, self).__init__([1, 2, 3])  # pylint: disable=invalid-length-returned
+
+      def __len__(self):
+        return -1
+
+    with self.assertRaisesRegexp(ValueError, "should return >= 0"):
+      constant_op.constant([BadList()])
+    with self.assertRaisesRegexp(ValueError, "mixed types"):
+      constant_op.constant([1, 2, BadList()])
+    with self.assertRaisesRegexp(ValueError, "should return >= 0"):
+      constant_op.constant(BadList())
+    with self.assertRaisesRegexp(ValueError, "should return >= 0"):
+      constant_op.constant([[BadList(), 2], 3])
+    with self.assertRaisesRegexp(ValueError, "should return >= 0"):
+      constant_op.constant([BadList(), [1, 2, 3]])
+    with self.assertRaisesRegexp(ValueError, "should return >= 0"):
+      constant_op.constant([BadList(), []])
+
+    # TODO(allenl, josh11b): These cases should return exceptions rather than
+    # working (currently shape checking only checks the first element of each
+    # sequence recursively). Maybe the first one is fine, but the second one
+    # silently truncating is rather bad.
+
+    # with self.assertRaisesRegexp(ValueError, "should return >= 0"):
+    #   constant_op.constant([[3, 2, 1], BadList()])
+    # with self.assertRaisesRegexp(ValueError, "should return >= 0"):
+    #   constant_op.constant([[], BadList()])
+
   def testSparseValuesRaiseErrors(self):
     with self.assertRaisesRegexp(ValueError, "non-rectangular Python sequence"):
       constant_op.constant([[1, 2], [3]], dtype=dtypes_lib.int32)
diff --git a/tensorflow/python/kernel_tests/constant_op_test.py b/tensorflow/python/kernel_tests/constant_op_test.py
index 6cbdd4cbb32259a198f6d4c9088a94b2251de916..030c690167fd7edef9ad929eb5cee5f03d9d5883 100644
--- a/tensorflow/python/kernel_tests/constant_op_test.py
+++ b/tensorflow/python/kernel_tests/constant_op_test.py
@@ -44,7 +44,8 @@ class ConstantTest(test.TestCase):
     np_ans = np.array(x)
     with self.test_session(use_gpu=False):
       tf_ans = ops.convert_to_tensor(x).eval()
-    if np_ans.dtype in [np.float32, np.float64, np.complex64, np.complex128]:
+    dtype = dtypes_lib.as_dtype(np_ans.dtype)
+    if dtype.is_floating or dtype.is_complex:
       self.assertAllClose(np_ans, tf_ans)
     else:
       self.assertAllEqual(np_ans, tf_ans)
@@ -53,7 +54,8 @@ class ConstantTest(test.TestCase):
     np_ans = np.array(x)
     with self.test_session(use_gpu=True):
       tf_ans = ops.convert_to_tensor(x).eval()
-    if np_ans.dtype in [np.float32, np.float64, np.complex64, np.complex128]:
+    dtype = dtypes_lib.as_dtype(np_ans.dtype)
+    if dtype.is_floating or dtype.is_complex:
       self.assertAllClose(np_ans, tf_ans)
     else:
       self.assertAllEqual(np_ans, tf_ans)
@@ -62,6 +64,19 @@ class ConstantTest(test.TestCase):
     self._testCpu(x)
     self._testGpu(x)
 
+  def testBFloat16(self):
+    bfloat16 = dtypes_lib.bfloat16.as_numpy_dtype
+    self._testAll(np.arange(-15, 15).reshape([2, 3, 5]).astype(bfloat16))
+    self._testAll(
+        np.random.normal(size=30).reshape([2, 3, 5]).astype(bfloat16))
+    self._testAll(np.empty((2, 0, 5)).astype(bfloat16))
+
+  def testHalf(self):
+    self._testAll(np.arange(-15, 15).reshape([2, 3, 5]).astype(np.float16))
+    self._testAll(
+        np.random.normal(size=30).reshape([2, 3, 5]).astype(np.float16))
+    self._testAll(np.empty((2, 0, 5)).astype(np.float16))
+
   def testFloat(self):
     self._testAll(np.arange(-15, 15).reshape([2, 3, 5]).astype(np.float32))
     self._testAll(
@@ -439,11 +454,10 @@ class ZerosLikeTest(test.TestCase):
 
   def testZerosLikeCPU(self):
     for dtype in [
-        dtypes_lib.float32, dtypes_lib.float64,
-        dtypes_lib.int8, dtypes_lib.uint8, dtypes_lib.int16, dtypes_lib.uint16,
-        dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.bool,
-        dtypes_lib.complex64, dtypes_lib.complex128,
-        dtypes_lib.string
+        dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int8,
+        dtypes_lib.uint8, dtypes_lib.int16, dtypes_lib.uint16, dtypes_lib.int32,
+        dtypes_lib.int64, dtypes_lib.bool, dtypes_lib.complex64,
+        dtypes_lib.complex128, dtypes_lib.string
     ]:
       self._compareZeros(dtype, fully_defined_shape=False, use_gpu=False)
       self._compareZeros(dtype, fully_defined_shape=True, use_gpu=False)
@@ -574,10 +588,10 @@ class OnesLikeTest(test.TestCase):
 
   def testOnesLike(self):
     for dtype in [
-        dtypes_lib.float32, dtypes_lib.float64,
-        dtypes_lib.int8, dtypes_lib.uint8, dtypes_lib.int16, dtypes_lib.uint16,
-        dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.bool,
-        dtypes_lib.complex64, dtypes_lib.complex128
+        dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int8,
+        dtypes_lib.uint8, dtypes_lib.int16, dtypes_lib.uint16, dtypes_lib.int32,
+        dtypes_lib.int64, dtypes_lib.bool, dtypes_lib.complex64,
+        dtypes_lib.complex128
     ]:
       numpy_dtype = dtype.as_numpy_dtype
       with self.test_session():
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index fc125daf38e73a88e2a89de7acea5cc9518f955d..7f2c2545dc6483bb8282671aaa4fa65b39a73af7 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -38,6 +38,7 @@ from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
@@ -68,16 +69,6 @@ from tensorflow.python.training import gradient_descent
 from tensorflow.python.util import nest
 
 
-def check_op_order(graph):
-  """Sanity check on the ordering of op id."""
-
-  for op in graph.get_operations():
-    for v in op.inputs:
-      assert v.op._id < op._id or op.type == "Merge", (
-          "The id of %s must be less than the id of %s" % (v.op.name, op.name))
-  return True
-
-
 def check_consumers(graph):
   """Sanity check on the consumer list of the tensors."""
 
@@ -122,14 +113,16 @@ def opt_cfg():
               do_constant_folding=True)))
 
 
-def isum(s):
+def isum(s, maximum_iterations=None):
   i = constant_op.constant(0, name="i")
   c = lambda i, s: math_ops.less(i, 10)
   b = lambda i, s: [math_ops.add(i, 1), math_ops.add(i, s)]
-  _, r_s = control_flow_ops.while_loop(c, b, [i, s])
+  _, r_s = control_flow_ops.while_loop(
+      c, b, [i, s], maximum_iterations=maximum_iterations)
   return r_s
 
 
+@test_util.with_c_api
 class ControlFlowTest(test.TestCase):
 
   def testRefIdentity(self):
@@ -140,7 +133,6 @@ class ControlFlowTest(test.TestCase):
       op = state_ops.assign(v, 9)
       v2 = control_flow_ops.with_dependencies([op], v)
 
-      self.assertTrue(check_op_order(v.graph))
       self.assertTrue(isinstance(v2, ops.Tensor))
       variables.global_variables_initializer().run()
       self.assertEqual(9, v2.eval())
@@ -352,14 +344,20 @@ class ControlFlowTest(test.TestCase):
     grad = gradients_impl.gradients(y, [v])
     self.assertAllEqual([None], grad)
 
-  def testFetchables(self):
+  def testFetchable(self):
     with self.test_session() as sess:
       x = array_ops.placeholder(dtypes.float32)
       control_flow_ops.cond(
           constant_op.constant(True), lambda: x + 2, lambda: x + 0)
-      tensor_names = all_fetchables()
-      for name in tensor_names:
-        sess.run(name, feed_dict={x: 3})
+      graph = ops.get_default_graph()
+      for op in graph.get_operations():
+        for t in op.inputs:
+          if graph.is_fetchable(t.op):
+            sess.run(t, feed_dict={x: 3})
+          else:
+            with self.assertRaisesRegexp(ValueError,
+                                         "has been marked as not fetchable"):
+              sess.run(t, feed_dict={x: 3})
 
   def testFeedable(self):
     with self.test_session() as sess:
@@ -390,7 +388,6 @@ class ControlFlowTest(test.TestCase):
 
       val = r.values.eval()
       ind = r.indices.eval()
-    self.assertTrue(check_op_order(x.values.graph))
     self.assertAllEqual(11, val)
     self.assertAllEqual(0, ind)
 
@@ -437,7 +434,6 @@ class ControlFlowTest(test.TestCase):
 
       val = r.values.eval()
       ind = r.indices.eval()
-    self.assertTrue(check_op_order(x.values.graph))
     self.assertAllEqual(11, val)
     self.assertAllEqual(0, ind)
     self.assertTrue(ind.dtype == np.int64)
@@ -466,7 +462,6 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(pred, fn1, fn2)
 
       result = r.eval()
-    self.assertTrue(check_op_order(x.graph))
     self.assertAllEqual(11, result)
 
   def testCond_1(self):
@@ -480,7 +475,6 @@ class ControlFlowTest(test.TestCase):
           math_ops.less(1, 0), lambda: math_ops.add(x, 1),
           lambda: math_ops.subtract(x, 1))
       result = r.eval()
-    self.assertTrue(check_op_order(x.graph))
     self.assertAllEqual(9, result)
 
   def testCond_3(self):
@@ -493,7 +487,6 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(pred, fn3, fn2)
 
       result = r.eval()
-    self.assertTrue(check_op_order(x.graph))
     self.assertAllEqual(12, result)
 
   def testCond_4(self):
@@ -512,7 +505,6 @@ class ControlFlowTest(test.TestCase):
       variables.global_variables_initializer().run()
       self.assertEqual(len(r), 2)
       result = r[1].eval()
-      self.assertTrue(check_op_order(age.graph))
       self.assertAllEqual(True, result)
       self.assertAllEqual(7, v1.eval())
       self.assertAllEqual(2, v2.eval())
@@ -740,6 +732,34 @@ class ControlFlowTest(test.TestCase):
       r = isum(s)
       self.assertAllEqual(45, r.eval())
 
+  def testWhileWithMaximumIterations(self):
+    with self.test_session():
+      s = constant_op.constant([1, 2, 3, 4, 5])
+      r = isum(s, maximum_iterations=3)
+      self.assertAllEqual([1+3, 2+3, 3+3, 4+3, 5+3], r.eval())
+
+  def testWhileWithMaximumIterationsAndSingleArgument(self):
+    with self.test_session():
+      r = control_flow_ops.while_loop(
+          lambda i: i < 3,
+          lambda i: i + 1,
+          [0],
+          maximum_iterations=1)
+      self.assertEqual(1, r.eval())
+
+  def testInvalidMaximumIterationsContext(self):
+    def outer_body(i, r):
+      r = control_flow_ops.while_loop(lambda i: i < 3, lambda i: i + 1, [0],
+                                      maximum_iterations=r.shape[0])
+      return i, r
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        "maximum_iterations tensor cannot be declared in tf.cond or "
+        "tf.while_loop"):
+      control_flow_ops.while_loop(lambda i, r: i < 3, outer_body,
+                                  [0, constant_op.constant([1])])
+
   # Have more than 10 parallel iterations and hence exercise k-bound
   # most of the time.
   def testWhile_3(self):
@@ -760,7 +780,6 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(lambda i, m, c, o: math_ops.less(i, d),
                                       compute, [i, m, c, o])
       result = r[3].eval()
-    self.assertTrue(check_op_order(i.graph))
     self.assertAllEqual(10100, result)
 
   def testWhile_4(self):
@@ -782,7 +801,6 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(lambda i, m, c, o: math_ops.less(i, s),
                                       compute, [i, m, c, o])
       result = r[3].eval()
-    self.assertTrue(check_op_order(i.graph))
     self.assertAllEqual(42, result)
 
   def testWhile_5(self):
@@ -807,7 +825,6 @@ class ControlFlowTest(test.TestCase):
               tensor_shape.unknown_shape()
           ])
       result = r[2].eval()
-    self.assertTrue(check_op_order(i.graph))
     self.assertAllEqual(np.array([0, 1, 2, 3, 4, 5, 6]), result)
 
   def testBufferForwarding(self):
@@ -908,7 +925,13 @@ class ControlFlowTest(test.TestCase):
       self.assertTrue(r[1].get_shape()[0].value is None)
       self.assertEqual(r[1].get_shape()[1], tensor_shape.Dimension(2))
 
-      with self.assertRaisesRegexp(ValueError, "not an invariant for"):
+      with self.assertRaisesRegexp(
+          ValueError,
+          r"The shape for while_1/Merge_1:0 is not an invariant for the loop. "
+          r"It enters the loop with shape \(2, 2\), but has shape \(4, 2\) "
+          r"after one iteration. Provide shape invariants using either the "
+          r"`shape_invariants` argument of tf.while_loop or set_shape\(\) on "
+          r"the loop variables."):
         r = control_flow_ops.while_loop(c, b, [i, m])
 
   def testWhileShapeInferenceSparseTensor(self):
@@ -1247,7 +1270,6 @@ class ControlFlowTest(test.TestCase):
 
       r = control_flow_ops.while_loop(
           loop_iterator, loop_body, [n], parallel_iterations=1)
-      self.assertTrue(check_op_order(n.graph))
       variables.global_variables_initializer().run()
       self.assertEqual(3, r.eval())
       result = select.eval()
@@ -1272,7 +1294,6 @@ class ControlFlowTest(test.TestCase):
 
       r = control_flow_ops.while_loop(
           loop_iterator, loop_body, [n], parallel_iterations=1)
-      self.assertTrue(check_op_order(n.graph))
       variables.global_variables_initializer().run()
       self.assertEqual(3, r.eval())
       result1 = select1.eval()
@@ -1299,7 +1320,6 @@ class ControlFlowTest(test.TestCase):
           parallel_iterations=1)
       variables.global_variables_initializer().run()
       result = r[1].eval()
-    self.assertTrue(check_op_order(n.graph))
     self.assertAllClose(np.array([10.0, 10.0, 10.0]), result)
 
   # b/24814703
@@ -1444,7 +1464,8 @@ class ControlFlowTest(test.TestCase):
     gpu_dev_name = test.gpu_device_name() if test.is_gpu_available(
     ) else "/device:GPU:0"
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    graph = ops.Graph()
+    with graph.as_default():
       v = constant_op.constant(2.0, name="v")
       c = lambda v: math_ops.less(v, 100.0)
 
@@ -1455,7 +1476,8 @@ class ControlFlowTest(test.TestCase):
       loop = control_flow_ops.while_loop(c, b, [v], parallel_iterations=1)
       r = gradients_impl.gradients(
           loop, v, colocate_gradients_with_ops=colocate)[0]
-    r_ops = r.graph.get_operations()
+
+    r_ops = graph.get_operations()
     r_devices = [(op.name, op.device) for op in r_ops]
 
     self.assertTrue(any("Square" in op.name for op in r_ops))
@@ -1469,7 +1491,9 @@ class ControlFlowTest(test.TestCase):
         self.assertTrue(gpu_dev_name in dev)
       else:
         self.assertFalse(gpu_dev_name in dev)
-    self.assertAllClose(1024.0, sess.run(r))
+
+    with self.test_session(graph=graph) as sess:
+      self.assertAllClose(1024.0, sess.run(r))
 
   def testWhileGrad_ColocateGradients(self):
     self._testWhileGrad_ColocateGradients(colocate=False)
@@ -2268,8 +2292,7 @@ class ControlFlowTest(test.TestCase):
       # Duplicate events cause an error if exclusive = True
       r4 = control_flow_ops.case(
           [(x < y, f1), (x < y, f2)], default=f3, exclusive=True)
-      with self.assertRaisesOpError(
-          "More than one condition evaluated as True but exclusive=True."):
+      with self.assertRaisesOpError("Input error:"):
         r4.eval()
 
       # Check that the default is called if none of the others are
@@ -2616,6 +2639,124 @@ class ControlFlowTest(test.TestCase):
           1)
 
 
+@test_util.with_c_api
+class ControlFlowContextCheckTest(test.TestCase):
+
+  def _getWhileTensor(self):
+    """Creates and returns a tensor from a while context."""
+    tensor = []
+
+    def body(i):
+      if not tensor:
+        tensor.append(constant_op.constant(1))
+      return i + tensor[0]
+
+    control_flow_ops.while_loop(lambda i: i < 10, body, [0])
+    return tensor[0]
+
+  def _getCondTensor(self):
+    cond_tensor = []
+    def true_fn():
+      if not cond_tensor:
+        cond_tensor.append(constant_op.constant(1))
+      return cond_tensor[0]
+    control_flow_ops.cond(math_ops.less(1, 2), true_fn,
+                          lambda: constant_op.constant(0))
+    return cond_tensor[0]
+
+  def testInvalidContext(self):
+    # Accessing a while loop tensor outside of control flow is illegal.
+    while_tensor = self._getWhileTensor()
+    with self.assertRaisesRegexp(
+        ValueError,
+        "Cannot use 'while/Const_1' as input to 'Add' because 'while/Const_1' "
+        "is in a while loop. See info log for more details."):
+      math_ops.add(1, while_tensor)
+
+  def testInvalidContextInCond(self):
+    # Accessing a while loop tensor in cond is illegal.
+    while_tensor = self._getWhileTensor()
+    with self.assertRaisesRegexp(
+        ValueError,
+        "Cannot use 'while/Const_1' as input to 'cond/Add' because "
+        "'while/Const_1' is in a while loop. See info log for more details."):
+      # TODO(skyewm): this passes if we return while_tensor directly instead
+      # of using it as input to another op.
+      control_flow_ops.cond(math_ops.less(1, 2),
+                            lambda: math_ops.add(1, while_tensor),
+                            lambda: constant_op.constant(0))
+
+  def testInvalidContextInWhile(self):
+    # Accessing a while loop tensor in a different while loop is illegal.
+    while_tensor = self._getWhileTensor()
+    with self.assertRaisesRegexp(
+        ValueError,
+        "Cannot use 'while_1/Add' as input to 'while/Const_1' because they are "
+        "in different while loops. See info log for more details."):
+      control_flow_ops.while_loop(lambda i: i < 10,
+                                  lambda x: math_ops.add(1, while_tensor), [0])
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        "Cannot use 'while_2/NextIteration' as input to 'while/Const_1' "
+        "because they are in different while loops. See info log for more "
+        "details."):
+      control_flow_ops.while_loop(lambda i: i < 10, lambda i: while_tensor, [0])
+
+  def testValidCondContext(self):
+    # Accessing a tensor from a cond context is OK (although dangerous).
+    cond_tensor = self._getCondTensor()
+    math_ops.add(1, cond_tensor)
+
+  def testValidCondContextBranches(self):
+    # Accessing a tensor from a cond context from the other branch's cond
+    # context is OK (although dangerous).
+    cond_tensor = []
+    def branch_fn():
+      if not cond_tensor:
+        cond_tensor.append(constant_op.constant(1))
+      return cond_tensor[0]
+
+    control_flow_ops.cond(math_ops.less(1, 2), branch_fn, branch_fn)
+
+  def testValidWhileContext(self):
+    # Accessing a tensor in a nested while is OK.
+    def body(_):
+      c = constant_op.constant(1)
+      return control_flow_ops.while_loop(lambda i: i < 3, lambda i: i + c, [0])
+
+    control_flow_ops.while_loop(lambda i: i < 5, body, [0])
+
+  def testValidNestedContexts(self):
+    # Accessing a tensor from a cond context in a while context, all inside an
+    # outer while context, is OK.
+    def body(_):
+      cond_tensor = self._getCondTensor()
+      # Create another cond containing the while loop for good measure
+      return control_flow_ops.cond(
+          math_ops.less(1, 2),
+          lambda: control_flow_ops.while_loop(lambda i: i < 3,
+                                              lambda i: i + cond_tensor, [0]),
+          lambda: constant_op.constant(0))
+
+    control_flow_ops.while_loop(lambda i: i < 5, body, [0])
+
+  def testInvalidNestedContexts(self):
+    # Accessing a tensor from a while context in a different while context, all
+    # inside a cond context, is illegal.
+    def true_fn():
+      while_tensor = self._getWhileTensor()
+      return control_flow_ops.while_loop(lambda i: i < 3,
+                                         lambda i: i + while_tensor, [0])
+    with self.assertRaisesRegexp(
+        ValueError,
+        "Cannot use 'cond/while_1/add' as input to 'cond/while/Const_1' because"
+        " they are in different while loops. See info log for more details."):
+      control_flow_ops.cond(math_ops.less(1, 2), true_fn,
+                            lambda: constant_op.constant(0))
+
+
+@test_util.with_c_api
 class TupleTest(test.TestCase):
 
   def testTensors(self):
@@ -2701,6 +2842,7 @@ class TupleTest(test.TestCase):
       self.assertEquals(1, var.eval())
 
 
+@test_util.with_c_api
 class AssertTest(test.TestCase):
 
   def testGuardedAssertDoesNotCopyWhenTrue(self):
@@ -2738,6 +2880,7 @@ class AssertTest(test.TestCase):
       self.assertEqual([], guarded_memcpy_nodestat_names)
 
 
+@test_util.with_c_api
 class WhileOpBenchmark(test.Benchmark):
   """Evaluate the performance of while_loop op."""
 
@@ -2851,6 +2994,7 @@ class WhileOpBenchmark(test.Benchmark):
         name="unroll_same_device", iters=iters, wall_time=duration)
 
 
+@test_util.with_c_api
 class EagerTest(test.TestCase):
 
   def testCond(self):
@@ -2869,6 +3013,22 @@ class EagerTest(test.TestCase):
       self.assertAllEqual(isum(tensor).numpy(),
                           [46, 47, 48, 49, 50])
 
+  def testWhileLoopWithMaxIterations(self):
+    with context.eager_mode():
+      tensor = constant_op.constant([1, 2, 3, 4, 5])
+      self.assertAllEqual(isum(tensor, maximum_iterations=3).numpy(),
+                          [1+3, 2+3, 3+3, 4+3, 5+3])
+
+  def testWhileWithMaximumIterationsAndSingleArgument(self):
+    with context.eager_mode():
+      tensor = constant_op.constant(0)
+      r = control_flow_ops.while_loop(
+          lambda i: i < 3,
+          lambda i: i + 1,
+          [tensor],
+          maximum_iterations=1)
+      self.assertEqual(1, r.numpy())
+
   def testWithDependencies(self):
     with context.eager_mode():
       t1 = constant_op.constant(1)
@@ -2897,5 +3057,6 @@ class EagerTest(test.TestCase):
                                  default=f3, exclusive=True)
       self.assertAllEqual(r1.numpy(), 17)
 
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/control_flow_util_test.py b/tensorflow/python/kernel_tests/control_flow_util_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..39e96f74b0461da0cf499e303b30a4a41aae4899
--- /dev/null
+++ b/tensorflow/python/kernel_tests/control_flow_util_test.py
@@ -0,0 +1,71 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for tensorflow.python.ops.control_flow_util."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import test_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import gen_control_flow_ops
+from tensorflow.python.platform import test
+
+
+class ControlFlowUtilTest(test.TestCase):
+
+  def testIsSwitch(self):
+    switch_false, _ = control_flow_ops.switch(1, True)
+    switch = switch_false.op
+    self.assertTrue(control_flow_util.IsSwitch(switch))
+
+    ref_switch_false, _ = control_flow_ops.ref_switch(test_ops.ref_output(),
+                                                      True)
+    ref_switch = ref_switch_false.op
+    self.assertTrue(control_flow_util.IsSwitch(ref_switch))
+
+    self.assertFalse(control_flow_util.IsSwitch(test_ops.int_output().op))
+
+  def testIsLoopEnter(self):
+    enter = gen_control_flow_ops.enter(1, frame_name="name").op
+    self.assertTrue(control_flow_util.IsLoopEnter(enter))
+    self.assertFalse(control_flow_util.IsLoopConstantEnter(enter))
+
+    ref_enter = gen_control_flow_ops.ref_enter(test_ops.ref_output(),
+                                               frame_name="name").op
+    self.assertTrue(control_flow_util.IsLoopEnter(ref_enter))
+    self.assertFalse(control_flow_util.IsLoopConstantEnter(ref_enter))
+
+    const_enter = gen_control_flow_ops.enter(1, frame_name="name",
+                                             is_constant=True).op
+    self.assertTrue(control_flow_util.IsLoopEnter(const_enter))
+    self.assertTrue(control_flow_util.IsLoopConstantEnter(const_enter))
+
+    self.assertFalse(control_flow_util.IsLoopEnter(test_ops.int_output().op))
+
+  def testIsLoopExit(self):
+    exit_op = control_flow_ops.exit(1).op
+    self.assertTrue(control_flow_util.IsLoopExit(exit_op))
+
+    ref_exit = control_flow_ops.exit(test_ops.ref_output()).op
+    self.assertTrue(control_flow_util.IsLoopExit(ref_exit))
+
+    self.assertFalse(control_flow_util.IsLoopExit(test_ops.int_output().op))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/conv1d_test.py b/tensorflow/python/kernel_tests/conv1d_test.py
index 7c8d309bbd36b3f81144da1a96b1eb55894e70c0..d92797a7d38cbe359d8166ea9ad7c25bd9cd1f4b 100644
--- a/tensorflow/python/kernel_tests/conv1d_test.py
+++ b/tensorflow/python/kernel_tests/conv1d_test.py
@@ -40,7 +40,7 @@ class Conv1DTest(test.TestCase):
     filters = array_ops.expand_dims(filters, 2)  # out_channels
     # Filters is 2x1x1
     for stride in [1, 2]:
-      with self.test_session():
+      with self.test_session(use_gpu=test.is_gpu_available()):
         c = nn_ops.conv1d(x, filters, stride, padding="VALID")
         reduced = array_ops.squeeze(c)
         output = reduced.eval()
@@ -52,7 +52,6 @@ class Conv1DTest(test.TestCase):
           self.assertEqual(len(output), 2)
           self.assertAllClose(output, [2 * 1 + 1 * 2, 2 * 3 + 1 * 4])
 
-
   def testConv1DTranspose(self):
     with self.test_session():
       stride = 2
@@ -93,5 +92,6 @@ class Conv1DTest(test.TestCase):
 
     self.assertAllClose(cache_values, value)
 
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/conv2d_backprop_filter_grad_test.py b/tensorflow/python/kernel_tests/conv2d_backprop_filter_grad_test.py
index 1679857bd5b9c5a9a1fbf89f207befc4382223b1..be299beee48cd8fb058393840eddfe08da1d6d99 100644
--- a/tensorflow/python/kernel_tests/conv2d_backprop_filter_grad_test.py
+++ b/tensorflow/python/kernel_tests/conv2d_backprop_filter_grad_test.py
@@ -42,17 +42,21 @@ class Conv2DBackpropFilterGradTest(test.TestCase):
           filter_shape = [3, 3, 4, 6]
           # Make a convolution op with the current settings, just to easily get
           # the shape of the output.
-          conv_out = nn_ops.conv2d(in_val,
-                                   array_ops.zeros(filter_shape),
-                                   [1, stride, stride, 1], padding)
+          conv_out = nn_ops.conv2d(
+              in_val,
+              array_ops.zeros(filter_shape),
+              strides=[1, stride, stride, 1],
+              padding=padding)
           out_backprop_shape = conv_out.get_shape().as_list()
           out_backprop_val = constant_op.constant(
               2 * np.random.random_sample(out_backprop_shape) - 1,
               dtype=dtypes.float32)
-          output = nn_ops.conv2d_backprop_filter(in_val, filter_shape,
-                                                 out_backprop_val,
-                                                 [1, stride, stride, 1],
-                                                 padding)
+          output = nn_ops.conv2d_backprop_filter(
+              in_val,
+              filter_shape,
+              out_backprop_val,
+              strides=[1, stride, stride, 1],
+              padding=padding)
           err = gradient_checker.compute_gradient_error(
               [in_val, out_backprop_val], [in_shape, out_backprop_shape],
               output, filter_shape)
@@ -60,6 +64,42 @@ class Conv2DBackpropFilterGradTest(test.TestCase):
           err_tolerance = 2e-3
           self.assertLess(err, err_tolerance)
 
+  def testGradientDilatedConv(self):
+    if test.is_gpu_available(cuda_only=True):
+      with self.test_session(use_gpu=True):
+        for padding in ["SAME", "VALID"]:
+          for stride in [1, 2]:
+            np.random.seed(1)
+            in_shape = [5, 8, 6, 4]
+            in_val = constant_op.constant(
+                2 * np.random.random_sample(in_shape) - 1, dtype=dtypes.float32)
+            filter_shape = [3, 3, 4, 6]
+            # Make a convolution op with the current settings,
+            # just to easily get the shape of the output.
+            conv_out = nn_ops.conv2d(
+                in_val,
+                array_ops.zeros(filter_shape),
+                dilations=[1, 2, 2, 1],
+                strides=[1, stride, stride, 1],
+                padding=padding)
+            out_backprop_shape = conv_out.get_shape().as_list()
+            out_backprop_val = constant_op.constant(
+                2 * np.random.random_sample(out_backprop_shape) - 1,
+                dtype=dtypes.float32)
+            output = nn_ops.conv2d_backprop_filter(
+                in_val,
+                filter_shape,
+                out_backprop_val,
+                dilations=[1, 2, 2, 1],
+                strides=[1, stride, stride, 1],
+                padding=padding)
+            err = gradient_checker.compute_gradient_error(
+                [in_val, out_backprop_val], [in_shape, out_backprop_shape],
+                output, filter_shape)
+            print("conv2d_backprop_filter gradient err = %g " % err)
+            err_tolerance = 2e-3
+            self.assertLess(err, err_tolerance)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/conv_ops_3d_test.py b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
index 116681fc4c817faab3f577166bc35b1f3018b66e..ec8ac74163d093c57e6e4ffbab6977ce732cc3ef 100644
--- a/tensorflow/python/kernel_tests/conv_ops_3d_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
@@ -68,8 +68,8 @@ class Conv3DTest(test.TestCase):
       total_size_2 *= s
 
     # Initializes the input tensor with array containing numbers from 0 to 1.
-    # We keep the input tensor values fairly small to avoid overflowing a float16 
-    # tensor during the conv3d 
+    # We keep the input tensor values fairly small to avoid overflowing float16
+    # during the conv3d.
     x1 = [f * 1.0 / total_size_1 for f in range(1, total_size_1 + 1)]
     x2 = [f * 1.0 / total_size_2 for f in range(1, total_size_2 + 1)]
     with self.test_session(use_gpu=use_gpu):
@@ -115,15 +115,13 @@ class Conv3DTest(test.TestCase):
           if value.dtype == np.float16:
             tol = 1e-3
 
-          self.assertAllClose(expected, value.flatten(), atol=tol,
-                              rtol=tol)
+          self.assertAllClose(expected, value.flatten(), atol=tol, rtol=tol)
 
   def testConv3D1x1x1Filter(self):
     expected_output = [
-        0.18518519,  0.22222222,  0.25925926,  0.40740741,  0.5       ,
-        0.59259259,  0.62962963,  0.77777778,  0.92592593,  0.85185185,
-        1.05555556,  1.25925926,  1.07407407,  1.33333333,  1.59259259,
-        1.2962963 ,  1.61111111,  1.92592593
+        0.18518519, 0.22222222, 0.25925926, 0.40740741, 0.5, 0.59259259,
+        0.62962963, 0.77777778, 0.92592593, 0.85185185, 1.05555556, 1.25925926,
+        1.07407407, 1.33333333, 1.59259259, 1.2962963, 1.61111111, 1.92592593
     ]
 
     # These are equivalent to the Conv2D1x1 case.
@@ -149,10 +147,10 @@ class Conv3DTest(test.TestCase):
   # Expected values computed using scipy's correlate function.
   def testConv3D2x2x2Filter(self):
     expected_output = [
-        3.77199074,   3.85069444,   3.92939815,   4.2650463 ,   4.35763889,
-        4.45023148,   6.73032407,   6.89236111,   7.05439815,   7.22337963,
-        7.39930556,   7.57523148,   9.68865741,   9.93402778,  10.17939815,
-        10.18171296,  10.44097222,  10.70023148
+        3.77199074, 3.85069444, 3.92939815, 4.2650463, 4.35763889, 4.45023148,
+        6.73032407, 6.89236111, 7.05439815, 7.22337963, 7.39930556, 7.57523148,
+        9.68865741, 9.93402778, 10.17939815, 10.18171296, 10.44097222,
+        10.70023148
     ]
     # expected_shape = [1, 3, 1, 2, 5]
     self._VerifyValues(
@@ -164,19 +162,17 @@ class Conv3DTest(test.TestCase):
 
   def testConv3DStrides(self):
     expected_output = [
-        0.06071429,  0.08988095,  0.10238095,  0.11488095,  0.12738095,
-        0.13988095,  0.08452381,  0.26071429,  0.35238095,  0.36488095,
-        0.37738095,  0.38988095,  0.40238095,  0.23452381,  0.46071429,
-        0.61488095,  0.62738095,  0.63988095,  0.65238095,  0.66488095,
-        0.38452381,  1.12738095,  1.48988095,  1.50238095,  1.51488095,
-        1.52738095,  1.53988095,  0.88452381,  1.32738095,  1.75238095,
-        1.76488095,  1.77738095,  1.78988095,  1.80238095,  1.03452381,
-        1.52738095,  2.01488095,  2.02738095,  2.03988095,  2.05238095,
-        2.06488095,  1.18452381,  2.19404762,  2.88988095,  2.90238095,
-        2.91488095,  2.92738095,  2.93988095,  1.68452381,  2.39404762,
-        3.15238095,  3.16488095,  3.17738095,  3.18988095,  3.20238095,
-        1.83452381,  2.59404762,  3.41488095,  3.42738095,  3.43988095,
-        3.45238095,  3.46488095,  1.98452381
+        0.06071429, 0.08988095, 0.10238095, 0.11488095, 0.12738095, 0.13988095,
+        0.08452381, 0.26071429, 0.35238095, 0.36488095, 0.37738095, 0.38988095,
+        0.40238095, 0.23452381, 0.46071429, 0.61488095, 0.62738095, 0.63988095,
+        0.65238095, 0.66488095, 0.38452381, 1.12738095, 1.48988095, 1.50238095,
+        1.51488095, 1.52738095, 1.53988095, 0.88452381, 1.32738095, 1.75238095,
+        1.76488095, 1.77738095, 1.78988095, 1.80238095, 1.03452381, 1.52738095,
+        2.01488095, 2.02738095, 2.03988095, 2.05238095, 2.06488095, 1.18452381,
+        2.19404762, 2.88988095, 2.90238095, 2.91488095, 2.92738095, 2.93988095,
+        1.68452381, 2.39404762, 3.15238095, 3.16488095, 3.17738095, 3.18988095,
+        3.20238095, 1.83452381, 2.59404762, 3.41488095, 3.42738095, 3.43988095,
+        3.45238095, 3.46488095, 1.98452381
     ]
     self._VerifyValues(
         tensor_in_sizes=[1, 5, 8, 7, 1],
@@ -187,8 +183,7 @@ class Conv3DTest(test.TestCase):
 
   def testConv3D2x2x2FilterStride2(self):
     expected_output = [
-        3.77199074,  3.85069444,  3.92939815,  9.68865741,  9.93402778,
-        10.17939815
+        3.77199074, 3.85069444, 3.92939815, 9.68865741, 9.93402778, 10.17939815
     ]
     self._VerifyValues(
         tensor_in_sizes=[1, 4, 2, 3, 3],
@@ -199,14 +194,12 @@ class Conv3DTest(test.TestCase):
 
   def testConv3DStride3(self):
     expected_output = [
-        1.51140873,  1.57167659,  1.63194444,  1.56349206,  1.62673611,
-        1.68998016,  1.6155754 ,  1.68179563,  1.74801587,  1.9280754 ,
-        2.01215278,  2.09623016,  1.98015873,  2.0672123 ,  2.15426587,
-        2.03224206,  2.12227183,  2.21230159,  4.4280754 ,  4.65500992,
-        4.88194444,  4.48015873,  4.71006944,  4.93998016,  4.53224206,
-        4.76512897,  4.99801587,  4.84474206,  5.09548611,  5.34623016,
-        4.8968254 ,  5.15054563,  5.40426587,  4.94890873,  5.20560516,
-        5.46230159
+        1.51140873, 1.57167659, 1.63194444, 1.56349206, 1.62673611, 1.68998016,
+        1.6155754, 1.68179563, 1.74801587, 1.9280754, 2.01215278, 2.09623016,
+        1.98015873, 2.0672123, 2.15426587, 2.03224206, 2.12227183, 2.21230159,
+        4.4280754, 4.65500992, 4.88194444, 4.48015873, 4.71006944, 4.93998016,
+        4.53224206, 4.76512897, 4.99801587, 4.84474206, 5.09548611, 5.34623016,
+        4.8968254, 5.15054563, 5.40426587, 4.94890873, 5.20560516, 5.46230159
     ]
     self._VerifyValues(
         tensor_in_sizes=[1, 6, 7, 8, 2],
@@ -217,9 +210,8 @@ class Conv3DTest(test.TestCase):
 
   def testConv3D2x2x2FilterStride2Same(self):
     expected_output = [
-        3.77199074,   3.85069444,   3.92939815,   2.0162037 ,   2.06597222,
-        2.11574074,   9.68865741,   9.93402778,  10.17939815,   4.59953704,
-        4.73263889,   4.86574074
+        3.77199074, 3.85069444, 3.92939815, 2.0162037, 2.06597222, 2.11574074,
+        9.68865741, 9.93402778, 10.17939815, 4.59953704, 4.73263889, 4.86574074
     ]
     self._VerifyValues(
         tensor_in_sizes=[1, 4, 2, 3, 3],
@@ -230,8 +222,8 @@ class Conv3DTest(test.TestCase):
 
   def testKernelSmallerThanStride(self):
     expected_output = [
-        0.03703704,  0.11111111,  0.25925926,  0.33333333,  0.7037037 ,
-        0.77777778,  0.92592593,  1.
+        0.03703704, 0.11111111, 0.25925926, 0.33333333, 0.7037037, 0.77777778,
+        0.92592593, 1.
     ]
     self._VerifyValues(
         tensor_in_sizes=[1, 3, 3, 3, 1],
@@ -247,12 +239,11 @@ class Conv3DTest(test.TestCase):
         expected=expected_output)
 
     expected_output = [
-        0.54081633,  0.58017493,  0.28061224,  0.81632653,  0.85568513,
-        0.40306122,  0.41873178,  0.4340379 ,  0.19642857,  2.46938776,
-        2.50874636,  1.1377551 ,  2.74489796,  2.78425656,  1.26020408,
-        1.16873178,  1.1840379 ,  0.51785714,  1.09511662,  1.10604956,
-        0.44642857,  1.17164723,  1.18258017,  0.47704082,  0.3691691 ,
-        0.37244898,  0.125
+        0.54081633, 0.58017493, 0.28061224, 0.81632653, 0.85568513, 0.40306122,
+        0.41873178, 0.4340379, 0.19642857, 2.46938776, 2.50874636, 1.1377551,
+        2.74489796, 2.78425656, 1.26020408, 1.16873178, 1.1840379, 0.51785714,
+        1.09511662, 1.10604956, 0.44642857, 1.17164723, 1.18258017, 0.47704082,
+        0.3691691, 0.37244898, 0.125
     ]
     self._VerifyValues(
         tensor_in_sizes=[1, 7, 7, 7, 1],
@@ -262,8 +253,8 @@ class Conv3DTest(test.TestCase):
         expected=expected_output)
 
     expected_output = [
-        0.540816,  0.580175,  0.816327,  0.855685,  2.469388,  2.508746,
-        2.744898,  2.784257
+        0.540816, 0.580175, 0.816327, 0.855685, 2.469388, 2.508746, 2.744898,
+        2.784257
     ]
     self._VerifyValues(
         tensor_in_sizes=[1, 7, 7, 7, 1],
@@ -278,7 +269,7 @@ class Conv3DTest(test.TestCase):
         filter_in_sizes=[2, 1, 2, 1, 2],
         stride=1,
         padding="VALID",
-        expected=[1.5625,  1.875])
+        expected=[1.5625, 1.875])
 
   def _ConstructAndTestGradientForConfig(
       self, batch, input_shape, filter_shape, in_depth, out_depth, stride,
@@ -318,7 +309,6 @@ class Conv3DTest(test.TestCase):
     input_data = [x * 1.0 / input_size for x in range(0, input_size)]
     filter_data = [x * 1.0 / filter_size for x in range(0, filter_size)]
 
-
     for data_type in self._DtypesToTest(use_gpu=use_gpu):
       # TODO(mjanusz): Modify gradient_checker to also provide max relative
       # error and synchronize the tolerance levels between the tests for forward
@@ -330,12 +320,11 @@ class Conv3DTest(test.TestCase):
       elif data_type == dtypes.float16:
         tolerance = 1e-3
 
-
       with self.test_session(use_gpu=use_gpu):
         orig_input_tensor = constant_op.constant(
-          input_data, shape=input_shape, dtype=data_type, name="input")
+            input_data, shape=input_shape, dtype=data_type, name="input")
         filter_tensor = constant_op.constant(
-          filter_data, shape=filter_shape, dtype=data_type, name="filter")
+            filter_data, shape=filter_shape, dtype=data_type, name="filter")
 
         if data_format == "NCDHW":
           input_tensor = test_util.NHWCToNCHW(orig_input_tensor)
@@ -345,25 +334,23 @@ class Conv3DTest(test.TestCase):
           new_strides = strides
 
         conv = nn_ops.conv3d(
-          input_tensor, filter_tensor, new_strides, padding,
-          data_format=data_format, name="conv")
+            input_tensor,
+            filter_tensor,
+            new_strides,
+            padding,
+            data_format=data_format,
+            name="conv")
 
         if data_format == "NCDHW":
           conv = test_util.NCHWToNHWC(conv)
 
-        
         if test_input:
-          jacob_t, jacob_n = gradient_checker.compute_gradient(orig_input_tensor,
-                                                               input_shape,
-                                                               conv,
-                                                               output_shape)
+          jacob_t, jacob_n = gradient_checker.compute_gradient(
+              orig_input_tensor, input_shape, conv, output_shape)
         else:
-          jacob_t, jacob_n = gradient_checker.compute_gradient(filter_tensor,
-                                                               filter_shape,
-                                                               conv,
-                                                               output_shape)
-        
-        
+          jacob_t, jacob_n = gradient_checker.compute_gradient(
+              filter_tensor, filter_shape, conv, output_shape)
+
         if data_type != dtypes.float16:
           reference_jacob_t = jacob_t
           err = np.fabs(jacob_t - jacob_n).max()
@@ -375,7 +362,6 @@ class Conv3DTest(test.TestCase):
       print("conv3d gradient error = ", err)
       self.assertLess(err, tolerance)
 
-
   def ConstructAndTestGradient(self, **kwargs):
     for data_format, use_gpu in GetTestConfigs():
       self._ConstructAndTestGradientForConfig(data_format=data_format,
diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py
index 22e5400c3745a735d783fef761276694dc830c32..a7cbc76b87979fd57f68e469619929226631ee32 100644
--- a/tensorflow/python/kernel_tests/conv_ops_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import os
 import time
 
@@ -32,6 +33,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_impl
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
@@ -162,8 +164,8 @@ class Conv2DTest(test.TestCase):
       # as we will be using its gradients as reference for fp16 gradients.
       return [dtypes.float32, dtypes.float16]
 
-  def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, strides,
-                            padding, data_format, dtype, use_gpu):
+  def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, dilations,
+                            strides, padding, data_format, dtype, use_gpu):
     """Verifies the output values of the convolution function.
 
     Args:
@@ -171,6 +173,7 @@ class Conv2DTest(test.TestCase):
         [batch, input_rows, input_cols, input_depth].
       filter_in_sizes: Filter tensor dimensions in
         [kernel_rows, kernel_cols, input_depth, output_depth].
+      dilations: Dilated rate: [col_dilation, row_dilation]
       strides: Stride: [col_stride, row_stride]
       padding: Padding type.
       data_format: Format of the data tensors.
@@ -194,11 +197,18 @@ class Conv2DTest(test.TestCase):
       t1 = constant_op.constant(x1, shape=tensor_in_sizes, dtype=dtype)
       t2 = constant_op.constant(x2, shape=filter_in_sizes, dtype=dtype)
       strides = [1] + strides + [1]
+      dilations = [1] + dilations + [1]
       if data_format == "NCHW":
         t1 = test_util.NHWCToNCHW(t1)
         strides = test_util.NHWCToNCHW(strides)
+        dilations = test_util.NHWCToNCHW(dilations)
       conv = nn_ops.conv2d(
-          t1, t2, strides=strides, padding=padding, data_format=data_format)
+          t1,
+          t2,
+          dilations=dilations,
+          strides=strides,
+          padding=padding,
+          data_format=data_format)
       if data_format == "NCHW":
         conv = test_util.NCHWToNHWC(conv)
 
@@ -240,14 +250,87 @@ class Conv2DTest(test.TestCase):
     for i in range(1, len(values)):
       self.assertAllClose(values[0], values[i], rtol=1e-5, atol=1e-5)
 
+  def _ComputeReferenceDilatedConv(self, tensor_in_sizes, filter_in_sizes,
+                                   stride, dilation, padding, data_format,
+                                   use_gpu):
+    total_size_1 = 1
+    total_size_2 = 1
+    for s in tensor_in_sizes:
+      total_size_1 *= s
+    for s in filter_in_sizes:
+      total_size_2 *= s
+
+    # Initializes the input tensor with array containing incrementing
+    # numbers from 1.
+    x1 = [f * 1.0 for f in range(1, total_size_1 + 1)]
+    x2 = [f * 1.0 for f in range(1, total_size_2 + 1)]
+    with test_util.device(use_gpu):
+      t1 = constant_op.constant(x1, shape=tensor_in_sizes)
+      t2 = constant_op.constant(x2, shape=filter_in_sizes)
+      if isinstance(stride, collections.Iterable):
+        strides = list(stride)
+      else:
+        strides = [stride, stride]
+      if data_format == "NCHW":
+        t1 = test_util.NHWCToNCHW(t1)
+        full_strides = [1, 1] + strides
+        full_dilation = [1, 1] + dilation
+      else:
+        full_strides = [1] + strides + [1]
+        full_dilation = [1] + dilation + [1]
+      expected = nn_ops.convolution(
+          t1,
+          t2,
+          padding=padding,
+          strides=strides,
+          dilation_rate=dilation,
+          data_format=data_format)
+      computed = nn_ops.conv2d(
+          t1,
+          t2,
+          strides=full_strides,
+          dilations=full_dilation,
+          padding=padding,
+          data_format=data_format)
+      if data_format == "NCHW":
+        expected = test_util.NCHWToNHWC(expected)
+        computed = test_util.NCHWToNHWC(computed)
+    return expected, computed
+
+  def _VerifyDilatedConvValues(self, tensor_in_sizes, filter_in_sizes, strides,
+                               padding, dilations):
+    expected_results = []
+    computed_results = []
+    default_dilations = (dilations[0] == 1 and dilations[1] == 1)
+    for data_format, use_gpu in GetTestConfigs():
+      # If any dilation rate is larger than 1, only do test on the GPU
+      # because we currently do not have a CPU implementation for arbitrary
+      # dilation rates.
+      if default_dilations or use_gpu:
+        expected, computed = self._ComputeReferenceDilatedConv(
+            tensor_in_sizes, filter_in_sizes, strides, dilations, padding,
+            data_format, use_gpu)
+        expected_results.append(expected)
+        computed_results.append(computed)
+        tolerance = 1e-2 if use_gpu else 1e-5
+        expected_values = self.evaluate(expected_results)
+        computed_values = self.evaluate(computed_results)
+        for e_value, c_value in zip(expected_values, computed_values):
+          print("expected = ", e_value)
+          print("actual = ", c_value)
+          self.assertAllClose(
+              e_value.flatten(), c_value.flatten(), atol=tolerance, rtol=1e-4)
+
   def _VerifyValues(self, tensor_in_sizes, filter_in_sizes, strides, padding,
                     expected):
     tensors = []
+    dilations = [1, 1]
     for (data_format, use_gpu) in GetTestConfigs():
       for dtype in self._DtypesToTest(use_gpu):
         result = self._SetupValuesForDevice(
             tensor_in_sizes,
             filter_in_sizes,
+            dilations,
             strides,
             padding,
             data_format,
@@ -279,6 +362,16 @@ class Conv2DTest(test.TestCase):
         padding="VALID",
         expected=expected_output)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2Filter2x1Dilation(self):
+    if test.is_gpu_available(cuda_only=True):
+      self._VerifyDilatedConvValues(
+          tensor_in_sizes=[1, 4, 4, 1],
+          filter_in_sizes=[2, 2, 1, 1],
+          strides=[1, 1],
+          dilations=[2, 1],
+          padding="VALID")
+
   @test_util.run_in_graph_and_eager_modes()
   def testConv2DEmpty(self):
     expected_output = []
@@ -289,6 +382,16 @@ class Conv2DTest(test.TestCase):
         padding="VALID",
         expected=expected_output)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2DEmptyDilation(self):
+    if test.is_gpu_available(cuda_only=True):
+      self._VerifyDilatedConvValues(
+          tensor_in_sizes=[0, 2, 3, 3],
+          filter_in_sizes=[1, 1, 3, 3],
+          strides=[1, 1],
+          dilations=[2, 1],
+          padding="VALID")
+
   @test_util.run_in_graph_and_eager_modes()
   def testConv2D2x2Filter(self):
     # The outputs are computed using third_party/py/IPython/notebook.
@@ -300,6 +403,16 @@ class Conv2DTest(test.TestCase):
         padding="VALID",
         expected=expected_output)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2FilterDilation(self):
+    if test.is_gpu_available(cuda_only=True):
+      self._VerifyDilatedConvValues(
+          tensor_in_sizes=[1, 2, 3, 3],
+          filter_in_sizes=[2, 2, 3, 3],
+          strides=[1, 1],
+          dilations=[1, 2],
+          padding="VALID")
+
   @test_util.run_in_graph_and_eager_modes()
   def testConv2D1x2Filter(self):
     # The outputs are computed using third_party/py/IPython/notebook.
@@ -314,6 +427,16 @@ class Conv2DTest(test.TestCase):
         padding="VALID",
         expected=expected_output)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D1x2FilterDilation(self):
+    if test.is_gpu_available(cuda_only=True):
+      self._VerifyDilatedConvValues(
+          tensor_in_sizes=[1, 2, 3, 3],
+          filter_in_sizes=[1, 2, 3, 3],
+          strides=[1, 1],
+          dilations=[2, 1],
+          padding="VALID")
+
   @test_util.run_in_graph_and_eager_modes()
   def testConv2D2x2FilterStride2(self):
     expected_output = [2271.0, 2367.0, 2463.0]
@@ -386,13 +509,23 @@ class Conv2DTest(test.TestCase):
         padding="VALID",
         expected=[50, 60])
 
-    # TODO this currently fails.
-    # self._VerifyValues(tensor_in_sizes=[1, 8, 8, 1],
-    #                   filter_in_sizes=[2, 2, 1, 1],
-    #                   strides=[4, 4], padding="SAME",
-    #                   expected=[72, 112, 392, 432])
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2DKernelSizeMatchesInputSizeDilation(self):
+    if test.is_gpu_available(cuda_only=True):
+      self._VerifyDilatedConvValues(
+          tensor_in_sizes=[1, 3, 3, 1],
+          filter_in_sizes=[2, 2, 1, 2],
+          strides=[1, 1],
+          dilations=[2, 2],
+          padding="VALID")
+
+  # TODO this currently fails.
+  # self._VerifyValues(tensor_in_sizes=[1, 8, 8, 1],
+  #                   filter_in_sizes=[2, 2, 1, 1],
+  #                   strides=[4, 4], padding="SAME",
+  #                   expected=[72, 112, 392, 432])
 
-    # Testing for backprops
+  # Testing for backprops
   def _RunAndVerifyBackpropInput(self, input_sizes, filter_sizes, output_sizes,
                                  strides, padding, expected, data_format,
                                  use_gpu, err):
@@ -724,6 +857,255 @@ class Conv2DTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  # Testing for backprops
+  def _RunAndVerifyBackpropInputDilation(self, input_sizes, filter_sizes,
+                                         output_sizes, strides, dilations,
+                                         padding, data_format, use_gpu, err):
+    total_input_size = 1
+    total_filter_size = 1
+    for s in input_sizes:
+      total_input_size *= s
+    for s in filter_sizes:
+      total_filter_size *= s
+    # Initializes the input tensor with array containing incrementing
+    # numbers from 1.
+    x1 = [f * 1.0 for f in range(1, total_input_size + 1)]
+    x2 = [f * 1.0 for f in range(1, total_filter_size + 1)]
+    default_dilations = (dilations[0] == 1 and dilations[1] == 1)
+    if default_dilations or use_gpu:
+      with self.test_session(use_gpu=use_gpu) as sess:
+        if data_format == "NCHW":
+          input_sizes = test_util.NHWCToNCHW(input_sizes)
+        t1 = constant_op.constant(x1, shape=input_sizes)
+        t2 = constant_op.constant(x2, shape=filter_sizes)
+        full_strides = [1] + strides + [1]
+        full_dilations = [1] + dilations + [1]
+        if data_format == "NCHW":
+          full_strides = test_util.NHWCToNCHW(full_strides)
+          full_dilations = test_util.NHWCToNCHW(full_dilations)
+        conv_forward = nn_ops.conv2d(
+            t1,
+            t2,
+            strides=full_strides,
+            dilations=full_dilations,
+            padding=padding,
+            data_format=data_format)
+        conv_forward_2 = nn_ops.convolution(
+            t1,
+            t2,
+            padding=padding,
+            strides=strides,
+            dilation_rate=dilations,
+            data_format=data_format)
+        if data_format == "NCHW":
+          conv_forward = test_util.NCHWToNHWC(conv_forward)
+          conv_forward_2 = test_util.NCHWToNHWC(conv_forward_2)
+        conv = gradients_impl.gradients(conv_forward, t1)[0]
+        conv_2 = gradients_impl.gradients(conv_forward_2, t1)[0]
+        # "values" consists of two tensors for two backprops
+        value = sess.run(conv)
+        value_2 = sess.run(conv_2)
+        self.assertShapeEqual(value, conv)
+        self.assertShapeEqual(value_2, conv_2)
+      print("expected = ", value_2)
+      print("actual = ", value)
+      self.assertArrayNear(value_2.flatten(), value.flatten(), err)
+
+  # Testing for backprops
+  def _RunAndVerifyBackpropFilterDilation(self, input_sizes, filter_sizes,
+                                          output_sizes, strides, dilations,
+                                          padding, data_format, use_gpu, err):
+    total_input_size = 1
+    total_filter_size = 1
+    for s in input_sizes:
+      total_input_size *= s
+    for s in filter_sizes:
+      total_filter_size *= s
+    # Initializes the input tensor with array containing incrementing
+    # numbers from 1.
+    x1 = [f * 1.0 for f in range(1, total_input_size + 1)]
+    x2 = [f * 1.0 for f in range(1, total_filter_size + 1)]
+    default_dilations = (dilations[0] == 1 and dilations[1] == 1)
+    if default_dilations or use_gpu:
+      with self.test_session(use_gpu=use_gpu) as sess:
+        if data_format == "NCHW":
+          input_sizes = test_util.NHWCToNCHW(input_sizes)
+        t1 = constant_op.constant(x1, shape=input_sizes)
+        t2 = constant_op.constant(x2, shape=filter_sizes)
+        full_strides = [1] + strides + [1]
+        full_dilations = [1] + dilations + [1]
+        if data_format == "NCHW":
+          full_strides = test_util.NHWCToNCHW(full_strides)
+          full_dilations = test_util.NHWCToNCHW(full_dilations)
+        conv_forward = nn_ops.conv2d(
+            t1,
+            t2,
+            strides=full_strides,
+            dilations=full_dilations,
+            padding=padding,
+            data_format=data_format)
+        conv_forward_2 = nn_ops.convolution(
+            t1,
+            t2,
+            padding=padding,
+            strides=strides,
+            dilation_rate=dilations,
+            data_format=data_format)
+        if data_format == "NCHW":
+          conv_forward = test_util.NCHWToNHWC(conv_forward)
+          conv_forward_2 = test_util.NCHWToNHWC(conv_forward_2)
+        conv = gradients_impl.gradients(conv_forward, t2)[0]
+        conv_2 = gradients_impl.gradients(conv_forward, t2)[0]
+        value = sess.run(conv)
+        value_2 = sess.run(conv_2)
+        self.assertShapeEqual(value, conv)
+        self.assertShapeEqual(value_2, conv_2)
+      print("expected = ", value_2)
+      print("actual = ", value)
+      self.assertArrayNear(value_2.flatten(), value.flatten(), err)
+
+  def testConv2D2x2Depth3ValidBackpropFilterStride1x1Dilation2x1(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropFilterDilation(
+            input_sizes=[1, 3, 6, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 1, 5, 1],
+            strides=[1, 1],
+            dilations=[2, 1],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
+  def testConv2D2x2Depth1ValidBackpropFilterDilation1x2(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropFilterDilation(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 1, 2, 1],
+            strides=[1, 1],
+            dilations=[1, 2],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
+  def testConv2DEmptyBackpropFilterDilation1x2(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropFilterDilation(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 0],
+            output_sizes=[1, 1, 2, 0],
+            strides=[1, 1],
+            dilations=[1, 2],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
+  def testConv2D2x2Depth3ValidBackpropFilterDilation2x2(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropFilterDilation(
+            input_sizes=[1, 3, 4, 3],
+            filter_sizes=[2, 2, 3, 3],
+            output_sizes=[1, 1, 2, 3],
+            strides=[1, 1],
+            dilations=[2, 2],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
+  def testConv2DKernelSizeMatchesInputSizeBackpropFilterDilation2x2(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropFilterDilation(
+            input_sizes=[1, 3, 3, 1],
+            filter_sizes=[2, 2, 1, 2],
+            output_sizes=[1, 1, 1, 2],
+            strides=[1, 1],
+            dilations=[2, 2],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
+  def testConv2D2x2Depth3ValidBackpropInputStride1x1Dilation2x1(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropInputDilation(
+            input_sizes=[1, 3, 6, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 1, 5, 1],
+            strides=[1, 1],
+            dilations=[2, 1],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
+  def testConv2D2x2Depth1ValidBackpropInputDilation1x2(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropInputDilation(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 1, 2, 1],
+            strides=[1, 1],
+            dilations=[1, 2],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
+  def testConv2DEmptyBackpropInputDilation1x2(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropInputDilation(
+            input_sizes=[0, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[0, 1, 2, 1],
+            strides=[1, 1],
+            dilations=[1, 2],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
+  def testConv2D2x2Depth3ValidBackpropInputDilation2x1(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        # The GPU version of this test is not very stable. So adjusting the
+        # error threshold to 1e-4.
+        self._RunAndVerifyBackpropInputDilation(
+            input_sizes=[1, 3, 2, 3],
+            filter_sizes=[2, 2, 3, 3],
+            output_sizes=[1, 1, 2, 3],
+            strides=[1, 1],
+            dilations=[2, 1],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-4)
+
+  def testConv2DKernelSizeMatchesInputSizeBackpropInputDilation2x2(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropInputDilation(
+            input_sizes=[1, 3, 3, 1],
+            filter_sizes=[2, 2, 1, 2],
+            output_sizes=[1, 1, 1, 2],
+            strides=[1, 1],
+            dilations=[2, 2],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
   # Gradient checkers
   def ConstructAndTestGradient(self, batch, input_rows, input_cols, filter_rows,
                                filter_cols, in_depth, out_depth, stride_rows,
@@ -1126,6 +1508,36 @@ class Conv2DTest(test.TestCase):
                 strides=[1, 1, 1, 1],
                 padding="VALID"))
 
+  def testCPUConv2DNCHWUnimplemented(self):
+    with self.test_session(use_gpu=False):
+      with self.assertRaisesRegexp(errors_impl.UnimplementedError,
+                                   "NHWC tensor format for now"):
+        conv = self._SetupValuesForDevice(
+            tensor_in_sizes=[1, 4, 4, 1],
+            filter_in_sizes=[2, 2, 1, 1],
+            dilations=[1, 1],
+            strides=[1, 1],
+            padding="VALID",
+            data_format="NCHW",
+            dtype=dtypes.float32,
+            use_gpu=False)
+        self.evaluate(conv)
+
+  def testCPUConv2DDilatedUnimplemented(self):
+    with self.test_session(use_gpu=False):
+      with self.assertRaisesRegexp(errors_impl.UnimplementedError,
+                                   "dilated rate of 1 for now"):
+        conv = self._SetupValuesForDevice(
+            tensor_in_sizes=[1, 4, 4, 1],
+            filter_in_sizes=[2, 2, 1, 1],
+            dilations=[2, 1],
+            strides=[1, 1],
+            padding="VALID",
+            data_format="NHWC",
+            dtype=dtypes.float32,
+            use_gpu=False)
+        self.evaluate(conv)
+
 
 class DepthwiseConv2DTest(test.TestCase):
 
@@ -1457,6 +1869,22 @@ def GetInceptionFwdTest(input_size, filter_size, stride, padding,
   return Test
 
 
+def GetInceptionFwdDilatedConvTest(input_size, filter_size, stride, padding):
+
+  def Test(self):
+    if test.is_gpu_available(cuda_only=True) and stride == 1:
+      tf_logging.info("Testing InceptionFwd with dilations %s",
+                      (input_size, filter_size, stride, padding))
+      self._VerifyDilatedConvValues(
+          tensor_in_sizes=input_size,
+          filter_in_sizes=filter_size,
+          strides=[stride, stride],
+          dilations=[2, 2],
+          padding=padding)
+
+  return Test
+
+
 def GetInceptionBackInputTest(input_size, filter_size, output_size, stride,
                               padding,
                               gpu_only=False):
@@ -1497,6 +1925,10 @@ if __name__ == "__main__":
             test_util.run_in_graph_and_eager_modes()(
                 GetInceptionFwdTest(input_size_, filter_size_, stride_,
                                     padding_)))
+    setattr(
+        Conv2DTest, "testInceptionFwdDilatedConv_" + str(index),
+        test_util.run_in_graph_and_eager_modes()(GetInceptionFwdDilatedConvTest(
+            input_size_, filter_size_, stride_, padding_)))
     setattr(Conv2DTest, "testInceptionBackInput_" + str(index),
             test_util.run_in_graph_and_eager_modes()(
                 GetInceptionBackInputTest(input_size_, filter_size_,
@@ -1519,6 +1951,9 @@ if __name__ == "__main__":
   setattr(Conv2DTest, "testInceptionFwd_No_Winograd_Nonfused",
           test_util.run_in_graph_and_eager_modes()(
               GetInceptionFwdTest(ishape, fshape, 1, "SAME", gpu_only=True)))
+  setattr(Conv2DTest, "testInceptionFwdDilatedConv_No_Winograd_Nonfused",
+          test_util.run_in_graph_and_eager_modes()(
+              GetInceptionFwdDilatedConvTest(ishape, fshape, 1, "SAME")))
   setattr(Conv2DTest, "testInceptionBackInput_No_Winograd_Nonfused",
           test_util.run_in_graph_and_eager_modes()(
               GetInceptionBackInputTest(ishape, fshape, oshape, 1, "SAME",
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index e0c53950e6ccb22f47a1c5a19a62b8373fbe4445..cea12ea8ecfa7a4f592454a96f7f3dc9dd3663ed 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -416,7 +416,7 @@ class UnaryOpTest(test.TestCase):
     self._compareCpu(x, np.square, math_ops.square)
     self._compareCpu(y, np.sqrt, math_ops.sqrt)
     self._compareCpu(y, self._rsqrt, math_ops.rsqrt)
-    self._compareCpu(x, np.exp, math_ops.exp)
+    self._compareBoth(x, np.exp, math_ops.exp)
     self._compareCpu(x, np.expm1, math_ops.expm1)
     self._compareCpu(y, np.log, math_ops.log)
     self._compareCpu(y, np.log1p, math_ops.log1p)
@@ -460,7 +460,7 @@ class UnaryOpTest(test.TestCase):
     self._compareCpu(x, np.square, math_ops.square)
     self._compareCpu(y, np.sqrt, math_ops.sqrt)
     self._compareCpu(y, self._rsqrt, math_ops.rsqrt)
-    self._compareCpu(x, np.exp, math_ops.exp)
+    self._compareBoth(x, np.exp, math_ops.exp)
     self._compareCpu(x, np.expm1, math_ops.expm1)
     self._compareCpu(y, np.log, math_ops.log)
     self._compareCpu(y, np.log1p, math_ops.log1p)
diff --git a/tensorflow/python/kernel_tests/decode_bmp_op_test.py b/tensorflow/python/kernel_tests/decode_bmp_op_test.py
index e7b472240e5729123a56eb4bf24c348d437ad3b3..c67c26b7be0777587eb6d7c49119ad6cd2e22953 100644
--- a/tensorflow/python/kernel_tests/decode_bmp_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_bmp_op_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import image_ops
 from tensorflow.python.platform import test
@@ -67,28 +68,68 @@ class DecodeBmpOpTest(test.TestCase):
   def testGrayscale(self):
     img_bytes = [[[255], [0]], [[255], [0]]]
     encoded_bytes = [
-        0x42, 0x40,
-        0x3d, 0, 0, 0,
-        0, 0,
-        0, 0,
-        0x36, 0, 0, 0,
-        0x28, 0, 0, 0,
-        0x2, 0, 0, 0,
-        0x2, 0, 0, 0,
-        0x1, 0,
-        0x8, 0,
-        0, 0, 0, 0,
-        0x10, 0, 0, 0,
-        0x13, 0xb, 0, 0,
-        0x13, 0xb, 0, 0,
-        0, 0, 0, 0,
-        0, 0, 0, 0,
+        0x42,
+        0x40,
+        0x3d,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0x36,
+        0,
+        0,
+        0,
+        0x28,
+        0,
+        0,
+        0,
+        0x2,
+        0,
+        0,
+        0,
+        0x2,
+        0,
+        0,
+        0,
+        0x1,
+        0,
+        0x8,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0x10,
+        0,
+        0,
+        0,
+        0x13,
+        0xb,
+        0,
+        0,
+        0x13,
+        0xb,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
         0xff,
         0,
-        0, 0,
+        0,
+        0,
         0xff,
         0,
-        0, 0,
+        0,
+        0,
     ]
 
     byte_string = bytes(bytearray(encoded_bytes))
@@ -99,5 +140,6 @@ class DecodeBmpOpTest(test.TestCase):
       decoded = decode.eval()
       self.assertAllEqual(decoded, img_bytes)
 
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/decode_compressed_op_test.py b/tensorflow/python/kernel_tests/decode_compressed_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0deb24136b276c7f80554f75ecc1e2c066f24458
--- /dev/null
+++ b/tensorflow/python/kernel_tests/decode_compressed_op_test.py
@@ -0,0 +1,75 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for DecodeRaw op from parsing_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import gzip
+import sys
+import zlib
+
+from six import BytesIO
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.platform import test
+
+
+class DecodeCompressedOpTest(test.TestCase):
+
+  def _compress(self, bytes, compression_type):
+    if compression_type == "":
+      return bytes
+    elif compression_type == "ZLIB":
+      return zlib.compress(bytes)
+    else:
+      out = BytesIO()
+      with gzip.GzipFile(fileobj=out, mode="wb") as f:
+        f.write(bytes)
+      return out.getvalue()
+
+  def testDecompress(self):
+    for compression_type in ["ZLIB", "GZIP", ""]:
+      with self.test_session():
+        in_bytes = array_ops.placeholder(dtypes.string, shape=[2])
+        decompressed = parsing_ops.decode_compressed(
+            in_bytes, compression_type=compression_type)
+        self.assertEqual([2], decompressed.get_shape().as_list())
+
+        result = decompressed.eval(
+            feed_dict={in_bytes: [self._compress(b"AaAA", compression_type),
+                                  self._compress(b"bBbb", compression_type)]})
+        self.assertAllEqual([b"AaAA", b"bBbb"], result)
+
+  def testDecompressWithRaw(self):
+    for compression_type in ["ZLIB", "GZIP", ""]:
+      with self.test_session():
+        in_bytes = array_ops.placeholder(dtypes.string, shape=[None])
+        decompressed = parsing_ops.decode_compressed(
+            in_bytes, compression_type=compression_type)
+        decode = parsing_ops.decode_raw(decompressed, out_type=dtypes.int16)
+
+        result = decode.eval(
+            feed_dict={in_bytes: [self._compress(b"AaBC", compression_type)]})
+        self.assertAllEqual(
+            [[ord("A") + ord("a") * 256, ord("B") + ord("C") * 256]], result)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py b/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
index b1d8da771612fe42a153a1a11b6cb26bdcb983a0..d0fa1fe98996fd234f457bd0199fad5efc2547dc 100644
--- a/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
+++ b/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
@@ -59,13 +59,21 @@ class KLTest(test.TestCase):
     # pylint: disable=unused-argument,unused-variable
 
     with self.test_session():
-      a = MyDistException(loc=0.0, scale=1.0)
+      a = MyDistException(loc=0.0, scale=1.0, allow_nan_stats=False)
       kl = kullback_leibler.kl_divergence(a, a, allow_nan_stats=False)
       with self.assertRaisesOpError(
           "KL calculation between .* and .* returned NaN values"):
         kl.eval()
+      with self.assertRaisesOpError(
+          "KL calculation between .* and .* returned NaN values"):
+        a.kl_divergence(a).eval()
+      a = MyDistException(loc=0.0, scale=1.0, allow_nan_stats=True)
       kl_ok = kullback_leibler.kl_divergence(a, a)
       self.assertAllEqual([float("nan")], kl_ok.eval())
+      self_kl_ok = a.kl_divergence(a)
+      self.assertAllEqual([float("nan")], self_kl_ok.eval())
+      cross_ok = a.cross_entropy(a)
+      self.assertAllEqual([float("nan")], cross_ok.eval())
 
   def testRegistrationFailures(self):
 
@@ -86,16 +94,22 @@ class KLTest(test.TestCase):
     for (k, v) in _DIVERGENCES.items():
       self.assertEqual(v, _registered_kl(*k))
 
-  def testIndirectRegistration(self):
+  def _testIndirectRegistration(self, fn):
 
     class Sub1(normal.Normal):
-      pass
+
+      def entropy(self):
+        return ""
 
     class Sub2(normal.Normal):
-      pass
+
+      def entropy(self):
+        return ""
 
     class Sub11(Sub1):
-      pass
+
+      def entropy(self):
+        return ""
 
     # pylint: disable=unused-argument,unused-variable
     @kullback_leibler.RegisterKL(Sub1, Sub1)
@@ -116,16 +130,30 @@ class KLTest(test.TestCase):
     sub2 = Sub2(loc=0.0, scale=1.0)
     sub11 = Sub11(loc=0.0, scale=1.0)
 
-    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub1, sub1))
-    self.assertEqual("sub1-2", kullback_leibler.kl_divergence(sub1, sub2))
-    self.assertEqual("sub2-1", kullback_leibler.kl_divergence(sub2, sub1))
-    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub11, sub11))
-    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub11, sub1))
-    self.assertEqual("sub1-2", kullback_leibler.kl_divergence(sub11, sub2))
-    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub11, sub1))
-    self.assertEqual("sub1-2", kullback_leibler.kl_divergence(sub11, sub2))
-    self.assertEqual("sub2-1", kullback_leibler.kl_divergence(sub2, sub11))
-    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub1, sub11))
+    self.assertEqual("sub1-1", fn(sub1, sub1))
+    self.assertEqual("sub1-2", fn(sub1, sub2))
+    self.assertEqual("sub2-1", fn(sub2, sub1))
+    self.assertEqual("sub1-1", fn(sub11, sub11))
+    self.assertEqual("sub1-1", fn(sub11, sub1))
+    self.assertEqual("sub1-2", fn(sub11, sub2))
+    self.assertEqual("sub1-1", fn(sub11, sub1))
+    self.assertEqual("sub1-2", fn(sub11, sub2))
+    self.assertEqual("sub2-1", fn(sub2, sub11))
+    self.assertEqual("sub1-1", fn(sub1, sub11))
+
+  def testIndirectRegistrationKLFun(self):
+    self._testIndirectRegistration(kullback_leibler.kl_divergence)
+
+  def testIndirectRegistrationKLSelf(self):
+    self._testIndirectRegistration(
+        lambda p, q: p.kl_divergence(q))
+
+  def testIndirectRegistrationCrossEntropy(self):
+    self._testIndirectRegistration(
+        lambda p, q: p.cross_entropy(q))
+
+  def testFunctionCrossEntropy(self):
+    self._testIndirectRegistration(kullback_leibler.cross_entropy)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/distributions/special_math_test.py b/tensorflow/python/kernel_tests/distributions/special_math_test.py
index 9441cdbe39ed2fe83a34b4c3d6f9f2e92006365d..2d434a39c2933832daebbe8f710de9553d0bf38d 100644
--- a/tensorflow/python/kernel_tests/distributions/special_math_test.py
+++ b/tensorflow/python/kernel_tests/distributions/special_math_test.py
@@ -332,6 +332,32 @@ class LogNdtrGradientTest(NdtrGradientTest):
   _use_log = True
 
 
+class ErfInvTest(test.TestCase):
+
+  def testErfInvValues(self):
+    with self.test_session():
+      if not special:
+        return
+
+      x = np.linspace(0., 1.0, 50).astype(np.float64)
+
+      expected_x = special.erfinv(x)
+      x = special_math.erfinv(x)
+      self.assertAllClose(expected_x, x.eval(), atol=0.)
+
+  def testErfInvIntegerInput(self):
+    with self.test_session():
+
+      with self.assertRaises(TypeError):
+        x = np.array([1, 2, 3]).astype(np.int32)
+        special_math.erfinv(x)
+
+      with self.assertRaises(TypeError):
+        x = np.array([1, 2, 3]).astype(np.int64)
+        special_math.erfinv(x)
+
+
+
 class LogCDFLaplaceTest(test.TestCase):
   # Note that scipy.stats.laplace does not have a stable Log CDF, so we cannot
   # rely on scipy to cross check the extreme values.
diff --git a/tensorflow/python/kernel_tests/distributions/util_test.py b/tensorflow/python/kernel_tests/distributions/util_test.py
index 8fd26a1c9afe0ab701db199147e2de7c3ded3211..00781d01505699cccb83d7e50efdc6298553575b 100644
--- a/tensorflow/python/kernel_tests/distributions/util_test.py
+++ b/tensorflow/python/kernel_tests/distributions/util_test.py
@@ -557,6 +557,124 @@ class PickVectorTest(test.TestCase):
                               constant_op.constant(False), x, y))  # No eval.
 
 
+class PreferStaticRankTest(test.TestCase):
+
+  def testNonEmptyConstantTensor(self):
+    x = array_ops.zeros((2, 3, 4))
+    rank = du.prefer_static_rank(x)
+    self.assertIsInstance(rank, np.ndarray)
+    self.assertEqual(3, rank)
+
+  def testEmptyConstantTensor(self):
+    x = constant_op.constant([])
+    rank = du.prefer_static_rank(x)
+    self.assertIsInstance(rank, np.ndarray)
+    self.assertEqual(1, rank)
+
+  def testScalarTensor(self):
+    x = constant_op.constant(1.)
+    rank = du.prefer_static_rank(x)
+    self.assertIsInstance(rank, np.ndarray)
+    self.assertEqual(0, rank)
+
+  def testDynamicRankEndsUpBeingNonEmpty(self):
+    x = array_ops.placeholder(np.float64, shape=None)
+    rank = du.prefer_static_rank(x)
+    with self.test_session():
+      self.assertAllEqual(2, rank.eval(feed_dict={x: np.zeros((2, 3))}))
+
+  def testDynamicRankEndsUpBeingEmpty(self):
+    x = array_ops.placeholder(np.int32, shape=None)
+    rank = du.prefer_static_rank(x)
+    with self.test_session():
+      self.assertAllEqual(1, rank.eval(feed_dict={x: []}))
+
+  def testDynamicRankEndsUpBeingScalar(self):
+    x = array_ops.placeholder(np.int32, shape=None)
+    rank = du.prefer_static_rank(x)
+    with self.test_session():
+      self.assertAllEqual(0, rank.eval(feed_dict={x: 1}))
+
+
+class PreferStaticShapeTest(test.TestCase):
+
+  def testNonEmptyConstantTensor(self):
+    x = array_ops.zeros((2, 3, 4))
+    shape = du.prefer_static_shape(x)
+    self.assertIsInstance(shape, np.ndarray)
+    self.assertAllEqual(np.array([2, 3, 4]), shape)
+
+  def testEmptyConstantTensor(self):
+    x = constant_op.constant([])
+    shape = du.prefer_static_shape(x)
+    self.assertIsInstance(shape, np.ndarray)
+    self.assertAllEqual(np.array([0]), shape)
+
+  def testScalarTensor(self):
+    x = constant_op.constant(1.)
+    shape = du.prefer_static_shape(x)
+    self.assertIsInstance(shape, np.ndarray)
+    self.assertAllEqual(np.array([]), shape)
+
+  def testDynamicShapeEndsUpBeingNonEmpty(self):
+    x = array_ops.placeholder(np.float64, shape=None)
+    shape = du.prefer_static_shape(x)
+    with self.test_session():
+      self.assertAllEqual((2, 3), shape.eval(feed_dict={x: np.zeros((2, 3))}))
+
+  def testDynamicShapeEndsUpBeingEmpty(self):
+    x = array_ops.placeholder(np.int32, shape=None)
+    shape = du.prefer_static_shape(x)
+    with self.test_session():
+      self.assertAllEqual(np.array([0]), shape.eval(feed_dict={x: []}))
+
+  def testDynamicShapeEndsUpBeingScalar(self):
+    x = array_ops.placeholder(np.int32, shape=None)
+    shape = du.prefer_static_shape(x)
+    with self.test_session():
+      self.assertAllEqual(np.array([]), shape.eval(feed_dict={x: 1}))
+
+
+class PreferStaticValueTest(test.TestCase):
+
+  def testNonEmptyConstantTensor(self):
+    x = array_ops.zeros((2, 3, 4))
+    value = du.prefer_static_value(x)
+    self.assertIsInstance(value, np.ndarray)
+    self.assertAllEqual(np.zeros((2, 3, 4)), value)
+
+  def testEmptyConstantTensor(self):
+    x = constant_op.constant([])
+    value = du.prefer_static_value(x)
+    self.assertIsInstance(value, np.ndarray)
+    self.assertAllEqual(np.array([]), value)
+
+  def testScalarTensor(self):
+    x = constant_op.constant(1.)
+    value = du.prefer_static_value(x)
+    self.assertIsInstance(value, np.ndarray)
+    self.assertAllEqual(np.array(1.), value)
+
+  def testDynamicValueEndsUpBeingNonEmpty(self):
+    x = array_ops.placeholder(np.float64, shape=None)
+    value = du.prefer_static_value(x)
+    with self.test_session():
+      self.assertAllEqual(np.zeros((2, 3)),
+                          value.eval(feed_dict={x: np.zeros((2, 3))}))
+
+  def testDynamicValueEndsUpBeingEmpty(self):
+    x = array_ops.placeholder(np.int32, shape=None)
+    value = du.prefer_static_value(x)
+    with self.test_session():
+      self.assertAllEqual(np.array([]), value.eval(feed_dict={x: []}))
+
+  def testDynamicValueEndsUpBeingScalar(self):
+    x = array_ops.placeholder(np.int32, shape=None)
+    value = du.prefer_static_value(x)
+    with self.test_session():
+      self.assertAllEqual(np.array(1), value.eval(feed_dict={x: 1}))
+
+
 class FillTriangularTest(test.TestCase):
 
   def setUp(self):
@@ -587,7 +705,7 @@ class FillTriangularTest(test.TestCase):
     x_ = np.asarray(x_)
     with self.test_session() as sess:
       static_shape = None if use_deferred_shape else x_.shape
-      x_pl = array_ops.placeholder(dtype=x_.dtype, shape=static_shape)
+      x_pl = array_ops.placeholder_with_default(x_, shape=static_shape)
       # Add `zeros_like(x)` such that x's value and gradient are identical. We
       # do this so we can ensure each gradient value is mapped to the right
       # gradient location.  (Not doing this means the gradient wrt `x` is simple
diff --git a/tensorflow/python/kernel_tests/dynamic_partition_op_test.py b/tensorflow/python/kernel_tests/dynamic_partition_op_test.py
index 2460950aa925d9ac19daededbbaf19c9c4a9245c..fedbf9e696923a34968e7a907e4099c520d1447b 100644
--- a/tensorflow/python/kernel_tests/dynamic_partition_op_test.py
+++ b/tensorflow/python/kernel_tests/dynamic_partition_op_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import unittest
+
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
@@ -40,6 +42,7 @@ class DynamicPartitionTest(test.TestCase):
           data, indices, num_partitions=4)
       partition_vals = sess.run(partitions)
 
+    self.assertEqual(4, len(partition_vals))
     self.assertAllEqual([0, 13], partition_vals[0])
     self.assertAllEqual([17], partition_vals[1])
     self.assertAllEqual([2, 4], partition_vals[2])
@@ -61,6 +64,7 @@ class DynamicPartitionTest(test.TestCase):
           data, indices, num_partitions=4)
       partition_vals = sess.run(partitions)
 
+    self.assertEqual(4, len(partition_vals))
     self.assertAllEqual([[0, 1, 2], [3, 4, 5]], partition_vals[0])
     self.assertAllEqual([[15, 16, 17]], partition_vals[1])
     self.assertAllEqual([[6, 7, 8], [12, 13, 14]], partition_vals[2])
@@ -85,6 +89,7 @@ class DynamicPartitionTest(test.TestCase):
           data, indices, num_partitions=2)
       partition_vals = sess.run(partitions)
 
+    self.assertEqual(2, len(partition_vals))
     self.assertAllEqual(part1, partition_vals[0])
     self.assertAllEqual(part2, partition_vals[1])
 
@@ -106,6 +111,7 @@ class DynamicPartitionTest(test.TestCase):
           data, indices, num_partitions=num_partitions)
       partition_vals = sess.run(partitions)
 
+    self.assertEqual(num_partitions, len(partition_vals))
     for i in range(num_partitions):
       # reshape because of empty parts
       parts_np = np.array(parts[i], dtype=np.float).reshape(-1, cols)
@@ -121,9 +127,30 @@ class DynamicPartitionTest(test.TestCase):
           data, indices, num_partitions=2)
       partition_vals = sess.run(partitions)
 
+    self.assertEqual(2, len(partition_vals))
     self.assertAllEqual([3 + 4j, 7 + 8j], partition_vals[0])
     self.assertAllEqual([1 + 2j, 5 + 6j], partition_vals[1])
 
+  def testScalarPartitions(self):
+    data_list = [10, 13, 12, 11]
+    with self.test_session(use_gpu=True) as sess:
+      data = constant_op.constant(data_list, dtype=dtypes.float64)
+      indices = 3
+      partitions = data_flow_ops.dynamic_partition(
+          data, indices, num_partitions=4)
+      partition_vals = sess.run(partitions)
+
+    self.assertEqual(4, len(partition_vals))
+    self.assertAllEqual(np.array([], dtype=np.float64).reshape(-1, 4),
+                        partition_vals[0])
+    self.assertAllEqual(np.array([], dtype=np.float64).reshape(-1, 4),
+                        partition_vals[1])
+    self.assertAllEqual(np.array([], dtype=np.float64).reshape(-1, 4),
+                        partition_vals[2])
+    self.assertAllEqual(np.array([10, 13, 12, 11],
+                                 dtype=np.float64).reshape(-1, 4),
+                        partition_vals[3])
+
   def testHigherRank(self):
     np.random.seed(7)
     with self.test_session(use_gpu=True) as sess:
@@ -158,6 +185,7 @@ class DynamicPartitionTest(test.TestCase):
           data, indices, num_partitions=4)
       partition_vals = sess.run(partitions)
 
+    self.assertEqual(4, len(partition_vals))
     self.assertAllEqual([], partition_vals[0])
     self.assertAllEqual([1, 3], partition_vals[1])
     self.assertAllEqual([], partition_vals[2])
@@ -173,6 +201,7 @@ class DynamicPartitionTest(test.TestCase):
           data, indices, num_partitions=3)
       partition_vals = sess.run(partitions)
 
+    self.assertEqual(3, len(partition_vals))
     self.assertAllEqual([[]], partition_vals[0])
     self.assertAllEqual([[]], partition_vals[1])
     self.assertAllEqual(np.array([], dtype=np.float).reshape(0, 0),
@@ -188,9 +217,76 @@ class DynamicPartitionTest(test.TestCase):
           data, indices, num_partitions=2)
       partition_vals = sess.run(partitions)
 
+    self.assertEqual(2, len(partition_vals))
     self.assertAllEqual([], partition_vals[0])
     self.assertAllEqual([], partition_vals[1])
 
+  @unittest.skip("Fails on windows.")
+  def testGPUTooManyParts(self):
+    # This test only makes sense on the GPU. There we do not check
+    # for errors. In this case, we should discard all but the first
+    # num_partitions indices.
+    if not test.is_gpu_available():
+      return
+
+    data_list = [1, 2, 3, 4, 5, 6]
+    indices_list = [6, 5, 4, 3, 1, 0]
+    with self.test_session(use_gpu=True) as sess:
+      data = constant_op.constant(data_list, dtype=dtypes.float32)
+      indices = constant_op.constant(indices_list, dtype=dtypes.int32)
+      partitions = data_flow_ops.dynamic_partition(
+          data, indices, num_partitions=2)
+      partition_vals = sess.run(partitions)
+
+    self.assertEqual(2, len(partition_vals))
+    self.assertAllEqual([6], partition_vals[0])
+    self.assertAllEqual([5], partition_vals[1])
+
+  @unittest.skip("Fails on windows.")
+  def testGPUPartsTooLarge(self):
+    # This test only makes sense on the GPU. There we do not check
+    # for errors. In this case, we should discard all the values
+    # larger than num_partitions.
+    if not test.is_gpu_available():
+      return
+
+    data_list = [1, 2, 3, 4, 5, 6]
+    indices_list = [10, 11, 2, 12, 0, 1000]
+    with self.test_session(use_gpu=True) as sess:
+      data = constant_op.constant(data_list, dtype=dtypes.float32)
+      indices = constant_op.constant(indices_list, dtype=dtypes.int32)
+      partitions = data_flow_ops.dynamic_partition(
+          data, indices, num_partitions=5)
+      partition_vals = sess.run(partitions)
+
+    self.assertEqual(5, len(partition_vals))
+    self.assertAllEqual([5], partition_vals[0])
+    self.assertAllEqual([], partition_vals[1])
+    self.assertAllEqual([3], partition_vals[2])
+    self.assertAllEqual([], partition_vals[3])
+    self.assertAllEqual([], partition_vals[4])
+
+  @unittest.skip("Fails on windows.")
+  def testGPUAllIndicesBig(self):
+    # This test only makes sense on the GPU. There we do not check
+    # for errors. In this case, we should discard all the values
+    # and have an empty output.
+    if not test.is_gpu_available():
+      return
+
+    data_list = [1.1, 2.1, 3.1, 4.1, 5.1, 6.1]
+    indices_list = [90, 70, 60, 100, 110, 40]
+    with self.test_session(use_gpu=True) as sess:
+      data = constant_op.constant(data_list, dtype=dtypes.float32)
+      indices = constant_op.constant(indices_list, dtype=dtypes.int32)
+      partitions = data_flow_ops.dynamic_partition(
+          data, indices, num_partitions=40)
+      partition_vals = sess.run(partitions)
+
+    self.assertEqual(40, len(partition_vals))
+    for i in range(40):
+      self.assertAllEqual([], partition_vals[i])
+
   def testErrorIndexOutOfRange(self):
     with self.test_session() as sess:
       data = constant_op.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11],
diff --git a/tensorflow/python/kernel_tests/gather_nd_op_test.py b/tensorflow/python/kernel_tests/gather_nd_op_test.py
index 5109ed98c92002917a5dfa3b4cd79953fd950af8..91ebe8de9921268b2a3c5ad645585e1fe83c7419 100644
--- a/tensorflow/python/kernel_tests/gather_nd_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_nd_op_test.py
@@ -255,6 +255,35 @@ class GatherNdTest(test.TestCase):
     with self.test_session(use_gpu=True):
       self.assertAllEqual(expected_grads, grads.eval())
 
+  def testGradientsRank7Elements(self):
+    # Shape [1,1,2,1,1,2,2]
+    indices = constant_op.constant(
+        [[[
+            [[[[0, 0, 0, 0, 0, 1], [0, 0, 1, 0, 0, 0]]]],
+            [[[[0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 1]]]]
+        ]]],
+        dtype=dtypes.int32)
+    inputs = constant_op.constant(
+        [[[
+            [[[[1, 3], [5, 7]]]],
+            [[[[2, 4], [6, 8]]]]
+        ]]], dtype=dtypes.float64)
+    outputs = array_ops.gather_nd(inputs, indices)
+
+    grad_vals = constant_op.constant(
+        [[[
+            [[[[1, 2], [3, 4]]]],
+            [[[[5, 6], [7, 8]]]]
+        ]]], dtype=dtypes.float64)
+    grads = gradients_impl.gradients([outputs], [inputs], [grad_vals])[0]
+    expected_grads = np.array(
+        [[[
+            [[[[5, 6], [1, 2]]]],
+            [[[[3, 4], [7, 8]]]]
+        ]]], dtype=np.float64)
+    with self.test_session(use_gpu=True):
+      self.assertAllEqual(expected_grads, grads.eval())
+
   def testGradientsInt64Indices(self):
     indices = constant_op.constant(
         [[[0, 1], [1, 0]], [[0, 0], [1, 1]]], dtype=dtypes.int64)
diff --git a/tensorflow/python/kernel_tests/lookup_ops_test.py b/tensorflow/python/kernel_tests/lookup_ops_test.py
index d4bc71f1c8ea040b19eeb2008d3c0665759c2679..ee7aeee687f7ddaca486702f2d045d73497b3e49 100644
--- a/tensorflow/python/kernel_tests/lookup_ops_test.py
+++ b/tensorflow/python/kernel_tests/lookup_ops_test.py
@@ -488,6 +488,20 @@ class IndexTableFromFile(test.TestCase):
       self.assertRaises(ValueError, table.lookup,
                         constant_op.constant(["salad", "surgery", "tarkus"]))
 
+  def test_index_table_from_file_table_ref_with_oov_buckets(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab9.txt")
+    with self.test_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file, num_oov_buckets=1)
+      self.assertIsNotNone(table.table_ref)
+
+  def test_index_table_from_file_table_ref_without_oov_buckets(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab10.txt")
+    with self.test_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file, num_oov_buckets=0)
+      self.assertIsNotNone(table.table_ref)
+
 
 class KeyValueTensorInitializerTest(test.TestCase):
 
@@ -604,15 +618,19 @@ class IndexToStringTableFromFileTest(test.TestCase):
     return vocabulary_file
 
   def test_index_to_string_table(self):
-    vocabulary_file = self._createVocabFile("i2f_vocab1.txt")
-    with self.test_session():
-      table = lookup_ops.index_to_string_table_from_file(
-          vocabulary_file=vocabulary_file)
-      features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64))
-      self.assertRaises(errors_impl.OpError, features.eval)
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
-                          features.eval())
+    vocabulary_path = self._createVocabFile("i2f_vocab1.txt")
+    # vocabulary_file supports string and tensor
+    type_funcs = [str, constant_op.constant]
+    for type_func in type_funcs:
+      vocabulary_file = type_func(vocabulary_path)
+      with self.test_session():
+        table = lookup_ops.index_to_string_table_from_file(
+            vocabulary_file=vocabulary_file)
+        features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64))
+        self.assertRaises(errors_impl.OpError, features.eval)
+        lookup_ops.tables_initializer().run()
+        self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
+                            features.eval())
 
   def test_index_to_string_table_from_multicolumn_file(self):
     vocabulary_file = self._createVocabFile(
@@ -1431,6 +1449,10 @@ class IdTableWithHashBucketsTest(test.TestCase):
             oov_buckets,
             hasher_spec=lookup_ops.StrongHashSpec([None, 2]))
 
+  def testIdTableWithHashBucketsNoInnerTable(self):
+    with self.test_session():
+      table = lookup_ops.IdTableWithHashBuckets(None, num_oov_buckets=1)
+      self.assertIsNone(table.table_ref)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/partitioned_variables_test.py b/tensorflow/python/kernel_tests/partitioned_variables_test.py
index d40517510046959e353cad4df0c6ddbed0db90aa..56a07cb012f08dec750c5ee18cc73b3b127ef5dd 100644
--- a/tensorflow/python/kernel_tests/partitioned_variables_test.py
+++ b/tensorflow/python/kernel_tests/partitioned_variables_test.py
@@ -46,6 +46,15 @@ class PartitionerCreatorsTest(test.TestCase):
         self.assertEqual(len(v0_list), 5)
         self.assertAllEqual(v0_part, (5, 1))
 
+  def testFixedSizePartitionerInt64(self):
+    with self.test_session():
+      partitioner = partitioned_variables.fixed_size_partitioner(4, axis=0)
+      with variable_scope.variable_scope("root", partitioner=partitioner):
+        v0 = variable_scope.get_variable(
+            "v0", dtype=dtypes.int64, shape=[20])
+        v0_list = v0._get_variable_list()
+        self.assertEqual(len(v0_list), 4)
+
   def testResourceFixedSizePartitioner(self):
     with self.test_session():
       partitioner = partitioned_variables.fixed_size_partitioner(5, axis=0)
diff --git a/tensorflow/python/kernel_tests/pool_test.py b/tensorflow/python/kernel_tests/pool_test.py
index 563815b7d841d2b2d459befd21f55833a000e94c..63848976336f5487cf2a44f7cf62ea316c40d7c8 100644
--- a/tensorflow/python/kernel_tests/pool_test.py
+++ b/tensorflow/python/kernel_tests/pool_test.py
@@ -154,7 +154,7 @@ class PoolingTest(test.TestCase):
     self.assertAllClose(y1, y2.eval(), rtol=1e-2, atol=1e-2)
 
   def testPoolSimple(self):
-    with self.test_session():
+    with self.test_session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
         for pooling_type in ["MAX", "AVG"]:
           self._test(
@@ -166,7 +166,7 @@ class PoolingTest(test.TestCase):
               strides=[1, 2])
 
   def testPool1D(self):
-    with self.test_session():
+    with self.test_session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
         for pooling_type in ["MAX", "AVG"]:
           for input_shape in [[2, 9, 2], [2, 10, 2]]:
@@ -192,7 +192,7 @@ class PoolingTest(test.TestCase):
                     strides=strides)
 
   def testPool2D(self):
-    with self.test_session():
+    with self.test_session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
         for pooling_type in ["MAX", "AVG"]:
           for input_shape in [[2, 9, 10, 2], [2, 10, 9, 2]]:
@@ -218,7 +218,7 @@ class PoolingTest(test.TestCase):
                     strides=strides)
 
   def testPool3D(self):
-    with self.test_session():
+    with self.test_session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
         for pooling_type in ["MAX", "AVG"]:
           for input_shape in [[2, 9, 10, 11, 2], [2, 10, 9, 11, 2]]:
@@ -302,7 +302,7 @@ class PoolingTest(test.TestCase):
     self.assertLess(err, err_tolerance)
 
   def testGradient1D(self):
-    with self.test_session():
+    with self.test_session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
         for pooling_type in ["AVG", "MAX"]:
           for input_shape in [[2, 5, 2], [1, 4, 1]]:
@@ -328,7 +328,7 @@ class PoolingTest(test.TestCase):
                     strides=strides)
 
   def testGradient2D(self):
-    with self.test_session():
+    with self.test_session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
         for pooling_type in ["AVG", "MAX"]:
           for input_shape in [[2, 4, 5, 2], [1, 5, 4, 1]]:
@@ -354,7 +354,7 @@ class PoolingTest(test.TestCase):
                     strides=strides)
 
   def testGradient3D(self):
-    with self.test_session():
+    with self.test_session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
         for pooling_type in ["AVG", "MAX"]:
           for input_shape in [[1, 3, 5, 4, 1], [1, 5, 4, 3, 1]]:
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index 988a72603f09ac427d997a297023d53b2410a1ac..6be8997cabdb4cba87f90378c405a63aa6f78ea3 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
 import os
+import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -1442,7 +1442,6 @@ class PoolingTest(test.TestCase):
           use_gpu=True,
           v2=v2)
 
-
     # Propagate the diff in cases of NaNs
     os.environ["TF_ENABLE_MAXPOOL_NANPROP"] = "1"
     expected_input_backprop_cudnn = expected_input_backprop_tf_cpu
@@ -1779,7 +1778,7 @@ class PoolingTest(test.TestCase):
             padding="SAME")
 
   def testOpEdgeCases(self):
-    with self.test_session() as sess:
+    with self.test_session(use_gpu=test.is_gpu_available()) as sess:
       pool_funcs = [nn_ops.max_pool, nn_ops.avg_pool]
       if test.is_gpu_available():
         pool_funcs.append(nn_ops.max_pool_with_argmax)
diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index 7ed99c1be9b62a145b9584fd6412f1074f501ae8..92fb68820e04c3db1385296d91d956134b8ff2d4 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -23,82 +23,93 @@ from six.moves import queue
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
 
 
-class PyOpTest(test.TestCase):
+def np_func(x, y):
+  return np.sinh(x) + np.cosh(y)
 
-  def testBasic(self):
 
-    def my_func(x, y):
-      return np.sinh(x) + np.cosh(y)
+def matmul(x, y):
+  return math_ops.matmul(x, y)
 
-    # single type
+
+class PyFuncTest(test.TestCase):
+  """Encapsulates tests for py_func and eager_py_func."""
+
+  # ----- Tests for py_func -----
+  def testSingleType(self):
     with self.test_session():
       x = constant_op.constant(1.0, dtypes.float32)
       y = constant_op.constant(2.0, dtypes.float32)
-      z = script_ops.py_func(my_func, [x, y], dtypes.float32)
-      self.assertEqual(z.eval(), my_func(1.0, 2.0).astype(np.float32))
+      z = self.evaluate(script_ops.py_func(np_func, [x, y], dtypes.float32))
+      self.assertEqual(z, np_func(1.0, 2.0).astype(np.float32))
 
-    # scalar
+  def testScalar(self):
     with self.test_session():
       x = constant_op.constant(1.0, dtypes.float32)
       y = constant_op.constant(2.0, dtypes.float32)
-      z = script_ops.py_func(my_func, [x, y], [dtypes.float32])
-      self.assertEqual(z[0].eval(), my_func(1.0, 2.0).astype(np.float32))
+      z = self.evaluate(
+          script_ops.eager_py_func(np_func, [x, y], [dtypes.float32]))
+      self.assertEqual(z[0], np_func(1.0, 2.0).astype(np.float32))
 
-    # array
+  def testArray(self):
     with self.test_session():
       x = constant_op.constant([1.0, 2.0], dtypes.float64)
       y = constant_op.constant([2.0, 3.0], dtypes.float64)
-      z = script_ops.py_func(my_func, [x, y], [dtypes.float64])
-      self.assertAllEqual(z[0].eval(),
-                          my_func([1.0, 2.0], [2.0, 3.0]).astype(np.float64))
+      z = self.evaluate(script_ops.py_func(np_func, [x, y], [dtypes.float64]))
+      self.assertAllEqual(z[0],
+                          np_func([1.0, 2.0], [2.0, 3.0]).astype(np.float64))
 
-    # a bit exotic type (complex64)
+  def testComplexType(self):
     with self.test_session():
       x = constant_op.constant(1 + 2j, dtypes.complex64)
       y = constant_op.constant(3 + 4j, dtypes.complex64)
-      z, = script_ops.py_func(my_func, [x, y], [dtypes.complex64])
-      self.assertAllClose(z.eval(), my_func(1 + 2j, 3 + 4j))
+      z = self.evaluate(script_ops.py_func(np_func, [x, y], dtypes.complex64))
+      self.assertAllClose(z, np_func(1 + 2j, 3 + 4j))
 
-    # a bit excotic function (rfft)
+  def testRFFT(self):
     with self.test_session():
       x = constant_op.constant([1., 2., 3., 4.], dtypes.float32)
 
       def rfft(x):
         return np.fft.rfft(x).astype(np.complex64)
 
-      y, = script_ops.py_func(rfft, [x], [dtypes.complex64])
-      self.assertAllClose(y.eval(), np.fft.rfft([1., 2., 3., 4.]))
+      y = self.evaluate(script_ops.py_func(rfft, [x], dtypes.complex64))
+      self.assertAllClose(y, np.fft.rfft([1., 2., 3., 4.]))
 
-    # returns a python literal.
+  def testPythonLiteral(self):
     with self.test_session():
 
       def literal(x):
-        return 1.0 if x == 0.0 else 0.0
+        return 1.0 if float(x) == 0.0 else 0.0
 
       x = constant_op.constant(0.0, dtypes.float64)
-      y, = script_ops.py_func(literal, [x], [dtypes.float64])
-      self.assertAllClose(y.eval(), 1.0)
+      y = self.evaluate(script_ops.py_func(literal, [x], dtypes.float64))
+      self.assertAllClose(y, 1.0)
 
-    # returns a list
+  def testList(self):
     with self.test_session():
 
       def list_func(x):
         return [x, x + 1]
 
       x = constant_op.constant(0.0, dtypes.float64)
-      y, z = script_ops.py_func(list_func, [x], [dtypes.float64] * 2)
-      self.assertAllClose(y.eval(), 0.0)
-      self.assertAllClose(z.eval(), 1.0)
+      y = self.evaluate(
+          script_ops.py_func(list_func, [x], [dtypes.float64] * 2))
+      self.assertAllClose(y, [0.0, 1.0])
 
+  def testTuple(self):
     # returns a tuple
     with self.test_session():
 
@@ -106,17 +117,17 @@ class PyOpTest(test.TestCase):
         return x, x + 1
 
       x = constant_op.constant(0.0, dtypes.float64)
-      y, z = script_ops.py_func(tuple_func, [x], [dtypes.float64] * 2)
-      self.assertAllClose(y.eval(), 0.0)
-      self.assertAllClose(z.eval(), 1.0)
+      y = self.evaluate(
+          script_ops.py_func(tuple_func, [x], [dtypes.float64] * 2))
+      self.assertAllClose(y, [0.0, 1.0])
 
     # returns a tuple, Tout and inp a tuple
     with self.test_session():
       x = constant_op.constant(0.0, dtypes.float64)
-      y, z = script_ops.py_func(tuple_func, (x,), (dtypes.float64,
-                                                   dtypes.float64))
-      self.assertAllClose(y.eval(), 0.0)
-      self.assertAllClose(z.eval(), 1.0)
+      y = self.evaluate(
+          script_ops.py_func(tuple_func, (x,),
+                             (dtypes.float64, dtypes.float64)))
+      self.assertAllClose(y, [0.0, 1.0])
 
   def testStrings(self):
 
@@ -128,10 +139,12 @@ class PyOpTest(test.TestCase):
 
     with self.test_session():
       x = constant_op.constant([b"hello", b"hi"], dtypes.string)
-      y, = script_ops.py_func(read_fixed_length_numpy_strings, [],
-                              [dtypes.string])
-      z, = script_ops.py_func(read_and_return_strings, [x, y], [dtypes.string])
-      self.assertListEqual(list(z.eval()), [b"hello there", b"hi there"])
+      y = self.evaluate(
+          script_ops.py_func(read_fixed_length_numpy_strings, [],
+                             dtypes.string))
+      z = self.evaluate(
+          script_ops.py_func(read_and_return_strings, [x, y], dtypes.string))
+      self.assertAllEqual(z, [b"hello there", b"hi there"])
 
   def testStringsAreConvertedToBytes(self):
 
@@ -143,10 +156,12 @@ class PyOpTest(test.TestCase):
 
     with self.test_session():
       x = constant_op.constant(["hello", "hi"], dtypes.string)
-      y, = script_ops.py_func(read_fixed_length_numpy_strings, [],
-                              [dtypes.string])
-      z, = script_ops.py_func(read_and_return_strings, [x, y], [dtypes.string])
-      self.assertListEqual(list(z.eval()), [b"hello there", b"hi there"])
+      y = self.evaluate(
+          script_ops.py_func(read_fixed_length_numpy_strings, [],
+                             dtypes.string))
+      z = self.evaluate(
+          script_ops.py_func(read_and_return_strings, [x, y], dtypes.string))
+      self.assertAllEqual(z, [b"hello there", b"hi there"])
 
   def testObjectArraysAreConvertedToBytes(self):
 
@@ -186,16 +201,8 @@ class PyOpTest(test.TestCase):
 
   def testNoInput(self):
     with self.test_session():
-      x, = script_ops.py_func(lambda: 42.0, [], [dtypes.float64])
-      self.assertAllClose(x.eval(), 42.0)
-
-  def testCleanup(self):
-    for _ in xrange(1000):
-      g = ops.Graph()
-      with g.as_default():
-        c = constant_op.constant([1.], dtypes.float32)
-        _ = script_ops.py_func(lambda x: x + 1, [c], [dtypes.float32])
-    self.assertTrue(script_ops._py_funcs.size() < 100)
+      x = self.evaluate(script_ops.py_func(lambda: 42.0, [], dtypes.float64))
+      self.assertAllClose(x, 42.0)
 
   def testAlias(self):
     with self.test_session():
@@ -242,8 +249,8 @@ class PyOpTest(test.TestCase):
       # Create a numpy array aliasing a tensor and a tensor aliasing this array
       z, = script_ops.py_func(ident, [p], [dtypes.float32])
       z += 0.0  # Makes sure we release the tensor aliasing the numpy array x[0]
-                # above instead of using its memory as the return value of
-                # session.run
+      # above instead of using its memory as the return value of
+      # session.run
       self.assertEqual(0.0, z.eval(feed_dict={p: [0.0]}))
 
   def testStateful(self):
@@ -319,10 +326,10 @@ class PyOpTest(test.TestCase):
       def value(self):
         return self._value
 
-    with self.test_session() as sess:
+    with self.test_session():
       s = State()
       op = s.increment(constant_op.constant(2, dtypes.int64))
-      ret = sess.run(op)
+      ret = self.evaluate(op)
       self.assertIsNone(ret)
       self.assertAllEqual([3], s.value)
 
@@ -336,15 +343,24 @@ class PyOpTest(test.TestCase):
     with self.test_session() as sess:
       self.assertEqual(sess.run(f), [])
 
-  def _testExceptionHandling(self, py_exp, tf_exp):
+  def _testExceptionHandling(self, py_exp, tf_exp, eager=False):
 
     def raise_exception():
       raise py_exp("blah")  # pylint: disable=not-callable
 
-    f = script_ops.py_func(raise_exception, [], [])
-    with self.test_session() as sess:
+    if eager:
+      if context.in_eager_mode():
+        with self.assertRaisesRegexp(tf_exp, "blah"):
+          f = script_ops.eager_py_func(raise_exception, [], [])
+        return
+      else:
+        f = script_ops.eager_py_func(raise_exception, [], [])
+    else:
+      f = script_ops.py_func(raise_exception, [], [])
+
+    with self.test_session():
       with self.assertRaisesRegexp(tf_exp, "blah"):
-        sess.run(f)
+        self.evaluate(f)
 
   def testExceptionHandling(self):
     self._testExceptionHandling(ValueError, errors.InvalidArgumentError)
@@ -358,6 +374,89 @@ class PyOpTest(test.TestCase):
 
     self._testExceptionHandling(WeirdError, errors.UnknownError)
 
+  # ----- Tests shared by py_func and eager_py_func -----
+  def testCleanup(self):
+    for _ in xrange(1000):
+      g = ops.Graph()
+      with g.as_default():
+        c = constant_op.constant([1.], dtypes.float32)
+        _ = script_ops.py_func(lambda x: x + 1, [c], [dtypes.float32])
+        _ = script_ops.eager_py_func(lambda x: x + 1, [c], [dtypes.float32])
+    self.assertTrue(script_ops._py_funcs.size() < 100)
+
+  # ----- Tests for eager_py_func -----
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerSingleOutputInt32(self):
+    a = array_ops.ones((3, 3), dtype=dtypes.int32)
+    x = array_ops.ones((3, 1), dtype=dtypes.int32)
+    output = script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.int32)
+    with self.test_session():
+      ret = self.evaluate(output)
+      self.assertAllEqual(ret, [[3], [3], [3]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerSingleOutputFloat32(self):
+    a = array_ops.ones((3, 3), dtype=dtypes.float32)
+    x = array_ops.ones((3, 1), dtype=dtypes.float32)
+    output = script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.float32)
+    with self.test_session():
+      ret = self.evaluate(output)
+      self.assertAllClose(ret, [[3.0], [3.0], [3.0]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerArrayOutput(self):
+    a = array_ops.ones((3, 3), dtype=dtypes.int32)
+    x = array_ops.ones((3, 1), dtype=dtypes.int32)
+    output = script_ops.eager_py_func(
+        lambda a, x: [matmul(a, x)], inp=[a, x], Tout=[dtypes.int32])
+
+    with self.test_session():
+      ret = self.evaluate(output)
+      self.assertAllEqual(ret, [[[3], [3], [3]]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerReturnNone(self):
+
+    def no_return_value():
+      return
+
+    output = script_ops.eager_py_func(no_return_value, inp=[], Tout=[])
+    ret = self.evaluate(output)
+    if context.in_eager_mode():
+      self.assertEquals(len(ret), 0)
+    else:
+      self.assertIsNone(ret)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerPyFuncInDefun(self):
+
+    def wrapper():
+      a = array_ops.ones((3, 3), dtype=dtypes.int32)
+      x = array_ops.ones((3, 1), dtype=dtypes.int32)
+      return script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.int32)
+
+    wrapped = function.defun(wrapper)
+    ret = self.evaluate(wrapped())
+    self.assertAllEqual(ret, [[3], [3], [3]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerExceptionHandling(self):
+    self._testExceptionHandling(
+        ValueError, errors.InvalidArgumentError, eager=True)
+    self._testExceptionHandling(
+        TypeError, errors.InvalidArgumentError, eager=True)
+    self._testExceptionHandling(
+        StopIteration, errors.OutOfRangeError, eager=True)
+    self._testExceptionHandling(
+        MemoryError, errors.ResourceExhaustedError, eager=True)
+    self._testExceptionHandling(
+        NotImplementedError, errors.UnimplementedError, eager=True)
+
+    class WeirdError(Exception):
+      pass
+
+    self._testExceptionHandling(WeirdError, errors.UnknownError, eager=True)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/random/multinomial_op_test.py b/tensorflow/python/kernel_tests/random/multinomial_op_test.py
index ca48ba6cadee431c3af41b72646d4f1b3e60ec66..a9dc7b7de000024f23b88406bf0c1c2f32ac4fac 100644
--- a/tensorflow/python/kernel_tests/random/multinomial_op_test.py
+++ b/tensorflow/python/kernel_tests/random/multinomial_op_test.py
@@ -57,12 +57,14 @@ class MultinomialTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes()
   def testSmallEntropy(self):
     random_seed.set_random_seed(1618)
-    with test_util.device(use_gpu=True):
-      # A logit value of -10 corresponds to a probability of ~5e-5.
-      logits = constant_op.constant([[-10., 10., -10.], [-10., -10., 10.]])
-      num_samples = 1000
-      samples = self.evaluate(random_ops.multinomial(logits, num_samples))
-      self.assertAllEqual([[1] * num_samples, [2] * num_samples], samples)
+    for output_dtype in [np.int32, np.int64]:
+      with test_util.device(use_gpu=True):
+        # A logit value of -10 corresponds to a probability of ~5e-5.
+        logits = constant_op.constant([[-10., 10., -10.], [-10., -10., 10.]])
+        num_samples = 1000
+        samples = self.evaluate(random_ops.multinomial(
+            logits, num_samples, output_dtype=output_dtype))
+        self.assertAllEqual([[1] * num_samples, [2] * num_samples], samples)
 
   def testOneOpMultipleStepsIndependent(self):
     with self.test_session(use_gpu=True) as sess:
diff --git a/tensorflow/python/kernel_tests/reader_ops_test.py b/tensorflow/python/kernel_tests/reader_ops_test.py
index 8e54d10f321d587cf4081ef56202ac4ff815c83d..223a4b2c8726d957f014e65ea9f87c0fb61e65bb 100644
--- a/tensorflow/python/kernel_tests/reader_ops_test.py
+++ b/tensorflow/python/kernel_tests/reader_ops_test.py
@@ -1018,15 +1018,15 @@ class LMDBReaderTest(test.TestCase):
     with self.test_session() as sess:
       reader1 = io_ops.LMDBReader(name="test_read_from_same_file1")
       reader2 = io_ops.LMDBReader(name="test_read_from_same_file2")
-      filename_queue = input_lib.string_input_producer([self.db_path],
-                                                       num_epochs=None)
+      filename_queue = input_lib.string_input_producer(
+          [self.db_path], num_epochs=None)
       key1, value1 = reader1.read(filename_queue)
       key2, value2 = reader2.read(filename_queue)
 
       coord = coordinator.Coordinator()
       threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
-      for i in range(3):
-        for j in range(10):
+      for _ in range(3):
+        for _ in range(10):
           k1, v1, k2, v2 = sess.run([key1, value1, key2, value2])
           self.assertAllEqual(compat.as_bytes(k1), compat.as_bytes(k2))
           self.assertAllEqual(compat.as_bytes(v1), compat.as_bytes(v2))
@@ -1054,14 +1054,14 @@ class LMDBReaderTest(test.TestCase):
   def testReadFromFileRepeatedly(self):
     with self.test_session() as sess:
       reader = io_ops.LMDBReader(name="test_read_from_file_repeated")
-      filename_queue = input_lib.string_input_producer([self.db_path],
-                                                       num_epochs=None)
+      filename_queue = input_lib.string_input_producer(
+          [self.db_path], num_epochs=None)
       key, value = reader.read(filename_queue)
 
       coord = coordinator.Coordinator()
       threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
       # Iterate over the lmdb 3 times.
-      for i in range(3):
+      for _ in range(3):
         # Go over all 10 records each time.
         for j in range(10):
           k, v = sess.run([key, value])
@@ -1071,5 +1071,6 @@ class LMDBReaderTest(test.TestCase):
       coord.request_stop()
       coord.join(threads)
 
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/record_input_test.py b/tensorflow/python/kernel_tests/record_input_test.py
index 1ec48ac361b81e66fd77e8a4506bebf910ea0e8a..0945ed24bf9ac36c508d5da5f66bcc1a3e034083 100644
--- a/tensorflow/python/kernel_tests/record_input_test.py
+++ b/tensorflow/python/kernel_tests/record_input_test.py
@@ -26,13 +26,14 @@ from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
-
 class RecordInputOpTest(test.TestCase):
 
-  def generateTestData(self, prefix, n, m):
+  def generateTestData(self, prefix, n, m,
+      compression_type=tf_record.TFRecordCompressionType.NONE):
+    options = tf_record.TFRecordOptions(compression_type)
     for i in range(n):
       f = os.path.join(self.get_temp_dir(), prefix + "." + str(i))
-      w = tf_record.TFRecordWriter(f)
+      w = tf_record.TFRecordWriter(f, options=options)
 
       for j in range(m):
         w.write("{0:0{width}}".format(i * m + j, width=10).encode("utf-8"))
@@ -52,6 +53,38 @@ class RecordInputOpTest(test.TestCase):
 
       self.assertEqual(sess.run(yield_op), b"0000000000")
 
+  def testRecordInputSimpleGzip(self):
+    with self.test_session() as sess:
+      self.generateTestData("basic", 1, 1,
+          compression_type=tf_record.TFRecordCompressionType.GZIP)
+
+      yield_op = data_flow_ops.RecordInput(
+          file_pattern=os.path.join(self.get_temp_dir(), "basic.*"),
+          parallelism=1,
+          buffer_size=1,
+          batch_size=1,
+          name="record_input",
+          compression_type=
+              tf_record.TFRecordCompressionType.GZIP).get_yield_op()
+
+      self.assertEqual(sess.run(yield_op), b"0000000000")
+
+  def testRecordInputSimpleZlib(self):
+    with self.test_session() as sess:
+      self.generateTestData("basic", 1, 1,
+          compression_type=tf_record.TFRecordCompressionType.ZLIB)
+
+      yield_op = data_flow_ops.RecordInput(
+          file_pattern=os.path.join(self.get_temp_dir(), "basic.*"),
+          parallelism=1,
+          buffer_size=1,
+          batch_size=1,
+          name="record_input",
+          compression_type=
+              tf_record.TFRecordCompressionType.ZLIB).get_yield_op()
+
+      self.assertEqual(sess.run(yield_op), b"0000000000")
+
   def testRecordInputEpochs(self):
     files = 100
     records_per_file = 100
diff --git a/tensorflow/python/kernel_tests/reduction_ops_test.py b/tensorflow/python/kernel_tests/reduction_ops_test.py
index 2dc65b13849439b413b39c7dfec6e86225f6c49b..4231a79b2dcef951048ca54e8c8df2f42b44b1a1 100644
--- a/tensorflow/python/kernel_tests/reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/reduction_ops_test.py
@@ -50,7 +50,7 @@ def _powerset(iterable):
   """
   s = list(iterable)
   return itertools.chain.from_iterable(
-      itertools.combinations(s, r) for r in range(len(s)+1))
+      itertools.combinations(s, r) for r in range(len(s) + 1))
 
 
 class ReducedShapeTest(test.TestCase):
@@ -91,6 +91,23 @@ class ReducedShapeTest(test.TestCase):
       self._check([10, 10, 10], [-3], [1, 10, 10])
 
 
+class ReductionUnknownShape(test.TestCase):
+
+  def testBasic(self):
+    with self.test_session():
+      for dtype, reductions in [(dtypes.float32,
+                                 (math_ops.reduce_sum, math_ops.reduce_mean,
+                                  math_ops.reduce_prod, math_ops.reduce_max,
+                                  math_ops.reduce_min)),
+                                (dtypes.bool, (math_ops.reduce_all,
+                                               math_ops.reduce_any))]:
+        for reduction in reductions:
+          x = array_ops.placeholder(
+              dtype=dtype, shape=None)  # Some tensor w/ unknown shape.
+          y = reduction(x)
+          self.assertEqual(y.shape, ())
+
+
 class BaseReductionTest(test.TestCase):
 
   def _tf_reduce(self, x, reduction_axes, keep_dims):
@@ -200,7 +217,6 @@ class SumReductionTest(BaseReductionTest):
       tf_out_mean = sess.run(tf_mean)
     self.assertAllClose(tf_out_mean, 1.)
 
-
   def testFloat32(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.float32)
@@ -309,8 +325,9 @@ class SumReductionTest(BaseReductionTest):
   # Int64??
 
   def testGradient(self):
-    for dtype in [dtypes.float32, dtypes.float64, dtypes.complex64,
-                  dtypes.complex128]:
+    for dtype in [
+        dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128
+    ]:
       x = self._makeIncremental([2, 3, 4, 2], dtype)
       self._compareGradientAxes(x)
 
@@ -913,8 +930,9 @@ class CountNonzeroReductionTest(test.TestCase):
   def testFloatReduce4D(self):
     # Create a 4D array of floats and reduce across some
     # dimensions
-    np_arr = np.floor(np.arange(0.0, 210.0) / 100.0).reshape(
-        [2, 3, 5, 7]).astype(np.float32)
+    np_arr = np.floor(np.arange(0.0, 210.0) / 100.0).reshape([2, 3, 5,
+                                                              7]).astype(
+                                                                  np.float32)
     self._compareAll(np_arr, None)
     self._compareAll(np_arr, [])
     self._compareAll(np_arr, [0])
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index 8f328cea631767085177d3e555c4f7565abc2c27..4c7a9cb0f9542afe8fc1608a05864b739d741c97 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -498,6 +498,12 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       state_ops.scatter_update(v, [1], [3.0])
       self.assertAllEqual([1.0, 3.0], v.numpy())
 
+  def testScatterUpdateCast(self):
+    with context.eager_mode():
+      v = resource_variable_ops.ResourceVariable([1.0, 2.0], name="update")
+      state_ops.scatter_update(v, [1], [3])
+      self.assertAllEqual([1.0, 3.0], v.numpy())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py
index d8f4b439e37981f3d21181feae9baa8d492ee1d5..0c77d1db921566000c2a52e6ddb9d3dddd9b193c 100644
--- a/tensorflow/python/kernel_tests/rnn_test.py
+++ b/tensorflow/python/kernel_tests/rnn_test.py
@@ -38,6 +38,7 @@ from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import rnn
 from tensorflow.python.ops import rnn_cell_impl
+from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variables as variables_lib
 import tensorflow.python.ops.data_flow_grad  # pylint: disable=unused-import
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
@@ -57,7 +58,7 @@ class Plus1RNNCell(rnn_cell_impl.RNNCell):
   def state_size(self):
     return 5
 
-  def __call__(self, input_, state, scope=None):
+  def call(self, input_, state, scope=None):
     return (input_ + 1, state + 1)
 
 
@@ -75,10 +76,31 @@ class ScalarStateRNNCell(rnn_cell_impl.RNNCell):
   def zero_state(self, batch_size, dtype):
     return array_ops.zeros([], dtype=dtypes.int32)
 
-  def __call__(self, input_, state, scope=None):
+  def call(self, input_, state, scope=None):
     return (input_, state + 1)
 
 
+class TensorArrayStateRNNCell(rnn_cell_impl.RNNCell):
+  """RNN Cell its state as a TensorArray."""
+
+  @property
+  def output_size(self):
+    return 1
+
+  @property
+  def state_size(self):
+    return (tensor_shape.TensorShape([]), ())
+
+  def zero_state(self, batch_size, dtype):
+    return (array_ops.zeros([], dtype=dtypes.int32),
+            tensor_array_ops.TensorArray(
+                dtype=dtype, size=0, dynamic_size=True))
+
+  def call(self, input_, state, scope=None):
+    new_array = state[1].write(state[0], input_)
+    return (input_, (state[0] + 1, new_array))
+
+
 class RNNTest(test.TestCase):
 
   def setUp(self):
@@ -171,6 +193,36 @@ class RNNTest(test.TestCase):
       self.assertAllEqual(outputs.numpy(), np.array([[[1], [2], [3], [4]]]))
       self.assertEqual(state.numpy(), 4)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testTensorArrayStateIsAccepted(self):
+    cell = TensorArrayStateRNNCell()
+    in_graph_mode = context.in_graph_mode()
+
+    if in_graph_mode:
+      inputs = array_ops.placeholder(dtypes.float32, shape=(1, 4, 1))
+    else:
+      inputs = np.array([[[1], [2], [3], [4]]], dtype=np.float32)
+
+    with self.test_session() as sess:
+      outputs, state = rnn.dynamic_rnn(
+          cell, inputs, dtype=dtypes.float32, sequence_length=[4])
+      state = (state[0], state[1].stack())
+      if in_graph_mode:
+        outputs, state = sess.run(
+            [outputs, state], feed_dict={
+                inputs: [[[1], [2], [3], [4]]]
+            })
+
+    if in_graph_mode:
+      self.assertAllEqual(outputs, np.array([[[1], [2], [3], [4]]]))
+      self.assertEqual(state[0], 4)
+      self.assertAllEqual(state[1], np.array([[[1]], [[2]], [[3]], [[4]]]))
+    else:
+      self.assertAllEqual(outputs.numpy(), np.array([[[1], [2], [3], [4]]]))
+      self.assertEqual(state[0].numpy(), 4)
+      self.assertAllEqual(state[1].numpy(),
+                          np.array([[[1]], [[2]], [[3]], [[4]]]))
+
 
 ######### Benchmarking RNN code
 
diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index a79d66e9889b4dc55a66c505bac9b29a453356be..9f5794951524b2689daa5fc4eefb19703262b8f0 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -157,6 +158,20 @@ class StatefulScatterNdTest(test.TestCase):
       result = sess.run(scatter)
       self.assertAllClose(result, expected)
 
+  def testSimpleResource(self):
+    indices = constant_op.constant([[4], [3], [1], [7]], dtype=dtypes.int32)
+    updates = constant_op.constant([9, 10, 11, 12], dtype=dtypes.float32)
+    ref = resource_variable_ops.ResourceVariable(
+        [0, 0, 0, 0, 0, 0, 0, 0], dtype=dtypes.float32)
+    expected = np.array([0, 11, 0, 10, 9, 0, 0, 12])
+    scatter = state_ops.scatter_nd_update(ref, indices, updates)
+    init = variables.global_variables_initializer()
+
+    with self.test_session(use_gpu=True) as sess:
+      sess.run(init)
+      sess.run(scatter)
+      self.assertAllClose(ref.eval(), expected)
+
   def testSimple2(self):
     indices = constant_op.constant([[1, 0], [1, 1]], dtype=dtypes.int32)
     updates = constant_op.constant([11., 12.], dtype=dtypes.float32)
@@ -335,7 +350,7 @@ class StatefulScatterNdTest(test.TestCase):
         indices = np.array([2, 0, 5])
         op(ref, indices, updates).eval()
 
-        # Indicies out of range should not fail.
+        # Indices out of range should not fail.
         indices = np.array([-1, 0, 5])
         op(ref, indices, updates).eval()
         indices = np.array([2, 0, 6])
@@ -487,6 +502,43 @@ class ScatterNdTest(test.TestCase):
       if self.non_aliasing_add_test:
         self.assertAllEqual(expected_input_grad, input_grad.eval())
 
+  def testGradientsRank7SliceUpdate(self):
+    indices = constant_op.constant(
+        [[[
+            [[[[0, 0, 0, 0, 0, 1], [0, 0, 1, 0, 0, 0]]]],
+            [[[[0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 1]]]]
+        ]]], dtype=dtypes.int32)
+    updates = constant_op.constant(
+        [[[
+            [[[[5, 6], [2, 4]]]],
+            [[[[1, 3], [6, 8]]]]
+        ]]], dtype=dtypes.float64)
+    shape = constant_op.constant([1, 1, 2, 1, 1, 2, 2], dtype=dtypes.int32)
+    input_ = array_ops.zeros(shape, dtype=dtypes.float64)
+    outputs = self.scatter_nd(indices, updates, shape, input_)
+
+    grad_vals = constant_op.constant(
+        [[[
+            [[[[1, 2], [3, 4]]]],
+            [[[[5, 6], [7, 8]]]]
+        ]]], dtype=dtypes.float64)
+    updates_grad, input_grad = gradients_impl.gradients(
+        [outputs], [updates, input_], [grad_vals])
+    expected_updates_grad = np.array(
+        [[[
+            [[[[3, 4], [5, 6]]]],
+            [[[[1, 2], [7, 8]]]]
+        ]]], dtype=np.float64)
+    expected_input_grad = np.array(
+        [[[
+            [[[[1, 2], [3, 4]]]],
+            [[[[5, 6], [7, 8]]]]
+        ]]], dtype=np.float64)
+    with self.test_session():
+      self.assertAllEqual(expected_updates_grad, updates_grad.eval())
+      if self.non_aliasing_add_test:
+        self.assertAllEqual(expected_input_grad, input_grad.eval())
+
   def testScatterNdRepatedIndicesAdd(self):
     indices = array_ops.zeros([100000, 1], dtypes.int32)
     values = np.random.randn(100000)
diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
index 3a02f249028c134158802e55c38d6bef46184000..5a54f448d092093db668570d055801f9f9cd0f9f 100644
--- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
@@ -46,13 +46,13 @@ class SegmentReductionHelper(test.TestCase):
     return constant_op.constant(
         np_values, shape=input_shape, dtype=dtype), np_values
 
-  def _segmentReduce(self, indices, x, op1, op2=None, num_out_rows=None):
+  def _segmentReduce(self, indices, x, op1, op2=None, num_segments=None):
     if not x.size:
       return np.array([])
     indices = np.asarray(indices)
-    if num_out_rows is None:
-      num_out_rows = indices[-1] + 1
-    output = [None] * num_out_rows
+    if num_segments is None:
+      num_segments = indices[-1] + 1
+    output = [None] * num_segments
     slice_shape = x.shape[indices.ndim:]
     x_flat = x.reshape((indices.size,) + slice_shape)
     for i, index in enumerate(indices.ravel()):
@@ -259,13 +259,34 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
         with self.test_session(use_gpu=True):
           tf_x, np_x = self._input(shape, dtype=dtype)
           np_ans = self._segmentReduce(
-              indices, np_x, np.add, op2=None, num_out_rows=num_segments)
+              indices, np_x, np.add, op2=None, num_segments=num_segments)
           s = math_ops.unsorted_segment_sum(
               data=tf_x, segment_ids=indices, num_segments=num_segments)
           tf_ans = s.eval()
         self.assertAllClose(np_ans, tf_ans)
         self.assertShapeEqual(np_ans, s)
 
+  def testNumSegmentsTypes(self):
+    dtypes = [dtypes_lib.int32, dtypes_lib.int64]
+    indices_flat = np.array([0, 4, 0, 8, 3, 8, 4, 7, 7, 3])
+    num_segments = 12
+    for indices in indices_flat, indices_flat.reshape(5, 2):
+      shape = indices.shape + (2,)
+      for dtype in dtypes:
+        with self.test_session(use_gpu=True):
+          tf_x, np_x = self._input(shape)
+          num_segments_constant = constant_op.constant(
+              num_segments, dtype=dtype)
+          np_ans = self._segmentReduce(
+              indices, np_x, np.add, op2=None, num_segments=num_segments)
+          s = math_ops.unsorted_segment_sum(
+              data=tf_x,
+              segment_ids=indices,
+              num_segments=num_segments_constant)
+          tf_ans = s.eval()
+        self.assertAllClose(np_ans, tf_ans)
+        self.assertShapeEqual(np_ans, s)
+
   def testGradientSegmentSum(self):
     num_cols = 2
     indices_flat = np.array([0, 4, 0, 8, 3, 8, 4, 7, 7, 3])
@@ -376,11 +397,11 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
         with self.test_session(use_gpu=True):
           tf_x, np_x = self._input(shape, dtype=dtype)
           np_ans = self._segmentReduce(
-              indices, np_x, np.add, op2=None, num_out_rows=num_segments)
+              indices, np_x, np.add, op2=None, num_segments=num_segments)
           # Replace np_ans[8] with 0 for the value
           np_ans[8:] = 0
           # Replace 8 with -1 in indices
-          np.place(indices, indices==8, [-1])
+          np.place(indices, indices == 8, [-1])
           s = math_ops.unsorted_segment_sum(
               data=tf_x, segment_ids=indices, num_segments=num_segments)
           tf_ans = s.eval()
@@ -396,8 +417,15 @@ class SparseSegmentReductionHelper(SegmentReductionHelper):
     return (constant_op.constant(
         indices, dtype=dtypes_lib.int32), indices, a, b)
 
-  def _sparseSegmentReduce(self, x, indices, segment_indices, op1, op2=None):
-    return self._segmentReduce(segment_indices, x[indices], op1, op2)
+  def _sparseSegmentReduce(self,
+                           x,
+                           indices,
+                           segment_indices,
+                           op1,
+                           op2=None,
+                           num_segments=None):
+    return self._segmentReduce(
+        segment_indices, x[indices], op1, op2, num_segments=num_segments)
 
 
 class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
@@ -454,6 +482,31 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
         tf_ans = s.eval()
         self.assertAllClose(np_ans, tf_ans)
 
+  def testWithNumSegments(self):
+    tf_x, np_x = self._input([10, 4], dtype=dtypes_lib.float32)
+    ops_list = [(np.add, None, math_ops.sparse_segment_sum_with_num_segments),
+                (self._mean_cum_op, self._mean_reduce_op,
+                 math_ops.sparse_segment_mean_with_num_segments)]
+    segment_indices = [0, 2, 2, 2]
+    tf_indices = [8, 3, 0, 9]
+    num_segments = 5
+    with self.test_session(use_gpu=False):
+      for np_op1, np_op2, tf_op in ops_list:
+        np_ans = self._sparseSegmentReduce(
+            np_x,
+            tf_indices,
+            segment_indices,
+            np_op1,
+            np_op2,
+            num_segments=num_segments)
+        s = tf_op(
+            data=tf_x,
+            indices=tf_indices,
+            segment_ids=segment_indices,
+            num_segments=num_segments)
+        tf_ans = s.eval()
+        self.assertAllClose(np_ans, tf_ans)
+
   def testSegmentIdsGreaterThanZero(self):
     tf_x, np_x = self._input([10, 4], dtype=dtypes_lib.float32)
     ops_list = [(np.add, None, math_ops.sparse_segment_sum), (
@@ -562,6 +615,63 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
         with self.assertRaisesOpError("segment ids must be >= 0"):
           s.eval()
 
+  def testSegmentWithNumSegmentsValid(self):
+    # Baseline for the test*WithNumSegmentsInvalid* methods below.
+    tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
+    ops_list = [
+        math_ops.sparse_segment_sum_with_num_segments,
+        math_ops.sparse_segment_mean_with_num_segments,
+    ]
+    num_segments = 5
+    segment_indices = [0, 1, 3, 3]
+    tf_indices = [8, 3, 0, 9]
+    with self.test_session(use_gpu=False):
+      for tf_op in ops_list:
+        s = tf_op(
+            data=tf_x,
+            indices=tf_indices,
+            segment_ids=segment_indices,
+            num_segments=num_segments)
+        s.eval()
+
+  def testSegmentWithNumSegmentsInvalid1(self):
+    tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
+    ops_list = [
+        math_ops.sparse_segment_sum_with_num_segments,
+        math_ops.sparse_segment_mean_with_num_segments,
+    ]
+    num_segments = 5
+    segment_indices = [0, 1, 3, 5]
+    tf_indices = [8, 3, 0, 9]
+    with self.test_session(use_gpu=False):
+      for tf_op in ops_list:
+        s = tf_op(
+            data=tf_x,
+            indices=tf_indices,
+            segment_ids=segment_indices,
+            num_segments=num_segments)
+        with self.assertRaisesOpError("segment ids must be < num_segments"):
+          s.eval()
+
+  def testSegmentWithNumSegmentsInvalid2(self):
+    tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
+    ops_list = [
+        math_ops.sparse_segment_sum_with_num_segments,
+        math_ops.sparse_segment_mean_with_num_segments,
+    ]
+    num_segments = -2
+    segment_indices = [0, 1, 3, 3]
+    tf_indices = [8, 3, 0, 9]
+    with self.test_session(use_gpu=False):
+      for tf_op in ops_list:
+        with self.assertRaisesRegexp(
+            ValueError, "Cannot specify a negative value for num_segments"):
+          tf_op(
+              data=tf_x,
+              indices=tf_indices,
+              segment_ids=segment_indices,
+              num_segments=num_segments)
+
   def testGradient(self):
     shape = [10, 4]
 
@@ -580,6 +690,32 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
             delta=1)
       self.assertAllClose(jacob_t, jacob_n)
 
+  def testGradientWithEmptySegmentsAtEnd(self):
+    shape = [10, 4]
+
+    num_segments = 5
+    segment_indices = [0, 1, 2, 2]
+    num_indices = len(segment_indices)
+    for tf_op in [
+        math_ops.sparse_segment_sum_with_num_segments,
+        math_ops.sparse_segment_mean_with_num_segments,
+    ]:
+      with self.test_session():
+        tf_indices, _, tf_x, np_x = self._sparse_input(
+            shape, num_indices, dtype=dtypes_lib.float64)
+        s = tf_op(
+            data=tf_x,
+            indices=tf_indices,
+            segment_ids=segment_indices,
+            num_segments=num_segments)
+        jacob_t, jacob_n = gradient_checker.compute_gradient(
+            tf_x,
+            shape,
+            s, [5, 4],
+            x_init_value=np_x.astype(np.double),
+            delta=1)
+      self.assertAllClose(jacob_t, jacob_n)
+
   def testGradientValid(self):
     # Baseline for the testGradient*Invalid* methods below.
     tf_x, _ = self._input([3, 4], dtype=dtypes_lib.float32)
@@ -625,7 +761,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
     ops_list = [
         math_ops.sparse_segment_mean_grad, math_ops.sparse_segment_sqrt_n_grad
     ]
-    segment_indices = [0, 1, 1, 1]  # 2 segments
+    segment_indices = [0, 1, 1, 4]  # 5 segments
     tf_indices = [8, 3, 0, 9]
     with self.test_session(use_gpu=False):
       for tf_op in ops_list:
diff --git a/tensorflow/python/kernel_tests/slice_op_test.py b/tensorflow/python/kernel_tests/slice_op_test.py
index 6cdc7872f9176453c5ea9c318812f141214b723e..051a25080b826de05ee3e24a82fbcd1f47995544 100644
--- a/tensorflow/python/kernel_tests/slice_op_test.py
+++ b/tensorflow/python/kernel_tests/slice_op_test.py
@@ -217,30 +217,6 @@ class SliceTest(test.TestCase):
     self.assertEqual(expected_val.shape, slice_t.get_shape())
     self.assertEqual(expected_val.shape, slice2_t.get_shape())
 
-  def testRandomHighRank(self):
-    # Random dims of rank 8
-    input_shape = np.random.randint(0, 20, size=8)
-    inp = np.random.rand(*input_shape).astype("f")
-    with self.test_session(use_gpu=True) as sess:
-      a = constant_op.constant(
-          [float(x) for x in inp.ravel(order="C")],
-          shape=input_shape,
-          dtype=dtypes.float32)
-      indices = [0 if x == 0 else np.random.randint(x) for x in input_shape]
-      sizes = [
-          np.random.randint(0, input_shape[i] - indices[i] + 1)
-          for i in range(8)
-      ]
-      slice_t = array_ops.slice(a, indices, sizes)
-      slice_val = sess.run(slice_t)
-
-    expected_val = inp[indices[0]:indices[0] + sizes[0], indices[1]:indices[1] + sizes[
-      1], indices[2]:indices[2] + sizes[2], indices[3]:indices[3] + sizes[3], indices[
-        4]:indices[4] + sizes[4], indices[5]:indices[5] + sizes[5], indices[6]:indices[
-          6] + sizes[6], indices[7]:indices[7] + sizes[7]]
-    self.assertAllEqual(slice_val, expected_val)
-    self.assertEqual(expected_val.shape, slice_t.get_shape())
-
   def testPartialShapeInference(self):
     z = array_ops.zeros((1, 2, 3))
     self.assertAllEqual(z.get_shape().as_list(), [1, 2, 3])
@@ -251,6 +227,7 @@ class SliceTest(test.TestCase):
     m2 = array_ops.slice(z, [0, 0, 0], [constant_op.constant(1) + 0, 2, -1])
     self.assertAllEqual(m2.get_shape().as_list(), [None, 2, None])
 
+
   def _testGradientSlice(self, input_shape, slice_begin, slice_size):
     with self.test_session(use_gpu=True):
       num_inputs = np.prod(input_shape)
diff --git a/tensorflow/python/kernel_tests/sparse_reshape_op_test.py b/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
index e87fa0c94c4cf3346c0127dd17b037cabb3cbb56..0d2887f3cef88605e87bddb7830845f12e37220b 100644
--- a/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
@@ -196,7 +196,7 @@ class SparseReshapeTest(test.TestCase):
       sp_input = self._SparseTensorPlaceholder()
       input_val = self._SparseTensorValue_5x6()
       sp_output = sparse_ops.sparse_reshape(sp_input, [4, -1, -1])
-      with self.assertRaisesOpError("only one output shape size may be -1"):
+      with self.assertRaisesOpError("only one output dimension may be -1"):
         sess.run(sp_output, {sp_input: input_val})
 
   def testProvideStaticallyMismatchedSizes(self):
diff --git a/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py b/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py
index af395b31bfc71e85350ea4c57e34a520a80f06fd..27b39a626fcc6b2705bf9e797b5293ed3f1c7820 100644
--- a/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py
@@ -64,16 +64,75 @@ class SerializeSparseTest(test.TestCase):
     shape = np.array([3, 4, 5]).astype(np.int64)
     return sparse_tensor_lib.SparseTensorValue(ind, val, shape)
 
-  def testSerializeDeserializeMany(self):
+  def _testSerializeDeserializeHelper(self,
+                                      serialize_fn,
+                                      deserialize_fn,
+                                      out_type=dtypes.string):
+    with self.test_session(use_gpu=False) as sess:
+      sp_input = self._SparseTensorValue_5x6(np.arange(6))
+      serialized = serialize_fn(sp_input, out_type=out_type)
+      sp_deserialized = deserialize_fn(serialized, dtype=dtypes.int32)
+
+      indices, values, shape = sess.run(sp_deserialized)
+
+      self.assertAllEqual(indices, sp_input[0])
+      self.assertAllEqual(values, sp_input[1])
+      self.assertAllEqual(shape, sp_input[2])
+
+  def testSerializeDeserialize(self):
+    self._testSerializeDeserializeHelper(sparse_ops.serialize_sparse,
+                                         sparse_ops.deserialize_sparse)
+
+  def testVariantSerializeDeserialize(self):
+    self._testSerializeDeserializeHelper(sparse_ops.serialize_sparse,
+                                         sparse_ops.deserialize_sparse,
+                                         dtypes.variant)
+
+  def _testSerializeDeserializeBatchHelper(self,
+                                           serialize_fn,
+                                           deserialize_fn,
+                                           out_type=dtypes.string):
+    with self.test_session(use_gpu=False) as sess:
+      sp_input = self._SparseTensorValue_5x6(np.arange(6))
+      serialized = serialize_fn(sp_input, out_type=out_type)
+      serialized = array_ops.stack([serialized, serialized])
+
+      sp_deserialized = deserialize_fn(serialized, dtype=dtypes.int32)
+
+      combined_indices, combined_values, combined_shape = sess.run(
+          sp_deserialized)
+
+      self.assertAllEqual(combined_indices[:6, 0], [0] * 6)  # minibatch 0
+      self.assertAllEqual(combined_indices[:6, 1:], sp_input[0])
+      self.assertAllEqual(combined_indices[6:, 0], [1] * 6)  # minibatch 1
+      self.assertAllEqual(combined_indices[6:, 1:], sp_input[0])
+      self.assertAllEqual(combined_values[:6], sp_input[1])
+      self.assertAllEqual(combined_values[6:], sp_input[1])
+      self.assertAllEqual(combined_shape, [2, 5, 6])
+
+  def testSerializeDeserializeBatch(self):
+    self._testSerializeDeserializeBatchHelper(sparse_ops.serialize_sparse,
+                                              sparse_ops.deserialize_sparse)
+
+  def testSerializeDeserializeManyBatch(self):
+    self._testSerializeDeserializeBatchHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_many_sparse)
+
+  def testVariantSerializeDeserializeBatch(self):
+    self._testSerializeDeserializeBatchHelper(sparse_ops.serialize_sparse,
+                                              sparse_ops.deserialize_sparse,
+                                              dtypes.variant)
+
+  def _testSerializeDeserializeBatchInconsistentShapeHelper(
+      self, serialize_fn, deserialize_fn, out_type=dtypes.string):
     with self.test_session(use_gpu=False) as sess:
       sp_input0 = self._SparseTensorValue_5x6(np.arange(6))
       sp_input1 = self._SparseTensorValue_3x4(np.arange(6))
-      serialized0 = sparse_ops.serialize_sparse(sp_input0)
-      serialized1 = sparse_ops.serialize_sparse(sp_input1)
-      serialized_concat = array_ops.stack([serialized0, serialized1])
+      serialized0 = serialize_fn(sp_input0, out_type=out_type)
+      serialized1 = serialize_fn(sp_input1, out_type=out_type)
+      serialized = array_ops.stack([serialized0, serialized1])
 
-      sp_deserialized = sparse_ops.deserialize_many_sparse(
-          serialized_concat, dtype=dtypes.int32)
+      sp_deserialized = deserialize_fn(serialized, dtype=dtypes.int32)
 
       combined_indices, combined_values, combined_shape = sess.run(
           sp_deserialized)
@@ -86,18 +145,72 @@ class SerializeSparseTest(test.TestCase):
       self.assertAllEqual(combined_values[6:], sp_input1[1])
       self.assertAllEqual(combined_shape, [2, 5, 6])
 
-  def testFeedSerializeDeserializeMany(self):
+  def testSerializeDeserializeBatchInconsistentShape(self):
+    self._testSerializeDeserializeBatchInconsistentShapeHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse)
+
+  def testVariantSerializeDeserializeBatchInconsistentShape(self):
+    self._testSerializeDeserializeBatchInconsistentShapeHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse,
+        dtypes.variant)
+
+  def _testSerializeDeserializeNestedBatchHelper(self,
+                                                 serialize_fn,
+                                                 deserialize_fn,
+                                                 out_type=dtypes.string):
+    with self.test_session(use_gpu=False) as sess:
+      sp_input = self._SparseTensorValue_5x6(np.arange(6))
+      serialized = serialize_fn(sp_input, out_type=out_type)
+      serialized = array_ops.stack([serialized, serialized])
+      serialized = array_ops.stack([serialized, serialized])
+
+      sp_deserialized = deserialize_fn(serialized, dtype=dtypes.int32)
+
+      combined_indices, combined_values, combined_shape = sess.run(
+          sp_deserialized)
+
+      # minibatch 0
+      self.assertAllEqual(combined_indices[:6, :2], [[0, 0]] * 6)
+      self.assertAllEqual(combined_indices[:6, 2:], sp_input[0])
+      self.assertAllEqual(combined_values[:6], sp_input[1])
+      # minibatch 1
+      self.assertAllEqual(combined_indices[6:12, :2], [[0, 1]] * 6)
+      self.assertAllEqual(combined_indices[6:12, 2:], sp_input[0])
+      self.assertAllEqual(combined_values[6:12], sp_input[1])
+      # minibatch 2
+      self.assertAllEqual(combined_indices[12:18, :2], [[1, 0]] * 6)
+      self.assertAllEqual(combined_indices[12:18, 2:], sp_input[0])
+      self.assertAllEqual(combined_values[12:18], sp_input[1])
+      # minibatch 3
+      self.assertAllEqual(combined_indices[18:, :2], [[1, 1]] * 6)
+      self.assertAllEqual(combined_indices[18:, 2:], sp_input[0])
+      self.assertAllEqual(combined_values[18:], sp_input[1])
+
+      self.assertAllEqual(combined_shape, [2, 2, 5, 6])
+
+  def testSerializeDeserializeNestedBatch(self):
+    self._testSerializeDeserializeNestedBatchHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse)
+
+  def testVariantSerializeDeserializeNestedBatch(self):
+    self._testSerializeDeserializeNestedBatchHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse,
+        dtypes.variant)
+
+  def _testFeedSerializeDeserializeBatchHelper(self,
+                                               serialize_fn,
+                                               deserialize_fn,
+                                               out_type=dtypes.string):
     with self.test_session(use_gpu=False) as sess:
       sp_input0 = self._SparseTensorPlaceholder()
       sp_input1 = self._SparseTensorPlaceholder()
       input0_val = self._SparseTensorValue_5x6(np.arange(6))
       input1_val = self._SparseTensorValue_3x4(np.arange(6))
-      serialized0 = sparse_ops.serialize_sparse(sp_input0)
-      serialized1 = sparse_ops.serialize_sparse(sp_input1)
+      serialized0 = serialize_fn(sp_input0, out_type=out_type)
+      serialized1 = serialize_fn(sp_input1, out_type=out_type)
       serialized_concat = array_ops.stack([serialized0, serialized1])
 
-      sp_deserialized = sparse_ops.deserialize_many_sparse(
-          serialized_concat, dtype=dtypes.int32)
+      sp_deserialized = deserialize_fn(serialized_concat, dtype=dtypes.int32)
 
       combined_indices, combined_values, combined_shape = sess.run(
           sp_deserialized, {sp_input0: input0_val,
@@ -111,40 +224,96 @@ class SerializeSparseTest(test.TestCase):
       self.assertAllEqual(combined_values[6:], input1_val[1])
       self.assertAllEqual(combined_shape, [2, 5, 6])
 
-  def testSerializeManyDeserializeManyRoundTrip(self):
+  def testFeedSerializeDeserializeBatch(self):
+    self._testFeedSerializeDeserializeBatchHelper(sparse_ops.serialize_sparse,
+                                                  sparse_ops.deserialize_sparse)
+
+  def testFeedSerializeDeserializeManyBatch(self):
+    self._testFeedSerializeDeserializeBatchHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_many_sparse)
+
+  def testFeedVariantSerializeDeserializeBatch(self):
+    self._testFeedSerializeDeserializeBatchHelper(sparse_ops.serialize_sparse,
+                                                  sparse_ops.deserialize_sparse,
+                                                  dtypes.variant)
+
+  def _testSerializeManyShapeHelper(self,
+                                    serialize_many_fn,
+                                    out_type=dtypes.string):
     with self.test_session(use_gpu=False) as sess:
       # N == 4 because shape_value == [4, 5]
       indices_value = np.array([[0, 0], [0, 1], [2, 0]], dtype=np.int64)
       values_value = np.array([b"a", b"b", b"c"])
       shape_value = np.array([4, 5], dtype=np.int64)
       sparse_tensor = self._SparseTensorPlaceholder(dtype=dtypes.string)
-      serialized = sparse_ops.serialize_many_sparse(sparse_tensor)
-      deserialized = sparse_ops.deserialize_many_sparse(
-          serialized, dtype=dtypes.string)
-      serialized_value, deserialized_value = sess.run(
-          [serialized, deserialized],
+      serialized = serialize_many_fn(sparse_tensor, out_type=out_type)
+      serialized_value = sess.run(
+          serialized,
           feed_dict={
               sparse_tensor.indices: indices_value,
               sparse_tensor.values: values_value,
               sparse_tensor.dense_shape: shape_value
           })
       self.assertEqual(serialized_value.shape, (4, 3))
+
+  def testSerializeManyShape(self):
+    self._testSerializeManyShapeHelper(sparse_ops.serialize_many_sparse)
+
+  def testVariantSerializeManyShape(self):
+    # NOTE: The following test is a no-op as it is currently not possible to
+    # convert the serialized variant value to a numpy value.
+    pass
+
+  def _testSerializeManyDeserializeBatchHelper(self,
+                                               serialize_many_fn,
+                                               deserialize_fn,
+                                               out_type=dtypes.string):
+    with self.test_session(use_gpu=False) as sess:
+      # N == 4 because shape_value == [4, 5]
+      indices_value = np.array([[0, 0], [0, 1], [2, 0]], dtype=np.int64)
+      values_value = np.array([b"a", b"b", b"c"])
+      shape_value = np.array([4, 5], dtype=np.int64)
+      sparse_tensor = self._SparseTensorPlaceholder(dtype=dtypes.string)
+      serialized = serialize_many_fn(sparse_tensor, out_type=out_type)
+      deserialized = deserialize_fn(serialized, dtype=dtypes.string)
+      deserialized_value = sess.run(
+          deserialized,
+          feed_dict={
+              sparse_tensor.indices: indices_value,
+              sparse_tensor.values: values_value,
+              sparse_tensor.dense_shape: shape_value
+          })
       self.assertAllEqual(deserialized_value.indices, indices_value)
       self.assertAllEqual(deserialized_value.values, values_value)
       self.assertAllEqual(deserialized_value.dense_shape, shape_value)
 
-  def testDeserializeFailsWrongType(self):
+  def testSerializeManyDeserializeBatch(self):
+    self._testSerializeManyDeserializeBatchHelper(
+        sparse_ops.serialize_many_sparse, sparse_ops.deserialize_sparse)
+
+  def testSerializeManyDeserializeManyBatch(self):
+    self._testSerializeManyDeserializeBatchHelper(
+        sparse_ops.serialize_many_sparse, sparse_ops.deserialize_many_sparse)
+
+  def testVariantSerializeManyDeserializeBatch(self):
+    self._testSerializeManyDeserializeBatchHelper(
+        sparse_ops.serialize_many_sparse, sparse_ops.deserialize_sparse,
+        dtypes.variant)
+
+  def _testDeserializeFailsWrongTypeHelper(self,
+                                           serialize_fn,
+                                           deserialize_fn,
+                                           out_type=dtypes.string):
     with self.test_session(use_gpu=False) as sess:
       sp_input0 = self._SparseTensorPlaceholder()
       sp_input1 = self._SparseTensorPlaceholder()
       input0_val = self._SparseTensorValue_5x6(np.arange(6))
       input1_val = self._SparseTensorValue_3x4(np.arange(6))
-      serialized0 = sparse_ops.serialize_sparse(sp_input0)
-      serialized1 = sparse_ops.serialize_sparse(sp_input1)
+      serialized0 = serialize_fn(sp_input0, out_type=out_type)
+      serialized1 = serialize_fn(sp_input1, out_type=out_type)
       serialized_concat = array_ops.stack([serialized0, serialized1])
 
-      sp_deserialized = sparse_ops.deserialize_many_sparse(
-          serialized_concat, dtype=dtypes.int64)
+      sp_deserialized = deserialize_fn(serialized_concat, dtype=dtypes.int64)
 
       with self.assertRaisesOpError(
           r"Requested SparseTensor of type int64 but "
@@ -153,41 +322,78 @@ class SerializeSparseTest(test.TestCase):
                  {sp_input0: input0_val,
                   sp_input1: input1_val})
 
-  def testDeserializeFailsInconsistentRank(self):
+  def testDeserializeFailsWrongType(self):
+    self._testDeserializeFailsWrongTypeHelper(sparse_ops.serialize_sparse,
+                                              sparse_ops.deserialize_sparse)
+
+  def testDeserializeManyFailsWrongType(self):
+    self._testDeserializeFailsWrongTypeHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_many_sparse)
+
+  def testVariantDeserializeFailsWrongType(self):
+    self._testDeserializeFailsWrongTypeHelper(sparse_ops.serialize_sparse,
+                                              sparse_ops.deserialize_sparse,
+                                              dtypes.variant)
+
+  def _testDeserializeFailsInconsistentRankHelper(self,
+                                                  serialize_fn,
+                                                  deserialize_fn,
+                                                  out_type=dtypes.string):
     with self.test_session(use_gpu=False) as sess:
       sp_input0 = self._SparseTensorPlaceholder()
       sp_input1 = self._SparseTensorPlaceholder()
       input0_val = self._SparseTensorValue_5x6(np.arange(6))
       input1_val = self._SparseTensorValue_1x1x1()
-      serialized0 = sparse_ops.serialize_sparse(sp_input0)
-      serialized1 = sparse_ops.serialize_sparse(sp_input1)
+      serialized0 = serialize_fn(sp_input0, out_type=out_type)
+      serialized1 = serialize_fn(sp_input1, out_type=out_type)
       serialized_concat = array_ops.stack([serialized0, serialized1])
 
-      sp_deserialized = sparse_ops.deserialize_many_sparse(
-          serialized_concat, dtype=dtypes.int32)
+      sp_deserialized = deserialize_fn(serialized_concat, dtype=dtypes.int32)
 
       with self.assertRaisesOpError(
-          r"Inconsistent rank across SparseTensors: rank prior to "
-          r"SparseTensor\[1\] was: 3 but rank of SparseTensor\[1\] is: 4"):
+          r"Inconsistent shape across SparseTensors: rank prior to "
+          r"SparseTensor\[1\] was: 2 but rank of SparseTensor\[1\] is: 3"):
         sess.run(sp_deserialized,
                  {sp_input0: input0_val,
                   sp_input1: input1_val})
 
-  def testDeserializeFailsInvalidProto(self):
+  def testDeserializeFailsInconsistentRank(self):
+    self._testDeserializeFailsInconsistentRankHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse)
+
+  def testDeserializeManyFailsInconsistentRank(self):
+    self._testDeserializeFailsInconsistentRankHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_many_sparse)
+
+  def testVariantDeserializeFailsInconsistentRank(self):
+    self._testDeserializeFailsInconsistentRankHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse,
+        dtypes.variant)
+
+  def _testDeserializeFailsInvalidProtoHelper(self,
+                                              serialize_fn,
+                                              deserialize_fn,
+                                              out_type=dtypes.string):
     with self.test_session(use_gpu=False) as sess:
       sp_input0 = self._SparseTensorPlaceholder()
       input0_val = self._SparseTensorValue_5x6(np.arange(6))
-      serialized0 = sparse_ops.serialize_sparse(sp_input0)
+      serialized0 = serialize_fn(sp_input0, out_type=out_type)
       serialized1 = ["a", "b", "c"]
       serialized_concat = array_ops.stack([serialized0, serialized1])
 
-      sp_deserialized = sparse_ops.deserialize_many_sparse(
-          serialized_concat, dtype=dtypes.int32)
+      sp_deserialized = deserialize_fn(serialized_concat, dtype=dtypes.int32)
 
-      with self.assertRaisesOpError(
-          r"Could not parse serialized_sparse\[1, 0\]"):
+      with self.assertRaisesOpError(r"Could not parse serialized proto"):
         sess.run(sp_deserialized, {sp_input0: input0_val})
 
+  def testDeserializeFailsInvalidProto(self):
+    self._testDeserializeFailsInvalidProtoHelper(sparse_ops.serialize_sparse,
+                                                 sparse_ops.deserialize_sparse)
+
+  def testDeserializeManyFailsInvalidProto(self):
+    self._testDeserializeFailsInvalidProtoHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_many_sparse)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/summary_image_op_test.py b/tensorflow/python/kernel_tests/summary_image_op_test.py
index d2152ab560ad27b8a761ff8029fa425fdc9ff20d..4718827e8885c328cb2e84c2f1e8880bdbdb6cae 100644
--- a/tensorflow/python/kernel_tests/summary_image_op_test.py
+++ b/tensorflow/python/kernel_tests/summary_image_op_test.py
@@ -50,7 +50,6 @@ class SummaryImageOpTest(test.TestCase):
     self.assertProtoEquals(expected, image_summ)
 
   def testImageSummary(self):
-    np.random.seed(7)
     for depth in (1, 3, 4):
       for positive in False, True:
         with self.test_session(graph=ops.Graph()) as sess:
diff --git a/tensorflow/python/kernel_tests/template_test.py b/tensorflow/python/kernel_tests/template_test.py
index 40c0ade62a8df5a73b61c5679685ad9368c9dbbf..f0354374ac82ee6ac201095c24716f51589fa965 100644
--- a/tensorflow/python/kernel_tests/template_test.py
+++ b/tensorflow/python/kernel_tests/template_test.py
@@ -34,9 +34,10 @@ from tensorflow.python.platform import test
 from tensorflow.python.training import gradient_descent
 
 
-def variable_scoped_function():
+def variable_scoped_function(trainable=True):
   return variable_scope.get_variable(
-      "dummy", shape=[1], initializer=init_ops.zeros_initializer())
+      "dummy", shape=[1], trainable=trainable,
+      initializer=init_ops.zeros_initializer())
 
 
 def internally_variable_scoped_function(scope_name):
@@ -413,7 +414,7 @@ class TemplateTest(test.TestCase):
     self.assertEqual(custom_getter_count[0], 2)
 
     # Test that custom getter is called when the variable scope is created
-  # during construction
+    # during construction
     custom_getter_count[0] = 0
     tmpl2 = template.make_template(
         "s2",
@@ -539,6 +540,36 @@ class TemplateTest(test.TestCase):
     # Ensure we can get the scopes before either template is actually called.
     self.assertEqual(1, len(ta.trainable_variables))
     self.assertEqual(1, len(tb.trainable_variables))
+    # None non-trainable variable was created.
+    self.assertEqual([], list(ta.non_trainable_variables))
+    self.assertEqual([], list(tb.non_trainable_variables))
+    # Ensure variables returns all the variables.
+    self.assertEqual(1, len(ta.variables))
+    self.assertEqual(1, len(tb.variables))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_non_trainable_variables(self):
+    # Make sure non_trainable_variables are created.
+    with variable_scope.variable_scope("foo2"):
+      ta = template.make_template("a", variable_scoped_function,
+                                  trainable=True)
+      tb = template.make_template("b", variable_scoped_function,
+                                  trainable=False)
+    # Initially there are not variables created.
+    self.assertEqual([], list(ta.variables))
+    self.assertEqual([], list(tb.variables))
+    # After calling there are variables created.
+    ta()
+    tb()
+    # Check the trainable and non_trainable variables.
+    self.assertEqual(1, len(ta.trainable_variables))
+    self.assertEqual([], list(ta.non_trainable_variables))
+
+    self.assertEqual([], list(tb.trainable_variables))
+    self.assertEqual(1, len(tb.non_trainable_variables))
+    # Ensure variables returns all the variables.
+    self.assertEqual(1, len(ta.variables))
+    self.assertEqual(1, len(tb.variables))
 
   # TODO(apassos) handle local variables in Eager
   def test_local_variables(self):
diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
index 835fdbe2aa531ed28f59279e4e83d9f8297a3b98..aad2443eea7ad87faf481973e91ca3df32ccfb44 100644
--- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py
+++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
@@ -43,10 +43,6 @@ import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 
 
-# TODO(ebrevdo): Delete this line after Dec. 4, 2017.
-tensor_array_ops._ENABLE_IDENTICAL_ELEMENT_SHAPES = True
-
-
 def _make_converter(tf_dtype):
   def _converter(x):
     if tf_dtype == dtypes.string:
diff --git a/tensorflow/python/kernel_tests/transpose_op_test.py b/tensorflow/python/kernel_tests/transpose_op_test.py
index c551d9c3d056b50600d1331749ba865439748f7e..290200ce45488a9796f437d9f748e06483e83d96 100644
--- a/tensorflow/python/kernel_tests/transpose_op_test.py
+++ b/tensorflow/python/kernel_tests/transpose_op_test.py
@@ -53,11 +53,11 @@ class TransposeTest(test.TestCase):
       # Gradient check on CPU.
       xs = list(np.shape(x))
       ys = list(np.shape(tf_ans))
-      if x.dtype == np.float32:
+      if x.dtype in [np.float32, np.complex64]:
         jacob_t, jacob_n = gradient_checker.compute_gradient(inx, xs, y, ys, x,
                                                              1e-2)
         self.assertAllClose(jacob_t, jacob_n, 1e-3, 1e-3)
-      elif x.dtype == np.float64:
+      elif x.dtype in [np.float64, np.complex128]:
         jacob_t, jacob_n = gradient_checker.compute_gradient(inx, xs, y, ys, x,
                                                              1e-2)
         self.assertAllClose(jacob_t, jacob_n, 1e-6, 1e-6)
diff --git a/tensorflow/python/kernel_tests/unique_op_test.py b/tensorflow/python/kernel_tests/unique_op_test.py
index 04758ce45a0ba00c8adbcf29d7afedb71b43a08c..6390b7c51808cf338f0651bbbdb30c7b71af7d8e 100644
--- a/tensorflow/python/kernel_tests/unique_op_test.py
+++ b/tensorflow/python/kernel_tests/unique_op_test.py
@@ -87,6 +87,7 @@ class UniqueTest(test.TestCase):
     for i in range(len(x)):
       self.assertEqual(x[i], tf_y[tf_idx[i]])
 
+
 class UniqueWithCountsTest(test.TestCase):
 
   def testInt32(self):
diff --git a/tensorflow/python/kernel_tests/unstack_op_test.py b/tensorflow/python/kernel_tests/unstack_op_test.py
index c2dcff978a4ac07b290352c98f2fc062583a3df1..d93710859976e1c01ba4e7c25034ae6cb095368e 100644
--- a/tensorflow/python/kernel_tests/unstack_op_test.py
+++ b/tensorflow/python/kernel_tests/unstack_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.platform import test
@@ -42,15 +43,33 @@ class UnstackOpTest(test.TestCase):
     np.random.seed(7)
     with self.test_session(use_gpu=True):
       for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
-        data = np.random.randn(*shape)
-        # Convert data to a single tensorflow tensor
-        x = constant_op.constant(data)
-        # Unpack into a list of tensors
-        cs = array_ops.unstack(x, num=shape[0])
-        self.assertEqual(type(cs), list)
-        self.assertEqual(len(cs), shape[0])
-        cs = [c.eval() for c in cs]
-        self.assertAllEqual(cs, data)
+        for dtype in [np.bool, np.float16, np.float32, np.float64, np.int32, np.int64]:
+          data = np.random.randn(*shape).astype(dtype)
+          # Convert data to a single tensorflow tensor
+          x = constant_op.constant(data)
+          # Unpack into a list of tensors
+          cs = array_ops.unstack(x, num=shape[0])
+          self.assertEqual(type(cs), list)
+          self.assertEqual(len(cs), shape[0])
+          cs = [c.eval() for c in cs]
+          self.assertAllEqual(cs, data)
+
+  def testSimpleGpu(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+    np.random.seed(7)
+    with self.test_session(use_gpu=True, force_gpu=True):
+      for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
+        for dtype in [np.float16, np.float32, np.float64, np.int32, np.int64]:
+          data = np.random.randn(*shape).astype(dtype)
+          # Convert data to a single tensorflow tensor
+          x = constant_op.constant(data)
+          # Unpack into a list of tensors
+          cs = array_ops.unstack(x, num=shape[0])
+          self.assertEqual(type(cs), list)
+          self.assertEqual(len(cs), shape[0])
+          cs = [c.eval() for c in cs]
+          self.assertAllEqual(cs, data)
 
   def testGradientsAxis0(self):
     for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index 53962149561c8aad1eb48f30d304e7c37021ba96..8d57ff03c81be6a189a15eccdef7ed1d5a20e77b 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -128,6 +128,8 @@ class VariableScopeTest(test.TestCase):
       self.assertTrue(w in store.variables())
       self.assertTrue(v in store.trainable_variables())
       self.assertFalse(w in store.trainable_variables())
+      self.assertFalse(v in store.non_trainable_variables())
+      self.assertTrue(w in store.non_trainable_variables())
 
   @test_util.run_in_graph_and_eager_modes()
   def testInitFromNonTensorValue(self):
@@ -744,6 +746,115 @@ class VariableScopeTest(test.TestCase):
           with ops.name_scope("scope2") as sc2:
             self.assertEqual(sc2, "outer_1/default/scope2/")
 
+  def testBasicWhenAuxiliaryNameScopeIsFalse(self):
+    with self.test_session():
+      with variable_scope.variable_scope("scope",
+                                         auxiliary_name_scope=False) as scope:
+        self.assertEqual(scope.original_name_scope, "")
+        self.assertEqual(variable_scope.get_variable("w", []).name, "scope/w:0")
+        self.assertEqual(constant_op.constant([], name="c").name, "c:0")
+      with variable_scope.variable_scope(scope,
+                                         auxiliary_name_scope=False) as scope1:
+        self.assertEqual(scope.original_name_scope, "")
+        self.assertEqual(variable_scope.get_variable("w1", []).name, "scope/w1:0")
+        self.assertEqual(constant_op.constant([], name="c1").name, "c1:0")
+      # Recheck: new name scope is NOT created before
+      with ops.name_scope("scope"):
+        self.assertEqual(constant_op.constant([], name="c").name, "scope/c:0")
+
+      with variable_scope.variable_scope("outer"):
+        with variable_scope.variable_scope("inner",
+                                           auxiliary_name_scope=False) as inner:
+          self.assertEqual(inner.original_name_scope, "outer/")
+          self.assertEqual(variable_scope.get_variable("w", []).name, "outer/inner/w:0")
+          self.assertEqual(constant_op.constant([], name="c").name, "outer/c:0")
+        with variable_scope.variable_scope(inner,
+                                           auxiliary_name_scope=False) as inner1:
+          self.assertEqual(inner1.original_name_scope, "outer/")
+          self.assertEqual(variable_scope.get_variable("w1", []).name, "outer/inner/w1:0")
+          self.assertEqual(constant_op.constant([], name="c1").name, "outer/c1:0")
+        # Recheck: new name scope is NOT created before
+        with ops.name_scope("inner"):
+          self.assertEqual(constant_op.constant([], name="c").name, "outer/inner/c:0")
+
+  def testCreatedByDefaultNameWhenAuxiliaryNameScopeIsFalse(self):
+    with self.test_session():
+      with variable_scope.variable_scope(None, default_name="default",
+                                         auxiliary_name_scope=False) as scope:
+        self.assertEqual(scope.original_name_scope, "")
+        self.assertEqual(variable_scope.get_variable("w", []).name, "default/w:0")
+        self.assertEqual(constant_op.constant([], name="c").name, "c:0")
+      # Recheck: new name scope is NOT created before
+      with ops.name_scope("default"):
+        self.assertEqual(constant_op.constant([], name="c").name, "default/c:0")
+
+      with variable_scope.variable_scope("outer"):
+        with variable_scope.variable_scope(None, default_name="default",
+                                           auxiliary_name_scope=False) as inner:
+          self.assertEqual(inner.original_name_scope, "outer/")
+          self.assertEqual(variable_scope.get_variable("w", []).name, "outer/default/w:0")
+          self.assertEqual(constant_op.constant([], name="c").name, "outer/c:0")
+        # Recheck: new name scope is NOT created before
+        with ops.name_scope("default"):
+          self.assertEqual(constant_op.constant([], name="c").name, "outer/default/c:0")
+
+  def testReenterRootScopeWhenAuxiliaryNameScopeIsFalse(self):
+    with self.test_session():
+      root_scope = variable_scope.get_variable_scope()
+      with variable_scope.variable_scope(root_scope,
+                                         auxiliary_name_scope=False) as scope:
+        self.assertEqual(scope.original_name_scope, "")
+        self.assertEqual(variable_scope.get_variable("w", []).name, "w:0")
+        self.assertEqual(constant_op.constant([], name="c").name, "c:0")
+
+      with variable_scope.variable_scope("outer"):
+        with variable_scope.variable_scope(root_scope,
+                                           auxiliary_name_scope=False) as inner:
+          self.assertEqual(inner.original_name_scope, "")
+          self.assertEqual(variable_scope.get_variable("w1", []).name, "w1:0")
+          self.assertEqual(constant_op.constant([], name="c1").name, "outer/c1:0")
+
+  def testAuxiliaryNameScopeIsInvalid(self):
+    with self.test_session():
+      with self.assertRaisesRegexp(TypeError, "auxiliary_name_scope"):
+        with variable_scope.variable_scope(None, default_name="scope",
+                                           auxiliary_name_scope="invalid"):
+          pass
+
+      with self.assertRaisesRegexp(TypeError, "auxiliary_name_scope"):
+        with variable_scope.variable_scope("scope", auxiliary_name_scope="invalid"):
+          pass
+
+      with variable_scope.variable_scope("scope") as scope:
+        pass
+      with self.assertRaisesRegexp(TypeError, "auxiliary_name_scope"):
+        with variable_scope.variable_scope(scope, auxiliary_name_scope="invalid"):
+          pass
+
+  def testReuseScopeWithoutNameScopeCollision(self):
+    # Github issue: #13429
+    with self.test_session():
+      with variable_scope.variable_scope("outer"):
+        with variable_scope.variable_scope("inner") as inner:
+          pass
+
+      with variable_scope.variable_scope(inner,
+                                         auxiliary_name_scope=False) as scope:
+        with ops.name_scope(scope.original_name_scope):
+          self.assertEqual(variable_scope.get_variable("w", []).name, "outer/inner/w:0")
+          self.assertEqual(constant_op.constant([], name="c").name, "outer/inner/c:0")
+        with ops.name_scope("inner"):
+          self.assertEqual(constant_op.constant([], name="c").name, "inner/c:0")
+
+      with variable_scope.variable_scope("another"):
+        with variable_scope.variable_scope(inner,
+                                           auxiliary_name_scope=False) as scope1:
+          with ops.name_scope(scope1.original_name_scope):
+            self.assertEqual(variable_scope.get_variable("w1", []).name, "outer/inner/w1:0")
+            self.assertEqual(constant_op.constant([], name="c1").name, "outer/inner/c1:0")
+          with ops.name_scope("inner"):
+            self.assertEqual(constant_op.constant([], name="c").name, "another/inner/c:0")
+
   @test_util.run_in_graph_and_eager_modes()
   def testGetLocalVar(self):
     # Check that local variable respects naming.
@@ -899,35 +1010,6 @@ def axis0_into3_partitioner(shape=None, **unused_kwargs):
 
 class VariableScopeWithPartitioningTest(test.TestCase):
 
-  def testInitFromNonInitializer(self):
-    with self.test_session() as sess:
-      # Test various dtypes with zeros initializer as following:
-      types = [
-          dtypes.int8, dtypes.uint8, dtypes.int16, dtypes.uint16, dtypes.int32,
-          dtypes.int64, dtypes.bool
-      ]
-
-      # Use different variable_name to distinguish various dtypes
-      for (i, dtype) in enumerate(types):
-        x = variable_scope.get_variable(
-            name="x%d" % i,
-            shape=(3, 4),
-            dtype=dtype,
-            partitioner=axis0_into2_partitioner)
-        y = variable_scope.get_variable(
-            name="y%d" % i,
-            shape=(6, 4),
-            dtype=dtype,
-            partitioner=axis0_into2_partitioner,
-            initializer=init_ops.zeros_initializer(dtype=dtype))
-
-        variables_lib.global_variables_initializer().run()
-        # x and y would become var list after partition
-        val_x = sess.run(list(x))
-        val_y = sess.run(list(y))
-
-        self.assertAllEqual(val_x, val_y)
-
   def testResultNameMatchesRequested(self):
     with variable_scope.variable_scope(
         "scope0", partitioner=axis0_into2_partitioner):
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index 6be2bc3e7692bdba569f011243f368f0ee7abc94..acbbb213220c3c3ceaf2faada9e69f83334a557e 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -103,10 +103,16 @@ class Layer(object):
     self.built = False
     self.input_spec = None
 
+    if activity_regularizer and context.in_eager_mode():
+      raise ValueError(
+          ('Activity regularization is not supported when executing eagerly. '
+           'Got activity_regularizer=%s') % (activity_regularizer,))
     self._activity_regularizer = activity_regularizer
     self._trainable_weights = []
     self._non_trainable_weights = []
     self._updates = []
+    # When executing eagerly, _losses is a list of zero-argument lambdas which
+    # return tensors. When using graph execution, _losses is a list of ops.
     self._losses = []
     self._reuse = kwargs.get('_reuse')
     self._graph = ops.get_default_graph()
@@ -287,9 +293,22 @@ class Layer(object):
 
   @property
   def losses(self):
+    """Losses which are associated with this `Layer`.
+
+    Note that when executing eagerly, getting this property evaluates
+    regularizers. When using graph execution, variable regularization ops have
+    already been created and are simply returned here.
+
+    Returns:
+      A list of tensors.
+    """
     if context.in_eager_mode():
-      raise RuntimeError('Layer.losses not supported in Eager mode.')
-    return self._losses
+      # _losses may only contain variable regularization losses when executing
+      # eagerly, and they have been saved as lambdas to be executed when
+      # requested.
+      return [regularizer() for regularizer in self._losses]
+    else:
+      return self._losses
 
   def add_loss(self, losses, inputs=None):
     """Add loss tensor(s), potentially dependent on layer inputs.
@@ -303,6 +322,11 @@ class Layer(object):
     The `get_losses_for` method allows to retrieve the losses relevant to a
     specific set of inputs.
 
+    Note that `add_loss` is not supported when executing eagerly. Instead,
+    variable regularizers may be added through `add_variable`. Activity
+    regularization is not supported directly (but such losses may be returned
+    from `Layer.call()`).
+
     Arguments:
       losses: Loss tensor, or list/tuple of tensors.
       inputs: Optional input tensor(s) that the loss(es) depend on. Must
@@ -462,22 +486,15 @@ class Layer(object):
     Raises:
       RuntimeError: If called in Eager mode with regularizers.
     """
-    # Note that we currently don't support variable regularization in Eager
-    # mode. An alternative is for users to directly compute these losses before
-    # performing a backward pass.
     if context.in_graph_mode():
       existing_variables = set(tf_variables.global_variables())
-    else:
-      existing_variables = []
-      if regularizer is not None:
-        raise RuntimeError('Variable regularization not supported in Eager '
-                           'mode.')
     if dtype is None:
       dtype = self.dtype or dtypes.float32
 
     self._set_scope(None)
     with vs.variable_scope(
-        self._scope, reuse=(self.built or self._reuse)) as scope:
+        self._scope, reuse=(self.built or self._reuse),
+        auxiliary_name_scope=False) as scope:
       with ops.name_scope(self._name_scope_name(scope)):
         variable = vs.get_variable(name,
                                    shape=shape,
@@ -486,28 +503,39 @@ class Layer(object):
                                    constraint=constraint,
                                    trainable=trainable and self.trainable,
                                    partitioner=partitioner)
-        if (context.in_graph_mode() and trainable and self.trainable
-            and variable not in tf_variables.trainable_variables()):
-          # A custom getter / variable scope overrode the trainable flag.
-          trainable = False
-        if variable in existing_variables:
-          return variable
-        if regularizer:
-          # To match the behavior of tf.get_variable(), we only
-          # apply regularization if the variable is newly created.
-          if isinstance(variable, tf_variables.PartitionedVariable):
-            for v in variable:
-              with ops.colocate_with(v.op):
+        if context.in_graph_mode():
+          if (trainable and self.trainable
+              and variable not in tf_variables.trainable_variables()):
+            # A custom getter / variable scope overrode the trainable flag.
+            trainable = False
+          if variable in existing_variables:
+            return variable
+          if regularizer:
+            # To match the behavior of tf.get_variable(), we only
+            # apply regularization if the variable is newly created.
+            if isinstance(variable, tf_variables.PartitionedVariable):
+              for v in variable:
+                with ops.colocate_with(v.op):
+                  with ops.name_scope(name + '/Regularizer'):
+                    regularization = regularizer(v)
+                if regularization is not None:
+                  self.add_loss(regularization)
+            else:
+              with ops.colocate_with(variable.op):
                 with ops.name_scope(name + '/Regularizer'):
-                  regularization = regularizer(v)
+                  regularization = regularizer(variable)
               if regularization is not None:
                 self.add_loss(regularization)
-          else:
-            with ops.colocate_with(variable.op):
-              with ops.name_scope(name + '/Regularizer'):
-                regularization = regularizer(variable)
-            if regularization is not None:
-              self.add_loss(regularization)
+        elif regularizer:
+          if isinstance(variable, tf_variables.PartitionedVariable):
+            raise RuntimeError(
+                'Partitioned variable regularization is not yet supported when '
+                'executing eagerly. File a feature request is this is '
+                'important to you.')
+          # Save a zero-argument lambda which runs the regularizer on the
+          # variable, to be executed when `Layer.losses` is requested. This
+          # makes losses responsive to variable updates when executing eagerly.
+          self._losses.append(lambda: regularizer(variable))
     if trainable:
       self._trainable_weights.append(variable)
     else:
@@ -575,11 +603,11 @@ class Layer(object):
         # variable scope with this setting. We avoid re-creating variable scopes
         # after this point as an optimization.
         self._always_reuse_variable_scope = vs.variable_scope(
-            self._scope, reuse=True)
+            self._scope, reuse=True, auxiliary_name_scope=False)
         scope_context_manager = self._always_reuse_variable_scope
     else:
       scope_context_manager = vs.variable_scope(
-          self._scope, reuse=self._reuse)
+          self._scope, reuse=self._reuse, auxiliary_name_scope=False)
     with scope_context_manager as scope:
       with ops.name_scope(self._name_scope_name(scope)):
         if not self.built:
@@ -602,7 +630,7 @@ class Layer(object):
           self._assert_input_compatibility(inputs)
           if input_list and self._dtype is None:
             try:
-              self._dtype = input_list[0].dtype.name
+              self._dtype = input_list[0].dtype.base_dtype.name
             except AttributeError:
               pass
           input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
diff --git a/tensorflow/python/layers/base_test.py b/tensorflow/python/layers/base_test.py
index 1eea20deefe2f033ab9827f9d5b92f8661618d21..42c3693922730825eebdf5f29070f98b08dc0483 100644
--- a/tensorflow/python/layers/base_test.py
+++ b/tensorflow/python/layers/base_test.py
@@ -88,6 +88,11 @@ class BaseLayerTest(test.TestCase):
           regularizer=regularizer)
       self.assertEqual(len(layer.losses), 1)
 
+  def testNoEagerActivityRegularizer(self):
+    with context.eager_mode():
+      with self.assertRaisesRegexp(ValueError, 'activity_regularizer'):
+        core_layers.Dense(1, activity_regularizer=lambda *args, **kwargs: 0.)
+
   def testGetVariable(self):
     with self.test_session():
 
@@ -469,6 +474,62 @@ class BaseLayerTest(test.TestCase):
     layer.apply(x)
     self.assertEqual(len(layer.get_losses_for(x)), 1)
 
+  def testNameScopeIsConsistentWithVariableScope(self):
+    # Github issue 13429.
+    class MyLayer(base_layers.Layer):
+
+      def build(self, input_shape):
+        self.my_var = self.add_variable('my_var', (), dtypes.float32)
+        self.built = True
+
+      def call(self, inputs):
+        return math_ops.multiply(inputs, self.my_var, name='my_op')
+
+    def _gen_layer(x, name=None):
+      layer = MyLayer(name=name)
+      out = layer.apply(x)
+      return layer, out
+
+    # unnamed layer
+    with ops.Graph().as_default():
+      x = array_ops.placeholder(dtypes.float32, (), 'x')
+      layer, op = _gen_layer(x)
+      layer1, op1 = _gen_layer(op)
+      layer2, op2 = _gen_layer(op1)
+
+      self.assertEqual(layer.my_var.name, 'my_layer/my_var:0')
+      self.assertEqual(op.name, 'my_layer/my_op:0')
+      self.assertEqual(layer1.my_var.name, 'my_layer_1/my_var:0')
+      self.assertEqual(op1.name, 'my_layer_1/my_op:0')
+      self.assertEqual(layer2.my_var.name, 'my_layer_2/my_var:0')
+      self.assertEqual(op2.name, 'my_layer_2/my_op:0')
+    # name starts from zero
+    with ops.Graph().as_default():
+      x = array_ops.placeholder(dtypes.float32, (), 'x')
+      layer, op = _gen_layer(x, name='name')
+      layer1, op1 = _gen_layer(op, name='name_1')
+      layer2, op2 = _gen_layer(op1, name='name_2')
+
+      self.assertEqual(layer.my_var.name, 'name/my_var:0')
+      self.assertEqual(op.name, 'name/my_op:0')
+      self.assertEqual(layer1.my_var.name, 'name_1/my_var:0')
+      self.assertEqual(op1.name, 'name_1/my_op:0')
+      self.assertEqual(layer2.my_var.name, 'name_2/my_var:0')
+      self.assertEqual(op2.name, 'name_2/my_op:0')
+    # name starts from one
+    with ops.Graph().as_default():
+      x = array_ops.placeholder(dtypes.float32, (), 'x')
+      layer, op = _gen_layer(x, name='name_1')
+      layer1, op1 = _gen_layer(op, name='name_2')
+      layer2, op2 = _gen_layer(op1, name='name_3')
+
+      self.assertEqual(layer.my_var.name, 'name_1/my_var:0')
+      self.assertEqual(op.name, 'name_1/my_op:0')
+      self.assertEqual(layer1.my_var.name, 'name_2/my_var:0')
+      self.assertEqual(op1.name, 'name_2/my_op:0')
+      self.assertEqual(layer2.my_var.name, 'name_3/my_var:0')
+      self.assertEqual(op2.name, 'name_3/my_op:0')
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index 7213fa1db8ee2eb4a36366464703b30d3f1a84c3..fbb13bb72c435ad3675a8f3f31c568952c043743 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -1232,7 +1232,8 @@ class Conv2DTranspose(Conv2D):
 
   def build(self, input_shape):
     if len(input_shape) != 4:
-      raise ValueError('Inputs should have rank 4. Received input shape: ' + str(input_shape))
+      raise ValueError('Inputs should have rank 4. Received input shape: ' +
+                       str(input_shape))
     if self.data_format == 'channels_first':
       channel_axis = 1
     else:
diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index 7be1fa5cfe95f13f67ee94bb20304fba00b33d1b..44016d5eda59882f9e0def4ac2a236109714c1dd 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -43,7 +43,7 @@ class Dense(base.Layer):
   """Densely-connected layer class.
 
   This layer implements the operation:
-  `outputs = activation(inputs.kernel + bias)`
+  `outputs = activation(inputs * kernel + bias)`
   Where `activation` is the activation function passed as the `activation`
   argument (if not `None`), `kernel` is a weights matrix created by the layer,
   and `bias` is a bias vector created by the layer
diff --git a/tensorflow/python/layers/core_test.py b/tensorflow/python/layers/core_test.py
index 2d47cc69798d8c3e34e14e24301e8be9a00f49bc..2e99f783e010ac054039841102584d71d3f1e4c3 100644
--- a/tensorflow/python/layers/core_test.py
+++ b/tensorflow/python/layers/core_test.py
@@ -59,6 +59,14 @@ class DenseTest(test.TestCase):
     dense.apply(random_ops.random_uniform((5, 2)))
     self.assertEqual(dense.name, 'dense_2')
 
+  def testVariableInput(self):
+    with self.test_session():
+      v = variable_scope.get_variable(
+          'X', initializer=init_ops.zeros_initializer(), shape=(1, 1))
+      x = core_layers.Dense(1)(v)
+      variables.global_variables_initializer().run()
+      self.assertAllEqual(x.eval(), [[0.0]])
+
   @test_util.run_in_graph_and_eager_modes()
   def testCall(self):
     dense = core_layers.Dense(2, activation=nn_ops.relu, name='my_dense')
diff --git a/tensorflow/python/layers/network.py b/tensorflow/python/layers/network.py
index 9a33a5c7269f100b12d35f77add74c310ea37722..edc52545f92cb9b9c6f78f5c58fe44b3187d370b 100644
--- a/tensorflow/python/layers/network.py
+++ b/tensorflow/python/layers/network.py
@@ -181,11 +181,11 @@ def Input(  # pylint: disable=invalid-name
 class GraphNetwork(base.Layer):
   """A GraphNetwork is a directed acyclic graph of layers.
 
-  It is the topological form of a "model".
-  A Model is simply a GraphNetwork with added training/evaluation routines.
+  It is the topological form of a `tf.keras.models.Model`. A `Model` is simply a
+  `GraphNetwork` with added training/evaluation routines.
 
-  A GraphNetwork instance implements the full Layer API. In particular, a
-  GraphNetwork can be called on new inputs.
+  A `GraphNetwork` instance implements the full `Layer` API. In particular, a
+  `GraphNetwork` can be called on new inputs.
 
   Example:
 
diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py
index dc39e96f87ec1673b5983ef6ee02e356ae4e71f1..65e67dd016fcf4fe6e395bf983b560cd2c7b0f8a 100644
--- a/tensorflow/python/layers/normalization.py
+++ b/tensorflow/python/layers/normalization.py
@@ -142,7 +142,10 @@ class BatchNormalization(base.Layer):
                **kwargs):
     super(BatchNormalization, self).__init__(
         name=name, trainable=trainable, **kwargs)
-    self.axis = axis
+    if isinstance(axis, list):
+      self.axis = axis[:]
+    else:
+      self.axis = axis
     self.momentum = momentum
     self.epsilon = epsilon
     self.center = center
@@ -238,7 +241,7 @@ class BatchNormalization(base.Layer):
                          'axis == [1] or axis == [3]')
 
     # Raise parameters of fp16 batch norm to fp32
-    if self.dtype == dtypes.float16:
+    if self.dtype == dtypes.float16 or self.dtype == dtypes.bfloat16:
       param_dtype = dtypes.float32
     else:
       param_dtype = self.dtype or dtypes.float32
@@ -264,34 +267,34 @@ class BatchNormalization(base.Layer):
           self.axis[idx] = x + 1      # Account for added dimension
 
     if self.scale:
-      self.gamma = self.add_variable(name='gamma',
-                                     shape=param_shape,
-                                     dtype=param_dtype,
-                                     initializer=self.gamma_initializer,
-                                     regularizer=self.gamma_regularizer,
-                                     constraint=self.gamma_constraint,
-                                     trainable=True)
+      self.gamma = self.add_variable(
+          name='gamma',
+          shape=param_shape,
+          dtype=param_dtype,
+          initializer=self.gamma_initializer,
+          regularizer=self.gamma_regularizer,
+          constraint=self.gamma_constraint,
+          trainable=True)
     else:
       self.gamma = None
       if self.fused:
-        self._gamma_const = array_ops.constant(1.0,
-                                               dtype=param_dtype,
-                                               shape=param_shape)
+        self._gamma_const = array_ops.constant(
+            1.0, dtype=param_dtype, shape=param_shape)
 
     if self.center:
-      self.beta = self.add_variable(name='beta',
-                                    shape=param_shape,
-                                    dtype=param_dtype,
-                                    initializer=self.beta_initializer,
-                                    regularizer=self.beta_regularizer,
-                                    constraint=self.beta_constraint,
-                                    trainable=True)
+      self.beta = self.add_variable(
+          name='beta',
+          shape=param_shape,
+          dtype=param_dtype,
+          initializer=self.beta_initializer,
+          regularizer=self.beta_regularizer,
+          constraint=self.beta_constraint,
+          trainable=True)
     else:
       self.beta = None
       if self.fused:
-        self._beta_const = array_ops.constant(0.0,
-                                              dtype=param_dtype,
-                                              shape=param_shape)
+        self._beta_const = array_ops.constant(
+            0.0, dtype=param_dtype, shape=param_shape)
 
     # Disable variable partitioning when creating the moving mean and variance
     try:
@@ -324,11 +327,12 @@ class BatchNormalization(base.Layer):
         # stack to be cleared. The nested ones use a `lambda` to set the desired
         # device and ignore any devices that may be set by the custom getter.
         def _renorm_variable(name, shape):
-          var = self.add_variable(name=name,
-                                  shape=shape,
-                                  dtype=param_dtype,
-                                  initializer=init_ops.zeros_initializer(),
-                                  trainable=False)
+          var = self.add_variable(
+              name=name,
+              shape=shape,
+              dtype=param_dtype,
+              initializer=init_ops.zeros_initializer(),
+              trainable=False)
           return var
 
         with ops.device(None):
diff --git a/tensorflow/python/layers/normalization_test.py b/tensorflow/python/layers/normalization_test.py
index b2876c58c20c7da45b6505278526c307cdc8eb47..e147f348b0a60dbefb38aa9f89318f261c03684e 100644
--- a/tensorflow/python/layers/normalization_test.py
+++ b/tensorflow/python/layers/normalization_test.py
@@ -101,15 +101,21 @@ class BNTest(test.TestCase):
       loss_val = sess.run(loss, feed_dict={image: image_val})
       return loss_val
 
-  def _trainEvalSequence(self,
-                         dtype,
-                         train1_use_gpu,
-                         train2_use_gpu,
+  def _trainEvalSequence(self, dtype, train1_use_gpu, train2_use_gpu,
                          infer_use_gpu):
     batch, height, width, input_channels = 2, 4, 5, 3
     shape = [batch, height, width, input_channels]
-    checkpoint = os.path.join(self.get_temp_dir(), 'cp_%s_%s_%s_%s' %
-        (dtype, train1_use_gpu, train2_use_gpu, infer_use_gpu))
+
+    # Not all characters in a dtype string representation are allowed in
+    # filenames in all operating systems. This map will sanitize these.
+    dtype_to_valid_fn = {
+        dtypes.float16: 'float16',
+        dtypes.float32: 'float32',
+    }
+    checkpoint = os.path.join(
+        self.get_temp_dir(), 'cp_%s_%s_%s_%s' % (
+            dtype_to_valid_fn[dtype], train1_use_gpu, train2_use_gpu,
+            infer_use_gpu))
 
     self._train(
         checkpoint,
@@ -130,30 +136,27 @@ class BNTest(test.TestCase):
         dtype=dtype)
 
     np.random.seed(0)
-    image_val = np.random.rand(batch,
-                               height,
-                               width,
-                               input_channels).astype(dtype.as_numpy_dtype)
-    loss_val = self._infer(checkpoint, image_val, shape,
-                           use_gpu=infer_use_gpu, is_fused=True)
+    image_val = np.random.rand(batch, height, width, input_channels).astype(
+        dtype.as_numpy_dtype)
+    loss_val = self._infer(
+        checkpoint, image_val, shape, use_gpu=infer_use_gpu, is_fused=True)
 
     return train_vars, loss_val
 
   def testHalfPrecision(self):
-    ref_vars, ref_loss = self._trainEvalSequence(dtype=dtypes.float32,
-                                                 train1_use_gpu=True,
-                                                 train2_use_gpu=True,
-                                                 infer_use_gpu=True)
- 
+    ref_vars, ref_loss = self._trainEvalSequence(
+        dtype=dtypes.float32,
+        train1_use_gpu=True,
+        train2_use_gpu=True,
+        infer_use_gpu=True)
+
     self.assertEqual(len(ref_vars), 5)
 
     for train1_use_gpu in [True, False]:
       for train2_use_gpu in [True, False]:
         for infer_use_gpu in [True, False]:
-          test_vars, test_loss = self._trainEvalSequence(dtypes.float16,
-                                                         train1_use_gpu,
-                                                         train2_use_gpu,
-                                                         infer_use_gpu)
+          test_vars, test_loss = self._trainEvalSequence(
+              dtypes.float16, train1_use_gpu, train2_use_gpu, infer_use_gpu)
           self.assertEqual(len(test_vars), 5)
           for test_var, ref_var in zip(test_vars, ref_vars):
             self.assertAllClose(test_var, ref_var, rtol=1.e-3, atol=1.e-3)
@@ -281,9 +284,8 @@ class BNTest(test.TestCase):
   def testCreateFusedBNFloat16(self):
     # Call layer.
     bn = normalization_layers.BatchNormalization(axis=1, fused=True)
-    inputs = random_ops.random_uniform((5, 4, 3, 3),
-                                       seed=1,
-                                       dtype=dtypes.float16)
+    inputs = random_ops.random_uniform(
+        (5, 4, 3, 3), seed=1, dtype=dtypes.float16)
     training = array_ops.placeholder(dtype='bool')
     outputs = bn.apply(inputs, training=training)
 
diff --git a/tensorflow/python/layers/utils.py b/tensorflow/python/layers/utils.py
index 766a6800d443a79d9bd130833c27f26c844cadaf..64c7124a4321fbea63859d7a18b3be3c4b28abd3 100644
--- a/tensorflow/python/layers/utils.py
+++ b/tensorflow/python/layers/utils.py
@@ -208,7 +208,7 @@ def smart_cond(pred, fn1, fn2, name=None):
     else:
       return fn2()
   else:
-    return control_flow_ops.cond(pred, fn1, fn2, name)
+    return control_flow_ops.cond(pred, true_fn=fn1, false_fn=fn2, name=name)
 
 
 def constant_value(pred):
diff --git a/tensorflow/python/lib/core/bfloat16.cc b/tensorflow/python/lib/core/bfloat16.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4902978e2dcd108fb4e07dae2d3cf3f7cf880978
--- /dev/null
+++ b/tensorflow/python/lib/core/bfloat16.cc
@@ -0,0 +1,568 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/python/lib/core/bfloat16.h"
+
+#include "tensorflow/core/framework/numeric_types.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/python/lib/core/numpy.h"
+#include "tensorflow/python/lib/core/safe_ptr.h"
+
+namespace tensorflow {
+namespace {
+
+// Workarounds for Python 2 vs 3 API differences.
+#if PY_MAJOR_VERSION < 3
+
+PyObject* MakePyString(const string& s) {
+  return PyString_FromString(s.c_str());
+}
+
+typedef long HashType;  // NOLINT
+
+bool TfPyInt_Check(PyObject* object) { return PyInt_Check(object); }
+
+PyObject* TfPyInt_FromLong(long x) {  // NOLINT
+  return PyInt_FromLong(x);
+}
+
+long TfPyInt_AsLong(PyObject* x) {  // NOLINT
+  return PyInt_AsLong(x);
+}
+
+#else  // PY_MAJOR_VERSION < 3
+
+PyObject* MakePyString(const string& s) {
+  return PyUnicode_FromString(s.c_str());
+}
+
+bool TfPyInt_Check(PyObject* object) {
+  if (!PyLong_Check(object)) {
+    return 0;
+  }
+  int overflow = 0;
+  PyLong_AsLongAndOverflow(object, &overflow);
+  return (overflow == 0);
+}
+
+PyObject* TfPyInt_FromLong(long x) {  // NOLINT
+  return PyLong_FromLong(x);
+}
+
+long TfPyInt_AsLong(PyObject* x) {  // NOLINT
+  return PyLong_AsLong(x);
+}
+
+typedef Py_hash_t HashType;
+
+#endif  // PY_MAJOR_VERSION < 3
+
+// Forward declaration.
+extern PyTypeObject PyBfloat16_Type;
+
+// Representation of a Python bfloat16 object.
+struct PyBfloat16 {
+  PyObject_HEAD;  // Python object header
+  bfloat16 value;
+};
+
+// Returns true if 'object' is a PyBfloat16.
+bool PyBfloat16_Check(PyObject* object) {
+  return PyObject_IsInstance(object,
+                             reinterpret_cast<PyObject*>(&PyBfloat16_Type));
+}
+
+// Extracts the value of a PyBfloat16 object.
+bfloat16 PyBfloat16_Bfloat16(PyObject* object) {
+  return reinterpret_cast<PyBfloat16*>(object)->value;
+}
+
+// Constructs a PyBfloat16 object from a bfloat16.
+Safe_PyObjectPtr PyBfloat16_FromBfloat16(bfloat16 x) {
+  Safe_PyObjectPtr ref =
+      make_safe(PyBfloat16_Type.tp_alloc(&PyBfloat16_Type, 0));
+  PyBfloat16* p = reinterpret_cast<PyBfloat16*>(ref.get());
+  if (p) {
+    p->value = x;
+  }
+  return ref;
+}
+
+// Converts a Python object to a bfloat16 value. Returns true on success,
+// returns false and reports a Python error on failure.
+bool AsBfloat16(PyObject* arg, bfloat16* output) {
+  if (PyBfloat16_Check(arg)) {
+    *output = PyBfloat16_Bfloat16(arg);
+    return true;
+  }
+  if (PyFloat_Check(arg)) {
+    double d = PyFloat_AsDouble(arg);
+    if (PyErr_Occurred()) {
+      return false;
+    }
+    // TODO(phawkins): check for overflow
+    *output = bfloat16(d);
+    return true;
+  }
+  if (TfPyInt_Check(arg)) {
+    long l = TfPyInt_AsLong(arg);  // NOLINT
+    if (PyErr_Occurred()) {
+      return false;
+    }
+    // TODO(phawkins): check for overflow
+    *output = bfloat16(static_cast<float>(l));
+    return true;
+  }
+  if (PyArray_IsScalar(arg, Float)) {
+    float f;
+    PyArray_ScalarAsCtype(arg, &f);
+    *output = bfloat16(f);
+    return true;
+  }
+  PyErr_Format(PyExc_TypeError, "expected number, got %s",
+               arg->ob_type->tp_name);
+  return false;
+}
+
+// Converts a PyBfloat16 into a PyFloat.
+PyObject* PyBfloat16_Float(PyObject* self) {
+  bfloat16 x = PyBfloat16_Bfloat16(self);
+  return PyFloat_FromDouble(static_cast<double>(x));
+}
+
+// Converts a PyBfloat16 into a PyInt.
+PyObject* PyBfloat16_Int(PyObject* self) {
+  bfloat16 x = PyBfloat16_Bfloat16(self);
+  long y = static_cast<long>(x);  // NOLINT
+  return TfPyInt_FromLong(y);
+}
+
+// Negates a PyBfloat16.
+PyObject* PyBfloat16_Negative(PyObject* self) {
+  bfloat16 x = PyBfloat16_Bfloat16(self);
+  return PyBfloat16_FromBfloat16(-x).release();
+}
+
+// Binary arithmetic operators on PyBfloat16 values.
+#define BFLOAT16_BINOP(name, op)                                  \
+  PyObject* PyBfloat16_##name(PyObject* a, PyObject* b) {         \
+    bfloat16 x, y;                                                \
+    if (!AsBfloat16(a, &x) || !AsBfloat16(b, &y)) return nullptr; \
+    bfloat16 z = x op y;                                          \
+    return PyBfloat16_FromBfloat16(z).release();                  \
+  }
+BFLOAT16_BINOP(Add, +)
+BFLOAT16_BINOP(Subtract, -)
+BFLOAT16_BINOP(Multiply, *)
+BFLOAT16_BINOP(Divide, /)
+#undef BFLOAT16_BINOP
+
+// Python number methods for PyBfloat16 objects.
+PyNumberMethods PyBfloat16_AsNumber = {
+    PyBfloat16_Add,       // nb_add
+    PyBfloat16_Subtract,  // nb_subtract
+    PyBfloat16_Multiply,  // nb_multiply
+#if PY_MAJOR_VERSION < 3
+    PyBfloat16_Divide,  // nb_divide
+#endif
+    nullptr,              // nb_remainder
+    nullptr,              // nb_divmod
+    nullptr,              // nb_power
+    PyBfloat16_Negative,  // nb_negative
+    nullptr,              // nb_positive
+    nullptr,              // nb_absolute
+    nullptr,              // nb_nonzero
+    nullptr,              // nb_invert
+    nullptr,              // nb_lshift
+    nullptr,              // nb_rshift
+    nullptr,              // nb_and
+    nullptr,              // nb_xor
+    nullptr,              // nb_or
+#if PY_MAJOR_VERSION < 3
+    nullptr,  // nb_coerce
+#endif
+    PyBfloat16_Int,  // nb_int
+#if PY_MAJOR_VERSION < 3
+    PyBfloat16_Int,  // nb_long
+#else
+    nullptr,  // reserved
+#endif
+    PyBfloat16_Float,  // nb_float
+#if PY_MAJOR_VERSION < 3
+    nullptr,  // nb_oct
+    nullptr,  // nb_hex
+#endif
+
+    nullptr,  // nb_inplace_add
+    nullptr,  // nb_inplace_subtract
+    nullptr,  // nb_inplace_multiply
+#if PY_MAJOR_VERSION < 3
+    nullptr,  // nb_inplace_divide
+#endif
+    nullptr,  // nb_inplace_remainder
+    nullptr,  // nb_inplace_power
+    nullptr,  // nb_inplace_lshift
+    nullptr,  // nb_inplace_rshift
+    nullptr,  // nb_inplace_and
+    nullptr,  // nb_inplace_xor
+    nullptr,  // nb_inplace_or
+
+    nullptr,            // nb_floor_divide
+    PyBfloat16_Divide,  // nb_true_divide
+    nullptr,            // nb_inplace_floor_divide
+    nullptr,            // nb_inplace_true_divide
+    nullptr,            // nb_index
+};
+
+// Constructs a new PyBfloat16.
+PyObject* PyBfloat16_New(PyTypeObject* type, PyObject* args, PyObject* kwds) {
+  if (kwds && PyDict_Size(kwds)) {
+    PyErr_SetString(PyExc_TypeError, "constructor takes no keyword arguments");
+    return nullptr;
+  }
+  Py_ssize_t size = PyTuple_Size(args);
+  if (size != 1) {
+    PyErr_SetString(PyExc_TypeError,
+                    "expected number as argument to bfloat16 constructor");
+    return nullptr;
+  }
+  PyObject* arg = PyTuple_GetItem(args, 0);
+
+  if (PyBfloat16_Check(arg)) {
+    Py_INCREF(arg);
+    return arg;
+  } else {
+    bfloat16 value;
+    if (!AsBfloat16(arg, &value)) {
+      return nullptr;
+    }
+    return PyBfloat16_FromBfloat16(value).release();
+  }
+}
+
+// Comparisons on PyBfloat16s.
+PyObject* PyBfloat16_RichCompare(PyObject* a, PyObject* b, int op) {
+  bfloat16 x, y;
+  if (!AsBfloat16(a, &x) || !AsBfloat16(b, &y)) return nullptr;
+  bool result;
+  switch (op) {
+    case Py_LT:
+      result = x < y;
+      break;
+    case Py_LE:
+      result = x <= y;
+      break;
+    case Py_EQ:
+      result = x == y;
+      break;
+    case Py_NE:
+      result = x != y;
+      break;
+    case Py_GT:
+      result = x > y;
+      break;
+    case Py_GE:
+      result = x >= y;
+      break;
+    default:
+      LOG(FATAL) << "Invalid op type " << op;
+  }
+  return PyBool_FromLong(result);
+}
+
+// Implementation of repr() for PyBfloat16.
+PyObject* PyBfloat16_Repr(PyObject* self) {
+  bfloat16 x = reinterpret_cast<PyBfloat16*>(self)->value;
+  string v = strings::StrCat("bfloat16(", static_cast<float>(x), ")");
+  return MakePyString(v);
+}
+
+// Implementation of str() for PyBfloat16.
+PyObject* PyBfloat16_Str(PyObject* self) {
+  bfloat16 x = reinterpret_cast<PyBfloat16*>(self)->value;
+  string v = strings::StrCat(static_cast<float>(x));
+  return MakePyString(v);
+}
+
+// Hash function for PyBfloat16. We use the identity function, which is a weak
+// hash function.
+HashType PyBfloat16_Hash(PyObject* self) {
+  bfloat16 x = reinterpret_cast<PyBfloat16*>(self)->value;
+  return x.value;
+}
+
+// Python type for PyBfloat16 objects.
+PyTypeObject PyBfloat16_Type = {
+#if PY_MAJOR_VERSION < 3
+    PyObject_HEAD_INIT(nullptr) 0,  // ob_size
+#else
+    PyVarObject_HEAD_INIT(nullptr, 0)
+#endif
+    "bfloat16",                                // tp_name
+    sizeof(PyBfloat16),                        // tp_basicsize
+    0,                                         // tp_itemsize
+    nullptr,                                   // tp_dealloc
+    nullptr,                                   // tp_print
+    nullptr,                                   // tp_getattr
+    nullptr,                                   // tp_setattr
+    nullptr,                                   // tp_compare / tp_reserved
+    PyBfloat16_Repr,                           // tp_repr
+    &PyBfloat16_AsNumber,                      // tp_as_number
+    nullptr,                                   // tp_as_sequence
+    nullptr,                                   // tp_as_mapping
+    PyBfloat16_Hash,                           // tp_hash
+    nullptr,                                   // tp_call
+    PyBfloat16_Str,                            // tp_str
+    nullptr,                                   // tp_getattro
+    nullptr,                                   // tp_setattro
+    nullptr,                                   // tp_as_buffer
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,  // tp_flags
+    "bfloat16 floating-point values",          // tp_doc
+    nullptr,                                   // tp_traverse
+    nullptr,                                   // tp_clear
+    PyBfloat16_RichCompare,                    // tp_richcompare
+    0,                                         // tp_weaklistoffset
+    nullptr,                                   // tp_iter
+    nullptr,                                   // tp_iternext
+    nullptr,                                   // tp_methods
+    nullptr,                                   // tp_members
+    nullptr,                                   // tp_getset
+    nullptr,                                   // tp_base
+    nullptr,                                   // tp_dict
+    nullptr,                                   // tp_descr_get
+    nullptr,                                   // tp_descr_set
+    0,                                         // tp_dictoffset
+    nullptr,                                   // tp_init
+    nullptr,                                   // tp_alloc
+    PyBfloat16_New,                            // tp_new
+    nullptr,                                   // tp_free
+    nullptr,                                   // tp_is_gc
+    nullptr,                                   // tp_bases
+    nullptr,                                   // tp_mro
+    nullptr,                                   // tp_cache
+    nullptr,                                   // tp_subclasses
+    nullptr,                                   // tp_weaklist
+    nullptr,                                   // tp_del
+    0,                                         // tp_version_tag
+};
+
+// Numpy support
+
+PyArray_ArrFuncs NPyBfloat16_ArrFuncs;
+
+PyArray_Descr NPyBfloat16_Descr = {
+    PyObject_HEAD_INIT(nullptr) & PyBfloat16_Type,  // typeobj
+    // We must register bfloat16 with a kind other than "f", because numpy
+    // considers two types with the same kind and size to be equal, but
+    // float16 != bfloat16.
+    'V',  // kind
+    // TODO(phawkins): there doesn't seem to be a way of guaranteeing a type
+    // character is unique.
+    'E',                                                  // type
+    '=',                                                  // byteorder
+    NPY_NEEDS_PYAPI | NPY_USE_GETITEM | NPY_USE_SETITEM,  // hasobject
+    0,                                                    // type_num
+    sizeof(bfloat16),                                     // elsize
+    alignof(bfloat16),                                    // alignment
+    nullptr,                                              // subarray
+    nullptr,                                              // fields
+    nullptr,                                              // names
+    &NPyBfloat16_ArrFuncs,                                // f
+};
+
+// Registered numpy type ID. Global variable populated by the registration code.
+int npy_bfloat16_ = -1;
+
+// Implementations of NumPy array methods.
+
+PyObject* NPyBfloat16_GetItem(void* data, void* arr) {
+  bfloat16 x;
+  memcpy(&x, data, sizeof(bfloat16));
+  return PyBfloat16_FromBfloat16(x).release();
+}
+
+int NPyBfloat16_SetItem(PyObject* item, void* data, void* arr) {
+  bfloat16 x;
+  if (!AsBfloat16(item, &x)) return -1;
+  memcpy(data, &x, sizeof(bfloat16));
+  return 0;
+}
+
+void ByteSwap16(void* value) {
+  char* p = reinterpret_cast<char*>(value);
+  std::swap(p[0], p[1]);
+}
+
+void NPyBfloat16_CopySwapN(void* dstv, npy_intp dstride, void* srcv,
+                           npy_intp sstride, npy_intp n, int swap, void* arr) {
+  char* dst = reinterpret_cast<char*>(dstv);
+  char* src = reinterpret_cast<char*>(srcv);
+  if (!src) {
+    return;
+  }
+  if (swap) {
+    for (npy_intp i = 0; i < n; i++) {
+      char* r = dst + dstride * i;
+      memcpy(r, src + sstride * i, sizeof(uint16_t));
+      ByteSwap16(r);
+    }
+  } else if (dstride == sizeof(uint16_t) && sstride == sizeof(uint16_t)) {
+    memcpy(dst, src, n * sizeof(uint16_t));
+  } else {
+    for (npy_intp i = 0; i < n; i++) {
+      memcpy(dst + dstride * i, src + sstride * i, sizeof(uint16_t));
+    }
+  }
+}
+
+void NPyBfloat16_CopySwap(void* dst, void* src, int swap, void* arr) {
+  if (!src) {
+    return;
+  }
+  memcpy(dst, src, sizeof(uint16_t));
+  if (swap) {
+    ByteSwap16(dst);
+  }
+}
+
+npy_bool NPyBfloat16_NonZero(void* data, void* arr) {
+  bfloat16 x;
+  memcpy(&x, data, sizeof(x));
+  return x != static_cast<bfloat16>(0);
+}
+
+// NumPy casts
+
+// Performs a NumPy array cast from type 'From' to 'To'.
+template <typename From, typename To>
+void NPyCast(void* from_void, void* to_void, npy_intp n, void* fromarr,
+             void* toarr) {
+  const From* from = reinterpret_cast<From*>(from_void);
+  To* to = reinterpret_cast<To*>(to_void);
+  for (npy_intp i = 0; i < n; ++i) {
+    to[i] = static_cast<To>(from[i]);
+  }
+}
+
+// Registers a cast between bfloat16 and type 'T'. 'numpy_type' is the NumPy
+// type corresponding to 'T'. If 'cast_is_safe', registers that bfloat16 can be
+// safely coerced to T.
+template <typename T>
+bool RegisterBfloat16Cast(int numpy_type, bool cast_is_safe) {
+  if (PyArray_RegisterCastFunc(PyArray_DescrFromType(numpy_type), npy_bfloat16_,
+                               NPyCast<T, bfloat16>) < 0) {
+    return false;
+  }
+  if (PyArray_RegisterCastFunc(&NPyBfloat16_Descr, numpy_type,
+                               NPyCast<bfloat16, T>) < 0) {
+    return false;
+  }
+  if (cast_is_safe && PyArray_RegisterCanCast(&NPyBfloat16_Descr, numpy_type,
+                                              NPY_NOSCALAR) < 0) {
+    return false;
+  }
+  return true;
+}
+
+// Initializes the module.
+bool Initialize() {
+  // We hit a mysterious crash if we haven't initialized numpy before this:
+  PyBfloat16_Type.tp_base = &PyGenericArrType_Type;
+
+  if (PyType_Ready(&PyBfloat16_Type) < 0) {
+    return false;
+  }
+
+  // Initializes the NumPy descriptor.
+  PyArray_InitArrFuncs(&NPyBfloat16_ArrFuncs);
+  NPyBfloat16_ArrFuncs.getitem = NPyBfloat16_GetItem;
+  NPyBfloat16_ArrFuncs.setitem = NPyBfloat16_SetItem;
+  NPyBfloat16_ArrFuncs.copyswapn = NPyBfloat16_CopySwapN;
+  NPyBfloat16_ArrFuncs.copyswap = NPyBfloat16_CopySwap;
+  NPyBfloat16_ArrFuncs.nonzero = NPyBfloat16_NonZero;
+
+  Py_TYPE(&NPyBfloat16_Descr) = &PyArrayDescr_Type;
+  npy_bfloat16_ = PyArray_RegisterDataType(&NPyBfloat16_Descr);
+  if (npy_bfloat16_ < 0) return false;
+
+  // Support dtype(bfloat16)
+  if (PyDict_SetItemString(PyBfloat16_Type.tp_dict, "dtype",
+                           reinterpret_cast<PyObject*>(&NPyBfloat16_Descr)) <
+      0) {
+    return false;
+  }
+
+  // Register casts
+
+  // We lie shamelessly and say that a cast from half to bfloat16 is safe.
+  // Numpy frequently uses the smallest legal representation type for small
+  // float constants (e.g., 1.0), which is often float16. Things break if these
+  // cannot be converted transparently to bfloat16.
+  if (!RegisterBfloat16Cast<Eigen::half>(NPY_HALF, /*cast_is_safe=*/true)) {
+    return false;
+  }
+
+  if (!RegisterBfloat16Cast<float>(NPY_FLOAT, /*cast_is_safe=*/true)) {
+    return false;
+  }
+  if (!RegisterBfloat16Cast<double>(NPY_DOUBLE, /*cast_is_safe=*/true)) {
+    return false;
+  }
+  if (!RegisterBfloat16Cast<int32>(NPY_INT32, /*cast_is_safe=*/false)) {
+    return false;
+  }
+  if (!RegisterBfloat16Cast<int64>(NPY_INT64, /*cast_is_safe=*/false)) {
+    return false;
+  }
+  // Following the numpy convention. imag part is dropped when converting to
+  // float.
+  if (!RegisterBfloat16Cast<complex64>(NPY_COMPLEX64, /*cast_is_safe=*/true)) {
+    return false;
+  }
+  if (!RegisterBfloat16Cast<complex128>(NPY_COMPLEX128,
+                                        /*cast_is_safe=*/true)) {
+    return false;
+  }
+  return true;
+}
+
+}  // namespace
+
+void RegisterNumpyBfloat16() {
+  if (npy_bfloat16_ >= 0) {
+    // Already initialized.
+    return;
+  }
+  if (!Initialize()) {
+    if (!PyErr_Occurred()) {
+      PyErr_SetString(PyExc_RuntimeError, "cannot load bfloat16 module.");
+    }
+    PyErr_Print();
+  }
+}
+
+PyObject* Bfloat16PyType() {
+  CHECK(PyBfloat16_Type.tp_base != nullptr);
+  Py_INCREF(&PyBfloat16_Type);
+  return reinterpret_cast<PyObject*>(&PyBfloat16_Type);
+}
+
+int Bfloat16NumpyType() {
+  CHECK_GE(npy_bfloat16_, 0);
+  return npy_bfloat16_;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/python/lib/core/bfloat16.h b/tensorflow/python/lib/core/bfloat16.h
new file mode 100644
index 0000000000000000000000000000000000000000..a609928ba9029af00553a4664bef18d3749e64db
--- /dev/null
+++ b/tensorflow/python/lib/core/bfloat16.h
@@ -0,0 +1,34 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_PYTHON_LIB_CORE_BFLOAT16_H_
+#define TENSORFLOW_PYTHON_LIB_CORE_BFLOAT16_H_
+
+#include <Python.h>
+
+namespace tensorflow {
+
+// Register the bfloat16 numpy type.
+void RegisterNumpyBfloat16();
+
+// Returns the PyObject for the bfloat16 type.
+PyObject* Bfloat16PyType();
+
+// Returns the id number of the bfloat16 numpy type.
+int Bfloat16NumpyType();
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_PYTHON_LIB_CORE_BFLOAT16_H_
diff --git a/tensorflow/python/lib/core/bfloat16.i b/tensorflow/python/lib/core/bfloat16.i
new file mode 100644
index 0000000000000000000000000000000000000000..10444b676b2549e0d9f96391f96e7a523f768d85
--- /dev/null
+++ b/tensorflow/python/lib/core/bfloat16.i
@@ -0,0 +1,30 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+%{
+#include "tensorflow/python/lib/core/bfloat16.h"
+%}
+
+%init %{
+tensorflow::RegisterNumpyBfloat16();
+%}
+
+%{
+PyObject* TF_bfloat16_type() {
+  return tensorflow::Bfloat16PyType();
+}
+%}
+
+PyObject* TF_bfloat16_type();
diff --git a/tensorflow/python/lib/core/bfloat16_test.py b/tensorflow/python/lib/core/bfloat16_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0872348c51b2ce653842e4d9f41d63496b1e7b23
--- /dev/null
+++ b/tensorflow/python/lib/core/bfloat16_test.py
@@ -0,0 +1,213 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Test cases for the bfloat16 Python type."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+import numpy as np
+
+# pylint: disable=unused-import,g-bad-import-order
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.platform import test
+
+
+bfloat16 = pywrap_tensorflow.TF_bfloat16_type()
+
+
+class Bfloat16Test(test.TestCase):
+
+  def float_values(self):
+    """Returns values that should round trip exactly to float and back."""
+    epsilon = float.fromhex("1.0p-7")
+    return [
+        0.0, 1.0, -1, 0.5, -0.5, epsilon, 1.0 + epsilon, 1.0 - epsilon,
+        -1.0 - epsilon, -1.0 + epsilon, 3.5, 42.0, 255.0, 256.0,
+        float("inf"), float("-inf"), float("nan")]
+
+  def _assertFloatIdentical(self, v, w):
+    if math.isnan(v):
+      self.assertTrue(math.isnan(w))
+    else:
+      self.assertEqual(v, w)
+
+  def testRoundTripToFloat(self):
+    for v in self.float_values():
+      self._assertFloatIdentical(v, float(bfloat16(v)))
+
+  def testRoundTripToInt(self):
+    for v in [-256, -255, -34, -2, -1, 0, 1, 2, 10, 47, 128, 255, 256, 512]:
+      self.assertEqual(v, int(bfloat16(v)))
+
+  def testStr(self):
+    self.assertEqual("0", str(bfloat16(0.0)))
+    self.assertEqual("1", str(bfloat16(1.0)))
+    self.assertEqual("-3.5", str(bfloat16(-3.5)))
+    self.assertEqual("0.0078125", str(bfloat16(float.fromhex("1.0p-7"))))
+    self.assertEqual("inf", str(bfloat16(float("inf"))))
+    self.assertEqual("-inf", str(bfloat16(float("-inf"))))
+    self.assertEqual("nan", str(bfloat16(float("nan"))))
+
+  def testRepr(self):
+    self.assertEqual("bfloat16(0)", repr(bfloat16(0)))
+    self.assertEqual("bfloat16(1)", repr(bfloat16(1)))
+    self.assertEqual("bfloat16(-3.5)", repr(bfloat16(-3.5)))
+    self.assertEqual("bfloat16(0.0078125)",
+                     repr(bfloat16(float.fromhex("1.0p-7"))))
+    self.assertEqual("bfloat16(inf)", repr(bfloat16(float("inf"))))
+    self.assertEqual("bfloat16(-inf)", repr(bfloat16(float("-inf"))))
+    self.assertEqual("bfloat16(nan)", repr(bfloat16(float("nan"))))
+
+  def testHash(self):
+    self.assertEqual(0, hash(bfloat16(0.0)))
+    self.assertEqual(0x3f80, hash(bfloat16(1.0)))
+    self.assertEqual(0x7fc0, hash(bfloat16(float("nan"))))
+
+  # Tests for Python operations
+  def testNegate(self):
+    for v in self.float_values():
+      self._assertFloatIdentical(-v, float(-bfloat16(v)))
+
+  def testAdd(self):
+    self._assertFloatIdentical(0, float(bfloat16(0) + bfloat16(0)))
+    self._assertFloatIdentical(1, float(bfloat16(1) + bfloat16(0)))
+    self._assertFloatIdentical(0, float(bfloat16(1) + bfloat16(-1)))
+    self._assertFloatIdentical(5.5, float(bfloat16(2) + bfloat16(3.5)))
+    self._assertFloatIdentical(1.25, float(bfloat16(3.5) + bfloat16(-2.25)))
+    self._assertFloatIdentical(float("inf"),
+                               float(bfloat16(float("inf")) + bfloat16(-2.25)))
+    self._assertFloatIdentical(float("-inf"),
+                               float(bfloat16(float("-inf")) + bfloat16(-2.25)))
+    self.assertTrue(math.isnan(float(bfloat16(3.5) + bfloat16(float("nan")))))
+
+  def testSub(self):
+    self._assertFloatIdentical(0, float(bfloat16(0) - bfloat16(0)))
+    self._assertFloatIdentical(1, float(bfloat16(1) - bfloat16(0)))
+    self._assertFloatIdentical(2, float(bfloat16(1) - bfloat16(-1)))
+    self._assertFloatIdentical(-1.5, float(bfloat16(2) - bfloat16(3.5)))
+    self._assertFloatIdentical(5.75, float(bfloat16(3.5) - bfloat16(-2.25)))
+    self._assertFloatIdentical(float("-inf"),
+                               float(bfloat16(-2.25) - bfloat16(float("inf"))))
+    self._assertFloatIdentical(float("inf"),
+                               float(bfloat16(-2.25) - bfloat16(float("-inf"))))
+    self.assertTrue(math.isnan(float(bfloat16(3.5) - bfloat16(float("nan")))))
+
+  def testMul(self):
+    self._assertFloatIdentical(0, float(bfloat16(0) * bfloat16(0)))
+    self._assertFloatIdentical(0, float(bfloat16(1) * bfloat16(0)))
+    self._assertFloatIdentical(-1, float(bfloat16(1) * bfloat16(-1)))
+    self._assertFloatIdentical(-7.875, float(bfloat16(3.5) * bfloat16(-2.25)))
+    self._assertFloatIdentical(float("-inf"),
+                               float(bfloat16(float("inf")) * bfloat16(-2.25)))
+    self._assertFloatIdentical(float("inf"),
+                               float(bfloat16(float("-inf")) * bfloat16(-2.25)))
+    self.assertTrue(math.isnan(float(bfloat16(3.5) * bfloat16(float("nan")))))
+
+  def testDiv(self):
+    self.assertTrue(math.isnan(float(bfloat16(0) / bfloat16(0))))
+    self._assertFloatIdentical(float("inf"), float(bfloat16(1) / bfloat16(0)))
+    self._assertFloatIdentical(-1, float(bfloat16(1) / bfloat16(-1)))
+    self._assertFloatIdentical(-1.75, float(bfloat16(3.5) / bfloat16(-2)))
+    self._assertFloatIdentical(float("-inf"),
+                               float(bfloat16(float("inf")) / bfloat16(-2.25)))
+    self._assertFloatIdentical(float("inf"),
+                               float(bfloat16(float("-inf")) / bfloat16(-2.25)))
+    self.assertTrue(math.isnan(float(bfloat16(3.5) / bfloat16(float("nan")))))
+
+  def testLess(self):
+    for v in self.float_values():
+      for w in self.float_values():
+        self.assertEqual(v < w, bfloat16(v) < bfloat16(w))
+
+  def testLessEqual(self):
+    for v in self.float_values():
+      for w in self.float_values():
+        self.assertEqual(v <= w, bfloat16(v) <= bfloat16(w))
+
+  def testGreater(self):
+    for v in self.float_values():
+      for w in self.float_values():
+        self.assertEqual(v > w, bfloat16(v) > bfloat16(w))
+
+  def testGreaterEqual(self):
+    for v in self.float_values():
+      for w in self.float_values():
+        self.assertEqual(v >= w, bfloat16(v) >= bfloat16(w))
+
+  def testEqual(self):
+    for v in self.float_values():
+      for w in self.float_values():
+        self.assertEqual(v == w, bfloat16(v) == bfloat16(w))
+
+  def testNotEqual(self):
+    for v in self.float_values():
+      for w in self.float_values():
+        self.assertEqual(v != w, bfloat16(v) != bfloat16(w))
+
+
+class Bfloat16NumPyTest(test.TestCase):
+
+  def testDtype(self):
+    self.assertEqual(bfloat16, np.dtype(bfloat16))
+
+  def testArray(self):
+    x = np.array([[1, 2, 3]], dtype=bfloat16)
+    self.assertEqual(bfloat16, x.dtype)
+    self.assertEqual("[[bfloat16(1) bfloat16(2) bfloat16(3)]]", str(x))
+    self.assertAllEqual(x, x)
+    self.assertAllClose(x, x)
+
+  def testCasts(self):
+    for dtype in [
+        np.float16, np.float32, np.float64, np.int32, np.int64,
+        np.complex64, np.complex128]:
+      x = np.array([[1, 2, 3]], dtype=dtype)
+      y = x.astype(bfloat16)
+      z = y.astype(dtype)
+      self.assertTrue(np.all(x == y))
+      self.assertEqual(bfloat16, y.dtype)
+      self.assertTrue(np.all(x == z))
+      self.assertEqual(dtype, z.dtype)
+
+  def testConformNumpyComplex(self):
+    for dtype in [np.complex64, np.complex128]:
+      x = np.array([1.1, 2.2 + 2.2j, 3.3], dtype=dtype)
+      y_np = x.astype(np.float32)
+      y_tf = x.astype(bfloat16)
+      self.assertAllClose(y_np, y_tf, atol=2e-2)
+
+      z_np = y_np.astype(dtype)
+      z_tf = y_tf.astype(dtype)
+      self.assertAllClose(z_np, z_tf, atol=2e-2)
+
+  def testAdd(self):
+    x = np.array([[1, 2, 3]], dtype=bfloat16)
+    y = np.array([[4, 5, 6]], dtype=bfloat16)
+    self.assertAllClose(np.array([[5, 7, 9]]), x + y)
+
+  def testLogSumExp(self):
+    x = np.array([[1, 2, 3]], dtype=np.float32)
+    y = np.array([[4, 5, 6]], dtype=np.float32)
+    self.assertAllClose(np.logaddexp(x, y),
+                        np.logaddexp(x.astype(bfloat16), y.astype(bfloat16)),
+                        atol=2e-2)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/lib/core/ndarray_tensor.cc b/tensorflow/python/lib/core/ndarray_tensor.cc
index cf2c2e6eb00cccf82adf3c9eb65b685130a2f632..994af69386b278f6b88c051f898cd6a9dc607f3f 100644
--- a/tensorflow/python/lib/core/ndarray_tensor.cc
+++ b/tensorflow/python/lib/core/ndarray_tensor.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/python/lib/core/bfloat16.h"
 #include "tensorflow/python/lib/core/ndarray_tensor_bridge.h"
 
 namespace tensorflow {
@@ -125,6 +126,10 @@ Status PyArray_TYPE_to_TF_DataType(PyArrayObject* array,
       // custom struct type.
       return PyArrayDescr_to_TF_DataType(descr, out_tf_datatype);
     default:
+      if (pyarray_type == Bfloat16NumpyType()) {
+        *out_tf_datatype = TF_BFLOAT16;
+        break;
+      }
       // TODO(mrry): Support these.
       return errors::Internal("Unsupported feed type");
   }
diff --git a/tensorflow/python/lib/core/ndarray_tensor_bridge.cc b/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
index 82c45f5a315d485585b1514634201225f4123de1..65e2178cda498294ffc4a5066b5692132e86180f 100644
--- a/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
+++ b/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/python/lib/core/bfloat16.h"
 #include "tensorflow/python/lib/core/ndarray_tensor_bridge.h"
 
 namespace tensorflow {
@@ -175,7 +176,7 @@ Status TF_DataType_to_PyArray_TYPE(TF_DataType tf_datatype,
       *out_pyarray_type = NPY_INT32;
       break;
     case TF_BFLOAT16:
-      *out_pyarray_type = NPY_UINT16;
+      *out_pyarray_type = Bfloat16NumpyType();
       break;
     default:
       return errors::Internal("Tensorflow type ", tf_datatype,
diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc
index b30125761fc7778b58793062d186994ef2a58b0f..dc56b3948626de7d76895378ade04b14e7d779b1 100644
--- a/tensorflow/python/lib/core/py_func.cc
+++ b/tensorflow/python/lib/core/py_func.cc
@@ -18,21 +18,25 @@ limitations under the License.
 #include <array>
 
 #include "numpy/arrayobject.h"
+#include "tensorflow/c/eager/c_api.h"
+#include "tensorflow/c/tf_status_helper.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/python/eager/pywrap_tfe.h"
 #include "tensorflow/python/lib/core/ndarray_tensor_bridge.h"
+#include "tensorflow/python/lib/core/py_util.h"
+#include "tensorflow/python/lib/core/safe_ptr.h"
 #include <Python.h>
 
 namespace tensorflow {
 namespace {
 
-static mutex mu;
+static mutex mu(LINKER_INITIALIZED);
 static PyObject* py_trampoline GUARDED_BY(mu) = nullptr;
 
 // Returns the py_trampoline that is used to pass the control to the
@@ -48,6 +52,9 @@ struct PyCall {
   // with this "token".
   string token;
 
+  // True if the call is associated with an EagerPyFunc.
+  bool eager;
+
   // Inputs and outputs of this function invocation.
   std::vector<Tensor> ins;
   std::vector<Tensor> out;
@@ -55,19 +62,26 @@ struct PyCall {
 
 // Givens the 'call', prepares the token and inputs as a python tuple
 // that is appropriate for calling the trampoline.
-Status MakeArgTuple(PyCall* call, PyObject** tuple) {
+Status MakeArgTuple(const PyCall* call, PyObject** tuple) {
   int64 n = call->ins.size();
   PyObject* lst = PyList_New(n);
   CHECK(lst);
   for (int64 i = 0; i < n; ++i) {
+    PyObject* arg = nullptr;
     const Tensor& t = call->ins[i];
-    PyObject* a = nullptr;
-    Status s = ConvertTensorToNdarray(t, &a);
-    if (!s.ok()) {
-      Py_DECREF(lst);
-      return s;
+    if (call->eager) {
+      arg = EagerTensorFromHandle(TFE_NewTensorHandle(t));
+      if (arg == nullptr) {
+        return errors::Internal("Unable to procure EagerTensor from Tensor.");
+      }
+    } else {
+      Status s = ConvertTensorToNdarray(t, &arg);
+      if (!s.ok()) {
+        Py_DECREF(lst);
+        return s;
+      }
     }
-    PyList_SetItem(lst, i, a);
+    PyList_SetItem(lst, i, arg);
   }
   *tuple = Py_BuildValue("(sN)", call->token.c_str(), lst);
   CHECK(*tuple);
@@ -133,46 +147,16 @@ bool IsSingleNone(PyObject* obj) {
   return item == Py_None;
 }
 
-// py.__class__.__name__
-const char* ClassName(PyObject* py) {
-/* PyPy doesn't have a separate C API for old-style classes. */
-#if PY_MAJOR_VERSION < 3 && !defined(PYPY_VERSION)
-  if (PyClass_Check(py))
-    return PyString_AS_STRING(
-        CHECK_NOTNULL(reinterpret_cast<PyClassObject*>(py)->cl_name));
-  if (PyInstance_Check(py))
-    return PyString_AS_STRING(CHECK_NOTNULL(
-        reinterpret_cast<PyInstanceObject*>(py)->in_class->cl_name));
-#endif
-  if (Py_TYPE(py) == &PyType_Type) {
-    return reinterpret_cast<PyTypeObject*>(py)->tp_name;
-  }
-  return Py_TYPE(py)->tp_name;
-}
-
-string PyExcFetch() {
-  CHECK(PyErr_Occurred()) << "Must only call PyExcFetch after an exception.";
-  PyObject* ptype;
-  PyObject* pvalue;
-  PyObject* ptraceback;
-  PyErr_Fetch(&ptype, &pvalue, &ptraceback);
-  PyErr_NormalizeException(&ptype, &pvalue, &ptraceback);
-  string err = ClassName(ptype);
-  if (pvalue) {
-    PyObject* str = PyObject_Str(pvalue);
-    if (str) {
-#if PY_MAJOR_VERSION < 3
-      strings::StrAppend(&err, ": ", PyString_AS_STRING(str));
-#else
-      strings::StrAppend(&err, ": ", PyUnicode_AsUTF8(str));
-#endif
-      Py_DECREF(str);
-    }
-    Py_DECREF(pvalue);
-  }
-  Py_DECREF(ptype);
-  Py_XDECREF(ptraceback);
-  return err;
+// Retrieves a Tensor from `eager_tensor` and stores it in `output_tensor`.
+Status ExtractTensorFromEagerTensor(const PyObject* eager_tensor,
+                                    Tensor* output_tensor,
+                                    TF_Status* tf_status) {
+  // TODO(akshayka): Lift the restriction requiring output tensors to
+  // lie in host memory; EagerPyFunc should be able to dispatch ops on GPU
+  // tensors, so we should eventually implement a GPU kernel for EagerPyFunc.
+  *output_tensor = *TFE_TensorHandleUnderlyingTensorInHostMemory(
+      EagerTensor_Handle(eager_tensor), tf_status);
+  return StatusFromTF_Status(tf_status);
 }
 
 // Calls the registered py function through the trampoline.
@@ -195,18 +179,18 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) {
     if (PyErr_Occurred()) {
       if (PyErr_ExceptionMatches(PyExc_ValueError) ||
           PyErr_ExceptionMatches(PyExc_TypeError)) {
-        return errors::InvalidArgument(PyExcFetch());
+        return errors::InvalidArgument(PyExceptionFetch());
       } else if (PyErr_ExceptionMatches(PyExc_StopIteration)) {
         *out_log_on_error = false;
-        return errors::OutOfRange(PyExcFetch());
+        return errors::OutOfRange(PyExceptionFetch());
       } else if (PyErr_ExceptionMatches(PyExc_MemoryError)) {
-        return errors::ResourceExhausted(PyExcFetch());
+        return errors::ResourceExhausted(PyExceptionFetch());
       } else if (PyErr_ExceptionMatches(PyExc_NotImplementedError)) {
-        return errors::Unimplemented(PyExcFetch());
+        return errors::Unimplemented(PyExceptionFetch());
       } else {
         // TODO(ebrevdo): Check if exception is an OpError and use the
         // OpError.error_code property to map it back in the Status.
-        return errors::Unknown(PyExcFetch());
+        return errors::Unknown(PyExceptionFetch());
       }
     } else {
       return errors::Internal("Failed to run py callback ", call->token,
@@ -214,21 +198,37 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) {
     }
   }
 
-  // Process the return values and converts them to tf Tensors.
+  // Process the return values and convert them to TF Tensors.
   Status s;
   if (PyList_Check(result)) {
-    // 'result' is a list.
     call->out.clear();
     for (int i = 0; i < PyList_Size(result); ++i) {
       Tensor t;
-      s = ConvertNdarrayToTensor(PyList_GetItem(result, i), &t);
+      if (call->eager) {
+        auto tf_status = tensorflow::make_safe(TF_NewStatus());
+        s = ExtractTensorFromEagerTensor(PyList_GetItem(result, i), &t,
+                                         tf_status.get());
+      } else {
+        s = ConvertNdarrayToTensor(PyList_GetItem(result, i), &t);
+      }
+
       if (!s.ok()) {
         break;
       }
       call->out.push_back(t);
     }
+  } else if (EagerTensor_CheckExact(result) || result == Py_None) {
+    DCHECK(call->eager);
+    Tensor t;
+    if (result != Py_None) {
+      auto tf_status = tensorflow::make_safe(TF_NewStatus());
+      s = ExtractTensorFromEagerTensor(result, &t, tf_status.get());
+      if (s.ok()) {
+        call->out.push_back(t);
+      }
+    }
   } else if (PyArray_Check(result)) {
-    // 'result' is a single ndarray.
+    DCHECK(!call->eager);
     if (!IsSingleNone(result)) {
       Tensor t;
       s = ConvertNdarrayToTensor(result, &t);
@@ -417,11 +417,13 @@ class PyFuncOp : public OpKernel {
  public:
   explicit PyFuncOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("token", &token_));
+    eager_ = type_string() == "EagerPyFunc";
   }
 
   void Compute(OpKernelContext* ctx) override {
     PyCall call;
     call.token = token_;
+    call.eager = eager_;
     for (int i = 0; i < ctx->num_inputs(); ++i) {
       call.ins.push_back(ctx->input(i));
     }
@@ -430,6 +432,9 @@ class PyFuncOp : public OpKernel {
     py_threadstate = PyGILState_Ensure();
     bool log_on_error;
     Status s = DoCallPyFunc(&call, &log_on_error);
+    // Sometimes py_funcs can be called without a session and leak memory. This
+    // ensures we clear the decref cache so this doesn't happen.
+    ClearDecrefCache();
     PyGILState_Release(py_threadstate);
 
     // Ensures that GIL is released even when !s.ok().
@@ -460,9 +465,15 @@ class PyFuncOp : public OpKernel {
  private:
   string token_;
 
+  // True if and only if this op should execute the python function eagerly,
+  // i.e., if and only if the eager attribute is set.
+  bool eager_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(PyFuncOp);
 };
+
 REGISTER_KERNEL_BUILDER(Name("PyFunc").Device(DEVICE_CPU), PyFuncOp);
 REGISTER_KERNEL_BUILDER(Name("PyFuncStateless").Device(DEVICE_CPU), PyFuncOp);
+REGISTER_KERNEL_BUILDER(Name("EagerPyFunc").Device(DEVICE_CPU), PyFuncOp);
 
 }  // end namespace tensorflow
diff --git a/tensorflow/python/lib/core/py_func.h b/tensorflow/python/lib/core/py_func.h
index 5a451d5f43285d19dff6c158ebc28045b3ff13d4..3197a7ddfa0ce3db9f8244215690e5ede5096ac2 100644
--- a/tensorflow/python/lib/core/py_func.h
+++ b/tensorflow/python/lib/core/py_func.h
@@ -24,21 +24,27 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Called by py code on initialization.
+// Called by python code on initialization.
 //
 // "trampoline" must represent a python function which has the
 // following signature:
-//   (string, list(ndarray)) -> ndarray | list(ndarray) | python scalar
+//   (string, list(ndarray)) | (string, list(EagerTensor)) ->
+//     ndarray | list(ndarray) | python scalar |
+//     EagerTensor | list(EagerTensor) | None
 //
 // The trampoline takes two arguments, the first is a string token
 // used by the python frontend's dispatching logic; the second is a
-// list of numpy ndarrays.
+// list of numpy ndarrays or EagerTensor objects. It can return a
+// single numpy ndarray, a list of numpy ndarrays, a python scalar, an
+// EagerTensor, a list of EagerTensors, or None.
 //
-// The trampoline can return a single numpy ndarray, a list of numpy
-// ndarrays, or a simply python scalar. The C++ runtime converts them,
-// if supported, back to Tensor objects.
+// PyFunc requires inputs and outputs to be ndarrays. EagerPyFunc requires
+// inputs to be a list of EagerTensors and outputs to be an EagerTensor, a list
+// of EagerTensors, or None.
 //
-// This is called by script_ops.py during its module initialization.
+// The C++ runtime converts outputs back to Tensor objects.
+//
+// This function is called by script_ops.py during its module initialization.
 //
 // TODO(zhifengc): Support distributed runtime.
 void InitializePyTrampoline(PyObject* trampoline);
diff --git a/tensorflow/python/lib/core/py_seq_tensor.cc b/tensorflow/python/lib/core/py_seq_tensor.cc
index 71cb38f8fd24beeb9efe149a6bd39e0ef2031051..317bdc2e14747583f372808f48a5928273f5570a 100644
--- a/tensorflow/python/lib/core/py_seq_tensor.cc
+++ b/tensorflow/python/lib/core/py_seq_tensor.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/python/lib/core/numpy.h"
+#include "tensorflow/python/lib/core/py_util.h"
 #include "tensorflow/python/lib/core/safe_ptr.h"
 
 namespace tensorflow {
@@ -89,12 +90,25 @@ Status InferShapeAndType(PyObject* obj, TensorShape* shape, DataType* dtype) {
       *dtype = DT_STRING;
     } else if (PySequence_Check(obj)) {
       auto length = PySequence_Length(obj);
-      shape->AddDim(length);
       if (length > 0) {
+        shape->AddDim(length);
         obj = PySequence_GetItem(obj, 0);
         continue;
-      } else {
+      } else if (length == 0) {
+        shape->AddDim(length);
         *dtype = DT_INVALID;  // Invalid dtype for empty tensors.
+      } else {
+        // The sequence does not have a valid length (PySequence_Length < 0).
+        if (PyErr_Occurred()) {
+          // PySequence_Length failed and set an exception. Fetch the message
+          // and convert it to a failed status.
+          return errors::InvalidArgument(PyExceptionFetch());
+        } else {
+          // This is almost certainly dead code: PySequence_Length failed but
+          // did not set an exception.
+          return errors::InvalidArgument(
+              "Attempted to convert an invalid sequence to a Tensor.");
+        }
       }
     } else if (IsPyFloat(obj)) {
       *dtype = DT_DOUBLE;
diff --git a/tensorflow/python/lib/core/py_util.cc b/tensorflow/python/lib/core/py_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2635694e23c07dd8e75d4bb0cfb9e83a2042d921
--- /dev/null
+++ b/tensorflow/python/lib/core/py_util.cc
@@ -0,0 +1,70 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/python/lib/core/py_util.h"
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include <Python.h>
+
+namespace tensorflow {
+namespace {
+
+// py.__class__.__name__
+const char* ClassName(PyObject* py) {
+/* PyPy doesn't have a separate C API for old-style classes. */
+#if PY_MAJOR_VERSION < 3 && !defined(PYPY_VERSION)
+  if (PyClass_Check(py))
+    return PyString_AS_STRING(
+        CHECK_NOTNULL(reinterpret_cast<PyClassObject*>(py)->cl_name));
+  if (PyInstance_Check(py))
+    return PyString_AS_STRING(CHECK_NOTNULL(
+        reinterpret_cast<PyInstanceObject*>(py)->in_class->cl_name));
+#endif
+  if (Py_TYPE(py) == &PyType_Type) {
+    return reinterpret_cast<PyTypeObject*>(py)->tp_name;
+  }
+  return Py_TYPE(py)->tp_name;
+}
+
+}  // end namespace
+
+string PyExceptionFetch() {
+  CHECK(PyErr_Occurred())
+      << "Must only call PyExceptionFetch after an exception.";
+  PyObject* ptype;
+  PyObject* pvalue;
+  PyObject* ptraceback;
+  PyErr_Fetch(&ptype, &pvalue, &ptraceback);
+  PyErr_NormalizeException(&ptype, &pvalue, &ptraceback);
+  string err = ClassName(ptype);
+  if (pvalue) {
+    PyObject* str = PyObject_Str(pvalue);
+    if (str) {
+#if PY_MAJOR_VERSION < 3
+      strings::StrAppend(&err, ": ", PyString_AS_STRING(str));
+#else
+      strings::StrAppend(&err, ": ", PyUnicode_AsUTF8(str));
+#endif
+      Py_DECREF(str);
+    }
+    Py_DECREF(pvalue);
+  }
+  Py_DECREF(ptype);
+  Py_XDECREF(ptraceback);
+  return err;
+}
+
+}  // end namespace tensorflow
diff --git a/tensorflow/python/lib/core/py_util.h b/tensorflow/python/lib/core/py_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..44dfe7ba21285d06667a8d0f6ab8ac0ec8f2aa00
--- /dev/null
+++ b/tensorflow/python/lib/core/py_util.h
@@ -0,0 +1,27 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_PYTHON_LIB_CORE_UTIL_H_
+#define TENSORFLOW_PYTHON_LIB_CORE_UTIL_H_
+
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+// Fetch the exception message as a string. An exception must be set
+// (PyErr_Occurred() must be true).
+string PyExceptionFetch();
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_PYTHON_LIB_CORE_UTIL_H_
diff --git a/tensorflow/python/lib/core/safe_ptr.cc b/tensorflow/python/lib/core/safe_ptr.cc
index 456ea3348baa634075082fedde9dac175e237997..ce34b6d0041878c4122d36ab8bf9db6c17253680 100644
--- a/tensorflow/python/lib/core/safe_ptr.cc
+++ b/tensorflow/python/lib/core/safe_ptr.cc
@@ -16,25 +16,21 @@ limitations under the License.
 #include "tensorflow/python/lib/core/safe_ptr.h"
 
 namespace tensorflow {
-namespace {
 
-inline void Py_DECREF_wrapper(PyObject* o) { Py_DECREF(o); }
-
-}  // namespace
-
-Safe_PyObjectPtr make_safe(PyObject* o) {
-  return Safe_PyObjectPtr(o, Py_DECREF_wrapper);
+Safe_PyObjectPtr make_safe(PyObject* object) {
+  return Safe_PyObjectPtr(object);
 }
 
 Safe_TF_TensorPtr make_safe(TF_Tensor* tensor) {
-  return Safe_TF_TensorPtr(tensor, TF_DeleteTensor);
+  return Safe_TF_TensorPtr(tensor);
 }
 
 Safe_TFE_TensorHandlePtr make_safe(TFE_TensorHandle* handle) {
-  return Safe_TFE_TensorHandlePtr(handle, TFE_DeleteTensorHandle);
+  return Safe_TFE_TensorHandlePtr(handle);
 }
 
 Safe_TF_StatusPtr make_safe(TF_Status* status) {
-  return Safe_TF_StatusPtr(status, TF_DeleteStatus);
+  return Safe_TF_StatusPtr(status);
 }
+
 }  // namespace tensorflow
diff --git a/tensorflow/python/lib/core/safe_ptr.h b/tensorflow/python/lib/core/safe_ptr.h
index 70cd2fdf6ccf4d722892f06e1e3aa40919b63ac7..80db840aebcc7ca341b0f6c40fdaee2136d21aaa 100644
--- a/tensorflow/python/lib/core/safe_ptr.h
+++ b/tensorflow/python/lib/core/safe_ptr.h
@@ -17,39 +17,51 @@ limitations under the License.
 #define THIRD_PARTY_TENSORFLOW_PYTHON_LIB_CORE_SAFE_PTR_H_
 
 #include <memory>
-#include <Python.h>
 
+#include <Python.h>
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/c/eager/c_api.h"
 
 namespace tensorflow {
+namespace detail {
+
+struct PyDecrefDeleter {
+  void operator()(PyObject* p) const { Py_DECREF(p); }
+};
+
+struct TFTensorDeleter {
+  void operator()(TF_Tensor* p) const { TF_DeleteTensor(p); }
+};
+
+struct TFETensorHandleDeleter {
+  void operator()(TFE_TensorHandle* p) const { TFE_DeleteTensorHandle(p); }
+};
+
+struct TFStatusDeleter {
+  void operator()(TF_Status* p) const { TF_DeleteStatus(p); }
+};
+
+}  // namespace detail
 
 // Safe container for an owned PyObject. On destruction, the reference count of
 // the contained object will be decremented.
-typedef void (*Py_DECREF_wrapper_type)(PyObject*);
-typedef std::unique_ptr<PyObject, Py_DECREF_wrapper_type> Safe_PyObjectPtr;
+using Safe_PyObjectPtr = std::unique_ptr<PyObject, detail::PyDecrefDeleter>;
 Safe_PyObjectPtr make_safe(PyObject* o);
 
 // Safe containers for an owned TF_Tensor. On destruction, the tensor will be
 // deleted by TF_DeleteTensor.
-// Note: can't use decltype(&TF_DeleteTensor) due to SWIG
-typedef void (*TF_DeleteTensor_type)(TF_Tensor*);
-typedef std::unique_ptr<TF_Tensor, TF_DeleteTensor_type> Safe_TF_TensorPtr;
+using Safe_TF_TensorPtr = std::unique_ptr<TF_Tensor, detail::TFTensorDeleter>;
 Safe_TF_TensorPtr make_safe(TF_Tensor* tensor);
 
 // Safe containers for an owned TFE_TensorHandle. On destruction, the handle
-// will be deleted by TFE_DeleteTensorHandle. Note: can't use
-// decltype(&TFE_DeleteTensorHandle) due to SWIG
-typedef void (*TFE_DeleteTensorHandle_type)(TFE_TensorHandle*);
-typedef std::unique_ptr<TFE_TensorHandle, TFE_DeleteTensorHandle_type>
-    Safe_TFE_TensorHandlePtr;
+// will be deleted by TFE_DeleteTensorHandle.
+using Safe_TFE_TensorHandlePtr =
+    std::unique_ptr<TFE_TensorHandle, detail::TFETensorHandleDeleter>;
 Safe_TFE_TensorHandlePtr make_safe(TFE_TensorHandle* handle);
 
 // Safe containers for an owned TF_Status. On destruction, the handle
-// will be deleted by TF_DeleteStatus. Note: can't use
-// decltype(&TF_DeleteStatus) due to SWIG
-typedef void (*TF_DeleteStatus_type)(TF_Status*);
-typedef std::unique_ptr<TF_Status, TF_DeleteStatus_type> Safe_TF_StatusPtr;
+// will be deleted by TF_DeleteStatus.
+using Safe_TF_StatusPtr = std::unique_ptr<TF_Status, detail::TFStatusDeleter>;
 Safe_TF_StatusPtr make_safe(TF_Status* status);
 
 }  // namespace tensorflow
diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index 87f8d1486011683c89095aeb04e2d01461f83749..55cae0bcbfca8a9cacfe525fe3b69c7fb232acd3 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -524,6 +524,16 @@ def _TransposeGrad(op, grad):
   return [array_ops.transpose(grad, array_ops.invert_permutation(p)), None]
 
 
+@ops.RegisterGradient("ConjugateTranspose")
+def _ConjugateTransposeGrad(op, grad):
+  """Returns conj(unshuffle(grad))."""
+  p = op.inputs[1]
+  return [
+      array_ops.transpose(
+          grad, array_ops.invert_permutation(p), conjugate=True), None
+  ]
+
+
 ops.NotDifferentiable("Shape")
 
 
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 037ab4ff507dcd99338d15163345d34310a00b61..74b405681b5b6cbda7df207a6deb2a172d858743 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -70,6 +70,7 @@ See the @{$python/array_ops} guide.
 @@quantize_v2
 @@quantized_concat
 @@setdiff1d
+@@guarantee_const
 @@fake_quant_with_min_max_args
 @@fake_quant_with_min_max_args_gradient
 @@fake_quant_with_min_max_vars
@@ -125,11 +126,8 @@ def identity(input, name=None):  # pylint: disable=redefined-builtin
   if context.in_graph_mode():
     return gen_array_ops.identity(input, name=name)
   else:
-    try:
-      in_device = input.device
-    except AttributeError:
-      input = ops.convert_to_tensor(input)
-      in_device = input.device
+    input = ops.convert_to_tensor(input)
+    in_device = input.device
     # TODO(ashankar): Does 'identity' need to invoke execution callbacks?
     if context.context().device_name != in_device:
       return input._copy()  # pylint: disable=protected-access
@@ -641,40 +639,35 @@ def strided_slice(input_,
                   name=None):
   """Extracts a strided slice of a tensor (generalized python array indexing).
 
-  **Most users will want to use @{tf.Tensor.__getitem__} and
-  @{tf.Variable.__getitem__}.** That allows  NumPy style slicing syntax (i.e.
-  `tensor[..., 3:4:-1, tf.newaxis, 3]`).
-  This op is the low-level interface that are used to implement operators.
-  Those interfaces are much more friendly, and highly recommended.
-
-  To a first order, this operation extracts a slice of size `end - begin`
-  from a tensor `input`
-  starting at the location specified by `begin`. The slice continues by adding
-  `stride` to the `begin` index until all dimensions are not less than `end`.
-  Note that components of stride can be negative, which causes a reverse
-  slice.
-
-  This operation can be thought of an encoding of a numpy style sliced
-  range. Given a python slice input[<spec0>, <spec1>, ..., <specn>]
+  **Instead of calling this op directly most users will want to use the
+  NumPy-style slicing syntax (e.g. `tensor[..., 3:4:-1, tf.newaxis, 3]`), which
+  is supported via @{tf.Tensor.__getitem__} and @{tf.Variable.__getitem__}.**
+  The interface of this op is a low-level encoding of the slicing syntax.
+
+  Roughly speaking, this op extracts a slice of size `(end-begin)/stride`
+  from the given `input_` tensor. Starting at the location specified by `begin`
+  the slice continues by adding `stride` to the index until all dimensions are
+  not less than `end`.
+  Note that a stride can be negative, which causes a reverse slice.
+
+  Given a Python slice `input[spec0, spec1, ..., specn]`,
   this function will be called as follows.
 
-  `begin`, `end`, and `strides` will be all length n. n is in general
-  not the same dimensionality as `input`.
+  `begin`, `end`, and `strides` will be vectors of length n.
+  n in general is not equal to the rank of the `input_` tensor.
 
-  For the ith spec,
-  `begin_mask`, `end_mask`, `ellipsis_mask`, `new_axis_mask`,
-  and `shrink_axis_mask` will have the ith bit corresponding to
+  In each mask field (`begin_mask`, `end_mask`, `ellipsis_mask`,
+  `new_axis_mask`, `shrink_axis_mask`) the ith bit will correspond to
   the ith spec.
 
-  If the ith bit of `begin_mask` is non-zero, `begin[i]` is ignored and
+  If the ith bit of `begin_mask` is set, `begin[i]` is ignored and
   the fullest possible range in that dimension is used instead.
   `end_mask` works analogously, except with the end range.
 
   `foo[5:,:,:3]` on a 7x8x9 tensor is equivalent to `foo[5:7,0:8,0:3]`.
   `foo[::-1]` reverses a tensor with shape 8.
 
-
-  If the ith bit of `ellipsis_mask` is non-zero, as many unspecified dimensions
+  If the ith bit of `ellipsis_mask` is set, as many unspecified dimensions
   as needed will be inserted between other dimensions. Only one
   non-zero bit is allowed in `ellipsis_mask`.
 
@@ -682,20 +675,21 @@ def strided_slice(input_,
   equivalent to `foo[3:5,:,:,4:5]` and
   `foo[3:5,...]` is equivalent to `foo[3:5,:,:,:]`.
 
-  If the ith bit of `new_axis_mask` is one, then `begin`,
+  If the ith bit of `new_axis_mask` is set, then `begin`,
   `end`, and `stride` are ignored and a new length 1 dimension is
   added at this point in the output tensor.
 
-  For example `foo[3:5,4]` on a 10x8 tensor produces a shape 2 tensor
-  whereas `foo[3:5,4:5]` produces a shape 2x1 tensor with shrink_mask
-  being 1<<1 == 2.
+  For example,
+  `foo[:4, tf.newaxis, :2]` would produce a shape `(4, 1, 2)` tensor.
+
+  If the ith bit of `shrink_axis_mask` is set, it implies that the ith
+  specification shrinks the dimensionality by 1. `begin[i]`, `end[i]` and
+  `strides[i]` must imply a slice of size 1 in the dimension. For example in
+  Python one might do `foo[:, 3, :]` which would result in
+  `shrink_axis_mask` equal to 2.
 
-  If the ith bit of `shrink_axis_mask` is one, then `begin`,
-  `end[i]`, and `stride[i]` are used to do a slice in the appropriate
-  dimension, but the output tensor will be reduced in dimensionality
-  by one. This is only valid if the ith entry of slice[i]==1.
 
-  NOTE: `begin` and `end` are zero-indexed`.
+  NOTE: `begin` and `end` are zero-indexed.
   `strides` entries must be non-zero.
 
 
@@ -1198,18 +1192,19 @@ def boolean_mask(tensor, mask, name="boolean_mask", axis=None):
           "Number of mask dimensions must be specified, even if some dimensions"
           " are None.  E.g. shape=[None] is ok, but shape=None is not.")
     axis = 0 if axis is None else axis
-    shape_tensor[axis:axis+ndims_mask].assert_is_compatible_with(shape_mask)
+    shape_tensor[axis:axis + ndims_mask].assert_is_compatible_with(shape_mask)
 
-    leading_size = gen_math_ops._prod(shape(tensor)[axis:axis+ndims_mask], [0])
+    leading_size = gen_math_ops._prod(
+        shape(tensor)[axis:axis + ndims_mask], [0])
     tensor = reshape(tensor,
-                     concat([shape(tensor)[:axis],
-                             [leading_size],
-                             shape(tensor)[axis+ndims_mask:]], 0))
-    first_dim = shape_tensor[axis:axis+ndims_mask].num_elements()
+                     concat([
+                         shape(tensor)[:axis], [leading_size],
+                         shape(tensor)[axis + ndims_mask:]
+                     ], 0))
+    first_dim = shape_tensor[axis:axis + ndims_mask].num_elements()
     tensor.set_shape(
-        tensor_shape.as_shape(shape_tensor[:axis])
-        .concatenate([first_dim])
-        .concatenate(shape_tensor[axis+ndims_mask:]))
+        tensor_shape.as_shape(shape_tensor[:axis]).concatenate([first_dim])
+        .concatenate(shape_tensor[axis + ndims_mask:]))
 
     mask = reshape(mask, [-1])
     return _apply_mask_1d(tensor, mask, axis)
@@ -1308,7 +1303,7 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):
   size_splits = ops.convert_to_tensor(num_or_size_splits)
   if size_splits._rank() == 0 and size_splits.dtype.is_integer:
     return gen_array_ops._split(
-        split_dim=axis, num_split=num_or_size_splits, value=value, name=name)
+        axis=axis, num_split=num_or_size_splits, value=value, name=name)
 
   if num is None:
     num = size_splits._shape_tuple()[0]
@@ -1318,7 +1313,7 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):
   return gen_array_ops._split_v(
       value=value,
       size_splits=size_splits,
-      split_dim=axis,
+      axis=axis,
       num_split=num,
       name=name)
 
@@ -2540,9 +2535,9 @@ def where(condition, x=None, y=None, name=None):
     with ops.name_scope(name, "Where", [condition]) as name:
       condition = ops.convert_to_tensor(
           condition, preferred_dtype=dtypes.bool, name="condition")
-      return gen_array_ops.where(input=condition, name=name)
+      return gen_array_ops.where(condition=condition, name=name)
   elif x is not None and y is not None:
-    return gen_math_ops._select(condition=condition, t=x, e=y, name=name)
+    return gen_math_ops._select(condition=condition, x=x, y=y, name=name)
   else:
     raise ValueError("x and y must both be non-None or both be None.")
 
diff --git a/tensorflow/python/ops/bitwise_ops_test.py b/tensorflow/python/ops/bitwise_ops_test.py
index fa1b219b1771dbd8f99939d8f6571d2a8791433e..75eb100a90ff86dc514e735012922101d693e3d2 100644
--- a/tensorflow/python/ops/bitwise_ops_test.py
+++ b/tensorflow/python/ops/bitwise_ops_test.py
@@ -36,7 +36,7 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
 
   def testBinaryOps(self):
     dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
-                  dtypes.uint8, dtypes.uint16]
+                  dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64]
 
     with self.test_session(use_gpu=True) as sess:
       for dtype in dtype_list:
diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py
index 7e509f72c158726f7070b7e3d363e6b58e521755..1377af3eac43a5846353257304ef7e022d3506d4 100644
--- a/tensorflow/python/ops/check_ops.py
+++ b/tensorflow/python/ops/check_ops.py
@@ -104,6 +104,11 @@ def _assert_static(condition, data):
                                       message='\n'.join(data_static))
 
 
+def _shape_and_dtype_str(tensor):
+  """Returns a string containing tensor's shape and dtype."""
+  return 'shape=%s dtype=%s' % (tensor.shape, tensor.dtype.name)
+
+
 def assert_proper_iterable(values):
   """Static assert that values is a "proper" iterable.
 
@@ -159,10 +164,14 @@ def assert_negative(x, data=None, summarize=None, message=None, name=None):
   with ops.name_scope(name, 'assert_negative', [x, data]):
     x = ops.convert_to_tensor(x, name='x')
     if data is None:
+      if context.in_eager_mode():
+        name = _shape_and_dtype_str(x)
+      else:
+        name = x.name
       data = [
           message,
           'Condition x < 0 did not hold element-wise:',
-          'x (%s) = ' % x.name, x]
+          'x (%s) = ' % name, x]
     zero = ops.convert_to_tensor(0, dtype=x.dtype)
     return assert_less(x, zero, data=data, summarize=summarize)
 
@@ -195,9 +204,13 @@ def assert_positive(x, data=None, summarize=None, message=None, name=None):
   with ops.name_scope(name, 'assert_positive', [x, data]):
     x = ops.convert_to_tensor(x, name='x')
     if data is None:
+      if context.in_eager_mode():
+        name = _shape_and_dtype_str(x)
+      else:
+        name = x.name
       data = [
           message, 'Condition x > 0 did not hold element-wise:',
-          'x (%s) = ' % x.name, x]
+          'x (%s) = ' % name, x]
     zero = ops.convert_to_tensor(0, dtype=x.dtype)
     return assert_less(zero, x, data=data, summarize=summarize)
 
@@ -232,7 +245,7 @@ def assert_non_negative(x, data=None, summarize=None, message=None, name=None):
     x = ops.convert_to_tensor(x, name='x')
     if data is None:
       if context.in_eager_mode():
-        name = str(x)
+        name = _shape_and_dtype_str(x)
       else:
         name = x.name
       data = [
@@ -272,10 +285,14 @@ def assert_non_positive(x, data=None, summarize=None, message=None, name=None):
   with ops.name_scope(name, 'assert_non_positive', [x, data]):
     x = ops.convert_to_tensor(x, name='x')
     if data is None:
+      if context.in_eager_mode():
+        name = _shape_and_dtype_str(x)
+      else:
+        name = x.name
       data = [
           message,
           'Condition x <= 0 did not hold element-wise:'
-          'x (%s) = ' % x.name, x]
+          'x (%s) = ' % name, x]
     zero = ops.convert_to_tensor(0, dtype=x.dtype)
     return assert_less_equal(x, zero, data=data, summarize=summarize)
 
@@ -408,8 +425,8 @@ def assert_none_equal(
     x = ops.convert_to_tensor(x, name='x')
     y = ops.convert_to_tensor(y, name='y')
     if context.in_eager_mode():
-      x_name = 'x'
-      y_name = 'y'
+      x_name = _shape_and_dtype_str(x)
+      y_name = _shape_and_dtype_str(y)
     else:
       x_name = x.name
       y_name = y.name
@@ -456,8 +473,8 @@ def assert_less(x, y, data=None, summarize=None, message=None, name=None):
     x = ops.convert_to_tensor(x, name='x')
     y = ops.convert_to_tensor(y, name='y')
     if context.in_eager_mode():
-      x_name = 'x'
-      y_name = 'y'
+      x_name = _shape_and_dtype_str(x)
+      y_name = _shape_and_dtype_str(y)
     else:
       x_name = x.name
       y_name = y.name
@@ -502,11 +519,18 @@ def assert_less_equal(x, y, data=None, summarize=None, message=None, name=None):
   with ops.name_scope(name, 'assert_less_equal', [x, y, data]):
     x = ops.convert_to_tensor(x, name='x')
     y = ops.convert_to_tensor(y, name='y')
+    if context.in_eager_mode():
+      x_name = _shape_and_dtype_str(x)
+      y_name = _shape_and_dtype_str(y)
+    else:
+      x_name = x.name
+      y_name = y.name
+
     if data is None:
       data = [
           message,
           'Condition x <= y did not hold element-wise:'
-          'x (%s) = ' % x.name, x, 'y (%s) = ' % y.name, y
+          'x (%s) = ' % x_name, x, 'y (%s) = ' % y_name, y
       ]
     condition = math_ops.reduce_all(math_ops.less_equal(x, y))
     return control_flow_ops.Assert(condition, data, summarize=summarize)
@@ -542,11 +566,18 @@ def assert_greater(x, y, data=None, summarize=None, message=None, name=None):
   with ops.name_scope(name, 'assert_greater', [x, y, data]):
     x = ops.convert_to_tensor(x, name='x')
     y = ops.convert_to_tensor(y, name='y')
+    if context.in_eager_mode():
+      x_name = _shape_and_dtype_str(x)
+      y_name = _shape_and_dtype_str(y)
+    else:
+      x_name = x.name
+      y_name = y.name
+
     if data is None:
       data = [
           message,
           'Condition x > y did not hold element-wise:'
-          'x (%s) = ' % x.name, x, 'y (%s) = ' % y.name, y
+          'x (%s) = ' % x_name, x, 'y (%s) = ' % y_name, y
       ]
     condition = math_ops.reduce_all(math_ops.greater(x, y))
     return control_flow_ops.Assert(condition, data, summarize=summarize)
@@ -584,11 +615,18 @@ def assert_greater_equal(x, y, data=None, summarize=None, message=None,
   with ops.name_scope(name, 'assert_greater_equal', [x, y, data]):
     x = ops.convert_to_tensor(x, name='x')
     y = ops.convert_to_tensor(y, name='y')
+    if context.in_eager_mode():
+      x_name = _shape_and_dtype_str(x)
+      y_name = _shape_and_dtype_str(y)
+    else:
+      x_name = x.name
+      y_name = y.name
+
     if data is None:
       data = [
           message,
           'Condition x >= y did not hold element-wise:'
-          'x (%s) = ' % x.name, x, 'y (%s) = ' % y.name, y
+          'x (%s) = ' % x_name, x, 'y (%s) = ' % y_name, y
       ]
     condition = math_ops.reduce_all(math_ops.greater_equal(x, y))
     return control_flow_ops.Assert(condition, data, summarize=summarize)
@@ -676,10 +714,15 @@ def assert_rank(x, rank, data=None, summarize=None, message=None, name=None):
     static_condition = lambda actual_rank, given_rank: actual_rank == given_rank
     dynamic_condition = math_ops.equal
 
+    if context.in_eager_mode():
+      name = ''
+    else:
+      name = x.name
+
     if data is None:
       data = [
           message,
-          'Tensor %s must have rank' % x.name, rank, 'Received shape: ',
+          'Tensor %s must have rank' % name, rank, 'Received shape: ',
           array_ops.shape(x)
       ]
 
@@ -691,7 +734,7 @@ def assert_rank(x, rank, data=None, summarize=None, message=None, name=None):
       if e.args[0] == 'Static rank condition failed':
         raise ValueError(
             '%s.  Tensor %s must have rank %d.  Received rank %d, shape %s' %
-            (message, x.name, e.args[2], e.args[1], x.get_shape()))
+            (message, name, e.args[2], e.args[1], x.get_shape()))
       else:
         raise
 
@@ -734,10 +777,16 @@ def assert_rank_at_least(
 
     static_condition = lambda actual_rank, given_rank: actual_rank >= given_rank
     dynamic_condition = math_ops.greater_equal
+
+    if context.in_eager_mode():
+      name = ''
+    else:
+      name = x.name
+
     if data is None:
       data = [
           message,
-          'Tensor %s must have rank at least' % x.name, rank,
+          'Tensor %s must have rank at least' % name, rank,
           'Received shape: ', array_ops.shape(x)
       ]
 
@@ -749,7 +798,7 @@ def assert_rank_at_least(
       if e.args[0] == 'Static rank condition failed':
         raise ValueError(
             '%s.  Tensor %s must have rank at least %d.  Received rank %d, '
-            'shape %s' % (message, x.name, e.args[2], e.args[1], x.get_shape()))
+            'shape %s' % (message, name, e.args[2], e.args[1], x.get_shape()))
       else:
         raise
 
@@ -856,9 +905,14 @@ def assert_rank_in(
     ranks = tuple([ops.convert_to_tensor(rank, name='rank') for rank in ranks])
     message = message or ''
 
+    if context.in_eager_mode():
+      name = ''
+    else:
+      name = x.name
+
     if data is None:
       data = [
-          message, 'Tensor %s must have rank in' % x.name
+          message, 'Tensor %s must have rank in' % name
       ] + list(ranks) + [
           'Received shape: ', array_ops.shape(x)
       ]
@@ -871,7 +925,7 @@ def assert_rank_in(
       if e.args[0] == 'Static rank condition failed':
         raise ValueError(
             '%s.  Tensor %s must have rank in %s.  Received rank %d, '
-            'shape %s' % (message, x.name, e.args[2], e.args[1], x.get_shape()))
+            'shape %s' % (message, name, e.args[2], e.args[1], x.get_shape()))
       else:
         raise
 
@@ -903,9 +957,13 @@ def assert_integer(x, message=None, name=None):
   with ops.name_scope(name, 'assert_integer', [x]):
     x = ops.convert_to_tensor(x, name='x')
     if not x.dtype.is_integer:
+      if context.in_eager_mode():
+        name = 'tensor'
+      else:
+        name = x.name
       err_msg = (
           '%s  Expected "x" to be integer type.  Found: %s of dtype %s'
-          % (message, x.name, x.dtype))
+          % (message, name, x.dtype))
       raise TypeError(err_msg)
 
     return control_flow_ops.no_op('statically_determined_was_integer')
@@ -1079,6 +1137,10 @@ def assert_scalar(tensor, name=None):
     tensor = ops.convert_to_tensor(tensor, name=name_scope)
     shape = tensor.get_shape()
     if shape.ndims != 0:
-      raise ValueError('Expected scalar shape for %s, saw shape: %s.'
-                       % (tensor.name, shape))
+      if context.in_eager_mode():
+        raise ValueError('Expected scalar shape, saw shape: %s.'
+                         % (shape,))
+      else:
+        raise ValueError('Expected scalar shape for %s, saw shape: %s.'
+                         % (tensor.name, shape))
     return tensor
diff --git a/tensorflow/python/ops/control_flow_grad.py b/tensorflow/python/ops/control_flow_grad.py
index 3c082b19b6b79491dc6572c056084932d8697a2d..97b57177b29986a006df992f4c0c2b79e11467aa 100644
--- a/tensorflow/python/ops/control_flow_grad.py
+++ b/tensorflow/python/ops/control_flow_grad.py
@@ -23,6 +23,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import math_ops
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import,undefined-variable
@@ -52,7 +53,8 @@ def _SwitchGrad(op, *grad):
       # TODO(yuanbyu): Perform shape inference with this new input.
       if grad[1] is not None:
         # pylint: disable=protected-access
-        control_flow_ops._AddNextAndBackEdge(merge_grad, grad[1])
+        control_flow_ops._AddNextAndBackEdge(merge_grad, grad[1],
+                                             enforce_shape_invariant=False)
         # pylint: enable=protected-access
       return None, None
     elif grad[0] is not None:
@@ -69,13 +71,12 @@ def _SwitchGrad(op, *grad):
       # meaning the output is not differentiable.
       return None, None
   elif isinstance(op_ctxt, CondContext):
-    good_grad = grad[op_ctxt.branch]
     zero_grad = grad[1 - op_ctxt.branch]
     # At this point, we have created zero_grad guarded by the right switch.
     # Unfortunately, we may still get None here for not trainable data types.
     if zero_grad is None:
       return None, None
-    return merge([good_grad, zero_grad], name="cond_grad")[0], None
+    return merge(grad, name="cond_grad")[0], None
   else:
     false_grad = switch(grad[0], op.inputs[1])[0]
     true_grad = switch(grad[1], op.inputs[1])[1]
@@ -92,7 +93,7 @@ def _MergeGrad(op, grad, _):
   input_op = op.inputs[0].op
   graph = ops.get_default_graph()
   # pylint: disable=protected-access
-  op_ctxt = control_flow_ops._GetOutputContext(input_op)
+  op_ctxt = control_flow_util.GetOutputContext(input_op)
   grad_ctxt = graph._get_control_flow_context()
   # pylint: enable=protected-access
   if isinstance(op_ctxt, WhileContext):
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index d33d4cd597c177fe43d7331bce60a83768f5bbbd..4d108155e4ed8754b2f377874f58432b19985bd4 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -52,6 +52,7 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import functools
 
 import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
@@ -66,6 +67,7 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_util as util
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_control_flow_ops
 from tensorflow.python.ops import gen_data_flow_ops
@@ -505,29 +507,6 @@ def _convert_flows_to_tensorarrays(tensors_or_tensorarrays, tensors_or_flows):
       for (ta, t_or_flow) in zip(tensors_or_tensorarrays, tensors_or_flows)]
 
 
-def _IsLoopConstantEnter(op):
-  """Return true iff op is a loop invariant."""
-  is_enter = (op.type == "Enter" or op.type == "RefEnter")
-  return is_enter and op.get_attr("is_constant")
-
-
-def _GetLoopConstantEnter(value):
-  """Return the enter op if we can infer `value` to be a loop invariant."""
-  id_ops = {"Switch", "RefSwitch", "Identity", "RefIdentity"}
-  op = value.op
-  while op.type in id_ops:
-    op = op.inputs[0].op
-  return op if _IsLoopConstantEnter(op) else None
-
-
-def _GetOutputContext(op):
-  """Return the control flow context for the output of an op."""
-  ctxt = op._get_control_flow_context()
-  if IsLoopExit(op):
-    ctxt = ctxt.outer_context
-  return ctxt
-
-
 def _ShapeLessThanOrEqual(shape1, shape2):
   if shape2.dims is None:
     return True
@@ -612,6 +591,8 @@ def _EnforceShapeInvariant(merge_var, next_var):
     m_shape = merge_var.get_shape()
     n_shape = next_var.get_shape()
     if not _ShapeLessThanOrEqual(n_shape, m_shape):
+      # TODO(skyewm): get original loop input that caused the shape error and
+      # report its name instead of the merge node's.
       raise ValueError(
           "The shape for %s is not an invariant for the loop. It enters "
           "the loop with shape %s, but has shape %s after one iteration. "
@@ -663,11 +644,17 @@ def _EnforceShapeInvariant(merge_var, next_var):
              n_values_shape, n_indices_shape, n_shape_shape))
 
 
-def _AddNextAndBackEdge(m, v):
+def _AddNextAndBackEdge(m, v, enforce_shape_invariant=True):
   """Add NextIteration and back edge from v to m."""
   if isinstance(m, ops.Tensor):
     v = ops.convert_to_tensor(v)
     v = _NextIteration(v)
+    if enforce_shape_invariant:
+      # Make sure the shapes of loop outputs are correct. We do this before
+      # calling _update_input, which will raise a less-helpful error message if
+      # the types don't match.
+      # TODO(skyewm): call this for other cases below (needs testing)
+      _EnforceShapeInvariant(m, v)
     m.op._update_input(1, v)   # pylint: disable=protected-access
   elif isinstance(m, ops.IndexedSlices):
     # pylint: disable=protected-access
@@ -762,22 +749,26 @@ class GradLoopState(object):
 
       outer_grad_ctxt = outer_grad_state.grad_context
       outer_grad_ctxt.Enter()
-      self._grad_context = WhileContext(forward_ctxt.parallel_iterations,
-                                        forward_ctxt.back_prop,
-                                        forward_ctxt.swap_memory,
-                                        forward_ctxt.name,
-                                        self)
+      self._grad_context = WhileContext(
+          maximum_iterations=forward_ctxt.maximum_iterations,
+          parallel_iterations=forward_ctxt.parallel_iterations,
+          back_prop=forward_ctxt.back_prop,
+          swap_memory=forward_ctxt.swap_memory,
+          name=forward_ctxt.name,
+          grad_state=self)
       real_cnt = outer_grad_state.AddBackpropAccumulatedValue(history_cnt, cnt)
       self._grad_index = self._grad_context.AddBackpropLoopCounter(
           real_cnt, outer_grad_state)
       outer_grad_ctxt.Exit()
     else:
       if outer_forward_ctxt: outer_forward_ctxt.Enter()
-      self._grad_context = WhileContext(forward_ctxt.parallel_iterations,
-                                        forward_ctxt.back_prop,
-                                        forward_ctxt.swap_memory,
-                                        forward_ctxt.name,
-                                        self)
+      self._grad_context = WhileContext(
+          maximum_iterations=forward_ctxt.maximum_iterations,
+          parallel_iterations=forward_ctxt.parallel_iterations,
+          back_prop=forward_ctxt.back_prop,
+          swap_memory=forward_ctxt.swap_memory,
+          name=forward_ctxt.name,
+          grad_state=self)
       self._grad_index = self._grad_context.AddBackpropLoopCounter(
           cnt, outer_grad_state)
       if outer_forward_ctxt: outer_forward_ctxt.Exit()
@@ -907,9 +898,14 @@ class GradLoopState(object):
     with ops.control_dependencies(None):
       if curr_ctxt: curr_ctxt.Enter()
       with ops.colocate_with(value):
+        maximum_iterations = self.forward_context.maximum_iterations
+        if maximum_iterations is None:
+          maximum_iterations = constant_op.constant(-1, dtypes.int32)
         # pylint: disable=protected-access
-        acc = gen_data_flow_ops._stack_v2(-1, value.dtype.base_dtype,
-                                          name="f_acc")
+        acc = gen_data_flow_ops._stack_v2(
+            max_size=maximum_iterations,
+            elem_type=value.dtype.base_dtype,
+            name="f_acc")
         # pylint: enable=protected-access
       if curr_ctxt: curr_ctxt.Exit()
 
@@ -918,7 +914,7 @@ class GradLoopState(object):
 
       # Add the stack_push op in the context of value.op.
       swap_enabled = self.forward_context.swap_memory
-      value_ctxt = _GetOutputContext(value.op)
+      value_ctxt = util.GetOutputContext(value.op)
       if value_ctxt == self.forward_context:
         # value is not nested in the forward context.
         self.forward_context.Enter()
@@ -1028,7 +1024,7 @@ class GradLoopState(object):
       cur_value = value
       cur_grad_state = self
       while True:
-        enter_op = _GetLoopConstantEnter(cur_value)
+        enter_op = util.GetLoopConstantEnter(cur_value)
         if enter_op:
           # Special case: cur_value comes from a constant Enter node.
           cur_value = enter_op.inputs[0]
@@ -1081,7 +1077,7 @@ class ControlFlowState(object):
 
   def GetGradState(self, op, before):
     """Return the grad state for this op if it's in a forward loop context."""
-    if before and IsLoopExit(op):
+    if before and util.IsLoopExit(op):
       forward_ctxt = op._get_control_flow_context()
       forward_ctxt = forward_ctxt.outer_context
       if forward_ctxt:
@@ -1241,8 +1237,8 @@ class ControlFlowState(object):
     Returns:
       A zero tensor of the same shape of op.outputs[index].
     """
-    if IsLoopSwitch(op): return None
-    dead_branch = IsSwitch(op)
+    if util.IsLoopSwitch(op): return None
+    dead_branch = util.IsSwitch(op)
     forward_ctxt = _GetWhileContext(op)
     grad_state = self._map.get(forward_ctxt)
     if grad_state is None:
@@ -1342,7 +1338,7 @@ def MaybeCreateControlFlowState(between_op_list, between_ops,
   """
   loop_state = None
   for op in between_op_list:
-    if IsLoopExit(op):
+    if util.IsLoopExit(op):
       if loop_state is None:
         loop_state = ControlFlowState()
       if colocate_gradients_with_ops:
@@ -1353,28 +1349,10 @@ def MaybeCreateControlFlowState(between_op_list, between_ops,
   return loop_state
 
 
-def IsSwitch(op):
-  """Return true if `op` is a Switch."""
-  return op.type == "Switch" or op.type == "RefSwitch"
-
-
-def IsLoopExit(op):
-  """Return true if `op` is an Exit."""
-  return op.type == "Exit" or op.type == "RefExit"
-
-
-def IsLoopSwitch(op):
-  """Return true if `op` is the Switch for a while loop."""
-  if IsSwitch(op):
-    ctxt = op._get_control_flow_context()
-    return ctxt and isinstance(ctxt, WhileContext)
-  return False
-
-
 def ZerosLikeOutsideLoop(op, index):
   """Create zeros_like for the specified output of an op."""
   val = op.outputs[index]
-  if not IsSwitch(op):
+  if not util.IsSwitch(op):
     return array_ops.zeros_like(val, optimize=False)
   else:
     op_ctxt = op._get_control_flow_context()
@@ -1445,6 +1423,10 @@ class ControlFlowContext(object):
       g.as_graph_element(op)._set_control_flow_context(self)
       # pylint: enable=protected-access
 
+  @property
+  def name(self):
+    return self._name
+
   @property
   def outer_context(self):
     """Return the context containing this context."""
@@ -1511,7 +1493,7 @@ class ControlFlowContext(object):
     return None
 
   def _IsInOuterContext(self, op):
-    op_ctxt = _GetOutputContext(op)
+    op_ctxt = util.GetOutputContext(op)
     outer_ctxt = self.outer_context
     while outer_ctxt != op_ctxt:
       if outer_ctxt is None:
@@ -1529,11 +1511,11 @@ class ControlFlowContext(object):
     else:
       internal_control_inputs = []
       for x in op.control_inputs:
-        ctxt = _GetOutputContext(x)
+        ctxt = util.GetOutputContext(x)
         if ctxt is not None and ctxt.GetWhileContext() == while_ctxt:
           internal_control_inputs.append(x)
     if len(internal_control_inputs) != len(op.control_inputs):
-      del op.control_inputs[:]
+      op._remove_all_control_inputs()
       op._add_control_inputs(internal_control_inputs)
     return internal_control_inputs
   # pylint: enable=protected-access
@@ -1547,6 +1529,15 @@ class ControlFlowContext(object):
     """Returns the pivot node for this context, or None."""
     return None
 
+  def IsWhileContext(self):
+    return False
+
+  def IsCondContext(self):
+    return False
+
+  def __str__(self):
+    return self.name
+
 
 class CondContext(ControlFlowContext):
   """The context for the conditional construct."""
@@ -1600,10 +1591,6 @@ class CondContext(ControlFlowContext):
     super(CondContext, self).__init__(values_def=context_def.values_def,
                                       import_scope=import_scope)
 
-  @property
-  def name(self):
-    return self._name
-
   @property
   def pred(self):
     return self._pred
@@ -1720,7 +1707,7 @@ class CondContext(ControlFlowContext):
         op._add_control_input(self._pivot.op)
       # pylint: enable=protected-access
 
-    if self._outer_context or not IsLoopExit(op):
+    if self._outer_context or not util.IsLoopExit(op):
       op.graph.prevent_fetching(op)
 
     if self._outer_context:
@@ -1764,7 +1751,19 @@ class CondContext(ControlFlowContext):
 
   def BuildCondBranch(self, fn):
     """Add the subgraph defined by fn() to the graph."""
+    pre_summaries = ops.get_collection(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
     original_result = fn()
+    post_summaries = ops.get_collection(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
+    if len(post_summaries) > len(pre_summaries):
+      new_summaries = post_summaries[len(pre_summaries):]
+      summary_ref = ops.get_collection_ref(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
+      summary_ref[:] = pre_summaries
+      with ops.control_dependencies(new_summaries):
+        if original_result is None:
+          return no_op(), None
+        else:
+          original_result = nest.map_structure(
+              array_ops.identity, original_result)
     if original_result is None:
       return None, None
 
@@ -1773,6 +1772,9 @@ class CondContext(ControlFlowContext):
       result = [result]
     return original_result, result
 
+  def IsCondContext(self):
+    return True
+
 
 def _UnpackIfSingleton(res):
   if isinstance(res, (list, _basetuple)) and len(res) == 1:
@@ -1781,6 +1783,7 @@ def _UnpackIfSingleton(res):
     return res
 
 
+# pylint: disable=redefined-outer-name
 # pylint: disable=g-doc-args
 @deprecation.deprecated_args(
     None,
@@ -1957,6 +1960,7 @@ def cond(pred, true_fn=None, false_fn=None, strict=False, name=None,
       merges = _UnpackIfSingleton(merges)
     return merges
 # pylint: enable=g-doc-args
+# pylint: enable=redefined-outer-name
 
 
 def _resource_safe_shape(t):
@@ -1974,12 +1978,19 @@ def _resource_safe_shape(t):
 class WhileContext(ControlFlowContext):
   """The context for the loop construct."""
 
-  def __init__(self, parallel_iterations=10, back_prop=True, swap_memory=False,
-               name="while_context", grad_state=None, context_def=None,
+  def __init__(self,
+               maximum_iterations=None,
+               parallel_iterations=10,
+               back_prop=True,
+               swap_memory=False,
+               name="while_context",
+               grad_state=None,
+               context_def=None,
                import_scope=None):
     """"Creates a `WhileContext`.
 
     Args:
+      maximum_iterations: Optional upper bound on number of loop iterations.
       parallel_iterations: The number of iterations allowed to run in parallel.
       back_prop: Whether backprop is enabled for this while loop.
       swap_memory: Whether GPU-CPU memory swap is enabled for this loop.
@@ -1994,16 +2005,17 @@ class WhileContext(ControlFlowContext):
       self._init_from_proto(context_def, import_scope=import_scope)
     else:
       ControlFlowContext.__init__(self)
-      self._init_from_args(parallel_iterations, back_prop, swap_memory,
-                           name)
+      self._init_from_args(maximum_iterations, parallel_iterations, back_prop,
+                           swap_memory, name)
     # The gradient loop state.
     self._grad_state = grad_state
 
-  def _init_from_args(self, parallel_iterations, back_prop, swap_memory,
-                      name):
+  def _init_from_args(self, maximum_iterations, parallel_iterations, back_prop,
+                      swap_memory, name):
     """Creates a new `WhileContext` from arguments.
 
     Args:
+      maximum_iterations: Optional upper bound on number of loop iterations.
       parallel_iterations: The number of iterations allowed to run in parallel.
       back_prop: Whether backprop is enabled for this while loop.
       swap_memory: Whether GPU-CPU memory swap is enabled for this loop.
@@ -2016,6 +2028,7 @@ class WhileContext(ControlFlowContext):
       raise ValueError("`parallel_iterations` must be a positive integer: "
                        "%s" % parallel_iterations)
     self._name = ops.get_default_graph().unique_name(name)
+    self._maximum_iterations = maximum_iterations
     self._parallel_iterations = parallel_iterations
     self._back_prop = back_prop
     self._swap_memory = swap_memory
@@ -2043,6 +2056,12 @@ class WhileContext(ControlFlowContext):
     g = ops.get_default_graph()
     self._name = ops.prepend_name_scope(
         context_def.context_name, import_scope)
+    if context_def.maximum_iterations_name:
+      self._maximum_iterations = g.as_graph_element(
+          ops.prepend_name_scope(context_def.maximum_iterations_name,
+                                 import_scope))
+    else:
+      self._maximum_iterations = None
     self._parallel_iterations = context_def.parallel_iterations
     self._back_prop = context_def.back_prop
     self._swap_memory = context_def.swap_memory
@@ -2067,8 +2086,9 @@ class WhileContext(ControlFlowContext):
                                        import_scope=import_scope)
 
   @property
-  def name(self):
-    return self._name
+  def maximum_iterations(self):
+    """The maximum number of iterations that will be executed."""
+    return self._maximum_iterations
 
   @property
   def parallel_iterations(self):
@@ -2120,6 +2140,9 @@ class WhileContext(ControlFlowContext):
       context_def.context_name = ops.strip_name_scope(
           self.name, export_scope)
       context_def.parallel_iterations = self._parallel_iterations
+      if self._maximum_iterations is not None:
+        context_def.maximum_iterations_name = ops.strip_name_scope(
+            self._maximum_iterations.name, export_scope)
       context_def.back_prop = self._back_prop
       context_def.swap_memory = self._swap_memory
       context_def.pivot_for_pred_name = ops.strip_name_scope(
@@ -2178,7 +2201,7 @@ class WhileContext(ControlFlowContext):
         grad_ctxt = grad_ctxt.GetWhileContext()
         if grad_ctxt.grad_state:
           forward_ctxt = _GetWhileContext(val.op)
-          if IsLoopExit(val.op):
+          if util.IsLoopExit(val.op):
             forward_ctxt = forward_ctxt.outer_context
             if forward_ctxt:
               forward_ctxt = forward_ctxt.GetWhileContext()
@@ -2260,7 +2283,7 @@ class WhileContext(ControlFlowContext):
       self._MaybeAddControlDependency(op)
       for x in op.outputs:
         self._values.add(x.name)
-    if self._outer_context or not IsLoopExit(op):
+    if self._outer_context or not util.IsLoopExit(op):
       op.graph.prevent_fetching(op)
       for x in op.outputs:
         op.graph.prevent_feeding(x)
@@ -2279,7 +2302,7 @@ class WhileContext(ControlFlowContext):
         return True
       # pylint: enable=protected-access
       for x in op.inputs:
-        if not _IsLoopConstantEnter(x.op):
+        if not util.IsLoopConstantEnter(x.op):
           return False
       return True
     if _IsOpFree(op):
@@ -2515,9 +2538,17 @@ class WhileContext(ControlFlowContext):
     if shape_acc is not None:
       self.AddName(shape_acc.name)
       init_acc.append(shape_acc)
+
+    # Set use_input_shape=False since the accumulator tensors will grow in
+    # size. If use_input_shape=True, the _update_input call below will result in
+    # incompatible shapes.
     enter_acc = [_Enter(x, self._name, is_constant=False,
                         parallel_iterations=self._parallel_iterations,
-                        name="b_acc") for x in init_acc]
+                        use_input_shape=False, name="b_acc") for x in init_acc]
+    # Manually set appropriate partial shapes.
+    enter_acc[0].set_shape([None])
+    if values_acc.shape.dims is not None:
+      enter_acc[1].set_shape([None] + values_acc.shape.as_list()[1:])
     self.loop_enters.extend(enter_acc)
 
     merge_acc = [merge([x, x], name="b_acc")[0] for x in enter_acc]
@@ -2595,7 +2626,7 @@ class WhileContext(ControlFlowContext):
 
     if control_pivot is not None:
       for var in enter_vars:
-        if _IsLoopConstantEnter(var.op.inputs[0].op):
+        if util.IsLoopConstantEnter(var.op.inputs[0].op):
           # pylint: disable=protected-access
           var.op._add_control_input(control_pivot.op)
           # pylint: enable=protected-access
@@ -2629,9 +2660,23 @@ class WhileContext(ControlFlowContext):
     packed_vars_for_body = nest.pack_sequence_as(
         structure=original_loop_vars,
         flat_sequence=vars_for_body_with_tensor_arrays)
+    pre_summaries = ops.get_collection(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
     body_result = body(*packed_vars_for_body)
+    post_summaries = ops.get_collection(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
     if not nest.is_sequence(body_result):
       body_result = [body_result]
+    if len(post_summaries) > len(pre_summaries):
+      new_summaries = post_summaries[len(pre_summaries):]
+      summary_ref = ops.get_collection_ref(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
+      summary_ref[:] = pre_summaries
+      with ops.control_dependencies(new_summaries):
+        def map_fn(x):
+          # TODO(apassos) figure out how to trigger with tensor arrays as well
+          if isinstance(x, tensor_array_ops.TensorArray):
+            return x
+          return array_ops.identity(x)
+        body_result = nest.map_structure(map_fn, body_result)
+
     # Compare the structure types of input and output of body.
     # For backwards compatibility, the first layer is forced to a list
     # during this comparison, because inputs are typically lists and
@@ -2657,11 +2702,6 @@ class WhileContext(ControlFlowContext):
     exit_vars = [exit(x[0]) for x in switch_vars]
     self._loop_exits = exit_vars
 
-    # Make sure the shapes of loop outputs are correct.
-    for m_var, n_var in zip(merge_vars, next_vars):
-      if isinstance(m_var, ops.Tensor):
-        _EnforceShapeInvariant(m_var, n_var)
-
     # Exit the loop.
     self.ExitResult(exit_vars)
 
@@ -2708,7 +2748,7 @@ class WhileContext(ControlFlowContext):
         if shape is not None:
           xs.append(shape)
       for x in xs:
-        inp_op = x.op.inputs[0]
+        inp_op = x.op.inputs[0].op
         control_inputs = graph._control_dependencies_for_inputs([inp_op])
         outer_control_inputs = [op for op in control_inputs
                                 if self._IsInOuterContext(op)]
@@ -2717,10 +2757,14 @@ class WhileContext(ControlFlowContext):
         graph._record_op_seen_by_control_dependencies(x.op)
     # pylint: enable=protected-access
 
+  def IsWhileContext(self):
+    return True
+
 
+# pylint: disable=redefined-outer-name
 def while_loop(cond, body, loop_vars, shape_invariants=None,
                parallel_iterations=10, back_prop=True, swap_memory=False,
-               name=None):
+               name=None, maximum_iterations=None):
   """Repeat `body` while the condition `cond` is true.
 
   `cond` is a callable returning a boolean scalar tensor. `body` is a callable
@@ -2792,6 +2836,10 @@ def while_loop(cond, body, loop_vars, shape_invariants=None,
     back_prop: Whether backprop is enabled for this while loop.
     swap_memory: Whether GPU-CPU memory swap is enabled for this loop.
     name: Optional name prefix for the returned tensors.
+    maximum_iterations: Optional maximum number of iterations of the while loop
+      to run.  If provided, the `cond` output is AND-ed with an additional
+      condition ensuring the number of iterations executed is no greater than
+      `maximum_iterations`.
 
   Returns:
     The output tensors for the loop variables after the loop. When the length
@@ -2845,18 +2893,74 @@ def while_loop(cond, body, loop_vars, shape_invariants=None,
     if parallel_iterations < 1:
       raise TypeError("parallel_iterations must be a positive integer.")
 
+    if maximum_iterations is not None:
+      maximum_iterations = ops.convert_to_tensor(
+          maximum_iterations, name="maximum_iterations")
+      if maximum_iterations.shape.ndims != 0:
+        raise ValueError("maximum_iterations must be a scalar, saw shape: %s" %
+                         maximum_iterations.shape)
+
+      # If/when we generated the gradient for this while loop, the
+      # maximum_iterations tensor will be used as the input to any generated
+      # stack ops. It's likely the stacks will be outside any control flow
+      # context (i.e. if gradients() is called outside any control flow
+      # context), which will result in the maximum_iterations tensor being an
+      # illegal input (see control_flow_util.CheckInputFromValidContext).
+      #
+      # NOTE(skyewm): we could technically allow tensors from CondContexts, but
+      # that will be error-prone and hard to reason about for users.
+      #
+      # TODO(skyewm): make this work (it's tricky).
+      if (context.in_graph_mode() and
+          (util.IsInWhileLoop(maximum_iterations.op) or
+           util.IsInCond(maximum_iterations.op))):
+        raise ValueError(
+            "maximum_iterations tensor cannot be declared in tf.cond or "
+            "tf.while_loop. Please file an issue at "
+            "https://github.com/tensorflow/tensorflow/issues if you require "
+            "this functionality. (Control flow context: %s)" %
+            maximum_iterations.op._get_control_flow_context().name)  # pylint: disable=protected-access
+
+      counter = constant_op.constant(
+          0, dtype=maximum_iterations.dtype, name="iteration_counter")
+      orig_cond = cond
+      orig_body = body
+      if len(loop_vars) == 1:
+        loop_vars = (counter, loop_vars[0])
+        cond = lambda i, lv: (  # pylint: disable=g-long-lambda
+            math_ops.logical_and(i < maximum_iterations, orig_cond(lv)))
+        body = lambda i, lv: (i + 1, orig_body(lv))
+      else:
+        loop_vars = (counter, loop_vars)
+        cond = lambda i, lv: (  # pylint: disable=g-long-lambda
+            math_ops.logical_and(i < maximum_iterations, orig_cond(*lv)))
+        body = lambda i, lv: (i + 1, orig_body(*lv))
+
     if context.in_eager_mode():
       while cond(*loop_vars):
         loop_vars = body(*loop_vars)
-      return loop_vars
+      if maximum_iterations is not None:
+        return loop_vars[1]
+      else:
+        return loop_vars
 
     if shape_invariants is not None:
+      if maximum_iterations is not None:
+        shape_invariants = (tensor_shape.TensorShape([]), shape_invariants)
       nest.assert_same_structure(loop_vars, shape_invariants)
 
-    loop_context = WhileContext(parallel_iterations, back_prop, swap_memory)  # pylint: disable=redefined-outer-name
+    loop_context = WhileContext(
+        maximum_iterations=maximum_iterations,
+        parallel_iterations=parallel_iterations,
+        back_prop=back_prop,
+        swap_memory=swap_memory)
     ops.add_to_collection(ops.GraphKeys.WHILE_CONTEXT, loop_context)
     result = loop_context.BuildLoop(cond, body, loop_vars, shape_invariants)
-    return result
+    if maximum_iterations is not None:
+      return result[1]
+    else:
+      return result
+# pylint: enable=redefined-outer-name
 
 
 def _AsTensorList(x, p):
@@ -3069,23 +3173,105 @@ def tuple(tensors, name=None, control_inputs=None):
     return tpl
 
 
-def _assert_exclusive(preds):
-  """Returns an Assert op that checks that the predicates are exclusive."""
-  preds_c = array_ops.stack(preds, name="preds_c")
+def _assert_at_most_n_true(predicates, n, msg):
+  """Returns an Assert op that checks that at most n predicates are True.
+
+  Args:
+    predicates: list of bool scalar tensors.
+    n: maximum number of true predicates allowed.
+    msg: Error message.
+  """
+  preds_c = array_ops.stack(predicates, name="preds_c")
   num_true_conditions = math_ops.reduce_sum(
       math_ops.cast(preds_c, dtypes.int32), name="num_true_conds")
-  at_most_one_true_condition = math_ops.less(
-      num_true_conditions, constant_op.constant(2, name="two_true_conds"))
+  condition = math_ops.less_equal(num_true_conditions,
+                                  constant_op.constant(n, name="n_true_conds"))
+  preds_names = ", ".join(getattr(p, "name", "?") for p in predicates)
+  error_msg = [
+      "%s: more than %d conditions (%s) evaluated as True:" %
+      (msg, n, preds_names), preds_c
+  ]
+  return Assert(condition, data=error_msg, summarize=len(predicates))
+
 
-  error_msg = [("More than one condition evaluated as True but "
-                "exclusive=True.  Conditions: (%s), Values:"
-                % ", ".join([p.name for p in preds])),
-               preds_c]
-  return Assert(condition=at_most_one_true_condition, data=error_msg,
-                summarize=len(preds))
+def _case_create_default_action(predicates, actions):
+  """Creates default action for a list of actions and their predicates.
 
+  It uses the input actions to select an arbitrary as default and makes sure
+  that corresponding predicates have valid values.
 
-def case(pred_fn_pairs, default=None, exclusive=False, strict=False,
+  Args:
+    predicates: a list of bool scalar tensors
+    actions: a list of callable objects which return tensors.
+
+  Returns:
+    a callable
+  """
+  k = len(predicates) - 1  # could pick any
+  predicate, action = predicates[k], actions[k]
+  other_predicates, other_actions = predicates[:k], actions[:k]
+
+  def default_action():
+    others_msg = ("Implementation error: "
+                  "selected default action #%d was called, but some of other "
+                  "predicates are True: " % k)
+    default_msg = ("Input error: "
+                   "None of conditions evaluated as True:",
+                   array_ops.stack(predicates, name="preds_c"))
+    with ops.control_dependencies([
+        _assert_at_most_n_true(other_predicates, n=0, msg=others_msg),
+        Assert(predicate, data=default_msg)
+    ]):
+      return action()
+
+  return default_action, other_predicates, other_actions
+
+
+def _case_verify_and_canonicalize_args(pred_fn_pairs, exclusive, name):
+  """Verifies input arguments for the case function.
+
+  Args:
+    pred_fn_pairs: Dict or list of pairs of a boolean scalar tensor and a
+                   callable which returns a list of tensors.
+    exclusive: True iff at most one predicate is allowed to evaluate to `True`.
+    name: A name for the case operation.
+
+  Raises:
+    TypeError: If `pred_fn_pairs` is not a list/dictionary.
+    TypeError: If `pred_fn_pairs` is a list but does not contain 2-tuples.
+    TypeError: If `fns[i]` is not callable for any i, or `default` is not
+               callable.
+
+  Returns:
+    a tuple <list of scalar bool tensors, list of callables>.
+  """
+  if not isinstance(pred_fn_pairs, (list, _basetuple, dict)):
+    raise TypeError("fns must be a list, tuple, or dict")
+
+  if isinstance(pred_fn_pairs, collections.OrderedDict):
+    pred_fn_pairs = pred_fn_pairs.items()
+  elif isinstance(pred_fn_pairs, dict):
+    pred_fn_pairs = sorted(pred_fn_pairs.items(), key=lambda item: item[0].name)
+    if not exclusive:
+      logging.warn("%s: An unordered dictionary of predicate/fn pairs was "
+                   "provided, but exclusive=False. The order of conditional "
+                   "tests is deterministic but not guaranteed.", name)
+  for pred_fn_pair in pred_fn_pairs:
+    if not isinstance(pred_fn_pair, _basetuple) or len(pred_fn_pair) != 2:
+      raise TypeError("Each entry in pred_fn_pairs must be a 2-tuple")
+    pred, fn = pred_fn_pair
+    if pred.dtype != dtypes.bool:
+      raise TypeError("pred must be of type bool: %s", pred.name)
+    if not callable(fn):
+      raise TypeError("fn for pred %s must be callable." % pred.name)
+  predicates, actions = zip(*pred_fn_pairs)
+  return predicates, actions
+
+
+def case(pred_fn_pairs,
+         default=None,
+         exclusive=False,
+         strict=False,
          name="case"):
   """Create a case operation.
 
@@ -3170,152 +3356,27 @@ def case(pred_fn_pairs, default=None, exclusive=False, strict=False,
     TypeError: If `pred_fn_pairs` is a list but does not contain 2-tuples.
     TypeError: If `fns[i]` is not callable for any i, or `default` is not
                callable.
-    ValueError: If in eager mode and all predicates are false and no
-               default is provided.
-    ValueError: If in eager mode and is passed a dictionary.
   """
-  pfp = pred_fn_pairs  # For readability
-  if not (isinstance(pfp, list) or isinstance(pfp, _basetuple)
-          or isinstance(pfp, dict)):
-    raise TypeError("fns must be a list, tuple, or dict")
-  if isinstance(pfp, dict):
-    if context.in_eager_mode():
-      raise ValueError(
-          "In eager mode the predicates must be a list, not a dictionary.")
-    if isinstance(pfp, collections.OrderedDict):
-      pfp = pfp.items()
-    else:
-      pfp = sorted(pfp.items(), key=lambda item: item[0].name)
-      if not exclusive:
-        logging.warn("%s: An unordered dictionary of predicate/fn pairs was "
-                     "provided, but exclusive=False. The order of conditional "
-                     "tests is deterministic but not guaranteed.", name)
-  for tup in pfp:
-    if not isinstance(tup, _basetuple) or len(tup) != 2:
-      raise TypeError("Each entry in pred_fn_pairs must be a 2-tuple")
-    pred, fn = tup
-    if pred.dtype != dtypes.bool:
-      raise TypeError("pred must be of type bool: %s", pred.name)
-    if not callable(fn):
-      raise TypeError("fn for pred %s must be callable." % pred.name)
-
-  if default is not None and not callable(default):
-    raise TypeError("default must be callable.")
-
-  if context.in_eager_mode():
-    for pred, fn in pfp:
-      if pred:
-        return fn()
-    if default is None:
-      raise ValueError("tf.case received all false predicates and no default.")
-    return default()
-
-  preds, fns = map(list, zip(*pfp))
-  del pfp  # From now on, preds and fns form the source of truth.
-
-  with ops.name_scope(name, "case", [preds]):
-    exclusivity_assert = _assert_exclusive(preds) if exclusive else None
-    # If no default is provided, then we remove one of the (predicate, function)
-    # pairs and define the default to be the removed function with an additional
-    # control dependency that asserts that the removed predicate holds.
+  predicates, actions = _case_verify_and_canonicalize_args(
+      pred_fn_pairs, exclusive, name)
+  with ops.name_scope(name, "case", [predicates]):
     if default is None:
-      all_preds = _basetuple(preds)  # For the error message.
-      last_pred, last_fn = preds.pop(), fns.pop()
-      def new_default():
-        preds_c = array_ops.stack(all_preds, name="preds_c")
-        error_msg = [
-            ("None of the conditions evaluated as True. Conditions: (%s), "
-             "Values:" % ", ".join([p.name for p in all_preds])),
-            preds_c]
-        assertion = Assert(condition=last_pred,
-                           data=error_msg, summarize=len(all_preds))
-        with ops.control_dependencies([assertion]):
-          return last_fn()
-      default = new_default
-
-    if not preds:
-      return default()
-    not_preds = []
-    for i, p in enumerate(preds):
-      with ops.name_scope("not_%d" % i):
-        not_preds.append(math_ops.logical_not(p))
-    and_not_preds = [constant_op.constant(True, name="always_true")]
-    for i, notp in enumerate(not_preds):
-      with ops.name_scope("and_not_%d" % i):
-        and_not_preds.append(math_ops.logical_and(and_not_preds[-1], notp))
-
-    # preds = [p1, p2, p3]
-    # fns = [f1, f2, f3]
-    # not_preds = [~p1, ~p2, ~p3]
-    # and_not_preds = [True, ~p1, ~p1 & ~p2, ~p1 & ~p2 & ~p3]
-    # case_preds = [p1,
-    #               p2 & ~p1,
-    #               p3 & ~p2 & ~p1,
-    #              ~p3 & ~p2 & ~p1]
-
-    case_preds = []
-    for i, (p, and_not_p_prev) in enumerate(zip(preds, and_not_preds[:-1])):
-      with ops.name_scope("case_%d" % i):
-        case_preds.append(math_ops.logical_and(p, and_not_p_prev))
-    with ops.name_scope("case_none_are_true"):
-      case_preds.append(and_not_preds[-1])
-
-    # Create an empty tensor, or list, with the right type and shape
-    with ops.name_scope("case_create_empty"):
-      def _create_empty_constant(dtype, shape):
-        value = ("" if dtype == dtypes.string else dtype.as_numpy_dtype())
-        if shape.ndims is None:
-          return array_ops.constant(value, dtype=dtype)
-        else:
-          temp_shape = [1 if x.value is None else x.value for x in shape]
-          result = array_ops.constant(value, shape=temp_shape, dtype=dtype)
-          result._shape = shape  # pylint: disable=protected-access
-          return result
-
-      def _correct_empty(v):
-        if isinstance(v, ops.Operation):
-          return no_op()
-        elif isinstance(v, tensor_array_ops.TensorArray):
-          return v
-        elif not hasattr(v, "dtype"):
-          return ops.convert_to_tensor(v)
-        elif isinstance(v, sparse_tensor.SparseTensor):
-          return sparse_tensor.SparseTensor(indices=[[0] * len(v.get_shape())],
-                                            values=[v.dtype.as_numpy_dtype()],
-                                            dense_shape=v.get_shape())
-        else:
-          return _create_empty_constant(v.dtype, v.get_shape())
-
-      empty = lambda: nest.map_structure(_correct_empty, default())
-
-    # case_sequence = [
-    #   cond(~p3 & ~p2 & ~p1, default, empty),
-    #   cond(p3 & ~p2 & ~p1, f3, lambda: case_sequence[0]),
-    #   cond(p2 & ~p1, f2, lambda: case_sequence[1]),
-    #   cond(p1, f1, lambda: case_sequence[2])
-    # ]
-    #
-    # And the return value will be case_sequence[-1]
-    def _build_case():
-      all_fns = [fn for fn in fns]
-      all_fns.append(default)
-      prev_case = None
-      for i, (cp, fn) in enumerate(list(zip(case_preds, all_fns))[::-1]):
-        prev_case = cond(
-            cp, fn,
-            empty if i == 0 else lambda: prev_case,
-            strict=strict, name="If_%d" % i)
-      return prev_case
-
-    if exclusivity_assert is not None:
-      with ops.control_dependencies([exclusivity_assert]):
-        case_seq = _build_case()
+      default, predicates, actions = _case_create_default_action(
+          predicates, actions)
+    fn = default
+    # To eval conditions in direct order we create nested conditions in reverse:
+    #   cond(c[0], true_fn=.., false_fn=cond(c[1], ...))
+    for predicate, action in reversed(list(zip(predicates, actions))):
+      fn = functools.partial(
+          cond, predicate, true_fn=action, false_fn=fn, strict=strict)
+    if exclusive:
+      with ops.control_dependencies([
+          _assert_at_most_n_true(
+              predicates, n=1, msg="Input error: exclusive=True")
+      ]):
+        return fn()
     else:
-      case_seq = _build_case()
-
-    if not strict:
-      case_seq = _UnpackIfSingleton(case_seq)
-    return case_seq
+      return fn()
 
 
 ops.register_proto_function(ops.GraphKeys.COND_CONTEXT,
diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py
index 3e8f39dd240af3a5030d259603ab648d50c27cd3..cc5a42bf3ddd4b37d037f8d28a2fe6af79f79ba1 100644
--- a/tensorflow/python/ops/control_flow_ops_test.py
+++ b/tensorflow/python/ops/control_flow_ops_test.py
@@ -51,6 +51,7 @@ TestTuple = collections.namedtuple("TestTuple", "a b")
 SingletonTestTuple = collections.namedtuple("SingletonTestTuple", "a")
 
 
+@test_util.with_c_api
 class GroupTestCase(test_util.TensorFlowTestCase):
 
   def _StripNode(self, nd):
@@ -132,6 +133,7 @@ class GroupTestCase(test_util.TensorFlowTestCase):
         control_flow_ops.group(1, 2)
 
 
+@test_util.with_c_api
 class ShapeTestCase(test_util.TensorFlowTestCase):
 
   def testShape(self):
@@ -143,6 +145,7 @@ class ShapeTestCase(test_util.TensorFlowTestCase):
                             [constant_op.constant(1.0)], tensor).get_shape())
 
 
+@test_util.with_c_api
 class WithDependenciesTestCase(test_util.TensorFlowTestCase):
 
   def testTupleDependencies(self):
@@ -174,6 +177,7 @@ class WithDependenciesTestCase(test_util.TensorFlowTestCase):
         self.assertEquals(1, counter.eval())
 
 
+@test_util.with_c_api
 class SwitchTestCase(test_util.TensorFlowTestCase):
 
   def testIndexedSlicesWithDenseShape(self):
@@ -431,6 +435,7 @@ class CondTest(test_util.TensorFlowTestCase):
           control_flow_ops.cond(True, lambda: x, lambda: x, fn2=lambda: x)
 
 
+@test_util.with_c_api
 class ContextTest(test_util.TensorFlowTestCase):
 
   def testCondContext(self):
@@ -447,18 +452,25 @@ class ContextTest(test_util.TensorFlowTestCase):
               c.to_proto(),
               control_flow_ops.CondContext.from_proto(c.to_proto()).to_proto())
 
-  def testWhileContext(self):
+  def _testWhileContextHelper(self, maximum_iterations=None):
     with self.test_session() as sess:
       i = constant_op.constant(0)
       c = lambda i: math_ops.less(i, 10)
       b = lambda i: math_ops.add(i, 1)
-      control_flow_ops.while_loop(c, b, [i])
+      control_flow_ops.while_loop(
+          c, b, [i], maximum_iterations=maximum_iterations)
       for op in sess.graph.get_operations():
-        c = op._get_control_flow_context()
-        if c:
-          self.assertProtoEquals(
-              c.to_proto(),
-              control_flow_ops.WhileContext.from_proto(c.to_proto()).to_proto())
+        context = op._get_control_flow_context()
+        if context:
+          self.assertProtoEquals(context.to_proto(),
+                                 control_flow_ops.WhileContext.from_proto(
+                                     context.to_proto()).to_proto())
+
+  def testWhileContext(self):
+    self._testWhileContextHelper()
+
+  def testWhileContextWithMaximumIterations(self):
+    self._testWhileContextHelper(maximum_iterations=10)
 
   def testControlContextImportScope(self):
     with self.test_session():
@@ -516,6 +528,7 @@ def _RawNestedShape(nested_shape):
 
 
 # TODO(yori): Add tests for indexed slices.
+@test_util.with_c_api
 class DataTypesTest(test_util.TensorFlowTestCase):
 
   def assertAllEqualNested(self, a, b):
@@ -540,7 +553,9 @@ class DataTypesTest(test_util.TensorFlowTestCase):
 
   def _testReturnValues(self, fn_true, fn_false, expected_value_true,
                         expected_value_false, strict=False,
-                        check_cond=True):
+                        check_cond=True, feed_dict=None):
+    if feed_dict is None: feed_dict = {}
+
     condition = array_ops.placeholder(dtypes.bool)
     output_cond = control_flow_ops.cond(condition, fn_true, fn_false,
                                         strict=strict)
@@ -549,13 +564,17 @@ class DataTypesTest(test_util.TensorFlowTestCase):
 
     with self.test_session() as sess:
       variables.global_variables_initializer().run()
+      true_feed_dict = {condition: True}
+      true_feed_dict.update(feed_dict)
       result_cond, result_case = sess.run([output_cond, output_case],
-                                          feed_dict={condition: True})
+                                          feed_dict=true_feed_dict)
       self.assertAllEqualNested(result_cond, expected_value_true)
       if check_cond:
         self.assertAllEqualNested(result_case, expected_value_true)
+      false_feed_dict = {condition: False}
+      false_feed_dict.update(feed_dict)
       result_cond, result_case = sess.run([output_cond, output_case],
-                                          feed_dict={condition: False})
+                                          feed_dict=false_feed_dict)
       self.assertAllEqualNested(result_cond, expected_value_false)
       if check_cond:
         self.assertAllEqualNested(result_case, expected_value_false)
@@ -631,26 +650,26 @@ class DataTypesTest(test_util.TensorFlowTestCase):
 
   def test_tensors_unknown_shape(self):
     def _BuildTrueBranch(dtype):
+      tensor = array_ops.placeholder(dtype=dtype, shape=None)
       def _Build():
-        tensor = array_ops.zeros([2, 2], dtype=dtype)
-        tensor._shape = tensor_shape.TensorShape(None)
         return tensor
-      return _Build
+      return _Build, tensor
 
     def _BuildFalseBranch(dtype):
+      tensor = array_ops.placeholder(dtype=dtype, shape=None)
       def _Build():
-        tensor = array_ops.ones([2, 2], dtype=dtype)
-        tensor._shape = tensor_shape.TensorShape(None)
         return tensor
-      return _Build
+      return _Build, tensor
 
     for dtype in (dtypes.float16, dtypes.int8, dtypes.int32, dtypes.uint8):
       shape = tensor_shape.TensorShape(None)
-      fn_true = _BuildTrueBranch(dtype)
-      fn_false = _BuildFalseBranch(dtype)
+      fn_true, true_tensor = _BuildTrueBranch(dtype)
+      fn_false, false_tensor = _BuildFalseBranch(dtype)
       self._testShape(fn_true, fn_false, shape)
       self._testReturnValues(fn_true, fn_false,
-                             np.zeros([2, 2]), np.ones([2, 2]))
+                             np.zeros([2, 2]), np.ones([2, 2]),
+                             feed_dict={true_tensor: np.zeros([2, 2]),
+                                        false_tensor: np.ones([2, 2])})
 
   def test_sparse_tensors(self):
     shape = tensor_shape.TensorShape([None, None])
@@ -674,26 +693,29 @@ class DataTypesTest(test_util.TensorFlowTestCase):
 
   def test_tensors_with_partially_specified_shapes(self):
     def _BuildBranch(dtype, shape):
+      a = array_ops.placeholder(dtype=dtype, shape=shape[0])
+      b = array_ops.placeholder(dtype=dtype, shape=shape[1])
+      c = array_ops.placeholder(dtype=dtype, shape=shape[2])
       def _Build():
-        a = array_ops.zeros([2, 2], dtype=dtype)
-        b = array_ops.zeros([5], dtype=dtype)
-        c = array_ops.ones([3, 3], dtype=dtype)
-        a._shape = tensor_shape.TensorShape(shape[0])
-        b._shape = tensor_shape.TensorShape(shape[1])
-        c._shape = tensor_shape.TensorShape(shape[2])
         return a, b, c
-      return _Build
+      return _Build, (a, b, c)
 
     for dtype in (dtypes.float16, dtypes.int8, dtypes.int32, dtypes.uint8):
       shape = (tensor_shape.TensorShape([None, 2]),
                tensor_shape.TensorShape([None]),
                tensor_shape.TensorShape([3, None]))
-      fn_true = _BuildBranch(dtype, shape)
-      fn_false = _BuildBranch(dtype, shape)
+      fn_true, true_tensors = _BuildBranch(dtype, shape)
+      fn_false, false_tensors = _BuildBranch(dtype, shape)
       self._testShape(fn_true, fn_false, shape)
       self._testReturnValues(fn_true, fn_false,
                              (np.zeros([2, 2]), np.zeros(5), np.ones([3, 3])),
-                             (np.zeros([2, 2]), np.zeros(5), np.ones([3, 3])))
+                             (np.zeros([2, 2]), np.zeros(5), np.ones([3, 3])),
+                             feed_dict={true_tensors[0]: np.zeros([2, 2]),
+                                        false_tensors[0]: np.zeros([2, 2]),
+                                        true_tensors[1]: np.zeros([5]),
+                                        false_tensors[1]: np.zeros([5]),
+                                        true_tensors[2]: np.ones([3, 3]),
+                                        false_tensors[2]: np.ones([3, 3])})
 
   def test_tensor_arrays(self):
     element_shape = tensor_shape.TensorShape([2])
@@ -837,6 +859,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self.assertEqual(matrix.get_shape(), tensor_shape.TensorShape([2, 2]))
 
 
+@test_util.with_c_api
 class CaseTest(test_util.TensorFlowTestCase):
 
   def testCase_withDefault(self):
@@ -860,8 +883,7 @@ class CaseTest(test_util.TensorFlowTestCase):
     with self.test_session() as sess:
       self.assertEqual(sess.run(output, feed_dict={x: 1}), 2)
       self.assertEqual(sess.run(output, feed_dict={x: 3}), 8)
-      with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                   "More than one condition evaluated as True"):
+      with self.assertRaisesRegexp(errors.InvalidArgumentError, "Input error:"):
         sess.run(output, feed_dict={x: 2})
 
   def testCase_multiple_matches_non_exclusive(self):
@@ -886,11 +908,7 @@ class CaseTest(test_util.TensorFlowTestCase):
       self.assertEqual(sess.run(output, feed_dict={x: 1}), 2)
       self.assertEqual(sess.run(output, feed_dict={x: 2}), 4)
       self.assertEqual(sess.run(output, feed_dict={x: 3}), 6)
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r"\[None of the conditions evaluated as True. "
-          r"Conditions: \(Equal:0, Equal_1:0, Equal_2:0\), Values:\] "
-          r"\[0 0 0\]"):
+      with self.assertRaisesRegexp(errors.InvalidArgumentError, "Input error:"):
         sess.run(output, feed_dict={x: 4})
 
   def testCase_withoutDefault_oneCondition(self):
@@ -899,10 +917,7 @@ class CaseTest(test_util.TensorFlowTestCase):
     output = control_flow_ops.case(conditions, exclusive=True)
     with self.test_session() as sess:
       self.assertEqual(sess.run(output, feed_dict={x: 1}), 2)
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r"\[None of the conditions evaluated as True. "
-          r"Conditions: \(Equal:0\), Values:\] \[0\]"):
+      with self.assertRaisesRegexp(errors.InvalidArgumentError, "Input error:"):
         sess.run(output, feed_dict={x: 4})
 
 
diff --git a/tensorflow/python/ops/control_flow_util.py b/tensorflow/python/ops/control_flow_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..91cd90f1895331aaea71f67448eef2dab9da4b3b
--- /dev/null
+++ b/tensorflow/python/ops/control_flow_util.py
@@ -0,0 +1,228 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Utilty functions for control flow.
+
+This file is necessary to avoid cyclic dependencies between ops.py and
+control_flow_ops.py.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import traceback
+
+from tensorflow.python.platform import tf_logging as logging
+
+
+def IsInWhileLoop(op):
+  ctxt = op._get_control_flow_context()  # pylint: disable=protected-access
+  return GetContainingWhileContext(ctxt) is not None
+
+
+def IsInCond(op):
+  ctxt = op._get_control_flow_context()  # pylint: disable=protected-access
+  return GetContainingCondContext(ctxt) is not None
+
+
+def IsSwitch(op):
+  """Return true if `op` is a Switch."""
+  return op.type == "Switch" or op.type == "RefSwitch"
+
+
+def IsLoopEnter(op):
+  """Returns true if `op` is an Enter."""
+  return op.type == "Enter" or op.type == "RefEnter"
+
+
+def IsLoopExit(op):
+  """Return true if `op` is an Exit."""
+  return op.type == "Exit" or op.type == "RefExit"
+
+
+def IsLoopSwitch(op):
+  """Return true if `op` is the Switch for a while loop."""
+  if IsSwitch(op):
+    ctxt = op._get_control_flow_context()  # pylint: disable=protected-access
+    return ctxt and ctxt.IsWhileContext()
+  return False
+
+
+def IsLoopConstantEnter(op):
+  """Return true iff op is a loop invariant."""
+  return IsLoopEnter(op) and op.get_attr("is_constant")
+
+
+def GetLoopConstantEnter(value):
+  """Return the enter op if we can infer `value` to be a loop invariant."""
+  id_ops = {"Switch", "RefSwitch", "Identity", "RefIdentity"}
+  op = value.op
+  while op.type in id_ops:
+    op = op.inputs[0].op
+  return op if IsLoopConstantEnter(op) else None
+
+
+def GetOutputContext(op):
+  """Return the control flow context for the output of an op."""
+  ctxt = op._get_control_flow_context()  # pylint: disable=protected-access
+  if IsLoopExit(op):
+    ctxt = ctxt.outer_context
+  return ctxt
+
+
+def GetContainingWhileContext(ctxt):
+  """Returns the first ancestor WhileContext of `ctxt`.
+
+  Returns `ctxt` if `ctxt` is a WhileContext, or None if `ctxt` is not in a
+  while loop.
+
+  Args:
+    ctxt: ControlFlowContext
+
+  Returns:
+    `ctxt` if `ctxt` is a WhileContext, the most nested WhileContext containing
+    `ctxt`, or None if `ctxt` is not in a while loop.
+  """
+  while ctxt:
+    if ctxt.IsWhileContext(): return ctxt
+    ctxt = ctxt.outer_context
+  return None
+
+
+def GetContainingCondContext(ctxt):
+  """Returns the first ancestor CondContext of `ctxt`.
+
+  Returns `ctxt` if `ctxt` is a CondContext, or None if `ctxt` is not in a cond.
+
+  Args:
+    ctxt: ControlFlowContext
+
+  Returns:
+    `ctxt` if `ctxt` is a CondContext, the most nested CondContext containing
+    `ctxt`, or None if `ctxt` is not in a cond.
+  """
+  while ctxt:
+    if ctxt.IsCondContext(): return ctxt
+    ctxt = ctxt.outer_context
+  return None
+
+
+def IsContainingContext(ctxt, maybe_containing_ctxt):
+  """Returns true if `maybe_containing_ctxt` is or contains `ctxt`."""
+  while ctxt is not maybe_containing_ctxt:
+    if ctxt is None: return False
+    ctxt = ctxt.outer_context
+  return True
+
+
+def CheckInputFromValidContext(op, input_op):
+  """Returns whether `input_op` can be used from `op`s context.
+
+  Conceptually, only inputs from op's while context or any ancestor while
+  context (including outside of any context) are valid. In practice, there are
+  many other edge cases as well.
+
+  Args:
+    op: Operation
+    input_op: Operation
+
+  Raises:
+    ValueError: if input_op is from an invalid context.
+  """
+  op_ctxt = op._get_control_flow_context()  # pylint: disable=protected-access
+  input_ctxt = GetOutputContext(input_op)
+  valid = False
+
+  if not input_ctxt:
+    # input_op isn't in a control flow context.
+    valid = True
+  elif op_ctxt is input_ctxt:
+    # input_op is in the same context as op.
+    valid = True
+  else:
+    while_ctxt = GetContainingWhileContext(op_ctxt)
+    input_while_ctxt = GetContainingWhileContext(input_ctxt)
+
+    if while_ctxt is None:
+      if input_while_ctxt is None:
+        # Neither op nor input_op is in a while loop, but one or both are in
+        # conds. We allow this, although execution will fail if the branch
+        # corresponding to input_op's cond context isn't taken.
+        valid = True
+      # Invalid if op isn't in a while loop and input_op is. Unless...
+      if IsLoopEnter(op):
+        # WhileContext._BuildLoop clears context for Enter nodes.
+        valid = True
+      if IsSwitch(op):
+        # CondContext.AddValue clears context for Switch nodes.
+        valid = True
+    elif IsContainingContext(while_ctxt, input_while_ctxt):
+      # input_op is in a while loop which contains op's while loop (or not in a
+      # while loop at all).
+      valid = True
+    elif (while_ctxt.grad_state and
+          IsContainingContext(while_ctxt.grad_state.forward_context,
+                              input_while_ctxt)):
+      # op is in a gradient context and input_op is in the associated forward
+      # pass context or an ancestor thereof. This case is need to build while
+      # loop gradients.
+      # NOTE(skyewm): we theoretically also need this case for custom gradient
+      # functions that close over tensors from ancestor contexts, but I haven't
+      # verified this.
+      valid = True
+    elif (while_ctxt.grad_state and
+          while_ctxt.grad_state.forward_context is
+          input_while_ctxt._outer_context):  # pylint: disable=protected-access
+      # op is in a gradient context and input_op is in a child of the associated
+      # forward pass context. This case is needed for the gradients of while
+      # loops with conds.
+      valid = True
+    elif (input_while_ctxt.grad_state and
+          input_while_ctxt.grad_state.forward_context is while_ctxt):
+      # input_op is in the gradient context of op's context. This case is needed
+      # when the gradient of a while loop gradient is requested (this will
+      # eventually fail unless there is a stop_gradient() or similar).
+      valid = True
+    elif (input_while_ctxt.grad_state and
+          input_ctxt.grad_state.forward_context.grad_state and
+          input_ctxt.grad_state.forward_context.grad_state.forward_context is
+          while_ctxt):
+      # input_op is in the grad grad context of op's context. This case is
+      # needed when the gradient of a while loop gradient is requested (this
+      # will eventually fail unless there is a stop_gradient() or similar).
+      valid = True
+
+  if not valid:
+    if while_ctxt:
+      error_msg = (
+          "Cannot use '%s' as input to '%s' because they are in different while"
+          " loops." % (op.name, input_op.name))
+    else:
+      error_msg = (
+          "Cannot use '%s' as input to '%s' because '%s' is in a while loop."
+          % (input_op.name, op.name, input_op.name))
+
+    # Log the error message plus the relevant stack traces. The stacks may be
+    # useful for debugging this error, but we don't want to raise an
+    # unreadable exception.
+    log_msg = error_msg
+    log_msg += "\n\n%s while context: %s" % (op.name, while_ctxt)
+    log_msg += "\n%s while context: %s" % (input_op.name, input_while_ctxt)
+    log_msg += "\n\nTraceback for %s:\n%s\nTraceback for %s:\n%s\n" % (
+        op.name, "".join(traceback.format_list(op.traceback)),
+        input_op.name, "".join(traceback.format_list(input_op.traceback)))
+    logging.info(log_msg)
+    raise ValueError(error_msg + " See info log for more details.")
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index c186eb5b7ecaa5c74841aca15f0f11e994eba2ea..f441f6d4bf7986bbfb15593edf2b2b1bfe6ec71f 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -31,6 +31,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.lib.io import python_io
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_data_flow_ops
@@ -2225,7 +2226,8 @@ class RecordInput(object):
                shift_ratio=0,
                seed=0,
                name=None,
-               batches=None):
+               batches=None,
+               compression_type=None):
     """Constructs a RecordInput Op.
 
     Args:
@@ -2243,6 +2245,8 @@ class RecordInput(object):
         how many batches to create, which are returned as a list when
         `get_yield_op()` is called. An example use case is to split processing
         between devices on one computer.
+      compression_type: The type of compression for the file. Currently ZLIB and
+        GZIP are supported. Defaults to none.
 
     Raises:
       ValueError: If one of the arguments is invalid.
@@ -2257,12 +2261,17 @@ class RecordInput(object):
     self._shift_ratio = shift_ratio
     self._seed = seed
     self._name = name
+    self._compression_type = python_io.TFRecordCompressionType.NONE
+    if compression_type is not None:
+      self._compression_type = compression_type
 
   def get_yield_op(self):
     """Adds a node that yields a group of records every time it is executed.
     If RecordInput `batches` parameter is not None, it yields a list of
     record batches with the specified `batch_size`.
     """
+    compression_type = python_io.TFRecordOptions.get_compression_type_string(
+        python_io.TFRecordOptions(self._compression_type))
     records = gen_data_flow_ops.record_input(
         file_pattern=self._file_pattern,
         file_buffer_size=self._buffer_size,
@@ -2270,6 +2279,7 @@ class RecordInput(object):
         file_shuffle_shift_ratio=self._shift_ratio,
         batch_size=self._batch_size,
         file_random_seed=self._seed,
+        compression_type=compression_type,
         name=self._name)
     if self._batches is None:
       return records
diff --git a/tensorflow/python/ops/distributions/dirichlet_multinomial.py b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
index d792e9fe52dee4325d0956dbb74c8b408d5a1e8c..aa2b511c5413944df665198eacc26066b8457773 100644
--- a/tensorflow/python/ops/distributions/dirichlet_multinomial.py
+++ b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
@@ -122,21 +122,22 @@ class DirichletMultinomial(distribution.Distribution):
   #### Examples
 
   ```python
-  alpha = [1, 2, 3]
-  n = 2
+  alpha = [1., 2., 3.]
+  n = 2.
   dist = DirichletMultinomial(n, alpha)
   ```
 
-  Creates a 3-class distribution, with the 3rd class is most likely to be drawn.
+  Creates a 3-class distribution, with the 3rd class is most likely to be
+  drawn.
   The distribution functions can be evaluated on counts.
 
   ```python
   # counts same shape as alpha.
-  counts = [0, 0, 2]
+  counts = [0., 0., 2.]
   dist.prob(counts)  # Shape []
 
-  # alpha will be broadcast to [[1, 2, 3], [1, 2, 3]] to match counts.
-  counts = [[1, 1, 0], [1, 0, 1]]
+  # alpha will be broadcast to [[1., 2., 3.], [1., 2., 3.]] to match counts.
+  counts = [[1., 1., 0.], [1., 0., 1.]]
   dist.prob(counts)  # Shape [2]
 
   # alpha will be broadcast to shape [5, 7, 3] to match counts.
@@ -147,12 +148,12 @@ class DirichletMultinomial(distribution.Distribution):
   Creates a 2-batch of 3-class distributions.
 
   ```python
-  alpha = [[1, 2, 3], [4, 5, 6]]  # Shape [2, 3]
-  n = [3, 3]
+  alpha = [[1., 2., 3.], [4., 5., 6.]]  # Shape [2, 3]
+  n = [3., 3.]
   dist = DirichletMultinomial(n, alpha)
 
-  # counts will be broadcast to [[2, 1, 0], [2, 1, 0]] to match alpha.
-  counts = [2, 1, 0]
+  # counts will be broadcast to [[2., 1., 0.], [2., 1., 0.]] to match alpha.
+  counts = [2., 1., 0.]
   dist.prob(counts)  # Shape [2]
   ```
 
diff --git a/tensorflow/python/ops/distributions/distribution.py b/tensorflow/python/ops/distributions/distribution.py
index 22687a093ae72edff1d53131cab49fa004aa3be0..2d4c3509bc79dd44fec67dbf62ea21e1de7e2778 100644
--- a/tensorflow/python/ops/distributions/distribution.py
+++ b/tensorflow/python/ops/distributions/distribution.py
@@ -31,6 +31,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.ops.distributions import util
 from tensorflow.python.util import tf_inspect
 
@@ -43,10 +44,26 @@ __all__ = [
 ]
 
 _DISTRIBUTION_PUBLIC_METHOD_WRAPPERS = [
-    "batch_shape_tensor", "batch_shape", "event_shape_tensor", "event_shape",
-    "sample", "log_prob", "prob", "log_cdf", "cdf", "log_survival_function",
-    "survival_function", "entropy", "mean", "variance", "stddev", "mode",
-    "covariance"]
+    "batch_shape",
+    "batch_shape_tensor",
+    "cdf",
+    "covariance",
+    "cross_entropy",
+    "entropy",
+    "event_shape",
+    "event_shape_tensor",
+    "kl_divergence",
+    "log_cdf",
+    "log_prob",
+    "log_survival_function",
+    "mean",
+    "mode",
+    "prob",
+    "sample",
+    "stddev",
+    "survival_function",
+    "variance",
+]
 
 
 @six.add_metaclass(abc.ABCMeta)
@@ -608,7 +625,7 @@ class Distribution(_BaseDistribution):
     """Indicates that `event_shape == []`.
 
     Args:
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       is_scalar_event: `bool` scalar `Tensor`.
@@ -622,7 +639,7 @@ class Distribution(_BaseDistribution):
     """Indicates that `batch_shape == []`.
 
     Args:
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       is_scalar_batch: `bool` scalar `Tensor`.
@@ -683,7 +700,7 @@ class Distribution(_BaseDistribution):
 
     Args:
       value: `float` or `double` `Tensor`.
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       log_prob: a `Tensor` of shape `sample_shape(x) + self.batch_shape` with
@@ -710,7 +727,7 @@ class Distribution(_BaseDistribution):
 
     Args:
       value: `float` or `double` `Tensor`.
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       prob: a `Tensor` of shape `sample_shape(x) + self.batch_shape` with
@@ -747,7 +764,7 @@ class Distribution(_BaseDistribution):
 
     Args:
       value: `float` or `double` `Tensor`.
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       logcdf: a `Tensor` of shape `sample_shape(x) + self.batch_shape` with
@@ -780,7 +797,7 @@ class Distribution(_BaseDistribution):
 
     Args:
       value: `float` or `double` `Tensor`.
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       cdf: a `Tensor` of shape `sample_shape(x) + self.batch_shape` with
@@ -818,7 +835,7 @@ class Distribution(_BaseDistribution):
 
     Args:
       value: `float` or `double` `Tensor`.
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       `Tensor` of shape `sample_shape(x) + self.batch_shape` with values of type
@@ -853,7 +870,7 @@ class Distribution(_BaseDistribution):
 
     Args:
       value: `float` or `double` `Tensor`.
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       `Tensor` of shape `sample_shape(x) + self.batch_shape` with values of type
@@ -899,7 +916,7 @@ class Distribution(_BaseDistribution):
 
     Args:
       value: `float` or `double` `Tensor`.
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       quantile: a `Tensor` of shape `sample_shape(x) + self.batch_shape` with
@@ -923,7 +940,7 @@ class Distribution(_BaseDistribution):
     denotes expectation, and `Var.shape = batch_shape + event_shape`.
 
     Args:
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       variance: Floating-point `Tensor` with shape identical to
@@ -954,7 +971,7 @@ class Distribution(_BaseDistribution):
     denotes expectation, and `stddev.shape = batch_shape + event_shape`.
 
     Args:
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       stddev: Floating-point `Tensor` with shape identical to
@@ -1002,7 +1019,7 @@ class Distribution(_BaseDistribution):
     length-`k'` vector.
 
     Args:
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       covariance: Floating-point `Tensor` with shape `[B1, ..., Bn, k', k']`
@@ -1020,6 +1037,67 @@ class Distribution(_BaseDistribution):
     with self._name_scope(name):
       return self._mode()
 
+  def _cross_entropy(self, other):
+    return kullback_leibler.cross_entropy(
+        self, other, allow_nan_stats=self.allow_nan_stats)
+
+  def cross_entropy(self, other, name="cross_entropy"):
+    """Computes the (Shannon) cross entropy.
+
+    Denote this distribution (`self`) by `P` and the `other` distribution by
+    `Q`. Assuming `P, Q` are absolutely continuous with respect to
+    one another and permit densities `p(x) dr(x)` and `q(x) dr(x)`, (Shanon)
+    cross entropy is defined as:
+
+    ```none
+    H[P, Q] = E_p[-log q(X)] = -int_F p(x) log q(x) dr(x)
+    ```
+
+    where `F` denotes the support of the random variable `X ~ P`.
+
+    Args:
+      other: `tf.distributions.Distribution` instance.
+      name: Python `str` prepended to names of ops created by this function.
+
+    Returns:
+      cross_entropy: `self.dtype` `Tensor` with shape `[B1, ..., Bn]`
+        representing `n` different calculations of (Shanon) cross entropy.
+    """
+    with self._name_scope(name):
+      return self._cross_entropy(other)
+
+  def _kl_divergence(self, other):
+    return kullback_leibler.kl_divergence(
+        self, other, allow_nan_stats=self.allow_nan_stats)
+
+  def kl_divergence(self, other, name="kl_divergence"):
+    """Computes the Kullback--Leibler divergence.
+
+    Denote this distribution (`self`) by `p` and the `other` distribution by
+    `q`. Assuming `p, q` are absolutely continuous with respect to reference
+    measure `r`, (Shanon) cross entropy is defined as:
+
+    ```none
+    KL[p, q] = E_p[log(p(X)/q(X))]
+             = -int_F p(x) log q(x) dr(x) + int_F p(x) log p(x) dr(x)
+             = H[p, q] - H[p]
+    ```
+
+    where `F` denotes the support of the random variable `X ~ p`, `H[., .]`
+    denotes (Shanon) cross entropy, and `H[.]` denotes (Shanon) entropy.
+
+    Args:
+      other: `tf.distributions.Distribution` instance.
+      name: Python `str` prepended to names of ops created by this function.
+
+    Returns:
+      kl_divergence: `self.dtype` `Tensor` with shape `[B1, ..., Bn]`
+        representing `n` different calculations of the Kullback-Leibler
+        divergence.
+    """
+    with self._name_scope(name):
+      return self._kl_divergence(other)
+
   @contextlib.contextmanager
   def _name_scope(self, name=None, values=None):
     """Helper function to standardize op scope."""
diff --git a/tensorflow/python/ops/distributions/kullback_leibler.py b/tensorflow/python/ops/distributions/kullback_leibler.py
index a6ab581cc22ce8e9a278bb8e0c7e6afc2dcc30eb..829b9611cff02895b67ec39711b8c53e682eb3c5 100644
--- a/tensorflow/python/ops/distributions/kullback_leibler.py
+++ b/tensorflow/python/ops/distributions/kullback_leibler.py
@@ -110,6 +110,38 @@ def kl_divergence(distribution_a, distribution_b,
       return array_ops.identity(kl_t, name="checked_kl")
 
 
+def cross_entropy(ref, other,
+                  allow_nan_stats=True, name=None):
+  """Computes the (Shannon) cross entropy.
+
+  Denote two distributions by `P` (`ref`) and `Q` (`other`). Assuming `P, Q`
+  are absolutely continuous with respect to one another and permit densities
+  `p(x) dr(x)` and `q(x) dr(x)`, (Shanon) cross entropy is defined as:
+
+  ```none
+  H[P, Q] = E_p[-log q(X)] = -int_F p(x) log q(x) dr(x)
+  ```
+
+  where `F` denotes the support of the random variable `X ~ P`.
+
+  Args:
+    ref: `tf.distributions.Distribution` instance.
+    other: `tf.distributions.Distribution` instance.
+    allow_nan_stats: Python `bool`, default `True`. When `True`,
+      statistics (e.g., mean, mode, variance) use the value "`NaN`" to
+      indicate the result is undefined. When `False`, an exception is raised
+      if one or more of the statistic's batch members are undefined.
+    name: Python `str` prepended to names of ops created by this function.
+
+  Returns:
+    cross_entropy: `ref.dtype` `Tensor` with shape `[B1, ..., Bn]`
+      representing `n` different calculations of (Shanon) cross entropy.
+  """
+  with ops.name_scope(name, "cross_entropy"):
+    return ref.entropy() + kl_divergence(
+        ref, other, allow_nan_stats=allow_nan_stats)
+
+
 class RegisterKL(object):
   """Decorator to register a KL divergence implementation function.
 
diff --git a/tensorflow/python/ops/distributions/multinomial.py b/tensorflow/python/ops/distributions/multinomial.py
index d49fac59ca993b32db48733c2bde87ddc7c5e92b..04762565c2a982f4df47a1a85547db7a104a5ec3 100644
--- a/tensorflow/python/ops/distributions/multinomial.py
+++ b/tensorflow/python/ops/distributions/multinomial.py
@@ -23,10 +23,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.distributions import util as distribution_util
 
@@ -243,25 +243,26 @@ class Multinomial(distribution.Distribution):
         n_draws[..., array_ops.newaxis], dtype=self.logits.dtype) * self.logits
 
     # flatten the total_count and logits
-    flat_logits = array_ops.reshape(logits, [-1, k]) # [B1B2...Bm, k]
-    flat_ndraws = n * array_ops.reshape(n_draws, [-1]) # [B1B2...Bm]
+    flat_logits = array_ops.reshape(logits, [-1, k])  # [B1B2...Bm, k]
+    flat_ndraws = n * array_ops.reshape(n_draws, [-1])  # [B1B2...Bm]
 
     # computes each total_count and logits situation by map_fn
     def _sample_single(args):
-      logits, n_draw = args[0], args[1] # [K], []
-      x = random_ops.multinomial(logits[array_ops.newaxis, ...],
-                                 n_draw, seed) # [1, n*n_draw]
-      x = array_ops.reshape(x, shape=[n, -1]) # [n, n_draw]
-      x = math_ops.reduce_sum(array_ops.one_hot(x, depth=k), axis=-2) # [n, k]
+      logits, n_draw = args[0], args[1]  # [K], []
+      x = random_ops.multinomial(logits[array_ops.newaxis, ...], n_draw,
+                                 seed)  # [1, n*n_draw]
+      x = array_ops.reshape(x, shape=[n, -1])  # [n, n_draw]
+      x = math_ops.reduce_sum(array_ops.one_hot(x, depth=k), axis=-2)  # [n, k]
       return x
-    x = functional_ops.map_fn(_sample_single,
-                              [flat_logits, flat_ndraws],
-                              dtype=self.dtype) # [B1B2...Bm, n, k]
+
+    x = functional_ops.map_fn(
+        _sample_single, [flat_logits, flat_ndraws],
+        dtype=self.dtype)  # [B1B2...Bm, n, k]
 
     # reshape the results to proper shape
     x = array_ops.transpose(x, perm=[1, 0, 2])
     final_shape = array_ops.concat([[n], self.batch_shape_tensor(), [k]], 0)
-    x = array_ops.reshape(x, final_shape) # [n, B1, B2,..., Bm, k]
+    x = array_ops.reshape(x, final_shape)  # [n, B1, B2,..., Bm, k]
     return x
 
   @distribution_util.AppendDocstring(_multinomial_sample_note)
diff --git a/tensorflow/python/ops/distributions/special_math.py b/tensorflow/python/ops/distributions/special_math.py
index 222a39ad8284cf95e271097fa8ac478125ee329c..bed4cbb2c1a43b6952861f4fab82957229e23c9c 100644
--- a/tensorflow/python/ops/distributions/special_math.py
+++ b/tensorflow/python/ops/distributions/special_math.py
@@ -27,6 +27,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 
 __all__ = [
+    "erfinv",
     "ndtr",
     "ndtri",
     "log_ndtr",
@@ -350,6 +351,29 @@ def _log_ndtr_asymptotic_series(x, series_order):
   return 1. + even_sum - odd_sum
 
 
+def erfinv(x, name="erfinv"):
+  """The inverse function for erf, the error function.
+
+  Args:
+    x: `Tensor` of type `float32`, `float64`.
+    name: Python string. A name for the operation (default="erfinv").
+
+  Returns:
+    x: `Tensor` with `dtype=x.dtype`.
+
+  Raises:
+    TypeError: if `x` is not floating-type.
+  """
+
+  with ops.name_scope(name, values=[x]):
+    x = ops.convert_to_tensor(x, name="x")
+    if x.dtype.as_numpy_dtype not in [np.float32, np.float64]:
+      raise TypeError(
+          "x.dtype=%s is not handled, see docstring for supported types."
+          % x.dtype)
+    return ndtri((x + 1.0) / 2.0) / np.sqrt(2)
+
+
 def _double_factorial(n):
   """The double factorial function for small Python integer `n`."""
   return np.prod(np.arange(n, 1, -2))
diff --git a/tensorflow/python/ops/distributions/transformed_distribution.py b/tensorflow/python/ops/distributions/transformed_distribution.py
index ba25b2c3485706cc769b8f37118a994e065c1f93..1efcf9d32e9ea9924bb080459efb7015e33ccd54 100644
--- a/tensorflow/python/ops/distributions/transformed_distribution.py
+++ b/tensorflow/python/ops/distributions/transformed_distribution.py
@@ -434,7 +434,7 @@ class TransformedDistribution(distribution_lib.Distribution):
     log_prob = self.distribution.log_prob(x)
     if self._is_maybe_event_override:
       log_prob = math_ops.reduce_sum(log_prob, self._reduce_event_indices)
-    log_prob = ildj + log_prob
+    log_prob += math_ops.cast(ildj, log_prob.dtype)
     if self._is_maybe_event_override:
       log_prob.set_shape(array_ops.broadcast_static_shape(
           y.get_shape().with_rank_at_least(1)[:-1], self.batch_shape))
@@ -457,7 +457,7 @@ class TransformedDistribution(distribution_lib.Distribution):
     prob = self.distribution.prob(x)
     if self._is_maybe_event_override:
       prob = math_ops.reduce_prod(prob, self._reduce_event_indices)
-    prob *= math_ops.exp(ildj)
+    prob *= math_ops.exp(math_ops.cast(ildj, prob.dtype))
     if self._is_maybe_event_override:
       prob.set_shape(array_ops.broadcast_static_shape(
           y.get_shape().with_rank_at_least(1)[:-1], self.batch_shape))
@@ -546,7 +546,9 @@ class TransformedDistribution(distribution_lib.Distribution):
       ], 0)
       entropy = array_ops.tile(entropy, multiples)
     dummy = array_ops.zeros([], self.dtype)
-    entropy -= self.bijector.inverse_log_det_jacobian(dummy)
+    entropy -= math_ops.cast(
+        self.bijector.inverse_log_det_jacobian(dummy),
+        entropy.dtype)
     entropy.set_shape(self.batch_shape)
     return entropy
 
diff --git a/tensorflow/python/ops/distributions/util.py b/tensorflow/python/ops/distributions/util.py
index 41b86f79409aef76dbd710606d09b21f34cab7ba..5bc25128a8d6f77895fc4decc98a8978ae8400f3 100644
--- a/tensorflow/python/ops/distributions/util.py
+++ b/tensorflow/python/ops/distributions/util.py
@@ -25,6 +25,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
@@ -699,6 +700,88 @@ def pick_vector(cond,
         [array_ops.where(cond, 0, n)], [array_ops.where(cond, n, -1)])
 
 
+def prefer_static_broadcast_shape(
+    shape1, shape2, name="prefer_static_broadcast_shape"):
+  """Convenience function which statically broadcasts shape when possible.
+
+  Args:
+    shape1:  `1-D` integer `Tensor`.  Already converted to tensor!
+    shape2:  `1-D` integer `Tensor`.  Already converted to tensor!
+    name:  A string name to prepend to created ops.
+
+  Returns:
+    The broadcast shape, either as `TensorShape` (if broadcast can be done
+      statically), or as a `Tensor`.
+  """
+  with ops.name_scope(name, values=[shape1, shape2]):
+    def make_shape_tensor(x):
+      return ops.convert_to_tensor(x, name="shape", dtype=dtypes.int32)
+
+    def get_tensor_shape(s):
+      if isinstance(s, tensor_shape.TensorShape):
+        return s
+      s_ = tensor_util.constant_value(make_shape_tensor(s))
+      if s_ is not None:
+        return tensor_shape.TensorShape(s_)
+      return None
+
+    def get_shape_tensor(s):
+      if not isinstance(s, tensor_shape.TensorShape):
+        return make_shape_tensor(s)
+      if s.is_fully_defined():
+        return make_shape_tensor(s.as_list())
+      raise ValueError("Cannot broadcast from partially "
+                       "defined `TensorShape`.")
+
+    shape1_ = get_tensor_shape(shape1)
+    shape2_ = get_tensor_shape(shape2)
+    if shape1_ is not None and shape2_ is not None:
+      return array_ops.broadcast_static_shape(shape1_, shape2_)
+
+    shape1_ = get_shape_tensor(shape1)
+    shape2_ = get_shape_tensor(shape2)
+    return array_ops.broadcast_dynamic_shape(shape1_, shape2_)
+
+
+def prefer_static_rank(x):
+  """Return static rank of tensor `x` if available, else `tf.rank(x)`.
+
+  Args:
+    x: `Tensor` (already converted).
+
+  Returns:
+    Numpy array (if static rank is obtainable), else `Tensor`.
+  """
+  return prefer_static_value(array_ops.rank(x))
+
+
+def prefer_static_shape(x):
+  """Return static shape of tensor `x` if available, else `tf.shape(x)`.
+
+  Args:
+    x: `Tensor` (already converted).
+
+  Returns:
+    Numpy array (if static shape is obtainable), else `Tensor`.
+  """
+  return prefer_static_value(array_ops.shape(x))
+
+
+def prefer_static_value(x):
+  """Return static value of tensor `x` if available, else `x`.
+
+  Args:
+    x: `Tensor` (already converted).
+
+  Returns:
+    Numpy array (if static value is obtainable), else `Tensor`.
+  """
+  static_x = tensor_util.constant_value(x)
+  if static_x is not None:
+    return static_x
+  return x
+
+
 def gen_new_seed(seed, salt):
   """Generate a new seed, from the given seed and salt."""
   if seed is None:
@@ -751,6 +834,7 @@ def fill_triangular(x, upper=False, name=None):
   """
 
   with ops.name_scope(name, "fill_triangular", values=[x]):
+    x = ops.convert_to_tensor(x, name="x")
     if x.shape.with_rank_at_least(1)[-1].value is not None:
       # Formula derived by solving for n: m = n(n+1)/2.
       m = np.int32(x.shape[-1].value)
@@ -1050,8 +1134,8 @@ def dimension_size(x, axis):
   """Returns the size of a specific dimension."""
   # Since tf.gather isn't "constant-in, constant-out", we must first check the
   # static shape or fallback to dynamic shape.
-  s = x.shape.with_rank_at_least(axis + 1)[axis].value
-  if axis > -1 and s is not None:
+  s = x.shape.with_rank_at_least(np.abs(axis))[axis].value
+  if s is not None:
     return s
   return array_ops.shape(x)[axis]
 
@@ -1099,28 +1183,100 @@ def process_quadrature_grid_and_probs(
     probs /= linalg_ops.norm(probs, ord=1, axis=-1, keep_dims=True,
                              name="probs")
 
-    def _static_dim_size(x, axis):
+    def _static_event_size(x):
       """Returns the static size of a specific dimension or `None`."""
-      return x.shape.with_rank_at_least(axis + 1)[axis].value
+      return x.shape.with_rank_at_least(1)[-1].value
 
-    m, n = _static_dim_size(probs, axis=0), _static_dim_size(grid, axis=0)
+    m, n = _static_event_size(probs), _static_event_size(grid)
     if m is not None and n is not None:
       if m != n:
         raise ValueError("`quadrature_grid_and_probs` must be a `tuple` of "
                          "same-length zero-th-dimension `Tensor`s "
                          "(saw lengths {}, {})".format(m, n))
     elif validate_args:
-      grid = control_flow_ops.with_dependencies([
+      assertions = [
           check_ops.assert_equal(
-              dimension_size(probs, axis=0),
-              dimension_size(grid, axis=0),
+              dimension_size(probs, axis=-1),
+              dimension_size(grid, axis=-1),
               message=("`quadrature_grid_and_probs` must be a `tuple` of "
                        "same-length zero-th-dimension `Tensor`s")),
-      ], grid)
-
+      ]
+      with ops.control_dependencies(assertions):
+        grid = array_ops.identity(grid)
+        probs = array_ops.identity(probs)
     return grid, probs
 
 
+def pad(x, axis, front=False, back=False, value=0, count=1, name=None):
+  """Pads `value` to the front and/or back of a `Tensor` dim, `count` times.
+
+  Args:
+    x: `Tensor` input.
+    axis: Scalar `int`-like `Tensor` representing the single dimension to pad.
+      (Negative indexing is supported.)
+    front: Python `bool`; if `True` the beginning of the `axis` dimension is
+      padded with `value`, `count` times. If `False` no front padding is made.
+    back: Python `bool`; if `True` the end of the `axis` dimension is
+      padded with `value`, `count` times. If `False` no end padding is made.
+    value: Scalar `int`-like `Tensor` representing the actual value added to the
+      front and/or back of the `axis` dimension of `x`.
+    count: Scalar `int`-like `Tensor` representing number of elements added to
+      the front and/or back of the `axis` dimension of `x`. E.g., if
+      `front = back = True` then `2 * count` elements are added.
+    name: Python `str` name prefixed to Ops created by this function.
+
+  Returns:
+    pad: The padded version of input `x`.
+
+  Raises:
+    ValueError: if both `front` and `back` are `False`.
+    TypeError: if `count` is not `int`-like.
+  """
+  with ops.name_scope(name, "pad", [x, value, count]):
+    x = ops.convert_to_tensor(x, name="x")
+    value = ops.convert_to_tensor(value, dtype=x.dtype, name="value")
+    count = ops.convert_to_tensor(count, name="count")
+    if not count.dtype.is_integer:
+      raise TypeError("`count.dtype` (`{}`) must be `int`-like.".format(
+          count.dtype.name))
+    if not front and not back:
+      raise ValueError("At least one of `front`, `back` must be `True`.")
+    ndims = (x.shape.ndims if x.shape.ndims is not None
+             else array_ops.rank(x, name="ndims"))
+    axis = ops.convert_to_tensor(axis, name="axis")
+    axis_ = tensor_util.constant_value(axis)
+    if axis_ is not None:
+      axis = axis_
+      if axis < 0:
+        axis = ndims + axis
+      count_ = tensor_util.constant_value(count)
+      if axis_ >= 0 or x.shape.ndims is not None:
+        head = x.shape[:axis]
+        middle = tensor_shape.TensorShape(
+            None if count_ is None
+            else (x.shape[axis] + count_ * (front + back)))
+        tail = x.shape[axis+1:]
+        final_shape = head.concatenate(middle.concatenate(tail))
+      else:
+        final_shape = None
+    else:
+      axis = array_ops.where(axis < 0, ndims + axis, axis)
+      final_shape = None
+    x = array_ops.pad(
+        x,
+        paddings=array_ops.one_hot(
+            indices=array_ops.stack([axis if front else -1,
+                                     axis if back else -1]),
+            depth=ndims,
+            axis=0,
+            on_value=count,
+            dtype=dtypes.int32),
+        constant_values=value)
+    if final_shape is not None:
+      x.set_shape(final_shape)
+    return x
+
+
 class AppendDocstring(object):
   """Helper class to promote private subclass docstring to public counterpart.
 
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index 8d00a3c6ab2fdfff53b7e9659710659265cedc65..f5fdb12b2c8ae470a1b671b85ae12c675fd16cd4 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -38,6 +38,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import image_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import linalg_grad  # pylint: disable=unused-import
@@ -668,10 +669,10 @@ def _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state):
     ready = (pending_count[x.op._id] == 0)
     if loop_state and not ready:
       ready = (pending_count[x.op._id] > 0 and
-               control_flow_ops.IsLoopSwitch(x.op))
+               control_flow_util.IsLoopSwitch(x.op))
     # pylint: enable=protected-access
     if ready:
-      if control_flow_ops.IsLoopExit(x.op):
+      if control_flow_util.IsLoopExit(x.op):
         # if x is an exit without real gradient, defer processing them.
         grad_state = loop_state.GetGradState(x.op, before=False)
         grad_state.deferred_exits.append(x)
@@ -711,7 +712,7 @@ def _SetGrad(grads, t, grad):
   if isinstance(t_grads, list):
     t_grads.append(grad)
   else:
-    assert control_flow_ops.IsLoopSwitch(op)
+    assert control_flow_util.IsLoopSwitch(op)
     op_grads[t.value_index] = grad
 
 
@@ -851,7 +852,7 @@ def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
   for i, out_grad in enumerate(out_grads):
     if loop_state:
       if isinstance(out_grad, (ops.Tensor, ops.IndexedSlices)):
-        assert control_flow_ops.IsLoopSwitch(op)
+        assert control_flow_util.IsLoopSwitch(op)
         continue
     # Grads have to be Tensors or IndexedSlices
     if (isinstance(out_grad, collections.Sequence) and not all([
diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py
index dacc2947fe31b0cbe81f6acacd52fb4a74719090..1211b2e923082d8d24b8b924227cbc52e6f2eaef 100644
--- a/tensorflow/python/ops/gradients_test.py
+++ b/tensorflow/python/ops/gradients_test.py
@@ -573,9 +573,7 @@ class HessianVectorProductTest(test_util.TensorFlowTestCase):
       self.assertAllClose(hess_v_value, hess_v_actual)
 
 
-# TODO(skyewm): reenable C API once
-# ControlFlowContext._RemoveExternalControlEdges works with C API enabled
-# @test_util.with_c_api
+@test_util.with_c_api
 class HessianTest(test_util.TensorFlowTestCase):
 
   def testHessian1D(self):
diff --git a/tensorflow/python/ops/hidden_ops.txt b/tensorflow/python/ops/hidden_ops.txt
index f834d9002c3e14451bdf2de31cf3c1505e39be4b..ec0890c0168744e089904d94f1fddeb4f7312aca 100644
--- a/tensorflow/python/ops/hidden_ops.txt
+++ b/tensorflow/python/ops/hidden_ops.txt
@@ -21,6 +21,7 @@ ParallelConcat
 Placeholder
 RefIdentity
 Reverse
+Snapshot
 SpaceToBatch
 Split
 SplitV
@@ -341,6 +342,7 @@ TruncatedNormal
 # script_ops
 PyFunc
 PyFuncStateless
+EagerPyFunc
 
 # sdca_ops
 
@@ -354,8 +356,8 @@ DestroyTemporaryVariable
 AddSparseToTensorsMap
 AddManySparseToTensorsMap
 TakeManySparseFromTensorsMap
-DeserializeSparse
 DeserializeManySparse
+DeserializeSparse
 SerializeManySparse
 SerializeSparse
 SparseAdd
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 7c23321ca5ea7395d347679dcc339ab49dbe6b42..21561f368981e62a75220eb23cd532249942b0bb 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -1119,9 +1119,8 @@ def rgb_to_grayscale(images, name=None):
     # https://en.wikipedia.org/wiki/Luma_%28video%29
     rgb_weights = [0.2989, 0.5870, 0.1140]
     rank_1 = array_ops.expand_dims(array_ops.rank(images) - 1, 0)
-    gray_float = math_ops.reduce_sum(flt_image * rgb_weights,
-                                     rank_1,
-                                     keepdims=True)
+    gray_float = math_ops.reduce_sum(
+        flt_image * rgb_weights, rank_1, keepdims=True)
     gray_float.set_shape(images.get_shape()[:-1].concatenate([1]))
     return convert_image_dtype(gray_float, orig_dtype, name=name)
 
@@ -1169,7 +1168,7 @@ def random_hue(image, max_delta, seed=None):
       set_random_seed for its interaction with the graph-level random seed.
 
   Returns:
-    3-D float tensor of shape `[height, width, channels]`.
+    Adjusted image(s), same shape and DType as `image`.
 
   Raises:
     ValueError: if `max_delta` is invalid.
@@ -1276,30 +1275,9 @@ def adjust_saturation(image, saturation_factor, name=None):
     orig_dtype = image.dtype
     flt_image = convert_image_dtype(image, dtypes.float32)
 
-    # TODO(zhengxq): we will switch to the fused version after we add a GPU
-    # kernel for that.
-    fused = os.environ.get('TF_ADJUST_SATURATION_FUSED', '')
-    fused = fused.lower() in ('true', 't', '1')
-
-    if fused:
-      return convert_image_dtype(
-          gen_image_ops.adjust_saturation(flt_image, saturation_factor),
-          orig_dtype)
-
-    hsv = gen_image_ops.rgb_to_hsv(flt_image)
-
-    hue = array_ops.slice(hsv, [0, 0, 0], [-1, -1, 1])
-    saturation = array_ops.slice(hsv, [0, 0, 1], [-1, -1, 1])
-    value = array_ops.slice(hsv, [0, 0, 2], [-1, -1, 1])
-
-    saturation *= saturation_factor
-    saturation = clip_ops.clip_by_value(saturation, 0.0, 1.0)
-
-    hsv_altered = array_ops.concat([hue, saturation, value], 2)
-    rgb_altered = gen_image_ops.hsv_to_rgb(hsv_altered)
-
-    return convert_image_dtype(rgb_altered, orig_dtype)
-
+    return convert_image_dtype(
+        gen_image_ops.adjust_saturation(flt_image, saturation_factor),
+        orig_dtype)
 
 def decode_image(contents, channels=None, name=None):
   """Convenience function for `decode_bmp`, `decode_gif`, `decode_jpeg`,
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index d1554b399f3776933bf970f7b2ceb8db5865d844..4af9bd2a009af593ade0bb7ff94643b305c3091a 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -281,6 +281,21 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
       y_tf = y.eval()
       self.assertAllEqual(y_tf, y_np)
 
+  def testBatchAdjustHue(self):
+    x_shape = [2, 1, 2, 3]
+    x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+    x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape)
+
+    delta = 0.25
+    y_data = [13, 0, 11, 226, 54, 221, 234, 8, 92, 1, 217, 255]
+    y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
+
+    with self.test_session(use_gpu=True):
+      x = constant_op.constant(x_np, shape=x_shape)
+      y = image_ops.adjust_hue(x, delta)
+      y_tf = y.eval()
+      self.assertAllEqual(y_tf, y_np)
+
   def _adjustHueNp(self, x_np, delta_h):
     self.assertEqual(x_np.shape[-1], 3)
     x_v = x_np.reshape([-1, 3])
@@ -359,6 +374,89 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
       self._adjustHueTf(x_np, delta_h)
 
 
+class FlipImageBenchmark(test.Benchmark):
+
+  def _benchmarkFlipLeftRight(self, device, cpu_count):
+    image_shape = [299, 299, 3]
+    warmup_rounds = 100
+    benchmark_rounds = 1000
+    config = config_pb2.ConfigProto()
+    if cpu_count is not None:
+      config.inter_op_parallelism_threads = 1
+      config.intra_op_parallelism_threads = cpu_count
+    with session.Session("", graph=ops.Graph(), config=config) as sess:
+      with ops.device(device):
+        inputs = variables.Variable(
+            random_ops.random_uniform(
+                image_shape, dtype=dtypes.float32) * 255,
+            trainable=False,
+            dtype=dtypes.float32)
+        run_op = image_ops.flip_left_right(inputs)
+        sess.run(variables.global_variables_initializer())
+        for i in xrange(warmup_rounds + benchmark_rounds):
+          if i == warmup_rounds:
+            start = time.time()
+          sess.run(run_op)
+    end = time.time()
+    step_time = (end - start) / benchmark_rounds
+    tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all")
+    print("benchmarkFlipLeftRight_299_299_3_%s step_time: %.2f us" %
+          (tag, step_time * 1e6))
+    self.report_benchmark(
+        name="benchmarkFlipLeftRight_299_299_3_%s" % (tag),
+        iters=benchmark_rounds,
+        wall_time=step_time)
+
+  def _benchmarkRandomFlipLeftRight(self, device, cpu_count):
+    image_shape = [299, 299, 3]
+    warmup_rounds = 100
+    benchmark_rounds = 1000
+    config = config_pb2.ConfigProto()
+    if cpu_count is not None:
+      config.inter_op_parallelism_threads = 1
+      config.intra_op_parallelism_threads = cpu_count
+    with session.Session("", graph=ops.Graph(), config=config) as sess:
+      with ops.device(device):
+        inputs = variables.Variable(
+            random_ops.random_uniform(
+                image_shape, dtype=dtypes.float32) * 255,
+            trainable=False,
+            dtype=dtypes.float32)
+        run_op = image_ops.random_flip_left_right(inputs)
+        sess.run(variables.global_variables_initializer())
+        for i in xrange(warmup_rounds + benchmark_rounds):
+          if i == warmup_rounds:
+            start = time.time()
+          sess.run(run_op)
+    end = time.time()
+    step_time = (end - start) / benchmark_rounds
+    tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all")
+    print("benchmarkRandomFlipLeftRight_299_299_3_%s step_time: %.2f us" %
+          (tag, step_time * 1e6))
+    self.report_benchmark(
+        name="benchmarkRandomFlipLeftRight_299_299_3_%s" % (tag),
+        iters=benchmark_rounds,
+        wall_time=step_time)
+
+  def benchmarkFlipLeftRightCpu1(self):
+    self._benchmarkFlipLeftRight("/cpu:0", 1)
+
+  def benchmarkFlipLeftRightCpuAll(self):
+    self._benchmarkFlipLeftRight("/cpu:0", None)
+
+  def benchmarkFlipLeftRightGpu(self):
+    self._benchmarkFlipLeftRight(test.gpu_device_name(), None)
+
+  def benchmarkRandomFlipLeftRightCpu1(self):
+    self._benchmarkRandomFlipLeftRight("/cpu:0", 1)
+
+  def benchmarkRandomFlipLeftRightCpuAll(self):
+    self._benchmarkRandomFlipLeftRight("/cpu:0", None)
+
+  def benchmarkRandomFlipLeftRightGpu(self):
+    self._benchmarkRandomFlipLeftRight(test.gpu_device_name(), None)
+
+
 class AdjustHueBenchmark(test.Benchmark):
 
   def _benchmarkAdjustHue(self, device, cpu_count):
@@ -632,6 +730,21 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
       y_tf = y.eval()
       self.assertAllEqual(y_tf, y_np)
 
+  def testBatchSaturation(self):
+    x_shape = [2, 1, 2, 3]
+    x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+    x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape)
+
+    saturation_factor = 0.5
+    y_data = [6, 9, 13, 140, 180, 226, 135, 121, 234, 172, 255, 128]
+    y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
+
+    with self.test_session(use_gpu=True):
+      x = constant_op.constant(x_np, shape=x_shape)
+      y = image_ops.adjust_saturation(x, saturation_factor)
+      y_tf = y.eval()
+      self.assertAllEqual(y_tf, y_np)
+
   def _adjust_saturation(self, image, saturation_factor):
     image = ops.convert_to_tensor(image, name="image")
     orig_dtype = image.dtype
diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index 14a039ffd0aeae8fb7a186a2039f25550dcd7d1b..be9beee633bb7c900b1618c2922b6eff5bf65df0 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -30,7 +30,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.gen_linalg_ops import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util import compat
-from tensorflow.python.util.deprecation import deprecated_args
+from tensorflow.python.util import deprecation
 
 # Names below are lower_case.
 # pylint: disable=invalid-name
@@ -439,9 +439,13 @@ def svd(tensor, full_matrices=False, compute_uv=True, name=None):
 
 
 # pylint: disable=redefined-builtin
-@deprecated_args(None, "keep_dims is deprecated, use keepdims instead",
-                 "keep_dims")
-def norm(tensor, ord='euclidean', axis=None, keepdims=None, name=None,
+@deprecation.deprecated_args(
+    None, 'keep_dims is deprecated, use keepdims instead', 'keep_dims')
+def norm(tensor,
+         ord='euclidean',
+         axis=None,
+         keepdims=None,
+         name=None,
          keep_dims=None):
   r"""Computes the norm of vectors, matrices, and tensors.
 
@@ -478,6 +482,7 @@ def norm(tensor, ord='euclidean', axis=None, keepdims=None, name=None,
     keepdims: If True, the axis indicated in `axis` are kept with size 1.
       Otherwise, the dimensions in `axis` are removed from the output shape.
     name: The name of the op.
+    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     output: A `Tensor` of the same type as tensor, containing the vector or
@@ -500,11 +505,8 @@ def norm(tensor, ord='euclidean', axis=None, keepdims=None, name=None,
      higher order tensors.
   @end_compatibility
   """
-
-  if keep_dims is not None:
-    if keepdims is not None:
-      raise ValueError("Cannot specify both 'keep_dims' and 'keepdims'")
-    keepdims = keep_dims
+  keepdims = deprecation.deprecated_argument_lookup('keepdims', keepdims,
+                                                    'keep_dims', keep_dims)
   if keepdims is None:
     keepdims = False
 
@@ -555,8 +557,8 @@ def norm(tensor, ord='euclidean', axis=None, keepdims=None, name=None,
       else:
         # General p-norms (positive p only)
         result = math_ops.pow(
-            math_ops.reduce_sum(
-                math_ops.pow(result, ord), axis, keepdims=True), 1.0 / ord)
+            math_ops.reduce_sum(math_ops.pow(result, ord), axis, keepdims=True),
+            1.0 / ord)
     if not keepdims:
       result = array_ops.squeeze(result, axis)
     return result
diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py
index 08e3f83a0b21a8444ad3500c62fe624440edc255..51ab2aec2298a9072c90c226992f122a804ec02e 100644
--- a/tensorflow/python/ops/logging_ops.py
+++ b/tensorflow/python/ops/logging_ops.py
@@ -39,8 +39,8 @@ def Print(input_, data, message=None, first_n=None, summarize=None,
           name=None):
   """Prints a list of tensors.
 
-  This is an identity op with the side effect of printing `data` when
-  evaluating.
+  This is an identity op (behaves like `tf.identity`) with the side effect
+  of printing `data` when evaluating.
 
   Note: This op prints to the standard error. It is not currently compatible
     with jupyter notebook (printing to the notebook *server's* output, not into
@@ -57,7 +57,7 @@ def Print(input_, data, message=None, first_n=None, summarize=None,
     name: A name for the operation (optional).
 
   Returns:
-    Same tensor as `input_`.
+    A `Tensor`. Has the same type and contents as `input_`.
   """
   return gen_logging_ops._print(input_, data, message, first_n, summarize, name)
 
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
index 156e415735fe970969637a77a9eef242b90f4b01..e12d7e656a0ef503396510df7f17169b8bf27d4a 100644
--- a/tensorflow/python/ops/lookup_ops.py
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import collections
 import functools
+import six
 
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
@@ -688,19 +689,22 @@ class IdTableWithHashBuckets(LookupInterface):
 
   For example, if an instance of `IdTableWithHashBuckets` is initialized with a
   string-to-id table that maps:
-  - emerson -> 0
-  - lake -> 1
-  - palmer -> 2
+
+  * `emerson -> 0`
+  * `lake -> 1`
+  * `palmer -> 2`
 
   The `IdTableWithHashBuckets` object will performs the following mapping:
-  - emerson -> 0
-  - lake -> 1
-  - palmer -> 2
-  - <other term> -> bucket id between 3 and 3 + num_oov_buckets - 1, calculated
-    by: hash(<term>) % num_oov_buckets + vocab_size
 
-  If input_tensor is ["emerson", "lake", "palmer", "king", "crimson"],
-  the lookup result is [0, 1, 2, 4, 7]
+  * `emerson -> 0`
+  * `lake -> 1`
+  * `palmer -> 2`
+  * `<other term> -> bucket_id`, where bucket_id will be between `3` and
+  `3 + num_oov_buckets - 1`, calculated by:
+  `hash(<term>) % num_oov_buckets + vocab_size`
+
+  If input_tensor is `["emerson", "lake", "palmer", "king", "crimson"]`,
+  the lookup result is `[0, 1, 2, 4, 7]`.
 
   If `table` is None, only out-of-vocabulary buckets are used.
 
@@ -789,6 +793,25 @@ class IdTableWithHashBuckets(LookupInterface):
     with ops.name_scope(None, "init"):
       return control_flow_ops.no_op()
 
+  @property
+  def table_ref(self):
+    """Returns the table_ref of the underlying table, if one exists.
+
+    Only use the table_ref directly if you know what you are doing. The
+    table_ref does not have the "hash bucket" functionality, as that is provided
+    by this class.
+
+    One possible use of the table_ref is subtokenization, i.e. ops which
+    dynamically decompose tokens into subtokens based on the contents of the
+    table_ref.
+
+    Returns:
+      the underlying table_ref, or None if there is no underlying table
+    """
+    if self._table is not None:
+      return self._table.table_ref
+    return None
+
   def size(self, name=None):
     """Compute the number of elements in this table."""
     with ops.name_scope(name, "%s_Size" % self.name) as scope:
@@ -940,7 +963,7 @@ def index_table_from_file(vocabulary_file=None,
       than zero.
   """
   if vocabulary_file is None or (
-      isinstance(vocabulary_file, str) and not vocabulary_file):
+      isinstance(vocabulary_file, six.string_types) and not vocabulary_file):
     raise ValueError("vocabulary_file must be specified and must not be empty.")
   if num_oov_buckets < 0:
     raise ValueError("num_oov_buckets must be greater or equal than 0, got %d."
@@ -1144,7 +1167,7 @@ def index_to_string_table_from_file(vocabulary_file,
   ```
 
   Args:
-    vocabulary_file: The vocabulary filename.
+    vocabulary_file: The vocabulary filename, may be a constant scalar `Tensor`.
     vocab_size: Number of the elements in the vocabulary, if known.
     default_value: The value to use for out-of-vocabulary indices.
     name: A name for this op (optional).
@@ -1162,8 +1185,10 @@ def index_to_string_table_from_file(vocabulary_file,
     ValueError: when `vocabulary_file` is empty.
     ValueError: when `vocab_size` is invalid.
   """
-  if not vocabulary_file:
-    raise ValueError("vocabulary_file must be specified.")
+  if vocabulary_file is None or (
+        isinstance(vocabulary_file, six.string_types) and not vocabulary_file):
+    raise ValueError("vocabulary_file must be specified and must not be empty.")
+
   if vocab_size is not None and vocab_size < 1:
     raise ValueError("vocab_size must be greater than 0, got %d." % vocab_size)
 
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 55a18d28cae5c2326db98c4fed2f6bf38b39a0b0..b74971f654294e25e131a6ba21d982da16cf4264 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -652,7 +652,7 @@ def softmax_cross_entropy(
 
   Args:
     onehot_labels: `[batch_size, num_classes]` target one-hot-encoded labels.
-    logits: [batch_size, num_classes] logits outputs of the network .
+    logits: `[batch_size, num_classes]` logits outputs of the network .
     weights: Optional `Tensor` whose rank is either 0, or rank 1 and is
       broadcastable to the loss which is a `Tensor` of shape `[batch_size]`.
     label_smoothing: If greater than 0 then smooth the labels.
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index 38fe093ba7236ff7fe7b580a893501c84c71f6b1..0239396ae32fe62bc75fb19bb05cb2e8e0e8695e 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -184,6 +184,15 @@ def _SparseSegmentSumGrad(op, grad):
           None)
 
 
+@ops.RegisterGradient("SparseSegmentSumWithNumSegments")
+def _SparseSegmentSumWithNumSegmentsGrad(op, grad):
+  """Gradient for SparseSegmentSumWithNumSegments."""
+  input_rows = array_ops.shape(op.inputs[0])[0]
+  return (math_ops.unsorted_segment_sum(
+      array_ops.gather(grad, op.inputs[2]), op.inputs[1], input_rows), None,
+          None, None)
+
+
 @ops.RegisterGradient("SparseSegmentMean")
 def _SparseSegmentMeanGrad(op, grad):
   """Gradient for SparseSegmentMean."""
@@ -192,6 +201,14 @@ def _SparseSegmentMeanGrad(op, grad):
                                             dim0), None, None)
 
 
+@ops.RegisterGradient("SparseSegmentMeanWithNumSegments")
+def _SparseSegmentMeanWithNumSegmentsGrad(op, grad):
+  """Gradient for SparseSegmentMeanWithNumSegments."""
+  dim0 = array_ops.shape(op.inputs[0])[0]
+  return (math_ops.sparse_segment_mean_grad(grad, op.inputs[1], op.inputs[2],
+                                            dim0), None, None, None)
+
+
 @ops.RegisterGradient("SparseSegmentSqrtN")
 def _SparseSegmentSqrtNGrad(op, grad):
   """Gradient for SparseSegmentSqrtN."""
@@ -200,6 +217,14 @@ def _SparseSegmentSqrtNGrad(op, grad):
                                               dim0), None, None)
 
 
+@ops.RegisterGradient("SparseSegmentSqrtNWithNumSegments")
+def _SparseSegmentSqrtNWithNumSegmentsGrad(op, grad):
+  """Gradient for SparseSegmentSqrtNWithNumSegmnets."""
+  dim0 = array_ops.shape(op.inputs[0])[0]
+  return (math_ops.sparse_segment_sqrt_n_grad(grad, op.inputs[1], op.inputs[2],
+                                              dim0), None, None, None)
+
+
 def _SegmentMinOrMaxGrad(op, grad, is_sorted):
   """Gradient for SegmentMin and (unsorted) SegmentMax. They share similar code."""
   zeros = array_ops.zeros(array_ops.shape(op.inputs[0]),
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 81b3c218085cff0e8408e120a15f0ddc57977f28..cd07dad613458369d2762334901f96f51d408d6c 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -170,28 +170,30 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops.gen_math_ops import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util import compat
-from tensorflow.python.util.deprecation import deprecated
-from tensorflow.python.util.deprecation import deprecated_args
+from tensorflow.python.util import deprecation
 
 # Aliases for some automatically-generated names.
 linspace = gen_math_ops.lin_space
 
-arg_max = deprecated(None, "Use `argmax` instead")(arg_max)  # pylint: disable=used-before-assignment
-arg_min = deprecated(None, "Use `argmin` instead")(arg_min)  # pylint: disable=used-before-assignment
+arg_max = deprecation.deprecated(None, "Use `argmax` instead")(arg_max)  # pylint: disable=used-before-assignment
+arg_min = deprecation.deprecated(None, "Use `argmin` instead")(arg_min)  # pylint: disable=used-before-assignment
 
 
 def _set_doc(doc):
+
   def _decorator(func):
     func.__doc__ = doc
     return func
+
   return _decorator
 
 
 # pylint: disable=redefined-builtin
-@deprecated_args(None, "Use the `axis` argument instead", "dimension")
-@_set_doc(gen_math_ops.arg_max.__doc__
-          .replace("dimensions", "axes")
-          .replace("dimension", "axis"))
+@deprecation.deprecated_args(None, "Use the `axis` argument instead",
+                             "dimension")
+@_set_doc(
+    gen_math_ops.arg_max.__doc__.replace("dimensions", "axes").replace(
+        "dimension", "axis"))
 def argmax(input,
            axis=None,
            name=None,
@@ -206,10 +208,11 @@ def argmax(input,
   return gen_math_ops.arg_max(input, axis, name=name, output_type=output_type)
 
 
-@deprecated_args(None, "Use the `axis` argument instead", "dimension")
-@_set_doc(gen_math_ops.arg_min.__doc__
-          .replace("dimensions", "axes")
-          .replace("dimension", "axis"))
+@deprecation.deprecated_args(None, "Use the `axis` argument instead",
+                             "dimension")
+@_set_doc(
+    gen_math_ops.arg_min.__doc__.replace("dimensions", "axes").replace(
+        "dimension", "axis"))
 def argmin(input,
            axis=None,
            name=None,
@@ -249,7 +252,7 @@ def abs(x, name=None):
   Returns:
     A `Tensor` or `SparseTensor` the same size and type as `x` with absolute
       values.
-    Note, for `complex64` or `complex128' input, the returned `Tensor` will be
+    Note, for `complex64` or `complex128` input, the returned `Tensor` will be
       of type `float32` or `float64`, respectively.
   """
   with ops.name_scope(name, "Abs", [x]) as name:
@@ -275,6 +278,8 @@ def abs(x, name=None):
 # pylint: disable=redefined-builtin
 def _bucketize(input, boundaries, name=None):
   return gen_math_ops._bucketize(input=input, boundaries=boundaries, name=name)
+
+
 # pylint: enable=redefined-builtin
 
 
@@ -320,15 +325,15 @@ multiply.__doc__ = gen_math_ops._mul.__doc__.replace("Mul", "`tf.multiply`")
 
 
 # TODO(aselle): put deprecation in after another round of global code changes
-@deprecated(
+@deprecation.deprecated(
     "2016-12-30",
     "`tf.mul(x, y)` is deprecated, please use `tf.multiply(x, y)` or `x * y`")
 def _mul(x, y, name=None):
   return gen_math_ops._mul(x, y, name)
 
 
-_mul.__doc__ = (gen_math_ops._mul.__doc__ +
-                ("" if _mul.__doc__ is None else _mul.__doc__))
+_mul.__doc__ = (
+    gen_math_ops._mul.__doc__ + ("" if _mul.__doc__ is None else _mul.__doc__))
 
 
 def subtract(x, y, name=None):
@@ -339,15 +344,15 @@ subtract.__doc__ = gen_math_ops._sub.__doc__.replace("`Sub`", "`tf.subtract`")
 
 
 # TODO(aselle): put deprecation in after another round of global code changes
-@deprecated(
+@deprecation.deprecated(
     "2016-12-30",
     "`tf.sub(x, y)` is deprecated, please use `tf.subtract(x, y)` or `x - y`")
 def _sub(x, y, name=None):
   return gen_math_ops._sub(x, y, name)
 
 
-_sub.__doc__ = (gen_math_ops._sub.__doc__ +
-                ("" if _sub.__doc__ is None else _sub.__doc__))
+_sub.__doc__ = (
+    gen_math_ops._sub.__doc__ + ("" if _sub.__doc__ is None else _sub.__doc__))
 
 
 # pylint: disable=g-docstring-has-escape
@@ -377,8 +382,9 @@ def negative(x, name=None):
 
 
 # pylint: disable=g-docstring-has-escape
-@deprecated("2016-12-30",
-            "`tf.neg(x)` is deprecated, please use `tf.negative(x)` or `-x`")
+@deprecation.deprecated(
+    "2016-12-30",
+    "`tf.neg(x)` is deprecated, please use `tf.negative(x)` or `-x`")
 def _neg(x, name=None):
   """Computes numerical negative value element-wise.
 
@@ -944,6 +950,7 @@ _TRUEDIV_TABLE = {
     dtypes.int16: dtypes.float32,
     dtypes.int32: dtypes.float64,
     dtypes.int64: dtypes.float64,
+    dtypes.bfloat16: None,
     dtypes.float16: None,
     dtypes.float32: None,
     dtypes.float64: None,
@@ -957,8 +964,8 @@ _TRUEDIV_TABLE = {
 # to explicitly use the "/" operator to invoke either truediv or div.
 def _sparse_dense_truediv(sp_indices, sp_values, sp_shape, y, name=None):
   """Internal helper function for 'sp_t / dense_t'."""
-  with ops.name_scope(name, "truediv", [sp_indices, sp_values, sp_shape,
-                                        y]) as name:
+  with ops.name_scope(name, "truediv",
+                      [sp_indices, sp_values, sp_shape, y]) as name:
     sp_values = ops.convert_to_tensor(sp_values, name="sp_values")
     y = ops.convert_to_tensor(y, name="y")
     x_dtype = sp_values.dtype.base_dtype
@@ -1265,8 +1272,16 @@ def _ReductionDims(x, axis, reduction_indices):
     return range(0, array_ops.rank(x))
 
 
-@deprecated_args(None, "keep_dims is deprecated, use keepdims instead",
-                 "keep_dims")
+def _may_reduce_to_scalar(keepdims, axis, reduction_indices, output):
+  """Set a reduction's output's shape to be a scalar if we are certain."""
+  if (not output.shape.is_fully_defined()) and (not keepdims) and (
+      axis is None) and (reduction_indices is None):
+    output.set_shape(())
+  return output
+
+
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_sum(input_tensor,
                axis=None,
                keepdims=None,
@@ -1302,6 +1317,7 @@ def reduce_sum(input_tensor,
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
+    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced tensor.
@@ -1310,23 +1326,22 @@ def reduce_sum(input_tensor,
   Equivalent to np.sum
   @end_compatibility
   """
-
-  if keep_dims is not None:
-    if keepdims is not None:
-      raise ValueError("Cannot specify both 'keep_dims' and 'keepdims'")
-    keepdims = keep_dims
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
   if keepdims is None:
     keepdims = False
 
-  return gen_math_ops._sum(
-      input_tensor,
-      _ReductionDims(input_tensor, axis, reduction_indices),
-      keepdims,
-      name=name)
+  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
+                               gen_math_ops._sum(
+                                   input_tensor,
+                                   _ReductionDims(input_tensor, axis,
+                                                  reduction_indices),
+                                   keepdims,
+                                   name=name))
 
 
-@deprecated_args(None, "keep_dims is deprecated, use keepdims instead",
-                 "keep_dims")
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def count_nonzero(input_tensor,
                   axis=None,
                   keepdims=None,
@@ -1368,14 +1383,13 @@ def count_nonzero(input_tensor,
     dtype: The output dtype; defaults to `tf.int64`.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
+    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced tensor (number of nonzero values).
   """
-  if keep_dims is not None:
-    if keepdims is not None:
-      raise ValueError("Cannot specify both 'keep_dims' and 'keepdims'")
-    keepdims = keep_dims
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
   if keepdims is None:
     keepdims = False
 
@@ -1392,8 +1406,8 @@ def count_nonzero(input_tensor,
         dtype=dtype)
 
 
-@deprecated_args(None, "keep_dims is deprecated, use keepdims instead",
-                 "keep_dims")
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_mean(input_tensor,
                 axis=None,
                 keepdims=None,
@@ -1423,10 +1437,11 @@ def reduce_mean(input_tensor,
     input_tensor: The tensor to reduce. Should have numeric type.
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
-      `[-rank(input_tensor), rank(input_tensor))`.
+      `[-rank(input_tensor), rank(input_tensor)]`.
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
+    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced tensor.
@@ -1434,7 +1449,10 @@ def reduce_mean(input_tensor,
   @compatibility(numpy)
   Equivalent to np.mean
 
-  Please note that `np.mean` has a `dtype` parameter that could be used to specify the output type. By default this is `dtype=float64`. On the other hand, `tf.reduce_mean` has an aggressive type inference from `input_tensor`, for example:
+  Please note that `np.mean` has a `dtype` parameter that could be used to
+  specify the output type. By default this is `dtype=float64`. On the other
+  hand, `tf.reduce_mean` has an aggressive type inference from `input_tensor`,
+  for example:
 
   ```python
   x = tf.constant([1, 0, 1, 0])
@@ -1445,21 +1463,22 @@ def reduce_mean(input_tensor,
 
   @end_compatibility
   """
-  if keep_dims is not None:
-    if keepdims is not None:
-      raise ValueError("Cannot specify both 'keep_dims' and 'keepdims'")
-    keepdims = keep_dims
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
+
   if keepdims is None:
     keepdims = False
-  return gen_math_ops._mean(
-      input_tensor,
-      _ReductionDims(input_tensor, axis, reduction_indices),
-      keepdims,
-      name=name)
+  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
+                               gen_math_ops._mean(
+                                   input_tensor,
+                                   _ReductionDims(input_tensor, axis,
+                                                  reduction_indices),
+                                   keepdims,
+                                   name=name))
 
 
-@deprecated_args(None, "keep_dims is deprecated, use keepdims instead",
-                 "keep_dims")
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_prod(input_tensor,
                 axis=None,
                 keepdims=None,
@@ -1484,6 +1503,7 @@ def reduce_prod(input_tensor,
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
+    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced tensor.
@@ -1492,21 +1512,22 @@ def reduce_prod(input_tensor,
   Equivalent to np.prod
   @end_compatibility
   """
-  if keep_dims is not None:
-    if keepdims is not None:
-      raise ValueError("Cannot specify both 'keep_dims' and 'keepdims'")
-    keepdims = keep_dims
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
+
   if keepdims is None:
     keepdims = False
-  return gen_math_ops._prod(
-      input_tensor,
-      _ReductionDims(input_tensor, axis, reduction_indices),
-      keepdims,
-      name=name)
+  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
+                               gen_math_ops._prod(
+                                   input_tensor,
+                                   _ReductionDims(input_tensor, axis,
+                                                  reduction_indices),
+                                   keepdims,
+                                   name=name))
 
 
-@deprecated_args(None, "keep_dims is deprecated, use keepdims instead",
-                 "keep_dims")
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_min(input_tensor,
                axis=None,
                keepdims=None,
@@ -1531,6 +1552,7 @@ def reduce_min(input_tensor,
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
+    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced tensor.
@@ -1539,21 +1561,21 @@ def reduce_min(input_tensor,
   Equivalent to np.min
   @end_compatibility
   """
-  if keep_dims is not None:
-    if keepdims is not None:
-      raise ValueError("Cannot specify both 'keep_dims' and 'keepdims'")
-    keepdims = keep_dims
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
   if keepdims is None:
     keepdims = False
-  return gen_math_ops._min(
-      input_tensor,
-      _ReductionDims(input_tensor, axis, reduction_indices),
-      keepdims,
-      name=name)
+  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
+                               gen_math_ops._min(
+                                   input_tensor,
+                                   _ReductionDims(input_tensor, axis,
+                                                  reduction_indices),
+                                   keepdims,
+                                   name=name))
 
 
-@deprecated_args(None, "keep_dims is deprecated, use keepdims instead",
-                 "keep_dims")
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_max(input_tensor,
                axis=None,
                keepdims=None,
@@ -1578,6 +1600,7 @@ def reduce_max(input_tensor,
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
+    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced tensor.
@@ -1586,21 +1609,21 @@ def reduce_max(input_tensor,
   Equivalent to np.max
   @end_compatibility
   """
-  if keep_dims is not None:
-    if keepdims is not None:
-      raise ValueError("Cannot specify both 'keep_dims' and 'keepdims'")
-    keepdims = keep_dims
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
   if keepdims is None:
     keepdims = False
-  return gen_math_ops._max(
-      input_tensor,
-      _ReductionDims(input_tensor, axis, reduction_indices),
-      keepdims,
-      name=name)
+  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
+                               gen_math_ops._max(
+                                   input_tensor,
+                                   _ReductionDims(input_tensor, axis,
+                                                  reduction_indices),
+                                   keepdims,
+                                   name=name))
 
 
-@deprecated_args(None, "keep_dims is deprecated, use keepdims instead",
-                 "keep_dims")
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_all(input_tensor,
                axis=None,
                keepdims=None,
@@ -1634,6 +1657,7 @@ def reduce_all(input_tensor,
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
+    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced tensor.
@@ -1642,21 +1666,21 @@ def reduce_all(input_tensor,
   Equivalent to np.all
   @end_compatibility
   """
-  if keep_dims is not None:
-    if keepdims is not None:
-      raise ValueError("Cannot specify both 'keep_dims' and 'keepdims'")
-    keepdims = keep_dims
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
   if keepdims is None:
     keepdims = False
-  return gen_math_ops._all(
-      input_tensor,
-      _ReductionDims(input_tensor, axis, reduction_indices),
-      keepdims,
-      name=name)
+  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
+                               gen_math_ops._all(
+                                   input_tensor,
+                                   _ReductionDims(input_tensor, axis,
+                                                  reduction_indices),
+                                   keepdims,
+                                   name=name))
 
 
-@deprecated_args(None, "keep_dims is deprecated, use keepdims instead",
-                 "keep_dims")
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_any(input_tensor,
                axis=None,
                keepdims=None,
@@ -1690,6 +1714,7 @@ def reduce_any(input_tensor,
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
+    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced tensor.
@@ -1698,21 +1723,21 @@ def reduce_any(input_tensor,
   Equivalent to np.any
   @end_compatibility
   """
-  if keep_dims is not None:
-    if keepdims is not None:
-      raise ValueError("Cannot specify both 'keep_dims' and 'keepdims'")
-    keepdims = keep_dims
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
   if keepdims is None:
     keepdims = False
-  return gen_math_ops._any(
-      input_tensor,
-      _ReductionDims(input_tensor, axis, reduction_indices),
-      keepdims,
-      name=name)
+  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
+                               gen_math_ops._any(
+                                   input_tensor,
+                                   _ReductionDims(input_tensor, axis,
+                                                  reduction_indices),
+                                   keepdims,
+                                   name=name))
 
 
-@deprecated_args(None, "keep_dims is deprecated, use keepdims instead",
-                 "keep_dims")
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_logsumexp(input_tensor,
                      axis=None,
                      keepdims=None,
@@ -1752,14 +1777,13 @@ def reduce_logsumexp(input_tensor,
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
+    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced tensor.
   """
-  if keep_dims is not None:
-    if keepdims is not None:
-      raise ValueError("Cannot specify both 'keep_dims' and 'keepdims'")
-    keepdims = keep_dims
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
   if keepdims is None:
     keepdims = False
   with ops.name_scope(name, "ReduceLogSumExp", [input_tensor]) as name:
@@ -1770,8 +1794,7 @@ def reduce_logsumexp(input_tensor,
         keepdims=True)
     my_max = array_ops.stop_gradient(
         array_ops.where(
-            gen_math_ops.is_finite(raw_max),
-            raw_max,
+            gen_math_ops.is_finite(raw_max), raw_max,
             array_ops.zeros_like(raw_max)))
     result = gen_math_ops.log(
         reduce_sum(
@@ -1783,7 +1806,7 @@ def reduce_logsumexp(input_tensor,
       if isinstance(axis, int):
         axis = [axis]
       result = array_ops.squeeze(result, axis)
-    return result
+    return _may_reduce_to_scalar(keepdims, axis, reduction_indices, result)
 
 
 def trace(x, name=None):
@@ -1947,9 +1970,9 @@ def matmul(a,
     # TODO(apassos) remove _shape_tuple here when it is not needed.
     a_shape = a._shape_tuple()  # pylint: disable=protected-access
     b_shape = b._shape_tuple()  # pylint: disable=protected-access
-    if (not a_is_sparse and not b_is_sparse) and (
-        (a_shape is None or len(a_shape) > 2) and
-        (b_shape is None or len(b_shape) > 2)):
+    if (not a_is_sparse and
+        not b_is_sparse) and ((a_shape is None or len(a_shape) > 2) and
+                              (b_shape is None or len(b_shape) > 2)):
       # BatchMatmul does not support transpose, so we conjugate the matrix and
       # use adjoint instead. Conj() is a noop for real matrices.
       if transpose_a:
@@ -1974,8 +1997,8 @@ def matmul(a,
     use_sparse_matmul = False
     if a_is_sparse or b_is_sparse:
       sparse_matmul_types = [dtypes.bfloat16, dtypes.float32]
-      use_sparse_matmul = (a.dtype in sparse_matmul_types and
-                           b.dtype in sparse_matmul_types)
+      use_sparse_matmul = (
+          a.dtype in sparse_matmul_types and b.dtype in sparse_matmul_types)
     if a.dtype == dtypes.bfloat16 or b.dtype == dtypes.bfloat16:
       # matmul currently doesn't handle bfloat16 inputs.
       use_sparse_matmul = True
@@ -2066,8 +2089,8 @@ def _as_indexed_slices_list(inputs, optimize=True):
   for o in outputs:
     if o.indices.dtype == dtypes.int32:
       casted_outputs.append(
-          ops.IndexedSlices(o.values,
-                            cast(o.indices, dtypes.int64), o.dense_shape))
+          ops.IndexedSlices(o.values, cast(o.indices, dtypes.int64),
+                            o.dense_shape))
     else:
       casted_outputs.append(o)
   return casted_outputs
@@ -2166,8 +2189,8 @@ def accumulate_n(inputs, shape=None, tensor_dtype=None, name=None):
   if tensor_dtype is None:
     tensor_dtype = inputs[0].dtype
   if tensor_dtype != inputs[0].dtype:
-    raise TypeError("tensor_dtype is {}, but input is of type {}"
-                    .format(tensor_dtype, inputs[0].dtype))
+    raise TypeError("tensor_dtype is {}, but input is of type {}".format(
+        tensor_dtype, inputs[0].dtype))
   if len(inputs) == 1:
     return inputs[0]
   with ops.name_scope(name, "AccumulateN", inputs) as name:
@@ -2472,6 +2495,159 @@ def reduced_shape(input_shape, axes):
       ])  # [1, 1]
 
 
+def sparse_segment_sum(data, indices, segment_ids, name=None,
+                       num_segments=None):
+  r"""Computes the sum along sparse segments of a tensor.
+
+  Read @{$math_ops#segmentation$the section on segmentation} for an explanation
+  of segments.
+
+  Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
+  dimension, selecting a subset of dimension 0, specified by `indices`.
+  `segment_ids` is allowed to have missing ids, in which case the output will
+  be zeros at those indices. In those cases `num_segments` is used to determine
+  the size of the output.
+
+  For example:
+
+  ```python
+  c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+
+  # Select two rows, one segment.
+  tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
+  # => [[0 0 0 0]]
+
+  # Select two rows, two segment.
+  tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
+  # => [[ 1  2  3  4]
+  #     [-1 -2 -3 -4]]
+
+  # With missing segment ids.
+  tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 2]),
+                        num_segments=4)
+  # => [[ 1  2  3  4]
+  #     [ 0  0  0  0]
+  #     [-1 -2 -3 -4]
+  #     [ 0  0  0  0]]
+
+  # Select all rows, two segments.
+  tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
+  # => [[0 0 0 0]
+  #     [5 6 7 8]]
+
+  # Which is equivalent to:
+  tf.segment_sum(c, tf.constant([0, 0, 1]))
+  ```
+
+  Args:
+    data: A `Tensor` with data that will be assembled in the output.
+    indices: A 1-D `Tensor` with indices into `data`. Has same rank as
+      `segment_ids`.
+    segment_ids: A 1-D `Tensor` with indices into the output `Tensor`.
+      Values should be sorted and can be repeated.
+    name: A name for the operation (optional).
+    num_segments: An optional int32 scalar. Indicates the size of the output
+      `Tensor`.
+
+  Returns:
+    A `tensor` of the shape as data, except for dimension 0 which
+    has size `k`, the number of segments specified via `num_segments` or
+    inferred for the last element in `segments_ids`.
+  """
+  if num_segments is not None:
+    return gen_math_ops.sparse_segment_sum_with_num_segments(
+        data=data,
+        indices=indices,
+        segment_ids=segment_ids,
+        num_segments=num_segments,
+        name=name)
+  else:
+    return gen_math_ops.sparse_segment_sum(
+        data=data,
+        indices=indices,
+        segment_ids=segment_ids,
+        name=name)
+
+
+def sparse_segment_mean(data, indices, segment_ids, name=None,
+                        num_segments=None):
+  r"""Computes the mean along sparse segments of a tensor.
+
+  Read @{$math_ops#segmentation$the section on segmentation} for an explanation
+  of segments.
+
+  Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
+  dimension, selecting a subset of dimension 0, specified by `indices`.
+  `segment_ids` is allowed to have missing ids, in which case the output will
+  be zeros at those indices. In those cases `num_segments` is used to determine
+  the size of the output.
+
+  Args:
+    data: A `Tensor` with data that will be assembled in the output.
+    indices: A 1-D `Tensor` with indices into `data`. Has same rank as
+      `segment_ids`.
+    segment_ids: A 1-D `Tensor` with indices into the output `Tensor`.
+      Values should be sorted and can be repeated.
+    name: A name for the operation (optional).
+    num_segments: An optional int32 scalar. Indicates the size of the output
+      `Tensor`.
+
+  Returns:
+    A `tensor` of the shape as data, except for dimension 0 which
+    has size `k`, the number of segments specified via `num_segments` or
+    inferred for the last element in `segments_ids`.
+  """
+  if num_segments is not None:
+    return gen_math_ops.sparse_segment_mean_with_num_segments(
+        data=data,
+        indices=indices,
+        segment_ids=segment_ids,
+        num_segments=num_segments,
+        name=name)
+  else:
+    return gen_math_ops.sparse_segment_mean(
+        data=data,
+        indices=indices,
+        segment_ids=segment_ids,
+        name=name)
+
+
+def sparse_segment_sqrt_n(data, indices, segment_ids, name=None,
+                          num_segments=None):
+  r"""Computes the sum along sparse segments of a tensor divided by the sqrt(N).
+
+  `N` is the size of the segment being reduced.
+
+  Args:
+    data: A `Tensor` with data that will be assembled in the output.
+    indices: A 1-D `Tensor` with indices into `data`. Has same rank as
+      `segment_ids`.
+    segment_ids: A 1-D `Tensor` with indices into the output `Tensor`.
+      Values should be sorted and can be repeated.
+    name: A name for the operation (optional).
+    num_segments: An optional int32 scalar. Indicates the size of the output
+      `Tensor`.
+
+  Returns:
+    A `tensor` of the shape as data, except for dimension 0 which
+    has size `k`, the number of segments specified via `num_segments` or
+    inferred for the last element in `segments_ids`.
+  """
+  if num_segments is not None:
+    return gen_math_ops.sparse_segment_sqrt_n_with_num_segments(
+        data=data,
+        indices=indices,
+        segment_ids=segment_ids,
+        num_segments=num_segments,
+        name=name)
+  else:
+    return gen_math_ops.sparse_segment_sqrt_n(
+        data=data,
+        indices=indices,
+        segment_ids=segment_ids,
+        name=name)
+
+
 def tensordot(a, b, axes, name=None):
   r"""Tensor contraction of a and b along specified axes.
 
@@ -2568,7 +2744,8 @@ def tensordot(a, b, axes, name=None):
       rank_a = array_ops.rank(a)
       axes = ops.convert_to_tensor(axes, dtype=dtypes.int32, name="axes")
       axes = cast(axes >= 0, dtypes.int32) * axes + cast(
-          axes < 0, dtypes.int32) * (axes + rank_a)
+          axes < 0, dtypes.int32) * (
+              axes + rank_a)
       free, _ = array_ops.setdiff1d(range(rank_a), axes)
       free_dims = array_ops.gather(shape_a, free)
       axes_dims = array_ops.gather(shape_a, axes)
@@ -2594,8 +2771,8 @@ def tensordot(a, b, axes, name=None):
         return range(a_shape.ndims - axes, a_shape.ndims), range(axes)
       else:
         rank = array_ops.rank(a)
-        return (range(rank - axes, rank, dtype=dtypes.int32), range(
-            axes, dtype=dtypes.int32))
+        return (range(rank - axes, rank, dtype=dtypes.int32),
+                range(axes, dtype=dtypes.int32))
     elif isinstance(axes, (list, tuple)):
       if len(axes) != 2:
         raise ValueError("'axes' must be an integer or have length 2.")
@@ -2619,8 +2796,8 @@ def tensordot(a, b, axes, name=None):
     b = ops.convert_to_tensor(b, name="b")
     a_axes, b_axes = _tensordot_axes(a, axes)
     a_reshape, a_free_dims, a_free_dims_static = _tensordot_reshape(a, a_axes)
-    b_reshape, b_free_dims, b_free_dims_static = _tensordot_reshape(b, b_axes,
-                                                                    True)
+    b_reshape, b_free_dims, b_free_dims_static = _tensordot_reshape(
+        b, b_axes, True)
     ab_matmul = matmul(a_reshape, b_reshape)
     if isinstance(a_free_dims, list) and isinstance(b_free_dims, list):
       return array_ops.reshape(ab_matmul, a_free_dims + b_free_dims, name=name)
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index 4642f4c580fbf5401af4c6a5ec43851e67a0af8b..bd26ff66961c858865c8a61469abac0b783ed645 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -21,7 +21,6 @@ import numpy as np
 
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -31,12 +30,12 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
-ops._USE_C_API = True
 
 exp = np.exp
 log = np.log
 
 
+@test_util.with_c_api
 class ReduceTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes()
@@ -62,16 +61,16 @@ class ReduceTest(test_util.TensorFlowTestCase):
   @test_util.run_in_graph_and_eager_modes()
   def testReduceInvalidAxis(self):
     if context.in_eager_mode():
-      # The shape check is in run a graph contruction time. In eager mode,
+      # The shape check is in run a graph construction time. In eager mode,
       # it misses the check, magically return result given wrong shape.
       return
     x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)
     axis = np.array([[0], [1]])
-    with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                 "must be at most rank 1"):
+    with self.assertRaisesRegexp(ValueError, "must be at most rank 1"):
       math_ops.reduce_sum(x, axis)
 
 
+@test_util.with_c_api
 class LogSumExpTest(test_util.TensorFlowTestCase):
 
   def testReduceLogSumExp(self):
@@ -151,6 +150,7 @@ class LogSumExpTest(test_util.TensorFlowTestCase):
       self.assertEqual(-np.inf, res)
 
 
+@test_util.with_c_api
 class RoundTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes()
@@ -168,6 +168,7 @@ class RoundTest(test_util.TensorFlowTestCase):
         self.assertAllClose(y_tf_np, y_np, atol=1e-2)
 
 
+@test_util.with_c_api
 class ModTest(test_util.TensorFlowTestCase):
 
   def testFloat(self):
@@ -197,6 +198,7 @@ class ModTest(test_util.TensorFlowTestCase):
         self.assertAllClose(y_tf_np, y_np)
 
 
+@test_util.with_c_api
 class SquaredDifferenceTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes()
@@ -210,6 +212,7 @@ class SquaredDifferenceTest(test_util.TensorFlowTestCase):
         self.assertAllClose(z, z_tf)
 
 
+@test_util.with_c_api
 class ApproximateEqualTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes()
@@ -241,6 +244,7 @@ class ApproximateEqualTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(z, z_tf)
 
 
+@test_util.with_c_api
 class ScalarMulTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes()
@@ -282,6 +286,7 @@ class ScalarMulTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(self.evaluate(x.indices), [0, 2, 5])
 
 
+@test_util.with_c_api
 class AccumulateNTest(test_util.TensorFlowTestCase):
 
   def testFloat(self):
@@ -301,6 +306,7 @@ class AccumulateNTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(x[0] * 6, math_ops.accumulate_n([tf_x[0]] * 6).eval())
 
 
+@test_util.with_c_api
 class AddNTest(test_util.TensorFlowTestCase):
 
   def testPartials(self):
@@ -354,6 +360,7 @@ class AddNTest(test_util.TensorFlowTestCase):
                             [g.eval() for g in add_n_grad])
 
 
+@test_util.with_c_api
 class DivAndModTest(test_util.TensorFlowTestCase):
   # TODO(aselle): Test more types before exposing new division operators.
 
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index d30f6b92ad42259f47b1135b72c4a1d3dc4f810e..e04121ee31d1b6c82151bf7415b3e73614b24781 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -792,9 +792,10 @@ def mean_cosine_distance(labels, predictions, dim, weights=None,
   predictions, labels, weights = _remove_squeezable_dimensions(
       predictions=predictions, labels=labels, weights=weights)
   radial_diffs = math_ops.multiply(predictions, labels)
-  radial_diffs = math_ops.reduce_sum(radial_diffs,
-                                     reduction_indices=[dim,],
-                                     keepdims=True)
+  radial_diffs = math_ops.reduce_sum(
+      radial_diffs, reduction_indices=[
+          dim,
+      ], keepdims=True)
   mean_distance, update_op = mean(radial_diffs, weights,
                                   None,
                                   None,
diff --git a/tensorflow/python/ops/nn_batchnorm_test.py b/tensorflow/python/ops/nn_batchnorm_test.py
index 8aed2e293fa2dd6559d342f109a996d810db13bf..fc013b565b764f0d22df29f99e78cb97498c5ced 100644
--- a/tensorflow/python/ops/nn_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_batchnorm_test.py
@@ -21,9 +21,12 @@ from __future__ import print_function
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.core.framework import graph_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gradient_checker
@@ -34,8 +37,18 @@ import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 
 
+@test_util.with_c_api
 class BatchNormalizationTest(test.TestCase):
 
+  def SetProducerVersion(self, graph, producer_version):
+    # The C API doesn't expose altering GraphDefVersions. We can indirectly set
+    # it via import_graph_def though.
+    graph_def = graph_pb2.GraphDef()
+    graph_def.versions.producer = producer_version
+    with graph.as_default():
+      importer.import_graph_def(graph_def)
+    assert graph.graph_def_versions.producer, producer_version
+
   def _npBatchNorm(self, x, m, v, beta, gamma, epsilon,
                    scale_after_normalization, shift_after_normalization):
     y = (x - m) / np.sqrt(v + epsilon)
@@ -52,9 +65,7 @@ class BatchNormalizationTest(test.TestCase):
   def _tfBatchNormV1(self, x, m, v, beta, gamma, epsilon,
                      scale_after_normalization):
     """Original implementation."""
-    # _batch_norm_with_global_normalization is deprecated in v9
-    ops.get_default_graph().graph_def_versions.producer = 8
-    # pylint: disable=protected-access
+    self.SetProducerVersion(ops.get_default_graph(), 8)
     return gen_nn_ops._batch_norm_with_global_normalization(
         x, m, v, beta, gamma, epsilon, scale_after_normalization)
     # pylint: enable=protected-access
@@ -222,7 +233,7 @@ class BatchNormalizationTest(test.TestCase):
         epsilon = 0.001
         for scale_after_normalization in [True, False]:
           # _batch_norm_with_global_normalization_grad is deprecated in v9
-          ops.get_default_graph().graph_def_versions.producer = 8
+          self.SetProducerVersion(ops.get_default_graph(), 8)
           grad = gen_nn_ops._batch_norm_with_global_normalization_grad(
               x, m, v, gamma, backprop, epsilon, scale_after_normalization)
           dx, dm, dv, db, dg = grad
@@ -334,6 +345,7 @@ class BatchNormalizationTest(test.TestCase):
         (2, 3, 2, 4, 5), (1, 1, 1, 4, 5), atol=0.005)
 
 
+@test_util.with_c_api
 class SufficientStatisticsTest(test.TestCase):
 
   def _npSuffStats(self, x, axes, shift, keep_dims):
@@ -393,6 +405,7 @@ class SufficientStatisticsTest(test.TestCase):
           self._testSuffStats([1, 2, 3], [0, 2], shift, keep_dims, has_shape)
 
 
+@test_util.with_c_api
 class NormalizeMomentsTest(test.TestCase):
 
   def _npNormalizeMoments(self, counts, mean_ss, variance_ss, shift):
@@ -436,6 +449,7 @@ class NormalizeMomentsTest(test.TestCase):
       self._testNormalizeMoments([2, 3], shift)
 
 
+@test_util.with_c_api
 class MomentsTest(test.TestCase):
 
   def _unweighted_moments(self, x, axes, keep_dims=False, extra_out_grads=None):
@@ -573,6 +587,7 @@ class MomentsTest(test.TestCase):
     self._testGlobalGradient(from_y="var")
 
 
+@test_util.with_c_api
 class WeightedMomentsTest(MomentsTest):
   """Tests for nn.weighted_moments.
 
diff --git a/tensorflow/python/ops/nn_fused_batchnorm_test.py b/tensorflow/python/ops/nn_fused_batchnorm_test.py
index e72d34d1f728344709cd7429ab560379a2836cab..ff7137d492ccd3ea3bd076eda4f149b53cb09260 100644
--- a/tensorflow/python/ops/nn_fused_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_fused_batchnorm_test.py
@@ -333,7 +333,7 @@ class BatchNormalizationTest(test.TestCase):
     self.assertLess(err_grad_x_2, err_tolerance)
     self.assertLess(err_grad_scale, err_tolerance)
 
-  def testInference(self):
+  def testInferenceShape1(self):
     x_shape = [1, 1, 6, 1]
     for dtype in [np.float16, np.float32]:
       if test.is_gpu_available(cuda_only=True):
@@ -344,6 +344,7 @@ class BatchNormalizationTest(test.TestCase):
       self._test_inference(
           x_shape, dtype, [1], np.float32, use_gpu=False, data_format='NHWC')
 
+  def testInferenceShape2(self):
     x_shape = [1, 1, 6, 2]
     if test.is_gpu_available(cuda_only=True):
       for dtype in [np.float16, np.float32]:
@@ -352,12 +353,14 @@ class BatchNormalizationTest(test.TestCase):
         self._test_inference(
             x_shape, dtype, [2], np.float32, use_gpu=False, data_format='NHWC')
 
+  def testInferenceShape3(self):
     x_shape = [1, 2, 1, 6]
     if test.is_gpu_available(cuda_only=True):
       for dtype in [np.float16, np.float32]:
         self._test_inference(
             x_shape, dtype, [2], np.float32, use_gpu=True, data_format='NCHW')
 
+  def testInferenceShape4(self):
     x_shape = [27, 131, 127, 6]
     for dtype in [np.float16, np.float32]:
       if test.is_gpu_available(cuda_only=True):
@@ -368,7 +371,7 @@ class BatchNormalizationTest(test.TestCase):
       self._test_inference(
           x_shape, dtype, [6], np.float32, use_gpu=False, data_format='NHWC')
 
-  def testTraining(self):
+  def testTrainingShape1(self):
     x_shape = [1, 1, 6, 1]
     for dtype in [np.float16, np.float32]:
       if test.is_gpu_available(cuda_only=True):
@@ -379,6 +382,7 @@ class BatchNormalizationTest(test.TestCase):
       self._test_training(
           x_shape, dtype, [1], np.float32, use_gpu=False, data_format='NHWC')
 
+  def testTrainingShape2(self):
     x_shape = [1, 1, 6, 2]
     for dtype in [np.float16, np.float32]:
       if test.is_gpu_available(cuda_only=True):
@@ -387,12 +391,14 @@ class BatchNormalizationTest(test.TestCase):
       self._test_training(
           x_shape, dtype, [2], np.float32, use_gpu=False, data_format='NHWC')
 
+  def testTrainingShape3(self):
     x_shape = [1, 2, 1, 6]
     if test.is_gpu_available(cuda_only=True):
       for dtype in [np.float16, np.float32]:
         self._test_training(
             x_shape, dtype, [2], np.float32, use_gpu=True, data_format='NCHW')
 
+  def testTrainingShape4(self):
     x_shape = [27, 131, 127, 6]
     for dtype in [np.float16, np.float32]:
       if test.is_gpu_available(cuda_only=True):
@@ -403,7 +409,7 @@ class BatchNormalizationTest(test.TestCase):
       self._test_training(
           x_shape, dtype, [6], np.float32, use_gpu=False, data_format='NHWC')
 
-  def testBatchNormGrad(self):
+  def testBatchNormGradShape1(self):
     for is_training in [True, False]:
       x_shape = [1, 1, 6, 1]
       for dtype in [np.float16, np.float32]:
@@ -430,6 +436,8 @@ class BatchNormalizationTest(test.TestCase):
             data_format='NHWC',
             is_training=is_training)
 
+  def testBatchNormGradShape2(self):
+    for is_training in [True, False]:
       x_shape = [1, 1, 6, 2]
       for dtype in [np.float16, np.float32]:
         if test.is_gpu_available(cuda_only=True):
@@ -448,6 +456,8 @@ class BatchNormalizationTest(test.TestCase):
             data_format='NHWC',
             is_training=is_training)
 
+  def testBatchNormGradShape3(self):
+    for is_training in [True, False]:
       x_shape = [1, 2, 1, 6]
       if test.is_gpu_available(cuda_only=True):
         for dtype in [np.float16, np.float32]:
@@ -459,6 +469,8 @@ class BatchNormalizationTest(test.TestCase):
               data_format='NCHW',
               is_training=is_training)
 
+  def testBatchNormGradShape4(self):
+    for is_training in [True, False]:
       x_shape = [5, 7, 11, 4]
       for dtype in [np.float16, np.float32]:
         if test.is_gpu_available(cuda_only=True):
@@ -515,26 +527,37 @@ class BatchNormalizationTest(test.TestCase):
           is_training=is_training,
           err_tolerance=err_tolerance)
 
-  def testBatchNormGradGrad(self):
-    configs = [{
+  def testBatchNormGradGradConfig1(self):
+    config = {
         'shape': [2, 3, 4, 5],
         'err_tolerance': 1e-2,
         'dtype': np.float32,
-    }, {
+    }
+    self._testBatchNormGradGrad(config)
+
+  def testBatchNormGradGradConfig2(self):
+    config = {
         'shape': [2, 3, 2, 2],
         'err_tolerance': 1e-3,
         'dtype': np.float32,
-    }, {
+    }
+    self._testBatchNormGradGrad(config)
+
+  def testBatchNormGradGradConfig3(self):
+    config = {
         'shape': [2, 3, 4, 5],
         'err_tolerance': 1e-2,
         'dtype': np.float16,
-    }, {
+    }
+    self._testBatchNormGradGrad(config)
+
+  def testBatchNormGradGradConfig4(self):
+    config = {
         'shape': [2, 3, 2, 2],
         'err_tolerance': 2e-3,
         'dtype': np.float16,
-    }]
-    for config in configs:
-      self._testBatchNormGradGrad(config)
+    }
+    self._testBatchNormGradGrad(config)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index 4b406ba8404d60fbed43afa30f44b1e1a9b26d84..8cd535aa0b1a220e33d766714696092f212e1e83 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -41,33 +41,48 @@ def _Conv2DBackpropInputGrad(op, grad):
   Returns:
     the gradients w.r.t. the input and the filter
   """
-  return [None,
-          nn_ops.conv2d_backprop_filter(grad, array_ops.shape(op.inputs[1]),
-                                        op.inputs[2], op.get_attr("strides"),
-                                        op.get_attr("padding"),
-                                        op.get_attr("use_cudnn_on_gpu"),
-                                        op.get_attr("data_format")),
-          nn_ops.conv2d(grad, op.inputs[1], op.get_attr("strides"),
-                        op.get_attr("padding"), op.get_attr("use_cudnn_on_gpu"),
-                        op.get_attr("data_format"))]
+  return [
+      None,
+      nn_ops.conv2d_backprop_filter(
+          grad,
+          array_ops.shape(op.inputs[1]),
+          op.inputs[2],
+          dilations=op.get_attr("dilations"),
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
+          data_format=op.get_attr("data_format")),
+      nn_ops.conv2d(
+          grad,
+          op.inputs[1],
+          dilations=op.get_attr("dilations"),
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
+          data_format=op.get_attr("data_format"))
+  ]
 
 
 @ops.RegisterGradient("Conv2DBackpropFilter")
 def _Conv2DBackpropFilterGrad(op, grad):
   return [
       nn_ops.conv2d_backprop_input(
-          array_ops.shape(op.inputs[0]), grad, op.inputs[2],
-          op.get_attr("strides"),
-          op.get_attr("padding"),
-          op.get_attr("use_cudnn_on_gpu"),
-          op.get_attr("data_format")),
-      None,
+          array_ops.shape(op.inputs[0]),
+          grad,
+          op.inputs[2],
+          dilations=op.get_attr("dilations"),
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
+          data_format=op.get_attr("data_format")), None,
       nn_ops.conv2d(
-          op.inputs[0], grad,
-          op.get_attr("strides"),
-          op.get_attr("padding"),
-          op.get_attr("use_cudnn_on_gpu"),
-          op.get_attr("data_format"))
+          op.inputs[0],
+          grad,
+          dilations=op.get_attr("dilations"),
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
+          data_format=op.get_attr("data_format"))
   ]
 
 
@@ -466,25 +481,32 @@ def _SparseSoftmaxCrossEntropyWithLogitsGrad(op, grad_0, _):
 
 @ops.RegisterGradient("Conv2D")
 def _Conv2DGrad(op, grad):
+  dilations = op.get_attr("dilations")
   strides = op.get_attr("strides")
   padding = op.get_attr("padding")
   use_cudnn_on_gpu = op.get_attr("use_cudnn_on_gpu")
   data_format = op.get_attr("data_format")
   shape_0, shape_1 = array_ops.shape_n([op.inputs[0], op.inputs[1]])
-  return [nn_ops.conv2d_backprop_input(shape_0,
-                                       op.inputs[1],
-                                       grad,
-                                       strides,
-                                       padding,
-                                       use_cudnn_on_gpu,
-                                       data_format),
-          nn_ops.conv2d_backprop_filter(op.inputs[0],
-                                        shape_1,
-                                        grad,
-                                        strides,
-                                        padding,
-                                        use_cudnn_on_gpu,
-                                        data_format)]
+  return [
+      nn_ops.conv2d_backprop_input(
+          shape_0,
+          op.inputs[1],
+          grad,
+          dilations=dilations,
+          strides=strides,
+          padding=padding,
+          use_cudnn_on_gpu=use_cudnn_on_gpu,
+          data_format=data_format),
+      nn_ops.conv2d_backprop_filter(
+          op.inputs[0],
+          shape_1,
+          grad,
+          dilations=dilations,
+          strides=strides,
+          padding=padding,
+          use_cudnn_on_gpu=use_cudnn_on_gpu,
+          data_format=data_format)
+  ]
 
 
 @ops.RegisterGradient("DepthwiseConv2dNative")
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index da037a79839b77d6781a35522712fb05bfc71f52..fd96f7b8fcf423e2381f84b50b0532e46ce2fe6e 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import candidate_sampling_ops
 from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
@@ -333,6 +334,7 @@ def l2_normalize(x, axis=None, epsilon=1e-12, name=None, dim=None):
     epsilon: A lower bound value for the norm. Will use `sqrt(epsilon)` as the
       divisor if `norm < sqrt(epsilon)`.
     name: A name for this operation (optional).
+    dim: Deprecated alias for axis.
 
   Returns:
     A `Tensor` with the same shape as `x`.
@@ -340,7 +342,7 @@ def l2_normalize(x, axis=None, epsilon=1e-12, name=None, dim=None):
   with ops.name_scope(name, "l2_normalize", [x]) as name:
     axis = deprecated_argument_lookup("axis", axis, "dim", dim)
     x = ops.convert_to_tensor(x, name="x")
-    square_sum = math_ops.reduce_sum(math_ops.square(x), axis, keep_dims=True)
+    square_sum = math_ops.reduce_sum(math_ops.square(x), axis, keepdims=True)
     x_inv_norm = math_ops.rsqrt(math_ops.maximum(square_sum, epsilon))
     return math_ops.multiply(x, x_inv_norm, name=name)
 
@@ -592,8 +594,8 @@ def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None):
     else:  # no shift.
       m_ss = x
       v_ss = math_ops.square(x)
-    m_ss = math_ops.reduce_sum(m_ss, axes, keep_dims=keep_dims, name="mean_ss")
-    v_ss = math_ops.reduce_sum(v_ss, axes, keep_dims=keep_dims, name="var_ss")
+    m_ss = math_ops.reduce_sum(m_ss, axes, keepdims=keep_dims, name="mean_ss")
+    v_ss = math_ops.reduce_sum(v_ss, axes, keepdims=keep_dims, name="var_ss")
   return counts, m_ss, v_ss, shift
 
 
@@ -637,7 +639,7 @@ def moments(x, axes,
   across `axes`.  If `x` is 1-D and `axes = [0]` this is just the mean
   and variance of a vector.
 
-  Note: shift is currently not used, the true mean is computed and used.
+  Note: shift is currently not used; the true mean is computed and used.
 
   When using these moments for batch normalization (see
   `tf.nn.batch_normalization`):
@@ -663,12 +665,12 @@ def moments(x, axes,
     # on 32-bit floats before converting the mean and variance back to fp16
     y = math_ops.cast(x, dtypes.float32) if x.dtype == dtypes.float16 else x
     # Compute true mean while keeping the dims for proper broadcasting.
-    mean = math_ops.reduce_mean(y, axes, keep_dims=True, name="mean")
+    mean = math_ops.reduce_mean(y, axes, keepdims=True, name="mean")
     # sample variance, not unbiased variance
     variance = math_ops.reduce_mean(
         math_ops.squared_difference(y, array_ops.stop_gradient(mean)),
         axes,
-        keep_dims=True,
+        keepdims=True,
         name="variance")
     if not keep_dims:
       mean = array_ops.squeeze(mean, axes)
@@ -713,7 +715,7 @@ def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
     # Note that we use keep_dims=True for our reductions regardless of the arg;
     # this is so that the results remain broadcast-compatible with the inputs.
     weighted_input_sum = math_ops.reduce_sum(
-        frequency_weights * x, axes, name="weighted_input_sum", keep_dims=True)
+        frequency_weights * x, axes, name="weighted_input_sum", keepdims=True)
 
     # The shape of the weights isn't necessarily the same as x's
     # shape, just broadcast-compatible with it -- so this expression
@@ -724,7 +726,7 @@ def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
     broadcasted_weights = frequency_weights + array_ops.zeros_like(x)
 
     sum_of_weights = math_ops.reduce_sum(
-        broadcasted_weights, axes, name="sum_of_weights", keep_dims=True)
+        broadcasted_weights, axes, name="sum_of_weights", keepdims=True)
 
     divisor = math_ops.reciprocal(sum_of_weights, name="inv_weight_sum")
 
@@ -735,7 +737,7 @@ def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
         frequency_weights * math_ops.squared_difference(x, weighted_mean),
         axes,
         name="weighted_distsq",
-        keep_dims=True)
+        keepdims=True)
 
     weighted_variance = math_ops.multiply(weighted_distsq, divisor)
 
@@ -862,7 +864,7 @@ def fused_batch_norm(
   # currently only use the V2 version for float16 inputs, which is not supported
   # by the V1 version.
   # pylint: disable=protected-access
-  if x.dtype == dtypes.float16:
+  if x.dtype == dtypes.float16 or x.dtype == dtypes.bfloat16:
     fused_batch_norm_func = gen_nn_ops._fused_batch_norm_v2
   else:
     fused_batch_norm_func = gen_nn_ops._fused_batch_norm
@@ -980,10 +982,11 @@ def _compute_sampled_logits(weights,
         Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
     name: A name for the operation (optional).
   Returns:
-    out_logits, out_labels: `Tensor` objects each with shape
+    out_logits: `Tensor` object with shape
         `[batch_size, num_true + num_sampled]`, for passing to either
         `nn.sigmoid_cross_entropy_with_logits` (NCE) or
         `nn.softmax_cross_entropy_with_logits` (sampled softmax).
+    out_labels: A Tensor object with the same shape as `out_logits`.
   """
 
   if isinstance(weights, variables.PartitionedVariable):
@@ -1094,15 +1097,16 @@ def _compute_sampled_logits(weights,
 
     # Construct output logits and labels. The true labels/logits start at col 0.
     out_logits = array_ops.concat([true_logits, sampled_logits], 1)
-    # true_logits is a float tensor, ones_like(true_logits) is a float tensor
-    # of ones. We then divide by num_true to ensure the per-example labels sum
-    # to 1.0, i.e. form a proper probability distribution.
+
+    # true_logits is a float tensor, ones_like(true_logits) is a float
+    # tensor of ones. We then divide by num_true to ensure the per-example
+    # labels sum to 1.0, i.e. form a proper probability distribution.
     out_labels = array_ops.concat([
         array_ops.ones_like(true_logits) / num_true,
         array_ops.zeros_like(sampled_logits)
     ], 1)
 
-  return out_logits, out_labels
+    return out_logits, out_labels
 
 
 def nce_loss(weights,
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 61fa4629888064556fbb0b352918d19346738266..3a77d89760daeef39ac0b57d4a1c5336a33f1080 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -23,7 +23,6 @@ import numbers
 import numpy as np
 
 from tensorflow.python.eager import context
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import graph_util
 from tensorflow.python.framework import ops
@@ -38,11 +37,10 @@ from tensorflow.python.ops import random_ops
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_nn_ops import *
 # pylint: enable=wildcard-import
-from tensorflow.python.util.deprecation import deprecated_args
-from tensorflow.python.util.deprecation import deprecated_argument_lookup
 
 from tensorflow.python.util import deprecation
 
+
 # Aliases for some automatically-generated names.
 local_response_normalization = gen_nn_ops.lrn
 
@@ -1207,13 +1205,14 @@ def conv2d_transpose(value,
       raise ValueError("padding must be either VALID or SAME:"
                        " {}".format(padding))
 
-    return gen_nn_ops.conv2d_backprop_input(input_sizes=output_shape_,
-                                            filter=filter,
-                                            out_backprop=value,
-                                            strides=strides,
-                                            padding=padding,
-                                            data_format=data_format,
-                                            name=name)
+    return gen_nn_ops.conv2d_backprop_input(
+        input_sizes=output_shape_,
+        filter=filter,
+        out_backprop=value,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
 
 
 def atrous_conv2d_transpose(value,
@@ -1345,12 +1344,13 @@ def atrous_conv2d_transpose(value,
                    (in_width + pad_right_extra) // rate,
                    output_shape[3]]
 
-    value = gen_nn_ops.conv2d_backprop_input(input_sizes=input_sizes,
-                                             filter=filters,
-                                             out_backprop=value,
-                                             strides=[1, 1, 1, 1],
-                                             padding="VALID",
-                                             data_format="NHWC")
+    value = gen_nn_ops.conv2d_backprop_input(
+        input_sizes=input_sizes,
+        filter=filters,
+        out_backprop=value,
+        strides=[1, 1, 1, 1],
+        padding="VALID",
+        data_format="NHWC")
 
     # The crops argument to batch_to_space includes both padding components.
     batch_to_space_crop = [[pad_top, pad_bottom + pad_bottom_extra],
@@ -1648,7 +1648,7 @@ def _softmax(logits, compute_op, dim=-1, name=None):
   return output
 
 
-@deprecated_args(None, "dim is deprecated, use axis instead", "dim")
+@deprecation.deprecated_args(None, "dim is deprecated, use axis instead", "dim")
 def softmax(logits, axis=None, name=None, dim=None):
   """Computes softmax activations.
 
@@ -1662,6 +1662,7 @@ def softmax(logits, axis=None, name=None, dim=None):
     axis: The dimension softmax would be performed on. The default is -1 which
       indicates the last dimension.
     name: A name for the operation (optional).
+    dim: Deprecated alias for `axis`.
 
   Returns:
     A `Tensor`. Has the same type and shape as `logits`.
@@ -1670,13 +1671,13 @@ def softmax(logits, axis=None, name=None, dim=None):
     InvalidArgumentError: if `logits` is empty or `axis` is beyond the last
       dimension of `logits`.
   """
-  axis = deprecated_argument_lookup("axis", axis, "dim", dim)
+  axis = deprecation.deprecated_argument_lookup("axis", axis, "dim", dim)
   if axis is None:
     axis = -1
   return _softmax(logits, gen_nn_ops._softmax, axis, name)
 
 
-@deprecated_args(None, "dim is deprecated, use axis instead", "dim")
+@deprecation.deprecated_args(None, "dim is deprecated, use axis instead", "dim")
 def log_softmax(logits, axis=None, name=None, dim=None):
   """Computes log softmax activations.
 
@@ -1690,6 +1691,7 @@ def log_softmax(logits, axis=None, name=None, dim=None):
     axis: The dimension softmax would be performed on. The default is -1 which
       indicates the last dimension.
     name: A name for the operation (optional).
+    dim: Deprecated alias for `axis`.
 
   Returns:
     A `Tensor`. Has the same type as `logits`. Same shape as `logits`.
@@ -1698,7 +1700,7 @@ def log_softmax(logits, axis=None, name=None, dim=None):
     InvalidArgumentError: if `logits` is empty or `axis` is beyond the last
       dimension of `logits`.
   """
-  axis = deprecated_argument_lookup("axis", axis, "dim", dim)
+  axis = deprecation.deprecated_argument_lookup("axis", axis, "dim", dim)
   if axis is None:
     axis = -1
   return _softmax(logits, gen_nn_ops._log_softmax, axis, name)
@@ -2251,6 +2253,12 @@ def nth_element(input, n, reverse=False, name=None):
   return gen_nn_ops.nth_element(input, n, reverse=reverse, name=name)
 
 
+@deprecation.deprecated_arg_values(
+    None, '`NCHW` for data_format is deprecated, use `NCW` instead',
+    warn_once=True, data_format="NCHW")
+@deprecation.deprecated_arg_values(
+    None, '`NHWC` for data_format is deprecated, use `NWC` instead',
+    warn_once=True, data_format="NHWC")
 def conv1d(value, filters, stride, padding,
            use_cudnn_on_gpu=None, data_format=None,
            name=None):
@@ -2258,9 +2266,9 @@ def conv1d(value, filters, stride, padding,
 
   Given an input tensor of shape
     [batch, in_width, in_channels]
-  if data_format is "NHWC", or
+  if data_format is "NWC", or
     [batch, in_channels, in_width]
-  if data_format is "NCHW",
+  if data_format is "NCW",
   and a filter / kernel tensor of shape
   [filter_width, in_channels, out_channels], this op reshapes
   the arguments to pass them to conv2d to perform the equivalent
@@ -2285,9 +2293,9 @@ def conv1d(value, filters, stride, padding,
       the filter is moved right at each step.
     padding: 'SAME' or 'VALID'
     use_cudnn_on_gpu: An optional `bool`.  Defaults to `True`.
-    data_format: An optional `string` from `"NHWC", "NCHW"`.  Defaults
-      to `"NHWC"`, the data is stored in the order of
-      [batch, in_width, in_channels].  The `"NCHW"` format stores
+    data_format: An optional `string` from `"NWC", "NCW"`.  Defaults
+      to `"NWC"`, the data is stored in the order of
+      [batch, in_width, in_channels].  The `"NCW"` format stores
       data as [batch, in_channels, in_width].
     name: A name for the operation (optional).
 
@@ -2299,15 +2307,16 @@ def conv1d(value, filters, stride, padding,
   """
   with ops.name_scope(name, "conv1d", [value, filters]) as name:
     # Reshape the input tensor to [batch, 1, in_width, in_channels]
-    if data_format is None or data_format == "NHWC":
+    if data_format is None or data_format == "NHWC" or data_format == "NWC":
       data_format = "NHWC"
       spatial_start_dim = 1
       strides = [1, 1, stride, 1]
-    elif data_format == "NCHW":
+    elif data_format == "NCHW" or data_format == "NCW":
+      data_format = "NCHW"
       spatial_start_dim = 2
       strides = [1, 1, 1, stride]
     else:
-      raise ValueError("data_format must be \"NHWC\" or \"NCHW\".")
+      raise ValueError("data_format must be \"NWC\" or \"NCW\".")
     value = array_ops.expand_dims(value, spatial_start_dim)
     filters = array_ops.expand_dims(filters, 0)
     result = gen_nn_ops.conv2d(value, filters, strides, padding,
@@ -2316,13 +2325,14 @@ def conv1d(value, filters, stride, padding,
     return array_ops.squeeze(result, [spatial_start_dim])
 
 
-def conv1d_transpose(value,
-                     filter,
-                     output_shape,
-                     stride,
-                     padding="SAME",
-                     data_format="NWC",
-                     name=None):
+def conv1d_transpose(
+    value,
+    filter,  # pylint: disable=redefined-builtin
+    output_shape,
+    stride,
+    padding="SAME",
+    data_format="NWC",
+    name=None):
   """The transpose of `conv1d`.
 
   This operation is sometimes called "deconvolution" after [Deconvolutional
@@ -2357,8 +2367,8 @@ def conv1d_transpose(value,
                       [value, filter, output_shape]) as name:
     output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape")
     if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(3)):
-      raise ValueError("output_shape must have shape (3,), got {}"
-                       .format(output_shape_.get_shape()))
+      raise ValueError("output_shape must have shape (3,), got {}".format(
+          output_shape_.get_shape()))
 
     # The format could be either NWC or NCW, map to NHWC or NCHW
     if data_format is None or data_format == "NWC":
@@ -2380,7 +2390,8 @@ def conv1d_transpose(value,
       if not filter.get_shape()[1].is_compatible_with(output_shape[axis]):
         raise ValueError(
             "output_shape does not match filter's output channels, "
-            "{} != {}".format(output_shape[axis], filter.get_shape()[1]))
+            "{} != {}".format(output_shape[axis],
+                              filter.get_shape()[1]))
 
     if padding != "VALID" and padding != "SAME":
       raise ValueError("padding must be either VALID or SAME:"
@@ -2388,25 +2399,26 @@ def conv1d_transpose(value,
 
     # Reshape the input tensor to [batch, 1, in_width, in_channels]
     if data_format_2d == "NHWC":
-      output_shape_ = array_ops.concat([output_shape_[:1], [1],
-                                        output_shape_[1:]], axis=0)
+      output_shape_ = array_ops.concat(
+          [output_shape_[:1], [1], output_shape_[1:]], axis=0)
       spatial_start_dim = 1
       strides = [1, 1, stride, 1]
     else:
-      output_shape_ = array_ops.concat([output_shape_[:2], [1],
-                                        output_shape_[2:]], axis=0)
+      output_shape_ = array_ops.concat(
+          [output_shape_[:2], [1], output_shape_[2:]], axis=0)
       spatial_start_dim = 2
       strides = [1, 1, 1, stride]
     value = array_ops.expand_dims(value, spatial_start_dim)
     filter = array_ops.expand_dims(filter, 0)
 
-    result = gen_nn_ops.conv2d_backprop_input(input_sizes=output_shape_,
-                                              filter=filter,
-                                              out_backprop=value,
-                                              strides=strides,
-                                              padding=padding,
-                                              data_format=data_format_2d,
-                                              name=name)
+    result = gen_nn_ops.conv2d_backprop_input(
+        input_sizes=output_shape_,
+        filter=filter,
+        out_backprop=value,
+        strides=strides,
+        padding=padding,
+        data_format=data_format_2d,
+        name=name)
     return array_ops.squeeze(result, [spatial_start_dim])
 
 
diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py
index 3b918e4f74c64868ef74f7e26295941c6f2801ff..d391e345fe7936a1e8a0312aa6dc423e48ce6f9c 100644
--- a/tensorflow/python/ops/nn_test.py
+++ b/tensorflow/python/ops/nn_test.py
@@ -953,5 +953,60 @@ class MomentsTest(test_lib.TestCase):
     self.doOutputTest((10, 10, 10, 30), (1, 2, 3))
 
 
+class DataFormatDimMapTest(test_lib.TestCase):
+
+  def _test(self, x_val, y_val_expected):
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_dim_map(x)
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertEqual(y_val, y_val_expected)
+
+  def test(self):
+    self._test(0, 0)
+    self._test(1, 2)
+    self._test(2, 3)
+    self._test(3, 1)
+    self._test(-1, 1)
+    self._test(-2, 3)
+    self._test(-3, 2)
+    self._test(-4, 0)
+
+
+class DataFormatVectorPermuteTest(test_lib.TestCase):
+
+  def testNHWCToNCHW(self):
+    x_val = [7, 4, 9, 3]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_vec_permute(x)
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, [7, 3, 4, 9])
+
+  def testNCHWToNHWC(self):
+    x_val = [7, 4, 9, 3]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_vec_permute(x, src_format="NCHW", dst_format="NHWC")
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, [7, 9, 3, 4])
+
+  def testNHWCToNCHW2D(self):
+    x_val = [[7, 4], [9, 3], [4, 5], [5, 1]]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_vec_permute(x)
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, [[7, 4], [5, 1], [9, 3], [4, 5]])
+
+  def testNCHWToNHWC2D(self):
+    x_val = [[7, 4], [9, 3], [4, 5], [5, 1]]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_vec_permute(x, src_format="NCHW", dst_format="NHWC")
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, [[7, 4], [4, 5], [5, 1], [9, 3]])
+
+
 if __name__ == "__main__":
   test_lib.main()
diff --git a/tensorflow/python/ops/quantized_conv_ops_test.py b/tensorflow/python/ops/quantized_conv_ops_test.py
index 5ea47ea40e5f283736523d5d09a63176b5e8fbbf..5e9e71002705293403de83276fb70099d8864907 100644
--- a/tensorflow/python/ops/quantized_conv_ops_test.py
+++ b/tensorflow/python/ops/quantized_conv_ops_test.py
@@ -93,7 +93,7 @@ class Conv2DTest(test.TestCase):
     quantized_range = ((quantized_max - quantized_min) * range_adjust)
     range_scale = (quantized_range / number_of_steps)
     lowest_quantized = -(1 << (number_of_bits - 1))
-    result = np.array([(quantized_min + ((x - lowest_quantized) * range_scale))
+    result = np.array([(quantized_min + ((float(x) - lowest_quantized) * range_scale))
                        for x in quantized.flatten()])
     return result
 
diff --git a/tensorflow/python/ops/quantized_ops_test.py b/tensorflow/python/ops/quantized_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bf3b35e13879069e40162fc50180520a5f855f6
--- /dev/null
+++ b/tensorflow/python/ops/quantized_ops_test.py
@@ -0,0 +1,57 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for quantized operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class QuantizedOpsTest(test.TestCase):
+
+  def __init__(self, method_name="runTest"):
+    super(QuantizedOpsTest, self).__init__(method_name)
+
+  def testQuantizeOp(self):
+    expected_output = [1, 1, 2, 127, 255, 255]
+    with self.test_session(use_gpu=False) as sess:
+      x = constant_op.constant([1.0, 1.25, 1.75, 127.0, 255.0, 500.0], shape=[6], dtype=dtypes.float32)
+      x_min = 0.0
+      x_max = 255.0
+      op = array_ops.quantize(x, x_min, x_max, dtypes.quint8, mode="MIN_FIRST")
+      value = sess.run(op)
+      self.assertArrayNear(expected_output, value.output, 0.1)
+
+  def testDequantizeOp(self):
+    expected_output = [1.0, 2.0, 4.0, 8.0, 16.0, 255.0]
+    inp = np.array([1, 2, 4, 8, 16, 255]).astype(np.uint8)
+    with self.test_session(use_gpu=False) as sess:
+      x = constant_op.constant(inp, shape=[6], dtype=dtypes.quint8)
+      x_min = 0.0
+      x_max = 255.0
+      op = array_ops.dequantize(x, x_min, x_max, mode="MIN_FIRST")
+      value = sess.run(op)
+      self.assertArrayNear(expected_output, value, 0.1)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/ops/random_ops.py b/tensorflow/python/ops/random_ops.py
index 52fb5131cfa6d7152ef49d7c10d5f57292d81f24..a2264a7bdfff398e405ccd4a509d20c592ee886b 100644
--- a/tensorflow/python/ops/random_ops.py
+++ b/tensorflow/python/ops/random_ops.py
@@ -152,7 +152,7 @@ def truncated_normal(shape,
     mean: A 0-D Tensor or Python value of type `dtype`. The mean of the
       truncated normal distribution.
     stddev: A 0-D Tensor or Python value of type `dtype`. The standard deviation
-      of the truncated normal distribution.
+      of the normal distribution, before truncation.
     dtype: The type of the output.
     seed: A Python integer. Used to create a random seed for the distribution.
       See
@@ -220,8 +220,8 @@ def random_uniform(shape,
     ValueError: If `dtype` is integral and `maxval` is not specified.
   """
   dtype = dtypes.as_dtype(dtype)
-  if dtype not in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
-                   dtypes.int64):
+  if dtype not in (dtypes.float16, dtypes.bfloat16, dtypes.float32,
+                   dtypes.float64, dtypes.int32, dtypes.int64):
     raise ValueError("Invalid dtype %r" % dtype)
   if maxval is None:
     if dtype.is_integer:
@@ -316,7 +316,7 @@ def random_crop(value, size, seed=None, name=None):
     return array_ops.slice(value, offset, size, name=name)
 
 
-def multinomial(logits, num_samples, seed=None, name=None):
+def multinomial(logits, num_samples, seed=None, name=None, output_dtype=None):
   """Draws samples from a multinomial distribution.
 
   Example:
@@ -336,6 +336,7 @@ def multinomial(logits, num_samples, seed=None, name=None):
       @{tf.set_random_seed}
       for behavior.
     name: Optional name for the operation.
+    output_dtype: integer type to use for the output. Defaults to int64.
 
   Returns:
     The drawn samples of shape `[batch_size, num_samples]`.
@@ -344,7 +345,7 @@ def multinomial(logits, num_samples, seed=None, name=None):
     logits = ops.convert_to_tensor(logits, name="logits")
     seed1, seed2 = random_seed.get_seed(seed)
     return gen_random_ops.multinomial(
-        logits, num_samples, seed=seed1, seed2=seed2)
+        logits, num_samples, seed=seed1, seed2=seed2, output_dtype=output_dtype)
 
 
 ops.NotDifferentiable("Multinomial")
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index a746735f58fedd1b52744772f54a68aeec3df3ef..58ede027477667a9d5f821dbf42d8a3fdab50b1a 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -184,11 +184,12 @@ class ResourceVariable(variables.Variable):
     assign = a.assign(2.0)
     with tf.control_dependencies([assign]):
       b = a.read_value()
-
-    other_assign = a.assign(3.0)
+    with tf.control_dependencies([b]):
+      other_assign = a.assign(3.0)
     with tf.control_dependencies([other_assign]):
-      tf.Print(b, [b]).run()  # Will print 2.0 because the value was read before
-                              # other_assign ran.
+      # Will print 2.0 because the value was read before other_assign ran. If
+      # `a` was a tf.Variable instead, 2.0 or 3.0 could be printed.
+      tf.Print(b, [b]).eval()
   ```
 
   To enforce these consistency properties tf.ResourceVariable might make more
@@ -513,7 +514,8 @@ class ResourceVariable(variables.Variable):
       self._cached_value = None
     if variable_def.HasField("save_slice_info_def"):
       self._save_slice_info = variables.Variable.SaveSliceInfo(
-          save_slice_info_def=variable_def.save_slice_info_def)
+          save_slice_info_def=variable_def.save_slice_info_def,
+          import_scope=import_scope)
     else:
       self._save_slice_info = None
     self._caching_device = None
@@ -886,26 +888,19 @@ def _ReadGrad(_, grad):
 def _GatherGrad(op, grad):
   """Gradient for gather op."""
   # Build appropriately shaped IndexedSlices
-  # Walk graph back until the original handle is found.
-  # TODO(apassos): more robust way of getting the shape.
-  # TODO(apassos): implement this for EAGER mode.
-  if context.in_eager_mode():
-    dense_shape = gen_resource_variable_ops.variable_shape(op.inputs[0])
-    return (ops.IndexedSlices(grad,
-                              op.inputs[1],
-                              dense_shape=dense_shape),
-            None)
   handle = op.inputs[0]
-  while handle.op.type != "VarHandleOp":
-    handle = handle.op.inputs[0]
-  params_shape = ops.convert_to_tensor(
-      tensor_shape.TensorShape(handle.op.get_attr("shape")))
   indices = op.inputs[1]
+  if context.in_graph_mode():
+    # Walk graph back until the original handle is found.
+    # TODO(apassos): implement this for EAGER mode.
+    while handle.op.type != "VarHandleOp":
+      handle = handle.op.inputs[0]
+  params_shape = gen_resource_variable_ops.variable_shape(handle)
   size = array_ops.expand_dims(array_ops.size(indices), 0)
   values_shape = array_ops.concat([size, params_shape[1:]], 0)
   values = array_ops.reshape(grad, values_shape)
   indices = array_ops.reshape(indices, size)
-  return [ops.IndexedSlices(values, indices, params_shape), None]
+  return (ops.IndexedSlices(values, indices, params_shape), None)
 
 
 def _to_proto_fn(v, export_scope=None):
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index df66302402881be7712e2dd659d9ad30dc4a551f..fd14740a00a24b006cd1e47b20d46e86e261528a 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -35,6 +35,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import tensor_array_ops
@@ -148,7 +149,7 @@ def _rnn_step(
     zero_output, state, call_cell, state_size, skip_conditionals=False):
   """Calculate one step of a dynamic RNN minibatch.
 
-  Returns an (output, state) pair conditioned on the sequence_lengths.
+  Returns an (output, state) pair conditioned on `sequence_length`.
   When skip_conditionals=False, the pseudocode is something like:
 
   if t >= max_sequence_length:
@@ -157,14 +158,14 @@ def _rnn_step(
     return call_cell()
 
   # Selectively output zeros or output, old state or new state depending
-  # on if we've finished calculating each row.
+  # on whether we've finished calculating each row.
   new_output, new_state = call_cell()
   final_output = np.vstack([
-    zero_output if time >= sequence_lengths[r] else new_output_r
+    zero_output if time >= sequence_length[r] else new_output_r
     for r, new_output_r in enumerate(new_output)
   ])
   final_state = np.vstack([
-    state[r] if time >= sequence_lengths[r] else new_state_r
+    state[r] if time >= sequence_length[r] else new_state_r
     for r, new_state_r in enumerate(new_state)
   ])
   return (final_output, final_state)
@@ -202,9 +203,12 @@ def _rnn_step(
   flat_zero_output = nest.flatten(zero_output)
 
   def _copy_one_through(output, new_output):
-    # If the state contains a scalar value we simply pass it through.
+    # TensorArray and scalar get passed through.
+    if isinstance(output, tensor_array_ops.TensorArray):
+      return new_output
     if output.shape.ndims == 0:
       return new_output
+    # Otherwise propagate the old or the new value.
     copy_cond = (time >= sequence_length)
     with ops.colocate_with(new_output):
       return array_ops.where(copy_cond, output, new_output)
@@ -264,7 +268,8 @@ def _rnn_step(
   for output, flat_output in zip(final_output, flat_zero_output):
     output.set_shape(flat_output.get_shape())
   for substate, flat_substate in zip(final_state, flat_state):
-    substate.set_shape(flat_substate.get_shape())
+    if not isinstance(substate, tensor_array_ops.TensorArray):
+      substate.set_shape(flat_substate.get_shape())
 
   final_output = nest.pack_sequence_as(
       structure=zero_output, flat_sequence=final_output)
@@ -561,33 +566,34 @@ def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
   if not _like_rnncell(cell):
     raise TypeError("cell must be an instance of RNNCell")
 
-  # By default, time_major==False and inputs are batch-major: shaped
-  #   [batch, time, depth]
-  # For internal calculations, we transpose to [time, batch, depth]
-  flat_input = nest.flatten(inputs)
-
-  if not time_major:
-    # (B,T,D) => (T,B,D)
-    flat_input = [ops.convert_to_tensor(input_) for input_ in flat_input]
-    flat_input = tuple(_transpose_batch_time(input_) for input_ in flat_input)
-
-  parallel_iterations = parallel_iterations or 32
-  if sequence_length is not None:
-    sequence_length = math_ops.to_int32(sequence_length)
-    if sequence_length.get_shape().ndims not in (None, 1):
-      raise ValueError(
-          "sequence_length must be a vector of length batch_size, "
-          "but saw shape: %s" % sequence_length.get_shape())
-    sequence_length = array_ops.identity(  # Just to find it in the graph.
-        sequence_length, name="sequence_length")
-
-  # Create a new scope in which the caching device is either
-  # determined by the parent scope, or is set to place the cached
-  # Variable using the same placement as for the rest of the RNN.
   with vs.variable_scope(scope or "rnn") as varscope:
+    # Create a new scope in which the caching device is either
+    # determined by the parent scope, or is set to place the cached
+    # Variable using the same placement as for the rest of the RNN.
     if context.in_graph_mode():
       if varscope.caching_device is None:
         varscope.set_caching_device(lambda op: op.device)
+
+    # By default, time_major==False and inputs are batch-major: shaped
+    #   [batch, time, depth]
+    # For internal calculations, we transpose to [time, batch, depth]
+    flat_input = nest.flatten(inputs)
+
+    if not time_major:
+      # (B,T,D) => (T,B,D)
+      flat_input = [ops.convert_to_tensor(input_) for input_ in flat_input]
+      flat_input = tuple(_transpose_batch_time(input_) for input_ in flat_input)
+
+    parallel_iterations = parallel_iterations or 32
+    if sequence_length is not None:
+      sequence_length = math_ops.to_int32(sequence_length)
+      if sequence_length.get_shape().ndims not in (None, 1):
+        raise ValueError(
+            "sequence_length must be a vector of length batch_size, "
+            "but saw shape: %s" % sequence_length.get_shape())
+      sequence_length = array_ops.identity(  # Just to find it in the graph.
+          sequence_length, name="sequence_length")
+
     batch_size = _best_effort_input_batch_size(flat_input)
 
     if initial_state is not None:
@@ -660,7 +666,7 @@ def _dynamic_rnn_loop(cell,
     final_outputs:
       A `Tensor` of shape `[time, batch_size, cell.output_size]`.  If
       `cell.output_size` is a (possibly nested) tuple of ints or `TensorShape`
-      objects, then this returns a (possibly nsted) tuple of Tensors matching
+      objects, then this returns a (possibly nested) tuple of Tensors matching
       the corresponding shapes.
     final_state:
       A `Tensor`, or possibly nested tuple of Tensors, matching in length
@@ -801,11 +807,28 @@ def _dynamic_rnn_loop(cell,
 
     return (time + 1, output_ta_t, new_state)
 
+  # TODO(pbar) `loop_bound` can be reduced to `max_sequence_length` once
+  # TensorArray shape inference is working.  When sequence lengths are highly
+  # variable, this will reduce the performance overheads of padding to a fixed
+  # maximum length.
+  loop_bound = time_steps
+
+  # This is a workaround since we cannot currently use maximum_iterations if
+  # time_steps is defined inside control flow, see the comment in
+  # control_flow_ops.py.
+  if (context.in_eager_mode() or
+      not (control_flow_util.IsInWhileLoop(time_steps.op) or
+           control_flow_util.IsInCond(time_steps.op))):
+    maximum_iterations = time_steps
+  else:
+    maximum_iterations = None
+
   _, output_final_ta, final_state = control_flow_ops.while_loop(
-      cond=lambda time, *_: time < time_steps,
+      cond=lambda time, *_: time < loop_bound,
       body=_time_step,
       loop_vars=(time, output_ta, state),
       parallel_iterations=parallel_iterations,
+      maximum_iterations=maximum_iterations,
       swap_memory=swap_memory)
 
   # Unpack final output if not using output tuples.
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index 8aaf77f1733fc0569ebcbc71373a204cfb3f2913..7cb9f7762dcacfe4853a0a1ff855b06e0ab13f74 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -265,7 +265,7 @@ class _LayerRNNCell(RNNCell):
   `call` methods do not access Variables `tf.get_variable`.
   """
 
-  def __call__(self, inputs, state, scope=None):
+  def __call__(self, inputs, state, scope=None, *args, **kwargs):
     """Run this RNN cell on inputs, starting from the given state.
 
     Args:
@@ -274,8 +274,9 @@ class _LayerRNNCell(RNNCell):
         with shape `[batch_size, self.state_size]`.  Otherwise, if
         `self.state_size` is a tuple of integers, this should be a tuple
         with shapes `[batch_size, s] for s in self.state_size`.
-      scope: `VariableScope` for the created subgraph; if not provided,
-        defaults to standard `tf.layers.Layer` behavior.
+      scope: optional cell scope.
+      *args: Additional positional arguments.
+      **kwargs: Additional keyword arguments.
 
     Returns:
       A pair containing:
@@ -287,7 +288,8 @@ class _LayerRNNCell(RNNCell):
     # Bypass RNNCell's variable capturing semantics for LayerRNNCell.
     # Instead, it is up to subclasses to provide a proper build
     # method.  See the class docstring for more details.
-    return base_layer.Layer.__call__(self, inputs, state, scope=scope)
+    return base_layer.Layer.__call__(self, inputs, state, scope=scope,
+                                     *args, **kwargs)
 
 
 class BasicRNNCell(_LayerRNNCell):
@@ -1037,7 +1039,7 @@ class DropoutWrapper(RNNCell):
       inputs = self._dropout(inputs, "input",
                              self._recurrent_input_noise,
                              self._input_keep_prob)
-    output, new_state = self._cell(inputs, state, scope)
+    output, new_state = self._cell(inputs, state, scope=scope)
     if _should_dropout(self._state_keep_prob):
       # Identify which subsets of the state to perform dropout on and
       # which ones to keep.
diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py
index 2c3667dffedf111f37a9f6eadcc7f1de83c2347e..c0c1ade495455df6a4965eefba4b823ca84e7c31 100644
--- a/tensorflow/python/ops/script_ops.py
+++ b/tensorflow/python/ops/script_ops.py
@@ -29,11 +29,41 @@ import numpy as np
 import six
 
 from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.eager import context
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_script_ops
 
 
+class EagerFunc(object):
+  """A wrapper for a function owned by an EagerPyFunc."""
+
+  def __init__(self, func, Tout):
+    """Constructs an EagerFunc.
+
+    Args:
+      func: The function to wrap.
+      Tout: A list of datatypes for the output; an empty list if the output is
+            None.
+    """
+    self._func = func
+    self._out_dtypes = Tout
+
+  def __call__(self, *args, **kwargs):
+    """Passes args, kwargs to `self._func`, which is executed eagerly."""
+    with context.eager_mode():
+      ret = self._func(*args, **kwargs)
+      if isinstance(ret, (tuple, list)):
+        return [
+            ops.convert_to_tensor(x, dtype=dtype)
+            for (x, dtype) in zip(ret, self._out_dtypes)
+        ]
+      elif ret is None:
+        return ret
+      else:
+        return ops.convert_to_tensor(ret, dtype=self._out_dtypes[0])
+
+
 class FuncRegistry(object):
   """A helper class to keep track of registered py functions.
 
@@ -91,16 +121,20 @@ class FuncRegistry(object):
     if func is None:
       raise ValueError("callback %s is not found" % token)
     ret = func(*args)
-    # Strings seem to lead to a memory leak here if they're not wrapped in a
-    # list.
-    if isinstance(ret, six.binary_type):
-      ret = [ret]
-    # Ensures that we return either a single numpy array or a list of numpy
-    # arrays.
-    if isinstance(ret, (tuple, list)):
-      return [self._convert(x) for x in ret]
+
+    if isinstance(func, EagerFunc):
+      return ret
     else:
-      return self._convert(ret)
+      # Strings seem to lead to a memory leak here if they're not wrapped in a
+      # list.
+      if isinstance(ret, six.binary_type):
+        ret = [ret]
+      # Ensures that we return either a single numpy array or a list of numpy
+      # arrays.
+      if isinstance(ret, (tuple, list)):
+        return [self._convert(x) for x in ret]
+      else:
+        return self._convert(ret)
 
   def size(self):
     """Returns how many functions are currently registered."""
@@ -129,6 +163,86 @@ class CleanupFunc(object):
     _py_funcs.remove(self._token)
 
 
+def _internal_py_func(func, inp, Tout, stateful=None, eager=False, name=None):
+  """See documentation for py_func and eager_py_func."""
+
+  is_list_or_tuple = False
+  if isinstance(Tout, (list, tuple)):
+    is_list_or_tuple = True
+  else:
+    Tout = [Tout]
+
+  if eager:
+    func = EagerFunc(func, Tout)
+
+  token = _py_funcs.insert(func)
+  # We tie the registered function's lifetime with the current default graph,
+  # i.e., when the current graph is destroyed, we remove its py funcs.
+  graph = ops.get_default_graph()
+
+  # pylint: disable=protected-access
+  while isinstance(graph, function._FuncGraph):
+    # If the py_func was declared inside a _FuncGraph, its lifetime should be
+    # bound to that of the outer graph instead.
+    graph = graph._outer_graph
+
+  cleanup = CleanupFunc(token)
+
+  # TODO(zhifengc): Consider adding a Graph method to collect
+  # `cleanup` objects in one of its member.
+  if not hasattr(graph, "_cleanup_py_funcs_used_in_graph"):
+    graph._cleanup_py_funcs_used_in_graph = []
+
+  # When `graph` is destroyed, elements in _cleanup_py_funcs_used_in_graph
+  # will be destroyed and their __del__ will remove the 'token' from
+  # the funcs registry.
+  graph._cleanup_py_funcs_used_in_graph.append(cleanup)
+  # pylint: enable=protected-access
+
+  # pylint: disable=protected-access
+  if eager:
+    result = gen_script_ops._eager_py_func(
+        input=inp, token=token, Tout=Tout, name=name)
+  else:
+    if stateful:
+      result = gen_script_ops._py_func(
+          input=inp, token=token, Tout=Tout, name=name)
+    else:
+      result = gen_script_ops._py_func_stateless(
+          input=inp, token=token, Tout=Tout, name=name)
+  # pylint: enable=protected-access
+  return result if is_list_or_tuple else result[0]
+
+
+def eager_py_func(func, inp, Tout, name=None):
+  """Wraps a python function into a TensorFlow op.
+
+  When the returned op is executed, `func` is invoked with eager execution
+  enabled. Inputs are Tensor objects and func must return None or objects
+  that may be converted to Tensor objects.
+
+  This function has the same limitations as `py_func` with respect to
+  serialization and distribution.
+
+  Args:
+    func: A Python function which accepts a list of `Tensor` objects
+      having element types that match the corresponding `tf.Tensor` objects
+      in `inp` and returns a list of `Tensor` objects (or a single
+      `Tensor`, or `None`) having element types that match the
+      corresponding values in `Tout`.
+    inp: A list of `Tensor` objects.
+    Tout: A list or tuple of tensorflow data types or a single tensorflow data
+      type if there is only one, indicating what `func` returns; an empty list
+      if no value is returned (i.e., if the return value is `None`).
+    name: A name for the operation (optional).
+
+  Returns:
+    A list of `Tensor` or a single `Tensor` which `func` computes; an empty list
+    if `func` returns None.
+  """
+  return _internal_py_func(func=func, inp=inp, Tout=Tout, eager=True, name=name)
+
+
 def py_func(func, inp, Tout, stateful=True, name=None):
   """Wraps a python function and uses it as a TensorFlow op.
 
@@ -182,46 +296,12 @@ def py_func(func, inp, Tout, stateful=True, name=None):
   Returns:
     A list of `Tensor` or a single `Tensor` which `func` computes.
   """
-  token = _py_funcs.insert(func)
-  # We tie the registered function's life-time with the current
-  # default graph. I.e., when the current graph is destroyed, we
-  # should remove its py funcs.
-  g = ops.get_default_graph()
-
-  # pylint: disable=protected-access
-  while isinstance(g, function._FuncGraph):
-    # If the py_func was declared inside a _FuncGraph, its lifetime should be
-    # bound to that of the outer graph instead.
-    g = g._outer_graph
-
-  cleanup = CleanupFunc(token)
-
-  # TODO(zhifengc): Consider adding a Graph method to collect
-  # `cleanup` objects in one of its member.
-  if not hasattr(g, "_cleanup_py_funcs_used_in_graph"):
-    g._cleanup_py_funcs_used_in_graph = []
-
-  # When g is destroyed, elements in _cleanup_py_funcs_used_in_graph
-  # will be destroyed and their __del__ will remove the 'token' from
-  # the funcs registry.
-  g._cleanup_py_funcs_used_in_graph.append(cleanup)
-  # pylint: enable=protected-access
-
-  if isinstance(Tout, (list, tuple)):
-    is_list_or_tuple = True
-  else:
-    Tout = [Tout]
-    is_list_or_tuple = False
-  # pylint: disable=protected-access
-  if stateful:
-    result = gen_script_ops._py_func(
-        input=inp, token=token, Tout=Tout, name=name)
-  else:
-    result = gen_script_ops._py_func_stateless(
-        input=inp, token=token, Tout=Tout, name=name)
-  # pylint: enable=protected-access
-  return result if is_list_or_tuple else result[0]
+  return _internal_py_func(
+      func=func, inp=inp, Tout=Tout, stateful=stateful, eager=False, name=name)
 
 
+# TODO(akshayka): PyFuncs where the 'eager' attribute is set to True should be
+# differentiable, i.e., the gradient of PyFunc should propagate Nones if the
+# eager attribute is not set, and otherwise, it should return the gradient.
 ops.NotDifferentiable("PyFunc")
 ops.NotDifferentiable("PyFuncStateless")
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index 2ef6a0015b5c894b2d01cfd18735ed032f828707..62f20e8c9de58a2d40e7e8fa232493fd44429c26 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -1385,16 +1385,17 @@ def sparse_fill_empty_rows(sp_input, default_value, name=None):
             empty_row_indicator)
 
 
-def serialize_sparse(sp_input, name=None):
-  """Serialize a `SparseTensor` into a string 3-vector (1-D `Tensor`) object.
+def serialize_sparse(sp_input, name=None, out_type=dtypes.string):
+  """Serialize a `SparseTensor` into a 3-vector (1-D `Tensor`) object.
 
   Args:
     sp_input: The input `SparseTensor`.
     name: A name prefix for the returned tensors (optional).
+    out_type: The `dtype` to use for serialization.
 
   Returns:
-    A string 3-vector (1D `Tensor`), with each column representing the
-    serialized `SparseTensor`'s indices, values, and shape (respectively).
+    A 3-vector (1-D `Tensor`), with each column representing the serialized
+    `SparseTensor`'s indices, values, and shape (respectively).
 
   Raises:
     TypeError: If `sp_input` is not a `SparseTensor`.
@@ -1402,11 +1403,15 @@ def serialize_sparse(sp_input, name=None):
   sp_input = _convert_to_sparse_tensor(sp_input)
 
   return gen_sparse_ops._serialize_sparse(
-      sp_input.indices, sp_input.values, sp_input.dense_shape, name=name)
+      sp_input.indices,
+      sp_input.values,
+      sp_input.dense_shape,
+      name=name,
+      out_type=out_type)
 
 
-def serialize_many_sparse(sp_input, name=None):
-  """Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` string `Tensor`.
+def serialize_many_sparse(sp_input, name=None, out_type=dtypes.string):
+  """Serialize `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor`.
 
   The `SparseTensor` must have rank `R` greater than 1, and the first dimension
   is treated as the minibatch dimension.  Elements of the `SparseTensor`
@@ -1419,11 +1424,12 @@ def serialize_many_sparse(sp_input, name=None):
   Args:
     sp_input: The input rank `R` `SparseTensor`.
     name: A name prefix for the returned tensors (optional).
+    out_type: The `dtype` to use for serialization.
 
   Returns:
-    A string matrix (2-D `Tensor`) with `N` rows and `3` columns.
-    Each column represents serialized `SparseTensor`'s indices, values, and
-    shape (respectively).
+    A matrix (2-D `Tensor`) with `N` rows and `3` columns. Each column
+    represents serialized `SparseTensor`'s indices, values, and shape
+    (respectively).
 
   Raises:
     TypeError: If `sp_input` is not a `SparseTensor`.
@@ -1431,21 +1437,67 @@ def serialize_many_sparse(sp_input, name=None):
   sp_input = _convert_to_sparse_tensor(sp_input)
 
   return gen_sparse_ops._serialize_many_sparse(
-      sp_input.indices, sp_input.values, sp_input.dense_shape, name=name)
+      sp_input.indices,
+      sp_input.values,
+      sp_input.dense_shape,
+      name=name,
+      out_type=out_type)
 
 
 def deserialize_sparse(serialized_sparse, dtype, rank=None, name=None):
-  """Deserialize `SparseTensor` from a string 3-vector (1-D `Tensor`) object.
+  """Deserialize `SparseTensor` objects.
+
+  The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
+  the last dimension stores serialized `SparseTensor` objects and the other N
+  dimensions (N >= 0) correspond to a batch. The ranks of the original
+  `SparseTensor` objects must all match. When the final `SparseTensor` is
+  created, its rank is the rank of the incoming `SparseTensor` objects plus N;
+  the sparse tensors have been concatenated along new dimensions, one for each
+  batch.
+
+  The output `SparseTensor` object's shape values for the original dimensions
+  are the max across the input `SparseTensor` objects' shape values for the
+  corresponding dimensions. The new dimensions match the size of the batch.
+
+  The input `SparseTensor` objects' indices are assumed ordered in
+  standard lexicographic order.  If this is not the case, after this
+  step run `SparseReorder` to restore index ordering.
+
+  For example, if the serialized input is a `[2 x 3]` matrix representing two
+  original `SparseTensor` objects:
+
+      index = [ 0]
+              [10]
+              [20]
+      values = [1, 2, 3]
+      shape = [50]
+
+  and
+
+      index = [ 2]
+              [10]
+      values = [4, 5]
+      shape = [30]
+
+  then the final deserialized `SparseTensor` will be:
+
+      index = [0  0]
+              [0 10]
+              [0 20]
+              [1  2]
+              [1 10]
+      values = [1, 2, 3, 4, 5]
+      shape = [2 50]
 
   Args:
-    serialized_sparse: 1-D, The serialized `SparseTensor` object.
-      Must have 3 columns.
-    dtype: The `dtype` of the serialized `SparseTensor` object.
-    rank: (optional) Python int, the rank of the `SparseTensor` object.
-    name: A name prefix for the returned tensors (optional)
+    serialized_sparse: The serialized `SparseTensor` objects.
+      The last dimension must have 3 columns.
+    dtype: The `dtype` of the serialized `SparseTensor` objects.
+    rank: (optional) Python int, the rank of the `SparseTensor` objects.
+    name: A name prefix for the returned tensors (optional).
 
   Returns:
-    A `SparseTensor` representing the deserialized `SparseTensor` object.
+    A `SparseTensor` representing the deserialized `SparseTensor` objects.
 
   """
   output_indices, output_values, output_shape = (
diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index dbab07da42671744284d703f0cd80e601a5fa8a8..dee495f78fa5c2fa099772d0a84f5ff0981c8c59 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -347,5 +347,71 @@ def scatter_update(ref, indices, updates, use_locking=True, name=None):
   if ref.dtype._is_ref_dtype:
     return gen_state_ops.scatter_update(ref, indices, updates,
                                         use_locking=use_locking, name=name)
-  return gen_resource_variable_ops.resource_scatter_update(
-      ref.handle, indices, updates, name=name)
+  with ops.control_dependencies(
+      [gen_resource_variable_ops.resource_scatter_update(
+          ref.handle, indices, ops.convert_to_tensor(updates, ref.dtype),
+          name=name)]):
+    return ref.read_value()
+
+
+def scatter_nd_update(ref, indices, updates, use_locking=True, name=None):
+  r"""Applies sparse `updates` to individual values or slices in a Variable.
+
+  `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+  `indices` must be integer tensor, containing indices into `ref`.
+  It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+  The innermost dimension of `indices` (with length `K`) corresponds to
+  indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+  dimension of `ref`.
+
+  `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+  ```
+  [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+  ```
+
+  For example, say we want to update 4 scattered elements to a rank-1 tensor to
+  8 elements. In Python, that update would look like this:
+
+  ```python
+      ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+      indices = tf.constant([[4], [3], [1] ,[7]])
+      updates = tf.constant([9, 10, 11, 12])
+      update = tf.scatter_nd_update(ref, indices, updates)
+      with tf.Session() as sess:
+        print sess.run(update)
+  ```
+
+  The resulting update to ref would look like this:
+
+      [1, 11, 3, 10, 9, 6, 7, 12]
+
+  See @{tf.scatter_nd} for more details about how to make updates to
+  slices.
+
+  Args:
+    ref: A Variable.
+    indices: A `Tensor`. Must be one of the following types: `int32`, `int64`.
+      A Tensor. Must be one of the following types: int32, int64.
+      A tensor of indices into ref.
+    updates: A `Tensor`. Must have the same type as `ref`.
+      A Tensor. Must have the same type as ref. A tensor of updated
+      values to add to ref.
+    use_locking: An optional `bool`. Defaults to `True`.
+      An optional bool. Defaults to True. If True, the assignment will
+      be protected by a lock; otherwise the behavior is undefined,
+      but may exhibit less contention.
+    name: A name for the operation (optional).
+
+  Returns:
+    The value of the variable after the update.
+  """
+  if ref.dtype._is_ref_dtype:
+    return gen_state_ops.scatter_nd_update(
+        ref, indices, updates, use_locking, name)
+  with ops.control_dependencies([gen_state_ops.resource_scatter_nd_update(
+      ref.handle, indices, ops.convert_to_tensor(updates, dtype=ref.dtype),
+      use_locking, name)]):
+    return ref.read_value()
diff --git a/tensorflow/python/ops/template.py b/tensorflow/python/ops/template.py
index 98578b799a814962b560e8ed40868b2e94010f4e..07796b28d9f6b85aa2d4ee8cbc47d10eef3894de 100644
--- a/tensorflow/python/ops/template.py
+++ b/tensorflow/python/ops/template.py
@@ -307,6 +307,12 @@ class Template(object):
       # To prevent partial matches on the scope_name, we add '/' at the end.
       return name if name[-1] == "/" else name + "/"
 
+  @property
+  def variables(self):
+    """Returns the list of global and local variables created by the Template.
+    """
+    return self.global_variables + self.local_variables
+
   @property
   def trainable_variables(self):
     """Returns the list of trainable variables created by the Template."""
@@ -316,6 +322,14 @@ class Template(object):
     else:
       return []
 
+  @property
+  def non_trainable_variables(self):
+    """Returns the list of non-trainable variables created by the Template."""
+    # TODO(apassos) Make sure it matches Eager when using local variables.
+    global_variables = self.global_variables
+    trainable_variables = set(self.trainable_variables)
+    return [x for x in global_variables if x not in trainable_variables]
+
   @property
   def global_variables(self):
     """Returns the list of global variables created by the Template."""
@@ -334,6 +348,21 @@ class Template(object):
     else:
       return []
 
+  @property
+  def weights(self):
+    """List of weights/variables created by the Template."""
+    return self.variables
+
+  @property
+  def trainable_weights(self):
+    """List of trainable weights/variables created by the Template."""
+    return self.trainable_variables
+
+  @property
+  def non_trainable_weights(self):
+    """List of non-trainable weights/variables created by the Template."""
+    return self.non_trainable_variables
+
   @property
   @deprecated(
       "2017-02-21", "The .var_scope property is deprecated. Please change your "
@@ -501,7 +530,7 @@ class EagerTemplate(Template):
 
   @property
   def variables(self):
-    """Returns the list of trainable variables created by the Template."""
+    """Returns the list of variables created by the Template."""
     # Currently there is no local variable in Eager mode.
     return self._eager_variable_store.variables()
 
@@ -511,6 +540,12 @@ class EagerTemplate(Template):
     # Currently there is no local variable in Eager mode.
     return self._eager_variable_store.trainable_variables()
 
+  @property
+  def non_trainable_variables(self):
+    """Returns the list of non-trainable variables created by the Template."""
+    # Currently there is no local variable in Eager mode.
+    return self._eager_variable_store.non_trainable_variables()
+
   @property
   def global_variables(self):
     """Returns the list of global variables created by the Template."""
diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py
index 605654d9be7985f4b0d2677cf688c31796db31b5..398521c9b5ae9240f03a2ba5c4b0681bd8b3bfd7 100644
--- a/tensorflow/python/ops/tensor_array_ops.py
+++ b/tensorflow/python/ops/tensor_array_ops.py
@@ -36,9 +36,6 @@ from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util import tf_should_use
 
-# TODO(ebrevdo): Set to True in Dec. 4, 2017.
-_ENABLE_IDENTICAL_ELEMENT_SHAPES = False
-
 
 # _GraphTensorArray accesses many of the hidden generated ops, but is in
 # fact built to wrap these methods.
@@ -150,18 +147,15 @@ class _GraphTensorArray(object):
         # will retroactively set the device value of this op.
         def create():
           """Create the TensorArray op."""
-          ta_kwargs = {}
-          if _ENABLE_IDENTICAL_ELEMENT_SHAPES:
-            ta_kwargs["identical_element_shapes"] = infer_shape
           return gen_data_flow_ops._tensor_array_v3(
               dtype=dtype,
               size=size,
               element_shape=element_shape,
+              identical_element_shapes=infer_shape,
               dynamic_size=dynamic_size,
               clear_after_read=clear_after_read,
               tensor_array_name=tensor_array_name,
-              name=scope,
-              **ta_kwargs)
+              name=scope)
         if colocate_with_first_write_call:
           with ops.device(None), ops.colocate_with(None, ignore_existing=True):
             self._handle, self._flow = create()
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index 91dea12da23af15d0213b9207617e57f288ef368..ac6173c260c91575366743b50e2445181a57143f 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -843,6 +843,7 @@ class _VariableStore(object):
     Raises:
       ValueError: When giving unsupported dtype.
     """
+    del shape
     # If dtype is DT_FLOAT, provide a uniform unit scaling initializer
     if dtype.is_floating:
       initializer = init_ops.glorot_uniform_initializer()
@@ -850,9 +851,8 @@ class _VariableStore(object):
     # If dtype is DT_INT/DT_UINT, provide a default value `zero`
     # If dtype is DT_BOOL, provide a default value `FALSE`
     elif dtype.is_integer or dtype.is_unsigned or dtype.is_bool:
-      initializer = init_ops.zeros_initializer()(
-          shape=shape, dtype=dtype.base_dtype)
-      initializing_from_value = True
+      initializer = init_ops.zeros_initializer()
+      initializing_from_value = False
     # NOTES:Do we need to support for handling DT_STRING and DT_COMPLEX here?
     else:
       raise ValueError("An initializer for variable %s of %s is required"
@@ -1233,6 +1233,12 @@ class EagerVariableStore(object):
                   key=lambda x: x.name)
     # pylint: enable=protected-access
 
+  def non_trainable_variables(self):
+    # pylint: disable=protected-access
+    return sorted([x for x in self._store._vars.values() if not x._trainable],
+                  key=lambda x: x.name)
+    # pylint: enable=protected-access
+
 
 def get_variable(name,
                  shape=None,
@@ -1578,6 +1584,10 @@ class _pure_variable_scope(object):  # pylint: disable=invalid-name
           else self._name_or_scope)
       self._reuse = (self._reuse
                      or self._old.reuse)  # Re-using is inherited by sub-scopes.
+      if self._old_name_scope is None:
+        name_scope = self._name_or_scope
+      else:
+        name_scope = self._old_name_scope
       variable_scope_object = VariableScope(
           self._reuse,
           name=self._new_name,
@@ -1588,7 +1598,7 @@ class _pure_variable_scope(object):  # pylint: disable=invalid-name
           dtype=self._old.dtype,
           use_resource=self._old.use_resource,
           custom_getter=self._old.custom_getter,
-          name_scope=self._old_name_scope or self._name_or_scope,
+          name_scope=name_scope,
           constraint=self._constraint)
       if self._initializer is not None:
         variable_scope_object.set_initializer(self._initializer)
@@ -1691,7 +1701,7 @@ class variable_scope(object):  # pylint: disable=invalid-name
   v1 = foo()  # Creates v.
   v2 = foo()  # Gets the same, existing v.
   assert v1 == v2
-
+  ```
 
   Basic example of sharing a variable with reuse=True:
 
@@ -1757,7 +1767,8 @@ class variable_scope(object):  # pylint: disable=invalid-name
                reuse=None,
                dtype=None,
                use_resource=None,
-               constraint=None):
+               constraint=None,
+               auxiliary_name_scope=True):
     """Initialize the context manager.
 
     Args:
@@ -1789,6 +1800,8 @@ class variable_scope(object):  # pylint: disable=invalid-name
         variable and return the Tensor for the projected value
         (which must have the same shape). Constraints are not safe to
         use when doing asynchronous distributed training.
+      auxiliary_name_scope: If `True`, we create an auxiliary name scope with
+        the scope. If `False`, we don't touch name scope.
 
     Returns:
       A scope that can be captured and reused.
@@ -1826,6 +1839,10 @@ class variable_scope(object):  # pylint: disable=invalid-name
       self._graph = ops._get_graph_from_inputs(self._values)  # pylint: disable=protected-access
     self._cached_pure_variable_scope = None
     self._current_name_scope = None
+    if not isinstance(auxiliary_name_scope, bool):
+      raise TypeError("The auxiliary_name_scope must be `True` or `False`, "
+                      "while get {}".format(auxiliary_name_scope))
+    self._auxiliary_name_scope = auxiliary_name_scope
 
   def __enter__(self):
     # If the default graph is building a function, then we should not replace it
@@ -1844,6 +1861,21 @@ class variable_scope(object):  # pylint: disable=invalid-name
       if self._current_name_scope is not None:
         self._current_name_scope.__enter__()
       return self._cached_pure_variable_scope.__enter__()
+
+    if self._auxiliary_name_scope:
+      # Create a new name scope later
+      current_name_scope = None
+    else:
+      # Reenter the current name scope
+      name_scope = ops.get_name_scope()
+      if name_scope:
+        # Hack to reenter
+        name_scope = name_scope + "/"
+        current_name_scope = ops.name_scope(name_scope)
+      else:
+        # Root scope
+        current_name_scope = ops.name_scope(name_scope)
+
     if self._name_or_scope is not None:
       if not isinstance(self._name_or_scope,
                         (VariableScope,) + six.string_types):
@@ -1853,8 +1885,8 @@ class variable_scope(object):  # pylint: disable=invalid-name
         name_scope = self._name_or_scope
       else:
         name_scope = self._name_or_scope.name.split("/")[-1]
-      if name_scope:
-        self._current_name_scope = ops.name_scope(name_scope)
+      if name_scope or current_name_scope:
+        self._current_name_scope = current_name_scope or ops.name_scope(name_scope)
         current_name_scope_name = self._current_name_scope.__enter__()
         if isinstance(self._name_or_scope, six.string_types):
           old_name_scope = current_name_scope_name
@@ -1892,7 +1924,7 @@ class variable_scope(object):  # pylint: disable=invalid-name
     else:  # Here name_or_scope is None. Using default name, but made unique.
       if self._reuse:
         raise ValueError("reuse=True cannot be used without a name_or_scope")
-      self._current_name_scope = ops.name_scope(self._default_name)
+      self._current_name_scope = current_name_scope or ops.name_scope(self._default_name)
       current_name_scope_name = self._current_name_scope.__enter__()
       unique_default_name = _get_unique_variable_scope(self._default_name)
       self._cached_pure_variable_scope = _pure_variable_scope(
@@ -1985,8 +2017,10 @@ def variable(initial_value=None,
              validate_shape=True,
              caching_device=None,
              name=None,
-             dtype=None):
-  use_resource = get_variable_scope().use_resource
+             dtype=None,
+             use_resource=None):
+  if use_resource is None:
+    use_resource = get_variable_scope().use_resource
   if use_resource or (use_resource is None and context.in_eager_mode()):
     return resource_variable_ops.ResourceVariable(
         initial_value=initial_value, trainable=trainable,
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index eab7c3828f1c0864450f78837f5fc7f4a0cc9fc0..e0748d87e2d6ef2c2f8565669357f881334fa737 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -200,7 +200,7 @@ class Variable(object):
 
     @compatibility(eager)
     `tf.Variable` is not compatible with eager execution.  Use
-    `tfe.Variable` instead which is compatable with both eager execution
+    `tfe.Variable` instead which is compatible with both eager execution
     and graph construction.  See [the TensorFlow Eager Execution
     guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/g3doc/guide.md#variables-and-optimizers)
     for details on how variables work in eager execution.
@@ -1064,7 +1064,7 @@ class PartitionedVariable(object):
   """A container for partitioned `Variable` objects.
 
   @compatibility(eager) `tf.PartitionedVariable` is not compatible with
-  eager execution.  Use `tfe.Variable` instead which is compatable
+  eager execution.  Use `tfe.Variable` instead which is compatible
   with both eager execution and graph construction.  See [the
   TensorFlow Eager Execution
   guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/g3doc/guide.md#variables-and-optimizers)
@@ -1447,6 +1447,8 @@ def local_variables_initializer():
   Returns:
     An Op that initializes all local variables in the graph.
   """
+  if context.in_eager_mode():
+    return control_flow_ops.no_op(name="local_variables_initializer")
   return variables_initializer(local_variables())
 
 
diff --git a/tensorflow/python/platform/app.py b/tensorflow/python/platform/app.py
index 1d8acf3f006bd26ece974ef3f3674e7f13d9f827..9b92d9a18005ca5e6be3820427e3a3ba60a8ec2d 100644
--- a/tensorflow/python/platform/app.py
+++ b/tensorflow/python/platform/app.py
@@ -114,13 +114,8 @@ def run(main=None, argv=None):
   # Define help flags.
   _define_help_flags()
 
-  # Parse flags.
-  try:
-    argv = flags.FLAGS(_sys.argv if argv is None else argv)
-  except flags.Error as error:
-    _sys.stderr.write('FATAL Flags parsing error: %s\n' % error)
-    _sys.stderr.write('Pass --helpshort or --helpfull to see help on flags.\n')
-    _sys.exit(1)
+  # Parse known flags.
+  argv = flags.FLAGS(_sys.argv if argv is None else argv, known_only=True)
 
   main = main or _sys.modules['__main__'].main
 
diff --git a/tensorflow/python/platform/flags.py b/tensorflow/python/platform/flags.py
index e9a36ae75d6ce4763ff83c97bec008a4da0897b0..6225db77440e9d63eade956c5c4749c9e2884f6c 100644
--- a/tensorflow/python/platform/flags.py
+++ b/tensorflow/python/platform/flags.py
@@ -18,5 +18,108 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import logging as _logging
+import sys as _sys
+
 # go/tf-wildcard-import
 from absl.flags import *  # pylint: disable=wildcard-import
+import six as _six
+
+from tensorflow.python.util import tf_decorator
+
+
+# Since we wrap absl.flags DEFINE functions, we need to declare this module
+# does not affect key flags.
+disclaim_key_flags()  # pylint: disable=undefined-variable
+
+
+_RENAMED_ARGUMENTS = {
+    'flag_name': 'name',
+    'default_value': 'default',
+    'docstring': 'help',
+}
+
+
+def _wrap_define_function(original_function):
+  """Wraps absl.flags's define functions so tf.flags accepts old names."""
+
+  def wrapper(*args, **kwargs):
+    """Wrapper function that turns old keyword names to new ones."""
+    has_old_names = False
+    for old_name, new_name in _six.iteritems(_RENAMED_ARGUMENTS):
+      if old_name in kwargs:
+        has_old_names = True
+        value = kwargs.pop(old_name)
+        kwargs[new_name] = value
+    if has_old_names:
+      _logging.warning(
+          'Use of the keyword argument names (flag_name, default_value, '
+          'docstring) is deprecated, please use (name, default, help) instead.')
+    return original_function(*args, **kwargs)
+
+  return tf_decorator.make_decorator(original_function, wrapper)
+
+
+class _FlagValuesWrapper(object):
+  """Wrapper class for absl.flags.FLAGS.
+
+  The difference is that tf.flags.FLAGS implicitly parses flags with sys.argv
+  when accessing the FLAGS values before it's explicitly parsed,
+  while absl.flags.FLAGS raises an exception.
+  """
+
+  def __init__(self, flags_object):
+    self.__dict__['__wrapped'] = flags_object
+
+  def __getattribute__(self, name):
+    if name == '__dict__':
+      return super(_FlagValuesWrapper, self).__getattribute__(name)
+    return self.__dict__['__wrapped'].__getattribute__(name)
+
+  def __getattr__(self, name):
+    wrapped = self.__dict__['__wrapped']
+    # To maintain backwards compatibility, implicitly parse flags when reading
+    # a flag.
+    if not wrapped.is_parsed():
+      wrapped(_sys.argv)
+    return wrapped.__getattr__(name)
+
+  def __setattr__(self, name, value):
+    return self.__dict__['__wrapped'].__setattr__(name, value)
+
+  def __delattr__(self, name):
+    return self.__dict__['__wrapped'].__delattr__(name)
+
+  def __dir__(self):
+    return self.__dict__['__wrapped'].__dir__()
+
+  def __getitem__(self, name):
+    return self.__dict__['__wrapped'].__getitem__(name)
+
+  def __setitem__(self, name, flag):
+    return self.__dict__['__wrapped'].__setitem__(name, flag)
+
+  def __len__(self):
+    return self.__dict__['__wrapped'].__len__()
+
+  def __iter__(self):
+    return self.__dict__['__wrapped'].__iter__()
+
+  def __str__(self):
+    return self.__dict__['__wrapped'].__str__()
+
+  def __call__(self, *args, **kwargs):
+    return self.__dict__['__wrapped'].__call__(*args, **kwargs)
+
+
+# pylint: disable=invalid-name,used-before-assignment
+# absl.flags APIs use `default` as the name of the default value argument.
+# Allow the following functions continue to accept `default_value`.
+DEFINE_string = _wrap_define_function(DEFINE_string)
+DEFINE_boolean = _wrap_define_function(DEFINE_boolean)
+DEFINE_bool = DEFINE_boolean
+DEFINE_float = _wrap_define_function(DEFINE_float)
+DEFINE_integer = _wrap_define_function(DEFINE_integer)
+# pylint: enable=invalid-name,used-before-assignment
+
+FLAGS = _FlagValuesWrapper(FLAGS)  # pylint: disable=used-before-assignment
diff --git a/tensorflow/python/platform/flags_test.py b/tensorflow/python/platform/flags_test.py
index 23060e17d279cfb282f20610e0a1639db3a43ecf..bd3c8e39959a41ada22f7ee4cef4d3d462e9e6cf 100644
--- a/tensorflow/python/platform/flags_test.py
+++ b/tensorflow/python/platform/flags_test.py
@@ -17,18 +17,110 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import sys
 import unittest
 
 from absl import flags as absl_flags
 
 from tensorflow.python.platform import flags
+from tensorflow.python.platform import test
+
+
+flags.DEFINE_string(
+    flag_name='old_string', default_value='default', docstring='docstring')
+flags.DEFINE_string(
+    name='new_string', default='default', help='docstring')
+flags.DEFINE_integer(
+    flag_name='old_integer', default_value=1, docstring='docstring')
+flags.DEFINE_integer(
+    name='new_integer', default=1, help='docstring')
+flags.DEFINE_float(
+    flag_name='old_float', default_value=1.5, docstring='docstring')
+flags.DEFINE_float(
+    name='new_float', default=1.5, help='docstring')
+flags.DEFINE_bool(
+    flag_name='old_bool', default_value=True, docstring='docstring')
+flags.DEFINE_bool(
+    name='new_bool', default=True, help='docstring')
+flags.DEFINE_boolean(
+    flag_name='old_boolean', default_value=False, docstring='docstring')
+flags.DEFINE_boolean(
+    name='new_boolean', default=False, help='docstring')
 
 
 class FlagsTest(unittest.TestCase):
 
-  def test_global_flags_object(self):
-    self.assertIs(flags.FLAGS, absl_flags.FLAGS)
+  def setUp(self):
+    self.original_flags = flags.FlagValues()
+    self.wrapped_flags = flags._FlagValuesWrapper(self.original_flags)
+    flags.DEFINE_string(
+        'test', 'default', 'test flag', flag_values=self.wrapped_flags)
+
+  def test_attribute_overrides(self):
+    # Test that methods defined in absl.flags.FlagValues are the same as the
+    # wrapped ones.
+    self.assertEqual(flags.FLAGS.is_parsed, absl_flags.FLAGS.is_parsed)
+
+  def test_getattr(self):
+    self.assertFalse(self.wrapped_flags.is_parsed())
+    with test.mock.patch.object(sys, 'argv', new=['program', '--test=new']):
+      self.assertEqual('new', self.wrapped_flags.test)
+    self.assertTrue(self.wrapped_flags.is_parsed())
+
+  def test_setattr(self):
+    self.assertEqual('default', self.wrapped_flags.test)
+    self.wrapped_flags.test = 'new'
+    self.assertEqual('new', self.wrapped_flags.test)
+
+  def test_delattr(self):
+    del self.wrapped_flags.test
+    self.assertNotIn('test', self.wrapped_flags)
+    with self.assertRaises(AttributeError):
+      _ = self.wrapped_flags.test
+
+  def test_dir(self):
+    self.assertEqual(['test'], dir(self.wrapped_flags))
+
+  def test_getitem(self):
+    self.assertIs(self.original_flags['test'], self.wrapped_flags['test'])
+
+  def test_setitem(self):
+    flag = flags.Flag(flags.ArgumentParser(), flags.ArgumentSerializer(),
+                      'fruit', 'apple', 'the fruit type')
+    self.wrapped_flags['fruit'] = flag
+    self.assertIs(self.original_flags['fruit'], self.wrapped_flags['fruit'])
+    self.assertEqual('apple', self.wrapped_flags.fruit)
+
+  def test_len(self):
+    self.assertEqual(1, len(self.wrapped_flags))
+
+  def test_iter(self):
+    self.assertEqual(['test'], list(self.wrapped_flags))
+
+  def test_str(self):
+    self.assertEqual(str(self.wrapped_flags), str(self.original_flags))
+
+  def test_call(self):
+    self.wrapped_flags(['program', '--test=new'])
+    self.assertEqual('new', self.wrapped_flags.test)
+
+  def test_keyword_arguments(self):
+    test_cases = (
+        ('old_string', 'default'),
+        ('new_string', 'default'),
+        ('old_integer', 1),
+        ('new_integer', 1),
+        ('old_float', 1.5),
+        ('new_float', 1.5),
+        ('old_bool', True),
+        ('new_bool', True),
+        ('old_boolean', False),
+        ('new_boolean', False),
+    )
+    for flag_name, default_value in test_cases:
+      self.assertEqual(default_value, absl_flags.FLAGS[flag_name].default)
+      self.assertEqual('docstring', absl_flags.FLAGS[flag_name].help)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
   unittest.main()
diff --git a/tensorflow/python/platform/sysconfig.py b/tensorflow/python/platform/sysconfig.py
index 57635fb4d9d6698f1a6f1a51918fe3f269d8909b..f6c4f2227fbba75e4fdb41ddeaa55ba3f9168677 100644
--- a/tensorflow/python/platform/sysconfig.py
+++ b/tensorflow/python/platform/sysconfig.py
@@ -27,6 +27,7 @@ from __future__ import print_function
 import os.path as _os_path
 
 from tensorflow.python.framework.versions import CXX11_ABI_FLAG as _CXX11_ABI_FLAG
+from tensorflow.python.framework.versions import MONOLITHIC_BUILD as _MONOLITHIC_BUILD
 from tensorflow.python.util.all_util import remove_undocumented
 
 
@@ -75,8 +76,9 @@ def get_link_flags():
     The link flags.
   """
   flags = []
-  flags.append('-L%s' % get_lib())
-  flags.append('-ltensorflow_framework')
+  if not _MONOLITHIC_BUILD:
+    flags.append('-L%s' % get_lib())
+    flags.append('-ltensorflow_framework')
   return flags
 
 _allowed_symbols = []
diff --git a/tensorflow/python/platform/tf_logging.py b/tensorflow/python/platform/tf_logging.py
index 71ee5e365f7d093ebc105917e7dd68ba92b31231..85ed4f071c7022801f20db75d538e5917b8eea66 100644
--- a/tensorflow/python/platform/tf_logging.py
+++ b/tensorflow/python/platform/tf_logging.py
@@ -30,64 +30,92 @@ from logging import ERROR
 from logging import FATAL
 from logging import INFO
 from logging import WARN
+import threading
 
 import six
 
 from tensorflow.python.util.all_util import remove_undocumented
 
 
-# Determine whether we are in an interactive environment
-_interactive = False
-try:
-  # This is only defined in interactive shells
-  if _sys.ps1: _interactive = True
-except AttributeError:
-  # Even now, we may be in an interactive shell with `python -i`.
-  _interactive = _sys.flags.interactive
+# Don't use this directly. Use _get_logger() instead.
+_logger = None
+_logger_lock = threading.Lock()
 
-# Scope the tensorflow logger to not conflict with users' loggers
-_logger = _logging.getLogger('tensorflow')
 
-# If we are in an interactive environment (like jupyter), set loglevel to info
-# and pipe the output to stdout
-if _interactive:
-  _logger.setLevel(INFO)
-  _logging_target = _sys.stdout
-else:
-  _logging_target = _sys.stderr
+def _get_logger():
+  global _logger
 
-# Add the output handler
-_handler = _logging.StreamHandler(_logging_target)
-_handler.setFormatter(_logging.Formatter(_logging.BASIC_FORMAT, None))
-_logger.addHandler(_handler)
+  # Use double-checked locking to avoid taking lock unnecessarily.
+  if _logger:
+    return _logger
+
+  _logger_lock.acquire()
+
+  try:
+    if _logger:
+      return _logger
+
+    # Scope the TensorFlow logger to not conflict with users' loggers.
+    logger = _logging.getLogger('tensorflow')
+
+    # Don't further configure the TensorFlow logger if the root logger is
+    # already configured. This prevents double logging in those cases.
+    if not _logging.getLogger().handlers:
+      # Determine whether we are in an interactive environment
+      _interactive = False
+      try:
+        # This is only defined in interactive shells.
+        if _sys.ps1: _interactive = True
+      except AttributeError:
+        # Even now, we may be in an interactive shell with `python -i`.
+        _interactive = _sys.flags.interactive
+
+      # If we are in an interactive environment (like Jupyter), set loglevel
+      # to INFO and pipe the output to stdout.
+      if _interactive:
+        logger.setLevel(INFO)
+        _logging_target = _sys.stdout
+      else:
+        _logging_target = _sys.stderr
+
+      # Add the output handler.
+      _handler = _logging.StreamHandler(_logging_target)
+      _handler.setFormatter(_logging.Formatter(_logging.BASIC_FORMAT, None))
+      logger.addHandler(_handler)
+
+    _logger = logger
+    return _logger
+
+  finally:
+    _logger_lock.release()
 
 
 def log(level, msg, *args, **kwargs):
-  _logger.log(level, msg, *args, **kwargs)
+  _get_logger().log(level, msg, *args, **kwargs)
 
 
 def debug(msg, *args, **kwargs):
-  _logger.debug(msg, *args, **kwargs)
+  _get_logger().debug(msg, *args, **kwargs)
 
 
 def error(msg, *args, **kwargs):
-  _logger.error(msg, *args, **kwargs)
+  _get_logger().error(msg, *args, **kwargs)
 
 
 def fatal(msg, *args, **kwargs):
-  _logger.fatal(msg, *args, **kwargs)
+  _get_logger().fatal(msg, *args, **kwargs)
 
 
 def info(msg, *args, **kwargs):
-  _logger.info(msg, *args, **kwargs)
+  _get_logger().info(msg, *args, **kwargs)
 
 
 def warn(msg, *args, **kwargs):
-  _logger.warn(msg, *args, **kwargs)
+  _get_logger().warn(msg, *args, **kwargs)
 
 
 def warning(msg, *args, **kwargs):
-  _logger.warning(msg, *args, **kwargs)
+  _get_logger().warning(msg, *args, **kwargs)
 
 
 _level_names = {
@@ -118,7 +146,7 @@ def flush():
 
 # Code below is taken from pyglib/logging
 def vlog(level, msg, *args, **kwargs):
-  _logger.log(level, msg, *args, **kwargs)
+  _get_logger().log(level, msg, *args, **kwargs)
 
 
 def _GetNextLogCountPerToken(token):
@@ -225,12 +253,12 @@ def google2_log_prefix(level, timestamp=None, file_and_line=None):
 
 def get_verbosity():
   """Return how much logging output will be produced."""
-  return _logger.getEffectiveLevel()
+  return _get_logger().getEffectiveLevel()
 
 
 def set_verbosity(v):
   """Sets the threshold for what messages will be logged."""
-  _logger.setLevel(v)
+  _get_logger().setLevel(v)
 
 
 def _get_thread_id():
diff --git a/tensorflow/python/profiler/BUILD b/tensorflow/python/profiler/BUILD
index 519b05975f03c5f1899f527636a4c855feceaacc..c815aad0a065eaba4a0dc52487b5ee67e271a146 100644
--- a/tensorflow/python/profiler/BUILD
+++ b/tensorflow/python/profiler/BUILD
@@ -57,7 +57,10 @@ cuda_py_test(
         "//tensorflow/python:platform",
         "//tensorflow/python:variables",
     ],
-    tags = ["no_pip"],
+    tags = [
+        "no_pip",
+        "oss_serial",
+    ],
 )
 
 cuda_py_test(
diff --git a/tensorflow/python/profiler/model_analyzer.py b/tensorflow/python/profiler/model_analyzer.py
index 46a921c0a13ecca0febf6aa4085539abbd1a6fbf..72422f11e91993e7d6e3d905788d54f9f782c892 100644
--- a/tensorflow/python/profiler/model_analyzer.py
+++ b/tensorflow/python/profiler/model_analyzer.py
@@ -162,7 +162,7 @@ class Profiler(object):
     self._coverage = 0.0
     self._graph = graph
     # pylint: disable=protected-access
-    op_log = tfprof_logger._merge_default_with_oplog(
+    op_log = tfprof_logger.merge_default_with_oplog(
         self._graph, op_log=op_log)
     # pylint: enable=protected-access
 
@@ -182,7 +182,7 @@ class Profiler(object):
       run_meta: RunMetadata proto that contains statistics of a session run.
     """
     # pylint: disable=protected-access
-    op_log = tfprof_logger._merge_default_with_oplog(
+    op_log = tfprof_logger.merge_default_with_oplog(
         self._graph, run_meta=run_meta)
     # pylint: enable=protected-access
     # TODO(xpan): P1: Better to find the current graph.
@@ -315,7 +315,7 @@ def profile(graph,
                .trainable_variables_parameter())
 
   # pylint: disable=protected-access
-  op_log = tfprof_logger._merge_default_with_oplog(
+  op_log = tfprof_logger.merge_default_with_oplog(
       graph, op_log, run_meta, add_trace=cmd == 'code')
   # pylint: enable=protected-access
 
@@ -374,7 +374,7 @@ def advise(graph, run_meta=None, options=_DEFAULT_ADVISE_OPTIONS):
     options = ALL_ADVICE.copy()
 
   # pylint: disable=protected-access
-  op_log = tfprof_logger._merge_default_with_oplog(
+  op_log = tfprof_logger.merge_default_with_oplog(
       graph, None, run_meta, add_trace=True)
   # pylint: enable=protected-access
 
diff --git a/tensorflow/python/profiler/model_analyzer_test.py b/tensorflow/python/profiler/model_analyzer_test.py
index 698f8906d48b64872e1ba9398216bf33900e8278..a379bd5236a9bb46aedf6794f21eb2a09c17cc5e 100644
--- a/tensorflow/python/profiler/model_analyzer_test.py
+++ b/tensorflow/python/profiler/model_analyzer_test.py
@@ -23,12 +23,18 @@ import os
 import random
 import re
 
+import numpy as np
+
 from tensorflow.core.profiler import profile_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gradients
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
@@ -62,7 +68,7 @@ class PrintModelAnalysisTest(test.TestCase):
                          '  ScalarW (1, 1/1 params)\n',
                          f.read())
 
-  def testSelectEverthingDetail(self):
+  def testSelectEverythingDetail(self):
     ops.reset_default_graph()
     dev = '/device:GPU:0' if test.is_gpu_available() else '/device:CPU:0'
     outfile = os.path.join(test.get_temp_dir(), 'dump')
@@ -224,12 +230,12 @@ class PrintModelAnalysisTest(test.TestCase):
         with gfile.Open(outfile, 'r') as f:
           lines = f.read().split('\n')
           result = '\n'.join([l[:min(len(l), 80)] for l in lines])
-          self.assertEqual(compat.as_bytes('node name | # parameters | # float_ops\n_TFProfRoot (--/2.84k params, --/168.85k flops)\n  model_analyzer_testlib.py:63:BuildFullModel (0/1.80k params, 0/45.37k flops)\n    model_analyzer_testlib.py:40:BuildSmallModel (0/0 params, 0/0 flops)\n    model_analyzer_testlib.py:44:BuildSmallModel (0/4 params, 0/8 flops)\n    model_analyzer_testlib.py:48:BuildSmallModel (0/648 params, 0/1.30k flops)\n    model_analyzer_testlib.py:49:BuildSmallModel (0/0 params, 0/23.33k flops)\n    model_analyzer_testlib.py:53:BuildSmallModel (0/1.15k params, 0/2.30k flops)\n    model_analyzer_testlib.py:54:BuildSmallModel (0/0 params, 0/18.43k flops)\n  model_analyzer_testlib.py:63:BuildFullModel (gradient) (0/0 params, 0/67.39k f\n    model_analyzer_testlib.py:49:BuildSmallModel (gradient) (0/0 params, 0/46.66\n    model_analyzer_testlib.py:54:BuildSmallModel (gradient) (0/0 params, 0/20.74\n  model_analyzer_testlib.py:67:BuildFullModel (0/1.04k params, 0/18.57k flops)\n  model_analyzer_testlib.py:67:BuildFullModel (gradient) (0/0 params, 0/37.00k f\n  model_analyzer_testlib.py:69:BuildFullModel (0/0 params, 0/0 flops)\n  model_analyzer_testlib.py:70:BuildFullModel (0/0 params, 0/258 flops)\n  model_analyzer_testlib.py:70:BuildFullModel (gradient) (0/0 params, 0/129 flop\n  model_analyzer_testlib.py:72:BuildFullModel (0/0 params, 0/141 flops)\n'),
+          self.assertEqual(compat.as_bytes('node name | # parameters | # float_ops\n_TFProfRoot (--/2.84k params, --/168.86k flops)\n  model_analyzer_testlib.py:63:BuildFullModel (0/1.80k params, 0/45.37k flops)\n    model_analyzer_testlib.py:40:BuildSmallModel (0/0 params, 0/0 flops)\n    model_analyzer_testlib.py:44:BuildSmallModel (0/4 params, 0/8 flops)\n    model_analyzer_testlib.py:48:BuildSmallModel (0/648 params, 0/1.30k flops)\n    model_analyzer_testlib.py:49:BuildSmallModel (0/0 params, 0/23.33k flops)\n    model_analyzer_testlib.py:53:BuildSmallModel (0/1.15k params, 0/2.30k flops)\n    model_analyzer_testlib.py:54:BuildSmallModel (0/0 params, 0/18.43k flops)\n  model_analyzer_testlib.py:63:BuildFullModel (gradient) (0/0 params, 0/67.39k f\n    model_analyzer_testlib.py:49:BuildSmallModel (gradient) (0/0 params, 0/46.66\n    model_analyzer_testlib.py:54:BuildSmallModel (gradient) (0/0 params, 0/20.74\n  model_analyzer_testlib.py:67:BuildFullModel (0/1.04k params, 0/18.58k flops)\n  model_analyzer_testlib.py:67:BuildFullModel (gradient) (0/0 params, 0/37.00k f\n  model_analyzer_testlib.py:69:BuildFullModel (0/0 params, 0/0 flops)\n  model_analyzer_testlib.py:70:BuildFullModel (0/0 params, 0/258 flops)\n  model_analyzer_testlib.py:70:BuildFullModel (gradient) (0/0 params, 0/129 flop\n  model_analyzer_testlib.py:72:BuildFullModel (0/0 params, 0/141 flops)\n'),
                            compat.as_bytes(result))
 
         self.assertLess(0, tfprof_node.total_exec_micros)
         self.assertEqual(2844, tfprof_node.total_parameters)
-        self.assertEqual(168854, tfprof_node.total_float_ops)
+        self.assertEqual(168863, tfprof_node.total_float_ops)
         self.assertEqual(8, len(tfprof_node.children))
         self.assertEqual('_TFProfRoot', tfprof_node.name)
         self.assertEqual(
@@ -346,8 +352,8 @@ class PrintModelAnalysisTest(test.TestCase):
       with gfile.Open(outfile, 'r') as f:
         # pylint: disable=line-too-long
         self.assertEqual(
-            'nodename|requestedbytes|peakbytes|residualbytes|outputbytes|totalexecutiontime|acceleratorexecutiontime|cpuexecutiontime|#parameters|opoccurrence(run|defined)|inputshapes\nConst0B(0',
-            f.read().replace('\t', '').replace(' ', '')[0:180])
+            'nodename|requestedbytes|peakbytes|residualbytes|outputbytes|totalexecutiontime|acceleratorexecutiontime|cpuexecutiontime|#parameters|opoccurrence(run|defined)|inputshapes',
+            f.read().replace('\t', '').replace(' ', '')[0:170])
         # pylint: enable=line-too-long
 
       total_children = 0
@@ -694,6 +700,76 @@ class PrintModelAnalysisTest(test.TestCase):
                       exception_str)
       self.assertTrue(mat is None)
 
+  def testTrackPersistentBytes(self):
+    ops.reset_default_graph()
+    a = array_ops.constant(np.ones((100, 100)))
+    b = array_ops.constant(np.ones((100, 100)))
+    c = a * b
+
+    with session.Session() as sess:
+      run_options = config_pb2.RunOptions(
+          trace_level=config_pb2.RunOptions.FULL_TRACE)
+      run_metadata = config_pb2.RunMetadata()
+      sess.run(c, options=run_options, run_metadata=run_metadata)
+
+      options = option_builder.ProfileOptionBuilder.time_and_memory()
+      options['min_bytes'] = 0
+      options['select'] = ('bytes', 'peak_bytes', 'output_bytes',
+                           'residual_bytes')
+      ret = model_analyzer.profile(
+          sess.graph, run_meta=run_metadata, cmd='scope', options=options)
+
+      run_metadata = config_pb2.RunMetadata()
+      sess.run(c, options=run_options, run_metadata=run_metadata)
+      ret2 = model_analyzer.profile(
+          sess.graph, run_meta=run_metadata, cmd='scope', options=options)
+
+      n = lib.SearchTFProfNode(ret, 'mul')
+      n2 = lib.SearchTFProfNode(ret2, 'mul')
+      self.assertGreater(n.peak_bytes, 0)
+      self.assertGreater(n.output_bytes, 0)
+      self.assertGreater(n.residual_bytes, 0)
+      self.assertEqual(n.peak_bytes, n2.peak_bytes)
+      self.assertEqual(n.output_bytes, n2.output_bytes)
+      self.assertEqual(n.residual_bytes, n2.residual_bytes)
+
+  def testTraceLoopBytes(self):
+    if not test.is_gpu_available(): return
+    ops.reset_default_graph()
+    steps = 100
+
+    with ops.device('/gpu:0'):
+      x = array_ops.ones((100, 100), dtype=dtypes.float32)
+      n = array_ops.constant(steps, dtype=dtypes.int32)
+      x1 = array_ops.ones((100, 100))
+
+      x *= x1
+      def loop_body(i, x):
+        x *= x
+        return i + 1, x
+
+      _, y = control_flow_ops.while_loop(
+          lambda i, x: i < n, loop_body,
+          [array_ops.constant(0), x])
+
+    grad = gradients.gradients(y, [x1])
+
+    with session.Session() as sess:
+      run_options = config_pb2.RunOptions(
+          trace_level=config_pb2.RunOptions.FULL_TRACE)
+      run_metadata = config_pb2.RunMetadata()
+      sess.run(grad, options=run_options, run_metadata=run_metadata)
+
+      options = option_builder.ProfileOptionBuilder.time_and_memory()
+      options['min_bytes'] = 0
+      options['min_micros'] = 0
+      options['select'] = ('bytes', 'peak_bytes', 'output_bytes',
+                           'residual_bytes')
+      options['output'] = 'none'
+      ret_pb = model_analyzer.profile(
+          sess.graph, run_meta=run_metadata, cmd='scope', options=options)
+      self.assertGreater(ret_pb.total_requested_bytes, 1000000)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/profiler/tfprof_logger.py b/tensorflow/python/profiler/tfprof_logger.py
index 838064a1f0836a2041c2823f54fea4e6b5606d7f..15c273794da8ab0ff46e6455e502792c5b19729f 100644
--- a/tensorflow/python/profiler/tfprof_logger.py
+++ b/tensorflow/python/profiler/tfprof_logger.py
@@ -139,8 +139,8 @@ def _get_logged_ops(graph, run_meta=None, add_trace=True,
   return logged_ops, string_to_id
 
 
-def _merge_default_with_oplog(graph, op_log=None, run_meta=None,
-                              add_trace=True, add_trainable_var=True):
+def merge_default_with_oplog(graph, op_log=None, run_meta=None,
+                             add_trace=True, add_trainable_var=True):
   """Merge the tfprof default extra info with caller's op_log.
 
   Args:
@@ -199,7 +199,7 @@ def write_op_log(graph, log_dir, op_log=None, run_meta=None, add_trace=True):
     add_trace: Whether to add python code trace information.
         Used to support "code" view.
   """
-  op_log = _merge_default_with_oplog(graph, op_log, run_meta, add_trace)
+  op_log = merge_default_with_oplog(graph, op_log, run_meta, add_trace)
 
   with gfile.Open(os.path.join(log_dir, 'tfprof_log'), 'w') as log:
     log.write(op_log.SerializeToString())
diff --git a/tensorflow/python/pywrap_tensorflow.py b/tensorflow/python/pywrap_tensorflow.py
index 91373fa544b62e1b4760a92bf6630edf0c7f1ee4..5c0c5783dce19ec8fa1b090827d06d203e83de68 100644
--- a/tensorflow/python/pywrap_tensorflow.py
+++ b/tensorflow/python/pywrap_tensorflow.py
@@ -60,6 +60,7 @@ try:
   from tensorflow.python.pywrap_tensorflow_internal import __git_version__
   from tensorflow.python.pywrap_tensorflow_internal import __compiler_version__
   from tensorflow.python.pywrap_tensorflow_internal import __cxx11_abi_flag__
+  from tensorflow.python.pywrap_tensorflow_internal import __monolithic_build__
 
   if _use_dlopen_global_flags:
     pywrap_dlopen_global_flags.reset_dlopen_flags()
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index 82b154164e85a1044860ef501c3d32cd00eb6fde..82750e9e491dbe9b742531c431aa499621082776 100644
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -18,6 +18,7 @@ limitations under the License.
 %rename("%s") TFE_NewContext;
 %rename("%s") TFE_DeleteContext;
 %rename("%s") TFE_ContextListDevices;
+%rename("%s") TFE_ContextAddFunction;
 %rename("%s") TFE_ContextAddFunctionDef;
 %rename("%s") TFE_OpNameGetAttrType;
 %rename("%s") TFE_Py_InitEagerTensor;
@@ -149,7 +150,7 @@ limitations under the License.
   }
   $1 = &temp;
   $1->resize(PyInt_AsLong($input), nullptr);
-}
+} 
 
 // Create new Status object.
 %typemap(in, numinputs=0) TF_Status *out_status {
diff --git a/tensorflow/python/saved_model/README.md b/tensorflow/python/saved_model/README.md
index 8213e52ce9c004c9b9c53b76e08a028508703d06..8c78013ffd25feda7315657bfe070f8243959ae1 100644
--- a/tensorflow/python/saved_model/README.md
+++ b/tensorflow/python/saved_model/README.md
@@ -93,7 +93,7 @@ with an asset of the same name, only the first version is retained.
 Each meta graph added to the SavedModel must be annotated with user specified
 tags. The tags provide a means to identify the specific meta graph to load and
 restore, along with the shared set of variables and assets. These tags
-typically annotate a MetaGraph with it's functionality (e.g. serving or
+typically annotate a MetaGraph with its functionality (e.g. serving or
 training), and possibly hardware specific aspects such as GPU.
 
 #### Usage
diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py
index c6d2c3229330c64d5e788c45574dbdbd8b6616ca..92ca7dec6f63b50b33dde9909b4738676fb8c783 100644
--- a/tensorflow/python/saved_model/saved_model_test.py
+++ b/tensorflow/python/saved_model/saved_model_test.py
@@ -214,6 +214,13 @@ class SavedModelTest(test.TestCase):
       self._init_and_validate_variable(sess, "v", 45)
       builder.add_meta_graph([tag_constants.SERVING, tag_constants.GPU])
 
+    # Graph that updates the single variable. SavedModel invoked to:
+    # - simply add the model (weights are not updated).
+    # - multiple tags (from predefined constants for serving on TPU).
+    with self.test_session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 45)
+      builder.add_meta_graph([tag_constants.SERVING, tag_constants.TPU])
+
     # Graph that updates the single variable. SavedModel is invoked:
     # - to add the model (weights are not updated).
     # - multiple custom tags.
@@ -244,6 +251,13 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
+    # Restore the graph with multiple predefined tags (for serving on TPU)
+    # whose variables were not saved.
+    with self.test_session(graph=ops.Graph()) as sess:
+      loader.load(sess, [tag_constants.SERVING, tag_constants.TPU], export_dir)
+      self.assertEqual(
+          42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
+
     # Restore the graph with multiple tags. Provide duplicate tags to test set
     # semantics.
     with self.test_session(graph=ops.Graph()) as sess:
diff --git a/tensorflow/python/saved_model/tag_constants.py b/tensorflow/python/saved_model/tag_constants.py
index 52868bdf99b4734a99d7b9dac301f00783402d77..e2facafda51919d3f1e0ccbe646db522ed0bc49b 100644
--- a/tensorflow/python/saved_model/tag_constants.py
+++ b/tensorflow/python/saved_model/tag_constants.py
@@ -31,9 +31,13 @@ TRAINING = "train"
 # Tag for the `gpu` graph.
 GPU = "gpu"
 
+# Tag for the `tpu` graph.
+TPU = "tpu"
+
 _allowed_symbols = [
     "SERVING",
     "TRAINING",
-    "GPU"
+    "GPU",
+    "TPU"
 ]
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/summary/summary_iterator.py b/tensorflow/python/summary/summary_iterator.py
index 301f560d41378b0ec29537cd82e3e3b333f59674..6969c4cf1500bf4b1fda900336158e5af4395ea6 100644
--- a/tensorflow/python/summary/summary_iterator.py
+++ b/tensorflow/python/summary/summary_iterator.py
@@ -13,301 +13,14 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Reads Summaries from and writes Summaries to event files."""
+"""Provides a method for reading events from an event file via an iterator."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os.path
-import threading
-import time
-
-import six
-
-from tensorflow.core.framework import graph_pb2
-from tensorflow.core.framework import summary_pb2
 from tensorflow.core.util import event_pb2
-from tensorflow.python import pywrap_tensorflow
-from tensorflow.python.framework import ops
 from tensorflow.python.lib.io import tf_record
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util import compat
-
-
-class SummaryWriter(object):
-  """Writes `Summary` protocol buffers to event files.
-
-  The `SummaryWriter` class provides a mechanism to create an event file in a
-  given directory and add summaries and events to it. The class updates the
-  file contents asynchronously. This allows a training program to call methods
-  to add data to the file directly from the training loop, without slowing down
-  training.
-  """
-
-  def __init__(self, logdir, graph=None, max_queue=10, flush_secs=120,
-               graph_def=None):
-    """Creates a `SummaryWriter` and an event file.
-
-    On construction the summary writer creates a new event file in `logdir`.
-    This event file will contain `Event` protocol buffers constructed when you
-    call one of the following functions: `add_summary()`, `add_session_log()`,
-    `add_event()`, or `add_graph()`.
-
-    If you pass a `Graph` to the constructor it is added to
-    the event file. (This is equivalent to calling `add_graph()` later).
-
-    TensorBoard will pick the graph from the file and display it graphically so
-    you can interactively explore the graph you built. You will usually pass
-    the graph from the session in which you launched it:
-
-    ```python
-    ...create a graph...
-    # Launch the graph in a session.
-    sess = tf.Session()
-    # Create a summary writer, add the 'graph' to the event file.
-    writer = tf.summary.FileWriter(<some-directory>, sess.graph)
-    ```
-
-    The other arguments to the constructor control the asynchronous writes to
-    the event file:
-
-    *  `flush_secs`: How often, in seconds, to flush the added summaries
-       and events to disk.
-    *  `max_queue`: Maximum number of summaries or events pending to be
-       written to disk before one of the 'add' calls block.
-
-    Args:
-      logdir: A string. Directory where event file will be written.
-      graph: A `Graph` object, such as `sess.graph`.
-      max_queue: Integer. Size of the queue for pending events and summaries.
-      flush_secs: Number. How often, in seconds, to flush the
-        pending events and summaries to disk.
-      graph_def: DEPRECATED: Use the `graph` argument instead.
-    """
-    self._logdir = logdir
-    if not gfile.IsDirectory(self._logdir):
-      gfile.MakeDirs(self._logdir)
-    self._event_queue = six.moves.queue.Queue(max_queue)
-    self._ev_writer = pywrap_tensorflow.EventsWriter(
-        compat.as_bytes(os.path.join(self._logdir, "events")))
-    self._closed = False
-    self._worker = _EventLoggerThread(self._event_queue, self._ev_writer,
-                                      flush_secs)
-    # For storing used tags for session.run() outputs.
-    self._session_run_tags = {}
-    self._worker.start()
-    if graph is not None or graph_def is not None:
-      # Calling it with both graph and graph_def for backward compatibility.
-      self.add_graph(graph=graph, graph_def=graph_def)
-
-  def get_logdir(self):
-    """Returns the directory where event file will be written."""
-    return self._logdir
-
-  def reopen(self):
-    """Reopens the summary writer.
-
-    Can be called after `close()` to add more events in the same directory.
-    The events will go into a new events file.
-
-    Does nothing if the summary writer was not closed.
-    """
-    if self._closed:
-      self._closed = False
-
-  def add_summary(self, summary, global_step=None):
-    """Adds a `Summary` protocol buffer to the event file.
-
-    This method wraps the provided summary in an `Event` protocol buffer
-    and adds it to the event file.
-
-    You can pass the result of evaluating any summary op, using
-    @{tf.Session.run} or
-    @{tf.Tensor.eval}, to this
-    function. Alternatively, you can pass a `tf.Summary` protocol
-    buffer that you populate with your own data. The latter is
-    commonly done to report evaluation results in event files.
-
-    Args:
-      summary: A `Summary` protocol buffer, optionally serialized as a string.
-      global_step: Number. Optional global step value to record with the
-        summary.
-    """
-    if isinstance(summary, bytes):
-      summ = summary_pb2.Summary()
-      summ.ParseFromString(summary)
-      summary = summ
-    event = event_pb2.Event(wall_time=time.time(), summary=summary)
-    if global_step is not None:
-      event.step = int(global_step)
-    self.add_event(event)
-
-  def add_session_log(self, session_log, global_step=None):
-    """Adds a `SessionLog` protocol buffer to the event file.
-
-    This method wraps the provided session in an `Event` protocol buffer
-    and adds it to the event file.
-
-    Args:
-      session_log: A `SessionLog` protocol buffer.
-      global_step: Number. Optional global step value to record with the
-        summary.
-    """
-    event = event_pb2.Event(wall_time=time.time(), session_log=session_log)
-    if global_step is not None:
-      event.step = int(global_step)
-    self.add_event(event)
-
-  def add_event(self, event):
-    """Adds an event to the event file.
-
-    Args:
-      event: An `Event` protocol buffer.
-    """
-    if not self._closed:
-      self._event_queue.put(event)
-
-  def _add_graph_def(self, graph_def, global_step=None):
-    graph_bytes = graph_def.SerializeToString()
-    event = event_pb2.Event(wall_time=time.time(), graph_def=graph_bytes)
-    if global_step is not None:
-      event.step = int(global_step)
-    self._event_queue.put(event)
-
-  def add_graph(self, graph, global_step=None, graph_def=None):
-    """Adds a `Graph` to the event file.
-
-    The graph described by the protocol buffer will be displayed by
-    TensorBoard. Most users pass a graph in the constructor instead.
-
-    Args:
-      graph: A `Graph` object, such as `sess.graph`.
-      global_step: Number. Optional global step counter to record with the
-        graph.
-      graph_def: DEPRECATED. Use the `graph` parameter instead.
-
-    Raises:
-      ValueError: If both graph and graph_def are passed to the method.
-    """
-
-    if graph is not None and graph_def is not None:
-      raise ValueError("Please pass only graph, or graph_def (deprecated), "
-                       "but not both.")
-
-    if isinstance(graph, ops.Graph) or isinstance(graph_def, ops.Graph):
-      # The user passed a `Graph`.
-
-      # Check if the user passed it via the graph or the graph_def argument and
-      # correct for that.
-      if not isinstance(graph, ops.Graph):
-        logging.warning("When passing a `Graph` object, please use the `graph`"
-                        " named argument instead of `graph_def`.")
-        graph = graph_def
-
-      # Serialize the graph with additional info.
-      true_graph_def = graph.as_graph_def(add_shapes=True)
-    elif (isinstance(graph, graph_pb2.GraphDef)
-          or isinstance(graph_def, graph_pb2.GraphDef)):
-      # The user passed a `GraphDef`.
-      logging.warning("Passing a `GraphDef` to the SummaryWriter is deprecated."
-                      " Pass a `Graph` object instead, such as `sess.graph`.")
-
-      # Check if the user passed it via the graph or the graph_def argument and
-      # correct for that.
-      if isinstance(graph, graph_pb2.GraphDef):
-        true_graph_def = graph
-      else:
-        true_graph_def = graph_def
-
-    else:
-      # The user passed neither `Graph`, nor `GraphDef`.
-      raise TypeError("The passed graph must be an instance of `Graph` "
-                      "or the deprecated `GraphDef`")
-    # Finally, add the graph_def to the summary writer.
-    self._add_graph_def(true_graph_def, global_step)
-
-  def add_run_metadata(self, run_metadata, tag, global_step=None):
-    """Adds a metadata information for a single session.run() call.
-
-    Args:
-      run_metadata: A `RunMetadata` protobuf object.
-      tag: The tag name for this metadata.
-      global_step: Number. Optional global step counter to record with the
-        StepStats.
-
-    Raises:
-      ValueError: If the provided tag was already used for this type of event.
-    """
-    if tag in self._session_run_tags:
-      raise ValueError("The provided tag was already used for this event type")
-    self._session_run_tags[tag] = True
-
-    tagged_metadata = event_pb2.TaggedRunMetadata()
-    tagged_metadata.tag = tag
-    # Store the `RunMetadata` object as bytes in order to have postponed
-    # (lazy) deserialization when used later.
-    tagged_metadata.run_metadata = run_metadata.SerializeToString()
-    event = event_pb2.Event(wall_time=time.time(),
-                            tagged_run_metadata=tagged_metadata)
-    if global_step is not None:
-      event.step = int(global_step)
-    self._event_queue.put(event)
-
-  def flush(self):
-    """Flushes the event file to disk.
-
-    Call this method to make sure that all pending events have been written to
-    disk.
-    """
-    self._event_queue.join()
-    self._ev_writer.Flush()
-
-  def close(self):
-    """Flushes the event file to disk and close the file.
-
-    Call this method when you do not need the summary writer anymore.
-    """
-    self.flush()
-    self._ev_writer.Close()
-    self._closed = True
-
-
-class _EventLoggerThread(threading.Thread):
-  """Thread that logs events."""
-
-  def __init__(self, queue, ev_writer, flush_secs):
-    """Creates an _EventLoggerThread.
-
-    Args:
-      queue: A Queue from which to dequeue events.
-      ev_writer: An event writer. Used to log brain events for
-       the visualizer.
-      flush_secs: How often, in seconds, to flush the
-        pending file to disk.
-    """
-    threading.Thread.__init__(self)
-    self.daemon = True
-    self._queue = queue
-    self._ev_writer = ev_writer
-    self._flush_secs = flush_secs
-    # The first event will be flushed immediately.
-    self._next_event_flush_time = 0
-
-  def run(self):
-    while True:
-      event = self._queue.get()
-      try:
-        self._ev_writer.WriteEvent(event)
-        # Flush the event writer every so often.
-        now = time.time()
-        if now > self._next_event_flush_time:
-          self._ev_writer.Flush()
-          # Do it again in two minutes.
-          self._next_event_flush_time = now + self._flush_secs
-      finally:
-        self._queue.task_done()
 
 
 def summary_iterator(path):
@@ -352,37 +65,3 @@ def summary_iterator(path):
   # pylint: enable=line-too-long
   for r in tf_record.tf_record_iterator(path):
     yield event_pb2.Event.FromString(r)
-
-
-class SummaryWriterCache(object):
-  """Cache for summary writers.
-
-  This class caches summary writers, one per directory.
-  """
-  # Cache, keyed by directory.
-  _cache = {}
-
-  # Lock protecting _SUMMARY_WRITERS.
-  _lock = threading.RLock()
-
-  @staticmethod
-  def clear():
-    """Clear cached summary writers. Currently only used for unit tests."""
-    with SummaryWriterCache._lock:
-      SummaryWriterCache._cache = {}
-
-  @staticmethod
-  def get(logdir):
-    """Returns the SummaryWriter for the specified directory.
-
-    Args:
-      logdir: str, name of the directory.
-
-    Returns:
-      A `SummaryWriter`.
-    """
-    with SummaryWriterCache._lock:
-      if logdir not in SummaryWriterCache._cache:
-        SummaryWriterCache._cache[logdir] = SummaryWriter(
-            logdir, graph=ops.get_default_graph())
-      return SummaryWriterCache._cache[logdir]
diff --git a/tensorflow/python/tensorflow.i b/tensorflow/python/tensorflow.i
index d221dd523b2835d51e61487c22caee961ec28e5f..344702097f658db14ae0923e1bdee3843a72645f 100644
--- a/tensorflow/python/tensorflow.i
+++ b/tensorflow/python/tensorflow.i
@@ -33,6 +33,8 @@ limitations under the License.
 %include "tensorflow/python/client/tf_session.i"
 %include "tensorflow/python/client/device_lib.i"
 
+%include "tensorflow/python/lib/core/bfloat16.i"
+
 %include "tensorflow/python/lib/io/file_io.i"
 %include "tensorflow/python/training/quantize_training.i"
 %include "tensorflow/python/training/server_lib.i"
diff --git a/tensorflow/python/tools/optimize_for_inference_test.py b/tensorflow/python/tools/optimize_for_inference_test.py
index 447057cfe9fc3d7aa7bd78739ba8f1caee1ec757..6dd24c0dca1d326592e4f33eba4e6233248dac5f 100644
--- a/tensorflow/python/tools/optimize_for_inference_test.py
+++ b/tensorflow/python/tools/optimize_for_inference_test.py
@@ -272,7 +272,7 @@ class OptimizeForInferenceTest(test.TestCase):
     for node in optimized_graph_def.node:
       self.assertNotEqual("Conv2D", node.op)
       self.assertNotEqual("MirrorPad", node.op)
-      
+
 
   def testFusePadAndConv(self):
     with self.test_session() as sess:
diff --git a/tensorflow/python/training/adadelta_test.py b/tensorflow/python/training/adadelta_test.py
index de59768d0bab5d138d6090a14fb39a3106da8562..50f435236b41fcda7ab5ea37a4e96b72dd1043e7 100644
--- a/tensorflow/python/training/adadelta_test.py
+++ b/tensorflow/python/training/adadelta_test.py
@@ -112,17 +112,16 @@ class AdadeltaOptimizerTest(test.TestCase):
               # Check that the accumulators have been updated
               for slot_idx in range(2):
                 self.assertAllCloseAccordingToType(
-                    np.array(
-                        [accum, accum], dtype=dtype.as_numpy_dtype()),
+                    np.array([accum, accum], dtype=dtype.as_numpy_dtype()),
                     slot[slot_idx].eval(),
-                    rtol=1e-3)
+                    rtol=1e-5)
 
                 self.assertAllCloseAccordingToType(
                     np.array(
                         [accum_update, accum_update],
                         dtype=dtype.as_numpy_dtype()),
                     slot_update[slot_idx].eval(),
-                    rtol=1e-3)
+                    rtol=1e-5)
 
               # Check that the parameters have been updated
               self.assertAllCloseAccordingToType(
@@ -130,14 +129,14 @@ class AdadeltaOptimizerTest(test.TestCase):
                       [var0_init[0] - tot_update, var0_init[1] - tot_update],
                       dtype=dtype.as_numpy_dtype()),
                   var0.eval(),
-                  rtol=1e-3)
+                  rtol=1e-5)
 
               self.assertAllCloseAccordingToType(
                   np.array(
                       [var1_init[0] - tot_update, var1_init[1] - tot_update],
                       dtype=dtype.as_numpy_dtype()),
                   var1.eval(),
-                  rtol=1e-3)
+                  rtol=1e-5)
 
   def testBasic(self):
     self.doTestBasic(use_resource=False)
diff --git a/tensorflow/python/training/adam_test.py b/tensorflow/python/training/adam_test.py
index 0d534db60dc92443d2795e751a574018bc03f612..ffb66abc4c1a38353d602a711cab86b0d63b9e96 100644
--- a/tensorflow/python/training/adam_test.py
+++ b/tensorflow/python/training/adam_test.py
@@ -207,6 +207,9 @@ class AdamOptimizerTest(test.TestCase):
           # Validate updated params
           self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
           self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+          if use_resource:
+            self.assertEqual("var0_%d/Adam:0" % (i,),
+                             opt.get_slot(var=var0, name="m").name)
 
   def testBasic(self):
     with self.test_session():
diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index 1fb00343ef23d6b6dc9ca41f4868f0a7d80feb7c..b499cdf7f8a296a01f54da1c81ee3d39a8227e5f 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -514,6 +514,8 @@ class StepCounterHook(session_run_hook.SessionRunHook):
 
     self._summary_writer = summary_writer
     self._output_dir = output_dir
+    self._last_global_step = None
+    self._global_step_check_count = 0
 
   def begin(self):
     if self._summary_writer is None and self._output_dir:
@@ -545,6 +547,30 @@ class StepCounterHook(session_run_hook.SessionRunHook):
             self._summary_writer.add_summary(summary, global_step)
           logging.info("%s: %g", self._summary_tag, steps_per_sec)
 
+    # Check whether the global step has been increased. Here, we do not use the
+    # timer.last_triggered_step as the timer might record a different global
+    # step value such that the comparison could be unreliable. For simplicity,
+    # we just compare the stale_global_step with previously recorded version.
+    if stale_global_step == self._last_global_step:
+      # Here, we use a counter to count how many times we have observed that the
+      # global step has not been increased. For some Optimizers, the global step
+      # is not increased each time by design. For example, SyncReplicaOptimizer
+      # doesn't increase the global step in worker's main train step.
+      self._global_step_check_count += 1
+      if self._global_step_check_count % 20 == 0:
+        self._global_step_check_count = 0
+        logging.warning(
+            "It seems that global step (tf.train.get_global_step) has not "
+            "been increased. Current value (could be stable): %s vs previous "
+            "value: %s. You could increase the global step by passing "
+            "tf.train.get_global_step() to Optimizer.apply_gradients or "
+            "Optimizer.minimize.", stale_global_step, self._last_global_step)
+    else:
+      # Whenever we observe the increment, reset the counter.
+      self._global_step_check_count = 0
+
+    self._last_global_step = stale_global_step
+
 
 class NanLossDuringTrainingError(RuntimeError):
 
diff --git a/tensorflow/python/training/basic_session_run_hooks_test.py b/tensorflow/python/training/basic_session_run_hooks_test.py
index e7ff7e12211ae57a8589c799efbf9eab3b3fe5da..2547661e5250e94136a100aa8c30c9dbb7455018 100644
--- a/tensorflow/python/training/basic_session_run_hooks_test.py
+++ b/tensorflow/python/training/basic_session_run_hooks_test.py
@@ -780,9 +780,12 @@ class StepCounterHookTest(test.TestCase):
       hook.begin()
       sess.run(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
-      for _ in range(30):
-        time.sleep(0.01)
-        mon_sess.run(train_op)
+      with test.mock.patch.object(tf_logging, 'warning') as mock_log:
+        for _ in range(30):
+          time.sleep(0.01)
+          mon_sess.run(train_op)
+        # logging.warning should not be called.
+        self.assertIsNone(mock_log.call_args)
       hook.end(sess)
       summary_writer.assert_summaries(
           test_case=self,
@@ -857,6 +860,24 @@ class StepCounterHookTest(test.TestCase):
       summary_value = summary_writer.summaries[2][0].value[0]
       self.assertEqual('bar/foo/sec', summary_value.tag)
 
+  def test_log_warning_if_global_step_not_increased(self):
+    with ops.Graph().as_default(), session_lib.Session() as sess:
+      variables.get_or_create_global_step()
+      train_op = training_util._increment_global_step(0)  # keep same.
+      sess.run(variables_lib.global_variables_initializer())
+      hook = basic_session_run_hooks.StepCounterHook(
+          every_n_steps=1, every_n_secs=None)
+      hook.begin()
+      mon_sess = monitored_session._HookedSession(sess, [hook])
+      mon_sess.run(train_op)  # Run one step to record global step.
+      with test.mock.patch.object(tf_logging, 'warning') as mock_log:
+        for _ in range(30):
+          mon_sess.run(train_op)
+        self.assertRegexpMatches(
+            str(mock_log.call_args),
+            'global step.*has not been increased')
+      hook.end(sess)
+
 
 class SummarySaverHookTest(test.TestCase):
 
diff --git a/tensorflow/python/training/learning_rate_decay.py b/tensorflow/python/training/learning_rate_decay.py
index 802b930b0e391685b07802cbf6973b763e52d147..f0c28e7b89d08aed7bafb610fead9e285586e126 100644
--- a/tensorflow/python/training/learning_rate_decay.py
+++ b/tensorflow/python/training/learning_rate_decay.py
@@ -362,7 +362,13 @@ def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate,
   The function returns the decayed learning rate.  It is computed as:
 
   ```python
-  decayed_learning_rate = learning_rate / (1 + decay_rate * t)
+  decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
+  ```
+
+  or, if `staircase` is `True`, as:
+
+  ```python
+  decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
   ```
 
   Example: decay 1/t with a rate of 0.5:
@@ -371,8 +377,9 @@ def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate,
   ...
   global_step = tf.Variable(0, trainable=False)
   learning_rate = 0.1
-  k = 0.5
-  learning_rate = tf.train.inverse_time_decay(learning_rate, global_step, k)
+  decay_steps = 1.0
+  decay_rate = 0.5
+  learning_rate = tf.train.inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate)
 
   # Passing global_step to minimize() will increment it at each step.
   learning_step = (
diff --git a/tensorflow/python/training/momentum_test.py b/tensorflow/python/training/momentum_test.py
index 7268b3abc93f911a29b11cb95b1f005db6f49167..6865513b0e4aad18d77887770a11243642958e7a 100644
--- a/tensorflow/python/training/momentum_test.py
+++ b/tensorflow/python/training/momentum_test.py
@@ -234,23 +234,38 @@ class MomentumOptimizerTest(test.TestCase):
           self.assertAllClose(var0_np, var0.eval())
           self.assertAllClose(var1_np, var1.eval())
 
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
-        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+      var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+
+      # pylint: disable=cell-var-from-loop
+      def loss():
         x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
         pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
-        loss = pred * pred
-        sgd_op = momentum_lib.MomentumOptimizer(
-            learning_rate=1.0, momentum=0.0).minimize(loss)
-        variables.global_variables_initializer().run()
-        # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
-        # Run 1 step of sgd
-        sgd_op.run()
-        # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[-111, -138]], var0.eval())
+        return pred * pred
+      # pylint: enable=cell-var-from-loop
+
+      opt = momentum_lib.MomentumOptimizer(learning_rate=1.0, momentum=0.0)
+      sgd_op = opt.minimize(loss if context.in_eager_mode() else loss())
+      self.evaluate(variables.global_variables_initializer())
+      # Run 1 step of sgd
+      self.evaluate(sgd_op)
+      # Validate updated params
+      self.assertAllCloseAccordingToType([[-111, -138]], self.evaluate(var0))
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testMinimizeWith2DIndiciesForEmbeddingLookup(self):
+    var0 = resource_variable_ops.ResourceVariable(array_ops.ones([2, 2]))
+
+    def loss():
+      return math_ops.reduce_sum(embedding_ops.embedding_lookup(var0, [[1]]))
+
+    opt = momentum_lib.MomentumOptimizer(learning_rate=1.0, momentum=0.0)
+    sgd_op = opt.minimize(loss if context.in_eager_mode() else loss())
+    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(sgd_op)
+    self.assertAllCloseAccordingToType([[1, 1], [0, 0]], self.evaluate(var0))
 
   def testTensorLearningRateAndMomentum(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index e931555470354d1f5c76ad7d46cff1308b015116..b9bffa6b5c272034660bf9502cc294cfba619d2c 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -52,7 +52,6 @@ _PREEMPTION_ERRORS = (errors.AbortedError, errors.UnavailableError)
 USE_DEFAULT = object()
 
 
-# TODO(touts): Share that with the Supervisor.
 class Scaffold(object):
   """Structure to create or gather pieces commonly needed to train a model.
 
@@ -266,8 +265,10 @@ class Scaffold(object):
 
   @staticmethod
   def _default_local_init_op():
-    return control_flow_ops.group(variables.local_variables_initializer(),
-                                  lookup_ops.tables_initializer())
+    return control_flow_ops.group(
+        variables.local_variables_initializer(),
+        lookup_ops.tables_initializer(),
+        resources.initialize_resources(resources.local_resources()))
 
 
 def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index b31d02eb8d7afe2dd675192fc99fb7c24b515c00..56cf4d42ee194885057d8bf45d9b3c1c407c4a11 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -644,7 +644,8 @@ class Optimizer(object):
     Returns:
       Valid types for loss, variables and gradients.
     """
-    return set([dtypes.float16, dtypes.float32, dtypes.float64])
+    return set(
+        [dtypes.float16, dtypes.bfloat16, dtypes.float32, dtypes.float64])
 
   def _create_slots(self, var_list):
     """Create all slots needed by the variables.
diff --git a/tensorflow/python/training/quantize_training.i b/tensorflow/python/training/quantize_training.i
index 40c60769731d3f7255647a07141d86b1c2594b01..17ffcd6e0758c9c1bc8bab864b6b7a2a18bc9cbf 100644
--- a/tensorflow/python/training/quantize_training.i
+++ b/tensorflow/python/training/quantize_training.i
@@ -65,6 +65,9 @@ def do_quantize_training_on_graphdef(input_graph, num_bits):
 
   graph.ParseFromString(result_graph_string)
   return graph
+
+do_quantize_training_on_graphdef._tf_api_names = [
+    'train.do_quantize_training_on_graphdef']
 %}
 
 %unignoreall
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index 5bddde16984f8009ce1684e9fdd3cbf33effa42e..ba6301e785947c8347ef23b81491e684bee62974 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -349,7 +349,7 @@ class BaseSaverBuilder(object):
     last_device = None
     for shard, (device, saveables) in enumerate(per_device):
       last_device = device
-      with ops.device(device):
+      with ops.device(_set_cpu0(device)):
         sharded_filename = self.sharded_filename(tmp_checkpoint_prefix, shard,
                                                  num_shards_tensor)
         sharded_prefixes.append(sharded_filename)
@@ -357,7 +357,7 @@ class BaseSaverBuilder(object):
 
     with ops.control_dependencies([x.op for x in sharded_saves]):
       # Co-locates the merge step with the last device.
-      with ops.device(last_device):
+      with ops.device(_set_cpu0(last_device)):
         # V2 format write path consists of a metadata merge step.  Once merged,
         # attempts to delete the temporary directory, "<user-fed prefix>_temp".
         merge_step = gen_io_ops.merge_v2_checkpoints(
@@ -523,7 +523,10 @@ class BaseSaverBuilder(object):
     if not isinstance(op_list, (list, tuple, set)):
       raise TypeError("Variables to save should be passed in a dict or a "
                       "list: %s" % op_list)
-    op_list = set(op_list)
+    # When ResourceVariables are converted to Tensors, read ops are added to the
+    # graph. Sorting the op_list ensures that the resulting graph is always
+    # constructed in a deterministic way:
+    op_list = sorted(op_list, key=lambda x: x.name)
     names_to_saveables = {}
     # pylint: disable=protected-access
     for var in op_list:
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index 744b17dd224297cbefedfe562ff106fe1200664f..207e4a28426f95af4d5947964cf9133be10bc0fa 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -38,6 +38,7 @@ from tensorflow.core.protobuf import queue_runner_pb2
 from tensorflow.core.protobuf import saver_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -164,6 +165,18 @@ class SaverTest(test.TestCase):
   def testResourceBasic(self):
     self.basicSaveRestore(resource_variable_ops.ResourceVariable)
 
+  def testResourceVariableReadOpsAddedDeterministically(self):
+    graph_defs = []
+    num_graphs = 10
+    for _ in range(num_graphs):
+      with ops_lib.Graph().as_default() as g:
+        for i in range(20):
+          resource_variable_ops.ResourceVariable(i, name="var%s" % i)
+        saver_module.Saver()
+        graph_defs.append(g.as_graph_def())
+    for i in range(num_graphs - 1):
+      self.assertEqual(graph_defs[i], graph_defs[i + 1])
+
   def testEagerBasic(self):
     with context.eager_mode():
       ckpt_prefix = os.path.join(self.get_temp_dir(), "ckpt")
@@ -529,6 +542,23 @@ class SaverTest(test.TestCase):
       save = saver_module.Saver({"v0": v0_2})
       variables.global_variables_initializer().run()
 
+  def testSharedServerOnGPU(self):
+    if not test.is_gpu_available():
+      return
+    save_path = os.path.join(self.get_temp_dir(), "gpu")
+    with session.Session("", graph=ops_lib.Graph()) as sess:
+      with sess.graph.device(test.gpu_device_name()):
+        v0_1 = variables.Variable(123.45)
+      save = saver_module.Saver({"v0": v0_1}, sharded=True, allow_empty=True)
+      variables.global_variables_initializer().run()
+      save.save(sess, save_path)
+
+    with session.Session("", graph=ops_lib.Graph()) as sess:
+      with sess.graph.device(test.gpu_device_name()):
+        v0_2 = variables.Variable(543.21)
+      save = saver_module.Saver({"v0": v0_2}, sharded=True, allow_empty=True)
+      variables.global_variables_initializer().run()
+
   def testVariables(self):
     save_path = os.path.join(self.get_temp_dir(), "variables")
     with session.Session("", graph=ops_lib.Graph()) as sess:
@@ -714,6 +744,8 @@ class SaverTest(test.TestCase):
 
 class SaveRestoreShardedTest(test.TestCase):
 
+  _WRITE_VERSION = saver_pb2.SaverDef.V1
+
   def _get_test_dir(self, dirname):
     test_dir = os.path.join(self.get_temp_dir(), dirname)
     gfile.MakeDirs(test_dir)
@@ -739,6 +771,7 @@ class SaveRestoreShardedTest(test.TestCase):
               "t0": t0.saveable,
               "t1": t1.saveable
           },
+          write_version=self._WRITE_VERSION,
           sharded=True)
       variables.global_variables_initializer().run()
       t0.insert("k1", 30.0).run()
@@ -759,7 +792,13 @@ class SaveRestoreShardedTest(test.TestCase):
         with sess.graph.device("/cpu:0"):
           v0 = variables.Variable(111, name="v0")
           t0 = saver_test_utils.CheckpointedOp(name="t0")
-        save = saver_module.Saver({"v0": v0, "t0": t0.saveable}, sharded=True)
+        save = saver_module.Saver(
+            {
+                "v0": v0,
+                "t0": t0.saveable
+            },
+            write_version=self._WRITE_VERSION,
+            sharded=True)
         variables.global_variables_initializer().run()
         t0.insert("k11", 33.0).run()
         self.assertEqual(111, v0.eval())
@@ -777,7 +816,13 @@ class SaveRestoreShardedTest(test.TestCase):
         with sess.graph.device("/cpu:0"):
           v1 = variables.Variable(222)
           t1 = saver_test_utils.CheckpointedOp(name="t1")
-        save = saver_module.Saver({"v1": v1, "t1": t1.saveable}, sharded=True)
+        save = saver_module.Saver(
+            {
+                "v1": v1,
+                "t1": t1.saveable
+            },
+            write_version=self._WRITE_VERSION,
+            sharded=True)
         variables.global_variables_initializer().run()
         t1.insert("k22", 44.0).run()
         self.assertEqual(222, v1.eval())
@@ -805,6 +850,7 @@ class SaveRestoreShardedTest(test.TestCase):
               "t0": t0.saveable,
               "t1": t1.saveable
           },
+          write_version=self._WRITE_VERSION,
           sharded=True)
       variables.global_variables_initializer().run()
       t0.insert("k11", 33.0).run()
@@ -970,6 +1016,10 @@ class SaveRestoreShardedTest(test.TestCase):
     self._testPartitionedVariables(use_resource=True)
 
 
+class SaveRestoreShardedTestV2(SaveRestoreShardedTest):
+  _WRITE_VERSION = saver_pb2.SaverDef.V2
+
+
 class MaxToKeepTest(test.TestCase):
 
   def _get_test_dir(self, dirname):
@@ -2105,6 +2155,31 @@ class MetaGraphTest(test.TestCase):
               10, size=[1, 10])
       })
 
+  def testPreserveDatasetAndFunctions(self):
+    with ops_lib.Graph().as_default() as g:
+      dataset = dataset_ops.Dataset.range(10).map(lambda x: x * x)
+      iterator = dataset.make_one_shot_iterator()
+      next_element = iterator.get_next()
+      _ = array_ops.identity(next_element, name="output")
+
+      # Generate three MetaGraphDef protos using different code paths.
+      meta_graph_def_simple = saver_module.export_meta_graph()
+      meta_graph_def_devices_cleared = saver_module.export_meta_graph(
+          clear_devices=True)
+      meta_graph_def_from_graph_def = saver_module.export_meta_graph(
+          clear_devices=True, graph_def=g.as_graph_def())
+
+    for meta_graph_def in [meta_graph_def_simple,
+                           meta_graph_def_devices_cleared,
+                           meta_graph_def_from_graph_def]:
+      with session.Session(graph=ops_lib.Graph()) as sess:
+        saver_module.import_meta_graph(meta_graph_def, import_scope="new_model")
+        sess.run(variables.global_variables_initializer())
+        for i in range(10):
+          self.assertEqual(i * i, sess.run("new_model/output:0"))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run("new_model/output:0")
+
 
 class CheckpointReaderTest(test.TestCase):
 
diff --git a/tensorflow/python/training/server_lib.py b/tensorflow/python/training/server_lib.py
index 2091eca0b9c6f0af4a043a4639b6fb72b90cef56..29da67a30a58c1b8b8e172b2ccede340880fef58 100644
--- a/tensorflow/python/training/server_lib.py
+++ b/tensorflow/python/training/server_lib.py
@@ -307,6 +307,12 @@ class ClusterSpec(object):
   def __ne__(self, other):
     return self._cluster_spec != other
 
+  def __str__(self):
+    key_values = self.as_dict()
+    string_items = [
+        repr(k) + ": " + repr(key_values[k]) for k in sorted(key_values)]
+    return "ClusterSpec({" + ", ".join(string_items) + "})"
+
   def as_dict(self):
     """Returns a dictionary from job names to their tasks.
 
diff --git a/tensorflow/python/training/server_lib_test.py b/tensorflow/python/training/server_lib_test.py
index 0a8ec4901c9ef050014b6a04cdab34ca08f292c1..063044f0d05d4237830e415ac2ad800c98ae8beb 100644
--- a/tensorflow/python/training/server_lib_test.py
+++ b/tensorflow/python/training/server_lib_test.py
@@ -241,6 +241,95 @@ class GrpcServerTest(test.TestCase):
       queue_runner_impl.start_queue_runners(sess)
       sess.run(var.assign(3.0))
 
+  def testIsolateSessionState(self):
+    server = self._cached_server
+
+    init_value = array_ops.placeholder(dtypes.int32)
+    v = variables.Variable(init_value, validate_shape=False, name="v")
+
+    sharing_config = config_pb2.ConfigProto(isolate_session_state=False)
+    sharing_sess_0 = session.Session(server.target, config=sharing_config)
+    sharing_sess_1 = session.Session(server.target, config=sharing_config)
+
+    isolate_config = config_pb2.ConfigProto(isolate_session_state=True)
+    isolate_sess_0 = session.Session(server.target, config=isolate_config)
+    isolate_sess_1 = session.Session(server.target, config=isolate_config)
+
+    # Initially all variables are initialized.
+    for sess in [sharing_sess_0, sharing_sess_1,
+                 isolate_sess_0, isolate_sess_1]:
+      with self.assertRaises(errors_impl.FailedPreconditionError):
+        sess.run(v)
+
+    # Shared sessions will see each other's updates, but isolated sessions
+    # will not.
+    sharing_sess_0.run(v.initializer, feed_dict={init_value: 86})
+    self.assertAllEqual(86, sharing_sess_0.run(v))
+    self.assertAllEqual(86, sharing_sess_1.run(v))
+    with self.assertRaises(errors_impl.FailedPreconditionError):
+      isolate_sess_0.run(v)
+    with self.assertRaises(errors_impl.FailedPreconditionError):
+      isolate_sess_1.run(v)
+
+    # Changing the shape works because `validate_shape` is False.
+    sharing_sess_1.run(v.initializer, feed_dict={init_value: [86, 99]})
+    self.assertAllEqual([86, 99], sharing_sess_0.run(v))
+    self.assertAllEqual([86, 99], sharing_sess_1.run(v))
+    with self.assertRaises(errors_impl.FailedPreconditionError):
+      isolate_sess_0.run(v)
+    with self.assertRaises(errors_impl.FailedPreconditionError):
+      isolate_sess_1.run(v)
+
+    # Initializing in an isolated session will only affect the state in that
+    # session.
+    isolate_sess_0.run(v.initializer, feed_dict={init_value: 37})
+    self.assertAllEqual([86, 99], sharing_sess_0.run(v))
+    self.assertAllEqual([86, 99], sharing_sess_1.run(v))
+    self.assertAllEqual(37, isolate_sess_0.run(v))
+    with self.assertRaises(errors_impl.FailedPreconditionError):
+      isolate_sess_1.run(v)
+
+    # Isolated sessions can have different shapes for the same variable.
+    isolate_sess_1.run(v.initializer, feed_dict={init_value: [19, 86]})
+    self.assertAllEqual([86, 99], sharing_sess_0.run(v))
+    self.assertAllEqual([86, 99], sharing_sess_1.run(v))
+    self.assertAllEqual(37, isolate_sess_0.run(v))
+    self.assertAllEqual([19, 86], isolate_sess_1.run(v))
+
+  def testShapeChangingIsolateState(self):
+    server = self._cached_server
+    sharing_config = config_pb2.ConfigProto(isolate_session_state=False)
+    isolate_config = config_pb2.ConfigProto(isolate_session_state=True)
+
+    with ops.Graph().as_default():
+      w_vector = variables.Variable([1, 2, 3], name="w")
+      with session.Session(server.target, config=sharing_config) as sess:
+        with self.assertRaises(errors_impl.FailedPreconditionError):
+          sess.run(w_vector)
+        sess.run(w_vector.initializer)
+        self.assertAllEqual([1, 2, 3], sess.run(w_vector))
+
+    with ops.Graph().as_default():
+      w_vector = variables.Variable([4, 5, 6], name="w")
+      with session.Session(server.target, config=sharing_config) as sess:
+        self.assertAllEqual([1, 2, 3], sess.run(w_vector))
+        sess.run(w_vector.initializer)
+        self.assertAllEqual([4, 5, 6], sess.run(w_vector))
+
+    with ops.Graph().as_default():
+      w_scalar = variables.Variable(86, name="w")
+      with session.Session(server.target, config=sharing_config) as sess:
+        with self.assertRaises(errors_impl.InvalidArgumentError):
+          sess.run(w_scalar.initializer)
+
+    with ops.Graph().as_default():
+      w_scalar = variables.Variable(37, name="w")
+      with session.Session(server.target, config=isolate_config) as sess:
+        with self.assertRaises(errors_impl.FailedPreconditionError):
+          sess.run(w_scalar)
+        sess.run(w_scalar.initializer)
+        self.assertAllEqual(37, sess.run(w_scalar))
+
 
 class ServerDefTest(test.TestCase):
 
@@ -332,6 +421,17 @@ class ServerDefTest(test.TestCase):
 
 class ClusterSpecTest(test.TestCase):
 
+  def testStringConversion(self):
+    cluster_spec = server_lib.ClusterSpec({
+        "ps": ["ps0:1111"],
+        "worker": ["worker0:3333", "worker1:4444"]
+    })
+
+    expected_str = (
+        "ClusterSpec({'ps': ['ps0:1111'], 'worker': ['worker0:3333', "
+        "'worker1:4444']})")
+    self.assertEqual(expected_str, str(cluster_spec))
+
   def testProtoDictDefEquivalences(self):
     cluster_spec = server_lib.ClusterSpec({
         "ps": ["ps0:2222", "ps1:2222"],
diff --git a/tensorflow/python/training/supervisor.py b/tensorflow/python/training/supervisor.py
index a634a842b67033d5fde6bf8cf819f681e892a247..e4514aaea223b6b254a7a72e11e6b70b576fd54b 100644
--- a/tensorflow/python/training/supervisor.py
+++ b/tensorflow/python/training/supervisor.py
@@ -36,11 +36,15 @@ from tensorflow.python.training import coordinator
 from tensorflow.python.training import saver as saver_mod
 from tensorflow.python.training import session_manager as session_manager_mod
 from tensorflow.python.training import training_util
+from tensorflow.python.util import deprecation
 
 
 class Supervisor(object):
   """A training helper that checkpoints models and computes summaries.
 
+  This class is deprecated. Please use
+  ${tf.train.MonitoredTrainingSession} instead.
+
   The Supervisor is a small wrapper around a `Coordinator`, a `Saver`,
   and a `SessionManager` that takes care of common needs of TensorFlow
   training programs.
@@ -198,6 +202,8 @@ class Supervisor(object):
   # the default behavior should be used.
   USE_DEFAULT = 0
 
+  @deprecation.deprecated(None,
+                          "Please switch to tf.train.MonitoredTrainingSession")
   def __init__(self,
                graph=None,
                ready_op=USE_DEFAULT,
diff --git a/tensorflow/python/training/sync_replicas_optimizer.py b/tensorflow/python/training/sync_replicas_optimizer.py
index 2a97d45daa9f69b2df2f082492d65ab1292000f6..47702fdad05d13015e0cbf7768129b0c53b6c14c 100644
--- a/tensorflow/python/training/sync_replicas_optimizer.py
+++ b/tensorflow/python/training/sync_replicas_optimizer.py
@@ -99,7 +99,7 @@ class SyncReplicasOptimizer(optimizer.Optimizer):
   # Note that if you want to have 2 backup replicas, you can change
   # total_num_replicas=52 and make sure this number matches how many physical
   # replicas you started in your job.
-  opt = tf.SyncReplicasOptimizer(opt, replicas_to_aggregate=50,
+  opt = tf.train.SyncReplicasOptimizer(opt, replicas_to_aggregate=50,
                                  total_num_replicas=50)
 
   # Some models have startup_delays to help stabilize the model but when using
@@ -449,7 +449,7 @@ class _SyncReplicasOptimizerHook(session_run_hook.SessionRunHook):
   """A SessionRunHook handles ops related to SyncReplicasOptimizer."""
 
   def __init__(self, sync_optimizer, is_chief, num_tokens):
-    """Creates hook to handle SyncReplicaOptimizer initialization ops.
+    """Creates hook to handle SyncReplicasOptimizer initialization ops.
 
     Args:
       sync_optimizer: `SyncReplicasOptimizer` which this hook will initialize.
diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index 25dbc78d7ae2577f05456b946ed4f516b942e05b..5c066e2bef1eb557b81b4996a4848fb18318ab4e 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -116,7 +116,7 @@ def flatten(nest):
   used instead. The same convention is followed in `pack_sequence_as`. This
   correctly repacks dicts and `OrderedDict`s after they have been flattened,
   and also allows flattening an `OrderedDict` and then repacking it back using
-  a correponding plain dict, or vice-versa.
+  a corresponding plain dict, or vice-versa.
   Dictionaries with non-sortable keys cannot be flattened.
 
   Users must not modify any collections used in `nest` while this function is
@@ -293,10 +293,10 @@ def pack_sequence_as(structure, flat_sequence):
   If `structure` is or contains a dict instance, the keys will be sorted to
   pack the flat sequence in deterministic order. This is true also for
   `OrderedDict` instances: their sequence order is ignored, the sorting order of
-  keys is used instead. The same convention is followed in `pack_sequence_as`.
+  keys is used instead. The same convention is followed in `flatten`.
   This correctly repacks dicts and `OrderedDict`s after they have been
   flattened, and also allows flattening an `OrderedDict` and then repacking it
-  back using a correponding plain dict, or vice-versa.
+  back using a corresponding plain dict, or vice-versa.
   Dictionaries with non-sortable keys cannot be flattened.
 
   Args:
@@ -456,9 +456,9 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
       if set(input_tree) != set(shallow_tree):
         raise ValueError(
             "The two structures don't have the same keys. Input "
-            "structure has keys %s, while shallow structure has keys %s."
-            % (list(_six.iterkeys(input_tree)),
-               list(_six.iterkeys(shallow_tree))))
+            "structure has keys %s, while shallow structure has keys %s." %
+            (list(_six.iterkeys(input_tree)),
+             list(_six.iterkeys(shallow_tree))))
 
       input_tree = list(_six.iteritems(input_tree))
       shallow_tree = list(_six.iteritems(shallow_tree))
diff --git a/tensorflow/python/util/nest_test.py b/tensorflow/python/util/nest_test.py
index 26aeaeec19b334b466f185fe765974fca61ae3b8..3d9e9f96849c1b7415892ec9341947565ed89664 100644
--- a/tensorflow/python/util/nest_test.py
+++ b/tensorflow/python/util/nest_test.py
@@ -388,8 +388,9 @@ class NestTest(test.TestCase):
     inp_ab1 = {"a": (1, 1), "b": {"c": (2, 2)}}
     inp_ab2 = {"a": (1, 1), "b": {"d": (2, 2)}}
     expected_message = (
-        "The two structures don't have the same keys. Input "
-        "structure has keys \['c'\], while shallow structure has keys \['d'\].")
+        r"The two structures don't have the same keys. Input "
+        r"structure has keys \['c'\], while shallow structure has "
+        r"keys \['d'\].")
 
     with self.assertRaisesRegexp(ValueError, expected_message):
       nest.assert_shallow_structure(inp_ab2, inp_ab1)
@@ -438,8 +439,7 @@ class NestTest(test.TestCase):
     input_tree_flattened_as_shallow_tree = nest.flatten_up_to(shallow_tree,
                                                               input_tree)
     self.assertEqual(input_tree_flattened_as_shallow_tree, [0, 1, 2, 3, 4])
-    shallow_tree = collections.OrderedDict([("a", 0),
-                                            ("c", {"d": 3, "e": 1})])
+    shallow_tree = collections.OrderedDict([("a", 0), ("c", {"d": 3, "e": 1})])
     input_tree_flattened_as_shallow_tree = nest.flatten_up_to(shallow_tree,
                                                               input_tree)
     self.assertEqual(input_tree_flattened_as_shallow_tree,
diff --git a/tensorflow/python/util/py_checkpoint_reader.i b/tensorflow/python/util/py_checkpoint_reader.i
index 0cd095d9d947f5cf76adaf83dc16272c4374573e..8004898cbcbce7ce593ce35efdc6493e052468bd 100644
--- a/tensorflow/python/util/py_checkpoint_reader.i
+++ b/tensorflow/python/util/py_checkpoint_reader.i
@@ -164,6 +164,8 @@ def NewCheckpointReader(filepattern):
   with errors.raise_exception_on_not_ok_status() as status:
     from tensorflow.python.util import compat
     return CheckpointReader(compat.as_bytes(filepattern), status)
+
+NewCheckpointReader._tf_api_names = ['train.NewCheckpointReader']
 %}
 
 %include "tensorflow/c/checkpoint_reader.h"
diff --git a/tensorflow/python/util/stat_summarizer.i b/tensorflow/python/util/stat_summarizer.i
index 80739195872a056e7a5443dfb81ab1440300dbff..6aeaa0e31b9b48f7e6705ab7146828cc0e0e5e08 100644
--- a/tensorflow/python/util/stat_summarizer.i
+++ b/tensorflow/python/util/stat_summarizer.i
@@ -27,8 +27,8 @@ limitations under the License.
 
 %ignoreall
 
-%unignore NewStatSummarizer;
-%unignore DeleteStatSummarizer;
+%unignore _NewStatSummarizer;
+%unignore _DeleteStatSummarizer;
 %unignore tensorflow;
 %unignore tensorflow::StatSummarizer;
 %unignore tensorflow::StatSummarizer::StatSummarizer;
@@ -43,21 +43,20 @@ limitations under the License.
 
 // TODO(ashankar): Remove the unused argument from the API.
 %{
-tensorflow::StatSummarizer* NewStatSummarizer(
+tensorflow::StatSummarizer* _NewStatSummarizer(
       const string& unused) {
   return new tensorflow::StatSummarizer(tensorflow::StatSummarizerOptions());
 }
 %}
 
-
 %{
-void DeleteStatSummarizer(tensorflow::StatSummarizer* ss) {
+void _DeleteStatSummarizer(tensorflow::StatSummarizer* ss) {
   delete ss;
 }
 %}
 
-tensorflow::StatSummarizer* NewStatSummarizer(const string& unused);
-void DeleteStatSummarizer(tensorflow::StatSummarizer* ss);
+tensorflow::StatSummarizer* _NewStatSummarizer(const string& unused);
+void _DeleteStatSummarizer(tensorflow::StatSummarizer* ss);
 
 %extend tensorflow::StatSummarizer {
   void ProcessStepStatsStr(const string& step_stats_str) {
@@ -77,3 +76,21 @@ void DeleteStatSummarizer(tensorflow::StatSummarizer* ss);
 
 %include "tensorflow/core/util/stat_summarizer.h"
 %unignoreall
+
+%insert("python") %{
+
+# Wrapping NewStatSummarizer and DeletStatSummarizer because
+# SWIG-generated functions are built-in functions and do not support
+# setting _tf_api_names attribute.
+
+def NewStatSummarizer(unused):
+  return _NewStatSummarizer(unused)
+
+def DeleteStatSummarizer(stat_summarizer):
+  _DeleteStatSummarizer(stat_summarizer)
+
+NewStatSummarizer._tf_api_names = ["contrib.stat_summarizer.NewStatSummarizer"]
+DeleteStatSummarizer._tf_api_names = [
+    "contrib.stat_summarizer.DeleteStatSummarizer"]
+StatSummarizer._tf_api_names = ["contrib.stat_summarizer.StatSummarizer"]
+%}
diff --git a/tensorflow/python/util/tf_inspect.py b/tensorflow/python/util/tf_inspect.py
index 9ed125704b1cf2ced585db0b169a184d27e1ad72..d14e71038851db80a3837254bb2e0d694480fe40 100644
--- a/tensorflow/python/util/tf_inspect.py
+++ b/tensorflow/python/util/tf_inspect.py
@@ -45,6 +45,26 @@ def getargspec(object):  # pylint: disable=redefined-builtin
                if d.decorator_argspec is not None), _inspect.getargspec(target))
 
 
+def getfullargspec(obj):  # pylint: disable=redefined-builtin
+  """TFDecorator-aware replacement for inspect.getfullargspec and fallback to
+  inspect.getargspec in Python 2.
+
+  Args:
+    obj: A callable, possibly decorated.
+
+  Returns:
+    The `FullArgSpec` (`ArgSpec` in Python 2) that describes the signature of
+    the outermost decorator that changes the callable's signature. If the
+    callable is not decorated, `inspect.getfullargspec()`
+    (`inspect.getargspec()` in Python 2) will be called directly on the
+    callable.
+  """
+  spec_fn = getattr(_inspect, 'getfullargspec', getattr(_inspect, 'getargspec'))
+  decorators, target = tf_decorator.unwrap(obj)
+  return next((d.decorator_argspec for d in decorators
+               if d.decorator_argspec is not None), spec_fn(target))
+
+
 def getcallargs(func, *positional, **named):
   """TFDecorator-aware replacement for inspect.getcallargs.
 
diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc
index c3d7611ad43b05f510481925fbfe1f930cf95ff8..a41fa7df253bcf4bce280574b89ed0dda8330521 100644
--- a/tensorflow/python/util/util.cc
+++ b/tensorflow/python/util/util.cc
@@ -29,7 +29,7 @@ bool WarnedThatSetIsNotSequence = false;
 
 // Returns 1 if `o` is considered a sequence for the purposes of Flatten().
 // Returns 0 otherwise.
-// Returns -1 if an error occured.
+// Returns -1 if an error occurred.
 int IsSequenceHelper(PyObject* o) {
   if (PyDict_Check(o)) return true;
   if (PySet_Check(o) && !WarnedThatSetIsNotSequence) {
diff --git a/tensorflow/stream_executor/blas.h b/tensorflow/stream_executor/blas.h
index eb1b19c5d963d56c6175251a54e2ab5072a01760..072f08554688276a05d9be85718de8750bd874c2 100644
--- a/tensorflow/stream_executor/blas.h
+++ b/tensorflow/stream_executor/blas.h
@@ -30,8 +30,8 @@ limitations under the License.
 //  Stream stream{stream_exec};
 //  stream
 //    .Init()
-//    .ThenBlasAxpy(1024, 5.5, x, 1, &y, 1)
-//    .BlockHostUntilDone();
+//    .ThenBlasAxpy(1024, 5.5, x, 1, &y, 1);
+//  SE_CHECK_OK(stream.BlockHostUntilDone());
 //
 // By using stream operations in this manner the user can easily intermix custom
 // kernel launches (via StreamExecutor::ThenLaunch()) with these pre-canned BLAS
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index d78362d4fbac3a6058743383d832bfc3df133a2f..6031091b7441a7da6bfd30fbc4ad77f07fd7c674 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -561,7 +561,7 @@ static bool TensorOpMathEnabled() {
   static bool is_enabled = [] {
     bool ret;
     TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_DISABLE_TENSOR_OP_MATH",
-                                               /*default=*/false, &ret));
+                                               /*default_val=*/false, &ret));
     return !ret;
   }();
   return is_enabled;
@@ -2677,7 +2677,7 @@ bool CudnnSupport::GetConvolveBackwardFilterAlgorithms(
       // CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD,
       // clang-format on
   };
-#if CUDNN_VERSION >= 5110
+#if CUDNN_VERSION >= 5100
   if (CudnnEnvVar<WinogradNonfused>::IsEnabled() && with_winograd_nonfused) {
     algo_types.push_back(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED);
   }
@@ -2761,14 +2761,27 @@ bool CudnnSupport::DoBatchNormalizationForwardImpl(
   float zero = 0.0;
 
   if (is_training) {
-    stream->ThenMemZero(batch_mean, batch_mean->size());
-    stream->ThenMemZero(batch_var, batch_var->size());
+    CHECK_EQ(batch_mean->is_null(), batch_var->is_null())
+        << "batch_mean and batch_var must both be null or both be non-null";
+
+    void* batch_mean_opaque;
+    void* batch_var_opaque;
+    if (!batch_mean->is_null() && !batch_var->is_null()) {
+      stream->ThenMemZero(batch_mean, batch_mean->size());
+      stream->ThenMemZero(batch_var, batch_var->size());
+      batch_mean_opaque = batch_mean->opaque();
+      batch_var_opaque = batch_var->opaque();
+    } else {
+      batch_mean_opaque = nullptr;
+      batch_var_opaque = nullptr;
+    }
+
     status = wrap::cudnnBatchNormalizationForwardTraining(
         parent_, ToHandle(dnn_handle_), mode, &one, &zero,
         x_descriptor.handle(), x.opaque(), x_descriptor.handle(), y->opaque(),
         scale_offset_descriptor.handle(), scale.opaque(), offset.opaque(), 1.0,
-        batch_mean->opaque(), batch_var->opaque(), epsilon,
-        saved_mean->opaque(), saved_inv_var->opaque());
+        batch_mean_opaque, batch_var_opaque, epsilon, saved_mean->opaque(),
+        saved_inv_var->opaque());
 #if CUDNN_VERSION < 5000
     CHECK(inv_var_to_var);
     inv_var_to_var();
@@ -2797,28 +2810,28 @@ bool CudnnSupport::DoBatchNormalizationForwardImpl(
 bool CudnnSupport::DoBatchNormalizationBackward(
     Stream* stream, const DeviceMemory<float>& y_backprop,
     const DeviceMemory<float>& x, const DeviceMemory<float>& scale,
-    const DeviceMemory<float>& mean, const DeviceMemory<float>& variance,
+    const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var,
     const dnn::BatchDescriptor& x_desc,
     const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
     DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop,
     DeviceMemory<float>* offset_backprop) {
   return DoBatchNormalizationBackwardImpl(
       stream, CUDNN_DATA_FLOAT, CUDNN_DATA_FLOAT, y_backprop, x, scale, mean,
-      variance, x_desc, scale_offset_desc, epsilon, x_backprop, scale_backprop,
+      inv_var, x_desc, scale_offset_desc, epsilon, x_backprop, scale_backprop,
       offset_backprop);
 }
 
 bool CudnnSupport::DoBatchNormalizationBackward(
     Stream* stream, const DeviceMemory<Eigen::half>& y_backprop,
     const DeviceMemory<Eigen::half>& x, const DeviceMemory<float>& scale,
-    const DeviceMemory<float>& mean, const DeviceMemory<float>& variance,
+    const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var,
     const dnn::BatchDescriptor& x_desc,
     const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
     DeviceMemory<Eigen::half>* x_backprop, DeviceMemory<float>* scale_backprop,
     DeviceMemory<float>* offset_backprop) {
   return DoBatchNormalizationBackwardImpl(
       stream, CUDNN_DATA_HALF, CUDNN_DATA_FLOAT, y_backprop, x, scale, mean,
-      variance, x_desc, scale_offset_desc, epsilon, x_backprop, scale_backprop,
+      inv_var, x_desc, scale_offset_desc, epsilon, x_backprop, scale_backprop,
       offset_backprop);
 }
 
@@ -2827,7 +2840,7 @@ bool CudnnSupport::DoBatchNormalizationBackwardImpl(
     Stream* stream, int cudnn_input_type, int cudnn_scale_type,
     const DeviceMemory<T>& y_backprop, const DeviceMemory<T>& x,
     const DeviceMemory<U>& scale, const DeviceMemory<U>& mean,
-    const DeviceMemory<U>& variance, const dnn::BatchDescriptor& x_desc,
+    const DeviceMemory<U>& inv_var, const dnn::BatchDescriptor& x_desc,
     const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
     DeviceMemory<T>* x_backprop, DeviceMemory<U>* scale_backprop,
     DeviceMemory<U>* offset_backprop) {
@@ -2854,7 +2867,7 @@ bool CudnnSupport::DoBatchNormalizationBackwardImpl(
       y_backprop.opaque(), x_descriptor.handle(), x_backprop->opaque(),
       scale_offset_descriptor.handle(), scale.opaque(),
       scale_backprop->opaque(), offset_backprop->opaque(), epsilon,
-      mean.opaque(), variance.opaque());
+      mean.opaque(), inv_var.opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to enqueue backward batch normalization on stream: "
                << ToString(status);
@@ -4251,7 +4264,12 @@ bool CudnnSupport::DoDepthConcatenate(
   for (size_t i = 0; i < input_data.size(); ++i) {
     const auto& dimensions = input_dimensions[i];
     tmp.resize(dimensions.ElementCount());
-    stream->ThenMemcpyD2H<float>(*input_data[i], &tmp).BlockHostUntilDone();
+    stream->ThenMemcpyD2H<float>(*input_data[i], &tmp);
+    port::Status block_status = stream->BlockHostUntilDone();
+    if (!block_status.ok()) {
+      LOG(ERROR) << "BlockHostUntilDone failed: " << block_status;
+      return false;
+    }
 
     for (int64 batch = 0; batch < output_dimensions.count(); ++batch) {
       for (int64 yx = 0; yx < area; ++yx) {
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index 14986286f1dd4c4ced1ebaf6adbada8e52096b92..ee28c0bf57a51a63be7ebbce5c8f80e09737bb16 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -226,7 +226,7 @@ class CudnnSupport : public dnn::DnnSupport {
   bool DoBatchNormalizationBackward(
       Stream* stream, const DeviceMemory<float>& y_backprop,
       const DeviceMemory<float>& x, const DeviceMemory<float>& scale,
-      const DeviceMemory<float>& mean, const DeviceMemory<float>& variance,
+      const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var,
       const dnn::BatchDescriptor& x_desc,
       const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
       DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop,
@@ -235,7 +235,7 @@ class CudnnSupport : public dnn::DnnSupport {
   bool DoBatchNormalizationBackward(
       Stream* stream, const DeviceMemory<Eigen::half>& y_backprop,
       const DeviceMemory<Eigen::half>& x, const DeviceMemory<float>& scale,
-      const DeviceMemory<float>& mean, const DeviceMemory<float>& variance,
+      const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var,
       const dnn::BatchDescriptor& x_desc,
       const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
       DeviceMemory<Eigen::half>* x_backprop,
@@ -637,7 +637,7 @@ class CudnnSupport : public dnn::DnnSupport {
       Stream* stream, int cudnn_input_type, int cudnn_scale_type,
       const DeviceMemory<T>& y_backprop, const DeviceMemory<T>& x,
       const DeviceMemory<U>& scale, const DeviceMemory<U>& mean,
-      const DeviceMemory<U>& variance, const dnn::BatchDescriptor& x_desc,
+      const DeviceMemory<U>& inv_var, const dnn::BatchDescriptor& x_desc,
       const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
       DeviceMemory<T>* x_backprop, DeviceMemory<U>* scale_backprop,
       DeviceMemory<U>* offset_backprop);
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index b6a96ed3e5cbda044c00bb9b940d68f80373587a..a017ff64d4c69b6952b442464877dc26a800ad37 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -1115,19 +1115,20 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::SynchronizeStream(CudaContext* context,
-                                                CUstream stream) {
+/* static */ port::Status CUDADriver::SynchronizeStream(CudaContext *context,
+                                                        CUstream stream) {
   ScopedActivateContext activated{context};
   CHECK(stream != nullptr);
   CUresult res = cuStreamSynchronize(stream);
   if (res != CUDA_SUCCESS) {
-    LOG(ERROR) << "could not synchronize on CUDA stream: " << ToString(res)
-               << " :: " << port::CurrentStackTrace();
-    return false;
+    port::Status status = port::InternalError(
+        port::StrCat("could not synchronize on CUDA stream: ", ToString(res)));
+    LOG(ERROR) << status << " :: " << port::CurrentStackTrace();
+    return status;
   }
   VLOG(2) << "successfully synchronized stream " << stream << " on context "
           << context;
-  return true;
+  return port::Status::OK();
 }
 
 /* static */ bool CUDADriver::IsStreamIdle(CudaContext *context,
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.h b/tensorflow/stream_executor/cuda/cuda_driver.h
index 68494aba6597c2cd1ee52a7b4cb411cd50fad77b..4002ba2021d1a2e2c36bd1786a3084ee8c08bb78 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.h
+++ b/tensorflow/stream_executor/cuda/cuda_driver.h
@@ -304,7 +304,7 @@ class CUDADriver {
   // amount of time?
   //
   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g15e49dd91ec15991eb7c0a741beb7dad
-  static bool SynchronizeStream(CudaContext* context, CUstream stream);
+  static port::Status SynchronizeStream(CudaContext* context, CUstream stream);
 
   // Blocks the calling thread until the operations associated with the context
   // have been completed, via cuCtxSynchronize.
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 6c522264e1de2b3c141eeb7c46b77c10a2f4335f..878fa8d9ad724340dc26329520893c9b6489444c 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -108,11 +108,6 @@ static CUdeviceptr AsCudaDevicePtr(DeviceMemoryBase *gpu_mem) {
   return AsCudaDevicePtr(*gpu_mem);
 }
 
-static CudaContext* GetCudaContext(Stream *stream) {
-  return static_cast<CUDAExecutor *>(stream->parent()->implementation())
-      ->cuda_context();
-}
-
 CudaContext* ExtractCudaContext(CUDAExecutor *cuda_exec) {
   CHECK(cuda_exec != nullptr);
   return cuda_exec->cuda_context();
@@ -380,14 +375,14 @@ bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
 
   void **kernel_params = const_cast<void **>(args.argument_addresses().data());
 
-  if (!CUDADriver::LaunchKernel(GetCudaContext(stream), cufunc, block_dims.x,
-                                block_dims.y, block_dims.z, thread_dims.x,
-                                thread_dims.y, thread_dims.z,
-                                args.number_of_shared_bytes(), custream,
-                                kernel_params, nullptr /* = extra */)) {
-    LOG(ERROR) << "failed to launch CUDA kernel with args: "
+  if (!CUDADriver::LaunchKernel(context_, cufunc, block_dims.x, block_dims.y,
+                                block_dims.z, thread_dims.x, thread_dims.y,
+                                thread_dims.z, args.number_of_shared_bytes(),
+                                custream, kernel_params,
+                                nullptr /* = extra */)) {
+    LOG(ERROR) << "failed to launch CUDA kernel " << kernel.name() << " with "
                << args.number_of_arguments()
-               << "; thread dim: " << thread_dims.ToString()
+               << " args; thread dim: " << thread_dims.ToString()
                << "; block dim: " << block_dims.ToString();
     return false;
   }
@@ -669,7 +664,7 @@ bool CUDAExecutor::StopTimer(Stream *stream, Timer *timer) {
   return AsCUDATimer(timer)->Stop(AsCUDAStream(stream));
 }
 
-bool CUDAExecutor::BlockHostUntilDone(Stream *stream) {
+port::Status CUDAExecutor::BlockHostUntilDone(Stream *stream) {
   return CUDADriver::SynchronizeStream(context_, AsCUDAStreamValue(stream));
 }
 
@@ -930,16 +925,129 @@ struct UnqueryableDeviceParams {
   uint64 shared_memory_alloc_granularity;
 };
 
+// http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
+// https://developer.download.nvidia.com/compute/cuda/CUDA_Occupancy_calculator.xls
 static const UnqueryableDeviceParams kAllUnqueryableDeviceParams[] = {
-  {
-    3, 5,       // compute capability (3.5)
-    16,         // blocks_per_core_limit
-    64 * 1024,  // registers_per_core_limit
-    255,        // registers_per_thread_limit
-    4,          // warp_alloc_granularity
-    256,        // register_alloc_granularity
-    256         // shared_memory_alloc_granularity
-  }
+    {
+        2, 0,       // compute capability (2.0)
+        8,          // blocks_per_core_limit
+        32 * 1024,  // registers_per_core_limit
+        63,         // registers_per_thread_limit
+        2,          // warp_alloc_granularity
+        64,         // register_alloc_granularity
+        128,        // shared_memory_alloc_granularity
+    },
+    {
+        2, 1,       // compute capability (2.1)
+        8,          // blocks_per_core_limit
+        32 * 1024,  // registers_per_core_limit
+        63,         // registers_per_thread_limit
+        2,          // warp_alloc_granularity
+        64,         // register_alloc_granularity
+        128,        // shared_memory_alloc_granularity
+    },
+    {
+        3, 0,       // compute capability (3.0)
+        16,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        63,         // registers_per_thread_limit
+        4,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    {
+        3, 2,       // compute capability (3.2)
+        16,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        4,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    {
+        3, 5,       // compute capability (3.5)
+        16,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        4,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    {
+        3, 7,        // compute capability (3.7)
+        16,          // blocks_per_core_limit
+        128 * 1024,  // registers_per_core_limit
+        255,         // registers_per_thread_limit
+        4,           // warp_alloc_granularity
+        256,         // register_alloc_granularity
+        256,         // shared_memory_alloc_granularity
+    },
+    {
+        5, 0,       // compute capability (5.0)
+        32,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        4,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    {
+        5, 2,       // compute capability (5.2)
+        32,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        4,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    {
+        5, 3,       // compute capability (5.3)
+        32,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        4,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    {
+        6, 0,       // compute capability (6.0)
+        32,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        2,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    {
+        6, 1,       // compute capability (6.1)
+        32,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        4,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    {
+        6, 2,       // compute capability (6.2)
+        32,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        4,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    // TODO(jlebar): Confirm the alloc granularity values for sm_70.  These are
+    // not published in the spreadsheet linked above.  Currently we guess that
+    // they're the same as sm_60.
+    {
+        7, 0,       // compute capability (7.0)
+        32,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        2,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
 };
 
 DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index 8ff4a30d6251dfe4cbbbf1a9c632b6383e964436..dbbbcd476f096ff912d391604ba349f6cb979478 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -152,7 +152,7 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
 
   Event::Status PollForEventStatus(Event *event) override;
 
-  bool BlockHostUntilDone(Stream *stream) override;
+  port::Status BlockHostUntilDone(Stream *stream) override;
 
   int PlatformDeviceCount() override { return CUDADriver::GetDeviceCount(); }
 
diff --git a/tensorflow/stream_executor/cuda/cuda_platform.cc b/tensorflow/stream_executor/cuda/cuda_platform.cc
index 874ac1ab6574bbf95b05893f34131b2cee9acc72..3a738461489212a026197bc58777883349ba4b54 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform.cc
+++ b/tensorflow/stream_executor/cuda/cuda_platform.cc
@@ -197,7 +197,7 @@ void CudaPlatform::UnregisterTraceListener(TraceListener* listener) {
 static void InitializeCudaPlatform() {
   // Disabling leak checking, MultiPlatformManager does not destroy its
   // registered platforms.
-  
+
   std::unique_ptr<cuda::CudaPlatform> platform(new cuda::CudaPlatform);
   SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
 }
diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc
index 29fd6d0e8707d35989b8c1644641002acf505c09..44144a06139bf8661432cb930e53ba5218aac823 100644
--- a/tensorflow/stream_executor/dnn.cc
+++ b/tensorflow/stream_executor/dnn.cc
@@ -470,10 +470,10 @@ string ConvolutionDescriptor::ToShortString() const {
 PoolingDescriptor::PoolingDescriptor(int ndims)
     : mode_(dnn::PoolingMode::kMaximum),
       ndims_(ndims),
+      propagate_nans_(false),
       window_(ndims, 0),
       padding_(ndims, 0),
-      strides_(ndims, 1),
-      propagate_nans_(false) {}
+      strides_(ndims, 1) {}
 
 PoolingDescriptor::PoolingDescriptor() : PoolingDescriptor(/*ndims=*/2) {}
 
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 0d2cd4a9f2fb3068f9a803e616ff5fa1712f4945..f4162b096299ca9405e1f3045e370d0da1acf8da 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -908,8 +908,8 @@ class DnnSupport {
   //    the running variance.
   //  reserve_space_1: saved mean, to be reused in the backward gradient
   //    computation.
-  //  reserve_space_2: saved variance, to be reused in the backward gradient
-  //    computation.
+  //  reserve_space_2: saved inv_var (1/sqrt(epsilon + variance), to be reused
+  //    in the backward gradient computation.
   //  is_training: Set to true for training, false for inference.
   //  var_to_inv_var: a function to convert the variance to inverted variance
   //    for cuDNN v4 forward inference.
@@ -957,6 +957,7 @@ class DnnSupport {
   //  y_backprop: gradient with regard to output y.
   //  x: input data.
   //  scale: scaling parameters.
+  //  inv_var: 1/sqrt(epsilon + variance) of x.
   //  x_desc: dimensions of the input data, which is the same as the dimensions
   //    of the output.
   //  scale_offset_desc: dimensions of scale and offset.
@@ -967,7 +968,7 @@ class DnnSupport {
   virtual bool DoBatchNormalizationBackward(
       Stream* stream, const DeviceMemory<float>& y_backprop,
       const DeviceMemory<float>& x, const DeviceMemory<float>& scale,
-      const DeviceMemory<float>& mean, const DeviceMemory<float>& variance,
+      const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var,
       const dnn::BatchDescriptor& x_desc,
       const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
       DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop,
@@ -981,7 +982,7 @@ class DnnSupport {
   virtual bool DoBatchNormalizationBackward(
       Stream* stream, const DeviceMemory<Eigen::half>& y_backprop,
       const DeviceMemory<Eigen::half>& x, const DeviceMemory<float>& scale,
-      const DeviceMemory<float>& mean, const DeviceMemory<float>& variance,
+      const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var,
       const dnn::BatchDescriptor& x_desc,
       const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
       DeviceMemory<Eigen::half>* x_backprop,
@@ -1132,7 +1133,7 @@ class DnnSupport {
   //    space in order to speed up the convolution operation.
   //  algorithm: an integer to specify which algorithm should be used for the
   //    operation. kDefaultAlgorithm means the system will pick an algorithm
-  //    by default. The coding of the algorithm is be interpretted by the
+  //    by default. The coding of the algorithm is be interpreted by the
   //    underlying implementation.
   //  output_profile_result: the output profile result for this call. The
   //    profiling is only enabled when this is not nullptr.
@@ -2023,7 +2024,7 @@ class DnnSupport {
   //  output_h_desc: descriptor for the output "h" state.
   //  output_h_data: the memory region that stores the output "h" data.
   //  output_c_desc: descriptor for the output "c" state.
-  //  output_c_data: the memory region that stores the outptu "c" data. This
+  //  output_c_data: the memory region that stores the output "c" data. This
   //    must be specified for LSTM models.
   //  is_training: whether this is used in training or inference. That decides
   //    whether respace_space data need to be produced.
@@ -2032,7 +2033,7 @@ class DnnSupport {
   //  retains the data and feed it to the backward pass.
   //  workspace_allocator: an allocator to create temporary workspace used in
   //    this kernel. The caller is responsible for retaining the memory long
-  //    enough for the lifespan of this operation, and recycles aftewards.
+  //    enough for the lifespan of this operation, and recycles afterwards.
   virtual bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                             const dnn::RnnSequenceTensorDescriptor& input_desc,
                             const DeviceMemory<Eigen::half>& input_data,
@@ -2111,7 +2112,7 @@ class DnnSupport {
   //  output_h_desc: descriptor for the output "h" state.
   //  output_h_data: the memory region that stores the output "h" data.
   //  output_c_desc: descriptor for the output "c" state.
-  //  output_c_data: the memory region that stores the outptu "c" data. This
+  //  output_c_data: the memory region that stores the output "c" data. This
   //    must be specified for LSTM models.
   //  output_backprop_data: the device memory region that contains the backprop
   //    to the output sequence.
diff --git a/tensorflow/stream_executor/fft.h b/tensorflow/stream_executor/fft.h
index 98cd77e2062bef45dd46e73ac29782eb12591e64..408516a416aba1982368bfcef94c823a4b08899c 100644
--- a/tensorflow/stream_executor/fft.h
+++ b/tensorflow/stream_executor/fft.h
@@ -34,8 +34,8 @@ limitations under the License.
 //     stream_exec.AsFft()->Create1dPlan(&stream, 1024, Type::kC2CForward);
 //  stream
 //    .Init()
-//    .ThenFft(plan.get(), x, &y)
-//    .BlockHostUntilDone();
+//    .ThenFft(plan.get(), x, &y);
+//  SE_CHECK_OK(stream.BlockHostUntilDone());
 //
 // By using stream operations in this manner the user can easily intermix custom
 // kernel launches (via StreamExecutor::ThenLaunch()) with these pre-canned FFT
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.cc b/tensorflow/stream_executor/host/host_gpu_executor.cc
index 0af2c8cc3d751aa35958a21c81a71496f994e1fb..542f521ef778c3a69ec9adba74405131e07bcf1a 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.cc
+++ b/tensorflow/stream_executor/host/host_gpu_executor.cc
@@ -162,7 +162,7 @@ void HostExecutor::DeallocateStream(Stream *stream) {}
 
 bool HostExecutor::CreateStreamDependency(Stream *dependent, Stream *other) {
   AsHostStream(dependent)->EnqueueTask(
-      [other]() { other->BlockHostUntilDone(); });
+      [other]() { SE_CHECK_OK(other->BlockHostUntilDone()); });
   AsHostStream(dependent)->BlockUntilDone();
   return true;
 }
@@ -177,9 +177,9 @@ bool HostExecutor::StopTimer(Stream *stream, Timer *timer) {
   return true;
 }
 
-bool HostExecutor::BlockHostUntilDone(Stream *stream) {
+port::Status HostExecutor::BlockHostUntilDone(Stream *stream) {
   AsHostStream(stream)->BlockUntilDone();
-  return true;
+  return port::Status::OK();
 }
 
 DeviceDescription *HostExecutor::PopulateDeviceDescription() const {
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.h b/tensorflow/stream_executor/host/host_gpu_executor.h
index 77b07e4a577fe321901a19369107701ec1904a80..e2c0e6d6b77130bd190b026f1eaff68d21dbf632 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.h
+++ b/tensorflow/stream_executor/host/host_gpu_executor.h
@@ -139,7 +139,7 @@ class HostExecutor : public internal::StreamExecutorInterface {
 
   bool StopTimer(Stream *stream, Timer *timer) override;
 
-  bool BlockHostUntilDone(Stream *stream) override;
+  port::Status BlockHostUntilDone(Stream *stream) override;
 
   int PlatformDeviceCount() override { return 1; }
 
diff --git a/tensorflow/stream_executor/lib/static_threadlocal.h b/tensorflow/stream_executor/lib/static_threadlocal.h
index 6e2bd0d45563644e7572f5a2fae2dd76ee6a6ca1..02720cbd261253ca9ccfafa84963526844385919 100644
--- a/tensorflow/stream_executor/lib/static_threadlocal.h
+++ b/tensorflow/stream_executor/lib/static_threadlocal.h
@@ -17,7 +17,7 @@ limitations under the License.
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_STATIC_THREADLOCAL_H_
 
 #ifdef _MSC_VER
-#define __thread __declspec(thread) 
+#define __thread __declspec(thread)
 #endif
 
 // For POD types in TLS mode, s_obj_VAR is the thread-local variable.
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index 22fd6bce78ff0e907444be7f161b27c159a75214..ba5001e273632c893b05eea64542f1b156e28c47 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -342,7 +342,7 @@ Stream &Stream::ThenBatchNormalizationForward(
 Stream &Stream::ThenBatchNormalizationBackward(
     const DeviceMemory<float> &y_backprop, const DeviceMemory<float> &x,
     const DeviceMemory<float> &scale, const DeviceMemory<float> &mean,
-    const DeviceMemory<float> &variance, const dnn::BatchDescriptor &x_desc,
+    const DeviceMemory<float> &inv_var, const dnn::BatchDescriptor &x_desc,
     const dnn::BatchDescriptor &scale_offset_desc, const double epsilon,
     DeviceMemory<float> *x_backprop, DeviceMemory<float> *scale_backprop,
     DeviceMemory<float> *offset_backprop) {
@@ -352,7 +352,7 @@ Stream &Stream::ThenBatchNormalizationBackward(
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
       CheckError(dnn->DoBatchNormalizationBackward(
-          this, y_backprop, x, scale, mean, variance, x_desc, scale_offset_desc,
+          this, y_backprop, x, scale, mean, inv_var, x_desc, scale_offset_desc,
           epsilon, x_backprop, scale_backprop, offset_backprop));
     } else {
       SetErrorAndLogNoDnnSupport();
@@ -392,7 +392,7 @@ Stream &Stream::ThenBatchNormalizationForward(
 Stream &Stream::ThenBatchNormalizationBackward(
     const DeviceMemory<Eigen::half> &y_backprop,
     const DeviceMemory<Eigen::half> &x, const DeviceMemory<float> &scale,
-    const DeviceMemory<float> &mean, const DeviceMemory<float> &variance,
+    const DeviceMemory<float> &mean, const DeviceMemory<float> &inv_var,
     const dnn::BatchDescriptor &x_desc,
     const dnn::BatchDescriptor &scale_offset_desc, const double epsilon,
     DeviceMemory<Eigen::half> *x_backprop, DeviceMemory<float> *scale_backprop,
@@ -403,7 +403,7 @@ Stream &Stream::ThenBatchNormalizationBackward(
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
       CheckError(dnn->DoBatchNormalizationBackward(
-          this, y_backprop, x, scale, mean, variance, x_desc, scale_offset_desc,
+          this, y_backprop, x, scale, mean, inv_var, x_desc, scale_offset_desc,
           epsilon, x_backprop, scale_backprop, offset_backprop));
     } else {
       SetErrorAndLogNoDnnSupport();
@@ -5055,22 +5055,24 @@ Stream &Stream::ThenEnqueueOnBackgroundThread(
   });
 }
 
-bool Stream::BlockHostUntilDone() {
+port::Status Stream::BlockHostUntilDone() {
   VLOG_CALL();
 
   if (!ok()) {
-    LOG(INFO)
-        << "stream " << this
-        << " did not block host until done; was already in an error state";
-    return false;
+    port::Status status = port::Status(
+        port::error::INTERNAL,
+        "stream did not block host until done; was already in an error state");
+    LOG(INFO) << status << " " << this;
+    return status;
   }
 
+  port::Status first_error;
   {
     // Wait until all active sub-streams have done their tasks.
     mutex_lock lock{mu_};
     for (auto &stream : sub_streams_) {
       if (!stream.second) {
-        CheckError(stream.first->BlockHostUntilDone());
+        first_error.Update(stream.first->BlockHostUntilDone());
         // Set this sub-stream as available.
         stream.second = true;
       }
@@ -5079,8 +5081,9 @@ bool Stream::BlockHostUntilDone() {
 
   temporary_memory_manager_.DeallocateFinalizedTemporaries();
 
-  CheckError(parent_->BlockHostUntilDone(this));
-  return ok();
+  first_error.Update(parent_->BlockHostUntilDone(this));
+  CheckError(first_error.ok());
+  return first_error;
 }
 
 }  // namespace gputools
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index 023cffb96510fea0cf2fc54bd609fa38cf124b0a..a2fb2ea2375d0f245ae3bf3ccb04803d01663def 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -113,7 +113,7 @@ class Stream {
 
   // Initialize the stream. This must be performed before entraining any other
   // operations.
-  Stream &Init();
+  Stream &Init() LOCKS_EXCLUDED(mu_);
 
   // Initializes timer t via the StreamExecutor.
   Stream &InitTimer(Timer *t);
@@ -124,11 +124,11 @@ class Stream {
   // Get or create a sub-stream from this stream. If there is any sub-stream in
   // the pool that can be reused then just return this sub-stream.  Otherwise
   // create a new sub-stream.
-  Stream *GetOrCreateSubStream();
+  Stream *GetOrCreateSubStream() LOCKS_EXCLUDED(mu_);
 
   // Return the sub-stream back to the host stream so that it can be reused
   // later.
-  void ReturnSubStream(Stream *sub_stream);
+  void ReturnSubStream(Stream *sub_stream) LOCKS_EXCLUDED(mu_);
 
   // Allocate temporary memories. The stream will deallocate them when blocked
   // or destroyed.
@@ -234,7 +234,7 @@ class Stream {
   Stream &ThenBatchNormalizationBackward(
       const DeviceMemory<float> &y_backprop, const DeviceMemory<float> &x,
       const DeviceMemory<float> &scale, const DeviceMemory<float> &mean,
-      const DeviceMemory<float> &variance, const dnn::BatchDescriptor &x_desc,
+      const DeviceMemory<float> &inv_var, const dnn::BatchDescriptor &x_desc,
       const dnn::BatchDescriptor &scale_offset_desc, const double epsilon,
       DeviceMemory<float> *x_backprop, DeviceMemory<float> *scale_backprop,
       DeviceMemory<float> *offset_backprop);
@@ -255,7 +255,7 @@ class Stream {
   Stream &ThenBatchNormalizationBackward(
       const DeviceMemory<Eigen::half> &y_backprop,
       const DeviceMemory<Eigen::half> &x, const DeviceMemory<float> &scale,
-      const DeviceMemory<float> &mean, const DeviceMemory<float> &variance,
+      const DeviceMemory<float> &mean, const DeviceMemory<float> &inv_var,
       const dnn::BatchDescriptor &x_desc,
       const dnn::BatchDescriptor &scale_offset_desc, const double epsilon,
       DeviceMemory<Eigen::half> *x_backprop,
@@ -1903,8 +1903,9 @@ class Stream {
   // entrained on the stream (enqueued to this point in program
   // execution) to complete.
   //
-  // Returns true if the stream is ok().
-  bool BlockHostUntilDone();
+  // Returns an OK status if the blocking was successful and the stream is ok().
+  // Otherwise returns an error describing why the blocking failed.
+  port::Status BlockHostUntilDone() LOCKS_EXCLUDED(mu_);
 
   // Warning! This method interacts with internal threads in
   // sometimes-unpredictable ways and is intended for GPU-Executor-internal
@@ -1960,14 +1961,14 @@ class Stream {
   friend struct ThenBlasImpl;  // for implementing ThenBlasXXX.
   friend class ocl::CLBlas;    // for parent_.
 
-  bool InErrorState() const {
+  bool InErrorState() const LOCKS_EXCLUDED(mu_) {
     tf_shared_lock lock{mu_};
     return !ok_;
   }
 
   // Sets the error state if operation_retcode is false.
   // This is a useful shorthand for many stream routines.
-  void CheckError(bool operation_retcode) {
+  void CheckError(bool operation_retcode) LOCKS_EXCLUDED(mu_) {
     if (operation_retcode) {
       return;
     }
diff --git a/tensorflow/stream_executor/stream_executor_internal.cc b/tensorflow/stream_executor/stream_executor_internal.cc
index 95b285b992df91eb1adc01423bb07e2298dba9c4..273d970b6fa4a581381689191b183a30f4f2bcd3 100644
--- a/tensorflow/stream_executor/stream_executor_internal.cc
+++ b/tensorflow/stream_executor/stream_executor_internal.cc
@@ -15,9 +15,6 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-#include "tensorflow/stream_executor/lib/statusor.h"
-#include "tensorflow/stream_executor/lib/stringprintf.h"
-
 namespace perftools {
 namespace gputools {
 namespace internal {
@@ -40,7 +37,6 @@ StreamExecutorFactory* MakeOpenCLExecutorImplementation() {
 
 StreamExecutorFactory MakeHostExecutorImplementation;
 
-
 }  // namespace internal
 }  // namespace gputools
 }  // namespace perftools
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index 14445a7657be10a6d3d93ef0aabebcfa17d38b72..37ef182e1445a85dd0a97eac02ba064a26dc0f1d 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -219,7 +219,7 @@ class StreamExecutorInterface {
   virtual void DeallocateTimer(Timer *timer) = 0;
   virtual bool StartTimer(Stream *stream, Timer *timer) = 0;
   virtual bool StopTimer(Stream *stream, Timer *timer) = 0;
-  virtual bool BlockHostUntilDone(Stream *stream) = 0;
+  virtual port::Status BlockHostUntilDone(Stream *stream) = 0;
   virtual int PlatformDeviceCount() = 0;
   virtual port::Status EnablePeerAccessTo(StreamExecutorInterface *other) = 0;
   virtual bool CanEnablePeerAccessTo(StreamExecutorInterface *other) = 0;
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index 76afb85068bafb805678a9bc03b55b2efa1523c6..afca1c2e597b55b1b8d0b76d4e79995d6f6af822 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -432,8 +432,8 @@ bool StreamExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
   return implementation_->Launch(stream, thread_dims, block_dims, kernel, args);
 }
 
-bool StreamExecutor::BlockHostUntilDone(Stream *stream) {
-  bool result;
+port::Status StreamExecutor::BlockHostUntilDone(Stream *stream) {
+  port::Status result;
   SCOPED_TRACE(TraceListener::BlockHostUntilDone, &result, stream);
 
   result = implementation_->BlockHostUntilDone(stream);
@@ -566,19 +566,18 @@ port::Status StreamExecutor::SynchronousMemcpyD2H(
           << device_src.opaque() << ", size=" << size
           << ", host_dst=" << host_dst << ")" << StackTraceIfVLOG10();
 
-  port::Status result{port::Status::OK()};
+  port::Status result;
   SCOPED_TRACE(TraceListener::SynchronousMemcpyD2H, &result, device_src, size,
                host_dst);
 
-  port::Status status =
-      implementation_->SynchronousMemcpy(host_dst, device_src, size);
-  if (!status.ok()) {
-    return port::Status{port::error::INTERNAL,
-                        port::Printf("failed to synchronously memcpy "
-                                     "device-to-host: device %p to host %p "
-                                     "size %lld: %s",
-                                     device_src.opaque(), host_dst, size,
-                                     status.ToString().c_str())};
+  result = implementation_->SynchronousMemcpy(host_dst, device_src, size);
+  if (!result.ok()) {
+    result = port::Status{port::error::INTERNAL,
+                          port::Printf("failed to synchronously memcpy "
+                                       "device-to-host: device %p to host %p "
+                                       "size %lld: %s",
+                                       device_src.opaque(), host_dst, size,
+                                       result.ToString().c_str())};
   }
 
   return result;
@@ -590,19 +589,18 @@ port::Status StreamExecutor::SynchronousMemcpyH2D(
           << ", size=" << size << ", device_dst" << device_dst->opaque() << ")"
           << StackTraceIfVLOG10();
 
-  port::Status result{port::Status::OK()};
+  port::Status result;
   SCOPED_TRACE(TraceListener::SynchronousMemcpyH2D, &result, host_src, size,
                device_dst);
 
-  port::Status status =
-      implementation_->SynchronousMemcpy(device_dst, host_src, size);
-  if (!status.ok()) {
+  result = implementation_->SynchronousMemcpy(device_dst, host_src, size);
+  if (!result.ok()) {
     result = port::Status{
         port::error::INTERNAL,
         port::Printf("failed to synchronously memcpy host-to-device: host "
                      "%p to device %p size %lld: %s",
                      host_src, device_dst->opaque(), size,
-                     status.ToString().c_str())};
+                     result.ToString().c_str())};
   }
 
   return result;
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index 66c50d47e95fe4e9bf6df24cd61139630000cefb..a2a77218cbbafeeb9d4d8ca04b2e0a8a5024ebf9 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -481,7 +481,7 @@ class StreamExecutor {
   // Causes the host code to synchronously wait for operations entrained onto
   // stream to complete. Effectively a join on the asynchronous device
   // operations enqueued on the stream before this program point.
-  bool BlockHostUntilDone(Stream *stream);
+  port::Status BlockHostUntilDone(Stream *stream);
 
   // Synchronously allocates size bytes on the underlying platform and returns
   // an opaque void* representing that allocation. In the case of failure,
diff --git a/tensorflow/stream_executor/trace_listener.h b/tensorflow/stream_executor/trace_listener.h
index 88c54f982b3cfde925dbe0ca4f7bc3a738e5f3ac..d1e87c348b1f867009fdb6b741d984b2f58cef21 100644
--- a/tensorflow/stream_executor/trace_listener.h
+++ b/tensorflow/stream_executor/trace_listener.h
@@ -65,7 +65,8 @@ class TraceListener {
                                             const port::Status* result) {}
 
   virtual void BlockHostUntilDoneBegin(int64 correlation_id, Stream* stream) {}
-  virtual void BlockHostUntilDoneComplete(int64 correlation_id, bool result) {}
+  virtual void BlockHostUntilDoneComplete(int64 correlation_id,
+                                          const port::Status* result) {}
 };
 
 }  // namespace gputools
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index a3ba363469c0a9251ac9325d376001beae6ff98a..00328477c9ce94d0c99f4f59e9860e49a481d1d3 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -1,45 +1,43 @@
 # -*- Python -*-
 
-
 # Return the options to use for a C++ library or binary build.
 # Uses the ":optmode" config_setting to pick the options.
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
     "tf_cuda_tests_tags",
     "tf_sycl_tests_tags",
+    "tf_additional_grpc_deps_py",
     "tf_additional_xla_deps_py",
-    "if_static",)
+    "if_static",
+)
 load(
     "@local_config_cuda//cuda:build_defs.bzl",
     "if_cuda",
-    "cuda_default_copts",)
-
+    "cuda_default_copts",
+)
 load(
     "//third_party/mkl:build_defs.bzl",
-    "if_mkl",)
+    "if_mkl",
+)
 
 def register_extension_info(**kwargs):
     pass
 
-
 # Given a source file, generate a test name.
 # i.e. "common_runtime/direct_session_test.cc" becomes
 #      "common_runtime_direct_session_test"
 def src_to_test_name(src):
   return src.replace("/", "_").split(".")[0]
 
-
 def full_path(relative_paths):
   return [PACKAGE_NAME + "/" + relative for relative in relative_paths]
 
-
 # List of proto files for android builds
 def tf_android_core_proto_sources(core_proto_sources_relative):
   return [
       "//tensorflow/core:" + p for p in core_proto_sources_relative
   ]
 
-
 # Returns the list of pb.h and proto.h headers that are generated for
 # tf_android_core_proto_sources().
 def tf_android_core_proto_headers(core_proto_sources_relative):
@@ -51,13 +49,11 @@ def tf_android_core_proto_headers(core_proto_sources_relative):
       for p in core_proto_sources_relative
   ])
 
-
 # Sanitize a dependency so that it works correctly from code that includes
 # TensorFlow as a submodule.
 def clean_dep(dep):
   return str(Label(dep))
 
-
 def if_android_x86(a):
   return select({
       clean_dep("//tensorflow:android_x86"): a,
@@ -65,35 +61,30 @@ def if_android_x86(a):
       "//conditions:default": [],
   })
 
-
 def if_android_arm(a):
   return select({
       clean_dep("//tensorflow:android_arm"): a,
       "//conditions:default": [],
   })
 
-
 def if_android_arm64(a):
   return select({
       clean_dep("//tensorflow:android_arm64"): a,
       "//conditions:default": [],
   })
 
-
 def if_android_mips(a):
   return select({
       clean_dep("//tensorflow:android_mips"): a,
       "//conditions:default": [],
   })
 
-
 def if_not_android(a):
   return select({
       clean_dep("//tensorflow:android"): [],
       "//conditions:default": a,
   })
 
-
 def if_not_android_mips_and_mips64(a):
   return select({
       clean_dep("//tensorflow:android_mips"): [],
@@ -101,20 +92,23 @@ def if_not_android_mips_and_mips64(a):
       "//conditions:default": a,
   })
 
-
 def if_android(a):
   return select({
       clean_dep("//tensorflow:android"): a,
       "//conditions:default": [],
   })
 
-
 def if_ios(a):
   return select({
       clean_dep("//tensorflow:ios"): a,
       "//conditions:default": [],
   })
 
+def if_ios_x86_64(a):
+  return select({
+      clean_dep("//tensorflow:ios_x86_64"): a,
+      "//conditions:default": [],
+  })
 
 def if_mobile(a):
   return select({
@@ -123,7 +117,6 @@ def if_mobile(a):
       "//conditions:default": [],
   })
 
-
 def if_not_mobile(a):
   return select({
       clean_dep("//tensorflow:android"): [],
@@ -131,7 +124,6 @@ def if_not_mobile(a):
       "//conditions:default": a,
   })
 
-
 def if_not_windows(a):
   return select({
       clean_dep("//tensorflow:windows"): [],
@@ -139,6 +131,12 @@ def if_not_windows(a):
       "//conditions:default": a,
   })
 
+def if_windows(a):
+  return select({
+      clean_dep("//tensorflow:windows"): a,
+      clean_dep("//tensorflow:windows_msvc"): a,
+      "//conditions:default": [],
+  })
 
 def if_linux_x86_64(a):
   return select({
@@ -152,22 +150,40 @@ def if_darwin(a):
       "//conditions:default": [],
   })
 
-WIN_COPTS = [
-    "/DLANG_CXX11",
-    "/D__VERSION__=\\\"MSVC\\\"",
-    "/DPLATFORM_WINDOWS",
-    "/DTF_COMPILE_LIBRARY",
-    "/DEIGEN_HAS_C99_MATH",
-    "/DTENSORFLOW_USE_EIGEN_THREADPOOL",
-    "/DEIGEN_AVOID_STL_ARRAY",
-    "/Iexternal/gemmlowp",
-    "/wd4018", # -Wno-sign-compare
-    "/U_HAS_EXCEPTIONS", "/D_HAS_EXCEPTIONS=1", "/EHsc", # -fno-exceptions
-    "/DNOGDI",
-]
+def get_win_copts(is_external=False):
+    WINDOWS_COPTS = [
+        "/DLANG_CXX11",
+        "/D__VERSION__=\\\"MSVC\\\"",
+        "/DPLATFORM_WINDOWS",
+        "/DEIGEN_HAS_C99_MATH",
+        "/DTENSORFLOW_USE_EIGEN_THREADPOOL",
+        "/DEIGEN_AVOID_STL_ARRAY",
+        "/Iexternal/gemmlowp",
+        "/wd4018",  # -Wno-sign-compare
+        "/U_HAS_EXCEPTIONS",
+        "/D_HAS_EXCEPTIONS=1",
+        "/EHsc",  # -fno-exceptions
+        "/DNOGDI",
+    ]
+    if is_external:
+      return WINDOWS_COPTS + ["/UTF_COMPILE_LIBRARY"]
+    else:
+      return WINDOWS_COPTS + ["/DTF_COMPILE_LIBRARY"]
 
 # LINT.IfChange
-def tf_copts():
+def tf_copts(android_optimization_level_override="-O2", is_external=False):
+  # For compatibility reasons, android_optimization_level_override
+  # is currently only being set for Android.
+  # To clear this value, and allow the CROSSTOOL default
+  # to be used, pass android_optimization_level_override=None
+  android_copts = [
+      "-std=c++11",
+      "-DTF_LEAN_BINARY",
+      "-Wno-narrowing",
+      "-fomit-frame-pointer",
+  ]
+  if android_optimization_level_override:
+    android_copts.append(android_optimization_level_override)
   return (
       if_not_windows([
           "-DEIGEN_AVOID_STL_ARRAY",
@@ -179,22 +195,20 @@ def tf_copts():
       + if_mkl(["-DINTEL_MKL=1", "-DEIGEN_USE_VML", "-fopenmp",])
       + if_android_arm(["-mfpu=neon"])
       + if_linux_x86_64(["-msse3"])
+      + if_ios_x86_64(["-msse4.1"])
+      + select({
+            "//tensorflow:framework_shared_object": [],
+            "//conditions:default": ["-DTENSORFLOW_MONOLITHIC_BUILD"],
+      })
       + select({
-            clean_dep("//tensorflow:android"): [
-                "-std=c++11",
-                "-DTF_LEAN_BINARY",
-                "-O2",
-                "-Wno-narrowing",
-                "-fomit-frame-pointer",
-            ],
+            clean_dep("//tensorflow:android"): android_copts,
             clean_dep("//tensorflow:darwin"): [],
-            clean_dep("//tensorflow:windows"): WIN_COPTS,
-            clean_dep("//tensorflow:windows_msvc"): WIN_COPTS,
+            clean_dep("//tensorflow:windows"): get_win_copts(is_external),
+            clean_dep("//tensorflow:windows_msvc"): get_win_copts(is_external),
             clean_dep("//tensorflow:ios"): ["-std=c++11"],
             "//conditions:default": ["-pthread"]
       }))
 
-
 def tf_opts_nortti_if_android():
   return if_android([
       "-fno-rtti",
@@ -202,13 +216,11 @@ def tf_opts_nortti_if_android():
       "-DGOOGLE_PROTOBUF_NO_STATIC_INITIALIZER",
   ])
 
-
 # LINT.ThenChange(//tensorflow/contrib/android/cmake/CMakeLists.txt)
 
-
 # Given a list of "op_lib_names" (a list of files in the ops directory
 # without their .cc extensions), generate a library for that file.
-def tf_gen_op_libs(op_lib_names, deps=None):
+def tf_gen_op_libs(op_lib_names, deps=None, is_external=True):
   # Make library out of each op so it can also be used to generate wrappers
   # for various languages.
   if not deps:
@@ -216,20 +228,18 @@ def tf_gen_op_libs(op_lib_names, deps=None):
   for n in op_lib_names:
     native.cc_library(
         name=n + "_op_lib",
-        copts=tf_copts(),
+        copts=tf_copts(is_external=is_external),
         srcs=["ops/" + n + ".cc"],
         deps=deps + [clean_dep("//tensorflow/core:framework")],
         visibility=["//visibility:public"],
         alwayslink=1,
         linkstatic=1,)
 
-
 def _make_search_paths(prefix, levels_to_root):
   return ",".join(
       ["-rpath,%s/%s" % (prefix, "/".join([".."] * search_level))
        for search_level in range(levels_to_root + 1)])
 
-
 def _rpath_linkopts(name):
   # Search parent directories up to the TensorFlow root directory for shared
   # object dependencies, even if this op shared object is deeply nested
@@ -248,7 +258,6 @@ def _rpath_linkopts(name):
       ],
   })
 
-
 # Bazel-generated shared objects which must be linked into TensorFlow binaries
 # to define symbols from //tensorflow/core:framework and //tensorflow/core:lib.
 def tf_binary_additional_srcs():
@@ -258,7 +267,6 @@ def tf_binary_additional_srcs():
           clean_dep("//tensorflow:libtensorflow_framework.so"),
       ])
 
-
 def tf_cc_shared_object(
     name,
     srcs=[],
@@ -281,9 +289,9 @@ def tf_cc_shared_object(
       **kwargs)
 
 register_extension_info(
-    extension_name="tf_cc_shared_object",
-    label_regex_for_dep="{extension_name}")
-
+    extension_name = "tf_cc_shared_object",
+    label_regex_for_dep = "{extension_name}",
+)
 
 # Links in the framework shared object
 # (//third_party/tensorflow:libtensorflow_framework.so) when not building
@@ -293,9 +301,11 @@ def tf_cc_binary(name,
                  srcs=[],
                  deps=[],
                  linkopts=[],
+                 copts=tf_copts(),
                  **kwargs):
   native.cc_binary(
       name=name,
+      copts=copts,
       srcs=srcs + tf_binary_additional_srcs(),
       deps=deps + if_mkl(
           [
@@ -306,9 +316,9 @@ def tf_cc_binary(name,
       **kwargs)
 
 register_extension_info(
-    extension_name="tf_cc_binary",
-    label_regex_for_dep="{extension_name}.*")
-
+    extension_name = "tf_cc_binary",
+    label_regex_for_dep = "{extension_name}.*",
+)
 
 def tf_gen_op_wrapper_cc(name,
                          out_ops_file,
@@ -316,7 +326,9 @@ def tf_gen_op_wrapper_cc(name,
                          op_gen=clean_dep("//tensorflow/cc:cc_op_gen_main"),
                          deps=None,
                          override_file=None,
-                         include_internal_ops=0):
+                         include_internal_ops=0,
+                         # ApiDefs will be loaded in the order specified in this list.
+                         api_def_srcs=[]):
   # Construct an op generator binary for these ops.
   tool = out_ops_file + "_gen_cc"
   if deps == None:
@@ -324,16 +336,31 @@ def tf_gen_op_wrapper_cc(name,
   tf_cc_binary(
       name=tool,
       copts=tf_copts(),
-      linkopts=["-lm"],
+      linkopts=if_not_windows(["-lm"]),
       linkstatic=1,  # Faster to link this one-time-use binary dynamically
       deps=[op_gen] + deps)
 
+  srcs = api_def_srcs[:]
+
   if override_file == None:
-    srcs = []
     override_arg = ","
   else:
-    srcs = [override_file]
+    srcs += [override_file]
     override_arg = "$(location " + override_file + ")"
+
+  if not api_def_srcs:
+    api_def_args_str = ","
+  else:
+    api_def_args = []
+    for api_def_src in api_def_srcs:
+      # Add directory of the first ApiDef source to args.
+      # We are assuming all ApiDefs in a single api_def_src are in the
+      # same directory.
+      api_def_args.append(
+          " $$(dirname $$(echo $(locations " + api_def_src +
+          ") | cut -d\" \" -f1))")
+    api_def_args_str = ",".join(api_def_args)
+
   native.genrule(
       name=name + "_genrule",
       outs=[
@@ -344,8 +371,7 @@ def tf_gen_op_wrapper_cc(name,
       tools=[":" + tool] + tf_binary_additional_srcs(),
       cmd=("$(location :" + tool + ") $(location :" + out_ops_file + ".h) " +
            "$(location :" + out_ops_file + ".cc) " + override_arg + " " +
-           str(include_internal_ops)))
-
+           str(include_internal_ops) + " " + api_def_args_str))
 
 # Given a list of "op_lib_names" (a list of files in the ops directory
 # without their .cc extensions), generate individual C++ .cc and .h
@@ -387,7 +413,9 @@ def tf_gen_op_wrappers_cc(name,
                           op_gen=clean_dep("//tensorflow/cc:cc_op_gen_main"),
                           override_file=None,
                           include_internal_ops=0,
-                          visibility=None):
+                          visibility=None,
+                          # ApiDefs will be loaded in the order apecified in this list.
+                          api_def_srcs=[]):
   subsrcs = other_srcs[:]
   subhdrs = other_hdrs[:]
   internalsrcs = []
@@ -399,7 +427,8 @@ def tf_gen_op_wrappers_cc(name,
         pkg=pkg,
         op_gen=op_gen,
         override_file=override_file,
-        include_internal_ops=include_internal_ops)
+        include_internal_ops=include_internal_ops,
+        api_def_srcs=api_def_srcs)
     subsrcs += ["ops/" + n + ".cc"]
     subhdrs += ["ops/" + n + ".h"]
     internalsrcs += ["ops/" + n + "_internal.cc"]
@@ -436,7 +465,6 @@ def tf_gen_op_wrappers_cc(name,
       alwayslink=1,
       visibility=[clean_dep("//tensorflow:internal")])
 
-
 # Generates a Python library target wrapping the ops registered in "deps".
 #
 # Args:
@@ -457,6 +485,8 @@ def tf_gen_op_wrappers_cc(name,
 #     "name" arg)
 #   op_whitelist: if not empty, only op names in this list will be wrapped. It
 #     is invalid to specify both "hidden" and "op_whitelist".
+#   cc_linkopts: Optional linkopts to be added to tf_cc_binary that contains the
+#     specified ops.
 def tf_gen_op_wrapper_py(name,
                          out=None,
                          hidden=None,
@@ -465,7 +495,9 @@ def tf_gen_op_wrapper_py(name,
                          require_shape_functions=False,
                          hidden_file=None,
                          generated_target_name=None,
-                         op_whitelist=[]):
+                         op_whitelist=[],
+                         cc_linkopts=[],
+                         api_def_srcs=[]):
   if (hidden or hidden_file) and op_whitelist:
     fail('Cannot pass specify both hidden and op_whitelist.')
 
@@ -475,7 +507,7 @@ def tf_gen_op_wrapper_py(name,
     deps = [str(Label("//tensorflow/core:" + name + "_op_lib"))]
   tf_cc_binary(
       name=tool_name,
-      linkopts=["-lm"],
+      linkopts=if_not_windows(["-lm"]) + cc_linkopts,
       copts=tf_copts(),
       linkstatic=1,  # Faster to link this one-time-use binary dynamically
       deps=([
@@ -498,22 +530,39 @@ def tf_gen_op_wrapper_py(name,
     op_list_arg = "''"
     op_list_is_whitelist = False
 
+  # Prepare ApiDef directories to pass to the genrule.
+  if not api_def_srcs:
+    api_def_args_str = ","
+  else:
+    api_def_args = []
+    for api_def_src in api_def_srcs:
+      # Add directory of the first ApiDef source to args.
+      # We are assuming all ApiDefs in a single api_def_src are in the
+      # same directory.
+      api_def_args.append(
+          "$$(dirname $$(echo $(locations " + api_def_src +
+          ") | cut -d\" \" -f1))")
+    api_def_args_str = ",".join(api_def_args)
+
   if hidden_file:
     # `hidden_file` is file containing a list of op names to be hidden in the
     # generated module.
     native.genrule(
         name=name + "_pygenrule",
         outs=[out],
-        srcs=[hidden_file],
+        srcs=api_def_srcs + [hidden_file],
         tools=[tool_name] + tf_binary_additional_srcs(),
-        cmd=("$(location " + tool_name + ") @$(location " + hidden_file + ") " +
+        cmd=("$(location " + tool_name + ") " + api_def_args_str +
+             " @$(location " + hidden_file + ") " +
              ("1" if require_shape_functions else "0") + " > $@"))
   else:
     native.genrule(
         name=name + "_pygenrule",
         outs=[out],
+        srcs=api_def_srcs,
         tools=[tool_name] + tf_binary_additional_srcs(),
-        cmd=("$(location " + tool_name + ") " + op_list_arg + " " +
+        cmd=("$(location " + tool_name + ") " + api_def_args_str + " " +
+             op_list_arg + " " +
              ("1" if require_shape_functions else "0") + " " +
              ("1" if op_list_is_whitelist else "0") + " > $@"))
 
@@ -529,7 +578,6 @@ def tf_gen_op_wrapper_py(name,
           clean_dep("//tensorflow/python:framework_for_generated_wrappers_v2"),
       ],)
 
-
 # Define a bazel macro that creates cc_test for tensorflow.
 #
 # Links in the framework shared object
@@ -552,7 +600,7 @@ def tf_cc_test(name,
       name="%s%s" % (name, suffix),
       srcs=srcs + tf_binary_additional_srcs(),
       copts=tf_copts() + extra_copts,
-      linkopts=["-lpthread", "-lm"] + linkopts + _rpath_linkopts(name),
+      linkopts=if_not_windows(["-lpthread", "-lm"]) + linkopts + _rpath_linkopts(name),
       deps=deps + if_mkl(
           [
               "//third_party/mkl:intel_binary_blob",
@@ -572,9 +620,9 @@ def tf_cc_test(name,
       **kwargs)
 
 register_extension_info(
-    extension_name="tf_cc_test",
-    label_regex_for_dep="{extension_name}.*")
-
+    extension_name = "tf_cc_test",
+    label_regex_for_dep = "{extension_name}.*",
+)
 
 # Part of the testing workflow requires a distinguishable name for the build
 # rules that involve a GPU, even if otherwise identical to the base rule.
@@ -599,9 +647,9 @@ def tf_cc_test_gpu(name,
       args=args)
 
 register_extension_info(
-    extension_name="tf_cc_test_gpu",
-    label_regex_for_dep="{extension_name}")
-
+    extension_name = "tf_cc_test_gpu",
+    label_regex_for_dep = "{extension_name}",
+)
 
 def tf_cuda_cc_test(name,
                     srcs=[],
@@ -643,9 +691,9 @@ def tf_cuda_cc_test(name,
       args=args)
 
 register_extension_info(
-    extension_name="tf_cuda_cc_test",
-    label_regex_for_dep="{extension_name}")
-
+    extension_name = "tf_cuda_cc_test",
+    label_regex_for_dep = "{extension_name}",
+)
 
 def tf_cuda_only_cc_test(name,
                     srcs=[],
@@ -666,7 +714,7 @@ def tf_cuda_only_cc_test(name,
       deps=deps + if_cuda([
           clean_dep("//tensorflow/core:cuda"),
           clean_dep("//tensorflow/core:gpu_lib")]),
-      linkopts=["-lpthread", "-lm"] + linkopts + _rpath_linkopts(name),
+      linkopts=if_not_windows(["-lpthread", "-lm"]) + linkopts + _rpath_linkopts(name),
       linkstatic=linkstatic or select({
           # cc_tests with ".so"s in srcs incorrectly link on Darwin
           # unless linkstatic=1.
@@ -677,9 +725,9 @@ def tf_cuda_only_cc_test(name,
       tags=tags + tf_cuda_tests_tags())
 
 register_extension_info(
-    extension_name="tf_cuda_only_cc_test",
-    label_regex_for_dep="{extension_name}_gpu")
-
+    extension_name = "tf_cuda_only_cc_test",
+    label_regex_for_dep = "{extension_name}_gpu",
+)
 
 # Create a cc_test for each of the tensorflow tests listed in "tests"
 def tf_cc_tests(srcs,
@@ -703,7 +751,6 @@ def tf_cc_tests(srcs,
         linkopts=linkopts,
         nocopts=nocopts)
 
-
 def tf_cc_test_mkl(srcs,
                    deps,
                    name="",
@@ -713,7 +760,6 @@ def tf_cc_test_mkl(srcs,
                    args=None):
   if_mkl(tf_cc_tests(srcs, deps, name, linkstatic=linkstatic, tags=tags, size=size, args=args, nocopts="-fno-exceptions"))
 
-
 def tf_cc_tests_gpu(srcs,
                     deps,
                     name="",
@@ -723,7 +769,6 @@ def tf_cc_tests_gpu(srcs,
                     args=None):
   tf_cc_tests(srcs, deps, linkstatic, tags=tags, size=size, args=args)
 
-
 def tf_cuda_cc_tests(srcs,
                      deps,
                      name="",
@@ -756,9 +801,9 @@ def tf_java_test(name,
       **kwargs)
 
 register_extension_info(
-    extension_name="tf_java_test",
-    label_regex_for_dep="{extension_name}")
-
+    extension_name = "tf_java_test",
+    label_regex_for_dep = "{extension_name}",
+)
 
 def _cuda_copts():
   """Gets the appropriate set of copts for (maybe) CUDA compilation.
@@ -778,10 +823,8 @@ def _cuda_copts():
       ]),
   })
 
-
 # Build defs for TensorFlow kernels
 
-
 # When this target is built using --config=cuda, a cc_library is built
 # that passes -DGOOGLE_CUDA=1 and '-x cuda', linking in additional
 # libraries needed by GPU kernels.
@@ -805,11 +848,11 @@ def tf_gpu_kernel_library(srcs,
       **kwargs)
 
 register_extension_info(
-    extension_name="tf_gpu_kernel_library",
-    label_regex_for_dep="{extension_name}")
-
+    extension_name = "tf_gpu_kernel_library",
+    label_regex_for_dep = "{extension_name}",
+)
 
-def tf_cuda_library(deps=None, cuda_deps=None, copts=None, **kwargs):
+def tf_cuda_library(deps=None, cuda_deps=None, copts=tf_copts(), **kwargs):
   """Generate a cc_library with a conditional set of CUDA dependencies.
 
   When the library is built with --config=cuda:
@@ -829,8 +872,6 @@ def tf_cuda_library(deps=None, cuda_deps=None, copts=None, **kwargs):
     deps = []
   if not cuda_deps:
     cuda_deps = []
-  if not copts:
-    copts = []
 
   native.cc_library(
       deps=deps + if_cuda(cuda_deps + [
@@ -841,10 +882,9 @@ def tf_cuda_library(deps=None, cuda_deps=None, copts=None, **kwargs):
       **kwargs)
 
 register_extension_info(
-    extension_name="tf_cuda_library",
-    label_regex_for_dep="{extension_name}")
-
-
+    extension_name = "tf_cuda_library",
+    label_regex_for_dep = "{extension_name}",
+)
 
 def tf_kernel_library(name,
                       prefix=None,
@@ -853,7 +893,8 @@ def tf_kernel_library(name,
                       hdrs=None,
                       deps=None,
                       alwayslink=1,
-                      copts=tf_copts(),
+                      copts=None,
+                      is_external=False,
                       **kwargs):
   """A rule to build a TensorFlow OpKernel.
 
@@ -882,7 +923,8 @@ def tf_kernel_library(name,
     hdrs = []
   if not deps:
     deps = []
-
+  if not copts:
+    copts = tf_copts(is_external=is_external)
   if prefix:
     if native.glob([prefix + "*.cu.cc"], exclude=["*test*"]):
       if not gpu_srcs:
@@ -915,9 +957,9 @@ def tf_kernel_library(name,
       **kwargs)
 
 register_extension_info(
-    extension_name="tf_kernel_library",
-    label_regex_for_dep="{extension_name}(_gpu)?")
-
+    extension_name = "tf_kernel_library",
+    label_regex_for_dep = "{extension_name}(_gpu)?",
+)
 
 def tf_mkl_kernel_library(name,
                           prefix=None,
@@ -956,9 +998,9 @@ def tf_mkl_kernel_library(name,
       ))
 
 register_extension_info(
-    extension_name="tf_mkl_kernel_library",
-    label_regex_for_dep="{extension_name}")
-
+    extension_name = "tf_mkl_kernel_library",
+    label_regex_for_dep = "{extension_name}",
+)
 
 # Bazel rules for building swig files.
 def _py_wrap_cc_impl(ctx):
@@ -992,44 +1034,41 @@ def _py_wrap_cc_impl(ctx):
       progress_message="SWIGing " + src.path)
   return struct(files=depset(outputs))
 
-
 _py_wrap_cc = rule(
-    attrs={
-        "srcs":
-            attr.label_list(
-                mandatory=True,
-                allow_files=True,),
-        "swig_includes":
-            attr.label_list(
-                cfg="data",
-                allow_files=True,),
-        "deps":
-            attr.label_list(
-                allow_files=True,
-                providers=["cc"],),
-        "toolchain_deps":
-            attr.label_list(
-                allow_files=True,),
-        "module_name":
-            attr.string(mandatory=True),
-        "py_module_name":
-            attr.string(mandatory=True),
-        "_swig":
-            attr.label(
-                default=Label("@swig//:swig"),
-                executable=True,
-                cfg="host",),
-        "_swiglib":
-            attr.label(
-                default=Label("@swig//:templates"),
-                allow_files=True,),
+    attrs = {
+        "srcs": attr.label_list(
+            mandatory = True,
+            allow_files = True,
+        ),
+        "swig_includes": attr.label_list(
+            cfg = "data",
+            allow_files = True,
+        ),
+        "deps": attr.label_list(
+            allow_files = True,
+            providers = ["cc"],
+        ),
+        "toolchain_deps": attr.label_list(
+            allow_files = True,
+        ),
+        "module_name": attr.string(mandatory = True),
+        "py_module_name": attr.string(mandatory = True),
+        "_swig": attr.label(
+            default = Label("@swig//:swig"),
+            executable = True,
+            cfg = "host",
+        ),
+        "_swiglib": attr.label(
+            default = Label("@swig//:templates"),
+            allow_files = True,
+        ),
     },
-    outputs={
+    outputs = {
         "cc_out": "%{module_name}.cc",
         "py_out": "%{py_module_name}.py",
     },
-    implementation=_py_wrap_cc_impl,)
-
+    implementation = _py_wrap_cc_impl,
+)
 
 def _get_repository_roots(ctx, files):
   """Returns abnormal root directories under which files reside.
@@ -1060,7 +1099,6 @@ def _get_repository_roots(ctx, files):
       result[root] -= 1
   return [k for v, k in sorted([(v, k) for k, v in result.items()])]
 
-
 # Bazel rule for collecting the header files that a target depends on.
 def _transitive_hdrs_impl(ctx):
   outputs = depset()
@@ -1068,21 +1106,20 @@ def _transitive_hdrs_impl(ctx):
     outputs += dep.cc.transitive_headers
   return struct(files=outputs)
 
-
 _transitive_hdrs = rule(
-    attrs={
+    attrs = {
         "deps": attr.label_list(
-            allow_files=True,
-            providers=["cc"],),
+            allow_files = True,
+            providers = ["cc"],
+        ),
     },
-    implementation=_transitive_hdrs_impl,)
-
+    implementation = _transitive_hdrs_impl,
+)
 
 def transitive_hdrs(name, deps=[], **kwargs):
   _transitive_hdrs(name=name + "_gather", deps=deps)
   native.filegroup(name=name, srcs=[":" + name + "_gather"])
 
-
 # Create a header only library that includes all the headers exported by
 # the libraries in deps.
 def cc_header_only_library(name, deps=[], includes=[], **kwargs):
@@ -1108,7 +1145,6 @@ def cc_header_only_library(name, deps=[], includes=[], **kwargs):
                     includes=includes,
                     **kwargs)
 
-
 def tf_custom_op_library_additional_deps():
   return [
       "@protobuf_archive//:protobuf_headers",
@@ -1117,7 +1153,6 @@ def tf_custom_op_library_additional_deps():
       clean_dep("//tensorflow/core:framework_headers_lib"),
   ]
 
-
 # Traverse the dependency graph along the "deps" attribute of the
 # target and return a struct with one field called 'tf_collected_deps'.
 # tf_collected_deps will be the union of the deps of the current target
@@ -1131,16 +1166,15 @@ def _collect_deps_aspect_impl(target, ctx):
         alldeps = alldeps | dep.tf_collected_deps
   return struct(tf_collected_deps=alldeps)
 
-
 collect_deps_aspect = aspect(
-    implementation=_collect_deps_aspect_impl, attr_aspects=["deps"])
-
+    attr_aspects = ["deps"],
+    implementation = _collect_deps_aspect_impl,
+)
 
 def _dep_label(dep):
   label = dep.label
   return label.package + ":" + label.name
 
-
 # This rule checks that the transitive dependencies of targets listed
 # in the 'deps' attribute don't depend on the targets listed in
 # the 'disallowed_deps' attribute.
@@ -1157,22 +1191,24 @@ def _check_deps_impl(ctx):
                   disallowed_dep))
   return struct()
 
-
 check_deps = rule(
     _check_deps_impl,
-    attrs={
-        "deps":
-            attr.label_list(
-                aspects=[collect_deps_aspect], mandatory=True,
-                allow_files=True),
-        "disallowed_deps":
-            attr.label_list(mandatory=True, allow_files=True)
-    },)
-
+    attrs = {
+        "deps": attr.label_list(
+            aspects = [collect_deps_aspect],
+            mandatory = True,
+            allow_files = True,
+        ),
+        "disallowed_deps": attr.label_list(
+            mandatory = True,
+            allow_files = True,
+        ),
+    },
+)
 
 # Helper to build a dynamic library (.so) from the sources containing
 # implementations of custom ops and kernels.
-def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[]):
+def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[], linkopts=[]):
   cuda_deps = [
       clean_dep("//tensorflow/core:stream_executor_headers_lib"),
       "@local_config_cuda//cuda:cuda_headers",
@@ -1200,8 +1236,8 @@ def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[]):
       srcs=srcs,
       deps=deps + if_cuda(cuda_deps),
       data=[name + "_check_deps"],
-      copts=tf_copts(),
-      linkopts=select({
+      copts=tf_copts(is_external=True),
+      linkopts=linkopts + select({
           "//conditions:default": [
               "-lm",
           ],
@@ -1209,9 +1245,9 @@ def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[]):
       }),)
 
 register_extension_info(
-    extension_name="tf_custom_op_library",
-    label_regex_for_dep="{extension_name}")
-
+    extension_name = "tf_custom_op_library",
+    label_regex_for_dep = "{extension_name}",
+)
 
 def tf_custom_op_py_library(name,
                             srcs=[],
@@ -1230,18 +1266,16 @@ def tf_custom_op_py_library(name,
       deps=deps,)
 
 register_extension_info(
-    extension_name="tf_custom_op_py_library",
-    label_regex_for_dep="{extension_name}")
-
+    extension_name = "tf_custom_op_py_library",
+    label_regex_for_dep = "{extension_name}",
+)
 
 def tf_extension_linkopts():
   return []  # No extension link opts
 
-
 def tf_extension_copts():
   return []  # No extension c opts
 
-
 def tf_py_wrap_cc(name,
                              srcs,
                              swig_includes=[],
@@ -1309,19 +1343,39 @@ def tf_py_wrap_cc(name,
           "//conditions:default": [":" + cc_library_name],
       }))
 
-
-def py_test(deps=[], **kwargs):
+# This macro is for running python tests against system installed pip package
+# on Windows.
+#
+# py_test is built as an exectuable python zip file on Windows, which contains all
+# dependencies of the target. Because of the C++ extensions, it would be very
+# inefficient if the py_test zips all runfiles, plus we don't need them when running
+# tests against system installed pip package. So we'd like to get rid of the deps
+# of py_test in this case.
+#
+# In order to trigger the tests without bazel clean after getting rid of deps,
+# we introduce the following :
+# 1. When --define=no_tensorflow_py_deps=true, the py_test depends on a marker
+#    file of the pip package, the test gets to rerun when the pip package change.
+#    Note that this only works on Windows. See the definition of
+#    //tensorflow/tools/pip_package:win_pip_package_marker for specific reasons.
+# 2. When --define=no_tensorflow_py_deps=false (by default), it's a normal py_test.
+def py_test(deps=[], data=[], **kwargs):
   native.py_test(
       deps=select({
           "//conditions:default": deps,
-          clean_dep("//tensorflow:no_tensorflow_py_deps"): []
+          clean_dep("//tensorflow:no_tensorflow_py_deps"): [],
+      }),
+      data = data + select({
+          "//conditions:default": [],
+          clean_dep("//tensorflow:no_tensorflow_py_deps"):
+          ["//tensorflow/tools/pip_package:win_pip_package_marker"],
       }),
       **kwargs)
 
 register_extension_info(
-    extension_name="py_test",
-    label_regex_for_dep="{extension_name}")
-
+    extension_name = "py_test",
+    label_regex_for_dep = "{extension_name}",
+)
 
 def tf_py_test(name,
                srcs,
@@ -1333,10 +1387,13 @@ def tf_py_test(name,
                shard_count=1,
                additional_deps=[],
                flaky=0,
-               xla_enabled=False):
+               xla_enabled=False,
+               grpc_enabled=False):
   if xla_enabled:
     additional_deps = additional_deps + tf_additional_xla_deps_py()
-  native.py_test(
+  if grpc_enabled:
+    additional_deps = additional_deps + tf_additional_grpc_deps_py()
+  py_test(
       name=name,
       size=size,
       srcs=srcs,
@@ -1346,20 +1403,17 @@ def tf_py_test(name,
       visibility=[clean_dep("//tensorflow:internal")],
       shard_count=shard_count,
       data=data,
-      deps=select({
-          "//conditions:default": [
-              clean_dep("//tensorflow/python:extra_py_tests_deps"),
-              clean_dep("//tensorflow/python:gradient_checker"),
+      deps=[
+            clean_dep("//tensorflow/python:extra_py_tests_deps"),
+            clean_dep("//tensorflow/python:gradient_checker"),
           ] + additional_deps,
-          clean_dep("//tensorflow:no_tensorflow_py_deps"): []
-      }),
       flaky=flaky,
       srcs_version="PY2AND3")
 
 register_extension_info(
-    extension_name="tf_py_test",
-    label_regex_map={"additional_deps": "deps:{extension_name}"})
-
+    extension_name = "tf_py_test",
+    label_regex_map = {"additional_deps": "deps:{extension_name}"},
+)
 
 def cuda_py_test(name,
                  srcs,
@@ -1371,7 +1425,8 @@ def cuda_py_test(name,
                  additional_deps=[],
                  tags=[],
                  flaky=0,
-                 xla_enabled=False):
+                 xla_enabled=False,
+                 grpc_enabled=False):
   test_tags = tags + tf_cuda_tests_tags()
   tf_py_test(
       name=name,
@@ -1384,12 +1439,13 @@ def cuda_py_test(name,
       shard_count=shard_count,
       additional_deps=additional_deps,
       flaky=flaky,
-      xla_enabled=xla_enabled)
+      xla_enabled=xla_enabled,
+      grpc_enabled=grpc_enabled)
 
 register_extension_info(
-    extension_name="cuda_py_test",
-    label_regex_map={"additional_deps": "additional_deps:{extension_name}"})
-
+    extension_name = "cuda_py_test",
+    label_regex_map = {"additional_deps": "additional_deps:{extension_name}"},
+)
 
 def sycl_py_test(name,
                  srcs,
@@ -1401,7 +1457,8 @@ def sycl_py_test(name,
                  additional_deps=[],
                  tags=[],
                  flaky=0,
-                 xla_enabled=False):
+                 xla_enabled=False,
+                 grpc_enabled=False):
   test_tags = tags + tf_sycl_tests_tags()
   tf_py_test(
       name=name,
@@ -1414,12 +1471,13 @@ def sycl_py_test(name,
       shard_count=shard_count,
       additional_deps=additional_deps,
       flaky=flaky,
-      xla_enabled=xla_enabled)
+      xla_enabled=xla_enabled,
+      grpc_enabled=grpc_enabled)
 
 register_extension_info(
-    extension_name="sycl_py_test",
-    label_regex_map={"additional_deps": "additional_deps:{extension_name}"})
-
+    extension_name = "sycl_py_test",
+    label_regex_map = {"additional_deps": "additional_deps:{extension_name}"},
+)
 
 def py_tests(name,
              srcs,
@@ -1429,7 +1487,8 @@ def py_tests(name,
              tags=[],
              shard_count=1,
              prefix="",
-             xla_enabled=False):
+             xla_enabled=False,
+             grpc_enabled=False):
   for src in srcs:
     test_name = src.split("/")[-1].split(".")[0]
     if prefix:
@@ -1443,8 +1502,8 @@ def py_tests(name,
         shard_count=shard_count,
         data=data,
         additional_deps=additional_deps,
-        xla_enabled=xla_enabled)
-
+        xla_enabled=xla_enabled,
+        grpc_enabled=grpc_enabled)
 
 def cuda_py_tests(name,
                   srcs,
@@ -1454,7 +1513,8 @@ def cuda_py_tests(name,
                   shard_count=1,
                   tags=[],
                   prefix="",
-                  xla_enabled=False):
+                  xla_enabled=False,
+                  grpc_enabled=False):
   test_tags = tags + tf_cuda_tests_tags()
   py_tests(
       name=name,
@@ -1465,8 +1525,8 @@ def cuda_py_tests(name,
       tags=test_tags,
       shard_count=shard_count,
       prefix=prefix,
-      xla_enabled=xla_enabled)
-
+      xla_enabled=xla_enabled,
+      grpc_enabled=grpc_enabled)
 
 # Creates a genrule named <name> for running tools/proto_text's generator to
 # make the proto_text functions, for the protos passed in <srcs>.
@@ -1490,12 +1550,10 @@ def tf_generate_proto_text_sources(name, srcs_relative_dir, srcs):
       ],)
   return struct(hdrs=out_hdrs, srcs=out_srcs)
 
-
 def tf_genrule_cmd_append_to_srcs(to_append):
   return ("cat $(SRCS) > $(@) && " + "echo >> $(@) && " + "echo " + to_append +
           " >> $(@)")
 
-
 def tf_version_info_genrule():
   native.genrule(
       name="version_info_gen",
@@ -1510,7 +1568,6 @@ def tf_version_info_genrule():
       local=1,
       tools=[clean_dep("//tensorflow/tools/git:gen_git_source.py")],)
 
-
 def tf_py_build_info_genrule():
   native.genrule(
       name="py_build_info_gen",
@@ -1520,14 +1577,15 @@ def tf_py_build_info_genrule():
       local=1,
       tools=[clean_dep("//tensorflow/tools/build_info:gen_build_info.py")],)
 
-
 def cc_library_with_android_deps(deps,
                                  android_deps=[],
                                  common_deps=[],
+                                 copts=tf_copts(),
                                  **kwargs):
   deps = if_not_android(deps) + if_android(android_deps) + common_deps
-  native.cc_library(deps=deps, **kwargs)
+  native.cc_library(deps=deps, copts=copts, **kwargs)
 
 register_extension_info(
-    extension_name="cc_library_with_android_deps",
-    label_regex_for_dep="{extension_name}")
+    extension_name = "cc_library_with_android_deps",
+    label_regex_for_dep = "{extension_name}",
+)
diff --git a/tensorflow/tf_exported_symbols.lds b/tensorflow/tf_exported_symbols.lds
index bddb87f00cb5fd1ede2cb9d5cc4079d6e66f7896..3ff824e5e1707c65b5ad3cc22dd32267953964c6 100644
--- a/tensorflow/tf_exported_symbols.lds
+++ b/tensorflow/tf_exported_symbols.lds
@@ -4,3 +4,4 @@
 *TF_*
 *TFE_*
 *nsync_*
+*pywrap_xla*
diff --git a/tensorflow/tf_version_script.lds b/tensorflow/tf_version_script.lds
index 11f66c5c8b27f412b2023d6f3036c56d3d1e530c..6b28943f01cfdb174fd135c670a6bb409ee0e102 100644
--- a/tensorflow/tf_version_script.lds
+++ b/tensorflow/tf_version_script.lds
@@ -5,6 +5,7 @@ tensorflow {
     *TF_*;
     *TFE_*;
     *nsync_*;
+    *pywrap_xla*;
   local:
     *;
 };
diff --git a/tensorflow/third_party/mpi/mpi.bzl b/tensorflow/third_party/mpi/mpi.bzl
deleted file mode 100644
index 38ce91c4d069fc311d5e7f17a49ff7904c9c67eb..0000000000000000000000000000000000000000
--- a/tensorflow/third_party/mpi/mpi.bzl
+++ /dev/null
@@ -1,17 +0,0 @@
-#OpenMPI and Mvapich/mpich require different headers
-#based on the configuration options return one or the other
-
-def mpi_hdr():
-    MPI_LIB_IS_OPENMPI=True
-    hdrs = []    
-    if MPI_LIB_IS_OPENMPI:
-        hdrs = ["mpi.h", "mpi_portable_platform.h"]   #When using OpenMPI
-    else:
-        hdrs = ["mpi.h",  "mpio.h", "mpicxx.h"]        #When using MVAPICH
-    return hdrs
-
-def if_mpi(if_true, if_false = []):
-    return select({
-        "//tensorflow:with_mpi_support": if_true,
-        "//conditions:default": if_false
-    })
diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index 3896a21b99f4756239a7ae9f3db9593504845aea..fa0f9b59aa938168cb3d318797c797eeabc9c7d9 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -41,7 +41,17 @@ genrule(
     # every module exported using tf_export. For e.g. if an op is decorated with
     # @tf_export('module1.module2', 'module3'). Then, outs should include
     # api/module1/module2/__init__.py and api/module3/__init__.py.
-    outs = ["api/__init__.py"],
+    outs = [
+        "api/__init__.py",
+        "api/bitwise/__init__.py",
+        "api/contrib/__init__.py",
+        "api/contrib/stat_summarizer/__init__.py",
+        "api/image/__init__.py",
+        "api/linalg/__init__.py",
+        "api/nn/__init__.py",
+        "api/spectral/__init__.py",
+        "api/train/__init__.py",
+    ],
     cmd = "$(location create_python_api) $(OUTS)",
     tools = ["create_python_api"],
 )
diff --git a/tensorflow/tools/api/generator/create_python_api.py b/tensorflow/tools/api/generator/create_python_api.py
index 5f1286aaf6c913cd299ebbfb65949ace0f593417..aab856b723cf2686e8fc9feb156b9be28470fc98 100644
--- a/tensorflow/tools/api/generator/create_python_api.py
+++ b/tensorflow/tools/api/generator/create_python_api.py
@@ -107,7 +107,8 @@ def get_api_imports():
   # Import all required modules in their parent modules.
   # For e.g. if we import 'tf.foo.bar.Value'. Then, we also
   # import 'bar' in 'tf.foo'.
-  for dest_module in module_imports.keys():
+  dest_modules = set(module_imports.keys())
+  for dest_module in dest_modules:
     dest_module_split = dest_module.split('.')
     for dest_submodule_index in range(1, len(dest_module_split)):
       dest_submodule = '.'.join(dest_module_split[:dest_submodule_index])
diff --git a/tensorflow/tools/api/golden/tensorflow.-config-proto.pbtxt b/tensorflow/tools/api/golden/tensorflow.-config-proto.pbtxt
index da6af3919e96bd6145c33a84aca89c44473ce66c..009d64aed09ddcb47410d6ee6fb42fca42861ddd 100644
--- a/tensorflow/tools/api/golden/tensorflow.-config-proto.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-config-proto.pbtxt
@@ -46,6 +46,10 @@ tf_class {
     name: "INTRA_OP_PARALLELISM_THREADS_FIELD_NUMBER"
     mtype: "<type \'int\'>"
   }
+  member {
+    name: "ISOLATE_SESSION_STATE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
   member {
     name: "LOG_DEVICE_PLACEMENT_FIELD_NUMBER"
     mtype: "<type \'int\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt b/tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt
index 30f7e4e11655797fbd8f0ea65c2eb84768ca486b..875d802a9c458e299f73c130bb2b37c5d8828aad 100644
--- a/tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt
@@ -18,6 +18,14 @@ tf_class {
     name: "DESCRIPTOR"
     mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
   }
+  member {
+    name: "EXPERIMENTAL_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "Experimental"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
   member {
     name: "Extensions"
     mtype: "<type \'getset_descriptor\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt
index d12514fe77845a1502538c0f78355e8eaf3b83a5..42de5c0c80023ad5bd7f33a564780060998307c1 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt
@@ -2,6 +2,10 @@ path: "tensorflow.data.Dataset"
 tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "output_classes"
+    mtype: "<class \'abc.abstractproperty\'>"
+  }
   member {
     name: "output_shapes"
     mtype: "<class \'abc.abstractproperty\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt
index 002d0c6a9f932dd2a3a687dcbc740fc5a1222218..e2fc8d6cb1d318cc50828f22e8e575cc28c7aaad 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.readers.FixedLengthRecordDataset\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "output_classes"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shapes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-iterator.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-iterator.pbtxt
index e62f6b247ae7a259385aa83d13ffa98fda0124a8..1f9aeb6ad62e1030c6e78f731fb5e05b876899e6 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-iterator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-iterator.pbtxt
@@ -6,6 +6,10 @@ tf_class {
     name: "initializer"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_classes"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shapes"
     mtype: "<type \'property\'>"
@@ -16,15 +20,15 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'iterator_resource\', \'initializer\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'iterator_resource\', \'initializer\', \'output_types\', \'output_shapes\', \'output_classes\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "from_string_handle"
-    argspec: "args=[\'string_handle\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'string_handle\', \'output_types\', \'output_shapes\', \'output_classes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "from_structure"
-    argspec: "args=[\'output_types\', \'output_shapes\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'output_types\', \'output_shapes\', \'shared_name\', \'output_classes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "get_next"
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-sparse-type.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-sparse-type.pbtxt
deleted file mode 100644
index b25f9a029f996d94fde2800f6e87e6d8a8846e99..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/tensorflow.data.-sparse-type.pbtxt
+++ /dev/null
@@ -1,13 +0,0 @@
-path: "tensorflow.data.SparseType"
-tf_class {
-  is_instance: "<class \'tensorflow.python.data.util.sparse.SparseType\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt
index 2b476dab66c8d2b5a475fe1bbf95ce1d3615ebba..9770389e5ef1e29a80ae1da2725d9862f6521ff9 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.readers.TFRecordDataset\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "output_classes"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shapes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt
index c4c5ac077595c520f0d5a7a0ae8e3cf89472f5de..7263230c1c7182bb812cb2e433aedd415bcd16c7 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.readers.TextLineDataset\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "output_classes"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shapes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.data.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.pbtxt
index b9f54a4d72ebd11050657620d2cc5ace0f7d6e29..56fb270a49943a916012ccfcaf816a9156f4fed8 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.pbtxt
@@ -12,10 +12,6 @@ tf_module {
     name: "Iterator"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "SparseType"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "TFRecordDataset"
     mtype: "<class \'abc.ABCMeta\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-bernoulli.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-bernoulli.pbtxt
index cfe09345acccc410ad3041a965901134440e3c77..ca96f4eaece0020235d24901f51306a65676c1c9 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-bernoulli.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-bernoulli.pbtxt
@@ -64,6 +64,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -80,6 +84,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-beta.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-beta.pbtxt
index 2e6578bae1604f69e4697bb4668dd69d94bd68b5..d0508acd9f4f6c190b205301223599cf5b027955 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-beta.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-beta.pbtxt
@@ -68,6 +68,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -84,6 +88,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-categorical.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-categorical.pbtxt
index d42b0e82e4fab3e30d3ebf1b8bea8b44bb61ea0f..ff0fbb56cd4b9e4c288a168a7c3d9e83c552b0e2 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-categorical.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-categorical.pbtxt
@@ -68,6 +68,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -84,6 +88,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet-multinomial.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet-multinomial.pbtxt
index 710164743e851f0bb5c31ebe78b260b623e87378..d75e4a2f88b29ff7f638d72f98876a230b191dce 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet-multinomial.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet-multinomial.pbtxt
@@ -68,6 +68,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -84,6 +88,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet.pbtxt
index 6cc361672ed8da313e1bebc41fbf093e019d38ad..b838b9ae21decba0323211f08d09fe373ababf23 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet.pbtxt
@@ -64,6 +64,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -80,6 +84,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-distribution.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-distribution.pbtxt
index 40ad07d1be4bdea9585eb276debb1fdf3dfff583..6f06b7d50dd9f5f405673d572503ff549f148f33 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-distribution.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-distribution.pbtxt
@@ -55,6 +55,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -71,6 +75,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-exponential.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-exponential.pbtxt
index 8f34d25fea873827997ecd9df10cf1b3bfd0e56b..d34f9cde5d4d4161883f6d1b4646f22f054d16ad 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-exponential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-exponential.pbtxt
@@ -65,6 +65,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -81,6 +85,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-gamma.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-gamma.pbtxt
index 0ae88fba3b4fd176641cc17c916181cc9a6a12c6..df268b8d99eb6bf22264ddb63231074413686efa 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-gamma.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-gamma.pbtxt
@@ -64,6 +64,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -80,6 +84,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-laplace.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-laplace.pbtxt
index e7cd595e946cb91f162a2a1af8753e44cdfbc0e1..303dcb4ed3bf8416b822bb010c2e87e8ef03b7c9 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-laplace.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-laplace.pbtxt
@@ -64,6 +64,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -80,6 +84,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-multinomial.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-multinomial.pbtxt
index 7a4a16ff836a485e65cb6e061e27b92907cb4a63..ecda8acb15c49c390eaae203a0082e78e53499bd 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-multinomial.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-multinomial.pbtxt
@@ -68,6 +68,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -84,6 +88,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-normal.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-normal.pbtxt
index 14c8c34cc2d8efacec706bdb894d9f069d5e7033..92b9eeea223b488cda1ebcabd31ec808e78fcf70 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-normal.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-normal.pbtxt
@@ -64,6 +64,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -80,6 +84,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-student-t.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-student-t.pbtxt
index 30db6d3f35c1c8ea7bbc376a20093302dd373bd9..9aa7f9a63465c78f79ae4a8a11bc63d92d027dab 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-student-t.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-student-t.pbtxt
@@ -68,6 +68,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -84,6 +88,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-uniform.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-uniform.pbtxt
index 46cbdf225f68e879fd18ef4a07048746a9a71b08..d1b9d3069629c552d6c6048642934f422a13dce7 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-uniform.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-uniform.pbtxt
@@ -64,6 +64,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -80,6 +84,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt b/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt
index 9eb4cb8ce935a314e70866a635ce7248195e0481..018e8c909a23a9e7093c1bb411643d7db629b21c 100644
--- a/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt
@@ -14,7 +14,7 @@ tf_module {
   }
   member_method {
     name: "categorical_column_with_vocabulary_file"
-    argspec: "args=[\'key\', \'vocabulary_file\', \'vocabulary_size\', \'num_oov_buckets\', \'default_value\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \"<dtype: \'string\'>\"], "
+    argspec: "args=[\'key\', \'vocabulary_file\', \'vocabulary_size\', \'num_oov_buckets\', \'default_value\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \"<dtype: \'string\'>\"], "
   }
   member_method {
     name: "categorical_column_with_vocabulary_list"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
index 07b8d900da5dbd9f2c9396ecaf06b9d22ef50a0b..af8278be93c70aa08f1275605c3766cad366f7ed 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
@@ -156,7 +156,7 @@ tf_class {
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit"
@@ -164,7 +164,7 @@ tf_class {
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -224,7 +224,7 @@ tf_class {
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=kwargs, defaults=[\'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
index 546bac44e4c9905d13c4f3b0e3d9c1b5cc6c5e59..c17fbc45bd665da258c2e79764871174fb755b05 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
@@ -169,7 +169,7 @@ tf_class {
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit"
@@ -177,7 +177,7 @@ tf_class {
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -245,7 +245,7 @@ tf_class {
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=kwargs, defaults=[\'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-tensor-board.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-tensor-board.pbtxt
index 6620a9d308f46cd87cedf482929e75bb5afdbaea..7de4008c4541b9054543927cad167293c5a4cf5c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-tensor-board.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-tensor-board.pbtxt
@@ -29,7 +29,7 @@ tf_class {
   }
   member_method {
     name: "on_train_end"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "set_model"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt
index 59508c2f11073caca1f30544efaea435730ce228..508cea005a52002c408e64d17641f0aa69437ad3 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt
@@ -90,7 +90,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'function\', \'mask\', \'arguments\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'function\', \'output_shape\', \'mask\', \'arguments\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
index 4e522813a5a3956b4888f95b2f14ecd52d897256..af287497ddd4ec9567a8390edc71da947bf298dd 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
@@ -156,7 +156,7 @@ tf_class {
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit"
@@ -164,7 +164,7 @@ tf_class {
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -224,7 +224,7 @@ tf_class {
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=kwargs, defaults=[\'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
index ddbb358c84ca50fceb4fb71eddf0083f034f65e1..0fd7dd9e291ebf45acf2f099c62b2820f3cb0b37 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
@@ -169,7 +169,7 @@ tf_class {
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit"
@@ -177,7 +177,7 @@ tf_class {
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -245,7 +245,7 @@ tf_class {
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=kwargs, defaults=[\'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt
index 66cd37bb3a378ccd1bbdffd79f87338c9b4cf265..04174bff5f04fead68af68afeec80316867009a4 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'directory\', \'image_data_generator\', \'target_size\', \'color_mode\', \'classes\', \'class_mode\', \'batch_size\', \'shuffle\', \'seed\', \'data_format\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'follow_links\'], varargs=None, keywords=None, defaults=[\'(256, 256)\', \'rgb\', \'None\', \'categorical\', \'32\', \'True\', \'None\', \'None\', \'None\', \'\', \'png\', \'False\'], "
+    argspec: "args=[\'self\', \'directory\', \'image_data_generator\', \'target_size\', \'color_mode\', \'classes\', \'class_mode\', \'batch_size\', \'shuffle\', \'seed\', \'data_format\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'follow_links\', \'interpolation\'], varargs=None, keywords=None, defaults=[\'(256, 256)\', \'rgb\', \'None\', \'categorical\', \'32\', \'True\', \'None\', \'None\', \'None\', \'\', \'png\', \'False\', \'nearest\'], "
   }
   member_method {
     name: "next"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt
index 7e33285e7abbc10df7f697e10071e429c5183d9e..41f27d1f740457f4b7c4f74cb089a448a0fed845 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt
@@ -16,7 +16,7 @@ tf_class {
   }
   member_method {
     name: "flow_from_directory"
-    argspec: "args=[\'self\', \'directory\', \'target_size\', \'color_mode\', \'classes\', \'class_mode\', \'batch_size\', \'shuffle\', \'seed\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'follow_links\'], varargs=None, keywords=None, defaults=[\'(256, 256)\', \'rgb\', \'None\', \'categorical\', \'32\', \'True\', \'None\', \'None\', \'\', \'png\', \'False\'], "
+    argspec: "args=[\'self\', \'directory\', \'target_size\', \'color_mode\', \'classes\', \'class_mode\', \'batch_size\', \'shuffle\', \'seed\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'follow_links\', \'interpolation\'], varargs=None, keywords=None, defaults=[\'(256, 256)\', \'rgb\', \'None\', \'categorical\', \'32\', \'True\', \'None\', \'None\', \'\', \'png\', \'False\', \'nearest\'], "
   }
   member_method {
     name: "random_transform"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
index ebd9c079b543e79eb0d6cfa369394362e9a8825f..d920fef7702aeb716ba53d9edad28e749b11410b 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
@@ -54,15 +54,15 @@ tf_module {
   }
   member_method {
     name: "conv2d"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "conv2d_backprop_filter"
-    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "conv2d_backprop_input"
-    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'None\'], "
+    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "conv2d_transpose"
@@ -70,11 +70,11 @@ tf_module {
   }
   member_method {
     name: "conv3d"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "conv3d_backprop_filter_v2"
-    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "conv3d_transpose"
@@ -106,15 +106,15 @@ tf_module {
   }
   member_method {
     name: "depthwise_conv2d_native"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "depthwise_conv2d_native_backprop_filter"
-    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "depthwise_conv2d_native_backprop_input"
-    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "dilation2d"
@@ -234,7 +234,7 @@ tf_module {
   }
   member_method {
     name: "quantized_conv2d"
-    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'strides\', \'padding\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint32\'>\", \'None\'], "
+    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint32\'>\", \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "quantized_max_pool"
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index 0edd4153d772459d941cb260c26fd9e09f017f12..4c66cb68c2d70a8fbac113f0caf849f92284c925 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -124,6 +124,10 @@ tf_module {
     name: "LogMessage"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
+  member {
+    name: "MONOLITHIC_BUILD"
+    mtype: "<type \'int\'>"
+  }
   member {
     name: "MetaGraphDef"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -1140,6 +1144,10 @@ tf_module {
     name: "group"
     argspec: "args=[], varargs=inputs, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "guarantee_const"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "hessians"
     argspec: "args=[\'ys\', \'xs\', \'name\', \'colocate_gradients_with_ops\', \'gate_gradients\', \'aggregation_method\'], varargs=None, keywords=None, defaults=[\'hessians\', \'False\', \'False\', \'None\'], "
@@ -1394,7 +1402,7 @@ tf_module {
   }
   member_method {
     name: "multinomial"
-    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'name\', \'output_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "multiply"
@@ -1706,11 +1714,11 @@ tf_module {
   }
   member_method {
     name: "serialize_many_sparse"
-    argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'sp_input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'string\'>\"], "
   }
   member_method {
     name: "serialize_sparse"
-    argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'sp_input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'string\'>\"], "
   }
   member_method {
     name: "serialize_tensor"
@@ -1838,15 +1846,15 @@ tf_module {
   }
   member_method {
     name: "sparse_segment_mean"
-    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "sparse_segment_sqrt_n"
-    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "sparse_segment_sum"
-    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "sparse_slice"
@@ -2062,7 +2070,7 @@ tf_module {
   }
   member_method {
     name: "while_loop"
-    argspec: "args=[\'cond\', \'body\', \'loop_vars\', \'shape_invariants\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\'], "
+    argspec: "args=[\'cond\', \'body\', \'loop_vars\', \'shape_invariants\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'name\', \'maximum_iterations\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\', \'None\'], "
   }
   member_method {
     name: "write_file"
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.tag_constants.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.tag_constants.pbtxt
index 35e49ee9f4a6ee5b4da2b034ece1c1e3b2136254..6af72498d74d4bbc12e7ca68ad1e0a6f0c237e0a 100644
--- a/tensorflow/tools/api/golden/tensorflow.saved_model.tag_constants.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.saved_model.tag_constants.pbtxt
@@ -8,6 +8,10 @@ tf_module {
     name: "SERVING"
     mtype: "<type \'str\'>"
   }
+  member {
+    name: "TPU"
+    mtype: "<type \'str\'>"
+  }
   member {
     name: "TRAINING"
     mtype: "<type \'str\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.variable_scope.pbtxt b/tensorflow/tools/api/golden/tensorflow.variable_scope.pbtxt
index de1ad7e860a616f6737cd451b9c7d90d1ab079c9..e62dec93e6f06a10f48d72b0cda74426887806fb 100644
--- a/tensorflow/tools/api/golden/tensorflow.variable_scope.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.variable_scope.pbtxt
@@ -4,6 +4,6 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'name_or_scope\', \'default_name\', \'values\', \'initializer\', \'regularizer\', \'caching_device\', \'partitioner\', \'custom_getter\', \'reuse\', \'dtype\', \'use_resource\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'name_or_scope\', \'default_name\', \'values\', \'initializer\', \'regularizer\', \'caching_device\', \'partitioner\', \'custom_getter\', \'reuse\', \'dtype\', \'use_resource\', \'constraint\', \'auxiliary_name_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
index a8fdf4c9a07a21269920c61d7f560562dab7b5f4..afcbf50944cc47b3ae3086b17279f2ce2fdc6ee7 100644
--- a/tensorflow/tools/api/tests/api_compatibility_test.py
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -248,14 +248,15 @@ class ApiCompatibilityTest(test.TestCase):
       logging.info('No differences found between API and golden.')
 
   @unittest.skipUnless(
-      sys.version_info.major == 2 and os.uname()[0] == 'Linux',
-      'API compabitility test goldens are generated using python2 on Linux.')
+      sys.version_info.major == 2,
+      'API compabitility test goldens are generated using python2.')
   def testAPIBackwardsCompatibility(self):
     # Extract all API stuff.
     visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()
 
     public_api_visitor = public_api.PublicAPIVisitor(visitor)
     public_api_visitor.do_not_descend_map['tf'].append('contrib')
+    public_api_visitor.do_not_descend_map['tf.GPUOptions'] = ['Experimental']
     traverse.traverse(tf, public_api_visitor)
 
     proto_dict = visitor.GetProtos()
diff --git a/tensorflow/tools/benchmark/benchmark_model.cc b/tensorflow/tools/benchmark/benchmark_model.cc
index 2d59299da4d313f4bf8c5174480f355c3575fa30..ecab6f8769ae2d0126f63580030ed6ff756015d0 100644
--- a/tensorflow/tools/benchmark/benchmark_model.cc
+++ b/tensorflow/tools/benchmark/benchmark_model.cc
@@ -530,7 +530,7 @@ int Main(int argc, char** argv) {
   }
 
   // Capture overall inference time without stat logging overhead. This is the
-  // timing data that can be compared to other libaries.
+  // timing data that can be compared to other libraries.
   SleepSeconds(inter_benchmark_sleep_seconds);
   int64 no_stat_time_us = 0;
   int64 no_stat_num_runs = 0;
@@ -622,7 +622,7 @@ int Main(int argc, char** argv) {
     RecordBenchmarkEntry(output_prefix, benchmark_name, "meta-first-inference",
                          warmup_runs, warmup_time_us / 1000000.0);
 
-    // Time from starting to intialize TF to getting the first result back.
+    // Time from starting to initialize TF to getting the first result back.
     // This also assumes that only one warmup run is performed.
     RecordBenchmarkEntry(
         output_prefix, benchmark_name, "meta-init-plus-first-inference", 1,
diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu b/tensorflow/tools/ci_build/Dockerfile.gpu
index 2d46ccb6b17ac3ab3af49c1649074eda8a840331..7591ecc04efa887ec1d35ba92881386f5a25241d 100644
--- a/tensorflow/tools/ci_build/Dockerfile.gpu
+++ b/tensorflow/tools/ci_build/Dockerfile.gpu
@@ -1,8 +1,8 @@
-FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu14.04
+FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
 
 LABEL maintainer="Jan Prach <jendap@google.com>"
 
-# In the Ubuntu 14.04 images, cudnn is placed in system paths. Move them to
+# In the Ubuntu 16.04 images, cudnn is placed in system paths. Move them to
 # /usr/local/cuda
 RUN cp -P /usr/include/cudnn.h /usr/local/cuda/include
 RUN cp -P /usr/lib/x86_64-linux-gnu/libcudnn* /usr/local/cuda/lib64
diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu_clang b/tensorflow/tools/ci_build/Dockerfile.gpu_clang
index 0ecd8c75e036fc18d37882834ed467d0edb096b1..438a7ec532862b9cf6be57ef2712790c35a9f354 100644
--- a/tensorflow/tools/ci_build/Dockerfile.gpu_clang
+++ b/tensorflow/tools/ci_build/Dockerfile.gpu_clang
@@ -1,8 +1,8 @@
-FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu14.04
+FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
 
 LABEL maintainer="Ilya Biryukov <ibiryukov@google.com>"
 
-# In the Ubuntu 14.04 images, cudnn is placed in system paths. Move them to
+# In the Ubuntu 16.04 images, cudnn is placed in system paths. Move them to
 # /usr/local/cuda
 RUN cp /usr/include/cudnn.h /usr/local/cuda/include
 RUN cp /usr/lib/x86_64-linux-gnu/libcudnn* /usr/local/cuda/lib64
diff --git a/tensorflow/tools/ci_build/README.md b/tensorflow/tools/ci_build/README.md
index 202fcb9101a42336f5f33022c3b8608e53d83dae..f2161b700a0f642dfdb5c33d7d77934c02f14d54 100644
--- a/tensorflow/tools/ci_build/README.md
+++ b/tensorflow/tools/ci_build/README.md
@@ -67,10 +67,10 @@ this UI, to see the logs for a failed build:
     the build tool divided the target into multiple shards or ran the test
     multiple times. Each test log is specific to the shard, run, and attempt.
     To see a specific log:
-    
+
     1.  Click on the log icon that is on the right next to the shard, run,
         and attempt number.
-        
+
     2.  In the grid that appears on the right, click on the specific shard,
         run, and attempt to view its log. You can also type the desired shard,
         run, or attempt number in the field above its grid.
diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh
index 552df1434eab8c4414b8b9a8f7be9c61998d8462..82042b93c02275b51530b306d8cf4519482e5410 100755
--- a/tensorflow/tools/ci_build/builds/pip.sh
+++ b/tensorflow/tools/ci_build/builds/pip.sh
@@ -296,13 +296,11 @@ create_activate_virtualenv_and_install_tensorflow() {
     die "FAILED to create virtualenv directory: ${VIRTUALENV_DIR}"
   fi
 
-  # Verify that virtualenv exists
-  if [[ -z $(which virtualenv) ]]; then
-    die "FAILED: virtualenv not available on path"
-  fi
-
-  virtualenv ${VIRTUALENV_FLAGS} \
-    -p "${PYTHON_BIN_PATH}" "${VIRTUALENV_DIR}" || \
+  # Use the virtualenv from the default python version (i.e., python-virtualenv)
+  # to create the virtualenv directory for testing. Use the -p flag to specify
+  # the python version inside the to-be-created virtualenv directory.
+  ${PYTHON_BIN_PATH} -m virtualenv -p "${PYTHON_BIN_PATH}" ${VIRTUALENV_FLAGS} \
+    "${VIRTUALENV_DIR}" || \
     die "FAILED: Unable to create virtualenv"
 
   source "${VIRTUALENV_DIR}/bin/activate" || \
@@ -345,7 +343,7 @@ do_clean_virtualenv_smoke_test() {
   then
     echo "Smoke test of tensorflow install in clean virtualenv PASSED."
   else
-    echo "Smoke test of tensroflow install in clean virtualenv FAILED."
+    echo "Smoke test of tensorflow install in clean virtualenv FAILED."
     return 1
   fi
 
diff --git a/tensorflow/tools/ci_build/builds/print_build_info.sh b/tensorflow/tools/ci_build/builds/print_build_info.sh
index 7c43419a76ff26be7370326a9113f4e3db2a2b1c..e366abf8bb831688d90a0e3eabed101e42bdaf96 100755
--- a/tensorflow/tools/ci_build/builds/print_build_info.sh
+++ b/tensorflow/tools/ci_build/builds/print_build_info.sh
@@ -88,7 +88,7 @@ fi
 # Print info
 echo "TF_BUILD_INFO = {"\
 "container_type: \"${CONTAINER_TYPE}\", "\
-"command: \"${COMMAND[@]}\", "\
+"command: \"${COMMAND[*]}\", "\
 "source_HEAD: \"${TF_HEAD}\", "\
 "source_remote_origin: \"${TF_FETCH_URL}\", "\
 "OS: \"${OS}\", "\
diff --git a/tensorflow/tools/ci_build/builds/test_user_ops.sh b/tensorflow/tools/ci_build/builds/test_user_ops.sh
index 4f1c61b8e9a799712e2e9def88868b44f3393325..caa3a40817c80b27271f76de0a95a743cb2916f6 100755
--- a/tensorflow/tools/ci_build/builds/test_user_ops.sh
+++ b/tensorflow/tools/ci_build/builds/test_user_ops.sh
@@ -76,17 +76,17 @@ echo "PYTHON_BIN_PATH: ${PYTHON_BIN_PATH}"
 
 pushd "${TMP_DIR}"
 
-# Obtain paths include and lib paths to the TensorFlow installation
-TF_INC=$("${PYTHON_BIN_PATH}" \
-         -c 'import tensorflow as tf; print(tf.sysconfig.get_include())')
-TF_LIB=$("${PYTHON_BIN_PATH}" \
-         -c 'import tensorflow as tf; print(tf.sysconfig.get_lib())')
-
-if [[ -z "${TF_INC}" ]]; then
-  die "FAILED to determine TensorFlow include path"
+# Obtain compilation and linking flags
+TF_CFLAGS=( $("${PYTHON_BIN_PATH}" \
+	      -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') )
+TF_LFLAGS=( $("${PYTHON_BIN_PATH}" \
+	      -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') )
+
+if [[ -z "${TF_CFLAGS[*]}" || -z "${TF_LFLAGS[*]}" ]]; then
+  die "FAILED to determine TensorFlow compilation or linking flags"
 else
-  echo "TensorFlow include path: ${TF_INC}"
-  TF_INCLUDE_PATH="-I${TF_INC} -I${TF_INC}/external/nsync/public"
+  echo "TensorFlow compile flags: ${TF_CFLAGS[*]}"
+  echo "TensorFlow link flags: ${TF_LFLAGS[*]}"
 fi
 
 # Check g++ availability
@@ -145,7 +145,7 @@ if [[ ${IS_GPU} == "0" ]]; then
 
   "${GPP_BIN}" -std=c++11 ${EXTRA_GPP_FLAGS} \
     -shared "${SRC_FILE}" -o "${USER_OP_SO}" \
-    -fPIC ${TF_INCLUDE_PATH} -L "${TF_LIB}" -ltensorflow_framework  || \
+    -fPIC ${TF_CFLAGS[@]} ${TF_LFLAGS[@]}  || \
     die "g++ compilation of ${SRC_FILE} FAILED"
 
 else
@@ -184,7 +184,7 @@ else
   OP_KERNEL_O=$(echo "${OP_KERNEL_CC}" | sed -e 's/\.cc/\.o/')
   "${NVCC_BIN}" -std=c++11 \
       -c -o "${OP_KERNEL_O}" "${OP_KERNEL_CU}" \
-      ${TF_INCLUDE_PATH} -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC || \
+      ${TF_CFLAGS[@]} -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC || \
       die "nvcc compilation of ${OP_KERNEL_CC} FAILED"
 
   CUDA_LIB_DIR="/usr/local/cuda/lib64"
@@ -203,8 +203,8 @@ else
   USER_OP_SO="add_one.so"
   "${GPP_BIN}" -std=c++11 ${EXTRA_GPP_FLAGS} \
       -shared -o "${USER_OP_SO}" "${OP_KERNEL_CC}" \
-      "${OP_KERNEL_O}" ${TF_INCLUDE_PATH} -L "${CUDA_LIB_DIR}" -L "${TF_LIB}" \
-      -fPIC -lcudart -ltensorflow_framework || \
+      "${OP_KERNEL_O}" ${TF_CFLAGS[@]} -L "${CUDA_LIB_DIR}" ${TF_LFLAGS[@]} \
+      -fPIC -lcudart || \
       die "g++ compilation of ${OP_KERNEL_CC}" FAILED
 fi
 
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index 404a9a6b6296652c009d5725919a21c9cd6e8178..4021d794b6c4aa171c041d1d6da8ce5b6a1f6a67 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -99,7 +99,8 @@ do_pylint() {
 "^tensorflow/contrib/eager/python/metrics_impl\.py.*\[E0202.*method-hidden "\
 "^tensorflow/python/platform/gfile\.py.*\[E0301.*non-iterator "\
 "^tensorflow/python/keras/_impl/keras/callbacks\.py.*\[E1133.*not-an-iterable "\
-"^tensorflow/python/keras/_impl/keras/layers/recurrent\.py.*\[E0203.*access-member-before-definition"
+"^tensorflow/python/keras/_impl/keras/layers/recurrent\.py.*\[E0203.*access-member-before-definition "\
+"^tensorflow/python/kernel_tests/constant_op_eager_test.py.*\[E0303.*invalid-length-returned"
 
   echo "ERROR_WHITELIST=\"${ERROR_WHITELIST}\""
 
diff --git a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
index 6e7b752c06f43fe7f8fa26bd52a28ed33f38edd8..cfeaebdbf57c01fef7cd81dae76217429336d0ff 100755
--- a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
+++ b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
@@ -45,7 +45,7 @@ for i in `seq 0 $((TF_GPU_COUNT-1))`; do
       # This export only works within the brackets, so it is isolated to one
       # single command.
       export CUDA_VISIBLE_DEVICES=$i
-      echo "Running test $@ on GPU $CUDA_VISIBLE_DEVICES"
+      echo "Running test $* on GPU $CUDA_VISIBLE_DEVICES"
       $@
     )
     return_code=$?
diff --git a/tensorflow/tools/ci_build/install/install_deb_packages.sh b/tensorflow/tools/ci_build/install/install_deb_packages.sh
index 4ab307c9253a8019f2c794b696db030722751770..96408105339d9a3e21aecb3bae9894551f8b6811 100755
--- a/tensorflow/tools/ci_build/install/install_deb_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_deb_packages.sh
@@ -48,6 +48,7 @@ apt-get install -y --no-install-recommends \
     git \
     libcurl4-openssl-dev \
     libtool \
+    libssl-dev \
     mlocate \
     openjdk-8-jdk \
     openjdk-8-jre-headless \
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index b8ed1ab7676ff4efaef01dd5009effbf5ab05a92..da58ac2407a847ed5b57c949a69d3890fe4df4cf 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -27,6 +27,9 @@ easy_install3 -U pip
 pip2 install wheel
 pip3 install wheel
 
+pip2 install virtualenv
+pip3 install virtualenv
+
 # Install six.
 pip2 install --upgrade six==1.10.0
 pip3 install --upgrade six==1.10.0
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index 479242aa4376883f851486ca38a859a75d4f4f51..9881bd99c35b29920c6db21b572d1956eb497dae 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -39,6 +39,8 @@ if [[ -z $pip35_version ]]; then
 fi
 
 set -e
+pip3.5 install --upgrade virtualenv
+
 # Install six.
 pip3.5 install --upgrade absl-py
 pip3.5 install --upgrade six==1.10.0
diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
index c354aaa154e8d01ba69f157dd195ef439270c2ec..1ca12c6c608858d78a696eed69da1ad1037de364 100755
--- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
@@ -26,25 +26,25 @@ apt-get update
 
 set -e
 # Install Python 3.6 and dev library
-apt-get install -y --no-install-recommends python3.6 libpython3.6-dev
-
-# Install pip3.6
-set +e
-pip35_version=$(pip3.6 --version | grep "python 3.6")
-if [[ -z $pip35_version ]]; then
-  set -e
-  wget -q https://bootstrap.pypa.io/get-pip.py
-  python3.6 get-pip.py
-  rm -f get-pip.py
-fi
+wget https://www.python.org/ftp/python/3.6.1/Python-3.6.1.tar.xz
+tar xvf Python-3.6.1.tar.xz
+cd Python-3.6.1
+
+./configure
+make altinstall
+pip3.6 -V
+which pip3.6
+ln -s /usr/local/bin/pip3.6 /usr/local/bin/pip3
+
+pip3 install --upgrade virtualenv
 
 set -e
 # Install six.
-pip3.6 install --upgrade absl-py
-pip3.6 install --upgrade six==1.10.0
+pip3 install --upgrade absl-py
+pip3 install --upgrade six==1.10.0
 
 # Install protobuf.
-pip3.6 install --upgrade protobuf==3.3.0
+pip3 install --upgrade protobuf==3.3.0
 
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
@@ -54,22 +54,22 @@ rm -rf /usr/lib/python3/dist-packages/six*
 # numpy needs to be installed from source to fix segfaults. See:
 # https://github.com/tensorflow/tensorflow/issues/6968
 # This workaround isn't needed for Ubuntu 16.04 or later.
-pip3.6 install --no-binary=:all: --upgrade numpy==1.12.0
+pip3 install --no-binary=:all: --upgrade numpy==1.12.0
 
-pip3.6 install scipy==0.18.1
+pip3 install scipy==0.18.1
 
-pip3.6 install scikit-learn==0.18.1
+pip3 install scikit-learn==0.18.1
 
 # pandas required by `inflow`
 pip3 install pandas==0.19.2
 
 # Install recent-enough version of wheel for Python 3.6 wheel builds
-pip3.6 install wheel==0.29.0
+pip3 install wheel==0.29.0
 
-pip3.6 install portpicker
+pip3 install portpicker
 
-pip3.6 install werkzeug
+pip3 install werkzeug
 
-pip3.6 install grpcio
+pip3 install grpcio
 
 # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh)
diff --git a/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh b/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh
index df196f829cd920b538fd0032950a9282c3043617..ac83e90f766aab1769fc920d2938f3607aabc786 100755
--- a/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh
+++ b/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh
@@ -28,6 +28,8 @@ echo ""
 export PYTHON_BIN_PATH=`which python3`
 
 export TF_NEED_CUDA=1
+export TF_CUDA_VERSION=8.0
+export TF_CUDNN_VERSION=6
 export TF_CUDA_COMPUTE_CAPABILITIES=3.7
 
 yes "" | $PYTHON_BIN_PATH configure.py
diff --git a/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh b/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh
index abd256a895ea751f84ec946a85a4331fe5b23440..6b80f44729b2a7d30bb754e07728ce4614b7cb16 100755
--- a/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh
+++ b/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh
@@ -28,6 +28,8 @@ echo ""
 export PYTHON_BIN_PATH=`which python3`
 
 export TF_NEED_CUDA=1
+export TF_CUDA_VERSION=8.0
+export TF_CUDNN_VERSION=6
 export TF_CUDA_COMPUTE_CAPABILITIES=3.7
 
 yes "" | $PYTHON_BIN_PATH configure.py
diff --git a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
index 88116d9f246cabdf19c8b24bf8c95fdf52076fe0..1bd1852ffc570166ecc6efca1420bc54d702ed89 100755
--- a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
+++ b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
@@ -82,6 +82,7 @@ if [[ $1 == "PI_ONE" ]]; then
 else
   PI_COPTS='--copt=-march=armv7-a --copt=-mfpu=neon-vfpv4
   --copt=-std=gnu11 --copt=-DS_IREAD=S_IRUSR --copt=-DS_IWRITE=S_IWUSR
+  --copt=-O3
   --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
   --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
   --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8'
diff --git a/tensorflow/tools/ci_build/remote/remote_docker_build.sh b/tensorflow/tools/ci_build/remote/remote_docker_build.sh
index 3ac6840f4e7a881da4ab973a7fadd921ed288828..e00a66aabaf1068c772aabce2391616518be44d4 100755
--- a/tensorflow/tools/ci_build/remote/remote_docker_build.sh
+++ b/tensorflow/tools/ci_build/remote/remote_docker_build.sh
@@ -124,7 +124,7 @@ function build_tf_image {
 
 
 function publish_tf_image {
-  $gcr_tf_image="gcr.io/tensorflow/${tf_image}"
+  gcr_tf_image="gcr.io/tensorflow/${tf_image}"
   docker tag $tf_image $gcr_tf_image
   gcloud docker -- push $gcr_tf_image
 }
diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
index 6c964c722767c345c38a1c3ab97fb88cc946ad25..0c9f3bb5b3adb2986a881926d216e01a36170801 100644
--- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
@@ -21,7 +21,6 @@ failing_cpu_cc_tests="\
     //tensorflow/core:lib_core_status_test + \
     //tensorflow/core:lib_monitoring_collection_registry_test + \
     //tensorflow/core:lib_strings_numbers_test + \
-    //tensorflow/core:lib_strings_str_util_test + \
     //tensorflow/core/platform/hadoop:hadoop_file_system_test + \
     //tensorflow/core:platform_file_system_test + \
     //tensorflow/core:platform_logging_test + \
@@ -88,7 +87,7 @@ extra_failing_gpu_cc_tests="\
     //tensorflow/core:cuda_libdevice_path_test + \
     //tensorflow/core:common_runtime_direct_session_test + \
     //tensorflow/core:common_runtime_direct_session_with_tracking_alloc_test + \
-    //tensorflow/core:gpu_tracer_test + \
+    //tensorflow/core:device_tracer_test + \
     //tensorflow/core:ops_math_grad_test \
 "
 
@@ -96,10 +95,6 @@ exclude_cpu_cc_tests="${failing_cpu_cc_tests} + ${broken_cpu_cc_tests}"
 
 exclude_gpu_cc_tests="${extra_failing_gpu_cc_tests} + ${exclude_cpu_cc_tests}"
 
-function clean_output_base() {
-  bazel clean --expunge
-}
-
 function run_configure_for_cpu_build {
   # Due to a bug in Bazel: https://github.com/bazelbuild/bazel/issues/2182
   # yes "" | ./configure doesn't work on Windows, so we set all the
@@ -115,7 +110,7 @@ function run_configure_for_cpu_build {
     export TF_NEED_MKL=0
   fi
   export TF_NEED_VERBS=0
-  export TF_NEED_GCP=0
+  export TF_NEED_GCP=1
   export TF_NEED_HDFS=0
   export TF_NEED_OPENCL_SYCL=0
   echo "" | ./configure
diff --git a/tensorflow/tools/ci_build/windows/bazel/common_env.sh b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
index 4a653698a2d7c12ce59a53bf96e1551a633f7cab..f88e7176f0803dab98efd4f9f2ca5fd8757a7272 100644
--- a/tensorflow/tools/ci_build/windows/bazel/common_env.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
@@ -36,12 +36,6 @@ export BAZEL_SH=${BAZEL_SH:-"C:/tools/msys64/usr/bin/bash"}
 export PYTHON_BIN_PATH="C:/Program Files/Anaconda3/python.exe"
 export PYTHON_LIB_PATH="C:/Program Files/Anaconda3/lib/site-packages"
 
-# Set Python path for cc_configure.bzl
-export BAZEL_PYTHON="C:/Program Files/Anaconda3/python.exe"
-
-# Set Visual Studio path
-export BAZEL_VS="C:/Program Files (x86)/Microsoft Visual Studio 14.0"
-
 # Add python into PATH, it's needed because gen_git_source.py uses
 # '/usr/bin/env python' as a shebang
 export PATH="/c/Program Files/Anaconda3:$PATH"
@@ -53,13 +47,3 @@ export PATH="/c/Program Files/Anaconda3/Scripts:$PATH"
 export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/bin:$PATH"
 export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/extras/CUPTI/libx64:$PATH"
 export PATH="/c/tools/cuda/bin:$PATH"
-
-# Set the common build options on Windows
-export BUILD_OPTS='--config=monolithic --copt=-w --host_copt=-w --verbose_failures --experimental_ui'
-
-# Build TF with wrapper-less CROSSTOOL
-# TODO(pcloudy): Remove this after wrapper-less CROSSTOOL becomes default
-export NO_MSVC_WRAPPER=1
-
-export USE_DYNAMIC_CRT=1
-
diff --git a/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.sh b/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.sh
index 8c419347d6f4b3af2e47bb96f246dc7281a92364..748a961e44c5429664e37a1456adcf02a56fa3d4 100644
--- a/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.sh
@@ -42,8 +42,6 @@ source "tensorflow/tools/ci_build/windows/bazel/common_env.sh" \
 source "tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh" \
   || { echo "Failed to source bazel_test_lib.sh" >&2; exit 1; }
 
-clean_output_base
-
 run_configure_for_cpu_build
 
 # Compliling the following test is extremely slow with -c opt
@@ -54,5 +52,5 @@ passing_tests=$(bazel query "kind(cc_test, //tensorflow/cc/... + //tensorflow/co
   # We need to strip \r so that the result could be store into a variable under MSYS
   tr '\r' ' ')
 
-bazel test $BUILD_OPTS -k $slow_compiling_test --test_output=errors
-bazel test -c opt $BUILD_OPTS -k $passing_tests --test_output=errors
+bazel test -k $slow_compiling_test --test_output=errors
+bazel test -c opt -k $passing_tests --test_output=errors
diff --git a/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat b/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat
index 6e600e2dcfb8380690764d43c4b731a8da6b5dc4..957729bb37db3ae49800c277f4090a52117c699d 100644
--- a/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat
+++ b/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat
@@ -30,11 +30,13 @@ IF DEFINED SWIG_EXE (ECHO SWIG_EXE is set to %SWIG_EXE%) ELSE (SET SWIG_EXE="C:\
 IF DEFINED PY_EXE (ECHO PY_EXE is set to %PY_EXE%) ELSE (SET PY_EXE="C:\Program Files\Anaconda3\python.exe")
 IF DEFINED PY_LIB (ECHO PY_LIB is set to %PY_LIB%) ELSE (SET PY_LIB="C:\Program Files\Anaconda3\libs\python35.lib")
 
+IF DEFINED DISABLE_FORCEINLINE (ECHO DISABLE_FORCEINLINE is set to %DISABLE_FORCEINLINE%) ELSE (SET DISABLE_FORCEINLINE="OFF")
+
 SET CMAKE_DIR=%REPO_ROOT%\tensorflow\contrib\cmake
 SET MSBUILD_EXE="C:\Program Files (x86)\MSBuild\14.0\Bin\msbuild.exe"
 
 :: Run cmake to create Visual Studio Project files.
-%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY%
+%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY% -Dtensorflow_DISABLE_EIGEN_FORCEINLINE=%DISABLE_FORCEINLINE%
 
 :: Run msbuild in the resulting VS project files to build a pip package.
-%MSBUILD_EXE% /p:Configuration=Release /maxcpucount:32 /verbosity:minimal tf_python_build_pip_package.vcxproj
\ No newline at end of file
+%MSBUILD_EXE% /p:Configuration=Release /maxcpucount:32 tf_python_build_pip_package.vcxproj
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index f6e3d2e6c716178609b4aeb7e25d4dc12ac12f34..31b4226a301e536ea43f9da30006feef7ec60d5d 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -44,9 +44,7 @@ source "tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh" \
 
 run_configure_for_cpu_build
 
-clean_output_base
-
-bazel build -c opt $BUILD_OPTS tensorflow/tools/pip_package:build_pip_package || exit $?
+bazel build -c opt tensorflow/tools/pip_package:build_pip_package || exit $?
 
 # Create a python test directory to avoid package name conflict
 PY_TEST_DIR="py_test_dir"
@@ -60,11 +58,8 @@ reinstall_tensorflow_pip ${PIP_NAME}
 
 # Define no_tensorflow_py_deps=true so that every py_test has no deps anymore,
 # which will result testing system installed tensorflow
-# TODO(pcloudy): Remove TF_SAVER_LENIENT_NAMES after
-# https://github.com/tensorflow/tensorflow/issues/12844 is fixed.
-bazel test -c opt $BUILD_OPTS -k --test_output=errors \
+bazel test -c opt -k --test_output=errors \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
-  --test_tag_filters=-no_pip,-no_windows \
-  --build_tag_filters=-no_pip,-no_windows --build_tests_only \
-  --test_env=TF_SAVER_LENIENT_NAMES=True \
+  --test_tag_filters=-no_pip,-no_windows,-no_oss \
+  --build_tag_filters=-no_pip,-no_windows,-no_oss --build_tests_only \
   //${PY_TEST_DIR}/tensorflow/python/...
diff --git a/tensorflow/tools/ci_build/windows/gpu/bazel/run_cc_test_windows.sh b/tensorflow/tools/ci_build/windows/gpu/bazel/run_cc_test_windows.sh
index 3fd960deabbb0ace8c9598589f9f9a72fd09b3a9..f26f8727e51bf0247578c1cdfaa67e1b0f7f299d 100644
--- a/tensorflow/tools/ci_build/windows/gpu/bazel/run_cc_test_windows.sh
+++ b/tensorflow/tools/ci_build/windows/gpu/bazel/run_cc_test_windows.sh
@@ -56,5 +56,5 @@ passing_tests=$(bazel query "kind(cc_test, //tensorflow/cc/... + //tensorflow/co
 
 # TODO(pcloudy): There is a bug in Bazel preventing build with GPU support without -c opt
 # Re-enable this test after it is fixed.
-# bazel test --config=win-cuda $BUILD_OPTS -k $slow_compiling_test --test_output=errors
-bazel test -c opt --config=win-cuda $BUILD_OPTS -k $passing_tests --test_output=errors
+# bazel test --config=win-cuda -k $slow_compiling_test --test_output=errors
+bazel test -c opt --config=win-cuda -k $passing_tests --test_output=errors
diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
index 44d8252a7a9b30b21097de13252e9f3a8af5b4cb..5a362de3992156fea8a5fc6ab4c70ba67ab47f89 100644
--- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
+++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
@@ -31,11 +31,13 @@ IF DEFINED PY_EXE (ECHO PY_EXE is set to %PY_EXE%) ELSE (SET PY_EXE="C:\Program
 IF DEFINED PY_LIB (ECHO PY_LIB is set to %PY_LIB%) ELSE (SET PY_LIB="C:\Program Files\Anaconda3\libs\python35.lib")
 IF DEFINED CUDNN_HOME (ECHO CUDNN_HOME is set to %CUDNN_HOME%) ELSE (SET CUDNN_HOME="c:\tools\cuda")
 verbosity:quiet
+IF DEFINED DISABLE_FORCEINLINE (ECHO DISABLE_FORCEINLINE is set to %DISABLE_FORCEINLINE%) ELSE (SET DISABLE_FORCEINLINE="OFF")
+
 SET CMAKE_DIR=%REPO_ROOT%\tensorflow\contrib\cmake
 SET MSBUILD_EXE="C:\Program Files (x86)\MSBuild\14.0\Bin\msbuild.exe"
 
 :: Run cmake to create Visual Studio Project files.
-%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_ENABLE_GPU=ON -DCUDNN_HOME=%CUDNN_HOME% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY%
+%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_ENABLE_GPU=ON -DCUDNN_HOME=%CUDNN_HOME% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY% -Dtensorflow_DISABLE_EIGEN_FORCEINLINE=%DISABLE_FORCEINLINE%
 
 :: Run msbuild in the resulting VS project files to build a pip package.
-%MSBUILD_EXE% /p:Configuration=Release /maxcpucount:32 /verbosity:minimal tf_python_build_pip_package.vcxproj
+%MSBUILD_EXE% /p:Configuration=Release /maxcpucount:32 tf_python_build_pip_package.vcxproj
diff --git a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
index 25d327c8188666e34477daa0e888a9169c709c66..922bb67bbf6ce34f55acad6d3399bd810032abd0 100644
--- a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
@@ -44,9 +44,7 @@ source "tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh" \
 
 run_configure_for_gpu_build
 
-clean_output_base
-
-bazel build -c opt $BUILD_OPTS tensorflow/tools/pip_package:build_pip_package || exit $?
+bazel build -c opt tensorflow/tools/pip_package:build_pip_package || exit $?
 
 # Create a python test directory to avoid package name conflict
 PY_TEST_DIR="py_test_dir"
@@ -61,11 +59,8 @@ reinstall_tensorflow_pip ${PIP_NAME}
 # Define no_tensorflow_py_deps=true so that every py_test has no deps anymore,
 # which will result testing system installed tensorflow
 # GPU tests are very flaky when running concurrently, so set local_test_jobs=1
-# TODO(pcloudy): Remove TF_SAVER_LENIENT_NAMES after
-# https://github.com/tensorflow/tensorflow/issues/12844 is fixed.
-bazel test -c opt $BUILD_OPTS -k --test_output=errors \
+bazel test -c opt -k --test_output=errors \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
-  --test_tag_filters=-no_pip,-no_windows,-no_windows_gpu,-no_gpu,-no_pip_gpu \
-  --build_tag_filters=-no_pip,-no_windows,-no_windows_gpu,-no_gpu,-no_pip_gpu \
-  --test_env=TF_SAVER_LENIENT_NAMES=True \
+  --test_tag_filters=-no_pip,-no_windows,-no_windows_gpu,-no_gpu,-no_pip_gpu,no_oss \
+  --build_tag_filters=-no_pip,-no_windows,-no_windows_gpu,-no_gpu,-no_pip_gpu,no_oss \
   --local_test_jobs=1 --build_tests_only //${PY_TEST_DIR}/tensorflow/python/...
diff --git a/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh b/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
index 9ac3613f27e1bc96501490b7610f047785b9ada2..80f2b590c9428b19822952d8b72ca9f0a1359a50 100755
--- a/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
+++ b/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
@@ -44,13 +44,12 @@ export TF_BAZEL_TARGETS="${TF_BAZEL_TARGETS} //tensorflow/tools/lib_package:clic
 export TF_BAZEL_TARGETS="${TF_BAZEL_TARGETS} //tensorflow/java:libtensorflow_jni.so"
 export TF_BAZEL_TARGETS="${TF_BAZEL_TARGETS} //tensorflow/tools/lib_package:jnilicenses_generate"
 
-clean_output_base
 run_configure_for_cpu_build
 
 # build_libtensorflow_tarball in ../builds/libtensorflow.sh
 # cannot be used on Windows since it relies on pkg_tar rules.
 # So we do something special here
-bazel build -c opt ${BUILD_OPTS} \
+bazel build -c opt \
   tensorflow:libtensorflow.so \
   tensorflow/tools/lib_package:clicenses_generate \
   tensorflow/java:libtensorflow_jni.so \
diff --git a/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh b/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh
index a94a627dfb632cf01518c2022fd01b168afb4a7e..88333de856a21b3faeb49f4d88c290ca89288a6e 100755
--- a/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh
+++ b/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh
@@ -28,6 +28,8 @@ echo ""
 export PYTHON_BIN_PATH=`which python3`
 
 export TF_NEED_CUDA=1
+export TF_CUDA_VERSION=8.0
+export TF_CUDNN_VERSION=6
 export TF_CUDA_COMPUTE_CAPABILITIES=3.7
 
 yes "" | $PYTHON_BIN_PATH configure.py
diff --git a/tensorflow/tools/dist_test/python/census_widendeep.py b/tensorflow/tools/dist_test/python/census_widendeep.py
index 6f578d6f673ccfe013a5f39472922e221d2bf2bb..8feb5386e9881596c20fba9e537a0439c8187ac4 100644
--- a/tensorflow/tools/dist_test/python/census_widendeep.py
+++ b/tensorflow/tools/dist_test/python/census_widendeep.py
@@ -263,8 +263,7 @@ if __name__ == "__main__":
       "--data_dir",
       type=str,
       default="/tmp/census-data",
-      help="Directory for storing the census data"
-  )
+      help="Directory for storing the census data")
   parser.add_argument(
       "--model_dir",
       type=str,
diff --git a/tensorflow/tools/dist_test/scripts/dist_mnist_test.sh b/tensorflow/tools/dist_test/scripts/dist_mnist_test.sh
index ea4906588da52e069f7f720d5432d326a977f22e..e703e78531bf7d34285b5faef874ddff94495950 100755
--- a/tensorflow/tools/dist_test/scripts/dist_mnist_test.sh
+++ b/tensorflow/tools/dist_test/scripts/dist_mnist_test.sh
@@ -43,7 +43,7 @@
 # NOTES:
 # If you have the error "$'\r': command not found"
 # Please run the command below to remove trailing '\r' character that causes the error:
-#   sed -i 's/\r$//' dist_mnist_test.sh 
+#   sed -i 's/\r$//' dist_mnist_test.sh
 
 
 # Configurations
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 1a0145b0785598a99d6c6d30c8a01827a627e6d9..0a6860e79157551244b31376d01c1c47c9d03f31 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -69,11 +69,8 @@ RUN mkdir /bazel && \
     rm -f /bazel/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh
 
 # Download and build TensorFlow.
-
-RUN git clone https://github.com/tensorflow/tensorflow.git && \
-    cd tensorflow && \
-    git checkout r1.4
 WORKDIR /tensorflow
+RUN git clone --branch=r1.4 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # TODO(craigcitro): Don't install the pip package, since it makes it
 # more difficult to experiment with local changes. Instead, just add
@@ -101,4 +98,3 @@ EXPOSE 6006
 EXPOSE 8888
 
 WORKDIR /root
-CMD ["/bin/bash"]
diff --git a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
new file mode 100644
index 0000000000000000000000000000000000000000..8180e5e7fb65e1eff693265ed388496b356563dd
--- /dev/null
+++ b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
@@ -0,0 +1,85 @@
+FROM tensorflow/tensorflow:latest-devel
+
+LABEL maintainer="Clayne Robison<clayne.b.robison@intel.com>"
+
+# These arguments are parameterized. Use --build-args to override.
+ARG TF_BRANCH=r1.4
+ARG WHL_DIR=/whl
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        golang \
+        vim \
+        emacs \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN pip --no-cache-dir install --upgrade \
+        pip setuptools
+
+RUN pip --no-cache-dir install wheel 
+
+# Download and build TensorFlow.
+WORKDIR /
+RUN rm -rf tensorflow && \
+    git clone https://github.com/tensorflow/tensorflow.git && \
+    cd tensorflow && \
+    git checkout ${TF_BRANCH}
+WORKDIR /tensorflow
+
+# Configure the build for CPU with MKL by accepting default build options and
+# setting library locations
+ENV CI_BUILD_PYTHON=python \
+   LD_LIBRARY_PATH=${LD_LIBRARY_PATH} \
+    PYTHON_BIN_PATH=/usr/bin/python \
+    PYTHON_LIB_PATH=/usr/local/lib/python2.7/dist-packages \
+    CC_OPT_FLAGS='-march=native' \
+    TF_NEED_JEMALLOC=0 \
+    TF_NEED_GCP=0 \
+    TF_NEED_CUDA=0 \
+    TF_NEED_HDFS=0 \
+    TF_NEED_S3=0 \
+    TF_NEED_OPENCL=0 \
+    TF_NEED_GDR=0 \
+    TF_ENABLE_XLA=0 \
+    TF_NEED_VERBS=0 \
+    TF_NEED_MPI=0
+RUN ./configure
+
+# Build and Install TensorFlow.
+# The 'mkl' option builds with Intel(R) Math Kernel Library (MKL), which detects
+# the platform it is currently running on and takes appropriately optimized 
+# paths. The -march=native option is for code that is not in MKL, and assumes
+# this container will be run on the same architecture on which it is built.
+RUN LD_LIBRARY_PATH=${LD_LIBRARY_PATH} \
+    bazel build --config=mkl \
+                --config="opt" \
+                --copt="-march=native" \
+                --copt="-O3" \
+                //tensorflow/tools/pip_package:build_pip_package && \
+    mkdir ${WHL_DIR} && \
+    bazel-bin/tensorflow/tools/pip_package/build_pip_package ${WHL_DIR}
+
+# Clean up Bazel cache when done, but leave the whl.
+# This will upgrade the default Tensorflow version with the Intel MKL version
+RUN pip --no-cache-dir install --upgrade ${WHL_DIR}/tensorflow-*.whl && \
+    rm -rf /root/.cache
+
+WORKDIR /root
+
+#add welcome message with instructions
+
+RUN echo '[ ! -z "$TERM" -a -r /etc/motd ] && cat /etc/issue && cat /etc/motd' \
+	>> /etc/bash.bashrc \
+	; echo "\
+||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||\n\
+|								\n\
+| Docker container running Ubuntu				\n\
+| with TensorFlow ${TF_BRANCH} optimized for CPU		\n\
+| with Intel(R) MKL						\n\
+|								\n\
+||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||\n\
+\n "\
+	> /etc/motd
+
+CMD ["/bin/bash"]
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 21a44ee40447a628579952b47cbc64a263f07cbf..4164cc3f8886c09f167575b48fba659daae65fbf 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -1,11 +1,20 @@
-FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04
+FROM nvidia/cuda:9.0-base-ubuntu16.04
 
 LABEL maintainer="Craig Citro <craigcitro@google.com>"
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
+        cuda-command-line-tools-9-0 \
+        cuda-cublas-dev-9-0 \
+        cuda-cudart-dev-9-0 \
+        cuda-cufft-dev-9-0 \
+        cuda-curand-dev-9-0 \
+        cuda-cusolver-dev-9-0 \
+        cuda-cusparse-dev-9-0 \
         curl \
         git \
+        libcudnn7=7.0.5.15-1+cuda9.0 \
+        libcudnn7-dev=7.0.5.15-1+cuda9.0 \
         libcurl3-dev \
         libfreetype6-dev \
         libpng12-dev \
@@ -17,12 +26,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         unzip \
         zip \
         zlib1g-dev \
-        openjdk-8-jdk \
-        openjdk-8-jre-headless \
         wget \
         && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
+    rm -rf /var/lib/apt/lists/* && \
+    find /usr/local/cuda-9.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
 
 RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
     python get-pip.py && \
@@ -70,18 +78,16 @@ RUN mkdir /bazel && \
     rm -f /bazel/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh
 
 # Download and build TensorFlow.
-
-RUN git clone https://github.com/tensorflow/tensorflow.git && \
-    cd tensorflow && \
-    git checkout r1.4
 WORKDIR /tensorflow
+RUN git clone --branch=r1.4 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
 ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
 ENV TF_NEED_CUDA 1
 ENV TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2,6.0,6.1
-
+ENV TF_CUDA_VERSION=9.0
+ENV TF_CUDNN_VERSION=7
 
 RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
     LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH} \
@@ -102,5 +108,3 @@ WORKDIR /root
 EXPOSE 6006
 # IPython
 EXPOSE 8888
-
-RUN ["/bin/bash"]
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7 b/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7
deleted file mode 100644
index 9bcc3925a8accc57fc1cc01a46e2445eda62decc..0000000000000000000000000000000000000000
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7
+++ /dev/null
@@ -1,117 +0,0 @@
-FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
-
-LABEL maintainer="Gunhan Gulsoy <gunan@google.com>"
-
-# It is possible to override these for releases.
-ARG TF_BRANCH=master
-ARG BAZEL_VERSION=0.5.4
-ARG TF_AVAILABLE_CPUS=32
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        curl \
-        git \
-        golang \
-        libcurl3-dev \
-        libfreetype6-dev \
-        libpng12-dev \
-        libzmq3-dev \
-        pkg-config \
-        python-dev \
-        python-pip \
-        rsync \
-        software-properties-common \
-        unzip \
-        zip \
-        zlib1g-dev \
-        openjdk-8-jdk \
-        openjdk-8-jre-headless \
-        wget \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN pip --no-cache-dir install --upgrade \
-        pip setuptools
-
-RUN pip --no-cache-dir install \
-        ipykernel \
-        jupyter \
-        matplotlib \
-        numpy \
-        scipy \
-        sklearn \
-        pandas \
-        wheel \
-        && \
-    python -m ipykernel.kernelspec
-
-# Set up our notebook config.
-COPY jupyter_notebook_config.py /root/.jupyter/
-
-# Jupyter has issues with being run directly:
-#   https://github.com/ipython/ipython/issues/7062
-# We just add a little wrapper script.
-COPY run_jupyter.sh /
-
-# Set up Bazel.
-
-# Running bazel inside a `docker build` command causes trouble, cf:
-#   https://github.com/bazelbuild/bazel/issues/134
-# The easiest solution is to set up a bazelrc file forcing --batch.
-RUN echo "startup --batch" >>/etc/bazel.bazelrc
-# Similarly, we need to workaround sandboxing issues:
-#   https://github.com/bazelbuild/bazel/issues/418
-RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
-    >>/etc/bazel.bazelrc
-WORKDIR /
-RUN mkdir /bazel && \
-    cd /bazel && \
-    wget --quiet https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
-    wget --quiet https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE && \
-    chmod +x bazel-*.sh && \
-    ./bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
-    rm -f /bazel/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh
-
-# Download and build TensorFlow.
-WORKDIR /
-RUN git clone https://github.com/tensorflow/tensorflow.git && \
-    cd tensorflow && \
-    git checkout ${TF_BRANCH}
-WORKDIR /tensorflow
-
-# Configure the build for our CUDA configuration.
-ENV CI_BUILD_PYTHON=python \
-    LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:${LD_LIBRARY_PATH} \
-    CUDNN_INSTALL_PATH=/usr/lib/x86_64-linux-gnu \
-    PYTHON_BIN_PATH=/usr/bin/python \
-    PYTHON_LIB_PATH=/usr/local/lib/python2.7/dist-packages \
-    TF_NEED_CUDA=1 \
-    TF_CUDA_VERSION=9.0 \
-    TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2,6.0,6.1,7.0 \
-    TF_CUDNN_VERSION=7
-RUN ./configure
-
-# Build and Install TensorFlow.
-RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
-    LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH} \
-    bazel build -c opt \
-                --config=cuda \
-                --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
-                --jobs=${TF_AVAILABLE_CPUS} \
-                tensorflow/tools/pip_package:build_pip_package && \
-    mkdir /pip_pkg && \
-    bazel-bin/tensorflow/tools/pip_package/build_pip_package /pip_pkg && \
-    pip --no-cache-dir install --upgrade /pip_pkg/tensorflow-*.whl && \
-    rm -rf /pip_pkg && \
-    rm -rf /root/.cache
-# Clean up pip wheel and Bazel cache when done.
-
-WORKDIR /root
-
-# TensorBoard
-EXPOSE 6006
-# IPython
-EXPOSE 8888
-
-RUN ["/bin/bash"]
diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu
index e212d10290a93261e88cf9464076e5714e16ac43..b6682cd68163ec870ed815b45ac4fdd9233f88c6 100644
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ b/tensorflow/tools/docker/Dockerfile.gpu
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:8.0-cudnn6-runtime-ubuntu16.04
+FROM nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04
 
 LABEL maintainer="Craig Citro <craigcitro@google.com>"
 
diff --git a/tensorflow/tools/docker/README.md b/tensorflow/tools/docker/README.md
index 39b66552349325c6df794bf71fcf5ec0977758d0..f46c56e11aa72cd0df20f0d8478de2f42dbb3b72 100644
--- a/tensorflow/tools/docker/README.md
+++ b/tensorflow/tools/docker/README.md
@@ -41,7 +41,7 @@ Note: If you would have a problem running nvidia-docker you may try the old meth
 we have used. But it is not recommended. If you find a bug in nvidia-docker, please report
 it there and try using nvidia-docker as described above.
 
-    $ # The old, not recommended way to run docker with gpu support: 
+    $ # The old, not recommended way to run docker with gpu support:
     $ export CUDA_SO=$(\ls /usr/lib/x86_64-linux-gnu/libcuda.* | xargs -I{} echo '-v {}:{}')
     $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
     $ docker run -it -p 8888:8888 $CUDA_SO $DEVICES gcr.io/tensorflow/tensorflow:latest-gpu
@@ -65,8 +65,7 @@ from a binary docker image such as for example `tensorflow/tensorflow:latest` wi
 not work. One needs to execute the script from a developer docker image since by
 contrast with a binary docker image it contains not only the compiled solution but
 also the tensorflow source code. Please select the appropriate developer docker
-image of tensorflow at
-[tensorflow/tensorflow repository on dockerhub](https://hub.docker.com/r/tensorflow/tensorflow/tags/).
+image of tensorflow at `tensorflow/tensorflow:[.](https://hub.docker.com/r/tensorflow/tensorflow/tags/)`.
 
 The smallest command line to generate a docker image will then be:
 ```docker run -it tensorflow/tensorflow:"right_tag"```
diff --git a/tensorflow/tools/docker/notebooks/2_getting_started.ipynb b/tensorflow/tools/docker/notebooks/2_getting_started.ipynb
index e171b439feea95dc649a1013f78386bc008515ff..b0963ebc3f7efb3d10957c6ed0a7175e5b7a1cb3 100644
--- a/tensorflow/tools/docker/notebooks/2_getting_started.ipynb
+++ b/tensorflow/tools/docker/notebooks/2_getting_started.ipynb
@@ -159,7 +159,7 @@
         "X = np.array([np.linspace(-2, 4, num_examples), np.linspace(-6, 6, num_examples)])\n",
         "X += np.random.randn(2, num_examples)\n",
         "x, y = X\n",
-        "x_with_bias = np.array([(1., a) for a in x]).astype(np.float32)\n",
+        "bias_with_x = np.array([(1., a) for a in x]).astype(np.float32)\n",
         "\n",
         "losses = []\n",
         "training_steps = 50\n",
@@ -167,7 +167,7 @@
         "\n",
         "with tf.Session() as sess:\n",
         "    # Set up all the tensors, variables, and operations.\n",
-        "    input = tf.constant(x_with_bias)\n",
+        "    input = tf.constant(bias_with_x)\n",
         "    target = tf.constant(np.transpose([y]).astype(np.float32))\n",
         "    weights = tf.Variable(tf.random_normal([2, 1], 0, 0.1))\n",
         "\n",
@@ -583,7 +583,7 @@
         "# Split into x and y\n",
         "x, y = X\n",
         "# Add the bias node which always has a value of 1\n",
-        "x_with_bias = np.array([(1., a) for a in x]).astype(np.float32)\n",
+        "bias_with_x = np.array([(1., a) for a in x]).astype(np.float32)\n",
         "\n",
         "# Keep track of the loss at each iteration so we can chart it later\n",
         "losses = []\n",
@@ -598,7 +598,7 @@
         "with tf.Session() as sess:\n",
         "    # Set up all the tensors.\n",
         "    # Our input layer is the x value and the bias node.\n",
-        "    input = tf.constant(x_with_bias)\n",
+        "    input = tf.constant(bias_with_x)\n",
         "    # Our target is the y values. They need to be massaged to the right shape.\n",
         "    target = tf.constant(np.transpose([y]).astype(np.float32))\n",
         "    # Weights are a variable. They change every time through the loop.\n",
@@ -621,7 +621,7 @@
         "    loss = tf.nn.l2_loss(yerror)\n",
         "\n",
         "    # Perform gradient descent. \n",
-        "    # This essentially just updates weights, like weights += grads * learning_rate\n",
+        "    # This essentially just updates weights, like weights -= grads * learning_rate\n",
         "    # using the partial derivative of the loss with respect to the\n",
         "    # weights. It's the direction we want to go to move toward lower error.\n",
         "    update_weights = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)\n",
@@ -743,7 +743,7 @@
         "with tf.Session() as sess:\n",
         "    # Set up all the tensors.\n",
         "    # The input is the x values with the bias appended on to each x.\n",
-        "    input = tf.constant(x_with_bias)\n",
+        "    input = tf.constant(bias_with_x)\n",
         "    # We're trying to find the best fit for the target y values.\n",
         "    target = tf.constant(np.transpose([y]).astype(np.float32))\n",
         "    # Let's set up the weights randomly\n",
diff --git a/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb b/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
index 614a19c178d021133041cef4dbfddd7cd4b6c020..5585ebdcd366ec9db0c47004647970cb27c8bb75 100644
--- a/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
+++ b/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
@@ -135,6 +135,8 @@
     "from six.moves.urllib.request import urlretrieve\n",
     "\n",
     "SOURCE_URL = 'https://storage.googleapis.com/cvdf-datasets/mnist/'\n",
+    "#SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'\n",
+    "# for those who have no access to google storage, use lecun's repo please\n",
     "WORK_DIRECTORY = \"/tmp/mnist-data\"\n",
     "\n",
     "def maybe_download(filename):\n",
diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
index 80a07b9b3ba7fb278b01862880893aa0a2693a28..e7de7df856e1928e3541f6eae4c224134720b69c 100755
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@@ -265,7 +265,7 @@ else
   DOCKERFILE="${TMP_DIR}/Dockerfile"
 
   # Modify the devel Dockerfile to specify the git branch
-  sed -r "s/([\s]*git checkout )(.*)/\1${TF_DOCKER_BUILD_DEVEL_BRANCH}/g" \
+  sed "s/^RUN git clone --branch=.* --depth=1/RUN git clone --branch=${TF_DOCKER_BUILD_DEVEL_BRANCH} --depth=1/" \
       "${ORIG_DOCKERFILE}" > "${DOCKERFILE}"
 
   # Modify python/pip version if necessary.
diff --git a/tensorflow/tools/docs/generate_lib.py b/tensorflow/tools/docs/generate_lib.py
index c0cde1d3bdd9023479a19112df36d3d88411da67..003f972070cb05aa6f34a3748d47f019744de058 100644
--- a/tensorflow/tools/docs/generate_lib.py
+++ b/tensorflow/tools/docs/generate_lib.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
+import fnmatch
 import os
 import sys
 
@@ -198,12 +199,12 @@ def add_dict_to_dict(add_from, add_to):
       add_to[key] = add_from[key]
 
 
-# Exclude some libaries in contrib from the documentation altogether.
+# Exclude some libraries in contrib from the documentation altogether.
 def _get_default_private_map():
   return {'tf.test': ['mock']}
 
 
-# Exclude members of some libaries.
+# Exclude members of some libraries.
 def _get_default_do_not_descend_map():
   # TODO(wicke): Shrink this list once the modules get sealed.
   return {
@@ -384,10 +385,26 @@ class _UpdateTags(py_guide_parser.PyGuideParser):
 EXCLUDED = set(['__init__.py', 'OWNERS', 'README.txt'])
 
 
-def _other_docs(src_dir, output_dir, reference_resolver):
-  """Convert all the files in `src_dir` and write results to `output_dir`."""
-  header = '<!-- DO NOT EDIT! Automatically generated file. -->\n'
+def _other_docs(src_dir, output_dir, reference_resolver, file_pattern='*.md'):
+  """Fix @{} references in all files under `src_dir` matching `file_pattern`.
 
+  A matching directory structure, with the modified files is
+  written to `output_dir`.
+
+  `{"__init__.py","OWNERS","README.txt"}` are skipped.
+
+  Files not matching `file_pattern` (using `fnmatch`) are copied with no change.
+
+  Also, files in the `api_guides/python` directory get explicit ids set on all
+  heading-2s to ensure back-links work.
+
+  Args:
+    src_dir: The directory to convert files from.
+    output_dir: The root directory to write the resulting files to.
+    reference_resolver: A `parser.ReferenceResolver` to make the replacements.
+    file_pattern: Only replace references in files matching file_patters,
+      using fnmatch. Non-matching files are copied unchanged.
+  """
   # Iterate through all the source files and process them.
   tag_updater = _UpdateTags()
   for dirpath, _, filenames in os.walk(src_dir):
@@ -415,21 +432,21 @@ def _other_docs(src_dir, output_dir, reference_resolver):
 
       suffix = os.path.relpath(path=full_in_path, start=src_dir)
       full_out_path = os.path.join(output_dir, suffix)
-      if not base_name.endswith('.md'):
-        print('Copying non-md file %s...' % suffix)
+      if not fnmatch.fnmatch(base_name, file_pattern):
+        print('Copying un-matched file %s...' % suffix)
         open(full_out_path, 'w').write(open(full_in_path).read())
         continue
       if dirpath.endswith('/api_guides/python'):
         print('Processing Python guide %s...' % base_name)
-        md_string = tag_updater.process(full_in_path)
+        content = tag_updater.process(full_in_path)
       else:
         print('Processing doc %s...' % suffix)
-        md_string = open(full_in_path).read()
+        content = open(full_in_path).read()
 
-      output = reference_resolver.replace_references(md_string,
-                                                     relative_path_to_root)
+      content = reference_resolver.replace_references(content,
+                                                      relative_path_to_root)
       with open(full_out_path, 'w') as f:
-        f.write(header + output)
+        f.write(content)
 
   print('Done.')
 
diff --git a/tensorflow/tools/git/gen/branch_ref b/tensorflow/tools/git/gen/branch_ref
new file mode 100644
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/tensorflow/tools/git/gen/branch_ref
@@ -0,0 +1 @@
+
diff --git a/tensorflow/tools/git/gen/head b/tensorflow/tools/git/gen/head
new file mode 100644
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/tensorflow/tools/git/gen/head
@@ -0,0 +1 @@
+
diff --git a/tensorflow/tools/git/gen/spec.json b/tensorflow/tools/git/gen/spec.json
new file mode 100644
index 0000000000000000000000000000000000000000..176bbc21ccb9112d5c29f0351ec937c302a1383e
--- /dev/null
+++ b/tensorflow/tools/git/gen/spec.json
@@ -0,0 +1,3 @@
+{
+  "git": false
+}
diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index 0307d2a0ebee820fee0867c35c5761f2f8607aea..2e27487d2ff3beb40ec3045989ff8187480c9b43 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -180,6 +180,13 @@ const int tf_cxx11_abi_flag() {
   return 0;
 #endif
 }
+const int tf_monolithic_build() {
+#ifdef TENSORFLOW_MONOLITHIC_BUILD
+  return 1;
+#else
+  return 0;
+#endif
+}
 """ % git_version
   open(filename, "w").write(contents)
 
diff --git a/tensorflow/tools/git/gen_git_source.sh b/tensorflow/tools/git/gen_git_source.sh
index 788f9e6e5730f9e4699011298d689bc26226fb65..db20bb00e84b47bd15244e70b925f59e62731deb 100755
--- a/tensorflow/tools/git/gen_git_source.sh
+++ b/tensorflow/tools/git/gen_git_source.sh
@@ -36,5 +36,12 @@ const int tf_cxx11_abi_flag() {
   return 0;
 #endif
 }
+const int tf_monolithic_build() {
+#ifdef TENSORFLOW_MONOLITHIC_BUILD
+  return 1;
+#else
+  return 0;
+#endif
+}
 EOF
 
diff --git a/tensorflow/tools/graph_transforms/BUILD b/tensorflow/tools/graph_transforms/BUILD
index 9216008600b0969ae95a985f54511a24f4fac3e7..58489b28c8b6738e22e72002ab97c1c0b994b790 100644
--- a/tensorflow/tools/graph_transforms/BUILD
+++ b/tensorflow/tools/graph_transforms/BUILD
@@ -128,6 +128,7 @@ cc_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:tensorflow",
diff --git a/tensorflow/tools/graph_transforms/README.md b/tensorflow/tools/graph_transforms/README.md
index c7f7eca25749bc20b2bca95956e919b861c4a71d..345d9eadb858cadebe03ecb3297aea52ba54bd37 100644
--- a/tensorflow/tools/graph_transforms/README.md
+++ b/tensorflow/tools/graph_transforms/README.md
@@ -95,9 +95,9 @@ transforms to modify the graph with. The transforms are given as a list of
 names, and can each have arguments themselves. These transforms define the
 pipeline of modifications that are applied in order to produce the output.
 Sometimes you need some transforms to happen before others, and the ordering
-within the list lets you specify which happen first. 
-Note that the optimization 
-`remove_nodes(op=Identity, op=CheckNumerics)` will break the model with control 
+within the list lets you specify which happen first.
+Note that the optimization
+`remove_nodes(op=Identity, op=CheckNumerics)` will break the model with control
 flow operations, such as `tf.cond`, `tf.map_fn`, and `tf.while`.
 
 ## Inspecting Graphs
diff --git a/tensorflow/tools/graph_transforms/fold_constants_lib.cc b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
index f2934a79bdf65473092cbf80fafbda888d7b9c7c..250f54e20fba6e24fe95741b1437ac3718ace6fb 100644
--- a/tensorflow/tools/graph_transforms/fold_constants_lib.cc
+++ b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
@@ -39,9 +39,9 @@ limitations under the License.
 namespace tensorflow {
 namespace graph_transforms {
 namespace {
-using StringPieceSet = std::unordered_set<StringPiece, StringPiece::Hasher>;
+using StringPieceSet = std::unordered_set<StringPiece, StringPieceHasher>;
 template <typename T>
-using StringPieceMap = std::unordered_map<StringPiece, T, StringPiece::Hasher>;
+using StringPieceMap = std::unordered_map<StringPiece, T, StringPieceHasher>;
 }  // namespace
 
 Status ReplaceSendRecvs(const GraphDef& original_graph_def,
diff --git a/tensorflow/tools/graph_transforms/sparsify_gather.cc b/tensorflow/tools/graph_transforms/sparsify_gather.cc
index 20d443c7e9070d0c82191c70ec1a855deeeb8f0b..96324d0deab400078fdf388bff69001f8e2df9aa 100644
--- a/tensorflow/tools/graph_transforms/sparsify_gather.cc
+++ b/tensorflow/tools/graph_transforms/sparsify_gather.cc
@@ -89,7 +89,10 @@ Status ObtainTensorSlice(const GraphDef& input_graph_def,
                          string* shape_slice_string) {
   string restore_node_name;
   for (const auto& node : input_graph_def.node()) {
-    if (StringPiece(node.name()).starts_with("save/Assign") &&
+    std::vector<string> node_name_parts = str_util::Split(node.name(), "/");
+    if (node_name_parts.size() == 2 &&
+        StringPiece(node_name_parts[0]).starts_with("save") &&
+        StringPiece(node_name_parts[1]).starts_with("Assign") &&
         node.input(0) == tensor_name) {
       restore_node_name = node.input(1);
       break;
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index e3cbd67721aa04f170878f1d369ed65b7fde630e..d80d5ecc6a6f6324baf63dcaf5ed0ae556db1d81 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -6,6 +6,7 @@ package(default_visibility = ["//visibility:private"])
 load(
     "//tensorflow:tensorflow.bzl",
     "if_not_windows",
+    "if_windows",
     "transitive_hdrs",
 )
 load("//third_party/mkl:build_defs.bzl", "if_mkl")
@@ -155,8 +156,8 @@ sh_binary(
             "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
             "//tensorflow/contrib/data/python/ops:prefetching_py",
             "//tensorflow/contrib/eager/python/examples:examples_pip",
+            "//tensorflow/contrib/eager/python:checkpointable",
             "//tensorflow/contrib/eager/python:evaluator",
-            "//tensorflow/contrib/eager/python:summary_writer",
             "//tensorflow/contrib/gan:gan",
             "//tensorflow/contrib/graph_editor:graph_editor_pip",
             "//tensorflow/contrib/keras:keras",
@@ -194,3 +195,23 @@ sh_binary(
         ],
     }) + if_mkl(["//third_party/mkl:intel_binary_blob"]),
 )
+
+# A genrule for generating a marker file for the pip package on Windows
+#
+# This only works on Windows, because :simple_console_for_windows is a
+# python zip file containing everything we need for building the pip package.
+# However, on other platforms, due to https://github.com/bazelbuild/bazel/issues/4223,
+# when C++ extensions change, this generule doesn't rebuild.
+genrule(
+    name = "win_pip_package_marker",
+    srcs = if_windows([
+        ":build_pip_package",
+        ":simple_console_for_windows",
+    ]),
+    outs = ["win_pip_package_marker_file"],
+    cmd = select({
+        "//conditions:default": "touch $@",
+        "//tensorflow:windows": "md5sum $(locations :build_pip_package) $(locations :simple_console_for_windows) > $@",
+    }),
+    visibility = ["//visibility:public"],
+)
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index 8249703ba717f25dbfb324557727b636c6640cc5..f5203bc5448ff2d9a9e9352f8968c4a8a31c336a 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -24,7 +24,7 @@ function real_path() {
 function cp_external() {
   local src_dir=$1
   local dest_dir=$2
-  for f in `find "$src_dir" -maxdepth 1 -mindepth 1 ! -name '*local_config_cuda*'`; do
+  for f in `find "$src_dir" -maxdepth 1 -mindepth 1 ! -name '*local_config_cuda*' ! -name '*org_tensorflow*'`; do
     cp -R "$f" "$dest_dir"
   done
 }
@@ -92,7 +92,6 @@ function main() {
       bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles/org_tensorflow/tensorflow \
       "${TMPDIR}"
     mkdir "${TMPDIR}/external"
-    # Note: this makes an extra copy of org_tensorflow.
     cp_external \
       bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles \
       "${TMPDIR}/external"
@@ -123,7 +122,6 @@ function main() {
         bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/tensorflow \
         "${TMPDIR}"
       mkdir "${TMPDIR}/external"
-      # Note: this makes an extra copy of org_tensorflow.
       cp_external \
         bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles \
         "${TMPDIR}/external"
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index 3677aaa886f8c51843548e1c8941ca9053a6bd10..22e1584b780bcefbc278105b794b932aacdc9992 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -42,6 +42,7 @@ BLACKLIST = [
     "//tensorflow/python:extra_py_tests_deps",
     "//tensorflow/cc/saved_model:saved_model_half_plus_two",
     "//tensorflow:no_tensorflow_py_deps",
+    "//tensorflow/tools/pip_package:win_pip_package_marker",
     "//tensorflow/python:test_ops_2",
     "//tensorflow/python:tf_optimizer",
     "//tensorflow/python:compare_test_proto_py",
@@ -66,9 +67,6 @@ BLACKLIST = [
     "//tensorflow/contrib/timeseries/examples:data/period_trend.csv",  # pylint:disable=line-too-long
     "//tensorflow/contrib/timeseries/python/timeseries:test_utils",
     "//tensorflow/contrib/timeseries/python/timeseries/state_space_models:test_utils",  # pylint:disable=line-too-long
-
-    # TODO(yifeif): Remove when py_library(testonly=1) is ignored.
-    "//tensorflow/contrib/summary:summary_test_internal",
 ]
 
 
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index a493c6f2aaee66a3c413788e8fe3eb206e26cb66..8fa39b6248589793c5f3c13dc819ea0c46ccf8ff 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -32,12 +32,11 @@ from setuptools.dist import Distribution
 _VERSION = '1.4.0'
 
 REQUIRED_PACKAGES = [
-    'absl-py',
-    'enum34 >= 1.1.6',
+    'absl-py >= 0.1.6',
     'numpy >= 1.12.1',
     'six >= 1.10.0',
     'protobuf >= 3.4.0',
-    'tensorflow-tensorboard >= 0.4.0rc1, < 0.5.0',
+    'tensorflow-tensorboard',
 ]
 
 project_name = 'tensorflow'
@@ -55,16 +54,17 @@ else:
   # mock comes with unittest.mock for python3, need to install for python2
   REQUIRED_PACKAGES.append('mock >= 2.0.0')
 
-# remove tensorboard from tf-nightly packages
+# tf-nightly should depend on tb-nightly
 if 'tf_nightly' in project_name:
-  for package in REQUIRED_PACKAGES:
-    if 'tensorflow-tensorboard' in package:
-      REQUIRED_PACKAGES.remove(package)
+  for i, pkg in enumerate(REQUIRED_PACKAGES):
+    if 'tensorboard' in pkg:
+      REQUIRED_PACKAGES[i] = 'tb-nightly >= 1.5.0a0, < 1.6.0a0'
       break
 
-# weakref.finalize was introduced in Python 3.4
+# weakref.finalize and enum were introduced in Python 3.4
 if sys.version_info < (3, 4):
   REQUIRED_PACKAGES.append('backports.weakref >= 1.0rc1')
+  REQUIRED_PACKAGES.append('enum34 >= 1.1.6')
 
 # pylint: disable=line-too-long
 CONSOLE_SCRIPTS = [
@@ -76,13 +76,13 @@ CONSOLE_SCRIPTS = [
     # is now declared by the tensorboard pip package. If we remove the
     # TensorBoard command, pip will inappropriately remove it during install,
     # even though the command is not removed, just moved to a different wheel.
-    'tensorboard = tensorboard.main:main',
+    'tensorboard = tensorboard.main:run_main',
 ]
 # pylint: enable=line-too-long
 
 # remove the tensorboard console script if building tf_nightly
 if 'tf_nightly' in project_name:
-  CONSOLE_SCRIPTS.remove('tensorboard = tensorboard.main:main')
+  CONSOLE_SCRIPTS.remove('tensorboard = tensorboard.main:run_main')
 
 TEST_PACKAGES = [
     'scipy >= 0.15.1',
@@ -177,6 +177,7 @@ def find_files(pattern, root):
 
 matches = ['../' + x for x in find_files('*', 'external') if '.py' not in x]
 matches += ['../' + x for x in find_files('*', '_solib_k8') if '.py' not in x]
+matches += ['../' + x for x in find_files('*', '_solib_local') if '.py' not in x]
 
 if os.name == 'nt':
   EXTENSION_NAME = 'python/_pywrap_tensorflow_internal.pyd'
diff --git a/tensorflow/tools/proto_text/gen_proto_text_functions.cc b/tensorflow/tools/proto_text/gen_proto_text_functions.cc
index ecb29a65a08b098cd167e5cbb2bdb5821e01a543..f0bb59acf801ba586fa8258b5b1ad9f202f014bf 100644
--- a/tensorflow/tools/proto_text/gen_proto_text_functions.cc
+++ b/tensorflow/tools/proto_text/gen_proto_text_functions.cc
@@ -132,6 +132,7 @@ int MainImpl(int argc, char** argv) {
       FILE* f = fopen(path.c_str(), "w");
       if (f == nullptr) return -1;
       if (fwrite(data.c_str(), 1, data.size(), f) != data.size()) {
+        fclose(f);
         return -1;
       }
       if (fclose(f) != 0) {
diff --git a/tensorflow/tools/test/performance.bzl b/tensorflow/tools/test/performance.bzl
index b5c4bbf5a700aedfea7abf7f1c07a62df0155cfc..cee53dd5b61e50126948e3652865a32f45eab092 100644
--- a/tensorflow/tools/test/performance.bzl
+++ b/tensorflow/tools/test/performance.bzl
@@ -21,8 +21,9 @@ def tf_cc_logged_benchmark(
     fail(" ".join(("Target must be a single well-defined test, e.g.,",
                    "//path/to:test. Received: %s" % target)))
 
-  all_tags = list(depset(tags) + \
-                  depset(["benchmark-test", "local", "manual", "regression-test"]))
+  all_tags = (
+    depset(tags) + depset(
+      ["benchmark-test", "local", "manual", "regression-test"])).to_list()
 
   tf_py_test(
       name = name,
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 8e62228c1b7c98d00c18bae7b834e799c47fbd1f..6a496f53f0a78a8a41c64472e75331488de2665a 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -1,40 +1,21 @@
 # TensorFlow external dependencies that can be loaded in WORKSPACE files.
 
 load("//third_party/gpus:cuda_configure.bzl", "cuda_configure")
-
-load("//third_party/sycl:sycl_configure.bzl", "sycl_configure")
 load("//third_party/mkl:build_defs.bzl", "mkl_repository")
-load(
-    "@io_bazel_rules_closure//closure/private:java_import_external.bzl",
-    "java_import_external",
-)
-load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external")
 load("//third_party/py:python_configure.bzl", "python_configure")
-load(
-    "//third_party/toolchains/cpus/arm:arm_compiler_configure.bzl",
-    "arm_compiler_configure",
-)
-
-def _is_windows(repository_ctx):
-  """Returns true if the host operating system is windows."""
-  return repository_ctx.os.name.lower().find("windows") != -1
-
-def _get_env_var(repository_ctx, name):
-  """Find an environment variable."""
-  if name in repository_ctx.os.environ:
-    return repository_ctx.os.environ[name]
-  else:
-    return None
+load("//third_party/sycl:sycl_configure.bzl", "sycl_configure")
+load("//third_party/toolchains/cpus/arm:arm_compiler_configure.bzl", "arm_compiler_configure")
+load("//third_party:repo.bzl", "tf_http_archive")
+load("@io_bazel_rules_closure//closure/private:java_import_external.bzl", "java_import_external")
+load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external")
 
 # Parse the bazel version string from `native.bazel_version`.
 def _parse_bazel_version(bazel_version):
   # Remove commit from version.
   version = bazel_version.split(" ", 1)[0]
-
   # Split into (release, date) parts and only return the release
   # as a tuple of integers.
   parts = version.split("-", 1)
-
   # Turn "release" into a tuple of strings
   version_tuple = ()
   for number in parts[0].split("."):
@@ -57,79 +38,6 @@ def check_version(bazel_version):
       fail("\nCurrent Bazel version is {}, expected at least {}\n".format(
           native.bazel_version, bazel_version))
 
-def _repos_are_siblings():
-  return Label("@foo//bar").workspace_root.startswith("../")
-
-# Temporary workaround to support including TensorFlow as a submodule until this
-# use-case is supported in the next Bazel release.
-def _temp_workaround_http_archive_impl(repo_ctx):
-  repo_ctx.template("BUILD", repo_ctx.attr.build_file, {
-      "%prefix%": ".." if _repos_are_siblings() else "external",
-      "%ws%": repo_ctx.attr.repository
-  }, False)
-  repo_ctx.download_and_extract(repo_ctx.attr.urls, "", repo_ctx.attr.sha256,
-                                "", repo_ctx.attr.strip_prefix)
-  if repo_ctx.attr.patch_file != None:
-    _apply_patch(repo_ctx, repo_ctx.attr.patch_file)
-
-temp_workaround_http_archive = repository_rule(
-    attrs = {
-        "build_file": attr.label(),
-        "repository": attr.string(),
-        "patch_file": attr.label(default = None),
-        "urls": attr.string_list(default = []),
-        "sha256": attr.string(default = ""),
-        "strip_prefix": attr.string(default = ""),
-    },
-    implementation = _temp_workaround_http_archive_impl,
-)
-
-# Executes specified command with arguments and calls 'fail' if it exited with
-# non-zero code
-def _execute_and_check_ret_code(repo_ctx, cmd_and_args):
-  result = repo_ctx.execute(cmd_and_args, timeout=10)
-  if result.return_code != 0:
-    fail(("Non-zero return code({1}) when executing '{0}':\n" + "Stdout: {2}\n"
-          + "Stderr: {3}").format(" ".join(cmd_and_args), result.return_code,
-                                  result.stdout, result.stderr))
-
-# Apply a patch_file to the repository root directory
-# Runs 'patch -p1'
-def _apply_patch(repo_ctx, patch_file):
-  # Don't check patch on Windows, because patch is only available under bash.
-  if not _is_windows(repo_ctx) and not repo_ctx.which("patch"):
-    fail("patch command is not found, please install it")
-
-  cmd = [
-      "patch", "-p1", "-d", repo_ctx.path("."), "-i", repo_ctx.path(patch_file)
-  ]
-  if _is_windows(repo_ctx):
-    bazel_sh = _get_env_var(repo_ctx, "BAZEL_SH")
-    if not bazel_sh:
-      fail("BAZEL_SH environment variable is not set")
-    cmd = [bazel_sh, "-c", " ".join(cmd)]
-  _execute_and_check_ret_code(repo_ctx, cmd)
-
-# Download the repository and apply a patch to its root
-def _patched_http_archive_impl(repo_ctx):
-  repo_ctx.download_and_extract(
-      repo_ctx.attr.urls,
-      sha256=repo_ctx.attr.sha256,
-      stripPrefix=repo_ctx.attr.strip_prefix)
-  _apply_patch(repo_ctx, repo_ctx.attr.patch_file)
-
-patched_http_archive = repository_rule(
-    attrs = {
-        "patch_file": attr.label(),
-        "build_file": attr.label(),
-        "repository": attr.string(),
-        "urls": attr.string_list(default = []),
-        "sha256": attr.string(default = ""),
-        "strip_prefix": attr.string(default = ""),
-    },
-    implementation = _patched_http_archive_impl,
-)
-
 # If TensorFlow is linked as a submodule.
 # path_prefix is no longer used.
 # tf_repo_name is thought to be under consideration.
@@ -151,63 +59,64 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   mkl_repository(
       name = "mkl",
       urls = [
-          "https://mirror.bazel.build/github.com/01org/mkl-dnn/releases/download/v0.9/mklml_lnx_2018.0.20170720.tgz",
-          "https://github.com/01org/mkl-dnn/releases/download/v0.9/mklml_lnx_2018.0.20170720.tgz",
+          "https://mirror.bazel.build/github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz",
+          "https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz",
       ],
-      sha256 = "57ba56c4c243f403ff78f417ff854ef50b9eddf4a610a917b7c95e7fa8553a4b",
-      strip_prefix = "mklml_lnx_2018.0.20170720",
+      sha256 = "6b07cb7e5451db67c2e31e785ae458b18f7f363c60a61685488f69e9ae7199d4",
+      strip_prefix = "mklml_lnx_2018.0.1.20171007",
       build_file = str(Label("//third_party/mkl:mkl.BUILD")),
-      repository = tf_repo_name,
   )
 
   if path_prefix:
     print("path_prefix was specified to tf_workspace but is no longer used " +
           "and will be removed in the future.")
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "mkl_dnn",
       urls = [
-          "https://mirror.bazel.build/github.com/01org/mkl-dnn/archive/b01e3a55a07be62172e713bcd2644c5176360212.tar.gz",
-          "https://github.com/01org/mkl-dnn/archive/b01e3a55a07be62172e713bcd2644c5176360212.tar.gz",
+          "https://mirror.bazel.build/github.com/01org/mkl-dnn/archive/aab753280e83137ba955f8f19d72cb6aaba545ef.tar.gz",
+          "https://github.com/01org/mkl-dnn/archive/aab753280e83137ba955f8f19d72cb6aaba545ef.tar.gz",
       ],
-      sha256 = "0d529ad4c49dc799e6df07c2b88b115d0668735da15fb3b3862d28d33fa68165",
-      strip_prefix = "mkl-dnn-b01e3a55a07be62172e713bcd2644c5176360212",
+      sha256 = "fb67f255a96bd4ad39b8dd104eca5aa92200c95c1ed36e59641e6c0478eefd11",
+      strip_prefix = "mkl-dnn-aab753280e83137ba955f8f19d72cb6aaba545ef",
       build_file = str(Label("//third_party/mkl_dnn:mkldnn.BUILD")),
   )
 
-  native.http_archive(
+  tf_http_archive(
       name = "com_google_absl",
       urls = [
-          "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/cc4bed2d74f7c8717e31f9579214ab52a9c9c610.tar.gz",
-          "https://github.com/abseil/abseil-cpp/archive/cc4bed2d74f7c8717e31f9579214ab52a9c9c610.tar.gz",
+          "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/720c017e30339fd1786ce4aac68bc8559736e53f.tar.gz",
+          "https://github.com/abseil/abseil-cpp/archive/720c017e30339fd1786ce4aac68bc8559736e53f.tar.gz",
       ],
-     sha256 = "f1a7349f88d2846210c42e2f7271dabeee404c2a3b4198e34a797993e3569b03",
-     strip_prefix = "abseil-cpp-cc4bed2d74f7c8717e31f9579214ab52a9c9c610",
+     sha256 = "5996380e3e8b981f55d1c8d58e709c00dbb4806ba367be75d0925a68cc2f6478",
+     strip_prefix = "abseil-cpp-720c017e30339fd1786ce4aac68bc8559736e53f",
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "eigen_archive",
       urls = [
-          "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/429aa5254200.tar.gz",
-          "https://bitbucket.org/eigen/eigen/get/429aa5254200.tar.gz",
+          "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/c2947c341c68.tar.gz",
+          "https://bitbucket.org/eigen/eigen/get/c2947c341c68.tar.gz",
       ],
-      sha256 = "61d8b6fc4279dd1dda986fb1677d15e3d641c07a3ea5abe255790b1f0c0c14e9",
-      strip_prefix = "eigen-eigen-429aa5254200",
+      sha256 = "f21f8ab8a8dbcb91cd0deeade19a043f47708d0da7a4000164cdf203b4a71e34",
+      strip_prefix = "eigen-eigen-c2947c341c68",
       build_file = str(Label("//third_party:eigen.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "arm_compiler",
-      build_file = str(Label("//:arm_compiler.BUILD")),
       sha256 = "970285762565c7890c6c087d262b0a18286e7d0384f13a37786d8521773bc969",
       strip_prefix = "tools-0e906ebc527eab1cdbf7adabff5b474da9562e9f/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf",
       urls = [
           "https://mirror.bazel.build/github.com/raspberrypi/tools/archive/0e906ebc527eab1cdbf7adabff5b474da9562e9f.tar.gz",
+          # Please uncomment me, when the next upgrade happens. Then
+          # remove the whitelist entry in third_party/repo.bzl.
           # "https://github.com/raspberrypi/tools/archive/0e906ebc527eab1cdbf7adabff5b474da9562e9f.tar.gz",
       ],
+      build_file = str(Label("//:arm_compiler.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "libxsmm_archive",
       urls = [
           "https://mirror.bazel.build/github.com/hfp/libxsmm/archive/1.8.1.tar.gz",
@@ -218,15 +127,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:libxsmm.BUILD")),
   )
 
-  native.bind(
-      name = "xsmm_avx",
-      actual = "@libxsmm_archive//third_party:xsmm_avx",
-  )
-
-  native.new_http_archive(
+  tf_http_archive(
       name = "ortools_archive",
       urls = [
           "https://mirror.bazel.build/github.com/google/or-tools/archive/253f7955c6a1fd805408fba2e42ac6d45b312d15.tar.gz",
+          # Please uncomment me, when the next upgrade happens. Then
+          # remove the whitelist entry in third_party/repo.bzl.
           # "https://github.com/google/or-tools/archive/253f7955c6a1fd805408fba2e42ac6d45b312d15.tar.gz",
       ],
       sha256 = "932075525642b04ac6f1b50589f1df5cd72ec2f448b721fd32234cf183f0e755",
@@ -234,17 +140,18 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:ortools.BUILD")),
   )
 
-  native.http_archive(
+  tf_http_archive(
       name = "com_googlesource_code_re2",
       urls = [
-          "https://mirror.bazel.build/github.com/google/re2/archive/b94b7cd42e9f02673cd748c1ac1d16db4052514c.tar.gz",
-          "https://github.com/google/re2/archive/b94b7cd42e9f02673cd748c1ac1d16db4052514c.tar.gz",
+          "https://mirror.bazel.build/github.com/google/re2/archive/26cd968b735e227361c9703683266f01e5df7857.tar.gz",
+          "https://github.com/google/re2/archive/26cd968b735e227361c9703683266f01e5df7857.tar.gz",
+
       ],
-      sha256 = "bd63550101e056427c9e7ff12a408c1c8b74e9803f393ca916b2926fc2c4906f",
-      strip_prefix = "re2-b94b7cd42e9f02673cd748c1ac1d16db4052514c",
+      sha256 = "e57eeb837ac40b5be37b2c6197438766e73343ffb32368efea793dfd8b28653b",
+      strip_prefix = "re2-26cd968b735e227361c9703683266f01e5df7857",
   )
 
-  native.http_archive(
+  tf_http_archive(
       name = "gemmlowp",
       urls = [
           "https://mirror.bazel.build/github.com/google/gemmlowp/archive/010bb3e71a26ca1d0884a167081d092b43563996.zip",
@@ -254,7 +161,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       strip_prefix = "gemmlowp-010bb3e71a26ca1d0884a167081d092b43563996",
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "farmhash_archive",
       urls = [
           "https://mirror.bazel.build/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz",
@@ -265,12 +172,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:farmhash.BUILD")),
   )
 
-  native.bind(
-      name = "farmhash",
-      actual = "@farmhash//:farmhash",
-  )
-
-  native.new_http_archive(
+  tf_http_archive(
       name = "highwayhash",
       urls = [
           "https://mirror.bazel.build/github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz",
@@ -281,7 +183,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:highwayhash.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "nasm",
       urls = [
           "https://mirror.bazel.build/www.nasm.us/pub/nasm/releasebuilds/2.12.02/nasm-2.12.02.tar.bz2",
@@ -292,7 +194,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:nasm.BUILD")),
   )
 
-  temp_workaround_http_archive(
+  tf_http_archive(
       name = "jpeg",
       urls = [
           "https://mirror.bazel.build/github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.1.tar.gz",
@@ -301,10 +203,9 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       sha256 = "c15a9607892113946379ccea3ca8b85018301b200754f209453ab21674268e77",
       strip_prefix = "libjpeg-turbo-1.5.1",
       build_file = str(Label("//third_party/jpeg:jpeg.BUILD")),
-      repository = tf_repo_name,
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "png_archive",
       urls = [
           "https://mirror.bazel.build/github.com/glennrp/libpng/archive/v1.2.53.tar.gz",
@@ -315,7 +216,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:png.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "sqlite_archive",
       urls = [
           "https://mirror.bazel.build/www.sqlite.org/2017/sqlite-amalgamation-3200000.zip",
@@ -323,10 +224,10 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "208780b3616f9de0aeb50822b7a8f5482f6515193859e91ed61637be6ad74fd4",
       strip_prefix = "sqlite-amalgamation-3200000",
-      build_file = str(Label("//third_party:sqlite.BUILD"))
+      build_file = str(Label("//third_party:sqlite.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "gif_archive",
       urls = [
           "https://mirror.bazel.build/ufpr.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz",
@@ -337,7 +238,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:gif.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "six_archive",
       urls = [
           "https://mirror.bazel.build/pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz",
@@ -348,17 +249,17 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:six.BUILD")),
   )
 
-  native.http_archive(
+  tf_http_archive(
       name = "absl_py",
       urls = [
-          "https://mirror.bazel.build/github.com/abseil/abseil-py/archive/231e3870b976c1dc61dce1749138661d21556028.tar.gz",
-          "https://github.com/abseil/abseil-py/archive/231e3870b976c1dc61dce1749138661d21556028.tar.gz",
+          "https://mirror.bazel.build/github.com/abseil/abseil-py/archive/acec853355ef987eae48a8d87a79351c15dff593.tar.gz",
+          "https://github.com/abseil/abseil-py/archive/acec853355ef987eae48a8d87a79351c15dff593.tar.gz",
       ],
-      sha256 = "8ea2b23bfdb9ae7622f3e5d95236bc600c8d8509a2f38c84732b3145585d4f73",
-      strip_prefix = "abseil-py-231e3870b976c1dc61dce1749138661d21556028",
+      sha256 = "29e4584e778bee13aa4093824133d131d927cc160561892880118d9ff7b95a6a",
+      strip_prefix = "abseil-py-acec853355ef987eae48a8d87a79351c15dff593",
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "org_python_pypi_backports_weakref",
       urls = [
           "https://mirror.bazel.build/pypi.python.org/packages/bc/cc/3cdb0a02e7e96f6c70bd971bc8a90b8463fda83e264fa9c5c1c98ceabd81/backports.weakref-1.0rc1.tar.gz",
@@ -369,7 +270,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:backports_weakref.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "com_github_andreif_codegen",
       urls = [
           "https://mirror.bazel.build/github.com/andreif/codegen/archive/1.0.tar.gz",
@@ -391,12 +292,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       },
   )
 
-  native.bind(
-      name = "six",
-      actual = "@six_archive//:six",
-  )
-
-  patched_http_archive(
+  tf_http_archive(
       name = "protobuf_archive",
       urls = [
           "https://mirror.bazel.build/github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
@@ -411,20 +307,10 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       patch_file = str(Label("//third_party/protobuf:add_noinlines.patch")),
   )
 
-  native.bind(
-      name = "protobuf",
-      actual = "@protobuf_archive//:protobuf",
-  )
-
-  native.bind(
-      name = "protobuf_headers",
-      actual = "@protobuf_archive//:protobuf_headers",
-  )
-
   # We need to import the protobuf library under the names com_google_protobuf
   # and com_google_protobuf_cc to enable proto_library support in bazel.
   # Unfortunately there is no way to alias http_archives at the moment.
-  native.http_archive(
+  tf_http_archive(
       name = "com_google_protobuf",
       urls = [
           "https://mirror.bazel.build/github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
@@ -434,7 +320,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       strip_prefix = "protobuf-b04e5cba356212e4e8c66c61bbe0c3a20537c5b9",
   )
 
-  native.http_archive(
+  tf_http_archive(
       name = "com_google_protobuf_cc",
       urls = [
           "https://mirror.bazel.build/github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
@@ -444,17 +330,17 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       strip_prefix = "protobuf-b04e5cba356212e4e8c66c61bbe0c3a20537c5b9",
   )
 
-  native.http_archive(
+  tf_http_archive(
       name = "nsync",
       urls = [
-          "https://mirror.bazel.build/github.com/google/nsync/archive/93815892dddafe9146a5f7e7042281d59d0f4323.tar.gz",
-          "https://github.com/google/nsync/archive/93815892dddafe9146a5f7e7042281d59d0f4323.tar.gz",
+          "https://mirror.bazel.build/github.com/google/nsync/archive/8502189abfa44c249c01c2cad64e6ed660a9a668.tar.gz",
+          "https://github.com/google/nsync/archive/8502189abfa44c249c01c2cad64e6ed660a9a668.tar.gz",
       ],
-      sha256 = "e3bd4555415ace511338fc27e595351738eea4e9006f1612b76c82914770716b",
-      strip_prefix = "nsync-93815892dddafe9146a5f7e7042281d59d0f4323",
+      sha256 = "51f81ff4202bbb820cdbedc061bd2eb6765f2b5c06489e7a8694bedac329e8f8",
+      strip_prefix = "nsync-8502189abfa44c249c01c2cad64e6ed660a9a668",
   )
 
-  native.http_archive(
+  tf_http_archive(
       name = "com_google_googletest",
       urls = [
           "https://mirror.bazel.build/github.com/google/googletest/archive/9816b96a6ddc0430671693df90192bbee57108b6.zip",
@@ -464,7 +350,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       strip_prefix = "googletest-9816b96a6ddc0430671693df90192bbee57108b6",
   )
 
-  native.http_archive(
+  tf_http_archive(
       name = "com_github_gflags_gflags",
       urls = [
           "https://mirror.bazel.build/github.com/gflags/gflags/archive/f8a0efe03aa69b3336d8e228b37d4ccb17324b88.tar.gz",
@@ -474,12 +360,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       strip_prefix = "gflags-f8a0efe03aa69b3336d8e228b37d4ccb17324b88",
   )
 
-  native.bind(
-      name = "python_headers",
-      actual = str(Label("//util/python:python_headers")),
-  )
-
-  native.new_http_archive(
+  tf_http_archive(
       name = "pcre",
       sha256 = "ccdf7e788769838f8285b3ee672ed573358202305ee361cfec7a4a4fb005bbc7",
       urls = [
@@ -490,7 +371,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:pcre.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "swig",
       sha256 = "58a475dbbd4a4d7075e5fe86d4e54c9edde39847cdb96a3053d87cb64a23a453",
       urls = [
@@ -502,7 +383,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:swig.BUILD")),
   )
 
-  temp_workaround_http_archive(
+  tf_http_archive(
       name = "curl",
       sha256 = "ff3e80c1ca6a068428726cd7dd19037a47cc538ce58ef61c59587191039b2ca6",
       urls = [
@@ -511,58 +392,19 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       strip_prefix = "curl-7.49.1",
       build_file = str(Label("//third_party:curl.BUILD")),
-      repository = tf_repo_name
-  )
-
-  # grpc expects //external:protobuf_clib and //external:protobuf_compiler
-  # to point to the protobuf's compiler library.
-  native.bind(
-      name = "protobuf_clib",
-      actual = "@protobuf_archive//:protoc_lib",
-  )
-
-  native.bind(
-      name = "libssl",
-      actual = "@boringssl//:ssl",
   )
 
-  # gRPC has includes directly from their third_party path for nanopb, so we
-  # must depend on their version of it.
-  native.bind(
-      name = "nanopb",
-      actual = "@grpc//third_party/nanopb:nanopb",
-  )
-
-  native.http_archive(
+  tf_http_archive(
       name = "grpc",
       urls = [
-          "https://mirror.bazel.build/github.com/grpc/grpc/archive/54e8f37e537794c2d814c1604c1282125f64f093.tar.gz",
-          "https://github.com/grpc/grpc/archive/54e8f37e537794c2d814c1604c1282125f64f093.tar.gz",
+          "https://mirror.bazel.build/github.com/grpc/grpc/archive/f836c7e941beb003289dc6e9a58a6e47f5caa5f0.tar.gz",
+          "https://github.com/grpc/grpc/archive/f836c7e941beb003289dc6e9a58a6e47f5caa5f0.tar.gz",
       ],
-      sha256 = "c2166b6d96daddf72fe45b2c594210c65ca17ec3c1b2e12089159a9529edb5e4",
-      strip_prefix = "grpc-54e8f37e537794c2d814c1604c1282125f64f093",
+      sha256 = "676425fc19e0290443b21f1804e5d1096456b6512b349606e3eae8e63299e6ee",
+      strip_prefix = "grpc-f836c7e941beb003289dc6e9a58a6e47f5caa5f0",
   )
 
-  # gRPC wants the existence of a cares dependence but its contents are not
-  # actually important since we have set GRPC_ARES=0 in tools/bazel.rc
-  native.bind(
-      name = "cares",
-      actual = "@grpc//third_party/nanopb:nanopb",
-  )
-
-  # protobuf expects //external:grpc_cpp_plugin to point to grpc's
-  # C++ plugin code generator.
-  native.bind(
-      name = "grpc_cpp_plugin",
-      actual = "@grpc//:grpc_cpp_plugin",
-  )
-
-  native.bind(
-      name = "grpc_lib",
-      actual = "@grpc//:grpc++_unsecure",
-  )
-
-  native.new_http_archive(
+  tf_http_archive(
       name = "linenoise",
       sha256 = "7f51f45887a3d31b4ce4fa5965210a5e64637ceac12720cfce7954d6a2e812f7",
       urls = [
@@ -575,19 +417,18 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
 
   # TODO(phawkins): currently, this rule uses an unofficial LLVM mirror.
   # Switch to an official source of snapshots if/when possible.
-  temp_workaround_http_archive(
+  tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/823bedeb8e23a095173389fa05680597eba3f569.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/823bedeb8e23a095173389fa05680597eba3f569.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/9ab4c272cb604a7f947865428c4ef2169fee2100.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/9ab4c272cb604a7f947865428c4ef2169fee2100.tar.gz",
       ],
-      sha256 = "93464bc760fd0319ebd0a5831fe477fdc4954f3612a29cc64d7405eaee8e00b2",
-      strip_prefix = "llvm-823bedeb8e23a095173389fa05680597eba3f569",
+      sha256 = "1b1b7d3800a94ca2302e3dd670dbe84238749583027883784b55297059d83da8",
+      strip_prefix = "llvm-9ab4c272cb604a7f947865428c4ef2169fee2100",
       build_file = str(Label("//third_party/llvm:llvm.BUILD")),
-      repository = tf_repo_name,
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "lmdb",
       urls = [
           "https://mirror.bazel.build/github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz",
@@ -598,7 +439,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:lmdb.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "jsoncpp_git",
       urls = [
           "https://mirror.bazel.build/github.com/open-source-parsers/jsoncpp/archive/11086dd6a7eba04289944367ca82cea71299ed70.tar.gz",
@@ -609,12 +450,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:jsoncpp.BUILD")),
   )
 
-  native.bind(
-      name = "jsoncpp",
-      actual = "@jsoncpp_git//:jsoncpp",
-  )
-
-  native.http_archive(
+  tf_http_archive(
       name = "boringssl",
       urls = [
           "https://mirror.bazel.build/github.com/google/boringssl/archive/a0fb951d2a26a8ee746b52f3ba81ab011a0af778.tar.gz",
@@ -624,7 +460,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       strip_prefix = "boringssl-a0fb951d2a26a8ee746b52f3ba81ab011a0af778",
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "zlib_archive",
       urls = [
           "https://mirror.bazel.build/zlib.net/zlib-1.2.8.tar.gz",
@@ -635,12 +471,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:zlib.BUILD")),
   )
 
-  native.bind(
-      name = "zlib",
-      actual = "@zlib_archive//:zlib",
-  )
-
-  native.new_http_archive(
+  tf_http_archive(
       name = "fft2d",
       urls = [
           "https://mirror.bazel.build/www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz",
@@ -650,7 +481,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party/fft2d:fft2d.BUILD")),
   )
 
-  temp_workaround_http_archive(
+  tf_http_archive(
       name = "snappy",
       urls = [
           "https://mirror.bazel.build/github.com/google/snappy/archive/1.1.4.tar.gz",
@@ -659,10 +490,9 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       sha256 = "2f7504c73d85bac842e893340333be8cb8561710642fc9562fccdd9d2c3fcc94",
       strip_prefix = "snappy-1.1.4",
       build_file = str(Label("//third_party:snappy.BUILD")),
-      repository = tf_repo_name,
   )
 
-  temp_workaround_http_archive(
+  tf_http_archive(
       name = "nccl_archive",
       urls = [
           "https://mirror.bazel.build/github.com/nvidia/nccl/archive/03d856977ecbaac87e598c0c4bafca96761b9ac7.tar.gz",
@@ -671,19 +501,17 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       sha256 = "2ca86fb6179ecbff789cc67c836139c1bbc0324ed8c04643405a30bf26325176",
       strip_prefix = "nccl-03d856977ecbaac87e598c0c4bafca96761b9ac7",
       build_file = str(Label("//third_party:nccl.BUILD")),
-      repository = tf_repo_name,
   )
 
-  temp_workaround_http_archive(
+  tf_http_archive(
       name = "aws",
       urls = [
-          "https://mirror.bazel.build/github.com/aws/aws-sdk-cpp/archive/1.0.90.tar.gz",
-          "https://github.com/aws/aws-sdk-cpp/archive/1.0.90.tar.gz",
+          "https://mirror.bazel.build/github.com/aws/aws-sdk-cpp/archive/1.3.15.tar.gz",
+          "https://github.com/aws/aws-sdk-cpp/archive/1.3.15.tar.gz",
       ],
-      sha256 = "f599b57aec4f03ad696044dd430b2d201864113937353adc346f53ad47991319",
-      strip_prefix = "aws-sdk-cpp-1.0.90",
+      sha256 = "b888d8ce5fc10254c3dd6c9020c7764dd53cf39cf011249d0b4deda895de1b7c",
+      strip_prefix = "aws-sdk-cpp-1.3.15",
       build_file = str(Label("//third_party:aws.BUILD")),
-      repository = tf_repo_name
   )
 
   java_import_external(
@@ -711,7 +539,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       testonly_ = True,
   )
 
-  temp_workaround_http_archive(
+  tf_http_archive(
       name = "jemalloc",
       urls = [
           "https://mirror.bazel.build/github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
@@ -720,7 +548,6 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       sha256 = "3c8f25c02e806c3ce0ab5fb7da1817f89fc9732709024e2a81b6b82f7cc792a8",
       strip_prefix = "jemalloc-4.4.0",
       build_file = str(Label("//third_party:jemalloc.BUILD")),
-      repository = tf_repo_name,
   )
 
   java_import_external(
@@ -758,7 +585,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       licenses = ["notice"],  # Apache 2.0
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "com_google_pprof",
       urls = [
           "https://mirror.bazel.build/github.com/google/pprof/archive/c0fb62ec88c411cc91194465e54db2632845b650.tar.gz",
@@ -769,7 +596,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:pprof.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "cub_archive",
       urls = [
           "https://mirror.bazel.build/github.com/NVlabs/cub/archive/1.7.4.zip",
@@ -780,12 +607,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:cub.BUILD")),
   )
 
-  native.bind(
-      name = "cub",
-      actual = "@cub_archive//:cub",
-  )
-
-  native.new_http_archive(
+  tf_http_archive(
       name = "cython",
       sha256 = "6dcd30b5ceb887b2b965ee7ceb82ea3acb5f0642fe2206c7636b45acea4798e5",
       urls = [
@@ -794,19 +616,20 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       strip_prefix = "cython-3732784c45cfb040a5b0936951d196f83a12ea17",
       build_file = str(Label("//third_party:cython.BUILD")),
+      delete = ["BUILD.bazel"],
   )
 
-  native.http_archive(
+  tf_http_archive(
       name = "bazel_toolchains",
       urls = [
-          "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/af4681c3d19f063f090222ec3d04108c4e0ca255.tar.gz",
-          "https://github.com/bazelbuild/bazel-toolchains/archive/af4681c3d19f063f090222ec3d04108c4e0ca255.tar.gz",
+          "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/b49ba3689f46ac50e9277dafd8ff32b26951f82e.tar.gz",
+          "https://github.com/bazelbuild/bazel-toolchains/archive/b49ba3689f46ac50e9277dafd8ff32b26951f82e.tar.gz",
       ],
-      sha256 = "d58bb2d6c8603f600d522b6104d6192a65339aa26cbba9f11ff5c4b36dedb928",
-      strip_prefix = "bazel-toolchains-af4681c3d19f063f090222ec3d04108c4e0ca255",
+      sha256 = "1266f1e27b4363c83222f1a776397c7a069fbfd6aacc9559afa61cdd73e1b429",
+      strip_prefix = "bazel-toolchains-b49ba3689f46ac50e9277dafd8ff32b26951f82e",
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "arm_neon_2_x86_sse",
       sha256 = "c8d90aa4357f8079d427e87a6f4c493da1fa4140aee926c05902d7ec1533d9a5",
       strip_prefix = "ARM_NEON_2_x86_SSE-0f77d9d182265259b135dad949230ecbf1a2633d",
@@ -817,23 +640,109 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:arm_neon_2_x86_sse.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "flatbuffers",
-      build_file = "third_party/flatbuffers/flatbuffers.BUILD",
       strip_prefix = "flatbuffers-971a68110e4fc1bace10fcb6deeb189e7e1a34ce",
       sha256 = "874088d2ee0d9f8524191f77209556415f03dd44e156276edf19e5b90ceb5f55",
       urls = [
           "https://mirror.bazel.build/github.com/google/flatbuffers/archive/971a68110e4fc1bace10fcb6deeb189e7e1a34ce.tar.gz",
           "https://github.com/google/flatbuffers/archive/971a68110e4fc1bace10fcb6deeb189e7e1a34ce.tar.gz",
       ],
+      build_file = str(Label("//third_party/flatbuffers:flatbuffers.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "tflite_mobilenet",
-      build_file = str(Label("//third_party:tflite_mobilenet.BUILD")),
       sha256 = "23f814d1c076bdf03715dfb6cab3713aa4fbdf040fd5448c43196bd2e97a4c1b",
       urls = [
           "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip",
           "https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip",
       ],
+      build_file = str(Label("//third_party:tflite_mobilenet.BUILD")),
+  )
+
+  tf_http_archive(
+      name = "tflite_smartreply",
+      sha256 = "8980151b85a87a9c1a3bb1ed4748119e4a85abd3cb5744d83da4d4bd0fbeef7c",
+      urls = [
+          "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/models/tflite/smartreply_1.0_2017_11_01.zip",
+          "https://storage.googleapis.com/download.tensorflow.org/models/tflite/smartreply_1.0_2017_11_01.zip"
+      ],
+      build_file = str(Label("//third_party:tflite_smartreply.BUILD")),
+  )
+
+  ##############################################################################
+  # BIND DEFINITIONS
+  #
+  # Please do not add bind() definitions unless we have no other choice.
+  # If that ends up being the case, please leave a comment explaining
+  # why we can't depend on the canonical build target.
+
+  # gRPC wants a cares dependency but its contents is not actually
+  # important since we have set GRPC_ARES=0 in tools/bazel.rc
+  native.bind(
+      name = "cares",
+      actual = "@grpc//third_party/nanopb:nanopb",
+  )
+
+  # Needed by Protobuf
+  native.bind(
+      name = "grpc_cpp_plugin",
+      actual = "@grpc//:grpc_cpp_plugin",
+  )
+
+  # gRPC has three empty C++ functions which it wants the user to define
+  # at build time. https://github.com/grpc/grpc/issues/13590
+  native.bind(
+      name = "grpc_lib",
+      actual = "@grpc//:grpc++_unsecure",
+  )
+
+  # Needed by gRPC
+  native.bind(
+      name = "libssl",
+      actual = "@boringssl//:ssl",
+  )
+
+  # Needed by gRPC
+  native.bind(
+      name = "nanopb",
+      actual = "@grpc//third_party/nanopb:nanopb",
+  )
+
+  # Needed by gRPC
+  native.bind(
+      name = "protobuf",
+      actual = "@protobuf_archive//:protobuf",
+  )
+
+  # gRPC expects //external:protobuf_clib and //external:protobuf_compiler
+  # to point to Protobuf's compiler library.
+  native.bind(
+      name = "protobuf_clib",
+      actual = "@protobuf_archive//:protoc_lib",
+  )
+
+  # Needed by gRPC
+  native.bind(
+      name = "protobuf_headers",
+      actual = "@protobuf_archive//:protobuf_headers",
+  )
+
+  # Needed by Protobuf
+  native.bind(
+      name = "python_headers",
+      actual = str(Label("//util/python:python_headers")),
+  )
+
+  # Needed by Protobuf
+  native.bind(
+      name = "six",
+      actual = "@six_archive//:six",
+  )
+
+  # Needed by gRPC
+  native.bind(
+      name = "zlib",
+      actual = "@zlib_archive//:zlib",
   )
diff --git a/third_party/aws.BUILD b/third_party/aws.BUILD
index bc9e37ffb3873118960f65a6d566ca5b34e0d613..bf5310aa1657dee5e0ccc623b2028a4f8ab7aca3 100644
--- a/third_party/aws.BUILD
+++ b/third_party/aws.BUILD
@@ -7,21 +7,21 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("@%ws%//third_party:common.bzl", "template_rule")
+load("@org_tensorflow//third_party:common.bzl", "template_rule")
 
 cc_library(
     name = "aws",
     srcs = select({
-        "@%ws%//tensorflow:linux_x86_64": glob([
+        "@org_tensorflow//tensorflow:linux_x86_64": glob([
             "aws-cpp-sdk-core/source/platform/linux-shared/*.cpp",
         ]),
-        "@%ws%//tensorflow:darwin": glob([
+        "@org_tensorflow//tensorflow:darwin": glob([
             "aws-cpp-sdk-core/source/platform/linux-shared/*.cpp",
         ]),
-        "@%ws%//tensorflow:linux_ppc64le": glob([
+        "@org_tensorflow//tensorflow:linux_ppc64le": glob([
             "aws-cpp-sdk-core/source/platform/linux-shared/*.cpp",
         ]),
-        "@%ws%//tensorflow:raspberry_pi_armeabi": glob([
+        "@org_tensorflow//tensorflow:raspberry_pi_armeabi": glob([
             "aws-cpp-sdk-core/source/platform/linux-shared/*.cpp",
         ]),
         "//conditions:default": [],
@@ -53,17 +53,17 @@ cc_library(
         "aws-cpp-sdk-core/include/aws/core/SDKConfig.h",
     ],
     defines = select({
-        "@%ws%//tensorflow:linux_x86_64": [
+        "@org_tensorflow//tensorflow:linux_x86_64": [
             "PLATFORM_LINUX",
             "ENABLE_CURL_CLIENT",
             "ENABLE_NO_ENCRYPTION",
         ],
-        "@%ws%//tensorflow:darwin": [
+        "@org_tensorflow//tensorflow:darwin": [
             "PLATFORM_APPLE",
             "ENABLE_CURL_CLIENT",
             "ENABLE_NO_ENCRYPTION",
         ],
-        "@%ws%//tensorflow:linux_ppc64le": [
+        "@org_tensorflow//tensorflow:linux_ppc64le": [
             "PLATFORM_LINUX",
             "ENABLE_CURL_CLIENT",
             "ENABLE_NO_ENCRYPTION",
diff --git a/third_party/curl.BUILD b/third_party/curl.BUILD
index 805a30d2620078becaf1fe08a6856fce70decc50..4def6f94892329e0d8b594b824babd60ea259351 100644
--- a/third_party/curl.BUILD
+++ b/third_party/curl.BUILD
@@ -6,10 +6,11 @@ licenses(["notice"])  # MIT/X derivative license
 exports_files(["COPYING"])
 
 CURL_WIN_COPTS = [
-    "/I%prefix%/curl/lib",
+    "/Iexternal/curl/lib",
     "/DHAVE_CONFIG_H",
     "/DCURL_DISABLE_FTP",
     "/DCURL_DISABLE_NTLM",
+    "/DCURL_DISABLE_PROXY",
     "/DHAVE_LIBZ",
     "/DHAVE_ZLIB_H",
     # Defining _USING_V110_SDK71_ is hackery to defeat curl's incorrect
@@ -23,6 +24,8 @@ CURL_WIN_SRCS = [
     "lib/asyn-thread.c",
     "lib/inet_ntop.c",
     "lib/system_win32.c",
+    "lib/vtls/schannel.c",
+    "lib/idn_win32.c",
 ]
 
 cc_library(
@@ -224,14 +227,14 @@ cc_library(
         "lib/wildcard.h",
         "lib/x509asn1.h",
     ] + select({
-        "@%ws%//tensorflow:darwin": [
+        "@org_tensorflow//tensorflow:darwin": [
             "lib/vtls/darwinssl.c",
         ],
-        "@%ws%//tensorflow:ios": [
+        "@org_tensorflow//tensorflow:ios": [
             "lib/vtls/darwinssl.c",
         ],
-        "@%ws%//tensorflow:windows": CURL_WIN_SRCS,
-        "@%ws%//tensorflow:windows_msvc": CURL_WIN_SRCS,
+        "@org_tensorflow//tensorflow:windows": CURL_WIN_SRCS,
+        "@org_tensorflow//tensorflow:windows_msvc": CURL_WIN_SRCS,
         "//conditions:default": [
             "lib/vtls/openssl.c",
         ],
@@ -248,10 +251,10 @@ cc_library(
         "include/curl/typecheck-gcc.h",
     ],
     copts = select({
-        "@%ws%//tensorflow:windows": CURL_WIN_COPTS,
-        "@%ws%//tensorflow:windows_msvc": CURL_WIN_COPTS,
+        "@org_tensorflow//tensorflow:windows": CURL_WIN_COPTS,
+        "@org_tensorflow//tensorflow:windows_msvc": CURL_WIN_COPTS,
         "//conditions:default": [
-            "-I%prefix%/curl/lib",
+            "-Iexternal/curl/lib",
             "-D_GNU_SOURCE",
             "-DHAVE_CONFIG_H",
             "-DCURL_DISABLE_FTP",
@@ -261,14 +264,14 @@ cc_library(
             "-Wno-string-plus-int",
         ],
     }) + select({
-        "@%ws%//tensorflow:darwin": [
+        "@org_tensorflow//tensorflow:darwin": [
             "-fno-constant-cfstrings",
         ],
-        "@%ws%//tensorflow:windows": [
+        "@org_tensorflow//tensorflow:windows": [
             # See curl.h for discussion of write size and Windows
             "/DCURL_MAX_WRITE_SIZE=16384",
         ],
-        "@%ws%//tensorflow:windows_msvc": [
+        "@org_tensorflow//tensorflow:windows_msvc": [
             # See curl.h for discussion of write size and Windows
             "/DCURL_MAX_WRITE_SIZE=16384",
         ],
@@ -276,23 +279,30 @@ cc_library(
             "-DCURL_MAX_WRITE_SIZE=65536",
         ],
     }),
+    defines = ["CURL_STATICLIB"],
     includes = ["include"],
     linkopts = select({
-        "@%ws%//tensorflow:android": [
+        "@org_tensorflow//tensorflow:android": [
             "-pie",
         ],
-        "@%ws%//tensorflow:darwin": [
+        "@org_tensorflow//tensorflow:darwin": [
             "-Wl,-framework",
             "-Wl,CoreFoundation",
             "-Wl,-framework",
             "-Wl,Security",
         ],
-        "@%ws%//tensorflow:ios": [],
-        "@%ws%//tensorflow:windows": [
-            "-Wl,ws2_32.lib",
+        "@org_tensorflow//tensorflow:ios": [],
+        "@org_tensorflow//tensorflow:windows": [
+            "-DEFAULTLIB:ws2_32.lib",
+            "-DEFAULTLIB:advapi32.lib",
+            "-DEFAULTLIB:crypt32.lib",
+            "-DEFAULTLIB:Normaliz.lib",
         ],
-        "@%ws%//tensorflow:windows_msvc": [
-            "-Wl,ws2_32.lib",
+        "@org_tensorflow//tensorflow:windows_msvc": [
+            "-DEFAULTLIB:ws2_32.lib",
+            "-DEFAULTLIB:advapi32.lib",
+            "-DEFAULTLIB:crypt32.lib",
+            "-DEFAULTLIB:Normaliz.lib",
         ],
         "//conditions:default": [
             "-lrt",
@@ -302,9 +312,9 @@ cc_library(
     deps = [
         "@zlib_archive//:zlib",
     ] + select({
-        "@%ws%//tensorflow:ios": [],
-        "@%ws%//tensorflow:windows": [],
-        "@%ws%//tensorflow:windows_msvc": [],
+        "@org_tensorflow//tensorflow:ios": [],
+        "@org_tensorflow//tensorflow:windows": [],
+        "@org_tensorflow//tensorflow:windows_msvc": [],
         "//conditions:default": [
             "@boringssl//:ssl",
         ],
@@ -312,7 +322,7 @@ cc_library(
 )
 
 CURL_BIN_WIN_COPTS = [
-    "/I%prefix%/curl/lib",
+    "/Iexternal/curl/lib",
     "/DHAVE_CONFIG_H",
     "/DCURL_DISABLE_LIBCURL_OPTION",
 ]
@@ -406,10 +416,10 @@ cc_binary(
         "src/tool_xattr.h",
     ],
     copts = select({
-        "@%ws%//tensorflow:windows": CURL_BIN_WIN_COPTS,
-        "@%ws%//tensorflow:windows_msvc": CURL_BIN_WIN_COPTS,
+        "@org_tensorflow//tensorflow:windows": CURL_BIN_WIN_COPTS,
+        "@org_tensorflow//tensorflow:windows_msvc": CURL_BIN_WIN_COPTS,
         "//conditions:default": [
-            "-I%prefix%/curl/lib",
+            "-Iexternal/curl/lib",
             "-D_GNU_SOURCE",
             "-DHAVE_CONFIG_H",
             "-DCURL_DISABLE_LIBCURL_OPTION",
@@ -438,12 +448,22 @@ genrule(
         "#  include \"lib/config-win32.h\"",
         "#  define BUILDING_LIBCURL 1",
         "#  define CURL_DISABLE_CRYPTO_AUTH 1",
+        "#  define CURL_DISABLE_DICT 1",
+        "#  define CURL_DISABLE_FILE 1",
+        "#  define CURL_DISABLE_GOPHER 1",
         "#  define CURL_DISABLE_IMAP 1",
         "#  define CURL_DISABLE_LDAP 1",
         "#  define CURL_DISABLE_LDAPS 1",
         "#  define CURL_DISABLE_POP3 1",
         "#  define CURL_PULL_WS2TCPIP_H 1",
-        "#  define HTTP_ONLY 1",
+        "#  define CURL_DISABLE_SMTP 1",
+        "#  define CURL_DISABLE_TELNET 1",
+        "#  define CURL_DISABLE_TFTP 1",
+        "#  define CURL_PULL_WS2TCPIP_H 1",
+        "#  define USE_WINDOWS_SSPI 1",
+        "#  define USE_WIN32_IDN 1",
+        "#  define USE_SCHANNEL 1",
+        "#  define WANT_IDN_PROTOTYPES 1",
         "#elif defined(__APPLE__)",
         "#  define HAVE_FSETXATTR_6 1",
         "#  define HAVE_SETMODE 1",
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Patch3d.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Patch3d.h
index 89190eb1affbc710bb031c3b1eab09e9a07222f5..2864f8329990325c73aadb32018ae975809cb09d 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Patch3d.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Patch3d.h
@@ -151,7 +151,7 @@ Extract3DPatches(
 
   // TODO(mjanusz): Consider getting rid of pad(), and stride() and extend
   // extract_patches to take additional parameters for padding/striding,
-  // similarly to etract_image_patches.
+  // similarly to extract_image_patches.
   return input.pad(paddings, padding_value).extract_patches(patch_dims).reshape(pre_stride_dims).stride(strides);
 }
 
diff --git a/third_party/examples/eager/spinn/BUILD b/third_party/examples/eager/spinn/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..0e39d4696fb5b4efafc94b4b96965d232ae4e473
--- /dev/null
+++ b/third_party/examples/eager/spinn/BUILD
@@ -0,0 +1,14 @@
+licenses(["notice"])  # 3-clause BSD.
+
+py_binary(
+    name = "spinn",
+    srcs = ["spinn.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/contrib/eager/python:tfe",
+        "//tensorflow/contrib/eager/python/examples/spinn:data",
+        "@six_archive//:six",
+    ],
+)
diff --git a/third_party/examples/eager/spinn/LICENSE b/third_party/examples/eager/spinn/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..09d493bf1fc257505c1336f3f87425568ab9da3c
--- /dev/null
+++ b/third_party/examples/eager/spinn/LICENSE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2017, 
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/third_party/examples/eager/spinn/README.md b/third_party/examples/eager/spinn/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c00d8d9015939575bc1d72ad911fa3f31d890caa
--- /dev/null
+++ b/third_party/examples/eager/spinn/README.md
@@ -0,0 +1,54 @@
+# SPINN with TensorFlow eager execution
+
+SPINN, or Stack-Augmented Parser-Interpreter Neural Network, is a recursive
+neural network that utilizes syntactic parse information for natural language
+understanding.
+
+SPINN was originally described by:
+Bowman, S.R., Gauthier, J., Rastogi A., Gupta, R., Manning, C.D., & Potts, C.
+  (2016). A Fast Unified Model for Parsing and Sentence Understanding.
+  https://arxiv.org/abs/1603.06021
+
+Our implementation is based on @jekbradbury's PyTorch implementation at:
+https://github.com/jekbradbury/examples/blob/spinn/snli/spinn.py,
+
+which was released under the BSD 3-Clause License at:
+https://github.com/jekbradbury/examples/blob/spinn/LICENSE
+
+##  Content
+
+Python source file(s):
+- `spinn.py`: Model definition and training routines written with TensorFlow
+  eager execution idioms.
+
+## To run
+
+- Make sure you have installed the latest `tf-nightly` or `tf-nightly-gpu` pip
+  package of TensorFlow in order to access the eager execution feature.
+
+- Download and extract the raw SNLI data and GloVe embedding vectors.
+  For example:
+
+  ```bash
+  curl -fSsL https://nlp.stanford.edu/projects/snli/snli_1.0.zip --create-dirs -o /tmp/spinn-data/snli/snli_1.0.zip
+  unzip -d /tmp/spinn-data/snli /tmp/spinn-data/snli/snli_1.0.zip
+  curl -fSsL http://nlp.stanford.edu/data/glove.42B.300d.zip --create-dirs -o /tmp/spinn-data/glove/glove.42B.300d.zip
+  unzip -d /tmp/spinn-data/glove /tmp/spinn-data/glove/glove.42B.300d.zip
+  ```
+
+- Train model. E.g.,
+
+  ```bash
+  python spinn.py --data_root /tmp/spinn-data --logdir /tmp/spinn-logs
+  ```
+
+  During training, model checkpoints and TensorBoard summaries will be written
+  periodically to the directory specified with the `--logdir` flag.
+  The training script will reload a saved checkpoint from the directory if it
+  can find one there.
+
+  To view the summaries with TensorBoard:
+
+  ```bash
+  tensorboard --logdir /tmp/spinn-logs
+  ```
diff --git a/third_party/examples/eager/spinn/spinn.py b/third_party/examples/eager/spinn/spinn.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2fa18eeb1077c8a1ccd4ab0bcd178f952e17270
--- /dev/null
+++ b/third_party/examples/eager/spinn/spinn.py
@@ -0,0 +1,732 @@
+r"""Implementation of SPINN in TensorFlow eager execution.
+
+SPINN: Stack-Augmented Parser-Interpreter Neural Network.
+
+Ths file contains model definition and code for training the model.
+
+The model definition is based on PyTorch implementation at:
+  https://github.com/jekbradbury/examples/tree/spinn/snli
+
+which was released under a BSD 3-Clause License at:
+https://github.com/jekbradbury/examples/blob/spinn/LICENSE:
+
+Copyright (c) 2017,
+All rights reserved.
+
+See ./LICENSE for more details.
+
+Instructions for use:
+* See `README.md` for details on how to prepare the SNLI and GloVe data.
+* Suppose you have prepared the data at "/tmp/spinn-data", use the folloing
+  command to train the model:
+
+  ```bash
+  python spinn.py --data_root /tmp/spinn-data --logdir /tmp/spinn-logs
+  ```
+
+  Checkpoints and TensorBoard summaries will be written to "/tmp/spinn-logs".
+
+References:
+* Bowman, S.R., Gauthier, J., Rastogi A., Gupta, R., Manning, C.D., & Potts, C.
+  (2016). A Fast Unified Model for Parsing and Sentence Understanding.
+  https://arxiv.org/abs/1603.06021
+* Bradbury, J. (2017). Recursive Neural Networks with PyTorch.
+  https://devblogs.nvidia.com/parallelforall/recursive-neural-networks-pytorch/
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import itertools
+import os
+import sys
+import time
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+
+import tensorflow.contrib.eager as tfe
+from tensorflow.contrib.eager.python.examples.spinn import data
+
+
+def _bundle(lstm_iter):
+  """Concatenate a list of Tensors along 1st axis and split result into two.
+
+  Args:
+    lstm_iter: A `list` of `N` dense `Tensor`s, each of which has the shape
+      (R, 2 * M).
+
+  Returns:
+    A `list` of two dense `Tensor`s, each of which has the shape (N * R, M).
+  """
+  return tf.split(tf.concat(lstm_iter, 0), 2, axis=1)
+
+
+def _unbundle(state):
+  """Concatenate a list of Tensors along 2nd axis and split result.
+
+  This is the inverse of `_bundle`.
+
+  Args:
+    state: A `list` of two dense `Tensor`s, each of which has the shape (R, M).
+
+  Returns:
+    A `list` of `R` dense `Tensors`, each of which has the shape (1, 2 * M).
+  """
+  return tf.split(tf.concat(state, 1), state[0].shape[0], axis=0)
+
+
+class Reducer(tfe.Network):
+  """A module that applies reduce operation on left and right vectors."""
+
+  def __init__(self, size, tracker_size=None):
+    super(Reducer, self).__init__()
+    self.left = self.track_layer(tf.layers.Dense(5 * size, activation=None))
+    self.right = self.track_layer(
+        tf.layers.Dense(5 * size, activation=None, use_bias=False))
+    if tracker_size is not None:
+      self.track = self.track_layer(
+          tf.layers.Dense(5 * size, activation=None, use_bias=False))
+    else:
+      self.track = None
+
+  def call(self, left_in, right_in, tracking=None):
+    """Invoke forward pass of the Reduce module.
+
+    This method feeds a linear combination of `left_in`, `right_in` and
+    `tracking` into a Tree LSTM and returns the output of the Tree LSTM.
+
+    Args:
+      left_in: A list of length L. Each item is a dense `Tensor` with
+        the shape (1, n_dims). n_dims is the size of the embedding vector.
+      right_in: A list of the same length as `left_in`. Each item should have
+        the same shape as the items of `left_in`.
+      tracking: Optional list of the same length as `left_in`. Each item is a
+        dense `Tensor` with shape (1, tracker_size * 2). tracker_size is the
+        size of the Tracker's state vector.
+
+    Returns:
+      Output: A list of length batch_size. Each item has the shape (1, n_dims).
+    """
+    left, right = _bundle(left_in), _bundle(right_in)
+    lstm_in = self.left(left[0]) + self.right(right[0])
+    if self.track and tracking:
+      lstm_in += self.track(_bundle(tracking)[0])
+    return _unbundle(self._tree_lstm(left[1], right[1], lstm_in))
+
+  def _tree_lstm(self, c1, c2, lstm_in):
+    a, i, f1, f2, o = tf.split(lstm_in, 5, axis=1)
+    c = tf.tanh(a) * tf.sigmoid(i) + tf.sigmoid(f1) * c1 + tf.sigmoid(f2) * c2
+    h = tf.sigmoid(o) * tf.tanh(c)
+    return h, c
+
+
+class Tracker(tfe.Network):
+  """A module that tracks the history of the sentence with an LSTM."""
+
+  def __init__(self, tracker_size, predict):
+    """Constructor of Tracker.
+
+    Args:
+      tracker_size: Number of dimensions of the underlying `LSTMCell`.
+      predict: (`bool`) Whether prediction mode is enabled.
+    """
+    super(Tracker, self).__init__()
+    self._rnn = self.track_layer(tf.nn.rnn_cell.LSTMCell(tracker_size))
+    self._state_size = tracker_size
+    if predict:
+      self._transition = self.track_layer(tf.layers.Dense(4))
+    else:
+      self._transition = None
+
+  def reset_state(self):
+    self.state = None
+
+  def call(self, bufs, stacks):
+    """Invoke the forward pass of the Tracker module.
+
+    This method feeds the concatenation of the top two elements of the stacks
+    into an LSTM cell and returns the resultant state of the LSTM cell.
+
+    Args:
+      bufs: A `list` of length batch_size. Each item is a `list` of
+        max_sequence_len (maximum sequence length of the batch). Each item
+        of the nested list is a dense `Tensor` of shape (1, d_proj), where
+        d_proj is the size of the word embedding vector or the size of the
+        vector space that the word embedding vector is projected to.
+      stacks: A `list` of size batch_size. Each item is a `list` of
+        variable length corresponding to the current height of the stack.
+        Each item of the nested list is a dense `Tensor` of shape (1, d_proj).
+
+    Returns:
+      1. A list of length batch_size. Each item is a dense `Tensor` of shape
+        (1, d_tracker * 2).
+      2.  If under predict mode, result of applying a Dense layer on the
+        first state vector of the RNN. Else, `None`.
+    """
+    buf = _bundle([buf[-1] for buf in bufs])[0]
+    stack1 = _bundle([stack[-1] for stack in stacks])[0]
+    stack2 = _bundle([stack[-2] for stack in stacks])[0]
+    x = tf.concat([buf, stack1, stack2], 1)
+    if self.state is None:
+      batch_size = int(x.shape[0])
+      zeros = tf.zeros((batch_size, self._state_size), dtype=tf.float32)
+      self.state = [zeros, zeros]
+    _, self.state = self._rnn(x, self.state)
+    unbundled = _unbundle(self.state)
+    if self._transition:
+      return unbundled, self._transition(self.state[0])
+    else:
+      return unbundled, None
+
+
+class SPINN(tfe.Network):
+  """Stack-augmented Parser-Interpreter Neural Network.
+
+  See https://arxiv.org/abs/1603.06021 for more details.
+  """
+
+  def __init__(self, config):
+    """Constructor of SPINN.
+
+    Args:
+      config: A `namedtupled` with the following attributes.
+        d_proj - (`int`) number of dimensions of the vector space to project the
+          word embeddings to.
+        d_tracker - (`int`) number of dimensions of the Tracker's state vector.
+        d_hidden - (`int`) number of the dimensions of the hidden state, for the
+          Reducer module.
+        n_mlp_layers - (`int`) number of multi-layer perceptron layers to use to
+          convert the output of the `Feature` module to logits.
+        predict - (`bool`) Whether the Tracker will enabled predictions.
+    """
+    super(SPINN, self).__init__()
+    self.config = config
+    self.reducer = self.track_layer(Reducer(config.d_hidden, config.d_tracker))
+    if config.d_tracker is not None:
+      self.tracker = self.track_layer(Tracker(config.d_tracker, config.predict))
+    else:
+      self.tracker = None
+
+  def call(self, buffers, transitions, training=False):
+    """Invoke the forward pass of the SPINN model.
+
+    Args:
+      buffers: Dense `Tensor` of shape
+        (max_sequence_len, batch_size, config.d_proj).
+      transitions: Dense `Tensor` with integer values that represent the parse
+        trees of the sentences. A value of 2 indicates "reduce"; a value of 3
+        indicates "shift". Shape: (max_sequence_len * 2 - 3, batch_size).
+      training: Whether the invocation is under training mode.
+
+    Returns:
+      Output `Tensor` of shape (batch_size, config.d_embed).
+    """
+    max_sequence_len, batch_size, d_proj = (int(x) for x in buffers.shape)
+
+    # Split the buffers into left and right word items and put the initial
+    # items in a stack.
+    splitted = tf.split(
+        tf.reshape(tf.transpose(buffers, [1, 0, 2]), [-1, d_proj]),
+        max_sequence_len * batch_size, axis=0)
+    buffers = [splitted[k:k + max_sequence_len]
+               for k in xrange(0, len(splitted), max_sequence_len)]
+    stacks = [[buf[0], buf[0]] for buf in buffers]
+
+    if self.tracker:
+      # Reset tracker state for new batch.
+      self.tracker.reset_state()
+
+    num_transitions = transitions.shape[0]
+
+    # Iterate through transitions and perform the appropriate stack-pop, reduce
+    # and stack-push operations.
+    transitions = transitions.numpy()
+    for i in xrange(num_transitions):
+      trans = transitions[i]
+      if self.tracker:
+        # Invoke tracker to obtain the current tracker states for the sentences.
+        tracker_states, trans_hypothesis = self.tracker(buffers, stacks)
+        if trans_hypothesis:
+          trans = tf.argmax(trans_hypothesis, axis=-1)
+      else:
+        tracker_states = itertools.repeat(None)
+      lefts, rights, trackings = [], [], []
+      for transition, buf, stack, tracking in zip(
+          trans, buffers, stacks, tracker_states):
+        if int(transition) == 3:  # Shift.
+          stack.append(buf.pop())
+        elif int(transition) == 2:  # Reduce.
+          rights.append(stack.pop())
+          lefts.append(stack.pop())
+          trackings.append(tracking)
+
+      if rights:
+        reducer_output = self.reducer(lefts, rights, trackings)
+        reduced = iter(reducer_output)
+
+        for transition, stack in zip(trans, stacks):
+          if int(transition) == 2:  # Reduce.
+            stack.append(next(reduced))
+    return _bundle([stack.pop() for stack in stacks])[0]
+
+
+class SNLIClassifier(tfe.Network):
+  """SNLI Classifier Model.
+
+  A model aimed at solving the SNLI (Standford Natural Language Inference)
+  task, using the SPINN model from above. For details of the task, see:
+    https://nlp.stanford.edu/projects/snli/
+  """
+
+  def __init__(self, config, embed):
+    """Constructor of SNLICLassifier.
+
+    Args:
+      config: A namedtuple containing required configurations for the model. It
+        needs to have the following attributes.
+        projection - (`bool`) whether the word vectors are to be projected onto
+          another vector space (of `d_proj` dimensions).
+        d_proj - (`int`) number of dimensions of the vector space to project the
+          word embeddings to.
+        embed_dropout - (`float`) dropout rate for the word embedding vectors.
+        n_mlp_layers - (`int`) number of multi-layer perceptron (MLP) layers to
+          use to convert the output of the `Feature` module to logits.
+        mlp_dropout - (`float`) dropout rate of the MLP layers.
+        d_out - (`int`) number of dimensions of the final output of the MLP
+          layers.
+        lr - (`float`) learning rate.
+      embed: A embedding matrix of shape (vocab_size, d_embed).
+    """
+    super(SNLIClassifier, self).__init__()
+    self.config = config
+    self.embed = tf.constant(embed)
+
+    self.projection = self.track_layer(tf.layers.Dense(config.d_proj))
+    self.embed_bn = self.track_layer(tf.layers.BatchNormalization())
+    self.embed_dropout = self.track_layer(
+        tf.layers.Dropout(rate=config.embed_dropout))
+    self.encoder = self.track_layer(SPINN(config))
+
+    self.feature_bn = self.track_layer(tf.layers.BatchNormalization())
+    self.feature_dropout = self.track_layer(
+        tf.layers.Dropout(rate=config.mlp_dropout))
+
+    self.mlp_dense = []
+    self.mlp_bn = []
+    self.mlp_dropout = []
+    for _ in xrange(config.n_mlp_layers):
+      self.mlp_dense.append(self.track_layer(tf.layers.Dense(config.d_mlp)))
+      self.mlp_bn.append(
+          self.track_layer(tf.layers.BatchNormalization()))
+      self.mlp_dropout.append(
+          self.track_layer(tf.layers.Dropout(rate=config.mlp_dropout)))
+    self.mlp_output = self.track_layer(tf.layers.Dense(
+        config.d_out,
+        kernel_initializer=tf.random_uniform_initializer(minval=-5e-3,
+                                                         maxval=5e-3)))
+
+  def call(self,
+           premise,
+           premise_transition,
+           hypothesis,
+           hypothesis_transition,
+           training=False):
+    """Invoke the forward pass the SNLIClassifier model.
+
+    Args:
+      premise: The word indices of the premise sentences, with shape
+        (max_prem_seq_len, batch_size).
+      premise_transition: The transitions for the premise sentences, with shape
+        (max_prem_seq_len * 2 - 3, batch_size).
+      hypothesis: The word indices of the hypothesis sentences, with shape
+        (max_hypo_seq_len, batch_size).
+      hypothesis_transition: The transitions for the hypothesis sentences, with
+        shape (max_hypo_seq_len * 2 - 3, batch_size).
+      training: Whether the invocation is under training mode.
+
+    Returns:
+      The logits, as a dense `Tensor` of shape (batch_size, d_out), where d_out
+      is the size of the output vector.
+    """
+    # Perform embedding lookup on the premise and hypothesis inputs, which have
+    # the word-index format.
+    premise_embed = tf.nn.embedding_lookup(self.embed, premise)
+    hypothesis_embed = tf.nn.embedding_lookup(self.embed, hypothesis)
+
+    if self.config.projection:
+      # Project the embedding vectors to another vector space.
+      premise_embed = self.projection(premise_embed)
+      hypothesis_embed = self.projection(hypothesis_embed)
+
+    # Perform batch normalization and dropout on the possibly projected word
+    # vectors.
+    premise_embed = self.embed_bn(premise_embed, training=training)
+    hypothesis_embed = self.embed_bn(hypothesis_embed, training=training)
+    premise_embed = self.embed_dropout(premise_embed, training=training)
+    hypothesis_embed = self.embed_dropout(hypothesis_embed, training=training)
+
+    # Run the batch-normalized and dropout-processed word vectors through the
+    # SPINN encoder.
+    premise = self.encoder(premise_embed, premise_transition,
+                           training=training)
+    hypothesis = self.encoder(hypothesis_embed, hypothesis_transition,
+                              training=training)
+
+    # Combine encoder outputs for premises and hypotheses into logits.
+    # Then apply batch normalization and dropuout on the logits.
+    logits = tf.concat(
+        [premise, hypothesis, premise - hypothesis, premise * hypothesis], 1)
+    logits = self.feature_dropout(
+        self.feature_bn(logits, training=training), training=training)
+
+    # Apply the multi-layer perceptron on the logits.
+    for dense, bn, dropout in zip(
+        self.mlp_dense, self.mlp_bn, self.mlp_dropout):
+      logits = tf.nn.elu(dense(logits))
+      logits = dropout(bn(logits, training=training), training=training)
+    logits = self.mlp_output(logits)
+    return logits
+
+
+class SNLIClassifierTrainer(object):
+  """A class that coordinates the training of an SNLIClassifier."""
+
+  def __init__(self, snli_classifier, lr):
+    """Constructor of SNLIClassifierTrainer.
+
+    Args:
+      snli_classifier: An instance of `SNLIClassifier`.
+      lr: Learning rate.
+    """
+    self._model = snli_classifier
+    # Create a custom learning rate Variable for the RMSProp optimizer, because
+    # the learning rate needs to be manually decayed later (see
+    # decay_learning_rate()).
+    self._learning_rate = tfe.Variable(lr, name="learning_rate")
+    self._optimizer = tf.train.RMSPropOptimizer(self._learning_rate,
+                                                epsilon=1e-6)
+
+  def loss(self, labels, logits):
+    """Calculate the loss given a batch of data.
+
+    Args:
+      labels: The truth labels, with shape (batch_size,).
+      logits: The logits output from the forward pass of the SNLIClassifier
+        model, with shape (batch_size, d_out), where d_out is the output
+        dimension size of the SNLIClassifier.
+
+    Returns:
+      The loss value, as a scalar `Tensor`.
+    """
+    return tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
+        labels=labels, logits=logits))
+
+  def train_batch(self,
+                  labels,
+                  premise,
+                  premise_transition,
+                  hypothesis,
+                  hypothesis_transition):
+    """Train model on batch of data.
+
+    Args:
+      labels: The truth labels, with shape (batch_size,).
+      premise: The word indices of the premise sentences, with shape
+        (max_prem_seq_len, batch_size).
+      premise_transition: The transitions for the premise sentences, with shape
+        (max_prem_seq_len * 2 - 3, batch_size).
+      hypothesis: The word indices of the hypothesis sentences, with shape
+        (max_hypo_seq_len, batch_size).
+      hypothesis_transition: The transitions for the hypothesis sentences, with
+        shape (max_hypo_seq_len * 2 - 3, batch_size).
+
+    Returns:
+      1. loss value as a scalar `Tensor`.
+      2. logits as a dense `Tensor` of shape (batch_size, d_out), where d_out is
+        the output dimension size of the SNLIClassifier.
+    """
+    with tfe.GradientTape() as tape:
+      tape.watch(self._model.variables)
+      logits = self._model(premise,
+                           premise_transition,
+                           hypothesis,
+                           hypothesis_transition,
+                           training=True)
+      loss = self.loss(labels, logits)
+    gradients = tape.gradient(loss, self._model.variables)
+    self._optimizer.apply_gradients(zip(gradients, self._model.variables),
+                                    global_step=tf.train.get_global_step())
+    return loss, logits
+
+  def decay_learning_rate(self, decay_by):
+    """Decay learning rate of the optimizer by factor decay_by."""
+    self._learning_rate.assign(self._learning_rate * decay_by)
+    print("Decayed learning rate of optimizer to: %s" %
+          self._learning_rate.numpy())
+
+  @property
+  def learning_rate(self):
+    return self._learning_rate
+
+
+def _batch_n_correct(logits, label):
+  """Calculate number of correct predictions in a batch.
+
+  Args:
+    logits: A logits Tensor of shape `(batch_size, num_categories)` and dtype
+      `float32`.
+    label: A labels Tensor of shape `(batch_size,)` and dtype `int64`
+
+  Returns:
+    Number of correct predictions.
+  """
+  return tf.reduce_sum(
+      tf.cast((tf.equal(
+          tf.argmax(logits, axis=1), label)), tf.float32)).numpy()
+
+
+def _evaluate_on_dataset(snli_data, batch_size, model, trainer, use_gpu):
+  """Run evaluation on a dataset.
+
+  Args:
+    snli_data: The `data.SnliData` to use in this evaluation.
+    batch_size: The batch size to use during this evaluation.
+    model: An instance of `SNLIClassifier` to evaluate.
+    trainer: An instance of `SNLIClassifierTrainer to use for this
+      evaluation.
+    use_gpu: Whether GPU is being used.
+
+  Returns:
+    1. Average loss across all examples of the dataset.
+    2. Average accuracy rate across all examples of the dataset.
+  """
+  mean_loss = tfe.metrics.Mean()
+  accuracy = tfe.metrics.Accuracy()
+  for label, prem, prem_trans, hypo, hypo_trans in _get_dataset_iterator(
+      snli_data, batch_size):
+    if use_gpu:
+      label, prem, hypo = label.gpu(), prem.gpu(), hypo.gpu()
+    logits = model(prem, prem_trans, hypo, hypo_trans, training=False)
+    loss_val = trainer.loss(label, logits)
+    batch_size = tf.shape(label)[0]
+    mean_loss(loss_val, weights=batch_size.gpu() if use_gpu else batch_size)
+    accuracy(tf.argmax(logits, axis=1), label)
+  return mean_loss.result().numpy(), accuracy.result().numpy()
+
+
+def _get_dataset_iterator(snli_data, batch_size):
+  """Get a data iterator for a split of SNLI data.
+
+  Args:
+    snli_data: A `data.SnliData` object.
+    batch_size: The desired batch size.
+
+  Returns:
+    A dataset iterator.
+  """
+  with tf.device("/device:CPU:0"):
+    # Some tf.data ops, such as ShuffleDataset, are available only on CPU.
+    dataset = tf.data.Dataset.from_generator(
+        snli_data.get_generator(batch_size),
+        (tf.int64, tf.int64, tf.int64, tf.int64, tf.int64))
+    dataset = dataset.shuffle(snli_data.num_batches(batch_size))
+    return tfe.Iterator(dataset)
+
+
+def train_spinn(embed, train_data, dev_data, test_data, config):
+  """Train a SPINN model.
+
+  Args:
+    embed: The embedding matrix as a float32 numpy array with shape
+      [vocabulary_size, word_vector_len]. word_vector_len is the length of a
+      word embedding vector.
+    train_data: An instance of `data.SnliData`, for the train split.
+    dev_data: Same as above, for the dev split.
+    test_data: Same as above, for the test split.
+    config: A configuration object. See the argument to this Python binary for
+      details.
+
+  Returns:
+    1. Final loss value on the test split.
+    2. Final fraction of correct classifications on the test split.
+  """
+  use_gpu = tfe.num_gpus() > 0 and not config.force_cpu
+  device = "gpu:0" if use_gpu else "cpu:0"
+  print("Using device: %s" % device)
+
+  log_header = (
+      "  Time Epoch Iteration Progress    (%Epoch)   Loss   Dev/Loss"
+      "     Accuracy  Dev/Accuracy")
+  log_template = (
+      "{:>6.0f} {:>5.0f} {:>9.0f} {:>5.0f}/{:<5.0f} {:>7.0f}% {:>8.6f} {} "
+      "{:12.4f} {}")
+  dev_log_template = (
+      "{:>6.0f} {:>5.0f} {:>9.0f} {:>5.0f}/{:<5.0f} {:>7.0f}% {:>8.6f} "
+      "{:8.6f} {:12.4f} {:12.4f}")
+
+  summary_writer = tf.contrib.summary.create_file_writer(
+      config.logdir, flush_millis=10000)
+  train_len = train_data.num_batches(config.batch_size)
+  with tf.device(device), \
+       tfe.restore_variables_on_create(
+           tf.train.latest_checkpoint(config.logdir)), \
+       summary_writer.as_default(), \
+       tf.contrib.summary.always_record_summaries():
+    model = SNLIClassifier(config, embed)
+    global_step = tf.train.get_or_create_global_step()
+    trainer = SNLIClassifierTrainer(model, config.lr)
+
+    start = time.time()
+    iterations = 0
+    mean_loss = tfe.metrics.Mean()
+    accuracy = tfe.metrics.Accuracy()
+    print(log_header)
+    for epoch in xrange(config.epochs):
+      batch_idx = 0
+      for label, prem, prem_trans, hypo, hypo_trans in _get_dataset_iterator(
+          train_data, config.batch_size):
+        if use_gpu:
+          label, prem, hypo = label.gpu(), prem.gpu(), hypo.gpu()
+          # prem_trans and hypo_trans are used for dynamic control flow and can
+          # remain on CPU. Same in _evaluate_on_dataset().
+
+        iterations += 1
+        batch_train_loss, batch_train_logits = trainer.train_batch(
+            label, prem, prem_trans, hypo, hypo_trans)
+        batch_size = tf.shape(label)[0]
+        mean_loss(batch_train_loss.numpy(),
+                  weights=batch_size.gpu() if use_gpu else batch_size)
+        accuracy(tf.argmax(batch_train_logits, axis=1), label)
+
+        if iterations % config.save_every == 0:
+          all_variables = (
+              model.variables + [trainer.learning_rate] + [global_step])
+          saver = tfe.Saver(all_variables)
+          saver.save(os.path.join(config.logdir, "ckpt"),
+                     global_step=global_step)
+
+        if iterations % config.dev_every == 0:
+          dev_loss, dev_frac_correct = _evaluate_on_dataset(
+              dev_data, config.batch_size, model, trainer, use_gpu)
+          print(dev_log_template.format(
+              time.time() - start,
+              epoch, iterations, 1 + batch_idx, train_len,
+              100.0 * (1 + batch_idx) / train_len,
+              mean_loss.result(), dev_loss,
+              accuracy.result() * 100.0, dev_frac_correct * 100.0))
+          tf.contrib.summary.scalar("dev/loss", dev_loss)
+          tf.contrib.summary.scalar("dev/accuracy", dev_frac_correct)
+        elif iterations % config.log_every == 0:
+          mean_loss_val = mean_loss.result()
+          accuracy_val = accuracy.result()
+          print(log_template.format(
+              time.time() - start,
+              epoch, iterations, 1 + batch_idx, train_len,
+              100.0 * (1 + batch_idx) / train_len,
+              mean_loss_val, " " * 8, accuracy_val * 100.0, " " * 12))
+          tf.contrib.summary.scalar("train/loss", mean_loss_val)
+          tf.contrib.summary.scalar("train/accuracy", accuracy_val)
+          # Reset metrics.
+          mean_loss = tfe.metrics.Mean()
+          accuracy = tfe.metrics.Accuracy()
+
+        batch_idx += 1
+      if (epoch + 1) % config.lr_decay_every == 0:
+        trainer.decay_learning_rate(config.lr_decay_by)
+
+    test_loss, test_frac_correct = _evaluate_on_dataset(
+        test_data, config.batch_size, model, trainer, use_gpu)
+    print("Final test loss: %g; accuracy: %g%%" %
+          (test_loss, test_frac_correct * 100.0))
+
+
+def main(_):
+  config = FLAGS
+
+  # Load embedding vectors.
+  vocab = data.load_vocabulary(FLAGS.data_root)
+  word2index, embed = data.load_word_vectors(FLAGS.data_root, vocab)
+
+  print("Loading train, dev and test data...")
+  train_data = data.SnliData(
+      os.path.join(FLAGS.data_root, "snli/snli_1.0/snli_1.0_train.txt"),
+      word2index, sentence_len_limit=FLAGS.sentence_len_limit)
+  dev_data = data.SnliData(
+      os.path.join(FLAGS.data_root, "snli/snli_1.0/snli_1.0_dev.txt"),
+      word2index, sentence_len_limit=FLAGS.sentence_len_limit)
+  test_data = data.SnliData(
+      os.path.join(FLAGS.data_root, "snli/snli_1.0/snli_1.0_test.txt"),
+      word2index, sentence_len_limit=FLAGS.sentence_len_limit)
+
+  train_spinn(embed, train_data, dev_data, test_data, config)
+
+
+if __name__ == "__main__":
+  parser = argparse.ArgumentParser(
+      description=
+      "TensorFlow eager implementation of the SPINN SNLI classifier.")
+  parser.add_argument("--data_root", type=str, default="/tmp/spinn-data",
+                      help="Root directory in which the training data and "
+                      "embedding matrix are found. See README.md for how to "
+                      "generate such a directory.")
+  parser.add_argument("--sentence_len_limit", type=int, default=-1,
+                      help="Maximum allowed sentence length (# of words). "
+                      "The default of -1 means unlimited.")
+  parser.add_argument("--logdir", type=str, default="/tmp/spinn-logs",
+                      help="Directory in which summaries will be written for "
+                      "TensorBoard.")
+  parser.add_argument("--epochs", type=int, default=50,
+                      help="Number of epochs to train.")
+  parser.add_argument("--batch_size", type=int, default=128,
+                      help="Batch size to use during training.")
+  parser.add_argument("--d_proj", type=int, default=600,
+                      help="Dimensions to project the word embedding vectors "
+                      "to.")
+  parser.add_argument("--d_hidden", type=int, default=300,
+                      help="Size of the hidden layer of the Tracker.")
+  parser.add_argument("--d_out", type=int, default=4,
+                      help="Output dimensions of the SNLIClassifier.")
+  parser.add_argument("--d_mlp", type=int, default=1024,
+                      help="Size of each layer of the multi-layer perceptron "
+                      "of the SNLICLassifier.")
+  parser.add_argument("--n_mlp_layers", type=int, default=2,
+                      help="Number of layers in the multi-layer perceptron "
+                      "of the SNLICLassifier.")
+  parser.add_argument("--d_tracker", type=int, default=64,
+                      help="Size of the tracker LSTM.")
+  parser.add_argument("--log_every", type=int, default=50,
+                      help="Print log and write TensorBoard summary every _ "
+                      "training batches.")
+  parser.add_argument("--lr", type=float, default=2e-3,
+                      help="Initial learning rate.")
+  parser.add_argument("--lr_decay_by", type=float, default=0.75,
+                      help="The ratio to multiply the learning rate by every "
+                      "time the learning rate is decayed.")
+  parser.add_argument("--lr_decay_every", type=float, default=1,
+                      help="Decay the learning rate every _ epoch(s).")
+  parser.add_argument("--dev_every", type=int, default=1000,
+                      help="Run evaluation on the dev split every _ training "
+                      "batches.")
+  parser.add_argument("--save_every", type=int, default=1000,
+                      help="Save checkpoint every _ training batches.")
+  parser.add_argument("--embed_dropout", type=float, default=0.08,
+                      help="Word embedding dropout rate.")
+  parser.add_argument("--mlp_dropout", type=float, default=0.07,
+                      help="SNLIClassifier multi-layer perceptron dropout "
+                      "rate.")
+  parser.add_argument("--no-projection", action="store_false",
+                      dest="projection",
+                      help="Whether word embedding vectors are projected to "
+                      "another set of vectors (see d_proj).")
+  parser.add_argument("--predict_transitions", action="store_true",
+                      dest="predict",
+                      help="Whether the Tracker will perform prediction.")
+  parser.add_argument("--force_cpu", action="store_true", dest="force_cpu",
+                      help="Force use CPU-only regardless of whether a GPU is "
+                      "available.")
+  FLAGS, unparsed = parser.parse_known_args()
+
+  tfe.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/third_party/flatbuffers/flatbuffers.BUILD b/third_party/flatbuffers/flatbuffers.BUILD
index e1563103c86fcadf876442d0985a4e07e25ae2d2..c06c269bb2b93d0ee1c39b6045a131383eb4f8f0 100644
--- a/third_party/flatbuffers/flatbuffers.BUILD
+++ b/third_party/flatbuffers/flatbuffers.BUILD
@@ -4,10 +4,19 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
+config_setting(
+    name = "freebsd",
+    values = {"cpu": "freebsd"},
+    visibility = ["//visibility:public"],
+)
+
 FLATBUFFERS_COPTS = [
     "-fexceptions",
-    "-Wno-implicit-fallthrough",
-]
+] + select({
+    "@bazel_tools//src:windows": [],
+    "@bazel_tools//src:windows_msvc": [],
+    "//conditions:default": ["-Wno-implicit-fallthrough"],
+})
 
 # Public flatc library to compile flatbuffer files at runtime.
 cc_library(
@@ -104,10 +113,14 @@ cc_binary(
         "grpc/",
         "include/",
     ],
-    linkopts = [
+    linkopts = select({
+    ":freebsd": [
         "-lm",
-        "-ldl",
     ],
+    "//conditions:default": [
+        "-lm",
+        "-ldl",
+    ]}),
     deps = [
         ":flatc_library",
     ],
diff --git a/third_party/gif.BUILD b/third_party/gif.BUILD
index 27808a9d645e93644a8c2fac40974306dad444a7..78fbd6c0e098512d01478eba70fe614f0266c317 100644
--- a/third_party/gif.BUILD
+++ b/third_party/gif.BUILD
@@ -21,7 +21,7 @@ cc_library(
     ],
     hdrs = ["lib/gif_lib.h"],
     defines = select({
-        #"@%ws%//tensorflow:android": [
+        #"@org_tensorflow//tensorflow:android": [
         ":android": [
             "S_IREAD=S_IRUSR",
             "S_IWRITE=S_IWUSR",
diff --git a/third_party/jemalloc.BUILD b/third_party/jemalloc.BUILD
index a2addf2c66bc3aa396455ab34208d6ef756b70f2..1b0829b8fea64c74fa9b462c0716cef6385dad96 100644
--- a/third_party/jemalloc.BUILD
+++ b/third_party/jemalloc.BUILD
@@ -5,7 +5,7 @@ licenses(["notice"])  # BSD
 
 exports_files(["COPYING"])
 
-load("@%ws%//third_party:common.bzl", "template_rule")
+load("@org_tensorflow//third_party:common.bzl", "template_rule")
 
 cc_library(
     name = "jemalloc_headers",
@@ -97,10 +97,10 @@ cc_library(
     includes = ["include"],
     # pthread_atfork() is called for PPC.
     linkopts = select({
-        "@%ws%//tensorflow:linux_ppc64le": [
+        "@org_tensorflow//tensorflow:linux_ppc64le": [
             "-lpthread",
         ],
-        "@%ws%//tensorflow:linux_x86_64": [
+        "@org_tensorflow//tensorflow:linux_x86_64": [
             "-lpthread",
         ],
         "//conditions:default": [
@@ -208,8 +208,8 @@ genrule(
     name = "size_classes_h",
     outs = ["include/jemalloc/internal/size_classes.h"],
     cmd = select({
-        "@%ws%//tensorflow:linux_ppc64le": "$(location :size_classes_sh) \"3 4\" 3 16 2 >$@",
-        "@%ws%//tensorflow:linux_x86_64": "$(location :size_classes_sh) \"3 4\" 3 12 2 >$@",
+        "@org_tensorflow//tensorflow:linux_ppc64le": "$(location :size_classes_sh) \"3 4\" 3 16 2 >$@",
+        "@org_tensorflow//tensorflow:linux_x86_64": "$(location :size_classes_sh) \"3 4\" 3 12 2 >$@",
         "//conditions:default": "$(location :size_classes_sh) \"3 4\" 3 12 2 >$@",
     }),
     tools = [":size_classes_sh"],
diff --git a/third_party/jpeg/jpeg.BUILD b/third_party/jpeg/jpeg.BUILD
index f6078052ecedd71b9af29eae628529c9045781f7..527a08c4b3732e7cfd0048d6ce4616617afcf4c2 100644
--- a/third_party/jpeg/jpeg.BUILD
+++ b/third_party/jpeg/jpeg.BUILD
@@ -5,7 +5,7 @@ licenses(["notice"])  # custom notice-style license, see LICENSE.md
 
 exports_files(["LICENSE.md"])
 
-load("@%ws%//third_party:common.bzl", "template_rule")
+load("@org_tensorflow//third_party:common.bzl", "template_rule")
 
 libjpegturbo_nocopts = "-[W]error"
 
@@ -323,14 +323,18 @@ JCONFIG_NOWIN_COMMON_SUBSTITUTIONS = {
     "#undef RIGHT_SHIFT_IS_UNSIGNED": "",
 }
 
-JCONFIG_NOWIN_SIMD_SUBSTITUTIONS = JCONFIG_NOWIN_COMMON_SUBSTITUTIONS + {
+JCONFIG_NOWIN_SIMD_SUBSTITUTIONS = {
     "#undef WITH_SIMD": "#define WITH_SIMD 1",
 }
 
-JCONFIG_NOWIN_NOSIMD_SUBSTITUTIONS = JCONFIG_NOWIN_COMMON_SUBSTITUTIONS + {
+JCONFIG_NOWIN_NOSIMD_SUBSTITUTIONS = {
     "#undef WITH_SIMD": "",
 }
 
+JCONFIG_NOWIN_SIMD_SUBSTITUTIONS.update(JCONFIG_NOWIN_COMMON_SUBSTITUTIONS)
+
+JCONFIG_NOWIN_NOSIMD_SUBSTITUTIONS.update(JCONFIG_NOWIN_COMMON_SUBSTITUTIONS)
+
 template_rule(
     name = "jconfig_nowin_nosimd",
     src = "jconfig.h.in",
diff --git a/third_party/mkl/build_defs.bzl b/third_party/mkl/build_defs.bzl
index 533c0766c71a18e614f2f101a4e74b7f35fd26c3..8b73ddabdd7ff5de7374ffbbb76e7bf954c27765 100644
--- a/third_party/mkl/build_defs.bzl
+++ b/third_party/mkl/build_defs.bzl
@@ -20,7 +20,7 @@ def if_mkl(if_true, if_false = []):
 
     """
     return select({
-        "//third_party/mkl:using_mkl": if_true,
+        str(Label("//third_party/mkl:using_mkl")): if_true,
         "//conditions:default": if_false
     })
 
@@ -60,7 +60,6 @@ mkl_repository = repository_rule(
     ],
     attrs = {
         "build_file": attr.label(),
-        "repository": attr.string(),
         "urls": attr.string_list(default = []),
         "sha256": attr.string(default = ""),
         "strip_prefix": attr.string(default = ""),
diff --git a/third_party/nccl.BUILD b/third_party/nccl.BUILD
index 06b9b8ff68a5e8aa877d605daf02bec1ea4d6bfa..b2b8e188248f90805bc2904dca9111550a7dfed8 100644
--- a/third_party/nccl.BUILD
+++ b/third_party/nccl.BUILD
@@ -44,18 +44,18 @@ cc_library(
         "-O3",
     ] + cuda_default_copts(),
     linkopts = select({
-        "@%ws%//tensorflow:android": [
+        "@org_tensorflow//tensorflow:android": [
             "-pie",
         ],
-        "@%ws%//tensorflow:darwin": [
+        "@org_tensorflow//tensorflow:darwin": [
             "-Wl,-framework",
             "-Wl,CoreFoundation",
             "-Wl,-framework",
             "-Wl,Security",
         ],
-        "@%ws%//tensorflow:ios": [],
-        "@%ws%//tensorflow:windows": [
-            "ws2_32.lib",
+        "@org_tensorflow//tensorflow:ios": [],
+        "@org_tensorflow//tensorflow:windows": [
+            "-DEFAULTLIB:ws2_32.lib",
         ],
         "//conditions:default": [
             "-lrt",
diff --git a/third_party/pcre.BUILD b/third_party/pcre.BUILD
index 68aadd1d408685291beaee3ebe0607f35e130ff1..e2cdec40295d369548ff26e3493b5d2300041916 100644
--- a/third_party/pcre.BUILD
+++ b/third_party/pcre.BUILD
@@ -50,12 +50,12 @@ cc_library(
         "-DNEWLINE=10",
         "-DNO_RECURSE",
         "-DPARENS_NEST_LIMIT=50",
-        "-DPCRE_STATIC=1",
         "-DPOSIX_MALLOC_THRESHOLD=10",
         "-DSTDC_HEADERS=1",
         "-DSUPPORT_UCP",
         "-DSUPPORT_UTF",
     ],
+    defines = ["PCRE_STATIC=1"],
     includes = ["."],
     visibility = ["@swig//:__pkg__"],  # Please use RE2
     alwayslink = 1,
diff --git a/third_party/repo.bzl b/third_party/repo.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..c29fef9629570955b4c4f192c03627bc65b2d49d
--- /dev/null
+++ b/third_party/repo.bzl
@@ -0,0 +1,106 @@
+# Copyright 2017 The TensorFlow Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utilities for defining TensorFlow Bazel dependencies."""
+
+_SINGLE_URL_WHITELIST = depset([
+    "arm_compiler",
+    "ortools_archive",
+])
+
+def _is_windows(ctx):
+  return ctx.os.name.lower().find("windows") != -1
+
+def _get_env_var(ctx, name):
+  if name in ctx.os.environ:
+    return ctx.os.environ[name]
+  else:
+    return None
+
+# Executes specified command with arguments and calls 'fail' if it exited with
+# non-zero code
+def _execute_and_check_ret_code(repo_ctx, cmd_and_args):
+  result = repo_ctx.execute(cmd_and_args, timeout=10)
+  if result.return_code != 0:
+    fail(("Non-zero return code({1}) when executing '{0}':\n" + "Stdout: {2}\n"
+          + "Stderr: {3}").format(" ".join(cmd_and_args), result.return_code,
+                                  result.stdout, result.stderr))
+
+def _repos_are_siblings():
+  return Label("@foo//bar").workspace_root.startswith("../")
+
+# Apply a patch_file to the repository root directory
+# Runs 'patch -p1'
+def _apply_patch(ctx, patch_file):
+  # Don't check patch on Windows, because patch is only available under bash.
+  if not _is_windows(ctx) and not ctx.which("patch"):
+    fail("patch command is not found, please install it")
+  cmd = ["patch", "-p1", "-d", ctx.path("."), "-i", ctx.path(patch_file)]
+  if _is_windows(ctx):
+    bazel_sh = _get_env_var(ctx, "BAZEL_SH")
+    if not bazel_sh:
+      fail("BAZEL_SH environment variable is not set")
+    cmd = [bazel_sh, "-c", " ".join(cmd)]
+  _execute_and_check_ret_code(ctx, cmd)
+
+def _apply_delete(ctx, paths):
+  for path in paths:
+    if path.startswith("/"):
+      fail("refusing to rm -rf path starting with '/': " + path)
+    if ".." in path:
+      fail("refusing to rm -rf path containing '..': " + path)
+  _execute_and_check_ret_code(
+      ctx, ["rm", "-rf"] + [ctx.path(path) for path in paths])
+
+def _tf_http_archive(ctx):
+  if ("mirror.bazel.build" not in ctx.attr.urls[0] or
+      (len(ctx.attr.urls) < 2 and
+       ctx.attr.name not in _SINGLE_URL_WHITELIST)):
+    fail("tf_http_archive(urls) must have redundant URLs. The " +
+         "mirror.bazel.build URL must be present and it must come first. " +
+         "Even if you don't have permission to mirror the file, please " +
+         "put the correctly formatted mirror URL there anyway, because " +
+         "someone will come along shortly thereafter and mirror the file.")
+  ctx.download_and_extract(
+      ctx.attr.urls,
+      "",
+      ctx.attr.sha256,
+      ctx.attr.type,
+      ctx.attr.strip_prefix)
+  if ctx.attr.delete:
+    _apply_delete(ctx, ctx.attr.delete)
+  if ctx.attr.patch_file != None:
+    _apply_patch(ctx, ctx.attr.patch_file)
+  if ctx.attr.build_file != None:
+    ctx.template("BUILD", ctx.attr.build_file, {
+        "%prefix%": ".." if _repos_are_siblings() else "external",
+    }, False)
+
+tf_http_archive = repository_rule(
+    implementation=_tf_http_archive,
+    attrs={
+        "sha256": attr.string(mandatory=True),
+        "urls": attr.string_list(mandatory=True, allow_empty=False),
+        "strip_prefix": attr.string(),
+        "type": attr.string(),
+        "delete": attr.string_list(),
+        "patch_file": attr.label(),
+        "build_file": attr.label(),
+    })
+"""Downloads and creates Bazel repos for dependencies.
+
+This is a swappable replacement for both http_archive() and
+new_http_archive() that offers some additional features. It also helps
+ensure best practices are followed.
+"""
diff --git a/third_party/snappy.BUILD b/third_party/snappy.BUILD
index 9c00b7068a802a361effab207409138c79addde7..fd48ed8941e159a8d6176ef3f4e1982d6600e1c2 100644
--- a/third_party/snappy.BUILD
+++ b/third_party/snappy.BUILD
@@ -50,8 +50,8 @@ genrule(
            "-e 's/@ac_cv_have_stddef_h@/1/g' " +
            "-e 's/@ac_cv_have_stdint_h@/1/g' " +
            select({
-               "@%ws%//tensorflow:windows": "-e 's/@ac_cv_have_sys_uio_h@/0/g' ",
-               "@%ws%//tensorflow:windows_msvc": "-e 's/@ac_cv_have_sys_uio_h@/0/g' ",
+               "@org_tensorflow//tensorflow:windows": "-e 's/@ac_cv_have_sys_uio_h@/0/g' ",
+               "@org_tensorflow//tensorflow:windows_msvc": "-e 's/@ac_cv_have_sys_uio_h@/0/g' ",
                "//conditions:default": "-e 's/@ac_cv_have_sys_uio_h@/1/g' ",
            }) +
            "-e 's/@SNAPPY_MAJOR@/1/g' " +
diff --git a/third_party/sycl/crosstool/trisycl.tpl b/third_party/sycl/crosstool/trisycl.tpl
index b470772fbfba65270846b0176078b3244a08171c..87a70d8f9549d57f0fc1a2c4b56ac1c4af065e71 100644
--- a/third_party/sycl/crosstool/trisycl.tpl
+++ b/third_party/sycl/crosstool/trisycl.tpl
@@ -11,10 +11,12 @@ CPU_C_COMPILER = ('%{host_c_compiler}')
 CURRENT_DIR = os.path.dirname(sys.argv[0])
 TRISYCL_INCLUDE_DIR = CURRENT_DIR + '/../sycl/include'
 
+
 def main():
   compiler_flags = []
 
-  remove_flags = ('-Wl,--no-undefined', '-Wno-unused-but-set-variable', '-Wignored-attributes', '-fno-exceptions')
+  remove_flags = ('-Wl,--no-undefined', '-Wno-unused-but-set-variable',
+                  '-Wignored-attributes', '-fno-exceptions')
   # remove -fsamotoze-coverage from string with g++
   if 'g++' in CPU_CXX_COMPILER:
     remove_flags += ('-fsanitize-coverage',)
@@ -22,52 +24,62 @@ def main():
   else:
     compiler_flags += ['-fopenmp=libomp']
 
-  compiler_flags += [flag for flag in sys.argv[1:] if not flag.startswith(remove_flags)]
-
+  compiler_flags += [
+      flag for flag in sys.argv[1:] if not flag.startswith(remove_flags)
+  ]
 
   output_file_index = compiler_flags.index('-o') + 1
   output_file_name = compiler_flags[output_file_index]
 
-  if(output_file_index == 1):
+  if (output_file_index == 1):
     # we are linking
-    return call([CPU_CXX_COMPILER] + compiler_flags +
-                ['-Wl,--no-undefined'])
+    return call([CPU_CXX_COMPILER] + compiler_flags + ['-Wl,--no-undefined'])
 
   # find what we compile
   compiling_cpp = 0
-  if('-c' in compiler_flags):
-      compiled_file_index = compiler_flags.index('-c') + 1
-      compiled_file_name = compiler_flags[compiled_file_index]
-      if(compiled_file_name.endswith(('.cc', '.c++', '.cpp', '.CPP',
-                                      '.C', '.cxx'))):
-        compiling_cpp = 1;
-
-  debug_flags = ['-DTRISYCL_DEBUG', '-DBOOST_LOG_DYN_LINK', '-DTRISYCL_TRACE_KERNEL', '-lpthread', '-lboost_log', '-g', '-rdynamic']
+  if ('-c' in compiler_flags):
+    compiled_file_index = compiler_flags.index('-c') + 1
+    compiled_file_name = compiler_flags[compiled_file_index]
+    if (compiled_file_name.endswith(('.cc', '.c++', '.cpp', '.CPP', '.C',
+                                     '.cxx'))):
+      compiling_cpp = 1
+
+  debug_flags = [
+      '-DTRISYCL_DEBUG', '-DBOOST_LOG_DYN_LINK', '-DTRISYCL_TRACE_KERNEL',
+      '-lpthread', '-lboost_log', '-g', '-rdynamic'
+  ]
 
   opt_flags = ['-DNDEBUG', '-DBOOST_DISABLE_ASSERTS', '-O3']
 
-  compiler_flags = compiler_flags + ['-DEIGEN_USE_SYCL=1',
-                                     '-DEIGEN_HAS_C99_MATH',
-                                     '-DEIGEN_MAX_ALIGN_BYTES=16',
-                                     '-DTENSORFLOW_USE_SYCL'] + opt_flags
+  compiler_flags = compiler_flags + [
+      '-DEIGEN_USE_SYCL=1', '-DEIGEN_HAS_C99_MATH',
+      '-DEIGEN_MAX_ALIGN_BYTES=16', '-DTENSORFLOW_USE_SYCL'
+  ] + opt_flags
 
-  if(compiling_cpp == 1):
+  if (compiling_cpp == 1):
     # create a blacklist of folders that will be skipped when compiling
     # with triSYCL
-    skip_extensions = [".cu.cc"]
-    skip_folders = ["tensorflow/compiler", "tensorflow/docs_src", "tensorflow/tensorboard", "third_party", "external", "hexagon"]
+    skip_extensions = ['.cu.cc']
+    skip_folders = [
+        'tensorflow/compiler', 'tensorflow/docs_src', 'tensorflow/tensorboard',
+        'third_party', 'external', 'hexagon'
+    ]
     skip_folders = [(folder + '/') for folder in skip_folders]
     # if compiling external project skip triSYCL
-    if any(compiled_file_name.endswith(_ext) for _ext in skip_extensions) or any(_folder in output_file_name for _folder in skip_folders):
+    if any(
+        compiled_file_name.endswith(_ext) for _ext in skip_extensions) or any(
+            _folder in output_file_name for _folder in skip_folders):
       return call([CPU_CXX_COMPILER] + compiler_flags)
 
-    host_compiler_flags = ['-xc++', '-Wno-unused-variable',
-                           '-I', TRISYCL_INCLUDE_DIR] + compiler_flags
+    host_compiler_flags = [
+        '-xc++', '-Wno-unused-variable', '-I', TRISYCL_INCLUDE_DIR
+    ] + compiler_flags
     x = call([CPU_CXX_COMPILER] + host_compiler_flags)
     return x
   else:
     # compile for C
     return call([CPU_C_COMPILER] + compiler_flags)
 
+
 if __name__ == '__main__':
   sys.exit(main())
diff --git a/third_party/sycl/sycl/BUILD.tpl b/third_party/sycl/sycl/BUILD.tpl
index b6ceaadda7c4d71aa2b8cb60a8ab65d05156765b..21b1a2bbf7d320327d8f6e35124e6ef47019130b 100755
--- a/third_party/sycl/sycl/BUILD.tpl
+++ b/third_party/sycl/sycl/BUILD.tpl
@@ -1,9 +1,9 @@
 licenses(["notice"])  # Apache 2.0
 
 load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")
-load("platform", "sycl_library_path")
+load(":platform.bzl", "sycl_library_path")
 
-load("platform", "readlink_command")
+load(":platform.bzl", "readlink_command")
 
 package(default_visibility = ["//visibility:public"])
 
diff --git a/third_party/sycl/sycl_configure.bzl b/third_party/sycl/sycl_configure.bzl
index a0c9e4e43a88d05e8fa59ab4d97951679ddc9673..5b9d0eb383d1b069c2107c2c22a59c3790cb721e 100644
--- a/third_party/sycl/sycl_configure.bzl
+++ b/third_party/sycl/sycl_configure.bzl
@@ -67,7 +67,6 @@ def find_computecpp_root(repository_ctx):
 
 def find_trisycl_include_dir(repository_ctx):
   """Find triSYCL include directory. """
-  sycl_name = ""
   if _TRISYCL_INCLUDE_DIR in repository_ctx.os.environ:
     sycl_name = repository_ctx.os.environ[_TRISYCL_INCLUDE_DIR].strip()
     if sycl_name.startswith("/"):
diff --git a/third_party/tflite_smartreply.BUILD b/third_party/tflite_smartreply.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..75663eff48595b3a9aaa6c336d564cc3796e29cd
--- /dev/null
+++ b/third_party/tflite_smartreply.BUILD
@@ -0,0 +1,13 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+filegroup(
+    name = "model_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "BUILD",
+        ],
+    ),
+)